From f3a4e18d0236a1bf2819c981f40737cf78500599 Mon Sep 17 00:00:00 2001 From: Debadri Basak Date: Mon, 3 Nov 2025 13:11:00 +0000 Subject: [PATCH 001/313] Adding implementation for missing origin statistics on LifetimeSafetyAnalysis --- .../Analyses/LifetimeSafety/LifetimeSafety.h | 21 +++++++++++++++++++ .../Analyses/LifetimeSafety/Origins.h | 6 ++++++ .../LifetimeSafety/LifetimeSafety.cpp | 5 +++++ clang/lib/Analysis/LifetimeSafety/Origins.cpp | 16 ++++++++++++++ clang/lib/Sema/AnalysisBasedWarnings.cpp | 4 +++- 5 files changed, 51 insertions(+), 1 deletion(-) diff --git a/clang/include/clang/Analysis/Analyses/LifetimeSafety/LifetimeSafety.h b/clang/include/clang/Analysis/Analyses/LifetimeSafety/LifetimeSafety.h index 91ffbb169f947..4952d84a80369 100644 --- a/clang/include/clang/Analysis/Analyses/LifetimeSafety/LifetimeSafety.h +++ b/clang/include/clang/Analysis/Analyses/LifetimeSafety/LifetimeSafety.h @@ -23,7 +23,11 @@ #include "clang/Analysis/Analyses/LifetimeSafety/Facts.h" #include "clang/Analysis/Analyses/LifetimeSafety/LiveOrigins.h" #include "clang/Analysis/Analyses/LifetimeSafety/LoanPropagation.h" +#include "clang/Analysis/Analyses/LifetimeSafety/Origins.h" #include "clang/Analysis/AnalysisDeclContext.h" +#include "llvm/ADT/StringMap.h" +#include "llvm/Support/raw_ostream.h" +#include namespace clang::lifetimes { @@ -60,6 +64,9 @@ struct LifetimeFactory { /// Running the lifetime safety analysis and querying its results. It /// encapsulates the various dataflow analyses. class LifetimeSafetyAnalysis { +private: + static llvm::StringMap MissingOriginCount; + public: LifetimeSafetyAnalysis(AnalysisDeclContext &AC, LifetimeSafetyReporter *Reporter); @@ -73,6 +80,20 @@ class LifetimeSafetyAnalysis { LiveOriginsAnalysis &getLiveOrigins() const { return *LiveOrigins; } FactManager &getFactManager() { return FactMgr; } + static void PrintStats(llvm::raw_ostream &OS) { + llvm::errs() << "\n*** LifetimeSafety Missing Origin Stats " + "(expression_type : count) :\n"; + for (const auto &[expr, count] : LifetimeSafetyAnalysis::count) { + OS << expr << " : " << count << '\n'; + } + } + + static void UpdateMissingOriginCount(const OriginManager &OM) { + for (const auto &[expr, missing_origin_count] : OM.getMissingOrigins()) { + LifetimeSafetyAnalysis::count[std::string(expr)] += missing_origin_count; + } + } + private: AnalysisDeclContext &AC; LifetimeSafetyReporter *Reporter; diff --git a/clang/include/clang/Analysis/Analyses/LifetimeSafety/Origins.h b/clang/include/clang/Analysis/Analyses/LifetimeSafety/Origins.h index ba138b078b379..231cc60b7e097 100644 --- a/clang/include/clang/Analysis/Analyses/LifetimeSafety/Origins.h +++ b/clang/include/clang/Analysis/Analyses/LifetimeSafety/Origins.h @@ -16,7 +16,10 @@ #include "clang/AST/Decl.h" #include "clang/AST/Expr.h" +#include "clang/AST/TypeBase.h" #include "clang/Analysis/Analyses/LifetimeSafety/Utils.h" +#include "llvm/ADT/StringMap.h" +#include "llvm/Support/raw_ostream.h" namespace clang::lifetimes::internal { @@ -76,6 +79,8 @@ class OriginManager { void dump(OriginID OID, llvm::raw_ostream &OS) const; + const llvm::StringMap getMissingOrigins() const; + private: OriginID getNextOriginID() { return NextOriginID++; } @@ -85,6 +90,7 @@ class OriginManager { llvm::SmallVector AllOrigins; llvm::DenseMap DeclToOriginID; llvm::DenseMap ExprToOriginID; + llvm::StringMap ExprTypeToMissingOriginCount; }; } // namespace clang::lifetimes::internal diff --git a/clang/lib/Analysis/LifetimeSafety/LifetimeSafety.cpp
b/clang/lib/Analysis/LifetimeSafety/LifetimeSafety.cpp index 00c7ed90503e7..a76fdd2535d97 100644 --- a/clang/lib/Analysis/LifetimeSafety/LifetimeSafety.cpp +++ b/clang/lib/Analysis/LifetimeSafety/LifetimeSafety.cpp @@ -23,14 +23,18 @@ #include "clang/Analysis/AnalysisDeclContext.h" #include "clang/Analysis/CFG.h" #include "llvm/ADT/FoldingSet.h" +#include "llvm/ADT/StringMap.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/TimeProfiler.h" +#include "llvm/Support/raw_ostream.h" #include namespace clang::lifetimes { namespace internal { +llvm::StringMap LifetimeSafetyAnalysis::MissingOriginCount; + LifetimeSafetyAnalysis::LifetimeSafetyAnalysis(AnalysisDeclContext &AC, LifetimeSafetyReporter *Reporter) : AC(AC), Reporter(Reporter) {} @@ -66,6 +70,7 @@ void LifetimeSafetyAnalysis::run() { LiveOrigins->dump(llvm::dbgs(), FactMgr.getTestPoints())); runLifetimeChecker(*LoanPropagation, *LiveOrigins, FactMgr, AC, Reporter); + UpdateMissingOriginCount(FactMgr.getOriginMgr()); } } // namespace internal diff --git a/clang/lib/Analysis/LifetimeSafety/Origins.cpp b/clang/lib/Analysis/LifetimeSafety/Origins.cpp index ea51a75324e06..abe067a829cb7 100644 --- a/clang/lib/Analysis/LifetimeSafety/Origins.cpp +++ b/clang/lib/Analysis/LifetimeSafety/Origins.cpp @@ -7,6 +7,8 @@ //===----------------------------------------------------------------------===// #include "clang/Analysis/Analyses/LifetimeSafety/Origins.h" +#include "clang/AST/TypeBase.h" +#include "llvm/ADT/StringMap.h" namespace clang::lifetimes::internal { @@ -22,6 +24,10 @@ void OriginManager::dump(OriginID OID, llvm::raw_ostream &OS) const { OS << ")"; } +const llvm::StringMap OriginManager::getMissingOrigins() const { + return ExprTypeToMissingOriginCount; +} + Origin &OriginManager::addOrigin(OriginID ID, const clang::ValueDecl &D) { AllOrigins.emplace_back(ID, &D); return AllOrigins.back(); @@ -37,6 +43,16 @@ OriginID OriginManager::get(const Expr &E) { auto It = ExprToOriginID.find(&E); if (It != ExprToOriginID.end()) return It->second; + + // if the expression has no specific origin, increment the missing origin + // counter. + const QualType ExprType = E.getType(); + auto CountIt = ExprTypeToMissingOriginCount.find(ExprType.getAsString()); + if (CountIt == ExprTypeToMissingOriginCount.end()) { + ExprTypeToMissingOriginCount[ExprType.getAsString()] = 1; + } else { + CountIt->second++; + } // If the expression itself has no specific origin, and it's a reference // to a declaration, its origin is that of the declaration it refers to. 
// For pointer types, where we don't pre-emptively create an origin for the diff --git a/clang/lib/Sema/AnalysisBasedWarnings.cpp b/clang/lib/Sema/AnalysisBasedWarnings.cpp index 140b709dbb651..009994e189220 100644 --- a/clang/lib/Sema/AnalysisBasedWarnings.cpp +++ b/clang/lib/Sema/AnalysisBasedWarnings.cpp @@ -30,6 +30,7 @@ #include "clang/Analysis/Analyses/CalledOnceCheck.h" #include "clang/Analysis/Analyses/Consumed.h" #include "clang/Analysis/Analyses/LifetimeSafety/LifetimeSafety.h" +#include "clang/Analysis/Analyses/LifetimeSafety/Origins.h" #include "clang/Analysis/Analyses/ReachableCode.h" #include "clang/Analysis/Analyses/ThreadSafety.h" #include "clang/Analysis/Analyses/UninitializedValues.h" @@ -53,6 +54,7 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" #include #include #include @@ -3132,8 +3134,8 @@ void clang::sema::AnalysisBasedWarnings::IssueWarnings( } void clang::sema::AnalysisBasedWarnings::PrintStats() const { + clang::lifetimes::internal::LifetimeSafetyAnalysis::PrintStats(llvm::errs()); llvm::errs() << "\n*** Analysis Based Warnings Stats:\n"; - unsigned NumCFGsBuilt = NumFunctionsAnalyzed - NumFunctionsWithBadCFGs; unsigned AvgCFGBlocksPerFunction = !NumCFGsBuilt ? 0 : NumCFGBlocks/NumCFGsBuilt; From 2ea1edbeea095b6baec25233c5076de6f5463e8b Mon Sep 17 00:00:00 2001 From: LLVM GN Syncbot Date: Mon, 3 Nov 2025 13:22:03 +0000 Subject: [PATCH 002/313] [gn build] Port de2797c888e0 --- .../gn/secondary/clang-tools-extra/clang-tidy/bugprone/BUILD.gn | 1 + .../gn/secondary/clang-tools-extra/clang-tidy/cert/BUILD.gn | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/bugprone/BUILD.gn b/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/bugprone/BUILD.gn index f280f695cd3ab..2f84999621e1b 100644 --- a/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/bugprone/BUILD.gn +++ b/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/bugprone/BUILD.gn @@ -32,6 +32,7 @@ static_library("bugprone") { "CopyConstructorInitCheck.cpp", "CrtpConstructorAccessibilityCheck.cpp", "DanglingHandleCheck.cpp", + "DefaultOperatorNewOnOveralignedTypeCheck.cpp", "DerivedMethodShadowingBaseMethodCheck.cpp", "DynamicStaticInitializersCheck.cpp", "EasilySwappableParametersCheck.cpp", diff --git a/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/cert/BUILD.gn b/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/cert/BUILD.gn index 3ad0a83a8fb23..ec642b6afad66 100644 --- a/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/cert/BUILD.gn +++ b/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/cert/BUILD.gn @@ -16,7 +16,6 @@ static_library("cert") { ] sources = [ "CERTTidyModule.cpp", - "DefaultOperatorNewAlignmentCheck.cpp", "DontModifyStdNamespaceCheck.cpp", "FloatLoopCounter.cpp", "LimitedRandomnessCheck.cpp", From fa85d2da89acec96c6c31f3d0e2dd009b6286054 Mon Sep 17 00:00:00 2001 From: Debadri Basak Date: Mon, 3 Nov 2025 13:23:42 +0000 Subject: [PATCH 003/313] Minor refactoring of the static functions --- .../Analyses/LifetimeSafety/LifetimeSafety.h | 14 ++------------ .../lib/Analysis/LifetimeSafety/LifetimeSafety.cpp | 13 +++++++++++++ 2 files changed, 15 insertions(+), 12 deletions(-) diff --git a/clang/include/clang/Analysis/Analyses/LifetimeSafety/LifetimeSafety.h b/clang/include/clang/Analysis/Analyses/LifetimeSafety/LifetimeSafety.h index 4952d84a80369..7490df90a3282 100644 --- 
a/clang/include/clang/Analysis/Analyses/LifetimeSafety/LifetimeSafety.h +++ b/clang/include/clang/Analysis/Analyses/LifetimeSafety/LifetimeSafety.h @@ -80,19 +80,9 @@ class LifetimeSafetyAnalysis { LiveOriginsAnalysis &getLiveOrigins() const { return *LiveOrigins; } FactManager &getFactManager() { return FactMgr; } - static void PrintStats(llvm::raw_ostream &OS) { - llvm::errs() << "\n*** LifetimeSafety Missing Origin Stats " - "(expression_type : count) :\n"; - for (const auto &[expr, count] : LifetimeSafetyAnalysis::count) { - OS << expr << " : " << count << '\n'; - } - } + static void PrintStats(llvm::raw_ostream &OS); - static void UpdateMissingOriginCount(const OriginManager &OM) { - for (const auto &[expr, missing_origin_count] : OM.getMissingOrigins()) { - LifetimeSafetyAnalysis::count[std::string(expr)] += missing_origin_count; - } - } + static void UpdateMissingOriginCount(const OriginManager &OM); private: AnalysisDeclContext &AC; LifetimeSafetyReporter *Reporter; diff --git a/clang/lib/Analysis/LifetimeSafety/LifetimeSafety.cpp b/clang/lib/Analysis/LifetimeSafety/LifetimeSafety.cpp index a76fdd2535d97..828c08d1cbeed 100644 --- a/clang/lib/Analysis/LifetimeSafety/LifetimeSafety.cpp +++ b/clang/lib/Analysis/LifetimeSafety/LifetimeSafety.cpp @@ -39,6 +39,19 @@ LifetimeSafetyAnalysis::LifetimeSafetyAnalysis(AnalysisDeclContext &AC, LifetimeSafetyReporter *Reporter) : AC(AC), Reporter(Reporter) {} +void LifetimeSafetyAnalysis::PrintStats(llvm::raw_ostream &OS) { + llvm::errs() << "\n*** LifetimeSafety Missing Origin Stats " + "(expression_type : count) :\n"; + for (const auto &[expr, count] : LifetimeSafetyAnalysis::MissingOriginCount) { + OS << expr << " : " << count << '\n'; + } + } + +void LifetimeSafetyAnalysis::UpdateMissingOriginCount(const OriginManager &OM) { + for (const auto &[expr, missing_origin_count] : OM.getMissingOrigins()) { + LifetimeSafetyAnalysis::MissingOriginCount[std::string(expr)] += missing_origin_count; + } + } void LifetimeSafetyAnalysis::run() { llvm::TimeTraceScope TimeProfile("LifetimeSafetyAnalysis"); From ecc70fdbe36b0b8994a57f8c213c2f2ce651563d Mon Sep 17 00:00:00 2001 From: Koakuma Date: Mon, 3 Nov 2025 20:31:39 +0700 Subject: [PATCH 004/313] [llvm-lit] Add `sparc64` to the list of BE triples (#166113) Linux uses the `sparc64` triple name on 64-bit SPARCs. This should fix the test failure in discriminated-union.ll.
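For reference, the failure mode is easy to reproduce in isolation: in the old pattern, `sparc` and `sparcv9` must be immediately followed by `-`, so neither alternative ever matches `sparc64-unknown-linux-gnu`. A minimal self-contained sketch of the check (C++ `<regex>` purely for illustration, with the alternation abbreviated to the SPARC entries; lit's real check is the Python `re.match` in the diff below):

```cpp
#include <cassert>
#include <regex>
#include <string>

int main() {
  const std::string Triple = "sparc64-unknown-linux-gnu";
  // Old pattern: "sparc" and "sparcv9" must be followed by '-', so the
  // Linux 64-bit SPARC triple is never classified as big-endian.
  const std::regex Old("(sparc|sparcv9)-.*");
  // New pattern: the added "sparc64" alternative matches the triple.
  const std::regex New("(sparc|sparcv9|sparc64)-.*");
  assert(!std::regex_match(Triple, Old));
  assert(std::regex_match(Triple, New));
  return 0;
}
```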
--- llvm/test/lit.cfg.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/test/lit.cfg.py b/llvm/test/lit.cfg.py index 11a5a5785a6ec..974af4b571503 100644 --- a/llvm/test/lit.cfg.py +++ b/llvm/test/lit.cfg.py @@ -474,7 +474,7 @@ def enable_ptxas(ptxas_executable): config.available_features.add("host-byteorder-" + sys.byteorder + "-endian") if config.target_triple: if re.match( - r"(aarch64_be|arc|armeb|bpfeb|lanai|m68k|mips|mips64|powerpc|powerpc64|sparc|sparcv9|s390x|s390|tce|thumbeb)-.*", + r"(aarch64_be|arc|armeb|bpfeb|lanai|m68k|mips|mips64|powerpc|powerpc64|sparc|sparcv9|sparc64|s390x|s390|tce|thumbeb)-.*", config.target_triple, ): config.available_features.add("target-byteorder-big-endian") From 8bd7fc7211782e7b2ba76070ee6af46f4913a185 Mon Sep 17 00:00:00 2001 From: Congcong Cai Date: Mon, 3 Nov 2025 21:36:10 +0800 Subject: [PATCH 005/313] [ExprMutation] fix false positives on pointer-to-member operator (#166069) Fixed: #161913 --------- Co-authored-by: Baranov Victor --- clang-tools-extra/docs/ReleaseNotes.rst | 3 ++- clang/lib/Analysis/ExprMutationAnalyzer.cpp | 13 ++++++++----- .../Analysis/ExprMutationAnalyzerTest.cpp | 15 +++++++++++++++ 3 files changed, 25 insertions(+), 6 deletions(-) diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst index 35a64395f04e9..26f00e9a8a294 100644 --- a/clang-tools-extra/docs/ReleaseNotes.rst +++ b/clang-tools-extra/docs/ReleaseNotes.rst @@ -379,7 +379,8 @@ Changes in existing checks ` check to avoid false positives when pointers is transferred to non-const references and avoid false positives of function pointer and fix false - positives on return of non-const pointer. + positives on return of non-const pointer and fix false positives on + pointer-to-member operator. - Improved :doc:`misc-header-include-cycle ` check performance.
diff --git a/clang/lib/Analysis/ExprMutationAnalyzer.cpp b/clang/lib/Analysis/ExprMutationAnalyzer.cpp index 75b17c545bb78..54c30c05c3e19 100644 --- a/clang/lib/Analysis/ExprMutationAnalyzer.cpp +++ b/clang/lib/Analysis/ExprMutationAnalyzer.cpp @@ -746,11 +746,14 @@ ExprMutationAnalyzer::Analyzer::findPointeeMemberMutation(const Expr *Exp) { Stm, Context)); if (MemberCallExpr) return MemberCallExpr; - const auto Matches = - match(stmt(forEachDescendant( - memberExpr(hasObjectExpression(canResolveToExprPointee(Exp))) - .bind(NodeID::value))), - Stm, Context); + const auto Matches = match( + stmt(forEachDescendant( + expr(anyOf(memberExpr( + hasObjectExpression(canResolveToExprPointee(Exp))), + binaryOperator(hasOperatorName("->*"), + hasLHS(canResolveToExprPointee(Exp))))) + .bind(NodeID::value))), + Stm, Context); return findExprMutation(Matches); } diff --git a/clang/unittests/Analysis/ExprMutationAnalyzerTest.cpp b/clang/unittests/Analysis/ExprMutationAnalyzerTest.cpp index ef229606de0f0..8fc9a66dbda7e 100644 --- a/clang/unittests/Analysis/ExprMutationAnalyzerTest.cpp +++ b/clang/unittests/Analysis/ExprMutationAnalyzerTest.cpp @@ -2076,4 +2076,19 @@ TEST(ExprMutationAnalyzerTest, PointeeMutatedByReturn) { } } +TEST(ExprMutationAnalyzerTest, PointeeMutatedByPointerToMemberOperator) { + // GH161913 + const std::string Code = R"( + struct S { int i; }; + void f(S s) { + S *x = &s; + (x->*(&S::i))++; + } + )"; + auto AST = buildASTFromCodeWithArgs(Code, {"-Wno-everything"}); + auto Results = + match(withEnclosingCompound(declRefTo("x")), AST->getASTContext()); + EXPECT_TRUE(isPointeeMutated(Results, AST.get())); +} + } // namespace clang From 795fa9ea00a18b2ccd779efb17ce805a73d1ce7a Mon Sep 17 00:00:00 2001 From: Shikhar Jain Date: Mon, 3 Nov 2025 19:06:37 +0530 Subject: [PATCH 006/313] Bound ISL operations during pre-vectorization (#165204) Bound ISL operations during pre-vectorization to prevent indefinite compilation. The MaxOpGuard previously used for schedule computation is now extended to also guard pre-vectorization optimizations. This patch includes a reduced test case derived from the original bug report. 
--------- Co-authored-by: Michael Kruse --- polly/lib/Transform/ScheduleOptimizer.cpp | 53 +++++++++++++------ polly/lib/Transform/ScheduleTreeTransform.cpp | 5 ++ .../prevectorization_islbound.ll | 37 +++++++++++++ 3 files changed, 78 insertions(+), 17 deletions(-) create mode 100644 polly/test/ScheduleOptimizer/prevectorization_islbound.ll diff --git a/polly/lib/Transform/ScheduleOptimizer.cpp b/polly/lib/Transform/ScheduleOptimizer.cpp index 070700a64a168..0888ebd7a9362 100644 --- a/polly/lib/Transform/ScheduleOptimizer.cpp +++ b/polly/lib/Transform/ScheduleOptimizer.cpp @@ -237,6 +237,7 @@ struct OptimizerAdditionalInfoTy { bool Postopts; bool Prevect; bool &DepsChanged; + IslMaxOperationsGuard &MaxOpGuard; }; class ScheduleTreeOptimizer final { @@ -381,6 +382,8 @@ class ScheduleTreeOptimizer final { isl::schedule_node ScheduleTreeOptimizer::isolateFullPartialTiles(isl::schedule_node Node, int VectorWidth) { + if (Node.is_null()) + return {}; assert(isl_schedule_node_get_type(Node.get()) == isl_schedule_node_band); Node = Node.child(0).child(0); isl::union_map SchedRelUMap = Node.get_prefix_schedule_relation(); @@ -391,6 +394,8 @@ ScheduleTreeOptimizer::isolateFullPartialTiles(isl::schedule_node Node, isl::union_set IsolateOption = getIsolateOptions(IsolateDomain, 1); Node = Node.parent().parent(); isl::union_set Options = IsolateOption.unite(AtomicOption); + if (Node.is_null()) + return {}; isl::schedule_node_band Result = Node.as().set_ast_build_options(Options); return Result; @@ -411,9 +416,13 @@ struct InsertSimdMarkers final : ScheduleNodeRewriter { isl::schedule_node ScheduleTreeOptimizer::prevectSchedBand( isl::schedule_node Node, unsigned DimToVectorize, int VectorWidth) { + if (Node.is_null()) + return {}; assert(isl_schedule_node_get_type(Node.get()) == isl_schedule_node_band); auto Space = isl::manage(isl_schedule_node_band_get_space(Node.get())); + if (Space.is_null()) + return {}; unsigned ScheduleDimensions = unsignedFromIslSize(Space.dim(isl::dim::set)); assert(DimToVectorize < ScheduleDimensions); @@ -439,12 +448,15 @@ isl::schedule_node ScheduleTreeOptimizer::prevectSchedBand( // Sink the inner loop into the smallest possible statements to make them // represent a single vector instruction if possible. Node = isl::manage(isl_schedule_node_band_sink(Node.release())); + if (Node.is_null()) + return {}; // Add SIMD markers to those vector statements. InsertSimdMarkers SimdMarkerInserter; Node = SimdMarkerInserter.visit(Node); - PrevectOpts++; + if (!Node.is_null()) + PrevectOpts++; return Node.parent(); } @@ -535,6 +547,8 @@ ScheduleTreeOptimizer::applyTileBandOpt(isl::schedule_node Node) { isl::schedule_node ScheduleTreeOptimizer::applyPrevectBandOpt(isl::schedule_node Node) { auto Space = isl::manage(isl_schedule_node_band_get_space(Node.get())); + if (Space.is_null()) + return {}; int Dims = unsignedFromIslSize(Space.dim(isl::dim::set)); for (int i = Dims - 1; i >= 0; i--) @@ -572,9 +586,14 @@ ScheduleTreeOptimizer::optimizeBand(__isl_take isl_schedule_node *NodeArg, Node = applyTileBandOpt(Node); if (OAI->Prevect) { + IslQuotaScope MaxScope = OAI->MaxOpGuard.enter(); + // FIXME: Prevectorization requirements are different from those checked by // isTileableBandNode. 
Node = applyPrevectBandOpt(Node); + + if (OAI->MaxOpGuard.hasQuotaExceeded() || Node.is_null()) + return (isl::schedule_node()).release(); } return Node.release(); @@ -771,6 +790,10 @@ static void runIslScheduleOptimizer( return; } + isl_ctx *Ctx = S.getIslCtx().get(); + IslMaxOperationsGuard MaxOpGuard(Ctx, ScheduleComputeOut, + /*AutoEnter=*/false); + // Apply ISL's algorithm only if not overridden by the user. Note that // post-rescheduling optimizations (tiling, pattern-based, prevectorization) // rely on the coincidence/permutable annotations on schedule tree bands that @@ -853,8 +876,6 @@ static void runIslScheduleOptimizer( IslOuterCoincidence = 0; } - isl_ctx *Ctx = S.getIslCtx().get(); - isl_options_set_schedule_outer_coincidence(Ctx, IslOuterCoincidence); isl_options_set_schedule_maximize_band_depth(Ctx, IslMaximizeBands); isl_options_set_schedule_max_constant_term(Ctx, MaxConstantTerm); @@ -870,28 +891,20 @@ static void runIslScheduleOptimizer( SC = SC.set_coincidence(Validity); { - IslMaxOperationsGuard MaxOpGuard(Ctx, ScheduleComputeOut); + IslQuotaScope MaxOpScope = MaxOpGuard.enter(); Schedule = SC.compute_schedule(); - - if (MaxOpGuard.hasQuotaExceeded()) - POLLY_DEBUG( - dbgs() << "Schedule optimizer calculation exceeds ISL quota\n"); } isl_options_set_on_error(Ctx, OnErrorStatus); - ScopsRescheduled++; + if (!Schedule.is_null()) + ScopsRescheduled++; POLLY_DEBUG(printSchedule(dbgs(), Schedule, "After rescheduling")); } walkScheduleTreeForStatistics(Schedule, 1); - // In cases the scheduler is not able to optimize the code, we just do not - // touch the schedule. - if (Schedule.is_null()) - return; - - if (GreedyFusion) { + if (GreedyFusion && !Schedule.is_null()) { isl::union_map Validity = D.getDependences( Dependences::TYPE_RAW | Dependences::TYPE_WAR | Dependences::TYPE_WAW); Schedule = applyGreedyFusion(Schedule, Validity); @@ -905,14 +918,20 @@ static void runIslScheduleOptimizer( /*PatternOpts=*/!HasUserTransformation && PMBasedOpts, /*Postopts=*/!HasUserTransformation && EnablePostopts, /*Prevect=*/PollyVectorizerChoice != VECTORIZER_NONE, - DepsChanged}; - if (OAI.PatternOpts || OAI.Postopts || OAI.Prevect) { + DepsChanged, + MaxOpGuard}; + if (!Schedule.is_null() && (OAI.PatternOpts || OAI.Postopts || OAI.Prevect)) { Schedule = ScheduleTreeOptimizer::optimizeSchedule(Schedule, &OAI); Schedule = hoistExtensionNodes(Schedule); POLLY_DEBUG(printSchedule(dbgs(), Schedule, "After post-optimizations")); walkScheduleTreeForStatistics(Schedule, 2); } + if (MaxOpGuard.hasQuotaExceeded()) { + POLLY_DEBUG(dbgs() << "Schedule optimizer calculation exceeds ISL quota\n"); + return; + } + // Skip profitability check if user transformation(s) have been applied. if (!HasUserTransformation && !ScheduleTreeOptimizer::isProfitableSchedule(S, Schedule)) diff --git a/polly/lib/Transform/ScheduleTreeTransform.cpp b/polly/lib/Transform/ScheduleTreeTransform.cpp index 3f3630027e6e3..c95c55858f038 100644 --- a/polly/lib/Transform/ScheduleTreeTransform.cpp +++ b/polly/lib/Transform/ScheduleTreeTransform.cpp @@ -972,6 +972,9 @@ BandAttr *polly::getBandAttr(isl::schedule_node MarkOrBand) { } isl::schedule polly::hoistExtensionNodes(isl::schedule Sched) { + if (Sched.is_null()) + return {}; + // If there is no extension node in the first place, return the original // schedule tree. 
if (!containsExtensionNode(Sched)) @@ -1126,6 +1129,8 @@ isl::set polly::getPartialTilePrefixes(isl::set ScheduleRange, isl::union_set polly::getIsolateOptions(isl::set IsolateDomain, unsigned OutDimsNum) { + if (IsolateDomain.is_null()) + return {}; unsigned Dims = unsignedFromIslSize(IsolateDomain.tuple_dim()); assert(OutDimsNum <= Dims && "The isl::set IsolateDomain is used to describe the range of schedule " diff --git a/polly/test/ScheduleOptimizer/prevectorization_islbound.ll b/polly/test/ScheduleOptimizer/prevectorization_islbound.ll new file mode 100644 index 0000000000000..0bc3c2cf642e8 --- /dev/null +++ b/polly/test/ScheduleOptimizer/prevectorization_islbound.ll @@ -0,0 +1,37 @@ +; RUN: opt %loadNPMPolly -S -polly-vectorizer=stripmine -passes=polly-opt-isl -polly-debug -disable-output < %s 2>&1 | FileCheck %s +; REQUIRES: asserts + +define void @ham(ptr %arg, ptr %arg1, i32 %arg2, i32 %arg3, ptr %arg4, i32 %arg5, i32 %arg6) { +bb: + %getelementptr = getelementptr [7 x float], ptr null, i32 0, i32 %arg3 + br label %bb7 + +bb7: ; preds = %bb11, %bb + %phi = phi i32 [ 0, %bb ], [ %add16, %bb11 ] + br label %bb8 + +bb8: ; preds = %bb8, %bb7 + %phi9 = phi i32 [ 0, %bb7 ], [ %add, %bb8 ] + %getelementptr10 = getelementptr [7 x float], ptr null, i32 0, i32 %phi9 + store float 0.000000e+00, ptr %getelementptr10, align 4 + %add = add i32 %phi9, 1 + %icmp = icmp eq i32 %phi9, 0 + br i1 %icmp, label %bb8, label %bb11 + +bb11: ; preds = %bb8 + %load = load float, ptr %getelementptr, align 4 + store float %load, ptr %arg4, align 4 + %getelementptr12 = getelementptr [7 x float], ptr null, i32 0, i32 %arg5 + %load13 = load float, ptr %getelementptr12, align 4 + store float %load13, ptr %arg, align 4 + %getelementptr14 = getelementptr [7 x float], ptr null, i32 0, i32 %arg6 + %load15 = load float, ptr %getelementptr14, align 4 + store float %load15, ptr %arg1, align 4 + %add16 = add i32 %phi, 1 + %icmp17 = icmp ne i32 %phi, %arg2 + br i1 %icmp17, label %bb7, label %bb18 + +bb18: ; preds = %bb11 + ret void +} +; CHECK:Schedule optimizer calculation exceeds ISL quota From 3d3fab17f5ea8a14eb390f53075c094f5e1f19fa Mon Sep 17 00:00:00 2001 From: Krzysztof Parzyszek Date: Mon, 3 Nov 2025 07:37:13 -0600 Subject: [PATCH 007/313] [flang][OpenMP] Use OmpDirectiveSpecification in ALLOCATE (#165865) The ALLOCATE directive has two forms: - A declarative form with a standalone directive: ``` !$OMP ALLOCATE (variable-list-item...) ``` - An executable form that consists of several directives followed by an ALLOCATE statement: ``` !$OMP ALLOCATE (variable-list-item...) !$OMP ALLOCATE (variable-list-item...) ... ALLOCATE (...) ``` The second form was deprecated in OpenMP 5.2 in favor of the ALLOCATORS construct. Since in the parse tree every type corresponding to a directive only corresponds to a single directive, the executable form is represented by a sequence of nested OmpAllocateDirectives, e.g. ``` !$OMP ALLOCATE(x) !$OMP ALLOCATE(y) ALLOCATE(x, y) ``` will become ``` OmpAllocateDirective |- ALLOCATE(x) // begin directive `- OmpAllocateDirective // block |- ALLOCATE(y) // begin directive `- ALLOCATE(x, y) // block ``` With this change all AST nodes for directives use OmpDirectiveSpecification as the directive representation. 
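A sketch of how a consumer can recover the flat view from the nested representation, via the `SplitOmpAllocate` helper added in openmp-utils.h (`handleDirective` and `handleAllocateStmt` are illustrative placeholders, not part of this change):

```cpp
// Flattens the nested executable form back into "each !$OMP ALLOCATE
// directive plus the associated ALLOCATE statement".
parser::omp::OmpAllocateInfo Info = parser::omp::SplitOmpAllocate(Outer);
for (const parser::OmpAllocateDirective *Dir : Info.dirs)
  handleDirective(Dir->BeginDir()); // one entry per nested directive
if (Info.body) // stays null for the declarative form
  handleAllocateStmt(*Info.body); // the trailing ALLOCATE(x, y) statement
```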
--- flang/examples/FeatureList/FeatureList.cpp | 3 +- flang/include/flang/Parser/dump-parse-tree.h | 3 +- flang/include/flang/Parser/openmp-utils.h | 31 +- flang/include/flang/Parser/parse-tree.h | 56 ++-- flang/lib/Lower/OpenMP/OpenMP.cpp | 18 +- flang/lib/Parser/openmp-parsers.cpp | 46 +-- flang/lib/Parser/openmp-utils.cpp | 39 +++ flang/lib/Parser/unparse.cpp | 26 +- flang/lib/Semantics/canonicalize-omp.cpp | 156 ++++++++-- flang/lib/Semantics/check-omp-structure.cpp | 291 +++++++++--------- flang/lib/Semantics/check-omp-structure.h | 27 +- flang/lib/Semantics/resolve-directives.cpp | 58 ++-- flang/lib/Semantics/resolve-names.cpp | 4 +- .../Todo/omp-declarative-allocate-align.f90 | 2 +- .../OpenMP/Todo/omp-declarative-allocate.f90 | 2 +- .../Parser/OpenMP/allocate-align-tree.f90 | 48 +-- .../Parser/OpenMP/allocate-tree-spec-part.f90 | 63 ++-- flang/test/Parser/OpenMP/allocate-tree.f90 | 80 ++--- flang/test/Parser/OpenMP/allocate-unparse.f90 | 18 +- .../Semantics/OpenMP/allocate-align01.f90 | 2 +- .../Semantics/OpenMP/allocate-directive.f90 | 2 +- flang/test/Semantics/OpenMP/allocate01.f90 | 2 +- flang/test/Semantics/OpenMP/allocate02.f90 | 1 + flang/test/Semantics/OpenMP/allocate03.f90 | 1 + flang/test/Semantics/OpenMP/allocate06.f90 | 2 +- flang/test/Semantics/OpenMP/allocate10.f90 | 2 +- flang/test/Semantics/OpenMP/allocate12.f90 | 16 + 27 files changed, 587 insertions(+), 412 deletions(-) create mode 100644 flang/test/Semantics/OpenMP/allocate12.f90 diff --git a/flang/examples/FeatureList/FeatureList.cpp b/flang/examples/FeatureList/FeatureList.cpp index 225a6558ef956..ef58da61e371b 100644 --- a/flang/examples/FeatureList/FeatureList.cpp +++ b/flang/examples/FeatureList/FeatureList.cpp @@ -445,6 +445,7 @@ struct NodeVisitor { READ_FEATURE(ObjectDecl) READ_FEATURE(OldParameterStmt) READ_FEATURE(OmpAlignedClause) + READ_FEATURE(OmpAllocateDirective) READ_FEATURE(OmpBeginDirective) READ_FEATURE(OmpBeginLoopDirective) READ_FEATURE(OmpBeginSectionsDirective) @@ -541,7 +542,6 @@ struct NodeVisitor { READ_FEATURE(OpenMPCancellationPointConstruct) READ_FEATURE(OpenMPConstruct) READ_FEATURE(OpenMPCriticalConstruct) - READ_FEATURE(OpenMPDeclarativeAllocate) READ_FEATURE(OpenMPDeclarativeConstruct) READ_FEATURE(OpenMPDeclareReductionConstruct) READ_FEATURE(OpenMPDeclareSimdConstruct) @@ -550,7 +550,6 @@ struct NodeVisitor { READ_FEATURE(OmpAtomicDefaultMemOrderClause) READ_FEATURE(OpenMPFlushConstruct) READ_FEATURE(OpenMPLoopConstruct) - READ_FEATURE(OpenMPExecutableAllocate) READ_FEATURE(OpenMPAllocatorsConstruct) READ_FEATURE(OpenMPRequiresConstruct) READ_FEATURE(OpenMPSimpleStandaloneConstruct) diff --git a/flang/include/flang/Parser/dump-parse-tree.h b/flang/include/flang/Parser/dump-parse-tree.h index a7398a4ef970f..de2716410d6cd 100644 --- a/flang/include/flang/Parser/dump-parse-tree.h +++ b/flang/include/flang/Parser/dump-parse-tree.h @@ -512,6 +512,7 @@ class ParseTreeDumper { NODE(parser, OmpAlignModifier) NODE(parser, OmpAllocateClause) NODE(OmpAllocateClause, Modifier) + NODE(parser, OmpAllocateDirective) NODE(parser, OmpAllocatorComplexModifier) NODE(parser, OmpAllocatorSimpleModifier) NODE(parser, OmpAlwaysModifier) @@ -739,7 +740,6 @@ class ParseTreeDumper { NODE(parser, OpenMPCancellationPointConstruct) NODE(parser, OpenMPConstruct) NODE(parser, OpenMPCriticalConstruct) - NODE(parser, OpenMPDeclarativeAllocate) NODE(parser, OpenMPDeclarativeAssumes) NODE(parser, OpenMPDeclarativeConstruct) NODE(parser, OpenMPDeclareMapperConstruct) @@ -748,7 +748,6 @@ class ParseTreeDumper { 
NODE(parser, OpenMPDeclareTargetConstruct) NODE(parser, OpenMPDepobjConstruct) NODE(parser, OpenMPDispatchConstruct) - NODE(parser, OpenMPExecutableAllocate) NODE(parser, OpenMPFlushConstruct) NODE(parser, OpenMPGroupprivate) NODE(parser, OpenMPLoopConstruct) diff --git a/flang/include/flang/Parser/openmp-utils.h b/flang/include/flang/Parser/openmp-utils.h index 49db091af93a7..8fa4a84aff06d 100644 --- a/flang/include/flang/Parser/openmp-utils.h +++ b/flang/include/flang/Parser/openmp-utils.h @@ -22,6 +22,7 @@ #include #include #include +#include namespace Fortran::parser::omp { @@ -33,23 +34,6 @@ template constexpr auto addr_if(const std::optional &x) { } namespace detail { -using D = llvm::omp::Directive; - -template // -struct ConstructId { - static constexpr llvm::omp::Directive id{D::OMPD_unknown}; -}; - -#define MAKE_CONSTR_ID(Construct, Id) \ - template <> struct ConstructId { \ - static constexpr llvm::omp::Directive id{Id}; \ - } - -MAKE_CONSTR_ID(OpenMPDeclarativeAllocate, D::OMPD_allocate); -MAKE_CONSTR_ID(OpenMPExecutableAllocate, D::OMPD_allocate); - -#undef MAKE_CONSTR_ID - struct DirectiveNameScope { static OmpDirectiveName MakeName(CharBlock source = {}, llvm::omp::Directive id = llvm::omp::Directive::OMPD_unknown) { @@ -97,9 +81,6 @@ struct DirectiveNameScope { } else if constexpr (TupleTrait) { if constexpr (std::is_base_of_v) { return std::get(x.t).DirName(); - } else if constexpr (std::is_same_v || - std::is_same_v) { - return MakeName(std::get(x.t).source, ConstructId::id); } else { return GetFromTuple( x.t, std::make_index_sequence>{}); @@ -139,6 +120,9 @@ template OmpDirectiveName GetOmpDirectiveName(const T &x) { return detail::DirectiveNameScope::GetOmpDirectiveName(x); } +const OpenMPDeclarativeConstruct *GetOmp(const DeclarationConstruct &x); +const OpenMPConstruct *GetOmp(const ExecutionPartConstruct &x); + const OmpObjectList *GetOmpObjectList(const OmpClause &clause); template @@ -158,6 +142,13 @@ const OmpCombinerExpression *GetCombinerExpr( const OmpReductionSpecifier &rspec); const OmpInitializerExpression *GetInitializerExpr(const OmpClause &init); +struct OmpAllocateInfo { + std::vector dirs; + const ExecutionPartConstruct *body{nullptr}; +}; + +OmpAllocateInfo SplitOmpAllocate(const OmpAllocateDirective &x); + } // namespace Fortran::parser::omp #endif // FORTRAN_PARSER_OPENMP_UTILS_H diff --git a/flang/include/flang/Parser/parse-tree.h b/flang/include/flang/Parser/parse-tree.h index 4dd5e84f60dfe..8c7578f7a1941 100644 --- a/flang/include/flang/Parser/parse-tree.h +++ b/flang/include/flang/Parser/parse-tree.h @@ -5151,17 +5151,42 @@ struct OpenMPThreadprivate { CharBlock source; }; -// 2.11.3 allocate -> ALLOCATE (variable-name-list) [clause] -struct OpenMPDeclarativeAllocate { - TUPLE_CLASS_BOILERPLATE(OpenMPDeclarativeAllocate); - CharBlock source; - std::tuple, OmpClauseList> t; +// Ref: [4.5:310-312], [5.0:156-158], [5.1:181-184], [5.2:176-177], +// [6.0:310-312] +// +// allocate-directive -> +// ALLOCATE (variable-list-item...) | // since 4.5 +// ALLOCATE (variable-list-item...) // since 5.0, until 5.1 +// ... +// allocate-stmt +// +// The first form is the "declarative-allocate", and is a declarative +// directive. The second is the "executable-allocate" and is an executable +// directive. The executable form was deprecated in 5.2. +// +// The executable-allocate consists of several ALLOCATE directives. 
Since +// in the parse tree every type corresponding to a directive only corresponds +// to a single directive, the executable form is represented by a sequence +// of nested OmpAlocateDirectives, e.g. +// !$OMP ALLOCATE(x) +// !$OMP ALLOCATE(y) +// ALLOCATE(x, y) +// will become +// OmpAllocateDirective +// |- ALLOCATE(x) // begin directive +// `- OmpAllocateDirective // block +// |- ALLOCATE(y) // begin directive +// `- ALLOCATE(x, y) // block +// +// The block in the declarative-allocate will be empty. +struct OmpAllocateDirective : public OmpBlockConstruct { + INHERITED_TUPLE_CLASS_BOILERPLATE(OmpAllocateDirective, OmpBlockConstruct); }; struct OpenMPDeclarativeConstruct { UNION_CLASS_BOILERPLATE(OpenMPDeclarativeConstruct); CharBlock source; - std::variant ALLOCATE [(variable-name-list)] [clause] -// [ALLOCATE (variable-name-list) [clause] [...]] -// allocate-statement -// clause -> allocator-clause -struct OpenMPExecutableAllocate { - TUPLE_CLASS_BOILERPLATE(OpenMPExecutableAllocate); - CharBlock source; - std::tuple, OmpClauseList, - std::optional>, - Statement> - t; -}; - // Ref: [5.2:180-181], [6.0:315] // // allocators-construct -> @@ -5342,9 +5354,9 @@ struct OpenMPConstruct { UNION_CLASS_BOILERPLATE(OpenMPConstruct); std::variant + OpenMPAtomicConstruct, OmpAllocateDirective, OpenMPDispatchConstruct, + OpenMPUtilityConstruct, OpenMPAllocatorsConstruct, OpenMPAssumeConstruct, + OpenMPCriticalConstruct> u; }; diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp index 71067283d13f7..ad456d89bc432 100644 --- a/flang/lib/Lower/OpenMP/OpenMP.cpp +++ b/flang/lib/Lower/OpenMP/OpenMP.cpp @@ -3503,12 +3503,12 @@ static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable, lower::pft::Evaluation &eval, const parser::OpenMPUtilityConstruct &); -static void -genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable, - semantics::SemanticsContext &semaCtx, lower::pft::Evaluation &eval, - const parser::OpenMPDeclarativeAllocate &declarativeAllocate) { +static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable, + semantics::SemanticsContext &semaCtx, + lower::pft::Evaluation &eval, + const parser::OmpAllocateDirective &allocate) { if (!semaCtx.langOptions().OpenMPSimd) - TODO(converter.getCurrentLocation(), "OpenMPDeclarativeAllocate"); + TODO(converter.getCurrentLocation(), "OmpAllocateDirective"); } static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable, @@ -3899,14 +3899,6 @@ static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable, TODO(converter.getCurrentLocation(), "OpenMPDispatchConstruct"); } -static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable, - semantics::SemanticsContext &semaCtx, - lower::pft::Evaluation &eval, - const parser::OpenMPExecutableAllocate &execAllocConstruct) { - if (!semaCtx.langOptions().OpenMPSimd) - TODO(converter.getCurrentLocation(), "OpenMPExecutableAllocate"); -} - static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable, semantics::SemanticsContext &semaCtx, lower::pft::Evaluation &eval, diff --git a/flang/lib/Parser/openmp-parsers.cpp b/flang/lib/Parser/openmp-parsers.cpp index a9de26ea09ff8..4374acbbe51bf 100644 --- a/flang/lib/Parser/openmp-parsers.cpp +++ b/flang/lib/Parser/openmp-parsers.cpp @@ -1778,6 +1778,31 @@ struct OmpBlockConstructParser { llvm::omp::Directive dir_; }; +struct OmpDeclarativeAllocateParser { + using resultType = OmpAllocateDirective; + + std::optional 
Parse(ParseState &state) const { + constexpr llvm::omp::Directive dir{llvm::omp::Directive::OMPD_allocate}; + if (auto &&begin{attempt(OmpBeginDirectiveParser(dir)).Parse(state)}) { + Block empty; + auto end{maybe(OmpEndDirectiveParser{dir}).Parse(state)}; + return OmpAllocateDirective{std::move(*begin), std::move(empty), + llvm::transformOptional(std::move(*end), + [](auto &&s) { return OmpEndDirective(std::move(s)); })}; + } + return std::nullopt; + } +}; + +struct OmpExecutableAllocateParser { + using resultType = OmpAllocateDirective; + + std::optional Parse(ParseState &state) const { + OmpStatementConstructParser p{llvm::omp::Directive::OMPD_allocate}; + return construct(p).Parse(state); + } +}; + TYPE_PARSER(sourced(construct( OmpStatementConstructParser{llvm::omp::Directive::OMPD_allocators}))) @@ -2044,14 +2069,6 @@ TYPE_PARSER(construct(OmpStylizedExpressionParser{})) TYPE_PARSER(sourced(construct( OmpBlockConstructParser{llvm::omp::Directive::OMPD_critical}))) -// 2.11.3 Executable Allocate directive -TYPE_PARSER(sourced(construct( - verbatim("ALLOCATE"_tok), maybe(parenthesized(Parser{})), - Parser{}, - maybe(nonemptyList(startOmpLine >> Parser{})) / - endOmpLine, - statement(allocateStmt)))) - // 2.8.2 Declare Simd construct TYPE_PARSER(sourced(construct( predicated(Parser{}, @@ -2077,13 +2094,6 @@ TYPE_PARSER(sourced( // IsDirective(llvm::omp::Directive::OMPD_threadprivate)) >= Parser{}))) -// 2.11.3 Declarative Allocate directive -TYPE_PARSER( - sourced(construct(verbatim("ALLOCATE"_tok), - maybe(parenthesized(Parser{})), - Parser{})) / - lookAhead(endOmpLine / !statement(allocateStmt))) - // Assumes Construct TYPE_PARSER(sourced(construct( predicated(OmpDirectiveNameParser{}, @@ -2106,7 +2116,7 @@ TYPE_PARSER( construct( Parser{}) || construct( - Parser{}) || + sourced(OmpDeclarativeAllocateParser{})) || construct( Parser{}) || construct( @@ -2194,6 +2204,8 @@ TYPE_CONTEXT_PARSER("OpenMP construct"_en_US, withMessage("expected OpenMP construct"_err_en_US, first(construct(Parser{}), construct(Parser{}), + construct( + sourced(OmpExecutableAllocateParser{})), construct(Parser{}), // OmpBlockConstruct is attempted before // OpenMPStandaloneConstruct to resolve !$OMP ORDERED @@ -2201,9 +2213,7 @@ TYPE_CONTEXT_PARSER("OpenMP construct"_en_US, construct(Parser{}), construct(Parser{}), construct(Parser{}), - construct(Parser{}), construct(Parser{}), - construct(Parser{}), construct(Parser{}), construct(Parser{})))) diff --git a/flang/lib/Parser/openmp-utils.cpp b/flang/lib/Parser/openmp-utils.cpp index 95ad3f60770f5..b9d3763cdd06d 100644 --- a/flang/lib/Parser/openmp-utils.cpp +++ b/flang/lib/Parser/openmp-utils.cpp @@ -22,6 +22,25 @@ namespace Fortran::parser::omp { +const OpenMPDeclarativeConstruct *GetOmp(const DeclarationConstruct &x) { + if (auto *y = std::get_if(&x.u)) { + if (auto *z{std::get_if>( + &y->u)}) { + return &z->value(); + } + } + return nullptr; +} + +const OpenMPConstruct *GetOmp(const ExecutionPartConstruct &x) { + if (auto *y{std::get_if(&x.u)}) { + if (auto *z{std::get_if>(&y->u)}) { + return &z->value(); + } + } + return nullptr; +} + const OmpObjectList *GetOmpObjectList(const OmpClause &clause) { // Clauses with OmpObjectList as its data member using MemberObjectListClauses = std::tuple(x.t)}; + if (!body.empty()) { + if (auto *omp{GetOmp(body.front())}) { + if (auto *ad{std::get_if(&omp->u)}) { + return SplitOmpAllocateHelper(n, *ad); + } + } + n.body = &body.front(); + } +} + +OmpAllocateInfo SplitOmpAllocate(const OmpAllocateDirective &x) { + 
OmpAllocateInfo info; + SplitOmpAllocateHelper(info, x); + return info; +} + } // namespace Fortran::parser::omp diff --git a/flang/lib/Parser/unparse.cpp b/flang/lib/Parser/unparse.cpp index 9255c4e1136bc..84123030195e9 100644 --- a/flang/lib/Parser/unparse.cpp +++ b/flang/lib/Parser/unparse.cpp @@ -2482,30 +2482,8 @@ class UnparseVisitor { Unparse(static_cast(x)); } - void Unparse(const OpenMPExecutableAllocate &x) { - const auto &fields = - std::get>>( - x.t); - if (fields) { - for (const auto &decl : *fields) { - Walk(decl); - } - } - BeginOpenMP(); - Word("!$OMP ALLOCATE"); - Walk(" (", std::get>(x.t), ")"); - Walk(std::get(x.t)); - Put("\n"); - EndOpenMP(); - Walk(std::get>(x.t)); - } - void Unparse(const OpenMPDeclarativeAllocate &x) { - BeginOpenMP(); - Word("!$OMP ALLOCATE"); - Walk(" (", std::get>(x.t), ")"); - Walk(std::get(x.t)); - Put("\n"); - EndOpenMP(); + void Unparse(const OmpAllocateDirective &x) { + Unparse(static_cast(x)); } void Unparse(const OpenMPAllocatorsConstruct &x) { Unparse(static_cast(x)); diff --git a/flang/lib/Semantics/canonicalize-omp.cpp b/flang/lib/Semantics/canonicalize-omp.cpp index c884658bf464a..a11c5250b1ab4 100644 --- a/flang/lib/Semantics/canonicalize-omp.cpp +++ b/flang/lib/Semantics/canonicalize-omp.cpp @@ -51,8 +51,6 @@ class CanonicalizationOfOmp { } // Block list } - void Post(parser::ExecutionPart &body) { RewriteOmpAllocations(body); } - // Pre-visit all constructs that have both a specification part and // an execution part, and store the connection between the two. bool Pre(parser::BlockConstruct &x) { @@ -88,6 +86,7 @@ class CanonicalizationOfOmp { void Post(parser::SpecificationPart &spec) { CanonicalizeUtilityConstructs(spec); + CanonicalizeAllocateDirectives(spec); } void Post(parser::OmpMapClause &map) { CanonicalizeMapModifiers(map); } @@ -239,33 +238,138 @@ class CanonicalizationOfOmp { } } - void RewriteOmpAllocations(parser::ExecutionPart &body) { - // Rewrite leading declarative allocations so they are nested - // within their respective executable allocate directive - // - // Original: - // ExecutionPartConstruct -> OpenMPDeclarativeAllocate - // ExecutionPartConstruct -> OpenMPDeclarativeAllocate - // ExecutionPartConstruct -> OpenMPExecutableAllocate - // - // After rewriting: - // ExecutionPartConstruct -> OpenMPExecutableAllocate - // ExecutionPartConstruct -> OpenMPDeclarativeAllocate - // ExecutionPartConstruct -> OpenMPDeclarativeAllocate - for (auto it = body.v.rbegin(); it != body.v.rend();) { - if (auto *exec = GetOmpIf(*(it++))) { - parser::OpenMPDeclarativeAllocate *decl; - std::list subAllocates; - while (it != body.v.rend() && - (decl = GetOmpIf(*it))) { - subAllocates.push_front(std::move(*decl)); - it = decltype(it)(body.v.erase(std::next(it).base())); + // Canonicalization of allocate directives + // + // In OpenMP 5.0 and 5.1 the allocate directive could either be a declarative + // one or an executable one. As usual in such cases, this poses a problem + // when the directive appears at the boundary between the specification part + // and the execution part. + // The executable form can actually consist of several adjacent directives, + // whereas the declarative form is always standalone. Additionally, the + // executable form must be associated with an allocate statement. 
+ // + // The parser tries to parse declarative statements first, so in the + // following case, the two directives will be declarative, even though + // they should be treated as a single executable form: + // integer, allocatable :: x, y ! Specification + // !$omp allocate(x) + // !$omp allocate(y) + // allocate(x, y) ! Execution + // + void CanonicalizeAllocateDirectives(parser::SpecificationPart &spec) { + auto found = blockForSpec_.find(&spec); + if (found == blockForSpec_.end()) { + // There is no corresponding execution part, so there is nothing to do. + return; + } + parser::Block &block = *found->second; + + auto isAllocateStmt = [](const parser::ExecutionPartConstruct &epc) { + if (auto *ec = std::get_if(&epc.u)) { + if (auto *as = + std::get_if>(&ec->u)) { + return std::holds_alternative< + common::Indirection>(as->statement.u); + } + } + return false; + }; + + if (!block.empty() && isAllocateStmt(block.front())) { + // There are two places where an OpenMP declarative construct can + // show up in the tuple in specification part: + // (1) in std::list, or + // (2) in std::list. + // The case (1) is only possible if the list (2) is empty. + + auto &omps = + std::get>(spec.t); + auto &decls = std::get>(spec.t); + + if (!decls.empty()) { + MakeExecutableAllocateFromDecls(decls, block); + } else { + MakeExecutableAllocateFromOmps(omps, block); + } + } + } + + parser::ExecutionPartConstruct EmbedInExec( + parser::OmpAllocateDirective *alo, parser::ExecutionPartConstruct &&epc) { + // Nest current epc inside the allocate directive. + std::get(alo->t).push_front(std::move(epc)); + // Set the new epc to be the ExecutionPartConstruct made from + // the allocate directive. + parser::OpenMPConstruct opc(std::move(*alo)); + common::Indirection ind(std::move(opc)); + parser::ExecutableConstruct ec(std::move(ind)); + return parser::ExecutionPartConstruct(std::move(ec)); + } + + void MakeExecutableAllocateFromDecls( + std::list &decls, parser::Block &body) { + using OpenMPDeclarativeConstruct = + common::Indirection; + + auto getAllocate = [](parser::DeclarationConstruct *dc) { + if (auto *sc = std::get_if(&dc->u)) { + if (auto *odc = std::get_if(&sc->u)) { + if (auto *alo = + std::get_if(&odc->value().u)) { + return alo; + } + } + } + return static_cast(nullptr); + }; + + std::list::reverse_iterator rlast = [&]() { + for (auto rit = decls.rbegin(), rend = decls.rend(); rit != rend; ++rit) { + if (getAllocate(&*rit) == nullptr) { + return rit; } - if (!subAllocates.empty()) { - std::get>>( - exec->t) = {std::move(subAllocates)}; + } + return decls.rend(); + }(); + + if (rlast != decls.rbegin()) { + // We have already checked that the first statement in body is + // ALLOCATE. 
+ parser::ExecutionPartConstruct epc(std::move(body.front())); + for (auto rit = decls.rbegin(); rit != rlast; ++rit) { + epc = EmbedInExec(getAllocate(&*rit), std::move(epc)); + } + + body.pop_front(); + body.push_front(std::move(epc)); + decls.erase(rlast.base(), decls.end()); + } + } + + void MakeExecutableAllocateFromOmps( + std::list &omps, + parser::Block &body) { + using OpenMPDeclarativeConstruct = parser::OpenMPDeclarativeConstruct; + + std::list::reverse_iterator rlast = [&]() { + for (auto rit = omps.rbegin(), rend = omps.rend(); rit != rend; ++rit) { + if (!std::holds_alternative(rit->u)) { + return rit; } } + return omps.rend(); + }(); + + if (rlast != omps.rbegin()) { + parser::ExecutionPartConstruct epc(std::move(body.front())); + for (auto rit = omps.rbegin(); rit != rlast; ++rit) { + epc = EmbedInExec( + &std::get(rit->u), std::move(epc)); + } + + body.pop_front(); + body.push_front(std::move(epc)); + omps.erase(rlast.base(), omps.end()); } } diff --git a/flang/lib/Semantics/check-omp-structure.cpp b/flang/lib/Semantics/check-omp-structure.cpp index 3ea8e5b8cd2b0..e7e3f4d886b34 100644 --- a/flang/lib/Semantics/check-omp-structure.cpp +++ b/flang/lib/Semantics/check-omp-structure.cpp @@ -179,6 +179,22 @@ void OmpStructureChecker::Leave(const parser::BlockConstruct &x) { } } +void OmpStructureChecker::Enter(const parser::SpecificationPart &) { + partStack_.push_back(PartKind::SpecificationPart); +} + +void OmpStructureChecker::Leave(const parser::SpecificationPart &) { + partStack_.pop_back(); +} + +void OmpStructureChecker::Enter(const parser::ExecutionPart &) { + partStack_.push_back(PartKind::ExecutionPart); +} + +void OmpStructureChecker::Leave(const parser::ExecutionPart &) { + partStack_.pop_back(); +} + // Use when clause falls under 'struct OmpClause' in 'parse-tree.h'. 
#define CHECK_SIMPLE_CLAUSE(X, Y) \ void OmpStructureChecker::Enter(const parser::OmpClause::X &) { \ @@ -720,18 +736,10 @@ template struct DirectiveSpellingVisitor { return std::get(t).DirName(); } - bool Pre(const parser::OpenMPDeclarativeAllocate &x) { - checker_(std::get(x.t).source, Directive::OMPD_allocate); - return false; - } bool Pre(const parser::OpenMPDispatchConstruct &x) { checker_(GetDirName(x.t).source, Directive::OMPD_dispatch); return false; } - bool Pre(const parser::OpenMPExecutableAllocate &x) { - checker_(std::get(x.t).source, Directive::OMPD_allocate); - return false; - } bool Pre(const parser::OpenMPAllocatorsConstruct &x) { checker_(GetDirName(x.t).source, Directive::OMPD_allocators); return false; @@ -1667,11 +1675,6 @@ void OmpStructureChecker::Leave(const parser::OpenMPRequiresConstruct &) { dirContext_.pop_back(); } -static std::pair -getAllocateStmtAndSource(const parser::Statement &stmt) { - return {&stmt.statement, stmt.source}; -} - static std::pair getAllocateStmtAndSource(const parser::ExecutionPartConstruct *epc) { if (SourcedActionStmt as{GetActionStmt(epc)}) { @@ -1699,19 +1702,12 @@ static UnorderedSymbolSet GetNonComponentSymbols( return symbols; } -static const parser::OmpObjectList &GetObjectsOrEmpty( - const std::optional &maybeObjects) { - static parser::OmpObjectList empty{std::list{}}; - if (maybeObjects) { - return *maybeObjects; - } - return empty; -} +void OmpStructureChecker::CheckIndividualAllocateDirective( + const parser::OmpAllocateDirective &x, bool isExecutable) { + const parser::OmpDirectiveSpecification &beginSpec{x.BeginDir()}; + const parser::OmpDirectiveName &dirName{beginSpec.DirName()}; -void OmpStructureChecker::CheckAllocateDirective(parser::CharBlock source, - const parser::OmpObjectList &objects, - const parser::OmpClauseList &clauses) { - const Scope &thisScope{context_.FindScope(source)}; + const Scope &thisScope{context_.FindScope(dirName.source)}; auto maybeHasPredefinedAllocator{[&](const parser::OmpClause *calloc) { // Return "true" if the ALLOCATOR clause was provided with an argument @@ -1740,7 +1736,7 @@ void OmpStructureChecker::CheckAllocateDirective(parser::CharBlock source, const auto *allocator{[&]() { // Can't use FindClause in Enter (because clauses haven't been visited // yet). - for (const parser::OmpClause &c : clauses.v) { + for (const parser::OmpClause &c : beginSpec.Clauses().v) { if (c.Id() == llvm::omp::Clause::OMPC_allocator) { return &c; } @@ -1752,7 +1748,7 @@ void OmpStructureChecker::CheckAllocateDirective(parser::CharBlock source, bool hasDynAllocators{ HasRequires(llvm::omp::Clause::OMPC_dynamic_allocators)}; if (!allocator && !hasDynAllocators) { - context_.Say(source, + context_.Say(dirName.source, "An ALLOCATE directive in a TARGET region must specify an ALLOCATOR clause or REQUIRES(DYNAMIC_ALLOCATORS) must be specified"_err_en_US); } } @@ -1766,7 +1762,7 @@ void OmpStructureChecker::CheckAllocateDirective(parser::CharBlock source, : "a named common block or has SAVE attribute"}; auto checkSymbol{[&](const Symbol &symbol, parser::CharBlock source) { - if (!inExecutableAllocate_) { + if (!isExecutable) { // For structure members, the scope is the derived type, which is // never "this" scope. Ignore this check for members, they will be // flagged anyway. 
@@ -1802,37 +1798,130 @@ void OmpStructureChecker::CheckAllocateDirective(parser::CharBlock source, } }}; - for (const parser::OmpObject &object : objects.v) { - parser::CharBlock objSource{[&]() { - if (auto &&maybeSource{GetObjectSource(object)}) { - return *maybeSource; - } - return source; - }()}; - if (const Symbol *symbol{GetObjectSymbol(object)}) { + for (const parser::OmpArgument &arg : beginSpec.Arguments().v) { + const parser::OmpObject *object{GetArgumentObject(arg)}; + if (!object) { + context_.Say(arg.source, + "An argument to ALLOCATE directive must be a variable list item"_err_en_US); + continue; + } + + if (const Symbol *symbol{GetObjectSymbol(*object)}) { if (!IsTypeParamInquiry(*symbol)) { - checkSymbol(*symbol, objSource); + checkSymbol(*symbol, arg.source); + } + CheckVarIsNotPartOfAnotherVar(dirName.source, *object); + } + } +} + +void OmpStructureChecker::CheckExecutableAllocateDirective( + const parser::OmpAllocateDirective &x) { + parser::omp::OmpAllocateInfo info{SplitOmpAllocate(x)}; + + auto [allocStmt, allocSource]{getAllocateStmtAndSource(info.body)}; + if (!allocStmt) { + // This has been diagnosed already. + return; + } + + UnorderedSymbolSet allocateSyms{GetNonComponentSymbols(*allocStmt)}; + SymbolSourceMap directiveSyms; + bool hasEmptyList{false}; + + for (const parser::OmpAllocateDirective *ompAlloc : info.dirs) { + const parser::OmpDirectiveSpecification &spec{DEREF(ompAlloc).BeginDir()}; + if (spec.Arguments().v.empty()) { + if (hasEmptyList && info.dirs.size() > 1) { + context_.Say(spec.DirName().source, + "If multiple directives are present in an executable ALLOCATE directive, at most one of them may specify no list items"_err_en_US); + } + hasEmptyList = true; + } + for (const parser::OmpArgument &arg : spec.Arguments().v) { + if (auto *sym{GetArgumentSymbol(arg)}) { + // Ignore these checks for structure members. 
They are not allowed + // in the first place, so don't tell the users that they need to + // be specified somewhere, + if (IsStructureComponent(*sym)) { + continue; + } + if (auto f{directiveSyms.find(sym)}; f != directiveSyms.end()) { + parser::MessageFormattedText txt( + "A list item on an executable ALLOCATE may only be specified once"_err_en_US); + parser::Message message(arg.source, txt); + message.Attach(f->second, "The list item was specified here"_en_US); + context_.Say(std::move(message)); + } else { + directiveSyms.insert(std::make_pair(sym, arg.source)); + } + + if (auto f{allocateSyms.find(*sym)}; f == allocateSyms.end()) { + context_ + .Say(arg.source, + "A list item on an executable ALLOCATE must be specified on the associated ALLOCATE statement"_err_en_US) + .Attach(allocSource, "The ALLOCATE statement"_en_US); + } } - CheckVarIsNotPartOfAnotherVar(source, object); } } } -void OmpStructureChecker::Enter(const parser::OpenMPDeclarativeAllocate &x) { - const auto &dir{std::get(x.t)}; - PushContextAndClauseSets(dir.source, llvm::omp::Directive::OMPD_allocate); +void OmpStructureChecker::Enter(const parser::OmpAllocateDirective &x) { + const parser::OmpDirectiveSpecification &beginSpec{x.BeginDir()}; + const parser::OmpDirectiveName &dirName{beginSpec.DirName()}; + PushContextAndClauseSets(dirName.source, dirName.v); + ++allocateDirectiveLevel; + + bool isExecutable{partStack_.back() == PartKind::ExecutionPart}; - if (!inExecutableAllocate_) { - const auto &dir{std::get(x.t)}; - const auto &clauses{std::get(x.t)}; - const auto &objects{ - GetObjectsOrEmpty(std::get>(x.t))}; + unsigned version{context_.langOptions().OpenMPVersion}; + if (isExecutable && allocateDirectiveLevel == 1 && version >= 52) { + context_.Warn(common::UsageWarning::OpenMPUsage, dirName.source, + "The executable form of the OpenMP ALLOCATE directive has been deprecated, please use ALLOCATORS instead"_warn_en_US); + } - CheckAllocateDirective(dir.source, objects, clauses); + CheckIndividualAllocateDirective(x, isExecutable); + + if (isExecutable) { + auto isOmpAllocate{[](const parser::ExecutionPartConstruct &epc) { + if (auto *omp{GetOmp(epc)}) { + auto odn{GetOmpDirectiveName(*omp)}; + return odn.v == llvm::omp::Directive::OMPD_allocate; + } + return false; + }}; + + auto &body{std::get(x.t)}; + // The parser should put at most one statement in the body. 
+ assert(body.size() <= 1 && "Multiple statements in allocate"); + if (body.empty()) { + context_.Say(dirName.source, + "An executable ALLOCATE directive must be associated with an ALLOCATE statement"_err_en_US); + } else { + const parser::ExecutionPartConstruct &first{body.front()}; + auto [allocStmt, _]{getAllocateStmtAndSource(&body.front())}; + if (!isOmpAllocate(first) && !allocStmt) { + parser::CharBlock source{[&]() { + if (auto &&maybeSource{parser::GetSource(first)}) { + return *maybeSource; + } + return dirName.source; + }()}; + context_.Say(source, + "The statement associated with executable ALLOCATE directive must be an ALLOCATE statement"_err_en_US); + } + } } } -void OmpStructureChecker::Leave(const parser::OpenMPDeclarativeAllocate &x) { +void OmpStructureChecker::Leave(const parser::OmpAllocateDirective &x) { + bool isExecutable{partStack_.back() == PartKind::ExecutionPart}; + if (isExecutable && allocateDirectiveLevel == 1) { + CheckExecutableAllocateDirective(x); + } + + --allocateDirectiveLevel; dirContext_.pop_back(); } @@ -2135,112 +2224,6 @@ void OmpStructureChecker::Enter(const parser::OmpClause::At &x) { } } -void OmpStructureChecker::Enter(const parser::OpenMPExecutableAllocate &x) { - inExecutableAllocate_ = true; - const auto &dir{std::get(x.t)}; - PushContextAndClauseSets(dir.source, llvm::omp::Directive::OMPD_allocate); - - unsigned version{context_.langOptions().OpenMPVersion}; - if (version >= 52) { - context_.Warn(common::UsageWarning::OpenMPUsage, x.source, - "The executable form of the OpenMP ALLOCATE directive has been deprecated, please use ALLOCATORS instead"_warn_en_US); - } - - auto &objects{ - GetObjectsOrEmpty(std::get>(x.t))}; - auto &clauses{std::get(x.t)}; - - CheckAllocateDirective( - std::get(x.t).source, objects, clauses); - - if (const auto &subDirs{ - std::get>>( - x.t)}) { - for (const auto &dalloc : *subDirs) { - const auto &dir{std::get(x.t)}; - const auto &clauses{std::get(dalloc.t)}; - const auto &objects{GetObjectsOrEmpty( - std::get>(dalloc.t))}; - CheckAllocateDirective(dir.source, objects, clauses); - } - } -} - -void OmpStructureChecker::Leave(const parser::OpenMPExecutableAllocate &x) { - auto [allocStmt, allocSource]{getAllocateStmtAndSource( - std::get>(x.t))}; - - UnorderedSymbolSet allocateSyms{GetNonComponentSymbols(*allocStmt)}; - SymbolSourceMap directiveSyms; - auto &objects{ - GetObjectsOrEmpty(std::get>(x.t))}; - auto emptyListCount{static_cast(objects.v.empty())}; - auto checkObjects{[&](const parser::OmpObjectList &objects, - parser::CharBlock dirSource, - parser::CharBlock allocSource) { - for (const parser::OmpObject &object : objects.v) { - parser::CharBlock objSource{[&]() { - if (auto &&maybeSource{GetObjectSource(object)}) { - return *maybeSource; - } - return dirSource; - }()}; - if (auto *sym{GetObjectSymbol(object)}) { - // Ignore these checks for structure members. 
They are not allowed - // in the first place, so don't tell the users that they nened to - // be specified somewhere, - if (IsStructureComponent(*sym)) { - continue; - } - if (auto f{directiveSyms.find(sym)}; f != directiveSyms.end()) { - parser::MessageFormattedText txt( - "A list item on an executable ALLOCATE may only be specified once"_err_en_US); - parser::Message message(objSource, txt); - message.Attach(f->second, "The list item was specified here"_en_US); - context_.Say(std::move(message)); - } else { - directiveSyms.insert(std::make_pair(sym, objSource)); - } - - if (auto f{allocateSyms.find(*sym)}; f == allocateSyms.end()) { - context_ - .Say(objSource, - "A list item on an executable ALLOCATE must be specified on the associated ALLOCATE statement"_err_en_US) - .Attach(allocSource, "The ALLOCATE statement"_en_US); - } - } - } - }}; - - checkObjects(objects, std::get(x.t).source, allocSource); - - const auto &subDirs{ - std::get>>( - x.t)}; - if (!subDirs) { - inExecutableAllocate_ = false; - dirContext_.pop_back(); - return; - } - - for (const parser::OpenMPDeclarativeAllocate &ompAlloc : *subDirs) { - parser::CharBlock dirSource{std::get(ompAlloc.t).source}; - auto &objects{GetObjectsOrEmpty( - std::get>(ompAlloc.t))}; - if (objects.v.empty()) { - // Only show the message once per construct. - if (++emptyListCount == 2 && subDirs->size() >= 1) { - context_.Say(dirSource, - "If multiple directives are present in an executable ALLOCATE directive, at most one of them may specify no list items"_err_en_US); - } - } - checkObjects(objects, dirSource, allocSource); - } - - inExecutableAllocate_ = false; - dirContext_.pop_back(); -} - void OmpStructureChecker::Enter(const parser::OpenMPAllocatorsConstruct &x) { const parser::OmpDirectiveSpecification &beginSpec{x.BeginDir()}; const parser::OmpDirectiveName &dirName{beginSpec.DirName()}; diff --git a/flang/lib/Semantics/check-omp-structure.h b/flang/lib/Semantics/check-omp-structure.h index 6feb1d149c4fd..1b84bc5dda471 100644 --- a/flang/lib/Semantics/check-omp-structure.h +++ b/flang/lib/Semantics/check-omp-structure.h @@ -82,6 +82,11 @@ class OmpStructureChecker : public OmpStructureCheckerBase { bool Enter(const parser::BlockConstruct &); void Leave(const parser::BlockConstruct &); + void Enter(const parser::SpecificationPart &); + void Leave(const parser::SpecificationPart &); + void Enter(const parser::ExecutionPart &); + void Leave(const parser::ExecutionPart &); + void Enter(const parser::OpenMPConstruct &); void Leave(const parser::OpenMPConstruct &); void Enter(const parser::OpenMPInteropConstruct &); @@ -113,8 +118,8 @@ class OmpStructureChecker : public OmpStructureCheckerBase { void Leave(const parser::OmpDeclareVariantDirective &); void Enter(const parser::OpenMPDeclareSimdConstruct &); void Leave(const parser::OpenMPDeclareSimdConstruct &); - void Enter(const parser::OpenMPDeclarativeAllocate &); - void Leave(const parser::OpenMPDeclarativeAllocate &); + void Enter(const parser::OmpAllocateDirective &); + void Leave(const parser::OmpAllocateDirective &); void Enter(const parser::OpenMPDeclareMapperConstruct &); void Leave(const parser::OpenMPDeclareMapperConstruct &); void Enter(const parser::OpenMPDeclareReductionConstruct &); @@ -129,8 +134,6 @@ class OmpStructureChecker : public OmpStructureCheckerBase { void Leave(const parser::OmpErrorDirective &); void Enter(const parser::OmpNothingDirective &); void Leave(const parser::OmpNothingDirective &); - void Enter(const parser::OpenMPExecutableAllocate &); - void Leave(const 
parser::OpenMPExecutableAllocate &); void Enter(const parser::OpenMPAllocatorsConstruct &); void Leave(const parser::OpenMPAllocatorsConstruct &); void Enter(const parser::OpenMPRequiresConstruct &); @@ -263,9 +266,9 @@ class OmpStructureChecker : public OmpStructureCheckerBase { bool CheckTargetBlockOnlyTeams(const parser::Block &); void CheckWorkshareBlockStmts(const parser::Block &, parser::CharBlock); void CheckWorkdistributeBlockStmts(const parser::Block &, parser::CharBlock); - void CheckAllocateDirective(parser::CharBlock source, - const parser::OmpObjectList &objects, - const parser::OmpClauseList &clauses); + void CheckIndividualAllocateDirective( + const parser::OmpAllocateDirective &x, bool isExecutable); + void CheckExecutableAllocateDirective(const parser::OmpAllocateDirective &x); void CheckIteratorRange(const parser::OmpIteratorSpecifier &x); void CheckIteratorModifier(const parser::OmpIterator &x); @@ -373,7 +376,7 @@ class OmpStructureChecker : public OmpStructureCheckerBase { }; int directiveNest_[LastType + 1] = {0}; - bool inExecutableAllocate_{false}; + int allocateDirectiveLevel{0}; parser::CharBlock visitedAtomicSource_; SymbolSourceMap deferredNonVariables_; @@ -382,6 +385,14 @@ class OmpStructureChecker : public OmpStructureCheckerBase { std::vector loopStack_; // Scopes for scoping units. std::vector scopeStack_; + + enum class PartKind : int { + // There are also other "parts", such as internal-subprogram-part, etc, + // but we're keeping track of these two for now. + SpecificationPart, + ExecutionPart, + }; + std::vector partStack_; }; /// Find a duplicate entry in the range, and return an iterator to it. diff --git a/flang/lib/Semantics/resolve-directives.cpp b/flang/lib/Semantics/resolve-directives.cpp index 03c8cb0065fd8..deb57e005a352 100644 --- a/flang/lib/Semantics/resolve-directives.cpp +++ b/flang/lib/Semantics/resolve-directives.cpp @@ -415,6 +415,18 @@ class OmpAttributeVisitor : DirectiveAttributeVisitor { return true; } + bool Pre(const parser::SpecificationPart &) { + partStack_.push_back(PartKind::SpecificationPart); + return true; + } + void Post(const parser::SpecificationPart &) { partStack_.pop_back(); } + + bool Pre(const parser::ExecutionPart &) { + partStack_.push_back(PartKind::ExecutionPart); + return true; + } + void Post(const parser::ExecutionPart &) { partStack_.pop_back(); } + bool Pre(const parser::InternalSubprogram &) { // Clear the labels being tracked in the previous scope ClearLabels(); @@ -639,8 +651,7 @@ class OmpAttributeVisitor : DirectiveAttributeVisitor { bool Pre(const parser::OpenMPThreadprivate &); void Post(const parser::OpenMPThreadprivate &) { PopContext(); } - bool Pre(const parser::OpenMPDeclarativeAllocate &); - void Post(const parser::OpenMPDeclarativeAllocate &) { PopContext(); } + bool Pre(const parser::OmpAllocateDirective &); bool Pre(const parser::OpenMPAssumeConstruct &); void Post(const parser::OpenMPAssumeConstruct &) { PopContext(); } @@ -651,9 +662,6 @@ class OmpAttributeVisitor : DirectiveAttributeVisitor { bool Pre(const parser::OpenMPDispatchConstruct &); void Post(const parser::OpenMPDispatchConstruct &) { PopContext(); } - bool Pre(const parser::OpenMPExecutableAllocate &); - void Post(const parser::OpenMPExecutableAllocate &); - bool Pre(const parser::OpenMPAllocatorsConstruct &); void Post(const parser::OpenMPAllocatorsConstruct &); @@ -998,6 +1006,14 @@ class OmpAttributeVisitor : DirectiveAttributeVisitor { targetLabels_; parser::CharBlock currentStatementSource_; + enum class PartKind : int { 
+ // There are also other "parts", such as internal-subprogram-part, etc, + // but we're keeping track of these two for now. + SpecificationPart, + ExecutionPart, + }; + std::vector partStack_; + void AddAllocateName(const parser::Name *&object) { allocateNames_.push_back(object); } @@ -2558,11 +2574,24 @@ bool OmpAttributeVisitor::Pre(const parser::OpenMPThreadprivate &x) { return true; } -bool OmpAttributeVisitor::Pre(const parser::OpenMPDeclarativeAllocate &x) { +bool OmpAttributeVisitor::Pre(const parser::OmpAllocateDirective &x) { PushContext(x.source, llvm::omp::Directive::OMPD_allocate); - if (const auto &list{std::get>(x.t)}) { - ResolveOmpObjectList(*list, Symbol::Flag::OmpDeclarativeAllocateDirective); + assert(!partStack_.empty() && "Misplaced directive"); + + auto ompFlag{partStack_.back() == PartKind::SpecificationPart + ? Symbol::Flag::OmpDeclarativeAllocateDirective + : Symbol::Flag::OmpExecutableAllocateDirective}; + + parser::omp::OmpAllocateInfo info{parser::omp::SplitOmpAllocate(x)}; + for (const parser::OmpAllocateDirective *ad : info.dirs) { + for (const parser::OmpArgument &arg : ad->BeginDir().Arguments().v) { + if (auto *object{omp::GetArgumentObject(arg)}) { + ResolveOmpObject(*object, ompFlag); + } + } } + + PopContext(); return false; } @@ -2581,15 +2610,6 @@ bool OmpAttributeVisitor::Pre(const parser::OpenMPDispatchConstruct &x) { return true; } -bool OmpAttributeVisitor::Pre(const parser::OpenMPExecutableAllocate &x) { - PushContext(x.source, llvm::omp::Directive::OMPD_allocate); - const auto &list{std::get>(x.t)}; - if (list) { - ResolveOmpObjectList(*list, Symbol::Flag::OmpExecutableAllocateDirective); - } - return true; -} - bool OmpAttributeVisitor::Pre(const parser::OpenMPAllocatorsConstruct &x) { const parser::OmpDirectiveSpecification &dirSpec{x.BeginDir()}; PushContext(x.source, dirSpec.DirId()); @@ -2661,10 +2681,6 @@ bool OmpAttributeVisitor::IsNestedInDirective(llvm::omp::Directive directive) { return false; } -void OmpAttributeVisitor::Post(const parser::OpenMPExecutableAllocate &x) { - PopContext(); -} - void OmpAttributeVisitor::Post(const parser::OpenMPAllocatorsConstruct &x) { PopContext(); } diff --git a/flang/lib/Semantics/resolve-names.cpp b/flang/lib/Semantics/resolve-names.cpp index 220f1c96b9823..a2062ef28d52c 100644 --- a/flang/lib/Semantics/resolve-names.cpp +++ b/flang/lib/Semantics/resolve-names.cpp @@ -1700,12 +1700,12 @@ class OmpVisitor : public virtual DeclarationVisitor { void Post(const parser::OpenMPDeclareTargetConstruct &) { SkipImplicitTyping(false); } - bool Pre(const parser::OpenMPDeclarativeAllocate &x) { + bool Pre(const parser::OmpAllocateDirective &x) { AddOmpSourceRange(x.source); SkipImplicitTyping(true); return true; } - void Post(const parser::OpenMPDeclarativeAllocate &) { + void Post(const parser::OmpAllocateDirective &) { SkipImplicitTyping(false); messageHandler().set_currStmtSource(std::nullopt); } diff --git a/flang/test/Lower/OpenMP/Todo/omp-declarative-allocate-align.f90 b/flang/test/Lower/OpenMP/Todo/omp-declarative-allocate-align.f90 index 8daf20e1ae400..fec146ac70313 100644 --- a/flang/test/Lower/OpenMP/Todo/omp-declarative-allocate-align.f90 +++ b/flang/test/Lower/OpenMP/Todo/omp-declarative-allocate-align.f90 @@ -5,6 +5,6 @@ program main integer :: x - ! CHECK: not yet implemented: OpenMPDeclarativeAllocate + ! 
CHECK: not yet implemented: OmpAllocateDirective !$omp allocate(x) align(32) end diff --git a/flang/test/Lower/OpenMP/Todo/omp-declarative-allocate.f90 b/flang/test/Lower/OpenMP/Todo/omp-declarative-allocate.f90 index e83b433d0fda0..3307eb2505b71 100644 --- a/flang/test/Lower/OpenMP/Todo/omp-declarative-allocate.f90 +++ b/flang/test/Lower/OpenMP/Todo/omp-declarative-allocate.f90 @@ -5,6 +5,6 @@ program main integer :: x, y - ! CHECK: not yet implemented: OpenMPDeclarativeAllocate + ! CHECK: not yet implemented: OmpAllocateDirective !$omp allocate(x, y) end diff --git a/flang/test/Parser/OpenMP/allocate-align-tree.f90 b/flang/test/Parser/OpenMP/allocate-align-tree.f90 index 0d247cd1ed945..d799aa10a82ff 100644 --- a/flang/test/Parser/OpenMP/allocate-align-tree.f90 +++ b/flang/test/Parser/OpenMP/allocate-align-tree.f90 @@ -16,27 +16,33 @@ program allocate_align_tree allocate(j(z), xarray(t)) end program allocate_align_tree -!CHECK: | | DeclarationConstruct -> SpecificationConstruct -> TypeDeclarationStmt -!CHECK-NEXT: | | | DeclarationTypeSpec -> IntrinsicTypeSpec -> IntegerTypeSpec -> -!CHECK-NEXT: | | | AttrSpec -> Allocatable -!CHECK-NEXT: | | | EntityDecl -!CHECK-NEXT: | | | | Name = 'j' +!CHECK: DeclarationConstruct -> SpecificationConstruct -> TypeDeclarationStmt +!CHECK-NEXT: | DeclarationTypeSpec -> IntrinsicTypeSpec -> IntegerTypeSpec -> +!CHECK-NEXT: | AttrSpec -> Allocatable +!CHECK-NEXT: | EntityDecl +!CHECK-NEXT: | | Name = 'j' +!CHECK: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OmpAllocateDirective +!CHECK-NEXT: | OmpBeginDirective +!CHECK-NEXT: | | OmpDirectiveName -> llvm::omp::Directive = allocate +!CHECK-NEXT: | | OmpArgumentList -> OmpArgument -> OmpLocator -> OmpObject -> Designator -> DataRef -> Name = 'j' +!CHECK-NEXT: | | OmpClauseList -> OmpClause -> Align -> OmpAlignClause -> Scalar -> Integer -> Constant -> Expr = '16_4' +!CHECK-NEXT: | | | LiteralConstant -> IntLiteralConstant = '16' +!CHECK-NEXT: | | Flags = None +!CHECK-NEXT: | Block +!CHECK-NEXT: | | ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OmpAllocateDirective +!CHECK-NEXT: | | | OmpBeginDirective +!CHECK-NEXT: | | | | OmpDirectiveName -> llvm::omp::Directive = allocate +!CHECK-NEXT: | | | | OmpArgumentList -> OmpArgument -> OmpLocator -> OmpObject -> Designator -> DataRef -> Name = 'xarray' +!CHECK-NEXT: | | | | OmpClauseList -> OmpClause -> Align -> OmpAlignClause -> Scalar -> Integer -> Constant -> Expr = '32_4' +!CHECK-NEXT: | | | | | LiteralConstant -> IntLiteralConstant = '32' +!CHECK-NEXT: | | | | OmpClause -> Allocator -> Scalar -> Integer -> Expr = '2_8' +!CHECK-NEXT: | | | | | Designator -> DataRef -> Name = 'omp_large_cap_mem_alloc' +!CHECK-NEXT: | | | | Flags = None +!CHECK-NEXT: | | | Block +!CHECK-NEXT: | | | | ExecutionPartConstruct -> ExecutableConstruct -> ActionStmt -> AllocateStmt -!CHECK: | | ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPExecutableAllocate -!CHECK-NEXT: | | | Verbatim -!CHECK-NEXT: | | | OmpObjectList -> OmpObject -> Designator -> DataRef -> Name = 'xarray' -!CHECK-NEXT: | | | OmpClauseList -> OmpClause -> Align -> OmpAlignClause -> Scalar -> Integer -> Constant -> Expr = '32_4' -!CHECK-NEXT: | | | | LiteralConstant -> IntLiteralConstant = '32' -!CHECK-NEXT: | | | OmpClause -> Allocator -> Scalar -> Integer -> Expr = '2_8' -!CHECK-NEXT: | | | | Designator -> DataRef -> Name = 'omp_large_cap_mem_alloc' -!CHECK-NEXT: | | | OpenMPDeclarativeAllocate -!CHECK-NEXT: | | | | Verbatim -!CHECK-NEXT: | | | 
| OmpObjectList -> OmpObject -> Designator -> DataRef -> Name = 'j' -!CHECK-NEXT: | | | | OmpClauseList -> OmpClause -> Align -> OmpAlignClause -> Scalar -> Integer -> Constant -> Expr = '16_4' -!CHECK-NEXT: | | | | | LiteralConstant -> IntLiteralConstant = '16' -!CHECK-NEXT: | | | AllocateStmt +!UNPARSE: !$OMP ALLOCATE(j) ALIGN(16_4) +!UNPARSE-NEXT: !$OMP ALLOCATE(xarray) ALIGN(32_4) ALLOCATOR(2_8) +!UNPARSE-NEXT: ALLOCATE(j(z), xarray(t)) -!UNPARSE: !$OMP ALLOCATE (j) ALIGN(16_4) -!UNPARSE: !$OMP ALLOCATE (xarray) ALIGN(32_4) ALLOCATOR(2_8) -!UNPARSE-NEXT: ALLOCATE(j(z), xarray(t)) diff --git a/flang/test/Parser/OpenMP/allocate-tree-spec-part.f90 b/flang/test/Parser/OpenMP/allocate-tree-spec-part.f90 index afcaf44b09f03..800e4a57d5f0e 100644 --- a/flang/test/Parser/OpenMP/allocate-tree-spec-part.f90 +++ b/flang/test/Parser/OpenMP/allocate-tree-spec-part.f90 @@ -17,33 +17,48 @@ program allocate_tree allocate (w, xarray(4), zarray(5, f)) end program allocate_tree -!CHECK: | | DeclarationConstruct -> SpecificationConstruct -> OpenMPDeclarativeConstruct -> OpenMPDeclarativeAllocate -!CHECK-NEXT: | | | Verbatim -!CHECK-NEXT: | | | OmpObjectList -> OmpObject -> Designator -> DataRef -> Name = 'f' -!CHECK-NEXT: | | | OmpClauseList -> OmpClause -> Allocator -> Scalar -> Integer -> Expr = -!CHECK-NEXT: | | | | Designator -> DataRef -> Name = +!CHECK: | | DeclarationConstruct -> SpecificationConstruct -> OpenMPDeclarativeConstruct -> OmpAllocateDirective +!CHECK-NEXT: | | | OmpBeginDirective +!CHECK-NEXT: | | | | OmpDirectiveName -> llvm::omp::Directive = allocate +!CHECK-NEXT: | | | | OmpArgumentList -> OmpArgument -> OmpLocator -> OmpObject -> Designator -> DataRef -> Name = 'f' +!CHECK-NEXT: | | | | OmpClauseList -> OmpClause -> Allocator -> Scalar -> Integer -> Expr = '1_8' +!CHECK-NEXT: | | | | | Designator -> DataRef -> Name = 'omp_default_mem_alloc' +!CHECK-NEXT: | | | | Flags = None +!CHECK-NEXT: | | | Block !CHECK-NEXT: | ExecutionPart -> Block !CHECK-NEXT: | | ExecutionPartConstruct -> ExecutableConstruct -> ActionStmt -> AssignmentStmt = 'f=2_4' !CHECK-NEXT: | | | Variable = 'f' !CHECK-NEXT: | | | | Designator -> DataRef -> Name = 'f' !CHECK-NEXT: | | | Expr = '2_4' !CHECK-NEXT: | | | | LiteralConstant -> IntLiteralConstant = '2' -!CHECK-NEXT: | | ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPExecutableAllocate -!CHECK-NEXT: | | | Verbatim -!CHECK-NEXT: | | | OmpClauseList -> -!CHECK-NEXT: | | | OpenMPDeclarativeAllocate -!CHECK-NEXT: | | | | Verbatim -!CHECK-NEXT: | | | | OmpObjectList -> OmpObject -> Designator -> DataRef -> Name = 'w' -!CHECK-NEXT: | | | | OmpClauseList -> OmpClause -> Allocator -> Scalar -> Integer -> Expr = -!CHECK-NEXT: | | | | | Designator -> DataRef -> Name = -!CHECK-NEXT: | | | OpenMPDeclarativeAllocate -!CHECK-NEXT: | | | | Verbatim -!CHECK-NEXT: | | | | OmpObjectList -> OmpObject -> Designator -> DataRef -> Name = 'xarray' -!CHECK-NEXT: | | | | OmpClauseList -> OmpClause -> Allocator -> Scalar -> Integer -> Expr = -!CHECK-NEXT: | | | | | Designator -> DataRef -> Name = -!CHECK-NEXT: | | | OpenMPDeclarativeAllocate -!CHECK-NEXT: | | | | Verbatim -!CHECK-NEXT: | | | | OmpObjectList -> OmpObject -> Designator -> DataRef -> Name = 'zarray' -!CHECK-NEXT: | | | | OmpClauseList -> OmpClause -> Allocator -> Scalar -> Integer -> Expr = -!CHECK-NEXT: | | | | | Designator -> DataRef -> Name = -!CHECK-NEXT: | | | AllocateStmt +!CHECK-NEXT: | | ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OmpAllocateDirective 
+!CHECK-NEXT: | | | OmpBeginDirective +!CHECK-NEXT: | | | | OmpDirectiveName -> llvm::omp::Directive = allocate +!CHECK-NEXT: | | | | OmpArgumentList -> OmpArgument -> OmpLocator -> OmpObject -> Designator -> DataRef -> Name = 'w' +!CHECK-NEXT: | | | | OmpClauseList -> OmpClause -> Allocator -> Scalar -> Integer -> Expr = '3_8' +!CHECK-NEXT: | | | | | Designator -> DataRef -> Name = 'omp_const_mem_alloc' +!CHECK-NEXT: | | | | Flags = None +!CHECK-NEXT: | | | Block +!CHECK-NEXT: | | | | ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OmpAllocateDirective +!CHECK-NEXT: | | | | | OmpBeginDirective +!CHECK-NEXT: | | | | | | OmpDirectiveName -> llvm::omp::Directive = allocate +!CHECK-NEXT: | | | | | | OmpArgumentList -> OmpArgument -> OmpLocator -> OmpObject -> Designator -> DataRef -> Name = 'xarray' +!CHECK-NEXT: | | | | | | OmpClauseList -> OmpClause -> Allocator -> Scalar -> Integer -> Expr = '2_8' +!CHECK-NEXT: | | | | | | | Designator -> DataRef -> Name = 'omp_large_cap_mem_alloc' +!CHECK-NEXT: | | | | | | Flags = None +!CHECK-NEXT: | | | | | Block +!CHECK-NEXT: | | | | | | ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OmpAllocateDirective +!CHECK-NEXT: | | | | | | | OmpBeginDirective +!CHECK-NEXT: | | | | | | | | OmpDirectiveName -> llvm::omp::Directive = allocate +!CHECK-NEXT: | | | | | | | | OmpArgumentList -> OmpArgument -> OmpLocator -> OmpObject -> Designator -> DataRef -> Name = 'zarray' +!CHECK-NEXT: | | | | | | | | OmpClauseList -> OmpClause -> Allocator -> Scalar -> Integer -> Expr = '1_8' +!CHECK-NEXT: | | | | | | | | | Designator -> DataRef -> Name = 'omp_default_mem_alloc' +!CHECK-NEXT: | | | | | | | | Flags = None +!CHECK-NEXT: | | | | | | | Block +!CHECK-NEXT: | | | | | | | | ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OmpAllocateDirective +!CHECK-NEXT: | | | | | | | | | OmpBeginDirective +!CHECK-NEXT: | | | | | | | | | | OmpDirectiveName -> llvm::omp::Directive = allocate +!CHECK-NEXT: | | | | | | | | | | OmpClauseList -> +!CHECK-NEXT: | | | | | | | | | | Flags = None +!CHECK-NEXT: | | | | | | | | | Block +!CHECK-NEXT: | | | | | | | | | | ExecutionPartConstruct -> ExecutableConstruct -> ActionStmt -> AllocateStmt diff --git a/flang/test/Parser/OpenMP/allocate-tree.f90 b/flang/test/Parser/OpenMP/allocate-tree.f90 index bf413d591baf2..021d8104a7e62 100644 --- a/flang/test/Parser/OpenMP/allocate-tree.f90 +++ b/flang/test/Parser/OpenMP/allocate-tree.f90 @@ -7,52 +7,54 @@ program allocate_tree use omp_lib - integer, allocatable :: w, xarray(:), zarray(:, :) - integer :: z, t + integer, allocatable :: xarray(:), zarray(:, :) + integer :: z, t, w +!$omp allocate(w) allocator(omp_const_mem_alloc) t = 2 z = 3 -!$omp allocate(w) allocator(omp_const_mem_alloc) !$omp allocate(xarray) allocator(omp_large_cap_mem_alloc) !$omp allocate(zarray) allocator(omp_default_mem_alloc) !$omp allocate - allocate(w, xarray(4), zarray(t, z)) + allocate(xarray(4), zarray(t, z)) end program allocate_tree -!CHECK: | | DeclarationConstruct -> SpecificationConstruct -> TypeDeclarationStmt -!CHECK-NEXT: | | | DeclarationTypeSpec -> IntrinsicTypeSpec -> IntegerTypeSpec -> -!CHECK-NEXT: | | | AttrSpec -> Allocatable -!CHECK-NEXT: | | | EntityDecl -!CHECK-NEXT: | | | | Name = 'w' -!CHECK-NEXT: | | | EntityDecl -!CHECK-NEXT: | | | | Name = 'xarray' -!CHECK-NEXT: | | | | ArraySpec -> DeferredShapeSpecList -> int = '1' -!CHECK-NEXT: | | | EntityDecl -!CHECK-NEXT: | | | | Name = 'zarray' -!CHECK-NEXT: | | | | ArraySpec -> DeferredShapeSpecList -> 
int = '2' - +!CHECK: DeclarationConstruct -> SpecificationConstruct -> OpenMPDeclarativeConstruct -> OmpAllocateDirective +!CHECK-NEXT: | OmpBeginDirective +!CHECK-NEXT: | | OmpDirectiveName -> llvm::omp::Directive = allocate +!CHECK-NEXT: | | OmpArgumentList -> OmpArgument -> OmpLocator -> OmpObject -> Designator -> DataRef -> Name = 'w' +!CHECK-NEXT: | | OmpClauseList -> OmpClause -> Allocator -> Scalar -> Integer -> Expr = '3_8' +!CHECK-NEXT: | | | Designator -> DataRef -> Name = 'omp_const_mem_alloc' +!CHECK-NEXT: | | Flags = None +!CHECK-NEXT: | Block -!CHECK: | | ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPExecutableAllocate -!CHECK-NEXT: | | | Verbatim -!CHECK-NEXT: | | | OmpClauseList -> -!CHECK-NEXT: | | | OpenMPDeclarativeAllocate -!CHECK-NEXT: | | | | Verbatim -!CHECK-NEXT: | | | | OmpObjectList -> OmpObject -> Designator -> DataRef -> Name = 'w' -!CHECK-NEXT: | | | | OmpClauseList -> OmpClause -> Allocator -> Scalar -> Integer -> Expr = -!CHECK-NEXT: | | | | | Designator -> DataRef -> Name = -!CHECK-NEXT: | | | OpenMPDeclarativeAllocate -!CHECK-NEXT: | | | | Verbatim -!CHECK-NEXT: | | | | OmpObjectList -> OmpObject -> Designator -> DataRef -> Name = 'xarray' -!CHECK-NEXT: | | | | OmpClauseList -> OmpClause -> Allocator -> Scalar -> Integer -> Expr = -!CHECK-NEXT: | | | | | Designator -> DataRef -> Name = -!CHECK-NEXT: | | | OpenMPDeclarativeAllocate -!CHECK-NEXT: | | | | Verbatim -!CHECK-NEXT: | | | | OmpObjectList -> OmpObject -> Designator -> DataRef -> Name = 'zarray' -!CHECK-NEXT: | | | | OmpClauseList -> OmpClause -> Allocator -> Scalar -> Integer -> Expr = -!CHECK-NEXT: | | | | | Designator -> DataRef -> Name = -!CHECK-NEXT: | | | AllocateStmt +!CHECK: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OmpAllocateDirective +!CHECK-NEXT: | OmpBeginDirective +!CHECK-NEXT: | | OmpDirectiveName -> llvm::omp::Directive = allocate +!CHECK-NEXT: | | OmpArgumentList -> OmpArgument -> OmpLocator -> OmpObject -> Designator -> DataRef -> Name = 'xarray' +!CHECK-NEXT: | | OmpClauseList -> OmpClause -> Allocator -> Scalar -> Integer -> Expr = '2_8' +!CHECK-NEXT: | | | Designator -> DataRef -> Name = 'omp_large_cap_mem_alloc' +!CHECK-NEXT: | | Flags = None +!CHECK-NEXT: | Block +!CHECK-NEXT: | | ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OmpAllocateDirective +!CHECK-NEXT: | | | OmpBeginDirective +!CHECK-NEXT: | | | | OmpDirectiveName -> llvm::omp::Directive = allocate +!CHECK-NEXT: | | | | OmpArgumentList -> OmpArgument -> OmpLocator -> OmpObject -> Designator -> DataRef -> Name = 'zarray' +!CHECK-NEXT: | | | | OmpClauseList -> OmpClause -> Allocator -> Scalar -> Integer -> Expr = '1_8' +!CHECK-NEXT: | | | | | Designator -> DataRef -> Name = 'omp_default_mem_alloc' +!CHECK-NEXT: | | | | Flags = None +!CHECK-NEXT: | | | Block +!CHECK-NEXT: | | | | ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OmpAllocateDirective +!CHECK-NEXT: | | | | | OmpBeginDirective +!CHECK-NEXT: | | | | | | OmpDirectiveName -> llvm::omp::Directive = allocate +!CHECK-NEXT: | | | | | | OmpClauseList -> +!CHECK-NEXT: | | | | | | Flags = None +!CHECK-NEXT: | | | | | Block +!CHECK-NEXT: | | | | | | ExecutionPartConstruct -> ExecutableConstruct -> ActionStmt -> AllocateStmt -!UNPARSE: !$OMP ALLOCATE (w) ALLOCATOR(3_8) -!UNPARSE-NEXT: !$OMP ALLOCATE (xarray) ALLOCATOR(2_8) -!UNPARSE-NEXT: !$OMP ALLOCATE (zarray) ALLOCATOR(1_8) +!UNPARSE: !$OMP ALLOCATE(w) ALLOCATOR(3_8) +!UNPARSE-NEXT: t=2_4 +!UNPARSE-NEXT: z=3_4 +!UNPARSE-NEXT: 
!$OMP ALLOCATE(xarray) ALLOCATOR(2_8) +!UNPARSE-NEXT: !$OMP ALLOCATE(zarray) ALLOCATOR(1_8) !UNPARSE-NEXT: !$OMP ALLOCATE -!UNPARSE-NEXT: ALLOCATE(w, xarray(4_4), zarray(t,z)) +!UNPARSE-NEXT: ALLOCATE(xarray(4_4), zarray(t,z)) diff --git a/flang/test/Parser/OpenMP/allocate-unparse.f90 b/flang/test/Parser/OpenMP/allocate-unparse.f90 index 63cc35cd55082..b61a97150cad2 100644 --- a/flang/test/Parser/OpenMP/allocate-unparse.f90 +++ b/flang/test/Parser/OpenMP/allocate-unparse.f90 @@ -34,19 +34,19 @@ program allocate_unparse end program allocate_unparse !CHECK:!$OMP ALLOCATE{{[ ]*$}} -!CHECK:!$OMP ALLOCATE (x,y) -!CHECK:!$OMP ALLOCATE (x,y) ALLOCATOR(omp_default_mem_alloc) -!CHECK:!$OMP ALLOCATE (a,b) +!CHECK:!$OMP ALLOCATE(x, y) +!CHECK:!$OMP ALLOCATE(x, y) ALLOCATOR(omp_default_mem_alloc) +!CHECK:!$OMP ALLOCATE(a, b) !CHECK:ALLOCATE(darray(a,b)) !CHECK:!$OMP ALLOCATE ALLOCATOR(omp_default_mem_alloc) !CHECK:ALLOCATE(darray(a,b)) -!CHECK:!$OMP ALLOCATE (a,b) ALLOCATOR(omp_default_mem_alloc) +!CHECK:!$OMP ALLOCATE(a, b) ALLOCATOR(omp_default_mem_alloc) !CHECK:ALLOCATE(darray(a,b)) -!CHECK:!$OMP ALLOCATE (t) ALLOCATOR(omp_const_mem_alloc) -!CHECK:!$OMP ALLOCATE (z) ALLOCATOR(omp_default_mem_alloc) -!CHECK:!$OMP ALLOCATE (m) ALLOCATOR(omp_default_mem_alloc) -!CHECK:!$OMP ALLOCATE (n) -!CHECK:!$OMP ALLOCATE (j) ALIGN(16) +!CHECK:!$OMP ALLOCATE(t) ALLOCATOR(omp_const_mem_alloc) +!CHECK:!$OMP ALLOCATE(z) ALLOCATOR(omp_default_mem_alloc) +!CHECK:!$OMP ALLOCATE(m) ALLOCATOR(omp_default_mem_alloc) +!CHECK:!$OMP ALLOCATE(n) +!CHECK:!$OMP ALLOCATE(j) ALIGN(16) !CHECK:ALLOCATE(darray(z,t)) !CHECK:!$OMP ALLOCATE{{[ ]*$}} !CHECK:ALLOCATE(darray(a,b)) diff --git a/flang/test/Semantics/OpenMP/allocate-align01.f90 b/flang/test/Semantics/OpenMP/allocate-align01.f90 index 88bcd6d2f1008..4a1e60cf73fff 100644 --- a/flang/test/Semantics/OpenMP/allocate-align01.f90 +++ b/flang/test/Semantics/OpenMP/allocate-align01.f90 @@ -11,9 +11,9 @@ program allocate_align_tree integer :: z, t, xx t = 2 z = 3 + !WARNING: The executable form of the OpenMP ALLOCATE directive has been deprecated, please use ALLOCATORS instead [-Wopen-mp-usage] !ERROR: Must be a constant value !$omp allocate(j) align(xx) - !WARNING: The executable form of the OpenMP ALLOCATE directive has been deprecated, please use ALLOCATORS instead [-Wopen-mp-usage] !ERROR: The alignment should be positive !$omp allocate(xarray) align(-32) allocator(omp_large_cap_mem_alloc) allocate(j(z), xarray(t)) diff --git a/flang/test/Semantics/OpenMP/allocate-directive.f90 b/flang/test/Semantics/OpenMP/allocate-directive.f90 index 18a14b825f00d..e34125b392bda 100644 --- a/flang/test/Semantics/OpenMP/allocate-directive.f90 +++ b/flang/test/Semantics/OpenMP/allocate-directive.f90 @@ -11,7 +11,7 @@ integer, allocatable :: a, b, m, n, t, z !$omp allocate(x, y) !$omp allocate(x, y) allocator(omp_default_mem_alloc) - + continue !$omp allocate(a, b) allocate ( a, b ) diff --git a/flang/test/Semantics/OpenMP/allocate01.f90 b/flang/test/Semantics/OpenMP/allocate01.f90 index 229fd4d6c3f95..5fe4efdd106d9 100644 --- a/flang/test/Semantics/OpenMP/allocate01.f90 +++ b/flang/test/Semantics/OpenMP/allocate01.f90 @@ -17,7 +17,7 @@ subroutine sema() !ERROR: A list item on a declarative ALLOCATE must be declared in the same scope in which the directive appears !$omp allocate(y) - print *, a + print *, a !WARNING: The executable form of the OpenMP ALLOCATE directive has been deprecated, please use ALLOCATORS instead [-Wopen-mp-usage] !$omp allocate(x) allocator(omp_default_mem_alloc) diff --git 
a/flang/test/Semantics/OpenMP/allocate02.f90 b/flang/test/Semantics/OpenMP/allocate02.f90 index 8f0579e810bb9..a1e684796edb2 100644 --- a/flang/test/Semantics/OpenMP/allocate02.f90 +++ b/flang/test/Semantics/OpenMP/allocate02.f90 @@ -16,6 +16,7 @@ subroutine allocate() !ERROR: At most one ALLOCATOR clause can appear on the ALLOCATE directive !$omp allocate(x, y) allocator(omp_default_mem_alloc) allocator(omp_default_mem_alloc) + continue !$omp allocate(darray) allocator(omp_default_mem_alloc) allocate ( darray(a, b) ) diff --git a/flang/test/Semantics/OpenMP/allocate03.f90 b/flang/test/Semantics/OpenMP/allocate03.f90 index e35115f3897cc..3609f38eb6ee7 100644 --- a/flang/test/Semantics/OpenMP/allocate03.f90 +++ b/flang/test/Semantics/OpenMP/allocate03.f90 @@ -17,6 +17,7 @@ subroutine allocate() !ERROR: A variable that is part of another variable (as an array or structure element) cannot appear on the ALLOCATE directive !$omp allocate(my_var%array) + continue !ERROR: A variable that is part of another variable (as an array or structure element) cannot appear on the ALLOCATE directive !$omp allocate(darray, my_var%array) allocator(omp_default_mem_alloc) diff --git a/flang/test/Semantics/OpenMP/allocate06.f90 b/flang/test/Semantics/OpenMP/allocate06.f90 index 9b57322bbadc6..272094aaaeec2 100644 --- a/flang/test/Semantics/OpenMP/allocate06.f90 +++ b/flang/test/Semantics/OpenMP/allocate06.f90 @@ -13,7 +13,7 @@ subroutine allocate() !ERROR: A list item in a declarative ALLOCATE cannot have the ALLOCATABLE or POINTER attribute !$omp allocate(darray) allocator(omp_default_mem_alloc) - + continue !$omp allocate(darray) allocator(omp_default_mem_alloc) allocate(darray(a, b)) diff --git a/flang/test/Semantics/OpenMP/allocate10.f90 b/flang/test/Semantics/OpenMP/allocate10.f90 index a9db7330296ba..0a9e85b8ae2fe 100644 --- a/flang/test/Semantics/OpenMP/allocate10.f90 +++ b/flang/test/Semantics/OpenMP/allocate10.f90 @@ -4,8 +4,8 @@ subroutine f00 integer, allocatable :: x, y continue - !ERROR: If multiple directives are present in an executable ALLOCATE directive, at most one of them may specify no list items !$omp allocate + !ERROR: If multiple directives are present in an executable ALLOCATE directive, at most one of them may specify no list items !$omp allocate allocate(x, y) end diff --git a/flang/test/Semantics/OpenMP/allocate12.f90 b/flang/test/Semantics/OpenMP/allocate12.f90 new file mode 100644 index 0000000000000..2b3b510fbf40c --- /dev/null +++ b/flang/test/Semantics/OpenMP/allocate12.f90 @@ -0,0 +1,16 @@ +!RUN: %python %S/../test_errors.py %s %flang_fc1 %openmp_flags -fopenmp-version=51 + +subroutine f00 + integer, allocatable :: x + continue + !ERROR: An executable ALLOCATE directive must be associated with an ALLOCATE statement + !$omp allocate(x) +end + +subroutine f01 + integer, allocatable :: x + continue + !$omp allocate(x) + !ERROR: The statement associated with executable ALLOCATE directive must be an ALLOCATE statement + continue +end From 8395343811ecddb14cfd0ebf273d9d588bcaa8e7 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Mon, 3 Nov 2025 13:44:54 +0000 Subject: [PATCH 008/313] [X86] combineTruncate - trunc(srl(load(p),amt)) -> load(p+amt/8) - ensure we merge the full / truncated load chains (#166160) The full load might persist so ensure that the chains are merged into a token factor instead of just transferring the chain to the new load Noticed while trying to fix the regression reported from #165540 --- llvm/lib/Target/X86/X86ISelLowering.cpp | 3 +-- 1 file changed, 1 
insertion(+), 2 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 133406bd8e0d7..e5b2743f602da 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -54529,8 +54529,7 @@ static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
         SDValue NewLoad =
             DAG.getLoad(VT, DL, Ld->getChain(), NewPtr, Ld->getPointerInfo(),
                         Align(), Ld->getMemOperand()->getFlags());
-        DAG.ReplaceAllUsesOfValueWith(Src.getOperand(0).getValue(1),
-                                      NewLoad.getValue(1));
+        DAG.makeEquivalentMemoryOrdering(Ld, NewLoad);
         return NewLoad;
       }
     }

From 5256db32bb14a358a23251f714cc39d4d761fef1 Mon Sep 17 00:00:00 2001
From: Aiden Grossman
Date: Mon, 3 Nov 2025 06:01:46 -0800
Subject: [PATCH 009/313] [CI] Use action from FAILED: in ninja log parser

There are cases where the progress indicator does not align at all with
the action printed in FAILED:. Default to just using the action there so
that we can ensure the results are accurate. This still leaves some
issues where we are not capturing all the log lines, but I'll look at
those in a future patch. Those are also less critical since they do not
cause the script to crash.

Partially fixes #165131.

Reviewers: DavidSpickett

Reviewed By: DavidSpickett

Pull Request: https://github.com/llvm/llvm-project/pull/166100
---
 .ci/generate_test_report_lib.py      | 10 ++++---
 .ci/generate_test_report_lib_test.py | 42 +++++++++++++++++++++++-----
 2 files changed, 41 insertions(+), 11 deletions(-)

diff --git a/.ci/generate_test_report_lib.py b/.ci/generate_test_report_lib.py
index 36c95852452ac..7820fbda803d7 100644
--- a/.ci/generate_test_report_lib.py
+++ b/.ci/generate_test_report_lib.py
@@ -41,10 +41,12 @@ def _parse_ninja_log(ninja_log: list[str]) -> list[tuple[str, str]]:
     # touch test/4.stamp
     #
     # index will point to the line that starts with Failed:. The progress
-    # indicator is the line before this ([4/5] test/4.stamp) and contains a pretty
-    # printed version of the target being built (test/4.stamp). We use this line
-    # and remove the progress information to get a succinct name for the target.
+    # indicator is sometimes the line before this ([4/5] test/4.stamp) and
+    # will contain a pretty printed version of the target being built
+    # (test/4.stamp) when accurate. We instead parse the failed line rather
+    # than the progress indicator as the progress indicator may not be
+    # aligned with the failure.
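    # [Editorial aside, not part of the patch] A minimal sketch of the
    # extraction on the next line, assuming a log line of the form
    # "FAILED: <action>":
    #   >>> "FAILED: tools/check-langley".split("FAILED: ")[1]
    #   'tools/check-langley'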
+ failing_action = ninja_log[index].split("FAILED: ")[1] failure_log = [] while ( index < len(ninja_log) diff --git a/.ci/generate_test_report_lib_test.py b/.ci/generate_test_report_lib_test.py index 431e10da6405a..4068a3b7300a4 100644 --- a/.ci/generate_test_report_lib_test.py +++ b/.ci/generate_test_report_lib_test.py @@ -39,7 +39,7 @@ def test_find_failure_ninja_logs(self): self.assertEqual( failures[0], ( - "test/4.stamp", + "touch test/4.stamp", dedent( """\ FAILED: touch test/4.stamp @@ -77,7 +77,7 @@ def test_ninja_log_end(self): self.assertEqual( failures[0], ( - "test/3.stamp", + "touch test/3.stamp", dedent( """\ FAILED: touch test/3.stamp @@ -106,7 +106,7 @@ def test_ninja_log_multiple_failures(self): self.assertEqual( failures[0], ( - "test/2.stamp", + "touch test/2.stamp", dedent( """\ FAILED: touch test/2.stamp @@ -117,7 +117,7 @@ def test_ninja_log_multiple_failures(self): self.assertEqual( failures[1], ( - "test/4.stamp", + "touch test/4.stamp", dedent( """\ FAILED: touch test/4.stamp @@ -150,7 +150,7 @@ def test_ninja_log_runtimes_failure(self): self.assertEqual( failures[0], ( - "test/2.stamp", + "touch test/2.stamp", dedent( """\ FAILED: touch test/2.stamp @@ -159,6 +159,34 @@ def test_ninja_log_runtimes_failure(self): ), ) + # Test that we correctly handle cases where the FAILED: line does not + # match up with the progress indicator. + def test_ninja_log_mismatched_failed(self): + failures = generate_test_report_lib.find_failure_in_ninja_logs( + [ + [ + "[1/5] test/1.stamp", + "[2/5] test/2.stamp", + "ModuleNotFoundError: No module named 'mount_langley'", + "FAILED: tools/check-langley", + "Wow! This system is really broken!", + "[5/5] test/5.stamp", + ] + ] + ) + self.assertEqual(len(failures), 1) + self.assertEqual( + failures[0], + ( + "tools/check-langley", + dedent( + """\ + FAILED: tools/check-langley + Wow! This system is really broken!""" + ), + ), + ) + def test_title_only(self): self.assertEqual( generate_test_report_lib.generate_report("Foo", 0, [], []), @@ -448,7 +476,7 @@ def test_no_failures_multiple_build_failed_ninja_log(self): All tests passed but another part of the build **failed**. Click on a failure below to see the details.
- test/2.stamp + touch test/2.stamp ``` FAILED: touch test/2.stamp @@ -456,7 +484,7 @@ def test_no_failures_multiple_build_failed_ninja_log(self): ```
- test/4.stamp
+ touch test/4.stamp
 ```
 FAILED: touch test/4.stamp

From bf2f5773d9d50d74a4cdeac4d88762e2d9776175 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Juan=20Manuel=20Martinez=20Caama=C3=B1o?=
Date: Mon, 3 Nov 2025 15:22:21 +0100
Subject: [PATCH 010/313] [Clang] Make the AS of llvm.compiler.used & llvm.used
 elements addrspace(0) (#164432)

By convention the AS of the elements of `llvm.compiler.used` &
`llvm.used` is 0. However, the AS of `CGM.Int8PtrTy` is not always 0.
This leaves some LLVM helpers
(`appendToUsed/appendToCompilerUsed/removeFromUsedLists`) unusable.

This patch makes the AS of the elements of these variables 0.

This PR is related to https://github.com/llvm/llvm-project/pull/162660
---
 clang/lib/CodeGen/CodeGenModule.cpp           | 12 ++--
 .../embed-bitcode-marker-with-nonzero-as.c    |  2 +-
 .../llvm_compiler_used_elements_are_unqual.c  | 64 +++++++++++++++++++
 3 files changed, 71 insertions(+), 7 deletions(-)
 create mode 100644 clang/test/CodeGen/llvm_compiler_used_elements_are_unqual.c

diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp
index 0fea57b2e1799..af5be95aec1cd 100644
--- a/clang/lib/CodeGen/CodeGenModule.cpp
+++ b/clang/lib/CodeGen/CodeGenModule.cpp
@@ -3331,18 +3331,18 @@ static void emitUsed(CodeGenModule &CGM, StringRef Name,
   if (List.empty())
     return;

+  llvm::PointerType *UnqualPtr =
+      llvm::PointerType::getUnqual(CGM.getLLVMContext());
+
   // Convert List to what ConstantArray needs.
   SmallVector<llvm::Constant *, 8> UsedArray;
   UsedArray.resize(List.size());
   for (unsigned i = 0, e = List.size(); i != e; ++i) {
-    UsedArray[i] =
-        llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast(
-            cast<llvm::Constant>(&*List[i]), CGM.Int8PtrTy);
+    UsedArray[i] = llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast(
+        cast<llvm::Constant>(&*List[i]), UnqualPtr);
   }

-  if (UsedArray.empty())
-    return;
-  llvm::ArrayType *ATy = llvm::ArrayType::get(CGM.Int8PtrTy, UsedArray.size());
+  llvm::ArrayType *ATy = llvm::ArrayType::get(UnqualPtr, UsedArray.size());

   auto *GV = new llvm::GlobalVariable(
       CGM.getModule(), ATy, false, llvm::GlobalValue::AppendingLinkage,

diff --git a/clang/test/CodeGen/embed-bitcode-marker-with-nonzero-as.c b/clang/test/CodeGen/embed-bitcode-marker-with-nonzero-as.c
index df7118859c764..8af9708a1bfb8 100644
--- a/clang/test/CodeGen/embed-bitcode-marker-with-nonzero-as.c
+++ b/clang/test/CodeGen/embed-bitcode-marker-with-nonzero-as.c
@@ -3,6 +3,6 @@

 // CHECK: @llvm.embedded.module = private addrspace(1) constant [0 x i8] zeroinitializer, section ".llvmbc", align 1
 // CHECK-NEXT: @llvm.cmdline = private addrspace(1) constant [{{[0-9]+}} x i8] c"{{.*}}", section ".llvmcmd", align 1
-// CHECK-NEXT: @llvm.compiler.used = appending addrspace(1) global [5 x ptr addrspace(4)] [ptr addrspace(4) addrspacecast (ptr addrspace(1) @foo.managed to ptr addrspace(4)), ptr addrspace(4) addrspacecast (ptr addrspace(1) @foo to ptr addrspace(4)), ptr addrspace(4) addrspacecast (ptr addrspace(1) @__hip_cuid_ to ptr addrspace(4)), ptr addrspace(4) addrspacecast (ptr addrspace(1) @llvm.embedded.module to ptr addrspace(4)), ptr addrspace(4) addrspacecast (ptr addrspace(1) @llvm.cmdline to ptr addrspace(4))], section "llvm.metadata"
+// CHECK-NEXT: @llvm.compiler.used = appending addrspace(1) global [5 x ptr] [ptr addrspacecast (ptr addrspace(1) @foo.managed to ptr), ptr addrspacecast (ptr addrspace(1) @foo to ptr), ptr addrspacecast (ptr addrspace(1) @__hip_cuid_ to ptr), ptr addrspacecast (ptr addrspace(1) @llvm.embedded.module to ptr), ptr addrspacecast (ptr addrspace(1) @llvm.cmdline to ptr)], section
"llvm.metadata" __attribute__((managed)) int foo = 42; diff --git a/clang/test/CodeGen/llvm_compiler_used_elements_are_unqual.c b/clang/test/CodeGen/llvm_compiler_used_elements_are_unqual.c new file mode 100644 index 0000000000000..b6550fb1e5c77 --- /dev/null +++ b/clang/test/CodeGen/llvm_compiler_used_elements_are_unqual.c @@ -0,0 +1,64 @@ +// RUN: %clang_cc1 -x c -triple x86_64-- -emit-llvm -x c %s -o - \ +// RUN: | FileCheck %s --check-prefix=X86 +// RUN: %clang_cc1 -x c -triple amdgcn-amd-amdhsa -emit-llvm -x c %s -o - \ +// RUN: | FileCheck %s --check-prefix=AMDGCN +// RUN: %clang_cc1 -x c -triple spirv64-- -emit-llvm -x c %s -o - \ +// RUN: | FileCheck %s --check-prefix=SPIRV +// RUN: %clang_cc1 -x c -triple spirv64-amd-amdhsa -emit-llvm -x c %s -o - \ +// RUN: | FileCheck %s --check-prefix=SPIRV_AMD +// RUN: %clang_cc1 -x cl -cl-std=CL1.2 -triple x86_64-- -emit-llvm -x c %s -o - \ +// RUN: | FileCheck %s --check-prefix=X86 +// RUN: %clang_cc1 -x cl -cl-std=CL1.2 -triple amdgcn-amd-amdhsa -emit-llvm -x c %s -o - \ +// RUN: | FileCheck %s --check-prefix=AMDGCN +// RUN: %clang_cc1 -x cl -cl-std=CL1.2 -triple spirv64-- -emit-llvm -x c %s -o - \ +// RUN: | FileCheck %s --check-prefix=SPIRV_CL +// RUN: %clang_cc1 -x cl -cl-std=CL1.2 -triple spirv64-amd-amdhsa -emit-llvm -x c %s -o - \ +// RUN: | FileCheck %s --check-prefix=SPIRV_AMD_CL +// RUN: %clang_cc1 -x cl -cl-std=CL2.0 -triple x86_64-- -emit-llvm -x c %s -o - \ +// RUN: | FileCheck %s --check-prefix=X86 +// RUN: %clang_cc1 -x cl -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -emit-llvm -x c %s -o - \ +// RUN: | FileCheck %s --check-prefix=AMDGCN +// RUN: %clang_cc1 -x cl -cl-std=CL2.0 -triple spirv64-- -emit-llvm -x c %s -o - \ +// RUN: | FileCheck %s --check-prefix=SPIRV_CL +// RUN: %clang_cc1 -x cl -cl-std=CL2.0 -triple spirv64-amd-amdhsa -emit-llvm -x c %s -o - \ +// RUN: | FileCheck %s --check-prefix=SPIRV_AMD_CL + +#ifndef __OPENCL_C_VERSION__ +#define __constant const +#endif + +static __constant __attribute__((__used__)) int foo = 42; + + +// X86: @foo = internal constant i32 42 +// X86: @llvm.compiler.used = appending global [2 x ptr] [ptr @foo, ptr @bar], section "llvm.metadata" +// +// AMDGCN: @foo = internal addrspace(4) constant i32 42 +// AMDGCN: @llvm.compiler.used = appending addrspace(1) global [2 x ptr] [ptr addrspacecast (ptr addrspace(4) @foo to ptr), ptr @bar], section "llvm.metadata" +// +// SPIRV: @foo = internal constant i32 42 +// SPIRV: @llvm.used = appending addrspace(1) global [2 x ptr] [ptr @foo, ptr @bar], section "llvm.metadata" +// +// SPIRV_CL: @foo = internal addrspace(2) constant i32 42 +// SPIRV_CL: @llvm.used = appending addrspace(1) global [2 x ptr] [ptr addrspacecast (ptr addrspace(2) @foo to ptr), ptr @bar], section "llvm.metadata" +// +// SPIRV_AMD: @foo = internal addrspace(1) constant i32 42 +// SPIRV_AMD: @llvm.used = appending addrspace(1) global [2 x ptr] [ptr addrspacecast (ptr addrspace(1) @foo to ptr), ptr addrspacecast (ptr addrspace(4) @bar to ptr)], section "llvm.metadata" +// +// SPIRV_AMD_CL: @foo = internal addrspace(2) constant i32 42 +// SPIRV_AMD_CL: @llvm.used = appending addrspace(1) global [2 x ptr] [ptr addrspacecast (ptr addrspace(2) @foo to ptr), ptr addrspacecast (ptr addrspace(4) @bar to ptr)], section "llvm.metadata" +// +// X86: define internal void @bar() #{{[0-9]}} { +// +// AMDGCN: define internal void @bar() #{{[0-9]}} { +// +// SPIRV: define internal spir_func void @bar() #{{[0-9]}} { +// +// SPIRV_CL: define internal spir_func void @bar() #{{[0-9]}} { +// +// 
SPIRV_AMD: define internal spir_func void @bar() addrspace(4) #{{[0-9]}} {
+//
+// SPIRV_AMD_CL: define internal spir_func void @bar() addrspace(4) #{{[0-9]}} {
+//
+static void __attribute__((__used__)) bar() {
+}

From 9c26170dd70865d253727e5122ce2a892d55800e Mon Sep 17 00:00:00 2001
From: Joseph Huber
Date: Mon, 3 Nov 2025 08:21:32 -0600
Subject: [PATCH 011/313] [libc] Fix Linux kernel headers being included on all OSes

Summary:
The changes in
https://github.com/llvm/llvm-project/commit/43bd7e3bb903af5076a9552f4f64cfc5d58f76ce
altered how we handled including headers; this included the system
headers on the GPU target, which poisoned the include path that was
curated to not include any system headers. Change this to only apply if
the target OS is Linux.
---
 libc/CMakeLists.txt                     | 7 +++++--
 runtimes/cmake/Modules/HandleLibC.cmake | 4 +++-
 2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/libc/CMakeLists.txt b/libc/CMakeLists.txt
index ae555a256ba66..4e6b4195a9c5e 100644
--- a/libc/CMakeLists.txt
+++ b/libc/CMakeLists.txt
@@ -47,8 +47,6 @@ set(LIBC_BUILD_DIR ${CMAKE_CURRENT_BINARY_DIR})
 set(LIBC_ENABLE_USE_BY_CLANG OFF CACHE BOOL
     "Whether or not to place libc in a build directory findable by a just built clang")

-set(LIBC_KERNEL_HEADERS "/usr/include" CACHE STRING "Path to Linux kernel headers")
-
 # Defining a global namespace to enclose all libc functions.
 set(default_namespace "__llvm_libc")
 if(LLVM_VERSION_MAJOR)
@@ -146,6 +144,11 @@ option(LLVM_LIBC_ALL_HEADERS "Outputs all functions in header files, regardless
 option(LIBC_CONFIG_PATH "The path to user provided folder that configures the build for the target system." OFF)

+if(LIBC_TARGET_OS_IS_LINUX)
+  set(kernel_headers "/usr/include")
+endif()
+set(LIBC_KERNEL_HEADERS "${kernel_headers}" CACHE STRING "Path to Linux kernel headers")
+
 set(LIBC_ENABLE_UNITTESTS ON)
 set(LIBC_ENABLE_HERMETIC_TESTS ${LLVM_LIBC_FULL_BUILD})

diff --git a/runtimes/cmake/Modules/HandleLibC.cmake b/runtimes/cmake/Modules/HandleLibC.cmake
index 01da5b260d3d4..f8869512f99d3 100644
--- a/runtimes/cmake/Modules/HandleLibC.cmake
+++ b/runtimes/cmake/Modules/HandleLibC.cmake
@@ -30,7 +30,9 @@ elseif (RUNTIMES_USE_LIBC STREQUAL "llvm-libc")
   check_cxx_compiler_flag(-nostdlibinc CXX_SUPPORTS_NOSTDLIBINC_FLAG)
   if(CXX_SUPPORTS_NOSTDLIBINC_FLAG)
     target_compile_options(runtimes-libc-headers INTERFACE "-nostdlibinc")
-    target_compile_options(runtimes-libc-headers INTERFACE "-idirafter${LIBC_KERNEL_HEADERS}")
+    if(LIBC_KERNEL_HEADERS)
+      target_compile_options(runtimes-libc-headers INTERFACE "-idirafter${LIBC_KERNEL_HEADERS}")
+    endif()
   endif()

 add_library(runtimes-libc-static INTERFACE)

From 60e53d2cc2740c74f0a64922c6868a63c688c1c5 Mon Sep 17 00:00:00 2001
From: Lei Huang
Date: Mon, 3 Nov 2025 09:36:12 -0500
Subject: [PATCH 012/313] [PowerPC] Implement 32-byte indexed paired ld and st instruction (#160767)

---
 llvm/lib/Target/PowerPC/PPCInstrFuture.td              | 8 ++++++++
 .../MC/Disassembler/PowerPC/ppc-encoding-ISAFuture.txt | 6 ++++++
 .../Disassembler/PowerPC/ppc64le-encoding-ISAFuture.txt | 6 ++++++
 llvm/test/MC/PowerPC/ppc-encoding-ISAFuture.s          | 8 ++++++++
 4 files changed, 28 insertions(+)

diff --git a/llvm/lib/Target/PowerPC/PPCInstrFuture.td b/llvm/lib/Target/PowerPC/PPCInstrFuture.td
index da3efdc15f1e1..0c2e44e18f463 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrFuture.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrFuture.td
@@ -360,6 +360,10 @@ let Predicates = [HasVSX, IsISAFuture] in {
     def LXVPRLL : XForm_XTp5_RAB5<31, 621, (outs vsrprc:$XTp),
                                   (ins (memr $RA):$addr, g8rc:$RB),
"lxvprll $XTp, $addr, $RB", IIC_LdStLFD, []>; + def LXVPB32X + : XForm_XTp5_RAB5<31, 877, (outs vsrprc:$XTp), + (ins (memr $RA):$addr, g8rc:$RB), + "lxvpb32x $XTp, $addr, $RB", IIC_LdStLFD, []>; } let mayStore = 1 in { @@ -376,6 +380,10 @@ let Predicates = [HasVSX, IsISAFuture] in { : XForm_XTp5_RAB5<31, 749, (outs), (ins vsrprc:$XTp, (memr $RA):$addr, g8rc:$RB), "stxvprll $XTp, $addr, $RB", IIC_LdStLFD, []>; + def STXVPB32X + : XForm_XTp5_RAB5<31, 1005, (outs), + (ins vsrprc:$XTp, (memr $RA):$addr, g8rc:$RB), + "stxvpb32x $XTp, $addr, $RB", IIC_LdStLFD, []>; } def VUPKHSNTOB : VXForm_VRTB5<387, 0, (outs vrrc:$VRT), (ins vrrc:$VRB), diff --git a/llvm/test/MC/Disassembler/PowerPC/ppc-encoding-ISAFuture.txt b/llvm/test/MC/Disassembler/PowerPC/ppc-encoding-ISAFuture.txt index f5cb4b72959f9..2661ed5b04cc9 100644 --- a/llvm/test/MC/Disassembler/PowerPC/ppc-encoding-ISAFuture.txt +++ b/llvm/test/MC/Disassembler/PowerPC/ppc-encoding-ISAFuture.txt @@ -82,12 +82,18 @@ #CHECK: lxvprll 6, 2, 1 0x7c 0xc2 0x0c 0xda +#CHECK: lxvpb32x 2, 15, 16 +0x7c,0x4f,0x86,0xda + #CHECK: stxvprl 0, 1, 2 0x7c 0x01 0x15 0x9a #CHECK: stxvprll 6, 0, 1 0x7c 0xc0 0x0d 0xda +#CHECK: stxvpb32x 2, 15, 16 +0x7c,0x4f,0x87,0xda + #CHECK: dmxvi8gerx4 1, 2, 4 0xec,0x82,0x20,0x58 diff --git a/llvm/test/MC/Disassembler/PowerPC/ppc64le-encoding-ISAFuture.txt b/llvm/test/MC/Disassembler/PowerPC/ppc64le-encoding-ISAFuture.txt index f0df8ce39021b..7fb8254ced0ac 100644 --- a/llvm/test/MC/Disassembler/PowerPC/ppc64le-encoding-ISAFuture.txt +++ b/llvm/test/MC/Disassembler/PowerPC/ppc64le-encoding-ISAFuture.txt @@ -76,12 +76,18 @@ #CHECK: lxvprll 6, 2, 1 0xda 0x0c 0xc2 0x7c +#CHECK: lxvpb32x 2, 15, 16 +0xda,0x86,0x4f,0x7c + #CHECK: stxvprl 0, 1, 2 0x9a 0x15 0x01 0x7c #CHECK: stxvprll 6, 0, 1 0xda 0x0d 0xc0 0x7c +#CHECK: stxvpb32x 2, 15, 16 +0xda,0x87,0x4f,0x7c + #CHECK: dmxvi8gerx4 1, 2, 4 0x58,0x20,0x82,0xec diff --git a/llvm/test/MC/PowerPC/ppc-encoding-ISAFuture.s b/llvm/test/MC/PowerPC/ppc-encoding-ISAFuture.s index bc0683e38887c..40059c440b128 100644 --- a/llvm/test/MC/PowerPC/ppc-encoding-ISAFuture.s +++ b/llvm/test/MC/PowerPC/ppc-encoding-ISAFuture.s @@ -105,6 +105,10 @@ # CHECK-LE: lxvprll 6, 2, 1 # encoding: [0xda,0x0c,0xc2,0x7c] lxvprll 6, 2, 1 + lxvpb32x 2, 15, 16 +#CHECK-BE: lxvpb32x 2, 15, 16 # encoding: [0x7c,0x4f,0x86,0xda] +#CHECK-LE: lxvpb32x 2, 15, 16 # encoding: [0xda,0x86,0x4f,0x7c] + # CHECK-BE: stxvprl 0, 1, 2 # encoding: [0x7c,0x01,0x15,0x9a] # CHECK-LE: stxvprl 0, 1, 2 # encoding: [0x9a,0x15,0x01,0x7c] stxvprl 0, 1, 2 @@ -113,6 +117,10 @@ # CHECK-LE: stxvprll 6, 0, 1 # encoding: [0xda,0x0d,0xc0,0x7c] stxvprll 6, 0, 1 + stxvpb32x 2, 15, 16 +#CHECK-BE: stxvpb32x 2, 15, 16 # encoding: [0x7c,0x4f,0x87,0xda] +#CHECK-LE: stxvpb32x 2, 15, 16 # encoding: [0xda,0x87,0x4f,0x7c] + dmxvi8gerx4 1, 2, 4 # CHECK-BE: dmxvi8gerx4 1, 2, 4 # encoding: [0xec,0x82,0x20,0x58] # CHECK-LE: dmxvi8gerx4 1, 2, 4 # encoding: [0x58,0x20,0x82,0xec] From 8dcd02accfb2d0217372716edb2a4f325ddb7442 Mon Sep 17 00:00:00 2001 From: Tomer Shafir Date: Mon, 3 Nov 2025 16:37:22 +0200 Subject: [PATCH 013/313] [Clang][Bytecode] Match exact void pointer deref error message (#166133) Better match exactly rather than just a prefix. 
--- clang/test/AST/ByteCode/cxx11.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/test/AST/ByteCode/cxx11.cpp b/clang/test/AST/ByteCode/cxx11.cpp index 427d3a106656b..e283a7b42e554 100644 --- a/clang/test/AST/ByteCode/cxx11.cpp +++ b/clang/test/AST/ByteCode/cxx11.cpp @@ -374,7 +374,7 @@ namespace GH150709 { namespace DiscardedAddrLabel { void foo(void) { L: - *&&L; // both-error {{indirection not permitted}} \ + *&&L; // both-error {{indirection not permitted on operand of type 'void *'}} \ // both-warning {{expression result unused}} } } From 7c9f137b3ce775a5a4f9eee54700e1591a4a9f9d Mon Sep 17 00:00:00 2001 From: Jakub Kuderski Date: Mon, 3 Nov 2025 09:56:34 -0500 Subject: [PATCH 014/313] [ADT] Drop unused include in StringSwitch. NFC. --- llvm/include/llvm/ADT/StringSwitch.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/llvm/include/llvm/ADT/StringSwitch.h b/llvm/include/llvm/ADT/StringSwitch.h index 8c8d31bd4f055..5bdbb302a6d75 100644 --- a/llvm/include/llvm/ADT/StringSwitch.h +++ b/llvm/include/llvm/ADT/StringSwitch.h @@ -14,7 +14,6 @@ #define LLVM_ADT_STRINGSWITCH_H #include "llvm/ADT/StringRef.h" -#include "llvm/Support/Compiler.h" #include "llvm/Support/ErrorHandling.h" #include #include @@ -64,7 +63,7 @@ class StringSwitch { void operator=(const StringSwitch &) = delete; void operator=(StringSwitch &&) = delete; - // Case-sensitive case matchers + // Case-sensitive case matchers. StringSwitch &Case(StringLiteral S, T Value) { CaseImpl(S, Value); return *this; From 1c094a1ce2ef15b5855b11aa85dbb2f1eea54f13 Mon Sep 17 00:00:00 2001 From: Dan Blackwell Date: Mon, 3 Nov 2025 14:59:49 +0000 Subject: [PATCH 015/313] [TSan][Test-Only] Account for race in cxa_guard_acquire.cpp test (#165853) It is possible for "Enter potentially blocking region" to come before "Enter constructor" in this test - if the thread that acquires the guard fails to reach its printf before the other thread that enters the blocking region reaches its own printf. Note that for the exit logs this inversion is not possible. This patch addresses this by allowing those two log lines to come in either order. rdar://163375661 --- compiler-rt/test/tsan/cxa_guard_acquire.cpp | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/compiler-rt/test/tsan/cxa_guard_acquire.cpp b/compiler-rt/test/tsan/cxa_guard_acquire.cpp index fc407259e8968..6050c243cb8c1 100644 --- a/compiler-rt/test/tsan/cxa_guard_acquire.cpp +++ b/compiler-rt/test/tsan/cxa_guard_acquire.cpp @@ -66,10 +66,17 @@ int main(int argc, char **argv) { printf("Enter main\n"); // If initialization is contended, the blocked thread should enter a - // potentially blocking region. + // potentially blocking region. Note that we use a DAG check because it is + // possible for Thread 1 to acquire the guard, then Thread 2 fail to acquire + // the guard then call `OnPotentiallyBlockingRegionBegin` and print "Enter + // potentially blocking region\n", before Thread 1 manages to reach "Enter + // constructor\n". This is exceptionally rare, but can be replicated by + // inserting a `sleep(1)` between `LazyInit() {` and `printf("Enter + // constructor\n");`. Due to the barrier it is not possible for the exit logs + // to be inverted. 
// - // CHECK-NEXT: Enter constructor - // CHECK-NEXT: Enter potentially blocking region + // CHECK-DAG: Enter constructor + // CHECK-DAG: Enter potentially blocking region // CHECK-NEXT: Exit constructor // CHECK-NEXT: Exit potentially blocking region barrier_init(&barrier, 2); From 332f9b5eeef85dca29112018ba111bf64a75d27d Mon Sep 17 00:00:00 2001 From: Robert Imschweiler Date: Mon, 3 Nov 2025 16:09:12 +0100 Subject: [PATCH 016/313] [AMDGPU][UnifyDivergentExitNodes][StructurizeCFG] Add support for callbr instruction with inline-asm (#152161) Finishes adding inline-asm callbr support for AMDGPU, started by https://github.com/llvm/llvm-project/pull/149308. --- .../AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp | 89 +++--- llvm/lib/Transforms/Scalar/StructurizeCFG.cpp | 19 +- llvm/test/CodeGen/AMDGPU/callbr.ll | 54 ++++ ...nify-divergent-exit-nodes-with-musttail.ll | 51 ++++ llvm/test/CodeGen/AMDGPU/infinite-loop.ll | 257 ++++++++++++++++-- .../si-annotate-nested-control-flows.ll | 100 ++++++- .../si-unify-exit-multiple-unreachables.ll | 161 ++++++++++- llvm/test/CodeGen/AMDGPU/update-phi.ll | 39 +++ llvm/test/Transforms/StructurizeCFG/callbr.ll | 235 ++++++++++++++++ 9 files changed, 926 insertions(+), 79 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/callbr.ll create mode 100644 llvm/test/Transforms/StructurizeCFG/callbr.ll diff --git a/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp b/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp index 733c5d520fb23..706237b906cc3 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp @@ -181,14 +181,52 @@ BasicBlock *AMDGPUUnifyDivergentExitNodesImpl::unifyReturnBlockSet( return NewRetBlock; } +static BasicBlock * +createDummyReturnBlock(Function &F, + SmallVector &ReturningBlocks) { + BasicBlock *DummyReturnBB = + BasicBlock::Create(F.getContext(), "DummyReturnBlock", &F); + Type *RetTy = F.getReturnType(); + Value *RetVal = RetTy->isVoidTy() ? nullptr : PoisonValue::get(RetTy); + ReturnInst::Create(F.getContext(), RetVal, DummyReturnBB); + ReturningBlocks.push_back(DummyReturnBB); + return DummyReturnBB; +} + +/// Handle conditional branch instructions (-> 2 targets) and callbr +/// instructions with N targets. +static void handleNBranch(Function &F, BasicBlock *BB, Instruction *BI, + BasicBlock *DummyReturnBB, + std::vector &Updates) { + SmallVector Successors(successors(BB)); + + // Create a new transition block to hold the conditional branch. + BasicBlock *TransitionBB = BB->splitBasicBlock(BI, "TransitionBlock"); + + Updates.reserve(Updates.size() + 2 * Successors.size() + 2); + + // 'Successors' become successors of TransitionBB instead of BB, + // and TransitionBB becomes a single successor of BB. + Updates.emplace_back(DominatorTree::Insert, BB, TransitionBB); + for (BasicBlock *Successor : Successors) { + Updates.emplace_back(DominatorTree::Insert, TransitionBB, Successor); + Updates.emplace_back(DominatorTree::Delete, BB, Successor); + } + + // Create a branch that will always branch to the transition block and + // references DummyReturnBB. 
+  BB->getTerminator()->eraseFromParent();
+  BranchInst::Create(TransitionBB, DummyReturnBB,
+                     ConstantInt::getTrue(F.getContext()), BB);
+  Updates.emplace_back(DominatorTree::Insert, BB, DummyReturnBB);
+}
+
 bool AMDGPUUnifyDivergentExitNodesImpl::run(Function &F, DominatorTree *DT,
                                             const PostDominatorTree &PDT,
                                             const UniformityInfo &UA) {
-  assert(hasOnlySimpleTerminator(F) && "Unsupported block terminator.");
-
   if (PDT.root_size() == 0 ||
       (PDT.root_size() == 1 &&
-       !isa<BranchInst>(PDT.getRoot()->getTerminator())))
+       !isa<BranchInst, CallBrInst>(PDT.getRoot()->getTerminator())))
     return false;
 
   // Loop over all of the blocks in a function, tracking all of the blocks that
@@ -222,46 +260,27 @@ bool AMDGPUUnifyDivergentExitNodesImpl::run(Function &F, DominatorTree *DT,
       if (HasDivergentExitBlock)
         UnreachableBlocks.push_back(BB);
     } else if (BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator())) {
-
-      ConstantInt *BoolTrue = ConstantInt::getTrue(F.getContext());
-      if (DummyReturnBB == nullptr) {
-        DummyReturnBB = BasicBlock::Create(F.getContext(),
-                                           "DummyReturnBlock", &F);
-        Type *RetTy = F.getReturnType();
-        Value *RetVal = RetTy->isVoidTy() ? nullptr : PoisonValue::get(RetTy);
-        ReturnInst::Create(F.getContext(), RetVal, DummyReturnBB);
-        ReturningBlocks.push_back(DummyReturnBB);
-      }
+      if (!DummyReturnBB)
+        DummyReturnBB = createDummyReturnBlock(F, ReturningBlocks);
 
       if (BI->isUnconditional()) {
         BasicBlock *LoopHeaderBB = BI->getSuccessor(0);
         BI->eraseFromParent(); // Delete the unconditional branch.
         // Add a new conditional branch with a dummy edge to the return block.
-        BranchInst::Create(LoopHeaderBB, DummyReturnBB, BoolTrue, BB);
-        Updates.emplace_back(DominatorTree::Insert, BB, DummyReturnBB);
-      } else { // Conditional branch.
-        SmallVector<BasicBlock *> Successors(successors(BB));
-
-        // Create a new transition block to hold the conditional branch.
-        BasicBlock *TransitionBB = BB->splitBasicBlock(BI, "TransitionBlock");
-
-        Updates.reserve(Updates.size() + 2 * Successors.size() + 2);
-
-        // 'Successors' become successors of TransitionBB instead of BB,
-        // and TransitionBB becomes a single successor of BB.
-        Updates.emplace_back(DominatorTree::Insert, BB, TransitionBB);
-        for (BasicBlock *Successor : Successors) {
-          Updates.emplace_back(DominatorTree::Insert, TransitionBB, Successor);
-          Updates.emplace_back(DominatorTree::Delete, BB, Successor);
-        }
-
-        // Create a branch that will always branch to the transition block and
-        // references DummyReturnBB.
-        BB->getTerminator()->eraseFromParent();
-        BranchInst::Create(TransitionBB, DummyReturnBB, BoolTrue, BB);
+        BranchInst::Create(LoopHeaderBB, DummyReturnBB,
+                           ConstantInt::getTrue(F.getContext()), BB);
         Updates.emplace_back(DominatorTree::Insert, BB, DummyReturnBB);
+      } else {
+        handleNBranch(F, BB, BI, DummyReturnBB, Updates);
       }
 
       Changed = true;
+    } else if (CallBrInst *CBI = dyn_cast<CallBrInst>(BB->getTerminator())) {
+      if (!DummyReturnBB)
+        DummyReturnBB = createDummyReturnBlock(F, ReturningBlocks);
+
+      handleNBranch(F, BB, CBI, DummyReturnBB, Updates);
+    } else {
+      llvm_unreachable("unsupported block terminator");
     }
   }
diff --git a/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp b/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp
index 5f6f66a4bc213..0a8f5ea2fdae1 100644
--- a/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp
+++ b/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp
@@ -558,11 +558,10 @@ void StructurizeCFG::analyzeLoops(RegionNode *N) {
   } else {
     // Test for successors as back edge
     BasicBlock *BB = N->getNodeAs<BasicBlock>();
-    BranchInst *Term = cast<BranchInst>(BB->getTerminator());
-
-    for (BasicBlock *Succ : Term->successors())
-      if (Visited.count(Succ))
-        Loops[Succ] = BB;
+    if (BranchInst *Term = dyn_cast<BranchInst>(BB->getTerminator()))
+      for (BasicBlock *Succ : Term->successors())
+        if (Visited.count(Succ))
+          Loops[Succ] = BB;
   }
 }
 
@@ -594,7 +593,7 @@ void StructurizeCFG::gatherPredicates(RegionNode *N) {
 
   for (BasicBlock *P : predecessors(BB)) {
     // Ignore it if it's a branch from outside into our region entry
-    if (!ParentRegion->contains(P))
+    if (!ParentRegion->contains(P) || !dyn_cast<BranchInst>(P->getTerminator()))
       continue;
 
     Region *R = RI->getRegionFor(P);
@@ -1402,13 +1401,17 @@ bool StructurizeCFG::makeUniformRegion(Region *R, UniformityInfo &UA) {
 /// Run the transformation for each region found
 bool StructurizeCFG::run(Region *R, DominatorTree *DT,
                          const TargetTransformInfo *TTI) {
-  if (R->isTopLevelRegion())
+  // CallBr and its corresponding direct target blocks are for now ignored by
+  // this pass. This is not a limitation for the currently intended use cases
+  // of callbr in the AMDGPU backend.
+  // Parent and child regions are not affected by this (current) restriction.
+  // See `llvm/test/Transforms/StructurizeCFG/callbr.ll` for details.
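+  // (Illustration only: a region whose entry block ends in
+  //   callbr void asm "", "!i"() to label %fallthrough [label %indirect]
+  // is left untouched by this early-out; its parent and child regions are
+  // still structurized as usual.)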
+ if (R->isTopLevelRegion() || isa(R->getEntry()->getTerminator())) return false; this->DT = DT; this->TTI = TTI; Func = R->getEntry()->getParent(); - assert(hasOnlySimpleTerminator(*Func) && "Unsupported block terminator."); ParentRegion = R; diff --git a/llvm/test/CodeGen/AMDGPU/callbr.ll b/llvm/test/CodeGen/AMDGPU/callbr.ll new file mode 100644 index 0000000000000..253a6ec100eae --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/callbr.ll @@ -0,0 +1,54 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck %s + +define void @callbr_inline_asm(ptr %src, ptr %dst1, ptr %dst2, i32 %c) { +; CHECK-LABEL: callbr_inline_asm: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_load_dword v0, v[0:1] +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: v_cmp_gt_i32 vcc v6, 42; s_cbranch_vccnz .LBB0_2 +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: ; %bb.1: ; %fallthrough +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_dword v[2:3], v0 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_setpc_b64 s[30:31] +; CHECK-NEXT: .LBB0_2: ; Inline asm indirect target +; CHECK-NEXT: ; %indirect +; CHECK-NEXT: ; Label of block must be emitted +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_dword v[4:5], v0 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_setpc_b64 s[30:31] + %a = load i32, ptr %src, align 4 + callbr void asm "v_cmp_gt_i32 vcc $0, 42; s_cbranch_vccnz ${1:l}", "r,!i"(i32 %c) to label %fallthrough [label %indirect] +fallthrough: + store i32 %a, ptr %dst1, align 4 + br label %ret +indirect: + store i32 %a, ptr %dst2, align 4 + br label %ret +ret: + ret void +} + +define void @callbr_self_loop(i1 %c) { +; CHECK-LABEL: callbr_self_loop: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: .LBB1_1: ; %callbr +; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: s_branch .LBB1_1 +; CHECK-NEXT: .LBB1_2: ; Inline asm indirect target +; CHECK-NEXT: ; %callbr.target.ret +; CHECK-NEXT: ; Label of block must be emitted +; CHECK-NEXT: s_setpc_b64 s[30:31] + br label %callbr +callbr: + callbr void asm "", "!i"() to label %callbr [label %ret] +ret: + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/do-not-unify-divergent-exit-nodes-with-musttail.ll b/llvm/test/CodeGen/AMDGPU/do-not-unify-divergent-exit-nodes-with-musttail.ll index 007e3f0a6bdbc..076a99ff8588f 100644 --- a/llvm/test/CodeGen/AMDGPU/do-not-unify-divergent-exit-nodes-with-musttail.ll +++ b/llvm/test/CodeGen/AMDGPU/do-not-unify-divergent-exit-nodes-with-musttail.ll @@ -3,6 +3,7 @@ declare void @foo(ptr) declare i1 @bar(ptr) +declare i32 @bar32(ptr) define void @musttail_call_without_return_value(ptr %p) { ; CHECK-LABEL: define void @musttail_call_without_return_value( @@ -28,6 +29,31 @@ bb.1: ret void } +define void @musttail_call_without_return_value_callbr(ptr %p) { +; CHECK-LABEL: define void @musttail_call_without_return_value_callbr( +; CHECK-SAME: ptr [[P:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[LOAD:%.*]] = load i32, ptr [[P]], align 1 +; CHECK-NEXT: callbr void asm "", "r,!i"(i32 [[LOAD]]) +; CHECK-NEXT: to label %[[BB_0:.*]] [label %bb.1] +; CHECK: [[BB_0]]: +; CHECK-NEXT: musttail call void @foo(ptr [[P]]) +; CHECK-NEXT: ret void +; CHECK: [[BB_1:.*:]] +; CHECK-NEXT: ret void +; +entry: + %load = load i32, ptr %p, align 1 + 
callbr void asm "", "r,!i"(i32 %load) to label %bb.0 [label %bb.1] + +bb.0: + musttail call void @foo(ptr %p) + ret void + +bb.1: + ret void +} + define i1 @musttail_call_with_return_value(ptr %p) { ; CHECK-LABEL: define i1 @musttail_call_with_return_value( ; CHECK-SAME: ptr [[P:%.*]]) #[[ATTR0]] { @@ -51,3 +77,28 @@ bb.0: bb.1: ret i1 %load } + +define i32 @musttail_call_with_return_value_callbr(ptr %p) { +; CHECK-LABEL: define i32 @musttail_call_with_return_value_callbr( +; CHECK-SAME: ptr [[P:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[LOAD:%.*]] = load i32, ptr [[P]], align 1 +; CHECK-NEXT: callbr void asm "", "r,!i"(i32 [[LOAD]]) +; CHECK-NEXT: to label %[[BB_0:.*]] [label %bb.1] +; CHECK: [[BB_0]]: +; CHECK-NEXT: [[RET:%.*]] = musttail call i32 @bar32(ptr [[P]]) +; CHECK-NEXT: ret i32 [[RET]] +; CHECK: [[BB_1:.*:]] +; CHECK-NEXT: ret i32 [[LOAD]] +; +entry: + %load = load i32, ptr %p, align 1 + callbr void asm "", "r,!i"(i32 %load) to label %bb.0 [label %bb.1] + +bb.0: + %ret = musttail call i32 @bar32(ptr %p) + ret i32 %ret + +bb.1: + ret i32 %load +} diff --git a/llvm/test/CodeGen/AMDGPU/infinite-loop.ll b/llvm/test/CodeGen/AMDGPU/infinite-loop.ll index 3e2e43faca5aa..df635925b87df 100644 --- a/llvm/test/CodeGen/AMDGPU/infinite-loop.ll +++ b/llvm/test/CodeGen/AMDGPU/infinite-loop.ll @@ -36,26 +36,60 @@ loop: br label %loop } +define amdgpu_kernel void @infinite_loop_callbr(ptr addrspace(1) %out) { +; SI-LABEL: infinite_loop_callbr: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: ;;#ASMSTART +; SI-NEXT: ;;#ASMEND +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, 0x3e7 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_endpgm +; IR-LABEL: @infinite_loop_callbr( +; IR-NEXT: entry: +; IR-NEXT: callbr void asm "", ""() +; IR-NEXT: to label [[LOOP:%.*]] [] +; IR: loop: +; IR-NEXT: store volatile i32 999, ptr addrspace(1) [[OUT:%.*]], align 4 +; IR-NEXT: br i1 true, label [[TRANSITIONBLOCK:%.*]], label [[DUMMYRETURNBLOCK:%.*]] +; IR: TransitionBlock: +; IR-NEXT: callbr void asm "", ""() +; IR-NEXT: to label [[LOOP]] [] +; IR: DummyReturnBlock: +; IR-NEXT: ret void +; +entry: + callbr void asm "", ""() to label %loop [] + +loop: + store volatile i32 999, ptr addrspace(1) %out, align 4 + callbr void asm "", ""() to label %loop [] +} + define amdgpu_kernel void @infinite_loop_ret(ptr addrspace(1) %out) { ; SI-LABEL: infinite_loop_ret: ; SI: ; %bb.0: ; %entry ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; SI-NEXT: s_and_saveexec_b64 s[0:1], vcc -; SI-NEXT: s_cbranch_execz .LBB1_3 +; SI-NEXT: s_cbranch_execz .LBB2_3 ; SI-NEXT: ; %bb.1: ; %loop.preheader ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x3e7 ; SI-NEXT: s_and_b64 vcc, exec, -1 -; SI-NEXT: .LBB1_2: ; %loop +; SI-NEXT: .LBB2_2: ; %loop ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_mov_b64 vcc, vcc -; SI-NEXT: s_cbranch_vccnz .LBB1_2 -; SI-NEXT: .LBB1_3: ; %UnifiedReturnBlock +; SI-NEXT: s_cbranch_vccnz .LBB2_2 +; SI-NEXT: .LBB2_3: ; %UnifiedReturnBlock ; SI-NEXT: s_endpgm ; IR-LABEL: @infinite_loop_ret( ; IR-NEXT: entry: @@ -81,44 +115,93 @@ return: ret void } +define amdgpu_kernel void @infinite_loop_ret_callbr(ptr addrspace(1) %out) { +; SI-LABEL: 
infinite_loop_ret_callbr: +; SI: ; %bb.0: ; %entry +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SI-NEXT: ;;#ASMSTART +; SI-NEXT: ;;#ASMEND +; SI-NEXT: ; %bb.1: ; %loop.preheader +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, 0x3e7 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: .LBB3_2: ; Inline asm indirect target +; SI-NEXT: ; %UnifiedReturnBlock +; SI-NEXT: ; Label of block must be emitted +; SI-NEXT: s_endpgm +; IR-LABEL: @infinite_loop_ret_callbr( +; IR-NEXT: entry: +; IR-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() +; IR-NEXT: [[COND:%.*]] = icmp eq i32 [[TMP]], 1 +; IR-NEXT: [[COND32:%.*]] = zext i1 [[COND]] to i32 +; IR-NEXT: callbr void asm "", "r,!i"(i32 [[COND32]]) +; IR-NEXT: to label [[LOOP:%.*]] [label %UnifiedReturnBlock] +; IR: loop: +; IR-NEXT: store volatile i32 999, ptr addrspace(1) [[OUT:%.*]], align 4 +; IR-NEXT: br i1 true, label [[TRANSITIONBLOCK:%.*]], label [[UNIFIEDRETURNBLOCK:%.*]] +; IR: TransitionBlock: +; IR-NEXT: callbr void asm "", ""() +; IR-NEXT: to label [[LOOP]] [] +; IR: UnifiedReturnBlock: +; IR-NEXT: ret void +; +entry: + %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() + %cond = icmp eq i32 %tmp, 1 + %cond32 = zext i1 %cond to i32 + callbr void asm "", "r,!i"(i32 %cond32) to label %loop [label %return] + +loop: + store volatile i32 999, ptr addrspace(1) %out, align 4 + callbr void asm "", ""() to label %loop [] + +return: + ret void +} + define amdgpu_kernel void @infinite_loops(ptr addrspace(1) %out) { ; SI-LABEL: infinite_loops: ; SI: ; %bb.0: ; %entry ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b64 s[2:3], -1 -; SI-NEXT: s_cbranch_scc1 .LBB2_4 +; SI-NEXT: s_cbranch_scc1 .LBB4_4 ; SI-NEXT: ; %bb.1: ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x378 ; SI-NEXT: s_and_b64 vcc, exec, -1 -; SI-NEXT: .LBB2_2: ; %loop2 +; SI-NEXT: .LBB4_2: ; %loop2 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_mov_b64 vcc, vcc -; SI-NEXT: s_cbranch_vccnz .LBB2_2 +; SI-NEXT: s_cbranch_vccnz .LBB4_2 ; SI-NEXT: ; %bb.3: ; %Flow ; SI-NEXT: s_mov_b64 s[2:3], 0 -; SI-NEXT: .LBB2_4: ; %Flow2 +; SI-NEXT: .LBB4_4: ; %Flow2 ; SI-NEXT: s_and_b64 vcc, exec, s[2:3] ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b64 vcc, vcc -; SI-NEXT: s_cbranch_vccz .LBB2_7 +; SI-NEXT: s_cbranch_vccz .LBB4_7 ; SI-NEXT: ; %bb.5: ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, 0x3e7 ; SI-NEXT: s_and_b64 vcc, exec, 0 -; SI-NEXT: .LBB2_6: ; %loop1 +; SI-NEXT: .LBB4_6: ; %loop1 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_mov_b64 vcc, vcc -; SI-NEXT: s_cbranch_vccz .LBB2_6 -; SI-NEXT: .LBB2_7: ; %DummyReturnBlock +; SI-NEXT: s_cbranch_vccz .LBB4_6 +; SI-NEXT: .LBB4_7: ; %DummyReturnBlock ; SI-NEXT: s_endpgm ; IR-LABEL: @infinite_loops( ; IR-NEXT: entry: @@ -144,24 +227,78 @@ loop2: br label %loop2 } +define amdgpu_kernel void @infinite_loops_callbr(ptr addrspace(1) %out) { +; SI-LABEL: infinite_loops_callbr: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: 
;;#ASMSTART +; SI-NEXT: ;;#ASMEND +; SI-NEXT: ; %bb.1: ; %loop1 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, 0x3e7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_endpgm +; SI-NEXT: .LBB5_2: ; Inline asm indirect target +; SI-NEXT: ; %loop2.preheader +; SI-NEXT: ; Label of block must be emitted +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, 0x378 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_endpgm +; IR-LABEL: @infinite_loops_callbr( +; IR-NEXT: entry: +; IR-NEXT: callbr void asm "", "r,!i"(i32 poison) +; IR-NEXT: to label [[LOOP1:%.*]] [label %loop2] +; IR: loop1: +; IR-NEXT: store volatile i32 999, ptr addrspace(1) [[OUT:%.*]], align 4 +; IR-NEXT: br i1 true, label [[TRANSITIONBLOCK:%.*]], label [[DUMMYRETURNBLOCK:%.*]] +; IR: TransitionBlock: +; IR-NEXT: callbr void asm "", ""() +; IR-NEXT: to label [[LOOP1]] [] +; IR: loop2: +; IR-NEXT: store volatile i32 888, ptr addrspace(1) [[OUT]], align 4 +; IR-NEXT: br i1 true, label [[TRANSITIONBLOCK1:%.*]], label [[DUMMYRETURNBLOCK]] +; IR: TransitionBlock1: +; IR-NEXT: callbr void asm "", ""() +; IR-NEXT: to label [[LOOP2:%.*]] [] +; IR: DummyReturnBlock: +; IR-NEXT: ret void +; +entry: + callbr void asm "", "r,!i"(i32 poison) to label %loop1 [label %loop2] + +loop1: + store volatile i32 999, ptr addrspace(1) %out, align 4 + callbr void asm "", ""() to label %loop1 [] + +loop2: + store volatile i32 888, ptr addrspace(1) %out, align 4 + callbr void asm "", ""() to label %loop2 [] +} + define amdgpu_kernel void @infinite_loop_nest_ret(ptr addrspace(1) %out) { ; SI-LABEL: infinite_loop_nest_ret: ; SI: ; %bb.0: ; %entry ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 1, v0 ; SI-NEXT: s_and_saveexec_b64 s[0:1], vcc -; SI-NEXT: s_cbranch_execz .LBB3_5 +; SI-NEXT: s_cbranch_execz .LBB6_5 ; SI-NEXT: ; %bb.1: ; %outer_loop.preheader ; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 ; SI-NEXT: v_cmp_ne_u32_e64 s[0:1], 3, v0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x3e7 -; SI-NEXT: .LBB3_2: ; %outer_loop +; SI-NEXT: .LBB6_2: ; %outer_loop ; SI-NEXT: ; =>This Loop Header: Depth=1 -; SI-NEXT: ; Child Loop BB3_3 Depth 2 +; SI-NEXT: ; Child Loop BB6_3 Depth 2 ; SI-NEXT: s_mov_b64 s[2:3], 0 -; SI-NEXT: .LBB3_3: ; %inner_loop -; SI-NEXT: ; Parent Loop BB3_2 Depth=1 +; SI-NEXT: .LBB6_3: ; %inner_loop +; SI-NEXT: ; Parent Loop BB6_2 Depth=1 ; SI-NEXT: ; => This Inner Loop Header: Depth=2 ; SI-NEXT: s_and_b64 s[8:9], exec, s[0:1] ; SI-NEXT: s_or_b64 s[2:3], s[8:9], s[2:3] @@ -169,13 +306,13 @@ define amdgpu_kernel void @infinite_loop_nest_ret(ptr addrspace(1) %out) { ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_andn2_b64 exec, exec, s[2:3] -; SI-NEXT: s_cbranch_execnz .LBB3_3 +; SI-NEXT: s_cbranch_execnz .LBB6_3 ; SI-NEXT: ; %bb.4: ; %loop.exit.guard -; SI-NEXT: ; in Loop: Header=BB3_2 Depth=1 +; SI-NEXT: ; in Loop: Header=BB6_2 Depth=1 ; SI-NEXT: s_or_b64 exec, exec, s[2:3] ; SI-NEXT: s_mov_b64 vcc, 0 -; SI-NEXT: s_branch .LBB3_2 -; SI-NEXT: .LBB3_5: ; %UnifiedReturnBlock +; SI-NEXT: s_branch .LBB6_2 +; SI-NEXT: .LBB6_5: ; %UnifiedReturnBlock ; SI-NEXT: s_endpgm ; IR-LABEL: @infinite_loop_nest_ret( ; IR-NEXT: entry: @@ -212,4 +349,82 @@ return: ret void } +define amdgpu_kernel void @infinite_loop_nest_ret_callbr(ptr addrspace(1) %out) { +; SI-LABEL: infinite_loop_nest_ret_callbr: +; SI: ; %bb.0: ; 
%entry +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 1, v0 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SI-NEXT: ;;#ASMSTART +; SI-NEXT: ;;#ASMEND +; SI-NEXT: ; %bb.1: ; %outer_loop.preheader +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: v_mov_b32_e32 v0, 0x3e7 +; SI-NEXT: s_and_b64 s[0:1], exec, 0 +; SI-NEXT: s_branch .LBB7_3 +; SI-NEXT: .LBB7_2: ; %loop.exit.guard +; SI-NEXT: ; in Loop: Header=BB7_3 Depth=1 +; SI-NEXT: s_and_b64 vcc, exec, s[2:3] +; SI-NEXT: s_cbranch_vccnz .LBB7_5 +; SI-NEXT: .LBB7_3: ; %outer_loop +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: ;;#ASMSTART +; SI-NEXT: ;;#ASMEND +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b64 s[2:3], -1 +; SI-NEXT: s_mov_b64 vcc, s[0:1] +; SI-NEXT: s_cbranch_vccz .LBB7_2 +; SI-NEXT: ; %bb.4: ; %TransitionBlock.target.outer_loop +; SI-NEXT: ; in Loop: Header=BB7_3 Depth=1 +; SI-NEXT: s_mov_b64 s[2:3], 0 +; SI-NEXT: s_branch .LBB7_2 +; SI-NEXT: .LBB7_5: ; Inline asm indirect target +; SI-NEXT: ; %UnifiedReturnBlock +; SI-NEXT: ; Label of block must be emitted +; SI-NEXT: s_endpgm +; IR-LABEL: @infinite_loop_nest_ret_callbr( +; IR-NEXT: entry: +; IR-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() +; IR-NEXT: [[COND1:%.*]] = icmp ne i32 [[TMP]], 1 +; IR-NEXT: [[COND1_32:%.*]] = zext i1 [[COND1]] to i32 +; IR-NEXT: callbr void asm "", "r,!i"(i32 [[COND1_32]]) +; IR-NEXT: to label [[OUTER_LOOP:%.*]] [label %UnifiedReturnBlock] +; IR: outer_loop: +; IR-NEXT: callbr void asm "", ""() +; IR-NEXT: to label [[INNER_LOOP:%.*]] [] +; IR: inner_loop: +; IR-NEXT: store volatile i32 999, ptr addrspace(1) [[OUT:%.*]], align 4 +; IR-NEXT: [[COND3:%.*]] = icmp eq i32 [[TMP]], 3 +; IR-NEXT: [[COND3_32:%.*]] = zext i1 [[COND3]] to i32 +; IR-NEXT: br i1 true, label [[TRANSITIONBLOCK:%.*]], label [[UNIFIEDRETURNBLOCK:%.*]] +; IR: TransitionBlock: +; IR-NEXT: callbr void asm "", "r,!i"(i32 [[COND3_32]]) +; IR-NEXT: to label [[INNER_LOOP]] [label %outer_loop] +; IR: UnifiedReturnBlock: +; IR-NEXT: ret void +; +entry: + %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() + %cond1 = icmp ne i32 %tmp, 1 ; avoid following BB optimizing away through the domination + %cond1_32 = zext i1 %cond1 to i32 + callbr void asm "", "r,!i"(i32 %cond1_32) to label %outer_loop [label %return] + +outer_loop: + ; %cond2 = icmp eq i32 %tmp, 2 + ; br i1 %cond2, label %outer_loop, label %inner_loop + callbr void asm "", ""() to label %inner_loop [] + +inner_loop: ; preds = %LeafBlock, %LeafBlock1 + store volatile i32 999, ptr addrspace(1) %out, align 4 + %cond3 = icmp eq i32 %tmp, 3 + %cond3_32 = zext i1 %cond3 to i32 + callbr void asm "", "r,!i"(i32 %cond3_32) to label %inner_loop [label %outer_loop] + +return: + ret void +} + declare i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/si-annotate-nested-control-flows.ll b/llvm/test/CodeGen/AMDGPU/si-annotate-nested-control-flows.ll index 34de1e48bfb59..01bcdad3fc220 100644 --- a/llvm/test/CodeGen/AMDGPU/si-annotate-nested-control-flows.ll +++ b/llvm/test/CodeGen/AMDGPU/si-annotate-nested-control-flows.ll @@ -3,15 +3,16 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa %s -o - | FileCheck %s --check-prefix=ISA define void @nested_inf_loop(i1 %0, i1 %1) { -; OPT-LABEL: @nested_inf_loop( -; OPT-NEXT: BB: -; OPT-NEXT: br label [[BB1:%.*]] -; OPT: BB1: -; OPT-NEXT: [[BRMERGE:%.*]] = select i1 [[TMP0:%.*]], i1 true, i1 [[TMP1:%.*]] -; OPT-NEXT: br i1 
[[BRMERGE]], label [[BB1]], label [[INFLOOP:%.*]] -; OPT: infloop: -; OPT-NEXT: br i1 true, label [[INFLOOP]], label [[DUMMYRETURNBLOCK:%.*]] -; OPT: DummyReturnBlock: +; OPT-LABEL: define void @nested_inf_loop( +; OPT-SAME: i1 [[TMP0:%.*]], i1 [[TMP1:%.*]]) { +; OPT-NEXT: [[BB:.*:]] +; OPT-NEXT: br label %[[BB1:.*]] +; OPT: [[BB1]]: +; OPT-NEXT: [[BRMERGE:%.*]] = select i1 [[TMP0]], i1 true, i1 [[TMP1]] +; OPT-NEXT: br i1 [[BRMERGE]], label %[[BB1]], label %[[INFLOOP:.*]] +; OPT: [[INFLOOP]]: +; OPT-NEXT: br i1 true, label %[[INFLOOP]], label %[[DUMMYRETURNBLOCK:.*]] +; OPT: [[DUMMYRETURNBLOCK]]: ; OPT-NEXT: ret void ; ; ISA-LABEL: nested_inf_loop: @@ -63,3 +64,84 @@ BB4: BB3: br label %BB1 } + +define void @nested_inf_loop_callbr(i32 %0, i32 %1) { +; OPT-LABEL: define void @nested_inf_loop_callbr( +; OPT-SAME: i32 [[TMP0:%.*]], i32 [[TMP1:%.*]]) { +; OPT-NEXT: [[BB:.*:]] +; OPT-NEXT: callbr void asm "", ""() +; OPT-NEXT: to label %[[BB1:.*]] [] +; OPT: [[BB1]]: +; OPT-NEXT: callbr void asm "", "r,!i"(i32 [[TMP0]]) +; OPT-NEXT: to label %[[BB3:.*]] [label %BB2] +; OPT: [[BB2:.*:]] +; OPT-NEXT: callbr void asm "", ""() +; OPT-NEXT: to label %[[BB4:.*]] [] +; OPT: [[BB4]]: +; OPT-NEXT: br i1 true, label %[[TRANSITIONBLOCK:.*]], label %[[DUMMYRETURNBLOCK:.*]] +; OPT: [[TRANSITIONBLOCK]]: +; OPT-NEXT: callbr void asm "", "r,!i"(i32 [[TMP1]]) +; OPT-NEXT: to label %[[BB3]] [label %BB4] +; OPT: [[BB3]]: +; OPT-NEXT: callbr void asm "", ""() +; OPT-NEXT: to label %[[BB1]] [] +; OPT: [[DUMMYRETURNBLOCK]]: +; OPT-NEXT: ret void +; +; ISA-LABEL: nested_inf_loop_callbr: +; ISA: ; %bb.0: ; %BB +; ISA-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; ISA-NEXT: ;;#ASMSTART +; ISA-NEXT: ;;#ASMEND +; ISA-NEXT: ; implicit-def: $sgpr6_sgpr7 +; ISA-NEXT: ; implicit-def: $sgpr4_sgpr5 +; ISA-NEXT: .LBB1_1: ; %BB1 +; ISA-NEXT: ; =>This Inner Loop Header: Depth=1 +; ISA-NEXT: ;;#ASMSTART +; ISA-NEXT: ;;#ASMEND +; ISA-NEXT: s_andn2_b64 s[6:7], s[6:7], exec +; ISA-NEXT: s_and_b64 s[8:9], s[4:5], exec +; ISA-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] +; ISA-NEXT: .LBB1_2: ; %BB3 +; ISA-NEXT: ; in Loop: Header=BB1_1 Depth=1 +; ISA-NEXT: ;;#ASMSTART +; ISA-NEXT: ;;#ASMEND +; ISA-NEXT: s_andn2_b64 s[4:5], s[4:5], exec +; ISA-NEXT: s_and_b64 s[8:9], s[6:7], exec +; ISA-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] +; ISA-NEXT: s_branch .LBB1_1 +; ISA-NEXT: .LBB1_3: ; Inline asm indirect target +; ISA-NEXT: ; %BB2 +; ISA-NEXT: ; in Loop: Header=BB1_1 Depth=1 +; ISA-NEXT: ; Label of block must be emitted +; ISA-NEXT: ;;#ASMSTART +; ISA-NEXT: ;;#ASMEND +; ISA-NEXT: s_mov_b64 s[6:7], -1 +; ISA-NEXT: s_and_saveexec_b64 s[8:9], s[4:5] +; ISA-NEXT: s_cbranch_execz .LBB1_5 +; ISA-NEXT: ; %bb.4: ; %TransitionBlock.target.BB3 +; ISA-NEXT: ; in Loop: Header=BB1_1 Depth=1 +; ISA-NEXT: s_xor_b64 s[6:7], exec, -1 +; ISA-NEXT: .LBB1_5: ; %loop.exit.guard +; ISA-NEXT: ; in Loop: Header=BB1_1 Depth=1 +; ISA-NEXT: s_or_b64 exec, exec, s[8:9] +; ISA-NEXT: s_and_b64 vcc, exec, s[6:7] +; ISA-NEXT: s_mov_b64 s[6:7], 0 +; ISA-NEXT: s_cbranch_vccz .LBB1_2 +; ISA-NEXT: ; %bb.6: ; %DummyReturnBlock +; ISA-NEXT: s_setpc_b64 s[30:31] +BB: + callbr void asm "", ""() to label %BB1 [] + +BB1: + callbr void asm "", "r,!i"(i32 %0) to label %BB3 [label %BB2] + +BB2: + callbr void asm "", ""() to label %BB4 [] + +BB4: + callbr void asm "", "r,!i"(i32 %1) to label %BB3 [label %BB4] + +BB3: + callbr void asm "", ""() to label %BB1 [] +} diff --git a/llvm/test/CodeGen/AMDGPU/si-unify-exit-multiple-unreachables.ll 
b/llvm/test/CodeGen/AMDGPU/si-unify-exit-multiple-unreachables.ll index 4cbe682cf9f9f..004c27971131d 100644 --- a/llvm/test/CodeGen/AMDGPU/si-unify-exit-multiple-unreachables.ll +++ b/llvm/test/CodeGen/AMDGPU/si-unify-exit-multiple-unreachables.ll @@ -1,5 +1,5 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -stop-after=amdgpu-unify-divergent-exit-nodes | FileCheck %s --check-prefix=UNIFY +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -stop-after=amdgpu-unify-divergent-exit-nodes | FileCheck %s --check-prefix=UNIFY ; RUN: llc < %s -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 | FileCheck %s declare void @llvm.trap() @@ -70,8 +70,33 @@ define amdgpu_kernel void @kernel(i32 %a, ptr addrspace(1) %x, i32 noundef %n) { ; CHECK-NEXT: s_mov_b64 s[2:3], -1 ; CHECK-NEXT: s_trap 2 ; CHECK-NEXT: s_branch .LBB0_4 - - +; UNIFY-LABEL: @kernel( +; UNIFY-NEXT: entry: +; UNIFY-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() +; UNIFY-NEXT: [[CMP:%.*]] = icmp eq i32 [[N:%.*]], 256 +; UNIFY-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] +; UNIFY: if.then: +; UNIFY-NEXT: [[CMP1:%.*]] = icmp eq i32 [[A:%.*]], 0 +; UNIFY-NEXT: br i1 [[CMP1]], label [[IF_END6_SINK_SPLIT:%.*]], label [[COND_FALSE:%.*]] +; UNIFY: cond.false: +; UNIFY-NEXT: call void @llvm.trap() +; UNIFY-NEXT: unreachable +; UNIFY: if.else: +; UNIFY-NEXT: [[CMP2:%.*]] = icmp ult i32 [[TID]], 10 +; UNIFY-NEXT: br i1 [[CMP2]], label [[IF_THEN3:%.*]], label [[IF_END6:%.*]] +; UNIFY: if.then3: +; UNIFY-NEXT: [[CMP1_I7:%.*]] = icmp eq i32 [[A]], 0 +; UNIFY-NEXT: br i1 [[CMP1_I7]], label [[IF_END6_SINK_SPLIT]], label [[COND_FALSE_I8:%.*]] +; UNIFY: cond.false.i8: +; UNIFY-NEXT: call void @llvm.trap() +; UNIFY-NEXT: unreachable +; UNIFY: if.end6.sink.split: +; UNIFY-NEXT: [[X1:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[X:%.*]], i32 [[TID]] +; UNIFY-NEXT: store i32 [[A]], ptr addrspace(1) [[X1]], align 4 +; UNIFY-NEXT: br label [[IF_END6]] +; UNIFY: if.end6: +; UNIFY-NEXT: ret void +; entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %cmp = icmp eq i32 %n, 256 @@ -105,5 +130,129 @@ if.end6.sink.split: if.end6: ret void } -;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: -; UNIFY: {{.*}} + +define amdgpu_kernel void @kernel_callbr(i32 %a, ptr addrspace(1) %x, i32 noundef %n) { +; CHECK-LABEL: kernel_callbr: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: s_load_dword s1, s[8:9], 0x10 +; CHECK-NEXT: s_load_dword s0, s[8:9], 0x0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: s_cmpk_eq_i32 s1, 0x100 +; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0 +; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[2:3] +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: ; %bb.1: ; %if.then +; CHECK-NEXT: s_cmp_eq_u32 s0, 0 +; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0 +; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[2:3] +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: .LBB1_2: ; %if.end6.sink.split +; CHECK-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x8 +; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; CHECK-NEXT: v_mov_b32_e32 v1, s0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: global_store_dword v0, v1, s[2:3] +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: .LBB1_3: ; Inline asm indirect target +; CHECK-NEXT: ; %UnifiedReturnBlock +; CHECK-NEXT: ; Label of block must be emitted +; CHECK-NEXT: s_endpgm +; CHECK-NEXT: .LBB1_4: ; Inline asm indirect target +; CHECK-NEXT: ; %if.else +; CHECK-NEXT: ; Label of block must be emitted +; CHECK-NEXT: v_cmp_gt_u32_e32 vcc, 10, v0 +; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: ; %bb.5: ; %if.then3 +; CHECK-NEXT: s_cmp_eq_u32 s0, 0 +; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0 +; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[2:3] +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: s_branch .LBB1_2 +; CHECK-NEXT: .LBB1_6: ; Inline asm indirect target +; CHECK-NEXT: ; %cond.false.i8 +; CHECK-NEXT: ; Label of block must be emitted +; CHECK-NEXT: .LBB1_7: ; Inline asm indirect target +; CHECK-NEXT: ; %cond.false +; CHECK-NEXT: ; Label of block must be emitted +; CHECK-NEXT: s_trap 2 +; CHECK-NEXT: ; divergent unreachable +; CHECK-NEXT: s_branch .LBB1_3 +; UNIFY-LABEL: @kernel_callbr( +; UNIFY-NEXT: entry: +; UNIFY-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() +; UNIFY-NEXT: [[CMP:%.*]] = icmp eq i32 [[N:%.*]], 256 +; UNIFY-NEXT: [[CMP32:%.*]] = zext i1 [[CMP]] to i32 +; UNIFY-NEXT: callbr void asm "", "r,!i"(i32 [[CMP32]]) +; UNIFY-NEXT: to label [[IF_THEN:%.*]] [label %if.else] +; UNIFY: if.then: +; UNIFY-NEXT: [[CMP1:%.*]] = icmp eq i32 [[A:%.*]], 0 +; UNIFY-NEXT: [[CMP1_32:%.*]] = zext i1 [[CMP1]] to i32 +; UNIFY-NEXT: callbr void asm "", "r,!i"(i32 [[CMP1_32]]) +; UNIFY-NEXT: to label [[IF_END6_SINK_SPLIT:%.*]] [label %cond.false] +; UNIFY: cond.false: +; UNIFY-NEXT: call void @llvm.trap() +; UNIFY-NEXT: unreachable +; UNIFY: if.else: +; UNIFY-NEXT: [[CMP2:%.*]] = icmp ult i32 [[TID]], 10 +; UNIFY-NEXT: [[CMP2_32:%.*]] = zext i1 [[CMP2]] to i32 +; UNIFY-NEXT: callbr void asm "", "r,!i"(i32 [[CMP2_32]]) +; UNIFY-NEXT: to label [[IF_THEN3:%.*]] [label %if.end6] +; UNIFY: if.then3: +; UNIFY-NEXT: [[CMP1_I7:%.*]] = icmp eq i32 [[A]], 0 +; UNIFY-NEXT: [[CMP1_I7_32:%.*]] = zext i1 [[CMP1_I7]] to i32 +; UNIFY-NEXT: callbr void asm "", "r,!i"(i32 [[CMP1_I7_32]]) +; UNIFY-NEXT: to label [[IF_END6_SINK_SPLIT]] [label %cond.false.i8] +; UNIFY: cond.false.i8: +; UNIFY-NEXT: call void @llvm.trap() +; UNIFY-NEXT: unreachable +; UNIFY: if.end6.sink.split: +; UNIFY-NEXT: [[X1:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[X:%.*]], i32 [[TID]] +; UNIFY-NEXT: store i32 [[A]], ptr addrspace(1) 
[[X1]], align 4 +; UNIFY-NEXT: callbr void asm "", ""() +; UNIFY-NEXT: to label [[IF_END6:%.*]] [] +; UNIFY: if.end6: +; UNIFY-NEXT: ret void +; +entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %cmp = icmp eq i32 %n, 256 + %cmp32 = zext i1 %cmp to i32 + callbr void asm "", "r,!i"(i32 %cmp32) to label %if.then [label %if.else] + +if.then: + %cmp1 = icmp eq i32 %a, 0 + %cmp1_32 = zext i1 %cmp1 to i32 + callbr void asm "", "r,!i"(i32 %cmp1_32) to label %if.end6.sink.split [label %cond.false] + +cond.false: + call void @llvm.trap() + unreachable + +if.else: + %cmp2 = icmp ult i32 %tid, 10 + %cmp2_32 = zext i1 %cmp2 to i32 + callbr void asm "", "r,!i"(i32 %cmp2_32) to label %if.then3 [label %if.end6] + +if.then3: + %cmp1.i7 = icmp eq i32 %a, 0 + %cmp1.i7_32 = zext i1 %cmp1.i7 to i32 + callbr void asm "", "r,!i"(i32 %cmp1.i7_32) to label %if.end6.sink.split [label %cond.false.i8] + +cond.false.i8: + call void @llvm.trap() + unreachable + +if.end6.sink.split: + %x1 = getelementptr inbounds i32, ptr addrspace(1) %x, i32 %tid + store i32 %a, ptr addrspace(1) %x1, align 4 + callbr void asm "", ""() to label %if.end6 [] + +if.end6: + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/update-phi.ll b/llvm/test/CodeGen/AMDGPU/update-phi.ll index 50666bee325e8..684dc1a1f0092 100644 --- a/llvm/test/CodeGen/AMDGPU/update-phi.ll +++ b/llvm/test/CodeGen/AMDGPU/update-phi.ll @@ -37,3 +37,42 @@ n28: ; preds = %.loopexit, %n28 n31: ; preds = ret void } + +define amdgpu_ps void @_amdgpu_ps_main_callbr() local_unnamed_addr #3 { +; IR-LABEL: @_amdgpu_ps_main_callbr( +; IR-NEXT: .entry: +; IR-NEXT: callbr void asm "", ""() +; IR-NEXT: to label [[DOTLOOPEXIT:%.*]] [] +; IR: .loopexit: +; IR-NEXT: callbr void asm "", ""() +; IR-NEXT: to label [[N28:%.*]] [] +; IR: n28: +; IR-NEXT: [[DOT01:%.*]] = phi float [ 0.000000e+00, [[DOTLOOPEXIT]] ], [ [[N29:%.*]], [[TRANSITIONBLOCK:%.*]] ] +; IR-NEXT: [[N29]] = fadd float [[DOT01]], 1.000000e+00 +; IR-NEXT: [[N30:%.*]] = fcmp ogt float [[N29]], 4.000000e+00 +; IR-NEXT: [[N30_32:%.*]] = zext i1 [[N30]] to i32 +; IR-NEXT: br i1 true, label [[TRANSITIONBLOCK]], label [[DUMMYRETURNBLOCK:%.*]] +; IR: TransitionBlock: +; IR-NEXT: callbr void asm "", "r,!i"(i32 [[N30_32]]) +; IR-NEXT: to label [[DOTLOOPEXIT]] [label %n28] +; IR: n31: +; IR-NEXT: ret void +; IR: DummyReturnBlock: +; IR-NEXT: ret void +; +.entry: + callbr void asm "", ""() to label %.loopexit [] + +.loopexit: ; preds = %n28, %.entry + callbr void asm "", ""() to label %n28 [] + +n28: ; preds = %.loopexit, %n28 + %.01 = phi float [ 0.000000e+00, %.loopexit ], [ %n29, %n28 ] + %n29 = fadd float %.01, 1.0 + %n30 = fcmp ogt float %n29, 4.000000e+00 + %n30.32 = zext i1 %n30 to i32 + callbr void asm "", "r,!i"(i32 %n30.32) to label %.loopexit [label %n28] + +n31: ; preds = + ret void +} diff --git a/llvm/test/Transforms/StructurizeCFG/callbr.ll b/llvm/test/Transforms/StructurizeCFG/callbr.ll new file mode 100644 index 0000000000000..42f95194980d4 --- /dev/null +++ b/llvm/test/Transforms/StructurizeCFG/callbr.ll @@ -0,0 +1,235 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S -passes=structurizecfg %s -o - | FileCheck %s + +; Structurize as usual, but don't tear callbr and its destination blocks apart. +; +; Note: currently, callbr blocks and their corresponding target blocks +; themselves are not handled by the structurizer.* If the CFG turns out to be +; unstructured at the end, the CFG lowering (si-annotate-control-flow) will +; detect this. 
For the currently intended use cases of callbr in the context of +; the AMDGPU backend, this is not a limitation (cf. +; https://discourse.llvm.org/t/rfc-add-callbr-intrinsic-support/86087). +; +; Note 2: while callbr and its targets remain untouched, everything else is +; handled as usual, even if it is nested in a callbr region. +; +; *FIXME: this will be fixed in the future. Callbr can be handled as follows: +; Input IR: +; ``` +; define void @foo_callbr() { +; callbr void asm "", "!i"() to label %fallthrough [label %indirect, ...] +; fallthrough: +; br label %exit +; indirect: +; br label %exit +; ... +; exit: +; ret void +; } +; ``` +; +; Output IR: +; ``` +; define void @foo_callbr() { +; callbr void asm "", "!i"() +; to label %fallthrough [label %fake.indirect, label %fake.indirect1, label %fake.indirect2, ...] +; fake.indirect: ; preds = %0 +; br label %Flow +; fake.indirect1: ; preds = %0 +; br label %Flow +; fake.indirect2: ; preds = %0 +; br label %Flow +; ... +; Flow: ; preds = %fallthrough, %fake.indirect[0-N] +; %1 = phi i1 [ false, %fallthrough ], [ true, %fake.indirect ], [ false, %fake.indirect[1-N] ] +; br i1 %1, label %indirect, label %Flow1 +; Flow1: ; preds = %Flow, %indirect +; %2 = phi i1 [ false, %Flow], [ true, %fake.indirect1 ], [ false, %indirect ] +; br i1 %2, label %indirect1, label %Flow2 +; Flow2: ; preds = %Flow, %indirect1 +; %2 = phi i1 [ false, %Flow], [ true, %fake.indirect2 ], [ false, %indirect1 ] +; br i1 %2, label %indirect2, label %Flow3 +; ... +; fallthrough: ; preds = %0 +; br label %Flow +; indirect: ; preds = %Flow +; br label %Flow1 +; indirect1: ; preds = %Flow1 +; br label %Flow2 +; indirect2: : preds = %Flow2 +; br label %Flow3 +; ... +; exit: ; preds = %indirectN, %FlowN +; ret void +; } +; ``` +; +; Output IR as ASCII-art: +; %0 +; --------------------- +; | | | | +; v v v v +; f f.i f.i1 f.i2 +; | | | | +; v v v v +; --------------------- +; %Flow +; | \ +; | %indirect +; | / +; %Flow1 +; | \ +; | %indirect1 +; | / +; %Flow2 +; | \ +; | %indirect2 +; | / +; %exit +; + +; Only callbr, nothing to do. 
+define void @callbr_simple() { +; CHECK-LABEL: define void @callbr_simple() { +; CHECK-NEXT: [[CALLBR:.*:]] +; CHECK-NEXT: callbr void asm "", "!i"() +; CHECK-NEXT: to label %[[INDIRECT:.*]] [label %indirect] +; CHECK: [[INDIRECT]]: +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[INDIRECT1:.*:]] +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +callbr: + callbr void asm "", "!i"() to label %fallthrough [label %indirect] +fallthrough: + br label %exit +indirect: + br label %exit +exit: + ret void +} + +; Callbr nested in non-callbr: non-callbr is transformed +define void @callbr_in_non_callbr(i1 %c) { +; CHECK-LABEL: define void @callbr_in_non_callbr( +; CHECK-SAME: i1 [[C:%.*]]) { +; CHECK-NEXT: [[C_INV:%.*]] = xor i1 [[C]], true +; CHECK-NEXT: br i1 [[C_INV]], label %[[NOCALLBR:.*]], label %[[FLOW:.*]] +; CHECK: [[FLOW]]: +; CHECK-NEXT: [[TMP1:%.*]] = phi i1 [ false, %[[NOCALLBR]] ], [ true, [[TMP0:%.*]] ] +; CHECK-NEXT: br i1 [[TMP1]], label %[[CALLBR:.*]], label %[[EXIT:.*]] +; CHECK: [[CALLBR]]: +; CHECK-NEXT: callbr void asm "", "!i"() +; CHECK-NEXT: to label %[[INDIRECT:.*]] [label %indirect] +; CHECK: [[INDIRECT]]: +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[INDIRECT1:.*:]] +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[NOCALLBR]]: +; CHECK-NEXT: br label %[[FLOW]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; + br i1 %c, label %callbr, label %nocallbr +callbr: + callbr void asm "", "!i"() to label %fallthrough [label %indirect] +fallthrough: + br label %exit +indirect: + br label %exit +nocallbr: + br label %exit +exit: + ret void +} + +; Callbr parent of non-callbr: non-callbr is transformed +define void @non_callbr_in_callbr(i1 %c) { +; CHECK-LABEL: define void @non_callbr_in_callbr( +; CHECK-SAME: i1 [[C:%.*]]) { +; CHECK-NEXT: [[C_INV:%.*]] = xor i1 [[C]], true +; CHECK-NEXT: callbr void asm "", "!i"() +; CHECK-NEXT: to label %[[INDIRECT:.*]] [label %indirect] +; CHECK: [[INDIRECT]]: +; CHECK-NEXT: br i1 [[C_INV]], label %[[FALLTHROUGH2:.*]], label %[[FLOW:.*]] +; CHECK: [[FLOW]]: +; CHECK-NEXT: [[TMP1:%.*]] = phi i1 [ false, %[[FALLTHROUGH2]] ], [ true, %[[INDIRECT]] ] +; CHECK-NEXT: br i1 [[TMP1]], label %[[FALLTHROUGH1:.*]], label %[[FLOW1:.*]] +; CHECK: [[FALLTHROUGH1]]: +; CHECK-NEXT: br label %[[FLOW1]] +; CHECK: [[FALLTHROUGH2]]: +; CHECK-NEXT: br label %[[FLOW]] +; CHECK: [[INDIRECT1:.*:]] +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[FLOW1]]: +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; + callbr void asm "", "!i"() to label %fallthrough [label %indirect] +fallthrough: + br i1 %c, label %fallthrough1, label %fallthrough2 +fallthrough1: + br label %exit +fallthrough2: + br label %exit +indirect: + br label %exit +exit: + ret void +} + +; Callbr surrounded by non-callbr: all three regular branches are handled +; correctly +define void @callbr_nested_in_non_callbr(i1 %c, i1 %d, i1 %e, i1 %f) { +; CHECK-LABEL: define void @callbr_nested_in_non_callbr( +; CHECK-SAME: i1 [[C:%.*]], i1 [[D:%.*]], i1 [[E:%.*]], i1 [[F:%.*]]) { +; CHECK-NEXT: [[C_INV:%.*]] = xor i1 [[C]], true +; CHECK-NEXT: br i1 [[C_INV]], label %[[NOCALLBR:.*]], label %[[FLOW3:.*]] +; CHECK: [[FLOW3]]: +; CHECK-NEXT: [[TMP1:%.*]] = phi i1 [ false, %[[FLOW:.*]] ], [ true, [[TMP0:%.*]] ] +; CHECK-NEXT: br i1 [[TMP1]], label %[[CALLBR:.*]], label %[[RET:.*]] +; CHECK: [[CALLBR]]: +; CHECK-NEXT: callbr void asm "", "!i"() +; CHECK-NEXT: to label %[[INDIRECT:.*]] [label %indirect] +; CHECK: [[INDIRECT]]: +; CHECK-NEXT: br i1 [[D]], label 
%[[FALLTHROUGH1:.*]], label %[[FLOW2:.*]] +; CHECK: [[FALLTHROUGH1]]: +; CHECK-NEXT: br label %[[FLOW2]] +; CHECK: [[INDIRECT2:.*:]] +; CHECK-NEXT: br i1 [[E]], label %[[INDIRECT1:.*]], label %[[FLOW1:.*]] +; CHECK: [[INDIRECT1]]: +; CHECK-NEXT: br label %[[FLOW1]] +; CHECK: [[NOCALLBR]]: +; CHECK-NEXT: br i1 [[F]], label %[[NOCALLBR1:.*]], label %[[FLOW]] +; CHECK: [[NOCALLBR1]]: +; CHECK-NEXT: br label %[[FLOW]] +; CHECK: [[FLOW]]: +; CHECK-NEXT: br label %[[FLOW3]] +; CHECK: [[FLOW1]]: +; CHECK-NEXT: br label %[[RET]] +; CHECK: [[FLOW2]]: +; CHECK-NEXT: br label %[[RET]] +; CHECK: [[RET]]: +; CHECK-NEXT: ret void +; + br i1 %c, label %callbr, label %nocallbr +callbr: + callbr void asm "", "!i"() to label %fallthrough [label %indirect] +fallthrough: + br i1 %d, label %fallthrough1, label %ret +fallthrough1: + br label %ret +indirect: + br i1 %e, label %indirect1, label %ret +indirect1: + br label %ret +nocallbr: + br i1 %f, label %nocallbr1, label %ret +nocallbr1: + br label %ret +ret: + ret void +} From 4f618636ddb4938bb91816b66250edc755cdc7d1 Mon Sep 17 00:00:00 2001 From: SKill Date: Mon, 3 Nov 2025 16:12:15 +0100 Subject: [PATCH 017/313] [clang] Optimize SourceManager.getSpellingLocSlowCase and SourceManager.getFileLocSlowCase (#164269) Optimize implementations of `getSpellingLocSlowCase` and `getFileLocSlowCase` by inlining called methods to avoid repeated calls to `getSLocEntry` and `getFileID`. a performance improvement follow up for #160667 --- clang/lib/Basic/SourceManager.cpp | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/clang/lib/Basic/SourceManager.cpp b/clang/lib/Basic/SourceManager.cpp index 938c6485125ee..97aa0f2aa59b9 100644 --- a/clang/lib/Basic/SourceManager.cpp +++ b/clang/lib/Basic/SourceManager.cpp @@ -907,19 +907,23 @@ getExpansionLocSlowCase(SourceLocation Loc) const { SourceLocation SourceManager::getSpellingLocSlowCase(SourceLocation Loc) const { do { - FileIDAndOffset LocInfo = getDecomposedLoc(Loc); - Loc = getSLocEntry(LocInfo.first).getExpansion().getSpellingLoc(); - Loc = Loc.getLocWithOffset(LocInfo.second); + const SLocEntry &Entry = getSLocEntry(getFileID(Loc)); + Loc = Entry.getExpansion().getSpellingLoc().getLocWithOffset( + Loc.getOffset() - Entry.getOffset()); } while (!Loc.isFileID()); return Loc; } SourceLocation SourceManager::getFileLocSlowCase(SourceLocation Loc) const { do { - if (isMacroArgExpansion(Loc)) - Loc = getImmediateSpellingLoc(Loc); - else - Loc = getImmediateExpansionRange(Loc).getBegin(); + const SLocEntry &Entry = getSLocEntry(getFileID(Loc)); + const ExpansionInfo &ExpInfo = Entry.getExpansion(); + if (ExpInfo.isMacroArgExpansion()) { + Loc = ExpInfo.getSpellingLoc().getLocWithOffset(Loc.getOffset() - + Entry.getOffset()); + } else { + Loc = ExpInfo.getExpansionLocStart(); + } } while (!Loc.isFileID()); return Loc; } From 5a11eea58193ed30f1a5a126d33d2627026c1451 Mon Sep 17 00:00:00 2001 From: Vlad Serebrennikov Date: Mon, 3 Nov 2025 19:18:41 +0400 Subject: [PATCH 018/313] [clang] Add sections to C++ DR status page (#165749) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Following [recent discussion](http://lists.isocpp.org/core/2025/10/18637.php) on CWG reflector, `cwg_index.html` now contains stable name of the section for each Core issue. I thought that this would be a useful information to have on our C++ DR status page for that handful of experts who actually open it, so here we are. This PR consists of 3 parts: 1. 
Rewrite of a small routine that parses `cwg_index.html` from splits and
indices to a single regular expression with named groups, adding section
information.
2. Changes to the rest of the `make_cxx_dr_status` script to accommodate the
first part.
3. Regenerated `cxx_dr_status.html`.

Ideally this PR would only add lines to `cxx_dr_status.html`, but previously
we've been leaving some newlines in issue titles (which never affected how
this page is rendered), which are now properly replaced with whitespace —
hence a couple of deletions in that file.

---------

Co-authored-by: Richard Smith
---
 clang/www/cxx_dr_status.html | 3077 +++++++++++++++++++++++++++++++++-
 clang/www/make_cxx_dr_status |   72 +-
 2 files changed, 3109 insertions(+), 40 deletions(-)

diff --git a/clang/www/cxx_dr_status.html b/clang/www/cxx_dr_status.html
index 0312c9dfc0665..e9fadb2dbd4ac 100755
--- a/clang/www/cxx_dr_status.html
+++ b/clang/www/cxx_dr_status.html
@@ -40,2843 +40,3314 @@

 <h1>C++ defect report implementation status</h1>
 
 <p>This page tracks which C++ defect reports are implemented within Clang.</p>
 
-<table width="689" border="1" cellspacing="0">
+<table width="892" border="1" cellspacing="0">
 [The remainder of this hunk -- the regenerated issue table, roughly 3,000
 rows in which every issue gains a new Section column -- survived extraction
 only as bare "+"/"-" markers and is elided.]
diff --git a/clang/www/make_cxx_dr_status b/clang/www/make_cxx_dr_status
index 485a9a56267ca..3ba12e13a7354 100755
--- a/clang/www/make_cxx_dr_status
+++ b/clang/www/make_cxx_dr_status
@@ -10,26 +10,36 @@ output = os.path.join(clang_www_dir, 'cxx_dr_status.html')
 dr_test_dir = os.path.join(clang_www_dir, '../test/CXX/drs')
 
 class DR:
-  def __init__(self, section, issue, url, status, title):
-    self.section, self.issue, self.url, self.status, self.title = \
-      section, issue, url, status, title
+  def __init__(self, *, section_number, section_name, section_link, number, url, status, liaison, title):
+    self.section_number, self.section_name, self.section_link, self.number, self.url, self.status, self.liaison, self.title = \
+      section_number, section_name, section_link, number, url, status, liaison, title
 
   def __repr__(self):
-    return '%s (%s): %s' % (self.issue, self.status, self.title)
-
-def parse(dr):
-  try:
-    section, issue_link, status, liaison, title = [
-      col.split('>', 1)[1].split('</TD>')[0]
-      for col in dr.split('</TR>', 1)[0].split('<TD', 1)[1].split('<', 1)[0])
-    title = title.replace('<issue_title>', '').replace('</issue_title>', '').replace('\r\n', '\n').strip()
-    return DR(section, issue, url, status, title)
+    return '%s (%s): %s' % (self.number, self.status, self.title)
+
+  pattern = re.compile('''
+<TD ALIGN=RIGHT>(?P<section_number>.*) <A HREF="(?P<section_link>.*)">(?P<section_name>.*)</A></TD>
+<TD ALIGN=RIGHT><A HREF="(?P<url>.*)">(?P<number>.*)</A></TD>
+<TD>(?P<status>.*)</TD>
+<TD>(?P<liaison>.*)</TD>
+<TD><issue_title>(?P<title>[\\w\\W]*)</issue_title></TD>
+</TR>''')
+
+  @classmethod
+  def parse_from_html(cls, html_string):
+    match = cls.pattern.match(html_string)
+    if match is None:
+      print(f"Parse error: {html_string}", file=sys.stderr)
+      exit(1)
+    return cls(
+      section_number=match.group('section_number'),
+      section_name=match.group('section_name'),
+      section_link=match.group('section_link'),
+      number=int(match.group('number')),
+      url=match.group('url'),
+      status=match.group('status'),
+      liaison=match.group('liaison'),
+      title=match.group('title').replace('\n', ' ').strip())
 
 def collect_tests():
   status_re = re.compile(r'\bcwg([0-9]+): (.*)')
@@ -68,8 +78,8 @@ def get_issues(path):
     print(ex, file=sys.stderr)
     sys.exit(1)
 
-  return sorted((parse(dr) for dr in buffer.split('<TR>')[2:]),
-                key = lambda dr: dr.issue)
+  return sorted((DR.parse_from_html(dr) for dr in buffer.split('<TR>')[2:]),
+                key = lambda dr: dr.number)
 
 
 issue_list_path = None
@@ -127,9 +137,10 @@ out_html.append('''\
 
 <p>This page tracks which C++ defect reports are implemented within Clang.</p>
 
-<table width="689" border="1" cellspacing="0">
+<table width="892" border="1" cellspacing="0">
   <tr>
     <th>Number</th>
+    <th>Section</th>
     <th>Status</th>
     <th>Issue title</th>
     <th>Available in Clang?</th>
@@ -149,7 +160,7 @@ def availability(issue):
     unresolved_status = unresolved_status_match.group(1)
     proposed_resolution_match = re.search(r' (open|drafting|review|tentatively ready|ready) (\d{4}-\d{2}(?:-\d{2})?|P\d{4}R\d+)$', status)
     if proposed_resolution_match is None:
-      raise AvailabilityError('error: issue {}: \'{}\' status should be followed by a paper number (P1234R5) or proposed resolution in YYYY-MM-DD format'.format(dr.issue, unresolved_status))
+      raise AvailabilityError('error: issue {}: \'{}\' status should be followed by a paper number (P1234R5) or proposed resolution in YYYY-MM-DD format'.format(dr.number, unresolved_status))
     proposed_resolution = proposed_resolution_match.group(2)
     status = status[:-1-len(proposed_resolution)]
     status = status[:-1-len(unresolved_status)]
@@ -236,7 +247,7 @@ def availability(issue):
       avail = 'Duplicate of <a href="#%s">%s</a>' % (dup, dup)
       _, avail_style, _, _ = availability(dup)
   else:
-    raise AvailabilityError('error: unknown status %s for issue %s' % (status, dr.issue))
+    raise AvailabilityError('error: unknown status %s for issue %s' % (status, dr.number))
   return (avail + avail_suffix, avail_style, unresolved_status, details)
 
 count = {}
@@ -254,7 +265,7 @@ for dr in drs:
   elif dr.status in ('open', 'drafting', 'review', 'tentatively ready', 'ready'):
     row_style = ' class="open"'
     try:
-      avail, avail_style, unresolved_status, details = availability(dr.issue)
+      avail, avail_style, unresolved_status, details = availability(dr.number)
     except AvailabilityError as e:
       availability_error_occurred = True
       print(e.args[0])
@@ -267,12 +278,12 @@ for dr in drs:
     if unresolved_status != dr.status:
       availability_error_occurred = True
       print("error: issue %s is marked '%s', which differs from CWG index status '%s'" \
-          % (dr.issue, unresolved_status, dr.status))
+          % (dr.number, unresolved_status, dr.status))
       continue
   else:
     row_style = ''
     try:
-      avail, avail_style, 
unresolved_status, details = availability(dr.issue) + avail, avail_style, unresolved_status, details = availability(dr.number) except AvailabilityError as e: availability_error_occurred = True print(e.args[0]) @@ -281,7 +292,7 @@ for dr in drs: if unresolved_status: availability_error_occurred = True print("error: issue %s is marked '%s', even though it is resolved in CWG index" \ - % (dr.issue, unresolved_status)) + % (dr.number, unresolved_status)) continue if not avail.startswith('Sup') and not avail.startswith('Dup'): @@ -297,8 +308,9 @@ for dr in drs: {details} </details>''' out_html.append(f''' - <tr{row_style} id="{dr.issue}"> - <td><a href="https://cplusplus.github.io/CWG/issues/{dr.issue}.html">{dr.issue}</a></td> + <tr{row_style} id="{dr.number}"> + <td><a href="https://cplusplus.github.io/CWG/issues/{dr.number}.html">{dr.number}</a></td> + <td>[<a href="{dr.section_link}">{dr.section_name}</a>]</td> <td>{dr.status}</td> <td>{dr.title}</td> <td{avail_style} align="center">{avail}</td> From bb9bd5f263226840194b28457ddf9861986db51f Mon Sep 17 00:00:00 2001 From: "Joel E. Denny" <jdenny.ornl@gmail.com> Date: Mon, 3 Nov 2025 10:19:12 -0500 Subject: [PATCH 019/313] [LoopUnroll] Fix assert fail on zeroed branch weights (#165938) BranchProbability fails an assert when its denominator is zero. Reported at <https://github.com/llvm/llvm-project/pull/159163#pullrequestreview-3406318423>. --- llvm/lib/Transforms/Utils/LoopUtils.cpp | 5 +++- .../LoopUnroll/zeroed-branch-weights.ll | 30 +++++++++++++++++++ 2 files changed, 34 insertions(+), 1 deletion(-) create mode 100644 llvm/test/Transforms/LoopUnroll/zeroed-branch-weights.ll diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp index 8be471bee5579..6e60b94be78e3 100644 --- a/llvm/lib/Transforms/Utils/LoopUtils.cpp +++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp @@ -992,9 +992,12 @@ BranchProbability llvm::getBranchProbability(BranchInst *B, uint64_t Weight0, Weight1; if (!extractBranchWeights(*B, Weight0, Weight1)) return BranchProbability::getUnknown(); + uint64_t Denominator = Weight0 + Weight1; + if (Denominator == 0) + return BranchProbability::getUnknown(); if (!ForFirstTarget) std::swap(Weight0, Weight1); - return BranchProbability::getBranchProbability(Weight0, Weight0 + Weight1); + return BranchProbability::getBranchProbability(Weight0, Denominator); } bool llvm::setBranchProbability(BranchInst *B, BranchProbability P, diff --git a/llvm/test/Transforms/LoopUnroll/zeroed-branch-weights.ll b/llvm/test/Transforms/LoopUnroll/zeroed-branch-weights.ll new file mode 100644 index 0000000000000..4d378b0d22f7d --- /dev/null +++ b/llvm/test/Transforms/LoopUnroll/zeroed-branch-weights.ll @@ -0,0 +1,30 @@ +; Check that zeroed branch weights do not crash or otherwise break basic +; LoopUnroll behavior when it tries to compute a probability from them. 
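+; (Illustration only, not part of the original test: with
+; !{!"branch_weights", i32 0, i32 0} the extracted weights sum to zero, so
+; the old code would construct a BranchProbability with denominator 0 and
+; trip its assert; getBranchProbability now returns
+; BranchProbability::getUnknown() instead and unrolling proceeds.)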
+ +; RUN: opt < %s -S -unroll-count=2 -passes='loop-unroll' 2>&1 | FileCheck %s + +define void @test() { +entry: + br label %loop + +loop: + br i1 false, label %end, label %loop, !prof !0 + +end: + ret void +} + +!0 = !{!"branch_weights", i32 0, i32 0} + +; CHECK: define void @test() { +; CHECK: entry: +; CHECK: br label %loop +; CHECK: loop: +; CHECK: br i1 false, label %end, label %loop.1, !prof !0 +; CHECK: loop.1: +; CHECK: br i1 false, label %end, label %loop, !prof !0, !llvm.loop !1 +; CHECK-NOT: loop.2 +; CHECK: end: +; CHECK: ret void +; CHECK: } +; CHECK: !0 = !{!"branch_weights", i32 0, i32 0} From 24c22b7de620669aed9da28de323309c44a58244 Mon Sep 17 00:00:00 2001 From: Krzysztof Parzyszek <Krzysztof.Parzyszek@amd.com> Date: Mon, 3 Nov 2025 09:27:45 -0600 Subject: [PATCH 020/313] [flang][OpenMP] Sort and move macro-based clause checks to the end, NFC (#166175) --- flang/lib/Semantics/check-omp-structure.cpp | 206 ++++++++++---------- 1 file changed, 100 insertions(+), 106 deletions(-) diff --git a/flang/lib/Semantics/check-omp-structure.cpp b/flang/lib/Semantics/check-omp-structure.cpp index e7e3f4d886b34..d7db15dd37949 100644 --- a/flang/lib/Semantics/check-omp-structure.cpp +++ b/flang/lib/Semantics/check-omp-structure.cpp @@ -195,30 +195,6 @@ void OmpStructureChecker::Leave(const parser::ExecutionPart &) { partStack_.pop_back(); } -// Use when clause falls under 'struct OmpClause' in 'parse-tree.h'. -#define CHECK_SIMPLE_CLAUSE(X, Y) \ - void OmpStructureChecker::Enter(const parser::OmpClause::X &) { \ - CheckAllowedClause(llvm::omp::Clause::Y); \ - } - -#define CHECK_REQ_CONSTANT_SCALAR_INT_CLAUSE(X, Y) \ - void OmpStructureChecker::Enter(const parser::OmpClause::X &c) { \ - CheckAllowedClause(llvm::omp::Clause::Y); \ - RequiresConstantPositiveParameter(llvm::omp::Clause::Y, c.v); \ - } - -#define CHECK_REQ_SCALAR_INT_CLAUSE(X, Y) \ - void OmpStructureChecker::Enter(const parser::OmpClause::X &c) { \ - CheckAllowedClause(llvm::omp::Clause::Y); \ - RequiresPositiveParameter(llvm::omp::Clause::Y, c.v); \ - } - -// Use when clause don't falls under 'struct OmpClause' in 'parse-tree.h'. -#define CHECK_SIMPLE_PARSER_CLAUSE(X, Y) \ - void OmpStructureChecker::Enter(const parser::X &) { \ - CheckAllowedClause(llvm::omp::Y); \ - } - // 'OmpWorkshareBlockChecker' is used to check the validity of the assignment // statements and the expressions enclosed in an OpenMP Workshare construct class OmpWorkshareBlockChecker { @@ -3391,88 +3367,6 @@ void OmpStructureChecker::Enter(const parser::OmpClause::Sizes &c) { /*paramName=*/"parameter", /*allowZero=*/false); } -// Following clauses do not have a separate node in parse-tree.h. 
-CHECK_SIMPLE_CLAUSE(Absent, OMPC_absent) -CHECK_SIMPLE_CLAUSE(Affinity, OMPC_affinity) -CHECK_SIMPLE_CLAUSE(Capture, OMPC_capture) -CHECK_SIMPLE_CLAUSE(Contains, OMPC_contains) -CHECK_SIMPLE_CLAUSE(Default, OMPC_default) -CHECK_SIMPLE_CLAUSE(Depobj, OMPC_depobj) -CHECK_SIMPLE_CLAUSE(DeviceType, OMPC_device_type) -CHECK_SIMPLE_CLAUSE(DistSchedule, OMPC_dist_schedule) -CHECK_SIMPLE_CLAUSE(DynGroupprivate, OMPC_dyn_groupprivate) -CHECK_SIMPLE_CLAUSE(Exclusive, OMPC_exclusive) -CHECK_SIMPLE_CLAUSE(Final, OMPC_final) -CHECK_SIMPLE_CLAUSE(Flush, OMPC_flush) -CHECK_SIMPLE_CLAUSE(Full, OMPC_full) -CHECK_SIMPLE_CLAUSE(Grainsize, OMPC_grainsize) -CHECK_SIMPLE_CLAUSE(GraphId, OMPC_graph_id) -CHECK_SIMPLE_CLAUSE(GraphReset, OMPC_graph_reset) -CHECK_SIMPLE_CLAUSE(Holds, OMPC_holds) -CHECK_SIMPLE_CLAUSE(Inclusive, OMPC_inclusive) -CHECK_SIMPLE_CLAUSE(Initializer, OMPC_initializer) -CHECK_SIMPLE_CLAUSE(Match, OMPC_match) -CHECK_SIMPLE_CLAUSE(Nontemporal, OMPC_nontemporal) -CHECK_SIMPLE_CLAUSE(NumTasks, OMPC_num_tasks) -CHECK_SIMPLE_CLAUSE(Order, OMPC_order) -CHECK_SIMPLE_CLAUSE(Read, OMPC_read) -CHECK_SIMPLE_CLAUSE(Threadprivate, OMPC_threadprivate) -CHECK_SIMPLE_CLAUSE(Groupprivate, OMPC_groupprivate) -CHECK_SIMPLE_CLAUSE(Threads, OMPC_threads) -CHECK_SIMPLE_CLAUSE(Threadset, OMPC_threadset) -CHECK_SIMPLE_CLAUSE(Inbranch, OMPC_inbranch) -CHECK_SIMPLE_CLAUSE(Link, OMPC_link) -CHECK_SIMPLE_CLAUSE(Indirect, OMPC_indirect) -CHECK_SIMPLE_CLAUSE(Mergeable, OMPC_mergeable) -CHECK_SIMPLE_CLAUSE(NoOpenmp, OMPC_no_openmp) -CHECK_SIMPLE_CLAUSE(NoOpenmpRoutines, OMPC_no_openmp_routines) -CHECK_SIMPLE_CLAUSE(NoOpenmpConstructs, OMPC_no_openmp_constructs) -CHECK_SIMPLE_CLAUSE(NoParallelism, OMPC_no_parallelism) -CHECK_SIMPLE_CLAUSE(Nogroup, OMPC_nogroup) -CHECK_SIMPLE_CLAUSE(Notinbranch, OMPC_notinbranch) -CHECK_SIMPLE_CLAUSE(Partial, OMPC_partial) -CHECK_SIMPLE_CLAUSE(ProcBind, OMPC_proc_bind) -CHECK_SIMPLE_CLAUSE(Simd, OMPC_simd) -CHECK_SIMPLE_CLAUSE(Permutation, OMPC_permutation) -CHECK_SIMPLE_CLAUSE(Uniform, OMPC_uniform) -CHECK_SIMPLE_CLAUSE(Unknown, OMPC_unknown) -CHECK_SIMPLE_CLAUSE(Untied, OMPC_untied) -CHECK_SIMPLE_CLAUSE(UsesAllocators, OMPC_uses_allocators) -CHECK_SIMPLE_CLAUSE(Write, OMPC_write) -CHECK_SIMPLE_CLAUSE(Init, OMPC_init) -CHECK_SIMPLE_CLAUSE(Use, OMPC_use) -CHECK_SIMPLE_CLAUSE(Novariants, OMPC_novariants) -CHECK_SIMPLE_CLAUSE(Nocontext, OMPC_nocontext) -CHECK_SIMPLE_CLAUSE(Severity, OMPC_severity) -CHECK_SIMPLE_CLAUSE(Message, OMPC_message) -CHECK_SIMPLE_CLAUSE(Filter, OMPC_filter) -CHECK_SIMPLE_CLAUSE(Otherwise, OMPC_otherwise) -CHECK_SIMPLE_CLAUSE(AdjustArgs, OMPC_adjust_args) -CHECK_SIMPLE_CLAUSE(AppendArgs, OMPC_append_args) -CHECK_SIMPLE_CLAUSE(MemoryOrder, OMPC_memory_order) -CHECK_SIMPLE_CLAUSE(Bind, OMPC_bind) -CHECK_SIMPLE_CLAUSE(Compare, OMPC_compare) -CHECK_SIMPLE_CLAUSE(OmpxAttribute, OMPC_ompx_attribute) -CHECK_SIMPLE_CLAUSE(Weak, OMPC_weak) -CHECK_SIMPLE_CLAUSE(AcqRel, OMPC_acq_rel) -CHECK_SIMPLE_CLAUSE(Acquire, OMPC_acquire) -CHECK_SIMPLE_CLAUSE(Relaxed, OMPC_relaxed) -CHECK_SIMPLE_CLAUSE(Release, OMPC_release) -CHECK_SIMPLE_CLAUSE(Replayable, OMPC_replayable) -CHECK_SIMPLE_CLAUSE(Transparent, OMPC_transparent) -CHECK_SIMPLE_CLAUSE(SeqCst, OMPC_seq_cst) -CHECK_SIMPLE_CLAUSE(Fail, OMPC_fail) - -CHECK_REQ_SCALAR_INT_CLAUSE(NumTeams, OMPC_num_teams) -CHECK_REQ_SCALAR_INT_CLAUSE(NumThreads, OMPC_num_threads) -CHECK_REQ_SCALAR_INT_CLAUSE(OmpxDynCgroupMem, OMPC_ompx_dyn_cgroup_mem) -CHECK_REQ_SCALAR_INT_CLAUSE(Priority, OMPC_priority) -CHECK_REQ_SCALAR_INT_CLAUSE(ThreadLimit, 
OMPC_thread_limit) - -CHECK_REQ_CONSTANT_SCALAR_INT_CLAUSE(Collapse, OMPC_collapse) -CHECK_REQ_CONSTANT_SCALAR_INT_CLAUSE(Safelen, OMPC_safelen) -CHECK_REQ_CONSTANT_SCALAR_INT_CLAUSE(Simdlen, OMPC_simdlen) - void OmpStructureChecker::Enter(const parser::OmpClause::Looprange &x) { context_.Say(GetContext().clauseSource, "LOOPRANGE clause is not implemented yet"_err_en_US, @@ -5545,4 +5439,104 @@ void OmpStructureChecker::CheckAllowedRequiresClause(llvmOmpClause clause) { } } +// Use when clause falls under 'struct OmpClause' in 'parse-tree.h'. +#define CHECK_SIMPLE_CLAUSE(X, Y) \ + void OmpStructureChecker::Enter(const parser::OmpClause::X &) { \ + CheckAllowedClause(llvm::omp::Clause::Y); \ + } + +#define CHECK_REQ_CONSTANT_SCALAR_INT_CLAUSE(X, Y) \ + void OmpStructureChecker::Enter(const parser::OmpClause::X &c) { \ + CheckAllowedClause(llvm::omp::Clause::Y); \ + RequiresConstantPositiveParameter(llvm::omp::Clause::Y, c.v); \ + } + +#define CHECK_REQ_SCALAR_INT_CLAUSE(X, Y) \ + void OmpStructureChecker::Enter(const parser::OmpClause::X &c) { \ + CheckAllowedClause(llvm::omp::Clause::Y); \ + RequiresPositiveParameter(llvm::omp::Clause::Y, c.v); \ + } + +// Following clauses do not have a separate node in parse-tree.h. +CHECK_SIMPLE_CLAUSE(Absent, OMPC_absent) +CHECK_SIMPLE_CLAUSE(AcqRel, OMPC_acq_rel) +CHECK_SIMPLE_CLAUSE(Acquire, OMPC_acquire) +CHECK_SIMPLE_CLAUSE(AdjustArgs, OMPC_adjust_args) +CHECK_SIMPLE_CLAUSE(Affinity, OMPC_affinity) +CHECK_SIMPLE_CLAUSE(AppendArgs, OMPC_append_args) +CHECK_SIMPLE_CLAUSE(Bind, OMPC_bind) +CHECK_SIMPLE_CLAUSE(Capture, OMPC_capture) +CHECK_SIMPLE_CLAUSE(Compare, OMPC_compare) +CHECK_SIMPLE_CLAUSE(Contains, OMPC_contains) +CHECK_SIMPLE_CLAUSE(Default, OMPC_default) +CHECK_SIMPLE_CLAUSE(Depobj, OMPC_depobj) +CHECK_SIMPLE_CLAUSE(DeviceType, OMPC_device_type) +CHECK_SIMPLE_CLAUSE(DistSchedule, OMPC_dist_schedule) +CHECK_SIMPLE_CLAUSE(DynGroupprivate, OMPC_dyn_groupprivate) +CHECK_SIMPLE_CLAUSE(Exclusive, OMPC_exclusive) +CHECK_SIMPLE_CLAUSE(Fail, OMPC_fail) +CHECK_SIMPLE_CLAUSE(Filter, OMPC_filter) +CHECK_SIMPLE_CLAUSE(Final, OMPC_final) +CHECK_SIMPLE_CLAUSE(Flush, OMPC_flush) +CHECK_SIMPLE_CLAUSE(Full, OMPC_full) +CHECK_SIMPLE_CLAUSE(Grainsize, OMPC_grainsize) +CHECK_SIMPLE_CLAUSE(GraphId, OMPC_graph_id) +CHECK_SIMPLE_CLAUSE(GraphReset, OMPC_graph_reset) +CHECK_SIMPLE_CLAUSE(Groupprivate, OMPC_groupprivate) +CHECK_SIMPLE_CLAUSE(Holds, OMPC_holds) +CHECK_SIMPLE_CLAUSE(Inbranch, OMPC_inbranch) +CHECK_SIMPLE_CLAUSE(Inclusive, OMPC_inclusive) +CHECK_SIMPLE_CLAUSE(Indirect, OMPC_indirect) +CHECK_SIMPLE_CLAUSE(Initializer, OMPC_initializer) +CHECK_SIMPLE_CLAUSE(Init, OMPC_init) +CHECK_SIMPLE_CLAUSE(Link, OMPC_link) +CHECK_SIMPLE_CLAUSE(Match, OMPC_match) +CHECK_SIMPLE_CLAUSE(MemoryOrder, OMPC_memory_order) +CHECK_SIMPLE_CLAUSE(Mergeable, OMPC_mergeable) +CHECK_SIMPLE_CLAUSE(Message, OMPC_message) +CHECK_SIMPLE_CLAUSE(Nocontext, OMPC_nocontext) +CHECK_SIMPLE_CLAUSE(Nogroup, OMPC_nogroup) +CHECK_SIMPLE_CLAUSE(Nontemporal, OMPC_nontemporal) +CHECK_SIMPLE_CLAUSE(NoOpenmpConstructs, OMPC_no_openmp_constructs) +CHECK_SIMPLE_CLAUSE(NoOpenmp, OMPC_no_openmp) +CHECK_SIMPLE_CLAUSE(NoOpenmpRoutines, OMPC_no_openmp_routines) +CHECK_SIMPLE_CLAUSE(NoParallelism, OMPC_no_parallelism) +CHECK_SIMPLE_CLAUSE(Notinbranch, OMPC_notinbranch) +CHECK_SIMPLE_CLAUSE(Novariants, OMPC_novariants) +CHECK_SIMPLE_CLAUSE(NumTasks, OMPC_num_tasks) +CHECK_SIMPLE_CLAUSE(OmpxAttribute, OMPC_ompx_attribute) +CHECK_SIMPLE_CLAUSE(Order, OMPC_order) +CHECK_SIMPLE_CLAUSE(Otherwise, OMPC_otherwise) 
+CHECK_SIMPLE_CLAUSE(Partial, OMPC_partial) +CHECK_SIMPLE_CLAUSE(Permutation, OMPC_permutation) +CHECK_SIMPLE_CLAUSE(ProcBind, OMPC_proc_bind) +CHECK_SIMPLE_CLAUSE(Read, OMPC_read) +CHECK_SIMPLE_CLAUSE(Relaxed, OMPC_relaxed) +CHECK_SIMPLE_CLAUSE(Release, OMPC_release) +CHECK_SIMPLE_CLAUSE(Replayable, OMPC_replayable) +CHECK_SIMPLE_CLAUSE(SeqCst, OMPC_seq_cst) +CHECK_SIMPLE_CLAUSE(Severity, OMPC_severity) +CHECK_SIMPLE_CLAUSE(Simd, OMPC_simd) +CHECK_SIMPLE_CLAUSE(Threadprivate, OMPC_threadprivate) +CHECK_SIMPLE_CLAUSE(Threadset, OMPC_threadset) +CHECK_SIMPLE_CLAUSE(Threads, OMPC_threads) +CHECK_SIMPLE_CLAUSE(Transparent, OMPC_transparent) +CHECK_SIMPLE_CLAUSE(Uniform, OMPC_uniform) +CHECK_SIMPLE_CLAUSE(Unknown, OMPC_unknown) +CHECK_SIMPLE_CLAUSE(Untied, OMPC_untied) +CHECK_SIMPLE_CLAUSE(Use, OMPC_use) +CHECK_SIMPLE_CLAUSE(UsesAllocators, OMPC_uses_allocators) +CHECK_SIMPLE_CLAUSE(Weak, OMPC_weak) +CHECK_SIMPLE_CLAUSE(Write, OMPC_write) + +CHECK_REQ_SCALAR_INT_CLAUSE(NumTeams, OMPC_num_teams) +CHECK_REQ_SCALAR_INT_CLAUSE(NumThreads, OMPC_num_threads) +CHECK_REQ_SCALAR_INT_CLAUSE(OmpxDynCgroupMem, OMPC_ompx_dyn_cgroup_mem) +CHECK_REQ_SCALAR_INT_CLAUSE(Priority, OMPC_priority) +CHECK_REQ_SCALAR_INT_CLAUSE(ThreadLimit, OMPC_thread_limit) + +CHECK_REQ_CONSTANT_SCALAR_INT_CLAUSE(Collapse, OMPC_collapse) +CHECK_REQ_CONSTANT_SCALAR_INT_CLAUSE(Safelen, OMPC_safelen) +CHECK_REQ_CONSTANT_SCALAR_INT_CLAUSE(Simdlen, OMPC_simdlen) + } // namespace Fortran::semantics From 47c54d55c9fac5ea7c87881e00f96e8c12b18174 Mon Sep 17 00:00:00 2001 From: Matthew Nagy <matthew.nagy@sony.com> Date: Mon, 3 Nov 2025 15:37:26 +0000 Subject: [PATCH 021/313] [UBSan] Improve error message when a misalignment is due to target default assumed alignment --- clang/lib/CodeGen/CGExprCXX.cpp | 21 ++++++++--- clang/lib/CodeGen/CodeGenFunction.h | 5 ++- compiler-rt/lib/ubsan/ubsan_checks.inc | 1 + compiler-rt/lib/ubsan/ubsan_handlers.cpp | 33 +++++++++++++---- .../TestCases/TypeCheck/minimum-alignment.cpp | 36 +++++++++++++++++++ .../ubsan/TestCases/TypeCheck/misaligned.cpp | 2 +- 6 files changed, 86 insertions(+), 12 deletions(-) create mode 100644 compiler-rt/test/ubsan/TestCases/TypeCheck/minimum-alignment.cpp diff --git a/clang/lib/CodeGen/CGExprCXX.cpp b/clang/lib/CodeGen/CGExprCXX.cpp index 14d8db32bafc6..f2dd22e9bed3b 100644 --- a/clang/lib/CodeGen/CGExprCXX.cpp +++ b/clang/lib/CodeGen/CGExprCXX.cpp @@ -18,6 +18,9 @@ #include "ConstantEmitter.h" #include "TargetInfo.h" #include "clang/Basic/CodeGenOptions.h" +#include "clang/Basic/Sanitizers.h" +#include "clang/Basic/SourceLocation.h" +#include "clang/Basic/SourceManager.h" #include "clang/CodeGen/CGFunctionInfo.h" #include "llvm/IR/Intrinsics.h" @@ -1749,6 +1752,17 @@ llvm::Value *CodeGenFunction::EmitCXXNewExpr(const CXXNewExpr *E) { allocator->isReservedGlobalPlacementOperator()) result = Builder.CreateLaunderInvariantGroup(result); + // Check the default alignment of the type and why. Users may incorrectly + // return misaligned memory from a replaced operator new without knowing + // about default alignment. 
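+  // ("Why": the target grants new-expressions a minimum assumed alignment.
+  // For example, a user-replaced 'operator new' may hand back malloc()+8,
+  // which satisfies an 8-byte-aligned type but not the 16-byte alignment
+  // assumed here; see minimum-alignment.cpp below.)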
+ TypeCheckKind checkKind = CodeGenFunction::TCK_ConstructorCall;
+ const TargetInfo &TI = getContext().getTargetInfo();
+ unsigned DefaultTargetAlignment = TI.getNewAlign() / TI.getCharWidth();
+ if (SanOpts.has(SanitizerKind::Alignment) &&
+ (DefaultTargetAlignment >
+ CGM.getContext().getTypeAlignInChars(allocType).getQuantity()))
+ checkKind = CodeGenFunction::TCK_ConstructorCallMinimumAlign;
+
 // Emit sanitizer checks for pointer value now, so that in the case of an
 // array it was checked only once and not at each constructor call. We may
 // have already checked that the pointer is non-null.
@@ -1756,10 +1770,9 @@ llvm::Value *CodeGenFunction::EmitCXXNewExpr(const CXXNewExpr *E) {
 // we'll null check the wrong pointer here.
 SanitizerSet SkippedChecks;
 SkippedChecks.set(SanitizerKind::Null, nullCheck);
- EmitTypeCheck(CodeGenFunction::TCK_ConstructorCall,
- E->getAllocatedTypeSourceInfo()->getTypeLoc().getBeginLoc(),
- result, allocType, result.getAlignment(), SkippedChecks,
- numElements);
+ EmitTypeCheck(
+ checkKind, E->getAllocatedTypeSourceInfo()->getTypeLoc().getBeginLoc(),
+ result, allocType, result.getAlignment(), SkippedChecks, numElements);
 
 EmitNewInitializer(*this, E, allocType, elementTy, result, numElements,
 allocSizeWithoutCookie);
diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h
index 8c4c1c8c2dc95..047ca844c79de 100644
--- a/clang/lib/CodeGen/CodeGenFunction.h
+++ b/clang/lib/CodeGen/CodeGenFunction.h
@@ -3296,7 +3296,10 @@ class CodeGenFunction : public CodeGenTypeCache {
 TCK_NonnullAssign,
 /// Checking the operand of a dynamic_cast or a typeid expression. Must be
 /// null or an object within its lifetime.
- TCK_DynamicOperation
+ TCK_DynamicOperation,
+ /// Checking the 'this' pointer for a constructor call, including that the
+ /// alignment is greater than or equal to the target's minimum alignment.
+ TCK_ConstructorCallMinimumAlign
 };
 
 /// Determine whether the pointer type check \p TCK permits null pointers.
diff --git a/compiler-rt/lib/ubsan/ubsan_checks.inc b/compiler-rt/lib/ubsan/ubsan_checks.inc
index b1d09a9024e7e..f8757d781afb8 100644
--- a/compiler-rt/lib/ubsan/ubsan_checks.inc
+++ b/compiler-rt/lib/ubsan/ubsan_checks.inc
@@ -28,6 +28,7 @@ UBSAN_CHECK(NullptrAfterNonZeroOffset, "nullptr-after-nonzero-offset",
 UBSAN_CHECK(PointerOverflow, "pointer-overflow", "pointer-overflow")
 UBSAN_CHECK(MisalignedPointerUse, "misaligned-pointer-use", "alignment")
 UBSAN_CHECK(AlignmentAssumption, "alignment-assumption", "alignment")
+UBSAN_CHECK(MinimumAssumedAlignment, "minimum-assumed-alignment", "alignment")
 UBSAN_CHECK(InsufficientObjectSize, "insufficient-object-size", "object-size")
 UBSAN_CHECK(SignedIntegerOverflow, "signed-integer-overflow",
 "signed-integer-overflow")
diff --git a/compiler-rt/lib/ubsan/ubsan_handlers.cpp b/compiler-rt/lib/ubsan/ubsan_handlers.cpp
index 63319f46734a4..fc6063af4562b 100644
--- a/compiler-rt/lib/ubsan/ubsan_handlers.cpp
+++ b/compiler-rt/lib/ubsan/ubsan_handlers.cpp
@@ -73,14 +73,26 @@ enum TypeCheckKind {
 TCK_NonnullAssign,
 /// Checking the operand of a dynamic_cast or a typeid expression. Must be
 /// null or an object within its lifetime.
- TCK_DynamicOperation
+ TCK_DynamicOperation,
+ /// Checking the 'this' pointer for a constructor call, including that the
+ /// alignment is greater than or equal to the target's minimum alignment.
+ TCK_ConstructorCallMinimumAlign
 };
 
 extern const char *const TypeCheckKinds[] = {
- "load of", "store to", "reference binding to", "member access within",
- "member call on", "constructor call on", "downcast of", "downcast of",
- "upcast of", "cast to virtual base of", "_Nonnull binding to",
- "dynamic operation on"};
+ "load of",
+ "store to",
+ "reference binding to",
+ "member access within",
+ "member call on",
+ "constructor call on",
+ "downcast of",
+ "downcast of",
+ "upcast of",
+ "cast to virtual base of",
+ "_Nonnull binding to",
+ "dynamic operation on",
+ "constructor call with pointer from operator new on"};
 }
 
 static void handleTypeMismatchImpl(TypeMismatchData *Data, ValueHandle Pointer,
@@ -94,7 +106,9 @@ static void handleTypeMismatchImpl(TypeMismatchData *Data, ValueHandle Pointer,
 ? ErrorType::NullPointerUseWithNullability
 : ErrorType::NullPointerUse;
 else if (Pointer & (Alignment - 1))
- ET = ErrorType::MisalignedPointerUse;
+ ET = (Data->TypeCheckKind == TCK_ConstructorCallMinimumAlign)
+ ? ErrorType::MinimumAssumedAlignment
+ : ErrorType::MisalignedPointerUse;
 else
 ET = ErrorType::InsufficientObjectSize;
 
@@ -117,6 +131,13 @@ static void handleTypeMismatchImpl(TypeMismatchData *Data, ValueHandle Pointer,
 Diag(Loc, DL_Error, ET, "%0 null pointer of type %1")
 << TypeCheckKinds[Data->TypeCheckKind] << Data->Type;
 break;
+ case ErrorType::MinimumAssumedAlignment:
+ Diag(Loc, DL_Error, ET,
+ "%0 misaligned address %1 for type %2, "
+ "which requires target minimum assumed %3 byte alignment")
+ << TypeCheckKinds[Data->TypeCheckKind] << (void *)Pointer << Data->Type
+ << Alignment;
+ break;
 case ErrorType::MisalignedPointerUse:
 Diag(Loc, DL_Error, ET, "%0 misaligned address %1 for type %3, "
 "which requires %2 byte alignment")
diff --git a/compiler-rt/test/ubsan/TestCases/TypeCheck/minimum-alignment.cpp b/compiler-rt/test/ubsan/TestCases/TypeCheck/minimum-alignment.cpp
new file mode 100644
index 0000000000000..4642126ab74c4
--- /dev/null
+++ b/compiler-rt/test/ubsan/TestCases/TypeCheck/minimum-alignment.cpp
@@ -0,0 +1,36 @@
+// RUN: %clangxx %gmlt -fsanitize=alignment %s -o %t
+// RUN: %run %t 2>&1 | FileCheck %s
+
+// UNSUPPORTED: i386
+// UNSUPPORTED: armv7l
+
+// These sanitizers already overload the new operator so won't compile this test
+// UNSUPPORTED: ubsan-msan
+// UNSUPPORTED: ubsan-tsan
+
+#include <cassert>
+#include <cstdlib>
+
+void *operator new(std::size_t count) {
+ constexpr const size_t offset = 8;
+
+ // allocate a bit more so we can safely offset it
+ void *ptr = std::malloc(count + offset);
+
+ // verify malloc returned 16-byte aligned memory
+ static_assert(__STDCPP_DEFAULT_NEW_ALIGNMENT__ == 16);
+ assert(((std::ptrdiff_t)ptr & (__STDCPP_DEFAULT_NEW_ALIGNMENT__ - 1)) == 0);
+
+ return (char *)ptr + offset;
+}
+
+struct Foo {
+ void *_cookie1, *_cookie2;
+};
+
+static_assert(alignof(Foo) == 8);
+int main() {
+ // CHECK: runtime error: constructor call with pointer from operator new on misaligned address 0x{{.*}} for type 'Foo', which requires target minimum assumed 16 byte alignment
+ Foo *f = new Foo;
+ return 0;
+}
diff --git a/compiler-rt/test/ubsan/TestCases/TypeCheck/misaligned.cpp b/compiler-rt/test/ubsan/TestCases/TypeCheck/misaligned.cpp
index e39a0ab4e6589..4b0b2b5923c6f 100644
--- a/compiler-rt/test/ubsan/TestCases/TypeCheck/misaligned.cpp
+++ 
b/compiler-rt/test/ubsan/TestCases/TypeCheck/misaligned.cpp @@ -101,7 +101,7 @@ int main(int, char **argv) { return s->f() && 0; case 'n': - // CHECK-NEW: misaligned.cpp:[[@LINE+4]]{{(:21)?}}: runtime error: constructor call on misaligned address [[PTR:0x[0-9a-f]*]] for type 'S', which requires 4 byte alignment + // CHECK-NEW: misaligned.cpp:[[@LINE+4]]{{(:21)?}}: runtime error: constructor call with pointer from operator new on misaligned address [[PTR:0x[0-9a-f]*]] for type 'S', which requires target minimum assumed 4 byte alignment // CHECK-NEW-NEXT: [[PTR]]: note: pointer points here // CHECK-NEW-NEXT: {{^ 00 00 00 01 02 03 04 05}} // CHECK-NEW-NEXT: {{^ \^}} From 613c6de977d2917ef9f6c3d14522656f31448f4d Mon Sep 17 00:00:00 2001 From: Timm Baeder <tbaeder@redhat.com> Date: Mon, 3 Nov 2025 16:42:34 +0100 Subject: [PATCH 022/313] [clang] Adjust TextDiagnostic style ranges for interesting source region (#164941) After: <img width="1904" height="186" alt="Screenshot From 2025-10-24 09-59-40" src="https://github.com/user-attachments/assets/c860227f-50c5-4afe-a959-83e3452fc72d" /> <img width="1366" height="204" alt="Screenshot From 2025-10-24 09-59-12" src="https://github.com/user-attachments/assets/450bffec-b4b2-465c-b435-bddf8ebdbd32" /> <img width="1310" height="204" alt="Screenshot From 2025-10-24 09-58-53" src="https://github.com/user-attachments/assets/8015ec6f-e032-4f0b-b55c-b2c718d14f6b" /> --- clang/lib/Frontend/TextDiagnostic.cpp | 76 ++++++++++++++----- ...diags-interesting-source-region-colors.cpp | 30 ++++++++ 2 files changed, 87 insertions(+), 19 deletions(-) create mode 100644 clang/test/Frontend/diags-interesting-source-region-colors.cpp diff --git a/clang/lib/Frontend/TextDiagnostic.cpp b/clang/lib/Frontend/TextDiagnostic.cpp index aea3e72d92a84..10032184b5d94 100644 --- a/clang/lib/Frontend/TextDiagnostic.cpp +++ b/clang/lib/Frontend/TextDiagnostic.cpp @@ -349,14 +349,13 @@ struct SourceColumnMap { /// When the source code line we want to print is too long for /// the terminal, select the "interesting" region. -static void selectInterestingSourceRegion(std::string &SourceLine, - std::string &CaretLine, - std::string &FixItInsertionLine, - Columns NonGutterColumns, - const SourceColumnMap &Map) { - Columns CaretColumns = Columns(CaretLine.size()); - Columns FixItColumns = - Columns(llvm::sys::locale::columnWidth(FixItInsertionLine)); +static void selectInterestingSourceRegion( + std::string &SourceLine, std::string &CaretLine, + std::string &FixItInsertionLine, Columns NonGutterColumns, + const SourceColumnMap &Map, + SmallVectorImpl<clang::TextDiagnostic::StyleRange> &Styles) { + Columns CaretColumns = CaretLine.size(); + Columns FixItColumns = llvm::sys::locale::columnWidth(FixItInsertionLine); Columns MaxColumns = std::max({Map.columns().V, CaretColumns.V, FixItColumns.V}); // if the number of columns is less than the desired number we're done @@ -369,13 +368,11 @@ static void selectInterestingSourceRegion(std::string &SourceLine, // Find the slice that we need to display the full caret line // correctly. 
Columns CaretStart = 0, CaretEnd = CaretLine.size(); - for (; CaretStart != CaretEnd; CaretStart = CaretStart.next()) - if (!isWhitespace(CaretLine[CaretStart.V])) - break; + while (CaretStart != CaretEnd && isWhitespace(CaretLine[CaretStart.V])) + CaretStart = CaretStart.next(); - for (; CaretEnd != CaretStart; CaretEnd = CaretEnd.prev()) - if (!isWhitespace(CaretLine[CaretEnd.V - 1])) - break; + while (CaretEnd != CaretStart && isWhitespace(CaretLine[CaretEnd.V])) + CaretEnd = CaretEnd.prev(); // caret has already been inserted into CaretLine so the above whitespace // check is guaranteed to include the caret @@ -516,13 +513,45 @@ static void selectInterestingSourceRegion(std::string &SourceLine, assert(FrontColumnsRemoved + ColumnsKept + BackColumnsRemoved > NonGutterColumns); + // Since we've modified the SourceLine, we also need to adjust the line's + // highlighting information. In particular, if we've removed + // from the front of the line, we need to move the style ranges to the + // left and remove unneeded ranges. + // Note in particular that variables like CaretEnd are defined in the + // CaretLine, which only contains ASCII, while the style ranges are defined in + // the source line, where we have to care for the byte-index != column-index + // case. + Bytes BytesRemoved = + FrontColumnsRemoved > FrontEllipse.size() + ? (Map.columnToByte(FrontColumnsRemoved) - Bytes(FrontEllipse.size())) + : 0; + Bytes CodeEnd = + CaretEnd < Map.columns() ? Map.columnToByte(CaretEnd.V) : CaretEnd.V; + for (TextDiagnostic::StyleRange &R : Styles) { + // Remove style ranges before and after the new truncated snippet. + if (R.Start >= static_cast<unsigned>(CodeEnd.V) || + R.End < static_cast<unsigned>(BytesRemoved.V)) { + R.Start = R.End = std::numeric_limits<int>::max(); + continue; + } + // Move them left. (Note that this can wrap R.Start, but that doesn't + // matter). + R.Start -= BytesRemoved.V; + R.End -= BytesRemoved.V; + + // Don't leak into the ellipse at the end. + if (R.Start < static_cast<unsigned>(CodeEnd.V) && + R.End > static_cast<unsigned>(CodeEnd.V)) + R.End = CodeEnd.V + 1; // R.End is inclusive. + } + // The line needs some truncation, and we'd prefer to keep the front // if possible, so remove the back if (BackColumnsRemoved > Columns(BackEllipse.size())) SourceLine.replace(SourceEnd.V, std::string::npos, BackEllipse); // If that's enough then we're done - if (FrontColumnsRemoved + ColumnsKept <= Columns(NonGutterColumns)) + if (FrontColumnsRemoved + ColumnsKept <= NonGutterColumns) return; // Otherwise remove the front as well @@ -1391,6 +1420,11 @@ void TextDiagnostic::emitSnippetAndCaret( OS.indent(MaxLineNoDisplayWidth + 2) << "| "; }; + Columns MessageLength = DiagOpts.MessageLength; + // If we don't have enough columns available, just abort now. + if (MessageLength != 0 && MessageLength <= Columns(MaxLineNoDisplayWidth + 4)) + return; + // Prepare source highlighting information for the lines we're about to // emit, starting from the first line. std::unique_ptr<SmallVector<StyleRange>[]> SourceStyles = @@ -1450,10 +1484,14 @@ void TextDiagnostic::emitSnippetAndCaret( // If the source line is too long for our terminal, select only the // "interesting" source region within that line. 
- Columns MessageLength = DiagOpts.MessageLength; - if (MessageLength.V != 0) + if (MessageLength != 0) { + Columns NonGutterColumns = MessageLength; + if (MaxLineNoDisplayWidth != 0) + NonGutterColumns -= Columns(MaxLineNoDisplayWidth + 4); selectInterestingSourceRegion(SourceLine, CaretLine, FixItInsertionLine, - MessageLength, SourceColMap); + NonGutterColumns, SourceColMap, + SourceStyles[LineNo - Lines.first]); + } // If we are in -fdiagnostics-print-source-range-info mode, we are trying // to produce easily machine parsable output. Add a space before the @@ -1508,7 +1546,7 @@ void TextDiagnostic::emitSnippet(StringRef SourceLine, // Print the source line one character at a time. bool PrintReversed = false; std::optional<llvm::raw_ostream::Colors> CurrentColor; - size_t I = 0; + size_t I = 0; // Bytes. while (I < SourceLine.size()) { auto [Str, WasPrintable] = printableTextForNextCharacter(SourceLine, &I, DiagOpts.TabStop); diff --git a/clang/test/Frontend/diags-interesting-source-region-colors.cpp b/clang/test/Frontend/diags-interesting-source-region-colors.cpp new file mode 100644 index 0000000000000..80db0873b9e0a --- /dev/null +++ b/clang/test/Frontend/diags-interesting-source-region-colors.cpp @@ -0,0 +1,30 @@ +// RUN: not %clang_cc1 %s -fmessage-length=40 -fcolor-diagnostics -fno-show-source-location -Wunused-value -o - 2>&1 | FileCheck %s + +// REQUIRES: ansi-escape-sequences + +int main() { + 1 + + if; + // CHECK: expected expression + // CHECK-NEXT: ...+ [[MAGENTA:.\[0;34m]]if[[RESET:.\[0m]]; + + /*😂*/1 + + if; + // CHECK: expected expression + // CHECK-NEXT: ...+ [[MAGENTA:.\[0;34m]]if[[RESET:.\[0m]]; + + a + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1; + // CHECK: use of undeclared identifier + // CHECK-NEXT: a + [[GREEN:.\[0;32m]]1[[RESET]] + [[GREEN]]1[[RESET]] + [[GREEN]]1[[RESET]] + [[GREEN]]1[[RESET]] + [[GREEN]]1[[RESET]] ... + + + /*😂😂😂*/ a + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1; + // CHECK: use of undeclared identifier + // CHECK-NEXT: [[YELLOW:.\[0;33m]]/*😂😂😂*/[[RESET]] a + [[GREEN:.\[0;32m]]1[[RESET]] + [[GREEN]]1[[RESET]] ... 
+ + "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"; + // CHECK: [[GREEN:.\[0;32m]]"xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"[[RESET]]; + + "😂xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"; + // CHECK: [[GREEN:.\[0;32m]]"😂xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"[[RESET]]; +} + + From 28d3194d113fbbe0534fe81398045e2d56d40886 Mon Sep 17 00:00:00 2001 From: Nikolas Klauser <nikolasklauser@berlin.de> Date: Mon, 3 Nov 2025 16:47:12 +0100 Subject: [PATCH 023/313] [libc++] Merge basic_string::__{replace,reset}_internal_buffer (#165404) --- libcxx/include/string | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/libcxx/include/string b/libcxx/include/string index 8f80afbc2fd37..33382c7af4b2c 100644 --- a/libcxx/include/string +++ b/libcxx/include/string @@ -914,6 +914,10 @@ private: union __rep { __short __s; __long __l; + + __rep() = default; + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __rep(__short __r) : __s(__r) {} + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __rep(__long __r) : __l(__r) {} }; _LIBCPP_COMPRESSED_PAIR(__rep, __rep_, allocator_type, __alloc_); @@ -2259,18 +2263,12 @@ private: return __long(__buffer, __capacity); } - // Deallocate the long buffer if it exists and clear the short buffer so we are an empty string - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void __reset_internal_buffer() { + // Replace the current buffer with __new_rep. Deallocate the old long buffer if it exists. + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void __reset_internal_buffer(__rep __new_rep = __short()) { __annotate_delete(); if (__is_long()) __alloc_traits::deallocate(__alloc_, __get_long_pointer(), __get_long_cap()); - __rep_.__s = __short(); - } - - // Replace the current buffer with __alloc; the first __size elements constitute a string - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void __replace_internal_buffer(__long __alloc) { - __reset_internal_buffer(); - __rep_.__l = __alloc; + __rep_ = __new_rep; } // Initialize the internal buffer to hold __size elements @@ -2444,7 +2442,7 @@ private: __annotate_delete(); auto __guard = std::__make_scope_guard(__annotate_new_size(*this)); auto __alloc = __str.__alloc_; - __replace_internal_buffer(__allocate_long_buffer(__alloc, __str.size())); + __reset_internal_buffer(__allocate_long_buffer(__alloc, __str.size())); __alloc_ = std::move(__alloc); } } @@ -2710,7 +2708,7 @@ _LIBCPP_CONSTEXPR_SINCE_CXX20 void basic_string<_CharT, _Traits, _Allocator>::__ __sec_cp_sz); __buffer.__size_ = __n_copy + __n_add + __sec_cp_sz; traits_type::assign(__buffer.__data_[__buffer.__size_], value_type()); - __replace_internal_buffer(__buffer); + __reset_internal_buffer(__buffer); } // __grow_by is deprecated because it does not set the size. It may not update the size when the size is changed, and it @@ -2746,7 +2744,7 @@ _LIBCPP_DEPRECATED_("use __grow_by_without_replace") basic_string<_CharT, _Trait // This is -1 to make sure the caller sets the size properly, since old versions of this function didn't set the size // at all. 
__buffer.__size_ = -1; - __replace_internal_buffer(__buffer); + __reset_internal_buffer(__buffer); } template <class _CharT, class _Traits, class _Allocator> @@ -3394,7 +3392,7 @@ _LIBCPP_CONSTEXPR_SINCE_CXX20 void basic_string<_CharT, _Traits, _Allocator>::re __long __buffer = __allocate_long_buffer(__alloc_, __requested_capacity); __buffer.__size_ = size(); traits_type::copy(std::__to_address(__buffer.__data_), data(), __buffer.__size_ + 1); - __replace_internal_buffer(__buffer); + __reset_internal_buffer(__buffer); } template <class _CharT, class _Traits, class _Allocator> @@ -3433,7 +3431,7 @@ inline _LIBCPP_CONSTEXPR_SINCE_CXX20 void basic_string<_CharT, _Traits, _Allocat } traits_type::copy(std::__to_address(__buffer.__data_), std::__to_address(__get_long_pointer()), __size + 1); - __replace_internal_buffer(__buffer); + __reset_internal_buffer(__buffer); # if _LIBCPP_HAS_EXCEPTIONS } catch (...) { return; From 3bd59634d2d21328e32c06ecd6a7a1188a1b1993 Mon Sep 17 00:00:00 2001 From: William Moses <gh@wsmoses.com> Date: Mon, 3 Nov 2025 09:57:21 -0600 Subject: [PATCH 024/313] [MLIR][SCF] Speed up ConditionPropagation (#166080) Introduce a cache to avoid looking up then/else region nesting through `isAncestor` calls repeatedly. This gets expensive for large inputs with lots of pointer chasing. Fixes https://github.com/llvm/llvm-project/issues/166039 --- mlir/lib/Dialect/SCF/IR/SCF.cpp | 47 ++++++++++++++++++++++++++++++--- 1 file changed, 44 insertions(+), 3 deletions(-) diff --git a/mlir/lib/Dialect/SCF/IR/SCF.cpp b/mlir/lib/Dialect/SCF/IR/SCF.cpp index 2946b53c8cb36..881e256a8797b 100644 --- a/mlir/lib/Dialect/SCF/IR/SCF.cpp +++ b/mlir/lib/Dialect/SCF/IR/SCF.cpp @@ -2565,6 +2565,39 @@ struct ConvertTrivialIfToSelect : public OpRewritePattern<IfOp> { struct ConditionPropagation : public OpRewritePattern<IfOp> { using OpRewritePattern<IfOp>::OpRewritePattern; + /// Kind of parent region in the ancestor cache. + enum class Parent { Then, Else, None }; + + /// Returns the kind of region ("then", "else", or "none") of the + /// IfOp that the given region is transitively nested in. Updates + /// the cache accordingly. 
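+ /// For example, an op inside an scf.for that is itself nested in the IfOp's
+ /// then-region resolves to Parent::Then; every region visited on the walk up
+ /// is memoized in the cache so later queries stop at the first cached hit.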
+ static Parent getParentType(Region *toCheck, IfOp op, + DenseMap<Region *, Parent> &cache, + Region *endRegion) { + SmallVector<Region *> seen; + while (toCheck != endRegion) { + auto found = cache.find(toCheck); + if (found != cache.end()) + return found->second; + seen.push_back(toCheck); + if (&op.getThenRegion() == toCheck) { + for (Region *region : seen) + cache[region] = Parent::Then; + return Parent::Then; + } + if (&op.getElseRegion() == toCheck) { + for (Region *region : seen) + cache[region] = Parent::Else; + return Parent::Else; + } + toCheck = toCheck->getParentRegion(); + } + + for (Region *region : seen) + cache[region] = Parent::None; + return Parent::None; + } + LogicalResult matchAndRewrite(IfOp op, PatternRewriter &rewriter) const override { // Early exit if the condition is constant since replacing a constant @@ -2580,9 +2613,12 @@ struct ConditionPropagation : public OpRewritePattern<IfOp> { Value constantTrue = nullptr; Value constantFalse = nullptr; + DenseMap<Region *, Parent> cache; for (OpOperand &use : llvm::make_early_inc_range(op.getCondition().getUses())) { - if (op.getThenRegion().isAncestor(use.getOwner()->getParentRegion())) { + switch (getParentType(use.getOwner()->getParentRegion(), op, cache, + op.getCondition().getParentRegion())) { + case Parent::Then: { changed = true; if (!constantTrue) @@ -2591,8 +2627,9 @@ struct ConditionPropagation : public OpRewritePattern<IfOp> { rewriter.modifyOpInPlace(use.getOwner(), [&]() { use.set(constantTrue); }); - } else if (op.getElseRegion().isAncestor( - use.getOwner()->getParentRegion())) { + break; + } + case Parent::Else: { changed = true; if (!constantFalse) @@ -2601,6 +2638,10 @@ struct ConditionPropagation : public OpRewritePattern<IfOp> { rewriter.modifyOpInPlace(use.getOwner(), [&]() { use.set(constantFalse); }); + break; + } + case Parent::None: + break; } } From bf5332cd8266cddb3bfb2e8436a2161c10602855 Mon Sep 17 00:00:00 2001 From: Konrad Kleine <kkleine@redhat.com> Date: Mon, 3 Nov 2025 17:00:06 +0100 Subject: [PATCH 025/313] [flang][driver] Bring --gcc-triple to flang (#165886) When there are multiple gcc versions installed, we want `flang` to be able to find the right one based on the triple. 
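For example (source file name hypothetical), the desired installation can be
pinned explicitly on the command line:

```
flang --gcc-triple=x86_64-redhat-linux -c hello.f90
```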
Here's `flang` selecting an unwanted `gcc` candidate installation, namely `/usr/lib/gcc/x86_64-linux-gnu/15`: ``` ~/src/llvm-project/main/build-RelWithDebInfo > ./bin/flang -v flang version 22.0.0custombuild Target: x86_64-redhat-linux-gnu Thread model: posix InstalledDir: /home/fedora/src/llvm-project/main/build-RelWithDebInfo/bin System configuration file directory: /etc/clang/ Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/15 Found candidate GCC installation: /usr/lib/gcc/x86_64-redhat-linux/15 Selected GCC installation: /usr/lib/gcc/x86_64-linux-gnu/15 Candidate multilib: .;@m64 Candidate multilib: 32;@m32 Selected multilib: .;@m64 ``` When passing `--gcc-triple=x86_64-redhat-linux` we get the desired gcc candidate installation: ``` ~/src/llvm-project/main/build-RelWithDebInfo > ./bin/flang --gcc-triple=x86_64-redhat-linux -v flang version 22.0.0custombuild Target: x86_64-redhat-linux-gnu Thread model: posix InstalledDir: /home/fedora/src/llvm-project/main/build-RelWithDebInfo/bin System configuration file directory: /etc/clang/ Found candidate GCC installation: /usr/lib/gcc/x86_64-redhat-linux/15 Selected GCC installation: /usr/lib/gcc/x86_64-redhat-linux/15 Candidate multilib: .;@m64 Candidate multilib: 32;@m32 Selected multilib: .;@m64 ``` * Test: `LIT_FILTER="Flang :: Driver/gcc-triple.f90" ninja check-flang` * Copied `flang/test/Driver/Inputs/fedora_39_tree` from `clang/test/Driver/Inputs/fedora_39_tree`. * Testing what default triple is selected when two are possible. * Testing that we can select an existing triple. * Testing that triple is not selected if it doesn't exist. --- clang/include/clang/Driver/Options.td | 1 + .../usr/lib/gcc/x86_64-linux-gnu/13/crtbegin.o | 0 .../usr/lib/gcc/x86_64-linux-gnu/13/crtend.o | 0 .../usr/lib/gcc/x86_64-linux-gnu/13/crti.o | 0 .../usr/lib/gcc/x86_64-linux-gnu/13/crtn.o | 0 .../lib/gcc/x86_64-redhat-linux/13/crtbegin.o | 0 .../lib/gcc/x86_64-redhat-linux/13/crtend.o | 0 .../usr/lib/gcc/x86_64-redhat-linux/13/crti.o | 0 .../usr/lib/gcc/x86_64-redhat-linux/13/crtn.o | 0 flang/test/Driver/gcc-triple.f90 | 18 ++++++++++++++++++ 10 files changed, 19 insertions(+) create mode 100644 flang/test/Driver/Inputs/fedora_39_tree/usr/lib/gcc/x86_64-linux-gnu/13/crtbegin.o create mode 100644 flang/test/Driver/Inputs/fedora_39_tree/usr/lib/gcc/x86_64-linux-gnu/13/crtend.o create mode 100644 flang/test/Driver/Inputs/fedora_39_tree/usr/lib/gcc/x86_64-linux-gnu/13/crti.o create mode 100644 flang/test/Driver/Inputs/fedora_39_tree/usr/lib/gcc/x86_64-linux-gnu/13/crtn.o create mode 100644 flang/test/Driver/Inputs/fedora_39_tree/usr/lib/gcc/x86_64-redhat-linux/13/crtbegin.o create mode 100644 flang/test/Driver/Inputs/fedora_39_tree/usr/lib/gcc/x86_64-redhat-linux/13/crtend.o create mode 100644 flang/test/Driver/Inputs/fedora_39_tree/usr/lib/gcc/x86_64-redhat-linux/13/crti.o create mode 100644 flang/test/Driver/Inputs/fedora_39_tree/usr/lib/gcc/x86_64-redhat-linux/13/crtn.o create mode 100644 flang/test/Driver/gcc-triple.f90 diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index 20955ef1b852e..4778b87b789a9 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -741,6 +741,7 @@ def gcc_toolchain : Joined<["--"], "gcc-toolchain=">, Flags<[NoXarchOption]>, "Specify a directory where Flang can find 'lib{,32,64}/gcc{,-cross}/$triple/$version'. 
" "Flang will use the GCC installation with the largest version">; def gcc_triple_EQ : Joined<["--"], "gcc-triple=">, + Visibility<[ClangOption, FlangOption]>, HelpText<"Search for the GCC installation with the specified triple.">; def CC : Flag<["-"], "CC">, Visibility<[ClangOption, CC1Option]>, Group<Preprocessor_Group>, diff --git a/flang/test/Driver/Inputs/fedora_39_tree/usr/lib/gcc/x86_64-linux-gnu/13/crtbegin.o b/flang/test/Driver/Inputs/fedora_39_tree/usr/lib/gcc/x86_64-linux-gnu/13/crtbegin.o new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/flang/test/Driver/Inputs/fedora_39_tree/usr/lib/gcc/x86_64-linux-gnu/13/crtend.o b/flang/test/Driver/Inputs/fedora_39_tree/usr/lib/gcc/x86_64-linux-gnu/13/crtend.o new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/flang/test/Driver/Inputs/fedora_39_tree/usr/lib/gcc/x86_64-linux-gnu/13/crti.o b/flang/test/Driver/Inputs/fedora_39_tree/usr/lib/gcc/x86_64-linux-gnu/13/crti.o new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/flang/test/Driver/Inputs/fedora_39_tree/usr/lib/gcc/x86_64-linux-gnu/13/crtn.o b/flang/test/Driver/Inputs/fedora_39_tree/usr/lib/gcc/x86_64-linux-gnu/13/crtn.o new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/flang/test/Driver/Inputs/fedora_39_tree/usr/lib/gcc/x86_64-redhat-linux/13/crtbegin.o b/flang/test/Driver/Inputs/fedora_39_tree/usr/lib/gcc/x86_64-redhat-linux/13/crtbegin.o new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/flang/test/Driver/Inputs/fedora_39_tree/usr/lib/gcc/x86_64-redhat-linux/13/crtend.o b/flang/test/Driver/Inputs/fedora_39_tree/usr/lib/gcc/x86_64-redhat-linux/13/crtend.o new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/flang/test/Driver/Inputs/fedora_39_tree/usr/lib/gcc/x86_64-redhat-linux/13/crti.o b/flang/test/Driver/Inputs/fedora_39_tree/usr/lib/gcc/x86_64-redhat-linux/13/crti.o new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/flang/test/Driver/Inputs/fedora_39_tree/usr/lib/gcc/x86_64-redhat-linux/13/crtn.o b/flang/test/Driver/Inputs/fedora_39_tree/usr/lib/gcc/x86_64-redhat-linux/13/crtn.o new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/flang/test/Driver/gcc-triple.f90 b/flang/test/Driver/gcc-triple.f90 new file mode 100644 index 0000000000000..027d78a7c5046 --- /dev/null +++ b/flang/test/Driver/gcc-triple.f90 @@ -0,0 +1,18 @@ +!! UNSUPPORTED: system-windows + +!! Test that --gcc-triple option is working as expected. + +! RUN: %flang -v --sysroot=%S/Inputs/fedora_39_tree 2>&1 | FileCheck %s --dump-input=always --check-prefix=DEFAULT_TRIPLE +! DEFAULT_TRIPLE: {{^}}Found candidate GCC installation: +! DEFAULT_TRIPLE: fedora_39_tree/usr/lib/gcc/x86_64-linux-gnu/13 +! DEFAULT_TRIPLE: {{^}}Found candidate GCC installation: +! DEFAULT_TRIPLE: fedora_39_tree/usr/lib/gcc/x86_64-redhat-linux/13 +! DEFAULT_TRIPLE: {{^}}Selected GCC installation: +! DEFAULT_TRIPLE: fedora_39_tree/usr/lib/gcc/x86_64-linux-gnu/13 + +! RUN: %flang -v --sysroot=%S/Inputs/fedora_39_tree --gcc-triple=x86_64-redhat-linux 2>&1 | FileCheck %s --check-prefix=TRIPLE_EXISTS +! TRIPLE_EXISTS: {{^}}Selected GCC installation: +! TRIPLE_EXISTS: fedora_39_tree/usr/lib/gcc/x86_64-redhat-linux/13 + +! RUN: %flang -v --sysroot=%S/Inputs/fedora_39_tree --gcc-triple=x86_64-foo-linux 2>&1 | FileCheck %s --check-prefix=TRIPLE_DOES_NOT_EXISTS +! 
TRIPLE_DOES_NOT_EXISTS-NOT: x86_64-foo-linux
\ No newline at end of file

From 7d5659083cb21722416b38fe92b7200fe89be232 Mon Sep 17 00:00:00 2001
From: Alexey Bataev <a.bataev@outlook.com>
Date: Mon, 3 Nov 2025 06:25:20 -0800
Subject: [PATCH 026/313] [SLP]Do not create copyable node, if parent node is
 non-schedulable and has a use in binop.

If the parent node is non-schedulable (only externally used
instructions), and at least one instruction has multiple uses and is
used in the binop, such a copyable node should not be created.
Otherwise, it may contain a wrong def-use chain model, which cannot be
effectively detected.

Fixes #166035
---
 .../Transforms/Vectorize/SLPVectorizer.cpp    | 21 ++++++++++
 .../parent-non-schedule-multi-use-in-binop.ll | 40 +++++++++++++++++++
 2 files changed, 61 insertions(+)
 create mode 100644 llvm/test/Transforms/SLPVectorizer/X86/parent-non-schedule-multi-use-in-binop.ll

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 34b405ced8c0a..bf3f52c51b64c 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -20975,6 +20975,27 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
 if (isa<PHINode>(S.getMainOp()) ||
 isVectorLikeInstWithConstOps(S.getMainOp()))
 return nullptr;
+ // If the parent node is non-schedulable and the current node is copyable,
+ // and any of the parent's instructions are used outside several basic blocks
+ // or in a bin-op node, cancel scheduling: it may cause wrong def-use deps in
+ // the analysis, leading to a crash.
+ // Non-scheduled nodes may not have a related ScheduleData model, which may
+ // lead to a skipped dep analysis.
+ if (S.areInstructionsWithCopyableElements() && EI && EI.UserTE->hasState() &&
+ EI.UserTE->doesNotNeedToSchedule() &&
+ EI.UserTE->getOpcode() != Instruction::PHI &&
+ any_of(EI.UserTE->Scalars, [](Value *V) {
+ auto *I = dyn_cast<Instruction>(V);
+ if (!I || I->hasOneUser())
+ return false;
+ for (User *U : I->users()) {
+ auto *UI = cast<Instruction>(U);
+ if (isa<BinaryOperator>(UI))
+ return true;
+ }
+ return false;
+ }))
+ return std::nullopt;
 
 bool HasCopyables = S.areInstructionsWithCopyableElements();
 if (((!HasCopyables && doesNotNeedToSchedule(VL)) ||
 all_of(VL, [&](Value *V) { return S.isNonSchedulable(V); }))) {
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/parent-non-schedule-multi-use-in-binop.ll b/llvm/test/Transforms/SLPVectorizer/X86/parent-non-schedule-multi-use-in-binop.ll
new file mode 100644
index 0000000000000..590b0be973002
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/parent-non-schedule-multi-use-in-binop.ll
@@ -0,0 +1,40 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -passes=slp-vectorizer -S --mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s
+
+@a = common global [100 x i64] zeroinitializer, align 64
+
+define void @test() {
+; CHECK-LABEL: define void @test() {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr getelementptr inbounds nuw (i8, ptr @a, i64 48), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = add <2 x i64> [[TMP0]], splat (i64 1)
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i64> [[TMP0]], <2 x i64> [[TMP1]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT: [[TMP3:%.*]] = add <2 x i64> [[TMP2]], splat (i64 1)
+; CHECK-NEXT: br i1 false, label %[[LOP_RHSCNT_I_PEEL:.*]], label %[[LAND_END_I_PEEL:.*]]
+; CHECK: [[LOP_RHSCNT_I_PEEL]]:
+; CHECK-NEXT: 
[[TMP4:%.*]] = or <2 x i64> [[TMP1]], <i64 1, i64 0> +; CHECK-NEXT: br label %[[LAND_END_I_PEEL]] +; CHECK: [[LAND_END_I_PEEL]]: +; CHECK-NEXT: [[TMP5:%.*]] = phi <2 x i64> [ [[TMP3]], %[[ENTRY]] ], [ [[TMP4]], %[[LOP_RHSCNT_I_PEEL]] ] +; CHECK-NEXT: store <2 x i64> [[TMP5]], ptr getelementptr inbounds nuw (i8, ptr @a, i64 48), align 8 +; CHECK-NEXT: ret void +; +entry: + %.promoted104.i = load i64, ptr getelementptr inbounds nuw (i8, ptr @a, i64 56), align 8 + %.promoted103.i = load i64, ptr getelementptr inbounds nuw (i8, ptr @a, i64 48), align 8 + %0 = add i64 %.promoted104.i, 1 + %1 = add i64 %.promoted103.i, 1 + %2 = add i64 %0, 1 + br i1 false, label %lop.rhscnt.i.peel, label %land.end.i.peel + +lop.rhscnt.i.peel: + %3 = or i64 %1, 1 + br label %land.end.i.peel + +land.end.i.peel: + %4 = phi i64 [ %2, %entry ], [ %0, %lop.rhscnt.i.peel ] + %5 = phi i64 [ %1, %entry ], [ %3, %lop.rhscnt.i.peel ] + store i64 %5, ptr getelementptr inbounds nuw (i8, ptr @a, i64 48), align 8 + store i64 %4, ptr getelementptr inbounds nuw (i8, ptr @a, i64 56), align 8 + ret void +} From 5b7a5f7c5f3c1dcd8ea9debf56983b85e393bc86 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim <llvm-dev@redking.me.uk> Date: Mon, 3 Nov 2025 16:03:54 +0000 Subject: [PATCH 027/313] [X86] Narrow BT/BTC/BTR/BTS compare + RMW patterns on very large integers (REAPPLIED) (#166176) This patch allows us to narrow single bit-test/twiddle operations for larger than legal scalar integers to efficiently operate just on the i32 sub-integer block actually affected. The BITOP(X,SHL(1,IDX)) patterns are split, with the IDX used to access the specific i32 block as well as specific bit within that block. BT comparisons are relatively simple, and builds on the truncated shifted loads fold from #165266. BTC/BTR/BTS bit twiddling patterns need to match the entire RMW pattern to safely confirm only one block is affected, but a similar approach is taken and creates codegen that should allow us to further merge with matching BT opcodes in a future patch (see #165291). The resulting codegen is notably more efficient than the heavily micro-coded memory folded variants of BT/BTC/BTR/BTS. There is still some work to improve the bit insert 'init' patterns included in bittest-big-integer.ll but I'm expecting this to be a straightforward future extension. REAPPLIED from #165540 which was reverted due to a sanitizer regression that should have been fixed by #166160 Fixes #164225 --- llvm/lib/Target/X86/X86ISelLowering.cpp | 114 +- llvm/test/CodeGen/X86/bittest-big-integer.ll | 7521 +++--------------- 2 files changed, 1208 insertions(+), 6427 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index e5b2743f602da..2970cf42df731 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -53347,6 +53347,80 @@ static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG, return SDValue(); } +// Look for a RMW operation that only touches one bit of a larger than legal +// type and fold it to a BTC/BTR/BTS pattern acting on a single i32 sub value. +static SDValue narrowBitOpRMW(StoreSDNode *St, const SDLoc &DL, + SelectionDAG &DAG, + const X86Subtarget &Subtarget) { + using namespace SDPatternMatch; + + // Only handle normal stores and its chain was a matching normal load. 
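+  // (That is, the load must feed the store's chain directly and use the same
+  // address, so no other memory operation can touch the wide value between
+  // the load and the store we are about to narrow.)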
+ auto *Ld = dyn_cast<LoadSDNode>(St->getChain()); + if (!ISD::isNormalStore(St) || !St->isSimple() || !Ld || + !ISD::isNormalLoad(Ld) || !Ld->isSimple() || + Ld->getBasePtr() != St->getBasePtr() || + Ld->getOffset() != St->getOffset()) + return SDValue(); + + SDValue LoadVal(Ld, 0); + SDValue StoredVal = St->getValue(); + EVT VT = StoredVal.getValueType(); + + // Only narrow larger than legal scalar integers. + if (!VT.isScalarInteger() || + VT.getSizeInBits() <= (Subtarget.is64Bit() ? 64 : 32)) + return SDValue(); + + // BTR: X & ~(1 << ShAmt) + // BTS: X | (1 << ShAmt) + // BTC: X ^ (1 << ShAmt) + SDValue ShAmt; + if (!StoredVal.hasOneUse() || + !(sd_match(StoredVal, m_And(m_Specific(LoadVal), + m_Not(m_Shl(m_One(), m_Value(ShAmt))))) || + sd_match(StoredVal, + m_Or(m_Specific(LoadVal), m_Shl(m_One(), m_Value(ShAmt)))) || + sd_match(StoredVal, + m_Xor(m_Specific(LoadVal), m_Shl(m_One(), m_Value(ShAmt)))))) + return SDValue(); + + // Ensure the shift amount is in bounds. + KnownBits KnownAmt = DAG.computeKnownBits(ShAmt); + if (KnownAmt.getMaxValue().uge(VT.getSizeInBits())) + return SDValue(); + + // Split the shift into an alignment shift that moves the active i32 block to + // the bottom bits for truncation and a modulo shift that can act on the i32. + EVT AmtVT = ShAmt.getValueType(); + SDValue AlignAmt = DAG.getNode(ISD::AND, DL, AmtVT, ShAmt, + DAG.getSignedConstant(-32LL, DL, AmtVT)); + SDValue ModuloAmt = + DAG.getNode(ISD::AND, DL, AmtVT, ShAmt, DAG.getConstant(31, DL, AmtVT)); + + // Compute the byte offset for the i32 block that is changed by the RMW. + // combineTruncate will adjust the load for us in a similar way. + EVT PtrVT = St->getBasePtr().getValueType(); + SDValue PtrBitOfs = DAG.getZExtOrTrunc(AlignAmt, DL, PtrVT); + SDValue PtrByteOfs = DAG.getNode(ISD::SRL, DL, PtrVT, PtrBitOfs, + DAG.getShiftAmountConstant(3, PtrVT, DL)); + SDValue NewPtr = DAG.getMemBasePlusOffset(St->getBasePtr(), PtrByteOfs, DL, + SDNodeFlags::NoUnsignedWrap); + + // Reconstruct the BTC/BTR/BTS pattern for the i32 block and store. + SDValue X = DAG.getNode(ISD::SRL, DL, VT, LoadVal, AlignAmt); + X = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, X); + + SDValue Mask = + DAG.getNode(ISD::SHL, DL, MVT::i32, DAG.getConstant(1, DL, MVT::i32), + DAG.getZExtOrTrunc(ModuloAmt, DL, MVT::i8)); + if (StoredVal.getOpcode() == ISD::AND) + Mask = DAG.getNOT(DL, Mask, MVT::i32); + + SDValue Res = DAG.getNode(StoredVal.getOpcode(), DL, MVT::i32, X, Mask); + return DAG.getStore(St->getChain(), DL, Res, NewPtr, St->getPointerInfo(), + Align(), St->getMemOperand()->getFlags()); +} + static SDValue combineStore(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { @@ -53573,6 +53647,9 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG, } } + if (SDValue R = narrowBitOpRMW(St, dl, DAG, Subtarget)) + return R; + // Convert store(cmov(load(p), x, CC), p) to cstore(x, p, CC) // store(cmov(x, load(p), CC), p) to cstore(x, p, InvertCC) if ((VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) && @@ -54505,8 +54582,9 @@ static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG, // truncation, see if we can convert the shift into a pointer offset instead. // Limit this to normal (non-ext) scalar integer loads. 
if (SrcVT.isScalarInteger() && Src.getOpcode() == ISD::SRL && - Src.hasOneUse() && Src.getOperand(0).hasOneUse() && - ISD::isNormalLoad(Src.getOperand(0).getNode())) { + Src.hasOneUse() && ISD::isNormalLoad(Src.getOperand(0).getNode()) && + (Src.getOperand(0).hasOneUse() || + !DAG.getTargetLoweringInfo().isOperationLegal(ISD::LOAD, SrcVT))) { auto *Ld = cast<LoadSDNode>(Src.getOperand(0)); if (Ld->isSimple() && VT.isByteSized() && isPowerOf2_64(VT.getSizeInBits())) { @@ -56305,6 +56383,7 @@ static SDValue combineAVX512SetCCToKMOV(EVT VT, SDValue Op0, ISD::CondCode CC, static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { + using namespace SDPatternMatch; const ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get(); const SDValue LHS = N->getOperand(0); const SDValue RHS = N->getOperand(1); @@ -56363,6 +56442,37 @@ static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG, if (SDValue AndN = MatchAndCmpEq(RHS, LHS)) return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC); + // If we're performing a bit test on a larger than legal type, attempt + // to (aligned) shift down the value to the bottom 32-bits and then + // perform the bittest on the i32 value. + // ICMP_ZERO(AND(X,SHL(1,IDX))) + // --> ICMP_ZERO(AND(TRUNC(SRL(X,AND(IDX,-32))),SHL(1,AND(IDX,31)))) + if (isNullConstant(RHS) && + OpVT.getScalarSizeInBits() > (Subtarget.is64Bit() ? 64 : 32)) { + SDValue X, ShAmt; + if (sd_match(LHS, m_OneUse(m_And(m_Value(X), + m_Shl(m_One(), m_Value(ShAmt)))))) { + // Only attempt this if the shift amount is known to be in bounds. + KnownBits KnownAmt = DAG.computeKnownBits(ShAmt); + if (KnownAmt.getMaxValue().ult(OpVT.getScalarSizeInBits())) { + EVT AmtVT = ShAmt.getValueType(); + SDValue AlignAmt = + DAG.getNode(ISD::AND, DL, AmtVT, ShAmt, + DAG.getSignedConstant(-32LL, DL, AmtVT)); + SDValue ModuloAmt = DAG.getNode(ISD::AND, DL, AmtVT, ShAmt, + DAG.getConstant(31, DL, AmtVT)); + SDValue Mask = DAG.getNode( + ISD::SHL, DL, MVT::i32, DAG.getConstant(1, DL, MVT::i32), + DAG.getZExtOrTrunc(ModuloAmt, DL, MVT::i8)); + X = DAG.getNode(ISD::SRL, DL, OpVT, X, AlignAmt); + X = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, X); + X = DAG.getNode(ISD::AND, DL, MVT::i32, X, Mask); + return DAG.getSetCC(DL, VT, X, DAG.getConstant(0, DL, MVT::i32), + CC); + } + } + } + // cmpeq(trunc(x),C) --> cmpeq(x,C) // cmpne(trunc(x),C) --> cmpne(x,C) // iff x upper bits are zero. 
diff --git a/llvm/test/CodeGen/X86/bittest-big-integer.ll b/llvm/test/CodeGen/X86/bittest-big-integer.ll index 8007d9dcf13bc..c311ab869c311 100644 --- a/llvm/test/CodeGen/X86/bittest-big-integer.ll +++ b/llvm/test/CodeGen/X86/bittest-big-integer.ll @@ -203,24 +203,14 @@ define i1 @init_eq_i32(ptr %word, i32 %position, i1 zeroext %value) nounwind { define i1 @test_ne_i64(ptr %word, i32 %position) nounwind { ; X86-LABEL: test_ne_i64: ; X86: # %bb.0: -; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl $1, %edx -; X86-NEXT: xorl %esi, %esi -; X86-NEXT: shldl %cl, %edx, %esi -; X86-NEXT: shll %cl, %edx -; X86-NEXT: testb $32, %cl -; X86-NEXT: je .LBB5_2 -; X86-NEXT: # %bb.1: -; X86-NEXT: movl %edx, %esi -; X86-NEXT: xorl %edx, %edx -; X86-NEXT: .LBB5_2: -; X86-NEXT: andl 4(%eax), %esi -; X86-NEXT: andl (%eax), %edx -; X86-NEXT: orl %esi, %edx -; X86-NEXT: setne %al -; X86-NEXT: popl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, %edx +; X86-NEXT: andl $32, %edx +; X86-NEXT: shrl $3, %edx +; X86-NEXT: movl (%eax,%edx), %eax +; X86-NEXT: btl %ecx, %eax +; X86-NEXT: setb %al ; X86-NEXT: retl ; ; X64-LABEL: test_ne_i64: @@ -242,38 +232,20 @@ define i1 @test_ne_i64(ptr %word, i32 %position) nounwind { define i1 @complement_ne_i64(ptr %word, i32 %position) nounwind { ; X86-LABEL: complement_ne_i64: ; X86: # %bb.0: -; X86-NEXT: pushl %ebp -; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl $1, %eax -; X86-NEXT: xorl %esi, %esi -; X86-NEXT: shldl %cl, %eax, %esi -; X86-NEXT: shll %cl, %eax -; X86-NEXT: testb $32, %cl -; X86-NEXT: je .LBB6_2 -; X86-NEXT: # %bb.1: -; X86-NEXT: movl %eax, %esi -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: .LBB6_2: -; X86-NEXT: movl (%edx), %ecx -; X86-NEXT: movl 4(%edx), %edi -; X86-NEXT: movl %edi, %ebx -; X86-NEXT: andl %esi, %ebx -; X86-NEXT: movl %ecx, %ebp -; X86-NEXT: andl %eax, %ebp -; X86-NEXT: xorl %esi, %edi -; X86-NEXT: xorl %eax, %ecx -; X86-NEXT: orl %ebx, %ebp -; X86-NEXT: setne %al -; X86-NEXT: movl %ecx, (%edx) -; X86-NEXT: movl %edi, 4(%edx) +; X86-NEXT: movl %edx, %esi +; X86-NEXT: andl $32, %esi +; X86-NEXT: shrl $3, %esi +; X86-NEXT: movl (%ecx,%esi), %edi +; X86-NEXT: btl %edx, %edi +; X86-NEXT: setb %al +; X86-NEXT: btcl %edx, %edi +; X86-NEXT: movl %edi, (%ecx,%esi) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi -; X86-NEXT: popl %ebx -; X86-NEXT: popl %ebp ; X86-NEXT: retl ; ; X64-LABEL: complement_ne_i64: @@ -300,40 +272,20 @@ define i1 @complement_ne_i64(ptr %word, i32 %position) nounwind { define i1 @reset_eq_i64(ptr %word, i32 %position) nounwind { ; X86-LABEL: reset_eq_i64: ; X86: # %bb.0: -; X86-NEXT: pushl %ebp -; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl $1, %esi -; X86-NEXT: xorl %edi, %edi -; X86-NEXT: shldl %cl, %esi, %edi -; X86-NEXT: shll %cl, %esi -; X86-NEXT: testb $32, %cl -; X86-NEXT: je .LBB7_2 -; X86-NEXT: # %bb.1: -; X86-NEXT: movl %esi, %edi -; X86-NEXT: xorl %esi, %esi -; X86-NEXT: .LBB7_2: -; X86-NEXT: movl (%edx), %eax -; X86-NEXT: movl 4(%edx), %ecx -; X86-NEXT: movl %ecx, %ebx -; X86-NEXT: andl %edi, %ebx -; X86-NEXT: notl %edi -; X86-NEXT: movl %eax, %ebp -; X86-NEXT: andl %esi, %ebp -; X86-NEXT: notl %esi -; 
X86-NEXT: andl %ecx, %edi -; X86-NEXT: andl %eax, %esi -; X86-NEXT: orl %ebx, %ebp -; X86-NEXT: sete %al -; X86-NEXT: movl %esi, (%edx) -; X86-NEXT: movl %edi, 4(%edx) +; X86-NEXT: movl %edx, %esi +; X86-NEXT: andl $32, %esi +; X86-NEXT: shrl $3, %esi +; X86-NEXT: movl (%ecx,%esi), %edi +; X86-NEXT: btl %edx, %edi +; X86-NEXT: setae %al +; X86-NEXT: btrl %edx, %edi +; X86-NEXT: movl %edi, (%ecx,%esi) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi -; X86-NEXT: popl %ebx -; X86-NEXT: popl %ebp ; X86-NEXT: retl ; ; X64-LABEL: reset_eq_i64: @@ -361,38 +313,20 @@ define i1 @reset_eq_i64(ptr %word, i32 %position) nounwind { define i1 @set_ne_i64(ptr %word, i32 %position) nounwind { ; X86-LABEL: set_ne_i64: ; X86: # %bb.0: -; X86-NEXT: pushl %ebp -; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl $1, %eax -; X86-NEXT: xorl %esi, %esi -; X86-NEXT: shldl %cl, %eax, %esi -; X86-NEXT: shll %cl, %eax -; X86-NEXT: testb $32, %cl -; X86-NEXT: je .LBB8_2 -; X86-NEXT: # %bb.1: -; X86-NEXT: movl %eax, %esi -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: .LBB8_2: -; X86-NEXT: movl (%edx), %ecx -; X86-NEXT: movl 4(%edx), %edi -; X86-NEXT: movl %edi, %ebx -; X86-NEXT: andl %esi, %ebx -; X86-NEXT: movl %ecx, %ebp -; X86-NEXT: andl %eax, %ebp -; X86-NEXT: orl %esi, %edi -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: orl %ebx, %ebp -; X86-NEXT: setne %al -; X86-NEXT: movl %ecx, (%edx) -; X86-NEXT: movl %edi, 4(%edx) +; X86-NEXT: movl %edx, %esi +; X86-NEXT: andl $32, %esi +; X86-NEXT: shrl $3, %esi +; X86-NEXT: movl (%ecx,%esi), %edi +; X86-NEXT: btl %edx, %edi +; X86-NEXT: setb %al +; X86-NEXT: btsl %edx, %edi +; X86-NEXT: movl %edi, (%ecx,%esi) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi -; X86-NEXT: popl %ebx -; X86-NEXT: popl %ebp ; X86-NEXT: retl ; ; X64-LABEL: set_ne_i64: @@ -419,52 +353,47 @@ define i1 @set_ne_i64(ptr %word, i32 %position) nounwind { define i1 @init_eq_i64(ptr %word, i32 %position, i1 zeroext %value) nounwind { ; X86-LABEL: init_eq_i64: ; X86: # %bb.0: -; X86-NEXT: pushl %ebp ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl $1, %eax -; X86-NEXT: xorl %edx, %edx -; X86-NEXT: shldl %cl, %eax, %edx -; X86-NEXT: shll %cl, %eax -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl $1, %edx +; X86-NEXT: xorl %esi, %esi +; X86-NEXT: shldl %cl, %edx, %esi +; X86-NEXT: shll %cl, %edx +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-NEXT: xorl %edi, %edi -; X86-NEXT: shldl %cl, %esi, %edi -; X86-NEXT: shll %cl, %esi +; X86-NEXT: shldl %cl, %eax, %edi +; X86-NEXT: shll %cl, %eax ; X86-NEXT: testb $32, %cl ; X86-NEXT: je .LBB9_2 ; X86-NEXT: # %bb.1: -; X86-NEXT: movl %eax, %edx -; X86-NEXT: movl $0, %eax +; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl $0, %edx ; X86-NEXT: .LBB9_2: -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: notl %ebx -; X86-NEXT: movl %eax, %ebp -; X86-NEXT: notl %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: notl %esi +; X86-NEXT: notl %edx ; X86-NEXT: je .LBB9_4 ; X86-NEXT: # %bb.3: -; X86-NEXT: movl %esi, %edi -; X86-NEXT: xorl %esi, %esi +; X86-NEXT: movl %eax, %edi +; X86-NEXT: xorl %eax, %eax ; X86-NEXT: .LBB9_4: -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl 4(%ecx), %ecx -; X86-NEXT: andl %ecx, %edx -; X86-NEXT: andl %ecx, %ebx -; X86-NEXT: orl %edi, %ebx -; X86-NEXT: movl 
{{[0-9]+}}(%esp), %edi -; X86-NEXT: movl (%edi), %ecx -; X86-NEXT: andl %ecx, %eax -; X86-NEXT: andl %ecx, %ebp -; X86-NEXT: orl %esi, %ebp -; X86-NEXT: orl %edx, %eax -; X86-NEXT: movl %ebp, (%edi) -; X86-NEXT: movl %ebx, 4(%edi) -; X86-NEXT: sete %al +; X86-NEXT: andl 4(%ebx), %esi +; X86-NEXT: orl %edi, %esi +; X86-NEXT: andl (%ebx), %edx +; X86-NEXT: orl %eax, %edx +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: andl $32, %eax +; X86-NEXT: shrl $3, %eax +; X86-NEXT: movl (%ebx,%eax), %eax +; X86-NEXT: btl %ecx, %eax +; X86-NEXT: setae %al +; X86-NEXT: movl %edx, (%ebx) +; X86-NEXT: movl %esi, 4(%ebx) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx -; X86-NEXT: popl %ebp ; X86-NEXT: retl ; ; SSE-LABEL: init_eq_i64: @@ -516,112 +445,160 @@ define i1 @init_eq_i64(ptr %word, i32 %position, i1 zeroext %value) nounwind { define i1 @test_ne_i128(ptr %word, i32 %position) nounwind { ; X86-LABEL: test_ne_i128: ; X86: # %bb.0: -; X86-NEXT: pushl %ebp -; X86-NEXT: movl %esp, %ebp -; X86-NEXT: pushl %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, %edx +; X86-NEXT: andl $96, %edx +; X86-NEXT: shrl $3, %edx +; X86-NEXT: movl (%eax,%edx), %eax +; X86-NEXT: btl %ecx, %eax +; X86-NEXT: setb %al +; X86-NEXT: retl +; +; X64-LABEL: test_ne_i128: +; X64: # %bb.0: +; X64-NEXT: movl %esi, %eax +; X64-NEXT: andl $96, %eax +; X64-NEXT: shrl $3, %eax +; X64-NEXT: movl (%rdi,%rax), %eax +; X64-NEXT: btl %esi, %eax +; X64-NEXT: setb %al +; X64-NEXT: retq + %rem = and i32 %position, 127 + %ofs = zext nneg i32 %rem to i128 + %bit = shl nuw i128 1, %ofs + %ld = load i128, ptr %word + %test = and i128 %ld, %bit + %cmp = icmp ne i128 %test, 0 + ret i1 %cmp +} + +define i1 @complement_ne_i128(ptr %word, i32 %position) nounwind { +; X86-LABEL: complement_ne_i128: +; X86: # %bb.0: ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: andl $-16, %esp -; X86-NEXT: subl $48, %esp -; X86-NEXT: movzbl 12(%ebp), %ecx -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $1, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, (%esp) -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: shrb $3, %al -; X86-NEXT: andb $12, %al -; X86-NEXT: negb %al -; X86-NEXT: movsbl %al, %esi -; X86-NEXT: movl 24(%esp,%esi), %edi -; X86-NEXT: movl 28(%esp,%esi), %eax -; X86-NEXT: shldl %cl, %edi, %eax -; X86-NEXT: movl 16(%esp,%esi), %edx -; X86-NEXT: movl 20(%esp,%esi), %esi -; X86-NEXT: shldl %cl, %esi, %edi -; X86-NEXT: shldl %cl, %edx, %esi -; X86-NEXT: movl 8(%ebp), %ebx -; X86-NEXT: shll %cl, %edx -; X86-NEXT: andl 8(%ebx), %edi -; X86-NEXT: andl (%ebx), %edx -; X86-NEXT: orl %edi, %edx -; X86-NEXT: andl 12(%ebx), %eax -; X86-NEXT: andl 4(%ebx), %esi -; X86-NEXT: orl %eax, %esi -; X86-NEXT: orl %edx, %esi -; X86-NEXT: setne %al -; X86-NEXT: leal -12(%ebp), %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl %edx, %esi +; X86-NEXT: andl $96, %esi +; X86-NEXT: shrl $3, %esi +; X86-NEXT: movl (%ecx,%esi), %edi +; X86-NEXT: btl %edx, %edi +; X86-NEXT: setb %al +; X86-NEXT: btcl %edx, %edi +; X86-NEXT: movl %edi, (%ecx,%esi) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi -; X86-NEXT: popl %ebx -; X86-NEXT: popl %ebp ; X86-NEXT: retl ; -; SSE-LABEL: test_ne_i128: -; SSE: # %bb.0: -; SSE-NEXT: movl %esi, %ecx -; SSE-NEXT: movl $1, %eax -; SSE-NEXT: 
xorl %edx, %edx -; SSE-NEXT: shldq %cl, %rax, %rdx -; SSE-NEXT: xorl %esi, %esi -; SSE-NEXT: shlq %cl, %rax -; SSE-NEXT: testb $64, %cl -; SSE-NEXT: cmovneq %rax, %rdx -; SSE-NEXT: cmovneq %rsi, %rax -; SSE-NEXT: andq 8(%rdi), %rdx -; SSE-NEXT: andq (%rdi), %rax -; SSE-NEXT: orq %rdx, %rax -; SSE-NEXT: setne %al -; SSE-NEXT: retq +; X64-LABEL: complement_ne_i128: +; X64: # %bb.0: +; X64-NEXT: movl %esi, %ecx +; X64-NEXT: andl $96, %ecx +; X64-NEXT: shrl $3, %ecx +; X64-NEXT: movl (%rdi,%rcx), %edx +; X64-NEXT: btl %esi, %edx +; X64-NEXT: setb %al +; X64-NEXT: btcl %esi, %edx +; X64-NEXT: movl %edx, (%rdi,%rcx) +; X64-NEXT: retq + %rem = and i32 %position, 127 + %ofs = zext nneg i32 %rem to i128 + %bit = shl nuw i128 1, %ofs + %ld = load i128, ptr %word + %test = and i128 %ld, %bit + %res = xor i128 %ld, %bit + %cmp = icmp ne i128 %test, 0 + store i128 %res, ptr %word + ret i1 %cmp +} + +define i1 @reset_eq_i128(ptr %word, i32 %position) nounwind { +; X86-LABEL: reset_eq_i128: +; X86: # %bb.0: +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl %edx, %esi +; X86-NEXT: andl $96, %esi +; X86-NEXT: shrl $3, %esi +; X86-NEXT: movl (%ecx,%esi), %edi +; X86-NEXT: btl %edx, %edi +; X86-NEXT: setae %al +; X86-NEXT: btrl %edx, %edi +; X86-NEXT: movl %edi, (%ecx,%esi) +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: retl ; -; AVX2-LABEL: test_ne_i128: -; AVX2: # %bb.0: -; AVX2-NEXT: movl %esi, %ecx -; AVX2-NEXT: xorl %eax, %eax -; AVX2-NEXT: movl $1, %edx -; AVX2-NEXT: xorl %esi, %esi -; AVX2-NEXT: shldq %cl, %rdx, %rsi -; AVX2-NEXT: shlxq %rcx, %rdx, %rdx -; AVX2-NEXT: testb $64, %cl -; AVX2-NEXT: cmovneq %rdx, %rsi -; AVX2-NEXT: cmovneq %rax, %rdx -; AVX2-NEXT: andq 8(%rdi), %rsi -; AVX2-NEXT: andq (%rdi), %rdx -; AVX2-NEXT: orq %rsi, %rdx -; AVX2-NEXT: setne %al -; AVX2-NEXT: retq +; X64-LABEL: reset_eq_i128: +; X64: # %bb.0: +; X64-NEXT: movl %esi, %ecx +; X64-NEXT: andl $96, %ecx +; X64-NEXT: shrl $3, %ecx +; X64-NEXT: movl (%rdi,%rcx), %edx +; X64-NEXT: btl %esi, %edx +; X64-NEXT: setae %al +; X64-NEXT: btrl %esi, %edx +; X64-NEXT: movl %edx, (%rdi,%rcx) +; X64-NEXT: retq + %rem = and i32 %position, 127 + %ofs = zext nneg i32 %rem to i128 + %bit = shl nuw i128 1, %ofs + %mask = xor i128 %bit, -1 + %ld = load i128, ptr %word + %test = and i128 %ld, %bit + %res = and i128 %ld, %mask + %cmp = icmp eq i128 %test, 0 + store i128 %res, ptr %word + ret i1 %cmp +} + +define i1 @set_ne_i128(ptr %word, i32 %position) nounwind { +; X86-LABEL: set_ne_i128: +; X86: # %bb.0: +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl %edx, %esi +; X86-NEXT: andl $96, %esi +; X86-NEXT: shrl $3, %esi +; X86-NEXT: movl (%ecx,%esi), %edi +; X86-NEXT: btl %edx, %edi +; X86-NEXT: setb %al +; X86-NEXT: btsl %edx, %edi +; X86-NEXT: movl %edi, (%ecx,%esi) +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: retl ; -; AVX512-LABEL: test_ne_i128: -; AVX512: # %bb.0: -; AVX512-NEXT: movl %esi, %ecx -; AVX512-NEXT: movl $1, %eax -; AVX512-NEXT: xorl %edx, %edx -; AVX512-NEXT: shldq %cl, %rax, %rdx -; AVX512-NEXT: xorl %esi, %esi -; AVX512-NEXT: shlxq %rcx, %rax, %rax -; AVX512-NEXT: testb $64, %cl -; AVX512-NEXT: cmovneq %rax, %rdx -; AVX512-NEXT: cmovneq %rsi, %rax -; AVX512-NEXT: andq 8(%rdi), %rdx -; AVX512-NEXT: andq (%rdi), %rax -; AVX512-NEXT: orq %rdx, %rax -; AVX512-NEXT: setne %al -; AVX512-NEXT: retq +; X64-LABEL: 
set_ne_i128: +; X64: # %bb.0: +; X64-NEXT: movl %esi, %ecx +; X64-NEXT: andl $96, %ecx +; X64-NEXT: shrl $3, %ecx +; X64-NEXT: movl (%rdi,%rcx), %edx +; X64-NEXT: btl %esi, %edx +; X64-NEXT: setb %al +; X64-NEXT: btsl %esi, %edx +; X64-NEXT: movl %edx, (%rdi,%rcx) +; X64-NEXT: retq %rem = and i32 %position, 127 %ofs = zext nneg i32 %rem to i128 %bit = shl nuw i128 1, %ofs %ld = load i128, ptr %word %test = and i128 %ld, %bit + %res = or i128 %ld, %bit %cmp = icmp ne i128 %test, 0 + store i128 %res, ptr %word ret i1 %cmp } -define i1 @complement_ne_i128(ptr %word, i32 %position) nounwind { -; X86-LABEL: complement_ne_i128: +define i1 @init_eq_i128(ptr %word, i32 %position, i1 zeroext %value) nounwind { +; X86-LABEL: init_eq_i128: ; X86: # %bb.0: ; X86-NEXT: pushl %ebp ; X86-NEXT: movl %esp, %ebp @@ -629,8 +606,9 @@ define i1 @complement_ne_i128(ptr %word, i32 %position) nounwind { ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: andl $-16, %esp -; X86-NEXT: subl $80, %esp -; X86-NEXT: movzbl 12(%ebp), %ecx +; X86-NEXT: subl $96, %esp +; X86-NEXT: movl 12(%ebp), %ecx +; X86-NEXT: movzbl 16(%ebp), %ebx ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) @@ -644,179 +622,85 @@ define i1 @complement_ne_i128(ptr %word, i32 %position) nounwind { ; X86-NEXT: andb $12, %al ; X86-NEXT: negb %al ; X86-NEXT: movsbl %al, %eax -; X86-NEXT: movl 56(%esp,%eax), %esi -; X86-NEXT: movl 60(%esp,%eax), %edx -; X86-NEXT: shldl %cl, %esi, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 48(%esp,%eax), %edi -; X86-NEXT: movl 52(%esp,%eax), %ebx -; X86-NEXT: shldl %cl, %ebx, %esi +; X86-NEXT: movl 64(%esp,%eax), %edx +; X86-NEXT: movl 68(%esp,%eax), %esi ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edi, %ebx -; X86-NEXT: movl 8(%ebp), %eax -; X86-NEXT: shll %cl, %edi -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: movl 8(%eax), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %esi, %eax -; X86-NEXT: movl (%ecx), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %ecx, %esi -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: andl %edi, %ecx -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: movl 12(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl 4(%esi), %eax +; X86-NEXT: movl %eax, %esi +; X86-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-NEXT: movzbl %bl, %eax +; X86-NEXT: movl 72(%esp,%esi), %ebx +; X86-NEXT: movl 76(%esp,%esi), %esi +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ebx, %edi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, %edi +; X86-NEXT: shldl %cl, %ebx, %esi +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: shll %cl, %ebx +; X86-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NEXT: shldl %cl, %edx, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %ebx, %eax -; X86-NEXT: orl %edx, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded 
Reload -; X86-NEXT: orl %ecx, %eax -; X86-NEXT: movl 8(%ebp), %eax -; X86-NEXT: movl %edx, 8(%eax) -; X86-NEXT: movl %esi, 12(%eax) -; X86-NEXT: movl %edi, (%eax) -; X86-NEXT: movl %ebx, 4(%eax) -; X86-NEXT: setne %al -; X86-NEXT: leal -12(%ebp), %esp -; X86-NEXT: popl %esi -; X86-NEXT: popl %edi -; X86-NEXT: popl %ebx -; X86-NEXT: popl %ebp -; X86-NEXT: retl -; -; SSE-LABEL: complement_ne_i128: -; SSE: # %bb.0: -; SSE-NEXT: movl %esi, %ecx -; SSE-NEXT: movl $1, %edx -; SSE-NEXT: xorl %esi, %esi -; SSE-NEXT: shldq %cl, %rdx, %rsi -; SSE-NEXT: shlq %cl, %rdx -; SSE-NEXT: xorl %eax, %eax -; SSE-NEXT: testb $64, %cl -; SSE-NEXT: cmovneq %rdx, %rsi -; SSE-NEXT: cmovneq %rax, %rdx -; SSE-NEXT: movq (%rdi), %rax -; SSE-NEXT: movq 8(%rdi), %rcx -; SSE-NEXT: movq %rcx, %r8 -; SSE-NEXT: andq %rsi, %r8 -; SSE-NEXT: movq %rax, %r9 -; SSE-NEXT: andq %rdx, %r9 -; SSE-NEXT: xorq %rcx, %rsi -; SSE-NEXT: xorq %rax, %rdx -; SSE-NEXT: orq %r8, %r9 -; SSE-NEXT: setne %al -; SSE-NEXT: movq %rdx, (%rdi) -; SSE-NEXT: movq %rsi, 8(%rdi) -; SSE-NEXT: retq -; -; AVX-LABEL: complement_ne_i128: -; AVX: # %bb.0: -; AVX-NEXT: movl %esi, %ecx -; AVX-NEXT: xorl %eax, %eax -; AVX-NEXT: movl $1, %edx -; AVX-NEXT: xorl %esi, %esi -; AVX-NEXT: shldq %cl, %rdx, %rsi -; AVX-NEXT: shlxq %rcx, %rdx, %rdx -; AVX-NEXT: testb $64, %cl -; AVX-NEXT: cmovneq %rdx, %rsi -; AVX-NEXT: cmovneq %rax, %rdx -; AVX-NEXT: movq (%rdi), %rax -; AVX-NEXT: movq 8(%rdi), %rcx -; AVX-NEXT: movq %rcx, %r8 -; AVX-NEXT: andq %rsi, %r8 -; AVX-NEXT: movq %rax, %r9 -; AVX-NEXT: andq %rdx, %r9 -; AVX-NEXT: xorq %rcx, %rsi -; AVX-NEXT: xorq %rax, %rdx -; AVX-NEXT: orq %r8, %r9 -; AVX-NEXT: setne %al -; AVX-NEXT: movq %rdx, (%rdi) -; AVX-NEXT: movq %rsi, 8(%rdi) -; AVX-NEXT: retq - %rem = and i32 %position, 127 - %ofs = zext nneg i32 %rem to i128 - %bit = shl nuw i128 1, %ofs - %ld = load i128, ptr %word - %test = and i128 %ld, %bit - %res = xor i128 %ld, %bit - %cmp = icmp ne i128 %test, 0 - store i128 %res, ptr %word - ret i1 %cmp -} - -define i1 @reset_eq_i128(ptr %word, i32 %position) nounwind { -; X86-LABEL: reset_eq_i128: -; X86: # %bb.0: -; X86-NEXT: pushl %ebp -; X86-NEXT: movl %esp, %ebp -; X86-NEXT: pushl %ebx -; X86-NEXT: pushl %edi -; X86-NEXT: pushl %esi -; X86-NEXT: andl $-16, %esp -; X86-NEXT: subl $80, %esp -; X86-NEXT: movzbl 12(%ebp), %ecx +; X86-NEXT: notl %edi ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $1, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: shrb $3, %al -; X86-NEXT: andb $12, %al -; X86-NEXT: negb %al -; X86-NEXT: movsbl %al, %eax -; X86-NEXT: movl 56(%esp,%eax), %edx -; X86-NEXT: movl 60(%esp,%eax), %esi -; X86-NEXT: shldl %cl, %edx, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 48(%esp,%eax), %esi -; X86-NEXT: movl 52(%esp,%eax), %edi -; X86-NEXT: shldl %cl, %edi, %edx -; X86-NEXT: shldl %cl, %esi, %edi -; X86-NEXT: movl 8(%ebp), %ebx -; X86-NEXT: shll %cl, %esi -; X86-NEXT: movl 8(%ebx), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %edx, %eax -; X86-NEXT: movl (%ebx), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %esi, %ecx -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 12(%ebx), %eax +; 
X86-NEXT: movl (%esp), %ecx # 4-byte Reload +; X86-NEXT: movl 36(%esp,%ecx), %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: movl %edi, %ecx -; X86-NEXT: movl 4(%ebx), %ebx -; X86-NEXT: andl %ebx, %edi -; X86-NEXT: orl %eax, %edi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: notl %eax -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: notl %edx -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: notl %ecx -; X86-NEXT: andl %ebx, %ecx +; X86-NEXT: movl 40(%esp,%ecx), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 12(%ebp), %ecx +; X86-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NEXT: shldl %cl, %eax, %edx +; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: andl 8(%eax), %edi +; X86-NEXT: orl %edx, %edi ; X86-NEXT: notl %esi -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-NEXT: movl 8(%ebp), %edi -; X86-NEXT: movl %edx, 8(%edi) -; X86-NEXT: movl %eax, 12(%edi) -; X86-NEXT: movl %esi, (%edi) -; X86-NEXT: movl %ecx, 4(%edi) -; X86-NEXT: sete %al +; X86-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-NEXT: movl 44(%esp,%eax), %eax +; X86-NEXT: movl 12(%ebp), %ecx +; X86-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: shldl %cl, %edx, %eax +; X86-NEXT: movl 8(%ebp), %ecx +; X86-NEXT: andl 12(%ecx), %esi +; X86-NEXT: orl %eax, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: notl %ebx +; X86-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-NEXT: movl 32(%esp,%eax), %edx +; X86-NEXT: movl %edx, (%esp) # 4-byte Spill +; X86-NEXT: movl 12(%ebp), %ecx +; X86-NEXT: shll %cl, %edx +; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: andl (%eax), %ebx +; X86-NEXT: orl %edx, %ebx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: notl %edx +; X86-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl (%esp), %esi # 4-byte Reload +; X86-NEXT: shldl %cl, %esi, %eax +; X86-NEXT: movl 8(%ebp), %ecx +; X86-NEXT: andl 4(%ecx), %edx +; X86-NEXT: orl %eax, %edx +; X86-NEXT: movl 12(%ebp), %esi +; X86-NEXT: movl %esi, %eax +; X86-NEXT: andl $96, %eax +; X86-NEXT: shrl $3, %eax +; X86-NEXT: movl (%ecx,%eax), %eax +; X86-NEXT: btl %esi, %eax +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movl %edi, 8(%ecx) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movl %ecx, 12(%eax) +; X86-NEXT: movl %ebx, (%eax) +; X86-NEXT: movl %edx, 4(%eax) +; X86-NEXT: setae %al ; X86-NEXT: leal -12(%ebp), %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi @@ -824,433 +708,274 @@ define i1 @reset_eq_i128(ptr %word, i32 %position) nounwind { ; X86-NEXT: popl %ebp ; X86-NEXT: retl ; -; SSE-LABEL: reset_eq_i128: +; SSE-LABEL: init_eq_i128: ; SSE: # %bb.0: ; SSE-NEXT: movl %esi, %ecx -; SSE-NEXT: movl $1, %edx -; SSE-NEXT: xorl %esi, %esi -; SSE-NEXT: shldq %cl, %rdx, %rsi -; SSE-NEXT: xorl %eax, %eax -; SSE-NEXT: shlq %cl, %rdx +; SSE-NEXT: movl $1, %esi +; SSE-NEXT: xorl %r8d, %r8d +; SSE-NEXT: shldq %cl, %rsi, %r8 +; SSE-NEXT: shlq %cl, %rsi +; SSE-NEXT: movl %edx, %eax +; SSE-NEXT: xorl %edx, %edx +; SSE-NEXT: shldq %cl, %rax, %rdx +; SSE-NEXT: shlq %cl, %rax +; SSE-NEXT: xorl 
%r9d, %r9d ; SSE-NEXT: testb $64, %cl -; SSE-NEXT: cmovneq %rdx, %rsi +; SSE-NEXT: cmovneq %rsi, %r8 +; SSE-NEXT: cmovneq %r9, %rsi +; SSE-NEXT: notq %r8 ; SSE-NEXT: cmovneq %rax, %rdx -; SSE-NEXT: movq (%rdi), %rax -; SSE-NEXT: movq 8(%rdi), %rcx -; SSE-NEXT: movq %rcx, %r8 -; SSE-NEXT: andq %rsi, %r8 +; SSE-NEXT: cmovneq %r9, %rax ; SSE-NEXT: notq %rsi -; SSE-NEXT: movq %rax, %r9 -; SSE-NEXT: andq %rdx, %r9 -; SSE-NEXT: notq %rdx -; SSE-NEXT: andq %rcx, %rsi -; SSE-NEXT: andq %rax, %rdx -; SSE-NEXT: orq %r8, %r9 -; SSE-NEXT: sete %al -; SSE-NEXT: movq %rdx, (%rdi) -; SSE-NEXT: movq %rsi, 8(%rdi) +; SSE-NEXT: andq 8(%rdi), %r8 +; SSE-NEXT: orq %rdx, %r8 +; SSE-NEXT: andq (%rdi), %rsi +; SSE-NEXT: orq %rax, %rsi +; SSE-NEXT: movl %ecx, %eax +; SSE-NEXT: andl $96, %eax +; SSE-NEXT: shrl $3, %eax +; SSE-NEXT: movl (%rdi,%rax), %eax +; SSE-NEXT: btl %ecx, %eax +; SSE-NEXT: setae %al +; SSE-NEXT: movq %rsi, (%rdi) +; SSE-NEXT: movq %r8, 8(%rdi) ; SSE-NEXT: retq ; -; AVX-LABEL: reset_eq_i128: -; AVX: # %bb.0: -; AVX-NEXT: movl %esi, %ecx -; AVX-NEXT: xorl %eax, %eax -; AVX-NEXT: movl $1, %edx -; AVX-NEXT: xorl %esi, %esi -; AVX-NEXT: shldq %cl, %rdx, %rsi -; AVX-NEXT: shlxq %rcx, %rdx, %rdx -; AVX-NEXT: testb $64, %cl -; AVX-NEXT: cmovneq %rdx, %rsi -; AVX-NEXT: cmovneq %rax, %rdx -; AVX-NEXT: movq (%rdi), %rax -; AVX-NEXT: movq 8(%rdi), %rcx -; AVX-NEXT: andnq %rcx, %rsi, %r8 -; AVX-NEXT: andq %rsi, %rcx -; AVX-NEXT: andnq %rax, %rdx, %rsi -; AVX-NEXT: andq %rdx, %rax -; AVX-NEXT: orq %rcx, %rax -; AVX-NEXT: sete %al -; AVX-NEXT: movq %rsi, (%rdi) -; AVX-NEXT: movq %r8, 8(%rdi) -; AVX-NEXT: retq +; AVX2-LABEL: init_eq_i128: +; AVX2: # %bb.0: +; AVX2-NEXT: movl %esi, %ecx +; AVX2-NEXT: movl $1, %eax +; AVX2-NEXT: xorl %esi, %esi +; AVX2-NEXT: shldq %cl, %rax, %rsi +; AVX2-NEXT: movl %edx, %edx +; AVX2-NEXT: xorl %r8d, %r8d +; AVX2-NEXT: shldq %cl, %rdx, %r8 +; AVX2-NEXT: xorl %r9d, %r9d +; AVX2-NEXT: shlxq %rcx, %rax, %rax +; AVX2-NEXT: testb $64, %cl +; AVX2-NEXT: cmovneq %rax, %rsi +; AVX2-NEXT: cmovneq %r9, %rax +; AVX2-NEXT: shlxq %rcx, %rdx, %rdx +; AVX2-NEXT: cmovneq %rdx, %r8 +; AVX2-NEXT: cmovneq %r9, %rdx +; AVX2-NEXT: andnq 8(%rdi), %rsi, %rsi +; AVX2-NEXT: orq %r8, %rsi +; AVX2-NEXT: andnq (%rdi), %rax, %r8 +; AVX2-NEXT: orq %rdx, %r8 +; AVX2-NEXT: movl %ecx, %eax +; AVX2-NEXT: andl $96, %eax +; AVX2-NEXT: shrl $3, %eax +; AVX2-NEXT: movl (%rdi,%rax), %eax +; AVX2-NEXT: btl %ecx, %eax +; AVX2-NEXT: setae %al +; AVX2-NEXT: movq %r8, (%rdi) +; AVX2-NEXT: movq %rsi, 8(%rdi) +; AVX2-NEXT: retq +; +; AVX512-LABEL: init_eq_i128: +; AVX512: # %bb.0: +; AVX512-NEXT: movl %esi, %ecx +; AVX512-NEXT: movl $1, %eax +; AVX512-NEXT: xorl %esi, %esi +; AVX512-NEXT: shldq %cl, %rax, %rsi +; AVX512-NEXT: xorl %r8d, %r8d +; AVX512-NEXT: shlxq %rcx, %rax, %rax +; AVX512-NEXT: movl %edx, %edx +; AVX512-NEXT: xorl %r9d, %r9d +; AVX512-NEXT: shldq %cl, %rdx, %r9 +; AVX512-NEXT: testb $64, %cl +; AVX512-NEXT: cmovneq %rax, %rsi +; AVX512-NEXT: cmovneq %r8, %rax +; AVX512-NEXT: shlxq %rcx, %rdx, %rdx +; AVX512-NEXT: cmovneq %rdx, %r9 +; AVX512-NEXT: cmovneq %r8, %rdx +; AVX512-NEXT: andnq 8(%rdi), %rsi, %rsi +; AVX512-NEXT: orq %r9, %rsi +; AVX512-NEXT: andnq (%rdi), %rax, %r8 +; AVX512-NEXT: orq %rdx, %r8 +; AVX512-NEXT: movl %ecx, %eax +; AVX512-NEXT: andl $96, %eax +; AVX512-NEXT: shrl $3, %eax +; AVX512-NEXT: movl (%rdi,%rax), %eax +; AVX512-NEXT: btl %ecx, %eax +; AVX512-NEXT: setae %al +; AVX512-NEXT: movq %r8, (%rdi) +; AVX512-NEXT: movq %rsi, 8(%rdi) +; AVX512-NEXT: retq %rem = and i32 %position, 
127 %ofs = zext nneg i32 %rem to i128 %bit = shl nuw i128 1, %ofs %mask = xor i128 %bit, -1 + %val0 = zext i1 %value to i128 + %val = shl nuw i128 %val0, %ofs %ld = load i128, ptr %word %test = and i128 %ld, %bit - %res = and i128 %ld, %mask + %res0 = and i128 %ld, %mask + %res = or i128 %res0, %val %cmp = icmp eq i128 %test, 0 store i128 %res, ptr %word ret i1 %cmp } -define i1 @set_ne_i128(ptr %word, i32 %position) nounwind { -; X86-LABEL: set_ne_i128: +; i512 + +define i1 @test_ne_i512(ptr %word, i32 %position) nounwind { +; X86-LABEL: test_ne_i512: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, %edx +; X86-NEXT: shrl $3, %edx +; X86-NEXT: andl $60, %edx +; X86-NEXT: movl (%eax,%edx), %eax +; X86-NEXT: btl %ecx, %eax +; X86-NEXT: setb %al +; X86-NEXT: retl +; +; X64-LABEL: test_ne_i512: +; X64: # %bb.0: +; X64-NEXT: movl %esi, %eax +; X64-NEXT: shrl $3, %eax +; X64-NEXT: andl $60, %eax +; X64-NEXT: movl (%rdi,%rax), %eax +; X64-NEXT: btl %esi, %eax +; X64-NEXT: setb %al +; X64-NEXT: retq + %rem = and i32 %position, 511 + %ofs = zext nneg i32 %rem to i512 + %bit = shl nuw i512 1, %ofs + %ld = load i512, ptr %word + %test = and i512 %ld, %bit + %cmp = icmp ne i512 %test, 0 + ret i1 %cmp +} + +define i1 @complement_ne_i512(ptr %word, i32 %position) nounwind { +; X86-LABEL: complement_ne_i512: ; X86: # %bb.0: -; X86-NEXT: pushl %ebp -; X86-NEXT: movl %esp, %ebp -; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: andl $-16, %esp -; X86-NEXT: subl $80, %esp -; X86-NEXT: movzbl 12(%ebp), %ecx -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $1, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: shrb $3, %al -; X86-NEXT: andb $12, %al -; X86-NEXT: negb %al -; X86-NEXT: movsbl %al, %eax -; X86-NEXT: movl 56(%esp,%eax), %esi -; X86-NEXT: movl 60(%esp,%eax), %edx -; X86-NEXT: shldl %cl, %esi, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 48(%esp,%eax), %edi -; X86-NEXT: movl 52(%esp,%eax), %ebx -; X86-NEXT: shldl %cl, %ebx, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edi, %ebx -; X86-NEXT: movl 8(%ebp), %eax -; X86-NEXT: shll %cl, %edi -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: movl 8(%eax), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %esi, %eax -; X86-NEXT: movl (%ecx), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %ecx, %esi -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: andl %edi, %ecx -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: movl 12(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl 4(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %ebx, %eax -; X86-NEXT: orl %edx, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X86-NEXT: 
orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-NEXT: orl %ecx, %eax -; X86-NEXT: movl 8(%ebp), %eax -; X86-NEXT: movl %edx, 8(%eax) -; X86-NEXT: movl %esi, 12(%eax) -; X86-NEXT: movl %edi, (%eax) -; X86-NEXT: movl %ebx, 4(%eax) -; X86-NEXT: setne %al -; X86-NEXT: leal -12(%ebp), %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl %edx, %esi +; X86-NEXT: shrl $3, %esi +; X86-NEXT: andl $60, %esi +; X86-NEXT: movl (%ecx,%esi), %edi +; X86-NEXT: btl %edx, %edi +; X86-NEXT: setb %al +; X86-NEXT: btcl %edx, %edi +; X86-NEXT: movl %edi, (%ecx,%esi) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi -; X86-NEXT: popl %ebx -; X86-NEXT: popl %ebp ; X86-NEXT: retl ; -; SSE-LABEL: set_ne_i128: -; SSE: # %bb.0: -; SSE-NEXT: movl %esi, %ecx -; SSE-NEXT: movl $1, %edx -; SSE-NEXT: xorl %esi, %esi -; SSE-NEXT: shldq %cl, %rdx, %rsi -; SSE-NEXT: shlq %cl, %rdx -; SSE-NEXT: xorl %eax, %eax -; SSE-NEXT: testb $64, %cl -; SSE-NEXT: cmovneq %rdx, %rsi -; SSE-NEXT: cmovneq %rax, %rdx -; SSE-NEXT: movq (%rdi), %rax -; SSE-NEXT: movq 8(%rdi), %rcx -; SSE-NEXT: movq %rcx, %r8 -; SSE-NEXT: andq %rsi, %r8 -; SSE-NEXT: movq %rax, %r9 -; SSE-NEXT: andq %rdx, %r9 -; SSE-NEXT: orq %rcx, %rsi -; SSE-NEXT: orq %rax, %rdx -; SSE-NEXT: orq %r8, %r9 -; SSE-NEXT: setne %al -; SSE-NEXT: movq %rdx, (%rdi) -; SSE-NEXT: movq %rsi, 8(%rdi) -; SSE-NEXT: retq -; -; AVX-LABEL: set_ne_i128: -; AVX: # %bb.0: -; AVX-NEXT: movl %esi, %ecx -; AVX-NEXT: xorl %eax, %eax -; AVX-NEXT: movl $1, %edx -; AVX-NEXT: xorl %esi, %esi -; AVX-NEXT: shldq %cl, %rdx, %rsi -; AVX-NEXT: shlxq %rcx, %rdx, %rdx -; AVX-NEXT: testb $64, %cl -; AVX-NEXT: cmovneq %rdx, %rsi -; AVX-NEXT: cmovneq %rax, %rdx -; AVX-NEXT: movq (%rdi), %rax -; AVX-NEXT: movq 8(%rdi), %rcx -; AVX-NEXT: movq %rcx, %r8 -; AVX-NEXT: andq %rsi, %r8 -; AVX-NEXT: movq %rax, %r9 -; AVX-NEXT: andq %rdx, %r9 -; AVX-NEXT: orq %rcx, %rsi -; AVX-NEXT: orq %rax, %rdx -; AVX-NEXT: orq %r8, %r9 -; AVX-NEXT: setne %al -; AVX-NEXT: movq %rdx, (%rdi) -; AVX-NEXT: movq %rsi, 8(%rdi) -; AVX-NEXT: retq - %rem = and i32 %position, 127 - %ofs = zext nneg i32 %rem to i128 - %bit = shl nuw i128 1, %ofs - %ld = load i128, ptr %word - %test = and i128 %ld, %bit - %res = or i128 %ld, %bit - %cmp = icmp ne i128 %test, 0 - store i128 %res, ptr %word +; X64-LABEL: complement_ne_i512: +; X64: # %bb.0: +; X64-NEXT: movl %esi, %ecx +; X64-NEXT: shrl $3, %ecx +; X64-NEXT: andl $60, %ecx +; X64-NEXT: movl (%rdi,%rcx), %edx +; X64-NEXT: btl %esi, %edx +; X64-NEXT: setb %al +; X64-NEXT: btcl %esi, %edx +; X64-NEXT: movl %edx, (%rdi,%rcx) +; X64-NEXT: retq + %rem = and i32 %position, 511 + %ofs = zext nneg i32 %rem to i512 + %bit = shl nuw i512 1, %ofs + %ld = load i512, ptr %word + %test = and i512 %ld, %bit + %res = xor i512 %ld, %bit + %cmp = icmp ne i512 %test, 0 + store i512 %res, ptr %word ret i1 %cmp } -define i1 @init_eq_i128(ptr %word, i32 %position, i1 zeroext %value) nounwind { -; X86-LABEL: init_eq_i128: +define i1 @reset_eq_i512(ptr %word, i32 %position) nounwind { +; X86-LABEL: reset_eq_i512: ; X86: # %bb.0: -; X86-NEXT: pushl %ebp -; X86-NEXT: movl %esp, %ebp -; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: andl $-16, %esp -; X86-NEXT: subl $128, %esp -; X86-NEXT: movzbl 12(%ebp), %ecx -; X86-NEXT: movzbl 16(%ebp), %eax -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $1, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, 
{{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl %ecx, %edx -; X86-NEXT: shrb $3, %dl -; X86-NEXT: andb $12, %dl -; X86-NEXT: negb %dl -; X86-NEXT: movsbl %dl, %esi -; X86-NEXT: movl 64(%esp,%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 68(%esp,%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 72(%esp,%esi), %ebx -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movzbl %al, %eax -; X86-NEXT: movl 76(%esp,%esi), %edi -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: shldl %cl, %ebx, %edi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: shll %cl, %edx -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl 8(%ebp), %esi -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 8(%esi), %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %ebx, %eax -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl (%esi), %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %ebx, %edx -; X86-NEXT: orl %eax, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %edi, %esi -; X86-NEXT: movl 8(%ebp), %ecx -; X86-NEXT: movl 12(%ecx), %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %ebx, %edi -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: movl %eax, %edx -; X86-NEXT: movl 4(%ecx), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %ecx, %ebx -; X86-NEXT: orl %edi, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: notl %ecx -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl 100(%esp,%ecx), %edi -; X86-NEXT: movl 104(%esp,%ecx), %ecx -; X86-NEXT: movl %ecx, %ebx -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: movzbl 12(%ebp), %ecx -; X86-NEXT: shldl %cl, %edi, %ebx -; X86-NEXT: orl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: notl %esi -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: movl 108(%esp,%ebx), %ebx -; X86-NEXT: shldl %cl, %eax, %ebx -; X86-NEXT: orl %ebx, %esi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: notl %eax -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: movl 96(%esp,%ebx), %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shll %cl, %ebx -; X86-NEXT: orl %ebx, %eax -; X86-NEXT: notl %edx -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), 
%edx # 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: shldl %cl, %ebx, %edi -; X86-NEXT: orl %edi, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT: movl 8(%ebp), %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: movl %edi, 8(%ecx) -; X86-NEXT: movl %esi, 12(%ecx) -; X86-NEXT: movl %eax, (%ecx) -; X86-NEXT: movl %edx, 4(%ecx) -; X86-NEXT: sete %al -; X86-NEXT: leal -12(%ebp), %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl %edx, %esi +; X86-NEXT: shrl $3, %esi +; X86-NEXT: andl $60, %esi +; X86-NEXT: movl (%ecx,%esi), %edi +; X86-NEXT: btl %edx, %edi +; X86-NEXT: setae %al +; X86-NEXT: btrl %edx, %edi +; X86-NEXT: movl %edi, (%ecx,%esi) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi -; X86-NEXT: popl %ebx -; X86-NEXT: popl %ebp ; X86-NEXT: retl ; -; SSE-LABEL: init_eq_i128: -; SSE: # %bb.0: -; SSE-NEXT: movl %esi, %ecx -; SSE-NEXT: movl $1, %esi -; SSE-NEXT: xorl %r8d, %r8d -; SSE-NEXT: shldq %cl, %rsi, %r8 -; SSE-NEXT: shlq %cl, %rsi -; SSE-NEXT: movl %edx, %eax -; SSE-NEXT: xorl %edx, %edx -; SSE-NEXT: shldq %cl, %rax, %rdx -; SSE-NEXT: shlq %cl, %rax -; SSE-NEXT: xorl %r9d, %r9d -; SSE-NEXT: testb $64, %cl -; SSE-NEXT: cmovneq %rsi, %r8 -; SSE-NEXT: cmovneq %r9, %rsi -; SSE-NEXT: cmovneq %rax, %rdx -; SSE-NEXT: cmovneq %r9, %rax -; SSE-NEXT: movq (%rdi), %rcx -; SSE-NEXT: movq 8(%rdi), %r9 -; SSE-NEXT: movq %r9, %r10 -; SSE-NEXT: andq %r8, %r10 -; SSE-NEXT: notq %r8 -; SSE-NEXT: movq %rcx, %r11 -; SSE-NEXT: andq %rsi, %r11 -; SSE-NEXT: notq %rsi -; SSE-NEXT: andq %r9, %r8 -; SSE-NEXT: orq %rdx, %r8 -; SSE-NEXT: andq %rcx, %rsi -; SSE-NEXT: orq %rax, %rsi -; SSE-NEXT: orq %r10, %r11 -; SSE-NEXT: sete %al -; SSE-NEXT: movq %rsi, (%rdi) -; SSE-NEXT: movq %r8, 8(%rdi) -; SSE-NEXT: retq -; -; AVX2-LABEL: init_eq_i128: -; AVX2: # %bb.0: -; AVX2-NEXT: movl %esi, %ecx -; AVX2-NEXT: movl $1, %esi -; AVX2-NEXT: xorl %eax, %eax -; AVX2-NEXT: shldq %cl, %rsi, %rax -; AVX2-NEXT: xorl %r8d, %r8d -; AVX2-NEXT: movl %edx, %edx -; AVX2-NEXT: xorl %r9d, %r9d -; AVX2-NEXT: shldq %cl, %rdx, %r9 -; AVX2-NEXT: shlxq %rcx, %rsi, %rsi -; AVX2-NEXT: testb $64, %cl -; AVX2-NEXT: cmovneq %rsi, %rax -; AVX2-NEXT: cmovneq %r8, %rsi -; AVX2-NEXT: shlxq %rcx, %rdx, %rcx -; AVX2-NEXT: cmovneq %rcx, %r9 -; AVX2-NEXT: cmovneq %r8, %rcx -; AVX2-NEXT: movq (%rdi), %rdx -; AVX2-NEXT: movq 8(%rdi), %r8 -; AVX2-NEXT: andnq %r8, %rax, %r10 -; AVX2-NEXT: andq %rax, %r8 -; AVX2-NEXT: andnq %rdx, %rsi, %r11 -; AVX2-NEXT: andq %rsi, %rdx -; AVX2-NEXT: orq %r9, %r10 -; AVX2-NEXT: orq %rcx, %r11 -; AVX2-NEXT: orq %r8, %rdx -; AVX2-NEXT: sete %al -; AVX2-NEXT: movq %r11, (%rdi) -; AVX2-NEXT: movq %r10, 8(%rdi) -; AVX2-NEXT: retq -; -; AVX512-LABEL: init_eq_i128: -; AVX512: # %bb.0: -; AVX512-NEXT: movl %esi, %ecx -; AVX512-NEXT: xorl %eax, %eax -; AVX512-NEXT: movl $1, %esi -; AVX512-NEXT: xorl %r8d, %r8d -; AVX512-NEXT: shldq %cl, %rsi, %r8 -; AVX512-NEXT: shlxq %rcx, %rsi, %rsi -; AVX512-NEXT: movl %edx, %edx -; AVX512-NEXT: xorl %r9d, %r9d -; AVX512-NEXT: shldq %cl, %rdx, %r9 -; AVX512-NEXT: testb $64, %cl -; AVX512-NEXT: cmovneq %rsi, %r8 -; AVX512-NEXT: cmovneq %rax, %rsi -; AVX512-NEXT: shlxq %rcx, %rdx, %rcx -; AVX512-NEXT: cmovneq %rcx, %r9 -; AVX512-NEXT: cmovneq %rax, %rcx -; AVX512-NEXT: movq (%rdi), %rax -; AVX512-NEXT: movq 8(%rdi), %rdx -; AVX512-NEXT: andnq %rdx, %r8, %r10 -; 
AVX512-NEXT: andq %r8, %rdx -; AVX512-NEXT: andnq %rax, %rsi, %r8 -; AVX512-NEXT: andq %rsi, %rax -; AVX512-NEXT: orq %r9, %r10 -; AVX512-NEXT: orq %rcx, %r8 -; AVX512-NEXT: orq %rdx, %rax -; AVX512-NEXT: sete %al -; AVX512-NEXT: movq %r8, (%rdi) -; AVX512-NEXT: movq %r10, 8(%rdi) -; AVX512-NEXT: retq - %rem = and i32 %position, 127 - %ofs = zext nneg i32 %rem to i128 - %bit = shl nuw i128 1, %ofs - %mask = xor i128 %bit, -1 - %val0 = zext i1 %value to i128 - %val = shl nuw i128 %val0, %ofs - %ld = load i128, ptr %word - %test = and i128 %ld, %bit - %res0 = and i128 %ld, %mask - %res = or i128 %res0, %val - %cmp = icmp eq i128 %test, 0 - store i128 %res, ptr %word +; X64-LABEL: reset_eq_i512: +; X64: # %bb.0: +; X64-NEXT: movl %esi, %ecx +; X64-NEXT: shrl $3, %ecx +; X64-NEXT: andl $60, %ecx +; X64-NEXT: movl (%rdi,%rcx), %edx +; X64-NEXT: btl %esi, %edx +; X64-NEXT: setae %al +; X64-NEXT: btrl %esi, %edx +; X64-NEXT: movl %edx, (%rdi,%rcx) +; X64-NEXT: retq + %rem = and i32 %position, 511 + %ofs = zext nneg i32 %rem to i512 + %bit = shl nuw i512 1, %ofs + %mask = xor i512 %bit, -1 + %ld = load i512, ptr %word + %test = and i512 %ld, %bit + %res = and i512 %ld, %mask + %cmp = icmp eq i512 %test, 0 + store i512 %res, ptr %word ret i1 %cmp } -; i512 +define i1 @set_ne_i512(ptr %word, i32 %position) nounwind { +; X86-LABEL: set_ne_i512: +; X86: # %bb.0: +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl %edx, %esi +; X86-NEXT: shrl $3, %esi +; X86-NEXT: andl $60, %esi +; X86-NEXT: movl (%ecx,%esi), %edi +; X86-NEXT: btl %edx, %edi +; X86-NEXT: setb %al +; X86-NEXT: btsl %edx, %edi +; X86-NEXT: movl %edi, (%ecx,%esi) +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: retl +; +; X64-LABEL: set_ne_i512: +; X64: # %bb.0: +; X64-NEXT: movl %esi, %ecx +; X64-NEXT: shrl $3, %ecx +; X64-NEXT: andl $60, %ecx +; X64-NEXT: movl (%rdi,%rcx), %edx +; X64-NEXT: btl %esi, %edx +; X64-NEXT: setb %al +; X64-NEXT: btsl %esi, %edx +; X64-NEXT: movl %edx, (%rdi,%rcx) +; X64-NEXT: retq + %rem = and i32 %position, 511 + %ofs = zext nneg i32 %rem to i512 + %bit = shl nuw i512 1, %ofs + %ld = load i512, ptr %word + %test = and i512 %ld, %bit + %res = or i512 %ld, %bit + %cmp = icmp ne i512 %test, 0 + store i512 %res, ptr %word + ret i1 %cmp +} -define i1 @test_ne_i512(ptr %word, i32 %position) nounwind { -; X86-LABEL: test_ne_i512: +define i1 @init_eq_i512(ptr %word, i32 %position, i1 zeroext %value) nounwind { +; X86-LABEL: init_eq_i512: ; X86: # %bb.0: ; X86-NEXT: pushl %ebp ; X86-NEXT: movl %esp, %ebp @@ -1258,14 +983,14 @@ define i1 @test_ne_i512(ptr %word, i32 %position) nounwind { ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: andl $-16, %esp -; X86-NEXT: subl $224, %esp +; X86-NEXT: subl $352, %esp # imm = 0x160 ; X86-NEXT: movl 12(%ebp), %ecx -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: shrl $3, %eax -; X86-NEXT: andl $60, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: leal {{[0-9]+}}(%esp), %edx -; X86-NEXT: subl %eax, %edx +; X86-NEXT: movl %ecx, %edx +; X86-NEXT: shrl $3, %edx +; X86-NEXT: andl $60, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; X86-NEXT: subl %edx, %eax ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) @@ -1298,325 +1023,88 @@ define i1 @test_ne_i512(ptr %word, i32 %position) nounwind { ; X86-NEXT: movl $0, 
{{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl 24(%edx), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 28(%edx), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl $31, %ecx -; X86-NEXT: shldl %cl, %esi, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 56(%edx), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 60(%edx), %eax -; X86-NEXT: shldl %cl, %esi, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 8(%edx), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 12(%edx), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %esi, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 40(%edx), %edi +; X86-NEXT: movl 56(%eax), %esi +; X86-NEXT: movl 60(%eax), %ebx +; X86-NEXT: movl 52(%eax), %edi ; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 44(%edx), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edi, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 16(%edx), %eax +; X86-NEXT: movl 48(%eax), %edi +; X86-NEXT: movl 44(%eax), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 40(%eax), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 36(%eax), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 32(%eax), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 28(%eax), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 24(%eax), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 20(%eax), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 16(%eax), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 12(%eax), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 8(%eax), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl (%eax), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 4(%eax), %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 20(%edx), %edi -; X86-NEXT: movl %edi, %ebx -; X86-NEXT: shldl %cl, %eax, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 32(%edx), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 36(%edx), %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movzbl 16(%ebp), %eax +; X86-NEXT: movzbl %al, %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: andl $31, %ecx ; X86-NEXT: shldl %cl, %esi, %ebx ; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl 52(%edx), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl 4(%edx), %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; 
X86-NEXT: shldl %cl, %edi, %esi ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: shldl %cl, %edi, %eax -; X86-NEXT: movl 8(%ebp), %ebx -; X86-NEXT: andl 40(%ebx), %eax -; X86-NEXT: andl 8(%ebx), %esi -; X86-NEXT: orl %eax, %esi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 56(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: andl 24(%ebx), %edi -; X86-NEXT: orl %eax, %edi -; X86-NEXT: orl %esi, %edi +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: shldl %cl, %ebx, %edi ; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: shldl %cl, %edx, %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %ebx, %edi -; X86-NEXT: andl 44(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: andl 12(%ebx), %esi -; X86-NEXT: orl %eax, %esi -; X86-NEXT: movl %esi, %ebx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: andl 60(%edi), %esi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 28(%edi), %eax -; X86-NEXT: orl %esi, %eax -; X86-NEXT: orl %ebx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl (%edx), %esi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: shldl %cl, %edx, %edi +; X86-NEXT: shldl %cl, %edx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: shldl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: shldl %cl, %edx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: negl %edx -; X86-NEXT: movl 192(%esp,%edx), %edx +; X86-NEXT: shldl %cl, %edx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: shldl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: shldl %cl, %ebx, %edx -; X86-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NEXT: shll %cl, %esi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl 8(%ebp), %ebx -; X86-NEXT: andl 32(%ebx), %ecx -; X86-NEXT: andl (%ebx), %esi -; X86-NEXT: orl %ecx, %esi 
-; X86-NEXT: andl 16(%ebx), %edi -; X86-NEXT: andl 48(%ebx), %edx -; X86-NEXT: orl %edi, %edx -; X86-NEXT: orl %esi, %edx -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 36(%ebx), %ecx +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: andl 4(%ebx), %esi -; X86-NEXT: orl %ecx, %esi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 20(%ebx), %ecx -; X86-NEXT: andl 52(%ebx), %eax -; X86-NEXT: orl %ecx, %eax -; X86-NEXT: orl %esi, %eax -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: orl %edx, %eax -; X86-NEXT: setne %al -; X86-NEXT: leal -12(%ebp), %esp -; X86-NEXT: popl %esi -; X86-NEXT: popl %edi -; X86-NEXT: popl %ebx -; X86-NEXT: popl %ebp -; X86-NEXT: retl -; -; SSE-LABEL: test_ne_i512: -; SSE: # %bb.0: -; SSE-NEXT: pushq %r15 -; SSE-NEXT: pushq %r14 -; SSE-NEXT: pushq %rbx -; SSE-NEXT: xorps %xmm0, %xmm0 -; SSE-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movq $1, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movl %esi, %ecx -; SSE-NEXT: andl $63, %ecx -; SSE-NEXT: shrl $3, %esi -; SSE-NEXT: andl $56, %esi -; SSE-NEXT: negl %esi -; SSE-NEXT: movslq %esi, %rbx -; SSE-NEXT: movq -48(%rsp,%rbx), %rdx -; SSE-NEXT: movq -40(%rsp,%rbx), %r14 -; SSE-NEXT: movq %r14, %rax -; SSE-NEXT: shldq %cl, %rdx, %rax -; SSE-NEXT: movq -16(%rsp,%rbx), %r11 -; SSE-NEXT: movq -8(%rsp,%rbx), %r10 -; SSE-NEXT: shldq %cl, %r11, %r10 -; SSE-NEXT: movq -32(%rsp,%rbx), %r9 -; SSE-NEXT: movq -24(%rsp,%rbx), %r15 -; SSE-NEXT: movq %r15, %r8 -; SSE-NEXT: shldq %cl, %r9, %r8 -; SSE-NEXT: movq -56(%rsp,%rbx), %rsi -; SSE-NEXT: shldq %cl, %rsi, %rdx -; SSE-NEXT: shldq %cl, %r15, %r11 -; SSE-NEXT: shldq %cl, %r14, %r9 -; SSE-NEXT: movq -64(%rsp,%rbx), %rbx -; SSE-NEXT: shldq %cl, %rbx, %rsi -; SSE-NEXT: # kill: def $cl killed $cl killed $ecx -; SSE-NEXT: shlq %cl, %rbx -; SSE-NEXT: andq 32(%rdi), %r9 -; SSE-NEXT: andq 48(%rdi), %r11 -; SSE-NEXT: andq 16(%rdi), %rdx -; SSE-NEXT: orq %r11, %rdx -; SSE-NEXT: andq 40(%rdi), %r8 -; SSE-NEXT: andq 56(%rdi), %r10 -; SSE-NEXT: andq 24(%rdi), %rax -; SSE-NEXT: orq %r10, %rax -; SSE-NEXT: andq (%rdi), %rbx -; SSE-NEXT: orq %r9, %rbx -; SSE-NEXT: orq %rdx, %rbx -; SSE-NEXT: andq 8(%rdi), %rsi -; SSE-NEXT: orq %r8, %rsi -; SSE-NEXT: orq %rax, %rsi -; SSE-NEXT: orq %rbx, %rsi -; SSE-NEXT: setne %al -; SSE-NEXT: popq %rbx -; SSE-NEXT: popq %r14 -; SSE-NEXT: popq %r15 -; SSE-NEXT: retq -; -; AVX2-LABEL: test_ne_i512: -; AVX2: # %bb.0: -; AVX2-NEXT: pushq %r15 -; AVX2-NEXT: pushq %r14 -; AVX2-NEXT: pushq %rbx -; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovss {{.*#+}} xmm0 = [1,0,0,0] -; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: movl %esi, %ecx -; AVX2-NEXT: andl $63, %ecx -; AVX2-NEXT: shrl $3, %esi -; AVX2-NEXT: andl $56, %esi -; AVX2-NEXT: negl %esi -; AVX2-NEXT: movslq %esi, %rsi -; AVX2-NEXT: movq -48(%rsp,%rsi), %rdx -; AVX2-NEXT: movq -40(%rsp,%rsi), %rbx -; 
AVX2-NEXT: movq %rbx, %rax -; AVX2-NEXT: shldq %cl, %rdx, %rax -; AVX2-NEXT: movq -16(%rsp,%rsi), %r11 -; AVX2-NEXT: movq -8(%rsp,%rsi), %r10 -; AVX2-NEXT: shldq %cl, %r11, %r10 -; AVX2-NEXT: movq -32(%rsp,%rsi), %r9 -; AVX2-NEXT: movq -24(%rsp,%rsi), %r14 -; AVX2-NEXT: movq %r14, %r8 -; AVX2-NEXT: shldq %cl, %r9, %r8 -; AVX2-NEXT: movq -64(%rsp,%rsi), %r15 -; AVX2-NEXT: movq -56(%rsp,%rsi), %rsi -; AVX2-NEXT: shldq %cl, %rsi, %rdx -; AVX2-NEXT: shldq %cl, %r14, %r11 -; AVX2-NEXT: shldq %cl, %rbx, %r9 -; AVX2-NEXT: shldq %cl, %r15, %rsi -; AVX2-NEXT: shlxq %rcx, %r15, %rcx -; AVX2-NEXT: andq 32(%rdi), %r9 -; AVX2-NEXT: andq 48(%rdi), %r11 -; AVX2-NEXT: andq 16(%rdi), %rdx -; AVX2-NEXT: andq 40(%rdi), %r8 -; AVX2-NEXT: andq 56(%rdi), %r10 -; AVX2-NEXT: andq 24(%rdi), %rax -; AVX2-NEXT: orq %r11, %rdx -; AVX2-NEXT: orq %r10, %rax -; AVX2-NEXT: andq (%rdi), %rcx -; AVX2-NEXT: orq %r9, %rcx -; AVX2-NEXT: orq %rdx, %rcx -; AVX2-NEXT: andq 8(%rdi), %rsi -; AVX2-NEXT: orq %r8, %rsi -; AVX2-NEXT: orq %rax, %rsi -; AVX2-NEXT: orq %rcx, %rsi -; AVX2-NEXT: setne %al -; AVX2-NEXT: popq %rbx -; AVX2-NEXT: popq %r14 -; AVX2-NEXT: popq %r15 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: test_ne_i512: -; AVX512: # %bb.0: -; AVX512-NEXT: pushq %r15 -; AVX512-NEXT: pushq %r14 -; AVX512-NEXT: pushq %rbx -; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [1,0,0,0] -; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: movl %esi, %ecx -; AVX512-NEXT: andl $63, %ecx -; AVX512-NEXT: shrl $3, %esi -; AVX512-NEXT: andl $56, %esi -; AVX512-NEXT: negl %esi -; AVX512-NEXT: movslq %esi, %rbx -; AVX512-NEXT: movq -48(%rsp,%rbx), %rdx -; AVX512-NEXT: movq -40(%rsp,%rbx), %r14 -; AVX512-NEXT: movq %r14, %rax -; AVX512-NEXT: shldq %cl, %rdx, %rax -; AVX512-NEXT: movq -16(%rsp,%rbx), %r11 -; AVX512-NEXT: movq -8(%rsp,%rbx), %r10 -; AVX512-NEXT: shldq %cl, %r11, %r10 -; AVX512-NEXT: movq -32(%rsp,%rbx), %r9 -; AVX512-NEXT: movq -24(%rsp,%rbx), %r15 -; AVX512-NEXT: movq %r15, %r8 -; AVX512-NEXT: shldq %cl, %r9, %r8 -; AVX512-NEXT: movq -56(%rsp,%rbx), %rsi -; AVX512-NEXT: shldq %cl, %rsi, %rdx -; AVX512-NEXT: shldq %cl, %r15, %r11 -; AVX512-NEXT: shldq %cl, %r14, %r9 -; AVX512-NEXT: movq -64(%rsp,%rbx), %rbx -; AVX512-NEXT: shldq %cl, %rbx, %rsi -; AVX512-NEXT: shlxq %rcx, %rbx, %rcx -; AVX512-NEXT: andq 32(%rdi), %r9 -; AVX512-NEXT: andq 48(%rdi), %r11 -; AVX512-NEXT: andq 16(%rdi), %rdx -; AVX512-NEXT: andq 40(%rdi), %r8 -; AVX512-NEXT: andq 56(%rdi), %r10 -; AVX512-NEXT: andq 24(%rdi), %rax -; AVX512-NEXT: orq %r11, %rdx -; AVX512-NEXT: orq %r10, %rax -; AVX512-NEXT: andq (%rdi), %rcx -; AVX512-NEXT: orq %r9, %rcx -; AVX512-NEXT: orq %rdx, %rcx -; AVX512-NEXT: andq 8(%rdi), %rsi -; AVX512-NEXT: orq %r8, %rsi -; AVX512-NEXT: orq %rax, %rsi -; AVX512-NEXT: orq %rcx, %rsi -; AVX512-NEXT: setne %al -; AVX512-NEXT: popq %rbx -; AVX512-NEXT: popq %r14 -; AVX512-NEXT: popq %r15 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %rem = and i32 %position, 511 - %ofs = zext nneg i32 %rem to i512 - %bit = shl nuw i512 1, %ofs - %ld = load i512, ptr %word - %test = and i512 %ld, %bit - %cmp = icmp ne i512 %test, 0 - ret i1 %cmp -} - -define i1 @complement_ne_i512(ptr %word, i32 %position) nounwind { -; X86-LABEL: complement_ne_i512: -; X86: # %bb.0: -; X86-NEXT: pushl %ebp -; X86-NEXT: movl %esp, %ebp -; X86-NEXT: pushl %ebx -; 
X86-NEXT: pushl %edi -; X86-NEXT: pushl %esi -; X86-NEXT: andl $-16, %esp -; X86-NEXT: subl $272, %esp # imm = 0x110 -; X86-NEXT: movl 12(%ebp), %ecx -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: shrl $3, %eax -; X86-NEXT: andl $60, %eax -; X86-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-NEXT: leal {{[0-9]+}}(%esp), %edx -; X86-NEXT: subl %eax, %edx +; X86-NEXT: shldl %cl, %esi, %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl %cl, %eax, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shll %cl, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: notl %ebx +; X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; X86-NEXT: subl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) @@ -1632,7 +1120,6 @@ define i1 @complement_ne_i512(ptr %word, i32 %position) nounwind { ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $1, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) @@ -1649,191 +1136,131 @@ define i1 @complement_ne_i512(ptr %word, i32 %position) nounwind { ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl 24(%edx), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 28(%edx), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl $31, %ecx -; X86-NEXT: shldl %cl, %eax, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 56(%edx), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 60(%edx), %esi -; X86-NEXT: shldl %cl, %eax, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 8(%edx), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 12(%edx), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %eax, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 40(%edx), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 44(%edx), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %eax, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 16(%edx), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 20(%edx), %ebx -; X86-NEXT: movl %ebx, %esi -; X86-NEXT: shldl %cl, %eax, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 32(%edx), %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 36(%edx), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 56(%eax), %esi +; X86-NEXT: movl 60(%eax), %edi +; X86-NEXT: shldl %cl, %esi, %edi +; X86-NEXT: movl 8(%ebp), %edx +; X86-NEXT: andl 60(%edx), %ebx +; X86-NEXT: orl %edi, %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: notl %ebx +; X86-NEXT: movl 52(%eax), %edi ; 
X86-NEXT: shldl %cl, %edi, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl 52(%edx), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: andl 56(%edx), %ebx +; X86-NEXT: orl %esi, %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: notl %ebx +; X86-NEXT: movl 48(%eax), %esi ; X86-NEXT: shldl %cl, %esi, %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 4(%edx), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl 52(%edx), %ebx +; X86-NEXT: orl %edi, %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, %ebx +; X86-NEXT: notl %ebx +; X86-NEXT: movl 44(%eax), %edi +; X86-NEXT: shldl %cl, %edi, %esi +; X86-NEXT: andl 48(%edx), %ebx +; X86-NEXT: orl %esi, %ebx ; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 8(%ebp), %edx -; X86-NEXT: movl 40(%edx), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %esi, %eax -; X86-NEXT: movl 8(%edx), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %ebx, %esi -; X86-NEXT: orl %eax, %esi -; X86-NEXT: movl %edx, %eax -; X86-NEXT: movl 56(%edx), %ebx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: notl %ebx +; X86-NEXT: movl 40(%eax), %esi +; X86-NEXT: shldl %cl, %esi, %edi +; X86-NEXT: andl 44(%edx), %ebx +; X86-NEXT: orl %edi, %ebx ; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %edi, %ebx -; X86-NEXT: movl 24(%edx), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: orl %ebx, %edx -; X86-NEXT: orl %esi, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 44(%eax), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT: movl 12(%eax), %ebx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: notl %ebx +; X86-NEXT: movl 36(%eax), %edi +; X86-NEXT: shldl %cl, %edi, %esi +; X86-NEXT: andl 40(%edx), %ebx +; X86-NEXT: orl %esi, %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: notl %ebx +; X86-NEXT: movl 32(%eax), %esi +; X86-NEXT: shldl %cl, %esi, %edi +; X86-NEXT: andl 36(%edx), %ebx +; X86-NEXT: orl %edi, %ebx ; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: notl %ebx +; X86-NEXT: movl 28(%eax), %edi +; X86-NEXT: shldl %cl, %edi, %esi +; X86-NEXT: andl 32(%edx), %ebx ; X86-NEXT: orl %esi, %ebx -; X86-NEXT: movl 60(%eax), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 
4-byte Spill -; X86-NEXT: movl %eax, %esi -; X86-NEXT: movl %edx, %eax -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: movl 28(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: orl %eax, %edx -; X86-NEXT: orl %ebx, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl (%eax), %edx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, %ebx +; X86-NEXT: notl %ebx +; X86-NEXT: movl 24(%eax), %esi +; X86-NEXT: shldl %cl, %esi, %edi +; X86-NEXT: andl 28(%edx), %ebx +; X86-NEXT: orl %edi, %ebx ; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl (%esp), %eax # 4-byte Reload -; X86-NEXT: negl %eax -; X86-NEXT: movl 240(%esp,%eax), %esi -; X86-NEXT: shldl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: shldl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, %esi -; X86-NEXT: movl %esi, (%esp) # 4-byte Spill -; X86-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NEXT: shll %cl, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 8(%ebp), %esi -; X86-NEXT: movl 32(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %edi, %eax -; X86-NEXT: movl (%esi), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %edx, %ecx -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: movl 16(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %ebx, %eax -; X86-NEXT: movl 48(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl (%esp), %edx # 4-byte Folded Reload -; X86-NEXT: orl %eax, %edx -; X86-NEXT: orl %ecx, %edx -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl 36(%esi), %ebx -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: movl 4(%esi), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: movl %esi, %eax -; X86-NEXT: movl 20(%esi), %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: andl %esi, %edi -; X86-NEXT: movl 52(%eax), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: orl %edi, %eax -; X86-NEXT: orl %ecx, %eax -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: xorl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: xorl 
{{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: xorl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: xorl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: xorl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: xorl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: xorl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: xorl %ecx, (%esp) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: xorl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: xorl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT: orl %edx, %eax -; X86-NEXT: movl 8(%ebp), %edx -; X86-NEXT: movl %ebx, 60(%edx) -; X86-NEXT: movl %edi, 56(%edx) -; X86-NEXT: movl %ecx, 52(%edx) -; X86-NEXT: movl %esi, 44(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 40(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: notl %ebx +; X86-NEXT: movl 20(%eax), %edi +; X86-NEXT: shldl %cl, %edi, %esi +; X86-NEXT: andl 24(%edx), %ebx +; X86-NEXT: orl %esi, %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: notl %ebx +; X86-NEXT: movl 16(%eax), %esi +; X86-NEXT: shldl %cl, %esi, %edi +; X86-NEXT: andl 20(%edx), %ebx +; X86-NEXT: orl %edi, %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: notl %ebx +; X86-NEXT: movl 12(%eax), %edi +; X86-NEXT: shldl %cl, %edi, %esi +; X86-NEXT: andl 16(%edx), %ebx +; X86-NEXT: orl %esi, %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: notl %ebx +; X86-NEXT: movl 8(%eax), %esi +; X86-NEXT: shldl %cl, %esi, %edi +; X86-NEXT: andl 12(%edx), %ebx +; X86-NEXT: orl %edi, %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: notl %ebx +; X86-NEXT: movl 4(%eax), %edi +; X86-NEXT: 
shldl %cl, %edi, %esi +; X86-NEXT: andl 8(%edx), %ebx +; X86-NEXT: orl %esi, %ebx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: notl %esi +; X86-NEXT: movl (%eax), %eax +; X86-NEXT: shldl %cl, %eax, %edi +; X86-NEXT: andl 4(%edx), %esi +; X86-NEXT: orl %edi, %esi +; X86-NEXT: movl %esi, %edi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: notl %esi +; X86-NEXT: shll %cl, %eax +; X86-NEXT: andl (%edx), %esi +; X86-NEXT: orl %eax, %esi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl (%edx,%eax), %eax +; X86-NEXT: btl %ecx, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %eax, 60(%edx) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %eax, 56(%edx) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %eax, 52(%edx) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %eax, 48(%edx) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %eax, 44(%edx) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %eax, 40(%edx) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: movl %eax, 36(%edx) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: movl %eax, 32(%edx) @@ -1847,15 +1274,10 @@ define i1 @complement_ne_i512(ptr %word, i32 %position) nounwind { ; X86-NEXT: movl %eax, 16(%edx) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: movl %eax, 12(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 8(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 4(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, (%edx) -; X86-NEXT: movl (%esp), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 48(%edx) -; X86-NEXT: setne %al +; X86-NEXT: movl %ebx, 8(%edx) +; X86-NEXT: movl %edi, 4(%edx) +; X86-NEXT: movl %esi, (%edx) +; X86-NEXT: setae %al ; X86-NEXT: leal -12(%ebp), %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi @@ -1863,7 +1285,7 @@ define i1 @complement_ne_i512(ptr %word, i32 %position) nounwind { ; X86-NEXT: popl %ebp ; X86-NEXT: retl ; -; SSE-LABEL: complement_ne_i512: +; SSE-LABEL: init_eq_i512: ; SSE: # %bb.0: ; SSE-NEXT: pushq %rbp ; SSE-NEXT: pushq %r15 @@ -1871,94 +1293,121 @@ define i1 @complement_ne_i512(ptr %word, i32 %position) nounwind { ; SSE-NEXT: pushq %r13 ; SSE-NEXT: pushq %r12 ; SSE-NEXT: pushq %rbx -; SSE-NEXT: subq $56, %rsp +; SSE-NEXT: subq $168, %rsp +; SSE-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; SSE-NEXT: xorps %xmm0, %xmm0 ; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) ; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movq $0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movq $1, {{[0-9]+}}(%rsp) +; SSE-NEXT: movl %esi, %ecx +; SSE-NEXT: andl $63, %ecx +; SSE-NEXT: movl %esi, %eax +; SSE-NEXT: shrl $3, %eax +; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: # kill: def $eax killed $eax killed $rax +; SSE-NEXT: andl $56, %eax +; SSE-NEXT: negl %eax +; SSE-NEXT: movslq %eax, %r12 +; SSE-NEXT: movq 136(%rsp,%r12), %r9 +; SSE-NEXT: movq 144(%rsp,%r12), 
%rax +; SSE-NEXT: movq %rax, %rsi +; SSE-NEXT: shldq %cl, %r9, %rsi +; SSE-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: movq 152(%rsp,%r12), %r11 +; SSE-NEXT: shldq %cl, %rax, %r11 +; SSE-NEXT: movq 120(%rsp,%r12), %r10 +; SSE-NEXT: movq 128(%rsp,%r12), %rax +; SSE-NEXT: movq %rax, %rbx +; SSE-NEXT: shldq %cl, %r10, %rbx +; SSE-NEXT: shldq %cl, %rax, %r9 +; SSE-NEXT: movq 104(%rsp,%r12), %r14 +; SSE-NEXT: movq 112(%rsp,%r12), %rax +; SSE-NEXT: movq %rax, %r15 +; SSE-NEXT: shldq %cl, %r14, %r15 +; SSE-NEXT: shldq %cl, %rax, %r10 +; SSE-NEXT: movq 96(%rsp,%r12), %rax +; SSE-NEXT: movq %rax, %r13 +; SSE-NEXT: shlq %cl, %r13 +; SSE-NEXT: shldq %cl, %rax, %r14 +; SSE-NEXT: movl %edx, %eax +; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) ; SSE-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) ; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movq %rax, -{{[0-9]+}}(%rsp) ; SSE-NEXT: movq $0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movq $1, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movl %esi, %ecx -; SSE-NEXT: andl $63, %ecx -; SSE-NEXT: shrl $3, %esi -; SSE-NEXT: andl $56, %esi -; SSE-NEXT: negl %esi -; SSE-NEXT: movslq %esi, %rbx -; SSE-NEXT: movq (%rsp,%rbx), %rsi -; SSE-NEXT: movq 8(%rsp,%rbx), %r14 -; SSE-NEXT: movq %r14, %rax -; SSE-NEXT: shldq %cl, %rsi, %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 32(%rsp,%rbx), %r8 -; SSE-NEXT: movq 40(%rsp,%rbx), %rbp +; SSE-NEXT: movq 8(%rsp,%r12), %r8 +; SSE-NEXT: movq 16(%rsp,%r12), %rsi +; SSE-NEXT: movq %rsi, %rbp ; SSE-NEXT: shldq %cl, %r8, %rbp -; SSE-NEXT: movq 16(%rsp,%rbx), %r9 -; SSE-NEXT: movq 24(%rsp,%rbx), %r15 -; SSE-NEXT: movq %r15, %r10 -; SSE-NEXT: shldq %cl, %r9, %r10 -; SSE-NEXT: movq -8(%rsp,%rbx), %r11 -; SSE-NEXT: shldq %cl, %r11, %rsi -; SSE-NEXT: shldq %cl, %r15, %r8 -; SSE-NEXT: shldq %cl, %r14, %r9 -; SSE-NEXT: movq -16(%rsp,%rbx), %rbx -; SSE-NEXT: shldq %cl, %rbx, %r11 +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; SSE-NEXT: notq %rax +; SSE-NEXT: andq 48(%rdi), %rax +; SSE-NEXT: orq %rbp, %rax +; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT: notq %rbx +; SSE-NEXT: notq %r11 +; SSE-NEXT: movq 24(%rsp,%r12), %rax +; SSE-NEXT: shldq %cl, %rsi, %rax +; SSE-NEXT: movq -8(%rsp,%r12), %rbp +; SSE-NEXT: movq (%rsp,%r12), %rdx +; SSE-NEXT: movq %rdx, %rsi +; SSE-NEXT: shldq %cl, %rbp, %rsi +; SSE-NEXT: andq 56(%rdi), %r11 +; SSE-NEXT: andq 32(%rdi), %rbx +; SSE-NEXT: orq %rax, %r11 +; SSE-NEXT: orq %rsi, %rbx +; SSE-NEXT: notq %r15 +; SSE-NEXT: shldq %cl, %rdx, %r8 +; SSE-NEXT: notq %r9 +; SSE-NEXT: andq 40(%rdi), %r9 +; SSE-NEXT: orq %r8, %r9 +; SSE-NEXT: movq -24(%rsp,%r12), %rax +; SSE-NEXT: movq -16(%rsp,%r12), %rdx +; SSE-NEXT: movq %rdx, %rsi +; SSE-NEXT: shldq %cl, %rax, %rsi +; SSE-NEXT: andq 16(%rdi), %r15 +; SSE-NEXT: orq %rsi, %r15 +; SSE-NEXT: shldq %cl, %rdx, %rbp +; SSE-NEXT: notq %r10 +; SSE-NEXT: notq %r13 +; SSE-NEXT: movq -32(%rsp,%r12), %rdx +; SSE-NEXT: movq %rdx, %rsi +; SSE-NEXT: shlq %cl, %rsi +; SSE-NEXT: andq 24(%rdi), %r10 +; SSE-NEXT: andq (%rdi), %r13 +; SSE-NEXT: orq %rbp, %r10 +; SSE-NEXT: orq %rsi, %r13 +; SSE-NEXT: notq %r14 ; SSE-NEXT: # kill: def $cl killed $cl killed $ecx -; SSE-NEXT: shlq %cl, %rbx -; SSE-NEXT: movq 24(%rdi), %r15 -; SSE-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 56(%rdi), %rcx -; SSE-NEXT: 
movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 16(%rdi), %r12 -; SSE-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 48(%rdi), %r13 -; SSE-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: andq %r8, %r13 -; SSE-NEXT: andq %rsi, %r12 -; SSE-NEXT: orq %r13, %r12 -; SSE-NEXT: movq %rcx, %r13 -; SSE-NEXT: andq %rbp, %r13 -; SSE-NEXT: andq %rax, %r15 -; SSE-NEXT: orq %r13, %r15 -; SSE-NEXT: movq 32(%rdi), %r14 -; SSE-NEXT: movq %r14, %rcx -; SSE-NEXT: andq %r9, %rcx -; SSE-NEXT: movq (%rdi), %r13 -; SSE-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: andq %rbx, %r13 -; SSE-NEXT: orq %rcx, %r13 -; SSE-NEXT: orq %r12, %r13 -; SSE-NEXT: movq 40(%rdi), %rcx -; SSE-NEXT: movq %rcx, %r12 -; SSE-NEXT: andq %r10, %r12 -; SSE-NEXT: movq 8(%rdi), %rdx -; SSE-NEXT: movq %rdx, %rax -; SSE-NEXT: andq %r11, %rax -; SSE-NEXT: orq %r12, %rax -; SSE-NEXT: orq %r15, %rax -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; SSE-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Folded Reload -; SSE-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload -; SSE-NEXT: xorq %rcx, %r10 -; SSE-NEXT: xorq %r14, %r9 -; SSE-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload -; SSE-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload -; SSE-NEXT: xorq %rdx, %r11 -; SSE-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload -; SSE-NEXT: orq %r13, %rax -; SSE-NEXT: movq %r8, 48(%rdi) -; SSE-NEXT: movq %rbp, 56(%rdi) -; SSE-NEXT: movq %r9, 32(%rdi) -; SSE-NEXT: movq %r10, 40(%rdi) -; SSE-NEXT: movq %rsi, 16(%rdi) -; SSE-NEXT: movq %r15, 24(%rdi) -; SSE-NEXT: movq %rbx, (%rdi) -; SSE-NEXT: movq %r11, 8(%rdi) -; SSE-NEXT: setne %al -; SSE-NEXT: addq $56, %rsp +; SSE-NEXT: shldq %cl, %rdx, %rax +; SSE-NEXT: andq 8(%rdi), %r14 +; SSE-NEXT: orq %rax, %r14 +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; SSE-NEXT: andl $60, %eax +; SSE-NEXT: movl (%rdi,%rax), %eax +; SSE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload +; SSE-NEXT: btl %ecx, %eax +; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; SSE-NEXT: movq %rax, 48(%rdi) +; SSE-NEXT: movq %r11, 56(%rdi) +; SSE-NEXT: movq %rbx, 32(%rdi) +; SSE-NEXT: movq %r9, 40(%rdi) +; SSE-NEXT: movq %r15, 16(%rdi) +; SSE-NEXT: movq %r10, 24(%rdi) +; SSE-NEXT: movq %r13, (%rdi) +; SSE-NEXT: movq %r14, 8(%rdi) +; SSE-NEXT: setae %al +; SSE-NEXT: addq $168, %rsp ; SSE-NEXT: popq %rbx ; SSE-NEXT: popq %r12 ; SSE-NEXT: popq %r13 @@ -1967,7 +1416,7 @@ define i1 @complement_ne_i512(ptr %word, i32 %position) nounwind { ; SSE-NEXT: popq %rbp ; SSE-NEXT: retq ; -; AVX2-LABEL: complement_ne_i512: +; AVX2-LABEL: init_eq_i512: ; AVX2: # %bb.0: ; AVX2-NEXT: pushq %rbp ; AVX2-NEXT: pushq %r15 @@ -1975,4540 +1424,105 @@ define i1 @complement_ne_i512(ptr %word, i32 %position) nounwind { ; AVX2-NEXT: pushq %r13 ; AVX2-NEXT: pushq %r12 ; AVX2-NEXT: pushq %rbx -; AVX2-NEXT: subq $72, %rsp +; AVX2-NEXT: subq $184, %rsp ; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 ; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovss {{.*#+}} xmm0 = [1,0,0,0] -; AVX2-NEXT: vmovups %ymm0, (%rsp) -; AVX2-NEXT: movl %esi, %ecx -; AVX2-NEXT: andl $63, %ecx -; AVX2-NEXT: shrl $3, %esi -; AVX2-NEXT: andl $56, %esi -; AVX2-NEXT: negl %esi -; AVX2-NEXT: movslq %esi, %rbx -; AVX2-NEXT: movq 16(%rsp,%rbx), %rsi -; AVX2-NEXT: movq 24(%rsp,%rbx), 
%rbp -; AVX2-NEXT: movq %rbp, %rax -; AVX2-NEXT: shldq %cl, %rsi, %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 48(%rsp,%rbx), %r8 -; AVX2-NEXT: movq 56(%rsp,%rbx), %r13 -; AVX2-NEXT: shldq %cl, %r8, %r13 -; AVX2-NEXT: movq 32(%rsp,%rbx), %r9 -; AVX2-NEXT: movq 40(%rsp,%rbx), %r14 -; AVX2-NEXT: movq %r14, %r10 -; AVX2-NEXT: shldq %cl, %r9, %r10 -; AVX2-NEXT: movq 8(%rsp,%rbx), %r11 -; AVX2-NEXT: shldq %cl, %r11, %rsi -; AVX2-NEXT: shldq %cl, %r14, %r8 -; AVX2-NEXT: movq 16(%rdi), %r12 -; AVX2-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 48(%rdi), %r14 -; AVX2-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: andq %r8, %r14 -; AVX2-NEXT: andq %rsi, %r12 -; AVX2-NEXT: orq %r14, %r12 -; AVX2-NEXT: movq 56(%rdi), %r15 -; AVX2-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: andq %r13, %r15 -; AVX2-NEXT: movq 24(%rdi), %r14 -; AVX2-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: andq %rax, %r14 -; AVX2-NEXT: orq %r15, %r14 -; AVX2-NEXT: shldq %cl, %rbp, %r9 -; AVX2-NEXT: movq (%rsp,%rbx), %rdx -; AVX2-NEXT: movq 32(%rdi), %r15 -; AVX2-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: andq %r9, %r15 -; AVX2-NEXT: shlxq %rcx, %rdx, %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq (%rdi), %rbx -; AVX2-NEXT: movq %rbx, %rbp -; AVX2-NEXT: andq %rax, %rbp -; AVX2-NEXT: orq %r15, %rbp -; AVX2-NEXT: orq %r12, %rbp -; AVX2-NEXT: # kill: def $cl killed $cl killed $rcx -; AVX2-NEXT: shldq %cl, %rdx, %r11 -; AVX2-NEXT: movq 40(%rdi), %rax -; AVX2-NEXT: movq %rax, %rcx -; AVX2-NEXT: andq %r10, %rcx -; AVX2-NEXT: movq 8(%rdi), %r15 -; AVX2-NEXT: movq %r15, %r12 -; AVX2-NEXT: andq %r11, %r12 -; AVX2-NEXT: orq %rcx, %r12 -; AVX2-NEXT: orq %r14, %r12 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; AVX2-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload -; AVX2-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload -; AVX2-NEXT: xorq %rax, %r10 -; AVX2-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload -; AVX2-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload -; AVX2-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload -; AVX2-NEXT: xorq %r15, %r11 -; AVX2-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload -; AVX2-NEXT: orq %rbp, %r12 -; AVX2-NEXT: movq %r8, 48(%rdi) -; AVX2-NEXT: movq %r13, 56(%rdi) -; AVX2-NEXT: movq %r9, 32(%rdi) -; AVX2-NEXT: movq %r10, 40(%rdi) -; AVX2-NEXT: movq %rsi, 16(%rdi) -; AVX2-NEXT: movq %rcx, 24(%rdi) -; AVX2-NEXT: movq %rbx, (%rdi) -; AVX2-NEXT: movq %r11, 8(%rdi) -; AVX2-NEXT: setne %al -; AVX2-NEXT: addq $72, %rsp -; AVX2-NEXT: popq %rbx -; AVX2-NEXT: popq %r12 -; AVX2-NEXT: popq %r13 -; AVX2-NEXT: popq %r14 -; AVX2-NEXT: popq %r15 -; AVX2-NEXT: popq %rbp -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: complement_ne_i512: -; AVX512: # %bb.0: -; AVX512-NEXT: pushq %rbp -; AVX512-NEXT: pushq %r15 -; AVX512-NEXT: pushq %r14 -; AVX512-NEXT: pushq %r13 -; AVX512-NEXT: pushq %r12 -; AVX512-NEXT: pushq %rbx -; AVX512-NEXT: subq $72, %rsp -; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [1,0,0,0] -; AVX512-NEXT: vmovups %ymm0, (%rsp) -; AVX512-NEXT: movl %esi, %ecx -; AVX512-NEXT: andl $63, %ecx -; 
AVX512-NEXT: shrl $3, %esi -; AVX512-NEXT: andl $56, %esi -; AVX512-NEXT: negl %esi -; AVX512-NEXT: movslq %esi, %rbx -; AVX512-NEXT: movq 16(%rsp,%rbx), %rsi -; AVX512-NEXT: movq 24(%rsp,%rbx), %rbp -; AVX512-NEXT: movq %rbp, %rax -; AVX512-NEXT: shldq %cl, %rsi, %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 48(%rsp,%rbx), %r8 -; AVX512-NEXT: movq 56(%rsp,%rbx), %r13 -; AVX512-NEXT: shldq %cl, %r8, %r13 -; AVX512-NEXT: movq 32(%rsp,%rbx), %r9 -; AVX512-NEXT: movq 40(%rsp,%rbx), %r14 -; AVX512-NEXT: movq %r14, %r10 -; AVX512-NEXT: shldq %cl, %r9, %r10 -; AVX512-NEXT: movq 8(%rsp,%rbx), %r11 -; AVX512-NEXT: shldq %cl, %r11, %rsi -; AVX512-NEXT: shldq %cl, %r14, %r8 -; AVX512-NEXT: movq 16(%rdi), %r12 -; AVX512-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 48(%rdi), %r14 -; AVX512-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: andq %r8, %r14 -; AVX512-NEXT: andq %rsi, %r12 -; AVX512-NEXT: orq %r14, %r12 -; AVX512-NEXT: movq 56(%rdi), %r15 -; AVX512-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: andq %r13, %r15 -; AVX512-NEXT: movq 24(%rdi), %r14 -; AVX512-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: andq %rax, %r14 -; AVX512-NEXT: orq %r15, %r14 -; AVX512-NEXT: shldq %cl, %rbp, %r9 -; AVX512-NEXT: movq (%rsp,%rbx), %rdx -; AVX512-NEXT: movq 32(%rdi), %r15 -; AVX512-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: andq %r9, %r15 -; AVX512-NEXT: shlxq %rcx, %rdx, %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq (%rdi), %rbx -; AVX512-NEXT: movq %rbx, %rbp -; AVX512-NEXT: andq %rax, %rbp -; AVX512-NEXT: orq %r15, %rbp -; AVX512-NEXT: orq %r12, %rbp -; AVX512-NEXT: # kill: def $cl killed $cl killed $rcx -; AVX512-NEXT: shldq %cl, %rdx, %r11 -; AVX512-NEXT: movq 40(%rdi), %rax -; AVX512-NEXT: movq %rax, %rcx -; AVX512-NEXT: andq %r10, %rcx -; AVX512-NEXT: movq 8(%rdi), %r15 -; AVX512-NEXT: movq %r15, %r12 -; AVX512-NEXT: andq %r11, %r12 -; AVX512-NEXT: orq %rcx, %r12 -; AVX512-NEXT: orq %r14, %r12 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; AVX512-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload -; AVX512-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload -; AVX512-NEXT: xorq %rax, %r10 -; AVX512-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload -; AVX512-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload -; AVX512-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload -; AVX512-NEXT: xorq %r15, %r11 -; AVX512-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload -; AVX512-NEXT: orq %rbp, %r12 -; AVX512-NEXT: movq %r8, 48(%rdi) -; AVX512-NEXT: movq %r13, 56(%rdi) -; AVX512-NEXT: movq %r9, 32(%rdi) -; AVX512-NEXT: movq %r10, 40(%rdi) -; AVX512-NEXT: movq %rsi, 16(%rdi) -; AVX512-NEXT: movq %rcx, 24(%rdi) -; AVX512-NEXT: movq %rbx, (%rdi) -; AVX512-NEXT: movq %r11, 8(%rdi) -; AVX512-NEXT: setne %al -; AVX512-NEXT: addq $72, %rsp -; AVX512-NEXT: popq %rbx -; AVX512-NEXT: popq %r12 -; AVX512-NEXT: popq %r13 -; AVX512-NEXT: popq %r14 -; AVX512-NEXT: popq %r15 -; AVX512-NEXT: popq %rbp -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %rem = and i32 %position, 511 - %ofs = zext nneg i32 %rem to i512 - %bit = shl nuw i512 1, %ofs - %ld = load i512, ptr %word - %test = and i512 %ld, %bit - %res = xor i512 %ld, %bit - %cmp = icmp ne i512 %test, 0 - store i512 %res, ptr %word - 
ret i1 %cmp -} - -define i1 @reset_eq_i512(ptr %word, i32 %position) nounwind { -; X86-LABEL: reset_eq_i512: -; X86: # %bb.0: -; X86-NEXT: pushl %ebp -; X86-NEXT: movl %esp, %ebp -; X86-NEXT: pushl %ebx -; X86-NEXT: pushl %edi -; X86-NEXT: pushl %esi -; X86-NEXT: andl $-16, %esp -; X86-NEXT: subl $288, %esp # imm = 0x120 -; X86-NEXT: movl 12(%ebp), %ecx -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: shrl $3, %eax -; X86-NEXT: andl $60, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: leal {{[0-9]+}}(%esp), %edi -; X86-NEXT: subl %eax, %edi -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $1, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl 4(%edi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 8(%edi), %eax -; X86-NEXT: andl $31, %ecx -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: shldl %cl, %edx, %ebx -; X86-NEXT: movl 12(%edi), %edx -; X86-NEXT: movl %edx, %esi -; X86-NEXT: shldl %cl, %eax, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 16(%edi), %eax -; X86-NEXT: movl %eax, %esi -; X86-NEXT: shldl %cl, %edx, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 20(%edi), %edx -; X86-NEXT: movl %edx, %esi -; X86-NEXT: shldl %cl, %eax, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 24(%edi), %eax -; X86-NEXT: movl %eax, %esi -; X86-NEXT: shldl %cl, %edx, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 28(%edi), %edx -; X86-NEXT: movl %edx, %esi -; X86-NEXT: shldl %cl, %eax, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 32(%edi), %eax -; X86-NEXT: movl %eax, %esi -; X86-NEXT: shldl %cl, %edx, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 36(%edi), %esi -; X86-NEXT: movl %esi, %edx -; X86-NEXT: shldl %cl, %eax, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 40(%edi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %esi, %edx -; X86-NEXT: movl 8(%ebp), %esi -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 40(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %eax, %edx -; X86-NEXT: movl %ebx, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 8(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %eax, %ebx -; X86-NEXT: orl %edx, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 44(%edi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, %edx -; X86-NEXT: movl 52(%edi), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 56(%edi), %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NEXT: shldl %cl, %esi, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 8(%ebp), %esi -; X86-NEXT: movl 56(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %eax, %ebx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 24(%esi), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %ecx, %eax -; X86-NEXT: orl %ebx, %eax -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %esi, %ebx -; X86-NEXT: movl 44(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %eax, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 12(%esi), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %ecx, %eax -; X86-NEXT: orl %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 60(%edi), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: shldl %cl, %esi, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 60(%ebx), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %edx, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 28(%ebx), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %edx, %esi -; X86-NEXT: orl %eax, %esi -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl (%edi), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: negl %eax -; X86-NEXT: movl 256(%esp,%eax), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: shldl %cl, %edi, %eax -; X86-NEXT: movl %esi, %edi -; X86-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NEXT: shll %cl, %edi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %ebx, %esi -; X86-NEXT: movl 32(%ebx), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte 
Spill -; X86-NEXT: andl %ecx, %edx -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl (%ebx), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %ecx, %edi -; X86-NEXT: orl %edx, %edi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 16(%esi), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %ecx, %ebx -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 48(%esi), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %ecx, %eax -; X86-NEXT: orl %ebx, %eax -; X86-NEXT: orl %edi, %eax -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: shldl %cl, %edx, %edi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 36(%esi), %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %ebx, %edx -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 4(%esi), %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %ebx, %edi -; X86-NEXT: orl %edx, %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: shldl %cl, %edi, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 20(%esi), %edi -; X86-NEXT: andl %edi, %ecx -; X86-NEXT: movl %ecx, %esi -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: movl 8(%ebp), %ebx -; X86-NEXT: movl 52(%ebx), %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %ebx, %edx -; X86-NEXT: orl %esi, %edx -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: notl %edx -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: notl %edx -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: notl %edx -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: notl %ebx -; X86-NEXT: andl %edi, %ebx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: notl %edx -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: notl %edx -; 
X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: notl %edx -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: notl %edx -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: notl %edx -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: notl %edx -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: notl %esi -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: notl %edx -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: notl %edi -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: notl %edi -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: notl %edi -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-NEXT: notl %ecx -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT: orl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl 8(%ebp), %eax -; X86-NEXT: movl %edx, 60(%eax) -; X86-NEXT: movl %esi, 56(%eax) -; X86-NEXT: movl %ecx, 52(%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl %ecx, 44(%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl %ecx, 40(%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl %ecx, 36(%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl %ecx, 32(%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl %ecx, 28(%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl %ecx, 24(%eax) -; X86-NEXT: movl %ebx, 20(%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl %ecx, 16(%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl %ecx, 12(%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl %ecx, 8(%eax) -; X86-NEXT: movl %edi, 4(%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl %ecx, (%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl %ecx, 48(%eax) -; X86-NEXT: sete %al -; X86-NEXT: leal -12(%ebp), %esp -; X86-NEXT: popl %esi -; X86-NEXT: popl %edi -; X86-NEXT: popl %ebx -; X86-NEXT: popl %ebp -; X86-NEXT: 
retl -; -; SSE-LABEL: reset_eq_i512: -; SSE: # %bb.0: -; SSE-NEXT: pushq %rbp -; SSE-NEXT: pushq %r15 -; SSE-NEXT: pushq %r14 -; SSE-NEXT: pushq %r13 -; SSE-NEXT: pushq %r12 -; SSE-NEXT: pushq %rbx -; SSE-NEXT: subq $56, %rsp -; SSE-NEXT: xorps %xmm0, %xmm0 -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movq $0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movq $1, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movl %esi, %ecx -; SSE-NEXT: andl $63, %ecx -; SSE-NEXT: shrl $3, %esi -; SSE-NEXT: andl $56, %esi -; SSE-NEXT: negl %esi -; SSE-NEXT: movslq %esi, %rdx -; SSE-NEXT: movq (%rsp,%rdx), %r9 -; SSE-NEXT: movq 8(%rsp,%rdx), %r8 -; SSE-NEXT: movq %r8, %rsi -; SSE-NEXT: shldq %cl, %r9, %rsi -; SSE-NEXT: movq -8(%rsp,%rdx), %rax -; SSE-NEXT: shldq %cl, %rax, %r9 -; SSE-NEXT: movq 16(%rsp,%rdx), %r14 -; SSE-NEXT: movq 24(%rsp,%rdx), %r10 -; SSE-NEXT: movq %r10, %rbx -; SSE-NEXT: shldq %cl, %r14, %rbx -; SSE-NEXT: shldq %cl, %r8, %r14 -; SSE-NEXT: movq 32(%rsp,%rdx), %r13 -; SSE-NEXT: movq 40(%rsp,%rdx), %r12 -; SSE-NEXT: shldq %cl, %r13, %r12 -; SSE-NEXT: shldq %cl, %r10, %r13 -; SSE-NEXT: movq -16(%rsp,%rdx), %rdx -; SSE-NEXT: shldq %cl, %rdx, %rax -; SSE-NEXT: # kill: def $cl killed $cl killed $ecx -; SSE-NEXT: shlq %cl, %rdx -; SSE-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq %r12, %rbp -; SSE-NEXT: movq %r9, %r15 -; SSE-NEXT: movq %rsi, %r11 -; SSE-NEXT: movq 16(%rdi), %r8 -; SSE-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 48(%rdi), %rcx -; SSE-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: andq %rcx, %r13 -; SSE-NEXT: andq %r8, %r9 -; SSE-NEXT: orq %r13, %r9 -; SSE-NEXT: movq 56(%rdi), %rcx -; SSE-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: andq %rcx, %r12 -; SSE-NEXT: movq 24(%rdi), %r10 -; SSE-NEXT: andq %r10, %rsi -; SSE-NEXT: orq %r12, %rsi -; SSE-NEXT: movq %r14, %r13 -; SSE-NEXT: movq 32(%rdi), %rcx -; SSE-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: andq %rcx, %r14 -; SSE-NEXT: movq %rdx, %r12 -; SSE-NEXT: movq (%rdi), %rcx -; SSE-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: andq %rcx, %rdx -; SSE-NEXT: orq %r14, %rdx -; SSE-NEXT: orq %r9, %rdx -; SSE-NEXT: movq %rbx, %r14 -; SSE-NEXT: movq 40(%rdi), %rcx -; SSE-NEXT: andq %rcx, %rbx -; SSE-NEXT: movq %rax, %r9 -; SSE-NEXT: movq 8(%rdi), %r8 -; SSE-NEXT: andq %r8, %rax -; SSE-NEXT: orq %rbx, %rax -; SSE-NEXT: orq %rsi, %rax -; SSE-NEXT: notq %r11 -; SSE-NEXT: andq %r10, %r11 -; SSE-NEXT: notq %r15 -; SSE-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Folded Reload -; SSE-NEXT: notq %r14 -; SSE-NEXT: andq %rcx, %r14 -; SSE-NEXT: notq %r13 -; SSE-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload -; SSE-NEXT: notq %rbp -; SSE-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; SSE-NEXT: notq %rcx -; SSE-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload -; SSE-NEXT: notq %r9 -; SSE-NEXT: andq %r8, %r9 -; SSE-NEXT: notq %r12 -; SSE-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Folded Reload -; SSE-NEXT: orq %rdx, %rax -; SSE-NEXT: movq %rcx, 48(%rdi) -; SSE-NEXT: movq %rbp, 56(%rdi) -; SSE-NEXT: movq %r13, 32(%rdi) 
-; SSE-NEXT: movq %r14, 40(%rdi) -; SSE-NEXT: movq %r15, 16(%rdi) -; SSE-NEXT: movq %r11, 24(%rdi) -; SSE-NEXT: movq %r12, (%rdi) -; SSE-NEXT: movq %r9, 8(%rdi) -; SSE-NEXT: sete %al -; SSE-NEXT: addq $56, %rsp -; SSE-NEXT: popq %rbx -; SSE-NEXT: popq %r12 -; SSE-NEXT: popq %r13 -; SSE-NEXT: popq %r14 -; SSE-NEXT: popq %r15 -; SSE-NEXT: popq %rbp -; SSE-NEXT: retq -; -; AVX2-LABEL: reset_eq_i512: -; AVX2: # %bb.0: -; AVX2-NEXT: pushq %rbp -; AVX2-NEXT: pushq %r15 -; AVX2-NEXT: pushq %r14 -; AVX2-NEXT: pushq %r13 -; AVX2-NEXT: pushq %r12 -; AVX2-NEXT: pushq %rbx -; AVX2-NEXT: pushq %rax -; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovss {{.*#+}} xmm0 = [1,0,0,0] -; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: movl %esi, %ecx -; AVX2-NEXT: andl $63, %ecx -; AVX2-NEXT: shrl $3, %esi -; AVX2-NEXT: andl $56, %esi -; AVX2-NEXT: negl %esi -; AVX2-NEXT: movslq %esi, %rdx -; AVX2-NEXT: movq -48(%rsp,%rdx), %r8 -; AVX2-NEXT: movq -40(%rsp,%rdx), %rbx -; AVX2-NEXT: movq %rbx, %rax -; AVX2-NEXT: shldq %cl, %r8, %rax -; AVX2-NEXT: movq -16(%rsp,%rdx), %r10 -; AVX2-NEXT: movq -8(%rsp,%rdx), %rsi -; AVX2-NEXT: shldq %cl, %r10, %rsi -; AVX2-NEXT: movq -32(%rsp,%rdx), %r11 -; AVX2-NEXT: movq -24(%rsp,%rdx), %r14 -; AVX2-NEXT: movq %r14, %r9 -; AVX2-NEXT: shldq %cl, %r11, %r9 -; AVX2-NEXT: movq -64(%rsp,%rdx), %r15 -; AVX2-NEXT: movq -56(%rsp,%rdx), %rdx -; AVX2-NEXT: shldq %cl, %rdx, %r8 -; AVX2-NEXT: shldq %cl, %r14, %r10 -; AVX2-NEXT: shldq %cl, %rbx, %r11 -; AVX2-NEXT: shldq %cl, %r15, %rdx -; AVX2-NEXT: shlxq %rcx, %r15, %rcx -; AVX2-NEXT: movq 24(%rdi), %rbx -; AVX2-NEXT: movq 56(%rdi), %r14 -; AVX2-NEXT: movq 16(%rdi), %r15 -; AVX2-NEXT: movq 48(%rdi), %r13 -; AVX2-NEXT: movq 32(%rdi), %rbp -; AVX2-NEXT: andnq %rbp, %r11, %r12 -; AVX2-NEXT: andq %r11, %rbp -; AVX2-NEXT: andnq %r13, %r10, %r11 -; AVX2-NEXT: andq %r10, %r13 -; AVX2-NEXT: andnq %r15, %r8, %r10 -; AVX2-NEXT: andq %r8, %r15 -; AVX2-NEXT: movq 40(%rdi), %r8 -; AVX2-NEXT: orq %r13, %r15 -; AVX2-NEXT: andnq %r8, %r9, %r13 -; AVX2-NEXT: andq %r9, %r8 -; AVX2-NEXT: andnq %r14, %rsi, %r9 -; AVX2-NEXT: andq %rsi, %r14 -; AVX2-NEXT: andnq %rbx, %rax, %rsi -; AVX2-NEXT: andq %rax, %rbx -; AVX2-NEXT: movq (%rdi), %rax -; AVX2-NEXT: orq %r14, %rbx -; AVX2-NEXT: andnq %rax, %rcx, %r14 -; AVX2-NEXT: andq %rcx, %rax -; AVX2-NEXT: orq %rbp, %rax -; AVX2-NEXT: movq 8(%rdi), %rcx -; AVX2-NEXT: orq %r15, %rax -; AVX2-NEXT: andnq %rcx, %rdx, %r15 -; AVX2-NEXT: andq %rdx, %rcx -; AVX2-NEXT: orq %r8, %rcx -; AVX2-NEXT: orq %rbx, %rcx -; AVX2-NEXT: orq %rax, %rcx -; AVX2-NEXT: movq %r11, 48(%rdi) -; AVX2-NEXT: movq %r9, 56(%rdi) -; AVX2-NEXT: movq %r12, 32(%rdi) -; AVX2-NEXT: movq %r13, 40(%rdi) -; AVX2-NEXT: movq %r10, 16(%rdi) -; AVX2-NEXT: movq %rsi, 24(%rdi) -; AVX2-NEXT: movq %r14, (%rdi) -; AVX2-NEXT: movq %r15, 8(%rdi) -; AVX2-NEXT: sete %al -; AVX2-NEXT: addq $8, %rsp -; AVX2-NEXT: popq %rbx -; AVX2-NEXT: popq %r12 -; AVX2-NEXT: popq %r13 -; AVX2-NEXT: popq %r14 -; AVX2-NEXT: popq %r15 -; AVX2-NEXT: popq %rbp -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: reset_eq_i512: -; AVX512: # %bb.0: -; AVX512-NEXT: pushq %rbp -; AVX512-NEXT: pushq %r15 -; AVX512-NEXT: pushq %r14 -; AVX512-NEXT: pushq %r13 -; AVX512-NEXT: pushq %r12 -; AVX512-NEXT: pushq %rbx -; AVX512-NEXT: pushq %rax -; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; 
AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [1,0,0,0] -; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: movl %esi, %ecx -; AVX512-NEXT: andl $63, %ecx -; AVX512-NEXT: shrl $3, %esi -; AVX512-NEXT: andl $56, %esi -; AVX512-NEXT: negl %esi -; AVX512-NEXT: movslq %esi, %rbx -; AVX512-NEXT: movq -48(%rsp,%rbx), %r8 -; AVX512-NEXT: movq -40(%rsp,%rbx), %r14 -; AVX512-NEXT: movq %r14, %rax -; AVX512-NEXT: shldq %cl, %r8, %rax -; AVX512-NEXT: movq -16(%rsp,%rbx), %r10 -; AVX512-NEXT: movq -8(%rsp,%rbx), %rsi -; AVX512-NEXT: shldq %cl, %r10, %rsi -; AVX512-NEXT: movq -32(%rsp,%rbx), %r11 -; AVX512-NEXT: movq -24(%rsp,%rbx), %r15 -; AVX512-NEXT: movq %r15, %r9 -; AVX512-NEXT: shldq %cl, %r11, %r9 -; AVX512-NEXT: movq -56(%rsp,%rbx), %rdx -; AVX512-NEXT: shldq %cl, %rdx, %r8 -; AVX512-NEXT: shldq %cl, %r15, %r10 -; AVX512-NEXT: shldq %cl, %r14, %r11 -; AVX512-NEXT: movq -64(%rsp,%rbx), %rbx -; AVX512-NEXT: shldq %cl, %rbx, %rdx -; AVX512-NEXT: shlxq %rcx, %rbx, %rcx -; AVX512-NEXT: movq 24(%rdi), %rbx -; AVX512-NEXT: movq 56(%rdi), %r14 -; AVX512-NEXT: movq 16(%rdi), %r15 -; AVX512-NEXT: movq 48(%rdi), %r13 -; AVX512-NEXT: movq 32(%rdi), %rbp -; AVX512-NEXT: andnq %rbp, %r11, %r12 -; AVX512-NEXT: andq %r11, %rbp -; AVX512-NEXT: andnq %r13, %r10, %r11 -; AVX512-NEXT: andq %r10, %r13 -; AVX512-NEXT: andnq %r15, %r8, %r10 -; AVX512-NEXT: andq %r8, %r15 -; AVX512-NEXT: movq 40(%rdi), %r8 -; AVX512-NEXT: orq %r13, %r15 -; AVX512-NEXT: andnq %r8, %r9, %r13 -; AVX512-NEXT: andq %r9, %r8 -; AVX512-NEXT: andnq %r14, %rsi, %r9 -; AVX512-NEXT: andq %rsi, %r14 -; AVX512-NEXT: andnq %rbx, %rax, %rsi -; AVX512-NEXT: andq %rax, %rbx -; AVX512-NEXT: movq (%rdi), %rax -; AVX512-NEXT: orq %r14, %rbx -; AVX512-NEXT: andnq %rax, %rcx, %r14 -; AVX512-NEXT: andq %rcx, %rax -; AVX512-NEXT: orq %rbp, %rax -; AVX512-NEXT: movq 8(%rdi), %rcx -; AVX512-NEXT: orq %r15, %rax -; AVX512-NEXT: andnq %rcx, %rdx, %r15 -; AVX512-NEXT: andq %rdx, %rcx -; AVX512-NEXT: orq %r8, %rcx -; AVX512-NEXT: orq %rbx, %rcx -; AVX512-NEXT: orq %rax, %rcx -; AVX512-NEXT: movq %r11, 48(%rdi) -; AVX512-NEXT: movq %r9, 56(%rdi) -; AVX512-NEXT: movq %r12, 32(%rdi) -; AVX512-NEXT: movq %r13, 40(%rdi) -; AVX512-NEXT: movq %r10, 16(%rdi) -; AVX512-NEXT: movq %rsi, 24(%rdi) -; AVX512-NEXT: movq %r14, (%rdi) -; AVX512-NEXT: movq %r15, 8(%rdi) -; AVX512-NEXT: sete %al -; AVX512-NEXT: addq $8, %rsp -; AVX512-NEXT: popq %rbx -; AVX512-NEXT: popq %r12 -; AVX512-NEXT: popq %r13 -; AVX512-NEXT: popq %r14 -; AVX512-NEXT: popq %r15 -; AVX512-NEXT: popq %rbp -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %rem = and i32 %position, 511 - %ofs = zext nneg i32 %rem to i512 - %bit = shl nuw i512 1, %ofs - %mask = xor i512 %bit, -1 - %ld = load i512, ptr %word - %test = and i512 %ld, %bit - %res = and i512 %ld, %mask - %cmp = icmp eq i512 %test, 0 - store i512 %res, ptr %word - ret i1 %cmp -} - -define i1 @set_ne_i512(ptr %word, i32 %position) nounwind { -; X86-LABEL: set_ne_i512: -; X86: # %bb.0: -; X86-NEXT: pushl %ebp -; X86-NEXT: movl %esp, %ebp -; X86-NEXT: pushl %ebx -; X86-NEXT: pushl %edi -; X86-NEXT: pushl %esi -; X86-NEXT: andl $-16, %esp -; X86-NEXT: subl $272, %esp # imm = 0x110 -; X86-NEXT: movl 12(%ebp), %ecx -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: shrl $3, %eax -; X86-NEXT: andl $60, %eax -; X86-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-NEXT: leal {{[0-9]+}}(%esp), %edx -; X86-NEXT: subl %eax, %edx -; X86-NEXT: movl %edx, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
[remainder of the auto-generated X86 CHECK block for set_ne_i512: flattened in extraction; only the epilogue is recoverable]
-; X86-NEXT: setne %al
-; X86-NEXT: leal -12(%ebp), %esp
-; X86-NEXT: popl %esi
-; X86-NEXT: popl %edi
-; X86-NEXT: popl %ebx
-; X86-NEXT: popl %ebp
-; X86-NEXT: retl
-;
-; SSE-LABEL: set_ne_i512:
-; SSE: # %bb.0:
[auto-generated SSE CHECK block]
-;
-; AVX2-LABEL: set_ne_i512:
-; AVX2: # %bb.0:
[auto-generated AVX2 CHECK block]
-;
-; AVX512-LABEL: set_ne_i512:
-; AVX512: # %bb.0:
[auto-generated AVX512 CHECK block]
- %rem = and i32 %position, 511
- %ofs = zext nneg i32 %rem to i512
- %bit = shl nuw i512 1, %ofs
- %ld = load i512, ptr %word
- %test = and i512 %ld, %bit
- %res = or i512 %ld, %bit
- %cmp = icmp ne i512 %test, 0
- store i512 %res, ptr %word
- ret i1 %cmp
-}
-
-define i1 @init_eq_i512(ptr %word, i32 %position, i1 zeroext %value) nounwind {
-; X86-LABEL: init_eq_i512:
-; X86: # %bb.0:
[auto-generated X86 CHECK block]
-;
-; SSE-LABEL: init_eq_i512:
-; SSE: # %bb.0:
[auto-generated SSE CHECK block]
-;
-; AVX2-LABEL: init_eq_i512:
-; AVX2: # %bb.0:
[auto-generated AVX2 CHECK block]
-;
-; AVX512-LABEL: init_eq_i512:
-; AVX512: # %bb.0:
[auto-generated AVX512 CHECK block]
- %rem = and i32 %position, 511
- %ofs = zext nneg i32 %rem to i512
- %bit = shl nuw i512 1, %ofs
- %mask = xor i512 %bit, -1
- %val0 = zext i1 %value to i512
- %val = shl nuw i512 %val0, %ofs
- %ld = load i512, ptr %word
- %test = and i512 %ld, %bit
- %res0 = and i512 %ld, %mask
- %res = or i512 %res0, %val
- %cmp = icmp eq i512 %test, 0
- store i512 %res, ptr %word
- ret i1 %cmp
-}
-
-; i4096
-
-define i1 @test_ne_i4096(ptr %word, i32 %position) nounwind {
-; X86-LABEL: test_ne_i4096:
-; X86: # %bb.0:
-; X86-NEXT: pushl %ebp
-; X86-NEXT: movl %esp, %ebp
-; X86-NEXT: pushl %ebx
-; X86-NEXT: pushl %edi
-; X86-NEXT: pushl %esi
-; X86-NEXT: andl $-16, %esp
-; X86-NEXT: subl $1792, %esp # imm = 0x700
-; X86-NEXT: movl 12(%ebp), %ebx
-; X86-NEXT: movl %ebx, %ecx
-; X86-NEXT: shrl $3, %ecx
-; X86-NEXT: andl $508, %ecx # imm = 0x1FC
-; X86-NEXT: leal {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: subl %ecx, %esi
-; X86-NEXT: andl $31, %ebx
[auto-generated X86 zero-init stores, chunk loads, and shldl chain for test_ne_i4096; the generated CHECK block continues]
movl 408(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 412(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 24(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 28(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 280(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 284(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 232(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 236(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 488(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 492(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 104(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 108(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 360(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 364(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 168(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 172(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 424(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 428(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 40(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 44(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 296(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 300(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 200(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 204(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 456(%esi), 
%edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 460(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 72(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 76(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 328(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 332(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 136(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 140(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 392(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 396(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 8(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 12(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 264(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 268(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 240(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 244(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 496(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 500(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 112(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 116(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 368(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 372(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 176(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 180(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 432(%esi), %edx -; X86-NEXT: 
movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 436(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 48(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 52(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 304(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 308(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 208(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 212(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 464(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 468(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 80(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 84(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 336(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 340(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 144(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 148(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 400(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 404(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 16(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 20(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 272(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 276(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 224(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 228(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 480(%esi), %edx -; X86-NEXT: movl %edx, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 484(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 96(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 100(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 352(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 356(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 160(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 164(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 416(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 420(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 32(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 36(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 288(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 292(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 192(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 196(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %eax, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 448(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 452(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 64(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 68(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 320(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 324(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 128(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 132(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %edi, %edx -; X86-NEXT: movl 256(%esi), %edi -; X86-NEXT: 
movl 260(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edi, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: shldl %cl, %edi, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; 
X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl 388(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl 4(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; 
X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shrdl $1, %eax, %edi -; X86-NEXT: shrl %eax -; X86-NEXT: movl %ebx, %edx -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: notb %cl -; X86-NEXT: shrdl %cl, %eax, %edi -; X86-NEXT: shrl %cl, %ebx -; X86-NEXT: movb $32, %cl -; X86-NEXT: testb %cl, %cl -; X86-NEXT: movl (%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 8(%ebp), %eax -; X86-NEXT: jne .LBB20_2 -; X86-NEXT: # %bb.1: -; X86-NEXT: movl %edi, %ebx -; X86-NEXT: .LBB20_2: -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: shll %cl, %edx -; X86-NEXT: orl %ebx, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 320(%eax), %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: andl 64(%eax), %esi -; X86-NEXT: orl %ecx, %esi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: andl 448(%eax), %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 192(%eax), %ecx -; X86-NEXT: orl %edx, %ecx -; X86-NEXT: orl %esi, %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 
288(%eax), %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: andl 32(%eax), %edx -; X86-NEXT: orl %ecx, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 416(%eax), %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: andl 160(%eax), %edi -; X86-NEXT: orl %ecx, %edi -; X86-NEXT: orl %edx, %edi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 352(%eax), %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: andl 96(%eax), %esi -; X86-NEXT: orl %ecx, %esi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: andl 480(%eax), %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 224(%eax), %ecx -; X86-NEXT: orl %edx, %ecx -; X86-NEXT: orl %esi, %ecx -; X86-NEXT: orl %edi, %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 272(%eax), %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: andl 16(%eax), %edx -; X86-NEXT: orl %ecx, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 400(%eax), %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: andl 144(%eax), %esi -; X86-NEXT: orl %ecx, %esi -; X86-NEXT: orl %edx, %esi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 336(%eax), %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: andl 80(%eax), %edx -; X86-NEXT: orl %ecx, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 464(%eax), %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: andl 208(%eax), %edi -; X86-NEXT: orl %ecx, %edi -; X86-NEXT: orl %edx, %edi -; X86-NEXT: orl %esi, %edi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 304(%eax), %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: andl 48(%eax), %edx -; X86-NEXT: orl %ecx, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 432(%eax), %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: andl 176(%eax), %esi -; X86-NEXT: orl %ecx, %esi -; X86-NEXT: orl %edx, %esi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 368(%eax), %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: andl 112(%eax), %edx -; X86-NEXT: orl %ecx, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 496(%eax), %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: andl 240(%eax), %ebx -; X86-NEXT: orl %ecx, %ebx -; X86-NEXT: orl %edx, %ebx -; X86-NEXT: orl %esi, %ebx -; X86-NEXT: orl %edi, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 264(%eax), %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: andl 8(%eax), %edx -; X86-NEXT: orl %ecx, %edx -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 392(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 136(%ebx), %ecx -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: orl %edx, %ecx -; X86-NEXT: 
movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: andl 328(%ebx), %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 72(%ebx), %eax -; X86-NEXT: orl %edx, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: andl 456(%ebx), %edi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: andl 200(%ebx), %esi -; X86-NEXT: orl %edi, %esi -; X86-NEXT: orl %eax, %esi -; X86-NEXT: orl %ecx, %esi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: andl 296(%ebx), %edi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 40(%ebx), %eax -; X86-NEXT: orl %edi, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: andl 424(%ebx), %edi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: andl 168(%ebx), %edx -; X86-NEXT: orl %edi, %edx -; X86-NEXT: orl %eax, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 360(%ebx), %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 104(%ebx), %eax -; X86-NEXT: orl %ecx, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 488(%ebx), %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: andl 232(%ebx), %edi -; X86-NEXT: orl %ecx, %edi -; X86-NEXT: orl %eax, %edi -; X86-NEXT: orl %edx, %edi -; X86-NEXT: orl %esi, %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 280(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 24(%ebx), %ecx -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 408(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: andl 152(%ebx), %edx -; X86-NEXT: orl %eax, %edx -; X86-NEXT: orl %ecx, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 344(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 88(%ebx), %ecx -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 472(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: andl 216(%ebx), %esi -; X86-NEXT: orl %eax, %esi -; X86-NEXT: orl %ecx, %esi -; X86-NEXT: orl %edx, %esi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 312(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 56(%ebx), %ecx -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 440(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: andl 184(%ebx), %edx -; X86-NEXT: orl %eax, %edx -; X86-NEXT: orl %ecx, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 376(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 120(%ebx), %ecx -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 504(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: andl 248(%ebx), %edi -; X86-NEXT: orl %eax, %edi -; X86-NEXT: orl %ecx, %edi -; X86-NEXT: orl %edx, %edi -; 
X86-NEXT: orl %esi, %edi -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 324(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 68(%ebx), %ecx -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 452(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: andl 196(%ebx), %edx -; X86-NEXT: orl %eax, %edx -; X86-NEXT: orl %ecx, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 292(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 36(%ebx), %ecx -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 420(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: andl 164(%ebx), %edx -; X86-NEXT: orl %eax, %edx -; X86-NEXT: orl %ecx, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 356(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 100(%ebx), %ecx -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 484(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: andl 228(%ebx), %esi -; X86-NEXT: orl %eax, %esi -; X86-NEXT: orl %ecx, %esi -; X86-NEXT: orl %edx, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 276(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 20(%ebx), %ecx -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 404(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: andl 148(%ebx), %edx -; X86-NEXT: orl %eax, %edx -; X86-NEXT: orl %ecx, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 340(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 84(%ebx), %ecx -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 468(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: andl 212(%ebx), %esi -; X86-NEXT: orl %eax, %esi -; X86-NEXT: orl %ecx, %esi -; X86-NEXT: orl %edx, %esi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 308(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 52(%ebx), %ecx -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 436(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: andl 180(%ebx), %edx -; X86-NEXT: orl %eax, %edx -; X86-NEXT: orl %ecx, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 372(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 116(%ebx), %ecx -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 500(%ebx), %eax -; 
X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: andl 244(%ebx), %edi -; X86-NEXT: orl %eax, %edi -; X86-NEXT: orl %ecx, %edi -; X86-NEXT: orl %edx, %edi -; X86-NEXT: orl %esi, %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 268(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 12(%ebx), %ecx -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 396(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: andl 140(%ebx), %edx -; X86-NEXT: orl %eax, %edx -; X86-NEXT: orl %ecx, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 332(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 76(%ebx), %ecx -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 460(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: andl 204(%ebx), %edi -; X86-NEXT: orl %eax, %edi -; X86-NEXT: orl %ecx, %edi -; X86-NEXT: orl %edx, %edi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 300(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 44(%ebx), %ecx -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 428(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: andl 172(%ebx), %edx -; X86-NEXT: orl %eax, %edx -; X86-NEXT: orl %ecx, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 364(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 108(%ebx), %ecx -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 492(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: andl 236(%ebx), %esi -; X86-NEXT: orl %eax, %esi -; X86-NEXT: orl %ecx, %esi -; X86-NEXT: orl %edx, %esi -; X86-NEXT: orl %edi, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 284(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 28(%ebx), %ecx -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 412(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: andl 156(%ebx), %edi -; X86-NEXT: orl %eax, %edi -; X86-NEXT: orl %ecx, %edi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 348(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: andl 92(%ebx), %edx -; X86-NEXT: orl %eax, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 476(%ebx), %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 220(%ebx), %eax -; X86-NEXT: orl %ecx, %eax -; X86-NEXT: orl %edx, %eax -; X86-NEXT: orl %edi, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 316(%ebx), %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: andl 60(%ebx), %edx -; X86-NEXT: orl %ecx, %edx -; 
X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 444(%ebx), %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: andl 188(%ebx), %edi -; X86-NEXT: orl %ecx, %edi -; X86-NEXT: orl %edx, %edi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 380(%ebx), %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: andl 124(%ebx), %edx -; X86-NEXT: orl %ecx, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 508(%ebx), %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: movl 8(%ebp), %esi -; X86-NEXT: andl 252(%esi), %ebx -; X86-NEXT: orl %ecx, %ebx -; X86-NEXT: orl %edx, %ebx -; X86-NEXT: orl %edi, %ebx -; X86-NEXT: orl %eax, %ebx -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: negl %ecx -; X86-NEXT: movl 1648(%esp,%ecx), %edi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: shldl %cl, %edi, %esi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: shldl %cl, %edx, %edi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl 8(%ebp), %edx -; X86-NEXT: andl 128(%edx), %ecx -; X86-NEXT: andl 384(%edx), %edi -; X86-NEXT: orl %ecx, %edi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NEXT: shll %cl, %eax -; X86-NEXT: andl (%edx), %eax -; X86-NEXT: orl %eax, %edi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 256(%edx), %eax -; X86-NEXT: orl %eax, %edi -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 260(%edx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 4(%edx), %ecx -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 132(%edx), %eax -; X86-NEXT: andl 388(%edx), %esi -; X86-NEXT: orl %eax, %esi -; X86-NEXT: orl %ecx, %esi -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT: orl %ebx, %esi -; X86-NEXT: orl %edi, %esi -; X86-NEXT: setne %al -; X86-NEXT: leal -12(%ebp), %esp -; X86-NEXT: popl %esi -; X86-NEXT: popl %edi -; X86-NEXT: popl %ebx -; X86-NEXT: popl %ebp -; X86-NEXT: retl -; -; SSE-LABEL: test_ne_i4096: -; SSE: # %bb.0: -; SSE-NEXT: pushq %rbp -; SSE-NEXT: pushq %r15 -; SSE-NEXT: pushq %r14 -; SSE-NEXT: pushq %r13 -; SSE-NEXT: pushq %r12 -; SSE-NEXT: pushq %rbx -; SSE-NEXT: subq $1576, %rsp # imm = 0x628 -; SSE-NEXT: movl %esi, %ecx -; SSE-NEXT: movl %esi, %eax -; SSE-NEXT: andl $4032, %eax # imm = 0xFC0 -; SSE-NEXT: xorps %xmm0, %xmm0 -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; 
SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movq $0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movq $1, {{[0-9]+}}(%rsp) -; SSE-NEXT: andl $63, %ecx -; SSE-NEXT: shrl $3, %eax -; SSE-NEXT: negl %eax -; SSE-NEXT: movslq %eax, %rsi -; SSE-NEXT: movq 1296(%rsp,%rsi), %rdx -; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1304(%rsp,%rsi), %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: shldq %cl, %rdx, %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1552(%rsp,%rsi), %rdx -; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1560(%rsp,%rsi), %rax -; SSE-NEXT: shldq %cl, %rdx, %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1168(%rsp,%rsi), %rdx -; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 
8-byte Spill -; SSE-NEXT: movq 1176(%rsp,%rsi), %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: shldq %cl, %rdx, %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1424(%rsp,%rsi), %rdx -; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1432(%rsp,%rsi), %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: shldq %cl, %rdx, %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1232(%rsp,%rsi), %rdx -; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1240(%rsp,%rsi), %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: shldq %cl, %rdx, %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1488(%rsp,%rsi), %rdx -; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1496(%rsp,%rsi), %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: shldq %cl, %rdx, %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1104(%rsp,%rsi), %rdx -; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1112(%rsp,%rsi), %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: shldq %cl, %rdx, %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1360(%rsp,%rsi), %rdx -; SSE-NEXT: movq %rdx, (%rsp) # 8-byte Spill -; SSE-NEXT: movq 1368(%rsp,%rsi), %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: shldq %cl, %rdx, %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1264(%rsp,%rsi), %rdx -; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1272(%rsp,%rsi), %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: shldq %cl, %rdx, %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1520(%rsp,%rsi), %rdx -; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1528(%rsp,%rsi), %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: shldq %cl, %rdx, %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1136(%rsp,%rsi), %rdx -; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1144(%rsp,%rsi), %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: shldq %cl, %rdx, %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1392(%rsp,%rsi), %rdx -; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1400(%rsp,%rsi), %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: shldq %cl, %rdx, %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1200(%rsp,%rsi), %rdx -; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1208(%rsp,%rsi), %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: shldq %cl, %rdx, %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1456(%rsp,%rsi), %rdx -; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1464(%rsp,%rsi), %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: shldq %cl, %rdx, %rax -; SSE-NEXT: movq %rax, 
{{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1072(%rsp,%rsi), %rdx
-; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1080(%rsp,%rsi), %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: shldq %cl, %rdx, %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1328(%rsp,%rsi), %rdx
-; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1336(%rsp,%rsi), %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: shldq %cl, %rdx, %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1280(%rsp,%rsi), %rdx
-; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1288(%rsp,%rsi), %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: shldq %cl, %rdx, %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1536(%rsp,%rsi), %rdx
-; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1544(%rsp,%rsi), %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: shldq %cl, %rdx, %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1152(%rsp,%rsi), %rdx
-; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1160(%rsp,%rsi), %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: shldq %cl, %rdx, %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1408(%rsp,%rsi), %rdx
-; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1416(%rsp,%rsi), %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: shldq %cl, %rdx, %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1216(%rsp,%rsi), %r11
-; SSE-NEXT: movq 1224(%rsp,%rsi), %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: shldq %cl, %r11, %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1472(%rsp,%rsi), %rdx
-; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1480(%rsp,%rsi), %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: shldq %cl, %rdx, %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1088(%rsp,%rsi), %r9
-; SSE-NEXT: movq 1096(%rsp,%rsi), %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: shldq %cl, %r9, %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1344(%rsp,%rsi), %rdx
-; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1352(%rsp,%rsi), %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: shldq %cl, %rdx, %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1248(%rsp,%rsi), %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1256(%rsp,%rsi), %rdx
-; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: shldq %cl, %rax, %rdx
-; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1504(%rsp,%rsi), %rdx
-; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1512(%rsp,%rsi), %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: shldq %cl, %rdx, %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1120(%rsp,%rsi), %rax
-; SSE-NEXT: movq 1128(%rsp,%rsi), %r8
-; SSE-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: shldq %cl, %rax, %r8
-; SSE-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1376(%rsp,%rsi), %r13
-; SSE-NEXT: movq 1384(%rsp,%rsi), %rbx
-; SSE-NEXT: movq %rbx, %r8
-; SSE-NEXT: shldq %cl, %r13, %r8
-; SSE-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1184(%rsp,%rsi), %rdx
-; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1192(%rsp,%rsi), %r15
-; SSE-NEXT: movq %r15, %r14
-; SSE-NEXT: shldq %cl, %rdx, %r14
-; SSE-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1440(%rsp,%rsi), %r10
-; SSE-NEXT: movq 1448(%rsp,%rsi), %rdx
-; SSE-NEXT: movq %rdx, %r14
-; SSE-NEXT: shldq %cl, %r10, %r14
-; SSE-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1312(%rsp,%rsi), %r14
-; SSE-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1320(%rsp,%rsi), %rbp
-; SSE-NEXT: movq %rbp, %r12
-; SSE-NEXT: shldq %cl, %r14, %r12
-; SSE-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE-NEXT: shldq %cl, %r8, (%rsp) # 8-byte Folded Spill
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; SSE-NEXT: shldq %cl, %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; SSE-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; SSE-NEXT: shldq %cl, %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; SSE-NEXT: movq 1064(%rsp,%rsi), %rbx
-; SSE-NEXT: shldq %cl, %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; SSE-NEXT: shldq %cl, %rbp, %r14
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; SSE-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; SSE-NEXT: shldq %cl, %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; SSE-NEXT: shldq %cl, %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; SSE-NEXT: shldq %cl, %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; SSE-NEXT: shldq %cl, %rdx, %r11
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; SSE-NEXT: shldq %cl, %r15, %rdx
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; SSE-NEXT: shldq %cl, %r15, %r9
-; SSE-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; SSE-NEXT: shldq %cl, %r15, %r8
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; SSE-NEXT: shldq %cl, %r15, %rbp
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
-; SSE-NEXT: shldq %cl, %r15, %r9
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; SSE-NEXT: shldq %cl, %r15, %rax
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; SSE-NEXT: shldq %cl, %r15, %r13
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
-; SSE-NEXT: shldq %cl, %r12, %r15
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
-; SSE-NEXT: shldq %cl, %r12, %r10
-; SSE-NEXT: andq 384(%rdi), %r10
-; SSE-NEXT: andq 128(%rdi), %r15
-; SSE-NEXT: andq 320(%rdi), %r13
-; SSE-NEXT: andq 64(%rdi), %rax
-; SSE-NEXT: orq %r10, %r15
-; SSE-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: orq %r13, %rax
-; SSE-NEXT: andq 448(%rdi), %r9
-; SSE-NEXT: andq 192(%rdi), %rbp
-; SSE-NEXT: orq %r9, %rbp
-; SSE-NEXT: orq %rax, %rbp
-; SSE-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: andq 288(%rdi), %r8
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
-; SSE-NEXT: andq 32(%rdi), %r9
-; SSE-NEXT: andq 416(%rdi), %rdx
-; SSE-NEXT: andq 160(%rdi), %r11
-; SSE-NEXT: orq %r8, %r9
-; SSE-NEXT: orq %rdx, %r11
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; SSE-NEXT: andq 352(%rdi), %rdx
-; SSE-NEXT: orq %r9, %r11
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; SSE-NEXT: andq 96(%rdi), %rax
-; SSE-NEXT: orq %rdx, %rax
-; SSE-NEXT: movq %rax, %rdx
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; SSE-NEXT: andq 480(%rdi), %rax
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE-NEXT: andq 224(%rdi), %r8
-; SSE-NEXT: orq %rax, %r8
-; SSE-NEXT: orq %rdx, %r8
-; SSE-NEXT: andq 272(%rdi), %r14
-; SSE-NEXT: orq %r11, %r8
-; SSE-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; SSE-NEXT: andq 16(%rdi), %rax
-; SSE-NEXT: orq %r14, %rax
-; SSE-NEXT: movq %rax, %r8
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; SSE-NEXT: andq 400(%rdi), %rdx
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; SSE-NEXT: andq 144(%rdi), %rax
-; SSE-NEXT: orq %rdx, %rax
-; SSE-NEXT: orq %r8, %rax
-; SSE-NEXT: movq %rax, %r8
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
-; SSE-NEXT: andq 336(%rdi), %r9
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; SSE-NEXT: andq 80(%rdi), %rax
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; SSE-NEXT: andq 464(%rdi), %rdx
-; SSE-NEXT: orq %r9, %rax
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
-; SSE-NEXT: andq 208(%rdi), %r11
-; SSE-NEXT: orq %rdx, %r11
-; SSE-NEXT: orq %rax, %r11
-; SSE-NEXT: orq %r8, %r11
-; SSE-NEXT: movq (%rsp), %rdx # 8-byte Reload
-; SSE-NEXT: andq 304(%rdi), %rdx
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; SSE-NEXT: andq 48(%rdi), %rax
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
-; SSE-NEXT: andq 432(%rdi), %r9
-; SSE-NEXT: orq %rdx, %rax
-; SSE-NEXT: movq %rax, %r10
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE-NEXT: andq 176(%rdi), %r8
-; SSE-NEXT: orq %r9, %r8
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
-; SSE-NEXT: andq 368(%rdi), %r9
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; SSE-NEXT: andq 112(%rdi), %rax
-; SSE-NEXT: orq %r10, %r8
-; SSE-NEXT: movq %r8, %r10
-; SSE-NEXT: orq %r9, %rax
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE-NEXT: andq 496(%rdi), %r8
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
-; SSE-NEXT: andq 240(%rdi), %rbp
-; SSE-NEXT: orq %r8, %rbp
-; SSE-NEXT: orq %rax, %rbp
-; SSE-NEXT: orq %r10, %rbp
-; SSE-NEXT: orq %r11, %rbp
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; SSE-NEXT: andq 392(%rdi), %rax
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
-; SSE-NEXT: andq 136(%rdi), %r12
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; SSE-NEXT: andq 328(%rdi), %rdx
-; SSE-NEXT: orq %rax, %r12
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; SSE-NEXT: andq 72(%rdi), %rax
-; SSE-NEXT: orq %rdx, %rax
-; SSE-NEXT: movq %rax, %rdx
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; SSE-NEXT: andq 456(%rdi), %rax
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
-; SSE-NEXT: andq 200(%rdi), %r13
-; SSE-NEXT: orq %rax, %r13
-; SSE-NEXT: orq %rdx, %r13
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; SSE-NEXT: andq 296(%rdi), %rdx
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; SSE-NEXT: andq 40(%rdi), %rax
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE-NEXT: andq 424(%rdi), %r8
-; SSE-NEXT: orq %rdx, %rax
-; SSE-NEXT: movq %rax, %r9
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; SSE-NEXT: andq 168(%rdi), %rdx
-; SSE-NEXT: orq %r8, %rdx
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE-NEXT: andq 360(%rdi), %r8
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; SSE-NEXT: andq 104(%rdi), %rax
-; SSE-NEXT: orq %r9, %rdx
-; SSE-NEXT: orq %r8, %rax
-; SSE-NEXT: movq %rax, %r8
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; SSE-NEXT: andq 488(%rdi), %rax
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; SSE-NEXT: andq 232(%rdi), %r15
-; SSE-NEXT: orq %rax, %r15
-; SSE-NEXT: orq %r8, %r15
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE-NEXT: andq 280(%rdi), %r8
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; SSE-NEXT: andq 24(%rdi), %rax
-; SSE-NEXT: orq %rdx, %r15
-; SSE-NEXT: orq %r8, %rax
-; SSE-NEXT: movq %rax, %r10
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE-NEXT: andq 408(%rdi), %r8
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; SSE-NEXT: andq 152(%rdi), %rax
-; SSE-NEXT: orq %r8, %rax
-; SSE-NEXT: orq %r10, %rax
-; SSE-NEXT: movq %rax, %r10
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
-; SSE-NEXT: andq 344(%rdi), %r11
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE-NEXT: andq 88(%rdi), %r8
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; SSE-NEXT: andq 472(%rdi), %rax
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; SSE-NEXT: andq 216(%rdi), %r14
-; SSE-NEXT: orq %r11, %r8
-; SSE-NEXT: orq %rax, %r14
-; SSE-NEXT: orq %r8, %r14
-; SSE-NEXT: orq %r10, %r14
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
-; SSE-NEXT: andq 312(%rdi), %r11
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; SSE-NEXT: andq 56(%rdi), %r10
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE-NEXT: andq 440(%rdi), %r8
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
-; SSE-NEXT: andq 184(%rdi), %r9
-; SSE-NEXT: orq %r11, %r10
-; SSE-NEXT: orq %r8, %r9
-; SSE-NEXT: orq %r10, %r9
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; SSE-NEXT: shldq %cl, %rax, %rdx
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; SSE-NEXT: andq 376(%rdi), %r10
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; SSE-NEXT: andq 120(%rdi), %rax
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
-; SSE-NEXT: andq 504(%rdi), %r11
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE-NEXT: andq 248(%rdi), %r8
-; SSE-NEXT: orq %r10, %rax
-; SSE-NEXT: movq %rax, %r10
-; SSE-NEXT: orq %r11, %r8
-; SSE-NEXT: movq 1056(%rsp,%rsi), %rax
-; SSE-NEXT: shldq %cl, %rax, %rbx
-; SSE-NEXT: # kill: def $cl killed $cl killed $ecx
-; SSE-NEXT: shlq %cl, %rax
-; SSE-NEXT: orq %r10, %r8
-; SSE-NEXT: orq %r9, %r8
-; SSE-NEXT: andq 256(%rdi), %rdx
-; SSE-NEXT: orq %r14, %r8
-; SSE-NEXT: andq (%rdi), %rax
-; SSE-NEXT: orq %rdx, %rax
-; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
-; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
-; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
-; SSE-NEXT: orq %rbp, %rax
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; SSE-NEXT: andq 264(%rdi), %rcx
-; SSE-NEXT: andq 8(%rdi), %rbx
-; SSE-NEXT: orq %rcx, %rbx
-; SSE-NEXT: orq %r12, %rbx
-; SSE-NEXT: orq %r13, %rbx
-; SSE-NEXT: orq %r15, %rbx
-; SSE-NEXT: orq %r8, %rbx
-; SSE-NEXT: orq %rax, %rbx
-; SSE-NEXT: setne %al
-; SSE-NEXT: addq $1576, %rsp # imm = 0x628
-; SSE-NEXT: popq %rbx
-; SSE-NEXT: popq %r12
-; SSE-NEXT: popq %r13
-; SSE-NEXT: popq %r14
-; SSE-NEXT: popq %r15
-; SSE-NEXT: popq %rbp
-; SSE-NEXT: retq
-;
-; AVX2-LABEL: test_ne_i4096:
-; AVX2: # %bb.0:
-; AVX2-NEXT: pushq %rbp
-; AVX2-NEXT: pushq %r15
-; AVX2-NEXT: pushq %r14
-; AVX2-NEXT: pushq %r13
-; AVX2-NEXT: pushq %r12
-; AVX2-NEXT: pushq %rbx
-; AVX2-NEXT: subq $1560, %rsp # imm = 0x618
-; AVX2-NEXT: movl %esi, %ecx
-; AVX2-NEXT: movl %esi, %eax
-; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
 ; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
 ; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: andl $4032, %eax # imm = 0xFC0
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovss {{.*#+}} xmm0 = [1,0,0,0]
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: andl $63, %ecx
-; AVX2-NEXT: shrl $3, %eax
-; AVX2-NEXT: negl %eax
-; AVX2-NEXT: movslq %eax, %rsi
-; AVX2-NEXT: movq 1280(%rsp,%rsi), %rdx
-; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1288(%rsp,%rsi), %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: shldq %cl, %rdx, %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1536(%rsp,%rsi), %rdx
-; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1544(%rsp,%rsi), %rax
-; AVX2-NEXT: shldq %cl, %rdx, %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1152(%rsp,%rsi), %rdx
-; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1160(%rsp,%rsi), %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: shldq %cl, %rdx, %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1408(%rsp,%rsi), %rdx
-; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1416(%rsp,%rsi), %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: shldq %cl, %rdx, %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1216(%rsp,%rsi), %rdx
-; AVX2-NEXT: movq %rdx, (%rsp) # 8-byte Spill
-; AVX2-NEXT: movq 1224(%rsp,%rsi), %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: shldq %cl, %rdx, %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1472(%rsp,%rsi), %rdx
-; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1480(%rsp,%rsi), %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: shldq %cl, %rdx, %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1088(%rsp,%rsi), %rdx
-; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1096(%rsp,%rsi), %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: shldq %cl, %rdx, %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1344(%rsp,%rsi), %rdx
-; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1352(%rsp,%rsi), %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: shldq %cl, %rdx, %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1248(%rsp,%rsi), %rdx
-; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1256(%rsp,%rsi), %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: shldq %cl, %rdx, %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1504(%rsp,%rsi), %rdx
-; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1512(%rsp,%rsi), %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: shldq %cl, %rdx, %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1120(%rsp,%rsi), %rdx
-; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1128(%rsp,%rsi), %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: shldq %cl, %rdx, %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1376(%rsp,%rsi), %rdx
-; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1384(%rsp,%rsi), %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: shldq %cl, %rdx, %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1184(%rsp,%rsi), %rdx
-; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1192(%rsp,%rsi), %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: shldq %cl, %rdx, %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1440(%rsp,%rsi), %rdx
-; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1448(%rsp,%rsi), %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: shldq %cl, %rdx, %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1056(%rsp,%rsi), %rdx
-; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1064(%rsp,%rsi), %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: shldq %cl, %rdx, %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1312(%rsp,%rsi), %rdx
-; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1320(%rsp,%rsi), %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: shldq %cl, %rdx, %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1264(%rsp,%rsi), %rdx
-; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1272(%rsp,%rsi), %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: shldq %cl, %rdx, %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1520(%rsp,%rsi), %rdx
-; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1528(%rsp,%rsi), %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: shldq %cl, %rdx, %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1136(%rsp,%rsi), %rdx
-; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1144(%rsp,%rsi), %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: shldq %cl, %rdx, %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1392(%rsp,%rsi), %rdx
-; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1400(%rsp,%rsi), %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: shldq %cl, %rdx, %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1200(%rsp,%rsi), %r11
-; AVX2-NEXT: movq 1208(%rsp,%rsi), %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: shldq %cl, %r11, %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1456(%rsp,%rsi), %rdx
-; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1464(%rsp,%rsi), %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: shldq %cl, %rdx, %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1072(%rsp,%rsi), %r12
-; AVX2-NEXT: movq 1080(%rsp,%rsi), %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: shldq %cl, %r12, %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1328(%rsp,%rsi), %rdx
-; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1336(%rsp,%rsi), %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: shldq %cl, %rdx, %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1232(%rsp,%rsi), %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1240(%rsp,%rsi), %rdx
-; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: shldq %cl, %rax, %rdx
-; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1488(%rsp,%rsi), %rbp
-; AVX2-NEXT: movq 1496(%rsp,%rsi), %rax
+; AVX2-NEXT: vmovss {{.*#+}} xmm1 = [1,0,0,0]
+; AVX2-NEXT: vmovups %ymm1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; AVX2-NEXT: movl %esi, %ecx
+; AVX2-NEXT: andl $63, %ecx
+; AVX2-NEXT: movl %esi, %ebx
+; AVX2-NEXT: shrl $3, %ebx
+; AVX2-NEXT: movl %ebx, %eax
+; AVX2-NEXT: andl $56, %eax
+; AVX2-NEXT: negl %eax
+; AVX2-NEXT: movslq %eax, %r11
+; AVX2-NEXT: movq 128(%rsp,%r11), %r15
+; AVX2-NEXT: movq 136(%rsp,%r11), %rax
+; AVX2-NEXT: movq %rax, %rsi
+; AVX2-NEXT: shldq %cl, %r15, %rsi
+; AVX2-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 120(%rsp,%r11), %r8
+; AVX2-NEXT: shldq %cl, %r8, %r15
+; AVX2-NEXT: movq 144(%rsp,%r11), %r14
+; AVX2-NEXT: movq 152(%rsp,%r11), %rsi
+; AVX2-NEXT: movq %rsi, %r9
+; AVX2-NEXT: shldq %cl, %r14, %r9
+; AVX2-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: shldq %cl, %rax, %r14
+; AVX2-NEXT: movq 112(%rsp,%r11), %rax
 ; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 160(%rsp,%r11), %r13
+; AVX2-NEXT: movq 168(%rsp,%r11), %r12
+; AVX2-NEXT: shldq %cl, %r13, %r12
+; AVX2-NEXT: shldq %cl, %rsi, %r13
+; AVX2-NEXT: shldq %cl, %rax, %r8
+; AVX2-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movl %edx, %eax
+; AVX2-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vmovups %xmm1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq $0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq 24(%rsp,%r11), %rbp
+; AVX2-NEXT: movq 32(%rsp,%r11), %rdx
+; AVX2-NEXT: movq %rdx, %rax
 ; AVX2-NEXT: shldq %cl, %rbp, %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1104(%rsp,%rsi), %rax
-; AVX2-NEXT: movq 1112(%rsp,%rsi), %rdx
-; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: shldq %cl, %rax, %rdx
-; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1360(%rsp,%rsi), %r10
-; AVX2-NEXT: movq 1368(%rsp,%rsi), %r8
-; AVX2-NEXT: movq %r8, %rdx
-; AVX2-NEXT: shldq %cl, %r10, %rdx
-; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1168(%rsp,%rsi), %r9
-; AVX2-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1176(%rsp,%rsi), %rbx
-; AVX2-NEXT: movq %rbx, %rdx
-; AVX2-NEXT: shldq %cl, %r9, %rdx
-; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1424(%rsp,%rsi), %r9
-; AVX2-NEXT: movq 1432(%rsp,%rsi), %rdx
-; AVX2-NEXT: movq %rdx, %r14
-; AVX2-NEXT: shldq %cl, %r9, %r14
-; AVX2-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1296(%rsp,%rsi), %r15
-; AVX2-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1304(%rsp,%rsi), %r14
-; AVX2-NEXT: movq %r14, %r13
-; AVX2-NEXT: shldq %cl, %r15, %r13
-; AVX2-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; AVX2-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; AVX2-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; AVX2-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; AVX2-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; AVX2-NEXT: shldq %cl, %r15, (%rsp) # 8-byte Folded Spill
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; AVX2-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; AVX2-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; AVX2-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; AVX2-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; AVX2-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; AVX2-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX2-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX2-NEXT: shldq %cl, %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX2-NEXT: shldq %cl, %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX2-NEXT: movq 1048(%rsp,%rsi), %rdx
-; AVX2-NEXT: shldq %cl, %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX2-NEXT: shldq %cl, %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
-; AVX2-NEXT: shldq %cl, %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
-; AVX2-NEXT: shldq %cl, %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
-; AVX2-NEXT: shldq %cl, %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; AVX2-NEXT: shldq %cl, %r14, %rbx
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; AVX2-NEXT: shldq %cl, %r14, %r11
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; AVX2-NEXT: shldq %cl, %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; AVX2-NEXT: shldq %cl, %r14, %r12
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; AVX2-NEXT: shldq %cl, %r14, %r8
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; AVX2-NEXT: shldq %cl, %r14, %r13
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; AVX2-NEXT: shldq %cl, %r14, %rbp
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; AVX2-NEXT: shldq %cl, %r14, %rax
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; AVX2-NEXT: shldq %cl, %r14, %r10
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; AVX2-NEXT: shldq %cl, %r15, %r14
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; AVX2-NEXT: shldq %cl, %r15, %r9
-; AVX2-NEXT: andq 384(%rdi), %r9
-; AVX2-NEXT: andq 128(%rdi), %r14
-; AVX2-NEXT: andq 320(%rdi), %r10
-; AVX2-NEXT: orq %r9, %r14
-; AVX2-NEXT: movq %r14, %r15
-; AVX2-NEXT: andq 64(%rdi), %rax
-; AVX2-NEXT: orq %r10, %rax
-; AVX2-NEXT: andq 448(%rdi), %rbp
-; AVX2-NEXT: andq 192(%rdi), %r13
-; AVX2-NEXT: orq %rbp, %r13
-; AVX2-NEXT: orq %rax, %r13
-; AVX2-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: andq 288(%rdi), %r8
-; AVX2-NEXT: andq 32(%rdi), %r12
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX2-NEXT: andq 416(%rdi), %rax
-; AVX2-NEXT: orq %r8, %r12
-; AVX2-NEXT: andq 160(%rdi), %r11
-; AVX2-NEXT: orq %rax, %r11
-; AVX2-NEXT: andq 352(%rdi), %rbx
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX2-NEXT: andq 96(%rdi), %rax
-; AVX2-NEXT: orq %r12, %r11
-; AVX2-NEXT: orq %rbx, %rax
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; AVX2-NEXT: andq 480(%rdi), %r10
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
-; AVX2-NEXT: andq 224(%rdi), %r13
-; AVX2-NEXT: orq %r10, %r13
+; AVX2-NEXT: movq 40(%rsp,%r11), %r10
+; AVX2-NEXT: shldq %cl, %rdx, %r10
+; AVX2-NEXT: movq 8(%rsp,%r11), %r9
+; AVX2-NEXT: movq 16(%rsp,%r11), %rdx
+; AVX2-NEXT: movq %rdx, %r8
+; AVX2-NEXT: shldq %cl, %r9, %r8
+; AVX2-NEXT: shldq %cl, %rdx, %rbp
+; AVX2-NEXT: andnq 48(%rdi), %r13, %r13
 ; AVX2-NEXT: orq %rax, %r13
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; AVX2-NEXT: andq 272(%rdi), %r8
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX2-NEXT: andq 16(%rdi), %rax
-; AVX2-NEXT: orq %r11, %r13
-; AVX2-NEXT: orq %r8, %rax
-; AVX2-NEXT: movq %rax, %r8
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
-; AVX2-NEXT: andq 400(%rdi), %r9
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX2-NEXT: andq 144(%rdi), %rax
-; AVX2-NEXT: orq %r9, %rax
-; AVX2-NEXT: orq %r8, %rax
-; AVX2-NEXT: movq %rax, %r9
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; AVX2-NEXT: andq 336(%rdi), %r10
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX2-NEXT: andq 80(%rdi), %rax
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; AVX2-NEXT: andq 464(%rdi), %r8
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
-; AVX2-NEXT: andq 208(%rdi), %r11
-; AVX2-NEXT: orq %r10, %rax
-; AVX2-NEXT: orq %r8, %r11
-; AVX2-NEXT: orq %rax, %r11
-; AVX2-NEXT: orq %r9, %r11
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
-; AVX2-NEXT: andq 304(%rdi), %r9
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; AVX2-NEXT: andq 48(%rdi), %r8
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; AVX2-NEXT: andq 432(%rdi), %r10
-; AVX2-NEXT: movq (%rsp), %rax # 8-byte Reload
-; AVX2-NEXT: andq 176(%rdi), %rax
-; AVX2-NEXT: orq %r9, %r8
-; AVX2-NEXT: movq %r8, %r9
-; AVX2-NEXT: orq %r10, %rax
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; AVX2-NEXT: andq 368(%rdi), %r8
-; AVX2-NEXT: orq %r9, %rax
-; AVX2-NEXT: movq %rax, %r10
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX2-NEXT: andq 112(%rdi), %rax
-; AVX2-NEXT: orq %r8, %rax
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; AVX2-NEXT: andq 496(%rdi), %r8
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
-; AVX2-NEXT: andq 240(%rdi), %r9
-; AVX2-NEXT: orq %r8, %r9
-; AVX2-NEXT: orq %rax, %r9
-; AVX2-NEXT: orq %r10, %r9
-; AVX2-NEXT: orq %r11, %r9
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; AVX2-NEXT: andq 392(%rdi), %r10
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
-; AVX2-NEXT: andq 136(%rdi), %rbp
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; AVX2-NEXT: andq 328(%rdi), %r8
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX2-NEXT: andq 72(%rdi), %rax
-; AVX2-NEXT: orq %r10, %rbp
-; AVX2-NEXT: orq %r8, %rax
-; AVX2-NEXT: movq %rax, %r8
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX2-NEXT: andq 456(%rdi), %rax
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
-; AVX2-NEXT: andq 200(%rdi), %r12
-; AVX2-NEXT: orq %rax, %r12
-; AVX2-NEXT: orq %r8, %r12
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; AVX2-NEXT: andq 296(%rdi), %r10
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; AVX2-NEXT: andq 40(%rdi), %r8
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
-; AVX2-NEXT: andq 424(%rdi), %r11
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX2-NEXT: andq 168(%rdi), %rax
-; AVX2-NEXT: orq %r10, %r8
-; AVX2-NEXT: movq %r8, %r10
-; AVX2-NEXT: orq %r11, %rax
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; AVX2-NEXT: andq 360(%rdi), %r8
-; AVX2-NEXT: orq %r10, %rax
-; AVX2-NEXT: movq %rax, %r10
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX2-NEXT: andq 104(%rdi), %rax
-; AVX2-NEXT: orq %r8, %rax
-; AVX2-NEXT: movq %rax, %r8
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX2-NEXT: andq 488(%rdi), %rax
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; AVX2-NEXT: andq 232(%rdi), %r14
-; AVX2-NEXT: orq %rax, %r14
+; AVX2-NEXT: movq -8(%rsp,%r11), %rax
+; AVX2-NEXT: movq (%rsp,%r11), %rdx
+; AVX2-NEXT: movq %rdx, %rsi
+; AVX2-NEXT: shldq %cl, %rax, %rsi
+; AVX2-NEXT: shldq %cl, %rdx, %r9
+; AVX2-NEXT: andnq 56(%rdi), %r12, %r12
+; AVX2-NEXT: andnq 32(%rdi), %r14, %r14
+; AVX2-NEXT: orq %r10, %r12
 ; AVX2-NEXT: orq %r8, %r14
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; AVX2-NEXT: andq 280(%rdi), %r8
-; AVX2-NEXT: orq %r10, %r14
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX2-NEXT: andq 24(%rdi), %rax
-; AVX2-NEXT: orq %r8, %rax
-; AVX2-NEXT: movq %rax, %r10
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; AVX2-NEXT: andq 408(%rdi), %r8
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX2-NEXT: andq 152(%rdi), %rax
-; AVX2-NEXT: orq %r8, %rax
-; AVX2-NEXT: orq %r10, %rax
-; AVX2-NEXT: movq %rax, %r10
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
-; AVX2-NEXT: andq 344(%rdi), %r11
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; AVX2-NEXT: andq 88(%rdi), %r8
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX2-NEXT: andq 472(%rdi), %rax
-; AVX2-NEXT: orq %r11, %r8
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
-; AVX2-NEXT: andq 216(%rdi), %rbx
-; AVX2-NEXT: orq %rax, %rbx
-; AVX2-NEXT: orq %r8, %rbx
-; AVX2-NEXT: orq %r10, %rbx
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; AVX2-NEXT: andq 312(%rdi), %r8
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX2-NEXT: andq 56(%rdi), %rax
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; AVX2-NEXT: andq 440(%rdi), %r10
-; AVX2-NEXT: orq %r8, %rax
-; AVX2-NEXT: movq %rax, %r11
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; AVX2-NEXT: andq 184(%rdi), %r8
-; AVX2-NEXT: orq %r10, %r8
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; AVX2-NEXT: andnq 40(%rdi), %rdx, %rdx
+; AVX2-NEXT: orq %rbp, %rdx
+; AVX2-NEXT: shlxq %rcx, {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload
+; AVX2-NEXT: movq -16(%rsp,%r11), %r10
+; AVX2-NEXT: shlxq %rcx, %r10, %r11
+; AVX2-NEXT: # kill: def $cl killed $cl killed $rcx
+; AVX2-NEXT: shldq %cl, %r10, %rax
+; AVX2-NEXT: andnq 16(%rdi), %r15, %rcx
 ; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; AVX2-NEXT: andq 376(%rdi), %r10
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX2-NEXT: andq 120(%rdi), %rax
-; AVX2-NEXT: orq %r11, %r8
-; AVX2-NEXT: movq %r8, %r11
-; AVX2-NEXT: orq %r10, %rax
-; AVX2-NEXT: movq %rax, %r10
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; AVX2-NEXT: andq 504(%rdi), %r8
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX2-NEXT: andq 248(%rdi), %rax
-; AVX2-NEXT: orq %r8, %rax
-; AVX2-NEXT: orq %r10, %rax
+; AVX2-NEXT: andnq 24(%rdi), %r10, %r10
+; AVX2-NEXT: orq %rsi, %rcx
+; AVX2-NEXT: orq %r9, %r10
+; AVX2-NEXT: andnq (%rdi), %r8, %rsi
+; AVX2-NEXT: orq %r11, %rsi
 ; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; AVX2-NEXT: shldq %cl, %r8, %r10
-; AVX2-NEXT: orq %r11, %rax
-; AVX2-NEXT: movq 1040(%rsp,%rsi), %rsi
-; AVX2-NEXT: orq %rbx, %rax
-; AVX2-NEXT: movq %rax, %r8
-; AVX2-NEXT: shlxq %rcx, %rsi, %rax
-; AVX2-NEXT: andq 256(%rdi), %r10
-; AVX2-NEXT: andq (%rdi), %rax
-; AVX2-NEXT: orq %r10, %rax
-; AVX2-NEXT: orq %r15, %rax
-; AVX2-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
-; AVX2-NEXT: orq %r13, %rax
-; AVX2-NEXT: # kill: def $cl killed $cl killed $rcx
-; AVX2-NEXT: shldq %cl, %rsi, %rdx
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; AVX2-NEXT: andq 264(%rdi), %rcx
-; AVX2-NEXT: andq 8(%rdi), %rdx
-; AVX2-NEXT: orq %r9, %rax
-; AVX2-NEXT: orq %rcx, %rdx
-; AVX2-NEXT: orq %rbp, %rdx
-; AVX2-NEXT: orq %r12, %rdx
-; AVX2-NEXT: orq %r14, %rdx
-; AVX2-NEXT: orq %r8, %rdx
-; AVX2-NEXT: orq %rax, %rdx
-; AVX2-NEXT: setne %al
-; AVX2-NEXT: addq $1560, %rsp # imm = 0x618
+; AVX2-NEXT: andnq 8(%rdi), %r8, %r8
+; AVX2-NEXT: orq %rax, %r8
+; AVX2-NEXT: andl $60, %ebx
+; AVX2-NEXT: movl (%rdi,%rbx), %eax
+; AVX2-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %r9d # 4-byte Reload
+; AVX2-NEXT: btl %r9d, %eax
+; AVX2-NEXT: movq %r13, 48(%rdi)
+; AVX2-NEXT: movq %r12, 56(%rdi)
+; AVX2-NEXT: movq %r14, 32(%rdi)
+; AVX2-NEXT: movq %rdx, 40(%rdi)
+; AVX2-NEXT: movq %rcx, 16(%rdi)
+; AVX2-NEXT: movq %r10, 24(%rdi)
+; AVX2-NEXT: movq %rsi, (%rdi)
+; AVX2-NEXT: movq %r8, 8(%rdi)
+; AVX2-NEXT: setae %al
+; AVX2-NEXT: addq $184, %rsp
 ; AVX2-NEXT: popq %rbx
 ; AVX2-NEXT: popq %r12
 ; AVX2-NEXT: popq %r13
@@ -6518,7 +1532,7 @@ define i1 @test_ne_i4096(ptr %word, i32 %position) nounwind {
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: retq
 ;
-; AVX512-LABEL: test_ne_i4096:
+; AVX512-LABEL: init_eq_i512:
 ; AVX512: # %bb.0:
 ; AVX512-NEXT: pushq %rbp
 ; AVX512-NEXT: pushq %r15
@@ -6526,489 +1540,102 @@ define i1 @test_ne_i4096(ptr %word, i32 %position) nounwind {
 ; AVX512-NEXT: pushq %r13
 ; AVX512-NEXT: pushq %r12
 ; AVX512-NEXT: pushq %rbx
-; AVX512-NEXT: subq $1560, %rsp # imm = 0x618
-; AVX512-NEXT: movl %esi, %ecx
-; AVX512-NEXT: movl %esi, %eax
-; AVX512-NEXT: andl $4032, %eax # imm = 0xFC0
+; AVX512-NEXT: subq $168, %rsp
 ; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
 ; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
 ; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
 ; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [1,0,0,0]
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovaps {{.*#+}} xmm1 = [1,0,0,0]
+; AVX512-NEXT: vmovups %ymm1, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; AVX512-NEXT: movl %esi, %ecx
 ; AVX512-NEXT: andl $63, %ecx
-; AVX512-NEXT: shrl $3, %eax
-; AVX512-NEXT: negl %eax
-; AVX512-NEXT: movslq %eax, %rsi
-; AVX512-NEXT: movq 1280(%rsp,%rsi), %rdx
-; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1288(%rsp,%rsi), %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: shldq %cl, %rdx, %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1536(%rsp,%rsi), %rdx
-; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1544(%rsp,%rsi), %rax
-; AVX512-NEXT: shldq %cl, %rdx, %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1152(%rsp,%rsi), %rdx
-; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1160(%rsp,%rsi), %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: shldq %cl, %rdx, %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1408(%rsp,%rsi), %rdx
-; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1416(%rsp,%rsi), %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: shldq %cl, %rdx, %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1216(%rsp,%rsi), %rdx
-; AVX512-NEXT: movq %rdx, (%rsp) # 8-byte Spill
-; AVX512-NEXT: movq 1224(%rsp,%rsi), %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: shldq %cl, %rdx, %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1472(%rsp,%rsi), %rdx
-; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1480(%rsp,%rsi), %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: shldq %cl, %rdx, %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1088(%rsp,%rsi), %rdx
-; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1096(%rsp,%rsi), %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: shldq %cl, %rdx, %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1344(%rsp,%rsi), %rdx
-; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1352(%rsp,%rsi), %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: shldq %cl, %rdx, %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1248(%rsp,%rsi), %rdx
-; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1256(%rsp,%rsi), %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: shldq %cl, %rdx, %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1504(%rsp,%rsi), %rdx
-; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1512(%rsp,%rsi), %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: shldq %cl, %rdx, %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1120(%rsp,%rsi), %rdx
-; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1128(%rsp,%rsi), %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: shldq %cl, %rdx, %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1376(%rsp,%rsi), %rdx
-; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1384(%rsp,%rsi), %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: shldq %cl, %rdx, %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1184(%rsp,%rsi), %rdx
-; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1192(%rsp,%rsi), %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: shldq %cl, %rdx, %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1440(%rsp,%rsi), %rdx
-; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1448(%rsp,%rsi), %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: shldq %cl, %rdx, %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1056(%rsp,%rsi), %rdx
-; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1064(%rsp,%rsi), %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: shldq %cl, %rdx, %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1312(%rsp,%rsi), %rdx
-; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1320(%rsp,%rsi), %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: shldq %cl, %rdx, %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1264(%rsp,%rsi), %rdx
-; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1272(%rsp,%rsi), %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: shldq %cl, %rdx, %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1520(%rsp,%rsi), %rdx
-; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1528(%rsp,%rsi), %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: shldq %cl, %rdx, %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1136(%rsp,%rsi), %rdx
-; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1144(%rsp,%rsi), %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: shldq %cl, %rdx, %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1392(%rsp,%rsi), %rdx
-; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1400(%rsp,%rsi), %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: shldq %cl, %rdx, %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1200(%rsp,%rsi), %r10
-; AVX512-NEXT: movq 1208(%rsp,%rsi), %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: shldq %cl, %r10, %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1456(%rsp,%rsi), %rdx
-; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1464(%rsp,%rsi), %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: shldq %cl, %rdx, %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1072(%rsp,%rsi), %r14
-; AVX512-NEXT: movq 1080(%rsp,%rsi), %rax
+; AVX512-NEXT: movl %esi, %r10d
+; AVX512-NEXT: shrl $3, %r10d
+; AVX512-NEXT: movl %r10d, %r8d
+; AVX512-NEXT: andl $56, %r8d
+; AVX512-NEXT: negl %r8d
+; AVX512-NEXT: movslq %r8d, %r9
+; AVX512-NEXT: movq 112(%rsp,%r9), %r11
+; AVX512-NEXT: movq 120(%rsp,%r9), %r14
+; AVX512-NEXT: movq %r14, %rax
+; AVX512-NEXT: shldq %cl, %r11, %rax
 ; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 104(%rsp,%r9), %rax
+; AVX512-NEXT: shldq %cl, %rax, %r11
+; AVX512-NEXT: movq 128(%rsp,%r9), %r15
+; AVX512-NEXT: movq 136(%rsp,%r9), %rbp
+; AVX512-NEXT: movq %rbp, %rbx
+; AVX512-NEXT: shldq %cl, %r15, %rbx
+; AVX512-NEXT: shldq %cl, %r14, %r15
+; AVX512-NEXT: movq 144(%rsp,%r9), %r13
+; AVX512-NEXT: movq 152(%rsp,%r9), %r12
+; AVX512-NEXT: shldq %cl, %r13, %r12
+; AVX512-NEXT: movq 96(%rsp,%r9), %r14
+; AVX512-NEXT: shldq %cl, %rbp, %r13
 ; AVX512-NEXT: shldq %cl, %r14, %rax
 ; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1328(%rsp,%rsi), %rdx
-; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1336(%rsp,%rsi), %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: shldq %cl, %rdx, %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1232(%rsp,%rsi), %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1240(%rsp,%rsi), %rdx
-; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: shldq %cl, %rax, %rdx
-; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1488(%rsp,%rsi), %r12
-; AVX512-NEXT: movq 1496(%rsp,%rsi), %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: shldq %cl, %r12, %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1104(%rsp,%rsi), %rax
-; AVX512-NEXT: movq 1112(%rsp,%rsi), %rdx
-; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movl %edx, %edx
+; AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX512-NEXT: vmovups %xmm1, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: movq $0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movq 8(%rsp,%r9), %r8
+; AVX512-NEXT: movq 16(%rsp,%r9), %rax
+; AVX512-NEXT: movq %rax, %rbp
+; AVX512-NEXT: shldq %cl, %r8, %rbp
+; AVX512-NEXT: andnq 48(%rdi), %r13, %r13
+; AVX512-NEXT: orq %rbp, %r13
+; AVX512-NEXT: movq 24(%rsp,%r9), %rbp
+; AVX512-NEXT: shldq %cl, %rax, %rbp
+; AVX512-NEXT: movq -8(%rsp,%r9), %rax
+; AVX512-NEXT: movq (%rsp,%r9), %rsi
+; AVX512-NEXT: movq %rsi, %rdx
 ; AVX512-NEXT: shldq %cl, %rax, %rdx
-; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1360(%rsp,%rsi), %r11
-; AVX512-NEXT: movq 1368(%rsp,%rsi), %rbx
-; AVX512-NEXT: movq %rbx, %rdx
-; AVX512-NEXT: shldq %cl, %r11, %rdx
-; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1168(%rsp,%rsi), %r9
-; AVX512-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1176(%rsp,%rsi), %r8
-; AVX512-NEXT: movq %r8, %rdx
-; AVX512-NEXT: shldq %cl, %r9, %rdx
-; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1424(%rsp,%rsi), %r9
-; AVX512-NEXT: movq 1432(%rsp,%rsi), %rdx
-; AVX512-NEXT: movq %rdx, %r15
-; AVX512-NEXT: shldq %cl, %r9, %r15
-; AVX512-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1296(%rsp,%rsi), %rbp
-; AVX512-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1304(%rsp,%rsi), %r15
-; AVX512-NEXT: movq %r15, %r13
-; AVX512-NEXT: shldq %cl, %rbp, %r13
-; AVX512-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
-; AVX512-NEXT: shldq %cl, %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
-; AVX512-NEXT: shldq %cl, %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
-; AVX512-NEXT: shldq %cl, %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
-; AVX512-NEXT: shldq %cl, %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
-; AVX512-NEXT: shldq %cl, %r13, (%rsp) # 8-byte Folded Spill
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
-; AVX512-NEXT: shldq %cl, %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
-; AVX512-NEXT: shldq %cl, %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
-; AVX512-NEXT: shldq %cl, %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
-; AVX512-NEXT: shldq %cl, %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
-; AVX512-NEXT: shldq %cl, %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
-; AVX512-NEXT: shldq %cl, %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX512-NEXT: shldq %cl, %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX512-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX512-NEXT: shldq %cl, %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX512-NEXT: movq 1048(%rsp,%rsi), %rdx
-; AVX512-NEXT: shldq %cl, %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX512-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
-; AVX512-NEXT: shldq %cl, %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
-; AVX512-NEXT: shldq %cl, %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
-; AVX512-NEXT: shldq %cl, %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; AVX512-NEXT: shldq %cl, %r15, %rbx
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; AVX512-NEXT: shldq %cl, %r15, %r10
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; AVX512-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; AVX512-NEXT: shldq %cl, %r15, %r14
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; AVX512-NEXT: shldq %cl, %r15, %r8
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; AVX512-NEXT: shldq %cl, %r15, %r13
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; AVX512-NEXT: shldq %cl, %r15, %r12
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; AVX512-NEXT: shldq %cl, %r15, %rax
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; AVX512-NEXT: shldq %cl, %r15, %r11
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
-; AVX512-NEXT: shldq %cl, %rbp, %r15
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
-; AVX512-NEXT: shldq %cl, %rbp, %r9
-; AVX512-NEXT: andq 384(%rdi), %r9
-; AVX512-NEXT: andq 128(%rdi), %r15
-; AVX512-NEXT: orq %r9, %r15
-; AVX512-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: andq 320(%rdi), %r11
-; AVX512-NEXT: andq 64(%rdi), %rax
-; AVX512-NEXT: orq %r11, %rax
-; AVX512-NEXT: andq 448(%rdi), %r12
-; AVX512-NEXT: andq 192(%rdi), %r13
-; AVX512-NEXT: orq %r12, %r13
-; AVX512-NEXT: orq %rax, %r13
-; AVX512-NEXT: andq 288(%rdi), %r8
-; AVX512-NEXT: andq 32(%rdi), %r14
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT: andq 416(%rdi), %rax
-; AVX512-NEXT: orq %r8, %r14
-; AVX512-NEXT: andq 160(%rdi), %r10
-; AVX512-NEXT: orq %rax, %r10
-; AVX512-NEXT: andq 352(%rdi), %rbx
-; AVX512-NEXT: orq %r14, %r10
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT: andq 96(%rdi), %rax
-; AVX512-NEXT: orq %rbx, %rax
-; AVX512-NEXT: movq %rax, %r8
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT: andq 480(%rdi), %rax
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; AVX512-NEXT: andq 224(%rdi), %r15
-; AVX512-NEXT: orq %rax, %r15
-; AVX512-NEXT: orq %r8, %r15
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; AVX512-NEXT: andq 272(%rdi), %r8
-; AVX512-NEXT: orq %r10, %r15
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT: andq 16(%rdi), %rax
-; AVX512-NEXT: orq %r8, %rax
-; AVX512-NEXT: movq %rax, %r8
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
-; AVX512-NEXT: andq 400(%rdi), %r9
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT: andq 144(%rdi), %rax
-; AVX512-NEXT: orq %r9, %rax
-; AVX512-NEXT: orq %r8, %rax
-; AVX512-NEXT: movq %rax, %r9
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; AVX512-NEXT: andq 336(%rdi), %r10
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT: andq 80(%rdi), %rax
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; AVX512-NEXT: andq 464(%rdi), %r8
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
-; AVX512-NEXT: andq 208(%rdi), %r11
-; AVX512-NEXT: orq %r10, %rax
-; AVX512-NEXT: orq %r8, %r11
-; AVX512-NEXT: orq %rax, %r11
-; AVX512-NEXT: orq %r9, %r11
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; AVX512-NEXT: andq 304(%rdi), %r10
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT: andq 48(%rdi), %rax
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
-; AVX512-NEXT: andq 432(%rdi), %r9
-; AVX512-NEXT: movq (%rsp), %r8 # 8-byte Reload
-; AVX512-NEXT: andq 176(%rdi), %r8
-; AVX512-NEXT: orq %r10, %rax
-; AVX512-NEXT: movq %rax, %r10
-; AVX512-NEXT: orq %r9, %r8
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
-; AVX512-NEXT: andq 368(%rdi), %r9
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT: andq 112(%rdi), %rax
-; AVX512-NEXT: orq %r10, %r8
-; AVX512-NEXT: movq %r8, %r10
-; AVX512-NEXT: orq %r9, %rax
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; AVX512-NEXT: andq 496(%rdi), %r8
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
-; AVX512-NEXT: andq 240(%rdi), %r9
-; AVX512-NEXT: orq %r8, %r9
-; AVX512-NEXT: orq %rax, %r9
-; AVX512-NEXT: orq %r10, %r9
-; AVX512-NEXT: orq %r11, %r9
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; AVX512-NEXT: andq 392(%rdi), %r10
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
-; AVX512-NEXT: andq 136(%rdi), %rbp
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; AVX512-NEXT: andq 328(%rdi), %r8
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT: andq 72(%rdi), %rax
-; AVX512-NEXT: orq %r10, %rbp
-; AVX512-NEXT: orq %r8, %rax
-; AVX512-NEXT: movq %rax, %r8
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT: andq 456(%rdi), %rax
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
-; AVX512-NEXT: andq 200(%rdi), %r12
-; AVX512-NEXT: orq %rax, %r12
-; AVX512-NEXT: orq %r8, %r12
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; AVX512-NEXT: andq 296(%rdi), %r8
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT: andq 40(%rdi), %rax
-; AVX512-NEXT: orq %r8, %rax
-; AVX512-NEXT: movq %rax, %r10
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; AVX512-NEXT: andq 424(%rdi), %r8
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT: andq 168(%rdi), %rax
-; AVX512-NEXT: orq %r8, %rax
-; AVX512-NEXT: orq %r10, %rax
-; AVX512-NEXT: movq %rax, %r10
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; AVX512-NEXT: andq 360(%rdi), %r8
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT: andq 104(%rdi), %rax
-; AVX512-NEXT: orq %r8, %rax
-; AVX512-NEXT: movq %rax, %r8
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT: andq 488(%rdi), %rax
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; AVX512-NEXT: andq 232(%rdi), %r14
-; AVX512-NEXT: orq %rax, %r14
-; AVX512-NEXT: orq %r8, %r14
-; AVX512-NEXT: orq %r10, %r14
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; AVX512-NEXT: andq 280(%rdi), %r8
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT: andq 24(%rdi), %rax
-; AVX512-NEXT: orq %r8, %rax
-; AVX512-NEXT: movq %rax, %r10
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; AVX512-NEXT: andq 408(%rdi), %r8
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT: andq 152(%rdi), %rax
-; AVX512-NEXT: orq %r8, %rax
-; AVX512-NEXT: orq %r10, %rax
-; AVX512-NEXT: movq %rax, %r10
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
-; AVX512-NEXT: andq 344(%rdi), %r11
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; AVX512-NEXT: andq 88(%rdi), %r8
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT: andq 472(%rdi), %rax
-; AVX512-NEXT: orq %r11, %r8
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
-; AVX512-NEXT: andq 216(%rdi), %rbx
-; AVX512-NEXT: orq %rax, %rbx
+; AVX512-NEXT: andnq 56(%rdi), %r12, %r12
+; AVX512-NEXT: orq %rbp, %r12
+; AVX512-NEXT: andnq 32(%rdi), %r15, %r15
+; AVX512-NEXT: orq %rdx, %r15
+; AVX512-NEXT: shldq %cl, %rsi, %r8
+; AVX512-NEXT: movq -24(%rsp,%r9), %rdx
+; AVX512-NEXT: movq -16(%rsp,%r9), %rsi
+; AVX512-NEXT: movq %rsi, %rbp
+; AVX512-NEXT: shldq %cl, %rdx, %rbp
+; AVX512-NEXT: andnq 40(%rdi), %rbx, %rbx
 ; AVX512-NEXT: orq %r8, %rbx
-; AVX512-NEXT: orq %r10, %rbx
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; AVX512-NEXT: andq 312(%rdi), %r10
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT: andq 56(%rdi), %rax
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; AVX512-NEXT: andq 440(%rdi), %r8
-; AVX512-NEXT: orq %r10, %rax
-; AVX512-NEXT: movq %rax, %r10
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT: andq 184(%rdi), %rax
-; AVX512-NEXT: orq %r8, %rax
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; AVX512-NEXT: andq 376(%rdi), %r8
-; AVX512-NEXT: orq %r10, %rax
-; AVX512-NEXT: movq %rax, %r11
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT: andq 120(%rdi), %rax
-; AVX512-NEXT: orq %r8, %rax
-; AVX512-NEXT: movq %rax, %r10
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT: andq 504(%rdi), %rax
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; AVX512-NEXT: andq 248(%rdi), %r8
-; AVX512-NEXT: orq %rax, %r8
-; AVX512-NEXT: orq %r10, %r8
-; AVX512-NEXT: orq %r11, %r8
-; AVX512-NEXT: movq 1040(%rsp,%rsi), %rax
+; AVX512-NEXT: andnq 16(%rdi), %r11, %r8
+; AVX512-NEXT: orq %rbp, %r8
+; AVX512-NEXT: shlxq %rcx, %r14, %r11
+; AVX512-NEXT: movq -32(%rsp,%r9), %r9
+; AVX512-NEXT: shldq %cl, %rsi, %rax
 ; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; AVX512-NEXT: shldq %cl, %rsi, %r10
-; AVX512-NEXT: orq %rbx, %r8
-; AVX512-NEXT: shlxq %rcx, %rax, %rsi
-; AVX512-NEXT: andq 256(%rdi), %r10
-; AVX512-NEXT: andq (%rdi), %rsi
-; AVX512-NEXT: orq %r10, %rsi
-; AVX512-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload
-; AVX512-NEXT: orq %r13, %rsi
-; AVX512-NEXT: orq %r15, %rsi
+; AVX512-NEXT: andnq 24(%rdi), %rsi, %rsi
+; AVX512-NEXT: orq %rax, %rsi
+; AVX512-NEXT: shlxq %rcx, %r9, %rax
 ; AVX512-NEXT: # kill: def $cl killed $cl killed $rcx
-; AVX512-NEXT: shldq %cl, %rax, %rdx
-; AVX512-NEXT: orq %r9, %rsi
+; AVX512-NEXT: shldq %cl, %r9, %rdx
+; AVX512-NEXT: andnq (%rdi), %r11, %rcx
+; AVX512-NEXT: orq %rax, %rcx
 ; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT: andq 264(%rdi), %rax
-; AVX512-NEXT: andq 8(%rdi), %rdx
-; AVX512-NEXT: orq %rax, %rdx
-; AVX512-NEXT: orq %rbp, %rdx
-; AVX512-NEXT: orq %r12, %rdx
-; AVX512-NEXT: orq %r14, %rdx
-; AVX512-NEXT: orq %r8, %rdx
-; AVX512-NEXT: orq %rsi, %rdx
-; AVX512-NEXT: setne %al
-; AVX512-NEXT: addq $1560, %rsp # imm = 0x618
+; AVX512-NEXT: andnq 8(%rdi), %rax, %rax
+; AVX512-NEXT: orq %rdx, %rax
+; AVX512-NEXT: andl $60, %r10d
+; AVX512-NEXT: movl (%rdi,%r10), %edx
+; AVX512-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %r9d # 4-byte Reload
+; AVX512-NEXT: btl %r9d, %edx
+; AVX512-NEXT: movq %r13,
48(%rdi) +; AVX512-NEXT: movq %r12, 56(%rdi) +; AVX512-NEXT: movq %r15, 32(%rdi) +; AVX512-NEXT: movq %rbx, 40(%rdi) +; AVX512-NEXT: movq %r8, 16(%rdi) +; AVX512-NEXT: movq %rsi, 24(%rdi) +; AVX512-NEXT: movq %rcx, (%rdi) +; AVX512-NEXT: movq %rax, 8(%rdi) +; AVX512-NEXT: setae %al +; AVX512-NEXT: addq $168, %rsp ; AVX512-NEXT: popq %rbx ; AVX512-NEXT: popq %r12 ; AVX512-NEXT: popq %r13 @@ -7017,6 +1644,45 @@ define i1 @test_ne_i4096(ptr %word, i32 %position) nounwind { ; AVX512-NEXT: popq %rbp ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq + %rem = and i32 %position, 511 + %ofs = zext nneg i32 %rem to i512 + %bit = shl nuw i512 1, %ofs + %mask = xor i512 %bit, -1 + %val0 = zext i1 %value to i512 + %val = shl nuw i512 %val0, %ofs + %ld = load i512, ptr %word + %test = and i512 %ld, %bit + %res0 = and i512 %ld, %mask + %res = or i512 %res0, %val + %cmp = icmp eq i512 %test, 0 + store i512 %res, ptr %word + ret i1 %cmp +} + +; i4096 + +define i1 @test_ne_i4096(ptr %word, i32 %position) nounwind { +; X86-LABEL: test_ne_i4096: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, %edx +; X86-NEXT: andl $4064, %edx # imm = 0xFE0 +; X86-NEXT: shrl $3, %edx +; X86-NEXT: movl (%eax,%edx), %eax +; X86-NEXT: btl %ecx, %eax +; X86-NEXT: setb %al +; X86-NEXT: retl +; +; X64-LABEL: test_ne_i4096: +; X64: # %bb.0: +; X64-NEXT: movl %esi, %eax +; X64-NEXT: andl $4064, %eax # imm = 0xFE0 +; X64-NEXT: shrl $3, %eax +; X64-NEXT: movl (%rdi,%rax), %eax +; X64-NEXT: btl %esi, %eax +; X64-NEXT: setb %al +; X64-NEXT: retq %rem = and i32 %position, 4095 %ofs = zext nneg i32 %rem to i4096 %bit = shl nuw i4096 1, %ofs @@ -7161,8 +1827,8 @@ define i32 @reset_multiload_i128(ptr %word, i32 %position, ptr %p) nounwind { ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: andl $-16, %esp -; X86-NEXT: subl $80, %esp -; X86-NEXT: movzbl 12(%ebp), %ecx +; X86-NEXT: subl $64, %esp +; X86-NEXT: movl 12(%ebp), %ecx ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) @@ -7176,51 +1842,33 @@ define i32 @reset_multiload_i128(ptr %word, i32 %position, ptr %p) nounwind { ; X86-NEXT: andb $12, %al ; X86-NEXT: negb %al ; X86-NEXT: movsbl %al, %eax -; X86-NEXT: movl 56(%esp,%eax), %esi -; X86-NEXT: movl 60(%esp,%eax), %edx -; X86-NEXT: shldl %cl, %esi, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 48(%esp,%eax), %edi -; X86-NEXT: movl 52(%esp,%eax), %eax -; X86-NEXT: shldl %cl, %eax, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edi, %eax -; X86-NEXT: movl 8(%ebp), %ebx -; X86-NEXT: shll %cl, %edi -; X86-NEXT: movl 8(%ebx), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %esi, %ecx -; X86-NEXT: movl (%ebx), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %edi, %esi -; X86-NEXT: orl %ecx, %esi -; X86-NEXT: movl 12(%ebx), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT: movl %eax, %edx -; X86-NEXT: movl 4(%ebx), %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %ebx, %eax -; X86-NEXT: orl %ecx, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: notl %ecx -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT: movl 
{{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: movl 40(%esp,%eax), %edx +; X86-NEXT: movl 44(%esp,%eax), %esi +; X86-NEXT: shldl %cl, %edx, %esi +; X86-NEXT: movl 32(%esp,%eax), %edi +; X86-NEXT: movl 36(%esp,%eax), %ebx +; X86-NEXT: shldl %cl, %ebx, %edx +; X86-NEXT: shldl %cl, %edi, %ebx ; X86-NEXT: notl %ebx -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X86-NEXT: notl %edx -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: notl %edi -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-NEXT: orl %esi, %eax ; X86-NEXT: movl 16(%ebp), %eax ; X86-NEXT: movl (%eax), %eax -; X86-NEXT: movl 8(%ebp), %esi -; X86-NEXT: movl %ebx, 8(%esi) -; X86-NEXT: movl %ecx, 12(%esi) -; X86-NEXT: movl %edi, (%esi) -; X86-NEXT: movl %edx, 4(%esi) -; X86-NEXT: je .LBB22_2 +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: andl %ebx, 4(%eax) +; X86-NEXT: shll %cl, %edi +; X86-NEXT: notl %edi +; X86-NEXT: movl %ecx, %ebx +; X86-NEXT: andl $96, %ebx +; X86-NEXT: shrl $3, %ebx +; X86-NEXT: movl (%eax,%ebx), %ebx +; X86-NEXT: andl %edi, (%eax) +; X86-NEXT: notl %esi +; X86-NEXT: andl %esi, 12(%eax) +; X86-NEXT: notl %edx +; X86-NEXT: andl %edx, 8(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: btl %ecx, %ebx +; X86-NEXT: jae .LBB22_2 ; X86-NEXT: # %bb.1: ; X86-NEXT: xorl %eax, %eax ; X86-NEXT: .LBB22_2: @@ -7242,52 +1890,75 @@ define i32 @reset_multiload_i128(ptr %word, i32 %position, ptr %p) nounwind { ; SSE-NEXT: testb $64, %cl ; SSE-NEXT: cmovneq %rsi, %r8 ; SSE-NEXT: cmovneq %rax, %rsi -; SSE-NEXT: movq (%rdi), %rcx -; SSE-NEXT: movq 8(%rdi), %r9 -; SSE-NEXT: movq %r9, %r10 -; SSE-NEXT: andq %r8, %r10 ; SSE-NEXT: notq %r8 -; SSE-NEXT: movq %rcx, %r11 -; SSE-NEXT: andq %rsi, %r11 ; SSE-NEXT: notq %rsi -; SSE-NEXT: andq %r9, %r8 -; SSE-NEXT: andq %rcx, %rsi -; SSE-NEXT: orq %r10, %r11 -; SSE-NEXT: jne .LBB22_2 +; SSE-NEXT: movl %ecx, %r9d +; SSE-NEXT: andl $96, %r9d +; SSE-NEXT: shrl $3, %r9d +; SSE-NEXT: movl (%rdi,%r9), %r9d +; SSE-NEXT: btl %ecx, %r9d +; SSE-NEXT: jb .LBB22_2 ; SSE-NEXT: # %bb.1: ; SSE-NEXT: movl (%rdx), %eax ; SSE-NEXT: .LBB22_2: -; SSE-NEXT: movq %rsi, (%rdi) -; SSE-NEXT: movq %r8, 8(%rdi) +; SSE-NEXT: andq %rsi, (%rdi) +; SSE-NEXT: andq %r8, 8(%rdi) ; SSE-NEXT: # kill: def $eax killed $eax killed $rax ; SSE-NEXT: retq ; -; AVX-LABEL: reset_multiload_i128: -; AVX: # %bb.0: -; AVX-NEXT: movl %esi, %ecx -; AVX-NEXT: movl $1, %esi -; AVX-NEXT: xorl %r8d, %r8d -; AVX-NEXT: shldq %cl, %rsi, %r8 -; AVX-NEXT: xorl %eax, %eax -; AVX-NEXT: shlxq %rcx, %rsi, %r9 -; AVX-NEXT: testb $64, %cl -; AVX-NEXT: cmovneq %r9, %r8 -; AVX-NEXT: cmovneq %rax, %r9 -; AVX-NEXT: movq (%rdi), %r10 -; AVX-NEXT: movq 8(%rdi), %r11 -; AVX-NEXT: andnq %r11, %r8, %rcx -; AVX-NEXT: andq %r8, %r11 -; AVX-NEXT: andnq %r10, %r9, %rsi -; AVX-NEXT: andq %r9, %r10 -; AVX-NEXT: orq %r11, %r10 -; AVX-NEXT: jne .LBB22_2 -; AVX-NEXT: # %bb.1: -; AVX-NEXT: movl (%rdx), %eax -; AVX-NEXT: .LBB22_2: -; AVX-NEXT: movq %rsi, (%rdi) -; AVX-NEXT: movq %rcx, 8(%rdi) -; AVX-NEXT: # kill: def $eax killed $eax killed $rax -; AVX-NEXT: retq +; AVX2-LABEL: reset_multiload_i128: +; AVX2: # %bb.0: +; AVX2-NEXT: movl %esi, %ecx +; AVX2-NEXT: xorl %eax, %eax +; AVX2-NEXT: movl $1, %r8d +; AVX2-NEXT: xorl %esi, %esi +; AVX2-NEXT: shldq %cl, %r8, %rsi +; AVX2-NEXT: shlxq %rcx, %r8, %r8 +; AVX2-NEXT: testb $64, %cl +; AVX2-NEXT: cmovneq %r8, %rsi +; AVX2-NEXT: 
cmovneq %rax, %r8
+; AVX2-NEXT:    notq %rsi
+; AVX2-NEXT:    notq %r8
+; AVX2-NEXT:    movl %ecx, %r9d
+; AVX2-NEXT:    andl $96, %r9d
+; AVX2-NEXT:    shrl $3, %r9d
+; AVX2-NEXT:    movl (%rdi,%r9), %r9d
+; AVX2-NEXT:    btl %ecx, %r9d
+; AVX2-NEXT:    jb .LBB22_2
+; AVX2-NEXT:  # %bb.1:
+; AVX2-NEXT:    movl (%rdx), %eax
+; AVX2-NEXT:  .LBB22_2:
+; AVX2-NEXT:    andq %r8, (%rdi)
+; AVX2-NEXT:    andq %rsi, 8(%rdi)
+; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: reset_multiload_i128:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    movl %esi, %ecx
+; AVX512-NEXT:    movl $1, %r8d
+; AVX512-NEXT:    xorl %esi, %esi
+; AVX512-NEXT:    shldq %cl, %r8, %rsi
+; AVX512-NEXT:    xorl %eax, %eax
+; AVX512-NEXT:    shlxq %rcx, %r8, %r8
+; AVX512-NEXT:    testb $64, %cl
+; AVX512-NEXT:    cmovneq %r8, %rsi
+; AVX512-NEXT:    cmovneq %rax, %r8
+; AVX512-NEXT:    notq %rsi
+; AVX512-NEXT:    notq %r8
+; AVX512-NEXT:    movl %ecx, %r9d
+; AVX512-NEXT:    andl $96, %r9d
+; AVX512-NEXT:    shrl $3, %r9d
+; AVX512-NEXT:    movl (%rdi,%r9), %r9d
+; AVX512-NEXT:    btl %ecx, %r9d
+; AVX512-NEXT:    jb .LBB22_2
+; AVX512-NEXT:  # %bb.1:
+; AVX512-NEXT:    movl (%rdx), %eax
+; AVX512-NEXT:  .LBB22_2:
+; AVX512-NEXT:    andq %r8, (%rdi)
+; AVX512-NEXT:    andq %rsi, 8(%rdi)
+; AVX512-NEXT:    # kill: def $eax killed $eax killed $rax
+; AVX512-NEXT:    retq
  %rem = and i32 %position, 127
  %ofs = zext nneg i32 %rem to i128
  %bit = shl nuw i128 1, %ofs

From ce925820d8a3ebc082a920d6cd23a40adefa0c5b Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo@fhahn.com>
Date: Mon, 3 Nov 2025 16:24:20 +0000
Subject: [PATCH 028/313] [VPlan] Use operands() directly in
 VPInstruction::clone() (NFC).

There's no need to create temporary SmallVectors.

---
 llvm/lib/Transforms/Vectorize/VPlan.h | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index e1da070a1fb7f..cfe1f1e9d7528 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1110,9 +1110,8 @@ class LLVM_ABI_FOR_TEST VPInstruction : public VPRecipeWithIRFlags,
   VP_CLASSOF_IMPL(VPDef::VPInstructionSC)

   VPInstruction *clone() override {
-    SmallVector<VPValue *, 2> Operands(operands());
-    auto *New =
-        new VPInstruction(Opcode, Operands, *this, *this, getDebugLoc(), Name);
+    auto *New = new VPInstruction(Opcode, operands(), *this, *this,
+                                  getDebugLoc(), Name);
     if (getUnderlyingValue())
       New->setUnderlyingValue(getUnderlyingInstr());
     return New;
@@ -1226,10 +1225,9 @@ class VPInstructionWithType : public VPInstruction {
   }

   VPInstruction *clone() override {
-    SmallVector<VPValue *, 2> Operands(operands());
     auto *New =
-        new VPInstructionWithType(getOpcode(), Operands, getResultType(), *this,
-                                  getDebugLoc(), getName());
+        new VPInstructionWithType(getOpcode(), operands(), getResultType(),
+                                  *this, getDebugLoc(), getName());
     New->setUnderlyingValue(getUnderlyingValue());
     return New;
   }

From af68efc9c49383c65b6f2bc800ea40b06b66b983 Mon Sep 17 00:00:00 2001
From: Robert Imschweiler <robert.imschweiler@amd.com>
Date: Mon, 3 Nov 2025 17:33:20 +0100
Subject: [PATCH 029/313] Revert "[AMDGPU][UnifyDivergentExitNodes][StructurizeCFG] Add support for callbr instruction with inline-asm" (#166186)

Reverts llvm/llvm-project#152161

Reverting to fix the changed logic for the expensive checks.
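For context: `callbr` is the IR instruction Clang emits for `asm goto`, so
the support being reverted concerns inputs like the following minimal,
hypothetical C++ sketch (x86 inline asm assumed; the function and label
names are illustrative only and are not taken from the reverted tests):

  // asm goto lowers to a callbr IR instruction: the inline asm either
  // falls through or jumps to one of the listed labels.
  int has_zero(int x) {
    asm goto("testl %0, %0; jz %l[zero]" // hypothetical x86 asm
             : /* no outputs */
             : "r"(x)
             : "cc"
             : zero);
    return 0; // fallthrough successor of the callbr
  zero:
    return 1; // indirect successor of the callbr
  }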
---
 .../AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp  |  89 +++---
 llvm/lib/Transforms/Scalar/StructurizeCFG.cpp |  19 +-
 llvm/test/CodeGen/AMDGPU/callbr.ll            |  54 ----
 ...nify-divergent-exit-nodes-with-musttail.ll |  51 ----
 llvm/test/CodeGen/AMDGPU/infinite-loop.ll     | 257 ++----------------
 .../si-annotate-nested-control-flows.ll       | 100 +------
 .../si-unify-exit-multiple-unreachables.ll    | 161 +----------
 llvm/test/CodeGen/AMDGPU/update-phi.ll        |  39 ---
 llvm/test/Transforms/StructurizeCFG/callbr.ll | 235 ----------------
 9 files changed, 79 insertions(+), 926 deletions(-)
 delete mode 100644 llvm/test/CodeGen/AMDGPU/callbr.ll
 delete mode 100644 llvm/test/Transforms/StructurizeCFG/callbr.ll

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp b/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp
index 706237b906cc3..ddf9a24eb5230 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp
@@ -181,52 +181,14 @@ BasicBlock *AMDGPUUnifyDivergentExitNodesImpl::unifyReturnBlockSet(
   return NewRetBlock;
 }

-static BasicBlock *
-createDummyReturnBlock(Function &F,
-                       SmallVector<BasicBlock *, 4> &ReturningBlocks) {
-  BasicBlock *DummyReturnBB =
-      BasicBlock::Create(F.getContext(), "DummyReturnBlock", &F);
-  Type *RetTy = F.getReturnType();
-  Value *RetVal = RetTy->isVoidTy() ? nullptr : PoisonValue::get(RetTy);
-  ReturnInst::Create(F.getContext(), RetVal, DummyReturnBB);
-  ReturningBlocks.push_back(DummyReturnBB);
-  return DummyReturnBB;
-}
-
-/// Handle conditional branch instructions (-> 2 targets) and callbr
-/// instructions with N targets.
-static void handleNBranch(Function &F, BasicBlock *BB, Instruction *BI,
-                          BasicBlock *DummyReturnBB,
-                          std::vector<DominatorTree::UpdateType> &Updates) {
-  SmallVector<BasicBlock *, 2> Successors(successors(BB));
-
-  // Create a new transition block to hold the conditional branch.
-  BasicBlock *TransitionBB = BB->splitBasicBlock(BI, "TransitionBlock");
-
-  Updates.reserve(Updates.size() + 2 * Successors.size() + 2);
-
-  // 'Successors' become successors of TransitionBB instead of BB,
-  // and TransitionBB becomes a single successor of BB.
-  Updates.emplace_back(DominatorTree::Insert, BB, TransitionBB);
-  for (BasicBlock *Successor : Successors) {
-    Updates.emplace_back(DominatorTree::Insert, TransitionBB, Successor);
-    Updates.emplace_back(DominatorTree::Delete, BB, Successor);
-  }
-
-  // Create a branch that will always branch to the transition block and
-  // references DummyReturnBB.
-  BB->getTerminator()->eraseFromParent();
-  BranchInst::Create(TransitionBB, DummyReturnBB,
-                     ConstantInt::getTrue(F.getContext()), BB);
-  Updates.emplace_back(DominatorTree::Insert, BB, DummyReturnBB);
-}
-
 bool AMDGPUUnifyDivergentExitNodesImpl::run(Function &F, DominatorTree *DT,
                                             const PostDominatorTree &PDT,
                                             const UniformityInfo &UA) {
+  assert(hasOnlySimpleTerminator(F) && "Unsupported block terminator.");
+
   if (PDT.root_size() == 0 ||
       (PDT.root_size() == 1 &&
-       !isa<BranchInst, CallBrInst>(PDT.getRoot()->getTerminator())))
+       !isa<BranchInst>(PDT.getRoot()->getTerminator())))
     return false;

   // Loop over all of the blocks in a function, tracking all of the blocks that
@@ -260,27 +222,46 @@ bool AMDGPUUnifyDivergentExitNodesImpl::run(Function &F, DominatorTree *DT,
       if (HasDivergentExitBlock)
         UnreachableBlocks.push_back(BB);
     } else if (BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator())) {
-      if (!DummyReturnBB)
-        DummyReturnBB = createDummyReturnBlock(F, ReturningBlocks);
+
+      ConstantInt *BoolTrue = ConstantInt::getTrue(F.getContext());
+      if (DummyReturnBB == nullptr) {
+        DummyReturnBB =
+            BasicBlock::Create(F.getContext(), "DummyReturnBlock", &F);
+        Type *RetTy = F.getReturnType();
+        Value *RetVal = RetTy->isVoidTy() ? nullptr : PoisonValue::get(RetTy);
+        ReturnInst::Create(F.getContext(), RetVal, DummyReturnBB);
+        ReturningBlocks.push_back(DummyReturnBB);
+      }

       if (BI->isUnconditional()) {
         BasicBlock *LoopHeaderBB = BI->getSuccessor(0);
         BI->eraseFromParent(); // Delete the unconditional branch.
         // Add a new conditional branch with a dummy edge to the return block.
-        BranchInst::Create(LoopHeaderBB, DummyReturnBB,
-                           ConstantInt::getTrue(F.getContext()), BB);
+        BranchInst::Create(LoopHeaderBB, DummyReturnBB, BoolTrue, BB);
+        Updates.emplace_back(DominatorTree::Insert, BB, DummyReturnBB);
+      } else { // Conditional branch.
+        SmallVector<BasicBlock *, 2> Successors(successors(BB));
+
+        // Create a new transition block to hold the conditional branch.
+        BasicBlock *TransitionBB = BB->splitBasicBlock(BI, "TransitionBlock");
+
+        Updates.reserve(Updates.size() + 2 * Successors.size() + 2);
+
+        // 'Successors' become successors of TransitionBB instead of BB,
+        // and TransitionBB becomes a single successor of BB.
+        Updates.emplace_back(DominatorTree::Insert, BB, TransitionBB);
+        for (BasicBlock *Successor : Successors) {
+          Updates.emplace_back(DominatorTree::Insert, TransitionBB, Successor);
+          Updates.emplace_back(DominatorTree::Delete, BB, Successor);
+        }
+
+        // Create a branch that will always branch to the transition block and
+        // references DummyReturnBB.
+        BB->getTerminator()->eraseFromParent();
+        BranchInst::Create(TransitionBB, DummyReturnBB, BoolTrue, BB);
         Updates.emplace_back(DominatorTree::Insert, BB, DummyReturnBB);
-      } else {
-        handleNBranch(F, BB, BI, DummyReturnBB, Updates);
       }

       Changed = true;
-    } else if (CallBrInst *CBI = dyn_cast<CallBrInst>(BB->getTerminator())) {
-      if (!DummyReturnBB)
-        DummyReturnBB = createDummyReturnBlock(F, ReturningBlocks);
-
-      handleNBranch(F, BB, CBI, DummyReturnBB, Updates);
-    } else {
-      llvm_unreachable("unsupported block terminator");
     }
   }

diff --git a/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp b/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp
index 0a8f5ea2fdae1..5f6f66a4bc213 100644
--- a/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp
+++ b/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp
@@ -558,10 +558,11 @@ void StructurizeCFG::analyzeLoops(RegionNode *N) {
   } else {
     // Test for successors as back edge
     BasicBlock *BB = N->getNodeAs<BasicBlock>();
-    if (BranchInst *Term = dyn_cast<BranchInst>(BB->getTerminator()))
-      for (BasicBlock *Succ : Term->successors())
-        if (Visited.count(Succ))
-          Loops[Succ] = BB;
+    BranchInst *Term = cast<BranchInst>(BB->getTerminator());
+
+    for (BasicBlock *Succ : Term->successors())
+      if (Visited.count(Succ))
+        Loops[Succ] = BB;
   }
 }

@@ -593,7 +594,7 @@ void StructurizeCFG::gatherPredicates(RegionNode *N) {

   for (BasicBlock *P : predecessors(BB)) {
     // Ignore it if it's a branch from outside into our region entry
-    if (!ParentRegion->contains(P) || !dyn_cast<BranchInst>(P->getTerminator()))
+    if (!ParentRegion->contains(P))
       continue;

     Region *R = RI->getRegionFor(P);
@@ -1401,17 +1402,13 @@ bool StructurizeCFG::makeUniformRegion(Region *R, UniformityInfo &UA) {
 /// Run the transformation for each region found
 bool StructurizeCFG::run(Region *R, DominatorTree *DT,
                          const TargetTransformInfo *TTI) {
-  // CallBr and its corresponding direct target blocks are for now ignored by
-  // this pass. This is not a limitation for the currently intended uses cases
-  // of callbr in the AMDGPU backend.
-  // Parent and child regions are not affected by this (current) restriction.
-  // See `llvm/test/Transforms/StructurizeCFG/callbr.ll` for details.
- if (R->isTopLevelRegion() || isa<CallBrInst>(R->getEntry()->getTerminator())) + if (R->isTopLevelRegion()) return false; this->DT = DT; this->TTI = TTI; Func = R->getEntry()->getParent(); + assert(hasOnlySimpleTerminator(*Func) && "Unsupported block terminator."); ParentRegion = R; diff --git a/llvm/test/CodeGen/AMDGPU/callbr.ll b/llvm/test/CodeGen/AMDGPU/callbr.ll deleted file mode 100644 index 253a6ec100eae..0000000000000 --- a/llvm/test/CodeGen/AMDGPU/callbr.ll +++ /dev/null @@ -1,54 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck %s - -define void @callbr_inline_asm(ptr %src, ptr %dst1, ptr %dst2, i32 %c) { -; CHECK-LABEL: callbr_inline_asm: -; CHECK: ; %bb.0: -; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: flat_load_dword v0, v[0:1] -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: v_cmp_gt_i32 vcc v6, 42; s_cbranch_vccnz .LBB0_2 -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: ; %bb.1: ; %fallthrough -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: flat_store_dword v[2:3], v0 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_setpc_b64 s[30:31] -; CHECK-NEXT: .LBB0_2: ; Inline asm indirect target -; CHECK-NEXT: ; %indirect -; CHECK-NEXT: ; Label of block must be emitted -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: flat_store_dword v[4:5], v0 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_setpc_b64 s[30:31] - %a = load i32, ptr %src, align 4 - callbr void asm "v_cmp_gt_i32 vcc $0, 42; s_cbranch_vccnz ${1:l}", "r,!i"(i32 %c) to label %fallthrough [label %indirect] -fallthrough: - store i32 %a, ptr %dst1, align 4 - br label %ret -indirect: - store i32 %a, ptr %dst2, align 4 - br label %ret -ret: - ret void -} - -define void @callbr_self_loop(i1 %c) { -; CHECK-LABEL: callbr_self_loop: -; CHECK: ; %bb.0: -; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: .LBB1_1: ; %callbr -; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: s_branch .LBB1_1 -; CHECK-NEXT: .LBB1_2: ; Inline asm indirect target -; CHECK-NEXT: ; %callbr.target.ret -; CHECK-NEXT: ; Label of block must be emitted -; CHECK-NEXT: s_setpc_b64 s[30:31] - br label %callbr -callbr: - callbr void asm "", "!i"() to label %callbr [label %ret] -ret: - ret void -} diff --git a/llvm/test/CodeGen/AMDGPU/do-not-unify-divergent-exit-nodes-with-musttail.ll b/llvm/test/CodeGen/AMDGPU/do-not-unify-divergent-exit-nodes-with-musttail.ll index 076a99ff8588f..007e3f0a6bdbc 100644 --- a/llvm/test/CodeGen/AMDGPU/do-not-unify-divergent-exit-nodes-with-musttail.ll +++ b/llvm/test/CodeGen/AMDGPU/do-not-unify-divergent-exit-nodes-with-musttail.ll @@ -3,7 +3,6 @@ declare void @foo(ptr) declare i1 @bar(ptr) -declare i32 @bar32(ptr) define void @musttail_call_without_return_value(ptr %p) { ; CHECK-LABEL: define void @musttail_call_without_return_value( @@ -29,31 +28,6 @@ bb.1: ret void } -define void @musttail_call_without_return_value_callbr(ptr %p) { -; CHECK-LABEL: define void @musttail_call_without_return_value_callbr( -; CHECK-SAME: ptr [[P:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[LOAD:%.*]] = load i32, ptr [[P]], align 1 -; CHECK-NEXT: callbr void asm "", "r,!i"(i32 [[LOAD]]) -; CHECK-NEXT: to label %[[BB_0:.*]] [label %bb.1] -; CHECK: [[BB_0]]: -; CHECK-NEXT: musttail call void @foo(ptr [[P]]) -; CHECK-NEXT: ret void -; CHECK: [[BB_1:.*:]] -; CHECK-NEXT: ret void -; 
-entry: - %load = load i32, ptr %p, align 1 - callbr void asm "", "r,!i"(i32 %load) to label %bb.0 [label %bb.1] - -bb.0: - musttail call void @foo(ptr %p) - ret void - -bb.1: - ret void -} - define i1 @musttail_call_with_return_value(ptr %p) { ; CHECK-LABEL: define i1 @musttail_call_with_return_value( ; CHECK-SAME: ptr [[P:%.*]]) #[[ATTR0]] { @@ -77,28 +51,3 @@ bb.0: bb.1: ret i1 %load } - -define i32 @musttail_call_with_return_value_callbr(ptr %p) { -; CHECK-LABEL: define i32 @musttail_call_with_return_value_callbr( -; CHECK-SAME: ptr [[P:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[LOAD:%.*]] = load i32, ptr [[P]], align 1 -; CHECK-NEXT: callbr void asm "", "r,!i"(i32 [[LOAD]]) -; CHECK-NEXT: to label %[[BB_0:.*]] [label %bb.1] -; CHECK: [[BB_0]]: -; CHECK-NEXT: [[RET:%.*]] = musttail call i32 @bar32(ptr [[P]]) -; CHECK-NEXT: ret i32 [[RET]] -; CHECK: [[BB_1:.*:]] -; CHECK-NEXT: ret i32 [[LOAD]] -; -entry: - %load = load i32, ptr %p, align 1 - callbr void asm "", "r,!i"(i32 %load) to label %bb.0 [label %bb.1] - -bb.0: - %ret = musttail call i32 @bar32(ptr %p) - ret i32 %ret - -bb.1: - ret i32 %load -} diff --git a/llvm/test/CodeGen/AMDGPU/infinite-loop.ll b/llvm/test/CodeGen/AMDGPU/infinite-loop.ll index df635925b87df..3e2e43faca5aa 100644 --- a/llvm/test/CodeGen/AMDGPU/infinite-loop.ll +++ b/llvm/test/CodeGen/AMDGPU/infinite-loop.ll @@ -36,60 +36,26 @@ loop: br label %loop } -define amdgpu_kernel void @infinite_loop_callbr(ptr addrspace(1) %out) { -; SI-LABEL: infinite_loop_callbr: -; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 -; SI-NEXT: ;;#ASMSTART -; SI-NEXT: ;;#ASMEND -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: v_mov_b32_e32 v0, 0x3e7 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_endpgm -; IR-LABEL: @infinite_loop_callbr( -; IR-NEXT: entry: -; IR-NEXT: callbr void asm "", ""() -; IR-NEXT: to label [[LOOP:%.*]] [] -; IR: loop: -; IR-NEXT: store volatile i32 999, ptr addrspace(1) [[OUT:%.*]], align 4 -; IR-NEXT: br i1 true, label [[TRANSITIONBLOCK:%.*]], label [[DUMMYRETURNBLOCK:%.*]] -; IR: TransitionBlock: -; IR-NEXT: callbr void asm "", ""() -; IR-NEXT: to label [[LOOP]] [] -; IR: DummyReturnBlock: -; IR-NEXT: ret void -; -entry: - callbr void asm "", ""() to label %loop [] - -loop: - store volatile i32 999, ptr addrspace(1) %out, align 4 - callbr void asm "", ""() to label %loop [] -} - define amdgpu_kernel void @infinite_loop_ret(ptr addrspace(1) %out) { ; SI-LABEL: infinite_loop_ret: ; SI: ; %bb.0: ; %entry ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; SI-NEXT: s_and_saveexec_b64 s[0:1], vcc -; SI-NEXT: s_cbranch_execz .LBB2_3 +; SI-NEXT: s_cbranch_execz .LBB1_3 ; SI-NEXT: ; %bb.1: ; %loop.preheader ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x3e7 ; SI-NEXT: s_and_b64 vcc, exec, -1 -; SI-NEXT: .LBB2_2: ; %loop +; SI-NEXT: .LBB1_2: ; %loop ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_mov_b64 vcc, vcc -; SI-NEXT: s_cbranch_vccnz .LBB2_2 -; SI-NEXT: .LBB2_3: ; %UnifiedReturnBlock +; SI-NEXT: s_cbranch_vccnz .LBB1_2 +; SI-NEXT: .LBB1_3: ; %UnifiedReturnBlock ; SI-NEXT: s_endpgm ; IR-LABEL: @infinite_loop_ret( ; IR-NEXT: entry: @@ -115,93 +81,44 @@ return: ret void } -define amdgpu_kernel void 
@infinite_loop_ret_callbr(ptr addrspace(1) %out) { -; SI-LABEL: infinite_loop_ret_callbr: -; SI: ; %bb.0: ; %entry -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; SI-NEXT: ;;#ASMSTART -; SI-NEXT: ;;#ASMEND -; SI-NEXT: ; %bb.1: ; %loop.preheader -; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: v_mov_b32_e32 v0, 0x3e7 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: .LBB3_2: ; Inline asm indirect target -; SI-NEXT: ; %UnifiedReturnBlock -; SI-NEXT: ; Label of block must be emitted -; SI-NEXT: s_endpgm -; IR-LABEL: @infinite_loop_ret_callbr( -; IR-NEXT: entry: -; IR-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() -; IR-NEXT: [[COND:%.*]] = icmp eq i32 [[TMP]], 1 -; IR-NEXT: [[COND32:%.*]] = zext i1 [[COND]] to i32 -; IR-NEXT: callbr void asm "", "r,!i"(i32 [[COND32]]) -; IR-NEXT: to label [[LOOP:%.*]] [label %UnifiedReturnBlock] -; IR: loop: -; IR-NEXT: store volatile i32 999, ptr addrspace(1) [[OUT:%.*]], align 4 -; IR-NEXT: br i1 true, label [[TRANSITIONBLOCK:%.*]], label [[UNIFIEDRETURNBLOCK:%.*]] -; IR: TransitionBlock: -; IR-NEXT: callbr void asm "", ""() -; IR-NEXT: to label [[LOOP]] [] -; IR: UnifiedReturnBlock: -; IR-NEXT: ret void -; -entry: - %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() - %cond = icmp eq i32 %tmp, 1 - %cond32 = zext i1 %cond to i32 - callbr void asm "", "r,!i"(i32 %cond32) to label %loop [label %return] - -loop: - store volatile i32 999, ptr addrspace(1) %out, align 4 - callbr void asm "", ""() to label %loop [] - -return: - ret void -} - define amdgpu_kernel void @infinite_loops(ptr addrspace(1) %out) { ; SI-LABEL: infinite_loops: ; SI: ; %bb.0: ; %entry ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b64 s[2:3], -1 -; SI-NEXT: s_cbranch_scc1 .LBB4_4 +; SI-NEXT: s_cbranch_scc1 .LBB2_4 ; SI-NEXT: ; %bb.1: ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x378 ; SI-NEXT: s_and_b64 vcc, exec, -1 -; SI-NEXT: .LBB4_2: ; %loop2 +; SI-NEXT: .LBB2_2: ; %loop2 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_mov_b64 vcc, vcc -; SI-NEXT: s_cbranch_vccnz .LBB4_2 +; SI-NEXT: s_cbranch_vccnz .LBB2_2 ; SI-NEXT: ; %bb.3: ; %Flow ; SI-NEXT: s_mov_b64 s[2:3], 0 -; SI-NEXT: .LBB4_4: ; %Flow2 +; SI-NEXT: .LBB2_4: ; %Flow2 ; SI-NEXT: s_and_b64 vcc, exec, s[2:3] ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b64 vcc, vcc -; SI-NEXT: s_cbranch_vccz .LBB4_7 +; SI-NEXT: s_cbranch_vccz .LBB2_7 ; SI-NEXT: ; %bb.5: ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, 0x3e7 ; SI-NEXT: s_and_b64 vcc, exec, 0 -; SI-NEXT: .LBB4_6: ; %loop1 +; SI-NEXT: .LBB2_6: ; %loop1 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_mov_b64 vcc, vcc -; SI-NEXT: s_cbranch_vccz .LBB4_6 -; SI-NEXT: .LBB4_7: ; %DummyReturnBlock +; SI-NEXT: s_cbranch_vccz .LBB2_6 +; SI-NEXT: .LBB2_7: ; %DummyReturnBlock ; SI-NEXT: s_endpgm ; IR-LABEL: @infinite_loops( ; IR-NEXT: entry: @@ -227,78 +144,24 @@ loop2: br label %loop2 } -define amdgpu_kernel void @infinite_loops_callbr(ptr addrspace(1) %out) { -; SI-LABEL: infinite_loops_callbr: -; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[0:1], 
s[4:5], 0x9 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: ;;#ASMSTART -; SI-NEXT: ;;#ASMEND -; SI-NEXT: ; %bb.1: ; %loop1 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: v_mov_b32_e32 v0, 0x3e7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_endpgm -; SI-NEXT: .LBB5_2: ; Inline asm indirect target -; SI-NEXT: ; %loop2.preheader -; SI-NEXT: ; Label of block must be emitted -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: v_mov_b32_e32 v0, 0x378 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_endpgm -; IR-LABEL: @infinite_loops_callbr( -; IR-NEXT: entry: -; IR-NEXT: callbr void asm "", "r,!i"(i32 poison) -; IR-NEXT: to label [[LOOP1:%.*]] [label %loop2] -; IR: loop1: -; IR-NEXT: store volatile i32 999, ptr addrspace(1) [[OUT:%.*]], align 4 -; IR-NEXT: br i1 true, label [[TRANSITIONBLOCK:%.*]], label [[DUMMYRETURNBLOCK:%.*]] -; IR: TransitionBlock: -; IR-NEXT: callbr void asm "", ""() -; IR-NEXT: to label [[LOOP1]] [] -; IR: loop2: -; IR-NEXT: store volatile i32 888, ptr addrspace(1) [[OUT]], align 4 -; IR-NEXT: br i1 true, label [[TRANSITIONBLOCK1:%.*]], label [[DUMMYRETURNBLOCK]] -; IR: TransitionBlock1: -; IR-NEXT: callbr void asm "", ""() -; IR-NEXT: to label [[LOOP2:%.*]] [] -; IR: DummyReturnBlock: -; IR-NEXT: ret void -; -entry: - callbr void asm "", "r,!i"(i32 poison) to label %loop1 [label %loop2] - -loop1: - store volatile i32 999, ptr addrspace(1) %out, align 4 - callbr void asm "", ""() to label %loop1 [] - -loop2: - store volatile i32 888, ptr addrspace(1) %out, align 4 - callbr void asm "", ""() to label %loop2 [] -} - define amdgpu_kernel void @infinite_loop_nest_ret(ptr addrspace(1) %out) { ; SI-LABEL: infinite_loop_nest_ret: ; SI: ; %bb.0: ; %entry ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 1, v0 ; SI-NEXT: s_and_saveexec_b64 s[0:1], vcc -; SI-NEXT: s_cbranch_execz .LBB6_5 +; SI-NEXT: s_cbranch_execz .LBB3_5 ; SI-NEXT: ; %bb.1: ; %outer_loop.preheader ; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 ; SI-NEXT: v_cmp_ne_u32_e64 s[0:1], 3, v0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x3e7 -; SI-NEXT: .LBB6_2: ; %outer_loop +; SI-NEXT: .LBB3_2: ; %outer_loop ; SI-NEXT: ; =>This Loop Header: Depth=1 -; SI-NEXT: ; Child Loop BB6_3 Depth 2 +; SI-NEXT: ; Child Loop BB3_3 Depth 2 ; SI-NEXT: s_mov_b64 s[2:3], 0 -; SI-NEXT: .LBB6_3: ; %inner_loop -; SI-NEXT: ; Parent Loop BB6_2 Depth=1 +; SI-NEXT: .LBB3_3: ; %inner_loop +; SI-NEXT: ; Parent Loop BB3_2 Depth=1 ; SI-NEXT: ; => This Inner Loop Header: Depth=2 ; SI-NEXT: s_and_b64 s[8:9], exec, s[0:1] ; SI-NEXT: s_or_b64 s[2:3], s[8:9], s[2:3] @@ -306,13 +169,13 @@ define amdgpu_kernel void @infinite_loop_nest_ret(ptr addrspace(1) %out) { ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_andn2_b64 exec, exec, s[2:3] -; SI-NEXT: s_cbranch_execnz .LBB6_3 +; SI-NEXT: s_cbranch_execnz .LBB3_3 ; SI-NEXT: ; %bb.4: ; %loop.exit.guard -; SI-NEXT: ; in Loop: Header=BB6_2 Depth=1 +; SI-NEXT: ; in Loop: Header=BB3_2 Depth=1 ; SI-NEXT: s_or_b64 exec, exec, s[2:3] ; SI-NEXT: s_mov_b64 vcc, 0 -; SI-NEXT: s_branch .LBB6_2 -; SI-NEXT: .LBB6_5: ; %UnifiedReturnBlock +; SI-NEXT: s_branch .LBB3_2 +; SI-NEXT: .LBB3_5: ; %UnifiedReturnBlock ; SI-NEXT: s_endpgm ; IR-LABEL: @infinite_loop_nest_ret( ; IR-NEXT: entry: @@ -349,82 +212,4 @@ return: ret void } -define amdgpu_kernel void @infinite_loop_nest_ret_callbr(ptr addrspace(1) %out) { -; 
SI-LABEL: infinite_loop_nest_ret_callbr: -; SI: ; %bb.0: ; %entry -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 1, v0 -; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; SI-NEXT: ;;#ASMSTART -; SI-NEXT: ;;#ASMEND -; SI-NEXT: ; %bb.1: ; %outer_loop.preheader -; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v0, 0x3e7 -; SI-NEXT: s_and_b64 s[0:1], exec, 0 -; SI-NEXT: s_branch .LBB7_3 -; SI-NEXT: .LBB7_2: ; %loop.exit.guard -; SI-NEXT: ; in Loop: Header=BB7_3 Depth=1 -; SI-NEXT: s_and_b64 vcc, exec, s[2:3] -; SI-NEXT: s_cbranch_vccnz .LBB7_5 -; SI-NEXT: .LBB7_3: ; %outer_loop -; SI-NEXT: ; =>This Inner Loop Header: Depth=1 -; SI-NEXT: ;;#ASMSTART -; SI-NEXT: ;;#ASMEND -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_mov_b64 s[2:3], -1 -; SI-NEXT: s_mov_b64 vcc, s[0:1] -; SI-NEXT: s_cbranch_vccz .LBB7_2 -; SI-NEXT: ; %bb.4: ; %TransitionBlock.target.outer_loop -; SI-NEXT: ; in Loop: Header=BB7_3 Depth=1 -; SI-NEXT: s_mov_b64 s[2:3], 0 -; SI-NEXT: s_branch .LBB7_2 -; SI-NEXT: .LBB7_5: ; Inline asm indirect target -; SI-NEXT: ; %UnifiedReturnBlock -; SI-NEXT: ; Label of block must be emitted -; SI-NEXT: s_endpgm -; IR-LABEL: @infinite_loop_nest_ret_callbr( -; IR-NEXT: entry: -; IR-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() -; IR-NEXT: [[COND1:%.*]] = icmp ne i32 [[TMP]], 1 -; IR-NEXT: [[COND1_32:%.*]] = zext i1 [[COND1]] to i32 -; IR-NEXT: callbr void asm "", "r,!i"(i32 [[COND1_32]]) -; IR-NEXT: to label [[OUTER_LOOP:%.*]] [label %UnifiedReturnBlock] -; IR: outer_loop: -; IR-NEXT: callbr void asm "", ""() -; IR-NEXT: to label [[INNER_LOOP:%.*]] [] -; IR: inner_loop: -; IR-NEXT: store volatile i32 999, ptr addrspace(1) [[OUT:%.*]], align 4 -; IR-NEXT: [[COND3:%.*]] = icmp eq i32 [[TMP]], 3 -; IR-NEXT: [[COND3_32:%.*]] = zext i1 [[COND3]] to i32 -; IR-NEXT: br i1 true, label [[TRANSITIONBLOCK:%.*]], label [[UNIFIEDRETURNBLOCK:%.*]] -; IR: TransitionBlock: -; IR-NEXT: callbr void asm "", "r,!i"(i32 [[COND3_32]]) -; IR-NEXT: to label [[INNER_LOOP]] [label %outer_loop] -; IR: UnifiedReturnBlock: -; IR-NEXT: ret void -; -entry: - %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() - %cond1 = icmp ne i32 %tmp, 1 ; avoid following BB optimizing away through the domination - %cond1_32 = zext i1 %cond1 to i32 - callbr void asm "", "r,!i"(i32 %cond1_32) to label %outer_loop [label %return] - -outer_loop: - ; %cond2 = icmp eq i32 %tmp, 2 - ; br i1 %cond2, label %outer_loop, label %inner_loop - callbr void asm "", ""() to label %inner_loop [] - -inner_loop: ; preds = %LeafBlock, %LeafBlock1 - store volatile i32 999, ptr addrspace(1) %out, align 4 - %cond3 = icmp eq i32 %tmp, 3 - %cond3_32 = zext i1 %cond3 to i32 - callbr void asm "", "r,!i"(i32 %cond3_32) to label %inner_loop [label %outer_loop] - -return: - ret void -} - declare i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/si-annotate-nested-control-flows.ll b/llvm/test/CodeGen/AMDGPU/si-annotate-nested-control-flows.ll index 01bcdad3fc220..34de1e48bfb59 100644 --- a/llvm/test/CodeGen/AMDGPU/si-annotate-nested-control-flows.ll +++ b/llvm/test/CodeGen/AMDGPU/si-annotate-nested-control-flows.ll @@ -3,16 +3,15 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa %s -o - | FileCheck %s --check-prefix=ISA define void @nested_inf_loop(i1 %0, i1 %1) { -; OPT-LABEL: define void @nested_inf_loop( -; OPT-SAME: i1 [[TMP0:%.*]], i1 [[TMP1:%.*]]) { -; OPT-NEXT: [[BB:.*:]] -; OPT-NEXT: br label 
%[[BB1:.*]] -; OPT: [[BB1]]: -; OPT-NEXT: [[BRMERGE:%.*]] = select i1 [[TMP0]], i1 true, i1 [[TMP1]] -; OPT-NEXT: br i1 [[BRMERGE]], label %[[BB1]], label %[[INFLOOP:.*]] -; OPT: [[INFLOOP]]: -; OPT-NEXT: br i1 true, label %[[INFLOOP]], label %[[DUMMYRETURNBLOCK:.*]] -; OPT: [[DUMMYRETURNBLOCK]]: +; OPT-LABEL: @nested_inf_loop( +; OPT-NEXT: BB: +; OPT-NEXT: br label [[BB1:%.*]] +; OPT: BB1: +; OPT-NEXT: [[BRMERGE:%.*]] = select i1 [[TMP0:%.*]], i1 true, i1 [[TMP1:%.*]] +; OPT-NEXT: br i1 [[BRMERGE]], label [[BB1]], label [[INFLOOP:%.*]] +; OPT: infloop: +; OPT-NEXT: br i1 true, label [[INFLOOP]], label [[DUMMYRETURNBLOCK:%.*]] +; OPT: DummyReturnBlock: ; OPT-NEXT: ret void ; ; ISA-LABEL: nested_inf_loop: @@ -64,84 +63,3 @@ BB4: BB3: br label %BB1 } - -define void @nested_inf_loop_callbr(i32 %0, i32 %1) { -; OPT-LABEL: define void @nested_inf_loop_callbr( -; OPT-SAME: i32 [[TMP0:%.*]], i32 [[TMP1:%.*]]) { -; OPT-NEXT: [[BB:.*:]] -; OPT-NEXT: callbr void asm "", ""() -; OPT-NEXT: to label %[[BB1:.*]] [] -; OPT: [[BB1]]: -; OPT-NEXT: callbr void asm "", "r,!i"(i32 [[TMP0]]) -; OPT-NEXT: to label %[[BB3:.*]] [label %BB2] -; OPT: [[BB2:.*:]] -; OPT-NEXT: callbr void asm "", ""() -; OPT-NEXT: to label %[[BB4:.*]] [] -; OPT: [[BB4]]: -; OPT-NEXT: br i1 true, label %[[TRANSITIONBLOCK:.*]], label %[[DUMMYRETURNBLOCK:.*]] -; OPT: [[TRANSITIONBLOCK]]: -; OPT-NEXT: callbr void asm "", "r,!i"(i32 [[TMP1]]) -; OPT-NEXT: to label %[[BB3]] [label %BB4] -; OPT: [[BB3]]: -; OPT-NEXT: callbr void asm "", ""() -; OPT-NEXT: to label %[[BB1]] [] -; OPT: [[DUMMYRETURNBLOCK]]: -; OPT-NEXT: ret void -; -; ISA-LABEL: nested_inf_loop_callbr: -; ISA: ; %bb.0: ; %BB -; ISA-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; ISA-NEXT: ;;#ASMSTART -; ISA-NEXT: ;;#ASMEND -; ISA-NEXT: ; implicit-def: $sgpr6_sgpr7 -; ISA-NEXT: ; implicit-def: $sgpr4_sgpr5 -; ISA-NEXT: .LBB1_1: ; %BB1 -; ISA-NEXT: ; =>This Inner Loop Header: Depth=1 -; ISA-NEXT: ;;#ASMSTART -; ISA-NEXT: ;;#ASMEND -; ISA-NEXT: s_andn2_b64 s[6:7], s[6:7], exec -; ISA-NEXT: s_and_b64 s[8:9], s[4:5], exec -; ISA-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] -; ISA-NEXT: .LBB1_2: ; %BB3 -; ISA-NEXT: ; in Loop: Header=BB1_1 Depth=1 -; ISA-NEXT: ;;#ASMSTART -; ISA-NEXT: ;;#ASMEND -; ISA-NEXT: s_andn2_b64 s[4:5], s[4:5], exec -; ISA-NEXT: s_and_b64 s[8:9], s[6:7], exec -; ISA-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] -; ISA-NEXT: s_branch .LBB1_1 -; ISA-NEXT: .LBB1_3: ; Inline asm indirect target -; ISA-NEXT: ; %BB2 -; ISA-NEXT: ; in Loop: Header=BB1_1 Depth=1 -; ISA-NEXT: ; Label of block must be emitted -; ISA-NEXT: ;;#ASMSTART -; ISA-NEXT: ;;#ASMEND -; ISA-NEXT: s_mov_b64 s[6:7], -1 -; ISA-NEXT: s_and_saveexec_b64 s[8:9], s[4:5] -; ISA-NEXT: s_cbranch_execz .LBB1_5 -; ISA-NEXT: ; %bb.4: ; %TransitionBlock.target.BB3 -; ISA-NEXT: ; in Loop: Header=BB1_1 Depth=1 -; ISA-NEXT: s_xor_b64 s[6:7], exec, -1 -; ISA-NEXT: .LBB1_5: ; %loop.exit.guard -; ISA-NEXT: ; in Loop: Header=BB1_1 Depth=1 -; ISA-NEXT: s_or_b64 exec, exec, s[8:9] -; ISA-NEXT: s_and_b64 vcc, exec, s[6:7] -; ISA-NEXT: s_mov_b64 s[6:7], 0 -; ISA-NEXT: s_cbranch_vccz .LBB1_2 -; ISA-NEXT: ; %bb.6: ; %DummyReturnBlock -; ISA-NEXT: s_setpc_b64 s[30:31] -BB: - callbr void asm "", ""() to label %BB1 [] - -BB1: - callbr void asm "", "r,!i"(i32 %0) to label %BB3 [label %BB2] - -BB2: - callbr void asm "", ""() to label %BB4 [] - -BB4: - callbr void asm "", "r,!i"(i32 %1) to label %BB3 [label %BB4] - -BB3: - callbr void asm "", ""() to label %BB1 [] -} diff --git a/llvm/test/CodeGen/AMDGPU/si-unify-exit-multiple-unreachables.ll 
b/llvm/test/CodeGen/AMDGPU/si-unify-exit-multiple-unreachables.ll index 004c27971131d..4cbe682cf9f9f 100644 --- a/llvm/test/CodeGen/AMDGPU/si-unify-exit-multiple-unreachables.ll +++ b/llvm/test/CodeGen/AMDGPU/si-unify-exit-multiple-unreachables.ll @@ -1,5 +1,5 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -stop-after=amdgpu-unify-divergent-exit-nodes | FileCheck %s --check-prefix=UNIFY +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -stop-after=amdgpu-unify-divergent-exit-nodes | FileCheck %s --check-prefix=UNIFY ; RUN: llc < %s -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 | FileCheck %s declare void @llvm.trap() @@ -70,33 +70,8 @@ define amdgpu_kernel void @kernel(i32 %a, ptr addrspace(1) %x, i32 noundef %n) { ; CHECK-NEXT: s_mov_b64 s[2:3], -1 ; CHECK-NEXT: s_trap 2 ; CHECK-NEXT: s_branch .LBB0_4 -; UNIFY-LABEL: @kernel( -; UNIFY-NEXT: entry: -; UNIFY-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() -; UNIFY-NEXT: [[CMP:%.*]] = icmp eq i32 [[N:%.*]], 256 -; UNIFY-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] -; UNIFY: if.then: -; UNIFY-NEXT: [[CMP1:%.*]] = icmp eq i32 [[A:%.*]], 0 -; UNIFY-NEXT: br i1 [[CMP1]], label [[IF_END6_SINK_SPLIT:%.*]], label [[COND_FALSE:%.*]] -; UNIFY: cond.false: -; UNIFY-NEXT: call void @llvm.trap() -; UNIFY-NEXT: unreachable -; UNIFY: if.else: -; UNIFY-NEXT: [[CMP2:%.*]] = icmp ult i32 [[TID]], 10 -; UNIFY-NEXT: br i1 [[CMP2]], label [[IF_THEN3:%.*]], label [[IF_END6:%.*]] -; UNIFY: if.then3: -; UNIFY-NEXT: [[CMP1_I7:%.*]] = icmp eq i32 [[A]], 0 -; UNIFY-NEXT: br i1 [[CMP1_I7]], label [[IF_END6_SINK_SPLIT]], label [[COND_FALSE_I8:%.*]] -; UNIFY: cond.false.i8: -; UNIFY-NEXT: call void @llvm.trap() -; UNIFY-NEXT: unreachable -; UNIFY: if.end6.sink.split: -; UNIFY-NEXT: [[X1:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[X:%.*]], i32 [[TID]] -; UNIFY-NEXT: store i32 [[A]], ptr addrspace(1) [[X1]], align 4 -; UNIFY-NEXT: br label [[IF_END6]] -; UNIFY: if.end6: -; UNIFY-NEXT: ret void -; + + entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %cmp = icmp eq i32 %n, 256 @@ -130,129 +105,5 @@ if.end6.sink.split: if.end6: ret void } - -define amdgpu_kernel void @kernel_callbr(i32 %a, ptr addrspace(1) %x, i32 noundef %n) { -; CHECK-LABEL: kernel_callbr: -; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: s_load_dword s1, s[8:9], 0x10 -; CHECK-NEXT: s_load_dword s0, s[8:9], 0x0 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_cmpk_eq_i32 s1, 0x100 -; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0 -; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[2:3] -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: ; %bb.1: ; %if.then -; CHECK-NEXT: s_cmp_eq_u32 s0, 0 -; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0 -; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[2:3] -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: .LBB1_2: ; %if.end6.sink.split -; CHECK-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x8 -; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; CHECK-NEXT: v_mov_b32_e32 v1, s0 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: global_store_dword v0, v1, s[2:3] -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: .LBB1_3: ; Inline asm indirect target -; CHECK-NEXT: ; %UnifiedReturnBlock -; CHECK-NEXT: ; Label of block must be emitted -; CHECK-NEXT: s_endpgm -; CHECK-NEXT: .LBB1_4: ; Inline asm indirect target -; CHECK-NEXT: ; %if.else -; CHECK-NEXT: ; Label 
of block must be emitted -; CHECK-NEXT: v_cmp_gt_u32_e32 vcc, 10, v0 -; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: ; %bb.5: ; %if.then3 -; CHECK-NEXT: s_cmp_eq_u32 s0, 0 -; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0 -; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[2:3] -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: s_branch .LBB1_2 -; CHECK-NEXT: .LBB1_6: ; Inline asm indirect target -; CHECK-NEXT: ; %cond.false.i8 -; CHECK-NEXT: ; Label of block must be emitted -; CHECK-NEXT: .LBB1_7: ; Inline asm indirect target -; CHECK-NEXT: ; %cond.false -; CHECK-NEXT: ; Label of block must be emitted -; CHECK-NEXT: s_trap 2 -; CHECK-NEXT: ; divergent unreachable -; CHECK-NEXT: s_branch .LBB1_3 -; UNIFY-LABEL: @kernel_callbr( -; UNIFY-NEXT: entry: -; UNIFY-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() -; UNIFY-NEXT: [[CMP:%.*]] = icmp eq i32 [[N:%.*]], 256 -; UNIFY-NEXT: [[CMP32:%.*]] = zext i1 [[CMP]] to i32 -; UNIFY-NEXT: callbr void asm "", "r,!i"(i32 [[CMP32]]) -; UNIFY-NEXT: to label [[IF_THEN:%.*]] [label %if.else] -; UNIFY: if.then: -; UNIFY-NEXT: [[CMP1:%.*]] = icmp eq i32 [[A:%.*]], 0 -; UNIFY-NEXT: [[CMP1_32:%.*]] = zext i1 [[CMP1]] to i32 -; UNIFY-NEXT: callbr void asm "", "r,!i"(i32 [[CMP1_32]]) -; UNIFY-NEXT: to label [[IF_END6_SINK_SPLIT:%.*]] [label %cond.false] -; UNIFY: cond.false: -; UNIFY-NEXT: call void @llvm.trap() -; UNIFY-NEXT: unreachable -; UNIFY: if.else: -; UNIFY-NEXT: [[CMP2:%.*]] = icmp ult i32 [[TID]], 10 -; UNIFY-NEXT: [[CMP2_32:%.*]] = zext i1 [[CMP2]] to i32 -; UNIFY-NEXT: callbr void asm "", "r,!i"(i32 [[CMP2_32]]) -; UNIFY-NEXT: to label [[IF_THEN3:%.*]] [label %if.end6] -; UNIFY: if.then3: -; UNIFY-NEXT: [[CMP1_I7:%.*]] = icmp eq i32 [[A]], 0 -; UNIFY-NEXT: [[CMP1_I7_32:%.*]] = zext i1 [[CMP1_I7]] to i32 -; UNIFY-NEXT: callbr void asm "", "r,!i"(i32 [[CMP1_I7_32]]) -; UNIFY-NEXT: to label [[IF_END6_SINK_SPLIT]] [label %cond.false.i8] -; UNIFY: cond.false.i8: -; UNIFY-NEXT: call void @llvm.trap() -; UNIFY-NEXT: unreachable -; UNIFY: if.end6.sink.split: -; UNIFY-NEXT: [[X1:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[X:%.*]], i32 [[TID]] -; UNIFY-NEXT: store i32 [[A]], ptr addrspace(1) [[X1]], align 4 -; UNIFY-NEXT: callbr void asm "", ""() -; UNIFY-NEXT: to label [[IF_END6:%.*]] [] -; UNIFY: if.end6: -; UNIFY-NEXT: ret void -; -entry: - %tid = call i32 @llvm.amdgcn.workitem.id.x() - %cmp = icmp eq i32 %n, 256 - %cmp32 = zext i1 %cmp to i32 - callbr void asm "", "r,!i"(i32 %cmp32) to label %if.then [label %if.else] - -if.then: - %cmp1 = icmp eq i32 %a, 0 - %cmp1_32 = zext i1 %cmp1 to i32 - callbr void asm "", "r,!i"(i32 %cmp1_32) to label %if.end6.sink.split [label %cond.false] - -cond.false: - call void @llvm.trap() - unreachable - -if.else: - %cmp2 = icmp ult i32 %tid, 10 - %cmp2_32 = zext i1 %cmp2 to i32 - callbr void asm "", "r,!i"(i32 %cmp2_32) to label %if.then3 [label %if.end6] - -if.then3: - %cmp1.i7 = icmp eq i32 %a, 0 - %cmp1.i7_32 = zext i1 %cmp1.i7 to i32 - callbr void asm "", "r,!i"(i32 %cmp1.i7_32) to label %if.end6.sink.split [label %cond.false.i8] - -cond.false.i8: - call void @llvm.trap() - unreachable - -if.end6.sink.split: - %x1 = getelementptr inbounds i32, ptr addrspace(1) %x, i32 %tid - store i32 %a, ptr addrspace(1) %x1, align 4 - callbr void asm "", ""() to label %if.end6 [] - -if.end6: - ret void -} +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; UNIFY: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/update-phi.ll b/llvm/test/CodeGen/AMDGPU/update-phi.ll index 684dc1a1f0092..50666bee325e8 100644 --- a/llvm/test/CodeGen/AMDGPU/update-phi.ll +++ b/llvm/test/CodeGen/AMDGPU/update-phi.ll @@ -37,42 +37,3 @@ n28: ; preds = %.loopexit, %n28 n31: ; preds = ret void } - -define amdgpu_ps void @_amdgpu_ps_main_callbr() local_unnamed_addr #3 { -; IR-LABEL: @_amdgpu_ps_main_callbr( -; IR-NEXT: .entry: -; IR-NEXT: callbr void asm "", ""() -; IR-NEXT: to label [[DOTLOOPEXIT:%.*]] [] -; IR: .loopexit: -; IR-NEXT: callbr void asm "", ""() -; IR-NEXT: to label [[N28:%.*]] [] -; IR: n28: -; IR-NEXT: [[DOT01:%.*]] = phi float [ 0.000000e+00, [[DOTLOOPEXIT]] ], [ [[N29:%.*]], [[TRANSITIONBLOCK:%.*]] ] -; IR-NEXT: [[N29]] = fadd float [[DOT01]], 1.000000e+00 -; IR-NEXT: [[N30:%.*]] = fcmp ogt float [[N29]], 4.000000e+00 -; IR-NEXT: [[N30_32:%.*]] = zext i1 [[N30]] to i32 -; IR-NEXT: br i1 true, label [[TRANSITIONBLOCK]], label [[DUMMYRETURNBLOCK:%.*]] -; IR: TransitionBlock: -; IR-NEXT: callbr void asm "", "r,!i"(i32 [[N30_32]]) -; IR-NEXT: to label [[DOTLOOPEXIT]] [label %n28] -; IR: n31: -; IR-NEXT: ret void -; IR: DummyReturnBlock: -; IR-NEXT: ret void -; -.entry: - callbr void asm "", ""() to label %.loopexit [] - -.loopexit: ; preds = %n28, %.entry - callbr void asm "", ""() to label %n28 [] - -n28: ; preds = %.loopexit, %n28 - %.01 = phi float [ 0.000000e+00, %.loopexit ], [ %n29, %n28 ] - %n29 = fadd float %.01, 1.0 - %n30 = fcmp ogt float %n29, 4.000000e+00 - %n30.32 = zext i1 %n30 to i32 - callbr void asm "", "r,!i"(i32 %n30.32) to label %.loopexit [label %n28] - -n31: ; preds = - ret void -} diff --git a/llvm/test/Transforms/StructurizeCFG/callbr.ll b/llvm/test/Transforms/StructurizeCFG/callbr.ll deleted file mode 100644 index 42f95194980d4..0000000000000 --- a/llvm/test/Transforms/StructurizeCFG/callbr.ll +++ /dev/null @@ -1,235 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 -; RUN: opt -S -passes=structurizecfg %s -o - | FileCheck %s - -; Structurize as usual, but don't tear callbr and its destination blocks apart. -; -; Note: currently, callbr blocks and their corresponding target blocks -; themselves are not handled by the structurizer.* If the CFG turns out to be -; unstructured at the end, the CFG lowering (si-annotate-control-flow) will -; detect this. For the currently intended use cases of callbr in the context of -; the AMDGPU backend, this is not a limitation (cf. -; https://discourse.llvm.org/t/rfc-add-callbr-intrinsic-support/86087). -; -; Note 2: while callbr and its targets remain untouched, everything else is -; handled as usual, even if it is nested in a callbr region. -; -; *FIXME: this will be fixed in the future. Callbr can be handled as follows: -; Input IR: -; ``` -; define void @foo_callbr() { -; callbr void asm "", "!i"() to label %fallthrough [label %indirect, ...] -; fallthrough: -; br label %exit -; indirect: -; br label %exit -; ... -; exit: -; ret void -; } -; ``` -; -; Output IR: -; ``` -; define void @foo_callbr() { -; callbr void asm "", "!i"() -; to label %fallthrough [label %fake.indirect, label %fake.indirect1, label %fake.indirect2, ...] -; fake.indirect: ; preds = %0 -; br label %Flow -; fake.indirect1: ; preds = %0 -; br label %Flow -; fake.indirect2: ; preds = %0 -; br label %Flow -; ... 
-; Flow: ; preds = %fallthrough, %fake.indirect[0-N] -; %1 = phi i1 [ false, %fallthrough ], [ true, %fake.indirect ], [ false, %fake.indirect[1-N] ] -; br i1 %1, label %indirect, label %Flow1 -; Flow1: ; preds = %Flow, %indirect -; %2 = phi i1 [ false, %Flow], [ true, %fake.indirect1 ], [ false, %indirect ] -; br i1 %2, label %indirect1, label %Flow2 -; Flow2: ; preds = %Flow, %indirect1 -; %2 = phi i1 [ false, %Flow], [ true, %fake.indirect2 ], [ false, %indirect1 ] -; br i1 %2, label %indirect2, label %Flow3 -; ... -; fallthrough: ; preds = %0 -; br label %Flow -; indirect: ; preds = %Flow -; br label %Flow1 -; indirect1: ; preds = %Flow1 -; br label %Flow2 -; indirect2: : preds = %Flow2 -; br label %Flow3 -; ... -; exit: ; preds = %indirectN, %FlowN -; ret void -; } -; ``` -; -; Output IR as ASCII-art: -; %0 -; --------------------- -; | | | | -; v v v v -; f f.i f.i1 f.i2 -; | | | | -; v v v v -; --------------------- -; %Flow -; | \ -; | %indirect -; | / -; %Flow1 -; | \ -; | %indirect1 -; | / -; %Flow2 -; | \ -; | %indirect2 -; | / -; %exit -; - -; Only callbr, nothing to do. -define void @callbr_simple() { -; CHECK-LABEL: define void @callbr_simple() { -; CHECK-NEXT: [[CALLBR:.*:]] -; CHECK-NEXT: callbr void asm "", "!i"() -; CHECK-NEXT: to label %[[INDIRECT:.*]] [label %indirect] -; CHECK: [[INDIRECT]]: -; CHECK-NEXT: br label %[[EXIT:.*]] -; CHECK: [[INDIRECT1:.*:]] -; CHECK-NEXT: br label %[[EXIT]] -; CHECK: [[EXIT]]: -; CHECK-NEXT: ret void -; -callbr: - callbr void asm "", "!i"() to label %fallthrough [label %indirect] -fallthrough: - br label %exit -indirect: - br label %exit -exit: - ret void -} - -; Callbr nested in non-callbr: non-callbr is transformed -define void @callbr_in_non_callbr(i1 %c) { -; CHECK-LABEL: define void @callbr_in_non_callbr( -; CHECK-SAME: i1 [[C:%.*]]) { -; CHECK-NEXT: [[C_INV:%.*]] = xor i1 [[C]], true -; CHECK-NEXT: br i1 [[C_INV]], label %[[NOCALLBR:.*]], label %[[FLOW:.*]] -; CHECK: [[FLOW]]: -; CHECK-NEXT: [[TMP1:%.*]] = phi i1 [ false, %[[NOCALLBR]] ], [ true, [[TMP0:%.*]] ] -; CHECK-NEXT: br i1 [[TMP1]], label %[[CALLBR:.*]], label %[[EXIT:.*]] -; CHECK: [[CALLBR]]: -; CHECK-NEXT: callbr void asm "", "!i"() -; CHECK-NEXT: to label %[[INDIRECT:.*]] [label %indirect] -; CHECK: [[INDIRECT]]: -; CHECK-NEXT: br label %[[EXIT]] -; CHECK: [[INDIRECT1:.*:]] -; CHECK-NEXT: br label %[[EXIT]] -; CHECK: [[NOCALLBR]]: -; CHECK-NEXT: br label %[[FLOW]] -; CHECK: [[EXIT]]: -; CHECK-NEXT: ret void -; - br i1 %c, label %callbr, label %nocallbr -callbr: - callbr void asm "", "!i"() to label %fallthrough [label %indirect] -fallthrough: - br label %exit -indirect: - br label %exit -nocallbr: - br label %exit -exit: - ret void -} - -; Callbr parent of non-callbr: non-callbr is transformed -define void @non_callbr_in_callbr(i1 %c) { -; CHECK-LABEL: define void @non_callbr_in_callbr( -; CHECK-SAME: i1 [[C:%.*]]) { -; CHECK-NEXT: [[C_INV:%.*]] = xor i1 [[C]], true -; CHECK-NEXT: callbr void asm "", "!i"() -; CHECK-NEXT: to label %[[INDIRECT:.*]] [label %indirect] -; CHECK: [[INDIRECT]]: -; CHECK-NEXT: br i1 [[C_INV]], label %[[FALLTHROUGH2:.*]], label %[[FLOW:.*]] -; CHECK: [[FLOW]]: -; CHECK-NEXT: [[TMP1:%.*]] = phi i1 [ false, %[[FALLTHROUGH2]] ], [ true, %[[INDIRECT]] ] -; CHECK-NEXT: br i1 [[TMP1]], label %[[FALLTHROUGH1:.*]], label %[[FLOW1:.*]] -; CHECK: [[FALLTHROUGH1]]: -; CHECK-NEXT: br label %[[FLOW1]] -; CHECK: [[FALLTHROUGH2]]: -; CHECK-NEXT: br label %[[FLOW]] -; CHECK: [[INDIRECT1:.*:]] -; CHECK-NEXT: br label %[[EXIT:.*]] -; CHECK: [[FLOW1]]: -; 
CHECK-NEXT: br label %[[EXIT]] -; CHECK: [[EXIT]]: -; CHECK-NEXT: ret void -; - callbr void asm "", "!i"() to label %fallthrough [label %indirect] -fallthrough: - br i1 %c, label %fallthrough1, label %fallthrough2 -fallthrough1: - br label %exit -fallthrough2: - br label %exit -indirect: - br label %exit -exit: - ret void -} - -; Callbr surrounded by non-callbr: all three regular branches are handled -; correctly -define void @callbr_nested_in_non_callbr(i1 %c, i1 %d, i1 %e, i1 %f) { -; CHECK-LABEL: define void @callbr_nested_in_non_callbr( -; CHECK-SAME: i1 [[C:%.*]], i1 [[D:%.*]], i1 [[E:%.*]], i1 [[F:%.*]]) { -; CHECK-NEXT: [[C_INV:%.*]] = xor i1 [[C]], true -; CHECK-NEXT: br i1 [[C_INV]], label %[[NOCALLBR:.*]], label %[[FLOW3:.*]] -; CHECK: [[FLOW3]]: -; CHECK-NEXT: [[TMP1:%.*]] = phi i1 [ false, %[[FLOW:.*]] ], [ true, [[TMP0:%.*]] ] -; CHECK-NEXT: br i1 [[TMP1]], label %[[CALLBR:.*]], label %[[RET:.*]] -; CHECK: [[CALLBR]]: -; CHECK-NEXT: callbr void asm "", "!i"() -; CHECK-NEXT: to label %[[INDIRECT:.*]] [label %indirect] -; CHECK: [[INDIRECT]]: -; CHECK-NEXT: br i1 [[D]], label %[[FALLTHROUGH1:.*]], label %[[FLOW2:.*]] -; CHECK: [[FALLTHROUGH1]]: -; CHECK-NEXT: br label %[[FLOW2]] -; CHECK: [[INDIRECT2:.*:]] -; CHECK-NEXT: br i1 [[E]], label %[[INDIRECT1:.*]], label %[[FLOW1:.*]] -; CHECK: [[INDIRECT1]]: -; CHECK-NEXT: br label %[[FLOW1]] -; CHECK: [[NOCALLBR]]: -; CHECK-NEXT: br i1 [[F]], label %[[NOCALLBR1:.*]], label %[[FLOW]] -; CHECK: [[NOCALLBR1]]: -; CHECK-NEXT: br label %[[FLOW]] -; CHECK: [[FLOW]]: -; CHECK-NEXT: br label %[[FLOW3]] -; CHECK: [[FLOW1]]: -; CHECK-NEXT: br label %[[RET]] -; CHECK: [[FLOW2]]: -; CHECK-NEXT: br label %[[RET]] -; CHECK: [[RET]]: -; CHECK-NEXT: ret void -; - br i1 %c, label %callbr, label %nocallbr -callbr: - callbr void asm "", "!i"() to label %fallthrough [label %indirect] -fallthrough: - br i1 %d, label %fallthrough1, label %ret -fallthrough1: - br label %ret -indirect: - br i1 %e, label %indirect1, label %ret -indirect1: - br label %ret -nocallbr: - br i1 %f, label %nocallbr1, label %ret -nocallbr1: - br label %ret -ret: - ret void -} From 25ed9231159f4b2d82b0cf0eb36db65c7599df45 Mon Sep 17 00:00:00 2001 From: Jakub Kuderski <jakub@nod-labs.com> Date: Mon, 3 Nov 2025 11:40:18 -0500 Subject: [PATCH 030/313] [ADT] Handle and document multiple matches in StringSwitch (#166177) Specify that the first match is returned and bail out early when processing multiple case values. --- llvm/include/llvm/ADT/StringSwitch.h | 31 ++++++++++++++++--------- llvm/unittests/ADT/StringSwitchTest.cpp | 17 ++++++++++++++ 2 files changed, 37 insertions(+), 11 deletions(-) diff --git a/llvm/include/llvm/ADT/StringSwitch.h b/llvm/include/llvm/ADT/StringSwitch.h index 5bdbb302a6d75..53ebec1eb3a54 100644 --- a/llvm/include/llvm/ADT/StringSwitch.h +++ b/llvm/include/llvm/ADT/StringSwitch.h @@ -41,6 +41,8 @@ namespace llvm { /// .Cases({"violet", "purple"}, Violet) /// .Default(UnknownColor); /// \endcode +/// +/// When multiple matches are found, the value of the first match is returned. template<typename T, typename R = T> class StringSwitch { /// The string we are matching. @@ -213,23 +215,30 @@ class StringSwitch { [[nodiscard]] operator R() { return DefaultUnreachable(); } private: - // Returns true when `Str` matches the `S` argument, and stores the result. + // Returns true when a match is found. If `Str` matches the `S` argument, + // stores the result. 
   bool CaseImpl(StringLiteral S, T &Value) {
-    if (!Result && Str == S) {
-      Result = std::move(Value);
+    if (Result)
       return true;
-    }
-    return false;
+
+    if (Str != S)
+      return false;
+
+    Result = std::move(Value);
+    return true;
   }
 
-  // Returns true when `Str` matches the `S` argument (case-insensitive), and
-  // stores the result.
+  // Returns true when a match is found. If `Str` matches the `S` argument
+  // (case-insensitive), stores the result.
   bool CaseLowerImpl(StringLiteral S, T &Value) {
-    if (!Result && Str.equals_insensitive(S)) {
-      Result = std::move(Value);
+    if (Result)
       return true;
-    }
-    return false;
+
+    if (!Str.equals_insensitive(S))
+      return false;
+
+    Result = std::move(Value);
+    return true;
   }
 
   StringSwitch &CasesImpl(std::initializer_list<StringLiteral> Cases,
diff --git a/llvm/unittests/ADT/StringSwitchTest.cpp b/llvm/unittests/ADT/StringSwitchTest.cpp
index c94feb54d0b7d..75d50f4dd1b5b 100644
--- a/llvm/unittests/ADT/StringSwitchTest.cpp
+++ b/llvm/unittests/ADT/StringSwitchTest.cpp
@@ -240,6 +240,23 @@ TEST(StringSwitchTest, CasesCopies) {
   EXPECT_EQ(NumCopies, 1u);
 }
 
+TEST(StringSwitchTest, StringSwitchMultipleMatches) {
+  auto Translate = [](StringRef S) {
+    return llvm::StringSwitch<int>(S)
+        .CaseLower("A", 0)
+        .Case("b", 1)
+        .Case("a", 2)
+        .CasesLower({"a", "b"}, 3)
+        .DefaultUnreachable();
+  };
+
+  // Check that the value of the first match is returned.
+  EXPECT_EQ(0, Translate("A"));
+  EXPECT_EQ(0, Translate("a"));
+  EXPECT_EQ(3, Translate("B"));
+  EXPECT_EQ(1, Translate("b"));
+}
+
 TEST(StringSwitchTest, DefaultUnreachable) {
   auto Translate = [](StringRef S) {
     return llvm::StringSwitch<int>(S)

From 96cd0dd335ad8b556235b249d2cdffeaf75892b8 Mon Sep 17 00:00:00 2001
From: Kazu Hirata <kazu@google.com>
Date: Mon, 3 Nov 2025 08:41:09 -0800
Subject: [PATCH 031/313] [clang] Migrate away from a soft-deprecated constructor of APInt (NFC) (#166127)

We have:

  /// Once all uses of this constructor are migrated to other constructors,
  /// consider marking this overload "= delete" to prevent calls from being
  /// incorrectly bound to the APInt(unsigned, uint64_t, bool) constructor.
  LLVM_ABI APInt(unsigned numBits, unsigned numWords, const uint64_t bigVal[]);

This patch migrates away from this soft-deprecated constructor.
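For illustration, a minimal sketch of the migration pattern (the helper name
makeWide and the standalone setting are assumptions for this note, not code
from the patch):

  #include "llvm/ADT/APInt.h"
  #include "llvm/ADT/ArrayRef.h"

  // Hypothetical helper: build a multi-word APInt from a word buffer.
  llvm::APInt makeWide(unsigned BitWidth, const uint64_t *Words,
                       unsigned NumWords) {
    // Before: the soft-deprecated (bit width, word count, pointer) overload.
    //   return llvm::APInt(BitWidth, NumWords, Words);
    // After: pass the words as an ArrayRef, which carries its own length.
    return llvm::APInt(BitWidth, llvm::ArrayRef(Words, NumWords));
  }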
---
 clang/include/clang/AST/APNumericStorage.h    | 5 ++---
 clang/include/clang/AST/AbstractBasicReader.h | 2 +-
 clang/lib/AST/ByteCode/Floating.h             | 3 ++-
 clang/lib/AST/ByteCode/IntegralAP.h           | 2 +-
 4 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/clang/include/clang/AST/APNumericStorage.h b/clang/include/clang/AST/APNumericStorage.h
index e1948a552bf7e..04424086b98cf 100644
--- a/clang/include/clang/AST/APNumericStorage.h
+++ b/clang/include/clang/AST/APNumericStorage.h
@@ -41,9 +41,8 @@ class APNumericStorage {
   llvm::APInt getIntValue() const {
     unsigned NumWords = llvm::APInt::getNumWords(BitWidth);
     if (NumWords > 1)
-      return llvm::APInt(BitWidth, NumWords, pVal);
-    else
-      return llvm::APInt(BitWidth, VAL);
+      return llvm::APInt(BitWidth, llvm::ArrayRef(pVal, NumWords));
+    return llvm::APInt(BitWidth, VAL);
   }
   void setIntValue(const ASTContext &C, const llvm::APInt &Val);
 };
diff --git a/clang/include/clang/AST/AbstractBasicReader.h b/clang/include/clang/AST/AbstractBasicReader.h
index 0d187eb49d6ca..064a342aa0684 100644
--- a/clang/include/clang/AST/AbstractBasicReader.h
+++ b/clang/include/clang/AST/AbstractBasicReader.h
@@ -173,7 +173,7 @@ class DataStreamBasicReader : public BasicReaderBase<Impl> {
     llvm::SmallVector<uint64_t, 4> data;
     for (uint32_t i = 0; i != numWords; ++i)
       data.push_back(asImpl().readUInt64());
-    return llvm::APInt(bitWidth, numWords, &data[0]);
+    return llvm::APInt(bitWidth, data);
   }
 
   llvm::FixedPointSemantics readFixedPointSemantics() {
diff --git a/clang/lib/AST/ByteCode/Floating.h b/clang/lib/AST/ByteCode/Floating.h
index 659892e720abf..cc918dc12deb6 100644
--- a/clang/lib/AST/ByteCode/Floating.h
+++ b/clang/lib/AST/ByteCode/Floating.h
@@ -45,7 +45,8 @@ class Floating final {
     if (singleWord())
       return APFloat(getSemantics(), APInt(BitWidth, Val));
     unsigned NumWords = numWords();
-    return APFloat(getSemantics(), APInt(BitWidth, NumWords, Memory));
+    return APFloat(getSemantics(),
+                   APInt(BitWidth, llvm::ArrayRef(Memory, NumWords)));
   }
 
 public:
diff --git a/clang/lib/AST/ByteCode/IntegralAP.h b/clang/lib/AST/ByteCode/IntegralAP.h
index 6683db941c736..b11e6eea28e3f 100644
--- a/clang/lib/AST/ByteCode/IntegralAP.h
+++ b/clang/lib/AST/ByteCode/IntegralAP.h
@@ -63,7 +63,7 @@ template <bool Signed> class IntegralAP final {
     if (singleWord())
       return APInt(BitWidth, Val, Signed);
     unsigned NumWords = llvm::APInt::getNumWords(BitWidth);
-    return llvm::APInt(BitWidth, NumWords, Memory);
+    return llvm::APInt(BitWidth, llvm::ArrayRef(Memory, NumWords));
   }
 
 public:

From 2458e15e5c819e90e308b28a8d96f23a3d685af1 Mon Sep 17 00:00:00 2001
From: Kazu Hirata <kazu@google.com>
Date: Mon, 3 Nov 2025 08:41:17 -0800
Subject: [PATCH 032/313] [mlir] Migrate away from a soft-deprecated constructor of APInt (NFC) (#166128)

We have:

  /// Once all uses of this constructor are migrated to other constructors,
  /// consider marking this overload "= delete" to prevent calls from being
  /// incorrectly bound to the APInt(unsigned, uint64_t, bool) constructor.
  LLVM_ABI APInt(unsigned numBits, unsigned numWords, const uint64_t bigVal[]);

This patch migrates away from this soft-deprecated constructor.
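As in the previous patch, a hedged sketch of the same rewrite in the shape the
hunk below uses, where an existing APInt's raw words seed a value of a
different width (truncateToWidth is an invented name for illustration):

  #include "llvm/ADT/APInt.h"
  #include "llvm/ADT/ArrayRef.h"

  // Hypothetical helper: rebuild Value at TargetBits width, replacing the
  // deprecated APInt(bits, word count, pointer) call with the ArrayRef
  // overload, which bundles the pointer and length together.
  llvm::APInt truncateToWidth(const llvm::APInt &Value, unsigned TargetBits) {
    return llvm::APInt(
        TargetBits, llvm::ArrayRef(Value.getRawData(), Value.getNumWords()));
  }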
--- mlir/lib/AsmParser/Parser.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mlir/lib/AsmParser/Parser.cpp b/mlir/lib/AsmParser/Parser.cpp index 82bdb844480f1..74936e32bd9d9 100644 --- a/mlir/lib/AsmParser/Parser.cpp +++ b/mlir/lib/AsmParser/Parser.cpp @@ -407,8 +407,8 @@ Parser::parseFloatFromIntegerLiteral(std::optional<APFloat> &result, "hexadecimal float constant out of range for type"); } - APInt truncatedValue(typeSizeInBits, intValue.getNumWords(), - intValue.getRawData()); + APInt truncatedValue(typeSizeInBits, + ArrayRef(intValue.getRawData(), intValue.getNumWords())); result.emplace(semantics, truncatedValue); return success(); } From 11c2923ccc6ec7f67049e9ea151467224814dbe5 Mon Sep 17 00:00:00 2001 From: Kazu Hirata <kazu@google.com> Date: Mon, 3 Nov 2025 08:41:25 -0800 Subject: [PATCH 033/313] [ADT] Use "using" instead of "typedef" (NFC) (#166129) Identified with modernize-use-using. --- llvm/include/llvm/ADT/APFloat.h | 6 +-- llvm/include/llvm/ADT/APInt.h | 2 +- llvm/include/llvm/ADT/BitVector.h | 6 +-- llvm/include/llvm/ADT/GenericSSAContext.h | 2 +- llvm/include/llvm/ADT/STLExtras.h | 4 +- llvm/include/llvm/ADT/ilist.h | 26 +++++------ llvm/include/llvm/ADT/ilist_node_options.h | 43 ++++++++++--------- llvm/unittests/ADT/APFloatTest.cpp | 2 +- llvm/unittests/ADT/BitVectorTest.cpp | 4 +- .../ADT/BreadthFirstIteratorTest.cpp | 4 +- llvm/unittests/ADT/DAGDeltaAlgorithmTest.cpp | 2 +- llvm/unittests/ADT/DenseMapTest.cpp | 23 +++++----- llvm/unittests/ADT/DenseSetTest.cpp | 14 +++--- llvm/unittests/ADT/DepthFirstIteratorTest.cpp | 6 +-- llvm/unittests/ADT/IListBaseTest.cpp | 25 +++++------ llvm/unittests/ADT/IListIteratorTest.cpp | 8 ++-- llvm/unittests/ADT/IListNodeBaseTest.cpp | 8 ++-- llvm/unittests/ADT/IListSentinelTest.cpp | 15 +++---- llvm/unittests/ADT/IntervalMapTest.cpp | 10 ++--- llvm/unittests/ADT/IntrusiveRefCntPtrTest.cpp | 6 +-- llvm/unittests/ADT/IteratorTest.cpp | 9 ++-- llvm/unittests/ADT/PointerSumTypeTest.cpp | 7 ++- llvm/unittests/ADT/PointerUnionTest.cpp | 12 +++--- llvm/unittests/ADT/PostOrderIteratorTest.cpp | 2 +- llvm/unittests/ADT/PriorityWorklistTest.cpp | 4 +- llvm/unittests/ADT/RangeAdapterTest.cpp | 30 ++++++------- llvm/unittests/ADT/SCCIteratorTest.cpp | 2 +- llvm/unittests/ADT/STLExtrasTest.cpp | 8 ++-- llvm/unittests/ADT/SimpleIListTest.cpp | 8 ++-- llvm/unittests/ADT/SmallPtrSetTest.cpp | 2 +- llvm/unittests/ADT/SmallStringTest.cpp | 2 +- llvm/unittests/ADT/SmallVectorTest.cpp | 26 +++++------ llvm/unittests/ADT/SparseMultiSetTest.cpp | 4 +- llvm/unittests/ADT/SparseSetTest.cpp | 4 +- llvm/unittests/ADT/TestGraph.h | 8 ++-- llvm/unittests/ADT/TinyPtrVectorTest.cpp | 12 +++--- 36 files changed, 175 insertions(+), 181 deletions(-) diff --git a/llvm/include/llvm/ADT/APFloat.h b/llvm/include/llvm/ADT/APFloat.h index bccdb8930561e..82ac9a3a1ef80 100644 --- a/llvm/include/llvm/ADT/APFloat.h +++ b/llvm/include/llvm/ADT/APFloat.h @@ -152,7 +152,7 @@ class APFloatBase { static constexpr unsigned integerPartWidth = APInt::APINT_BITS_PER_WORD; /// A signed type to represent a floating point numbers unbiased exponent. - typedef int32_t ExponentType; + using ExponentType = int32_t; /// \name Floating Point Semantics. /// @{ @@ -938,8 +938,8 @@ LLVM_ABI DoubleAPFloat frexp(const DoubleAPFloat &X, int &Exp, roundingMode); // This is a interface class that is currently forwarding functionalities from // detail::IEEEFloat. 
class APFloat : public APFloatBase { - typedef detail::IEEEFloat IEEEFloat; - typedef detail::DoubleAPFloat DoubleAPFloat; + using IEEEFloat = detail::IEEEFloat; + using DoubleAPFloat = detail::DoubleAPFloat; static_assert(std::is_standard_layout<IEEEFloat>::value); diff --git a/llvm/include/llvm/ADT/APInt.h b/llvm/include/llvm/ADT/APInt.h index 9fa98ad4ddde1..26283d2437d48 100644 --- a/llvm/include/llvm/ADT/APInt.h +++ b/llvm/include/llvm/ADT/APInt.h @@ -77,7 +77,7 @@ inline APInt operator-(APInt); /// class [[nodiscard]] APInt { public: - typedef uint64_t WordType; + using WordType = uint64_t; /// Byte size of a word. static constexpr unsigned APINT_WORD_SIZE = sizeof(WordType); diff --git a/llvm/include/llvm/ADT/BitVector.h b/llvm/include/llvm/ADT/BitVector.h index 9e81a4b735e7f..cc3f3a9226395 100644 --- a/llvm/include/llvm/ADT/BitVector.h +++ b/llvm/include/llvm/ADT/BitVector.h @@ -99,7 +99,7 @@ template <typename BitVectorT> class const_set_bits_iterator_impl { }; class BitVector { - typedef uintptr_t BitWord; + using BitWord = uintptr_t; enum { BITWORD_SIZE = (unsigned)sizeof(BitWord) * CHAR_BIT }; @@ -147,8 +147,8 @@ class BitVector { } }; - typedef const_set_bits_iterator_impl<BitVector> const_set_bits_iterator; - typedef const_set_bits_iterator set_iterator; + using const_set_bits_iterator = const_set_bits_iterator_impl<BitVector>; + using set_iterator = const_set_bits_iterator; const_set_bits_iterator set_bits_begin() const { return const_set_bits_iterator(*this); diff --git a/llvm/include/llvm/ADT/GenericSSAContext.h b/llvm/include/llvm/ADT/GenericSSAContext.h index e9f99bafe9f1e..426a083778d6e 100644 --- a/llvm/include/llvm/ADT/GenericSSAContext.h +++ b/llvm/include/llvm/ADT/GenericSSAContext.h @@ -25,7 +25,7 @@ template <typename, bool> class DominatorTreeBase; template <typename> class SmallVectorImpl; namespace Intrinsic { -typedef unsigned ID; +using ID = unsigned; } // Specializations of this template should provide the types used by the diff --git a/llvm/include/llvm/ADT/STLExtras.h b/llvm/include/llvm/ADT/STLExtras.h index a9841c6651b72..8de8eb5b86640 100644 --- a/llvm/include/llvm/ADT/STLExtras.h +++ b/llvm/include/llvm/ADT/STLExtras.h @@ -1516,8 +1516,8 @@ template <class Iterator, class RNG> void shuffle(Iterator first, Iterator last, RNG &&g) { // It would be better to use a std::uniform_int_distribution, // but that would be stdlib dependent. - typedef - typename std::iterator_traits<Iterator>::difference_type difference_type; + using difference_type = + typename std::iterator_traits<Iterator>::difference_type; for (auto size = last - first; size > 1; ++first, (void)--size) { difference_type offset = g() % size; // Avoid self-assignment due to incorrect assertions in libstdc++ diff --git a/llvm/include/llvm/ADT/ilist.h b/llvm/include/llvm/ADT/ilist.h index aed19ccbff7f2..64392903bec74 100644 --- a/llvm/include/llvm/ADT/ilist.h +++ b/llvm/include/llvm/ADT/ilist.h @@ -108,21 +108,21 @@ template <typename Ty> struct ilist_traits<const Ty> {}; /// list. 
template <class IntrusiveListT, class TraitsT> class iplist_impl : public TraitsT, IntrusiveListT { - typedef IntrusiveListT base_list_type; + using base_list_type = IntrusiveListT; public: - typedef typename base_list_type::pointer pointer; - typedef typename base_list_type::const_pointer const_pointer; - typedef typename base_list_type::reference reference; - typedef typename base_list_type::const_reference const_reference; - typedef typename base_list_type::value_type value_type; - typedef typename base_list_type::size_type size_type; - typedef typename base_list_type::difference_type difference_type; - typedef typename base_list_type::iterator iterator; - typedef typename base_list_type::const_iterator const_iterator; - typedef typename base_list_type::reverse_iterator reverse_iterator; - typedef - typename base_list_type::const_reverse_iterator const_reverse_iterator; + using pointer = typename base_list_type::pointer; + using const_pointer = typename base_list_type::const_pointer; + using reference = typename base_list_type::reference; + using const_reference = typename base_list_type::const_reference; + using value_type = typename base_list_type::value_type; + using size_type = typename base_list_type::size_type; + using difference_type = typename base_list_type::difference_type; + using iterator = typename base_list_type::iterator; + using const_iterator = typename base_list_type::const_iterator; + using reverse_iterator = typename base_list_type::reverse_iterator; + using const_reverse_iterator = + typename base_list_type::const_reverse_iterator; private: static bool op_less(const_reference L, const_reference R) { return L < R; } diff --git a/llvm/include/llvm/ADT/ilist_node_options.h b/llvm/include/llvm/ADT/ilist_node_options.h index 003d5dabce897..53719b07a3768 100644 --- a/llvm/include/llvm/ADT/ilist_node_options.h +++ b/llvm/include/llvm/ADT/ilist_node_options.h @@ -58,8 +58,8 @@ namespace ilist_detail { template <bool IsExplicit> struct explicitness { static const bool is_explicit = IsExplicit; }; -typedef explicitness<true> is_explicit; -typedef explicitness<false> is_implicit; +using is_explicit = explicitness<true>; +using is_implicit = explicitness<false>; /// Check whether an option is valid. /// @@ -103,12 +103,12 @@ struct is_valid_option<ilist_sentinel_tracking<EnableSentinelTracking>> template <class... Options> struct extract_tag; template <class Tag, class... Options> struct extract_tag<ilist_tag<Tag>, Options...> { - typedef Tag type; + using type = Tag; }; template <class Option1, class... Options> struct extract_tag<Option1, Options...> : extract_tag<Options...> {}; template <> struct extract_tag<> { - typedef void type; + using type = void; }; template <class Tag> struct is_valid_option<ilist_tag<Tag>> : std::true_type {}; @@ -134,11 +134,13 @@ struct is_valid_option<ilist_iterator_bits<IteratorBits>> : std::true_type {}; template <class... Options> struct extract_parent; template <class ParentTy, class... Options> struct extract_parent<ilist_parent<ParentTy>, Options...> { - typedef ParentTy type; + using type = ParentTy; }; template <class Option1, class... 
Options> struct extract_parent<Option1, Options...> : extract_parent<Options...> {}; -template <> struct extract_parent<> { typedef void type; }; +template <> struct extract_parent<> { + using type = void; +}; template <class ParentTy> struct is_valid_option<ilist_parent<ParentTy>> : std::true_type {}; @@ -154,28 +156,27 @@ struct check_options : std::conjunction<is_valid_option<Options>...> {}; template <class T, bool EnableSentinelTracking, bool IsSentinelTrackingExplicit, class TagT, bool HasIteratorBits, class ParentTy> struct node_options { - typedef T value_type; - typedef T *pointer; - typedef T &reference; - typedef const T *const_pointer; - typedef const T &const_reference; + using value_type = T; + using pointer = T *; + using reference = T &; + using const_pointer = const T *; + using const_reference = const T &; static const bool enable_sentinel_tracking = EnableSentinelTracking; static const bool is_sentinel_tracking_explicit = IsSentinelTrackingExplicit; static const bool has_iterator_bits = HasIteratorBits; - typedef TagT tag; - typedef ParentTy parent_ty; - typedef ilist_node_base<enable_sentinel_tracking, parent_ty> node_base_type; - typedef ilist_base<enable_sentinel_tracking, parent_ty> list_base_type; + using tag = TagT; + using parent_ty = ParentTy; + using node_base_type = ilist_node_base<enable_sentinel_tracking, parent_ty>; + using list_base_type = ilist_base<enable_sentinel_tracking, parent_ty>; }; template <class T, class... Options> struct compute_node_options { - typedef node_options<T, extract_sentinel_tracking<Options...>::value, - extract_sentinel_tracking<Options...>::is_explicit, - typename extract_tag<Options...>::type, - extract_iterator_bits<Options...>::value, - typename extract_parent<Options...>::type> - type; + using type = node_options<T, extract_sentinel_tracking<Options...>::value, + extract_sentinel_tracking<Options...>::is_explicit, + typename extract_tag<Options...>::type, + extract_iterator_bits<Options...>::value, + typename extract_parent<Options...>::type>; }; } // end namespace ilist_detail diff --git a/llvm/unittests/ADT/APFloatTest.cpp b/llvm/unittests/ADT/APFloatTest.cpp index fbe96bb127836..99cc38b6b422b 100644 --- a/llvm/unittests/ADT/APFloatTest.cpp +++ b/llvm/unittests/ADT/APFloatTest.cpp @@ -10118,7 +10118,7 @@ TEST(APFloatTest, Float4E2M1FNToFloat) { } TEST(APFloatTest, AddOrSubtractSignificand) { - typedef detail::IEEEFloatUnitTestHelper Helper; + using Helper = detail::IEEEFloatUnitTestHelper; // Test cases are all combinations of: // {equal exponents, LHS larger exponent, RHS larger exponent} // {equal significands, LHS larger significand, RHS larger significand} diff --git a/llvm/unittests/ADT/BitVectorTest.cpp b/llvm/unittests/ADT/BitVectorTest.cpp index 12ba0041af551..e13523b8e10c3 100644 --- a/llvm/unittests/ADT/BitVectorTest.cpp +++ b/llvm/unittests/ADT/BitVectorTest.cpp @@ -21,7 +21,7 @@ template <typename T> class BitVectorTest : public ::testing::Test { }; // Test both BitVector and SmallBitVector with the same suite of tests. 
-typedef ::testing::Types<BitVector, SmallBitVector> BitVectorTestTypes; +using BitVectorTestTypes = ::testing::Types<BitVector, SmallBitVector>; TYPED_TEST_SUITE(BitVectorTest, BitVectorTestTypes, ); TYPED_TEST(BitVectorTest, TrivialOperation) { @@ -857,7 +857,7 @@ TYPED_TEST(BitVectorTest, BinOps) { EXPECT_FALSE(B.anyCommon(A)); } -typedef std::vector<std::pair<int, int>> RangeList; +using RangeList = std::vector<std::pair<int, int>>; template <typename VecType> static inline VecType createBitVector(uint32_t Size, diff --git a/llvm/unittests/ADT/BreadthFirstIteratorTest.cpp b/llvm/unittests/ADT/BreadthFirstIteratorTest.cpp index a737390e79d8d..571e4d27c6752 100644 --- a/llvm/unittests/ADT/BreadthFirstIteratorTest.cpp +++ b/llvm/unittests/ADT/BreadthFirstIteratorTest.cpp @@ -21,7 +21,7 @@ using namespace llvm; namespace llvm { TEST(BreadthFristIteratorTest, Basic) { - typedef bf_iterator<Graph<4>> BFIter; + using BFIter = bf_iterator<Graph<4>>; Graph<4> G; G.AddEdge(0, 1); @@ -46,7 +46,7 @@ TEST(BreadthFristIteratorTest, Basic) { } TEST(BreadthFristIteratorTest, Cycle) { - typedef bf_iterator<Graph<4>> BFIter; + using BFIter = bf_iterator<Graph<4>>; Graph<4> G; G.AddEdge(0, 1); diff --git a/llvm/unittests/ADT/DAGDeltaAlgorithmTest.cpp b/llvm/unittests/ADT/DAGDeltaAlgorithmTest.cpp index f543947899393..918a2e63da935 100644 --- a/llvm/unittests/ADT/DAGDeltaAlgorithmTest.cpp +++ b/llvm/unittests/ADT/DAGDeltaAlgorithmTest.cpp @@ -15,7 +15,7 @@ using namespace llvm; namespace { -typedef DAGDeltaAlgorithm::edge_ty edge_ty; +using edge_ty = DAGDeltaAlgorithm::edge_ty; class FixedDAGDeltaAlgorithm : public DAGDeltaAlgorithm { changeset_ty FailingSet; diff --git a/llvm/unittests/ADT/DenseMapTest.cpp b/llvm/unittests/ADT/DenseMapTest.cpp index aceb4f30d878d..273ee09fc1e28 100644 --- a/llvm/unittests/ADT/DenseMapTest.cpp +++ b/llvm/unittests/ADT/DenseMapTest.cpp @@ -129,18 +129,17 @@ typename T::mapped_type *const DenseMapTest<T>::dummy_value_ptr = nullptr; // Register these types for testing. // clang-format off -typedef ::testing::Types<DenseMap<uint32_t, uint32_t>, - DenseMap<uint32_t *, uint32_t *>, - DenseMap<CtorTester, CtorTester, CtorTesterMapInfo>, - DenseMap<EnumClass, uint32_t>, - DenseMap<std::optional<uint32_t>, uint32_t>, - SmallDenseMap<uint32_t, uint32_t>, - SmallDenseMap<uint32_t *, uint32_t *>, - SmallDenseMap<CtorTester, CtorTester, 4, - CtorTesterMapInfo>, - SmallDenseMap<EnumClass, uint32_t>, - SmallDenseMap<std::optional<uint32_t>, uint32_t> - > DenseMapTestTypes; +using DenseMapTestTypes = ::testing::Types< + DenseMap<uint32_t, uint32_t>, + DenseMap<uint32_t *, uint32_t *>, + DenseMap<CtorTester, CtorTester, CtorTesterMapInfo>, + DenseMap<EnumClass, uint32_t>, + DenseMap<std::optional<uint32_t>, uint32_t>, + SmallDenseMap<uint32_t, uint32_t>, + SmallDenseMap<uint32_t *, uint32_t *>, + SmallDenseMap<CtorTester, CtorTester, 4, CtorTesterMapInfo>, + SmallDenseMap<EnumClass, uint32_t>, + SmallDenseMap<std::optional<uint32_t>, uint32_t>>; // clang-format on TYPED_TEST_SUITE(DenseMapTest, DenseMapTestTypes, ); diff --git a/llvm/unittests/ADT/DenseSetTest.cpp b/llvm/unittests/ADT/DenseSetTest.cpp index a24f99b6bb34f..a2a062b151b67 100644 --- a/llvm/unittests/ADT/DenseSetTest.cpp +++ b/llvm/unittests/ADT/DenseSetTest.cpp @@ -96,13 +96,13 @@ template <typename T> class DenseSetTest : public testing::Test { }; // Register these types for testing. 
-typedef ::testing::Types<DenseSet<unsigned, TestDenseSetInfo>, - const DenseSet<unsigned, TestDenseSetInfo>, - SmallDenseSet<unsigned, 1, TestDenseSetInfo>, - SmallDenseSet<unsigned, 4, TestDenseSetInfo>, - const SmallDenseSet<unsigned, 4, TestDenseSetInfo>, - SmallDenseSet<unsigned, 64, TestDenseSetInfo>> - DenseSetTestTypes; +using DenseSetTestTypes = + ::testing::Types<DenseSet<unsigned, TestDenseSetInfo>, + const DenseSet<unsigned, TestDenseSetInfo>, + SmallDenseSet<unsigned, 1, TestDenseSetInfo>, + SmallDenseSet<unsigned, 4, TestDenseSetInfo>, + const SmallDenseSet<unsigned, 4, TestDenseSetInfo>, + SmallDenseSet<unsigned, 64, TestDenseSetInfo>>; TYPED_TEST_SUITE(DenseSetTest, DenseSetTestTypes, ); TYPED_TEST(DenseSetTest, Constructor) { diff --git a/llvm/unittests/ADT/DepthFirstIteratorTest.cpp b/llvm/unittests/ADT/DepthFirstIteratorTest.cpp index f792878004e7a..00312ca6044e6 100644 --- a/llvm/unittests/ADT/DepthFirstIteratorTest.cpp +++ b/llvm/unittests/ADT/DepthFirstIteratorTest.cpp @@ -21,7 +21,7 @@ using namespace llvm; namespace llvm { template <typename T> struct CountedSet { - typedef typename SmallPtrSet<T, 4>::iterator iterator; + using iterator = typename SmallPtrSet<T, 4>::iterator; SmallPtrSet<T, 4> S; int InsertVisited = 0; @@ -44,8 +44,8 @@ template <typename T> class df_iterator_storage<CountedSet<T>, true> { }; TEST(DepthFirstIteratorTest, ActuallyUpdateIterator) { - typedef CountedSet<Graph<3>::NodeType *> StorageT; - typedef df_iterator<Graph<3>, StorageT, true> DFIter; + using StorageT = CountedSet<Graph<3>::NodeType *>; + using DFIter = df_iterator<Graph<3>, StorageT, true>; Graph<3> G; G.AddEdge(0, 1); diff --git a/llvm/unittests/ADT/IListBaseTest.cpp b/llvm/unittests/ADT/IListBaseTest.cpp index bd915688b190d..eeed488c28d88 100644 --- a/llvm/unittests/ADT/IListBaseTest.cpp +++ b/llvm/unittests/ADT/IListBaseTest.cpp @@ -19,13 +19,14 @@ template <typename T> class IListBaseTest : public ::testing::Test {}; class Parent; // Test variants with the same test. 
-typedef ::testing::Types<ilist_base<false, void>, ilist_base<true, void>, ilist_base<false, Parent*>, ilist_base<true, Parent*>> - IListBaseTestTypes; +using IListBaseTestTypes = + ::testing::Types<ilist_base<false, void>, ilist_base<true, void>, + ilist_base<false, Parent *>, ilist_base<true, Parent *>>; TYPED_TEST_SUITE(IListBaseTest, IListBaseTestTypes, ); TYPED_TEST(IListBaseTest, insertBeforeImpl) { - typedef TypeParam list_base_type; - typedef typename list_base_type::node_base_type node_base_type; + using list_base_type = TypeParam; + using node_base_type = typename list_base_type::node_base_type; node_base_type S, A, B; @@ -51,8 +52,8 @@ TYPED_TEST(IListBaseTest, insertBeforeImpl) { } TYPED_TEST(IListBaseTest, removeImpl) { - typedef TypeParam list_base_type; - typedef typename list_base_type::node_base_type node_base_type; + using list_base_type = TypeParam; + using node_base_type = typename list_base_type::node_base_type; node_base_type S, A, B; @@ -80,8 +81,8 @@ TYPED_TEST(IListBaseTest, removeImpl) { } TYPED_TEST(IListBaseTest, removeRangeImpl) { - typedef TypeParam list_base_type; - typedef typename list_base_type::node_base_type node_base_type; + using list_base_type = TypeParam; + using node_base_type = typename list_base_type::node_base_type; node_base_type S, A, B, C, D; @@ -106,8 +107,8 @@ TYPED_TEST(IListBaseTest, removeRangeImpl) { } TYPED_TEST(IListBaseTest, removeRangeImplAllButSentinel) { - typedef TypeParam list_base_type; - typedef typename list_base_type::node_base_type node_base_type; + using list_base_type = TypeParam; + using node_base_type = typename list_base_type::node_base_type; node_base_type S, A, B; @@ -126,8 +127,8 @@ TYPED_TEST(IListBaseTest, removeRangeImplAllButSentinel) { } TYPED_TEST(IListBaseTest, transferBeforeImpl) { - typedef TypeParam list_base_type; - typedef typename list_base_type::node_base_type node_base_type; + using list_base_type = TypeParam; + using node_base_type = typename list_base_type::node_base_type; node_base_type S1, S2, A, B, C, D, E; diff --git a/llvm/unittests/ADT/IListIteratorTest.cpp b/llvm/unittests/ADT/IListIteratorTest.cpp index 4e5b847b35ffe..54a4258246e9b 100644 --- a/llvm/unittests/ADT/IListIteratorTest.cpp +++ b/llvm/unittests/ADT/IListIteratorTest.cpp @@ -141,10 +141,10 @@ TEST(IListIteratorTest, ReverseConstructor) { L.insert(L.end(), B); // Save typing. - typedef simple_ilist<Node>::iterator iterator; - typedef simple_ilist<Node>::reverse_iterator reverse_iterator; - typedef simple_ilist<Node>::const_iterator const_iterator; - typedef simple_ilist<Node>::const_reverse_iterator const_reverse_iterator; + using iterator = simple_ilist<Node>::iterator; + using reverse_iterator = simple_ilist<Node>::reverse_iterator; + using const_iterator = simple_ilist<Node>::const_iterator; + using const_reverse_iterator = simple_ilist<Node>::const_reverse_iterator; // Check conversion values. 
EXPECT_EQ(L.begin(), iterator(L.rend())); diff --git a/llvm/unittests/ADT/IListNodeBaseTest.cpp b/llvm/unittests/ADT/IListNodeBaseTest.cpp index ef90c716a4118..393f83af99b76 100644 --- a/llvm/unittests/ADT/IListNodeBaseTest.cpp +++ b/llvm/unittests/ADT/IListNodeBaseTest.cpp @@ -17,10 +17,10 @@ namespace { class Parent {}; -typedef ilist_node_base<false, void> RawNode; -typedef ilist_node_base<true, void> TrackingNode; -typedef ilist_node_base<false, Parent> ParentNode; -typedef ilist_node_base<true, Parent> ParentTrackingNode; +using RawNode = ilist_node_base<false, void>; +using TrackingNode = ilist_node_base<true, void>; +using ParentNode = ilist_node_base<false, Parent>; +using ParentTrackingNode = ilist_node_base<true, Parent>; TEST(IListNodeBaseTest, DefaultConstructor) { RawNode A; diff --git a/llvm/unittests/ADT/IListSentinelTest.cpp b/llvm/unittests/ADT/IListSentinelTest.cpp index 1f4a8311370a6..709a1a4bb90e7 100644 --- a/llvm/unittests/ADT/IListSentinelTest.cpp +++ b/llvm/unittests/ADT/IListSentinelTest.cpp @@ -14,18 +14,17 @@ using namespace llvm; namespace { template <class T, class... Options> struct PickSentinel { - typedef ilist_sentinel< - typename ilist_detail::compute_node_options<T, Options...>::type> - type; + using type = ilist_sentinel< + typename ilist_detail::compute_node_options<T, Options...>::type>; }; class Node : public ilist_node<Node> {}; class TrackingNode : public ilist_node<Node, ilist_sentinel_tracking<true>> {}; -typedef PickSentinel<Node>::type Sentinel; -typedef PickSentinel<Node, ilist_sentinel_tracking<true>>::type - TrackingSentinel; -typedef PickSentinel<Node, ilist_sentinel_tracking<false>>::type - NoTrackingSentinel; +using Sentinel = PickSentinel<Node>::type; +using TrackingSentinel = + PickSentinel<Node, ilist_sentinel_tracking<true>>::type; +using NoTrackingSentinel = + PickSentinel<Node, ilist_sentinel_tracking<false>>::type; struct LocalAccess : ilist_detail::NodeAccess { using NodeAccess::getPrev; diff --git a/llvm/unittests/ADT/IntervalMapTest.cpp b/llvm/unittests/ADT/IntervalMapTest.cpp index 99a93ab198d89..38f397ff2eb54 100644 --- a/llvm/unittests/ADT/IntervalMapTest.cpp +++ b/llvm/unittests/ADT/IntervalMapTest.cpp @@ -14,9 +14,9 @@ using namespace llvm; namespace { -typedef IntervalMap<unsigned, unsigned, 4> UUMap; -typedef IntervalMap<unsigned, unsigned, 4, - IntervalMapHalfOpenInfo<unsigned>> UUHalfOpenMap; +using UUMap = IntervalMap<unsigned, unsigned, 4>; +using UUHalfOpenMap = + IntervalMap<unsigned, unsigned, 4, IntervalMapHalfOpenInfo<unsigned>>; // Empty map tests TEST(IntervalMapTest, EmptyMap) { @@ -713,7 +713,7 @@ TEST(IntervalMapTest, OverlapsHalfOpen) { } TEST(IntervalMapOverlapsTest, SmallMaps) { - typedef IntervalMapOverlaps<UUMap,UUMap> UUOverlaps; + using UUOverlaps = IntervalMapOverlaps<UUMap, UUMap>; UUMap::Allocator allocator; UUMap mapA(allocator); UUMap mapB(allocator); @@ -757,7 +757,7 @@ TEST(IntervalMapOverlapsTest, SmallMaps) { } TEST(IntervalMapOverlapsTest, BigMaps) { - typedef IntervalMapOverlaps<UUMap,UUMap> UUOverlaps; + using UUOverlaps = IntervalMapOverlaps<UUMap, UUMap>; UUMap::Allocator allocator; UUMap mapA(allocator); UUMap mapB(allocator); diff --git a/llvm/unittests/ADT/IntrusiveRefCntPtrTest.cpp b/llvm/unittests/ADT/IntrusiveRefCntPtrTest.cpp index f4f2083482804..6da42271764bc 100644 --- a/llvm/unittests/ADT/IntrusiveRefCntPtrTest.cpp +++ b/llvm/unittests/ADT/IntrusiveRefCntPtrTest.cpp @@ -25,9 +25,9 @@ struct SimpleRefCounted : Base<SimpleRefCounted<Base>> { template <typename T> struct 
IntrusiveRefCntPtrTest : testing::Test {}; -typedef ::testing::Types<SimpleRefCounted<RefCountedBase>, - SimpleRefCounted<ThreadSafeRefCountedBase>> - IntrusiveRefCntTypes; +using IntrusiveRefCntTypes = + ::testing::Types<SimpleRefCounted<RefCountedBase>, + SimpleRefCounted<ThreadSafeRefCountedBase>>; TYPED_TEST_SUITE(IntrusiveRefCntPtrTest, IntrusiveRefCntTypes, ); TYPED_TEST(IntrusiveRefCntPtrTest, RefCountedBaseCopyDoesNotLeak) { diff --git a/llvm/unittests/ADT/IteratorTest.cpp b/llvm/unittests/ADT/IteratorTest.cpp index b5d63efd8ccba..9dd8c1a84f44a 100644 --- a/llvm/unittests/ADT/IteratorTest.cpp +++ b/llvm/unittests/ADT/IteratorTest.cpp @@ -177,8 +177,8 @@ TEST(PointeeIteratorTest, Basic) { V.push_back(&arr[2]); V.push_back(&arr[3]); - typedef pointee_iterator<SmallVectorImpl<int *>::const_iterator> - test_iterator; + using test_iterator = + pointee_iterator<SmallVectorImpl<int *>::const_iterator>; test_iterator Begin, End; Begin = V.begin(); @@ -218,9 +218,8 @@ TEST(PointeeIteratorTest, SmartPointer) { V.push_back(std::make_unique<int>(3)); V.push_back(std::make_unique<int>(4)); - typedef pointee_iterator< - SmallVectorImpl<std::unique_ptr<int>>::const_iterator> - test_iterator; + using test_iterator = + pointee_iterator<SmallVectorImpl<std::unique_ptr<int>>::const_iterator>; test_iterator Begin, End; Begin = V.begin(); diff --git a/llvm/unittests/ADT/PointerSumTypeTest.cpp b/llvm/unittests/ADT/PointerSumTypeTest.cpp index fbf59f3a2fda5..11e657ad8bd25 100644 --- a/llvm/unittests/ADT/PointerSumTypeTest.cpp +++ b/llvm/unittests/ADT/PointerSumTypeTest.cpp @@ -17,10 +17,9 @@ struct PointerSumTypeTest : public testing::Test { float f; int i1, i2; - typedef PointerSumType<Kinds, PointerSumTypeMember<Float, float *>, - PointerSumTypeMember<Int1, int *>, - PointerSumTypeMember<Int2, int *>> - SumType; + using SumType = PointerSumType<Kinds, PointerSumTypeMember<Float, float *>, + PointerSumTypeMember<Int1, int *>, + PointerSumTypeMember<Int2, int *>>; SumType a, b, c, n; PointerSumTypeTest() diff --git a/llvm/unittests/ADT/PointerUnionTest.cpp b/llvm/unittests/ADT/PointerUnionTest.cpp index acddb78960149..d8ac3aed76da2 100644 --- a/llvm/unittests/ADT/PointerUnionTest.cpp +++ b/llvm/unittests/ADT/PointerUnionTest.cpp @@ -12,9 +12,9 @@ using namespace llvm; namespace { -typedef PointerUnion<int *, float *> PU; -typedef PointerUnion<int *, float *, long long *> PU3; -typedef PointerUnion<int *, float *, long long *, double *> PU4; +using PU = PointerUnion<int *, float *>; +using PU3 = PointerUnion<int *, float *, long long *>; +using PU4 = PointerUnion<int *, float *, long long *, double *>; struct PointerUnionTest : public testing::Test { float f; @@ -116,9 +116,9 @@ TEST_F(PointerUnionTest, Get) { template<int I> struct alignas(8) Aligned {}; -typedef PointerUnion<Aligned<0> *, Aligned<1> *, Aligned<2> *, Aligned<3> *, - Aligned<4> *, Aligned<5> *, Aligned<6> *, Aligned<7> *> - PU8; +using PU8 = + PointerUnion<Aligned<0> *, Aligned<1> *, Aligned<2> *, Aligned<3> *, + Aligned<4> *, Aligned<5> *, Aligned<6> *, Aligned<7> *>; TEST_F(PointerUnionTest, ManyElements) { Aligned<0> a0; diff --git a/llvm/unittests/ADT/PostOrderIteratorTest.cpp b/llvm/unittests/ADT/PostOrderIteratorTest.cpp index 838481f76ed7f..e875dd63a1958 100644 --- a/llvm/unittests/ADT/PostOrderIteratorTest.cpp +++ b/llvm/unittests/ADT/PostOrderIteratorTest.cpp @@ -23,7 +23,7 @@ namespace { // Whether we're able to compile TEST(PostOrderIteratorTest, Compiles) { - typedef SmallPtrSet<void *, 4> ExtSetTy; + using ExtSetTy = 
SmallPtrSet<void *, 4>; // Tests that template specializations are kept up to date void *Null = nullptr; diff --git a/llvm/unittests/ADT/PriorityWorklistTest.cpp b/llvm/unittests/ADT/PriorityWorklistTest.cpp index f12d32ac9f496..08a47736c392e 100644 --- a/llvm/unittests/ADT/PriorityWorklistTest.cpp +++ b/llvm/unittests/ADT/PriorityWorklistTest.cpp @@ -20,8 +20,8 @@ namespace { using namespace llvm; template <typename T> class PriorityWorklistTest : public ::testing::Test {}; -typedef ::testing::Types<PriorityWorklist<int>, SmallPriorityWorklist<int, 2>> - TestTypes; +using TestTypes = + ::testing::Types<PriorityWorklist<int>, SmallPriorityWorklist<int, 2>>; TYPED_TEST_SUITE(PriorityWorklistTest, TestTypes, ); TYPED_TEST(PriorityWorklistTest, Basic) { diff --git a/llvm/unittests/ADT/RangeAdapterTest.cpp b/llvm/unittests/ADT/RangeAdapterTest.cpp index c1a8a984f233b..6849ccbc8052d 100644 --- a/llvm/unittests/ADT/RangeAdapterTest.cpp +++ b/llvm/unittests/ADT/RangeAdapterTest.cpp @@ -24,8 +24,8 @@ class ReverseOnlyVector { public: ReverseOnlyVector(std::initializer_list<int> list) : Vec(list) {} - typedef std::vector<int>::reverse_iterator reverse_iterator; - typedef std::vector<int>::const_reverse_iterator const_reverse_iterator; + using reverse_iterator = std::vector<int>::reverse_iterator; + using const_reverse_iterator = std::vector<int>::const_reverse_iterator; reverse_iterator rbegin() { return Vec.rbegin(); } reverse_iterator rend() { return Vec.rend(); } const_reverse_iterator rbegin() const { return Vec.rbegin(); } @@ -41,11 +41,11 @@ class BidirectionalVector { public: BidirectionalVector(std::initializer_list<int> list) : Vec(list) {} - typedef std::vector<int>::iterator iterator; + using iterator = std::vector<int>::iterator; iterator begin() const; iterator end() const; - typedef std::vector<int>::reverse_iterator reverse_iterator; + using reverse_iterator = std::vector<int>::reverse_iterator; reverse_iterator rbegin() const { return Vec.rbegin(); } reverse_iterator rend() const { return Vec.rend(); } }; @@ -58,15 +58,15 @@ class BidirectionalVectorConsts { public: BidirectionalVectorConsts(std::initializer_list<int> list) : Vec(list) {} - typedef std::vector<int>::iterator iterator; - typedef std::vector<int>::const_iterator const_iterator; + using iterator = std::vector<int>::iterator; + using const_iterator = std::vector<int>::const_iterator; iterator begin(); iterator end(); const_iterator begin() const; const_iterator end() const; - typedef std::vector<int>::reverse_iterator reverse_iterator; - typedef std::vector<int>::const_reverse_iterator const_reverse_iterator; + using reverse_iterator = std::vector<int>::reverse_iterator; + using const_reverse_iterator = std::vector<int>::const_reverse_iterator; reverse_iterator rbegin() { return Vec.rbegin(); } reverse_iterator rend() { return Vec.rend(); } const_reverse_iterator rbegin() const { return Vec.rbegin(); } @@ -80,7 +80,7 @@ class CustomIteratorVector { public: CustomIteratorVector(std::initializer_list<int> list) : V(list) {} - typedef std::vector<int>::iterator iterator; + using iterator = std::vector<int>::iterator; class reverse_iterator { std::vector<int>::iterator I; @@ -126,8 +126,8 @@ template <typename R> void TestRev(const R &r) { // Test fixture template <typename T> class RangeAdapterLValueTest : public ::testing::Test {}; -typedef ::testing::Types<std::vector<int>, std::list<int>, int[4]> - RangeAdapterLValueTestTypes; +using RangeAdapterLValueTestTypes = + ::testing::Types<std::vector<int>, std::list<int>, 
int[4]>; TYPED_TEST_SUITE(RangeAdapterLValueTest, RangeAdapterLValueTestTypes, ); TYPED_TEST(RangeAdapterLValueTest, TrivialOperation) { @@ -140,10 +140,10 @@ TYPED_TEST(RangeAdapterLValueTest, TrivialOperation) { template <typename T> struct RangeAdapterRValueTest : testing::Test {}; -typedef ::testing::Types<std::vector<int>, std::list<int>, CustomIteratorVector, - ReverseOnlyVector, BidirectionalVector, - BidirectionalVectorConsts> - RangeAdapterRValueTestTypes; +using RangeAdapterRValueTestTypes = + ::testing::Types<std::vector<int>, std::list<int>, CustomIteratorVector, + ReverseOnlyVector, BidirectionalVector, + BidirectionalVectorConsts>; TYPED_TEST_SUITE(RangeAdapterRValueTest, RangeAdapterRValueTestTypes, ); TYPED_TEST(RangeAdapterRValueTest, TrivialOperation) { diff --git a/llvm/unittests/ADT/SCCIteratorTest.cpp b/llvm/unittests/ADT/SCCIteratorTest.cpp index 48350959d046b..5f088294b1a2d 100644 --- a/llvm/unittests/ADT/SCCIteratorTest.cpp +++ b/llvm/unittests/ADT/SCCIteratorTest.cpp @@ -21,7 +21,7 @@ TEST(SCCIteratorTest, AllSmallGraphs) { // create graphs for which every node has a self-edge. #define NUM_NODES 4 #define NUM_GRAPHS (NUM_NODES * (NUM_NODES - 1)) - typedef Graph<NUM_NODES> GT; + using GT = Graph<NUM_NODES>; /// Enumerate all graphs using NUM_GRAPHS bits. static_assert(NUM_GRAPHS < sizeof(unsigned) * CHAR_BIT, "Too many graphs!"); diff --git a/llvm/unittests/ADT/STLExtrasTest.cpp b/llvm/unittests/ADT/STLExtrasTest.cpp index 966b1f01e8a31..85567775e4ebd 100644 --- a/llvm/unittests/ADT/STLExtrasTest.cpp +++ b/llvm/unittests/ADT/STLExtrasTest.cpp @@ -60,7 +60,7 @@ TEST(STLExtrasTest, EnumerateLValue) { // Test that a simple LValue can be enumerated and gives correct results with // multiple types, including the empty container. std::vector<char> foo = {'a', 'b', 'c'}; - typedef std::pair<std::size_t, char> CharPairType; + using CharPairType = std::pair<std::size_t, char>; std::vector<CharPairType> CharResults; for (auto [index, value] : llvm::enumerate(foo)) { @@ -72,7 +72,7 @@ TEST(STLExtrasTest, EnumerateLValue) { CharPairType(2u, 'c'))); // Test a const range of a different type. - typedef std::pair<std::size_t, int> IntPairType; + using IntPairType = std::pair<std::size_t, int>; std::vector<IntPairType> IntResults; const std::vector<int> bar = {1, 2, 3}; for (auto [index, value] : llvm::enumerate(bar)) { @@ -111,7 +111,7 @@ TEST(STLExtrasTest, EnumerateModifyLValue) { TEST(STLExtrasTest, EnumerateRValueRef) { // Test that an rvalue can be enumerated. - typedef std::pair<std::size_t, int> PairType; + using PairType = std::pair<std::size_t, int>; std::vector<PairType> Results; auto Enumerator = llvm::enumerate(std::vector<int>{1, 2, 3}); @@ -138,7 +138,7 @@ TEST(STLExtrasTest, EnumerateModifyRValue) { // Test that when enumerating an rvalue, modification still works (even if // this isn't terribly useful, it at least shows that we haven't snuck an // extra const in there somewhere. 
- typedef std::pair<std::size_t, char> PairType; + using PairType = std::pair<std::size_t, char>; std::vector<PairType> Results; for (auto X : llvm::enumerate(std::vector<char>{'1', '2', '3'})) { diff --git a/llvm/unittests/ADT/SimpleIListTest.cpp b/llvm/unittests/ADT/SimpleIListTest.cpp index c2992baf8a5f7..cf3df8c293e25 100644 --- a/llvm/unittests/ADT/SimpleIListTest.cpp +++ b/llvm/unittests/ADT/SimpleIListTest.cpp @@ -605,8 +605,8 @@ struct Tag2 {}; struct DoubleNode : ilist_node<DoubleNode, ilist_tag<Tag1>>, ilist_node<DoubleNode, ilist_tag<Tag2>> { - typedef ilist_node<DoubleNode, ilist_tag<Tag1>> Node1Type; - typedef ilist_node<DoubleNode, ilist_tag<Tag2>> Node2Type; + using Node1Type = ilist_node<DoubleNode, ilist_tag<Tag1>>; + using Node2Type = ilist_node<DoubleNode, ilist_tag<Tag2>>; Node1Type::self_iterator getIterator1() { return Node1Type::getIterator(); } Node2Type::self_iterator getIterator2() { return Node2Type::getIterator(); } @@ -617,8 +617,8 @@ struct DoubleNode : ilist_node<DoubleNode, ilist_tag<Tag1>>, return Node2Type::getIterator(); } }; -typedef simple_ilist<DoubleNode, ilist_tag<Tag1>> TaggedList1Type; -typedef simple_ilist<DoubleNode, ilist_tag<Tag2>> TaggedList2Type; +using TaggedList1Type = simple_ilist<DoubleNode, ilist_tag<Tag1>>; +using TaggedList2Type = simple_ilist<DoubleNode, ilist_tag<Tag2>>; TEST(SimpleIListTest, TaggedLists) { TaggedList1Type L1; diff --git a/llvm/unittests/ADT/SmallPtrSetTest.cpp b/llvm/unittests/ADT/SmallPtrSetTest.cpp index a627091b90c70..fe7a8279d06b1 100644 --- a/llvm/unittests/ADT/SmallPtrSetTest.cpp +++ b/llvm/unittests/ADT/SmallPtrSetTest.cpp @@ -57,7 +57,7 @@ TEST(SmallPtrSetTest, GrowthTest) { SmallPtrSet<int *, 4> s; - typedef SmallPtrSet<int *, 4>::iterator iter; + using iter = SmallPtrSet<int *, 4>::iterator; s.insert(&buf[0]); s.insert(&buf[1]); diff --git a/llvm/unittests/ADT/SmallStringTest.cpp b/llvm/unittests/ADT/SmallStringTest.cpp index 2f4df8afeafa5..db858246c9bbf 100644 --- a/llvm/unittests/ADT/SmallStringTest.cpp +++ b/llvm/unittests/ADT/SmallStringTest.cpp @@ -23,7 +23,7 @@ namespace { // Test fixture class class SmallStringTest : public testing::Test { protected: - typedef SmallString<40> StringType; + using StringType = SmallString<40>; StringType theString; diff --git a/llvm/unittests/ADT/SmallVectorTest.cpp b/llvm/unittests/ADT/SmallVectorTest.cpp index 74fc737f29335..dbc626db54482 100644 --- a/llvm/unittests/ADT/SmallVectorTest.cpp +++ b/llvm/unittests/ADT/SmallVectorTest.cpp @@ -226,13 +226,10 @@ class SmallVectorTest : public SmallVectorTestBase { VectorT otherVector; }; - -typedef ::testing::Types<SmallVector<Constructable, 0>, - SmallVector<Constructable, 1>, - SmallVector<Constructable, 2>, - SmallVector<Constructable, 4>, - SmallVector<Constructable, 5> - > SmallVectorTestTypes; +using SmallVectorTestTypes = ::testing::Types< + SmallVector<Constructable, 0>, SmallVector<Constructable, 1>, + SmallVector<Constructable, 2>, SmallVector<Constructable, 4>, + SmallVector<Constructable, 5>>; TYPED_TEST_SUITE(SmallVectorTest, SmallVectorTestTypes, ); // Constructor test. 
@@ -537,11 +534,11 @@ TYPED_TEST(SmallVectorTest, AppendNonIterTest) { } struct output_iterator { - typedef std::output_iterator_tag iterator_category; - typedef int value_type; - typedef int difference_type; - typedef value_type *pointer; - typedef value_type &reference; + using iterator_category = std::output_iterator_tag; + using value_type = int; + using difference_type = int; + using pointer = value_type *; + using reference = value_type &; operator int() { return 2; } operator Constructable() { return 7; } }; @@ -896,7 +893,7 @@ class DualSmallVectorsTest<std::pair<VectorT1, VectorT2>> : public SmallVectorTe VectorT2 otherVector; }; -typedef ::testing::Types< +using DualSmallVectorTestTypes = ::testing::Types< // Small mode -> Small mode. std::pair<SmallVector<Constructable, 4>, SmallVector<Constructable, 4>>, // Small mode -> Big mode. @@ -904,8 +901,7 @@ typedef ::testing::Types< // Big mode -> Small mode. std::pair<SmallVector<Constructable, 2>, SmallVector<Constructable, 4>>, // Big mode -> Big mode. - std::pair<SmallVector<Constructable, 2>, SmallVector<Constructable, 2>> - > DualSmallVectorTestTypes; + std::pair<SmallVector<Constructable, 2>, SmallVector<Constructable, 2>>>; TYPED_TEST_SUITE(DualSmallVectorsTest, DualSmallVectorTestTypes, ); diff --git a/llvm/unittests/ADT/SparseMultiSetTest.cpp b/llvm/unittests/ADT/SparseMultiSetTest.cpp index 54f7bc99b52fa..91d37f4684b9e 100644 --- a/llvm/unittests/ADT/SparseMultiSetTest.cpp +++ b/llvm/unittests/ADT/SparseMultiSetTest.cpp @@ -13,7 +13,7 @@ using namespace llvm; namespace { -typedef SparseMultiSet<unsigned> USet; +using USet = SparseMultiSet<unsigned>; // Empty set tests. TEST(SparseMultiSetTest, EmptySet) { @@ -211,7 +211,7 @@ struct Alt { }; TEST(SparseMultiSetTest, AltStructSet) { - typedef SparseMultiSet<Alt> ASet; + using ASet = SparseMultiSet<Alt>; ASet Set; Set.setUniverse(10); Set.insert(Alt(1005)); diff --git a/llvm/unittests/ADT/SparseSetTest.cpp b/llvm/unittests/ADT/SparseSetTest.cpp index 4fbf1caa247b7..f2b932907dc38 100644 --- a/llvm/unittests/ADT/SparseSetTest.cpp +++ b/llvm/unittests/ADT/SparseSetTest.cpp @@ -13,7 +13,7 @@ using namespace llvm; namespace { -typedef SparseSet<unsigned> USet; +using USet = SparseSet<unsigned>; // Empty set tests. TEST(SparseSetTest, EmptySet) { @@ -166,7 +166,7 @@ struct Alt { }; TEST(SparseSetTest, AltStructSet) { - typedef SparseSet<Alt> ASet; + using ASet = SparseSet<Alt>; ASet Set; Set.setUniverse(10); Set.insert(Alt(1005)); diff --git a/llvm/unittests/ADT/TestGraph.h b/llvm/unittests/ADT/TestGraph.h index a59ab504f7144..bb2ec47a0d5fe 100644 --- a/llvm/unittests/ADT/TestGraph.h +++ b/llvm/unittests/ADT/TestGraph.h @@ -34,7 +34,7 @@ class Graph { /// NodeSubset - A subset of the graph's nodes. class NodeSubset { - typedef unsigned char BitVector; // Where the limitation N <= 8 comes from. + using BitVector = unsigned char; // Where the limitation N <= 8 comes from. BitVector Elements; NodeSubset(BitVector e) : Elements(e) {} public: @@ -96,7 +96,7 @@ class Graph { }; /// NodeType - Node index and set of children of the node. - typedef std::pair<unsigned, NodeSubset> NodeType; + using NodeType = std::pair<unsigned, NodeSubset>; private: /// Nodes - The list of nodes for this graph. 
@@ -233,8 +233,8 @@ class Graph { template <unsigned N> struct GraphTraits<Graph<N> > { - typedef typename Graph<N>::NodeType *NodeRef; - typedef typename Graph<N>::ChildIterator ChildIteratorType; + using NodeRef = typename Graph<N>::NodeType *; + using ChildIteratorType = typename Graph<N>::ChildIterator; static NodeRef getEntryNode(const Graph<N> &G) { return G.AccessNode(0); } static ChildIteratorType child_begin(NodeRef Node) { diff --git a/llvm/unittests/ADT/TinyPtrVectorTest.cpp b/llvm/unittests/ADT/TinyPtrVectorTest.cpp index af4ae4f4b0db9..c77721df5055c 100644 --- a/llvm/unittests/ADT/TinyPtrVectorTest.cpp +++ b/llvm/unittests/ADT/TinyPtrVectorTest.cpp @@ -28,14 +28,14 @@ template <typename PointerTy, unsigned IntBits, typename IntType, typename PtrTraits, typename Info> struct RemovePointer< PointerIntPair<PointerTy, IntBits, IntType, PtrTraits, Info>> { - typedef typename RemovePointer<PointerTy>::type type; + using type = typename RemovePointer<PointerTy>::type; }; template <typename VectorT> class TinyPtrVectorTest : public testing::Test { protected: - typedef typename VectorT::value_type PtrT; - typedef typename RemovePointer<PtrT>::type ValueT; + using PtrT = typename VectorT::value_type; + using ValueT = typename RemovePointer<PtrT>::type; using PtrTraits = PointerLikeTypeTraits<PtrT>; VectorT V; @@ -78,9 +78,9 @@ class TinyPtrVectorTest : public testing::Test { } }; -typedef ::testing::Types<TinyPtrVector<int *>, TinyPtrVector<double *>, - TinyPtrVector<PointerIntPair<int *, 1>>> - TinyPtrVectorTestTypes; +using TinyPtrVectorTestTypes = + ::testing::Types<TinyPtrVector<int *>, TinyPtrVector<double *>, + TinyPtrVector<PointerIntPair<int *, 1>>>; TYPED_TEST_SUITE(TinyPtrVectorTest, TinyPtrVectorTestTypes, ); TYPED_TEST(TinyPtrVectorTest, EmptyTest) { From 5ed8f4847673f69dfe984a9ad653035a06e265bb Mon Sep 17 00:00:00 2001 From: Kazu Hirata <kazu@google.com> Date: Mon, 3 Nov 2025 08:41:32 -0800 Subject: [PATCH 034/313] [Support] Use "using" instead of "typedef" (NFC) (#166130) Identified with modernize-use-using. 
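Since the mechanical hunks below can obscure how little actually changes, here
is a self-contained sketch (the names handler_t, handler_u, and Vec are
invented for illustration) showing that both spellings declare the same type,
including the function-pointer form several hunks below touch:

  #include <type_traits>
  #include <vector>

  // The two alias forms are interchangeable; only the spelling differs.
  typedef void (*handler_t)(void *user_data, const char *reason);
  using handler_u = void (*)(void *user_data, const char *reason);
  static_assert(std::is_same_v<handler_t, handler_u>);

  // 'using' additionally supports alias templates, which typedef cannot
  // express; this is one reason modernize-use-using prefers it.
  template <typename T> using Vec = std::vector<T>;
  static_assert(std::is_same_v<Vec<int>, std::vector<int>>);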
--- llvm/include/llvm/Support/Allocator.h | 2 +- llvm/include/llvm/Support/Atomic.h | 4 +-- llvm/include/llvm/Support/BinaryStreamArray.h | 8 ++--- llvm/include/llvm/Support/Chrono.h | 8 ++--- llvm/include/llvm/Support/ConvertUTF.h | 27 +++++++--------- llvm/include/llvm/Support/DebugCounter.h | 2 +- llvm/include/llvm/Support/ErrorHandling.h | 4 +-- .../llvm/Support/FormatVariadicDetails.h | 4 +-- llvm/include/llvm/Support/GenericLoopInfo.h | 16 +++++----- .../llvm/Support/GenericLoopInfoImpl.h | 6 ++-- llvm/include/llvm/Support/MD5.h | 2 +- llvm/include/llvm/Support/Mutex.h | 4 +-- llvm/include/llvm/Support/OnDiskHashTable.h | 32 +++++++++---------- .../llvm/Support/PointerLikeTypeTraits.h | 4 +-- llvm/include/llvm/Support/Program.h | 4 +-- llvm/include/llvm/Support/RISCVISAUtils.h | 4 +-- llvm/include/llvm/Support/RWMutex.h | 6 ++-- llvm/include/llvm/Support/Registry.h | 4 +-- llvm/include/llvm/Support/ScaledNumber.h | 6 ++-- llvm/include/llvm/Support/SuffixTree.h | 2 +- llvm/include/llvm/Support/Threading.h | 2 +- llvm/include/llvm/Support/TrailingObjects.h | 6 ++-- llvm/include/llvm/Support/UnicodeCharRanges.h | 2 +- llvm/include/llvm/Support/float128.h | 2 +- llvm/include/llvm/Support/thread.h | 2 +- llvm/lib/Support/BalancedPartitioning.cpp | 2 +- llvm/lib/Support/CommandLine.cpp | 8 ++--- llvm/lib/Support/DAGDeltaAlgorithm.cpp | 16 +++++----- llvm/lib/Support/DynamicLibrary.cpp | 2 +- llvm/lib/Support/Timer.cpp | 2 +- 30 files changed, 95 insertions(+), 98 deletions(-) diff --git a/llvm/include/llvm/Support/Allocator.h b/llvm/include/llvm/Support/Allocator.h index bc0265904ef65..fffcbd9f3c1d8 100644 --- a/llvm/include/llvm/Support/Allocator.h +++ b/llvm/include/llvm/Support/Allocator.h @@ -380,7 +380,7 @@ class BumpPtrAllocatorImpl /// The standard BumpPtrAllocator which just uses the default template /// parameters. -typedef BumpPtrAllocatorImpl<> BumpPtrAllocator; +using BumpPtrAllocator = BumpPtrAllocatorImpl<>; /// A BumpPtrAllocator that allows only elements of a specific type to be /// allocated. 
diff --git a/llvm/include/llvm/Support/Atomic.h b/llvm/include/llvm/Support/Atomic.h index c2d9ae2da231c..3c62672a077f1 100644 --- a/llvm/include/llvm/Support/Atomic.h +++ b/llvm/include/llvm/Support/Atomic.h @@ -30,9 +30,9 @@ namespace llvm { LLVM_ABI void MemoryFence(); #ifdef _MSC_VER - typedef long cas_flag; + using cas_flag = long; #else - typedef uint32_t cas_flag; + using cas_flag = uint32_t; #endif LLVM_ABI cas_flag CompareAndSwap(volatile cas_flag *ptr, cas_flag new_value, cas_flag old_value); diff --git a/llvm/include/llvm/Support/BinaryStreamArray.h b/llvm/include/llvm/Support/BinaryStreamArray.h index ef2233c53ec2c..a7d03f6511f12 100644 --- a/llvm/include/llvm/Support/BinaryStreamArray.h +++ b/llvm/include/llvm/Support/BinaryStreamArray.h @@ -93,7 +93,7 @@ class VarStreamArray { friend class VarStreamArrayIterator<ValueType, Extractor>; public: - typedef VarStreamArrayIterator<ValueType, Extractor> Iterator; + using Iterator = VarStreamArrayIterator<ValueType, Extractor>; VarStreamArray() = default; @@ -156,8 +156,8 @@ template <typename ValueType, typename Extractor> class VarStreamArrayIterator : public iterator_facade_base<VarStreamArrayIterator<ValueType, Extractor>, std::forward_iterator_tag, const ValueType> { - typedef VarStreamArrayIterator<ValueType, Extractor> IterType; - typedef VarStreamArray<ValueType, Extractor> ArrayType; + using IterType = VarStreamArrayIterator<ValueType, Extractor>; + using ArrayType = VarStreamArray<ValueType, Extractor>; public: VarStreamArrayIterator(const ArrayType &Array, const Extractor &E, @@ -260,7 +260,7 @@ template <typename T> class FixedStreamArray { friend class FixedStreamArrayIterator<T>; public: - typedef FixedStreamArrayIterator<T> Iterator; + using Iterator = FixedStreamArrayIterator<T>; FixedStreamArray() = default; explicit FixedStreamArray(BinaryStreamRef Stream) : Stream(Stream) { diff --git a/llvm/include/llvm/Support/Chrono.h b/llvm/include/llvm/Support/Chrono.h index 5b8102d8e11cf..e5f98249cc074 100644 --- a/llvm/include/llvm/Support/Chrono.h +++ b/llvm/include/llvm/Support/Chrono.h @@ -150,10 +150,10 @@ template <> struct unit<std::nano> { template <typename Rep, typename Period> struct format_provider<std::chrono::duration<Rep, Period>> { private: - typedef std::chrono::duration<Rep, Period> Dur; - typedef std::conditional_t<std::chrono::treat_as_floating_point<Rep>::value, - double, intmax_t> - InternalRep; + using Dur = std::chrono::duration<Rep, Period>; + using InternalRep = + std::conditional_t<std::chrono::treat_as_floating_point<Rep>::value, + double, intmax_t>; template <typename AsPeriod> static InternalRep getAs(const Dur &D) { using namespace std::chrono; diff --git a/llvm/include/llvm/Support/ConvertUTF.h b/llvm/include/llvm/Support/ConvertUTF.h index bb1723518a490..ddf7057bff59d 100644 --- a/llvm/include/llvm/Support/ConvertUTF.h +++ b/llvm/include/llvm/Support/ConvertUTF.h @@ -126,10 +126,10 @@ namespace llvm { bit mask & shift operations. 
------------------------------------------------------------------------ */ -typedef unsigned int UTF32; /* at least 32 bits */ -typedef unsigned short UTF16; /* at least 16 bits */ -typedef unsigned char UTF8; /* typically 8 bits */ -typedef unsigned char Boolean; /* 0 or 1 */ +using UTF32 = unsigned int; /* at least 32 bits */ +using UTF16 = unsigned short; /* at least 16 bits */ +using UTF8 = unsigned char; /* typically 8 bits */ +using Boolean = unsigned char; /* 0 or 1 */ /* Some fundamental constants */ #define UNI_REPLACEMENT_CHAR (UTF32)0x0000FFFD @@ -146,17 +146,14 @@ typedef unsigned char Boolean; /* 0 or 1 */ #define UNI_UTF32_BYTE_ORDER_MARK_NATIVE 0x0000FEFF #define UNI_UTF32_BYTE_ORDER_MARK_SWAPPED 0xFFFE0000 -typedef enum { - conversionOK, /* conversion successful */ - sourceExhausted, /* partial character in source, but hit end */ - targetExhausted, /* insuff. room in target for conversion */ - sourceIllegal /* source sequence is illegal/malformed */ -} ConversionResult; - -typedef enum { - strictConversion = 0, - lenientConversion -} ConversionFlags; +enum ConversionResult { + conversionOK, /* conversion successful */ + sourceExhausted, /* partial character in source, but hit end */ + targetExhausted, /* insuff. room in target for conversion */ + sourceIllegal /* source sequence is illegal/malformed */ +}; + +enum ConversionFlags { strictConversion = 0, lenientConversion }; LLVM_ABI ConversionResult ConvertUTF8toUTF16(const UTF8 **sourceStart, const UTF8 *sourceEnd, diff --git a/llvm/include/llvm/Support/DebugCounter.h b/llvm/include/llvm/Support/DebugCounter.h index 39a08d499b67e..9904a0dd86559 100644 --- a/llvm/include/llvm/Support/DebugCounter.h +++ b/llvm/include/llvm/Support/DebugCounter.h @@ -140,7 +140,7 @@ class DebugCounter { } // Iterate through the registered counters - typedef UniqueVector<std::string> CounterVector; + using CounterVector = UniqueVector<std::string>; CounterVector::const_iterator begin() const { return RegisteredCounters.begin(); } diff --git a/llvm/include/llvm/Support/ErrorHandling.h b/llvm/include/llvm/Support/ErrorHandling.h index 4c17b6e83acd2..a4fd008a9ff3f 100644 --- a/llvm/include/llvm/Support/ErrorHandling.h +++ b/llvm/include/llvm/Support/ErrorHandling.h @@ -21,8 +21,8 @@ class StringRef; class Twine; /// An error handler callback. -typedef void (*fatal_error_handler_t)(void *user_data, const char *reason, - bool gen_crash_diag); +using fatal_error_handler_t = void (*)(void *user_data, const char *reason, + bool gen_crash_diag); /// install_fatal_error_handler - Installs a new error handler to be used /// whenever a serious (non-recoverable) error is encountered by LLVM. 
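Function-pointer aliases such as fatal_error_handler_t above are where the using spelling helps readability most, since the alias name no longer sits inside the declarator. A self-contained sketch (hypothetical names, not the LLVM API):

#include <cstdio>
#include <type_traits>

// Same shape as the alias converted above; both spellings name one type.
typedef void (*HandlerOldTy)(void *UserData, const char *Reason, bool Fatal);
using HandlerTy = void (*)(void *UserData, const char *Reason, bool Fatal);
static_assert(std::is_same_v<HandlerOldTy, HandlerTy>,
              "both spellings declare the same function-pointer type");

// A stand-in installation point and a sample handler.
static HandlerTy CurrentHandler = nullptr;

static void PrintHandler(void *, const char *Reason, bool Fatal) {
  std::fprintf(stderr, "%s error: %s\n", Fatal ? "fatal" : "recoverable",
               Reason);
}

int main() {
  CurrentHandler = PrintHandler;          // install
  CurrentHandler(nullptr, "demo", true);  // invoke through the alias
  return 0;
}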
diff --git a/llvm/include/llvm/Support/FormatVariadicDetails.h b/llvm/include/llvm/Support/FormatVariadicDetails.h index 0fdc7b6f94da7..c0b245e297a58 100644 --- a/llvm/include/llvm/Support/FormatVariadicDetails.h +++ b/llvm/include/llvm/Support/FormatVariadicDetails.h @@ -63,8 +63,8 @@ template <typename T> class missing_format_adapter; template <class T> class has_FormatProvider { public: using Decayed = std::decay_t<T>; - typedef void (*Signature_format)(const Decayed &, llvm::raw_ostream &, - StringRef); + using Signature_format = void (*)(const Decayed &, llvm::raw_ostream &, + StringRef); template <typename U> using check = SameType<Signature_format, &U::format>; diff --git a/llvm/include/llvm/Support/GenericLoopInfo.h b/llvm/include/llvm/Support/GenericLoopInfo.h index b6bb360d9868f..9e2f61fd03e78 100644 --- a/llvm/include/llvm/Support/GenericLoopInfo.h +++ b/llvm/include/llvm/Support/GenericLoopInfo.h @@ -150,9 +150,9 @@ template <class BlockT, class LoopT> class LoopBase { assert(!isInvalid() && "Loop not in a valid state!"); return SubLoops; } - typedef typename std::vector<LoopT *>::const_iterator iterator; - typedef - typename std::vector<LoopT *>::const_reverse_iterator reverse_iterator; + using iterator = typename std::vector<LoopT *>::const_iterator; + using reverse_iterator = + typename std::vector<LoopT *>::const_reverse_iterator; iterator begin() const { return getSubLoops().begin(); } iterator end() const { return getSubLoops().end(); } reverse_iterator rbegin() const { return getSubLoops().rbegin(); } @@ -174,7 +174,7 @@ template <class BlockT, class LoopT> class LoopBase { assert(!isInvalid() && "Loop not in a valid state!"); return Blocks; } - typedef typename ArrayRef<BlockT *>::const_iterator block_iterator; + using block_iterator = typename ArrayRef<BlockT *>::const_iterator; block_iterator block_begin() const { return getBlocks().begin(); } block_iterator block_end() const { return getBlocks().end(); } inline iterator_range<block_iterator> blocks() const { @@ -302,7 +302,7 @@ template <class BlockT, class LoopT> class LoopBase { bool hasNoExitBlocks() const; /// Edge type. - typedef std::pair<BlockT *, BlockT *> Edge; + using Edge = std::pair<BlockT *, BlockT *>; /// Return all pairs of (_inside_block_,_outside_block_). void getExitEdges(SmallVectorImpl<Edge> &ExitEdges) const; @@ -575,9 +575,9 @@ template <class BlockT, class LoopT> class LoopInfoBase { /// iterator/begin/end - The interface to the top-level loops in the current /// function. 
/// - typedef typename std::vector<LoopT *>::const_iterator iterator; - typedef - typename std::vector<LoopT *>::const_reverse_iterator reverse_iterator; + using iterator = typename std::vector<LoopT *>::const_iterator; + using reverse_iterator = + typename std::vector<LoopT *>::const_reverse_iterator; iterator begin() const { return TopLevelLoops.begin(); } iterator end() const { return TopLevelLoops.end(); } reverse_iterator rbegin() const { return TopLevelLoops.rbegin(); } diff --git a/llvm/include/llvm/Support/GenericLoopInfoImpl.h b/llvm/include/llvm/Support/GenericLoopInfoImpl.h index 541678001a8ff..c830f0a67a448 100644 --- a/llvm/include/llvm/Support/GenericLoopInfoImpl.h +++ b/llvm/include/llvm/Support/GenericLoopInfoImpl.h @@ -459,7 +459,7 @@ template <class BlockT, class LoopT> static void discoverAndMapSubloop(LoopT *L, ArrayRef<BlockT *> Backedges, LoopInfoBase<BlockT, LoopT> *LI, const DomTreeBase<BlockT> &DomTree) { - typedef GraphTraits<Inverse<BlockT *>> InvBlockTraits; + using InvBlockTraits = GraphTraits<Inverse<BlockT *>>; unsigned NumBlocks = 0; unsigned NumSubloops = 0; @@ -513,8 +513,8 @@ static void discoverAndMapSubloop(LoopT *L, ArrayRef<BlockT *> Backedges, /// Populate all loop data in a stable order during a single forward DFS. template <class BlockT, class LoopT> class PopulateLoopsDFS { - typedef GraphTraits<BlockT *> BlockTraits; - typedef typename BlockTraits::ChildIteratorType SuccIterTy; + using BlockTraits = GraphTraits<BlockT *>; + using SuccIterTy = typename BlockTraits::ChildIteratorType; LoopInfoBase<BlockT, LoopT> *LI; diff --git a/llvm/include/llvm/Support/MD5.h b/llvm/include/llvm/Support/MD5.h index 4ba386753f397..dbcb66d7680e4 100644 --- a/llvm/include/llvm/Support/MD5.h +++ b/llvm/include/llvm/Support/MD5.h @@ -90,7 +90,7 @@ class MD5 { private: // Any 32-bit or wider unsigned integer data type will do. - typedef uint32_t MD5_u32plus; + using MD5_u32plus = uint32_t; // Internal State struct { diff --git a/llvm/include/llvm/Support/Mutex.h b/llvm/include/llvm/Support/Mutex.h index d61e3fd96efbe..3ca5c9a2f6be8 100644 --- a/llvm/include/llvm/Support/Mutex.h +++ b/llvm/include/llvm/Support/Mutex.h @@ -63,12 +63,12 @@ namespace llvm }; /// Mutex - A standard, always enforced mutex. 
- typedef SmartMutex<false> Mutex; + using Mutex = SmartMutex<false>; template <bool mt_only> using SmartScopedLock = std::lock_guard<SmartMutex<mt_only>>; - typedef SmartScopedLock<false> ScopedLock; + using ScopedLock = SmartScopedLock<false>; } } diff --git a/llvm/include/llvm/Support/OnDiskHashTable.h b/llvm/include/llvm/Support/OnDiskHashTable.h index d7d72cfbbc649..54c6b713478b9 100644 --- a/llvm/include/llvm/Support/OnDiskHashTable.h +++ b/llvm/include/llvm/Support/OnDiskHashTable.h @@ -69,7 +69,7 @@ template <typename Info> class OnDiskChainedHashTableGenerator { : Key(Key), Data(Data), Next(nullptr), Hash(InfoObj.ComputeHash(Key)) {} }; - typedef typename Info::offset_type offset_type; + using offset_type = typename Info::offset_type; offset_type NumBuckets; offset_type NumEntries; llvm::SpecificBumpPtrAllocator<Item> BA; @@ -278,12 +278,12 @@ template <typename Info> class OnDiskChainedHashTable { Info InfoObj; public: - typedef Info InfoType; - typedef typename Info::internal_key_type internal_key_type; - typedef typename Info::external_key_type external_key_type; - typedef typename Info::data_type data_type; - typedef typename Info::hash_value_type hash_value_type; - typedef typename Info::offset_type offset_type; + using InfoType = Info; + using internal_key_type = typename Info::internal_key_type; + using external_key_type = typename Info::external_key_type; + using data_type = typename Info::data_type; + using hash_value_type = typename Info::hash_value_type; + using offset_type = typename Info::offset_type; OnDiskChainedHashTable(offset_type NumBuckets, offset_type NumEntries, const unsigned char *Buckets, @@ -435,12 +435,12 @@ class OnDiskIterableChainedHashTable : public OnDiskChainedHashTable<Info> { const unsigned char *Payload; public: - typedef OnDiskChainedHashTable<Info> base_type; - typedef typename base_type::internal_key_type internal_key_type; - typedef typename base_type::external_key_type external_key_type; - typedef typename base_type::data_type data_type; - typedef typename base_type::hash_value_type hash_value_type; - typedef typename base_type::offset_type offset_type; + using base_type = OnDiskChainedHashTable<Info>; + using internal_key_type = typename base_type::internal_key_type; + using external_key_type = typename base_type::external_key_type; + using data_type = typename base_type::data_type; + using hash_value_type = typename base_type::hash_value_type; + using offset_type = typename base_type::offset_type; private: /// Iterates over all of the keys in the table. 
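The OnDiskHashTable hunks re-export dependent member types from a template parameter; note that the typename keyword is still required on the right-hand side of the alias. A self-contained sketch (illustrative names, not tree code):

#include <cstdint>

// A trait-style Info class of the shape the on-disk table templates expect.
struct ExampleInfo {
  using offset_type = std::uint32_t;
};

// Re-exporting a dependent type: typename is mandatory with either spelling,
// but the using form keeps the new name up front.
template <typename Info> struct TableOld {
  typedef typename Info::offset_type offset_type;
};
template <typename Info> struct Table {
  using offset_type = typename Info::offset_type;
};

int main() {
  Table<ExampleInfo>::offset_type NumBuckets = 8; // resolves to uint32_t
  TableOld<ExampleInfo>::offset_type NumEntries = NumBuckets;
  return static_cast<int>(NumEntries) - 8; // 0 on success
}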
@@ -450,7 +450,7 @@ class OnDiskIterableChainedHashTable : public OnDiskChainedHashTable<Info> { offset_type NumEntriesLeft; public: - typedef external_key_type value_type; + using value_type = external_key_type; iterator_base(const unsigned char *const Ptr, offset_type NumEntries) : Ptr(Ptr), NumItemsInBucketLeft(0), NumEntriesLeft(NumEntries) {} @@ -505,7 +505,7 @@ class OnDiskIterableChainedHashTable : public OnDiskChainedHashTable<Info> { Info *InfoObj; public: - typedef external_key_type value_type; + using value_type = external_key_type; key_iterator(const unsigned char *const Ptr, offset_type NumEntries, Info *InfoObj) @@ -551,7 +551,7 @@ class OnDiskIterableChainedHashTable : public OnDiskChainedHashTable<Info> { Info *InfoObj; public: - typedef data_type value_type; + using value_type = data_type; data_iterator(const unsigned char *const Ptr, offset_type NumEntries, Info *InfoObj) diff --git a/llvm/include/llvm/Support/PointerLikeTypeTraits.h b/llvm/include/llvm/Support/PointerLikeTypeTraits.h index 320f6b63b447e..a47d68406acf3 100644 --- a/llvm/include/llvm/Support/PointerLikeTypeTraits.h +++ b/llvm/include/llvm/Support/PointerLikeTypeTraits.h @@ -70,7 +70,7 @@ template <> struct PointerLikeTypeTraits<void *> { // Provide PointerLikeTypeTraits for const things. template <typename T> struct PointerLikeTypeTraits<const T> { - typedef PointerLikeTypeTraits<T> NonConst; + using NonConst = PointerLikeTypeTraits<T>; static inline const void *getAsVoidPointer(const T P) { return NonConst::getAsVoidPointer(P); @@ -83,7 +83,7 @@ template <typename T> struct PointerLikeTypeTraits<const T> { // Provide PointerLikeTypeTraits for const pointers. template <typename T> struct PointerLikeTypeTraits<const T *> { - typedef PointerLikeTypeTraits<T *> NonConst; + using NonConst = PointerLikeTypeTraits<T *>; static inline const void *getAsVoidPointer(const T *P) { return NonConst::getAsVoidPointer(const_cast<T *>(P)); diff --git a/llvm/include/llvm/Support/Program.h b/llvm/include/llvm/Support/Program.h index 53c2e7597b2b4..575e416587ea8 100644 --- a/llvm/include/llvm/Support/Program.h +++ b/llvm/include/llvm/Support/Program.h @@ -39,8 +39,8 @@ const char EnvPathSeparator = ';'; typedef unsigned long procid_t; // Must match the type of DWORD on Windows. typedef void *process_t; // Must match the type of HANDLE on Windows. #else -typedef ::pid_t procid_t; -typedef procid_t process_t; +using procid_t = ::pid_t; +using process_t = procid_t; #endif /// This struct encapsulates information about a process. diff --git a/llvm/include/llvm/Support/RISCVISAUtils.h b/llvm/include/llvm/Support/RISCVISAUtils.h index 165bb08d66431..05fd32e0e7cfe 100644 --- a/llvm/include/llvm/Support/RISCVISAUtils.h +++ b/llvm/include/llvm/Support/RISCVISAUtils.h @@ -40,8 +40,8 @@ struct ExtensionComparator { /// OrderedExtensionMap is std::map, it's specialized to keep entries /// in canonical order of extension. 
-typedef std::map<std::string, ExtensionVersion, ExtensionComparator> - OrderedExtensionMap; +using OrderedExtensionMap = + std::map<std::string, ExtensionVersion, ExtensionComparator>; } // namespace RISCVISAUtils diff --git a/llvm/include/llvm/Support/RWMutex.h b/llvm/include/llvm/Support/RWMutex.h index 8d221aaab9ab9..efc1ca19a1208 100644 --- a/llvm/include/llvm/Support/RWMutex.h +++ b/llvm/include/llvm/Support/RWMutex.h @@ -162,7 +162,7 @@ template <bool mt_only> class SmartRWMutex { bool try_lock() { return impl.try_lock(); } }; -typedef SmartRWMutex<false> RWMutex; +using RWMutex = SmartRWMutex<false>; /// ScopedReader - RAII acquisition of a reader lock #if !defined(LLVM_USE_RW_MUTEX_IMPL) @@ -179,7 +179,7 @@ template <bool mt_only> struct SmartScopedReader { ~SmartScopedReader() { mutex.unlock_shared(); } }; #endif -typedef SmartScopedReader<false> ScopedReader; +using ScopedReader = SmartScopedReader<false>; /// ScopedWriter - RAII acquisition of a writer lock #if !defined(LLVM_USE_RW_MUTEX_IMPL) @@ -196,7 +196,7 @@ template <bool mt_only> struct SmartScopedWriter { ~SmartScopedWriter() { mutex.unlock(); } }; #endif -typedef SmartScopedWriter<false> ScopedWriter; +using ScopedWriter = SmartScopedWriter<false>; } // end namespace sys } // end namespace llvm diff --git a/llvm/include/llvm/Support/Registry.h b/llvm/include/llvm/Support/Registry.h index c02f15e5e32b8..acd3b06fde6e7 100644 --- a/llvm/include/llvm/Support/Registry.h +++ b/llvm/include/llvm/Support/Registry.h @@ -43,8 +43,8 @@ namespace llvm { template <typename T> class Registry { public: - typedef T type; - typedef SimpleRegistryEntry<T> entry; + using type = T; + using entry = SimpleRegistryEntry<T>; class node; class iterator; diff --git a/llvm/include/llvm/Support/ScaledNumber.h b/llvm/include/llvm/Support/ScaledNumber.h index 07baf153e10c6..8ca8d457e339e 100644 --- a/llvm/include/llvm/Support/ScaledNumber.h +++ b/llvm/include/llvm/Support/ScaledNumber.h @@ -498,10 +498,10 @@ template <class DigitsT> class ScaledNumber : ScaledNumberBase { static_assert(!std::numeric_limits<DigitsT>::is_signed, "only unsigned floats supported"); - typedef DigitsT DigitsType; + using DigitsType = DigitsT; private: - typedef std::numeric_limits<DigitsType> DigitsLimits; + using DigitsLimits = std::numeric_limits<DigitsType>; static constexpr int Width = sizeof(DigitsType) * 8; static_assert(Width <= 64, "invalid integer width for digits"); @@ -782,7 +782,7 @@ uint64_t ScaledNumber<DigitsT>::scale(uint64_t N) const { template <class DigitsT> template <class IntT> IntT ScaledNumber<DigitsT>::toInt() const { - typedef std::numeric_limits<IntT> Limits; + using Limits = std::numeric_limits<IntT>; if (*this < 1) return 0; if (*this >= Limits::max()) diff --git a/llvm/include/llvm/Support/SuffixTree.h b/llvm/include/llvm/Support/SuffixTree.h index 4c78235abf508..eac66d84d6f63 100644 --- a/llvm/include/llvm/Support/SuffixTree.h +++ b/llvm/include/llvm/Support/SuffixTree.h @@ -219,7 +219,7 @@ class SuffixTree { } }; - typedef RepeatedSubstringIterator iterator; + using iterator = RepeatedSubstringIterator; iterator begin() { return iterator(Root, LeafNodes); } iterator end() { return iterator(nullptr); } }; diff --git a/llvm/include/llvm/Support/Threading.h b/llvm/include/llvm/Support/Threading.h index 88846807f111a..89d90b3438e92 100644 --- a/llvm/include/llvm/Support/Threading.h +++ b/llvm/include/llvm/Support/Threading.h @@ -53,7 +53,7 @@ constexpr bool llvm_is_multithreaded() { return LLVM_ENABLE_THREADS; } #if 
LLVM_THREADING_USE_STD_CALL_ONCE - typedef std::once_flag once_flag; +using once_flag = std::once_flag; #else diff --git a/llvm/include/llvm/Support/TrailingObjects.h b/llvm/include/llvm/Support/TrailingObjects.h index c47976524dcd9..218c2e336d77b 100644 --- a/llvm/include/llvm/Support/TrailingObjects.h +++ b/llvm/include/llvm/Support/TrailingObjects.h @@ -76,7 +76,7 @@ class TrailingObjectsBase { // number of a different type. e.g.: // ExtractSecondType<Foo..., int>::type template <typename Ty1, typename Ty2> struct ExtractSecondType { - typedef Ty2 type; + using type = Ty2; }; // TrailingObjectsImpl is somewhat complicated, because it is a @@ -101,8 +101,8 @@ class TrailingObjectsImpl<Align, BaseTy, TopTrailingObj, PrevTy, NextTy, : public TrailingObjectsImpl<Align, BaseTy, TopTrailingObj, NextTy, MoreTys...> { - typedef TrailingObjectsImpl<Align, BaseTy, TopTrailingObj, NextTy, MoreTys...> - ParentType; + using ParentType = + TrailingObjectsImpl<Align, BaseTy, TopTrailingObj, NextTy, MoreTys...>; struct RequiresRealignment { static const bool value = alignof(PrevTy) < alignof(NextTy); diff --git a/llvm/include/llvm/Support/UnicodeCharRanges.h b/llvm/include/llvm/Support/UnicodeCharRanges.h index 7f1a9b3ff0c3b..2b5fc83d34690 100644 --- a/llvm/include/llvm/Support/UnicodeCharRanges.h +++ b/llvm/include/llvm/Support/UnicodeCharRanges.h @@ -37,7 +37,7 @@ inline bool operator<(UnicodeCharRange Range, uint32_t Value) { /// array. class UnicodeCharSet { public: - typedef ArrayRef<UnicodeCharRange> CharRanges; + using CharRanges = ArrayRef<UnicodeCharRange>; /// Constructs a UnicodeCharSet instance from an array of /// UnicodeCharRanges. diff --git a/llvm/include/llvm/Support/float128.h b/llvm/include/llvm/Support/float128.h index e15a98dc5a677..ffad1241c3e3d 100644 --- a/llvm/include/llvm/Support/float128.h +++ b/llvm/include/llvm/Support/float128.h @@ -14,7 +14,7 @@ namespace llvm { #if defined(__clang__) && defined(__FLOAT128__) && \ defined(__SIZEOF_INT128__) && !defined(__LONG_DOUBLE_IBM128__) #define HAS_IEE754_FLOAT128 -typedef __float128 float128; +using float128 = __float128; #elif defined(__FLOAT128__) && defined(__SIZEOF_INT128__) && \ !defined(__LONG_DOUBLE_IBM128__) && \ (defined(__GNUC__) || defined(__GNUG__)) diff --git a/llvm/include/llvm/Support/thread.h b/llvm/include/llvm/Support/thread.h index 16e322bfd8785..ecde62d8368e7 100644 --- a/llvm/include/llvm/Support/thread.h +++ b/llvm/include/llvm/Support/thread.h @@ -127,7 +127,7 @@ LLVM_ABI thread::id llvm_thread_get_current_id_impl(); template <class Function, class... 
Args> thread::thread(std::optional<unsigned> StackSizeInBytes, Function &&f, Args &&...args) { - typedef std::tuple<std::decay_t<Function>, std::decay_t<Args>...> CalleeTuple; + using CalleeTuple = std::tuple<std::decay_t<Function>, std::decay_t<Args>...>; std::unique_ptr<CalleeTuple> Callee( new CalleeTuple(std::forward<Function>(f), std::forward<Args>(args)...)); diff --git a/llvm/lib/Support/BalancedPartitioning.cpp b/llvm/lib/Support/BalancedPartitioning.cpp index 1914f4cc39d96..d859abddbcad8 100644 --- a/llvm/lib/Support/BalancedPartitioning.cpp +++ b/llvm/lib/Support/BalancedPartitioning.cpp @@ -231,7 +231,7 @@ unsigned BalancedPartitioning::runIteration(const FunctionNodeRange Nodes, } // Compute move gains - typedef std::pair<float, BPFunctionNode *> GainPair; + using GainPair = std::pair<float, BPFunctionNode *>; std::vector<GainPair> Gains; for (auto &N : Nodes) { bool FromLeftToRight = (N.Bucket == LeftBucket); diff --git a/llvm/lib/Support/CommandLine.cpp b/llvm/lib/Support/CommandLine.cpp index de5bd795403dc..dab8beeff7ca5 100644 --- a/llvm/lib/Support/CommandLine.cpp +++ b/llvm/lib/Support/CommandLine.cpp @@ -2343,10 +2343,10 @@ namespace { class HelpPrinter { protected: const bool ShowHidden; - typedef SmallVector<std::pair<const char *, Option *>, 128> - StrOptionPairVector; - typedef SmallVector<std::pair<const char *, SubCommand *>, 128> - StrSubCommandPairVector; + using StrOptionPairVector = + SmallVector<std::pair<const char *, Option *>, 128>; + using StrSubCommandPairVector = + SmallVector<std::pair<const char *, SubCommand *>, 128>; // Print the options. Opts is assumed to be alphabetically sorted. virtual void printOptions(StrOptionPairVector &Opts, size_t MaxArgLen) { for (const auto &Opt : Opts) diff --git a/llvm/lib/Support/DAGDeltaAlgorithm.cpp b/llvm/lib/Support/DAGDeltaAlgorithm.cpp index 981536473d124..3bfae147d18c0 100644 --- a/llvm/lib/Support/DAGDeltaAlgorithm.cpp +++ b/llvm/lib/Support/DAGDeltaAlgorithm.cpp @@ -47,16 +47,16 @@ class DAGDeltaAlgorithmImpl { friend class DeltaActiveSetHelper; public: - typedef DAGDeltaAlgorithm::change_ty change_ty; - typedef DAGDeltaAlgorithm::changeset_ty changeset_ty; - typedef DAGDeltaAlgorithm::changesetlist_ty changesetlist_ty; - typedef DAGDeltaAlgorithm::edge_ty edge_ty; + using change_ty = DAGDeltaAlgorithm::change_ty; + using changeset_ty = DAGDeltaAlgorithm::changeset_ty; + using changesetlist_ty = DAGDeltaAlgorithm::changesetlist_ty; + using edge_ty = DAGDeltaAlgorithm::edge_ty; private: - typedef std::vector<change_ty>::iterator pred_iterator_ty; - typedef std::vector<change_ty>::iterator succ_iterator_ty; - typedef std::set<change_ty>::iterator pred_closure_iterator_ty; - typedef std::set<change_ty>::iterator succ_closure_iterator_ty; + using pred_iterator_ty = std::vector<change_ty>::iterator; + using succ_iterator_ty = std::vector<change_ty>::iterator; + using pred_closure_iterator_ty = std::set<change_ty>::iterator; + using succ_closure_iterator_ty = std::set<change_ty>::iterator; DAGDeltaAlgorithm &DDA; diff --git a/llvm/lib/Support/DynamicLibrary.cpp b/llvm/lib/Support/DynamicLibrary.cpp index f1c15c00cedea..61566d3722419 100644 --- a/llvm/lib/Support/DynamicLibrary.cpp +++ b/llvm/lib/Support/DynamicLibrary.cpp @@ -23,7 +23,7 @@ using namespace llvm::sys; // All methods for HandleSet should be used holding SymbolsMutex. 
class DynamicLibrary::HandleSet { - typedef std::vector<void *> HandleList; + using HandleList = std::vector<void *>; HandleList Handles; void *Process = &Invalid; diff --git a/llvm/lib/Support/Timer.cpp b/llvm/lib/Support/Timer.cpp index 9d45096dddd97..b08f5083e00a8 100644 --- a/llvm/lib/Support/Timer.cpp +++ b/llvm/lib/Support/Timer.cpp @@ -207,7 +207,7 @@ void TimeRecord::print(const TimeRecord &Total, raw_ostream &OS) const { namespace { -typedef StringMap<Timer> Name2TimerMap; +using Name2TimerMap = StringMap<Timer>; class Name2PairMap { StringMap<std::pair<TimerGroup*, Name2TimerMap> > Map; From bb14b831d20a5b22da017d94b046fa40316f8364 Mon Sep 17 00:00:00 2001 From: Kazu Hirata <kazu@google.com> Date: Mon, 3 Nov 2025 08:41:41 -0800 Subject: [PATCH 035/313] [llvm] Proofread MIRLangRef.rst (#166131) --- llvm/docs/MIRLangRef.rst | 56 ++++++++++++++++++++-------------------- 1 file changed, 28 insertions(+), 28 deletions(-) diff --git a/llvm/docs/MIRLangRef.rst b/llvm/docs/MIRLangRef.rst index 3f4c3cde9b3aa..f7647c898c1e6 100644 --- a/llvm/docs/MIRLangRef.rst +++ b/llvm/docs/MIRLangRef.rst @@ -86,25 +86,25 @@ Tests are more accessible and future proof when simplified: - Use the ``-simplify-mir`` option with llc. - Machine function attributes often have default values or the test works just - as well with default values. Typical candidates for this are: `alignment:`, - `exposesReturnsTwice`, `legalized`, `regBankSelected`, `selected`. + as well with default values. Typical candidates for this are: ``alignment:``, + ``exposesReturnsTwice``, ``legalized``, ``regBankSelected``, ``selected``. The whole `frameInfo` section is often unnecessary if there is no special - frame usage in the function. `tracksRegLiveness` on the other hand is often + frame usage in the function. ``tracksRegLiveness`` on the other hand is often necessary for some passes that care about block livein lists. -- The (global) `liveins:` list is typically only interesting for early +- The (global) ``liveins:`` list is typically only interesting for early instruction selection passes and can be removed when testing later passes. - The per-block `liveins:` on the other hand are necessary if + The per-block ``liveins:`` on the other hand are necessary if `tracksRegLiveness` is true. -- Branch probability data in block `successors:` lists can be dropped if the +- Branch probability data in block ``successors:`` lists can be dropped if the test doesn't depend on it. Example: - `successors: %bb.1(0x40000000), %bb.2(0x40000000)` can be replaced with - `successors: %bb.1, %bb.2`. + ``successors: %bb.1(0x40000000), %bb.2(0x40000000)`` can be replaced with + ``successors: %bb.1, %bb.2``. - MIR code contains a whole IR module. This is necessary because there are no equivalents in MIR for global variables, references to external functions, - function attributes, metadata, debug info. Instead some MIR data references + function attributes, metadata, debug info. Instead, some MIR data references the IR constructs. You can often remove them if the test doesn't depend on them. @@ -114,16 +114,16 @@ Tests are more accessible and future proof when simplified: dropped: `:: (load 8)` - MIR blocks can reference IR blocks for debug printing, profile information, - or debug locations. Example: `bb.42.myblock` in MIR references the IR block - `myblock`. It is usually possible to drop the `.myblock` reference and simply - use `bb.42`. + or debug locations. Example: ``bb.42.myblock`` in MIR references the IR block + ``myblock``. 
It is usually possible to drop the ``.myblock`` reference and simply + use ``bb.42``. - If there are no memory operands or blocks referencing the IR, then the IR function can be replaced by a parameterless dummy function like - `define @func() { ret void }`. + ``define @func() { ret void }``. - It is possible to drop the whole IR section of the MIR file if it only - contains dummy functions (see above). The .mir loader will create the + contains dummy functions (see above). The ``.mir`` loader will create the IR functions automatically in this case. .. _limitations: @@ -131,7 +131,7 @@ Tests are more accessible and future proof when simplified: Limitations ----------- -Currently the MIR format has several limitations in terms of which state it +Currently, the MIR format has several limitations in terms of which state it can serialize: - The target-specific state in the target-specific ``MachineFunctionInfo`` @@ -150,7 +150,7 @@ These limitations impose restrictions on what you can test with the MIR format. For now, tests that would like to test some behaviour that depends on the state of temporary or local ``MCSymbol`` operands or the exception handling state in MMI, can't use the MIR format. As well as that, tests that test some behaviour -that depends on the state of the target specific ``MachineFunctionInfo`` or +that depends on the state of the target-specific ``MachineFunctionInfo`` or ``MachineConstantPoolValue`` subclasses can't use the MIR format at the moment. High Level Structure @@ -286,7 +286,7 @@ Example: Successors ^^^^^^^^^^ -The machine basic block's successors have to be specified before any of the +The machine basic block's successors must be specified before any of the instructions: .. code-block:: text @@ -489,13 +489,13 @@ In case this is true, the Machine Operand is printed according to the target. For example: -In AArch64RegisterInfo.td: +In ``AArch64RegisterInfo.td``: .. code-block:: text def sub_32 : SubRegIndex<32>; -If the third operand is an immediate with the value ``15`` (target-dependent +If the third operand is an immediate with the value ``15`` (a target-dependent value), based on the instruction's opcode and the operand's index the operand will be printed as ``%subreg.sub_32``: @@ -503,7 +503,7 @@ will be printed as ``%subreg.sub_32``: %1:gpr64 = SUBREG_TO_REG 0, %0, %subreg.sub_32 -For integers > 64 bits, we use a special machine operand, ``MO_CImmediate``, +For integers larger than 64 bits, we use a special machine operand, ``MO_CImmediate``, which stores the immediate in a ``ConstantInt`` using an ``APInt`` (LLVM's arbitrary-precision integers). @@ -552,7 +552,7 @@ corresponding internal ``llvm::RegState`` representation: * - ``implicit`` - ``RegState::Implicit`` - - Not emitted register (e.g. carry, or temporary result). + - Not emitted register (e.g., carry, or temporary result). * - ``implicit-def`` - ``RegState::ImplicitDefine`` @@ -625,7 +625,7 @@ For a CPI with the index 0 and offset -12: %1:gr64 = MOV64ri %const.0 - 12 -A constant pool entry is bound to a LLVM IR ``Constant`` or a target-specific +A constant pool entry is bound to an LLVM IR ``Constant`` or a target-specific ``MachineConstantPoolValue``. When serializing all the function's constants, the following format is used: @@ -670,12 +670,12 @@ a global value operand named ``G``: $rax = MOV64rm $rip, 1, _, @G, _ -The named global values are represented using an identifier with the '@' prefix. +The named global values are represented using an identifier with the ``@`` prefix. 
If the identifier doesn't match the regular expression -`[-a-zA-Z$._][-a-zA-Z$._0-9]*`, then this identifier must be quoted. +``[-a-zA-Z$._][-a-zA-Z$._0-9]*``, then this identifier must be quoted. The unnamed global values are represented using an unsigned numeric value with -the '@' prefix, like in the following examples: ``@0``, ``@989``. +the ``@`` prefix, as in the following examples: ``@0``, ``@989``. Target-dependent Index Operands ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -741,7 +741,7 @@ Example: MCSymbol Operands ^^^^^^^^^^^^^^^^^ -A MCSymbol operand holds a pointer to a ``MCSymbol``. For the limitations +An ``MCSymbol`` operand holds a pointer to an ``MCSymbol``. For the limitations of this operand in MIR, see :ref:`limitations <limitations>`. The syntax is: @@ -825,7 +825,7 @@ Comments ^^^^^^^^ Machine operands can have C/C++ style comments, which are annotations enclosed -between ``/*`` and ``*/`` to improve readability of e.g. immediate operands. +between ``/*`` and ``*/`` to improve readability of e.g., immediate operands. In the example below, ARM instructions EOR and BCC and immediate operands ``14`` and ``0`` have been annotated with their condition codes (CC) definitions, i.e. the ``always`` and ``eq`` condition codes: @@ -920,7 +920,7 @@ Instruction referencing locations This experimental feature aims to separate the specification of variable *values* from the program point where a variable takes on that value. Changes -in variable value occur in the same manner as ``DBG_VALUE`` meta instructions +in a variable value occur in the same manner as ``DBG_VALUE`` meta instructions but using ``DBG_INSTR_REF``. Variable values are identified by a pair of instruction number and operand number. Consider the example below: From 645a9ed7f26c7454b9973239a6b737ebdb42277b Mon Sep 17 00:00:00 2001 From: Aiden Grossman <aidengrossman@google.com> Date: Mon, 3 Nov 2025 09:03:54 -0800 Subject: [PATCH 036/313] [Github] Truncate Bug Emails when Necessary (#166081) Mailgun limits the size of an email payload to 16k. Truncate the issue body, which should be the largest part, to around 15k, and point the user to GitHub to see the rest. Fixes #165020 --- .github/workflows/llvm-bugs.yml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/.github/workflows/llvm-bugs.yml b/.github/workflows/llvm-bugs.yml index 7d42abfadde7b..cd3f396e7c465 100644 --- a/.github/workflows/llvm-bugs.yml +++ b/.github/workflows/llvm-bugs.yml @@ -39,6 +39,12 @@ jobs: repo: context.repo.repo }) .then((issue) => { + var maybeTruncatedBody = issue.data.body; + if (maybeTruncatedBody.length > 15000) { + maybeTruncatedBody = maybeTruncatedBody.substring(0,
+ } const payload = { author : issue.data.user.login, issue : issue.data.number, From 4cb8f97a0965f0a1138ab9f16aa00aff9b06c312 Mon Sep 17 00:00:00 2001 From: Min-Yih Hsu <min.hsu@sifive.com> Date: Mon, 3 Nov 2025 09:04:04 -0800 Subject: [PATCH 037/313] [clang][RISCV] Add C intrinsics for XSfvfexp* and XSfvfexpa* (#165792) Add C intrinsics for XSfvfexp16e/32e, XSfvfbfexp16e, and XSfvfexpa(64e) introduced in #164349 and #164499 Co-authored-by: Jesse Huang <jesse.huang@sifive.com> Co-authored-by: Craig Topper <craig.topper@sifive.com> --- .../clang/Basic/riscv_sifive_vector.td | 27 + .../non-policy/non-overloaded/sf_vfexp_v_16.c | 131 +++++ .../non-policy/non-overloaded/sf_vfexp_v_32.c | 111 ++++ .../non-policy/non-overloaded/sf_vfexp_v_bf.c | 135 +++++ .../non-policy/non-overloaded/sf_vfexpa_v.c | 234 +++++++++ .../non-overloaded/sf_vfexpa_v_64.c | 90 ++++ .../non-policy/overloaded/sf_vfexp_v_16.c | 131 +++++ .../non-policy/overloaded/sf_vfexp_v_32.c | 111 ++++ .../non-policy/overloaded/sf_vfexp_v_bf.c | 134 +++++ .../non-policy/overloaded/sf_vfexpa_v.c | 234 +++++++++ .../non-policy/overloaded/sf_vfexpa_v_64.c | 90 ++++ .../policy/non-overloaded/sf_vfexp_v_16.c | 248 +++++++++ .../policy/non-overloaded/sf_vfexp_v_32.c | 208 ++++++++ .../policy/non-overloaded/sf_vfexp_v_bf.c | 248 +++++++++ .../policy/non-overloaded/sf_vfexpa_v.c | 448 ++++++++++++++++ .../policy/non-overloaded/sf_vfexpa_v_64.c | 167 ++++++ .../policy/overloaded/sf_vfexp_v_16.c | 261 ++++++++++ .../policy/overloaded/sf_vfexp_v_32.c | 228 ++++++++ .../policy/overloaded/sf_vfexp_v_bf.c | 272 ++++++++++ .../policy/overloaded/sf_vfexpa_v.c | 492 ++++++++++++++++++ .../policy/overloaded/sf_vfexpa_v_64.c | 183 +++++++ 21 files changed, 4183 insertions(+) create mode 100644 clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_vfexp_v_16.c create mode 100644 clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_vfexp_v_32.c create mode 100644 clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_vfexp_v_bf.c create mode 100644 clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_vfexpa_v.c create mode 100644 clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_vfexpa_v_64.c create mode 100644 clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/overloaded/sf_vfexp_v_16.c create mode 100644 clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/overloaded/sf_vfexp_v_32.c create mode 100644 clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/overloaded/sf_vfexp_v_bf.c create mode 100644 clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/overloaded/sf_vfexpa_v.c create mode 100644 clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/overloaded/sf_vfexpa_v_64.c create mode 100644 clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/non-overloaded/sf_vfexp_v_16.c create mode 100644 clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/non-overloaded/sf_vfexp_v_32.c create mode 100644 clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/non-overloaded/sf_vfexp_v_bf.c create mode 100644 clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/non-overloaded/sf_vfexpa_v.c create mode 100644 clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/non-overloaded/sf_vfexpa_v_64.c create mode 100644 clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/overloaded/sf_vfexp_v_16.c create mode 100644 clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/overloaded/sf_vfexp_v_32.c create mode 100644 
clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/overloaded/sf_vfexp_v_bf.c create mode 100644 clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/overloaded/sf_vfexpa_v.c create mode 100644 clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/overloaded/sf_vfexpa_v_64.c diff --git a/clang/include/clang/Basic/riscv_sifive_vector.td b/clang/include/clang/Basic/riscv_sifive_vector.td index 89e644a078682..0371279aafc08 100644 --- a/clang/include/clang/Basic/riscv_sifive_vector.td +++ b/clang/include/clang/Basic/riscv_sifive_vector.td @@ -121,6 +121,13 @@ multiclass RVVVQMACCQOQBuiltinSet<list<list<string>> suffixes_prototypes> { defm NAME : RVVOutOp1Op2BuiltinSet<NAME, "s", suffixes_prototypes>; } +multiclass RVVVFEXPBuiltinSet<list<list<string>> suffixes_prototypes, string type_range> { + let UnMaskedPolicyScheme = HasPassthruOperand, + OverloadedName = NAME, + Log2LMUL = [-2, -1, 0, 1, 2, 3] in + defm NAME : RVVOutBuiltinSet<NAME, type_range, suffixes_prototypes>; +} + multiclass RVVVFNRCLIPBuiltinSet<string suffix, string prototype, string type_range> { let Log2LMUL = [-3, -2, -1, 0, 1, 2], Name = NAME, @@ -145,6 +152,26 @@ let UnMaskedPolicyScheme = HasPolicyOperand in defm sf_vqmaccsu_4x8x4 : RVVVQMACCQOQBuiltinSet<[["", "w", "ww(FixedSEW:8)Sv(FixedSEW:8)Uv"]]>; } +let RequiredFeatures = ["xsfvfbfexp16e"] in { + defm sf_vfexp : RVVVFEXPBuiltinSet<[["v", "v", "vv"]], "y">; +} + +let RequiredFeatures = ["xsfvfexp16e"] in { + defm sf_vfexp : RVVVFEXPBuiltinSet<[["v", "v", "vv"]], "x">; +} + +let RequiredFeatures = ["xsfvfexp32e"] in { + defm sf_vfexp : RVVVFEXPBuiltinSet<[["v", "v", "vv"]], "f">; +} + +let RequiredFeatures = ["xsfvfexpa"] in { + defm sf_vfexpa : RVVVFEXPBuiltinSet<[["v", "v", "vv"]], "xf">; +} + +let RequiredFeatures = ["xsfvfexpa64e"] in { + defm sf_vfexpa : RVVVFEXPBuiltinSet<[["v", "v", "vv"]], "d">; +} + let UnMaskedPolicyScheme = HasPolicyOperand in let RequiredFeatures = ["xsfvfwmaccqqq"] in defm sf_vfwmacc_4x4x4 : RVVVFWMACCBuiltinSet<[["", "Fw", "FwFwSvv"]]>; diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_vfexp_v_16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_vfexp_v_16.c new file mode 100644 index 0000000000000..a0d5845208529 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_vfexp_v_16.c @@ -0,0 +1,131 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x -target-feature +zvfh \ +// RUN: -target-feature +xsfvfexp16e -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include <sifive_vector.h> + +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x half> @test_sf_vfexp_v_f16mf4( +// CHECK-RV64-SAME: <vscale x 1 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x half> @llvm.riscv.sf.vfexp.nxv1f16.i64(<vscale x 1 x half> poison, <vscale x 1 x half> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 1 x half> [[TMP0]] +// +vfloat16mf4_t test_sf_vfexp_v_f16mf4(vfloat16mf4_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f16mf4(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x half> @test_sf_vfexp_v_f16mf2( +// CHECK-RV64-SAME: <vscale x 2 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { 
+// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x half> @llvm.riscv.sf.vfexp.nxv2f16.i64(<vscale x 2 x half> poison, <vscale x 2 x half> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 2 x half> [[TMP0]] +// +vfloat16mf2_t test_sf_vfexp_v_f16mf2(vfloat16mf2_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f16mf2(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x half> @test_sf_vfexp_v_f16m1( +// CHECK-RV64-SAME: <vscale x 4 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x half> @llvm.riscv.sf.vfexp.nxv4f16.i64(<vscale x 4 x half> poison, <vscale x 4 x half> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 4 x half> [[TMP0]] +// +vfloat16m1_t test_sf_vfexp_v_f16m1(vfloat16m1_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f16m1(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x half> @test_sf_vfexp_v_f16m2( +// CHECK-RV64-SAME: <vscale x 8 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x half> @llvm.riscv.sf.vfexp.nxv8f16.i64(<vscale x 8 x half> poison, <vscale x 8 x half> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 8 x half> [[TMP0]] +// +vfloat16m2_t test_sf_vfexp_v_f16m2(vfloat16m2_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f16m2(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 16 x half> @test_sf_vfexp_v_f16m4( +// CHECK-RV64-SAME: <vscale x 16 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x half> @llvm.riscv.sf.vfexp.nxv16f16.i64(<vscale x 16 x half> poison, <vscale x 16 x half> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 16 x half> [[TMP0]] +// +vfloat16m4_t test_sf_vfexp_v_f16m4(vfloat16m4_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f16m4(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 32 x half> @test_sf_vfexp_v_f16m8( +// CHECK-RV64-SAME: <vscale x 32 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 32 x half> @llvm.riscv.sf.vfexp.nxv32f16.i64(<vscale x 32 x half> poison, <vscale x 32 x half> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 32 x half> [[TMP0]] +// +vfloat16m8_t test_sf_vfexp_v_f16m8(vfloat16m8_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f16m8(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x half> @test_sf_vfexp_v_f16mf4_m( +// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x half> @llvm.riscv.sf.vfexp.mask.nxv1f16.i64(<vscale x 1 x half> poison, <vscale x 1 x half> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret <vscale x 1 x half> [[TMP0]] +// +vfloat16mf4_t test_sf_vfexp_v_f16mf4_m(vbool64_t vm, vfloat16mf4_t vs2, + size_t vl) { + return __riscv_sf_vfexp_v_f16mf4_m(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x half> @test_sf_vfexp_v_f16mf2_m( +// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x half> 
@llvm.riscv.sf.vfexp.mask.nxv2f16.i64(<vscale x 2 x half> poison, <vscale x 2 x half> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret <vscale x 2 x half> [[TMP0]] +// +vfloat16mf2_t test_sf_vfexp_v_f16mf2_m(vbool32_t vm, vfloat16mf2_t vs2, + size_t vl) { + return __riscv_sf_vfexp_v_f16mf2_m(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x half> @test_sf_vfexp_v_f16m1_m( +// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x half> @llvm.riscv.sf.vfexp.mask.nxv4f16.i64(<vscale x 4 x half> poison, <vscale x 4 x half> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret <vscale x 4 x half> [[TMP0]] +// +vfloat16m1_t test_sf_vfexp_v_f16m1_m(vbool16_t vm, vfloat16m1_t vs2, + size_t vl) { + return __riscv_sf_vfexp_v_f16m1_m(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x half> @test_sf_vfexp_v_f16m2_m( +// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x half> @llvm.riscv.sf.vfexp.mask.nxv8f16.i64(<vscale x 8 x half> poison, <vscale x 8 x half> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret <vscale x 8 x half> [[TMP0]] +// +vfloat16m2_t test_sf_vfexp_v_f16m2_m(vbool8_t vm, vfloat16m2_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f16m2_m(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 16 x half> @test_sf_vfexp_v_f16m4_m( +// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x half> @llvm.riscv.sf.vfexp.mask.nxv16f16.i64(<vscale x 16 x half> poison, <vscale x 16 x half> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret <vscale x 16 x half> [[TMP0]] +// +vfloat16m4_t test_sf_vfexp_v_f16m4_m(vbool4_t vm, vfloat16m4_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f16m4_m(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 32 x half> @test_sf_vfexp_v_f16m8_m( +// CHECK-RV64-SAME: <vscale x 32 x i1> [[VM:%.*]], <vscale x 32 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 32 x half> @llvm.riscv.sf.vfexp.mask.nxv32f16.i64(<vscale x 32 x half> poison, <vscale x 32 x half> [[VS2]], <vscale x 32 x i1> [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret <vscale x 32 x half> [[TMP0]] +// +vfloat16m8_t test_sf_vfexp_v_f16m8_m(vbool2_t vm, vfloat16m8_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f16m8_m(vm, vs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_vfexp_v_32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_vfexp_v_32.c new file mode 100644 index 0000000000000..25d0991fa70cd --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_vfexp_v_32.c @@ -0,0 +1,111 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x -target-feature +zve32f \ +// RUN: -target-feature +xsfvfexp32e 
-disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include <sifive_vector.h> + +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_sf_vfexp_v_f32mf2( +// CHECK-RV64-SAME: <vscale x 1 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.sf.vfexp.nxv1f32.i64(<vscale x 1 x float> poison, <vscale x 1 x float> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 1 x float> [[TMP0]] +// +vfloat32mf2_t test_sf_vfexp_v_f32mf2(vfloat32mf2_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f32mf2(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_sf_vfexp_v_f32m1( +// CHECK-RV64-SAME: <vscale x 2 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.sf.vfexp.nxv2f32.i64(<vscale x 2 x float> poison, <vscale x 2 x float> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 2 x float> [[TMP0]] +// +vfloat32m1_t test_sf_vfexp_v_f32m1(vfloat32m1_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f32m1(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_sf_vfexp_v_f32m2( +// CHECK-RV64-SAME: <vscale x 4 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.sf.vfexp.nxv4f32.i64(<vscale x 4 x float> poison, <vscale x 4 x float> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 4 x float> [[TMP0]] +// +vfloat32m2_t test_sf_vfexp_v_f32m2(vfloat32m2_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f32m2(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_sf_vfexp_v_f32m4( +// CHECK-RV64-SAME: <vscale x 8 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.sf.vfexp.nxv8f32.i64(<vscale x 8 x float> poison, <vscale x 8 x float> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 8 x float> [[TMP0]] +// +vfloat32m4_t test_sf_vfexp_v_f32m4(vfloat32m4_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f32m4(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 16 x float> @test_sf_vfexp_v_f32m8( +// CHECK-RV64-SAME: <vscale x 16 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x float> @llvm.riscv.sf.vfexp.nxv16f32.i64(<vscale x 16 x float> poison, <vscale x 16 x float> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 16 x float> [[TMP0]] +// +vfloat32m8_t test_sf_vfexp_v_f32m8(vfloat32m8_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f32m8(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_sf_vfexp_v_f32mf2_m( +// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.sf.vfexp.mask.nxv1f32.i64(<vscale x 1 x float> poison, <vscale x 1 x float> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret <vscale x 1 x float> [[TMP0]] +// +vfloat32mf2_t test_sf_vfexp_v_f32mf2_m(vbool64_t vm, vfloat32mf2_t vs2, + size_t vl) { + return 
__riscv_sf_vfexp_v_f32mf2_m(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_sf_vfexp_v_f32m1_m( +// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.sf.vfexp.mask.nxv2f32.i64(<vscale x 2 x float> poison, <vscale x 2 x float> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret <vscale x 2 x float> [[TMP0]] +// +vfloat32m1_t test_sf_vfexp_v_f32m1_m(vbool32_t vm, vfloat32m1_t vs2, + size_t vl) { + return __riscv_sf_vfexp_v_f32m1_m(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_sf_vfexp_v_f32m2_m( +// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.sf.vfexp.mask.nxv4f32.i64(<vscale x 4 x float> poison, <vscale x 4 x float> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret <vscale x 4 x float> [[TMP0]] +// +vfloat32m2_t test_sf_vfexp_v_f32m2_m(vbool16_t vm, vfloat32m2_t vs2, + size_t vl) { + return __riscv_sf_vfexp_v_f32m2_m(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_sf_vfexp_v_f32m4_m( +// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.sf.vfexp.mask.nxv8f32.i64(<vscale x 8 x float> poison, <vscale x 8 x float> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret <vscale x 8 x float> [[TMP0]] +// +vfloat32m4_t test_sf_vfexp_v_f32m4_m(vbool8_t vm, vfloat32m4_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f32m4_m(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 16 x float> @test_sf_vfexp_v_f32m8_m( +// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x float> @llvm.riscv.sf.vfexp.mask.nxv16f32.i64(<vscale x 16 x float> poison, <vscale x 16 x float> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret <vscale x 16 x float> [[TMP0]] +// +vfloat32m8_t test_sf_vfexp_v_f32m8_m(vbool4_t vm, vfloat32m8_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f32m8_m(vm, vs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_vfexp_v_bf.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_vfexp_v_bf.c new file mode 100644 index 0000000000000..9fc332a1469ff --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_vfexp_v_bf.c @@ -0,0 +1,135 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x -target-feature +zve32f \ +// RUN: -target-feature +zvfbfmin -target-feature +xsfvfbfexp16e -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include <sifive_vector.h> + +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x bfloat> @test_sf_vfexp_v_bf16mf4( +// 
CHECK-RV64-SAME: <vscale x 1 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x bfloat> @llvm.riscv.sf.vfexp.nxv1bf16.i64(<vscale x 1 x bfloat> poison, <vscale x 1 x bfloat> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 1 x bfloat> [[TMP0]] +// +vbfloat16mf4_t test_sf_vfexp_v_bf16mf4(vbfloat16mf4_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_bf16mf4(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x bfloat> @test_sf_vfexp_v_bf16mf2( +// CHECK-RV64-SAME: <vscale x 2 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x bfloat> @llvm.riscv.sf.vfexp.nxv2bf16.i64(<vscale x 2 x bfloat> poison, <vscale x 2 x bfloat> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 2 x bfloat> [[TMP0]] +// +vbfloat16mf2_t test_sf_vfexp_v_bf16mf2(vbfloat16mf2_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_bf16mf2(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x bfloat> @test_sf_vfexp_v_bf16m1( +// CHECK-RV64-SAME: <vscale x 4 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x bfloat> @llvm.riscv.sf.vfexp.nxv4bf16.i64(<vscale x 4 x bfloat> poison, <vscale x 4 x bfloat> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 4 x bfloat> [[TMP0]] +// +vbfloat16m1_t test_sf_vfexp_v_bf16m1(vbfloat16m1_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_bf16m1(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x bfloat> @test_sf_vfexp_v_bf16m2( +// CHECK-RV64-SAME: <vscale x 8 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x bfloat> @llvm.riscv.sf.vfexp.nxv8bf16.i64(<vscale x 8 x bfloat> poison, <vscale x 8 x bfloat> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 8 x bfloat> [[TMP0]] +// +vbfloat16m2_t test_sf_vfexp_v_bf16m2(vbfloat16m2_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_bf16m2(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 16 x bfloat> @test_sf_vfexp_v_bf16m4( +// CHECK-RV64-SAME: <vscale x 16 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x bfloat> @llvm.riscv.sf.vfexp.nxv16bf16.i64(<vscale x 16 x bfloat> poison, <vscale x 16 x bfloat> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 16 x bfloat> [[TMP0]] +// +vbfloat16m4_t test_sf_vfexp_v_bf16m4(vbfloat16m4_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_bf16m4(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 32 x bfloat> @test_sf_vfexp_v_bf16m8( +// CHECK-RV64-SAME: <vscale x 32 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 32 x bfloat> @llvm.riscv.sf.vfexp.nxv32bf16.i64(<vscale x 32 x bfloat> poison, <vscale x 32 x bfloat> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 32 x bfloat> [[TMP0]] +// +vbfloat16m8_t test_sf_vfexp_v_bf16m8(vbfloat16m8_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_bf16m8(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x bfloat> @test_sf_vfexp_v_bf16mf4_m( +// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// 
CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv1bf16.i64(<vscale x 1 x bfloat> poison, <vscale x 1 x bfloat> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret <vscale x 1 x bfloat> [[TMP0]] +// +vbfloat16mf4_t test_sf_vfexp_v_bf16mf4_m(vbool64_t vm, vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_sf_vfexp_v_bf16mf4_m(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x bfloat> @test_sf_vfexp_v_bf16mf2_m( +// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv2bf16.i64(<vscale x 2 x bfloat> poison, <vscale x 2 x bfloat> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret <vscale x 2 x bfloat> [[TMP0]] +// +vbfloat16mf2_t test_sf_vfexp_v_bf16mf2_m(vbool32_t vm, vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_sf_vfexp_v_bf16mf2_m(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x bfloat> @test_sf_vfexp_v_bf16m1_m( +// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv4bf16.i64(<vscale x 4 x bfloat> poison, <vscale x 4 x bfloat> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret <vscale x 4 x bfloat> [[TMP0]] +// +vbfloat16m1_t test_sf_vfexp_v_bf16m1_m(vbool16_t vm, vbfloat16m1_t vs2, + size_t vl) { + return __riscv_sf_vfexp_v_bf16m1_m(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x bfloat> @test_sf_vfexp_v_bf16m2_m( +// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv8bf16.i64(<vscale x 8 x bfloat> poison, <vscale x 8 x bfloat> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret <vscale x 8 x bfloat> [[TMP0]] +// +vbfloat16m2_t test_sf_vfexp_v_bf16m2_m(vbool8_t vm, vbfloat16m2_t vs2, + size_t vl) { + return __riscv_sf_vfexp_v_bf16m2_m(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 16 x bfloat> @test_sf_vfexp_v_bf16m4_m( +// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv16bf16.i64(<vscale x 16 x bfloat> poison, <vscale x 16 x bfloat> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret <vscale x 16 x bfloat> [[TMP0]] +// +vbfloat16m4_t test_sf_vfexp_v_bf16m4_m(vbool4_t vm, vbfloat16m4_t vs2, + size_t vl) { + return __riscv_sf_vfexp_v_bf16m4_m(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 32 x bfloat> @test_sf_vfexp_v_bf16m8_m( +// CHECK-RV64-SAME: <vscale x 32 x i1> [[VM:%.*]], <vscale x 32 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 32 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv32bf16.i64(<vscale x 32 x bfloat> poison, <vscale x 32 x bfloat> [[VS2]], <vscale x 32 x i1> [[VM]], i64 [[VL]], i64 3) 
+// CHECK-RV64-NEXT: ret <vscale x 32 x bfloat> [[TMP0]] +// +vbfloat16m8_t test_sf_vfexp_v_bf16m8_m(vbool2_t vm, vbfloat16m8_t vs2, + size_t vl) { + return __riscv_sf_vfexp_v_bf16m8_m(vm, vs2, vl); +} + diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_vfexpa_v.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_vfexpa_v.c new file mode 100644 index 0000000000000..67a9220bd011d --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_vfexpa_v.c @@ -0,0 +1,234 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64f -target-feature +zvfh \ +// RUN: -target-feature +xsfvfexpa -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include <sifive_vector.h> + +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x half> @test_sf_vfexpa_v_f16mf4( +// CHECK-RV64-SAME: <vscale x 1 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x half> @llvm.riscv.sf.vfexpa.nxv1f16.i64(<vscale x 1 x half> poison, <vscale x 1 x half> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 1 x half> [[TMP0]] +// +vfloat16mf4_t test_sf_vfexpa_v_f16mf4(vfloat16mf4_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f16mf4(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x half> @test_sf_vfexpa_v_f16mf2( +// CHECK-RV64-SAME: <vscale x 2 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x half> @llvm.riscv.sf.vfexpa.nxv2f16.i64(<vscale x 2 x half> poison, <vscale x 2 x half> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 2 x half> [[TMP0]] +// +vfloat16mf2_t test_sf_vfexpa_v_f16mf2(vfloat16mf2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f16mf2(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x half> @test_sf_vfexpa_v_f16m1( +// CHECK-RV64-SAME: <vscale x 4 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x half> @llvm.riscv.sf.vfexpa.nxv4f16.i64(<vscale x 4 x half> poison, <vscale x 4 x half> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 4 x half> [[TMP0]] +// +vfloat16m1_t test_sf_vfexpa_v_f16m1(vfloat16m1_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f16m1(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x half> @test_sf_vfexpa_v_f16m2( +// CHECK-RV64-SAME: <vscale x 8 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x half> @llvm.riscv.sf.vfexpa.nxv8f16.i64(<vscale x 8 x half> poison, <vscale x 8 x half> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 8 x half> [[TMP0]] +// +vfloat16m2_t test_sf_vfexpa_v_f16m2(vfloat16m2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f16m2(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 16 x half> @test_sf_vfexpa_v_f16m4( +// CHECK-RV64-SAME: <vscale x 16 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x half> @llvm.riscv.sf.vfexpa.nxv16f16.i64(<vscale x 16 x half> poison, <vscale x 16 x half> [[VS2]], i64 [[VL]]) 
+// CHECK-RV64-NEXT: ret <vscale x 16 x half> [[TMP0]] +// +vfloat16m4_t test_sf_vfexpa_v_f16m4(vfloat16m4_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f16m4(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 32 x half> @test_sf_vfexpa_v_f16m8( +// CHECK-RV64-SAME: <vscale x 32 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 32 x half> @llvm.riscv.sf.vfexpa.nxv32f16.i64(<vscale x 32 x half> poison, <vscale x 32 x half> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 32 x half> [[TMP0]] +// +vfloat16m8_t test_sf_vfexpa_v_f16m8(vfloat16m8_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f16m8(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_sf_vfexpa_v_f32mf2( +// CHECK-RV64-SAME: <vscale x 1 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.sf.vfexpa.nxv1f32.i64(<vscale x 1 x float> poison, <vscale x 1 x float> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 1 x float> [[TMP0]] +// +vfloat32mf2_t test_sf_vfexpa_v_f32mf2(vfloat32mf2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f32mf2(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_sf_vfexpa_v_f32m1( +// CHECK-RV64-SAME: <vscale x 2 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.sf.vfexpa.nxv2f32.i64(<vscale x 2 x float> poison, <vscale x 2 x float> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 2 x float> [[TMP0]] +// +vfloat32m1_t test_sf_vfexpa_v_f32m1(vfloat32m1_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f32m1(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_sf_vfexpa_v_f32m2( +// CHECK-RV64-SAME: <vscale x 4 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.sf.vfexpa.nxv4f32.i64(<vscale x 4 x float> poison, <vscale x 4 x float> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 4 x float> [[TMP0]] +// +vfloat32m2_t test_sf_vfexpa_v_f32m2(vfloat32m2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f32m2(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_sf_vfexpa_v_f32m4( +// CHECK-RV64-SAME: <vscale x 8 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.sf.vfexpa.nxv8f32.i64(<vscale x 8 x float> poison, <vscale x 8 x float> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 8 x float> [[TMP0]] +// +vfloat32m4_t test_sf_vfexpa_v_f32m4(vfloat32m4_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f32m4(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 16 x float> @test_sf_vfexpa_v_f32m8( +// CHECK-RV64-SAME: <vscale x 16 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x float> @llvm.riscv.sf.vfexpa.nxv16f32.i64(<vscale x 16 x float> poison, <vscale x 16 x float> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 16 x float> [[TMP0]] +// +vfloat32m8_t test_sf_vfexpa_v_f32m8(vfloat32m8_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f32m8(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x half> 
@test_sf_vfexpa_v_f16mf4_m( +// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x half> @llvm.riscv.sf.vfexpa.mask.nxv1f16.i64(<vscale x 1 x half> poison, <vscale x 1 x half> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret <vscale x 1 x half> [[TMP0]] +// +vfloat16mf4_t test_sf_vfexpa_v_f16mf4_m(vbool64_t vm, vfloat16mf4_t vs2, + size_t vl) { + return __riscv_sf_vfexpa_v_f16mf4_m(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x half> @test_sf_vfexpa_v_f16mf2_m( +// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x half> @llvm.riscv.sf.vfexpa.mask.nxv2f16.i64(<vscale x 2 x half> poison, <vscale x 2 x half> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret <vscale x 2 x half> [[TMP0]] +// +vfloat16mf2_t test_sf_vfexpa_v_f16mf2_m(vbool32_t vm, vfloat16mf2_t vs2, + size_t vl) { + return __riscv_sf_vfexpa_v_f16mf2_m(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x half> @test_sf_vfexpa_v_f16m1_m( +// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x half> @llvm.riscv.sf.vfexpa.mask.nxv4f16.i64(<vscale x 4 x half> poison, <vscale x 4 x half> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret <vscale x 4 x half> [[TMP0]] +// +vfloat16m1_t test_sf_vfexpa_v_f16m1_m(vbool16_t vm, vfloat16m1_t vs2, + size_t vl) { + return __riscv_sf_vfexpa_v_f16m1_m(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x half> @test_sf_vfexpa_v_f16m2_m( +// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x half> @llvm.riscv.sf.vfexpa.mask.nxv8f16.i64(<vscale x 8 x half> poison, <vscale x 8 x half> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret <vscale x 8 x half> [[TMP0]] +// +vfloat16m2_t test_sf_vfexpa_v_f16m2_m(vbool8_t vm, vfloat16m2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f16m2_m(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 16 x half> @test_sf_vfexpa_v_f16m4_m( +// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x half> @llvm.riscv.sf.vfexpa.mask.nxv16f16.i64(<vscale x 16 x half> poison, <vscale x 16 x half> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret <vscale x 16 x half> [[TMP0]] +// +vfloat16m4_t test_sf_vfexpa_v_f16m4_m(vbool4_t vm, vfloat16m4_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f16m4_m(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 32 x half> @test_sf_vfexpa_v_f16m8_m( +// CHECK-RV64-SAME: <vscale x 32 x i1> [[VM:%.*]], <vscale x 32 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 32 x half> @llvm.riscv.sf.vfexpa.mask.nxv32f16.i64(<vscale x 32 x half> poison, <vscale x 32 x half> [[VS2]], <vscale x 32 x 
i1> [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret <vscale x 32 x half> [[TMP0]] +// +vfloat16m8_t test_sf_vfexpa_v_f16m8_m(vbool2_t vm, vfloat16m8_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f16m8_m(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_sf_vfexpa_v_f32mf2_m( +// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.sf.vfexpa.mask.nxv1f32.i64(<vscale x 1 x float> poison, <vscale x 1 x float> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret <vscale x 1 x float> [[TMP0]] +// +vfloat32mf2_t test_sf_vfexpa_v_f32mf2_m(vbool64_t vm, vfloat32mf2_t vs2, + size_t vl) { + return __riscv_sf_vfexpa_v_f32mf2_m(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_sf_vfexpa_v_f32m1_m( +// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.sf.vfexpa.mask.nxv2f32.i64(<vscale x 2 x float> poison, <vscale x 2 x float> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret <vscale x 2 x float> [[TMP0]] +// +vfloat32m1_t test_sf_vfexpa_v_f32m1_m(vbool32_t vm, vfloat32m1_t vs2, + size_t vl) { + return __riscv_sf_vfexpa_v_f32m1_m(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_sf_vfexpa_v_f32m2_m( +// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.sf.vfexpa.mask.nxv4f32.i64(<vscale x 4 x float> poison, <vscale x 4 x float> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret <vscale x 4 x float> [[TMP0]] +// +vfloat32m2_t test_sf_vfexpa_v_f32m2_m(vbool16_t vm, vfloat32m2_t vs2, + size_t vl) { + return __riscv_sf_vfexpa_v_f32m2_m(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_sf_vfexpa_v_f32m4_m( +// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.sf.vfexpa.mask.nxv8f32.i64(<vscale x 8 x float> poison, <vscale x 8 x float> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret <vscale x 8 x float> [[TMP0]] +// +vfloat32m4_t test_sf_vfexpa_v_f32m4_m(vbool8_t vm, vfloat32m4_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f32m4_m(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 16 x float> @test_sf_vfexpa_v_f32m8_m( +// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x float> @llvm.riscv.sf.vfexpa.mask.nxv16f32.i64(<vscale x 16 x float> poison, <vscale x 16 x float> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret <vscale x 16 x float> [[TMP0]] +// +vfloat32m8_t test_sf_vfexpa_v_f32m8_m(vbool4_t vm, vfloat32m8_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f32m8_m(vm, vs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_vfexpa_v_64.c 
b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_vfexpa_v_64.c new file mode 100644 index 0000000000000..fd6f82db52953 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_vfexpa_v_64.c @@ -0,0 +1,90 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +xsfvfexpa64e \ +// RUN: -disable-O0-optnone -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include <sifive_vector.h> + +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x double> @test_sf_vfexpa_v_f64m1( +// CHECK-RV64-SAME: <vscale x 1 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x double> @llvm.riscv.sf.vfexpa.nxv1f64.i64(<vscale x 1 x double> poison, <vscale x 1 x double> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 1 x double> [[TMP0]] +// +vfloat64m1_t test_sf_vfexpa_v_f64m1(vfloat64m1_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f64m1(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x double> @test_sf_vfexpa_v_f64m2( +// CHECK-RV64-SAME: <vscale x 2 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x double> @llvm.riscv.sf.vfexpa.nxv2f64.i64(<vscale x 2 x double> poison, <vscale x 2 x double> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 2 x double> [[TMP0]] +// +vfloat64m2_t test_sf_vfexpa_v_f64m2(vfloat64m2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f64m2(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x double> @test_sf_vfexpa_v_f64m4( +// CHECK-RV64-SAME: <vscale x 4 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x double> @llvm.riscv.sf.vfexpa.nxv4f64.i64(<vscale x 4 x double> poison, <vscale x 4 x double> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 4 x double> [[TMP0]] +// +vfloat64m4_t test_sf_vfexpa_v_f64m4(vfloat64m4_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f64m4(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x double> @test_sf_vfexpa_v_f64m8( +// CHECK-RV64-SAME: <vscale x 8 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x double> @llvm.riscv.sf.vfexpa.nxv8f64.i64(<vscale x 8 x double> poison, <vscale x 8 x double> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 8 x double> [[TMP0]] +// +vfloat64m8_t test_sf_vfexpa_v_f64m8(vfloat64m8_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f64m8(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x double> @test_sf_vfexpa_v_f64m1_m( +// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x double> @llvm.riscv.sf.vfexpa.mask.nxv1f64.i64(<vscale x 1 x double> poison, <vscale x 1 x double> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret <vscale x 1 x double> [[TMP0]] +// +vfloat64m1_t test_sf_vfexpa_v_f64m1_m(vbool64_t vm, vfloat64m1_t vs2, + size_t vl) { + return __riscv_sf_vfexpa_v_f64m1_m(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local 
<vscale x 2 x double> @test_sf_vfexpa_v_f64m2_m( +// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x double> @llvm.riscv.sf.vfexpa.mask.nxv2f64.i64(<vscale x 2 x double> poison, <vscale x 2 x double> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret <vscale x 2 x double> [[TMP0]] +// +vfloat64m2_t test_sf_vfexpa_v_f64m2_m(vbool32_t vm, vfloat64m2_t vs2, + size_t vl) { + return __riscv_sf_vfexpa_v_f64m2_m(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x double> @test_sf_vfexpa_v_f64m4_m( +// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x double> @llvm.riscv.sf.vfexpa.mask.nxv4f64.i64(<vscale x 4 x double> poison, <vscale x 4 x double> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret <vscale x 4 x double> [[TMP0]] +// +vfloat64m4_t test_sf_vfexpa_v_f64m4_m(vbool16_t vm, vfloat64m4_t vs2, + size_t vl) { + return __riscv_sf_vfexpa_v_f64m4_m(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x double> @test_sf_vfexpa_v_f64m8_m( +// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x double> @llvm.riscv.sf.vfexpa.mask.nxv8f64.i64(<vscale x 8 x double> poison, <vscale x 8 x double> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret <vscale x 8 x double> [[TMP0]] +// +vfloat64m8_t test_sf_vfexpa_v_f64m8_m(vbool8_t vm, vfloat64m8_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f64m8_m(vm, vs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/overloaded/sf_vfexp_v_16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/overloaded/sf_vfexp_v_16.c new file mode 100644 index 0000000000000..0e769ed5fc5bc --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/overloaded/sf_vfexp_v_16.c @@ -0,0 +1,131 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x -target-feature +zvfh \ +// RUN: -target-feature +xsfvfexp16e -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include <sifive_vector.h> + +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x half> @test_sf_vfexp_v_f16mf4( +// CHECK-RV64-SAME: <vscale x 1 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x half> @llvm.riscv.sf.vfexp.nxv1f16.i64(<vscale x 1 x half> poison, <vscale x 1 x half> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 1 x half> [[TMP0]] +// +vfloat16mf4_t test_sf_vfexp_v_f16mf4(vfloat16mf4_t vs2, size_t vl) { + return __riscv_sf_vfexp(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x half> @test_sf_vfexp_v_f16mf2( +// CHECK-RV64-SAME: <vscale x 2 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x half> @llvm.riscv.sf.vfexp.nxv2f16.i64(<vscale x 2 x 
half> poison, <vscale x 2 x half> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 2 x half> [[TMP0]] +// +vfloat16mf2_t test_sf_vfexp_v_f16mf2(vfloat16mf2_t vs2, size_t vl) { + return __riscv_sf_vfexp(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x half> @test_sf_vfexp_v_f16m1( +// CHECK-RV64-SAME: <vscale x 4 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x half> @llvm.riscv.sf.vfexp.nxv4f16.i64(<vscale x 4 x half> poison, <vscale x 4 x half> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 4 x half> [[TMP0]] +// +vfloat16m1_t test_sf_vfexp_v_f16m1(vfloat16m1_t vs2, size_t vl) { + return __riscv_sf_vfexp(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x half> @test_sf_vfexp_v_f16m2( +// CHECK-RV64-SAME: <vscale x 8 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x half> @llvm.riscv.sf.vfexp.nxv8f16.i64(<vscale x 8 x half> poison, <vscale x 8 x half> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 8 x half> [[TMP0]] +// +vfloat16m2_t test_sf_vfexp_v_f16m2(vfloat16m2_t vs2, size_t vl) { + return __riscv_sf_vfexp(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 16 x half> @test_sf_vfexp_v_f16m4( +// CHECK-RV64-SAME: <vscale x 16 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x half> @llvm.riscv.sf.vfexp.nxv16f16.i64(<vscale x 16 x half> poison, <vscale x 16 x half> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 16 x half> [[TMP0]] +// +vfloat16m4_t test_sf_vfexp_v_f16m4(vfloat16m4_t vs2, size_t vl) { + return __riscv_sf_vfexp(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 32 x half> @test_sf_vfexp_v_f16m8( +// CHECK-RV64-SAME: <vscale x 32 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 32 x half> @llvm.riscv.sf.vfexp.nxv32f16.i64(<vscale x 32 x half> poison, <vscale x 32 x half> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 32 x half> [[TMP0]] +// +vfloat16m8_t test_sf_vfexp_v_f16m8(vfloat16m8_t vs2, size_t vl) { + return __riscv_sf_vfexp(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x half> @test_sf_vfexp_v_f16mf4_m( +// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x half> @llvm.riscv.sf.vfexp.mask.nxv1f16.i64(<vscale x 1 x half> poison, <vscale x 1 x half> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret <vscale x 1 x half> [[TMP0]] +// +vfloat16mf4_t test_sf_vfexp_v_f16mf4_m(vbool64_t vm, vfloat16mf4_t vs2, + size_t vl) { + return __riscv_sf_vfexp(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x half> @test_sf_vfexp_v_f16mf2_m( +// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x half> @llvm.riscv.sf.vfexp.mask.nxv2f16.i64(<vscale x 2 x half> poison, <vscale x 2 x half> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret <vscale x 2 x half> [[TMP0]] +// +vfloat16mf2_t 
test_sf_vfexp_v_f16mf2_m(vbool32_t vm, vfloat16mf2_t vs2, + size_t vl) { + return __riscv_sf_vfexp(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x half> @test_sf_vfexp_v_f16m1_m( +// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x half> @llvm.riscv.sf.vfexp.mask.nxv4f16.i64(<vscale x 4 x half> poison, <vscale x 4 x half> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret <vscale x 4 x half> [[TMP0]] +// +vfloat16m1_t test_sf_vfexp_v_f16m1_m(vbool16_t vm, vfloat16m1_t vs2, + size_t vl) { + return __riscv_sf_vfexp(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x half> @test_sf_vfexp_v_f16m2_m( +// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x half> @llvm.riscv.sf.vfexp.mask.nxv8f16.i64(<vscale x 8 x half> poison, <vscale x 8 x half> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret <vscale x 8 x half> [[TMP0]] +// +vfloat16m2_t test_sf_vfexp_v_f16m2_m(vbool8_t vm, vfloat16m2_t vs2, size_t vl) { + return __riscv_sf_vfexp(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 16 x half> @test_sf_vfexp_v_f16m4_m( +// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x half> @llvm.riscv.sf.vfexp.mask.nxv16f16.i64(<vscale x 16 x half> poison, <vscale x 16 x half> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret <vscale x 16 x half> [[TMP0]] +// +vfloat16m4_t test_sf_vfexp_v_f16m4_m(vbool4_t vm, vfloat16m4_t vs2, size_t vl) { + return __riscv_sf_vfexp(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 32 x half> @test_sf_vfexp_v_f16m8_m( +// CHECK-RV64-SAME: <vscale x 32 x i1> [[VM:%.*]], <vscale x 32 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 32 x half> @llvm.riscv.sf.vfexp.mask.nxv32f16.i64(<vscale x 32 x half> poison, <vscale x 32 x half> [[VS2]], <vscale x 32 x i1> [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret <vscale x 32 x half> [[TMP0]] +// +vfloat16m8_t test_sf_vfexp_v_f16m8_m(vbool2_t vm, vfloat16m8_t vs2, size_t vl) { + return __riscv_sf_vfexp(vm, vs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/overloaded/sf_vfexp_v_32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/overloaded/sf_vfexp_v_32.c new file mode 100644 index 0000000000000..3df1eaa3a0467 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/overloaded/sf_vfexp_v_32.c @@ -0,0 +1,111 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x -target-feature +zve32f \ +// RUN: -target-feature +xsfvfexp32e -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include <sifive_vector.h> + +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_sf_vfexp_v_f32mf2( +// CHECK-RV64-SAME: <vscale x 1 x 
float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.sf.vfexp.nxv1f32.i64(<vscale x 1 x float> poison, <vscale x 1 x float> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 1 x float> [[TMP0]] +// +vfloat32mf2_t test_sf_vfexp_v_f32mf2(vfloat32mf2_t vs2, size_t vl) { + return __riscv_sf_vfexp(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_sf_vfexp_v_f32m1( +// CHECK-RV64-SAME: <vscale x 2 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.sf.vfexp.nxv2f32.i64(<vscale x 2 x float> poison, <vscale x 2 x float> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 2 x float> [[TMP0]] +// +vfloat32m1_t test_sf_vfexp_v_f32m1(vfloat32m1_t vs2, size_t vl) { + return __riscv_sf_vfexp(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_sf_vfexp_v_f32m2( +// CHECK-RV64-SAME: <vscale x 4 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.sf.vfexp.nxv4f32.i64(<vscale x 4 x float> poison, <vscale x 4 x float> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 4 x float> [[TMP0]] +// +vfloat32m2_t test_sf_vfexp_v_f32m2(vfloat32m2_t vs2, size_t vl) { + return __riscv_sf_vfexp(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_sf_vfexp_v_f32m4( +// CHECK-RV64-SAME: <vscale x 8 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.sf.vfexp.nxv8f32.i64(<vscale x 8 x float> poison, <vscale x 8 x float> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 8 x float> [[TMP0]] +// +vfloat32m4_t test_sf_vfexp_v_f32m4(vfloat32m4_t vs2, size_t vl) { + return __riscv_sf_vfexp(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 16 x float> @test_sf_vfexp_v_f32m8( +// CHECK-RV64-SAME: <vscale x 16 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x float> @llvm.riscv.sf.vfexp.nxv16f32.i64(<vscale x 16 x float> poison, <vscale x 16 x float> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 16 x float> [[TMP0]] +// +vfloat32m8_t test_sf_vfexp_v_f32m8(vfloat32m8_t vs2, size_t vl) { + return __riscv_sf_vfexp(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_sf_vfexp_v_f32mf2_m( +// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.sf.vfexp.mask.nxv1f32.i64(<vscale x 1 x float> poison, <vscale x 1 x float> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret <vscale x 1 x float> [[TMP0]] +// +vfloat32mf2_t test_sf_vfexp_v_f32mf2_m(vbool64_t vm, vfloat32mf2_t vs2, + size_t vl) { + return __riscv_sf_vfexp(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_sf_vfexp_v_f32m1_m( +// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 
x float> @llvm.riscv.sf.vfexp.mask.nxv2f32.i64(<vscale x 2 x float> poison, <vscale x 2 x float> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret <vscale x 2 x float> [[TMP0]] +// +vfloat32m1_t test_sf_vfexp_v_f32m1_m(vbool32_t vm, vfloat32m1_t vs2, + size_t vl) { + return __riscv_sf_vfexp(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_sf_vfexp_v_f32m2_m( +// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.sf.vfexp.mask.nxv4f32.i64(<vscale x 4 x float> poison, <vscale x 4 x float> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret <vscale x 4 x float> [[TMP0]] +// +vfloat32m2_t test_sf_vfexp_v_f32m2_m(vbool16_t vm, vfloat32m2_t vs2, + size_t vl) { + return __riscv_sf_vfexp(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_sf_vfexp_v_f32m4_m( +// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.sf.vfexp.mask.nxv8f32.i64(<vscale x 8 x float> poison, <vscale x 8 x float> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret <vscale x 8 x float> [[TMP0]] +// +vfloat32m4_t test_sf_vfexp_v_f32m4_m(vbool8_t vm, vfloat32m4_t vs2, size_t vl) { + return __riscv_sf_vfexp(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 16 x float> @test_sf_vfexp_v_f32m8_m( +// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x float> @llvm.riscv.sf.vfexp.mask.nxv16f32.i64(<vscale x 16 x float> poison, <vscale x 16 x float> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret <vscale x 16 x float> [[TMP0]] +// +vfloat32m8_t test_sf_vfexp_v_f32m8_m(vbool4_t vm, vfloat32m8_t vs2, size_t vl) { + return __riscv_sf_vfexp(vm, vs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/overloaded/sf_vfexp_v_bf.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/overloaded/sf_vfexp_v_bf.c new file mode 100644 index 0000000000000..6179dbe8d82e4 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/overloaded/sf_vfexp_v_bf.c @@ -0,0 +1,134 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x -target-feature +zve32f \ +// RUN: -target-feature +zvfbfmin -target-feature +xsfvfbfexp16e -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include <sifive_vector.h> + +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x bfloat> @test_sf_vfexp_v_bf16mf4( +// CHECK-RV64-SAME: <vscale x 1 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x bfloat> @llvm.riscv.sf.vfexp.nxv1bf16.i64(<vscale x 1 x bfloat> poison, <vscale x 1 x bfloat> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 1 x bfloat> [[TMP0]] +// +vbfloat16mf4_t 
test_sf_vfexp_v_bf16mf4(vbfloat16mf4_t vs2, size_t vl) { + return __riscv_sf_vfexp(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x bfloat> @test_sf_vfexp_v_bf16mf2( +// CHECK-RV64-SAME: <vscale x 2 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x bfloat> @llvm.riscv.sf.vfexp.nxv2bf16.i64(<vscale x 2 x bfloat> poison, <vscale x 2 x bfloat> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 2 x bfloat> [[TMP0]] +// +vbfloat16mf2_t test_sf_vfexp_v_bf16mf2(vbfloat16mf2_t vs2, size_t vl) { + return __riscv_sf_vfexp(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x bfloat> @test_sf_vfexp_v_bf16m1( +// CHECK-RV64-SAME: <vscale x 4 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x bfloat> @llvm.riscv.sf.vfexp.nxv4bf16.i64(<vscale x 4 x bfloat> poison, <vscale x 4 x bfloat> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 4 x bfloat> [[TMP0]] +// +vbfloat16m1_t test_sf_vfexp_v_bf16m1(vbfloat16m1_t vs2, size_t vl) { + return __riscv_sf_vfexp(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x bfloat> @test_sf_vfexp_v_bf16m2( +// CHECK-RV64-SAME: <vscale x 8 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x bfloat> @llvm.riscv.sf.vfexp.nxv8bf16.i64(<vscale x 8 x bfloat> poison, <vscale x 8 x bfloat> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 8 x bfloat> [[TMP0]] +// +vbfloat16m2_t test_sf_vfexp_v_bf16m2(vbfloat16m2_t vs2, size_t vl) { + return __riscv_sf_vfexp(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 16 x bfloat> @test_sf_vfexp_v_bf16m4( +// CHECK-RV64-SAME: <vscale x 16 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x bfloat> @llvm.riscv.sf.vfexp.nxv16bf16.i64(<vscale x 16 x bfloat> poison, <vscale x 16 x bfloat> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 16 x bfloat> [[TMP0]] +// +vbfloat16m4_t test_sf_vfexp_v_bf16m4(vbfloat16m4_t vs2, size_t vl) { + return __riscv_sf_vfexp(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 32 x bfloat> @test_sf_vfexp_v_bf16m8( +// CHECK-RV64-SAME: <vscale x 32 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 32 x bfloat> @llvm.riscv.sf.vfexp.nxv32bf16.i64(<vscale x 32 x bfloat> poison, <vscale x 32 x bfloat> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 32 x bfloat> [[TMP0]] +// +vbfloat16m8_t test_sf_vfexp_v_bf16m8(vbfloat16m8_t vs2, size_t vl) { + return __riscv_sf_vfexp(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x bfloat> @test_sf_vfexp_v_bf16mf4_m( +// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv1bf16.i64(<vscale x 1 x bfloat> poison, <vscale x 1 x bfloat> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret <vscale x 1 x bfloat> [[TMP0]] +// +vbfloat16mf4_t test_sf_vfexp_v_bf16mf4_m(vbool64_t vm, vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_sf_vfexp(vm, vs2, vl); +} + +// 
CHECK-RV64-LABEL: define dso_local <vscale x 2 x bfloat> @test_sf_vfexp_v_bf16mf2_m( +// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv2bf16.i64(<vscale x 2 x bfloat> poison, <vscale x 2 x bfloat> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret <vscale x 2 x bfloat> [[TMP0]] +// +vbfloat16mf2_t test_sf_vfexp_v_bf16mf2_m(vbool32_t vm, vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_sf_vfexp(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x bfloat> @test_sf_vfexp_v_bf16m1_m( +// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv4bf16.i64(<vscale x 4 x bfloat> poison, <vscale x 4 x bfloat> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret <vscale x 4 x bfloat> [[TMP0]] +// +vbfloat16m1_t test_sf_vfexp_v_bf16m1_m(vbool16_t vm, vbfloat16m1_t vs2, + size_t vl) { + return __riscv_sf_vfexp(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x bfloat> @test_sf_vfexp_v_bf16m2_m( +// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv8bf16.i64(<vscale x 8 x bfloat> poison, <vscale x 8 x bfloat> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret <vscale x 8 x bfloat> [[TMP0]] +// +vbfloat16m2_t test_sf_vfexp_v_bf16m2_m(vbool8_t vm, vbfloat16m2_t vs2, + size_t vl) { + return __riscv_sf_vfexp(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 16 x bfloat> @test_sf_vfexp_v_bf16m4_m( +// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv16bf16.i64(<vscale x 16 x bfloat> poison, <vscale x 16 x bfloat> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret <vscale x 16 x bfloat> [[TMP0]] +// +vbfloat16m4_t test_sf_vfexp_v_bf16m4_m(vbool4_t vm, vbfloat16m4_t vs2, + size_t vl) { + return __riscv_sf_vfexp(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 32 x bfloat> @test_sf_vfexp_v_bf16m8_m( +// CHECK-RV64-SAME: <vscale x 32 x i1> [[VM:%.*]], <vscale x 32 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 32 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv32bf16.i64(<vscale x 32 x bfloat> poison, <vscale x 32 x bfloat> [[VS2]], <vscale x 32 x i1> [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret <vscale x 32 x bfloat> [[TMP0]] +// +vbfloat16m8_t test_sf_vfexp_v_bf16m8_m(vbool2_t vm, vbfloat16m8_t vs2, + size_t vl) { + return __riscv_sf_vfexp(vm, vs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/overloaded/sf_vfexpa_v.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/overloaded/sf_vfexpa_v.c new file mode 100644 index 0000000000000..1ddbb0b84520c --- /dev/null +++ 
b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/overloaded/sf_vfexpa_v.c @@ -0,0 +1,234 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64f -target-feature +zvfh \ +// RUN: -target-feature +xsfvfexpa -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include <sifive_vector.h> + +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x half> @test_sf_vfexpa_v_f16mf4( +// CHECK-RV64-SAME: <vscale x 1 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x half> @llvm.riscv.sf.vfexpa.nxv1f16.i64(<vscale x 1 x half> poison, <vscale x 1 x half> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 1 x half> [[TMP0]] +// +vfloat16mf4_t test_sf_vfexpa_v_f16mf4(vfloat16mf4_t vs2, size_t vl) { + return __riscv_sf_vfexpa(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x half> @test_sf_vfexpa_v_f16mf2( +// CHECK-RV64-SAME: <vscale x 2 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x half> @llvm.riscv.sf.vfexpa.nxv2f16.i64(<vscale x 2 x half> poison, <vscale x 2 x half> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 2 x half> [[TMP0]] +// +vfloat16mf2_t test_sf_vfexpa_v_f16mf2(vfloat16mf2_t vs2, size_t vl) { + return __riscv_sf_vfexpa(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x half> @test_sf_vfexpa_v_f16m1( +// CHECK-RV64-SAME: <vscale x 4 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x half> @llvm.riscv.sf.vfexpa.nxv4f16.i64(<vscale x 4 x half> poison, <vscale x 4 x half> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 4 x half> [[TMP0]] +// +vfloat16m1_t test_sf_vfexpa_v_f16m1(vfloat16m1_t vs2, size_t vl) { + return __riscv_sf_vfexpa(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x half> @test_sf_vfexpa_v_f16m2( +// CHECK-RV64-SAME: <vscale x 8 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x half> @llvm.riscv.sf.vfexpa.nxv8f16.i64(<vscale x 8 x half> poison, <vscale x 8 x half> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 8 x half> [[TMP0]] +// +vfloat16m2_t test_sf_vfexpa_v_f16m2(vfloat16m2_t vs2, size_t vl) { + return __riscv_sf_vfexpa(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 16 x half> @test_sf_vfexpa_v_f16m4( +// CHECK-RV64-SAME: <vscale x 16 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x half> @llvm.riscv.sf.vfexpa.nxv16f16.i64(<vscale x 16 x half> poison, <vscale x 16 x half> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 16 x half> [[TMP0]] +// +vfloat16m4_t test_sf_vfexpa_v_f16m4(vfloat16m4_t vs2, size_t vl) { + return __riscv_sf_vfexpa(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 32 x half> @test_sf_vfexpa_v_f16m8( +// CHECK-RV64-SAME: <vscale x 32 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 32 x half> @llvm.riscv.sf.vfexpa.nxv32f16.i64(<vscale x 32 x half> poison, 
<vscale x 32 x half> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 32 x half> [[TMP0]] +// +vfloat16m8_t test_sf_vfexpa_v_f16m8(vfloat16m8_t vs2, size_t vl) { + return __riscv_sf_vfexpa(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_sf_vfexpa_v_f32mf2( +// CHECK-RV64-SAME: <vscale x 1 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.sf.vfexpa.nxv1f32.i64(<vscale x 1 x float> poison, <vscale x 1 x float> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 1 x float> [[TMP0]] +// +vfloat32mf2_t test_sf_vfexpa_v_f32mf2(vfloat32mf2_t vs2, size_t vl) { + return __riscv_sf_vfexpa(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_sf_vfexpa_v_f32m1( +// CHECK-RV64-SAME: <vscale x 2 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.sf.vfexpa.nxv2f32.i64(<vscale x 2 x float> poison, <vscale x 2 x float> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 2 x float> [[TMP0]] +// +vfloat32m1_t test_sf_vfexpa_v_f32m1(vfloat32m1_t vs2, size_t vl) { + return __riscv_sf_vfexpa(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_sf_vfexpa_v_f32m2( +// CHECK-RV64-SAME: <vscale x 4 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.sf.vfexpa.nxv4f32.i64(<vscale x 4 x float> poison, <vscale x 4 x float> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 4 x float> [[TMP0]] +// +vfloat32m2_t test_sf_vfexpa_v_f32m2(vfloat32m2_t vs2, size_t vl) { + return __riscv_sf_vfexpa(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_sf_vfexpa_v_f32m4( +// CHECK-RV64-SAME: <vscale x 8 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.sf.vfexpa.nxv8f32.i64(<vscale x 8 x float> poison, <vscale x 8 x float> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 8 x float> [[TMP0]] +// +vfloat32m4_t test_sf_vfexpa_v_f32m4(vfloat32m4_t vs2, size_t vl) { + return __riscv_sf_vfexpa(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 16 x float> @test_sf_vfexpa_v_f32m8( +// CHECK-RV64-SAME: <vscale x 16 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x float> @llvm.riscv.sf.vfexpa.nxv16f32.i64(<vscale x 16 x float> poison, <vscale x 16 x float> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 16 x float> [[TMP0]] +// +vfloat32m8_t test_sf_vfexpa_v_f32m8(vfloat32m8_t vs2, size_t vl) { + return __riscv_sf_vfexpa(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x half> @test_sf_vfexpa_v_f16mf4_m( +// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x half> @llvm.riscv.sf.vfexpa.mask.nxv1f16.i64(<vscale x 1 x half> poison, <vscale x 1 x half> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret <vscale x 1 x half> [[TMP0]] +// +vfloat16mf4_t test_sf_vfexpa_v_f16mf4_m(vbool64_t vm, vfloat16mf4_t vs2, + size_t vl) { + return __riscv_sf_vfexpa(vm, vs2, vl); 
+} + +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x half> @test_sf_vfexpa_v_f16mf2_m( +// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x half> @llvm.riscv.sf.vfexpa.mask.nxv2f16.i64(<vscale x 2 x half> poison, <vscale x 2 x half> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret <vscale x 2 x half> [[TMP0]] +// +vfloat16mf2_t test_sf_vfexpa_v_f16mf2_m(vbool32_t vm, vfloat16mf2_t vs2, + size_t vl) { + return __riscv_sf_vfexpa(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x half> @test_sf_vfexpa_v_f16m1_m( +// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x half> @llvm.riscv.sf.vfexpa.mask.nxv4f16.i64(<vscale x 4 x half> poison, <vscale x 4 x half> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret <vscale x 4 x half> [[TMP0]] +// +vfloat16m1_t test_sf_vfexpa_v_f16m1_m(vbool16_t vm, vfloat16m1_t vs2, + size_t vl) { + return __riscv_sf_vfexpa(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x half> @test_sf_vfexpa_v_f16m2_m( +// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x half> @llvm.riscv.sf.vfexpa.mask.nxv8f16.i64(<vscale x 8 x half> poison, <vscale x 8 x half> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret <vscale x 8 x half> [[TMP0]] +// +vfloat16m2_t test_sf_vfexpa_v_f16m2_m(vbool8_t vm, vfloat16m2_t vs2, size_t vl) { + return __riscv_sf_vfexpa(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 16 x half> @test_sf_vfexpa_v_f16m4_m( +// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x half> @llvm.riscv.sf.vfexpa.mask.nxv16f16.i64(<vscale x 16 x half> poison, <vscale x 16 x half> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret <vscale x 16 x half> [[TMP0]] +// +vfloat16m4_t test_sf_vfexpa_v_f16m4_m(vbool4_t vm, vfloat16m4_t vs2, size_t vl) { + return __riscv_sf_vfexpa(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 32 x half> @test_sf_vfexpa_v_f16m8_m( +// CHECK-RV64-SAME: <vscale x 32 x i1> [[VM:%.*]], <vscale x 32 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 32 x half> @llvm.riscv.sf.vfexpa.mask.nxv32f16.i64(<vscale x 32 x half> poison, <vscale x 32 x half> [[VS2]], <vscale x 32 x i1> [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret <vscale x 32 x half> [[TMP0]] +// +vfloat16m8_t test_sf_vfexpa_v_f16m8_m(vbool2_t vm, vfloat16m8_t vs2, size_t vl) { + return __riscv_sf_vfexpa(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_sf_vfexpa_v_f32mf2_m( +// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.sf.vfexpa.mask.nxv1f32.i64(<vscale x 1 x float> poison, <vscale x 1 x float> [[VS2]], 
<vscale x 1 x i1> [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret <vscale x 1 x float> [[TMP0]] +// +vfloat32mf2_t test_sf_vfexpa_v_f32mf2_m(vbool64_t vm, vfloat32mf2_t vs2, + size_t vl) { + return __riscv_sf_vfexpa(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_sf_vfexpa_v_f32m1_m( +// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.sf.vfexpa.mask.nxv2f32.i64(<vscale x 2 x float> poison, <vscale x 2 x float> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret <vscale x 2 x float> [[TMP0]] +// +vfloat32m1_t test_sf_vfexpa_v_f32m1_m(vbool32_t vm, vfloat32m1_t vs2, + size_t vl) { + return __riscv_sf_vfexpa(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_sf_vfexpa_v_f32m2_m( +// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.sf.vfexpa.mask.nxv4f32.i64(<vscale x 4 x float> poison, <vscale x 4 x float> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret <vscale x 4 x float> [[TMP0]] +// +vfloat32m2_t test_sf_vfexpa_v_f32m2_m(vbool16_t vm, vfloat32m2_t vs2, + size_t vl) { + return __riscv_sf_vfexpa(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_sf_vfexpa_v_f32m4_m( +// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.sf.vfexpa.mask.nxv8f32.i64(<vscale x 8 x float> poison, <vscale x 8 x float> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret <vscale x 8 x float> [[TMP0]] +// +vfloat32m4_t test_sf_vfexpa_v_f32m4_m(vbool8_t vm, vfloat32m4_t vs2, size_t vl) { + return __riscv_sf_vfexpa(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 16 x float> @test_sf_vfexpa_v_f32m8_m( +// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x float> @llvm.riscv.sf.vfexpa.mask.nxv16f32.i64(<vscale x 16 x float> poison, <vscale x 16 x float> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret <vscale x 16 x float> [[TMP0]] +// +vfloat32m8_t test_sf_vfexpa_v_f32m8_m(vbool4_t vm, vfloat32m8_t vs2, size_t vl) { + return __riscv_sf_vfexpa(vm, vs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/overloaded/sf_vfexpa_v_64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/overloaded/sf_vfexpa_v_64.c new file mode 100644 index 0000000000000..165879a8bb589 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/overloaded/sf_vfexpa_v_64.c @@ -0,0 +1,90 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +xsfvfexpa64e \ +// RUN: -disable-O0-optnone -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include <sifive_vector.h> + +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x 
double> @test_sf_vfexpa_v_f64m1( +// CHECK-RV64-SAME: <vscale x 1 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x double> @llvm.riscv.sf.vfexpa.nxv1f64.i64(<vscale x 1 x double> poison, <vscale x 1 x double> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 1 x double> [[TMP0]] +// +vfloat64m1_t test_sf_vfexpa_v_f64m1(vfloat64m1_t vs2, size_t vl) { + return __riscv_sf_vfexpa(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x double> @test_sf_vfexpa_v_f64m2( +// CHECK-RV64-SAME: <vscale x 2 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x double> @llvm.riscv.sf.vfexpa.nxv2f64.i64(<vscale x 2 x double> poison, <vscale x 2 x double> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 2 x double> [[TMP0]] +// +vfloat64m2_t test_sf_vfexpa_v_f64m2(vfloat64m2_t vs2, size_t vl) { + return __riscv_sf_vfexpa(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x double> @test_sf_vfexpa_v_f64m4( +// CHECK-RV64-SAME: <vscale x 4 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x double> @llvm.riscv.sf.vfexpa.nxv4f64.i64(<vscale x 4 x double> poison, <vscale x 4 x double> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 4 x double> [[TMP0]] +// +vfloat64m4_t test_sf_vfexpa_v_f64m4(vfloat64m4_t vs2, size_t vl) { + return __riscv_sf_vfexpa(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x double> @test_sf_vfexpa_v_f64m8( +// CHECK-RV64-SAME: <vscale x 8 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x double> @llvm.riscv.sf.vfexpa.nxv8f64.i64(<vscale x 8 x double> poison, <vscale x 8 x double> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 8 x double> [[TMP0]] +// +vfloat64m8_t test_sf_vfexpa_v_f64m8(vfloat64m8_t vs2, size_t vl) { + return __riscv_sf_vfexpa(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x double> @test_sf_vfexpa_v_f64m1_m( +// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x double> @llvm.riscv.sf.vfexpa.mask.nxv1f64.i64(<vscale x 1 x double> poison, <vscale x 1 x double> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret <vscale x 1 x double> [[TMP0]] +// +vfloat64m1_t test_sf_vfexpa_v_f64m1_m(vbool64_t vm, vfloat64m1_t vs2, + size_t vl) { + return __riscv_sf_vfexpa(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x double> @test_sf_vfexpa_v_f64m2_m( +// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x double> @llvm.riscv.sf.vfexpa.mask.nxv2f64.i64(<vscale x 2 x double> poison, <vscale x 2 x double> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret <vscale x 2 x double> [[TMP0]] +// +vfloat64m2_t test_sf_vfexpa_v_f64m2_m(vbool32_t vm, vfloat64m2_t vs2, + size_t vl) { + return __riscv_sf_vfexpa(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x double> @test_sf_vfexpa_v_f64m4_m( +// CHECK-RV64-SAME: <vscale x 4 x i1> 
[[VM:%.*]], <vscale x 4 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x double> @llvm.riscv.sf.vfexpa.mask.nxv4f64.i64(<vscale x 4 x double> poison, <vscale x 4 x double> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret <vscale x 4 x double> [[TMP0]] +// +vfloat64m4_t test_sf_vfexpa_v_f64m4_m(vbool16_t vm, vfloat64m4_t vs2, + size_t vl) { + return __riscv_sf_vfexpa(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x double> @test_sf_vfexpa_v_f64m8_m( +// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x double> @llvm.riscv.sf.vfexpa.mask.nxv8f64.i64(<vscale x 8 x double> poison, <vscale x 8 x double> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret <vscale x 8 x double> [[TMP0]] +// +vfloat64m8_t test_sf_vfexpa_v_f64m8_m(vbool8_t vm, vfloat64m8_t vs2, size_t vl) { + return __riscv_sf_vfexpa(vm, vs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/non-overloaded/sf_vfexp_v_16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/non-overloaded/sf_vfexp_v_16.c new file mode 100644 index 0000000000000..aed6d87a4b18a --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/non-overloaded/sf_vfexp_v_16.c @@ -0,0 +1,248 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x -target-feature +zvfh \ +// RUN: -target-feature +xsfvfexp16e -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include <sifive_vector.h> + +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x half> @test_sf_vfexp_v_f16mf4_tu( +// CHECK-RV64-SAME: <vscale x 1 x half> [[VD:%.*]], <vscale x 1 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x half> @llvm.riscv.sf.vfexp.nxv1f16.i64(<vscale x 1 x half> [[VD]], <vscale x 1 x half> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 1 x half> [[TMP0]] +// +vfloat16mf4_t test_sf_vfexp_v_f16mf4_tu(vfloat16mf4_t vd, vfloat16mf4_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f16mf4_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x half> @test_sf_vfexp_v_f16mf2_tu( +// CHECK-RV64-SAME: <vscale x 2 x half> [[VD:%.*]], <vscale x 2 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x half> @llvm.riscv.sf.vfexp.nxv2f16.i64(<vscale x 2 x half> [[VD]], <vscale x 2 x half> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 2 x half> [[TMP0]] +// +vfloat16mf2_t test_sf_vfexp_v_f16mf2_tu(vfloat16mf2_t vd, vfloat16mf2_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f16mf2_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x half> @test_sf_vfexp_v_f16m1_tu( +// CHECK-RV64-SAME: <vscale x 4 x half> [[VD:%.*]], <vscale x 4 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x half> @llvm.riscv.sf.vfexp.nxv4f16.i64(<vscale x 4 x half> [[VD]], <vscale x 4 x half> 
[[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 4 x half> [[TMP0]] +// +vfloat16m1_t test_sf_vfexp_v_f16m1_tu(vfloat16m1_t vd, vfloat16m1_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f16m1_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x half> @test_sf_vfexp_v_f16m2_tu( +// CHECK-RV64-SAME: <vscale x 8 x half> [[VD:%.*]], <vscale x 8 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x half> @llvm.riscv.sf.vfexp.nxv8f16.i64(<vscale x 8 x half> [[VD]], <vscale x 8 x half> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 8 x half> [[TMP0]] +// +vfloat16m2_t test_sf_vfexp_v_f16m2_tu(vfloat16m2_t vd, vfloat16m2_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f16m2_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 16 x half> @test_sf_vfexp_v_f16m4_tu( +// CHECK-RV64-SAME: <vscale x 16 x half> [[VD:%.*]], <vscale x 16 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x half> @llvm.riscv.sf.vfexp.nxv16f16.i64(<vscale x 16 x half> [[VD]], <vscale x 16 x half> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 16 x half> [[TMP0]] +// +vfloat16m4_t test_sf_vfexp_v_f16m4_tu(vfloat16m4_t vd, vfloat16m4_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f16m4_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 32 x half> @test_sf_vfexp_v_f16m8_tu( +// CHECK-RV64-SAME: <vscale x 32 x half> [[VD:%.*]], <vscale x 32 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 32 x half> @llvm.riscv.sf.vfexp.nxv32f16.i64(<vscale x 32 x half> [[VD]], <vscale x 32 x half> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 32 x half> [[TMP0]] +// +vfloat16m8_t test_sf_vfexp_v_f16m8_tu(vfloat16m8_t vd, vfloat16m8_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f16m8_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x half> @test_sf_vfexp_v_f16mf4_tum( +// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x half> [[VD:%.*]], <vscale x 1 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x half> @llvm.riscv.sf.vfexp.mask.nxv1f16.i64(<vscale x 1 x half> [[VD]], <vscale x 1 x half> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret <vscale x 1 x half> [[TMP0]] +// +vfloat16mf4_t test_sf_vfexp_v_f16mf4_tum(vbool64_t vm, vfloat16mf4_t vd, vfloat16mf4_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f16mf4_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x half> @test_sf_vfexp_v_f16mf2_tum( +// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x half> [[VD:%.*]], <vscale x 2 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x half> @llvm.riscv.sf.vfexp.mask.nxv2f16.i64(<vscale x 2 x half> [[VD]], <vscale x 2 x half> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret <vscale x 2 x half> [[TMP0]] +// +vfloat16mf2_t test_sf_vfexp_v_f16mf2_tum(vbool32_t vm, vfloat16mf2_t vd, vfloat16mf2_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f16mf2_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x half> @test_sf_vfexp_v_f16m1_tum( 
+// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x half> [[VD:%.*]], <vscale x 4 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x half> @llvm.riscv.sf.vfexp.mask.nxv4f16.i64(<vscale x 4 x half> [[VD]], <vscale x 4 x half> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret <vscale x 4 x half> [[TMP0]] +// +vfloat16m1_t test_sf_vfexp_v_f16m1_tum(vbool16_t vm, vfloat16m1_t vd, vfloat16m1_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f16m1_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x half> @test_sf_vfexp_v_f16m2_tum( +// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x half> [[VD:%.*]], <vscale x 8 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x half> @llvm.riscv.sf.vfexp.mask.nxv8f16.i64(<vscale x 8 x half> [[VD]], <vscale x 8 x half> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret <vscale x 8 x half> [[TMP0]] +// +vfloat16m2_t test_sf_vfexp_v_f16m2_tum(vbool8_t vm, vfloat16m2_t vd, vfloat16m2_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f16m2_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 16 x half> @test_sf_vfexp_v_f16m4_tum( +// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x half> [[VD:%.*]], <vscale x 16 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x half> @llvm.riscv.sf.vfexp.mask.nxv16f16.i64(<vscale x 16 x half> [[VD]], <vscale x 16 x half> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret <vscale x 16 x half> [[TMP0]] +// +vfloat16m4_t test_sf_vfexp_v_f16m4_tum(vbool4_t vm, vfloat16m4_t vd, vfloat16m4_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f16m4_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 32 x half> @test_sf_vfexp_v_f16m8_tum( +// CHECK-RV64-SAME: <vscale x 32 x i1> [[VM:%.*]], <vscale x 32 x half> [[VD:%.*]], <vscale x 32 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 32 x half> @llvm.riscv.sf.vfexp.mask.nxv32f16.i64(<vscale x 32 x half> [[VD]], <vscale x 32 x half> [[VS2]], <vscale x 32 x i1> [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret <vscale x 32 x half> [[TMP0]] +// +vfloat16m8_t test_sf_vfexp_v_f16m8_tum(vbool2_t vm, vfloat16m8_t vd, vfloat16m8_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f16m8_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x half> @test_sf_vfexp_v_f16mf4_tumu( +// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x half> [[VD:%.*]], <vscale x 1 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x half> @llvm.riscv.sf.vfexp.mask.nxv1f16.i64(<vscale x 1 x half> [[VD]], <vscale x 1 x half> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret <vscale x 1 x half> [[TMP0]] +// +vfloat16mf4_t test_sf_vfexp_v_f16mf4_tumu(vbool64_t vm, vfloat16mf4_t vd, vfloat16mf4_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f16mf4_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x half> @test_sf_vfexp_v_f16mf2_tumu( +// CHECK-RV64-SAME: <vscale x 2 x i1> 
[[VM:%.*]], <vscale x 2 x half> [[VD:%.*]], <vscale x 2 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x half> @llvm.riscv.sf.vfexp.mask.nxv2f16.i64(<vscale x 2 x half> [[VD]], <vscale x 2 x half> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret <vscale x 2 x half> [[TMP0]] +// +vfloat16mf2_t test_sf_vfexp_v_f16mf2_tumu(vbool32_t vm, vfloat16mf2_t vd, vfloat16mf2_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f16mf2_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x half> @test_sf_vfexp_v_f16m1_tumu( +// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x half> [[VD:%.*]], <vscale x 4 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x half> @llvm.riscv.sf.vfexp.mask.nxv4f16.i64(<vscale x 4 x half> [[VD]], <vscale x 4 x half> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret <vscale x 4 x half> [[TMP0]] +// +vfloat16m1_t test_sf_vfexp_v_f16m1_tumu(vbool16_t vm, vfloat16m1_t vd, vfloat16m1_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f16m1_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x half> @test_sf_vfexp_v_f16m2_tumu( +// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x half> [[VD:%.*]], <vscale x 8 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x half> @llvm.riscv.sf.vfexp.mask.nxv8f16.i64(<vscale x 8 x half> [[VD]], <vscale x 8 x half> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret <vscale x 8 x half> [[TMP0]] +// +vfloat16m2_t test_sf_vfexp_v_f16m2_tumu(vbool8_t vm, vfloat16m2_t vd, vfloat16m2_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f16m2_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 16 x half> @test_sf_vfexp_v_f16m4_tumu( +// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x half> [[VD:%.*]], <vscale x 16 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x half> @llvm.riscv.sf.vfexp.mask.nxv16f16.i64(<vscale x 16 x half> [[VD]], <vscale x 16 x half> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret <vscale x 16 x half> [[TMP0]] +// +vfloat16m4_t test_sf_vfexp_v_f16m4_tumu(vbool4_t vm, vfloat16m4_t vd, vfloat16m4_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f16m4_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 32 x half> @test_sf_vfexp_v_f16m8_tumu( +// CHECK-RV64-SAME: <vscale x 32 x i1> [[VM:%.*]], <vscale x 32 x half> [[VD:%.*]], <vscale x 32 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 32 x half> @llvm.riscv.sf.vfexp.mask.nxv32f16.i64(<vscale x 32 x half> [[VD]], <vscale x 32 x half> [[VS2]], <vscale x 32 x i1> [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret <vscale x 32 x half> [[TMP0]] +// +vfloat16m8_t test_sf_vfexp_v_f16m8_tumu(vbool2_t vm, vfloat16m8_t vd, vfloat16m8_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f16m8_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x half> @test_sf_vfexp_v_f16mf4_mu( +// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x half> 
[[VD:%.*]], <vscale x 1 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x half> @llvm.riscv.sf.vfexp.mask.nxv1f16.i64(<vscale x 1 x half> [[VD]], <vscale x 1 x half> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret <vscale x 1 x half> [[TMP0]] +// +vfloat16mf4_t test_sf_vfexp_v_f16mf4_mu(vbool64_t vm, vfloat16mf4_t vd, vfloat16mf4_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f16mf4_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x half> @test_sf_vfexp_v_f16mf2_mu( +// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x half> [[VD:%.*]], <vscale x 2 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x half> @llvm.riscv.sf.vfexp.mask.nxv2f16.i64(<vscale x 2 x half> [[VD]], <vscale x 2 x half> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret <vscale x 2 x half> [[TMP0]] +// +vfloat16mf2_t test_sf_vfexp_v_f16mf2_mu(vbool32_t vm, vfloat16mf2_t vd, vfloat16mf2_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f16mf2_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x half> @test_sf_vfexp_v_f16m1_mu( +// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x half> [[VD:%.*]], <vscale x 4 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x half> @llvm.riscv.sf.vfexp.mask.nxv4f16.i64(<vscale x 4 x half> [[VD]], <vscale x 4 x half> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret <vscale x 4 x half> [[TMP0]] +// +vfloat16m1_t test_sf_vfexp_v_f16m1_mu(vbool16_t vm, vfloat16m1_t vd, vfloat16m1_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f16m1_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x half> @test_sf_vfexp_v_f16m2_mu( +// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x half> [[VD:%.*]], <vscale x 8 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x half> @llvm.riscv.sf.vfexp.mask.nxv8f16.i64(<vscale x 8 x half> [[VD]], <vscale x 8 x half> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret <vscale x 8 x half> [[TMP0]] +// +vfloat16m2_t test_sf_vfexp_v_f16m2_mu(vbool8_t vm, vfloat16m2_t vd, vfloat16m2_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f16m2_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 16 x half> @test_sf_vfexp_v_f16m4_mu( +// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x half> [[VD:%.*]], <vscale x 16 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x half> @llvm.riscv.sf.vfexp.mask.nxv16f16.i64(<vscale x 16 x half> [[VD]], <vscale x 16 x half> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret <vscale x 16 x half> [[TMP0]] +// +vfloat16m4_t test_sf_vfexp_v_f16m4_mu(vbool4_t vm, vfloat16m4_t vd, vfloat16m4_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f16m4_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 32 x half> @test_sf_vfexp_v_f16m8_mu( +// CHECK-RV64-SAME: <vscale x 32 x i1> [[VM:%.*]], <vscale x 32 x half> [[VD:%.*]], <vscale x 32 x half> [[VS2:%.*]], i64 noundef 
[[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 32 x half> @llvm.riscv.sf.vfexp.mask.nxv32f16.i64(<vscale x 32 x half> [[VD]], <vscale x 32 x half> [[VS2]], <vscale x 32 x i1> [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret <vscale x 32 x half> [[TMP0]] +// +vfloat16m8_t test_sf_vfexp_v_f16m8_mu(vbool2_t vm, vfloat16m8_t vd, vfloat16m8_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f16m8_mu(vm, vd, vs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/non-overloaded/sf_vfexp_v_32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/non-overloaded/sf_vfexp_v_32.c new file mode 100644 index 0000000000000..374f324cc0808 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/non-overloaded/sf_vfexp_v_32.c @@ -0,0 +1,208 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x -target-feature +zve32f \ +// RUN: -target-feature +xsfvfexp32e -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include <sifive_vector.h> + +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_sf_vfexp_v_f32mf2_tu( +// CHECK-RV64-SAME: <vscale x 1 x float> [[VD:%.*]], <vscale x 1 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.sf.vfexp.nxv1f32.i64(<vscale x 1 x float> [[VD]], <vscale x 1 x float> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 1 x float> [[TMP0]] +// +vfloat32mf2_t test_sf_vfexp_v_f32mf2_tu(vfloat32mf2_t vd, vfloat32mf2_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f32mf2_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_sf_vfexp_v_f32m1_tu( +// CHECK-RV64-SAME: <vscale x 2 x float> [[VD:%.*]], <vscale x 2 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.sf.vfexp.nxv2f32.i64(<vscale x 2 x float> [[VD]], <vscale x 2 x float> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 2 x float> [[TMP0]] +// +vfloat32m1_t test_sf_vfexp_v_f32m1_tu(vfloat32m1_t vd, vfloat32m1_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f32m1_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_sf_vfexp_v_f32m2_tu( +// CHECK-RV64-SAME: <vscale x 4 x float> [[VD:%.*]], <vscale x 4 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.sf.vfexp.nxv4f32.i64(<vscale x 4 x float> [[VD]], <vscale x 4 x float> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 4 x float> [[TMP0]] +// +vfloat32m2_t test_sf_vfexp_v_f32m2_tu(vfloat32m2_t vd, vfloat32m2_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f32m2_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_sf_vfexp_v_f32m4_tu( +// CHECK-RV64-SAME: <vscale x 8 x float> [[VD:%.*]], <vscale x 8 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.sf.vfexp.nxv8f32.i64(<vscale x 8 x float> [[VD]], <vscale x 8 x float> [[VS2]], i64 [[VL]]) +// 
CHECK-RV64-NEXT: ret <vscale x 8 x float> [[TMP0]] +// +vfloat32m4_t test_sf_vfexp_v_f32m4_tu(vfloat32m4_t vd, vfloat32m4_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f32m4_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 16 x float> @test_sf_vfexp_v_f32m8_tu( +// CHECK-RV64-SAME: <vscale x 16 x float> [[VD:%.*]], <vscale x 16 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x float> @llvm.riscv.sf.vfexp.nxv16f32.i64(<vscale x 16 x float> [[VD]], <vscale x 16 x float> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 16 x float> [[TMP0]] +// +vfloat32m8_t test_sf_vfexp_v_f32m8_tu(vfloat32m8_t vd, vfloat32m8_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f32m8_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_sf_vfexp_v_f32mf2_tum( +// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x float> [[VD:%.*]], <vscale x 1 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.sf.vfexp.mask.nxv1f32.i64(<vscale x 1 x float> [[VD]], <vscale x 1 x float> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret <vscale x 1 x float> [[TMP0]] +// +vfloat32mf2_t test_sf_vfexp_v_f32mf2_tum(vbool64_t vm, vfloat32mf2_t vd, vfloat32mf2_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f32mf2_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_sf_vfexp_v_f32m1_tum( +// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x float> [[VD:%.*]], <vscale x 2 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.sf.vfexp.mask.nxv2f32.i64(<vscale x 2 x float> [[VD]], <vscale x 2 x float> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret <vscale x 2 x float> [[TMP0]] +// +vfloat32m1_t test_sf_vfexp_v_f32m1_tum(vbool32_t vm, vfloat32m1_t vd, vfloat32m1_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f32m1_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_sf_vfexp_v_f32m2_tum( +// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x float> [[VD:%.*]], <vscale x 4 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.sf.vfexp.mask.nxv4f32.i64(<vscale x 4 x float> [[VD]], <vscale x 4 x float> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret <vscale x 4 x float> [[TMP0]] +// +vfloat32m2_t test_sf_vfexp_v_f32m2_tum(vbool16_t vm, vfloat32m2_t vd, vfloat32m2_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f32m2_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_sf_vfexp_v_f32m4_tum( +// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x float> [[VD:%.*]], <vscale x 8 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.sf.vfexp.mask.nxv8f32.i64(<vscale x 8 x float> [[VD]], <vscale x 8 x float> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret <vscale x 8 x float> [[TMP0]] +// +vfloat32m4_t test_sf_vfexp_v_f32m4_tum(vbool8_t vm, vfloat32m4_t vd, 
vfloat32m4_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f32m4_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 16 x float> @test_sf_vfexp_v_f32m8_tum( +// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x float> [[VD:%.*]], <vscale x 16 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x float> @llvm.riscv.sf.vfexp.mask.nxv16f32.i64(<vscale x 16 x float> [[VD]], <vscale x 16 x float> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret <vscale x 16 x float> [[TMP0]] +// +vfloat32m8_t test_sf_vfexp_v_f32m8_tum(vbool4_t vm, vfloat32m8_t vd, vfloat32m8_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f32m8_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_sf_vfexp_v_f32mf2_tumu( +// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x float> [[VD:%.*]], <vscale x 1 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.sf.vfexp.mask.nxv1f32.i64(<vscale x 1 x float> [[VD]], <vscale x 1 x float> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret <vscale x 1 x float> [[TMP0]] +// +vfloat32mf2_t test_sf_vfexp_v_f32mf2_tumu(vbool64_t vm, vfloat32mf2_t vd, vfloat32mf2_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f32mf2_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_sf_vfexp_v_f32m1_tumu( +// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x float> [[VD:%.*]], <vscale x 2 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.sf.vfexp.mask.nxv2f32.i64(<vscale x 2 x float> [[VD]], <vscale x 2 x float> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret <vscale x 2 x float> [[TMP0]] +// +vfloat32m1_t test_sf_vfexp_v_f32m1_tumu(vbool32_t vm, vfloat32m1_t vd, vfloat32m1_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f32m1_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_sf_vfexp_v_f32m2_tumu( +// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x float> [[VD:%.*]], <vscale x 4 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.sf.vfexp.mask.nxv4f32.i64(<vscale x 4 x float> [[VD]], <vscale x 4 x float> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret <vscale x 4 x float> [[TMP0]] +// +vfloat32m2_t test_sf_vfexp_v_f32m2_tumu(vbool16_t vm, vfloat32m2_t vd, vfloat32m2_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f32m2_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_sf_vfexp_v_f32m4_tumu( +// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x float> [[VD:%.*]], <vscale x 8 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.sf.vfexp.mask.nxv8f32.i64(<vscale x 8 x float> [[VD]], <vscale x 8 x float> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret <vscale x 8 x float> [[TMP0]] +// +vfloat32m4_t test_sf_vfexp_v_f32m4_tumu(vbool8_t vm, vfloat32m4_t vd, 
vfloat32m4_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f32m4_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 16 x float> @test_sf_vfexp_v_f32m8_tumu( +// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x float> [[VD:%.*]], <vscale x 16 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x float> @llvm.riscv.sf.vfexp.mask.nxv16f32.i64(<vscale x 16 x float> [[VD]], <vscale x 16 x float> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret <vscale x 16 x float> [[TMP0]] +// +vfloat32m8_t test_sf_vfexp_v_f32m8_tumu(vbool4_t vm, vfloat32m8_t vd, vfloat32m8_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f32m8_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_sf_vfexp_v_f32mf2_mu( +// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x float> [[VD:%.*]], <vscale x 1 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.sf.vfexp.mask.nxv1f32.i64(<vscale x 1 x float> [[VD]], <vscale x 1 x float> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret <vscale x 1 x float> [[TMP0]] +// +vfloat32mf2_t test_sf_vfexp_v_f32mf2_mu(vbool64_t vm, vfloat32mf2_t vd, vfloat32mf2_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f32mf2_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_sf_vfexp_v_f32m1_mu( +// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x float> [[VD:%.*]], <vscale x 2 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.sf.vfexp.mask.nxv2f32.i64(<vscale x 2 x float> [[VD]], <vscale x 2 x float> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret <vscale x 2 x float> [[TMP0]] +// +vfloat32m1_t test_sf_vfexp_v_f32m1_mu(vbool32_t vm, vfloat32m1_t vd, vfloat32m1_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f32m1_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_sf_vfexp_v_f32m2_mu( +// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x float> [[VD:%.*]], <vscale x 4 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.sf.vfexp.mask.nxv4f32.i64(<vscale x 4 x float> [[VD]], <vscale x 4 x float> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret <vscale x 4 x float> [[TMP0]] +// +vfloat32m2_t test_sf_vfexp_v_f32m2_mu(vbool16_t vm, vfloat32m2_t vd, vfloat32m2_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f32m2_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_sf_vfexp_v_f32m4_mu( +// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x float> [[VD:%.*]], <vscale x 8 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.sf.vfexp.mask.nxv8f32.i64(<vscale x 8 x float> [[VD]], <vscale x 8 x float> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret <vscale x 8 x float> [[TMP0]] +// +vfloat32m4_t test_sf_vfexp_v_f32m4_mu(vbool8_t vm, vfloat32m4_t vd, vfloat32m4_t vs2, size_t 
vl) { + return __riscv_sf_vfexp_v_f32m4_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 16 x float> @test_sf_vfexp_v_f32m8_mu( +// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x float> [[VD:%.*]], <vscale x 16 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x float> @llvm.riscv.sf.vfexp.mask.nxv16f32.i64(<vscale x 16 x float> [[VD]], <vscale x 16 x float> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret <vscale x 16 x float> [[TMP0]] +// +vfloat32m8_t test_sf_vfexp_v_f32m8_mu(vbool4_t vm, vfloat32m8_t vd, vfloat32m8_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f32m8_mu(vm, vd, vs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/non-overloaded/sf_vfexp_v_bf.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/non-overloaded/sf_vfexp_v_bf.c new file mode 100644 index 0000000000000..aec0b9f934ab9 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/non-overloaded/sf_vfexp_v_bf.c @@ -0,0 +1,248 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x -target-feature +zve32f \ +// RUN: -target-feature +zvfbfmin -target-feature +xsfvfbfexp16e -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include <sifive_vector.h> + +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x bfloat> @test_sf_vfexp_v_bf16mf4_tu( +// CHECK-RV64-SAME: <vscale x 1 x bfloat> [[VD:%.*]], <vscale x 1 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x bfloat> @llvm.riscv.sf.vfexp.nxv1bf16.i64(<vscale x 1 x bfloat> [[VD]], <vscale x 1 x bfloat> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 1 x bfloat> [[TMP0]] +// +vbfloat16mf4_t test_sf_vfexp_v_bf16mf4_tu(vbfloat16mf4_t vd, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_bf16mf4_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x bfloat> @test_sf_vfexp_v_bf16mf2_tu( +// CHECK-RV64-SAME: <vscale x 2 x bfloat> [[VD:%.*]], <vscale x 2 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x bfloat> @llvm.riscv.sf.vfexp.nxv2bf16.i64(<vscale x 2 x bfloat> [[VD]], <vscale x 2 x bfloat> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 2 x bfloat> [[TMP0]] +// +vbfloat16mf2_t test_sf_vfexp_v_bf16mf2_tu(vbfloat16mf2_t vd, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_bf16mf2_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x bfloat> @test_sf_vfexp_v_bf16m1_tu( +// CHECK-RV64-SAME: <vscale x 4 x bfloat> [[VD:%.*]], <vscale x 4 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x bfloat> @llvm.riscv.sf.vfexp.nxv4bf16.i64(<vscale x 4 x bfloat> [[VD]], <vscale x 4 x bfloat> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 4 x bfloat> [[TMP0]] +// +vbfloat16m1_t test_sf_vfexp_v_bf16m1_tu(vbfloat16m1_t vd, vbfloat16m1_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_bf16m1_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x bfloat> 
@test_sf_vfexp_v_bf16m2_tu( +// CHECK-RV64-SAME: <vscale x 8 x bfloat> [[VD:%.*]], <vscale x 8 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x bfloat> @llvm.riscv.sf.vfexp.nxv8bf16.i64(<vscale x 8 x bfloat> [[VD]], <vscale x 8 x bfloat> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 8 x bfloat> [[TMP0]] +// +vbfloat16m2_t test_sf_vfexp_v_bf16m2_tu(vbfloat16m2_t vd, vbfloat16m2_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_bf16m2_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 16 x bfloat> @test_sf_vfexp_v_bf16m4_tu( +// CHECK-RV64-SAME: <vscale x 16 x bfloat> [[VD:%.*]], <vscale x 16 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x bfloat> @llvm.riscv.sf.vfexp.nxv16bf16.i64(<vscale x 16 x bfloat> [[VD]], <vscale x 16 x bfloat> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 16 x bfloat> [[TMP0]] +// +vbfloat16m4_t test_sf_vfexp_v_bf16m4_tu(vbfloat16m4_t vd, vbfloat16m4_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_bf16m4_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 32 x bfloat> @test_sf_vfexp_v_bf16m8_tu( +// CHECK-RV64-SAME: <vscale x 32 x bfloat> [[VD:%.*]], <vscale x 32 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 32 x bfloat> @llvm.riscv.sf.vfexp.nxv32bf16.i64(<vscale x 32 x bfloat> [[VD]], <vscale x 32 x bfloat> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 32 x bfloat> [[TMP0]] +// +vbfloat16m8_t test_sf_vfexp_v_bf16m8_tu(vbfloat16m8_t vd, vbfloat16m8_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_bf16m8_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x bfloat> @test_sf_vfexp_v_bf16mf4_tum( +// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x bfloat> [[VD:%.*]], <vscale x 1 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv1bf16.i64(<vscale x 1 x bfloat> [[VD]], <vscale x 1 x bfloat> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret <vscale x 1 x bfloat> [[TMP0]] +// +vbfloat16mf4_t test_sf_vfexp_v_bf16mf4_tum(vbool64_t vm, vbfloat16mf4_t vd, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_bf16mf4_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x bfloat> @test_sf_vfexp_v_bf16mf2_tum( +// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x bfloat> [[VD:%.*]], <vscale x 2 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv2bf16.i64(<vscale x 2 x bfloat> [[VD]], <vscale x 2 x bfloat> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret <vscale x 2 x bfloat> [[TMP0]] +// +vbfloat16mf2_t test_sf_vfexp_v_bf16mf2_tum(vbool32_t vm, vbfloat16mf2_t vd, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_bf16mf2_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x bfloat> @test_sf_vfexp_v_bf16m1_tum( +// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x bfloat> [[VD:%.*]], <vscale x 4 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// 
CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv4bf16.i64(<vscale x 4 x bfloat> [[VD]], <vscale x 4 x bfloat> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret <vscale x 4 x bfloat> [[TMP0]] +// +vbfloat16m1_t test_sf_vfexp_v_bf16m1_tum(vbool16_t vm, vbfloat16m1_t vd, vbfloat16m1_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_bf16m1_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x bfloat> @test_sf_vfexp_v_bf16m2_tum( +// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x bfloat> [[VD:%.*]], <vscale x 8 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv8bf16.i64(<vscale x 8 x bfloat> [[VD]], <vscale x 8 x bfloat> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret <vscale x 8 x bfloat> [[TMP0]] +// +vbfloat16m2_t test_sf_vfexp_v_bf16m2_tum(vbool8_t vm, vbfloat16m2_t vd, vbfloat16m2_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_bf16m2_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 16 x bfloat> @test_sf_vfexp_v_bf16m4_tum( +// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x bfloat> [[VD:%.*]], <vscale x 16 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv16bf16.i64(<vscale x 16 x bfloat> [[VD]], <vscale x 16 x bfloat> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret <vscale x 16 x bfloat> [[TMP0]] +// +vbfloat16m4_t test_sf_vfexp_v_bf16m4_tum(vbool4_t vm, vbfloat16m4_t vd, vbfloat16m4_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_bf16m4_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 32 x bfloat> @test_sf_vfexp_v_bf16m8_tum( +// CHECK-RV64-SAME: <vscale x 32 x i1> [[VM:%.*]], <vscale x 32 x bfloat> [[VD:%.*]], <vscale x 32 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 32 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv32bf16.i64(<vscale x 32 x bfloat> [[VD]], <vscale x 32 x bfloat> [[VS2]], <vscale x 32 x i1> [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret <vscale x 32 x bfloat> [[TMP0]] +// +vbfloat16m8_t test_sf_vfexp_v_bf16m8_tum(vbool2_t vm, vbfloat16m8_t vd, vbfloat16m8_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_bf16m8_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x bfloat> @test_sf_vfexp_v_bf16mf4_tumu( +// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x bfloat> [[VD:%.*]], <vscale x 1 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv1bf16.i64(<vscale x 1 x bfloat> [[VD]], <vscale x 1 x bfloat> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret <vscale x 1 x bfloat> [[TMP0]] +// +vbfloat16mf4_t test_sf_vfexp_v_bf16mf4_tumu(vbool64_t vm, vbfloat16mf4_t vd, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_bf16mf4_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x bfloat> @test_sf_vfexp_v_bf16mf2_tumu( +// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x bfloat> [[VD:%.*]], <vscale x 2 
x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv2bf16.i64(<vscale x 2 x bfloat> [[VD]], <vscale x 2 x bfloat> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret <vscale x 2 x bfloat> [[TMP0]] +// +vbfloat16mf2_t test_sf_vfexp_v_bf16mf2_tumu(vbool32_t vm, vbfloat16mf2_t vd, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_bf16mf2_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x bfloat> @test_sf_vfexp_v_bf16m1_tumu( +// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x bfloat> [[VD:%.*]], <vscale x 4 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv4bf16.i64(<vscale x 4 x bfloat> [[VD]], <vscale x 4 x bfloat> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret <vscale x 4 x bfloat> [[TMP0]] +// +vbfloat16m1_t test_sf_vfexp_v_bf16m1_tumu(vbool16_t vm, vbfloat16m1_t vd, vbfloat16m1_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_bf16m1_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x bfloat> @test_sf_vfexp_v_bf16m2_tumu( +// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x bfloat> [[VD:%.*]], <vscale x 8 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv8bf16.i64(<vscale x 8 x bfloat> [[VD]], <vscale x 8 x bfloat> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret <vscale x 8 x bfloat> [[TMP0]] +// +vbfloat16m2_t test_sf_vfexp_v_bf16m2_tumu(vbool8_t vm, vbfloat16m2_t vd, vbfloat16m2_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_bf16m2_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 16 x bfloat> @test_sf_vfexp_v_bf16m4_tumu( +// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x bfloat> [[VD:%.*]], <vscale x 16 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv16bf16.i64(<vscale x 16 x bfloat> [[VD]], <vscale x 16 x bfloat> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret <vscale x 16 x bfloat> [[TMP0]] +// +vbfloat16m4_t test_sf_vfexp_v_bf16m4_tumu(vbool4_t vm, vbfloat16m4_t vd, vbfloat16m4_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_bf16m4_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 32 x bfloat> @test_sf_vfexp_v_bf16m8_tumu( +// CHECK-RV64-SAME: <vscale x 32 x i1> [[VM:%.*]], <vscale x 32 x bfloat> [[VD:%.*]], <vscale x 32 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 32 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv32bf16.i64(<vscale x 32 x bfloat> [[VD]], <vscale x 32 x bfloat> [[VS2]], <vscale x 32 x i1> [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret <vscale x 32 x bfloat> [[TMP0]] +// +vbfloat16m8_t test_sf_vfexp_v_bf16m8_tumu(vbool2_t vm, vbfloat16m8_t vd, vbfloat16m8_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_bf16m8_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x bfloat> @test_sf_vfexp_v_bf16mf4_mu( +// CHECK-RV64-SAME: 
<vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x bfloat> [[VD:%.*]], <vscale x 1 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv1bf16.i64(<vscale x 1 x bfloat> [[VD]], <vscale x 1 x bfloat> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret <vscale x 1 x bfloat> [[TMP0]] +// +vbfloat16mf4_t test_sf_vfexp_v_bf16mf4_mu(vbool64_t vm, vbfloat16mf4_t vd, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_bf16mf4_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x bfloat> @test_sf_vfexp_v_bf16mf2_mu( +// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x bfloat> [[VD:%.*]], <vscale x 2 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv2bf16.i64(<vscale x 2 x bfloat> [[VD]], <vscale x 2 x bfloat> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret <vscale x 2 x bfloat> [[TMP0]] +// +vbfloat16mf2_t test_sf_vfexp_v_bf16mf2_mu(vbool32_t vm, vbfloat16mf2_t vd, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_bf16mf2_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x bfloat> @test_sf_vfexp_v_bf16m1_mu( +// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x bfloat> [[VD:%.*]], <vscale x 4 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv4bf16.i64(<vscale x 4 x bfloat> [[VD]], <vscale x 4 x bfloat> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret <vscale x 4 x bfloat> [[TMP0]] +// +vbfloat16m1_t test_sf_vfexp_v_bf16m1_mu(vbool16_t vm, vbfloat16m1_t vd, vbfloat16m1_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_bf16m1_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x bfloat> @test_sf_vfexp_v_bf16m2_mu( +// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x bfloat> [[VD:%.*]], <vscale x 8 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv8bf16.i64(<vscale x 8 x bfloat> [[VD]], <vscale x 8 x bfloat> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret <vscale x 8 x bfloat> [[TMP0]] +// +vbfloat16m2_t test_sf_vfexp_v_bf16m2_mu(vbool8_t vm, vbfloat16m2_t vd, vbfloat16m2_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_bf16m2_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 16 x bfloat> @test_sf_vfexp_v_bf16m4_mu( +// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x bfloat> [[VD:%.*]], <vscale x 16 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv16bf16.i64(<vscale x 16 x bfloat> [[VD]], <vscale x 16 x bfloat> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret <vscale x 16 x bfloat> [[TMP0]] +// +vbfloat16m4_t test_sf_vfexp_v_bf16m4_mu(vbool4_t vm, vbfloat16m4_t vd, vbfloat16m4_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_bf16m4_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 32 x bfloat> 
@test_sf_vfexp_v_bf16m8_mu( +// CHECK-RV64-SAME: <vscale x 32 x i1> [[VM:%.*]], <vscale x 32 x bfloat> [[VD:%.*]], <vscale x 32 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 32 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv32bf16.i64(<vscale x 32 x bfloat> [[VD]], <vscale x 32 x bfloat> [[VS2]], <vscale x 32 x i1> [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret <vscale x 32 x bfloat> [[TMP0]] +// +vbfloat16m8_t test_sf_vfexp_v_bf16m8_mu(vbool2_t vm, vbfloat16m8_t vd, vbfloat16m8_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_bf16m8_mu(vm, vd, vs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/non-overloaded/sf_vfexpa_v.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/non-overloaded/sf_vfexpa_v.c new file mode 100644 index 0000000000000..b6870264251cc --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/non-overloaded/sf_vfexpa_v.c @@ -0,0 +1,448 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64f -target-feature +zvfh \ +// RUN: -target-feature +xsfvfexpa -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include <sifive_vector.h> + +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x half> @test_sf_vfexpa_v_f16mf4_tu( +// CHECK-RV64-SAME: <vscale x 1 x half> [[VD:%.*]], <vscale x 1 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x half> @llvm.riscv.sf.vfexpa.nxv1f16.i64(<vscale x 1 x half> [[VD]], <vscale x 1 x half> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 1 x half> [[TMP0]] +// +vfloat16mf4_t test_sf_vfexpa_v_f16mf4_tu(vfloat16mf4_t vd, vfloat16mf4_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f16mf4_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x half> @test_sf_vfexpa_v_f16mf2_tu( +// CHECK-RV64-SAME: <vscale x 2 x half> [[VD:%.*]], <vscale x 2 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x half> @llvm.riscv.sf.vfexpa.nxv2f16.i64(<vscale x 2 x half> [[VD]], <vscale x 2 x half> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 2 x half> [[TMP0]] +// +vfloat16mf2_t test_sf_vfexpa_v_f16mf2_tu(vfloat16mf2_t vd, vfloat16mf2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f16mf2_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x half> @test_sf_vfexpa_v_f16m1_tu( +// CHECK-RV64-SAME: <vscale x 4 x half> [[VD:%.*]], <vscale x 4 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x half> @llvm.riscv.sf.vfexpa.nxv4f16.i64(<vscale x 4 x half> [[VD]], <vscale x 4 x half> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 4 x half> [[TMP0]] +// +vfloat16m1_t test_sf_vfexpa_v_f16m1_tu(vfloat16m1_t vd, vfloat16m1_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f16m1_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x half> @test_sf_vfexpa_v_f16m2_tu( +// CHECK-RV64-SAME: <vscale x 8 x half> [[VD:%.*]], <vscale x 8 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
<vscale x 8 x half> @llvm.riscv.sf.vfexpa.nxv8f16.i64(<vscale x 8 x half> [[VD]], <vscale x 8 x half> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 8 x half> [[TMP0]] +// +vfloat16m2_t test_sf_vfexpa_v_f16m2_tu(vfloat16m2_t vd, vfloat16m2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f16m2_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 16 x half> @test_sf_vfexpa_v_f16m4_tu( +// CHECK-RV64-SAME: <vscale x 16 x half> [[VD:%.*]], <vscale x 16 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x half> @llvm.riscv.sf.vfexpa.nxv16f16.i64(<vscale x 16 x half> [[VD]], <vscale x 16 x half> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 16 x half> [[TMP0]] +// +vfloat16m4_t test_sf_vfexpa_v_f16m4_tu(vfloat16m4_t vd, vfloat16m4_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f16m4_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 32 x half> @test_sf_vfexpa_v_f16m8_tu( +// CHECK-RV64-SAME: <vscale x 32 x half> [[VD:%.*]], <vscale x 32 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 32 x half> @llvm.riscv.sf.vfexpa.nxv32f16.i64(<vscale x 32 x half> [[VD]], <vscale x 32 x half> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 32 x half> [[TMP0]] +// +vfloat16m8_t test_sf_vfexpa_v_f16m8_tu(vfloat16m8_t vd, vfloat16m8_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f16m8_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_sf_vfexpa_v_f32mf2_tu( +// CHECK-RV64-SAME: <vscale x 1 x float> [[VD:%.*]], <vscale x 1 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.sf.vfexpa.nxv1f32.i64(<vscale x 1 x float> [[VD]], <vscale x 1 x float> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 1 x float> [[TMP0]] +// +vfloat32mf2_t test_sf_vfexpa_v_f32mf2_tu(vfloat32mf2_t vd, vfloat32mf2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f32mf2_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_sf_vfexpa_v_f32m1_tu( +// CHECK-RV64-SAME: <vscale x 2 x float> [[VD:%.*]], <vscale x 2 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.sf.vfexpa.nxv2f32.i64(<vscale x 2 x float> [[VD]], <vscale x 2 x float> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 2 x float> [[TMP0]] +// +vfloat32m1_t test_sf_vfexpa_v_f32m1_tu(vfloat32m1_t vd, vfloat32m1_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f32m1_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_sf_vfexpa_v_f32m2_tu( +// CHECK-RV64-SAME: <vscale x 4 x float> [[VD:%.*]], <vscale x 4 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.sf.vfexpa.nxv4f32.i64(<vscale x 4 x float> [[VD]], <vscale x 4 x float> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 4 x float> [[TMP0]] +// +vfloat32m2_t test_sf_vfexpa_v_f32m2_tu(vfloat32m2_t vd, vfloat32m2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f32m2_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_sf_vfexpa_v_f32m4_tu( +// CHECK-RV64-SAME: <vscale x 8 x float> [[VD:%.*]], <vscale x 8 x 
float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.sf.vfexpa.nxv8f32.i64(<vscale x 8 x float> [[VD]], <vscale x 8 x float> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 8 x float> [[TMP0]] +// +vfloat32m4_t test_sf_vfexpa_v_f32m4_tu(vfloat32m4_t vd, vfloat32m4_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f32m4_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 16 x float> @test_sf_vfexpa_v_f32m8_tu( +// CHECK-RV64-SAME: <vscale x 16 x float> [[VD:%.*]], <vscale x 16 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x float> @llvm.riscv.sf.vfexpa.nxv16f32.i64(<vscale x 16 x float> [[VD]], <vscale x 16 x float> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 16 x float> [[TMP0]] +// +vfloat32m8_t test_sf_vfexpa_v_f32m8_tu(vfloat32m8_t vd, vfloat32m8_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f32m8_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x half> @test_sf_vfexpa_v_f16mf4_tum( +// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x half> [[VD:%.*]], <vscale x 1 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x half> @llvm.riscv.sf.vfexpa.mask.nxv1f16.i64(<vscale x 1 x half> [[VD]], <vscale x 1 x half> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret <vscale x 1 x half> [[TMP0]] +// +vfloat16mf4_t test_sf_vfexpa_v_f16mf4_tum(vbool64_t vm, vfloat16mf4_t vd, vfloat16mf4_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f16mf4_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x half> @test_sf_vfexpa_v_f16mf2_tum( +// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x half> [[VD:%.*]], <vscale x 2 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x half> @llvm.riscv.sf.vfexpa.mask.nxv2f16.i64(<vscale x 2 x half> [[VD]], <vscale x 2 x half> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret <vscale x 2 x half> [[TMP0]] +// +vfloat16mf2_t test_sf_vfexpa_v_f16mf2_tum(vbool32_t vm, vfloat16mf2_t vd, vfloat16mf2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f16mf2_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x half> @test_sf_vfexpa_v_f16m1_tum( +// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x half> [[VD:%.*]], <vscale x 4 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x half> @llvm.riscv.sf.vfexpa.mask.nxv4f16.i64(<vscale x 4 x half> [[VD]], <vscale x 4 x half> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret <vscale x 4 x half> [[TMP0]] +// +vfloat16m1_t test_sf_vfexpa_v_f16m1_tum(vbool16_t vm, vfloat16m1_t vd, vfloat16m1_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f16m1_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x half> @test_sf_vfexpa_v_f16m2_tum( +// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x half> [[VD:%.*]], <vscale x 8 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x half> @llvm.riscv.sf.vfexpa.mask.nxv8f16.i64(<vscale x 8 x 
half> [[VD]], <vscale x 8 x half> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret <vscale x 8 x half> [[TMP0]] +// +vfloat16m2_t test_sf_vfexpa_v_f16m2_tum(vbool8_t vm, vfloat16m2_t vd, vfloat16m2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f16m2_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 16 x half> @test_sf_vfexpa_v_f16m4_tum( +// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x half> [[VD:%.*]], <vscale x 16 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x half> @llvm.riscv.sf.vfexpa.mask.nxv16f16.i64(<vscale x 16 x half> [[VD]], <vscale x 16 x half> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret <vscale x 16 x half> [[TMP0]] +// +vfloat16m4_t test_sf_vfexpa_v_f16m4_tum(vbool4_t vm, vfloat16m4_t vd, vfloat16m4_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f16m4_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 32 x half> @test_sf_vfexpa_v_f16m8_tum( +// CHECK-RV64-SAME: <vscale x 32 x i1> [[VM:%.*]], <vscale x 32 x half> [[VD:%.*]], <vscale x 32 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 32 x half> @llvm.riscv.sf.vfexpa.mask.nxv32f16.i64(<vscale x 32 x half> [[VD]], <vscale x 32 x half> [[VS2]], <vscale x 32 x i1> [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret <vscale x 32 x half> [[TMP0]] +// +vfloat16m8_t test_sf_vfexpa_v_f16m8_tum(vbool2_t vm, vfloat16m8_t vd, vfloat16m8_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f16m8_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_sf_vfexpa_v_f32mf2_tum( +// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x float> [[VD:%.*]], <vscale x 1 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.sf.vfexpa.mask.nxv1f32.i64(<vscale x 1 x float> [[VD]], <vscale x 1 x float> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret <vscale x 1 x float> [[TMP0]] +// +vfloat32mf2_t test_sf_vfexpa_v_f32mf2_tum(vbool64_t vm, vfloat32mf2_t vd, vfloat32mf2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f32mf2_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_sf_vfexpa_v_f32m1_tum( +// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x float> [[VD:%.*]], <vscale x 2 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.sf.vfexpa.mask.nxv2f32.i64(<vscale x 2 x float> [[VD]], <vscale x 2 x float> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret <vscale x 2 x float> [[TMP0]] +// +vfloat32m1_t test_sf_vfexpa_v_f32m1_tum(vbool32_t vm, vfloat32m1_t vd, vfloat32m1_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f32m1_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_sf_vfexpa_v_f32m2_tum( +// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x float> [[VD:%.*]], <vscale x 4 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.sf.vfexpa.mask.nxv4f32.i64(<vscale x 4 x float> [[VD]], <vscale x 4 x float> 
[[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret <vscale x 4 x float> [[TMP0]] +// +vfloat32m2_t test_sf_vfexpa_v_f32m2_tum(vbool16_t vm, vfloat32m2_t vd, vfloat32m2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f32m2_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_sf_vfexpa_v_f32m4_tum( +// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x float> [[VD:%.*]], <vscale x 8 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.sf.vfexpa.mask.nxv8f32.i64(<vscale x 8 x float> [[VD]], <vscale x 8 x float> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret <vscale x 8 x float> [[TMP0]] +// +vfloat32m4_t test_sf_vfexpa_v_f32m4_tum(vbool8_t vm, vfloat32m4_t vd, vfloat32m4_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f32m4_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 16 x float> @test_sf_vfexpa_v_f32m8_tum( +// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x float> [[VD:%.*]], <vscale x 16 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x float> @llvm.riscv.sf.vfexpa.mask.nxv16f32.i64(<vscale x 16 x float> [[VD]], <vscale x 16 x float> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret <vscale x 16 x float> [[TMP0]] +// +vfloat32m8_t test_sf_vfexpa_v_f32m8_tum(vbool4_t vm, vfloat32m8_t vd, vfloat32m8_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f32m8_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x half> @test_sf_vfexpa_v_f16mf4_tumu( +// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x half> [[VD:%.*]], <vscale x 1 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x half> @llvm.riscv.sf.vfexpa.mask.nxv1f16.i64(<vscale x 1 x half> [[VD]], <vscale x 1 x half> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret <vscale x 1 x half> [[TMP0]] +// +vfloat16mf4_t test_sf_vfexpa_v_f16mf4_tumu(vbool64_t vm, vfloat16mf4_t vd, vfloat16mf4_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f16mf4_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x half> @test_sf_vfexpa_v_f16mf2_tumu( +// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x half> [[VD:%.*]], <vscale x 2 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x half> @llvm.riscv.sf.vfexpa.mask.nxv2f16.i64(<vscale x 2 x half> [[VD]], <vscale x 2 x half> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret <vscale x 2 x half> [[TMP0]] +// +vfloat16mf2_t test_sf_vfexpa_v_f16mf2_tumu(vbool32_t vm, vfloat16mf2_t vd, vfloat16mf2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f16mf2_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x half> @test_sf_vfexpa_v_f16m1_tumu( +// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x half> [[VD:%.*]], <vscale x 4 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x half> @llvm.riscv.sf.vfexpa.mask.nxv4f16.i64(<vscale x 4 x half> [[VD]], <vscale x 4 x half> [[VS2]], <vscale x 4 x i1> [[VM]], i64 
[[VL]], i64 0) +// CHECK-RV64-NEXT: ret <vscale x 4 x half> [[TMP0]] +// +vfloat16m1_t test_sf_vfexpa_v_f16m1_tumu(vbool16_t vm, vfloat16m1_t vd, vfloat16m1_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f16m1_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x half> @test_sf_vfexpa_v_f16m2_tumu( +// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x half> [[VD:%.*]], <vscale x 8 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x half> @llvm.riscv.sf.vfexpa.mask.nxv8f16.i64(<vscale x 8 x half> [[VD]], <vscale x 8 x half> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret <vscale x 8 x half> [[TMP0]] +// +vfloat16m2_t test_sf_vfexpa_v_f16m2_tumu(vbool8_t vm, vfloat16m2_t vd, vfloat16m2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f16m2_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 16 x half> @test_sf_vfexpa_v_f16m4_tumu( +// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x half> [[VD:%.*]], <vscale x 16 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x half> @llvm.riscv.sf.vfexpa.mask.nxv16f16.i64(<vscale x 16 x half> [[VD]], <vscale x 16 x half> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret <vscale x 16 x half> [[TMP0]] +// +vfloat16m4_t test_sf_vfexpa_v_f16m4_tumu(vbool4_t vm, vfloat16m4_t vd, vfloat16m4_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f16m4_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 32 x half> @test_sf_vfexpa_v_f16m8_tumu( +// CHECK-RV64-SAME: <vscale x 32 x i1> [[VM:%.*]], <vscale x 32 x half> [[VD:%.*]], <vscale x 32 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 32 x half> @llvm.riscv.sf.vfexpa.mask.nxv32f16.i64(<vscale x 32 x half> [[VD]], <vscale x 32 x half> [[VS2]], <vscale x 32 x i1> [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret <vscale x 32 x half> [[TMP0]] +// +vfloat16m8_t test_sf_vfexpa_v_f16m8_tumu(vbool2_t vm, vfloat16m8_t vd, vfloat16m8_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f16m8_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_sf_vfexpa_v_f32mf2_tumu( +// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x float> [[VD:%.*]], <vscale x 1 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.sf.vfexpa.mask.nxv1f32.i64(<vscale x 1 x float> [[VD]], <vscale x 1 x float> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret <vscale x 1 x float> [[TMP0]] +// +vfloat32mf2_t test_sf_vfexpa_v_f32mf2_tumu(vbool64_t vm, vfloat32mf2_t vd, vfloat32mf2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f32mf2_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_sf_vfexpa_v_f32m1_tumu( +// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x float> [[VD:%.*]], <vscale x 2 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.sf.vfexpa.mask.nxv2f32.i64(<vscale x 2 x float> [[VD]], <vscale x 2 x float> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 0) +// 
CHECK-RV64-NEXT: ret <vscale x 2 x float> [[TMP0]] +// +vfloat32m1_t test_sf_vfexpa_v_f32m1_tumu(vbool32_t vm, vfloat32m1_t vd, vfloat32m1_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f32m1_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_sf_vfexpa_v_f32m2_tumu( +// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x float> [[VD:%.*]], <vscale x 4 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.sf.vfexpa.mask.nxv4f32.i64(<vscale x 4 x float> [[VD]], <vscale x 4 x float> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret <vscale x 4 x float> [[TMP0]] +// +vfloat32m2_t test_sf_vfexpa_v_f32m2_tumu(vbool16_t vm, vfloat32m2_t vd, vfloat32m2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f32m2_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_sf_vfexpa_v_f32m4_tumu( +// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x float> [[VD:%.*]], <vscale x 8 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.sf.vfexpa.mask.nxv8f32.i64(<vscale x 8 x float> [[VD]], <vscale x 8 x float> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret <vscale x 8 x float> [[TMP0]] +// +vfloat32m4_t test_sf_vfexpa_v_f32m4_tumu(vbool8_t vm, vfloat32m4_t vd, vfloat32m4_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f32m4_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 16 x float> @test_sf_vfexpa_v_f32m8_tumu( +// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x float> [[VD:%.*]], <vscale x 16 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x float> @llvm.riscv.sf.vfexpa.mask.nxv16f32.i64(<vscale x 16 x float> [[VD]], <vscale x 16 x float> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret <vscale x 16 x float> [[TMP0]] +// +vfloat32m8_t test_sf_vfexpa_v_f32m8_tumu(vbool4_t vm, vfloat32m8_t vd, vfloat32m8_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f32m8_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x half> @test_sf_vfexpa_v_f16mf4_mu( +// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x half> [[VD:%.*]], <vscale x 1 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x half> @llvm.riscv.sf.vfexpa.mask.nxv1f16.i64(<vscale x 1 x half> [[VD]], <vscale x 1 x half> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret <vscale x 1 x half> [[TMP0]] +// +vfloat16mf4_t test_sf_vfexpa_v_f16mf4_mu(vbool64_t vm, vfloat16mf4_t vd, vfloat16mf4_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f16mf4_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x half> @test_sf_vfexpa_v_f16mf2_mu( +// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x half> [[VD:%.*]], <vscale x 2 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x half> @llvm.riscv.sf.vfexpa.mask.nxv2f16.i64(<vscale x 2 x half> [[VD]], <vscale x 2 x half> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret <vscale x 2 x half> 
[[TMP0]] +// +vfloat16mf2_t test_sf_vfexpa_v_f16mf2_mu(vbool32_t vm, vfloat16mf2_t vd, vfloat16mf2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f16mf2_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x half> @test_sf_vfexpa_v_f16m1_mu( +// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x half> [[VD:%.*]], <vscale x 4 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x half> @llvm.riscv.sf.vfexpa.mask.nxv4f16.i64(<vscale x 4 x half> [[VD]], <vscale x 4 x half> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret <vscale x 4 x half> [[TMP0]] +// +vfloat16m1_t test_sf_vfexpa_v_f16m1_mu(vbool16_t vm, vfloat16m1_t vd, vfloat16m1_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f16m1_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x half> @test_sf_vfexpa_v_f16m2_mu( +// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x half> [[VD:%.*]], <vscale x 8 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x half> @llvm.riscv.sf.vfexpa.mask.nxv8f16.i64(<vscale x 8 x half> [[VD]], <vscale x 8 x half> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret <vscale x 8 x half> [[TMP0]] +// +vfloat16m2_t test_sf_vfexpa_v_f16m2_mu(vbool8_t vm, vfloat16m2_t vd, vfloat16m2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f16m2_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 16 x half> @test_sf_vfexpa_v_f16m4_mu( +// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x half> [[VD:%.*]], <vscale x 16 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x half> @llvm.riscv.sf.vfexpa.mask.nxv16f16.i64(<vscale x 16 x half> [[VD]], <vscale x 16 x half> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret <vscale x 16 x half> [[TMP0]] +// +vfloat16m4_t test_sf_vfexpa_v_f16m4_mu(vbool4_t vm, vfloat16m4_t vd, vfloat16m4_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f16m4_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 32 x half> @test_sf_vfexpa_v_f16m8_mu( +// CHECK-RV64-SAME: <vscale x 32 x i1> [[VM:%.*]], <vscale x 32 x half> [[VD:%.*]], <vscale x 32 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 32 x half> @llvm.riscv.sf.vfexpa.mask.nxv32f16.i64(<vscale x 32 x half> [[VD]], <vscale x 32 x half> [[VS2]], <vscale x 32 x i1> [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret <vscale x 32 x half> [[TMP0]] +// +vfloat16m8_t test_sf_vfexpa_v_f16m8_mu(vbool2_t vm, vfloat16m8_t vd, vfloat16m8_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f16m8_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_sf_vfexpa_v_f32mf2_mu( +// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x float> [[VD:%.*]], <vscale x 1 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.sf.vfexpa.mask.nxv1f32.i64(<vscale x 1 x float> [[VD]], <vscale x 1 x float> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret <vscale x 1 x float> [[TMP0]] +// +vfloat32mf2_t test_sf_vfexpa_v_f32mf2_mu(vbool64_t vm, 
vfloat32mf2_t vd, vfloat32mf2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f32mf2_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_sf_vfexpa_v_f32m1_mu( +// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x float> [[VD:%.*]], <vscale x 2 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.sf.vfexpa.mask.nxv2f32.i64(<vscale x 2 x float> [[VD]], <vscale x 2 x float> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret <vscale x 2 x float> [[TMP0]] +// +vfloat32m1_t test_sf_vfexpa_v_f32m1_mu(vbool32_t vm, vfloat32m1_t vd, vfloat32m1_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f32m1_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_sf_vfexpa_v_f32m2_mu( +// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x float> [[VD:%.*]], <vscale x 4 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.sf.vfexpa.mask.nxv4f32.i64(<vscale x 4 x float> [[VD]], <vscale x 4 x float> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret <vscale x 4 x float> [[TMP0]] +// +vfloat32m2_t test_sf_vfexpa_v_f32m2_mu(vbool16_t vm, vfloat32m2_t vd, vfloat32m2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f32m2_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_sf_vfexpa_v_f32m4_mu( +// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x float> [[VD:%.*]], <vscale x 8 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.sf.vfexpa.mask.nxv8f32.i64(<vscale x 8 x float> [[VD]], <vscale x 8 x float> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret <vscale x 8 x float> [[TMP0]] +// +vfloat32m4_t test_sf_vfexpa_v_f32m4_mu(vbool8_t vm, vfloat32m4_t vd, vfloat32m4_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f32m4_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 16 x float> @test_sf_vfexpa_v_f32m8_mu( +// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x float> [[VD:%.*]], <vscale x 16 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x float> @llvm.riscv.sf.vfexpa.mask.nxv16f32.i64(<vscale x 16 x float> [[VD]], <vscale x 16 x float> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret <vscale x 16 x float> [[TMP0]] +// +vfloat32m8_t test_sf_vfexpa_v_f32m8_mu(vbool4_t vm, vfloat32m8_t vd, vfloat32m8_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f32m8_mu(vm, vd, vs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/non-overloaded/sf_vfexpa_v_64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/non-overloaded/sf_vfexpa_v_64.c new file mode 100644 index 0000000000000..8638dc232cf01 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/non-overloaded/sf_vfexpa_v_64.c @@ -0,0 +1,167 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +xsfvfexpa64e \ +// RUN: -disable-O0-optnone -emit-llvm %s -o - | opt -S 
-passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include <sifive_vector.h> + +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x double> @test_sf_vfexpa_v_f64m1_tu( +// CHECK-RV64-SAME: <vscale x 1 x double> [[VD:%.*]], <vscale x 1 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x double> @llvm.riscv.sf.vfexpa.nxv1f64.i64(<vscale x 1 x double> [[VD]], <vscale x 1 x double> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 1 x double> [[TMP0]] +// +vfloat64m1_t test_sf_vfexpa_v_f64m1_tu(vfloat64m1_t vd, vfloat64m1_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f64m1_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x double> @test_sf_vfexpa_v_f64m2_tu( +// CHECK-RV64-SAME: <vscale x 2 x double> [[VD:%.*]], <vscale x 2 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x double> @llvm.riscv.sf.vfexpa.nxv2f64.i64(<vscale x 2 x double> [[VD]], <vscale x 2 x double> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 2 x double> [[TMP0]] +// +vfloat64m2_t test_sf_vfexpa_v_f64m2_tu(vfloat64m2_t vd, vfloat64m2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f64m2_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x double> @test_sf_vfexpa_v_f64m4_tu( +// CHECK-RV64-SAME: <vscale x 4 x double> [[VD:%.*]], <vscale x 4 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x double> @llvm.riscv.sf.vfexpa.nxv4f64.i64(<vscale x 4 x double> [[VD]], <vscale x 4 x double> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 4 x double> [[TMP0]] +// +vfloat64m4_t test_sf_vfexpa_v_f64m4_tu(vfloat64m4_t vd, vfloat64m4_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f64m4_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x double> @test_sf_vfexpa_v_f64m8_tu( +// CHECK-RV64-SAME: <vscale x 8 x double> [[VD:%.*]], <vscale x 8 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x double> @llvm.riscv.sf.vfexpa.nxv8f64.i64(<vscale x 8 x double> [[VD]], <vscale x 8 x double> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 8 x double> [[TMP0]] +// +vfloat64m8_t test_sf_vfexpa_v_f64m8_tu(vfloat64m8_t vd, vfloat64m8_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f64m8_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x double> @test_sf_vfexpa_v_f64m1_tum( +// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x double> [[VD:%.*]], <vscale x 1 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x double> @llvm.riscv.sf.vfexpa.mask.nxv1f64.i64(<vscale x 1 x double> [[VD]], <vscale x 1 x double> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret <vscale x 1 x double> [[TMP0]] +// +vfloat64m1_t test_sf_vfexpa_v_f64m1_tum(vbool64_t vm, vfloat64m1_t vd, vfloat64m1_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f64m1_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x double> @test_sf_vfexpa_v_f64m2_tum( +// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x double> [[VD:%.*]], <vscale x 2 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// 
CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x double> @llvm.riscv.sf.vfexpa.mask.nxv2f64.i64(<vscale x 2 x double> [[VD]], <vscale x 2 x double> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret <vscale x 2 x double> [[TMP0]] +// +vfloat64m2_t test_sf_vfexpa_v_f64m2_tum(vbool32_t vm, vfloat64m2_t vd, vfloat64m2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f64m2_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x double> @test_sf_vfexpa_v_f64m4_tum( +// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x double> [[VD:%.*]], <vscale x 4 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x double> @llvm.riscv.sf.vfexpa.mask.nxv4f64.i64(<vscale x 4 x double> [[VD]], <vscale x 4 x double> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret <vscale x 4 x double> [[TMP0]] +// +vfloat64m4_t test_sf_vfexpa_v_f64m4_tum(vbool16_t vm, vfloat64m4_t vd, vfloat64m4_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f64m4_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x double> @test_sf_vfexpa_v_f64m8_tum( +// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x double> [[VD:%.*]], <vscale x 8 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x double> @llvm.riscv.sf.vfexpa.mask.nxv8f64.i64(<vscale x 8 x double> [[VD]], <vscale x 8 x double> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret <vscale x 8 x double> [[TMP0]] +// +vfloat64m8_t test_sf_vfexpa_v_f64m8_tum(vbool8_t vm, vfloat64m8_t vd, vfloat64m8_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f64m8_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x double> @test_sf_vfexpa_v_f64m1_tumu( +// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x double> [[VD:%.*]], <vscale x 1 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x double> @llvm.riscv.sf.vfexpa.mask.nxv1f64.i64(<vscale x 1 x double> [[VD]], <vscale x 1 x double> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret <vscale x 1 x double> [[TMP0]] +// +vfloat64m1_t test_sf_vfexpa_v_f64m1_tumu(vbool64_t vm, vfloat64m1_t vd, vfloat64m1_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f64m1_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x double> @test_sf_vfexpa_v_f64m2_tumu( +// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x double> [[VD:%.*]], <vscale x 2 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x double> @llvm.riscv.sf.vfexpa.mask.nxv2f64.i64(<vscale x 2 x double> [[VD]], <vscale x 2 x double> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret <vscale x 2 x double> [[TMP0]] +// +vfloat64m2_t test_sf_vfexpa_v_f64m2_tumu(vbool32_t vm, vfloat64m2_t vd, vfloat64m2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f64m2_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x double> @test_sf_vfexpa_v_f64m4_tumu( +// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x double> [[VD:%.*]], <vscale x 4 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// 
CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x double> @llvm.riscv.sf.vfexpa.mask.nxv4f64.i64(<vscale x 4 x double> [[VD]], <vscale x 4 x double> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret <vscale x 4 x double> [[TMP0]] +// +vfloat64m4_t test_sf_vfexpa_v_f64m4_tumu(vbool16_t vm, vfloat64m4_t vd, vfloat64m4_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f64m4_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x double> @test_sf_vfexpa_v_f64m8_tumu( +// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x double> [[VD:%.*]], <vscale x 8 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x double> @llvm.riscv.sf.vfexpa.mask.nxv8f64.i64(<vscale x 8 x double> [[VD]], <vscale x 8 x double> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret <vscale x 8 x double> [[TMP0]] +// +vfloat64m8_t test_sf_vfexpa_v_f64m8_tumu(vbool8_t vm, vfloat64m8_t vd, vfloat64m8_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f64m8_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x double> @test_sf_vfexpa_v_f64m1_mu( +// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x double> [[VD:%.*]], <vscale x 1 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x double> @llvm.riscv.sf.vfexpa.mask.nxv1f64.i64(<vscale x 1 x double> [[VD]], <vscale x 1 x double> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret <vscale x 1 x double> [[TMP0]] +// +vfloat64m1_t test_sf_vfexpa_v_f64m1_mu(vbool64_t vm, vfloat64m1_t vd, vfloat64m1_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f64m1_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x double> @test_sf_vfexpa_v_f64m2_mu( +// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x double> [[VD:%.*]], <vscale x 2 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x double> @llvm.riscv.sf.vfexpa.mask.nxv2f64.i64(<vscale x 2 x double> [[VD]], <vscale x 2 x double> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret <vscale x 2 x double> [[TMP0]] +// +vfloat64m2_t test_sf_vfexpa_v_f64m2_mu(vbool32_t vm, vfloat64m2_t vd, vfloat64m2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f64m2_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x double> @test_sf_vfexpa_v_f64m4_mu( +// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x double> [[VD:%.*]], <vscale x 4 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x double> @llvm.riscv.sf.vfexpa.mask.nxv4f64.i64(<vscale x 4 x double> [[VD]], <vscale x 4 x double> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret <vscale x 4 x double> [[TMP0]] +// +vfloat64m4_t test_sf_vfexpa_v_f64m4_mu(vbool16_t vm, vfloat64m4_t vd, vfloat64m4_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f64m4_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x double> @test_sf_vfexpa_v_f64m8_mu( +// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x double> [[VD:%.*]], <vscale x 8 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: 
entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x double> @llvm.riscv.sf.vfexpa.mask.nxv8f64.i64(<vscale x 8 x double> [[VD]], <vscale x 8 x double> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret <vscale x 8 x double> [[TMP0]] +// +vfloat64m8_t test_sf_vfexpa_v_f64m8_mu(vbool8_t vm, vfloat64m8_t vd, vfloat64m8_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f64m8_mu(vm, vd, vs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/overloaded/sf_vfexp_v_16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/overloaded/sf_vfexp_v_16.c new file mode 100644 index 0000000000000..4ceeb7b35629c --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/overloaded/sf_vfexp_v_16.c @@ -0,0 +1,261 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x -target-feature +zvfh \ +// RUN: -target-feature +xsfvfexp16e -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include <sifive_vector.h> + +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x half> @test_sf_vfexp_v_f16mf2_tu( +// CHECK-RV64-SAME: <vscale x 2 x half> [[VD:%.*]], <vscale x 2 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x half> @llvm.riscv.sf.vfexp.nxv2f16.i64(<vscale x 2 x half> [[VD]], <vscale x 2 x half> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 2 x half> [[TMP0]] +// +vfloat16mf2_t test_sf_vfexp_v_f16mf2_tu(vfloat16mf2_t vd, vfloat16mf2_t vs2, + size_t vl) { + return __riscv_sf_vfexp_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x half> @test_sf_vfexp_v_f16m1_tu( +// CHECK-RV64-SAME: <vscale x 4 x half> [[VD:%.*]], <vscale x 4 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x half> @llvm.riscv.sf.vfexp.nxv4f16.i64(<vscale x 4 x half> [[VD]], <vscale x 4 x half> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 4 x half> [[TMP0]] +// +vfloat16m1_t test_sf_vfexp_v_f16m1_tu(vfloat16m1_t vd, vfloat16m1_t vs2, + size_t vl) { + return __riscv_sf_vfexp_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x half> @test_sf_vfexp_v_f16m2_tu( +// CHECK-RV64-SAME: <vscale x 8 x half> [[VD:%.*]], <vscale x 8 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x half> @llvm.riscv.sf.vfexp.nxv8f16.i64(<vscale x 8 x half> [[VD]], <vscale x 8 x half> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 8 x half> [[TMP0]] +// +vfloat16m2_t test_sf_vfexp_v_f16m2_tu(vfloat16m2_t vd, vfloat16m2_t vs2, + size_t vl) { + return __riscv_sf_vfexp_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 16 x half> @test_sf_vfexp_v_f16m4_tu( +// CHECK-RV64-SAME: <vscale x 16 x half> [[VD:%.*]], <vscale x 16 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x half> @llvm.riscv.sf.vfexp.nxv16f16.i64(<vscale x 16 x half> [[VD]], <vscale x 16 x half> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 16 x half> [[TMP0]] +// +vfloat16m4_t test_sf_vfexp_v_f16m4_tu(vfloat16m4_t vd, 
vfloat16m4_t vs2, + size_t vl) { + return __riscv_sf_vfexp_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 32 x half> @test_sf_vfexp_v_f16m8_tu( +// CHECK-RV64-SAME: <vscale x 32 x half> [[VD:%.*]], <vscale x 32 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 32 x half> @llvm.riscv.sf.vfexp.nxv32f16.i64(<vscale x 32 x half> [[VD]], <vscale x 32 x half> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 32 x half> [[TMP0]] +// +vfloat16m8_t test_sf_vfexp_v_f16m8_tu(vfloat16m8_t vd, vfloat16m8_t vs2, + size_t vl) { + return __riscv_sf_vfexp_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x half> @test_sf_vfexp_v_f16mf4_tum( +// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x half> [[VD:%.*]], <vscale x 1 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x half> @llvm.riscv.sf.vfexp.mask.nxv1f16.i64(<vscale x 1 x half> [[VD]], <vscale x 1 x half> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret <vscale x 1 x half> [[TMP0]] +// +vfloat16mf4_t test_sf_vfexp_v_f16mf4_tum(vbool64_t vm, vfloat16mf4_t vd, + vfloat16mf4_t vs2, size_t vl) { + return __riscv_sf_vfexp_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x half> @test_sf_vfexp_v_f16mf2_tum( +// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x half> [[VD:%.*]], <vscale x 2 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x half> @llvm.riscv.sf.vfexp.mask.nxv2f16.i64(<vscale x 2 x half> [[VD]], <vscale x 2 x half> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret <vscale x 2 x half> [[TMP0]] +// +vfloat16mf2_t test_sf_vfexp_v_f16mf2_tum(vbool32_t vm, vfloat16mf2_t vd, + vfloat16mf2_t vs2, size_t vl) { + return __riscv_sf_vfexp_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x half> @test_sf_vfexp_v_f16m1_tum( +// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x half> [[VD:%.*]], <vscale x 4 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x half> @llvm.riscv.sf.vfexp.mask.nxv4f16.i64(<vscale x 4 x half> [[VD]], <vscale x 4 x half> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret <vscale x 4 x half> [[TMP0]] +// +vfloat16m1_t test_sf_vfexp_v_f16m1_tum(vbool16_t vm, vfloat16m1_t vd, + vfloat16m1_t vs2, size_t vl) { + return __riscv_sf_vfexp_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x half> @test_sf_vfexp_v_f16m2_tum( +// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x half> [[VD:%.*]], <vscale x 8 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x half> @llvm.riscv.sf.vfexp.mask.nxv8f16.i64(<vscale x 8 x half> [[VD]], <vscale x 8 x half> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret <vscale x 8 x half> [[TMP0]] +// +vfloat16m2_t test_sf_vfexp_v_f16m2_tum(vbool8_t vm, vfloat16m2_t vd, + vfloat16m2_t vs2, size_t vl) { + return __riscv_sf_vfexp_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 16 x half> @test_sf_vfexp_v_f16m4_tum( 
+// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x half> [[VD:%.*]], <vscale x 16 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x half> @llvm.riscv.sf.vfexp.mask.nxv16f16.i64(<vscale x 16 x half> [[VD]], <vscale x 16 x half> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret <vscale x 16 x half> [[TMP0]] +// +vfloat16m4_t test_sf_vfexp_v_f16m4_tum(vbool4_t vm, vfloat16m4_t vd, + vfloat16m4_t vs2, size_t vl) { + return __riscv_sf_vfexp_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 32 x half> @test_sf_vfexp_v_f16m8_tum( +// CHECK-RV64-SAME: <vscale x 32 x i1> [[VM:%.*]], <vscale x 32 x half> [[VD:%.*]], <vscale x 32 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 32 x half> @llvm.riscv.sf.vfexp.mask.nxv32f16.i64(<vscale x 32 x half> [[VD]], <vscale x 32 x half> [[VS2]], <vscale x 32 x i1> [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret <vscale x 32 x half> [[TMP0]] +// +vfloat16m8_t test_sf_vfexp_v_f16m8_tum(vbool2_t vm, vfloat16m8_t vd, + vfloat16m8_t vs2, size_t vl) { + return __riscv_sf_vfexp_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x half> @test_sf_vfexp_v_f16mf4_tumu( +// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x half> [[VD:%.*]], <vscale x 1 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x half> @llvm.riscv.sf.vfexp.mask.nxv1f16.i64(<vscale x 1 x half> [[VD]], <vscale x 1 x half> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret <vscale x 1 x half> [[TMP0]] +// +vfloat16mf4_t test_sf_vfexp_v_f16mf4_tumu(vbool64_t vm, vfloat16mf4_t vd, + vfloat16mf4_t vs2, size_t vl) { + return __riscv_sf_vfexp_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x half> @test_sf_vfexp_v_f16mf2_tumu( +// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x half> [[VD:%.*]], <vscale x 2 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x half> @llvm.riscv.sf.vfexp.mask.nxv2f16.i64(<vscale x 2 x half> [[VD]], <vscale x 2 x half> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret <vscale x 2 x half> [[TMP0]] +// +vfloat16mf2_t test_sf_vfexp_v_f16mf2_tumu(vbool32_t vm, vfloat16mf2_t vd, + vfloat16mf2_t vs2, size_t vl) { + return __riscv_sf_vfexp_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x half> @test_sf_vfexp_v_f16m1_tumu( +// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x half> [[VD:%.*]], <vscale x 4 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x half> @llvm.riscv.sf.vfexp.mask.nxv4f16.i64(<vscale x 4 x half> [[VD]], <vscale x 4 x half> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret <vscale x 4 x half> [[TMP0]] +// +vfloat16m1_t test_sf_vfexp_v_f16m1_tumu(vbool16_t vm, vfloat16m1_t vd, + vfloat16m1_t vs2, size_t vl) { + return __riscv_sf_vfexp_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x half> @test_sf_vfexp_v_f16m2_tumu( +// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 
x half> [[VD:%.*]], <vscale x 8 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x half> @llvm.riscv.sf.vfexp.mask.nxv8f16.i64(<vscale x 8 x half> [[VD]], <vscale x 8 x half> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret <vscale x 8 x half> [[TMP0]] +// +vfloat16m2_t test_sf_vfexp_v_f16m2_tumu(vbool8_t vm, vfloat16m2_t vd, + vfloat16m2_t vs2, size_t vl) { + return __riscv_sf_vfexp_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 16 x half> @test_sf_vfexp_v_f16m4_tumu( +// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x half> [[VD:%.*]], <vscale x 16 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x half> @llvm.riscv.sf.vfexp.mask.nxv16f16.i64(<vscale x 16 x half> [[VD]], <vscale x 16 x half> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret <vscale x 16 x half> [[TMP0]] +// +vfloat16m4_t test_sf_vfexp_v_f16m4_tumu(vbool4_t vm, vfloat16m4_t vd, + vfloat16m4_t vs2, size_t vl) { + return __riscv_sf_vfexp_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 32 x half> @test_sf_vfexp_v_f16m8_tumu( +// CHECK-RV64-SAME: <vscale x 32 x i1> [[VM:%.*]], <vscale x 32 x half> [[VD:%.*]], <vscale x 32 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 32 x half> @llvm.riscv.sf.vfexp.mask.nxv32f16.i64(<vscale x 32 x half> [[VD]], <vscale x 32 x half> [[VS2]], <vscale x 32 x i1> [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret <vscale x 32 x half> [[TMP0]] +// +vfloat16m8_t test_sf_vfexp_v_f16m8_tumu(vbool2_t vm, vfloat16m8_t vd, + vfloat16m8_t vs2, size_t vl) { + return __riscv_sf_vfexp_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x half> @test_sf_vfexp_v_f16mf4_mu( +// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x half> [[VD:%.*]], <vscale x 1 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x half> @llvm.riscv.sf.vfexp.mask.nxv1f16.i64(<vscale x 1 x half> [[VD]], <vscale x 1 x half> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret <vscale x 1 x half> [[TMP0]] +// +vfloat16mf4_t test_sf_vfexp_v_f16mf4_mu(vbool64_t vm, vfloat16mf4_t vd, + vfloat16mf4_t vs2, size_t vl) { + return __riscv_sf_vfexp_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x half> @test_sf_vfexp_v_f16mf2_mu( +// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x half> [[VD:%.*]], <vscale x 2 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x half> @llvm.riscv.sf.vfexp.mask.nxv2f16.i64(<vscale x 2 x half> [[VD]], <vscale x 2 x half> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret <vscale x 2 x half> [[TMP0]] +// +vfloat16mf2_t test_sf_vfexp_v_f16mf2_mu(vbool32_t vm, vfloat16mf2_t vd, + vfloat16mf2_t vs2, size_t vl) { + return __riscv_sf_vfexp_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x half> @test_sf_vfexp_v_f16m1_mu( +// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x half> [[VD:%.*]], <vscale x 4 x half> [[VS2:%.*]], i64 noundef 
[[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x half> @llvm.riscv.sf.vfexp.mask.nxv4f16.i64(<vscale x 4 x half> [[VD]], <vscale x 4 x half> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret <vscale x 4 x half> [[TMP0]] +// +vfloat16m1_t test_sf_vfexp_v_f16m1_mu(vbool16_t vm, vfloat16m1_t vd, + vfloat16m1_t vs2, size_t vl) { + return __riscv_sf_vfexp_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x half> @test_sf_vfexp_v_f16m2_mu( +// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x half> [[VD:%.*]], <vscale x 8 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x half> @llvm.riscv.sf.vfexp.mask.nxv8f16.i64(<vscale x 8 x half> [[VD]], <vscale x 8 x half> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret <vscale x 8 x half> [[TMP0]] +// +vfloat16m2_t test_sf_vfexp_v_f16m2_mu(vbool8_t vm, vfloat16m2_t vd, + vfloat16m2_t vs2, size_t vl) { + return __riscv_sf_vfexp_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 16 x half> @test_sf_vfexp_v_f16m4_mu( +// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x half> [[VD:%.*]], <vscale x 16 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x half> @llvm.riscv.sf.vfexp.mask.nxv16f16.i64(<vscale x 16 x half> [[VD]], <vscale x 16 x half> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret <vscale x 16 x half> [[TMP0]] +// +vfloat16m4_t test_sf_vfexp_v_f16m4_mu(vbool4_t vm, vfloat16m4_t vd, + vfloat16m4_t vs2, size_t vl) { + return __riscv_sf_vfexp_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 32 x half> @test_sf_vfexp_v_f16m8_mu( +// CHECK-RV64-SAME: <vscale x 32 x i1> [[VM:%.*]], <vscale x 32 x half> [[VD:%.*]], <vscale x 32 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 32 x half> @llvm.riscv.sf.vfexp.mask.nxv32f16.i64(<vscale x 32 x half> [[VD]], <vscale x 32 x half> [[VS2]], <vscale x 32 x i1> [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret <vscale x 32 x half> [[TMP0]] +// +vfloat16m8_t test_sf_vfexp_v_f16m8_mu(vbool2_t vm, vfloat16m8_t vd, + vfloat16m8_t vs2, size_t vl) { + return __riscv_sf_vfexp_mu(vm, vd, vs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/overloaded/sf_vfexp_v_32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/overloaded/sf_vfexp_v_32.c new file mode 100644 index 0000000000000..e08d6c5b371cc --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/overloaded/sf_vfexp_v_32.c @@ -0,0 +1,228 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x -target-feature +zve32f \ +// RUN: -target-feature +xsfvfexp32e -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include <sifive_vector.h> + +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_sf_vfexp_v_f32mf2_tu( +// CHECK-RV64-SAME: <vscale x 1 x float> [[VD:%.*]], <vscale x 1 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// 
CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.sf.vfexp.nxv1f32.i64(<vscale x 1 x float> [[VD]], <vscale x 1 x float> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 1 x float> [[TMP0]] +// +vfloat32mf2_t test_sf_vfexp_v_f32mf2_tu(vfloat32mf2_t vd, vfloat32mf2_t vs2, + size_t vl) { + return __riscv_sf_vfexp_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_sf_vfexp_v_f32m1_tu( +// CHECK-RV64-SAME: <vscale x 2 x float> [[VD:%.*]], <vscale x 2 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.sf.vfexp.nxv2f32.i64(<vscale x 2 x float> [[VD]], <vscale x 2 x float> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 2 x float> [[TMP0]] +// +vfloat32m1_t test_sf_vfexp_v_f32m1_tu(vfloat32m1_t vd, vfloat32m1_t vs2, + size_t vl) { + return __riscv_sf_vfexp_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_sf_vfexp_v_f32m2_tu( +// CHECK-RV64-SAME: <vscale x 4 x float> [[VD:%.*]], <vscale x 4 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.sf.vfexp.nxv4f32.i64(<vscale x 4 x float> [[VD]], <vscale x 4 x float> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 4 x float> [[TMP0]] +// +vfloat32m2_t test_sf_vfexp_v_f32m2_tu(vfloat32m2_t vd, vfloat32m2_t vs2, + size_t vl) { + return __riscv_sf_vfexp_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_sf_vfexp_v_f32m4_tu( +// CHECK-RV64-SAME: <vscale x 8 x float> [[VD:%.*]], <vscale x 8 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.sf.vfexp.nxv8f32.i64(<vscale x 8 x float> [[VD]], <vscale x 8 x float> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 8 x float> [[TMP0]] +// +vfloat32m4_t test_sf_vfexp_v_f32m4_tu(vfloat32m4_t vd, vfloat32m4_t vs2, + size_t vl) { + return __riscv_sf_vfexp_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 16 x float> @test_sf_vfexp_v_f32m8_tu( +// CHECK-RV64-SAME: <vscale x 16 x float> [[VD:%.*]], <vscale x 16 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x float> @llvm.riscv.sf.vfexp.nxv16f32.i64(<vscale x 16 x float> [[VD]], <vscale x 16 x float> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 16 x float> [[TMP0]] +// +vfloat32m8_t test_sf_vfexp_v_f32m8_tu(vfloat32m8_t vd, vfloat32m8_t vs2, + size_t vl) { + return __riscv_sf_vfexp_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_sf_vfexp_v_f32mf2_tum( +// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x float> [[VD:%.*]], <vscale x 1 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.sf.vfexp.mask.nxv1f32.i64(<vscale x 1 x float> [[VD]], <vscale x 1 x float> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret <vscale x 1 x float> [[TMP0]] +// +vfloat32mf2_t test_sf_vfexp_v_f32mf2_tum(vbool64_t vm, vfloat32mf2_t vd, + vfloat32mf2_t vs2, size_t vl) { + return __riscv_sf_vfexp_tum(vm, vd, vs2, vl); +} + +// 
CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_sf_vfexp_v_f32m1_tum( +// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x float> [[VD:%.*]], <vscale x 2 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.sf.vfexp.mask.nxv2f32.i64(<vscale x 2 x float> [[VD]], <vscale x 2 x float> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret <vscale x 2 x float> [[TMP0]] +// +vfloat32m1_t test_sf_vfexp_v_f32m1_tum(vbool32_t vm, vfloat32m1_t vd, + vfloat32m1_t vs2, size_t vl) { + return __riscv_sf_vfexp_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_sf_vfexp_v_f32m2_tum( +// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x float> [[VD:%.*]], <vscale x 4 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.sf.vfexp.mask.nxv4f32.i64(<vscale x 4 x float> [[VD]], <vscale x 4 x float> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret <vscale x 4 x float> [[TMP0]] +// +vfloat32m2_t test_sf_vfexp_v_f32m2_tum(vbool16_t vm, vfloat32m2_t vd, + vfloat32m2_t vs2, size_t vl) { + return __riscv_sf_vfexp_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_sf_vfexp_v_f32m4_tum( +// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x float> [[VD:%.*]], <vscale x 8 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.sf.vfexp.mask.nxv8f32.i64(<vscale x 8 x float> [[VD]], <vscale x 8 x float> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret <vscale x 8 x float> [[TMP0]] +// +vfloat32m4_t test_sf_vfexp_v_f32m4_tum(vbool8_t vm, vfloat32m4_t vd, + vfloat32m4_t vs2, size_t vl) { + return __riscv_sf_vfexp_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 16 x float> @test_sf_vfexp_v_f32m8_tum( +// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x float> [[VD:%.*]], <vscale x 16 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x float> @llvm.riscv.sf.vfexp.mask.nxv16f32.i64(<vscale x 16 x float> [[VD]], <vscale x 16 x float> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret <vscale x 16 x float> [[TMP0]] +// +vfloat32m8_t test_sf_vfexp_v_f32m8_tum(vbool4_t vm, vfloat32m8_t vd, + vfloat32m8_t vs2, size_t vl) { + return __riscv_sf_vfexp_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_sf_vfexp_v_f32mf2_tumu( +// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x float> [[VD:%.*]], <vscale x 1 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.sf.vfexp.mask.nxv1f32.i64(<vscale x 1 x float> [[VD]], <vscale x 1 x float> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret <vscale x 1 x float> [[TMP0]] +// +vfloat32mf2_t test_sf_vfexp_v_f32mf2_tumu(vbool64_t vm, vfloat32mf2_t vd, + vfloat32mf2_t vs2, size_t vl) { + return __riscv_sf_vfexp_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x 
float> @test_sf_vfexp_v_f32m1_tumu( +// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x float> [[VD:%.*]], <vscale x 2 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.sf.vfexp.mask.nxv2f32.i64(<vscale x 2 x float> [[VD]], <vscale x 2 x float> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret <vscale x 2 x float> [[TMP0]] +// +vfloat32m1_t test_sf_vfexp_v_f32m1_tumu(vbool32_t vm, vfloat32m1_t vd, + vfloat32m1_t vs2, size_t vl) { + return __riscv_sf_vfexp_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_sf_vfexp_v_f32m2_tumu( +// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x float> [[VD:%.*]], <vscale x 4 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.sf.vfexp.mask.nxv4f32.i64(<vscale x 4 x float> [[VD]], <vscale x 4 x float> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret <vscale x 4 x float> [[TMP0]] +// +vfloat32m2_t test_sf_vfexp_v_f32m2_tumu(vbool16_t vm, vfloat32m2_t vd, + vfloat32m2_t vs2, size_t vl) { + return __riscv_sf_vfexp_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_sf_vfexp_v_f32m4_tumu( +// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x float> [[VD:%.*]], <vscale x 8 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.sf.vfexp.mask.nxv8f32.i64(<vscale x 8 x float> [[VD]], <vscale x 8 x float> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret <vscale x 8 x float> [[TMP0]] +// +vfloat32m4_t test_sf_vfexp_v_f32m4_tumu(vbool8_t vm, vfloat32m4_t vd, + vfloat32m4_t vs2, size_t vl) { + return __riscv_sf_vfexp_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 16 x float> @test_sf_vfexp_v_f32m8_tumu( +// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x float> [[VD:%.*]], <vscale x 16 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x float> @llvm.riscv.sf.vfexp.mask.nxv16f32.i64(<vscale x 16 x float> [[VD]], <vscale x 16 x float> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret <vscale x 16 x float> [[TMP0]] +// +vfloat32m8_t test_sf_vfexp_v_f32m8_tumu(vbool4_t vm, vfloat32m8_t vd, + vfloat32m8_t vs2, size_t vl) { + return __riscv_sf_vfexp_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_sf_vfexp_v_f32mf2_mu( +// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x float> [[VD:%.*]], <vscale x 1 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.sf.vfexp.mask.nxv1f32.i64(<vscale x 1 x float> [[VD]], <vscale x 1 x float> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret <vscale x 1 x float> [[TMP0]] +// +vfloat32mf2_t test_sf_vfexp_v_f32mf2_mu(vbool64_t vm, vfloat32mf2_t vd, + vfloat32mf2_t vs2, size_t vl) { + return __riscv_sf_vfexp_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_sf_vfexp_v_f32m1_mu( +// 
CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x float> [[VD:%.*]], <vscale x 2 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.sf.vfexp.mask.nxv2f32.i64(<vscale x 2 x float> [[VD]], <vscale x 2 x float> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret <vscale x 2 x float> [[TMP0]] +// +vfloat32m1_t test_sf_vfexp_v_f32m1_mu(vbool32_t vm, vfloat32m1_t vd, + vfloat32m1_t vs2, size_t vl) { + return __riscv_sf_vfexp_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_sf_vfexp_v_f32m2_mu( +// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x float> [[VD:%.*]], <vscale x 4 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.sf.vfexp.mask.nxv4f32.i64(<vscale x 4 x float> [[VD]], <vscale x 4 x float> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret <vscale x 4 x float> [[TMP0]] +// +vfloat32m2_t test_sf_vfexp_v_f32m2_mu(vbool16_t vm, vfloat32m2_t vd, + vfloat32m2_t vs2, size_t vl) { + return __riscv_sf_vfexp_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_sf_vfexp_v_f32m4_mu( +// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x float> [[VD:%.*]], <vscale x 8 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.sf.vfexp.mask.nxv8f32.i64(<vscale x 8 x float> [[VD]], <vscale x 8 x float> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret <vscale x 8 x float> [[TMP0]] +// +vfloat32m4_t test_sf_vfexp_v_f32m4_mu(vbool8_t vm, vfloat32m4_t vd, + vfloat32m4_t vs2, size_t vl) { + return __riscv_sf_vfexp_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 16 x float> @test_sf_vfexp_v_f32m8_mu( +// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x float> [[VD:%.*]], <vscale x 16 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x float> @llvm.riscv.sf.vfexp.mask.nxv16f32.i64(<vscale x 16 x float> [[VD]], <vscale x 16 x float> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret <vscale x 16 x float> [[TMP0]] +// +vfloat32m8_t test_sf_vfexp_v_f32m8_mu(vbool4_t vm, vfloat32m8_t vd, + vfloat32m8_t vs2, size_t vl) { + return __riscv_sf_vfexp_mu(vm, vd, vs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/overloaded/sf_vfexp_v_bf.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/overloaded/sf_vfexp_v_bf.c new file mode 100644 index 0000000000000..14570d465bea8 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/overloaded/sf_vfexp_v_bf.c @@ -0,0 +1,272 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x -target-feature +zve32f \ +// RUN: -target-feature +zvfbfmin -target-feature +xsfvfbfexp16e -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include <sifive_vector.h> + +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x 
bfloat> @test_sf_vfexp_v_bf16mf4_tu( +// CHECK-RV64-SAME: <vscale x 1 x bfloat> [[VD:%.*]], <vscale x 1 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x bfloat> @llvm.riscv.sf.vfexp.nxv1bf16.i64(<vscale x 1 x bfloat> [[VD]], <vscale x 1 x bfloat> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 1 x bfloat> [[TMP0]] +// +vbfloat16mf4_t test_sf_vfexp_v_bf16mf4_tu(vbfloat16mf4_t vd, vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_sf_vfexp_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x bfloat> @test_sf_vfexp_v_bf16mf2_tu( +// CHECK-RV64-SAME: <vscale x 2 x bfloat> [[VD:%.*]], <vscale x 2 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x bfloat> @llvm.riscv.sf.vfexp.nxv2bf16.i64(<vscale x 2 x bfloat> [[VD]], <vscale x 2 x bfloat> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 2 x bfloat> [[TMP0]] +// +vbfloat16mf2_t test_sf_vfexp_v_bf16mf2_tu(vbfloat16mf2_t vd, vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_sf_vfexp_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x bfloat> @test_sf_vfexp_v_bf16m1_tu( +// CHECK-RV64-SAME: <vscale x 4 x bfloat> [[VD:%.*]], <vscale x 4 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x bfloat> @llvm.riscv.sf.vfexp.nxv4bf16.i64(<vscale x 4 x bfloat> [[VD]], <vscale x 4 x bfloat> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 4 x bfloat> [[TMP0]] +// +vbfloat16m1_t test_sf_vfexp_v_bf16m1_tu(vbfloat16m1_t vd, vbfloat16m1_t vs2, + size_t vl) { + return __riscv_sf_vfexp_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x bfloat> @test_sf_vfexp_v_bf16m2_tu( +// CHECK-RV64-SAME: <vscale x 8 x bfloat> [[VD:%.*]], <vscale x 8 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x bfloat> @llvm.riscv.sf.vfexp.nxv8bf16.i64(<vscale x 8 x bfloat> [[VD]], <vscale x 8 x bfloat> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 8 x bfloat> [[TMP0]] +// +vbfloat16m2_t test_sf_vfexp_v_bf16m2_tu(vbfloat16m2_t vd, vbfloat16m2_t vs2, + size_t vl) { + return __riscv_sf_vfexp_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 16 x bfloat> @test_sf_vfexp_v_bf16m4_tu( +// CHECK-RV64-SAME: <vscale x 16 x bfloat> [[VD:%.*]], <vscale x 16 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x bfloat> @llvm.riscv.sf.vfexp.nxv16bf16.i64(<vscale x 16 x bfloat> [[VD]], <vscale x 16 x bfloat> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 16 x bfloat> [[TMP0]] +// +vbfloat16m4_t test_sf_vfexp_v_bf16m4_tu(vbfloat16m4_t vd, vbfloat16m4_t vs2, + size_t vl) { + return __riscv_sf_vfexp_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 32 x bfloat> @test_sf_vfexp_v_bf16m8_tu( +// CHECK-RV64-SAME: <vscale x 32 x bfloat> [[VD:%.*]], <vscale x 32 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 32 x bfloat> @llvm.riscv.sf.vfexp.nxv32bf16.i64(<vscale x 32 x bfloat> [[VD]], <vscale x 32 x bfloat> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 32 x 
bfloat> [[TMP0]] +// +vbfloat16m8_t test_sf_vfexp_v_bf16m8_tu(vbfloat16m8_t vd, vbfloat16m8_t vs2, + size_t vl) { + return __riscv_sf_vfexp_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x bfloat> @test_sf_vfexp_v_bf16mf4_tum( +// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x bfloat> [[VD:%.*]], <vscale x 1 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv1bf16.i64(<vscale x 1 x bfloat> [[VD]], <vscale x 1 x bfloat> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret <vscale x 1 x bfloat> [[TMP0]] +// +vbfloat16mf4_t test_sf_vfexp_v_bf16mf4_tum(vbool64_t vm, vbfloat16mf4_t vd, + vbfloat16mf4_t vs2, size_t vl) { + return __riscv_sf_vfexp_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x bfloat> @test_sf_vfexp_v_bf16mf2_tum( +// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x bfloat> [[VD:%.*]], <vscale x 2 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv2bf16.i64(<vscale x 2 x bfloat> [[VD]], <vscale x 2 x bfloat> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret <vscale x 2 x bfloat> [[TMP0]] +// +vbfloat16mf2_t test_sf_vfexp_v_bf16mf2_tum(vbool32_t vm, vbfloat16mf2_t vd, + vbfloat16mf2_t vs2, size_t vl) { + return __riscv_sf_vfexp_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x bfloat> @test_sf_vfexp_v_bf16m1_tum( +// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x bfloat> [[VD:%.*]], <vscale x 4 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv4bf16.i64(<vscale x 4 x bfloat> [[VD]], <vscale x 4 x bfloat> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret <vscale x 4 x bfloat> [[TMP0]] +// +vbfloat16m1_t test_sf_vfexp_v_bf16m1_tum(vbool16_t vm, vbfloat16m1_t vd, + vbfloat16m1_t vs2, size_t vl) { + return __riscv_sf_vfexp_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x bfloat> @test_sf_vfexp_v_bf16m2_tum( +// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x bfloat> [[VD:%.*]], <vscale x 8 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv8bf16.i64(<vscale x 8 x bfloat> [[VD]], <vscale x 8 x bfloat> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret <vscale x 8 x bfloat> [[TMP0]] +// +vbfloat16m2_t test_sf_vfexp_v_bf16m2_tum(vbool8_t vm, vbfloat16m2_t vd, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_sf_vfexp_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 16 x bfloat> @test_sf_vfexp_v_bf16m4_tum( +// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x bfloat> [[VD:%.*]], <vscale x 16 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv16bf16.i64(<vscale x 16 x bfloat> [[VD]], <vscale x 16 x bfloat> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret <vscale x 16 x 
bfloat> [[TMP0]] +// +vbfloat16m4_t test_sf_vfexp_v_bf16m4_tum(vbool4_t vm, vbfloat16m4_t vd, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_sf_vfexp_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 32 x bfloat> @test_sf_vfexp_v_bf16m8_tum( +// CHECK-RV64-SAME: <vscale x 32 x i1> [[VM:%.*]], <vscale x 32 x bfloat> [[VD:%.*]], <vscale x 32 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 32 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv32bf16.i64(<vscale x 32 x bfloat> [[VD]], <vscale x 32 x bfloat> [[VS2]], <vscale x 32 x i1> [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret <vscale x 32 x bfloat> [[TMP0]] +// +vbfloat16m8_t test_sf_vfexp_v_bf16m8_tum(vbool2_t vm, vbfloat16m8_t vd, + vbfloat16m8_t vs2, size_t vl) { + return __riscv_sf_vfexp_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x bfloat> @test_sf_vfexp_v_bf16mf4_tumu( +// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x bfloat> [[VD:%.*]], <vscale x 1 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv1bf16.i64(<vscale x 1 x bfloat> [[VD]], <vscale x 1 x bfloat> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret <vscale x 1 x bfloat> [[TMP0]] +// +vbfloat16mf4_t test_sf_vfexp_v_bf16mf4_tumu(vbool64_t vm, vbfloat16mf4_t vd, + vbfloat16mf4_t vs2, size_t vl) { + return __riscv_sf_vfexp_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x bfloat> @test_sf_vfexp_v_bf16mf2_tumu( +// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x bfloat> [[VD:%.*]], <vscale x 2 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv2bf16.i64(<vscale x 2 x bfloat> [[VD]], <vscale x 2 x bfloat> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret <vscale x 2 x bfloat> [[TMP0]] +// +vbfloat16mf2_t test_sf_vfexp_v_bf16mf2_tumu(vbool32_t vm, vbfloat16mf2_t vd, + vbfloat16mf2_t vs2, size_t vl) { + return __riscv_sf_vfexp_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x bfloat> @test_sf_vfexp_v_bf16m1_tumu( +// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x bfloat> [[VD:%.*]], <vscale x 4 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv4bf16.i64(<vscale x 4 x bfloat> [[VD]], <vscale x 4 x bfloat> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret <vscale x 4 x bfloat> [[TMP0]] +// +vbfloat16m1_t test_sf_vfexp_v_bf16m1_tumu(vbool16_t vm, vbfloat16m1_t vd, + vbfloat16m1_t vs2, size_t vl) { + return __riscv_sf_vfexp_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x bfloat> @test_sf_vfexp_v_bf16m2_tumu( +// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x bfloat> [[VD:%.*]], <vscale x 8 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv8bf16.i64(<vscale x 8 x bfloat> [[VD]], <vscale x 8 x bfloat> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 0) +// 
CHECK-RV64-NEXT: ret <vscale x 8 x bfloat> [[TMP0]] +// +vbfloat16m2_t test_sf_vfexp_v_bf16m2_tumu(vbool8_t vm, vbfloat16m2_t vd, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_sf_vfexp_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 16 x bfloat> @test_sf_vfexp_v_bf16m4_tumu( +// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x bfloat> [[VD:%.*]], <vscale x 16 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv16bf16.i64(<vscale x 16 x bfloat> [[VD]], <vscale x 16 x bfloat> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret <vscale x 16 x bfloat> [[TMP0]] +// +vbfloat16m4_t test_sf_vfexp_v_bf16m4_tumu(vbool4_t vm, vbfloat16m4_t vd, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_sf_vfexp_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 32 x bfloat> @test_sf_vfexp_v_bf16m8_tumu( +// CHECK-RV64-SAME: <vscale x 32 x i1> [[VM:%.*]], <vscale x 32 x bfloat> [[VD:%.*]], <vscale x 32 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 32 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv32bf16.i64(<vscale x 32 x bfloat> [[VD]], <vscale x 32 x bfloat> [[VS2]], <vscale x 32 x i1> [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret <vscale x 32 x bfloat> [[TMP0]] +// +vbfloat16m8_t test_sf_vfexp_v_bf16m8_tumu(vbool2_t vm, vbfloat16m8_t vd, + vbfloat16m8_t vs2, size_t vl) { + return __riscv_sf_vfexp_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x bfloat> @test_sf_vfexp_v_bf16mf4_mu( +// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x bfloat> [[VD:%.*]], <vscale x 1 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv1bf16.i64(<vscale x 1 x bfloat> [[VD]], <vscale x 1 x bfloat> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret <vscale x 1 x bfloat> [[TMP0]] +// +vbfloat16mf4_t test_sf_vfexp_v_bf16mf4_mu(vbool64_t vm, vbfloat16mf4_t vd, + vbfloat16mf4_t vs2, size_t vl) { + return __riscv_sf_vfexp_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x bfloat> @test_sf_vfexp_v_bf16mf2_mu( +// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x bfloat> [[VD:%.*]], <vscale x 2 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv2bf16.i64(<vscale x 2 x bfloat> [[VD]], <vscale x 2 x bfloat> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret <vscale x 2 x bfloat> [[TMP0]] +// +vbfloat16mf2_t test_sf_vfexp_v_bf16mf2_mu(vbool32_t vm, vbfloat16mf2_t vd, + vbfloat16mf2_t vs2, size_t vl) { + return __riscv_sf_vfexp_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x bfloat> @test_sf_vfexp_v_bf16m1_mu( +// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x bfloat> [[VD:%.*]], <vscale x 4 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv4bf16.i64(<vscale x 4 x bfloat> [[VD]], <vscale x 4 x bfloat> [[VS2]], <vscale x 4 x i1> 
[[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret <vscale x 4 x bfloat> [[TMP0]] +// +vbfloat16m1_t test_sf_vfexp_v_bf16m1_mu(vbool16_t vm, vbfloat16m1_t vd, + vbfloat16m1_t vs2, size_t vl) { + return __riscv_sf_vfexp_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x bfloat> @test_sf_vfexp_v_bf16m2_mu( +// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x bfloat> [[VD:%.*]], <vscale x 8 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv8bf16.i64(<vscale x 8 x bfloat> [[VD]], <vscale x 8 x bfloat> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret <vscale x 8 x bfloat> [[TMP0]] +// +vbfloat16m2_t test_sf_vfexp_v_bf16m2_mu(vbool8_t vm, vbfloat16m2_t vd, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_sf_vfexp_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 16 x bfloat> @test_sf_vfexp_v_bf16m4_mu( +// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x bfloat> [[VD:%.*]], <vscale x 16 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv16bf16.i64(<vscale x 16 x bfloat> [[VD]], <vscale x 16 x bfloat> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret <vscale x 16 x bfloat> [[TMP0]] +// +vbfloat16m4_t test_sf_vfexp_v_bf16m4_mu(vbool4_t vm, vbfloat16m4_t vd, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_sf_vfexp_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 32 x bfloat> @test_sf_vfexp_v_bf16m8_mu( +// CHECK-RV64-SAME: <vscale x 32 x i1> [[VM:%.*]], <vscale x 32 x bfloat> [[VD:%.*]], <vscale x 32 x bfloat> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 32 x bfloat> @llvm.riscv.sf.vfexp.mask.nxv32bf16.i64(<vscale x 32 x bfloat> [[VD]], <vscale x 32 x bfloat> [[VS2]], <vscale x 32 x i1> [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret <vscale x 32 x bfloat> [[TMP0]] +// +vbfloat16m8_t test_sf_vfexp_v_bf16m8_mu(vbool2_t vm, vbfloat16m8_t vd, + vbfloat16m8_t vs2, size_t vl) { + return __riscv_sf_vfexp_mu(vm, vd, vs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/overloaded/sf_vfexpa_v.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/overloaded/sf_vfexpa_v.c new file mode 100644 index 0000000000000..4ac5cfc360551 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/overloaded/sf_vfexpa_v.c @@ -0,0 +1,492 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64f -target-feature +zvfh \ +// RUN: -target-feature +xsfvfexpa -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include <sifive_vector.h> + +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x half> @test_sf_vfexpa_v_f16mf4_tu( +// CHECK-RV64-SAME: <vscale x 1 x half> [[VD:%.*]], <vscale x 1 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x half> @llvm.riscv.sf.vfexpa.nxv1f16.i64(<vscale x 1 x half> [[VD]], <vscale x 1 x half> [[VS2]], i64 
[[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 1 x half> [[TMP0]] +// +vfloat16mf4_t test_sf_vfexpa_v_f16mf4_tu(vfloat16mf4_t vd, vfloat16mf4_t vs2, + size_t vl) { + return __riscv_sf_vfexpa_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x half> @test_sf_vfexpa_v_f16mf2_tu( +// CHECK-RV64-SAME: <vscale x 2 x half> [[VD:%.*]], <vscale x 2 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x half> @llvm.riscv.sf.vfexpa.nxv2f16.i64(<vscale x 2 x half> [[VD]], <vscale x 2 x half> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 2 x half> [[TMP0]] +// +vfloat16mf2_t test_sf_vfexpa_v_f16mf2_tu(vfloat16mf2_t vd, vfloat16mf2_t vs2, + size_t vl) { + return __riscv_sf_vfexpa_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x half> @test_sf_vfexpa_v_f16m1_tu( +// CHECK-RV64-SAME: <vscale x 4 x half> [[VD:%.*]], <vscale x 4 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x half> @llvm.riscv.sf.vfexpa.nxv4f16.i64(<vscale x 4 x half> [[VD]], <vscale x 4 x half> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 4 x half> [[TMP0]] +// +vfloat16m1_t test_sf_vfexpa_v_f16m1_tu(vfloat16m1_t vd, vfloat16m1_t vs2, + size_t vl) { + return __riscv_sf_vfexpa_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x half> @test_sf_vfexpa_v_f16m2_tu( +// CHECK-RV64-SAME: <vscale x 8 x half> [[VD:%.*]], <vscale x 8 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x half> @llvm.riscv.sf.vfexpa.nxv8f16.i64(<vscale x 8 x half> [[VD]], <vscale x 8 x half> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 8 x half> [[TMP0]] +// +vfloat16m2_t test_sf_vfexpa_v_f16m2_tu(vfloat16m2_t vd, vfloat16m2_t vs2, + size_t vl) { + return __riscv_sf_vfexpa_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 16 x half> @test_sf_vfexpa_v_f16m4_tu( +// CHECK-RV64-SAME: <vscale x 16 x half> [[VD:%.*]], <vscale x 16 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x half> @llvm.riscv.sf.vfexpa.nxv16f16.i64(<vscale x 16 x half> [[VD]], <vscale x 16 x half> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 16 x half> [[TMP0]] +// +vfloat16m4_t test_sf_vfexpa_v_f16m4_tu(vfloat16m4_t vd, vfloat16m4_t vs2, + size_t vl) { + return __riscv_sf_vfexpa_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 32 x half> @test_sf_vfexpa_v_f16m8_tu( +// CHECK-RV64-SAME: <vscale x 32 x half> [[VD:%.*]], <vscale x 32 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 32 x half> @llvm.riscv.sf.vfexpa.nxv32f16.i64(<vscale x 32 x half> [[VD]], <vscale x 32 x half> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 32 x half> [[TMP0]] +// +vfloat16m8_t test_sf_vfexpa_v_f16m8_tu(vfloat16m8_t vd, vfloat16m8_t vs2, + size_t vl) { + return __riscv_sf_vfexpa_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_sf_vfexpa_v_f32mf2_tu( +// CHECK-RV64-SAME: <vscale x 1 x float> [[VD:%.*]], <vscale x 1 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x float> 
@llvm.riscv.sf.vfexpa.nxv1f32.i64(<vscale x 1 x float> [[VD]], <vscale x 1 x float> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 1 x float> [[TMP0]] +// +vfloat32mf2_t test_sf_vfexpa_v_f32mf2_tu(vfloat32mf2_t vd, vfloat32mf2_t vs2, + size_t vl) { + return __riscv_sf_vfexpa_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_sf_vfexpa_v_f32m1_tu( +// CHECK-RV64-SAME: <vscale x 2 x float> [[VD:%.*]], <vscale x 2 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.sf.vfexpa.nxv2f32.i64(<vscale x 2 x float> [[VD]], <vscale x 2 x float> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 2 x float> [[TMP0]] +// +vfloat32m1_t test_sf_vfexpa_v_f32m1_tu(vfloat32m1_t vd, vfloat32m1_t vs2, + size_t vl) { + return __riscv_sf_vfexpa_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_sf_vfexpa_v_f32m2_tu( +// CHECK-RV64-SAME: <vscale x 4 x float> [[VD:%.*]], <vscale x 4 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.sf.vfexpa.nxv4f32.i64(<vscale x 4 x float> [[VD]], <vscale x 4 x float> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 4 x float> [[TMP0]] +// +vfloat32m2_t test_sf_vfexpa_v_f32m2_tu(vfloat32m2_t vd, vfloat32m2_t vs2, + size_t vl) { + return __riscv_sf_vfexpa_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_sf_vfexpa_v_f32m4_tu( +// CHECK-RV64-SAME: <vscale x 8 x float> [[VD:%.*]], <vscale x 8 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.sf.vfexpa.nxv8f32.i64(<vscale x 8 x float> [[VD]], <vscale x 8 x float> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 8 x float> [[TMP0]] +// +vfloat32m4_t test_sf_vfexpa_v_f32m4_tu(vfloat32m4_t vd, vfloat32m4_t vs2, + size_t vl) { + return __riscv_sf_vfexpa_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 16 x float> @test_sf_vfexpa_v_f32m8_tu( +// CHECK-RV64-SAME: <vscale x 16 x float> [[VD:%.*]], <vscale x 16 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x float> @llvm.riscv.sf.vfexpa.nxv16f32.i64(<vscale x 16 x float> [[VD]], <vscale x 16 x float> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 16 x float> [[TMP0]] +// +vfloat32m8_t test_sf_vfexpa_v_f32m8_tu(vfloat32m8_t vd, vfloat32m8_t vs2, + size_t vl) { + return __riscv_sf_vfexpa_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x half> @test_sf_vfexpa_v_f16mf4_tum( +// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x half> [[VD:%.*]], <vscale x 1 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x half> @llvm.riscv.sf.vfexpa.mask.nxv1f16.i64(<vscale x 1 x half> [[VD]], <vscale x 1 x half> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret <vscale x 1 x half> [[TMP0]] +// +vfloat16mf4_t test_sf_vfexpa_v_f16mf4_tum(vbool64_t vm, vfloat16mf4_t vd, + vfloat16mf4_t vs2, size_t vl) { + return __riscv_sf_vfexpa_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x half> @test_sf_vfexpa_v_f16mf2_tum( +// CHECK-RV64-SAME: <vscale x 2 
x i1> [[VM:%.*]], <vscale x 2 x half> [[VD:%.*]], <vscale x 2 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x half> @llvm.riscv.sf.vfexpa.mask.nxv2f16.i64(<vscale x 2 x half> [[VD]], <vscale x 2 x half> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret <vscale x 2 x half> [[TMP0]] +// +vfloat16mf2_t test_sf_vfexpa_v_f16mf2_tum(vbool32_t vm, vfloat16mf2_t vd, + vfloat16mf2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x half> @test_sf_vfexpa_v_f16m1_tum( +// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x half> [[VD:%.*]], <vscale x 4 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x half> @llvm.riscv.sf.vfexpa.mask.nxv4f16.i64(<vscale x 4 x half> [[VD]], <vscale x 4 x half> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret <vscale x 4 x half> [[TMP0]] +// +vfloat16m1_t test_sf_vfexpa_v_f16m1_tum(vbool16_t vm, vfloat16m1_t vd, + vfloat16m1_t vs2, size_t vl) { + return __riscv_sf_vfexpa_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x half> @test_sf_vfexpa_v_f16m2_tum( +// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x half> [[VD:%.*]], <vscale x 8 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x half> @llvm.riscv.sf.vfexpa.mask.nxv8f16.i64(<vscale x 8 x half> [[VD]], <vscale x 8 x half> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret <vscale x 8 x half> [[TMP0]] +// +vfloat16m2_t test_sf_vfexpa_v_f16m2_tum(vbool8_t vm, vfloat16m2_t vd, + vfloat16m2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 16 x half> @test_sf_vfexpa_v_f16m4_tum( +// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x half> [[VD:%.*]], <vscale x 16 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x half> @llvm.riscv.sf.vfexpa.mask.nxv16f16.i64(<vscale x 16 x half> [[VD]], <vscale x 16 x half> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret <vscale x 16 x half> [[TMP0]] +// +vfloat16m4_t test_sf_vfexpa_v_f16m4_tum(vbool4_t vm, vfloat16m4_t vd, + vfloat16m4_t vs2, size_t vl) { + return __riscv_sf_vfexpa_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 32 x half> @test_sf_vfexpa_v_f16m8_tum( +// CHECK-RV64-SAME: <vscale x 32 x i1> [[VM:%.*]], <vscale x 32 x half> [[VD:%.*]], <vscale x 32 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 32 x half> @llvm.riscv.sf.vfexpa.mask.nxv32f16.i64(<vscale x 32 x half> [[VD]], <vscale x 32 x half> [[VS2]], <vscale x 32 x i1> [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret <vscale x 32 x half> [[TMP0]] +// +vfloat16m8_t test_sf_vfexpa_v_f16m8_tum(vbool2_t vm, vfloat16m8_t vd, + vfloat16m8_t vs2, size_t vl) { + return __riscv_sf_vfexpa_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_sf_vfexpa_v_f32mf2_tum( +// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x float> [[VD:%.*]], <vscale x 1 x float> [[VS2:%.*]], i64 
noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.sf.vfexpa.mask.nxv1f32.i64(<vscale x 1 x float> [[VD]], <vscale x 1 x float> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret <vscale x 1 x float> [[TMP0]] +// +vfloat32mf2_t test_sf_vfexpa_v_f32mf2_tum(vbool64_t vm, vfloat32mf2_t vd, + vfloat32mf2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_sf_vfexpa_v_f32m1_tum( +// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x float> [[VD:%.*]], <vscale x 2 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.sf.vfexpa.mask.nxv2f32.i64(<vscale x 2 x float> [[VD]], <vscale x 2 x float> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret <vscale x 2 x float> [[TMP0]] +// +vfloat32m1_t test_sf_vfexpa_v_f32m1_tum(vbool32_t vm, vfloat32m1_t vd, + vfloat32m1_t vs2, size_t vl) { + return __riscv_sf_vfexpa_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_sf_vfexpa_v_f32m2_tum( +// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x float> [[VD:%.*]], <vscale x 4 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.sf.vfexpa.mask.nxv4f32.i64(<vscale x 4 x float> [[VD]], <vscale x 4 x float> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret <vscale x 4 x float> [[TMP0]] +// +vfloat32m2_t test_sf_vfexpa_v_f32m2_tum(vbool16_t vm, vfloat32m2_t vd, + vfloat32m2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_sf_vfexpa_v_f32m4_tum( +// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x float> [[VD:%.*]], <vscale x 8 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.sf.vfexpa.mask.nxv8f32.i64(<vscale x 8 x float> [[VD]], <vscale x 8 x float> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret <vscale x 8 x float> [[TMP0]] +// +vfloat32m4_t test_sf_vfexpa_v_f32m4_tum(vbool8_t vm, vfloat32m4_t vd, + vfloat32m4_t vs2, size_t vl) { + return __riscv_sf_vfexpa_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 16 x float> @test_sf_vfexpa_v_f32m8_tum( +// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x float> [[VD:%.*]], <vscale x 16 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x float> @llvm.riscv.sf.vfexpa.mask.nxv16f32.i64(<vscale x 16 x float> [[VD]], <vscale x 16 x float> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret <vscale x 16 x float> [[TMP0]] +// +vfloat32m8_t test_sf_vfexpa_v_f32m8_tum(vbool4_t vm, vfloat32m8_t vd, + vfloat32m8_t vs2, size_t vl) { + return __riscv_sf_vfexpa_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x half> @test_sf_vfexpa_v_f16mf4_tumu( +// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x half> [[VD:%.*]], <vscale x 1 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// 
CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x half> @llvm.riscv.sf.vfexpa.mask.nxv1f16.i64(<vscale x 1 x half> [[VD]], <vscale x 1 x half> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret <vscale x 1 x half> [[TMP0]] +// +vfloat16mf4_t test_sf_vfexpa_v_f16mf4_tumu(vbool64_t vm, vfloat16mf4_t vd, + vfloat16mf4_t vs2, size_t vl) { + return __riscv_sf_vfexpa_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x half> @test_sf_vfexpa_v_f16mf2_tumu( +// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x half> [[VD:%.*]], <vscale x 2 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x half> @llvm.riscv.sf.vfexpa.mask.nxv2f16.i64(<vscale x 2 x half> [[VD]], <vscale x 2 x half> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret <vscale x 2 x half> [[TMP0]] +// +vfloat16mf2_t test_sf_vfexpa_v_f16mf2_tumu(vbool32_t vm, vfloat16mf2_t vd, + vfloat16mf2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x half> @test_sf_vfexpa_v_f16m1_tumu( +// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x half> [[VD:%.*]], <vscale x 4 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x half> @llvm.riscv.sf.vfexpa.mask.nxv4f16.i64(<vscale x 4 x half> [[VD]], <vscale x 4 x half> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret <vscale x 4 x half> [[TMP0]] +// +vfloat16m1_t test_sf_vfexpa_v_f16m1_tumu(vbool16_t vm, vfloat16m1_t vd, + vfloat16m1_t vs2, size_t vl) { + return __riscv_sf_vfexpa_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x half> @test_sf_vfexpa_v_f16m2_tumu( +// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x half> [[VD:%.*]], <vscale x 8 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x half> @llvm.riscv.sf.vfexpa.mask.nxv8f16.i64(<vscale x 8 x half> [[VD]], <vscale x 8 x half> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret <vscale x 8 x half> [[TMP0]] +// +vfloat16m2_t test_sf_vfexpa_v_f16m2_tumu(vbool8_t vm, vfloat16m2_t vd, + vfloat16m2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 16 x half> @test_sf_vfexpa_v_f16m4_tumu( +// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x half> [[VD:%.*]], <vscale x 16 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x half> @llvm.riscv.sf.vfexpa.mask.nxv16f16.i64(<vscale x 16 x half> [[VD]], <vscale x 16 x half> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret <vscale x 16 x half> [[TMP0]] +// +vfloat16m4_t test_sf_vfexpa_v_f16m4_tumu(vbool4_t vm, vfloat16m4_t vd, + vfloat16m4_t vs2, size_t vl) { + return __riscv_sf_vfexpa_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 32 x half> @test_sf_vfexpa_v_f16m8_tumu( +// CHECK-RV64-SAME: <vscale x 32 x i1> [[VM:%.*]], <vscale x 32 x half> [[VD:%.*]], <vscale x 32 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 32 x half> 
@llvm.riscv.sf.vfexpa.mask.nxv32f16.i64(<vscale x 32 x half> [[VD]], <vscale x 32 x half> [[VS2]], <vscale x 32 x i1> [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret <vscale x 32 x half> [[TMP0]] +// +vfloat16m8_t test_sf_vfexpa_v_f16m8_tumu(vbool2_t vm, vfloat16m8_t vd, + vfloat16m8_t vs2, size_t vl) { + return __riscv_sf_vfexpa_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_sf_vfexpa_v_f32mf2_tumu( +// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x float> [[VD:%.*]], <vscale x 1 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.sf.vfexpa.mask.nxv1f32.i64(<vscale x 1 x float> [[VD]], <vscale x 1 x float> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret <vscale x 1 x float> [[TMP0]] +// +vfloat32mf2_t test_sf_vfexpa_v_f32mf2_tumu(vbool64_t vm, vfloat32mf2_t vd, + vfloat32mf2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_sf_vfexpa_v_f32m1_tumu( +// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x float> [[VD:%.*]], <vscale x 2 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.sf.vfexpa.mask.nxv2f32.i64(<vscale x 2 x float> [[VD]], <vscale x 2 x float> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret <vscale x 2 x float> [[TMP0]] +// +vfloat32m1_t test_sf_vfexpa_v_f32m1_tumu(vbool32_t vm, vfloat32m1_t vd, + vfloat32m1_t vs2, size_t vl) { + return __riscv_sf_vfexpa_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_sf_vfexpa_v_f32m2_tumu( +// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x float> [[VD:%.*]], <vscale x 4 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.sf.vfexpa.mask.nxv4f32.i64(<vscale x 4 x float> [[VD]], <vscale x 4 x float> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret <vscale x 4 x float> [[TMP0]] +// +vfloat32m2_t test_sf_vfexpa_v_f32m2_tumu(vbool16_t vm, vfloat32m2_t vd, + vfloat32m2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_sf_vfexpa_v_f32m4_tumu( +// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x float> [[VD:%.*]], <vscale x 8 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.sf.vfexpa.mask.nxv8f32.i64(<vscale x 8 x float> [[VD]], <vscale x 8 x float> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret <vscale x 8 x float> [[TMP0]] +// +vfloat32m4_t test_sf_vfexpa_v_f32m4_tumu(vbool8_t vm, vfloat32m4_t vd, + vfloat32m4_t vs2, size_t vl) { + return __riscv_sf_vfexpa_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 16 x float> @test_sf_vfexpa_v_f32m8_tumu( +// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x float> [[VD:%.*]], <vscale x 16 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x float> @llvm.riscv.sf.vfexpa.mask.nxv16f32.i64(<vscale x 16 x 
float> [[VD]], <vscale x 16 x float> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret <vscale x 16 x float> [[TMP0]] +// +vfloat32m8_t test_sf_vfexpa_v_f32m8_tumu(vbool4_t vm, vfloat32m8_t vd, + vfloat32m8_t vs2, size_t vl) { + return __riscv_sf_vfexpa_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x half> @test_sf_vfexpa_v_f16mf4_mu( +// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x half> [[VD:%.*]], <vscale x 1 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x half> @llvm.riscv.sf.vfexpa.mask.nxv1f16.i64(<vscale x 1 x half> [[VD]], <vscale x 1 x half> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret <vscale x 1 x half> [[TMP0]] +// +vfloat16mf4_t test_sf_vfexpa_v_f16mf4_mu(vbool64_t vm, vfloat16mf4_t vd, + vfloat16mf4_t vs2, size_t vl) { + return __riscv_sf_vfexpa_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x half> @test_sf_vfexpa_v_f16mf2_mu( +// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x half> [[VD:%.*]], <vscale x 2 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x half> @llvm.riscv.sf.vfexpa.mask.nxv2f16.i64(<vscale x 2 x half> [[VD]], <vscale x 2 x half> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret <vscale x 2 x half> [[TMP0]] +// +vfloat16mf2_t test_sf_vfexpa_v_f16mf2_mu(vbool32_t vm, vfloat16mf2_t vd, + vfloat16mf2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x half> @test_sf_vfexpa_v_f16m1_mu( +// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x half> [[VD:%.*]], <vscale x 4 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x half> @llvm.riscv.sf.vfexpa.mask.nxv4f16.i64(<vscale x 4 x half> [[VD]], <vscale x 4 x half> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret <vscale x 4 x half> [[TMP0]] +// +vfloat16m1_t test_sf_vfexpa_v_f16m1_mu(vbool16_t vm, vfloat16m1_t vd, + vfloat16m1_t vs2, size_t vl) { + return __riscv_sf_vfexpa_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x half> @test_sf_vfexpa_v_f16m2_mu( +// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x half> [[VD:%.*]], <vscale x 8 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x half> @llvm.riscv.sf.vfexpa.mask.nxv8f16.i64(<vscale x 8 x half> [[VD]], <vscale x 8 x half> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret <vscale x 8 x half> [[TMP0]] +// +vfloat16m2_t test_sf_vfexpa_v_f16m2_mu(vbool8_t vm, vfloat16m2_t vd, + vfloat16m2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 16 x half> @test_sf_vfexpa_v_f16m4_mu( +// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x half> [[VD:%.*]], <vscale x 16 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x half> @llvm.riscv.sf.vfexpa.mask.nxv16f16.i64(<vscale x 16 x half> [[VD]], <vscale x 16 x half> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 1) +// 
CHECK-RV64-NEXT: ret <vscale x 16 x half> [[TMP0]] +// +vfloat16m4_t test_sf_vfexpa_v_f16m4_mu(vbool4_t vm, vfloat16m4_t vd, + vfloat16m4_t vs2, size_t vl) { + return __riscv_sf_vfexpa_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 32 x half> @test_sf_vfexpa_v_f16m8_mu( +// CHECK-RV64-SAME: <vscale x 32 x i1> [[VM:%.*]], <vscale x 32 x half> [[VD:%.*]], <vscale x 32 x half> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 32 x half> @llvm.riscv.sf.vfexpa.mask.nxv32f16.i64(<vscale x 32 x half> [[VD]], <vscale x 32 x half> [[VS2]], <vscale x 32 x i1> [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret <vscale x 32 x half> [[TMP0]] +// +vfloat16m8_t test_sf_vfexpa_v_f16m8_mu(vbool2_t vm, vfloat16m8_t vd, + vfloat16m8_t vs2, size_t vl) { + return __riscv_sf_vfexpa_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x float> @test_sf_vfexpa_v_f32mf2_mu( +// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x float> [[VD:%.*]], <vscale x 1 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x float> @llvm.riscv.sf.vfexpa.mask.nxv1f32.i64(<vscale x 1 x float> [[VD]], <vscale x 1 x float> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret <vscale x 1 x float> [[TMP0]] +// +vfloat32mf2_t test_sf_vfexpa_v_f32mf2_mu(vbool64_t vm, vfloat32mf2_t vd, + vfloat32mf2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x float> @test_sf_vfexpa_v_f32m1_mu( +// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x float> [[VD:%.*]], <vscale x 2 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x float> @llvm.riscv.sf.vfexpa.mask.nxv2f32.i64(<vscale x 2 x float> [[VD]], <vscale x 2 x float> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret <vscale x 2 x float> [[TMP0]] +// +vfloat32m1_t test_sf_vfexpa_v_f32m1_mu(vbool32_t vm, vfloat32m1_t vd, + vfloat32m1_t vs2, size_t vl) { + return __riscv_sf_vfexpa_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x float> @test_sf_vfexpa_v_f32m2_mu( +// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x float> [[VD:%.*]], <vscale x 4 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x float> @llvm.riscv.sf.vfexpa.mask.nxv4f32.i64(<vscale x 4 x float> [[VD]], <vscale x 4 x float> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret <vscale x 4 x float> [[TMP0]] +// +vfloat32m2_t test_sf_vfexpa_v_f32m2_mu(vbool16_t vm, vfloat32m2_t vd, + vfloat32m2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x float> @test_sf_vfexpa_v_f32m4_mu( +// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x float> [[VD:%.*]], <vscale x 8 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.riscv.sf.vfexpa.mask.nxv8f32.i64(<vscale x 8 x float> [[VD]], <vscale x 8 x float> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret <vscale x 8 x float> [[TMP0]] +// +vfloat32m4_t 
test_sf_vfexpa_v_f32m4_mu(vbool8_t vm, vfloat32m4_t vd, + vfloat32m4_t vs2, size_t vl) { + return __riscv_sf_vfexpa_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 16 x float> @test_sf_vfexpa_v_f32m8_mu( +// CHECK-RV64-SAME: <vscale x 16 x i1> [[VM:%.*]], <vscale x 16 x float> [[VD:%.*]], <vscale x 16 x float> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x float> @llvm.riscv.sf.vfexpa.mask.nxv16f32.i64(<vscale x 16 x float> [[VD]], <vscale x 16 x float> [[VS2]], <vscale x 16 x i1> [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret <vscale x 16 x float> [[TMP0]] +// +vfloat32m8_t test_sf_vfexpa_v_f32m8_mu(vbool4_t vm, vfloat32m8_t vd, + vfloat32m8_t vs2, size_t vl) { + return __riscv_sf_vfexpa_mu(vm, vd, vs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/overloaded/sf_vfexpa_v_64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/overloaded/sf_vfexpa_v_64.c new file mode 100644 index 0000000000000..d0faaee571122 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/overloaded/sf_vfexpa_v_64.c @@ -0,0 +1,183 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +xsfvfexpa64e \ +// RUN: -disable-O0-optnone -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include <sifive_vector.h> + +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x double> @test_sf_vfexpa_v_f64m1_tu( +// CHECK-RV64-SAME: <vscale x 1 x double> [[VD:%.*]], <vscale x 1 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x double> @llvm.riscv.sf.vfexpa.nxv1f64.i64(<vscale x 1 x double> [[VD]], <vscale x 1 x double> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 1 x double> [[TMP0]] +// +vfloat64m1_t test_sf_vfexpa_v_f64m1_tu(vfloat64m1_t vd, vfloat64m1_t vs2, + size_t vl) { + return __riscv_sf_vfexpa_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x double> @test_sf_vfexpa_v_f64m2_tu( +// CHECK-RV64-SAME: <vscale x 2 x double> [[VD:%.*]], <vscale x 2 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x double> @llvm.riscv.sf.vfexpa.nxv2f64.i64(<vscale x 2 x double> [[VD]], <vscale x 2 x double> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 2 x double> [[TMP0]] +// +vfloat64m2_t test_sf_vfexpa_v_f64m2_tu(vfloat64m2_t vd, vfloat64m2_t vs2, + size_t vl) { + return __riscv_sf_vfexpa_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x double> @test_sf_vfexpa_v_f64m4_tu( +// CHECK-RV64-SAME: <vscale x 4 x double> [[VD:%.*]], <vscale x 4 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x double> @llvm.riscv.sf.vfexpa.nxv4f64.i64(<vscale x 4 x double> [[VD]], <vscale x 4 x double> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 4 x double> [[TMP0]] +// +vfloat64m4_t test_sf_vfexpa_v_f64m4_tu(vfloat64m4_t vd, vfloat64m4_t vs2, + size_t vl) { + return __riscv_sf_vfexpa_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x double> @test_sf_vfexpa_v_f64m8_tu( +// CHECK-RV64-SAME: <vscale x 8 x double> [[VD:%.*]], 
<vscale x 8 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x double> @llvm.riscv.sf.vfexpa.nxv8f64.i64(<vscale x 8 x double> [[VD]], <vscale x 8 x double> [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret <vscale x 8 x double> [[TMP0]] +// +vfloat64m8_t test_sf_vfexpa_v_f64m8_tu(vfloat64m8_t vd, vfloat64m8_t vs2, + size_t vl) { + return __riscv_sf_vfexpa_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x double> @test_sf_vfexpa_v_f64m1_tum( +// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x double> [[VD:%.*]], <vscale x 1 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x double> @llvm.riscv.sf.vfexpa.mask.nxv1f64.i64(<vscale x 1 x double> [[VD]], <vscale x 1 x double> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret <vscale x 1 x double> [[TMP0]] +// +vfloat64m1_t test_sf_vfexpa_v_f64m1_tum(vbool64_t vm, vfloat64m1_t vd, + vfloat64m1_t vs2, size_t vl) { + return __riscv_sf_vfexpa_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x double> @test_sf_vfexpa_v_f64m2_tum( +// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x double> [[VD:%.*]], <vscale x 2 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x double> @llvm.riscv.sf.vfexpa.mask.nxv2f64.i64(<vscale x 2 x double> [[VD]], <vscale x 2 x double> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret <vscale x 2 x double> [[TMP0]] +// +vfloat64m2_t test_sf_vfexpa_v_f64m2_tum(vbool32_t vm, vfloat64m2_t vd, + vfloat64m2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x double> @test_sf_vfexpa_v_f64m4_tum( +// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x double> [[VD:%.*]], <vscale x 4 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x double> @llvm.riscv.sf.vfexpa.mask.nxv4f64.i64(<vscale x 4 x double> [[VD]], <vscale x 4 x double> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret <vscale x 4 x double> [[TMP0]] +// +vfloat64m4_t test_sf_vfexpa_v_f64m4_tum(vbool16_t vm, vfloat64m4_t vd, + vfloat64m4_t vs2, size_t vl) { + return __riscv_sf_vfexpa_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x double> @test_sf_vfexpa_v_f64m8_tum( +// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x double> [[VD:%.*]], <vscale x 8 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x double> @llvm.riscv.sf.vfexpa.mask.nxv8f64.i64(<vscale x 8 x double> [[VD]], <vscale x 8 x double> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret <vscale x 8 x double> [[TMP0]] +// +vfloat64m8_t test_sf_vfexpa_v_f64m8_tum(vbool8_t vm, vfloat64m8_t vd, + vfloat64m8_t vs2, size_t vl) { + return __riscv_sf_vfexpa_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x double> @test_sf_vfexpa_v_f64m1_tumu( +// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x double> [[VD:%.*]], <vscale x 1 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: 
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x double> @llvm.riscv.sf.vfexpa.mask.nxv1f64.i64(<vscale x 1 x double> [[VD]], <vscale x 1 x double> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret <vscale x 1 x double> [[TMP0]] +// +vfloat64m1_t test_sf_vfexpa_v_f64m1_tumu(vbool64_t vm, vfloat64m1_t vd, + vfloat64m1_t vs2, size_t vl) { + return __riscv_sf_vfexpa_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x double> @test_sf_vfexpa_v_f64m2_tumu( +// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x double> [[VD:%.*]], <vscale x 2 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x double> @llvm.riscv.sf.vfexpa.mask.nxv2f64.i64(<vscale x 2 x double> [[VD]], <vscale x 2 x double> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret <vscale x 2 x double> [[TMP0]] +// +vfloat64m2_t test_sf_vfexpa_v_f64m2_tumu(vbool32_t vm, vfloat64m2_t vd, + vfloat64m2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x double> @test_sf_vfexpa_v_f64m4_tumu( +// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x double> [[VD:%.*]], <vscale x 4 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x double> @llvm.riscv.sf.vfexpa.mask.nxv4f64.i64(<vscale x 4 x double> [[VD]], <vscale x 4 x double> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret <vscale x 4 x double> [[TMP0]] +// +vfloat64m4_t test_sf_vfexpa_v_f64m4_tumu(vbool16_t vm, vfloat64m4_t vd, + vfloat64m4_t vs2, size_t vl) { + return __riscv_sf_vfexpa_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x double> @test_sf_vfexpa_v_f64m8_tumu( +// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x double> [[VD:%.*]], <vscale x 8 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x double> @llvm.riscv.sf.vfexpa.mask.nxv8f64.i64(<vscale x 8 x double> [[VD]], <vscale x 8 x double> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret <vscale x 8 x double> [[TMP0]] +// +vfloat64m8_t test_sf_vfexpa_v_f64m8_tumu(vbool8_t vm, vfloat64m8_t vd, + vfloat64m8_t vs2, size_t vl) { + return __riscv_sf_vfexpa_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 1 x double> @test_sf_vfexpa_v_f64m1_mu( +// CHECK-RV64-SAME: <vscale x 1 x i1> [[VM:%.*]], <vscale x 1 x double> [[VD:%.*]], <vscale x 1 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 1 x double> @llvm.riscv.sf.vfexpa.mask.nxv1f64.i64(<vscale x 1 x double> [[VD]], <vscale x 1 x double> [[VS2]], <vscale x 1 x i1> [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret <vscale x 1 x double> [[TMP0]] +// +vfloat64m1_t test_sf_vfexpa_v_f64m1_mu(vbool64_t vm, vfloat64m1_t vd, + vfloat64m1_t vs2, size_t vl) { + return __riscv_sf_vfexpa_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 2 x double> @test_sf_vfexpa_v_f64m2_mu( +// CHECK-RV64-SAME: <vscale x 2 x i1> [[VM:%.*]], <vscale x 2 x double> [[VD:%.*]], <vscale x 2 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = 
call <vscale x 2 x double> @llvm.riscv.sf.vfexpa.mask.nxv2f64.i64(<vscale x 2 x double> [[VD]], <vscale x 2 x double> [[VS2]], <vscale x 2 x i1> [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret <vscale x 2 x double> [[TMP0]] +// +vfloat64m2_t test_sf_vfexpa_v_f64m2_mu(vbool32_t vm, vfloat64m2_t vd, + vfloat64m2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 4 x double> @test_sf_vfexpa_v_f64m4_mu( +// CHECK-RV64-SAME: <vscale x 4 x i1> [[VM:%.*]], <vscale x 4 x double> [[VD:%.*]], <vscale x 4 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x double> @llvm.riscv.sf.vfexpa.mask.nxv4f64.i64(<vscale x 4 x double> [[VD]], <vscale x 4 x double> [[VS2]], <vscale x 4 x i1> [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret <vscale x 4 x double> [[TMP0]] +// +vfloat64m4_t test_sf_vfexpa_v_f64m4_mu(vbool16_t vm, vfloat64m4_t vd, + vfloat64m4_t vs2, size_t vl) { + return __riscv_sf_vfexpa_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local <vscale x 8 x double> @test_sf_vfexpa_v_f64m8_mu( +// CHECK-RV64-SAME: <vscale x 8 x i1> [[VM:%.*]], <vscale x 8 x double> [[VD:%.*]], <vscale x 8 x double> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x double> @llvm.riscv.sf.vfexpa.mask.nxv8f64.i64(<vscale x 8 x double> [[VD]], <vscale x 8 x double> [[VS2]], <vscale x 8 x i1> [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret <vscale x 8 x double> [[TMP0]] +// +vfloat64m8_t test_sf_vfexpa_v_f64m8_mu(vbool8_t vm, vfloat64m8_t vd, + vfloat64m8_t vs2, size_t vl) { + return __riscv_sf_vfexpa_mu(vm, vd, vs2, vl); +} From 5f0169939165c308f26585211860153aa8f0cef3 Mon Sep 17 00:00:00 2001 From: Mend Renovate <bot@renovateapp.com> Date: Mon, 3 Nov 2025 17:05:45 +0000 Subject: [PATCH 038/313] Update [Github] Update GHA Dependencies (#166111) This PR contains the following updates: | Package | Type | Update | Change | |---|---|---|---| | [EnricoMi/publish-unit-test-result-action](https://redirect.github.com/EnricoMi/publish-unit-test-result-action) | action | minor | `v2.20.0` -> `v2.21.0` | | [actions/upload-artifact](https://redirect.github.com/actions/upload-artifact) | action | patch | `v4.6.0` -> `v4.6.2` | | [docker/login-action](https://redirect.github.com/docker/login-action) | action | minor | `v3.5.0` -> `v3.6.0` | | [github/codeql-action](https://redirect.github.com/github/codeql-action) | action | minor | `v3.30.4` -> `v3.31.2` | | llvm/actions | action | digest | `a1ea791` -> `42d8057` | | [ossf/scorecard-action](https://redirect.github.com/ossf/scorecard-action) | action | patch | `v2.4.2` -> `v2.4.3` | | [python](https://redirect.github.com/actions/python-versions) | uses-with | minor | `3.13` -> `3.14` | --- .github/workflows/build-ci-container-tooling.yml | 2 +- .github/workflows/check-ci.yml | 2 +- .github/workflows/docs.yml | 2 +- .github/workflows/gha-codeql.yml | 4 ++-- .github/workflows/hlsl-test-all.yaml | 2 +- .github/workflows/libcxx-build-containers.yml | 2 +- .github/workflows/libcxx-run-benchmarks.yml | 2 +- .github/workflows/release-binaries.yml | 4 ++-- .github/workflows/scorecard.yml | 2 +- 9 files changed, 11 insertions(+), 11 deletions(-) diff --git a/.github/workflows/build-ci-container-tooling.yml b/.github/workflows/build-ci-container-tooling.yml index c77c78617666d..992947eb2fffb 100644 --- 
a/.github/workflows/build-ci-container-tooling.yml +++ b/.github/workflows/build-ci-container-tooling.yml @@ -63,7 +63,7 @@ jobs: podman save ${{ steps.vars.outputs.container-name-lint-tag }} > ${{ steps.vars.outputs.container-lint-filename }} - name: Upload container image - uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08 # v4.6.0 + uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 with: name: container-amd64 path: "*.tar" diff --git a/.github/workflows/check-ci.yml b/.github/workflows/check-ci.yml index f18a69c192ee9..6ecad5536109b 100644 --- a/.github/workflows/check-ci.yml +++ b/.github/workflows/check-ci.yml @@ -28,7 +28,7 @@ jobs: - name: Setup Python uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 with: - python-version: 3.13 + python-version: 3.14 cache: 'pip' - name: Install Python Dependencies run: | diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index b5f3413fe3b6b..7374777cb759c 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -97,7 +97,7 @@ jobs: - name: Setup Python env uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 with: - python-version: '3.13' + python-version: '3.14' cache: 'pip' cache-dependency-path: 'llvm/docs/requirements-hashed.txt' - name: Install python dependencies diff --git a/.github/workflows/gha-codeql.yml b/.github/workflows/gha-codeql.yml index 63388ebc706bd..6d490ca2c4b29 100644 --- a/.github/workflows/gha-codeql.yml +++ b/.github/workflows/gha-codeql.yml @@ -29,9 +29,9 @@ jobs: sparse-checkout: | .github/ - name: Initialize CodeQL - uses: github/codeql-action/init@303c0aef88fc2fe5ff6d63d3b1596bfd83dfa1f9 # v3.30.4 + uses: github/codeql-action/init@5d5cd550d3e189c569da8f16ea8de2d821c9bf7a # v3.31.2 with: languages: actions queries: security-extended - name: Perform CodeQL Analysis - uses: github/codeql-action/analyze@303c0aef88fc2fe5ff6d63d3b1596bfd83dfa1f9 # v3.30.4 + uses: github/codeql-action/analyze@5d5cd550d3e189c569da8f16ea8de2d821c9bf7a # v3.31.2 diff --git a/.github/workflows/hlsl-test-all.yaml b/.github/workflows/hlsl-test-all.yaml index dcb852312d41a..cdc951658b4d2 100644 --- a/.github/workflows/hlsl-test-all.yaml +++ b/.github/workflows/hlsl-test-all.yaml @@ -80,7 +80,7 @@ jobs: ninja check-hlsl-unit ninja ${{ inputs.TestTarget }} - name: Publish Test Results - uses: EnricoMi/publish-unit-test-result-action/macos@3a74b2957438d0b6e2e61d67b05318aa25c9e6c6 # v2.20.0 + uses: EnricoMi/publish-unit-test-result-action/macos@34d7c956a59aed1bfebf31df77b8de55db9bbaaf # v2.21.0 if: always() && runner.os == 'macOS' with: comment_mode: off diff --git a/.github/workflows/libcxx-build-containers.yml b/.github/workflows/libcxx-build-containers.yml index 312cb47fc3d93..4bce86145fc0c 100644 --- a/.github/workflows/libcxx-build-containers.yml +++ b/.github/workflows/libcxx-build-containers.yml @@ -55,7 +55,7 @@ jobs: TAG: ${{ github.sha }} - name: Log in to GitHub Container Registry - uses: docker/login-action@184bdaa0721073962dff0199f1fb9940f07167d1 # v3.5.0 + uses: docker/login-action@5e57cd118135c172c3672efd75eb46360885c0ef # v3.6.0 with: registry: ghcr.io username: ${{ github.actor }} diff --git a/.github/workflows/libcxx-run-benchmarks.yml b/.github/workflows/libcxx-run-benchmarks.yml index 9e8f55859fc7a..e2ca940d2f0b3 100644 --- a/.github/workflows/libcxx-run-benchmarks.yml +++ b/.github/workflows/libcxx-run-benchmarks.yml @@ -35,7 +35,7 @@ jobs: steps: - uses: 
actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0 with: - python-version: '3.13' + python-version: '3.14' - name: Extract information from the PR id: vars diff --git a/.github/workflows/release-binaries.yml b/.github/workflows/release-binaries.yml index fa73b9d9fe8d0..d1a017ab7b553 100644 --- a/.github/workflows/release-binaries.yml +++ b/.github/workflows/release-binaries.yml @@ -68,7 +68,7 @@ jobs: # due to https://github.com/actions/runner-images/issues/10385 - uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0 with: - python-version: '3.13' + python-version: '3.14' - name: Checkout LLVM uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 @@ -173,7 +173,7 @@ jobs: ref: ${{ needs.prepare.outputs.ref }} - name: Install Ninja - uses: llvm/actions/install-ninja@a1ea791b03c8e61f53a0e66f2f73db283aa0f01e # main + uses: llvm/actions/install-ninja@42d80571b13f4599bbefbc7189728b64723c7f78 # main - name: Setup Windows if: startsWith(runner.os, 'Windows') diff --git a/.github/workflows/scorecard.yml b/.github/workflows/scorecard.yml index c07df338cf989..bd3277a8b452c 100644 --- a/.github/workflows/scorecard.yml +++ b/.github/workflows/scorecard.yml @@ -36,7 +36,7 @@ jobs: persist-credentials: false - name: "Run analysis" - uses: ossf/scorecard-action@05b42c624433fc40578a4040d5cf5e36ddca8cde # v2.4.2 + uses: ossf/scorecard-action@4eaacf0543bb3f2c246792bd56e8cdeffafb205a # v2.4.3 with: results_file: results.sarif results_format: sarif From 97660c109470aeab78d9602328b7dc7dbb0f9091 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko <maks@fb.com> Date: Mon, 3 Nov 2025 09:19:33 -0800 Subject: [PATCH 039/313] [BOLT] Issue error on unclaimed PC-relative relocation (#166098) Replace assert with an error and improve the report when unclaimed PC-relative relocation is left in strict mode. --- bolt/lib/Core/BinaryContext.cpp | 18 +++++++++++------- bolt/test/X86/unclaimed-pc-rel.s | 24 ++++++++++++++++++++++++ 2 files changed, 35 insertions(+), 7 deletions(-) create mode 100644 bolt/test/X86/unclaimed-pc-rel.s diff --git a/bolt/lib/Core/BinaryContext.cpp b/bolt/lib/Core/BinaryContext.cpp index a383ced1712e3..c7cd034a30410 100644 --- a/bolt/lib/Core/BinaryContext.cpp +++ b/bolt/lib/Core/BinaryContext.cpp @@ -778,13 +778,17 @@ void BinaryContext::populateJumpTables() { } if (opts::StrictMode && DataPCRelocations.size()) { - LLVM_DEBUG({ - dbgs() << DataPCRelocations.size() - << " unclaimed PC-relative relocations left in data:\n"; - for (uint64_t Reloc : DataPCRelocations) - dbgs() << Twine::utohexstr(Reloc) << '\n'; - }); - assert(0 && "unclaimed PC-relative relocations left in data\n"); + this->errs() << "BOLT-ERROR: " << DataPCRelocations.size() + << " unclaimed PC-relative relocation(s) left in data"; + if (opts::Verbosity) { + this->errs() << ":\n"; + for (uint64_t RelocOffset : DataPCRelocations) + this->errs() << " @0x" << Twine::utohexstr(RelocOffset) << '\n'; + } else { + this->errs() << ". Re-run with -v=1 to see the list\n"; + } + this->errs() << "BOLT-ERROR: unable to proceed with --strict\n"; + exit(1); } clearList(DataPCRelocations); } diff --git a/bolt/test/X86/unclaimed-pc-rel.s b/bolt/test/X86/unclaimed-pc-rel.s new file mode 100644 index 0000000000000..5292cccba754d --- /dev/null +++ b/bolt/test/X86/unclaimed-pc-rel.s @@ -0,0 +1,24 @@ +## Check that unclaimed PC-relative relocation from data to code is detected +## and reported to the user. 
+ +# REQUIRES: system-linux + +# RUN: %clang %cflags -no-pie %s -o %t.exe -Wl,-q -nostartfiles +# RUN: not llvm-bolt %t.exe -o %t.bolt --strict 2>&1 | FileCheck %s + +# CHECK: BOLT-ERROR: 1 unclaimed PC-relative relocation(s) left in data + + .text + .globl _start + .type _start, %function +_start: + movl $42, %eax +.L0: + ret + .size _start, .-_start + +## Force relocation mode. + .reloc 0, R_X86_64_NONE + + .section .rodata + .long .L0-. From a85ecfa7a8ca20bd7710ab55e88ff7a60c63a5a3 Mon Sep 17 00:00:00 2001 From: Amr Hesham <amr96@programmer.net> Date: Mon, 3 Nov 2025 18:19:44 +0100 Subject: [PATCH 040/313] [clang][NFC] Fix typos in clang release notes (#166064) Fix typos in clang release notes --- clang/docs/ReleaseNotes.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 6959e61cac980..c3740a4a027bd 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -355,7 +355,7 @@ Improvements to Clang's diagnostics potential misaligned members get processed before they can get discarded. (#GH144729) -- Clang now emits dignostic with correct message in case of assigning to const reference captured in lambda. (#GH105647) +- Clang now emits a diagnostic with the correct message in case of assigning to const reference captured in lambda. (#GH105647) - Fixed false positive in ``-Wmissing-noreturn`` diagnostic when it was requiring the usage of ``[[noreturn]]`` on lambdas before C++23 (#GH154493). @@ -471,7 +471,7 @@ Bug Fixes to C++ Support casts that are guaranteed to fail (#GH137518). - Fix bug rejecting partial specialization of variable templates with auto NTTPs (#GH118190). - Fix a crash if errors "member of anonymous [...] redeclares" and - "intializing multiple members of union" coincide (#GH149985). + "initializing multiple members of union" coincide (#GH149985). - Fix a crash when using ``explicit(bool)`` in pre-C++11 language modes. (#GH152729) - Fix the parsing of variadic member functions when the ellipis immediately follows a default argument.(#GH153445) - Fixed a bug that caused ``this`` captured by value in a lambda with a dependent explicit object parameter to not be From 6a275de13f6cdf927cf3cd6125ea858ddc0c4b1d Mon Sep 17 00:00:00 2001 From: Matthew Nagy <matthew.nagy@sony.com> Date: Mon, 3 Nov 2025 17:24:55 +0000 Subject: [PATCH 041/313] =?UTF-8?q?Revert=20"[UBSan]=20Improve=20error=20m?= =?UTF-8?q?essage=20when=20a=20misalignment=20is=20due=20to=20t=E2=80=A6?= =?UTF-8?q?=20(#166197)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit …arget de…" This reverts commit 47c54d55c9fac5ea7c87881e00f96e8c12b18174. 
--- clang/lib/CodeGen/CGExprCXX.cpp | 21 +++-------- clang/lib/CodeGen/CodeGenFunction.h | 5 +-- compiler-rt/lib/ubsan/ubsan_checks.inc | 1 - compiler-rt/lib/ubsan/ubsan_handlers.cpp | 33 ++++------------- .../TestCases/TypeCheck/minimum-alignment.cpp | 36 ------------------- .../ubsan/TestCases/TypeCheck/misaligned.cpp | 2 +- 6 files changed, 12 insertions(+), 86 deletions(-) delete mode 100644 compiler-rt/test/ubsan/TestCases/TypeCheck/minimum-alignment.cpp diff --git a/clang/lib/CodeGen/CGExprCXX.cpp b/clang/lib/CodeGen/CGExprCXX.cpp index f2dd22e9bed3b..14d8db32bafc6 100644 --- a/clang/lib/CodeGen/CGExprCXX.cpp +++ b/clang/lib/CodeGen/CGExprCXX.cpp @@ -18,9 +18,6 @@ #include "ConstantEmitter.h" #include "TargetInfo.h" #include "clang/Basic/CodeGenOptions.h" -#include "clang/Basic/Sanitizers.h" -#include "clang/Basic/SourceLocation.h" -#include "clang/Basic/SourceManager.h" #include "clang/CodeGen/CGFunctionInfo.h" #include "llvm/IR/Intrinsics.h" @@ -1752,17 +1749,6 @@ llvm::Value *CodeGenFunction::EmitCXXNewExpr(const CXXNewExpr *E) { allocator->isReservedGlobalPlacementOperator()) result = Builder.CreateLaunderInvariantGroup(result); - // Check the default alignment of the type and why. Users may incorrectly - // return misaligned memory from a replaced operator new without knowing - // about default alignment. - TypeCheckKind checkKind = CodeGenFunction::TCK_ConstructorCall; - const TargetInfo &TI = getContext().getTargetInfo(); - unsigned DefaultTargetAlignment = TI.getNewAlign() / TI.getCharWidth(); - if (SanOpts.has(SanitizerKind::Alignment) && - (DefaultTargetAlignment > - CGM.getContext().getTypeAlignInChars(allocType).getQuantity())) - checkKind = CodeGenFunction::TCK_ConstructorCallMinimumAlign; - // Emit sanitizer checks for pointer value now, so that in the case of an // array it was checked only once and not at each constructor call. We may // have already checked that the pointer is non-null. @@ -1770,9 +1756,10 @@ llvm::Value *CodeGenFunction::EmitCXXNewExpr(const CXXNewExpr *E) { // we'll null check the wrong pointer here. SanitizerSet SkippedChecks; SkippedChecks.set(SanitizerKind::Null, nullCheck); - EmitTypeCheck( - checkKind, E->getAllocatedTypeSourceInfo()->getTypeLoc().getBeginLoc(), - result, allocType, result.getAlignment(), SkippedChecks, numElements); + EmitTypeCheck(CodeGenFunction::TCK_ConstructorCall, + E->getAllocatedTypeSourceInfo()->getTypeLoc().getBeginLoc(), + result, allocType, result.getAlignment(), SkippedChecks, + numElements); EmitNewInitializer(*this, E, allocType, elementTy, result, numElements, allocSizeWithoutCookie); diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h index 047ca844c79de..8c4c1c8c2dc95 100644 --- a/clang/lib/CodeGen/CodeGenFunction.h +++ b/clang/lib/CodeGen/CodeGenFunction.h @@ -3296,10 +3296,7 @@ class CodeGenFunction : public CodeGenTypeCache { TCK_NonnullAssign, /// Checking the operand of a dynamic_cast or a typeid expression. Must be /// null or an object within its lifetime. - TCK_DynamicOperation, - /// Checking the 'this' poiner for a constructor call, including that the - /// alignment is greater or equal to the targets minimum alignment - TCK_ConstructorCallMinimumAlign + TCK_DynamicOperation }; /// Determine whether the pointer type check \p TCK permits null pointers. 
diff --git a/compiler-rt/lib/ubsan/ubsan_checks.inc b/compiler-rt/lib/ubsan/ubsan_checks.inc index f8757d781afb8..b1d09a9024e7e 100644 --- a/compiler-rt/lib/ubsan/ubsan_checks.inc +++ b/compiler-rt/lib/ubsan/ubsan_checks.inc @@ -28,7 +28,6 @@ UBSAN_CHECK(NullptrAfterNonZeroOffset, "nullptr-after-nonzero-offset", UBSAN_CHECK(PointerOverflow, "pointer-overflow", "pointer-overflow") UBSAN_CHECK(MisalignedPointerUse, "misaligned-pointer-use", "alignment") UBSAN_CHECK(AlignmentAssumption, "alignment-assumption", "alignment") -UBSAN_CHECK(MinumumAssumedAlignment, "minimum-assumed-alignment", "alignment") UBSAN_CHECK(InsufficientObjectSize, "insufficient-object-size", "object-size") UBSAN_CHECK(SignedIntegerOverflow, "signed-integer-overflow", "signed-integer-overflow") diff --git a/compiler-rt/lib/ubsan/ubsan_handlers.cpp b/compiler-rt/lib/ubsan/ubsan_handlers.cpp index fc6063af4562b..63319f46734a4 100644 --- a/compiler-rt/lib/ubsan/ubsan_handlers.cpp +++ b/compiler-rt/lib/ubsan/ubsan_handlers.cpp @@ -73,26 +73,14 @@ enum TypeCheckKind { TCK_NonnullAssign, /// Checking the operand of a dynamic_cast or a typeid expression. Must be /// null or an object within its lifetime. - TCK_DynamicOperation, - /// Checking the 'this' poiner for a constructor call, including that the - /// alignment is greater or equal to the targets minimum alignment - TCK_ConstructorCallMinimumAlign + TCK_DynamicOperation }; extern const char *const TypeCheckKinds[] = { - "load of", - "store to", - "reference binding to", - "member access within", - "member call on", - "constructor call on", - "downcast of", - "downcast of", - "upcast of", - "cast to virtual base of", - "_Nonnull binding to", - "dynamic operation on", - "constructor call with pointer from operator new on"}; + "load of", "store to", "reference binding to", "member access within", + "member call on", "constructor call on", "downcast of", "downcast of", + "upcast of", "cast to virtual base of", "_Nonnull binding to", + "dynamic operation on"}; } static void handleTypeMismatchImpl(TypeMismatchData *Data, ValueHandle Pointer, @@ -106,9 +94,7 @@ static void handleTypeMismatchImpl(TypeMismatchData *Data, ValueHandle Pointer, ? ErrorType::NullPointerUseWithNullability : ErrorType::NullPointerUse; else if (Pointer & (Alignment - 1)) - ET = (Data->TypeCheckKind == TCK_ConstructorCallMinimumAlign) - ? 
ErrorType::MinumumAssumedAlignment - : ErrorType::MisalignedPointerUse; + ET = ErrorType::MisalignedPointerUse; else ET = ErrorType::InsufficientObjectSize; @@ -131,13 +117,6 @@ static void handleTypeMismatchImpl(TypeMismatchData *Data, ValueHandle Pointer, Diag(Loc, DL_Error, ET, "%0 null pointer of type %1") << TypeCheckKinds[Data->TypeCheckKind] << Data->Type; break; - case ErrorType::MinumumAssumedAlignment: - Diag(Loc, DL_Error, ET, - "%0 misaligned address %1 for type %2, " - "which requires target minimum assumed %3 byte alignment") - << TypeCheckKinds[Data->TypeCheckKind] << (void *)Pointer << Data->Type - << Alignment; - break; case ErrorType::MisalignedPointerUse: Diag(Loc, DL_Error, ET, "%0 misaligned address %1 for type %3, " "which requires %2 byte alignment") diff --git a/compiler-rt/test/ubsan/TestCases/TypeCheck/minimum-alignment.cpp b/compiler-rt/test/ubsan/TestCases/TypeCheck/minimum-alignment.cpp deleted file mode 100644 index 4642126ab74c4..0000000000000 --- a/compiler-rt/test/ubsan/TestCases/TypeCheck/minimum-alignment.cpp +++ /dev/null @@ -1,36 +0,0 @@ -// RUN: %clangxx %gmlt -fsanitize=alignment %s -o %t -// RUN: %run %t 2>&1 | FileCheck %s - -// UNSUPPORTED: i386 -// UNSUPPORTED: armv7l - -// These sanitizers already overload the new operator so won't compile this test -// UNSUPPORTED: ubsan-msan -// UNSUPPORTED: ubsan-tsan - -#include <cassert> -#include <cstdlib> - -void *operator new(std::size_t count) { - constexpr const size_t offset = 8; - - // allocate a bit more so we can safely offset it - void *ptr = std::malloc(count + offset); - - // verify malloc returned 16 bytes aligned mem - static_assert(__STDCPP_DEFAULT_NEW_ALIGNMENT__ == 16); - assert(((std::ptrdiff_t)ptr & (__STDCPP_DEFAULT_NEW_ALIGNMENT__ - 1)) == 0); - - return (char *)ptr + offset; -} - -struct Foo { - void *_cookie1, *_cookie2; -}; - -static_assert(alignof(Foo) == 8); -int main() { - // CHECK: runtime error: constructor call with pointer from operator new on misaligned address 0x{{.*}} for type 'Foo', which requires target minimum assumed 16 byte alignment - Foo *f = new Foo; - return 0; -} diff --git a/compiler-rt/test/ubsan/TestCases/TypeCheck/misaligned.cpp b/compiler-rt/test/ubsan/TestCases/TypeCheck/misaligned.cpp index 4b0b2b5923c6f..e39a0ab4e6589 100644 --- a/compiler-rt/test/ubsan/TestCases/TypeCheck/misaligned.cpp +++ b/compiler-rt/test/ubsan/TestCases/TypeCheck/misaligned.cpp @@ -101,7 +101,7 @@ int main(int, char **argv) { return s->f() && 0; case 'n': - // CHECK-NEW: misaligned.cpp:[[@LINE+4]]{{(:21)?}}: runtime error: constructor call with pointer from operator new on misaligned address [[PTR:0x[0-9a-f]*]] for type 'S', which requires target minimum assumed 4 byte alignment + // CHECK-NEW: misaligned.cpp:[[@LINE+4]]{{(:21)?}}: runtime error: constructor call on misaligned address [[PTR:0x[0-9a-f]*]] for type 'S', which requires 4 byte alignment // CHECK-NEW-NEXT: [[PTR]]: note: pointer points here // CHECK-NEW-NEXT: {{^ 00 00 00 01 02 03 04 05}} // CHECK-NEW-NEXT: {{^ \^}} From 513334faec2594bbeb3ac00f0092bed20b23abd3 Mon Sep 17 00:00:00 2001 From: Alex Voicu <alexandru.voicu@amd.com> Date: Mon, 3 Nov 2025 19:30:04 +0200 Subject: [PATCH 042/313] [NFC][SPIRV] Fix function type recovery (#165934) Due to limitations in GISel / IRTranslator, the SPIR-V BE replaces aggregate function args with `i32` placeholders, which are subsequently used to retrieve the original type after IR translation, from metadata. 
Due to what appears to be an oversight, the current implementation only handles a single mutation, as it does not traverse the metadata, but rather only takes the first operand. This patch addresses that limitation by correctly iterating the metadata. --- llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp | 30 +++++++++---------- .../pointers/composite-fun-fix-ptr-arg.ll | 14 +++++++++ 2 files changed, 29 insertions(+), 15 deletions(-) diff --git a/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp b/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp index 9e11c3a281a1b..dd57b74d79a5e 100644 --- a/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp @@ -149,23 +149,23 @@ static FunctionType *getOriginalFunctionType(const Function &F) { return isa<MDString>(N->getOperand(0)) && cast<MDString>(N->getOperand(0))->getString() == F.getName(); }); - // TODO: probably one function can have numerous type mutations, - // so we should support this. if (ThisFuncMDIt != NamedMD->op_end()) { auto *ThisFuncMD = *ThisFuncMDIt; - MDNode *MD = dyn_cast<MDNode>(ThisFuncMD->getOperand(1)); - assert(MD && "MDNode operand is expected"); - ConstantInt *Const = getConstInt(MD, 0); - if (Const) { - auto *CMeta = dyn_cast<ConstantAsMetadata>(MD->getOperand(1)); - assert(CMeta && "ConstantAsMetadata operand is expected"); - assert(Const->getSExtValue() >= -1); - // Currently -1 indicates return value, greater values mean - // argument numbers. - if (Const->getSExtValue() == -1) - RetTy = CMeta->getType(); - else - ArgTypes[Const->getSExtValue()] = CMeta->getType(); + for (unsigned I = 1; I != ThisFuncMD->getNumOperands(); ++I) { + MDNode *MD = dyn_cast<MDNode>(ThisFuncMD->getOperand(I)); + assert(MD && "MDNode operand is expected"); + ConstantInt *Const = getConstInt(MD, 0); + if (Const) { + auto *CMeta = dyn_cast<ConstantAsMetadata>(MD->getOperand(1)); + assert(CMeta && "ConstantAsMetadata operand is expected"); + assert(Const->getSExtValue() >= -1); + // Currently -1 indicates return value, greater values mean + // argument numbers. 
+        if (Const->getSExtValue() == -1)
+          RetTy = CMeta->getType();
+        else
+          ArgTypes[Const->getSExtValue()] = CMeta->getType();
+      }
     }
   }
 
diff --git a/llvm/test/CodeGen/SPIRV/pointers/composite-fun-fix-ptr-arg.ll b/llvm/test/CodeGen/SPIRV/pointers/composite-fun-fix-ptr-arg.ll
index 73c46b18bfa78..c9b2968a4aed7 100644
--- a/llvm/test/CodeGen/SPIRV/pointers/composite-fun-fix-ptr-arg.ll
+++ b/llvm/test/CodeGen/SPIRV/pointers/composite-fun-fix-ptr-arg.ll
@@ -10,6 +10,7 @@
 
 ; CHECK-DAG: %[[#Int8:]] = OpTypeInt 8 0
 ; CHECK-DAG: %[[#Half:]] = OpTypeFloat 16
+; CHECK-DAG: %[[#Float:]] = OpTypeFloat 32
 ; CHECK-DAG: %[[#Struct:]] = OpTypeStruct %[[#Half]]
 ; CHECK-DAG: %[[#Void:]] = OpTypeVoid
 ; CHECK-DAG: %[[#PtrInt8:]] = OpTypePointer CrossWorkgroup %[[#Int8:]]
@@ -17,12 +18,20 @@
 ; CHECK-DAG: %[[#Int64:]] = OpTypeInt 64 0
 ; CHECK-DAG: %[[#PtrInt64:]] = OpTypePointer CrossWorkgroup %[[#Int64]]
 ; CHECK-DAG: %[[#BarType:]] = OpTypeFunction %[[#Void]] %[[#PtrInt64]] %[[#Struct]]
+; CHECK-DAG: %[[#BazType:]] = OpTypeFunction %[[#Void]] %[[#PtrInt8]] %[[#Struct]] %[[#Int8]] %[[#Struct]] %[[#Float]] %[[#Struct]]
 ; CHECK: OpFunction %[[#Void]] None %[[#FooType]]
 ; CHECK: OpFunctionParameter %[[#PtrInt8]]
 ; CHECK: OpFunctionParameter %[[#Struct]]
 ; CHECK: OpFunction %[[#Void]] None %[[#BarType]]
 ; CHECK: OpFunctionParameter %[[#PtrInt64]]
 ; CHECK: OpFunctionParameter %[[#Struct]]
+; CHECK: OpFunction %[[#Void]] None %[[#BazType]]
+; CHECK: OpFunctionParameter %[[#PtrInt8]]
+; CHECK: OpFunctionParameter %[[#Struct]]
+; CHECK: OpFunctionParameter %[[#Int8]]
+; CHECK: OpFunctionParameter %[[#Struct]]
+; CHECK: OpFunctionParameter %[[#Float]]
+; CHECK: OpFunctionParameter %[[#Struct]]
 
 %t_half = type { half }
 
@@ -38,4 +47,9 @@ entry:
   ret void
 }
 
+define spir_kernel void @baz(ptr addrspace(1) %a, %t_half %b, i8 %c, %t_half %d, float %e, %t_half %f) {
+entry:
+  ret void
+}
+
 declare spir_func %t_half @_Z29__spirv_SpecConstantComposite(half)

From c4763e2b9038fbd1154f0276a8b9542b8c115111 Mon Sep 17 00:00:00 2001
From: Alan Zhao <ayzhao@google.com>
Date: Mon, 3 Nov 2025 09:32:42 -0800
Subject: [PATCH 043/313] [profcheck][InstCombine] Preserve branch weights in
 logical identities (#165810)

For the simplification
```
(C && A) || (!C && B) --> sel C, A, B
```
(and related), if `C` (or `!C`) is the condition in the select
instruction representing the logical and, we can preserve that logical
and's branch weights when emitting the new instruction. Otherwise, the
profile data is unknown. If `C` is the condition of both logical ands,
then we just take the branch weights of the first logical and (though
in practice they should be equal).

Furthermore, `select-safe-transforms.ll` now passes under the profcheck
configuration, so we remove it from the failing tests.
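
For illustration, a minimal hand-written IR sketch of the first case
(it mirrors the updated tests; the exact weight values are placeholders):
```
; Before: the select implementing (C && A) carries the branch weights for C.
%not  = xor i1 %c, -1
%and1 = select i1 %c, i1 %a, i1 false, !prof !1   ; C && A
%and2 = select i1 %not, i1 %b, i1 false           ; !C && B
%or   = select i1 %and1, i1 true, i1 %and2

; After: the folded select reuses the weights of %and1 directly.
%sel  = select i1 %c, i1 %a, i1 %b, !prof !1

!1 = !{!"branch_weights", i32 2, i32 3}
```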
Tracking issue: #147390 --- .../InstCombine/InstCombineSelect.cpp | 29 +++++++++++ .../InstCombine/select-safe-transforms.ll | 51 ++++++++++--------- llvm/utils/profcheck-xfail.txt | 1 - 3 files changed, 56 insertions(+), 25 deletions(-) diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp index f5130da818746..9572f9d702e1b 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp @@ -3599,6 +3599,21 @@ Instruction *InstCombinerImpl::foldSelectOfBools(SelectInst &SI) { m_Not(m_Specific(SelCond->getTrueValue()))); if (MayNeedFreeze) C = Builder.CreateFreeze(C); + if (!ProfcheckDisableMetadataFixes) { + Value *C2 = nullptr, *A2 = nullptr, *B2 = nullptr; + if (match(CondVal, m_LogicalAnd(m_Specific(C), m_Value(A2))) && + SelCond) { + return SelectInst::Create(C, A, B, "", nullptr, SelCond); + } else if (match(FalseVal, + m_LogicalAnd(m_Not(m_Value(C2)), m_Value(B2))) && + SelFVal) { + SelectInst *NewSI = SelectInst::Create(C, A, B, "", nullptr, SelFVal); + NewSI->swapProfMetadata(); + return NewSI; + } else { + return createSelectInstWithUnknownProfile(C, A, B); + } + } return SelectInst::Create(C, A, B); } @@ -3615,6 +3630,20 @@ Instruction *InstCombinerImpl::foldSelectOfBools(SelectInst &SI) { m_Not(m_Specific(SelFVal->getTrueValue()))); if (MayNeedFreeze) C = Builder.CreateFreeze(C); + if (!ProfcheckDisableMetadataFixes) { + Value *C2 = nullptr, *A2 = nullptr, *B2 = nullptr; + if (match(CondVal, m_LogicalAnd(m_Not(m_Value(C2)), m_Value(A2))) && + SelCond) { + SelectInst *NewSI = SelectInst::Create(C, B, A, "", nullptr, SelCond); + NewSI->swapProfMetadata(); + return NewSI; + } else if (match(FalseVal, m_LogicalAnd(m_Specific(C), m_Value(B2))) && + SelFVal) { + return SelectInst::Create(C, B, A, "", nullptr, SelFVal); + } else { + return createSelectInstWithUnknownProfile(C, B, A); + } + } return SelectInst::Create(C, B, A); } } diff --git a/llvm/test/Transforms/InstCombine/select-safe-transforms.ll b/llvm/test/Transforms/InstCombine/select-safe-transforms.ll index 3d97048f43127..8b3c0502ac04d 100644 --- a/llvm/test/Transforms/InstCombine/select-safe-transforms.ll +++ b/llvm/test/Transforms/InstCombine/select-safe-transforms.ll @@ -256,27 +256,27 @@ define <2 x i1> @not_logical_or2(i1 %b, <2 x i32> %a) { ret <2 x i1> %and } -define i1 @bools_logical_commute0(i1 %a, i1 %b, i1 %c) { +define i1 @bools_logical_commute0(i1 %a, i1 %b, i1 %c) !prof !0 { ; CHECK-LABEL: @bools_logical_commute0( -; CHECK-NEXT: [[OR:%.*]] = select i1 [[C:%.*]], i1 [[B:%.*]], i1 [[A:%.*]] +; CHECK-NEXT: [[OR:%.*]] = select i1 [[C:%.*]], i1 [[B:%.*]], i1 [[A:%.*]], !prof [[PROF2]] ; CHECK-NEXT: ret i1 [[OR]] ; %not = xor i1 %c, -1 - %and1 = select i1 %not, i1 %a, i1 false - %and2 = select i1 %c, i1 %b, i1 false - %or = select i1 %and1, i1 true, i1 %and2 + %and1 = select i1 %not, i1 %a, i1 false, !prof!1 + %and2 = select i1 %c, i1 %b, i1 false, !prof !2 + %or = select i1 %and1, i1 true, i1 %and2, !prof !3 ret i1 %or } -define i1 @bools_logical_commute0_and1(i1 %a, i1 %b, i1 %c) { +define i1 @bools_logical_commute0_and1(i1 %a, i1 %b, i1 %c) !prof !0 { ; CHECK-LABEL: @bools_logical_commute0_and1( -; CHECK-NEXT: [[OR:%.*]] = select i1 [[C:%.*]], i1 [[B:%.*]], i1 [[A:%.*]] +; CHECK-NEXT: [[OR:%.*]] = select i1 [[C:%.*]], i1 [[B:%.*]], i1 [[A:%.*]], !prof [[PROF1]] ; CHECK-NEXT: ret i1 [[OR]] ; %not = xor i1 %c, -1 %and1 = and i1 %not, %a - %and2 = select i1 %c, i1 %b, i1 false - %or = 
select i1 %and1, i1 true, i1 %and2 + %and2 = select i1 %c, i1 %b, i1 false, !prof !1 + %or = select i1 %and1, i1 true, i1 %and2, !prof !2 ret i1 %or } @@ -292,15 +292,15 @@ define i1 @bools_logical_commute0_and2(i1 %a, i1 %b, i1 %c) { ret i1 %or } -define i1 @bools_logical_commute0_and1_and2(i1 %a, i1 %b, i1 %c) { +define i1 @bools_logical_commute0_and1_and2(i1 %a, i1 %b, i1 %c) !prof !0 { ; CHECK-LABEL: @bools_logical_commute0_and1_and2( -; CHECK-NEXT: [[OR:%.*]] = select i1 [[C:%.*]], i1 [[B:%.*]], i1 [[A:%.*]] +; CHECK-NEXT: [[OR:%.*]] = select i1 [[C:%.*]], i1 [[B:%.*]], i1 [[A:%.*]], !prof [[PROF3:![0-9]+]] ; CHECK-NEXT: ret i1 [[OR]] ; %not = xor i1 %c, -1 %and1 = and i1 %not, %a %and2 = and i1 %c, %b - %or = select i1 %and1, i1 true, i1 %and2 + %or = select i1 %and1, i1 true, i1 %and2, !prof !1 ret i1 %or } @@ -457,27 +457,27 @@ define i1 @bools_logical_commute3_and1_and2(i1 %b, i1 %c) { ret i1 %or } -define i1 @bools2_logical_commute0(i1 %a, i1 %b, i1 %c) { +define i1 @bools2_logical_commute0(i1 %a, i1 %b, i1 %c) !prof !0 { ; CHECK-LABEL: @bools2_logical_commute0( -; CHECK-NEXT: [[OR:%.*]] = select i1 [[C:%.*]], i1 [[A:%.*]], i1 [[B:%.*]] +; CHECK-NEXT: [[OR:%.*]] = select i1 [[C:%.*]], i1 [[A:%.*]], i1 [[B:%.*]], !prof [[PROF1]] ; CHECK-NEXT: ret i1 [[OR]] ; %not = xor i1 %c, -1 - %and1 = select i1 %c, i1 %a, i1 false - %and2 = select i1 %not, i1 %b, i1 false - %or = select i1 %and1, i1 true, i1 %and2 + %and1 = select i1 %c, i1 %a, i1 false, !prof !1 + %and2 = select i1 %not, i1 %b, i1 false, !prof !2 + %or = select i1 %and1, i1 true, i1 %and2, !prof !3 ret i1 %or } -define i1 @bools2_logical_commute0_and1(i1 %a, i1 %b, i1 %c) { +define i1 @bools2_logical_commute0_and1(i1 %a, i1 %b, i1 %c) !prof !0 { ; CHECK-LABEL: @bools2_logical_commute0_and1( -; CHECK-NEXT: [[OR:%.*]] = select i1 [[C:%.*]], i1 [[A:%.*]], i1 [[B:%.*]] +; CHECK-NEXT: [[OR:%.*]] = select i1 [[C:%.*]], i1 [[A:%.*]], i1 [[B:%.*]], !prof [[PROF2]] ; CHECK-NEXT: ret i1 [[OR]] ; %not = xor i1 %c, -1 %and1 = and i1 %c, %a - %and2 = select i1 %not, i1 %b, i1 false - %or = select i1 %and1, i1 true, i1 %and2 + %and2 = select i1 %not, i1 %b, i1 false, !prof !1 + %or = select i1 %and1, i1 true, i1 %and2, !prof !2 ret i1 %or } @@ -493,15 +493,15 @@ define i1 @bools2_logical_commute0_and2(i1 %a, i1 %b, i1 %c) { ret i1 %or } -define i1 @bools2_logical_commute0_and1_and2(i1 %a, i1 %b, i1 %c) { +define i1 @bools2_logical_commute0_and1_and2(i1 %a, i1 %b, i1 %c) !prof !0 { ; CHECK-LABEL: @bools2_logical_commute0_and1_and2( -; CHECK-NEXT: [[OR:%.*]] = select i1 [[C:%.*]], i1 [[A:%.*]], i1 [[B:%.*]] +; CHECK-NEXT: [[OR:%.*]] = select i1 [[C:%.*]], i1 [[A:%.*]], i1 [[B:%.*]], !prof [[PROF3]] ; CHECK-NEXT: ret i1 [[OR]] ; %not = xor i1 %c, -1 %and1 = and i1 %c, %a %and2 = and i1 %not, %b - %or = select i1 %and1, i1 true, i1 %and2 + %or = select i1 %and1, i1 true, i1 %and2, !prof !1 ret i1 %or } @@ -799,8 +799,11 @@ define <2 x i1> @not_logical_and2(i1 %b, <2 x i32> %a) { !0 = !{!"function_entry_count", i64 1000} !1 = !{!"branch_weights", i32 2, i32 3} +!2 = !{!"branch_weights", i32 5, i32 7} +!3 = !{!"branch_weights", i32 11, i32 13} ;. ; CHECK: [[META0:![0-9]+]] = !{!"function_entry_count", i64 1000} ; CHECK: [[PROF1]] = !{!"branch_weights", i32 2, i32 3} ; CHECK: [[PROF2]] = !{!"branch_weights", i32 3, i32 2} +; CHECK: [[PROF3]] = !{!"unknown", !"instcombine"} ;. 
diff --git a/llvm/utils/profcheck-xfail.txt b/llvm/utils/profcheck-xfail.txt index 380b162d8c58c..661c88125c9c8 100644 --- a/llvm/utils/profcheck-xfail.txt +++ b/llvm/utils/profcheck-xfail.txt @@ -917,7 +917,6 @@ Transforms/InstCombine/select_frexp.ll Transforms/InstCombine/select.ll Transforms/InstCombine/select-min-max.ll Transforms/InstCombine/select-of-symmetric-selects.ll -Transforms/InstCombine/select-safe-transforms.ll Transforms/InstCombine/select-select.ll Transforms/InstCombine/select-with-extreme-eq-cond.ll Transforms/InstCombine/shift.ll From e8765401d49cf0b5775391f2eaa8e44a292e2caf Mon Sep 17 00:00:00 2001 From: vangthao95 <vang.thao@amd.com> Date: Mon, 3 Nov 2025 09:36:49 -0800 Subject: [PATCH 044/313] [AMDGPU][GlobalISel] Add RegBankLegalize support for G_FENCE (#165939) --- .../Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp | 2 ++ .../GlobalISel/memory-legalizer-atomic-fence.ll | 14 +++++++------- llvm/test/CodeGen/AMDGPU/GlobalISel/mmra.ll | 2 +- 3 files changed, 10 insertions(+), 8 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp index dd474ac52c3c8..1e5885a25c195 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp @@ -913,6 +913,8 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST, addRulesForGOpcs({G_ABS}, Standard).Uni(S16, {{Sgpr32Trunc}, {Sgpr32SExt}}); + addRulesForGOpcs({G_FENCE}).Any({{{}}, {{}, {}}}); + addRulesForGOpcs({G_READSTEADYCOUNTER, G_READCYCLECOUNTER}, Standard) .Uni(S64, {{Sgpr64}, {}}); diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll index e86f7473363f7..37b5422be7e2f 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll @@ -1,11 +1,11 @@ ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4 -; RUN: llc -global-isel -stop-after=si-memory-legalizer -mtriple=amdgcn-amd- -mcpu=gfx600 < %s | FileCheck -check-prefix=GFX6 %s -; RUN: llc -global-isel -stop-after=si-memory-legalizer -mtriple=amdgcn-amd- -mcpu=gfx803 < %s | FileCheck -check-prefix=GFX8 %s -; RUN: llc -global-isel -stop-after=si-memory-legalizer -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 < %s | FileCheck -check-prefix=GFX8 %s -; RUN: llc -global-isel -stop-after=si-memory-legalizer -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10WGP %s -; RUN: llc -global-isel -stop-after=si-memory-legalizer -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode < %s | FileCheck -check-prefix=GFX10CU %s -; RUN: llc -global-isel -stop-after=si-memory-legalizer -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11WGP %s -; RUN: llc -global-isel -stop-after=si-memory-legalizer -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck -check-prefix=GFX11CU %s +; RUN: llc -global-isel -new-reg-bank-select -stop-after=si-memory-legalizer -mtriple=amdgcn-amd- -mcpu=gfx600 < %s | FileCheck -check-prefix=GFX6 %s +; RUN: llc -global-isel -new-reg-bank-select -stop-after=si-memory-legalizer -mtriple=amdgcn-amd- -mcpu=gfx803 < %s | FileCheck -check-prefix=GFX8 %s +; RUN: llc -global-isel -new-reg-bank-select -stop-after=si-memory-legalizer -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 < %s | FileCheck -check-prefix=GFX8 %s +; RUN: llc 
-global-isel -new-reg-bank-select -stop-after=si-memory-legalizer -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10WGP %s
+; RUN: llc -global-isel -new-reg-bank-select -stop-after=si-memory-legalizer -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode < %s | FileCheck -check-prefix=GFX10CU %s
+; RUN: llc -global-isel -new-reg-bank-select -stop-after=si-memory-legalizer -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11WGP %s
+; RUN: llc -global-isel -new-reg-bank-select -stop-after=si-memory-legalizer -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck -check-prefix=GFX11CU %s
 
 ; Note: we use MIR test checks + stop after legalizer to prevent
 ; tests from being optimized out.
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mmra.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mmra.ll
index 44b12a9f6fe81..61a61376d7ddd 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mmra.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mmra.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 -stop-after=finalize-isel < %s | FileCheck %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx900 -stop-after=finalize-isel < %s | FileCheck %s
 
 declare void @readsMem(ptr) #0
 declare void @writesMem(ptr) #1

From 6747ea050dfc917b78c205102d9545902140ea2d Mon Sep 17 00:00:00 2001
From: Steven Wu <stevenwu@apple.com>
Date: Mon, 3 Nov 2025 09:50:28 -0800
Subject: [PATCH 045/313] [CAS] Add UnifiedOnDiskCache and OnDiskCAS (#114103)

Add a new abstraction layer, UnifiedOnDiskCache, which adds disk-space
management and data-validation functionality on top of OnDiskGraphDB and
OnDiskKeyValueDB.

Built on top of UnifiedOnDiskCache, OnDiskCAS implements the ObjectStore
and ActionCache interfaces so that LLVM tools can interact with on-disk
CAS storage.
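
A minimal usage sketch (hypothetical client code; the path and the size
limit are placeholders, and error handling is abbreviated):
```
#include "llvm/CAS/ActionCache.h"
#include "llvm/CAS/BuiltinUnifiedCASDatabases.h"
#include "llvm/CAS/ObjectStore.h"

using namespace llvm;

static Error useCAS(StringRef Path) {
  // Open (or create) the unified on-disk databases rooted at Path.
  auto DBs = cas::createOnDiskUnifiedCASDatabases(Path);
  if (!DBs)
    return DBs.takeError();
  std::unique_ptr<cas::ObjectStore> CAS = std::move(DBs->first);
  std::unique_ptr<cas::ActionCache> Cache = std::move(DBs->second);

  // Cap on-disk growth; the limit takes effect when the store is closed.
  if (Error E = CAS->setSizeLimit(uint64_t(10) << 30))
    return E;

  // ... store/load objects via *CAS and map action keys via *Cache ...
  return Error::success();
}
```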
--- llvm/include/llvm/CAS/ActionCache.h | 6 + .../llvm/CAS/BuiltinUnifiedCASDatabases.h | 59 ++ llvm/include/llvm/CAS/ObjectStore.h | 47 +- llvm/include/llvm/CAS/OnDiskGraphDB.h | 12 +- llvm/include/llvm/CAS/OnDiskKeyValueDB.h | 15 +- llvm/include/llvm/CAS/UnifiedOnDiskCache.h | 172 +++++ llvm/lib/CAS/ActionCaches.cpp | 166 +++++ llvm/lib/CAS/BuiltinCAS.cpp | 14 +- llvm/lib/CAS/BuiltinCAS.h | 25 +- llvm/lib/CAS/BuiltinUnifiedCASDatabases.cpp | 38 ++ llvm/lib/CAS/CMakeLists.txt | 3 + llvm/lib/CAS/InMemoryCAS.cpp | 8 + llvm/lib/CAS/ObjectStore.cpp | 93 ++- llvm/lib/CAS/OnDiskCAS.cpp | 211 ++++++ llvm/lib/CAS/OnDiskGraphDB.cpp | 34 +- llvm/lib/CAS/OnDiskKeyValueDB.cpp | 21 +- llvm/lib/CAS/UnifiedOnDiskCache.cpp | 613 ++++++++++++++++++ llvm/unittests/CAS/ActionCacheTest.cpp | 6 +- .../CAS/BuiltinUnifiedCASDatabasesTest.cpp | 67 ++ llvm/unittests/CAS/CASTestConfig.cpp | 17 +- llvm/unittests/CAS/CASTestConfig.h | 41 +- llvm/unittests/CAS/CMakeLists.txt | 2 + llvm/unittests/CAS/ObjectStoreTest.cpp | 134 +++- llvm/unittests/CAS/OnDiskCommonUtils.h | 21 + llvm/unittests/CAS/OnDiskGraphDBTest.cpp | 12 +- llvm/unittests/CAS/UnifiedOnDiskCacheTest.cpp | 198 ++++++ 26 files changed, 1970 insertions(+), 65 deletions(-) create mode 100644 llvm/include/llvm/CAS/BuiltinUnifiedCASDatabases.h create mode 100644 llvm/include/llvm/CAS/UnifiedOnDiskCache.h create mode 100644 llvm/lib/CAS/BuiltinUnifiedCASDatabases.cpp create mode 100644 llvm/lib/CAS/OnDiskCAS.cpp create mode 100644 llvm/lib/CAS/UnifiedOnDiskCache.cpp create mode 100644 llvm/unittests/CAS/BuiltinUnifiedCASDatabasesTest.cpp create mode 100644 llvm/unittests/CAS/UnifiedOnDiskCacheTest.cpp diff --git a/llvm/include/llvm/CAS/ActionCache.h b/llvm/include/llvm/CAS/ActionCache.h index 69ee4dde1974a..7f5b11223c54d 100644 --- a/llvm/include/llvm/CAS/ActionCache.h +++ b/llvm/include/llvm/CAS/ActionCache.h @@ -75,6 +75,9 @@ class ActionCache { CanBeDistributed); } + /// Validate the ActionCache contents. + virtual Error validate() const = 0; + virtual ~ActionCache() = default; protected: @@ -97,6 +100,9 @@ class ActionCache { /// Create an action cache in memory. std::unique_ptr<ActionCache> createInMemoryActionCache(); +/// Create an action cache on disk. +Expected<std::unique_ptr<ActionCache>> createOnDiskActionCache(StringRef Path); + } // end namespace llvm::cas #endif // LLVM_CAS_ACTIONCACHE_H diff --git a/llvm/include/llvm/CAS/BuiltinUnifiedCASDatabases.h b/llvm/include/llvm/CAS/BuiltinUnifiedCASDatabases.h new file mode 100644 index 0000000000000..6c165c421b168 --- /dev/null +++ b/llvm/include/llvm/CAS/BuiltinUnifiedCASDatabases.h @@ -0,0 +1,59 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CAS_BUILTINUNIFIEDCASDATABASES_H +#define LLVM_CAS_BUILTINUNIFIEDCASDATABASES_H + +#include "llvm/Support/Error.h" + +namespace llvm::cas { + +class ActionCache; +class ObjectStore; + +/// Create on-disk \c ObjectStore and \c ActionCache instances based on +/// \c ondisk::UnifiedOnDiskCache, with built-in hashing. 
+Expected<std::pair<std::unique_ptr<ObjectStore>, std::unique_ptr<ActionCache>>> +createOnDiskUnifiedCASDatabases(StringRef Path); + +/// Represents the result of validating the contents using +/// \c validateOnDiskUnifiedCASDatabasesIfNeeded. +/// +/// Note: invalid results are handled as an \c Error. +enum class ValidationResult { + /// The data is already valid. + Valid, + /// The data was invalid, but was recovered. + Recovered, + /// Validation was skipped, as it was not needed. + Skipped, +}; + +/// Validate the data in \p Path, if needed to ensure correctness. +/// +/// \param Path directory for the on-disk database. +/// \param CheckHash Whether to validate hashes match the data. +/// \param AllowRecovery Whether to automatically recover from invalid data by +/// marking the files for garbage collection. +/// \param ForceValidation Whether to force validation to occur even if it +/// should not be necessary. +/// \param LLVMCasBinaryPath If provided, validation is performed out-of-process +/// using the given \c llvm-cas executable which protects against crashes +/// during validation. Otherwise validation is performed in-process. +/// +/// \returns \c Valid if the data is already valid, \c Recovered if data +/// was invalid but has been cleared, \c Skipped if validation is not needed, +/// or an \c Error if validation cannot be performed or if the data is left +/// in an invalid state because \p AllowRecovery is false. +Expected<ValidationResult> validateOnDiskUnifiedCASDatabasesIfNeeded( + StringRef Path, bool CheckHash, bool AllowRecovery, bool ForceValidation, + std::optional<StringRef> LLVMCasBinaryPath); + +} // namespace llvm::cas + +#endif // LLVM_CAS_BUILTINUNIFIEDCASDATABASES_H diff --git a/llvm/include/llvm/CAS/ObjectStore.h b/llvm/include/llvm/CAS/ObjectStore.h index 6db5dd3904095..29950fe9d9029 100644 --- a/llvm/include/llvm/CAS/ObjectStore.h +++ b/llvm/include/llvm/CAS/ObjectStore.h @@ -5,6 +5,11 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// +/// +/// \file +/// This file contains the declaration of the ObjectStore class. +/// +//===----------------------------------------------------------------------===// #ifndef LLVM_CAS_OBJECTSTORE_H #define LLVM_CAS_OBJECTSTORE_H @@ -111,7 +116,10 @@ class ObjectStore { virtual Expected<bool> isMaterialized(ObjectRef Ref) const = 0; /// Validate the underlying object referred by CASID. - virtual Error validate(const CASID &ID) = 0; + virtual Error validateObject(const CASID &ID) = 0; + + /// Validate the entire ObjectStore. + virtual Error validate(bool CheckHash) const = 0; protected: /// Load the object referenced by \p Ref. @@ -215,9 +223,39 @@ class ObjectStore { return Data.size(); } + /// Set the size for limiting growth of on-disk storage. This has an effect + /// for when the instance is closed. + /// + /// Implementations may leave this unimplemented. + virtual Error setSizeLimit(std::optional<uint64_t> SizeLimit) { + return Error::success(); + } + + /// \returns the storage size of the on-disk CAS data. + /// + /// Implementations that don't have an implementation for this should return + /// \p std::nullopt. + virtual Expected<std::optional<uint64_t>> getStorageSize() const { + return std::nullopt; + } + + /// Prune local storage to reduce its size according to the desired size + /// limit. Pruning can happen concurrently with other operations. + /// + /// Implementations may leave this unimplemented. 
+  virtual Error pruneStorageData() { return Error::success(); }
+
   /// Validate the whole node tree.
   Error validateTree(ObjectRef Ref);
 
+  /// Import an object from another CAS. This will import the full tree from
+  /// the other CAS.
+  Expected<ObjectRef> importObject(ObjectStore &Upstream, ObjectRef Other);
+
+  /// Print the ObjectStore internals for debugging purposes.
+  virtual void print(raw_ostream &) const {}
+  void dump() const;
+
   /// Get CASContext
   const CASContext &getContext() const { return Context; }
@@ -290,8 +328,15 @@ class ObjectProxy {
   ObjectHandle H;
 };
 
+/// Create an in-memory CAS.
 std::unique_ptr<ObjectStore> createInMemoryCAS();
 
+/// \returns true if the \c LLVM_ENABLE_ONDISK_CAS configuration was enabled.
+bool isOnDiskCASEnabled();
+
+/// Create a persistent on-disk CAS at \p Path.
+Expected<std::unique_ptr<ObjectStore>> createOnDiskCAS(const Twine &Path);
+
 } // namespace cas
 } // namespace llvm
diff --git a/llvm/include/llvm/CAS/OnDiskGraphDB.h b/llvm/include/llvm/CAS/OnDiskGraphDB.h
index 5f0ee0e131c0f..76cc528711b69 100644
--- a/llvm/include/llvm/CAS/OnDiskGraphDB.h
+++ b/llvm/include/llvm/CAS/OnDiskGraphDB.h
@@ -340,13 +340,16 @@ class OnDiskGraphDB {
   /// \param HashByteSize Size for the object digest hash bytes.
   /// \param UpstreamDB Optional on-disk store to be used for faulting-in nodes
   /// if they don't exist in the primary store. The upstream store is only used
-  /// for reading nodes, new nodes are only written to the primary store.
+  /// for reading nodes, new nodes are only written to the primary store. Users
+  /// need to make sure \p UpstreamDB outlives the current instance of
+  /// OnDiskGraphDB; the common usage is to have a \p UnifiedOnDiskCache
+  /// manage both.
   /// \param Policy If \p UpstreamDB is provided, controls how nodes are copied
   /// to primary store. This is recorded at creation time and subsequent opens
   /// need to pass the same policy otherwise the \p open will fail.
   static Expected<std::unique_ptr<OnDiskGraphDB>>
   open(StringRef Path, StringRef HashName, unsigned HashByteSize,
-       std::unique_ptr<OnDiskGraphDB> UpstreamDB = nullptr,
+       OnDiskGraphDB *UpstreamDB = nullptr,
        FaultInPolicy Policy = FaultInPolicy::FullTree);
 
   ~OnDiskGraphDB();
@@ -438,8 +441,7 @@ class OnDiskGraphDB {
 
   // Private constructor.
   OnDiskGraphDB(StringRef RootPath, OnDiskTrieRawHashMap Index,
-                OnDiskDataAllocator DataPool,
-                std::unique_ptr<OnDiskGraphDB> UpstreamDB,
+                OnDiskDataAllocator DataPool, OnDiskGraphDB *UpstreamDB,
                 FaultInPolicy Policy);
 
   /// Mapping from hash to object reference.
@@ -459,7 +461,7 @@ class OnDiskGraphDB {
   std::string RootPath;
 
   /// Optional on-disk store to be used for faulting-in nodes.
-  std::unique_ptr<OnDiskGraphDB> UpstreamDB;
+  OnDiskGraphDB *UpstreamDB = nullptr;
 
   /// The policy used to fault in data from upstream.
   FaultInPolicy FIPolicy;
diff --git a/llvm/include/llvm/CAS/OnDiskKeyValueDB.h b/llvm/include/llvm/CAS/OnDiskKeyValueDB.h
index b762518366c21..17ae52f0307fc 100644
--- a/llvm/include/llvm/CAS/OnDiskKeyValueDB.h
+++ b/llvm/include/llvm/CAS/OnDiskKeyValueDB.h
@@ -19,6 +19,8 @@
 
 namespace llvm::cas::ondisk {
 
+class UnifiedOnDiskCache;
+
 /// An on-disk key-value data store with the following properties:
 /// * Keys are fixed length binary hashes with expected normal distribution.
 /// * Values are buffers of the same size, specified at creation time.
@@ -59,9 +61,13 @@ class OnDiskKeyValueDB {
   /// \param KeySize Size for the key hash bytes.
   /// \param ValueName Identifier name for the values.
   /// \param ValueSize Size for the value bytes.
+  /// \param UnifiedCache An optional UnifiedOnDiskCache that manages the size
+  /// and lifetime of the CAS instance; if provided, it must own the
+  /// OnDiskKeyValueDB being initialized.
   static Expected<std::unique_ptr<OnDiskKeyValueDB>>
   open(StringRef Path, StringRef HashName, unsigned KeySize,
-       StringRef ValueName, size_t ValueSize);
+       StringRef ValueName, size_t ValueSize,
+       UnifiedOnDiskCache *UnifiedCache = nullptr);
 
   using CheckValueT =
       function_ref<Error(FileOffset Offset, ArrayRef<char> Data)>;
@@ -70,11 +76,14 @@ class OnDiskKeyValueDB {
   Error validate(CheckValueT CheckValue) const;
 
 private:
-  OnDiskKeyValueDB(size_t ValueSize, OnDiskTrieRawHashMap Cache)
-      : ValueSize(ValueSize), Cache(std::move(Cache)) {}
+  OnDiskKeyValueDB(size_t ValueSize, OnDiskTrieRawHashMap Cache,
+                   UnifiedOnDiskCache *UnifiedCache)
+      : ValueSize(ValueSize), Cache(std::move(Cache)),
+        UnifiedCache(UnifiedCache) {}
 
   const size_t ValueSize;
   OnDiskTrieRawHashMap Cache;
+  UnifiedOnDiskCache *UnifiedCache = nullptr;
 };
 
 } // namespace llvm::cas::ondisk
diff --git a/llvm/include/llvm/CAS/UnifiedOnDiskCache.h b/llvm/include/llvm/CAS/UnifiedOnDiskCache.h
new file mode 100644
index 0000000000000..6e0878a65fe72
--- /dev/null
+++ b/llvm/include/llvm/CAS/UnifiedOnDiskCache.h
@@ -0,0 +1,172 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CAS_UNIFIEDONDISKCACHE_H
+#define LLVM_CAS_UNIFIEDONDISKCACHE_H
+
+#include "llvm/CAS/BuiltinUnifiedCASDatabases.h"
+#include "llvm/CAS/OnDiskGraphDB.h"
+#include <atomic>
+
+namespace llvm::cas::ondisk {
+
+class OnDiskKeyValueDB;
+
+/// A unified CAS nodes and key-value database, using on-disk storage for both.
+/// It manages storage growth and provides APIs for garbage collection.
+///
+/// High-level properties:
+/// * While \p UnifiedOnDiskCache is open on a directory, by any process, the
+/// storage size in that directory will keep growing unrestricted. For data to
+/// become eligible for garbage-collection there should be no open instances
+/// of \p UnifiedOnDiskCache for that directory, by any process.
+/// * Garbage-collection needs to be triggered explicitly by the client. It can
+/// be triggered on a directory concurrently, at any time and by any process,
+/// without affecting any active readers/writers, in the same process or other
+/// processes.
+///
+/// Usage patterns should be that an instance of \p UnifiedOnDiskCache is open
+/// for a limited period of time, e.g. for the duration of a build operation.
+/// For long-living processes that need periodic access to a
+/// \p UnifiedOnDiskCache, the client should devise a scheme where access is
+/// performed within some defined period. For example, if a service is designed
+/// to continuously wait for requests that access a \p UnifiedOnDiskCache, it
+/// could keep the instance alive while new requests are coming in but close it
+/// after a time period in which there are no new requests.
+class UnifiedOnDiskCache {
+public:
+  /// The \p OnDiskGraphDB instance for the open directory.
+  OnDiskGraphDB &getGraphDB() { return *PrimaryGraphDB; }
+
+  /// The \p OnDiskKeyValueDB instance for the open directory.
+  OnDiskKeyValueDB &getKeyValueDB() { return *PrimaryKVDB; }
+
+  /// Open a \p UnifiedOnDiskCache instance for a directory.
+  ///
+  /// \param Path directory for the on-disk database. The directory will be
+  /// created if it doesn't exist.
+  /// \param SizeLimit Optional size for limiting growth. This takes effect
+  /// when the instance is closed.
+  /// \param HashName Identifier name for the hashing algorithm that is going to
+  /// be used.
+  /// \param HashByteSize Size for the object digest hash bytes.
+  /// \param FaultInPolicy Controls how nodes are copied to primary store. This
+  /// is recorded at creation time and subsequent opens need to pass the same
+  /// policy otherwise the \p open will fail.
+  static Expected<std::unique_ptr<UnifiedOnDiskCache>>
+  open(StringRef Path, std::optional<uint64_t> SizeLimit, StringRef HashName,
+       unsigned HashByteSize,
+       OnDiskGraphDB::FaultInPolicy FaultInPolicy =
+           OnDiskGraphDB::FaultInPolicy::FullTree);
+
+  /// Validate the data in \p Path, if needed to ensure correctness.
+  ///
+  /// Note: if invalid data is detected and \p AllowRecovery is true, then
+  /// recovery requires exclusive access to the CAS and it is an error to
+  /// attempt recovery if there is concurrent use of the CAS.
+  ///
+  /// \param Path directory for the on-disk database.
+  /// \param HashName Identifier name for the hashing algorithm that is going to
+  /// be used.
+  /// \param HashByteSize Size for the object digest hash bytes.
+  /// \param CheckHash Whether to validate that hashes match the data.
+  /// \param AllowRecovery Whether to automatically recover from invalid data by
+  /// marking the files for garbage collection.
+  /// \param ForceValidation Whether to force validation to occur even if it
+  /// should not be necessary.
+  /// \param LLVMCasBinary If provided, validation is performed out-of-process
+  /// using the given \c llvm-cas executable, which protects against crashes
+  /// during validation. Otherwise validation is performed in-process.
+  ///
+  /// \returns \c Valid if the data is already valid, \c Recovered if data
+  /// was invalid but has been cleared, \c Skipped if validation is not needed,
+  /// or an \c Error if validation cannot be performed or if the data is left
+  /// in an invalid state because \p AllowRecovery is false.
+  static Expected<ValidationResult>
+  validateIfNeeded(StringRef Path, StringRef HashName, unsigned HashByteSize,
+                   bool CheckHash, bool AllowRecovery, bool ForceValidation,
+                   std::optional<StringRef> LLVMCasBinary);
+
+  /// Close the databases. This is called implicitly at destruction time, so a
+  /// client is not required to call it. After calling \p close the only method
+  /// that is valid to call is \p needsGarbageCollection.
+  ///
+  /// \param CheckSizeLimit If true, check whether the primary store has
+  /// exceeded its intended size limit. If false, the check is skipped even if a
+  /// \p SizeLimit was passed to the \p open call.
+  Error close(bool CheckSizeLimit = true);
+
+  /// Set the size for limiting growth. This takes effect when the instance is
+  /// closed.
+  void setSizeLimit(std::optional<uint64_t> SizeLimit);
+
+  /// \returns the storage size of the cache data.
+  uint64_t getStorageSize() const;
+
+  /// \returns whether the primary store has exceeded the intended size limit.
+  /// This can return false even if the overall size of the opened directory is
+  /// over the \p SizeLimit passed to \p open. To know whether garbage
+  /// collection needs to be triggered or not, call \p needsGarbageCollection.
+  bool hasExceededSizeLimit() const;
+
+  /// \returns whether there is unused data that can be deleted using a
+  /// \p collectGarbage call.
+  bool needsGarbageCollection() const { return NeedsGarbageCollection; }
+
+  /// Remove any unused data from the directory at \p Path. If there is no such
+  /// data, the operation is a no-op.
+  ///
+  /// This can be called concurrently, regardless of whether there is an open
+  /// \p UnifiedOnDiskCache instance or not; it has no effect on readers/writers
+  /// in the same process or other processes.
+  ///
+  /// It is recommended that garbage-collection be triggered concurrently in the
+  /// background, so that it has minimal effect on the workload of the process.
+  static Error collectGarbage(StringRef Path);
+
+  /// Remove unused data from the current UnifiedOnDiskCache.
+  Error collectGarbage();
+
+  /// Helpers to convert between a value stored in the KeyValueDB and an
+  /// ObjectID.
+  static ObjectID getObjectIDFromValue(ArrayRef<char> Value);
+
+  using ValueBytes = std::array<char, sizeof(uint64_t)>;
+  static ValueBytes getValueFromObjectID(ObjectID ID);
+
+  ~UnifiedOnDiskCache();
+
+private:
+  friend class OnDiskGraphDB;
+  friend class OnDiskKeyValueDB;
+
+  UnifiedOnDiskCache();
+
+  Expected<std::optional<ArrayRef<char>>>
+  faultInFromUpstreamKV(ArrayRef<uint8_t> Key);
+
+  /// \returns the storage size of the primary directory.
+  uint64_t getPrimaryStorageSize() const;
+
+  std::string RootPath;
+  std::atomic<uint64_t> SizeLimit;
+
+  int LockFD = -1;
+
+  std::atomic<bool> NeedsGarbageCollection;
+  std::string PrimaryDBDir;
+
+  std::unique_ptr<OnDiskGraphDB> UpstreamGraphDB;
+  std::unique_ptr<OnDiskGraphDB> PrimaryGraphDB;
+
+  std::unique_ptr<OnDiskKeyValueDB> UpstreamKVDB;
+  std::unique_ptr<OnDiskKeyValueDB> PrimaryKVDB;
+};
+
+} // namespace llvm::cas::ondisk
+
+#endif // LLVM_CAS_UNIFIEDONDISKCACHE_H
diff --git a/llvm/lib/CAS/ActionCaches.cpp b/llvm/lib/CAS/ActionCaches.cpp
index 571c5b3ca5b4b..003c850275ff4 100644
--- a/llvm/lib/CAS/ActionCaches.cpp
+++ b/llvm/lib/CAS/ActionCaches.cpp
@@ -13,7 +13,11 @@
 #include "BuiltinCAS.h"
 #include "llvm/ADT/TrieRawHashMap.h"
 #include "llvm/CAS/ActionCache.h"
+#include "llvm/CAS/OnDiskKeyValueDB.h"
+#include "llvm/CAS/UnifiedOnDiskCache.h"
+#include "llvm/Config/llvm-config.h"
 #include "llvm/Support/BLAKE3.h"
+#include "llvm/Support/Errc.h"
 
 #define DEBUG_TYPE "cas-action-caches"
 
@@ -47,12 +51,54 @@ class InMemoryActionCache final : public ActionCache {
   Expected<std::optional<CASID>> getImpl(ArrayRef<uint8_t> ActionKey,
                                          bool CanBeDistributed) const final;
 
+  Error validate() const final {
+    return createStringError("InMemoryActionCache doesn't support validate()");
+  }
+
 private:
   using DataT = CacheEntry<sizeof(HashType)>;
   using InMemoryCacheT = ThreadSafeTrieRawHashMap<DataT, sizeof(HashType)>;
 
   InMemoryCacheT Cache;
 };
+
+/// Builtin basic OnDiskActionCache that uses one underlying OnDiskKeyValueDB.
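+/// (Keys are fixed-size request hashes; each stored value is a fixed-size
+/// \p CacheEntry holding the hash of the cached result.)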
+class OnDiskActionCache final : public ActionCache { +public: + Error putImpl(ArrayRef<uint8_t> ActionKey, const CASID &Result, + bool CanBeDistributed) final; + Expected<std::optional<CASID>> getImpl(ArrayRef<uint8_t> ActionKey, + bool CanBeDistributed) const final; + + static Expected<std::unique_ptr<OnDiskActionCache>> create(StringRef Path); + + Error validate() const final; + +private: + static StringRef getHashName() { return "BLAKE3"; } + + OnDiskActionCache(std::unique_ptr<ondisk::OnDiskKeyValueDB> DB); + + std::unique_ptr<ondisk::OnDiskKeyValueDB> DB; + using DataT = CacheEntry<sizeof(HashType)>; +}; + +/// Builtin unified ActionCache that wraps around UnifiedOnDiskCache to provide +/// access to its ActionCache. +class UnifiedOnDiskActionCache final : public ActionCache { +public: + Error putImpl(ArrayRef<uint8_t> ActionKey, const CASID &Result, + bool CanBeDistributed) final; + Expected<std::optional<CASID>> getImpl(ArrayRef<uint8_t> ActionKey, + bool CanBeDistributed) const final; + + UnifiedOnDiskActionCache(std::shared_ptr<ondisk::UnifiedOnDiskCache> UniDB); + + Error validate() const final; + +private: + std::shared_ptr<ondisk::UnifiedOnDiskCache> UniDB; +}; } // end namespace static Error createResultCachePoisonedError(ArrayRef<uint8_t> KeyHash, @@ -99,3 +145,123 @@ std::unique_ptr<ActionCache> createInMemoryActionCache() { } } // namespace llvm::cas + +OnDiskActionCache::OnDiskActionCache( + std::unique_ptr<ondisk::OnDiskKeyValueDB> DB) + : ActionCache(builtin::BuiltinCASContext::getDefaultContext()), + DB(std::move(DB)) {} + +Expected<std::unique_ptr<OnDiskActionCache>> +OnDiskActionCache::create(StringRef AbsPath) { + std::unique_ptr<ondisk::OnDiskKeyValueDB> DB; + if (Error E = ondisk::OnDiskKeyValueDB::open(AbsPath, getHashName(), + sizeof(HashType), getHashName(), + sizeof(DataT)) + .moveInto(DB)) + return std::move(E); + return std::unique_ptr<OnDiskActionCache>( + new OnDiskActionCache(std::move(DB))); +} + +Expected<std::optional<CASID>> +OnDiskActionCache::getImpl(ArrayRef<uint8_t> Key, + bool /*CanBeDistributed*/) const { + std::optional<ArrayRef<char>> Val; + if (Error E = DB->get(Key).moveInto(Val)) + return std::move(E); + if (!Val) + return std::nullopt; + return CASID::create(&getContext(), toStringRef(*Val)); +} + +Error OnDiskActionCache::putImpl(ArrayRef<uint8_t> Key, const CASID &Result, + bool /*CanBeDistributed*/) { + auto ResultHash = Result.getHash(); + ArrayRef Expected((const char *)ResultHash.data(), ResultHash.size()); + ArrayRef<char> Observed; + if (Error E = DB->put(Key, Expected).moveInto(Observed)) + return E; + + if (Expected == Observed) + return Error::success(); + + return createResultCachePoisonedError( + Key, getContext(), Result, + ArrayRef((const uint8_t *)Observed.data(), Observed.size())); +} + +Error OnDiskActionCache::validate() const { + // FIXME: without the matching CAS there is nothing we can check about the + // cached values. The hash size is already validated by the DB validator. 
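+  // A null check-value callback means no per-value checking happens here
+  // beyond the DB's own generic record validation.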
+ return DB->validate(nullptr); +} + +UnifiedOnDiskActionCache::UnifiedOnDiskActionCache( + std::shared_ptr<ondisk::UnifiedOnDiskCache> UniDB) + : ActionCache(builtin::BuiltinCASContext::getDefaultContext()), + UniDB(std::move(UniDB)) {} + +Expected<std::optional<CASID>> +UnifiedOnDiskActionCache::getImpl(ArrayRef<uint8_t> Key, + bool /*CanBeDistributed*/) const { + std::optional<ArrayRef<char>> Val; + if (Error E = UniDB->getKeyValueDB().get(Key).moveInto(Val)) + return std::move(E); + if (!Val) + return std::nullopt; + auto ID = ondisk::UnifiedOnDiskCache::getObjectIDFromValue(*Val); + return CASID::create(&getContext(), + toStringRef(UniDB->getGraphDB().getDigest(ID))); +} + +Error UnifiedOnDiskActionCache::putImpl(ArrayRef<uint8_t> Key, + const CASID &Result, + bool /*CanBeDistributed*/) { + auto Expected = UniDB->getGraphDB().getReference(Result.getHash()); + if (LLVM_UNLIKELY(!Expected)) + return Expected.takeError(); + + auto Value = ondisk::UnifiedOnDiskCache::getValueFromObjectID(*Expected); + std::optional<ArrayRef<char>> Observed; + if (Error E = UniDB->getKeyValueDB().put(Key, Value).moveInto(Observed)) + return E; + + auto ObservedID = ondisk::UnifiedOnDiskCache::getObjectIDFromValue(*Observed); + if (*Expected == ObservedID) + return Error::success(); + + return createResultCachePoisonedError( + Key, getContext(), Result, UniDB->getGraphDB().getDigest(ObservedID)); +} + +Error UnifiedOnDiskActionCache::validate() const { + auto ValidateRef = [](FileOffset Offset, ArrayRef<char> Value) -> Error { + auto ID = ondisk::UnifiedOnDiskCache::getObjectIDFromValue(Value); + auto formatError = [&](Twine Msg) { + return createStringError( + llvm::errc::illegal_byte_sequence, + "bad record at 0x" + + utohexstr((unsigned)Offset.get(), /*LowerCase=*/true) + ": " + + Msg.str()); + }; + if (ID.getOpaqueData() == 0) + return formatError("zero is not a valid ref"); + return Error::success(); + }; + return UniDB->getKeyValueDB().validate(ValidateRef); +} + +Expected<std::unique_ptr<ActionCache>> +cas::createOnDiskActionCache(StringRef Path) { +#if LLVM_ENABLE_ONDISK_CAS + return OnDiskActionCache::create(Path); +#else + return createStringError(inconvertibleErrorCode(), "OnDiskCache is disabled"); +#endif +} + +std::unique_ptr<ActionCache> +cas::builtin::createActionCacheFromUnifiedOnDiskCache( + std::shared_ptr<ondisk::UnifiedOnDiskCache> UniDB) { + return std::make_unique<UnifiedOnDiskActionCache>(std::move(UniDB)); +} diff --git a/llvm/lib/CAS/BuiltinCAS.cpp b/llvm/lib/CAS/BuiltinCAS.cpp index 73646ad2c3528..e9bc6d8beed4e 100644 --- a/llvm/lib/CAS/BuiltinCAS.cpp +++ b/llvm/lib/CAS/BuiltinCAS.cpp @@ -9,6 +9,7 @@ #include "BuiltinCAS.h" #include "llvm/ADT/StringExtras.h" #include "llvm/CAS/BuiltinObjectHasher.h" +#include "llvm/CAS/UnifiedOnDiskCache.h" #include "llvm/Support/Process.h" using namespace llvm; @@ -68,7 +69,7 @@ Expected<ObjectRef> BuiltinCAS::store(ArrayRef<ObjectRef> Refs, Refs, Data); } -Error BuiltinCAS::validate(const CASID &ID) { +Error BuiltinCAS::validateObject(const CASID &ID) { auto Ref = getReference(ID); if (!Ref) return createUnknownObjectError(ID); @@ -92,3 +93,14 @@ Error BuiltinCAS::validate(const CASID &ID) { return Error::success(); } + +Expected<std::unique_ptr<ondisk::UnifiedOnDiskCache>> +cas::builtin::createBuiltinUnifiedOnDiskCache(StringRef Path) { +#if LLVM_ENABLE_ONDISK_CAS + return ondisk::UnifiedOnDiskCache::open(Path, /*SizeLimit=*/std::nullopt, + BuiltinCASContext::getHashName(), + sizeof(HashType)); +#else + return 
createStringError(inconvertibleErrorCode(), "OnDiskCache is disabled"); +#endif +} diff --git a/llvm/lib/CAS/BuiltinCAS.h b/llvm/lib/CAS/BuiltinCAS.h index 3b5374d5e1850..4d2de66cf636f 100644 --- a/llvm/lib/CAS/BuiltinCAS.h +++ b/llvm/lib/CAS/BuiltinCAS.h @@ -1,4 +1,4 @@ -//===- BuiltinCAS.h ---------------------------------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -15,6 +15,9 @@ namespace llvm::cas { class ActionCache; +namespace ondisk { +class UnifiedOnDiskCache; +} // namespace ondisk namespace builtin { /// Common base class for builtin CAS implementations using the same CASContext. @@ -65,9 +68,27 @@ class BuiltinCAS : public ObjectStore { "corrupt storage"); } - Error validate(const CASID &ID) final; + Error validateObject(const CASID &ID) final; }; +/// Create a \p UnifiedOnDiskCache instance that uses \p BLAKE3 hashing. +Expected<std::unique_ptr<ondisk::UnifiedOnDiskCache>> +createBuiltinUnifiedOnDiskCache(StringRef Path); + +/// \param UniDB A \p UnifiedOnDiskCache instance from \p +/// createBuiltinUnifiedOnDiskCache. +std::unique_ptr<ObjectStore> createObjectStoreFromUnifiedOnDiskCache( + std::shared_ptr<ondisk::UnifiedOnDiskCache> UniDB); + +/// \param UniDB A \p UnifiedOnDiskCache instance from \p +/// createBuiltinUnifiedOnDiskCache. +std::unique_ptr<ActionCache> createActionCacheFromUnifiedOnDiskCache( + std::shared_ptr<ondisk::UnifiedOnDiskCache> UniDB); + +// FIXME: Proxy not portable. Maybe also error-prone? +constexpr StringLiteral DefaultDirProxy = "/^llvm::cas::builtin::default"; +constexpr StringLiteral DefaultDir = "llvm.cas.builtin.default"; + } // end namespace builtin } // end namespace llvm::cas diff --git a/llvm/lib/CAS/BuiltinUnifiedCASDatabases.cpp b/llvm/lib/CAS/BuiltinUnifiedCASDatabases.cpp new file mode 100644 index 0000000000000..f3f6fa043bc52 --- /dev/null +++ b/llvm/lib/CAS/BuiltinUnifiedCASDatabases.cpp @@ -0,0 +1,38 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/CAS/BuiltinUnifiedCASDatabases.h" +#include "BuiltinCAS.h" +#include "llvm/CAS/ActionCache.h" +#include "llvm/CAS/UnifiedOnDiskCache.h" + +using namespace llvm; +using namespace llvm::cas; + +Expected<std::pair<std::unique_ptr<ObjectStore>, std::unique_ptr<ActionCache>>> +cas::createOnDiskUnifiedCASDatabases(StringRef Path) { + std::shared_ptr<ondisk::UnifiedOnDiskCache> UniDB; + if (Error E = builtin::createBuiltinUnifiedOnDiskCache(Path).moveInto(UniDB)) + return std::move(E); + auto CAS = builtin::createObjectStoreFromUnifiedOnDiskCache(UniDB); + auto AC = builtin::createActionCacheFromUnifiedOnDiskCache(std::move(UniDB)); + return std::make_pair(std::move(CAS), std::move(AC)); +} + +Expected<ValidationResult> cas::validateOnDiskUnifiedCASDatabasesIfNeeded( + StringRef Path, bool CheckHash, bool AllowRecovery, bool ForceValidation, + std::optional<StringRef> LLVMCasBinary) { +#if LLVM_ENABLE_ONDISK_CAS + return ondisk::UnifiedOnDiskCache::validateIfNeeded( + Path, builtin::BuiltinCASContext::getHashName(), + sizeof(builtin::HashType), CheckHash, AllowRecovery, ForceValidation, + LLVMCasBinary); +#else + return createStringError(inconvertibleErrorCode(), "OnDiskCache is disabled"); +#endif +} diff --git a/llvm/lib/CAS/CMakeLists.txt b/llvm/lib/CAS/CMakeLists.txt index a2f8c49e50145..aad77dce370d8 100644 --- a/llvm/lib/CAS/CMakeLists.txt +++ b/llvm/lib/CAS/CMakeLists.txt @@ -2,15 +2,18 @@ add_llvm_component_library(LLVMCAS ActionCache.cpp ActionCaches.cpp BuiltinCAS.cpp + BuiltinUnifiedCASDatabases.cpp DatabaseFile.cpp InMemoryCAS.cpp MappedFileRegionArena.cpp ObjectStore.cpp + OnDiskCAS.cpp OnDiskCommon.cpp OnDiskDataAllocator.cpp OnDiskGraphDB.cpp OnDiskKeyValueDB.cpp OnDiskTrieRawHashMap.cpp + UnifiedOnDiskCache.cpp ADDITIONAL_HEADER_DIRS ${LLVM_MAIN_INCLUDE_DIR}/llvm/CAS diff --git a/llvm/lib/CAS/InMemoryCAS.cpp b/llvm/lib/CAS/InMemoryCAS.cpp index c63ee70de0849..2d4eedd5bdc8f 100644 --- a/llvm/lib/CAS/InMemoryCAS.cpp +++ b/llvm/lib/CAS/InMemoryCAS.cpp @@ -233,6 +233,12 @@ class InMemoryCAS : public BuiltinCAS { return cast<InMemoryObject>(asInMemoryObject(Node)).getData(); } + void print(raw_ostream &OS) const final; + + Error validate(bool CheckHash) const final { + return createStringError("InMemoryCAS doesn't support validate()"); + } + InMemoryCAS() = default; private: @@ -271,6 +277,8 @@ ArrayRef<const InMemoryObject *> InMemoryObject::getRefs() const { return cast<InMemoryInlineObject>(this)->getRefsImpl(); } +void InMemoryCAS::print(raw_ostream &OS) const {} + Expected<ObjectRef> InMemoryCAS::storeFromNullTerminatedRegion(ArrayRef<uint8_t> ComputedHash, sys::fs::mapped_file_region Map) { diff --git a/llvm/lib/CAS/ObjectStore.cpp b/llvm/lib/CAS/ObjectStore.cpp index e0be50bbe013a..3110577e03774 100644 --- a/llvm/lib/CAS/ObjectStore.cpp +++ b/llvm/lib/CAS/ObjectStore.cpp @@ -1,4 +1,4 @@ -//===- ObjectStore.cpp ------------------------------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
@@ -12,7 +12,7 @@
 #include "llvm/Support/Errc.h"
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Support/MemoryBuffer.h"
-#include <optional>
+#include <deque>
 
 using namespace llvm;
 using namespace llvm::cas;
@@ -21,6 +21,7 @@ void CASContext::anchor() {}
 void ObjectStore::anchor() {}
 
 LLVM_DUMP_METHOD void CASID::dump() const { print(dbgs()); }
+LLVM_DUMP_METHOD void ObjectStore::dump() const { print(dbgs()); }
 LLVM_DUMP_METHOD void ObjectRef::dump() const { print(dbgs()); }
 LLVM_DUMP_METHOD void ObjectHandle::dump() const { print(dbgs()); }
 
@@ -141,7 +142,7 @@ Error ObjectStore::validateTree(ObjectRef Root) {
     auto [I, Inserted] = ValidatedRefs.insert(Ref);
     if (!Inserted)
       continue; // already validated.
-    if (Error E = validate(getID(Ref)))
+    if (Error E = validateObject(getID(Ref)))
      return E;
     Expected<ObjectHandle> Obj = load(Ref);
     if (!Obj)
@@ -155,6 +156,92 @@ Error ObjectStore::validateTree(ObjectRef Root) {
   return Error::success();
 }
 
+Expected<ObjectRef> ObjectStore::importObject(ObjectStore &Upstream,
+                                              ObjectRef Other) {
+  // Copy the full CAS tree from upstream in depth-first order to ensure all
+  // the child nodes are available in the downstream CAS before inserting the
+  // current object. This uses an algorithm similar to
+  // `OnDiskGraphDB::importFullTree` but doesn't assume the upstream CAS
+  // schema, so it can be used to import from any other ObjectStore regardless
+  // of the CAS schema.
+
+  // There is no work to do if importing from self.
+  if (this == &Upstream)
+    return Other;
+
+  /// Keeps track of the visitation state for the current node and all of its
+  /// parents. An UpstreamCursor holds information only from the upstream CAS.
+  struct UpstreamCursor {
+    ObjectRef Ref;
+    ObjectHandle Node;
+    size_t RefsCount;
+    std::deque<ObjectRef> Refs;
+  };
+  SmallVector<UpstreamCursor, 16> CursorStack;
+  /// PrimaryRefStack holds ObjectRefs in the current CAS, for nodes that were
+  /// either just stored or already exist in the current CAS.
+  SmallVector<ObjectRef, 128> PrimaryRefStack;
+  /// A map from upstream ObjectRef to current ObjectRef.
+  llvm::DenseMap<ObjectRef, ObjectRef> CreatedObjects;
+
+  auto enqueueNode = [&](ObjectRef Ref, ObjectHandle Node) {
+    unsigned NumRefs = Upstream.getNumRefs(Node);
+    std::deque<ObjectRef> Refs;
+    for (unsigned I = 0; I < NumRefs; ++I)
+      Refs.push_back(Upstream.readRef(Node, I));
+
+    CursorStack.push_back({Ref, Node, NumRefs, std::move(Refs)});
+  };
+
+  auto UpstreamHandle = Upstream.load(Other);
+  if (!UpstreamHandle)
+    return UpstreamHandle.takeError();
+  enqueueNode(Other, *UpstreamHandle);
+
+  while (!CursorStack.empty()) {
+    UpstreamCursor &Cur = CursorStack.back();
+    if (Cur.Refs.empty()) {
+      // Copy the node data into the primary store.
+      // The top of \p PrimaryRefStack contains the ObjectRefs for the current
+      // node's children.
+      assert(PrimaryRefStack.size() >= Cur.RefsCount);
+      auto Refs = ArrayRef(PrimaryRefStack)
+                      .slice(PrimaryRefStack.size() - Cur.RefsCount);
+      auto NewNode = store(Refs, Upstream.getData(Cur.Node));
+      if (!NewNode)
+        return NewNode.takeError();
+
+      // Remove the current node and its IDs from the stack.
+      PrimaryRefStack.truncate(PrimaryRefStack.size() - Cur.RefsCount);
+      CursorStack.pop_back();
+
+      PrimaryRefStack.push_back(*NewNode);
+      CreatedObjects.try_emplace(Cur.Ref, *NewNode);
+      continue;
+    }
+
+    // Check if the node exists already.
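+    // (An upstream node may be reachable through multiple parents;
+    // CreatedObjects memoizes the upstream-to-primary mapping so each
+    // upstream node is loaded and stored at most once.)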
+ auto CurrentID = Cur.Refs.front(); + Cur.Refs.pop_front(); + auto Ref = CreatedObjects.find(CurrentID); + if (Ref != CreatedObjects.end()) { + // If exists already, just need to enqueue the primary node. + PrimaryRefStack.push_back(Ref->second); + continue; + } + + // Load child. + auto PrimaryID = Upstream.load(CurrentID); + if (LLVM_UNLIKELY(!PrimaryID)) + return PrimaryID.takeError(); + + enqueueNode(CurrentID, *PrimaryID); + } + + assert(PrimaryRefStack.size() == 1); + return PrimaryRefStack.front(); +} + std::unique_ptr<MemoryBuffer> ObjectProxy::getMemoryBuffer(StringRef Name, bool RequiresNullTerminator) const { diff --git a/llvm/lib/CAS/OnDiskCAS.cpp b/llvm/lib/CAS/OnDiskCAS.cpp new file mode 100644 index 0000000000000..7d29f4499211e --- /dev/null +++ b/llvm/lib/CAS/OnDiskCAS.cpp @@ -0,0 +1,211 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "BuiltinCAS.h" +#include "llvm/CAS/BuiltinCASContext.h" +#include "llvm/CAS/BuiltinObjectHasher.h" +#include "llvm/CAS/OnDiskGraphDB.h" +#include "llvm/CAS/UnifiedOnDiskCache.h" +#include "llvm/Support/Compiler.h" +#include "llvm/Support/Error.h" + +using namespace llvm; +using namespace llvm::cas; +using namespace llvm::cas::builtin; + +namespace { + +class OnDiskCAS : public BuiltinCAS { +public: + Expected<ObjectRef> storeImpl(ArrayRef<uint8_t> ComputedHash, + ArrayRef<ObjectRef> Refs, + ArrayRef<char> Data) final; + + Expected<std::optional<ObjectHandle>> loadIfExists(ObjectRef Ref) final; + + CASID getID(ObjectRef Ref) const final; + + std::optional<ObjectRef> getReference(const CASID &ID) const final; + + Expected<bool> isMaterialized(ObjectRef Ref) const final; + + ArrayRef<char> getDataConst(ObjectHandle Node) const final; + + void print(raw_ostream &OS) const final; + Error validate(bool CheckHash) const final; + + static Expected<std::unique_ptr<OnDiskCAS>> open(StringRef Path); + + OnDiskCAS(std::shared_ptr<ondisk::UnifiedOnDiskCache> UniDB) + : UnifiedDB(std::move(UniDB)), DB(&UnifiedDB->getGraphDB()) {} + +private: + ObjectHandle convertHandle(ondisk::ObjectHandle Node) const { + return makeObjectHandle(Node.getOpaqueData()); + } + + ondisk::ObjectHandle convertHandle(ObjectHandle Node) const { + return ondisk::ObjectHandle(Node.getInternalRef(*this)); + } + + ObjectRef convertRef(ondisk::ObjectID Ref) const { + return makeObjectRef(Ref.getOpaqueData()); + } + + ondisk::ObjectID convertRef(ObjectRef Ref) const { + return ondisk::ObjectID::fromOpaqueData(Ref.getInternalRef(*this)); + } + + size_t getNumRefs(ObjectHandle Node) const final { + auto RefsRange = DB->getObjectRefs(convertHandle(Node)); + return std::distance(RefsRange.begin(), RefsRange.end()); + } + + ObjectRef readRef(ObjectHandle Node, size_t I) const final { + auto RefsRange = DB->getObjectRefs(convertHandle(Node)); + return convertRef(RefsRange.begin()[I]); + } + + Error forEachRef(ObjectHandle Node, + function_ref<Error(ObjectRef)> Callback) const final; + + Error setSizeLimit(std::optional<uint64_t> SizeLimit) final; + Expected<std::optional<uint64_t>> getStorageSize() const final; + Error pruneStorageData() final; + + OnDiskCAS(std::unique_ptr<ondisk::OnDiskGraphDB> GraphDB) + : OwnedDB(std::move(GraphDB)), DB(OwnedDB.get()) 
{} + + std::unique_ptr<ondisk::OnDiskGraphDB> OwnedDB; + std::shared_ptr<ondisk::UnifiedOnDiskCache> UnifiedDB; + ondisk::OnDiskGraphDB *DB; +}; + +} // end anonymous namespace + +void OnDiskCAS::print(raw_ostream &OS) const { DB->print(OS); } +Error OnDiskCAS::validate(bool CheckHash) const { + auto Hasher = [](ArrayRef<ArrayRef<uint8_t>> Refs, ArrayRef<char> Data, + SmallVectorImpl<uint8_t> &Result) { + auto Hash = BuiltinObjectHasher<llvm::cas::builtin::HasherT>::hashObject( + Refs, Data); + Result.assign(Hash.begin(), Hash.end()); + }; + + if (auto E = DB->validate(CheckHash, Hasher)) + return E; + + return Error::success(); +} + +CASID OnDiskCAS::getID(ObjectRef Ref) const { + ArrayRef<uint8_t> Hash = DB->getDigest(convertRef(Ref)); + return CASID::create(&getContext(), toStringRef(Hash)); +} + +std::optional<ObjectRef> OnDiskCAS::getReference(const CASID &ID) const { + std::optional<ondisk::ObjectID> ObjID = + DB->getExistingReference(ID.getHash()); + if (!ObjID) + return std::nullopt; + return convertRef(*ObjID); +} + +Expected<bool> OnDiskCAS::isMaterialized(ObjectRef ExternalRef) const { + return DB->isMaterialized(convertRef(ExternalRef)); +} + +ArrayRef<char> OnDiskCAS::getDataConst(ObjectHandle Node) const { + return DB->getObjectData(convertHandle(Node)); +} + +Expected<std::optional<ObjectHandle>> +OnDiskCAS::loadIfExists(ObjectRef ExternalRef) { + Expected<std::optional<ondisk::ObjectHandle>> ObjHnd = + DB->load(convertRef(ExternalRef)); + if (!ObjHnd) + return ObjHnd.takeError(); + if (!*ObjHnd) + return std::nullopt; + return convertHandle(**ObjHnd); +} + +Expected<ObjectRef> OnDiskCAS::storeImpl(ArrayRef<uint8_t> ComputedHash, + ArrayRef<ObjectRef> Refs, + ArrayRef<char> Data) { + SmallVector<ondisk::ObjectID, 64> IDs; + IDs.reserve(Refs.size()); + for (ObjectRef Ref : Refs) { + IDs.push_back(convertRef(Ref)); + } + + auto StoredID = DB->getReference(ComputedHash); + if (LLVM_UNLIKELY(!StoredID)) + return StoredID.takeError(); + if (Error E = DB->store(*StoredID, IDs, Data)) + return std::move(E); + return convertRef(*StoredID); +} + +Error OnDiskCAS::forEachRef(ObjectHandle Node, + function_ref<Error(ObjectRef)> Callback) const { + auto RefsRange = DB->getObjectRefs(convertHandle(Node)); + for (ondisk::ObjectID Ref : RefsRange) { + if (Error E = Callback(convertRef(Ref))) + return E; + } + return Error::success(); +} + +Error OnDiskCAS::setSizeLimit(std::optional<uint64_t> SizeLimit) { + UnifiedDB->setSizeLimit(SizeLimit); + return Error::success(); +} + +Expected<std::optional<uint64_t>> OnDiskCAS::getStorageSize() const { + return UnifiedDB->getStorageSize(); +} + +Error OnDiskCAS::pruneStorageData() { return UnifiedDB->collectGarbage(); } + +Expected<std::unique_ptr<OnDiskCAS>> OnDiskCAS::open(StringRef AbsPath) { + Expected<std::unique_ptr<ondisk::OnDiskGraphDB>> DB = + ondisk::OnDiskGraphDB::open(AbsPath, BuiltinCASContext::getHashName(), + sizeof(HashType)); + if (!DB) + return DB.takeError(); + return std::unique_ptr<OnDiskCAS>(new OnDiskCAS(std::move(*DB))); +} + +bool cas::isOnDiskCASEnabled() { +#if LLVM_ENABLE_ONDISK_CAS + return true; +#else + return false; +#endif +} + +Expected<std::unique_ptr<ObjectStore>> cas::createOnDiskCAS(const Twine &Path) { +#if LLVM_ENABLE_ONDISK_CAS + // FIXME: An absolute path isn't really good enough. Should open a directory + // and use openat() for files underneath. 
+ SmallString<256> AbsPath; + Path.toVector(AbsPath); + sys::fs::make_absolute(AbsPath); + + return OnDiskCAS::open(AbsPath); +#else + return createStringError(inconvertibleErrorCode(), "OnDiskCAS is disabled"); +#endif /* LLVM_ENABLE_ONDISK_CAS */ +} + +std::unique_ptr<ObjectStore> +cas::builtin::createObjectStoreFromUnifiedOnDiskCache( + std::shared_ptr<ondisk::UnifiedOnDiskCache> UniDB) { + return std::make_unique<OnDiskCAS>(std::move(UniDB)); +} diff --git a/llvm/lib/CAS/OnDiskGraphDB.cpp b/llvm/lib/CAS/OnDiskGraphDB.cpp index 64cbe9dc8e159..245b6fb832549 100644 --- a/llvm/lib/CAS/OnDiskGraphDB.cpp +++ b/llvm/lib/CAS/OnDiskGraphDB.cpp @@ -893,6 +893,10 @@ int64_t DataRecordHandle::getDataRelOffset() const { } Error OnDiskGraphDB::validate(bool Deep, HashingFuncT Hasher) const { + if (UpstreamDB) { + if (auto E = UpstreamDB->validate(Deep, Hasher)) + return E; + } return Index.validate([&](FileOffset Offset, OnDiskTrieRawHashMap::ConstValueProxy Record) -> Error { @@ -1202,11 +1206,8 @@ OnDiskGraphDB::load(ObjectID ExternalRef) { return I.takeError(); TrieRecord::Data Object = I->Ref.load(); - if (Object.SK == TrieRecord::StorageKind::Unknown) { - if (!UpstreamDB) - return std::nullopt; + if (Object.SK == TrieRecord::StorageKind::Unknown) return faultInFromUpstream(ExternalRef); - } if (Object.SK == TrieRecord::StorageKind::DataPool) return ObjectHandle::fromFileOffset(Object.Offset); @@ -1286,8 +1287,10 @@ OnDiskGraphDB::getObjectPresence(ObjectID ExternalRef, TrieRecord::Data Object = I->Ref.load(); if (Object.SK != TrieRecord::StorageKind::Unknown) return ObjectPresence::InPrimaryDB; + if (!CheckUpstream || !UpstreamDB) return ObjectPresence::Missing; + std::optional<ObjectID> UpstreamID = UpstreamDB->getExistingReference(getDigest(*I)); return UpstreamID.has_value() ? ObjectPresence::OnlyInUpstreamDB @@ -1549,9 +1552,10 @@ unsigned OnDiskGraphDB::getHardStorageLimitUtilization() const { return std::max(IndexPercent, DataPercent); } -Expected<std::unique_ptr<OnDiskGraphDB>> OnDiskGraphDB::open( - StringRef AbsPath, StringRef HashName, unsigned HashByteSize, - std::unique_ptr<OnDiskGraphDB> UpstreamDB, FaultInPolicy Policy) { +Expected<std::unique_ptr<OnDiskGraphDB>> +OnDiskGraphDB::open(StringRef AbsPath, StringRef HashName, + unsigned HashByteSize, OnDiskGraphDB *UpstreamDB, + FaultInPolicy Policy) { if (std::error_code EC = sys::fs::create_directories(AbsPath)) return createFileError(AbsPath, EC); @@ -1604,18 +1608,15 @@ Expected<std::unique_ptr<OnDiskGraphDB>> OnDiskGraphDB::open( "unexpected user header in '" + DataPoolPath + "'"); - return std::unique_ptr<OnDiskGraphDB>( - new OnDiskGraphDB(AbsPath, std::move(*Index), std::move(*DataPool), - std::move(UpstreamDB), Policy)); + return std::unique_ptr<OnDiskGraphDB>(new OnDiskGraphDB( + AbsPath, std::move(*Index), std::move(*DataPool), UpstreamDB, Policy)); } OnDiskGraphDB::OnDiskGraphDB(StringRef RootPath, OnDiskTrieRawHashMap Index, OnDiskDataAllocator DataPool, - std::unique_ptr<OnDiskGraphDB> UpstreamDB, - FaultInPolicy Policy) + OnDiskGraphDB *UpstreamDB, FaultInPolicy Policy) : Index(std::move(Index)), DataPool(std::move(DataPool)), - RootPath(RootPath.str()), UpstreamDB(std::move(UpstreamDB)), - FIPolicy(Policy) { + RootPath(RootPath.str()), UpstreamDB(UpstreamDB), FIPolicy(Policy) { /// Lifetime for "big" objects not in DataPool. /// /// NOTE: Could use ThreadSafeTrieRawHashMap here. 
For now, doing something @@ -1638,7 +1639,6 @@ Error OnDiskGraphDB::importFullTree(ObjectID PrimaryID, // against the process dying during importing and leaving the database with an // incomplete tree. Note that if the upstream has missing nodes then the tree // will be copied with missing nodes as well, it won't be considered an error. - struct UpstreamCursor { ObjectHandle Node; size_t RefsCount; @@ -1720,7 +1720,6 @@ Error OnDiskGraphDB::importSingleNode(ObjectID PrimaryID, // Copy the node data into the primary store. // FIXME: Use hard-link or cloning if the file-system supports it and data is // stored into a separate file. - auto Data = UpstreamDB->getObjectData(UpstreamNode); auto UpstreamRefs = UpstreamDB->getObjectRefs(UpstreamNode); SmallVector<ObjectID, 64> Refs; @@ -1737,7 +1736,8 @@ Error OnDiskGraphDB::importSingleNode(ObjectID PrimaryID, Expected<std::optional<ObjectHandle>> OnDiskGraphDB::faultInFromUpstream(ObjectID PrimaryID) { - assert(UpstreamDB); + if (!UpstreamDB) + return std::nullopt; auto UpstreamID = UpstreamDB->getReference(getDigest(PrimaryID)); if (LLVM_UNLIKELY(!UpstreamID)) diff --git a/llvm/lib/CAS/OnDiskKeyValueDB.cpp b/llvm/lib/CAS/OnDiskKeyValueDB.cpp index 21860717da3bf..15656cb38a5e5 100644 --- a/llvm/lib/CAS/OnDiskKeyValueDB.cpp +++ b/llvm/lib/CAS/OnDiskKeyValueDB.cpp @@ -20,6 +20,7 @@ #include "llvm/CAS/OnDiskKeyValueDB.h" #include "OnDiskCommon.h" #include "llvm/ADT/StringExtras.h" +#include "llvm/CAS/UnifiedOnDiskCache.h" #include "llvm/Support/Alignment.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/Errc.h" @@ -53,15 +54,21 @@ Expected<std::optional<ArrayRef<char>>> OnDiskKeyValueDB::get(ArrayRef<uint8_t> Key) { // Check the result cache. OnDiskTrieRawHashMap::ConstOnDiskPtr ActionP = Cache.find(Key); - if (!ActionP) + if (ActionP) { + assert(isAddrAligned(Align(8), ActionP->Data.data())); + return ActionP->Data; + } + if (!UnifiedCache || !UnifiedCache->UpstreamKVDB) return std::nullopt; - assert(isAddrAligned(Align(8), ActionP->Data.data())); - return ActionP->Data; + + // Try to fault in from upstream. + return UnifiedCache->faultInFromUpstreamKV(Key); } Expected<std::unique_ptr<OnDiskKeyValueDB>> OnDiskKeyValueDB::open(StringRef Path, StringRef HashName, unsigned KeySize, - StringRef ValueName, size_t ValueSize) { + StringRef ValueName, size_t ValueSize, + UnifiedOnDiskCache *Cache) { if (std::error_code EC = sys::fs::create_directories(Path)) return createFileError(Path, EC); @@ -87,10 +94,14 @@ OnDiskKeyValueDB::open(StringRef Path, StringRef HashName, unsigned KeySize, return std::move(E); return std::unique_ptr<OnDiskKeyValueDB>( - new OnDiskKeyValueDB(ValueSize, std::move(*ActionCache))); + new OnDiskKeyValueDB(ValueSize, std::move(*ActionCache), Cache)); } Error OnDiskKeyValueDB::validate(CheckValueT CheckValue) const { + if (UnifiedCache && UnifiedCache->UpstreamKVDB) { + if (auto E = UnifiedCache->UpstreamKVDB->validate(CheckValue)) + return E; + } return Cache.validate( [&](FileOffset Offset, OnDiskTrieRawHashMap::ConstValueProxy Record) -> Error { diff --git a/llvm/lib/CAS/UnifiedOnDiskCache.cpp b/llvm/lib/CAS/UnifiedOnDiskCache.cpp new file mode 100644 index 0000000000000..ae9d818241f4b --- /dev/null +++ b/llvm/lib/CAS/UnifiedOnDiskCache.cpp @@ -0,0 +1,613 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// Encapsulates \p OnDiskGraphDB and \p OnDiskKeyValueDB instances within one
+/// directory while also restricting storage growth with a scheme of chaining
+/// the two most recent directories (primary & upstream), where the primary
+/// "faults-in" data from the upstream one. When the primary (most recent)
+/// directory exceeds its intended limit, a new empty directory becomes the
+/// primary one.
+///
+/// Within the top-level directory (the path that \p UnifiedOnDiskCache::open
+/// receives) there are directories named like this:
+///
+/// 'v<version>.<x>'
+/// 'v<version>.<x+1>'
+/// 'v<version>.<x+2>'
+/// ...
+///
+/// 'version' is the version integer for this \p UnifiedOnDiskCache's scheme and
+/// the part after the dot is an increasing integer. The primary directory is
+/// the one with the highest integer and the upstream one is the directory
+/// before it. For example, if the sub-directories contained are:
+///
+/// 'v1.5', 'v1.6', 'v1.7', 'v1.8'
+///
+/// Then the primary one is 'v1.8', the upstream one is 'v1.7', and the rest are
+/// unused directories that can be safely deleted at any time and by any
+/// process.
+///
+/// Contained within the top-level directory is a file named "lock" which is
+/// used for processes to take shared or exclusive locks for the contents of the
+/// top directory. While a \p UnifiedOnDiskCache is open it keeps a shared lock
+/// for the top-level directory; when it closes, if the primary sub-directory
+/// exceeded its limit, it attempts to get an exclusive lock in order to create
+/// a new empty primary directory; if it can't get the exclusive lock it gives
+/// up and lets the next \p UnifiedOnDiskCache instance that closes attempt
+/// again.
+///
+/// The downside of this scheme is that while \p UnifiedOnDiskCache is open on a
+/// directory, by any process, the storage size in that directory will keep
+/// growing unrestricted. But the major benefit is that garbage-collection can
+/// be triggered on a directory concurrently, at any time and by any process,
+/// without affecting any active readers/writers in the same process or other
+/// processes.
+///
+/// The \c UnifiedOnDiskCache also provides validation and recovery on top of
+/// the underlying on-disk storage. The low-level storage is designed to remain
+/// coherent across regular process crashes, but may be invalid after power loss
+/// or similar system failures. \c UnifiedOnDiskCache::validateIfNeeded allows
+/// validating the contents once per boot and can recover by marking invalid
+/// data for garbage collection.
+///
+/// The data recovery described above requires exclusive access to the CAS, and
+/// it is an error to attempt recovery if the CAS is open in any process/thread.
+/// In order to maximize backwards compatibility with tools that do not perform
+/// validation before opening the CAS, we do not attempt to get exclusive access
+/// until recovery is actually performed, meaning as long as the data is valid
+/// it will not conflict with concurrent use.
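+///
+/// A minimal usage sketch, for illustration only (the hash name and byte size
+/// below are assumptions; real clients take them from their CAS context, e.g.
+/// via \p createBuiltinUnifiedOnDiskCache):
+///
+/// \code
+///   auto UniDB = cantFail(UnifiedOnDiskCache::open(
+///       Path, /*SizeLimit=*/std::nullopt, "BLAKE3", /*HashByteSize=*/32));
+///   // ... use UniDB->getGraphDB() and UniDB->getKeyValueDB() ...
+///   cantFail(UniDB->close());
+///   if (UniDB->needsGarbageCollection())
+///     cantFail(UnifiedOnDiskCache::collectGarbage(Path));
+/// \endcode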
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CAS/UnifiedOnDiskCache.h"
+#include "BuiltinCAS.h"
+#include "OnDiskCommon.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/ScopeExit.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/CAS/ActionCache.h"
+#include "llvm/CAS/OnDiskGraphDB.h"
+#include "llvm/CAS/OnDiskKeyValueDB.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Errc.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/FileUtilities.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/Path.h"
+#include "llvm/Support/Program.h"
+#include "llvm/Support/raw_ostream.h"
+#include <optional>
+
+#if __has_include(<sys/sysctl.h>)
+#include <sys/sysctl.h>
+#endif
+
+using namespace llvm;
+using namespace llvm::cas;
+using namespace llvm::cas::ondisk;
+
+/// FIXME: When the version of \p DBDirPrefix is bumped up we need to figure out
+/// how to handle the leftover sub-directories of the previous version, within
+/// the \p UnifiedOnDiskCache::collectGarbage function.
+static constexpr StringLiteral DBDirPrefix = "v1.";
+
+static constexpr StringLiteral ValidationFilename = "v1.validation";
+static constexpr StringLiteral CorruptPrefix = "corrupt.";
+
+ObjectID UnifiedOnDiskCache::getObjectIDFromValue(ArrayRef<char> Value) {
+  // Little-endian encoded.
+  assert(Value.size() == sizeof(uint64_t));
+  return ObjectID::fromOpaqueData(support::endian::read64le(Value.data()));
+}
+
+UnifiedOnDiskCache::ValueBytes
+UnifiedOnDiskCache::getValueFromObjectID(ObjectID ID) {
+  // Little-endian encoded.
+  UnifiedOnDiskCache::ValueBytes ValBytes;
+  static_assert(ValBytes.size() == sizeof(ID.getOpaqueData()));
+  support::endian::write64le(ValBytes.data(), ID.getOpaqueData());
+  return ValBytes;
+}
+
+Expected<std::optional<ArrayRef<char>>>
+UnifiedOnDiskCache::faultInFromUpstreamKV(ArrayRef<uint8_t> Key) {
+  assert(UpstreamGraphDB);
+  assert(UpstreamKVDB);
+
+  std::optional<ArrayRef<char>> UpstreamValue;
+  if (Error E = UpstreamKVDB->get(Key).moveInto(UpstreamValue))
+    return std::move(E);
+  if (!UpstreamValue)
+    return std::nullopt;
+
+  // The value is the \p ObjectID in the context of the upstream
+  // \p OnDiskGraphDB instance. Translate it to the context of the primary
+  // \p OnDiskGraphDB instance.
+  ObjectID UpstreamID = getObjectIDFromValue(*UpstreamValue);
+  auto PrimaryID =
+      PrimaryGraphDB->getReference(UpstreamGraphDB->getDigest(UpstreamID));
+  if (LLVM_UNLIKELY(!PrimaryID))
+    return PrimaryID.takeError();
+  return PrimaryKVDB->put(Key, getValueFromObjectID(*PrimaryID));
+}
+
+/// \returns all the 'v<version>.<x>' names of sub-directories, sorted in
+/// ascending order of the integer after the dot. Corrupt directories, if
+/// included, will come first.
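+/// For example, for sub-directories {'v1.7', 'corrupt.0.v1.5', 'v1.6'} with
+/// \p IncludeCorrupt set, the result is {'corrupt.0.v1.5', 'v1.6', 'v1.7'}.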
+static Expected<SmallVector<std::string, 4>>
+getAllDBDirs(StringRef Path, bool IncludeCorrupt = false) {
+  struct DBDir {
+    uint64_t Order;
+    std::string Name;
+  };
+  SmallVector<DBDir> FoundDBDirs;
+
+  std::error_code EC;
+  for (sys::fs::directory_iterator DirI(Path, EC), DirE; !EC && DirI != DirE;
+       DirI.increment(EC)) {
+    if (DirI->type() != sys::fs::file_type::directory_file)
+      continue;
+    StringRef SubDir = sys::path::filename(DirI->path());
+    if (IncludeCorrupt && SubDir.starts_with(CorruptPrefix)) {
+      FoundDBDirs.push_back({0, std::string(SubDir)});
+      continue;
+    }
+    if (!SubDir.starts_with(DBDirPrefix))
+      continue;
+    uint64_t Order;
+    if (SubDir.substr(DBDirPrefix.size()).getAsInteger(10, Order))
+      return createStringError(inconvertibleErrorCode(),
+                               "unexpected directory " + DirI->path());
+    FoundDBDirs.push_back({Order, std::string(SubDir)});
+  }
+  if (EC)
+    return createFileError(Path, EC);
+
+  // Note: the comparator must be a strict weak ordering for llvm::sort.
+  llvm::sort(FoundDBDirs, [](const DBDir &LHS, const DBDir &RHS) -> bool {
+    return LHS.Order < RHS.Order;
+  });
+
+  SmallVector<std::string, 4> DBDirs;
+  for (DBDir &Dir : FoundDBDirs)
+    DBDirs.push_back(std::move(Dir.Name));
+  return DBDirs;
+}
+
+static Expected<SmallVector<std::string, 4>> getAllGarbageDirs(StringRef Path) {
+  auto DBDirs = getAllDBDirs(Path, /*IncludeCorrupt=*/true);
+  if (!DBDirs)
+    return DBDirs.takeError();
+
+  // FIXME: When the version of \p DBDirPrefix is bumped up we need to figure
+  // out how to handle the leftover sub-directories of the previous version.
+
+  for (unsigned Keep = 2; Keep > 0 && !DBDirs->empty(); --Keep) {
+    StringRef Back(DBDirs->back());
+    if (Back.starts_with(CorruptPrefix))
+      break;
+    DBDirs->pop_back();
+  }
+  return *DBDirs;
+}
+
+/// Given a sub-directory named 'v<version>.<x>', writes the 'v<version>.<x+1>'
+/// name to \p OS.
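+/// For example, 'v1.7' becomes 'v1.8'.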
+static void getNextDBDirName(StringRef DBDir, llvm::raw_ostream &OS) { + assert(DBDir.starts_with(DBDirPrefix)); + uint64_t Count; + bool Failed = DBDir.substr(DBDirPrefix.size()).getAsInteger(10, Count); + assert(!Failed); + (void)Failed; + OS << DBDirPrefix << Count + 1; +} + +static Error validateOutOfProcess(StringRef LLVMCasBinary, StringRef RootPath, + bool CheckHash) { + SmallVector<StringRef> Args{LLVMCasBinary, "-cas", RootPath, "-validate"}; + if (CheckHash) + Args.push_back("-check-hash"); + + llvm::SmallString<128> StdErrPath; + int StdErrFD = -1; + if (std::error_code EC = sys::fs::createTemporaryFile( + "llvm-cas-validate-stderr", "txt", StdErrFD, StdErrPath, + llvm::sys::fs::OF_Text)) + return createStringError(EC, "failed to create temporary file"); + FileRemover OutputRemover(StdErrPath.c_str()); + + std::optional<llvm::StringRef> Redirects[] = { + {""}, // stdin = /dev/null + {""}, // stdout = /dev/null + StdErrPath.str(), + }; + + std::string ErrMsg; + int Result = + sys::ExecuteAndWait(LLVMCasBinary, Args, /*Env=*/std::nullopt, Redirects, + /*SecondsToWait=*/120, /*MemoryLimit=*/0, &ErrMsg); + + if (Result == -1) + return createStringError("failed to exec " + join(Args, " ") + ": " + + ErrMsg); + if (Result != 0) { + llvm::SmallString<64> Err("cas contents invalid"); + if (!ErrMsg.empty()) { + Err += ": "; + Err += ErrMsg; + } + auto StdErrBuf = MemoryBuffer::getFile(StdErrPath.c_str()); + if (StdErrBuf && !(*StdErrBuf)->getBuffer().empty()) { + Err += ": "; + Err += (*StdErrBuf)->getBuffer(); + } + return createStringError(Err); + } + return Error::success(); +} + +static Error validateInProcess(StringRef RootPath, StringRef HashName, + unsigned HashByteSize, bool CheckHash) { + std::shared_ptr<UnifiedOnDiskCache> UniDB; + if (Error E = UnifiedOnDiskCache::open(RootPath, std::nullopt, HashName, + HashByteSize) + .moveInto(UniDB)) + return E; + auto CAS = builtin::createObjectStoreFromUnifiedOnDiskCache(UniDB); + if (Error E = CAS->validate(CheckHash)) + return E; + auto Cache = builtin::createActionCacheFromUnifiedOnDiskCache(UniDB); + if (Error E = Cache->validate()) + return E; + return Error::success(); +} + +static Expected<uint64_t> getBootTime() { +#if __has_include(<sys/sysctl.h>) && defined(KERN_BOOTTIME) + struct timeval TV; + size_t TVLen = sizeof(TV); + int KernBoot[2] = {CTL_KERN, KERN_BOOTTIME}; + if (sysctl(KernBoot, 2, &TV, &TVLen, nullptr, 0) < 0) + return createStringError(llvm::errnoAsErrorCode(), + "failed to get boottime"); + if (TVLen != sizeof(TV)) + return createStringError("sysctl kern.boottime unexpected format"); + return TV.tv_sec; +#elif defined(__linux__) + // Use the mtime for /proc, which is recreated during system boot. + // We could also read /proc/stat and search for 'btime'. 
+  sys::fs::file_status Status;
+  if (std::error_code EC = sys::fs::status("/proc", Status))
+    return createFileError("/proc", EC);
+  return Status.getLastModificationTime().time_since_epoch().count();
+#else
+  llvm::report_fatal_error("getBootTime unimplemented");
+#endif
+}
+
+Expected<ValidationResult> UnifiedOnDiskCache::validateIfNeeded(
+    StringRef RootPath, StringRef HashName, unsigned HashByteSize,
+    bool CheckHash, bool AllowRecovery, bool ForceValidation,
+    std::optional<StringRef> LLVMCasBinaryPath) {
+  if (std::error_code EC = sys::fs::create_directories(RootPath))
+    return createFileError(RootPath, EC);
+
+  SmallString<256> PathBuf(RootPath);
+  sys::path::append(PathBuf, ValidationFilename);
+  int FD = -1;
+  if (std::error_code EC = sys::fs::openFileForReadWrite(
+          PathBuf, FD, sys::fs::CD_OpenAlways, sys::fs::OF_None))
+    return createFileError(PathBuf, EC);
+  assert(FD != -1);
+
+  sys::fs::file_t File = sys::fs::convertFDToNativeFile(FD);
+  auto CloseFile = make_scope_exit([&]() { sys::fs::closeFile(File); });
+
+  if (std::error_code EC = lockFileThreadSafe(FD, sys::fs::LockKind::Exclusive))
+    return createFileError(PathBuf, EC);
+  auto UnlockFD = make_scope_exit([&]() { unlockFileThreadSafe(FD); });
+
+  SmallString<8> Bytes;
+  if (Error E = sys::fs::readNativeFileToEOF(File, Bytes))
+    return createFileError(PathBuf, std::move(E));
+
+  uint64_t ValidationBootTime = 0;
+  if (!Bytes.empty() &&
+      StringRef(Bytes).trim().getAsInteger(10, ValidationBootTime))
+    return createFileError(PathBuf, errc::illegal_byte_sequence,
+                           "expected integer");
+
+  static uint64_t BootTime = 0;
+  if (BootTime == 0)
+    if (Error E = getBootTime().moveInto(BootTime))
+      return std::move(E);
+
+  if (ValidationBootTime == BootTime && !ForceValidation)
+    return ValidationResult::Skipped;
+
+  // Validate!
+  bool NeedsRecovery = false;
+  if (Error E =
+          LLVMCasBinaryPath
+              ? validateOutOfProcess(*LLVMCasBinaryPath, RootPath, CheckHash)
+              : validateInProcess(RootPath, HashName, HashByteSize,
+                                  CheckHash)) {
+    if (AllowRecovery) {
+      consumeError(std::move(E));
+      NeedsRecovery = true;
+    } else {
+      return std::move(E);
+    }
+  }
+
+  if (NeedsRecovery) {
+    sys::path::remove_filename(PathBuf);
+    sys::path::append(PathBuf, "lock");
+
+    int LockFD = -1;
+    if (std::error_code EC = sys::fs::openFileForReadWrite(
+            PathBuf, LockFD, sys::fs::CD_OpenAlways, sys::fs::OF_None))
+      return createFileError(PathBuf, EC);
+    sys::fs::file_t LockFile = sys::fs::convertFDToNativeFile(LockFD);
+    auto CloseLock = make_scope_exit([&]() { sys::fs::closeFile(LockFile); });
+    if (std::error_code EC = tryLockFileThreadSafe(LockFD)) {
+      if (EC == std::errc::no_lock_available)
+        return createFileError(
+            PathBuf, EC,
+            "CAS validation requires exclusive access but CAS was in use");
+      return createFileError(PathBuf, EC);
+    }
+    auto UnlockFD = make_scope_exit([&]() { unlockFileThreadSafe(LockFD); });
+
+    auto DBDirs = getAllDBDirs(RootPath);
+    if (!DBDirs)
+      return DBDirs.takeError();
+
+    for (StringRef DBDir : *DBDirs) {
+      sys::path::remove_filename(PathBuf);
+      sys::path::append(PathBuf, DBDir);
+      std::error_code EC;
+      int Attempt = 0, MaxAttempts = 100;
+      SmallString<128> GCPath;
+      for (; Attempt < MaxAttempts; ++Attempt) {
+        GCPath.assign(RootPath);
+        sys::path::append(GCPath, CorruptPrefix + std::to_string(Attempt) +
+                                      "." + DBDir);
+        EC = sys::fs::rename(PathBuf, GCPath);
+        // Darwin uses ENOTEMPTY. Linux may return either ENOTEMPTY or EEXIST.
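+        // Retry with the next attempt index only when the destination already
+        // exists; any other result (success or a different error) ends the
+        // loop.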
+        if (EC != errc::directory_not_empty && EC != errc::file_exists)
+          break;
+      }
+      if (Attempt == MaxAttempts)
+        return createStringError(
+            EC, "rename " + PathBuf +
+                    " failed: too many CAS directories awaiting pruning");
+      if (EC)
+        return createStringError(EC, "rename " + PathBuf + " to " + GCPath +
+                                         " failed: " + EC.message());
+    }
+  }
+
+  if (ValidationBootTime != BootTime) {
+    // Fix the filename in case we have an error to report.
+    sys::path::remove_filename(PathBuf);
+    sys::path::append(PathBuf, ValidationFilename);
+    if (std::error_code EC = sys::fs::resize_file(FD, 0))
+      return createFileError(PathBuf, EC);
+    raw_fd_ostream OS(FD, /*shouldClose=*/false);
+    OS.seek(0); // resize does not reset position
+    OS << BootTime << '\n';
+    if (OS.has_error())
+      return createFileError(PathBuf, OS.error());
+  }
+
+  return NeedsRecovery ? ValidationResult::Recovered : ValidationResult::Valid;
+}
+
+Expected<std::unique_ptr<UnifiedOnDiskCache>>
+UnifiedOnDiskCache::open(StringRef RootPath, std::optional<uint64_t> SizeLimit,
+                         StringRef HashName, unsigned HashByteSize,
+                         OnDiskGraphDB::FaultInPolicy FaultInPolicy) {
+  if (std::error_code EC = sys::fs::create_directories(RootPath))
+    return createFileError(RootPath, EC);
+
+  SmallString<256> PathBuf(RootPath);
+  sys::path::append(PathBuf, "lock");
+  int LockFD = -1;
+  if (std::error_code EC = sys::fs::openFileForReadWrite(
+          PathBuf, LockFD, sys::fs::CD_OpenAlways, sys::fs::OF_None))
+    return createFileError(PathBuf, EC);
+  assert(LockFD != -1);
+  // Lock the directory with a shared lock, which prevents other processes from
+  // creating a new chain (essentially, while a \p UnifiedOnDiskCache instance
+  // holds a shared lock, the storage for the primary directory will grow
+  // unrestricted).
+  if (std::error_code EC =
+          lockFileThreadSafe(LockFD, sys::fs::LockKind::Shared))
+    return createFileError(PathBuf, EC);
+
+  auto DBDirs = getAllDBDirs(RootPath);
+  if (!DBDirs)
+    return DBDirs.takeError();
+  if (DBDirs->empty())
+    DBDirs->push_back((Twine(DBDirPrefix) + "1").str());
+
+  assert(!DBDirs->empty());
+
+  // If there is only one directory, open the databases on it. If there are 2
+  // or more directories, get the most recent directories and chain them, with
+  // the most recent being the primary one. The remaining directories are
+  // unused data that can be garbage-collected.
+  auto UniDB = std::unique_ptr<UnifiedOnDiskCache>(new UnifiedOnDiskCache());
+  std::unique_ptr<OnDiskGraphDB> UpstreamGraphDB;
+  std::unique_ptr<OnDiskKeyValueDB> UpstreamKVDB;
+  if (DBDirs->size() > 1) {
+    StringRef UpstreamDir = *(DBDirs->end() - 2);
+    PathBuf = RootPath;
+    sys::path::append(PathBuf, UpstreamDir);
+    if (Error E = OnDiskGraphDB::open(PathBuf, HashName, HashByteSize,
+                                      /*UpstreamDB=*/nullptr, FaultInPolicy)
+                      .moveInto(UpstreamGraphDB))
+      return std::move(E);
+    if (Error E = OnDiskKeyValueDB::open(PathBuf, HashName, HashByteSize,
+                                         /*ValueName=*/"objectid",
+                                         /*ValueSize=*/sizeof(uint64_t))
+                      .moveInto(UpstreamKVDB))
+      return std::move(E);
+  }
+
+  StringRef PrimaryDir = *(DBDirs->end() - 1);
+  PathBuf = RootPath;
+  sys::path::append(PathBuf, PrimaryDir);
+  std::unique_ptr<OnDiskGraphDB> PrimaryGraphDB;
+  if (Error E = OnDiskGraphDB::open(PathBuf, HashName, HashByteSize,
+                                    UpstreamGraphDB.get(), FaultInPolicy)
+                    .moveInto(PrimaryGraphDB))
+    return std::move(E);
+  std::unique_ptr<OnDiskKeyValueDB> PrimaryKVDB;
+  // \p UnifiedOnDiskCache does manual chaining for key-value requests,
+  // including an extra translation step of the value during fault-in.
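+  // (Values hold ObjectIDs that are only meaningful within their own GraphDB,
+  // so a value faulted in from the upstream KVDB is re-keyed: the upstream
+  // ObjectID is resolved to its digest and re-interned in the primary GraphDB;
+  // see faultInFromUpstreamKV above.)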
+  if (Error E =
+          OnDiskKeyValueDB::open(PathBuf, HashName, HashByteSize,
+                                 /*ValueName=*/"objectid",
+                                 /*ValueSize=*/sizeof(uint64_t), UniDB.get())
+              .moveInto(PrimaryKVDB))
+    return std::move(E);
+
+  UniDB->RootPath = RootPath;
+  UniDB->SizeLimit = SizeLimit.value_or(0);
+  UniDB->LockFD = LockFD;
+  UniDB->NeedsGarbageCollection = DBDirs->size() > 2;
+  UniDB->PrimaryDBDir = PrimaryDir;
+  UniDB->UpstreamGraphDB = std::move(UpstreamGraphDB);
+  UniDB->PrimaryGraphDB = std::move(PrimaryGraphDB);
+  UniDB->UpstreamKVDB = std::move(UpstreamKVDB);
+  UniDB->PrimaryKVDB = std::move(PrimaryKVDB);
+
+  return std::move(UniDB);
+}
+
+void UnifiedOnDiskCache::setSizeLimit(std::optional<uint64_t> SizeLimit) {
+  this->SizeLimit = SizeLimit.value_or(0);
+}
+
+uint64_t UnifiedOnDiskCache::getStorageSize() const {
+  uint64_t TotalSize = getPrimaryStorageSize();
+  if (UpstreamGraphDB)
+    TotalSize += UpstreamGraphDB->getStorageSize();
+  if (UpstreamKVDB)
+    TotalSize += UpstreamKVDB->getStorageSize();
+  return TotalSize;
+}
+
+uint64_t UnifiedOnDiskCache::getPrimaryStorageSize() const {
+  return PrimaryGraphDB->getStorageSize() + PrimaryKVDB->getStorageSize();
+}
+
+bool UnifiedOnDiskCache::hasExceededSizeLimit() const {
+  uint64_t CurSizeLimit = SizeLimit;
+  if (!CurSizeLimit)
+    return false;
+
+  // If hard-limit utilization is beyond 85%, report being over the limit so
+  // that cleanup gets requested.
+  unsigned CurrentPercent =
+      std::max(PrimaryGraphDB->getHardStorageLimitUtilization(),
+               PrimaryKVDB->getHardStorageLimitUtilization());
+  if (CurrentPercent > 85)
+    return true;
+
+  // We allow each of the directories in the chain to reach up to half the
+  // intended size limit. Check whether the primary directory has exceeded half
+  // the limit or not, in order to decide whether we need to start a new chain.
+  //
+  // We could check the size limit against the sum of sizes of both the primary
+  // and upstream directories, but then if the upstream is significantly larger
+  // than the intended limit, it would trigger a new chain to be created before
+  // the primary has reached its own limit. Essentially, in such a situation we
+  // prefer reclaiming the storage later in order to get more consistent
+  // cache-hit behavior.
+  return (CurSizeLimit / 2) < getPrimaryStorageSize();
+}
+
+Error UnifiedOnDiskCache::close(bool CheckSizeLimit) {
+  if (LockFD == -1)
+    return Error::success(); // already closed.
+  auto CloseLock = make_scope_exit([&]() {
+    assert(LockFD >= 0);
+    sys::fs::file_t LockFile = sys::fs::convertFDToNativeFile(LockFD);
+    sys::fs::closeFile(LockFile);
+    LockFD = -1;
+  });
+
+  bool ExceededSizeLimit = CheckSizeLimit ? hasExceededSizeLimit() : false;
+  UpstreamKVDB.reset();
+  PrimaryKVDB.reset();
+  UpstreamGraphDB.reset();
+  PrimaryGraphDB.reset();
+  if (std::error_code EC = unlockFileThreadSafe(LockFD))
+    return createFileError(RootPath, EC);
+
+  if (!ExceededSizeLimit)
+    return Error::success();
+
+  // The primary directory exceeded its intended size limit. Try to get an
+  // exclusive lock in order to create a new primary directory for next time
+  // this \p UnifiedOnDiskCache path is opened.
+
+  if (std::error_code EC = tryLockFileThreadSafe(
+          LockFD, std::chrono::milliseconds(0), sys::fs::LockKind::Exclusive)) {
+    if (EC == errc::no_lock_available)
+      return Error::success(); // couldn't get exclusive lock, give up.
+ return createFileError(RootPath, EC); + } + auto UnlockFile = make_scope_exit([&]() { unlockFileThreadSafe(LockFD); }); + + // Managed to get an exclusive lock which means there are no other open + // \p UnifiedOnDiskCache instances for the same path, so we can safely start a + // new primary directory. To start a new primary directory we just have to + // create a new empty directory with the next consecutive index; since this is + // an atomic operation we will leave the top-level directory in a consistent + // state even if the process dies during this code-path. + + SmallString<256> PathBuf(RootPath); + raw_svector_ostream OS(PathBuf); + OS << sys::path::get_separator(); + getNextDBDirName(PrimaryDBDir, OS); + if (std::error_code EC = sys::fs::create_directory(PathBuf)) + return createFileError(PathBuf, EC); + + NeedsGarbageCollection = true; + return Error::success(); +} + +UnifiedOnDiskCache::UnifiedOnDiskCache() = default; + +UnifiedOnDiskCache::~UnifiedOnDiskCache() { consumeError(close()); } + +Error UnifiedOnDiskCache::collectGarbage(StringRef Path) { + auto DBDirs = getAllGarbageDirs(Path); + if (!DBDirs) + return DBDirs.takeError(); + + SmallString<256> PathBuf(Path); + for (StringRef UnusedSubDir : *DBDirs) { + sys::path::append(PathBuf, UnusedSubDir); + if (std::error_code EC = sys::fs::remove_directories(PathBuf)) + return createFileError(PathBuf, EC); + sys::path::remove_filename(PathBuf); + } + return Error::success(); +} + +Error UnifiedOnDiskCache::collectGarbage() { return collectGarbage(RootPath); } diff --git a/llvm/unittests/CAS/ActionCacheTest.cpp b/llvm/unittests/CAS/ActionCacheTest.cpp index db67e30ca203b..692da230b6e09 100644 --- a/llvm/unittests/CAS/ActionCacheTest.cpp +++ b/llvm/unittests/CAS/ActionCacheTest.cpp @@ -21,7 +21,7 @@ using namespace llvm; using namespace llvm::cas; TEST_P(CASTest, ActionCacheHit) { - std::shared_ptr<ObjectStore> CAS = createObjectStore(); + std::unique_ptr<ObjectStore> CAS = createObjectStore(); std::unique_ptr<ActionCache> Cache = createActionCache(); std::optional<ObjectProxy> ID; @@ -36,7 +36,7 @@ TEST_P(CASTest, ActionCacheHit) { } TEST_P(CASTest, ActionCacheMiss) { - std::shared_ptr<ObjectStore> CAS = createObjectStore(); + std::unique_ptr<ObjectStore> CAS = createObjectStore(); std::unique_ptr<ActionCache> Cache = createActionCache(); std::optional<ObjectProxy> ID1, ID2; @@ -59,7 +59,7 @@ TEST_P(CASTest, ActionCacheMiss) { } TEST_P(CASTest, ActionCacheRewrite) { - std::shared_ptr<ObjectStore> CAS = createObjectStore(); + std::unique_ptr<ObjectStore> CAS = createObjectStore(); std::unique_ptr<ActionCache> Cache = createActionCache(); std::optional<ObjectProxy> ID1, ID2; diff --git a/llvm/unittests/CAS/BuiltinUnifiedCASDatabasesTest.cpp b/llvm/unittests/CAS/BuiltinUnifiedCASDatabasesTest.cpp new file mode 100644 index 0000000000000..19522e9372d85 --- /dev/null +++ b/llvm/unittests/CAS/BuiltinUnifiedCASDatabasesTest.cpp @@ -0,0 +1,67 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/CAS/BuiltinUnifiedCASDatabases.h" +#include "CASTestConfig.h" +#include "llvm/CAS/ActionCache.h" +#include "llvm/CAS/ObjectStore.h" +#include "llvm/Testing/Support/Error.h" +#include "llvm/Testing/Support/SupportHelpers.h" +#include "gtest/gtest.h" + +using namespace llvm; +using namespace llvm::cas; + +TEST_F(OnDiskCASTest, UnifiedCASMaterializationCheckPreventsGarbageCollection) { + unittest::TempDir Temp("on-disk-unified-cas", /*Unique=*/true); + + auto WithCAS = [&](llvm::function_ref<void(ObjectStore &)> Action) { + std::pair<std::unique_ptr<ObjectStore>, std::unique_ptr<ActionCache>> DBs; + ASSERT_THAT_ERROR( + createOnDiskUnifiedCASDatabases(Temp.path()).moveInto(DBs), + Succeeded()); + ObjectStore &CAS = *DBs.first; + ASSERT_THAT_ERROR(CAS.setSizeLimit(1), Succeeded()); + Action(CAS); + }; + + std::optional<CASID> ID; + + // Create an object in the CAS. + WithCAS([&ID](ObjectStore &CAS) { + std::optional<ObjectRef> Ref; + ASSERT_THAT_ERROR(CAS.store({}, "blah").moveInto(Ref), Succeeded()); + ASSERT_TRUE(Ref.has_value()); + + ID = CAS.getID(*Ref); + }); + + // Check materialization and prune the storage. + WithCAS([&ID](ObjectStore &CAS) { + std::optional<ObjectRef> Ref = CAS.getReference(*ID); + ASSERT_TRUE(Ref.has_value()); + + std::optional<bool> IsMaterialized; + ASSERT_THAT_ERROR(CAS.isMaterialized(*Ref).moveInto(IsMaterialized), + Succeeded()); + ASSERT_TRUE(IsMaterialized); + + ASSERT_THAT_ERROR(CAS.pruneStorageData(), Succeeded()); + }); + + // Verify that the previous materialization check kept the object in the CAS. + WithCAS([&ID](ObjectStore &CAS) { + std::optional<ObjectRef> Ref = CAS.getReference(*ID); + ASSERT_TRUE(Ref.has_value()); + + std::optional<bool> IsMaterialized; + ASSERT_THAT_ERROR(CAS.isMaterialized(*Ref).moveInto(IsMaterialized), + Succeeded()); + ASSERT_TRUE(IsMaterialized); + }); +} diff --git a/llvm/unittests/CAS/CASTestConfig.cpp b/llvm/unittests/CAS/CASTestConfig.cpp index 10e4b689e151e..08cbf1daf727d 100644 --- a/llvm/unittests/CAS/CASTestConfig.cpp +++ b/llvm/unittests/CAS/CASTestConfig.cpp @@ -8,6 +8,7 @@ #include "CASTestConfig.h" #include "llvm/CAS/ObjectStore.h" +#include "llvm/Testing/Support/Error.h" #include "gtest/gtest.h" #include <mutex> @@ -15,7 +16,8 @@ using namespace llvm; using namespace llvm::cas; static CASTestingEnv createInMemory(int I) { - return CASTestingEnv{createInMemoryCAS(), createInMemoryActionCache()}; + return CASTestingEnv{createInMemoryCAS(), createInMemoryActionCache(), + std::nullopt}; } INSTANTIATE_TEST_SUITE_P(InMemoryCAS, CASTest, @@ -23,7 +25,7 @@ INSTANTIATE_TEST_SUITE_P(InMemoryCAS, CASTest, #if LLVM_ENABLE_ONDISK_CAS namespace llvm::cas::ondisk { -extern void setMaxMappingSize(uint64_t Size); +void setMaxMappingSize(uint64_t Size); } // namespace llvm::cas::ondisk void setMaxOnDiskCASMappingSize() { @@ -31,6 +33,17 @@ void setMaxOnDiskCASMappingSize() { std::call_once( Flag, [] { llvm::cas::ondisk::setMaxMappingSize(100 * 1024 * 1024); }); } + +static CASTestingEnv createOnDisk(int I) { + unittest::TempDir Temp("on-disk-cas", /*Unique=*/true); + std::unique_ptr<ObjectStore> CAS; + EXPECT_THAT_ERROR(createOnDiskCAS(Temp.path()).moveInto(CAS), Succeeded()); + std::unique_ptr<ActionCache> Cache; + EXPECT_THAT_ERROR(createOnDiskActionCache(Temp.path()).moveInto(Cache), + Succeeded()); + return CASTestingEnv{std::move(CAS), std::move(Cache), 
+                       std::move(Temp)};
+}
+INSTANTIATE_TEST_SUITE_P(OnDiskCAS, CASTest, ::testing::Values(createOnDisk));
 #else
 void setMaxOnDiskCASMappingSize() {}
 #endif /* LLVM_ENABLE_ONDISK_CAS */
diff --git a/llvm/unittests/CAS/CASTestConfig.h b/llvm/unittests/CAS/CASTestConfig.h
index 8d3c55305f1b3..b1c0e59ff2b92 100644
--- a/llvm/unittests/CAS/CASTestConfig.h
+++ b/llvm/unittests/CAS/CASTestConfig.h
@@ -6,16 +6,28 @@
 //
 //===----------------------------------------------------------------------===//
 
+#ifndef LLVM_UNITTESTS_CASTESTCONFIG_H
+#define LLVM_UNITTESTS_CASTESTCONFIG_H
+
 #include "llvm/CAS/ActionCache.h"
 #include "llvm/CAS/ObjectStore.h"
+#include "llvm/Testing/Support/SupportHelpers.h"
 #include "gtest/gtest.h"
+#include <memory>
 
-#ifndef LLVM_UNITTESTS_CASTESTCONFIG_H
-#define LLVM_UNITTESTS_CASTESTCONFIG_H
+namespace llvm::unittest::cas {
+class MockEnv {
+  void anchor();
+
+public:
+  virtual ~MockEnv();
+};
+} // namespace llvm::unittest::cas
 
 struct CASTestingEnv {
   std::unique_ptr<llvm::cas::ObjectStore> CAS;
   std::unique_ptr<llvm::cas::ActionCache> Cache;
+  std::optional<llvm::unittest::TempDir> Temp;
 };
 
 void setMaxOnDiskCASMappingSize();
@@ -24,26 +36,47 @@ void setMaxOnDiskCASMappingSize();
 class OnDiskCASTest : public ::testing::Test {
 protected:
   void SetUp() override {
+#if !LLVM_ENABLE_ONDISK_CAS
+    GTEST_SKIP() << "OnDiskCAS is not enabled";
+#endif
     // Use a smaller database size for testing to conserve disk space.
     setMaxOnDiskCASMappingSize();
   }
 };
 
+// Parameterized test fixture for ObjectStore and ActionCache tests.
 class CASTest
     : public testing::TestWithParam<std::function<CASTestingEnv(int)>> {
 protected:
   std::optional<int> NextCASIndex;
 
+  llvm::SmallVector<llvm::unittest::TempDir> Dirs;
+
+  llvm::SmallVector<std::unique_ptr<llvm::unittest::cas::MockEnv>> Envs;
+
   std::unique_ptr<llvm::cas::ObjectStore> createObjectStore() {
     auto TD = GetParam()(++(*NextCASIndex));
+    if (TD.Temp)
+      Dirs.push_back(std::move(*TD.Temp));
     return std::move(TD.CAS);
   }
   std::unique_ptr<llvm::cas::ActionCache> createActionCache() {
     auto TD = GetParam()(++(*NextCASIndex));
+    if (TD.Temp)
+      Dirs.push_back(std::move(*TD.Temp));
     return std::move(TD.Cache);
   }
-  void SetUp() override { NextCASIndex = 0; }
-  void TearDown() override { NextCASIndex = std::nullopt; }
+
+  void SetUp() override {
+    NextCASIndex = 0;
+    setMaxOnDiskCASMappingSize();
+  }
+
+  void TearDown() override {
+    NextCASIndex = std::nullopt;
+    Dirs.clear();
+    Envs.clear();
+  }
 };
 
 #endif
diff --git a/llvm/unittests/CAS/CMakeLists.txt b/llvm/unittests/CAS/CMakeLists.txt
index da469f7fccb5a..91e49be770745 100644
--- a/llvm/unittests/CAS/CMakeLists.txt
+++ b/llvm/unittests/CAS/CMakeLists.txt
@@ -1,9 +1,11 @@
 set(ONDISK_CAS_TEST_SOURCES
+  BuiltinUnifiedCASDatabasesTest.cpp
   OnDiskGraphDBTest.cpp
   OnDiskDataAllocatorTest.cpp
   OnDiskKeyValueDBTest.cpp
   OnDiskTrieRawHashMapTest.cpp
   ProgramTest.cpp
+  UnifiedOnDiskCacheTest.cpp
 )
 
 set(LLVM_OPTIONAL_SOURCES
diff --git a/llvm/unittests/CAS/ObjectStoreTest.cpp b/llvm/unittests/CAS/ObjectStoreTest.cpp
index 54083fdb408f6..b43ae33d74127 100644
--- a/llvm/unittests/CAS/ObjectStoreTest.cpp
+++ b/llvm/unittests/CAS/ObjectStoreTest.cpp
@@ -1,4 +1,4 @@
-//===- ObjectStoreTest.cpp ------------------------------------------------===//
+//===----------------------------------------------------------------------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -76,7 +76,7 @@ multiline text multiline text multiline text multiline text multiline text)", // Run validation on all CASIDs. for (int I = 0, E = IDs.size(); I != E; ++I) - ASSERT_THAT_ERROR(CAS1->validate(IDs[I]), Succeeded()); + ASSERT_THAT_ERROR(CAS1->validateObject(IDs[I]), Succeeded()); // Check that the blobs can be retrieved multiple times. for (int I = 0, E = IDs.size(); I != E; ++I) { @@ -120,15 +120,15 @@ TEST_P(CASTest, BlobsBig) { std::optional<CASID> ID2; ASSERT_THAT_ERROR(CAS->createProxy({}, String1).moveInto(ID1), Succeeded()); ASSERT_THAT_ERROR(CAS->createProxy({}, String1).moveInto(ID2), Succeeded()); - ASSERT_THAT_ERROR(CAS->validate(*ID1), Succeeded()); - ASSERT_THAT_ERROR(CAS->validate(*ID2), Succeeded()); + ASSERT_THAT_ERROR(CAS->validateObject(*ID1), Succeeded()); + ASSERT_THAT_ERROR(CAS->validateObject(*ID2), Succeeded()); ASSERT_EQ(ID1, ID2); String1.append(String2); ASSERT_THAT_ERROR(CAS->createProxy({}, String2).moveInto(ID1), Succeeded()); ASSERT_THAT_ERROR(CAS->createProxy({}, String2).moveInto(ID2), Succeeded()); - ASSERT_THAT_ERROR(CAS->validate(*ID1), Succeeded()); - ASSERT_THAT_ERROR(CAS->validate(*ID2), Succeeded()); + ASSERT_THAT_ERROR(CAS->validateObject(*ID1), Succeeded()); + ASSERT_THAT_ERROR(CAS->validateObject(*ID2), Succeeded()); ASSERT_EQ(ID1, ID2); String2.append(String1); } @@ -176,10 +176,11 @@ multiline text multiline text multiline text multiline text multiline text)", // Check basic printing of IDs. IDs.push_back(CAS1->getID(*Node)); - auto ID = CAS1->getID(Nodes.back()); - EXPECT_EQ(ID.toString(), IDs.back().toString()); - EXPECT_EQ(*Node, Nodes.back()); - EXPECT_EQ(ID, IDs.back()); + EXPECT_EQ(IDs.back().toString(), IDs.back().toString()); + EXPECT_EQ(Nodes.front(), Nodes.front()); + EXPECT_EQ(Nodes.back(), Nodes.back()); + EXPECT_EQ(IDs.front(), IDs.front()); + EXPECT_EQ(IDs.back(), IDs.back()); if (Nodes.size() <= 1) continue; EXPECT_NE(Nodes.front(), Nodes.back()); @@ -266,7 +267,7 @@ TEST_P(CASTest, NodesBig) { } for (auto ID : CreatedNodes) - ASSERT_THAT_ERROR(CAS->validate(CAS->getID(ID)), Succeeded()); + ASSERT_THAT_ERROR(CAS->validateObject(CAS->getID(ID)), Succeeded()); } #if LLVM_ENABLE_THREADS @@ -332,17 +333,124 @@ static void testBlobsParallel1(ObjectStore &CAS, uint64_t BlobSize) { } TEST_P(CASTest, BlobsParallel) { - std::shared_ptr<ObjectStore> CAS = createObjectStore(); + std::unique_ptr<ObjectStore> CAS = createObjectStore(); uint64_t Size = 1ULL * 1024; ASSERT_NO_FATAL_FAILURE(testBlobsParallel1(*CAS, Size)); } #ifdef EXPENSIVE_CHECKS TEST_P(CASTest, BlobsBigParallel) { - std::shared_ptr<ObjectStore> CAS = createObjectStore(); + std::unique_ptr<ObjectStore> CAS = createObjectStore(); // 100k is large enough to be standalone files in our on-disk cas. uint64_t Size = 100ULL * 1024; ASSERT_NO_FATAL_FAILURE(testBlobsParallel1(*CAS, Size)); } #endif // EXPENSIVE_CHECKS + +#ifndef _WIN32 // create_link won't work for directories on Windows +TEST_F(OnDiskCASTest, OnDiskCASBlobsParallelMultiCAS) { + // This test intentionally uses symlinked paths to the same CAS to subvert the + // shared memory mappings that would normally be created within a single + // process. This breaks the lock file guarantees, so we must be careful not + // to create or destroy the CAS objects concurrently, which is when the locks + // are normally important. 
+ unittest::TempDir Temp("on-disk-cas", /*Unique=*/true); + ASSERT_EQ(sys::fs::create_directory(Temp.path("real_cas")), + std::error_code()); + ASSERT_EQ(sys::fs::create_link("real_cas", Temp.path("sym_cas1")), + std::error_code()); + ASSERT_EQ(sys::fs::create_link("real_cas", Temp.path("sym_cas2")), + std::error_code()); + ASSERT_EQ(sys::fs::create_link("real_cas", Temp.path("sym_cas3")), + std::error_code()); + + std::unique_ptr<ObjectStore> CAS1, CAS2, CAS3, CAS4; + ASSERT_THAT_ERROR(createOnDiskCAS(Temp.path("real_cas")).moveInto(CAS1), + Succeeded()); + ASSERT_THAT_ERROR(createOnDiskCAS(Temp.path("sym_cas1")).moveInto(CAS2), + Succeeded()); + ASSERT_THAT_ERROR(createOnDiskCAS(Temp.path("sym_cas2")).moveInto(CAS3), + Succeeded()); + ASSERT_THAT_ERROR(createOnDiskCAS(Temp.path("sym_cas3")).moveInto(CAS4), + Succeeded()); + + uint64_t Size = 1ULL * 1024; + ASSERT_NO_FATAL_FAILURE(testBlobsParallel(*CAS1, *CAS2, *CAS3, *CAS4, Size)); +} + +TEST_F(OnDiskCASTest, OnDiskCASBlobsBigParallelMultiCAS) { + // See comment in BlobsParallelMultiCAS. + unittest::TempDir Temp("on-disk-cas", /*Unique=*/true); + ASSERT_EQ(sys::fs::create_directory(Temp.path("real_cas")), + std::error_code()); + ASSERT_EQ(sys::fs::create_link("real_cas", Temp.path("sym_cas1")), + std::error_code()); + ASSERT_EQ(sys::fs::create_link("real_cas", Temp.path("sym_cas2")), + std::error_code()); + ASSERT_EQ(sys::fs::create_link("real_cas", Temp.path("sym_cas3")), + std::error_code()); + + std::unique_ptr<ObjectStore> CAS1, CAS2, CAS3, CAS4; + ASSERT_THAT_ERROR(createOnDiskCAS(Temp.path("real_cas")).moveInto(CAS1), + Succeeded()); + ASSERT_THAT_ERROR(createOnDiskCAS(Temp.path("sym_cas1")).moveInto(CAS2), + Succeeded()); + ASSERT_THAT_ERROR(createOnDiskCAS(Temp.path("sym_cas2")).moveInto(CAS3), + Succeeded()); + ASSERT_THAT_ERROR(createOnDiskCAS(Temp.path("sym_cas3")).moveInto(CAS4), + Succeeded()); + + // 100k is large enough to be standalone files in our on-disk cas. + uint64_t Size = 100ULL * 1024; + ASSERT_NO_FATAL_FAILURE(testBlobsParallel(*CAS1, *CAS2, *CAS3, *CAS4, Size)); +} +#endif // _WIN32 #endif // LLVM_ENABLE_THREADS + +TEST_F(OnDiskCASTest, OnDiskCASDiskSize) { + unittest::TempDir Temp("on-disk-cas", /*Unique=*/true); + std::unique_ptr<ObjectStore> CAS; + ASSERT_THAT_ERROR(createOnDiskCAS(Temp.path()).moveInto(CAS), Succeeded()); + + uint64_t MaxSize = 100 * 1024 * 1024; + + // Check that we map the files to the correct size. + auto CheckFileSizes = [&](bool Mapped) { + bool FoundIndex = false, FoundData = false; + std::error_code EC; + for (sys::fs::directory_iterator I(Temp.path(), EC), E; I != E && !EC; + I.increment(EC)) { + StringRef Filename = sys::path::filename(I->path()); + if (Filename.starts_with("index.") && !Filename.ends_with(".shared")) { + FoundIndex = true; + ASSERT_TRUE(I->status()); + if (Mapped) + EXPECT_EQ(I->status()->getSize(), MaxSize); + else + EXPECT_LT(I->status()->getSize(), MaxSize); + } + if (Filename.starts_with("data.") && !Filename.ends_with(".shared")) { + FoundData = true; + ASSERT_TRUE(I->status()); + if (Mapped) + EXPECT_EQ(I->status()->getSize(), MaxSize); + else + EXPECT_LT(I->status()->getSize(), MaxSize); + } + } + ASSERT_TRUE(FoundIndex); + ASSERT_TRUE(FoundData); + }; + + // Check that we have the full mapping size when the CAS is open. + CheckFileSizes(/*Mapped=*/true); + CAS.reset(); + // Check that the CAS is shrunk to a smaller size. + CheckFileSizes(/*Mapped=*/false); + + // Repeat the checks when starting from an existing CAS. 
+ ASSERT_THAT_ERROR(createOnDiskCAS(Temp.path()).moveInto(CAS), Succeeded()); + CheckFileSizes(/*Mapped=*/true); + CAS.reset(); + CheckFileSizes(/*Mapped=*/false); +} diff --git a/llvm/unittests/CAS/OnDiskCommonUtils.h b/llvm/unittests/CAS/OnDiskCommonUtils.h index 89f93e08366c9..48a1830f9b219 100644 --- a/llvm/unittests/CAS/OnDiskCommonUtils.h +++ b/llvm/unittests/CAS/OnDiskCommonUtils.h @@ -12,6 +12,8 @@ #include "llvm/CAS/BuiltinObjectHasher.h" #include "llvm/CAS/OnDiskGraphDB.h" +#include "llvm/CAS/OnDiskKeyValueDB.h" +#include "llvm/CAS/UnifiedOnDiskCache.h" #include "llvm/Support/BLAKE3.h" #include "llvm/Testing/Support/Error.h" @@ -58,6 +60,25 @@ inline Expected<ObjectID> store(OnDiskGraphDB &DB, StringRef Data, return ID; } +inline Expected<ObjectID> cachePut(OnDiskKeyValueDB &DB, ArrayRef<uint8_t> Key, + ObjectID ID) { + auto Value = UnifiedOnDiskCache::getValueFromObjectID(ID); + auto Result = DB.put(Key, Value); + if (!Result) + return Result.takeError(); + return UnifiedOnDiskCache::getObjectIDFromValue(*Result); +} + +inline Expected<std::optional<ObjectID>> cacheGet(OnDiskKeyValueDB &DB, + ArrayRef<uint8_t> Key) { + auto Result = DB.get(Key); + if (!Result) + return Result.takeError(); + if (!*Result) + return std::nullopt; + return UnifiedOnDiskCache::getObjectIDFromValue(**Result); +} + inline Error printTree(OnDiskGraphDB &DB, ObjectID ID, raw_ostream &OS, unsigned Indent = 0) { std::optional<ondisk::ObjectHandle> Obj; diff --git a/llvm/unittests/CAS/OnDiskGraphDBTest.cpp b/llvm/unittests/CAS/OnDiskGraphDBTest.cpp index 3c2e96318a5ed..e9c73bfb6c8d3 100644 --- a/llvm/unittests/CAS/OnDiskGraphDBTest.cpp +++ b/llvm/unittests/CAS/OnDiskGraphDBTest.cpp @@ -102,7 +102,7 @@ TEST_F(OnDiskCASTest, OnDiskGraphDBFaultInSingleNode) { std::unique_ptr<OnDiskGraphDB> DB; ASSERT_THAT_ERROR( OnDiskGraphDB::open(Temp.path(), "blake3", sizeof(HashType), - std::move(UpstreamDB), + UpstreamDB.get(), OnDiskGraphDB::FaultInPolicy::SingleNode) .moveInto(DB), Succeeded()); @@ -208,7 +208,7 @@ TEST_F(OnDiskCASTest, OnDiskGraphDBFaultInFullTree) { unittest::TempDir Temp("ondiskcas", /*Unique=*/true); std::unique_ptr<OnDiskGraphDB> DB; ASSERT_THAT_ERROR(OnDiskGraphDB::open(Temp.path(), "blake3", sizeof(HashType), - std::move(UpstreamDB), + UpstreamDB.get(), OnDiskGraphDB::FaultInPolicy::FullTree) .moveInto(DB), Succeeded()); @@ -264,14 +264,14 @@ TEST_F(OnDiskCASTest, OnDiskGraphDBFaultInPolicyConflict) { unittest::TempDir Temp("ondiskcas", /*Unique=*/true); std::unique_ptr<OnDiskGraphDB> DB; ASSERT_THAT_ERROR(OnDiskGraphDB::open(Temp.path(), "blake3", - sizeof(HashType), - std::move(UpstreamDB), Policy1) + sizeof(HashType), UpstreamDB.get(), + Policy1) .moveInto(DB), Succeeded()); DB.reset(); ASSERT_THAT_ERROR(OnDiskGraphDB::open(Temp.path(), "blake3", - sizeof(HashType), - std::move(UpstreamDB), Policy2) + sizeof(HashType), UpstreamDB.get(), + Policy2) .moveInto(DB), Failed()); }; diff --git a/llvm/unittests/CAS/UnifiedOnDiskCacheTest.cpp b/llvm/unittests/CAS/UnifiedOnDiskCacheTest.cpp new file mode 100644 index 0000000000000..09aebc2d4bc19 --- /dev/null +++ b/llvm/unittests/CAS/UnifiedOnDiskCacheTest.cpp @@ -0,0 +1,198 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/CAS/UnifiedOnDiskCache.h" +#include "CASTestConfig.h" +#include "OnDiskCommonUtils.h" +#include "llvm/Testing/Support/Error.h" +#include "llvm/Testing/Support/SupportHelpers.h" +#include "gtest/gtest.h" + +using namespace llvm; +using namespace llvm::cas; +using namespace llvm::cas::ondisk; +using namespace llvm::unittest::cas; + +/// Visits all the files of a directory recursively and returns the sum of their +/// sizes. +static Expected<size_t> countFileSizes(StringRef Path) { + size_t TotalSize = 0; + std::error_code EC; + for (sys::fs::directory_iterator DirI(Path, EC), DirE; !EC && DirI != DirE; + DirI.increment(EC)) { + if (DirI->type() == sys::fs::file_type::directory_file) { + Expected<size_t> Subsize = countFileSizes(DirI->path()); + if (!Subsize) + return Subsize.takeError(); + TotalSize += *Subsize; + continue; + } + ErrorOr<sys::fs::basic_file_status> Stat = DirI->status(); + if (!Stat) + return createFileError(DirI->path(), Stat.getError()); + TotalSize += Stat->getSize(); + } + if (EC) + return createFileError(Path, EC); + return TotalSize; +} + +TEST_F(OnDiskCASTest, UnifiedOnDiskCacheTest) { + unittest::TempDir Temp("ondisk-unified", /*Unique=*/true); + std::unique_ptr<UnifiedOnDiskCache> UniDB; + + const uint64_t SizeLimit = 1024ull * 64; + auto reopenDB = [&]() { + UniDB.reset(); + ASSERT_THAT_ERROR(UnifiedOnDiskCache::open(Temp.path(), SizeLimit, "blake3", + sizeof(HashType)) + .moveInto(UniDB), + Succeeded()); + }; + + reopenDB(); + + HashType RootHash; + HashType OtherHash; + HashType Key1Hash; + HashType Key2Hash; + { + OnDiskGraphDB &DB = UniDB->getGraphDB(); + std::optional<ObjectID> ID1; + ASSERT_THAT_ERROR(store(DB, "1", {}).moveInto(ID1), Succeeded()); + std::optional<ObjectID> ID2; + ASSERT_THAT_ERROR(store(DB, "2", {}).moveInto(ID2), Succeeded()); + std::optional<ObjectID> IDRoot; + ASSERT_THAT_ERROR(store(DB, "root", {*ID1, *ID2}).moveInto(IDRoot), + Succeeded()); + ArrayRef<uint8_t> Digest = DB.getDigest(*IDRoot); + ASSERT_EQ(Digest.size(), RootHash.size()); + llvm::copy(Digest, RootHash.data()); + + std::optional<ObjectID> IDOther; + ASSERT_THAT_ERROR(store(DB, "other", {}).moveInto(IDOther), Succeeded()); + Digest = DB.getDigest(*IDOther); + ASSERT_EQ(Digest.size(), OtherHash.size()); + llvm::copy(Digest, OtherHash.data()); + + Key1Hash = digest("key1"); + std::optional<ObjectID> Val; + ASSERT_THAT_ERROR( + cachePut(UniDB->getKeyValueDB(), Key1Hash, *IDRoot).moveInto(Val), + Succeeded()); + EXPECT_EQ(IDRoot, Val); + + Key2Hash = digest("key2"); + std::optional<ObjectID> KeyID; + ASSERT_THAT_ERROR(DB.getReference(Key2Hash).moveInto(KeyID), Succeeded()); + ASSERT_THAT_ERROR(cachePut(UniDB->getKeyValueDB(), + UniDB->getGraphDB().getDigest(*KeyID), *ID1) + .moveInto(Val), + Succeeded()); + } + + auto checkTree = [&](const HashType &Digest, StringRef ExpectedTree) { + OnDiskGraphDB &DB = UniDB->getGraphDB(); + std::optional<ObjectID> ID; + ASSERT_THAT_ERROR(DB.getReference(Digest).moveInto(ID), Succeeded()); + std::string PrintedTree; + raw_string_ostream OS(PrintedTree); + ASSERT_THAT_ERROR(printTree(DB, *ID, OS), Succeeded()); + EXPECT_EQ(PrintedTree, ExpectedTree); + }; + auto checkRootTree = [&]() { + return checkTree(RootHash, "root\n 1\n 2\n"); + }; + + auto checkKey = [&](const HashType &Key, StringRef ExpectedData) { + OnDiskGraphDB &DB = UniDB->getGraphDB(); + std::optional<ObjectID> Val; + 
ASSERT_THAT_ERROR(cacheGet(UniDB->getKeyValueDB(), Key).moveInto(Val), + Succeeded()); + + ASSERT_TRUE(Val.has_value()); + std::optional<ondisk::ObjectHandle> Obj; + ASSERT_THAT_ERROR(DB.load(*Val).moveInto(Obj), Succeeded()); + EXPECT_EQ(toStringRef(DB.getObjectData(*Obj)), ExpectedData); + }; + + checkRootTree(); + checkTree(OtherHash, "other\n"); + checkKey(Key1Hash, "root"); + checkKey(Key2Hash, "1"); + + auto storeBigObject = [&](unsigned Index) { + SmallString<1000> Buf; + Buf.append(970, 'a'); + raw_svector_ostream(Buf) << Index; + std::optional<ObjectID> ID; + ASSERT_THAT_ERROR(store(UniDB->getGraphDB(), Buf, {}).moveInto(ID), + Succeeded()); + }; + + uint64_t PrevStoreSize = UniDB->getStorageSize(); + unsigned Index = 0; + while (!UniDB->hasExceededSizeLimit()) { + storeBigObject(Index++); + } + EXPECT_GT(UniDB->getStorageSize(), PrevStoreSize); + UniDB->setSizeLimit(SizeLimit * 2); + EXPECT_FALSE(UniDB->hasExceededSizeLimit()); + UniDB->setSizeLimit(SizeLimit); + EXPECT_TRUE(UniDB->hasExceededSizeLimit()); + + reopenDB(); + + EXPECT_FALSE(UniDB->hasExceededSizeLimit()); + EXPECT_FALSE(UniDB->needsGarbageCollection()); + + checkRootTree(); + checkKey(Key1Hash, "root"); + + while (!UniDB->hasExceededSizeLimit()) { + storeBigObject(Index++); + } + PrevStoreSize = UniDB->getStorageSize(); + ASSERT_THAT_ERROR(UniDB->close(), Succeeded()); + EXPECT_TRUE(UniDB->needsGarbageCollection()); + + reopenDB(); + EXPECT_TRUE(UniDB->needsGarbageCollection()); + + std::optional<size_t> DirSizeBefore; + ASSERT_THAT_ERROR(countFileSizes(Temp.path()).moveInto(DirSizeBefore), + Succeeded()); + + ASSERT_THAT_ERROR(UnifiedOnDiskCache::collectGarbage(Temp.path()), + Succeeded()); + + std::optional<size_t> DirSizeAfter; + ASSERT_THAT_ERROR(countFileSizes(Temp.path()).moveInto(DirSizeAfter), + Succeeded()); + EXPECT_LT(*DirSizeAfter, *DirSizeBefore); + + reopenDB(); + EXPECT_FALSE(UniDB->needsGarbageCollection()); + + checkRootTree(); + checkKey(Key1Hash, "root"); + + EXPECT_LT(UniDB->getStorageSize(), PrevStoreSize); + + // 'Other' tree and 'Key2' got garbage-collected. 
+ { + OnDiskGraphDB &DB = UniDB->getGraphDB(); + std::optional<ObjectID> ID; + ASSERT_THAT_ERROR(DB.getReference(OtherHash).moveInto(ID), Succeeded()); + EXPECT_FALSE(DB.containsObject(*ID)); + std::optional<ObjectID> Val; + ASSERT_THAT_ERROR(cacheGet(UniDB->getKeyValueDB(), Key2Hash).moveInto(Val), + Succeeded()); + EXPECT_FALSE(Val.has_value()); + } +} From 10349ca139068f6589f47369e1d48d06aeb66ad0 Mon Sep 17 00:00:00 2001 From: LLVM GN Syncbot <llvmgnsyncbot@gmail.com> Date: Mon, 3 Nov 2025 17:50:55 +0000 Subject: [PATCH 046/313] [gn build] Port 6747ea050dfc --- llvm/utils/gn/secondary/llvm/lib/CAS/BUILD.gn | 3 +++ llvm/utils/gn/secondary/llvm/unittests/CAS/BUILD.gn | 2 ++ 2 files changed, 5 insertions(+) diff --git a/llvm/utils/gn/secondary/llvm/lib/CAS/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/CAS/BUILD.gn index 5590b27ac3784..1e0e918d3370c 100644 --- a/llvm/utils/gn/secondary/llvm/lib/CAS/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/CAS/BUILD.gn @@ -4,14 +4,17 @@ static_library("CAS") { "ActionCache.cpp", "ActionCaches.cpp", "BuiltinCAS.cpp", + "BuiltinUnifiedCASDatabases.cpp", "DatabaseFile.cpp", "InMemoryCAS.cpp", "MappedFileRegionArena.cpp", "ObjectStore.cpp", + "OnDiskCAS.cpp", "OnDiskCommon.cpp", "OnDiskDataAllocator.cpp", "OnDiskGraphDB.cpp", "OnDiskKeyValueDB.cpp", "OnDiskTrieRawHashMap.cpp", + "UnifiedOnDiskCache.cpp", ] } diff --git a/llvm/utils/gn/secondary/llvm/unittests/CAS/BUILD.gn b/llvm/utils/gn/secondary/llvm/unittests/CAS/BUILD.gn index 2d9eb6814c376..b10e0e6706cc3 100644 --- a/llvm/utils/gn/secondary/llvm/unittests/CAS/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/unittests/CAS/BUILD.gn @@ -9,8 +9,10 @@ unittest("CASTests") { ] sources = [ "ActionCacheTest.cpp", + "BuiltinUnifiedCASDatabasesTest.cpp", "CASTestConfig.cpp", "ObjectStoreTest.cpp", + "UnifiedOnDiskCacheTest.cpp", ] if (llvm_enable_ondisk_cas) { From 2de5a17e9c8e427c75565574e8bdb5b62c35fec6 Mon Sep 17 00:00:00 2001 From: Amr Hesham <amr96@programmer.net> Date: Mon, 3 Nov 2025 19:08:27 +0100 Subject: [PATCH 047/313] [CIR] Upstream FPToFP Builtin CeilOp (#166052) Upstream the FPToFP Builtin CeilOp --- clang/include/clang/CIR/Dialect/IR/CIROps.td | 10 ++++++++++ clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp | 11 +++++++++++ clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp | 8 ++++++++ clang/test/CIR/CodeGen/builtins-floating-point.c | 11 +++++++++-- 4 files changed, 38 insertions(+), 2 deletions(-) diff --git a/clang/include/clang/CIR/Dialect/IR/CIROps.td b/clang/include/clang/CIR/Dialect/IR/CIROps.td index 2b361ed0982c6..dc56db1bbd4ea 100644 --- a/clang/include/clang/CIR/Dialect/IR/CIROps.td +++ b/clang/include/clang/CIR/Dialect/IR/CIROps.td @@ -4171,6 +4171,16 @@ def CIR_ATanOp : CIR_UnaryFPToFPBuiltinOp<"atan", "ATanOp"> { }]; } +def CIR_CeilOp : CIR_UnaryFPToFPBuiltinOp<"ceil", "FCeilOp"> { + let summary = "Computes the ceiling of the specified value"; + let description = [{ + `cir.ceil` computes the ceiling of a given value and returns a result + of the same type. + + Floating-point exceptions are ignored, and it does not set `errno`. 
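+
+    A short usage sketch (illustrative; this is the form the updated test
+    below checks for):
+
+    ```mlir
+    %1 = cir.ceil %0 : !cir.float
+    ```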
+  }];
+}
+
 def CIR_CosOp : CIR_UnaryFPToFPBuiltinOp<"cos", "CosOp"> {
   let summary = "Computes the floating-point cosine value";
   let description = [{
diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp
index e35100ffe4b6b..d9b9e3b877b50 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp
@@ -211,6 +211,17 @@ RValue CIRGenFunction::emitBuiltinExpr(const GlobalDecl &gd, unsigned builtinID,
     assert(!cir::MissingFeatures::fastMathFlags());
     return emitUnaryMaybeConstrainedFPBuiltin<cir::CosOp>(*this, *e);
 
+  case Builtin::BIceil:
+  case Builtin::BIceilf:
+  case Builtin::BIceill:
+  case Builtin::BI__builtin_ceil:
+  case Builtin::BI__builtin_ceilf:
+  case Builtin::BI__builtin_ceilf16:
+  case Builtin::BI__builtin_ceill:
+  case Builtin::BI__builtin_ceilf128:
+    assert(!cir::MissingFeatures::fastMathFlags());
+    return emitUnaryMaybeConstrainedFPBuiltin<cir::CeilOp>(*this, *e);
+
   case Builtin::BIfabs:
   case Builtin::BIfabsf:
   case Builtin::BIfabsl:
diff --git a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp
index 5a6193fa8d840..d94108294a9a3 100644
--- a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp
+++ b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp
@@ -1336,6 +1336,14 @@ mlir::LogicalResult CIRToLLVMATanOpLowering::matchAndRewrite(
   return mlir::success();
 }
 
+mlir::LogicalResult CIRToLLVMCeilOpLowering::matchAndRewrite(
+    cir::CeilOp op, OpAdaptor adaptor,
+    mlir::ConversionPatternRewriter &rewriter) const {
+  mlir::Type resTy = typeConverter->convertType(op.getType());
+  rewriter.replaceOpWithNewOp<mlir::LLVM::FCeilOp>(op, resTy, adaptor.getSrc());
+  return mlir::success();
+}
+
 mlir::LogicalResult CIRToLLVMAllocaOpLowering::matchAndRewrite(
     cir::AllocaOp op, OpAdaptor adaptor,
     mlir::ConversionPatternRewriter &rewriter) const {
diff --git a/clang/test/CIR/CodeGen/builtins-floating-point.c b/clang/test/CIR/CodeGen/builtins-floating-point.c
index 193cc172d37d2..8bdc43c59dc6f 100644
--- a/clang/test/CIR/CodeGen/builtins-floating-point.c
+++ b/clang/test/CIR/CodeGen/builtins-floating-point.c
@@ -7,14 +7,21 @@
 
 float cosf(float f) {
   return __builtin_cosf(f);
-  // CHECK: %{{.*}} = cir.cos {{.*}} : !cir.float
+  // CIR: %{{.*}} = cir.cos %{{.*}} : !cir.float
   // LLVM: %{{.*}} = call float @llvm.cos.f32(float %{{.*}})
   // OGCG: %{{.*}} = call float @llvm.cos.f32(float %{{.*}})
 }
 
 double cos(double f) {
   return __builtin_cos(f);
-  // CIR: {{.+}} = cir.cos {{.+}} : !cir.double
+  // CIR: %{{.*}} = cir.cos %{{.*}} : !cir.double
   // LLVM: %{{.*}} = call double @llvm.cos.f64(double %{{.*}})
   // OGCG: %{{.*}} = call double @llvm.cos.f64(double %{{.*}})
 }
+
+float ceil(float f) {
+  return __builtin_ceilf(f);
+  // CIR: %{{.*}} = cir.ceil %{{.*}} : !cir.float
+  // LLVM: %{{.*}} = call float @llvm.ceil.f32(float %{{.*}})
+  // OGCG: %{{.*}} = call float @llvm.ceil.f32(float %{{.*}})
+}

From d65e712e30a8998c897a6454e4eaea4f974bf765 Mon Sep 17 00:00:00 2001
From: Cyndy Ishida <cyndy_ishida@apple.com>
Date: Mon, 3 Nov 2025 10:09:18 -0800
Subject: [PATCH 048/313] [clang] Make "__GCC_HAVE_DWARF2_CFI_ASM" a proper
 predefined macro (#165731)

Use a flag to determine whether this macro should be set when
initializing the preprocessor.
This macro was added to the driver in 9d117e7b2a399a9b2bcf53fb9b9c0946e82dc75c because it can be conditionally disabled, but before that, the flag to gate behavior was removed under the assumption it wasn't conditional in b5b622a03c5136fa10d245dbe1f8f278ebd98d1b. This patch is to connect the macro with the preexisting flag --- clang-tools-extra/test/pp-trace/pp-trace-include.cpp | 1 - clang-tools-extra/test/pp-trace/pp-trace-macro.cpp | 1 - clang/include/clang/Basic/DebugOptions.def | 2 ++ clang/include/clang/Driver/Options.td | 8 ++++++-- clang/lib/Driver/ToolChains/Clang.cpp | 9 ++++++--- clang/lib/Frontend/InitPreprocessor.cpp | 3 +++ clang/test/DebugInfo/KeyInstructions/flag.cpp | 3 +++ clang/test/Preprocessor/unwind-tables.c | 2 ++ 8 files changed, 22 insertions(+), 7 deletions(-) diff --git a/clang-tools-extra/test/pp-trace/pp-trace-include.cpp b/clang-tools-extra/test/pp-trace/pp-trace-include.cpp index ea9896e1cfde2..fccbd9b3740bd 100644 --- a/clang-tools-extra/test/pp-trace/pp-trace-include.cpp +++ b/clang-tools-extra/test/pp-trace/pp-trace-include.cpp @@ -39,7 +39,6 @@ // CHECK-NEXT: Reason: EnterFile // CHECK-NEXT: FileType: C_User // CHECK-NEXT: PrevFID: (invalid) -// CHECK: - Callback: MacroDefined // CHECK: - Callback: FileChanged // CHECK-NEXT: Loc: "<built-in>:1:1" // CHECK-NEXT: Reason: ExitFile diff --git a/clang-tools-extra/test/pp-trace/pp-trace-macro.cpp b/clang-tools-extra/test/pp-trace/pp-trace-macro.cpp index 7c2a231101070..5bd38e0dade28 100644 --- a/clang-tools-extra/test/pp-trace/pp-trace-macro.cpp +++ b/clang-tools-extra/test/pp-trace/pp-trace-macro.cpp @@ -40,7 +40,6 @@ X // CHECK-NEXT: MacroNameTok: __STDC_EMBED_EMPTY__ // CHECK-NEXT: MacroDirective: MD_Define // CHECK: - Callback: MacroDefined -// CHECK: - Callback: MacroDefined // CHECK-NEXT: MacroNameTok: MACRO // CHECK-NEXT: MacroDirective: MD_Define // CHECK-NEXT: - Callback: MacroExpands diff --git a/clang/include/clang/Basic/DebugOptions.def b/clang/include/clang/Basic/DebugOptions.def index a768b12fa4e0d..ea3636ffa1af1 100644 --- a/clang/include/clang/Basic/DebugOptions.def +++ b/clang/include/clang/Basic/DebugOptions.def @@ -46,6 +46,8 @@ ENUM_DEBUGOPT(EmitDwarfUnwind, EmitDwarfUnwindType, 2, DEBUGOPT(NoDwarfDirectoryAsm , 1, 0, Benign) ///< Set when -fno-dwarf-directory-asm ///< is enabled. +DEBUGOPT(Dwarf2CFIAsm, 1, 0, NotCompatible) ///< Set when -fdwarf2-cfi-asm is enabled. + DEBUGOPT(NoInlineLineTables, 1, 0, Benign) ///< Whether debug info should contain ///< inline line tables. 
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index 4778b87b789a9..5cf332783cbc3 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -2155,8 +2155,12 @@ defm dollars_in_identifiers : BoolFOption<"dollars-in-identifiers", PosFlag<SetTrue, [], [ClangOption], "Allow">, NegFlag<SetFalse, [], [ClangOption], "Disallow">, BothFlags<[], [ClangOption, CC1Option], " '$' in identifiers">>; -def fdwarf2_cfi_asm : Flag<["-"], "fdwarf2-cfi-asm">, Group<clang_ignored_f_Group>; -def fno_dwarf2_cfi_asm : Flag<["-"], "fno-dwarf2-cfi-asm">, Group<clang_ignored_f_Group>; + +defm dwarf2_cfi_asm + : BoolFOption<"dwarf2-cfi-asm", CodeGenOpts<"Dwarf2CFIAsm">, DefaultFalse, + PosFlag<SetTrue, [], [ClangOption, CC1Option]>, + NegFlag<SetFalse>>; + defm dwarf_directory_asm : BoolFOption<"dwarf-directory-asm", CodeGenOpts<"NoDwarfDirectoryAsm">, DefaultFalse, NegFlag<SetTrue, [], [ClangOption, CC1Option]>, diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index d3ab6f1261ad6..30d3e5293a31b 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -7879,10 +7879,13 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, !TC.getTriple().isAndroid() && TC.useIntegratedAs())) CmdArgs.push_back("-faddrsig"); - if ((Triple.isOSBinFormatELF() || Triple.isOSBinFormatMachO()) && + const bool HasDefaultDwarf2CFIASM = + (Triple.isOSBinFormatELF() || Triple.isOSBinFormatMachO()) && (EH || UnwindTables || AsyncUnwindTables || - DebugInfoKind != llvm::codegenoptions::NoDebugInfo)) - CmdArgs.push_back("-D__GCC_HAVE_DWARF2_CFI_ASM=1"); + DebugInfoKind != llvm::codegenoptions::NoDebugInfo); + if (Args.hasFlag(options::OPT_fdwarf2_cfi_asm, + options::OPT_fno_dwarf2_cfi_asm, HasDefaultDwarf2CFIASM)) + CmdArgs.push_back("-fdwarf2-cfi-asm"); if (Arg *A = Args.getLastArg(options::OPT_fsymbol_partition_EQ)) { std::string Str = A->getAsString(Args); diff --git a/clang/lib/Frontend/InitPreprocessor.cpp b/clang/lib/Frontend/InitPreprocessor.cpp index ed3f1f93d25d3..b88d9f89c5f71 100644 --- a/clang/lib/Frontend/InitPreprocessor.cpp +++ b/clang/lib/Frontend/InitPreprocessor.cpp @@ -1516,6 +1516,9 @@ static void InitializePredefinedMacros(const TargetInfo &TI, if (LangOpts.PointerAuthIntrinsics) Builder.defineMacro("__PTRAUTH__"); + if (CGOpts.Dwarf2CFIAsm) + Builder.defineMacro("__GCC_HAVE_DWARF2_CFI_ASM"); + // Get other target #defines. TI.getTargetDefines(LangOpts, Builder); } diff --git a/clang/test/DebugInfo/KeyInstructions/flag.cpp b/clang/test/DebugInfo/KeyInstructions/flag.cpp index 6aeeed664135e..4a4a5c4c142a7 100644 --- a/clang/test/DebugInfo/KeyInstructions/flag.cpp +++ b/clang/test/DebugInfo/KeyInstructions/flag.cpp @@ -8,6 +8,9 @@ // KEY-INSTRUCTIONS: "-gkey-instructions" // NO-KEY-INSTRUCTIONS-NOT: key-instructions + +// Only expect one dwarf related flag. 
+// NO-DEBUG: -fdwarf2-cfi-asm // NO-DEBUG-NOT: debug-info-kind // NO-DEBUG-NOT: dwarf diff --git a/clang/test/Preprocessor/unwind-tables.c b/clang/test/Preprocessor/unwind-tables.c index 0a863d79adbf6..5ff990d0c40a6 100644 --- a/clang/test/Preprocessor/unwind-tables.c +++ b/clang/test/Preprocessor/unwind-tables.c @@ -1,11 +1,13 @@ // RUN: %clang %s -dM -E -target x86_64-windows | FileCheck %s --check-prefix=NO // RUN: %clang %s -dM -E -target x86_64 -fno-asynchronous-unwind-tables | FileCheck %s --check-prefix=NO +// RUN: %clang %s -dM -E -target x86_64 -fno-dwarf2-cfi-asm | FileCheck %s --check-prefix=NO // RUN: %clang %s -dM -E -target x86_64 | FileCheck %s // RUN: %clang %s -dM -E -target x86_64 -funwind-tables -fno-asynchronous-unwind-tables -g | FileCheck %s // RUN: %clang %s -dM -E -target aarch64-apple-darwin | FileCheck %s // RUN: %clang %s -dM -E -target x86_64 -fno-asynchronous-unwind-tables -g | FileCheck %s // RUN: %clang %s -dM -E -target x86_64 -fno-asynchronous-unwind-tables -fexceptions | FileCheck %s +// RUN: %clang %s -dM -E -target x86_64-windows -fdwarf2-cfi-asm | FileCheck %s // NO-NOT: #define __GCC_HAVE_DWARF2_CFI_ASM // CHECK: #define __GCC_HAVE_DWARF2_CFI_ASM 1 From 84a9ed25e816a51df765770e8867bccbde8da8e9 Mon Sep 17 00:00:00 2001 From: Marco Maia <marcogmaia@gmail.com> Date: Mon, 3 Nov 2025 15:15:32 -0300 Subject: [PATCH 049/313] [clangd] Preserve qualified names in "override pure virtual methods" tweak (#163726) Prevents the tweak from splitting **qualified names** (e.g., `foo::Type`) by incorrectly inserting a space around the scope resolution (`::`). **Before:** ```cpp // input: virtual foo::Type::func() = 0 // output: foo :: Type :: func() ``` **After:** ```cpp // input: virtual foo::Type::func() = 0 // output: foo::Type::func() ``` --- .../refactor/tweaks/OverridePureVirtuals.cpp | 5 ++- .../tweaks/OverridePureVirtualsTests.cpp | 39 +++++++++++++++++++ 2 files changed, 42 insertions(+), 2 deletions(-) diff --git a/clang-tools-extra/clangd/refactor/tweaks/OverridePureVirtuals.cpp b/clang-tools-extra/clangd/refactor/tweaks/OverridePureVirtuals.cpp index 16febeca70809..b557066d979f5 100644 --- a/clang-tools-extra/clangd/refactor/tweaks/OverridePureVirtuals.cpp +++ b/clang-tools-extra/clangd/refactor/tweaks/OverridePureVirtuals.cpp @@ -79,7 +79,7 @@ #include "clang/AST/ASTContext.h" #include "clang/AST/DeclCXX.h" -#include "clang/AST/Type.h" +#include "clang/AST/TypeBase.h" #include "clang/AST/TypeLoc.h" #include "clang/Basic/LLVM.h" #include "clang/Basic/SourceLocation.h" @@ -116,7 +116,8 @@ std::string removePureVirtualSyntax(const std::string &MethodDecl, DeclString += Tk.text(); if (Tk.Kind != tok::l_paren && Next.Kind != tok::comma && - Next.Kind != tok::r_paren && Next.Kind != tok::l_paren) + Next.Kind != tok::r_paren && Next.Kind != tok::l_paren && + Tk.Kind != tok::coloncolon && Next.Kind != tok::coloncolon) DeclString += ' '; } // Trim the last whitespace. 
diff --git a/clang-tools-extra/clangd/unittests/tweaks/OverridePureVirtualsTests.cpp b/clang-tools-extra/clangd/unittests/tweaks/OverridePureVirtualsTests.cpp index b7dcbee1650ec..72095ab2f5982 100644 --- a/clang-tools-extra/clangd/unittests/tweaks/OverridePureVirtualsTests.cpp +++ b/clang-tools-extra/clangd/unittests/tweaks/OverridePureVirtualsTests.cpp @@ -715,6 +715,45 @@ class D : public B { EXPECT_EQ(Expected, Applied) << "Applied result:\n" << Applied; } +TEST_F(OverridePureVirtualsTests, QualifiedNames) { + constexpr auto Before = R"cpp( +namespace foo { struct S{}; namespace bar { struct S2{}; } } + +class B { +public: + virtual foo::S foo(int var = 0) = 0; + virtual foo::bar::S2 bar(int var = 0) = 0; +}; + +class ^D : public B {}; +)cpp"; + + constexpr auto Expected = R"cpp( +namespace foo { struct S{}; namespace bar { struct S2{}; } } + +class B { +public: + virtual foo::S foo(int var = 0) = 0; + virtual foo::bar::S2 bar(int var = 0) = 0; +}; + +class D : public B { +public: + foo::S foo(int var = 0) override { + // TODO: Implement this pure virtual method. + static_assert(false, "Method `foo` is not implemented."); + } + + foo::bar::S2 bar(int var = 0) override { + // TODO: Implement this pure virtual method. + static_assert(false, "Method `bar` is not implemented."); + } +}; +)cpp"; + auto Applied = apply(Before); + EXPECT_EQ(Expected, Applied) << "Applied result:\n" << Applied; +} + } // namespace } // namespace clangd } // namespace clang From 425fe3362d49ff1b2958f8ec9cae49e587b57e7e Mon Sep 17 00:00:00 2001 From: Alex Langford <alangford@apple.com> Date: Mon, 3 Nov 2025 10:20:08 -0800 Subject: [PATCH 050/313] [lldb] Fix unaligned writes in ObjectFileELF (#165759) The code to apply relocations was sometimes creating unaligned destination pointers. Instead of giving them an explicit type (i.e. `uint64_t *`) and forcing the compiler to generate unaligned stores, mark the pointer as `void *`. The compiler will figure out the correct series of store instructions. --- lldb/source/Plugins/ObjectFile/ELF/ObjectFileELF.cpp | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/lldb/source/Plugins/ObjectFile/ELF/ObjectFileELF.cpp b/lldb/source/Plugins/ObjectFile/ELF/ObjectFileELF.cpp index 49841e7307443..e06e69fb08305 100644 --- a/lldb/source/Plugins/ObjectFile/ELF/ObjectFileELF.cpp +++ b/lldb/source/Plugins/ObjectFile/ELF/ObjectFileELF.cpp @@ -2735,9 +2735,8 @@ static void ApplyELF64ABS64Relocation(Symtab *symtab, ELFRelocation &rel, // ObjectFileELF creates a WritableDataBuffer in CreateInstance. WritableDataBuffer *data_buffer = llvm::cast<WritableDataBuffer>(data_buffer_sp.get()); - uint64_t *dst = reinterpret_cast<uint64_t *>( - data_buffer->GetBytes() + rel_section->GetFileOffset() + - ELFRelocation::RelocOffset64(rel)); + void *const dst = data_buffer->GetBytes() + rel_section->GetFileOffset() + + ELFRelocation::RelocOffset64(rel); uint64_t val_offset = value + ELFRelocation::RelocAddend64(rel); memcpy(dst, &val_offset, sizeof(uint64_t)); } @@ -2762,9 +2761,8 @@ static void ApplyELF64ABS32Relocation(Symtab *symtab, ELFRelocation &rel, // ObjectFileELF creates a WritableDataBuffer in CreateInstance. 
   WritableDataBuffer *data_buffer =
       llvm::cast<WritableDataBuffer>(data_buffer_sp.get());
-  uint32_t *dst = reinterpret_cast<uint32_t *>(
-      data_buffer->GetBytes() + rel_section->GetFileOffset() +
-      ELFRelocation::RelocOffset32(rel));
+  void *const dst = data_buffer->GetBytes() + rel_section->GetFileOffset() +
+                    ELFRelocation::RelocOffset32(rel);
   memcpy(dst, &truncated_addr, sizeof(uint32_t));
   }
 }

From ecdd660356323d18b23cbb7d8b9244a2e0662be4 Mon Sep 17 00:00:00 2001
From: Amr Hesham <amr96@programmer.net>
Date: Mon, 3 Nov 2025 19:26:46 +0100
Subject: [PATCH 051/313] [clang] Report Diagnostic when builtin vector has
 negative size (#166055)

Report a diagnostic in case vector_size or ext_vector_type attributes
are used with a negative size.

The same evaluation result can be used for other checks, for example, a
size that is too big.

Issue #165463
---
 clang/docs/ReleaseNotes.rst                      |  3 +++
 clang/include/clang/Basic/DiagnosticSemaKinds.td |  2 ++
 clang/lib/Sema/SemaType.cpp                      | 10 ++++++++++
 clang/test/SemaCXX/vector.cpp                    | 13 +++++++++++++
 4 files changed, 28 insertions(+)

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index c3740a4a027bd..cd272396252d0 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -395,6 +395,9 @@ Improvements to Clang's diagnostics
   that were previously incorrectly accepted in case of other irrelevant
   conditions are now consistently diagnosed, identical to C++ mode.
 
+- Clang now emits a diagnostic in case `vector_size` or `ext_vector_type`
+  attributes are used with a negative size (#GH165463).
+
 Improvements to Clang's time-trace
 ----------------------------------
 
diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index 4e369be0bbb92..fa509536bf021 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -3510,6 +3510,8 @@ def err_init_method_bad_return_type : Error<
   "init methods must return an object pointer type, not %0">;
 def err_attribute_invalid_size : Error<
   "vector size not an integral multiple of component size">;
+def err_attribute_vec_negative_size
+    : Error<"vector must have non-negative size">;
 def err_attribute_zero_size : Error<"zero %0 size">;
 def err_attribute_size_too_large : Error<"%0 size too large">;
 def err_typecheck_sve_rvv_ambiguous : Error<
diff --git a/clang/lib/Sema/SemaType.cpp b/clang/lib/Sema/SemaType.cpp
index 280b3c92cce14..682fd258eccf2 100644
--- a/clang/lib/Sema/SemaType.cpp
+++ b/clang/lib/Sema/SemaType.cpp
@@ -2358,6 +2358,11 @@ QualType Sema::BuildVectorType(QualType CurType, Expr *SizeExpr,
     return QualType();
   }
 
+  if (VecSize->isNegative()) {
+    Diag(SizeExpr->getExprLoc(), diag::err_attribute_vec_negative_size);
+    return QualType();
+  }
+
   if (CurType->isDependentType())
     return Context.getDependentVectorType(CurType, SizeExpr, AttrLoc,
                                           VectorKind::Generic);
@@ -2427,6 +2432,11 @@ QualType Sema::BuildExtVectorType(QualType T, Expr *ArraySize,
     return QualType();
   }
 
+  if (vecSize->isNegative()) {
+    Diag(ArraySize->getExprLoc(), diag::err_attribute_vec_negative_size);
+    return QualType();
+  }
+
   if (!vecSize->isIntN(32)) {
     Diag(AttrLoc, diag::err_attribute_size_too_large)
         << ArraySize->getSourceRange() << "vector";
diff --git a/clang/test/SemaCXX/vector.cpp b/clang/test/SemaCXX/vector.cpp
index 808bdb679b09c..06195f039cd92 100644
--- a/clang/test/SemaCXX/vector.cpp
+++ b/clang/test/SemaCXX/vector.cpp
@@ -786,3 +786,16 @@ const long long e = *0; // expected-error {{indirection requires pointer operand}}
 double f = a - e; // expected-error {{cannot initialize a variable of type 'double' with an rvalue of type '__attribute__((__vector_size__(1 * sizeof(double)))) double' (vector of 1 'double' value)}}
 int h = c - e;    // expected-error {{cannot initialize a variable of type 'int' with an rvalue of type '__attribute__((__vector_size__(1 * sizeof(long)))) long' (vector of 1 'long' value)}}
 }
+
+typedef int v_neg_size __attribute__((vector_size(-8))); // expected-error{{vector must have non-negative size}}
+typedef int v_neg_size_2 __attribute__((vector_size(-1 * 8))); // expected-error{{vector must have non-negative size}}
+typedef int v_ext_neg_size __attribute__((ext_vector_type(-8))); // expected-error{{vector must have non-negative size}}
+typedef int v_ext_neg_size2 __attribute__((ext_vector_type(-1 * 8))); // expected-error{{vector must have non-negative size}}
+
+
+#if __cplusplus >= 201103L
+
+template <int N> using templated_v_size = int __attribute__((vector_size(N))); // expected-error{{vector must have non-negative size}}
+templated_v_size<-8> templated_v_neg_size; // expected-note{{in instantiation of template type alias 'templated_v_size' requested here}}
+
+#endif

From fa2c5fe21df42accfcd4498aa7e6b2ae943357ea Mon Sep 17 00:00:00 2001
From: Aiden Grossman <aidengrossman@google.com>
Date: Mon, 3 Nov 2025 10:50:51 -0800
Subject: [PATCH 052/313] [Github] Pin Remaining Github Actions to SHAs
 (#166194)

We had a couple in the llvm/actions repository that were pinned to
main. Pin them to the latest SHA in main to keep them consistent with
everything else. This also ensures we are compliant with our own CI
best practices and also cleans up the remaining CodeQL findings for
this specific issue.
---
 .github/workflows/hlsl-test-all.yaml     | 2 +-
 .github/workflows/libclang-abi-tests.yml | 2 +-
 .github/workflows/llvm-abi-tests.yml     | 2 +-
 .github/workflows/new-issues.yml         | 2 +-
 .github/workflows/premerge.yaml          | 2 +-
 .github/workflows/release-binaries.yml   | 2 +-
 6 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/.github/workflows/hlsl-test-all.yaml b/.github/workflows/hlsl-test-all.yaml
index cdc951658b4d2..ce6ccfa23df6a 100644
--- a/.github/workflows/hlsl-test-all.yaml
+++ b/.github/workflows/hlsl-test-all.yaml
@@ -54,7 +54,7 @@ jobs:
         path: golden-images
     - name: Setup Windows
       if: runner.os == 'Windows'
-      uses: llvm/actions/setup-windows@main
+      uses: llvm/actions/setup-windows@42d80571b13f4599bbefbc7189728b64723c7f78 # main
       with:
         arch: amd64
     - name: Build DXC
diff --git a/.github/workflows/libclang-abi-tests.yml b/.github/workflows/libclang-abi-tests.yml
index 5ccf976848197..432c45744abda 100644
--- a/.github/workflows/libclang-abi-tests.yml
+++ b/.github/workflows/libclang-abi-tests.yml
@@ -100,7 +100,7 @@ jobs:
       repo: ${{ github.repository }}
     steps:
     - name: Install Ninja
-      uses: llvm/actions/install-ninja@main
+      uses: llvm/actions/install-ninja@42d80571b13f4599bbefbc7189728b64723c7f78 # main
    - name: Install abi-compliance-checker
      run: |
        sudo apt-get update
diff --git a/.github/workflows/llvm-abi-tests.yml b/.github/workflows/llvm-abi-tests.yml
index f73d180bb0005..961f1cc79389d 100644
--- a/.github/workflows/llvm-abi-tests.yml
+++ b/.github/workflows/llvm-abi-tests.yml
@@ -88,7 +88,7 @@ jobs:
       repo: ${{ github.repository }}
     steps:
     - name: Install Ninja
-      uses: llvm/actions/install-ninja@main
+      uses: llvm/actions/install-ninja@42d80571b13f4599bbefbc7189728b64723c7f78 # main
    - name: Install abi-compliance-checker
      run: |
        sudo apt-get update
diff --git
a/.github/workflows/new-issues.yml b/.github/workflows/new-issues.yml index 8480a657cc717..a5dcad28dbe24 100644 --- a/.github/workflows/new-issues.yml +++ b/.github/workflows/new-issues.yml @@ -13,7 +13,7 @@ jobs: runs-on: ubuntu-24.04 if: github.repository == 'llvm/llvm-project' steps: - - uses: llvm/actions/issue-labeler@main + - uses: llvm/actions/issue-labeler@42d80571b13f4599bbefbc7189728b64723c7f78 # main with: repo-token: ${{ secrets.ISSUE_SUBSCRIBER_TOKEN }} configuration-path: .github/new-issues-labeler.yml diff --git a/.github/workflows/premerge.yaml b/.github/workflows/premerge.yaml index 6303a119750b5..973d3abf358ce 100644 --- a/.github/workflows/premerge.yaml +++ b/.github/workflows/premerge.yaml @@ -190,7 +190,7 @@ jobs: with: max-size: "2000M" - name: Install Ninja - uses: llvm/actions/install-ninja@main + uses: llvm/actions/install-ninja@42d80571b13f4599bbefbc7189728b64723c7f78 # main - name: Build and Test run: | source <(git diff --name-only HEAD~1...HEAD | python3 .ci/compute_projects.py) diff --git a/.github/workflows/release-binaries.yml b/.github/workflows/release-binaries.yml index d1a017ab7b553..acc47231e3569 100644 --- a/.github/workflows/release-binaries.yml +++ b/.github/workflows/release-binaries.yml @@ -177,7 +177,7 @@ jobs: - name: Setup Windows if: startsWith(runner.os, 'Windows') - uses: llvm/actions/setup-windows@main + uses: llvm/actions/setup-windows@42d80571b13f4599bbefbc7189728b64723c7f78 # main with: arch: amd64 From d200df0557b71fae0d77bc7fc1650d22a05af371 Mon Sep 17 00:00:00 2001 From: Aiden Grossman <aidengrossman@google.com> Date: Mon, 3 Nov 2025 11:04:22 -0800 Subject: [PATCH 053/313] [libcxx] Remove Redundant Reset in ~basic_string (#164718) 8dae17be2991cd7f0d7fd9aa5aecd064520a14f6 refactors basic_string for more code reuse. This makes sense in most cases, but has performance overhead in the case of ~basic_string. The refactoring of ~basic_string to call __reset_internal_buffer() added a redundant (inside the destructor) reset of the object, which the optimizer is unable to optimize away in many cases. This patch prevents a ~1% regression we observed on an internal workload when applying the original refactoring. This does slightly pessimize the code readability, but I think this change is worth it given the performance impact. I'm hoping to add a benchmark(s) to the upstream libc++ benchmark suite around string construction/destruction to ensure that this case does not regress as it seems common in real world applications. I will put up a separate PR for that when I figure out a reasonable way to write it. 
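
To illustrate the shape of the issue (a simplified sketch with made-up
names, not the actual libc++ code): resetting members inside a
destructor is a dead store once the object's lifetime ends, but the
optimizer cannot always prove that and delete it.

```cpp
#include <new>

// Hypothetical stand-in for basic_string's buffer handling.
struct Buffer {
  char *Data;
  Buffer() : Data(static_cast<char *>(::operator new(16))) {}
  ~Buffer() {
    char *Old = Data;
    Data = nullptr; // redundant inside the destructor: the object is dead
    ::operator delete(Old);
  }
};

int main() { Buffer B; } // the store to Data above is a dead store
```

Passing an uninitialized dummy `__rep` instead lets the destructor skip
that store without changing observable behavior.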
---
 libcxx/include/string | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/libcxx/include/string b/libcxx/include/string
index 33382c7af4b2c..ede42467b99fe 100644
--- a/libcxx/include/string
+++ b/libcxx/include/string
@@ -644,6 +644,7 @@ basic_string<char32_t> operator""s( const char32_t *str, size_t len );
 # include <__utility/forward.h>
 # include <__utility/is_pointer_in_range.h>
 # include <__utility/move.h>
+# include <__utility/no_destroy.h>
 # include <__utility/scope_guard.h>
 # include <__utility/swap.h>
 # include <climits>
@@ -918,6 +919,7 @@ private:
     __rep() = default;
     _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __rep(__short __r) : __s(__r) {}
     _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __rep(__long __r) : __l(__r) {}
+    _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __rep(__uninitialized_tag) {}
   };
 
   _LIBCPP_COMPRESSED_PAIR(__rep, __rep_, allocator_type, __alloc_);
@@ -1210,7 +1212,10 @@ public:
   }
 # endif // _LIBCPP_CXX03_LANG
 
-  inline _LIBCPP_CONSTEXPR_SINCE_CXX20 ~basic_string() { __reset_internal_buffer(); }
+  // TODO(boomanaiden154): Once we mark `this` in destructors as dead on return,
+  // we can use a normal call to __reset_internal_buffer and remove the extra
+  // __rep constructor.
+  inline _LIBCPP_CONSTEXPR_SINCE_CXX20 ~basic_string() { __reset_internal_buffer(__rep(__uninitialized_tag())); }
 
   _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 operator __self_view() const _NOEXCEPT {
     return __self_view(typename __self_view::__assume_valid(), data(), size());

From 8fd1bf2f8c9e6e7c4bc5f6915a9d52bb3672601b Mon Sep 17 00:00:00 2001
From: yonghong-song <yhs@fb.com>
Date: Mon, 3 Nov 2025 11:11:47 -0800
Subject: [PATCH 054/313] [BPF] Remove unused weak symbol __bpf_trap (#166003)

Nikita Popov reported an issue ([1]) where a dangling weak symbol
__bpf_trap ends up in the final binary, and this causes libbpf to fail
like below:

  $ veristat -v ./t.o
  Processing 't.o'...
  libbpf: elf: skipping unrecognized data section(4) .eh_frame
  libbpf: elf: skipping relo section(5) .rel.eh_frame for section(4) .eh_frame
  libbpf: failed to find BTF for extern '__bpf_trap': -3
  Failed to open './t.o': -3

In LLVM, the DAG selection phase generates __bpf_trap in code. Later,
the UnreachableBlockElim pass removes __bpf_trap from the code, but the
__bpf_trap symbol survives in the symbol table. Having a dangling
__bpf_trap weak symbol is not good for old kernels, as seen in the
above veristat failure.

Although users could use the compiler flag
`-mllvm -bpf-disable-trap-unreachable` to work around the issue, this
patch fixes the issue by removing the dangling __bpf_trap.
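
For context, C code along these lines (an illustrative reconstruction
mirroring the .ll test added below, not the reporter's exact source)
produces the pattern: the unreachable switch default lowers to a
__bpf_trap call that is later eliminated, leaving only the symbol.

```c
int test(unsigned char x) {
  switch (x & 3) { /* only values 0..3 are possible */
  case 0:
    return 32;
  case 1:
    return 12;
  case 2:
    return 43;
  case 3:
    return 54;
  default:
    __builtin_unreachable(); /* lowers to a __bpf_trap call */
  }
}
```
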
[1] https://github.com/llvm/llvm-project/issues/165696 --- llvm/lib/Target/BPF/BPFAsmPrinter.cpp | 24 ++++++++++++++++++++ llvm/lib/Target/BPF/BPFAsmPrinter.h | 1 + llvm/test/CodeGen/BPF/bpf_trap.ll | 32 +++++++++++++++++++++++++++ 3 files changed, 57 insertions(+) create mode 100644 llvm/test/CodeGen/BPF/bpf_trap.ll diff --git a/llvm/lib/Target/BPF/BPFAsmPrinter.cpp b/llvm/lib/Target/BPF/BPFAsmPrinter.cpp index 77dc4a75a7d68..b2a82040ee823 100644 --- a/llvm/lib/Target/BPF/BPFAsmPrinter.cpp +++ b/llvm/lib/Target/BPF/BPFAsmPrinter.cpp @@ -88,6 +88,16 @@ bool BPFAsmPrinter::doFinalization(Module &M) { } } + for (GlobalObject &GO : M.global_objects()) { + if (!GO.hasExternalWeakLinkage()) + continue; + + if (!SawTrapCall && GO.getName() == BPF_TRAP) { + GO.eraseFromParent(); + break; + } + } + return AsmPrinter::doFinalization(M); } @@ -160,6 +170,20 @@ bool BPFAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, } void BPFAsmPrinter::emitInstruction(const MachineInstr *MI) { + if (MI->isCall()) { + for (const MachineOperand &Op : MI->operands()) { + if (Op.isGlobal()) { + if (const GlobalValue *GV = Op.getGlobal()) + if (GV->getName() == BPF_TRAP) + SawTrapCall = true; + } else if (Op.isSymbol()) { + if (const MCSymbol *Sym = Op.getMCSymbol()) + if (Sym->getName() == BPF_TRAP) + SawTrapCall = true; + } + } + } + BPF_MC::verifyInstructionPredicates(MI->getOpcode(), getSubtargetInfo().getFeatureBits()); diff --git a/llvm/lib/Target/BPF/BPFAsmPrinter.h b/llvm/lib/Target/BPF/BPFAsmPrinter.h index 90ef2073609a6..75a1d7ed9f884 100644 --- a/llvm/lib/Target/BPF/BPFAsmPrinter.h +++ b/llvm/lib/Target/BPF/BPFAsmPrinter.h @@ -39,6 +39,7 @@ class BPFAsmPrinter : public AsmPrinter { private: BTFDebug *BTF; TargetMachine &TM; + bool SawTrapCall = false; const BPFTargetMachine &getBTM() const; }; diff --git a/llvm/test/CodeGen/BPF/bpf_trap.ll b/llvm/test/CodeGen/BPF/bpf_trap.ll new file mode 100644 index 0000000000000..ab8df5ff7cb0d --- /dev/null +++ b/llvm/test/CodeGen/BPF/bpf_trap.ll @@ -0,0 +1,32 @@ +; RUN: llc < %s | FileCheck %s +; +target triple = "bpf" + +define i32 @test(i8 %x) { +entry: + %0 = and i8 %x, 3 + switch i8 %0, label %default.unreachable4 [ + i8 0, label %return + i8 1, label %sw.bb1 + i8 2, label %sw.bb2 + i8 3, label %sw.bb3 + ] + +sw.bb1: ; preds = %entry + br label %return + +sw.bb2: ; preds = %entry + br label %return + +sw.bb3: ; preds = %entry + br label %return + +default.unreachable4: ; preds = %entry + unreachable + +return: ; preds = %entry, %sw.bb3, %sw.bb2, %sw.bb1 + %retval.0 = phi i32 [ 12, %sw.bb1 ], [ 43, %sw.bb2 ], [ 54, %sw.bb3 ], [ 32, %entry ] + ret i32 %retval.0 +} + +; CHECK-NOT: __bpf_trap From 0a28c07f2f51b6c94546f8e8e59c4e934ba6c520 Mon Sep 17 00:00:00 2001 From: Tom Stellard <tstellar@redhat.com> Date: Mon, 3 Nov 2025 11:12:23 -0800 Subject: [PATCH 055/313] workflows/release-binaries: Disable LTO/PGO for testing macOS job in PRs (#165801) When a PR is submitted the macos-14 workflow will run with LTO/PGO disabled. This makes it possible to run the workflow on the free runners with the six hour timeout and will allow us to test the workflow on pull requests. 
---
 .github/workflows/release-binaries.yml | 21 ++++++++++++++++++---
 1 file changed, 18 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/release-binaries.yml b/.github/workflows/release-binaries.yml
index acc47231e3569..25f426b7814df 100644
--- a/.github/workflows/release-binaries.yml
+++ b/.github/workflows/release-binaries.yml
@@ -138,7 +138,6 @@ jobs:
             target_cmake_flags="$target_cmake_flags -DLLVM_RELEASE_ENABLE_LTO=OFF"
           fi

-          echo "target-cmake-flags=$target_cmake_flags" >> $GITHUB_OUTPUT
           case "${{ inputs.runs-on }}" in
             ubuntu-22.04*)
               build_runs_on="depot-${{ inputs.runs-on }}-16"
@@ -157,6 +156,23 @@ jobs:
               build_runs_on=$test_runs_on
               ;;
           esac
+
+          case "$build_runs_on" in
+            # These runners cannot build the full release package within
+            # the 6-hour timeout limit, so we need to use a configuration
+            # that builds more quickly.
+            macos-14)
+              bootstrap_prefix="BOOTSTRAP"
+              target_cmake_flags="$target_cmake_flags -DLLVM_RELEASE_ENABLE_LTO=OFF -DLLVM_RELEASE_ENABLE_PGO=OFF"
+              ;;
+            *)
+              bootstrap_prefix="BOOTSTRAP_BOOTSTRAP"
+              ;;
+          esac
+
+          target_cmake_flags="$target_cmake_flags -D${bootstrap_prefix}_CPACK_PACKAGE_FILE_NAME=$release_binary_basename"
+
+          echo "target-cmake-flags=$target_cmake_flags" >> $GITHUB_OUTPUT
           echo "build-runs-on=$build_runs_on" >> $GITHUB_OUTPUT
           echo "test-runs-on=$test_runs_on" >> $GITHUB_OUTPUT

@@ -200,8 +216,7 @@ jobs:
           # so we need to set some extra cmake flags to disable this.
           cmake -G Ninja -S llvm -B ${{ steps.setup-stage.outputs.build-prefix }}/build \
               ${{ needs.prepare.outputs.target-cmake-flags }} \
-              -C clang/cmake/caches/Release.cmake \
-              -DBOOTSTRAP_BOOTSTRAP_CPACK_PACKAGE_FILE_NAME="${{ needs.prepare.outputs.release-binary-basename }}"
+              -C clang/cmake/caches/Release.cmake

       - name: Build
         shell: bash

From 3a0c534032aac828a12fd2057b00197a12aa90b2 Mon Sep 17 00:00:00 2001
From: Justin Bogner <mail@justinbogner.com>
Date: Mon, 3 Nov 2025 11:13:50 -0800
Subject: [PATCH 056/313] [ORC] Fix -Wunused-function warning on windows (#166207)

All of the users of this function are guarded by LLVM_ON_UNIX and
LLVM_ENABLE_THREADS ifdefs, so wrap the function itself in these guards
as well to avoid the unused-function warning.
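The fix follows a pattern that generalizes beyond ORC: a `static` function
whose only call sites are conditionally compiled should sit behind the same
preprocessor condition, otherwise configurations that drop the call sites
emit -Wunused-function. A minimal, self-contained sketch of the idea (the
feature macro and function names are hypothetical, not the llvm-jitlink
code itself):

```cpp
#include <cstdio>

// Stand-in for a configuration condition such as
// LLVM_ON_UNIX && LLVM_ENABLE_THREADS.
#define HAVE_FEATURE_X 1

#if HAVE_FEATURE_X
// Defined only when a caller exists; an unguarded definition would trigger
// -Wunused-function on configurations that compile the callers out.
static void setup_feature_x() { std::puts("feature X configured"); }
#endif

int main() {
#if HAVE_FEATURE_X
  setup_feature_x(); // the only call site, guarded identically
#endif
  return 0;
}
```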
--- llvm/tools/llvm-jitlink/llvm-jitlink.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/llvm/tools/llvm-jitlink/llvm-jitlink.cpp b/llvm/tools/llvm-jitlink/llvm-jitlink.cpp index 79216e89c7cba..88d6daf08d35e 100644 --- a/llvm/tools/llvm-jitlink/llvm-jitlink.cpp +++ b/llvm/tools/llvm-jitlink/llvm-jitlink.cpp @@ -776,6 +776,7 @@ createSharedMemoryManager(SimpleRemoteEPC &SREPC) { SlabSize, SREPC, SAs); } +#if LLVM_ON_UNIX && LLVM_ENABLE_THREADS static void setupEPCRemoteMemoryManager(SimpleRemoteEPC::Setup &S) { switch (UseMemMgr) { case MemMgr::Default: @@ -789,6 +790,7 @@ static void setupEPCRemoteMemoryManager(SimpleRemoteEPC::Setup &S) { break; } } +#endif static Expected<MaterializationUnit::Interface> getTestObjectFileInterface(Session &S, MemoryBufferRef O) { From 0c707c9713f0657f7208b8f9a95a13af749d95c5 Mon Sep 17 00:00:00 2001 From: Marcell Leleszi <59964679+mleleszi@users.noreply.github.com> Date: Mon, 3 Nov 2025 20:15:14 +0100 Subject: [PATCH 057/313] [libc] Add printf error handling (#162876) [#159474](https://github.com/llvm/llvm-project/issues/159474) - All printf variants set errno and consistently return -1 on error, instead of returning various predefined error codes - Return value overflow handling is added --- libc/src/stdio/CMakeLists.txt | 24 +++++++++ libc/src/stdio/asprintf.cpp | 18 ++++++- libc/src/stdio/baremetal/CMakeLists.txt | 8 +++ libc/src/stdio/baremetal/printf.cpp | 23 ++++++-- libc/src/stdio/baremetal/vprintf.cpp | 23 ++++++-- libc/src/stdio/generic/CMakeLists.txt | 4 ++ libc/src/stdio/generic/fprintf.cpp | 17 +++++- libc/src/stdio/generic/printf.cpp | 17 +++++- libc/src/stdio/generic/vfprintf.cpp | 17 +++++- libc/src/stdio/generic/vprintf.cpp | 17 +++++- libc/src/stdio/printf_core/CMakeLists.txt | 25 +++++++++ libc/src/stdio/printf_core/core_structs.h | 19 ++++--- libc/src/stdio/printf_core/error_mapper.h | 21 ++++++++ .../stdio/printf_core/generic/CMakeLists.txt | 8 +++ .../stdio/printf_core/generic/error_mapper.h | 49 +++++++++++++++++ .../stdio/printf_core/linux/CMakeLists.txt | 8 +++ .../stdio/printf_core/linux/error_mapper.h | 54 +++++++++++++++++++ libc/src/stdio/printf_core/printf_main.h | 9 ++-- .../stdio/printf_core/vasprintf_internal.h | 20 +++---- .../src/stdio/printf_core/vfprintf_internal.h | 41 +++++++++----- .../stdio/printf_core/write_int_converter.h | 4 +- libc/src/stdio/printf_core/writer.h | 8 +-- libc/src/stdio/snprintf.cpp | 19 ++++++- libc/src/stdio/sprintf.cpp | 18 ++++++- libc/src/stdio/vasprintf.cpp | 16 +++++- libc/src/stdio/vsnprintf.cpp | 19 ++++++- libc/src/stdio/vsprintf.cpp | 17 +++++- libc/src/stdlib/CMakeLists.txt | 6 +++ libc/src/stdlib/strfromd.cpp | 10 +++- libc/src/stdlib/strfromf.cpp | 10 +++- libc/src/stdlib/strfroml.cpp | 10 +++- libc/src/time/strftime_core/strftime_main.h | 3 +- libc/test/src/stdio/CMakeLists.txt | 2 + libc/test/src/stdio/fprintf_test.cpp | 23 ++++++++ .../src/stdio/printf_core/converter_test.cpp | 30 +++++------ .../src/stdio/printf_core/writer_test.cpp | 32 +++++------ libc/test/src/stdio/snprintf_test.cpp | 15 ++++++ libc/test/src/stdio/vfprintf_test.cpp | 5 ++ libc/test/src/stdlib/StrfromTest.h | 19 ++++++- 39 files changed, 584 insertions(+), 104 deletions(-) create mode 100644 libc/src/stdio/printf_core/error_mapper.h create mode 100644 libc/src/stdio/printf_core/generic/CMakeLists.txt create mode 100644 libc/src/stdio/printf_core/generic/error_mapper.h create mode 100644 libc/src/stdio/printf_core/linux/CMakeLists.txt create mode 100644 libc/src/stdio/printf_core/linux/error_mapper.h 
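Before reading the per-file diffs, it helps to see the shape every public
entrypoint takes after this patch: the core engine reports either a character
count or an internal error code, and the entrypoint maps that code to an errno
value plus the conventional -1 return. The sketch below is a simplified
stand-in with invented names, not the actual LLVM libc sources, and it assumes
EOVERFLOW is available from <cerrno>:

```cpp
#include <cerrno>
#include <climits>
#include <cstddef>

// Engine error codes live below -1000 so they cannot collide with positive
// system errno values (mirrors the scheme in the patched core_structs.h).
constexpr int ALLOCATION_ERROR = -1006;
constexpr int OVERFLOW_ERROR = -1007;

// value is the number of characters written, valid only when err == 0.
struct ResultOrError {
  size_t value;
  int err; // 0 on success, otherwise a negative internal code
};

static int internal_error_to_errno(int internal_error) {
  switch (internal_error) {
  case ALLOCATION_ERROR:
    return ENOMEM;
  case OVERFLOW_ERROR:
    return EOVERFLOW;
  default:
    return EINVAL;
  }
}

// The tail that each printf variant now shares: map engine errors to errno
// and -1, and reject counts that do not fit in the int return type.
int entrypoint_tail(ResultOrError r) {
  if (r.err != 0) {
    errno = internal_error_to_errno(r.err);
    return -1;
  }
  if (r.value > static_cast<size_t>(INT_MAX)) {
    errno = internal_error_to_errno(OVERFLOW_ERROR);
    return -1;
  }
  return static_cast<int>(r.value);
}
```

This same tail appears, with the real ErrorOr type, in printf, fprintf,
snprintf, asprintf, and the strfrom* functions below, which is why those
diffs look repetitive.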
diff --git a/libc/src/stdio/CMakeLists.txt b/libc/src/stdio/CMakeLists.txt index b0a6ef1e291b5..c75c8b11be2b5 100644 --- a/libc/src/stdio/CMakeLists.txt +++ b/libc/src/stdio/CMakeLists.txt @@ -125,6 +125,10 @@ add_entrypoint_object( DEPENDS libc.src.stdio.printf_core.printf_main libc.src.stdio.printf_core.writer + libc.src.stdio.printf_core.core_structs + libc.src.stdio.printf_core.error_mapper + libc.src.__support.libc_errno + libc.src.__support.CPP.limits ) add_entrypoint_object( @@ -136,6 +140,10 @@ add_entrypoint_object( DEPENDS libc.src.stdio.printf_core.printf_main libc.src.stdio.printf_core.writer + libc.src.stdio.printf_core.core_structs + libc.src.stdio.printf_core.error_mapper + libc.src.__support.libc_errno + libc.src.__support.CPP.limits ) add_entrypoint_object( @@ -146,6 +154,10 @@ add_entrypoint_object( asprintf.h DEPENDS libc.src.stdio.printf_core.vasprintf_internal + libc.src.stdio.printf_core.core_structs + libc.src.stdio.printf_core.error_mapper + libc.src.__support.libc_errno + libc.src.__support.CPP.limits ) add_entrypoint_object( @@ -157,6 +169,10 @@ add_entrypoint_object( DEPENDS libc.src.stdio.printf_core.printf_main libc.src.stdio.printf_core.writer + libc.src.stdio.printf_core.core_structs + libc.src.stdio.printf_core.error_mapper + libc.src.__support.libc_errno + libc.src.__support.CPP.limits ) add_entrypoint_object( @@ -168,6 +184,10 @@ add_entrypoint_object( DEPENDS libc.src.stdio.printf_core.printf_main libc.src.stdio.printf_core.writer + libc.src.stdio.printf_core.core_structs + libc.src.stdio.printf_core.error_mapper + libc.src.__support.libc_errno + libc.src.__support.CPP.limits ) add_entrypoint_object( @@ -178,6 +198,10 @@ add_entrypoint_object( vasprintf.h DEPENDS libc.src.stdio.printf_core.vasprintf_internal + libc.src.stdio.printf_core.core_structs + libc.src.stdio.printf_core.error_mapper + libc.src.__support.libc_errno + libc.src.__support.CPP.limits ) add_subdirectory(printf_core) diff --git a/libc/src/stdio/asprintf.cpp b/libc/src/stdio/asprintf.cpp index f8cfb74ce48ea..083f40c1f19fa 100644 --- a/libc/src/stdio/asprintf.cpp +++ b/libc/src/stdio/asprintf.cpp @@ -7,8 +7,12 @@ //===----------------------------------------------------------------------===// #include "src/stdio/asprintf.h" +#include "src/__support/CPP/limits.h" #include "src/__support/arg_list.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" +#include "src/stdio/printf_core/core_structs.h" +#include "src/stdio/printf_core/error_mapper.h" #include "src/stdio/printf_core/vasprintf_internal.h" namespace LIBC_NAMESPACE_DECL { @@ -22,8 +26,18 @@ LLVM_LIBC_FUNCTION(int, asprintf, // and pointer semantics, as well as handling // destruction automatically. 
va_end(vlist); - int ret = printf_core::vasprintf_internal(buffer, format, args); - return ret; + auto ret_val = printf_core::vasprintf_internal(buffer, format, args); + if (!ret_val.has_value()) { + libc_errno = printf_core::internal_error_to_errno(ret_val.error()); + return -1; + } + if (ret_val.value() > cpp::numeric_limits<int>::max()) { + libc_errno = + printf_core::internal_error_to_errno(-printf_core::OVERFLOW_ERROR); + return -1; + } + + return static_cast<int>(ret_val.value()); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdio/baremetal/CMakeLists.txt b/libc/src/stdio/baremetal/CMakeLists.txt index 548938f885c94..bfeff0e2b5880 100644 --- a/libc/src/stdio/baremetal/CMakeLists.txt +++ b/libc/src/stdio/baremetal/CMakeLists.txt @@ -29,8 +29,12 @@ add_entrypoint_object( DEPENDS libc.src.stdio.printf_core.printf_main libc.src.stdio.printf_core.writer + libc.src.stdio.printf_core.error_mapper + libc.src.stdio.printf_core.core_structs libc.src.__support.arg_list libc.src.__support.OSUtil.osutil + libc.src.__support.libc_errno + libc.src.__support.CPP.limits ) add_entrypoint_object( @@ -87,8 +91,12 @@ add_entrypoint_object( DEPENDS libc.src.stdio.printf_core.printf_main libc.src.stdio.printf_core.writer + libc.src.stdio.printf_core.error_mapper + libc.src.stdio.printf_core.core_structs libc.src.__support.arg_list libc.src.__support.OSUtil.osutil + libc.src.__support.libc_errno + libc.src.__support.CPP.limits ) add_entrypoint_object( diff --git a/libc/src/stdio/baremetal/printf.cpp b/libc/src/stdio/baremetal/printf.cpp index 7253c6549a4e4..0c6c9ad338c9f 100644 --- a/libc/src/stdio/baremetal/printf.cpp +++ b/libc/src/stdio/baremetal/printf.cpp @@ -7,10 +7,13 @@ //===----------------------------------------------------------------------===// #include "src/stdio/printf.h" +#include "src/__support/CPP/limits.h" #include "src/__support/OSUtil/io.h" #include "src/__support/arg_list.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" #include "src/stdio/printf_core/core_structs.h" +#include "src/stdio/printf_core/error_mapper.h" #include "src/stdio/printf_core/printf_main.h" #include "src/stdio/printf_core/writer.h" @@ -42,13 +45,25 @@ LLVM_LIBC_FUNCTION(int, printf, (const char *__restrict format, ...)) { buffer, BUFF_SIZE, &stdout_write_hook, nullptr); printf_core::Writer<printf_core::WriteMode::FLUSH_TO_STREAM> writer(wb); - int retval = printf_core::printf_main(&writer, format, args); + auto retval = printf_core::printf_main(&writer, format, args); + if (!retval.has_value()) { + libc_errno = printf_core::internal_error_to_errno(retval.error()); + return -1; + } int flushval = wb.overflow_write(""); - if (flushval != printf_core::WRITE_OK) - retval = flushval; + if (flushval != printf_core::WRITE_OK) { + libc_errno = printf_core::internal_error_to_errno(-flushval); + return -1; + } - return retval; + if (retval.value() > cpp::numeric_limits<int>::max()) { + libc_errno = + printf_core::internal_error_to_errno(-printf_core::OVERFLOW_ERROR); + return -1; + } + + return static_cast<int>(retval.value()); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdio/baremetal/vprintf.cpp b/libc/src/stdio/baremetal/vprintf.cpp index ab02533f14911..d2f586c70ad1c 100644 --- a/libc/src/stdio/baremetal/vprintf.cpp +++ b/libc/src/stdio/baremetal/vprintf.cpp @@ -7,10 +7,13 @@ //===----------------------------------------------------------------------===// #include "src/stdio/vprintf.h" +#include "src/__support/CPP/limits.h" #include 
"src/__support/OSUtil/io.h" #include "src/__support/arg_list.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" #include "src/stdio/printf_core/core_structs.h" +#include "src/stdio/printf_core/error_mapper.h" #include "src/stdio/printf_core/printf_main.h" #include "src/stdio/printf_core/writer.h" @@ -40,13 +43,25 @@ LLVM_LIBC_FUNCTION(int, vprintf, buffer, BUFF_SIZE, &stdout_write_hook, nullptr); printf_core::Writer<printf_core::WriteMode::FLUSH_TO_STREAM> writer(wb); - int retval = printf_core::printf_main(&writer, format, args); + auto retval = printf_core::printf_main(&writer, format, args); + if (!retval.has_value()) { + libc_errno = printf_core::internal_error_to_errno(retval.error()); + return -1; + } int flushval = wb.overflow_write(""); - if (flushval != printf_core::WRITE_OK) - retval = flushval; + if (flushval != printf_core::WRITE_OK) { + libc_errno = printf_core::internal_error_to_errno(-flushval); + return -1; + } - return retval; + if (retval.value() > cpp::numeric_limits<int>::max()) { + libc_errno = + printf_core::internal_error_to_errno(-printf_core::OVERFLOW_ERROR); + return -1; + } + + return static_cast<int>(retval.value()); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdio/generic/CMakeLists.txt b/libc/src/stdio/generic/CMakeLists.txt index 6361822b61999..71055edea3d9e 100644 --- a/libc/src/stdio/generic/CMakeLists.txt +++ b/libc/src/stdio/generic/CMakeLists.txt @@ -393,7 +393,11 @@ add_generic_entrypoint_object( list(APPEND fprintf_deps libc.hdr.types.FILE libc.src.__support.arg_list + libc.src.__support.CPP.limits + libc.src.__support.libc_errno libc.src.stdio.printf_core.vfprintf_internal + libc.src.stdio.printf_core.core_structs + libc.src.stdio.printf_core.error_mapper ) if(LLVM_LIBC_FULL_BUILD) diff --git a/libc/src/stdio/generic/fprintf.cpp b/libc/src/stdio/generic/fprintf.cpp index 087aeadfc52c5..b07f2528fe11d 100644 --- a/libc/src/stdio/generic/fprintf.cpp +++ b/libc/src/stdio/generic/fprintf.cpp @@ -8,9 +8,12 @@ #include "src/stdio/fprintf.h" +#include "src/__support/CPP/limits.h" #include "src/__support/File/file.h" #include "src/__support/arg_list.h" #include "src/__support/macros/config.h" +#include "src/stdio/printf_core/core_structs.h" +#include "src/stdio/printf_core/error_mapper.h" #include "src/stdio/printf_core/vfprintf_internal.h" #include "hdr/types/FILE.h" @@ -27,8 +30,18 @@ LLVM_LIBC_FUNCTION(int, fprintf, // and pointer semantics, as well as handling // destruction automatically. 
va_end(vlist); - int ret_val = printf_core::vfprintf_internal(stream, format, args); - return ret_val; + auto ret_val = printf_core::vfprintf_internal(stream, format, args); + if (!ret_val.has_value()) { + libc_errno = printf_core::internal_error_to_errno(ret_val.error()); + return -1; + } + if (ret_val.value() > cpp::numeric_limits<int>::max()) { + libc_errno = + printf_core::internal_error_to_errno(-printf_core::OVERFLOW_ERROR); + return -1; + } + + return static_cast<int>(ret_val.value()); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdio/generic/printf.cpp b/libc/src/stdio/generic/printf.cpp index bb7c7c86f843f..d6d4adcefb3b1 100644 --- a/libc/src/stdio/generic/printf.cpp +++ b/libc/src/stdio/generic/printf.cpp @@ -8,9 +8,12 @@ #include "src/stdio/printf.h" +#include "src/__support/CPP/limits.h" #include "src/__support/File/file.h" #include "src/__support/arg_list.h" #include "src/__support/macros/config.h" +#include "src/stdio/printf_core/core_structs.h" +#include "src/stdio/printf_core/error_mapper.h" #include "src/stdio/printf_core/vfprintf_internal.h" #include "hdr/types/FILE.h" @@ -31,9 +34,19 @@ LLVM_LIBC_FUNCTION(int, printf, (const char *__restrict format, ...)) { // and pointer semantics, as well as handling // destruction automatically. va_end(vlist); - int ret_val = printf_core::vfprintf_internal( + auto ret_val = printf_core::vfprintf_internal( reinterpret_cast<::FILE *>(PRINTF_STDOUT), format, args); - return ret_val; + if (!ret_val.has_value()) { + libc_errno = printf_core::internal_error_to_errno(ret_val.error()); + return -1; + } + if (ret_val.value() > cpp::numeric_limits<int>::max()) { + libc_errno = + printf_core::internal_error_to_errno(-printf_core::OVERFLOW_ERROR); + return -1; + } + + return static_cast<int>(ret_val.value()); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdio/generic/vfprintf.cpp b/libc/src/stdio/generic/vfprintf.cpp index 01f4265f118a6..c00352d1dd666 100644 --- a/libc/src/stdio/generic/vfprintf.cpp +++ b/libc/src/stdio/generic/vfprintf.cpp @@ -8,9 +8,12 @@ #include "src/stdio/vfprintf.h" +#include "src/__support/CPP/limits.h" #include "src/__support/File/file.h" #include "src/__support/arg_list.h" #include "src/__support/macros/config.h" +#include "src/stdio/printf_core/core_structs.h" +#include "src/stdio/printf_core/error_mapper.h" #include "src/stdio/printf_core/vfprintf_internal.h" #include "hdr/types/FILE.h" @@ -24,8 +27,18 @@ LLVM_LIBC_FUNCTION(int, vfprintf, internal::ArgList args(vlist); // This holder class allows for easier copying // and pointer semantics, as well as handling // destruction automatically. 
-  int ret_val = printf_core::vfprintf_internal(stream, format, args);
-  return ret_val;
+  auto ret_val = printf_core::vfprintf_internal(stream, format, args);
+  if (!ret_val.has_value()) {
+    libc_errno = printf_core::internal_error_to_errno(ret_val.error());
+    return -1;
+  }
+  if (ret_val.value() > cpp::numeric_limits<int>::max()) {
+    libc_errno =
+        printf_core::internal_error_to_errno(-printf_core::OVERFLOW_ERROR);
+    return -1;
+  }
+
+  return static_cast<int>(ret_val.value());
 }

 } // namespace LIBC_NAMESPACE_DECL

diff --git a/libc/src/stdio/generic/vprintf.cpp b/libc/src/stdio/generic/vprintf.cpp
index 08d71515646ed..1c0837fd5d441 100644
--- a/libc/src/stdio/generic/vprintf.cpp
+++ b/libc/src/stdio/generic/vprintf.cpp
@@ -8,9 +8,12 @@

 #include "src/stdio/vprintf.h"

+#include "src/__support/CPP/limits.h"
 #include "src/__support/File/file.h"
 #include "src/__support/arg_list.h"
 #include "src/__support/macros/config.h"
+#include "src/stdio/printf_core/core_structs.h"
+#include "src/stdio/printf_core/error_mapper.h"
 #include "src/stdio/printf_core/vfprintf_internal.h"

 #include "hdr/types/FILE.h"
@@ -29,9 +32,19 @@ LLVM_LIBC_FUNCTION(int, vprintf,
   internal::ArgList args(vlist); // This holder class allows for easier copying
                                  // and pointer semantics, as well as handling
                                  // destruction automatically.
-  int ret_val = printf_core::vfprintf_internal(
+  auto ret_val = printf_core::vfprintf_internal(
       reinterpret_cast<::FILE *>(PRINTF_STDOUT), format, args);
-  return ret_val;
+  if (!ret_val.has_value()) {
+    libc_errno = printf_core::internal_error_to_errno(ret_val.error());
+    return -1;
+  }
+  if (ret_val.value() > cpp::numeric_limits<int>::max()) {
+    libc_errno =
+        printf_core::internal_error_to_errno(-printf_core::OVERFLOW_ERROR);
+    return -1;
+  }
+
+  return static_cast<int>(ret_val.value());
 }

 } // namespace LIBC_NAMESPACE_DECL

diff --git a/libc/src/stdio/printf_core/CMakeLists.txt b/libc/src/stdio/printf_core/CMakeLists.txt
index ee66145e60156..2d1daea71406e 100644
--- a/libc/src/stdio/printf_core/CMakeLists.txt
+++ b/libc/src/stdio/printf_core/CMakeLists.txt
@@ -32,6 +32,17 @@ if(printf_config_copts)
   list(PREPEND printf_config_copts "COMPILE_OPTIONS")
 endif()

+if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${LIBC_TARGET_OS})
+  add_subdirectory(${LIBC_TARGET_OS})
+else()
+  add_subdirectory(generic)
+endif()
+
+set(target_error_mapper libc.src.stdio.printf_core.${LIBC_TARGET_OS}.error_mapper)
+if(NOT TARGET ${target_error_mapper})
+  set(target_error_mapper libc.src.stdio.printf_core.generic.error_mapper)
+endif()
+
 add_header_library(
   printf_config
   HDRS
@@ -47,6 +58,7 @@ add_header_library(
     libc.include.inttypes
     libc.src.__support.CPP.string_view
     libc.src.__support.FPUtil.fp_bits
+    libc.hdr.errno_macros
 )

 add_header_library(
@@ -125,6 +137,7 @@ add_header_library(
     .writer
     .core_structs
     libc.src.__support.arg_list
+    libc.src.__support.error_or
 )

 add_header_library(
@@ -136,10 +149,20 @@ add_header_library(
     libc.hdr.func.free
     libc.hdr.func.realloc
     libc.src.__support.arg_list
+    libc.src.__support.error_or
     libc.src.stdio.printf_core.printf_main
     libc.src.stdio.printf_core.writer
 )

+add_header_library(
+  error_mapper
+  HDRS
+    error_mapper.h
+  DEPENDS
+    ${target_error_mapper}
+    libc.src.__support.macros.properties.architectures
+)
+
 if(NOT (TARGET libc.src.__support.File.file) AND LLVM_LIBC_FULL_BUILD)
   # Not all platforms have a file implementation. If file is unvailable, and a
   # full build is requested, then we must skip all file based printf sections.
@@ -152,8 +175,10 @@ add_header_library(
     vfprintf_internal.h
   DEPENDS
     libc.src.__support.File.file
+    libc.src.__support.error_or
     libc.src.__support.arg_list
     libc.src.stdio.printf_core.printf_main
     libc.src.stdio.printf_core.writer
     ${use_system_file}
 )
+

diff --git a/libc/src/stdio/printf_core/core_structs.h b/libc/src/stdio/printf_core/core_structs.h
index e27f77b6b594a..0d41f2244d8da 100644
--- a/libc/src/stdio/printf_core/core_structs.h
+++ b/libc/src/stdio/printf_core/core_structs.h
@@ -132,14 +132,17 @@ template <typename T> LIBC_INLINE constexpr TypeDesc type_desc_from_type() {
 // This is the value to be returned by conversions when no error has occurred.
 constexpr int WRITE_OK = 0;

-// These are the printf return values for when an error has occurred. They are
-// all negative, and should be distinct.
-constexpr int FILE_WRITE_ERROR = -1;
-constexpr int FILE_STATUS_ERROR = -2;
-constexpr int NULLPTR_WRITE_ERROR = -3;
-constexpr int INT_CONVERSION_ERROR = -4;
-constexpr int FIXED_POINT_CONVERSION_ERROR = -5;
-constexpr int ALLOCATION_ERROR = -6;
+// These are the error return values used by the printf engine when an
+// error has occurred. They are all large negative, distinct values starting
+// below -1000 so that they do not overlap with system errors.
+constexpr int FILE_WRITE_ERROR = -1001;
+constexpr int FILE_STATUS_ERROR = -1002;
+constexpr int NULLPTR_WRITE_ERROR = -1003;
+constexpr int INT_CONVERSION_ERROR = -1004;
+constexpr int FIXED_POINT_CONVERSION_ERROR = -1005;
+constexpr int ALLOCATION_ERROR = -1006;
+constexpr int OVERFLOW_ERROR = -1007;
+
 } // namespace printf_core
 } // namespace LIBC_NAMESPACE_DECL

diff --git a/libc/src/stdio/printf_core/error_mapper.h b/libc/src/stdio/printf_core/error_mapper.h
new file mode 100644
index 0000000000000..23030930133a1
--- /dev/null
+++ b/libc/src/stdio/printf_core/error_mapper.h
@@ -0,0 +1,21 @@
+//===-- Error mapper for printf ---------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_STDIO_PRINTF_CORE_ERROR_MAPPER_H
+#define LLVM_LIBC_SRC_STDIO_PRINTF_CORE_ERROR_MAPPER_H
+
+#include "src/__support/macros/properties/architectures.h"
+
+// Maps internal errors to the available errnos on the platform.
+#if defined(__linux__)
+#include "linux/error_mapper.h"
+#else
+#include "generic/error_mapper.h"
+#endif
+
+#endif // LLVM_LIBC_SRC_STDIO_PRINTF_CORE_ERROR_MAPPER_H

diff --git a/libc/src/stdio/printf_core/generic/CMakeLists.txt b/libc/src/stdio/printf_core/generic/CMakeLists.txt
new file mode 100644
index 0000000000000..2f0143d992e31
--- /dev/null
+++ b/libc/src/stdio/printf_core/generic/CMakeLists.txt
@@ -0,0 +1,8 @@
+add_header_library(
+  error_mapper
+  HDRS
+    error_mapper.h
+  DEPENDS
+    libc.src.stdio.printf_core.core_structs
+    libc.hdr.errno_macros
+)

diff --git a/libc/src/stdio/printf_core/generic/error_mapper.h b/libc/src/stdio/printf_core/generic/error_mapper.h
new file mode 100644
index 0000000000000..d8cdd2cc2dbaa
--- /dev/null
+++ b/libc/src/stdio/printf_core/generic/error_mapper.h
@@ -0,0 +1,49 @@
+//===-- Generic implementation of error mapper ------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_STDIO_PRINTF_CORE_GENERIC_ERROR_MAPPER_H
+#define LLVM_LIBC_SRC_STDIO_PRINTF_CORE_GENERIC_ERROR_MAPPER_H
+
+#include "hdr/errno_macros.h"
+#include "src/stdio/printf_core/core_structs.h"
+#include "src/stdio/printf_core/error_mapper.h"
+
+namespace LIBC_NAMESPACE_DECL {
+namespace printf_core {
+
+LIBC_INLINE static int internal_error_to_errno(int internal_error) {
+  // A system error occurred; return it as is.
+  if (internal_error < 1001 && internal_error > 0) {
+    return internal_error;
+  }
+
+  // Map internal error to the available C standard errnos.
+  switch (-internal_error) {
+  case WRITE_OK:
+    return 0;
+  case FILE_WRITE_ERROR:
+  case FILE_STATUS_ERROR:
+  case NULLPTR_WRITE_ERROR:
+  case ALLOCATION_ERROR:
+    return EDOM;
+  case INT_CONVERSION_ERROR:
+  case FIXED_POINT_CONVERSION_ERROR:
+  case OVERFLOW_ERROR:
+    return ERANGE;
+  default:
+    LIBC_ASSERT(
+        false &&
+        "Invalid internal printf error code passed to internal_error_to_errno");
+    return EDOM;
+  }
+}
+
+} // namespace printf_core
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC_STDIO_PRINTF_CORE_GENERIC_ERROR_MAPPER_H

diff --git a/libc/src/stdio/printf_core/linux/CMakeLists.txt b/libc/src/stdio/printf_core/linux/CMakeLists.txt
new file mode 100644
index 0000000000000..2f0143d992e31
--- /dev/null
+++ b/libc/src/stdio/printf_core/linux/CMakeLists.txt
@@ -0,0 +1,8 @@
+add_header_library(
+  error_mapper
+  HDRS
+    error_mapper.h
+  DEPENDS
+    libc.src.stdio.printf_core.core_structs
+    libc.hdr.errno_macros
+)

diff --git a/libc/src/stdio/printf_core/linux/error_mapper.h b/libc/src/stdio/printf_core/linux/error_mapper.h
new file mode 100644
index 0000000000000..3c2fe663072d0
--- /dev/null
+++ b/libc/src/stdio/printf_core/linux/error_mapper.h
@@ -0,0 +1,54 @@
+//===-- Linux implementation of error mapper --------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_STDIO_PRINTF_CORE_LINUX_ERROR_MAPPER_H
+#define LLVM_LIBC_SRC_STDIO_PRINTF_CORE_LINUX_ERROR_MAPPER_H
+
+#include "hdr/errno_macros.h"
+#include "src/stdio/printf_core/core_structs.h"
+#include "src/stdio/printf_core/error_mapper.h"
+
+namespace LIBC_NAMESPACE_DECL {
+namespace printf_core {
+
+LIBC_INLINE static int internal_error_to_errno(int internal_error) {
+  // A system error occurred; return it as is.
+  if (internal_error < 1001 && internal_error > 0) {
+    return internal_error;
+  }
+
+  // Map internal error to POSIX errnos.
+  switch (-internal_error) {
+  case WRITE_OK:
+    return 0;
+  case FILE_WRITE_ERROR:
+    return EIO;
+  case FILE_STATUS_ERROR:
+    return EIO;
+  case NULLPTR_WRITE_ERROR:
+    return EINVAL;
+  case INT_CONVERSION_ERROR:
+    return ERANGE;
+  case FIXED_POINT_CONVERSION_ERROR:
+    return EINVAL;
+  case ALLOCATION_ERROR:
+    return ENOMEM;
+  case OVERFLOW_ERROR:
+    return EOVERFLOW;
+  default:
+    LIBC_ASSERT(
+        false &&
+        "Invalid internal printf error code passed to internal_error_to_errno");
+    return EINVAL;
+  }
+}
+
+} // namespace printf_core
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC_STDIO_PRINTF_CORE_LINUX_ERROR_MAPPER_H

diff --git a/libc/src/stdio/printf_core/printf_main.h b/libc/src/stdio/printf_core/printf_main.h
index 57f29858d5298..1c7a7237c097d 100644
--- a/libc/src/stdio/printf_core/printf_main.h
+++ b/libc/src/stdio/printf_core/printf_main.h
@@ -10,6 +10,7 @@
 #define LLVM_LIBC_SRC_STDIO_PRINTF_CORE_PRINTF_MAIN_H

 #include "src/__support/arg_list.h"
+#include "src/__support/error_or.h"
 #include "src/__support/macros/config.h"
 #include "src/stdio/printf_core/converter.h"
 #include "src/stdio/printf_core/core_structs.h"
@@ -22,8 +23,9 @@ namespace LIBC_NAMESPACE_DECL {
 namespace printf_core {

 template <WriteMode write_mode>
-int printf_main(Writer<write_mode> *writer, const char *__restrict str,
-                internal::ArgList &args) {
+ErrorOr<size_t> printf_main(Writer<write_mode> *writer,
+                            const char *__restrict str,
+                            internal::ArgList &args) {
   Parser<internal::ArgList> parser(str, args);
   int result = 0;
   for (FormatSection cur_section = parser.get_next_section();
@@ -33,9 +35,9 @@ int printf_main(Writer<write_mode> *writer, const char *__restrict str,
     if (cur_section.has_conv)
       result = convert(writer, cur_section);
     else
      result = writer->write(cur_section.raw_string);
-    if (result < 0)
-      return result;
+    if (result < 0)
+      return Error(-result);
   }

   return writer->get_chars_written();

diff --git a/libc/src/stdio/printf_core/vasprintf_internal.h b/libc/src/stdio/printf_core/vasprintf_internal.h
index 283d8df2810fb..41df17b67f35b 100644
--- a/libc/src/stdio/printf_core/vasprintf_internal.h
+++ b/libc/src/stdio/printf_core/vasprintf_internal.h
@@ -10,6 +10,7 @@
 #include "hdr/func/malloc.h"
 #include "hdr/func/realloc.h"
 #include "src/__support/arg_list.h"
+#include "src/__support/error_or.h"
 #include "src/stdio/printf_core/core_structs.h"
 #include "src/stdio/printf_core/printf_main.h"
 #include "src/stdio/printf_core/writer.h"
@@ -29,7 +30,7 @@ LIBC_INLINE int resize_overflow_hook(cpp::string_view new_str, void *target) {
     if (new_buff == nullptr) {
       if (wb->buff != wb->init_buff)
         free(wb->buff);
-      return printf_core::ALLOCATION_ERROR;
+      return ALLOCATION_ERROR;
     }
     if (isBuffOnStack)
       inline_memcpy(new_buff, wb->buff, wb->buff_cur);
@@ -42,27 +43,28 @@ LIBC_INLINE int resize_overflow_hook(cpp::string_view new_str, void *target) {

 constexpr size_t DEFAULT_BUFFER_SIZE = 200;

-LIBC_INLINE int vasprintf_internal(char **ret, const char *__restrict format,
-                                   internal::ArgList args) {
+LIBC_INLINE ErrorOr<size_t> vasprintf_internal(char **ret,
+                                               const char *__restrict format,
+                                               internal::ArgList args) {
   char init_buff_on_stack[DEFAULT_BUFFER_SIZE];
   printf_core::WriteBuffer<Mode<WriteMode::RESIZE_AND_FILL_BUFF>::value> wb(
       init_buff_on_stack, DEFAULT_BUFFER_SIZE, resize_overflow_hook);
   printf_core::Writer writer(wb);

   auto ret_val = printf_core::printf_main(&writer, format, args);
-  if (ret_val < 0) {
+  if (!ret_val.has_value()) {
     *ret = nullptr;
-    return -1;
+    return ret_val;
   }
   if (wb.buff == init_buff_on_stack) {
-    *ret = static_cast<char *>(malloc(ret_val + 1));
+    *ret = static_cast<char *>(malloc(ret_val.value() + 1));
     if (ret == nullptr)
-      return printf_core::ALLOCATION_ERROR;
+      return Error(ALLOCATION_ERROR);
-    inline_memcpy(*ret, wb.buff, ret_val);
+    inline_memcpy(*ret, wb.buff, ret_val.value());
   } else {
     *ret = wb.buff;
   }
-  (*ret)[ret_val] = '\0';
+  (*ret)[ret_val.value()] = '\0';
   return ret_val;
 }
 } // namespace printf_core

diff --git a/libc/src/stdio/printf_core/vfprintf_internal.h b/libc/src/stdio/printf_core/vfprintf_internal.h
index 630de9d9d43dd..564441d3bf51a 100644
--- a/libc/src/stdio/printf_core/vfprintf_internal.h
+++ b/libc/src/stdio/printf_core/vfprintf_internal.h
@@ -11,6 +11,7 @@

 #include "src/__support/File/file.h"
 #include "src/__support/arg_list.h"
+#include "src/__support/error_or.h"
 #include "src/__support/macros/attributes.h" // For LIBC_INLINE
 #include "src/__support/macros/config.h"
 #include "src/stdio/printf_core/core_structs.h"
@@ -35,8 +36,8 @@ LIBC_INLINE void funlockfile(FILE *f) {
   reinterpret_cast<LIBC_NAMESPACE::File *>(f)->unlock();
 }

-LIBC_INLINE size_t fwrite_unlocked(const void *ptr, size_t size, size_t nmemb,
-                                   FILE *f) {
+LIBC_INLINE FileIOResult fwrite_unlocked(const void *ptr, size_t size,
+                                         size_t nmemb, FILE *f) {
   return reinterpret_cast<LIBC_NAMESPACE::File *>(f)->write_unlocked(
       ptr, size * nmemb);
 }
@@ -47,9 +48,11 @@ LIBC_INLINE void flockfile(::FILE *f) { ::flockfile(f); }

 LIBC_INLINE void funlockfile(::FILE *f) { ::funlockfile(f); }

-LIBC_INLINE size_t fwrite_unlocked(const void *ptr, size_t size, size_t nmemb,
-                                   ::FILE *f) {
-  return ::fwrite_unlocked(ptr, size, nmemb, f);
+LIBC_INLINE FileIOResult fwrite_unlocked(const void *ptr, size_t size,
+                                         size_t nmemb, ::FILE *f) {
+  // Need to use the system errno in this case, as the system write will set
+  // it, and we need to propagate it back into our code.
+  return {::fwrite_unlocked(ptr, size, nmemb, f), errno};
 }
 #endif // LIBC_COPT_STDIO_USE_SYSTEM_FILE
 } // namespace internal
@@ -60,26 +63,38 @@ LIBC_INLINE int file_write_hook(cpp::string_view new_str, void *fp) {
   ::FILE *target_file = reinterpret_cast<::FILE *>(fp);
   // Write new_str to the target file. The logic preventing a zero-length write
   // is in the writer, so we don't check here.
-  size_t written = internal::fwrite_unlocked(new_str.data(), sizeof(char),
-                                             new_str.size(), target_file);
-  if (written != new_str.size() || internal::ferror_unlocked(target_file))
+  auto write_result = internal::fwrite_unlocked(new_str.data(), sizeof(char),
+                                                new_str.size(), target_file);
+  // Propagate the actual system error in FileIOResult.
+  if (write_result.has_error())
+    return -write_result.error;
+
+  // In case a short write occurred or the error was not set on FileIOResult
+  // for some reason.
+ if (write_result.value != new_str.size() || + internal::ferror_unlocked(target_file)) return FILE_WRITE_ERROR; + return WRITE_OK; } -LIBC_INLINE int vfprintf_internal(::FILE *__restrict stream, - const char *__restrict format, - internal::ArgList &args) { +LIBC_INLINE ErrorOr<size_t> vfprintf_internal(::FILE *__restrict stream, + const char *__restrict format, + internal::ArgList &args) { constexpr size_t BUFF_SIZE = 1024; char buffer[BUFF_SIZE]; printf_core::WriteBuffer<Mode<WriteMode::FLUSH_TO_STREAM>::value> wb( buffer, BUFF_SIZE, &file_write_hook, reinterpret_cast<void *>(stream)); Writer writer(wb); internal::flockfile(stream); - int retval = printf_main(&writer, format, args); + auto retval = printf_main(&writer, format, args); + if (!retval.has_value()) { + internal::funlockfile(stream); + return retval; + } int flushval = wb.overflow_write(""); if (flushval != WRITE_OK) - retval = flushval; + retval = Error(-flushval); internal::funlockfile(stream); return retval; } diff --git a/libc/src/stdio/printf_core/write_int_converter.h b/libc/src/stdio/printf_core/write_int_converter.h index efcff278bd284..04b2bef05bc7b 100644 --- a/libc/src/stdio/printf_core/write_int_converter.h +++ b/libc/src/stdio/printf_core/write_int_converter.h @@ -29,11 +29,11 @@ LIBC_INLINE int convert_write_int(Writer<write_mode> *writer, return NULLPTR_WRITE_ERROR; #endif // LIBC_COPT_PRINTF_NO_NULLPTR_CHECKS - int written = writer->get_chars_written(); + size_t written = writer->get_chars_written(); switch (to_conv.length_modifier) { case LengthModifier::none: - *reinterpret_cast<int *>(to_conv.conv_val_ptr) = written; + *reinterpret_cast<int *>(to_conv.conv_val_ptr) = static_cast<int>(written); break; case LengthModifier::l: *reinterpret_cast<long *>(to_conv.conv_val_ptr) = written; diff --git a/libc/src/stdio/printf_core/writer.h b/libc/src/stdio/printf_core/writer.h index 1d4734a51b9b8..9de108ece510f 100644 --- a/libc/src/stdio/printf_core/writer.h +++ b/libc/src/stdio/printf_core/writer.h @@ -127,7 +127,7 @@ template <WriteMode write_mode> struct WriteBuffer { template <WriteMode write_mode> class Writer final { WriteBuffer<write_mode> &wb; - int chars_written = 0; + size_t chars_written = 0; LIBC_INLINE int pad(char new_char, size_t length) { // First, fill as much of the buffer as possible with the padding char. @@ -161,7 +161,7 @@ template <WriteMode write_mode> class Writer final { // Takes a string, copies it into the buffer if there is space, else passes it // to the overflow mechanism to be handled separately. LIBC_INLINE int write(cpp::string_view new_string) { - chars_written += static_cast<int>(new_string.size()); + chars_written += new_string.size(); if (LIBC_LIKELY(wb.buff_cur + new_string.size() <= wb.buff_len)) { inline_memcpy(wb.buff + wb.buff_cur, new_string.data(), new_string.size()); @@ -175,7 +175,7 @@ template <WriteMode write_mode> class Writer final { // if there is space, else calls pad which will loop and call the overflow // mechanism on a secondary buffer. 
LIBC_INLINE int write(char new_char, size_t length) { - chars_written += static_cast<int>(length); + chars_written += length; if (LIBC_LIKELY(wb.buff_cur + length <= wb.buff_len)) { inline_memset(wb.buff + wb.buff_cur, static_cast<unsigned char>(new_char), @@ -199,7 +199,7 @@ template <WriteMode write_mode> class Writer final { return wb.overflow_write(char_string_view); } - LIBC_INLINE int get_chars_written() { return chars_written; } + LIBC_INLINE size_t get_chars_written() { return chars_written; } }; // Class-template auto deduction helpers. diff --git a/libc/src/stdio/snprintf.cpp b/libc/src/stdio/snprintf.cpp index c8940862f711f..206c50d1b41a4 100644 --- a/libc/src/stdio/snprintf.cpp +++ b/libc/src/stdio/snprintf.cpp @@ -8,8 +8,12 @@ #include "src/stdio/snprintf.h" +#include "src/__support/CPP/limits.h" #include "src/__support/arg_list.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" +#include "src/stdio/printf_core/core_structs.h" +#include "src/stdio/printf_core/error_mapper.h" #include "src/stdio/printf_core/printf_main.h" #include "src/stdio/printf_core/writer.h" @@ -32,10 +36,21 @@ LLVM_LIBC_FUNCTION(int, snprintf, wb(buffer, (buffsz > 0 ? buffsz - 1 : 0)); printf_core::Writer writer(wb); - int ret_val = printf_core::printf_main(&writer, format, args); + auto ret_val = printf_core::printf_main(&writer, format, args); + if (!ret_val.has_value()) { + libc_errno = printf_core::internal_error_to_errno(ret_val.error()); + return -1; + } if (buffsz > 0) // if the buffsz is 0 the buffer may be a null pointer. wb.buff[wb.buff_cur] = '\0'; - return ret_val; + + if (ret_val.value() > cpp::numeric_limits<int>::max()) { + libc_errno = + printf_core::internal_error_to_errno(-printf_core::OVERFLOW_ERROR); + return -1; + } + + return static_cast<int>(ret_val.value()); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdio/sprintf.cpp b/libc/src/stdio/sprintf.cpp index 7be97d3591aaf..9e9ecdfdf38cc 100644 --- a/libc/src/stdio/sprintf.cpp +++ b/libc/src/stdio/sprintf.cpp @@ -10,7 +10,10 @@ #include "src/__support/CPP/limits.h" #include "src/__support/arg_list.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" +#include "src/stdio/printf_core/core_structs.h" +#include "src/stdio/printf_core/error_mapper.h" #include "src/stdio/printf_core/printf_main.h" #include "src/stdio/printf_core/writer.h" @@ -33,9 +36,20 @@ LLVM_LIBC_FUNCTION(int, sprintf, wb(buffer, cpp::numeric_limits<size_t>::max()); printf_core::Writer writer(wb); - int ret_val = printf_core::printf_main(&writer, format, args); + auto ret_val = printf_core::printf_main(&writer, format, args); + if (!ret_val.has_value()) { + libc_errno = printf_core::internal_error_to_errno(ret_val.error()); + return -1; + } wb.buff[wb.buff_cur] = '\0'; - return ret_val; + + if (ret_val.value() > cpp::numeric_limits<int>::max()) { + libc_errno = + printf_core::internal_error_to_errno(-printf_core::OVERFLOW_ERROR); + return -1; + } + + return static_cast<int>(ret_val.value()); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdio/vasprintf.cpp b/libc/src/stdio/vasprintf.cpp index 4a44d4a0f8842..4bc6a5992d5c8 100644 --- a/libc/src/stdio/vasprintf.cpp +++ b/libc/src/stdio/vasprintf.cpp @@ -7,7 +7,11 @@ //===----------------------------------------------------------------------===// #include "src/stdio/vasprintf.h" +#include "src/__support/CPP/limits.h" #include "src/__support/arg_list.h" +#include "src/__support/libc_errno.h" +#include "src/stdio/printf_core/core_structs.h" 
+#include "src/stdio/printf_core/error_mapper.h" #include "src/stdio/printf_core/vasprintf_internal.h" namespace LIBC_NAMESPACE_DECL { @@ -18,7 +22,17 @@ LLVM_LIBC_FUNCTION(int, vasprintf, internal::ArgList args(vlist); // This holder class allows for easier copying // and pointer semantics, as well as handling // destruction automatically. - return printf_core::vasprintf_internal(ret, format, args); + auto ret_val = printf_core::vasprintf_internal(ret, format, args); + if (!ret_val.has_value()) { + libc_errno = printf_core::internal_error_to_errno(ret_val.error()); + return -1; + } + if (ret_val.value() > cpp::numeric_limits<int>::max()) { + libc_errno = + printf_core::internal_error_to_errno(-printf_core::OVERFLOW_ERROR); + return -1; + } + return static_cast<int>(ret_val.value()); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdio/vsnprintf.cpp b/libc/src/stdio/vsnprintf.cpp index b07a2499a0dd3..ba3568fbe78b3 100644 --- a/libc/src/stdio/vsnprintf.cpp +++ b/libc/src/stdio/vsnprintf.cpp @@ -8,8 +8,12 @@ #include "src/stdio/vsnprintf.h" +#include "src/__support/CPP/limits.h" #include "src/__support/arg_list.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" +#include "src/stdio/printf_core/core_structs.h" +#include "src/stdio/printf_core/error_mapper.h" #include "src/stdio/printf_core/printf_main.h" #include "src/stdio/printf_core/writer.h" @@ -29,10 +33,21 @@ LLVM_LIBC_FUNCTION(int, vsnprintf, wb(buffer, (buffsz > 0 ? buffsz - 1 : 0)); printf_core::Writer writer(wb); - int ret_val = printf_core::printf_main(&writer, format, args); + auto ret_val = printf_core::printf_main(&writer, format, args); + if (!ret_val.has_value()) { + libc_errno = printf_core::internal_error_to_errno(ret_val.error()); + return -1; + } if (buffsz > 0) // if the buffsz is 0 the buffer may be a null pointer. 
wb.buff[wb.buff_cur] = '\0'; - return ret_val; + + if (ret_val.value() > cpp::numeric_limits<int>::max()) { + libc_errno = + printf_core::internal_error_to_errno(-printf_core::OVERFLOW_ERROR); + return -1; + } + + return static_cast<int>(ret_val.value()); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdio/vsprintf.cpp b/libc/src/stdio/vsprintf.cpp index 26d497be42125..65c223a442d4b 100644 --- a/libc/src/stdio/vsprintf.cpp +++ b/libc/src/stdio/vsprintf.cpp @@ -10,7 +10,10 @@ #include "src/__support/CPP/limits.h" #include "src/__support/arg_list.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" +#include "src/stdio/printf_core/core_structs.h" +#include "src/stdio/printf_core/error_mapper.h" #include "src/stdio/printf_core/printf_main.h" #include "src/stdio/printf_core/writer.h" @@ -30,9 +33,19 @@ LLVM_LIBC_FUNCTION(int, vsprintf, wb(buffer, cpp::numeric_limits<size_t>::max()); printf_core::Writer writer(wb); - int ret_val = printf_core::printf_main(&writer, format, args); + auto ret_val = printf_core::printf_main(&writer, format, args); + if (!ret_val.has_value()) { + libc_errno = printf_core::internal_error_to_errno(ret_val.error()); + return -1; + } wb.buff[wb.buff_cur] = '\0'; - return ret_val; + + if (ret_val.value() > cpp::numeric_limits<int>::max()) { + libc_errno = + printf_core::internal_error_to_errno(-printf_core::OVERFLOW_ERROR); + return -1; + } + return static_cast<int>(ret_val.value()); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdlib/CMakeLists.txt b/libc/src/stdlib/CMakeLists.txt index c464f82dcbda7..1ccdcc8bec148 100644 --- a/libc/src/stdlib/CMakeLists.txt +++ b/libc/src/stdlib/CMakeLists.txt @@ -73,6 +73,8 @@ add_entrypoint_object( strfromf.h DEPENDS .str_from_util + libc.src.__support.CPP.limits + libc.src.stdio.printf_core.error_mapper ) add_entrypoint_object( @@ -83,6 +85,8 @@ add_entrypoint_object( strfromd.h DEPENDS .str_from_util + libc.src.__support.CPP.limits + libc.src.stdio.printf_core.error_mapper ) add_entrypoint_object( @@ -93,6 +97,8 @@ add_entrypoint_object( strfroml.h DEPENDS .str_from_util + libc.src.__support.CPP.limits + libc.src.stdio.printf_core.error_mapper ) add_header_library( diff --git a/libc/src/stdlib/strfromd.cpp b/libc/src/stdlib/strfromd.cpp index f51e6d4c7f1df..f970e22010201 100644 --- a/libc/src/stdlib/strfromd.cpp +++ b/libc/src/stdlib/strfromd.cpp @@ -7,7 +7,10 @@ //===----------------------------------------------------------------------===// #include "src/stdlib/strfromd.h" +#include "src/__support/CPP/limits.h" #include "src/__support/macros/config.h" +#include "src/stdio/printf_core/core_structs.h" +#include "src/stdio/printf_core/error_mapper.h" #include "src/stdlib/str_from_util.h" namespace LIBC_NAMESPACE_DECL { @@ -36,7 +39,12 @@ LLVM_LIBC_FUNCTION(int, strfromd, if (n > 0) wb.buff[wb.buff_cur] = '\0'; - return writer.get_chars_written(); + if (writer.get_chars_written() > cpp::numeric_limits<int>::max()) { + libc_errno = + printf_core::internal_error_to_errno(-printf_core::OVERFLOW_ERROR); + return -1; + } + return static_cast<int>(writer.get_chars_written()); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdlib/strfromf.cpp b/libc/src/stdlib/strfromf.cpp index 14dbfdb25bab6..55ede003134b5 100644 --- a/libc/src/stdlib/strfromf.cpp +++ b/libc/src/stdlib/strfromf.cpp @@ -7,7 +7,10 @@ //===----------------------------------------------------------------------===// #include "src/stdlib/strfromf.h" +#include "src/__support/CPP/limits.h" #include 
"src/__support/macros/config.h" +#include "src/stdio/printf_core/core_structs.h" +#include "src/stdio/printf_core/error_mapper.h" #include "src/stdlib/str_from_util.h" namespace LIBC_NAMESPACE_DECL { @@ -36,7 +39,12 @@ LLVM_LIBC_FUNCTION(int, strfromf, if (n > 0) wb.buff[wb.buff_cur] = '\0'; - return writer.get_chars_written(); + if (writer.get_chars_written() > cpp::numeric_limits<int>::max()) { + libc_errno = + printf_core::internal_error_to_errno(-printf_core::OVERFLOW_ERROR); + return -1; + } + return static_cast<int>(writer.get_chars_written()); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdlib/strfroml.cpp b/libc/src/stdlib/strfroml.cpp index 12f22a8a2fb65..37d18738902bc 100644 --- a/libc/src/stdlib/strfroml.cpp +++ b/libc/src/stdlib/strfroml.cpp @@ -7,7 +7,10 @@ //===----------------------------------------------------------------------===// #include "src/stdlib/strfroml.h" +#include "src/__support/CPP/limits.h" #include "src/__support/macros/config.h" +#include "src/stdio/printf_core/core_structs.h" +#include "src/stdio/printf_core/error_mapper.h" #include "src/stdlib/str_from_util.h" namespace LIBC_NAMESPACE_DECL { @@ -41,7 +44,12 @@ LLVM_LIBC_FUNCTION(int, strfroml, if (n > 0) wb.buff[wb.buff_cur] = '\0'; - return writer.get_chars_written(); + if (writer.get_chars_written() > cpp::numeric_limits<int>::max()) { + libc_errno = + printf_core::internal_error_to_errno(-printf_core::OVERFLOW_ERROR); + return -1; + } + return static_cast<int>(writer.get_chars_written()); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/time/strftime_core/strftime_main.h b/libc/src/time/strftime_core/strftime_main.h index c7e590627094a..2b136d83234cd 100644 --- a/libc/src/time/strftime_core/strftime_main.h +++ b/libc/src/time/strftime_core/strftime_main.h @@ -36,7 +36,8 @@ int strftime_main(printf_core::Writer<write_mode> *writer, return result; } - return writer->get_chars_written(); + // TODO: Use ErrorOr<size_t> + return static_cast<int>(writer->get_chars_written()); } } // namespace strftime_core diff --git a/libc/test/src/stdio/CMakeLists.txt b/libc/test/src/stdio/CMakeLists.txt index eec108bc12ca5..d71f1dff11943 100644 --- a/libc/test/src/stdio/CMakeLists.txt +++ b/libc/test/src/stdio/CMakeLists.txt @@ -186,6 +186,8 @@ add_libc_test( fprintf_test.cpp DEPENDS libc.src.stdio.fprintf + libc.test.UnitTest.ErrnoCheckingTest + libc.test.UnitTest.ErrnoSetterMatcher ${fprintf_test_deps} COMPILE_OPTIONS ${use_system_file} diff --git a/libc/test/src/stdio/fprintf_test.cpp b/libc/test/src/stdio/fprintf_test.cpp index 6799323cc6ad9..7d36bd30854b8 100644 --- a/libc/test/src/stdio/fprintf_test.cpp +++ b/libc/test/src/stdio/fprintf_test.cpp @@ -15,6 +15,9 @@ #include "src/stdio/fprintf.h" +#include "src/__support/CPP/limits.h" +#include "test/UnitTest/ErrnoCheckingTest.h" +#include "test/UnitTest/ErrnoSetterMatcher.h" #include "test/UnitTest/Test.h" namespace printf_test { @@ -31,6 +34,8 @@ using ::fread; #endif // LIBC_COPT_STDIO_USE_SYSTEM_FILE } // namespace printf_test +using LlvmLibcFPrintfTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest; + TEST(LlvmLibcFPrintfTest, WriteToFile) { const char *FILENAME = APPEND_LIBC_TEST("fprintf_output.test"); auto FILE_PATH = libc_make_test_file_path(FILENAME); @@ -78,6 +83,24 @@ TEST(LlvmLibcFPrintfTest, WriteToFile) { written = LIBC_NAMESPACE::fprintf(file, "Writing to a read only file should fail."); EXPECT_LT(written, 0); + ASSERT_ERRNO_EQ(EBADF); + + ASSERT_EQ(printf_test::fclose(file), 0); +} + +#ifndef LIBC_COPT_PRINTF_NO_NULLPTR_CHECKS 
+TEST(LlvmLibcFPrintfTest, NullPtrCheck) { + const char *FILENAME = APPEND_LIBC_TEST("fprintf_nullptr.test"); + auto FILE_PATH = libc_make_test_file_path(FILENAME); + + ::FILE *file = printf_test::fopen(FILE_PATH, "w"); + ASSERT_FALSE(file == nullptr); + + int ret = + LIBC_NAMESPACE::fprintf(file, "hello %s", static_cast<int *>(nullptr)); + EXPECT_LT(ret, 0); + ASSERT_ERRNO_EQ(EINVAL); ASSERT_EQ(printf_test::fclose(file), 0); } +#endif // LIBC_COPT_PRINTF_NO_NULLPTR_CHECKS diff --git a/libc/test/src/stdio/printf_core/converter_test.cpp b/libc/test/src/stdio/printf_core/converter_test.cpp index bf088937e4104..2dae2a22c864c 100644 --- a/libc/test/src/stdio/printf_core/converter_test.cpp +++ b/libc/test/src/stdio/printf_core/converter_test.cpp @@ -38,7 +38,7 @@ TEST_F(LlvmLibcPrintfConverterTest, SimpleRawConversion) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ(str, "abc"); - ASSERT_EQ(writer.get_chars_written(), 3); + ASSERT_EQ(writer.get_chars_written(), size_t{3}); } TEST_F(LlvmLibcPrintfConverterTest, PercentConversion) { @@ -52,7 +52,7 @@ TEST_F(LlvmLibcPrintfConverterTest, PercentConversion) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ(str, "%"); - ASSERT_EQ(writer.get_chars_written(), 1); + ASSERT_EQ(writer.get_chars_written(), size_t{1}); } TEST_F(LlvmLibcPrintfConverterTest, CharConversionSimple) { @@ -70,7 +70,7 @@ TEST_F(LlvmLibcPrintfConverterTest, CharConversionSimple) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ(str, "D"); - ASSERT_EQ(writer.get_chars_written(), 1); + ASSERT_EQ(writer.get_chars_written(), size_t{1}); } TEST_F(LlvmLibcPrintfConverterTest, CharConversionRightJustified) { @@ -85,7 +85,7 @@ TEST_F(LlvmLibcPrintfConverterTest, CharConversionRightJustified) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ(str, " E"); - ASSERT_EQ(writer.get_chars_written(), 4); + ASSERT_EQ(writer.get_chars_written(), size_t{4}); } TEST_F(LlvmLibcPrintfConverterTest, CharConversionLeftJustified) { @@ -102,7 +102,7 @@ TEST_F(LlvmLibcPrintfConverterTest, CharConversionLeftJustified) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ(str, "F "); - ASSERT_EQ(writer.get_chars_written(), 4); + ASSERT_EQ(writer.get_chars_written(), size_t{4}); } TEST_F(LlvmLibcPrintfConverterTest, StringConversionSimple) { @@ -118,7 +118,7 @@ TEST_F(LlvmLibcPrintfConverterTest, StringConversionSimple) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ(str, "DEF"); - ASSERT_EQ(writer.get_chars_written(), 3); + ASSERT_EQ(writer.get_chars_written(), size_t{3}); } TEST_F(LlvmLibcPrintfConverterTest, StringConversionPrecisionHigh) { @@ -133,7 +133,7 @@ TEST_F(LlvmLibcPrintfConverterTest, StringConversionPrecisionHigh) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ(str, "456"); - ASSERT_EQ(writer.get_chars_written(), 3); + ASSERT_EQ(writer.get_chars_written(), size_t{3}); } TEST_F(LlvmLibcPrintfConverterTest, StringConversionPrecisionLow) { @@ -148,7 +148,7 @@ TEST_F(LlvmLibcPrintfConverterTest, StringConversionPrecisionLow) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ(str, "xy"); - ASSERT_EQ(writer.get_chars_written(), 2); + ASSERT_EQ(writer.get_chars_written(), size_t{2}); } TEST_F(LlvmLibcPrintfConverterTest, StringConversionRightJustified) { @@ -163,7 +163,7 @@ TEST_F(LlvmLibcPrintfConverterTest, StringConversionRightJustified) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ(str, " 789"); - ASSERT_EQ(writer.get_chars_written(), 4); + ASSERT_EQ(writer.get_chars_written(), size_t{4}); } TEST_F(LlvmLibcPrintfConverterTest, StringConversionLeftJustified) { @@ -180,7 +180,7 @@ TEST_F(LlvmLibcPrintfConverterTest, StringConversionLeftJustified) { 
wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ(str, "ghi "); - ASSERT_EQ(writer.get_chars_written(), 4); + ASSERT_EQ(writer.get_chars_written(), size_t{4}); } TEST_F(LlvmLibcPrintfConverterTest, IntConversionSimple) { @@ -194,7 +194,7 @@ TEST_F(LlvmLibcPrintfConverterTest, IntConversionSimple) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ(str, "12345"); - ASSERT_EQ(writer.get_chars_written(), 5); + ASSERT_EQ(writer.get_chars_written(), size_t{5}); } TEST_F(LlvmLibcPrintfConverterTest, HexConversion) { @@ -211,7 +211,7 @@ TEST_F(LlvmLibcPrintfConverterTest, HexConversion) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ(str, "0x00000000123456ab"); - ASSERT_EQ(writer.get_chars_written(), 18); + ASSERT_EQ(writer.get_chars_written(), size_t{18}); } TEST_F(LlvmLibcPrintfConverterTest, BinaryConversion) { @@ -225,7 +225,7 @@ TEST_F(LlvmLibcPrintfConverterTest, BinaryConversion) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ(str, "101010"); - ASSERT_EQ(writer.get_chars_written(), 6); + ASSERT_EQ(writer.get_chars_written(), size_t{6}); } TEST_F(LlvmLibcPrintfConverterTest, PointerConversion) { @@ -239,7 +239,7 @@ TEST_F(LlvmLibcPrintfConverterTest, PointerConversion) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ(str, "0x123456ab"); - ASSERT_EQ(writer.get_chars_written(), 10); + ASSERT_EQ(writer.get_chars_written(), size_t{10}); } TEST_F(LlvmLibcPrintfConverterTest, OctConversion) { @@ -253,5 +253,5 @@ TEST_F(LlvmLibcPrintfConverterTest, OctConversion) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ(str, "1234"); - ASSERT_EQ(writer.get_chars_written(), 4); + ASSERT_EQ(writer.get_chars_written(), size_t{4}); } diff --git a/libc/test/src/stdio/printf_core/writer_test.cpp b/libc/test/src/stdio/printf_core/writer_test.cpp index d036341be7981..d263cf55aa474 100644 --- a/libc/test/src/stdio/printf_core/writer_test.cpp +++ b/libc/test/src/stdio/printf_core/writer_test.cpp @@ -39,7 +39,7 @@ TEST(LlvmLibcPrintfWriterTest, Write) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ("abc", str); - ASSERT_EQ(writer.get_chars_written(), 3); + ASSERT_EQ(writer.get_chars_written(), size_t{3}); } TEST(LlvmLibcPrintfWriterTest, WriteMultipleTimes) { @@ -53,7 +53,7 @@ TEST(LlvmLibcPrintfWriterTest, WriteMultipleTimes) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ("abcDEF123", str); - ASSERT_EQ(writer.get_chars_written(), 9); + ASSERT_EQ(writer.get_chars_written(), size_t{9}); } TEST(LlvmLibcPrintfWriterTest, WriteChars) { @@ -66,7 +66,7 @@ TEST(LlvmLibcPrintfWriterTest, WriteChars) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ("aaa", str); - ASSERT_EQ(writer.get_chars_written(), 3); + ASSERT_EQ(writer.get_chars_written(), size_t{3}); } TEST(LlvmLibcPrintfWriterTest, WriteCharsMultipleTimes) { @@ -80,7 +80,7 @@ TEST(LlvmLibcPrintfWriterTest, WriteCharsMultipleTimes) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ("aaaDDD111", str); - ASSERT_EQ(writer.get_chars_written(), 9); + ASSERT_EQ(writer.get_chars_written(), size_t{9}); } TEST(LlvmLibcPrintfWriterTest, WriteManyChars) { @@ -102,7 +102,7 @@ TEST(LlvmLibcPrintfWriterTest, WriteManyChars) { "ZZZZZZZZZZ" "ZZZZZZZZZ", str); - ASSERT_EQ(writer.get_chars_written(), 99); + ASSERT_EQ(writer.get_chars_written(), size_t{99}); } TEST(LlvmLibcPrintfWriterTest, MixedWrites) { @@ -117,7 +117,7 @@ TEST(LlvmLibcPrintfWriterTest, MixedWrites) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ("aaaDEF111456", str); - ASSERT_EQ(writer.get_chars_written(), 12); + ASSERT_EQ(writer.get_chars_written(), size_t{12}); } TEST(LlvmLibcPrintfWriterTest, WriteWithMaxLength) { @@ -129,7 +129,7 @@ TEST(LlvmLibcPrintfWriterTest, 
WriteWithMaxLength) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ("abcDEF1234", str); - ASSERT_EQ(writer.get_chars_written(), 12); + ASSERT_EQ(writer.get_chars_written(), size_t{12}); } TEST(LlvmLibcPrintfWriterTest, WriteCharsWithMaxLength) { @@ -141,7 +141,7 @@ TEST(LlvmLibcPrintfWriterTest, WriteCharsWithMaxLength) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ("1111111111", str); - ASSERT_EQ(writer.get_chars_written(), 15); + ASSERT_EQ(writer.get_chars_written(), size_t{15}); } TEST(LlvmLibcPrintfWriterTest, MixedWriteWithMaxLength) { @@ -157,7 +157,7 @@ TEST(LlvmLibcPrintfWriterTest, MixedWriteWithMaxLength) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ("aaaDEF1114", str); - ASSERT_EQ(writer.get_chars_written(), 12); + ASSERT_EQ(writer.get_chars_written(), size_t{12}); } TEST(LlvmLibcPrintfWriterTest, StringWithMaxLengthOne) { @@ -175,7 +175,7 @@ TEST(LlvmLibcPrintfWriterTest, StringWithMaxLengthOne) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ("", str); - ASSERT_EQ(writer.get_chars_written(), 12); + ASSERT_EQ(writer.get_chars_written(), size_t{12}); } TEST(LlvmLibcPrintfWriterTest, NullStringWithZeroMaxLength) { @@ -187,7 +187,7 @@ TEST(LlvmLibcPrintfWriterTest, NullStringWithZeroMaxLength) { writer.write('1', 3); writer.write({"456", 3}); - ASSERT_EQ(writer.get_chars_written(), 12); + ASSERT_EQ(writer.get_chars_written(), size_t{12}); } struct OutBuff { @@ -226,7 +226,7 @@ TEST(LlvmLibcPrintfWriterTest, WriteWithMaxLengthWithCallback) { str[out_buff.cur_pos] = '\0'; ASSERT_STREQ("abcDEF123456", str); - ASSERT_EQ(writer.get_chars_written(), 12); + ASSERT_EQ(writer.get_chars_written(), size_t{12}); } TEST(LlvmLibcPrintfWriterTest, WriteCharsWithMaxLengthWithCallback) { @@ -246,7 +246,7 @@ TEST(LlvmLibcPrintfWriterTest, WriteCharsWithMaxLengthWithCallback) { str[out_buff.cur_pos] = '\0'; ASSERT_STREQ("111111111111111", str); - ASSERT_EQ(writer.get_chars_written(), 15); + ASSERT_EQ(writer.get_chars_written(), size_t{15}); } TEST(LlvmLibcPrintfWriterTest, MixedWriteWithMaxLengthWithCallback) { @@ -269,7 +269,7 @@ TEST(LlvmLibcPrintfWriterTest, MixedWriteWithMaxLengthWithCallback) { str[out_buff.cur_pos] = '\0'; ASSERT_STREQ("aaaDEF111456", str); - ASSERT_EQ(writer.get_chars_written(), 12); + ASSERT_EQ(writer.get_chars_written(), size_t{12}); } TEST(LlvmLibcPrintfWriterTest, ZeroLengthBufferWithCallback) { @@ -292,7 +292,7 @@ TEST(LlvmLibcPrintfWriterTest, ZeroLengthBufferWithCallback) { str[out_buff.cur_pos] = '\0'; ASSERT_STREQ("aaaDEF111456", str); - ASSERT_EQ(writer.get_chars_written(), 12); + ASSERT_EQ(writer.get_chars_written(), size_t{12}); } TEST(LlvmLibcPrintfWriterTest, NullStringWithZeroMaxLengthWithCallback) { @@ -312,7 +312,7 @@ TEST(LlvmLibcPrintfWriterTest, NullStringWithZeroMaxLengthWithCallback) { wb.overflow_write(""); str[out_buff.cur_pos] = '\0'; - ASSERT_EQ(writer.get_chars_written(), 12); + ASSERT_EQ(writer.get_chars_written(), size_t{12}); ASSERT_STREQ("aaaDEF111456", str); } diff --git a/libc/test/src/stdio/snprintf_test.cpp b/libc/test/src/stdio/snprintf_test.cpp index baaa664cdc9ee..1062f952d7429 100644 --- a/libc/test/src/stdio/snprintf_test.cpp +++ b/libc/test/src/stdio/snprintf_test.cpp @@ -8,8 +8,12 @@ #include "src/stdio/snprintf.h" +#include "test/UnitTest/ErrnoCheckingTest.h" +#include "test/UnitTest/ErrnoSetterMatcher.h" #include "test/UnitTest/Test.h" +using LlvmLibcSNPrintfTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest; + // The sprintf test cases cover testing the shared printf functionality, so // these tests will focus on snprintf exclusive features. 
@@ -59,3 +63,14 @@ TEST(LlvmLibcSNPrintfTest, NoCutOff) { EXPECT_EQ(written, 10); ASSERT_STREQ(buff, "1234567890"); } + +TEST(LlvmLibcSNPrintfTest, CharsWrittenOverflow) { + char buff[0]; + + // Trigger an overflow in the return value of snprintf by writing more than + // INT_MAX bytes. + int int_max = LIBC_NAMESPACE::cpp::numeric_limits<int>::max(); + int written = LIBC_NAMESPACE::snprintf(buff, 0, "%*stest", int_max, ""); + EXPECT_LT(written, 0); + ASSERT_ERRNO_EQ(EOVERFLOW); +} diff --git a/libc/test/src/stdio/vfprintf_test.cpp b/libc/test/src/stdio/vfprintf_test.cpp index f50565a0f68ca..9b5f09db8fd41 100644 --- a/libc/test/src/stdio/vfprintf_test.cpp +++ b/libc/test/src/stdio/vfprintf_test.cpp @@ -19,6 +19,8 @@ #include "src/stdio/vfprintf.h" +#include "test/UnitTest/ErrnoCheckingTest.h" +#include "test/UnitTest/ErrnoSetterMatcher.h" #include "test/UnitTest/Test.h" namespace printf_test { @@ -44,6 +46,8 @@ int call_vfprintf(::FILE *__restrict stream, const char *__restrict format, return ret; } +using LlvmLibcVFPrintfTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest; + TEST(LlvmLibcVFPrintfTest, WriteToFile) { const char *FILENAME = APPEND_LIBC_TEST("vfprintf_output.test"); auto FILE_PATH = libc_make_test_file_path(FILENAME); @@ -90,6 +94,7 @@ TEST(LlvmLibcVFPrintfTest, WriteToFile) { written = call_vfprintf(file, "Writing to a read only file should fail."); EXPECT_LT(written, 0); + ASSERT_ERRNO_EQ(EBADF); ASSERT_EQ(printf_test::fclose(file), 0); } diff --git a/libc/test/src/stdlib/StrfromTest.h b/libc/test/src/stdlib/StrfromTest.h index e82c94499aa11..fdeed0e3c06f5 100644 --- a/libc/test/src/stdlib/StrfromTest.h +++ b/libc/test/src/stdlib/StrfromTest.h @@ -8,6 +8,8 @@ #include "src/__support/CPP/type_traits.h" #include "src/__support/FPUtil/FPBits.h" +#include "test/UnitTest/ErrnoCheckingTest.h" +#include "test/UnitTest/ErrnoSetterMatcher.h" #include "test/UnitTest/Test.h" #define ASSERT_STREQ_LEN(actual_written, actual_str, expected_str) \ @@ -15,7 +17,7 @@ EXPECT_STREQ(actual_str, expected_str); template <typename InputT> -class StrfromTest : public LIBC_NAMESPACE::testing::Test { +class StrfromTest : public LIBC_NAMESPACE::testing::ErrnoCheckingTest { static constexpr bool is_single_prec = LIBC_NAMESPACE::cpp::is_same<InputT, float>::value; @@ -481,6 +483,16 @@ class StrfromTest : public LIBC_NAMESPACE::testing::Test { written = func(buff, 10, "%A", -ld_nan); ASSERT_STREQ_LEN(written, buff, "-NAN"); } + + void charsWrittenOverflow(FunctionT func) { + char buff[100]; + // Trigger an overflow in the return value of strfrom by writing more than + // INT_MAX bytes. 
+ int result = func(buff, sizeof(buff), "%.2147483647f", 1.0f); + + EXPECT_LT(result, 0); + ASSERT_ERRNO_EQ(EOVERFLOW); + } }; #define STRFROM_TEST(InputType, name, func) \ @@ -501,4 +513,7 @@ class StrfromTest : public LIBC_NAMESPACE::testing::Test { TEST_F(LlvmLibc##name##Test, InsufficientBufferSize) { \ insufficentBufsize(func); \ } \ - TEST_F(LlvmLibc##name##Test, InfAndNanValues) { infNanValues(func); } + TEST_F(LlvmLibc##name##Test, InfAndNanValues) { infNanValues(func); } \ + TEST_F(LlvmLibc##name##Test, CharsWrittenOverflow) { \ + charsWrittenOverflow(func); \ + } From ca69a8d2f403de3617970dcfa2f84756f7f336dd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20Sch=C3=A4pers?= <bjoern@hazardy.de> Date: Mon, 3 Nov 2025 20:19:34 +0100 Subject: [PATCH 058/313] [clang-format] Fix ColumnLimit violation while aligning (#165627) It did compute the length only on the first line, and thus the following lines could be (and in the test example were) moved over the column limit, when the = was aligned. --- clang/lib/Format/WhitespaceManager.cpp | 3 ++- clang/unittests/Format/FormatTest.cpp | 7 +++++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/clang/lib/Format/WhitespaceManager.cpp b/clang/lib/Format/WhitespaceManager.cpp index f24b8ab14bdce..406c77cb3ae8f 100644 --- a/clang/lib/Format/WhitespaceManager.cpp +++ b/clang/lib/Format/WhitespaceManager.cpp @@ -591,7 +591,8 @@ static unsigned AlignTokens(const FormatStyle &Style, F &&Matches, CurrentChangeWidthRight = CurrentChange.TokenLength; const FormatToken *MatchingParenToEncounter = nullptr; for (unsigned J = I + 1; - J != E && (Changes[J].NewlinesBefore == 0 || MatchingParenToEncounter); + J != E && (Changes[J].NewlinesBefore == 0 || + MatchingParenToEncounter || Changes[J].IsAligned); ++J) { const auto &Change = Changes[J]; const auto *Tok = Change.Tok; diff --git a/clang/unittests/Format/FormatTest.cpp b/clang/unittests/Format/FormatTest.cpp index ca9e7925e5e95..24235b966399d 100644 --- a/clang/unittests/Format/FormatTest.cpp +++ b/clang/unittests/Format/FormatTest.cpp @@ -20824,6 +20824,13 @@ TEST_F(FormatTest, AlignWithLineBreaks) { " argument1,\n" " argument2);", Style); + + Style.ColumnLimit = 45; + verifyFormat("auto xxxxxxxx = foo;\n" + "auto x = whatever ? some / long -\n" + " computition / stuff\n" + " : random;", + Style); } TEST_F(FormatTest, AlignWithInitializerPeriods) { From dd45c060ffe05eca991b8ed01a71d09edeebbc94 Mon Sep 17 00:00:00 2001 From: Amr Hesham <amr96@programmer.net> Date: Mon, 3 Nov 2025 20:54:02 +0100 Subject: [PATCH 059/313] [clang][NFC] Fix BuildExtVectorType parameter name (#166208) Fix the parameter name in the BuildExtVectorType function, also updating the code style to be consistent with BuildVectorType Discovered in #166055 --- clang/lib/Sema/SemaType.cpp | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/clang/lib/Sema/SemaType.cpp b/clang/lib/Sema/SemaType.cpp index 682fd258eccf2..c483930705057 100644 --- a/clang/lib/Sema/SemaType.cpp +++ b/clang/lib/Sema/SemaType.cpp @@ -2399,7 +2399,7 @@ QualType Sema::BuildVectorType(QualType CurType, Expr *SizeExpr, VectorKind::Generic); } -QualType Sema::BuildExtVectorType(QualType T, Expr *ArraySize, +QualType Sema::BuildExtVectorType(QualType T, Expr *SizeExpr, SourceLocation AttrLoc) { // Unlike gcc's vector_size attribute, we do not allow vectors to be defined // in conjunction with complex types (pointers, arrays, functions, etc.). 
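// Aside (not part of the patch, added for context): the expression being renamed in the next hunk is the size argument of Clang's ext_vector_type attribute, e.g.
//   typedef float float4 __attribute__((ext_vector_type(4)));
// The checks below reject a size that is not an integer constant (unless dependent), is negative, is zero, or needs more than 32 bits.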
@@ -2422,40 +2422,40 @@ QualType Sema::BuildExtVectorType(QualType T, Expr *ArraySize, BIT && CheckBitIntElementType(*this, AttrLoc, BIT)) return QualType(); - if (!ArraySize->isTypeDependent() && !ArraySize->isValueDependent()) { - std::optional<llvm::APSInt> vecSize = - ArraySize->getIntegerConstantExpr(Context); - if (!vecSize) { + if (!SizeExpr->isTypeDependent() && !SizeExpr->isValueDependent()) { + std::optional<llvm::APSInt> VecSize = + SizeExpr->getIntegerConstantExpr(Context); + if (!VecSize) { Diag(AttrLoc, diag::err_attribute_argument_type) - << "ext_vector_type" << AANT_ArgumentIntegerConstant - << ArraySize->getSourceRange(); + << "ext_vector_type" << AANT_ArgumentIntegerConstant + << SizeExpr->getSourceRange(); return QualType(); } - if (vecSize->isNegative()) { - Diag(ArraySize->getExprLoc(), diag::err_attribute_vec_negative_size); + if (VecSize->isNegative()) { + Diag(SizeExpr->getExprLoc(), diag::err_attribute_vec_negative_size); return QualType(); } - if (!vecSize->isIntN(32)) { + if (!VecSize->isIntN(32)) { Diag(AttrLoc, diag::err_attribute_size_too_large) - << ArraySize->getSourceRange() << "vector"; + << SizeExpr->getSourceRange() << "vector"; return QualType(); } // Unlike gcc's vector_size attribute, the size is specified as the // number of elements, not the number of bytes. - unsigned vectorSize = static_cast<unsigned>(vecSize->getZExtValue()); + unsigned VectorSize = static_cast<unsigned>(VecSize->getZExtValue()); - if (vectorSize == 0) { + if (VectorSize == 0) { Diag(AttrLoc, diag::err_attribute_zero_size) - << ArraySize->getSourceRange() << "vector"; + << SizeExpr->getSourceRange() << "vector"; return QualType(); } - return Context.getExtVectorType(T, vectorSize); + return Context.getExtVectorType(T, VectorSize); } - return Context.getDependentSizedExtVectorType(T, ArraySize, AttrLoc); + return Context.getDependentSizedExtVectorType(T, SizeExpr, AttrLoc); } QualType Sema::BuildMatrixType(QualType ElementTy, Expr *NumRows, Expr *NumCols, From a8ea7f4580b467183ce2075db6b1b2ec3beb6ebf Mon Sep 17 00:00:00 2001 From: Robert Imschweiler <robert.imschweiler@amd.com> Date: Mon, 3 Nov 2025 20:59:48 +0100 Subject: [PATCH 060/313] Reapply: [AMDGPU][UnifyDivergentExitNodes][StructurizeCFG] Add support for callbr instruction with inline-asm (#152161) (#166195) Reapply #152161 with fixed 'changed' flags. 
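For readers unfamiliar with callbr: it is the IR terminator Clang emits for GNU `asm goto`, so source like the sketch below is the kind of input that ultimately exercises these passes. The function, label name, and asm string here are illustrative assumptions only, not taken from this patch or its tests:

  /* Hypothetical example: 'asm goto' lowers to a callbr terminator whose
     fallthrough successor is the next statement and whose indirect
     successor is the 'err' block; %l1 refers to the first goto label. */
  int check_nonzero(int x) {
    asm goto("s_cmp_eq_u32 %0, 0; s_cbranch_scc1 %l1"
             : /* no outputs */ : "s"(x) : /* no clobbers */ : err);
    return 0; /* fallthrough successor */
  err:
    return 1; /* indirect successor */
  }

Both AMDGPUUnifyDivergentExitNodes and StructurizeCFG previously asserted that every block ends in a simple terminator; the changes below teach them to treat a callbr as an N-target branch instead.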
--- .../AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp | 90 +++--- llvm/lib/Transforms/Scalar/StructurizeCFG.cpp | 19 +- llvm/lib/Transforms/Utils/UnifyLoopExits.cpp | 6 + llvm/test/CodeGen/AMDGPU/callbr.ll | 54 ++++ ...nify-divergent-exit-nodes-with-musttail.ll | 51 ++++ llvm/test/CodeGen/AMDGPU/infinite-loop.ll | 257 ++++++++++++++++-- .../si-annotate-nested-control-flows.ll | 100 ++++++- .../si-unify-exit-multiple-unreachables.ll | 161 ++++++++++- llvm/test/CodeGen/AMDGPU/update-phi.ll | 39 +++ llvm/test/Transforms/StructurizeCFG/callbr.ll | 235 ++++++++++++++++ 10 files changed, 933 insertions(+), 79 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/callbr.ll create mode 100644 llvm/test/Transforms/StructurizeCFG/callbr.ll diff --git a/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp b/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp index ddf9a24eb5230..fe81a5efd9d51 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp @@ -181,14 +181,52 @@ BasicBlock *AMDGPUUnifyDivergentExitNodesImpl::unifyReturnBlockSet( return NewRetBlock; } +static BasicBlock * +createDummyReturnBlock(Function &F, + SmallVector<BasicBlock *, 4> &ReturningBlocks) { + BasicBlock *DummyReturnBB = + BasicBlock::Create(F.getContext(), "DummyReturnBlock", &F); + Type *RetTy = F.getReturnType(); + Value *RetVal = RetTy->isVoidTy() ? nullptr : PoisonValue::get(RetTy); + ReturnInst::Create(F.getContext(), RetVal, DummyReturnBB); + ReturningBlocks.push_back(DummyReturnBB); + return DummyReturnBB; +} + +/// Handle conditional branch instructions (-> 2 targets) and callbr +/// instructions with N targets. +static void handleNBranch(Function &F, BasicBlock *BB, Instruction *BI, + BasicBlock *DummyReturnBB, + std::vector<DominatorTree::UpdateType> &Updates) { + SmallVector<BasicBlock *, 2> Successors(successors(BB)); + + // Create a new transition block to hold the conditional branch. + BasicBlock *TransitionBB = BB->splitBasicBlock(BI, "TransitionBlock"); + + Updates.reserve(Updates.size() + 2 * Successors.size() + 2); + + // 'Successors' become successors of TransitionBB instead of BB, + // and TransitionBB becomes a single successor of BB. + Updates.emplace_back(DominatorTree::Insert, BB, TransitionBB); + for (BasicBlock *Successor : Successors) { + Updates.emplace_back(DominatorTree::Insert, TransitionBB, Successor); + Updates.emplace_back(DominatorTree::Delete, BB, Successor); + } + + // Create a branch that will always branch to the transition block and + // references DummyReturnBB. 
+ BB->getTerminator()->eraseFromParent(); + BranchInst::Create(TransitionBB, DummyReturnBB, + ConstantInt::getTrue(F.getContext()), BB); + Updates.emplace_back(DominatorTree::Insert, BB, DummyReturnBB); +} + bool AMDGPUUnifyDivergentExitNodesImpl::run(Function &F, DominatorTree *DT, const PostDominatorTree &PDT, const UniformityInfo &UA) { - assert(hasOnlySimpleTerminator(F) && "Unsupported block terminator."); - if (PDT.root_size() == 0 || (PDT.root_size() == 1 && - !isa<BranchInst>(PDT.getRoot()->getTerminator()))) + !isa<BranchInst, CallBrInst>(PDT.getRoot()->getTerminator()))) return false; // Loop over all of the blocks in a function, tracking all of the blocks that @@ -222,46 +260,28 @@ bool AMDGPUUnifyDivergentExitNodesImpl::run(Function &F, DominatorTree *DT, if (HasDivergentExitBlock) UnreachableBlocks.push_back(BB); } else if (BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator())) { - - ConstantInt *BoolTrue = ConstantInt::getTrue(F.getContext()); - if (DummyReturnBB == nullptr) { - DummyReturnBB = - BasicBlock::Create(F.getContext(), "DummyReturnBlock", &F); - Type *RetTy = F.getReturnType(); - Value *RetVal = RetTy->isVoidTy() ? nullptr : PoisonValue::get(RetTy); - ReturnInst::Create(F.getContext(), RetVal, DummyReturnBB); - ReturningBlocks.push_back(DummyReturnBB); - } + if (!DummyReturnBB) + DummyReturnBB = createDummyReturnBlock(F, ReturningBlocks); if (BI->isUnconditional()) { BasicBlock *LoopHeaderBB = BI->getSuccessor(0); BI->eraseFromParent(); // Delete the unconditional branch. // Add a new conditional branch with a dummy edge to the return block. - BranchInst::Create(LoopHeaderBB, DummyReturnBB, BoolTrue, BB); - Updates.emplace_back(DominatorTree::Insert, BB, DummyReturnBB); - } else { // Conditional branch. - SmallVector<BasicBlock *, 2> Successors(successors(BB)); - - // Create a new transition block to hold the conditional branch. - BasicBlock *TransitionBB = BB->splitBasicBlock(BI, "TransitionBlock"); - - Updates.reserve(Updates.size() + 2 * Successors.size() + 2); - - // 'Successors' become successors of TransitionBB instead of BB, - // and TransitionBB becomes a single successor of BB. - Updates.emplace_back(DominatorTree::Insert, BB, TransitionBB); - for (BasicBlock *Successor : Successors) { - Updates.emplace_back(DominatorTree::Insert, TransitionBB, Successor); - Updates.emplace_back(DominatorTree::Delete, BB, Successor); - } - - // Create a branch that will always branch to the transition block and - // references DummyReturnBB. 
- BB->getTerminator()->eraseFromParent(); - BranchInst::Create(TransitionBB, DummyReturnBB, BoolTrue, BB); + BranchInst::Create(TransitionBB, DummyReturnBB, + ConstantInt::getTrue(F.getContext()), BB); Updates.emplace_back(DominatorTree::Insert, BB, DummyReturnBB); + } else { + handleNBranch(F, BB, BI, DummyReturnBB, Updates); } Changed = true; + } else if (CallBrInst *CBI = dyn_cast<CallBrInst>(BB->getTerminator())) { + if (!DummyReturnBB) + DummyReturnBB = createDummyReturnBlock(F, ReturningBlocks); + + handleNBranch(F, BB, CBI, DummyReturnBB, Updates); + Changed = true; + } else { + llvm_unreachable("unsupported block terminator"); } } diff --git a/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp b/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp index 5f6f66a4bc213..0a8f5ea2fdae1 100644 --- a/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp +++ b/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp @@ -558,11 +558,10 @@ void StructurizeCFG::analyzeLoops(RegionNode *N) { } else { // Test for successors as back edge BasicBlock *BB = N->getNodeAs<BasicBlock>(); - BranchInst *Term = cast<BranchInst>(BB->getTerminator()); - - for (BasicBlock *Succ : Term->successors()) - if (Visited.count(Succ)) - Loops[Succ] = BB; + if (BranchInst *Term = dyn_cast<BranchInst>(BB->getTerminator())) + for (BasicBlock *Succ : Term->successors()) + if (Visited.count(Succ)) + Loops[Succ] = BB; } } @@ -594,7 +593,7 @@ void StructurizeCFG::gatherPredicates(RegionNode *N) { for (BasicBlock *P : predecessors(BB)) { // Ignore it if it's a branch from outside into our region entry - if (!ParentRegion->contains(P)) + if (!ParentRegion->contains(P) || !isa<BranchInst>(P->getTerminator())) continue; Region *R = RI->getRegionFor(P); @@ -1402,13 +1401,17 @@ bool StructurizeCFG::makeUniformRegion(Region *R, UniformityInfo &UA) { /// Run the transformation for each region found bool StructurizeCFG::run(Region *R, DominatorTree *DT, const TargetTransformInfo *TTI) { - if (R->isTopLevelRegion()) + // CallBr and its corresponding direct target blocks are for now ignored by + // this pass. This is not a limitation for the currently intended use cases + // of callbr in the AMDGPU backend. + // Parent and child regions are not affected by this (current) restriction. + // See `llvm/test/Transforms/StructurizeCFG/callbr.ll` for details. + if (R->isTopLevelRegion() || isa<CallBrInst>(R->getEntry()->getTerminator())) return false; this->DT = DT; this->TTI = TTI; Func = R->getEntry()->getParent(); - assert(hasOnlySimpleTerminator(*Func) && "Unsupported block terminator."); ParentRegion = R; diff --git a/llvm/lib/Transforms/Utils/UnifyLoopExits.cpp b/llvm/lib/Transforms/Utils/UnifyLoopExits.cpp index 94c5c1709f43e..e86ab13094b15 100644 --- a/llvm/lib/Transforms/Utils/UnifyLoopExits.cpp +++ b/llvm/lib/Transforms/Utils/UnifyLoopExits.cpp @@ -158,6 +158,7 @@ static bool unifyLoopExits(DominatorTree &DT, LoopInfo &LI, Loop *L) { SmallVector<BasicBlock *, 8> CallBrTargetBlocksToFix; // Redirect exiting edges through a control flow hub. ControlFlowHub CHub; + bool Changed = false; for (unsigned I = 0; I < ExitingBlocks.size(); ++I) { BasicBlock *BB = ExitingBlocks[I]; @@ -182,6 +183,10 @@ static bool unifyLoopExits(DominatorTree &DT, LoopInfo &LI, Loop *L) { bool UpdatedLI = false; BasicBlock *NewSucc = SplitCallBrEdge(BB, Succ, J, &DTU, nullptr, &LI, &UpdatedLI); + // SplitCallBrEdge modifies the CFG because it creates an intermediate + // block. So we need to set the changed flag no matter what the + // ControlFlowHub is going to do later. 
+ Changed = true; // Even if CallBr and Succ do not have a common parent loop, we need to // add the new target block to the parent loop of the current loop. if (!UpdatedLI) @@ -207,6 +212,7 @@ static bool unifyLoopExits(DominatorTree &DT, LoopInfo &LI, Loop *L) { bool ChangedCFG; std::tie(LoopExitBlock, ChangedCFG) = CHub.finalize( &DTU, GuardBlocks, "loop.exit", MaxBooleansInControlFlowHub.getValue()); + ChangedCFG |= Changed; if (!ChangedCFG) return false; diff --git a/llvm/test/CodeGen/AMDGPU/callbr.ll b/llvm/test/CodeGen/AMDGPU/callbr.ll new file mode 100644 index 0000000000000..253a6ec100eae --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/callbr.ll @@ -0,0 +1,54 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck %s + +define void @callbr_inline_asm(ptr %src, ptr %dst1, ptr %dst2, i32 %c) { +; CHECK-LABEL: callbr_inline_asm: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_load_dword v0, v[0:1] +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: v_cmp_gt_i32 vcc v6, 42; s_cbranch_vccnz .LBB0_2 +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: ; %bb.1: ; %fallthrough +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_dword v[2:3], v0 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_setpc_b64 s[30:31] +; CHECK-NEXT: .LBB0_2: ; Inline asm indirect target +; CHECK-NEXT: ; %indirect +; CHECK-NEXT: ; Label of block must be emitted +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_dword v[4:5], v0 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_setpc_b64 s[30:31] + %a = load i32, ptr %src, align 4 + callbr void asm "v_cmp_gt_i32 vcc $0, 42; s_cbranch_vccnz ${1:l}", "r,!i"(i32 %c) to label %fallthrough [label %indirect] +fallthrough: + store i32 %a, ptr %dst1, align 4 + br label %ret +indirect: + store i32 %a, ptr %dst2, align 4 + br label %ret +ret: + ret void +} + +define void @callbr_self_loop(i1 %c) { +; CHECK-LABEL: callbr_self_loop: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: .LBB1_1: ; %callbr +; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: s_branch .LBB1_1 +; CHECK-NEXT: .LBB1_2: ; Inline asm indirect target +; CHECK-NEXT: ; %callbr.target.ret +; CHECK-NEXT: ; Label of block must be emitted +; CHECK-NEXT: s_setpc_b64 s[30:31] + br label %callbr +callbr: + callbr void asm "", "!i"() to label %callbr [label %ret] +ret: + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/do-not-unify-divergent-exit-nodes-with-musttail.ll b/llvm/test/CodeGen/AMDGPU/do-not-unify-divergent-exit-nodes-with-musttail.ll index 007e3f0a6bdbc..076a99ff8588f 100644 --- a/llvm/test/CodeGen/AMDGPU/do-not-unify-divergent-exit-nodes-with-musttail.ll +++ b/llvm/test/CodeGen/AMDGPU/do-not-unify-divergent-exit-nodes-with-musttail.ll @@ -3,6 +3,7 @@ declare void @foo(ptr) declare i1 @bar(ptr) +declare i32 @bar32(ptr) define void @musttail_call_without_return_value(ptr %p) { ; CHECK-LABEL: define void @musttail_call_without_return_value( @@ -28,6 +29,31 @@ bb.1: ret void } +define void @musttail_call_without_return_value_callbr(ptr %p) { +; CHECK-LABEL: define void @musttail_call_without_return_value_callbr( +; CHECK-SAME: ptr [[P:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[LOAD:%.*]] = load i32, ptr [[P]], align 1 +; CHECK-NEXT: callbr void asm "", "r,!i"(i32 [[LOAD]]) +; CHECK-NEXT: to 
label %[[BB_0:.*]] [label %bb.1] +; CHECK: [[BB_0]]: +; CHECK-NEXT: musttail call void @foo(ptr [[P]]) +; CHECK-NEXT: ret void +; CHECK: [[BB_1:.*:]] +; CHECK-NEXT: ret void +; +entry: + %load = load i32, ptr %p, align 1 + callbr void asm "", "r,!i"(i32 %load) to label %bb.0 [label %bb.1] + +bb.0: + musttail call void @foo(ptr %p) + ret void + +bb.1: + ret void +} + define i1 @musttail_call_with_return_value(ptr %p) { ; CHECK-LABEL: define i1 @musttail_call_with_return_value( ; CHECK-SAME: ptr [[P:%.*]]) #[[ATTR0]] { @@ -51,3 +77,28 @@ bb.0: bb.1: ret i1 %load } + +define i32 @musttail_call_with_return_value_callbr(ptr %p) { +; CHECK-LABEL: define i32 @musttail_call_with_return_value_callbr( +; CHECK-SAME: ptr [[P:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[LOAD:%.*]] = load i32, ptr [[P]], align 1 +; CHECK-NEXT: callbr void asm "", "r,!i"(i32 [[LOAD]]) +; CHECK-NEXT: to label %[[BB_0:.*]] [label %bb.1] +; CHECK: [[BB_0]]: +; CHECK-NEXT: [[RET:%.*]] = musttail call i32 @bar32(ptr [[P]]) +; CHECK-NEXT: ret i32 [[RET]] +; CHECK: [[BB_1:.*:]] +; CHECK-NEXT: ret i32 [[LOAD]] +; +entry: + %load = load i32, ptr %p, align 1 + callbr void asm "", "r,!i"(i32 %load) to label %bb.0 [label %bb.1] + +bb.0: + %ret = musttail call i32 @bar32(ptr %p) + ret i32 %ret + +bb.1: + ret i32 %load +} diff --git a/llvm/test/CodeGen/AMDGPU/infinite-loop.ll b/llvm/test/CodeGen/AMDGPU/infinite-loop.ll index 3e2e43faca5aa..df635925b87df 100644 --- a/llvm/test/CodeGen/AMDGPU/infinite-loop.ll +++ b/llvm/test/CodeGen/AMDGPU/infinite-loop.ll @@ -36,26 +36,60 @@ loop: br label %loop } +define amdgpu_kernel void @infinite_loop_callbr(ptr addrspace(1) %out) { +; SI-LABEL: infinite_loop_callbr: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: ;;#ASMSTART +; SI-NEXT: ;;#ASMEND +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, 0x3e7 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_endpgm +; IR-LABEL: @infinite_loop_callbr( +; IR-NEXT: entry: +; IR-NEXT: callbr void asm "", ""() +; IR-NEXT: to label [[LOOP:%.*]] [] +; IR: loop: +; IR-NEXT: store volatile i32 999, ptr addrspace(1) [[OUT:%.*]], align 4 +; IR-NEXT: br i1 true, label [[TRANSITIONBLOCK:%.*]], label [[DUMMYRETURNBLOCK:%.*]] +; IR: TransitionBlock: +; IR-NEXT: callbr void asm "", ""() +; IR-NEXT: to label [[LOOP]] [] +; IR: DummyReturnBlock: +; IR-NEXT: ret void +; +entry: + callbr void asm "", ""() to label %loop [] + +loop: + store volatile i32 999, ptr addrspace(1) %out, align 4 + callbr void asm "", ""() to label %loop [] +} + define amdgpu_kernel void @infinite_loop_ret(ptr addrspace(1) %out) { ; SI-LABEL: infinite_loop_ret: ; SI: ; %bb.0: ; %entry ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; SI-NEXT: s_and_saveexec_b64 s[0:1], vcc -; SI-NEXT: s_cbranch_execz .LBB1_3 +; SI-NEXT: s_cbranch_execz .LBB2_3 ; SI-NEXT: ; %bb.1: ; %loop.preheader ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x3e7 ; SI-NEXT: s_and_b64 vcc, exec, -1 -; SI-NEXT: .LBB1_2: ; %loop +; SI-NEXT: .LBB2_2: ; %loop ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_mov_b64 vcc, vcc -; SI-NEXT: s_cbranch_vccnz .LBB1_2 -; SI-NEXT: .LBB1_3: ; %UnifiedReturnBlock +; SI-NEXT: s_cbranch_vccnz .LBB2_2 +; SI-NEXT: .LBB2_3: ; 
%UnifiedReturnBlock ; SI-NEXT: s_endpgm ; IR-LABEL: @infinite_loop_ret( ; IR-NEXT: entry: @@ -81,44 +115,93 @@ return: ret void } +define amdgpu_kernel void @infinite_loop_ret_callbr(ptr addrspace(1) %out) { +; SI-LABEL: infinite_loop_ret_callbr: +; SI: ; %bb.0: ; %entry +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SI-NEXT: ;;#ASMSTART +; SI-NEXT: ;;#ASMEND +; SI-NEXT: ; %bb.1: ; %loop.preheader +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, 0x3e7 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: .LBB3_2: ; Inline asm indirect target +; SI-NEXT: ; %UnifiedReturnBlock +; SI-NEXT: ; Label of block must be emitted +; SI-NEXT: s_endpgm +; IR-LABEL: @infinite_loop_ret_callbr( +; IR-NEXT: entry: +; IR-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() +; IR-NEXT: [[COND:%.*]] = icmp eq i32 [[TMP]], 1 +; IR-NEXT: [[COND32:%.*]] = zext i1 [[COND]] to i32 +; IR-NEXT: callbr void asm "", "r,!i"(i32 [[COND32]]) +; IR-NEXT: to label [[LOOP:%.*]] [label %UnifiedReturnBlock] +; IR: loop: +; IR-NEXT: store volatile i32 999, ptr addrspace(1) [[OUT:%.*]], align 4 +; IR-NEXT: br i1 true, label [[TRANSITIONBLOCK:%.*]], label [[UNIFIEDRETURNBLOCK:%.*]] +; IR: TransitionBlock: +; IR-NEXT: callbr void asm "", ""() +; IR-NEXT: to label [[LOOP]] [] +; IR: UnifiedReturnBlock: +; IR-NEXT: ret void +; +entry: + %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() + %cond = icmp eq i32 %tmp, 1 + %cond32 = zext i1 %cond to i32 + callbr void asm "", "r,!i"(i32 %cond32) to label %loop [label %return] + +loop: + store volatile i32 999, ptr addrspace(1) %out, align 4 + callbr void asm "", ""() to label %loop [] + +return: + ret void +} + define amdgpu_kernel void @infinite_loops(ptr addrspace(1) %out) { ; SI-LABEL: infinite_loops: ; SI: ; %bb.0: ; %entry ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b64 s[2:3], -1 -; SI-NEXT: s_cbranch_scc1 .LBB2_4 +; SI-NEXT: s_cbranch_scc1 .LBB4_4 ; SI-NEXT: ; %bb.1: ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x378 ; SI-NEXT: s_and_b64 vcc, exec, -1 -; SI-NEXT: .LBB2_2: ; %loop2 +; SI-NEXT: .LBB4_2: ; %loop2 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_mov_b64 vcc, vcc -; SI-NEXT: s_cbranch_vccnz .LBB2_2 +; SI-NEXT: s_cbranch_vccnz .LBB4_2 ; SI-NEXT: ; %bb.3: ; %Flow ; SI-NEXT: s_mov_b64 s[2:3], 0 -; SI-NEXT: .LBB2_4: ; %Flow2 +; SI-NEXT: .LBB4_4: ; %Flow2 ; SI-NEXT: s_and_b64 vcc, exec, s[2:3] ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b64 vcc, vcc -; SI-NEXT: s_cbranch_vccz .LBB2_7 +; SI-NEXT: s_cbranch_vccz .LBB4_7 ; SI-NEXT: ; %bb.5: ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, 0x3e7 ; SI-NEXT: s_and_b64 vcc, exec, 0 -; SI-NEXT: .LBB2_6: ; %loop1 +; SI-NEXT: .LBB4_6: ; %loop1 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_mov_b64 vcc, vcc -; SI-NEXT: s_cbranch_vccz .LBB2_6 -; SI-NEXT: .LBB2_7: ; %DummyReturnBlock +; SI-NEXT: s_cbranch_vccz .LBB4_6 +; SI-NEXT: .LBB4_7: ; %DummyReturnBlock ; SI-NEXT: s_endpgm ; IR-LABEL: @infinite_loops( ; IR-NEXT: entry: @@ -144,24 +227,78 @@ loop2: br label %loop2 } +define 
amdgpu_kernel void @infinite_loops_callbr(ptr addrspace(1) %out) { +; SI-LABEL: infinite_loops_callbr: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: ;;#ASMSTART +; SI-NEXT: ;;#ASMEND +; SI-NEXT: ; %bb.1: ; %loop1 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, 0x3e7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_endpgm +; SI-NEXT: .LBB5_2: ; Inline asm indirect target +; SI-NEXT: ; %loop2.preheader +; SI-NEXT: ; Label of block must be emitted +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, 0x378 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_endpgm +; IR-LABEL: @infinite_loops_callbr( +; IR-NEXT: entry: +; IR-NEXT: callbr void asm "", "r,!i"(i32 poison) +; IR-NEXT: to label [[LOOP1:%.*]] [label %loop2] +; IR: loop1: +; IR-NEXT: store volatile i32 999, ptr addrspace(1) [[OUT:%.*]], align 4 +; IR-NEXT: br i1 true, label [[TRANSITIONBLOCK:%.*]], label [[DUMMYRETURNBLOCK:%.*]] +; IR: TransitionBlock: +; IR-NEXT: callbr void asm "", ""() +; IR-NEXT: to label [[LOOP1]] [] +; IR: loop2: +; IR-NEXT: store volatile i32 888, ptr addrspace(1) [[OUT]], align 4 +; IR-NEXT: br i1 true, label [[TRANSITIONBLOCK1:%.*]], label [[DUMMYRETURNBLOCK]] +; IR: TransitionBlock1: +; IR-NEXT: callbr void asm "", ""() +; IR-NEXT: to label [[LOOP2:%.*]] [] +; IR: DummyReturnBlock: +; IR-NEXT: ret void +; +entry: + callbr void asm "", "r,!i"(i32 poison) to label %loop1 [label %loop2] + +loop1: + store volatile i32 999, ptr addrspace(1) %out, align 4 + callbr void asm "", ""() to label %loop1 [] + +loop2: + store volatile i32 888, ptr addrspace(1) %out, align 4 + callbr void asm "", ""() to label %loop2 [] +} + define amdgpu_kernel void @infinite_loop_nest_ret(ptr addrspace(1) %out) { ; SI-LABEL: infinite_loop_nest_ret: ; SI: ; %bb.0: ; %entry ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 1, v0 ; SI-NEXT: s_and_saveexec_b64 s[0:1], vcc -; SI-NEXT: s_cbranch_execz .LBB3_5 +; SI-NEXT: s_cbranch_execz .LBB6_5 ; SI-NEXT: ; %bb.1: ; %outer_loop.preheader ; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 ; SI-NEXT: v_cmp_ne_u32_e64 s[0:1], 3, v0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x3e7 -; SI-NEXT: .LBB3_2: ; %outer_loop +; SI-NEXT: .LBB6_2: ; %outer_loop ; SI-NEXT: ; =>This Loop Header: Depth=1 -; SI-NEXT: ; Child Loop BB3_3 Depth 2 +; SI-NEXT: ; Child Loop BB6_3 Depth 2 ; SI-NEXT: s_mov_b64 s[2:3], 0 -; SI-NEXT: .LBB3_3: ; %inner_loop -; SI-NEXT: ; Parent Loop BB3_2 Depth=1 +; SI-NEXT: .LBB6_3: ; %inner_loop +; SI-NEXT: ; Parent Loop BB6_2 Depth=1 ; SI-NEXT: ; => This Inner Loop Header: Depth=2 ; SI-NEXT: s_and_b64 s[8:9], exec, s[0:1] ; SI-NEXT: s_or_b64 s[2:3], s[8:9], s[2:3] @@ -169,13 +306,13 @@ define amdgpu_kernel void @infinite_loop_nest_ret(ptr addrspace(1) %out) { ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_andn2_b64 exec, exec, s[2:3] -; SI-NEXT: s_cbranch_execnz .LBB3_3 +; SI-NEXT: s_cbranch_execnz .LBB6_3 ; SI-NEXT: ; %bb.4: ; %loop.exit.guard -; SI-NEXT: ; in Loop: Header=BB3_2 Depth=1 +; SI-NEXT: ; in Loop: Header=BB6_2 Depth=1 ; SI-NEXT: s_or_b64 exec, exec, s[2:3] ; SI-NEXT: s_mov_b64 vcc, 0 -; SI-NEXT: s_branch .LBB3_2 -; SI-NEXT: .LBB3_5: ; %UnifiedReturnBlock +; SI-NEXT: s_branch .LBB6_2 +; SI-NEXT: .LBB6_5: ; %UnifiedReturnBlock ; SI-NEXT: s_endpgm ; IR-LABEL: 
@infinite_loop_nest_ret( ; IR-NEXT: entry: @@ -212,4 +349,82 @@ return: ret void } +define amdgpu_kernel void @infinite_loop_nest_ret_callbr(ptr addrspace(1) %out) { +; SI-LABEL: infinite_loop_nest_ret_callbr: +; SI: ; %bb.0: ; %entry +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 1, v0 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SI-NEXT: ;;#ASMSTART +; SI-NEXT: ;;#ASMEND +; SI-NEXT: ; %bb.1: ; %outer_loop.preheader +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: v_mov_b32_e32 v0, 0x3e7 +; SI-NEXT: s_and_b64 s[0:1], exec, 0 +; SI-NEXT: s_branch .LBB7_3 +; SI-NEXT: .LBB7_2: ; %loop.exit.guard +; SI-NEXT: ; in Loop: Header=BB7_3 Depth=1 +; SI-NEXT: s_and_b64 vcc, exec, s[2:3] +; SI-NEXT: s_cbranch_vccnz .LBB7_5 +; SI-NEXT: .LBB7_3: ; %outer_loop +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: ;;#ASMSTART +; SI-NEXT: ;;#ASMEND +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b64 s[2:3], -1 +; SI-NEXT: s_mov_b64 vcc, s[0:1] +; SI-NEXT: s_cbranch_vccz .LBB7_2 +; SI-NEXT: ; %bb.4: ; %TransitionBlock.target.outer_loop +; SI-NEXT: ; in Loop: Header=BB7_3 Depth=1 +; SI-NEXT: s_mov_b64 s[2:3], 0 +; SI-NEXT: s_branch .LBB7_2 +; SI-NEXT: .LBB7_5: ; Inline asm indirect target +; SI-NEXT: ; %UnifiedReturnBlock +; SI-NEXT: ; Label of block must be emitted +; SI-NEXT: s_endpgm +; IR-LABEL: @infinite_loop_nest_ret_callbr( +; IR-NEXT: entry: +; IR-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() +; IR-NEXT: [[COND1:%.*]] = icmp ne i32 [[TMP]], 1 +; IR-NEXT: [[COND1_32:%.*]] = zext i1 [[COND1]] to i32 +; IR-NEXT: callbr void asm "", "r,!i"(i32 [[COND1_32]]) +; IR-NEXT: to label [[OUTER_LOOP:%.*]] [label %UnifiedReturnBlock] +; IR: outer_loop: +; IR-NEXT: callbr void asm "", ""() +; IR-NEXT: to label [[INNER_LOOP:%.*]] [] +; IR: inner_loop: +; IR-NEXT: store volatile i32 999, ptr addrspace(1) [[OUT:%.*]], align 4 +; IR-NEXT: [[COND3:%.*]] = icmp eq i32 [[TMP]], 3 +; IR-NEXT: [[COND3_32:%.*]] = zext i1 [[COND3]] to i32 +; IR-NEXT: br i1 true, label [[TRANSITIONBLOCK:%.*]], label [[UNIFIEDRETURNBLOCK:%.*]] +; IR: TransitionBlock: +; IR-NEXT: callbr void asm "", "r,!i"(i32 [[COND3_32]]) +; IR-NEXT: to label [[INNER_LOOP]] [label %outer_loop] +; IR: UnifiedReturnBlock: +; IR-NEXT: ret void +; +entry: + %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() + %cond1 = icmp ne i32 %tmp, 1 ; avoid following BB optimizing away through the domination + %cond1_32 = zext i1 %cond1 to i32 + callbr void asm "", "r,!i"(i32 %cond1_32) to label %outer_loop [label %return] + +outer_loop: + ; %cond2 = icmp eq i32 %tmp, 2 + ; br i1 %cond2, label %outer_loop, label %inner_loop + callbr void asm "", ""() to label %inner_loop [] + +inner_loop: ; preds = %LeafBlock, %LeafBlock1 + store volatile i32 999, ptr addrspace(1) %out, align 4 + %cond3 = icmp eq i32 %tmp, 3 + %cond3_32 = zext i1 %cond3 to i32 + callbr void asm "", "r,!i"(i32 %cond3_32) to label %inner_loop [label %outer_loop] + +return: + ret void +} + declare i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/si-annotate-nested-control-flows.ll b/llvm/test/CodeGen/AMDGPU/si-annotate-nested-control-flows.ll index 34de1e48bfb59..01bcdad3fc220 100644 --- a/llvm/test/CodeGen/AMDGPU/si-annotate-nested-control-flows.ll +++ b/llvm/test/CodeGen/AMDGPU/si-annotate-nested-control-flows.ll @@ -3,15 +3,16 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa %s -o - | FileCheck %s --check-prefix=ISA define void 
@nested_inf_loop(i1 %0, i1 %1) { -; OPT-LABEL: @nested_inf_loop( -; OPT-NEXT: BB: -; OPT-NEXT: br label [[BB1:%.*]] -; OPT: BB1: -; OPT-NEXT: [[BRMERGE:%.*]] = select i1 [[TMP0:%.*]], i1 true, i1 [[TMP1:%.*]] -; OPT-NEXT: br i1 [[BRMERGE]], label [[BB1]], label [[INFLOOP:%.*]] -; OPT: infloop: -; OPT-NEXT: br i1 true, label [[INFLOOP]], label [[DUMMYRETURNBLOCK:%.*]] -; OPT: DummyReturnBlock: +; OPT-LABEL: define void @nested_inf_loop( +; OPT-SAME: i1 [[TMP0:%.*]], i1 [[TMP1:%.*]]) { +; OPT-NEXT: [[BB:.*:]] +; OPT-NEXT: br label %[[BB1:.*]] +; OPT: [[BB1]]: +; OPT-NEXT: [[BRMERGE:%.*]] = select i1 [[TMP0]], i1 true, i1 [[TMP1]] +; OPT-NEXT: br i1 [[BRMERGE]], label %[[BB1]], label %[[INFLOOP:.*]] +; OPT: [[INFLOOP]]: +; OPT-NEXT: br i1 true, label %[[INFLOOP]], label %[[DUMMYRETURNBLOCK:.*]] +; OPT: [[DUMMYRETURNBLOCK]]: ; OPT-NEXT: ret void ; ; ISA-LABEL: nested_inf_loop: @@ -63,3 +64,84 @@ BB4: BB3: br label %BB1 } + +define void @nested_inf_loop_callbr(i32 %0, i32 %1) { +; OPT-LABEL: define void @nested_inf_loop_callbr( +; OPT-SAME: i32 [[TMP0:%.*]], i32 [[TMP1:%.*]]) { +; OPT-NEXT: [[BB:.*:]] +; OPT-NEXT: callbr void asm "", ""() +; OPT-NEXT: to label %[[BB1:.*]] [] +; OPT: [[BB1]]: +; OPT-NEXT: callbr void asm "", "r,!i"(i32 [[TMP0]]) +; OPT-NEXT: to label %[[BB3:.*]] [label %BB2] +; OPT: [[BB2:.*:]] +; OPT-NEXT: callbr void asm "", ""() +; OPT-NEXT: to label %[[BB4:.*]] [] +; OPT: [[BB4]]: +; OPT-NEXT: br i1 true, label %[[TRANSITIONBLOCK:.*]], label %[[DUMMYRETURNBLOCK:.*]] +; OPT: [[TRANSITIONBLOCK]]: +; OPT-NEXT: callbr void asm "", "r,!i"(i32 [[TMP1]]) +; OPT-NEXT: to label %[[BB3]] [label %BB4] +; OPT: [[BB3]]: +; OPT-NEXT: callbr void asm "", ""() +; OPT-NEXT: to label %[[BB1]] [] +; OPT: [[DUMMYRETURNBLOCK]]: +; OPT-NEXT: ret void +; +; ISA-LABEL: nested_inf_loop_callbr: +; ISA: ; %bb.0: ; %BB +; ISA-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; ISA-NEXT: ;;#ASMSTART +; ISA-NEXT: ;;#ASMEND +; ISA-NEXT: ; implicit-def: $sgpr6_sgpr7 +; ISA-NEXT: ; implicit-def: $sgpr4_sgpr5 +; ISA-NEXT: .LBB1_1: ; %BB1 +; ISA-NEXT: ; =>This Inner Loop Header: Depth=1 +; ISA-NEXT: ;;#ASMSTART +; ISA-NEXT: ;;#ASMEND +; ISA-NEXT: s_andn2_b64 s[6:7], s[6:7], exec +; ISA-NEXT: s_and_b64 s[8:9], s[4:5], exec +; ISA-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] +; ISA-NEXT: .LBB1_2: ; %BB3 +; ISA-NEXT: ; in Loop: Header=BB1_1 Depth=1 +; ISA-NEXT: ;;#ASMSTART +; ISA-NEXT: ;;#ASMEND +; ISA-NEXT: s_andn2_b64 s[4:5], s[4:5], exec +; ISA-NEXT: s_and_b64 s[8:9], s[6:7], exec +; ISA-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] +; ISA-NEXT: s_branch .LBB1_1 +; ISA-NEXT: .LBB1_3: ; Inline asm indirect target +; ISA-NEXT: ; %BB2 +; ISA-NEXT: ; in Loop: Header=BB1_1 Depth=1 +; ISA-NEXT: ; Label of block must be emitted +; ISA-NEXT: ;;#ASMSTART +; ISA-NEXT: ;;#ASMEND +; ISA-NEXT: s_mov_b64 s[6:7], -1 +; ISA-NEXT: s_and_saveexec_b64 s[8:9], s[4:5] +; ISA-NEXT: s_cbranch_execz .LBB1_5 +; ISA-NEXT: ; %bb.4: ; %TransitionBlock.target.BB3 +; ISA-NEXT: ; in Loop: Header=BB1_1 Depth=1 +; ISA-NEXT: s_xor_b64 s[6:7], exec, -1 +; ISA-NEXT: .LBB1_5: ; %loop.exit.guard +; ISA-NEXT: ; in Loop: Header=BB1_1 Depth=1 +; ISA-NEXT: s_or_b64 exec, exec, s[8:9] +; ISA-NEXT: s_and_b64 vcc, exec, s[6:7] +; ISA-NEXT: s_mov_b64 s[6:7], 0 +; ISA-NEXT: s_cbranch_vccz .LBB1_2 +; ISA-NEXT: ; %bb.6: ; %DummyReturnBlock +; ISA-NEXT: s_setpc_b64 s[30:31] +BB: + callbr void asm "", ""() to label %BB1 [] + +BB1: + callbr void asm "", "r,!i"(i32 %0) to label %BB3 [label %BB2] + +BB2: + callbr void asm "", ""() to label %BB4 [] + +BB4: + callbr void asm "", 
"r,!i"(i32 %1) to label %BB3 [label %BB4] + +BB3: + callbr void asm "", ""() to label %BB1 [] +} diff --git a/llvm/test/CodeGen/AMDGPU/si-unify-exit-multiple-unreachables.ll b/llvm/test/CodeGen/AMDGPU/si-unify-exit-multiple-unreachables.ll index 4cbe682cf9f9f..004c27971131d 100644 --- a/llvm/test/CodeGen/AMDGPU/si-unify-exit-multiple-unreachables.ll +++ b/llvm/test/CodeGen/AMDGPU/si-unify-exit-multiple-unreachables.ll @@ -1,5 +1,5 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -stop-after=amdgpu-unify-divergent-exit-nodes | FileCheck %s --check-prefix=UNIFY +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -stop-after=amdgpu-unify-divergent-exit-nodes | FileCheck %s --check-prefix=UNIFY ; RUN: llc < %s -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 | FileCheck %s declare void @llvm.trap() @@ -70,8 +70,33 @@ define amdgpu_kernel void @kernel(i32 %a, ptr addrspace(1) %x, i32 noundef %n) { ; CHECK-NEXT: s_mov_b64 s[2:3], -1 ; CHECK-NEXT: s_trap 2 ; CHECK-NEXT: s_branch .LBB0_4 - - +; UNIFY-LABEL: @kernel( +; UNIFY-NEXT: entry: +; UNIFY-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() +; UNIFY-NEXT: [[CMP:%.*]] = icmp eq i32 [[N:%.*]], 256 +; UNIFY-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] +; UNIFY: if.then: +; UNIFY-NEXT: [[CMP1:%.*]] = icmp eq i32 [[A:%.*]], 0 +; UNIFY-NEXT: br i1 [[CMP1]], label [[IF_END6_SINK_SPLIT:%.*]], label [[COND_FALSE:%.*]] +; UNIFY: cond.false: +; UNIFY-NEXT: call void @llvm.trap() +; UNIFY-NEXT: unreachable +; UNIFY: if.else: +; UNIFY-NEXT: [[CMP2:%.*]] = icmp ult i32 [[TID]], 10 +; UNIFY-NEXT: br i1 [[CMP2]], label [[IF_THEN3:%.*]], label [[IF_END6:%.*]] +; UNIFY: if.then3: +; UNIFY-NEXT: [[CMP1_I7:%.*]] = icmp eq i32 [[A]], 0 +; UNIFY-NEXT: br i1 [[CMP1_I7]], label [[IF_END6_SINK_SPLIT]], label [[COND_FALSE_I8:%.*]] +; UNIFY: cond.false.i8: +; UNIFY-NEXT: call void @llvm.trap() +; UNIFY-NEXT: unreachable +; UNIFY: if.end6.sink.split: +; UNIFY-NEXT: [[X1:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[X:%.*]], i32 [[TID]] +; UNIFY-NEXT: store i32 [[A]], ptr addrspace(1) [[X1]], align 4 +; UNIFY-NEXT: br label [[IF_END6]] +; UNIFY: if.end6: +; UNIFY-NEXT: ret void +; entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %cmp = icmp eq i32 %n, 256 @@ -105,5 +130,129 @@ if.end6.sink.split: if.end6: ret void } -;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: -; UNIFY: {{.*}} + +define amdgpu_kernel void @kernel_callbr(i32 %a, ptr addrspace(1) %x, i32 noundef %n) { +; CHECK-LABEL: kernel_callbr: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: s_load_dword s1, s[8:9], 0x10 +; CHECK-NEXT: s_load_dword s0, s[8:9], 0x0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: s_cmpk_eq_i32 s1, 0x100 +; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0 +; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[2:3] +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: ; %bb.1: ; %if.then +; CHECK-NEXT: s_cmp_eq_u32 s0, 0 +; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0 +; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[2:3] +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: .LBB1_2: ; %if.end6.sink.split +; CHECK-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x8 +; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; CHECK-NEXT: v_mov_b32_e32 v1, s0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: global_store_dword v0, v1, s[2:3] +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: .LBB1_3: ; Inline asm indirect target +; CHECK-NEXT: ; %UnifiedReturnBlock +; CHECK-NEXT: ; Label of block must be emitted +; CHECK-NEXT: s_endpgm +; CHECK-NEXT: .LBB1_4: ; Inline asm indirect target +; CHECK-NEXT: ; %if.else +; CHECK-NEXT: ; Label of block must be emitted +; CHECK-NEXT: v_cmp_gt_u32_e32 vcc, 10, v0 +; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: ; %bb.5: ; %if.then3 +; CHECK-NEXT: s_cmp_eq_u32 s0, 0 +; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0 +; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[2:3] +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: s_branch .LBB1_2 +; CHECK-NEXT: .LBB1_6: ; Inline asm indirect target +; CHECK-NEXT: ; %cond.false.i8 +; CHECK-NEXT: ; Label of block must be emitted +; CHECK-NEXT: .LBB1_7: ; Inline asm indirect target +; CHECK-NEXT: ; %cond.false +; CHECK-NEXT: ; Label of block must be emitted +; CHECK-NEXT: s_trap 2 +; CHECK-NEXT: ; divergent unreachable +; CHECK-NEXT: s_branch .LBB1_3 +; UNIFY-LABEL: @kernel_callbr( +; UNIFY-NEXT: entry: +; UNIFY-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() +; UNIFY-NEXT: [[CMP:%.*]] = icmp eq i32 [[N:%.*]], 256 +; UNIFY-NEXT: [[CMP32:%.*]] = zext i1 [[CMP]] to i32 +; UNIFY-NEXT: callbr void asm "", "r,!i"(i32 [[CMP32]]) +; UNIFY-NEXT: to label [[IF_THEN:%.*]] [label %if.else] +; UNIFY: if.then: +; UNIFY-NEXT: [[CMP1:%.*]] = icmp eq i32 [[A:%.*]], 0 +; UNIFY-NEXT: [[CMP1_32:%.*]] = zext i1 [[CMP1]] to i32 +; UNIFY-NEXT: callbr void asm "", "r,!i"(i32 [[CMP1_32]]) +; UNIFY-NEXT: to label [[IF_END6_SINK_SPLIT:%.*]] [label %cond.false] +; UNIFY: cond.false: +; UNIFY-NEXT: call void @llvm.trap() +; UNIFY-NEXT: unreachable +; UNIFY: if.else: +; UNIFY-NEXT: [[CMP2:%.*]] = icmp ult i32 [[TID]], 10 +; UNIFY-NEXT: [[CMP2_32:%.*]] = zext i1 [[CMP2]] to i32 +; UNIFY-NEXT: callbr void asm "", "r,!i"(i32 [[CMP2_32]]) +; UNIFY-NEXT: to label [[IF_THEN3:%.*]] [label %if.end6] +; UNIFY: if.then3: +; UNIFY-NEXT: [[CMP1_I7:%.*]] = icmp eq i32 [[A]], 0 +; UNIFY-NEXT: [[CMP1_I7_32:%.*]] = zext i1 [[CMP1_I7]] to i32 +; UNIFY-NEXT: callbr void asm "", "r,!i"(i32 [[CMP1_I7_32]]) +; UNIFY-NEXT: to label [[IF_END6_SINK_SPLIT]] [label %cond.false.i8] +; UNIFY: cond.false.i8: +; UNIFY-NEXT: call void @llvm.trap() +; UNIFY-NEXT: unreachable +; UNIFY: if.end6.sink.split: +; UNIFY-NEXT: [[X1:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[X:%.*]], i32 [[TID]] +; UNIFY-NEXT: store i32 [[A]], ptr addrspace(1) 
[[X1]], align 4 +; UNIFY-NEXT: callbr void asm "", ""() +; UNIFY-NEXT: to label [[IF_END6:%.*]] [] +; UNIFY: if.end6: +; UNIFY-NEXT: ret void +; +entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %cmp = icmp eq i32 %n, 256 + %cmp32 = zext i1 %cmp to i32 + callbr void asm "", "r,!i"(i32 %cmp32) to label %if.then [label %if.else] + +if.then: + %cmp1 = icmp eq i32 %a, 0 + %cmp1_32 = zext i1 %cmp1 to i32 + callbr void asm "", "r,!i"(i32 %cmp1_32) to label %if.end6.sink.split [label %cond.false] + +cond.false: + call void @llvm.trap() + unreachable + +if.else: + %cmp2 = icmp ult i32 %tid, 10 + %cmp2_32 = zext i1 %cmp2 to i32 + callbr void asm "", "r,!i"(i32 %cmp2_32) to label %if.then3 [label %if.end6] + +if.then3: + %cmp1.i7 = icmp eq i32 %a, 0 + %cmp1.i7_32 = zext i1 %cmp1.i7 to i32 + callbr void asm "", "r,!i"(i32 %cmp1.i7_32) to label %if.end6.sink.split [label %cond.false.i8] + +cond.false.i8: + call void @llvm.trap() + unreachable + +if.end6.sink.split: + %x1 = getelementptr inbounds i32, ptr addrspace(1) %x, i32 %tid + store i32 %a, ptr addrspace(1) %x1, align 4 + callbr void asm "", ""() to label %if.end6 [] + +if.end6: + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/update-phi.ll b/llvm/test/CodeGen/AMDGPU/update-phi.ll index 50666bee325e8..684dc1a1f0092 100644 --- a/llvm/test/CodeGen/AMDGPU/update-phi.ll +++ b/llvm/test/CodeGen/AMDGPU/update-phi.ll @@ -37,3 +37,42 @@ n28: ; preds = %.loopexit, %n28 n31: ; preds = ret void } + +define amdgpu_ps void @_amdgpu_ps_main_callbr() local_unnamed_addr #3 { +; IR-LABEL: @_amdgpu_ps_main_callbr( +; IR-NEXT: .entry: +; IR-NEXT: callbr void asm "", ""() +; IR-NEXT: to label [[DOTLOOPEXIT:%.*]] [] +; IR: .loopexit: +; IR-NEXT: callbr void asm "", ""() +; IR-NEXT: to label [[N28:%.*]] [] +; IR: n28: +; IR-NEXT: [[DOT01:%.*]] = phi float [ 0.000000e+00, [[DOTLOOPEXIT]] ], [ [[N29:%.*]], [[TRANSITIONBLOCK:%.*]] ] +; IR-NEXT: [[N29]] = fadd float [[DOT01]], 1.000000e+00 +; IR-NEXT: [[N30:%.*]] = fcmp ogt float [[N29]], 4.000000e+00 +; IR-NEXT: [[N30_32:%.*]] = zext i1 [[N30]] to i32 +; IR-NEXT: br i1 true, label [[TRANSITIONBLOCK]], label [[DUMMYRETURNBLOCK:%.*]] +; IR: TransitionBlock: +; IR-NEXT: callbr void asm "", "r,!i"(i32 [[N30_32]]) +; IR-NEXT: to label [[DOTLOOPEXIT]] [label %n28] +; IR: n31: +; IR-NEXT: ret void +; IR: DummyReturnBlock: +; IR-NEXT: ret void +; +.entry: + callbr void asm "", ""() to label %.loopexit [] + +.loopexit: ; preds = %n28, %.entry + callbr void asm "", ""() to label %n28 [] + +n28: ; preds = %.loopexit, %n28 + %.01 = phi float [ 0.000000e+00, %.loopexit ], [ %n29, %n28 ] + %n29 = fadd float %.01, 1.0 + %n30 = fcmp ogt float %n29, 4.000000e+00 + %n30.32 = zext i1 %n30 to i32 + callbr void asm "", "r,!i"(i32 %n30.32) to label %.loopexit [label %n28] + +n31: ; preds = + ret void +} diff --git a/llvm/test/Transforms/StructurizeCFG/callbr.ll b/llvm/test/Transforms/StructurizeCFG/callbr.ll new file mode 100644 index 0000000000000..42f95194980d4 --- /dev/null +++ b/llvm/test/Transforms/StructurizeCFG/callbr.ll @@ -0,0 +1,235 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S -passes=structurizecfg %s -o - | FileCheck %s + +; Structurize as usual, but don't tear callbr and its destination blocks apart. +; +; Note: currently, callbr blocks and their corresponding target blocks +; themselves are not handled by the structurizer.* If the CFG turns out to be +; unstructured at the end, the CFG lowering (si-annotate-control-flow) will +; detect this. 
For the currently intended use cases of callbr in the context of +; the AMDGPU backend, this is not a limitation (cf. +; https://discourse.llvm.org/t/rfc-add-callbr-intrinsic-support/86087). +; +; Note 2: while callbr and its targets remain untouched, everything else is +; handled as usual, even if it is nested in a callbr region. +; +; *FIXME: this will be fixed in the future. Callbr can be handled as follows: +; Input IR: +; ``` +; define void @foo_callbr() { +; callbr void asm "", "!i"() to label %fallthrough [label %indirect, ...] +; fallthrough: +; br label %exit +; indirect: +; br label %exit +; ... +; exit: +; ret void +; } +; ``` +; +; Output IR: +; ``` +; define void @foo_callbr() { +; callbr void asm "", "!i"() +; to label %fallthrough [label %fake.indirect, label %fake.indirect1, label %fake.indirect2, ...] +; fake.indirect: ; preds = %0 +; br label %Flow +; fake.indirect1: ; preds = %0 +; br label %Flow +; fake.indirect2: ; preds = %0 +; br label %Flow +; ... +; Flow: ; preds = %fallthrough, %fake.indirect[0-N] +; %1 = phi i1 [ false, %fallthrough ], [ true, %fake.indirect ], [ false, %fake.indirect[1-N] ] +; br i1 %1, label %indirect, label %Flow1 +; Flow1: ; preds = %Flow, %indirect +; %2 = phi i1 [ false, %Flow], [ true, %fake.indirect1 ], [ false, %indirect ] +; br i1 %2, label %indirect1, label %Flow2 +; Flow2: ; preds = %Flow, %indirect1 +; %2 = phi i1 [ false, %Flow], [ true, %fake.indirect2 ], [ false, %indirect1 ] +; br i1 %2, label %indirect2, label %Flow3 +; ... +; fallthrough: ; preds = %0 +; br label %Flow +; indirect: ; preds = %Flow +; br label %Flow1 +; indirect1: ; preds = %Flow1 +; br label %Flow2 +; indirect2: : preds = %Flow2 +; br label %Flow3 +; ... +; exit: ; preds = %indirectN, %FlowN +; ret void +; } +; ``` +; +; Output IR as ASCII-art: +; %0 +; --------------------- +; | | | | +; v v v v +; f f.i f.i1 f.i2 +; | | | | +; v v v v +; --------------------- +; %Flow +; | \ +; | %indirect +; | / +; %Flow1 +; | \ +; | %indirect1 +; | / +; %Flow2 +; | \ +; | %indirect2 +; | / +; %exit +; + +; Only callbr, nothing to do. 
+define void @callbr_simple() { +; CHECK-LABEL: define void @callbr_simple() { +; CHECK-NEXT: [[CALLBR:.*:]] +; CHECK-NEXT: callbr void asm "", "!i"() +; CHECK-NEXT: to label %[[INDIRECT:.*]] [label %indirect] +; CHECK: [[INDIRECT]]: +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[INDIRECT1:.*:]] +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +callbr: + callbr void asm "", "!i"() to label %fallthrough [label %indirect] +fallthrough: + br label %exit +indirect: + br label %exit +exit: + ret void +} + +; Callbr nested in non-callbr: non-callbr is transformed +define void @callbr_in_non_callbr(i1 %c) { +; CHECK-LABEL: define void @callbr_in_non_callbr( +; CHECK-SAME: i1 [[C:%.*]]) { +; CHECK-NEXT: [[C_INV:%.*]] = xor i1 [[C]], true +; CHECK-NEXT: br i1 [[C_INV]], label %[[NOCALLBR:.*]], label %[[FLOW:.*]] +; CHECK: [[FLOW]]: +; CHECK-NEXT: [[TMP1:%.*]] = phi i1 [ false, %[[NOCALLBR]] ], [ true, [[TMP0:%.*]] ] +; CHECK-NEXT: br i1 [[TMP1]], label %[[CALLBR:.*]], label %[[EXIT:.*]] +; CHECK: [[CALLBR]]: +; CHECK-NEXT: callbr void asm "", "!i"() +; CHECK-NEXT: to label %[[INDIRECT:.*]] [label %indirect] +; CHECK: [[INDIRECT]]: +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[INDIRECT1:.*:]] +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[NOCALLBR]]: +; CHECK-NEXT: br label %[[FLOW]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; + br i1 %c, label %callbr, label %nocallbr +callbr: + callbr void asm "", "!i"() to label %fallthrough [label %indirect] +fallthrough: + br label %exit +indirect: + br label %exit +nocallbr: + br label %exit +exit: + ret void +} + +; Callbr parent of non-callbr: non-callbr is transformed +define void @non_callbr_in_callbr(i1 %c) { +; CHECK-LABEL: define void @non_callbr_in_callbr( +; CHECK-SAME: i1 [[C:%.*]]) { +; CHECK-NEXT: [[C_INV:%.*]] = xor i1 [[C]], true +; CHECK-NEXT: callbr void asm "", "!i"() +; CHECK-NEXT: to label %[[INDIRECT:.*]] [label %indirect] +; CHECK: [[INDIRECT]]: +; CHECK-NEXT: br i1 [[C_INV]], label %[[FALLTHROUGH2:.*]], label %[[FLOW:.*]] +; CHECK: [[FLOW]]: +; CHECK-NEXT: [[TMP1:%.*]] = phi i1 [ false, %[[FALLTHROUGH2]] ], [ true, %[[INDIRECT]] ] +; CHECK-NEXT: br i1 [[TMP1]], label %[[FALLTHROUGH1:.*]], label %[[FLOW1:.*]] +; CHECK: [[FALLTHROUGH1]]: +; CHECK-NEXT: br label %[[FLOW1]] +; CHECK: [[FALLTHROUGH2]]: +; CHECK-NEXT: br label %[[FLOW]] +; CHECK: [[INDIRECT1:.*:]] +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[FLOW1]]: +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; + callbr void asm "", "!i"() to label %fallthrough [label %indirect] +fallthrough: + br i1 %c, label %fallthrough1, label %fallthrough2 +fallthrough1: + br label %exit +fallthrough2: + br label %exit +indirect: + br label %exit +exit: + ret void +} + +; Callbr surrounded by non-callbr: all three regular branches are handled +; correctly +define void @callbr_nested_in_non_callbr(i1 %c, i1 %d, i1 %e, i1 %f) { +; CHECK-LABEL: define void @callbr_nested_in_non_callbr( +; CHECK-SAME: i1 [[C:%.*]], i1 [[D:%.*]], i1 [[E:%.*]], i1 [[F:%.*]]) { +; CHECK-NEXT: [[C_INV:%.*]] = xor i1 [[C]], true +; CHECK-NEXT: br i1 [[C_INV]], label %[[NOCALLBR:.*]], label %[[FLOW3:.*]] +; CHECK: [[FLOW3]]: +; CHECK-NEXT: [[TMP1:%.*]] = phi i1 [ false, %[[FLOW:.*]] ], [ true, [[TMP0:%.*]] ] +; CHECK-NEXT: br i1 [[TMP1]], label %[[CALLBR:.*]], label %[[RET:.*]] +; CHECK: [[CALLBR]]: +; CHECK-NEXT: callbr void asm "", "!i"() +; CHECK-NEXT: to label %[[INDIRECT:.*]] [label %indirect] +; CHECK: [[INDIRECT]]: +; CHECK-NEXT: br i1 [[D]], label 
%[[FALLTHROUGH1:.*]], label %[[FLOW2:.*]] +; CHECK: [[FALLTHROUGH1]]: +; CHECK-NEXT: br label %[[FLOW2]] +; CHECK: [[INDIRECT2:.*:]] +; CHECK-NEXT: br i1 [[E]], label %[[INDIRECT1:.*]], label %[[FLOW1:.*]] +; CHECK: [[INDIRECT1]]: +; CHECK-NEXT: br label %[[FLOW1]] +; CHECK: [[NOCALLBR]]: +; CHECK-NEXT: br i1 [[F]], label %[[NOCALLBR1:.*]], label %[[FLOW]] +; CHECK: [[NOCALLBR1]]: +; CHECK-NEXT: br label %[[FLOW]] +; CHECK: [[FLOW]]: +; CHECK-NEXT: br label %[[FLOW3]] +; CHECK: [[FLOW1]]: +; CHECK-NEXT: br label %[[RET]] +; CHECK: [[FLOW2]]: +; CHECK-NEXT: br label %[[RET]] +; CHECK: [[RET]]: +; CHECK-NEXT: ret void +; + br i1 %c, label %callbr, label %nocallbr +callbr: + callbr void asm "", "!i"() to label %fallthrough [label %indirect] +fallthrough: + br i1 %d, label %fallthrough1, label %ret +fallthrough1: + br label %ret +indirect: + br i1 %e, label %indirect1, label %ret +indirect1: + br label %ret +nocallbr: + br i1 %f, label %nocallbr1, label %ret +nocallbr1: + br label %ret +ret: + ret void +} From 15bbdd143cb196be8d60ea4dc813bf7bc5e4650c Mon Sep 17 00:00:00 2001 From: Charles Zablit <c_zablit@apple.com> Date: Mon, 3 Nov 2025 20:00:16 +0000 Subject: [PATCH 061/313] [lldb][windows] print an error if python.dll is not in the DLL search path (#164893) This is a follow up to https://github.com/llvm/llvm-project/pull/162509. Using the `SearchPathW` API, we can ensure that the correct version of Python is installed before `liblldb` is loaded (and `python.dll` subsequently). If it's not, we try to add it to the search path with the methods introduced in https://github.com/llvm/llvm-project/pull/162509. If that fails or if that method is `#ifdef`'d out, we print an error which will appear before lldb crashes due to the missing dll. Before https://github.com/llvm/llvm-project/pull/162509, when invoked from Powershell, lldb would silently crash (no error message/crash report). After https://github.com/llvm/llvm-project/pull/162509, it crashes without any indications that the root cause is the missing python.dll. With this patch, we print the error before crashing. --- lldb/CMakeLists.txt | 6 +++ lldb/tools/driver/CMakeLists.txt | 3 ++ lldb/tools/driver/Driver.cpp | 68 ++++++++++++++++++++++++++------ 3 files changed, 65 insertions(+), 12 deletions(-) diff --git a/lldb/CMakeLists.txt b/lldb/CMakeLists.txt index e3b72e94d4beb..01b5546fee00d 100644 --- a/lldb/CMakeLists.txt +++ b/lldb/CMakeLists.txt @@ -87,6 +87,12 @@ if (LLDB_ENABLE_PYTHON) set(LLDB_PYTHON_EXT_SUFFIX "_d${LLDB_PYTHON_EXT_SUFFIX}") endif() endif() + if(TARGET Python3::Python) + get_target_property(_Python3_LIB_PATH Python3::Python IMPORTED_LIBRARY_LOCATION) + if(_Python3_LIB_PATH) + get_filename_component(LLDB_PYTHON_RUNTIME_LIBRARY_FILENAME "${_Python3_LIB_PATH}" NAME) + endif() + endif() endif () if (LLDB_ENABLE_LUA) diff --git a/lldb/tools/driver/CMakeLists.txt b/lldb/tools/driver/CMakeLists.txt index 67956af7fe3fb..efe51506f3545 100644 --- a/lldb/tools/driver/CMakeLists.txt +++ b/lldb/tools/driver/CMakeLists.txt @@ -37,6 +37,9 @@ add_dependencies(lldb if(DEFINED LLDB_PYTHON_DLL_RELATIVE_PATH) target_compile_definitions(lldb PRIVATE LLDB_PYTHON_DLL_RELATIVE_PATH="${LLDB_PYTHON_DLL_RELATIVE_PATH}") endif() +if(DEFINED LLDB_PYTHON_RUNTIME_LIBRARY_FILENAME) + target_compile_definitions(lldb PRIVATE LLDB_PYTHON_RUNTIME_LIBRARY_FILENAME="${LLDB_PYTHON_RUNTIME_LIBRARY_FILENAME}") +endif() if(LLDB_BUILD_FRAMEWORK) # In the build-tree, we know the exact path to the framework directory. 
diff --git a/lldb/tools/driver/Driver.cpp b/lldb/tools/driver/Driver.cpp
index 733331f4ddac0..bebf1a70d50e9 100644
--- a/lldb/tools/driver/Driver.cpp
+++ b/lldb/tools/driver/Driver.cpp
@@ -433,7 +433,8 @@ SBError Driver::ProcessArgs(const opt::InputArgList &args, bool &exiting) {
   return error;
 }
 
-#if defined(_WIN32) && defined(LLDB_PYTHON_DLL_RELATIVE_PATH)
+#ifdef _WIN32
+#ifdef LLDB_PYTHON_DLL_RELATIVE_PATH
 /// Returns the full path to the lldb.exe executable.
 inline std::wstring GetPathToExecutableW() {
   // Iterate until we reach the Windows API maximum path length (32,767).
@@ -447,30 +448,73 @@ inline std::wstring GetPathToExecutableW() {
   return L"";
 }
 
-/// Resolve the full path of the directory defined by
+/// \brief Resolve the full path of the directory defined by
 /// LLDB_PYTHON_DLL_RELATIVE_PATH. If it exists, add it to the list of DLL
 /// search directories.
-void AddPythonDLLToSearchPath() {
+/// \return `true` if the library was added to the search path.
+/// `false` otherwise.
+bool AddPythonDLLToSearchPath() {
   std::wstring modulePath = GetPathToExecutableW();
-  if (modulePath.empty()) {
-    llvm::errs() << "error: unable to find python.dll." << '\n';
-    return;
-  }
+  if (modulePath.empty())
+    return false;
   SmallVector<char, MAX_PATH> utf8Path;
   if (sys::windows::UTF16ToUTF8(modulePath.c_str(), modulePath.length(),
                                 utf8Path))
-    return;
+    return false;
   sys::path::remove_filename(utf8Path);
   sys::path::append(utf8Path, LLDB_PYTHON_DLL_RELATIVE_PATH);
   sys::fs::make_absolute(utf8Path);
   SmallVector<wchar_t, 1> widePath;
   if (sys::windows::widenPath(utf8Path.data(), widePath))
-    return;
+    return false;
   if (sys::fs::exists(utf8Path))
-    SetDllDirectoryW(widePath.data());
+    return SetDllDirectoryW(widePath.data());
+  return false;
+}
+#endif
+
+#ifdef LLDB_PYTHON_RUNTIME_LIBRARY_FILENAME
+/// Returns whether `python3xx.dll` is in the DLL search path.
+bool IsPythonDLLInPath() {
+#define WIDEN2(x) L##x
+#define WIDEN(x) WIDEN2(x)
+  WCHAR foundPath[MAX_PATH];
+  DWORD result =
+      SearchPathW(nullptr, WIDEN(LLDB_PYTHON_RUNTIME_LIBRARY_FILENAME), nullptr,
+                  MAX_PATH, foundPath, nullptr);
+#undef WIDEN2
+#undef WIDEN
+
+  return result > 0;
+}
+#endif
+
+/// Try to set up the DLL search path for the Python Runtime Library
+/// (python3xx.dll).
+///
+/// If `LLDB_PYTHON_RUNTIME_LIBRARY_FILENAME` is set, we first check if
+/// python3xx.dll is in the search path. If it's not, we try to add it and
+/// check for it a second time.
+/// If only `LLDB_PYTHON_DLL_RELATIVE_PATH` is set, we try to add python3xx.dll
+/// to the search path, whether or not it is already there.
+void SetupPythonRuntimeLibrary() {
+#ifdef LLDB_PYTHON_RUNTIME_LIBRARY_FILENAME
+  if (IsPythonDLLInPath())
+    return;
+#ifdef LLDB_PYTHON_DLL_RELATIVE_PATH
+  if (AddPythonDLLToSearchPath() && IsPythonDLLInPath())
+    return;
+#endif
+  llvm::errs() << "error: unable to find '"
+               << LLDB_PYTHON_RUNTIME_LIBRARY_FILENAME << "'.\n";
+  return;
+#elif defined(LLDB_PYTHON_DLL_RELATIVE_PATH)
+  if (!AddPythonDLLToSearchPath())
+    llvm::errs() << "error: unable to find the Python runtime library.\n";
+#endif
 }
 #endif
 
@@ -776,8 +820,8 @@ int main(int argc, char const *argv[]) {
                 "~/Library/Logs/DiagnosticReports/.\n");
 #endif
 
-#if defined(_WIN32) && defined(LLDB_PYTHON_DLL_RELATIVE_PATH)
-  AddPythonDLLToSearchPath();
+#ifdef _WIN32
+  SetupPythonRuntimeLibrary();
 #endif
 
   // Parse arguments.
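
The probe-then-extend flow in the patch above reduces to two Win32 calls: `SearchPathW` asks whether a DLL is resolvable through the current DLL search path, and `SetDllDirectoryW` extends that path. Below is a minimal standalone sketch of the same pattern, not taken from the patch; the DLL name `python311.dll` and the directory `C:\Python311` are placeholder assumptions for illustration only.

```
// Hedged sketch of the probe-then-extend pattern described above.
// "python311.dll" and "C:\\Python311" are illustrative placeholders,
// not values used by lldb.
#include <windows.h>
#include <cstdio>

// Returns true if `dllName` can be resolved via the current DLL search path.
static bool IsDllResolvable(const wchar_t *dllName) {
  WCHAR foundPath[MAX_PATH];
  // SearchPathW returns the length of the resolved path, or 0 on failure.
  return SearchPathW(nullptr, dllName, nullptr, MAX_PATH, foundPath,
                     nullptr) > 0;
}

int main() {
  const wchar_t *dll = L"python311.dll"; // placeholder
  const wchar_t *dir = L"C:\\Python311"; // placeholder
  if (!IsDllResolvable(dll)) {
    // Extend the search path and probe a second time, mirroring the
    // fallback in SetupPythonRuntimeLibrary; report before any crash.
    if (!SetDllDirectoryW(dir) || !IsDllResolvable(dll))
      fwprintf(stderr, L"error: unable to find '%ls'.\n", dll);
  }
  return 0;
}
```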
From d4ca474ca3105743311130134562b40fddce7a56 Mon Sep 17 00:00:00 2001 From: Adrian Vogelsgesang <avogelsgesang@salesforce.com> Date: Mon, 3 Nov 2025 12:03:43 -0800 Subject: [PATCH 062/313] [libc++][docs] Fix documentation of `REQUIRES: std-at-least-*` (#166221) --- libcxx/docs/TestingLibcxx.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libcxx/docs/TestingLibcxx.rst b/libcxx/docs/TestingLibcxx.rst index dbe69484abedf..062c09571eafe 100644 --- a/libcxx/docs/TestingLibcxx.rst +++ b/libcxx/docs/TestingLibcxx.rst @@ -451,7 +451,7 @@ Instead use: .. code-block:: cpp - // UNSUPPORTED: std-at-least-c++26 + // REQUIRES: requires-std-at-least-c++26 There is no corresponding ``std-at-most-c++23``. This could be useful when tests are only valid for a small set of standard versions. For example, a From 42b608cdd6cd96e4fab7e4311731d6b0956376ad Mon Sep 17 00:00:00 2001 From: Adrian Prantl <aprantl@apple.com> Date: Mon, 3 Nov 2025 12:08:56 -0800 Subject: [PATCH 063/313] [lldb] Skip tests on older versions of clang --- .../API/commands/expression/weak_symbols/TestWeakSymbols.py | 2 +- .../TestLibcxxInternalsRecognizer.py | 2 +- .../API/lang/objc/modules-auto-import/TestModulesAutoImport.py | 1 + .../lang/objc/modules-objc-property/TestModulesObjCProperty.py | 1 + 4 files changed, 4 insertions(+), 2 deletions(-) diff --git a/lldb/test/API/commands/expression/weak_symbols/TestWeakSymbols.py b/lldb/test/API/commands/expression/weak_symbols/TestWeakSymbols.py index 50efecbc88c36..bed129a7a7a8c 100644 --- a/lldb/test/API/commands/expression/weak_symbols/TestWeakSymbols.py +++ b/lldb/test/API/commands/expression/weak_symbols/TestWeakSymbols.py @@ -15,7 +15,7 @@ class TestWeakSymbolsInExpressions(TestBase): NO_DEBUG_INFO_TESTCASE = True @skipUnlessDarwin - @skipIf(compiler="clang", compiler_version=["<", "7.0"]) + @skipIf(compiler="clang", compiler_version=["<", "19.0"]) def test_weak_symbol_in_expr(self): """Tests that we can refer to weak symbols in expressions.""" self.build() diff --git a/lldb/test/API/lang/cpp/libcxx-internals-recognizer/TestLibcxxInternalsRecognizer.py b/lldb/test/API/lang/cpp/libcxx-internals-recognizer/TestLibcxxInternalsRecognizer.py index 2f942da604ff2..d8a729b322fe4 100644 --- a/lldb/test/API/lang/cpp/libcxx-internals-recognizer/TestLibcxxInternalsRecognizer.py +++ b/lldb/test/API/lang/cpp/libcxx-internals-recognizer/TestLibcxxInternalsRecognizer.py @@ -9,7 +9,7 @@ class LibCxxInternalsRecognizerTestCase(TestBase): NO_DEBUG_INFO_TESTCASE = True @add_test_categories(["libc++"]) - @skipIf(compiler="clang", compiler_version=["<=", "19.0"]) + @skipIf(compiler="clang", compiler_version=["<", "19.0"]) def test_frame_recognizer(self): """Test that implementation details of libc++ are hidden""" self.build() diff --git a/lldb/test/API/lang/objc/modules-auto-import/TestModulesAutoImport.py b/lldb/test/API/lang/objc/modules-auto-import/TestModulesAutoImport.py index 142d27ddad37f..f3558f62d51f8 100644 --- a/lldb/test/API/lang/objc/modules-auto-import/TestModulesAutoImport.py +++ b/lldb/test/API/lang/objc/modules-auto-import/TestModulesAutoImport.py @@ -16,6 +16,7 @@ def setUp(self): self.line = line_number("main.m", "// Set breakpoint 0 here.") @skipIf(macos_version=["<", "10.12"]) + @skipIf(compiler="clang", compiler_version=["<", "19.0"]) def test_expr(self): self.build() exe = self.getBuildArtifact("a.out") diff --git a/lldb/test/API/lang/objc/modules-objc-property/TestModulesObjCProperty.py b/lldb/test/API/lang/objc/modules-objc-property/TestModulesObjCProperty.py 
index 3be064ae7d5f8..657a7103ee989 100644 --- a/lldb/test/API/lang/objc/modules-objc-property/TestModulesObjCProperty.py +++ b/lldb/test/API/lang/objc/modules-objc-property/TestModulesObjCProperty.py @@ -6,6 +6,7 @@ class TestCase(TestBase): @no_debug_info_test + @skipIf(compiler="clang", compiler_version=["<", "19.0"]) def test_conflicting_properties(self): """Tests receiving two properties with the same name from modules.""" self.build() From 009706ff6295882a17fb2af1a1eebdfe7c476114 Mon Sep 17 00:00:00 2001 From: Aiden Grossman <aidengrossman@google.com> Date: Mon, 3 Nov 2025 20:09:18 +0000 Subject: [PATCH 064/313] [Github] Use truncated body in llvm-bugs.yml \#166081 forgot to actually use this as the body. --- .github/workflows/llvm-bugs.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/llvm-bugs.yml b/.github/workflows/llvm-bugs.yml index cd3f396e7c465..3274f1adf9e61 100644 --- a/.github/workflows/llvm-bugs.yml +++ b/.github/workflows/llvm-bugs.yml @@ -52,7 +52,7 @@ jobs: url : issue.data.html_url, labels : issue.data.labels.map((label) => label.name), assignee : issue.data.assignees.map((assignee) => assignee.login), - body : issue.data.body + body : maybeTruncatedBody }; const data = { From dccced25a01478c339e37fd7ef30c0958cb43742 Mon Sep 17 00:00:00 2001 From: Adrian Vogelsgesang <avogelsgesang@salesforce.com> Date: Mon, 3 Nov 2025 12:13:27 -0800 Subject: [PATCH 065/313] [libc++][docs] Fix documentation of `REQUIRES: std-at-least-*` (#166226) Due to me not double-checking my PR, an overly eager AI auto-completion made it into my previous PR :/ --- libcxx/docs/TestingLibcxx.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libcxx/docs/TestingLibcxx.rst b/libcxx/docs/TestingLibcxx.rst index 062c09571eafe..e15c5b1a5d32f 100644 --- a/libcxx/docs/TestingLibcxx.rst +++ b/libcxx/docs/TestingLibcxx.rst @@ -451,7 +451,7 @@ Instead use: .. code-block:: cpp - // REQUIRES: requires-std-at-least-c++26 + // REQUIRES: std-at-least-c++26 There is no corresponding ``std-at-most-c++23``. This could be useful when tests are only valid for a small set of standard versions. For example, a From f02b661054547b423177c9498cdb554f5036a3e0 Mon Sep 17 00:00:00 2001 From: Adrian Vogelsgesang <avogelsgesang@salesforce.com> Date: Mon, 3 Nov 2025 12:18:43 -0800 Subject: [PATCH 066/313] [libc++] Add move constructor & assignment to `exception_ptr` (#164281) This commit adds move constructor, move assignment and `swap` to `exception_ptr`. Adding those operators allows us to avoid unnecessary calls to `__cxa_{inc,dec}rement_refcount`. Performance results (from libc++'s CI): ``` Benchmark Baseline Candidate Difference % Difference ------------------------------------ ---------- ----------- ------------ -------------- bm_exception_ptr_copy_assign_nonnull 9.77 9.94 0.18 1.79% bm_exception_ptr_copy_assign_null 10.29 10.65 0.35 3.42% bm_exception_ptr_copy_ctor_nonnull 7.02 7.01 -0.01 -0.13% bm_exception_ptr_copy_ctor_null 10.54 10.60 0.06 0.56% bm_exception_ptr_move_assign_nonnull 16.92 13.76 -3.16 -18.70% bm_exception_ptr_move_assign_null 10.61 10.76 0.14 1.36% bm_exception_ptr_move_ctor_nonnull 13.31 10.25 -3.06 -23.02% bm_exception_ptr_move_ctor_null 10.28 7.30 -2.98 -28.95% bm_exception_ptr_swap_nonnull 19.22 0.63 -18.59 -96.74% bm_exception_ptr_swap_null 20.02 7.79 -12.23 -61.07% ``` As expected, the `bm_exception_ptr_copy_*` benchmarks are not influenced by this change. `bm_exception_ptr_move_*` benefits between 18% and 30%. 
The `bm_exception_ptr_swap_*` tests show the biggest improvements since multiple calls to the copy constructor are replaced by a simple pointer swap. While `bm_exception_ptr_move_assign_null` did not show a regression in the CI measurements, local measurements showed a regression from 3.98 to 4.71, i.e. by 18%. This is due to the additional `__tmp` inside `operator=`. The destructor of `__other` is a no-op after the move because `__other.__ptr` will be a nullptr. However, the compiler does not realize this, since the destructor is not inlined and is lacking a fast-path. As such, the swap-based implementation leads to an additional destructor call. `bm_exception_ptr_move_assign_nonnull` still benefits because the swap-based move constructor avoids unnecessary __cxa_{in,de}crement_refcount calls. As soon as we inline the destructor, this regression should disappear again. Works towards #44892 --- libcxx/include/__exception/exception_ptr.h | 23 ++++++++++ libcxx/modules/std/exception.inc | 1 + .../propagation/exception_ptr.pass.cpp | 1 - .../exception_ptr_move_assignment.pass.cpp | 45 +++++++++++++++++++ .../exception_ptr_move_ctr.pass.cpp | 43 ++++++++++++++++++ .../propagation/exception_ptr_swap.pass.cpp | 40 +++++++++++++++++ 6 files changed, 152 insertions(+), 1 deletion(-) create mode 100644 libcxx/test/std/language.support/support.exception/propagation/exception_ptr_move_assignment.pass.cpp create mode 100644 libcxx/test/std/language.support/support.exception/propagation/exception_ptr_move_ctr.pass.cpp create mode 100644 libcxx/test/std/language.support/support.exception/propagation/exception_ptr_swap.pass.cpp diff --git a/libcxx/include/__exception/exception_ptr.h b/libcxx/include/__exception/exception_ptr.h index 796fa924be121..e78126ea23852 100644 --- a/libcxx/include/__exception/exception_ptr.h +++ b/libcxx/include/__exception/exception_ptr.h @@ -16,6 +16,8 @@ #include <__memory/construct_at.h> #include <__type_traits/decay.h> #include <__type_traits/is_pointer.h> +#include <__utility/move.h> +#include <__utility/swap.h> #include <cstdlib> #include <typeinfo> @@ -23,6 +25,9 @@ # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #ifndef _LIBCPP_ABI_MICROSOFT # if _LIBCPP_AVAILABILITY_HAS_INIT_PRIMARY_EXCEPTION @@ -57,6 +62,8 @@ _LIBCPP_BEGIN_UNVERSIONED_NAMESPACE_STD #ifndef _LIBCPP_ABI_MICROSOFT +inline _LIBCPP_HIDE_FROM_ABI void swap(exception_ptr& __x, exception_ptr& __y) _NOEXCEPT; + class _LIBCPP_EXPORTED_FROM_ABI exception_ptr { void* __ptr_; @@ -75,7 +82,15 @@ class _LIBCPP_EXPORTED_FROM_ABI exception_ptr { _LIBCPP_HIDE_FROM_ABI exception_ptr(nullptr_t) _NOEXCEPT : __ptr_() {} exception_ptr(const exception_ptr&) _NOEXCEPT; + _LIBCPP_HIDE_FROM_ABI exception_ptr(exception_ptr&& __other) _NOEXCEPT : __ptr_(__other.__ptr_) { + __other.__ptr_ = nullptr; + } exception_ptr& operator=(const exception_ptr&) _NOEXCEPT; + _LIBCPP_HIDE_FROM_ABI exception_ptr& operator=(exception_ptr&& __other) _NOEXCEPT { + exception_ptr __tmp(std::move(__other)); + std::swap(__tmp, *this); + return *this; + } ~exception_ptr() _NOEXCEPT; _LIBCPP_HIDE_FROM_ABI explicit operator bool() const _NOEXCEPT { return __ptr_ != nullptr; } @@ -88,10 +103,16 @@ class _LIBCPP_EXPORTED_FROM_ABI exception_ptr { return !(__x == __y); } + friend _LIBCPP_HIDE_FROM_ABI void swap(exception_ptr& __x, exception_ptr& __y) _NOEXCEPT; + friend _LIBCPP_EXPORTED_FROM_ABI exception_ptr current_exception() _NOEXCEPT; friend _LIBCPP_EXPORTED_FROM_ABI void rethrow_exception(exception_ptr); }; +inline 
_LIBCPP_HIDE_FROM_ABI void swap(exception_ptr& __x, exception_ptr& __y) _NOEXCEPT { + std::swap(__x.__ptr_, __y.__ptr_); +} + # if _LIBCPP_HAS_EXCEPTIONS # if _LIBCPP_AVAILABILITY_HAS_INIT_PRIMARY_EXCEPTION template <class _Ep> @@ -201,4 +222,6 @@ _LIBCPP_HIDE_FROM_ABI exception_ptr make_exception_ptr(_Ep __e) _NOEXCEPT { #endif // _LIBCPP_ABI_MICROSOFT _LIBCPP_END_UNVERSIONED_NAMESPACE_STD +_LIBCPP_POP_MACROS + #endif // _LIBCPP___EXCEPTION_EXCEPTION_PTR_H diff --git a/libcxx/modules/std/exception.inc b/libcxx/modules/std/exception.inc index 02b0f80190e5b..3dbc0112c15a0 100644 --- a/libcxx/modules/std/exception.inc +++ b/libcxx/modules/std/exception.inc @@ -18,6 +18,7 @@ export namespace std { using std::rethrow_exception; using std::rethrow_if_nested; using std::set_terminate; + using std::swap; using std::terminate; using std::terminate_handler; using std::throw_with_nested; diff --git a/libcxx/test/std/language.support/support.exception/propagation/exception_ptr.pass.cpp b/libcxx/test/std/language.support/support.exception/propagation/exception_ptr.pass.cpp index 0aded33e660d5..7e25d40dc8a7d 100644 --- a/libcxx/test/std/language.support/support.exception/propagation/exception_ptr.pass.cpp +++ b/libcxx/test/std/language.support/support.exception/propagation/exception_ptr.pass.cpp @@ -14,7 +14,6 @@ #include <exception> #include <cassert> -#include <type_traits> #include "test_macros.h" diff --git a/libcxx/test/std/language.support/support.exception/propagation/exception_ptr_move_assignment.pass.cpp b/libcxx/test/std/language.support/support.exception/propagation/exception_ptr_move_assignment.pass.cpp new file mode 100644 index 0000000000000..6882bc6548da3 --- /dev/null +++ b/libcxx/test/std/language.support/support.exception/propagation/exception_ptr_move_assignment.pass.cpp @@ -0,0 +1,45 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: no-exceptions, c++03 + +// <exception> + +// typedef unspecified exception_ptr; + +// Test the move assignment of exception_ptr + +#include <exception> +#include <utility> +#include <cassert> + +#include "test_macros.h" + +int main(int, char**) { + std::exception_ptr p = std::make_exception_ptr(42); + std::exception_ptr p2{p}; + assert(p2 == p); + // Under test: the move assignment + std::exception_ptr p3; + p3 = std::move(p2); + assert(p3 == p); +// `p2` was moved from. In libc++ it will be nullptr, but +// this is not guaranteed by the standard. 
+#if defined(_LIBCPP_VERSION) && !defined(_LIBCPP_ABI_MICROSOFT)
+  assert(p2 == nullptr);
+  assert(p2 == nullptr);
+#endif
+
+  try {
+    std::rethrow_exception(p3);
+  } catch (int e) {
+    assert(e == 42);
+  }
+
+  return 0;
+}
diff --git a/libcxx/test/std/language.support/support.exception/propagation/exception_ptr_move_ctr.pass.cpp b/libcxx/test/std/language.support/support.exception/propagation/exception_ptr_move_ctr.pass.cpp
new file mode 100644
index 0000000000000..122e229fd6e47
--- /dev/null
+++ b/libcxx/test/std/language.support/support.exception/propagation/exception_ptr_move_ctr.pass.cpp
@@ -0,0 +1,43 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: no-exceptions, c++03
+
+// <exception>
+
+// typedef unspecified exception_ptr;
+
+// Test the move constructor of exception_ptr
+
+#include <exception>
+#include <utility>
+#include <cassert>
+
+#include "test_macros.h"
+
+int main(int, char**) {
+  std::exception_ptr p = std::make_exception_ptr(42);
+  std::exception_ptr p2{p};
+  assert(p2 == p);
+  // Under test: The move constructor
+  std::exception_ptr p3{std::move(p2)};
+  assert(p3 == p);
+// `p2` was moved from. In libc++ it will be nullptr, but
+// this is not guaranteed by the standard.
+#if defined(_LIBCPP_VERSION) && !defined(_LIBCPP_ABI_MICROSOFT)
+  assert(p2 == nullptr);
+#endif
+
+  try {
+    std::rethrow_exception(p3);
+  } catch (int e) {
+    assert(e == 42);
+  }
+
+  return 0;
+}
diff --git a/libcxx/test/std/language.support/support.exception/propagation/exception_ptr_swap.pass.cpp b/libcxx/test/std/language.support/support.exception/propagation/exception_ptr_swap.pass.cpp
new file mode 100644
index 0000000000000..82b4713bed538
--- /dev/null
+++ b/libcxx/test/std/language.support/support.exception/propagation/exception_ptr_swap.pass.cpp
@@ -0,0 +1,40 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: no-exceptions
+
+// <exception>
+
+// typedef unspecified exception_ptr;
+
+// Test swapping of exception_ptr
+
+#include <exception>
+#include <utility>
+#include <cassert>
+
+#include "test_macros.h"
+
+int main(int, char**) {
+  std::exception_ptr p21 = std::make_exception_ptr(42);
+  std::exception_ptr p42 = std::make_exception_ptr(21);
+  std::swap(p42, p21);
+
+  try {
+    std::rethrow_exception(p21);
+  } catch (int e) {
+    assert(e == 21);
+  }
+  try {
+    std::rethrow_exception(p42);
+  } catch (int e) {
+    assert(e == 42);
+  }
+
+  return 0;
+}

From ccc473254fd2d0da01921e8402fbd4f678ff46f1 Mon Sep 17 00:00:00 2001
From: Doug Wyatt <doug@sonosphere.com>
Date: Mon, 3 Nov 2025 12:26:49 -0800
Subject: [PATCH 067/313] [Clang] FunctionEffects: properly extract the type of
 a bound member function from a CallExpr.
(#166101) There's a bug illustrated by this example: ``` template <typename T> struct Holder { T value; T& operator*() { return value; } }; struct X { using Dispatch = float (X::*)() [[clang::nonblocking]]; void fails(Holder<Dispatch>& holder) [[clang::nonblocking]] { (this->*(*holder))(); <<< the expression is incorrectly determined not to be nonblocking } void succeeds(Holder<Dispatch>& holder) [[clang::nonblocking]] { auto func = *holder; (this->*func)(); } }; ``` In both cases we have a `CXXMemberCallExpr`. In `succeeds`, the expression refers to a `Decl` (`func`) and gets a useful PTMF type. In `fails`, the expression does not refer to a `Decl` and its type is special, printed as `bound member function`. `Expr` provides a method for extracting the true type so we can use that in this situation. --------- Co-authored-by: Doug Wyatt <dwyatt@apple.com> Co-authored-by: Sirraide <aeternalmail@gmail.com> --- clang/lib/Sema/SemaFunctionEffects.cpp | 12 +++++-- .../Sema/attr-nonblocking-constraints.cpp | 33 +++++++++++++++---- 2 files changed, 36 insertions(+), 9 deletions(-) diff --git a/clang/lib/Sema/SemaFunctionEffects.cpp b/clang/lib/Sema/SemaFunctionEffects.cpp index 8590ee831084f..5459861ec349d 100644 --- a/clang/lib/Sema/SemaFunctionEffects.cpp +++ b/clang/lib/Sema/SemaFunctionEffects.cpp @@ -1208,8 +1208,16 @@ class Analyzer { return true; } - // No Decl, just an Expr. Just check based on its type. - checkIndirectCall(Call, CalleeExpr->getType()); + // No Decl, just an Expr. Just check based on its type. Bound member + // functions are a special expression type and need to be specially + // unpacked. + QualType CalleeExprQT = CalleeExpr->getType(); + if (CalleeExpr->isBoundMemberFunction(Outer.S.getASTContext())) { + QualType QT = Expr::findBoundMemberType(CalleeExpr); + if (!QT.isNull()) + CalleeExprQT = QT; + } + checkIndirectCall(Call, CalleeExprQT); return true; } diff --git a/clang/test/Sema/attr-nonblocking-constraints.cpp b/clang/test/Sema/attr-nonblocking-constraints.cpp index b26a945843696..0d2dbb4947dc8 100644 --- a/clang/test/Sema/attr-nonblocking-constraints.cpp +++ b/clang/test/Sema/attr-nonblocking-constraints.cpp @@ -235,16 +235,35 @@ void nb13() [[clang::nonblocking]] { nb12(); } // C++ member function pointers struct PTMFTester { typedef void (PTMFTester::*ConvertFunction)() [[clang::nonblocking]]; - - void convert() [[clang::nonblocking]]; + typedef void (PTMFTester::*BlockingFunction)(); ConvertFunction mConvertFunc; -}; -void PTMFTester::convert() [[clang::nonblocking]] -{ - (this->*mConvertFunc)(); -} + void convert() [[clang::nonblocking]] + { + (this->*mConvertFunc)(); // This should not generate a warning. + } + + template <typename T> + struct Holder { + T value; + + T& operator*() { return value; } + }; + + + void ptmfInExpr(Holder<ConvertFunction>& holder) [[clang::nonblocking]] + { + (this->*(*holder))(); // Should not generate a warning. + ((*this).*(*holder))(); // Should not generate a warning. + } + + void ptmfInExpr(Holder<BlockingFunction>& holder) [[clang::nonblocking]] + { + (this->*(*holder))(); // expected-warning {{function with 'nonblocking' attribute must not call non-'nonblocking' expression}} + ((*this).*(*holder))(); // expected-warning {{function with 'nonblocking' attribute must not call non-'nonblocking' expression}} + } +}; // Allow implicit conversion from array to pointer. 
void nb14(unsigned idx) [[clang::nonblocking]] From c081fb058831cedce8466809e0d15daadd3ccad2 Mon Sep 17 00:00:00 2001 From: Craig Topper <craig.topper@sifive.com> Date: Mon, 3 Nov 2025 12:30:11 -0800 Subject: [PATCH 068/313] [RISCV] Removed unused OPERAND_SIMM8. NFC (#166215) --- llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h | 1 - 1 file changed, 1 deletion(-) diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h index e75dfe33814c6..5b8cfb2100b26 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h @@ -407,7 +407,6 @@ enum OperandType : unsigned { OPERAND_SIMM5_PLUS1, OPERAND_SIMM6, OPERAND_SIMM6_NONZERO, - OPERAND_SIMM8, OPERAND_SIMM8_UNSIGNED, OPERAND_SIMM10, OPERAND_SIMM10_LSB0000_NONZERO, From 0623497a0fcdb8cd32139c184c9f0d70dcd690f1 Mon Sep 17 00:00:00 2001 From: Craig Topper <craig.topper@sifive.com> Date: Mon, 3 Nov 2025 12:35:06 -0800 Subject: [PATCH 069/313] [RISCV] Mark FLH as canFoldAsLoad. (#165974) --- llvm/lib/Target/RISCV/RISCVInstrInfoZfh.td | 1 + llvm/test/CodeGen/RISCV/rv64-stackmap-fp.ll | 41 +++++++++++++++++++++ 2 files changed, 42 insertions(+) create mode 100644 llvm/test/CodeGen/RISCV/rv64-stackmap-fp.ll diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZfh.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZfh.td index c31713e967b18..1c6a5afcda49b 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoZfh.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZfh.td @@ -90,6 +90,7 @@ defvar ZfhminDExts = [ZfhminDExt, ZhinxminZdinxExt, ZhinxminZdinx32Ext]; //===----------------------------------------------------------------------===// let Predicates = [HasHalfFPLoadStoreMove] in { +let canFoldAsLoad = 1 in def FLH : FPLoad_r<0b001, "flh", FPR16, WriteFLD16>; // Operands for stores are in the order srcreg, base, offset rather than diff --git a/llvm/test/CodeGen/RISCV/rv64-stackmap-fp.ll b/llvm/test/CodeGen/RISCV/rv64-stackmap-fp.ll new file mode 100644 index 0000000000000..bf0a2e5e35a01 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rv64-stackmap-fp.ll @@ -0,0 +1,41 @@ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh < %s | FileCheck %s + +; CHECK-LABEL: .section .llvm_stackmaps +; CHECK-NEXT: __LLVM_StackMaps: +; Header +; CHECK-NEXT: .byte 3 +; CHECK-NEXT: .byte 0 +; CHECK-NEXT: .half 0 +; Num Functions +; CHECK-NEXT: .word 1 +; Num LargeConstants +; CHECK-NEXT: .word 0 +; Num Callsites +; CHECK-NEXT: .word 1 + +; Functions and stack size +; CHECK-NEXT: .quad liveArgs +; CHECK-NEXT: .quad 0 +; CHECK-NEXT: .quad 1 + +; Spilled stack map values. +; +; Verify 3 stack map entries. +; +; CHECK-LABEL: .word .L{{.*}}-liveArgs +; CHECK-NEXT: .half 0 +; CHECK-NEXT: .half 25 +; +; Check that at least one is a spilled entry from SP. +; Location: Indirect SP + ... +; CHECK: .byte 3 +; CHECK-NEXT: .byte 0 +; CHECK-NEXT: .half 8 +; CHECK-NEXT: .half 2 +; CHECK-NEXT: .half 0 +; CHECK-NEXT: .word +define void @liveArgs(double %arg0, double %arg1, double %arg2, double %arg3, double %arg4, double %arg5, double %arg6, double %arg7, double %arg8, double %arg9, double %arg10, double %arg11, double %arg12, double %arg13, double %arg14, double %arg15, double %arg16, double %arg17, double %arg18, double %arg19, double %arg20, double %arg21, double %arg22, double %arg23, half %arg24, half %arg25, half %arg26, half %arg27, half %arg28, bfloat %arg29) { +entry: + call void (i64, i32, ptr, i32, ...) 
@llvm.experimental.patchpoint.void(i64 11, i32 28, ptr null, i32 5, double %arg0, double %arg1, double %arg2, double %arg3, double %arg4, double %arg5, double %arg6, double %arg7, double %arg8, double %arg9, double %arg10, double %arg11, double %arg12, double %arg13, double %arg14, double %arg15, double %arg16, double %arg17, double %arg18, double %arg19, double %arg20, double %arg21, double %arg22, double %arg23, half %arg24, half %arg25, half %arg26, half %arg27, half %arg28, bfloat %arg29) + ret void +} From ca00234c09e03bdb3471c83a24f1b8bc1fdb31f9 Mon Sep 17 00:00:00 2001 From: Craig Topper <craig.topper@sifive.com> Date: Mon, 3 Nov 2025 12:35:54 -0800 Subject: [PATCH 070/313] [RISCV] Correct comments in rv64-stackmap.ll to not use X86 register name. NFC (#165912) Note, X86 forces a frame pointer for stackmaps/patchpoint. So they use RBP where we use SP. --- llvm/test/CodeGen/RISCV/rv64-stackmap.ll | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/llvm/test/CodeGen/RISCV/rv64-stackmap.ll b/llvm/test/CodeGen/RISCV/rv64-stackmap.ll index c50a0fb3ffe91..320a3aa94cd7d 100644 --- a/llvm/test/CodeGen/RISCV/rv64-stackmap.ll +++ b/llvm/test/CodeGen/RISCV/rv64-stackmap.ll @@ -286,8 +286,8 @@ define void @liveConstant() { ; CHECK-NEXT: .half 0 ; CHECK-NEXT: .half 28 ; -; Check that at least one is a spilled entry from RBP. -; Location: Indirect RBP + ... +; Check that at least one is a spilled entry from SP. +; Location: Indirect SP + ... ; CHECK: .byte 3 ; CHECK-NEXT: .byte 0 ; CHECK-NEXT: .half 8 @@ -307,7 +307,7 @@ entry: ; CHECK-NEXT: .half 0 ; 1 location ; CHECK-NEXT: .half 1 -; Loc 0: Direct RBP - ofs +; Loc 0: Direct SP + ofs ; CHECK-NEXT: .byte 2 ; CHECK-NEXT: .byte 0 ; CHECK-NEXT: .half 8 @@ -320,14 +320,14 @@ entry: ; CHECK-NEXT: .half 0 ; 2 locations ; CHECK-NEXT: .half 2 -; Loc 0: Direct RBP - ofs +; Loc 0: Direct SP + ofs ; CHECK-NEXT: .byte 2 ; CHECK-NEXT: .byte 0 ; CHECK-NEXT: .half 8 ; CHECK-NEXT: .half 2 ; CHECK-NEXT: .half 0 ; CHECK-NEXT: .word -; Loc 1: Direct RBP - ofs +; Loc 1: Direct SP + ofs ; CHECK-NEXT: .byte 2 ; CHECK-NEXT: .byte 0 ; CHECK-NEXT: .half 8 From 68c4c83bcbf9612a02074b946fe6bb73054183ef Mon Sep 17 00:00:00 2001 From: Artem Kroviakov <71938912+akroviakov@users.noreply.github.com> Date: Mon, 3 Nov 2025 21:48:27 +0100 Subject: [PATCH 071/313] [MLIR][XeGPU] Matrix load/store subgroup distribution (#165008) --- mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h | 4 +- .../mlir/Dialect/XeGPU/IR/XeGPUAttrs.td | 35 ++-- .../mlir/Dialect/XeGPU/Transforms/Passes.td | 2 +- .../Conversion/XeGPUToXeVM/XeGPUToXeVM.cpp | 2 + mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp | 162 ++++++++------- mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp | 38 +++- .../Transforms/XeGPUSubgroupDistribute.cpp | 186 +++++++++++++++++- .../Transforms/XeGPUWgToSgDistribute.cpp | 10 +- mlir/test/Dialect/XeGPU/invalid.mlir | 31 +-- .../Dialect/XeGPU/subgroup-distribute.mlir | 63 ++++++ .../lib/Dialect/XeGPU/TestXeGPUTransforms.cpp | 3 +- 11 files changed, 417 insertions(+), 119 deletions(-) diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h index 1481859e94a92..0c059967bb898 100644 --- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h +++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h @@ -30,9 +30,11 @@ class SliceAttr; } // namespace xegpu } // namespace mlir +// clang-format off +#include <mlir/Dialect/XeGPU/IR/XeGPUEnums.h.inc> #include <mlir/Dialect/XeGPU/IR/XeGPUAttrInterface.h.inc> #include <mlir/Dialect/XeGPU/IR/XeGPUDialect.h.inc> 
-#include <mlir/Dialect/XeGPU/IR/XeGPUEnums.h.inc> +// clang-format on #define GET_ATTRDEF_CLASSES #include <mlir/Dialect/XeGPU/IR/XeGPUAttrs.h.inc> diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td index 40352b44b6441..9c35c07a7e587 100644 --- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td +++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td @@ -223,17 +223,17 @@ def DistributeLayoutAttr: AttrInterface<"DistributeLayoutAttr"> { InterfaceMethod<"Derive a new layout by dropping InstData", "xegpu::DistributeLayoutAttr", "dropInstData">, - InterfaceMethod<[{Delinearizes a linear subgroup ID into its multidimensional - indices based on the effective subgroup layout.}], + InterfaceMethod<[{Delinearizes a linear ID into its multidimensional + indices based on the effective layout level.}], "FailureOr<SmallVector<Value>>", - "delinearizeSubgroupId", + "delinearizeId", (ins "OpBuilder &": $builder, "Location":$loc, "Value":$linearId)>, - InterfaceMethod<[{Generates instructions to compute multidimensional offsets for blocks - assigned to a subgroup identified by linearId. The shape parameter - represents the workgroup-level problem size. Each subgroup may access + InterfaceMethod<[{Generates instructions to compute multidimensional coordinates for dist units + assigned to a level identified by linearId. The shape parameter + represents the higher-level problem size. Each level may access multiple blocks according to round-robin distribution rules.}], "FailureOr<SmallVector<SmallVector<Value>>>", - "getOffsets", + "computeDistributedCoords", (ins "OpBuilder &": $builder, "Location":$loc, "Value":$linearId, "ArrayRef<int64_t>":$shape)>, InterfaceMethod</*desc=*/[{Check if this layout can be achieved by applying a transpose to some other layout according to given permutation of (0...n-1).}], @@ -476,17 +476,17 @@ def XeGPU_LayoutAttr : XeGPUAttr<"Layout", "layout", [DistributeLayoutAttr]> { return {}; } - /// Delinearizes a linear subgroup ID into its multidimensional indices - /// based on the effective subgroup layout. + /// Delinearizes a linear ID into its multidimensional indices + /// based on the effective level of the layout. FailureOr<SmallVector<Value>> - delinearizeSubgroupId(OpBuilder &builder, Location loc, Value linearId); + delinearizeId(OpBuilder &builder, Location loc, Value linearId); - /// Generates instructions to compute multidimensional offsets for blocks - /// assigned to a subgroup identified by linearId. The shape parameter - /// represents the workgroup-level problem size. Each subgroup may access + /// Generates instructions to compute multidimensional coordinates for dist units + /// assigned to a level identified by linearId. The shape parameter + /// represents the higher-level problem size. Each `level` may access /// multiple blocks according to round-robin distribution rules. FailureOr<SmallVector<SmallVector<Value>>> - getOffsets(OpBuilder &builder, Location loc, Value linearId, ArrayRef<int64_t> shape); + computeDistributedCoords(OpBuilder &builder, Location loc, Value linearId, ArrayRef<int64_t> shape); /// Check if this is slice of some other layout. bool isSliceOf(const xegpu::DistributeLayoutAttr &other) { return false; } @@ -643,14 +643,15 @@ def XeGPU_SliceAttr : XeGPUAttr<"Slice", "slice", [DistributeLayoutAttr]> { /// Delinearizes a linear subgroup ID into its multidimensional indices /// based on the effective subgroup layout. 
   FailureOr<SmallVector<Value>>
-  delinearizeSubgroupId(OpBuilder &builder, Location loc, Value linearId);
+  delinearizeId(OpBuilder &builder, Location loc, Value linearId);
 
-  /// Generates instructions to compute multidimensional offsets for blocks
+  /// Generates instructions to compute multidimensional coordinates for blocks
   /// assigned to a subgroup identified by linearId. The shape parameter
   /// represents the workgroup-level problem size. Each subgroup may access
   /// multiple blocks according to round-robin distribution rules.
+
   FailureOr<SmallVector<SmallVector<Value>>>
-  getOffsets(OpBuilder &builder, Location loc, Value linearId, ArrayRef<int64_t> shape);
+  computeDistributedCoords(OpBuilder &builder, Location loc, Value linearId, ArrayRef<int64_t> shape);
 
   /// Check if this is slice of some other layout.
   bool isSliceOf(const xegpu::DistributeLayoutAttr &other);
diff --git a/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td b/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td
index b7af5413669c9..eb05628d4772b 100644
--- a/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td
+++ b/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td
@@ -26,7 +26,7 @@ def XeGPUSubgroupDistribute : Pass<"xegpu-subgroup-distribute"> {
     The pass distributes subgroup level (SIMD) XeGPU ops to work items.
   }];
   let dependentDialects = ["memref::MemRefDialect", "xegpu::XeGPUDialect",
-                           "vector::VectorDialect"];
+                           "vector::VectorDialect", "index::IndexDialect"];
 }
 
 def XeGPUPropagateLayout : Pass<"xegpu-propagate-layout"> {
diff --git a/mlir/lib/Conversion/XeGPUToXeVM/XeGPUToXeVM.cpp b/mlir/lib/Conversion/XeGPUToXeVM/XeGPUToXeVM.cpp
index 33e8f2ed1f6ed..de552ce22ef62 100644
--- a/mlir/lib/Conversion/XeGPUToXeVM/XeGPUToXeVM.cpp
+++ b/mlir/lib/Conversion/XeGPUToXeVM/XeGPUToXeVM.cpp
@@ -562,6 +562,8 @@ class LoadStoreMatrixToXeVMPattern : public OpConversionPattern<OpType> {
     VectorType valOrResVecTy = dyn_cast<VectorType>(data.getType());
     if (!valOrResVecTy)
       valOrResVecTy = VectorType::get(1, data.getType());
+    if (valOrResVecTy.getShape().size() != 1)
+      return rewriter.notifyMatchFailure(op, "Expected 1D data vector.");
 
     int64_t elemBitWidth =
         valOrResVecTy.getElementType().getIntOrFloatBitWidth();
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
index 83406c8c75dcf..397107b786c9e 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
@@ -37,55 +37,61 @@ void XeGPUDialect::initialize() {
       >();
 }
 
-/// Generates instructions to compute offsets for a subgroup identified by
-/// its multidimensional indices (sgId), using the specified subgroup layout
-/// (sgLayout), subgroup data dimensions (sizePerSg), and the overall data
-/// dimensions (sizePerWg).
+// A `srcShape` consists of N distribution units, each being `subShapesLayout` x
+// `subShape`. A `delinearizedId` is used to identify a particular `subShape`
+// within each distribution unit.
+// Example:
+// WG data is 128x256 and SG data is 16x32 in a 4x2 layout. This gives a
+// distribution unit of shape 64x64, and there are 2x4 such distribution units.
+// `delinearizedId` identifies a subgroup's 16x32 tile within each
+// distribution unit.
static SmallVector<SmallVector<Value>> -genOffsetsComputingInsts(OpBuilder &builder, Location loc, - SmallVector<Value> sgId, ArrayRef<int64_t> sgLayout, - ArrayRef<int64_t> sizePerSg, - ArrayRef<int64_t> sizePerWg) { - - SmallVector<SmallVector<Value>> offsets; +genCoordinates(OpBuilder &builder, Location loc, + SmallVector<Value> delinearizedId, + ArrayRef<int64_t> subShapesLayout, ArrayRef<int64_t> subShape, + ArrayRef<int64_t> srcShape) { + SmallVector<SmallVector<Value>> coordinates; + + // A distribution unit must be less than or equal to `srcShape` + SmallVector<int64_t> distUnitShape = llvm::map_to_vector( + llvm::zip_equal(srcShape, + computeElementwiseMul(subShapesLayout, subShape)), + [](const auto &t) { return std::min(std::get<0>(t), std::get<1>(t)); }); - // nd local offset, localOffset[i] = sgId[i] * sizePerSg[i] - SmallVector<Value> localOffsets = llvm::map_to_vector( - llvm::zip(sgId, sizePerSg), [&](const auto &t) -> Value { + // Get the offset of `subShape` within a distribution unit. + SmallVector<Value> distUnitLocalOffset = llvm::map_to_vector( + llvm::zip(delinearizedId, subShape), [&](const auto &t) -> Value { return builder.createOrFold<index::MulOp>( loc, std::get<0>(t), builder.createOrFold<arith::ConstantIndexOp>(loc, std::get<1>(t))); }); - // distUnit[i] is the minimum value between sizePerWg[i] and - // sgLayout[i] * sizePerSg[i] - SmallVector<int64_t> distUnit = llvm::map_to_vector( - llvm::zip_equal(sizePerWg, computeElementwiseMul(sgLayout, sizePerSg)), - [](const auto &t) { return std::min(std::get<0>(t), std::get<1>(t)); }); - + // For each dist unit for (SmallVector<int64_t> unitOffs : - StaticTileOffsetRange(sizePerWg, distUnit)) { + StaticTileOffsetRange(srcShape, distUnitShape)) { + // Get dist unit offset within `srcShape`. SmallVector<Value> base = llvm::map_to_vector(unitOffs, [&](int64_t d) -> Value { return arith::ConstantIndexOp::create(builder, loc, d); }); - - SmallVector<Value> adds = llvm::map_to_vector( - llvm::zip_equal(base, localOffsets), [&](const auto &t) -> Value { - return builder.createOrFold<arith::AddIOp>(loc, std::get<0>(t), - std::get<1>(t)); - }); - + // Calculate `subShape` offset within `srcShape`. + SmallVector<Value> adds = + llvm::map_to_vector(llvm::zip_equal(base, distUnitLocalOffset), + [&](const auto &t) -> Value { + return builder.createOrFold<arith::AddIOp>( + loc, std::get<0>(t), std::get<1>(t)); + }); + // Do not go beyond `srcShape` bounds. 
SmallVector<Value> mods = llvm::map_to_vector( - llvm::zip_equal(adds, sizePerWg), [&](const auto &t) -> Value { + llvm::zip_equal(adds, srcShape), [&](const auto &t) -> Value { return builder.createOrFold<index::RemUOp>( loc, std::get<0>(t), arith::ConstantIndexOp::create(builder, loc, std::get<1>(t))); }); - offsets.push_back(mods); + coordinates.push_back(mods); } - return offsets; + return coordinates; } // Checks if the given shape can be evenly distributed based on the layout @@ -272,12 +278,7 @@ LayoutAttr::verify(llvm::function_ref<mlir::InFlightDiagnostic()> emitError, } FailureOr<SmallVector<Value>> -LayoutAttr::delinearizeSubgroupId(OpBuilder &builder, Location loc, - Value linearId) { - // delinearizeSubgroupId is only available for - // workgroup-level layout attribute - if (!isForWorkgroup()) - return failure(); +LayoutAttr::delinearizeId(OpBuilder &builder, Location loc, Value linearId) { // TODO: handle order attribute auto hasDefaultOrder = [&]() { @@ -287,41 +288,52 @@ LayoutAttr::delinearizeSubgroupId(OpBuilder &builder, Location loc, }; if (!hasDefaultOrder()) return mlir::emitError(loc, "order attribute is currently not supported."); - - auto dims = - llvm::map_to_vector(getEffectiveSgLayoutAsInt(), [&](int64_t d) -> Value { - return builder.createOrFold<arith::ConstantIndexOp>(loc, d); - }); + SmallVector<int64_t> layout; + if (isForWorkgroup()) { + layout = getEffectiveSgLayoutAsInt(); + } else if (isForSubgroup()) { + layout = getEffectiveLaneLayoutAsInt(); + } else { + return failure(); + } + auto dims = llvm::map_to_vector(layout, [&](int64_t d) -> Value { + return builder.createOrFold<arith::ConstantIndexOp>(loc, d); + }); return affine::delinearizeIndex(builder, loc, linearId, dims); } -/// Implements DistributeLayoutAttr::getOffsets to generate +/// Implements DistributeLayoutAttr::computeDistributedCoords to generate /// instructions for computing multi-dimensional offsets when distributed by /// LayoutAttr. 
 FailureOr<SmallVector<SmallVector<Value>>>
-LayoutAttr::getOffsets(OpBuilder &builder, Location loc, Value linearId,
-                       ArrayRef<int64_t> shape) {
-  if (!isForWorkgroup())
+LayoutAttr::computeDistributedCoords(OpBuilder &builder, Location loc,
+                                     Value linearId, ArrayRef<int64_t> shape) {
+  SmallVector<int64_t> layout;
+  SmallVector<int64_t> subShape;
+  if (isForWorkgroup()) {
+    layout = getEffectiveSgLayoutAsInt();
+    subShape = getEffectiveSgDataAsInt();
+  } else if (isForSubgroup()) {
+    layout = getEffectiveLaneLayoutAsInt();
+    subShape = getEffectiveLaneDataAsInt();
+  } else {
     return failure();
-
-  SmallVector<int64_t> sgLayout = getEffectiveSgLayoutAsInt();
-  SmallVector<int64_t> sgShape = getEffectiveSgDataAsInt();
-  if (sgShape.empty()) {
-    if (auto derivedShape = computeShapeRatio(shape, sgLayout))
-      sgShape = derivedShape.value();
+  }
+  if (subShape.empty()) {
+    if (auto derivedShape = computeShapeRatio(shape, layout))
+      subShape = derivedShape.value();
     else
       return failure();
   }
 
   // delinearize Ids
-  auto maybeIds = delinearizeSubgroupId(builder, loc, linearId);
+  auto maybeIds = delinearizeId(builder, loc, linearId);
   if (failed(maybeIds))
     return failure();
-  SmallVector<Value> sgIds = *maybeIds;
+  SmallVector<Value> ids = *maybeIds;
 
-  return genOffsetsComputingInsts(builder, loc, sgIds, sgLayout, sgShape,
-                                  shape);
+  return genCoordinates(builder, loc, ids, layout, subShape, shape);
 }
 
 //===----------------------------------------------------------------------===//
@@ -375,34 +387,43 @@ SliceAttr SliceAttr::flatten() const {
 }
 
 FailureOr<SmallVector<Value>>
-SliceAttr::delinearizeSubgroupId(OpBuilder &builder, Location loc,
-                                 Value linearId) {
+SliceAttr::delinearizeId(OpBuilder &builder, Location loc, Value linearId) {
   SliceAttr attr = flatten();
   auto parent = dyn_cast<LayoutAttr>(attr.getParent());
-  return parent.delinearizeSubgroupId(builder, loc, linearId);
+  return parent.delinearizeId(builder, loc, linearId);
 }
 
-/// Implements DistributeLayoutAttr::getOffsets to generate
-/// instructions for computing multi-dimensional offsets when distributed by
-/// SliceAttr.
+// Implements DistributeLayoutAttr::computeDistributedCoords to generate
+// instructions for computing multi-dimensional offsets when distributed by
+// SliceAttr.
FailureOr<SmallVector<SmallVector<Value>>> -SliceAttr::getOffsets(OpBuilder &builder, Location loc, Value linearId, - ArrayRef<int64_t> shape) { +SliceAttr::computeDistributedCoords(OpBuilder &builder, Location loc, + Value linearId, ArrayRef<int64_t> shape) { assert(getRank() == static_cast<int64_t>(shape.size()) && "invalid shape."); if (!isForWorkgroup()) return failure(); - SmallVector<int64_t> sgLayout = getEffectiveSgLayoutAsInt(); - SmallVector<int64_t> sgShape = getEffectiveSgDataAsInt(); - if (sgShape.empty()) { - if (auto derivedShape = computeShapeRatio(shape, sgLayout)) - sgShape = derivedShape.value(); + SmallVector<int64_t> layout; + SmallVector<int64_t> subShape; + if (isForWorkgroup()) { + layout = getEffectiveSgLayoutAsInt(); + subShape = getEffectiveSgDataAsInt(); + } else if (isForSubgroup()) { + layout = getEffectiveLaneLayoutAsInt(); + subShape = getEffectiveLaneDataAsInt(); + } else { + return failure(); + } + + if (subShape.empty()) { + if (auto derivedShape = computeShapeRatio(shape, layout)) + subShape = derivedShape.value(); else return failure(); } // delinearize Ids - auto maybeIds = delinearizeSubgroupId(builder, loc, linearId); + auto maybeIds = delinearizeId(builder, loc, linearId); if (failed(maybeIds)) return failure(); @@ -412,8 +433,7 @@ SliceAttr::getOffsets(OpBuilder &builder, Location loc, Value linearId, SmallVector<Value> sgIds = XeGPUDialect::slice(ArrayRef<Value>(*maybeIds), dims); - return genOffsetsComputingInsts(builder, loc, sgIds, sgLayout, sgShape, - shape); + return genCoordinates(builder, loc, sgIds, layout, subShape, shape); } bool SliceAttr::isSliceOf(const xegpu::DistributeLayoutAttr &other) { diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp index abd12e2e69ac0..7b6c4b6c2c813 100644 --- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp +++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp @@ -175,13 +175,13 @@ isValidGatherScatterBufferParams(Type offsetsTy, Type maskTy, LogicalResult IsValidMatrixOpParams(VectorType dataTy, MemDescType mdescTy, - UnitAttr subgroup_block_io, + UnitAttr subgroup_block_io, DistributeLayoutAttr layout, function_ref<InFlightDiagnostic()> emitError) { if (!dataTy) { if (subgroup_block_io) return emitError() << "subgroup_block_io " - "are only allowed when result is a 1D VectorType."; + "are only allowed when result is a VectorType."; else return success(); } @@ -192,15 +192,37 @@ IsValidMatrixOpParams(VectorType dataTy, MemDescType mdescTy, ArrayRef<int64_t> dataShape = dataTy.getShape(); ArrayRef<int64_t> mdescShape = mdescTy.getShape(); + SmallVector<int64_t> blockShape = mdescTy.getBlockShape(); + ArrayAttr strideAttr = mdescTy.getStrideAttr(); + SmallVector<int64_t> strides; + for (Attribute attr : strideAttr.getValue()) { + strides.push_back(cast<IntegerAttr>(attr).getInt()); + } + if (subgroup_block_io && layout) { + auto laneData = layout.getEffectiveLaneDataAsInt(); + auto laneLayout = layout.getEffectiveLaneLayoutAsInt(); + if (!laneData.empty()) { + bool isLaneDataContiguous = + std::all_of(laneData.begin(), std::prev(laneData.end()), + [](int x) { return x == 1; }); + if (!isLaneDataContiguous) + return emitError() << "With subgroup_block_io, accessed data must be " + "contiguous and coalesced."; + for (size_t i = 0; i < laneData.size(); ++i) { + if (laneLayout[i] != blockShape[i]) + return emitError() << "With subgroup_block_io, the block shape must " + "match the lane layout."; + if (laneLayout[i] != 1 && strides[i] != 1) + return emitError() << "With subgroup_block_io, 
the distributed " + "dimensions must be contiguous."; + } + } + } if (dataShape.size() == 2) { - if (subgroup_block_io) - return emitError() << "subgroup_block_io " - "are only allowed when result is a 1D VectorType."; if (llvm::any_of(llvm::zip_equal(dataShape, mdescShape), [](auto p) { return std::get<0>(p) > std::get<1>(p); })) return emitError() << "data shape must not exceed mem_desc shape."; } else { - SmallVector<int64_t> blockShape = mdescTy.getBlockShape(); // if the subgroup_block_io attribute is set, mdescTy must have block // attribute if (subgroup_block_io && !blockShape.size()) @@ -1105,7 +1127,7 @@ LogicalResult LoadMatrixOp::verify() { MemDescType mdescTy = getMemDesc().getType(); return IsValidMatrixOpParams(resTy, mdescTy, subgroup_block_io, - [&]() { return emitError(); }); + getLayoutAttr(), [&]() { return emitError(); }); } //===----------------------------------------------------------------------===// @@ -1129,7 +1151,7 @@ LogicalResult StoreMatrixOp::verify() { UnitAttr subgroup_block_io = getSubgroupBlockIoAttr(); MemDescType mdescTy = getMemDesc().getType(); return IsValidMatrixOpParams(dataTy, mdescTy, subgroup_block_io, - [&]() { return emitError(); }); + getLayoutAttr(), [&]() { return emitError(); }); } namespace mlir { diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp index 5a3b27ec6108e..bbd7733e89c29 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "mlir/Dialect/GPU/IR/GPUDialect.h" #include "mlir/Dialect/GPU/Utils/DistributionUtils.h" +#include "mlir/Dialect/Index/IR/IndexDialect.h" #include "mlir/Dialect/MemRef/IR/MemRef.h" #include "mlir/Dialect/Vector/IR/VectorOps.h" #include "mlir/Dialect/Vector/Transforms/VectorDistribution.h" @@ -912,6 +913,186 @@ struct StoreDistribution final : public gpu::WarpDistributionPattern { } }; +static SmallVector<Value> computeDistributedCoordinatesForMatrixOp( + PatternRewriter &rewriter, Location loc, xegpu::DistributeLayoutAttr layout, + Value laneId, ArrayRef<int64_t> payloadShape, ValueRange origOffsets) { + SmallVector<Value> newCoods; + auto maybeCoords = + layout.computeDistributedCoords(rewriter, loc, laneId, payloadShape); + if (failed(maybeCoords)) + return {}; + assert(maybeCoords.value().size() == 1 && + "Expected one set of distributed offsets"); + SmallVector<OpFoldResult> ofrVec = xegpu::addWithRightAligned( + rewriter, loc, getAsOpFoldResult(maybeCoords.value()[0]), + getAsOpFoldResult(origOffsets)); + newCoods = llvm::to_vector(llvm::map_range( + ofrVec, [&](OpFoldResult ofr) -> Value { return cast<Value>(ofr); })); + return newCoods; +} + +/// Pattern for distributing xegpu::LoadMatrixOp. 
+struct LoadMatrixDistribution final : public gpu::WarpDistributionPattern { + using gpu::WarpDistributionPattern::WarpDistributionPattern; + LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp, + PatternRewriter &rewriter) const override { + gpu::YieldOp yield = warpOp.getTerminator(); + Operation *lastNode = yield->getPrevNode(); + auto matrixOp = dyn_cast_or_null<xegpu::LoadMatrixOp>(lastNode); + if (!matrixOp) + return failure(); + + OpOperand *producedByLastLoad = getWarpResult(warpOp, [&](Operation *op) { + return isa<xegpu::LoadMatrixOp>(op) && matrixOp == op; + }); + if (!producedByLastLoad) + return rewriter.notifyMatchFailure( + warpOp, "The last op is not xegpu::LoadMatrixOp"); + const int operandIdx = producedByLastLoad->getOperandNumber(); + + VectorType sgPayloadTy = + dyn_cast<VectorType>(matrixOp.getResult().getType()); + VectorType warpResultTy = + cast<VectorType>(warpOp.getResult(operandIdx).getType()); + if (!sgPayloadTy) + return rewriter.notifyMatchFailure( + matrixOp, "the matrix op payload must be a vector type"); + + auto loc = matrixOp.getLoc(); + auto offsets = matrixOp.getMixedOffsets(); + if (offsets.empty()) + return rewriter.notifyMatchFailure(matrixOp, + "the load op must have offsets"); + SmallVector<Value> offsetsAsValues = + vector::getAsValues(rewriter, matrixOp.getLoc(), offsets); + + auto layout = matrixOp.getLayoutAttr(); + if (!layout) + return rewriter.notifyMatchFailure( + matrixOp, "the matrix operation lacks layout attribute"); + + FailureOr<VectorType> distPayloadByWarpOpOrFailure = + getDistVecTypeBasedOnLaneLayout(layout, sgPayloadTy); + if (failed(distPayloadByWarpOpOrFailure)) + return rewriter.notifyMatchFailure( + matrixOp, "Failed to distribute matrix op payload based on layout."); + + SmallVector<Value> operands = {matrixOp.getMemDesc()}; + const unsigned offsetsStartIdx = operands.size(); + operands.append(offsetsAsValues); + + SmallVector<Type> operandTypes = llvm::to_vector( + llvm::map_range(operands, [](Value v) { return v.getType(); })); + + SmallVector<size_t> newRetIndices; + gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns( + rewriter, warpOp, operands, operandTypes, newRetIndices); + SmallVector<Value> newOperands = llvm::map_to_vector( + newRetIndices, [&](size_t idx) { return newWarpOp.getResult(idx); }); + + SmallVector<int64_t> newConstOffsets{matrixOp.getConstOffsets()}; + std::fill(newConstOffsets.begin(), newConstOffsets.end(), + ShapedType::kDynamic); + DenseI64ArrayAttr newConstOffsetsAttr = + rewriter.getDenseI64ArrayAttr(newConstOffsets); + ValueRange currentOffsets = + ValueRange(newOperands).drop_front(offsetsStartIdx); + + SmallVector<Value> newCoords = currentOffsets; + rewriter.setInsertionPointAfter(newWarpOp); + + if (!matrixOp.getSubgroupBlockIoAttr()) { + newCoords = computeDistributedCoordinatesForMatrixOp( + rewriter, loc, layout, newWarpOp.getLaneid(), sgPayloadTy.getShape(), + currentOffsets); + } + xegpu::LoadMatrixOp newOp = xegpu::LoadMatrixOp::create( + rewriter, newWarpOp.getLoc(), *distPayloadByWarpOpOrFailure, + newOperands[0], ValueRange(newCoords), newConstOffsetsAttr, + matrixOp.getSubgroupBlockIoAttr(), xegpu::DistributeLayoutAttr{}); + // Resolve the output type and replace all uses. + rewriter.replaceAllUsesWith( + newWarpOp.getResult(operandIdx), + resolveDistributedTy(newOp.getResult(), warpResultTy, rewriter)); + return success(); + } +}; + +/// Pattern for distributing xegpu::StoreMatrixOp. 
+struct StoreMatrixDistribution final : public gpu::WarpDistributionPattern { + using gpu::WarpDistributionPattern::WarpDistributionPattern; + LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp, + PatternRewriter &rewriter) const override { + gpu::YieldOp yield = warpOp.getTerminator(); + Operation *lastNode = yield->getPrevNode(); + auto matrixOp = dyn_cast_or_null<xegpu::StoreMatrixOp>(lastNode); + if (!matrixOp) + return failure(); + + VectorType sgPayloadTy = dyn_cast<VectorType>(matrixOp.getData().getType()); + if (!sgPayloadTy) + return rewriter.notifyMatchFailure( + matrixOp, "the matrix op payload must be a vector type"); + + auto loc = matrixOp.getLoc(); + auto offsets = matrixOp.getMixedOffsets(); + if (offsets.empty()) + return rewriter.notifyMatchFailure(matrixOp, + "the store op must have offsets"); + SmallVector<Value> offsetsAsValues = + vector::getAsValues(rewriter, matrixOp.getLoc(), offsets); + + auto layout = matrixOp.getLayoutAttr(); + if (!layout) + return rewriter.notifyMatchFailure( + matrixOp, "the matrix operation lacks layout attribute"); + + FailureOr<VectorType> distPayloadByWarpOpOrFailure = + getDistVecTypeBasedOnLaneLayout(layout, sgPayloadTy); + if (failed(distPayloadByWarpOpOrFailure)) + return rewriter.notifyMatchFailure( + matrixOp, "Failed to distribute matrix op payload based on layout."); + + SmallVector<Value> operands = {matrixOp.getData(), matrixOp.getMemDesc()}; + const unsigned offsetsStartIdx = operands.size(); + operands.append(offsetsAsValues); + + SmallVector<Type> operandTypes = llvm::to_vector( + llvm::map_range(operands, [](Value v) { return v.getType(); })); + operandTypes[0] = *distPayloadByWarpOpOrFailure; + + SmallVector<size_t> newRetIndices; + gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns( + rewriter, warpOp, operands, operandTypes, newRetIndices); + SmallVector<Value> newOperands = llvm::map_to_vector( + newRetIndices, [&](size_t idx) { return newWarpOp.getResult(idx); }); + + SmallVector<int64_t> newConstOffsets{matrixOp.getConstOffsets()}; + std::fill(newConstOffsets.begin(), newConstOffsets.end(), + ShapedType::kDynamic); + DenseI64ArrayAttr newConstOffsetsAttr = + rewriter.getDenseI64ArrayAttr(newConstOffsets); + ValueRange currentOffsets = + ValueRange(newOperands).drop_front(offsetsStartIdx); + + SmallVector<Value> newCoords = currentOffsets; + rewriter.setInsertionPointAfter(newWarpOp); + + if (!matrixOp.getSubgroupBlockIoAttr()) { + newCoords = computeDistributedCoordinatesForMatrixOp( + rewriter, loc, layout, newWarpOp.getLaneid(), sgPayloadTy.getShape(), + currentOffsets); + } + + xegpu::StoreMatrixOp::create( + rewriter, loc, TypeRange{}, newOperands[0], newOperands[1], + ValueRange(newCoords), newConstOffsetsAttr, + matrixOp.getSubgroupBlockIoAttr(), xegpu::DistributeLayoutAttr{}); + rewriter.eraseOp(matrixOp); + return success(); + } +}; + /// Distribute a scattered load op. The logic and requirements are the same as /// for the scattered store distribution. The warpOp's payload vector is /// expected to be distributed by the load's result consumer. 
@@ -1443,7 +1624,8 @@ void xegpu::populateXeGPUSubgroupDistributePatterns( LoadNdDistribution, DpasDistribution, PrefetchNdDistribution, GpuBarrierDistribution, VectorMultiReductionDistribution, LoadDistribution, StoreDistribution, VectorTransposeDistribution, - VectorBitcastDistribution, + VectorBitcastDistribution, LoadMatrixDistribution, + StoreMatrixDistribution, MemrefExtractAlignedPointerAsIndexDistribution>( patterns.getContext(), /*pattern benefit=*/regularPatternBenefit); @@ -1468,6 +1650,8 @@ void XeGPUSubgroupDistributePass::runOnOperation() { // Layouts are needed for vector type only. if (!isa<VectorType>(operand.get().getType())) continue; + if (isa<xegpu::LoadMatrixOp, xegpu::StoreMatrixOp>(op)) + continue; auto layout = xegpu::getDistributeLayoutAttr(operand.get()); if (!layout) { diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp index 9fc5ad9af5c7b..79eea55c8b78a 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp @@ -114,7 +114,8 @@ genOffsetsList(ConversionPatternRewriter &rewriter, OpType op, // Compute the list of subgroup-relative offsets for sub-tensors or sub-memory // descriptors to be accessed, based on the layout information. ArrayRef<int64_t> wgShape = op.getDataShape(); - auto maybeDescOffsets = layout.getOffsets(rewriter, loc, sgId, wgShape); + auto maybeDescOffsets = + layout.computeDistributedCoords(rewriter, loc, sgId, wgShape); if (failed(maybeDescOffsets)) return failure(); @@ -830,8 +831,8 @@ struct WgToSgArithConstantOp : public OpConversionPattern<arith::ConstantOp> { // Get subgroup id Value sgId = gpu::SubgroupIdOp::create(rewriter, loc, /*upper_bound=*/nullptr); - - auto sgOffsets = layout.getOffsets(rewriter, loc, sgId, wgShape); + auto sgOffsets = + layout.computeDistributedCoords(rewriter, loc, sgId, wgShape); if (failed(sgOffsets)) return failure(); @@ -1052,7 +1053,8 @@ struct WgToSgVectorStepOp : public OpConversionPattern<vector::StepOp> { Value sgId = gpu::SubgroupIdOp::create(rewriter, loc, /*upper_bound=*/nullptr); - auto sgOffsets = layout.getOffsets(rewriter, loc, sgId, wgShape); + auto sgOffsets = + layout.computeDistributedCoords(rewriter, loc, sgId, wgShape); if (failed(sgOffsets)) return failure(); diff --git a/mlir/test/Dialect/XeGPU/invalid.mlir b/mlir/test/Dialect/XeGPU/invalid.mlir index ebbe3ce0ec0d0..92f353717ac59 100644 --- a/mlir/test/Dialect/XeGPU/invalid.mlir +++ b/mlir/test/Dialect/XeGPU/invalid.mlir @@ -451,7 +451,7 @@ func.func @store_scatter_offset_wi_1(%src: memref<?xf16>) { %offsets = arith.constant dense<[0]> : vector<1xindex> %mask = arith.constant dense<1>: vector<1xi1> // expected-error@+1 {{Mask should match value except the chunk size dim}} - xegpu.store %val, %src[%offsets], %mask + xegpu.store %val, %src[%offsets], %mask : vector<4xf16>, memref<?xf16>, vector<1xindex>, vector<1xi1> return } @@ -870,14 +870,6 @@ func.func @load_mem_desc_invalid_rank(%arg0: !xegpu.mem_desc<64xf16>) { return } -// ----- -func.func @load_mem_desc_invalid_attr2(%arg0: !xegpu.mem_desc<16x64xf16>) { - // expected-error@+1 {{subgroup_block_io are only allowed when result is a 1D VectorType.}} - %data2 = xegpu.load_matrix %arg0[8, 8] <{subgroup_block_io}>: !xegpu.mem_desc<16x64xf16> -> vector<16x16xf16> - return -} - - // ----- func.func @store_mem_desc_mismatch_element_type(%arg0: !xegpu.mem_desc<16x64xf16>, %arg1: vector<16x16xf32>) { // expected-error@+1 {{failed 
to verify that all of {mem_desc, data} have same element type}} @@ -900,16 +892,25 @@ func.func @store_mem_desc_invalid_rank(%arg0: !xegpu.mem_desc<64xf16>, %arg1: ve } // ----- -func.func @store_mem_desc_invalid_attr2(%arg0: !xegpu.mem_desc<16x64xf16>, %data: vector<16x16xf16>) { - // expected-error@+1 {{subgroup_block_io are only allowed when result is a 1D VectorType.}} - xegpu.store_matrix %data, %arg0[8, 8] <{subgroup_block_io}>: vector<16x16xf16>, !xegpu.mem_desc<16x64xf16> +func.func @simt_store_matrix_vector_nonlinear(%arg0: !xegpu.mem_desc<32x32xf32, #xegpu.mem_layout<stride = [32, 1]>>, %arg1: vector<2x16xf32>) { + // expected-error@+1 {{With subgroup_block_io, accessed data must be contiguous and coalesced}} + xegpu.store_matrix %arg1, %arg0[0, 0] {subgroup_block_io, layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>} : + vector<2x16xf32>, !xegpu.mem_desc<32x32xf32, #xegpu.mem_layout<stride = [32, 1]>> return } // ----- -func.func @store_mem_desc_invalid_attr2(%arg0: !xegpu.mem_desc<16x64xf16>, %data: vector<16x16xf16>) { - // expected-error@+1 {{subgroup_block_io are only allowed when result is a 1D VectorType.}} - xegpu.store_matrix %data, %arg0[8, 8] <{subgroup_block_io}>: vector<16x16xf16>, !xegpu.mem_desc<16x64xf16> +func.func @simt_store_matrix_vector_noncoalesced(%arg0: !xegpu.mem_desc<32x32xf32, #xegpu.mem_layout<stride = [1, 32], block = [1, 16]>>, %arg1: vector<16x2xf32>) { + // expected-error@+1 {{With subgroup_block_io, the distributed dimensions must be contiguous}} + xegpu.store_matrix %arg1, %arg0[0, 0] {subgroup_block_io, layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 2]>} : + vector<16x2xf32>, !xegpu.mem_desc<32x32xf32, #xegpu.mem_layout<stride = [1, 32], block = [1, 16]>> return } +// ----- +func.func @simt_store_matrix_vector_noncoalesced(%arg0: !xegpu.mem_desc<32x32xf32, #xegpu.mem_layout<stride = [32, 1], block = [1, 17]>>, %arg1: vector<16x2xf32>) { + // expected-error@+1 {{With subgroup_block_io, the block shape must match the lane layout}} + xegpu.store_matrix %arg1, %arg0[0, 0] {subgroup_block_io, layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : + vector<16x2xf32>, !xegpu.mem_desc<32x32xf32, #xegpu.mem_layout<stride = [32, 1], block = [1, 17]>> + return +} diff --git a/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir b/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir index 27a3dc373c739..8946d14e80b72 100644 --- a/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir +++ b/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir @@ -265,3 +265,66 @@ gpu.module @xevm_module{ gpu.return } } + +// ----- +// CHECK-LABEL: gpu.func @load_store_matrix_1({{.*}}) { +// CHECK: %[[LAYOUT_X:.*]] = arith.constant 8 : index +// CHECK: %[[LAYOUT_Y:.*]] = arith.constant 2 : index +// CHECK: %[[LANE_ID:.*]] = gpu.lane_id +// CHECK: %[[DELINEARIZED_LANE_Y:.*]] = affine.apply #{{.*}}()[%[[LANE_ID]]] +// CHECK: %[[DELINEARIZED_LANE_X:.*]] = affine.apply #{{.*}}()[%[[LANE_ID]]] +// CHECK: %[[LANE_Y_OFFSET:.*]] = index.remu %[[DELINEARIZED_LANE_Y]], %[[LAYOUT_Y]] +// CHECK: %[[LANE_X_OFFSET:.*]] = index.remu %[[DELINEARIZED_LANE_X]], %[[LAYOUT_X]] +// CHECK: %[[MAT:.*]] = xegpu.load_matrix %arg0[%[[LANE_Y_OFFSET]], %[[LANE_X_OFFSET]]] : !xegpu.mem_desc<32x32xf32>, index, index -> vector<1x1xf32> +// CHECK: xegpu.store_matrix %[[MAT]], %arg0[%[[LANE_Y_OFFSET]], %[[LANE_X_OFFSET]]] : vector<1x1xf32>, !xegpu.mem_desc<32x32xf32>, index, index +gpu.module @xevm_module{ + gpu.func @load_store_matrix_1(%arg0: !xegpu.mem_desc<32x32xf32>) { + %c0 = 
arith.constant 0 : index + %1 = xegpu.load_matrix %arg0[%c0, %c0] <{layout = #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>}> : !xegpu.mem_desc<32x32xf32>, index, index -> vector<2x8xf32> + xegpu.store_matrix %1, %arg0[%c0, %c0] <{layout = #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>}> : vector<2x8xf32>, !xegpu.mem_desc<32x32xf32>, index, index + gpu.return + } +} + +// ----- +// CHECK-LABEL: gpu.func @load_store_matrix_2({{.*}}) { +// CHECK: %[[DIST_UNIT_HEIGHT_X:.*]] = arith.constant 4 : index +// CHECK: %[[DIST_UNIT_HEIGHT_Y:.*]] = arith.constant 8 : index +// CHECK: %[[LANE_DATA_Y:.*]] = arith.constant 2 : index +// CHECK: %[[USER_OFFSET_X:.*]] = arith.constant 1 : index +// CHECK: %[[LANE_ID:.*]] = gpu.lane_id +// CHECK: %[[DELINEARIZED_LANE_Y:.*]] = affine.apply #{{.*}}()[%[[LANE_ID]]] +// CHECK: %[[DELINEARIZED_LANE_X:.*]] = affine.apply #{{.*}}()[%[[LANE_ID]]] +// CHECK: %[[LANE_Y_OFFSET_1:.*]] = index.mul %[[DELINEARIZED_LANE_Y]], %[[LANE_DATA_Y]] +// CHECK: %[[LANE_Y_OFFSET:.*]] = index.remu %[[LANE_Y_OFFSET_1]], %[[DIST_UNIT_HEIGHT_Y]] +// CHECK: %[[LANE_X_OFFSET_1:.*]] = index.remu %[[DELINEARIZED_LANE_X]], %[[DIST_UNIT_HEIGHT_X]] +// CHECK: %[[LANE_X_OFFSET:.*]] = index.add %[[LANE_X_OFFSET_1]], %[[USER_OFFSET_X]] +// CHECK: %[[MAT:.*]] = xegpu.load_matrix %arg0[%[[LANE_Y_OFFSET]], %[[LANE_X_OFFSET]]] : !xegpu.mem_desc<32x32xf32>, index, index -> vector<2x1xf32> +// CHECK: xegpu.store_matrix %[[MAT]], %arg0[%[[LANE_Y_OFFSET]], %[[LANE_X_OFFSET]]] : vector<2x1xf32>, !xegpu.mem_desc<32x32xf32>, index, index +gpu.module @xevm_module{ + gpu.func @load_store_matrix_2(%arg0: !xegpu.mem_desc<32x32xf32>) { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %1 = xegpu.load_matrix %arg0[%c0, %c1] <{layout = #xegpu.layout<lane_layout = [4, 4], lane_data = [2, 1]>}> : !xegpu.mem_desc<32x32xf32>, index, index -> vector<8x4xf32> + xegpu.store_matrix %1, %arg0[%c0, %c1] <{layout = #xegpu.layout<lane_layout = [4, 4], lane_data = [2, 1]>}> : vector<8x4xf32>, !xegpu.mem_desc<32x32xf32>, index, index + gpu.return + } +} + +// ----- +// CHECK-LABEL: gpu.func @load_store_matrix_3({{.*}}) { +// CHECK: %[[MAT:.*]] = xegpu.load_matrix %arg0[%{{.*}}, %{{.*}}] <{subgroup_block_io}>: +// CHECK-SAME: !xegpu.mem_desc<32x32xf32, #xegpu.mem_layout<block = [16, 1], stride = [1, 32]>>, index, index -> vector<1x2xf32> +// CHECK: xegpu.store_matrix %[[MAT]], %arg0[%{{.*}}, %{{.*}}] <{subgroup_block_io}>: +// CHECK-SAME: vector<1x2xf32>, !xegpu.mem_desc<32x32xf32, #xegpu.mem_layout<block = [16, 1], stride = [1, 32]>>, index, index +gpu.module @xevm_module{ + gpu.func @load_store_matrix_3(%arg0: !xegpu.mem_desc<32x32xf32, #xegpu.mem_layout<stride = [1, 32], block = [16, 1]>>) { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %1 = xegpu.load_matrix %arg0[%c0, %c1] {subgroup_block_io, layout = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>} : + !xegpu.mem_desc<32x32xf32, #xegpu.mem_layout<stride = [1, 32], block = [16, 1]>>, index, index -> vector<16x2xf32> + xegpu.store_matrix %1, %arg0[%c0, %c1] {subgroup_block_io, layout = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>} : + vector<16x2xf32>, !xegpu.mem_desc<32x32xf32, #xegpu.mem_layout<stride = [1, 32], block = [16, 1]>>, index, index + gpu.return + } +} diff --git a/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp b/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp index 76d461108b296..93d51441f5b81 100644 --- a/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp +++ 
b/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp @@ -200,7 +200,8 @@ class TestStepOpPattern : public OpConversionPattern<vector::StepOp> { Value sgId = gpu::SubgroupIdOp::create(rewriter, loc, /*upper_bound=*/nullptr); - auto maybeOffsets = sliceAttr.getOffsets(rewriter, loc, sgId, wgShape); + auto maybeOffsets = + sliceAttr.computeDistributedCoords(rewriter, loc, sgId, wgShape); if (failed(maybeOffsets)) return failure(); From 346da3dfd3e70ca82a7df968c79666ff0b8c77c5 Mon Sep 17 00:00:00 2001 From: Kewen Meng <Kewen.Meng@amd.com> Date: Mon, 3 Nov 2025 12:50:11 -0800 Subject: [PATCH 072/313] Revert "[libc] Add printf error handling" (#166232) --- libc/src/stdio/CMakeLists.txt | 24 --------- libc/src/stdio/asprintf.cpp | 18 +------ libc/src/stdio/baremetal/CMakeLists.txt | 8 --- libc/src/stdio/baremetal/printf.cpp | 23 ++------ libc/src/stdio/baremetal/vprintf.cpp | 23 ++------ libc/src/stdio/generic/CMakeLists.txt | 4 -- libc/src/stdio/generic/fprintf.cpp | 17 +----- libc/src/stdio/generic/printf.cpp | 17 +----- libc/src/stdio/generic/vfprintf.cpp | 17 +----- libc/src/stdio/generic/vprintf.cpp | 17 +----- libc/src/stdio/printf_core/CMakeLists.txt | 25 --------- libc/src/stdio/printf_core/core_structs.h | 19 +++---- libc/src/stdio/printf_core/error_mapper.h | 21 -------- .../stdio/printf_core/generic/CMakeLists.txt | 8 --- .../stdio/printf_core/generic/error_mapper.h | 49 ----------------- .../stdio/printf_core/linux/CMakeLists.txt | 8 --- .../stdio/printf_core/linux/error_mapper.h | 54 ------------------- libc/src/stdio/printf_core/printf_main.h | 9 ++-- .../stdio/printf_core/vasprintf_internal.h | 20 ++++--- .../src/stdio/printf_core/vfprintf_internal.h | 41 +++++--------- .../stdio/printf_core/write_int_converter.h | 4 +- libc/src/stdio/printf_core/writer.h | 8 +-- libc/src/stdio/snprintf.cpp | 19 +------ libc/src/stdio/sprintf.cpp | 18 +------ libc/src/stdio/vasprintf.cpp | 16 +----- libc/src/stdio/vsnprintf.cpp | 19 +------ libc/src/stdio/vsprintf.cpp | 17 +----- libc/src/stdlib/CMakeLists.txt | 6 --- libc/src/stdlib/strfromd.cpp | 10 +--- libc/src/stdlib/strfromf.cpp | 10 +--- libc/src/stdlib/strfroml.cpp | 10 +--- libc/src/time/strftime_core/strftime_main.h | 3 +- libc/test/src/stdio/CMakeLists.txt | 2 - libc/test/src/stdio/fprintf_test.cpp | 23 -------- .../src/stdio/printf_core/converter_test.cpp | 30 +++++------ .../src/stdio/printf_core/writer_test.cpp | 32 +++++------ libc/test/src/stdio/snprintf_test.cpp | 15 ------ libc/test/src/stdio/vfprintf_test.cpp | 5 -- libc/test/src/stdlib/StrfromTest.h | 19 +------ 39 files changed, 104 insertions(+), 584 deletions(-) delete mode 100644 libc/src/stdio/printf_core/error_mapper.h delete mode 100644 libc/src/stdio/printf_core/generic/CMakeLists.txt delete mode 100644 libc/src/stdio/printf_core/generic/error_mapper.h delete mode 100644 libc/src/stdio/printf_core/linux/CMakeLists.txt delete mode 100644 libc/src/stdio/printf_core/linux/error_mapper.h diff --git a/libc/src/stdio/CMakeLists.txt b/libc/src/stdio/CMakeLists.txt index c75c8b11be2b5..b0a6ef1e291b5 100644 --- a/libc/src/stdio/CMakeLists.txt +++ b/libc/src/stdio/CMakeLists.txt @@ -125,10 +125,6 @@ add_entrypoint_object( DEPENDS libc.src.stdio.printf_core.printf_main libc.src.stdio.printf_core.writer - libc.src.stdio.printf_core.core_structs - libc.src.stdio.printf_core.error_mapper - libc.src.__support.libc_errno - libc.src.__support.CPP.limits ) add_entrypoint_object( @@ -140,10 +136,6 @@ add_entrypoint_object( DEPENDS libc.src.stdio.printf_core.printf_main 
libc.src.stdio.printf_core.writer - libc.src.stdio.printf_core.core_structs - libc.src.stdio.printf_core.error_mapper - libc.src.__support.libc_errno - libc.src.__support.CPP.limits ) add_entrypoint_object( @@ -154,10 +146,6 @@ add_entrypoint_object( asprintf.h DEPENDS libc.src.stdio.printf_core.vasprintf_internal - libc.src.stdio.printf_core.core_structs - libc.src.stdio.printf_core.error_mapper - libc.src.__support.libc_errno - libc.src.__support.CPP.limits ) add_entrypoint_object( @@ -169,10 +157,6 @@ add_entrypoint_object( DEPENDS libc.src.stdio.printf_core.printf_main libc.src.stdio.printf_core.writer - libc.src.stdio.printf_core.core_structs - libc.src.stdio.printf_core.error_mapper - libc.src.__support.libc_errno - libc.src.__support.CPP.limits ) add_entrypoint_object( @@ -184,10 +168,6 @@ add_entrypoint_object( DEPENDS libc.src.stdio.printf_core.printf_main libc.src.stdio.printf_core.writer - libc.src.stdio.printf_core.core_structs - libc.src.stdio.printf_core.error_mapper - libc.src.__support.libc_errno - libc.src.__support.CPP.limits ) add_entrypoint_object( @@ -198,10 +178,6 @@ add_entrypoint_object( vasprintf.h DEPENDS libc.src.stdio.printf_core.vasprintf_internal - libc.src.stdio.printf_core.core_structs - libc.src.stdio.printf_core.error_mapper - libc.src.__support.libc_errno - libc.src.__support.CPP.limits ) add_subdirectory(printf_core) diff --git a/libc/src/stdio/asprintf.cpp b/libc/src/stdio/asprintf.cpp index 083f40c1f19fa..f8cfb74ce48ea 100644 --- a/libc/src/stdio/asprintf.cpp +++ b/libc/src/stdio/asprintf.cpp @@ -7,12 +7,8 @@ //===----------------------------------------------------------------------===// #include "src/stdio/asprintf.h" -#include "src/__support/CPP/limits.h" #include "src/__support/arg_list.h" -#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/stdio/printf_core/core_structs.h" -#include "src/stdio/printf_core/error_mapper.h" #include "src/stdio/printf_core/vasprintf_internal.h" namespace LIBC_NAMESPACE_DECL { @@ -26,18 +22,8 @@ LLVM_LIBC_FUNCTION(int, asprintf, // and pointer semantics, as well as handling // destruction automatically. 
va_end(vlist); - auto ret_val = printf_core::vasprintf_internal(buffer, format, args); - if (!ret_val.has_value()) { - libc_errno = printf_core::internal_error_to_errno(ret_val.error()); - return -1; - } - if (ret_val.value() > cpp::numeric_limits<int>::max()) { - libc_errno = - printf_core::internal_error_to_errno(-printf_core::OVERFLOW_ERROR); - return -1; - } - - return static_cast<int>(ret_val.value()); + int ret = printf_core::vasprintf_internal(buffer, format, args); + return ret; } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdio/baremetal/CMakeLists.txt b/libc/src/stdio/baremetal/CMakeLists.txt index bfeff0e2b5880..548938f885c94 100644 --- a/libc/src/stdio/baremetal/CMakeLists.txt +++ b/libc/src/stdio/baremetal/CMakeLists.txt @@ -29,12 +29,8 @@ add_entrypoint_object( DEPENDS libc.src.stdio.printf_core.printf_main libc.src.stdio.printf_core.writer - libc.src.stdio.printf_core.error_mapper - libc.src.stdio.printf_core.core_structs libc.src.__support.arg_list libc.src.__support.OSUtil.osutil - libc.src.__support.libc_errno - libc.src.__support.CPP.limits ) add_entrypoint_object( @@ -91,12 +87,8 @@ add_entrypoint_object( DEPENDS libc.src.stdio.printf_core.printf_main libc.src.stdio.printf_core.writer - libc.src.stdio.printf_core.error_mapper - libc.src.stdio.printf_core.core_structs libc.src.__support.arg_list libc.src.__support.OSUtil.osutil - libc.src.__support.libc_errno - libc.src.__support.CPP.limits ) add_entrypoint_object( diff --git a/libc/src/stdio/baremetal/printf.cpp b/libc/src/stdio/baremetal/printf.cpp index 0c6c9ad338c9f..7253c6549a4e4 100644 --- a/libc/src/stdio/baremetal/printf.cpp +++ b/libc/src/stdio/baremetal/printf.cpp @@ -7,13 +7,10 @@ //===----------------------------------------------------------------------===// #include "src/stdio/printf.h" -#include "src/__support/CPP/limits.h" #include "src/__support/OSUtil/io.h" #include "src/__support/arg_list.h" -#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" #include "src/stdio/printf_core/core_structs.h" -#include "src/stdio/printf_core/error_mapper.h" #include "src/stdio/printf_core/printf_main.h" #include "src/stdio/printf_core/writer.h" @@ -45,25 +42,13 @@ LLVM_LIBC_FUNCTION(int, printf, (const char *__restrict format, ...)) { buffer, BUFF_SIZE, &stdout_write_hook, nullptr); printf_core::Writer<printf_core::WriteMode::FLUSH_TO_STREAM> writer(wb); - auto retval = printf_core::printf_main(&writer, format, args); - if (!retval.has_value()) { - libc_errno = printf_core::internal_error_to_errno(retval.error()); - return -1; - } + int retval = printf_core::printf_main(&writer, format, args); int flushval = wb.overflow_write(""); - if (flushval != printf_core::WRITE_OK) { - libc_errno = printf_core::internal_error_to_errno(-flushval); - return -1; - } + if (flushval != printf_core::WRITE_OK) + retval = flushval; - if (retval.value() > cpp::numeric_limits<int>::max()) { - libc_errno = - printf_core::internal_error_to_errno(-printf_core::OVERFLOW_ERROR); - return -1; - } - - return static_cast<int>(retval.value()); + return retval; } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdio/baremetal/vprintf.cpp b/libc/src/stdio/baremetal/vprintf.cpp index d2f586c70ad1c..ab02533f14911 100644 --- a/libc/src/stdio/baremetal/vprintf.cpp +++ b/libc/src/stdio/baremetal/vprintf.cpp @@ -7,13 +7,10 @@ //===----------------------------------------------------------------------===// #include "src/stdio/vprintf.h" -#include "src/__support/CPP/limits.h" #include 
"src/__support/OSUtil/io.h" #include "src/__support/arg_list.h" -#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" #include "src/stdio/printf_core/core_structs.h" -#include "src/stdio/printf_core/error_mapper.h" #include "src/stdio/printf_core/printf_main.h" #include "src/stdio/printf_core/writer.h" @@ -43,25 +40,13 @@ LLVM_LIBC_FUNCTION(int, vprintf, buffer, BUFF_SIZE, &stdout_write_hook, nullptr); printf_core::Writer<printf_core::WriteMode::FLUSH_TO_STREAM> writer(wb); - auto retval = printf_core::printf_main(&writer, format, args); - if (!retval.has_value()) { - libc_errno = printf_core::internal_error_to_errno(retval.error()); - return -1; - } + int retval = printf_core::printf_main(&writer, format, args); int flushval = wb.overflow_write(""); - if (flushval != printf_core::WRITE_OK) { - libc_errno = printf_core::internal_error_to_errno(-flushval); - return -1; - } + if (flushval != printf_core::WRITE_OK) + retval = flushval; - if (retval.value() > cpp::numeric_limits<int>::max()) { - libc_errno = - printf_core::internal_error_to_errno(-printf_core::OVERFLOW_ERROR); - return -1; - } - - return static_cast<int>(retval.value()); + return retval; } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdio/generic/CMakeLists.txt b/libc/src/stdio/generic/CMakeLists.txt index 71055edea3d9e..6361822b61999 100644 --- a/libc/src/stdio/generic/CMakeLists.txt +++ b/libc/src/stdio/generic/CMakeLists.txt @@ -393,11 +393,7 @@ add_generic_entrypoint_object( list(APPEND fprintf_deps libc.hdr.types.FILE libc.src.__support.arg_list - libc.src.__support.CPP.limits - libc.src.__support.libc_errno libc.src.stdio.printf_core.vfprintf_internal - libc.src.stdio.printf_core.core_structs - libc.src.stdio.printf_core.error_mapper ) if(LLVM_LIBC_FULL_BUILD) diff --git a/libc/src/stdio/generic/fprintf.cpp b/libc/src/stdio/generic/fprintf.cpp index b07f2528fe11d..087aeadfc52c5 100644 --- a/libc/src/stdio/generic/fprintf.cpp +++ b/libc/src/stdio/generic/fprintf.cpp @@ -8,12 +8,9 @@ #include "src/stdio/fprintf.h" -#include "src/__support/CPP/limits.h" #include "src/__support/File/file.h" #include "src/__support/arg_list.h" #include "src/__support/macros/config.h" -#include "src/stdio/printf_core/core_structs.h" -#include "src/stdio/printf_core/error_mapper.h" #include "src/stdio/printf_core/vfprintf_internal.h" #include "hdr/types/FILE.h" @@ -30,18 +27,8 @@ LLVM_LIBC_FUNCTION(int, fprintf, // and pointer semantics, as well as handling // destruction automatically. 
va_end(vlist); - auto ret_val = printf_core::vfprintf_internal(stream, format, args); - if (!ret_val.has_value()) { - libc_errno = printf_core::internal_error_to_errno(ret_val.error()); - return -1; - } - if (ret_val.value() > cpp::numeric_limits<int>::max()) { - libc_errno = - printf_core::internal_error_to_errno(-printf_core::OVERFLOW_ERROR); - return -1; - } - - return static_cast<int>(ret_val.value()); + int ret_val = printf_core::vfprintf_internal(stream, format, args); + return ret_val; } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdio/generic/printf.cpp b/libc/src/stdio/generic/printf.cpp index d6d4adcefb3b1..bb7c7c86f843f 100644 --- a/libc/src/stdio/generic/printf.cpp +++ b/libc/src/stdio/generic/printf.cpp @@ -8,12 +8,9 @@ #include "src/stdio/printf.h" -#include "src/__support/CPP/limits.h" #include "src/__support/File/file.h" #include "src/__support/arg_list.h" #include "src/__support/macros/config.h" -#include "src/stdio/printf_core/core_structs.h" -#include "src/stdio/printf_core/error_mapper.h" #include "src/stdio/printf_core/vfprintf_internal.h" #include "hdr/types/FILE.h" @@ -34,19 +31,9 @@ LLVM_LIBC_FUNCTION(int, printf, (const char *__restrict format, ...)) { // and pointer semantics, as well as handling // destruction automatically. va_end(vlist); - auto ret_val = printf_core::vfprintf_internal( + int ret_val = printf_core::vfprintf_internal( reinterpret_cast<::FILE *>(PRINTF_STDOUT), format, args); - if (!ret_val.has_value()) { - libc_errno = printf_core::internal_error_to_errno(ret_val.error()); - return -1; - } - if (ret_val.value() > cpp::numeric_limits<int>::max()) { - libc_errno = - printf_core::internal_error_to_errno(-printf_core::OVERFLOW_ERROR); - return -1; - } - - return static_cast<int>(ret_val.value()); + return ret_val; } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdio/generic/vfprintf.cpp b/libc/src/stdio/generic/vfprintf.cpp index c00352d1dd666..01f4265f118a6 100644 --- a/libc/src/stdio/generic/vfprintf.cpp +++ b/libc/src/stdio/generic/vfprintf.cpp @@ -8,12 +8,9 @@ #include "src/stdio/vfprintf.h" -#include "src/__support/CPP/limits.h" #include "src/__support/File/file.h" #include "src/__support/arg_list.h" #include "src/__support/macros/config.h" -#include "src/stdio/printf_core/core_structs.h" -#include "src/stdio/printf_core/error_mapper.h" #include "src/stdio/printf_core/vfprintf_internal.h" #include "hdr/types/FILE.h" @@ -27,18 +24,8 @@ LLVM_LIBC_FUNCTION(int, vfprintf, internal::ArgList args(vlist); // This holder class allows for easier copying // and pointer semantics, as well as handling // destruction automatically. 
- auto ret_val = printf_core::vfprintf_internal(stream, format, args); - if (!ret_val.has_value()) { - libc_errno = printf_core::internal_error_to_errno(ret_val.error()); - return -1; - } - if (ret_val.value() > cpp::numeric_limits<int>::max()) { - libc_errno = - printf_core::internal_error_to_errno(-printf_core::OVERFLOW_ERROR); - return -1; - } - - return static_cast<int>(ret_val.value()); + int ret_val = printf_core::vfprintf_internal(stream, format, args); + return ret_val; } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdio/generic/vprintf.cpp b/libc/src/stdio/generic/vprintf.cpp index 1c0837fd5d441..08d71515646ed 100644 --- a/libc/src/stdio/generic/vprintf.cpp +++ b/libc/src/stdio/generic/vprintf.cpp @@ -8,12 +8,9 @@ #include "src/stdio/vprintf.h" -#include "src/__support/CPP/limits.h" #include "src/__support/File/file.h" #include "src/__support/arg_list.h" #include "src/__support/macros/config.h" -#include "src/stdio/printf_core/core_structs.h" -#include "src/stdio/printf_core/error_mapper.h" #include "src/stdio/printf_core/vfprintf_internal.h" #include "hdr/types/FILE.h" @@ -32,19 +29,9 @@ LLVM_LIBC_FUNCTION(int, vprintf, internal::ArgList args(vlist); // This holder class allows for easier copying // and pointer semantics, as well as handling // destruction automatically. - auto ret_val = printf_core::vfprintf_internal( + int ret_val = printf_core::vfprintf_internal( reinterpret_cast<::FILE *>(PRINTF_STDOUT), format, args); - if (!ret_val.has_value()) { - libc_errno = printf_core::internal_error_to_errno(ret_val.error()); - return -1; - } - if (ret_val.value() > cpp::numeric_limits<int>::max()) { - libc_errno = - printf_core::internal_error_to_errno(-printf_core::OVERFLOW_ERROR); - return -1; - } - - return static_cast<int>(ret_val.value()); + return ret_val; } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdio/printf_core/CMakeLists.txt b/libc/src/stdio/printf_core/CMakeLists.txt index 2d1daea71406e..ee66145e60156 100644 --- a/libc/src/stdio/printf_core/CMakeLists.txt +++ b/libc/src/stdio/printf_core/CMakeLists.txt @@ -32,17 +32,6 @@ if(printf_config_copts) list(PREPEND printf_config_copts "COMPILE_OPTIONS") endif() -if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${LIBC_TARGET_OS}) - add_subdirectory(${LIBC_TARGET_OS}) -else() - add_subdirectory(generic) -endif() - -set(target_error_mapper libc.src.stdio.printf_core.${LIBC_TARGET_OS}.error_mapper) -if(NOT TARGET ${target_error_converter}) - set(target_error_mapper libc.src.stdio.printf_core.generic.error_mapper) -endif() - add_header_library( printf_config HDRS @@ -58,7 +47,6 @@ add_header_library( libc.include.inttypes libc.src.__support.CPP.string_view libc.src.__support.FPUtil.fp_bits - libc.hdr.errno_macros ) add_header_library( @@ -137,7 +125,6 @@ add_header_library( .writer .core_structs libc.src.__support.arg_list - libc.src.__support.error_or ) add_header_library( @@ -149,20 +136,10 @@ add_header_library( libc.hdr.func.free libc.hdr.func.realloc libc.src.__support.arg_list - libc.src.__support.error_or libc.src.stdio.printf_core.printf_main libc.src.stdio.printf_core.writer ) -add_header_library( - error_mapper - HDRS - error_mapper.h - DEPENDS - ${target_error_mapper} - libc.src.__support.macros.properties.architectures -) - if(NOT (TARGET libc.src.__support.File.file) AND LLVM_LIBC_FULL_BUILD) # Not all platforms have a file implementation. If file is unvailable, and a # full build is requested, then we must skip all file based printf sections. 
@@ -175,10 +152,8 @@ add_header_library( vfprintf_internal.h DEPENDS libc.src.__support.File.file - libc.src.__support.error_or libc.src.__support.arg_list libc.src.stdio.printf_core.printf_main libc.src.stdio.printf_core.writer ${use_system_file} ) - diff --git a/libc/src/stdio/printf_core/core_structs.h b/libc/src/stdio/printf_core/core_structs.h index 0d41f2244d8da..e27f77b6b594a 100644 --- a/libc/src/stdio/printf_core/core_structs.h +++ b/libc/src/stdio/printf_core/core_structs.h @@ -132,17 +132,14 @@ template <typename T> LIBC_INLINE constexpr TypeDesc type_desc_from_type() { // This is the value to be returned by conversions when no error has occurred. constexpr int WRITE_OK = 0; -// These are the error return values used by the printf engine when an -// error has occurred. They are all large negative, distinct values starting -// from -1000 to not overlap with system errors. -constexpr int FILE_WRITE_ERROR = -1001; -constexpr int FILE_STATUS_ERROR = -1002; -constexpr int NULLPTR_WRITE_ERROR = -1003; -constexpr int INT_CONVERSION_ERROR = -1004; -constexpr int FIXED_POINT_CONVERSION_ERROR = -1005; -constexpr int ALLOCATION_ERROR = -1006; -constexpr int OVERFLOW_ERROR = -1007; - +// These are the printf return values for when an error has occurred. They are +// all negative, and should be distinct. +constexpr int FILE_WRITE_ERROR = -1; +constexpr int FILE_STATUS_ERROR = -2; +constexpr int NULLPTR_WRITE_ERROR = -3; +constexpr int INT_CONVERSION_ERROR = -4; +constexpr int FIXED_POINT_CONVERSION_ERROR = -5; +constexpr int ALLOCATION_ERROR = -6; } // namespace printf_core } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdio/printf_core/error_mapper.h b/libc/src/stdio/printf_core/error_mapper.h deleted file mode 100644 index 23030930133a1..0000000000000 --- a/libc/src/stdio/printf_core/error_mapper.h +++ /dev/null @@ -1,21 +0,0 @@ -//===-- Error mapper for printf ---------------------------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIBC_SRC_STDIO_PRINTF_CORE_ERROR_MAPPER_H -#define LLVM_LIBC_SRC_STDIO_PRINTF_CORE_ERROR_MAPPER_H - -#include "src/__support/macros/properties/architectures.h" - -// Maps internal errors to the available errnos on the platform. -#if defined(__linux__) -#include "linux/error_mapper.h" -#else -#include "generic/error_mapper.h" -#endif - -#endif // LLVM_LIBC_SRC_STDIO_PRINTF_CORE_ERROR_MAPPER_H diff --git a/libc/src/stdio/printf_core/generic/CMakeLists.txt b/libc/src/stdio/printf_core/generic/CMakeLists.txt deleted file mode 100644 index 2f0143d992e31..0000000000000 --- a/libc/src/stdio/printf_core/generic/CMakeLists.txt +++ /dev/null @@ -1,8 +0,0 @@ -add_header_library( - error_mapper - HDRS - error_mapper.h - DEPENDS - libc.src.stdio.printf_core.core_structs - libc.hdr.errno_macros -) diff --git a/libc/src/stdio/printf_core/generic/error_mapper.h b/libc/src/stdio/printf_core/generic/error_mapper.h deleted file mode 100644 index d8cdd2cc2dbaa..0000000000000 --- a/libc/src/stdio/printf_core/generic/error_mapper.h +++ /dev/null @@ -1,49 +0,0 @@ -//===-- Generic implementation of error mapper ------------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIBC_SRC_STDIO_PRINTF_CORE_GENERIC_ERROR_MAPPER_H -#define LLVM_LIBC_SRC_STDIO_PRINTF_CORE_GENERIC_ERROR_MAPPER_H - -#include "hdr/errno_macros.h" -#include "src/stdio/printf_core/core_structs.h" -#include "src/stdio/printf_core/error_mapper.h" - -namespace LIBC_NAMESPACE_DECL { -namespace printf_core { - -LIBC_INLINE static int internal_error_to_errno(int internal_error) { - // System error occured, return error as is. - if (internal_error < 1001 && internal_error > 0) { - return internal_error; - } - - // Map internal error to the available C standard errnos. - switch (-internal_error) { - case WRITE_OK: - return 0; - case FILE_WRITE_ERROR: - case FILE_STATUS_ERROR: - case NULLPTR_WRITE_ERROR: - case ALLOCATION_ERROR: - return EDOM; - case INT_CONVERSION_ERROR: - case FIXED_POINT_CONVERSION_ERROR: - case OVERFLOW_ERROR: - return ERANGE; - default: - LIBC_ASSERT( - false && - "Invalid internal printf error code passed to internal_error_to_errno"); - return EDOM; - } -} - -} // namespace printf_core -} // namespace LIBC_NAMESPACE_DECL - -#endif // LLVM_LIBC_SRC_STDIO_PRINTF_CORE_GENERIC_ERROR_MAPPER_H diff --git a/libc/src/stdio/printf_core/linux/CMakeLists.txt b/libc/src/stdio/printf_core/linux/CMakeLists.txt deleted file mode 100644 index 2f0143d992e31..0000000000000 --- a/libc/src/stdio/printf_core/linux/CMakeLists.txt +++ /dev/null @@ -1,8 +0,0 @@ -add_header_library( - error_mapper - HDRS - error_mapper.h - DEPENDS - libc.src.stdio.printf_core.core_structs - libc.hdr.errno_macros -) diff --git a/libc/src/stdio/printf_core/linux/error_mapper.h b/libc/src/stdio/printf_core/linux/error_mapper.h deleted file mode 100644 index 3c2fe663072d0..0000000000000 --- a/libc/src/stdio/printf_core/linux/error_mapper.h +++ /dev/null @@ -1,54 +0,0 @@ -//===-- Linux implementation of error mapper --------------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIBC_SRC_STDIO_PRINTF_CORE_LINUX_ERROR_MAPPER_H -#define LLVM_LIBC_SRC_STDIO_PRINTF_CORE_LINUX_ERROR_MAPPER_H - -#include "hdr/errno_macros.h" -#include "src/stdio/printf_core/core_structs.h" -#include "src/stdio/printf_core/error_mapper.h" - -namespace LIBC_NAMESPACE_DECL { -namespace printf_core { - -LIBC_INLINE static int internal_error_to_errno(int internal_error) { - // System error occured, return error as is. - if (internal_error < 1001 && internal_error > 0) { - return internal_error; - } - - // Map internal error to POSIX errnos. 
- switch (-internal_error) { - case WRITE_OK: - return 0; - case FILE_WRITE_ERROR: - return EIO; - case FILE_STATUS_ERROR: - return EIO; - case NULLPTR_WRITE_ERROR: - return EINVAL; - case INT_CONVERSION_ERROR: - return ERANGE; - case FIXED_POINT_CONVERSION_ERROR: - return EINVAL; - case ALLOCATION_ERROR: - return ENOMEM; - case OVERFLOW_ERROR: - return EOVERFLOW; - default: - LIBC_ASSERT( - false && - "Invalid internal printf error code passed to internal_error_to_errno"); - return EINVAL; - } -} - -} // namespace printf_core -} // namespace LIBC_NAMESPACE_DECL - -#endif // LLVM_LIBC_SRC_STDIO_PRINTF_CORE_LINUX_ERROR_MAPPER_H diff --git a/libc/src/stdio/printf_core/printf_main.h b/libc/src/stdio/printf_core/printf_main.h index 1c7a7237c097d..57f29858d5298 100644 --- a/libc/src/stdio/printf_core/printf_main.h +++ b/libc/src/stdio/printf_core/printf_main.h @@ -10,7 +10,6 @@ #define LLVM_LIBC_SRC_STDIO_PRINTF_CORE_PRINTF_MAIN_H #include "src/__support/arg_list.h" -#include "src/__support/error_or.h" #include "src/__support/macros/config.h" #include "src/stdio/printf_core/converter.h" #include "src/stdio/printf_core/core_structs.h" @@ -23,9 +22,8 @@ namespace LIBC_NAMESPACE_DECL { namespace printf_core { template <WriteMode write_mode> -ErrorOr<size_t> printf_main(Writer<write_mode> *writer, - const char *__restrict str, - internal::ArgList &args) { +int printf_main(Writer<write_mode> *writer, const char *__restrict str, + internal::ArgList &args) { Parser<internal::ArgList> parser(str, args); int result = 0; for (FormatSection cur_section = parser.get_next_section(); @@ -35,8 +33,9 @@ ErrorOr<size_t> printf_main(Writer<write_mode> *writer, result = convert(writer, cur_section); else result = writer->write(cur_section.raw_string); + if (result < 0) - return Error(-result); + return result; } return writer->get_chars_written(); diff --git a/libc/src/stdio/printf_core/vasprintf_internal.h b/libc/src/stdio/printf_core/vasprintf_internal.h index 41df17b67f35b..283d8df2810fb 100644 --- a/libc/src/stdio/printf_core/vasprintf_internal.h +++ b/libc/src/stdio/printf_core/vasprintf_internal.h @@ -10,7 +10,6 @@ #include "hdr/func/malloc.h" #include "hdr/func/realloc.h" #include "src/__support/arg_list.h" -#include "src/__support/error_or.h" #include "src/stdio/printf_core/core_structs.h" #include "src/stdio/printf_core/printf_main.h" #include "src/stdio/printf_core/writer.h" @@ -30,7 +29,7 @@ LIBC_INLINE int resize_overflow_hook(cpp::string_view new_str, void *target) { if (new_buff == nullptr) { if (wb->buff != wb->init_buff) free(wb->buff); - return ALLOCATION_ERROR; + return printf_core::ALLOCATION_ERROR; } if (isBuffOnStack) inline_memcpy(new_buff, wb->buff, wb->buff_cur); @@ -43,28 +42,27 @@ LIBC_INLINE int resize_overflow_hook(cpp::string_view new_str, void *target) { constexpr size_t DEFAULT_BUFFER_SIZE = 200; -LIBC_INLINE ErrorOr<size_t> vasprintf_internal(char **ret, - const char *__restrict format, - internal::ArgList args) { +LIBC_INLINE int vasprintf_internal(char **ret, const char *__restrict format, + internal::ArgList args) { char init_buff_on_stack[DEFAULT_BUFFER_SIZE]; printf_core::WriteBuffer<Mode<WriteMode::RESIZE_AND_FILL_BUFF>::value> wb( init_buff_on_stack, DEFAULT_BUFFER_SIZE, resize_overflow_hook); printf_core::Writer writer(wb); auto ret_val = printf_core::printf_main(&writer, format, args); - if (!ret_val.has_value()) { + if (ret_val < 0) { *ret = nullptr; - return ret_val; + return -1; } if (wb.buff == init_buff_on_stack) { - *ret = static_cast<char *>(malloc(ret_val.value() + 
1)); + *ret = static_cast<char *>(malloc(ret_val + 1)); if (ret == nullptr) - return Error(ALLOCATION_ERROR); - inline_memcpy(*ret, wb.buff, ret_val.value()); + return printf_core::ALLOCATION_ERROR; + inline_memcpy(*ret, wb.buff, ret_val); } else { *ret = wb.buff; } - (*ret)[ret_val.value()] = '\0'; + (*ret)[ret_val] = '\0'; return ret_val; } } // namespace printf_core diff --git a/libc/src/stdio/printf_core/vfprintf_internal.h b/libc/src/stdio/printf_core/vfprintf_internal.h index 564441d3bf51a..630de9d9d43dd 100644 --- a/libc/src/stdio/printf_core/vfprintf_internal.h +++ b/libc/src/stdio/printf_core/vfprintf_internal.h @@ -11,7 +11,6 @@ #include "src/__support/File/file.h" #include "src/__support/arg_list.h" -#include "src/__support/error_or.h" #include "src/__support/macros/attributes.h" // For LIBC_INLINE #include "src/__support/macros/config.h" #include "src/stdio/printf_core/core_structs.h" @@ -36,8 +35,8 @@ LIBC_INLINE void funlockfile(FILE *f) { reinterpret_cast<LIBC_NAMESPACE::File *>(f)->unlock(); } -LIBC_INLINE FileIOResult fwrite_unlocked(const void *ptr, size_t size, - size_t nmemb, FILE *f) { +LIBC_INLINE size_t fwrite_unlocked(const void *ptr, size_t size, size_t nmemb, + FILE *f) { return reinterpret_cast<LIBC_NAMESPACE::File *>(f)->write_unlocked( ptr, size * nmemb); } @@ -48,11 +47,9 @@ LIBC_INLINE void flockfile(::FILE *f) { ::flockfile(f); } LIBC_INLINE void funlockfile(::FILE *f) { ::funlockfile(f); } -LIBC_INLINE FileIOResult fwrite_unlocked(const void *ptr, size_t size, - size_t nmemb, ::FILE *f) { - // Need to use system errno in this case, as system write will set this errno - // which we need to propagate back into our code. - return {::fwrite_unlocked(ptr, size, nmemb, f), errno}; +LIBC_INLINE size_t fwrite_unlocked(const void *ptr, size_t size, size_t nmemb, + ::FILE *f) { + return ::fwrite_unlocked(ptr, size, nmemb, f); } #endif // LIBC_COPT_STDIO_USE_SYSTEM_FILE } // namespace internal @@ -63,38 +60,26 @@ LIBC_INLINE int file_write_hook(cpp::string_view new_str, void *fp) { ::FILE *target_file = reinterpret_cast<::FILE *>(fp); // Write new_str to the target file. The logic preventing a zero-length write // is in the writer, so we don't check here. - auto write_result = internal::fwrite_unlocked(new_str.data(), sizeof(char), - new_str.size(), target_file); - // Propagate actual system error in FileIOResult. - if (write_result.has_error()) - return -write_result.error; - - // In case short write occured or error was not set on FileIOResult for some - // reason. 
- if (write_result.value != new_str.size() || - internal::ferror_unlocked(target_file)) + size_t written = internal::fwrite_unlocked(new_str.data(), sizeof(char), + new_str.size(), target_file); + if (written != new_str.size() || internal::ferror_unlocked(target_file)) return FILE_WRITE_ERROR; - return WRITE_OK; } -LIBC_INLINE ErrorOr<size_t> vfprintf_internal(::FILE *__restrict stream, - const char *__restrict format, - internal::ArgList &args) { +LIBC_INLINE int vfprintf_internal(::FILE *__restrict stream, + const char *__restrict format, + internal::ArgList &args) { constexpr size_t BUFF_SIZE = 1024; char buffer[BUFF_SIZE]; printf_core::WriteBuffer<Mode<WriteMode::FLUSH_TO_STREAM>::value> wb( buffer, BUFF_SIZE, &file_write_hook, reinterpret_cast<void *>(stream)); Writer writer(wb); internal::flockfile(stream); - auto retval = printf_main(&writer, format, args); - if (!retval.has_value()) { - internal::funlockfile(stream); - return retval; - } + int retval = printf_main(&writer, format, args); int flushval = wb.overflow_write(""); if (flushval != WRITE_OK) - retval = Error(-flushval); + retval = flushval; internal::funlockfile(stream); return retval; } diff --git a/libc/src/stdio/printf_core/write_int_converter.h b/libc/src/stdio/printf_core/write_int_converter.h index 04b2bef05bc7b..efcff278bd284 100644 --- a/libc/src/stdio/printf_core/write_int_converter.h +++ b/libc/src/stdio/printf_core/write_int_converter.h @@ -29,11 +29,11 @@ LIBC_INLINE int convert_write_int(Writer<write_mode> *writer, return NULLPTR_WRITE_ERROR; #endif // LIBC_COPT_PRINTF_NO_NULLPTR_CHECKS - size_t written = writer->get_chars_written(); + int written = writer->get_chars_written(); switch (to_conv.length_modifier) { case LengthModifier::none: - *reinterpret_cast<int *>(to_conv.conv_val_ptr) = static_cast<int>(written); + *reinterpret_cast<int *>(to_conv.conv_val_ptr) = written; break; case LengthModifier::l: *reinterpret_cast<long *>(to_conv.conv_val_ptr) = written; diff --git a/libc/src/stdio/printf_core/writer.h b/libc/src/stdio/printf_core/writer.h index 9de108ece510f..1d4734a51b9b8 100644 --- a/libc/src/stdio/printf_core/writer.h +++ b/libc/src/stdio/printf_core/writer.h @@ -127,7 +127,7 @@ template <WriteMode write_mode> struct WriteBuffer { template <WriteMode write_mode> class Writer final { WriteBuffer<write_mode> &wb; - size_t chars_written = 0; + int chars_written = 0; LIBC_INLINE int pad(char new_char, size_t length) { // First, fill as much of the buffer as possible with the padding char. @@ -161,7 +161,7 @@ template <WriteMode write_mode> class Writer final { // Takes a string, copies it into the buffer if there is space, else passes it // to the overflow mechanism to be handled separately. LIBC_INLINE int write(cpp::string_view new_string) { - chars_written += new_string.size(); + chars_written += static_cast<int>(new_string.size()); if (LIBC_LIKELY(wb.buff_cur + new_string.size() <= wb.buff_len)) { inline_memcpy(wb.buff + wb.buff_cur, new_string.data(), new_string.size()); @@ -175,7 +175,7 @@ template <WriteMode write_mode> class Writer final { // if there is space, else calls pad which will loop and call the overflow // mechanism on a secondary buffer. 
LIBC_INLINE int write(char new_char, size_t length) { - chars_written += length; + chars_written += static_cast<int>(length); if (LIBC_LIKELY(wb.buff_cur + length <= wb.buff_len)) { inline_memset(wb.buff + wb.buff_cur, static_cast<unsigned char>(new_char), @@ -199,7 +199,7 @@ template <WriteMode write_mode> class Writer final { return wb.overflow_write(char_string_view); } - LIBC_INLINE size_t get_chars_written() { return chars_written; } + LIBC_INLINE int get_chars_written() { return chars_written; } }; // Class-template auto deduction helpers. diff --git a/libc/src/stdio/snprintf.cpp b/libc/src/stdio/snprintf.cpp index 206c50d1b41a4..c8940862f711f 100644 --- a/libc/src/stdio/snprintf.cpp +++ b/libc/src/stdio/snprintf.cpp @@ -8,12 +8,8 @@ #include "src/stdio/snprintf.h" -#include "src/__support/CPP/limits.h" #include "src/__support/arg_list.h" -#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/stdio/printf_core/core_structs.h" -#include "src/stdio/printf_core/error_mapper.h" #include "src/stdio/printf_core/printf_main.h" #include "src/stdio/printf_core/writer.h" @@ -36,21 +32,10 @@ LLVM_LIBC_FUNCTION(int, snprintf, wb(buffer, (buffsz > 0 ? buffsz - 1 : 0)); printf_core::Writer writer(wb); - auto ret_val = printf_core::printf_main(&writer, format, args); - if (!ret_val.has_value()) { - libc_errno = printf_core::internal_error_to_errno(ret_val.error()); - return -1; - } + int ret_val = printf_core::printf_main(&writer, format, args); if (buffsz > 0) // if the buffsz is 0 the buffer may be a null pointer. wb.buff[wb.buff_cur] = '\0'; - - if (ret_val.value() > cpp::numeric_limits<int>::max()) { - libc_errno = - printf_core::internal_error_to_errno(-printf_core::OVERFLOW_ERROR); - return -1; - } - - return static_cast<int>(ret_val.value()); + return ret_val; } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdio/sprintf.cpp b/libc/src/stdio/sprintf.cpp index 9e9ecdfdf38cc..7be97d3591aaf 100644 --- a/libc/src/stdio/sprintf.cpp +++ b/libc/src/stdio/sprintf.cpp @@ -10,10 +10,7 @@ #include "src/__support/CPP/limits.h" #include "src/__support/arg_list.h" -#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/stdio/printf_core/core_structs.h" -#include "src/stdio/printf_core/error_mapper.h" #include "src/stdio/printf_core/printf_main.h" #include "src/stdio/printf_core/writer.h" @@ -36,20 +33,9 @@ LLVM_LIBC_FUNCTION(int, sprintf, wb(buffer, cpp::numeric_limits<size_t>::max()); printf_core::Writer writer(wb); - auto ret_val = printf_core::printf_main(&writer, format, args); - if (!ret_val.has_value()) { - libc_errno = printf_core::internal_error_to_errno(ret_val.error()); - return -1; - } + int ret_val = printf_core::printf_main(&writer, format, args); wb.buff[wb.buff_cur] = '\0'; - - if (ret_val.value() > cpp::numeric_limits<int>::max()) { - libc_errno = - printf_core::internal_error_to_errno(-printf_core::OVERFLOW_ERROR); - return -1; - } - - return static_cast<int>(ret_val.value()); + return ret_val; } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdio/vasprintf.cpp b/libc/src/stdio/vasprintf.cpp index 4bc6a5992d5c8..4a44d4a0f8842 100644 --- a/libc/src/stdio/vasprintf.cpp +++ b/libc/src/stdio/vasprintf.cpp @@ -7,11 +7,7 @@ //===----------------------------------------------------------------------===// #include "src/stdio/vasprintf.h" -#include "src/__support/CPP/limits.h" #include "src/__support/arg_list.h" -#include "src/__support/libc_errno.h" -#include "src/stdio/printf_core/core_structs.h" 
-#include "src/stdio/printf_core/error_mapper.h" #include "src/stdio/printf_core/vasprintf_internal.h" namespace LIBC_NAMESPACE_DECL { @@ -22,17 +18,7 @@ LLVM_LIBC_FUNCTION(int, vasprintf, internal::ArgList args(vlist); // This holder class allows for easier copying // and pointer semantics, as well as handling // destruction automatically. - auto ret_val = printf_core::vasprintf_internal(ret, format, args); - if (!ret_val.has_value()) { - libc_errno = printf_core::internal_error_to_errno(ret_val.error()); - return -1; - } - if (ret_val.value() > cpp::numeric_limits<int>::max()) { - libc_errno = - printf_core::internal_error_to_errno(-printf_core::OVERFLOW_ERROR); - return -1; - } - return static_cast<int>(ret_val.value()); + return printf_core::vasprintf_internal(ret, format, args); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdio/vsnprintf.cpp b/libc/src/stdio/vsnprintf.cpp index ba3568fbe78b3..b07a2499a0dd3 100644 --- a/libc/src/stdio/vsnprintf.cpp +++ b/libc/src/stdio/vsnprintf.cpp @@ -8,12 +8,8 @@ #include "src/stdio/vsnprintf.h" -#include "src/__support/CPP/limits.h" #include "src/__support/arg_list.h" -#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/stdio/printf_core/core_structs.h" -#include "src/stdio/printf_core/error_mapper.h" #include "src/stdio/printf_core/printf_main.h" #include "src/stdio/printf_core/writer.h" @@ -33,21 +29,10 @@ LLVM_LIBC_FUNCTION(int, vsnprintf, wb(buffer, (buffsz > 0 ? buffsz - 1 : 0)); printf_core::Writer writer(wb); - auto ret_val = printf_core::printf_main(&writer, format, args); - if (!ret_val.has_value()) { - libc_errno = printf_core::internal_error_to_errno(ret_val.error()); - return -1; - } + int ret_val = printf_core::printf_main(&writer, format, args); if (buffsz > 0) // if the buffsz is 0 the buffer may be a null pointer. 
wb.buff[wb.buff_cur] = '\0'; - - if (ret_val.value() > cpp::numeric_limits<int>::max()) { - libc_errno = - printf_core::internal_error_to_errno(-printf_core::OVERFLOW_ERROR); - return -1; - } - - return static_cast<int>(ret_val.value()); + return ret_val; } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdio/vsprintf.cpp b/libc/src/stdio/vsprintf.cpp index 65c223a442d4b..26d497be42125 100644 --- a/libc/src/stdio/vsprintf.cpp +++ b/libc/src/stdio/vsprintf.cpp @@ -10,10 +10,7 @@ #include "src/__support/CPP/limits.h" #include "src/__support/arg_list.h" -#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/stdio/printf_core/core_structs.h" -#include "src/stdio/printf_core/error_mapper.h" #include "src/stdio/printf_core/printf_main.h" #include "src/stdio/printf_core/writer.h" @@ -33,19 +30,9 @@ LLVM_LIBC_FUNCTION(int, vsprintf, wb(buffer, cpp::numeric_limits<size_t>::max()); printf_core::Writer writer(wb); - auto ret_val = printf_core::printf_main(&writer, format, args); - if (!ret_val.has_value()) { - libc_errno = printf_core::internal_error_to_errno(ret_val.error()); - return -1; - } + int ret_val = printf_core::printf_main(&writer, format, args); wb.buff[wb.buff_cur] = '\0'; - - if (ret_val.value() > cpp::numeric_limits<int>::max()) { - libc_errno = - printf_core::internal_error_to_errno(-printf_core::OVERFLOW_ERROR); - return -1; - } - return static_cast<int>(ret_val.value()); + return ret_val; } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdlib/CMakeLists.txt b/libc/src/stdlib/CMakeLists.txt index 1ccdcc8bec148..c464f82dcbda7 100644 --- a/libc/src/stdlib/CMakeLists.txt +++ b/libc/src/stdlib/CMakeLists.txt @@ -73,8 +73,6 @@ add_entrypoint_object( strfromf.h DEPENDS .str_from_util - libc.src.__support.CPP.limits - libc.src.stdio.printf_core.error_mapper ) add_entrypoint_object( @@ -85,8 +83,6 @@ add_entrypoint_object( strfromd.h DEPENDS .str_from_util - libc.src.__support.CPP.limits - libc.src.stdio.printf_core.error_mapper ) add_entrypoint_object( @@ -97,8 +93,6 @@ add_entrypoint_object( strfroml.h DEPENDS .str_from_util - libc.src.__support.CPP.limits - libc.src.stdio.printf_core.error_mapper ) add_header_library( diff --git a/libc/src/stdlib/strfromd.cpp b/libc/src/stdlib/strfromd.cpp index f970e22010201..f51e6d4c7f1df 100644 --- a/libc/src/stdlib/strfromd.cpp +++ b/libc/src/stdlib/strfromd.cpp @@ -7,10 +7,7 @@ //===----------------------------------------------------------------------===// #include "src/stdlib/strfromd.h" -#include "src/__support/CPP/limits.h" #include "src/__support/macros/config.h" -#include "src/stdio/printf_core/core_structs.h" -#include "src/stdio/printf_core/error_mapper.h" #include "src/stdlib/str_from_util.h" namespace LIBC_NAMESPACE_DECL { @@ -39,12 +36,7 @@ LLVM_LIBC_FUNCTION(int, strfromd, if (n > 0) wb.buff[wb.buff_cur] = '\0'; - if (writer.get_chars_written() > cpp::numeric_limits<int>::max()) { - libc_errno = - printf_core::internal_error_to_errno(-printf_core::OVERFLOW_ERROR); - return -1; - } - return static_cast<int>(writer.get_chars_written()); + return writer.get_chars_written(); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdlib/strfromf.cpp b/libc/src/stdlib/strfromf.cpp index 55ede003134b5..14dbfdb25bab6 100644 --- a/libc/src/stdlib/strfromf.cpp +++ b/libc/src/stdlib/strfromf.cpp @@ -7,10 +7,7 @@ //===----------------------------------------------------------------------===// #include "src/stdlib/strfromf.h" -#include "src/__support/CPP/limits.h" #include 
"src/__support/macros/config.h" -#include "src/stdio/printf_core/core_structs.h" -#include "src/stdio/printf_core/error_mapper.h" #include "src/stdlib/str_from_util.h" namespace LIBC_NAMESPACE_DECL { @@ -39,12 +36,7 @@ LLVM_LIBC_FUNCTION(int, strfromf, if (n > 0) wb.buff[wb.buff_cur] = '\0'; - if (writer.get_chars_written() > cpp::numeric_limits<int>::max()) { - libc_errno = - printf_core::internal_error_to_errno(-printf_core::OVERFLOW_ERROR); - return -1; - } - return static_cast<int>(writer.get_chars_written()); + return writer.get_chars_written(); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdlib/strfroml.cpp b/libc/src/stdlib/strfroml.cpp index 37d18738902bc..12f22a8a2fb65 100644 --- a/libc/src/stdlib/strfroml.cpp +++ b/libc/src/stdlib/strfroml.cpp @@ -7,10 +7,7 @@ //===----------------------------------------------------------------------===// #include "src/stdlib/strfroml.h" -#include "src/__support/CPP/limits.h" #include "src/__support/macros/config.h" -#include "src/stdio/printf_core/core_structs.h" -#include "src/stdio/printf_core/error_mapper.h" #include "src/stdlib/str_from_util.h" namespace LIBC_NAMESPACE_DECL { @@ -44,12 +41,7 @@ LLVM_LIBC_FUNCTION(int, strfroml, if (n > 0) wb.buff[wb.buff_cur] = '\0'; - if (writer.get_chars_written() > cpp::numeric_limits<int>::max()) { - libc_errno = - printf_core::internal_error_to_errno(-printf_core::OVERFLOW_ERROR); - return -1; - } - return static_cast<int>(writer.get_chars_written()); + return writer.get_chars_written(); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/time/strftime_core/strftime_main.h b/libc/src/time/strftime_core/strftime_main.h index 2b136d83234cd..c7e590627094a 100644 --- a/libc/src/time/strftime_core/strftime_main.h +++ b/libc/src/time/strftime_core/strftime_main.h @@ -36,8 +36,7 @@ int strftime_main(printf_core::Writer<write_mode> *writer, return result; } - // TODO: Use ErrorOr<size_t> - return static_cast<int>(writer->get_chars_written()); + return writer->get_chars_written(); } } // namespace strftime_core diff --git a/libc/test/src/stdio/CMakeLists.txt b/libc/test/src/stdio/CMakeLists.txt index d71f1dff11943..eec108bc12ca5 100644 --- a/libc/test/src/stdio/CMakeLists.txt +++ b/libc/test/src/stdio/CMakeLists.txt @@ -186,8 +186,6 @@ add_libc_test( fprintf_test.cpp DEPENDS libc.src.stdio.fprintf - libc.test.UnitTest.ErrnoCheckingTest - libc.test.UnitTest.ErrnoSetterMatcher ${fprintf_test_deps} COMPILE_OPTIONS ${use_system_file} diff --git a/libc/test/src/stdio/fprintf_test.cpp b/libc/test/src/stdio/fprintf_test.cpp index 7d36bd30854b8..6799323cc6ad9 100644 --- a/libc/test/src/stdio/fprintf_test.cpp +++ b/libc/test/src/stdio/fprintf_test.cpp @@ -15,9 +15,6 @@ #include "src/stdio/fprintf.h" -#include "src/__support/CPP/limits.h" -#include "test/UnitTest/ErrnoCheckingTest.h" -#include "test/UnitTest/ErrnoSetterMatcher.h" #include "test/UnitTest/Test.h" namespace printf_test { @@ -34,8 +31,6 @@ using ::fread; #endif // LIBC_COPT_STDIO_USE_SYSTEM_FILE } // namespace printf_test -using LlvmLibcFPrintfTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest; - TEST(LlvmLibcFPrintfTest, WriteToFile) { const char *FILENAME = APPEND_LIBC_TEST("fprintf_output.test"); auto FILE_PATH = libc_make_test_file_path(FILENAME); @@ -83,24 +78,6 @@ TEST(LlvmLibcFPrintfTest, WriteToFile) { written = LIBC_NAMESPACE::fprintf(file, "Writing to a read only file should fail."); EXPECT_LT(written, 0); - ASSERT_ERRNO_EQ(EBADF); - - ASSERT_EQ(printf_test::fclose(file), 0); -} - -#ifndef LIBC_COPT_PRINTF_NO_NULLPTR_CHECKS 
-TEST(LlvmLibcFPrintfTest, NullPtrCheck) { - const char *FILENAME = APPEND_LIBC_TEST("fprintf_nullptr.test"); - auto FILE_PATH = libc_make_test_file_path(FILENAME); - - ::FILE *file = printf_test::fopen(FILE_PATH, "w"); - ASSERT_FALSE(file == nullptr); - - int ret = - LIBC_NAMESPACE::fprintf(file, "hello %s", static_cast<int *>(nullptr)); - EXPECT_LT(ret, 0); - ASSERT_ERRNO_EQ(EINVAL); ASSERT_EQ(printf_test::fclose(file), 0); } -#endif // LIBC_COPT_PRINTF_NO_NULLPTR_CHECKS diff --git a/libc/test/src/stdio/printf_core/converter_test.cpp b/libc/test/src/stdio/printf_core/converter_test.cpp index 2dae2a22c864c..bf088937e4104 100644 --- a/libc/test/src/stdio/printf_core/converter_test.cpp +++ b/libc/test/src/stdio/printf_core/converter_test.cpp @@ -38,7 +38,7 @@ TEST_F(LlvmLibcPrintfConverterTest, SimpleRawConversion) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ(str, "abc"); - ASSERT_EQ(writer.get_chars_written(), size_t{3}); + ASSERT_EQ(writer.get_chars_written(), 3); } TEST_F(LlvmLibcPrintfConverterTest, PercentConversion) { @@ -52,7 +52,7 @@ TEST_F(LlvmLibcPrintfConverterTest, PercentConversion) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ(str, "%"); - ASSERT_EQ(writer.get_chars_written(), size_t{1}); + ASSERT_EQ(writer.get_chars_written(), 1); } TEST_F(LlvmLibcPrintfConverterTest, CharConversionSimple) { @@ -70,7 +70,7 @@ TEST_F(LlvmLibcPrintfConverterTest, CharConversionSimple) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ(str, "D"); - ASSERT_EQ(writer.get_chars_written(), size_t{1}); + ASSERT_EQ(writer.get_chars_written(), 1); } TEST_F(LlvmLibcPrintfConverterTest, CharConversionRightJustified) { @@ -85,7 +85,7 @@ TEST_F(LlvmLibcPrintfConverterTest, CharConversionRightJustified) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ(str, " E"); - ASSERT_EQ(writer.get_chars_written(), size_t{4}); + ASSERT_EQ(writer.get_chars_written(), 4); } TEST_F(LlvmLibcPrintfConverterTest, CharConversionLeftJustified) { @@ -102,7 +102,7 @@ TEST_F(LlvmLibcPrintfConverterTest, CharConversionLeftJustified) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ(str, "F "); - ASSERT_EQ(writer.get_chars_written(), size_t{4}); + ASSERT_EQ(writer.get_chars_written(), 4); } TEST_F(LlvmLibcPrintfConverterTest, StringConversionSimple) { @@ -118,7 +118,7 @@ TEST_F(LlvmLibcPrintfConverterTest, StringConversionSimple) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ(str, "DEF"); - ASSERT_EQ(writer.get_chars_written(), size_t{3}); + ASSERT_EQ(writer.get_chars_written(), 3); } TEST_F(LlvmLibcPrintfConverterTest, StringConversionPrecisionHigh) { @@ -133,7 +133,7 @@ TEST_F(LlvmLibcPrintfConverterTest, StringConversionPrecisionHigh) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ(str, "456"); - ASSERT_EQ(writer.get_chars_written(), size_t{3}); + ASSERT_EQ(writer.get_chars_written(), 3); } TEST_F(LlvmLibcPrintfConverterTest, StringConversionPrecisionLow) { @@ -148,7 +148,7 @@ TEST_F(LlvmLibcPrintfConverterTest, StringConversionPrecisionLow) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ(str, "xy"); - ASSERT_EQ(writer.get_chars_written(), size_t{2}); + ASSERT_EQ(writer.get_chars_written(), 2); } TEST_F(LlvmLibcPrintfConverterTest, StringConversionRightJustified) { @@ -163,7 +163,7 @@ TEST_F(LlvmLibcPrintfConverterTest, StringConversionRightJustified) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ(str, " 789"); - ASSERT_EQ(writer.get_chars_written(), size_t{4}); + ASSERT_EQ(writer.get_chars_written(), 4); } TEST_F(LlvmLibcPrintfConverterTest, StringConversionLeftJustified) { @@ -180,7 +180,7 @@ TEST_F(LlvmLibcPrintfConverterTest, StringConversionLeftJustified) { 
wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ(str, "ghi "); - ASSERT_EQ(writer.get_chars_written(), size_t{4}); + ASSERT_EQ(writer.get_chars_written(), 4); } TEST_F(LlvmLibcPrintfConverterTest, IntConversionSimple) { @@ -194,7 +194,7 @@ TEST_F(LlvmLibcPrintfConverterTest, IntConversionSimple) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ(str, "12345"); - ASSERT_EQ(writer.get_chars_written(), size_t{5}); + ASSERT_EQ(writer.get_chars_written(), 5); } TEST_F(LlvmLibcPrintfConverterTest, HexConversion) { @@ -211,7 +211,7 @@ TEST_F(LlvmLibcPrintfConverterTest, HexConversion) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ(str, "0x00000000123456ab"); - ASSERT_EQ(writer.get_chars_written(), size_t{18}); + ASSERT_EQ(writer.get_chars_written(), 18); } TEST_F(LlvmLibcPrintfConverterTest, BinaryConversion) { @@ -225,7 +225,7 @@ TEST_F(LlvmLibcPrintfConverterTest, BinaryConversion) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ(str, "101010"); - ASSERT_EQ(writer.get_chars_written(), size_t{6}); + ASSERT_EQ(writer.get_chars_written(), 6); } TEST_F(LlvmLibcPrintfConverterTest, PointerConversion) { @@ -239,7 +239,7 @@ TEST_F(LlvmLibcPrintfConverterTest, PointerConversion) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ(str, "0x123456ab"); - ASSERT_EQ(writer.get_chars_written(), size_t{10}); + ASSERT_EQ(writer.get_chars_written(), 10); } TEST_F(LlvmLibcPrintfConverterTest, OctConversion) { @@ -253,5 +253,5 @@ TEST_F(LlvmLibcPrintfConverterTest, OctConversion) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ(str, "1234"); - ASSERT_EQ(writer.get_chars_written(), size_t{4}); + ASSERT_EQ(writer.get_chars_written(), 4); } diff --git a/libc/test/src/stdio/printf_core/writer_test.cpp b/libc/test/src/stdio/printf_core/writer_test.cpp index d263cf55aa474..d036341be7981 100644 --- a/libc/test/src/stdio/printf_core/writer_test.cpp +++ b/libc/test/src/stdio/printf_core/writer_test.cpp @@ -39,7 +39,7 @@ TEST(LlvmLibcPrintfWriterTest, Write) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ("abc", str); - ASSERT_EQ(writer.get_chars_written(), size_t{3}); + ASSERT_EQ(writer.get_chars_written(), 3); } TEST(LlvmLibcPrintfWriterTest, WriteMultipleTimes) { @@ -53,7 +53,7 @@ TEST(LlvmLibcPrintfWriterTest, WriteMultipleTimes) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ("abcDEF123", str); - ASSERT_EQ(writer.get_chars_written(), size_t{9}); + ASSERT_EQ(writer.get_chars_written(), 9); } TEST(LlvmLibcPrintfWriterTest, WriteChars) { @@ -66,7 +66,7 @@ TEST(LlvmLibcPrintfWriterTest, WriteChars) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ("aaa", str); - ASSERT_EQ(writer.get_chars_written(), size_t{3}); + ASSERT_EQ(writer.get_chars_written(), 3); } TEST(LlvmLibcPrintfWriterTest, WriteCharsMultipleTimes) { @@ -80,7 +80,7 @@ TEST(LlvmLibcPrintfWriterTest, WriteCharsMultipleTimes) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ("aaaDDD111", str); - ASSERT_EQ(writer.get_chars_written(), size_t{9}); + ASSERT_EQ(writer.get_chars_written(), 9); } TEST(LlvmLibcPrintfWriterTest, WriteManyChars) { @@ -102,7 +102,7 @@ TEST(LlvmLibcPrintfWriterTest, WriteManyChars) { "ZZZZZZZZZZ" "ZZZZZZZZZ", str); - ASSERT_EQ(writer.get_chars_written(), size_t{99}); + ASSERT_EQ(writer.get_chars_written(), 99); } TEST(LlvmLibcPrintfWriterTest, MixedWrites) { @@ -117,7 +117,7 @@ TEST(LlvmLibcPrintfWriterTest, MixedWrites) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ("aaaDEF111456", str); - ASSERT_EQ(writer.get_chars_written(), size_t{12}); + ASSERT_EQ(writer.get_chars_written(), 12); } TEST(LlvmLibcPrintfWriterTest, WriteWithMaxLength) { @@ -129,7 +129,7 @@ TEST(LlvmLibcPrintfWriterTest, 
WriteWithMaxLength) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ("abcDEF1234", str); - ASSERT_EQ(writer.get_chars_written(), size_t{12}); + ASSERT_EQ(writer.get_chars_written(), 12); } TEST(LlvmLibcPrintfWriterTest, WriteCharsWithMaxLength) { @@ -141,7 +141,7 @@ TEST(LlvmLibcPrintfWriterTest, WriteCharsWithMaxLength) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ("1111111111", str); - ASSERT_EQ(writer.get_chars_written(), size_t{15}); + ASSERT_EQ(writer.get_chars_written(), 15); } TEST(LlvmLibcPrintfWriterTest, MixedWriteWithMaxLength) { @@ -157,7 +157,7 @@ TEST(LlvmLibcPrintfWriterTest, MixedWriteWithMaxLength) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ("aaaDEF1114", str); - ASSERT_EQ(writer.get_chars_written(), size_t{12}); + ASSERT_EQ(writer.get_chars_written(), 12); } TEST(LlvmLibcPrintfWriterTest, StringWithMaxLengthOne) { @@ -175,7 +175,7 @@ TEST(LlvmLibcPrintfWriterTest, StringWithMaxLengthOne) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ("", str); - ASSERT_EQ(writer.get_chars_written(), size_t{12}); + ASSERT_EQ(writer.get_chars_written(), 12); } TEST(LlvmLibcPrintfWriterTest, NullStringWithZeroMaxLength) { @@ -187,7 +187,7 @@ TEST(LlvmLibcPrintfWriterTest, NullStringWithZeroMaxLength) { writer.write('1', 3); writer.write({"456", 3}); - ASSERT_EQ(writer.get_chars_written(), size_t{12}); + ASSERT_EQ(writer.get_chars_written(), 12); } struct OutBuff { @@ -226,7 +226,7 @@ TEST(LlvmLibcPrintfWriterTest, WriteWithMaxLengthWithCallback) { str[out_buff.cur_pos] = '\0'; ASSERT_STREQ("abcDEF123456", str); - ASSERT_EQ(writer.get_chars_written(), size_t{12}); + ASSERT_EQ(writer.get_chars_written(), 12); } TEST(LlvmLibcPrintfWriterTest, WriteCharsWithMaxLengthWithCallback) { @@ -246,7 +246,7 @@ TEST(LlvmLibcPrintfWriterTest, WriteCharsWithMaxLengthWithCallback) { str[out_buff.cur_pos] = '\0'; ASSERT_STREQ("111111111111111", str); - ASSERT_EQ(writer.get_chars_written(), size_t{15}); + ASSERT_EQ(writer.get_chars_written(), 15); } TEST(LlvmLibcPrintfWriterTest, MixedWriteWithMaxLengthWithCallback) { @@ -269,7 +269,7 @@ TEST(LlvmLibcPrintfWriterTest, MixedWriteWithMaxLengthWithCallback) { str[out_buff.cur_pos] = '\0'; ASSERT_STREQ("aaaDEF111456", str); - ASSERT_EQ(writer.get_chars_written(), size_t{12}); + ASSERT_EQ(writer.get_chars_written(), 12); } TEST(LlvmLibcPrintfWriterTest, ZeroLengthBufferWithCallback) { @@ -292,7 +292,7 @@ TEST(LlvmLibcPrintfWriterTest, ZeroLengthBufferWithCallback) { str[out_buff.cur_pos] = '\0'; ASSERT_STREQ("aaaDEF111456", str); - ASSERT_EQ(writer.get_chars_written(), size_t{12}); + ASSERT_EQ(writer.get_chars_written(), 12); } TEST(LlvmLibcPrintfWriterTest, NullStringWithZeroMaxLengthWithCallback) { @@ -312,7 +312,7 @@ TEST(LlvmLibcPrintfWriterTest, NullStringWithZeroMaxLengthWithCallback) { wb.overflow_write(""); str[out_buff.cur_pos] = '\0'; - ASSERT_EQ(writer.get_chars_written(), size_t{12}); + ASSERT_EQ(writer.get_chars_written(), 12); ASSERT_STREQ("aaaDEF111456", str); } diff --git a/libc/test/src/stdio/snprintf_test.cpp b/libc/test/src/stdio/snprintf_test.cpp index 1062f952d7429..baaa664cdc9ee 100644 --- a/libc/test/src/stdio/snprintf_test.cpp +++ b/libc/test/src/stdio/snprintf_test.cpp @@ -8,12 +8,8 @@ #include "src/stdio/snprintf.h" -#include "test/UnitTest/ErrnoCheckingTest.h" -#include "test/UnitTest/ErrnoSetterMatcher.h" #include "test/UnitTest/Test.h" -using LlvmLibcSNPrintfTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest; - // The sprintf test cases cover testing the shared printf functionality, so // these tests will focus on snprintf exclusive features. 
@@ -63,14 +59,3 @@ TEST(LlvmLibcSNPrintfTest, NoCutOff) { EXPECT_EQ(written, 10); ASSERT_STREQ(buff, "1234567890"); } - -TEST(LlvmLibcSNPrintfTest, CharsWrittenOverflow) { - char buff[0]; - - // Trigger an overflow in the return value of snprintf by writing more than - // INT_MAX bytes. - int int_max = LIBC_NAMESPACE::cpp::numeric_limits<int>::max(); - int written = LIBC_NAMESPACE::snprintf(buff, 0, "%*stest", int_max, ""); - EXPECT_LT(written, 0); - ASSERT_ERRNO_EQ(EOVERFLOW); -} diff --git a/libc/test/src/stdio/vfprintf_test.cpp b/libc/test/src/stdio/vfprintf_test.cpp index 9b5f09db8fd41..f50565a0f68ca 100644 --- a/libc/test/src/stdio/vfprintf_test.cpp +++ b/libc/test/src/stdio/vfprintf_test.cpp @@ -19,8 +19,6 @@ #include "src/stdio/vfprintf.h" -#include "test/UnitTest/ErrnoCheckingTest.h" -#include "test/UnitTest/ErrnoSetterMatcher.h" #include "test/UnitTest/Test.h" namespace printf_test { @@ -46,8 +44,6 @@ int call_vfprintf(::FILE *__restrict stream, const char *__restrict format, return ret; } -using LlvmLibcVFPrintfTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest; - TEST(LlvmLibcVFPrintfTest, WriteToFile) { const char *FILENAME = APPEND_LIBC_TEST("vfprintf_output.test"); auto FILE_PATH = libc_make_test_file_path(FILENAME); @@ -94,7 +90,6 @@ TEST(LlvmLibcVFPrintfTest, WriteToFile) { written = call_vfprintf(file, "Writing to a read only file should fail."); EXPECT_LT(written, 0); - ASSERT_ERRNO_EQ(EBADF); ASSERT_EQ(printf_test::fclose(file), 0); } diff --git a/libc/test/src/stdlib/StrfromTest.h b/libc/test/src/stdlib/StrfromTest.h index fdeed0e3c06f5..e82c94499aa11 100644 --- a/libc/test/src/stdlib/StrfromTest.h +++ b/libc/test/src/stdlib/StrfromTest.h @@ -8,8 +8,6 @@ #include "src/__support/CPP/type_traits.h" #include "src/__support/FPUtil/FPBits.h" -#include "test/UnitTest/ErrnoCheckingTest.h" -#include "test/UnitTest/ErrnoSetterMatcher.h" #include "test/UnitTest/Test.h" #define ASSERT_STREQ_LEN(actual_written, actual_str, expected_str) \ @@ -17,7 +15,7 @@ EXPECT_STREQ(actual_str, expected_str); template <typename InputT> -class StrfromTest : public LIBC_NAMESPACE::testing::ErrnoCheckingTest { +class StrfromTest : public LIBC_NAMESPACE::testing::Test { static constexpr bool is_single_prec = LIBC_NAMESPACE::cpp::is_same<InputT, float>::value; @@ -483,16 +481,6 @@ class StrfromTest : public LIBC_NAMESPACE::testing::ErrnoCheckingTest { written = func(buff, 10, "%A", -ld_nan); ASSERT_STREQ_LEN(written, buff, "-NAN"); } - - void charsWrittenOverflow(FunctionT func) { - char buff[100]; - // Trigger an overflow in the return value of strfrom by writing more than - // INT_MAX bytes. 
- int result = func(buff, sizeof(buff), "%.2147483647f", 1.0f); - - EXPECT_LT(result, 0); - ASSERT_ERRNO_EQ(EOVERFLOW); - } }; #define STRFROM_TEST(InputType, name, func) \ @@ -513,7 +501,4 @@ class StrfromTest : public LIBC_NAMESPACE::testing::ErrnoCheckingTest { TEST_F(LlvmLibc##name##Test, InsufficientBufferSize) { \ insufficentBufsize(func); \ } \ - TEST_F(LlvmLibc##name##Test, InfAndNanValues) { infNanValues(func); } \ - TEST_F(LlvmLibc##name##Test, CharsWrittenOverflow) { \ - charsWrittenOverflow(func); \ - } + TEST_F(LlvmLibc##name##Test, InfAndNanValues) { infNanValues(func); } From 96f093770d21b8bbe05a6b8ff744ce9fdd6e4c50 Mon Sep 17 00:00:00 2001 From: Roy Shi <royitaqi@users.noreply.github.com> Date: Mon, 3 Nov 2025 13:04:34 -0800 Subject: [PATCH 073/313] [dsymutil] Add option to copy swiftmodules built from interface (#165293) The default behavior is to _not_ copy such swiftmodules into the dSYM, as previously implemented in 96f95c9d89d8a1784d3865fa941fb1c510f4e2d7. This patch adds the option to override the behavior, so that such swiftmodules can be copied into the dSYM. This is useful when the dSYM will be used on a machine which has a different Xcode/SDK than where the swiftmodules were built. Without this, when LLDB is asked to "p/po" a Swift variable, the underlying Swift compiler code would rebuild the dependent `.swiftmodule` files of the Swift stdlibs, which takes ~1 minute in some cases. See PR for tests. --- llvm/docs/CommandGuide/dsymutil.rst | 8 +++++ .../swiftmodule-include-from-interface.test | 33 +++++++++++++++++++ llvm/test/tools/dsymutil/cmdline.test | 1 + llvm/tools/dsymutil/DwarfLinkerForBinary.cpp | 7 ++-- llvm/tools/dsymutil/LinkUtils.h | 7 ++++ llvm/tools/dsymutil/Options.td | 8 +++++ llvm/tools/dsymutil/dsymutil.cpp | 3 ++ 7 files changed, 64 insertions(+), 3 deletions(-) create mode 100644 llvm/test/tools/dsymutil/ARM/swiftmodule-include-from-interface.test diff --git a/llvm/docs/CommandGuide/dsymutil.rst b/llvm/docs/CommandGuide/dsymutil.rst index 8e61e01d7d9c3..0e442d657e987 100644 --- a/llvm/docs/CommandGuide/dsymutil.rst +++ b/llvm/docs/CommandGuide/dsymutil.rst @@ -70,6 +70,14 @@ OPTIONS Print this help output. +.. option:: --include-swiftmodules-from-interface + + Whether or not to copy binary swiftmodules built from textual .swiftinterface + files into the dSYM bundle. These typically come only from the SDK (since + textual interfaces require library evolution) and thus are a waste of space to + copy into the bundle. Turn this on if the swiftmodules are different from + those in the SDK. + + ..
option:: --keep-function-for-static Make a static variable keep the enclosing function even if it would have been diff --git a/llvm/test/tools/dsymutil/ARM/swiftmodule-include-from-interface.test b/llvm/test/tools/dsymutil/ARM/swiftmodule-include-from-interface.test new file mode 100644 index 0000000000000..00141f12587d4 --- /dev/null +++ b/llvm/test/tools/dsymutil/ARM/swiftmodule-include-from-interface.test @@ -0,0 +1,33 @@ +# RUN: dsymutil -include-swiftmodules-from-interface -verbose -oso-prepend-path=%p -y -o %t.dSYM %s | FileCheck %s +# +# RUN: dsymutil -include-swiftmodules-from-interface --linker parallel -verbose -oso-prepend-path=%p -y %s -o %t-parallel.dSYM | FileCheck %s +# +# To regenerate: +# echo ''>I.swift +# echo ''>B.swift +# echo 'import I'>main.swift +# xcrun swiftc -emit-module-interface-path I.swiftinterface -enable-library-evolution I.swift +# xcrun swiftc -emit-module-path B.swiftmodule B.swift -Xfrontend -no-serialize-debugging-options +# xcrun swiftc -explicit-module-build main.swift -I. -module-cache-path cache -g -Xfrontend -no-serialize-debugging-options +# output is "B.swiftmodule" and "cache/I*.swiftmodule" +# +# CHECK-NOT: Skipping compiled textual Swift interface: {{.*}}/Inputs/Binary.swiftmodule +# CHECK-NOT: Skipping compiled textual Swift interface: {{.*}}/Inputs/FromInterface.swiftmodule + +# +--- +triple: 'arm64-apple-darwin' +objects: + - filename: '../Inputs/Binary.swiftmodule' + timestamp: 0 + type: 50 + symbols: [] + - filename: '../Inputs/FromInterface.swiftmodule' + timestamp: 0 + type: 50 + symbols: [] + - filename: '../Inputs/FromInterface.swiftmodule' + timestamp: 0 + type: 50 + symbols: [] +... diff --git a/llvm/test/tools/dsymutil/cmdline.test b/llvm/test/tools/dsymutil/cmdline.test index 1574fe35f5254..0b0bce194d575 100644 --- a/llvm/test/tools/dsymutil/cmdline.test +++ b/llvm/test/tools/dsymutil/cmdline.test @@ -14,6 +14,7 @@ CHECK: -fat64 CHECK: -flat CHECK: -gen-reproducer CHECK: -help +CHECK: -include-swiftmodules-from-interface CHECK: -keep-function-for-static CHECK: -no-object-timestamp CHECK: -no-odr diff --git a/llvm/tools/dsymutil/DwarfLinkerForBinary.cpp b/llvm/tools/dsymutil/DwarfLinkerForBinary.cpp index b91c27e6a0f86..ee1e9060657b0 100644 --- a/llvm/tools/dsymutil/DwarfLinkerForBinary.cpp +++ b/llvm/tools/dsymutil/DwarfLinkerForBinary.cpp @@ -794,9 +794,10 @@ bool DwarfLinkerForBinary::linkImpl( reportWarning("Could not parse binary Swift module: " + toString(FromInterfaceOrErr.takeError()), Obj->getObjectFilename()); - // Only skip swiftmodules that could be parsed and are - // positively identified as textual. - } else if (*FromInterfaceOrErr) { + // Only skip swiftmodules that could be parsed and are positively + // identified as textual. Do so only when the option allows. + } else if (*FromInterfaceOrErr && + !Options.IncludeSwiftModulesFromInterface) { if (Options.Verbose) outs() << "Skipping compiled textual Swift interface: " << Obj->getObjectFilename() << "\n"; diff --git a/llvm/tools/dsymutil/LinkUtils.h b/llvm/tools/dsymutil/LinkUtils.h index ad5515a04333e..c333a3d4afee0 100644 --- a/llvm/tools/dsymutil/LinkUtils.h +++ b/llvm/tools/dsymutil/LinkUtils.h @@ -114,6 +114,13 @@ struct LinkOptions { /// Whether all remarks should be kept or only remarks with valid debug /// locations. bool RemarksKeepAll = true; + + /// Whether or not to copy binary swiftmodules built from textual + /// .swiftinterface files into the dSYM bundle. 
These typically come only + /// from the SDK (since textual interfaces require library evolution) and + /// thus are a waste of space to copy into the bundle. Turn this on if the + /// swiftmodules are different from those in the SDK. + bool IncludeSwiftModulesFromInterface = false; /// @} LinkOptions() = default; diff --git a/llvm/tools/dsymutil/Options.td b/llvm/tools/dsymutil/Options.td index ad35e55e33b12..e99bc12fa7fd8 100644 --- a/llvm/tools/dsymutil/Options.td +++ b/llvm/tools/dsymutil/Options.td @@ -202,6 +202,14 @@ def remarks_drop_without_debug: Flag<["--", "-"], "remarks-drop-without-debug">, "all remarks are kept.">, Group<grp_general>; +def include_swiftmodules_from_interface: Flag<["--", "-"], "include-swiftmodules-from-interface">, + HelpText<"Whether or not to copy binary swiftmodules built from textual " + ".swiftinterface files into the dSYM bundle. These typically come only " + "from the SDK (since textual interfaces require library evolution) and " + "thus are a waste of space to copy into the bundle. Turn this on if the " + "swiftmodules are different from those in the SDK.">, + Group<grp_general>; + def linker: Separate<["--", "-"], "linker">, MetaVarName<"<DWARF linker type>">, HelpText<"Specify the desired type of DWARF linker. Defaults to 'classic'">, diff --git a/llvm/tools/dsymutil/dsymutil.cpp b/llvm/tools/dsymutil/dsymutil.cpp index 913077eb0b06d..688f6aaf3d0c9 100644 --- a/llvm/tools/dsymutil/dsymutil.cpp +++ b/llvm/tools/dsymutil/dsymutil.cpp @@ -391,6 +391,9 @@ static Expected<DsymutilOptions> getOptions(opt::InputArgList &Args) { Options.LinkOpts.RemarksKeepAll = !Args.hasArg(OPT_remarks_drop_without_debug); + Options.LinkOpts.IncludeSwiftModulesFromInterface = + Args.hasArg(OPT_include_swiftmodules_from_interface); + if (opt::Arg *BuildVariantSuffix = Args.getLastArg(OPT_build_variant_suffix)) Options.LinkOpts.BuildVariantSuffix = BuildVariantSuffix->getValue(); From 9ff31be2f2c181d81bbafa927e6aca117fd68330 Mon Sep 17 00:00:00 2001 From: Michael Spencer <bigcheesegs@gmail.com> Date: Mon, 3 Nov 2025 13:12:55 -0800 Subject: [PATCH 074/313] [clang][builtins] Add stdckdint.h to the modulemap. (#166230) All builtin Clang headers need to be covered by the modulemap. 
This fixes https://github.com/llvm/llvm-project/issues/166173 --- clang/lib/Headers/module.modulemap | 5 +++++ clang/lib/Lex/ModuleMap.cpp | 1 + .../Modules/Inputs/builtin-headers/system-modules.modulemap | 5 +++++ clang/test/Modules/builtin-headers.mm | 1 + 4 files changed, 12 insertions(+) diff --git a/clang/lib/Headers/module.modulemap b/clang/lib/Headers/module.modulemap index 2e4d533356569..c13dd3fd48ac8 100644 --- a/clang/lib/Headers/module.modulemap +++ b/clang/lib/Headers/module.modulemap @@ -253,6 +253,11 @@ module _Builtin_stdbool [system] { export * } +module _Builtin_stdckdint [system] { + header "stdckdint.h" + export * +} + module _Builtin_stdcountof [system] { header "stdcountof.h" export * diff --git a/clang/lib/Lex/ModuleMap.cpp b/clang/lib/Lex/ModuleMap.cpp index 637a08fe4dcdb..b8202ea11be36 100644 --- a/clang/lib/Lex/ModuleMap.cpp +++ b/clang/lib/Lex/ModuleMap.cpp @@ -258,6 +258,7 @@ static bool isBuiltinHeaderName(StringRef FileName) { .Case("stdarg.h", true) .Case("stdatomic.h", true) .Case("stdbool.h", true) + .Case("stdckdint.h", true) .Case("stdcountof.h", true) .Case("stddef.h", true) .Case("stdint.h", true) diff --git a/clang/test/Modules/Inputs/builtin-headers/system-modules.modulemap b/clang/test/Modules/Inputs/builtin-headers/system-modules.modulemap index 186965177caaf..8ab6ae4779ea9 100644 --- a/clang/test/Modules/Inputs/builtin-headers/system-modules.modulemap +++ b/clang/test/Modules/Inputs/builtin-headers/system-modules.modulemap @@ -49,6 +49,11 @@ module cstd [system] [no_undeclared_includes] { export * } + module stdckdint { + header "stdckdint.h" + export * + } + module stdcountof { header "stdcountof.h" export * diff --git a/clang/test/Modules/builtin-headers.mm b/clang/test/Modules/builtin-headers.mm index ad2d66ae38dfd..6cd366228172e 100644 --- a/clang/test/Modules/builtin-headers.mm +++ b/clang/test/Modules/builtin-headers.mm @@ -17,6 +17,7 @@ @import _Builtin_stdarg; @import _Builtin_stdatomic; @import _Builtin_stdbool; +@import _Builtin_stdckdint; @import _Builtin_stdcountof; @import _Builtin_stddef; @import _Builtin_stdint; From e5d9644bca214587a22401693eee90b2934d5827 Mon Sep 17 00:00:00 2001 From: "Deric C." <cheung.deric@gmail.com> Date: Mon, 3 Nov 2025 13:13:11 -0800 Subject: [PATCH 075/313] [NFC] [DirectX] Fix warning about parentheses for assertion in DXContainerGlobals.cpp (#166231) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This PR fixes the appearance of the following warning message when building LLVM with clang (21.1.2) ``` [48/100] Building CXX object lib/Target/DirectX/CMakeFiles/LLVMDirectXCodeGen.dir/DXContainerGlobals.cpp.o In file included from /nix/store/ffrg0560kj0066s4k9pznjand907nlnz-gcc-14.3.0/include/c++/14.3.0/cassert:44, from /workspace/llvm-project/llvm/include/llvm/Support/Endian.h:19, from /workspace/llvm-project/llvm/include/llvm/Support/MD5.h:33, from /workspace/llvm-project/llvm/lib/Target/DirectX/DXContainerGlobals.cpp:28: /workspace/llvm-project/llvm/lib/Target/DirectX/DXContainerGlobals.cpp: In lambda function: /workspace/llvm-project/llvm/lib/Target/DirectX/DXContainerGlobals.cpp:198:78: warning: suggest parentheses around ‘&&’ within ‘||’ [-Wparentheses] 198 | (uint64_t)Binding.LowerBound + Binding.Size - 1 <= UINT32_MAX && | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^~ 199 | "Resource range is too large"); | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ``` I marked this PR as an NFC because it only modifies an assertion condition to remove a compiler warning. 
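For context, here is a minimal sketch (not from the patch) of why the old form still behaved correctly and the warning is purely about readability: `&&` binds tighter than `||`, and the string literal used as an assert message is always a non-null pointer, so `cond && "msg"` has the same truth value as `cond`:

```c++
#include <cassert>

void check(bool a, bool b) {
  // Parsed as: a || (b && "msg"); since "msg" is a non-null pointer,
  // (b && "msg") == b, which is why this change is NFC.
  // assert(a || b && "msg");   // triggers -Wparentheses
  assert((a || b) && "msg"); // intended grouping, warning-free
}
```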
--- llvm/lib/Target/DirectX/DXContainerGlobals.cpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Target/DirectX/DXContainerGlobals.cpp b/llvm/lib/Target/DirectX/DXContainerGlobals.cpp index 8ace2d2777c74..eb4c8846441a2 100644 --- a/llvm/lib/Target/DirectX/DXContainerGlobals.cpp +++ b/llvm/lib/Target/DirectX/DXContainerGlobals.cpp @@ -194,9 +194,10 @@ void DXContainerGlobals::addResourcesForPSV(Module &M, PSVRuntimeInfo &PSV) { dxbc::PSV::v2::ResourceBindInfo BindInfo; BindInfo.Type = Type; BindInfo.LowerBound = Binding.LowerBound; - assert(Binding.Size == UINT32_MAX || - (uint64_t)Binding.LowerBound + Binding.Size - 1 <= UINT32_MAX && - "Resource range is too large"); + assert( + (Binding.Size == UINT32_MAX || + (uint64_t)Binding.LowerBound + Binding.Size - 1 <= UINT32_MAX) && + "Resource range is too large"); BindInfo.UpperBound = (Binding.Size == UINT32_MAX) ? UINT32_MAX : Binding.LowerBound + Binding.Size - 1; From 25da15f9b1f910cb5d350381c1c8a287cbe4cf0f Mon Sep 17 00:00:00 2001 From: Julian Lettner <yln@users.noreply.github.com> Date: Mon, 3 Nov 2025 13:29:34 -0800 Subject: [PATCH 076/313] [lldb] Fix indentation when printing stop hooks (#165945) This commit aggregates the following changes: 1. Fix the format (i.e., indentation) when printing stop hooks via `target stop-hook list`. 2. Add `IndentScope Stream::MakeIndentScope()` to make managing (and restoring!) the indentation level on `Stream` instances more ergonomic and less error-prone. 3. Simplify printing of stop hooks using the new `IndentScope`. --- lldb/include/lldb/Utility/Stream.h | 26 ++++++++++---- lldb/source/Target/Target.cpp | 19 ++++------ lldb/source/Utility/Stream.cpp | 8 +++++ .../StopHook/stop-hook-list-format.test | 36 +++++++++++++++++++ 4 files changed, 70 insertions(+), 19 deletions(-) create mode 100644 lldb/test/Shell/ExecControl/StopHook/stop-hook-list-format.test diff --git a/lldb/include/lldb/Utility/Stream.h b/lldb/include/lldb/Utility/Stream.h index 82774d56922a9..13455552131da 100644 --- a/lldb/include/lldb/Utility/Stream.h +++ b/lldb/include/lldb/Utility/Stream.h @@ -300,6 +300,12 @@ class Stream { /// The current indentation level. unsigned GetIndentLevel() const; + /// Set the current indentation level. + /// + /// \param[in] level + /// The new indentation level. + void SetIndentLevel(unsigned level); + /// Indent the current line in the stream. /// /// Indent the current line using the current indentation level and print an @@ -315,6 +321,20 @@ class Stream { /// Increment the current indentation level. void IndentMore(unsigned amount = 2); + struct IndentScope { + IndentScope(Stream &stream) + : m_stream(stream), m_original_indent_level(stream.GetIndentLevel()) {} + ~IndentScope() { m_stream.SetIndentLevel(m_original_indent_level); } + + private: + Stream &m_stream; + unsigned m_original_indent_level; + }; + + /// Create an indentation scope that restores the original indent level when + /// the object goes out of scope (RAII). + IndentScope MakeIndentScope(unsigned indent_amount = 2); + /// Output an offset value. /// /// Put an offset \a uval out to the stream using the printf format in \a @@ -364,12 +384,6 @@ class Stream { /// address and pointer values. void SetAddressByteSize(uint32_t addr_size); - /// Set the current indentation level. - /// - /// \param[in] level - /// The new indentation level. - void SetIndentLevel(unsigned level); - /// Output a SLEB128 number to the stream.
/// /// Put an SLEB128 \a uval out to the stream using the printf format in \a diff --git a/lldb/source/Target/Target.cpp b/lldb/source/Target/Target.cpp index 1e43094421f0a..a23091ad09c6d 100644 --- a/lldb/source/Target/Target.cpp +++ b/lldb/source/Target/Target.cpp @@ -3962,9 +3962,7 @@ void Target::StopHook::GetDescription(Stream &s, return; } - unsigned indent_level = s.GetIndentLevel(); - - s.SetIndentLevel(indent_level + 2); + auto indent_scope = s.MakeIndentScope(); s.Printf("Hook: %" PRIu64 "\n", GetID()); if (m_active) @@ -3978,19 +3976,17 @@ void Target::StopHook::GetDescription(Stream &s, if (m_specifier_sp) { s.Indent(); s.PutCString("Specifier:\n"); - s.SetIndentLevel(indent_level + 4); + auto indent_scope = s.MakeIndentScope(); m_specifier_sp->GetDescription(&s, level); - s.SetIndentLevel(indent_level + 2); } if (m_thread_spec_up) { StreamString tmp; s.Indent("Thread:\n"); m_thread_spec_up->GetDescription(&tmp, level); - s.SetIndentLevel(indent_level + 4); + auto indent_scope = s.MakeIndentScope(); s.Indent(tmp.GetString()); s.PutCString("\n"); - s.SetIndentLevel(indent_level + 2); } GetSubclassDescription(s, level); } @@ -4003,14 +3999,13 @@ void Target::StopHookCommandLine::GetSubclassDescription( s.PutCString(m_commands.GetStringAtIndex(0)); return; } - s.Indent("Commands: \n"); - s.SetIndentLevel(s.GetIndentLevel() + 4); + s.Indent("Commands:\n"); + auto indent_scope = s.MakeIndentScope(4); uint32_t num_commands = m_commands.GetSize(); for (uint32_t i = 0; i < num_commands; i++) { s.Indent(m_commands.GetStringAtIndex(i)); s.PutCString("\n"); } - s.SetIndentLevel(s.GetIndentLevel() - 4); } // Target::StopHookCommandLine @@ -4145,7 +4140,7 @@ void Target::StopHookScripted::GetSubclassDescription( return; s.Indent("Args:\n"); - s.SetIndentLevel(s.GetIndentLevel() + 4); + auto indent_scope = s.MakeIndentScope(4); auto print_one_element = [&s](llvm::StringRef key, StructuredData::Object *object) { @@ -4155,8 +4150,6 @@ void Target::StopHookScripted::GetSubclassDescription( }; as_dict->ForEach(print_one_element); - - s.SetIndentLevel(s.GetIndentLevel() - 4); } static constexpr OptionEnumValueElement g_dynamic_value_types[] = { diff --git a/lldb/source/Utility/Stream.cpp b/lldb/source/Utility/Stream.cpp index 89dce9fb0e1f7..e9632c3e1fc1f 100644 --- a/lldb/source/Utility/Stream.cpp +++ b/lldb/source/Utility/Stream.cpp @@ -202,6 +202,14 @@ void Stream::IndentLess(unsigned amount) { m_indent_level = 0; } +// Create an indentation scope that restores the original indent level when the +// object goes out of scope (RAII). +Stream::IndentScope Stream::MakeIndentScope(unsigned indent_amount) { + IndentScope indent_scope(*this); + IndentMore(indent_amount); + return indent_scope; +} + // Get the address size in bytes uint32_t Stream::GetAddressByteSize() const { return m_addr_size; } diff --git a/lldb/test/Shell/ExecControl/StopHook/stop-hook-list-format.test b/lldb/test/Shell/ExecControl/StopHook/stop-hook-list-format.test new file mode 100644 index 0000000000000..a9557801cc134 --- /dev/null +++ b/lldb/test/Shell/ExecControl/StopHook/stop-hook-list-format.test @@ -0,0 +1,36 @@ +# Test format (e.g., indentation) when printing the list of stop hooks. 
+# +# RUN: %lldb -b -s %s | FileCheck %s --match-full-lines --strict-whitespace + +# Create some stop hooks +target stop-hook add -o 'print "Hello"' --auto-continue true --at-initial-stop true +target stop-hook add -o 'print "world,"' -o 'print "nice"' --file 'my_file' +target stop-hook add -o 'print "weather!"' --classname 'MyClass' --thread-name 'my_thread' + +# Print hooks +target stop-hook list + +# CHECK:(lldb) target stop-hook list +# CHECK:Hook: 1 +# CHECK: State: enabled +# CHECK: AutoContinue on +# CHECK: Commands: +# CHECK: print "Hello" +# CHECK-EMPTY: +# CHECK:Hook: 2 +# CHECK: State: enabled +# CHECK: Specifier: +# CHECK: File: my_file. +# CHECK: Commands: +# CHECK: print "world," +# CHECK: print "nice" +# CHECK-EMPTY: +# CHECK:Hook: 3 +# CHECK: State: enabled +# CHECK: Specifier: +# CHECK: Class name: MyClass. +# CHECK: Thread: +# CHECK: thread name: "my_thread" +# CHECK: Commands: +# CHECK: print "weather!" +# CHECK-EMPTY: From 6fe3eccdf44fc8adb46e78e65edadd57926d2fb6 Mon Sep 17 00:00:00 2001 From: Laxman Sole <lsole@nvidia.com> Date: Mon, 3 Nov 2025 13:34:44 -0800 Subject: [PATCH 077/313] [llvm][DebugInfo] Emit 0/1 for constant boolean values (#151225) Previously, sign-extending a 1-bit boolean operand in `#DBG_VALUE` would convert `true` to -1 (i.e., 0xffffffffffffffff). However, DWARF treats booleans as unsigned values, so this resulted in the attribute `DW_AT_const_value(0xffffffffffffffff)` being emitted. As a result, the debugger would display the value as `255` instead of `true`. This change modifies the behavior to use zero-extension for 1-bit values instead, ensuring that `true` is represented as 1. Consequently, the DWARF attribute emitted is now `DW_AT_const_value(1)`, which allows the debugger to correctly display the boolean as `true`. 
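For reference, a minimal sketch (not part of the patch) of the two extension behaviors on a 1-bit `APInt`, which is where the old `255`/`-1` artifact came from:

```c++
#include "llvm/ADT/APInt.h"

void demo() {
  llvm::APInt True(/*numBits=*/1, /*val=*/1); // an i1 holding 'true'
  (void)True.getSExtValue(); // -1: the single set bit is sign-extended
  (void)True.getZExtValue(); // 1: what DWARF expects for an unsigned boolean
}
```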
--- .../CodeGen/GlobalISel/MachineIRBuilder.cpp | 4 ++- .../lib/CodeGen/SelectionDAG/InstrEmitter.cpp | 2 ++ llvm/lib/Transforms/Utils/Local.cpp | 6 +++- llvm/test/DebugInfo/debug-bool-const-value.ll | 29 +++++++++++++++++++ llvm/unittests/Transforms/Utils/LocalTest.cpp | 2 +- 5 files changed, 40 insertions(+), 3 deletions(-) create mode 100644 llvm/test/DebugInfo/debug-bool-const-value.ll diff --git a/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp b/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp index 4b4df98024f4a..637acd61c8a5f 100644 --- a/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp +++ b/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp @@ -109,8 +109,10 @@ MachineInstrBuilder MachineIRBuilder::buildConstDbgValue(const Constant &C, if (auto *CI = dyn_cast<ConstantInt>(NumericConstant)) { if (CI->getBitWidth() > 64) MIB.addCImm(CI); - else + else if (CI->getBitWidth() == 1) MIB.addImm(CI->getZExtValue()); + else + MIB.addImm(CI->getSExtValue()); } else if (auto *CFP = dyn_cast<ConstantFP>(NumericConstant)) { MIB.addFPImm(CFP); } else if (isa<ConstantPointerNull>(NumericConstant)) { diff --git a/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp b/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp index bb10cf687db8d..d84c3fb05bb24 100644 --- a/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp @@ -733,6 +733,8 @@ MachineOperand GetMOForConstDbgOp(const SDDbgOperand &Op) { if (const ConstantInt *CI = dyn_cast<ConstantInt>(V)) { if (CI->getBitWidth() > 64) return MachineOperand::CreateCImm(CI); + if (CI->getBitWidth() == 1) + return MachineOperand::CreateImm(CI->getZExtValue()); return MachineOperand::CreateImm(CI->getSExtValue()); } if (const ConstantFP *CF = dyn_cast<ConstantFP>(V)) diff --git a/llvm/lib/Transforms/Utils/Local.cpp b/llvm/lib/Transforms/Utils/Local.cpp index 46f29030ddb05..a03cf6e953e35 100644 --- a/llvm/lib/Transforms/Utils/Local.cpp +++ b/llvm/lib/Transforms/Utils/Local.cpp @@ -3416,7 +3416,11 @@ DIExpression *llvm::getExpressionForConstant(DIBuilder &DIB, const Constant &C, // Create integer constant expression. auto createIntegerExpression = [&DIB](const Constant &CV) -> DIExpression * { const APInt &API = cast<ConstantInt>(&CV)->getValue(); - std::optional<int64_t> InitIntOpt = API.trySExtValue(); + std::optional<int64_t> InitIntOpt; + if (API.getBitWidth() == 1) + InitIntOpt = API.tryZExtValue(); + else + InitIntOpt = API.trySExtValue(); return InitIntOpt ? 
DIB.createConstantValueExpression( static_cast<uint64_t>(*InitIntOpt)) : nullptr; diff --git a/llvm/test/DebugInfo/debug-bool-const-value.ll b/llvm/test/DebugInfo/debug-bool-const-value.ll new file mode 100644 index 0000000000000..84cf993cf4aae --- /dev/null +++ b/llvm/test/DebugInfo/debug-bool-const-value.ll @@ -0,0 +1,29 @@ +; REQUIRES: object-emission +; RUN: %llc_dwarf %s -filetype=obj -o - | llvm-dwarfdump - | FileCheck %s + +; CHECK: {{.*}}DW_TAG_variable +; CHECK-NEXT: {{.*}} DW_AT_const_value (1) +; CHECK-NEXT: {{.*}} DW_AT_name ("arg") + +define void @test() !dbg !5 +{ +entry: + call void @"llvm.dbg.value"(metadata i1 true, metadata !7, metadata !8), !dbg !6 + ret void, !dbg !6 +} + +declare void @"llvm.dbg.value"(metadata %".1", metadata %".2", metadata %".3") + +!llvm.dbg.cu = !{ !2 } +!llvm.module.flags = !{ !9, !10 } + +!1 = !DIFile(directory: "", filename: "test") +!2 = distinct !DICompileUnit(emissionKind: FullDebug, file: !1, isOptimized: false, language: DW_LANG_C_plus_plus, runtimeVersion: 0) +!3 = !DIBasicType(encoding: DW_ATE_boolean, name: "bool", size: 8) +!4 = !DISubroutineType(types: !{null}) +!5 = distinct !DISubprogram(file: !1, isDefinition: true, isLocal: false, isOptimized: false, line: 5, linkageName: "test", name: "test", scope: !1, scopeLine: 5, type: !4, unit: !2) +!6 = !DILocation(column: 1, line: 5, scope: !5) +!7 = !DILocalVariable(arg: 0, file: !1, line: 5, name: "arg", scope: !5, type: !3) +!8 = !DIExpression() +!9 = !{ i32 2, !"Dwarf Version", i32 4 } +!10 = !{ i32 2, !"Debug Info Version", i32 3 } diff --git a/llvm/unittests/Transforms/Utils/LocalTest.cpp b/llvm/unittests/Transforms/Utils/LocalTest.cpp index 4b53cc3d6e516..4908eda16e002 100644 --- a/llvm/unittests/Transforms/Utils/LocalTest.cpp +++ b/llvm/unittests/Transforms/Utils/LocalTest.cpp @@ -1153,7 +1153,7 @@ TEST(Local, ExpressionForConstant) { IntegerType *Int1Ty = Type::getInt1Ty(Context); Expr = createExpression(ConstantInt::getTrue(Context), Int1Ty); EXPECT_NE(Expr, nullptr); - EXPECT_EQ(Expr->getElement(1), 18446744073709551615U); + EXPECT_EQ(Expr->getElement(1), 1U); Expr = createExpression(ConstantInt::getFalse(Context), Int1Ty); EXPECT_NE(Expr, nullptr); From 1621486d676f0ebfd6e64b6e93e31dcffa27c5ec Mon Sep 17 00:00:00 2001 From: Jonas Devlieghere <jonas@devlieghere.com> Date: Mon, 3 Nov 2025 13:37:39 -0800 Subject: [PATCH 078/313] [lldb-dap] Add support for launching supported clients (#165941) Support launching a supported DAP client using the lldb-dap binary. Currently, only the official LLDB-DAP Visual Studio Code extension is supported. It uses the VS Code launch URL format. 
Here's an example: ``` lldb-dap --client vscode -- /path/to/exe foo bar ``` This will open the following URL with `code --open-url`: ``` vscode://llvm-vs-code-extensions.lldb-dap/start?program=%2Fpath%2Fto%2Fexe&args=foo&args=bar ``` Fixes #125777 --- lldb/test/Shell/DAP/TestClientLauncher.test | 2 + lldb/tools/lldb-dap/CMakeLists.txt | 1 + lldb/tools/lldb-dap/ClientLauncher.cpp | 74 +++++++++++++++++++++ lldb/tools/lldb-dap/ClientLauncher.h | 50 ++++++++++++++ lldb/tools/lldb-dap/tool/Options.td | 8 +++ lldb/tools/lldb-dap/tool/lldb-dap.cpp | 38 +++++++++++ lldb/unittests/DAP/CMakeLists.txt | 1 + lldb/unittests/DAP/ClientLauncherTest.cpp | 71 ++++++++++++++++++++ 8 files changed, 245 insertions(+) create mode 100644 lldb/test/Shell/DAP/TestClientLauncher.test create mode 100644 lldb/tools/lldb-dap/ClientLauncher.cpp create mode 100644 lldb/tools/lldb-dap/ClientLauncher.h create mode 100644 lldb/unittests/DAP/ClientLauncherTest.cpp diff --git a/lldb/test/Shell/DAP/TestClientLauncher.test b/lldb/test/Shell/DAP/TestClientLauncher.test new file mode 100644 index 0000000000000..a79a940da5a98 --- /dev/null +++ b/lldb/test/Shell/DAP/TestClientLauncher.test @@ -0,0 +1,2 @@ +# RUN: lldb-dap --client vscode-url -- /path/to/foo | FileCheck %s +# CHECK: vscode://llvm-vs-code-extensions.lldb-dap/start?program=%2Fpath%2Fto%2Ffoo diff --git a/lldb/tools/lldb-dap/CMakeLists.txt b/lldb/tools/lldb-dap/CMakeLists.txt index dd1bbbdddfc59..fa940b7b73943 100644 --- a/lldb/tools/lldb-dap/CMakeLists.txt +++ b/lldb/tools/lldb-dap/CMakeLists.txt @@ -5,6 +5,7 @@ set(LLVM_LINK_COMPONENTS Support) add_lldb_library(lldbDAP Breakpoint.cpp BreakpointBase.cpp + ClientLauncher.cpp CommandPlugins.cpp DAP.cpp DAPError.cpp diff --git a/lldb/tools/lldb-dap/ClientLauncher.cpp b/lldb/tools/lldb-dap/ClientLauncher.cpp new file mode 100644 index 0000000000000..4cac1d6346441 --- /dev/null +++ b/lldb/tools/lldb-dap/ClientLauncher.cpp @@ -0,0 +1,74 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "ClientLauncher.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/StringSwitch.h" +#include "llvm/Support/FormatVariadic.h" + +using namespace lldb_dap; + +std::optional<ClientLauncher::Client> +ClientLauncher::GetClientFrom(llvm::StringRef str) { + return llvm::StringSwitch<std::optional<ClientLauncher::Client>>(str.lower()) + .Case("vscode", ClientLauncher::VSCode) + .Case("vscode-url", ClientLauncher::VSCodeURL) + .Default(std::nullopt); +} + +std::unique_ptr<ClientLauncher> +ClientLauncher::GetLauncher(ClientLauncher::Client client) { + switch (client) { + case ClientLauncher::VSCode: + return std::make_unique<VSCodeLauncher>(); + case ClientLauncher::VSCodeURL: + return std::make_unique<VSCodeURLPrinter>(); + } + return nullptr; +} + +std::string VSCodeLauncher::URLEncode(llvm::StringRef str) { + std::string out; + llvm::raw_string_ostream os(out); + for (char c : str) { + if (std::isalnum(c) || llvm::StringRef("-_.~").contains(c)) + os << c; + else + os << '%' << llvm::utohexstr(c, false, 2); + } + return os.str(); +} + +std::string +VSCodeLauncher::GetLaunchURL(const std::vector<llvm::StringRef> args) const { + assert(!args.empty() && "empty launch args"); + + std::vector<std::string> encoded_launch_args; + for (llvm::StringRef arg : args) + encoded_launch_args.push_back(URLEncode(arg)); + + const std::string args_str = llvm::join(encoded_launch_args, "&args="); + return llvm::formatv( + "vscode://llvm-vs-code-extensions.lldb-dap/start?program={0}", + args_str) + .str(); +} + +llvm::Error VSCodeLauncher::Launch(const std::vector<llvm::StringRef> args) { + const std::string launch_url = GetLaunchURL(args); + const std::string command = + llvm::formatv("code --open-url {0}", launch_url).str(); + + std::system(command.c_str()); + return llvm::Error::success(); +} + +llvm::Error VSCodeURLPrinter::Launch(const std::vector<llvm::StringRef> args) { + llvm::outs() << GetLaunchURL(args) << '\n'; + return llvm::Error::success(); +} diff --git a/lldb/tools/lldb-dap/ClientLauncher.h b/lldb/tools/lldb-dap/ClientLauncher.h new file mode 100644 index 0000000000000..780b178d2d6ef --- /dev/null +++ b/lldb/tools/lldb-dap/ClientLauncher.h @@ -0,0 +1,50 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLDB_TOOLS_LLDB_DAP_CLIENTLAUNCHER_H +#define LLDB_TOOLS_LLDB_DAP_CLIENTLAUNCHER_H + +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/Error.h" +#include <vector> + +namespace lldb_dap { + +class ClientLauncher { +public: + enum Client { + VSCode, + VSCodeURL, + }; + + virtual ~ClientLauncher() = default; + virtual llvm::Error Launch(const std::vector<llvm::StringRef> args) = 0; + + static std::optional<Client> GetClientFrom(llvm::StringRef str); + static std::unique_ptr<ClientLauncher> GetLauncher(Client client); +}; + +class VSCodeLauncher : public ClientLauncher { +public: + using ClientLauncher::ClientLauncher; + + llvm::Error Launch(const std::vector<llvm::StringRef> args) override; + + std::string GetLaunchURL(const std::vector<llvm::StringRef> args) const; + static std::string URLEncode(llvm::StringRef str); +}; + +class VSCodeURLPrinter : public VSCodeLauncher { + using VSCodeLauncher::VSCodeLauncher; + + llvm::Error Launch(const std::vector<llvm::StringRef> args) override; +}; + +} // namespace lldb_dap + +#endif diff --git a/lldb/tools/lldb-dap/tool/Options.td b/lldb/tools/lldb-dap/tool/Options.td index 5e9dd7a1d6419..339a64fed6c32 100644 --- a/lldb/tools/lldb-dap/tool/Options.td +++ b/lldb/tools/lldb-dap/tool/Options.td @@ -82,3 +82,11 @@ def connection_timeout: S<"connection-timeout">, "timeout is reached, the server will be closed and the process will exit. " "Not specifying this argument or specifying non-positive values will " "cause the server to wait for new connections indefinitely.">; + +def client + : S<"client">, + MetaVarName<"<client>">, + HelpText< + "Use lldb-dap as a launcher for a curated number of DAP client.">; + +def REM : R<["--"], "">; diff --git a/lldb/tools/lldb-dap/tool/lldb-dap.cpp b/lldb/tools/lldb-dap/tool/lldb-dap.cpp index 45caa1a81059b..f10ed12344cbd 100644 --- a/lldb/tools/lldb-dap/tool/lldb-dap.cpp +++ b/lldb/tools/lldb-dap/tool/lldb-dap.cpp @@ -6,6 +6,7 @@ // //===----------------------------------------------------------------------===// +#include "ClientLauncher.h" #include "DAP.h" #include "DAPLog.h" #include "EventHelper.h" @@ -141,6 +142,12 @@ static void PrintHelp(LLDBDAPOptTable &table, llvm::StringRef tool_name) { debugger to attach to the process. lldb-dap -g + + You can also use lldb-dap to launch a supported client, for example the + LLDB-DAP Visual Studio Code extension. 
+ + lldb-dap --client vscode -- /path/to/binary <args> + )___"; } @@ -150,6 +157,29 @@ static void PrintVersion() { llvm::outs() << "liblldb: " << lldb::SBDebugger::GetVersionString() << '\n'; } +static llvm::Error LaunchClient(const llvm::opt::InputArgList &args) { + auto *client_arg = args.getLastArg(OPT_client); + assert(client_arg && "must have client arg"); + + std::optional<ClientLauncher::Client> client = + ClientLauncher::GetClientFrom(client_arg->getValue()); + if (!client) + return llvm::createStringError( + llvm::formatv("unsupported client: {0}", client_arg->getValue())); + + std::vector<llvm::StringRef> launch_args; + if (auto *arg = args.getLastArgNoClaim(OPT_REM)) { + for (auto *value : arg->getValues()) { + launch_args.push_back(value); + } + } + + if (launch_args.empty()) + return llvm::createStringError("no launch arguments provided"); + + return ClientLauncher::GetLauncher(*client)->Launch(launch_args); +} + #if not defined(_WIN32) struct FDGroup { int GetFlags() const { @@ -541,6 +571,14 @@ int main(int argc, char *argv[]) { return EXIT_SUCCESS; } + if (input_args.hasArg(OPT_client)) { + if (llvm::Error error = LaunchClient(input_args)) { + llvm::WithColor::error() << llvm::toString(std::move(error)) << '\n'; + return EXIT_FAILURE; + } + return EXIT_SUCCESS; + } + ReplMode default_repl_mode = ReplMode::Auto; if (input_args.hasArg(OPT_repl_mode)) { llvm::opt::Arg *repl_mode = input_args.getLastArg(OPT_repl_mode); diff --git a/lldb/unittests/DAP/CMakeLists.txt b/lldb/unittests/DAP/CMakeLists.txt index a08414c30e6cd..b1fdef18fddba 100644 --- a/lldb/unittests/DAP/CMakeLists.txt +++ b/lldb/unittests/DAP/CMakeLists.txt @@ -1,4 +1,5 @@ add_lldb_unittest(DAPTests + ClientLauncherTest.cpp DAPErrorTest.cpp DAPTest.cpp DAPTypesTest.cpp diff --git a/lldb/unittests/DAP/ClientLauncherTest.cpp b/lldb/unittests/DAP/ClientLauncherTest.cpp new file mode 100644 index 0000000000000..dbaf9ee786336 --- /dev/null +++ b/lldb/unittests/DAP/ClientLauncherTest.cpp @@ -0,0 +1,71 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "ClientLauncher.h" +#include "llvm/ADT/StringRef.h" +#include "gtest/gtest.h" +#include <optional> + +using namespace lldb_dap; +using namespace llvm; + +TEST(ClientLauncherTest, GetClientFromVSCode) { + std::optional<ClientLauncher::Client> result = + ClientLauncher::GetClientFrom("vscode"); + ASSERT_TRUE(result.has_value()); + EXPECT_EQ(ClientLauncher::VSCode, result.value()); +} + +TEST(ClientLauncherTest, GetClientFromVSCodeUpperCase) { + std::optional<ClientLauncher::Client> result = + ClientLauncher::GetClientFrom("VSCODE"); + ASSERT_TRUE(result.has_value()); + EXPECT_EQ(ClientLauncher::VSCode, result.value()); +} + +TEST(ClientLauncherTest, GetClientFromVSCodeMixedCase) { + std::optional<ClientLauncher::Client> result = + ClientLauncher::GetClientFrom("VSCode"); + ASSERT_TRUE(result.has_value()); + EXPECT_EQ(ClientLauncher::VSCode, result.value()); +} + +TEST(ClientLauncherTest, GetClientFromInvalidString) { + std::optional<ClientLauncher::Client> result = + ClientLauncher::GetClientFrom("invalid"); + EXPECT_FALSE(result.has_value()); +} + +TEST(ClientLauncherTest, GetClientFromEmptyString) { + std::optional<ClientLauncher::Client> result = + ClientLauncher::GetClientFrom(""); + EXPECT_FALSE(result.has_value()); +} + +TEST(ClientLauncherTest, URLEncode) { + EXPECT_EQ("", VSCodeLauncher::URLEncode("")); + EXPECT_EQ( + "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_.~", + VSCodeLauncher::URLEncode("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRST" + "UVWXYZ0123456789-_.~")); + EXPECT_EQ("hello%20world", VSCodeLauncher::URLEncode("hello world")); + EXPECT_EQ("hello%21%40%23%24", VSCodeLauncher::URLEncode("hello!@#$")); + EXPECT_EQ("%2Fpath%2Fto%2Ffile", VSCodeLauncher::URLEncode("/path/to/file")); + EXPECT_EQ("key%3Dvalue%26key2%3Dvalue2", + VSCodeLauncher::URLEncode("key=value&key2=value2")); + EXPECT_EQ("100%25complete", VSCodeLauncher::URLEncode("100%complete")); + EXPECT_EQ("file_name%20with%20spaces%20%26%20special%21.txt", + VSCodeLauncher::URLEncode("file_name with spaces & special!.txt")); + EXPECT_EQ("%00%01%02", + VSCodeLauncher::URLEncode(llvm::StringRef("\x00\x01\x02", 3))); + EXPECT_EQ("test-file_name.txt~", + VSCodeLauncher::URLEncode("test-file_name.txt~")); + + // UTF-8 encoded characters should be percent-encoded byte by byte. + EXPECT_EQ("%C3%A9", VSCodeLauncher::URLEncode("é")); +} From 5e8a0d64952f374ea0194d3d2876d1deeb8c5320 Mon Sep 17 00:00:00 2001 From: Craig Topper <craig.topper@sifive.com> Date: Mon, 3 Nov 2025 13:37:46 -0800 Subject: [PATCH 079/313] [RISCV] Use reportFatalUsageError in a few places (#166218) Makes it consistent with feedback given in the equivalent GISel code. 
https://github.com/llvm/llvm-project/pull/165876 --- llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp index b25a05400fe31..907833513c5d1 100644 --- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp @@ -371,8 +371,8 @@ void RISCVDAGToDAGISel::selectVLXSEG(SDNode *Node, unsigned NF, bool IsMasked, RISCVVType::VLMUL IndexLMUL = RISCVTargetLowering::getLMUL(IndexVT); unsigned IndexLog2EEW = Log2_32(IndexVT.getScalarSizeInBits()); if (IndexLog2EEW == 6 && !Subtarget->is64Bit()) { - report_fatal_error("The V extension does not support EEW=64 for index " - "values when XLEN=32"); + reportFatalUsageError("The V extension does not support EEW=64 for index " + "values when XLEN=32"); } const RISCV::VLXSEGPseudo *P = RISCV::getVLXSEGPseudo( NF, IsMasked, IsOrdered, IndexLog2EEW, static_cast<unsigned>(LMUL), @@ -444,8 +444,8 @@ void RISCVDAGToDAGISel::selectVSXSEG(SDNode *Node, unsigned NF, bool IsMasked, RISCVVType::VLMUL IndexLMUL = RISCVTargetLowering::getLMUL(IndexVT); unsigned IndexLog2EEW = Log2_32(IndexVT.getScalarSizeInBits()); if (IndexLog2EEW == 6 && !Subtarget->is64Bit()) { - report_fatal_error("The V extension does not support EEW=64 for index " - "values when XLEN=32"); + reportFatalUsageError("The V extension does not support EEW=64 for index " + "values when XLEN=32"); } const RISCV::VSXSEGPseudo *P = RISCV::getVSXSEGPseudo( NF, IsMasked, IsOrdered, IndexLog2EEW, static_cast<unsigned>(LMUL), @@ -2223,8 +2223,8 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { RISCVVType::VLMUL IndexLMUL = RISCVTargetLowering::getLMUL(IndexVT); unsigned IndexLog2EEW = Log2_32(IndexVT.getScalarSizeInBits()); if (IndexLog2EEW == 6 && !Subtarget->is64Bit()) { - report_fatal_error("The V extension does not support EEW=64 for index " - "values when XLEN=32"); + reportFatalUsageError("The V extension does not support EEW=64 for " + "index values when XLEN=32"); } const RISCV::VLX_VSXPseudo *P = RISCV::getVLXPseudo( IsMasked, IsOrdered, IndexLog2EEW, static_cast<unsigned>(LMUL), @@ -2457,8 +2457,8 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { RISCVVType::VLMUL IndexLMUL = RISCVTargetLowering::getLMUL(IndexVT); unsigned IndexLog2EEW = Log2_32(IndexVT.getScalarSizeInBits()); if (IndexLog2EEW == 6 && !Subtarget->is64Bit()) { - report_fatal_error("The V extension does not support EEW=64 for index " - "values when XLEN=32"); + reportFatalUsageError("The V extension does not support EEW=64 for " + "index values when XLEN=32"); } const RISCV::VLX_VSXPseudo *P = RISCV::getVSXPseudo( IsMasked, IsOrdered, IndexLog2EEW, From bc0d0cf3ac9b5f1c2ec33d22aba8e7ece7d08ed2 Mon Sep 17 00:00:00 2001 From: Doug Wyatt <doug@sonosphere.com> Date: Mon, 3 Nov 2025 13:39:56 -0800 Subject: [PATCH 080/313] [Clang] FunctionEffect analysis was missing a CXXBindTemporaryExpr's implicit call to a destructor. (#166110) This example is reduced from a discovery: resetting a shared pointer from a nonblocking function is not diagnosed. ``` void nb23() { struct X { int *ptr = nullptr; X() {} ~X() { delete ptr; } }; auto inner = []() [[clang::nonblocking]] { X(); }; } ``` `shared_ptr<T>::reset()` creates a temporary `shared_ptr` and swaps it with its current state. The temporary `shared_ptr` constructor is nonblocking but its destructor potentially deallocates memory and is unsafe. 
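As a minimal sketch of that pattern (hypothetical function, not from the patch; `reset()` internally swaps in a default-constructed temporary whose destructor releases the old state):

```c++
#include <memory>

void audio_callback(std::shared_ptr<int> &state) [[clang::nonblocking]] {
  // The temporary shared_ptr inside reset() has a nonblocking-safe
  // constructor, but its destructor may deallocate memory; that implicit
  // destructor call is the one the analysis was missing.
  state.reset();
}
```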
Analysis was ignoring the implicit call in the AST to destroy the temporary. --------- Co-authored-by: Doug Wyatt <dwyatt@apple.com> --- clang/docs/ReleaseNotes.rst | 1 + clang/lib/Sema/SemaFunctionEffects.cpp | 8 ++++++ .../Sema/attr-nonblocking-constraints.cpp | 27 +++++++++++++++++++ 3 files changed, 36 insertions(+) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index cd272396252d0..db695d86b5416 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -460,6 +460,7 @@ Bug Fixes to Attribute Support - Fix a crash when the function name is empty in the `swift_name` attribute. (#GH157075) - Fixes crashes or missing diagnostics with the `device_kernel` attribute. (#GH161905) - Fix handling of parameter indexes when an attribute is applied to a C++23 explicit object member function. +- Fixed several false positives and false negatives in function effect (`nonblocking`) analysis. (#GH166078) (#GH166101) (#GH166110) Bug Fixes to C++ Support ^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/clang/lib/Sema/SemaFunctionEffects.cpp b/clang/lib/Sema/SemaFunctionEffects.cpp index 5459861ec349d..4b63eb7df1054 100644 --- a/clang/lib/Sema/SemaFunctionEffects.cpp +++ b/clang/lib/Sema/SemaFunctionEffects.cpp @@ -1279,7 +1279,15 @@ class Analyzer { const CXXConstructorDecl *Ctor = Construct->getConstructor(); CallableInfo CI(*Ctor); followCall(CI, Construct->getLocation()); + return true; + } + bool VisitCXXBindTemporaryExpr(CXXBindTemporaryExpr *BTE) override { + const CXXDestructorDecl *Dtor = BTE->getTemporary()->getDestructor(); + if (Dtor != nullptr) { + CallableInfo CI(*Dtor); + followCall(CI, BTE->getBeginLoc()); + } return true; } diff --git a/clang/test/Sema/attr-nonblocking-constraints.cpp b/clang/test/Sema/attr-nonblocking-constraints.cpp index 0d2dbb4947dc8..881e816292d59 100644 --- a/clang/test/Sema/attr-nonblocking-constraints.cpp +++ b/clang/test/Sema/attr-nonblocking-constraints.cpp @@ -373,6 +373,33 @@ struct Unsafe { Unsafe(float y) [[clang::nonblocking]] : Unsafe(int(y)) {} // expected-warning {{constructor with 'nonblocking' attribute must not call non-'nonblocking' constructor 'Unsafe::Unsafe'}} }; +// Exercise cases of a temporary with a safe constructor and unsafe destructor. 
+void nb23() +{ + struct X { + int *ptr = nullptr; + X() {} + ~X() { delete ptr; } // expected-note 2 {{destructor cannot be inferred 'nonblocking' because it allocates or deallocates memory}} + }; + + auto inner = []() [[clang::nonblocking]] { + X(); // expected-warning {{lambda with 'nonblocking' attribute must not call non-'nonblocking' destructor 'nb23()::X::~X'}} + }; + + auto inner2 = [](X x) [[clang::nonblocking]] { // expected-warning {{lambda with 'nonblocking' attribute must not call non-'nonblocking' destructor 'nb23()::X::~X'}} + }; + +} + +struct S2 { ~S2(); }; // expected-note 2 {{declaration cannot be inferred 'nonblocking' because it has no definition in this translation unit}} +void nb24() { + S2 s; + [&]() [[clang::nonblocking]] { + [s]{ auto x = &s; }(); // expected-warning {{lambda with 'nonblocking' attribute must not call non-'nonblocking' destructor}} expected-note {{destructor cannot be inferred 'nonblocking' because it calls non-'nonblocking' destructor 'S2::~S2'}} + [=]{ auto x = &s; }(); // expected-warning {{lambda with 'nonblocking' attribute must not call non-'nonblocking' destructor}} expected-note {{destructor cannot be inferred 'nonblocking' because it calls non-'nonblocking' destructor 'S2::~S2'}} + }(); +} + struct DerivedFromUnsafe : public Unsafe { DerivedFromUnsafe() [[clang::nonblocking]] {} // expected-warning {{constructor with 'nonblocking' attribute must not call non-'nonblocking' constructor 'Unsafe::Unsafe'}} DerivedFromUnsafe(int x) [[clang::nonblocking]] : Unsafe(x) {} // expected-warning {{constructor with 'nonblocking' attribute must not call non-'nonblocking' constructor 'Unsafe::Unsafe'}} From 1d8d8dc6b53aea14d5a60169e426416af7cc273f Mon Sep 17 00:00:00 2001 From: LLVM GN Syncbot <llvmgnsyncbot@gmail.com> Date: Mon, 3 Nov 2025 21:43:11 +0000 Subject: [PATCH 081/313] [gn build] Port 1621486d676f --- llvm/utils/gn/secondary/lldb/tools/lldb-dap/BUILD.gn | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/utils/gn/secondary/lldb/tools/lldb-dap/BUILD.gn b/llvm/utils/gn/secondary/lldb/tools/lldb-dap/BUILD.gn index a42f781271a21..b6c2f465a7292 100644 --- a/llvm/utils/gn/secondary/lldb/tools/lldb-dap/BUILD.gn +++ b/llvm/utils/gn/secondary/lldb/tools/lldb-dap/BUILD.gn @@ -21,6 +21,7 @@ static_library("lib") { sources = [ "Breakpoint.cpp", "BreakpointBase.cpp", + "ClientLauncher.cpp", "CommandPlugins.cpp", "DAP.cpp", "DAPError.cpp", From a8de6499c240abbcb2c6b51e6b59cc048a7533c2 Mon Sep 17 00:00:00 2001 From: Dan Liew <dan@su-root.co.uk> Date: Mon, 3 Nov 2025 13:54:22 -0800 Subject: [PATCH 082/313] [NFC][LLDB][BoundsSafety] Add `InstrumentationRuntime::MatchAllModules` (#166001) This adds a virtual method that allows `InstrumentationRuntime` subclasses to match against all modules rather than just a library that matches a particular regex. When the implementation returns true, `GetPatternForRuntimeLibrary()` is ignored and all modules are iterated over. The default implementation returns false, preserving the previous behavior, which uses `GetPatternForRuntimeLibrary()` to match only a particular runtime library. The intended use case here is for implementing an `InstrumentationRuntime` where the runtime library of interest can have multiple implementations and whose name is not known ahead of time. The concrete use case here is for an `InstrumentationRuntime` plugin for implementations of the `-fbounds-safety` soft-trap runtime which can have multiple different implementations and so the module containing the runtime functions isn't known ahead of time.
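As a rough sketch (hypothetical subclass and helper, not part of this patch; other required overrides such as `GetPatternForRuntimeLibrary()` and `Activate()` are omitted), such a plugin would override the new hook and do its real detection in `CheckIfRuntimeIsValid`:

```c++
#include "lldb/Target/InstrumentationRuntime.h"

// Hypothetical helper that inspects a module for the runtime's symbols.
bool ContainsSoftTrapRuntimeSymbols(const lldb::ModuleSP &module_sp);

class BoundsSafetyRuntime : public lldb_private::InstrumentationRuntime {
  bool MatchAllModules() override {
    // Visit every loaded module; GetPatternForRuntimeLibrary() is ignored.
    return true;
  }

  bool CheckIfRuntimeIsValid(const lldb::ModuleSP module_sp) override {
    // The runtime library's name isn't known ahead of time, so look for
    // its symbols in whichever module they happen to live in.
    return ContainsSoftTrapRuntimeSymbols(module_sp);
  }
};
```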
The concrete use case here is for a `InstrumentationRuntime` plugin for implementations of the `-fbounds-safety` soft-trap runtime which can have multiple different implementations and so the module containing the runtime functions isn't known ahead of time. This plug-in will be upstreamed as part of the process of upstreaming `-fbounds-safety`. An alternative to this would be for the `GetPatternForRuntimeLibrary()` function to return a regex that matches everything. While that technically works this new API more clearly indicates in the intent. We probably also save a little perf by not executing the regex match for every loaded module but I have not measured this. rdar://163230807 --- lldb/include/lldb/Target/InstrumentationRuntime.h | 7 +++++++ lldb/source/Target/InstrumentationRuntime.cpp | 3 ++- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/lldb/include/lldb/Target/InstrumentationRuntime.h b/lldb/include/lldb/Target/InstrumentationRuntime.h index a6121c24b9560..d2499528e97ab 100644 --- a/lldb/include/lldb/Target/InstrumentationRuntime.h +++ b/lldb/include/lldb/Target/InstrumentationRuntime.h @@ -73,6 +73,13 @@ class InstrumentationRuntime /// is guaranteed to be loaded. virtual void Activate() = 0; + /// \return true if `CheckIfRuntimeIsValid` should be called on all modules. + /// In this case the return value of `GetPatternForRuntimeLibrary` will be + /// ignored. Return false if `CheckIfRuntimeIsValid` should only be called + /// for modules whose name matches `GetPatternForRuntimeLibrary`. + /// + virtual bool MatchAllModules() { return false; } + public: static void ModulesDidLoad(lldb_private::ModuleList &module_list, Process *process, diff --git a/lldb/source/Target/InstrumentationRuntime.cpp b/lldb/source/Target/InstrumentationRuntime.cpp index 7e58e8bf26cb1..d9800a8541f4e 100644 --- a/lldb/source/Target/InstrumentationRuntime.cpp +++ b/lldb/source/Target/InstrumentationRuntime.cpp @@ -55,7 +55,8 @@ void InstrumentationRuntime::ModulesDidLoad( return IterationAction::Continue; const RegularExpression &runtime_regex = GetPatternForRuntimeLibrary(); - if (runtime_regex.Execute(file_spec.GetFilename().GetCString()) || + if (MatchAllModules() || + runtime_regex.Execute(file_spec.GetFilename().GetCString()) || module_sp->IsExecutable()) { if (CheckIfRuntimeIsValid(module_sp)) { SetRuntimeModuleSP(module_sp); From f77ce52b56d025399f489a8c0aad8c18c4b06045 Mon Sep 17 00:00:00 2001 From: jinge90 <ge.jin@intel.com> Date: Tue, 4 Nov 2025 05:55:38 +0800 Subject: [PATCH 083/313] [Driver][NFC] Don't specify offloading model in help text for -Xarch_device/host (#165503) Current implementation for -Xarch_device/host can work in any offloading model besides CUDA/HIP, so remove the specific offloading model in help text to align with implementation. 
--------- Signed-off-by: jinge90 <ge.jin@intel.com> Co-authored-by: Alexey Bader <alexey.bader@intel.com> Co-authored-by: Joseph Huber <huberjn@outlook.com> --- clang/include/clang/Driver/Options.td | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index 5cf332783cbc3..11e81e032d5fc 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -951,9 +951,9 @@ def Xarch__ the host system, which can be used to suppress incompatible GPU arguments.}]>, MetaVarName<"<arch> <arg>">; def Xarch_host : Separate<["-"], "Xarch_host">, Flags<[NoXarchOption]>, - HelpText<"Pass <arg> to the CUDA/HIP host compilation">, MetaVarName<"<arg>">; + HelpText<"Pass <arg> to host compilation in the offloading toolchain">, MetaVarName<"<arg>">; def Xarch_device : Separate<["-"], "Xarch_device">, Flags<[NoXarchOption]>, - HelpText<"Pass <arg> to the CUDA/HIP device compilation">, MetaVarName<"<arg>">; + HelpText<"Pass <arg> to device compilation in the offloading toolchain">, MetaVarName<"<arg>">; def Xassembler : Separate<["-"], "Xassembler">, HelpText<"Pass <arg> to the assembler">, MetaVarName<"<arg>">, Group<CompileOnly_Group>; From ac21fde46488c8c46f1cabf8205ecd73c3276765 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?= =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?= =?UTF-8?q?=E3=83=B3=29?= <clementval@gmail.com> Date: Mon, 3 Nov 2025 11:56:22 -1000 Subject: [PATCH 084/313] [flang][cuda] Add support for f16 atomicadd (#166229) --- .../flang/Optimizer/Builder/IntrinsicCall.h | 2 + flang/lib/Optimizer/Builder/IntrinsicCall.cpp | 43 ++++++++++++++++++- flang/module/cudadevice.f90 | 5 +++ flang/test/Lower/CUDA/cuda-device-proc.cuf | 4 ++ 4 files changed, 53 insertions(+), 1 deletion(-) diff --git a/flang/include/flang/Optimizer/Builder/IntrinsicCall.h b/flang/include/flang/Optimizer/Builder/IntrinsicCall.h index 3407dd01dd504..9f15ce68eb3d5 100644 --- a/flang/include/flang/Optimizer/Builder/IntrinsicCall.h +++ b/flang/include/flang/Optimizer/Builder/IntrinsicCall.h @@ -188,6 +188,8 @@ struct IntrinsicLibrary { fir::ExtendedValue genAny(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>); mlir::Value genAtanpi(mlir::Type, llvm::ArrayRef<mlir::Value>); mlir::Value genAtomicAdd(mlir::Type, llvm::ArrayRef<mlir::Value>); + fir::ExtendedValue genAtomicAddR2(mlir::Type, + llvm::ArrayRef<fir::ExtendedValue>); mlir::Value genAtomicAnd(mlir::Type, llvm::ArrayRef<mlir::Value>); fir::ExtendedValue genAtomicCas(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>); diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp index 15ea84565dd75..6be4d9ce0a46c 100644 --- a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp +++ b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp @@ -294,6 +294,10 @@ static constexpr IntrinsicHandler handlers[]{ {"atomicaddf", &I::genAtomicAdd, {{{"a", asAddr}, {"v", asValue}}}, false}, {"atomicaddi", &I::genAtomicAdd, {{{"a", asAddr}, {"v", asValue}}}, false}, {"atomicaddl", &I::genAtomicAdd, {{{"a", asAddr}, {"v", asValue}}}, false}, + {"atomicaddr2", + &I::genAtomicAddR2, + {{{"a", asAddr}, {"v", asAddr}}}, + false}, {"atomicandi", &I::genAtomicAnd, {{{"a", asAddr}, {"v", asValue}}}, false}, {"atomiccasd", &I::genAtomicCas, @@ -3119,7 +3123,6 @@ static mlir::Value genAtomBinOp(fir::FirOpBuilder &builder, mlir::Location &loc, mlir::Value 
IntrinsicLibrary::genAtomicAdd(mlir::Type resultType, llvm::ArrayRef<mlir::Value> args) { assert(args.size() == 2); - mlir::LLVM::AtomicBinOp binOp = mlir::isa<mlir::IntegerType>(args[1].getType()) ? mlir::LLVM::AtomicBinOp::add @@ -3127,6 +3130,44 @@ mlir::Value IntrinsicLibrary::genAtomicAdd(mlir::Type resultType, return genAtomBinOp(builder, loc, binOp, args[0], args[1]); } +fir::ExtendedValue +IntrinsicLibrary::genAtomicAddR2(mlir::Type resultType, + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 2); + + mlir::Value a = fir::getBase(args[0]); + + if (mlir::isa<fir::BaseBoxType>(a.getType())) { + a = fir::BoxAddrOp::create(builder, loc, a); + } + + auto loc = builder.getUnknownLoc(); + auto f16Ty = builder.getF16Type(); + auto i32Ty = builder.getI32Type(); + auto vecF16Ty = mlir::VectorType::get({2}, f16Ty); + mlir::Type idxTy = builder.getIndexType(); + auto f16RefTy = fir::ReferenceType::get(f16Ty); + auto zero = builder.createIntegerConstant(loc, idxTy, 0); + auto one = builder.createIntegerConstant(loc, idxTy, 1); + auto v1Coord = fir::CoordinateOp::create(builder, loc, f16RefTy, + fir::getBase(args[1]), zero); + auto v2Coord = fir::CoordinateOp::create(builder, loc, f16RefTy, + fir::getBase(args[1]), one); + auto v1 = fir::LoadOp::create(builder, loc, v1Coord); + auto v2 = fir::LoadOp::create(builder, loc, v2Coord); + mlir::Value undef = mlir::LLVM::UndefOp::create(builder, loc, vecF16Ty); + mlir::Value vec1 = mlir::LLVM::InsertElementOp::create( + builder, loc, undef, v1, builder.createIntegerConstant(loc, i32Ty, 0)); + mlir::Value vec2 = mlir::LLVM::InsertElementOp::create( + builder, loc, vec1, v2, builder.createIntegerConstant(loc, i32Ty, 1)); + auto res = genAtomBinOp(builder, loc, mlir::LLVM::AtomicBinOp::fadd, a, vec2); + auto i32VecTy = mlir::VectorType::get({1}, i32Ty); + mlir::Value vecI32 = + mlir::vector::BitCastOp::create(builder, loc, i32VecTy, res); + return mlir::vector::ExtractOp::create(builder, loc, vecI32, + mlir::ArrayRef<int64_t>{0}); +} + mlir::Value IntrinsicLibrary::genAtomicSub(mlir::Type resultType, llvm::ArrayRef<mlir::Value> args) { assert(args.size() == 2); diff --git a/flang/module/cudadevice.f90 b/flang/module/cudadevice.f90 index 59af58ddcd32e..7a764b589dc56 100644 --- a/flang/module/cudadevice.f90 +++ b/flang/module/cudadevice.f90 @@ -1171,6 +1171,11 @@ attributes(device) pure integer(8) function atomicaddl(address, val) integer(8), intent(inout) :: address integer(8), value :: val end function + attributes(device) pure integer(4) function atomicaddr2(address, val) + !dir$ ignore_tkr (rd) address, (d) val + real(2), dimension(2), intent(inout) :: address + real(2), dimension(2), intent(in) :: val + end function end interface interface atomicsub diff --git a/flang/test/Lower/CUDA/cuda-device-proc.cuf b/flang/test/Lower/CUDA/cuda-device-proc.cuf index 09b4302446ee7..674548b7489e8 100644 --- a/flang/test/Lower/CUDA/cuda-device-proc.cuf +++ b/flang/test/Lower/CUDA/cuda-device-proc.cuf @@ -14,6 +14,8 @@ attributes(global) subroutine devsub() integer :: smalltime integer(4) :: res, offset integer(8) :: resl + real(2) :: r2a(2) + real(2) :: tmp2(2) integer :: tid tid = threadIdx%x @@ -34,6 +36,7 @@ attributes(global) subroutine devsub() al = atomicadd(al, 1_8) af = atomicadd(af, 1.0_4) ad = atomicadd(ad, 1.0_8) + ai = atomicadd(r2a, tmp2) ai = atomicsub(ai, 1_4) al = atomicsub(al, 1_8) @@ -128,6 +131,7 @@ end ! CHECK: %{{.*}} = llvm.atomicrmw add %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, i64 ! 
CHECK: %{{.*}} = llvm.atomicrmw fadd %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, f32 ! CHECK: %{{.*}} = llvm.atomicrmw fadd %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, f64 +! CHECK: %{{.*}} = llvm.atomicrmw fadd %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, vector<2xf16> ! CHECK: %{{.*}} = llvm.atomicrmw sub %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, i32 ! CHECK: %{{.*}} = llvm.atomicrmw sub %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, i64 From f62ff29157cb75a17b087475168fa15ddcb7c059 Mon Sep 17 00:00:00 2001 From: Jakub Kuderski <jakub@nod-labs.com> Date: Mon, 3 Nov 2025 17:00:57 -0500 Subject: [PATCH 085/313] [mlir][spirv] Fix Intel SubgroupBlock* op tests (#166185) `spirv-val` learned how to check related ops. Move them to a new test file and set the required capabilities. Closes: https://github.com/llvm/llvm-project/issues/166184 --- mlir/test/Target/SPIRV/group-ops.mlir | 30 +++------------- .../Target/SPIRV/subgroup-block-intel.mlir | 34 +++++++++++++++++++ 2 files changed, 38 insertions(+), 26 deletions(-) create mode 100644 mlir/test/Target/SPIRV/subgroup-block-intel.mlir diff --git a/mlir/test/Target/SPIRV/group-ops.mlir b/mlir/test/Target/SPIRV/group-ops.mlir index cf519cba961c5..6f19b3553dd37 100644 --- a/mlir/test/Target/SPIRV/group-ops.mlir +++ b/mlir/test/Target/SPIRV/group-ops.mlir @@ -1,11 +1,13 @@ -// RUN: mlir-translate -no-implicit-module -test-spirv-roundtrip -split-input-file %s | FileCheck %s +// RUN: mlir-translate --no-implicit-module --test-spirv-roundtrip --split-input-file %s | FileCheck %s // RUN: %if spirv-tools %{ rm -rf %t %} // RUN: %if spirv-tools %{ mkdir %t %} // RUN: %if spirv-tools %{ mlir-translate --no-implicit-module --serialize-spirv --split-input-file --spirv-save-validation-files-with-prefix=%t/module %s %} // RUN: %if spirv-tools %{ spirv-val %t %} -spirv.module Logical GLSL450 requires #spirv.vce<v1.3, [Shader, Linkage, SubgroupBallotKHR, Groups, SubgroupBufferBlockIOINTEL, GroupNonUniformArithmetic, GroupUniformArithmeticKHR], [SPV_KHR_storage_buffer_storage_class, SPV_KHR_shader_ballot, SPV_INTEL_subgroups, SPV_KHR_uniform_group_instructions]> { +spirv.module Logical GLSL450 requires #spirv.vce<v1.3, + [Shader, Linkage, SubgroupBallotKHR, Groups, GroupNonUniformArithmetic, GroupUniformArithmeticKHR], + [SPV_KHR_storage_buffer_storage_class, SPV_KHR_shader_ballot, SPV_KHR_uniform_group_instructions]> { // CHECK-LABEL: @subgroup_ballot spirv.func @subgroup_ballot(%predicate: i1) -> vector<4xi32> "None" { // CHECK: %{{.*}} = spirv.KHR.SubgroupBallot %{{.*}}: vector<4xi32> @@ -24,30 +26,6 @@ spirv.module Logical GLSL450 requires #spirv.vce<v1.3, [Shader, Linkage, Subgrou %0 = spirv.GroupBroadcast <Workgroup> %value, %localid : f32, vector<3xi32> spirv.ReturnValue %0: f32 } - // CHECK-LABEL: @subgroup_block_read_intel - spirv.func @subgroup_block_read_intel(%ptr : !spirv.ptr<i32, StorageBuffer>) -> i32 "None" { - // CHECK: spirv.INTEL.SubgroupBlockRead %{{.*}} : !spirv.ptr<i32, StorageBuffer> -> i32 - %0 = spirv.INTEL.SubgroupBlockRead %ptr : !spirv.ptr<i32, StorageBuffer> -> i32 - spirv.ReturnValue %0: i32 - } - // CHECK-LABEL: @subgroup_block_read_intel_vector - spirv.func @subgroup_block_read_intel_vector(%ptr : !spirv.ptr<i32, StorageBuffer>) -> vector<3xi32> "None" { - // CHECK: spirv.INTEL.SubgroupBlockRead %{{.*}} : !spirv.ptr<i32, StorageBuffer> -> vector<3xi32> - %0 = spirv.INTEL.SubgroupBlockRead %ptr : !spirv.ptr<i32, StorageBuffer> -> vector<3xi32> - spirv.ReturnValue %0: vector<3xi32> - } - // CHECK-LABEL: @subgroup_block_write_intel - spirv.func 
@subgroup_block_write_intel(%ptr : !spirv.ptr<i32, StorageBuffer>, %value: i32) -> () "None" { - // CHECK: spirv.INTEL.SubgroupBlockWrite %{{.*}}, %{{.*}} : i32 - spirv.INTEL.SubgroupBlockWrite "StorageBuffer" %ptr, %value : i32 - spirv.Return - } - // CHECK-LABEL: @subgroup_block_write_intel_vector - spirv.func @subgroup_block_write_intel_vector(%ptr : !spirv.ptr<i32, StorageBuffer>, %value: vector<3xi32>) -> () "None" { - // CHECK: spirv.INTEL.SubgroupBlockWrite %{{.*}}, %{{.*}} : vector<3xi32> - spirv.INTEL.SubgroupBlockWrite "StorageBuffer" %ptr, %value : vector<3xi32> - spirv.Return - } // CHECK-LABEL: @group_iadd spirv.func @group_iadd(%value: i32) -> i32 "None" { // CHECK: spirv.GroupIAdd <Workgroup> <Reduce> %{{.*}} : i32 diff --git a/mlir/test/Target/SPIRV/subgroup-block-intel.mlir b/mlir/test/Target/SPIRV/subgroup-block-intel.mlir new file mode 100644 index 0000000000000..14060e632fffd --- /dev/null +++ b/mlir/test/Target/SPIRV/subgroup-block-intel.mlir @@ -0,0 +1,34 @@ +// RUN: mlir-translate --no-implicit-module --test-spirv-roundtrip %s | FileCheck %s + +// RUN: %if spirv-tools %{ rm -rf %t %} +// RUN: %if spirv-tools %{ mkdir %t %} +// RUN: %if spirv-tools %{ mlir-translate --no-implicit-module --serialize-spirv --spirv-save-validation-files-with-prefix=%t/module %s %} +// RUN: %if spirv-tools %{ spirv-val %t %} + +spirv.module Physical64 GLSL450 requires #spirv.vce<v1.3, [Addresses, Shader, Linkage, SubgroupBufferBlockIOINTEL], + [SPV_KHR_storage_buffer_storage_class, SPV_INTEL_subgroups]> { + // CHECK-LABEL: @subgroup_block_read_intel + spirv.func @subgroup_block_read_intel(%ptr : !spirv.ptr<i32, StorageBuffer>) -> i32 "None" { + // CHECK: spirv.INTEL.SubgroupBlockRead %{{.*}} : !spirv.ptr<i32, StorageBuffer> -> i32 + %0 = spirv.INTEL.SubgroupBlockRead %ptr : !spirv.ptr<i32, StorageBuffer> -> i32 + spirv.ReturnValue %0: i32 + } + // CHECK-LABEL: @subgroup_block_read_intel_vector + spirv.func @subgroup_block_read_intel_vector(%ptr : !spirv.ptr<i32, StorageBuffer>) -> vector<3xi32> "None" { + // CHECK: spirv.INTEL.SubgroupBlockRead %{{.*}} : !spirv.ptr<i32, StorageBuffer> -> vector<3xi32> + %0 = spirv.INTEL.SubgroupBlockRead %ptr : !spirv.ptr<i32, StorageBuffer> -> vector<3xi32> + spirv.ReturnValue %0: vector<3xi32> + } + // CHECK-LABEL: @subgroup_block_write_intel + spirv.func @subgroup_block_write_intel(%ptr : !spirv.ptr<i32, StorageBuffer>, %value: i32) -> () "None" { + // CHECK: spirv.INTEL.SubgroupBlockWrite %{{.*}}, %{{.*}} : i32 + spirv.INTEL.SubgroupBlockWrite "StorageBuffer" %ptr, %value : i32 + spirv.Return + } + // CHECK-LABEL: @subgroup_block_write_intel_vector + spirv.func @subgroup_block_write_intel_vector(%ptr : !spirv.ptr<i32, StorageBuffer>, %value: vector<3xi32>) -> () "None" { + // CHECK: spirv.INTEL.SubgroupBlockWrite %{{.*}}, %{{.*}} : vector<3xi32> + spirv.INTEL.SubgroupBlockWrite "StorageBuffer" %ptr, %value : vector<3xi32> + spirv.Return + } +} From 2c8781de64891d1614bf54dd9e5e4f183a8acca3 Mon Sep 17 00:00:00 2001 From: Jacob Lambert <jacob.lambert@amd.com> Date: Mon, 3 Nov 2025 14:09:09 -0800 Subject: [PATCH 086/313] Revert "[Clang] Make the AS of llvm.compiler.used & llvm.used elements addrspace(0)" (#166242) Reverts llvm/llvm-project#164432 Breaks Comgr tests with the following: [2025-11-03T19:18:20.564Z] + clang -x hip --offload-arch=amdgcnspirv -nogpulib -nogpuinc --no-gpu-bundle-output --offload-device-only -O3 /jenkins/workspace/compiler-psdb-amd-staging/repos/llvm-project/amd/comgr/test-lit/spirv-tests/spirv-to-reloc.hip -o 
/jenkins/workspace/compiler-psdb-amd-staging/repos/out/ubuntu-22.04/22.04/build/amd_comgr/test-lit/spirv-tests/Output/spirv-to-reloc.hip.tmp.spv -fvisibility=hidden -fno-autolink -fexceptions -fcolor-diagnostics [2025-11-03T19:18:20.564Z] InvalidModule: Invalid SPIR-V module: Casts from private/local/global address space are allowed only to generic [2025-11-03T19:18:20.564Z] [2025-11-03T19:18:20.564Z] <badref> = addrspacecast ptr addrspace(1) @__hip_cuid_94fb83be5559070 to ptr [2025-11-03T19:18:20.564Z] clang: error: amdgcn-link command failed with exit code 10 (use -v to see invocation) --- clang/lib/CodeGen/CodeGenModule.cpp | 12 ++-- .../embed-bitcode-marker-with-nonzero-as.c | 2 +- .../llvm_compiler_used_elements_are_unqual.c | 64 ------------------- 3 files changed, 7 insertions(+), 71 deletions(-) delete mode 100644 clang/test/CodeGen/llvm_compiler_used_elements_are_unqual.c diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp index af5be95aec1cd..0fea57b2e1799 100644 --- a/clang/lib/CodeGen/CodeGenModule.cpp +++ b/clang/lib/CodeGen/CodeGenModule.cpp @@ -3331,18 +3331,18 @@ static void emitUsed(CodeGenModule &CGM, StringRef Name, if (List.empty()) return; - llvm::PointerType *UnqualPtr = - llvm::PointerType::getUnqual(CGM.getLLVMContext()); - // Convert List to what ConstantArray needs. SmallVector<llvm::Constant*, 8> UsedArray; UsedArray.resize(List.size()); for (unsigned i = 0, e = List.size(); i != e; ++i) { - UsedArray[i] = llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast( - cast<llvm::Constant>(&*List[i]), UnqualPtr); + UsedArray[i] = + llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast( + cast<llvm::Constant>(&*List[i]), CGM.Int8PtrTy); } - llvm::ArrayType *ATy = llvm::ArrayType::get(UnqualPtr, UsedArray.size()); + if (UsedArray.empty()) + return; + llvm::ArrayType *ATy = llvm::ArrayType::get(CGM.Int8PtrTy, UsedArray.size()); auto *GV = new llvm::GlobalVariable( CGM.getModule(), ATy, false, llvm::GlobalValue::AppendingLinkage, diff --git a/clang/test/CodeGen/embed-bitcode-marker-with-nonzero-as.c b/clang/test/CodeGen/embed-bitcode-marker-with-nonzero-as.c index 8af9708a1bfb8..df7118859c764 100644 --- a/clang/test/CodeGen/embed-bitcode-marker-with-nonzero-as.c +++ b/clang/test/CodeGen/embed-bitcode-marker-with-nonzero-as.c @@ -3,6 +3,6 @@ // CHECK: @llvm.embedded.module = private addrspace(1) constant [0 x i8] zeroinitializer, section ".llvmbc", align 1 // CHECK-NEXT: @llvm.cmdline = private addrspace(1) constant [{{[0-9]+}} x i8] c"{{.*}}", section ".llvmcmd", align 1 -// CHECK-NEXT: @llvm.compiler.used = appending addrspace(1) global [5 x ptr] [ptr addrspacecast (ptr addrspace(1) @foo.managed to ptr), ptr addrspacecast (ptr addrspace(1) @foo to ptr), ptr addrspacecast (ptr addrspace(1) @__hip_cuid_ to ptr), ptr addrspacecast (ptr addrspace(1) @llvm.embedded.module to ptr), ptr addrspacecast (ptr addrspace(1) @llvm.cmdline to ptr)], section "llvm.metadata" +// CHECK-NEXT: @llvm.compiler.used = appending addrspace(1) global [5 x ptr addrspace(4)] [ptr addrspace(4) addrspacecast (ptr addrspace(1) @foo.managed to ptr addrspace(4)), ptr addrspace(4) addrspacecast (ptr addrspace(1) @foo to ptr addrspace(4)), ptr addrspace(4) addrspacecast (ptr addrspace(1) @__hip_cuid_ to ptr addrspace(4)), ptr addrspace(4) addrspacecast (ptr addrspace(1) @llvm.embedded.module to ptr addrspace(4)), ptr addrspace(4) addrspacecast (ptr addrspace(1) @llvm.cmdline to ptr addrspace(4))], section "llvm.metadata" __attribute__((managed)) int foo = 42; diff --git 
a/clang/test/CodeGen/llvm_compiler_used_elements_are_unqual.c b/clang/test/CodeGen/llvm_compiler_used_elements_are_unqual.c deleted file mode 100644 index b6550fb1e5c77..0000000000000 --- a/clang/test/CodeGen/llvm_compiler_used_elements_are_unqual.c +++ /dev/null @@ -1,64 +0,0 @@ -// RUN: %clang_cc1 -x c -triple x86_64-- -emit-llvm -x c %s -o - \ -// RUN: | FileCheck %s --check-prefix=X86 -// RUN: %clang_cc1 -x c -triple amdgcn-amd-amdhsa -emit-llvm -x c %s -o - \ -// RUN: | FileCheck %s --check-prefix=AMDGCN -// RUN: %clang_cc1 -x c -triple spirv64-- -emit-llvm -x c %s -o - \ -// RUN: | FileCheck %s --check-prefix=SPIRV -// RUN: %clang_cc1 -x c -triple spirv64-amd-amdhsa -emit-llvm -x c %s -o - \ -// RUN: | FileCheck %s --check-prefix=SPIRV_AMD -// RUN: %clang_cc1 -x cl -cl-std=CL1.2 -triple x86_64-- -emit-llvm -x c %s -o - \ -// RUN: | FileCheck %s --check-prefix=X86 -// RUN: %clang_cc1 -x cl -cl-std=CL1.2 -triple amdgcn-amd-amdhsa -emit-llvm -x c %s -o - \ -// RUN: | FileCheck %s --check-prefix=AMDGCN -// RUN: %clang_cc1 -x cl -cl-std=CL1.2 -triple spirv64-- -emit-llvm -x c %s -o - \ -// RUN: | FileCheck %s --check-prefix=SPIRV_CL -// RUN: %clang_cc1 -x cl -cl-std=CL1.2 -triple spirv64-amd-amdhsa -emit-llvm -x c %s -o - \ -// RUN: | FileCheck %s --check-prefix=SPIRV_AMD_CL -// RUN: %clang_cc1 -x cl -cl-std=CL2.0 -triple x86_64-- -emit-llvm -x c %s -o - \ -// RUN: | FileCheck %s --check-prefix=X86 -// RUN: %clang_cc1 -x cl -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -emit-llvm -x c %s -o - \ -// RUN: | FileCheck %s --check-prefix=AMDGCN -// RUN: %clang_cc1 -x cl -cl-std=CL2.0 -triple spirv64-- -emit-llvm -x c %s -o - \ -// RUN: | FileCheck %s --check-prefix=SPIRV_CL -// RUN: %clang_cc1 -x cl -cl-std=CL2.0 -triple spirv64-amd-amdhsa -emit-llvm -x c %s -o - \ -// RUN: | FileCheck %s --check-prefix=SPIRV_AMD_CL - -#ifndef __OPENCL_C_VERSION__ -#define __constant const -#endif - -static __constant __attribute__((__used__)) int foo = 42; - - -// X86: @foo = internal constant i32 42 -// X86: @llvm.compiler.used = appending global [2 x ptr] [ptr @foo, ptr @bar], section "llvm.metadata" -// -// AMDGCN: @foo = internal addrspace(4) constant i32 42 -// AMDGCN: @llvm.compiler.used = appending addrspace(1) global [2 x ptr] [ptr addrspacecast (ptr addrspace(4) @foo to ptr), ptr @bar], section "llvm.metadata" -// -// SPIRV: @foo = internal constant i32 42 -// SPIRV: @llvm.used = appending addrspace(1) global [2 x ptr] [ptr @foo, ptr @bar], section "llvm.metadata" -// -// SPIRV_CL: @foo = internal addrspace(2) constant i32 42 -// SPIRV_CL: @llvm.used = appending addrspace(1) global [2 x ptr] [ptr addrspacecast (ptr addrspace(2) @foo to ptr), ptr @bar], section "llvm.metadata" -// -// SPIRV_AMD: @foo = internal addrspace(1) constant i32 42 -// SPIRV_AMD: @llvm.used = appending addrspace(1) global [2 x ptr] [ptr addrspacecast (ptr addrspace(1) @foo to ptr), ptr addrspacecast (ptr addrspace(4) @bar to ptr)], section "llvm.metadata" -// -// SPIRV_AMD_CL: @foo = internal addrspace(2) constant i32 42 -// SPIRV_AMD_CL: @llvm.used = appending addrspace(1) global [2 x ptr] [ptr addrspacecast (ptr addrspace(2) @foo to ptr), ptr addrspacecast (ptr addrspace(4) @bar to ptr)], section "llvm.metadata" -// -// X86: define internal void @bar() #{{[0-9]}} { -// -// AMDGCN: define internal void @bar() #{{[0-9]}} { -// -// SPIRV: define internal spir_func void @bar() #{{[0-9]}} { -// -// SPIRV_CL: define internal spir_func void @bar() #{{[0-9]}} { -// -// SPIRV_AMD: define internal spir_func void @bar() addrspace(4) 
#{{[0-9]}} { -// -// SPIRV_AMD_CL: define internal spir_func void @bar() addrspace(4) #{{[0-9]}} { -// -static void __attribute__((__used__)) bar() { -} From ed53c413ef51abca52a258c7ddc271e81ec22be0 Mon Sep 17 00:00:00 2001 From: Mircea Trofin <mtrofin@google.com> Date: Mon, 3 Nov 2025 14:12:58 -0800 Subject: [PATCH 087/313] [profcheck] Exclude more tools tests (#166239) Excluding test areas that (1) don't really pertain to the profcheck effort, and (2) are easier to maintain this way. --- llvm/test/lit.cfg.py | 5 +++++ llvm/utils/profcheck-xfail.txt | 26 -------------------------- 2 files changed, 5 insertions(+), 26 deletions(-) diff --git a/llvm/test/lit.cfg.py b/llvm/test/lit.cfg.py index 974af4b571503..cadf781b409be 100644 --- a/llvm/test/lit.cfg.py +++ b/llvm/test/lit.cfg.py @@ -57,8 +57,13 @@ # so we just exclude llvm-reduce tests from this config altogether. This should # be fine though as profcheck config tests are mostly concerned with opt. config.excludes.append("llvm-reduce") + # Exclude llvm-objcopy tests - not the target of this effort, and some use + # cat in ways that conflict with how profcheck uses it. + config.excludes.append("llvm-objcopy") # (Issue #161235) Temporarily exclude LoopVectorize. config.excludes.append("LoopVectorize") + # exclude UpdateTestChecks - they fail because of inserted prof annotations + config.excludes.append("UpdateTestChecks") # test_source_root: The root path where tests are located. config.test_source_root = os.path.dirname(__file__) diff --git a/llvm/utils/profcheck-xfail.txt b/llvm/utils/profcheck-xfail.txt index 661c88125c9c8..61bc936cd151a 100644 --- a/llvm/utils/profcheck-xfail.txt +++ b/llvm/utils/profcheck-xfail.txt @@ -530,32 +530,6 @@ Instrumentation/TypeSanitizer/swifterror.ll LTO/X86/diagnostic-handler-remarks-with-hotness.ll Other/optimization-remarks-auto.ll Other/X86/debugcounter-partiallyinlinelibcalls.ll -tools/llvm-objcopy/ELF/auto-remove-add-symtab-shndx.test -tools/UpdateTestChecks/update_analyze_test_checks/loop-access-analysis.test -tools/UpdateTestChecks/update_analyze_test_checks/loop-distribute.test -tools/UpdateTestChecks/update_test_checks/argument_name_reuse.test -tools/UpdateTestChecks/update_test_checks/basic.test -tools/UpdateTestChecks/update_test_checks/check_attrs.test -tools/UpdateTestChecks/update_test_checks/difile_absolute_filenames.test -tools/UpdateTestChecks/update_test_checks/filter_out_after.test -tools/UpdateTestChecks/update_test_checks/generated_funcs_prefix_reuse.test -tools/UpdateTestChecks/update_test_checks/generated_funcs.test -tools/UpdateTestChecks/update_test_checks/global_preserve_name.test -tools/UpdateTestChecks/update_test_checks/if_target.test -tools/UpdateTestChecks/update_test_checks/named_function_arguments_split.test -tools/UpdateTestChecks/update_test_checks/on_the_fly_arg_change.test -tools/UpdateTestChecks/update_test_checks/phi-labels.test -tools/UpdateTestChecks/update_test_checks/pre-process.test -tools/UpdateTestChecks/update_test_checks/stable_ir_values2.test -tools/UpdateTestChecks/update_test_checks/stable_ir_values3.test -tools/UpdateTestChecks/update_test_checks/stable_ir_values4.test -tools/UpdateTestChecks/update_test_checks/stable_ir_values5.test -tools/UpdateTestChecks/update_test_checks/stable_ir_values6.test -tools/UpdateTestChecks/update_test_checks/stable_ir_values_funcs.test -tools/UpdateTestChecks/update_test_checks/stable_ir_values.test -tools/UpdateTestChecks/update_test_checks/switch_case.test 
-tools/UpdateTestChecks/update_test_checks/tbaa-semantics-checks.test -tools/UpdateTestChecks/update_test_checks/various_ir_values_dbgrecords.test Transforms/AtomicExpand/AArch64/atomicrmw-fp.ll Transforms/AtomicExpand/AArch64/expand-atomicrmw-xchg-fp.ll Transforms/AtomicExpand/AArch64/pcsections.ll From a522ae3ef6e13cb39e7756c151652e03a024b301 Mon Sep 17 00:00:00 2001 From: Matt Arsenault <Matthew.Arsenault@amd.com> Date: Mon, 3 Nov 2025 14:17:39 -0800 Subject: [PATCH 088/313] ARM: Remove unnecessary manual ABI lowering for sincos_stret (#166040) LowerCallTo handles all of the ABI details, including the load of implicit sret return to the expected result positions. --- llvm/lib/Target/ARM/ARMISelLowering.cpp | 39 ++----------------------- 1 file changed, 2 insertions(+), 37 deletions(-) diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index 6b0653457cbaf..3a00267395504 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -9869,32 +9869,12 @@ SDValue ARMTargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const { assert(Subtarget->isTargetDarwin()); Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); - auto PtrVT = getPointerTy(DAG.getDataLayout()); - - MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); // Pair of floats / doubles used to pass the result. Type *RetTy = StructType::get(ArgTy, ArgTy); auto &DL = DAG.getDataLayout(); ArgListTy Args; - bool ShouldUseSRet = getTM().isAPCS_ABI(); - SDValue SRet; - if (ShouldUseSRet) { - // Create stack object for sret. - const uint64_t ByteSize = DL.getTypeAllocSize(RetTy); - const Align StackAlign = DL.getPrefTypeAlign(RetTy); - int FrameIdx = MFI.CreateStackObject(ByteSize, StackAlign, false); - SRet = DAG.getFrameIndex(FrameIdx, getPointerTy(DL)); - - ArgListEntry Entry(SRet, PointerType::getUnqual(RetTy->getContext())); - Entry.IsSExt = false; - Entry.IsZExt = false; - Entry.IsSRet = true; - Args.push_back(Entry); - RetTy = Type::getVoidTy(*DAG.getContext()); - } - Args.emplace_back(Arg, ArgTy); StringRef LibcallName = getLibcallImplName(SincosStret); @@ -9904,25 +9884,10 @@ SDValue ARMTargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const { TargetLowering::CallLoweringInfo CLI(DAG); CLI.setDebugLoc(dl) .setChain(DAG.getEntryNode()) - .setCallee(CC, RetTy, Callee, std::move(Args)) - .setDiscardResult(ShouldUseSRet); + .setCallee(CC, RetTy, Callee, std::move(Args)); std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI); - if (!ShouldUseSRet) - return CallResult.first; - - SDValue LoadSin = - DAG.getLoad(ArgVT, dl, CallResult.second, SRet, MachinePointerInfo()); - - // Address of cos field. 
- SDValue Add = DAG.getNode(ISD::ADD, dl, PtrVT, SRet, - DAG.getIntPtrConstant(ArgVT.getStoreSize(), dl)); - SDValue LoadCos = - DAG.getLoad(ArgVT, dl, LoadSin.getValue(1), Add, MachinePointerInfo()); - - SDVTList Tys = DAG.getVTList(ArgVT, ArgVT); - return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, - LoadSin.getValue(0), LoadCos.getValue(0)); + return CallResult.first; } SDValue ARMTargetLowering::LowerWindowsDIVLibCall(SDValue Op, SelectionDAG &DAG, From 3c2c9d5bc1e2c8a1648f77d4a4a574c56c392442 Mon Sep 17 00:00:00 2001 From: Matt Arsenault <Matthew.Arsenault@amd.com> Date: Mon, 3 Nov 2025 14:18:04 -0800 Subject: [PATCH 089/313] DAG: Cleanup string bool attribute check for disable-tail-calls (#166237) --- llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index a52265055c88a..fa0c899dfcc27 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -8958,9 +8958,8 @@ bool SelectionDAGBuilder::canTailCall(const CallBase &CB) const { // Avoid emitting tail calls in functions with the disable-tail-calls // attribute. const Function *Caller = CB.getParent()->getParent(); - if (Caller->getFnAttribute("disable-tail-calls").getValueAsString() == - "true" && - !isMustTailCall) + if (!isMustTailCall && + Caller->getFnAttribute("disable-tail-calls").getValueAsBool()) return false; // We can't tail call inside a function with a swifterror argument. Lowering From ecaaebf8f0f70a0b6e21c19fdd4be23ffe23ef53 Mon Sep 17 00:00:00 2001 From: Matt Arsenault <Matthew.Arsenault@amd.com> Date: Mon, 3 Nov 2025 14:19:58 -0800 Subject: [PATCH 090/313] X86: Correct IR type used for sincos_stret return value (#166240) Match the IR type that clang uses here: https://godbolt.org/z/KzbodEcxh This was manually selecting the IR legal type. Instead just set the flag to ensure legal types. --- llvm/lib/Target/X86/X86ISelLowering.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 2970cf42df731..b97b5089cb0a3 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -33034,12 +33034,13 @@ static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget, DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout())); Type *RetTy = isF64 ? (Type *)StructType::get(ArgTy, ArgTy) - : (Type *)FixedVectorType::get(ArgTy, 4); + : (Type *)FixedVectorType::get(ArgTy, 2); TargetLowering::CallLoweringInfo CLI(DAG); CLI.setDebugLoc(dl) .setChain(DAG.getEntryNode()) - .setLibCallee(CallingConv::C, RetTy, Callee, std::move(Args)); + .setLibCallee(CallingConv::C, RetTy, Callee, std::move(Args)) + .setIsPostTypeLegalization(); std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI); From 5d9d89092e0fd34f3ea2ca15c9f30d4c01bb29f3 Mon Sep 17 00:00:00 2001 From: Stefan Mada <smada@nvidia.com> Date: Mon, 3 Nov 2025 14:22:25 -0800 Subject: [PATCH 091/313] [NVPTX] Add more clear error message for using invalid syncscope (#165737) Using invalid syncscopes on certain NVVM intrinsics causes an obscure error to appear: (error 9: NVVM_ERROR_COMPILATION), libNVVM extra log: Could not find scope ID=5. This is not a very helpful error. 
A much more useful error would be something like 'NVPTX does not support syncscope "agent"'. This would immediately make it clear that the issue is not NVPTX-specific, but actually comes from the code being fed to NVPTX. This would save users time when debugging such issues. --- llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp | 22 ++++++++++++++----- llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h | 1 + .../cmpxchg-unsupported-syncscope.err.ll | 11 ++++++++++ 3 files changed, 28 insertions(+), 6 deletions(-) create mode 100644 llvm/test/CodeGen/NVPTX/cmpxchg-unsupported-syncscope.err.ll diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp index c667a09f95dbb..996d653940118 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp @@ -1836,7 +1836,7 @@ bool NVPTXDAGToDAGISel::tryFence(SDNode *N) { return true; } -NVPTXScopes::NVPTXScopes(LLVMContext &C) { +NVPTXScopes::NVPTXScopes(LLVMContext &C) : Context(&C) { Scopes[C.getOrInsertSyncScopeID("singlethread")] = NVPTX::Scope::Thread; Scopes[C.getOrInsertSyncScopeID("")] = NVPTX::Scope::System; Scopes[C.getOrInsertSyncScopeID("block")] = NVPTX::Scope::Block; @@ -1851,11 +1851,21 @@ NVPTX::Scope NVPTXScopes::operator[](SyncScope::ID ID) const { auto S = Scopes.find(ID); if (S == Scopes.end()) { - // TODO: - // - Add API to LLVMContext to get the name of a single scope. - // - Use that API here to print an error containing the name - // of this Unknown ID. - report_fatal_error(formatv("Could not find scope ID={}.", int(ID))); + auto scopeName = Context->getSyncScopeName(ID); + assert(scopeName.has_value() && "Scope name must exist."); + + // Build list of supported syncscopes programmatically + SmallVector<StringRef> supportedScopes; + for (const auto &Entry : Scopes) { + if (auto name = Context->getSyncScopeName(Entry.first)) + supportedScopes.push_back(name->empty() ? "<empty string>" : *name); + } + + reportFatalUsageError( + formatv("NVPTX backend does not support syncscope \"{0}\" (ID={1}).\n" + "Supported syncscopes are: {2}.", + scopeName.value(), int(ID), + make_range(supportedScopes.begin(), supportedScopes.end()))); } return S->second; } diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h index 1cb579bd96730..d525531766ddf 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h +++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h @@ -35,6 +35,7 @@ struct NVPTXScopes { private: SmallMapVector<SyncScope::ID, NVPTX::Scope, 8> Scopes{}; + LLVMContext *Context = nullptr; }; class LLVM_LIBRARY_VISIBILITY NVPTXDAGToDAGISel : public SelectionDAGISel { diff --git a/llvm/test/CodeGen/NVPTX/cmpxchg-unsupported-syncscope.err.ll b/llvm/test/CodeGen/NVPTX/cmpxchg-unsupported-syncscope.err.ll new file mode 100644 index 0000000000000..d3853e2fdaa88 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/cmpxchg-unsupported-syncscope.err.ll @@ -0,0 +1,11 @@ +; RUN: not llc -mcpu=sm_100a -mtriple=nvptx64 -mattr=+ptx86 %s 2>&1 | FileCheck %s + +; Test that we get a clear error message when using an unsupported syncscope.
+ +; CHECK: NVPTX backend does not support syncscope "agent" +; CHECK: Supported syncscopes are: singlethread, <empty string>, block, cluster, device +define i32 @cmpxchg_unsupported_syncscope_agent(ptr %addr, i32 %cmp, i32 %new) { + %result = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("agent") monotonic monotonic + %value = extractvalue { i32, i1 } %result, 0 + ret i32 %value +} From 0e3612e751381d35a3826a67527845e23bbd22af Mon Sep 17 00:00:00 2001 From: Christopher Ferris <cferris1000@users.noreply.github.com> Date: Mon, 3 Nov 2025 14:31:11 -0800 Subject: [PATCH 092/313] [scudo] Add config option to modify get usable size behavior (#158710) Currently, Scudo always returns the exact size allocated when calling getUsableSize. This can be a performance issue: some programs get the usable size and make unnecessary calls to realloc because they think there isn't enough space in the allocation. By default, usable size will still return the exact size of the allocation. Note that if the exact behavior is disabled and MTE is on, then the code will still give an exact usable size. --- .../lib/scudo/standalone/allocator_config.def | 4 + compiler-rt/lib/scudo/standalone/combined.h | 88 +++++-- .../scudo/standalone/tests/combined_test.cpp | 242 ++++++++++++++++++ .../standalone/tests/wrappers_c_test.cpp | 9 +- 4 files changed, 324 insertions(+), 19 deletions(-) diff --git a/compiler-rt/lib/scudo/standalone/allocator_config.def b/compiler-rt/lib/scudo/standalone/allocator_config.def index 748530820cd64..0aea7b8f2fb9a 100644 --- a/compiler-rt/lib/scudo/standalone/allocator_config.def +++ b/compiler-rt/lib/scudo/standalone/allocator_config.def @@ -57,6 +57,10 @@ BASE_OPTIONAL(const bool, MaySupportMemoryTagging, false) // Disable the quarantine code. BASE_OPTIONAL(const bool, QuarantineDisabled, false) +// If set to true, malloc_usable_size returns the exact size of the allocation. +// If set to false, return the total available size in the allocation. +BASE_OPTIONAL(const bool, ExactUsableSize, true) + // PRIMARY_REQUIRED_TYPE(NAME) // // SizeClassMap to use with the Primary.
diff --git a/compiler-rt/lib/scudo/standalone/combined.h b/compiler-rt/lib/scudo/standalone/combined.h index 329ec4596482b..ffe9554203241 100644 --- a/compiler-rt/lib/scudo/standalone/combined.h +++ b/compiler-rt/lib/scudo/standalone/combined.h @@ -706,19 +706,26 @@ class Allocator { if (!getChunkFromBlock(Block, &Chunk, &Header) && !getChunkFromBlock(addHeaderTag(Block), &Chunk, &Header)) return; - } else { - if (!getChunkFromBlock(addHeaderTag(Block), &Chunk, &Header)) - return; + } else if (!getChunkFromBlock(addHeaderTag(Block), &Chunk, &Header)) { + return; } - if (Header.State == Chunk::State::Allocated) { - uptr TaggedChunk = Chunk; - if (allocatorSupportsMemoryTagging<AllocatorConfig>()) - TaggedChunk = untagPointer(TaggedChunk); - if (useMemoryTagging<AllocatorConfig>(Primary.Options.load())) - TaggedChunk = loadTag(Chunk); - Callback(TaggedChunk, getSize(reinterpret_cast<void *>(Chunk), &Header), - Arg); + + if (Header.State != Chunk::State::Allocated) + return; + + uptr TaggedChunk = Chunk; + if (allocatorSupportsMemoryTagging<AllocatorConfig>()) + TaggedChunk = untagPointer(TaggedChunk); + uptr Size; + if (UNLIKELY(useMemoryTagging<AllocatorConfig>(Primary.Options.load()))) { + TaggedChunk = loadTag(Chunk); + Size = getSize(reinterpret_cast<void *>(Chunk), &Header); + } else if (AllocatorConfig::getExactUsableSize()) { + Size = getSize(reinterpret_cast<void *>(Chunk), &Header); + } else { + Size = getUsableSize(reinterpret_cast<void *>(Chunk), &Header); } + Callback(TaggedChunk, Size, Arg); }; Primary.iterateOverBlocks(Lambda); Secondary.iterateOverBlocks(Lambda); @@ -759,16 +766,50 @@ class Allocator { return false; } - // Return the usable size for a given chunk. Technically we lie, as we just - // report the actual size of a chunk. This is done to counteract code actively - // writing past the end of a chunk (like sqlite3) when the usable size allows - // for it, which then forces realloc to copy the usable size of a chunk as - // opposed to its actual size. + ALWAYS_INLINE uptr getUsableSize(const void *Ptr, + Chunk::UnpackedHeader *Header) { + void *BlockBegin = getBlockBegin(Ptr, Header); + if (LIKELY(Header->ClassId)) { + return SizeClassMap::getSizeByClassId(Header->ClassId) - + (reinterpret_cast<uptr>(Ptr) - reinterpret_cast<uptr>(BlockBegin)); + } + + uptr UntaggedPtr = reinterpret_cast<uptr>(Ptr); + if (allocatorSupportsMemoryTagging<AllocatorConfig>()) { + UntaggedPtr = untagPointer(UntaggedPtr); + BlockBegin = untagPointer(BlockBegin); + } + return SecondaryT::getBlockEnd(BlockBegin) - UntaggedPtr; + } + + // Return the usable size for a given chunk. If MTE is enabled or if the + // ExactUsableSize config parameter is true, we report the exact size of + // the original allocation size. Otherwise, we will return the total + // actual usable size. uptr getUsableSize(const void *Ptr) { if (UNLIKELY(!Ptr)) return 0; - return getAllocSize(Ptr); + if (AllocatorConfig::getExactUsableSize() || + UNLIKELY(useMemoryTagging<AllocatorConfig>(Primary.Options.load()))) + return getAllocSize(Ptr); + + initThreadMaybe(); + +#ifdef GWP_ASAN_HOOKS + if (UNLIKELY(GuardedAlloc.pointerIsMine(Ptr))) + return GuardedAlloc.getSize(Ptr); +#endif // GWP_ASAN_HOOKS + + Ptr = getHeaderTaggedPointer(const_cast<void *>(Ptr)); + Chunk::UnpackedHeader Header; + Chunk::loadHeader(Cookie, Ptr, &Header); + + // Getting the alloc size of a chunk only makes sense if it's allocated. 
+ if (UNLIKELY(Header.State != Chunk::State::Allocated)) + reportInvalidChunkState(AllocatorAction::Sizing, Ptr); + + return getUsableSize(Ptr, &Header); } uptr getAllocSize(const void *Ptr) { @@ -951,6 +992,19 @@ class Allocator { MemorySize, 2, 16); } + uptr getBlockBeginTestOnly(const void *Ptr) { + Chunk::UnpackedHeader Header; + Chunk::loadHeader(Cookie, Ptr, &Header); + DCHECK(Header.State == Chunk::State::Allocated); + + if (allocatorSupportsMemoryTagging<AllocatorConfig>()) + Ptr = untagPointer(const_cast<void *>(Ptr)); + void *Begin = getBlockBegin(Ptr, &Header); + if (allocatorSupportsMemoryTagging<AllocatorConfig>()) + Begin = untagPointer(Begin); + return reinterpret_cast<uptr>(Begin); + } + private: typedef typename PrimaryT::SizeClassMap SizeClassMap; diff --git a/compiler-rt/lib/scudo/standalone/tests/combined_test.cpp b/compiler-rt/lib/scudo/standalone/tests/combined_test.cpp index 5fdfd1e7c55cc..4837ac96b9b26 100644 --- a/compiler-rt/lib/scudo/standalone/tests/combined_test.cpp +++ b/compiler-rt/lib/scudo/standalone/tests/combined_test.cpp @@ -1152,6 +1152,248 @@ TEST(ScudoCombinedTest, QuarantineDisabled) { EXPECT_EQ(Stats.find("Stats: Quarantine"), std::string::npos); } +struct UsableSizeClassConfig { + static const scudo::uptr NumBits = 1; + static const scudo::uptr MinSizeLog = 10; + static const scudo::uptr MidSizeLog = 10; + static const scudo::uptr MaxSizeLog = 13; + static const scudo::u16 MaxNumCachedHint = 8; + static const scudo::uptr MaxBytesCachedLog = 12; + static const scudo::uptr SizeDelta = 0; +}; + +struct TestExactUsableSizeConfig { + static const bool MaySupportMemoryTagging = false; + static const bool QuarantineDisabled = true; + + template <class A> using TSDRegistryT = scudo::TSDRegistrySharedT<A, 1U, 1U>; + + struct Primary { + // In order to properly test the usable size, this Primary config has + // four real size classes: 1024, 2048, 4096, 8192. + using SizeClassMap = scudo::FixedSizeClassMap<UsableSizeClassConfig>; + static const scudo::uptr RegionSizeLog = 21U; + static const scudo::s32 MinReleaseToOsIntervalMs = INT32_MIN; + static const scudo::s32 MaxReleaseToOsIntervalMs = INT32_MAX; + typedef scudo::uptr CompactPtrT; + static const scudo::uptr CompactPtrScale = 0; + static const bool EnableRandomOffset = true; + static const scudo::uptr MapSizeIncrement = 1UL << 18; + static const scudo::uptr GroupSizeLog = 18; + }; + template <typename Config> + using PrimaryT = scudo::SizeClassAllocator64<Config>; + + struct Secondary { + template <typename Config> + using CacheT = scudo::MapAllocatorNoCache<Config>; + }; + + template <typename Config> using SecondaryT = scudo::MapAllocator<Config>; +}; + +template <class AllocatorT> void VerifyExactUsableSize(AllocatorT &Allocator) { + // Scan through all sizes up to 10000 then some larger sizes. + for (scudo::uptr Size = 1; Size < 10000; Size++) { + void *P = Allocator.allocate(Size, Origin); + EXPECT_EQ(Size, Allocator.getUsableSize(P)) + << "Failed usable size at allocation size " << Size; + Allocator.deallocate(P, Origin); + } + + // Verify that aligned allocations also return the exact size allocated. + const scudo::uptr AllocSize = 313; + for (scudo::uptr Align = 1; Align <= 8; Align++) { + void *P = Allocator.allocate(AllocSize, Origin, 1U << Align); + EXPECT_EQ(AllocSize, Allocator.getUsableSize(P)) + << "Failed usable size at allocation size " << AllocSize << " at align " + << 1 << Align; + Allocator.deallocate(P, Origin); + } + + // Verify an explicitly large allocations. 
+ const scudo::uptr LargeAllocSize = 1000000; + void *P = Allocator.allocate(LargeAllocSize, Origin); + EXPECT_EQ(LargeAllocSize, Allocator.getUsableSize(P)); + Allocator.deallocate(P, Origin); + + // Now do it for aligned allocations for large allocations. + for (scudo::uptr Align = 1; Align <= 8; Align++) { + void *P = Allocator.allocate(LargeAllocSize, Origin, 1U << Align); + EXPECT_EQ(LargeAllocSize, Allocator.getUsableSize(P)) + << "Failed usable size at allocation size " << AllocSize << " at align " + << 1 << Align; + Allocator.deallocate(P, Origin); + } +} + +template <class AllocatorT> +void VerifyIterateOverUsableSize(AllocatorT &Allocator) { + // This will not verify if the size is the exact size or the size of the + // size class. Instead verify that the size matches the usable size and + // assume the other tests have verified getUsableSize. + std::unordered_map<void *, size_t> Pointers; + Pointers.insert({Allocator.allocate(128, Origin), 0U}); + Pointers.insert({Allocator.allocate(128, Origin, 32), 0U}); + Pointers.insert({Allocator.allocate(2000, Origin), 0U}); + Pointers.insert({Allocator.allocate(2000, Origin, 64), 0U}); + Pointers.insert({Allocator.allocate(8000, Origin), 0U}); + Pointers.insert({Allocator.allocate(8000, Origin, 128), 0U}); + Pointers.insert({Allocator.allocate(2000205, Origin), 0U}); + Pointers.insert({Allocator.allocate(2000205, Origin, 128), 0U}); + Pointers.insert({Allocator.allocate(2000205, Origin, 256), 0U}); + + Allocator.disable(); + Allocator.iterateOverChunks( + 0, static_cast<scudo::uptr>(SCUDO_MMAP_RANGE_SIZE - 1), + [](uintptr_t Base, size_t Size, void *Arg) { + std::unordered_map<void *, size_t> *Pointers = + reinterpret_cast<std::unordered_map<void *, size_t> *>(Arg); + (*Pointers)[reinterpret_cast<void *>(Base)] = Size; + }, + reinterpret_cast<void *>(&Pointers)); + Allocator.enable(); + + for (auto [Ptr, IterateSize] : Pointers) { + EXPECT_NE(0U, IterateSize) + << "Pointer " << Ptr << " not found in iterateOverChunks call."; + EXPECT_EQ(IterateSize, Allocator.getUsableSize(Ptr)) + << "Pointer " << Ptr + << " mismatch between iterate size and usable size."; + Allocator.deallocate(Ptr, Origin); + } +} + +TEST(ScudoCombinedTest, ExactUsableSize) { + using AllocatorT = scudo::Allocator<TestExactUsableSizeConfig>; + auto Allocator = std::unique_ptr<AllocatorT>(new AllocatorT()); + + VerifyExactUsableSize<AllocatorT>(*Allocator); + VerifyIterateOverUsableSize<AllocatorT>(*Allocator); +} + +struct TestExactUsableSizeMTEConfig : TestExactUsableSizeConfig { + static const bool MaySupportMemoryTagging = true; +}; + +TEST(ScudoCombinedTest, ExactUsableSizeMTE) { + if (!scudo::archSupportsMemoryTagging() || + !scudo::systemDetectsMemoryTagFaultsTestOnly()) + TEST_SKIP("Only supported on systems that can enable MTE."); + + scudo::enableSystemMemoryTaggingTestOnly(); + + using AllocatorT = scudo::Allocator<TestExactUsableSizeMTEConfig>; + auto Allocator = std::unique_ptr<AllocatorT>(new AllocatorT()); + + VerifyExactUsableSize<AllocatorT>(*Allocator); + VerifyIterateOverUsableSize<AllocatorT>(*Allocator); +} + +template <class AllocatorT> +void VerifyUsableSizePrimary(AllocatorT &Allocator) { + std::vector<scudo::uptr> SizeClasses = {1024U, 2048U, 4096U, 8192U}; + for (size_t I = 0; I < SizeClasses.size(); I++) { + scudo::uptr SizeClass = SizeClasses[I]; + scudo::uptr StartSize; + if (I == 0) + StartSize = 1; + else + StartSize = SizeClasses[I - 1]; + scudo::uptr UsableSize = SizeClass - scudo::Chunk::getHeaderSize(); + for (scudo::uptr Size = 
StartSize; Size < UsableSize; Size++) { + void *P = Allocator.allocate(Size, Origin); + EXPECT_EQ(UsableSize, Allocator.getUsableSize(P)) + << "Failed usable size at allocation size " << Size + << " for size class " << SizeClass; + memset(P, 0xff, UsableSize); + EXPECT_EQ(Allocator.getBlockBeginTestOnly(P) + SizeClass, + reinterpret_cast<scudo::uptr>(P) + UsableSize); + Allocator.deallocate(P, Origin); + } + + StartSize = UsableSize + 1; + } + + std::vector<scudo::uptr> Alignments = {32U, 128U}; + for (size_t I = 0; I < SizeClasses.size(); I++) { + scudo::uptr SizeClass = SizeClasses[I]; + scudo::uptr AllocSize; + if (I == 0) + AllocSize = 1; + else + AllocSize = SizeClasses[I - 1] + 1; + + for (auto Alignment : Alignments) { + void *P = Allocator.allocate(AllocSize, Origin, Alignment); + scudo::uptr UsableSize = Allocator.getUsableSize(P); + memset(P, 0xff, UsableSize); + EXPECT_EQ(Allocator.getBlockBeginTestOnly(P) + SizeClass, + reinterpret_cast<scudo::uptr>(P) + UsableSize) + << "Failed usable size at allocation size " << AllocSize + << " for size class " << SizeClass << " at alignment " << Alignment; + Allocator.deallocate(P, Origin); + } + } +} + +template <class AllocatorT> +void VerifyUsableSizeSecondary(AllocatorT &Allocator) { + const scudo::uptr LargeAllocSize = 996780; + const scudo::uptr PageSize = scudo::getPageSizeCached(); + void *P = Allocator.allocate(LargeAllocSize, Origin); + scudo::uptr UsableSize = Allocator.getUsableSize(P); + memset(P, 0xff, UsableSize); + // Assumes that the secondary always rounds up allocations to a page boundary. + EXPECT_EQ(scudo::roundUp(reinterpret_cast<scudo::uptr>(P) + LargeAllocSize, + PageSize), + reinterpret_cast<scudo::uptr>(P) + UsableSize); + Allocator.deallocate(P, Origin); + + // Check aligned allocations now. + for (scudo::uptr Alignment = 1; Alignment <= 8; Alignment++) { + void *P = Allocator.allocate(LargeAllocSize, Origin, 1U << Alignment); + scudo::uptr UsableSize = Allocator.getUsableSize(P); + EXPECT_EQ(scudo::roundUp(reinterpret_cast<scudo::uptr>(P) + LargeAllocSize, + PageSize), + reinterpret_cast<scudo::uptr>(P) + UsableSize) + << "Failed usable size at allocation size " << LargeAllocSize + << " at alignment " << Alignment; + Allocator.deallocate(P, Origin); + } +} + +struct TestFullUsableSizeConfig : TestExactUsableSizeConfig { + static const bool ExactUsableSize = false; +}; + +TEST(ScudoCombinedTest, FullUsableSize) { + using AllocatorT = scudo::Allocator<TestFullUsableSizeConfig>; + auto Allocator = std::unique_ptr<AllocatorT>(new AllocatorT()); + + VerifyUsableSizePrimary<AllocatorT>(*Allocator); + VerifyUsableSizeSecondary<AllocatorT>(*Allocator); + VerifyIterateOverUsableSize<AllocatorT>(*Allocator); +} + +struct TestFullUsableSizeMTEConfig : TestFullUsableSizeConfig { + static const bool MaySupportMemoryTagging = true; +}; + +TEST(ScudoCombinedTest, FullUsableSizeMTE) { + if (!scudo::archSupportsMemoryTagging() || + !scudo::systemDetectsMemoryTagFaultsTestOnly()) + TEST_SKIP("Only supported on systems that can enable MTE."); + + scudo::enableSystemMemoryTaggingTestOnly(); + + using AllocatorT = scudo::Allocator<TestFullUsableSizeMTEConfig>; + auto Allocator = std::unique_ptr<AllocatorT>(new AllocatorT()); + + // When MTE is enabled, you get exact sizes. + VerifyExactUsableSize<AllocatorT>(*Allocator); + VerifyIterateOverUsableSize<AllocatorT>(*Allocator); +} // Verify that no special quarantine blocks appear in iterateOverChunks. 
TEST(ScudoCombinedTest, QuarantineIterateOverChunks) { using AllocatorT = TestAllocator<TestQuarantineConfig>; diff --git a/compiler-rt/lib/scudo/standalone/tests/wrappers_c_test.cpp b/compiler-rt/lib/scudo/standalone/tests/wrappers_c_test.cpp index 612317b3c3293..9e5d0658e5ed5 100644 --- a/compiler-rt/lib/scudo/standalone/tests/wrappers_c_test.cpp +++ b/compiler-rt/lib/scudo/standalone/tests/wrappers_c_test.cpp @@ -588,8 +588,13 @@ TEST_F(ScudoWrappersCTest, MallocInfo) { EXPECT_EQ(errno, 0); fclose(F); EXPECT_EQ(strncmp(Buffer, "<malloc version=\"scudo-", 23), 0); - EXPECT_NE(nullptr, strstr(Buffer, "<alloc size=\"1234\" count=\"")); - EXPECT_NE(nullptr, strstr(Buffer, "<alloc size=\"4321\" count=\"")); + std::string expected; + expected = + "<alloc size=\"" + std::to_string(malloc_usable_size(P1)) + "\" count=\""; + EXPECT_NE(nullptr, strstr(Buffer, expected.c_str())); + expected = + "<alloc size=\"" + std::to_string(malloc_usable_size(P2)) + "\" count=\""; + EXPECT_NE(nullptr, strstr(Buffer, expected.c_str())); free(P1); free(P2); From e987ab11a6f3d3965ef26fc42c82db3e8b1d56f5 Mon Sep 17 00:00:00 2001 From: Michael Kruse <llvm-project@meinersbur.de> Date: Mon, 3 Nov 2025 23:34:50 +0100 Subject: [PATCH 093/313] [Polly] Introduce PhaseManager and remove LPM support (#125442) Instead of relying on any pass manager to schedule Polly's passes, add Polly's own pipeline manager, which is seen as a monolithic pass in LLVM's pass manager. Polly's former passes are now phases of the new PhaseManager component. Relying on LLVM's pass manager (the legacy as well as the New Pass Manager) to manage Polly's phases was never a good fit; the PhaseManager resolves the following issues: * Polly passes were modifying analysis results, in particular RegionInfo and ScopInfo. This means that there was not just one unique and "definite" analysis result; the actual result depended on which analyses ran beforehand, and the pass manager was not allowed to throw away cached analyses or prior SCoP optimizations would have been forgotten. The LLVM pass manager's persistence of analysis results is not contractual but designed for caching. * Polly depends on a particular execution order of passes and regions (e.g. regression tests, invalidation of consecutive SCoPs). LLVM's pass manager does not guarantee any execution order. * Polly does not completely preserve DominatorTree, RegionInfo, LoopInfo, or ScalarEvolution, but only as-needed for Polly's own uses. Because the ScopDetection object stores references to those analyses, it still had to lie to the pass manager that they would be preserved, or the pass manager would have released and recomputed the invalidated analysis objects that ScopDetection/ScopInfo was still referencing. To ensure that no non-Polly pass would see these not-completely-preserved analyses, all analyses still had to be thrown away after the ScopPassManager, or, in the case of the LPM, with a BarrierNoopPass. * The NPM's PassInstrumentation wraps the IR unit into an `llvm::Any` object, but implementations such as PrintIRInstrumentation call llvm_unreachable on encountering an unknown IR unit, such as SCoPs, with no extension points to add support. Hence LLVM crashes when dumping IR between SCoP passes (such as `-print-before-changed` with Polly being active). The new PhaseManager uses some command line options that previously belonged to Polly's legacy passes, such as `-polly-print-detect` (so the option will continue to work).
Hence the LPM support is incompatible with the new approach and support for it is removed. --- polly/docs/ReleaseNotes.rst | 4 + polly/include/polly/Canonicalization.h | 8 - polly/include/polly/CodeGen/CodeGeneration.h | 3 + polly/include/polly/CodeGen/IslAst.h | 36 +- polly/include/polly/CodePreparation.h | 6 + polly/include/polly/DeLICM.h | 11 +- polly/include/polly/DeadCodeElimination.h | 13 +- polly/include/polly/DependenceInfo.h | 106 +---- polly/include/polly/FlattenSchedule.h | 16 +- polly/include/polly/ForwardOpTree.h | 20 +- polly/include/polly/JSONExporter.h | 13 +- polly/include/polly/LinkAllPasses.h | 156 ------- polly/include/polly/MaximalStaticExpansion.h | 2 + polly/include/polly/Pass/PhaseManager.h | 127 ++++++ polly/include/polly/Pass/PollyFunctionPass.h | 32 ++ polly/include/polly/Pass/PollyModulePass.h | 30 ++ polly/include/polly/PruneUnprofitable.h | 12 +- polly/include/polly/RegisterPasses.h | 2 - polly/include/polly/ScheduleOptimizer.h | 16 +- polly/include/polly/ScopDetection.h | 27 -- polly/include/polly/ScopGraphPrinter.h | 3 + polly/include/polly/ScopInfo.h | 76 ---- polly/include/polly/ScopInliner.h | 6 - polly/include/polly/ScopPass.h | 28 -- polly/include/polly/Simplify.h | 23 +- .../include/polly/Support/DumpFunctionPass.h | 12 - polly/include/polly/Support/DumpModulePass.h | 15 +- polly/include/polly/Support/ScopHelper.h | 8 - polly/lib/Analysis/DependenceInfo.cpp | 220 +-------- polly/lib/Analysis/PruneUnprofitable.cpp | 32 +- polly/lib/Analysis/ScopBuilder.cpp | 1 + polly/lib/Analysis/ScopDetection.cpp | 107 ----- polly/lib/Analysis/ScopGraphPrinter.cpp | 120 +---- polly/lib/Analysis/ScopInfo.cpp | 228 ---------- polly/lib/Analysis/ScopPass.cpp | 36 -- polly/lib/CMakeLists.txt | 3 + polly/lib/CodeGen/CodeGeneration.cpp | 72 +-- polly/lib/CodeGen/IslAst.cpp | 101 +---- polly/lib/Exchange/JSONExporter.cpp | 160 +------ polly/lib/Pass/PhaseManager.cpp | 424 ++++++++++++++++++ polly/lib/Pass/PollyFunctionPass.cpp | 22 + polly/lib/Pass/PollyModulePass.cpp | 29 ++ polly/lib/Support/DumpFunctionPass.cpp | 41 -- polly/lib/Support/DumpModulePass.cpp | 47 -- polly/lib/Support/PollyPasses.def | 25 +- polly/lib/Support/RegisterPasses.cpp | 401 +++++++++++------ polly/lib/Support/ScopHelper.cpp | 12 - polly/lib/Transform/Canonicalization.cpp | 65 --- polly/lib/Transform/CodePreparation.cpp | 69 --- polly/lib/Transform/DeLICM.cpp | 125 ++---- polly/lib/Transform/DeadCodeElimination.cpp | 44 +- polly/lib/Transform/FlattenSchedule.cpp | 139 ++---- polly/lib/Transform/ForwardOpTree.cpp | 131 ++---- .../lib/Transform/MaximalStaticExpansion.cpp | 81 +--- polly/lib/Transform/ScheduleOptimizer.cpp | 152 +------ polly/lib/Transform/ScopInliner.cpp | 46 -- polly/lib/Transform/Simplify.cpp | 101 +---- polly/test/CodeGen/20100617.ll | 2 +- polly/test/CodeGen/20100622.ll | 4 +- polly/test/CodeGen/20100707.ll | 2 +- polly/test/CodeGen/20100707_2.ll | 2 +- polly/test/CodeGen/20100708.ll | 2 +- polly/test/CodeGen/20100708_2.ll | 2 +- polly/test/CodeGen/20100713.ll | 2 +- polly/test/CodeGen/20100713_2.ll | 2 +- polly/test/CodeGen/20100717.ll | 2 +- polly/test/CodeGen/20100718-DomInfo-2.ll | 2 +- polly/test/CodeGen/20100718-DomInfo.ll | 2 +- .../CodeGen/20100720-MultipleConditions.ll | 2 +- .../test/CodeGen/20100809-IndependentBlock.ll | 2 +- ...0100811-ScalarDependencyBetweenBrAndCnd.ll | 2 +- polly/test/CodeGen/20101030-Overflow.ll | 2 +- polly/test/CodeGen/20101103-Overflow3.ll | 2 +- polly/test/CodeGen/20101103-signmissmatch.ll | 2 +- .../test/CodeGen/20110226-Ignore-Dead-Code.ll | 2 
+- .../test/CodeGen/20110226-PHI-Node-removed.ll | 2 +- polly/test/CodeGen/20120316-InvalidCast.ll | 2 +- .../CodeGen/20120403-RHS-type-mismatch.ll | 2 +- polly/test/CodeGen/20130221.ll | 2 +- .../20150328-SCEVExpanderIntroducesNewIV.ll | 2 +- polly/test/CodeGen/Intrinsics/llvm-expect.ll | 2 +- .../do_not_mutate_debug_info.ll | 2 +- .../loop_nest_param_parallel.ll | 2 +- .../single_loop_param_parallel.ll | 4 +- polly/test/CodeGen/MemAccess/bad_alignment.ll | 2 +- .../MemAccess/codegen_address_space.ll | 2 +- .../MemAccess/codegen_constant_offset.ll | 2 +- .../test/CodeGen/MemAccess/codegen_simple.ll | 2 +- .../CodeGen/MemAccess/codegen_simple_float.ll | 2 +- .../CodeGen/MemAccess/codegen_simple_md.ll | 4 +- .../MemAccess/codegen_simple_md_float.ll | 4 +- polly/test/CodeGen/MemAccess/create_arrays.ll | 4 +- .../CodeGen/MemAccess/create_arrays_heap.ll | 4 +- .../default_aligned_new_access_function.ll | 2 +- .../test/CodeGen/MemAccess/different_types.ll | 4 +- polly/test/CodeGen/MemAccess/generate-all.ll | 6 +- .../CodeGen/MemAccess/invariant_base_ptr.ll | 4 +- .../CodeGen/MemAccess/map_scalar_access.ll | 4 +- .../test/CodeGen/MemAccess/multiple_types.ll | 4 +- polly/test/CodeGen/MemAccess/simple.ll | 2 +- .../test/CodeGen/MemAccess/simple_analyze.ll | 2 +- .../MemAccess/update_access_functions.ll | 4 +- .../CodeGen/Metadata/basic_vec_annotate.ll | 2 +- polly/test/CodeGen/OpenMP/alias-metadata.ll | 2 +- .../floord-as-argument-to-subfunction.ll | 2 +- polly/test/CodeGen/OpenMP/inlineasm.ll | 2 +- .../invariant_base_pointer_preloaded.ll | 3 +- ...ant_base_pointer_preloaded_different_bb.ll | 3 +- ...base_pointer_preloaded_pass_only_needed.ll | 3 +- .../invariant_base_pointers_preloaded.ll | 3 +- .../OpenMP/loop-body-references-outer-iv.ll | 4 +- .../loop-body-references-outer-values-2.ll | 4 +- .../loop-body-references-outer-values-3.ll | 4 +- .../loop-body-references-outer-values.ll | 4 +- .../OpenMP/loop-bounds-reference-outer-ids.ll | 4 +- .../test/CodeGen/OpenMP/mapped-phi-access.ll | 2 +- polly/test/CodeGen/OpenMP/matmul-parallel.ll | 4 +- .../CodeGen/OpenMP/new_multidim_access.ll | 8 +- polly/test/CodeGen/OpenMP/recomputed-srem.ll | 3 +- ...ference-argument-from-non-affine-region.ll | 19 +- .../test/CodeGen/OpenMP/reference-other-bb.ll | 2 +- .../OpenMP/reference-preceeding-loop.ll | 4 +- polly/test/CodeGen/OpenMP/reference_latest.ll | 2 +- polly/test/CodeGen/OpenMP/scev-rewriting.ll | 2 +- polly/test/CodeGen/OpenMP/single_loop.ll | 18 +- ...single_loop_with_loop_invariant_baseptr.ll | 4 +- .../CodeGen/OpenMP/single_loop_with_param.ll | 16 +- ...o-parallel-loops-reference-outer-indvar.ll | 4 +- polly/test/CodeGen/PHIInExit.ll | 2 +- .../combine_different_values.ll | 4 +- .../RuntimeDebugBuilder/stmt_tracing.ll | 2 +- polly/test/CodeGen/alias-check-multi-dim.ll | 3 +- .../CodeGen/alias_metadata_too_many_arrays.ll | 3 +- ...aliasing_different_base_and_access_type.ll | 2 +- .../aliasing_different_pointer_types.ll | 2 +- .../aliasing_multidimensional_access.ll | 2 +- .../CodeGen/aliasing_parametric_simple_1.ll | 2 +- .../CodeGen/aliasing_parametric_simple_2.ll | 2 +- polly/test/CodeGen/aliasing_struct_element.ll | 2 +- polly/test/CodeGen/alignment.ll | 2 +- polly/test/CodeGen/annotated_alias_scopes.ll | 2 +- polly/test/CodeGen/blas_sscal_simplified.ll | 2 +- ...code-hosting-and-escape-map-computation.ll | 2 +- polly/test/CodeGen/constant_condition.ll | 2 +- polly/test/CodeGen/create-conditional-scop.ll | 2 +- ...d_instruction_referenced_by_parameter_1.ll | 2 +- 
...d_instruction_referenced_by_parameter_2.ll | 2 +- polly/test/CodeGen/debug-intrinsics.ll | 8 +- ...nce_problem_after_early_codegen_bailout.ll | 2 +- polly/test/CodeGen/empty_domain_in_context.ll | 2 +- polly/test/CodeGen/entry_with_trivial_phi.ll | 2 +- .../entry_with_trivial_phi_other_bb.ll | 2 +- .../error-stmt-in-non-affine-region.ll | 2 +- ...or_block_contains_invalid_memory_access.ll | 2 +- polly/test/CodeGen/exprModDiv.ll | 7 +- .../hoisted_load_escapes_through_phi.ll | 6 +- polly/test/CodeGen/hoisting_1.ll | 2 +- polly/test/CodeGen/hoisting_2.ll | 2 +- polly/test/CodeGen/inner_scev_sdiv_1.ll | 2 +- polly/test/CodeGen/inner_scev_sdiv_2.ll | 2 +- polly/test/CodeGen/inner_scev_sdiv_3.ll | 2 +- polly/test/CodeGen/inner_scev_sdiv_in_lb.ll | 4 +- .../inner_scev_sdiv_in_lb_invariant.ll | 3 +- polly/test/CodeGen/inner_scev_sdiv_in_rtc.ll | 3 +- polly/test/CodeGen/intrinsics_lifetime.ll | 2 +- polly/test/CodeGen/intrinsics_misc.ll | 2 +- .../inv-load-lnt-crash-wrong-order-2.ll | 3 +- .../inv-load-lnt-crash-wrong-order-3.ll | 3 +- .../CodeGen/inv-load-lnt-crash-wrong-order.ll | 3 +- .../test/CodeGen/invariant-load-dimension.ll | 4 +- ...-load-preload-base-pointer-origin-first.ll | 2 +- .../CodeGen/invariant_cannot_handle_void.ll | 4 +- polly/test/CodeGen/invariant_load.ll | 2 +- .../CodeGen/invariant_load_address_space.ll | 2 +- .../CodeGen/invariant_load_alias_metadata.ll | 3 +- .../CodeGen/invariant_load_base_pointer.ll | 2 +- ...invariant_load_base_pointer_conditional.ll | 2 +- ...variant_load_base_pointer_conditional_2.ll | 6 +- ...ariant_load_canonicalize_array_baseptrs.ll | 4 +- .../test/CodeGen/invariant_load_condition.ll | 2 +- .../invariant_load_different_sized_types.ll | 3 +- polly/test/CodeGen/invariant_load_escaping.ll | 2 +- .../invariant_load_escaping_second_scop.ll | 2 +- .../invariant_load_in_non_affine_subregion.ll | 2 +- polly/test/CodeGen/invariant_load_loop_ub.ll | 2 +- ...ant_load_not_executed_but_in_parameters.ll | 2 +- .../test/CodeGen/invariant_load_outermost.ll | 2 +- ...riant_load_parameters_cyclic_dependence.ll | 4 +- .../CodeGen/invariant_load_ptr_ptr_noalias.ll | 2 +- .../test/CodeGen/invariant_load_scalar_dep.ll | 2 +- ...riant_load_scalar_escape_alloca_sharing.ll | 2 +- ...oads_from_struct_with_different_types_1.ll | 2 +- ...oads_from_struct_with_different_types_2.ll | 2 +- ...invariant_loads_ignore_parameter_bounds.ll | 3 +- .../invariant_verify_function_failed.ll | 2 +- .../invariant_verify_function_failed_2.ll | 4 +- polly/test/CodeGen/issue56692.ll | 2 +- .../large-numbers-in-boundary-context.ll | 2 +- .../test/CodeGen/load_subset_with_context.ll | 2 +- .../loop-invariant-load-type-mismatch.ll | 2 +- polly/test/CodeGen/loop_with_condition.ll | 2 +- polly/test/CodeGen/loop_with_condition_2.ll | 2 +- .../test/CodeGen/loop_with_condition_ineq.ll | 2 +- .../CodeGen/loop_with_condition_nested.ll | 4 +- ..._conditional_entry_edge_split_hard_case.ll | 2 +- polly/test/CodeGen/memcpy_annotations.ll | 2 +- .../multidim-non-matching-typesize-2.ll | 3 +- .../CodeGen/multidim-non-matching-typesize.ll | 3 +- ..._2d_parametric_array_static_loop_bounds.ll | 2 +- polly/test/CodeGen/multidim_alias_check.ll | 2 +- polly/test/CodeGen/multiple-codegens.ll | 4 +- polly/test/CodeGen/multiple-scops-in-a-row.ll | 2 +- .../multiple-types-invariant-load-2.ll | 3 +- .../CodeGen/multiple-types-invariant-load.ll | 3 +- .../multiple_sai_fro_same_base_address.ll | 4 +- polly/test/CodeGen/no-overflow-tracking.ll | 4 +- polly/test/CodeGen/no_guard_bb.ll | 2 +- 
...non-affine-dominance-generated-entering.ll | 2 +- .../CodeGen/non-affine-exit-node-dominance.ll | 2 +- .../non-affine-phi-node-expansion-2.ll | 3 +- .../non-affine-phi-node-expansion-3.ll | 3 +- .../non-affine-phi-node-expansion-4.ll | 3 +- .../CodeGen/non-affine-phi-node-expansion.ll | 3 +- ...e-region-exit-phi-incoming-synthesize-2.ll | 2 +- ...ine-region-exit-phi-incoming-synthesize.ll | 2 +- .../non-affine-region-implicit-store.ll | 2 +- ...ine-region-phi-references-in-scop-value.ll | 3 +- .../non-affine-subregion-dominance-reuse.ll | 3 +- polly/test/CodeGen/non-affine-switch.ll | 3 +- .../non-affine-synthesized-in-branch.ll | 2 +- polly/test/CodeGen/non-affine-update.ll | 3 +- .../non-hoisted-load-needed-as-base-ptr.ll | 2 +- .../test/CodeGen/non_affine_float_compare.ll | 4 +- .../CodeGen/only_non_affine_error_region.ll | 2 +- polly/test/CodeGen/openmp_limit_threads.ll | 12 +- .../test/CodeGen/out-of-scop-phi-node-use.ll | 2 +- polly/test/CodeGen/param_div_div_div_2.ll | 4 +- polly/test/CodeGen/partial_write_array.ll | 2 +- polly/test/CodeGen/partial_write_emptyset.ll | 2 +- ...l_write_full_write_that_appears_partial.ll | 2 +- .../partial_write_impossible_restriction.ll | 2 +- polly/test/CodeGen/partial_write_in_region.ll | 5 +- .../partial_write_in_region_with_loop.ll | 5 +- .../CodeGen/partial_write_mapped_scalar.ll | 2 +- .../partial_write_mapped_scalar_subregion.ll | 2 +- polly/test/CodeGen/perf_monitoring.ll | 3 +- .../perf_monitoring_cycles_per_scop.ll | 3 +- .../perf_monitoring_trip_counts_per_scop.ll | 3 +- polly/test/CodeGen/phi-defined-before-scop.ll | 2 +- .../phi_after_error_block_outside_of_scop.ll | 2 +- .../test/CodeGen/phi_condition_modeling_1.ll | 2 +- .../test/CodeGen/phi_condition_modeling_2.ll | 2 +- .../test/CodeGen/phi_conditional_simple_1.ll | 4 +- .../phi_in_exit_early_lnt_failure_1.ll | 2 +- .../phi_in_exit_early_lnt_failure_2.ll | 2 +- .../phi_in_exit_early_lnt_failure_3.ll | 2 +- .../phi_in_exit_early_lnt_failure_5.ll | 2 +- polly/test/CodeGen/phi_loop_carried_float.ll | 2 +- .../CodeGen/phi_loop_carried_float_escape.ll | 6 +- polly/test/CodeGen/phi_scalar_simple_1.ll | 2 +- polly/test/CodeGen/phi_scalar_simple_2.ll | 2 +- .../CodeGen/phi_with_multi_exiting_edges_2.ll | 2 +- polly/test/CodeGen/phi_with_one_exit_edge.ll | 2 +- .../CodeGen/pointer-type-expressions-2.ll | 4 +- .../test/CodeGen/pointer-type-expressions.ll | 4 +- .../pointer-type-pointer-type-comparison.ll | 4 +- polly/test/CodeGen/pointer_rem.ll | 4 +- polly/test/CodeGen/pr25241.ll | 2 +- polly/test/CodeGen/ptrtoint_as_parameter.ll | 2 +- polly/test/CodeGen/read-only-scalars.ll | 8 +- polly/test/CodeGen/reduction.ll | 2 +- polly/test/CodeGen/reduction_2.ll | 2 +- polly/test/CodeGen/reduction_simple_binary.ll | 2 +- polly/test/CodeGen/reggen_domtree_crash.ll | 2 +- .../test/CodeGen/region-with-instructions.ll | 2 +- polly/test/CodeGen/region_exiting-domtree.ll | 2 +- .../CodeGen/region_multiexit_partialwrite.ll | 2 +- ...run-time-condition-with-scev-parameters.ll | 4 +- polly/test/CodeGen/run-time-condition.ll | 2 +- .../scalar-references-used-in-scop-compute.ll | 2 +- .../test/CodeGen/scalar-store-from-same-bb.ll | 3 +- polly/test/CodeGen/scalar_codegen_crash.ll | 3 +- polly/test/CodeGen/scev-backedgetaken.ll | 2 +- .../CodeGen/scev-division-invariant-load.ll | 2 +- polly/test/CodeGen/scev.ll | 2 +- .../CodeGen/scev_expansion_in_nonaffine.ll | 3 +- .../CodeGen/scev_looking_through_bitcasts.ll | 2 +- .../CodeGen/scop_expander_insert_point.ll | 3 +- polly/test/CodeGen/scop_expander_segfault.ll 
| 2 +- ...p_never_executed_runtime_check_location.ll | 2 +- polly/test/CodeGen/select-base-pointer.ll | 2 +- polly/test/CodeGen/sequential_loops.ll | 2 +- .../CodeGen/simple_loop_non_single_exit.ll | 2 +- .../CodeGen/simple_loop_non_single_exit_2.ll | 2 +- polly/test/CodeGen/simple_non_single_entry.ll | 2 +- polly/test/CodeGen/simple_nonaffine_loop.ll | 2 +- .../single_do_loop_int_max_iterations.ll | 2 +- .../single_do_loop_int_param_iterations.ll | 2 +- .../single_do_loop_ll_max_iterations.ll | 4 +- .../CodeGen/single_do_loop_one_iteration.ll | 2 +- .../CodeGen/single_do_loop_scev_replace.ll | 2 +- polly/test/CodeGen/single_loop.ll | 2 +- .../CodeGen/single_loop_int_max_iterations.ll | 2 +- .../CodeGen/single_loop_ll_max_iterations.ll | 2 +- .../test/CodeGen/single_loop_one_iteration.ll | 2 +- polly/test/CodeGen/single_loop_param.ll | 2 +- .../CodeGen/single_loop_param_less_equal.ll | 6 +- .../CodeGen/single_loop_param_less_than.ll | 4 +- .../CodeGen/single_loop_zero_iterations.ll | 2 +- polly/test/CodeGen/split_edge_of_exit.ll | 4 +- polly/test/CodeGen/split_edges.ll | 2 +- polly/test/CodeGen/split_edges_2.ll | 2 +- polly/test/CodeGen/srem-in-other-bb.ll | 3 +- .../stack-overflow-in-load-hoisting.ll | 3 +- .../test/CodeGen/stmt_split_no_dependence.ll | 2 +- .../CodeGen/switch-in-non-affine-region.ll | 3 +- .../synthesizable_phi_write_after_loop.ll | 2 +- .../test-invalid-operands-for-select-2.ll | 2 +- .../test-invalid-operands-for-select.ll | 2 +- polly/test/CodeGen/test.ll | 2 +- .../two-loops-right-after-each-other-2.ll | 2 +- .../two-scops-in-row-invalidate-scevs.ll | 2 +- polly/test/CodeGen/two-scops-in-row.ll | 4 +- polly/test/CodeGen/udiv_expansion_position.ll | 2 +- .../CodeGen/uninitialized_scalar_memory.ll | 2 +- .../unpredictable-loop-unsynthesizable.ll | 6 +- .../test/CodeGen/variant_load_empty_domain.ll | 2 +- .../whole-scop-non-affine-subregion.ll | 3 +- polly/test/DeLICM/confused_order.ll | 4 +- ...ontradicting_assumed_context_and_domain.ll | 2 +- polly/test/DeLICM/load-in-cond-inf-loop.ll | 2 +- polly/test/DeLICM/map_memset_zero.ll | 4 +- polly/test/DeLICM/nomap_alreadymapped.ll | 2 +- polly/test/DeLICM/nomap_escaping.ll | 2 +- polly/test/DeLICM/nomap_occupied.ll | 2 +- polly/test/DeLICM/nomap_readonly.ll | 2 +- polly/test/DeLICM/nomap_spuriouswrite.ll | 2 +- polly/test/DeLICM/nomap_storagesize.ll | 2 +- polly/test/DeLICM/nomap_writewrite.ll | 2 +- polly/test/DeLICM/outofquota-reverseDomain.ll | 2 +- polly/test/DeLICM/pass_existence.ll | 6 +- polly/test/DeLICM/pr41656.ll | 2 +- polly/test/DeLICM/pr48783.ll | 2 +- polly/test/DeLICM/reduction.ll | 2 +- .../DeLICM/reduction_constant_selfconflict.ll | 2 +- polly/test/DeLICM/reduction_looprotate.ll | 2 +- .../reduction_looprotate_alwaystaken.ll | 2 +- .../DeLICM/reduction_looprotate_gvnpre.ll | 4 +- .../reduction_looprotate_gvnpre_cond1.ll | 2 +- .../reduction_looprotate_gvnpre_cond2.ll | 2 +- ...reduction_looprotate_gvnpre_nopreheader.ll | 2 +- .../DeLICM/reduction_looprotate_hoisted.ll | 2 +- .../test/DeLICM/reduction_looprotate_licm.ll | 2 +- .../test/DeLICM/reduction_looprotate_licm2.ll | 2 +- .../reduction_looprotate_licm_double_write.ll | 5 +- .../reduction_looprotate_licm_nopreheader.ll | 2 +- .../test/DeLICM/reduction_looprotate_load.ll | 2 +- .../reduction_looprotate_loopguard_gvnpre.ll | 2 +- .../reduction_looprotate_loopguard_licm1.ll | 2 +- .../reduction_looprotate_loopguard_licm2.ll | 2 +- .../reduction_looprotate_loopguard_licm3.ll | 2 +- .../DeLICM/reduction_looprotate_readonly.ll | 2 +- 
.../reduction_looprotate_synthesizable.ll | 2 +- .../test/DeLICM/reduction_looprotate_undef.ll | 2 +- .../test/DeLICM/reduction_overapproximate.ll | 6 +- polly/test/DeLICM/reduction_preheader.ll | 2 +- .../test/DeLICM/reduction_unrelatedunusual.ll | 2 +- polly/test/DeLICM/reject_loadafterstore.ll | 2 +- polly/test/DeLICM/reject_outofquota.ll | 4 +- polly/test/DeLICM/reject_storeafterstore.ll | 2 +- polly/test/DeLICM/reject_storeinsubregion.ll | 2 +- polly/test/DeLICM/reject_unusualstore.ll | 4 +- polly/test/DeLICM/skip_maywrite.ll | 2 +- polly/test/DeLICM/skip_multiaccess.ll | 2 +- polly/test/DeLICM/skip_notinloop.ll | 2 +- polly/test/DeLICM/skip_scalaraccess.ll | 2 +- .../DeadCodeElimination/chained_iterations.ll | 4 +- .../chained_iterations_2.ll | 4 +- polly/test/DeadCodeElimination/computeout.ll | 4 +- .../dead_iteration_elimination.ll | 2 +- .../non-affine-affine-mix.ll | 2 +- polly/test/DeadCodeElimination/non-affine.ll | 2 +- .../test/DeadCodeElimination/null_schedule.ll | 2 +- polly/test/DependenceInfo/computeout.ll | 4 +- .../different_schedule_dimensions.ll | 3 +- polly/test/DependenceInfo/do_pluto_matmult.ll | 4 +- polly/test/DependenceInfo/fine_grain_dep_0.ll | 4 +- .../generate_may_write_dependence_info.ll | 2 +- .../test/DependenceInfo/infeasible_context.ll | 6 +- ...writes_do_not_block_must_writes_for_war.ll | 2 +- .../nonaffine-condition-buildMemoryAccess.ll | 2 +- .../reduction_complex_location.ll | 6 +- ...ndences_equal_non_reduction_dependences.ll | 2 +- .../reduction_dependences_not_null.ll | 2 +- .../reduction_indirect_access.ll | 2 +- ...reduction_and_non_reduction_dependences.ll | 2 +- .../reduction_multiple_loops_array_sum.ll | 6 +- .../reduction_multiple_loops_array_sum_2.ll | 2 +- .../reduction_multiple_loops_array_sum_3.ll | 2 +- .../reduction_multiple_reductions.ll | 2 +- .../reduction_multiple_reductions_2.ll | 2 +- .../reduction_only_reduction_like_access.ll | 2 +- ...lly_escaping_intermediate_in_other_stmt.ll | 2 +- .../reduction_privatization_deps.ll | 2 +- .../reduction_privatization_deps_2.ll | 2 +- .../reduction_privatization_deps_3.ll | 2 +- .../reduction_privatization_deps_4.ll | 2 +- .../reduction_privatization_deps_5.ll | 2 +- .../test/DependenceInfo/reduction_sequence.ll | 2 +- .../DependenceInfo/reduction_simple_iv.ll | 2 +- ...ion_simple_iv_debug_wrapped_dependences.ll | 2 +- .../reduction_simple_privatization_deps_2.ll | 2 +- ...n_simple_privatization_deps_w_parameter.ll | 2 +- ...duction_two_reductions_different_rloops.ll | 2 +- polly/test/DependenceInfo/sequential_loops.ll | 6 +- polly/test/FlattenSchedule/gemm.ll | 2 +- polly/test/ForwardOpTree/atax.ll | 2 +- polly/test/ForwardOpTree/changed-kind.ll | 2 +- .../test/ForwardOpTree/forward_from_region.ll | 2 +- polly/test/ForwardOpTree/forward_hoisted.ll | 2 +- .../test/ForwardOpTree/forward_instruction.ll | 2 +- .../test/ForwardOpTree/forward_into_region.ll | 2 +- .../forward_into_region_redundant_use.ll | 2 +- polly/test/ForwardOpTree/forward_load.ll | 2 +- .../forward_load_differentarray.ll | 2 +- .../forward_load_double_write.ll | 2 +- .../ForwardOpTree/forward_load_fromloop.ll | 2 +- .../ForwardOpTree/forward_load_indirect.ll | 2 +- .../forward_load_memset_after.ll | 2 +- .../forward_load_memset_before.ll | 2 +- .../ForwardOpTree/forward_load_tripleuse.ll | 2 +- .../forward_load_unrelatedunusual.ll | 2 +- polly/test/ForwardOpTree/forward_phi_load.ll | 2 +- polly/test/ForwardOpTree/forward_readonly.ll | 4 +- polly/test/ForwardOpTree/forward_reusue.ll | 2 +- 
polly/test/ForwardOpTree/forward_store.ll | 2 +- .../forward_synthesizable_definloop.ll | 2 +- .../forward_synthesizable_indvar.ll | 2 +- .../forward_synthesizable_useinloop.ll | 2 +- .../test/ForwardOpTree/forward_transitive.ll | 2 +- polly/test/ForwardOpTree/jacobi-1d.ll | 2 +- .../ForwardOpTree/noforward_from_region.ll | 2 +- .../noforward_load_conditional.ll | 2 +- .../noforward_load_writebetween.ll | 2 +- .../ForwardOpTree/noforward_outofquota.ll | 4 +- polly/test/ForwardOpTree/noforward_partial.ll | 2 +- polly/test/ForwardOpTree/noforward_phi.ll | 2 +- .../ForwardOpTree/noforward_selfrefphi.ll | 2 +- .../ForwardOpTree/noforward_sideffects.ll | 2 +- .../noforward_synthesizable_unknownit.ll | 2 +- polly/test/ForwardOpTree/out-of-quota1.ll | 2 +- .../OpenMP/multiple_loops_outer_parallel.ll | 2 +- .../OpenMP/nested_loop_both_parallel.ll | 2 +- .../nested_loop_both_parallel_parametric.ll | 2 +- .../OpenMP/nested_loop_inner_parallel.ll | 2 +- .../OpenMP/nested_loop_outer_parallel.ll | 2 +- .../OpenMP/single_loop_param_non_parallel.ll | 2 +- .../OpenMP/single_loop_param_parallel.ll | 2 +- .../single_loop_param_parallel_computeout.ll | 2 +- .../alias_checks_with_empty_context.ll | 3 +- polly/test/IstAstInfo/alias_simple_1.ll | 10 +- polly/test/IstAstInfo/alias_simple_2.ll | 12 +- polly/test/IstAstInfo/alias_simple_3.ll | 10 +- .../aliasing_arrays_with_identical_base.ll | 4 +- .../aliasing_multiple_alias_groups.ll | 4 +- .../aliasing_parametric_simple_1.ll | 2 +- .../aliasing_parametric_simple_2.ll | 2 +- .../dependence_distance_constant.ll | 2 +- .../IstAstInfo/dependence_distance_minimal.ll | 2 +- .../dependence_distance_multiple_constant.ll | 2 +- .../dependence_distance_parametric.ll | 2 +- .../dependence_distance_parametric_expr.ll | 2 +- .../IstAstInfo/dependence_distance_varying.ll | 2 +- ...pendence_distance_varying_in_outer_loop.ll | 2 +- .../dependence_distance_varying_multiple.ll | 2 +- .../domain_bounded_only_with_context.ll | 2 +- polly/test/IstAstInfo/non_affine_access.ll | 2 +- ...duction_clauses_multidimensional_access.ll | 2 +- ...reduction_clauses_onedimensional_access.ll | 2 +- ...ndences_equal_non_reduction_dependences.ll | 2 +- .../reduction_different_reduction_clauses.ll | 2 +- .../IstAstInfo/reduction_in_one_dimension.ll | 2 +- .../IstAstInfo/reduction_loop_reversal.ll | 2 +- ...ction_modulo_and_loop_reversal_schedule.ll | 2 +- ...ion_modulo_and_loop_reversal_schedule_2.ll | 2 +- .../IstAstInfo/reduction_modulo_schedule.ll | 2 +- ...ion_modulo_schedule_multiple_dimensions.ll | 2 +- ...n_modulo_schedule_multiple_dimensions_2.ll | 2 +- ...n_modulo_schedule_multiple_dimensions_3.ll | 2 +- ...n_modulo_schedule_multiple_dimensions_4.ll | 2 +- ...n_modulo_schedule_multiple_dimensions_5.ll | 2 +- .../reduction_multiple_dimensions.ll | 2 +- .../reduction_multiple_dimensions_2.ll | 2 +- .../reduction_multiple_dimensions_3.ll | 2 +- .../reduction_multiple_dimensions_4.ll | 2 +- polly/test/IstAstInfo/run-time-condition.ll | 2 +- .../runtime_context_with_error_blocks.ll | 2 +- .../IstAstInfo/simple-run-time-condition.ll | 2 +- .../test/IstAstInfo/single_loop_strip_mine.ll | 4 +- .../single_loop_uint_max_iterations.ll | 2 +- .../single_loop_ull_max_iterations.ll | 2 +- .../ImportAccesses-Bad-relation.ll | 2 +- .../ImportAccesses-No-accesses-key.ll | 2 +- .../ImportAccesses-Not-enough-MemAcc.ll | 2 +- .../ImportAccesses-Not-enough-statements.ll | 2 +- .../ImportAccesses-Relation-mispelled.ll | 2 +- .../ImportAccesses-Statements-mispelled.ll | 2 +- 
...ImportAccesses-Undeclared-ScopArrayInfo.ll | 2 +- .../ImportAccesses-Wrong-number-dimensions.ll | 2 +- .../ImportArrays-Mispelled-type.ll | 2 +- .../ImportArrays-Negative-size.ll | 2 +- .../ImportArrays/ImportArrays-No-name.ll | 2 +- .../ImportArrays/ImportArrays-No-sizes-key.ll | 2 +- .../ImportArrays/ImportArrays-No-type-key.ll | 2 +- .../ImportContext-Context-mispelled.ll | 2 +- .../ImportContext-Not-parameter-set.ll | 2 +- .../ImportContext-Unvalid-Context.ll | 2 +- .../ImportContext-Wrong-dimension.ll | 2 +- .../ImportSchedule-No-schedule-key.ll | 2 +- .../ImportSchedule-Schedule-not-valid.ll | 2 +- .../ImportSchedule-Statements-mispelled.ll | 2 +- .../ImportSchedule-Wrong-number-statements.ll | 2 +- .../load_after_store_same_statement.ll | 4 +- .../read_from_original.ll | 4 +- .../MaximalStaticExpansion/too_many_writes.ll | 4 +- .../working_deps_between_inners.ll | 2 +- .../working_deps_between_inners_phi.ll | 4 +- .../working_expansion.ll | 2 +- ...sion_multiple_dependences_per_statement.ll | 2 +- ...sion_multiple_instruction_per_statement.ll | 2 +- .../working_phi_expansion.ll | 4 +- .../working_phi_two_scalars.ll | 4 +- .../working_value_expansion.ll | 2 +- .../prune_only_scalardeps.ll | 2 +- .../2012-03-16-Empty-Domain.ll | 2 +- .../2013-04-11-Empty-Domain-two.ll | 2 +- .../GreedyFuse/fuse-double.ll | 4 +- .../GreedyFuse/fuse-except-first.ll | 4 +- .../GreedyFuse/fuse-except-third.ll | 4 +- .../GreedyFuse/fuse-inner-carried.ll | 4 +- .../GreedyFuse/fuse-inner-third.ll | 4 +- .../GreedyFuse/fuse-inner.ll | 4 +- .../GreedyFuse/fuse-simple.ll | 4 +- .../GreedyFuse/nofuse-simple.ll | 4 +- .../GreedyFuse/nofuse-with-middle.ll | 4 +- .../ManualOptimization/disable_nonforced.ll | 2 +- .../distribute_heuristic.ll | 4 +- .../distribute_illegal_looploc.ll | 2 +- .../distribute_illegal_pragmaloc.ll | 2 +- .../ManualOptimization/unroll_disable.ll | 2 +- .../ManualOptimization/unroll_double.ll | 2 +- .../ManualOptimization/unroll_full.ll | 2 +- .../ManualOptimization/unroll_heuristic.ll | 4 +- .../ManualOptimization/unroll_partial.ll | 4 +- .../unroll_partial_followup.ll | 8 +- .../ScheduleOptimizer/SIMDInParallelFor.ll | 2 +- polly/test/ScheduleOptimizer/computeout.ll | 4 +- .../ensure-correct-tile-sizes.ll | 7 +- .../focaltech_test_detail_threshold-7bc17e.ll | 2 +- .../full_partial_tile_separation.ll | 2 +- polly/test/ScheduleOptimizer/line-tiling-2.ll | 2 +- polly/test/ScheduleOptimizer/line-tiling.ll | 2 +- .../mat_mul_pattern_data_layout.ll | 11 +- .../mat_mul_pattern_data_layout_2.ll | 10 +- .../ScheduleOptimizer/one-dimensional-band.ll | 2 +- .../ScheduleOptimizer/outer_coincidence.ll | 4 +- ...attern-matching-based-opts-after-delicm.ll | 6 +- ...tern-matching-based-opts-after-delicm_2.ll | 4 +- .../pattern-matching-based-opts.ll | 9 +- .../pattern-matching-based-opts_11.ll | 14 +- .../pattern-matching-based-opts_12.ll | 10 +- .../pattern-matching-based-opts_13.ll | 10 +- .../pattern-matching-based-opts_14.ll | 11 +- .../pattern-matching-based-opts_15.ll | 4 +- .../pattern-matching-based-opts_16.ll | 3 +- .../pattern-matching-based-opts_17.ll | 3 +- .../pattern-matching-based-opts_18.ll | 3 +- .../pattern-matching-based-opts_19.ll | 3 +- .../pattern-matching-based-opts_2.ll | 3 +- .../pattern-matching-based-opts_20.ll | 3 +- .../pattern-matching-based-opts_21.ll | 3 +- .../pattern-matching-based-opts_22.ll | 3 +- .../pattern-matching-based-opts_24.ll | 4 +- .../pattern-matching-based-opts_25.ll | 3 +- .../pattern-matching-based-opts_3.ll | 17 +- .../pattern-matching-based-opts_4.ll 
| 12 +- .../pattern-matching-based-opts_5.ll | 10 +- .../pattern-matching-based-opts_6.ll | 10 +- .../pattern-matching-based-opts_7.ll | 10 +- .../pattern-matching-based-opts_8.ll | 10 +- .../pattern-matching-based-opts_9.ll | 12 +- .../pattern_matching_based_opts_splitmap.ll | 2 +- .../prevectorization-without-tiling.ll | 2 +- .../ScheduleOptimizer/prevectorization.ll | 4 +- .../prevectorization_islbound.ll | 2 +- .../ScheduleOptimizer/rectangular-tiling.ll | 8 +- .../ScheduleOptimizer/schedule_computeout.ll | 2 +- polly/test/ScheduleOptimizer/statistics.ll | 2 +- .../ScheduleOptimizer/tile_after_fusion.ll | 2 +- ...vivid_vbi_gen_sliced-before-llvmreduced.ll | 2 +- .../aliasing_parametric_simple_1.ll | 2 +- .../aliasing_parametric_simple_2.ll | 2 +- polly/test/ScopDetect/aliasing_simple_1.ll | 2 +- polly/test/ScopDetect/aliasing_simple_2.ll | 2 +- polly/test/ScopDetect/base_pointer.ll | 2 +- .../base_pointer_load_setNewAccessRelation.ll | 2 +- .../base_pointer_setNewAccessRelation.ll | 4 +- polly/test/ScopDetect/callbr.ll | 4 +- .../ScopDetect/collective_invariant_loads.ll | 2 +- .../ScopDetect/cross_loop_non_single_exit.ll | 2 +- .../cross_loop_non_single_exit_2.ll | 2 +- ...ependency_to_phi_node_outside_of_region.ll | 2 +- .../test/ScopDetect/detect-full-functions.ll | 2 +- polly/test/ScopDetect/dom-tree-crash.ll | 2 +- polly/test/ScopDetect/dot-scops-npm.ll | 2 +- polly/test/ScopDetect/dot-scops.ll | 2 +- .../ScopDetect/error-block-always-executed.ll | 2 +- .../error-block-referenced-from-scop.ll | 2 +- .../ScopDetect/error-block-unreachable.ll | 2 +- .../ScopDetect/expand-region-correctly-2.ll | 2 +- .../ScopDetect/expand-region-correctly.ll | 2 +- .../test/ScopDetect/ignore_func_flag_regex.ll | 2 +- .../index_from_unpredictable_loop.ll | 4 +- .../index_from_unpredictable_loop2.ll | 4 +- polly/test/ScopDetect/indvars.ll | 2 +- polly/test/ScopDetect/intrinsics_1.ll | 2 +- polly/test/ScopDetect/intrinsics_2.ll | 2 +- polly/test/ScopDetect/intrinsics_3.ll | 2 +- .../ScopDetect/invalid-latch-conditions.ll | 6 +- .../ScopDetect/invalidate_scalar_evolution.ll | 2 +- .../ScopDetect/invariant-load-before-scop.ll | 2 +- polly/test/ScopDetect/keep_going_expansion.ll | 2 +- polly/test/ScopDetect/mod_ref_read_pointer.ll | 4 +- polly/test/ScopDetect/more-than-one-loop.ll | 4 +- .../ScopDetect/multidim-with-undef-size.ll | 2 +- polly/test/ScopDetect/multidim.ll | 2 +- .../ScopDetect/multidim_indirect_access.ll | 2 +- ..._two_accesses_different_delinearization.ll | 2 +- .../ScopDetect/nested_loop_single_exit.ll | 4 +- .../test/ScopDetect/non-affine-conditional.ll | 2 +- .../ScopDetect/non-affine-float-compare.ll | 2 +- ...-affine-loop-condition-dependent-access.ll | 8 +- ...ffine-loop-condition-dependent-access_2.ll | 6 +- ...ffine-loop-condition-dependent-access_3.ll | 6 +- polly/test/ScopDetect/non-affine-loop.ll | 10 +- .../non-beneficial-loops-small-trip-count.ll | 2 +- .../non-constant-add-rec-start-expr.ll | 2 +- .../ScopDetect/non-simple-memory-accesses.ll | 2 +- .../ScopDetect/non_affine_loop_condition.ll | 4 +- polly/test/ScopDetect/only-one-affine-loop.ll | 2 +- polly/test/ScopDetect/only_func_flag.ll | 2 +- polly/test/ScopDetect/only_func_flag_regex.ll | 2 +- .../parametric-multiply-in-scev-2.ll | 2 +- .../ScopDetect/parametric-multiply-in-scev.ll | 2 +- .../phi_with_multi_exiting_edges.ll | 2 +- .../profitability-large-basic-blocks.ll | 10 +- .../profitability-two-nested-loops.ll | 2 +- polly/test/ScopDetect/remove_all_children.ll | 2 +- polly/test/ScopDetect/report-scop-location.ll | 2 
+- .../restrict-undef-size-scopdetect.ll | 2 +- polly/test/ScopDetect/run_time_alias_check.ll | 2 +- polly/test/ScopDetect/scev_remove_max.ll | 2 +- polly/test/ScopDetect/sequential_loops.ll | 2 +- polly/test/ScopDetect/simple_loop.ll | 2 +- .../simple_loop_non_single_entry.ll | 2 +- .../ScopDetect/simple_loop_non_single_exit.ll | 2 +- .../simple_loop_non_single_exit_2.ll | 2 +- .../ScopDetect/simple_loop_two_phi_nodes.ll | 2 +- .../test/ScopDetect/simple_loop_with_param.ll | 2 +- .../ScopDetect/simple_loop_with_param_2.ll | 2 +- .../ScopDetect/simple_non_single_entry.ll | 2 +- .../ScopDetect/skip_function_attribute.ll | 2 +- .../srem_with_parametric_divisor.ll | 2 +- polly/test/ScopDetect/statistics.ll | 2 +- polly/test/ScopDetect/switch-in-loop-patch.ll | 2 +- .../test/ScopDetect/tlr_is_hoistable_load.ll | 2 +- .../ReportAlias-01.ll | 2 +- .../ScopDetectionDiagnostics/ReportEntry.ll | 2 +- .../ReportFuncCall-01.ll | 2 +- .../ReportIrreducibleRegion.ll | 2 +- .../ReportIrreducibleRegionWithoutDebugLoc.ll | 2 +- .../ReportLoopBound-01.ll | 16 +- .../ReportLoopHasNoExit.ll | 4 +- .../ReportMultipleNonAffineAccesses.ll | 12 +- .../ReportNonAffineAccess-01.ll | 2 +- .../ReportUnprofitable.ll | 8 +- .../ReportUnreachableInExit.ll | 3 +- .../ReportVariantBasePtr-01.ll | 2 +- .../loop_has_multiple_exits.ll | 2 +- .../loop_partially_in_scop-2.ll | 2 +- .../loop_partially_in_scop.ll | 2 +- .../ScopInfo/20110312-Fail-without-basicaa.ll | 2 +- .../20111108-Parameter-not-detected.ll | 2 +- ...03-16-Crash-because-of-unsigned-in-scev.ll | 2 +- .../2015-10-04-Crash-in-domain-generation.ll | 2 +- polly/test/ScopInfo/Alias-0.ll | 4 +- polly/test/ScopInfo/Alias-1.ll | 4 +- polly/test/ScopInfo/Alias-2.ll | 4 +- polly/test/ScopInfo/Alias-3.ll | 4 +- polly/test/ScopInfo/Alias-4.ll | 4 +- .../test/ScopInfo/BoundChecks/single-loop.ll | 4 +- polly/test/ScopInfo/BoundChecks/two-loops.ll | 4 +- polly/test/ScopInfo/NonAffine/div_backedge.ll | 2 +- polly/test/ScopInfo/NonAffine/div_domain.ll | 2 +- ...nt_loads_dependent_in_non_affine_region.ll | 2 +- .../ScopInfo/NonAffine/modulo_backedge.ll | 2 +- .../test/ScopInfo/NonAffine/modulo_domain.ll | 2 +- ...ffine-loop-condition-dependent-access_1.ll | 4 +- ...ffine-loop-condition-dependent-access_2.ll | 6 +- ...ffine-loop-condition-dependent-access_3.ll | 6 +- .../non_affine_access_with_range_2.ll | 2 +- .../ScopInfo/NonAffine/non_affine_but_sdiv.ll | 2 +- .../ScopInfo/NonAffine/non_affine_but_srem.ll | 2 +- .../non_affine_conditional_nested.ll | 2 +- ...ine_conditional_surrounding_affine_loop.ll | 11 +- ...conditional_surrounding_non_affine_loop.ll | 16 +- .../NonAffine/non_affine_float_compare.ll | 2 +- .../NonAffine/non_affine_loop_condition.ll | 6 +- .../NonAffine/non_affine_loop_used_later.ll | 4 +- .../NonAffine/non_affine_parametric_loop.ll | 2 +- .../non_affine_region_guaranteed_non-entry.ll | 2 +- ...whole-scop-non-affine-subregion-in-loop.ll | 2 +- .../aliasing_conditional_alias_groups_1.ll | 2 +- .../aliasing_conditional_alias_groups_2.ll | 2 +- polly/test/ScopInfo/aliasing_dead_access.ll | 2 +- .../aliasing_many_arrays_to_compare.ll | 7 +- ...iasing_many_parameters_not_all_involved.ll | 4 +- .../aliasing_many_read_only_acesses.ll | 2 +- .../aliasing_multiple_alias_groups.ll | 4 +- .../aliasing_with_non_affine_access.ll | 2 +- .../allow-all-parameters-dereferencable.ll | 11 +- polly/test/ScopInfo/assume_gep_bounds.ll | 4 +- polly/test/ScopInfo/assume_gep_bounds_2.ll | 3 +- polly/test/ScopInfo/assume_gep_bounds_many.ll | 3 +- 
.../avoid_new_parameters_from_geps.ll | 2 +- polly/test/ScopInfo/bool-addrec.ll | 2 +- .../test/ScopInfo/bounded_loop_assumptions.ll | 2 +- ...ces-loop-scev-with-unknown-iterations-2.ll | 6 +- ...ces-loop-scev-with-unknown-iterations-3.ll | 7 +- ...ences-loop-scev-with-unknown-iterations.ll | 7 +- polly/test/ScopInfo/bug_2010_10_22.ll | 2 +- polly/test/ScopInfo/bug_2011_1_5.ll | 2 +- .../test/ScopInfo/bug_scev_not_fully_eval.ll | 2 +- polly/test/ScopInfo/cfg_consequences.ll | 2 +- .../test/ScopInfo/complex-branch-structure.ll | 3 +- polly/test/ScopInfo/complex-condition.ll | 4 +- polly/test/ScopInfo/complex-expression.ll | 4 +- polly/test/ScopInfo/complex-loop-nesting.ll | 2 +- .../ScopInfo/complex-successor-structure-2.ll | 4 +- .../ScopInfo/complex-successor-structure-3.ll | 3 +- .../ScopInfo/complex-successor-structure.ll | 4 +- .../complex_domain_binary_condition.ll | 3 +- .../ScopInfo/complex_execution_context.ll | 4 +- polly/test/ScopInfo/cond_constant_in_loop.ll | 2 +- polly/test/ScopInfo/cond_in_loop.ll | 2 +- .../ScopInfo/condition-after-error-block-2.ll | 2 +- ...condition-after-error-block-before-scop.ll | 2 +- .../ScopInfo/condtion-after-error-block.ll | 2 +- polly/test/ScopInfo/const_srem_sdiv.ll | 3 +- .../constant-non-integer-branch-condition.ll | 2 +- .../ScopInfo/constant_factor_in_parameter.ll | 4 +- ...stant_functions_outside_scop_as_unknown.ll | 2 +- polly/test/ScopInfo/constant_start_integer.ll | 2 +- polly/test/ScopInfo/debug_call.ll | 2 +- .../delinearize-together-all-data-refs.ll | 2 +- polly/test/ScopInfo/div_by_zero.ll | 2 +- .../do-not-model-error-block-accesses.ll | 2 +- .../eager-binary-and-or-conditions.ll | 4 +- .../early_exit_for_complex_domains.ll | 2 +- polly/test/ScopInfo/error-blocks-1.ll | 2 +- polly/test/ScopInfo/error-blocks-2.ll | 3 +- polly/test/ScopInfo/error-blocks-3.ll | 2 +- polly/test/ScopInfo/escaping_empty_scop.ll | 2 +- polly/test/ScopInfo/exit-phi-1.ll | 4 +- polly/test/ScopInfo/exit-phi-2.ll | 2 +- polly/test/ScopInfo/exit_phi_accesses-2.ll | 2 +- polly/test/ScopInfo/exit_phi_accesses.ll | 2 +- .../ScopInfo/expensive-boundary-context.ll | 3 +- ...onstant_factor_introduces_new_parameter.ll | 4 +- polly/test/ScopInfo/full-function.ll | 6 +- polly/test/ScopInfo/granularity_same_name.ll | 8 +- .../test/ScopInfo/granularity_scalar-indep.ll | 2 +- ...ity_scalar-indep_cross-referencing-phi1.ll | 2 +- ...ity_scalar-indep_cross-referencing-phi2.ll | 2 +- .../granularity_scalar-indep_epilogue.ll | 2 +- .../granularity_scalar-indep_epilogue_last.ll | 2 +- .../granularity_scalar-indep_noepilogue.ll | 2 +- .../granularity_scalar-indep_ordered-2.ll | 2 +- .../granularity_scalar-indep_ordered.ll | 2 +- polly/test/ScopInfo/i1_params.ll | 2 +- polly/test/ScopInfo/infeasible-rtc.ll | 6 +- .../ScopInfo/infeasible_invalid_context.ll | 6 +- polly/test/ScopInfo/int2ptr_ptr2int.ll | 4 +- polly/test/ScopInfo/int2ptr_ptr2int_2.ll | 6 +- polly/test/ScopInfo/integers.ll | 2 +- .../ScopInfo/inter-error-bb-dependence.ll | 2 +- polly/test/ScopInfo/inter_bb_scalar_dep.ll | 3 +- .../intra-non-affine-stmt-phi-node.ll | 3 +- .../ScopInfo/intra_and_inter_bb_scalar_dep.ll | 3 +- polly/test/ScopInfo/intra_bb_scalar_dep.ll | 3 +- polly/test/ScopInfo/intrinsics.ll | 2 +- ..._add_rec_after_invariant_load_remapping.ll | 2 +- .../invalidate_iterator_during_MA_removal.ll | 2 +- .../test/ScopInfo/invariant-load-instlist.ll | 2 +- ...ariant-loads-leave-read-only-statements.ll | 4 +- polly/test/ScopInfo/invariant_load.ll | 2 +- ...load_access_classes_different_base_type.ll | 4 +- 
...ss_classes_different_base_type_escaping.ll | 4 +- ...lasses_different_base_type_same_pointer.ll | 4 +- ...fferent_base_type_same_pointer_escaping.ll | 4 +- .../ScopInfo/invariant_load_addrec_sum.ll | 2 +- .../ScopInfo/invariant_load_base_pointer.ll | 2 +- ...invariant_load_base_pointer_conditional.ll | 2 +- ...ariant_load_base_pointer_in_conditional.ll | 2 +- .../invariant_load_branch_condition.ll | 3 +- ...ariant_load_canonicalize_array_baseptrs.ll | 4 +- ...iant_load_canonicalize_array_baseptrs_2.ll | 4 +- ...iant_load_canonicalize_array_baseptrs_3.ll | 4 +- ...iant_load_canonicalize_array_baseptrs_4.ll | 4 +- ...ant_load_canonicalize_array_baseptrs_4b.ll | 4 +- ...ant_load_canonicalize_array_baseptrs_4c.ll | 4 +- ...iant_load_canonicalize_array_baseptrs_5.ll | 4 +- .../invariant_load_complex_condition.ll | 3 +- .../test/ScopInfo/invariant_load_condition.ll | 2 +- .../invariant_load_dereferenceable.ll | 4 +- ...iant_load_distinct_parameter_valuations.ll | 2 +- .../ScopInfo/invariant_load_in_non_affine.ll | 3 +- polly/test/ScopInfo/invariant_load_loop_ub.ll | 4 +- .../invariant_load_ptr_ptr_noalias.ll | 3 +- .../ScopInfo/invariant_load_scalar_dep.ll | 2 +- .../ScopInfo/invariant_load_stmt_domain.ll | 2 +- .../invariant_load_zext_parameter-2.ll | 4 +- .../ScopInfo/invariant_load_zext_parameter.ll | 4 +- ...load_zextended_in_own_execution_context.ll | 4 +- ...invariant_loads_complicated_dependences.ll | 2 +- .../invariant_loads_cyclic_dependences.ll | 2 +- polly/test/ScopInfo/invariant_loop_bounds.ll | 2 +- ...ariant_same_loop_bound_multiple_times-1.ll | 2 +- ...ariant_same_loop_bound_multiple_times-2.ll | 2 +- polly/test/ScopInfo/isl_aff_out_of_bounds.ll | 2 +- polly/test/ScopInfo/isl_trip_count_01.ll | 2 +- polly/test/ScopInfo/isl_trip_count_02.ll | 2 +- polly/test/ScopInfo/isl_trip_count_03.ll | 2 +- .../isl_trip_count_multiple_exiting_blocks.ll | 2 +- polly/test/ScopInfo/licm_load.ll | 31 +- polly/test/ScopInfo/licm_potential_store.ll | 79 +--- .../ScopInfo/licm_potential_store_mssa.ll | 50 +++ polly/test/ScopInfo/licm_reduction_nested.ll | 4 +- .../long-compile-time-alias-analysis.ll | 2 +- .../long-sequence-of-error-blocks-2.ll | 2 +- .../ScopInfo/long-sequence-of-error-blocks.ll | 3 +- .../test/ScopInfo/loop-multiexit-succ-cond.ll | 4 +- polly/test/ScopInfo/loop_affine_bound_0.ll | 4 +- polly/test/ScopInfo/loop_affine_bound_1.ll | 4 +- polly/test/ScopInfo/loop_affine_bound_2.ll | 4 +- polly/test/ScopInfo/loop_carry.ll | 2 +- .../test/ScopInfo/many-scalar-dependences.ll | 2 +- polly/test/ScopInfo/max-loop-depth.ll | 2 +- polly/test/ScopInfo/memcpy-raw-source.ll | 2 +- polly/test/ScopInfo/memcpy.ll | 4 +- polly/test/ScopInfo/memmove.ll | 4 +- polly/test/ScopInfo/memset.ll | 4 +- polly/test/ScopInfo/memset_null.ll | 4 +- .../ScopInfo/mismatching-array-dimensions.ll | 2 +- .../mod_ref_access_pointee_arguments.ll | 6 +- .../mod_ref_read_pointee_arguments.ll | 6 +- polly/test/ScopInfo/mod_ref_read_pointer.ll | 4 +- polly/test/ScopInfo/mod_ref_read_pointers.ll | 6 +- polly/test/ScopInfo/modulo_zext_1.ll | 2 +- polly/test/ScopInfo/modulo_zext_2.ll | 2 +- polly/test/ScopInfo/modulo_zext_3.ll | 2 +- polly/test/ScopInfo/multi-scop.ll | 2 +- .../ScopInfo/multidim_2d-diagonal-matrix.ll | 4 +- .../multidim_2d_outer_parametric_offset.ll | 2 +- ..._2d_parametric_array_static_loop_bounds.ll | 2 +- .../ScopInfo/multidim_2d_with_modref_call.ll | 8 +- .../multidim_2d_with_modref_call_2.ll | 8 +- ..._3d_parametric_array_static_loop_bounds.ll | 2 +- ...idim_fixedsize_different_dimensionality.ll | 2 +- 
.../multidim_fixedsize_multi_offset.ll | 2 +- .../ScopInfo/multidim_fold_constant_dim.ll | 2 +- .../multidim_fold_constant_dim_zero.ll | 2 +- polly/test/ScopInfo/multidim_fortran_2d.ll | 3 +- .../ScopInfo/multidim_fortran_2d_params.ll | 4 +- .../multidim_fortran_2d_with_modref_call.ll | 8 +- polly/test/ScopInfo/multidim_fortran_srem.ll | 2 +- .../test/ScopInfo/multidim_gep_pointercast.ll | 2 +- .../ScopInfo/multidim_gep_pointercast2.ll | 2 +- .../ScopInfo/multidim_invalid_dimension.ll | 2 +- .../multidim_ivs_and_integer_offsets_3d.ll | 2 +- ...multidim_ivs_and_parameteric_offsets_3d.ll | 2 +- .../test/ScopInfo/multidim_many_references.ll | 4 +- .../ScopInfo/multidim_nested_start_integer.ll | 4 +- .../multidim_nested_start_share_parameter.ll | 2 +- polly/test/ScopInfo/multidim_only_ivs_2d.ll | 2 +- polly/test/ScopInfo/multidim_only_ivs_3d.ll | 2 +- .../ScopInfo/multidim_only_ivs_3d_cast.ll | 2 +- .../ScopInfo/multidim_only_ivs_3d_reverse.ll | 2 +- .../ScopInfo/multidim_param_in_subscript-2.ll | 2 +- .../ScopInfo/multidim_param_in_subscript.ll | 2 +- .../multidim_parameter_addrec_product.ll | 2 +- .../multidim_single_and_multidim_array.ll | 16 +- polly/test/ScopInfo/multidim_srem.ll | 2 +- polly/test/ScopInfo/multidim_with_bitcast.ll | 2 +- .../ScopInfo/multiple-binary-or-conditions.ll | 4 +- ...ss-offset-not-dividable-by-element-size.ll | 4 +- .../ScopInfo/multiple-types-non-affine-2.ll | 4 +- .../ScopInfo/multiple-types-non-affine.ll | 4 +- .../multiple-types-non-power-of-two-2.ll | 2 +- .../multiple-types-non-power-of-two.ll | 2 +- .../multiple-types-two-dimensional-2.ll | 4 +- .../multiple-types-two-dimensional.ll | 4 +- polly/test/ScopInfo/multiple-types.ll | 3 +- .../test/ScopInfo/multiple_exiting_blocks.ll | 2 +- .../multiple_exiting_blocks_two_loop.ll | 2 +- polly/test/ScopInfo/multiple_latch_blocks.ll | 2 +- polly/test/ScopInfo/nested-loops.ll | 2 +- .../no-scalar-deps-in-non-affine-subregion.ll | 2 +- polly/test/ScopInfo/non-affine-region-phi.ll | 4 +- .../ScopInfo/non-affine-region-with-loop-2.ll | 2 +- .../ScopInfo/non-affine-region-with-loop.ll | 4 +- polly/test/ScopInfo/non-precise-inv-load-1.ll | 2 +- polly/test/ScopInfo/non-precise-inv-load-2.ll | 2 +- polly/test/ScopInfo/non-precise-inv-load-3.ll | 2 +- polly/test/ScopInfo/non-precise-inv-load-4.ll | 2 +- polly/test/ScopInfo/non-precise-inv-load-5.ll | 2 +- polly/test/ScopInfo/non-precise-inv-load-6.ll | 2 +- polly/test/ScopInfo/non-pure-function-call.ll | 2 +- ...-pure-function-calls-causes-dead-blocks.ll | 2 +- .../test/ScopInfo/non-pure-function-calls.ll | 2 +- polly/test/ScopInfo/non_affine_access.ll | 4 +- polly/test/ScopInfo/non_affine_region_1.ll | 2 +- polly/test/ScopInfo/non_affine_region_2.ll | 2 +- polly/test/ScopInfo/non_affine_region_3.ll | 4 +- polly/test/ScopInfo/non_affine_region_4.ll | 2 +- .../ScopInfo/nonaffine-buildMemoryAccess.ll | 2 +- polly/test/ScopInfo/not-a-reduction.ll | 2 +- polly/test/ScopInfo/opaque-struct.ll | 2 +- ...gion-entry-phi-node-nonaffine-subregion.ll | 2 +- ...ut-of-scop-use-in-region-entry-phi-node.ll | 2 +- .../ScopInfo/parameter-constant-division.ll | 4 +- .../ScopInfo/parameter_in_dead_statement.ll | 6 +- polly/test/ScopInfo/parameter_product.ll | 2 +- .../parameter_with_constant_factor_in_add.ll | 2 +- .../ScopInfo/partially_invariant_load_1.ll | 4 +- .../ScopInfo/partially_invariant_load_2.ll | 2 +- .../test/ScopInfo/phi-in-non-affine-region.ll | 2 +- polly/test/ScopInfo/phi_after_error_block.ll | 2 +- .../test/ScopInfo/phi_condition_modeling_1.ll | 2 +- 
.../test/ScopInfo/phi_condition_modeling_2.ll | 2 +- .../test/ScopInfo/phi_conditional_simple_1.ll | 2 +- polly/test/ScopInfo/phi_loop_carried_float.ll | 2 +- polly/test/ScopInfo/phi_not_grouped_at_top.ll | 2 +- polly/test/ScopInfo/phi_scalar_simple_1.ll | 2 +- polly/test/ScopInfo/phi_scalar_simple_2.ll | 2 +- polly/test/ScopInfo/phi_with_invoke_edge.ll | 2 +- .../ScopInfo/pointer-comparison-no-nsw.ll | 2 +- polly/test/ScopInfo/pointer-comparison.ll | 2 +- .../test/ScopInfo/pointer-type-expressions.ll | 2 +- ...er-used-as-base-pointer-and-scalar-read.ll | 2 +- .../polly-timeout-parameter-bounds.ll | 2 +- polly/test/ScopInfo/pr38218.ll | 2 +- ...eserve-equiv-class-order-in-basic_block.ll | 2 +- .../test/ScopInfo/process_added_dimensions.ll | 2 +- .../test/ScopInfo/pwaff-complexity-bailout.ll | 2 +- polly/test/ScopInfo/ranged_parameter.ll | 2 +- polly/test/ScopInfo/ranged_parameter_2.ll | 3 +- polly/test/ScopInfo/ranged_parameter_wrap.ll | 2 +- .../test/ScopInfo/ranged_parameter_wrap_2.ll | 2 +- .../read-only-scalar-used-in-phi-2.ll | 2 +- .../ScopInfo/read-only-scalar-used-in-phi.ll | 2 +- polly/test/ScopInfo/read-only-scalars.ll | 4 +- polly/test/ScopInfo/read-only-statements.ll | 2 +- .../ScopInfo/reduction_alternating_base.ll | 2 +- ...uction_chain_partially_outside_the_scop.ll | 2 +- .../ScopInfo/reduction_different_index.ll | 2 +- .../ScopInfo/reduction_different_index1.ll | 2 +- .../reduction_disabled_multiplicative.ll | 2 +- polly/test/ScopInfo/reduction_double.ll | 2 +- .../reduction_escaping_intermediate.ll | 2 +- .../reduction_escaping_intermediate_2.ll | 2 +- .../reduction_escaping_intermediate_3.ll | 2 +- polly/test/ScopInfo/reduction_if.ll | 2 +- .../ScopInfo/reduction_indirect_access.ll | 2 +- .../ScopInfo/reduction_indirect_access_2.ll | 2 +- .../reduction_invalid_different_operators.ll | 2 +- .../reduction_invalid_overlapping_accesses.ll | 2 +- .../reduction_long_reduction_chain.ll | 2 +- ...duction_long_reduction_chain_double_use.ll | 2 +- .../reduction_multiple_different_operators.ll | 2 +- .../reduction_multiple_loops_array_sum.ll | 2 +- .../reduction_multiple_loops_array_sum_1.ll | 2 +- .../reduction_multiple_simple_binary.ll | 2 +- .../reduction_non_overlapping_chains.ll | 2 +- .../reduction_only_reduction_like_access.ll | 2 +- polly/test/ScopInfo/reduction_simple_fp.ll | 2 +- .../ScopInfo/reduction_simple_w_constant.ll | 2 +- polly/test/ScopInfo/reduction_simple_w_iv.ll | 2 +- .../ScopInfo/reduction_two_identical_reads.ll | 4 +- .../redundant_parameter_constraint.ll | 2 +- .../test/ScopInfo/region-with-instructions.ll | 2 +- polly/test/ScopInfo/remarks.ll | 3 +- .../required-invariant-loop-bounds.ll | 3 +- .../ScopInfo/restriction_in_dead_block.ll | 2 +- .../run-time-check-many-array-disjuncts.ll | 5 +- .../run-time-check-many-parameters.ll | 2 +- .../run-time-check-many-piecewise-aliasing.ll | 5 +- .../run-time-check-read-only-arrays.ll | 2 +- .../same-base-address-scalar-and-array.ll | 2 +- polly/test/ScopInfo/scalar.ll | 2 +- .../ScopInfo/scalar_dependence_cond_br.ll | 2 +- polly/test/ScopInfo/scalar_to_array.ll | 4 +- .../scev-div-with-evaluatable-divisor.ll | 2 +- polly/test/ScopInfo/scev-invalidated.ll | 2 +- .../schedule-const-post-dominator-walk-2.ll | 2 +- .../schedule-const-post-dominator-walk.ll | 2 +- .../schedule-constuction-endless-loop1.ll | 2 +- .../schedule-constuction-endless-loop2.ll | 2 +- ...tly-contructed-in-case-of-infinite-loop.ll | 2 +- .../scop-affine-parameter-ordering.ll | 2 +- polly/test/ScopInfo/sign_wrapped_set.ll | 2 +- 
polly/test/ScopInfo/simple_loop_1.ll | 2 +- polly/test/ScopInfo/simple_loop_2.ll | 2 +- polly/test/ScopInfo/simple_loop_unsigned.ll | 2 +- polly/test/ScopInfo/simple_loop_unsigned_2.ll | 2 +- polly/test/ScopInfo/simple_loop_unsigned_3.ll | 2 +- .../ScopInfo/simple_nonaffine_loop_not.ll | 2 +- polly/test/ScopInfo/smax.ll | 2 +- polly/test/ScopInfo/statistics.ll | 2 +- .../stmt_split_exit_of_region_stmt.ll | 2 +- .../ScopInfo/stmt_split_no_after_split.ll | 2 +- .../test/ScopInfo/stmt_split_no_dependence.ll | 2 +- polly/test/ScopInfo/stmt_split_on_store.ll | 2 +- .../ScopInfo/stmt_split_on_synthesizable.ll | 2 +- .../stmt_split_phi_in_beginning_bb.ll | 2 +- polly/test/ScopInfo/stmt_split_phi_in_stmt.ll | 2 +- .../ScopInfo/stmt_split_scalar_dependence.ll | 2 +- polly/test/ScopInfo/stmt_split_within_loop.ll | 2 +- .../stmt_with_read_but_without_sideffect.ll | 2 +- polly/test/ScopInfo/switch-1.ll | 4 +- polly/test/ScopInfo/switch-2.ll | 4 +- polly/test/ScopInfo/switch-3.ll | 4 +- polly/test/ScopInfo/switch-4.ll | 4 +- polly/test/ScopInfo/switch-5.ll | 4 +- polly/test/ScopInfo/switch-6.ll | 4 +- polly/test/ScopInfo/switch-7.ll | 4 +- polly/test/ScopInfo/tempscop-printing.ll | 2 +- .../ScopInfo/test-wrapping-in-condition.ll | 4 +- polly/test/ScopInfo/truncate-1.ll | 2 +- polly/test/ScopInfo/truncate-2.ll | 2 +- polly/test/ScopInfo/truncate-3.ll | 3 +- polly/test/ScopInfo/two-loops-one-infinite.ll | 2 +- .../two-loops-right-after-each-other.ll | 2 +- polly/test/ScopInfo/undef_in_cond.ll | 2 +- polly/test/ScopInfo/unnamed_nonaffine.ll | 4 +- polly/test/ScopInfo/unnamed_stmts.ll | 2 +- .../ScopInfo/unpredictable_nonscop_loop.ll | 2 +- .../test/ScopInfo/unprofitable_scalar-accs.ll | 4 +- polly/test/ScopInfo/unsigned-condition.ll | 2 +- polly/test/ScopInfo/unsigned-division-1.ll | 2 +- polly/test/ScopInfo/unsigned-division-2.ll | 2 +- polly/test/ScopInfo/unsigned-division-3.ll | 2 +- polly/test/ScopInfo/unsigned-division-4.ll | 2 +- polly/test/ScopInfo/unsigned-division-5.ll | 2 +- polly/test/ScopInfo/unsigned_wrap_uge.ll | 2 +- polly/test/ScopInfo/unsigned_wrap_ugt.ll | 2 +- polly/test/ScopInfo/unsigned_wrap_ule.ll | 2 +- polly/test/ScopInfo/unsigned_wrap_ult.ll | 2 +- polly/test/ScopInfo/user_context.ll | 8 +- ...ed_assumptions-in-bb-signed-conditional.ll | 4 +- .../user_provided_assumptions-in-bb-signed.ll | 2 +- ...ser_provided_assumptions-in-bb-unsigned.ll | 4 +- .../ScopInfo/user_provided_assumptions.ll | 4 +- .../ScopInfo/user_provided_assumptions_2.ll | 4 +- .../ScopInfo/user_provided_assumptions_3.ll | 4 +- ...ser_provided_non_dominating_assumptions.ll | 6 +- polly/test/ScopInfo/variant_base_pointer.ll | 4 +- .../ScopInfo/variant_load_empty_domain.ll | 2 +- polly/test/ScopInfo/wraping_signed_expr_0.ll | 2 +- polly/test/ScopInfo/wraping_signed_expr_1.ll | 2 +- polly/test/ScopInfo/wraping_signed_expr_2.ll | 2 +- polly/test/ScopInfo/wraping_signed_expr_3.ll | 2 +- polly/test/ScopInfo/wraping_signed_expr_4.ll | 2 +- polly/test/ScopInfo/wraping_signed_expr_5.ll | 2 +- polly/test/ScopInfo/wraping_signed_expr_6.ll | 2 +- polly/test/ScopInfo/wraping_signed_expr_7.ll | 2 +- .../ScopInfo/wraping_signed_expr_slow_1.ll | 2 +- .../ScopInfo/wraping_signed_expr_slow_2.ll | 2 +- polly/test/ScopInfo/zero_ext_of_truncate.ll | 2 +- polly/test/ScopInfo/zero_ext_of_truncate_2.ll | 2 +- .../test/ScopInfo/zero_ext_space_mismatch.ll | 2 +- polly/test/ScopInliner/ignore-declares.ll | 2 +- polly/test/ScopInliner/invariant-load-func.ll | 2 +- polly/test/ScopInliner/simple-inline-loop.ll | 2 +- 
polly/test/Simplify/coalesce_3partials.ll | 2 +- .../Simplify/coalesce_disjointelements.ll | 2 +- polly/test/Simplify/coalesce_overlapping.ll | 2 +- polly/test/Simplify/coalesce_partial.ll | 2 +- polly/test/Simplify/dead_access_load.ll | 2 +- polly/test/Simplify/dead_access_phi.ll | 2 +- polly/test/Simplify/dead_access_value.ll | 2 +- polly/test/Simplify/dead_instruction.ll | 2 +- polly/test/Simplify/emptyaccessdomain.ll | 2 +- polly/test/Simplify/exit_phi_accesses-2.ll | 2 +- polly/test/Simplify/func-b320a7.ll | 2 +- polly/test/Simplify/gemm.ll | 2 +- .../Simplify/nocoalesce_differentvalues.ll | 2 +- .../Simplify/nocoalesce_elementmismatch.ll | 2 +- polly/test/Simplify/nocoalesce_readbetween.ll | 2 +- .../test/Simplify/nocoalesce_writebetween.ll | 2 +- polly/test/Simplify/notdead_region_exitphi.ll | 2 +- .../test/Simplify/notdead_region_innerphi.ll | 2 +- .../test/Simplify/notredundant_region_loop.ll | 2 +- .../Simplify/notredundant_region_middle.ll | 2 +- .../notredundant_synthesizable_unknownit.ll | 2 +- ...ut-of-scop-use-in-region-entry-phi-node.ll | 2 +- polly/test/Simplify/overwritten.ll | 2 +- polly/test/Simplify/overwritten_3phi.ll | 2 +- polly/test/Simplify/overwritten_3store.ll | 2 +- .../overwritten_implicit_and_explicit.ll | 2 +- .../test/Simplify/overwritten_loadbetween.ll | 2 +- polly/test/Simplify/overwritten_scalar.ll | 2 +- polly/test/Simplify/pass_existence.ll | 2 +- polly/test/Simplify/phi_in_regionstmt.ll | 2 +- polly/test/Simplify/pr33323.ll | 2 +- polly/test/Simplify/redundant.ll | 2 +- .../test/Simplify/redundant_differentindex.ll | 2 +- polly/test/Simplify/redundant_partialwrite.ll | 2 +- polly/test/Simplify/redundant_region.ll | 2 +- .../test/Simplify/redundant_region_scalar.ll | 2 +- polly/test/Simplify/redundant_scalarwrite.ll | 2 +- polly/test/Simplify/redundant_storebetween.ll | 2 +- polly/test/Simplify/scalability1.ll | 2 +- polly/test/Simplify/scalability2.ll | 2 +- polly/test/Simplify/sweep_mapped_phi.ll | 2 +- polly/test/Simplify/sweep_mapped_value.ll | 2 +- .../Simplify/ununsed_read_in_region_entry.ll | 4 +- polly/test/Support/Plugins.ll | 3 +- polly/test/Support/exportjson.ll | 24 +- polly/test/Support/isl-args.ll | 8 +- polly/test/Support/pipelineposition.ll | 8 +- polly/test/lit.site.cfg.in | 4 - polly/test/polly.ll | 2 +- 1143 files changed, 2700 insertions(+), 4460 deletions(-) delete mode 100644 polly/include/polly/LinkAllPasses.h create mode 100644 polly/include/polly/Pass/PhaseManager.h create mode 100644 polly/include/polly/Pass/PollyFunctionPass.h create mode 100644 polly/include/polly/Pass/PollyModulePass.h create mode 100644 polly/lib/Pass/PhaseManager.cpp create mode 100644 polly/lib/Pass/PollyFunctionPass.cpp create mode 100644 polly/lib/Pass/PollyModulePass.cpp create mode 100644 polly/test/ScopInfo/licm_potential_store_mssa.ll diff --git a/polly/docs/ReleaseNotes.rst b/polly/docs/ReleaseNotes.rst index f5ea47b69cf02..215a802843304 100644 --- a/polly/docs/ReleaseNotes.rst +++ b/polly/docs/ReleaseNotes.rst @@ -13,3 +13,7 @@ In Polly |version| the following important changes have been incorporated. * ScopInliner has been updated for the New Pass Manager. + * Polly is now a monolithic pass that is internally split into phases. + + * Polly's support for the legacy pass manager has been removed.
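The headers changed below replace the legacy wrapper passes with plain entry points (runDependenceAnalysis, runIslAstGen, runCodeGeneration, and so on) that the new PhaseManager invokes directly. The following is a minimal sketch of how those entry points compose per SCoP; only the run* function signatures are taken from this patch, while the glue code, the helper name processScop, and the phase order shown are illustrative assumptions rather than the actual logic in polly/lib/Pass/PhaseManager.cpp:

  // Hypothetical per-SCoP phase driver. Only the run* signatures below come
  // from this patch; ordering and control flow are illustrative assumptions.
  #include "polly/CodeGen/CodeGeneration.h"  // runCodeGeneration
  #include "polly/CodeGen/IslAst.h"          // runIslAstGen
  #include "polly/DeLICM.h"                  // runDeLICM
  #include "polly/DeadCodeElimination.h"     // runDeadCodeElim
  #include "polly/DependenceInfo.h"          // runDependenceAnalysis
  #include "polly/ForwardOpTree.h"           // runForwardOpTree
  #include "llvm/Analysis/RegionInfo.h"
  #include <memory>

  static bool processScop(polly::Scop &S, llvm::RegionInfo &RI) {
    using namespace polly;
    // Scalar-to-array phases run first so later phases see fewer scalar
    // dependencies.
    bool Changed = runForwardOpTree(S);
    Changed |= runDeLICM(S);

    // Dependences are computed once and handed explicitly to every phase that
    // needs them, instead of being cached behind a legacy analysis pass.
    DependenceAnalysis::Result DA = runDependenceAnalysis(S);
    Changed |= runDeadCodeElim(S, DA);

    // AST generation consumes the dependences; code generation consumes the
    // generated AST.
    if (std::unique_ptr<IslAstInfo> AI = runIslAstGen(S, DA))
      Changed |= runCodeGeneration(S, RI, *AI);
    return Changed;
  }

Threading the DependenceAnalysis::Result through the phases explicitly is what makes the per-SCoP caching in the removed DependenceInfo and DependenceInfoWrapperPass classes unnecessary.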
+ diff --git a/polly/include/polly/Canonicalization.h b/polly/include/polly/Canonicalization.h index 03f277e4e91ba..972b660894a1c 100644 --- a/polly/include/polly/Canonicalization.h +++ b/polly/include/polly/Canonicalization.h @@ -11,12 +11,6 @@ #include "llvm/Passes/PassBuilder.h" -namespace llvm { -namespace legacy { -class PassManagerBase; -} -} // namespace llvm - namespace polly { /// Schedule a set of canonicalization passes to prepare for Polly. @@ -26,8 +20,6 @@ namespace polly { /// into a canonical form that simplifies the analysis and optimization passes /// of Polly. The set of optimization passes scheduled here is probably not yet /// optimal. TODO: Optimize the set of canonicalization passes. -void registerCanonicalicationPasses(llvm::legacy::PassManagerBase &PM); - llvm::FunctionPassManager buildCanonicalicationPassesForNPM(llvm::ModulePassManager &MPM, llvm::OptimizationLevel Level); diff --git a/polly/include/polly/CodeGen/CodeGeneration.h b/polly/include/polly/CodeGen/CodeGeneration.h index 57aec1d70cc72..2340fbe016b49 100644 --- a/polly/include/polly/CodeGen/CodeGeneration.h +++ b/polly/include/polly/CodeGen/CodeGeneration.h @@ -14,6 +14,7 @@ #include "llvm/IR/PassManager.h" namespace polly { +class IslAstInfo; enum VectorizerChoice { VECTORIZER_NONE, @@ -33,6 +34,8 @@ struct CodeGenerationPass final : PassInfoMixin<CodeGenerationPass> { }; extern bool PerfMonitoring; + +bool runCodeGeneration(Scop &S, llvm::RegionInfo &RI, IslAstInfo &AI); } // namespace polly #endif // POLLY_CODEGENERATION_H diff --git a/polly/include/polly/CodeGen/IslAst.h b/polly/include/polly/CodeGen/IslAst.h index c99a4957d6b48..3e1ff2c8a24da 100644 --- a/polly/include/polly/CodeGen/IslAst.h +++ b/polly/include/polly/CodeGen/IslAst.h @@ -21,6 +21,7 @@ #ifndef POLLY_ISLAST_H #define POLLY_ISLAST_H +#include "polly/DependenceInfo.h" #include "polly/ScopPass.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/IR/PassManager.h" @@ -172,33 +173,6 @@ struct IslAstAnalysis : AnalysisInfoMixin<IslAstAnalysis> { ScopStandardAnalysisResults &SAR); }; -class IslAstInfoWrapperPass final : public ScopPass { - std::unique_ptr<IslAstInfo> Ast; - -public: - static char ID; - - IslAstInfoWrapperPass() : ScopPass(ID) {} - - IslAstInfo &getAI() { return *Ast; } - const IslAstInfo &getAI() const { return *Ast; } - - /// Build the AST for the given SCoP @p S. - bool runOnScop(Scop &S) override; - - /// Register all analyses and transformation required. - void getAnalysisUsage(AnalysisUsage &AU) const override; - - /// Release the internal memory. - void releaseMemory() override; - - /// Print a source code representation of the program. 
- void printScop(raw_ostream &OS, Scop &S) const override; -}; - -llvm::Pass *createIslAstInfoWrapperPassPass(); -llvm::Pass *createIslAstInfoPrinterLegacyPass(llvm::raw_ostream &OS); - struct IslAstPrinterPass final : PassInfoMixin<IslAstPrinterPass> { IslAstPrinterPass(raw_ostream &OS) : OS(OS) {} @@ -207,11 +181,9 @@ struct IslAstPrinterPass final : PassInfoMixin<IslAstPrinterPass> { raw_ostream &OS; }; -} // namespace polly -namespace llvm { -void initializeIslAstInfoWrapperPassPass(llvm::PassRegistry &); -void initializeIslAstInfoPrinterLegacyPassPass(llvm::PassRegistry &); -} // namespace llvm +std::unique_ptr<IslAstInfo> runIslAstGen(Scop &S, + DependenceAnalysis::Result &DA); +} // namespace polly #endif // POLLY_ISLAST_H diff --git a/polly/include/polly/CodePreparation.h b/polly/include/polly/CodePreparation.h index c6bc526db209d..1a15e3d4d5a29 100644 --- a/polly/include/polly/CodePreparation.h +++ b/polly/include/polly/CodePreparation.h @@ -15,6 +15,12 @@ #include "llvm/IR/PassManager.h" +namespace llvm { +class DominatorTree; +class LoopInfo; +class RegionInfo; +} // namespace llvm + namespace polly { struct CodePreparationPass final : llvm::PassInfoMixin<CodePreparationPass> { llvm::PreservedAnalyses run(llvm::Function &F, diff --git a/polly/include/polly/DeLICM.h b/polly/include/polly/DeLICM.h index 0e03c04079480..63fc509e0bd46 100644 --- a/polly/include/polly/DeLICM.h +++ b/polly/include/polly/DeLICM.h @@ -21,15 +21,10 @@ #include "isl/isl-noexceptions.h" namespace llvm { -class PassRegistry; -class Pass; class raw_ostream; } // namespace llvm namespace polly { -/// Create a new DeLICM pass instance. -llvm::Pass *createDeLICMWrapperPass(); -llvm::Pass *createDeLICMPrinterLegacyPass(llvm::raw_ostream &OS); struct DeLICMPass final : llvm::PassInfoMixin<DeLICMPass> { DeLICMPass() {} @@ -59,11 +54,7 @@ bool isConflicting(isl::union_set ExistingOccupied, isl::union_map ProposedWrites, llvm::raw_ostream *OS = nullptr, unsigned Indent = 0); +bool runDeLICM(Scop &S); } // namespace polly -namespace llvm { -void initializeDeLICMWrapperPassPass(llvm::PassRegistry &); -void initializeDeLICMPrinterLegacyPassPass(llvm::PassRegistry &); -} // namespace llvm - #endif /* POLLY_DELICM_H */ diff --git a/polly/include/polly/DeadCodeElimination.h b/polly/include/polly/DeadCodeElimination.h index d416afa030c56..4d8da56c76eec 100644 --- a/polly/include/polly/DeadCodeElimination.h +++ b/polly/include/polly/DeadCodeElimination.h @@ -13,16 +13,10 @@ #ifndef POLLY_DEADCODEELIMINATION_H #define POLLY_DEADCODEELIMINATION_H +#include "polly/DependenceInfo.h" #include "polly/ScopPass.h" -namespace llvm { -class PassRegistry; -class Pass; -class raw_ostream; -} // namespace llvm - namespace polly { -llvm::Pass *createDeadCodeElimWrapperPass(); struct DeadCodeElimPass final : llvm::PassInfoMixin<DeadCodeElimPass> { DeadCodeElimPass() {} @@ -31,10 +25,7 @@ struct DeadCodeElimPass final : llvm::PassInfoMixin<DeadCodeElimPass> { ScopStandardAnalysisResults &SAR, SPMUpdater &U); }; +bool runDeadCodeElim(Scop &S, DependenceAnalysis::Result &DA); } // namespace polly -namespace llvm { -void initializeDeadCodeElimWrapperPassPass(llvm::PassRegistry &); -} // namespace llvm - #endif /* POLLY_DEADCODEELIMINATION_H */ diff --git a/polly/include/polly/DependenceInfo.h b/polly/include/polly/DependenceInfo.h index d562ad80592f2..88ea468dd5473 100644 --- a/polly/include/polly/DependenceInfo.h +++ b/polly/include/polly/DependenceInfo.h @@ -145,7 +145,6 @@ class Dependences final { friend struct DependenceAnalysis; friend 
struct DependenceInfoPrinterPass; friend class DependenceInfo; - friend class DependenceInfoWrapperPass; /// Destructor that will free internal objects. ~Dependences() { releaseMemory(); } @@ -192,6 +191,8 @@ class Dependences final { const AnalysisLevel Level; }; +extern Dependences::AnalysisLevel OptAnalysisLevel; + struct DependenceAnalysis final : public AnalysisInfoMixin<DependenceAnalysis> { static AnalysisKey Key; struct Result { @@ -232,108 +233,7 @@ struct DependenceInfoPrinterPass final raw_ostream &OS; }; -class DependenceInfo final : public ScopPass { -public: - static char ID; - - /// Construct a new DependenceInfo pass. - DependenceInfo() : ScopPass(ID) {} - - /// Return the dependence information for the current SCoP. - /// - /// @param Level The granularity of dependence analysis result. - /// - /// @return The dependence analysis result - /// - const Dependences &getDependences(Dependences::AnalysisLevel Level); - - /// Recompute dependences from schedule and memory accesses. - const Dependences &recomputeDependences(Dependences::AnalysisLevel Level); - - /// Invalidate the dependence information and recompute it when needed again. - /// May be required when the underlying Scop was changed in a way that would - /// add new dependencies (e.g. between new statement instances insierted into - /// the SCoP) or intentionally breaks existing ones. It is not required when - /// updating the schedule that conforms the existing dependencies. - void abandonDependences(); - - /// Compute the dependence information for the SCoP @p S. - bool runOnScop(Scop &S) override; - - /// Print the dependences for the given SCoP to @p OS. - void printScop(raw_ostream &OS, Scop &) const override; - - /// Release the internal memory. - void releaseMemory() override { - for (auto &d : D) - d.reset(); - } - - /// Register all analyses and transformation required. - void getAnalysisUsage(AnalysisUsage &AU) const override; - -private: - Scop *S; - - /// Dependences struct for the current SCoP. - std::unique_ptr<Dependences> D[Dependences::NumAnalysisLevels]; -}; - -llvm::Pass *createDependenceInfoPass(); -llvm::Pass *createDependenceInfoPrinterLegacyPass(llvm::raw_ostream &OS); - -/// Construct a new DependenceInfoWrapper pass. -class DependenceInfoWrapperPass final : public FunctionPass { -public: - static char ID; - - /// Construct a new DependenceInfoWrapper pass. - DependenceInfoWrapperPass() : FunctionPass(ID) {} - - /// Return the dependence information for the given SCoP. - /// - /// @param S SCoP object. - /// @param Level The granularity of dependence analysis result. - /// - /// @return The dependence analysis result - /// - const Dependences &getDependences(Scop *S, Dependences::AnalysisLevel Level); - - /// Recompute dependences from schedule and memory accesses. - const Dependences &recomputeDependences(Scop *S, - Dependences::AnalysisLevel Level); - - /// Compute the dependence information on-the-fly for the function. - bool runOnFunction(Function &F) override; - - /// Print the dependences for the current function to @p OS. - void print(raw_ostream &OS, const Module *M = nullptr) const override; - - /// Release the internal memory. - void releaseMemory() override { ScopToDepsMap.clear(); } - - /// Register all analyses and transformation required. - void getAnalysisUsage(AnalysisUsage &AU) const override; - -private: - using ScopToDepsMapTy = DenseMap<Scop *, std::unique_ptr<Dependences>>; - - /// Scop to Dependence map for the current function. 
-  ScopToDepsMapTy ScopToDepsMap;
-};
-
-llvm::Pass *createDependenceInfoWrapperPassPass();
-llvm::Pass *
-createDependenceInfoPrinterLegacyFunctionPass(llvm::raw_ostream &OS);
-
+DependenceAnalysis::Result runDependenceAnalysis(Scop &S);
 } // namespace polly
 
-namespace llvm {
-void initializeDependenceInfoPass(llvm::PassRegistry &);
-void initializeDependenceInfoPrinterLegacyPassPass(llvm::PassRegistry &);
-void initializeDependenceInfoWrapperPassPass(llvm::PassRegistry &);
-void initializeDependenceInfoPrinterLegacyFunctionPassPass(
-    llvm::PassRegistry &);
-} // namespace llvm
-
 #endif
diff --git a/polly/include/polly/FlattenSchedule.h b/polly/include/polly/FlattenSchedule.h
index 3ef3c304243df..154344d2f5c3e 100644
--- a/polly/include/polly/FlattenSchedule.h
+++ b/polly/include/polly/FlattenSchedule.h
@@ -15,20 +15,10 @@
 #ifndef POLLY_FLATTENSCHEDULE_H
 #define POLLY_FLATTENSCHEDULE_H
 
-namespace llvm {
-class PassRegistry;
-class Pass;
-class raw_ostream;
-} // namespace llvm
-
 namespace polly {
-llvm::Pass *createFlattenSchedulePass();
-llvm::Pass *createFlattenSchedulePrinterLegacyPass(llvm::raw_ostream &OS);
-} // namespace polly
+class Scop;
 
-namespace llvm {
-void initializeFlattenSchedulePass(llvm::PassRegistry &);
-void initializeFlattenSchedulePrinterLegacyPassPass(llvm::PassRegistry &);
-} // namespace llvm
+void runFlattenSchedulePass(Scop &S);
+} // namespace polly
 
 #endif /* POLLY_FLATTENSCHEDULE_H */
diff --git a/polly/include/polly/ForwardOpTree.h b/polly/include/polly/ForwardOpTree.h
index b5da0f513ab78..8b2ece1f08e15 100644
--- a/polly/include/polly/ForwardOpTree.h
+++ b/polly/include/polly/ForwardOpTree.h
@@ -15,13 +15,7 @@
 
 #include "polly/ScopPass.h"
 
-namespace llvm {
-class PassRegistry;
-} // namespace llvm
-
 namespace polly {
-llvm::Pass *createForwardOpTreeWrapperPass();
-llvm::Pass *createForwardOpTreePrinterLegacyPass(llvm::raw_ostream &OS);
 
 struct ForwardOpTreePass final : llvm::PassInfoMixin<ForwardOpTreePass> {
   ForwardOpTreePass() {}
@@ -41,11 +35,15 @@ struct ForwardOpTreePrinterPass final
   llvm::raw_ostream &OS;
 };
 
+/// Pass that redirects scalar reads to array elements that are known to
+/// contain the same value.
+///
+/// This reduces the number of scalar accesses and therefore potentially
+/// increases the freedom of the scheduler. In the ideal case, all reads of a
+/// scalar definition are redirected (We currently do not care about removing
+/// the write in this case). This is also useful for the main DeLICM pass as
+/// there are fewer scalars to be mapped.
+bool runForwardOpTree(Scop &S);
 } // namespace polly
 
-namespace llvm {
-void initializeForwardOpTreeWrapperPassPass(PassRegistry &);
-void initializeForwardOpTreePrinterLegacyPassPass(PassRegistry &);
-} // namespace llvm
-
 #endif // POLLY_FORWARDOPTREE_H
diff --git a/polly/include/polly/JSONExporter.h b/polly/include/polly/JSONExporter.h
index 958f95ea11404..82a881c737064 100644
--- a/polly/include/polly/JSONExporter.h
+++ b/polly/include/polly/JSONExporter.h
@@ -9,13 +9,11 @@
 #ifndef POLLY_JSONEXPORTER_H
 #define POLLY_JSONEXPORTER_H
 
+#include "polly/DependenceInfo.h"
 #include "polly/ScopPass.h"
 #include "llvm/IR/PassManager.h"
 
 namespace polly {
-llvm::Pass *createJSONExporterPass();
-llvm::Pass *createJSONImporterPass();
-llvm::Pass *createJSONImporterPrinterLegacyPass(llvm::raw_ostream &OS);
 
 /// This pass exports a scop to a jscop file. The filename is generated from the
 /// concatenation of the function and scop name.
@@ -30,12 +28,9 @@ struct JSONImportPass final : llvm::PassInfoMixin<JSONExportPass> { llvm::PreservedAnalyses run(Scop &, ScopAnalysisManager &, ScopStandardAnalysisResults &, SPMUpdater &); }; -} // namespace polly -namespace llvm { -void initializeJSONExporterPass(llvm::PassRegistry &); -void initializeJSONImporterPass(llvm::PassRegistry &); -void initializeJSONImporterPrinterLegacyPassPass(llvm::PassRegistry &); -} // namespace llvm +void runImportJSON(Scop &S, DependenceAnalysis::Result &DA); +void runExportJSON(Scop &S); +} // namespace polly #endif /* POLLY_JSONEXPORTER_H */ diff --git a/polly/include/polly/LinkAllPasses.h b/polly/include/polly/LinkAllPasses.h deleted file mode 100644 index 9978344c73e9f..0000000000000 --- a/polly/include/polly/LinkAllPasses.h +++ /dev/null @@ -1,156 +0,0 @@ -//===- polly/LinkAllPasses.h ----------- Reference All Passes ---*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This header file pulls in all transformation and analysis passes for tools -// like opt and bugpoint that need this functionality. -// -//===----------------------------------------------------------------------===// - -#ifndef POLLY_LINKALLPASSES_H -#define POLLY_LINKALLPASSES_H - -#include "polly/Config/config.h" -#include "polly/Support/DumpFunctionPass.h" -#include "polly/Support/DumpModulePass.h" -#include "llvm/ADT/StringRef.h" -#include "llvm/Support/AlwaysTrue.h" - -namespace llvm { -class Pass; -class PassRegistry; -} // namespace llvm - -namespace polly { -llvm::Pass *createCodePreparationPass(); -llvm::Pass *createScopInlinerPass(); -llvm::Pass *createDeadCodeElimWrapperPass(); -llvm::Pass *createDependenceInfoPass(); -llvm::Pass *createDependenceInfoPrinterLegacyPass(llvm::raw_ostream &OS); -llvm::Pass *createDependenceInfoWrapperPassPass(); -llvm::Pass * -createDependenceInfoPrinterLegacyFunctionPass(llvm::raw_ostream &OS); -llvm::Pass *createDOTOnlyPrinterWrapperPass(); -llvm::Pass *createDOTOnlyViewerWrapperPass(); -llvm::Pass *createDOTPrinterWrapperPass(); -llvm::Pass *createDOTViewerWrapperPass(); -llvm::Pass *createJSONExporterPass(); -llvm::Pass *createJSONImporterPass(); -llvm::Pass *createJSONImporterPrinterLegacyPass(llvm::raw_ostream &OS); -llvm::Pass *createPollyCanonicalizePass(); -llvm::Pass *createScopDetectionWrapperPassPass(); -llvm::Pass *createScopDetectionPrinterLegacyPass(llvm::raw_ostream &OS); -llvm::Pass *createScopInfoRegionPassPass(); -llvm::Pass *createScopInfoPrinterLegacyRegionPass(llvm::raw_ostream &OS); -llvm::Pass *createScopInfoWrapperPassPass(); -llvm::Pass *createScopInfoPrinterLegacyFunctionPass(llvm::raw_ostream &OS); -llvm::Pass *createIslAstInfoWrapperPassPass(); -llvm::Pass *createIslAstInfoPrinterLegacyPass(llvm::raw_ostream &OS); -llvm::Pass *createCodeGenerationPass(); -llvm::Pass *createIslScheduleOptimizerWrapperPass(); -llvm::Pass *createIslScheduleOptimizerPrinterLegacyPass(llvm::raw_ostream &OS); -llvm::Pass *createFlattenSchedulePass(); -llvm::Pass *createFlattenSchedulePrinterLegacyPass(llvm::raw_ostream &OS); -llvm::Pass *createForwardOpTreeWrapperPass(); -llvm::Pass *createForwardOpTreePrinterLegacyPass(llvm::raw_ostream &OS); -llvm::Pass *createDeLICMWrapperPass(); -llvm::Pass *createDeLICMPrinterLegacyPass(llvm::raw_ostream &OS); -llvm::Pass 
*createMaximalStaticExpansionPass(); -llvm::Pass *createSimplifyWrapperPass(int); -llvm::Pass *createSimplifyPrinterLegacyPass(llvm::raw_ostream &OS); -llvm::Pass *createPruneUnprofitableWrapperPass(); - -extern char &CodePreparationID; -} // namespace polly - -namespace { -struct PollyForcePassLinking { - PollyForcePassLinking() { - // We must reference the passes in such a way that compilers will not delete - // it all as dead code, even with whole program optimization, yet is - // effectively a NO-OP. - if (llvm::getNonFoldableAlwaysTrue()) - return; - - polly::createCodePreparationPass(); - polly::createDeadCodeElimWrapperPass(); - polly::createDependenceInfoPass(); - polly::createDependenceInfoPrinterLegacyPass(llvm::outs()); - polly::createDependenceInfoWrapperPassPass(); - polly::createDependenceInfoPrinterLegacyFunctionPass(llvm::outs()); - polly::createDOTOnlyPrinterWrapperPass(); - polly::createDOTOnlyViewerWrapperPass(); - polly::createDOTPrinterWrapperPass(); - polly::createDOTViewerWrapperPass(); - polly::createJSONExporterPass(); - polly::createJSONImporterPass(); - polly::createJSONImporterPrinterLegacyPass(llvm::outs()); - polly::createScopDetectionWrapperPassPass(); - polly::createScopDetectionPrinterLegacyPass(llvm::outs()); - polly::createScopInfoRegionPassPass(); - polly::createScopInfoPrinterLegacyRegionPass(llvm::outs()); - polly::createScopInfoWrapperPassPass(); - polly::createScopInfoPrinterLegacyFunctionPass(llvm::outs()); - polly::createPollyCanonicalizePass(); - polly::createIslAstInfoWrapperPassPass(); - polly::createIslAstInfoPrinterLegacyPass(llvm::outs()); - polly::createCodeGenerationPass(); - polly::createIslScheduleOptimizerWrapperPass(); - polly::createIslScheduleOptimizerPrinterLegacyPass(llvm::outs()); - polly::createMaximalStaticExpansionPass(); - polly::createFlattenSchedulePass(); - polly::createFlattenSchedulePrinterLegacyPass(llvm::errs()); - polly::createForwardOpTreeWrapperPass(); - polly::createForwardOpTreePrinterLegacyPass(llvm::errs()); - polly::createDeLICMWrapperPass(); - polly::createDeLICMPrinterLegacyPass(llvm::outs()); - polly::createDumpModuleWrapperPass("", true); - polly::createDumpFunctionWrapperPass(""); - polly::createSimplifyWrapperPass(0); - polly::createSimplifyPrinterLegacyPass(llvm::outs()); - polly::createPruneUnprofitableWrapperPass(); - } -} PollyForcePassLinking; // Force link by creating a global definition. 
-} // namespace - -namespace llvm { -void initializeCodePreparationPass(llvm::PassRegistry &); -void initializeScopInlinerWrapperPassPass(llvm::PassRegistry &); -void initializeScopDetectionWrapperPassPass(llvm::PassRegistry &); -void initializeScopDetectionPrinterLegacyPassPass(llvm::PassRegistry &); -void initializeScopInfoRegionPassPass(PassRegistry &); -void initializeScopInfoPrinterLegacyRegionPassPass(llvm::PassRegistry &); -void initializeScopInfoWrapperPassPass(PassRegistry &); -void initializeScopInfoPrinterLegacyFunctionPassPass(PassRegistry &); -void initializeDeadCodeElimWrapperPassPass(llvm::PassRegistry &); -void initializeJSONExporterPass(llvm::PassRegistry &); -void initializeJSONImporterPass(llvm::PassRegistry &); -void initializeJSONImporterPrinterLegacyPassPass(llvm::PassRegistry &); -void initializeDependenceInfoPass(llvm::PassRegistry &); -void initializeDependenceInfoPrinterLegacyPassPass(llvm::PassRegistry &); -void initializeDependenceInfoWrapperPassPass(llvm::PassRegistry &); -void initializeDependenceInfoPrinterLegacyFunctionPassPass( - llvm::PassRegistry &); -void initializeIslAstInfoWrapperPassPass(llvm::PassRegistry &); -void initializeIslAstInfoPrinterLegacyPassPass(llvm::PassRegistry &); -void initializeCodeGenerationPass(llvm::PassRegistry &); -void initializeIslScheduleOptimizerWrapperPassPass(llvm::PassRegistry &); -void initializeIslScheduleOptimizerPrinterLegacyPassPass(llvm::PassRegistry &); -void initializeMaximalStaticExpanderWrapperPassPass(llvm::PassRegistry &); -void initializePollyCanonicalizePass(llvm::PassRegistry &); -void initializeFlattenSchedulePass(llvm::PassRegistry &); -void initializeFlattenSchedulePrinterLegacyPassPass(llvm::PassRegistry &); -void initializeForwardOpTreeWrapperPassPass(llvm::PassRegistry &); -void initializeForwardOpTreePrinterLegacyPassPass(PassRegistry &); -void initializeDeLICMWrapperPassPass(llvm::PassRegistry &); -void initializeDeLICMPrinterLegacyPassPass(llvm::PassRegistry &); -void initializeSimplifyWrapperPassPass(llvm::PassRegistry &); -void initializeSimplifyPrinterLegacyPassPass(llvm::PassRegistry &); -void initializePruneUnprofitableWrapperPassPass(llvm::PassRegistry &); -} // namespace llvm - -#endif diff --git a/polly/include/polly/MaximalStaticExpansion.h b/polly/include/polly/MaximalStaticExpansion.h index 88827b2700887..1f9fbcb1d6a70 100644 --- a/polly/include/polly/MaximalStaticExpansion.h +++ b/polly/include/polly/MaximalStaticExpansion.h @@ -14,6 +14,7 @@ #ifndef POLLY_MAXIMALSTATICEXPANSION_H #define POLLY_MAXIMALSTATICEXPANSION_H +#include "polly/DependenceInfo.h" #include "polly/ScopPass.h" #include "llvm/IR/PassManager.h" @@ -37,6 +38,7 @@ struct MaximalStaticExpansionPrinterPass llvm::raw_ostream &OS; }; +void runMaximalStaticExpansion(Scop &S, DependenceAnalysis::Result &DI); } // namespace polly #endif /* POLLY_MAXIMALSTATICEXPANSION_H */ diff --git a/polly/include/polly/Pass/PhaseManager.h b/polly/include/polly/Pass/PhaseManager.h new file mode 100644 index 0000000000000..9ff9bbf02d71f --- /dev/null +++ b/polly/include/polly/Pass/PhaseManager.h @@ -0,0 +1,127 @@ +//===------ PhaseManager.h --------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Implements the sequence of operations on SCoPs, called phases. It is itself
+// not a pass in either pass manager, but is used from PollyFunctionPass or
+// PollyModulePass.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef POLLY_PASS_PHASEMANAGER_H_
+#define POLLY_PASS_PHASEMANAGER_H_
+
+#include "polly/DependenceInfo.h"
+#include "llvm/ADT/Bitset.h"
+#include <stddef.h>
+
+namespace llvm {
+class Function;
+class Error;
+} // namespace llvm
+
+namespace polly {
+
+/// Phases (in execution order) within the Polly pass.
+enum class PassPhase {
+  None,
+
+  Prepare,
+
+  Detection,
+  PrintDetect,
+  DotScops,
+  DotScopsOnly,
+  ViewScops,
+  ViewScopsOnly,
+
+  ScopInfo,
+  PrintScopInfo,
+
+  Flatten,
+
+  Dependences,
+  PrintDependences,
+
+  ImportJScop,
+  Simplify0,
+  Optree,
+  DeLICM,
+  Simplify1,
+  DeadCodeElimination,
+  MaximumStaticExtension,
+  PruneUnprofitable,
+  Optimization,
+  ExportJScop,
+  AstGen,
+  CodeGen,
+
+  PassPhaseFirst = Prepare,
+  PassPhaseLast = CodeGen
+};
+
+StringRef getPhaseName(PassPhase Phase);
+PassPhase parsePhase(StringRef Name);
+bool dependsOnDependenceInfo(PassPhase Phase);
+
+/// Options for the Polly pass.
+class PollyPassOptions {
+  /// For each Polly phase, whether it should be executed.
+  /// Since PassPhase::None is unused, bit positions are shifted by one.
+  llvm::Bitset<static_cast<size_t>(PassPhase::PassPhaseLast) -
+               static_cast<size_t>(PassPhase::PassPhaseFirst) + 1>
+      PhaseEnabled;
+
+public:
+  bool ViewAll = false;
+  std::string ViewFilter;
+  Dependences::AnalysisLevel PrintDepsAnalysisLevel = Dependences::AL_Statement;
+
+  bool isPhaseEnabled(PassPhase Phase) const {
+    assert(Phase != PassPhase::None);
+    unsigned BitPos = static_cast<size_t>(Phase) -
+                      static_cast<size_t>(PassPhase::PassPhaseFirst);
+    return PhaseEnabled[BitPos];
+  }
+
+  void setPhaseEnabled(PassPhase Phase, bool Enabled = true) {
+    assert(Phase != PassPhase::None);
+    unsigned BitPos = static_cast<size_t>(Phase) -
+                      static_cast<size_t>(PassPhase::PassPhaseFirst);
+    if (Enabled)
+      PhaseEnabled.set(BitPos);
+    else
+      PhaseEnabled.reset(BitPos);
+  }
+
+  /// Enable all phases that are necessary for a roundtrip from LLVM-IR back to
+  /// LLVM-IR.
+  void enableEnd2End();
+
+  /// Enable the default optimization phases.
+  void enableDefaultOpts();
+
+  /// Disable all phases following \p Phase.
+  /// Useful when regression-testing a particular phase and everything after it
+  /// is of no interest.
+  void disableAfter(PassPhase Phase);
+
+  /// Check whether the options are mutually consistent.
+  llvm::Error checkConsistency() const;
+};
+
+/// Run Polly and its phases on \p F.
+bool runPollyPass(Function &F, llvm::FunctionAnalysisManager &FAM,
+                  PollyPassOptions Opts);
+} // namespace polly
+
+/// Make llvm::enum_seq<PassPhase> work.
+template <> struct llvm::enum_iteration_traits<polly::PassPhase> { + static constexpr bool is_iterable = true; +}; + +#endif /* POLLY_PASS_PHASEMANAGER_H_ */ diff --git a/polly/include/polly/Pass/PollyFunctionPass.h b/polly/include/polly/Pass/PollyFunctionPass.h new file mode 100644 index 0000000000000..dd0d4e77d7a80 --- /dev/null +++ b/polly/include/polly/Pass/PollyFunctionPass.h @@ -0,0 +1,32 @@ +//===------ PollyFunctionPass.h - Polly function pass ---------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef POLLY_PASS_POLLYFUNCTIONPASS_H_ +#define POLLY_PASS_POLLYFUNCTIONPASS_H_ + +#include "polly/Pass/PhaseManager.h" +#include "llvm/IR/Analysis.h" +#include "llvm/IR/PassManager.h" +#include <utility> + +namespace polly { + +class PollyFunctionPass : public llvm::PassInfoMixin<PollyFunctionPass> { +public: + PollyFunctionPass() {} + PollyFunctionPass(PollyPassOptions Opts) : Opts(std::move(Opts)) {} + + llvm::PreservedAnalyses run(llvm::Function &F, + llvm::FunctionAnalysisManager &); + +private: + PollyPassOptions Opts; +}; +} // namespace polly + +#endif /* POLLY_PASS_POLLYFUNCTIONPASS_H_ */ diff --git a/polly/include/polly/Pass/PollyModulePass.h b/polly/include/polly/Pass/PollyModulePass.h new file mode 100644 index 0000000000000..2214bbf3d143e --- /dev/null +++ b/polly/include/polly/Pass/PollyModulePass.h @@ -0,0 +1,30 @@ +//===------ PollyModulePass.h - Polly module pass -------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef POLLY_PASS_POLLYMODULEPASS_H_ +#define POLLY_PASS_POLLYMODULEPASS_H_ + +#include "polly/Pass/PhaseManager.h" +#include "llvm/IR/PassManager.h" + +namespace polly { + +class PollyModulePass : public llvm::PassInfoMixin<PollyModulePass> { +public: + PollyModulePass() {} + PollyModulePass(PollyPassOptions Opts) : Opts(std::move(Opts)) {} + + llvm::PreservedAnalyses run(llvm::Module &M, llvm::ModuleAnalysisManager &); + +private: + PollyPassOptions Opts; +}; + +} // namespace polly + +#endif /* POLLY_PASS_POLLYMODULEPASS_H_ */ diff --git a/polly/include/polly/PruneUnprofitable.h b/polly/include/polly/PruneUnprofitable.h index 2d285cce69ad4..16b76cc62f1d2 100644 --- a/polly/include/polly/PruneUnprofitable.h +++ b/polly/include/polly/PruneUnprofitable.h @@ -15,13 +15,7 @@ #include "polly/ScopPass.h" -namespace llvm { -class Pass; -class PassRegistry; -} // namespace llvm - namespace polly { -llvm::Pass *createPruneUnprofitableWrapperPass(); struct PruneUnprofitablePass final : llvm::PassInfoMixin<PruneUnprofitablePass> { @@ -30,10 +24,8 @@ struct PruneUnprofitablePass final llvm::PreservedAnalyses run(Scop &S, ScopAnalysisManager &SAM, ScopStandardAnalysisResults &SAR, SPMUpdater &U); }; -} // namespace polly -namespace llvm { -void initializePruneUnprofitableWrapperPassPass(PassRegistry &); -} +bool runPruneUnprofitable(Scop &S); +} // namespace polly #endif // POLLY_PRUNEUNPROFITABLE_H diff --git a/polly/include/polly/RegisterPasses.h b/polly/include/polly/RegisterPasses.h index 3a81e1ba7487d..7819462cb0c36 100644 --- a/polly/include/polly/RegisterPasses.h +++ b/polly/include/polly/RegisterPasses.h @@ -14,7 +14,6 @@ #define POLLY_REGISTER_PASSES_H namespace llvm { -class PassRegistry; class PassBuilder; struct PassPluginLibraryInfo; namespace legacy { @@ -23,7 +22,6 @@ class PassManagerBase; } // namespace llvm namespace polly { -void initializePollyPasses(llvm::PassRegistry &Registry); void registerPollyPasses(llvm::PassBuilder &PB); } // namespace polly diff --git a/polly/include/polly/ScheduleOptimizer.h b/polly/include/polly/ScheduleOptimizer.h index 3e17eeff49ae3..ac45572ba7ed5 100644 --- a/polly/include/polly/ScheduleOptimizer.h +++ b/polly/include/polly/ScheduleOptimizer.h @@ -9,16 +9,10 @@ #ifndef POLLY_SCHEDULEOPTIMIZER_H #define POLLY_SCHEDULEOPTIMIZER_H +#include "polly/DependenceInfo.h" #include "polly/ScopPass.h" -namespace llvm { -class Pass; -class PassRegistry; -} // namespace llvm - namespace polly { -llvm::Pass *createIslScheduleOptimizerWrapperPass(); -llvm::Pass *createIslScheduleOptimizerPrinterLegacyPass(llvm::raw_ostream &OS); struct IslScheduleOptimizerPass final : llvm::PassInfoMixin<IslScheduleOptimizerPass> { @@ -38,11 +32,9 @@ struct IslScheduleOptimizerPrinterPass final private: llvm::raw_ostream &OS; }; -} // namespace polly -namespace llvm { -void initializeIslScheduleOptimizerWrapperPassPass(llvm::PassRegistry &); -void initializeIslScheduleOptimizerPrinterLegacyPassPass(llvm::PassRegistry &); -} // namespace llvm +void runIslScheduleOptimizer(Scop &S, llvm::TargetTransformInfo *TTI, + DependenceAnalysis::Result &Deps); +} // namespace polly #endif // POLLY_SCHEDULEOPTIMIZER_H diff --git a/polly/include/polly/ScopDetection.h b/polly/include/polly/ScopDetection.h index 5759f75463284..ded1c88206430 100644 --- a/polly/include/polly/ScopDetection.h +++ b/polly/include/polly/ScopDetection.h @@ -52,7 +52,6 @@ 
#include "llvm/Analysis/AliasSetTracker.h" #include "llvm/Analysis/RegionInfo.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" -#include "llvm/Pass.h" #include <set> namespace polly { @@ -68,7 +67,6 @@ using llvm::DenseMap; using llvm::DominatorTree; using llvm::Function; using llvm::FunctionAnalysisManager; -using llvm::FunctionPass; using llvm::IntrinsicInst; using llvm::LoopInfo; using llvm::Module; @@ -631,31 +629,6 @@ struct ScopAnalysisPrinterPass final : PassInfoMixin<ScopAnalysisPrinterPass> { raw_ostream &OS; }; - -class ScopDetectionWrapperPass final : public FunctionPass { - std::unique_ptr<ScopDetection> Result; - -public: - ScopDetectionWrapperPass(); - - /// @name FunctionPass interface - ///@{ - static char ID; - void getAnalysisUsage(AnalysisUsage &AU) const override; - void releaseMemory() override; - bool runOnFunction(Function &F) override; - void print(raw_ostream &OS, const Module *M = nullptr) const override; - ///@} - - ScopDetection &getSD() const { return *Result; } -}; - -llvm::Pass *createScopDetectionPrinterLegacyPass(llvm::raw_ostream &OS); } // namespace polly -namespace llvm { -void initializeScopDetectionWrapperPassPass(llvm::PassRegistry &); -void initializeScopDetectionPrinterLegacyPassPass(llvm::PassRegistry &); -} // namespace llvm - #endif // POLLY_SCOPDETECTION_H diff --git a/polly/include/polly/ScopGraphPrinter.h b/polly/include/polly/ScopGraphPrinter.h index b57732ad3d70d..c4e669f0c3503 100644 --- a/polly/include/polly/ScopGraphPrinter.h +++ b/polly/include/polly/ScopGraphPrinter.h @@ -70,6 +70,9 @@ struct DOTGraphTraits<polly::ScopDetection *> : DOTGraphTraits<RegionNode *> { namespace polly { +extern std::string ViewFilter; +extern bool ViewAll; + struct ScopViewer final : llvm::DOTGraphTraitsViewer<ScopAnalysis, false> { ScopViewer() : llvm::DOTGraphTraitsViewer<ScopAnalysis, false>("scops") {} diff --git a/polly/include/polly/ScopInfo.h b/polly/include/polly/ScopInfo.h index f700144165d53..7541ddc21e39f 100644 --- a/polly/include/polly/ScopInfo.h +++ b/polly/include/polly/ScopInfo.h @@ -23,13 +23,11 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/MapVector.h" #include "llvm/ADT/SetVector.h" -#include "llvm/Analysis/RegionPass.h" #include "llvm/IR/DebugLoc.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/PassManager.h" #include "llvm/IR/ValueHandle.h" -#include "llvm/Pass.h" #include "isl/isl-noexceptions.h" #include <cassert> #include <cstddef> @@ -55,8 +53,6 @@ using llvm::MemIntrinsic; using llvm::PassInfoMixin; using llvm::PHINode; using llvm::RegionNode; -using llvm::RegionPass; -using llvm::RGPassManager; using llvm::SetVector; using llvm::SmallPtrSetImpl; using llvm::SmallVector; @@ -2674,39 +2670,6 @@ class Scop final { /// Print Scop scop to raw_ostream OS. raw_ostream &operator<<(raw_ostream &OS, const Scop &scop); -/// The legacy pass manager's analysis pass to compute scop information -/// for a region. -class ScopInfoRegionPass final : public RegionPass { - /// The Scop pointer which is used to construct a Scop. - std::unique_ptr<Scop> S; - -public: - static char ID; // Pass identification, replacement for typeid - - ScopInfoRegionPass() : RegionPass(ID) {} - ~ScopInfoRegionPass() override = default; - - /// Build Scop object, the Polly IR of static control - /// part for the current SESE-Region. - /// - /// @return If the current region is a valid for a static control part, - /// return the Polly IR representing this static control part, - /// return null otherwise. 
- Scop *getScop() { return S.get(); } - const Scop *getScop() const { return S.get(); } - - /// Calculate the polyhedral scop information for a given Region. - bool runOnRegion(Region *R, RGPassManager &RGM) override; - - void releaseMemory() override { S.reset(); } - - void print(raw_ostream &O, const Module *M = nullptr) const override; - - void getAnalysisUsage(AnalysisUsage &AU) const override; -}; - -llvm::Pass *createScopInfoPrinterLegacyRegionPass(raw_ostream &OS); - class ScopInfo { public: using RegionToScopMapTy = MapVector<Region *, std::unique_ptr<Scop>>; @@ -2781,45 +2744,6 @@ struct ScopInfoPrinterPass final : PassInfoMixin<ScopInfoPrinterPass> { raw_ostream &Stream; }; - -//===----------------------------------------------------------------------===// -/// The legacy pass manager's analysis pass to compute scop information -/// for the whole function. -/// -/// This pass will maintain a map of the maximal region within a scop to its -/// scop object for all the feasible scops present in a function. -/// This pass is an alternative to the ScopInfoRegionPass in order to avoid a -/// region pass manager. -class ScopInfoWrapperPass final : public FunctionPass { - std::unique_ptr<ScopInfo> Result; - -public: - ScopInfoWrapperPass() : FunctionPass(ID) {} - ~ScopInfoWrapperPass() override = default; - - static char ID; // Pass identification, replacement for typeid - - ScopInfo *getSI() { return Result.get(); } - const ScopInfo *getSI() const { return Result.get(); } - - /// Calculate all the polyhedral scops for a given function. - bool runOnFunction(Function &F) override; - - void releaseMemory() override { Result.reset(); } - - void print(raw_ostream &O, const Module *M = nullptr) const override; - - void getAnalysisUsage(AnalysisUsage &AU) const override; -}; - -llvm::Pass *createScopInfoPrinterLegacyFunctionPass(llvm::raw_ostream &OS); } // end namespace polly -namespace llvm { -void initializeScopInfoRegionPassPass(PassRegistry &); -void initializeScopInfoPrinterLegacyRegionPassPass(PassRegistry &); -void initializeScopInfoWrapperPassPass(PassRegistry &); -void initializeScopInfoPrinterLegacyFunctionPassPass(PassRegistry &); -} // end namespace llvm - #endif // POLLY_SCOPINFO_H diff --git a/polly/include/polly/ScopInliner.h b/polly/include/polly/ScopInliner.h index 014667804330f..ae1938f03ac70 100644 --- a/polly/include/polly/ScopInliner.h +++ b/polly/include/polly/ScopInliner.h @@ -23,12 +23,6 @@ class ScopInlinerPass : public llvm::PassInfoMixin<ScopInlinerPass> { llvm::LazyCallGraph &CG, llvm::CGSCCUpdateResult &UR); }; - -llvm::Pass *createScopInlinerWrapperPass(); } // namespace polly -namespace llvm { -void initializeScopInlinerWrapperPassPass(llvm::PassRegistry &); -} - #endif /* POLLY_POLLYINLINER_H */ diff --git a/polly/include/polly/ScopPass.h b/polly/include/polly/ScopPass.h index 144cfd1364393..80ccd5717f96c 100644 --- a/polly/include/polly/ScopPass.h +++ b/polly/include/polly/ScopPass.h @@ -19,7 +19,6 @@ #include "polly/ScopInfo.h" #include "llvm/ADT/PriorityWorklist.h" -#include "llvm/Analysis/RegionPass.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/PassManager.h" #include "llvm/IR/PassManagerImpl.h" @@ -155,33 +154,6 @@ using ScopPassManager = PassManager<Scop, ScopAnalysisManager, ScopStandardAnalysisResults &, SPMUpdater &>; -/// ScopPass - This class adapts the RegionPass interface to allow convenient -/// creation of passes that operate on the Polly IR. Instead of overriding -/// runOnRegion, subclasses override runOnScop. 
-class ScopPass : public RegionPass { - Scop *S; - -protected: - explicit ScopPass(char &ID) : RegionPass(ID), S(nullptr) {} - - /// runOnScop - This method must be overloaded to perform the - /// desired Polyhedral transformation or analysis. - /// - virtual bool runOnScop(Scop &S) = 0; - - /// Print method for SCoPs. - virtual void printScop(raw_ostream &OS, Scop &S) const {} - - /// getAnalysisUsage - Subclasses that override getAnalysisUsage - /// must call this. - /// - void getAnalysisUsage(AnalysisUsage &AU) const override; - -private: - bool runOnRegion(Region *R, RGPassManager &RGM) override; - void print(raw_ostream &OS, const Module *) const override; -}; - struct ScopStandardAnalysisResults { DominatorTree &DT; ScopInfo &SI; diff --git a/polly/include/polly/Simplify.h b/polly/include/polly/Simplify.h index b2aa58d850fae..4565eb26edaf0 100644 --- a/polly/include/polly/Simplify.h +++ b/polly/include/polly/Simplify.h @@ -16,11 +16,6 @@ #include "polly/ScopPass.h" #include "llvm/ADT/SmallVector.h" -namespace llvm { -class PassRegistry; -class Pass; -} // namespace llvm - namespace polly { class MemoryAccess; class ScopStmt; @@ -41,17 +36,6 @@ class ScopStmt; /// undefined. llvm::SmallVector<MemoryAccess *, 32> getAccessesInOrder(ScopStmt &Stmt); -/// Create a Simplify pass -/// -/// @param CallNo Disambiguates this instance for when there are multiple -/// instances of this pass in the pass manager. It is used only to -/// keep the statistics apart and has no influence on the -/// simplification itself. -/// -/// @return The Simplify pass. -llvm::Pass *createSimplifyWrapperPass(int CallNo = 0); -llvm::Pass *createSimplifyPrinterLegacyPass(llvm::raw_ostream &OS); - struct SimplifyPass final : PassInfoMixin<SimplifyPass> { SimplifyPass(int CallNo = 0) : CallNo(CallNo) {} @@ -73,11 +57,8 @@ struct SimplifyPrinterPass final : PassInfoMixin<SimplifyPrinterPass> { raw_ostream &OS; int CallNo; }; -} // namespace polly -namespace llvm { -void initializeSimplifyWrapperPassPass(llvm::PassRegistry &); -void initializeSimplifyPrinterLegacyPassPass(llvm::PassRegistry &); -} // namespace llvm +bool runSimplify(Scop &S, int CallNo); +} // namespace polly #endif /* POLLY_TRANSFORM_SIMPLIFY_H */ diff --git a/polly/include/polly/Support/DumpFunctionPass.h b/polly/include/polly/Support/DumpFunctionPass.h index e5c16203adb8f..af04912ed4fe2 100644 --- a/polly/include/polly/Support/DumpFunctionPass.h +++ b/polly/include/polly/Support/DumpFunctionPass.h @@ -16,13 +16,7 @@ #include "llvm/IR/PassManager.h" #include <string> -namespace llvm { -class FunctionPass; -class ModulePass; -} // namespace llvm - namespace polly { -llvm::FunctionPass *createDumpFunctionWrapperPass(std::string Suffix); /// A pass that isolates a function into a new Module and writes it into a file. 
struct DumpFunctionPass final : llvm::PassInfoMixin<DumpFunctionPass> { @@ -33,12 +27,6 @@ struct DumpFunctionPass final : llvm::PassInfoMixin<DumpFunctionPass> { llvm::PreservedAnalyses run(llvm::Function &F, llvm::FunctionAnalysisManager &AM); }; - } // namespace polly -namespace llvm { -class PassRegistry; -void initializeDumpFunctionWrapperPassPass(llvm::PassRegistry &); -} // namespace llvm - #endif /* POLLY_SUPPORT_DUMPFUNCTIONPASS_H */ diff --git a/polly/include/polly/Support/DumpModulePass.h b/polly/include/polly/Support/DumpModulePass.h index c90bbc2484310..6d393a174b19b 100644 --- a/polly/include/polly/Support/DumpModulePass.h +++ b/polly/include/polly/Support/DumpModulePass.h @@ -16,12 +16,8 @@ #include "llvm/IR/PassManager.h" #include <string> -namespace llvm { -class ModulePass; -} // namespace llvm - namespace polly { -/// Create a pass that prints the module into a file. +/// A pass that prints the module into a file. /// /// The meaning of @p Filename depends on @p IsSuffix. If IsSuffix==false, then /// the module is written to the @p Filename. If it is true, the filename is @@ -30,10 +26,6 @@ namespace polly { /// The intent of IsSuffix is to avoid the file being overwritten when /// processing multiple modules and/or with multiple dump passes in the /// pipeline. -llvm::ModulePass *createDumpModuleWrapperPass(std::string Filename, - bool IsSuffix); - -/// A pass that prints the module into a file. struct DumpModulePass final : llvm::PassInfoMixin<DumpModulePass> { std::string Filename; bool IsSuffix; @@ -46,9 +38,4 @@ struct DumpModulePass final : llvm::PassInfoMixin<DumpModulePass> { } // namespace polly -namespace llvm { -class PassRegistry; -void initializeDumpModuleWrapperPassPass(llvm::PassRegistry &); -} // namespace llvm - #endif /* POLLY_SUPPORT_DUMPMODULEPASS_H */ diff --git a/polly/include/polly/Support/ScopHelper.h b/polly/include/polly/Support/ScopHelper.h index 75891525ff7b3..38b731a9f7d8d 100644 --- a/polly/include/polly/Support/ScopHelper.h +++ b/polly/include/polly/Support/ScopHelper.h @@ -358,14 +358,6 @@ namespace polly { void simplifyRegion(llvm::Region *R, llvm::DominatorTree *DT, llvm::LoopInfo *LI, llvm::RegionInfo *RI); -/// Split the entry block of a function to store the newly inserted -/// allocations outside of all Scops. -/// -/// @param EntryBlock The entry block of the current function. -/// @param P The pass that currently running. -/// -void splitEntryBlockForAlloca(llvm::BasicBlock *EntryBlock, llvm::Pass *P); - /// Split the entry block of a function to store the newly inserted /// allocations outside of all Scops. 
/// diff --git a/polly/lib/Analysis/DependenceInfo.cpp b/polly/lib/Analysis/DependenceInfo.cpp index c620f40ad0724..5183fc5725ece 100644 --- a/polly/lib/Analysis/DependenceInfo.cpp +++ b/polly/lib/Analysis/DependenceInfo.cpp @@ -20,7 +20,6 @@ //===----------------------------------------------------------------------===// // #include "polly/DependenceInfo.h" -#include "polly/LinkAllPasses.h" #include "polly/Options.h" #include "polly/ScopInfo.h" #include "polly/Support/GICHelper.h" @@ -42,6 +41,10 @@ using namespace llvm; #include "polly/Support/PollyDebug.h" #define DEBUG_TYPE "polly-dependence" +namespace polly { +Dependences::AnalysisLevel OptAnalysisLevel; +} + static cl::opt<int> OptComputeOut( "polly-dependences-computeout", cl::desc("Bound the dependence analysis by a maximal amount of " @@ -69,9 +72,10 @@ static cl::opt<enum AnalysisType> OptAnalysisType( "Overapproximation of dependences")), cl::Hidden, cl::init(VALUE_BASED_ANALYSIS), cl::cat(PollyCategory)); -static cl::opt<Dependences::AnalysisLevel> OptAnalysisLevel( +static cl::opt<Dependences::AnalysisLevel, true> XOptAnalysisLevel( "polly-dependences-analysis-level", cl::desc("The level of dependence analysis"), + cl::location(OptAnalysisLevel), cl::values(clEnumValN(Dependences::AL_Statement, "statement-wise", "Statement-level analysis"), clEnumValN(Dependences::AL_Reference, "reference-wise", @@ -881,213 +885,7 @@ DependenceInfoPrinterPass::run(Scop &S, ScopAnalysisManager &SAM, return PreservedAnalyses::all(); } -const Dependences & -DependenceInfo::getDependences(Dependences::AnalysisLevel Level) { - if (Dependences *d = D[Level].get()) - return *d; - - return recomputeDependences(Level); -} - -const Dependences & -DependenceInfo::recomputeDependences(Dependences::AnalysisLevel Level) { - D[Level].reset(new Dependences(S->getSharedIslCtx(), Level)); - D[Level]->calculateDependences(*S); - return *D[Level]; -} - -void DependenceInfo::abandonDependences() { - for (std::unique_ptr<Dependences> &Deps : D) - Deps.release(); -} - -bool DependenceInfo::runOnScop(Scop &ScopVar) { - S = &ScopVar; - return false; -} - -/// Print the dependences for the given SCoP to @p OS. - -void polly::DependenceInfo::printScop(raw_ostream &OS, Scop &S) const { - if (auto d = D[OptAnalysisLevel].get()) { - d->print(OS); - return; - } - - // Otherwise create the dependences on-the-fly and print it - Dependences D(S.getSharedIslCtx(), OptAnalysisLevel); - D.calculateDependences(S); - D.print(OS); -} - -void DependenceInfo::getAnalysisUsage(AnalysisUsage &AU) const { - AU.addRequiredTransitive<ScopInfoRegionPass>(); - AU.setPreservesAll(); -} - -char DependenceInfo::ID = 0; - -Pass *polly::createDependenceInfoPass() { return new DependenceInfo(); } - -INITIALIZE_PASS_BEGIN(DependenceInfo, "polly-dependences", - "Polly - Calculate dependences", false, false); -INITIALIZE_PASS_DEPENDENCY(ScopInfoRegionPass); -INITIALIZE_PASS_END(DependenceInfo, "polly-dependences", - "Polly - Calculate dependences", false, false) - -//===----------------------------------------------------------------------===// - -namespace { -/// Print result from DependenceAnalysis. 
-class DependenceInfoPrinterLegacyPass final : public ScopPass { -public: - static char ID; - - DependenceInfoPrinterLegacyPass() : DependenceInfoPrinterLegacyPass(outs()) {} - - explicit DependenceInfoPrinterLegacyPass(llvm::raw_ostream &OS) - : ScopPass(ID), OS(OS) {} - - bool runOnScop(Scop &S) override { - DependenceInfo &P = getAnalysis<DependenceInfo>(); - - OS << "Printing analysis '" << P.getPassName() << "' for " - << "region: '" << S.getRegion().getNameStr() << "' in function '" - << S.getFunction().getName() << "':\n"; - P.printScop(OS, S); - - return false; - } - - void getAnalysisUsage(AnalysisUsage &AU) const override { - ScopPass::getAnalysisUsage(AU); - AU.addRequired<DependenceInfo>(); - AU.setPreservesAll(); - } - -private: - llvm::raw_ostream &OS; -}; - -char DependenceInfoPrinterLegacyPass::ID = 0; -} // namespace - -Pass *polly::createDependenceInfoPrinterLegacyPass(raw_ostream &OS) { - return new DependenceInfoPrinterLegacyPass(OS); -} - -INITIALIZE_PASS_BEGIN(DependenceInfoPrinterLegacyPass, - "polly-print-dependences", "Polly - Print dependences", - false, false); -INITIALIZE_PASS_DEPENDENCY(DependenceInfo); -INITIALIZE_PASS_END(DependenceInfoPrinterLegacyPass, "polly-print-dependences", - "Polly - Print dependences", false, false) - -//===----------------------------------------------------------------------===// - -const Dependences & -DependenceInfoWrapperPass::getDependences(Scop *S, - Dependences::AnalysisLevel Level) { - auto It = ScopToDepsMap.find(S); - if (It != ScopToDepsMap.end()) - if (It->second) { - if (It->second->getDependenceLevel() == Level) - return *It->second; - } - return recomputeDependences(S, Level); -} - -const Dependences &DependenceInfoWrapperPass::recomputeDependences( - Scop *S, Dependences::AnalysisLevel Level) { - std::unique_ptr<Dependences> D(new Dependences(S->getSharedIslCtx(), Level)); - D->calculateDependences(*S); - auto Inserted = ScopToDepsMap.insert(std::make_pair(S, std::move(D))); - return *Inserted.first->second; +DependenceAnalysis::Result polly::runDependenceAnalysis(Scop &S) { + DependenceAnalysis::Result Result{S, {}}; + return Result; } - -bool DependenceInfoWrapperPass::runOnFunction(Function &F) { - auto &SI = *getAnalysis<ScopInfoWrapperPass>().getSI(); - for (auto &It : SI) { - assert(It.second && "Invalid SCoP object!"); - recomputeDependences(It.second.get(), Dependences::AL_Access); - } - return false; -} - -void DependenceInfoWrapperPass::print(raw_ostream &OS, const Module *M) const { - for (auto &It : ScopToDepsMap) { - assert((It.first && It.second) && "Invalid Scop or Dependence object!\n"); - It.second->print(OS); - } -} - -void DependenceInfoWrapperPass::getAnalysisUsage(AnalysisUsage &AU) const { - AU.addRequiredTransitive<ScopInfoWrapperPass>(); - AU.setPreservesAll(); -} - -char DependenceInfoWrapperPass::ID = 0; - -Pass *polly::createDependenceInfoWrapperPassPass() { - return new DependenceInfoWrapperPass(); -} - -INITIALIZE_PASS_BEGIN( - DependenceInfoWrapperPass, "polly-function-dependences", - "Polly - Calculate dependences for all the SCoPs of a function", false, - false) -INITIALIZE_PASS_DEPENDENCY(ScopInfoWrapperPass); -INITIALIZE_PASS_END( - DependenceInfoWrapperPass, "polly-function-dependences", - "Polly - Calculate dependences for all the SCoPs of a function", false, - false) - -//===----------------------------------------------------------------------===// - -namespace { -/// Print result from DependenceInfoWrapperPass. 
-class DependenceInfoPrinterLegacyFunctionPass final : public FunctionPass { -public: - static char ID; - - DependenceInfoPrinterLegacyFunctionPass() - : DependenceInfoPrinterLegacyFunctionPass(outs()) {} - - explicit DependenceInfoPrinterLegacyFunctionPass(llvm::raw_ostream &OS) - : FunctionPass(ID), OS(OS) {} - - bool runOnFunction(Function &F) override { - DependenceInfoWrapperPass &P = getAnalysis<DependenceInfoWrapperPass>(); - - OS << "Printing analysis '" << P.getPassName() << "' for function '" - << F.getName() << "':\n"; - P.print(OS); - - return false; - } - - void getAnalysisUsage(AnalysisUsage &AU) const override { - FunctionPass::getAnalysisUsage(AU); - AU.addRequired<DependenceInfoWrapperPass>(); - AU.setPreservesAll(); - } - -private: - llvm::raw_ostream &OS; -}; - -char DependenceInfoPrinterLegacyFunctionPass::ID = 0; -} // namespace - -Pass *polly::createDependenceInfoPrinterLegacyFunctionPass(raw_ostream &OS) { - return new DependenceInfoPrinterLegacyFunctionPass(OS); -} - -INITIALIZE_PASS_BEGIN( - DependenceInfoPrinterLegacyFunctionPass, "polly-print-function-dependences", - "Polly - Print dependences for all the SCoPs of a function", false, false); -INITIALIZE_PASS_DEPENDENCY(DependenceInfoWrapperPass); -INITIALIZE_PASS_END(DependenceInfoPrinterLegacyFunctionPass, - "polly-print-function-dependences", - "Polly - Print dependences for all the SCoPs of a function", - false, false) diff --git a/polly/lib/Analysis/PruneUnprofitable.cpp b/polly/lib/Analysis/PruneUnprofitable.cpp index f8469c03fe55b..40cc9178da0f3 100644 --- a/polly/lib/Analysis/PruneUnprofitable.cpp +++ b/polly/lib/Analysis/PruneUnprofitable.cpp @@ -55,8 +55,9 @@ static void updateStatistics(Scop &S, bool Pruned) { NumAffineLoops += ScopStats.NumAffineLoops; } } +} // namespace -static bool runPruneUnprofitable(Scop &S) { +bool polly::runPruneUnprofitable(Scop &S) { if (PollyProcessUnprofitable) { POLLY_DEBUG( dbgs() << "NOTE: -polly-process-unprofitable active, won't prune " @@ -79,35 +80,6 @@ static bool runPruneUnprofitable(Scop &S) { return false; } -class PruneUnprofitableWrapperPass final : public ScopPass { -public: - static char ID; - - explicit PruneUnprofitableWrapperPass() : ScopPass(ID) {} - PruneUnprofitableWrapperPass(const PruneUnprofitableWrapperPass &) = delete; - PruneUnprofitableWrapperPass & - operator=(const PruneUnprofitableWrapperPass &) = delete; - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired<ScopInfoRegionPass>(); - AU.setPreservesAll(); - } - - bool runOnScop(Scop &S) override { return runPruneUnprofitable(S); } -}; -} // namespace - -char PruneUnprofitableWrapperPass::ID; - -Pass *polly::createPruneUnprofitableWrapperPass() { - return new PruneUnprofitableWrapperPass(); -} - -INITIALIZE_PASS_BEGIN(PruneUnprofitableWrapperPass, "polly-prune-unprofitable", - "Polly - Prune unprofitable SCoPs", false, false) -INITIALIZE_PASS_END(PruneUnprofitableWrapperPass, "polly-prune-unprofitable", - "Polly - Prune unprofitable SCoPs", false, false) - llvm::PreservedAnalyses PruneUnprofitablePass::run(Scop &S, ScopAnalysisManager &SAM, ScopStandardAnalysisResults &SAR, SPMUpdater &U) { diff --git a/polly/lib/Analysis/ScopBuilder.cpp b/polly/lib/Analysis/ScopBuilder.cpp index 67a4c43455809..60a1e00916750 100644 --- a/polly/lib/Analysis/ScopBuilder.cpp +++ b/polly/lib/Analysis/ScopBuilder.cpp @@ -56,6 +56,7 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" #include <cassert> +#include <deque> using namespace llvm; using namespace polly; 
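With the legacy wrapper passes removed, the transformations above are reached
through the single entry point runPollyPass(), configured by PollyPassOptions
(see PhaseManager.h earlier in this patch). The following is a minimal sketch
of how a standalone driver might invoke it. The analysis-manager boilerplate
is the standard new-pass-manager setup; the driver function itself is
hypothetical and not part of this patch.

#include "polly/Pass/PhaseManager.h"
#include "llvm/Analysis/CGSCCPassManager.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/PassManager.h"
#include "llvm/Passes/PassBuilder.h"
#include "llvm/Support/Error.h"

// Hypothetical driver: run Polly's phase sequence on a single function.
static bool runPollyOnFunction(llvm::Function &F) {
  llvm::LoopAnalysisManager LAM;
  llvm::FunctionAnalysisManager FAM;
  llvm::CGSCCAnalysisManager CGAM;
  llvm::ModuleAnalysisManager MAM;

  // Register the analyses (DominatorTree, LoopInfo, RegionInfo,
  // ScalarEvolution, ...) that the phases query through FAM.
  llvm::PassBuilder PB;
  PB.registerModuleAnalyses(MAM);
  PB.registerCGSCCAnalyses(CGAM);
  PB.registerFunctionAnalyses(FAM);
  PB.registerLoopAnalyses(LAM);
  PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);

  // Enable the end-to-end phases plus the default optimizations; let
  // checkConsistency() reject incoherent phase combinations up front.
  polly::PollyPassOptions Opts;
  Opts.enableEnd2End();
  Opts.enableDefaultOpts();
  if (llvm::Error E = Opts.checkConsistency())
    llvm::report_fatal_error(std::move(E));

  // runPollyPass returns whether the function was modified.
  return polly::runPollyPass(F, FAM, Opts);
}

Compared with the removed initialize*Pass()/create*Pass() machinery, this
keeps the phase ordering in one place and lets invalid combinations fail
early through checkConsistency() instead of at an arbitrary point in the
pipeline.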
diff --git a/polly/lib/Analysis/ScopDetection.cpp b/polly/lib/Analysis/ScopDetection.cpp index 43ed8636b054b..29e89348125f2 100644 --- a/polly/lib/Analysis/ScopDetection.cpp +++ b/polly/lib/Analysis/ScopDetection.cpp @@ -44,7 +44,6 @@ //===----------------------------------------------------------------------===// #include "polly/ScopDetection.h" -#include "polly/LinkAllPasses.h" #include "polly/Options.h" #include "polly/ScopDetectionDiagnostic.h" #include "polly/Support/SCEVValidator.h" @@ -75,8 +74,6 @@ #include "llvm/IR/Module.h" #include "llvm/IR/PassManager.h" #include "llvm/IR/Value.h" -#include "llvm/InitializePasses.h" -#include "llvm/Pass.h" #include "llvm/Support/Debug.h" #include "llvm/Support/Regex.h" #include "llvm/Support/raw_ostream.h" @@ -1983,53 +1980,12 @@ void ScopDetection::verifyAnalysis() { verifyRegion(*R); } -bool ScopDetectionWrapperPass::runOnFunction(Function &F) { - auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); - auto &RI = getAnalysis<RegionInfoPass>().getRegionInfo(); - auto &AA = getAnalysis<AAResultsWrapperPass>().getAAResults(); - auto &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE(); - auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); - auto &ORE = getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); - - Result = std::make_unique<ScopDetection>(DT, SE, LI, RI, AA, ORE); - Result->detect(F); - return false; -} - -void ScopDetectionWrapperPass::getAnalysisUsage(AnalysisUsage &AU) const { - AU.addRequired<LoopInfoWrapperPass>(); - AU.addRequiredTransitive<ScalarEvolutionWrapperPass>(); - AU.addRequired<DominatorTreeWrapperPass>(); - AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); - // We also need AA and RegionInfo when we are verifying analysis. - AU.addRequiredTransitive<AAResultsWrapperPass>(); - AU.addRequiredTransitive<RegionInfoPass>(); - AU.setPreservesAll(); -} - -void ScopDetectionWrapperPass::print(raw_ostream &OS, const Module *) const { - for (const Region *R : Result->ValidRegions) - OS << "Valid Region for Scop: " << R->getNameStr() << '\n'; - - OS << "\n"; -} - -ScopDetectionWrapperPass::ScopDetectionWrapperPass() : FunctionPass(ID) { - // Disable runtime alias checks if we ignore aliasing all together. - if (IgnoreAliasing) - PollyUseRuntimeAliasChecks = false; -} - ScopAnalysis::ScopAnalysis() { // Disable runtime alias checks if we ignore aliasing all together. 
if (IgnoreAliasing) PollyUseRuntimeAliasChecks = false; } -void ScopDetectionWrapperPass::releaseMemory() { Result.reset(); } - -char ScopDetectionWrapperPass::ID; - AnalysisKey ScopAnalysis::Key; ScopDetection ScopAnalysis::run(Function &F, FunctionAnalysisManager &FAM) { @@ -2055,66 +2011,3 @@ PreservedAnalyses ScopAnalysisPrinterPass::run(Function &F, OS << "\n"; return PreservedAnalyses::all(); } - -Pass *polly::createScopDetectionWrapperPassPass() { - return new ScopDetectionWrapperPass(); -} - -INITIALIZE_PASS_BEGIN(ScopDetectionWrapperPass, "polly-detect", - "Polly - Detect static control parts (SCoPs)", false, - false); -INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass); -INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass); -INITIALIZE_PASS_DEPENDENCY(RegionInfoPass); -INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass); -INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass); -INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass); -INITIALIZE_PASS_END(ScopDetectionWrapperPass, "polly-detect", - "Polly - Detect static control parts (SCoPs)", false, false) - -//===----------------------------------------------------------------------===// - -namespace { -/// Print result from ScopDetectionWrapperPass. -class ScopDetectionPrinterLegacyPass final : public FunctionPass { -public: - static char ID; - - ScopDetectionPrinterLegacyPass() : ScopDetectionPrinterLegacyPass(outs()) {} - - explicit ScopDetectionPrinterLegacyPass(llvm::raw_ostream &OS) - : FunctionPass(ID), OS(OS) {} - - bool runOnFunction(Function &F) override { - ScopDetectionWrapperPass &P = getAnalysis<ScopDetectionWrapperPass>(); - - OS << "Printing analysis '" << P.getPassName() << "' for function '" - << F.getName() << "':\n"; - P.print(OS); - - return false; - } - - void getAnalysisUsage(AnalysisUsage &AU) const override { - FunctionPass::getAnalysisUsage(AU); - AU.addRequired<ScopDetectionWrapperPass>(); - AU.setPreservesAll(); - } - -private: - llvm::raw_ostream &OS; -}; - -char ScopDetectionPrinterLegacyPass::ID = 0; -} // namespace - -Pass *polly::createScopDetectionPrinterLegacyPass(raw_ostream &OS) { - return new ScopDetectionPrinterLegacyPass(OS); -} - -INITIALIZE_PASS_BEGIN(ScopDetectionPrinterLegacyPass, "polly-print-detect", - "Polly - Print static control parts (SCoPs)", false, - false); -INITIALIZE_PASS_DEPENDENCY(ScopDetectionWrapperPass); -INITIALIZE_PASS_END(ScopDetectionPrinterLegacyPass, "polly-print-detect", - "Polly - Print static control parts (SCoPs)", false, false) diff --git a/polly/lib/Analysis/ScopGraphPrinter.cpp b/polly/lib/Analysis/ScopGraphPrinter.cpp index eb6c995f0bb91..29e212882cefe 100644 --- a/polly/lib/Analysis/ScopGraphPrinter.cpp +++ b/polly/lib/Analysis/ScopGraphPrinter.cpp @@ -14,20 +14,26 @@ //===----------------------------------------------------------------------===// #include "polly/ScopGraphPrinter.h" -#include "polly/LinkAllPasses.h" #include "polly/ScopDetection.h" #include "llvm/Support/CommandLine.h" using namespace polly; using namespace llvm; -static cl::opt<std::string> - ViewFilter("polly-view-only", - cl::desc("Only view functions that match this pattern"), - cl::Hidden, cl::init("")); -static cl::opt<bool> ViewAll("polly-view-all", - cl::desc("Also show functions without any scops"), - cl::Hidden, cl::init(false)); +namespace polly { +std::string ViewFilter; +bool ViewAll; +} // namespace polly + +static cl::opt<std::string, true> + XViewFilter("polly-view-only", + cl::desc("Only view functions that match this pattern"), + cl::location(ViewFilter), cl::Hidden, 
cl::init("")); + +static cl::opt<bool, true> + XViewAll("polly-view-all", + cl::desc("Also show functions without any scops"), + cl::location(ViewAll), cl::Hidden, cl::init(false)); namespace llvm { @@ -134,104 +140,6 @@ void DOTGraphTraits<ScopDetection *>::addCustomGraphFeatures( } // namespace llvm -struct ScopDetectionAnalysisGraphTraits { - static ScopDetection *getGraph(ScopDetectionWrapperPass *Analysis) { - return &Analysis->getSD(); - } -}; - -struct ScopViewerWrapperPass - : DOTGraphTraitsViewerWrapperPass<ScopDetectionWrapperPass, false, - ScopDetection *, - ScopDetectionAnalysisGraphTraits> { - static char ID; - ScopViewerWrapperPass() - : DOTGraphTraitsViewerWrapperPass<ScopDetectionWrapperPass, false, - ScopDetection *, - ScopDetectionAnalysisGraphTraits>( - "scops", ID) {} - bool processFunction(Function &F, ScopDetectionWrapperPass &SD) override { - if (ViewFilter != "" && !F.getName().count(ViewFilter)) - return false; - - if (ViewAll) - return true; - - // Check that at least one scop was detected. - return std::distance(SD.getSD().begin(), SD.getSD().end()) > 0; - } -}; -char ScopViewerWrapperPass::ID = 0; - -struct ScopOnlyViewerWrapperPass - : DOTGraphTraitsViewerWrapperPass<ScopDetectionWrapperPass, false, - ScopDetection *, - ScopDetectionAnalysisGraphTraits> { - static char ID; - ScopOnlyViewerWrapperPass() - : DOTGraphTraitsViewerWrapperPass<ScopDetectionWrapperPass, false, - ScopDetection *, - ScopDetectionAnalysisGraphTraits>( - "scopsonly", ID) {} -}; -char ScopOnlyViewerWrapperPass::ID = 0; - -struct ScopPrinterWrapperPass - : DOTGraphTraitsPrinterWrapperPass<ScopDetectionWrapperPass, false, - ScopDetection *, - ScopDetectionAnalysisGraphTraits> { - static char ID; - ScopPrinterWrapperPass() - : DOTGraphTraitsPrinterWrapperPass<ScopDetectionWrapperPass, false, - ScopDetection *, - ScopDetectionAnalysisGraphTraits>( - "scops", ID) {} -}; -char ScopPrinterWrapperPass::ID = 0; - -struct ScopOnlyPrinterWrapperPass - : DOTGraphTraitsPrinterWrapperPass<ScopDetectionWrapperPass, true, - ScopDetection *, - ScopDetectionAnalysisGraphTraits> { - static char ID; - ScopOnlyPrinterWrapperPass() - : DOTGraphTraitsPrinterWrapperPass<ScopDetectionWrapperPass, true, - ScopDetection *, - ScopDetectionAnalysisGraphTraits>( - "scopsonly", ID) {} -}; -char ScopOnlyPrinterWrapperPass::ID = 0; - -static RegisterPass<ScopViewerWrapperPass> X("view-scops", - "Polly - View Scops of function"); - -static RegisterPass<ScopOnlyViewerWrapperPass> - Y("view-scops-only", - "Polly - View Scops of function (with no function bodies)"); - -static RegisterPass<ScopPrinterWrapperPass> - M("dot-scops", "Polly - Print Scops of function"); - -static RegisterPass<ScopOnlyPrinterWrapperPass> - N("dot-scops-only", - "Polly - Print Scops of function (with no function bodies)"); - -Pass *polly::createDOTViewerWrapperPass() { - return new ScopViewerWrapperPass(); -} - -Pass *polly::createDOTOnlyViewerWrapperPass() { - return new ScopOnlyViewerWrapperPass(); -} - -Pass *polly::createDOTPrinterWrapperPass() { - return new ScopPrinterWrapperPass(); -} - -Pass *polly::createDOTOnlyPrinterWrapperPass() { - return new ScopOnlyPrinterWrapperPass(); -} - bool ScopViewer::processFunction(Function &F, const ScopDetection &SD) { if (ViewFilter != "" && !F.getName().count(ViewFilter)) return false; diff --git a/polly/lib/Analysis/ScopInfo.cpp b/polly/lib/Analysis/ScopInfo.cpp index 8c6a2360a249b..70e184d3f897f 100644 --- a/polly/lib/Analysis/ScopInfo.cpp +++ b/polly/lib/Analysis/ScopInfo.cpp @@ -17,7 +17,6 @@ 
//===----------------------------------------------------------------------===// #include "polly/ScopInfo.h" -#include "polly/LinkAllPasses.h" #include "polly/Options.h" #include "polly/ScopBuilder.h" #include "polly/ScopDetection.h" @@ -57,7 +56,6 @@ #include "llvm/IR/PassManager.h" #include "llvm/IR/Type.h" #include "llvm/IR/Value.h" -#include "llvm/InitializePasses.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" @@ -2544,19 +2542,6 @@ raw_ostream &polly::operator<<(raw_ostream &OS, const Scop &scop) { return OS; } -//===----------------------------------------------------------------------===// -void ScopInfoRegionPass::getAnalysisUsage(AnalysisUsage &AU) const { - AU.addRequired<LoopInfoWrapperPass>(); - AU.addRequired<RegionInfoPass>(); - AU.addRequired<DominatorTreeWrapperPass>(); - AU.addRequiredTransitive<ScalarEvolutionWrapperPass>(); - AU.addRequiredTransitive<ScopDetectionWrapperPass>(); - AU.addRequired<AAResultsWrapperPass>(); - AU.addRequired<AssumptionCacheTracker>(); - AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); - AU.setPreservesAll(); -} - void updateLoopCountStatistic(ScopDetection::LoopStats Stats, Scop::ScopStatistics ScopStats) { assert(Stats.NumLoops == ScopStats.NumAffineLoops + ScopStats.NumBoxedLoops); @@ -2592,112 +2577,6 @@ void updateLoopCountStatistic(ScopDetection::LoopStats Stats, NumSingletonWritesInLoops += ScopStats.NumSingletonWritesInLoops; } -bool ScopInfoRegionPass::runOnRegion(Region *R, RGPassManager &RGM) { - auto &SD = getAnalysis<ScopDetectionWrapperPass>().getSD(); - - if (!SD.isMaxRegionInScop(*R)) - return false; - - Function *F = R->getEntry()->getParent(); - auto &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE(); - auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); - auto &AA = getAnalysis<AAResultsWrapperPass>().getAAResults(); - auto const &DL = F->getParent()->getDataLayout(); - auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); - auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(*F); - auto &ORE = getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); - - ScopBuilder SB(R, AC, AA, DL, DT, LI, SD, SE, ORE); - S = SB.getScop(); // take ownership of scop object - -#if !defined(NDEBUG) || defined(LLVM_ENABLE_STATS) - if (S) { - ScopDetection::LoopStats Stats = - ScopDetection::countBeneficialLoops(&S->getRegion(), SE, LI, 0); - updateLoopCountStatistic(Stats, S->getStatistics()); - } -#endif - - return false; -} - -void ScopInfoRegionPass::print(raw_ostream &OS, const Module *) const { - if (S) - S->print(OS, PollyPrintInstructions); - else - OS << "Invalid Scop!\n"; -} - -char ScopInfoRegionPass::ID = 0; - -Pass *polly::createScopInfoRegionPassPass() { return new ScopInfoRegionPass(); } - -INITIALIZE_PASS_BEGIN(ScopInfoRegionPass, "polly-scops", - "Polly - Create polyhedral description of Scops", false, - false); -INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass); -INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker); -INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass); -INITIALIZE_PASS_DEPENDENCY(RegionInfoPass); -INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass); -INITIALIZE_PASS_DEPENDENCY(ScopDetectionWrapperPass); -INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass); -INITIALIZE_PASS_END(ScopInfoRegionPass, "polly-scops", - "Polly - Create polyhedral description of Scops", false, - false) - -//===----------------------------------------------------------------------===// - -namespace { - -/// Print result from 
ScopInfoRegionPass. -class ScopInfoPrinterLegacyRegionPass final : public RegionPass { -public: - static char ID; - - ScopInfoPrinterLegacyRegionPass() : ScopInfoPrinterLegacyRegionPass(outs()) {} - - explicit ScopInfoPrinterLegacyRegionPass(llvm::raw_ostream &OS) - : RegionPass(ID), OS(OS) {} - - bool runOnRegion(Region *R, RGPassManager &RGM) override { - ScopInfoRegionPass &P = getAnalysis<ScopInfoRegionPass>(); - - OS << "Printing analysis '" << P.getPassName() << "' for region: '" - << R->getNameStr() << "' in function '" - << R->getEntry()->getParent()->getName() << "':\n"; - P.print(OS); - - return false; - } - - void getAnalysisUsage(AnalysisUsage &AU) const override { - RegionPass::getAnalysisUsage(AU); - AU.addRequired<ScopInfoRegionPass>(); - AU.setPreservesAll(); - } - -private: - llvm::raw_ostream &OS; -}; - -char ScopInfoPrinterLegacyRegionPass::ID = 0; -} // namespace - -Pass *polly::createScopInfoPrinterLegacyRegionPass(raw_ostream &OS) { - return new ScopInfoPrinterLegacyRegionPass(OS); -} - -INITIALIZE_PASS_BEGIN(ScopInfoPrinterLegacyRegionPass, "polly-print-scops", - "Polly - Print polyhedral description of Scops", false, - false); -INITIALIZE_PASS_DEPENDENCY(ScopInfoRegionPass); -INITIALIZE_PASS_END(ScopInfoPrinterLegacyRegionPass, "polly-print-scops", - "Polly - Print polyhedral description of Scops", false, - false) - -//===----------------------------------------------------------------------===// - ScopInfo::ScopInfo(const DataLayout &DL, ScopDetection &SD, ScalarEvolution &SE, LoopInfo &LI, AliasAnalysis &AA, DominatorTree &DT, AssumptionCache &AC, OptimizationRemarkEmitter &ORE) @@ -2771,110 +2650,3 @@ PreservedAnalyses ScopInfoPrinterPass::run(Function &F, } return PreservedAnalyses::all(); } - -void ScopInfoWrapperPass::getAnalysisUsage(AnalysisUsage &AU) const { - AU.addRequired<LoopInfoWrapperPass>(); - AU.addRequired<RegionInfoPass>(); - AU.addRequired<DominatorTreeWrapperPass>(); - AU.addRequiredTransitive<ScalarEvolutionWrapperPass>(); - AU.addRequiredTransitive<ScopDetectionWrapperPass>(); - AU.addRequired<AAResultsWrapperPass>(); - AU.addRequired<AssumptionCacheTracker>(); - AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); - AU.setPreservesAll(); -} - -bool ScopInfoWrapperPass::runOnFunction(Function &F) { - auto &SD = getAnalysis<ScopDetectionWrapperPass>().getSD(); - auto &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE(); - auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); - auto &AA = getAnalysis<AAResultsWrapperPass>().getAAResults(); - auto const &DL = F.getParent()->getDataLayout(); - auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); - auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); - auto &ORE = getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); - - Result.reset(new ScopInfo{DL, SD, SE, LI, AA, DT, AC, ORE}); - return false; -} - -void ScopInfoWrapperPass::print(raw_ostream &OS, const Module *) const { - for (auto &It : *Result) { - if (It.second) - It.second->print(OS, PollyPrintInstructions); - else - OS << "Invalid Scop!\n"; - } -} - -char ScopInfoWrapperPass::ID = 0; - -Pass *polly::createScopInfoWrapperPassPass() { - return new ScopInfoWrapperPass(); -} - -INITIALIZE_PASS_BEGIN( - ScopInfoWrapperPass, "polly-function-scops", - "Polly - Create polyhedral description of all Scops of a function", false, - false); -INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass); -INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker); -INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass); 
-INITIALIZE_PASS_DEPENDENCY(RegionInfoPass); -INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass); -INITIALIZE_PASS_DEPENDENCY(ScopDetectionWrapperPass); -INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass); -INITIALIZE_PASS_END( - ScopInfoWrapperPass, "polly-function-scops", - "Polly - Create polyhedral description of all Scops of a function", false, - false) - -//===----------------------------------------------------------------------===// - -namespace { -/// Print result from ScopInfoWrapperPass. -class ScopInfoPrinterLegacyFunctionPass final : public FunctionPass { -public: - static char ID; - - ScopInfoPrinterLegacyFunctionPass() - : ScopInfoPrinterLegacyFunctionPass(outs()) {} - explicit ScopInfoPrinterLegacyFunctionPass(llvm::raw_ostream &OS) - : FunctionPass(ID), OS(OS) {} - - bool runOnFunction(Function &F) override { - ScopInfoWrapperPass &P = getAnalysis<ScopInfoWrapperPass>(); - - OS << "Printing analysis '" << P.getPassName() << "' for function '" - << F.getName() << "':\n"; - P.print(OS); - - return false; - } - - void getAnalysisUsage(AnalysisUsage &AU) const override { - FunctionPass::getAnalysisUsage(AU); - AU.addRequired<ScopInfoWrapperPass>(); - AU.setPreservesAll(); - } - -private: - llvm::raw_ostream &OS; -}; - -char ScopInfoPrinterLegacyFunctionPass::ID = 0; -} // namespace - -Pass *polly::createScopInfoPrinterLegacyFunctionPass(raw_ostream &OS) { - return new ScopInfoPrinterLegacyFunctionPass(OS); -} - -INITIALIZE_PASS_BEGIN( - ScopInfoPrinterLegacyFunctionPass, "polly-print-function-scops", - "Polly - Print polyhedral description of all Scops of a function", false, - false); -INITIALIZE_PASS_DEPENDENCY(ScopInfoWrapperPass); -INITIALIZE_PASS_END( - ScopInfoPrinterLegacyFunctionPass, "polly-print-function-scops", - "Polly - Print polyhedral description of all Scops of a function", false, - false) diff --git a/polly/lib/Analysis/ScopPass.cpp b/polly/lib/Analysis/ScopPass.cpp index 719cd0f6984e0..61417e799cfa5 100644 --- a/polly/lib/Analysis/ScopPass.cpp +++ b/polly/lib/Analysis/ScopPass.cpp @@ -24,42 +24,6 @@ using namespace llvm; using namespace polly; -bool ScopPass::runOnRegion(Region *R, RGPassManager &RGM) { - S = nullptr; - - if (skipRegion(*R)) - return false; - - if ((S = getAnalysis<ScopInfoRegionPass>().getScop())) - return runOnScop(*S); - - return false; -} - -void ScopPass::print(raw_ostream &OS, const Module *M) const { - if (S) - printScop(OS, *S); -} - -void ScopPass::getAnalysisUsage(AnalysisUsage &AU) const { - AU.addRequired<ScopInfoRegionPass>(); - - AU.addPreserved<AAResultsWrapperPass>(); - AU.addPreserved<BasicAAWrapperPass>(); - AU.addPreserved<LoopInfoWrapperPass>(); - AU.addPreserved<DominatorTreeWrapperPass>(); - AU.addPreserved<GlobalsAAWrapperPass>(); - AU.addPreserved<ScopDetectionWrapperPass>(); - AU.addPreserved<ScalarEvolutionWrapperPass>(); - AU.addPreserved<SCEVAAWrapperPass>(); - AU.addPreserved<OptimizationRemarkEmitterWrapperPass>(); - AU.addPreserved<LazyBlockFrequencyInfoPass>(); - AU.addPreserved<LazyBranchProbabilityInfoPass>(); - AU.addPreserved<RegionInfoPass>(); - AU.addPreserved<ScopInfoRegionPass>(); - AU.addPreserved<TargetTransformInfoWrapperPass>(); -} - namespace polly { template class OwningInnerAnalysisManagerProxy<ScopAnalysisManager, Function>; } diff --git a/polly/lib/CMakeLists.txt b/polly/lib/CMakeLists.txt index 0ed673815ff34..e4f196f151c9e 100644 --- a/polly/lib/CMakeLists.txt +++ b/polly/lib/CMakeLists.txt @@ -60,6 +60,9 @@ add_llvm_pass_plugin(Polly CodeGen/RuntimeDebugBuilder.cpp 
CodeGen/PerfMonitor.cpp Exchange/JSONExporter.cpp + Pass/PhaseManager.cpp + Pass/PollyFunctionPass.cpp + Pass/PollyModulePass.cpp Support/GICHelper.cpp Support/PollyDebug.cpp Support/SCEVAffinator.cpp diff --git a/polly/lib/CodeGen/CodeGeneration.cpp b/polly/lib/CodeGen/CodeGeneration.cpp index 2d8b393cc039c..f2d5a3422849e 100644 --- a/polly/lib/CodeGen/CodeGeneration.cpp +++ b/polly/lib/CodeGen/CodeGeneration.cpp @@ -25,7 +25,6 @@ #include "polly/CodeGen/PerfMonitor.h" #include "polly/CodeGen/Utils.h" #include "polly/DependenceInfo.h" -#include "polly/LinkAllPasses.h" #include "polly/Options.h" #include "polly/ScopInfo.h" #include "polly/Support/ScopHelper.h" @@ -37,7 +36,6 @@ #include "llvm/IR/Function.h" #include "llvm/IR/PassManager.h" #include "llvm/IR/Verifier.h" -#include "llvm/InitializePasses.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" @@ -314,59 +312,6 @@ static bool generateCode(Scop &S, IslAstInfo &AI, LoopInfo &LI, return true; } -namespace { - -class CodeGeneration final : public ScopPass { -public: - static char ID; - - /// The data layout used. - const DataLayout *DL; - - /// @name The analysis passes we need to generate code. - /// - ///{ - LoopInfo *LI; - IslAstInfo *AI; - DominatorTree *DT; - ScalarEvolution *SE; - RegionInfo *RI; - ///} - - CodeGeneration() : ScopPass(ID) {} - - /// Generate LLVM-IR for the SCoP @p S. - bool runOnScop(Scop &S) override { - AI = &getAnalysis<IslAstInfoWrapperPass>().getAI(); - LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); - DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); - SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); - DL = &S.getFunction().getDataLayout(); - RI = &getAnalysis<RegionInfoPass>().getRegionInfo(); - return generateCode(S, *AI, *LI, *DT, *SE, *RI); - } - - /// Register all analyses and transformation required. - void getAnalysisUsage(AnalysisUsage &AU) const override { - ScopPass::getAnalysisUsage(AU); - - AU.addRequired<DominatorTreeWrapperPass>(); - AU.addRequired<IslAstInfoWrapperPass>(); - AU.addRequired<RegionInfoPass>(); - AU.addRequired<ScalarEvolutionWrapperPass>(); - AU.addRequired<ScopDetectionWrapperPass>(); - AU.addRequired<ScopInfoRegionPass>(); - AU.addRequired<LoopInfoWrapperPass>(); - - AU.addPreserved<DependenceInfo>(); - AU.addPreserved<IslAstInfoWrapperPass>(); - - // FIXME: We do not yet add regions for the newly generated code to the - // region tree. 
- } -}; -} // namespace - PreservedAnalyses CodeGenerationPass::run(Scop &S, ScopAnalysisManager &SAM, ScopStandardAnalysisResults &AR, SPMUpdater &U) { @@ -379,17 +324,6 @@ PreservedAnalyses CodeGenerationPass::run(Scop &S, ScopAnalysisManager &SAM, return PreservedAnalyses::all(); } -char CodeGeneration::ID = 1; - -Pass *polly::createCodeGenerationPass() { return new CodeGeneration(); } - -INITIALIZE_PASS_BEGIN(CodeGeneration, "polly-codegen", - "Polly - Create LLVM-IR from SCoPs", false, false); -INITIALIZE_PASS_DEPENDENCY(DependenceInfo); -INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass); -INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass); -INITIALIZE_PASS_DEPENDENCY(RegionInfoPass); -INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass); -INITIALIZE_PASS_DEPENDENCY(ScopDetectionWrapperPass); -INITIALIZE_PASS_END(CodeGeneration, "polly-codegen", - "Polly - Create LLVM-IR from SCoPs", false, false) +bool polly::runCodeGeneration(Scop &S, RegionInfo &RI, IslAstInfo &AI) { + return generateCode(S, AI, *S.getLI(), *S.getDT(), *S.getSE(), RI); +} diff --git a/polly/lib/CodeGen/IslAst.cpp b/polly/lib/CodeGen/IslAst.cpp index 09bacda196742..3177cda225f1d 100644 --- a/polly/lib/CodeGen/IslAst.cpp +++ b/polly/lib/CodeGen/IslAst.cpp @@ -29,7 +29,6 @@ #include "polly/CodeGen/IslAst.h" #include "polly/CodeGen/CodeGeneration.h" #include "polly/DependenceInfo.h" -#include "polly/LinkAllPasses.h" #include "polly/Options.h" #include "polly/ScopDetection.h" #include "polly/ScopInfo.h" @@ -83,6 +82,11 @@ static cl::opt<bool> DetectParallel("polly-ast-detect-parallel", cl::desc("Detect parallelism"), cl::Hidden, cl::cat(PollyCategory)); +static cl::opt<bool> + PollyPrintAst("polly-print-ast", + cl::desc("Print the ISL abstract syntax tree"), + cl::cat(PollyCategory)); + STATISTIC(ScopsProcessed, "Number of SCoPs processed"); STATISTIC(ScopsBeneficial, "Number of beneficial SCoPs"); STATISTIC(BeneficialAffineLoops, "Number of beneficial affine loops"); @@ -776,90 +780,19 @@ PreservedAnalyses IslAstPrinterPass::run(Scop &S, ScopAnalysisManager &SAM, return PreservedAnalyses::all(); } -void IslAstInfoWrapperPass::releaseMemory() { Ast.reset(); } - -bool IslAstInfoWrapperPass::runOnScop(Scop &Scop) { - auto GetDeps = [this](Dependences::AnalysisLevel Lvl) -> const Dependences & { - return getAnalysis<DependenceInfo>().getDependences(Lvl); +std::unique_ptr<IslAstInfo> +polly::runIslAstGen(Scop &S, DependenceAnalysis::Result &DA) { + auto GetDeps = [&](Dependences::AnalysisLevel Lvl) -> const Dependences & { + return DA.getDependences(Lvl); }; - Ast = runIslAst(Scop, GetDeps); - - return false; -} - -void IslAstInfoWrapperPass::getAnalysisUsage(AnalysisUsage &AU) const { - // Get the Common analysis usage of ScopPasses. 
- ScopPass::getAnalysisUsage(AU); - AU.addRequiredTransitive<ScopInfoRegionPass>(); - AU.addRequired<DependenceInfo>(); - - AU.addPreserved<DependenceInfo>(); -} - -void IslAstInfoWrapperPass::printScop(raw_ostream &OS, Scop &S) const { - OS << "Printing analysis 'Polly - Generate an AST of the SCoP (isl)'" - << S.getName() << "' in function '" << S.getFunction().getName() << "':\n"; - if (Ast) - Ast->print(OS); -} - -char IslAstInfoWrapperPass::ID = 0; - -Pass *polly::createIslAstInfoWrapperPassPass() { - return new IslAstInfoWrapperPass(); -} - -INITIALIZE_PASS_BEGIN(IslAstInfoWrapperPass, "polly-ast", - "Polly - Generate an AST of the SCoP (isl)", false, - false); -INITIALIZE_PASS_DEPENDENCY(ScopInfoRegionPass); -INITIALIZE_PASS_DEPENDENCY(DependenceInfo); -INITIALIZE_PASS_END(IslAstInfoWrapperPass, "polly-ast", - "Polly - Generate an AST from the SCoP (isl)", false, false) - -//===----------------------------------------------------------------------===// - -namespace { -/// Print result from IslAstInfoWrapperPass. -class IslAstInfoPrinterLegacyPass final : public ScopPass { -public: - static char ID; - - IslAstInfoPrinterLegacyPass() : IslAstInfoPrinterLegacyPass(outs()) {} - explicit IslAstInfoPrinterLegacyPass(llvm::raw_ostream &OS) - : ScopPass(ID), OS(OS) {} - - bool runOnScop(Scop &S) override { - IslAstInfoWrapperPass &P = getAnalysis<IslAstInfoWrapperPass>(); - - OS << "Printing analysis '" << P.getPassName() << "' for region: '" - << S.getRegion().getNameStr() << "' in function '" - << S.getFunction().getName() << "':\n"; - P.printScop(OS, S); - - return false; + std::unique_ptr<IslAstInfo> Result = runIslAst(S, GetDeps); + if (PollyPrintAst) { + outs() << "Printing analysis 'Polly - Generate an AST of the SCoP (isl)'" + << S.getName() << "' in function '" << S.getFunction().getName() + << "':\n"; + if (Result) + Result->print(llvm::outs()); } - - void getAnalysisUsage(AnalysisUsage &AU) const override { - ScopPass::getAnalysisUsage(AU); - AU.addRequired<IslAstInfoWrapperPass>(); - AU.setPreservesAll(); - } - -private: - llvm::raw_ostream &OS; -}; - -char IslAstInfoPrinterLegacyPass::ID = 0; -} // namespace - -Pass *polly::createIslAstInfoPrinterLegacyPass(raw_ostream &OS) { - return new IslAstInfoPrinterLegacyPass(OS); + return Result; } - -INITIALIZE_PASS_BEGIN(IslAstInfoPrinterLegacyPass, "polly-print-ast", - "Polly - Print the AST from a SCoP (isl)", false, false); -INITIALIZE_PASS_DEPENDENCY(IslAstInfoWrapperPass); -INITIALIZE_PASS_END(IslAstInfoPrinterLegacyPass, "polly-print-ast", - "Polly - Print the AST from a SCoP (isl)", false, false) diff --git a/polly/lib/Exchange/JSONExporter.cpp b/polly/lib/Exchange/JSONExporter.cpp index dfd63146edb5e..7d30c030aa6e1 100644 --- a/polly/lib/Exchange/JSONExporter.cpp +++ b/polly/lib/Exchange/JSONExporter.cpp @@ -12,7 +12,6 @@ #include "polly/JSONExporter.h" #include "polly/DependenceInfo.h" -#include "polly/LinkAllPasses.h" #include "polly/Options.h" #include "polly/ScopInfo.h" #include "polly/ScopPass.h" @@ -36,6 +35,11 @@ using namespace polly; #define DEBUG_TYPE "polly-import-jscop" +static cl::opt<bool> + PollyPrintImportJscop("polly-print-import-jscop", + cl::desc("Polly - Print Scop import result"), + cl::cat(PollyCategory)); + STATISTIC(NewAccessMapFound, "Number of updated access functions"); namespace { @@ -50,36 +54,6 @@ static cl::opt<std::string> cl::desc("Postfix to append to the import .jsop files."), cl::Hidden, cl::value_desc("File postfix"), cl::ValueRequired, cl::init(""), cl::cat(PollyCategory)); - -class 
JSONExporter : public ScopPass { -public: - static char ID; - explicit JSONExporter() : ScopPass(ID) {} - - /// Export the SCoP @p S to a JSON file. - bool runOnScop(Scop &S) override; - - /// Print the SCoP @p S as it is exported. - void printScop(raw_ostream &OS, Scop &S) const override; - - /// Register all analyses and transformation required. - void getAnalysisUsage(AnalysisUsage &AU) const override; -}; - -class JSONImporter : public ScopPass { -public: - static char ID; - std::vector<std::string> NewAccessStrings; - explicit JSONImporter() : ScopPass(ID) {} - /// Import new access functions for SCoP @p S from a JSON file. - bool runOnScop(Scop &S) override; - - /// Print the SCoP @p S and the imported access functions. - void printScop(raw_ostream &OS, Scop &S) const override; - - /// Register all analyses and transformation required. - void getAnalysisUsage(AnalysisUsage &AU) const override; -}; } // namespace static std::string getFileName(Scop &S, StringRef Suffix = "") { @@ -742,21 +716,6 @@ static bool importScop(Scop &S, const Dependences &D, const DataLayout &DL, return true; } -char JSONExporter::ID = 0; -void JSONExporter::printScop(raw_ostream &OS, Scop &S) const { OS << S; } - -bool JSONExporter::runOnScop(Scop &S) { - exportScop(S); - return false; -} - -void JSONExporter::getAnalysisUsage(AnalysisUsage &AU) const { - AU.setPreservesAll(); - AU.addRequired<ScopInfoRegionPass>(); -} - -Pass *polly::createJSONExporterPass() { return new JSONExporter(); } - PreservedAnalyses JSONExportPass::run(Scop &S, ScopAnalysisManager &SAM, ScopStandardAnalysisResults &SAR, SPMUpdater &) { @@ -764,37 +723,6 @@ PreservedAnalyses JSONExportPass::run(Scop &S, ScopAnalysisManager &SAM, return PreservedAnalyses::all(); } -char JSONImporter::ID = 0; - -void JSONImporter::printScop(raw_ostream &OS, Scop &S) const { - OS << S; - for (std::vector<std::string>::const_iterator I = NewAccessStrings.begin(), - E = NewAccessStrings.end(); - I != E; I++) - OS << "New access function '" << *I << "' detected in JSCOP file\n"; -} - -bool JSONImporter::runOnScop(Scop &S) { - const Dependences &D = - getAnalysis<DependenceInfo>().getDependences(Dependences::AL_Statement); - const DataLayout &DL = S.getFunction().getParent()->getDataLayout(); - - if (!importScop(S, D, DL, &NewAccessStrings)) - report_fatal_error("Tried to import a malformed jscop file."); - - return false; -} - -void JSONImporter::getAnalysisUsage(AnalysisUsage &AU) const { - ScopPass::getAnalysisUsage(AU); - AU.addRequired<DependenceInfo>(); - - // TODO: JSONImporter should throw away DependenceInfo. 
- AU.addPreserved<DependenceInfo>(); -} - -Pass *polly::createJSONImporterPass() { return new JSONImporter(); } - PreservedAnalyses JSONImportPass::run(Scop &S, ScopAnalysisManager &SAM, ScopStandardAnalysisResults &SAR, SPMUpdater &) { @@ -814,68 +742,24 @@ PreservedAnalyses JSONImportPass::run(Scop &S, ScopAnalysisManager &SAM, return PA; } -INITIALIZE_PASS_BEGIN(JSONExporter, "polly-export-jscop", - "Polly - Export Scops as JSON" - " (Writes a .jscop file for each Scop)", - false, false); -INITIALIZE_PASS_DEPENDENCY(DependenceInfo) -INITIALIZE_PASS_END(JSONExporter, "polly-export-jscop", - "Polly - Export Scops as JSON" - " (Writes a .jscop file for each Scop)", - false, false) - -INITIALIZE_PASS_BEGIN(JSONImporter, "polly-import-jscop", - "Polly - Import Scops from JSON" - " (Reads a .jscop file for each Scop)", - false, false); -INITIALIZE_PASS_DEPENDENCY(DependenceInfo) -INITIALIZE_PASS_END(JSONImporter, "polly-import-jscop", - "Polly - Import Scops from JSON" - " (Reads a .jscop file for each Scop)", - false, false) - -//===----------------------------------------------------------------------===// - -namespace { -/// Print result from JSONImporter. -class JSONImporterPrinterLegacyPass final : public ScopPass { -public: - static char ID; - - JSONImporterPrinterLegacyPass() : JSONImporterPrinterLegacyPass(outs()) {} - explicit JSONImporterPrinterLegacyPass(llvm::raw_ostream &OS) - : ScopPass(ID), OS(OS) {} - - bool runOnScop(Scop &S) override { - JSONImporter &P = getAnalysis<JSONImporter>(); - - OS << "Printing analysis '" << P.getPassName() << "' for region: '" - << S.getRegion().getNameStr() << "' in function '" - << S.getFunction().getName() << "':\n"; - P.printScop(OS, S); - - return false; - } +void polly::runImportJSON(Scop &S, DependenceAnalysis::Result &DA) { + const Dependences &D = DA.getDependences(Dependences::AL_Statement); + const DataLayout &DL = S.getFunction().getParent()->getDataLayout(); + std::vector<std::string> NewAccessStrings; + if (!importScop(S, D, DL, &NewAccessStrings)) + report_fatal_error("Tried to import a malformed jscop file."); - void getAnalysisUsage(AnalysisUsage &AU) const override { - ScopPass::getAnalysisUsage(AU); - AU.addRequired<JSONImporter>(); - AU.setPreservesAll(); + if (PollyPrintImportJscop) { + outs() + << "Printing analysis 'Polly - Print Scop import result' for region: '" + << S.getRegion().getNameStr() << "' in function '" + << S.getFunction().getName() << "':\n"; + outs() << S; + for (std::vector<std::string>::const_iterator I = NewAccessStrings.begin(), + E = NewAccessStrings.end(); + I != E; I++) + outs() << "New access function '" << *I << "' detected in JSCOP file\n"; } - -private: - llvm::raw_ostream &OS; -}; - -char JSONImporterPrinterLegacyPass::ID = 0; -} // namespace - -Pass *polly::createJSONImporterPrinterLegacyPass(llvm::raw_ostream &OS) { - return new JSONImporterPrinterLegacyPass(OS); } -INITIALIZE_PASS_BEGIN(JSONImporterPrinterLegacyPass, "polly-print-import-jscop", - "Polly - Print Scop import result", false, false) -INITIALIZE_PASS_DEPENDENCY(JSONImporter) -INITIALIZE_PASS_END(JSONImporterPrinterLegacyPass, "polly-print-import-jscop", - "Polly - Print Scop import result", false, false) +void polly::runExportJSON(Scop &S) { exportScop(S); } diff --git a/polly/lib/Pass/PhaseManager.cpp b/polly/lib/Pass/PhaseManager.cpp new file mode 100644 index 0000000000000..fb76c811859b8 --- /dev/null +++ b/polly/lib/Pass/PhaseManager.cpp @@ -0,0 +1,424 @@ +//===------ PhaseManager.cpp ------------------------------------*- C++ 
-*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "polly/Pass/PhaseManager.h"
+#include "polly/CodeGen/CodeGeneration.h"
+#include "polly/CodeGen/IslAst.h"
+#include "polly/CodePreparation.h"
+#include "polly/DeLICM.h"
+#include "polly/DeadCodeElimination.h"
+#include "polly/DependenceInfo.h"
+#include "polly/FlattenSchedule.h"
+#include "polly/ForwardOpTree.h"
+#include "polly/JSONExporter.h"
+#include "polly/MaximalStaticExpansion.h"
+#include "polly/PruneUnprofitable.h"
+#include "polly/ScheduleOptimizer.h"
+#include "polly/ScopDetection.h"
+#include "polly/ScopDetectionDiagnostic.h"
+#include "polly/ScopGraphPrinter.h"
+#include "polly/ScopInfo.h"
+#include "polly/Simplify.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/IR/Module.h"
+
+#define DEBUG_TYPE "polly-pass"
+
+using namespace polly;
+using namespace llvm;
+
+namespace {
+
+/// Recurse through all subregions and add them, together with R itself, to RQ.
+static void addRegionIntoQueue(Region &R, SmallVector<Region *> &RQ) {
+  RQ.push_back(&R);
+  for (const auto &E : R)
+    addRegionIntoQueue(*E, RQ);
+}
+
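[Illustration, not part of the patch: the intended entry point into this phase
pipeline is runPollyPass() at the bottom of this file. A minimal sketch of how
a new-PM function pass would drive it, using only interfaces added in this
patch ("MyPollyPass" is a hypothetical caller):

    PreservedAnalyses MyPollyPass::run(Function &F, FunctionAnalysisManager &FAM) {
      PollyPassOptions Opts;
      Opts.enableEnd2End();     // detect, scops, deps, ast, codegen
      Opts.enableDefaultOpts(); // prepare, simplify, optree, delicm, prune, opt-isl
      bool Modified = polly::runPollyPass(F, FAM, std::move(Opts));
      return Modified ? PreservedAnalyses::none() : PreservedAnalyses::all();
    }

This mirrors what PollyFunctionPass::run() does in PollyFunctionPass.cpp later
in this patch.]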
+/// The phase pipeline of Polly to be embedded into another pass manager that
+/// runs passes on functions.
+///
+/// Polly holds state besides LLVM-IR (RegionInfo and ScopInfo) between phases
+/// that LLVM pass managers do not consider when scheduling analyses and passes.
+/// That is, the ScopInfo must persist between phases, and a pass manager must
+/// not invalidate it expecting to recompute it later.
+class PhaseManager {
+private:
+  Function &F;
+  FunctionAnalysisManager &FAM;
+  PollyPassOptions Opts;
+
+public:
+  PhaseManager(Function &F, FunctionAnalysisManager &FAM, PollyPassOptions Opts)
+      : F(F), FAM(FAM), Opts(std::move(Opts)) {}
+
+  /// Execute Polly's phases as indicated by the options.
+  bool run() {
+    // Get analyses from the function pass manager.
+    // These must be preserved during all phases so that if processing one SCoP
+    // has finished, the next SCoP can still use them. Recomputing is not an
+    // option because ScopDetection stores references to the old results.
+    // TODO: CodePreparation doesn't actually need these analyses, it just
+    // keeps them up-to-date. If they are not computed yet, they can also be
+    // computed after the prepare phase.
+    LoopInfo &LI = FAM.getResult<LoopAnalysis>(F);
+    DominatorTree &DT = FAM.getResult<DominatorTreeAnalysis>(F);
+    bool ModifiedIR = false;
+
+    // Phase: prepare
+    // TODO: Setting ModifiedIR will invalidate any analysis, even if DT, LI
+    // are preserved.
+    if (Opts.isPhaseEnabled(PassPhase::Prepare)) {
+      PreservedAnalyses PA = CodePreparationPass().run(F, FAM);
+      FAM.invalidate(F, PA);
+      if (!PA.areAllPreserved())
+        ModifiedIR = true;
+    }
+
+    // Can't do anything without detection.
+    if (!Opts.isPhaseEnabled(PassPhase::Detection))
+      return false;
+
+    AAResults &AA = FAM.getResult<AAManager>(F);
+    ScalarEvolution &SE = FAM.getResult<ScalarEvolutionAnalysis>(F);
+    OptimizationRemarkEmitter &ORE =
+        FAM.getResult<OptimizationRemarkEmitterAnalysis>(F);
+
+    // ScopDetection modifies RegionInfo; do not cache it, nor use a cached
+    // version.
+    RegionInfo RI = RegionInfoAnalysis().run(F, FAM);
+
+    // Phase: detection
+    ScopDetection SD(DT, SE, LI, RI, AA, ORE);
+    SD.detect(F);
+    if (Opts.isPhaseEnabled(PassPhase::PrintDetect)) {
+      outs() << "Detected Scops in Function " << F.getName() << "\n";
+      for (const Region *R : SD.ValidRegions)
+        outs() << "Valid Region for Scop: " << R->getNameStr() << '\n';
+      outs() << "\n";
+    }
+
+    if (Opts.isPhaseEnabled(PassPhase::DotScops))
+      printGraphForFunction(F, &SD, "scops", false);
+    if (Opts.isPhaseEnabled(PassPhase::DotScopsOnly))
+      printGraphForFunction(F, &SD, "scopsonly", true);
+
+    auto ViewScops = [&](const char *Name, bool IsSimply) {
+      if (!Opts.ViewFilter.empty() && !F.getName().count(Opts.ViewFilter))
+        return;
+
+      if (Opts.ViewAll || std::distance(SD.begin(), SD.end()) > 0)
+        viewGraphForFunction(F, &SD, Name, IsSimply);
+    };
+    if (Opts.isPhaseEnabled(PassPhase::ViewScops))
+      ViewScops("scops", false);
+    if (Opts.isPhaseEnabled(PassPhase::ViewScopsOnly))
+      ViewScops("scopsonly", true);
+
+    // Phase: scops
+    AssumptionCache &AC = FAM.getResult<AssumptionAnalysis>(F);
+    const DataLayout &DL = F.getParent()->getDataLayout();
+    ScopInfo Info(DL, SD, SE, LI, AA, DT, AC, ORE);
+    if (Opts.isPhaseEnabled(PassPhase::PrintScopInfo)) {
+      if (Region *TLR = RI.getTopLevelRegion()) {
+        SmallVector<Region *> Regions;
+        addRegionIntoQueue(*TLR, Regions);
+
+        // Reverse iteration because the regression tests expect it.
+        for (Region *R : reverse(Regions)) {
+          Scop *S = Info.getScop(R);
+          outs() << "Printing analysis 'Polly - Create polyhedral "
+                    "description of Scops' for region: '"
+                 << R->getNameStr() << "' in function '" << F.getName()
+                 << "':\n";
+          if (S)
+            outs() << *S;
+          else
+            outs() << "Invalid Scop!\n";
+        }
+      }
+    }
+
+    SmallPriorityWorklist<Region *, 4> Worklist;
+    for (auto &[R, S] : Info)
+      if (S)
+        Worklist.insert(R);
+
+    TargetTransformInfo &TTI = FAM.getResult<TargetIRAnalysis>(F);
+    while (!Worklist.empty()) {
+      Region *R = Worklist.pop_back_val();
+      if (!SD.isMaxRegionInScop(*R, /*Verify=*/false))
+        continue;
+      Scop *S = Info.getScop(R);
+
+      // Phase: flatten
+      if (Opts.isPhaseEnabled(PassPhase::Flatten))
+        runFlattenSchedulePass(*S);
+
+      // Phase: deps
+      // The actual analysis runs on-demand, so it does not matter whether the
+      // phase is actually enabled, but use this location to print dependences.
+      DependenceAnalysis::Result DA = runDependenceAnalysis(*S);
+      if (Opts.isPhaseEnabled(PassPhase::PrintDependences)) {
+        assert(Opts.isPhaseEnabled(PassPhase::Dependences));
+        const Dependences &D = DA.getDependences(Opts.PrintDepsAnalysisLevel);
+        D.print(outs());
+      }
+
+      // Phase: import-jscop
+      if (Opts.isPhaseEnabled(PassPhase::ImportJScop))
+        runImportJSON(*S, DA);
+
+      // Phase: simplify-0
+      bool ModifiedSinceSimplify = true;
+      if (Opts.isPhaseEnabled(PassPhase::Simplify0)) {
+        runSimplify(*S, 0);
+        ModifiedSinceSimplify = false;
+      }
+
+      // Phase: optree
+      if (Opts.isPhaseEnabled(PassPhase::Optree)) {
+        bool ModifiedByOptree = runForwardOpTree(*S);
+        ModifiedSinceSimplify |= ModifiedByOptree;
+      }
+
+      // Phase: delicm
+      if (Opts.isPhaseEnabled(PassPhase::DeLICM)) {
+        bool ModifiedByDelicm = runDeLICM(*S);
+        ModifiedSinceSimplify |= ModifiedByDelicm;
+      }
+
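[Illustration, not part of the patch: the DA handle threaded through these
phases computes its dependences on demand and per analysis level, so
consecutive phases can ask for different granularities. A sketch against the
Dependences interface as Polly exposes it (AL_Access is assumed to be
available, as in DependenceInfo.h):

    // Statement-level dependences, as consumed by e.g. the schedule optimizer:
    const Dependences &DStmt = DA.getDependences(Dependences::AL_Statement);
    if (DStmt.hasValidDependences()) {
      // ... inspect or test the dependences ...
    }
    // A finer, access-level view can be requested independently:
    const Dependences &DAcc = DA.getDependences(Dependences::AL_Access);
]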
+      // Phase: simplify-1
+      // If we have already run simplify-0, do not re-run it if the SCoP has
+      // not changed since then.
+      if (ModifiedSinceSimplify && Opts.isPhaseEnabled(PassPhase::Simplify1)) {
+        runSimplify(*S, 1);
+        ModifiedSinceSimplify = false;
+      }
+
+      // Phase: dce
+      if (Opts.isPhaseEnabled(PassPhase::DeadCodeElimination))
+        runDeadCodeElim(*S, DA);
+
+      // Phase: mse
+      if (Opts.isPhaseEnabled(PassPhase::MaximumStaticExtension))
+        runMaximalStaticExpansion(*S, DA);
+
+      // Phase: prune
+      if (Opts.isPhaseEnabled(PassPhase::PruneUnprofitable))
+        runPruneUnprofitable(*S);
+
+      // Phase: opt-isl
+      if (Opts.isPhaseEnabled(PassPhase::Optimization))
+        runIslScheduleOptimizer(*S, &TTI, DA);
+
+      // Phase: export-jscop
+      if (Opts.isPhaseEnabled(PassPhase::ExportJScop))
+        runExportJSON(*S);
+
+      // Phase: ast
+      // Cannot run codegen unless ast is enabled.
+      if (!Opts.isPhaseEnabled(PassPhase::AstGen))
+        continue;
+      std::unique_ptr<IslAstInfo> IslAst = runIslAstGen(*S, DA);
+
+      // Phase: codegen
+      if (!Opts.isPhaseEnabled(PassPhase::CodeGen))
+        continue;
+      bool ModifiedByCodeGen = runCodeGeneration(*S, RI, *IslAst);
+      if (ModifiedByCodeGen) {
+        ModifiedIR = true;
+
+        // For all regions, create new polly::Scop objects because the old
+        // ones refer to invalidated LLVM-IR.
+        // FIXME: Adds all SCoPs again to statistics.
+        Info.recompute();
+      }
+    }
+
+    return ModifiedIR;
+  }
+};
+} // namespace
+
+StringRef polly::getPhaseName(PassPhase Phase) {
+  switch (Phase) {
+  case PassPhase::Prepare:
+    return "prepare";
+  case PassPhase::Detection:
+    return "detect";
+  case PassPhase::PrintDetect:
+    return "print-detect";
+  case PassPhase::DotScops:
+    return "dot-scops";
+  case PassPhase::DotScopsOnly:
+    return "dot-scops-only";
+  case PassPhase::ViewScops:
+    return "view-scops";
+  case PassPhase::ViewScopsOnly:
+    return "view-scops-only";
+  case PassPhase::ScopInfo:
+    return "scops";
+  case PassPhase::PrintScopInfo:
+    return "print-scops";
+  case PassPhase::Flatten:
+    return "flatten";
+  case PassPhase::Dependences:
+    return "deps";
+  case PassPhase::PrintDependences:
+    return "print-deps";
+  case PassPhase::ImportJScop:
+    return "import-jscop";
+  case PassPhase::Simplify0:
+    return "simplify-0";
+  case PassPhase::Optree:
+    return "optree";
+  case PassPhase::DeLICM:
+    return "delicm";
+  case PassPhase::Simplify1:
+    return "simplify-1";
+  case PassPhase::DeadCodeElimination:
+    return "dce";
+  case PassPhase::MaximumStaticExtension:
+    return "mse";
+  case PassPhase::PruneUnprofitable:
+    return "prune";
+  case PassPhase::Optimization:
+    return "opt-isl"; // "opt" would conflict with the llvm executable
+  case PassPhase::ExportJScop:
+    return "export-jscop";
+  case PassPhase::AstGen:
+    return "ast";
+  case PassPhase::CodeGen:
+    return "codegen";
+  default:
+    llvm_unreachable("Unexpected phase");
+  }
+}
+
+PassPhase polly::parsePhase(StringRef Name) {
+  return StringSwitch<PassPhase>(Name)
+      .Case("prepare", PassPhase::Prepare)
+      .Case("detect", PassPhase::Detection)
+      .Case("print-detect", PassPhase::PrintDetect)
+      .Case("dot-scops", PassPhase::DotScops)
+      .Case("dot-scops-only", PassPhase::DotScopsOnly)
+      .Case("view-scops", PassPhase::ViewScops)
+      .Case("view-scops-only", PassPhase::ViewScopsOnly)
+      .Case("scops", PassPhase::ScopInfo)
+      .Case("print-scops", PassPhase::PrintScopInfo)
+      .Case("flatten", PassPhase::Flatten)
+      .Case("deps", PassPhase::Dependences)
+      .Case("print-deps", PassPhase::PrintDependences)
+      .Case("import-jscop", PassPhase::ImportJScop)
+      .Case("simplify-0", PassPhase::Simplify0)
+      .Case("optree", PassPhase::Optree)
+      .Case("delicm", PassPhase::DeLICM)
+      .Case("simplify-1", PassPhase::Simplify1)
+
.Case("dce", PassPhase::DeadCodeElimination) + .Case("mse", PassPhase::MaximumStaticExtension) + .Case("prune", PassPhase::PruneUnprofitable) + .Case("opt-isl", PassPhase::Optimization) + .Case("export-jscop", PassPhase::ExportJScop) + .Case("ast", PassPhase::AstGen) + .Case("codegen", PassPhase::CodeGen) + .Default(PassPhase::None); +} + +bool polly::dependsOnDependenceInfo(PassPhase Phase) { + // Nothing before dep phase can depend on it + if (static_cast<size_t>(Phase) <= static_cast<size_t>(PassPhase::Dependences)) + return false; + + switch (Phase) { + case PassPhase::Simplify0: + case PassPhase::Optree: + case PassPhase::DeLICM: + case PassPhase::Simplify1: + case PassPhase::PruneUnprofitable: + case PassPhase::ImportJScop: + case PassPhase::ExportJScop: + case PassPhase::AstGen: // transitively through codegen + case PassPhase::CodeGen: + return false; + default: + return true; + } +} + +void PollyPassOptions::enableEnd2End() { + setPhaseEnabled(PassPhase::Detection); + setPhaseEnabled(PassPhase::ScopInfo); + setPhaseEnabled(PassPhase::Dependences); + setPhaseEnabled(PassPhase::AstGen); + setPhaseEnabled(PassPhase::CodeGen); +} + +void PollyPassOptions::enableDefaultOpts() { + setPhaseEnabled(PassPhase::Prepare); + setPhaseEnabled(PassPhase::Simplify0); + setPhaseEnabled(PassPhase::Optree); + setPhaseEnabled(PassPhase::DeLICM); + setPhaseEnabled(PassPhase::Simplify1); + setPhaseEnabled(PassPhase::PruneUnprofitable); + setPhaseEnabled(PassPhase::Optimization); +} + +void PollyPassOptions::disableAfter(PassPhase Phase) { + assert(Phase != PassPhase::None); + for (PassPhase P : enum_seq_inclusive(Phase, PassPhase::PassPhaseLast)) { + if (P == Phase) + continue; + setPhaseEnabled(P, false); + } +} + +Error PollyPassOptions::checkConsistency() const { + for (PassPhase P : enum_seq_inclusive(PassPhase::PassPhaseFirst, + PassPhase::PassPhaseLast)) { + if (!isPhaseEnabled(P)) + continue; + + // Prepare and Detection have no requirements + if (P == PassPhase::Prepare || P == PassPhase::Detection) + continue; + + if (!isPhaseEnabled(PassPhase::Detection)) + return make_error<StringError>( + formatv("'{0}' requires 'detect' to be enabled", getPhaseName(P)) + .str(), + inconvertibleErrorCode()); + + if (static_cast<size_t>(P) < static_cast<size_t>(PassPhase::ScopInfo)) + continue; + + if (!isPhaseEnabled(PassPhase::ScopInfo)) + return make_error<StringError>( + formatv("'{0}' requires 'scops' to be enabled", getPhaseName(P)) + .str(), + inconvertibleErrorCode()); + + if (dependsOnDependenceInfo(P) && !isPhaseEnabled(PassPhase::Dependences)) + return make_error<StringError>( + formatv("'{0}' requires 'deps' to be enabled", getPhaseName(P)).str(), + inconvertibleErrorCode()); + } + + if (isPhaseEnabled(PassPhase::CodeGen) && !isPhaseEnabled(PassPhase::AstGen)) + return make_error<StringError>("'codegen' requires 'ast' to be enabled", + inconvertibleErrorCode()); + + return Error::success(); +} + +bool polly::runPollyPass(Function &F, FunctionAnalysisManager &FAM, + PollyPassOptions Opts) { + return PhaseManager(F, FAM, std::move(Opts)).run(); +} diff --git a/polly/lib/Pass/PollyFunctionPass.cpp b/polly/lib/Pass/PollyFunctionPass.cpp new file mode 100644 index 0000000000000..a478e4df2ca20 --- /dev/null +++ b/polly/lib/Pass/PollyFunctionPass.cpp @@ -0,0 +1,22 @@ +//===------ PollyFunctionPass.cpp - Polly function pass ------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "polly/Pass/PollyFunctionPass.h"
+
+using namespace llvm;
+using namespace polly;
+
+PreservedAnalyses PollyFunctionPass::run(llvm::Function &F,
+                                         llvm::FunctionAnalysisManager &FAM) {
+  bool ModifiedIR = runPollyPass(F, FAM, Opts);
+
+  // Be conservative about preserved analyses.
+  // FIXME: May also need to invalidate/update Module/CGSCC analyses, but
+  // cannot reach them within a FunctionPassManager.
+  return ModifiedIR ? PreservedAnalyses::none() : PreservedAnalyses::all();
+}
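[Illustration, not part of the patch: PollyFunctionPass is the unit that
PollyPasses.def registers for the "polly" pipeline name, wrapped in a
module-to-function adaptor; the PollyModulePass defined next is a hand-rolled
equivalent of that adaptor. Sketch of the two formulations, assuming
PollyModulePass, like PollyFunctionPass, is constructed from a PollyPassOptions
(its header is not part of this hunk):

    llvm::ModulePassManager MPM;
    // Adaptor form, as used for the "polly" entry in PollyPasses.def:
    MPM.addPass(createModuleToFunctionPassAdaptor(PollyFunctionPass(Opts)));
    // Hand-rolled form from PollyModulePass.cpp:
    MPM.addPass(PollyModulePass(Opts));
]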
diff --git a/polly/lib/Pass/PollyModulePass.cpp b/polly/lib/Pass/PollyModulePass.cpp
new file mode 100644
index 0000000000000..f56ee672b76af
--- /dev/null
+++ b/polly/lib/Pass/PollyModulePass.cpp
@@ -0,0 +1,29 @@
+//===------ PollyModulePass.cpp - Polly module pass ----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "polly/Pass/PollyModulePass.h"
+#include "llvm/IR/Module.h"
+
+using namespace llvm;
+using namespace polly;
+
+PreservedAnalyses PollyModulePass::run(llvm::Module &M,
+                                       llvm::ModuleAnalysisManager &MAM) {
+  FunctionAnalysisManager &FAM =
+      MAM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
+
+  bool ModifiedAnyIR = false;
+  for (Function &F : M) {
+    bool LocalModifiedIR = runPollyPass(F, FAM, Opts);
+    ModifiedAnyIR |= LocalModifiedIR;
+  }
+
+  // Be conservative about preserved analyses, especially if parallel functions
+  // have been outlined.
+  return ModifiedAnyIR ? PreservedAnalyses::none() : PreservedAnalyses::all();
+}
diff --git a/polly/lib/Support/DumpFunctionPass.cpp b/polly/lib/Support/DumpFunctionPass.cpp
index e47b7fe0db966..9565e2156aee6 100644
--- a/polly/lib/Support/DumpFunctionPass.cpp
+++ b/polly/lib/Support/DumpFunctionPass.cpp
@@ -13,7 +13,6 @@
 #include "polly/Support/DumpFunctionPass.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/PassInstrumentation.h"
-#include "llvm/Pass.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Support/Path.h"
@@ -82,50 +81,10 @@ static void runDumpFunction(llvm::Function &F, StringRef Suffix) {
   Out->keep();
   LLVM_DEBUG(dbgs() << "Dump file " << Dumpfile << " written successfully\n");
 }
-
-class DumpFunctionWrapperPass final : public FunctionPass {
-private:
-  DumpFunctionWrapperPass(const DumpFunctionWrapperPass &) = delete;
-  const DumpFunctionWrapperPass &
-  operator=(const DumpFunctionWrapperPass &) = delete;
-
-  std::string Suffix;
-
-public:
-  static char ID;
-
-  explicit DumpFunctionWrapperPass() : FunctionPass(ID), Suffix("-dump") {}
-
-  explicit DumpFunctionWrapperPass(std::string Suffix)
-      : FunctionPass(ID), Suffix(std::move(Suffix)) {}
-
-  /// @name FunctionPass interface
-  //@{
-  void getAnalysisUsage(llvm::AnalysisUsage &AU) const override {
-    AU.setPreservesAll();
-  }
-
-  bool runOnFunction(llvm::Function &F) override {
-    runDumpFunction(F, Suffix);
-    return false;
-  }
-  //@}
-};
-
-char DumpFunctionWrapperPass::ID;
 } // namespace
 
-FunctionPass *polly::createDumpFunctionWrapperPass(std::string Suffix) {
-  return new DumpFunctionWrapperPass(std::move(Suffix));
-}
-
 llvm::PreservedAnalyses DumpFunctionPass::run(Function &F,
                                               FunctionAnalysisManager &AM) {
   runDumpFunction(F, Suffix);
   return PreservedAnalyses::all();
 }
-
-INITIALIZE_PASS_BEGIN(DumpFunctionWrapperPass, "polly-dump-function",
-                      "Polly - Dump Function", false, false)
-INITIALIZE_PASS_END(DumpFunctionWrapperPass, "polly-dump-function",
-                    "Polly - Dump Function", false, false)
diff --git a/polly/lib/Support/DumpModulePass.cpp b/polly/lib/Support/DumpModulePass.cpp
index c1c27ef6ac757..2eaa0707fe571 100644
--- a/polly/lib/Support/DumpModulePass.cpp
+++ b/polly/lib/Support/DumpModulePass.cpp
@@ -12,7 +12,6 @@
 #include "polly/Support/DumpModulePass.h"
 #include "llvm/IR/Module.h"
-#include "llvm/Pass.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Support/Path.h"
@@ -47,56 +46,10 @@ static void runDumpModule(llvm::Module &M, StringRef Filename, bool IsSuffix) {
   M.print(Out->os(), nullptr);
   Out->keep();
 }
-
-class DumpModuleWrapperPass final : public ModulePass {
-private:
-  DumpModuleWrapperPass(const DumpModuleWrapperPass &) = delete;
-  const DumpModuleWrapperPass &
-  operator=(const DumpModuleWrapperPass &) = delete;
-
-  std::string Filename;
-  bool IsSuffix;
-
-public:
-  static char ID;
-
-  /// This constructor is used e.g. if using opt -polly-dump-module.
-  ///
-  /// Provide a default suffix to not overwrite the original file.
- explicit DumpModuleWrapperPass() - : ModulePass(ID), Filename("-dump"), IsSuffix(true) {} - - explicit DumpModuleWrapperPass(std::string Filename, bool IsSuffix) - : ModulePass(ID), Filename(std::move(Filename)), IsSuffix(IsSuffix) {} - - /// @name ModulePass interface - //@{ - void getAnalysisUsage(llvm::AnalysisUsage &AU) const override { - AU.setPreservesAll(); - } - - bool runOnModule(llvm::Module &M) override { - runDumpModule(M, Filename, IsSuffix); - return false; - } - //@} -}; - -char DumpModuleWrapperPass::ID; } // namespace -ModulePass *polly::createDumpModuleWrapperPass(std::string Filename, - bool IsSuffix) { - return new DumpModuleWrapperPass(std::move(Filename), IsSuffix); -} - llvm::PreservedAnalyses DumpModulePass::run(llvm::Module &M, llvm::ModuleAnalysisManager &AM) { runDumpModule(M, Filename, IsSuffix); return PreservedAnalyses::all(); } - -INITIALIZE_PASS_BEGIN(DumpModuleWrapperPass, "polly-dump-module", - "Polly - Dump Module", false, false) -INITIALIZE_PASS_END(DumpModuleWrapperPass, "polly-dump-module", - "Polly - Dump Module", false, false) diff --git a/polly/lib/Support/PollyPasses.def b/polly/lib/Support/PollyPasses.def index 2c792a5867100..496839760a844 100644 --- a/polly/lib/Support/PollyPasses.def +++ b/polly/lib/Support/PollyPasses.def @@ -1,3 +1,10 @@ +#ifndef MODULE_PASS +#define MODULE_PASS(NAME, CREATE_PASS, PARSER) +#endif +MODULE_PASS("polly", createModuleToFunctionPassAdaptor(PollyFunctionPass(Opts)), parsePollyDefaultOptions) +MODULE_PASS("polly-custom", createModuleToFunctionPassAdaptor(PollyFunctionPass(Opts)), parsePollyCustomOptions) +#undef MODULE_PASS + #ifndef CGSCC_PASS #define CGSCC_PASS(NAME, CREATE_PASS, PARSER) #endif @@ -12,15 +19,17 @@ FUNCTION_ANALYSIS("polly-function-scops", ScopInfoAnalysis()) #undef FUNCTION_ANALYSIS #ifndef FUNCTION_PASS -#define FUNCTION_PASS(NAME, CREATE_PASS) +#define FUNCTION_PASS(NAME, CREATE_PASS, PARSER) #endif -FUNCTION_PASS("polly-prepare", CodePreparationPass()) -FUNCTION_PASS("print<polly-detect>", ScopAnalysisPrinterPass(llvm::errs())) -FUNCTION_PASS("print<polly-function-scops>", ScopInfoPrinterPass(llvm::errs())) -FUNCTION_PASS("polly-scop-viewer", ScopViewer()) -FUNCTION_PASS("polly-scop-only-viewer", ScopOnlyViewer()) -FUNCTION_PASS("polly-scop-printer", ScopPrinter()) -FUNCTION_PASS("polly-scop-only-printer", ScopOnlyPrinter()) +FUNCTION_PASS("polly-prepare", CodePreparationPass(), parseNoOptions) +FUNCTION_PASS("print<polly-detect>", ScopAnalysisPrinterPass(llvm::errs()), parseNoOptions) +FUNCTION_PASS("print<polly-function-scops>", ScopInfoPrinterPass(llvm::errs()), parseNoOptions) +FUNCTION_PASS("polly-scop-viewer", ScopViewer(), parseNoOptions) +FUNCTION_PASS("polly-scop-only-viewer", ScopOnlyViewer(), parseNoOptions) +FUNCTION_PASS("polly-scop-printer", ScopPrinter(), parseNoOptions) +FUNCTION_PASS("polly-scop-only-printer", ScopOnlyPrinter(), parseNoOptions) +FUNCTION_PASS("polly", PollyFunctionPass(Opts), parsePollyDefaultOptions) +FUNCTION_PASS("polly-custom", PollyFunctionPass(Opts), parsePollyCustomOptions) #undef FUNCTION_PASS #ifndef SCOP_ANALYSIS diff --git a/polly/lib/Support/RegisterPasses.cpp b/polly/lib/Support/RegisterPasses.cpp index 04f8715502c38..2f1d7a8362349 100644 --- a/polly/lib/Support/RegisterPasses.cpp +++ b/polly/lib/Support/RegisterPasses.cpp @@ -28,8 +28,9 @@ #include "polly/DependenceInfo.h" #include "polly/ForwardOpTree.h" #include "polly/JSONExporter.h" -#include "polly/LinkAllPasses.h" #include "polly/MaximalStaticExpansion.h" +#include "polly/Options.h" 
+#include "polly/Pass/PollyFunctionPass.h" #include "polly/PruneUnprofitable.h" #include "polly/ScheduleOptimizer.h" #include "polly/ScopDetection.h" @@ -52,6 +53,8 @@ #include "llvm/Transforms/IPO.h" using namespace llvm; +using namespace polly; + namespace cl = llvm::cl; using namespace polly; @@ -201,58 +204,19 @@ static cl::opt<bool> EnablePruneUnprofitable( cl::desc("Bail out on unprofitable SCoPs before rescheduling"), cl::Hidden, cl::init(true), cl::cat(PollyCategory)); -namespace { +static cl::opt<bool> + PollyPrintDetect("polly-print-detect", + cl::desc("Polly - Print static control parts (SCoPs)"), + cl::cat(PollyCategory)); -/// Initialize Polly passes when library is loaded. -/// -/// We use the constructor of a statically declared object to initialize the -/// different Polly passes right after the Polly library is loaded. This ensures -/// that the Polly passes are available e.g. in the 'opt' tool. -struct StaticInitializer { - StaticInitializer() { - llvm::PassRegistry &Registry = *llvm::PassRegistry::getPassRegistry(); - polly::initializePollyPasses(Registry); - } -}; -static StaticInitializer InitializeEverything; -} // end of anonymous namespace. - -void initializePollyPasses(llvm::PassRegistry &Registry) { - initializeCodeGenerationPass(Registry); - - initializeCodePreparationPass(Registry); - initializeDeadCodeElimWrapperPassPass(Registry); - initializeDependenceInfoPass(Registry); - initializeDependenceInfoPrinterLegacyPassPass(Registry); - initializeDependenceInfoWrapperPassPass(Registry); - initializeDependenceInfoPrinterLegacyFunctionPassPass(Registry); - initializeJSONExporterPass(Registry); - initializeJSONImporterPass(Registry); - initializeJSONImporterPrinterLegacyPassPass(Registry); - initializeMaximalStaticExpanderWrapperPassPass(Registry); - initializeIslAstInfoWrapperPassPass(Registry); - initializeIslAstInfoPrinterLegacyPassPass(Registry); - initializeIslScheduleOptimizerWrapperPassPass(Registry); - initializeIslScheduleOptimizerPrinterLegacyPassPass(Registry); - initializePollyCanonicalizePass(Registry); - initializeScopDetectionWrapperPassPass(Registry); - initializeScopDetectionPrinterLegacyPassPass(Registry); - initializeScopInlinerWrapperPassPass(Registry); - initializeScopInfoRegionPassPass(Registry); - initializeScopInfoPrinterLegacyRegionPassPass(Registry); - initializeScopInfoWrapperPassPass(Registry); - initializeScopInfoPrinterLegacyFunctionPassPass(Registry); - initializeFlattenSchedulePass(Registry); - initializeFlattenSchedulePrinterLegacyPassPass(Registry); - initializeForwardOpTreeWrapperPassPass(Registry); - initializeForwardOpTreePrinterLegacyPassPass(Registry); - initializeDeLICMWrapperPassPass(Registry); - initializeDeLICMPrinterLegacyPassPass(Registry); - initializeSimplifyWrapperPassPass(Registry); - initializeSimplifyPrinterLegacyPassPass(Registry); - initializeDumpModuleWrapperPassPass(Registry); - initializePruneUnprofitableWrapperPassPass(Registry); -} +static cl::opt<bool> + PollyPrintScops("polly-print-scops", + cl::desc("Print polyhedral description of all regions"), + cl::cat(PollyCategory)); + +static cl::opt<bool> PollyPrintDeps("polly-print-deps", + cl::desc("Polly - Print dependences"), + cl::cat(PollyCategory)); static bool shouldEnablePollyForOptimization() { return PollyEnabled; } @@ -266,6 +230,198 @@ static bool shouldEnablePollyForDiagnostic() { ExportJScop; } +/// Parser of parameters for LoopVectorize pass. 
+/// Parser of parameters for the Polly pass options.
+static llvm::Expected<PollyPassOptions> parsePollyOptions(StringRef Params,
+                                                          bool IsCustom) {
+  PassPhase PrevPhase = PassPhase::None;
+
+  bool EnableDefaultOpts = !IsCustom;
+  bool EnableEnd2End = !IsCustom;
+  std::optional<bool>
+      PassEnabled[static_cast<size_t>(PassPhase::PassPhaseLast) + 1];
+  PassPhase StopAfter = PassPhase::None;
+
+  // Passes enabled using command-line flags (can be overridden using
+  // 'polly<no-pass>')
+  if (PollyPrintDetect)
+    PassEnabled[static_cast<size_t>(PassPhase::PrintDetect)] = true;
+  if (PollyPrintScops)
+    PassEnabled[static_cast<size_t>(PassPhase::PrintScopInfo)] = true;
+  if (PollyPrintDeps)
+    PassEnabled[static_cast<size_t>(PassPhase::PrintDependences)] = true;
+
+  if (PollyViewer)
+    PassEnabled[static_cast<size_t>(PassPhase::ViewScops)] = true;
+  if (PollyOnlyViewer)
+    PassEnabled[static_cast<size_t>(PassPhase::ViewScopsOnly)] = true;
+  if (PollyPrinter)
+    PassEnabled[static_cast<size_t>(PassPhase::DotScops)] = true;
+  if (PollyOnlyPrinter)
+    PassEnabled[static_cast<size_t>(PassPhase::DotScopsOnly)] = true;
+  if (!EnableSimplify)
+    PassEnabled[static_cast<size_t>(PassPhase::Simplify0)] = false;
+  if (!EnableForwardOpTree)
+    PassEnabled[static_cast<size_t>(PassPhase::Optree)] = false;
+  if (!EnableDeLICM)
+    PassEnabled[static_cast<size_t>(PassPhase::DeLICM)] = false;
+  if (!EnableSimplify)
+    PassEnabled[static_cast<size_t>(PassPhase::Simplify1)] = false;
+  if (ImportJScop)
+    PassEnabled[static_cast<size_t>(PassPhase::ImportJScop)] = true;
+  if (DeadCodeElim)
+    PassEnabled[static_cast<size_t>(PassPhase::DeadCodeElimination)] = true;
+  if (FullyIndexedStaticExpansion)
+    PassEnabled[static_cast<size_t>(PassPhase::MaximumStaticExtension)] = true;
+  if (!EnablePruneUnprofitable)
+    PassEnabled[static_cast<size_t>(PassPhase::PruneUnprofitable)] = false;
+  switch (Optimizer) {
+  case OPTIMIZER_NONE:
+    // explicitly switched off
+    PassEnabled[static_cast<size_t>(PassPhase::Optimization)] = false;
+    break;
+  case OPTIMIZER_ISL:
+    // default: enabled
+    break;
+  }
+  if (ExportJScop)
+    PassEnabled[static_cast<size_t>(PassPhase::ExportJScop)] = true;
+  switch (CodeGeneration) {
+  case CODEGEN_AST:
+    PassEnabled[static_cast<size_t>(PassPhase::AstGen)] = true;
+    PassEnabled[static_cast<size_t>(PassPhase::CodeGen)] = false;
+    break;
+  case CODEGEN_FULL:
+    // default: ast and codegen enabled
+    break;
+  case CODEGEN_NONE:
+    PassEnabled[static_cast<size_t>(PassPhase::AstGen)] = false;
+    PassEnabled[static_cast<size_t>(PassPhase::CodeGen)] = false;
+    break;
+  }
+
+  while (!Params.empty()) {
+    StringRef Param;
+    std::tie(Param, Params) = Params.split(';');
+    auto [ParamName, ParamVal] = Param.split('=');
+
+    if (ParamName == "stopafter") {
+      StopAfter = parsePhase(ParamVal);
+      if (StopAfter == PassPhase::None)
+        return make_error<StringError>(
+            formatv("invalid stopafter parameter value '{0}'", ParamVal).str(),
+            inconvertibleErrorCode());
+      continue;
+    }
+
+    if (!ParamVal.empty())
+      return make_error<StringError>(
+          formatv("parameter '{0}' does not take a value", ParamName).str(),
+          inconvertibleErrorCode());
+
+    bool Enabled = true;
+    if (ParamName.starts_with("no-")) {
+      Enabled = false;
+      ParamName = ParamName.drop_front(3);
+    }
+
+    if (ParamName == "default-opts") {
+      EnableDefaultOpts = Enabled;
+      continue;
+    }
+
+    if (ParamName == "end2end") {
+      EnableEnd2End = Enabled;
+      continue;
+    }
+
+    PassPhase Phase;
+
+    // Shortcut for both simplify phases at the same time
+    if (ParamName == "simplify") {
+      PassEnabled[static_cast<size_t>(PassPhase::Simplify0)] = Enabled;
+      PassEnabled[static_cast<size_t>(PassPhase::Simplify1)] = Enabled;
+      Phase = PassPhase::Simplify0;
+    } else {
+      Phase = parsePhase(ParamName);
+      if (Phase == PassPhase::None)
+        return make_error<StringError>(
+            formatv("invalid Polly parameter/phase name '{0}'", ParamName)
+                .str(),
+            inconvertibleErrorCode());
+
+      if (PrevPhase >= Phase)
+        return make_error<StringError>(
+            formatv("phases must not be repeated and must be listed in "
+                    "order: '{0}' listed before '{1}'",
+                    getPhaseName(PrevPhase), getPhaseName(Phase))
+                .str(),
+            inconvertibleErrorCode());
+
+      PassEnabled[static_cast<size_t>(Phase)] = Enabled;
+    }
+    PrevPhase = Phase;
+  }
+
+  PollyPassOptions Opts;
+  Opts.ViewAll = ViewAll;
+  Opts.ViewFilter = ViewFilter;
+  Opts.PrintDepsAnalysisLevel = OptAnalysisLevel;
+
+  // Implicitly enable required phases first. May be overridden explicitly
+  // on/off later.
+  for (PassPhase P : llvm::enum_seq_inclusive(PassPhase::PassPhaseFirst,
+                                              PassPhase::PassPhaseLast)) {
+    bool Enabled = PassEnabled[static_cast<size_t>(P)].value_or(false);
+    if (!Enabled)
+      continue;
+
+    if (static_cast<size_t>(PassPhase::Detection) < static_cast<size_t>(P))
+      Opts.setPhaseEnabled(PassPhase::Detection);
+
+    if (static_cast<size_t>(PassPhase::ScopInfo) < static_cast<size_t>(P))
+      Opts.setPhaseEnabled(PassPhase::ScopInfo);
+
+    if (dependsOnDependenceInfo(P))
+      Opts.setPhaseEnabled(PassPhase::Dependences);
+
+    if (static_cast<size_t>(PassPhase::AstGen) < static_cast<size_t>(P))
+      Opts.setPhaseEnabled(PassPhase::AstGen);
+  }
+
+  if (EnableEnd2End)
+    Opts.enableEnd2End();
+
+  if (EnableDefaultOpts)
+    Opts.enableDefaultOpts();
+
+  for (PassPhase P : llvm::enum_seq_inclusive(PassPhase::PassPhaseFirst,
+                                              PassPhase::PassPhaseLast)) {
+    std::optional<bool> Enabled = PassEnabled[static_cast<size_t>(P)];
+
+    // Apply only if set explicitly.
+    if (Enabled.has_value())
+      Opts.setPhaseEnabled(P, *Enabled);
+  }
+
+  if (StopAfter != PassPhase::None)
+    Opts.disableAfter(StopAfter);
+
+  if (Error CheckResult = Opts.checkConsistency())
+    return CheckResult;
+
+  return Opts;
+}
+
+static llvm::Expected<PollyPassOptions>
+parsePollyDefaultOptions(StringRef Params) {
+  return parsePollyOptions(Params, false);
+}
+
+static llvm::Expected<PollyPassOptions>
+parsePollyCustomOptions(StringRef Params) {
+  return parsePollyOptions(Params, true);
+}
+
 /// Register Polly passes such that they form a polyhedral optimizer.
 ///
 /// The individual Polly passes are registered in the pass manager such that
@@ -305,77 +461,12 @@ static void buildCommonPollyPipeline(FunctionPassManager &PM,
                                      OptimizationLevel Level,
                                      bool EnableForOpt) {
   PassBuilder PB;
-  ScopPassManager SPM;
-
-  PM.addPass(CodePreparationPass());
-
-  // TODO add utility passes for the various command line options, once they're
-  // ported
-
-  if (PollyDetectOnly) {
-    // Don't add more passes other than the ScopPassManager's detection passes.
- PM.addPass(createFunctionToScopPassAdaptor(std::move(SPM))); - return; - } - if (PollyViewer) - PM.addPass(ScopViewer()); - if (PollyOnlyViewer) - PM.addPass(ScopOnlyViewer()); - if (PollyPrinter) - PM.addPass(ScopPrinter()); - if (PollyOnlyPrinter) - PM.addPass(ScopOnlyPrinter()); - if (EnableSimplify) - SPM.addPass(SimplifyPass(0)); - if (EnableForwardOpTree) - SPM.addPass(ForwardOpTreePass()); - if (EnableDeLICM) - SPM.addPass(DeLICMPass()); - if (EnableSimplify) - SPM.addPass(SimplifyPass(1)); - - if (ImportJScop) - SPM.addPass(JSONImportPass()); - - if (DeadCodeElim) - SPM.addPass(DeadCodeElimPass()); - - if (FullyIndexedStaticExpansion) - SPM.addPass(MaximalStaticExpansionPass()); - - if (EnablePruneUnprofitable) - SPM.addPass(PruneUnprofitablePass()); - - switch (Optimizer) { - case OPTIMIZER_NONE: - break; /* Do nothing */ - case OPTIMIZER_ISL: - SPM.addPass(IslScheduleOptimizerPass()); - break; - } - - if (ExportJScop) - SPM.addPass(JSONExportPass()); - - if (!EnableForOpt) - return; - - switch (CodeGeneration) { - case CODEGEN_AST: - SPM.addPass( - llvm::RequireAnalysisPass<IslAstAnalysis, Scop, ScopAnalysisManager, - ScopStandardAnalysisResults &, - SPMUpdater &>()); - break; - case CODEGEN_FULL: - SPM.addPass(CodeGenerationPass()); - break; - case CODEGEN_NONE: - break; - } + ExitOnError Err("Inconsistent Polly configuration: "); + PollyPassOptions &&Opts = + Err(parsePollyOptions(StringRef(), /*IsCustom=*/false)); + PM.addPass(PollyFunctionPass(Opts)); - PM.addPass(createFunctionToScopPassAdaptor(std::move(SPM))); PM.addPass(PB.buildFunctionSimplificationPipeline( Level, llvm::ThinOrFullLTOPhase::None)); // Cleanup @@ -492,8 +583,9 @@ parseCGPipeline(StringRef Name, llvm::CGSCCPassManager &CGPM, return false; } -static bool +static llvm::Expected<bool> parseFunctionPipeline(StringRef Name, FunctionPassManager &FPM, + PassInstrumentationCallbacks *PIC, ArrayRef<PassBuilder::PipelineElement> Pipeline) { if (llvm::parseAnalysisUtilityPasses<OwningScopAnalysisManagerFunctionProxy>( "polly-scop-analyses", Name, FPM)) @@ -505,8 +597,13 @@ parseFunctionPipeline(StringRef Name, FunctionPassManager &FPM, FPM)) \ return true; -#define FUNCTION_PASS(NAME, CREATE_PASS) \ - if (Name == NAME) { \ +#define FUNCTION_PASS(NAME, CREATE_PASS, PARSER) \ + if (PassBuilder::checkParametrizedPassName(Name, NAME)) { \ + auto ExpectedOpts = PassBuilder::parsePassParameters(PARSER, Name, NAME); \ + if (!ExpectedOpts) \ + return ExpectedOpts.takeError(); \ + auto &&Opts = *ExpectedOpts; \ + (void)Opts; \ FPM.addPass(CREATE_PASS); \ return true; \ } @@ -592,6 +689,28 @@ parseTopLevelPipeline(llvm::ModulePassManager &MPM, return true; } +static llvm::Expected<bool> +parseModulePipeline(StringRef Name, llvm::ModulePassManager &MPM, + PassInstrumentationCallbacks *PIC, + ArrayRef<PassBuilder::PipelineElement> Pipeline) { + assert(Pipeline.empty()); + +#define MODULE_PASS(NAME, CREATE_PASS, PARSER) \ + if (PassBuilder::checkParametrizedPassName(Name, NAME)) { \ + auto ExpectedOpts = PassBuilder::parsePassParameters(PARSER, Name, NAME); \ + if (!ExpectedOpts) \ + return ExpectedOpts.takeError(); \ + auto &&Opts = *ExpectedOpts; \ + (void)Opts; \ + MPM.addPass(CREATE_PASS); \ + return true; \ + } + +#include "PollyPasses.def" + + return false; +} + /// Register Polly to be available as an optimizer /// /// @@ -620,10 +739,36 @@ parseTopLevelPipeline(llvm::ModulePassManager &MPM, /// handle LICMed code to make it useful. 
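///
/// As an illustrative sketch of the resulting pipeline syntax (the first two
/// invocations are taken from the RUN lines updated later in this patch; the
/// full set of phase names is defined by PollyPasses.def, and the
/// 'stopafter=detect' spelling is only an assumed example):
///
///   opt -passes='polly<no-default-opts>' input.ll
///   opt -passes='polly-custom<detect>' -polly-print-detect input.ll
///   opt -passes='polly<stopafter=detect>' input.ll
///
/// A 'no-' prefix disables the named phase, and 'stopafter=<phase>' disables
/// everything after the given phase.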
void registerPollyPasses(PassBuilder &PB) {
   PassInstrumentationCallbacks *PIC = PB.getPassInstrumentationCallbacks();
+
+#define MODULE_PASS(NAME, CREATE_PASS, PARSER)                                 \
+  {                                                                            \
+    std::remove_reference_t<decltype(*PARSER(StringRef()))> Opts;              \
+    (void)Opts;                                                                \
+    PIC->addClassToPassName(decltype(CREATE_PASS)::name(), NAME);              \
+  }
+#define CGSCC_PASS(NAME, CREATE_PASS, PARSER)                                  \
+  {                                                                            \
+    std::remove_reference_t<decltype(*PARSER(StringRef()))> Opts;              \
+    (void)Opts;                                                                \
+    PIC->addClassToPassName(decltype(CREATE_PASS)::name(), NAME);              \
+  }
+#define FUNCTION_PASS(NAME, CREATE_PASS, PARSER)                               \
+  {                                                                            \
+    std::remove_reference_t<decltype(*PARSER(StringRef()))> Opts;              \
+    (void)Opts;                                                                \
+    PIC->addClassToPassName(decltype(CREATE_PASS)::name(), NAME);              \
+  }
+#include "PollyPasses.def"
+
   PB.registerAnalysisRegistrationCallback([PIC](FunctionAnalysisManager &FAM) {
     registerFunctionAnalyses(FAM, PIC);
   });
-  PB.registerPipelineParsingCallback(parseFunctionPipeline);
+  PB.registerPipelineParsingCallback(
+      [PIC](StringRef Name, FunctionPassManager &FPM,
+            ArrayRef<PassBuilder::PipelineElement> Pipeline) -> bool {
+        ExitOnError Err("Unable to parse Polly function pass: ");
+        return Err(parseFunctionPipeline(Name, FPM, PIC, Pipeline));
+      });
   PB.registerPipelineParsingCallback(
       [PIC](StringRef Name, FunctionPassManager &FPM,
             ArrayRef<PassBuilder::PipelineElement> Pipeline) -> bool {
@@ -635,6 +780,12 @@ void registerPollyPasses(PassBuilder &PB) {
           ExitOnError Err("Unable to parse Polly call graph pass: ");
           return Err(parseCGPipeline(Name, CGPM, PIC, Pipeline));
         });
+  PB.registerPipelineParsingCallback(
+      [PIC](StringRef Name, ModulePassManager &MPM,
+            ArrayRef<PassBuilder::PipelineElement> Pipeline) -> bool {
+        ExitOnError Err("Unable to parse Polly module pass: ");
+        return Err(parseModulePipeline(Name, MPM, PIC, Pipeline));
+      });
   PB.registerParseTopLevelPipelineCallback(
       [PIC](llvm::ModulePassManager &MPM,
             ArrayRef<PassBuilder::PipelineElement> Pipeline) -> bool {
diff --git a/polly/lib/Support/ScopHelper.cpp b/polly/lib/Support/ScopHelper.cpp
index a2328d1bbb3cf..cf0ec4432f747 100644
--- a/polly/lib/Support/ScopHelper.cpp
+++ b/polly/lib/Support/ScopHelper.cpp
@@ -206,18 +206,6 @@ void polly::splitEntryBlockForAlloca(BasicBlock *EntryBlock, DominatorTree *DT,
   splitBlock(EntryBlock, I, DT, LI, RI);
 }
 
-void polly::splitEntryBlockForAlloca(BasicBlock *EntryBlock, Pass *P) {
-  auto *DTWP = P->getAnalysisIfAvailable<DominatorTreeWrapperPass>();
-  auto *DT = DTWP ? &DTWP->getDomTree() : nullptr;
-  auto *LIWP = P->getAnalysisIfAvailable<LoopInfoWrapperPass>();
-  auto *LI = LIWP ? &LIWP->getLoopInfo() : nullptr;
-  RegionInfoPass *RIP = P->getAnalysisIfAvailable<RegionInfoPass>();
-  RegionInfo *RI = RIP ? &RIP->getRegionInfo() : nullptr;
-
-  // splitBlock updates DT, LI and RI.
- polly::splitEntryBlockForAlloca(EntryBlock, DT, LI, RI); -} - void polly::recordAssumption(polly::RecordedAssumptionsTy *RecordedAssumptions, polly::AssumptionKind Kind, isl::set Set, DebugLoc Loc, polly::AssumptionSign Sign, diff --git a/polly/lib/Transform/Canonicalization.cpp b/polly/lib/Transform/Canonicalization.cpp index 1be560e64af40..cd7195f5374df 100644 --- a/polly/lib/Transform/Canonicalization.cpp +++ b/polly/lib/Transform/Canonicalization.cpp @@ -13,7 +13,6 @@ //===----------------------------------------------------------------------===// #include "polly/Canonicalization.h" -#include "polly/LinkAllPasses.h" #include "polly/Options.h" #include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/ProfileSummaryInfo.h" @@ -39,24 +38,6 @@ static cl::opt<bool> cl::desc("Run an early inliner pass before Polly"), cl::Hidden, cl::cat(PollyCategory)); -void polly::registerCanonicalicationPasses(llvm::legacy::PassManagerBase &PM) { - bool UseMemSSA = true; - PM.add(llvm::createPromoteMemoryToRegisterPass()); - PM.add(llvm::createEarlyCSEPass(UseMemSSA)); - PM.add(llvm::createInstructionCombiningPass()); - PM.add(llvm::createCFGSimplificationPass()); - PM.add(llvm::createTailCallEliminationPass()); - PM.add(llvm::createCFGSimplificationPass()); - PM.add(llvm::createReassociatePass()); - if (PollyInliner) { - PM.add(llvm::createPromoteMemoryToRegisterPass()); - PM.add(llvm::createCFGSimplificationPass()); - PM.add(llvm::createInstructionCombiningPass()); - PM.add(createBarrierNoopPass()); - } - PM.add(llvm::createInstructionCombiningPass()); -} - /// Adapted from llvm::PassBuilder::buildInlinerPipeline static ModuleInlinerWrapperPass buildInlinePasses(llvm::OptimizationLevel Level) { @@ -125,49 +106,3 @@ polly::buildCanonicalicationPassesForNPM(llvm::ModulePassManager &MPM, return FPM; } - -namespace { -class PollyCanonicalize final : public ModulePass { - PollyCanonicalize(const PollyCanonicalize &) = delete; - const PollyCanonicalize &operator=(const PollyCanonicalize &) = delete; - -public: - static char ID; - - explicit PollyCanonicalize() : ModulePass(ID) {} - ~PollyCanonicalize(); - - /// @name FunctionPass interface. 
- //@{ - void getAnalysisUsage(AnalysisUsage &AU) const override; - void releaseMemory() override; - bool runOnModule(Module &M) override; - void print(raw_ostream &OS, const Module *) const override; - //@} -}; -} // namespace - -PollyCanonicalize::~PollyCanonicalize() {} - -void PollyCanonicalize::getAnalysisUsage(AnalysisUsage &AU) const {} - -void PollyCanonicalize::releaseMemory() {} - -bool PollyCanonicalize::runOnModule(Module &M) { - legacy::PassManager PM; - registerCanonicalicationPasses(PM); - PM.run(M); - - return true; -} - -void PollyCanonicalize::print(raw_ostream &OS, const Module *) const {} - -char PollyCanonicalize::ID = 0; - -Pass *polly::createPollyCanonicalizePass() { return new PollyCanonicalize(); } - -INITIALIZE_PASS_BEGIN(PollyCanonicalize, "polly-canonicalize", - "Polly - Run canonicalization passes", false, false) -INITIALIZE_PASS_END(PollyCanonicalize, "polly-canonicalize", - "Polly - Run canonicalization passes", false, false) diff --git a/polly/lib/Transform/CodePreparation.cpp b/polly/lib/Transform/CodePreparation.cpp index d045fb6b62c90..5b96c865ad80f 100644 --- a/polly/lib/Transform/CodePreparation.cpp +++ b/polly/lib/Transform/CodePreparation.cpp @@ -16,13 +16,11 @@ //===----------------------------------------------------------------------===// #include "polly/CodePreparation.h" -#include "polly/LinkAllPasses.h" #include "polly/Support/ScopHelper.h" #include "llvm/Analysis/DominanceFrontier.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/RegionInfo.h" #include "llvm/Analysis/ScalarEvolution.h" -#include "llvm/InitializePasses.h" using namespace llvm; using namespace polly; @@ -47,32 +45,6 @@ static bool runCodePreprationImpl(Function &F, DominatorTree *DT, LoopInfo *LI, return true; } -namespace { - -/// Prepare the IR for the scop detection. -/// -class CodePreparation final : public FunctionPass { - CodePreparation(const CodePreparation &) = delete; - const CodePreparation &operator=(const CodePreparation &) = delete; - - void clear(); - -public: - static char ID; - - explicit CodePreparation() : FunctionPass(ID) {} - ~CodePreparation(); - - /// @name FunctionPass interface. 
- //@{ - void getAnalysisUsage(AnalysisUsage &AU) const override; - void releaseMemory() override; - bool runOnFunction(Function &F) override; - void print(raw_ostream &OS, const Module *) const override; - //@} -}; -} // namespace - PreservedAnalyses CodePreparationPass::run(Function &F, FunctionAnalysisManager &FAM) { auto &DT = FAM.getResult<DominatorTreeAnalysis>(F); @@ -86,44 +58,3 @@ PreservedAnalyses CodePreparationPass::run(Function &F, PA.preserve<LoopAnalysis>(); return PA; } - -void CodePreparation::clear() {} - -CodePreparation::~CodePreparation() { clear(); } - -void CodePreparation::getAnalysisUsage(AnalysisUsage &AU) const { - AU.addRequired<LoopInfoWrapperPass>(); - - AU.addPreserved<LoopInfoWrapperPass>(); - AU.addPreserved<RegionInfoPass>(); - AU.addPreserved<DominatorTreeWrapperPass>(); - AU.addPreserved<DominanceFrontierWrapperPass>(); -} - -bool CodePreparation::runOnFunction(Function &F) { - if (skipFunction(F)) - return false; - - DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); - LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); - RegionInfo *RI = &getAnalysis<RegionInfoPass>().getRegionInfo(); - - runCodePreprationImpl(F, DT, LI, RI); - - return true; -} - -void CodePreparation::releaseMemory() { clear(); } - -void CodePreparation::print(raw_ostream &OS, const Module *) const {} - -char CodePreparation::ID = 0; -char &polly::CodePreparationID = CodePreparation::ID; - -Pass *polly::createCodePreparationPass() { return new CodePreparation(); } - -INITIALIZE_PASS_BEGIN(CodePreparation, "polly-prepare", - "Polly - Prepare code for polly", false, false) -INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) -INITIALIZE_PASS_END(CodePreparation, "polly-prepare", - "Polly - Prepare code for polly", false, false) diff --git a/polly/lib/Transform/DeLICM.cpp b/polly/lib/Transform/DeLICM.cpp index 9a9768afe113e..e8f2d951404f3 100644 --- a/polly/lib/Transform/DeLICM.cpp +++ b/polly/lib/Transform/DeLICM.cpp @@ -15,7 +15,6 @@ //===----------------------------------------------------------------------===// #include "polly/DeLICM.h" -#include "polly/LinkAllPasses.h" #include "polly/Options.h" #include "polly/ScopInfo.h" #include "polly/ScopPass.h" @@ -25,7 +24,6 @@ #include "polly/ZoneAlgo.h" #include "llvm/ADT/Statistic.h" #include "llvm/IR/Module.h" -#include "llvm/InitializePasses.h" #include "polly/Support/PollyDebug.h" #define DEBUG_TYPE "polly-delicm" @@ -35,6 +33,10 @@ using namespace llvm; namespace { +static cl::opt<bool> PollyPrintDeLICM("polly-print-delicm", + cl::desc("Polly - Print DeLICM/DePRE"), + cl::cat(PollyCategory)); + cl::opt<int> DelicmMaxOps("polly-delicm-max-ops", cl::desc("Maximum number of isl operations to invest for " @@ -1356,7 +1358,10 @@ class DeLICMImpl final : public ZoneAlgorithm { } /// Return whether at least one transformation been applied. 
- bool isModified() const { return NumberOfTargetsMapped > 0; } + bool isModified() const { + return NumberOfTargetsMapped > 0 || NumberOfMappedValueScalars > 0 || + NumberOfMappedPHIScalars > 0; + } }; static std::unique_ptr<DeLICMImpl> collapseToUnused(Scop &S, LoopInfo &LI) { @@ -1376,7 +1381,7 @@ static std::unique_ptr<DeLICMImpl> collapseToUnused(Scop &S, LoopInfo &LI) { return Impl; } -static std::unique_ptr<DeLICMImpl> runDeLICM(Scop &S, LoopInfo &LI) { +static std::unique_ptr<DeLICMImpl> runDeLICMImpl(Scop &S, LoopInfo &LI) { std::unique_ptr<DeLICMImpl> Impl = collapseToUnused(S, LI); Scop::ScopStatistics ScopStats = S.getStatistics(); @@ -1394,7 +1399,7 @@ static PreservedAnalyses runDeLICMUsingNPM(Scop &S, ScopAnalysisManager &SAM, ScopStandardAnalysisResults &SAR, SPMUpdater &U, raw_ostream *OS) { LoopInfo &LI = SAR.LI; - std::unique_ptr<DeLICMImpl> Impl = runDeLICM(S, LI); + std::unique_ptr<DeLICMImpl> Impl = runDeLICMImpl(S, LI); if (OS) { *OS << "Printing analysis 'Polly - DeLICM/DePRE' for region: '" @@ -1417,88 +1422,8 @@ static PreservedAnalyses runDeLICMUsingNPM(Scop &S, ScopAnalysisManager &SAM, PA.preserveSet<AllAnalysesOn<Loop>>(); return PA; } - -class DeLICMWrapperPass final : public ScopPass { -private: - DeLICMWrapperPass(const DeLICMWrapperPass &) = delete; - const DeLICMWrapperPass &operator=(const DeLICMWrapperPass &) = delete; - - /// The pass implementation, also holding per-scop data. - std::unique_ptr<DeLICMImpl> Impl; - -public: - static char ID; - explicit DeLICMWrapperPass() : ScopPass(ID) {} - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequiredTransitive<ScopInfoRegionPass>(); - AU.addRequired<LoopInfoWrapperPass>(); - AU.setPreservesAll(); - } - - bool runOnScop(Scop &S) override { - // Free resources for previous scop's computation, if not yet done. - releaseMemory(); - - auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); - Impl = runDeLICM(S, LI); - - return Impl->isModified(); - } - - void printScop(raw_ostream &OS, Scop &S) const override { - if (!Impl) - return; - assert(Impl->getScop() == &S); - - OS << "DeLICM result:\n"; - Impl->print(OS); - } - - void releaseMemory() override { Impl.reset(); } -}; - -char DeLICMWrapperPass::ID; - -/// Print result from DeLICMWrapperPass. 
-class DeLICMPrinterLegacyPass final : public ScopPass { -public: - static char ID; - - DeLICMPrinterLegacyPass() : DeLICMPrinterLegacyPass(outs()) {} - explicit DeLICMPrinterLegacyPass(llvm::raw_ostream &OS) - : ScopPass(ID), OS(OS) {} - - bool runOnScop(Scop &S) override { - DeLICMWrapperPass &P = getAnalysis<DeLICMWrapperPass>(); - - OS << "Printing analysis '" << P.getPassName() << "' for region: '" - << S.getRegion().getNameStr() << "' in function '" - << S.getFunction().getName() << "':\n"; - P.printScop(OS, S); - - return false; - } - - void getAnalysisUsage(AnalysisUsage &AU) const override { - ScopPass::getAnalysisUsage(AU); - AU.addRequired<DeLICMWrapperPass>(); - AU.setPreservesAll(); - } - -private: - llvm::raw_ostream &OS; -}; - -char DeLICMPrinterLegacyPass::ID = 0; } // anonymous namespace -Pass *polly::createDeLICMWrapperPass() { return new DeLICMWrapperPass(); } - -llvm::Pass *polly::createDeLICMPrinterLegacyPass(llvm::raw_ostream &OS) { - return new DeLICMPrinterLegacyPass(OS); -} - llvm::PreservedAnalyses polly::DeLICMPass::run(Scop &S, ScopAnalysisManager &SAM, ScopStandardAnalysisResults &SAR, @@ -1527,15 +1452,21 @@ bool polly::isConflicting( return Knowledge::isConflicting(Existing, Proposed, OS, Indent); } -INITIALIZE_PASS_BEGIN(DeLICMWrapperPass, "polly-delicm", "Polly - DeLICM/DePRE", - false, false) -INITIALIZE_PASS_DEPENDENCY(ScopInfoWrapperPass) -INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) -INITIALIZE_PASS_END(DeLICMWrapperPass, "polly-delicm", "Polly - DeLICM/DePRE", - false, false) - -INITIALIZE_PASS_BEGIN(DeLICMPrinterLegacyPass, "polly-print-delicm", - "Polly - Print DeLICM/DePRE", false, false) -INITIALIZE_PASS_DEPENDENCY(ScopInfoWrapperPass) -INITIALIZE_PASS_END(DeLICMPrinterLegacyPass, "polly-print-delicm", - "Polly - Print DeLICM/DePRE", false, false) +bool polly::runDeLICM(Scop &S) { + LoopInfo &LI = *S.getLI(); + std::unique_ptr<DeLICMImpl> Impl = runDeLICMImpl(S, LI); + + if (PollyPrintDeLICM) { + outs() << "Printing analysis 'Polly - DeLICM/DePRE' for region: '" + << S.getName() << "' in function '" << S.getFunction().getName() + << "':\n"; + if (Impl) { + assert(Impl->getScop() == &S); + + outs() << "DeLICM result:\n"; + Impl->print(outs()); + } + } + + return Impl->isModified(); +} diff --git a/polly/lib/Transform/DeadCodeElimination.cpp b/polly/lib/Transform/DeadCodeElimination.cpp index 5cb89fec09fe8..df95e5190431c 100644 --- a/polly/lib/Transform/DeadCodeElimination.cpp +++ b/polly/lib/Transform/DeadCodeElimination.cpp @@ -33,7 +33,6 @@ #include "polly/DeadCodeElimination.h" #include "polly/DependenceInfo.h" -#include "polly/LinkAllPasses.h" #include "polly/Options.h" #include "polly/ScopInfo.h" #include "llvm/Support/CommandLine.h" @@ -51,20 +50,6 @@ cl::opt<int> DCEPreciseSteps( "before the actual dead code elimination."), cl::init(-1), cl::cat(PollyCategory)); -class DeadCodeElimWrapperPass final : public ScopPass { -public: - static char ID; - explicit DeadCodeElimWrapperPass() : ScopPass(ID) {} - - /// Remove dead iterations from the schedule of @p S. - bool runOnScop(Scop &S) override; - - /// Register all analyses and transformation required. - void getAnalysisUsage(AnalysisUsage &AU) const override; -}; - -char DeadCodeElimWrapperPass::ID = 0; - /// Return the set of live iterations. 
///
/// The set of live iterations are all iterations that write to memory and for
@@ -144,29 +129,19 @@ static bool runDeadCodeElimination(Scop &S, int PreciseSteps,
   return S.restrictDomains(Live);
 }
 
-bool DeadCodeElimWrapperPass::runOnScop(Scop &S) {
-  auto &DI = getAnalysis<DependenceInfo>();
-  const Dependences &Deps = DI.getDependences(Dependences::AL_Statement);
+} // namespace
+
+bool polly::runDeadCodeElim(Scop &S, DependenceAnalysis::Result &DA) {
+  const Dependences &Deps = DA.getDependences(Dependences::AL_Statement);
 
   bool Changed = runDeadCodeElimination(S, DCEPreciseSteps, Deps);
 
   // FIXME: We can probably avoid the recomputation of all dependences by
   // updating them explicitly.
   if (Changed)
-    DI.recomputeDependences(Dependences::AL_Statement);
-
-  return false;
-}
-
-void DeadCodeElimWrapperPass::getAnalysisUsage(AnalysisUsage &AU) const {
-  ScopPass::getAnalysisUsage(AU);
-  AU.addRequired<DependenceInfo>();
-}
-
-} // namespace
+    DA.recomputeDependences(Dependences::AL_Statement);
 
-Pass *polly::createDeadCodeElimWrapperPass() {
-  return new DeadCodeElimWrapperPass();
+  return Changed;
 }
 
 llvm::PreservedAnalyses DeadCodeElimPass::run(Scop &S, ScopAnalysisManager &SAM,
@@ -191,10 +166,3 @@ llvm::PreservedAnalyses DeadCodeElimPass::run(Scop &S, ScopAnalysisManager &SAM,
   PA.preserveSet<AllAnalysesOn<Loop>>();
   return PA;
 }
-
-INITIALIZE_PASS_BEGIN(DeadCodeElimWrapperPass, "polly-dce",
-                      "Polly - Remove dead iterations", false, false)
-INITIALIZE_PASS_DEPENDENCY(DependenceInfo)
-INITIALIZE_PASS_DEPENDENCY(ScopInfoRegionPass)
-INITIALIZE_PASS_END(DeadCodeElimWrapperPass, "polly-dce",
-                    "Polly - Remove dead iterations", false, false)
diff --git a/polly/lib/Transform/FlattenSchedule.cpp b/polly/lib/Transform/FlattenSchedule.cpp
index f514ef359ba07..35a8ce6877036 100644
--- a/polly/lib/Transform/FlattenSchedule.cpp
+++ b/polly/lib/Transform/FlattenSchedule.cpp
@@ -14,6 +14,7 @@
 
 #include "polly/FlattenSchedule.h"
 #include "polly/FlattenAlgo.h"
+#include "polly/Options.h"
 #include "polly/ScopInfo.h"
 #include "polly/ScopPass.h"
 #include "polly/Support/ISLOStream.h"
@@ -26,6 +27,10 @@ using namespace llvm;
 
 namespace {
 
+static cl::opt<bool> PollyPrintFlattenSchedule("polly-print-flatten-schedule",
+                                               cl::desc("Polly - Print flattened schedule"),
+                                               cl::cat(PollyCategory));
+
 /// Print a schedule to @p OS.
 ///
 /// Prints the schedule for each statements on a new line.
@@ -34,119 +39,45 @@ void printSchedule(raw_ostream &OS, const isl::union_map &Schedule,
   for (isl::map Map : Schedule.get_map_list())
     OS.indent(indent) << Map << "\n";
 }
+} // namespace
 
-/// Flatten the schedule stored in an polly::Scop.
-class FlattenSchedule final : public ScopPass {
-private:
-  FlattenSchedule(const FlattenSchedule &) = delete;
-  const FlattenSchedule &operator=(const FlattenSchedule &) = delete;
-
-  std::shared_ptr<isl_ctx> IslCtx;
-  isl::union_map OldSchedule;
-
-public:
-  static char ID;
-  explicit FlattenSchedule() : ScopPass(ID) {}
-
-  void getAnalysisUsage(AnalysisUsage &AU) const override {
-    AU.addRequiredTransitive<ScopInfoRegionPass>();
-    AU.setPreservesAll();
-  }
-
-  bool runOnScop(Scop &S) override {
-    // Keep a reference to isl_ctx to ensure that it is not freed before we free
-    // OldSchedule.
-    IslCtx = S.getSharedIslCtx();
+void polly::runFlattenSchedulePass(Scop &S) {
+  // Keep a reference to isl_ctx to ensure that it is not freed before we free
+  // OldSchedule.
+ auto IslCtx = S.getSharedIslCtx(); - POLLY_DEBUG(dbgs() << "Going to flatten old schedule:\n"); - OldSchedule = S.getSchedule(); - POLLY_DEBUG(printSchedule(dbgs(), OldSchedule, 2)); + POLLY_DEBUG(dbgs() << "Going to flatten old schedule:\n"); + auto OldSchedule = S.getSchedule(); + POLLY_DEBUG(printSchedule(dbgs(), OldSchedule, 2)); - auto Domains = S.getDomains(); - auto RestrictedOldSchedule = OldSchedule.intersect_domain(Domains); - POLLY_DEBUG(dbgs() << "Old schedule with domains:\n"); - POLLY_DEBUG(printSchedule(dbgs(), RestrictedOldSchedule, 2)); + auto Domains = S.getDomains(); + auto RestrictedOldSchedule = OldSchedule.intersect_domain(Domains); + POLLY_DEBUG(dbgs() << "Old schedule with domains:\n"); + POLLY_DEBUG(printSchedule(dbgs(), RestrictedOldSchedule, 2)); - auto NewSchedule = flattenSchedule(RestrictedOldSchedule); + auto NewSchedule = flattenSchedule(RestrictedOldSchedule); - POLLY_DEBUG(dbgs() << "Flattened new schedule:\n"); - POLLY_DEBUG(printSchedule(dbgs(), NewSchedule, 2)); + POLLY_DEBUG(dbgs() << "Flattened new schedule:\n"); + POLLY_DEBUG(printSchedule(dbgs(), NewSchedule, 2)); - NewSchedule = NewSchedule.gist_domain(Domains); - POLLY_DEBUG(dbgs() << "Gisted, flattened new schedule:\n"); - POLLY_DEBUG(printSchedule(dbgs(), NewSchedule, 2)); + NewSchedule = NewSchedule.gist_domain(Domains); + POLLY_DEBUG(dbgs() << "Gisted, flattened new schedule:\n"); + POLLY_DEBUG(printSchedule(dbgs(), NewSchedule, 2)); - S.setSchedule(NewSchedule); - return false; - } + S.setSchedule(NewSchedule); - void printScop(raw_ostream &OS, Scop &S) const override { - OS << "Schedule before flattening {\n"; - printSchedule(OS, OldSchedule, 4); - OS << "}\n\n"; + if (PollyPrintFlattenSchedule) { + outs() + << "Printing analysis 'Polly - Print flattened schedule' for region: '" + << S.getRegion().getNameStr() << "' in function '" + << S.getFunction().getName() << "':\n"; - OS << "Schedule after flattening {\n"; - printSchedule(OS, S.getSchedule(), 4); - OS << "}\n"; - } + outs() << "Schedule before flattening {\n"; + printSchedule(outs(), OldSchedule, 4); + outs() << "}\n\n"; - void releaseMemory() override { - OldSchedule = {}; - IslCtx.reset(); + outs() << "Schedule after flattening {\n"; + printSchedule(outs(), S.getSchedule(), 4); + outs() << "}\n"; } -}; - -char FlattenSchedule::ID; - -/// Print result from FlattenSchedule. 
-class FlattenSchedulePrinterLegacyPass final : public ScopPass { -public: - static char ID; - - FlattenSchedulePrinterLegacyPass() - : FlattenSchedulePrinterLegacyPass(outs()) {} - explicit FlattenSchedulePrinterLegacyPass(llvm::raw_ostream &OS) - : ScopPass(ID), OS(OS) {} - - bool runOnScop(Scop &S) override { - FlattenSchedule &P = getAnalysis<FlattenSchedule>(); - - OS << "Printing analysis '" << P.getPassName() << "' for region: '" - << S.getRegion().getNameStr() << "' in function '" - << S.getFunction().getName() << "':\n"; - P.printScop(OS, S); - - return false; - } - - void getAnalysisUsage(AnalysisUsage &AU) const override { - ScopPass::getAnalysisUsage(AU); - AU.addRequired<FlattenSchedule>(); - AU.setPreservesAll(); - } - -private: - llvm::raw_ostream &OS; -}; - -char FlattenSchedulePrinterLegacyPass::ID = 0; -} // anonymous namespace - -Pass *polly::createFlattenSchedulePass() { return new FlattenSchedule(); } - -Pass *polly::createFlattenSchedulePrinterLegacyPass(llvm::raw_ostream &OS) { - return new FlattenSchedulePrinterLegacyPass(OS); } - -INITIALIZE_PASS_BEGIN(FlattenSchedule, "polly-flatten-schedule", - "Polly - Flatten schedule", false, false) -INITIALIZE_PASS_END(FlattenSchedule, "polly-flatten-schedule", - "Polly - Flatten schedule", false, false) - -INITIALIZE_PASS_BEGIN(FlattenSchedulePrinterLegacyPass, - "polly-print-flatten-schedule", - "Polly - Print flattened schedule", false, false) -INITIALIZE_PASS_DEPENDENCY(FlattenSchedule) -INITIALIZE_PASS_END(FlattenSchedulePrinterLegacyPass, - "polly-print-flatten-schedule", - "Polly - Print flattened schedule", false, false) diff --git a/polly/lib/Transform/ForwardOpTree.cpp b/polly/lib/Transform/ForwardOpTree.cpp index e9be6c9cdcc27..24d4a4af6e681 100644 --- a/polly/lib/Transform/ForwardOpTree.cpp +++ b/polly/lib/Transform/ForwardOpTree.cpp @@ -28,7 +28,6 @@ #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Value.h" -#include "llvm/InitializePasses.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Compiler.h" @@ -62,6 +61,11 @@ static cl::opt<unsigned> "analysis; 0=no limit"), cl::init(1000000), cl::cat(PollyCategory), cl::Hidden); +static cl::opt<bool> + PollyPrintOptree("polly-print-optree", + cl::desc("Polly - Print forward operand tree result"), + cl::cat(PollyCategory)); + STATISTIC(KnownAnalyzed, "Number of successfully analyzed SCoPs"); STATISTIC(KnownOutOfQuota, "Analyses aborted because max_operations was reached"); @@ -1030,8 +1034,8 @@ class ForwardOpTreeImpl final : ZoneAlgorithm { bool isModified() const { return Modified; } }; -static std::unique_ptr<ForwardOpTreeImpl> runForwardOpTree(Scop &S, - LoopInfo &LI) { +static std::unique_ptr<ForwardOpTreeImpl> runForwardOpTreeImpl(Scop &S, + LoopInfo &LI) { std::unique_ptr<ForwardOpTreeImpl> Impl; { IslMaxOperationsGuard MaxOpGuard(S.getIslCtx().get(), MaxOps, false); @@ -1073,7 +1077,7 @@ runForwardOpTreeUsingNPM(Scop &S, ScopAnalysisManager &SAM, raw_ostream *OS) { LoopInfo &LI = SAR.LI; - std::unique_ptr<ForwardOpTreeImpl> Impl = runForwardOpTree(S, LI); + std::unique_ptr<ForwardOpTreeImpl> Impl = runForwardOpTreeImpl(S, LI); if (OS) { *OS << "Printing analysis 'Polly - Forward operand tree' for region: '" << S.getName() << "' in function '" << S.getFunction().getName() @@ -1094,99 +1098,8 @@ runForwardOpTreeUsingNPM(Scop &S, ScopAnalysisManager &SAM, PA.preserveSet<AllAnalysesOn<Loop>>(); return PA; } - -/// Pass that redirects scalar reads to array elements that are known to 
contain -/// the same value. -/// -/// This reduces the number of scalar accesses and therefore potentially -/// increases the freedom of the scheduler. In the ideal case, all reads of a -/// scalar definition are redirected (We currently do not care about removing -/// the write in this case). This is also useful for the main DeLICM pass as -/// there are less scalars to be mapped. -class ForwardOpTreeWrapperPass final : public ScopPass { -private: - /// The pass implementation, also holding per-scop data. - std::unique_ptr<ForwardOpTreeImpl> Impl; - -public: - static char ID; - - explicit ForwardOpTreeWrapperPass() : ScopPass(ID) {} - ForwardOpTreeWrapperPass(const ForwardOpTreeWrapperPass &) = delete; - ForwardOpTreeWrapperPass & - operator=(const ForwardOpTreeWrapperPass &) = delete; - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequiredTransitive<ScopInfoRegionPass>(); - AU.addRequired<LoopInfoWrapperPass>(); - AU.setPreservesAll(); - } - - bool runOnScop(Scop &S) override { - // Free resources for previous SCoP's computation, if not yet done. - releaseMemory(); - - LoopInfo &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); - - Impl = runForwardOpTree(S, LI); - - return false; - } - - void printScop(raw_ostream &OS, Scop &S) const override { - if (!Impl) - return; - - assert(Impl->getScop() == &S); - Impl->print(OS); - } - - void releaseMemory() override { Impl.reset(); } -}; // class ForwardOpTree - -char ForwardOpTreeWrapperPass::ID; - -/// Print result from ForwardOpTreeWrapperPass. -class ForwardOpTreePrinterLegacyPass final : public ScopPass { -public: - static char ID; - - ForwardOpTreePrinterLegacyPass() : ForwardOpTreePrinterLegacyPass(outs()) {} - explicit ForwardOpTreePrinterLegacyPass(llvm::raw_ostream &OS) - : ScopPass(ID), OS(OS) {} - - bool runOnScop(Scop &S) override { - ForwardOpTreeWrapperPass &P = getAnalysis<ForwardOpTreeWrapperPass>(); - - OS << "Printing analysis '" << P.getPassName() << "' for region: '" - << S.getRegion().getNameStr() << "' in function '" - << S.getFunction().getName() << "':\n"; - P.printScop(OS, S); - - return false; - } - - void getAnalysisUsage(AnalysisUsage &AU) const override { - ScopPass::getAnalysisUsage(AU); - AU.addRequired<ForwardOpTreeWrapperPass>(); - AU.setPreservesAll(); - } - -private: - llvm::raw_ostream &OS; -}; - -char ForwardOpTreePrinterLegacyPass::ID = 0; } // namespace -Pass *polly::createForwardOpTreeWrapperPass() { - return new ForwardOpTreeWrapperPass(); -} - -Pass *polly::createForwardOpTreePrinterLegacyPass(llvm::raw_ostream &OS) { - return new ForwardOpTreePrinterLegacyPass(OS); -} - llvm::PreservedAnalyses ForwardOpTreePass::run(Scop &S, ScopAnalysisManager &SAM, ScopStandardAnalysisResults &SAR, @@ -1200,14 +1113,20 @@ ForwardOpTreePrinterPass::run(Scop &S, ScopAnalysisManager &SAM, return runForwardOpTreeUsingNPM(S, SAM, SAR, U, &OS); } -INITIALIZE_PASS_BEGIN(ForwardOpTreeWrapperPass, "polly-optree", - "Polly - Forward operand tree", false, false) -INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) -INITIALIZE_PASS_END(ForwardOpTreeWrapperPass, "polly-optree", - "Polly - Forward operand tree", false, false) - -INITIALIZE_PASS_BEGIN(ForwardOpTreePrinterLegacyPass, "polly-print-optree", - "Polly - Print forward operand tree result", false, false) -INITIALIZE_PASS_DEPENDENCY(ForwardOpTreeWrapperPass) -INITIALIZE_PASS_END(ForwardOpTreePrinterLegacyPass, "polly-print-optree", - "Polly - Print forward operand tree result", false, false) +bool polly::runForwardOpTree(Scop &S) { + LoopInfo &LI = 
*S.getLI(); + + std::unique_ptr<ForwardOpTreeImpl> Impl = runForwardOpTreeImpl(S, LI); + if (PollyPrintOptree) { + outs() << "Printing analysis 'Polly - Forward operand tree' for region: '" + << S.getName() << "' in function '" << S.getFunction().getName() + << "':\n"; + if (Impl) { + assert(Impl->getScop() == &S); + + Impl->print(outs()); + } + } + + return Impl->isModified(); +} diff --git a/polly/lib/Transform/MaximalStaticExpansion.cpp b/polly/lib/Transform/MaximalStaticExpansion.cpp index 0719840f74a79..62a4d251875c5 100644 --- a/polly/lib/Transform/MaximalStaticExpansion.cpp +++ b/polly/lib/Transform/MaximalStaticExpansion.cpp @@ -13,14 +13,13 @@ #include "polly/MaximalStaticExpansion.h" #include "polly/DependenceInfo.h" -#include "polly/LinkAllPasses.h" +#include "polly/Options.h" #include "polly/ScopInfo.h" #include "polly/ScopPass.h" #include "polly/Support/ISLTools.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/StringRef.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" -#include "llvm/InitializePasses.h" #include "isl/isl-noexceptions.h" #include "isl/union_map.h" #include <cassert> @@ -35,28 +34,10 @@ using namespace polly; namespace { -class MaximalStaticExpanderWrapperPass final : public ScopPass { -public: - static char ID; - - explicit MaximalStaticExpanderWrapperPass() : ScopPass(ID) {} - - ~MaximalStaticExpanderWrapperPass() override = default; - - /// Expand the accesses of the SCoP. - /// - /// @param S The SCoP that must be expanded. - bool runOnScop(Scop &S) override; - - /// Print the SCoP. - /// - /// @param OS The stream where to print. - /// @param S The SCop that must be printed. - void printScop(raw_ostream &OS, Scop &S) const override; - - /// Register all analyses and transformations required. - void getAnalysisUsage(AnalysisUsage &AU) const override; -}; +static cl::opt<bool> + PollyPrintMSE("polly-print-mse", + cl::desc("Polly - Print Maximal static expansion of SCoP"), + cl::cat(PollyCategory)); #ifndef NDEBUG /// Whether a dimension of a set is bounded (lower and upper) by a constant, @@ -458,8 +439,8 @@ class MaximalStaticExpansionImpl { }; static std::unique_ptr<MaximalStaticExpansionImpl> -runMaximalStaticExpansion(Scop &S, OptimizationRemarkEmitter &ORE, - const Dependences &D) { +runMaximalStaticExpansionImpl(Scop &S, OptimizationRemarkEmitter &ORE, + const Dependences &D) { auto Dependences = D.getDependences(Dependences::TYPE_RAW); std::unique_ptr<MaximalStaticExpansionImpl> Impl = @@ -478,7 +459,7 @@ static PreservedAnalyses runMSEUsingNPM(Scop &S, ScopAnalysisManager &SAM, auto &D = DI.getDependences(Dependences::AL_Reference); std::unique_ptr<MaximalStaticExpansionImpl> Impl = - runMaximalStaticExpansion(S, ORE, D); + runMaximalStaticExpansionImpl(S, ORE, D); if (OS) { *OS << "Printing analysis 'Polly - Maximal static expansion of SCoP' for " @@ -511,42 +492,24 @@ MaximalStaticExpansionPrinterPass::run(Scop &S, ScopAnalysisManager &SAM, return runMSEUsingNPM(S, SAM, SAR, &OS); } -char MaximalStaticExpanderWrapperPass::ID = 0; - -bool MaximalStaticExpanderWrapperPass::runOnScop(Scop &S) { - // Get the ORE from OptimizationRemarkEmitterWrapperPass. - OptimizationRemarkEmitter *ORE = - &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); +void polly::runMaximalStaticExpansion(Scop &S, DependenceAnalysis::Result &DI) { + OptimizationRemarkEmitter ORE(&S.getFunction()); - // Get the RAW Dependences. 
-  auto &DI = getAnalysis<DependenceInfo>();
   auto &D = DI.getDependences(Dependences::AL_Reference);
 
   std::unique_ptr<MaximalStaticExpansionImpl> Impl =
-      runMaximalStaticExpansion(S, *ORE, D);
+      runMaximalStaticExpansionImpl(S, ORE, D);
 
-  return false;
-}
-
-void MaximalStaticExpanderWrapperPass::printScop(raw_ostream &OS,
-                                                 Scop &S) const {
-  S.print(OS, false);
-}
-
-void MaximalStaticExpanderWrapperPass::getAnalysisUsage(
-    AnalysisUsage &AU) const {
-  ScopPass::getAnalysisUsage(AU);
-  AU.addRequired<DependenceInfo>();
-  AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
-}
+  if (PollyPrintMSE) {
+    outs()
+        << "Printing analysis 'Polly - Maximal static expansion of SCoP' for "
+           "region: '"
+        << S.getName() << "' in function '" << S.getFunction().getName()
+        << "':\n";
 
-Pass *polly::createMaximalStaticExpansionPass() {
-  return new MaximalStaticExpanderWrapperPass();
+    if (Impl) {
+      outs() << "MSE result:\n";
+      Impl->print(llvm::outs());
+    }
+  }
 }
-
-INITIALIZE_PASS_BEGIN(MaximalStaticExpanderWrapperPass, "polly-mse",
-                      "Polly - Maximal static expansion of SCoP", false, false);
-INITIALIZE_PASS_DEPENDENCY(DependenceInfo);
-INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass);
-INITIALIZE_PASS_END(MaximalStaticExpanderWrapperPass, "polly-mse",
-                    "Polly - Maximal static expansion of SCoP", false, false)
diff --git a/polly/lib/Transform/ScheduleOptimizer.cpp b/polly/lib/Transform/ScheduleOptimizer.cpp
index 0888ebd7a9362..6acdd6862c4cc 100644
--- a/polly/lib/Transform/ScheduleOptimizer.cpp
+++ b/polly/lib/Transform/ScheduleOptimizer.cpp
@@ -57,7 +57,6 @@
 #include "llvm/ADT/Sequence.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
-#include "llvm/InitializePasses.h"
 #include "llvm/Support/CommandLine.h"
 
 #include "isl/options.h"
@@ -198,6 +197,10 @@ static cl::opt<bool> OptimizedScops(
                  "transformations is applied on the schedule tree"),
     cl::cat(PollyCategory));
 
+static cl::opt<bool> PollyPrintOptIsl("polly-print-opt-isl",
+                                      cl::desc("Polly - Print optimizer schedule of SCoP"),
+                                      cl::cat(PollyCategory));
+
 STATISTIC(ScopsProcessed, "Number of scops processed");
 STATISTIC(ScopsRescheduled, "Number of scops rescheduled");
 STATISTIC(ScopsOptimized, "Number of scops optimized");
@@ -638,34 +641,6 @@ bool ScheduleTreeOptimizer::isProfitableSchedule(Scop &S,
   return changed;
 }
 
-class IslScheduleOptimizerWrapperPass final : public ScopPass {
-public:
-  static char ID;
-
-  explicit IslScheduleOptimizerWrapperPass() : ScopPass(ID) {}
-
-  /// Optimize the schedule of the SCoP @p S.
-  bool runOnScop(Scop &S) override;
-
-  /// Print the new schedule for the SCoP @p S.
-  void printScop(raw_ostream &OS, Scop &S) const override;
-
-  /// Register all analyses and transformation required.
-  void getAnalysisUsage(AnalysisUsage &AU) const override;
-
-  /// Release the internal memory.
- void releaseMemory() override { - LastSchedule = {}; - IslCtx.reset(); - } - -private: - std::shared_ptr<isl_ctx> IslCtx; - isl::schedule LastSchedule; -}; - -char IslScheduleOptimizerWrapperPass::ID = 0; - #ifndef NDEBUG static void printSchedule(llvm::raw_ostream &OS, const isl::schedule &Schedule, StringRef Desc) { @@ -733,7 +708,7 @@ static void walkScheduleTreeForStatistics(isl::schedule Schedule, int Version) { &Version); } -static void runIslScheduleOptimizer( +static void runIslScheduleOptimizerImpl( Scop &S, function_ref<const Dependences &(Dependences::AnalysisLevel)> GetDeps, TargetTransformInfo *TTI, OptimizationRemarkEmitter *ORE, @@ -950,30 +925,6 @@ static void runIslScheduleOptimizer( errs() << S; } -bool IslScheduleOptimizerWrapperPass::runOnScop(Scop &S) { - releaseMemory(); - - Function &F = S.getFunction(); - IslCtx = S.getSharedIslCtx(); - - auto getDependences = - [this](Dependences::AnalysisLevel) -> const Dependences & { - return getAnalysis<DependenceInfo>().getDependences( - Dependences::AL_Statement); - }; - OptimizationRemarkEmitter &ORE = - getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); - TargetTransformInfo *TTI = - &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); - - bool DepsChanged = false; - runIslScheduleOptimizer(S, getDependences, TTI, &ORE, LastSchedule, - DepsChanged); - if (DepsChanged) - getAnalysis<DependenceInfo>().abandonDependences(); - return false; -} - static void runScheduleOptimizerPrinter(raw_ostream &OS, isl::schedule LastSchedule) { isl_printer *p; @@ -997,36 +948,8 @@ static void runScheduleOptimizerPrinter(raw_ostream &OS, free(ScheduleStr); } -void IslScheduleOptimizerWrapperPass::printScop(raw_ostream &OS, Scop &) const { - runScheduleOptimizerPrinter(OS, LastSchedule); -} - -void IslScheduleOptimizerWrapperPass::getAnalysisUsage( - AnalysisUsage &AU) const { - ScopPass::getAnalysisUsage(AU); - AU.addRequired<DependenceInfo>(); - AU.addRequired<TargetTransformInfoWrapperPass>(); - AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); - - AU.addPreserved<DependenceInfo>(); - AU.addPreserved<OptimizationRemarkEmitterWrapperPass>(); -} - } // namespace -Pass *polly::createIslScheduleOptimizerWrapperPass() { - return new IslScheduleOptimizerWrapperPass(); -} - -INITIALIZE_PASS_BEGIN(IslScheduleOptimizerWrapperPass, "polly-opt-isl", - "Polly - Optimize schedule of SCoP", false, false); -INITIALIZE_PASS_DEPENDENCY(DependenceInfo); -INITIALIZE_PASS_DEPENDENCY(ScopInfoRegionPass); -INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass); -INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass); -INITIALIZE_PASS_END(IslScheduleOptimizerWrapperPass, "polly-opt-isl", - "Polly - Optimize schedule of SCoP", false, false) - static llvm::PreservedAnalyses runIslScheduleOptimizerUsingNPM(Scop &S, ScopAnalysisManager &SAM, ScopStandardAnalysisResults &SAR, SPMUpdater &U, @@ -1039,7 +962,7 @@ runIslScheduleOptimizerUsingNPM(Scop &S, ScopAnalysisManager &SAM, TargetTransformInfo *TTI = &SAR.TTI; isl::schedule LastSchedule; bool DepsChanged = false; - runIslScheduleOptimizer(S, GetDeps, TTI, &ORE, LastSchedule, DepsChanged); + runIslScheduleOptimizerImpl(S, GetDeps, TTI, &ORE, LastSchedule, DepsChanged); if (DepsChanged) Deps.abandonDependences(); @@ -1065,52 +988,23 @@ IslScheduleOptimizerPrinterPass::run(Scop &S, ScopAnalysisManager &SAM, return runIslScheduleOptimizerUsingNPM(S, SAM, SAR, U, &OS); } -//===----------------------------------------------------------------------===// - -namespace { -/// Print 
result from IslScheduleOptimizerWrapperPass. -class IslScheduleOptimizerPrinterLegacyPass final : public ScopPass { -public: - static char ID; - - IslScheduleOptimizerPrinterLegacyPass() - : IslScheduleOptimizerPrinterLegacyPass(outs()) {} - explicit IslScheduleOptimizerPrinterLegacyPass(llvm::raw_ostream &OS) - : ScopPass(ID), OS(OS) {} - - bool runOnScop(Scop &S) override { - IslScheduleOptimizerWrapperPass &P = - getAnalysis<IslScheduleOptimizerWrapperPass>(); - - OS << "Printing analysis '" << P.getPassName() << "' for region: '" - << S.getRegion().getNameStr() << "' in function '" - << S.getFunction().getName() << "':\n"; - P.printScop(OS, S); - - return false; - } +void polly::runIslScheduleOptimizer(Scop &S, TargetTransformInfo *TTI, + DependenceAnalysis::Result &Deps) { + auto GetDeps = [&Deps](Dependences::AnalysisLevel) -> const Dependences & { + return Deps.getDependences(Dependences::AL_Statement); + }; + OptimizationRemarkEmitter ORE(&S.getFunction()); + isl::schedule LastSchedule; + bool DepsChanged = false; + runIslScheduleOptimizerImpl(S, GetDeps, TTI, &ORE, LastSchedule, DepsChanged); + if (DepsChanged) + Deps.abandonDependences(); - void getAnalysisUsage(AnalysisUsage &AU) const override { - ScopPass::getAnalysisUsage(AU); - AU.addRequired<IslScheduleOptimizerWrapperPass>(); - AU.setPreservesAll(); + if (PollyPrintOptIsl) { + outs() + << "Printing analysis 'Polly - Optimize schedule of SCoP' for region: '" + << S.getName() << "' in function '" << S.getFunction().getName() + << "':\n"; + runScheduleOptimizerPrinter(outs(), LastSchedule); } - -private: - llvm::raw_ostream &OS; -}; - -char IslScheduleOptimizerPrinterLegacyPass::ID = 0; -} // namespace - -Pass *polly::createIslScheduleOptimizerPrinterLegacyPass(raw_ostream &OS) { - return new IslScheduleOptimizerPrinterLegacyPass(OS); } - -INITIALIZE_PASS_BEGIN(IslScheduleOptimizerPrinterLegacyPass, - "polly-print-opt-isl", - "Polly - Print optimizer schedule of SCoP", false, false); -INITIALIZE_PASS_DEPENDENCY(IslScheduleOptimizerWrapperPass) -INITIALIZE_PASS_END(IslScheduleOptimizerPrinterLegacyPass, - "polly-print-opt-isl", - "Polly - Print optimizer schedule of SCoP", false, false) diff --git a/polly/lib/Transform/ScopInliner.cpp b/polly/lib/Transform/ScopInliner.cpp index c04ba3498339e..8e7a0dedaf533 100644 --- a/polly/lib/Transform/ScopInliner.cpp +++ b/polly/lib/Transform/ScopInliner.cpp @@ -95,53 +95,7 @@ template <typename SCC_t> bool runScopInlinerImpl(Function *F, SCC_t &SCC) { return Changed; } - -class ScopInlinerWrapperPass final : public CallGraphSCCPass { - using llvm::Pass::doInitialization; - -public: - static char ID; - - ScopInlinerWrapperPass() : CallGraphSCCPass(ID) {} - - bool doInitialization(CallGraph &CG) override { - if (!polly::PollyAllowFullFunction) { - report_fatal_error( - "Aborting from ScopInliner because it only makes sense to run with " - "-polly-allow-full-function. " - "The heurtistic for ScopInliner checks that the full function is a " - "Scop, which happens if and only if polly-allow-full-function is " - " enabled. 
" - " If not, the entry block is not included in the Scop"); - } - return true; - } - - bool runOnSCC(CallGraphSCC &SCC) override { - Function *F = (*SCC.begin())->getFunction(); - return runScopInlinerImpl(F, SCC); - }; - - void getAnalysisUsage(AnalysisUsage &AU) const override { - CallGraphSCCPass::getAnalysisUsage(AU); - } -}; } // namespace -char ScopInlinerWrapperPass::ID; - -Pass *polly::createScopInlinerWrapperPass() { - ScopInlinerWrapperPass *pass = new ScopInlinerWrapperPass(); - return pass; -} - -INITIALIZE_PASS_BEGIN( - ScopInlinerWrapperPass, "polly-scop-inliner", - "inline functions based on how much of the function is a scop.", false, - false) -INITIALIZE_PASS_END( - ScopInlinerWrapperPass, "polly-scop-inliner", - "inline functions based on how much of the function is a scop.", false, - false) polly::ScopInlinerPass::ScopInlinerPass() { if (!polly::PollyAllowFullFunction) { diff --git a/polly/lib/Transform/Simplify.cpp b/polly/lib/Transform/Simplify.cpp index 75e91cd1c031a..cf0f8c5ca5ef2 100644 --- a/polly/lib/Transform/Simplify.cpp +++ b/polly/lib/Transform/Simplify.cpp @@ -11,6 +11,7 @@ //===----------------------------------------------------------------------===// #include "polly/Simplify.h" +#include "polly/Options.h" #include "polly/ScopInfo.h" #include "polly/ScopPass.h" #include "polly/Support/GICHelper.h" @@ -18,7 +19,6 @@ #include "polly/Support/ISLTools.h" #include "polly/Support/VirtualInstruction.h" #include "llvm/ADT/Statistic.h" -#include "llvm/InitializePasses.h" #include "llvm/Support/Debug.h" #include <optional> @@ -30,6 +30,11 @@ using namespace polly; namespace { +static cl::opt<bool> + PollyPrintSimplify("polly-print-simplify", + cl::desc("Polly - Print Simplify actions"), + cl::cat(PollyCategory)); + #define TWO_STATISTICS(VARNAME, DESC) \ static llvm::Statistic VARNAME[2] = { \ {DEBUG_TYPE, #VARNAME "0", DESC " (first)"}, \ @@ -756,39 +761,6 @@ void SimplifyImpl::printScop(raw_ostream &OS, Scop &S) const { printAccesses(OS); } -class SimplifyWrapperPass final : public ScopPass { -public: - static char ID; - int CallNo; - std::optional<SimplifyImpl> Impl; - - explicit SimplifyWrapperPass(int CallNo = 0) : ScopPass(ID), CallNo(CallNo) {} - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequiredTransitive<ScopInfoRegionPass>(); - AU.addRequired<LoopInfoWrapperPass>(); - AU.setPreservesAll(); - } - - bool runOnScop(Scop &S) override { - LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); - - Impl.emplace(CallNo); - Impl->run(S, LI); - - return false; - } - - void printScop(raw_ostream &OS, Scop &S) const override { - if (Impl) - Impl->printScop(OS, S); - } - - void releaseMemory() override { Impl.reset(); } -}; - -char SimplifyWrapperPass::ID; - static llvm::PreservedAnalyses runSimplifyUsingNPM(Scop &S, ScopAnalysisManager &SAM, ScopStandardAnalysisResults &SAR, SPMUpdater &U, int CallNo, @@ -843,58 +815,15 @@ SmallVector<MemoryAccess *, 32> polly::getAccessesInOrder(ScopStmt &Stmt) { return Accesses; } -Pass *polly::createSimplifyWrapperPass(int CallNo) { - return new SimplifyWrapperPass(CallNo); -} - -INITIALIZE_PASS_BEGIN(SimplifyWrapperPass, "polly-simplify", "Polly - Simplify", - false, false) -INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) -INITIALIZE_PASS_END(SimplifyWrapperPass, "polly-simplify", "Polly - Simplify", - false, false) - -//===----------------------------------------------------------------------===// - -namespace { -/// Print result from SimplifyWrapperPass. 
-class SimplifyPrinterLegacyPass final : public ScopPass { -public: - static char ID; - - SimplifyPrinterLegacyPass() : SimplifyPrinterLegacyPass(outs()) {} - explicit SimplifyPrinterLegacyPass(llvm::raw_ostream &OS) - : ScopPass(ID), OS(OS) {} - - bool runOnScop(Scop &S) override { - SimplifyWrapperPass &P = getAnalysis<SimplifyWrapperPass>(); - - OS << "Printing analysis '" << P.getPassName() << "' for region: '" - << S.getRegion().getNameStr() << "' in function '" - << S.getFunction().getName() << "':\n"; - P.printScop(OS, S); - - return false; - } - - void getAnalysisUsage(AnalysisUsage &AU) const override { - ScopPass::getAnalysisUsage(AU); - AU.addRequired<SimplifyWrapperPass>(); - AU.setPreservesAll(); +bool polly::runSimplify(Scop &S, int CallNo) { + SimplifyImpl Impl(CallNo); + Impl.run(S, S.getLI()); + if (PollyPrintSimplify) { + outs() << "Printing analysis 'Polly - Simplify' for region: '" + << S.getName() << "' in function '" << S.getFunction().getName() + << "':\n"; + Impl.printScop(outs(), S); } -private: - llvm::raw_ostream &OS; -}; - -char SimplifyPrinterLegacyPass::ID = 0; -} // namespace - -Pass *polly::createSimplifyPrinterLegacyPass(raw_ostream &OS) { - return new SimplifyPrinterLegacyPass(OS); + return Impl.isModified(); } - -INITIALIZE_PASS_BEGIN(SimplifyPrinterLegacyPass, "polly-print-simplify", - "Polly - Print Simplify actions", false, false) -INITIALIZE_PASS_DEPENDENCY(SimplifyWrapperPass) -INITIALIZE_PASS_END(SimplifyPrinterLegacyPass, "polly-print-simplify", - "Polly - Print Simplify actions", false, false) diff --git a/polly/test/CodeGen/20100617.ll b/polly/test/CodeGen/20100617.ll index 7229a6e3d5240..7de1b843a5b0a 100644 --- a/polly/test/CodeGen/20100617.ll +++ b/polly/test/CodeGen/20100617.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-codegen < %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' < %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" define void @init_array() nounwind { diff --git a/polly/test/CodeGen/20100622.ll b/polly/test/CodeGen/20100622.ll index bed737741abba..13a6159d3e7a7 100644 --- a/polly/test/CodeGen/20100622.ll +++ b/polly/test/CodeGen/20100622.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly -passes=polly-codegen -disable-output < %s -; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s | not FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -disable-output < %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s | not FileCheck %s target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32" diff --git a/polly/test/CodeGen/20100707.ll b/polly/test/CodeGen/20100707.ll index ee0422e07c4ea..6a4763dcb3b76 100644 --- a/polly/test/CodeGen/20100707.ll +++ b/polly/test/CodeGen/20100707.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-codegen < %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' < %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" define void @clause_SetSplitField(i32 %Length) nounwind inlinehint { diff --git a/polly/test/CodeGen/20100707_2.ll b/polly/test/CodeGen/20100707_2.ll index a4cd76af9dd3c..648a06479ae27 100644 --- a/polly/test/CodeGen/20100707_2.ll +++ 
b/polly/test/CodeGen/20100707_2.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-codegen < %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' < %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" @win193 = external global [4 x [36 x double]], align 32 ; <ptr> [#uses=3] diff --git a/polly/test/CodeGen/20100708.ll b/polly/test/CodeGen/20100708.ll index 9080451aeae50..52153d7cfa730 100644 --- a/polly/test/CodeGen/20100708.ll +++ b/polly/test/CodeGen/20100708.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' < %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect < %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" define fastcc void @execute() nounwind { diff --git a/polly/test/CodeGen/20100708_2.ll b/polly/test/CodeGen/20100708_2.ll index 51dc9d311f070..075a4947c8e72 100644 --- a/polly/test/CodeGen/20100708_2.ll +++ b/polly/test/CodeGen/20100708_2.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-codegen < %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' < %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" define void @init_array() nounwind { diff --git a/polly/test/CodeGen/20100713.ll b/polly/test/CodeGen/20100713.ll index a836795c9907f..0b0ed7327c8b1 100644 --- a/polly/test/CodeGen/20100713.ll +++ b/polly/test/CodeGen/20100713.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-codegen < %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' < %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" define void @fft_float(i32 %NumSamples) nounwind { diff --git a/polly/test/CodeGen/20100713_2.ll b/polly/test/CodeGen/20100713_2.ll index 28b984bd5900f..5681f34152342 100644 --- a/polly/test/CodeGen/20100713_2.ll +++ b/polly/test/CodeGen/20100713_2.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-codegen < %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' < %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" define hidden void @luaD_callhook() nounwind { diff --git a/polly/test/CodeGen/20100717.ll b/polly/test/CodeGen/20100717.ll index 51c453cfe438e..97ed151410dfb 100644 --- a/polly/test/CodeGen/20100717.ll +++ b/polly/test/CodeGen/20100717.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-codegen -disable-output < %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -disable-output < %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" define void @matrixTranspose(ptr %A) nounwind { diff --git a/polly/test/CodeGen/20100718-DomInfo-2.ll b/polly/test/CodeGen/20100718-DomInfo-2.ll index fdac75f1b999f..cbee80e44949c 100644 --- a/polly/test/CodeGen/20100718-DomInfo-2.ll +++ b/polly/test/CodeGen/20100718-DomInfo-2.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-codegen -verify-dom-info -disable-output < %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' 
-verify-dom-info -disable-output < %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" define void @getNonAffNeighbour() nounwind { diff --git a/polly/test/CodeGen/20100718-DomInfo.ll b/polly/test/CodeGen/20100718-DomInfo.ll index da68eb0dd8fa7..e6fcaf6a9272f 100644 --- a/polly/test/CodeGen/20100718-DomInfo.ll +++ b/polly/test/CodeGen/20100718-DomInfo.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-codegen -verify-dom-info -disable-output < %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -verify-dom-info -disable-output < %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" define void @intrapred_luma_16x16(i32 %predmode) nounwind { diff --git a/polly/test/CodeGen/20100720-MultipleConditions.ll b/polly/test/CodeGen/20100720-MultipleConditions.ll index 3dece4efdcd06..66c9e2bb0eb5b 100644 --- a/polly/test/CodeGen/20100720-MultipleConditions.ll +++ b/polly/test/CodeGen/20100720-MultipleConditions.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s ;int bar1(); ;int bar2(); diff --git a/polly/test/CodeGen/20100809-IndependentBlock.ll b/polly/test/CodeGen/20100809-IndependentBlock.ll index f45b6544464de..cc3a5087090b4 100644 --- a/polly/test/CodeGen/20100809-IndependentBlock.ll +++ b/polly/test/CodeGen/20100809-IndependentBlock.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-codegen -disable-output < %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -disable-output < %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" define void @cfft2(ptr %x) nounwind { entry: diff --git a/polly/test/CodeGen/20100811-ScalarDependencyBetweenBrAndCnd.ll b/polly/test/CodeGen/20100811-ScalarDependencyBetweenBrAndCnd.ll index 82da9d2486423..240c2a49bc46d 100644 --- a/polly/test/CodeGen/20100811-ScalarDependencyBetweenBrAndCnd.ll +++ b/polly/test/CodeGen/20100811-ScalarDependencyBetweenBrAndCnd.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-codegen -disable-output < %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -disable-output < %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" diff --git a/polly/test/CodeGen/20101030-Overflow.ll b/polly/test/CodeGen/20101030-Overflow.ll index fecdb9d4fed1e..c199f757ebac5 100644 --- a/polly/test/CodeGen/20101030-Overflow.ll +++ b/polly/test/CodeGen/20101030-Overflow.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-codegen < %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' < %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" define void @compdecomp() nounwind { diff --git a/polly/test/CodeGen/20101103-Overflow3.ll b/polly/test/CodeGen/20101103-Overflow3.ll index f1503e25fcc4c..e8b425f009723 100644 --- a/polly/test/CodeGen/20101103-Overflow3.ll +++ b/polly/test/CodeGen/20101103-Overflow3.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-codegen < %s +; RUN: opt %loadNPMPolly 
'-passes=polly<no-default-opts>' < %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" define void @Reflection_coefficients(ptr %r) nounwind { bb20: diff --git a/polly/test/CodeGen/20101103-signmissmatch.ll b/polly/test/CodeGen/20101103-signmissmatch.ll index 3d0c929446f45..0295ee0567208 100644 --- a/polly/test/CodeGen/20101103-signmissmatch.ll +++ b/polly/test/CodeGen/20101103-signmissmatch.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-codegen < %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' < %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" define void @CleanNet() nounwind { diff --git a/polly/test/CodeGen/20110226-Ignore-Dead-Code.ll b/polly/test/CodeGen/20110226-Ignore-Dead-Code.ll index 0e62e678f0ae2..6913deed23054 100644 --- a/polly/test/CodeGen/20110226-Ignore-Dead-Code.ll +++ b/polly/test/CodeGen/20110226-Ignore-Dead-Code.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-codegen < %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' < %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" define void @main() nounwind { diff --git a/polly/test/CodeGen/20110226-PHI-Node-removed.ll b/polly/test/CodeGen/20110226-PHI-Node-removed.ll index 32b018f24e547..a39fced9dbaba 100644 --- a/polly/test/CodeGen/20110226-PHI-Node-removed.ll +++ b/polly/test/CodeGen/20110226-PHI-Node-removed.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-codegen < %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' < %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" diff --git a/polly/test/CodeGen/20120316-InvalidCast.ll b/polly/test/CodeGen/20120316-InvalidCast.ll index b87a3dc60deaa..a7f709b4a7615 100644 --- a/polly/test/CodeGen/20120316-InvalidCast.ll +++ b/polly/test/CodeGen/20120316-InvalidCast.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -S -passes=polly-codegen < %s | FileCheck %s +; RUN: opt %loadNPMPolly -S '-passes=polly<no-default-opts>' < %s | FileCheck %s ; CHECK: polly.start diff --git a/polly/test/CodeGen/20120403-RHS-type-mismatch.ll b/polly/test/CodeGen/20120403-RHS-type-mismatch.ll index dac78bf04a250..554384c0e777e 100644 --- a/polly/test/CodeGen/20120403-RHS-type-mismatch.ll +++ b/polly/test/CodeGen/20120403-RHS-type-mismatch.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-codegen < %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' < %s ; We just check that this compilation does not crash. 
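For reference, the rewrites in this patch follow two recurring shapes. The RUN lines below are an illustrative sketch, not taken from any single test; %loadNPMPolly and %s are the usual lit substitutions. Codegen-style tests replace the standalone pass with the phased pipeline that has Polly's default optimizations disabled (old form first, new form second):

; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s
; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s

Print-style tests replace the print<polly-*> wrapper with polly-custom<*> plus the matching explicit print flag:

; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s
; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s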
diff --git a/polly/test/CodeGen/20130221.ll b/polly/test/CodeGen/20130221.ll index 5728a768a3b3b..101930e175634 100644 --- a/polly/test/CodeGen/20130221.ll +++ b/polly/test/CodeGen/20130221.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" define void @list_sequence(ptr %A) { diff --git a/polly/test/CodeGen/20150328-SCEVExpanderIntroducesNewIV.ll b/polly/test/CodeGen/20150328-SCEVExpanderIntroducesNewIV.ll index cafd68e508255..7ad8cbf963f45 100644 --- a/polly/test/CodeGen/20150328-SCEVExpanderIntroducesNewIV.ll +++ b/polly/test/CodeGen/20150328-SCEVExpanderIntroducesNewIV.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" diff --git a/polly/test/CodeGen/Intrinsics/llvm-expect.ll b/polly/test/CodeGen/Intrinsics/llvm-expect.ll index 47fd4f07e4678..ba4ea1565e481 100644 --- a/polly/test/CodeGen/Intrinsics/llvm-expect.ll +++ b/polly/test/CodeGen/Intrinsics/llvm-expect.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -S -passes=polly-codegen < %s | FileCheck %s +; RUN: opt %loadNPMPolly -S '-passes=polly<no-default-opts>' < %s | FileCheck %s ; ; Check that we generate code without crashing. ; diff --git a/polly/test/CodeGen/LoopParallelMD/do_not_mutate_debug_info.ll b/polly/test/CodeGen/LoopParallelMD/do_not_mutate_debug_info.ll index eb7de01ba862c..a92917f30b724 100644 --- a/polly/test/CodeGen/LoopParallelMD/do_not_mutate_debug_info.ll +++ b/polly/test/CodeGen/LoopParallelMD/do_not_mutate_debug_info.ll @@ -1,6 +1,6 @@ ; This test checks that we do not accidentally mutate the debug info when ; inserting loop parallel metadata. -; RUN: opt %loadNPMPolly < %s -S -polly -passes=polly-codegen -polly-ast-detect-parallel | FileCheck %s +; RUN: opt %loadNPMPolly -S -polly '-passes=polly<no-default-opts>' -polly-ast-detect-parallel < %s | FileCheck %s ; CHECK-NOT: !7 = !{!7} target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" diff --git a/polly/test/CodeGen/LoopParallelMD/loop_nest_param_parallel.ll b/polly/test/CodeGen/LoopParallelMD/loop_nest_param_parallel.ll index 9bb086fa79aed..0d947004aea50 100644 --- a/polly/test/CodeGen/LoopParallelMD/loop_nest_param_parallel.ll +++ b/polly/test/CodeGen/LoopParallelMD/loop_nest_param_parallel.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-ast-detect-parallel -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-ast-detect-parallel -S < %s | FileCheck %s ; ; Check that we mark multiple parallel loops correctly including the memory instructions. 
; diff --git a/polly/test/CodeGen/LoopParallelMD/single_loop_param_parallel.ll b/polly/test/CodeGen/LoopParallelMD/single_loop_param_parallel.ll index 442600cff7a0a..1293cd91da78d 100644 --- a/polly/test/CodeGen/LoopParallelMD/single_loop_param_parallel.ll +++ b/polly/test/CodeGen/LoopParallelMD/single_loop_param_parallel.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s -check-prefix=SEQUENTIAL -; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-ast-detect-parallel -S < %s | FileCheck %s -check-prefix=PARALLEL +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s -check-prefix=SEQUENTIAL +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-ast-detect-parallel -S < %s | FileCheck %s -check-prefix=PARALLEL target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" ; This is a trivially parallel loop. We just use it to ensure that we actually diff --git a/polly/test/CodeGen/MemAccess/bad_alignment.ll b/polly/test/CodeGen/MemAccess/bad_alignment.ll index 82fff27dd0eb7..be1c64938422c 100644 --- a/polly/test/CodeGen/MemAccess/bad_alignment.ll +++ b/polly/test/CodeGen/MemAccess/bad_alignment.ll @@ -1,4 +1,4 @@ -; RUN: not --crash opt %loadNPMPolly -passes=polly-import-jscop -disable-output 2>&1 < %s | FileCheck %s +; RUN: not --crash opt %loadNPMPolly '-passes=polly-custom<import-jscop>' -disable-output 2>&1 < %s | FileCheck %s ; ; Check that we do not allow accessing elements not accessed before, because the ; alignment information would become invalid. diff --git a/polly/test/CodeGen/MemAccess/codegen_address_space.ll b/polly/test/CodeGen/MemAccess/codegen_address_space.ll index 3360e10529f8e..283c8fbd2c249 100644 --- a/polly/test/CodeGen/MemAccess/codegen_address_space.ll +++ b/polly/test/CodeGen/MemAccess/codegen_address_space.ll @@ -1,4 +1,4 @@ -;RUN: opt %loadNPMPolly '-passes=polly-import-jscop,polly-codegen' -polly-import-jscop-postfix=transformed < %s -S | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;codegen>' -polly-import-jscop-postfix=transformed -S < %s | FileCheck %s ;int A[100]; ; diff --git a/polly/test/CodeGen/MemAccess/codegen_constant_offset.ll b/polly/test/CodeGen/MemAccess/codegen_constant_offset.ll index 0563ca87eef51..ce44f2daceaa9 100644 --- a/polly/test/CodeGen/MemAccess/codegen_constant_offset.ll +++ b/polly/test/CodeGen/MemAccess/codegen_constant_offset.ll @@ -1,4 +1,4 @@ -;RUN: opt %loadNPMPolly '-passes=polly-import-jscop,polly-codegen' -polly-import-jscop-postfix=transformed < %s -S | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;codegen>' -polly-import-jscop-postfix=transformed -S < %s | FileCheck %s ;int A[100]; ; diff --git a/polly/test/CodeGen/MemAccess/codegen_simple.ll b/polly/test/CodeGen/MemAccess/codegen_simple.ll index ee0187fe97d25..ab1dca516a9cf 100644 --- a/polly/test/CodeGen/MemAccess/codegen_simple.ll +++ b/polly/test/CodeGen/MemAccess/codegen_simple.ll @@ -1,4 +1,4 @@ -;RUN: opt %loadNPMPolly '-passes=polly-import-jscop,polly-codegen' -polly-import-jscop-postfix=transformed < %s -S | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;codegen>' -polly-import-jscop-postfix=transformed -S < %s | FileCheck %s ;int A[100]; ; diff --git a/polly/test/CodeGen/MemAccess/codegen_simple_float.ll b/polly/test/CodeGen/MemAccess/codegen_simple_float.ll index 6970565bf023e..72f9c2ce61e3c 100644 ---
a/polly/test/CodeGen/MemAccess/codegen_simple_float.ll +++ b/polly/test/CodeGen/MemAccess/codegen_simple_float.ll @@ -1,4 +1,4 @@ -;RUN: opt %loadNPMPolly '-passes=polly-import-jscop,polly-codegen' -polly-import-jscop-postfix=transformed < %s -S | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;codegen>' -polly-import-jscop-postfix=transformed -S < %s | FileCheck %s ; ;float A[100]; ; diff --git a/polly/test/CodeGen/MemAccess/codegen_simple_md.ll b/polly/test/CodeGen/MemAccess/codegen_simple_md.ll index f0896e2bf6093..a6d9969286fc7 100644 --- a/polly/test/CodeGen/MemAccess/codegen_simple_md.ll +++ b/polly/test/CodeGen/MemAccess/codegen_simple_md.ll @@ -1,5 +1,5 @@ -;RUN: opt %loadNPMPolly '-passes=polly-import-jscop,polly-codegen' -polly-import-jscop-postfix=transformed+withconst < %s -S | FileCheck -check-prefix=WITHCONST %s -;RUN: opt %loadNPMPolly '-passes=polly-import-jscop,polly-codegen' -polly-import-jscop-postfix=transformed+withoutconst < %s -S | FileCheck -check-prefix=WITHOUTCONST %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;codegen>' -polly-import-jscop-postfix=transformed+withconst -S < %s | FileCheck -check-prefix=WITHCONST %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;codegen>' -polly-import-jscop-postfix=transformed+withoutconst -S < %s | FileCheck -check-prefix=WITHOUTCONST %s ;int A[1040]; ; diff --git a/polly/test/CodeGen/MemAccess/codegen_simple_md_float.ll b/polly/test/CodeGen/MemAccess/codegen_simple_md_float.ll index 99fc36996f083..568b0ff4ae20a 100644 --- a/polly/test/CodeGen/MemAccess/codegen_simple_md_float.ll +++ b/polly/test/CodeGen/MemAccess/codegen_simple_md_float.ll @@ -1,5 +1,5 @@ -;RUN: opt %loadNPMPolly '-passes=polly-import-jscop,polly-codegen' -polly-import-jscop-postfix=transformed+withconst < %s -S | FileCheck -check-prefix=WITHCONST %s -;RUN: opt %loadNPMPolly '-passes=polly-import-jscop,polly-codegen' -polly-import-jscop-postfix=transformed+withoutconst < %s -S | FileCheck -check-prefix=WITHOUTCONST %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;codegen>' -polly-import-jscop-postfix=transformed+withconst -S < %s | FileCheck -check-prefix=WITHCONST %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;codegen>' -polly-import-jscop-postfix=transformed+withoutconst -S < %s | FileCheck -check-prefix=WITHOUTCONST %s ; ;float A[1040]; ; diff --git a/polly/test/CodeGen/MemAccess/create_arrays.ll b/polly/test/CodeGen/MemAccess/create_arrays.ll index 40ae8d6efa95f..8443e0f7be327 100644 --- a/polly/test/CodeGen/MemAccess/create_arrays.ll +++ b/polly/test/CodeGen/MemAccess/create_arrays.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadPolly -polly-print-scops -polly-print-import-jscop -polly-import-jscop-postfix=transformed -disable-output < %s 2>&1 | FileCheck %s -; RUN: opt %loadPolly -polly-import-jscop -polly-import-jscop-postfix=transformed -polly-codegen -S < %s 2>&1 | FileCheck %s --check-prefix=CODEGEN +; RUN: opt %loadNPMPolly -polly-print-scops '-passes=polly-custom<import-jscop>' -polly-print-import-jscop -polly-import-jscop-postfix=transformed -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;codegen>' -polly-import-jscop-postfix=transformed -S < %s 2>&1 | FileCheck %s --check-prefix=CODEGEN ; ; for (i = 0; i < _PB_NI; i++) ; for (j = 0; j < _PB_NJ; j++) diff --git a/polly/test/CodeGen/MemAccess/create_arrays_heap.ll b/polly/test/CodeGen/MemAccess/create_arrays_heap.ll index 1202d21998c94..9c95378a76433 100644 --- 
a/polly/test/CodeGen/MemAccess/create_arrays_heap.ll +++ b/polly/test/CodeGen/MemAccess/create_arrays_heap.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-print-scops -polly-print-import-jscop -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck %s -; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-import-jscop -polly-import-jscop-postfix=transformed -polly-codegen -S < %s | FileCheck %s --check-prefix=CODEGEN +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-print-scops '-passes=polly-custom<import-jscop>' -polly-print-import-jscop -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<import-jscop;codegen>' -polly-import-jscop-postfix=transformed -S < %s | FileCheck %s --check-prefix=CODEGEN ; ; #define Ni 1056 ; #define Nj 1056 diff --git a/polly/test/CodeGen/MemAccess/default_aligned_new_access_function.ll b/polly/test/CodeGen/MemAccess/default_aligned_new_access_function.ll index 7d8083cc55846..f08fabd67ef5c 100644 --- a/polly/test/CodeGen/MemAccess/default_aligned_new_access_function.ll +++ b/polly/test/CodeGen/MemAccess/default_aligned_new_access_function.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadPolly -basic-aa -polly-print-import-jscop -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop>' -polly-print-import-jscop -disable-output < %s | FileCheck %s ; ; Check that we allow the new access functions even though they access ; different locations than the original ones (but the alignment is the diff --git a/polly/test/CodeGen/MemAccess/different_types.ll b/polly/test/CodeGen/MemAccess/different_types.ll index 407e72702aa86..ae6168d235a96 100644 --- a/polly/test/CodeGen/MemAccess/different_types.ll +++ b/polly/test/CodeGen/MemAccess/different_types.ll @@ -1,6 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,polly-codegen' \ -; RUN: \ -; RUN: -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;codegen>' -S < %s | FileCheck %s ; ; void foo(float A[], float B[]) { ; for (long i = 0; i < 100; i++) diff --git a/polly/test/CodeGen/MemAccess/generate-all.ll b/polly/test/CodeGen/MemAccess/generate-all.ll index 7b2286bfc95a9..099a3e0670960 100644 --- a/polly/test/CodeGen/MemAccess/generate-all.ll +++ b/polly/test/CodeGen/MemAccess/generate-all.ll @@ -1,7 +1,5 @@ -; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-codegen-generate-expressions=false \ -; RUN: -S < %s | FileCheck %s -check-prefix=SCEV -; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-codegen-generate-expressions=true \ -; RUN: -S < %s | FileCheck %s -check-prefix=ASTEXPR +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-codegen-generate-expressions=false -S < %s | FileCheck %s -check-prefix=SCEV +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-codegen-generate-expressions=true -S < %s | FileCheck %s -check-prefix=ASTEXPR ; ; void foo(float A[]) { ; for (long i = 0; i < 100; i++) diff --git a/polly/test/CodeGen/MemAccess/invariant_base_ptr.ll b/polly/test/CodeGen/MemAccess/invariant_base_ptr.ll index 5c926ac638413..d8d0df7009685 100644 --- a/polly/test/CodeGen/MemAccess/invariant_base_ptr.ll +++ b/polly/test/CodeGen/MemAccess/invariant_base_ptr.ll @@ -1,6 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,polly-codegen' \ -; RUN: -polly-invariant-load-hoisting -S \ -; RUN: 2>&1 < %s | FileCheck %s +; RUN: opt %loadNPMPolly 
'-passes=polly-custom<import-jscop;codegen>' -polly-invariant-load-hoisting -S 2>&1 < %s | FileCheck %s ; Setting new access functions where the base pointer of the array that is newly ; accessed is only loaded within the scop itself caused incorrect code to be diff --git a/polly/test/CodeGen/MemAccess/map_scalar_access.ll b/polly/test/CodeGen/MemAccess/map_scalar_access.ll index 7c845d4a004f4..4ea21b26ce531 100644 --- a/polly/test/CodeGen/MemAccess/map_scalar_access.ll +++ b/polly/test/CodeGen/MemAccess/map_scalar_access.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-import-jscop-postfix=transformed -polly-print-import-jscop -disable-output < %s | FileCheck %s -; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-import-jscop-postfix=transformed -polly-import-jscop -polly-codegen -S < %s | FileCheck %s --check-prefix=CODEGEN +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-import-jscop-postfix=transformed '-passes=polly-custom<import-jscop>' -polly-print-import-jscop -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-import-jscop-postfix=transformed '-passes=polly-custom<import-jscop;codegen>' -S < %s | FileCheck %s --check-prefix=CODEGEN define void @map_scalar_access(ptr noalias nonnull %A) { entry: diff --git a/polly/test/CodeGen/MemAccess/multiple_types.ll b/polly/test/CodeGen/MemAccess/multiple_types.ll index 7848977ce0310..edc3888be364b 100644 --- a/polly/test/CodeGen/MemAccess/multiple_types.ll +++ b/polly/test/CodeGen/MemAccess/multiple_types.ll @@ -1,6 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-import-jscop,polly-codegen' \ -; RUN: -polly-allow-differing-element-types \ -; RUN: -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<import-jscop;codegen>' -polly-allow-differing-element-types -S < %s | FileCheck %s ; ; // Check that accessing one array with different types works. 
; void multiple_types(char *Short, char *Float, char *Double) { diff --git a/polly/test/CodeGen/MemAccess/simple.ll b/polly/test/CodeGen/MemAccess/simple.ll index 5077e1a1b5a2c..63d66f1c925f7 100644 --- a/polly/test/CodeGen/MemAccess/simple.ll +++ b/polly/test/CodeGen/MemAccess/simple.ll @@ -1,4 +1,4 @@ -;RUN: opt %loadNPMPolly -passes=polly-import-jscop -polly-import-jscop-postfix=transformed -stats < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop>' -polly-import-jscop-postfix=transformed -stats < %s 2>&1 | FileCheck %s ; REQUIRES: asserts ;int A[100]; diff --git a/polly/test/CodeGen/MemAccess/simple_analyze.ll b/polly/test/CodeGen/MemAccess/simple_analyze.ll index 143651b565aff..f07cb1629ca18 100644 --- a/polly/test/CodeGen/MemAccess/simple_analyze.ll +++ b/polly/test/CodeGen/MemAccess/simple_analyze.ll @@ -1,4 +1,4 @@ -;RUN: opt %loadPolly -polly-print-import-jscop -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop>' -polly-print-import-jscop -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck %s target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32-n8:16:32" @A = common global [100 x i32] zeroinitializer, align 4 diff --git a/polly/test/CodeGen/MemAccess/update_access_functions.ll b/polly/test/CodeGen/MemAccess/update_access_functions.ll index 51fa97adb3c37..93f5f186ad6a5 100644 --- a/polly/test/CodeGen/MemAccess/update_access_functions.ll +++ b/polly/test/CodeGen/MemAccess/update_access_functions.ll @@ -1,6 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,polly-codegen' \ -; RUN: -polly-import-jscop-postfix=transformed \ -; RUN: < %s -S | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;codegen>' -polly-import-jscop-postfix=transformed -S < %s | FileCheck %s ; CHECK-LABEL: polly.stmt.loop1: ; CHECK-NEXT: %3 = mul nsw i64 5, %polly.indvar{{[0-9]*}} diff --git a/polly/test/CodeGen/Metadata/basic_vec_annotate.ll b/polly/test/CodeGen/Metadata/basic_vec_annotate.ll index ebe91636ea3cc..344a6d0990837 100644 --- a/polly/test/CodeGen/Metadata/basic_vec_annotate.ll +++ b/polly/test/CodeGen/Metadata/basic_vec_annotate.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -S -passes=polly-codegen -polly-annotate-metadata-vectorize < %s | FileCheck %s +; RUN: opt %loadNPMPolly -S '-passes=polly<no-default-opts>' -polly-annotate-metadata-vectorize < %s | FileCheck %s ; Basic verification of vectorize metadata getting added when "-polly-annotate-metadata-vectorize" is ; passed.
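Note that every rewritten pipeline spec stays inside single quotes. The RUN command is executed through a shell, where '<' and '>' are redirections and ';' is a command separator, so an unquoted spec such as -passes=polly-custom<import-jscop;codegen> would be torn apart before opt ever sees it. An illustrative quoted form, not from any particular test:

; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;codegen>' -S < %s | FileCheck %s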
diff --git a/polly/test/CodeGen/OpenMP/alias-metadata.ll b/polly/test/CodeGen/OpenMP/alias-metadata.ll index 121f630789892..541fbdda5a6b9 100644 --- a/polly/test/CodeGen/OpenMP/alias-metadata.ll +++ b/polly/test/CodeGen/OpenMP/alias-metadata.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-parallel -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-parallel -S < %s | FileCheck %s ; ; void foo(float *A, float *B) { ; for (long i = 0; i < 1000; i++) diff --git a/polly/test/CodeGen/OpenMP/floord-as-argument-to-subfunction.ll b/polly/test/CodeGen/OpenMP/floord-as-argument-to-subfunction.ll index 7177ae01f0754..657921690c74d 100644 --- a/polly/test/CodeGen/OpenMP/floord-as-argument-to-subfunction.ll +++ b/polly/test/CodeGen/OpenMP/floord-as-argument-to-subfunction.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-opt-isl -polly-opt-max-coefficient=-1 -polly-parallel -passes=polly-codegen -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts;opt-isl>' -polly-opt-max-coefficient=-1 -polly-parallel -S < %s | FileCheck %s ; ; Check that we do not crash but generate parallel code ; diff --git a/polly/test/CodeGen/OpenMP/inlineasm.ll b/polly/test/CodeGen/OpenMP/inlineasm.ll index 82a73780886e3..ac6c7070c1abf 100644 --- a/polly/test/CodeGen/OpenMP/inlineasm.ll +++ b/polly/test/CodeGen/OpenMP/inlineasm.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-opt-isl,polly-codegen' -polly-parallel -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts;opt-isl>' -polly-parallel -S < %s | FileCheck %s ; llvm.org/PR51960 ; CHECK-LABEL: define internal void @foo_polly_subfn diff --git a/polly/test/CodeGen/OpenMP/invariant_base_pointer_preloaded.ll b/polly/test/CodeGen/OpenMP/invariant_base_pointer_preloaded.ll index aba3ae78f7783..08c0cc7fe37f2 100644 --- a/polly/test/CodeGen/OpenMP/invariant_base_pointer_preloaded.ll +++ b/polly/test/CodeGen/OpenMP/invariant_base_pointer_preloaded.ll @@ -1,5 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true -polly-parallel \ -; RUN: -polly-parallel-force -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=true -polly-parallel -polly-parallel-force -S < %s | FileCheck %s ; ; Test to verify that we hand down the preloaded A[0] to the OpenMP subfunction. ; diff --git a/polly/test/CodeGen/OpenMP/invariant_base_pointer_preloaded_different_bb.ll b/polly/test/CodeGen/OpenMP/invariant_base_pointer_preloaded_different_bb.ll index 8cf6148a7b44c..8246aaa25b7b2 100644 --- a/polly/test/CodeGen/OpenMP/invariant_base_pointer_preloaded_different_bb.ll +++ b/polly/test/CodeGen/OpenMP/invariant_base_pointer_preloaded_different_bb.ll @@ -1,5 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true -polly-parallel \ -; RUN: -polly-parallel-force -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=true -polly-parallel -polly-parallel-force -S < %s | FileCheck %s ; ; Test to verify that we hand down the preloaded A[0] to the OpenMP subfunction.
; diff --git a/polly/test/CodeGen/OpenMP/invariant_base_pointer_preloaded_pass_only_needed.ll b/polly/test/CodeGen/OpenMP/invariant_base_pointer_preloaded_pass_only_needed.ll index 823e5cab55ab3..0c5208c77768b 100644 --- a/polly/test/CodeGen/OpenMP/invariant_base_pointer_preloaded_pass_only_needed.ll +++ b/polly/test/CodeGen/OpenMP/invariant_base_pointer_preloaded_pass_only_needed.ll @@ -1,5 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true -polly-parallel \ -; RUN: -polly-parallel-force -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=true -polly-parallel -polly-parallel-force -S < %s | FileCheck %s ; ; Test to verify that we hand down the preloaded A[0] to the OpenMP subfunction but ; not B[0] as it is not needed diff --git a/polly/test/CodeGen/OpenMP/invariant_base_pointers_preloaded.ll b/polly/test/CodeGen/OpenMP/invariant_base_pointers_preloaded.ll index 5557839e715ed..fd039e75444b5 100644 --- a/polly/test/CodeGen/OpenMP/invariant_base_pointers_preloaded.ll +++ b/polly/test/CodeGen/OpenMP/invariant_base_pointers_preloaded.ll @@ -1,5 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true -polly-parallel \ -; RUN: -polly-parallel-force -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=true -polly-parallel -polly-parallel-force -S < %s | FileCheck %s ; ; Test to verify that we hand down the preloaded A[0] to the OpenMP subfunction. ; diff --git a/polly/test/CodeGen/OpenMP/loop-body-references-outer-iv.ll b/polly/test/CodeGen/OpenMP/loop-body-references-outer-iv.ll index a987fac31b743..fe8b8a3a022bc 100644 --- a/polly/test/CodeGen/OpenMP/loop-body-references-outer-iv.ll +++ b/polly/test/CodeGen/OpenMP/loop-body-references-outer-iv.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s -check-prefix=AST -; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force -passes=polly-codegen -S -verify-dom-info < %s | FileCheck %s -check-prefix=IR +; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s -check-prefix=AST +; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force '-passes=polly<no-default-opts>' -S -verify-dom-info < %s | FileCheck %s -check-prefix=IR ; This code used to fail the SCEV-based code generation because the scop ; contains an AddRecExpr of an outer loop.
When generating code, we did not diff --git a/polly/test/CodeGen/OpenMP/loop-body-references-outer-values-2.ll b/polly/test/CodeGen/OpenMP/loop-body-references-outer-values-2.ll index 96c6d900a7a00..d1f48d92e0e75 100644 --- a/polly/test/CodeGen/OpenMP/loop-body-references-outer-values-2.ll +++ b/polly/test/CodeGen/OpenMP/loop-body-references-outer-values-2.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s -check-prefix=AST -; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force -passes=polly-codegen -S -verify-dom-info < %s | FileCheck %s -check-prefix=IR +; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s -check-prefix=AST +; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force '-passes=polly<no-default-opts>' -S -verify-dom-info < %s | FileCheck %s -check-prefix=IR ; AST: #pragma simd ; AST: #pragma omp parallel for diff --git a/polly/test/CodeGen/OpenMP/loop-body-references-outer-values-3.ll b/polly/test/CodeGen/OpenMP/loop-body-references-outer-values-3.ll index c4ad665c7b6cf..5b032801c7282 100644 --- a/polly/test/CodeGen/OpenMP/loop-body-references-outer-values-3.ll +++ b/polly/test/CodeGen/OpenMP/loop-body-references-outer-values-3.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-parallel -polly-parallel-force -polly-invariant-load-hoisting=true '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s -check-prefix=AST -; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-parallel -polly-parallel-force -polly-invariant-load-hoisting=true -passes=polly-codegen -S -verify-dom-info < %s | FileCheck %s -check-prefix=IR +; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-parallel -polly-parallel-force -polly-invariant-load-hoisting=true '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s -check-prefix=AST +; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-parallel -polly-parallel-force -polly-invariant-load-hoisting=true '-passes=polly<no-default-opts>' -S -verify-dom-info < %s | FileCheck %s -check-prefix=IR ; The interesting part of this test case is the instruction: ; %tmp = bitcast i8* %call to i64** diff --git a/polly/test/CodeGen/OpenMP/loop-body-references-outer-values.ll b/polly/test/CodeGen/OpenMP/loop-body-references-outer-values.ll index 82acba8b3c523..d612faf7b67c5 100644 --- a/polly/test/CodeGen/OpenMP/loop-body-references-outer-values.ll +++ b/polly/test/CodeGen/OpenMP/loop-body-references-outer-values.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s -check-prefix=AST -; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force -passes=polly-codegen -S < %s | FileCheck %s -check-prefix=IR +; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s -check-prefix=AST +; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force '-passes=polly<no-default-opts>' -S < %s | FileCheck %s -check-prefix=IR ; Make sure we correctly forward the reference to 'A' to the OpenMP subfunction. 
; diff --git a/polly/test/CodeGen/OpenMP/loop-bounds-reference-outer-ids.ll b/polly/test/CodeGen/OpenMP/loop-bounds-reference-outer-ids.ll index aa44658131bba..213cc2635fb6d 100644 --- a/polly/test/CodeGen/OpenMP/loop-bounds-reference-outer-ids.ll +++ b/polly/test/CodeGen/OpenMP/loop-bounds-reference-outer-ids.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly -polly-parallel '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s -check-prefix=AST -; RUN: opt %loadNPMPolly -polly-parallel -passes=polly-codegen -S < %s | FileCheck %s -check-prefix=IR +; RUN: opt %loadNPMPolly -polly-parallel '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s -check-prefix=AST +; RUN: opt %loadNPMPolly -polly-parallel '-passes=polly<no-default-opts>' -S < %s | FileCheck %s -check-prefix=IR ; ; float A[100]; ; diff --git a/polly/test/CodeGen/OpenMP/mapped-phi-access.ll b/polly/test/CodeGen/OpenMP/mapped-phi-access.ll index 4deab1af0ccf0..fef23f141eaeb 100644 --- a/polly/test/CodeGen/OpenMP/mapped-phi-access.ll +++ b/polly/test/CodeGen/OpenMP/mapped-phi-access.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-parallel '-passes=polly-delicm,polly-codegen' -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-parallel '-passes=polly<no-default-opts;delicm>' -S < %s | FileCheck %s ; ; Verify that -polly-parallel can handle mapped scalar MemoryAccesses. ; diff --git a/polly/test/CodeGen/OpenMP/matmul-parallel.ll b/polly/test/CodeGen/OpenMP/matmul-parallel.ll index 43326b29f7ef1..fd8ce87b45ae8 100644 --- a/polly/test/CodeGen/OpenMP/matmul-parallel.ll +++ b/polly/test/CodeGen/OpenMP/matmul-parallel.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly -polly-parallel '-passes=polly-opt-isl,print<polly-ast>' -disable-output -debug-only=polly-ast < %s 2>&1 | FileCheck --check-prefix=AST %s -; RUN: opt %loadNPMPolly -polly-parallel '-passes=polly-opt-isl,polly-codegen' -S < %s | FileCheck --check-prefix=CODEGEN %s +; RUN: opt %loadNPMPolly -polly-parallel '-passes=polly-custom<opt-isl;ast>' -polly-print-ast -disable-output -debug-only=polly-ast < %s 2>&1 | FileCheck --check-prefix=AST %s +; RUN: opt %loadNPMPolly -polly-parallel '-passes=polly<no-default-opts;opt-isl>' -S < %s | FileCheck --check-prefix=CODEGEN %s ; REQUIRES: asserts ; Parallelization of detected matrix-multiplication. 
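Where the old spelling chained Polly passes with commas inside one -passes string, the new polly-custom pipeline takes its phases as a semicolon-separated list in source order, and print phases additionally need their -polly-print-* flag, as in the matmul-parallel.ll hunk above. An illustrative old/new pair under these conventions:

; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,polly-codegen' -S < %s | FileCheck %s
; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;codegen>' -S < %s | FileCheck %s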
diff --git a/polly/test/CodeGen/OpenMP/new_multidim_access.ll b/polly/test/CodeGen/OpenMP/new_multidim_access.ll index 5faabb4d20c1a..8018acdcb0e6a 100644 --- a/polly/test/CodeGen/OpenMP/new_multidim_access.ll +++ b/polly/test/CodeGen/OpenMP/new_multidim_access.ll @@ -1,10 +1,6 @@ -; RUN: opt %loadPolly -polly-print-import-jscop \ -; RUN: -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop>' -polly-print-import-jscop -disable-output < %s | FileCheck %s -; RUN: opt %loadPolly -polly-import-jscop \ -; RUN: -polly-codegen -S < %s \ -; RUN: -polly-parallel \ -; RUN: | FileCheck %s -check-prefix=IR +; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;codegen>' -S -polly-parallel < %s | FileCheck %s -check-prefix=IR ; void new_multidim_access(long n, long m, float A[][m]) { ; for (long i = 0; i < n; i++) diff --git a/polly/test/CodeGen/OpenMP/recomputed-srem.ll b/polly/test/CodeGen/OpenMP/recomputed-srem.ll index b7b3a44610f32..99069612cd1d4 100644 --- a/polly/test/CodeGen/OpenMP/recomputed-srem.ll +++ b/polly/test/CodeGen/OpenMP/recomputed-srem.ll @@ -1,5 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -passes=polly-codegen -polly-parallel \ -; RUN: -polly-parallel-force -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly<no-default-opts>' -polly-parallel -polly-parallel-force -S < %s | FileCheck %s ; ; Test to verify that we pass %rem96 to the parallel subfunction. ; diff --git a/polly/test/CodeGen/OpenMP/reference-argument-from-non-affine-region.ll b/polly/test/CodeGen/OpenMP/reference-argument-from-non-affine-region.ll index c207f589e4da0..236362a3e23dc 100644 --- a/polly/test/CodeGen/OpenMP/reference-argument-from-non-affine-region.ll +++ b/polly/test/CodeGen/OpenMP/reference-argument-from-non-affine-region.ll @@ -1,17 +1,8 @@ -; RUN: opt %loadNPMPolly -polly-parallel \ -; RUN: -polly-parallel-force -passes=polly-codegen \ -; RUN: -S -verify-dom-info < %s \ -; RUN: | FileCheck %s -check-prefix=IR - -; RUN: opt %loadNPMPolly -polly-parallel \ -; RUN: -polly-parallel-force -passes=polly-codegen -polly-scheduling=runtime \ -; RUN: -S -verify-dom-info < %s \ -; RUN: | FileCheck %s -check-prefix=IR - -; RUN: opt %loadNPMPolly -polly-parallel \ -; RUN: -polly-parallel-force -passes=polly-codegen -polly-omp-backend=LLVM \ -; RUN: -S -verify-dom-info < %s \ -; RUN: | FileCheck %s -check-prefix=LIBOMP-IR +; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force '-passes=polly<no-default-opts>' -S -verify-dom-info < %s | FileCheck %s -check-prefix=IR + +; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force '-passes=polly<no-default-opts>' -polly-scheduling=runtime -S -verify-dom-info < %s | FileCheck %s -check-prefix=IR + +; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force '-passes=polly<no-default-opts>' -polly-omp-backend=LLVM -S -verify-dom-info < %s | FileCheck %s -check-prefix=LIBOMP-IR ; IR: @GOMP_parallel_loop_runtime_start diff --git a/polly/test/CodeGen/OpenMP/reference-other-bb.ll b/polly/test/CodeGen/OpenMP/reference-other-bb.ll index dbfbd9a905086..9925187883173 100644 --- a/polly/test/CodeGen/OpenMP/reference-other-bb.ll +++ b/polly/test/CodeGen/OpenMP/reference-other-bb.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force -passes=polly-codegen -S -verify-dom-info < %s | FileCheck %s -check-prefix=IR +; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force '-passes=polly<no-default-opts>' -S -verify-dom-info < %s | 
FileCheck %s -check-prefix=IR ; IR: @foo_polly_subfn target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" diff --git a/polly/test/CodeGen/OpenMP/reference-preceeding-loop.ll b/polly/test/CodeGen/OpenMP/reference-preceeding-loop.ll index ee43b8aa34a44..3738266b558ed 100644 --- a/polly/test/CodeGen/OpenMP/reference-preceeding-loop.ll +++ b/polly/test/CodeGen/OpenMP/reference-preceeding-loop.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s -check-prefix=AST -; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force -passes=polly-codegen -S -verify-dom-info < %s | FileCheck %s -check-prefix=IR +; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s -check-prefix=AST +; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force '-passes=polly<no-default-opts>' -S -verify-dom-info < %s | FileCheck %s -check-prefix=IR ; - Test the case where scalar evolution references a loop that is outside diff --git a/polly/test/CodeGen/OpenMP/reference_latest.ll b/polly/test/CodeGen/OpenMP/reference_latest.ll index 7a8cd77bb1571..fb420b06b9afb 100644 --- a/polly/test/CodeGen/OpenMP/reference_latest.ll +++ b/polly/test/CodeGen/OpenMP/reference_latest.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-delicm,polly-simplify,polly-codegen' -polly-parallel -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts;delicm;simplify>' -polly-parallel -S < %s | FileCheck %s ; ; Test that parallel codegen handles scalars mapped to other arrays. ; After mapping "store double %add10" references the array "MemRef2". diff --git a/polly/test/CodeGen/OpenMP/scev-rewriting.ll b/polly/test/CodeGen/OpenMP/scev-rewriting.ll index 9b79f29094482..861a78e4acd7a 100644 --- a/polly/test/CodeGen/OpenMP/scev-rewriting.ll +++ b/polly/test/CodeGen/OpenMP/scev-rewriting.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly < %s -polly-vectorizer=stripmine -polly-parallel -polly-parallel-force -polly-process-unprofitable -passes=polly-codegen -S | FileCheck %s +; RUN: opt %loadNPMPolly -polly-vectorizer=stripmine -polly-parallel -polly-parallel-force -polly-process-unprofitable '-passes=polly<no-default-opts>' -S < %s | FileCheck %s ; CHECK: define internal void @DoStringSort_polly_subfn target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" target triple = "aarch64-unknown-linux-gnueabi" diff --git a/polly/test/CodeGen/OpenMP/single_loop.ll b/polly/test/CodeGen/OpenMP/single_loop.ll index e5aee840ade74..5e8a58fadd56c 100644 --- a/polly/test/CodeGen/OpenMP/single_loop.ll +++ b/polly/test/CodeGen/OpenMP/single_loop.ll @@ -1,14 +1,14 @@ -; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s -check-prefix=AST -; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force -passes=polly-codegen -S -verify-dom-info < %s | FileCheck %s -check-prefix=IR +; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s -check-prefix=AST +; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force '-passes=polly<no-default-opts>' -S -verify-dom-info < %s | FileCheck %s -check-prefix=IR -; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force '-passes=polly-import-jscop,print<polly-ast>' -disable-output < %s | FileCheck %s -check-prefix=AST-STRIDE4 -; RUN: 
opt %loadNPMPolly -polly-parallel -polly-parallel-force '-passes=polly-import-jscop,polly-codegen' -S < %s | FileCheck %s -check-prefix=IR-STRIDE4 +; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force '-passes=polly-custom<import-jscop;ast>' -polly-print-ast -disable-output < %s | FileCheck %s -check-prefix=AST-STRIDE4 +; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force '-passes=polly-custom<import-jscop;codegen>' -S < %s | FileCheck %s -check-prefix=IR-STRIDE4 -; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force -passes=polly-codegen -polly-omp-backend=LLVM -polly-scheduling=static -polly-scheduling-chunksize=43 -S -verify-dom-info < %s | FileCheck %s -check-prefix=LIBOMP-IR-STATIC-CHUNKED -; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force -passes=polly-codegen -polly-omp-backend=LLVM -polly-scheduling=static -S -verify-dom-info < %s | FileCheck %s -check-prefix=LIBOMP-IR-STATIC -; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force -passes=polly-codegen -polly-omp-backend=LLVM -polly-scheduling=dynamic -S -verify-dom-info < %s | FileCheck %s -check-prefix=LIBOMP-IR-DYNAMIC -; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force -passes=polly-codegen -polly-omp-backend=LLVM -polly-scheduling=dynamic -polly-scheduling-chunksize=4 -S -verify-dom-info < %s | FileCheck %s -check-prefix=LIBOMP-IR-DYNAMIC-FOUR -; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force '-passes=polly-import-jscop,polly-codegen' -polly-omp-backend=LLVM -S < %s | FileCheck %s -check-prefix=LIBOMP-IR-STRIDE4 +; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force '-passes=polly<no-default-opts>' -polly-omp-backend=LLVM -polly-scheduling=static -polly-scheduling-chunksize=43 -S -verify-dom-info < %s | FileCheck %s -check-prefix=LIBOMP-IR-STATIC-CHUNKED +; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force '-passes=polly<no-default-opts>' -polly-omp-backend=LLVM -polly-scheduling=static -S -verify-dom-info < %s | FileCheck %s -check-prefix=LIBOMP-IR-STATIC +; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force '-passes=polly<no-default-opts>' -polly-omp-backend=LLVM -polly-scheduling=dynamic -S -verify-dom-info < %s | FileCheck %s -check-prefix=LIBOMP-IR-DYNAMIC +; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force '-passes=polly<no-default-opts>' -polly-omp-backend=LLVM -polly-scheduling=dynamic -polly-scheduling-chunksize=4 -S -verify-dom-info < %s | FileCheck %s -check-prefix=LIBOMP-IR-DYNAMIC-FOUR +; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force '-passes=polly-custom<import-jscop;codegen>' -polly-omp-backend=LLVM -S < %s | FileCheck %s -check-prefix=LIBOMP-IR-STRIDE4 ; This extensive test case tests the creation of the full set of OpenMP calls ; as well as the subfunction creation using a trivial loop as example. 
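The single_loop.ll update above illustrates how feature variants are expressed after the migration: one pipeline spelling, with the OpenMP backend and the scheduling policy selected purely by flags. A reduced sketch of two such variants (the check prefixes here are hypothetical):

; RUN: opt %loadNPMPolly -polly-parallel '-passes=polly<no-default-opts>' -polly-omp-backend=LLVM -polly-scheduling=static -S < %s | FileCheck %s -check-prefix=STATIC
; RUN: opt %loadNPMPolly -polly-parallel '-passes=polly<no-default-opts>' -polly-omp-backend=LLVM -polly-scheduling=dynamic -S < %s | FileCheck %s -check-prefix=DYNAMIC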
diff --git a/polly/test/CodeGen/OpenMP/single_loop_with_loop_invariant_baseptr.ll b/polly/test/CodeGen/OpenMP/single_loop_with_loop_invariant_baseptr.ll index c519bfdee7a58..95324793f4fa4 100644 --- a/polly/test/CodeGen/OpenMP/single_loop_with_loop_invariant_baseptr.ll +++ b/polly/test/CodeGen/OpenMP/single_loop_with_loop_invariant_baseptr.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly -aa-pipeline=tbaa -polly-parallel -polly-parallel-force -polly-parallel-force -polly-invariant-load-hoisting=true '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s -check-prefix=AST -; RUN: opt %loadNPMPolly -aa-pipeline=tbaa -polly-parallel -polly-parallel-force -polly-parallel-force -polly-invariant-load-hoisting=true -passes=polly-codegen -S -verify-dom-info < %s | FileCheck %s -check-prefix=IR +; RUN: opt %loadNPMPolly -aa-pipeline=tbaa -polly-parallel -polly-parallel-force -polly-parallel-force -polly-invariant-load-hoisting=true '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s -check-prefix=AST +; RUN: opt %loadNPMPolly -aa-pipeline=tbaa -polly-parallel -polly-parallel-force -polly-parallel-force -polly-invariant-load-hoisting=true '-passes=polly<no-default-opts>' -S -verify-dom-info < %s | FileCheck %s -check-prefix=IR ; #define N 1024 ; float A[N]; diff --git a/polly/test/CodeGen/OpenMP/single_loop_with_param.ll b/polly/test/CodeGen/OpenMP/single_loop_with_param.ll index f6dfd62d6bcc1..7334762f84f6c 100644 --- a/polly/test/CodeGen/OpenMP/single_loop_with_param.ll +++ b/polly/test/CodeGen/OpenMP/single_loop_with_param.ll @@ -1,18 +1,8 @@ -; RUN: opt %loadNPMPolly -polly-parallel \ -; RUN: -polly-parallel-force -passes=polly-codegen \ -; RUN: -S -verify-dom-info < %s \ -; RUN: | FileCheck %s -check-prefix=IR +; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force '-passes=polly<no-default-opts>' -S -verify-dom-info < %s | FileCheck %s -check-prefix=IR -; RUN: opt %loadNPMPolly -polly-parallel \ -; RUN: -polly-parallel-force -passes=polly-codegen -polly-omp-backend=LLVM \ -; RUN: -S -verify-dom-info < %s \ -; RUN: | FileCheck %s -check-prefix=LIBOMP-IR +; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force '-passes=polly<no-default-opts>' -polly-omp-backend=LLVM -S -verify-dom-info < %s | FileCheck %s -check-prefix=LIBOMP-IR -; RUN: opt %loadNPMPolly -polly-parallel \ -; RUN: -polly-parallel-force -passes=polly-codegen -polly-omp-backend=LLVM \ -; RUN: -polly-scheduling=static \ -; RUN: -S -verify-dom-info < %s \ -; RUN: | FileCheck %s -check-prefix=LIBOMP-STATIC-IR +; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force '-passes=polly<no-default-opts>' -polly-omp-backend=LLVM -polly-scheduling=static -S -verify-dom-info < %s | FileCheck %s -check-prefix=LIBOMP-STATIC-IR ; Ensure the scalars are initialized before the OpenMP code is launched. 
; diff --git a/polly/test/CodeGen/OpenMP/two-parallel-loops-reference-outer-indvar.ll b/polly/test/CodeGen/OpenMP/two-parallel-loops-reference-outer-indvar.ll index 934e04461f134..77c1b23a3f76c 100644 --- a/polly/test/CodeGen/OpenMP/two-parallel-loops-reference-outer-indvar.ll +++ b/polly/test/CodeGen/OpenMP/two-parallel-loops-reference-outer-indvar.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s -check-prefix=AST -; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force -passes=polly-codegen -S -verify-dom-info < %s | FileCheck %s -check-prefix=IR +; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s -check-prefix=AST +; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force '-passes=polly<no-default-opts>' -S -verify-dom-info < %s | FileCheck %s -check-prefix=IR ; This test case verifies that we create correct code even if two OpenMP loops ; share common outer variables. diff --git a/polly/test/CodeGen/PHIInExit.ll b/polly/test/CodeGen/PHIInExit.ll index 3e0c9d67d5ca8..39bdac793e8a1 100644 --- a/polly/test/CodeGen/PHIInExit.ll +++ b/polly/test/CodeGen/PHIInExit.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-codegen < %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' < %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" %struct..0__pthread_mutex_s = type { i32, i32, i32, i32, i32, i32, %struct.__pthread_list_t } diff --git a/polly/test/CodeGen/RuntimeDebugBuilder/combine_different_values.ll b/polly/test/CodeGen/RuntimeDebugBuilder/combine_different_values.ll index ccb0d15cfc3d2..9ec9804d35b0d 100644 --- a/polly/test/CodeGen/RuntimeDebugBuilder/combine_different_values.ll +++ b/polly/test/CodeGen/RuntimeDebugBuilder/combine_different_values.ll @@ -1,6 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-codegen -S \ -; RUN: -polly-codegen-add-debug-printing \ -; RUN: -polly-ignore-aliasing < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S -polly-codegen-add-debug-printing -polly-ignore-aliasing < %s | FileCheck %s ; #define N 10 ; void foo(float A[restrict], double B[restrict], char C[restrict], diff --git a/polly/test/CodeGen/RuntimeDebugBuilder/stmt_tracing.ll b/polly/test/CodeGen/RuntimeDebugBuilder/stmt_tracing.ll index 4ffb7fd6e4621..736c136eeb67c 100644 --- a/polly/test/CodeGen/RuntimeDebugBuilder/stmt_tracing.ll +++ b/polly/test/CodeGen/RuntimeDebugBuilder/stmt_tracing.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-codegen-trace-stmts -polly-codegen-trace-scalars -passes=polly-codegen -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly -polly-codegen-trace-stmts -polly-codegen-trace-scalars '-passes=polly<no-default-opts>' -S < %s | FileCheck %s ; define void @func(i32 %n, ptr %A) { diff --git a/polly/test/CodeGen/alias-check-multi-dim.ll b/polly/test/CodeGen/alias-check-multi-dim.ll index 0440bda74b391..bab2690bddb17 100644 --- a/polly/test/CodeGen/alias-check-multi-dim.ll +++ b/polly/test/CodeGen/alias-check-multi-dim.ll @@ -1,5 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-codegen \ -; RUN: -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" ; CHECK: sext i32 %indvar.init to i64 diff --git 
a/polly/test/CodeGen/alias_metadata_too_many_arrays.ll b/polly/test/CodeGen/alias_metadata_too_many_arrays.ll index 4186b8521a535..37ec2d5b748af 100644 --- a/polly/test/CodeGen/alias_metadata_too_many_arrays.ll +++ b/polly/test/CodeGen/alias_metadata_too_many_arrays.ll @@ -1,5 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-ignore-aliasing -S < %s \ -; RUN: | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-ignore-aliasing -S < %s | FileCheck %s ; ; void manyarrays(float A1[], float A2[], float A3[], float A4[], float A5[], ; float A6[], float A7[], float A8[], float A9[]) { diff --git a/polly/test/CodeGen/aliasing_different_base_and_access_type.ll b/polly/test/CodeGen/aliasing_different_base_and_access_type.ll index 8e1fc3b328355..7fed270cb51dd 100644 --- a/polly/test/CodeGen/aliasing_different_base_and_access_type.ll +++ b/polly/test/CodeGen/aliasing_different_base_and_access_type.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -S -passes=polly-codegen < %s | FileCheck %s +; RUN: opt %loadNPMPolly -S '-passes=polly<no-default-opts>' < %s | FileCheck %s ; ; We have to cast %B to "short *" before we create RTCs. ; diff --git a/polly/test/CodeGen/aliasing_different_pointer_types.ll b/polly/test/CodeGen/aliasing_different_pointer_types.ll index e601c22b978da..5326af339ddac 100644 --- a/polly/test/CodeGen/aliasing_different_pointer_types.ll +++ b/polly/test/CodeGen/aliasing_different_pointer_types.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s ; ; Check that we cast the different pointer types correctly before we compare ; them in the RTC's. We use i8* as max pointer type. diff --git a/polly/test/CodeGen/aliasing_multidimensional_access.ll b/polly/test/CodeGen/aliasing_multidimensional_access.ll index e1dae03280a0e..5d0b40d6b59aa 100644 --- a/polly/test/CodeGen/aliasing_multidimensional_access.ll +++ b/polly/test/CodeGen/aliasing_multidimensional_access.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -S -passes=polly-codegen < %s | FileCheck %s +; RUN: opt %loadNPMPolly -S '-passes=polly<no-default-opts>' < %s | FileCheck %s ; ; Check that we calculate the maximal access into array A correctly and track the overflow state. 
; diff --git a/polly/test/CodeGen/aliasing_parametric_simple_1.ll b/polly/test/CodeGen/aliasing_parametric_simple_1.ll index a79ba2532535d..1b7b85835d795 100644 --- a/polly/test/CodeGen/aliasing_parametric_simple_1.ll +++ b/polly/test/CodeGen/aliasing_parametric_simple_1.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s ; ; void jd(int *A, int *B, int c) { ; for (int i = 0; i < 1024; i++) diff --git a/polly/test/CodeGen/aliasing_parametric_simple_2.ll b/polly/test/CodeGen/aliasing_parametric_simple_2.ll index efe4af1c9e7c5..fa8053ccabbea 100644 --- a/polly/test/CodeGen/aliasing_parametric_simple_2.ll +++ b/polly/test/CodeGen/aliasing_parametric_simple_2.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s ; ; void jd(int *A, int *B, int c) { ; for (int i = 0; i < 1024; i++) diff --git a/polly/test/CodeGen/aliasing_struct_element.ll b/polly/test/CodeGen/aliasing_struct_element.ll index 3079e58d7daba..4e8570944f6c6 100644 --- a/polly/test/CodeGen/aliasing_struct_element.ll +++ b/polly/test/CodeGen/aliasing_struct_element.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -S -passes=polly-codegen < %s | FileCheck %s +; RUN: opt %loadNPMPolly -S '-passes=polly<no-default-opts>' < %s | FileCheck %s ; ; We should only access (or compute the address of) "the first element" of %S ; as it is a single struct not a struct array. The maximal access to S, thus diff --git a/polly/test/CodeGen/alignment.ll b/polly/test/CodeGen/alignment.ll index e0f6a959476f6..daf7999c8072b 100644 --- a/polly/test/CodeGen/alignment.ll +++ b/polly/test/CodeGen/alignment.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s ; ; Check that the special alignment information is kept ; diff --git a/polly/test/CodeGen/annotated_alias_scopes.ll b/polly/test/CodeGen/annotated_alias_scopes.ll index ada03e0663722..7d2d9038270a9 100644 --- a/polly/test/CodeGen/annotated_alias_scopes.ll +++ b/polly/test/CodeGen/annotated_alias_scopes.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s --check-prefix=SCOPES +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s --check-prefix=SCOPES ; ; Check that we create alias scopes that indicate the accesses to A, B and C cannot alias in any way. ; diff --git a/polly/test/CodeGen/blas_sscal_simplified.ll b/polly/test/CodeGen/blas_sscal_simplified.ll index 99f2eae9dd8e5..461af09b5b289 100644 --- a/polly/test/CodeGen/blas_sscal_simplified.ll +++ b/polly/test/CodeGen/blas_sscal_simplified.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-codegen < %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' < %s ; ; Regression test for a bug in the runtime check generation. 
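Several of the tests in this patch pipe nothing into FileCheck: they only assert that opt exits cleanly, which lit treats as the pass/fail criterion. The inverse pattern, for inputs that are expected to abort, wraps the invocation in not --crash, as bad_alignment.ll earlier in this patch does. Both shapes, sketched for illustration:

; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -disable-output < %s
; RUN: not --crash opt %loadNPMPolly '-passes=polly-custom<import-jscop>' -disable-output 2>&1 < %s | FileCheck %s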
diff --git a/polly/test/CodeGen/conflict-between-loop-invariant-code-hosting-and-escape-map-computation.ll b/polly/test/CodeGen/conflict-between-loop-invariant-code-hosting-and-escape-map-computation.ll index 5dba93373b70b..5eb6076892f3e 100644 --- a/polly/test/CodeGen/conflict-between-loop-invariant-code-hosting-and-escape-map-computation.ll +++ b/polly/test/CodeGen/conflict-between-loop-invariant-code-hosting-and-escape-map-computation.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-process-unprofitable -passes=polly-codegen -disable-output < %s +; RUN: opt %loadNPMPolly -polly-process-unprofitable '-passes=polly<no-default-opts>' -disable-output < %s ; ; CHECK: store i32 %tmp14_p_scalar_, ptr %tmp14.s2a ; CHECK: %tmp14.final_reload = load i32, ptr %tmp14.s2a diff --git a/polly/test/CodeGen/constant_condition.ll b/polly/test/CodeGen/constant_condition.ll index 905aa52df5080..9d3c5a811b16a 100644 --- a/polly/test/CodeGen/constant_condition.ll +++ b/polly/test/CodeGen/constant_condition.ll @@ -1,4 +1,4 @@ -;RUN: opt %loadNPMPolly '-passes=polly-prepare,scop(print<polly-ast>)' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<prepare;ast>' -polly-print-ast -disable-output < %s 2>&1 | FileCheck %s ;#include <string.h> ;int A[1]; diff --git a/polly/test/CodeGen/create-conditional-scop.ll b/polly/test/CodeGen/create-conditional-scop.ll index b8c9a81b71a91..d4df48b757d3d 100644 --- a/polly/test/CodeGen/create-conditional-scop.ll +++ b/polly/test/CodeGen/create-conditional-scop.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -passes=polly-codegen -verify-loop-info < %s -S | FileCheck %s +; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=polly<no-default-opts>' -verify-loop-info -S < %s | FileCheck %s target datalayout = "e-p:32:32:32-i64:64:64-i32:32:32-i16:16:16-i1:32:32-f64:64:64-f32:32:32-a0:0-n32" diff --git a/polly/test/CodeGen/dead_invariant_load_instruction_referenced_by_parameter_1.ll b/polly/test/CodeGen/dead_invariant_load_instruction_referenced_by_parameter_1.ll index dfef4202391d4..31b5e69ae4c6a 100644 --- a/polly/test/CodeGen/dead_invariant_load_instruction_referenced_by_parameter_1.ll +++ b/polly/test/CodeGen/dead_invariant_load_instruction_referenced_by_parameter_1.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-codegen < %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' < %s ; ; Check we do not crash even though the dead %tmp8 is referenced by a parameter ; and we do not pre-load it (as it is dead). diff --git a/polly/test/CodeGen/dead_invariant_load_instruction_referenced_by_parameter_2.ll b/polly/test/CodeGen/dead_invariant_load_instruction_referenced_by_parameter_2.ll index fcc6764ce9c21..88b844bea5e4e 100644 --- a/polly/test/CodeGen/dead_invariant_load_instruction_referenced_by_parameter_2.ll +++ b/polly/test/CodeGen/dead_invariant_load_instruction_referenced_by_parameter_2.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-codegen < %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' < %s ; ; Check we do not crash even though there is a dead load that is referenced by ; a parameter and we do not pre-load it (as it is dead). 
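In the hunks that follow (debug-intrinsics.ll, exprModDiv.ll, hoisted_load_escapes_through_phi.ll), RUN invocations that were spread over backslash continuations are folded into single physical lines while being migrated. lit concatenates continued '; RUN:' lines before executing them, so the two spellings are equivalent; the continued form and its single-line replacement are sketched here:

; RUN: opt %loadNPMPolly \
; RUN:   -polly-analyze-read-only-scalars=false '-passes=polly<no-default-opts>' -S < %s | \
; RUN:   FileCheck %s
; RUN: opt %loadNPMPolly -polly-analyze-read-only-scalars=false '-passes=polly<no-default-opts>' -S < %s | FileCheck %s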
diff --git a/polly/test/CodeGen/debug-intrinsics.ll b/polly/test/CodeGen/debug-intrinsics.ll
index ed4b81a8e3a3c..f397a4b83d88a 100644
--- a/polly/test/CodeGen/debug-intrinsics.ll
+++ b/polly/test/CodeGen/debug-intrinsics.ll
@@ -1,10 +1,6 @@
-; RUN: opt %loadNPMPolly \
-; RUN: -polly-analyze-read-only-scalars=false -passes=polly-codegen -S < %s | \
-; RUN: FileCheck %s
+; RUN: opt %loadNPMPolly -polly-analyze-read-only-scalars=false '-passes=polly<no-default-opts>' -S < %s | FileCheck %s
 
-; RUN: opt %loadNPMPolly \
-; RUN: -polly-analyze-read-only-scalars=true -passes=polly-codegen -S < %s | \
-; RUN: FileCheck %s
+; RUN: opt %loadNPMPolly -polly-analyze-read-only-scalars=true '-passes=polly<no-default-opts>' -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
diff --git a/polly/test/CodeGen/dominance_problem_after_early_codegen_bailout.ll b/polly/test/CodeGen/dominance_problem_after_early_codegen_bailout.ll
index edc03333a358d..7f6f128c2cff2 100644
--- a/polly/test/CodeGen/dominance_problem_after_early_codegen_bailout.ll
+++ b/polly/test/CodeGen/dominance_problem_after_early_codegen_bailout.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -passes=polly-codegen -disable-output < %s
+; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -disable-output < %s
 ;
 ; This caused dominance problems at some point as we do bail out during
 ; code generation. Just verify it runs through.
diff --git a/polly/test/CodeGen/empty_domain_in_context.ll b/polly/test/CodeGen/empty_domain_in_context.ll
index a2fe805f402e0..f6c39eb0517bc 100644
--- a/polly/test/CodeGen/empty_domain_in_context.ll
+++ b/polly/test/CodeGen/empty_domain_in_context.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-optree,polly-opt-isl,polly-codegen' -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<optree;opt-isl;codegen>' -S < %s | FileCheck %s
 ;
 ; llvm.org/PR35362
 ; isl codegen does not allow to generate isl_ast_expr from pw_aff which have an
diff --git a/polly/test/CodeGen/entry_with_trivial_phi.ll b/polly/test/CodeGen/entry_with_trivial_phi.ll
index f2c9da04d6495..09570938a9ca1 100644
--- a/polly/test/CodeGen/entry_with_trivial_phi.ll
+++ b/polly/test/CodeGen/entry_with_trivial_phi.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s
+; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s
 ;
 ; The entry of this scop's simple region (entry.split => for.end) has an trivial
 ; PHI node. LCSSA may create such PHI nodes. This is a breakdown of this case in
diff --git a/polly/test/CodeGen/entry_with_trivial_phi_other_bb.ll b/polly/test/CodeGen/entry_with_trivial_phi_other_bb.ll
index 2f1ec1a7872aa..7d8ef7acf9435 100644
--- a/polly/test/CodeGen/entry_with_trivial_phi_other_bb.ll
+++ b/polly/test/CodeGen/entry_with_trivial_phi_other_bb.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s
 ;
 ; The entry of this scop's simple region (entry.split => for.end) has an trivial
 ; PHI node that is used in a different of the scop region. LCSSA may create such
diff --git a/polly/test/CodeGen/error-stmt-in-non-affine-region.ll b/polly/test/CodeGen/error-stmt-in-non-affine-region.ll
index 63b6becd19574..c5c11c8ea2f8f 100644
--- a/polly/test/CodeGen/error-stmt-in-non-affine-region.ll
+++ b/polly/test/CodeGen/error-stmt-in-non-affine-region.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -S -passes=polly-codegen < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -S '-passes=polly<no-default-opts>' < %s | FileCheck %s
 ; XFAIL: *
 ;
 ; CHECK-LABEL: polly.stmt.if.then:
diff --git a/polly/test/CodeGen/error_block_contains_invalid_memory_access.ll b/polly/test/CodeGen/error_block_contains_invalid_memory_access.ll
index abec28894f45b..1e38210c733d9 100644
--- a/polly/test/CodeGen/error_block_contains_invalid_memory_access.ll
+++ b/polly/test/CodeGen/error_block_contains_invalid_memory_access.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -passes=polly-codegen < %s
+; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' < %s
 ;
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
diff --git a/polly/test/CodeGen/exprModDiv.ll b/polly/test/CodeGen/exprModDiv.ll
index c9b419abe3242..b123e90c07882 100644
--- a/polly/test/CodeGen/exprModDiv.ll
+++ b/polly/test/CodeGen/exprModDiv.ll
@@ -1,8 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,polly-codegen' \
-; RUN: -S < %s | FileCheck %s
-; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,polly-codegen' \
-; RUN: -polly-import-jscop-postfix=pow2 \
-; RUN: -S < %s | FileCheck %s -check-prefix=POW2
+; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;codegen>' -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;codegen>' -polly-import-jscop-postfix=pow2 -S < %s | FileCheck %s -check-prefix=POW2
 ;
 ; void exprModDiv(float *A, float *B, float *C, long N, long p) {
 ; for (long i = 0; i < N; i++)
diff --git a/polly/test/CodeGen/hoisted_load_escapes_through_phi.ll b/polly/test/CodeGen/hoisted_load_escapes_through_phi.ll
index 1ca2413fd5e19..c7873baeeaeb7 100644
--- a/polly/test/CodeGen/hoisted_load_escapes_through_phi.ll
+++ b/polly/test/CodeGen/hoisted_load_escapes_through_phi.ll
@@ -1,7 +1,5 @@
-; RUN: opt %loadNPMPolly -S -passes=polly-codegen \
-; RUN: -polly-invariant-load-hoisting=false < %s | FileCheck %s
-; RUN: opt %loadNPMPolly -S -passes=polly-codegen \
-; RUN: -polly-invariant-load-hoisting=true < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -S '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=false < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -S '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=true < %s | FileCheck %s
 ;
 ; Check that we generate valid code even if the load of cont_STACKPOINTER is
 ; hoisted in one SCoP and used (through the phi node %tmp2).
diff --git a/polly/test/CodeGen/hoisting_1.ll b/polly/test/CodeGen/hoisting_1.ll
index aa29bfd7dbcbc..31ae969cd3156 100644
--- a/polly/test/CodeGen/hoisting_1.ll
+++ b/polly/test/CodeGen/hoisting_1.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -aa-pipeline=tbaa -passes=polly-codegen -polly-allow-differing-element-types -disable-output %s
+; RUN: opt %loadNPMPolly -aa-pipeline=tbaa '-passes=polly<no-default-opts>' -polly-allow-differing-element-types -disable-output %s
 ;
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
diff --git a/polly/test/CodeGen/hoisting_2.ll b/polly/test/CodeGen/hoisting_2.ll
index 1b913f2cb07be..eb6f7ae5ff6d1 100644
--- a/polly/test/CodeGen/hoisting_2.ll
+++ b/polly/test/CodeGen/hoisting_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -aa-pipeline=tbaa -passes=polly-codegen -polly-allow-differing-element-types -disable-output %s
+; RUN: opt %loadNPMPolly -aa-pipeline=tbaa '-passes=polly<no-default-opts>' -polly-allow-differing-element-types -disable-output %s
 ;
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
diff --git a/polly/test/CodeGen/inner_scev_sdiv_1.ll b/polly/test/CodeGen/inner_scev_sdiv_1.ll
index d210105c46baf..f7595a6afb0be 100644
--- a/polly/test/CodeGen/inner_scev_sdiv_1.ll
+++ b/polly/test/CodeGen/inner_scev_sdiv_1.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -S -passes=polly-codegen < %s
+; RUN: opt %loadNPMPolly -S '-passes=polly<no-default-opts>' < %s
 ;
 ; Excerpt from the test-suite's oggenc reduced using bugpoint.
 ;
diff --git a/polly/test/CodeGen/inner_scev_sdiv_2.ll b/polly/test/CodeGen/inner_scev_sdiv_2.ll
index 33233fe2fdf17..247c102834b25 100644
--- a/polly/test/CodeGen/inner_scev_sdiv_2.ll
+++ b/polly/test/CodeGen/inner_scev_sdiv_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -S -passes=polly-codegen < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -S '-passes=polly<no-default-opts>' < %s | FileCheck %s
 ;
 ; The SCEV expression in this test case refers to a sequence of sdiv
 ; instructions, which are part of different bbs in the SCoP. When code
diff --git a/polly/test/CodeGen/inner_scev_sdiv_3.ll b/polly/test/CodeGen/inner_scev_sdiv_3.ll
index a8c626347efe9..fc1cce41c0f4e 100644
--- a/polly/test/CodeGen/inner_scev_sdiv_3.ll
+++ b/polly/test/CodeGen/inner_scev_sdiv_3.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -S -passes=polly-codegen < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -S '-passes=polly<no-default-opts>' < %s | FileCheck %s
 ;
 ; This test case has a inner SCEV sdiv that will escape the SCoP. Just check we
 ; do not crash and generate valid code.
diff --git a/polly/test/CodeGen/inner_scev_sdiv_in_lb.ll b/polly/test/CodeGen/inner_scev_sdiv_in_lb.ll
index 31c14e85f253e..1ff598a4a021a 100644
--- a/polly/test/CodeGen/inner_scev_sdiv_in_lb.ll
+++ b/polly/test/CodeGen/inner_scev_sdiv_in_lb.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s --check-prefix=CODEGEN
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s --check-prefix=CODEGEN
 ;
 ; CHECK: [N] -> { Stmt_bb11[i0, i1] : i0 < N and i1 >= 0 and 3i1 <= -3 + i0 };
 ; CODEGEN: polly
diff --git a/polly/test/CodeGen/inner_scev_sdiv_in_lb_invariant.ll b/polly/test/CodeGen/inner_scev_sdiv_in_lb_invariant.ll
index b42371b0891e6..4cd146ddbf62e 100644
--- a/polly/test/CodeGen/inner_scev_sdiv_in_lb_invariant.ll
+++ b/polly/test/CodeGen/inner_scev_sdiv_in_lb_invariant.ll
@@ -1,5 +1,4 @@
-; RUN: opt %loadNPMPolly -S -passes=polly-codegen \
-; RUN: < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -S '-passes=polly<no-default-opts>' < %s | FileCheck %s
 ;
 ; Check that this will not crash our code generation.
 ;
diff --git a/polly/test/CodeGen/inner_scev_sdiv_in_rtc.ll b/polly/test/CodeGen/inner_scev_sdiv_in_rtc.ll
index 45af63402c986..586875bbefcbe 100644
--- a/polly/test/CodeGen/inner_scev_sdiv_in_rtc.ll
+++ b/polly/test/CodeGen/inner_scev_sdiv_in_rtc.ll
@@ -1,5 +1,4 @@
-; RUN: opt %loadNPMPolly -passes=polly-codegen \
-; RUN: -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s
 ;
 ; This will just check that we generate valid code here.
 ;
diff --git a/polly/test/CodeGen/intrinsics_lifetime.ll b/polly/test/CodeGen/intrinsics_lifetime.ll
index a708548798ebb..0f35664eb7e1c 100644
--- a/polly/test/CodeGen/intrinsics_lifetime.ll
+++ b/polly/test/CodeGen/intrinsics_lifetime.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -passes=polly-codegen -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=polly<no-default-opts>' -S < %s | FileCheck %s
 ;
 ; Verify that we remove the lifetime markers from everywhere.
 ;
diff --git a/polly/test/CodeGen/intrinsics_misc.ll b/polly/test/CodeGen/intrinsics_misc.ll
index a643b8accd4e9..4a64c1a641182 100644
--- a/polly/test/CodeGen/intrinsics_misc.ll
+++ b/polly/test/CodeGen/intrinsics_misc.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -passes=polly-codegen -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=polly<no-default-opts>' -S < %s | FileCheck %s
 ;
 ; Verify that we remove the misc intrinsics from the optimized SCoP.
 ;
diff --git a/polly/test/CodeGen/inv-load-lnt-crash-wrong-order-2.ll b/polly/test/CodeGen/inv-load-lnt-crash-wrong-order-2.ll
index e7cbf748bea73..15fe0d9e22416 100644
--- a/polly/test/CodeGen/inv-load-lnt-crash-wrong-order-2.ll
+++ b/polly/test/CodeGen/inv-load-lnt-crash-wrong-order-2.ll
@@ -1,5 +1,4 @@
-; RUN: opt %loadNPMPolly -passes=polly-codegen -S \
-; RUN: -polly-invariant-load-hoisting=true < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S -polly-invariant-load-hoisting=true < %s | FileCheck %s
 ;
 ; This crashed our codegen at some point, verify it runs through
 ;
diff --git a/polly/test/CodeGen/inv-load-lnt-crash-wrong-order-3.ll b/polly/test/CodeGen/inv-load-lnt-crash-wrong-order-3.ll
index 24e9240c234d1..c1ab026e97701 100644
--- a/polly/test/CodeGen/inv-load-lnt-crash-wrong-order-3.ll
+++ b/polly/test/CodeGen/inv-load-lnt-crash-wrong-order-3.ll
@@ -1,5 +1,4 @@
-; RUN: opt %loadNPMPolly -passes=polly-codegen -S \
-; RUN: -polly-invariant-load-hoisting=true < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S -polly-invariant-load-hoisting=true < %s | FileCheck %s
 ;
 ; This crashed our codegen at some point, verify it runs through
 ;
diff --git a/polly/test/CodeGen/inv-load-lnt-crash-wrong-order.ll b/polly/test/CodeGen/inv-load-lnt-crash-wrong-order.ll
index d1d861e316ee4..f0c833ce1bce1 100644
--- a/polly/test/CodeGen/inv-load-lnt-crash-wrong-order.ll
+++ b/polly/test/CodeGen/inv-load-lnt-crash-wrong-order.ll
@@ -1,5 +1,4 @@
-; RUN: opt %loadNPMPolly -passes=polly-codegen -S \
-; RUN: -polly-invariant-load-hoisting=true < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S -polly-invariant-load-hoisting=true < %s | FileCheck %s
 ;
 ; This crashed our codegen at some point, verify it runs through
 ;
diff --git a/polly/test/CodeGen/invariant-load-dimension.ll b/polly/test/CodeGen/invariant-load-dimension.ll
index 21e53055c56b0..13576b9f40455 100644
--- a/polly/test/CodeGen/invariant-load-dimension.ll
+++ b/polly/test/CodeGen/invariant-load-dimension.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly -polly-process-unprofitable -polly-invariant-load-hoisting '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s -check-prefix=SCOPS
-; RUN: opt %loadNPMPolly -S < %s -passes=polly-codegen -polly-process-unprofitable -polly-invariant-load-hoisting | FileCheck %s -check-prefix=CODEGEN
+; RUN: opt %loadNPMPolly -polly-process-unprofitable -polly-invariant-load-hoisting '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s -check-prefix=SCOPS
+; RUN: opt %loadNPMPolly -S '-passes=polly<no-default-opts>' -polly-process-unprofitable -polly-invariant-load-hoisting < %s | FileCheck %s -check-prefix=CODEGEN
 
 target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n8:16:32-S64"
 
diff --git a/polly/test/CodeGen/invariant-load-preload-base-pointer-origin-first.ll b/polly/test/CodeGen/invariant-load-preload-base-pointer-origin-first.ll
index 1fd9cb81771c6..d92d97012b33c 100644
--- a/polly/test/CodeGen/invariant-load-preload-base-pointer-origin-first.ll
+++ b/polly/test/CodeGen/invariant-load-preload-base-pointer-origin-first.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -S -passes=polly-codegen -polly-invariant-load-hoisting=true < %s
+; RUN: opt %loadNPMPolly -S '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=true < %s
 ;
 ; Check that we generate valid code as we did non preload the base pointer
 ; origin of %tmp4 at some point.
diff --git a/polly/test/CodeGen/invariant_cannot_handle_void.ll b/polly/test/CodeGen/invariant_cannot_handle_void.ll
index 420cb608f9ba4..f6dcac08dffca 100644
--- a/polly/test/CodeGen/invariant_cannot_handle_void.ll
+++ b/polly/test/CodeGen/invariant_cannot_handle_void.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly -polly-invariant-load-hoisting=true '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=SCOP
-; RUN: opt %loadNPMPolly -S -passes=polly-codegen -polly-invariant-load-hoisting=true %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-invariant-load-hoisting=true '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s --check-prefix=SCOP
+; RUN: opt %loadNPMPolly -S '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=true %s | FileCheck %s
 ;
 ; The offset of the %tmp1 load wrt. to %buff (62 bytes) is not divisible
 ; by the type size (i32 = 4 bytes), thus we will have to represent %buff
diff --git a/polly/test/CodeGen/invariant_load.ll b/polly/test/CodeGen/invariant_load.ll
index 2d5e6042ea6a4..c89da73efc839 100644
--- a/polly/test/CodeGen/invariant_load.ll
+++ b/polly/test/CodeGen/invariant_load.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=true -S < %s | FileCheck %s
 ;
 ; CHECK-LABEL: polly.preload.begin:
 ; CHECK-NEXT: %polly.access.B = getelementptr i32, ptr %B, i64 0
diff --git a/polly/test/CodeGen/invariant_load_address_space.ll b/polly/test/CodeGen/invariant_load_address_space.ll
index 3d1958e5b8a43..7d5139cc55f88 100644
--- a/polly/test/CodeGen/invariant_load_address_space.ll
+++ b/polly/test/CodeGen/invariant_load_address_space.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=true -S < %s | FileCheck %s
 ;
 ; CHECK-LABEL: polly.preload.begin:
 ; CHECK-NEXT: %polly.access.B = getelementptr i32, ptr addrspace(1) %B, i64 0
diff --git a/polly/test/CodeGen/invariant_load_alias_metadata.ll b/polly/test/CodeGen/invariant_load_alias_metadata.ll
index 252463384a5c8..2a704ee9c576a 100644
--- a/polly/test/CodeGen/invariant_load_alias_metadata.ll
+++ b/polly/test/CodeGen/invariant_load_alias_metadata.ll
@@ -1,5 +1,4 @@
-; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true \
-; RUN: -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=true -S < %s | FileCheck %s
 ;
 ; This test case checks whether Polly generates alias metadata in case of
 ; the ublas gemm kernel and polly-invariant-load-hoisting.
diff --git a/polly/test/CodeGen/invariant_load_base_pointer.ll b/polly/test/CodeGen/invariant_load_base_pointer.ll
index d4ac433475f05..f6b873994036c 100644
--- a/polly/test/CodeGen/invariant_load_base_pointer.ll
+++ b/polly/test/CodeGen/invariant_load_base_pointer.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true -polly-ignore-aliasing -polly-process-unprofitable -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=true -polly-ignore-aliasing -polly-process-unprofitable -S < %s | FileCheck %s
 ;
 ; CHECK-LABEL: polly.preload.begin:
 ; CHECK-NEXT: %polly.access.BPLoc = getelementptr ptr, ptr %BPLoc, i64 0
diff --git a/polly/test/CodeGen/invariant_load_base_pointer_conditional.ll b/polly/test/CodeGen/invariant_load_base_pointer_conditional.ll
index 06a9a93363ed9..4dbcc3b3b049d 100644
--- a/polly/test/CodeGen/invariant_load_base_pointer_conditional.ll
+++ b/polly/test/CodeGen/invariant_load_base_pointer_conditional.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true -polly-ignore-aliasing -polly-process-unprofitable -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=true -polly-ignore-aliasing -polly-process-unprofitable -S < %s | FileCheck %s
 ;
 ; CHECK-LABEL: polly.preload.begin:
 ; CHECK-NEXT: %0 = sext i32 %N to i64
diff --git a/polly/test/CodeGen/invariant_load_base_pointer_conditional_2.ll b/polly/test/CodeGen/invariant_load_base_pointer_conditional_2.ll
index 66ab9a31b1032..39520c8fd8217 100644
--- a/polly/test/CodeGen/invariant_load_base_pointer_conditional_2.ll
+++ b/polly/test/CodeGen/invariant_load_base_pointer_conditional_2.ll
@@ -1,6 +1,6 @@
-; RUN: opt %loadNPMPolly -polly-invariant-load-hoisting=true '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly -S -passes=polly-codegen -polly-invariant-load-hoisting=true < %s | FileCheck %s --check-prefix=IR
-; RUN: opt %loadNPMPolly -S -passes=polly-codegen -polly-invariant-load-hoisting=true --polly-overflow-tracking=always < %s | FileCheck %s --check-prefix=IRA
+; RUN: opt %loadNPMPolly -polly-invariant-load-hoisting=true '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -S '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=true < %s | FileCheck %s --check-prefix=IR
+; RUN: opt %loadNPMPolly -S '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=true --polly-overflow-tracking=always < %s | FileCheck %s --check-prefix=IRA
 ;
 ; As (p + q) can overflow we have to check that we load from
 ; I[p + q] only if it does not.
diff --git a/polly/test/CodeGen/invariant_load_canonicalize_array_baseptrs.ll b/polly/test/CodeGen/invariant_load_canonicalize_array_baseptrs.ll
index fa904e9b96d34..414ca127a251f 100644
--- a/polly/test/CodeGen/invariant_load_canonicalize_array_baseptrs.ll
+++ b/polly/test/CodeGen/invariant_load_canonicalize_array_baseptrs.ll
@@ -1,6 +1,4 @@
-; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s \
-; RUN: -polly-invariant-load-hoisting \
-; RUN: | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S -polly-invariant-load-hoisting < %s | FileCheck %s
 
 ; CHECK: %polly.access.A = getelementptr ptr, ptr %A, i64 0
 ; CHECK: %polly.access.A.load = load ptr, ptr %polly.access.A
diff --git a/polly/test/CodeGen/invariant_load_condition.ll b/polly/test/CodeGen/invariant_load_condition.ll
index 36e588329d669..f0782c023378b 100644
--- a/polly/test/CodeGen/invariant_load_condition.ll
+++ b/polly/test/CodeGen/invariant_load_condition.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-process-unprofitable -passes=polly-codegen -polly-invariant-load-hoisting=true -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-process-unprofitable '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=true -S < %s | FileCheck %s
 ;
 ; CHECK-LABEL: polly.preload.begin:
 ; CHECK-NEXT: %polly.access.C = getelementptr i32, ptr %C, i64 0
diff --git a/polly/test/CodeGen/invariant_load_different_sized_types.ll b/polly/test/CodeGen/invariant_load_different_sized_types.ll
index 0a88bb70966d2..034c3587a0708 100644
--- a/polly/test/CodeGen/invariant_load_different_sized_types.ll
+++ b/polly/test/CodeGen/invariant_load_different_sized_types.ll
@@ -1,5 +1,4 @@
-; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true -S \
-; RUN: -polly-allow-differing-element-types < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=true -S -polly-allow-differing-element-types < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
diff --git a/polly/test/CodeGen/invariant_load_escaping.ll b/polly/test/CodeGen/invariant_load_escaping.ll
index 416148b72303b..85578d3ba0992 100644
--- a/polly/test/CodeGen/invariant_load_escaping.ll
+++ b/polly/test/CodeGen/invariant_load_escaping.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=true -S < %s | FileCheck %s
 ;
 ; int f(int *A, int *B) {
 ; // Possible aliasing between A and B but if not then *B would be
diff --git a/polly/test/CodeGen/invariant_load_escaping_second_scop.ll b/polly/test/CodeGen/invariant_load_escaping_second_scop.ll
index 906bfc1805d39..ff6e9a8e3ddae 100644
--- a/polly/test/CodeGen/invariant_load_escaping_second_scop.ll
+++ b/polly/test/CodeGen/invariant_load_escaping_second_scop.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true -polly-process-unprofitable -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=true -polly-process-unprofitable -S < %s | FileCheck %s
 ;
 ; void fence(void);
 ;
diff --git a/polly/test/CodeGen/invariant_load_in_non_affine_subregion.ll b/polly/test/CodeGen/invariant_load_in_non_affine_subregion.ll
index ab02e639f0d2a..edd38cab2afba 100644
--- a/polly/test/CodeGen/invariant_load_in_non_affine_subregion.ll
+++ b/polly/test/CodeGen/invariant_load_in_non_affine_subregion.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=true -S < %s | FileCheck %s
 ;
 ; This crashed at some point as the invariant load is in a non-affine
 ; subregion. Just check it does not anymore.
diff --git a/polly/test/CodeGen/invariant_load_loop_ub.ll b/polly/test/CodeGen/invariant_load_loop_ub.ll
index 1db27ad8e58ba..923102440c547 100644
--- a/polly/test/CodeGen/invariant_load_loop_ub.ll
+++ b/polly/test/CodeGen/invariant_load_loop_ub.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true -polly-process-unprofitable -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=true -polly-process-unprofitable -S < %s | FileCheck %s
 ;
 ; CHECK: polly.start
 ;
diff --git a/polly/test/CodeGen/invariant_load_not_executed_but_in_parameters.ll b/polly/test/CodeGen/invariant_load_not_executed_but_in_parameters.ll
index 5a11adcdebbc5..0e381b863fb8b 100644
--- a/polly/test/CodeGen/invariant_load_not_executed_but_in_parameters.ll
+++ b/polly/test/CodeGen/invariant_load_not_executed_but_in_parameters.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true -disable-output < %s
+; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=true -disable-output < %s
 ;
 ; Check that this does not crash as the invariant load is not executed (thus
 ; not preloaded) but still referenced by one of the parameters.
diff --git a/polly/test/CodeGen/invariant_load_outermost.ll b/polly/test/CodeGen/invariant_load_outermost.ll
index 7e0550fb3be94..bbbe1f1663964 100644
--- a/polly/test/CodeGen/invariant_load_outermost.ll
+++ b/polly/test/CodeGen/invariant_load_outermost.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=true -S < %s | FileCheck %s
 
 ; CHECK: polly.start
 
diff --git a/polly/test/CodeGen/invariant_load_parameters_cyclic_dependence.ll b/polly/test/CodeGen/invariant_load_parameters_cyclic_dependence.ll
index abf957b556daa..9fe343f752d14 100644
--- a/polly/test/CodeGen/invariant_load_parameters_cyclic_dependence.ll
+++ b/polly/test/CodeGen/invariant_load_parameters_cyclic_dependence.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s --check-prefix=SCOP
-; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s --check-prefix=SCOP
+; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=true -S < %s | FileCheck %s
 ;
 ; SCOP: Assumed Context:
 ; SCOP-NEXT: [p_0, tmp4] -> { : }
diff --git a/polly/test/CodeGen/invariant_load_ptr_ptr_noalias.ll b/polly/test/CodeGen/invariant_load_ptr_ptr_noalias.ll
index b565f1bd5096a..dc1c2bca4b6e3 100644
--- a/polly/test/CodeGen/invariant_load_ptr_ptr_noalias.ll
+++ b/polly/test/CodeGen/invariant_load_ptr_ptr_noalias.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-process-unprofitable -passes=polly-codegen -polly-invariant-load-hoisting=true -polly-ignore-aliasing -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-process-unprofitable '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=true -polly-ignore-aliasing -S < %s | FileCheck %s
 ;
 ; CHECK-LABEL: polly.preload.begin:
 ; CHECK: %polly.access.A = getelementptr ptr, ptr %A, i64 42
diff --git a/polly/test/CodeGen/invariant_load_scalar_dep.ll b/polly/test/CodeGen/invariant_load_scalar_dep.ll
index ba2999e27984d..bb60c50b1ab40 100644
--- a/polly/test/CodeGen/invariant_load_scalar_dep.ll
+++ b/polly/test/CodeGen/invariant_load_scalar_dep.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true -polly-ignore-aliasing -polly-process-unprofitable -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=true -polly-ignore-aliasing -polly-process-unprofitable -S < %s | FileCheck %s
 ;
 ; CHECK-LABEL: polly.preload.begin:
 ; CHECK: %polly.access.B = getelementptr i32, ptr %B, i64 0
diff --git a/polly/test/CodeGen/invariant_load_scalar_escape_alloca_sharing.ll b/polly/test/CodeGen/invariant_load_scalar_escape_alloca_sharing.ll
index 26c964c9c6a72..87c407e05b972 100644
--- a/polly/test/CodeGen/invariant_load_scalar_escape_alloca_sharing.ll
+++ b/polly/test/CodeGen/invariant_load_scalar_escape_alloca_sharing.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=true -S < %s | FileCheck %s
 ;
 ; Verify the preloaded %tmp0 is stored and communicated in the same alloca.
 ; In this case, we do not reload %ncol.load from the scalar stack slot, but
diff --git a/polly/test/CodeGen/invariant_loads_from_struct_with_different_types_1.ll b/polly/test/CodeGen/invariant_loads_from_struct_with_different_types_1.ll
index 6bf11d5697bd7..5e2b28c53019e 100644
--- a/polly/test/CodeGen/invariant_loads_from_struct_with_different_types_1.ll
+++ b/polly/test/CodeGen/invariant_loads_from_struct_with_different_types_1.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true < %s
+; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=true < %s
 ;
 ; Check we do not crash even though we pre-load values with different types
 ; from the same base pointer.
diff --git a/polly/test/CodeGen/invariant_loads_from_struct_with_different_types_2.ll b/polly/test/CodeGen/invariant_loads_from_struct_with_different_types_2.ll
index 07ce941522459..20d9f6d40b7d6 100644
--- a/polly/test/CodeGen/invariant_loads_from_struct_with_different_types_2.ll
+++ b/polly/test/CodeGen/invariant_loads_from_struct_with_different_types_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true < %s
+; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=true < %s
 ;
 ; Check we do not crash even though we pre-load values with different types
 ; from the same base pointer.
diff --git a/polly/test/CodeGen/invariant_loads_ignore_parameter_bounds.ll b/polly/test/CodeGen/invariant_loads_ignore_parameter_bounds.ll
index 19b30afd33ba7..51f8a55d1a400 100644
--- a/polly/test/CodeGen/invariant_loads_ignore_parameter_bounds.ll
+++ b/polly/test/CodeGen/invariant_loads_ignore_parameter_bounds.ll
@@ -1,5 +1,4 @@
-; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting \
-; RUN: -polly-ignore-parameter-bounds -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting -polly-ignore-parameter-bounds -S < %s | FileCheck %s
 
 ; CHECK: polly.preload.begin:
 ; CHECK-NEXT: %global.load = load i32, ptr @global, align 4, !alias.scope !0, !noalias !3
diff --git a/polly/test/CodeGen/invariant_verify_function_failed.ll b/polly/test/CodeGen/invariant_verify_function_failed.ll
index 1dcc175ebb163..432c155fdd3ae 100644
--- a/polly/test/CodeGen/invariant_verify_function_failed.ll
+++ b/polly/test/CodeGen/invariant_verify_function_failed.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-detect>,scop(polly-codegen)' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-print-detect -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; This crashed at some point as the pointer returned by the call
 ; to @__errno_location is invariant and defined in the SCoP but not
diff --git a/polly/test/CodeGen/invariant_verify_function_failed_2.ll b/polly/test/CodeGen/invariant_verify_function_failed_2.ll
index 43b3d99e11a2f..65ba2cd993193 100644
--- a/polly/test/CodeGen/invariant_verify_function_failed_2.ll
+++ b/polly/test/CodeGen/invariant_verify_function_failed_2.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly -S '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s -check-prefix=SCOPS
-; RUN: opt %loadNPMPolly -S -passes=polly-codegen -polly-invariant-load-hoisting=true %s | FileCheck %s
+; RUN: opt %loadNPMPolly -S '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s -check-prefix=SCOPS
+; RUN: opt %loadNPMPolly -S '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=true %s | FileCheck %s
 ;
 ; Check we generate valid code.
 
diff --git a/polly/test/CodeGen/issue56692.ll b/polly/test/CodeGen/issue56692.ll
index 34c4e398e2ac0..5e225d73bdcd3 100644
--- a/polly/test/CodeGen/issue56692.ll
+++ b/polly/test/CodeGen/issue56692.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force -polly-omp-backend=LLVM -polly-codegen-verify -passes=polly-codegen -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force -polly-omp-backend=LLVM -polly-codegen-verify '-passes=polly<no-default-opts>' -S < %s | FileCheck %s
 ; https://github.com/llvm/llvm-project/issues/56692
 ;
 ; CHECK: call void (ptr, i32, ptr, ...) @__kmpc_fork_call({{.*}}), !dbg ![[OPTLOC:[0-9]+]]
diff --git a/polly/test/CodeGen/large-numbers-in-boundary-context.ll b/polly/test/CodeGen/large-numbers-in-boundary-context.ll
index b228baf9bdf22..4d55273618df6 100644
--- a/polly/test/CodeGen/large-numbers-in-boundary-context.ll
+++ b/polly/test/CodeGen/large-numbers-in-boundary-context.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -S -passes=polly-codegen < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -S '-passes=polly<no-default-opts>' < %s | FileCheck %s
 ; XFAIL: *
 ;
 ; The boundary context contains a constant that does not fit in 64 bits. Hence,
diff --git a/polly/test/CodeGen/load_subset_with_context.ll b/polly/test/CodeGen/load_subset_with_context.ll
index ccd4198b9fe85..33b3d3b72225f 100644
--- a/polly/test/CodeGen/load_subset_with_context.ll
+++ b/polly/test/CodeGen/load_subset_with_context.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,polly-codegen' -polly-import-jscop-postfix=transformed -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;codegen>' -polly-import-jscop-postfix=transformed -S < %s | FileCheck %s
 ;
 ; A load must provide a value for every statement instance.
 ; Statement instances not in the SCoP's context are irrelevant.
diff --git a/polly/test/CodeGen/loop-invariant-load-type-mismatch.ll b/polly/test/CodeGen/loop-invariant-load-type-mismatch.ll
index f43247b3e5057..dc0c5517d7ca5 100644
--- a/polly/test/CodeGen/loop-invariant-load-type-mismatch.ll
+++ b/polly/test/CodeGen/loop-invariant-load-type-mismatch.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -passes=polly-codegen < %s
+; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' < %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
diff --git a/polly/test/CodeGen/loop_with_condition.ll b/polly/test/CodeGen/loop_with_condition.ll
index 49e312404cca8..cf28a4de63f3b 100644
--- a/polly/test/CodeGen/loop_with_condition.ll
+++ b/polly/test/CodeGen/loop_with_condition.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s
 
 ;#include <string.h>
 ;#define N 1024
diff --git a/polly/test/CodeGen/loop_with_condition_2.ll b/polly/test/CodeGen/loop_with_condition_2.ll
index 8ae38eeeb4982..1d8a8132a79cb 100644
--- a/polly/test/CodeGen/loop_with_condition_2.ll
+++ b/polly/test/CodeGen/loop_with_condition_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=polly-custom<ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
 
 ; Verify that we actually detect this loop as the innermost loop even though
 ; there is a conditional inside.
diff --git a/polly/test/CodeGen/loop_with_condition_ineq.ll b/polly/test/CodeGen/loop_with_condition_ineq.ll
index 64019a6090212..c222f67ed7836 100644
--- a/polly/test/CodeGen/loop_with_condition_ineq.ll
+++ b/polly/test/CodeGen/loop_with_condition_ineq.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s
 
 ;#include <string.h>
 ;#define N 1024
diff --git a/polly/test/CodeGen/loop_with_condition_nested.ll b/polly/test/CodeGen/loop_with_condition_nested.ll
index 5dcb51dcb91cd..32256a7344664 100644
--- a/polly/test/CodeGen/loop_with_condition_nested.ll
+++ b/polly/test/CodeGen/loop_with_condition_nested.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -passes=polly-codegen < %s | opt -passes='print<loops>' -disable-output 2>&1 | FileCheck %s -check-prefix=LOOPS
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=polly<no-default-opts>' < %s | opt -passes='print<loops>' -disable-output 2>&1 | FileCheck %s -check-prefix=LOOPS
 
 
 ;#include <string.h>
diff --git a/polly/test/CodeGen/loop_with_conditional_entry_edge_split_hard_case.ll b/polly/test/CodeGen/loop_with_conditional_entry_edge_split_hard_case.ll
index 26fe4eb82ae49..5d7f67f1f9060 100644
--- a/polly/test/CodeGen/loop_with_conditional_entry_edge_split_hard_case.ll
+++ b/polly/test/CodeGen/loop_with_conditional_entry_edge_split_hard_case.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s
 ;
 ; Test case to trigger the hard way of creating a unique entering
 ; edge for the SCoP. It is triggered because the entering edge
diff --git a/polly/test/CodeGen/memcpy_annotations.ll b/polly/test/CodeGen/memcpy_annotations.ll
index 501aa8fbea4d6..c3ffe4abcddd6 100644
--- a/polly/test/CodeGen/memcpy_annotations.ll
+++ b/polly/test/CodeGen/memcpy_annotations.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s
 ;
 ; Verify that @llvm.memcpy does not get a !alias.scope annotation.
 ; @llvm.memcpy takes two pointers, it is ambiguous to which the
diff --git a/polly/test/CodeGen/multidim-non-matching-typesize-2.ll b/polly/test/CodeGen/multidim-non-matching-typesize-2.ll
index f63eb18118e77..b084672971855 100644
--- a/polly/test/CodeGen/multidim-non-matching-typesize-2.ll
+++ b/polly/test/CodeGen/multidim-non-matching-typesize-2.ll
@@ -1,5 +1,4 @@
-; RUN: opt %loadNPMPolly -disable-basic-aa -passes=polly-codegen \
-; RUN: -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly --aa-pipeline= '-passes=polly<no-default-opts>' -S < %s | FileCheck %s
 
 ; CHECK: polly
 target datalayout = "e-m:o-p:32:32-f64:32:64-f80:128-n8:16:32-S128"
diff --git a/polly/test/CodeGen/multidim-non-matching-typesize.ll b/polly/test/CodeGen/multidim-non-matching-typesize.ll
index 63e43c83ada5f..66a4fdf42bc8e 100644
--- a/polly/test/CodeGen/multidim-non-matching-typesize.ll
+++ b/polly/test/CodeGen/multidim-non-matching-typesize.ll
@@ -1,5 +1,4 @@
-; RUN: opt %loadNPMPolly -disable-basic-aa -passes=polly-codegen \
-; RUN: -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly --aa-pipeline= '-passes=polly<no-default-opts>' -S < %s | FileCheck %s
 
 target datalayout = "e-m:o-p:32:32-f64:32:64-f80:128-n8:16:32-S128"
 
diff --git a/polly/test/CodeGen/multidim_2d_parametric_array_static_loop_bounds.ll b/polly/test/CodeGen/multidim_2d_parametric_array_static_loop_bounds.ll
index 86b17573caada..d3f8b718889e4 100644
--- a/polly/test/CodeGen/multidim_2d_parametric_array_static_loop_bounds.ll
+++ b/polly/test/CodeGen/multidim_2d_parametric_array_static_loop_bounds.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 
 ; Derived from the following code:
diff --git a/polly/test/CodeGen/multidim_alias_check.ll b/polly/test/CodeGen/multidim_alias_check.ll
index 93e34e2fd0fc1..e85d7c9e7785d 100644
--- a/polly/test/CodeGen/multidim_alias_check.ll
+++ b/polly/test/CodeGen/multidim_alias_check.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -passes=polly-codegen < %s
+; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' < %s
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
 ; CHECK: %polly.access.sext.A = sext i32 %n to i64
diff --git a/polly/test/CodeGen/multiple-codegens.ll b/polly/test/CodeGen/multiple-codegens.ll
index a63f8a615ff9e..cb12700bfb561 100644
--- a/polly/test/CodeGen/multiple-codegens.ll
+++ b/polly/test/CodeGen/multiple-codegens.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly "-passes=scop(polly-opt-isl,polly-codegen,polly-codegen)" -S < %s | FileCheck %s
-; RUN: opt %loadNPMPolly "-passes=scop(polly-opt-isl,polly-codegen),scop(polly-codegen)" -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts;opt-isl>,polly<no-default-opts>' -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=function(polly<no-default-opts;opt-isl>),function(polly<no-default-opts>)' -S < %s | FileCheck %s
 ;
 ; llvm.org/PR34441
 ; Properly handle multiple -polly-scops/-polly-codegen in the same
diff --git a/polly/test/CodeGen/multiple-scops-in-a-row.ll b/polly/test/CodeGen/multiple-scops-in-a-row.ll
index effae223c152a..b92359782d999 100644
--- a/polly/test/CodeGen/multiple-scops-in-a-row.ll
+++ b/polly/test/CodeGen/multiple-scops-in-a-row.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -S -passes=polly-codegen < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -S '-passes=polly<no-default-opts>' < %s | FileCheck %s
 
 ; This test case has two scops in a row. When code generating the first scop,
 ; the second scop is invalidated. This test case verifies that we do not crash
diff --git a/polly/test/CodeGen/multiple-types-invariant-load-2.ll b/polly/test/CodeGen/multiple-types-invariant-load-2.ll
index 101fcaff0c82e..96615079be365 100644
--- a/polly/test/CodeGen/multiple-types-invariant-load-2.ll
+++ b/polly/test/CodeGen/multiple-types-invariant-load-2.ll
@@ -1,5 +1,4 @@
-; RUN: opt %loadNPMPolly -passes=polly-codegen -S \
-; RUN: -polly-allow-differing-element-types < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S -polly-allow-differing-element-types < %s | FileCheck %s
 
 ; CHECK: polly
 
diff --git a/polly/test/CodeGen/multiple-types-invariant-load.ll b/polly/test/CodeGen/multiple-types-invariant-load.ll
index 930041eaddaad..ca89cb53e09b7 100644
--- a/polly/test/CodeGen/multiple-types-invariant-load.ll
+++ b/polly/test/CodeGen/multiple-types-invariant-load.ll
@@ -1,5 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-allow-differing-element-types -passes=polly-codegen -S \
-; RUN: -polly-invariant-load-hoisting=true < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-allow-differing-element-types '-passes=polly<no-default-opts>' -S -polly-invariant-load-hoisting=true < %s | FileCheck %s
 
 ; CHECK: %polly.access.global.load = getelementptr i32, ptr %global.load, i64 0
 ; CHECK: %polly.access.global.load.load = load i32, ptr %polly.access.global.load
diff --git a/polly/test/CodeGen/multiple_sai_fro_same_base_address.ll b/polly/test/CodeGen/multiple_sai_fro_same_base_address.ll
index 1e06a7e186bb0..8198108b22059 100644
--- a/polly/test/CodeGen/multiple_sai_fro_same_base_address.ll
+++ b/polly/test/CodeGen/multiple_sai_fro_same_base_address.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly -polly-position=before-vectorizer '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=SCOP
-; RUN: opt %loadNPMPolly -polly-position=before-vectorizer -passes=polly-codegen -S < %s | FileCheck %s --check-prefix=IR
+; RUN: opt %loadNPMPolly -polly-position=before-vectorizer '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s --check-prefix=SCOP
+; RUN: opt %loadNPMPolly -polly-position=before-vectorizer '-passes=polly<no-default-opts>' -S < %s | FileCheck %s --check-prefix=IR
 
 ; The IR has two ScopArrayInfo for the value %next.0. This used to produce two
 ; phi nodes in polly.merge_new_and_old, one illegaly using the result of the
diff --git a/polly/test/CodeGen/no-overflow-tracking.ll b/polly/test/CodeGen/no-overflow-tracking.ll
index d5ad9a7aef239..f915b5a0772e6 100644
--- a/polly/test/CodeGen/no-overflow-tracking.ll
+++ b/polly/test/CodeGen/no-overflow-tracking.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly -polly-invariant-load-hoisting=true '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly -polly-invariant-load-hoisting=true -polly-overflow-tracking=never -passes=polly-codegen -S < %s | FileCheck %s --check-prefix=IR
+; RUN: opt %loadNPMPolly -polly-invariant-load-hoisting=true '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-invariant-load-hoisting=true -polly-overflow-tracking=never '-passes=polly<no-default-opts>' -S < %s | FileCheck %s --check-prefix=IR
 ;
 ; As (p + q) can overflow we have to check that we load from
 ; I[p + q] only if it does not.
diff --git a/polly/test/CodeGen/no_guard_bb.ll b/polly/test/CodeGen/no_guard_bb.ll
index a022083f43a9e..604c5ac54bcdb 100644
--- a/polly/test/CodeGen/no_guard_bb.ll
+++ b/polly/test/CodeGen/no_guard_bb.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -passes=polly-codegen -S -verify-dom-info < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S -verify-dom-info < %s | FileCheck %s
 ;
 ; CHECK-NOT: br i1 true, label %polly.{{.*}}, label %polly.{{.*}}
 ;
diff --git a/polly/test/CodeGen/non-affine-dominance-generated-entering.ll b/polly/test/CodeGen/non-affine-dominance-generated-entering.ll
index 6015516a3bc49..ebb02a90ffb5d 100644
--- a/polly/test/CodeGen/non-affine-dominance-generated-entering.ll
+++ b/polly/test/CodeGen/non-affine-dominance-generated-entering.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s
 ;
 ; llvm.org/PR25439
 ; Scalar reloads in the generated entering block were not recognized as
diff --git a/polly/test/CodeGen/non-affine-exit-node-dominance.ll b/polly/test/CodeGen/non-affine-exit-node-dominance.ll
index 0d0f634ed7c16..ff9f504295672 100644
--- a/polly/test/CodeGen/non-affine-exit-node-dominance.ll
+++ b/polly/test/CodeGen/non-affine-exit-node-dominance.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s
 ;
 ; llvm.org/PR25439
 ; The dominance of the generated non-affine subregion block was based on the
diff --git a/polly/test/CodeGen/non-affine-phi-node-expansion-2.ll b/polly/test/CodeGen/non-affine-phi-node-expansion-2.ll
index bfa3c156ea75d..2ad1e75216362 100644
--- a/polly/test/CodeGen/non-affine-phi-node-expansion-2.ll
+++ b/polly/test/CodeGen/non-affine-phi-node-expansion-2.ll
@@ -1,5 +1,4 @@
-; RUN: opt %loadNPMPolly -passes=polly-codegen \
-; RUN: -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
diff --git a/polly/test/CodeGen/non-affine-phi-node-expansion-3.ll b/polly/test/CodeGen/non-affine-phi-node-expansion-3.ll
index b9386333a79b4..386fe5f9f207f 100644
--- a/polly/test/CodeGen/non-affine-phi-node-expansion-3.ll
+++ b/polly/test/CodeGen/non-affine-phi-node-expansion-3.ll
@@ -1,5 +1,4 @@
-; RUN: opt %loadNPMPolly -passes=polly-codegen \
-; RUN: -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s
 
 define void @foo(ptr %A, i1 %cond0, i1 %cond1) {
 entry:
diff --git a/polly/test/CodeGen/non-affine-phi-node-expansion-4.ll b/polly/test/CodeGen/non-affine-phi-node-expansion-4.ll
index 6460c427270f4..5e5f34d99bde3 100644
--- a/polly/test/CodeGen/non-affine-phi-node-expansion-4.ll
+++ b/polly/test/CodeGen/non-affine-phi-node-expansion-4.ll
@@ -1,5 +1,4 @@
-; RUN: opt %loadNPMPolly -passes=polly-codegen \
-; RUN: -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s
 
 define void @foo(ptr %A, i1 %cond0, i1 %cond1) {
 entry:
diff --git a/polly/test/CodeGen/non-affine-phi-node-expansion.ll b/polly/test/CodeGen/non-affine-phi-node-expansion.ll
index 1b6802f1a4c35..db9f0d518041b 100644
--- a/polly/test/CodeGen/non-affine-phi-node-expansion.ll
+++ b/polly/test/CodeGen/non-affine-phi-node-expansion.ll
@@ -1,5 +1,4 @@
-; RUN: opt %loadNPMPolly -passes=polly-codegen \
-; RUN: -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 %struct.wombat = type {[4 x i32]}
diff --git a/polly/test/CodeGen/non-affine-region-exit-phi-incoming-synthesize-2.ll b/polly/test/CodeGen/non-affine-region-exit-phi-incoming-synthesize-2.ll
index 007a4c586aa32..096eb8609e1bb 100644
--- a/polly/test/CodeGen/non-affine-region-exit-phi-incoming-synthesize-2.ll
+++ b/polly/test/CodeGen/non-affine-region-exit-phi-incoming-synthesize-2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s
 ;
 ; This caused the code generation to generate invalid code as the same operand
 ; of the PHI node in the non-affine region was synthesized at the wrong place.
diff --git a/polly/test/CodeGen/non-affine-region-exit-phi-incoming-synthesize.ll b/polly/test/CodeGen/non-affine-region-exit-phi-incoming-synthesize.ll
index 20edbf2bd6c03..2810a8ab5361f 100644
--- a/polly/test/CodeGen/non-affine-region-exit-phi-incoming-synthesize.ll
+++ b/polly/test/CodeGen/non-affine-region-exit-phi-incoming-synthesize.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s
 ;
 ; This caused the code generation to generate invalid code as the same BBMap was
 ; used for the whole non-affine region. When %add is synthesized for the
diff --git a/polly/test/CodeGen/non-affine-region-implicit-store.ll b/polly/test/CodeGen/non-affine-region-implicit-store.ll
index 0ff39d3fe882d..cdb2000d90d6b 100644
--- a/polly/test/CodeGen/non-affine-region-implicit-store.ll
+++ b/polly/test/CodeGen/non-affine-region-implicit-store.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s
 ;
 ; llvm.org/PR25438
 ; After loop versioning, a dominance check of a non-affine subregion's exit node
diff --git a/polly/test/CodeGen/non-affine-region-phi-references-in-scop-value.ll b/polly/test/CodeGen/non-affine-region-phi-references-in-scop-value.ll
index 7df3d8976ea80..b4889c76079cc 100644
--- a/polly/test/CodeGen/non-affine-region-phi-references-in-scop-value.ll
+++ b/polly/test/CodeGen/non-affine-region-phi-references-in-scop-value.ll
@@ -1,5 +1,4 @@
-; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-allow-nonaffine-loops \
-; RUN: -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-allow-nonaffine-loops -S < %s | FileCheck %s
 
 ; This test verifies that values defined in another scop statement and used by
 ; PHI-nodes in non-affine regions are code generated correctly.
diff --git a/polly/test/CodeGen/non-affine-subregion-dominance-reuse.ll b/polly/test/CodeGen/non-affine-subregion-dominance-reuse.ll
index 179062dd62d0a..45465c627f55a 100644
--- a/polly/test/CodeGen/non-affine-subregion-dominance-reuse.ll
+++ b/polly/test/CodeGen/non-affine-subregion-dominance-reuse.ll
@@ -1,5 +1,4 @@
-; RUN: opt %loadNPMPolly -passes=polly-codegen -S -verify-dom-info \
-; RUN: < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S -verify-dom-info < %s | FileCheck %s
 ;
 ; Check that we do not reuse the B[i-1] GEP created in block S again in
 ; block Q. Hence, we create two GEPs for B[i-1]:
diff --git a/polly/test/CodeGen/non-affine-switch.ll b/polly/test/CodeGen/non-affine-switch.ll
index 427e7e2461f1d..90d5efdc3a9f5 100644
--- a/polly/test/CodeGen/non-affine-switch.ll
+++ b/polly/test/CodeGen/non-affine-switch.ll
@@ -1,5 +1,4 @@
-; RUN: opt %loadNPMPolly \
-; RUN: -S -passes=polly-codegen < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -S '-passes=polly<no-default-opts>' < %s | FileCheck %s
 ;
 ; void f(int *A, int N) {
 ; for (int i = 0; i < N; i++)
diff --git a/polly/test/CodeGen/non-affine-synthesized-in-branch.ll b/polly/test/CodeGen/non-affine-synthesized-in-branch.ll
index 292c0f2b53941..5bb4fd19f4fd1 100644
--- a/polly/test/CodeGen/non-affine-synthesized-in-branch.ll
+++ b/polly/test/CodeGen/non-affine-synthesized-in-branch.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-process-unprofitable -passes=polly-codegen -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-process-unprofitable '-passes=polly<no-default-opts>' -S < %s | FileCheck %s
 ;
 ; llvm.org/PR25412
 ; %synthgep caused %gep to be synthesized in subregion_if which was reused for
diff --git a/polly/test/CodeGen/non-affine-update.ll b/polly/test/CodeGen/non-affine-update.ll
index 03f091a405017..582607787eb7d 100644
--- a/polly/test/CodeGen/non-affine-update.ll
+++ b/polly/test/CodeGen/non-affine-update.ll
@@ -1,5 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,polly-codegen' \
-; RUN: -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;codegen>' -S < %s | FileCheck %s
 ;
 ; void non-affine-update(double A[], double C[], double B[]) {
 ; for (int i = 0; i < 10; i++) {
diff --git a/polly/test/CodeGen/non-hoisted-load-needed-as-base-ptr.ll b/polly/test/CodeGen/non-hoisted-load-needed-as-base-ptr.ll
index 153cdb7ed9f6c..eaf74d9c63e0e 100644
--- a/polly/test/CodeGen/non-hoisted-load-needed-as-base-ptr.ll
+++ b/polly/test/CodeGen/non-hoisted-load-needed-as-base-ptr.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -aa-pipeline=tbaa -passes=polly-codegen -disable-output %s
+; RUN: opt %loadNPMPolly -aa-pipeline=tbaa '-passes=polly<no-default-opts>' -disable-output %s
 ;
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
diff --git a/polly/test/CodeGen/non_affine_float_compare.ll b/polly/test/CodeGen/non_affine_float_compare.ll
index a359b662e6579..9709e231a4e86 100644
--- a/polly/test/CodeGen/non_affine_float_compare.ll
+++ b/polly/test/CodeGen/non_affine_float_compare.ll
@@ -1,6 +1,4 @@
-; RUN: opt %loadNPMPolly -passes=polly-codegen \
-; RUN: -polly-allow-nonaffine-branches -S -verify-dom-info \
-; RUN: < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-allow-nonaffine-branches -S -verify-dom-info < %s | FileCheck %s
 ;
 ; void f(float *A) {
 ; for (int i = 0; i < 1024; i++)
diff --git a/polly/test/CodeGen/only_non_affine_error_region.ll b/polly/test/CodeGen/only_non_affine_error_region.ll
index 445cef0d6f697..be7a8a23df869 100644
--- a/polly/test/CodeGen/only_non_affine_error_region.ll
+++ b/polly/test/CodeGen/only_non_affine_error_region.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s
 ;
 ; CHECK-NOT: polly.start
 ;
diff --git a/polly/test/CodeGen/openmp_limit_threads.ll b/polly/test/CodeGen/openmp_limit_threads.ll
index 4c33be3407251..730c57299d569 100644
--- a/polly/test/CodeGen/openmp_limit_threads.ll
+++ b/polly/test/CodeGen/openmp_limit_threads.ll
@@ -1,10 +1,10 @@
-; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-parallel -S < %s | FileCheck %s --check-prefix=AUTO
-; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-parallel -polly-num-threads=1 -S < %s | FileCheck %s --check-prefix=ONE
-; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-parallel -polly-num-threads=4 -S < %s | FileCheck %s --check-prefix=FOUR
+; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-parallel -S < %s | FileCheck %s --check-prefix=AUTO
+; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-parallel -polly-num-threads=1 -S < %s | FileCheck %s --check-prefix=ONE
+; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-parallel -polly-num-threads=4 -S < %s | FileCheck %s --check-prefix=FOUR
 
-; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-parallel -polly-omp-backend=LLVM -S < %s | FileCheck %s --check-prefix=LIBOMP-AUTO
-; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-parallel -polly-omp-backend=LLVM -polly-num-threads=1 -S < %s | FileCheck %s --check-prefix=LIBOMP-ONE
-; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-parallel -polly-omp-backend=LLVM -polly-num-threads=4 -S < %s | FileCheck %s --check-prefix=LIBOMP-FOUR
+; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-parallel -polly-omp-backend=LLVM -S < %s | FileCheck %s --check-prefix=LIBOMP-AUTO
+; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-parallel -polly-omp-backend=LLVM -polly-num-threads=1 -S < %s | FileCheck %s --check-prefix=LIBOMP-ONE
+; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-parallel -polly-omp-backend=LLVM -polly-num-threads=4 -S < %s | FileCheck %s --check-prefix=LIBOMP-FOUR
 
 ; Ensure that the provided thread numbers are forwarded to the OpenMP calls.
 ;
diff --git a/polly/test/CodeGen/out-of-scop-phi-node-use.ll b/polly/test/CodeGen/out-of-scop-phi-node-use.ll
index dd0a24b14a3b8..8d5f74751af49 100644
--- a/polly/test/CodeGen/out-of-scop-phi-node-use.ll
+++ b/polly/test/CodeGen/out-of-scop-phi-node-use.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
diff --git a/polly/test/CodeGen/param_div_div_div_2.ll b/polly/test/CodeGen/param_div_div_div_2.ll
index 8eba6444abb16..3ae95020d52dd 100644
--- a/polly/test/CodeGen/param_div_div_div_2.ll
+++ b/polly/test/CodeGen/param_div_div_div_2.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s --check-prefix=IR
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s --check-prefix=IR
 ;
 ; Check that we guard the divisions because we moved them and thereby increased
 ; their domain.
diff --git a/polly/test/CodeGen/partial_write_array.ll b/polly/test/CodeGen/partial_write_array.ll index fad4b21cf3dc8..fe5fd8cffece7 100644 --- a/polly/test/CodeGen/partial_write_array.ll +++ b/polly/test/CodeGen/partial_write_array.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,polly-codegen' -polly-import-jscop-postfix=transformed -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;codegen>' -polly-import-jscop-postfix=transformed -S < %s | FileCheck %s ; ; Partial write of an array access. ; diff --git a/polly/test/CodeGen/partial_write_emptyset.ll b/polly/test/CodeGen/partial_write_emptyset.ll index 67828808e2fac..d0e5615e4220d 100644 --- a/polly/test/CodeGen/partial_write_emptyset.ll +++ b/polly/test/CodeGen/partial_write_emptyset.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,polly-codegen' -polly-import-jscop-postfix=transformed -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;codegen>' -polly-import-jscop-postfix=transformed -S < %s | FileCheck %s ; ; Partial write, where "partial" is the empty set. ; The store is never executed in this case and we do generate it in the diff --git a/polly/test/CodeGen/partial_write_full_write_that_appears_partial.ll b/polly/test/CodeGen/partial_write_full_write_that_appears_partial.ll index b26bd81b5663b..a36414297485a 100644 --- a/polly/test/CodeGen/partial_write_full_write_that_appears_partial.ll +++ b/polly/test/CodeGen/partial_write_full_write_that_appears_partial.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s ; CHECK: polly.stmt.if.then81: ; preds = %polly.stmt.if.end75 ; CHECK-NEXT: store float undef, ptr %fX64, align 4, !alias.scope !0, !noalias !3 diff --git a/polly/test/CodeGen/partial_write_impossible_restriction.ll b/polly/test/CodeGen/partial_write_impossible_restriction.ll index 7577b137a2750..e0069ebc8eae8 100644 --- a/polly/test/CodeGen/partial_write_impossible_restriction.ll +++ b/polly/test/CodeGen/partial_write_impossible_restriction.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-import-jscop,polly-codegen' -polly-import-jscop-postfix=transformed -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<import-jscop;codegen>' -polly-import-jscop-postfix=transformed -S < %s | FileCheck %s ; ; The isl scheduler isolates %cond.false into two instances. 
; A partial write access in one of the instances was never executed, diff --git a/polly/test/CodeGen/partial_write_in_region.ll b/polly/test/CodeGen/partial_write_in_region.ll index 7c138c82091e5..e7f4225cf9310 100644 --- a/polly/test/CodeGen/partial_write_in_region.ll +++ b/polly/test/CodeGen/partial_write_in_region.ll @@ -1,7 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,polly-codegen' \ -; RUN: -polly-import-jscop-postfix=transformed \ -; RUN: -verify-dom-info \ -; RUN: -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;codegen>' -polly-import-jscop-postfix=transformed -verify-dom-info -S < %s | FileCheck %s ; ; void foo(long A[], float B[], float C[]) { ; for (long i = 0; i < 1024; i++) { diff --git a/polly/test/CodeGen/partial_write_in_region_with_loop.ll b/polly/test/CodeGen/partial_write_in_region_with_loop.ll index ba15a7871f431..85b56fefad809 100644 --- a/polly/test/CodeGen/partial_write_in_region_with_loop.ll +++ b/polly/test/CodeGen/partial_write_in_region_with_loop.ll @@ -1,7 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,polly-codegen' \ -; RUN: -polly-import-jscop-postfix=transformed \ -; RUN: -verify-dom-info -polly-allow-nonaffine-loops \ -; RUN: -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;codegen>' -polly-import-jscop-postfix=transformed -verify-dom-info -polly-allow-nonaffine-loops -S < %s | FileCheck %s ; This test verifies that partial writes within non-affine loops are code ; generated correctly. diff --git a/polly/test/CodeGen/partial_write_mapped_scalar.ll b/polly/test/CodeGen/partial_write_mapped_scalar.ll index b8c413885cdb0..bb99d4ea086d2 100644 --- a/polly/test/CodeGen/partial_write_mapped_scalar.ll +++ b/polly/test/CodeGen/partial_write_mapped_scalar.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-import-jscop,polly-codegen' -polly-import-jscop-postfix=transformed -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<import-jscop;codegen>' -polly-import-jscop-postfix=transformed -S < %s | FileCheck %s ; ; Partial write of a (mapped) scalar. ; diff --git a/polly/test/CodeGen/partial_write_mapped_scalar_subregion.ll b/polly/test/CodeGen/partial_write_mapped_scalar_subregion.ll index 8c1953a05ad3c..37a9d98c6a22e 100644 --- a/polly/test/CodeGen/partial_write_mapped_scalar_subregion.ll +++ b/polly/test/CodeGen/partial_write_mapped_scalar_subregion.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,polly-codegen' -polly-import-jscop-postfix=transformed -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;codegen>' -polly-import-jscop-postfix=transformed -S < %s | FileCheck %s ; ; Partial write of a (mapped) scalar in a non-affine subregion. 
; diff --git a/polly/test/CodeGen/perf_monitoring.ll b/polly/test/CodeGen/perf_monitoring.ll index 4b91e5055c0b1..61f122228c377 100644 --- a/polly/test/CodeGen/perf_monitoring.ll +++ b/polly/test/CodeGen/perf_monitoring.ll @@ -1,5 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-codegen-perf-monitoring \ -; RUN: -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-codegen-perf-monitoring -S < %s | FileCheck %s ; void f(long A[], long N) { ; long i; diff --git a/polly/test/CodeGen/perf_monitoring_cycles_per_scop.ll b/polly/test/CodeGen/perf_monitoring_cycles_per_scop.ll index d5c33d64f3418..4c47a12c12904 100644 --- a/polly/test/CodeGen/perf_monitoring_cycles_per_scop.ll +++ b/polly/test/CodeGen/perf_monitoring_cycles_per_scop.ll @@ -1,5 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-codegen-perf-monitoring \ -; RUN: -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-codegen-perf-monitoring -S < %s | FileCheck %s ; void f(long A[], long N) { ; long i; diff --git a/polly/test/CodeGen/perf_monitoring_trip_counts_per_scop.ll b/polly/test/CodeGen/perf_monitoring_trip_counts_per_scop.ll index ab99c4d2de062..6d09d8bf27ebe 100644 --- a/polly/test/CodeGen/perf_monitoring_trip_counts_per_scop.ll +++ b/polly/test/CodeGen/perf_monitoring_trip_counts_per_scop.ll @@ -1,5 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-codegen-perf-monitoring \ -; RUN: -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-codegen-perf-monitoring -S < %s | FileCheck %s ; void f(long A[], long N) { ; long i; diff --git a/polly/test/CodeGen/phi-defined-before-scop.ll b/polly/test/CodeGen/phi-defined-before-scop.ll index 447a14e9999c2..2ccd7965bbeaf 100644 --- a/polly/test/CodeGen/phi-defined-before-scop.ll +++ b/polly/test/CodeGen/phi-defined-before-scop.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s ; CHECK-LABEL: polly.merge_new_and_old: ; CHECK-NEXT: %tmp7.ph.merge = phi ptr [ %tmp7.ph.final_reload, %polly.exiting ], [ %tmp7.ph, %bb6.region_exiting ] diff --git a/polly/test/CodeGen/phi_after_error_block_outside_of_scop.ll b/polly/test/CodeGen/phi_after_error_block_outside_of_scop.ll index e096aa2f4f8c0..1655104b08390 100644 --- a/polly/test/CodeGen/phi_after_error_block_outside_of_scop.ll +++ b/polly/test/CodeGen/phi_after_error_block_outside_of_scop.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s ; Make sure code generation does not break in case an 'error block' is detected ; outside of the scope. In this situation, we should not affect code generation. 
diff --git a/polly/test/CodeGen/phi_condition_modeling_1.ll b/polly/test/CodeGen/phi_condition_modeling_1.ll index 9d73d8a792558..1cadac0a5cf73 100644 --- a/polly/test/CodeGen/phi_condition_modeling_1.ll +++ b/polly/test/CodeGen/phi_condition_modeling_1.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -S -passes=polly-codegen < %s | FileCheck %s +; RUN: opt %loadNPMPolly -S '-passes=polly<no-default-opts>' < %s | FileCheck %s ; ; void f(int *A, int c, int N) { ; int tmp; diff --git a/polly/test/CodeGen/phi_condition_modeling_2.ll b/polly/test/CodeGen/phi_condition_modeling_2.ll index 2d1364842d735..8f2e2a517c96c 100644 --- a/polly/test/CodeGen/phi_condition_modeling_2.ll +++ b/polly/test/CodeGen/phi_condition_modeling_2.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -S -passes=polly-codegen < %s | FileCheck %s +; RUN: opt %loadNPMPolly -S '-passes=polly<no-default-opts>' < %s | FileCheck %s ; ; void f(int *A, int c, int N) { ; int tmp; diff --git a/polly/test/CodeGen/phi_conditional_simple_1.ll b/polly/test/CodeGen/phi_conditional_simple_1.ll index 25bcf2a118ef4..5f0f8de19f223 100644 --- a/polly/test/CodeGen/phi_conditional_simple_1.ll +++ b/polly/test/CodeGen/phi_conditional_simple_1.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s --check-prefix=AST -; RUN: opt %loadNPMPolly -S -passes=polly-codegen < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s --check-prefix=AST +; RUN: opt %loadNPMPolly -S '-passes=polly<no-default-opts>' < %s | FileCheck %s ; ; void jd(int *A, int c) { ; for (int i = 0; i < 1024; i++) { diff --git a/polly/test/CodeGen/phi_in_exit_early_lnt_failure_1.ll b/polly/test/CodeGen/phi_in_exit_early_lnt_failure_1.ll index 43d29b9ec8649..703e55f15c084 100644 --- a/polly/test/CodeGen/phi_in_exit_early_lnt_failure_1.ll +++ b/polly/test/CodeGen/phi_in_exit_early_lnt_failure_1.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s ; ; This caused an lnt crash at some point, just verify it will run through. ; diff --git a/polly/test/CodeGen/phi_in_exit_early_lnt_failure_2.ll b/polly/test/CodeGen/phi_in_exit_early_lnt_failure_2.ll index 9f28024fcfa0a..3d911e0d6a87f 100644 --- a/polly/test/CodeGen/phi_in_exit_early_lnt_failure_2.ll +++ b/polly/test/CodeGen/phi_in_exit_early_lnt_failure_2.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s ; ; This caused an lnt crash at some point, just verify it will run through and ; produce the PHI node in the exit we are looking for. diff --git a/polly/test/CodeGen/phi_in_exit_early_lnt_failure_3.ll b/polly/test/CodeGen/phi_in_exit_early_lnt_failure_3.ll index 73e99ac0f32c5..5f81f52078723 100644 --- a/polly/test/CodeGen/phi_in_exit_early_lnt_failure_3.ll +++ b/polly/test/CodeGen/phi_in_exit_early_lnt_failure_3.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s ; ; This caused an lnt crash at some point, just verify it will run through and ; produce the PHI node in the exit we are looking for. 
diff --git a/polly/test/CodeGen/phi_in_exit_early_lnt_failure_5.ll b/polly/test/CodeGen/phi_in_exit_early_lnt_failure_5.ll index 6c9bd56a98722..abb86e650ce2a 100644 --- a/polly/test/CodeGen/phi_in_exit_early_lnt_failure_5.ll +++ b/polly/test/CodeGen/phi_in_exit_early_lnt_failure_5.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s ; ; This caused an lnt crash at some point, just verify it will run through and ; produce the PHI node in the exit we are looking for. diff --git a/polly/test/CodeGen/phi_loop_carried_float.ll b/polly/test/CodeGen/phi_loop_carried_float.ll index 4cb392d3353d3..47a8a8190c8d9 100644 --- a/polly/test/CodeGen/phi_loop_carried_float.ll +++ b/polly/test/CodeGen/phi_loop_carried_float.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -S -passes=polly-codegen < %s | FileCheck %s +; RUN: opt %loadNPMPolly -S '-passes=polly<no-default-opts>' < %s | FileCheck %s ; ; float f(float *A, int N) { ; float tmp = 0; diff --git a/polly/test/CodeGen/phi_loop_carried_float_escape.ll b/polly/test/CodeGen/phi_loop_carried_float_escape.ll index 9fd8ad413128a..81dd5cecd1878 100644 --- a/polly/test/CodeGen/phi_loop_carried_float_escape.ll +++ b/polly/test/CodeGen/phi_loop_carried_float_escape.ll @@ -1,8 +1,6 @@ -; RUN: opt %loadNPMPolly -S \ -; RUN: -polly-analyze-read-only-scalars=false -passes=polly-codegen < %s | FileCheck %s +; RUN: opt %loadNPMPolly -S -polly-analyze-read-only-scalars=false '-passes=polly<no-default-opts>' < %s | FileCheck %s -; RUN: opt %loadNPMPolly -S \ -; RUN: -polly-analyze-read-only-scalars=true -passes=polly-codegen < %s | FileCheck %s +; RUN: opt %loadNPMPolly -S -polly-analyze-read-only-scalars=true '-passes=polly<no-default-opts>' < %s | FileCheck %s ; ; float f(float *A, int N) { ; float tmp = 0; diff --git a/polly/test/CodeGen/phi_scalar_simple_1.ll b/polly/test/CodeGen/phi_scalar_simple_1.ll index 80a1c41b83ac0..6331c24da31b0 100644 --- a/polly/test/CodeGen/phi_scalar_simple_1.ll +++ b/polly/test/CodeGen/phi_scalar_simple_1.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -S -passes=polly-codegen < %s | FileCheck %s +; RUN: opt %loadNPMPolly -S '-passes=polly<no-default-opts>' < %s | FileCheck %s ; ; int jd(int *restrict A, int x, int N) { ; for (int i = 1; i < N; i++) diff --git a/polly/test/CodeGen/phi_scalar_simple_2.ll b/polly/test/CodeGen/phi_scalar_simple_2.ll index 614c8acfb9f8e..0adadf6b90159 100644 --- a/polly/test/CodeGen/phi_scalar_simple_2.ll +++ b/polly/test/CodeGen/phi_scalar_simple_2.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -S -passes=polly-codegen < %s | FileCheck %s +; RUN: opt %loadNPMPolly -S '-passes=polly<no-default-opts>' < %s | FileCheck %s ; ; int jd(int *restrict A, int x, int N, int c) { ; for (int i = 0; i < N; i++) diff --git a/polly/test/CodeGen/phi_with_multi_exiting_edges_2.ll b/polly/test/CodeGen/phi_with_multi_exiting_edges_2.ll index 7e21666f1db00..4d6ede638c8f2 100644 --- a/polly/test/CodeGen/phi_with_multi_exiting_edges_2.ll +++ b/polly/test/CodeGen/phi_with_multi_exiting_edges_2.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s ; ; CHECK: polly.merge_new_and_old: ; CHECK: %result.ph.merge = phi float [ %result.ph.final_reload, %polly.exiting ], [ %result.ph, %next.region_exiting ] diff --git a/polly/test/CodeGen/phi_with_one_exit_edge.ll b/polly/test/CodeGen/phi_with_one_exit_edge.ll 
index 36a8684dbc37a..4de24fb058c26 100644 --- a/polly/test/CodeGen/phi_with_one_exit_edge.ll +++ b/polly/test/CodeGen/phi_with_one_exit_edge.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s ; ; ; CHECK: polly.merge_new_and_old: diff --git a/polly/test/CodeGen/pointer-type-expressions-2.ll b/polly/test/CodeGen/pointer-type-expressions-2.ll index 918e4c6c9c0b0..706b01d7f8ca5 100644 --- a/polly/test/CodeGen/pointer-type-expressions-2.ll +++ b/polly/test/CodeGen/pointer-type-expressions-2.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s -; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s -check-prefix=CODEGEN +; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s -check-prefix=CODEGEN target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" define void @foo(ptr %start, ptr %end) { diff --git a/polly/test/CodeGen/pointer-type-expressions.ll b/polly/test/CodeGen/pointer-type-expressions.ll index e7feebc163d4b..2478e2238fd0e 100644 --- a/polly/test/CodeGen/pointer-type-expressions.ll +++ b/polly/test/CodeGen/pointer-type-expressions.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s -; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s -check-prefix=CODEGEN +; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s -check-prefix=CODEGEN ; void f(int a[], int N, float *P) { ; int i; diff --git a/polly/test/CodeGen/pointer-type-pointer-type-comparison.ll b/polly/test/CodeGen/pointer-type-pointer-type-comparison.ll index 9ee050a1e5070..cac6f4fdd16f1 100644 --- a/polly/test/CodeGen/pointer-type-pointer-type-comparison.ll +++ b/polly/test/CodeGen/pointer-type-pointer-type-comparison.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s -; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s -check-prefix=CODEGEN +; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s -check-prefix=CODEGEN ; ; void f(int a[], int N, float *P, float *Q) { diff --git a/polly/test/CodeGen/pointer_rem.ll b/polly/test/CodeGen/pointer_rem.ll index b8202318a3eca..ca5d866ae6cce 100644 --- a/polly/test/CodeGen/pointer_rem.ll +++ b/polly/test/CodeGen/pointer_rem.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly -polly-process-unprofitable '-passes=print<polly-function-scops>,scop(print<polly-ast>)' -disable-output -S < %s | FileCheck %s --check-prefix=AST -; RUN: opt %loadNPMPolly -polly-process-unprofitable '-passes=print<polly-function-scops>,scop(polly-codegen)' -S < %s | FileCheck %s --check-prefix=CODEGEN +; RUN: opt %loadNPMPolly -polly-process-unprofitable '-passes=polly-custom<ast>' -polly-print-scops -polly-print-ast -disable-output -S < %s | FileCheck %s --check-prefix=AST +; RUN: opt %loadNPMPolly -polly-process-unprofitable '-passes=polly-custom<codegen>' -polly-print-scops -S < %s | FileCheck %s --check-prefix=CODEGEN target datalayout = "e-m:e-i64:64-i128:128-n8:16:32:64-S128" target triple 
= "aarch64--linux-gnu" diff --git a/polly/test/CodeGen/pr25241.ll b/polly/test/CodeGen/pr25241.ll index 7547b0bbed749..94be6d7824921 100644 --- a/polly/test/CodeGen/pr25241.ll +++ b/polly/test/CodeGen/pr25241.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s ; PR25241 (https://llvm.org/bugs/show_bug.cgi?id=25241) ; Ensure that synthesized values of a PHI node argument are generated in the diff --git a/polly/test/CodeGen/ptrtoint_as_parameter.ll b/polly/test/CodeGen/ptrtoint_as_parameter.ll index a551d810c0802..49a8c38309eb2 100644 --- a/polly/test/CodeGen/ptrtoint_as_parameter.ll +++ b/polly/test/CodeGen/ptrtoint_as_parameter.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s ; ; CHECK: if.then260: ; CHECK-NEXT: %p.4 = getelementptr inbounds i8, ptr null, i64 1 diff --git a/polly/test/CodeGen/read-only-scalars.ll b/polly/test/CodeGen/read-only-scalars.ll index 365cbbce495fb..2ae0f9e797bd1 100644 --- a/polly/test/CodeGen/read-only-scalars.ll +++ b/polly/test/CodeGen/read-only-scalars.ll @@ -1,9 +1,5 @@ -; RUN: opt %loadNPMPolly -polly-analyze-read-only-scalars=false -passes=polly-codegen \ -; RUN: \ -; RUN: -S < %s | FileCheck %s -; RUN: opt %loadNPMPolly -polly-analyze-read-only-scalars=true -passes=polly-codegen \ -; RUN: \ -; RUN: -S < %s | FileCheck %s -check-prefix=SCALAR +; RUN: opt %loadNPMPolly -polly-analyze-read-only-scalars=false '-passes=polly<no-default-opts>' -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly -polly-analyze-read-only-scalars=true '-passes=polly<no-default-opts>' -S < %s | FileCheck %s -check-prefix=SCALAR ; CHECK-NOT: alloca diff --git a/polly/test/CodeGen/reduction.ll b/polly/test/CodeGen/reduction.ll index 8c5f70770a1c5..21d8c0f98b702 100644 --- a/polly/test/CodeGen/reduction.ll +++ b/polly/test/CodeGen/reduction.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s 2>&1 | not FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s 2>&1 | not FileCheck %s ;#include <string.h> ;#include <stdio.h> diff --git a/polly/test/CodeGen/reduction_2.ll b/polly/test/CodeGen/reduction_2.ll index 060a1866870e4..f9576826b4f77 100644 --- a/polly/test/CodeGen/reduction_2.ll +++ b/polly/test/CodeGen/reduction_2.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-invariant-load-hoisting=true '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s --allow-empty +; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-invariant-load-hoisting=true '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s --allow-empty ;#include <string.h> ;#include <stdio.h> diff --git a/polly/test/CodeGen/reduction_simple_binary.ll b/polly/test/CodeGen/reduction_simple_binary.ll index 0fe1085dbbacd..53cbdf407c954 100644 --- a/polly/test/CodeGen/reduction_simple_binary.ll +++ b/polly/test/CodeGen/reduction_simple_binary.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s ; ; CHECK: pragma simd reduction ; diff --git a/polly/test/CodeGen/reggen_domtree_crash.ll b/polly/test/CodeGen/reggen_domtree_crash.ll index 58c27091a22c3..9d5ba4c4ff9fb 100644 --- 
a/polly/test/CodeGen/reggen_domtree_crash.ll +++ b/polly/test/CodeGen/reggen_domtree_crash.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-parallel -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-parallel -S < %s | FileCheck %s ; CHECK: define ptr @ham(ptr %arg, i64 %arg1, i1 %arg2) diff --git a/polly/test/CodeGen/region-with-instructions.ll b/polly/test/CodeGen/region-with-instructions.ll index e5f7d0f9ef5d6..f061ac061e226 100644 --- a/polly/test/CodeGen/region-with-instructions.ll +++ b/polly/test/CodeGen/region-with-instructions.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s ; CHECK-LABEL: polly.stmt.bb48: ; CHECK-NEXT: %[[offset:.*]] = shl i64 %polly.indvar, 3 diff --git a/polly/test/CodeGen/region_exiting-domtree.ll b/polly/test/CodeGen/region_exiting-domtree.ll index 06e0d9df3d951..16b265c064790 100644 --- a/polly/test/CodeGen/region_exiting-domtree.ll +++ b/polly/test/CodeGen/region_exiting-domtree.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-codegen -verify-dom-info -disable-output < %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -verify-dom-info -disable-output < %s ; Verify that the DominatorTree is preserved correctly for the inserted ; %polly.stmt.exit.exit block, which serves as new exit block for the generated diff --git a/polly/test/CodeGen/region_multiexit_partialwrite.ll b/polly/test/CodeGen/region_multiexit_partialwrite.ll index 39e04dbf93ac7..9d21d16c9f9cd 100644 --- a/polly/test/CodeGen/region_multiexit_partialwrite.ll +++ b/polly/test/CodeGen/region_multiexit_partialwrite.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,polly-codegen' -polly-import-jscop-postfix=transformed -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;codegen>' -polly-import-jscop-postfix=transformed -S < %s | FileCheck %s ; ; This test case has a partial write of PHI in a region-statement. It ; requires that the new PHINode from the region's exiting block is diff --git a/polly/test/CodeGen/run-time-condition-with-scev-parameters.ll b/polly/test/CodeGen/run-time-condition-with-scev-parameters.ll index 4afaab5bbad0a..7984b7ce80209 100644 --- a/polly/test/CodeGen/run-time-condition-with-scev-parameters.ll +++ b/polly/test/CodeGen/run-time-condition-with-scev-parameters.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s --check-prefix=AST -; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s --check-prefix=AST +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s ; TODO: FIXME: Simplify the context.
; AST: if (n >= 1 && 0 == n <= -1) diff --git a/polly/test/CodeGen/run-time-condition.ll b/polly/test/CodeGen/run-time-condition.ll index 914b76f5e0be7..44d2a4f15b378 100644 --- a/polly/test/CodeGen/run-time-condition.ll +++ b/polly/test/CodeGen/run-time-condition.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -passes=polly-codegen -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=polly<no-default-opts>' -S < %s | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" diff --git a/polly/test/CodeGen/scalar-references-used-in-scop-compute.ll b/polly/test/CodeGen/scalar-references-used-in-scop-compute.ll index 77306c1046133..102ef04128133 100644 --- a/polly/test/CodeGen/scalar-references-used-in-scop-compute.ll +++ b/polly/test/CodeGen/scalar-references-used-in-scop-compute.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s ; Test the code generation in the presence of a scalar out-of-scop value being ; used from within the SCoP. diff --git a/polly/test/CodeGen/scalar-store-from-same-bb.ll b/polly/test/CodeGen/scalar-store-from-same-bb.ll index 0c1164b245a43..1988f77086c8a 100644 --- a/polly/test/CodeGen/scalar-store-from-same-bb.ll +++ b/polly/test/CodeGen/scalar-store-from-same-bb.ll @@ -1,5 +1,4 @@ -; RUN: opt %loadNPMPolly \ -; RUN: -passes=polly-codegen -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s ; This test ensures that the expression N + 1 that is stored in the phi-node ; alloca is directly computed and not incorrectly transferred through memory. diff --git a/polly/test/CodeGen/scalar_codegen_crash.ll b/polly/test/CodeGen/scalar_codegen_crash.ll index 375f097283b07..0179072391a33 100644 --- a/polly/test/CodeGen/scalar_codegen_crash.ll +++ b/polly/test/CodeGen/scalar_codegen_crash.ll @@ -1,5 +1,4 @@ -; RUN: opt %loadNPMPolly \ -; RUN: -passes=polly-codegen -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s ; This test case used to crash the scalar code generation. Check that we ; can generate code for it. diff --git a/polly/test/CodeGen/scev-backedgetaken.ll b/polly/test/CodeGen/scev-backedgetaken.ll index e0941690ae489..09fcfe3e4a09c 100644 --- a/polly/test/CodeGen/scev-backedgetaken.ll +++ b/polly/test/CodeGen/scev-backedgetaken.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s ; ; llvm.org/PR48422 ; Use of ScalarEvolution in Codegen not possible because DominatorTree is not updated. diff --git a/polly/test/CodeGen/scev-division-invariant-load.ll b/polly/test/CodeGen/scev-division-invariant-load.ll index 70f090eae07b3..5942ecbe7cee9 100644 --- a/polly/test/CodeGen/scev-division-invariant-load.ll +++ b/polly/test/CodeGen/scev-division-invariant-load.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -S -passes=polly-codegen < %s +; RUN: opt %loadNPMPolly -S '-passes=polly<no-default-opts>' < %s ; ; Check that we generate valid code as we did not use the preloaded ; value of %tmp1 for the access function of the preloaded %tmp4.
diff --git a/polly/test/CodeGen/scev.ll b/polly/test/CodeGen/scev.ll index e2b5afda1bfff..a09d8c5504b1b 100644 --- a/polly/test/CodeGen/scev.ll +++ b/polly/test/CodeGen/scev.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' < %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect < %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" define fastcc void @f () inlinehint align 2 { diff --git a/polly/test/CodeGen/scev_expansion_in_nonaffine.ll b/polly/test/CodeGen/scev_expansion_in_nonaffine.ll index 0adb0ba7eea81..095c362024a83 100644 --- a/polly/test/CodeGen/scev_expansion_in_nonaffine.ll +++ b/polly/test/CodeGen/scev_expansion_in_nonaffine.ll @@ -1,5 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-codegen -S \ -; RUN: -polly-invariant-load-hoisting=true < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S -polly-invariant-load-hoisting=true < %s | FileCheck %s ; bugpoint-reduced testcase of MiBench/consumer-lame/quantize-pvt.c from the ; test-suite. diff --git a/polly/test/CodeGen/scev_looking_through_bitcasts.ll b/polly/test/CodeGen/scev_looking_through_bitcasts.ll index 142e83f820fe7..81f4b96d22a37 100644 --- a/polly/test/CodeGen/scev_looking_through_bitcasts.ll +++ b/polly/test/CodeGen/scev_looking_through_bitcasts.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s ; ; Scalar write of bitcasted value. Instead of writing %b of type ; %structty, the SCEV expression looks through the bitcast such that diff --git a/polly/test/CodeGen/scop_expander_insert_point.ll b/polly/test/CodeGen/scop_expander_insert_point.ll index fd73132258ddc..1cba7567a5e43 100644 --- a/polly/test/CodeGen/scop_expander_insert_point.ll +++ b/polly/test/CodeGen/scop_expander_insert_point.ll @@ -1,5 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-codegen -S \ -; RUN: -polly-invariant-load-hoisting=true < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S -polly-invariant-load-hoisting=true < %s | FileCheck %s ; ; CHECK: entry: ; CHECK-NEXT: %outvalue.141.phiops = alloca i64 diff --git a/polly/test/CodeGen/scop_expander_segfault.ll b/polly/test/CodeGen/scop_expander_segfault.ll index d94a1fdfb2c12..56d37a0175853 100644 --- a/polly/test/CodeGen/scop_expander_segfault.ll +++ b/polly/test/CodeGen/scop_expander_segfault.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-codegen -S %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S %s | FileCheck %s ; ; This test was extracted from gcc in SPEC2006 and it crashed our code ; generation, or to be more precise, the ScopExpander due to an endless diff --git a/polly/test/CodeGen/scop_never_executed_runtime_check_location.ll b/polly/test/CodeGen/scop_never_executed_runtime_check_location.ll index 9f968e5657c90..cdcfe838fa915 100644 --- a/polly/test/CodeGen/scop_never_executed_runtime_check_location.ll +++ b/polly/test/CodeGen/scop_never_executed_runtime_check_location.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s ; Verify that we generate the runtime check code after the conditional branch ; in the SCoP region entering block (here %entry).
diff --git a/polly/test/CodeGen/select-base-pointer.ll b/polly/test/CodeGen/select-base-pointer.ll index 85be37755c474..144c05b5effba 100644 --- a/polly/test/CodeGen/select-base-pointer.ll +++ b/polly/test/CodeGen/select-base-pointer.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -aa-pipeline=tbaa -passes=polly-codegen -disable-output %s +; RUN: opt %loadNPMPolly -aa-pipeline=tbaa '-passes=polly<no-default-opts>' -disable-output %s ; ; Check that we do not crash here. ; diff --git a/polly/test/CodeGen/sequential_loops.ll b/polly/test/CodeGen/sequential_loops.ll index 33a3ee9fbbd47..eeb3048007859 100644 --- a/polly/test/CodeGen/sequential_loops.ll +++ b/polly/test/CodeGen/sequential_loops.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s ;#include <string.h> ;#define N 1024 diff --git a/polly/test/CodeGen/simple_loop_non_single_exit.ll b/polly/test/CodeGen/simple_loop_non_single_exit.ll index a7e36bc4c7330..1b3518bdb0cba 100644 --- a/polly/test/CodeGen/simple_loop_non_single_exit.ll +++ b/polly/test/CodeGen/simple_loop_non_single_exit.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s -check-prefix=CHECK-CODE +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s -check-prefix=CHECK-CODE ; void f(long A[], long N) { ; long i; diff --git a/polly/test/CodeGen/simple_loop_non_single_exit_2.ll b/polly/test/CodeGen/simple_loop_non_single_exit_2.ll index 22e9da09ef857..3af9913e6aa04 100644 --- a/polly/test/CodeGen/simple_loop_non_single_exit_2.ll +++ b/polly/test/CodeGen/simple_loop_non_single_exit_2.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s -check-prefix=CHECK-CODE +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s -check-prefix=CHECK-CODE ; void f(long A[], long N) { ; long i; diff --git a/polly/test/CodeGen/simple_non_single_entry.ll b/polly/test/CodeGen/simple_non_single_entry.ll index c33a77ae07939..8800dc7214b06 100644 --- a/polly/test/CodeGen/simple_non_single_entry.ll +++ b/polly/test/CodeGen/simple_non_single_entry.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s -check-prefix=CHECK-CODE +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s -check-prefix=CHECK-CODE ; void f(long A[], long N) { ; long i; diff --git a/polly/test/CodeGen/simple_nonaffine_loop.ll b/polly/test/CodeGen/simple_nonaffine_loop.ll index bc62047a80a34..5b1cd1991cd73 100644 --- a/polly/test/CodeGen/simple_nonaffine_loop.ll +++ b/polly/test/CodeGen/simple_nonaffine_loop.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -polly-allow-nonaffine -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -polly-allow-nonaffine -disable-output < %s | FileCheck %s ;#include <stdio.h> ;#include <stdlib.h> diff --git a/polly/test/CodeGen/single_do_loop_int_max_iterations.ll b/polly/test/CodeGen/single_do_loop_int_max_iterations.ll index a65e3a25f035a..f0142f726efa4 100644 --- a/polly/test/CodeGen/single_do_loop_int_max_iterations.ll +++ b/polly/test/CodeGen/single_do_loop_int_max_iterations.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s 
| FileCheck %s ;#define N 20 ;#include "limits.h" diff --git a/polly/test/CodeGen/single_do_loop_int_param_iterations.ll b/polly/test/CodeGen/single_do_loop_int_param_iterations.ll index acccb48f18a3c..cc5e7b221026c 100644 --- a/polly/test/CodeGen/single_do_loop_int_param_iterations.ll +++ b/polly/test/CodeGen/single_do_loop_int_param_iterations.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s ; XFAIL: * ;define N 20 diff --git a/polly/test/CodeGen/single_do_loop_ll_max_iterations.ll b/polly/test/CodeGen/single_do_loop_ll_max_iterations.ll index 7a67f6ba96ce2..1299362369478 100644 --- a/polly/test/CodeGen/single_do_loop_ll_max_iterations.ll +++ b/polly/test/CodeGen/single_do_loop_ll_max_iterations.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s -; RUN: opt %loadNPMPolly -passes=polly-codegen < %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' < %s ;#define N 20 ;#include "limits.h" diff --git a/polly/test/CodeGen/single_do_loop_one_iteration.ll b/polly/test/CodeGen/single_do_loop_one_iteration.ll index 2d939167b71ee..d025ef2116a40 100644 --- a/polly/test/CodeGen/single_do_loop_one_iteration.ll +++ b/polly/test/CodeGen/single_do_loop_one_iteration.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s ; XFAIL: * ;#define N 20 diff --git a/polly/test/CodeGen/single_do_loop_scev_replace.ll b/polly/test/CodeGen/single_do_loop_scev_replace.ll index 83c9e9d0324ce..b473e266343a3 100644 --- a/polly/test/CodeGen/single_do_loop_scev_replace.ll +++ b/polly/test/CodeGen/single_do_loop_scev_replace.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s ;#define N 20 ;#include "limits.h" diff --git a/polly/test/CodeGen/single_loop.ll b/polly/test/CodeGen/single_loop.ll index 2db34663e93ce..c04738e6843a0 100644 --- a/polly/test/CodeGen/single_loop.ll +++ b/polly/test/CodeGen/single_loop.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s ;#include <string.h> ;#define N 1024 diff --git a/polly/test/CodeGen/single_loop_int_max_iterations.ll b/polly/test/CodeGen/single_loop_int_max_iterations.ll index f83e8823c63df..82ec7ffd85462 100644 --- a/polly/test/CodeGen/single_loop_int_max_iterations.ll +++ b/polly/test/CodeGen/single_loop_int_max_iterations.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s ;#define N 20 ;#include "limits.h" diff --git a/polly/test/CodeGen/single_loop_ll_max_iterations.ll b/polly/test/CodeGen/single_loop_ll_max_iterations.ll index 1427189d74a7d..8affb71fad649 100644 --- a/polly/test/CodeGen/single_loop_ll_max_iterations.ll +++ b/polly/test/CodeGen/single_loop_ll_max_iterations.ll @@ -1,4 +1,4 @@ -; RUN: opt 
%loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s ;#include "limits.h" ;#define N 20 diff --git a/polly/test/CodeGen/single_loop_one_iteration.ll b/polly/test/CodeGen/single_loop_one_iteration.ll index 1a70d4a879d83..307b8358ff980 100644 --- a/polly/test/CodeGen/single_loop_one_iteration.ll +++ b/polly/test/CodeGen/single_loop_one_iteration.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s ;#define N 20 ; diff --git a/polly/test/CodeGen/single_loop_param.ll b/polly/test/CodeGen/single_loop_param.ll index 44ce1236e9f84..1d78c7a7329d4 100644 --- a/polly/test/CodeGen/single_loop_param.ll +++ b/polly/test/CodeGen/single_loop_param.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" @A = common global [1024 x i32] zeroinitializer, align 16 ; <ptr> [#uses=3] diff --git a/polly/test/CodeGen/single_loop_param_less_equal.ll b/polly/test/CodeGen/single_loop_param_less_equal.ll index fda9bfab11b8f..5fad1d43ae0d7 100644 --- a/polly/test/CodeGen/single_loop_param_less_equal.ll +++ b/polly/test/CodeGen/single_loop_param_less_equal.ll @@ -1,6 +1,6 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s -; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s -check-prefix=CODEGEN -; RUN: opt %loadNPMPolly -passes=polly-codegen < %s | opt -passes='print<loops>' -disable-output 2>&1 | FileCheck %s -check-prefix=LOOPS +; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s -check-prefix=CODEGEN +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' < %s | opt -passes='print<loops>' -disable-output 2>&1 | FileCheck %s -check-prefix=LOOPS target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" @A = common global [1024 x i32] zeroinitializer diff --git a/polly/test/CodeGen/single_loop_param_less_than.ll b/polly/test/CodeGen/single_loop_param_less_than.ll index b888c860eacd0..75a8cb2094a16 100644 --- a/polly/test/CodeGen/single_loop_param_less_than.ll +++ b/polly/test/CodeGen/single_loop_param_less_than.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s -; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s -check-prefix=CODEGEN +; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s -check-prefix=CODEGEN target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" @A = common global [1024 x i32] zeroinitializer diff --git a/polly/test/CodeGen/single_loop_zero_iterations.ll 
b/polly/test/CodeGen/single_loop_zero_iterations.ll index b1ce491b5c8a2..3194dba52190b 100644 --- a/polly/test/CodeGen/single_loop_zero_iterations.ll +++ b/polly/test/CodeGen/single_loop_zero_iterations.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s -check-prefix=SCALAR --allow-empty +; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s -check-prefix=SCALAR --allow-empty ;#define N 20 ; diff --git a/polly/test/CodeGen/split_edge_of_exit.ll b/polly/test/CodeGen/split_edge_of_exit.ll index f4b17e687ada6..73d6006a6b621 100644 --- a/polly/test/CodeGen/split_edge_of_exit.ll +++ b/polly/test/CodeGen/split_edge_of_exit.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s -; RUN: opt %loadNPMPolly -passes=polly-codegen -verify-region-info -disable-output < %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -verify-region-info -disable-output < %s ; ; This is a scop directly preceded by a region, i.e. the scop's entry is the ; region's exit block. This test is to ensure that the RegionInfo is correctly diff --git a/polly/test/CodeGen/split_edges.ll b/polly/test/CodeGen/split_edges.ll index b921202285bb2..03363f49ce800 100644 --- a/polly/test/CodeGen/split_edges.ll +++ b/polly/test/CodeGen/split_edges.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-codegen -verify-region-info -verify-dom-info -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -verify-region-info -verify-dom-info -S < %s | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" @A = common global [1536 x float] zeroinitializer diff --git a/polly/test/CodeGen/split_edges_2.ll b/polly/test/CodeGen/split_edges_2.ll index 8f4d48f5dcb00..59df1618cfd71 100644 --- a/polly/test/CodeGen/split_edges_2.ll +++ b/polly/test/CodeGen/split_edges_2.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-codegen -verify-region-info -verify-dom-info -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -verify-region-info -verify-dom-info -S < %s | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" diff --git a/polly/test/CodeGen/srem-in-other-bb.ll b/polly/test/CodeGen/srem-in-other-bb.ll index a13a1b6ab98f2..177d86adb9066 100644 --- a/polly/test/CodeGen/srem-in-other-bb.ll +++ b/polly/test/CodeGen/srem-in-other-bb.ll @@ -1,5 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-codegen -S \ -; RUN: < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s ; ; void pos(float *A, long n) { ; for (long i = 0; i < 100; i++) diff --git a/polly/test/CodeGen/stack-overflow-in-load-hoisting.ll b/polly/test/CodeGen/stack-overflow-in-load-hoisting.ll index b49c4e12fe11a..5a490b68b9a9f 100644 --- a/polly/test/CodeGen/stack-overflow-in-load-hoisting.ll +++ b/polly/test/CodeGen/stack-overflow-in-load-hoisting.ll @@ -1,5 +1,4 @@ -; RUN: opt %loadNPMPolly -verify-dom-info -passes=polly-codegen -S < %s \ -; RUN: -polly-invariant-load-hoisting=true | FileCheck %s +; RUN: opt %loadNPMPolly -verify-dom-info 
'-passes=polly<no-default-opts>' -S -polly-invariant-load-hoisting=true < %s | FileCheck %s ; ; This caused an infinite recursion during invariant load hoisting at some ; point. Check it does not and we add a "false" runtime check. diff --git a/polly/test/CodeGen/stmt_split_no_dependence.ll b/polly/test/CodeGen/stmt_split_no_dependence.ll index bb878cc342af8..d41e4a87bfb65 100644 --- a/polly/test/CodeGen/stmt_split_no_dependence.ll +++ b/polly/test/CodeGen/stmt_split_no_dependence.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s ; ; CHECK: store i32 %9, ptr %scevgep, align 4, !alias.scope !3, !noalias !6 ; CHECK: store i32 %11, ptr %scevgep4, align 4, !alias.scope !6, !noalias !3 diff --git a/polly/test/CodeGen/switch-in-non-affine-region.ll b/polly/test/CodeGen/switch-in-non-affine-region.ll index 1a9e7081bebdc..6696efca63f02 100644 --- a/polly/test/CodeGen/switch-in-non-affine-region.ll +++ b/polly/test/CodeGen/switch-in-non-affine-region.ll @@ -1,5 +1,4 @@ -; RUN: opt %loadNPMPolly \ -; RUN: -S -passes=polly-codegen < %s | FileCheck %s +; RUN: opt %loadNPMPolly -S '-passes=polly<no-default-opts>' < %s | FileCheck %s ; ; void f(int *A, int N) { ; for (int i = 0; i < N; i++) diff --git a/polly/test/CodeGen/synthesizable_phi_write_after_loop.ll b/polly/test/CodeGen/synthesizable_phi_write_after_loop.ll index b2a062363eef4..86395f25db1a8 100644 --- a/polly/test/CodeGen/synthesizable_phi_write_after_loop.ll +++ b/polly/test/CodeGen/synthesizable_phi_write_after_loop.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s ; ; Check for the correct written value of a scalar phi write whose value is ; defined within the loop, but its effective value is its last definition when diff --git a/polly/test/CodeGen/test-invalid-operands-for-select-2.ll b/polly/test/CodeGen/test-invalid-operands-for-select-2.ll index 5668063c27c8e..b5172badd76dc 100644 --- a/polly/test/CodeGen/test-invalid-operands-for-select-2.ll +++ b/polly/test/CodeGen/test-invalid-operands-for-select-2.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -S -passes=polly-codegen -verify-loop-info < %s | FileCheck %s +; RUN: opt %loadNPMPolly -S '-passes=polly<no-default-opts>' -verify-loop-info < %s | FileCheck %s ; ; Check that we do not crash as described here: http://llvm.org/bugs/show_bug.cgi?id=21167 ; diff --git a/polly/test/CodeGen/test-invalid-operands-for-select.ll b/polly/test/CodeGen/test-invalid-operands-for-select.ll index fdc98fbb4d9e7..39cadc78f7e36 100644 --- a/polly/test/CodeGen/test-invalid-operands-for-select.ll +++ b/polly/test/CodeGen/test-invalid-operands-for-select.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -S -passes=polly-codegen < %s | FileCheck %s +; RUN: opt %loadNPMPolly -S '-passes=polly<no-default-opts>' < %s | FileCheck %s ; ; Check that we do not crash as described here: http://llvm.org/PR21167 ; diff --git a/polly/test/CodeGen/test.ll b/polly/test/CodeGen/test.ll index aad998ba2728b..7c28ca4860e79 100644 --- a/polly/test/CodeGen/test.ll +++ b/polly/test/CodeGen/test.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s ; XFAIL: * ;int bar1(); diff --git a/polly/test/CodeGen/two-loops-right-after-each-other-2.ll 
b/polly/test/CodeGen/two-loops-right-after-each-other-2.ll index 1c68389eaeba8..d97a632fc382e 100644 --- a/polly/test/CodeGen/two-loops-right-after-each-other-2.ll +++ b/polly/test/CodeGen/two-loops-right-after-each-other-2.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s ; CHECK: polly.merge_new_and_old: ; CHECK-NEXT: merge = phi diff --git a/polly/test/CodeGen/two-scops-in-row-invalidate-scevs.ll b/polly/test/CodeGen/two-scops-in-row-invalidate-scevs.ll index 4396c38310dce..845d106d43b0e 100644 --- a/polly/test/CodeGen/two-scops-in-row-invalidate-scevs.ll +++ b/polly/test/CodeGen/two-scops-in-row-invalidate-scevs.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s ; ; CHECK-LABEL: for.cond: ; CHECK: %num.0 = phi i32 [ %add, %for.body15 ], [ 0, %for.cond.pre_entry_bb ] diff --git a/polly/test/CodeGen/two-scops-in-row.ll b/polly/test/CodeGen/two-scops-in-row.ll index dd3f310ef1502..4b9d49cb02ec6 100644 --- a/polly/test/CodeGen/two-scops-in-row.ll +++ b/polly/test/CodeGen/two-scops-in-row.ll @@ -1,6 +1,6 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -polly-ignore-aliasing -disable-output < %s | FileCheck %s -check-prefix=SCALAR -; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-ignore-aliasing -disable-output < %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -polly-ignore-aliasing -disable-output < %s | FileCheck %s -check-prefix=SCALAR +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-ignore-aliasing -disable-output < %s target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" ; SCALAR: if ( diff --git a/polly/test/CodeGen/udiv_expansion_position.ll b/polly/test/CodeGen/udiv_expansion_position.ll index 354e3cd180107..2a3ba8ae45757 100644 --- a/polly/test/CodeGen/udiv_expansion_position.ll +++ b/polly/test/CodeGen/udiv_expansion_position.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s ; ; Verify we do not crash when we synthesize code for the udiv in the SCoP. ; diff --git a/polly/test/CodeGen/uninitialized_scalar_memory.ll b/polly/test/CodeGen/uninitialized_scalar_memory.ll index e08af07e604e8..ad0e6ca7e350b 100644 --- a/polly/test/CodeGen/uninitialized_scalar_memory.ll +++ b/polly/test/CodeGen/uninitialized_scalar_memory.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -S -passes=polly-codegen < %s | FileCheck %s +; RUN: opt %loadNPMPolly -S '-passes=polly<no-default-opts>' < %s | FileCheck %s ; ; Verify we initialize the scalar locations reserved for the incoming phi ; values.
diff --git a/polly/test/CodeGen/unpredictable-loop-unsynthesizable.ll b/polly/test/CodeGen/unpredictable-loop-unsynthesizable.ll index 46706804a81b0..e7f4d601edab5 100644 --- a/polly/test/CodeGen/unpredictable-loop-unsynthesizable.ll +++ b/polly/test/CodeGen/unpredictable-loop-unsynthesizable.ll @@ -1,7 +1,5 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' \ -; RUN: -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -passes=polly-codegen \ -; RUN: -polly-invariant-load-hoisting=true -disable-output < %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=true -disable-output < %s ; The loop for.body is a scop with invariant load hoisting, but does not ; terminate predictably for ScalarEvolution. The scalar %1 therefore is not diff --git a/polly/test/CodeGen/variant_load_empty_domain.ll b/polly/test/CodeGen/variant_load_empty_domain.ll index 6f2d3dc582db3..d1f4450d086e0 100644 --- a/polly/test/CodeGen/variant_load_empty_domain.ll +++ b/polly/test/CodeGen/variant_load_empty_domain.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-codegen -disable-output < %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -disable-output < %s ; ; ; void f(int *A) { diff --git a/polly/test/CodeGen/whole-scop-non-affine-subregion.ll b/polly/test/CodeGen/whole-scop-non-affine-subregion.ll index b342b1cb5aa27..44f6dbcd34d1d 100644 --- a/polly/test/CodeGen/whole-scop-non-affine-subregion.ll +++ b/polly/test/CodeGen/whole-scop-non-affine-subregion.ll @@ -1,5 +1,4 @@ -; RUN: opt %loadNPMPolly \ -; RUN: -passes=polly-codegen -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s ; CHECK: polly.start ; int /* pure */ g() diff --git a/polly/test/DeLICM/confused_order.ll b/polly/test/DeLICM/confused_order.ll index 0c19eb6aa605a..de340ef48d16e 100644 --- a/polly/test/DeLICM/confused_order.ll +++ b/polly/test/DeLICM/confused_order.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-delicm>' -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck %s -; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,polly-delicm' -polly-import-jscop-postfix=transformed -disable-output -pass-remarks-missed=polly-delicm < %s 2>&1 | FileCheck %s -check-prefix=REMARKS +; RUN: opt %loadNPMPolly '-passes=polly-custom<delicm>' -polly-print-delicm -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop>' -polly-import-jscop-postfix=transformed -disable-output -pass-remarks-missed=polly-delicm < %s 2>&1 | FileCheck %s -check-prefix=REMARKS ; ; ForwardOptree changes the SCoP and may already map some accesses. 
; DeLICM must be prepared to encounter implicit reads diff --git a/polly/test/DeLICM/contradicting_assumed_context_and_domain.ll b/polly/test/DeLICM/contradicting_assumed_context_and_domain.ll index 66d9ae889e657..ba42692febab2 100644 --- a/polly/test/DeLICM/contradicting_assumed_context_and_domain.ll +++ b/polly/test/DeLICM/contradicting_assumed_context_and_domain.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-delicm>' -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<delicm>' -polly-print-delicm -disable-output < %s | FileCheck %s ; ; The domain of bb14 contradicts the SCoP's assumptions. This leads to ; 'anything goes' inside the statement since it is never executed, diff --git a/polly/test/DeLICM/load-in-cond-inf-loop.ll b/polly/test/DeLICM/load-in-cond-inf-loop.ll index a78a4691bb0d5..19cc334f70054 100644 --- a/polly/test/DeLICM/load-in-cond-inf-loop.ll +++ b/polly/test/DeLICM/load-in-cond-inf-loop.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-delicm>' -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<delicm>' -polly-print-delicm -disable-output < %s | FileCheck %s ; When %b is 0, %for.body13 is an infinite loop. In this case the loaded ; value %1 is not used anywhere. diff --git a/polly/test/DeLICM/map_memset_zero.ll b/polly/test/DeLICM/map_memset_zero.ll index 9a8e5989fdad1..cc4e0ab387d2a 100644 --- a/polly/test/DeLICM/map_memset_zero.ll +++ b/polly/test/DeLICM/map_memset_zero.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-delicm>' -disable-output < %s | FileCheck -match-full-lines %s -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-delicm>' -disable-output < %s | FileCheck -match-full-lines %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<delicm>' -polly-print-delicm -disable-output < %s | FileCheck -match-full-lines %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<delicm>' -polly-print-delicm -disable-output < %s | FileCheck -match-full-lines %s ; ; Check that PHI mapping works even in the presence of a memset whose ; zero value is used.
diff --git a/polly/test/DeLICM/nomap_alreadymapped.ll b/polly/test/DeLICM/nomap_alreadymapped.ll index da5f4ec24a47e..9e49300381b57 100644 --- a/polly/test/DeLICM/nomap_alreadymapped.ll +++ b/polly/test/DeLICM/nomap_alreadymapped.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-delicm>' -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<delicm>' -polly-print-delicm -disable-output < %s | FileCheck %s ; ; void func(double *A) { ; for (int j = 0; j < 2; j += 1) { /* outer */ diff --git a/polly/test/DeLICM/nomap_escaping.ll b/polly/test/DeLICM/nomap_escaping.ll index 60955368fe59c..6460dbdb808fb 100644 --- a/polly/test/DeLICM/nomap_escaping.ll +++ b/polly/test/DeLICM/nomap_escaping.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-delicm>' -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<delicm>' -polly-print-delicm -disable-output < %s | FileCheck %s ; ; void func(double *A) { ; for (int j = 0; j < 2; j += 1) { /* outer */ diff --git a/polly/test/DeLICM/nomap_occupied.ll b/polly/test/DeLICM/nomap_occupied.ll index 9ba8ce2641231..72eea57b8fdf5 100644 --- a/polly/test/DeLICM/nomap_occupied.ll +++ b/polly/test/DeLICM/nomap_occupied.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-delicm>' -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<delicm>' -polly-print-delicm -disable-output < %s | FileCheck %s ; ; void func(double *A) { ; for (int j = 0; j < 2; j += 1) { /* outer */ diff --git a/polly/test/DeLICM/nomap_readonly.ll b/polly/test/DeLICM/nomap_readonly.ll index 7a185d336bad3..67bac06f1505f 100644 --- a/polly/test/DeLICM/nomap_readonly.ll +++ b/polly/test/DeLICM/nomap_readonly.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-delicm>' -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<delicm>' -polly-print-delicm -disable-output < %s | FileCheck %s ; ; void func(double *A) { ; fsomeval = 21.0 + 21.0; diff --git a/polly/test/DeLICM/nomap_spuriouswrite.ll b/polly/test/DeLICM/nomap_spuriouswrite.ll index 0ed7f6ee8e239..f3fcb0ccd06e4 100644 --- a/polly/test/DeLICM/nomap_spuriouswrite.ll +++ b/polly/test/DeLICM/nomap_spuriouswrite.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-delicm>' -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<delicm>' -polly-print-delicm -disable-output < %s | FileCheck %s ; ; void func(double *A) { ; for (int j = 0; j < 2; j += 1) { /* outer */ diff --git a/polly/test/DeLICM/nomap_storagesize.ll b/polly/test/DeLICM/nomap_storagesize.ll index bf851ac342d20..0f2943a5b1417 100644 --- a/polly/test/DeLICM/nomap_storagesize.ll +++ b/polly/test/DeLICM/nomap_storagesize.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-delicm>' -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<delicm>' -polly-print-delicm -disable-output < %s | FileCheck %s ; ; void func(float *A) { ; for (int j = 0; j < 2; j += 1) { /* outer */ diff --git a/polly/test/DeLICM/nomap_writewrite.ll b/polly/test/DeLICM/nomap_writewrite.ll index 9fcd52aad743c..fc8459a34972c 100644 --- a/polly/test/DeLICM/nomap_writewrite.ll +++ b/polly/test/DeLICM/nomap_writewrite.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-delicm>' -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<delicm>' -polly-print-delicm -disable-output 
< %s | FileCheck %s ; ; void func(double *A) { ; for (int j = 0; j < 2; j += 1) { /* outer */ diff --git a/polly/test/DeLICM/outofquota-reverseDomain.ll b/polly/test/DeLICM/outofquota-reverseDomain.ll index 1f7527c841208..d48665bdc29c1 100644 --- a/polly/test/DeLICM/outofquota-reverseDomain.ll +++ b/polly/test/DeLICM/outofquota-reverseDomain.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-delicm-max-ops=1000000 '-passes=print<polly-delicm>' -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly -polly-delicm-max-ops=1000000 '-passes=polly-custom<delicm>' -polly-print-delicm -disable-output < %s | FileCheck %s ; ; This causes an assertion to fail on out-of-quota after 1000000 operations. ; (The error was specific to -polly-delicm-max-ops=1000000 and changes diff --git a/polly/test/DeLICM/pass_existence.ll b/polly/test/DeLICM/pass_existence.ll index 64302d9983261..d784655db60f3 100644 --- a/polly/test/DeLICM/pass_existence.ll +++ b/polly/test/DeLICM/pass_existence.ll @@ -1,6 +1,6 @@ -; RUN: opt %loadNPMPolly -passes=polly-delicm -disable-output < %s -; RUN: opt %loadNPMPolly '-passes=print<polly-delicm>' -disable-output < %s | FileCheck %s -; RUN: opt %loadNPMPolly '-passes=scop(print<polly-delicm>)' -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<delicm>' -disable-output < %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<delicm>' -polly-print-delicm -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<delicm>' -polly-print-delicm -disable-output < %s | FileCheck %s ; ; Simple test for the existence of the DeLICM pass. ; diff --git a/polly/test/DeLICM/pr41656.ll b/polly/test/DeLICM/pr41656.ll index 2a92503809a24..82799e4fd1ab8 100644 --- a/polly/test/DeLICM/pr41656.ll +++ b/polly/test/DeLICM/pr41656.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>,scop(print<polly-delicm>)' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<delicm>' -polly-print-scops -polly-print-delicm -disable-output < %s 2>&1 | FileCheck %s ; ; llvm.org/PR41656 ; diff --git a/polly/test/DeLICM/pr48783.ll b/polly/test/DeLICM/pr48783.ll index deba8bfcc5daf..10f8b64c3dd2f 100644 --- a/polly/test/DeLICM/pr48783.ll +++ b/polly/test/DeLICM/pr48783.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>,scop(print<polly-delicm>)' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<delicm>' -polly-print-scops -polly-print-delicm -disable-output < %s 2>&1 | FileCheck %s ; ; llvm.org/PR48783 ; diff --git a/polly/test/DeLICM/reduction.ll b/polly/test/DeLICM/reduction.ll index 29b7a3617300b..5d6531f51d570 100644 --- a/polly/test/DeLICM/reduction.ll +++ b/polly/test/DeLICM/reduction.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-delicm-partial-writes=true '-passes=print<polly-delicm>' -disable-output < %s | FileCheck -match-full-lines %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-delicm-partial-writes=true '-passes=polly-custom<delicm>' -polly-print-delicm -disable-output < %s | FileCheck -match-full-lines %s ; ; void func(double *A) { ; for (int j = 0; j < 2; j += 1) { /* outer */ diff --git a/polly/test/DeLICM/reduction_constant_selfconflict.ll b/polly/test/DeLICM/reduction_constant_selfconflict.ll index 012e0a0794b2b..223a429d76343 100644 --- a/polly/test/DeLICM/reduction_constant_selfconflict.ll +++ b/polly/test/DeLICM/reduction_constant_selfconflict.ll @@ -1,4 +1,4 @@ -; 
RUN: opt %loadPolly -polly-flatten-schedule -polly-delicm-compute-known=true -polly-print-delicm -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<flatten;delicm>' -polly-delicm-compute-known=true -polly-print-delicm -disable-output < %s | FileCheck %s ; ; void func(double *A) { ; for (int j = 0; j < 2; j += 1) { /* outer */ diff --git a/polly/test/DeLICM/reduction_looprotate.ll b/polly/test/DeLICM/reduction_looprotate.ll index 341cc091f7e18..b8eefe5e57cf8 100644 --- a/polly/test/DeLICM/reduction_looprotate.ll +++ b/polly/test/DeLICM/reduction_looprotate.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadPolly -polly-flatten-schedule -polly-print-delicm -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<flatten;delicm>' -polly-print-delicm -disable-output < %s | FileCheck %s ; ; void func(double *A) { ; for (int j = 0; j < 2; j += 1) { /* outer */ diff --git a/polly/test/DeLICM/reduction_looprotate_alwaystaken.ll b/polly/test/DeLICM/reduction_looprotate_alwaystaken.ll index a58eabb4fbd82..627a4452c3f90 100644 --- a/polly/test/DeLICM/reduction_looprotate_alwaystaken.ll +++ b/polly/test/DeLICM/reduction_looprotate_alwaystaken.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadPolly -polly-flatten-schedule -polly-delicm-overapproximate-writes=true -polly-delicm-compute-known=true -polly-print-delicm -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<flatten;delicm>' -polly-delicm-overapproximate-writes=true -polly-delicm-compute-known=true -polly-print-delicm -disable-output < %s | FileCheck %s ; ; Verify that delicm can cope with never taken PHI incoming edges. ; The edge %body -> %body_phi is never taken, hence the access MemoryKind::PHI, diff --git a/polly/test/DeLICM/reduction_looprotate_gvnpre.ll b/polly/test/DeLICM/reduction_looprotate_gvnpre.ll index 5a81441cf0eea..1d3a789f7ce07 100644 --- a/polly/test/DeLICM/reduction_looprotate_gvnpre.ll +++ b/polly/test/DeLICM/reduction_looprotate_gvnpre.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-flatten-schedule -polly-delicm-overapproximate-writes=true -polly-delicm-compute-known=true -polly-print-delicm -disable-output < %s | FileCheck %s -; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-flatten-schedule -polly-delicm-partial-writes=true -polly-delicm-compute-known=true -polly-print-delicm -disable-output < %s | FileCheck -check-prefix=PARTIAL %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<flatten;delicm>' -polly-delicm-overapproximate-writes=true -polly-delicm-compute-known=true -polly-print-delicm -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<flatten;delicm>' -polly-delicm-partial-writes=true -polly-delicm-compute-known=true -polly-print-delicm -disable-output < %s | FileCheck -check-prefix=PARTIAL %s ; ; void func(double *A) { ; for (int j = 0; j < 2; j += 1) { /* outer */ diff --git a/polly/test/DeLICM/reduction_looprotate_gvnpre_cond1.ll b/polly/test/DeLICM/reduction_looprotate_gvnpre_cond1.ll index d9c5268e631df..37499cd73020f 100644 --- a/polly/test/DeLICM/reduction_looprotate_gvnpre_cond1.ll +++ b/polly/test/DeLICM/reduction_looprotate_gvnpre_cond1.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-delicm>' -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<delicm>' -polly-print-delicm -disable-output < %s | FileCheck 
%s -match-full-lines ; ; Load (but not store) of A[j] hoisted, reduction only over some iterations. ; diff --git a/polly/test/DeLICM/reduction_looprotate_gvnpre_cond2.ll b/polly/test/DeLICM/reduction_looprotate_gvnpre_cond2.ll index 6a4223f5af655..79a700ff122e2 100644 --- a/polly/test/DeLICM/reduction_looprotate_gvnpre_cond2.ll +++ b/polly/test/DeLICM/reduction_looprotate_gvnpre_cond2.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-delicm>' -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<delicm>' -polly-print-delicm -disable-output < %s | FileCheck %s -match-full-lines ; ; Load (but not store) of A[j] hoisted, reduction not written in all iterations. ; FIXME: %join is not mapped because the MemoryKind::Value mapping does not diff --git a/polly/test/DeLICM/reduction_looprotate_gvnpre_nopreheader.ll b/polly/test/DeLICM/reduction_looprotate_gvnpre_nopreheader.ll index bf4b8018d5526..7e82daa9f80fc 100644 --- a/polly/test/DeLICM/reduction_looprotate_gvnpre_nopreheader.ll +++ b/polly/test/DeLICM/reduction_looprotate_gvnpre_nopreheader.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-delicm>' -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<delicm>' -polly-print-delicm -disable-output < %s | FileCheck %s -match-full-lines ; ; Hoisted reduction load (but not the store) without preheader. ; diff --git a/polly/test/DeLICM/reduction_looprotate_hoisted.ll b/polly/test/DeLICM/reduction_looprotate_hoisted.ll index 795b94912aa42..7dc6e0fa9e408 100644 --- a/polly/test/DeLICM/reduction_looprotate_hoisted.ll +++ b/polly/test/DeLICM/reduction_looprotate_hoisted.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-invariant-load-hoisting -polly-flatten-schedule -polly-delicm-overapproximate-writes=true -polly-delicm-compute-known=true -polly-print-delicm -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-invariant-load-hoisting '-passes=polly-custom<flatten;delicm>' -polly-delicm-overapproximate-writes=true -polly-delicm-compute-known=true -polly-print-delicm -disable-output < %s | FileCheck %s ; ; void func(int *A, int* StartPtr) { ; for (int j = 0; j < 2; j += 1) { /* outer */ diff --git a/polly/test/DeLICM/reduction_looprotate_licm.ll b/polly/test/DeLICM/reduction_looprotate_licm.ll index 935f31abced30..a9c55a8f54087 100644 --- a/polly/test/DeLICM/reduction_looprotate_licm.ll +++ b/polly/test/DeLICM/reduction_looprotate_licm.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-flatten-schedule -polly-delicm-overapproximate-writes=true -polly-delicm-compute-known=true -polly-print-delicm -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<flatten;delicm>' -polly-delicm-overapproximate-writes=true -polly-delicm-compute-known=true -polly-print-delicm -disable-output < %s | FileCheck %s ; ; void func(double *A) { ; for (int j = 0; j < 2; j += 1) { /* outer */ diff --git a/polly/test/DeLICM/reduction_looprotate_licm2.ll b/polly/test/DeLICM/reduction_looprotate_licm2.ll index 8b06e7466f20a..b98950b71bc85 100644 --- a/polly/test/DeLICM/reduction_looprotate_licm2.ll +++ b/polly/test/DeLICM/reduction_looprotate_licm2.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-flatten-schedule
-polly-print-delicm -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<flatten;delicm>' -polly-print-delicm -disable-output < %s | FileCheck %s ; ; Use %phi instead of the normal %add; that is, the last iteration will ; be ignored such that the %phi cannot be written to A[3] in %body. diff --git a/polly/test/DeLICM/reduction_looprotate_licm_double_write.ll b/polly/test/DeLICM/reduction_looprotate_licm_double_write.ll index 51bb7291a73ed..4424d904b607d 100644 --- a/polly/test/DeLICM/reduction_looprotate_licm_double_write.ll +++ b/polly/test/DeLICM/reduction_looprotate_licm_double_write.ll @@ -1,7 +1,4 @@ -; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-flatten-schedule \ -; RUN: -polly-delicm-overapproximate-writes=true \ -; RUN: -polly-delicm-compute-known=true -polly-print-delicm \ -; RUN: -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<flatten;delicm>' -polly-delicm-overapproximate-writes=true -polly-delicm-compute-known=true -polly-print-delicm -disable-output < %s | FileCheck %s ; ; Make sure delicm works even in the case of two stores that store the same value. ; diff --git a/polly/test/DeLICM/reduction_looprotate_licm_nopreheader.ll b/polly/test/DeLICM/reduction_looprotate_licm_nopreheader.ll index 027df44e86193..7d20b8d5c7cbf 100644 --- a/polly/test/DeLICM/reduction_looprotate_licm_nopreheader.ll +++ b/polly/test/DeLICM/reduction_looprotate_licm_nopreheader.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-delicm>' -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<delicm>' -polly-print-delicm -disable-output < %s | FileCheck %s ; ; Register-promoted reduction but without preheader. ; diff --git a/polly/test/DeLICM/reduction_looprotate_load.ll b/polly/test/DeLICM/reduction_looprotate_load.ll index 6aa83ae195031..e288a86f30719 100644 --- a/polly/test/DeLICM/reduction_looprotate_load.ll +++ b/polly/test/DeLICM/reduction_looprotate_load.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-flatten-schedule -polly-delicm-overapproximate-writes=true -polly-delicm-compute-known=true -polly-print-delicm -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<flatten;delicm>' -polly-delicm-overapproximate-writes=true -polly-delicm-compute-known=true -polly-print-delicm -disable-output < %s | FileCheck %s ; ; void func(int *A, double* StartPtr) { ; for (int j = 0; j < 2; j += 1) { /* outer */ diff --git a/polly/test/DeLICM/reduction_looprotate_loopguard_gvnpre.ll b/polly/test/DeLICM/reduction_looprotate_loopguard_gvnpre.ll index 4ea3fa53a339a..4582f0a36eb5c 100644 --- a/polly/test/DeLICM/reduction_looprotate_loopguard_gvnpre.ll +++ b/polly/test/DeLICM/reduction_looprotate_loopguard_gvnpre.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-delicm>' -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<delicm>' -polly-print-delicm -disable-output < %s | FileCheck %s -match-full-lines ; ; Reduction over parametric number of elements and a loopguard if the ; reduction loop is not executed at all. Load hoisted before loop.
diff --git a/polly/test/DeLICM/reduction_looprotate_loopguard_licm1.ll b/polly/test/DeLICM/reduction_looprotate_loopguard_licm1.ll index 2e7abe444ad65..7df2885e01339 100644 --- a/polly/test/DeLICM/reduction_looprotate_loopguard_licm1.ll +++ b/polly/test/DeLICM/reduction_looprotate_loopguard_licm1.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-delicm>' -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<delicm>' -polly-print-delicm -disable-output < %s | FileCheck %s -match-full-lines ; ; Reduction over parametric number of elements and a loopguard if the ; reduction loop is not executed at all. diff --git a/polly/test/DeLICM/reduction_looprotate_loopguard_licm2.ll b/polly/test/DeLICM/reduction_looprotate_loopguard_licm2.ll index 60afdeb5fc97e..a1bd5d3f90fe7 100644 --- a/polly/test/DeLICM/reduction_looprotate_loopguard_licm2.ll +++ b/polly/test/DeLICM/reduction_looprotate_loopguard_licm2.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-delicm>' -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<delicm>' -polly-print-delicm -disable-output < %s | FileCheck %s -match-full-lines ; ; Reduction over parametric number of elements and a loopguard if the ; reduction loop is not executed at all, such that A[j] is also not written to. diff --git a/polly/test/DeLICM/reduction_looprotate_loopguard_licm3.ll b/polly/test/DeLICM/reduction_looprotate_loopguard_licm3.ll index e63b457de92db..8329a85ecf13b 100644 --- a/polly/test/DeLICM/reduction_looprotate_loopguard_licm3.ll +++ b/polly/test/DeLICM/reduction_looprotate_loopguard_licm3.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-delicm>' -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<delicm>' -polly-print-delicm -disable-output < %s | FileCheck %s -match-full-lines ; ; Reduction over parametric number of elements and a loopguard if the ; reduction loop is not executed at all, such that A[j] is also not accessed. 
diff --git a/polly/test/DeLICM/reduction_looprotate_readonly.ll b/polly/test/DeLICM/reduction_looprotate_readonly.ll index a9535467b3bde..5227f42ae4824 100644 --- a/polly/test/DeLICM/reduction_looprotate_readonly.ll +++ b/polly/test/DeLICM/reduction_looprotate_readonly.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-flatten-schedule -polly-delicm-overapproximate-writes=true -polly-delicm-compute-known=true -polly-print-delicm -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<flatten;delicm>' -polly-delicm-overapproximate-writes=true -polly-delicm-compute-known=true -polly-print-delicm -disable-output < %s | FileCheck %s ; ; void func(double *A, double Start) { ; for (int j = 0; j < 2; j += 1) { /* outer */ diff --git a/polly/test/DeLICM/reduction_looprotate_synthesizable.ll b/polly/test/DeLICM/reduction_looprotate_synthesizable.ll index 3d486910c8612..77d823c8ef6d5 100644 --- a/polly/test/DeLICM/reduction_looprotate_synthesizable.ll +++ b/polly/test/DeLICM/reduction_looprotate_synthesizable.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-flatten-schedule -polly-delicm-overapproximate-writes=true -polly-delicm-compute-known=true -polly-print-delicm -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<flatten;delicm>' -polly-delicm-overapproximate-writes=true -polly-delicm-compute-known=true -polly-print-delicm -disable-output < %s | FileCheck %s ; ; void func(int *A) { ; for (int j = 0; j < 2; j += 1) { /* outer */ diff --git a/polly/test/DeLICM/reduction_looprotate_undef.ll b/polly/test/DeLICM/reduction_looprotate_undef.ll index 8c0544ed77852..f70df6075c2d3 100644 --- a/polly/test/DeLICM/reduction_looprotate_undef.ll +++ b/polly/test/DeLICM/reduction_looprotate_undef.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-flatten-schedule -polly-delicm-overapproximate-writes=true -polly-delicm-compute-known=true -polly-print-delicm -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<flatten;delicm>' -polly-delicm-overapproximate-writes=true -polly-delicm-compute-known=true -polly-print-delicm -disable-output < %s | FileCheck %s ; ; void func(int *A) { ; for (int j = 0; j < 2; j += 1) { /* outer */ diff --git a/polly/test/DeLICM/reduction_overapproximate.ll b/polly/test/DeLICM/reduction_overapproximate.ll index 2d33d3a0ece2a..d6cbb70a84a4a 100644 --- a/polly/test/DeLICM/reduction_overapproximate.ll +++ b/polly/test/DeLICM/reduction_overapproximate.ll @@ -1,6 +1,6 @@ -; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-flatten-schedule -polly-delicm-compute-known=true -polly-delicm-overapproximate-writes=true -polly-delicm-partial-writes=false -polly-print-delicm -disable-output < %s | FileCheck %s --check-prefix=APPROX -; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-flatten-schedule -polly-delicm-compute-known=true -polly-delicm-overapproximate-writes=false -polly-delicm-partial-writes=false -polly-print-delicm -disable-output < %s | FileCheck %s --check-prefix=EXACT -; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-flatten-schedule -polly-delicm-compute-known=true -polly-delicm-partial-writes=true -polly-print-delicm -disable-output < %s | FileCheck %s --check-prefix=PARTIAL +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<flatten;delicm>' -polly-delicm-compute-known=true 
-polly-delicm-overapproximate-writes=true -polly-delicm-partial-writes=false -polly-print-delicm -disable-output < %s | FileCheck %s --check-prefix=APPROX +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<flatten;delicm>' -polly-delicm-compute-known=true -polly-delicm-overapproximate-writes=false -polly-delicm-partial-writes=false -polly-print-delicm -disable-output < %s | FileCheck %s --check-prefix=EXACT +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<flatten;delicm>' -polly-delicm-compute-known=true -polly-delicm-partial-writes=true -polly-print-delicm -disable-output < %s | FileCheck %s --check-prefix=PARTIAL ; ; void func(double *A) { ; for (int j = -1; j < 3; j += 1) { /* outer */ diff --git a/polly/test/DeLICM/reduction_preheader.ll b/polly/test/DeLICM/reduction_preheader.ll index c6e3643797c04..f3ce58b1bc954 100644 --- a/polly/test/DeLICM/reduction_preheader.ll +++ b/polly/test/DeLICM/reduction_preheader.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadPolly -polly-flatten-schedule -polly-print-delicm -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<flatten;delicm>' -polly-print-delicm -disable-output < %s | FileCheck %s ; ; void func(double *A) { ; for (int j = 0; j < 2; j += 1) { /* outer */ diff --git a/polly/test/DeLICM/reduction_unrelatedunusual.ll b/polly/test/DeLICM/reduction_unrelatedunusual.ll index 97826f603e5d4..542cec71ab855 100644 --- a/polly/test/DeLICM/reduction_unrelatedunusual.ll +++ b/polly/test/DeLICM/reduction_unrelatedunusual.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-delicm-partial-writes=true '-passes=print<polly-delicm>' -disable-output < %s | FileCheck -match-full-lines %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-delicm-partial-writes=true '-passes=polly-custom<delicm>' -polly-print-delicm -disable-output < %s | FileCheck -match-full-lines %s ; ; Map %add and %phi to A[j]. ; The non-analyzable store to C[0] is unrelated and can be ignored.
diff --git a/polly/test/DeLICM/reject_loadafterstore.ll b/polly/test/DeLICM/reject_loadafterstore.ll index 4460620852a85..d56b237aa71d9 100644 --- a/polly/test/DeLICM/reject_loadafterstore.ll +++ b/polly/test/DeLICM/reject_loadafterstore.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-delicm>' -disable-output -pass-remarks-missed=polly-delicm < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<delicm>' -polly-print-delicm -disable-output -pass-remarks-missed=polly-delicm < %s 2>&1 | FileCheck %s ; ; void func(double *A) { ; for (int j = 0; j < 2; j += 1) { /* outer */ diff --git a/polly/test/DeLICM/reject_outofquota.ll b/polly/test/DeLICM/reject_outofquota.ll index 9bc6bf1f23733..9b7f8e5f97af3 100644 --- a/polly/test/DeLICM/reject_outofquota.ll +++ b/polly/test/DeLICM/reject_outofquota.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-delicm>' -pass-remarks-analysis=polly-delicm -polly-delicm-max-ops=1 -disable-output < %s 2>&1 | FileCheck %s -; RUN: opt %loadNPMPolly '-passes=polly-delicm,print<polly-dependences>' -polly-delicm-max-ops=1 -polly-dependences-computeout=0 -disable-output < %s | FileCheck %s -check-prefix=DEP +; RUN: opt %loadNPMPolly '-passes=polly-custom<delicm>' -polly-print-delicm -pass-remarks-analysis=polly-delicm -polly-delicm-max-ops=1 -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<deps;delicm>' -polly-print-deps -polly-delicm-max-ops=1 -polly-dependences-computeout=0 -disable-output < %s | FileCheck %s -check-prefix=DEP ; ; void func(double *A) { ; for (int j = 0; j < 2; j += 1) { /* outer */ diff --git a/polly/test/DeLICM/reject_storeafterstore.ll b/polly/test/DeLICM/reject_storeafterstore.ll index ddd13dad2ed31..0fea4d7bb3960 100644 --- a/polly/test/DeLICM/reject_storeafterstore.ll +++ b/polly/test/DeLICM/reject_storeafterstore.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-delicm>' -pass-remarks-missed=polly-delicm -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<delicm>' -polly-print-delicm -pass-remarks-missed=polly-delicm -disable-output < %s 2>&1 | FileCheck %s ; ; void func(double *A) { ; for (int j = 0; j < 2; j += 1) { /* outer */ diff --git a/polly/test/DeLICM/reject_storeinsubregion.ll b/polly/test/DeLICM/reject_storeinsubregion.ll index c987156b51cd1..0b75c16495c5c 100644 --- a/polly/test/DeLICM/reject_storeinsubregion.ll +++ b/polly/test/DeLICM/reject_storeinsubregion.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-delicm>' -pass-remarks-missed=polly-delicm -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<delicm>' -polly-print-delicm -pass-remarks-missed=polly-delicm -disable-output < %s 2>&1 | FileCheck %s ; ; void func(double *A) { ; for (int j = 0; j < 2; j += 1) { /* outer */ diff --git a/polly/test/DeLICM/reject_unusualstore.ll b/polly/test/DeLICM/reject_unusualstore.ll index 342888c6654f4..311a7351c955b 100644 --- a/polly/test/DeLICM/reject_unusualstore.ll +++ b/polly/test/DeLICM/reject_unusualstore.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-delicm>' -disable-output < %s | FileCheck %s -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -passes=polly-delicm -disable-output -stats < %s 2>&1 | FileCheck %s --check-prefix=STATS +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb 
'-passes=polly-custom<delicm>' -polly-print-delicm -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<delicm>' -disable-output -stats < %s 2>&1 | FileCheck %s --check-prefix=STATS ; REQUIRES: asserts ; ; void func(double *A) { diff --git a/polly/test/DeLICM/skip_maywrite.ll b/polly/test/DeLICM/skip_maywrite.ll index 0d30791cd94e7..14de2b9d0bf84 100644 --- a/polly/test/DeLICM/skip_maywrite.ll +++ b/polly/test/DeLICM/skip_maywrite.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-delicm>' -pass-remarks-missed=polly-delicm -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<delicm>' -polly-print-delicm -pass-remarks-missed=polly-delicm -disable-output < %s 2>&1 | FileCheck %s ; ; void func(double *A) { ; for (int j = 0; j < 2; j += 1) { /* outer */ diff --git a/polly/test/DeLICM/skip_multiaccess.ll b/polly/test/DeLICM/skip_multiaccess.ll index a7c79f7524630..a213a91343f3d 100644 --- a/polly/test/DeLICM/skip_multiaccess.ll +++ b/polly/test/DeLICM/skip_multiaccess.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-delicm -pass-remarks-missed=polly-delicm -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<delicm>' -pass-remarks-missed=polly-delicm -disable-output < %s 2>&1 | FileCheck %s ; ; llvm.org/PR34485 ; llvm.org/PR34989 diff --git a/polly/test/DeLICM/skip_notinloop.ll b/polly/test/DeLICM/skip_notinloop.ll index 8e265e19aefea..3a2dede210083 100644 --- a/polly/test/DeLICM/skip_notinloop.ll +++ b/polly/test/DeLICM/skip_notinloop.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-delicm>' -pass-remarks-missed=polly-delicm -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<delicm>' -polly-print-delicm -pass-remarks-missed=polly-delicm -disable-output < %s 2>&1 | FileCheck %s ; ; void func(double *A) { ; double phi = 0.0; diff --git a/polly/test/DeLICM/skip_scalaraccess.ll b/polly/test/DeLICM/skip_scalaraccess.ll index 2cf13afe11cdf..a0ed9f76a8ca2 100644 --- a/polly/test/DeLICM/skip_scalaraccess.ll +++ b/polly/test/DeLICM/skip_scalaraccess.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-delicm>' -pass-remarks-missed=polly-delicm -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<delicm>' -polly-print-delicm -pass-remarks-missed=polly-delicm -disable-output < %s 2>&1 | FileCheck %s ; ; void func(double *A) { ; for (int j = 0; j < 2; j += 1) { /* outer */ diff --git a/polly/test/DeadCodeElimination/chained_iterations.ll b/polly/test/DeadCodeElimination/chained_iterations.ll index f3bf07bb40d83..f1e47075e2f74 100644 --- a/polly/test/DeadCodeElimination/chained_iterations.ll +++ b/polly/test/DeadCodeElimination/chained_iterations.ll @@ -1,5 +1,5 @@ -; RUN: opt -S %loadNPMPolly -aa-pipeline=basic-aa -polly-dependences-analysis-type=value-based '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s -; RUN: opt -S %loadNPMPolly -aa-pipeline=basic-aa -polly-dependences-analysis-type=value-based '-passes=polly-dce,print<polly-ast>' -disable-output < %s | FileCheck %s -check-prefix=CHECK-DCE +; RUN: opt -S %loadNPMPolly -aa-pipeline=basic-aa -polly-dependences-analysis-type=value-based '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s +; RUN: opt -S %loadNPMPolly -aa-pipeline=basic-aa -polly-dependences-analysis-type=value-based '-passes=polly-custom<dce;ast>' -polly-print-ast -disable-output < %s | 
FileCheck %s -check-prefix=CHECK-DCE target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" ; ; for(i = 0; i < 200; i++ ) diff --git a/polly/test/DeadCodeElimination/chained_iterations_2.ll b/polly/test/DeadCodeElimination/chained_iterations_2.ll index 52f034f0e56ca..6ecc07c0f7d21 100644 --- a/polly/test/DeadCodeElimination/chained_iterations_2.ll +++ b/polly/test/DeadCodeElimination/chained_iterations_2.ll @@ -1,5 +1,5 @@ -; RUN: opt -S %loadNPMPolly -aa-pipeline=basic-aa -polly-dependences-analysis-type=value-based '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s -; RUN: opt -S %loadNPMPolly -aa-pipeline=basic-aa -polly-dependences-analysis-type=value-based '-passes=polly-dce,print<polly-ast>' -disable-output < %s | FileCheck %s -check-prefix=CHECK-DCE +; RUN: opt -S %loadNPMPolly -aa-pipeline=basic-aa -polly-dependences-analysis-type=value-based '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s +; RUN: opt -S %loadNPMPolly -aa-pipeline=basic-aa -polly-dependences-analysis-type=value-based '-passes=polly-custom<dce;ast>' -polly-print-ast -disable-output < %s | FileCheck %s -check-prefix=CHECK-DCE target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" ; ; for(i = 0; i < 200; i++ ) diff --git a/polly/test/DeadCodeElimination/computeout.ll b/polly/test/DeadCodeElimination/computeout.ll index e54df42ed1db0..b43142be2a5c8 100644 --- a/polly/test/DeadCodeElimination/computeout.ll +++ b/polly/test/DeadCodeElimination/computeout.ll @@ -1,5 +1,5 @@ -; RUN: opt -S %loadNPMPolly "-passes=scop(polly-dce,print<polly-ast>)" < %s | FileCheck %s -; RUN: opt -S %loadNPMPolly -aa-pipeline=basic-aa "-passes=scop(polly-dce,print<polly-ast>)" -polly-dependences-computeout=1 -disable-output < %s | FileCheck %s -check-prefix=TIMEOUT +; RUN: opt -S %loadNPMPolly '-passes=polly-custom<dce;ast>' -polly-print-ast < %s | FileCheck %s +; RUN: opt -S %loadNPMPolly -aa-pipeline=basic-aa '-passes=polly-custom<dce;ast>' -polly-print-ast -polly-dependences-computeout=1 -disable-output < %s | FileCheck %s -check-prefix=TIMEOUT target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" ; for(i = 0; i < 100; i++ ) diff --git a/polly/test/DeadCodeElimination/dead_iteration_elimination.ll b/polly/test/DeadCodeElimination/dead_iteration_elimination.ll index c102f60abb659..85eea91f99207 100644 --- a/polly/test/DeadCodeElimination/dead_iteration_elimination.ll +++ b/polly/test/DeadCodeElimination/dead_iteration_elimination.ll @@ -1,4 +1,4 @@ -; RUN: opt -S %loadNPMPolly "-passes=scop(polly-dce,print<polly-ast>)" -polly-dependences-analysis-type=value-based -polly-dce-precise-steps=2 < %s | FileCheck %s +; RUN: opt -S %loadNPMPolly '-passes=polly-custom<dce;ast>' -polly-print-ast -polly-dependences-analysis-type=value-based -polly-dce-precise-steps=2 < %s | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" ; ; for(i = 0; i < 200; i++ ) diff --git a/polly/test/DeadCodeElimination/non-affine-affine-mix.ll b/polly/test/DeadCodeElimination/non-affine-affine-mix.ll index 36f55476fed23..21b7c5cf9583b 100644 --- 
a/polly/test/DeadCodeElimination/non-affine-affine-mix.ll +++ b/polly/test/DeadCodeElimination/non-affine-affine-mix.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-allow-nonaffine '-passes=polly-dce,print<polly-ast>' -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly -polly-allow-nonaffine '-passes=polly-custom<dce;ast>' -polly-print-ast -disable-output < %s | FileCheck %s ; ; void f(int *A) { ; for (int i = 0; i < 1024; i++) diff --git a/polly/test/DeadCodeElimination/non-affine.ll b/polly/test/DeadCodeElimination/non-affine.ll index ef528b4124c66..86cabe6501393 100644 --- a/polly/test/DeadCodeElimination/non-affine.ll +++ b/polly/test/DeadCodeElimination/non-affine.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-allow-nonaffine '-passes=polly-dce,print<polly-ast>' -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly -polly-allow-nonaffine '-passes=polly-custom<dce;ast>' -polly-print-ast -disable-output < %s | FileCheck %s ; ; CHECK: for (int c0 = 0; c0 <= 1023; c0 += 1) ; diff --git a/polly/test/DeadCodeElimination/null_schedule.ll b/polly/test/DeadCodeElimination/null_schedule.ll index 01d34e95629ba..507d690144e01 100644 --- a/polly/test/DeadCodeElimination/null_schedule.ll +++ b/polly/test/DeadCodeElimination/null_schedule.ll @@ -1,4 +1,4 @@ -; RUN: opt -S %loadNPMPolly -aa-pipeline=basic-aa -polly-dependences-analysis-type=value-based '-passes=polly-dce,print<polly-ast>' -disable-output < %s | FileCheck %s -check-prefix=CHECK-DCE +; RUN: opt -S %loadNPMPolly -aa-pipeline=basic-aa -polly-dependences-analysis-type=value-based '-passes=polly-custom<dce;ast>' -polly-print-ast -disable-output < %s | FileCheck %s -check-prefix=CHECK-DCE target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" ; A[0] = 1; ; diff --git a/polly/test/DependenceInfo/computeout.ll b/polly/test/DependenceInfo/computeout.ll index c2a3456b3dc80..3fdc4008f8474 100644 --- a/polly/test/DependenceInfo/computeout.ll +++ b/polly/test/DependenceInfo/computeout.ll @@ -1,5 +1,5 @@ -; RUN: opt -S %loadNPMPolly '-passes=print<polly-dependences>' -disable-output < %s | FileCheck %s -check-prefix=VALUE -; RUN: opt -S %loadNPMPolly '-passes=print<polly-dependences>' -polly-dependences-computeout=1 -disable-output < %s | FileCheck %s -check-prefix=TIMEOUT +; RUN: opt -S %loadNPMPolly '-passes=polly-custom<deps>' -polly-print-deps -disable-output < %s | FileCheck %s -check-prefix=VALUE +; RUN: opt -S %loadNPMPolly '-passes=polly-custom<deps>' -polly-print-deps -polly-dependences-computeout=1 -disable-output < %s | FileCheck %s -check-prefix=TIMEOUT target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" ; for(i = 0; i < 100; i++ ) diff --git a/polly/test/DependenceInfo/different_schedule_dimensions.ll b/polly/test/DependenceInfo/different_schedule_dimensions.ll index f89791f42f9db..69274f11f567f 100644 --- a/polly/test/DependenceInfo/different_schedule_dimensions.ll +++ b/polly/test/DependenceInfo/different_schedule_dimensions.ll @@ -1,5 +1,4 @@ -; RUN: opt -S %loadNPMPolly '-passes=print<polly-dependences>' \ -; RUN: -disable-output < %s | FileCheck %s +; RUN: opt -S %loadNPMPolly '-passes=polly-custom<deps>' -polly-print-deps -disable-output < %s | FileCheck %s ; CHECK: RAW dependences: ; CHECK: { Stmt_bb9[0] -> Stmt_bb10[0] } diff --git a/polly/test/DependenceInfo/do_pluto_matmult.ll 
b/polly/test/DependenceInfo/do_pluto_matmult.ll index b88cf9bf5475c..2a0027bbc034b 100644 --- a/polly/test/DependenceInfo/do_pluto_matmult.ll +++ b/polly/test/DependenceInfo/do_pluto_matmult.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-dependences>' -polly-dependences-analysis-type=value-based -disable-output < %s | FileCheck %s -check-prefix=VALUE -; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-dependences>' -polly-dependences-analysis-type=memory-based -disable-output < %s | FileCheck %s -check-prefix=MEMORY +; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=polly-custom<deps>' -polly-print-deps -polly-dependences-analysis-type=value-based -disable-output < %s | FileCheck %s -check-prefix=VALUE +; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=polly-custom<deps>' -polly-print-deps -polly-dependences-analysis-type=memory-based -disable-output < %s | FileCheck %s -check-prefix=MEMORY target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" diff --git a/polly/test/DependenceInfo/fine_grain_dep_0.ll b/polly/test/DependenceInfo/fine_grain_dep_0.ll index 5abbf48136891..06a196822c832 100644 --- a/polly/test/DependenceInfo/fine_grain_dep_0.ll +++ b/polly/test/DependenceInfo/fine_grain_dep_0.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-dependences>' -polly-dependences-analysis-type=value-based -polly-dependences-analysis-level=reference-wise -disable-output < %s | FileCheck %s --check-prefix=REF -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-dependences>' -polly-dependences-analysis-type=value-based -polly-dependences-analysis-level=access-wise -disable-output < %s | FileCheck %s --check-prefix=ACC +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<deps>' -polly-print-deps -polly-dependences-analysis-type=value-based -polly-dependences-analysis-level=reference-wise -disable-output < %s | FileCheck %s --check-prefix=REF +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<deps>' -polly-print-deps -polly-dependences-analysis-type=value-based -polly-dependences-analysis-level=access-wise -disable-output < %s | FileCheck %s --check-prefix=ACC ; REF: RAW dependences: ; REF-NEXT: [N] -> { [Stmt_for_body[i0] -> MemRef_b[]] -> [Stmt_for_body[6 + i0] -> MemRef_b[]] : 0 <= i0 <= -13 + N; Stmt_for_body[i0] -> Stmt_for_body[6 + i0] : 0 <= i0 <= -13 + N; Stmt_for_body[i0] -> Stmt_for_body[4 + i0] : 0 <= i0 <= -11 + N; [Stmt_for_body[i0] -> MemRef_a[]] -> [Stmt_for_body[4 + i0] -> MemRef_a[]] : 0 <= i0 <= -11 + N } diff --git a/polly/test/DependenceInfo/generate_may_write_dependence_info.ll b/polly/test/DependenceInfo/generate_may_write_dependence_info.ll index 677323495476b..9875257694331 100644 --- a/polly/test/DependenceInfo/generate_may_write_dependence_info.ll +++ b/polly/test/DependenceInfo/generate_may_write_dependence_info.ll @@ -1,4 +1,4 @@ -; RUN: opt -S %loadNPMPolly '-passes=print<polly-dependences>' -disable-output < %s | FileCheck %s -check-prefix=VALUE +; RUN: opt -S %loadNPMPolly '-passes=polly-custom<deps>' -polly-print-deps -disable-output < %s | FileCheck %s -check-prefix=VALUE target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" ; for (int i = 0; i < N; i++) { diff --git a/polly/test/DependenceInfo/infeasible_context.ll b/polly/test/DependenceInfo/infeasible_context.ll index 
cde3102dc3dc9..c9473e614e362 100644 --- a/polly/test/DependenceInfo/infeasible_context.ll +++ b/polly/test/DependenceInfo/infeasible_context.ll @@ -1,7 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 \ -; RUN: | FileCheck %s -check-prefix=FUNC-SCOP -; RUN: opt %loadNPMPolly '-passes=print<polly-detect>,scop(print<polly-dependences>)' -disable-output < %s 2>&1 \ -; RUN: | FileCheck %s -check-prefix=FUNC-DEPS +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-detect -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s -check-prefix=FUNC-SCOP +; RUN: opt %loadNPMPolly '-passes=polly-custom<deps>' -polly-print-detect -polly-print-deps -disable-output < %s 2>&1 | FileCheck %s -check-prefix=FUNC-DEPS ; ; FUNC-SCOP-NOT: Statement ; FUNC-DEPS-NOT: RAW dependences diff --git a/polly/test/DependenceInfo/may_writes_do_not_block_must_writes_for_war.ll b/polly/test/DependenceInfo/may_writes_do_not_block_must_writes_for_war.ll index 392a34769cddb..92e6cb89b2a27 100644 --- a/polly/test/DependenceInfo/may_writes_do_not_block_must_writes_for_war.ll +++ b/polly/test/DependenceInfo/may_writes_do_not_block_must_writes_for_war.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-dependences>' -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<deps>' -polly-print-deps -disable-output < %s | FileCheck %s ; ; Verify that the presence of a may-write (S1) between a read (S0) and a ; must-write (S2) does not block the generation of RAW dependences. This makes diff --git a/polly/test/DependenceInfo/nonaffine-condition-buildMemoryAccess.ll b/polly/test/DependenceInfo/nonaffine-condition-buildMemoryAccess.ll index ae5fd3beed399..b14759725dde0 100644 --- a/polly/test/DependenceInfo/nonaffine-condition-buildMemoryAccess.ll +++ b/polly/test/DependenceInfo/nonaffine-condition-buildMemoryAccess.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-allow-nonaffine-loops -polly-allow-nonaffine -debug-only=polly-dependence < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-allow-nonaffine-loops -polly-allow-nonaffine -debug-only=polly-dependence < %s 2>&1 | FileCheck %s ; REQUIRES: asserts ; CHECK: MayWriteAccess := [Reduction Type: NONE] [Scalar: 0] diff --git a/polly/test/DependenceInfo/reduction_complex_location.ll b/polly/test/DependenceInfo/reduction_complex_location.ll index 7722ee974c3fa..45789088e57e4 100644 --- a/polly/test/DependenceInfo/reduction_complex_location.ll +++ b/polly/test/DependenceInfo/reduction_complex_location.ll @@ -1,6 +1,6 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-dependences>' -disable-output < %s | FileCheck %s -; RUN: opt %loadNPMPolly '-passes=print<polly-dependences>' -polly-dependences-analysis-level=reference-wise -disable-output < %s | FileCheck %s -; RUN: opt %loadNPMPolly '-passes=print<polly-dependences>' -polly-dependences-analysis-level=access-wise -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<deps>' -polly-print-deps -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<deps>' -polly-print-deps -polly-dependences-analysis-level=reference-wise -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<deps>' -polly-print-deps -polly-dependences-analysis-level=access-wise -disable-output < %s | FileCheck %s ; ; CHECK: RAW dependences: ; CHECK-NEXT: { } diff --git 
a/polly/test/DependenceInfo/reduction_dependences_equal_non_reduction_dependences.ll b/polly/test/DependenceInfo/reduction_dependences_equal_non_reduction_dependences.ll index 840d1f32dca39..7923975118bb9 100644 --- a/polly/test/DependenceInfo/reduction_dependences_equal_non_reduction_dependences.ll +++ b/polly/test/DependenceInfo/reduction_dependences_equal_non_reduction_dependences.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-stmt-granularity=bb '-passes=print<polly-dependences>' -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-stmt-granularity=bb '-passes=polly-custom<deps>' -polly-print-deps -disable-output < %s | FileCheck %s ; ; This loopnest contains a reduction which imposes the same dependences as the ; accesses to the array A. We need to ensure we keep the dependences of A. diff --git a/polly/test/DependenceInfo/reduction_dependences_not_null.ll b/polly/test/DependenceInfo/reduction_dependences_not_null.ll index 56d84a9aec6d6..fdcd5f311800d 100644 --- a/polly/test/DependenceInfo/reduction_dependences_not_null.ll +++ b/polly/test/DependenceInfo/reduction_dependences_not_null.ll @@ -1,7 +1,7 @@ ; Test that the reduction dependences are always initialised, even in a case ; where we have no reduction. If this object is NULL, then isl operations on ; it will fail. -; RUN: opt -S %loadNPMPolly '-passes=print<polly-dependences>' -disable-output < %s | FileCheck %s -check-prefix=VALUE +; RUN: opt -S %loadNPMPolly '-passes=polly-custom<deps>' -polly-print-deps -disable-output < %s | FileCheck %s -check-prefix=VALUE target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" ; for(i = 0; i < 100; i++ ) diff --git a/polly/test/DependenceInfo/reduction_indirect_access.ll b/polly/test/DependenceInfo/reduction_indirect_access.ll index 3b4bd9ef04b5a..13675ada39b0e 100644 --- a/polly/test/DependenceInfo/reduction_indirect_access.ll +++ b/polly/test/DependenceInfo/reduction_indirect_access.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-dependences>' -polly-allow-nonaffine -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=polly-custom<deps>' -polly-print-deps -polly-allow-nonaffine -disable-output < %s | FileCheck %s ; ; CHECK: Reduction dependences: ; CHECK: [N] -> { Stmt_for_body[i0] -> Stmt_for_body[1 + i0] : 0 <= i0 <= -2 + N } diff --git a/polly/test/DependenceInfo/reduction_mixed_reduction_and_non_reduction_dependences.ll b/polly/test/DependenceInfo/reduction_mixed_reduction_and_non_reduction_dependences.ll index 76c7fc64ae89c..e6ce425719ca9 100644 --- a/polly/test/DependenceInfo/reduction_mixed_reduction_and_non_reduction_dependences.ll +++ b/polly/test/DependenceInfo/reduction_mixed_reduction_and_non_reduction_dependences.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-dependences>' -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<deps>' -polly-print-deps -disable-output < %s | FileCheck %s ; ; CHECK: RAW dependences: ; CHECK-NEXT: { Stmt_for_body3[i0, i1] -> Stmt_for_body3[i0 + i1, o1] : i0 >= 0 and 0 <= i1 <= 1023 - i0 and i1 <= 1 and 0 < o1 <= 511 } diff --git a/polly/test/DependenceInfo/reduction_multiple_loops_array_sum.ll b/polly/test/DependenceInfo/reduction_multiple_loops_array_sum.ll index 02b814a0d7c04..820371937a582 100644 --- 
a/polly/test/DependenceInfo/reduction_multiple_loops_array_sum.ll +++ b/polly/test/DependenceInfo/reduction_multiple_loops_array_sum.ll @@ -1,6 +1,6 @@ -; RUN: opt -aa-pipeline=basic-aa %loadNPMPolly '-passes=print<polly-dependences>' -disable-output < %s | FileCheck %s -; RUN: opt -aa-pipeline=basic-aa %loadNPMPolly '-passes=print<polly-dependences>' -polly-dependences-analysis-level=reference-wise -disable-output < %s | FileCheck %s -; RUN: opt -aa-pipeline=basic-aa %loadNPMPolly '-passes=print<polly-dependences>' -polly-dependences-analysis-level=access-wise -disable-output < %s | FileCheck %s +; RUN: opt -aa-pipeline=basic-aa %loadNPMPolly '-passes=polly-custom<deps>' -polly-print-deps -disable-output < %s | FileCheck %s +; RUN: opt -aa-pipeline=basic-aa %loadNPMPolly '-passes=polly-custom<deps>' -polly-print-deps -polly-dependences-analysis-level=reference-wise -disable-output < %s | FileCheck %s +; RUN: opt -aa-pipeline=basic-aa %loadNPMPolly '-passes=polly-custom<deps>' -polly-print-deps -polly-dependences-analysis-level=access-wise -disable-output < %s | FileCheck %s ; ; Verify that only the inner reduction like accesses cause reduction dependences ; diff --git a/polly/test/DependenceInfo/reduction_multiple_loops_array_sum_2.ll b/polly/test/DependenceInfo/reduction_multiple_loops_array_sum_2.ll index 91bd35deebd06..9792f791c6989 100644 --- a/polly/test/DependenceInfo/reduction_multiple_loops_array_sum_2.ll +++ b/polly/test/DependenceInfo/reduction_multiple_loops_array_sum_2.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-dependences>' -aa-pipeline=basic-aa -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<deps>' -polly-print-deps -aa-pipeline=basic-aa -disable-output < %s | FileCheck %s ; ; CHECK: RAW dependences: ; CHECK-NEXT: { } diff --git a/polly/test/DependenceInfo/reduction_multiple_loops_array_sum_3.ll b/polly/test/DependenceInfo/reduction_multiple_loops_array_sum_3.ll index 040d513782392..9bde285c64516 100644 --- a/polly/test/DependenceInfo/reduction_multiple_loops_array_sum_3.ll +++ b/polly/test/DependenceInfo/reduction_multiple_loops_array_sum_3.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-dependences>' -aa-pipeline=basic-aa -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<deps>' -polly-print-deps -aa-pipeline=basic-aa -disable-output < %s | FileCheck %s ; ; CHECK: Reduction dependences: ; CHECK-NEXT: { Stmt_for_inc[i0, i1] -> Stmt_for_inc[i0, 1 + i1] : 0 <= i0 <= 99 and 0 <= i1 <= 98 } diff --git a/polly/test/DependenceInfo/reduction_multiple_reductions.ll b/polly/test/DependenceInfo/reduction_multiple_reductions.ll index 527a8cfc3556e..ac3adb9065462 100644 --- a/polly/test/DependenceInfo/reduction_multiple_reductions.ll +++ b/polly/test/DependenceInfo/reduction_multiple_reductions.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-dependences>' -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=polly-custom<deps>' -polly-print-deps -disable-output < %s | FileCheck %s ; ; Verify we do not have dependences between the if and the else clause ; diff --git a/polly/test/DependenceInfo/reduction_multiple_reductions_2.ll b/polly/test/DependenceInfo/reduction_multiple_reductions_2.ll index fb5fd96a2e426..16ca85bff9502 100644 --- a/polly/test/DependenceInfo/reduction_multiple_reductions_2.ll +++ b/polly/test/DependenceInfo/reduction_multiple_reductions_2.ll @@ -1,4 +1,4 @@ -; RUN: opt 
%loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-dependences>' -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=polly-custom<deps>' -polly-print-deps -disable-output < %s | FileCheck %s ; ; ; These are the important RAW dependences, as they need to originate/end in only one iteration: diff --git a/polly/test/DependenceInfo/reduction_only_reduction_like_access.ll b/polly/test/DependenceInfo/reduction_only_reduction_like_access.ll index 3ec3920268b49..de506a39485cc 100644 --- a/polly/test/DependenceInfo/reduction_only_reduction_like_access.ll +++ b/polly/test/DependenceInfo/reduction_only_reduction_like_access.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-dependences>' -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<deps>' -polly-print-deps -disable-output < %s | FileCheck %s ; ; FIXME: Change the comment once we allow different pointers ; The statement is "almost" reduction like but should not yield any reduction dependences diff --git a/polly/test/DependenceInfo/reduction_partially_escaping_intermediate_in_other_stmt.ll b/polly/test/DependenceInfo/reduction_partially_escaping_intermediate_in_other_stmt.ll index 23bd8ef25bd7a..fbf1409a1ba30 100644 --- a/polly/test/DependenceInfo/reduction_partially_escaping_intermediate_in_other_stmt.ll +++ b/polly/test/DependenceInfo/reduction_partially_escaping_intermediate_in_other_stmt.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-dependences>' -aa-pipeline=basic-aa -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<deps>' -polly-print-deps -aa-pipeline=basic-aa -disable-output < %s | FileCheck %s ; ; CHECK: Reduction dependences: ; CHECK-NEXT: [N] -> { Stmt_for_body3[i0, i1] -> Stmt_for_body3[i0, 1 + i1] : 0 <= i0 <= 1023 and i1 >= 0 and 1024 - N + i0 <= i1 <= 1022 } diff --git a/polly/test/DependenceInfo/reduction_privatization_deps.ll b/polly/test/DependenceInfo/reduction_privatization_deps.ll index 0e0f71737ffd3..0d66f885cd42d 100644 --- a/polly/test/DependenceInfo/reduction_privatization_deps.ll +++ b/polly/test/DependenceInfo/reduction_privatization_deps.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-dependences>' -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<deps>' -polly-print-deps -disable-output < %s | FileCheck %s ; ; CHECK: RAW dependences: ; CHECK-NEXT: { Stmt_S1[i0, i1] -> Stmt_S2[-1 + i0 + i1] : 0 <= i0 <= 1023 and i1 >= 0 and -i0 < i1 <= 1024 - i0 and i1 <= 1023; Stmt_S0[i0] -> Stmt_S1[o0, i0 - o0] : i0 <= 1023 and 0 <= o0 <= i0 } diff --git a/polly/test/DependenceInfo/reduction_privatization_deps_2.ll b/polly/test/DependenceInfo/reduction_privatization_deps_2.ll index cafa319e2cc7b..81235d6cf02e4 100644 --- a/polly/test/DependenceInfo/reduction_privatization_deps_2.ll +++ b/polly/test/DependenceInfo/reduction_privatization_deps_2.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-dependences>' -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<deps>' -polly-print-deps -disable-output < %s | FileCheck %s ; ; We have privatization dependences from a textually later statement to a ; textually earlier one, but the dependences still go forward in time. 
diff --git a/polly/test/DependenceInfo/reduction_privatization_deps_3.ll b/polly/test/DependenceInfo/reduction_privatization_deps_3.ll index d86da92fbcab8..6b48ab5afd155 100644 --- a/polly/test/DependenceInfo/reduction_privatization_deps_3.ll +++ b/polly/test/DependenceInfo/reduction_privatization_deps_3.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-dependences>' -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<deps>' -polly-print-deps -disable-output < %s | FileCheck %s ; ; CHECK: RAW dependences: ; CHECK-NEXT: { Stmt_S1[i0] -> Stmt_S3[2 + i0] : 0 <= i0 <= 96; Stmt_S2[i0, i1] -> Stmt_S3[o0] : i1 <= 1 - i0 and -i1 < o0 <= 1 and o0 <= 1 + i0 - i1; Stmt_S3[i0] -> Stmt_S2[o0, 1 - i0] : 0 <= i0 <= 1 and i0 < o0 <= 98 } diff --git a/polly/test/DependenceInfo/reduction_privatization_deps_4.ll b/polly/test/DependenceInfo/reduction_privatization_deps_4.ll index d84c04fc309b0..1fef004c4c47a 100644 --- a/polly/test/DependenceInfo/reduction_privatization_deps_4.ll +++ b/polly/test/DependenceInfo/reduction_privatization_deps_4.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-dependences>' -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<deps>' -polly-print-deps -disable-output < %s | FileCheck %s ; ; CHECK: RAW dependences: ; CHECK-NEXT: { Stmt_S1[i0] -> Stmt_S2[i0, i0] : 0 <= i0 <= 98; Stmt_S2[i0, i0] -> Stmt_S3[i0] : 0 <= i0 <= 98; Stmt_S3[i0] -> Stmt_S2[o0, i0] : i0 >= 0 and i0 < o0 <= 98; Stmt_S2[i0, i1] -> Stmt_S1[i1] : i0 >= 0 and i0 < i1 <= 98 } diff --git a/polly/test/DependenceInfo/reduction_privatization_deps_5.ll b/polly/test/DependenceInfo/reduction_privatization_deps_5.ll index 592c7238c3c59..f40a7c07a3ba4 100644 --- a/polly/test/DependenceInfo/reduction_privatization_deps_5.ll +++ b/polly/test/DependenceInfo/reduction_privatization_deps_5.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-dependences>' -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<deps>' -polly-print-deps -disable-output < %s | FileCheck %s ; ; CHECK: RAW dependences: ; CHECK-NEXT: { Stmt_S1[i0, 0] -> Stmt_S2[i0, 0] : 0 <= i0 <= 98; Stmt_S2[i0, 0] -> Stmt_S1[1 + i0, 0] : 0 <= i0 <= 97 } diff --git a/polly/test/DependenceInfo/reduction_sequence.ll b/polly/test/DependenceInfo/reduction_sequence.ll index 7ce9d37d395bb..d881a99adc226 100644 --- a/polly/test/DependenceInfo/reduction_sequence.ll +++ b/polly/test/DependenceInfo/reduction_sequence.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-dependences>' -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<deps>' -polly-print-deps -disable-output < %s | FileCheck %s ; void manyreductions(long *A) { ; for (long i = 0; i < 1024; i++) diff --git a/polly/test/DependenceInfo/reduction_simple_iv.ll b/polly/test/DependenceInfo/reduction_simple_iv.ll index d13d14ecaad92..b811d1593ab02 100644 --- a/polly/test/DependenceInfo/reduction_simple_iv.ll +++ b/polly/test/DependenceInfo/reduction_simple_iv.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-dependences>' -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<deps>' -polly-print-deps -disable-output < %s | FileCheck %s ; ; CHECK: RAW dependences: ; CHECK-NEXT: { } diff --git a/polly/test/DependenceInfo/reduction_simple_iv_debug_wrapped_dependences.ll b/polly/test/DependenceInfo/reduction_simple_iv_debug_wrapped_dependences.ll index 4c97fbb1aacb7..0a5d36f9b9f79 100644 --- 
a/polly/test/DependenceInfo/reduction_simple_iv_debug_wrapped_dependences.ll +++ b/polly/test/DependenceInfo/reduction_simple_iv_debug_wrapped_dependences.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-dependences>' -debug-only=polly-dependence -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<deps>' -polly-print-deps -debug-only=polly-dependence -disable-output < %s 2>&1 | FileCheck %s ; ; REQUIRES: asserts ; diff --git a/polly/test/DependenceInfo/reduction_simple_privatization_deps_2.ll b/polly/test/DependenceInfo/reduction_simple_privatization_deps_2.ll index 804005cf72a72..90f9d76ef57b2 100644 --- a/polly/test/DependenceInfo/reduction_simple_privatization_deps_2.ll +++ b/polly/test/DependenceInfo/reduction_simple_privatization_deps_2.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-dependences>' -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<deps>' -polly-print-deps -disable-output < %s | FileCheck %s ; ; CHECK: RAW dependences: ; CHECK-NEXT: { Stmt_S1[i0, i1] -> Stmt_S2[i0] : 0 <= i0 <= 99 and 0 <= i1 <= 99; Stmt_S0[i0] -> Stmt_S1[i0, o1] : 0 <= i0 <= 99 and 0 <= o1 <= 99; Stmt_S2[i0] -> Stmt_S0[1 + i0] : 0 <= i0 <= 98 } diff --git a/polly/test/DependenceInfo/reduction_simple_privatization_deps_w_parameter.ll b/polly/test/DependenceInfo/reduction_simple_privatization_deps_w_parameter.ll index 9596827b4cbbf..2b194bbb51988 100644 --- a/polly/test/DependenceInfo/reduction_simple_privatization_deps_w_parameter.ll +++ b/polly/test/DependenceInfo/reduction_simple_privatization_deps_w_parameter.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-dependences>' -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<deps>' -polly-print-deps -disable-output < %s | FileCheck %s ; ; CHECK: RAW dependences: ; CHECK-NEXT: [N] -> { Stmt_S1[i0] -> Stmt_S2[] : N >= 11 and 0 <= i0 <= 1023; Stmt_S0[] -> Stmt_S1[o0] : N >= 11 and 0 <= o0 <= 1023 } diff --git a/polly/test/DependenceInfo/reduction_two_reductions_different_rloops.ll b/polly/test/DependenceInfo/reduction_two_reductions_different_rloops.ll index d67683d11a4b3..70d5bdf64059d 100644 --- a/polly/test/DependenceInfo/reduction_two_reductions_different_rloops.ll +++ b/polly/test/DependenceInfo/reduction_two_reductions_different_rloops.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-stmt-granularity=bb '-passes=print<polly-dependences>' -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-stmt-granularity=bb '-passes=polly-custom<deps>' -polly-print-deps -disable-output < %s | FileCheck %s ; ; CHECK: RAW dependences: ; CHECK-NEXT: { } diff --git a/polly/test/DependenceInfo/sequential_loops.ll b/polly/test/DependenceInfo/sequential_loops.ll index 6ae7200303321..023c2d4f29f37 100644 --- a/polly/test/DependenceInfo/sequential_loops.ll +++ b/polly/test/DependenceInfo/sequential_loops.ll @@ -1,6 +1,6 @@ -; RUN: opt -S %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-dependences>' -polly-dependences-analysis-type=value-based -disable-output < %s | FileCheck %s -check-prefix=VALUE -; RUN: opt -S %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-dependences>' -polly-dependences-analysis-type=memory-based -disable-output < %s | FileCheck %s -check-prefix=MEMORY -; RUN: opt -S %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-dependences>' -polly-dependences-analysis-type=value-based 
-polly-dependences-analysis-level=access-wise -disable-output < %s | FileCheck %s -check-prefix=VALUE_ACCESS +; RUN: opt -S %loadNPMPolly -aa-pipeline=basic-aa '-passes=polly-custom<deps>' -polly-print-deps -polly-dependences-analysis-type=value-based -disable-output < %s | FileCheck %s -check-prefix=VALUE +; RUN: opt -S %loadNPMPolly -aa-pipeline=basic-aa '-passes=polly-custom<deps>' -polly-print-deps -polly-dependences-analysis-type=memory-based -disable-output < %s | FileCheck %s -check-prefix=MEMORY +; RUN: opt -S %loadNPMPolly -aa-pipeline=basic-aa '-passes=polly-custom<deps>' -polly-print-deps -polly-dependences-analysis-type=value-based -polly-dependences-analysis-level=access-wise -disable-output < %s | FileCheck %s -check-prefix=VALUE_ACCESS ; VALUE: RAW dependences: ; VALUE-NEXT: { } diff --git a/polly/test/FlattenSchedule/gemm.ll b/polly/test/FlattenSchedule/gemm.ll index b20293bd315a3..11dc40599bb0e 100644 --- a/polly/test/FlattenSchedule/gemm.ll +++ b/polly/test/FlattenSchedule/gemm.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadPolly -polly-print-flatten-schedule -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<flatten>' -polly-print-flatten-schedule -disable-output < %s | FileCheck %s ; ; dgemm kernel ; C := alpha*A*B + beta*C diff --git a/polly/test/ForwardOpTree/atax.ll b/polly/test/ForwardOpTree/atax.ll index 6c81fb12e8cdc..3dfe3fa0aa8e6 100644 --- a/polly/test/ForwardOpTree/atax.ll +++ b/polly/test/ForwardOpTree/atax.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-optree-normalize-phi=true '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-optree-normalize-phi=true '-passes=polly-custom<optree>' -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" diff --git a/polly/test/ForwardOpTree/changed-kind.ll b/polly/test/ForwardOpTree/changed-kind.ll index b9081f3734044..ec8869da3ae57 100644 --- a/polly/test/ForwardOpTree/changed-kind.ll +++ b/polly/test/ForwardOpTree/changed-kind.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly '-passes=polly-custom<optree>' -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines ; In the code below, %0 is known to be equal to the content of @c (constant 0). ; Thus, in order to save a scalar dependency, forward-optree replaces diff --git a/polly/test/ForwardOpTree/forward_from_region.ll b/polly/test/ForwardOpTree/forward_from_region.ll index 767a580dccf95..de47bc4df0076 100644 --- a/polly/test/ForwardOpTree/forward_from_region.ll +++ b/polly/test/ForwardOpTree/forward_from_region.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly '-passes=polly-custom<optree>' -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines ; ; Move instructions from region statements. 
; diff --git a/polly/test/ForwardOpTree/forward_hoisted.ll b/polly/test/ForwardOpTree/forward_hoisted.ll index 5d0b0a884b761..39f99545b01ac 100644 --- a/polly/test/ForwardOpTree/forward_hoisted.ll +++ b/polly/test/ForwardOpTree/forward_hoisted.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-invariant-load-hoisting=true '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly -polly-invariant-load-hoisting=true '-passes=polly-custom<optree>' -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines ; ; Move %val to %bodyB, so %bodyA can be removed (by -polly-simplify). ; This involves making the load-hoisted %val1 available in %bodyB. diff --git a/polly/test/ForwardOpTree/forward_instruction.ll b/polly/test/ForwardOpTree/forward_instruction.ll index 50a9b07b8a05b..a9f5d3d85ac0a 100644 --- a/polly/test/ForwardOpTree/forward_instruction.ll +++ b/polly/test/ForwardOpTree/forward_instruction.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly '-passes=polly-custom<optree>' -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines ; ; Move %val to %bodyB, so %bodyA can be removed (by -polly-simplify) ; diff --git a/polly/test/ForwardOpTree/forward_into_region.ll b/polly/test/ForwardOpTree/forward_into_region.ll index ef71b11dc5716..2279a89cfaeb7 100644 --- a/polly/test/ForwardOpTree/forward_into_region.ll +++ b/polly/test/ForwardOpTree/forward_into_region.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly '-passes=polly-custom<optree>' -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines ; ; Move instructions to region statements. ; diff --git a/polly/test/ForwardOpTree/forward_into_region_redundant_use.ll b/polly/test/ForwardOpTree/forward_into_region_redundant_use.ll index 1c585446ae63a..f7901e1ccf8fd 100644 --- a/polly/test/ForwardOpTree/forward_into_region_redundant_use.ll +++ b/polly/test/ForwardOpTree/forward_into_region_redundant_use.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-invariant-load-hoisting=true '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly -polly-invariant-load-hoisting=true '-passes=polly-custom<optree>' -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines ; define void @foo(ptr %A, i32 %p, ptr %B) { diff --git a/polly/test/ForwardOpTree/forward_load.ll b/polly/test/ForwardOpTree/forward_load.ll index 0bba41833fb19..860e603ef47d2 100644 --- a/polly/test/ForwardOpTree/forward_load.ll +++ b/polly/test/ForwardOpTree/forward_load.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly "-passes=scop(print<polly-optree>)" -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly '-passes=polly-custom<optree>' -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines ; ; Rematerialize a load.
; diff --git a/polly/test/ForwardOpTree/forward_load_differentarray.ll b/polly/test/ForwardOpTree/forward_load_differentarray.ll index 364bf3ef37133..24b008cfae384 100644 --- a/polly/test/ForwardOpTree/forward_load_differentarray.ll +++ b/polly/test/ForwardOpTree/forward_load_differentarray.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<optree>' -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines ; ; To forward %val, B[j] cannot be reused in bodyC because it is overwritten ; between. Verify that instead the alternative C[j] is used. diff --git a/polly/test/ForwardOpTree/forward_load_double_write.ll b/polly/test/ForwardOpTree/forward_load_double_write.ll index 4c30c7f8da56f..522e803b2d0a0 100644 --- a/polly/test/ForwardOpTree/forward_load_double_write.ll +++ b/polly/test/ForwardOpTree/forward_load_double_write.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<optree>' -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines ; ; Rematerialize a load even in case two writes of identical values are in ; one scop statement. diff --git a/polly/test/ForwardOpTree/forward_load_fromloop.ll b/polly/test/ForwardOpTree/forward_load_fromloop.ll index 1494e872a8942..5c64221d882b9 100644 --- a/polly/test/ForwardOpTree/forward_load_fromloop.ll +++ b/polly/test/ForwardOpTree/forward_load_fromloop.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<optree>' -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines ; ; Forward the LoadInst %val into %bodyB. %val is executed multiple times, ; we must get the last loaded values. diff --git a/polly/test/ForwardOpTree/forward_load_indirect.ll b/polly/test/ForwardOpTree/forward_load_indirect.ll index 51ce94d267277..5b06c357f02ba 100644 --- a/polly/test/ForwardOpTree/forward_load_indirect.ll +++ b/polly/test/ForwardOpTree/forward_load_indirect.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly '-passes=polly-custom<optree>' -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines ; ; Forward an operand tree consisting of a speculatable instruction (%add) ; and a load (%val). diff --git a/polly/test/ForwardOpTree/forward_load_memset_after.ll b/polly/test/ForwardOpTree/forward_load_memset_after.ll index bd2cad411eccf..b889783d531e6 100644 --- a/polly/test/ForwardOpTree/forward_load_memset_after.ll +++ b/polly/test/ForwardOpTree/forward_load_memset_after.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly '-passes=polly-custom<optree>' -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines ; ; Rematerialize a load in the presence of a non-store WRITE access.
; diff --git a/polly/test/ForwardOpTree/forward_load_memset_before.ll b/polly/test/ForwardOpTree/forward_load_memset_before.ll index 3e89dea37775c..c8f0e0e5814fb 100644 --- a/polly/test/ForwardOpTree/forward_load_memset_before.ll +++ b/polly/test/ForwardOpTree/forward_load_memset_before.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly '-passes=polly-custom<optree>' -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines ; ; Rematerialize a load in the presence of a non-store WRITE access. ; diff --git a/polly/test/ForwardOpTree/forward_load_tripleuse.ll b/polly/test/ForwardOpTree/forward_load_tripleuse.ll index 7526a8313945d..df57bf70cc53b 100644 --- a/polly/test/ForwardOpTree/forward_load_tripleuse.ll +++ b/polly/test/ForwardOpTree/forward_load_tripleuse.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-optree>,polly-codegen' -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<optree>' -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines ; ; %val1 is used three times: Twice by its own operand tree of %val2 and once ; more by the store in %bodyB. diff --git a/polly/test/ForwardOpTree/forward_load_unrelatedunusual.ll b/polly/test/ForwardOpTree/forward_load_unrelatedunusual.ll index daf289d8b0da1..ba84a1a16748f 100644 --- a/polly/test/ForwardOpTree/forward_load_unrelatedunusual.ll +++ b/polly/test/ForwardOpTree/forward_load_unrelatedunusual.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly '-passes=polly-custom<optree>' -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines ; ; Rematerialize a load. ; The non-analyzable store to C[0] is unrelated and can be ignored. diff --git a/polly/test/ForwardOpTree/forward_phi_load.ll b/polly/test/ForwardOpTree/forward_phi_load.ll index 1457aa96e2de7..c763af4269c89 100644 --- a/polly/test/ForwardOpTree/forward_phi_load.ll +++ b/polly/test/ForwardOpTree/forward_phi_load.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-optree-normalize-phi=true '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly -polly-optree-normalize-phi=true '-passes=polly-custom<optree>' -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines ; ; Rematerialize a load. 
; diff --git a/polly/test/ForwardOpTree/forward_readonly.ll b/polly/test/ForwardOpTree/forward_readonly.ll index 646121c4efeff..69c7f10be4e56 100644 --- a/polly/test/ForwardOpTree/forward_readonly.ll +++ b/polly/test/ForwardOpTree/forward_readonly.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly -polly-analyze-read-only-scalars=true '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines -check-prefixes=STATS,MODEL -; RUN: opt %loadNPMPolly -polly-analyze-read-only-scalars=false '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines -check-prefixes=STATS,NOMODEL +; RUN: opt %loadNPMPolly -polly-analyze-read-only-scalars=true '-passes=polly-custom<optree>' -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines -check-prefixes=STATS,MODEL +; RUN: opt %loadNPMPolly -polly-analyze-read-only-scalars=false '-passes=polly-custom<optree>' -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines -check-prefixes=STATS,NOMODEL ; ; Move %val to %bodyB, so %bodyA can be removed (by -polly-simplify) ; diff --git a/polly/test/ForwardOpTree/forward_reusue.ll b/polly/test/ForwardOpTree/forward_reusue.ll index d8ad31782ecb9..e39e7b51dc689 100644 --- a/polly/test/ForwardOpTree/forward_reusue.ll +++ b/polly/test/ForwardOpTree/forward_reusue.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly '-passes=polly-custom<optree>' -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines ; ; Move operand tree without duplicating values used multiple times. ; diff --git a/polly/test/ForwardOpTree/forward_store.ll b/polly/test/ForwardOpTree/forward_store.ll index 17cb8b395eb30..8cd6e2446ff93 100644 --- a/polly/test/ForwardOpTree/forward_store.ll +++ b/polly/test/ForwardOpTree/forward_store.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly '-passes=polly-custom<optree>' -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines ; ; Rematerialize a load. ; diff --git a/polly/test/ForwardOpTree/forward_synthesizable_definloop.ll b/polly/test/ForwardOpTree/forward_synthesizable_definloop.ll index 57b68180bb121..f70965f3c5d1b 100644 --- a/polly/test/ForwardOpTree/forward_synthesizable_definloop.ll +++ b/polly/test/ForwardOpTree/forward_synthesizable_definloop.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly '-passes=polly-custom<optree>' -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines ; ; Copy %val to bodyB, assuming the exit value of %i. ; diff --git a/polly/test/ForwardOpTree/forward_synthesizable_indvar.ll b/polly/test/ForwardOpTree/forward_synthesizable_indvar.ll index b4828e4c2c423..c95c45856ac36 100644 --- a/polly/test/ForwardOpTree/forward_synthesizable_indvar.ll +++ b/polly/test/ForwardOpTree/forward_synthesizable_indvar.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly '-passes=polly-custom<optree>' -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines ; ; Test support for (synthesizable) induction variables.
; diff --git a/polly/test/ForwardOpTree/forward_synthesizable_useinloop.ll b/polly/test/ForwardOpTree/forward_synthesizable_useinloop.ll index 3228bb60d2ca2..14fb8d8dcc0ab 100644 --- a/polly/test/ForwardOpTree/forward_synthesizable_useinloop.ll +++ b/polly/test/ForwardOpTree/forward_synthesizable_useinloop.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly '-passes=polly-custom<optree>' -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines ; ; Synthesizable values defined outside of a loop can be used ; inside the loop. diff --git a/polly/test/ForwardOpTree/forward_transitive.ll b/polly/test/ForwardOpTree/forward_transitive.ll index aacf1358648f5..7b55d9e0cf9b2 100644 --- a/polly/test/ForwardOpTree/forward_transitive.ll +++ b/polly/test/ForwardOpTree/forward_transitive.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly '-passes=polly-custom<optree>' -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines ; ; Move %v and %val to %bodyB, so %bodyA can be removed (by -polly-simplify) ; diff --git a/polly/test/ForwardOpTree/jacobi-1d.ll b/polly/test/ForwardOpTree/jacobi-1d.ll index cb035bb749c7b..3bc504d88c0eb 100644 --- a/polly/test/ForwardOpTree/jacobi-1d.ll +++ b/polly/test/ForwardOpTree/jacobi-1d.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-optree-normalize-phi=true '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-optree-normalize-phi=true '-passes=polly-custom<optree>' -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" diff --git a/polly/test/ForwardOpTree/noforward_from_region.ll b/polly/test/ForwardOpTree/noforward_from_region.ll index bd5864c25f543..0729241c3f7d9 100644 --- a/polly/test/ForwardOpTree/noforward_from_region.ll +++ b/polly/test/ForwardOpTree/noforward_from_region.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly '-passes=polly-custom<optree>' -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines ; ; Ensure we do not move instructions from region statements in case the ; instruction to move loads from an array which is also written to from diff --git a/polly/test/ForwardOpTree/noforward_load_conditional.ll b/polly/test/ForwardOpTree/noforward_load_conditional.ll index 5474e740de800..d33ef99ae6bed 100644 --- a/polly/test/ForwardOpTree/noforward_load_conditional.ll +++ b/polly/test/ForwardOpTree/noforward_load_conditional.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly '-passes=polly-custom<optree>' -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines ; ; B[j] is overwritten by at least one statement between the ; definition of %val and its use. Hence, it cannot be forwarded. 
diff --git a/polly/test/ForwardOpTree/noforward_load_writebetween.ll b/polly/test/ForwardOpTree/noforward_load_writebetween.ll index 697c940be4fdd..e7deb381de87a 100644 --- a/polly/test/ForwardOpTree/noforward_load_writebetween.ll +++ b/polly/test/ForwardOpTree/noforward_load_writebetween.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly '-passes=polly-custom<optree>' -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines ; ; Cannot rematerialize %val from B[0] at bodyC because B[0] has been ; overwritten in bodyB. diff --git a/polly/test/ForwardOpTree/noforward_outofquota.ll b/polly/test/ForwardOpTree/noforward_outofquota.ll index 306bb8d7558d1..5e30cf88de4cf 100644 --- a/polly/test/ForwardOpTree/noforward_outofquota.ll +++ b/polly/test/ForwardOpTree/noforward_outofquota.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly -polly-optree-max-ops=1 '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines -; RUN: opt %loadNPMPolly -polly-optree-max-ops=1 -passes=polly-optree -disable-output -stats < %s 2>&1 | FileCheck %s -match-full-lines -check-prefix=STATS +; RUN: opt %loadNPMPolly -polly-optree-max-ops=1 '-passes=polly-custom<optree>' -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly -polly-optree-max-ops=1 '-passes=polly-custom<optree>' -disable-output -stats < %s 2>&1 | FileCheck %s -match-full-lines -check-prefix=STATS ; REQUIRES: asserts ; ; for (int j = 0; j < n; j += 1) { diff --git a/polly/test/ForwardOpTree/noforward_partial.ll b/polly/test/ForwardOpTree/noforward_partial.ll index edb5d34801cc5..f95bb77f70b67 100644 --- a/polly/test/ForwardOpTree/noforward_partial.ll +++ b/polly/test/ForwardOpTree/noforward_partial.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly '-passes=polly-custom<optree>' -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines ; ; Not the entire operand tree can be forwarded, ; some scalar dependencies would remain. diff --git a/polly/test/ForwardOpTree/noforward_phi.ll b/polly/test/ForwardOpTree/noforward_phi.ll index 755abad4336ef..025fe64724151 100644 --- a/polly/test/ForwardOpTree/noforward_phi.ll +++ b/polly/test/ForwardOpTree/noforward_phi.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly '-passes=polly-custom<optree>' -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines ; ; Do not move PHI nodes. ; diff --git a/polly/test/ForwardOpTree/noforward_selfrefphi.ll b/polly/test/ForwardOpTree/noforward_selfrefphi.ll index be7e82f726331..8b30137858243 100644 --- a/polly/test/ForwardOpTree/noforward_selfrefphi.ll +++ b/polly/test/ForwardOpTree/noforward_selfrefphi.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-optree-normalize-phi=true '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly -polly-optree-normalize-phi=true '-passes=polly-custom<optree>' -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines ; ; Contains a self-referencing PHINode that would require a ; transitive closure to handle. 
diff --git a/polly/test/ForwardOpTree/noforward_sideffects.ll b/polly/test/ForwardOpTree/noforward_sideffects.ll index c01b72a1c1420..179b02a259025 100644 --- a/polly/test/ForwardOpTree/noforward_sideffects.ll +++ b/polly/test/ForwardOpTree/noforward_sideffects.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly '-passes=polly-custom<optree>' -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines ; ; Do not forward instructions with side-effects (here: function call). ; diff --git a/polly/test/ForwardOpTree/noforward_synthesizable_unknownit.ll b/polly/test/ForwardOpTree/noforward_synthesizable_unknownit.ll index 776d848072a23..6baec6d9e1c6c 100644 --- a/polly/test/ForwardOpTree/noforward_synthesizable_unknownit.ll +++ b/polly/test/ForwardOpTree/noforward_synthesizable_unknownit.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly '-passes=polly-custom<optree>' -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines ; ; Do not try to forward %i.trunc, it is not synthesizable in %body. ; diff --git a/polly/test/ForwardOpTree/out-of-quota1.ll b/polly/test/ForwardOpTree/out-of-quota1.ll index ee3e32698dd02..95df49a5c061a 100644 --- a/polly/test/ForwardOpTree/out-of-quota1.ll +++ b/polly/test/ForwardOpTree/out-of-quota1.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-optree>' -disable-output %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<optree>' -polly-print-optree -disable-output %s | FileCheck %s ; This used to loop infinitely because of UINT_MAX returned by ISL on out-of-quota. 
diff --git a/polly/test/IstAstInfo/OpenMP/multiple_loops_outer_parallel.ll b/polly/test/IstAstInfo/OpenMP/multiple_loops_outer_parallel.ll index ec1ccdce94508..a5102b3557f0c 100644 --- a/polly/test/IstAstInfo/OpenMP/multiple_loops_outer_parallel.ll +++ b/polly/test/IstAstInfo/OpenMP/multiple_loops_outer_parallel.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadPolly -polly-print-ast -polly-parallel -polly-parallel-force -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -polly-parallel -polly-parallel-force -disable-output < %s | FileCheck %s ; ; void jd(int *A) { ; CHECK: #pragma omp parallel for diff --git a/polly/test/IstAstInfo/OpenMP/nested_loop_both_parallel.ll b/polly/test/IstAstInfo/OpenMP/nested_loop_both_parallel.ll index 9c00690605408..d086b59f97a5a 100644 --- a/polly/test/IstAstInfo/OpenMP/nested_loop_both_parallel.ll +++ b/polly/test/IstAstInfo/OpenMP/nested_loop_both_parallel.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadPolly -polly-print-ast -polly-parallel -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -polly-parallel -disable-output < %s | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" ; for (i = 0; i < 1024; i++) diff --git a/polly/test/IstAstInfo/OpenMP/nested_loop_both_parallel_parametric.ll b/polly/test/IstAstInfo/OpenMP/nested_loop_both_parallel_parametric.ll index 356762a2ae5b9..49a6b0531de56 100644 --- a/polly/test/IstAstInfo/OpenMP/nested_loop_both_parallel_parametric.ll +++ b/polly/test/IstAstInfo/OpenMP/nested_loop_both_parallel_parametric.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadPolly -polly-print-ast -polly-parallel -polly-parallel-force -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -polly-parallel -polly-parallel-force -disable-output < %s | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" ; int A[1024][1024]; ; void bar(int n) { diff --git a/polly/test/IstAstInfo/OpenMP/nested_loop_inner_parallel.ll b/polly/test/IstAstInfo/OpenMP/nested_loop_inner_parallel.ll index 066fc39def6ac..d2d7917b08528 100644 --- a/polly/test/IstAstInfo/OpenMP/nested_loop_inner_parallel.ll +++ b/polly/test/IstAstInfo/OpenMP/nested_loop_inner_parallel.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadPolly -polly-print-ast -polly-parallel -polly-parallel-force -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -polly-parallel -polly-parallel-force -disable-output < %s | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" ; for (i = 0; i < n; i++) diff --git a/polly/test/IstAstInfo/OpenMP/nested_loop_outer_parallel.ll b/polly/test/IstAstInfo/OpenMP/nested_loop_outer_parallel.ll index 77dd55cb7605e..c03189a211256 100644 --- a/polly/test/IstAstInfo/OpenMP/nested_loop_outer_parallel.ll +++ b/polly/test/IstAstInfo/OpenMP/nested_loop_outer_parallel.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadPolly -polly-print-ast -polly-parallel -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -polly-parallel -disable-output < %s | FileCheck %s target datalayout = 
"e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" ; for (i = 0; i < n; i++) diff --git a/polly/test/IstAstInfo/OpenMP/single_loop_param_non_parallel.ll b/polly/test/IstAstInfo/OpenMP/single_loop_param_non_parallel.ll index b61ebc9379b7f..6829211cc76b9 100644 --- a/polly/test/IstAstInfo/OpenMP/single_loop_param_non_parallel.ll +++ b/polly/test/IstAstInfo/OpenMP/single_loop_param_non_parallel.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadPolly -polly-print-ast -polly-parallel -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -polly-parallel -disable-output < %s | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" ; for (i = 0; i < n; i++) diff --git a/polly/test/IstAstInfo/OpenMP/single_loop_param_parallel.ll b/polly/test/IstAstInfo/OpenMP/single_loop_param_parallel.ll index 5c92a91681867..7199a337d8a4f 100644 --- a/polly/test/IstAstInfo/OpenMP/single_loop_param_parallel.ll +++ b/polly/test/IstAstInfo/OpenMP/single_loop_param_parallel.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadPolly -polly-print-ast -polly-parallel -polly-parallel-force -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -polly-parallel -polly-parallel-force -disable-output < %s | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" ; for (i = 0; i < n; i++) diff --git a/polly/test/IstAstInfo/OpenMP/single_loop_param_parallel_computeout.ll b/polly/test/IstAstInfo/OpenMP/single_loop_param_parallel_computeout.ll index 352d879199675..41d35bfdb3631 100644 --- a/polly/test/IstAstInfo/OpenMP/single_loop_param_parallel_computeout.ll +++ b/polly/test/IstAstInfo/OpenMP/single_loop_param_parallel_computeout.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadPolly -polly-print-ast -polly-parallel -polly-dependences-computeout=1 -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -polly-parallel -polly-dependences-computeout=1 -disable-output < %s | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" ; for (i = 0; i < n; i++) diff --git a/polly/test/IstAstInfo/alias_checks_with_empty_context.ll b/polly/test/IstAstInfo/alias_checks_with_empty_context.ll index 81c29536010b6..356269cefad36 100644 --- a/polly/test/IstAstInfo/alias_checks_with_empty_context.ll +++ b/polly/test/IstAstInfo/alias_checks_with_empty_context.ll @@ -1,5 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s \ -; RUN: | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" diff --git a/polly/test/IstAstInfo/alias_simple_1.ll b/polly/test/IstAstInfo/alias_simple_1.ll index 904f55dc32ce4..039c5f74fabfe 100644 --- a/polly/test/IstAstInfo/alias_simple_1.ll +++ b/polly/test/IstAstInfo/alias_simple_1.ll @@ -1,8 +1,8 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -aa-pipeline= -disable-output < %s | FileCheck %s --check-prefix=NOAA -; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' 
-aa-pipeline=basic-aa -disable-output < %s | FileCheck %s --check-prefix=BASI -; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -aa-pipeline=tbaa -disable-output < %s | FileCheck %s --check-prefix=TBAA -; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -aa-pipeline=scev-aa -disable-output < %s | FileCheck %s --check-prefix=SCEV -; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -aa-pipeline=globals-aa -disable-output < %s | FileCheck %s --check-prefix=GLOB +; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -aa-pipeline= -disable-output < %s | FileCheck %s --check-prefix=NOAA +; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -aa-pipeline=basic-aa -disable-output < %s | FileCheck %s --check-prefix=BASI +; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -aa-pipeline=tbaa -disable-output < %s | FileCheck %s --check-prefix=TBAA +; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -aa-pipeline=scev-aa -disable-output < %s | FileCheck %s --check-prefix=SCEV +; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -aa-pipeline=globals-aa -disable-output < %s | FileCheck %s --check-prefix=GLOB ; ; int A[1024]; ; diff --git a/polly/test/IstAstInfo/alias_simple_2.ll b/polly/test/IstAstInfo/alias_simple_2.ll index 5fae579995b23..1783a04f02be9 100644 --- a/polly/test/IstAstInfo/alias_simple_2.ll +++ b/polly/test/IstAstInfo/alias_simple_2.ll @@ -1,9 +1,9 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -aa-pipeline= -disable-output < %s | FileCheck %s --check-prefix=NOAA -; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -aa-pipeline=basic-aa -disable-output < %s | FileCheck %s --check-prefix=BASI -; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -aa-pipeline=tbaa -disable-output < %s | FileCheck %s --check-prefix=TBAA -; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -aa-pipeline=scev-aa -disable-output < %s | FileCheck %s --check-prefix=SCEV -; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -aa-pipeline=globals-aa -disable-output < %s | FileCheck %s --check-prefix=GLOB -; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -aa-pipeline=globals-aa -polly-allow-nonaffine -disable-output < %s | FileCheck %s --check-prefix=NONAFFINE +; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -aa-pipeline= -disable-output < %s | FileCheck %s --check-prefix=NOAA +; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -aa-pipeline=basic-aa -disable-output < %s | FileCheck %s --check-prefix=BASI +; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -aa-pipeline=tbaa -disable-output < %s | FileCheck %s --check-prefix=TBAA +; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -aa-pipeline=scev-aa -disable-output < %s | FileCheck %s --check-prefix=SCEV +; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -aa-pipeline=globals-aa -disable-output < %s | FileCheck %s --check-prefix=GLOB +; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -aa-pipeline=globals-aa -polly-allow-nonaffine -disable-output < %s | FileCheck %s --check-prefix=NONAFFINE ; ; int A[1024], B[1024]; ; diff --git a/polly/test/IstAstInfo/alias_simple_3.ll b/polly/test/IstAstInfo/alias_simple_3.ll index 8599c29934744..8d507fb82cb2d 100644 --- a/polly/test/IstAstInfo/alias_simple_3.ll +++ b/polly/test/IstAstInfo/alias_simple_3.ll @@ -1,8 +1,8 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -aa-pipeline= -disable-output < %s | 
FileCheck %s --check-prefix=NOAA -; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -aa-pipeline=basic-aa -disable-output < %s | FileCheck %s --check-prefix=BASI -; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -aa-pipeline=tbaa -disable-output < %s | FileCheck %s --check-prefix=TBAA -; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -aa-pipeline=scev-aa -disable-output < %s | FileCheck %s --check-prefix=SCEV -; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -aa-pipeline=globals-aa -disable-output < %s | FileCheck %s --check-prefix=GLOB +; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -aa-pipeline= -disable-output < %s | FileCheck %s --check-prefix=NOAA +; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -aa-pipeline=basic-aa -disable-output < %s | FileCheck %s --check-prefix=BASI +; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -aa-pipeline=tbaa -disable-output < %s | FileCheck %s --check-prefix=TBAA +; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -aa-pipeline=scev-aa -disable-output < %s | FileCheck %s --check-prefix=SCEV +; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -aa-pipeline=globals-aa -disable-output < %s | FileCheck %s --check-prefix=GLOB ; ; int A[1024]; ; float B[1024]; diff --git a/polly/test/IstAstInfo/aliasing_arrays_with_identical_base.ll b/polly/test/IstAstInfo/aliasing_arrays_with_identical_base.ll index dc21dc1f96a48..01b5372917358 100644 --- a/polly/test/IstAstInfo/aliasing_arrays_with_identical_base.ll +++ b/polly/test/IstAstInfo/aliasing_arrays_with_identical_base.ll @@ -1,6 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s \ -; RUN: -polly-invariant-load-hoisting \ -; RUN: | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -disable-output -polly-invariant-load-hoisting < %s | FileCheck %s ; CHECK: if (1 && 1 && (&MemRef_X[1] <= &MemRef_BaseA[0] || &MemRef_BaseA[1024] <= &MemRef_X[0]) && (&MemRef_X[1] <= &MemRef_BaseB[0] || &MemRef_BaseB[1024] <= &MemRef_X[0])) diff --git a/polly/test/IstAstInfo/aliasing_multiple_alias_groups.ll b/polly/test/IstAstInfo/aliasing_multiple_alias_groups.ll index 8d4adfa405f07..3835c23fecddb 100644 --- a/polly/test/IstAstInfo/aliasing_multiple_alias_groups.ll +++ b/polly/test/IstAstInfo/aliasing_multiple_alias_groups.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -aa-pipeline= -disable-output < %s | FileCheck %s --check-prefix=NOAA -; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -aa-pipeline=tbaa -disable-output < %s | FileCheck %s --check-prefix=TBAA +; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -aa-pipeline= -disable-output < %s | FileCheck %s --check-prefix=NOAA +; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -aa-pipeline=tbaa -disable-output < %s | FileCheck %s --check-prefix=TBAA ; ; void jd(int *Int0, int *Int1, float *Float0, float *Float1) { ; for (int i = 0; i < 1024; i++) { diff --git a/polly/test/IstAstInfo/aliasing_parametric_simple_1.ll b/polly/test/IstAstInfo/aliasing_parametric_simple_1.ll index be37b27b6e375..71bac9a2bb141 100644 --- a/polly/test/IstAstInfo/aliasing_parametric_simple_1.ll +++ b/polly/test/IstAstInfo/aliasing_parametric_simple_1.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -disable-output %s | FileCheck %s ; ; void 
jd(int *A, int *B, int c) { ; for (int i = 0; i < 1024; i++) diff --git a/polly/test/IstAstInfo/aliasing_parametric_simple_2.ll b/polly/test/IstAstInfo/aliasing_parametric_simple_2.ll index 15550583340db..e5ece1f57a85e 100644 --- a/polly/test/IstAstInfo/aliasing_parametric_simple_2.ll +++ b/polly/test/IstAstInfo/aliasing_parametric_simple_2.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s ; ; void jd(int *A, int *B, int c) { ; for (int i = 0; i < 1024; i++) diff --git a/polly/test/IstAstInfo/dependence_distance_constant.ll b/polly/test/IstAstInfo/dependence_distance_constant.ll index 9b7fb93f2f676..43b13eef9a95b 100644 --- a/polly/test/IstAstInfo/dependence_distance_constant.ll +++ b/polly/test/IstAstInfo/dependence_distance_constant.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadPolly -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s ; ; void f(int *A, int N) { ; CHECK: #pragma minimal dependence distance: 1 diff --git a/polly/test/IstAstInfo/dependence_distance_minimal.ll b/polly/test/IstAstInfo/dependence_distance_minimal.ll index d69cc3f9fc3f8..35a503ce7eb8d 100644 --- a/polly/test/IstAstInfo/dependence_distance_minimal.ll +++ b/polly/test/IstAstInfo/dependence_distance_minimal.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s ; ; The minimal dependence distance of the innermost loop should be 1 instead of 250. 
; CHECK: #pragma minimal dependence distance: 1 diff --git a/polly/test/IstAstInfo/dependence_distance_multiple_constant.ll b/polly/test/IstAstInfo/dependence_distance_multiple_constant.ll index bc21e9e07ad89..a7de5c4876385 100644 --- a/polly/test/IstAstInfo/dependence_distance_multiple_constant.ll +++ b/polly/test/IstAstInfo/dependence_distance_multiple_constant.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadPolly -basic-aa -polly-stmt-granularity=bb -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s ; ; void f(int *restrict A, int *restrict B, int N) { ; CHECK: #pragma minimal dependence distance: 5 diff --git a/polly/test/IstAstInfo/dependence_distance_parametric.ll b/polly/test/IstAstInfo/dependence_distance_parametric.ll index fa569a8386b86..fa05e4c889031 100644 --- a/polly/test/IstAstInfo/dependence_distance_parametric.ll +++ b/polly/test/IstAstInfo/dependence_distance_parametric.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadPolly -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s ; ; void f(int *A, int N, int c) { ; CHECK: #pragma minimal dependence distance: 1 diff --git a/polly/test/IstAstInfo/dependence_distance_parametric_expr.ll b/polly/test/IstAstInfo/dependence_distance_parametric_expr.ll index 7f280e0c542ca..73f74b3bce0b1 100644 --- a/polly/test/IstAstInfo/dependence_distance_parametric_expr.ll +++ b/polly/test/IstAstInfo/dependence_distance_parametric_expr.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadPolly -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s ; ; void f(int *A, int N, int c, int v) { ; CHECK: #pragma minimal dependence distance: 1 diff --git a/polly/test/IstAstInfo/dependence_distance_varying.ll b/polly/test/IstAstInfo/dependence_distance_varying.ll index d609c2f210f8d..e908954536600 100644 --- a/polly/test/IstAstInfo/dependence_distance_varying.ll +++ b/polly/test/IstAstInfo/dependence_distance_varying.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadPolly -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s ; ; void f(int *A, int N) { ; CHECK: #pragma minimal dependence distance: -(N % 2) + 2 diff --git a/polly/test/IstAstInfo/dependence_distance_varying_in_outer_loop.ll b/polly/test/IstAstInfo/dependence_distance_varying_in_outer_loop.ll index 8ed3220353c1b..1668fc0515441 100644 --- a/polly/test/IstAstInfo/dependence_distance_varying_in_outer_loop.ll +++ b/polly/test/IstAstInfo/dependence_distance_varying_in_outer_loop.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadPolly -polly-canonicalize -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s ; ; void f(int *restrict A, int *restrict sum) { ; CHECK: #pragma minimal dependence distance: 1 diff --git a/polly/test/IstAstInfo/dependence_distance_varying_multiple.ll b/polly/test/IstAstInfo/dependence_distance_varying_multiple.ll index 
73768e9c308a4..0d0aa8bea31d8 100644 --- a/polly/test/IstAstInfo/dependence_distance_varying_multiple.ll +++ b/polly/test/IstAstInfo/dependence_distance_varying_multiple.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadPolly -basic-aa -polly-stmt-granularity=bb -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s ; ; void f(int *restrict A, int *restrict B, int *restrict C, int *restrict D, ; int *restrict E, int N) { diff --git a/polly/test/IstAstInfo/domain_bounded_only_with_context.ll b/polly/test/IstAstInfo/domain_bounded_only_with_context.ll index e2cf0bd9c0df2..2ed94e59e8087 100644 --- a/polly/test/IstAstInfo/domain_bounded_only_with_context.ll +++ b/polly/test/IstAstInfo/domain_bounded_only_with_context.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s ; CHECK: { ; CHECK-NEXT: if (p <= -1 || p >= 1) diff --git a/polly/test/IstAstInfo/non_affine_access.ll b/polly/test/IstAstInfo/non_affine_access.ll index 98e8d2db959f8..a285a8f032f5e 100644 --- a/polly/test/IstAstInfo/non_affine_access.ll +++ b/polly/test/IstAstInfo/non_affine_access.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -polly-ast-print-accesses -polly-allow-nonaffine -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -polly-ast-print-accesses -polly-allow-nonaffine -disable-output < %s | FileCheck %s ; ; void non_affine_access(float A[]) { ; for (long i = 0; i < 1024; i++) diff --git a/polly/test/IstAstInfo/reduction_clauses_multidimensional_access.ll b/polly/test/IstAstInfo/reduction_clauses_multidimensional_access.ll index 697b6ca50d444..3fefc74efbef0 100644 --- a/polly/test/IstAstInfo/reduction_clauses_multidimensional_access.ll +++ b/polly/test/IstAstInfo/reduction_clauses_multidimensional_access.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadPolly -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s ; ; CHECK: #pragma known-parallel reduction (^ : MemRef_sum) ; void f(int N, int M, int P, int sum[P][M]) { diff --git a/polly/test/IstAstInfo/reduction_clauses_onedimensional_access.ll b/polly/test/IstAstInfo/reduction_clauses_onedimensional_access.ll index c20a7d6db13c9..41bd178c73c2a 100644 --- a/polly/test/IstAstInfo/reduction_clauses_onedimensional_access.ll +++ b/polly/test/IstAstInfo/reduction_clauses_onedimensional_access.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s ; ; CHECK: #pragma known-parallel reduction (^ : MemRef_sum) ; void f(int N, int M, int *sum) { diff --git a/polly/test/IstAstInfo/reduction_dependences_equal_non_reduction_dependences.ll b/polly/test/IstAstInfo/reduction_dependences_equal_non_reduction_dependences.ll index e6092f0b068f8..5aa8a0c244423 100644 --- a/polly/test/IstAstInfo/reduction_dependences_equal_non_reduction_dependences.ll +++ 
b/polly/test/IstAstInfo/reduction_dependences_equal_non_reduction_dependences.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=polly-custom<ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s ; ; This loopnest contains a reduction which imposes the same dependences as the ; accesses to the array A. We need to ensure we do __not__ parallelize anything diff --git a/polly/test/IstAstInfo/reduction_different_reduction_clauses.ll b/polly/test/IstAstInfo/reduction_different_reduction_clauses.ll index 14de70f9357c3..91f7c9d9601bc 100644 --- a/polly/test/IstAstInfo/reduction_different_reduction_clauses.ll +++ b/polly/test/IstAstInfo/reduction_different_reduction_clauses.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=polly-custom<ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s ; ; CHECK: #pragma simd reduction (+ : MemRef_sum{{[1,2]}}, MemRef_sum{{[1,2]}}) reduction (* : MemRef_prod) reduction (| : MemRef_or) reduction (& : MemRef_and) ; CHECK: #pragma known-parallel reduction (+ : MemRef_sum{{[1,2]}}, MemRef_sum{{[1,2]}}) reduction (* : MemRef_prod) reduction (| : MemRef_or) reduction (& : MemRef_and) diff --git a/polly/test/IstAstInfo/reduction_in_one_dimension.ll b/polly/test/IstAstInfo/reduction_in_one_dimension.ll index 797115b6f8d70..d0173bcd978ca 100644 --- a/polly/test/IstAstInfo/reduction_in_one_dimension.ll +++ b/polly/test/IstAstInfo/reduction_in_one_dimension.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadPolly -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s ; ; Verify that we won't privatize anything in the outer dimension ; diff --git a/polly/test/IstAstInfo/reduction_loop_reversal.ll b/polly/test/IstAstInfo/reduction_loop_reversal.ll index d30119787d8e0..d010e26f739a6 100644 --- a/polly/test/IstAstInfo/reduction_loop_reversal.ll +++ b/polly/test/IstAstInfo/reduction_loop_reversal.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadPolly -polly-import-jscop -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s ; ; CHECK-NOT: #pragma simd{{\s*$}} ; CHECK: #pragma simd reduction diff --git a/polly/test/IstAstInfo/reduction_modulo_and_loop_reversal_schedule.ll b/polly/test/IstAstInfo/reduction_modulo_and_loop_reversal_schedule.ll index 15fca884c2b63..7f78badfcb93c 100644 --- a/polly/test/IstAstInfo/reduction_modulo_and_loop_reversal_schedule.ll +++ b/polly/test/IstAstInfo/reduction_modulo_and_loop_reversal_schedule.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s ; ; CHECK: #pragma known-parallel reduction (+ : MemRef_A) ; CHECK-NEXT: for (int c0 = 0; c0 <= 2; c0 += 1) { diff --git 
a/polly/test/IstAstInfo/reduction_modulo_and_loop_reversal_schedule_2.ll b/polly/test/IstAstInfo/reduction_modulo_and_loop_reversal_schedule_2.ll index 44e9aa4d1e569..42e9c3b19eb1b 100644 --- a/polly/test/IstAstInfo/reduction_modulo_and_loop_reversal_schedule_2.ll +++ b/polly/test/IstAstInfo/reduction_modulo_and_loop_reversal_schedule_2.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s ; ; CHECK: #pragma known-parallel reduction ; CHECK: for (int c0 = 0; c0 <= 2; c0 += 1) { diff --git a/polly/test/IstAstInfo/reduction_modulo_schedule.ll b/polly/test/IstAstInfo/reduction_modulo_schedule.ll index c39ffa591484d..8bdd5299986eb 100644 --- a/polly/test/IstAstInfo/reduction_modulo_schedule.ll +++ b/polly/test/IstAstInfo/reduction_modulo_schedule.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadPolly -polly-import-jscop -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s ; ; CHECK: #pragma known-parallel reduction (+ : MemRef_A) ; CHECK-NEXT: for (int c0 = 0; c0 <= 2; c0 += 1) { diff --git a/polly/test/IstAstInfo/reduction_modulo_schedule_multiple_dimensions.ll b/polly/test/IstAstInfo/reduction_modulo_schedule_multiple_dimensions.ll index 266753555cab1..4811069e4f399 100644 --- a/polly/test/IstAstInfo/reduction_modulo_schedule_multiple_dimensions.ll +++ b/polly/test/IstAstInfo/reduction_modulo_schedule_multiple_dimensions.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s ; ; CHECK: #pragma known-parallel ; CHECK: for (int c0 = 0; c0 <= 1; c0 += 1) diff --git a/polly/test/IstAstInfo/reduction_modulo_schedule_multiple_dimensions_2.ll b/polly/test/IstAstInfo/reduction_modulo_schedule_multiple_dimensions_2.ll index d7f9029fd347a..4f5ac24a0b005 100644 --- a/polly/test/IstAstInfo/reduction_modulo_schedule_multiple_dimensions_2.ll +++ b/polly/test/IstAstInfo/reduction_modulo_schedule_multiple_dimensions_2.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s ; ; Verify that the outer dimension doesn't carry reduction dependences ; diff --git a/polly/test/IstAstInfo/reduction_modulo_schedule_multiple_dimensions_3.ll b/polly/test/IstAstInfo/reduction_modulo_schedule_multiple_dimensions_3.ll index f18060a2e20a8..472a04847ec95 100644 --- a/polly/test/IstAstInfo/reduction_modulo_schedule_multiple_dimensions_3.ll +++ b/polly/test/IstAstInfo/reduction_modulo_schedule_multiple_dimensions_3.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s ; ; Verify that the outer 
dimension doesn't carry reduction dependences ; diff --git a/polly/test/IstAstInfo/reduction_modulo_schedule_multiple_dimensions_4.ll b/polly/test/IstAstInfo/reduction_modulo_schedule_multiple_dimensions_4.ll index 8e2a590c5f57c..2cc911d78234b 100644 --- a/polly/test/IstAstInfo/reduction_modulo_schedule_multiple_dimensions_4.ll +++ b/polly/test/IstAstInfo/reduction_modulo_schedule_multiple_dimensions_4.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s ; ; Verify that the outer dimension doesn't carry reduction dependences ; diff --git a/polly/test/IstAstInfo/reduction_modulo_schedule_multiple_dimensions_5.ll b/polly/test/IstAstInfo/reduction_modulo_schedule_multiple_dimensions_5.ll index b889db4819cd5..1b2d0eb75c12c 100644 --- a/polly/test/IstAstInfo/reduction_modulo_schedule_multiple_dimensions_5.ll +++ b/polly/test/IstAstInfo/reduction_modulo_schedule_multiple_dimensions_5.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s ; ; Verify that only the outer dimension needs privatization ; diff --git a/polly/test/IstAstInfo/reduction_multiple_dimensions.ll b/polly/test/IstAstInfo/reduction_multiple_dimensions.ll index 2a8fd7a4f670e..884cea7918031 100644 --- a/polly/test/IstAstInfo/reduction_multiple_dimensions.ll +++ b/polly/test/IstAstInfo/reduction_multiple_dimensions.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s ; ; CHECK-NOT:#pragma known-parallel reduction ; CHECK: #pragma known-parallel diff --git a/polly/test/IstAstInfo/reduction_multiple_dimensions_2.ll b/polly/test/IstAstInfo/reduction_multiple_dimensions_2.ll index 25f2fa597e34e..013a7d4f3ad27 100644 --- a/polly/test/IstAstInfo/reduction_multiple_dimensions_2.ll +++ b/polly/test/IstAstInfo/reduction_multiple_dimensions_2.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s ; ; CHECK-NOT:#pragma known-parallel reduction ; CHECK: #pragma known-parallel diff --git a/polly/test/IstAstInfo/reduction_multiple_dimensions_3.ll b/polly/test/IstAstInfo/reduction_multiple_dimensions_3.ll index 0d6be9a9da9bf..2dc6d8680b36a 100644 --- a/polly/test/IstAstInfo/reduction_multiple_dimensions_3.ll +++ b/polly/test/IstAstInfo/reduction_multiple_dimensions_3.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s ; ; CHECK-NOT:#pragma known-parallel reduction ; CHECK: #pragma known-parallel diff --git a/polly/test/IstAstInfo/reduction_multiple_dimensions_4.ll b/polly/test/IstAstInfo/reduction_multiple_dimensions_4.ll 
index 8b537513cc8d7..dcd75945d25a8 100644 --- a/polly/test/IstAstInfo/reduction_multiple_dimensions_4.ll +++ b/polly/test/IstAstInfo/reduction_multiple_dimensions_4.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s ; ; CHECK-NOT:#pragma known-parallel reduction ; CHECK: #pragma known-parallel diff --git a/polly/test/IstAstInfo/run-time-condition.ll b/polly/test/IstAstInfo/run-time-condition.ll index 44d3534f651ce..67fc4b74571da 100644 --- a/polly/test/IstAstInfo/run-time-condition.ll +++ b/polly/test/IstAstInfo/run-time-condition.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s ; for (i = 0; i < 1024; i++) ; A[i] = B[i]; diff --git a/polly/test/IstAstInfo/runtime_context_with_error_blocks.ll b/polly/test/IstAstInfo/runtime_context_with_error_blocks.ll index aef509a865b6a..d674f429c0d48 100644 --- a/polly/test/IstAstInfo/runtime_context_with_error_blocks.ll +++ b/polly/test/IstAstInfo/runtime_context_with_error_blocks.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -polly-invariant-load-hoisting=true -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -polly-invariant-load-hoisting=true -disable-output < %s | FileCheck %s ; ; Verify we do not simplify the runtime check to "true" due to the domain ; constraints as the test contains an error block that influenced the domains diff --git a/polly/test/IstAstInfo/simple-run-time-condition.ll b/polly/test/IstAstInfo/simple-run-time-condition.ll index 488cd180b899a..73a7c596cea0b 100644 --- a/polly/test/IstAstInfo/simple-run-time-condition.ll +++ b/polly/test/IstAstInfo/simple-run-time-condition.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -polly-precise-inbounds -polly-precise-fold-accesses -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -polly-precise-inbounds -polly-precise-fold-accesses -disable-output < %s | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" diff --git a/polly/test/IstAstInfo/single_loop_strip_mine.ll b/polly/test/IstAstInfo/single_loop_strip_mine.ll index afe6179188c01..f546972fb370c 100644 --- a/polly/test/IstAstInfo/single_loop_strip_mine.ll +++ b/polly/test/IstAstInfo/single_loop_strip_mine.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s -; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-ast-print-accesses -polly-ast-detect-parallel '-passes=polly-import-jscop,print<polly-ast>' -disable-output < %s | FileCheck %s -check-prefix=CHECK-VECTOR +; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-ast-print-accesses -polly-ast-detect-parallel '-passes=polly-custom<import-jscop;ast>' -polly-print-ast -disable-output < %s | FileCheck %s -check-prefix=CHECK-VECTOR ; for (i = 0; i < 1024; i++) ; A[i] = B[i]; 
diff --git a/polly/test/IstAstInfo/single_loop_uint_max_iterations.ll b/polly/test/IstAstInfo/single_loop_uint_max_iterations.ll index f614f90fc3fc9..c9ae9e8f4e52e 100644 --- a/polly/test/IstAstInfo/single_loop_uint_max_iterations.ll +++ b/polly/test/IstAstInfo/single_loop_uint_max_iterations.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s ; XFAIL: * ;#include "limits.h" diff --git a/polly/test/IstAstInfo/single_loop_ull_max_iterations.ll b/polly/test/IstAstInfo/single_loop_ull_max_iterations.ll index e91ea13278692..45227160e8699 100644 --- a/polly/test/IstAstInfo/single_loop_ull_max_iterations.ll +++ b/polly/test/IstAstInfo/single_loop_ull_max_iterations.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s ; XFAIL: * ;#include "limits.h" diff --git a/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Bad-relation.ll b/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Bad-relation.ll index 49a962592bb9d..28b6a7ca12799 100644 --- a/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Bad-relation.ll +++ b/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Bad-relation.ll @@ -1,4 +1,4 @@ -; RUN: not --crash opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s +; RUN: not --crash opt %loadNPMPolly '-passes=polly-custom<import-jscop;ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s ; ; CHECK: expecting other token ; diff --git a/polly/test/JSONExporter/ImportAccesses/ImportAccesses-No-accesses-key.ll b/polly/test/JSONExporter/ImportAccesses/ImportAccesses-No-accesses-key.ll index 749b962b260f5..f19a632815795 100644 --- a/polly/test/JSONExporter/ImportAccesses/ImportAccesses-No-accesses-key.ll +++ b/polly/test/JSONExporter/ImportAccesses/ImportAccesses-No-accesses-key.ll @@ -1,4 +1,4 @@ -; RUN: not --crash opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s +; RUN: not --crash opt %loadNPMPolly '-passes=polly-custom<import-jscop;ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s ; ; CHECK: Statement from JScop file has no key name 'accesses' for index 1. ; diff --git a/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Not-enough-MemAcc.ll b/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Not-enough-MemAcc.ll index 1d97e3ebca625..77b9acfbb0989 100644 --- a/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Not-enough-MemAcc.ll +++ b/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Not-enough-MemAcc.ll @@ -1,4 +1,4 @@ -; RUN: not --crash opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s +; RUN: not --crash opt %loadNPMPolly '-passes=polly-custom<import-jscop;ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s ; ; CHECK: The number of memory accesses in the JSop file and the number of memory accesses differ for index 0. 
; diff --git a/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Not-enough-statements.ll b/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Not-enough-statements.ll index f4b739398f9f6..0a06ff671c298 100644 --- a/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Not-enough-statements.ll +++ b/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Not-enough-statements.ll @@ -1,4 +1,4 @@ -; RUN: not --crash opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s +; RUN: not --crash opt %loadNPMPolly '-passes=polly-custom<import-jscop;ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s ; ; CHECK: The number of indices and the number of statements differ. ; diff --git a/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Relation-mispelled.ll b/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Relation-mispelled.ll index 1f5cda3518a2f..35b7af098ae42 100644 --- a/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Relation-mispelled.ll +++ b/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Relation-mispelled.ll @@ -1,4 +1,4 @@ -; RUN: not --crash opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s +; RUN: not --crash opt %loadNPMPolly '-passes=polly-custom<import-jscop;ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s ; ; CHECK: Memory access number 0 has no key name 'relation' for statement number 1. ; diff --git a/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Statements-mispelled.ll b/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Statements-mispelled.ll index 0c750849b51eb..109665a85c607 100644 --- a/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Statements-mispelled.ll +++ b/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Statements-mispelled.ll @@ -1,4 +1,4 @@ -; RUN: not --crash opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s +; RUN: not --crash opt %loadNPMPolly '-passes=polly-custom<import-jscop;ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s ; ; CHECK: JScop file has no key name 'statements'. 
; diff --git a/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Undeclared-ScopArrayInfo.ll b/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Undeclared-ScopArrayInfo.ll index d8c9c3f4ab2ea..f345d1c31796e 100644 --- a/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Undeclared-ScopArrayInfo.ll +++ b/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Undeclared-ScopArrayInfo.ll @@ -1,4 +1,4 @@ -; RUN: not --crash opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s +; RUN: not --crash opt %loadNPMPolly '-passes=polly-custom<import-jscop;ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s ; ; CHECK: JScop file contains access function with undeclared ScopArrayInfo ; diff --git a/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Wrong-number-dimensions.ll b/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Wrong-number-dimensions.ll index f8d7cb8c1453e..a66d5c8c69b55 100644 --- a/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Wrong-number-dimensions.ll +++ b/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Wrong-number-dimensions.ll @@ -1,4 +1,4 @@ -; RUN: not --crash opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s +; RUN: not --crash opt %loadNPMPolly '-passes=polly-custom<import-jscop;ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s ; ; CHECK: JScop file changes the number of parameter dimensions. ; diff --git a/polly/test/JSONExporter/ImportArrays/ImportArrays-Mispelled-type.ll b/polly/test/JSONExporter/ImportArrays/ImportArrays-Mispelled-type.ll index 6e13a5e413d76..ae0b4edffb5fc 100644 --- a/polly/test/JSONExporter/ImportArrays/ImportArrays-Mispelled-type.ll +++ b/polly/test/JSONExporter/ImportArrays/ImportArrays-Mispelled-type.ll @@ -1,4 +1,4 @@ - ; RUN: not --crash opt %loadNPMPolly -passes=polly-import-jscop -polly-import-jscop-postfix=transformed -disable-output < %s 2>&1 | FileCheck %s +; RUN: not --crash opt %loadNPMPolly '-passes=polly-custom<import-jscop>' -polly-import-jscop-postfix=transformed -disable-output < %s 2>&1 | FileCheck %s ; ; CHECK: Array has not a valid type. 
; diff --git a/polly/test/JSONExporter/ImportArrays/ImportArrays-Negative-size.ll b/polly/test/JSONExporter/ImportArrays/ImportArrays-Negative-size.ll index 7f6578776e0bd..6c434e15a38d2 100644 --- a/polly/test/JSONExporter/ImportArrays/ImportArrays-Negative-size.ll +++ b/polly/test/JSONExporter/ImportArrays/ImportArrays-Negative-size.ll @@ -1,4 +1,4 @@ -; RUN: not --crash opt %loadNPMPolly -polly-stmt-granularity=bb -passes=polly-import-jscop -polly-import-jscop-postfix=transformed -disable-output < %s 2>&1 | FileCheck %s +; RUN: not --crash opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<import-jscop>' -polly-import-jscop-postfix=transformed -disable-output < %s 2>&1 | FileCheck %s ; ; #define Ni 1056 ; #define Nj 1056 diff --git a/polly/test/JSONExporter/ImportArrays/ImportArrays-No-name.ll b/polly/test/JSONExporter/ImportArrays/ImportArrays-No-name.ll index e698bdc488c2c..b004c4725176a 100644 --- a/polly/test/JSONExporter/ImportArrays/ImportArrays-No-name.ll +++ b/polly/test/JSONExporter/ImportArrays/ImportArrays-No-name.ll @@ -1,4 +1,4 @@ -; RUN: not --crash opt %loadNPMPolly -passes=polly-import-jscop -polly-import-jscop-postfix=transformed -disable-output < %s 2>&1 | FileCheck %s +; RUN: not --crash opt %loadNPMPolly '-passes=polly-custom<import-jscop>' -polly-import-jscop-postfix=transformed -disable-output < %s 2>&1 | FileCheck %s ; ; CHECK: Array has no key 'name'. ; diff --git a/polly/test/JSONExporter/ImportArrays/ImportArrays-No-sizes-key.ll b/polly/test/JSONExporter/ImportArrays/ImportArrays-No-sizes-key.ll index f130b6556e3e5..5f62a457f63eb 100644 --- a/polly/test/JSONExporter/ImportArrays/ImportArrays-No-sizes-key.ll +++ b/polly/test/JSONExporter/ImportArrays/ImportArrays-No-sizes-key.ll @@ -1,4 +1,4 @@ -; RUN: not --crash opt %loadNPMPolly -passes=polly-import-jscop -polly-import-jscop-postfix=transformed -disable-output < %s 2>&1 | FileCheck %s +; RUN: not --crash opt %loadNPMPolly '-passes=polly-custom<import-jscop>' -polly-import-jscop-postfix=transformed -disable-output < %s 2>&1 | FileCheck %s ; ; CHECK: Array has no key 'sizes'. ; diff --git a/polly/test/JSONExporter/ImportArrays/ImportArrays-No-type-key.ll b/polly/test/JSONExporter/ImportArrays/ImportArrays-No-type-key.ll index 68d2e50c6730d..029fde10f5a4a 100644 --- a/polly/test/JSONExporter/ImportArrays/ImportArrays-No-type-key.ll +++ b/polly/test/JSONExporter/ImportArrays/ImportArrays-No-type-key.ll @@ -1,4 +1,4 @@ -; RUN: not --crash opt %loadNPMPolly -passes=polly-import-jscop -polly-import-jscop-postfix=transformed -disable-output < %s 2>&1 | FileCheck %s +; RUN: not --crash opt %loadNPMPolly '-passes=polly-custom<import-jscop>' -polly-import-jscop-postfix=transformed -disable-output < %s 2>&1 | FileCheck %s ; ; CHECK: Array has no key 'type'. 
; diff --git a/polly/test/JSONExporter/ImportContext/ImportContext-Context-mispelled.ll b/polly/test/JSONExporter/ImportContext/ImportContext-Context-mispelled.ll index 94c77dc2a0138..9ac371b655146 100644 --- a/polly/test/JSONExporter/ImportContext/ImportContext-Context-mispelled.ll +++ b/polly/test/JSONExporter/ImportContext/ImportContext-Context-mispelled.ll @@ -1,4 +1,4 @@ -; RUN: not --crash opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s +; RUN: not --crash opt %loadNPMPolly '-passes=polly-custom<import-jscop;ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s ; ; CHECK: JScop file has no key named 'context'. ; diff --git a/polly/test/JSONExporter/ImportContext/ImportContext-Not-parameter-set.ll b/polly/test/JSONExporter/ImportContext/ImportContext-Not-parameter-set.ll index c20d5c02d662e..82afcd95c871f 100644 --- a/polly/test/JSONExporter/ImportContext/ImportContext-Not-parameter-set.ll +++ b/polly/test/JSONExporter/ImportContext/ImportContext-Not-parameter-set.ll @@ -1,4 +1,4 @@ -; RUN: not --crash opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s +; RUN: not --crash opt %loadNPMPolly '-passes=polly-custom<import-jscop;ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s ; ; CHECK: The isl_set is not a parameter set. ; diff --git a/polly/test/JSONExporter/ImportContext/ImportContext-Unvalid-Context.ll b/polly/test/JSONExporter/ImportContext/ImportContext-Unvalid-Context.ll index 92f4d61212e93..0308452c6f955 100644 --- a/polly/test/JSONExporter/ImportContext/ImportContext-Unvalid-Context.ll +++ b/polly/test/JSONExporter/ImportContext/ImportContext-Unvalid-Context.ll @@ -1,4 +1,4 @@ -; RUN: not --crash opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s +; RUN: not --crash opt %loadNPMPolly '-passes=polly-custom<import-jscop;ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s ; ; CHECK: unexpected isl_token ; diff --git a/polly/test/JSONExporter/ImportContext/ImportContext-Wrong-dimension.ll b/polly/test/JSONExporter/ImportContext/ImportContext-Wrong-dimension.ll index 89668d8d573b1..debb9bc604110 100644 --- a/polly/test/JSONExporter/ImportContext/ImportContext-Wrong-dimension.ll +++ b/polly/test/JSONExporter/ImportContext/ImportContext-Wrong-dimension.ll @@ -1,4 +1,4 @@ -; RUN: not --crash opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s +; RUN: not --crash opt %loadNPMPolly '-passes=polly-custom<import-jscop;ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s ; ; CHECK: Imported context has the wrong number of parameters : Found 2 Expected 1 ; diff --git a/polly/test/JSONExporter/ImportSchedule/ImportSchedule-No-schedule-key.ll b/polly/test/JSONExporter/ImportSchedule/ImportSchedule-No-schedule-key.ll index efe15c14ce90d..6eee0056ba0b5 100644 --- a/polly/test/JSONExporter/ImportSchedule/ImportSchedule-No-schedule-key.ll +++ b/polly/test/JSONExporter/ImportSchedule/ImportSchedule-No-schedule-key.ll @@ -1,4 +1,4 @@ -; RUN: not --crash opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-ast>' -polly-ast-detect-parallel 
-disable-output < %s 2>&1 >/dev/null | FileCheck %s +; RUN: not --crash opt %loadNPMPolly '-passes=polly-custom<import-jscop;ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s ; ; CHECK: Statement 0 has no 'schedule' key. ; diff --git a/polly/test/JSONExporter/ImportSchedule/ImportSchedule-Schedule-not-valid.ll b/polly/test/JSONExporter/ImportSchedule/ImportSchedule-Schedule-not-valid.ll index db516f6d7d335..59feb0085e6de 100644 --- a/polly/test/JSONExporter/ImportSchedule/ImportSchedule-Schedule-not-valid.ll +++ b/polly/test/JSONExporter/ImportSchedule/ImportSchedule-Schedule-not-valid.ll @@ -1,4 +1,4 @@ -; RUN: not --crash opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s +; RUN: not --crash opt %loadNPMPolly '-passes=polly-custom<import-jscop;ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s ; ; CHECK: expecting other token ; diff --git a/polly/test/JSONExporter/ImportSchedule/ImportSchedule-Statements-mispelled.ll b/polly/test/JSONExporter/ImportSchedule/ImportSchedule-Statements-mispelled.ll index b93c984d7d9dd..78d5243d34e00 100644 --- a/polly/test/JSONExporter/ImportSchedule/ImportSchedule-Statements-mispelled.ll +++ b/polly/test/JSONExporter/ImportSchedule/ImportSchedule-Statements-mispelled.ll @@ -1,4 +1,4 @@ -; RUN: not --crash opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s +; RUN: not --crash opt %loadNPMPolly '-passes=polly-custom<import-jscop;ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s ; ; CHECK: JScop file has no key name 'statements'. ; diff --git a/polly/test/JSONExporter/ImportSchedule/ImportSchedule-Wrong-number-statements.ll b/polly/test/JSONExporter/ImportSchedule/ImportSchedule-Wrong-number-statements.ll index 3fa14c64cd639..877547c8f317f 100644 --- a/polly/test/JSONExporter/ImportSchedule/ImportSchedule-Wrong-number-statements.ll +++ b/polly/test/JSONExporter/ImportSchedule/ImportSchedule-Wrong-number-statements.ll @@ -1,4 +1,4 @@ -; RUN: not --crash opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s +; RUN: not --crash opt %loadNPMPolly '-passes=polly-custom<import-jscop;ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s ; ; CHECK: The number of indices and the number of statements differ. 
; diff --git a/polly/test/MaximalStaticExpansion/load_after_store_same_statement.ll b/polly/test/MaximalStaticExpansion/load_after_store_same_statement.ll index 1d81ff7ef2dc8..9f999204f59bf 100644 --- a/polly/test/MaximalStaticExpansion/load_after_store_same_statement.ll +++ b/polly/test/MaximalStaticExpansion/load_after_store_same_statement.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb "-passes=scop(print<polly-mse>)" -disable-output < %s | FileCheck %s -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb "-passes=scop(print<polly-mse>)" -pass-remarks-analysis="polly-mse" -disable-output < %s 2>&1 | FileCheck %s --check-prefix=MSE +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<mse>' -polly-print-mse -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<mse>' -polly-print-mse -pass-remarks-analysis=polly-mse -disable-output < %s 2>&1 | FileCheck %s --check-prefix=MSE ; ; Verify that the expansion of an array with load after store in a same statement is not done. ; diff --git a/polly/test/MaximalStaticExpansion/read_from_original.ll b/polly/test/MaximalStaticExpansion/read_from_original.ll index 57017381c661a..1a733c113626d 100644 --- a/polly/test/MaximalStaticExpansion/read_from_original.ll +++ b/polly/test/MaximalStaticExpansion/read_from_original.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly "-passes=scop(print<polly-mse>)" -disable-output < %s | FileCheck %s -; RUN: opt %loadNPMPolly "-passes=scop(print<polly-mse>)" -pass-remarks-analysis="polly-mse" -disable-output < %s 2>&1 | FileCheck %s --check-prefix=MSE +; RUN: opt %loadNPMPolly '-passes=polly-custom<mse>' -polly-print-mse -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<mse>' -polly-print-mse -pass-remarks-analysis=polly-mse -disable-output < %s 2>&1 | FileCheck %s --check-prefix=MSE ; ; Verify that Polly detects problems and does not expand the array ; diff --git a/polly/test/MaximalStaticExpansion/too_many_writes.ll b/polly/test/MaximalStaticExpansion/too_many_writes.ll index 7e33de17a1749..a7aa162aa83da 100644 --- a/polly/test/MaximalStaticExpansion/too_many_writes.ll +++ b/polly/test/MaximalStaticExpansion/too_many_writes.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly "-passes=scop(print<polly-mse>)" -disable-output < %s | FileCheck %s -; RUN: opt %loadNPMPolly "-passes=scop(print<polly-mse>)" -pass-remarks-analysis="polly-mse" -disable-output < %s 2>&1 | FileCheck %s --check-prefix=MSE +; RUN: opt %loadNPMPolly '-passes=polly-custom<mse>' -polly-print-mse -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<mse>' -polly-print-mse -pass-remarks-analysis=polly-mse -disable-output < %s 2>&1 | FileCheck %s --check-prefix=MSE ; ; Verify that Polly detects problems and does not expand the array ; diff --git a/polly/test/MaximalStaticExpansion/working_deps_between_inners.ll b/polly/test/MaximalStaticExpansion/working_deps_between_inners.ll index 355fc02600d54..06e08c43e3492 100644 --- a/polly/test/MaximalStaticExpansion/working_deps_between_inners.ll +++ b/polly/test/MaximalStaticExpansion/working_deps_between_inners.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly "-passes=scop(print<polly-mse>)" -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<mse>' -polly-print-mse -disable-output < %s | FileCheck %s ; ; Verify that the accesses are correctly expanded for MemoryKind::Array ; diff --git 
a/polly/test/MaximalStaticExpansion/working_deps_between_inners_phi.ll b/polly/test/MaximalStaticExpansion/working_deps_between_inners_phi.ll index 930539547cc97..076f47143dbcc 100644 --- a/polly/test/MaximalStaticExpansion/working_deps_between_inners_phi.ll +++ b/polly/test/MaximalStaticExpansion/working_deps_between_inners_phi.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly "-passes=scop(print<polly-mse>)" -disable-output < %s | FileCheck %s -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb "-passes=scop(print<polly-mse>)" -pass-remarks-analysis="polly-mse" -disable-output < %s 2>&1 | FileCheck %s --check-prefix=MSE +; RUN: opt %loadNPMPolly '-passes=polly-custom<mse>' -polly-print-mse -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<mse>' -polly-print-mse -pass-remarks-analysis=polly-mse -disable-output < %s 2>&1 | FileCheck %s --check-prefix=MSE ; ; Verify that the accesses are correctly expanded for MemoryKind::Array and MemoryKind::PHI. ; tmp_06_phi is not expanded because it need copy in. diff --git a/polly/test/MaximalStaticExpansion/working_expansion.ll b/polly/test/MaximalStaticExpansion/working_expansion.ll index a055e50225e91..2b040f3f1f4e3 100644 --- a/polly/test/MaximalStaticExpansion/working_expansion.ll +++ b/polly/test/MaximalStaticExpansion/working_expansion.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly "-passes=scop(print<polly-mse>)" -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<mse>' -polly-print-mse -disable-output < %s | FileCheck %s ; ; Verify that the accesses are correctly expanded for MemoryKind::Array ; diff --git a/polly/test/MaximalStaticExpansion/working_expansion_multiple_dependences_per_statement.ll b/polly/test/MaximalStaticExpansion/working_expansion_multiple_dependences_per_statement.ll index 77338c9aac200..f863c0e1d6edf 100644 --- a/polly/test/MaximalStaticExpansion/working_expansion_multiple_dependences_per_statement.ll +++ b/polly/test/MaximalStaticExpansion/working_expansion_multiple_dependences_per_statement.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb "-passes=scop(print<polly-mse>)" -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<mse>' -polly-print-mse -disable-output < %s | FileCheck %s ; ; Verify that the accesses are correctly expanded ; diff --git a/polly/test/MaximalStaticExpansion/working_expansion_multiple_instruction_per_statement.ll b/polly/test/MaximalStaticExpansion/working_expansion_multiple_instruction_per_statement.ll index 9cfa5536072b7..a823bdb4e7682 100644 --- a/polly/test/MaximalStaticExpansion/working_expansion_multiple_instruction_per_statement.ll +++ b/polly/test/MaximalStaticExpansion/working_expansion_multiple_instruction_per_statement.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb "-passes=scop(print<polly-mse>)" -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<mse>' -polly-print-mse -disable-output < %s | FileCheck %s ; ; Verify that the accesses are correctly expanded ; diff --git a/polly/test/MaximalStaticExpansion/working_phi_expansion.ll b/polly/test/MaximalStaticExpansion/working_phi_expansion.ll index 63e4d48046275..0898f99c896d4 100644 --- a/polly/test/MaximalStaticExpansion/working_phi_expansion.ll +++ b/polly/test/MaximalStaticExpansion/working_phi_expansion.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly "-passes=scop(print<polly-mse>)" 
-disable-output < %s | FileCheck %s -; RUN: opt %loadNPMPolly "-passes=scop(print<polly-mse>)" -pass-remarks-analysis="polly-mse" -disable-output < %s 2>&1 | FileCheck %s --check-prefix=MSE +; RUN: opt %loadNPMPolly '-passes=polly-custom<mse>' -polly-print-mse -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<mse>' -polly-print-mse -pass-remarks-analysis=polly-mse -disable-output < %s 2>&1 | FileCheck %s --check-prefix=MSE ; ; Verify that the accesses are correctly expanded for MemoryKind::PHI ; tmp_04 is not expanded because it need copy-in. diff --git a/polly/test/MaximalStaticExpansion/working_phi_two_scalars.ll b/polly/test/MaximalStaticExpansion/working_phi_two_scalars.ll index 87bd57abab8d1..2a332ba7ce77b 100644 --- a/polly/test/MaximalStaticExpansion/working_phi_two_scalars.ll +++ b/polly/test/MaximalStaticExpansion/working_phi_two_scalars.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb "-passes=scop(print<polly-mse>)" -disable-output < %s | FileCheck %s -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb "-passes=scop(print<polly-mse>)" -pass-remarks-analysis="polly-mse" -disable-output < %s 2>&1 | FileCheck %s --check-prefix=MSE +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<mse>' -polly-print-mse -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<mse>' -polly-print-mse -pass-remarks-analysis=polly-mse -disable-output < %s 2>&1 | FileCheck %s --check-prefix=MSE ; ; Verify that the accesses are correctly expanded for MemoryKind::PHI ; tmp_05 and tmp2_06 are not expanded because they need copy-in. diff --git a/polly/test/MaximalStaticExpansion/working_value_expansion.ll b/polly/test/MaximalStaticExpansion/working_value_expansion.ll index cc28a78c38671..77f20bb163a8b 100644 --- a/polly/test/MaximalStaticExpansion/working_value_expansion.ll +++ b/polly/test/MaximalStaticExpansion/working_value_expansion.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly "-passes=scop(print<polly-mse>)" -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<mse>' -polly-print-mse -disable-output < %s | FileCheck %s ; ; Verify that the accesses are correctly expanded for MemoryKind::Value ; diff --git a/polly/test/PruneUnprofitable/prune_only_scalardeps.ll b/polly/test/PruneUnprofitable/prune_only_scalardeps.ll index 9cc2aecf002dd..b4524c21a35ee 100644 --- a/polly/test/PruneUnprofitable/prune_only_scalardeps.ll +++ b/polly/test/PruneUnprofitable/prune_only_scalardeps.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-process-unprofitable=false -polly-unprofitable-scalar-accs=false "-passes=scop(polly-prune-unprofitable)" -disable-output -stats < %s 2>&1 | FileCheck -match-full-lines %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-process-unprofitable=false -polly-unprofitable-scalar-accs=false '-passes=polly-custom<prune>' -disable-output -stats < %s 2>&1 | FileCheck -match-full-lines %s ; REQUIRES: asserts ; ; Skip this SCoP for having scalar dependencies between all statements, diff --git a/polly/test/ScheduleOptimizer/2012-03-16-Empty-Domain.ll b/polly/test/ScheduleOptimizer/2012-03-16-Empty-Domain.ll index 38facb1688c46..c8c006c94d1d4 100644 --- a/polly/test/ScheduleOptimizer/2012-03-16-Empty-Domain.ll +++ b/polly/test/ScheduleOptimizer/2012-03-16-Empty-Domain.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-opt-isl -S < %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<opt-isl>' 
-S < %s target datalayout = "e-p:32:32:32-i64:64:64-i32:32:32-i16:16:16-i1:32:32-f64:64:64-f32:32:32-a0:0-n32" define void @sdbout_label() nounwind { diff --git a/polly/test/ScheduleOptimizer/2013-04-11-Empty-Domain-two.ll b/polly/test/ScheduleOptimizer/2013-04-11-Empty-Domain-two.ll index 835986049899b..23033faa380af 100644 --- a/polly/test/ScheduleOptimizer/2013-04-11-Empty-Domain-two.ll +++ b/polly/test/ScheduleOptimizer/2013-04-11-Empty-Domain-two.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-opt-isl -S < %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<opt-isl>' -S < %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" ; Check that we handle statements with an empty iteration domain correctly. diff --git a/polly/test/ScheduleOptimizer/GreedyFuse/fuse-double.ll b/polly/test/ScheduleOptimizer/GreedyFuse/fuse-double.ll index 5e4ce8225a236..fdaed3c543673 100644 --- a/polly/test/ScheduleOptimizer/GreedyFuse/fuse-double.ll +++ b/polly/test/ScheduleOptimizer/GreedyFuse/fuse-double.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly -polly-reschedule=0 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s -; RUN: opt %loadNPMPolly -polly-reschedule=1 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly -polly-reschedule=0 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=polly-custom<opt-isl>' -polly-print-opt-isl -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly -polly-reschedule=1 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=polly-custom<opt-isl>' -polly-print-opt-isl -disable-output < %s | FileCheck %s define void @func(i32 %n, ptr noalias nonnull %A, ptr noalias nonnull %B) { entry: diff --git a/polly/test/ScheduleOptimizer/GreedyFuse/fuse-except-first.ll b/polly/test/ScheduleOptimizer/GreedyFuse/fuse-except-first.ll index de4c387a1d879..65d495722c2bd 100644 --- a/polly/test/ScheduleOptimizer/GreedyFuse/fuse-except-first.ll +++ b/polly/test/ScheduleOptimizer/GreedyFuse/fuse-except-first.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly -polly-reschedule=0 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s --check-prefixes=CHECK,RAW -; RUN: opt %loadNPMPolly -polly-reschedule=1 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s --check-prefixes=CHECK,OPT +; RUN: opt %loadNPMPolly -polly-reschedule=0 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=polly-custom<opt-isl>' -polly-print-opt-isl -disable-output < %s | FileCheck %s --check-prefixes=CHECK,RAW +; RUN: opt %loadNPMPolly -polly-reschedule=1 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=polly-custom<opt-isl>' -polly-print-opt-isl -disable-output < %s | FileCheck %s --check-prefixes=CHECK,OPT define void @func(i32 %n, ptr noalias nonnull %A, ptr noalias nonnull %B, i32 %k) { entry: diff --git a/polly/test/ScheduleOptimizer/GreedyFuse/fuse-except-third.ll b/polly/test/ScheduleOptimizer/GreedyFuse/fuse-except-third.ll index 91bd549c3c7e4..06d55f46a977f 100644 --- a/polly/test/ScheduleOptimizer/GreedyFuse/fuse-except-third.ll +++ b/polly/test/ScheduleOptimizer/GreedyFuse/fuse-except-third.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly -polly-reschedule=0 -polly-loopfusion-greedy=1 -polly-postopts=0 
'-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s --check-prefixes=CHECK,RAW -; RUN: opt %loadNPMPolly -polly-reschedule=1 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s --check-prefixes=CHECK +; RUN: opt %loadNPMPolly -polly-reschedule=0 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=polly-custom<opt-isl>' -polly-print-opt-isl -disable-output < %s | FileCheck %s --check-prefixes=CHECK,RAW +; RUN: opt %loadNPMPolly -polly-reschedule=1 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=polly-custom<opt-isl>' -polly-print-opt-isl -disable-output < %s | FileCheck %s --check-prefixes=CHECK define void @func(i32 %n, ptr noalias nonnull %A, ptr noalias nonnull %B, i32 %k) { entry: diff --git a/polly/test/ScheduleOptimizer/GreedyFuse/fuse-inner-carried.ll b/polly/test/ScheduleOptimizer/GreedyFuse/fuse-inner-carried.ll index 8b69d9e12c0fe..0af703ccf5ffe 100644 --- a/polly/test/ScheduleOptimizer/GreedyFuse/fuse-inner-carried.ll +++ b/polly/test/ScheduleOptimizer/GreedyFuse/fuse-inner-carried.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly -polly-reschedule=0 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s --check-prefixes=CHECK,RAW -; RUN: opt %loadNPMPolly -polly-reschedule=1 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s --check-prefixes=CHECK,OPT +; RUN: opt %loadNPMPolly -polly-reschedule=0 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=polly-custom<opt-isl>' -polly-print-opt-isl -disable-output < %s | FileCheck %s --check-prefixes=CHECK,RAW +; RUN: opt %loadNPMPolly -polly-reschedule=1 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=polly-custom<opt-isl>' -polly-print-opt-isl -disable-output < %s | FileCheck %s --check-prefixes=CHECK,OPT define void @func(i32 %n, ptr noalias nonnull %A) { entry: diff --git a/polly/test/ScheduleOptimizer/GreedyFuse/fuse-inner-third.ll b/polly/test/ScheduleOptimizer/GreedyFuse/fuse-inner-third.ll index 49d1124740340..ca6840b900e7f 100644 --- a/polly/test/ScheduleOptimizer/GreedyFuse/fuse-inner-third.ll +++ b/polly/test/ScheduleOptimizer/GreedyFuse/fuse-inner-third.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly -polly-reschedule=0 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s --check-prefixes=CHECK,RAW -; RUN: opt %loadNPMPolly -polly-reschedule=1 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s --check-prefixes=CHECK +; RUN: opt %loadNPMPolly -polly-reschedule=0 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=polly-custom<opt-isl>' -polly-print-opt-isl -disable-output < %s | FileCheck %s --check-prefixes=CHECK,RAW +; RUN: opt %loadNPMPolly -polly-reschedule=1 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=polly-custom<opt-isl>' -polly-print-opt-isl -disable-output < %s | FileCheck %s --check-prefixes=CHECK define void @func(i32 %n, ptr noalias nonnull %A, ptr noalias nonnull %B, i32 %k) { entry: diff --git a/polly/test/ScheduleOptimizer/GreedyFuse/fuse-inner.ll b/polly/test/ScheduleOptimizer/GreedyFuse/fuse-inner.ll index a449a2fda9ba3..f96e4baba71eb 100644 --- a/polly/test/ScheduleOptimizer/GreedyFuse/fuse-inner.ll +++ b/polly/test/ScheduleOptimizer/GreedyFuse/fuse-inner.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly -polly-reschedule=0 -polly-loopfusion-greedy=1 -polly-postopts=0 
'-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s -; RUN: opt %loadNPMPolly -polly-reschedule=1 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly -polly-reschedule=0 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=polly-custom<opt-isl>' -polly-print-opt-isl -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly -polly-reschedule=1 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=polly-custom<opt-isl>' -polly-print-opt-isl -disable-output < %s | FileCheck %s define void @func(i32 %n, ptr noalias nonnull %A) { entry: diff --git a/polly/test/ScheduleOptimizer/GreedyFuse/fuse-simple.ll b/polly/test/ScheduleOptimizer/GreedyFuse/fuse-simple.ll index 798e9b9a7c14f..229d13aaf1a4d 100644 --- a/polly/test/ScheduleOptimizer/GreedyFuse/fuse-simple.ll +++ b/polly/test/ScheduleOptimizer/GreedyFuse/fuse-simple.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly -polly-reschedule=0 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s -; RUN: opt %loadNPMPolly -polly-reschedule=1 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly -polly-reschedule=0 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=polly-custom<opt-isl>' -polly-print-opt-isl -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly -polly-reschedule=1 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=polly-custom<opt-isl>' -polly-print-opt-isl -disable-output < %s | FileCheck %s define void @func(i32 %n, ptr noalias nonnull %A) { entry: diff --git a/polly/test/ScheduleOptimizer/GreedyFuse/nofuse-simple.ll b/polly/test/ScheduleOptimizer/GreedyFuse/nofuse-simple.ll index 4d0ccc988a5cc..9bc9a25ac588e 100644 --- a/polly/test/ScheduleOptimizer/GreedyFuse/nofuse-simple.ll +++ b/polly/test/ScheduleOptimizer/GreedyFuse/nofuse-simple.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly -polly-reschedule=0 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s -; RUN: opt %loadNPMPolly -polly-reschedule=1 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly -polly-reschedule=0 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=polly-custom<opt-isl>' -polly-print-opt-isl -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly -polly-reschedule=1 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=polly-custom<opt-isl>' -polly-print-opt-isl -disable-output < %s | FileCheck %s ; This could theoretically be fused by adjusting the offset of the second loop by %k (instead of relying on schedule dimensions). 
diff --git a/polly/test/ScheduleOptimizer/GreedyFuse/nofuse-with-middle.ll b/polly/test/ScheduleOptimizer/GreedyFuse/nofuse-with-middle.ll index bf470b91a7022..5b0cefbe686f6 100644 --- a/polly/test/ScheduleOptimizer/GreedyFuse/nofuse-with-middle.ll +++ b/polly/test/ScheduleOptimizer/GreedyFuse/nofuse-with-middle.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly -polly-reschedule=0 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s -; RUN: opt %loadNPMPolly -polly-reschedule=1 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly -polly-reschedule=0 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=polly-custom<opt-isl>' -polly-print-opt-isl -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly -polly-reschedule=1 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=polly-custom<opt-isl>' -polly-print-opt-isl -disable-output < %s | FileCheck %s define void @func(i32 %n, ptr noalias nonnull %A, ptr noalias nonnull %B, i32 %k) { entry: diff --git a/polly/test/ScheduleOptimizer/ManualOptimization/disable_nonforced.ll b/polly/test/ScheduleOptimizer/ManualOptimization/disable_nonforced.ll index b0f75dd50ef83..2225f05f6717d 100644 --- a/polly/test/ScheduleOptimizer/ManualOptimization/disable_nonforced.ll +++ b/polly/test/ScheduleOptimizer/ManualOptimization/disable_nonforced.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly '-passes=polly-custom<opt-isl>' -polly-print-opt-isl -disable-output < %s | FileCheck %s -match-full-lines ; ; Check that the disable_nonforced metadata is honored; optimization ; heuristics/rescheduling must not be applied. 
diff --git a/polly/test/ScheduleOptimizer/ManualOptimization/distribute_heuristic.ll b/polly/test/ScheduleOptimizer/ManualOptimization/distribute_heuristic.ll index 900360d7533f8..4add219214aa3 100644 --- a/polly/test/ScheduleOptimizer/ManualOptimization/distribute_heuristic.ll +++ b/polly/test/ScheduleOptimizer/ManualOptimization/distribute_heuristic.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly -polly-reschedule=0 -polly-pragma-based-opts=1 '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s --match-full-lines --check-prefix=ON -; RUN: opt %loadNPMPolly -polly-reschedule=0 -polly-pragma-based-opts=0 '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s --match-full-lines --check-prefix=OFF +; RUN: opt %loadNPMPolly -polly-reschedule=0 -polly-pragma-based-opts=1 '-passes=polly-custom<opt-isl>' -polly-print-opt-isl -disable-output < %s | FileCheck %s --match-full-lines --check-prefix=ON +; RUN: opt %loadNPMPolly -polly-reschedule=0 -polly-pragma-based-opts=0 '-passes=polly-custom<opt-isl>' -polly-print-opt-isl -disable-output < %s | FileCheck %s --match-full-lines --check-prefix=OFF ; define void @func(i32 %n, ptr noalias nonnull %A, ptr noalias nonnull %B) { entry: diff --git a/polly/test/ScheduleOptimizer/ManualOptimization/distribute_illegal_looploc.ll b/polly/test/ScheduleOptimizer/ManualOptimization/distribute_illegal_looploc.ll index d45b62433dbbc..d59f9e58e2785 100644 --- a/polly/test/ScheduleOptimizer/ManualOptimization/distribute_illegal_looploc.ll +++ b/polly/test/ScheduleOptimizer/ManualOptimization/distribute_illegal_looploc.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-opt-isl -polly-reschedule=0 -polly-pragma-based-opts=1 -disable-output < %s 2>&1 | FileCheck %s --match-full-lines +; RUN: opt %loadNPMPolly '-passes=polly-custom<opt-isl>' -polly-reschedule=0 -polly-pragma-based-opts=1 -disable-output < %s 2>&1 | FileCheck %s --match-full-lines ; ; CHECK: warning: distribute_illegal.c:2:3: not applying loop fission/distribution: cannot ensure semantic equivalence due to possible dependency violations ; diff --git a/polly/test/ScheduleOptimizer/ManualOptimization/distribute_illegal_pragmaloc.ll b/polly/test/ScheduleOptimizer/ManualOptimization/distribute_illegal_pragmaloc.ll index d835e66693fb4..a1caaf5db5a61 100644 --- a/polly/test/ScheduleOptimizer/ManualOptimization/distribute_illegal_pragmaloc.ll +++ b/polly/test/ScheduleOptimizer/ManualOptimization/distribute_illegal_pragmaloc.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-opt-isl -polly-reschedule=0 -polly-pragma-based-opts=1 -disable-output < %s 2>&1 | FileCheck %s --match-full-lines +; RUN: opt %loadNPMPolly '-passes=polly-custom<opt-isl>' -polly-reschedule=0 -polly-pragma-based-opts=1 -disable-output < %s 2>&1 | FileCheck %s --match-full-lines ; ; CHECK: warning: distribute_illegal.c:1:42: not applying loop fission/distribution: cannot ensure semantic equivalence due to possible dependency violations ; diff --git a/polly/test/ScheduleOptimizer/ManualOptimization/unroll_disable.ll b/polly/test/ScheduleOptimizer/ManualOptimization/unroll_disable.ll index a5781a7f60365..b05710203fd37 100644 --- a/polly/test/ScheduleOptimizer/ManualOptimization/unroll_disable.ll +++ b/polly/test/ScheduleOptimizer/ManualOptimization/unroll_disable.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-pragma-based-opts=1 '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s --match-full-lines +; RUN: opt %loadNPMPolly -polly-pragma-based-opts=1 '-passes=polly-custom<opt-isl>' 
-polly-print-opt-isl -disable-output < %s | FileCheck %s --match-full-lines ; ; Override unroll metadata with llvm.loop.unroll.disable. ; diff --git a/polly/test/ScheduleOptimizer/ManualOptimization/unroll_double.ll b/polly/test/ScheduleOptimizer/ManualOptimization/unroll_double.ll index cccf136a1c4ac..8992bc942646e 100644 --- a/polly/test/ScheduleOptimizer/ManualOptimization/unroll_double.ll +++ b/polly/test/ScheduleOptimizer/ManualOptimization/unroll_double.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s --match-full-lines +; RUN: opt %loadNPMPolly '-passes=polly-custom<opt-isl>' -polly-print-opt-isl -disable-output < %s | FileCheck %s --match-full-lines ; ; Apply two loop transformations. First partial, then full unrolling. ; diff --git a/polly/test/ScheduleOptimizer/ManualOptimization/unroll_full.ll b/polly/test/ScheduleOptimizer/ManualOptimization/unroll_full.ll index 4d499078a4364..7bea96f791a80 100644 --- a/polly/test/ScheduleOptimizer/ManualOptimization/unroll_full.ll +++ b/polly/test/ScheduleOptimizer/ManualOptimization/unroll_full.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s --match-full-lines +; RUN: opt %loadNPMPolly '-passes=polly-custom<opt-isl>' -polly-print-opt-isl -disable-output < %s | FileCheck %s --match-full-lines ; ; Full unroll of a loop with 5 iterations. ; diff --git a/polly/test/ScheduleOptimizer/ManualOptimization/unroll_heuristic.ll b/polly/test/ScheduleOptimizer/ManualOptimization/unroll_heuristic.ll index d67472ab86936..34a6f486e646c 100644 --- a/polly/test/ScheduleOptimizer/ManualOptimization/unroll_heuristic.ll +++ b/polly/test/ScheduleOptimizer/ManualOptimization/unroll_heuristic.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly -polly-pragma-based-opts=1 '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s --match-full-lines -; RUN: opt %loadNPMPolly -polly-pragma-based-opts=0 '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s --match-full-lines +; RUN: opt %loadNPMPolly -polly-pragma-based-opts=1 '-passes=polly-custom<opt-isl>' -polly-print-opt-isl -disable-output < %s | FileCheck %s --match-full-lines +; RUN: opt %loadNPMPolly -polly-pragma-based-opts=0 '-passes=polly-custom<opt-isl>' -polly-print-opt-isl -disable-output < %s | FileCheck %s --match-full-lines ; ; Unrolling with heuristic factor. ; Currently not supported and expected to be handled by LLVM's unroll pass. diff --git a/polly/test/ScheduleOptimizer/ManualOptimization/unroll_partial.ll b/polly/test/ScheduleOptimizer/ManualOptimization/unroll_partial.ll index 90101b4fde390..ce2281372a20d 100644 --- a/polly/test/ScheduleOptimizer/ManualOptimization/unroll_partial.ll +++ b/polly/test/ScheduleOptimizer/ManualOptimization/unroll_partial.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly -polly-pragma-based-opts=1 '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s --match-full-lines -; RUN: opt %loadNPMPolly -polly-pragma-based-opts=0 '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s --check-prefix=OFF --match-full-lines +; RUN: opt %loadNPMPolly -polly-pragma-based-opts=1 '-passes=polly-custom<opt-isl>' -polly-print-opt-isl -disable-output < %s | FileCheck %s --match-full-lines +; RUN: opt %loadNPMPolly -polly-pragma-based-opts=0 '-passes=polly-custom<opt-isl>' -polly-print-opt-isl -disable-output < %s | FileCheck %s --check-prefix=OFF --match-full-lines ; ; Partial unroll by a factor of 4. 
 ;
diff --git a/polly/test/ScheduleOptimizer/ManualOptimization/unroll_partial_followup.ll b/polly/test/ScheduleOptimizer/ManualOptimization/unroll_partial_followup.ll
index 4cfa3fb911515..f6810ba6c48fb 100644
--- a/polly/test/ScheduleOptimizer/ManualOptimization/unroll_partial_followup.ll
+++ b/polly/test/ScheduleOptimizer/ManualOptimization/unroll_partial_followup.ll
@@ -1,6 +1,6 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s --check-prefix=OPT --match-full-lines
-; RUN: opt %loadNPMPolly '-passes=polly-opt-isl,print<polly-ast>' -disable-output < %s | FileCheck %s --check-prefix=AST --match-full-lines
-; RUN: opt %loadNPMPolly '-passes=scop(polly-opt-isl,polly-codegen),simplifycfg' -S < %s | FileCheck %s --check-prefix=CODEGEN
+; RUN: opt %loadNPMPolly '-passes=polly-custom<opt-isl>' -polly-print-opt-isl -disable-output < %s | FileCheck %s --check-prefix=OPT --match-full-lines
+; RUN: opt %loadNPMPolly '-passes=polly-custom<opt-isl;ast>' -polly-print-ast -disable-output < %s | FileCheck %s --check-prefix=AST --match-full-lines
+; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts;opt-isl>' -S < %s | FileCheck %s --check-prefix=CODEGEN
 ;
 ; Partial unroll by a factor of 4.
 ;
@@ -54,6 +54,6 @@ return:
 ; AST-NEXT:   for (int c0 = 0; c0 < n; c0 += 4) {

-; CODEGEN: br i1 %polly.loop_cond, label %polly.loop_header, label %polly.exiting, !llvm.loop ![[LOOPID:[0-9]+]]
+; CODEGEN: br i1 %polly.loop_cond, label %polly.loop_header, label %polly.loop_exit, !llvm.loop ![[LOOPID:[0-9]+]]
 ; CODEGEN: ![[LOOPID]] = distinct !{![[LOOPID]], ![[LOOPNAME:[0-9]+]]}
 ; CODEGEN: ![[LOOPNAME]] = !{!"llvm.loop.id", !"This-is-the-unrolled-loop"}
diff --git a/polly/test/ScheduleOptimizer/SIMDInParallelFor.ll b/polly/test/ScheduleOptimizer/SIMDInParallelFor.ll
index 3f6f50e34775d..b03d475dd42ee 100644
--- a/polly/test/ScheduleOptimizer/SIMDInParallelFor.ll
+++ b/polly/test/ScheduleOptimizer/SIMDInParallelFor.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-parallel -polly-vectorizer=stripmine -passes=polly-codegen-verify '-passes=polly-opt-isl,print<polly-ast>,polly-codegen' -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-parallel -polly-vectorizer=stripmine -passes=polly-codegen-verify '-passes=polly-custom<opt-isl;ast;codegen>' -polly-print-ast -disable-output < %s | FileCheck %s
 ;
 ; Check that there are no nested #pragma omp parallel for inside a
 ; #pragma omp parallel for loop.
diff --git a/polly/test/ScheduleOptimizer/computeout.ll b/polly/test/ScheduleOptimizer/computeout.ll
index a3286b481ffb3..6f34f4efc0a6d 100644
--- a/polly/test/ScheduleOptimizer/computeout.ll
+++ b/polly/test/ScheduleOptimizer/computeout.ll
@@ -1,5 +1,5 @@
-; RUN: opt -S %loadNPMPolly "-passes=scop(polly-opt-isl,print<polly-ast>)" -polly-isl-arg=--no-schedule-serialize-sccs -disable-output < %s | FileCheck %s
-; RUN: opt -S %loadNPMPolly "-passes=scop(polly-opt-isl,print<polly-ast>)" -polly-isl-arg=--no-schedule-serialize-sccs -polly-dependences-computeout=1 -disable-output < %s | FileCheck %s -check-prefix=TIMEOUT
+; RUN: opt -S %loadNPMPolly '-passes=polly-custom<opt-isl;ast>' -polly-print-ast -polly-isl-arg=--no-schedule-serialize-sccs -disable-output < %s | FileCheck %s
+; RUN: opt -S %loadNPMPolly '-passes=polly-custom<opt-isl;ast>' -polly-print-ast -polly-isl-arg=--no-schedule-serialize-sccs -polly-dependences-computeout=1 -disable-output < %s | FileCheck %s -check-prefix=TIMEOUT
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"

 ; for(i = 0; i < 100; i++ )
diff --git a/polly/test/ScheduleOptimizer/ensure-correct-tile-sizes.ll b/polly/test/ScheduleOptimizer/ensure-correct-tile-sizes.ll
index 928ee858ae6d2..4be0b948d09a0 100644
--- a/polly/test/ScheduleOptimizer/ensure-correct-tile-sizes.ll
+++ b/polly/test/ScheduleOptimizer/ensure-correct-tile-sizes.ll
@@ -1,9 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-process-unprofitable -polly-remarks-minimal \
-; RUN: '-passes=polly-opt-isl,print<polly-ast>' -polly-pattern-matching-based-opts=true \
-; RUN: -polly-target-throughput-vector-fma=1 \
-; RUN: -polly-target-latency-vector-fma=1 \
-; RUN: -polly-target-vector-register-bitwidth=4096 \
-; RUN: -polly-target-1st-cache-level-associativity=3 -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-process-unprofitable -polly-remarks-minimal '-passes=polly-custom<opt-isl;ast>' -polly-print-ast -polly-pattern-matching-based-opts=true -polly-target-throughput-vector-fma=1 -polly-target-latency-vector-fma=1 -polly-target-vector-register-bitwidth=4096 -polly-target-1st-cache-level-associativity=3 -disable-output < %s | FileCheck %s
 ;
 ; /* Test that Polly does not crash due to configurations that can lead to
 ;    incorrect tile size computations.
diff --git a/polly/test/ScheduleOptimizer/focaltech_test_detail_threshold-7bc17e.ll b/polly/test/ScheduleOptimizer/focaltech_test_detail_threshold-7bc17e.ll
index b533cb870bdcb..548a8aa94afbf 100644
--- a/polly/test/ScheduleOptimizer/focaltech_test_detail_threshold-7bc17e.ll
+++ b/polly/test/ScheduleOptimizer/focaltech_test_detail_threshold-7bc17e.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly "-passes=scop(print<polly-opt-isl>)" -polly-vectorizer=stripmine -polly-invariant-load-hoisting -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<opt-isl>' -polly-print-opt-isl -polly-vectorizer=stripmine -polly-invariant-load-hoisting -disable-output < %s | FileCheck %s
 ;
 ; llvm.org/PR46578
 ;
diff --git a/polly/test/ScheduleOptimizer/full_partial_tile_separation.ll b/polly/test/ScheduleOptimizer/full_partial_tile_separation.ll
index 3dd579ed736f7..6de5e3a606aa3 100644
--- a/polly/test/ScheduleOptimizer/full_partial_tile_separation.ll
+++ b/polly/test/ScheduleOptimizer/full_partial_tile_separation.ll
@@ -1,4 +1,4 @@
-; RUN: opt -S %loadNPMPolly -polly-pattern-matching-based-opts=false -polly-vectorizer=stripmine '-passes=polly-opt-isl,print<polly-ast>' -disable-output < %s | FileCheck %s
+; RUN: opt -S %loadNPMPolly -polly-pattern-matching-based-opts=false -polly-vectorizer=stripmine '-passes=polly-custom<opt-isl;ast>' -polly-print-ast -disable-output < %s | FileCheck %s
 ; CHECK: // 1st level tiling - Tiles
 ; CHECK-NEXT: #pragma known-parallel
 ; CHECK-NEXT: for (int c0 = 0; c0 <= floord(ni - 1, 32); c0 += 1)
diff --git a/polly/test/ScheduleOptimizer/line-tiling-2.ll b/polly/test/ScheduleOptimizer/line-tiling-2.ll
index 3a2c566d19d3d..6256adfcd6917 100644
--- a/polly/test/ScheduleOptimizer/line-tiling-2.ll
+++ b/polly/test/ScheduleOptimizer/line-tiling-2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-tile-sizes=1,64 '-passes=polly-opt-isl,print<polly-ast>' -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-tile-sizes=1,64 '-passes=polly-custom<opt-isl;ast>' -polly-print-ast -disable-output < %s | FileCheck %s

 ; CHECK: for (int c0 = 0; c0 <= 1023; c0 += 1)
 ; CHECK:   for (int c1 = 0; c1 <= 7; c1 += 1)
diff --git a/polly/test/ScheduleOptimizer/line-tiling.ll b/polly/test/ScheduleOptimizer/line-tiling.ll
index 0dbdeff4742b9..51e02594aa880 100644
--- a/polly/test/ScheduleOptimizer/line-tiling.ll
+++ b/polly/test/ScheduleOptimizer/line-tiling.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-tile-sizes=64,1 '-passes=polly-opt-isl,print<polly-ast>' -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-tile-sizes=64,1 '-passes=polly-custom<opt-isl;ast>' -polly-print-ast -disable-output < %s | FileCheck %s

 ; CHECK: for (int c0 = 0; c0 <= 15; c0 += 1)
 ; CHECK:   for (int c1 = 0; c1 <= 511; c1 += 1)
diff --git a/polly/test/ScheduleOptimizer/mat_mul_pattern_data_layout.ll b/polly/test/ScheduleOptimizer/mat_mul_pattern_data_layout.ll
index 8f270b94617fe..79deedc7cd830 100644
--- a/polly/test/ScheduleOptimizer/mat_mul_pattern_data_layout.ll
+++ b/polly/test/ScheduleOptimizer/mat_mul_pattern_data_layout.ll
@@ -1,13 +1,4 @@
-; RUN: opt %loadNPMPolly -passes=polly-opt-isl -polly-pattern-matching-based-opts=true \
-; RUN: -polly-target-throughput-vector-fma=1 \
-; RUN: -polly-target-latency-vector-fma=8 \
-; RUN: -polly-target-1st-cache-level-associativity=8 \
-; RUN: -polly-target-2nd-cache-level-associativity=8 \
-; RUN: -polly-target-1st-cache-level-size=32768 \
-; RUN: -polly-target-2nd-cache-level-size=262144 \
-; RUN: -polly-optimized-scops \
-; RUN: -polly-target-vector-register-bitwidth=256 \
-; RUN: -disable-output < %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<opt-isl>' -polly-pattern-matching-based-opts=true -polly-target-throughput-vector-fma=1 -polly-target-latency-vector-fma=8 -polly-target-1st-cache-level-associativity=8 -polly-target-2nd-cache-level-associativity=8 -polly-target-1st-cache-level-size=32768 -polly-target-2nd-cache-level-size=262144 -polly-optimized-scops -polly-target-vector-register-bitwidth=256 -disable-output < %s
 ;
 ; /* C := alpha*A*B + beta*C */
 ; for (i = 0; i < _PB_NI; i++)
diff --git a/polly/test/ScheduleOptimizer/mat_mul_pattern_data_layout_2.ll b/polly/test/ScheduleOptimizer/mat_mul_pattern_data_layout_2.ll
index de1c815f92350..e3ae1a02bd347 100644
--- a/polly/test/ScheduleOptimizer/mat_mul_pattern_data_layout_2.ll
+++ b/polly/test/ScheduleOptimizer/mat_mul_pattern_data_layout_2.ll
@@ -1,12 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-pattern-matching-based-opts=true \
-; RUN: -polly-target-throughput-vector-fma=1 \
-; RUN: -polly-target-latency-vector-fma=8 \
-; RUN: -polly-target-1st-cache-level-associativity=8 \
-; RUN: -polly-target-2nd-cache-level-associativity=8 \
-; RUN: -polly-target-1st-cache-level-size=32768 \
-; RUN: -polly-target-2nd-cache-level-size=262144 \
-; RUN: -polly-target-vector-register-bitwidth=256 \
-; RUN: '-passes=polly-opt-isl,print<polly-ast>' -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-pattern-matching-based-opts=true -polly-target-throughput-vector-fma=1 -polly-target-latency-vector-fma=8 -polly-target-1st-cache-level-associativity=8 -polly-target-2nd-cache-level-associativity=8 -polly-target-1st-cache-level-size=32768 -polly-target-2nd-cache-level-size=262144 -polly-target-vector-register-bitwidth=256 '-passes=polly-custom<opt-isl;ast>' -polly-print-ast -disable-output < %s | FileCheck %s
 ;
 ; /* C := alpha*A*B + beta*C */
 ; /* _PB_NK % Kc != 0 */
diff --git a/polly/test/ScheduleOptimizer/one-dimensional-band.ll b/polly/test/ScheduleOptimizer/one-dimensional-band.ll
index a097d4a43cfd2..f37f1e5119a9f 100644
--- a/polly/test/ScheduleOptimizer/one-dimensional-band.ll
+++ b/polly/test/ScheduleOptimizer/one-dimensional-band.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-opt-isl,print<polly-ast>' -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<opt-isl;ast>' -polly-print-ast -disable-output < %s | FileCheck %s
 ;
 ; void jacobi1d(long T, long N, float *A, float *B) {
 ;   long t, i, j;
diff --git a/polly/test/ScheduleOptimizer/outer_coincidence.ll b/polly/test/ScheduleOptimizer/outer_coincidence.ll
index 7c1af80c9ffae..e0a7a63cda80d 100644
--- a/polly/test/ScheduleOptimizer/outer_coincidence.ll
+++ b/polly/test/ScheduleOptimizer/outer_coincidence.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly -polly-tiling=0 -polly-parallel -polly-opt-outer-coincidence=no '-passes=polly-opt-isl,print<polly-ast>' -disable-output < %s | FileCheck %s
-; RUN: opt %loadNPMPolly -polly-tiling=0 -polly-parallel -polly-opt-outer-coincidence=yes '-passes=polly-opt-isl,print<polly-ast>' -disable-output < %s | FileCheck %s --check-prefix=OUTER
+; RUN: opt %loadNPMPolly -polly-tiling=0 -polly-parallel -polly-opt-outer-coincidence=no '-passes=polly-custom<opt-isl;ast>' -polly-print-ast -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-tiling=0 -polly-parallel -polly-opt-outer-coincidence=yes '-passes=polly-custom<opt-isl;ast>' -polly-print-ast -disable-output < %s | FileCheck %s --check-prefix=OUTER

 ; By skewing, the diagonal can be made parallel. ISL does this when
 ; the 'outer_coincidence' option is enabled.
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts-after-delicm.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts-after-delicm.ll
index a19b93d9915dd..84f1ca0dba652 100644
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts-after-delicm.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts-after-delicm.ll
@@ -1,8 +1,4 @@
-; RUN: opt %loadNPMPolly \
-; RUN: -polly-pattern-matching-based-opts=true \
-; RUN: '-passes=polly-optree,polly-delicm,polly-simplify,polly-opt-isl' \
-; RUN: -polly-tc-opt=true -debug -disable-output < %s 2>&1 \
-; RUN: | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-pattern-matching-based-opts=true '-passes=polly-custom<optree;delicm;simplify;opt-isl>' -polly-tc-opt=true -debug -disable-output < %s 2>&1 | FileCheck %s
 ; REQUIRES: asserts

 ; Check that the pattern matching detects the matrix multiplication pattern
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts-after-delicm_2.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts-after-delicm_2.ll
index 4ef0605a0ba75..72fb4f1b4e41c 100644
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts-after-delicm_2.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts-after-delicm_2.ll
@@ -1,6 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-delicm,polly-simplify,polly-opt-isl' \
-; RUN: -polly-pattern-matching-based-opts=true \
-; RUN: -polly-tc-opt=true -debug -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<delicm;simplify-1;opt-isl>' -polly-pattern-matching-based-opts=true -polly-tc-opt=true -debug -disable-output < %s 2>&1 | FileCheck %s
 ; REQUIRES: asserts
 ;
 ; Check that the pattern matching detects the tensor contraction pattern
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts.ll
index 09118e252233b..933b2d4d258e7 100644
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts.ll
@@ -1,8 +1,7 @@
-; RUN: opt %loadNPMPolly -passes=polly-opt-isl -polly-pattern-matching-based-opts=false \
-; RUN: -debug -polly-tc-opt -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly -passes=polly-opt-isl -polly-pattern-matching-based-opts=true -debug -polly-tc-opt -disable-output < %s 2>&1 | FileCheck %s --check-prefix=PATTERN-MATCHING-OPTS
-; RUN: opt %loadNPMPolly '-passes=polly-opt-isl,print<polly-ast>' -polly-pattern-matching-based-opts=true -polly-ast-detect-parallel -disable-output < %s | FileCheck %s --check-prefix=PARALLEL-AST
-; RUN: opt %loadNPMPolly -passes=polly-opt-isl -polly-pattern-matching-based-opts=true -stats -disable-output < %s 2>&1 | FileCheck %s --check-prefix=STATS -match-full-lines
+; RUN: opt %loadNPMPolly '-passes=polly-custom<opt-isl>' -polly-pattern-matching-based-opts=false -debug -polly-tc-opt -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<opt-isl>' -polly-pattern-matching-based-opts=true -debug -polly-tc-opt -disable-output < %s 2>&1 | FileCheck %s --check-prefix=PATTERN-MATCHING-OPTS
+; RUN: opt %loadNPMPolly '-passes=polly-custom<opt-isl;ast>' -polly-print-ast -polly-pattern-matching-based-opts=true -polly-ast-detect-parallel -disable-output < %s | FileCheck %s --check-prefix=PARALLEL-AST
+; RUN: opt %loadNPMPolly '-passes=polly-custom<opt-isl>' -polly-pattern-matching-based-opts=true -stats -disable-output < %s 2>&1 | FileCheck %s --check-prefix=STATS -match-full-lines
 ; REQUIRES: asserts
 ;
 ; /* C := alpha*A*B + beta*C */
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_11.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_11.ll
index b771d1f87537e..03e23038877e5 100644
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_11.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_11.ll
@@ -1,16 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,polly-opt-isl' \
-; RUN: -polly-import-jscop-postfix=transformed \
-; RUN: -polly-pattern-matching-based-opts=true \
-; RUN: -polly-target-throughput-vector-fma=1 \
-; RUN: -polly-target-latency-vector-fma=8 \
-; RUN: -polly-target-1st-cache-level-associativity=8 \
-; RUN: -polly-target-2nd-cache-level-associativity=8 \
-; RUN: -polly-target-1st-cache-level-size=32768 \
-; RUN: -polly-target-vector-register-bitwidth=256 \
-; RUN: -polly-target-2nd-cache-level-size=262144 \
-; RUN: -debug \
-; RUN: -polly-tc-opt=true -disable-output < %s 2>&1 \
-; RUN: | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;opt-isl>' -polly-import-jscop-postfix=transformed -polly-pattern-matching-based-opts=true -polly-target-throughput-vector-fma=1 -polly-target-latency-vector-fma=8 -polly-target-1st-cache-level-associativity=8 -polly-target-2nd-cache-level-associativity=8 -polly-target-1st-cache-level-size=32768 -polly-target-vector-register-bitwidth=256 -polly-target-2nd-cache-level-size=262144 -debug -polly-tc-opt=true -disable-output < %s 2>&1 | FileCheck %s
 ; REQUIRES: asserts
 ;
 ; Check that the pattern matching detects the matrix multiplication pattern
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_12.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_12.ll
index 238f6dd798e68..4e174e3c9723d 100644
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_12.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_12.ll
@@ -1,12 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-pattern-matching-based-opts=true \
-; RUN: -polly-target-throughput-vector-fma=1 \
-; RUN: -polly-target-latency-vector-fma=8 \
-; RUN: -polly-target-1st-cache-level-associativity=8 \
-; RUN: -polly-target-2nd-cache-level-associativity=8 \
-; RUN: -polly-target-1st-cache-level-size=32768 \
-; RUN: -polly-target-vector-register-bitwidth=256 \
-; RUN: -polly-target-2nd-cache-level-size=262144 \
-; RUN: -passes=polly-opt-isl -disable-output < %s
+; RUN: opt %loadNPMPolly -polly-pattern-matching-based-opts=true -polly-target-throughput-vector-fma=1 -polly-target-latency-vector-fma=8 -polly-target-1st-cache-level-associativity=8 -polly-target-2nd-cache-level-associativity=8 -polly-target-1st-cache-level-size=32768 -polly-target-vector-register-bitwidth=256 -polly-target-2nd-cache-level-size=262144 '-passes=polly-custom<opt-isl>' -disable-output < %s
 ;
 ; Test whether isolation works as expected.
 ;
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_13.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_13.ll
index 0e4540eb7ba3c..c3d8b6ed3fee5 100644
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_13.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_13.ll
@@ -1,12 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-pattern-matching-based-opts=true \
-; RUN: -polly-target-throughput-vector-fma=2 \
-; RUN: -polly-target-latency-vector-fma=8 \
-; RUN: -polly-target-1st-cache-level-associativity=8 \
-; RUN: -polly-target-2nd-cache-level-associativity=8 \
-; RUN: -polly-target-1st-cache-level-size=32768 \
-; RUN: -polly-target-vector-register-bitwidth=128 \
-; RUN: -polly-target-2nd-cache-level-size=262144 \
-; RUN: '-passes=polly-opt-isl,print<polly-ast>' -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-pattern-matching-based-opts=true -polly-target-throughput-vector-fma=2 -polly-target-latency-vector-fma=8 -polly-target-1st-cache-level-associativity=8 -polly-target-2nd-cache-level-associativity=8 -polly-target-1st-cache-level-size=32768 -polly-target-vector-register-bitwidth=128 -polly-target-2nd-cache-level-size=262144 '-passes=polly-custom<opt-isl;ast>' -polly-print-ast -disable-output < %s | FileCheck %s
 ;
 ; Test whether isolation works as expected.
 ;
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_14.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_14.ll
index 9678ad83ff048..3705c3fd27ed9 100644
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_14.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_14.ll
@@ -1,13 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,polly-opt-isl,polly-codegen' \
-; RUN: -polly-target-throughput-vector-fma=1 \
-; RUN: -polly-target-latency-vector-fma=8 \
-; RUN: -polly-target-1st-cache-level-associativity=8 \
-; RUN: -polly-target-2nd-cache-level-associativity=8 \
-; RUN: -polly-target-1st-cache-level-size=32768 \
-; RUN: -polly-target-vector-register-bitwidth=256 \
-; RUN: -polly-target-2nd-cache-level-size=262144 \
-; RUN: -polly-import-jscop-postfix=transformed -S < %s \
-; RUN: | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;opt-isl;ast;codegen>' -polly-target-throughput-vector-fma=1 -polly-target-latency-vector-fma=8 -polly-target-1st-cache-level-associativity=8 -polly-target-2nd-cache-level-associativity=8 -polly-target-1st-cache-level-size=32768 -polly-target-vector-register-bitwidth=256 -polly-target-2nd-cache-level-size=262144 -polly-import-jscop-postfix=transformed -S < %s | FileCheck %s
 ;
 ; Check that we disable the Loop Vectorizer.
 ;
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_15.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_15.ll
index e74884d59c311..7ada105828b27 100644
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_15.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_15.ll
@@ -1,6 +1,4 @@
-; RUN: opt %loadNPMPolly -passes=polly-opt-isl -polly-pattern-matching-based-opts=true \
-; RUN: -debug-only=polly-opt-isl -disable-output \
-; RUN: -polly-tc-opt=true < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<opt-isl>' -polly-pattern-matching-based-opts=true -debug-only=polly-opt-isl -disable-output -polly-tc-opt=true < %s 2>&1 | FileCheck %s
 ; REQUIRES: asserts
 ;
 ; for (i = 0; i < _PB_NI; i++)
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_16.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_16.ll
index 9c99a090b69e7..6647380b2d070 100644
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_16.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_16.ll
@@ -1,5 +1,4 @@
-; RUN: opt %loadNPMPolly -passes=polly-opt-isl -polly-pattern-matching-based-opts=true \
-; RUN: -polly-tc-opt=true -debug -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<opt-isl>' -polly-pattern-matching-based-opts=true -polly-tc-opt=true -debug -disable-output < %s 2>&1 | FileCheck %s
 ; REQUIRES: asserts
 ;
 ; for (i = 0; i < 1024; i++)
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_17.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_17.ll
index 8e14035ce8629..fba77d5e4f82d 100644
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_17.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_17.ll
@@ -1,5 +1,4 @@
-; RUN: opt %loadNPMPolly -passes=polly-opt-isl -polly-pattern-matching-based-opts=true \
-; RUN: -polly-tc-opt=true -debug -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<opt-isl>' -polly-pattern-matching-based-opts=true -polly-tc-opt=true -debug -disable-output < %s 2>&1 | FileCheck %s
 ; REQUIRES: asserts
 ;
 ; for (i = 0; i < 32; i++)
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_18.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_18.ll
index 4f562c306f96a..488436064ae83 100644
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_18.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_18.ll
@@ -1,5 +1,4 @@
-; RUN: opt %loadNPMPolly -passes=polly-opt-isl -polly-pattern-matching-based-opts=true \
-; RUN: -polly-tc-opt=true -debug -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<opt-isl>' -polly-pattern-matching-based-opts=true -polly-tc-opt=true -debug -disable-output < %s 2>&1 | FileCheck %s
 ; REQUIRES: asserts
 ;
 ; for (i = 0; i < 32; i++)
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_19.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_19.ll
index 32ded897d4ff9..c7a5d475bef31 100644
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_19.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_19.ll
@@ -1,5 +1,4 @@
-; RUN: opt %loadNPMPolly -passes=polly-opt-isl -polly-pattern-matching-based-opts=true \
-; RUN: -polly-tc-opt=true -debug -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<opt-isl>' -polly-pattern-matching-based-opts=true -polly-tc-opt=true -debug -disable-output < %s 2>&1 | FileCheck %s
 ; REQUIRES: asserts
 ;
 ; for (i = 0; i < 8; i++)
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_2.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_2.ll
index f0c0177da84b0..1dba8bece8072 100644
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_2.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_2.ll
@@ -1,5 +1,4 @@
-; RUN: opt %loadNPMPolly -passes=polly-opt-isl -polly-pattern-matching-based-opts=true \
-; RUN: -polly-tc-opt=true -debug -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<opt-isl>' -polly-pattern-matching-based-opts=true -polly-tc-opt=true -debug -disable-output < %s 2>&1 | FileCheck %s
 ; REQUIRES: asserts
 ;
 ; /* C := alpha*A*B + beta*C */
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_20.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_20.ll
index 155177bdfade0..3656a9457cef2 100644
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_20.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_20.ll
@@ -1,5 +1,4 @@
-; RUN: opt %loadNPMPolly -passes=polly-opt-isl -polly-pattern-matching-based-opts=true \
-; RUN: -polly-tc-opt=true -debug -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<opt-isl>' -polly-pattern-matching-based-opts=true -polly-tc-opt=true -debug -disable-output < %s 2>&1 | FileCheck %s
 ; REQUIRES: asserts
 ;
 ; for (i = 0; i < 16; i++)
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_21.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_21.ll
index 3d21ac3859a7e..bd0cb054957af 100644
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_21.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_21.ll
@@ -1,5 +1,4 @@
-; RUN: opt %loadNPMPolly -passes=polly-opt-isl -polly-pattern-matching-based-opts=true \
-; RUN: -polly-tc-opt=true -debug -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<opt-isl>' -polly-pattern-matching-based-opts=true -polly-tc-opt=true -debug -disable-output < %s 2>&1 | FileCheck %s
 ; REQUIRES: asserts
 ;
 ; for (int i = 0; i < 32; i++)
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_22.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_22.ll
index 00a4bf885aef8..6e6788be2973f 100644
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_22.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_22.ll
@@ -1,5 +1,4 @@
-; RUN: opt %loadNPMPolly -passes=polly-opt-isl -polly-pattern-matching-based-opts=true \
-; RUN: -polly-tc-opt=true -debug -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<opt-isl>' -polly-pattern-matching-based-opts=true -polly-tc-opt=true -debug -disable-output < %s 2>&1 | FileCheck %s
 ; REQUIRES: asserts
 ;
 ; for (int i = 0; i < 32; i++)
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_24.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_24.ll
index bfe5c5249a3a8..82356ae0a398d 100644
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_24.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_24.ll
@@ -1,6 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-reschedule=0 -passes=polly-opt-isl \
-; RUN: -polly-pattern-matching-based-opts=true -polly-tc-opt=true \
-; RUN: -debug -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-reschedule=0 '-passes=polly-custom<opt-isl>' -polly-pattern-matching-based-opts=true -polly-tc-opt=true -debug -disable-output < %s 2>&1 | FileCheck %s
 ; REQUIRES: asserts
 ;
 ; for (i = 0; i < 1024; i++)
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_25.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_25.ll
index a2e1ced3e6320..ea28bb8c0bdb6 100644
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_25.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_25.ll
@@ -1,5 +1,4 @@
-; RUN: opt %loadNPMPolly -passes=polly-opt-isl -polly-pattern-matching-based-opts=true \
-; RUN: -polly-tc-opt=true -debug -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<opt-isl>' -polly-pattern-matching-based-opts=true -polly-tc-opt=true -debug -disable-output < %s 2>&1 | FileCheck %s
 ; REQUIRES: asserts
 ;
 ; for (int i = 0; i < 32; i++)
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_3.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_3.ll
index 9844d377e609d..f80d63cd4d66c 100644
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_3.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_3.ll
@@ -1,19 +1,6 @@
-; RUN: opt %loadNPMPolly -polly-pattern-matching-based-opts=true \
-; RUN: -polly-target-throughput-vector-fma=1 \
-; RUN: -polly-target-latency-vector-fma=8 \
-; RUN: -polly-target-1st-cache-level-size=0 \
-; RUN: -polly-target-vector-register-bitwidth=256 \
-; RUN: '-passes=polly-opt-isl,print<polly-ast>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-pattern-matching-based-opts=true -polly-target-throughput-vector-fma=1 -polly-target-latency-vector-fma=8 -polly-target-1st-cache-level-size=0 -polly-target-vector-register-bitwidth=256 '-passes=polly-custom<opt-isl;ast>' -polly-print-ast -disable-output < %s 2>&1 | FileCheck %s

-; RUN: opt %loadNPMPolly -polly-pattern-matching-based-opts=true \
-; RUN: -polly-target-throughput-vector-fma=1 \
-; RUN: -polly-target-latency-vector-fma=8 \
-; RUN: -polly-target-1st-cache-level-associativity=8 \
-; RUN: -polly-target-2nd-cache-level-associativity=8 \
-; RUN: -polly-target-1st-cache-level-size=32768 \
-; RUN: -polly-target-vector-register-bitwidth=256 \
-; RUN: -polly-target-2nd-cache-level-size=262144 \
-; RUN: '-passes=polly-opt-isl,print<polly-ast>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=EXTRACTION-OF-MACRO-KERNEL
+; RUN: opt %loadNPMPolly -polly-pattern-matching-based-opts=true -polly-target-throughput-vector-fma=1 -polly-target-latency-vector-fma=8 -polly-target-1st-cache-level-associativity=8 -polly-target-2nd-cache-level-associativity=8 -polly-target-1st-cache-level-size=32768 -polly-target-vector-register-bitwidth=256 -polly-target-2nd-cache-level-size=262144 '-passes=polly-custom<opt-isl;ast>' -polly-print-ast -disable-output < %s 2>&1 | FileCheck %s --check-prefix=EXTRACTION-OF-MACRO-KERNEL
 ;
 ; /* C := alpha*A*B + beta*C */
 ; for (i = 0; i < _PB_NI; i++)
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_4.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_4.ll
index 250641d57bac5..100b17e2ccd21 100644
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_4.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_4.ll
@@ -1,13 +1,5 @@
-; RUN: opt %loadNPMPolly -passes=polly-opt-isl -polly-pattern-matching-based-opts=true \
-; RUN: -debug -polly-tc-opt=true -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly '-passes=polly-opt-isl,print<polly-ast>' -polly-pattern-matching-based-opts=true \
-; RUN: -polly-target-throughput-vector-fma=1 \
-; RUN: -polly-target-latency-vector-fma=8 \
-; RUN: -polly-target-1st-cache-level-size=32768 \
-; RUN: -polly-target-vector-register-bitwidth=256 \
-; RUN: -polly-target-2nd-cache-level-size=262144 \
-; RUN: -polly-tc-opt=true -disable-output < %s | \
-; RUN: FileCheck %s --check-prefix=PATTERN-MATCHING-OPTS
+; RUN: opt %loadNPMPolly '-passes=polly-custom<opt-isl>' -polly-pattern-matching-based-opts=true -debug -polly-tc-opt=true -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<opt-isl;ast>' -polly-print-ast -polly-pattern-matching-based-opts=true -polly-target-throughput-vector-fma=1 -polly-target-latency-vector-fma=8 -polly-target-1st-cache-level-size=32768 -polly-target-vector-register-bitwidth=256 -polly-target-2nd-cache-level-size=262144 -polly-tc-opt=true -disable-output < %s | FileCheck %s --check-prefix=PATTERN-MATCHING-OPTS
 ; REQUIRES: asserts
 ;
 ; C := A * B + C
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_5.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_5.ll
index ad2c195ba1e8e..050af1b2377d3 100644
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_5.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_5.ll
@@ -1,12 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-pattern-matching-based-opts=true \
-; RUN: -polly-target-throughput-vector-fma=1 \
-; RUN: -polly-target-latency-vector-fma=8 \
-; RUN: -polly-target-1st-cache-level-associativity=8 \
-; RUN: -polly-target-2nd-cache-level-associativity=8 \
-; RUN: -polly-target-1st-cache-level-size=32768 \
-; RUN: -polly-target-vector-register-bitwidth=256 \
-; RUN: -polly-target-2nd-cache-level-size=262144 \
-; RUN: '-passes=polly-opt-isl,print<polly-ast>' -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-pattern-matching-based-opts=true -polly-target-throughput-vector-fma=1 -polly-target-latency-vector-fma=8 -polly-target-1st-cache-level-associativity=8 -polly-target-2nd-cache-level-associativity=8 -polly-target-1st-cache-level-size=32768 -polly-target-vector-register-bitwidth=256 -polly-target-2nd-cache-level-size=262144 '-passes=polly-custom<opt-isl;ast>' -polly-print-ast -disable-output < %s | FileCheck %s
 ;
 ; opt %loadNPMPolly -passes=polly-opt-isl -polly-pattern-matching-based-opts=true \
 ;    -polly-target-throughput-vector-fma=1 \
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_6.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_6.ll
index 1d3cdbdbfdd85..ba1ddfef6a4e4 100644
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_6.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_6.ll
@@ -1,12 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-pattern-matching-based-opts=true \
-; RUN: -polly-target-throughput-vector-fma=1 \
-; RUN: -polly-target-latency-vector-fma=8 \
-; RUN: -polly-target-1st-cache-level-associativity=8 \
-; RUN: -polly-target-2nd-cache-level-associativity=8 \
-; RUN: -polly-target-1st-cache-level-size=32768 \
-; RUN: -polly-target-vector-register-bitwidth=256 \
-; RUN: -polly-target-2nd-cache-level-size=262144 \
-; RUN: '-passes=polly-opt-isl,print<polly-ast>' -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-pattern-matching-based-opts=true -polly-target-throughput-vector-fma=1 -polly-target-latency-vector-fma=8 -polly-target-1st-cache-level-associativity=8 -polly-target-2nd-cache-level-associativity=8 -polly-target-1st-cache-level-size=32768 -polly-target-vector-register-bitwidth=256 -polly-target-2nd-cache-level-size=262144 '-passes=polly-custom<opt-isl;ast>' -polly-print-ast -disable-output < %s | FileCheck %s
 ;
 ; opt %loadNPMPolly -passes=polly-opt-isl -polly-pattern-matching-based-opts=true \
 ;    -polly-target-throughput-vector-fma=1 \
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_7.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_7.ll
index 59eaa4a0928e9..e50b3a0a3f2b0 100644
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_7.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_7.ll
@@ -1,12 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-pattern-matching-based-opts=true \
-; RUN: -polly-target-throughput-vector-fma=1 \
-; RUN: -polly-target-latency-vector-fma=8 \
-; RUN: -polly-target-1st-cache-level-associativity=8 \
-; RUN: -polly-target-2nd-cache-level-associativity=8 \
-; RUN: -polly-target-1st-cache-level-size=32768 \
-; RUN: -polly-target-vector-register-bitwidth=256 \
-; RUN: -polly-target-2nd-cache-level-size=262144 \
-; RUN: '-passes=polly-opt-isl,print<polly-ast>' -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-pattern-matching-based-opts=true -polly-target-throughput-vector-fma=1 -polly-target-latency-vector-fma=8 -polly-target-1st-cache-level-associativity=8 -polly-target-2nd-cache-level-associativity=8 -polly-target-1st-cache-level-size=32768 -polly-target-vector-register-bitwidth=256 -polly-target-2nd-cache-level-size=262144 '-passes=polly-custom<opt-isl;ast>' -polly-print-ast -disable-output < %s | FileCheck %s
 ;
 ; /* C := A * B + C */
 ; /* Elements of the matrices A, B, C have the float type. */
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_8.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_8.ll
index 2544d502a2dc5..3f57fe8cf6c73 100644
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_8.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_8.ll
@@ -1,12 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-pattern-matching-based-opts=true \
-; RUN: -polly-target-throughput-vector-fma=1 \
-; RUN: -polly-target-latency-vector-fma=8 \
-; RUN: -polly-target-1st-cache-level-associativity=8 \
-; RUN: -polly-target-2nd-cache-level-associativity=8 \
-; RUN: -polly-target-1st-cache-level-size=32768 \
-; RUN: -polly-target-vector-register-bitwidth=256 \
-; RUN: -polly-target-2nd-cache-level-size=262144 \
-; RUN: '-passes=polly-opt-isl,print<polly-ast>' -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-pattern-matching-based-opts=true -polly-target-throughput-vector-fma=1 -polly-target-latency-vector-fma=8 -polly-target-1st-cache-level-associativity=8 -polly-target-2nd-cache-level-associativity=8 -polly-target-1st-cache-level-size=32768 -polly-target-vector-register-bitwidth=256 -polly-target-2nd-cache-level-size=262144 '-passes=polly-custom<opt-isl;ast>' -polly-print-ast -disable-output < %s | FileCheck %s
 ;
 ; /* C := A * B + C */
 ; /* Elements of the matrices B, C have the double type. */
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_9.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_9.ll
index 85c143562f5af..b87ed4fb1ec3c 100644
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_9.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_9.ll
@@ -1,14 +1,6 @@
-; RUN: opt %loadNPMPolly -polly-pattern-matching-based-opts=true \
-; RUN: -polly-target-throughput-vector-fma=1 \
-; RUN: -polly-target-latency-vector-fma=8 \
-; RUN: -polly-target-1st-cache-level-associativity=8 \
-; RUN: -polly-target-2nd-cache-level-associativity=8 \
-; RUN: -polly-target-1st-cache-level-size=32768 \
-; RUN: -polly-target-vector-register-bitwidth=256 \
-; RUN: -polly-target-2nd-cache-level-size=262144 \
-; RUN: -passes=polly-opt-isl -disable-output < %s
+; RUN: opt %loadNPMPolly -polly-pattern-matching-based-opts=true -polly-target-throughput-vector-fma=1 -polly-target-latency-vector-fma=8 -polly-target-1st-cache-level-associativity=8 -polly-target-2nd-cache-level-associativity=8 -polly-target-1st-cache-level-size=32768 -polly-target-vector-register-bitwidth=256 -polly-target-2nd-cache-level-size=262144 '-passes=polly-custom<opt-isl>' -disable-output < %s
 ;
-; RUN: opt %loadNPMPolly '-passes=print<polly-dependences>' -disable-output < %s | FileCheck %s --check-prefix=DEPENDENCES
+; RUN: opt %loadNPMPolly '-passes=polly-custom<deps>' -polly-print-deps -disable-output < %s | FileCheck %s --check-prefix=DEPENDENCES
 ;
 ; /* C := A * B + C */
 ; /* Elements of the matrices A, B, C have the char type. */
diff --git a/polly/test/ScheduleOptimizer/pattern_matching_based_opts_splitmap.ll b/polly/test/ScheduleOptimizer/pattern_matching_based_opts_splitmap.ll
index 64285891a16c7..98c1db6d36fbe 100644
--- a/polly/test/ScheduleOptimizer/pattern_matching_based_opts_splitmap.ll
+++ b/polly/test/ScheduleOptimizer/pattern_matching_based_opts_splitmap.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -passes=polly-import-jscop -polly-import-jscop-postfix=transformed -passes=polly-opt-isl -debug-only=polly-opt-isl -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-import-jscop-postfix=transformed '-passes=polly-custom<import-jscop;opt-isl>' -debug-only=polly-opt-isl -disable-output < %s 2>&1 | FileCheck %s
 ; REQUIRES: asserts
 ;
 ; void pattern_matching_based_opts_splitmap(double C[static const restrict 2][2], double A[static const restrict 2][784], double B[static const restrict 784][2]) {
diff --git a/polly/test/ScheduleOptimizer/prevectorization-without-tiling.ll b/polly/test/ScheduleOptimizer/prevectorization-without-tiling.ll
index 1c6d289744e39..4784dc88cd307 100644
--- a/polly/test/ScheduleOptimizer/prevectorization-without-tiling.ll
+++ b/polly/test/ScheduleOptimizer/prevectorization-without-tiling.ll
@@ -1,4 +1,4 @@
-; RUN: opt -S %loadNPMPolly -aa-pipeline=basic-aa -polly-tiling=false -polly-pattern-matching-based-opts=false -polly-vectorizer=stripmine '-passes=polly-opt-isl,print<polly-ast>' -disable-output < %s | FileCheck %s
+; RUN: opt -S %loadNPMPolly -aa-pipeline=basic-aa -polly-tiling=false -polly-pattern-matching-based-opts=false -polly-vectorizer=stripmine '-passes=polly-custom<opt-isl;ast>' -polly-print-ast -disable-output < %s | FileCheck %s
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"

 @C = common global [1536 x [1536 x float]] zeroinitializer, align 16
diff --git a/polly/test/ScheduleOptimizer/prevectorization.ll b/polly/test/ScheduleOptimizer/prevectorization.ll
index 1ff20d165ce5e..6d1592c4ba8fa 100644
--- a/polly/test/ScheduleOptimizer/prevectorization.ll
+++ b/polly/test/ScheduleOptimizer/prevectorization.ll
@@ -1,5 +1,5 @@
-; RUN: opt -S %loadNPMPolly -aa-pipeline=basic-aa -polly-pattern-matching-based-opts=false -polly-vectorizer=stripmine '-passes=polly-opt-isl,print<polly-ast>' -disable-output < %s | FileCheck %s
-; RUN: opt -S %loadNPMPolly -aa-pipeline=basic-aa -polly-pattern-matching-based-opts=false -polly-vectorizer=stripmine -polly-prevect-width=16 '-passes=polly-opt-isl,print<polly-ast>' -disable-output < %s | FileCheck %s -check-prefix=VEC16
+; RUN: opt -S %loadNPMPolly -aa-pipeline=basic-aa -polly-pattern-matching-based-opts=false -polly-vectorizer=stripmine '-passes=polly-custom<opt-isl;ast>' -polly-print-ast -disable-output < %s | FileCheck %s
+; RUN: opt -S %loadNPMPolly -aa-pipeline=basic-aa -polly-pattern-matching-based-opts=false -polly-vectorizer=stripmine -polly-prevect-width=16 '-passes=polly-custom<opt-isl;ast>' -polly-print-ast -disable-output < %s | FileCheck %s -check-prefix=VEC16

 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"

diff --git a/polly/test/ScheduleOptimizer/prevectorization_islbound.ll b/polly/test/ScheduleOptimizer/prevectorization_islbound.ll
index 0bc3c2cf642e8..f346e5365b198 100644
--- a/polly/test/ScheduleOptimizer/prevectorization_islbound.ll
+++ b/polly/test/ScheduleOptimizer/prevectorization_islbound.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -S -polly-vectorizer=stripmine -passes=polly-opt-isl -polly-debug -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -S -polly-vectorizer=stripmine '-passes=polly-custom<opt-isl>' -polly-debug -disable-output < %s 2>&1 | FileCheck %s
 ; REQUIRES: asserts

 define void @ham(ptr %arg, ptr %arg1, i32 %arg2, i32 %arg3, ptr %arg4, i32 %arg5, i32 %arg6) {
diff --git a/polly/test/ScheduleOptimizer/rectangular-tiling.ll b/polly/test/ScheduleOptimizer/rectangular-tiling.ll
index e1d768b351d7d..3fd4907909419 100644
--- a/polly/test/ScheduleOptimizer/rectangular-tiling.ll
+++ b/polly/test/ScheduleOptimizer/rectangular-tiling.ll
@@ -1,7 +1,7 @@
-; RUN: opt %loadNPMPolly -polly-tile-sizes=256,16 '-passes=polly-opt-isl,print<polly-ast>' -disable-output < %s | FileCheck %s
-; RUN: opt %loadNPMPolly -polly-tile-sizes=256,16 -polly-tiling=false '-passes=polly-opt-isl,print<polly-ast>' -disable-output < %s | FileCheck %s --check-prefix=NOTILING
-; RUN: opt %loadNPMPolly -polly-tile-sizes=256,16 -polly-2nd-level-tiling -polly-2nd-level-tile-sizes=16,8 '-passes=polly-opt-isl,print<polly-ast>' -disable-output < %s | FileCheck %s --check-prefix=TWOLEVEL
-; RUN: opt %loadNPMPolly -polly-tile-sizes=256,16 -polly-2nd-level-tiling -polly-2nd-level-tile-sizes=16,8 -polly-register-tiling '-passes=polly-opt-isl,print<polly-ast>' -disable-output < %s | FileCheck %s --check-prefix=TWO-PLUS-REGISTER
+; RUN: opt %loadNPMPolly -polly-tile-sizes=256,16 '-passes=polly-custom<opt-isl;ast>' -polly-print-ast -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-tile-sizes=256,16 -polly-tiling=false '-passes=polly-custom<opt-isl;ast>' -polly-print-ast -disable-output < %s | FileCheck %s --check-prefix=NOTILING
+; RUN: opt %loadNPMPolly -polly-tile-sizes=256,16 -polly-2nd-level-tiling -polly-2nd-level-tile-sizes=16,8 '-passes=polly-custom<opt-isl;ast>' -polly-print-ast -disable-output < %s | FileCheck %s --check-prefix=TWOLEVEL
+; RUN: opt %loadNPMPolly -polly-tile-sizes=256,16 -polly-2nd-level-tiling -polly-2nd-level-tile-sizes=16,8 -polly-register-tiling '-passes=polly-custom<opt-isl;ast>' -polly-print-ast -disable-output < %s | FileCheck %s --check-prefix=TWO-PLUS-REGISTER

 ; CHECK: // 1st level tiling - Tiles
 ; CHECK: for (int c0 = 0; c0 <= 3; c0 += 1)
diff --git a/polly/test/ScheduleOptimizer/schedule_computeout.ll b/polly/test/ScheduleOptimizer/schedule_computeout.ll
index 1e1359e3ecc6a..1ee8a90473bd3 100644
--- a/polly/test/ScheduleOptimizer/schedule_computeout.ll
+++ b/polly/test/ScheduleOptimizer/schedule_computeout.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -S -passes=polly-optree -passes=polly-delicm -passes=polly-opt-isl -polly-schedule-computeout=10000 -debug-only="polly-opt-isl" < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -S '-passes=polly-custom<optree;delicm;opt-isl>' -polly-schedule-computeout=10000 -debug-only=polly-opt-isl < %s 2>&1 | FileCheck %s
 ; REQUIRES: asserts

 ; Bailout if the computations of schedule compute exceeds the max scheduling quota.
diff --git a/polly/test/ScheduleOptimizer/statistics.ll b/polly/test/ScheduleOptimizer/statistics.ll
index 84eb59341d273..bb705ac6abf38 100644
--- a/polly/test/ScheduleOptimizer/statistics.ll
+++ b/polly/test/ScheduleOptimizer/statistics.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -passes=polly-opt-isl -stats -disable-output < %s 2>&1 | FileCheck %s -match-full-lines
+; RUN: opt %loadNPMPolly '-passes=polly-custom<opt-isl>' -stats -disable-output < %s 2>&1 | FileCheck %s -match-full-lines

 ; REQUIRES: asserts

diff --git a/polly/test/ScheduleOptimizer/tile_after_fusion.ll b/polly/test/ScheduleOptimizer/tile_after_fusion.ll
index 50a46d66176ea..e3d7c24ebef77 100644
--- a/polly/test/ScheduleOptimizer/tile_after_fusion.ll
+++ b/polly/test/ScheduleOptimizer/tile_after_fusion.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-isl-arg=--no-schedule-serialize-sccs '-passes=polly-opt-isl,print<polly-ast>' -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-isl-arg=--no-schedule-serialize-sccs '-passes=polly-custom<opt-isl;ast>' -polly-print-ast -disable-output < %s | FileCheck %s
 ;
 ;
 ; void tf(int C[256][256][256], int A0[256][256][256], int A1[256][256][256]) {
diff --git a/polly/test/ScheduleOptimizer/vivid-vbi-gen-vivid_vbi_gen_sliced-before-llvmreduced.ll b/polly/test/ScheduleOptimizer/vivid-vbi-gen-vivid_vbi_gen_sliced-before-llvmreduced.ll
index e59a31665d77b..bb472b9c3763f 100644
--- a/polly/test/ScheduleOptimizer/vivid-vbi-gen-vivid_vbi_gen_sliced-before-llvmreduced.ll
+++ b/polly/test/ScheduleOptimizer/vivid-vbi-gen-vivid_vbi_gen_sliced-before-llvmreduced.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-vectorizer=stripmine -polly-isl-arg=--no-schedule-serialize-sccs -polly-tiling=0 '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-vectorizer=stripmine -polly-isl-arg=--no-schedule-serialize-sccs -polly-tiling=0 '-passes=polly-custom<opt-isl>' -polly-print-opt-isl -disable-output < %s | FileCheck %s

 ; isl_schedule_node_band_sink may sink into multiple children.
 ; https://llvm.org/PR52637
diff --git a/polly/test/ScopDetect/aliasing_parametric_simple_1.ll b/polly/test/ScopDetect/aliasing_parametric_simple_1.ll
index cee1c06cf7aa0..d83c822371b6e 100644
--- a/polly/test/ScopDetect/aliasing_parametric_simple_1.ll
+++ b/polly/test/ScopDetect/aliasing_parametric_simple_1.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; CHECK: Valid Region for Scop:
 ;
diff --git a/polly/test/ScopDetect/aliasing_parametric_simple_2.ll b/polly/test/ScopDetect/aliasing_parametric_simple_2.ll
index 5506b3c626cfd..63c9addd0b6e1 100644
--- a/polly/test/ScopDetect/aliasing_parametric_simple_2.ll
+++ b/polly/test/ScopDetect/aliasing_parametric_simple_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; CHECK: Valid Region for Scop:
 ;
diff --git a/polly/test/ScopDetect/aliasing_simple_1.ll b/polly/test/ScopDetect/aliasing_simple_1.ll
index 5f43ec1856a7f..ea8a7688f3d25 100644
--- a/polly/test/ScopDetect/aliasing_simple_1.ll
+++ b/polly/test/ScopDetect/aliasing_simple_1.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; CHECK: Valid Region for Scop:
 ;
diff --git a/polly/test/ScopDetect/aliasing_simple_2.ll b/polly/test/ScopDetect/aliasing_simple_2.ll
index e853dfcc64485..df68289ff7352 100644
--- a/polly/test/ScopDetect/aliasing_simple_2.ll
+++ b/polly/test/ScopDetect/aliasing_simple_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; CHECK: Valid Region for Scop:
 ;
diff --git a/polly/test/ScopDetect/base_pointer.ll b/polly/test/ScopDetect/base_pointer.ll
index e500f9bc20bc6..0f0e219bd90d1 100644
--- a/polly/test/ScopDetect/base_pointer.ll
+++ b/polly/test/ScopDetect/base_pointer.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -disable-basic-aa -polly-invariant-load-hoisting=true -polly-print-detect -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly --aa-pipeline= -polly-invariant-load-hoisting=true '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s | FileCheck %s

 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"

diff --git a/polly/test/ScopDetect/base_pointer_load_setNewAccessRelation.ll b/polly/test/ScopDetect/base_pointer_load_setNewAccessRelation.ll
index eeb9e11f812c3..b00ec77679063 100644
--- a/polly/test/ScopDetect/base_pointer_load_setNewAccessRelation.ll
+++ b/polly/test/ScopDetect/base_pointer_load_setNewAccessRelation.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-ignore-aliasing -polly-invariant-load-hoisting=true '-passes=print<polly-function-scops>,scop(polly-import-jscop,polly-codegen)' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-ignore-aliasing -polly-invariant-load-hoisting=true '-passes=polly<no-default-opts;import-jscop>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; This violated an assertion in setNewAccessRelation that assumed base pointers
 ; to be load-hoisted. Without this assertion, it codegen would generate invalid
diff --git a/polly/test/ScopDetect/base_pointer_setNewAccessRelation.ll b/polly/test/ScopDetect/base_pointer_setNewAccessRelation.ll
index 16976e6313275..1cd04b639fc99 100644
--- a/polly/test/ScopDetect/base_pointer_setNewAccessRelation.ll
+++ b/polly/test/ScopDetect/base_pointer_setNewAccessRelation.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-detect>,scop(polly-import-jscop,polly-codegen)' -disable-output < %s 2>&1 | FileCheck %s --allow-empty
+; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts;import-jscop>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s --allow-empty
 ;
 ; Polly codegen used to generate invalid code (referring to %ptr from the
 ; original region) when regeneration of the access function is necessary.
@@ -35,3 +35,5 @@
 exit:

 ; CHECK-NOT: Valid Region for Scop
+; CHECK: Detected Scops in Function base_pointer_is_inst_inside_invariant_1
+; CHECK-NOT: Valid Region for Scop
diff --git a/polly/test/ScopDetect/callbr.ll b/polly/test/ScopDetect/callbr.ll
index 4182974693678..4200339a04a13 100644
--- a/polly/test/ScopDetect/callbr.ll
+++ b/polly/test/ScopDetect/callbr.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -polly-detect-track-failures -disable-output -pass-remarks-missed=polly-detect < %s 2>&1 | FileCheck %s --check-prefix=REMARK
-; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -polly-detect-track-failures -disable-output -stats < %s 2>&1 | FileCheck %s --check-prefix=STAT
+; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -polly-detect-track-failures -disable-output -pass-remarks-missed=polly-detect < %s 2>&1 | FileCheck %s --check-prefix=REMARK
+; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -polly-detect-track-failures -disable-output -stats < %s 2>&1 | FileCheck %s --check-prefix=STAT
 ; REQUIRES: asserts

 ; REMARK: Branch from indirect terminator.
diff --git a/polly/test/ScopDetect/collective_invariant_loads.ll b/polly/test/ScopDetect/collective_invariant_loads.ll
index f451bccec706f..f5263e4e4c40a 100644
--- a/polly/test/ScopDetect/collective_invariant_loads.ll
+++ b/polly/test/ScopDetect/collective_invariant_loads.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting -disable-output< %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting -disable-output < %s 2>&1 | FileCheck %s

 ;CHECK: Function: test_init_chpl
 ;CHECK-NEXT:    Region: %bb1---%bb16
diff --git a/polly/test/ScopDetect/cross_loop_non_single_exit.ll b/polly/test/ScopDetect/cross_loop_non_single_exit.ll
index fe3922174c07c..d7605c36d449c 100644
--- a/polly/test/ScopDetect/cross_loop_non_single_exit.ll
+++ b/polly/test/ScopDetect/cross_loop_non_single_exit.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s

 ; void f(long A[], long N) {
 ;   long i;
diff --git a/polly/test/ScopDetect/cross_loop_non_single_exit_2.ll b/polly/test/ScopDetect/cross_loop_non_single_exit_2.ll
index 4cac173932a6f..c3a2ad4791ba7 100644
--- a/polly/test/ScopDetect/cross_loop_non_single_exit_2.ll
+++ b/polly/test/ScopDetect/cross_loop_non_single_exit_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s

 ; void f(long A[], long N) {
 ;   long i;
diff --git a/polly/test/ScopDetect/dependency_to_phi_node_outside_of_region.ll b/polly/test/ScopDetect/dependency_to_phi_node_outside_of_region.ll
index 7d7476471bb6e..e896e18589e94 100644
--- a/polly/test/ScopDetect/dependency_to_phi_node_outside_of_region.ll
+++ b/polly/test/ScopDetect/dependency_to_phi_node_outside_of_region.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"

 define void @f(ptr %A, i64 %N, i64 %M) nounwind {
diff --git a/polly/test/ScopDetect/detect-full-functions.ll b/polly/test/ScopDetect/detect-full-functions.ll
index 178ef32827cab..adad0e89ffa42 100644
--- a/polly/test/ScopDetect/detect-full-functions.ll
+++ b/polly/test/ScopDetect/detect-full-functions.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -polly-process-unprofitable=false -disable-output -polly-detect-full-functions < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -polly-process-unprofitable=false -disable-output -polly-detect-full-functions < %s 2>&1 | FileCheck %s

 ; Verify if a simple function with basic block not part of loop doesn't crash with
 ; polly-process-unprofitable=false and polly-detect-full-functions flags.
diff --git a/polly/test/ScopDetect/dom-tree-crash.ll b/polly/test/ScopDetect/dom-tree-crash.ll index efc732c50e177..0f670ca230824 100644 --- a/polly/test/ScopDetect/dom-tree-crash.ll +++ b/polly/test/ScopDetect/dom-tree-crash.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s ; CHECK: Detected Scops in Function foo diff --git a/polly/test/ScopDetect/dot-scops-npm.ll b/polly/test/ScopDetect/dot-scops-npm.ll index d14bf8a23a166..de1f52813475a 100644 --- a/polly/test/ScopDetect/dot-scops-npm.ll +++ b/polly/test/ScopDetect/dot-scops-npm.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-scop-printer' -disable-output < %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-dot -disable-output < %s ; RUN: FileCheck %s -input-file=scops.func_npm.dot ; ; Check that the ScopPrinter does not crash. diff --git a/polly/test/ScopDetect/dot-scops.ll b/polly/test/ScopDetect/dot-scops.ll index 63163b23617cf..a719d21300b15 100644 --- a/polly/test/ScopDetect/dot-scops.ll +++ b/polly/test/ScopDetect/dot-scops.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>,polly-scop-printer' -disable-output < %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s ; ; Check that the ScopPrinter does not crash. ; ScopPrinter needs the ScopDetection pass, which should depend on diff --git a/polly/test/ScopDetect/error-block-always-executed.ll b/polly/test/ScopDetect/error-block-always-executed.ll index 20d02b1c1ae0b..0e82e37d10095 100644 --- a/polly/test/ScopDetect/error-block-always-executed.ll +++ b/polly/test/ScopDetect/error-block-always-executed.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s ; ; CHECK-NOT: Valid Region for Scop: diff --git a/polly/test/ScopDetect/error-block-referenced-from-scop.ll b/polly/test/ScopDetect/error-block-referenced-from-scop.ll index 6c66f6df14af5..338fe20679bcf 100644 --- a/polly/test/ScopDetect/error-block-referenced-from-scop.ll +++ b/polly/test/ScopDetect/error-block-referenced-from-scop.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s ; ; CHECK-NOT: Valid Region for Scop: diff --git a/polly/test/ScopDetect/error-block-unreachable.ll b/polly/test/ScopDetect/error-block-unreachable.ll index 6ba7698a972bb..85f248da9be18 100644 --- a/polly/test/ScopDetect/error-block-unreachable.ll +++ b/polly/test/ScopDetect/error-block-unreachable.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s ; Verify that the scop detection does not crash on inputs with unreachable ; blocks. Earlier we crashed when detecting error blocks. 
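dot-scops-npm.ll above shows a third variant: the standalone polly-scop-printer pass is replaced by running detection through the custom pipeline with the -polly-dot flag. A sketch, with the output file name taken from that test (the per-function scops.<function>.dot naming is inferred from it):

  ; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-dot -disable-output < %s
  ; RUN: FileCheck %s -input-file=scops.func_npm.dot

Because the DOT printer writes files rather than printing to stderr, FileCheck reads the generated file instead of a pipe.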
diff --git a/polly/test/ScopDetect/expand-region-correctly-2.ll b/polly/test/ScopDetect/expand-region-correctly-2.ll index a5c9626d28361..43fdda8321cbe 100644 --- a/polly/test/ScopDetect/expand-region-correctly-2.ll +++ b/polly/test/ScopDetect/expand-region-correctly-2.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s ; ; CHECK: Valid Region for Scop: if.end.1631 => for.cond.1647.outer ; diff --git a/polly/test/ScopDetect/expand-region-correctly.ll b/polly/test/ScopDetect/expand-region-correctly.ll index a8c90c08fde0c..b4caac4478d1d 100644 --- a/polly/test/ScopDetect/expand-region-correctly.ll +++ b/polly/test/ScopDetect/expand-region-correctly.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s ; CHECK: Valid Region for Scop: if.end.1631 => for.cond.1647.outer diff --git a/polly/test/ScopDetect/ignore_func_flag_regex.ll b/polly/test/ScopDetect/ignore_func_flag_regex.ll index a75e705995a75..ef1c66686251a 100644 --- a/polly/test/ScopDetect/ignore_func_flag_regex.ll +++ b/polly/test/ScopDetect/ignore_func_flag_regex.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-ignore-func=f.*,g.* '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-polly-ignore-func=f.*,g.*' '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; ; Check that the flag `-polly-ignore-func` works with regexes. ; diff --git a/polly/test/ScopDetect/index_from_unpredictable_loop.ll b/polly/test/ScopDetect/index_from_unpredictable_loop.ll index f6d6cfab0eede..a6f7079f68407 100644 --- a/polly/test/ScopDetect/index_from_unpredictable_loop.ll +++ b/polly/test/ScopDetect/index_from_unpredictable_loop.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s | FileCheck %s --check-prefix=AFFINE -; RUN: opt %loadNPMPolly -polly-allow-nonaffine '-passes=print<polly-function-scops>' -disable-output < %s | FileCheck %s --check-prefix=NONAFFINE +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s | FileCheck %s --check-prefix=AFFINE +; RUN: opt %loadNPMPolly -polly-allow-nonaffine '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s | FileCheck %s --check-prefix=NONAFFINE ; The SCoP contains a loop with multiple exit blocks (BBs after leaving ; the loop). 
The current implementation of deriving their domain derives diff --git a/polly/test/ScopDetect/index_from_unpredictable_loop2.ll b/polly/test/ScopDetect/index_from_unpredictable_loop2.ll index 16d47619b0ff2..be76e0b138933 100644 --- a/polly/test/ScopDetect/index_from_unpredictable_loop2.ll +++ b/polly/test/ScopDetect/index_from_unpredictable_loop2.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s | FileCheck %s --check-prefix=AFFINE -; RUN: opt %loadNPMPolly -polly-allow-nonaffine '-passes=print<polly-function-scops>' -disable-output < %s | FileCheck %s --check-prefix=NONAFFINE +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s | FileCheck %s --check-prefix=AFFINE +; RUN: opt %loadNPMPolly -polly-allow-nonaffine '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s | FileCheck %s --check-prefix=NONAFFINE ; The SCoP contains a loop with multiple exit blocks (BBs after leaving ; the loop). The current implementation of deriving their domain derives diff --git a/polly/test/ScopDetect/indvars.ll b/polly/test/ScopDetect/indvars.ll index 3fbc4d65bbe20..e45e4fb016155 100644 --- a/polly/test/ScopDetect/indvars.ll +++ b/polly/test/ScopDetect/indvars.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-detect>,scop(polly-codegen)' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s ; target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" diff --git a/polly/test/ScopDetect/intrinsics_1.ll b/polly/test/ScopDetect/intrinsics_1.ll index 58c9197f7f799..43fa4ca619ed7 100644 --- a/polly/test/ScopDetect/intrinsics_1.ll +++ b/polly/test/ScopDetect/intrinsics_1.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s ; ; CHECK: Valid Region for Scop: for.cond => for.end ; diff --git a/polly/test/ScopDetect/intrinsics_2.ll b/polly/test/ScopDetect/intrinsics_2.ll index f71016e6d04cd..b4cc3df7c746b 100644 --- a/polly/test/ScopDetect/intrinsics_2.ll +++ b/polly/test/ScopDetect/intrinsics_2.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s ; ; Verify that we allow the lifetime markers for the tmp array. ; diff --git a/polly/test/ScopDetect/intrinsics_3.ll b/polly/test/ScopDetect/intrinsics_3.ll index 579d5bd481d44..08fdee573ba0f 100644 --- a/polly/test/ScopDetect/intrinsics_3.ll +++ b/polly/test/ScopDetect/intrinsics_3.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s ; ; Verify that we allow the misc intrinsics. 
; diff --git a/polly/test/ScopDetect/invalid-latch-conditions.ll b/polly/test/ScopDetect/invalid-latch-conditions.ll index db4898c9c7bd7..c7d7c51e7d220 100644 --- a/polly/test/ScopDetect/invalid-latch-conditions.ll +++ b/polly/test/ScopDetect/invalid-latch-conditions.ll @@ -1,6 +1,6 @@ -; RUN: opt %loadNPMPolly -polly-process-unprofitable=false '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s -; RUN: opt %loadNPMPolly -polly-allow-nonaffine-loops '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=NALOOPS -; RUN: opt %loadNPMPolly -polly-allow-nonaffine-loops -polly-process-unprofitable=false '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=PROFIT +; RUN: opt %loadNPMPolly -polly-process-unprofitable=false '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-allow-nonaffine-loops '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s --check-prefix=NALOOPS +; RUN: opt %loadNPMPolly -polly-allow-nonaffine-loops -polly-process-unprofitable=false '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s --check-prefix=PROFIT ; The latch conditions of the outer loop are not affine, thus the loop cannot ; be handled by the domain generation and needs to be overapproximated. diff --git a/polly/test/ScopDetect/invalidate_scalar_evolution.ll b/polly/test/ScopDetect/invalidate_scalar_evolution.ll index ddef510ad4d9f..977918eb5168d 100644 --- a/polly/test/ScopDetect/invalidate_scalar_evolution.ll +++ b/polly/test/ScopDetect/invalidate_scalar_evolution.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s -check-prefix=PHI +; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s -check-prefix=PHI ; void f(long A[], long N) { ; long i; diff --git a/polly/test/ScopDetect/invariant-load-before-scop.ll b/polly/test/ScopDetect/invariant-load-before-scop.ll index 10479643959cb..932c218170caf 100644 --- a/polly/test/ScopDetect/invariant-load-before-scop.ll +++ b/polly/test/ScopDetect/invariant-load-before-scop.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s -match-full-lines ; ; The LoadInst %.b761 is defined outside the SCoP, hence is always constant ; within it. It is no "required invariant load".
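invalid-latch-conditions.ll above also illustrates the testing idiom used throughout this directory: one input run several times under different flag combinations, each matched against its own FileCheck prefix. Schematically (the check line here is illustrative, not quoted from the test):

  ; RUN: opt %loadNPMPolly -polly-allow-nonaffine-loops '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s --check-prefix=NALOOPS
  ; NALOOPS: Valid Region for Scop

This keeps the flag matrix and the expected detection result for each combination together in a single file.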
diff --git a/polly/test/ScopDetect/keep_going_expansion.ll b/polly/test/ScopDetect/keep_going_expansion.ll index 074aae9ae95c9..efd81c695ca0d 100644 --- a/polly/test/ScopDetect/keep_going_expansion.ll +++ b/polly/test/ScopDetect/keep_going_expansion.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-detect-track-failures -polly-detect-keep-going '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-detect-track-failures -polly-detect-keep-going '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" diff --git a/polly/test/ScopDetect/mod_ref_read_pointer.ll b/polly/test/ScopDetect/mod_ref_read_pointer.ll index 64535d85f2ab1..c7972cc47a68d 100644 --- a/polly/test/ScopDetect/mod_ref_read_pointer.ll +++ b/polly/test/ScopDetect/mod_ref_read_pointer.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-modref-calls '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s -check-prefix=MODREF -; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-modref-calls '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s -check-prefix=MODREF +; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s ; ; CHECK-NOT: Valid Region for Scop: for.body => for.end ; MODREF: Valid Region for Scop: for.body => for.end diff --git a/polly/test/ScopDetect/more-than-one-loop.ll b/polly/test/ScopDetect/more-than-one-loop.ll index 30090652326d2..1835342812b1f 100644 --- a/polly/test/ScopDetect/more-than-one-loop.ll +++ b/polly/test/ScopDetect/more-than-one-loop.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly -polly-process-unprofitable=false '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s -; RUN: opt %loadNPMPolly -polly-process-unprofitable=true '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-process-unprofitable=false '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-process-unprofitable=true '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s ; CHECK: Valid Region for Scop: diff --git a/polly/test/ScopDetect/multidim-with-undef-size.ll b/polly/test/ScopDetect/multidim-with-undef-size.ll index 2a5f8b15534fa..e89cea98ad21a 100644 --- a/polly/test/ScopDetect/multidim-with-undef-size.ll +++ b/polly/test/ScopDetect/multidim-with-undef-size.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" ; CHECK: Valid Region for Scop: bb14 => bb17 diff --git a/polly/test/ScopDetect/multidim.ll b/polly/test/ScopDetect/multidim.ll index 91202373263f0..cbe7d0708b853 100644 --- a/polly/test/ScopDetect/multidim.ll +++ b/polly/test/ScopDetect/multidim.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect 
-disable-output < %s 2>&1 | FileCheck %s target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" ; CHECK: Valid Region for Scop: bb19 => bb20 diff --git a/polly/test/ScopDetect/multidim_indirect_access.ll b/polly/test/ScopDetect/multidim_indirect_access.ll index a9cd446d27670..4af37ba064558 100644 --- a/polly/test/ScopDetect/multidim_indirect_access.ll +++ b/polly/test/ScopDetect/multidim_indirect_access.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s ; ; Check that we will recognize this SCoP. ; diff --git a/polly/test/ScopDetect/multidim_two_accesses_different_delinearization.ll b/polly/test/ScopDetect/multidim_two_accesses_different_delinearization.ll index 9c91fbfbe0b64..0286642f3c7a7 100644 --- a/polly/test/ScopDetect/multidim_two_accesses_different_delinearization.ll +++ b/polly/test/ScopDetect/multidim_two_accesses_different_delinearization.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" ; Derived from the following code: diff --git a/polly/test/ScopDetect/nested_loop_single_exit.ll b/polly/test/ScopDetect/nested_loop_single_exit.ll index a0742112b6e12..89071df596807 100644 --- a/polly/test/ScopDetect/nested_loop_single_exit.ll +++ b/polly/test/ScopDetect/nested_loop_single_exit.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s -; RUN: opt %loadNPMPolly -passes=polly-codegen -disable-output < %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -disable-output < %s ; void f(long A[], long N) { ; long i, j; diff --git a/polly/test/ScopDetect/non-affine-conditional.ll b/polly/test/ScopDetect/non-affine-conditional.ll index e74619cd87756..b20828d9a7679 100644 --- a/polly/test/ScopDetect/non-affine-conditional.ll +++ b/polly/test/ScopDetect/non-affine-conditional.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-allow-nonaffine-branches '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-allow-nonaffine-branches '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s ; ; void f(int *A) { ; for (int i = 0; i < 1024; i++) diff --git a/polly/test/ScopDetect/non-affine-float-compare.ll b/polly/test/ScopDetect/non-affine-float-compare.ll index 9326cd4290380..77427397bac9d 100644 --- a/polly/test/ScopDetect/non-affine-float-compare.ll +++ b/polly/test/ScopDetect/non-affine-float-compare.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-allow-nonaffine-branches '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-allow-nonaffine-branches '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s ; ; void f(float *A) { ; for (int i = 0; i < 1024; i++) diff --git a/polly/test/ScopDetect/non-affine-loop-condition-dependent-access.ll b/polly/test/ScopDetect/non-affine-loop-condition-dependent-access.ll 
index 1ab6b35ae93f1..f6ae9fe8dd544 100644 --- a/polly/test/ScopDetect/non-affine-loop-condition-dependent-access.ll +++ b/polly/test/ScopDetect/non-affine-loop-condition-dependent-access.ll @@ -1,7 +1,7 @@ -; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=false '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=REJECTNONAFFINELOOPS -; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=ALLOWNONAFFINELOOPS -; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true -polly-allow-nonaffine '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=ALLOWNONAFFINELOOPSANDACCESSES -; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true -polly-allow-nonaffine -polly-process-unprofitable=false '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=PROFIT +; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=false '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s --check-prefix=REJECTNONAFFINELOOPS +; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s --check-prefix=ALLOWNONAFFINELOOPS +; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true -polly-allow-nonaffine '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s --check-prefix=ALLOWNONAFFINELOOPSANDACCESSES +; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true -polly-allow-nonaffine -polly-process-unprofitable=false '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s --check-prefix=PROFIT ; ; Here we have a non-affine loop but also a non-affine access which should ; be rejected as long as -polly-allow-nonaffine isn't given. 
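For the non-affine flag matrix above, it may help to recall what shape of code these tests describe; a hypothetical C sketch in the comment style of the surrounding files (not the exact source of any one test):

  ;    void f(int *A) {
  ;      for (int i = 0; i < 1024; i++)  /* affine loop: static bounds */
  ;        while (A[i])                  /* non-affine loop: data-dependent trip count */
  ;          A[A[i]] += 1;               /* non-affine access: data-dependent subscript */
  ;    }

-polly-allow-nonaffine-loops admits the inner while loop, but the data-dependent access is still rejected unless -polly-allow-nonaffine is given as well, which is exactly the distinction the REJECTNONAFFINELOOPS, ALLOWNONAFFINELOOPS, and ALLOWNONAFFINELOOPSANDACCESSES prefixes encode.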
diff --git a/polly/test/ScopDetect/non-affine-loop-condition-dependent-access_2.ll b/polly/test/ScopDetect/non-affine-loop-condition-dependent-access_2.ll index 921f6ab535499..23c1765caecac 100644 --- a/polly/test/ScopDetect/non-affine-loop-condition-dependent-access_2.ll +++ b/polly/test/ScopDetect/non-affine-loop-condition-dependent-access_2.ll @@ -1,6 +1,6 @@ -; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=false '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=REJECTNONAFFINELOOPS -; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=ALLOWNONAFFINELOOPS -; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true -polly-allow-nonaffine '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=ALLOWNONAFFINELOOPSANDACCESSES +; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=false '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s --check-prefix=REJECTNONAFFINELOOPS +; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s --check-prefix=ALLOWNONAFFINELOOPS +; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true -polly-allow-nonaffine '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s --check-prefix=ALLOWNONAFFINELOOPSANDACCESSES ; ; Here we have a non-affine loop (in the context of the loop nest) ; and also a non-affine access (A[k]). 
While we can always detect the diff --git a/polly/test/ScopDetect/non-affine-loop-condition-dependent-access_3.ll b/polly/test/ScopDetect/non-affine-loop-condition-dependent-access_3.ll index 78774d92e0a46..6e239a6570668 100644 --- a/polly/test/ScopDetect/non-affine-loop-condition-dependent-access_3.ll +++ b/polly/test/ScopDetect/non-affine-loop-condition-dependent-access_3.ll @@ -1,6 +1,6 @@ -; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=false '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=REJECTNONAFFINELOOPS -; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=ALLOWNONAFFINELOOPS -; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true -polly-allow-nonaffine '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=ALLOWNONAFFINELOOPSANDACCESSES +; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=false '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s --check-prefix=REJECTNONAFFINELOOPS +; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s --check-prefix=ALLOWNONAFFINELOOPS +; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true -polly-allow-nonaffine '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s --check-prefix=ALLOWNONAFFINELOOPSANDACCESSES ; ; Here we have a non-affine loop (in the context of the loop nest) ; and also a non-affine access (A[k]). 
While we can always detect the diff --git a/polly/test/ScopDetect/non-affine-loop.ll b/polly/test/ScopDetect/non-affine-loop.ll index 5136b3b8779b1..dd675ccec5999 100644 --- a/polly/test/ScopDetect/non-affine-loop.ll +++ b/polly/test/ScopDetect/non-affine-loop.ll @@ -1,8 +1,8 @@ -; RUN: opt %loadNPMPolly -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=false '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=REJECTNONAFFINELOOPS -; RUN: opt %loadNPMPolly -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=ALLOWNONAFFINELOOPS -; RUN: opt %loadNPMPolly -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=false -polly-allow-nonaffine '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=ALLOWNONAFFINEREGIONSANDACCESSES -; RUN: opt %loadNPMPolly -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true -polly-allow-nonaffine '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=ALLOWNONAFFINELOOPSANDACCESSES -; RUN: opt %loadNPMPolly -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true -polly-allow-nonaffine -polly-process-unprofitable=false '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=PROFIT +; RUN: opt %loadNPMPolly -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=false '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s --check-prefix=REJECTNONAFFINELOOPS +; RUN: opt %loadNPMPolly -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s --check-prefix=ALLOWNONAFFINELOOPS +; RUN: opt %loadNPMPolly -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=false -polly-allow-nonaffine '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s --check-prefix=ALLOWNONAFFINEREGIONSANDACCESSES +; RUN: opt %loadNPMPolly -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true -polly-allow-nonaffine '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s --check-prefix=ALLOWNONAFFINELOOPSANDACCESSES +; RUN: opt %loadNPMPolly -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true -polly-allow-nonaffine -polly-process-unprofitable=false '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s --check-prefix=PROFIT ; ; This function/region does contain a loop, however it is non-affine, hence the access ; A[i] is also. 
Furthermore, it is the only loop, thus when we over approximate diff --git a/polly/test/ScopDetect/non-beneficial-loops-small-trip-count.ll b/polly/test/ScopDetect/non-beneficial-loops-small-trip-count.ll index fd52c5df7b27e..63b1cdb420b71 100644 --- a/polly/test/ScopDetect/non-beneficial-loops-small-trip-count.ll +++ b/polly/test/ScopDetect/non-beneficial-loops-small-trip-count.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-process-unprofitable=false '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-process-unprofitable=false '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s ; ; CHECK-NOT: Valid ; diff --git a/polly/test/ScopDetect/non-constant-add-rec-start-expr.ll b/polly/test/ScopDetect/non-constant-add-rec-start-expr.ll index d0c1f7a613332..ff4ad3218ffa5 100644 --- a/polly/test/ScopDetect/non-constant-add-rec-start-expr.ll +++ b/polly/test/ScopDetect/non-constant-add-rec-start-expr.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s ; CHECK: Valid Region for Scop: bb11 => bb25 diff --git a/polly/test/ScopDetect/non-simple-memory-accesses.ll b/polly/test/ScopDetect/non-simple-memory-accesses.ll index bdc48984f9961..5b9ed2b2ecae7 100644 --- a/polly/test/ScopDetect/non-simple-memory-accesses.ll +++ b/polly/test/ScopDetect/non-simple-memory-accesses.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s ; ; Verify that we do not model atomic memory accesses. 
We did not reason about ; how to handle them correctly and the Alias Set Tracker models some of them diff --git a/polly/test/ScopDetect/non_affine_loop_condition.ll b/polly/test/ScopDetect/non_affine_loop_condition.ll index 63bd7b3a2f1f2..3c487374c1973 100644 --- a/polly/test/ScopDetect/non_affine_loop_condition.ll +++ b/polly/test/ScopDetect/non_affine_loop_condition.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly -polly-allow-nonaffine-loops '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s -; RUN: opt %loadNPMPolly -polly-allow-nonaffine-loops -polly-process-unprofitable=false '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=PROFIT +; RUN: opt %loadNPMPolly -polly-allow-nonaffine-loops '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-allow-nonaffine-loops -polly-process-unprofitable=false '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s --check-prefix=PROFIT ; ; void f(int *A) { ; for (int i = 0; i < 1024; i++) { diff --git a/polly/test/ScopDetect/only-one-affine-loop.ll b/polly/test/ScopDetect/only-one-affine-loop.ll index 1d36f4df35bc3..a8ce5bc636833 100644 --- a/polly/test/ScopDetect/only-one-affine-loop.ll +++ b/polly/test/ScopDetect/only-one-affine-loop.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-process-unprofitable=false -polly-allow-nonaffine-loops '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-process-unprofitable=false -polly-allow-nonaffine-loops '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s ; ; Even if we allow non-affine loops we can only model the outermost loop, all ; other loops are boxed in non-affine regions. However, the inner loops can be diff --git a/polly/test/ScopDetect/only_func_flag.ll b/polly/test/ScopDetect/only_func_flag.ll index 4742375fec5cf..f4f35048fa8a0 100644 --- a/polly/test/ScopDetect/only_func_flag.ll +++ b/polly/test/ScopDetect/only_func_flag.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-only-func=f,g '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-only-func=f,g '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; ; Check that the flag `-polly-only-func` limits analysis to `f` and `g`. ; diff --git a/polly/test/ScopDetect/only_func_flag_regex.ll b/polly/test/ScopDetect/only_func_flag_regex.ll index 2ad22c9f7a7f5..f180fa765f4b0 100644 --- a/polly/test/ScopDetect/only_func_flag_regex.ll +++ b/polly/test/ScopDetect/only_func_flag_regex.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-only-func=f.*,g.* '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-polly-only-func=f.*,g.*' '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; ; Check that the flag `-polly-only-func` works with regexes. 
; diff --git a/polly/test/ScopDetect/parametric-multiply-in-scev-2.ll b/polly/test/ScopDetect/parametric-multiply-in-scev-2.ll index 271825a58c399..71d1ba0accd32 100644 --- a/polly/test/ScopDetect/parametric-multiply-in-scev-2.ll +++ b/polly/test/ScopDetect/parametric-multiply-in-scev-2.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s ; CHECK-NOT: Valid Region diff --git a/polly/test/ScopDetect/parametric-multiply-in-scev.ll b/polly/test/ScopDetect/parametric-multiply-in-scev.ll index 2ab8997c63331..6768c969a7428 100644 --- a/polly/test/ScopDetect/parametric-multiply-in-scev.ll +++ b/polly/test/ScopDetect/parametric-multiply-in-scev.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s ; foo(float *A, long n, long k) { ; if (true) diff --git a/polly/test/ScopDetect/phi_with_multi_exiting_edges.ll b/polly/test/ScopDetect/phi_with_multi_exiting_edges.ll index 248bb43aacd98..2e16b75ee3106 100644 --- a/polly/test/ScopDetect/phi_with_multi_exiting_edges.ll +++ b/polly/test/ScopDetect/phi_with_multi_exiting_edges.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s ; ; Region with an exit node that has a PHI node multiple incoming edges from ; inside the region. Motivation for supporting such cases in Polly. diff --git a/polly/test/ScopDetect/profitability-large-basic-blocks.ll b/polly/test/ScopDetect/profitability-large-basic-blocks.ll index d74185b45c752..ac27016e3622d 100644 --- a/polly/test/ScopDetect/profitability-large-basic-blocks.ll +++ b/polly/test/ScopDetect/profitability-large-basic-blocks.ll @@ -1,12 +1,8 @@ -; RUN: opt %loadNPMPolly -polly-process-unprofitable=false \ -; RUN: -polly-detect-profitability-min-per-loop-insts=40 \ -; RUN: '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s -check-prefix=PROFITABLE +; RUN: opt %loadNPMPolly -polly-process-unprofitable=false -polly-detect-profitability-min-per-loop-insts=40 '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s -check-prefix=PROFITABLE -; RUN: opt %loadNPMPolly -polly-process-unprofitable=true \ -; RUN: '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s -check-prefix=PROFITABLE +; RUN: opt %loadNPMPolly -polly-process-unprofitable=true '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s -check-prefix=PROFITABLE -; RUN: opt %loadNPMPolly -polly-process-unprofitable=false \ -; RUN: '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s -check-prefix=UNPROFITABLE +; RUN: opt %loadNPMPolly -polly-process-unprofitable=false '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s -check-prefix=UNPROFITABLE ; UNPROFITABLE-NOT: Valid Region for Scop: ; PROFITABLE: Valid Region for Scop: diff --git a/polly/test/ScopDetect/profitability-two-nested-loops.ll b/polly/test/ScopDetect/profitability-two-nested-loops.ll index 0291d3be452a1..80379bcc5d412 100644 --- a/polly/test/ScopDetect/profitability-two-nested-loops.ll +++ 
b/polly/test/ScopDetect/profitability-two-nested-loops.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s ; CHECK: Valid Region for Scop: next => bb3 ; diff --git a/polly/test/ScopDetect/remove_all_children.ll b/polly/test/ScopDetect/remove_all_children.ll index d95e9bde0b384..1c77d730ed418 100644 --- a/polly/test/ScopDetect/remove_all_children.ll +++ b/polly/test/ScopDetect/remove_all_children.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" diff --git a/polly/test/ScopDetect/report-scop-location.ll b/polly/test/ScopDetect/report-scop-location.ll index 5e4c38db5e53c..530a22f9ac3d4 100644 --- a/polly/test/ScopDetect/report-scop-location.ll +++ b/polly/test/ScopDetect/report-scop-location.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -polly-report -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -polly-report -disable-output < %s 2>&1 | FileCheck %s target datalayout = "e-i64:64-f80:128-s:64-n8:16:32:64-S128" ; Function Attrs: nounwind uwtable diff --git a/polly/test/ScopDetect/restrict-undef-size-scopdetect.ll b/polly/test/ScopDetect/restrict-undef-size-scopdetect.ll index f49190b33ccf7..2ade0a97a5991 100644 --- a/polly/test/ScopDetect/restrict-undef-size-scopdetect.ll +++ b/polly/test/ScopDetect/restrict-undef-size-scopdetect.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s ; CHECK-NOT: Valid Region for Scop: target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" diff --git a/polly/test/ScopDetect/run_time_alias_check.ll b/polly/test/ScopDetect/run_time_alias_check.ll index 74cbedb34e5c6..6f327e318082c 100644 --- a/polly/test/ScopDetect/run_time_alias_check.ll +++ b/polly/test/ScopDetect/run_time_alias_check.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128" diff --git a/polly/test/ScopDetect/scev_remove_max.ll b/polly/test/ScopDetect/scev_remove_max.ll index f76c832ff08f5..4f03845795c9c 100644 --- a/polly/test/ScopDetect/scev_remove_max.ll +++ b/polly/test/ScopDetect/scev_remove_max.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' < %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect < %s ; This test case helps to determine whether SCEVRemoveMax::remove produces ; an infinite loop and a segmentation fault, if it processes, for example, diff --git a/polly/test/ScopDetect/sequential_loops.ll b/polly/test/ScopDetect/sequential_loops.ll index 4a84f356f3e81..338a9ae6b6b0e 100644 --- 
a/polly/test/ScopDetect/sequential_loops.ll +++ b/polly/test/ScopDetect/sequential_loops.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128" diff --git a/polly/test/ScopDetect/simple_loop.ll b/polly/test/ScopDetect/simple_loop.ll index 33823b21fb8fb..5da4898517e22 100644 --- a/polly/test/ScopDetect/simple_loop.ll +++ b/polly/test/ScopDetect/simple_loop.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s ; void f(long A[], long N) { ; long i; diff --git a/polly/test/ScopDetect/simple_loop_non_single_entry.ll b/polly/test/ScopDetect/simple_loop_non_single_entry.ll index 1bba2c21c7473..00e11ab252e73 100644 --- a/polly/test/ScopDetect/simple_loop_non_single_entry.ll +++ b/polly/test/ScopDetect/simple_loop_non_single_entry.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s ; void f(long A[], long N) { ; long i; diff --git a/polly/test/ScopDetect/simple_loop_non_single_exit.ll b/polly/test/ScopDetect/simple_loop_non_single_exit.ll index 93ec84e911c5d..9f75b80f58cef 100644 --- a/polly/test/ScopDetect/simple_loop_non_single_exit.ll +++ b/polly/test/ScopDetect/simple_loop_non_single_exit.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s ; void f(long A[], long N) { ; long i; diff --git a/polly/test/ScopDetect/simple_loop_non_single_exit_2.ll b/polly/test/ScopDetect/simple_loop_non_single_exit_2.ll index 33b0d8d7d6fc0..c6ce482403400 100644 --- a/polly/test/ScopDetect/simple_loop_non_single_exit_2.ll +++ b/polly/test/ScopDetect/simple_loop_non_single_exit_2.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s ; void f(long A[], long N) { ; long i; diff --git a/polly/test/ScopDetect/simple_loop_two_phi_nodes.ll b/polly/test/ScopDetect/simple_loop_two_phi_nodes.ll index 9b47b7c946caf..c90c4915e866d 100644 --- a/polly/test/ScopDetect/simple_loop_two_phi_nodes.ll +++ b/polly/test/ScopDetect/simple_loop_two_phi_nodes.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s ; void f(long A[], long N) { ; long i; diff --git a/polly/test/ScopDetect/simple_loop_with_param.ll b/polly/test/ScopDetect/simple_loop_with_param.ll index 4a0a3adab661d..67f677892313c 100644 --- a/polly/test/ScopDetect/simple_loop_with_param.ll +++ b/polly/test/ScopDetect/simple_loop_with_param.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-detect>' -disable-output < %s 2>&1 | 
FileCheck %s -check-prefix=PHI +; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s -check-prefix=PHI ; void f(long A[], long N, long *init_ptr) { ; long i, j; diff --git a/polly/test/ScopDetect/simple_loop_with_param_2.ll b/polly/test/ScopDetect/simple_loop_with_param_2.ll index 670936b6fee80..9e7b55efc48d9 100644 --- a/polly/test/ScopDetect/simple_loop_with_param_2.ll +++ b/polly/test/ScopDetect/simple_loop_with_param_2.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s ; void f(long A[], int N, int *init_ptr) { ; long i, j; diff --git a/polly/test/ScopDetect/simple_non_single_entry.ll b/polly/test/ScopDetect/simple_non_single_entry.ll index 6ace3b636019b..e56c022aa5466 100644 --- a/polly/test/ScopDetect/simple_non_single_entry.ll +++ b/polly/test/ScopDetect/simple_non_single_entry.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s ; void f(long A[], long N) { ; long i; diff --git a/polly/test/ScopDetect/skip_function_attribute.ll b/polly/test/ScopDetect/skip_function_attribute.ll index 2150a3e8c35dd..789942a950051 100644 --- a/polly/test/ScopDetect/skip_function_attribute.ll +++ b/polly/test/ScopDetect/skip_function_attribute.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s ; ; Verify polly skips this function ; diff --git a/polly/test/ScopDetect/srem_with_parametric_divisor.ll b/polly/test/ScopDetect/srem_with_parametric_divisor.ll index 66c3b045f62a4..471602968055e 100644 --- a/polly/test/ScopDetect/srem_with_parametric_divisor.ll +++ b/polly/test/ScopDetect/srem_with_parametric_divisor.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s ; ; CHECK-NOT: Valid Region for Scop: ; diff --git a/polly/test/ScopDetect/statistics.ll b/polly/test/ScopDetect/statistics.ll index a1dcebec63ff8..5d87599da29f7 100644 --- a/polly/test/ScopDetect/statistics.ll +++ b/polly/test/ScopDetect/statistics.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -stats -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -stats -disable-output < %s 2>&1 | FileCheck %s ; REQUIRES: asserts diff --git a/polly/test/ScopDetect/switch-in-loop-patch.ll b/polly/test/ScopDetect/switch-in-loop-patch.ll index 2f9b670384db2..1e825f4950afa 100644 --- a/polly/test/ScopDetect/switch-in-loop-patch.ll +++ b/polly/test/ScopDetect/switch-in-loop-patch.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s ; CHECK-NOT: Valid diff --git a/polly/test/ScopDetect/tlr_is_hoistable_load.ll 
b/polly/test/ScopDetect/tlr_is_hoistable_load.ll index 5c33522f62325..24a3f55a519e2 100644 --- a/polly/test/ScopDetect/tlr_is_hoistable_load.ll +++ b/polly/test/ScopDetect/tlr_is_hoistable_load.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadPolly -polly-invariant-load-hoisting -polly-detect-full-functions -polly-print-scops -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly -polly-invariant-load-hoisting -polly-detect-full-functions '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s | FileCheck %s ; ; This testcase checks for compatibility of the -detect-full-functions ; flag in combination with the -invariant-load-hoisting option. More diff --git a/polly/test/ScopDetectionDiagnostics/ReportAlias-01.ll b/polly/test/ScopDetectionDiagnostics/ReportAlias-01.ll index 4ae86a940e0c8..e7245d80b60ed 100644 --- a/polly/test/ScopDetectionDiagnostics/ReportAlias-01.ll +++ b/polly/test/ScopDetectionDiagnostics/ReportAlias-01.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-use-runtime-alias-checks=false -pass-remarks-missed="polly-detect" -polly-detect-track-failures '-passes=print<polly-detect>' -disable-output < %s 2>&1| FileCheck %s +; RUN: opt %loadNPMPolly -polly-use-runtime-alias-checks=false -pass-remarks-missed=polly-detect -polly-detect-track-failures '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s ;void f(int A[], int B[]) { ; for (int i=0; i<42; i++) diff --git a/polly/test/ScopDetectionDiagnostics/ReportEntry.ll b/polly/test/ScopDetectionDiagnostics/ReportEntry.ll index adb14b5b017d4..2a0b281073f59 100644 --- a/polly/test/ScopDetectionDiagnostics/ReportEntry.ll +++ b/polly/test/ScopDetectionDiagnostics/ReportEntry.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -pass-remarks-missed="polly-detect" -disable-output < %s 2>&1| FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -pass-remarks-missed=polly-detect -disable-output < %s 2>&1 | FileCheck %s ; CHECK: remark: <unknown>:0:0: Scop contains function entry (not yet supported). 
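The ScopDetectionDiagnostics tests keep their remark plumbing and only swap the pass spelling; the resulting pattern, with the CHECK line quoted from ReportEntry.ll above:

  ; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -pass-remarks-missed=polly-detect -disable-output < %s 2>&1 | FileCheck %s
  ; CHECK: remark: <unknown>:0:0: Scop contains function entry (not yet supported).

Two incidental quoting cleanups ride along in this patch: -pass-remarks-missed=polly-detect drops its unnecessary quotes (the value contains no shell metacharacters), while regex-valued flags such as '-polly-only-func=f.*,g.*' gain quotes, presumably because the * would otherwise be eligible for glob expansion by lit's shell.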
diff --git a/polly/test/ScopDetectionDiagnostics/ReportFuncCall-01.ll b/polly/test/ScopDetectionDiagnostics/ReportFuncCall-01.ll index 428a7cf855f6e..fc4c1fbcef484 100644 --- a/polly/test/ScopDetectionDiagnostics/ReportFuncCall-01.ll +++ b/polly/test/ScopDetectionDiagnostics/ReportFuncCall-01.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -pass-remarks-missed="polly-detect" -polly-detect-track-failures '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -pass-remarks-missed=polly-detect -polly-detect-track-failures '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s ; #define N 1024 ; double invalidCall(double A[N]); diff --git a/polly/test/ScopDetectionDiagnostics/ReportIrreducibleRegion.ll b/polly/test/ScopDetectionDiagnostics/ReportIrreducibleRegion.ll index 30e5fb9fdeba8..7a540d606eadf 100644 --- a/polly/test/ScopDetectionDiagnostics/ReportIrreducibleRegion.ll +++ b/polly/test/ScopDetectionDiagnostics/ReportIrreducibleRegion.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -pass-remarks-missed="polly-detect" -disable-output < %s 2>&1| FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -pass-remarks-missed=polly-detect -disable-output < %s 2>&1 | FileCheck %s target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" ;void foo(int a, int b) { diff --git a/polly/test/ScopDetectionDiagnostics/ReportIrreducibleRegionWithoutDebugLoc.ll b/polly/test/ScopDetectionDiagnostics/ReportIrreducibleRegionWithoutDebugLoc.ll index 2bc515e0ae5e1..512366f1bc7ce 100644 --- a/polly/test/ScopDetectionDiagnostics/ReportIrreducibleRegionWithoutDebugLoc.ll +++ b/polly/test/ScopDetectionDiagnostics/ReportIrreducibleRegionWithoutDebugLoc.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -pass-remarks-missed="polly-detect" -disable-output < %s 2>&1| FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -pass-remarks-missed=polly-detect -disable-output < %s 2>&1 | FileCheck %s ; CHECK: remark: <unknown>:0:0: Irreducible region encountered in control flow. 
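ReportLoopBound-01.ll and ReportUnprofitable.ll below additionally fold RUN lines that were wrapped with backslash continuations into single physical lines. lit concatenates continued "; RUN:" lines before executing them, so the two forms run identically; schematically, a wrapped command such as

  ; RUN: opt %loadNPMPolly -pass-remarks-missed=polly-detect -polly-detect-track-failures \
  ; RUN:   '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s

and its single-line equivalent differ only in how readable and greppable the test is.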
diff --git a/polly/test/ScopDetectionDiagnostics/ReportLoopBound-01.ll b/polly/test/ScopDetectionDiagnostics/ReportLoopBound-01.ll index a96b64e4e0d54..e844aea24ac26 100644 --- a/polly/test/ScopDetectionDiagnostics/ReportLoopBound-01.ll +++ b/polly/test/ScopDetectionDiagnostics/ReportLoopBound-01.ll @@ -1,16 +1,6 @@ -; RUN: opt %loadNPMPolly \ -; RUN: -pass-remarks-missed="polly-detect" -polly-detect-track-failures \ -; RUN: -polly-allow-nonaffine-loops=false '-passes=print<polly-detect>' -disable-output \ -; RUN: < %s 2>&1| FileCheck %s --check-prefix=REJECTNONAFFINELOOPS -; RUN: opt %loadNPMPolly \ -; RUN: -pass-remarks-missed="polly-detect" -polly-detect-track-failures \ -; RUN: -polly-allow-nonaffine-loops=true '-passes=print<polly-detect>' -disable-output \ -; RUN: < %s 2>&1| FileCheck %s --check-prefix=ALLOWNONAFFINELOOPS -; RUN: opt %loadNPMPolly -pass-remarks-missed="polly-detect" \ -; RUN: -polly-process-unprofitable=false \ -; RUN: -polly-detect-track-failures -polly-allow-nonaffine-loops=true \ -; RUN: -polly-allow-nonaffine '-passes=print<polly-detect>' -disable-output < %s 2>&1 \ -; RUN: | FileCheck %s --check-prefix=ALLOWNONAFFINEALL +; RUN: opt %loadNPMPolly -pass-remarks-missed=polly-detect -polly-detect-track-failures -polly-allow-nonaffine-loops=false '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s --check-prefix=REJECTNONAFFINELOOPS +; RUN: opt %loadNPMPolly -pass-remarks-missed=polly-detect -polly-detect-track-failures -polly-allow-nonaffine-loops=true '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s --check-prefix=ALLOWNONAFFINELOOPS +; RUN: opt %loadNPMPolly -pass-remarks-missed=polly-detect -polly-process-unprofitable=false -polly-detect-track-failures -polly-allow-nonaffine-loops=true -polly-allow-nonaffine '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s --check-prefix=ALLOWNONAFFINEALL ; void f(int A[], int n) { ; for (int i = 0; i < A[n+i]; i++) diff --git a/polly/test/ScopDetectionDiagnostics/ReportLoopHasNoExit.ll b/polly/test/ScopDetectionDiagnostics/ReportLoopHasNoExit.ll index 6156efaea1909..d80911cc0ec9a 100644 --- a/polly/test/ScopDetectionDiagnostics/ReportLoopHasNoExit.ll +++ b/polly/test/ScopDetectionDiagnostics/ReportLoopHasNoExit.ll @@ -4,8 +4,8 @@ ; the PostDominatorTree. Infinite loops are postdominated only by the virtual ; root, which causes them not to appear in regions in ScopDetection anymore. 
-; RUN: opt %loadNPMPolly -pass-remarks-missed="polly-detect" -polly-allow-nonaffine-loops '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s -; RUN: opt %loadNPMPolly -pass-remarks-missed="polly-detect" -polly-allow-nonaffine-loops=false '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -pass-remarks-missed=polly-detect -polly-allow-nonaffine-loops '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -pass-remarks-missed=polly-detect -polly-allow-nonaffine-loops=false '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s ; void func (int param0, int N, int *A) ; { diff --git a/polly/test/ScopDetectionDiagnostics/ReportMultipleNonAffineAccesses.ll b/polly/test/ScopDetectionDiagnostics/ReportMultipleNonAffineAccesses.ll index dd95bd6ede715..d8c2916cc23bb 100644 --- a/polly/test/ScopDetectionDiagnostics/ReportMultipleNonAffineAccesses.ll +++ b/polly/test/ScopDetectionDiagnostics/ReportMultipleNonAffineAccesses.ll @@ -1,9 +1,9 @@ -; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -pass-remarks-missed="polly-detect" -polly-detect-track-failures '-passes=print<polly-detect>' -disable-output < %s 2>&1| FileCheck %s -; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -pass-remarks-missed="polly-detect" -polly-detect-track-failures '-passes=print<polly-detect>' -polly-delinearize=false -polly-detect-keep-going -disable-output < %s 2>&1| FileCheck %s -check-prefix=ALL -; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -pass-remarks-missed="polly-detect" -polly-detect-track-failures '-passes=print<polly-detect>' -disable-output < %s 2>&1| FileCheck %s -check-prefix=DELIN -; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -pass-remarks-missed="polly-detect" -polly-detect-track-failures '-passes=print<polly-detect>' -polly-detect-keep-going -disable-output < %s 2>&1| FileCheck %s -check-prefix=DELIN-ALL -; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -pass-remarks-missed="polly-detect" -polly-detect-track-failures '-passes=print<polly-detect>' -polly-allow-nonaffine -disable-output < %s 2>&1| FileCheck %s -check-prefix=NONAFFINE -; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -pass-remarks-missed="polly-detect" -polly-detect-track-failures '-passes=print<polly-detect>' -polly-allow-nonaffine -disable-output < %s 2>&1| FileCheck %s -check-prefix=NONAFFINE +; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -pass-remarks-missed=polly-detect -polly-detect-track-failures '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -pass-remarks-missed=polly-detect -polly-detect-track-failures '-passes=polly-custom<detect>' -polly-print-detect -polly-delinearize=false -polly-detect-keep-going -disable-output < %s 2>&1 | FileCheck %s -check-prefix=ALL +; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -pass-remarks-missed=polly-detect -polly-detect-track-failures '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s -check-prefix=DELIN +; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -pass-remarks-missed=polly-detect -polly-detect-track-failures '-passes=polly-custom<detect>' -polly-print-detect -polly-detect-keep-going -disable-output < %s 2>&1 | FileCheck %s -check-prefix=DELIN-ALL +; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -pass-remarks-missed=polly-detect -polly-detect-track-failures '-passes=polly-custom<detect>' -polly-print-detect 
-polly-allow-nonaffine -disable-output < %s 2>&1 | FileCheck %s -check-prefix=NONAFFINE +; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -pass-remarks-missed=polly-detect -polly-detect-track-failures '-passes=polly-custom<detect>' -polly-print-detect -polly-allow-nonaffine -disable-output < %s 2>&1 | FileCheck %s -check-prefix=NONAFFINE ; 1 void manyaccesses(float A[restrict], long n, float B[restrict][n]) ; 2 { diff --git a/polly/test/ScopDetectionDiagnostics/ReportNonAffineAccess-01.ll b/polly/test/ScopDetectionDiagnostics/ReportNonAffineAccess-01.ll index 13ac9d5ace2d3..ee0aa743f434b 100644 --- a/polly/test/ScopDetectionDiagnostics/ReportNonAffineAccess-01.ll +++ b/polly/test/ScopDetectionDiagnostics/ReportNonAffineAccess-01.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -pass-remarks-missed="polly-detect" -polly-detect-track-failures '-passes=print<polly-detect>' -disable-output < %s 2>&1| FileCheck %s +; RUN: opt %loadNPMPolly -pass-remarks-missed=polly-detect -polly-detect-track-failures '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s ; void f(int A[]) { ; for(int i=0; i<42; ++i) diff --git a/polly/test/ScopDetectionDiagnostics/ReportUnprofitable.ll b/polly/test/ScopDetectionDiagnostics/ReportUnprofitable.ll index 93e9e8b14038b..ad2c813c4b7ce 100644 --- a/polly/test/ScopDetectionDiagnostics/ReportUnprofitable.ll +++ b/polly/test/ScopDetectionDiagnostics/ReportUnprofitable.ll @@ -1,10 +1,6 @@ -; RUN: opt %loadNPMPolly -pass-remarks-missed="polly-detect" \ -; RUN: -polly-detect-track-failures '-passes=print<polly-detect>' -disable-output \ -; RUN: -polly-process-unprofitable=false < %s 2>&1| FileCheck %s +; RUN: opt %loadNPMPolly -pass-remarks-missed=polly-detect -polly-detect-track-failures '-passes=polly-custom<detect>' -polly-print-detect -disable-output -polly-process-unprofitable=false < %s 2>&1 | FileCheck %s -; RUN: opt %loadNPMPolly -pass-remarks-missed="polly-detect" \ -; RUN: -polly-detect-track-failures '-passes=print<polly-detect>' -disable-output \ -; RUN: -polly-process-unprofitable=false < %s 2>&1 -pass-remarks-output=%t.yaml +; RUN: opt %loadNPMPolly -pass-remarks-missed=polly-detect -polly-detect-track-failures '-passes=polly-custom<detect>' -polly-print-detect -disable-output -polly-process-unprofitable=false -pass-remarks-output=%t.yaml < %s 2>&1 ; RUN: cat %t.yaml | FileCheck -check-prefix=YAML %s target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" diff --git a/polly/test/ScopDetectionDiagnostics/ReportUnreachableInExit.ll b/polly/test/ScopDetectionDiagnostics/ReportUnreachableInExit.ll index d110cfefc27dd..d97032c8f8eaf 100644 --- a/polly/test/ScopDetectionDiagnostics/ReportUnreachableInExit.ll +++ b/polly/test/ScopDetectionDiagnostics/ReportUnreachableInExit.ll @@ -1,5 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s \ -; RUN: -pass-remarks-missed="polly-detect" 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output -pass-remarks-missed=polly-detect < %s 2>&1 | FileCheck %s ; void f(long A[], long N) { ; long i; diff --git a/polly/test/ScopDetectionDiagnostics/ReportVariantBasePtr-01.ll b/polly/test/ScopDetectionDiagnostics/ReportVariantBasePtr-01.ll index 5f296fae9532b..7a5025c0c2fbe 100644 --- a/polly/test/ScopDetectionDiagnostics/ReportVariantBasePtr-01.ll +++ b/polly/test/ScopDetectionDiagnostics/ReportVariantBasePtr-01.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -pass-remarks-missed="polly-detect" 
-polly-detect-track-failures '-passes=print<polly-detect>' -disable-output < %s 2>&1| FileCheck %s +; RUN: opt %loadNPMPolly -pass-remarks-missed=polly-detect -polly-detect-track-failures '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s ; struct b { ; double **b; diff --git a/polly/test/ScopDetectionDiagnostics/loop_has_multiple_exits.ll b/polly/test/ScopDetectionDiagnostics/loop_has_multiple_exits.ll index 3cdeed13ec285..e15c045907ddf 100644 --- a/polly/test/ScopDetectionDiagnostics/loop_has_multiple_exits.ll +++ b/polly/test/ScopDetectionDiagnostics/loop_has_multiple_exits.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -pass-remarks-missed="polly-detect" -polly-detect-track-failures '-passes=print<polly-detect>' -disable-output 2>&1 < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly -pass-remarks-missed=polly-detect -polly-detect-track-failures '-passes=polly-custom<detect>' -polly-print-detect -disable-output 2>&1 < %s | FileCheck %s -match-full-lines ; ; Derived from test-suite/MultiSource/Benchmarks/BitBench/uuencode/uuencode.c ; diff --git a/polly/test/ScopDetectionDiagnostics/loop_partially_in_scop-2.ll b/polly/test/ScopDetectionDiagnostics/loop_partially_in_scop-2.ll index 4a9a200d67dfd..b5918d9f7a2d4 100644 --- a/polly/test/ScopDetectionDiagnostics/loop_partially_in_scop-2.ll +++ b/polly/test/ScopDetectionDiagnostics/loop_partially_in_scop-2.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -pass-remarks-missed="polly-detect" -disable-output < %s 2>&1| FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -pass-remarks-missed=polly-detect -disable-output < %s 2>&1 | FileCheck %s ; CHECK: remark: <unknown>:0:0: Loop cannot be handled because not all latches are part of loop region. diff --git a/polly/test/ScopDetectionDiagnostics/loop_partially_in_scop.ll b/polly/test/ScopDetectionDiagnostics/loop_partially_in_scop.ll index 61ff033d9f934..502abf8dab6d7 100644 --- a/polly/test/ScopDetectionDiagnostics/loop_partially_in_scop.ll +++ b/polly/test/ScopDetectionDiagnostics/loop_partially_in_scop.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -pass-remarks-missed="polly-detect" -disable-output < %s 2>&1| FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -pass-remarks-missed=polly-detect -disable-output < %s 2>&1 | FileCheck %s ; CHECK: remark: <unknown>:0:0: Loop cannot be handled because not all latches are part of loop region. ; CHECK: remark: <unknown>:0:0: Loop cannot be handled because not all latches are part of loop region. diff --git a/polly/test/ScopInfo/20110312-Fail-without-basicaa.ll b/polly/test/ScopInfo/20110312-Fail-without-basicaa.ll index c5efec3f50c58..accb562771819 100644 --- a/polly/test/ScopInfo/20110312-Fail-without-basicaa.ll +++ b/polly/test/ScopInfo/20110312-Fail-without-basicaa.ll @@ -1,5 +1,5 @@ ; This should be run without alias analysis enabled. 
-;RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32-n8:16:32" define i32 @main() nounwind { diff --git a/polly/test/ScopInfo/20111108-Parameter-not-detected.ll b/polly/test/ScopInfo/20111108-Parameter-not-detected.ll index 81c7efb963652..57ae977a1a13f 100644 --- a/polly/test/ScopInfo/20111108-Parameter-not-detected.ll +++ b/polly/test/ScopInfo/20111108-Parameter-not-detected.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" declare void @foo() diff --git a/polly/test/ScopInfo/2012-03-16-Crash-because-of-unsigned-in-scev.ll b/polly/test/ScopInfo/2012-03-16-Crash-because-of-unsigned-in-scev.ll index 5abf8ff29ef85..3cb63cc4f952c 100644 --- a/polly/test/ScopInfo/2012-03-16-Crash-because-of-unsigned-in-scev.ll +++ b/polly/test/ScopInfo/2012-03-16-Crash-because-of-unsigned-in-scev.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s target datalayout = "e-p:32:32:32-i64:64:64-i32:32:32-i16:16:16-i1:32:32-f64:64:64-f32:32:32-a0:0-n32" diff --git a/polly/test/ScopInfo/2015-10-04-Crash-in-domain-generation.ll b/polly/test/ScopInfo/2015-10-04-Crash-in-domain-generation.ll index d16ba453f9815..668fcd8fabcaf 100644 --- a/polly/test/ScopInfo/2015-10-04-Crash-in-domain-generation.ll +++ b/polly/test/ScopInfo/2015-10-04-Crash-in-domain-generation.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-allow-nonaffine-loops '-passes=print<polly-function-scops>' -disable-output < %s +; RUN: opt %loadNPMPolly -polly-allow-nonaffine-loops '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" diff --git a/polly/test/ScopInfo/Alias-0.ll b/polly/test/ScopInfo/Alias-0.ll index ebbe744627ef8..50c1b65727eaf 100644 --- a/polly/test/ScopInfo/Alias-0.ll +++ b/polly/test/ScopInfo/Alias-0.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s -stats 2>&1 | FileCheck %s --check-prefix=RTA -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-use-runtime-alias-checks=false -disable-output < %s -stats 2>&1 | FileCheck %s --check-prefix=NORTA +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output -stats < %s 2>&1 | FileCheck %s --check-prefix=RTA +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-use-runtime-alias-checks=false -disable-output -stats < %s 2>&1 | FileCheck %s --check-prefix=NORTA ; REQUIRES: asserts target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" diff --git a/polly/test/ScopInfo/Alias-1.ll b/polly/test/ScopInfo/Alias-1.ll index b1711c25857d0..15fd6c936fc47 100644 --- 
a/polly/test/ScopInfo/Alias-1.ll +++ b/polly/test/ScopInfo/Alias-1.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s -stats 2>&1 | FileCheck %s --check-prefix=RTA -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-use-runtime-alias-checks=false -disable-output < %s -stats 2>&1 | FileCheck %s --check-prefix=NORTA +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output -stats < %s 2>&1 | FileCheck %s --check-prefix=RTA +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-use-runtime-alias-checks=false -disable-output -stats < %s 2>&1 | FileCheck %s --check-prefix=NORTA ; REQUIRES: asserts target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" diff --git a/polly/test/ScopInfo/Alias-2.ll b/polly/test/ScopInfo/Alias-2.ll index b94f130c94ebd..598ad0fe8cf1c 100644 --- a/polly/test/ScopInfo/Alias-2.ll +++ b/polly/test/ScopInfo/Alias-2.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s -stats 2>&1 | FileCheck %s --check-prefix=RTA -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-use-runtime-alias-checks=false -disable-output < %s -stats 2>&1 | FileCheck %s --check-prefix=NORTA +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output -stats < %s 2>&1 | FileCheck %s --check-prefix=RTA +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-use-runtime-alias-checks=false -disable-output -stats < %s 2>&1 | FileCheck %s --check-prefix=NORTA ; REQUIRES: asserts target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" diff --git a/polly/test/ScopInfo/Alias-3.ll b/polly/test/ScopInfo/Alias-3.ll index af7816546b4ab..388a2defec395 100644 --- a/polly/test/ScopInfo/Alias-3.ll +++ b/polly/test/ScopInfo/Alias-3.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s -stats 2>&1 | FileCheck %s --check-prefix=RTA -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-use-runtime-alias-checks=false -disable-output < %s -stats 2>&1 | FileCheck %s --check-prefix=NORTA +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output -stats < %s 2>&1 | FileCheck %s --check-prefix=RTA +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-use-runtime-alias-checks=false -disable-output -stats < %s 2>&1 | FileCheck %s --check-prefix=NORTA ; REQUIRES: asserts target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" diff --git a/polly/test/ScopInfo/Alias-4.ll b/polly/test/ScopInfo/Alias-4.ll index fe651c87b241c..e9f4f95a9997f 100644 --- a/polly/test/ScopInfo/Alias-4.ll +++ b/polly/test/ScopInfo/Alias-4.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly -aa-pipeline= '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s -stats 2>&1 | FileCheck %s --check-prefix=RTA -; RUN: opt %loadNPMPolly -aa-pipeline= '-passes=print<polly-detect>,print<polly-function-scops>' -polly-use-runtime-alias-checks=false -disable-output < %s -stats 2>&1 | FileCheck %s --check-prefix=NORTA +; RUN: opt %loadNPMPolly -aa-pipeline= 
'-passes=polly-custom<scops>' -polly-print-detect -polly-print-scops -disable-output -stats < %s 2>&1 | FileCheck %s --check-prefix=RTA +; RUN: opt %loadNPMPolly -aa-pipeline= '-passes=polly-custom<scops>' -polly-print-detect -polly-print-scops -polly-use-runtime-alias-checks=false -disable-output -stats < %s 2>&1 | FileCheck %s --check-prefix=NORTA ; REQUIRES: asserts target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" diff --git a/polly/test/ScopInfo/BoundChecks/single-loop.ll b/polly/test/ScopInfo/BoundChecks/single-loop.ll index 0b69beaaf3f9c..d44c18cf49e36 100644 --- a/polly/test/ScopInfo/BoundChecks/single-loop.ll +++ b/polly/test/ScopInfo/BoundChecks/single-loop.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s -; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=AST +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s 2>&1 | FileCheck %s --check-prefix=AST ; ; This only works after the post-dominator tree has been fixed. ; diff --git a/polly/test/ScopInfo/BoundChecks/two-loops.ll b/polly/test/ScopInfo/BoundChecks/two-loops.ll index f2ba17d33c0ea..9034f75f13792 100644 --- a/polly/test/ScopInfo/BoundChecks/two-loops.ll +++ b/polly/test/ScopInfo/BoundChecks/two-loops.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output< %s 2>&1 | FileCheck %s -; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=AST +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s 2>&1 | FileCheck %s --check-prefix=AST ; ; This only works after the post-dominator tree has been fixed.
; XFAIL: * diff --git a/polly/test/ScopInfo/NonAffine/div_backedge.ll b/polly/test/ScopInfo/NonAffine/div_backedge.ll index 3b0c673ece38b..e8edad9494075 100644 --- a/polly/test/ScopInfo/NonAffine/div_backedge.ll +++ b/polly/test/ScopInfo/NonAffine/div_backedge.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; ; void foo(float *A) { ; for (long i = 1;; i++) { diff --git a/polly/test/ScopInfo/NonAffine/div_domain.ll b/polly/test/ScopInfo/NonAffine/div_domain.ll index 34a5cecdfe3df..c195bb42dac9f 100644 --- a/polly/test/ScopInfo/NonAffine/div_domain.ll +++ b/polly/test/ScopInfo/NonAffine/div_domain.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; ; void foo(float *A) { ; for (long i = 0; i < 16; i++) { diff --git a/polly/test/ScopInfo/NonAffine/invariant_loads_dependent_in_non_affine_region.ll b/polly/test/ScopInfo/NonAffine/invariant_loads_dependent_in_non_affine_region.ll index 7d02fae7f98f3..31ecdaa0ef3e4 100644 --- a/polly/test/ScopInfo/NonAffine/invariant_loads_dependent_in_non_affine_region.ll +++ b/polly/test/ScopInfo/NonAffine/invariant_loads_dependent_in_non_affine_region.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-invariant-load-hoisting=true '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-invariant-load-hoisting=true '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; ; void f(int *A, int *B, int *C) { ; for (int i = 0; i < 1000; i++) diff --git a/polly/test/ScopInfo/NonAffine/modulo_backedge.ll b/polly/test/ScopInfo/NonAffine/modulo_backedge.ll index d5c808d9021f2..e0cd1e51a095c 100644 --- a/polly/test/ScopInfo/NonAffine/modulo_backedge.ll +++ b/polly/test/ScopInfo/NonAffine/modulo_backedge.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; ; CHECK: Domain := ; CHECK: { Stmt_for_body[i0] : 0 <= i0 <= 6 }; diff --git a/polly/test/ScopInfo/NonAffine/modulo_domain.ll b/polly/test/ScopInfo/NonAffine/modulo_domain.ll index 13fe53f11633d..53bbe15799e61 100644 --- a/polly/test/ScopInfo/NonAffine/modulo_domain.ll +++ b/polly/test/ScopInfo/NonAffine/modulo_domain.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; ; TODO: The new domain generation cannot handle modulo domain constraints, ; hence modulo handling has been disabled completely. 
Once this is diff --git a/polly/test/ScopInfo/NonAffine/non-affine-loop-condition-dependent-access_1.ll b/polly/test/ScopInfo/NonAffine/non-affine-loop-condition-dependent-access_1.ll index 2b8427d74ec84..7d34ef9644b5a 100644 --- a/polly/test/ScopInfo/NonAffine/non-affine-loop-condition-dependent-access_1.ll +++ b/polly/test/ScopInfo/NonAffine/non-affine-loop-condition-dependent-access_1.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s -check-prefix=SCALAR -; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true -polly-process-unprofitable=false '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s -check-prefix=PROFIT +; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true '-passes=polly-custom<scops>' -polly-print-detect -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s -check-prefix=SCALAR +; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true -polly-process-unprofitable=false '-passes=polly-custom<scops>' -polly-print-detect -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s -check-prefix=PROFIT ; ; SCALAR: Function: f ; SCALAR-NEXT: Region: %bb1---%bb13 diff --git a/polly/test/ScopInfo/NonAffine/non-affine-loop-condition-dependent-access_2.ll b/polly/test/ScopInfo/NonAffine/non-affine-loop-condition-dependent-access_2.ll index 30f756e81e474..a40afdde1237f 100644 --- a/polly/test/ScopInfo/NonAffine/non-affine-loop-condition-dependent-access_2.ll +++ b/polly/test/ScopInfo/NonAffine/non-affine-loop-condition-dependent-access_2.ll @@ -1,6 +1,6 @@ -; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=false '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=INNERMOST -; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=INNERMOST -; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true -polly-allow-nonaffine '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=ALL +; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=false '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s --check-prefix=INNERMOST +; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s --check-prefix=INNERMOST +; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true -polly-allow-nonaffine '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s --check-prefix=ALL ; ; Here we have a non-affine loop (in the context of the loop nest) ; and also a non-affine access (A[k]). 
While we can always model the diff --git a/polly/test/ScopInfo/NonAffine/non-affine-loop-condition-dependent-access_3.ll b/polly/test/ScopInfo/NonAffine/non-affine-loop-condition-dependent-access_3.ll index 6dacd719862ef..f3678d3245f57 100644 --- a/polly/test/ScopInfo/NonAffine/non-affine-loop-condition-dependent-access_3.ll +++ b/polly/test/ScopInfo/NonAffine/non-affine-loop-condition-dependent-access_3.ll @@ -1,6 +1,6 @@ -; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=false '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=INNERMOST -; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=INNERMOST -; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true -polly-allow-nonaffine '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=ALL +; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=false '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s --check-prefix=INNERMOST +; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s --check-prefix=INNERMOST +; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true -polly-allow-nonaffine '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s --check-prefix=ALL ; ; Here we have a non-affine loop (in the context of the loop nest) ; and also a non-affine access (A[k]). 
While we can always model the diff --git a/polly/test/ScopInfo/NonAffine/non_affine_access_with_range_2.ll b/polly/test/ScopInfo/NonAffine/non_affine_access_with_range_2.ll index 8a13f791ed6de..85a1081159d59 100644 --- a/polly/test/ScopInfo/NonAffine/non_affine_access_with_range_2.ll +++ b/polly/test/ScopInfo/NonAffine/non_affine_access_with_range_2.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-allow-nonaffine '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-allow-nonaffine '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; ; void f(int *A) { ; for (int i = 0; i < 128; i++) diff --git a/polly/test/ScopInfo/NonAffine/non_affine_but_sdiv.ll b/polly/test/ScopInfo/NonAffine/non_affine_but_sdiv.ll index 1e70d2c9db87e..65513a5d9d1fb 100644 --- a/polly/test/ScopInfo/NonAffine/non_affine_but_sdiv.ll +++ b/polly/test/ScopInfo/NonAffine/non_affine_but_sdiv.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; ; CHECK: Statements { ; CHECK-NEXT: Stmt_for_body diff --git a/polly/test/ScopInfo/NonAffine/non_affine_but_srem.ll b/polly/test/ScopInfo/NonAffine/non_affine_but_srem.ll index dcfaa9280dcb8..0185774d6274c 100644 --- a/polly/test/ScopInfo/NonAffine/non_affine_but_srem.ll +++ b/polly/test/ScopInfo/NonAffine/non_affine_but_srem.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; ; void pos(float *A, long n) { ; for (long i = 0; i < 100; i++) diff --git a/polly/test/ScopInfo/NonAffine/non_affine_conditional_nested.ll b/polly/test/ScopInfo/NonAffine/non_affine_conditional_nested.ll index 24bfe60502163..ab47dc0b78260 100644 --- a/polly/test/ScopInfo/NonAffine/non_affine_conditional_nested.ll +++ b/polly/test/ScopInfo/NonAffine/non_affine_conditional_nested.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-allow-nonaffine-branches '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-allow-nonaffine-branches '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; ; void f(int *A) { ; for (int i = 0; i < 1024; i++) diff --git a/polly/test/ScopInfo/NonAffine/non_affine_conditional_surrounding_affine_loop.ll b/polly/test/ScopInfo/NonAffine/non_affine_conditional_surrounding_affine_loop.ll index 931ad36d15f34..51a7d54562780 100644 --- a/polly/test/ScopInfo/NonAffine/non_affine_conditional_surrounding_affine_loop.ll +++ b/polly/test/ScopInfo/NonAffine/non_affine_conditional_surrounding_affine_loop.ll @@ -1,12 +1,5 @@ -; RUN: opt %loadNPMPolly -polly-allow-nonaffine-branches \ -; RUN: -polly-invariant-load-hoisting=true \ -; RUN: -polly-allow-nonaffine-loops=true \ -; RUN: '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=INNERMOST -; RUN: opt %loadNPMPolly -polly-allow-nonaffine \ -; RUN: -polly-invariant-load-hoisting=true \ -; RUN: -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true \ -; RUN: '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s \ -; RUN: --check-prefix=ALL +; RUN: opt %loadNPMPolly -polly-allow-nonaffine-branches 
-polly-invariant-load-hoisting=true -polly-allow-nonaffine-loops=true '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s --check-prefix=INNERMOST +; RUN: opt %loadNPMPolly -polly-allow-nonaffine -polly-invariant-load-hoisting=true -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s --check-prefix=ALL ; ; Negative test for INNERMOST. ; At the moment we will optimistically assume A[i] in the conditional before the inner diff --git a/polly/test/ScopInfo/NonAffine/non_affine_conditional_surrounding_non_affine_loop.ll b/polly/test/ScopInfo/NonAffine/non_affine_conditional_surrounding_non_affine_loop.ll index 37b51cebd74d5..b1f7e65e9dd25 100644 --- a/polly/test/ScopInfo/NonAffine/non_affine_conditional_surrounding_non_affine_loop.ll +++ b/polly/test/ScopInfo/NonAffine/non_affine_conditional_surrounding_non_affine_loop.ll @@ -1,16 +1,6 @@ -; RUN: opt %loadNPMPolly -polly-allow-nonaffine-branches \ -; RUN: -polly-invariant-load-hoisting=true \ -; RUN: -polly-allow-nonaffine-loops=true \ -; RUN: '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=INNERMOST -; RUN: opt %loadNPMPolly -polly-allow-nonaffine \ -; RUN: -polly-invariant-load-hoisting=true \ -; RUN: -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true \ -; RUN: '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=ALL -; RUN: opt %loadNPMPolly -polly-allow-nonaffine \ -; RUN: -polly-invariant-load-hoisting=true \ -; RUN: -polly-process-unprofitable=false \ -; RUN: -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true \ -; RUN: '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=PROFIT +; RUN: opt %loadNPMPolly -polly-allow-nonaffine-branches -polly-invariant-load-hoisting=true -polly-allow-nonaffine-loops=true '-passes=polly-custom<scops>' -polly-print-detect -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s --check-prefix=INNERMOST +; RUN: opt %loadNPMPolly -polly-allow-nonaffine -polly-invariant-load-hoisting=true -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true '-passes=polly-custom<scops>' -polly-print-detect -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s --check-prefix=ALL +; RUN: opt %loadNPMPolly -polly-allow-nonaffine -polly-invariant-load-hoisting=true -polly-process-unprofitable=false -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true '-passes=polly-custom<scops>' -polly-print-detect -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s --check-prefix=PROFIT ; ; Negative test for INNERMOST. 
; At the moment we will optimistically assume A[i] in the conditional before the inner diff --git a/polly/test/ScopInfo/NonAffine/non_affine_float_compare.ll b/polly/test/ScopInfo/NonAffine/non_affine_float_compare.ll index 7bfd7f86efcdb..ac77dfb7454d3 100644 --- a/polly/test/ScopInfo/NonAffine/non_affine_float_compare.ll +++ b/polly/test/ScopInfo/NonAffine/non_affine_float_compare.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-allow-nonaffine-branches '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-allow-nonaffine-branches '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; ; void f(float *A) { ; for (int i = 0; i < 1024; i++) diff --git a/polly/test/ScopInfo/NonAffine/non_affine_loop_condition.ll b/polly/test/ScopInfo/NonAffine/non_affine_loop_condition.ll index fc779d544e62f..db08544aa559c 100644 --- a/polly/test/ScopInfo/NonAffine/non_affine_loop_condition.ll +++ b/polly/test/ScopInfo/NonAffine/non_affine_loop_condition.ll @@ -1,6 +1,6 @@ -; RUN: opt %loadNPMPolly -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s -; RUN: opt %loadNPMPolly -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops -polly-process-unprofitable=false '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=PROFIT -; RUN: opt %loadNPMPolly -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops -polly-detect-reductions=false '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s -check-prefix=NO-REDUCTION +; RUN: opt %loadNPMPolly -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops '-passes=polly-custom<scops>' -polly-print-detect -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops -polly-process-unprofitable=false '-passes=polly-custom<scops>' -polly-print-detect -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s --check-prefix=PROFIT +; RUN: opt %loadNPMPolly -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops -polly-detect-reductions=false '-passes=polly-custom<scops>' -polly-print-detect -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s -check-prefix=NO-REDUCTION ; ; void f(int *A, int *C) { ; for (int i = 0; i < 1024; i++) { diff --git a/polly/test/ScopInfo/NonAffine/non_affine_loop_used_later.ll b/polly/test/ScopInfo/NonAffine/non_affine_loop_used_later.ll index 63ff354d7e5f7..cde2dc495d549 100644 --- a/polly/test/ScopInfo/NonAffine/non_affine_loop_used_later.ll +++ b/polly/test/ScopInfo/NonAffine/non_affine_loop_used_later.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly -polly-allow-nonaffine -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s -; RUN: opt %loadNPMPolly -polly-allow-nonaffine -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops -polly-unprofitable-scalar-accs=true -polly-process-unprofitable=false '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=PROFIT +; RUN: opt %loadNPMPolly -polly-allow-nonaffine -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops '-passes=polly-custom<scops>' -polly-print-detect -polly-print-scops -disable-output < %s 2>&1 | 
FileCheck %s +; RUN: opt %loadNPMPolly -polly-allow-nonaffine -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops -polly-unprofitable-scalar-accs=true -polly-process-unprofitable=false '-passes=polly-custom<scops>' -polly-print-detect -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s --check-prefix=PROFIT ; ; Verify that we over approximate the read access of A[j] in the last statement as j is ; computed in a non-affine loop we do not model. diff --git a/polly/test/ScopInfo/NonAffine/non_affine_parametric_loop.ll b/polly/test/ScopInfo/NonAffine/non_affine_parametric_loop.ll index d33befe2c66e0..ce4cc6189d45c 100644 --- a/polly/test/ScopInfo/NonAffine/non_affine_parametric_loop.ll +++ b/polly/test/ScopInfo/NonAffine/non_affine_parametric_loop.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" ; void foo(long n, double A[], int INDEX[]) { diff --git a/polly/test/ScopInfo/NonAffine/non_affine_region_guaranteed_non-entry.ll b/polly/test/ScopInfo/NonAffine/non_affine_region_guaranteed_non-entry.ll index 77c2df48d6514..b46ce87a45e2d 100644 --- a/polly/test/ScopInfo/NonAffine/non_affine_region_guaranteed_non-entry.ll +++ b/polly/test/ScopInfo/NonAffine/non_affine_region_guaranteed_non-entry.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-allow-nonaffine-loops -polly-detect '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-allow-nonaffine-loops -polly-detect '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; The SCoP contains a loop with multiple exit blocks (BBs after leaving ; the loop). The current implementation of deriving their domain derives diff --git a/polly/test/ScopInfo/NonAffine/whole-scop-non-affine-subregion-in-loop.ll b/polly/test/ScopInfo/NonAffine/whole-scop-non-affine-subregion-in-loop.ll index 9ed340d1d304b..58e5ccd9b6e36 100644 --- a/polly/test/ScopInfo/NonAffine/whole-scop-non-affine-subregion-in-loop.ll +++ b/polly/test/ScopInfo/NonAffine/whole-scop-non-affine-subregion-in-loop.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s ; ; Regression test that triggered a memory leak at some point (24947). ; diff --git a/polly/test/ScopInfo/aliasing_conditional_alias_groups_1.ll b/polly/test/ScopInfo/aliasing_conditional_alias_groups_1.ll index cbd024ba7a392..d94fc5f8a8823 100644 --- a/polly/test/ScopInfo/aliasing_conditional_alias_groups_1.ll +++ b/polly/test/ScopInfo/aliasing_conditional_alias_groups_1.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; ; Check that there is no alias group because we either access A or B never both. 
; diff --git a/polly/test/ScopInfo/aliasing_conditional_alias_groups_2.ll b/polly/test/ScopInfo/aliasing_conditional_alias_groups_2.ll index 3858d8a7bb1d6..df7f75dd8d95e 100644 --- a/polly/test/ScopInfo/aliasing_conditional_alias_groups_2.ll +++ b/polly/test/ScopInfo/aliasing_conditional_alias_groups_2.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; ; Check that we create two alias groups since the minimal/maximal accesses ; depend on %b. diff --git a/polly/test/ScopInfo/aliasing_dead_access.ll b/polly/test/ScopInfo/aliasing_dead_access.ll index 7baa3dce1f9db..0ebc39c0e5a78 100644 --- a/polly/test/ScopInfo/aliasing_dead_access.ll +++ b/polly/test/ScopInfo/aliasing_dead_access.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-detect -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; ; Check that we do not create a SCoP if there is no statement executed. ; diff --git a/polly/test/ScopInfo/aliasing_many_arrays_to_compare.ll b/polly/test/ScopInfo/aliasing_many_arrays_to_compare.ll index 7265aab22a490..8e5bab661e18c 100644 --- a/polly/test/ScopInfo/aliasing_many_arrays_to_compare.ll +++ b/polly/test/ScopInfo/aliasing_many_arrays_to_compare.ll @@ -1,8 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output \ -; RUN: < %s 2>&1 | FileCheck %s --check-prefix=FOUND -; RUN: opt %loadNPMPolly '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output \ -; RUN: -polly-rtc-max-arrays-per-group=3 < %s 2>&1 | FileCheck %s \ -; RUN: --check-prefix=IGNORED +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-detect -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s --check-prefix=FOUND +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-detect -polly-print-scops -disable-output -polly-rtc-max-arrays-per-group=3 < %s 2>&1 | FileCheck %s --check-prefix=IGNORED ; ; FOUND: Function: foo ; IGNORED-NOT: Function: foo diff --git a/polly/test/ScopInfo/aliasing_many_parameters_not_all_involved.ll b/polly/test/ScopInfo/aliasing_many_parameters_not_all_involved.ll index c7592bcb09fcf..aec6ea0bf1441 100644 --- a/polly/test/ScopInfo/aliasing_many_parameters_not_all_involved.ll +++ b/polly/test/ScopInfo/aliasing_many_parameters_not_all_involved.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadPolly -polly-analysis-computeout=0 -polly-print-scops -polly-rtc-max-parameters=8 -disable-output < %s | FileCheck %s --check-prefix=MAX8 -; RUN: opt %loadPolly -polly-analysis-computeout=0 -polly-print-scops -polly-rtc-max-parameters=7 -disable-output < %s | FileCheck %s --check-prefix=MAX7 +; RUN: opt %loadNPMPolly -polly-analysis-computeout=0 '-passes=polly-custom<scops>' -polly-print-scops -polly-rtc-max-parameters=8 -disable-output < %s | FileCheck %s --check-prefix=MAX8 +; RUN: opt %loadNPMPolly -polly-analysis-computeout=0 '-passes=polly-custom<scops>' -polly-print-scops -polly-rtc-max-parameters=7 -disable-output < %s | FileCheck %s --check-prefix=MAX7 ; ; Check that we allow this SCoP even though it has 10 parameters involved in possibly aliasing accesses. ; However, only 7 are involved in accesses through B, 8 through C and none in accesses through A. 
diff --git a/polly/test/ScopInfo/aliasing_many_read_only_acesses.ll b/polly/test/ScopInfo/aliasing_many_read_only_acesses.ll index d66a10bc511b1..a7dbe0baeae5d 100644 --- a/polly/test/ScopInfo/aliasing_many_read_only_acesses.ll +++ b/polly/test/ScopInfo/aliasing_many_read_only_acesses.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; ; CHECK: Assumed Context: ; CHECK-NEXT: { : } diff --git a/polly/test/ScopInfo/aliasing_multiple_alias_groups.ll b/polly/test/ScopInfo/aliasing_multiple_alias_groups.ll index 9943802ec8595..db54a1687b4d5 100644 --- a/polly/test/ScopInfo/aliasing_multiple_alias_groups.ll +++ b/polly/test/ScopInfo/aliasing_multiple_alias_groups.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output -aa-pipeline= < %s 2>&1 | FileCheck %s --check-prefix=NOAA -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output -aa-pipeline=tbaa < %s 2>&1 | FileCheck %s --check-prefix=TBAA +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output -aa-pipeline= < %s 2>&1 | FileCheck %s --check-prefix=NOAA +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output -aa-pipeline=tbaa < %s 2>&1 | FileCheck %s --check-prefix=TBAA ; ; void jd(int *Int0, int *Int1, float *Float0, float *Float1) { ; for (int i = 0; i < 1024; i++) { diff --git a/polly/test/ScopInfo/aliasing_with_non_affine_access.ll b/polly/test/ScopInfo/aliasing_with_non_affine_access.ll index 900d5d40d96f5..0001b8adb41e1 100644 --- a/polly/test/ScopInfo/aliasing_with_non_affine_access.ll +++ b/polly/test/ScopInfo/aliasing_with_non_affine_access.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -polly-process-unprofitable -polly-allow-nonaffine -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -polly-process-unprofitable -polly-allow-nonaffine -disable-output < %s 2>&1 | FileCheck %s ; ; @test1 ; Make sure we generate the correct aliasing check for a fixed-size memset operation. 
diff --git a/polly/test/ScopInfo/allow-all-parameters-dereferencable.ll b/polly/test/ScopInfo/allow-all-parameters-dereferencable.ll index 70c3c56fb3112..93253b7e65d4a 100644 --- a/polly/test/ScopInfo/allow-all-parameters-dereferencable.ll +++ b/polly/test/ScopInfo/allow-all-parameters-dereferencable.ll @@ -1,14 +1,9 @@ -; RUN: opt %loadNPMPolly -disable-output -polly-invariant-load-hoisting \ -; RUN: -polly-allow-dereference-of-all-function-parameters \ -; RUN: '-passes=print<polly-function-scops>' < %s 2>&1 | FileCheck %s --check-prefix=SCOP +; RUN: opt %loadNPMPolly -disable-output -polly-invariant-load-hoisting -polly-allow-dereference-of-all-function-parameters '-passes=polly-custom<scops>' -polly-print-scops < %s 2>&1 | FileCheck %s --check-prefix=SCOP -; RUN: opt %loadNPMPolly -S -polly-invariant-load-hoisting \ -; RUN: -passes=polly-codegen < %s 2>&1 | FileCheck %s --check-prefix=CODE-RTC +; RUN: opt %loadNPMPolly -S -polly-invariant-load-hoisting '-passes=polly<no-default-opts>' < %s 2>&1 | FileCheck %s --check-prefix=CODE-RTC -; RUN: opt %loadNPMPolly -S -polly-invariant-load-hoisting \ -; RUN: -polly-allow-dereference-of-all-function-parameters \ -; RUN: -passes=polly-codegen < %s 2>&1 | FileCheck %s --check-prefix=CODE +; RUN: opt %loadNPMPolly -S -polly-invariant-load-hoisting -polly-allow-dereference-of-all-function-parameters '-passes=polly<no-default-opts>' < %s 2>&1 | FileCheck %s --check-prefix=CODE ; SCOP: Function: hoge ; SCOP-NEXT: Region: %bb15---%bb37 diff --git a/polly/test/ScopInfo/assume_gep_bounds.ll b/polly/test/ScopInfo/assume_gep_bounds.ll index bd14e3868d525..994d49e5b887f 100644 --- a/polly/test/ScopInfo/assume_gep_bounds.ll +++ b/polly/test/ScopInfo/assume_gep_bounds.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; void foo(float A[][20][30], long n, long m, long p) { ; for (long i = 0; i < n; i++) diff --git a/polly/test/ScopInfo/assume_gep_bounds_2.ll b/polly/test/ScopInfo/assume_gep_bounds_2.ll index 7a8c1870abe25..be43be598bd3d 100644 --- a/polly/test/ScopInfo/assume_gep_bounds_2.ll +++ b/polly/test/ScopInfo/assume_gep_bounds_2.ll @@ -1,5 +1,4 @@ -; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 \ -; RUN: -polly-precise-inbounds | FileCheck %s +; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=polly-custom<scops>' -polly-print-scops -disable-output -polly-precise-inbounds < %s 2>&1 | FileCheck %s ; ; void foo(float A[restrict][20], float B[restrict][20], long n, long m, ; long p) { diff --git a/polly/test/ScopInfo/assume_gep_bounds_many.ll b/polly/test/ScopInfo/assume_gep_bounds_many.ll index 01fc12cd7f108..cfd9008741c3a 100644 --- a/polly/test/ScopInfo/assume_gep_bounds_many.ll +++ b/polly/test/ScopInfo/assume_gep_bounds_many.ll @@ -1,5 +1,4 @@ -; RUN: opt %loadNPMPolly -disable-output '-passes=print<polly-function-scops>' -polly-ignore-aliasing \ -; RUN: < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -disable-output '-passes=polly-custom<scops>' -polly-print-scops -polly-ignore-aliasing < %s 2>&1 | FileCheck %s ; CHECK: Assumed Context: ; CHECK-NEXT: [n1_a, n1_b, n1_c, n1_d, 
n2_a, n2_b, n2_c, n2_d, n3_a, n3_b, n3_c, n3_d, n4_a, n4_b, n4_c, n4_d, n5_a, n5_b, n5_c, n5_d, n6_a, n6_b, n6_c, n6_d, n7_a, n7_b, n7_c, n7_d, n8_a, n8_b, n8_c, n8_d, n9_a, n9_b, n9_c, n9_d, p1_b, p1_c, p1_d, p2_b, p2_c, p2_d, p3_b, p3_c, p3_d, p4_b, p4_c, p4_d, p5_b, p5_c, p5_d, p6_b, p6_c, p6_d, p7_b, p7_c, p7_d, p8_b, p8_c, p8_d, p9_b, p9_c, p9_d] -> { : p1_b >= n1_b and p1_c >= n1_c and p1_d >= n1_d and p2_b >= n2_b and p2_c >= n2_c and p2_d >= n2_d and p3_b >= n3_b and p3_c >= n3_c and p3_d >= n3_d and p4_b >= n4_b and p4_c >= n4_c and p4_d >= n4_d and p5_b >= n5_b and p5_c >= n5_c and p5_d >= n5_d and p6_b >= n6_b and p6_c >= n6_c and p6_d >= n6_d and p7_b >= n7_b and p7_c >= n7_c and p7_d >= n7_d and p8_b >= n8_b and p8_c >= n8_c and p8_d >= n8_d and p9_b >= n9_b and p9_c >= n9_c and p9_d >= n9_d } diff --git a/polly/test/ScopInfo/avoid_new_parameters_from_geps.ll b/polly/test/ScopInfo/avoid_new_parameters_from_geps.ll index 3fb7a1329c745..b3aa7686d3010 100644 --- a/polly/test/ScopInfo/avoid_new_parameters_from_geps.ll +++ b/polly/test/ScopInfo/avoid_new_parameters_from_geps.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; ; Check that we do not introduce a parameter here that is actually not needed. ; diff --git a/polly/test/ScopInfo/bool-addrec.ll b/polly/test/ScopInfo/bool-addrec.ll index 81fcade08f65a..01c6d52c30f76 100644 --- a/polly/test/ScopInfo/bool-addrec.ll +++ b/polly/test/ScopInfo/bool-addrec.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -disable-output '-passes=print<polly-ast>' -polly-process-unprofitable < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -disable-output '-passes=polly-custom<ast>' -polly-print-ast -polly-process-unprofitable < %s 2>&1 | FileCheck %s ; CHECK: for (int c0 = 0; c0 <= 19999; c0 += 1) { ; CHECK-NEXT: if (c0 % 2 == 0) diff --git a/polly/test/ScopInfo/bounded_loop_assumptions.ll b/polly/test/ScopInfo/bounded_loop_assumptions.ll index 5628092de7765..21ba391f4fc1a 100644 --- a/polly/test/ScopInfo/bounded_loop_assumptions.ll +++ b/polly/test/ScopInfo/bounded_loop_assumptions.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; ; The assumed context is tricky here as the equality test for the inner loop ; allows an "unbounded" loop trip count.
We assume that does not happen, thus diff --git a/polly/test/ScopInfo/branch-references-loop-scev-with-unknown-iterations-2.ll b/polly/test/ScopInfo/branch-references-loop-scev-with-unknown-iterations-2.ll index 83743e4e4ecc7..d25a8e666b525 100644 --- a/polly/test/ScopInfo/branch-references-loop-scev-with-unknown-iterations-2.ll +++ b/polly/test/ScopInfo/branch-references-loop-scev-with-unknown-iterations-2.ll @@ -1,8 +1,6 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | \ -; RUN: FileCheck %s -check-prefix=DETECT +; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s -check-prefix=DETECT -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | \ -; RUN: FileCheck %s -check-prefix=SCOP +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s -check-prefix=SCOP ; DETECT: Valid Region for Scop: loop => barrier ; DETECT-NEXT: Valid Region for Scop: branch => end diff --git a/polly/test/ScopInfo/branch-references-loop-scev-with-unknown-iterations-3.ll b/polly/test/ScopInfo/branch-references-loop-scev-with-unknown-iterations-3.ll index 9685ba37a49a1..91aa96e0f3501 100644 --- a/polly/test/ScopInfo/branch-references-loop-scev-with-unknown-iterations-3.ll +++ b/polly/test/ScopInfo/branch-references-loop-scev-with-unknown-iterations-3.ll @@ -1,8 +1,5 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | \ -; RUN: FileCheck %s -check-prefix=NONAFFINE -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -disable-output \ -; RUN: -polly-allow-nonaffine-branches=false < %s 2>&1 | \ -; RUN: FileCheck %s -check-prefix=NO-NONEAFFINE +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s -check-prefix=NONAFFINE +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<scops>' -polly-print-scops -disable-output -polly-allow-nonaffine-branches=false < %s 2>&1 | FileCheck %s -check-prefix=NO-NONEAFFINE ; NONAFFINE: Statements { ; NONAFFINE-NEXT: Stmt_loop diff --git a/polly/test/ScopInfo/branch-references-loop-scev-with-unknown-iterations.ll b/polly/test/ScopInfo/branch-references-loop-scev-with-unknown-iterations.ll index f41e6500fb30a..22a60c764eb4d 100644 --- a/polly/test/ScopInfo/branch-references-loop-scev-with-unknown-iterations.ll +++ b/polly/test/ScopInfo/branch-references-loop-scev-with-unknown-iterations.ll @@ -1,8 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 | \ -; RUN: FileCheck %s -check-prefix=NONAFFINE -; RUN: opt %loadNPMPolly '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output \ -; RUN: -polly-allow-nonaffine-branches=false < %s 2>&1 | \ -; RUN: FileCheck %s -check-prefix=NO-NONEAFFINE +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-detect -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s -check-prefix=NONAFFINE +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-detect -polly-print-scops -disable-output -polly-allow-nonaffine-branches=false < %s 2>&1 | FileCheck %s -check-prefix=NO-NONEAFFINE ; NONAFFINE-NOT: Statements diff --git a/polly/test/ScopInfo/bug_2010_10_22.ll b/polly/test/ScopInfo/bug_2010_10_22.ll index 71e7051922b53..1d248891dfd09 100644 
--- a/polly/test/ScopInfo/bug_2010_10_22.ll +++ b/polly/test/ScopInfo/bug_2010_10_22.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" diff --git a/polly/test/ScopInfo/bug_2011_1_5.ll b/polly/test/ScopInfo/bug_2011_1_5.ll index f4a24e06f46ae..7c76c3eaa565a 100644 --- a/polly/test/ScopInfo/bug_2011_1_5.ll +++ b/polly/test/ScopInfo/bug_2011_1_5.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s ; Bug description: Alias Analysis thinks IntToPtrInst aliases with alloca instructions created by IndependentBlocks Pass. ; This will trigger the assertion when we are verifying the SCoP after IndependentBlocks. diff --git a/polly/test/ScopInfo/bug_scev_not_fully_eval.ll b/polly/test/ScopInfo/bug_scev_not_fully_eval.ll index ed6bbafdac1f0..6e1ef2339a81d 100644 --- a/polly/test/ScopInfo/bug_scev_not_fully_eval.ll +++ b/polly/test/ScopInfo/bug_scev_not_fully_eval.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | not FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | not FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" @edge.8265 = external global [72 x i32], align 32 ; <ptr> [#uses=1] diff --git a/polly/test/ScopInfo/cfg_consequences.ll b/polly/test/ScopInfo/cfg_consequences.ll index 9161d3db4167a..2b702e235ca6c 100644 --- a/polly/test/ScopInfo/cfg_consequences.ll +++ b/polly/test/ScopInfo/cfg_consequences.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; ; void consequences(int *A, int bool_cond, int lhs, int rhs) { ; diff --git a/polly/test/ScopInfo/complex-branch-structure.ll b/polly/test/ScopInfo/complex-branch-structure.ll index de79c2226e68d..f48089afb93b9 100644 --- a/polly/test/ScopInfo/complex-branch-structure.ll +++ b/polly/test/ScopInfo/complex-branch-structure.ll @@ -1,5 +1,4 @@ -; RUN: opt %loadNPMPolly -pass-remarks-analysis="polly-scops" '-passes=print<polly-function-scops>' \ -; RUN: -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -pass-remarks-analysis=polly-scops '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; We build a scop of the following form to check that the domain construction ; does not take a huge amount of time, but that we instead just bail out. 
diff --git a/polly/test/ScopInfo/complex-condition.ll b/polly/test/ScopInfo/complex-condition.ll index c3b8d2bb0ef88..9164959c1f6dc 100644 --- a/polly/test/ScopInfo/complex-condition.ll +++ b/polly/test/ScopInfo/complex-condition.ll @@ -1,6 +1,4 @@ -; RUN: opt %loadNPMPolly -pass-remarks-analysis="polly-scops" '-passes=print<polly-function-scops>' \ -; RUN: -polly-invariant-load-hoisting=true \ -; RUN: -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -pass-remarks-analysis=polly-scops '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s ; ; CHECK: Low complexity assumption: { : false } ; diff --git a/polly/test/ScopInfo/complex-expression.ll b/polly/test/ScopInfo/complex-expression.ll index 4a2a1d2a64a6d..456edb04e0c2b 100644 --- a/polly/test/ScopInfo/complex-expression.ll +++ b/polly/test/ScopInfo/complex-expression.ll @@ -1,6 +1,4 @@ -; RUN: opt %loadNPMPolly -pass-remarks-analysis="polly-scops" '-passes=print<polly-function-scops>' \ -; RUN: -polly-invariant-load-hoisting=true \ -; RUN: -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -pass-remarks-analysis=polly-scops '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s ; ; This test case has an SCEVSMax expression with a very high arity. The ; piecewise affine function we would create for it would have a huge amount of diff --git a/polly/test/ScopInfo/complex-loop-nesting.ll b/polly/test/ScopInfo/complex-loop-nesting.ll index 36cb078f19fff..4ffd8689f1a4a 100644 --- a/polly/test/ScopInfo/complex-loop-nesting.ll +++ b/polly/test/ScopInfo/complex-loop-nesting.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; ; The SCoP contains a loop with multiple exit blocks (BBs after leaving ; the loop). The current implementation of deriving their domain derives diff --git a/polly/test/ScopInfo/complex-successor-structure-2.ll b/polly/test/ScopInfo/complex-successor-structure-2.ll index f4a78bf753853..32425d7598bc9 100644 --- a/polly/test/ScopInfo/complex-successor-structure-2.ll +++ b/polly/test/ScopInfo/complex-successor-structure-2.ll @@ -1,6 +1,4 @@ -; RUN: opt %loadNPMPolly -pass-remarks-analysis="polly-scops" '-passes=print<polly-function-scops>' \ -; RUN: -polly-invariant-load-hoisting=true \ -; RUN: -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -pass-remarks-analysis=polly-scops '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s ; We build a scop for the region for.body->B13. The CFG is of the following ; form and the branch conditions are build from "smax" SCEVs. 
However, in diff --git a/polly/test/ScopInfo/complex-successor-structure-3.ll b/polly/test/ScopInfo/complex-successor-structure-3.ll index 6da1fe3a8b9f3..c01eca534bcf1 100644 --- a/polly/test/ScopInfo/complex-successor-structure-3.ll +++ b/polly/test/ScopInfo/complex-successor-structure-3.ll @@ -1,5 +1,4 @@ -; RUN: opt %loadNPMPolly -disable-output '-passes=print<polly-function-scops>' \ -; RUN: -polly-invariant-load-hoisting=true < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -disable-output '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true < %s 2>&1 | FileCheck %s ; ; Check that propagation of domains from A(X) to A(X+1) will keep the ; domains small and concise. diff --git a/polly/test/ScopInfo/complex-successor-structure.ll b/polly/test/ScopInfo/complex-successor-structure.ll index 6c87ba3e98505..1b39f4cf192eb 100644 --- a/polly/test/ScopInfo/complex-successor-structure.ll +++ b/polly/test/ScopInfo/complex-successor-structure.ll @@ -1,6 +1,4 @@ -; RUN: opt %loadNPMPolly -pass-remarks-analysis="polly-scops" '-passes=print<polly-function-scops>' \ -; RUN: -polly-invariant-load-hoisting=true \ -; RUN: -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -pass-remarks-analysis=polly-scops '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s ; We build a scop from the region for.body->B13. The CFG is of the ; following form. The test checks that the condition construction does not take diff --git a/polly/test/ScopInfo/complex_domain_binary_condition.ll b/polly/test/ScopInfo/complex_domain_binary_condition.ll index 6e28c9dfee06a..42a114eaa6ec1 100644 --- a/polly/test/ScopInfo/complex_domain_binary_condition.ll +++ b/polly/test/ScopInfo/complex_domain_binary_condition.ll @@ -1,5 +1,4 @@ -; RUN: opt %loadNPMPolly -pass-remarks-analysis="polly-scops" '-passes=print<polly-function-scops>' \ -; RUN: -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -pass-remarks-analysis=polly-scops '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; ; CHECK: Low complexity assumption: { : false } ; diff --git a/polly/test/ScopInfo/complex_execution_context.ll b/polly/test/ScopInfo/complex_execution_context.ll index 9880a1dd67d19..9896fba8904b8 100644 --- a/polly/test/ScopInfo/complex_execution_context.ll +++ b/polly/test/ScopInfo/complex_execution_context.ll @@ -1,6 +1,4 @@ -; RUN: opt %loadNPMPolly -pass-remarks-analysis="polly-scops" '-passes=print<polly-function-scops>' \ -; RUN: -polly-invariant-load-hoisting=true \ -; RUN: -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -pass-remarks-analysis=polly-scops '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s ; ; CHECK: Low complexity assumption: ; diff --git a/polly/test/ScopInfo/cond_constant_in_loop.ll b/polly/test/ScopInfo/cond_constant_in_loop.ll index 552fddc6ff08c..ecc2767fd6ecd 100644 --- a/polly/test/ScopInfo/cond_constant_in_loop.ll +++ b/polly/test/ScopInfo/cond_constant_in_loop.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ;void f(long a[], long N, long M) { ; long i, j, k; diff --git a/polly/test/ScopInfo/cond_in_loop.ll b/polly/test/ScopInfo/cond_in_loop.ll index 
c06dcd955bac1..0f31904133719 100644 --- a/polly/test/ScopInfo/cond_in_loop.ll +++ b/polly/test/ScopInfo/cond_in_loop.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ;void f(long a[], long N, long M) { ; long i, j, k; diff --git a/polly/test/ScopInfo/condition-after-error-block-2.ll b/polly/test/ScopInfo/condition-after-error-block-2.ll index 8c4b2170ad69b..257b2ede236d9 100644 --- a/polly/test/ScopInfo/condition-after-error-block-2.ll +++ b/polly/test/ScopInfo/condition-after-error-block-2.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; Verify that we do not allow PHI nodes such as %phi, if they reference an error ; block and are used by anything else than a terminator instruction. diff --git a/polly/test/ScopInfo/condition-after-error-block-before-scop.ll b/polly/test/ScopInfo/condition-after-error-block-before-scop.ll index d5069da916fa1..d86b48ed24963 100644 --- a/polly/test/ScopInfo/condition-after-error-block-before-scop.ll +++ b/polly/test/ScopInfo/condition-after-error-block-before-scop.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s 2>&1 | FileCheck %s target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" diff --git a/polly/test/ScopInfo/condtion-after-error-block.ll b/polly/test/ScopInfo/condtion-after-error-block.ll index d9de4fc40a208..8ad98b4a4a78e 100644 --- a/polly/test/ScopInfo/condtion-after-error-block.ll +++ b/polly/test/ScopInfo/condtion-after-error-block.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; Verify that we allow scops containing uniform branch conditions, where all ; but one incoming block comes from an error condition. 
diff --git a/polly/test/ScopInfo/const_srem_sdiv.ll b/polly/test/ScopInfo/const_srem_sdiv.ll index b4c2f119fe053..b50c4bd910dda 100644 --- a/polly/test/ScopInfo/const_srem_sdiv.ll +++ b/polly/test/ScopInfo/const_srem_sdiv.ll @@ -1,5 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -disable-output \ -; RUN: -polly-invariant-load-hoisting=true < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<scops>' -polly-print-scops -disable-output -polly-invariant-load-hoisting=true < %s 2>&1 | FileCheck %s ; ; See http://research.microsoft.com/pubs/151917/divmodnote-letter.pdf ; diff --git a/polly/test/ScopInfo/constant-non-integer-branch-condition.ll b/polly/test/ScopInfo/constant-non-integer-branch-condition.ll index 86dd94e3371b2..f09f82f32c93a 100644 --- a/polly/test/ScopInfo/constant-non-integer-branch-condition.ll +++ b/polly/test/ScopInfo/constant-non-integer-branch-condition.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; ; At some point this caused a problem in the domain generation as we ; assumed any constant branch condition to be valid. However, only constant diff --git a/polly/test/ScopInfo/constant_factor_in_parameter.ll b/polly/test/ScopInfo/constant_factor_in_parameter.ll index b58d413e074e7..26c73bd72271b 100644 --- a/polly/test/ScopInfo/constant_factor_in_parameter.ll +++ b/polly/test/ScopInfo/constant_factor_in_parameter.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly -disable-output '-passes=print<polly-function-scops>' < %s 2>&1 | FileCheck %s -; RUN: opt %loadNPMPolly -disable-output '-passes=print<polly-function-scops>' < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -disable-output '-passes=polly-custom<scops>' -polly-print-scops < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -disable-output '-passes=polly-custom<scops>' -polly-print-scops < %s 2>&1 | FileCheck %s ; ; Check that the constant part of the N * M * 4 expression is not part of the ; parameter but explicit in the access function. 
This can avoid existentially diff --git a/polly/test/ScopInfo/constant_functions_outside_scop_as_unknown.ll b/polly/test/ScopInfo/constant_functions_outside_scop_as_unknown.ll index 62e6cd4641de1..762132f9edd78 100644 --- a/polly/test/ScopInfo/constant_functions_outside_scop_as_unknown.ll +++ b/polly/test/ScopInfo/constant_functions_outside_scop_as_unknown.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-process-unprofitable '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-process-unprofitable '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64" diff --git a/polly/test/ScopInfo/constant_start_integer.ll b/polly/test/ScopInfo/constant_start_integer.ll index 8991f8250f0b7..6d17288b28227 100644 --- a/polly/test/ScopInfo/constant_start_integer.ll +++ b/polly/test/ScopInfo/constant_start_integer.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" ; void foo(float *input) { diff --git a/polly/test/ScopInfo/debug_call.ll b/polly/test/ScopInfo/debug_call.ll index a6761ecebe6a7..63c1baca5accc 100644 --- a/polly/test/ScopInfo/debug_call.ll +++ b/polly/test/ScopInfo/debug_call.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-debug-func=dbg_printf '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly -polly-debug-func=dbg_printf '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s -match-full-lines ; ; Check that the call to dbg_printf is accepted as a debug-function. 
; diff --git a/polly/test/ScopInfo/delinearize-together-all-data-refs.ll b/polly/test/ScopInfo/delinearize-together-all-data-refs.ll index 676c8a27e5749..7126fb95cd00c 100644 --- a/polly/test/ScopInfo/delinearize-together-all-data-refs.ll +++ b/polly/test/ScopInfo/delinearize-together-all-data-refs.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; void foo(long n, long m, long o, double A[n][m][o]) { ; for (long i = 0; i < n-3; i++) diff --git a/polly/test/ScopInfo/div_by_zero.ll b/polly/test/ScopInfo/div_by_zero.ll index aecd16833b84e..62a13de7ceac0 100644 --- a/polly/test/ScopInfo/div_by_zero.ll +++ b/polly/test/ScopInfo/div_by_zero.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-detect -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; ; void f(int *A, int N) { ; for (int i = 0; i < N; i++) diff --git a/polly/test/ScopInfo/do-not-model-error-block-accesses.ll b/polly/test/ScopInfo/do-not-model-error-block-accesses.ll index a3ca59563ab1f..333175b417ade 100644 --- a/polly/test/ScopInfo/do-not-model-error-block-accesses.ll +++ b/polly/test/ScopInfo/do-not-model-error-block-accesses.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s ; Check that we do not crash on this input. Earlier this indeed crashed as ; we tried to model the access functions in an error block. diff --git a/polly/test/ScopInfo/eager-binary-and-or-conditions.ll b/polly/test/ScopInfo/eager-binary-and-or-conditions.ll index a988b3f8c2b01..b111851939d06 100644 --- a/polly/test/ScopInfo/eager-binary-and-or-conditions.ll +++ b/polly/test/ScopInfo/eager-binary-and-or-conditions.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output< %s 2>&1 | FileCheck %s -; RUN: opt %loadNPMPolly -passes=polly-codegen -disable-output < %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -disable-output < %s ; ; void or(float *A, long n, long m) { ; for (long i = 0; i < 100; i++) { diff --git a/polly/test/ScopInfo/early_exit_for_complex_domains.ll b/polly/test/ScopInfo/early_exit_for_complex_domains.ll index 9a1edcbfb7796..3ee6ff7889c84 100644 --- a/polly/test/ScopInfo/early_exit_for_complex_domains.ll +++ b/polly/test/ScopInfo/early_exit_for_complex_domains.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s ; ; Check we do not crash. 
; diff --git a/polly/test/ScopInfo/error-blocks-1.ll b/polly/test/ScopInfo/error-blocks-1.ll index 047b095a95947..902ea15752980 100644 --- a/polly/test/ScopInfo/error-blocks-1.ll +++ b/polly/test/ScopInfo/error-blocks-1.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; ; CHECK: Context: ; CHECK-NEXT: [N] -> { : -2147483648 <= N <= 2147483647 } diff --git a/polly/test/ScopInfo/error-blocks-2.ll b/polly/test/ScopInfo/error-blocks-2.ll index 6fa12947540c0..613b00a1a9ba7 100644 --- a/polly/test/ScopInfo/error-blocks-2.ll +++ b/polly/test/ScopInfo/error-blocks-2.ll @@ -1,5 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output \ -; RUN: -polly-invariant-load-hoisting=true < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output -polly-invariant-load-hoisting=true < %s 2>&1 | FileCheck %s ; ; CHECK: Invariant Accesses: { ; CHECK-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0] diff --git a/polly/test/ScopInfo/error-blocks-3.ll b/polly/test/ScopInfo/error-blocks-3.ll index e7643601356db..9521037888075 100644 --- a/polly/test/ScopInfo/error-blocks-3.ll +++ b/polly/test/ScopInfo/error-blocks-3.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadPolly -polly-print-scops -polly-detect-keep-going -polly-allow-nonaffine -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-detect-keep-going -polly-allow-nonaffine -disable-output < %s | FileCheck %s ; ; The instruction ; diff --git a/polly/test/ScopInfo/escaping_empty_scop.ll b/polly/test/ScopInfo/escaping_empty_scop.ll index 2efaef3fb99b8..d47b2865b4ee0 100644 --- a/polly/test/ScopInfo/escaping_empty_scop.ll +++ b/polly/test/ScopInfo/escaping_empty_scop.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; ; void g(); ; int f(int *A) { diff --git a/polly/test/ScopInfo/exit-phi-1.ll b/polly/test/ScopInfo/exit-phi-1.ll index cbd6c280e8caa..21f13cf4f4e4d 100644 --- a/polly/test/ScopInfo/exit-phi-1.ll +++ b/polly/test/ScopInfo/exit-phi-1.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -passes=polly-codegen -S < %s 2>&1 | FileCheck %s --check-prefix=CODEGEN +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly<no-default-opts>' -S < %s 2>&1 | FileCheck %s --check-prefix=CODEGEN ; ; Check for correct code generation of exit PHIs, even if the same PHI value ; is used again inside the the SCoP. 
diff --git a/polly/test/ScopInfo/exit-phi-2.ll b/polly/test/ScopInfo/exit-phi-2.ll index 695c617b14c1f..b8da9ab5b64f9 100644 --- a/polly/test/ScopInfo/exit-phi-2.ll +++ b/polly/test/ScopInfo/exit-phi-2.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; ; Check that there is no MK_ExitPHI READ access. ; diff --git a/polly/test/ScopInfo/exit_phi_accesses-2.ll b/polly/test/ScopInfo/exit_phi_accesses-2.ll index b3b7cb1c65993..928b564c7cef5 100644 --- a/polly/test/ScopInfo/exit_phi_accesses-2.ll +++ b/polly/test/ScopInfo/exit_phi_accesses-2.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; CHECK-LABEL: Function: foo ; diff --git a/polly/test/ScopInfo/exit_phi_accesses.ll b/polly/test/ScopInfo/exit_phi_accesses.ll index 77b038ec8e4af..a54ca4a185ae2 100644 --- a/polly/test/ScopInfo/exit_phi_accesses.ll +++ b/polly/test/ScopInfo/exit_phi_accesses.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; Check that PHI nodes only create PHI access and nothing else (e.g. unnecessary ; SCALAR accesses). In this case, for a PHI in the exit node, hence there is no diff --git a/polly/test/ScopInfo/expensive-boundary-context.ll b/polly/test/ScopInfo/expensive-boundary-context.ll index 95212f83acdca..c0d2dcd16289d 100644 --- a/polly/test/ScopInfo/expensive-boundary-context.ll +++ b/polly/test/ScopInfo/expensive-boundary-context.ll @@ -1,5 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output \ -; RUN: < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-detect -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; CHECK-NOT: Assumed Context: target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" diff --git a/polly/test/ScopInfo/extract_constant_factor_introduces_new_parameter.ll b/polly/test/ScopInfo/extract_constant_factor_introduces_new_parameter.ll index 5e833e7ae0f4f..2f446b630168a 100644 --- a/polly/test/ScopInfo/extract_constant_factor_introduces_new_parameter.ll +++ b/polly/test/ScopInfo/extract_constant_factor_introduces_new_parameter.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s ; CHECK: Valid Region for Scop: bb10 => bb16 diff --git a/polly/test/ScopInfo/full-function.ll b/polly/test/ScopInfo/full-function.ll index 596c3d0af66a9..20cb137181697 100644 --- a/polly/test/ScopInfo/full-function.ll +++ b/polly/test/ScopInfo/full-function.ll @@ -1,7 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output -polly-detect-full-functions < %s 2>&1 
\ -; RUN: | FileCheck %s -check-prefix=FULL -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 \ -; RUN: | FileCheck %s -check-prefix=WITHOUT-FULL +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output -polly-detect-full-functions < %s 2>&1 | FileCheck %s -check-prefix=FULL +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s -check-prefix=WITHOUT-FULL ; FULL: Region: %bb---FunctionExit ; FULL: Statements { diff --git a/polly/test/ScopInfo/granularity_same_name.ll b/polly/test/ScopInfo/granularity_same_name.ll index 17f75fbf8a979..638b09879ce39 100644 --- a/polly/test/ScopInfo/granularity_same_name.ll +++ b/polly/test/ScopInfo/granularity_same_name.ll @@ -1,7 +1,7 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-use-llvm-names=0 '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s -match-full-lines -check-prefix=IDX -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-use-llvm-names=1 '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s -match-full-lines -check-prefix=BB -; RUN: opt %loadNPMPolly -polly-stmt-granularity=scalar-indep -polly-use-llvm-names=0 '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s -match-full-lines -check-prefix=IDX -; RUN: opt %loadNPMPolly -polly-stmt-granularity=scalar-indep -polly-use-llvm-names=1 '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s -match-full-lines -check-prefix=BB +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-use-llvm-names=0 '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s -match-full-lines -check-prefix=IDX +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-use-llvm-names=1 '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s -match-full-lines -check-prefix=BB +; RUN: opt %loadNPMPolly -polly-stmt-granularity=scalar-indep -polly-use-llvm-names=0 '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s -match-full-lines -check-prefix=IDX +; RUN: opt %loadNPMPolly -polly-stmt-granularity=scalar-indep -polly-use-llvm-names=1 '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s -match-full-lines -check-prefix=BB ; ; Check that the statement has the same name, regardless of how the ; basic block is split into multiple statements. diff --git a/polly/test/ScopInfo/granularity_scalar-indep.ll b/polly/test/ScopInfo/granularity_scalar-indep.ll index 5c4484f9d4579..f4d864d2c6543 100644 --- a/polly/test/ScopInfo/granularity_scalar-indep.ll +++ b/polly/test/ScopInfo/granularity_scalar-indep.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=scalar-indep -polly-print-instructions '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly -polly-stmt-granularity=scalar-indep -polly-print-instructions '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s -match-full-lines ; ; Split a block into two independent statements that share no scalar. 
; This case has the instructions of the two statements interleaved, such that diff --git a/polly/test/ScopInfo/granularity_scalar-indep_cross-referencing-phi1.ll b/polly/test/ScopInfo/granularity_scalar-indep_cross-referencing-phi1.ll index 7ae0d961b38fb..f2c37f6293d62 100644 --- a/polly/test/ScopInfo/granularity_scalar-indep_cross-referencing-phi1.ll +++ b/polly/test/ScopInfo/granularity_scalar-indep_cross-referencing-phi1.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=scalar-indep -polly-print-instructions '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly -polly-stmt-granularity=scalar-indep -polly-print-instructions '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s -match-full-lines ; ; Two PHIs, cross-referencing each other. The PHI READs must be carried-out ; before the PHI WRITEs to ensure that the value when entering the block is diff --git a/polly/test/ScopInfo/granularity_scalar-indep_cross-referencing-phi2.ll b/polly/test/ScopInfo/granularity_scalar-indep_cross-referencing-phi2.ll index 7839e51c163ae..f7bd882da96e2 100644 --- a/polly/test/ScopInfo/granularity_scalar-indep_cross-referencing-phi2.ll +++ b/polly/test/ScopInfo/granularity_scalar-indep_cross-referencing-phi2.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=scalar-indep -polly-print-instructions '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly -polly-stmt-granularity=scalar-indep -polly-print-instructions '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s -match-full-lines ; ; Two PHIs, cross-referencing each other. The PHI READs must be carried-out ; before the PHI WRITEs to ensure that the value when entering the block is diff --git a/polly/test/ScopInfo/granularity_scalar-indep_epilogue.ll b/polly/test/ScopInfo/granularity_scalar-indep_epilogue.ll index 8643e85e05593..80aa9fb6deb7c 100644 --- a/polly/test/ScopInfo/granularity_scalar-indep_epilogue.ll +++ b/polly/test/ScopInfo/granularity_scalar-indep_epilogue.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=scalar-indep -polly-print-instructions '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly -polly-stmt-granularity=scalar-indep -polly-print-instructions '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s -match-full-lines ; ; Split a block into two independent statements that share no scalar. ; This case has an independent statement just for PHI writes. 
diff --git a/polly/test/ScopInfo/granularity_scalar-indep_epilogue_last.ll b/polly/test/ScopInfo/granularity_scalar-indep_epilogue_last.ll index bc71cbe45cd98..66ef9fa9429e9 100644 --- a/polly/test/ScopInfo/granularity_scalar-indep_epilogue_last.ll +++ b/polly/test/ScopInfo/granularity_scalar-indep_epilogue_last.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=scalar-indep -polly-print-instructions '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly -polly-stmt-granularity=scalar-indep -polly-print-instructions '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s -match-full-lines ; ; Check that the PHI Write of value that is defined in the same basic ; block is in the statement where it is defined. diff --git a/polly/test/ScopInfo/granularity_scalar-indep_noepilogue.ll b/polly/test/ScopInfo/granularity_scalar-indep_noepilogue.ll index f3864bac519b9..3837219e5d818 100644 --- a/polly/test/ScopInfo/granularity_scalar-indep_noepilogue.ll +++ b/polly/test/ScopInfo/granularity_scalar-indep_noepilogue.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=scalar-indep -polly-print-instructions '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly -polly-stmt-granularity=scalar-indep -polly-print-instructions '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s -match-full-lines ; ; This case has no explicit epilogue for PHI writes because it would ; have a scalar dependency to the previous statement. diff --git a/polly/test/ScopInfo/granularity_scalar-indep_ordered-2.ll b/polly/test/ScopInfo/granularity_scalar-indep_ordered-2.ll index 43101a8a0abfc..c43ad76d079d8 100644 --- a/polly/test/ScopInfo/granularity_scalar-indep_ordered-2.ll +++ b/polly/test/ScopInfo/granularity_scalar-indep_ordered-2.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=scalar-indep -polly-print-instructions '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly -polly-stmt-granularity=scalar-indep -polly-print-instructions '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s -match-full-lines ; ; This case should be split into two statements because {X[0], Y[0]} ; and {A[0], B[0]} do not intersect. diff --git a/polly/test/ScopInfo/granularity_scalar-indep_ordered.ll b/polly/test/ScopInfo/granularity_scalar-indep_ordered.ll index 4974f7e9b28ca..cfa7739d743f7 100644 --- a/polly/test/ScopInfo/granularity_scalar-indep_ordered.ll +++ b/polly/test/ScopInfo/granularity_scalar-indep_ordered.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=scalar-indep -polly-print-instructions '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly -polly-stmt-granularity=scalar-indep -polly-print-instructions '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s -match-full-lines ; ; This case cannot be split into two statements because the order of ; loads and store would be violated. 
diff --git a/polly/test/ScopInfo/i1_params.ll b/polly/test/ScopInfo/i1_params.ll index be3e287372017..cf5b533c02682 100644 --- a/polly/test/ScopInfo/i1_params.ll +++ b/polly/test/ScopInfo/i1_params.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; ; Check that both a signed as well as an unsigned extended i1 parameter ; is represented correctly. diff --git a/polly/test/ScopInfo/infeasible-rtc.ll b/polly/test/ScopInfo/infeasible-rtc.ll index 7a0bfe0fa4d84..9221ddf5fc910 100644 --- a/polly/test/ScopInfo/infeasible-rtc.ll +++ b/polly/test/ScopInfo/infeasible-rtc.ll @@ -1,8 +1,6 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 \ -; RUN: | FileCheck %s -check-prefix=DETECT +; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s -check-prefix=DETECT -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 \ -; RUN: | FileCheck %s -check-prefix=SCOPS +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s -check-prefix=SCOPS target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" diff --git a/polly/test/ScopInfo/infeasible_invalid_context.ll b/polly/test/ScopInfo/infeasible_invalid_context.ll index 006901ab05b79..7ab6477460721 100644 --- a/polly/test/ScopInfo/infeasible_invalid_context.ll +++ b/polly/test/ScopInfo/infeasible_invalid_context.ll @@ -1,8 +1,6 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 \ -; RUN: | FileCheck %s -check-prefix=DETECT +; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s -check-prefix=DETECT -; RUN: opt %loadNPMPolly '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 \ -; RUN: | FileCheck %s -check-prefix=SCOPS +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-detect -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s -check-prefix=SCOPS ; DETECT: Valid Region for Scop: if.end116 => for.inc216 ; SCOPS-NOT: Statements diff --git a/polly/test/ScopInfo/int2ptr_ptr2int.ll b/polly/test/ScopInfo/int2ptr_ptr2int.ll index 578015aeecdc5..adefe794561c2 100644 --- a/polly/test/ScopInfo/int2ptr_ptr2int.ll +++ b/polly/test/ScopInfo/int2ptr_ptr2int.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s -; RUN: opt %loadNPMPolly -S -passes=polly-codegen < %s 2>&1 | FileCheck %s --check-prefix=IR +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -S '-passes=polly<no-default-opts>' < %s 2>&1 | FileCheck %s --check-prefix=IR ; ; void f(long *A, long *ptr, long val) { ; for (long i = 0; i < 100; i++) { diff --git a/polly/test/ScopInfo/int2ptr_ptr2int_2.ll b/polly/test/ScopInfo/int2ptr_ptr2int_2.ll index 627524c0327dd..a88fcdc0f9b12 100644 --- a/polly/test/ScopInfo/int2ptr_ptr2int_2.ll +++ b/polly/test/ScopInfo/int2ptr_ptr2int_2.ll @@ -1,7 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' \ -; RUN: -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | 
FileCheck %s -; RUN: opt %loadNPMPolly -S -passes=polly-codegen \ -; RUN: -polly-invariant-load-hoisting=true < %s 2>&1 | FileCheck %s --check-prefix=IR +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -S '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=true < %s 2>&1 | FileCheck %s --check-prefix=IR ; ; void f(long *A, long *B, long *ptr, long val) { ; for (long i = 0; i < 100; i++) { diff --git a/polly/test/ScopInfo/integers.ll b/polly/test/ScopInfo/integers.ll index 4f6d1117e2bcc..5f89243be0e3b 100644 --- a/polly/test/ScopInfo/integers.ll +++ b/polly/test/ScopInfo/integers.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; Check that we correctly convert integers to isl values. diff --git a/polly/test/ScopInfo/inter-error-bb-dependence.ll b/polly/test/ScopInfo/inter-error-bb-dependence.ll index 761fcbbe3435e..0829f34be9791 100644 --- a/polly/test/ScopInfo/inter-error-bb-dependence.ll +++ b/polly/test/ScopInfo/inter-error-bb-dependence.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -pass-remarks-analysis="polly-scops" '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 > /dev/null | FileCheck %s +; RUN: opt %loadNPMPolly -pass-remarks-analysis=polly-scops '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 > /dev/null | FileCheck %s ; ; Error statements (%bb33) do not require their uses to be verified. ; In this case it uses %tmp32 from %bb31 which is not available because diff --git a/polly/test/ScopInfo/inter_bb_scalar_dep.ll b/polly/test/ScopInfo/inter_bb_scalar_dep.ll index 7313618b082bc..f6406640dd2d8 100644 --- a/polly/test/ScopInfo/inter_bb_scalar_dep.ll +++ b/polly/test/ScopInfo/inter_bb_scalar_dep.ll @@ -1,5 +1,4 @@ -; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-function-scops>' \ -; RUN: -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s ; void f(long A[], int N, int *init_ptr) { ; long i, j; diff --git a/polly/test/ScopInfo/intra-non-affine-stmt-phi-node.ll b/polly/test/ScopInfo/intra-non-affine-stmt-phi-node.ll index d2ed3c17fe9dd..3150204cd9549 100644 --- a/polly/test/ScopInfo/intra-non-affine-stmt-phi-node.ll +++ b/polly/test/ScopInfo/intra-non-affine-stmt-phi-node.ll @@ -1,5 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output \ -; RUN: < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; CHECK: Statements { ; CHECK-NEXT: Stmt_loop__TO__backedge diff --git a/polly/test/ScopInfo/intra_and_inter_bb_scalar_dep.ll b/polly/test/ScopInfo/intra_and_inter_bb_scalar_dep.ll index b3286cd2a7240..b0b63658caa55 100644 --- a/polly/test/ScopInfo/intra_and_inter_bb_scalar_dep.ll +++ b/polly/test/ScopInfo/intra_and_inter_bb_scalar_dep.ll @@ -1,5 +1,4 @@ -; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-function-scops>' -disable-output \ -; RUN: -polly-invariant-load-hoisting=true < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa 
'-passes=polly-custom<scops>' -polly-print-scops -disable-output -polly-invariant-load-hoisting=true < %s 2>&1 | FileCheck %s ; void f(long A[], int N, int *init_ptr) { ; long i, j; diff --git a/polly/test/ScopInfo/intra_bb_scalar_dep.ll b/polly/test/ScopInfo/intra_bb_scalar_dep.ll index 86855e7499a51..0ef6b2d35106b 100644 --- a/polly/test/ScopInfo/intra_bb_scalar_dep.ll +++ b/polly/test/ScopInfo/intra_bb_scalar_dep.ll @@ -1,5 +1,4 @@ -; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-function-scops>' -disable-output \ -; RUN: -polly-invariant-load-hoisting=true < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=polly-custom<scops>' -polly-print-scops -disable-output -polly-invariant-load-hoisting=true < %s 2>&1 | FileCheck %s ; void f(long A[], int N, int *init_ptr) { ; long i, j; diff --git a/polly/test/ScopInfo/intrinsics.ll b/polly/test/ScopInfo/intrinsics.ll index e6d9e733e35bf..e17d06f753a21 100644 --- a/polly/test/ScopInfo/intrinsics.ll +++ b/polly/test/ScopInfo/intrinsics.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-print-instructions -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-print-instructions -disable-output < %s 2>&1 | FileCheck %s ; ; Verify that we remove the ignored intrinsics from the instruction list. ; diff --git a/polly/test/ScopInfo/invalid_add_rec_after_invariant_load_remapping.ll b/polly/test/ScopInfo/invalid_add_rec_after_invariant_load_remapping.ll index 723942668d8c2..d3439d8d33662 100644 --- a/polly/test/ScopInfo/invalid_add_rec_after_invariant_load_remapping.ll +++ b/polly/test/ScopInfo/invalid_add_rec_after_invariant_load_remapping.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s ; ; This crashed at some point as we place %1 and %4 in the same equivalence class ; for invariant loads and when we remap SCEVs to use %4 instead of %1 AddRec SCEVs diff --git a/polly/test/ScopInfo/invalidate_iterator_during_MA_removal.ll b/polly/test/ScopInfo/invalidate_iterator_during_MA_removal.ll index c493c22af32d9..ff5b0f601d03f 100644 --- a/polly/test/ScopInfo/invalidate_iterator_during_MA_removal.ll +++ b/polly/test/ScopInfo/invalidate_iterator_during_MA_removal.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s ; ; Check that no invalidated iterator is accessed while elements from ; the list of MemoryAccesses are removed. diff --git a/polly/test/ScopInfo/invariant-load-instlist.ll b/polly/test/ScopInfo/invariant-load-instlist.ll index ecb80e4054c35..1ec36e6d9d1b9 100644 --- a/polly/test/ScopInfo/invariant-load-instlist.ll +++ b/polly/test/ScopInfo/invariant-load-instlist.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s ; The load is a required invariant load and at the same time used in a store. ; Polly used to add two MemoryAccesses for it which caused an assertion to fail. 
diff --git a/polly/test/ScopInfo/invariant-loads-leave-read-only-statements.ll b/polly/test/ScopInfo/invariant-loads-leave-read-only-statements.ll index 89eac6ce69a11..2d14287d4df44 100644 --- a/polly/test/ScopInfo/invariant-loads-leave-read-only-statements.ll +++ b/polly/test/ScopInfo/invariant-loads-leave-read-only-statements.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s -; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true -disable-output < %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=true -disable-output < %s ; CHECK: Statements { ; CHECK-NEXT: Stmt_L_4 diff --git a/polly/test/ScopInfo/invariant_load.ll b/polly/test/ScopInfo/invariant_load.ll index 9dc064276c40f..8974b7f7fb8cb 100644 --- a/polly/test/ScopInfo/invariant_load.ll +++ b/polly/test/ScopInfo/invariant_load.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s ; ; CHECK: Invariant Accesses: ; CHECK-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0] diff --git a/polly/test/ScopInfo/invariant_load_access_classes_different_base_type.ll b/polly/test/ScopInfo/invariant_load_access_classes_different_base_type.ll index 40aa3098683b3..7b5a7591813a6 100644 --- a/polly/test/ScopInfo/invariant_load_access_classes_different_base_type.ll +++ b/polly/test/ScopInfo/invariant_load_access_classes_different_base_type.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s -; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true -S < %s 2>&1 | FileCheck %s --check-prefix=CODEGEN +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=true -S < %s 2>&1 | FileCheck %s --check-prefix=CODEGEN ; ; struct { ; int a; diff --git a/polly/test/ScopInfo/invariant_load_access_classes_different_base_type_escaping.ll b/polly/test/ScopInfo/invariant_load_access_classes_different_base_type_escaping.ll index 287676024079c..0c2f57dfcb1c3 100644 --- a/polly/test/ScopInfo/invariant_load_access_classes_different_base_type_escaping.ll +++ b/polly/test/ScopInfo/invariant_load_access_classes_different_base_type_escaping.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s -; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true -S < %s 2>&1 | FileCheck %s --check-prefix=CODEGEN +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=true -S < %s 2>&1 | FileCheck %s --check-prefix=CODEGEN ; ; struct { ; int a; diff --git 
a/polly/test/ScopInfo/invariant_load_access_classes_different_base_type_same_pointer.ll b/polly/test/ScopInfo/invariant_load_access_classes_different_base_type_same_pointer.ll index cb745b4920b82..865bd789db6fb 100644 --- a/polly/test/ScopInfo/invariant_load_access_classes_different_base_type_same_pointer.ll +++ b/polly/test/ScopInfo/invariant_load_access_classes_different_base_type_same_pointer.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s -; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true -S < %s 2>&1 | FileCheck %s --check-prefix=CODEGEN +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=true -S < %s 2>&1 | FileCheck %s --check-prefix=CODEGEN ; ; int U; ; void f(int *A) { diff --git a/polly/test/ScopInfo/invariant_load_access_classes_different_base_type_same_pointer_escaping.ll b/polly/test/ScopInfo/invariant_load_access_classes_different_base_type_same_pointer_escaping.ll index fa5429d4803a8..f63fe9cc1f7c6 100644 --- a/polly/test/ScopInfo/invariant_load_access_classes_different_base_type_same_pointer_escaping.ll +++ b/polly/test/ScopInfo/invariant_load_access_classes_different_base_type_same_pointer_escaping.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s -; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true -S < %s 2>&1 | FileCheck %s --check-prefix=CODEGEN +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=true -S < %s 2>&1 | FileCheck %s --check-prefix=CODEGEN ; ; int U; ; int f(int *A) { diff --git a/polly/test/ScopInfo/invariant_load_addrec_sum.ll b/polly/test/ScopInfo/invariant_load_addrec_sum.ll index 2e639f7d5e331..e70aa80ae6009 100644 --- a/polly/test/ScopInfo/invariant_load_addrec_sum.ll +++ b/polly/test/ScopInfo/invariant_load_addrec_sum.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -polly-ignore-aliasing -polly-process-unprofitable -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -polly-ignore-aliasing -polly-process-unprofitable -disable-output < %s 2>&1 | FileCheck %s ; ; CHECK: Region: %entry.split---%if.end ; CHECK: Invariant Accesses: { diff --git a/polly/test/ScopInfo/invariant_load_base_pointer.ll b/polly/test/ScopInfo/invariant_load_base_pointer.ll index f2539af97a0b7..1176d1ca9db85 100644 --- a/polly/test/ScopInfo/invariant_load_base_pointer.ll +++ b/polly/test/ScopInfo/invariant_load_base_pointer.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -polly-ignore-aliasing -polly-process-unprofitable -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -polly-ignore-aliasing -polly-process-unprofitable -disable-output < %s 2>&1 | FileCheck %s ; ; CHECK: Invariant Accesses: 
; CHECK-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0] diff --git a/polly/test/ScopInfo/invariant_load_base_pointer_conditional.ll b/polly/test/ScopInfo/invariant_load_base_pointer_conditional.ll index f854b1f48ea92..81fd3b9559f43 100644 --- a/polly/test/ScopInfo/invariant_load_base_pointer_conditional.ll +++ b/polly/test/ScopInfo/invariant_load_base_pointer_conditional.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -polly-ignore-aliasing -polly-process-unprofitable -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -polly-ignore-aliasing -polly-process-unprofitable -disable-output < %s 2>&1 | FileCheck %s ; ; CHECK: Invariant Accesses: ; CHECK-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0] diff --git a/polly/test/ScopInfo/invariant_load_base_pointer_in_conditional.ll b/polly/test/ScopInfo/invariant_load_base_pointer_in_conditional.ll index 5a9c5c6cabbe6..7313176aceed7 100644 --- a/polly/test/ScopInfo/invariant_load_base_pointer_in_conditional.ll +++ b/polly/test/ScopInfo/invariant_load_base_pointer_in_conditional.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -polly-ignore-aliasing -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -polly-ignore-aliasing -disable-output < %s 2>&1 | FileCheck %s ; ; CHECK: Invariant Accesses: ; CHECK-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0] diff --git a/polly/test/ScopInfo/invariant_load_branch_condition.ll b/polly/test/ScopInfo/invariant_load_branch_condition.ll index d12750c30ba98..f6cadffe311e8 100644 --- a/polly/test/ScopInfo/invariant_load_branch_condition.ll +++ b/polly/test/ScopInfo/invariant_load_branch_condition.ll @@ -1,5 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output \ -; RUN: -polly-invariant-load-hoisting < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output -polly-invariant-load-hoisting < %s 2>&1 | FileCheck %s ; CHECK: Invariant Accesses: { ; CHECK-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0] diff --git a/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs.ll b/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs.ll index 34d50a18663c4..76cc55767caca 100644 --- a/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs.ll +++ b/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs.ll @@ -1,6 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 \ -; RUN: -polly-invariant-load-hoisting \ -; RUN: | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output -polly-invariant-load-hoisting < %s 2>&1 | FileCheck %s ; CHECK: Stmt_body1 ; CHECK-NEXT: Domain := diff --git a/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs_2.ll b/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs_2.ll index 51f3cf6c095ac..9cc9391b6bc25 100644 --- a/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs_2.ll +++ b/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs_2.ll @@ -1,6 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 \ -; RUN: -polly-invariant-load-hoisting \ -; 
RUN: | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output -polly-invariant-load-hoisting < %s 2>&1 | FileCheck %s ; Make sure we choose a canonical element that is not the first invariant load, ; but the first that is an array base pointer. diff --git a/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs_3.ll b/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs_3.ll index 3a742bbccdf19..7f609f9a54689 100644 --- a/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs_3.ll +++ b/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs_3.ll @@ -1,6 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 \ -; RUN: -polly-invariant-load-hoisting \ -; RUN: | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output -polly-invariant-load-hoisting < %s 2>&1 | FileCheck %s ; Verify that we canonicalize accesses even tough one of the accesses (even ; the canonical base) has a partial execution context. This is correct as diff --git a/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs_4.ll b/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs_4.ll index 6bd8b3146e871..216e0760987cd 100644 --- a/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs_4.ll +++ b/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs_4.ll @@ -1,6 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 \ -; RUN: -polly-invariant-load-hoisting \ -; RUN: | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output -polly-invariant-load-hoisting < %s 2>&1 | FileCheck %s ; Verify that a delinearized and a not delinearized access are not ; canonicalized. diff --git a/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs_4b.ll b/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs_4b.ll index cb7e5646fc2b0..5da3d0ceb2d0f 100644 --- a/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs_4b.ll +++ b/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs_4b.ll @@ -1,6 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 \ -; RUN: -polly-invariant-load-hoisting \ -; RUN: | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output -polly-invariant-load-hoisting < %s 2>&1 | FileCheck %s ; Verify that two arrays delinearized with different sizes are not coalesced. diff --git a/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs_4c.ll b/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs_4c.ll index 6f7fbacc089cb..b71a092a2d468 100644 --- a/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs_4c.ll +++ b/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs_4c.ll @@ -1,6 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 \ -; RUN: -polly-invariant-load-hoisting \ -; RUN: | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output -polly-invariant-load-hoisting < %s 2>&1 | FileCheck %s ; Verify that arrays with different element types are not coalesced. 
diff --git a/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs_5.ll b/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs_5.ll
index 445832822bdf0..2c4683ea5ce96 100644
--- a/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs_5.ll
+++ b/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs_5.ll
@@ -1,6 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 \
-; RUN: -polly-invariant-load-hoisting \
-; RUN: | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output -polly-invariant-load-hoisting < %s 2>&1 | FileCheck %s
 
 ; Verify that nested arrays with invariant base pointers are handled correctly.
 ; Specifically, we currently do not canonicalize arrays where some accesses are
diff --git a/polly/test/ScopInfo/invariant_load_complex_condition.ll b/polly/test/ScopInfo/invariant_load_complex_condition.ll
index 11e7088d68dbd..e6ea032004a96 100644
--- a/polly/test/ScopInfo/invariant_load_complex_condition.ll
+++ b/polly/test/ScopInfo/invariant_load_complex_condition.ll
@@ -1,5 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -S '-passes=print<polly-function-scops>' -disable-output \
-; RUN: -polly-invariant-load-hoisting=true < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -S '-passes=polly-custom<scops>' -polly-print-scops -disable-output -polly-invariant-load-hoisting=true < %s 2>&1 | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
diff --git a/polly/test/ScopInfo/invariant_load_condition.ll b/polly/test/ScopInfo/invariant_load_condition.ll
index c7d7b3c9ba611..8b1dc8be87c86 100644
--- a/polly/test/ScopInfo/invariant_load_condition.ll
+++ b/polly/test/ScopInfo/invariant_load_condition.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-process-unprofitable '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-process-unprofitable '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; CHECK: Invariant Accesses:
 ; CHECK-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0]
diff --git a/polly/test/ScopInfo/invariant_load_dereferenceable.ll b/polly/test/ScopInfo/invariant_load_dereferenceable.ll
index 526bdc6ddb3bd..fc5527c48c411 100644
--- a/polly/test/ScopInfo/invariant_load_dereferenceable.ll
+++ b/polly/test/ScopInfo/invariant_load_dereferenceable.ll
@@ -1,6 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' '-passes=print<polly-function-scops>' \
-; RUN: -polly-invariant-load-hoisting=true \
-; RUN: -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<detect;scops>' -polly-print-detect -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s
 
 ; CHECK-NOT: Function: foo_undereferanceable
 
diff --git a/polly/test/ScopInfo/invariant_load_distinct_parameter_valuations.ll b/polly/test/ScopInfo/invariant_load_distinct_parameter_valuations.ll
index eb148063320e7..b5525a8e2639e 100644
--- a/polly/test/ScopInfo/invariant_load_distinct_parameter_valuations.ll
+++ b/polly/test/ScopInfo/invariant_load_distinct_parameter_valuations.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; Check that we do not consolidate the invariant loads to smp[order - 1] and
 ; smp[order - 2] in the blocks %0 and %16. While they have the same pointer
diff --git a/polly/test/ScopInfo/invariant_load_in_non_affine.ll b/polly/test/ScopInfo/invariant_load_in_non_affine.ll
index 5261113f5a0cf..69a7932fd3f58 100644
--- a/polly/test/ScopInfo/invariant_load_in_non_affine.ll
+++ b/polly/test/ScopInfo/invariant_load_in_non_affine.ll
@@ -1,5 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output \
-; RUN: -polly-invariant-load-hoisting=true < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output -polly-invariant-load-hoisting=true < %s 2>&1 | FileCheck %s
 ;
 ; CHECK-NOT: Valid Region for Scop
 ;
diff --git a/polly/test/ScopInfo/invariant_load_loop_ub.ll b/polly/test/ScopInfo/invariant_load_loop_ub.ll
index ee889e6c4d5a1..9258d75f6e294 100644
--- a/polly/test/ScopInfo/invariant_load_loop_ub.ll
+++ b/polly/test/ScopInfo/invariant_load_loop_ub.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -polly-process-unprofitable -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -polly-process-unprofitable -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -polly-process-unprofitable -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -polly-process-unprofitable -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; CHECK: Invariant Accesses:
 ; CHECK-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0]
diff --git a/polly/test/ScopInfo/invariant_load_ptr_ptr_noalias.ll b/polly/test/ScopInfo/invariant_load_ptr_ptr_noalias.ll
index 6af7caecc0b37..50b0103b73efb 100644
--- a/polly/test/ScopInfo/invariant_load_ptr_ptr_noalias.ll
+++ b/polly/test/ScopInfo/invariant_load_ptr_ptr_noalias.ll
@@ -1,5 +1,4 @@
-; RUN: opt %loadNPMPolly -aa-pipeline=tbaa '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -polly-ignore-aliasing \
-; RUN: -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=tbaa '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -polly-ignore-aliasing -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; Note: The order of the invariant accesses is important because A is the
 ; base pointer of tmp3 and we will generate code in the same order as
diff --git a/polly/test/ScopInfo/invariant_load_scalar_dep.ll b/polly/test/ScopInfo/invariant_load_scalar_dep.ll
index 319f24bdcb920..ae1423e1e5f05 100644
--- a/polly/test/ScopInfo/invariant_load_scalar_dep.ll
+++ b/polly/test/ScopInfo/invariant_load_scalar_dep.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-process-unprofitable '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-process-unprofitable '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; CHECK: Invariant Accesses:
 ; CHECK-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0]
diff --git a/polly/test/ScopInfo/invariant_load_stmt_domain.ll b/polly/test/ScopInfo/invariant_load_stmt_domain.ll
index 715948062c055..8062d875b1174 100644
--- a/polly/test/ScopInfo/invariant_load_stmt_domain.ll
+++ b/polly/test/ScopInfo/invariant_load_stmt_domain.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s
 
 ; This test case verifies that the statement domain of the invariant access
 ; is the universe. In earlier versions of Polly, we accidentally computed an
diff --git a/polly/test/ScopInfo/invariant_load_zext_parameter-2.ll b/polly/test/ScopInfo/invariant_load_zext_parameter-2.ll
index a6108320d5608..9ee4a54168a68 100644
--- a/polly/test/ScopInfo/invariant_load_zext_parameter-2.ll
+++ b/polly/test/ScopInfo/invariant_load_zext_parameter-2.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly -scalar-evolution-max-value-compare-depth=3 '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly -scalar-evolution-max-value-compare-depth=3 -passes=polly-codegen -polly-invariant-load-hoisting=true -disable-output < %s
+; RUN: opt %loadNPMPolly -scalar-evolution-max-value-compare-depth=3 '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -scalar-evolution-max-value-compare-depth=3 '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=true -disable-output < %s
 ;
 ; Stress test for the code generation of invariant accesses.
 ;
diff --git a/polly/test/ScopInfo/invariant_load_zext_parameter.ll b/polly/test/ScopInfo/invariant_load_zext_parameter.ll
index e3c183aab5e26..5bd2c51d86fa6 100644
--- a/polly/test/ScopInfo/invariant_load_zext_parameter.ll
+++ b/polly/test/ScopInfo/invariant_load_zext_parameter.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true -S < %s 2>&1 | FileCheck %s --check-prefix=CODEGEN
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=true -S < %s 2>&1 | FileCheck %s --check-prefix=CODEGEN
 ;
 ; void f(int *I0, int *I1, int *V) {
 ;   for (int i = 0; i < 1000; i++) {
diff --git a/polly/test/ScopInfo/invariant_load_zextended_in_own_execution_context.ll b/polly/test/ScopInfo/invariant_load_zextended_in_own_execution_context.ll
index b5168e912ed74..426c14c191dd1 100644
--- a/polly/test/ScopInfo/invariant_load_zextended_in_own_execution_context.ll
+++ b/polly/test/ScopInfo/invariant_load_zextended_in_own_execution_context.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true -disable-output < %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=true -disable-output < %s
 ;
 ; CHECK: Execution Context: [p_0_loaded_from_currpc] -> { : }
 ;
diff --git a/polly/test/ScopInfo/invariant_loads_complicated_dependences.ll b/polly/test/ScopInfo/invariant_loads_complicated_dependences.ll
index 85360821078dc..77f74df7d7b21 100644
--- a/polly/test/ScopInfo/invariant_loads_complicated_dependences.ll
+++ b/polly/test/ScopInfo/invariant_loads_complicated_dependences.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; CHECK: Invariant Accesses: {
 ; CHECK-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0]
diff --git a/polly/test/ScopInfo/invariant_loads_cyclic_dependences.ll b/polly/test/ScopInfo/invariant_loads_cyclic_dependences.ll
index 134eac22bff5c..f18534d5bee24 100644
--- a/polly/test/ScopInfo/invariant_loads_cyclic_dependences.ll
+++ b/polly/test/ScopInfo/invariant_loads_cyclic_dependences.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-detect>,print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-detect -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; Negative test. If we assume UB[*V] to be invariant we get a cyclic
 ; dependence in the invariant loads that needs to be resolved by
diff --git a/polly/test/ScopInfo/invariant_loop_bounds.ll b/polly/test/ScopInfo/invariant_loop_bounds.ll
index f22199cfe4942..dcf7f50eb27c4 100644
--- a/polly/test/ScopInfo/invariant_loop_bounds.ll
+++ b/polly/test/ScopInfo/invariant_loop_bounds.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; CHECK: Invariant Accesses: {
 ; CHECK-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0]
diff --git a/polly/test/ScopInfo/invariant_same_loop_bound_multiple_times-1.ll b/polly/test/ScopInfo/invariant_same_loop_bound_multiple_times-1.ll
index e3292b4e4aefa..df5798638ba7c 100644
--- a/polly/test/ScopInfo/invariant_same_loop_bound_multiple_times-1.ll
+++ b/polly/test/ScopInfo/invariant_same_loop_bound_multiple_times-1.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; Verify that we only have one parameter and one invariant load for all
 ; three loads that occur in the region but actually access the same
diff --git a/polly/test/ScopInfo/invariant_same_loop_bound_multiple_times-2.ll b/polly/test/ScopInfo/invariant_same_loop_bound_multiple_times-2.ll
index d69438de5817f..3d8c232c75970 100644
--- a/polly/test/ScopInfo/invariant_same_loop_bound_multiple_times-2.ll
+++ b/polly/test/ScopInfo/invariant_same_loop_bound_multiple_times-2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; Verify that we only have one parameter and one invariant load for all
 ; three loads that occur in the region but actually access the same
diff --git a/polly/test/ScopInfo/isl_aff_out_of_bounds.ll b/polly/test/ScopInfo/isl_aff_out_of_bounds.ll
index 2df96faf76249..965531f20b01d 100644
--- a/polly/test/ScopInfo/isl_aff_out_of_bounds.ll
+++ b/polly/test/ScopInfo/isl_aff_out_of_bounds.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' < %s 2>&1
+; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect < %s 2>&1
 
 ; Used to fail with:
 ; ../../isl/isl_aff.c:591: position out of bounds
diff --git a/polly/test/ScopInfo/isl_trip_count_01.ll b/polly/test/ScopInfo/isl_trip_count_01.ll
index 480b6e9574a66..79621ce64bbcc 100644
--- a/polly/test/ScopInfo/isl_trip_count_01.ll
+++ b/polly/test/ScopInfo/isl_trip_count_01.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; CHECK: [M, N] -> { Stmt_while_body[i0] : i0 > 0 and 4i0 <= -M + N; Stmt_while_body[0] };
 ;
diff --git a/polly/test/ScopInfo/isl_trip_count_02.ll b/polly/test/ScopInfo/isl_trip_count_02.ll
index b78fb838edd0f..3052299277844 100644
--- a/polly/test/ScopInfo/isl_trip_count_02.ll
+++ b/polly/test/ScopInfo/isl_trip_count_02.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-detect -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; TODO: We do not allow unbounded loops at the moment.
 ;
diff --git a/polly/test/ScopInfo/isl_trip_count_03.ll b/polly/test/ScopInfo/isl_trip_count_03.ll
index 96df05f89bcff..52fde263d6898 100644
--- a/polly/test/ScopInfo/isl_trip_count_03.ll
+++ b/polly/test/ScopInfo/isl_trip_count_03.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; Test comes from a bug (15771), or rather a feature request. It was not allowed
 ; in Polly in the old domain generation as ScalarEvolution cannot figure out the
diff --git a/polly/test/ScopInfo/isl_trip_count_multiple_exiting_blocks.ll b/polly/test/ScopInfo/isl_trip_count_multiple_exiting_blocks.ll
index fd310ececaa38..657b8f6dc64e1 100644
--- a/polly/test/ScopInfo/isl_trip_count_multiple_exiting_blocks.ll
+++ b/polly/test/ScopInfo/isl_trip_count_multiple_exiting_blocks.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; The SCoP contains a loop with multiple exit blocks (BBs after leaving
 ; the loop). The current implementation of deriving their domain derives
diff --git a/polly/test/ScopInfo/licm_load.ll b/polly/test/ScopInfo/licm_load.ll
index ade640976d007..8f1cf4fa8fd91 100644
--- a/polly/test/ScopInfo/licm_load.ll
+++ b/polly/test/ScopInfo/licm_load.ll
@@ -1,7 +1,4 @@
-; RUN: opt %loadNPMPolly -passes='loop(loop-rotate,indvars),polly-prepare,print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 \
-; RUN: | FileCheck %s
-; RUN: opt %loadNPMPolly -passes='loop-mssa(loop-rotate,indvars,licm),polly-prepare,print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 \
-; RUN: | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<prepare;scops>' -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; void foo(int n, float A[static const restrict n],
 ;          float B[static const restrict n], int j) {
@@ -14,26 +11,30 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
 define void @foo(i32 %n, ptr noalias nonnull %A, ptr noalias nonnull %B, i32 %j) {
 entry:
   %tmp = sext i32 %n to i64
-  br label %for.cond
+  %cmp1 = icmp slt i64 0, %tmp
+  br i1 %cmp1, label %for.body.lr.ph, label %for.end
 
-for.cond:                                         ; preds = %for.inc, %entry
-  %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc ], [ 0, %entry ]
-  %cmp = icmp slt i64 %indvars.iv, %tmp
-  br i1 %cmp, label %for.body, label %for.end
-
-for.body:                                         ; preds = %for.cond
+for.body.lr.ph:                                   ; preds = %entry
   %idxprom = sext i32 %j to i64
   %arrayidx = getelementptr inbounds float, ptr %B, i64 %idxprom
   %tmp2 = load i32, ptr %arrayidx, align 4
-  %arrayidx2 = getelementptr inbounds float, ptr %A, i64 %indvars.iv
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.lr.ph, %for.inc
+  %indvars.iv2 = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.inc ]
+  %arrayidx2 = getelementptr inbounds float, ptr %A, i64 %indvars.iv2
   store i32 %tmp2, ptr %arrayidx2, align 4
   br label %for.inc
 
 for.inc:                                          ; preds = %for.body
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  br label %for.cond
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv2, 1
+  %exitcond = icmp ne i64 %indvars.iv.next, %tmp
+  br i1 %exitcond, label %for.body, label %for.cond.for.end_crit_edge
+
+for.cond.for.end_crit_edge:                       ; preds = %for.inc
+  br label %for.end
 
-for.end:                                          ; preds = %for.cond
+for.end:                                          ; preds = %for.cond.for.end_crit_edge, %entry
   ret void
 }
diff --git a/polly/test/ScopInfo/licm_potential_store.ll b/polly/test/ScopInfo/licm_potential_store.ll
index 8a36ee84313a2..cbd8e410ed7c8 100644
--- a/polly/test/ScopInfo/licm_potential_store.ll
+++ b/polly/test/ScopInfo/licm_potential_store.ll
@@ -1,10 +1,4 @@
-; RUN: opt %loadNPMPolly -passes='sroa,instcombine,simplifycfg,reassociate,loop(loop-rotate),instcombine,indvars,polly-prepare,print<polly-function-scops>' \
-; RUN: -tailcallopt -disable-output < %s 2>&1 \
-; RUN: | FileCheck %s --check-prefix=NOLICM
-
-; RUN: opt %loadNPMPolly -passes='sroa,instcombine,simplifycfg,reassociate,loop(loop-rotate),instcombine,indvars,loop-mssa(licm),polly-prepare,print<polly-function-scops>' \
-; RUN: -tailcallopt -disable-output < %s 2>&1 \
-; RUN: | FileCheck %s --check-prefix=LICM
+; RUN: opt %loadNPMPolly '-passes=polly-custom<prepare;scops>' -polly-print-scops -tailcallopt -disable-output < %s 2>&1 | FileCheck %s --check-prefix=NOLICM
 
 ; void foo(int n, float A[static const restrict n], float x) {
 ;   // (0)
@@ -17,67 +11,40 @@
 ;   // (4)
 ; }
 
-; LICM: Statements
 ; NOLICM: Statements
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
 define void @foo(i32 %n, ptr noalias nonnull %A, float %x) {
 entry:
-  %n.addr = alloca i32, align 4
-  %A.addr = alloca ptr, align 8
-  %x.addr = alloca float, align 4
-  %i = alloca i32, align 4
-  %j = alloca i32, align 4
-  store i32 %n, ptr %n.addr, align 4
-  store ptr %A, ptr %A.addr, align 8
-  store float %x, ptr %x.addr, align 4
-  %tmp = load i32, ptr %n.addr, align 4
-  %tmp1 = zext i32 %tmp to i64
-  store i32 0, ptr %i, align 4
-  br label %for.cond
-
-for.cond:                                         ; preds = %for.inc.4, %entry
-  %tmp2 = load i32, ptr %i, align 4
-  %cmp = icmp slt i32 %tmp2, 5
-  br i1 %cmp, label %for.body, label %for.end.6
+  %smax = call i32 @llvm.smax.i32(i32 %n, i32 0)
+  %0 = add nuw i32 %smax, 1
+  br label %for.cond.1.preheader
 
-for.body:                                         ; preds = %for.cond
-  store i32 0, ptr %j, align 4
+for.cond.1.preheader:                             ; preds = %entry, %for.end
+  %i.05 = phi i32 [ 0, %entry ], [ %add5, %for.end ]
+  %x.addr.04 = phi float [ %x, %entry ], [ %x.addr.1.lcssa, %for.end ]
   br label %for.cond.1
 
-for.cond.1:                                       ; preds = %for.inc, %for.body
-  %tmp3 = load i32, ptr %j, align 4
-  %tmp4 = load i32, ptr %n.addr, align 4
-  %cmp2 = icmp slt i32 %tmp3, %tmp4
-  br i1 %cmp2, label %for.body.3, label %for.end
-
-for.body.3:                                       ; preds = %for.cond.1
-  store float 7.000000e+00, ptr %x.addr, align 4
-  br label %for.inc
-
-for.inc:                                          ; preds = %for.body.3
-  %tmp5 = load i32, ptr %j, align 4
-  %add = add nsw i32 %tmp5, 1
-  store i32 %add, ptr %j, align 4
-  br label %for.cond.1
+for.cond.1:                                       ; preds = %for.cond.1, %for.cond.1.preheader
+  %x.addr.1 = phi float [ 7.000000e+00, %for.cond.1 ], [ %x.addr.04, %for.cond.1.preheader ]
+  %j.0 = phi i32 [ %add, %for.cond.1 ], [ 0, %for.cond.1.preheader ]
+  %add = add nuw i32 %j.0, 1
+  %exitcond = icmp ne i32 %add, %0
+  br i1 %exitcond, label %for.cond.1, label %for.end
 
 for.end:                                          ; preds = %for.cond.1
-  %tmp6 = load float, ptr %x.addr, align 4
-  %tmp7 = load ptr, ptr %A.addr, align 8
-  store float %tmp6, ptr %tmp7, align 4
-  br label %for.inc.4
-
-for.inc.4:                                        ; preds = %for.end
-  %tmp8 = load i32, ptr %i, align 4
-  %add5 = add nsw i32 %tmp8, 1
-  store i32 %add5, ptr %i, align 4
-  br label %for.cond
+  %x.addr.1.lcssa = phi float [ %x.addr.1, %for.cond.1 ]
+  store float %x.addr.1.lcssa, ptr %A, align 4
+  %add5 = add nuw nsw i32 %i.05, 1
+  %exitcond6 = icmp ne i32 %add5, 5
+  br i1 %exitcond6, label %for.cond.1.preheader, label %for.end.6
 
-for.end.6:                                        ; preds = %for.cond
+for.end.6:                                        ; preds = %for.end
   ret void
 }
 
-; CHECK: Statements {
-; CHECK:     Stmt_for_end
-; CHECK: }
+; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare i32 @llvm.smax.i32(i32, i32) #0
+
+attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
diff --git a/polly/test/ScopInfo/licm_potential_store_mssa.ll b/polly/test/ScopInfo/licm_potential_store_mssa.ll
new file mode 100644
index 0000000000000..ce785d622fcb3
--- /dev/null
+++ b/polly/test/ScopInfo/licm_potential_store_mssa.ll
@@ -0,0 +1,50 @@
+; RUN: opt %loadNPMPolly '-passes=polly-custom<prepare;scops>' -polly-print-scops -tailcallopt -disable-output < %s 2>&1 | FileCheck %s --check-prefix=LICM
+
+; void foo(int n, float A[static const restrict n], float x) {
+;   // (0)
+;   for (int i = 0; i < 5; i += 1) {
+;     for (int j = 0; j < n; j += 1) {
+;       x = 7; // (1)
+;     }
+;     A[0] = x; // (3)
+;   }
+;   // (4)
+; }
+
+; LICM: Statements
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define void @foo(i32 %n, ptr noalias nonnull %A, float %x) {
+entry:
+  %smax = call i32 @llvm.smax.i32(i32 %n, i32 0)
+  br label %for.cond.1.preheader
+
+for.cond.1.preheader:                             ; preds = %for.end, %entry
+  %i.05 = phi i32 [ 0, %entry ], [ %add5, %for.end ]
+  %x.addr.04 = phi float [ %x, %entry ], [ %x.addr.1.lcssa, %for.end ]
+  br label %for.cond.1
+
+for.cond.1:                                       ; preds = %for.cond.1, %for.cond.1.preheader
+  %x.addr.1 = phi float [ 7.000000e+00, %for.cond.1 ], [ %x.addr.04, %for.cond.1.preheader ]
+  %j.0 = phi i32 [ %add, %for.cond.1 ], [ 0, %for.cond.1.preheader ]
+  %add = add nuw i32 %j.0, 1
+  %exitcond.not = icmp eq i32 %j.0, %smax
+  br i1 %exitcond.not, label %for.end, label %for.cond.1
+
+for.end:                                          ; preds = %for.cond.1
+  %x.addr.1.lcssa = phi float [ %x.addr.1, %for.cond.1 ]
+  %add5 = add nuw nsw i32 %i.05, 1
+  %exitcond6.not = icmp eq i32 %add5, 5
+  br i1 %exitcond6.not, label %for.end.6, label %for.cond.1.preheader
+
+for.end.6:                                        ; preds = %for.end
+  %x.addr.1.lcssa.lcssa = phi float [ %x.addr.1.lcssa, %for.end ]
+  store float %x.addr.1.lcssa.lcssa, ptr %A, align 4
+  ret void
+}
+
+; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare i32 @llvm.smax.i32(i32, i32) #0
+
+attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
diff --git a/polly/test/ScopInfo/licm_reduction_nested.ll b/polly/test/ScopInfo/licm_reduction_nested.ll
index c1676033fa909..50625b2ddabde 100644
--- a/polly/test/ScopInfo/licm_reduction_nested.ll
+++ b/polly/test/ScopInfo/licm_reduction_nested.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -loop-rotate -indvars -passes=polly-prepare '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -loop-rotate -indvars -licm -passes=polly-prepare '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -loop-rotate -indvars '-passes=polly-custom<prepare;scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -loop-rotate -indvars -licm '-passes=polly-custom<prepare;scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; XFAIL: *
 ;
diff --git a/polly/test/ScopInfo/long-compile-time-alias-analysis.ll b/polly/test/ScopInfo/long-compile-time-alias-analysis.ll
index f102518da5261..8225bd04fce63 100644
--- a/polly/test/ScopInfo/long-compile-time-alias-analysis.ll
+++ b/polly/test/ScopInfo/long-compile-time-alias-analysis.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s
 
 ; Verify that the compilation of this test case does not take infinite time.
 ; At some point Polly tried to model this test case and got stuck in
diff --git a/polly/test/ScopInfo/long-sequence-of-error-blocks-2.ll b/polly/test/ScopInfo/long-sequence-of-error-blocks-2.ll
index e32748a4bbb57..064a0d3e700b9 100644
--- a/polly/test/ScopInfo/long-sequence-of-error-blocks-2.ll
+++ b/polly/test/ScopInfo/long-sequence-of-error-blocks-2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-detect -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
diff --git a/polly/test/ScopInfo/long-sequence-of-error-blocks.ll b/polly/test/ScopInfo/long-sequence-of-error-blocks.ll
index b32b87b5c3f3a..edaadd61dc020 100644
--- a/polly/test/ScopInfo/long-sequence-of-error-blocks.ll
+++ b/polly/test/ScopInfo/long-sequence-of-error-blocks.ll
@@ -1,5 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output \
-; RUN: -polly-invariant-load-hoisting=true < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output -polly-invariant-load-hoisting=true < %s 2>&1 | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
diff --git a/polly/test/ScopInfo/loop-multiexit-succ-cond.ll b/polly/test/ScopInfo/loop-multiexit-succ-cond.ll
index 431c907857fec..391f0ec8c0f59 100644
--- a/polly/test/ScopInfo/loop-multiexit-succ-cond.ll
+++ b/polly/test/ScopInfo/loop-multiexit-succ-cond.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s 2>&1 | FileCheck %s --check-prefix=IR
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s 2>&1 | FileCheck %s --check-prefix=IR
 ;
 ; The SCoP contains a loop with multiple exit blocks (BBs after leaving
 ; the loop). The current implementation of deriving their domain derives
diff --git a/polly/test/ScopInfo/loop_affine_bound_0.ll b/polly/test/ScopInfo/loop_affine_bound_0.ll
index 918d4099740ce..fcd56613fc095 100644
--- a/polly/test/ScopInfo/loop_affine_bound_0.ll
+++ b/polly/test/ScopInfo/loop_affine_bound_0.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 
 ; void f(long a[][128], long N, long M) {
 ;   long i, j;
diff --git a/polly/test/ScopInfo/loop_affine_bound_1.ll b/polly/test/ScopInfo/loop_affine_bound_1.ll
index 8f7a87f1c5ac4..392509871a9b7 100644
--- a/polly/test/ScopInfo/loop_affine_bound_1.ll
+++ b/polly/test/ScopInfo/loop_affine_bound_1.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output< %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 
 ;void f(long a[][128], long N, long M) {
 ;  long i, j;
diff --git a/polly/test/ScopInfo/loop_affine_bound_2.ll b/polly/test/ScopInfo/loop_affine_bound_2.ll
index 2d9f997a0767f..665dc1ad244d9 100644
--- a/polly/test/ScopInfo/loop_affine_bound_2.ll
+++ b/polly/test/ScopInfo/loop_affine_bound_2.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 
 ; void f(long a[][128], long N, long M) {
 ;   long i, j;
diff --git a/polly/test/ScopInfo/loop_carry.ll b/polly/test/ScopInfo/loop_carry.ll
index 20ebbfbc8b49c..579f43d874577 100644
--- a/polly/test/ScopInfo/loop_carry.ll
+++ b/polly/test/ScopInfo/loop_carry.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-stmt-granularity=bb '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
 
diff --git a/polly/test/ScopInfo/many-scalar-dependences.ll b/polly/test/ScopInfo/many-scalar-dependences.ll
index 5b003325ef0fb..ddad36065a5c8 100644
--- a/polly/test/ScopInfo/many-scalar-dependences.ll
+++ b/polly/test/ScopInfo/many-scalar-dependences.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; void f(float a[100][100]) {
 ;   float x;
diff --git a/polly/test/ScopInfo/max-loop-depth.ll b/polly/test/ScopInfo/max-loop-depth.ll
index 71e9c02aa8dcc..f33933210247d 100644
--- a/polly/test/ScopInfo/max-loop-depth.ll
+++ b/polly/test/ScopInfo/max-loop-depth.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; void bar();
 ; void foo(int *A, int *B, long int N, long int M) {
diff --git a/polly/test/ScopInfo/memcpy-raw-source.ll b/polly/test/ScopInfo/memcpy-raw-source.ll
index 6c45b0d41b76b..149a2fcfea772 100644
--- a/polly/test/ScopInfo/memcpy-raw-source.ll
+++ b/polly/test/ScopInfo/memcpy-raw-source.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa,scoped-noalias-aa,tbaa '-passes=print<polly-function-scops>' -disable-output < %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa,scoped-noalias-aa,tbaa '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s
 ;
 ; Ensure that ScopInfo's alias analysis for llvm.memcpy,
 ; like the AliasSetTracker, preserves bitcasts.
diff --git a/polly/test/ScopInfo/memcpy.ll b/polly/test/ScopInfo/memcpy.ll
index 95c455f097b21..6b7a9e2edffbe 100644
--- a/polly/test/ScopInfo/memcpy.ll
+++ b/polly/test/ScopInfo/memcpy.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-differing-element-types '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly -S -aa-pipeline=basic-aa -polly-allow-differing-element-types -passes=polly-codegen < %s 2>&1 | FileCheck --check-prefix=IR %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-differing-element-types '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -S -aa-pipeline=basic-aa -polly-allow-differing-element-types '-passes=polly<no-default-opts>' < %s 2>&1 | FileCheck --check-prefix=IR %s
 ;
 ; CHECK: Arrays {
 ; CHECK-NEXT: i8 MemRef_A[*]; // Element size 1
diff --git a/polly/test/ScopInfo/memmove.ll b/polly/test/ScopInfo/memmove.ll
index 8ff471a11cd17..aba886b59d1d5 100644
--- a/polly/test/ScopInfo/memmove.ll
+++ b/polly/test/ScopInfo/memmove.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-differing-element-types '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly -S -aa-pipeline=basic-aa -polly-allow-differing-element-types -passes=polly-codegen < %s 2>&1 | FileCheck --check-prefix=IR %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-differing-element-types '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -S -aa-pipeline=basic-aa -polly-allow-differing-element-types '-passes=polly<no-default-opts>' < %s 2>&1 | FileCheck --check-prefix=IR %s
 ;
 ; CHECK: Arrays {
 ; CHECK-NEXT: i8 MemRef_A[*]; // Element size 1
diff --git a/polly/test/ScopInfo/memset.ll b/polly/test/ScopInfo/memset.ll
index 89b0487728210..7eaec7bd1ad6a 100644
--- a/polly/test/ScopInfo/memset.ll
+++ b/polly/test/ScopInfo/memset.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly -polly-allow-differing-element-types '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly -S -polly-allow-differing-element-types -passes=polly-codegen < %s 2>&1 | FileCheck --check-prefix=IR %s
+; RUN: opt %loadNPMPolly -polly-allow-differing-element-types '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -S -polly-allow-differing-element-types '-passes=polly<no-default-opts>' < %s 2>&1 | FileCheck --check-prefix=IR %s
 ;
 ; CHECK: Arrays {
 ; CHECK-NEXT: i8 MemRef_A[*]; // Element size 1
diff --git a/polly/test/ScopInfo/memset_null.ll b/polly/test/ScopInfo/memset_null.ll
index 9755cf1129e68..7bd3e90b3aa82 100644
--- a/polly/test/ScopInfo/memset_null.ll
+++ b/polly/test/ScopInfo/memset_null.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly -polly-allow-modref-calls '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly -polly-allow-modref-calls -S -passes=polly-codegen < %s
+; RUN: opt %loadNPMPolly -polly-allow-modref-calls '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-allow-modref-calls -S '-passes=polly<no-default-opts>' < %s
 ;
 ; Verify we can handle a memset to "null" and that we do not model it.
 ; TODO: FIXME: We could use the undefined memset to optimize the code further,
diff --git a/polly/test/ScopInfo/mismatching-array-dimensions.ll b/polly/test/ScopInfo/mismatching-array-dimensions.ll
index f825cbff1ec56..cd12421344f7f 100644
--- a/polly/test/ScopInfo/mismatching-array-dimensions.ll
+++ b/polly/test/ScopInfo/mismatching-array-dimensions.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 
 ; CHECK-NOT: AssumedContext
 
diff --git a/polly/test/ScopInfo/mod_ref_access_pointee_arguments.ll b/polly/test/ScopInfo/mod_ref_access_pointee_arguments.ll
index 6bc5f8d8eb73f..1e289425e86d7 100644
--- a/polly/test/ScopInfo/mod_ref_access_pointee_arguments.ll
+++ b/polly/test/ScopInfo/mod_ref_access_pointee_arguments.ll
@@ -1,7 +1,5 @@
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -polly-allow-modref-calls \
-; RUN: -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-stmt-granularity=bb -passes=polly-codegen -polly-allow-modref-calls \
-; RUN: -disable-output < %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-stmt-granularity=bb '-passes=polly-custom<scops>' -polly-print-scops -polly-allow-modref-calls -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-stmt-granularity=bb '-passes=polly<no-default-opts>' -polly-allow-modref-calls -disable-output < %s
 ;
 ; Verify that we model the may-write access of the prefetch intrinsic
 ; correctly, i.e., that A is accessed by it but B is not.
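For context on the prefetch test just above, a minimal sketch of the call shape being modeled; the operand meanings follow the LLVM LangRef, and the value name %arrayidx is illustrative rather than taken from the test:

; llvm.prefetch takes (address, rw, locality, cache-type); rw = 1 selects a
; write prefetch, which is why the call must be modeled as a may-write of the
; array behind its pointer operand, while unrelated arrays stay untouched.
declare void @llvm.prefetch.p0(ptr, i32, i32, i32)
;
; Inside the statement, with %arrayidx pointing into A:
;   call void @llvm.prefetch.p0(ptr %arrayidx, i32 1, i32 3, i32 1)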
diff --git a/polly/test/ScopInfo/mod_ref_read_pointee_arguments.ll b/polly/test/ScopInfo/mod_ref_read_pointee_arguments.ll
index 21322bc648f8e..0b6e64da437fd 100644
--- a/polly/test/ScopInfo/mod_ref_read_pointee_arguments.ll
+++ b/polly/test/ScopInfo/mod_ref_read_pointee_arguments.ll
@@ -1,7 +1,5 @@
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -polly-allow-modref-calls \
-; RUN: -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -passes=polly-codegen -disable-output \
-; RUN: -polly-allow-modref-calls < %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-stmt-granularity=bb '-passes=polly-custom<scops>' -polly-print-scops -polly-allow-modref-calls -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=polly<no-default-opts>' -disable-output -polly-allow-modref-calls < %s
 ;
 ; Verify that we model the read access of the gcread intrinsic
 ; correctly, i.e., that A is read by it but B is not.
diff --git a/polly/test/ScopInfo/mod_ref_read_pointer.ll b/polly/test/ScopInfo/mod_ref_read_pointer.ll
index 25e56a08a961b..25d59d9f7fd16 100644
--- a/polly/test/ScopInfo/mod_ref_read_pointer.ll
+++ b/polly/test/ScopInfo/mod_ref_read_pointer.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-modref-calls '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-modref-calls -passes=polly-codegen -disable-output < %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-modref-calls '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-modref-calls '-passes=polly<no-default-opts>' -disable-output < %s
 ;
 ; Check that we assume the call to func has a read on the whole A array.
 ;
diff --git a/polly/test/ScopInfo/mod_ref_read_pointers.ll b/polly/test/ScopInfo/mod_ref_read_pointers.ll
index 5cc96cf3a06eb..f8cbb084aefe8 100644
--- a/polly/test/ScopInfo/mod_ref_read_pointers.ll
+++ b/polly/test/ScopInfo/mod_ref_read_pointers.ll
@@ -1,7 +1,5 @@
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-function-scops>' -polly-allow-modref-calls \
-; RUN: -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -passes=polly-codegen -disable-output \
-; RUN: -polly-allow-modref-calls < %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=polly-custom<scops>' -polly-print-scops -polly-allow-modref-calls -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=polly<no-default-opts>' -disable-output -polly-allow-modref-calls < %s
 ;
 ; Check that the call to func will "read" not only the A array but also the
 ; B array. The reason is the readonly annotation of func.
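A minimal sketch of why a readonly callee widens the read set as described above; the declaration is hypothetical, and memory(read) is the current spelling of the annotation the comment refers to:

; A callee that may read arbitrary memory forces a conservative model: the
; call is treated as a read of every array visible in the SCoP, not only of
; the array actually passed as an argument.
declare void @func(ptr) memory(read)
;
;   call void @func(ptr %A)   ; modeled as a read of MemRef_A and MemRef_B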
diff --git a/polly/test/ScopInfo/modulo_zext_1.ll b/polly/test/ScopInfo/modulo_zext_1.ll
index 0a8957da4931a..a9b53d53aea7e 100644
--- a/polly/test/ScopInfo/modulo_zext_1.ll
+++ b/polly/test/ScopInfo/modulo_zext_1.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; CHECK: Assumed Context:
 ; CHECK-NEXT: [N] -> { : }
diff --git a/polly/test/ScopInfo/modulo_zext_2.ll b/polly/test/ScopInfo/modulo_zext_2.ll
index 7af2411e7e8c4..f86ddcea9fe2b 100644
--- a/polly/test/ScopInfo/modulo_zext_2.ll
+++ b/polly/test/ScopInfo/modulo_zext_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; CHECK: Assumed Context:
 ; CHECK-NEXT: [N] -> { : }
diff --git a/polly/test/ScopInfo/modulo_zext_3.ll b/polly/test/ScopInfo/modulo_zext_3.ll
index 1dac723aa2c23..21596d16a6e14 100644
--- a/polly/test/ScopInfo/modulo_zext_3.ll
+++ b/polly/test/ScopInfo/modulo_zext_3.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; CHECK: Assumed Context:
 ; CHECK-NEXT: [N] -> { : }
diff --git a/polly/test/ScopInfo/multi-scop.ll b/polly/test/ScopInfo/multi-scop.ll
index c6dc1f201efa2..8647d89c91d7a 100644
--- a/polly/test/ScopInfo/multi-scop.ll
+++ b/polly/test/ScopInfo/multi-scop.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-detect -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
 ; This test case contains two scops.
diff --git a/polly/test/ScopInfo/multidim_2d-diagonal-matrix.ll b/polly/test/ScopInfo/multidim_2d-diagonal-matrix.ll
index bd46532d87f10..8785458e42f2c 100644
--- a/polly/test/ScopInfo/multidim_2d-diagonal-matrix.ll
+++ b/polly/test/ScopInfo/multidim_2d-diagonal-matrix.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 
 ; Derived from the following code:
diff --git a/polly/test/ScopInfo/multidim_2d_outer_parametric_offset.ll b/polly/test/ScopInfo/multidim_2d_outer_parametric_offset.ll
index cdd46304c932b..5de07bad6bd06 100644
--- a/polly/test/ScopInfo/multidim_2d_outer_parametric_offset.ll
+++ b/polly/test/ScopInfo/multidim_2d_outer_parametric_offset.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 
 ; Derived from the following code:
diff --git a/polly/test/ScopInfo/multidim_2d_parametric_array_static_loop_bounds.ll b/polly/test/ScopInfo/multidim_2d_parametric_array_static_loop_bounds.ll
index 0b735b9106189..984f41cd1e9bf 100644
--- a/polly/test/ScopInfo/multidim_2d_parametric_array_static_loop_bounds.ll
+++ b/polly/test/ScopInfo/multidim_2d_parametric_array_static_loop_bounds.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 
 ; Derived from the following code:
diff --git a/polly/test/ScopInfo/multidim_2d_with_modref_call.ll b/polly/test/ScopInfo/multidim_2d_with_modref_call.ll
index befca87972c19..96b822ad4aa86 100644
--- a/polly/test/ScopInfo/multidim_2d_with_modref_call.ll
+++ b/polly/test/ScopInfo/multidim_2d_with_modref_call.ll
@@ -1,9 +1,5 @@
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -polly-allow-modref-calls \
-; RUN: -polly-invariant-load-hoisting=true \
-; RUN: -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -polly-allow-nonaffine \
-; RUN: -polly-invariant-load-hoisting=true \
-; RUN: -polly-allow-modref-calls -disable-output < %s 2>&1 | FileCheck %s --check-prefix=NONAFFINE
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<scops>' -polly-print-scops -polly-allow-modref-calls -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<scops>' -polly-print-scops -polly-allow-nonaffine -polly-invariant-load-hoisting=true -polly-allow-modref-calls -disable-output < %s 2>&1 | FileCheck %s --check-prefix=NONAFFINE
 
 ; TODO: We should delinearize the accesses despite the use in a call to a
 ;       readonly function. For now we verify we do not delinearize them though.
diff --git a/polly/test/ScopInfo/multidim_2d_with_modref_call_2.ll b/polly/test/ScopInfo/multidim_2d_with_modref_call_2.ll
index cceb5353d74c0..c04cc200e06bd 100644
--- a/polly/test/ScopInfo/multidim_2d_with_modref_call_2.ll
+++ b/polly/test/ScopInfo/multidim_2d_with_modref_call_2.ll
@@ -1,9 +1,5 @@
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -polly-allow-modref-calls \
-; RUN: -polly-invariant-load-hoisting=true \
-; RUN: -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -polly-allow-nonaffine \
-; RUN: -polly-invariant-load-hoisting=true \
-; RUN: -polly-allow-modref-calls -disable-output < %s 2>&1 | FileCheck %s --check-prefix=NONAFFINE
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<scops>' -polly-print-scops -polly-allow-modref-calls -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<scops>' -polly-print-scops -polly-allow-nonaffine -polly-invariant-load-hoisting=true -polly-allow-modref-calls -disable-output < %s 2>&1 | FileCheck %s --check-prefix=NONAFFINE
 
 ; TODO: We should delinearize the accesses despite the use in a call to a
 ;       readonly function. For now we verify we do not delinearize them though.
diff --git a/polly/test/ScopInfo/multidim_3d_parametric_array_static_loop_bounds.ll b/polly/test/ScopInfo/multidim_3d_parametric_array_static_loop_bounds.ll
index c957dd10ed652..2abd37c9f82d0 100644
--- a/polly/test/ScopInfo/multidim_3d_parametric_array_static_loop_bounds.ll
+++ b/polly/test/ScopInfo/multidim_3d_parametric_array_static_loop_bounds.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 
 ; void foo(long n, long m, long o, double A[n][m][o]) {
diff --git a/polly/test/ScopInfo/multidim_fixedsize_different_dimensionality.ll b/polly/test/ScopInfo/multidim_fixedsize_different_dimensionality.ll
index 4a1ee3b1af51d..47cbc0bb1c534 100644
--- a/polly/test/ScopInfo/multidim_fixedsize_different_dimensionality.ll
+++ b/polly/test/ScopInfo/multidim_fixedsize_different_dimensionality.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; #define N 400
 ;
diff --git a/polly/test/ScopInfo/multidim_fixedsize_multi_offset.ll b/polly/test/ScopInfo/multidim_fixedsize_multi_offset.ll
index 9a6d8fbe12755..e82869616d63c 100644
--- a/polly/test/ScopInfo/multidim_fixedsize_multi_offset.ll
+++ b/polly/test/ScopInfo/multidim_fixedsize_multi_offset.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; CHECK: Context:
 ; CHECK-NEXT: { : }
diff --git a/polly/test/ScopInfo/multidim_fold_constant_dim.ll b/polly/test/ScopInfo/multidim_fold_constant_dim.ll
index 9f47694022868..dde847bb8d4d7 100644
--- a/polly/test/ScopInfo/multidim_fold_constant_dim.ll
+++ b/polly/test/ScopInfo/multidim_fold_constant_dim.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; struct com {
 ;   double Real;
diff --git a/polly/test/ScopInfo/multidim_fold_constant_dim_zero.ll b/polly/test/ScopInfo/multidim_fold_constant_dim_zero.ll
index 5778126ad8f17..84222f73b7c6d 100644
--- a/polly/test/ScopInfo/multidim_fold_constant_dim_zero.ll
+++ b/polly/test/ScopInfo/multidim_fold_constant_dim_zero.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -debug -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<scops>' -polly-print-scops -debug -disable-output < %s 2>&1 | FileCheck %s
 
 ; REQUIRES: asserts
 
diff --git a/polly/test/ScopInfo/multidim_fortran_2d.ll b/polly/test/ScopInfo/multidim_fortran_2d.ll
index e5b005f17dcc7..10314606a8123 100644
--- a/polly/test/ScopInfo/multidim_fortran_2d.ll
+++ b/polly/test/ScopInfo/multidim_fortran_2d.ll
@@ -1,5 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' \
-; RUN: -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s
 
 ; subroutine init_array(ni, nj, pi, pj, a)
 ; implicit none
diff --git a/polly/test/ScopInfo/multidim_fortran_2d_params.ll b/polly/test/ScopInfo/multidim_fortran_2d_params.ll
index a7f7ebc130362..992df969f9cc2 100644
--- a/polly/test/ScopInfo/multidim_fortran_2d_params.ll
+++ b/polly/test/ScopInfo/multidim_fortran_2d_params.ll
@@ -1,6 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output \
-; RUN: -polly-precise-fold-accesses \
-; RUN: -polly-invariant-load-hoisting=true < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output -polly-precise-fold-accesses -polly-invariant-load-hoisting=true < %s 2>&1 | FileCheck %s
 
 ; subroutine init_array(ni, nj, pi, pj, a)
 ; implicit none
diff --git a/polly/test/ScopInfo/multidim_fortran_2d_with_modref_call.ll b/polly/test/ScopInfo/multidim_fortran_2d_with_modref_call.ll
index 5f3080a12fdbe..79fd4c286745e 100644
--- a/polly/test/ScopInfo/multidim_fortran_2d_with_modref_call.ll
+++ b/polly/test/ScopInfo/multidim_fortran_2d_with_modref_call.ll
@@ -1,9 +1,5 @@
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -polly-allow-modref-calls \
-; RUN: -polly-invariant-load-hoisting=true \
-; RUN: -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -polly-allow-nonaffine \
-; RUN: -polly-invariant-load-hoisting=true \
-; RUN: -polly-allow-modref-calls -disable-output < %s 2>&1 | FileCheck %s --check-prefix=NONAFFINE
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<scops>' -polly-print-scops -polly-allow-modref-calls -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<scops>' -polly-print-scops -polly-allow-nonaffine -polly-invariant-load-hoisting=true -polly-allow-modref-calls -disable-output < %s 2>&1 | FileCheck %s --check-prefix=NONAFFINE
 
 ; TODO: We should delinearize the accesses despite the use in a call to a
 ;       readonly function. For now we verify we do not delinearize them though.
diff --git a/polly/test/ScopInfo/multidim_fortran_srem.ll b/polly/test/ScopInfo/multidim_fortran_srem.ll
index 31cc633fa65c6..62ff184f7a6b6 100644
--- a/polly/test/ScopInfo/multidim_fortran_srem.ll
+++ b/polly/test/ScopInfo/multidim_fortran_srem.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 target datalayout = "e-p:64:64:64-S128-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f16:16:16-f32:32:32-f64:64:64-f128:128:128-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
 
 ; CHECK: Statements {
diff --git a/polly/test/ScopInfo/multidim_gep_pointercast.ll b/polly/test/ScopInfo/multidim_gep_pointercast.ll
index fd8048b11f14b..aa7932fb737f0 100644
--- a/polly/test/ScopInfo/multidim_gep_pointercast.ll
+++ b/polly/test/ScopInfo/multidim_gep_pointercast.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; The load access to A has a pointer-bitcast to another element size before the
 ; GetElementPtr. Verify that we do not use the GEP delinearization because it
diff --git a/polly/test/ScopInfo/multidim_gep_pointercast2.ll b/polly/test/ScopInfo/multidim_gep_pointercast2.ll
index 9daae4b1ce3db..0475506fa9f1a 100644
--- a/polly/test/ScopInfo/multidim_gep_pointercast2.ll
+++ b/polly/test/ScopInfo/multidim_gep_pointercast2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; Verify that we do not use the GetElementPtr information to delinearize A
 ; because of the cast in-between. Use the single-dimensional modeling instead.
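As background for the two pointercast tests above, a sketch of the difference between the two models; the array bound 128 is made up for illustration:

; With an unobstructed GEP chain, an access like A[i][j] keeps its shape and
; can be modeled multi-dimensionally:
;   %gep = getelementptr inbounds [128 x float], ptr %A, i64 %i, i64 %j
;   ; modeled as MemRef_A[i][j]
; A cast to a different element size between the base pointer and the GEP
; hides this structure, so only the flattened single-dimensional form
; MemRef_A[128 * i + j] (counted in the new element size) can be modeled.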
diff --git a/polly/test/ScopInfo/multidim_invalid_dimension.ll b/polly/test/ScopInfo/multidim_invalid_dimension.ll
index e1ec2e1ce3be0..1cf79f1bd8de1 100644
--- a/polly/test/ScopInfo/multidim_invalid_dimension.ll
+++ b/polly/test/ScopInfo/multidim_invalid_dimension.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s | FileCheck %s
 
 target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
 target triple = "aarch64-unknown-linux-gnueabi"
diff --git a/polly/test/ScopInfo/multidim_ivs_and_integer_offsets_3d.ll b/polly/test/ScopInfo/multidim_ivs_and_integer_offsets_3d.ll
index 92b42a9e7a870..7779748c8c7f6 100644
--- a/polly/test/ScopInfo/multidim_ivs_and_integer_offsets_3d.ll
+++ b/polly/test/ScopInfo/multidim_ivs_and_integer_offsets_3d.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 
 ; void foo(long n, long m, long o, double A[n][m][o]) {
diff --git a/polly/test/ScopInfo/multidim_ivs_and_parameteric_offsets_3d.ll b/polly/test/ScopInfo/multidim_ivs_and_parameteric_offsets_3d.ll
index 261cba1e68aad..49e0a9b60657b 100644
--- a/polly/test/ScopInfo/multidim_ivs_and_parameteric_offsets_3d.ll
+++ b/polly/test/ScopInfo/multidim_ivs_and_parameteric_offsets_3d.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-precise-fold-accesses '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-precise-fold-accesses '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 
 ; void foo(long n, long m, long o, double A[n][m][o], long p, long q, long r) {
diff --git a/polly/test/ScopInfo/multidim_many_references.ll b/polly/test/ScopInfo/multidim_many_references.ll
index f0f1c2b1f39db..a4edc9e725ac4 100644
--- a/polly/test/ScopInfo/multidim_many_references.ll
+++ b/polly/test/ScopInfo/multidim_many_references.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-ignore-aliasing -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-ignore-aliasing -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-ignore-aliasing -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-ignore-aliasing -disable-output < %s 2>&1 | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
diff --git a/polly/test/ScopInfo/multidim_nested_start_integer.ll b/polly/test/ScopInfo/multidim_nested_start_integer.ll
index 6ee9798a050d7..c98aece41a9e1 100644
--- a/polly/test/ScopInfo/multidim_nested_start_integer.ll
+++ b/polly/test/ScopInfo/multidim_nested_start_integer.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 
 ; void foo(long n, long m, long o, double A[n][m][o]) {
diff --git a/polly/test/ScopInfo/multidim_nested_start_share_parameter.ll b/polly/test/ScopInfo/multidim_nested_start_share_parameter.ll
index e238bddf4783b..12c8d97f5d63b 100644
--- a/polly/test/ScopInfo/multidim_nested_start_share_parameter.ll
+++ b/polly/test/ScopInfo/multidim_nested_start_share_parameter.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 
 ; void foo(long n, long m, long o, double A[n][m][o]) {
diff --git a/polly/test/ScopInfo/multidim_only_ivs_2d.ll b/polly/test/ScopInfo/multidim_only_ivs_2d.ll
index 33b321716edc3..a9685d12eb178 100644
--- a/polly/test/ScopInfo/multidim_only_ivs_2d.ll
+++ b/polly/test/ScopInfo/multidim_only_ivs_2d.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 
 ; Derived from the following code:
diff --git a/polly/test/ScopInfo/multidim_only_ivs_3d.ll b/polly/test/ScopInfo/multidim_only_ivs_3d.ll
index 39ea4243d9426..bb9c302eaf06a 100644
--- a/polly/test/ScopInfo/multidim_only_ivs_3d.ll
+++ b/polly/test/ScopInfo/multidim_only_ivs_3d.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 
 ; void foo(long n, long m, long o, double A[n][m][o]) {
diff --git a/polly/test/ScopInfo/multidim_only_ivs_3d_cast.ll b/polly/test/ScopInfo/multidim_only_ivs_3d_cast.ll
index 7f7f7f91067e2..7f0c8b12be9ba 100644
--- a/polly/test/ScopInfo/multidim_only_ivs_3d_cast.ll
+++ b/polly/test/ScopInfo/multidim_only_ivs_3d_cast.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 
 ; void foo(int n, int m, int o, double A[n][m][o]) {
 ;
diff --git a/polly/test/ScopInfo/multidim_only_ivs_3d_reverse.ll b/polly/test/ScopInfo/multidim_only_ivs_3d_reverse.ll
index 1675110ffd6f1..797a037a6770e 100644
--- a/polly/test/ScopInfo/multidim_only_ivs_3d_reverse.ll
+++ b/polly/test/ScopInfo/multidim_only_ivs_3d_reverse.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 
 ; This test case checks for array access functions where the order in which the
diff --git a/polly/test/ScopInfo/multidim_param_in_subscript-2.ll b/polly/test/ScopInfo/multidim_param_in_subscript-2.ll
index da9827fd5f2c6..3a21702b36727 100644
--- a/polly/test/ScopInfo/multidim_param_in_subscript-2.ll
+++ b/polly/test/ScopInfo/multidim_param_in_subscript-2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-precise-fold-accesses '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-precise-fold-accesses '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; void foo(long n, long m, float A[][n][m]) {
 ;   for (long i = 0; i < 100; i++)
diff --git a/polly/test/ScopInfo/multidim_param_in_subscript.ll b/polly/test/ScopInfo/multidim_param_in_subscript.ll
index c86b5f0ae2386..cc3fa87c8ba04 100644
--- a/polly/test/ScopInfo/multidim_param_in_subscript.ll
+++ b/polly/test/ScopInfo/multidim_param_in_subscript.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 ;
 ;
 ; void foo(long n, float A[][n]) {
diff --git a/polly/test/ScopInfo/multidim_parameter_addrec_product.ll b/polly/test/ScopInfo/multidim_parameter_addrec_product.ll
index da563a05560cd..117671ddc6a22 100644
--- a/polly/test/ScopInfo/multidim_parameter_addrec_product.ll
+++ b/polly/test/ScopInfo/multidim_parameter_addrec_product.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-invariant-load-hoisting=true '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-invariant-load-hoisting=true '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; void foo(float *A, long *p) {
 ;   for (long i = 0; i < 100; i++)
diff --git a/polly/test/ScopInfo/multidim_single_and_multidim_array.ll b/polly/test/ScopInfo/multidim_single_and_multidim_array.ll
index 7059e5396987b..5ebe0daaec470 100644
--- a/polly/test/ScopInfo/multidim_single_and_multidim_array.ll
+++ b/polly/test/ScopInfo/multidim_single_and_multidim_array.ll
@@ -1,11 +1,11 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-delinearize=false -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-delinearize=false -polly-allow-nonaffine -disable-output < %s 2>&1 | FileCheck %s --check-prefix=NONAFFINE
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=DELIN
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-allow-nonaffine -disable-output < %s 2>&1 | FileCheck %s --check-prefix=DELIN
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-delinearize=false -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-delinearize=false
-polly-allow-nonaffine -disable-output < %s 2>&1 | FileCheck %s --check-prefix=NONAFFINE -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=DELIN -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-allow-nonaffine -disable-output < %s 2>&1 | FileCheck %s --check-prefix=DELIN +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-delinearize=false -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-delinearize=false -polly-allow-nonaffine -disable-output < %s 2>&1 | FileCheck %s --check-prefix=NONAFFINE +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s --check-prefix=DELIN +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-allow-nonaffine -disable-output < %s 2>&1 | FileCheck %s --check-prefix=DELIN +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-delinearize=false -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-delinearize=false -polly-allow-nonaffine -disable-output < %s 2>&1 | FileCheck %s --check-prefix=NONAFFINE +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s --check-prefix=DELIN +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-allow-nonaffine -disable-output < %s 2>&1 | FileCheck %s --check-prefix=DELIN target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" diff --git a/polly/test/ScopInfo/multidim_srem.ll b/polly/test/ScopInfo/multidim_srem.ll index 88c8c6af648e0..5c1b0ea7e6150 100644 --- a/polly/test/ScopInfo/multidim_srem.ll +++ b/polly/test/ScopInfo/multidim_srem.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; ; void foo(long n, float A[][n][n]) { ; for (long i = 0; i < 200; i++) diff --git a/polly/test/ScopInfo/multidim_with_bitcast.ll b/polly/test/ScopInfo/multidim_with_bitcast.ll index 0ab9c2d93ff46..941ec637dba3d 100644 --- a/polly/test/ScopInfo/multidim_with_bitcast.ll +++ b/polly/test/ScopInfo/multidim_with_bitcast.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" diff --git a/polly/test/ScopInfo/multiple-binary-or-conditions.ll b/polly/test/ScopInfo/multiple-binary-or-conditions.ll index 65416e6fffda3..ecfc0012fd59f 100644 --- a/polly/test/ScopInfo/multiple-binary-or-conditions.ll +++ b/polly/test/ScopInfo/multiple-binary-or-conditions.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s -; RUN: opt %loadNPMPolly -passes=polly-codegen -disable-output < %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -disable-output < %s ; ; void or(float *A, long n, long m) { ; for (long i = 0; i < 100; i++) { diff 
--git a/polly/test/ScopInfo/multiple-types-access-offset-not-dividable-by-element-size.ll b/polly/test/ScopInfo/multiple-types-access-offset-not-dividable-by-element-size.ll index 910e624adb50a..9ae664fd497c8 100644 --- a/polly/test/ScopInfo/multiple-types-access-offset-not-dividable-by-element-size.ll +++ b/polly/test/ScopInfo/multiple-types-access-offset-not-dividable-by-element-size.ll @@ -1,6 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -pass-remarks-analysis="polly-scops" \ -; RUN: -polly-allow-differing-element-types \ -; RUN: -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<scops>' -polly-print-scops -pass-remarks-analysis=polly-scops -polly-allow-differing-element-types -disable-output < %s 2>&1 | FileCheck %s ; ; // For the following accesses the offset expression from the base pointer ; // is not always a multiple of the type size. diff --git a/polly/test/ScopInfo/multiple-types-non-affine-2.ll b/polly/test/ScopInfo/multiple-types-non-affine-2.ll index cb0630da1b2e6..6530dbf8d75be 100644 --- a/polly/test/ScopInfo/multiple-types-non-affine-2.ll +++ b/polly/test/ScopInfo/multiple-types-non-affine-2.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-allow-differing-element-types '-passes=print<polly-function-scops>' -polly-allow-nonaffine -disable-output < %s 2>&1 | FileCheck %s -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-allow-differing-element-types -passes=polly-codegen -polly-allow-nonaffine -disable-output +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-allow-differing-element-types '-passes=polly-custom<scops>' -polly-print-scops -polly-allow-nonaffine -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-allow-differing-element-types '-passes=polly<no-default-opts>' -polly-allow-nonaffine -disable-output ; ; // Check that accessing one array with different types works, ; // even though some accesses are non-affine. diff --git a/polly/test/ScopInfo/multiple-types-non-affine.ll b/polly/test/ScopInfo/multiple-types-non-affine.ll index 7349c5ae48ba2..7f5f995fd6d26 100644 --- a/polly/test/ScopInfo/multiple-types-non-affine.ll +++ b/polly/test/ScopInfo/multiple-types-non-affine.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-allow-differing-element-types '-passes=print<polly-function-scops>' -polly-allow-nonaffine -disable-output < %s 2>&1 | FileCheck %s -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-allow-differing-element-types -passes=polly-codegen -polly-allow-nonaffine -disable-output +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-allow-differing-element-types '-passes=polly-custom<scops>' -polly-print-scops -polly-allow-nonaffine -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-allow-differing-element-types '-passes=polly<no-default-opts>' -polly-allow-nonaffine -disable-output ; ; // Check that accessing one array with different types works, ; // even though some accesses are non-affine. 
diff --git a/polly/test/ScopInfo/multiple-types-non-power-of-two-2.ll b/polly/test/ScopInfo/multiple-types-non-power-of-two-2.ll index df280c88f8668..5890a5a2ea3bf 100644 --- a/polly/test/ScopInfo/multiple-types-non-power-of-two-2.ll +++ b/polly/test/ScopInfo/multiple-types-non-power-of-two-2.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-allow-differing-element-types -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-allow-differing-element-types -disable-output < %s 2>&1 | FileCheck %s ; ; void multiple_types(i8 *A) { ; for (long i = 0; i < 100; i++) { diff --git a/polly/test/ScopInfo/multiple-types-non-power-of-two.ll b/polly/test/ScopInfo/multiple-types-non-power-of-two.ll index b9494187d0ff3..3e8390aad300f 100644 --- a/polly/test/ScopInfo/multiple-types-non-power-of-two.ll +++ b/polly/test/ScopInfo/multiple-types-non-power-of-two.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-allow-differing-element-types -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-allow-differing-element-types -disable-output < %s 2>&1 | FileCheck %s ; ; void multiple_types(i8 *A) { ; for (long i = 0; i < 100; i++) { diff --git a/polly/test/ScopInfo/multiple-types-two-dimensional-2.ll b/polly/test/ScopInfo/multiple-types-two-dimensional-2.ll index e971ccc0ba448..4e71f9b5dd66b 100644 --- a/polly/test/ScopInfo/multiple-types-two-dimensional-2.ll +++ b/polly/test/ScopInfo/multiple-types-two-dimensional-2.ll @@ -1,6 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -pass-remarks-analysis="polly-scops" \ -; RUN: -polly-allow-differing-element-types \ -; RUN: -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -pass-remarks-analysis=polly-scops -polly-allow-differing-element-types -disable-output < %s 2>&1 | FileCheck %s ; ; ; void foo(long n, long m, char A[][m]) { diff --git a/polly/test/ScopInfo/multiple-types-two-dimensional.ll b/polly/test/ScopInfo/multiple-types-two-dimensional.ll index 34179508cae89..9899fe4bde7ed 100644 --- a/polly/test/ScopInfo/multiple-types-two-dimensional.ll +++ b/polly/test/ScopInfo/multiple-types-two-dimensional.ll @@ -1,6 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -pass-remarks-analysis="polly-scops" \ -; RUN: -polly-allow-differing-element-types \ -; RUN: -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -pass-remarks-analysis=polly-scops -polly-allow-differing-element-types -disable-output < %s 2>&1 | FileCheck %s ; ; void foo(long n, long m, char A[][m]) { ; for (long i = 0; i < n; i++) diff --git a/polly/test/ScopInfo/multiple-types.ll b/polly/test/ScopInfo/multiple-types.ll index 84d7d3349e29d..753386575d33a 100644 --- a/polly/test/ScopInfo/multiple-types.ll +++ b/polly/test/ScopInfo/multiple-types.ll @@ -1,5 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' \ -; RUN: -polly-allow-differing-element-types -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<scops>' -polly-print-scops -polly-allow-differing-element-types -disable-output < %s 2>&1 | FileCheck %s ; ; // Check that accessing one array with different types works. 
; void multiple_types(char *Short, char *Float, char *Double) { diff --git a/polly/test/ScopInfo/multiple_exiting_blocks.ll b/polly/test/ScopInfo/multiple_exiting_blocks.ll index b0c425ee62cc4..218e5c4108c90 100644 --- a/polly/test/ScopInfo/multiple_exiting_blocks.ll +++ b/polly/test/ScopInfo/multiple_exiting_blocks.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; ; The SCoP contains a loop with multiple exit blocks (BBs after leaving ; the loop). The current implementation of deriving their domain derives diff --git a/polly/test/ScopInfo/multiple_exiting_blocks_two_loop.ll b/polly/test/ScopInfo/multiple_exiting_blocks_two_loop.ll index ff0ec47be1c58..d3a70fdb96130 100644 --- a/polly/test/ScopInfo/multiple_exiting_blocks_two_loop.ll +++ b/polly/test/ScopInfo/multiple_exiting_blocks_two_loop.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; ; The SCoP contains a loop with multiple exit blocks (BBs after leaving ; the loop). The current implementation of deriving their domain derives diff --git a/polly/test/ScopInfo/multiple_latch_blocks.ll b/polly/test/ScopInfo/multiple_latch_blocks.ll index e5085daa2ca16..0aa25f4ad70f6 100644 --- a/polly/test/ScopInfo/multiple_latch_blocks.ll +++ b/polly/test/ScopInfo/multiple_latch_blocks.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; ; CHECK: Domain := ; CHECK: [N, P] -> { Stmt_if_end[i0] : 0 <= i0 < N and (i0 > P or i0 < P) }; diff --git a/polly/test/ScopInfo/nested-loops.ll b/polly/test/ScopInfo/nested-loops.ll index 91002979f4fa4..7998a3896d9d4 100644 --- a/polly/test/ScopInfo/nested-loops.ll +++ b/polly/test/ScopInfo/nested-loops.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128" diff --git a/polly/test/ScopInfo/no-scalar-deps-in-non-affine-subregion.ll b/polly/test/ScopInfo/no-scalar-deps-in-non-affine-subregion.ll index df010846bed20..f1ad40baf33ea 100644 --- a/polly/test/ScopInfo/no-scalar-deps-in-non-affine-subregion.ll +++ b/polly/test/ScopInfo/no-scalar-deps-in-non-affine-subregion.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; ; Check that we do not generate any scalar dependences regarding x. 
It is ; defined and used in the non-affine subregion only, thus we do not need diff --git a/polly/test/ScopInfo/non-affine-region-phi.ll b/polly/test/ScopInfo/non-affine-region-phi.ll index 3fb655e60f1c0..0248004c27f50 100644 --- a/polly/test/ScopInfo/non-affine-region-phi.ll +++ b/polly/test/ScopInfo/non-affine-region-phi.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly -polly-allow-nonaffine -S < %s 2>&1 | FileCheck %s --check-prefix=CODE -; RUN: opt %loadNPMPolly -polly-allow-nonaffine '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-allow-nonaffine '-passes=polly-custom<scops>' -polly-print-scops -S < %s 2>&1 | FileCheck %s --check-prefix=CODE +; RUN: opt %loadNPMPolly -polly-allow-nonaffine '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; ; Verify there is a phi in the non-affine region but it is not represented in ; the SCoP as all operands as well as the uses are inside the region too. diff --git a/polly/test/ScopInfo/non-affine-region-with-loop-2.ll b/polly/test/ScopInfo/non-affine-region-with-loop-2.ll index 4c3ca4d21447d..158fe772c6d29 100644 --- a/polly/test/ScopInfo/non-affine-region-with-loop-2.ll +++ b/polly/test/ScopInfo/non-affine-region-with-loop-2.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-allow-nonaffine-loops '-passes=print<polly-detect>,print<polly-function-scops>,scop(polly-codegen)' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-allow-nonaffine-loops '-passes=polly-custom<detect>' -polly-print-detect -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; ; CHECK: Stmt_loop3 ; CHECK: Domain := diff --git a/polly/test/ScopInfo/non-affine-region-with-loop.ll b/polly/test/ScopInfo/non-affine-region-with-loop.ll index f4c028ac23409..bcb542f2cbf70 100644 --- a/polly/test/ScopInfo/non-affine-region-with-loop.ll +++ b/polly/test/ScopInfo/non-affine-region-with-loop.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly -polly-allow-nonaffine-loops '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s -; RUN: opt %loadNPMPolly -polly-allow-nonaffine-loops -passes=polly-codegen -disable-output +; RUN: opt %loadNPMPolly -polly-allow-nonaffine-loops '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-allow-nonaffine-loops '-passes=polly<no-default-opts>' -disable-output ; ; CHECK: Domain := ; CHECK-NEXT: { Stmt_loop2__TO__loop[] }; diff --git a/polly/test/ScopInfo/non-precise-inv-load-1.ll b/polly/test/ScopInfo/non-precise-inv-load-1.ll index d55344b355f13..d100b514a0be3 100644 --- a/polly/test/ScopInfo/non-precise-inv-load-1.ll +++ b/polly/test/ScopInfo/non-precise-inv-load-1.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s ; ; Verify we do hoist the invariant access to I with an execution context ; as the address computation might wrap in the original but not in our diff --git a/polly/test/ScopInfo/non-precise-inv-load-2.ll b/polly/test/ScopInfo/non-precise-inv-load-2.ll index 79ef3b88cb4f0..fad8fcd918446 100644 --- a/polly/test/ScopInfo/non-precise-inv-load-2.ll +++ b/polly/test/ScopInfo/non-precise-inv-load-2.ll @@ -1,4 +1,4 @@ -; RUN:
opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s ; ; ; CHECK: Invariant Accesses: { diff --git a/polly/test/ScopInfo/non-precise-inv-load-3.ll b/polly/test/ScopInfo/non-precise-inv-load-3.ll index aa92847661165..d032644c9e5ff 100644 --- a/polly/test/ScopInfo/non-precise-inv-load-3.ll +++ b/polly/test/ScopInfo/non-precise-inv-load-3.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s ; ; CHECK: Invariant Accesses: { ; CHECK-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0] diff --git a/polly/test/ScopInfo/non-precise-inv-load-4.ll b/polly/test/ScopInfo/non-precise-inv-load-4.ll index 2a2241cb5a993..c1ba7ddc62584 100644 --- a/polly/test/ScopInfo/non-precise-inv-load-4.ll +++ b/polly/test/ScopInfo/non-precise-inv-load-4.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s ; ; Verify we hoist I[0] without execution context even though it ; is executed in a statement with an invalid domain. diff --git a/polly/test/ScopInfo/non-precise-inv-load-5.ll b/polly/test/ScopInfo/non-precise-inv-load-5.ll index a414c7c0fed17..c188b5f74b1e9 100644 --- a/polly/test/ScopInfo/non-precise-inv-load-5.ll +++ b/polly/test/ScopInfo/non-precise-inv-load-5.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s ; ; Verify we do not hoist I[c] without execution context because it ; is executed in a statement with an invalid domain and it depends diff --git a/polly/test/ScopInfo/non-precise-inv-load-6.ll b/polly/test/ScopInfo/non-precise-inv-load-6.ll index 1300617f00eeb..b1c19745f1424 100644 --- a/polly/test/ScopInfo/non-precise-inv-load-6.ll +++ b/polly/test/ScopInfo/non-precise-inv-load-6.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s ; ; Check that we model the execution context correctly. 
; diff --git a/polly/test/ScopInfo/non-pure-function-call.ll b/polly/test/ScopInfo/non-pure-function-call.ll index 81d43db5c3522..ad69141a12c66 100644 --- a/polly/test/ScopInfo/non-pure-function-call.ll +++ b/polly/test/ScopInfo/non-pure-function-call.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; ; CHECK: Assumed Context: ; CHECK-NEXT: [N] -> { : } diff --git a/polly/test/ScopInfo/non-pure-function-calls-causes-dead-blocks.ll b/polly/test/ScopInfo/non-pure-function-calls-causes-dead-blocks.ll index 6cbb41041be88..38e1c03a35227 100644 --- a/polly/test/ScopInfo/non-pure-function-calls-causes-dead-blocks.ll +++ b/polly/test/ScopInfo/non-pure-function-calls-causes-dead-blocks.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; ; Error blocks are skipped during SCoP detection. We skip them during ; SCoP formation too as they might contain instructions we can not handle. diff --git a/polly/test/ScopInfo/non-pure-function-calls.ll b/polly/test/ScopInfo/non-pure-function-calls.ll index f97644052272d..d45c32ede7088 100644 --- a/polly/test/ScopInfo/non-pure-function-calls.ll +++ b/polly/test/ScopInfo/non-pure-function-calls.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; ; Allow the user to define function names that are treated as ; error functions and assumed not to be executed. diff --git a/polly/test/ScopInfo/non_affine_access.ll b/polly/test/ScopInfo/non_affine_access.ll index 0338edf053297..0f5d9e7c43e4e 100644 --- a/polly/test/ScopInfo/non_affine_access.ll +++ b/polly/test/ScopInfo/non_affine_access.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s -; RUN: opt %loadNPMPolly '-passes=print<polly-detect>,print<polly-function-scops>' -polly-allow-nonaffine -disable-output < %s 2>&1 | FileCheck %s -check-prefix=NONAFFINE +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-detect -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-detect -polly-print-scops -polly-allow-nonaffine -disable-output < %s 2>&1 | FileCheck %s -check-prefix=NONAFFINE target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" ; void foo(long *A) { diff --git a/polly/test/ScopInfo/non_affine_region_1.ll b/polly/test/ScopInfo/non_affine_region_1.ll index 8980a711b325d..5934962f81567 100644 --- a/polly/test/ScopInfo/non_affine_region_1.ll +++ b/polly/test/ScopInfo/non_affine_region_1.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-allow-nonaffine '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-allow-nonaffine '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; ; Verify only the incoming scalar x is modeled as a read in the non-affine ; region. 
diff --git a/polly/test/ScopInfo/non_affine_region_2.ll b/polly/test/ScopInfo/non_affine_region_2.ll index b2e072f7a3bfa..aa083616cac8e 100644 --- a/polly/test/ScopInfo/non_affine_region_2.ll +++ b/polly/test/ScopInfo/non_affine_region_2.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; ; Verify the scalar x defined in a non-affine subregion is written as it ; escapes the region. In this test the two conditionals inside the region diff --git a/polly/test/ScopInfo/non_affine_region_3.ll b/polly/test/ScopInfo/non_affine_region_3.ll index d850cb5c95aad..b7c4c1b9bd545 100644 --- a/polly/test/ScopInfo/non_affine_region_3.ll +++ b/polly/test/ScopInfo/non_affine_region_3.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; ; Verify the scalar x defined in a non-affine subregion is written as it ; escapes the region. In this test the two conditionals inside the region diff --git a/polly/test/ScopInfo/non_affine_region_4.ll b/polly/test/ScopInfo/non_affine_region_4.ll index c5309734a668e..12cda0a53fb3b 100644 --- a/polly/test/ScopInfo/non_affine_region_4.ll +++ b/polly/test/ScopInfo/non_affine_region_4.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; ; Verify that both scalars (x and y) are properly written in the non-affine ; region and read afterwards. 
diff --git a/polly/test/ScopInfo/nonaffine-buildMemoryAccess.ll b/polly/test/ScopInfo/nonaffine-buildMemoryAccess.ll index b1ce00f0df94e..a52aae0d59168 100644 --- a/polly/test/ScopInfo/nonaffine-buildMemoryAccess.ll +++ b/polly/test/ScopInfo/nonaffine-buildMemoryAccess.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-allow-nonaffine-loops '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-allow-nonaffine-loops '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; ; CHECK: Domain := ; CHECK-NEXT: { Stmt_while_cond_i__TO__while_end_i[] }; diff --git a/polly/test/ScopInfo/not-a-reduction.ll b/polly/test/ScopInfo/not-a-reduction.ll index 3a961b2dc1719..84f6564ae4a2e 100644 --- a/polly/test/ScopInfo/not-a-reduction.ll +++ b/polly/test/ScopInfo/not-a-reduction.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | not FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | not FileCheck %s ;#define TYPE float ;#define NUM 4 diff --git a/polly/test/ScopInfo/opaque-struct.ll b/polly/test/ScopInfo/opaque-struct.ll index f4f79525069e5..23b9d3caf741d 100644 --- a/polly/test/ScopInfo/opaque-struct.ll +++ b/polly/test/ScopInfo/opaque-struct.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s ; ; Check that we do not crash with unsized (opaque) types. ; diff --git a/polly/test/ScopInfo/out-of-scop-use-in-region-entry-phi-node-nonaffine-subregion.ll b/polly/test/ScopInfo/out-of-scop-use-in-region-entry-phi-node-nonaffine-subregion.ll index eed27b1c4d9dd..e069ccac55340 100644 --- a/polly/test/ScopInfo/out-of-scop-use-in-region-entry-phi-node-nonaffine-subregion.ll +++ b/polly/test/ScopInfo/out-of-scop-use-in-region-entry-phi-node-nonaffine-subregion.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s 2>&1 | FileCheck %s ; ; Check whether %newval is identified as escaping value, even though it is used ; in a phi that is in the region. Non-affine subregion case. 
diff --git a/polly/test/ScopInfo/out-of-scop-use-in-region-entry-phi-node.ll b/polly/test/ScopInfo/out-of-scop-use-in-region-entry-phi-node.ll index 44da399e704d8..27ea11a23a3fe 100644 --- a/polly/test/ScopInfo/out-of-scop-use-in-region-entry-phi-node.ll +++ b/polly/test/ScopInfo/out-of-scop-use-in-region-entry-phi-node.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; CHECK: MustWriteAccess := [Reduction Type: NONE] [Scalar: 1] ; CHECK-NEXT: [p_0] -> { Stmt_bb3[] -> MemRef_tmp5[] }; diff --git a/polly/test/ScopInfo/parameter-constant-division.ll b/polly/test/ScopInfo/parameter-constant-division.ll index e5dd359158b8b..aaad0dfb2ee60 100644 --- a/polly/test/ScopInfo/parameter-constant-division.ll +++ b/polly/test/ScopInfo/parameter-constant-division.ll @@ -1,6 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' \ -; RUN: -polly-invariant-load-hoisting=true \ -; RUN: -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s ; ; CHECK: Invariant Accesses: { ; CHECK-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0] diff --git a/polly/test/ScopInfo/parameter_in_dead_statement.ll b/polly/test/ScopInfo/parameter_in_dead_statement.ll index b295f17f628af..444f9a9c24b4e 100644 --- a/polly/test/ScopInfo/parameter_in_dead_statement.ll +++ b/polly/test/ScopInfo/parameter_in_dead_statement.ll @@ -1,7 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output \ -; RUN: -polly-invariant-load-hoisting=true < %s 2>&1 | FileCheck %s -; RUN: opt %loadNPMPolly -passes=polly-codegen -S \ -; RUN: -polly-invariant-load-hoisting=true < %s 2>&1 | FileCheck %s --check-prefix=IR +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output -polly-invariant-load-hoisting=true < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S -polly-invariant-load-hoisting=true < %s 2>&1 | FileCheck %s --check-prefix=IR ; ; Verify we do not create assumptions based on the parameter p_1 which is the ; load %0 and due to error-assumptions not "part of the SCoP". 
diff --git a/polly/test/ScopInfo/parameter_product.ll b/polly/test/ScopInfo/parameter_product.ll index 2fe16f9d95f6d..9e6e3d0e1446e 100644 --- a/polly/test/ScopInfo/parameter_product.ll +++ b/polly/test/ScopInfo/parameter_product.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; ; int n, m; ; void foo(char* __restrict a) diff --git a/polly/test/ScopInfo/parameter_with_constant_factor_in_add.ll b/polly/test/ScopInfo/parameter_with_constant_factor_in_add.ll index 6544aaec76f74..20986d17b8f0d 100644 --- a/polly/test/ScopInfo/parameter_with_constant_factor_in_add.ll +++ b/polly/test/ScopInfo/parameter_with_constant_factor_in_add.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; ; Check that the access function of the store is simple and concise ; diff --git a/polly/test/ScopInfo/partially_invariant_load_1.ll b/polly/test/ScopInfo/partially_invariant_load_1.ll index f3923f6127cdd..8d62f156a4394 100644 --- a/polly/test/ScopInfo/partially_invariant_load_1.ll +++ b/polly/test/ScopInfo/partially_invariant_load_1.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s -; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true -S < %s 2>&1 | FileCheck %s --check-prefix=IR +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=true -S < %s 2>&1 | FileCheck %s --check-prefix=IR ; ; CHECK: Invariant Accesses: { ; CHECK-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0] diff --git a/polly/test/ScopInfo/partially_invariant_load_2.ll b/polly/test/ScopInfo/partially_invariant_load_2.ll index d0d74ad99e09b..48580907b2f0b 100644 --- a/polly/test/ScopInfo/partially_invariant_load_2.ll +++ b/polly/test/ScopInfo/partially_invariant_load_2.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s ; ; Check that we do not try to preload *I and assume p != 42. ; diff --git a/polly/test/ScopInfo/phi-in-non-affine-region.ll b/polly/test/ScopInfo/phi-in-non-affine-region.ll index fbbc158b566bb..6d98a6813862e 100644 --- a/polly/test/ScopInfo/phi-in-non-affine-region.ll +++ b/polly/test/ScopInfo/phi-in-non-affine-region.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; Verify that 'tmp' is stored in bb1 and read by bb3, as it is needed as ; incoming value for the tmp11 PHI node. 
diff --git a/polly/test/ScopInfo/phi_after_error_block.ll b/polly/test/ScopInfo/phi_after_error_block.ll index a1eadff3e9717..251be099c1f49 100644 --- a/polly/test/ScopInfo/phi_after_error_block.ll +++ b/polly/test/ScopInfo/phi_after_error_block.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s declare void @bar() diff --git a/polly/test/ScopInfo/phi_condition_modeling_1.ll b/polly/test/ScopInfo/phi_condition_modeling_1.ll index a889ec96a4b12..bd5c51e968ff5 100644 --- a/polly/test/ScopInfo/phi_condition_modeling_1.ll +++ b/polly/test/ScopInfo/phi_condition_modeling_1.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; ; void f(int *A, int c, int N) { ; int tmp; diff --git a/polly/test/ScopInfo/phi_condition_modeling_2.ll b/polly/test/ScopInfo/phi_condition_modeling_2.ll index b56b77e1f4534..281b8d33b7756 100644 --- a/polly/test/ScopInfo/phi_condition_modeling_2.ll +++ b/polly/test/ScopInfo/phi_condition_modeling_2.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; ; void f(int *A, int c, int N) { ; int tmp; diff --git a/polly/test/ScopInfo/phi_conditional_simple_1.ll b/polly/test/ScopInfo/phi_conditional_simple_1.ll index 14fdc38201bc8..6d7f0e9484113 100644 --- a/polly/test/ScopInfo/phi_conditional_simple_1.ll +++ b/polly/test/ScopInfo/phi_conditional_simple_1.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; ; void jd(int *A, int c) { ; for (int i = 0; i < 1024; i++) { diff --git a/polly/test/ScopInfo/phi_loop_carried_float.ll b/polly/test/ScopInfo/phi_loop_carried_float.ll index 76e5507f24b06..2e62dcd5799a3 100644 --- a/polly/test/ScopInfo/phi_loop_carried_float.ll +++ b/polly/test/ScopInfo/phi_loop_carried_float.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; ; float f(float *A, int N) { ; float tmp = 0; diff --git a/polly/test/ScopInfo/phi_not_grouped_at_top.ll b/polly/test/ScopInfo/phi_not_grouped_at_top.ll index c97d9a27b24b7..57d02f24f781b 100644 --- a/polly/test/ScopInfo/phi_not_grouped_at_top.ll +++ b/polly/test/ScopInfo/phi_not_grouped_at_top.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -passes=polly-prepare -disable-output < %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<prepare>' -disable-output < %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" declare i32 @funa() align 2 diff --git a/polly/test/ScopInfo/phi_scalar_simple_1.ll b/polly/test/ScopInfo/phi_scalar_simple_1.ll index ffd1a37f8a79f..600c94e1d9b4c 100644 --- a/polly/test/ScopInfo/phi_scalar_simple_1.ll +++ 
b/polly/test/ScopInfo/phi_scalar_simple_1.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; ; The assumed context should be empty since the <nsw> flags on the IV ; increments already guarantee that there is no wrap in the loop trip diff --git a/polly/test/ScopInfo/phi_scalar_simple_2.ll b/polly/test/ScopInfo/phi_scalar_simple_2.ll index 0d6d9029c61c3..d3353ddc5e4e8 100644 --- a/polly/test/ScopInfo/phi_scalar_simple_2.ll +++ b/polly/test/ScopInfo/phi_scalar_simple_2.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; ; int jd(int *restrict A, int x, int N, int c) { ; for (int i = 0; i < N; i++) diff --git a/polly/test/ScopInfo/phi_with_invoke_edge.ll b/polly/test/ScopInfo/phi_with_invoke_edge.ll index 9c98ec0c603cf..1b01a98fca06a 100644 --- a/polly/test/ScopInfo/phi_with_invoke_edge.ll +++ b/polly/test/ScopInfo/phi_with_invoke_edge.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" declare i32 @generic_personality_v0(i32, i64, ptr, ptr) diff --git a/polly/test/ScopInfo/pointer-comparison-no-nsw.ll b/polly/test/ScopInfo/pointer-comparison-no-nsw.ll index 18ba18c69f1f9..1b983ace1b6a4 100644 --- a/polly/test/ScopInfo/pointer-comparison-no-nsw.ll +++ b/polly/test/ScopInfo/pointer-comparison-no-nsw.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; ; void f(int *A, int *B) { ; while (A != B) { diff --git a/polly/test/ScopInfo/pointer-comparison.ll b/polly/test/ScopInfo/pointer-comparison.ll index 846640ac630ff..f80c4978669c4 100644 --- a/polly/test/ScopInfo/pointer-comparison.ll +++ b/polly/test/ScopInfo/pointer-comparison.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; ; TODO: FIXME: Investigate why we need an InvalidContext here.
; diff --git a/polly/test/ScopInfo/pointer-type-expressions.ll b/polly/test/ScopInfo/pointer-type-expressions.ll index 89dce6536a107..0fdd0bea6f219 100644 --- a/polly/test/ScopInfo/pointer-type-expressions.ll +++ b/polly/test/ScopInfo/pointer-type-expressions.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; void f(int a[], int N, float *P) { ; int i; diff --git a/polly/test/ScopInfo/pointer-used-as-base-pointer-and-scalar-read.ll b/polly/test/ScopInfo/pointer-used-as-base-pointer-and-scalar-read.ll index 7b6d0d542581b..8ad531d93d290 100644 --- a/polly/test/ScopInfo/pointer-used-as-base-pointer-and-scalar-read.ll +++ b/polly/test/ScopInfo/pointer-used-as-base-pointer-and-scalar-read.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; In this test case we pass a pointer %A into a PHI node and also use this ; pointer as base pointer of an array store. As a result, we get both scalar diff --git a/polly/test/ScopInfo/polly-timeout-parameter-bounds.ll b/polly/test/ScopInfo/polly-timeout-parameter-bounds.ll index 13087a517501a..7dfa1ec7905ba 100644 --- a/polly/test/ScopInfo/polly-timeout-parameter-bounds.ll +++ b/polly/test/ScopInfo/polly-timeout-parameter-bounds.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; CHECK: Statements { ; CHECK-NEXT: Stmt_bb9 diff --git a/polly/test/ScopInfo/pr38218.ll b/polly/test/ScopInfo/pr38218.ll index 74103f9a2ac38..2c22b1464876d 100644 --- a/polly/test/ScopInfo/pr38218.ll +++ b/polly/test/ScopInfo/pr38218.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s | FileCheck %s ; ; This code causes the SCoP to be rejected because of an ERRORBLOCK ; assumption and made Polly crash (llvm.org/PR38219). 
diff --git a/polly/test/ScopInfo/preserve-equiv-class-order-in-basic_block.ll b/polly/test/ScopInfo/preserve-equiv-class-order-in-basic_block.ll index 33fa0126aa30e..800b0339a1422 100644 --- a/polly/test/ScopInfo/preserve-equiv-class-order-in-basic_block.ll +++ b/polly/test/ScopInfo/preserve-equiv-class-order-in-basic_block.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=scalar-indep -polly-print-instructions '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly -polly-stmt-granularity=scalar-indep -polly-print-instructions '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s -match-full-lines target datalayout = "e-m:w-i64:64-f80:128-n8:16:32:64-S128" diff --git a/polly/test/ScopInfo/process_added_dimensions.ll b/polly/test/ScopInfo/process_added_dimensions.ll index 2d06f4b995976..9cb932eeef18a 100644 --- a/polly/test/ScopInfo/process_added_dimensions.ll +++ b/polly/test/ScopInfo/process_added_dimensions.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; CHECK: Context: ; CHECK-NEXT: { : } diff --git a/polly/test/ScopInfo/pwaff-complexity-bailout.ll b/polly/test/ScopInfo/pwaff-complexity-bailout.ll index 931e08fb8f2fc..62909f8c3e4c5 100644 --- a/polly/test/ScopInfo/pwaff-complexity-bailout.ll +++ b/polly/test/ScopInfo/pwaff-complexity-bailout.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -pass-remarks-analysis=.* -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops '-pass-remarks-analysis=.*' -disable-output < %s 2>&1 | FileCheck %s ; Make sure we hit the complexity bailout, and don't crash. 
; CHECK: Low complexity assumption: { : false } diff --git a/polly/test/ScopInfo/ranged_parameter.ll b/polly/test/ScopInfo/ranged_parameter.ll index 03562b1fd1245..a6e51c7f2048c 100644 --- a/polly/test/ScopInfo/ranged_parameter.ll +++ b/polly/test/ScopInfo/ranged_parameter.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; ; Check that the constraints on the parameter derived from the ; range metadata (see bottom of the file) are present: diff --git a/polly/test/ScopInfo/ranged_parameter_2.ll b/polly/test/ScopInfo/ranged_parameter_2.ll index 18cbbf3b87cd6..554dd6e38cd00 100644 --- a/polly/test/ScopInfo/ranged_parameter_2.ll +++ b/polly/test/ScopInfo/ranged_parameter_2.ll @@ -1,5 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output -polly-allow-nonaffine -polly-invariant-load-hoisting=true < %s \ -; RUN: -debug 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output -polly-allow-nonaffine -polly-invariant-load-hoisting=true -debug < %s 2>&1 | FileCheck %s ; REQUIRES: asserts diff --git a/polly/test/ScopInfo/ranged_parameter_wrap.ll b/polly/test/ScopInfo/ranged_parameter_wrap.ll index d236eeeefc11c..7ae15c34c94c6 100644 --- a/polly/test/ScopInfo/ranged_parameter_wrap.ll +++ b/polly/test/ScopInfo/ranged_parameter_wrap.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; ; Check that the constraints on the parameter derived from the ; __wrapping__ range metadata (see bottom of the file) are present: diff --git a/polly/test/ScopInfo/ranged_parameter_wrap_2.ll b/polly/test/ScopInfo/ranged_parameter_wrap_2.ll index fc0a737a5edbe..00c3caa9c50ce 100644 --- a/polly/test/ScopInfo/ranged_parameter_wrap_2.ll +++ b/polly/test/ScopInfo/ranged_parameter_wrap_2.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; ; Check that the context is built fast and does not explode due to us ; combining a large number of non-convex ranges. 
Instead, after a certain diff --git a/polly/test/ScopInfo/read-only-scalar-used-in-phi-2.ll b/polly/test/ScopInfo/read-only-scalar-used-in-phi-2.ll index 7e6f2406a0ac8..528dbb102ecb0 100644 --- a/polly/test/ScopInfo/read-only-scalar-used-in-phi-2.ll +++ b/polly/test/ScopInfo/read-only-scalar-used-in-phi-2.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; ; float foo(float sum, float A[]) { ; diff --git a/polly/test/ScopInfo/read-only-scalar-used-in-phi.ll b/polly/test/ScopInfo/read-only-scalar-used-in-phi.ll index 18e6c1fac9e15..6bc1fe71f35f2 100644 --- a/polly/test/ScopInfo/read-only-scalar-used-in-phi.ll +++ b/polly/test/ScopInfo/read-only-scalar-used-in-phi.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; ; float foo(float sum, float A[]) { ; diff --git a/polly/test/ScopInfo/read-only-scalars.ll b/polly/test/ScopInfo/read-only-scalars.ll index f04163e480284..7c78d621930c5 100644 --- a/polly/test/ScopInfo/read-only-scalars.ll +++ b/polly/test/ScopInfo/read-only-scalars.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-analyze-read-only-scalars=false '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-analyze-read-only-scalars=true '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s -check-prefix=SCALARS +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-analyze-read-only-scalars=false '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-analyze-read-only-scalars=true '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s -check-prefix=SCALARS ; CHECK-NOT: Memref_scalar diff --git a/polly/test/ScopInfo/read-only-statements.ll b/polly/test/ScopInfo/read-only-statements.ll index 7bac53a2b6b51..c1cb618a45f64 100644 --- a/polly/test/ScopInfo/read-only-statements.ll +++ b/polly/test/ScopInfo/read-only-statements.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; ; Check we remove read only statements. 
; diff --git a/polly/test/ScopInfo/reduction_alternating_base.ll b/polly/test/ScopInfo/reduction_alternating_base.ll index e38ff6046ac01..474c6ac64ffc1 100644 --- a/polly/test/ScopInfo/reduction_alternating_base.ll +++ b/polly/test/ScopInfo/reduction_alternating_base.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; ; ; void f(int *A) { diff --git a/polly/test/ScopInfo/reduction_chain_partially_outside_the_scop.ll b/polly/test/ScopInfo/reduction_chain_partially_outside_the_scop.ll index 17f9dc57f2823..e91eeaf544a05 100644 --- a/polly/test/ScopInfo/reduction_chain_partially_outside_the_scop.ll +++ b/polly/test/ScopInfo/reduction_chain_partially_outside_the_scop.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; ; CHECK: Reduction Type: NONE ; diff --git a/polly/test/ScopInfo/reduction_different_index.ll b/polly/test/ScopInfo/reduction_different_index.ll index d2786d5fd6779..5c169f71f4fe8 100644 --- a/polly/test/ScopInfo/reduction_different_index.ll +++ b/polly/test/ScopInfo/reduction_different_index.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; Verify that the following case is not detected as a reduction. ; ; void f(int *A,int *sum) { diff --git a/polly/test/ScopInfo/reduction_different_index1.ll b/polly/test/ScopInfo/reduction_different_index1.ll index 710ae3e74f21a..93ab77be84de9 100644 --- a/polly/test/ScopInfo/reduction_different_index1.ll +++ b/polly/test/ScopInfo/reduction_different_index1.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; Verify that the following case is not detected as a reduction.
 ;
 ; void f(int *A, int *sum, int i1, int i2) {
diff --git a/polly/test/ScopInfo/reduction_disabled_multiplicative.ll b/polly/test/ScopInfo/reduction_disabled_multiplicative.ll
index 61228e075dabe..618e4d3ab3f98 100644
--- a/polly/test/ScopInfo/reduction_disabled_multiplicative.ll
+++ b/polly/test/ScopInfo/reduction_disabled_multiplicative.ll
@@ -1,4 +1,4 @@
-; RUN: opt -aa-pipeline=basic-aa %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -polly-disable-multiplicative-reductions -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt -aa-pipeline=basic-aa %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<scops>' -polly-print-scops -polly-disable-multiplicative-reductions -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; CHECK: ReadAccess := [Reduction Type: +
 ; CHECK: { Stmt_for_body[i0] -> MemRef_sum[0] };
diff --git a/polly/test/ScopInfo/reduction_double.ll b/polly/test/ScopInfo/reduction_double.ll
index d126d3d833ee1..a7721d1b42e46 100644
--- a/polly/test/ScopInfo/reduction_double.ll
+++ b/polly/test/ScopInfo/reduction_double.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -basic-aa -polly-print-scops -disable-output -polly-allow-nonaffine < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output -polly-allow-nonaffine < %s | FileCheck %s
 ;
 ; Verify if two independent reductions in same loop is detected
 ;
diff --git a/polly/test/ScopInfo/reduction_escaping_intermediate.ll b/polly/test/ScopInfo/reduction_escaping_intermediate.ll
index c66a8be0852fa..86923458ee773 100644
--- a/polly/test/ScopInfo/reduction_escaping_intermediate.ll
+++ b/polly/test/ScopInfo/reduction_escaping_intermediate.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; void f(int N, int * restrict sums, int * restrict escape) {
 ; int i, j;
diff --git a/polly/test/ScopInfo/reduction_escaping_intermediate_2.ll b/polly/test/ScopInfo/reduction_escaping_intermediate_2.ll
index c574d315b2fe1..641d2e7337e77 100644
--- a/polly/test/ScopInfo/reduction_escaping_intermediate_2.ll
+++ b/polly/test/ScopInfo/reduction_escaping_intermediate_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-stmt-granularity=bb '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; void f(int N, int * restrict sums, int * restrict escape) {
 ; int i, j;
diff --git a/polly/test/ScopInfo/reduction_escaping_intermediate_3.ll b/polly/test/ScopInfo/reduction_escaping_intermediate_3.ll
index 92a071ea1c372..dd2a76ebbd368 100644
--- a/polly/test/ScopInfo/reduction_escaping_intermediate_3.ll
+++ b/polly/test/ScopInfo/reduction_escaping_intermediate_3.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -basic-aa -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s | FileCheck %s
 ;
 ; void f(int N, int * restrict sums, int * restrict escape) {
 ; int i, j;
diff --git a/polly/test/ScopInfo/reduction_if.ll b/polly/test/ScopInfo/reduction_if.ll
index 4f7d3681e0a0b..53a62a3b857e9 100644
--- a/polly/test/ScopInfo/reduction_if.ll
+++ b/polly/test/ScopInfo/reduction_if.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -basic-aa -polly-print-scops -disable-output -polly-allow-nonaffine < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output -polly-allow-nonaffine < %s | FileCheck %s
 ;
 ; Verify if reduction spread across multiple blocks in a single scop statement are detected
 ;
diff --git a/polly/test/ScopInfo/reduction_indirect_access.ll b/polly/test/ScopInfo/reduction_indirect_access.ll
index 7acac4b150f40..cb54cd9581368 100644
--- a/polly/test/ScopInfo/reduction_indirect_access.ll
+++ b/polly/test/ScopInfo/reduction_indirect_access.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -basic-aa -polly-print-scops -polly-allow-nonaffine -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-allow-nonaffine -disable-output < %s | FileCheck %s
 ;
 ; CHECK: Reduction Type: NONE
 ; CHECK: MemRef_INDICES[i0]
diff --git a/polly/test/ScopInfo/reduction_indirect_access_2.ll b/polly/test/ScopInfo/reduction_indirect_access_2.ll
index 331953991d86c..5642a8470f124 100644
--- a/polly/test/ScopInfo/reduction_indirect_access_2.ll
+++ b/polly/test/ScopInfo/reduction_indirect_access_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -basic-aa -polly-print-scops -disable-output -polly-allow-nonaffine < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output -polly-allow-nonaffine < %s | FileCheck %s
 ;
 ; Validate that the accesses to INDICES[i] is not part of a reduction.
 ;
diff --git a/polly/test/ScopInfo/reduction_invalid_different_operators.ll b/polly/test/ScopInfo/reduction_invalid_different_operators.ll
index 9846f1029c087..9e6b3cd431083 100644
--- a/polly/test/ScopInfo/reduction_invalid_different_operators.ll
+++ b/polly/test/ScopInfo/reduction_invalid_different_operators.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; int f() {
 ; int i, sum = 0, sth = 0;
diff --git a/polly/test/ScopInfo/reduction_invalid_overlapping_accesses.ll b/polly/test/ScopInfo/reduction_invalid_overlapping_accesses.ll
index 4d70e53304556..7ae7d8ed3ffa2 100644
--- a/polly/test/ScopInfo/reduction_invalid_overlapping_accesses.ll
+++ b/polly/test/ScopInfo/reduction_invalid_overlapping_accesses.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; void f(int *sums) {
 ; int i, j;
diff --git a/polly/test/ScopInfo/reduction_long_reduction_chain.ll b/polly/test/ScopInfo/reduction_long_reduction_chain.ll
index 62ae1fef187b6..6f2f48005bdac 100644
--- a/polly/test/ScopInfo/reduction_long_reduction_chain.ll
+++ b/polly/test/ScopInfo/reduction_long_reduction_chain.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -basic-aa -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s | FileCheck %s
 ;
 ; CHECK: Reduction Type: +
 ; CHECK: MemRef_sum
diff --git a/polly/test/ScopInfo/reduction_long_reduction_chain_double_use.ll b/polly/test/ScopInfo/reduction_long_reduction_chain_double_use.ll
index 7ca46fa9535ac..2fd71c28d5211 100644
--- a/polly/test/ScopInfo/reduction_long_reduction_chain_double_use.ll
+++ b/polly/test/ScopInfo/reduction_long_reduction_chain_double_use.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -basic-aa -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s | FileCheck %s
 ;
 ; Sum is added twice in the statement. Hence no reduction.
 ; CHECK: Reduction Type: NONE
diff --git a/polly/test/ScopInfo/reduction_multiple_different_operators.ll b/polly/test/ScopInfo/reduction_multiple_different_operators.ll
index b77c72a291744..4f049a3505b09 100644
--- a/polly/test/ScopInfo/reduction_multiple_different_operators.ll
+++ b/polly/test/ScopInfo/reduction_multiple_different_operators.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s | FileCheck %s
 ;
 ; Should not be identified as reduction as there are different operations
 ; involved on sum (multiplication followed by addition)
diff --git a/polly/test/ScopInfo/reduction_multiple_loops_array_sum.ll b/polly/test/ScopInfo/reduction_multiple_loops_array_sum.ll
index 800eb2043dc62..0d016674ffc08 100644
--- a/polly/test/ScopInfo/reduction_multiple_loops_array_sum.ll
+++ b/polly/test/ScopInfo/reduction_multiple_loops_array_sum.ll
@@ -1,4 +1,4 @@
-; RUN: opt -aa-pipeline=basic-aa %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt -aa-pipeline=basic-aa %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; CHECK: Stmt_for_body
 ; CHECK: Reduction Type: *
diff --git a/polly/test/ScopInfo/reduction_multiple_loops_array_sum_1.ll b/polly/test/ScopInfo/reduction_multiple_loops_array_sum_1.ll
index 49ebdcb044988..568513aedfa10 100644
--- a/polly/test/ScopInfo/reduction_multiple_loops_array_sum_1.ll
+++ b/polly/test/ScopInfo/reduction_multiple_loops_array_sum_1.ll
@@ -1,4 +1,4 @@
-; RUN: opt -aa-pipeline=basic-aa %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt -aa-pipeline=basic-aa %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; CHECK: Stmt_for_body
 ; CHECK: Reduction Type: NONE
diff --git a/polly/test/ScopInfo/reduction_multiple_simple_binary.ll b/polly/test/ScopInfo/reduction_multiple_simple_binary.ll
index 77b71f4df301b..0ac50b3b92c47 100644
--- a/polly/test/ScopInfo/reduction_multiple_simple_binary.ll
+++ b/polly/test/ScopInfo/reduction_multiple_simple_binary.ll
@@ -1,4 +1,4 @@
-; RUN: opt -aa-pipeline=basic-aa %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt -aa-pipeline=basic-aa %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; CHECK: ReadAccess := [Reduction Type: NONE
 ; CHECK: { Stmt_for_body[i0] -> MemRef_A[1 + i0] };
diff --git a/polly/test/ScopInfo/reduction_non_overlapping_chains.ll b/polly/test/ScopInfo/reduction_non_overlapping_chains.ll
index 61aaa051e49d1..f01b641b17f64 100644
--- a/polly/test/ScopInfo/reduction_non_overlapping_chains.ll
+++ b/polly/test/ScopInfo/reduction_non_overlapping_chains.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; CHECK: Reduction Type: +
 ; CHECK: Reduction Type: +
diff --git a/polly/test/ScopInfo/reduction_only_reduction_like_access.ll b/polly/test/ScopInfo/reduction_only_reduction_like_access.ll
index fb6d236764b74..51685dca8b7da 100644
--- a/polly/test/ScopInfo/reduction_only_reduction_like_access.ll
+++ b/polly/test/ScopInfo/reduction_only_reduction_like_access.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; CHECK: Reduction Type: +
 ;
diff --git a/polly/test/ScopInfo/reduction_simple_fp.ll b/polly/test/ScopInfo/reduction_simple_fp.ll
index aa4cd00f39f59..67139bba2fded 100644
--- a/polly/test/ScopInfo/reduction_simple_fp.ll
+++ b/polly/test/ScopInfo/reduction_simple_fp.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; CHECK: Function: f_no_fast_math
 ; CHECK: Reduction Type: NONE
diff --git a/polly/test/ScopInfo/reduction_simple_w_constant.ll b/polly/test/ScopInfo/reduction_simple_w_constant.ll
index e385b66f9db21..c17184624c066 100644
--- a/polly/test/ScopInfo/reduction_simple_w_constant.ll
+++ b/polly/test/ScopInfo/reduction_simple_w_constant.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; CHECK: Reduction Type: +
 ;
diff --git a/polly/test/ScopInfo/reduction_simple_w_iv.ll b/polly/test/ScopInfo/reduction_simple_w_iv.ll
index e22eccbb2831d..7cc50bfe78906 100644
--- a/polly/test/ScopInfo/reduction_simple_w_iv.ll
+++ b/polly/test/ScopInfo/reduction_simple_w_iv.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; CHECK: Reduction Type: +
 ;
diff --git a/polly/test/ScopInfo/reduction_two_identical_reads.ll b/polly/test/ScopInfo/reduction_two_identical_reads.ll
index 8f00954f7efc3..35cb9dfcdb122 100644
--- a/polly/test/ScopInfo/reduction_two_identical_reads.ll
+++ b/polly/test/ScopInfo/reduction_two_identical_reads.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; CHECK: Reduction Type: NONE
 ;
diff --git a/polly/test/ScopInfo/redundant_parameter_constraint.ll b/polly/test/ScopInfo/redundant_parameter_constraint.ll
index ad71f1f59e18b..7512da420af0e 100644
--- a/polly/test/ScopInfo/redundant_parameter_constraint.ll
+++ b/polly/test/ScopInfo/redundant_parameter_constraint.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; The constraint that r2 has to be bigger than r1 is implicitly contained in
 ; the domain, hence we do not want to see it explicitly.
diff --git a/polly/test/ScopInfo/region-with-instructions.ll b/polly/test/ScopInfo/region-with-instructions.ll
index d4720511b7aad..38d58c97e1b05 100644
--- a/polly/test/ScopInfo/region-with-instructions.ll
+++ b/polly/test/ScopInfo/region-with-instructions.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-print-instructions -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-print-instructions -disable-output < %s 2>&1 | FileCheck %s
 
 ; CHECK: Statements {
 ; CHECK: Stmt_bb46
diff --git a/polly/test/ScopInfo/remarks.ll b/polly/test/ScopInfo/remarks.ll
index 10cc57aa27a14..2d6ace988659d 100644
--- a/polly/test/ScopInfo/remarks.ll
+++ b/polly/test/ScopInfo/remarks.ll
@@ -1,5 +1,4 @@
-; RUN: opt %loadNPMPolly -pass-remarks-analysis="polly-scops" '-passes=print<polly-function-scops>' \
-; RUN: -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -pass-remarks-analysis=polly-scops '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; CHECK: remark: test/ScopInfo/remarks.c:4:7: SCoP begins here.
 ; CHECK: remark: test/ScopInfo/remarks.c:9:15: Inbounds assumption: [N, M, Debug] -> { : M <= 100 }
diff --git a/polly/test/ScopInfo/required-invariant-loop-bounds.ll b/polly/test/ScopInfo/required-invariant-loop-bounds.ll
index abf0b0e23855c..3bb5bfb0765e3 100644
--- a/polly/test/ScopInfo/required-invariant-loop-bounds.ll
+++ b/polly/test/ScopInfo/required-invariant-loop-bounds.ll
@@ -1,5 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output \
-; RUN: -polly-invariant-load-hoisting=true < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output -polly-invariant-load-hoisting=true < %s 2>&1 | FileCheck %s
 ;
 ; CHECK: Invariant Accesses: {
 ; CHECK-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0]
diff --git a/polly/test/ScopInfo/restriction_in_dead_block.ll b/polly/test/ScopInfo/restriction_in_dead_block.ll
index 487c585cb9d9c..dd6115c421d0c 100644
--- a/polly/test/ScopInfo/restriction_in_dead_block.ll
+++ b/polly/test/ScopInfo/restriction_in_dead_block.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; Verify we do not generate an empty invalid context only because the wrap
 ; in the second conditional will always happen if the block is executed.
diff --git a/polly/test/ScopInfo/run-time-check-many-array-disjuncts.ll b/polly/test/ScopInfo/run-time-check-many-array-disjuncts.ll
index 702b7dc5e0049..e8df1eccd5945 100644
--- a/polly/test/ScopInfo/run-time-check-many-array-disjuncts.ll
+++ b/polly/test/ScopInfo/run-time-check-many-array-disjuncts.ll
@@ -1,6 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 \
-; RUN: | FileCheck %s -check-prefix=DETECT
-; RUN: opt %loadNPMPolly '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s -check-prefix=DETECT
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-detect -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; DETECT: Valid Region for Scop: bb124 => bb176
 ;
diff --git a/polly/test/ScopInfo/run-time-check-many-parameters.ll b/polly/test/ScopInfo/run-time-check-many-parameters.ll
index 559c38d2682ef..2a8853322f1d5 100644
--- a/polly/test/ScopInfo/run-time-check-many-parameters.ll
+++ b/polly/test/ScopInfo/run-time-check-many-parameters.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-detect -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; A valid Scop would print the list of it's statements, we check that we do not
 ; see that list.
diff --git a/polly/test/ScopInfo/run-time-check-many-piecewise-aliasing.ll b/polly/test/ScopInfo/run-time-check-many-piecewise-aliasing.ll
index 3cf4c40bdb60f..5e71e7a9d2a46 100644
--- a/polly/test/ScopInfo/run-time-check-many-piecewise-aliasing.ll
+++ b/polly/test/ScopInfo/run-time-check-many-piecewise-aliasing.ll
@@ -1,6 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 \
-; RUN: | FileCheck %s -check-prefix=DETECT
-; RUN: opt %loadNPMPolly '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s -check-prefix=DETECT
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-detect -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; DETECT: Valid Region for Scop: for => return
 ;
diff --git a/polly/test/ScopInfo/run-time-check-read-only-arrays.ll b/polly/test/ScopInfo/run-time-check-read-only-arrays.ll
index 51ab81476d542..286f878f935f4 100644
--- a/polly/test/ScopInfo/run-time-check-read-only-arrays.ll
+++ b/polly/test/ScopInfo/run-time-check-read-only-arrays.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; void foo(float *A, float *B, float *C, long N) {
 ; for (long i = 0; i < N; i++)
diff --git a/polly/test/ScopInfo/same-base-address-scalar-and-array.ll b/polly/test/ScopInfo/same-base-address-scalar-and-array.ll
index dd809ba156c79..9f4d6f5895aeb 100644
--- a/polly/test/ScopInfo/same-base-address-scalar-and-array.ll
+++ b/polly/test/ScopInfo/same-base-address-scalar-and-array.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; Verify we introduce two ScopArrayInfo objects (or virtual arrays) for the %out variable
 ; as it is used as a memory base pointer (%0) but also as a scalar (%out.addr.0.lcssa).
diff --git a/polly/test/ScopInfo/scalar.ll b/polly/test/ScopInfo/scalar.ll
index 812d2fddc3c8e..db8371d96b118 100644
--- a/polly/test/ScopInfo/scalar.ll
+++ b/polly/test/ScopInfo/scalar.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
 
diff --git a/polly/test/ScopInfo/scalar_dependence_cond_br.ll b/polly/test/ScopInfo/scalar_dependence_cond_br.ll
index 59549f3dbbad5..a09bdaf06844e 100644
--- a/polly/test/ScopInfo/scalar_dependence_cond_br.ll
+++ b/polly/test/ScopInfo/scalar_dependence_cond_br.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output< %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; void f(int *A, int c, int d) {
 ; for (int i = 0; i < 1024; i++)
diff --git a/polly/test/ScopInfo/scalar_to_array.ll b/polly/test/ScopInfo/scalar_to_array.ll
index 3f61d0d723046..e71c515fa2d35 100644
--- a/polly/test/ScopInfo/scalar_to_array.ll
+++ b/polly/test/ScopInfo/scalar_to_array.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 
 ; ModuleID = 'scalar_to_array.ll'
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
diff --git a/polly/test/ScopInfo/scev-div-with-evaluatable-divisor.ll b/polly/test/ScopInfo/scev-div-with-evaluatable-divisor.ll
index fa0c81fe9a48e..66c50dcbe13f3 100644
--- a/polly/test/ScopInfo/scev-div-with-evaluatable-divisor.ll
+++ b/polly/test/ScopInfo/scev-div-with-evaluatable-divisor.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 
 ; Derived from test-suite/SingleSource/UnitTests/Vector/SSE/sse.stepfft.c
 
diff --git a/polly/test/ScopInfo/scev-invalidated.ll b/polly/test/ScopInfo/scev-invalidated.ll
index 6b9efd4b37c7d..e0956df0b1e84 100644
--- a/polly/test/ScopInfo/scev-invalidated.ll
+++ b/polly/test/ScopInfo/scev-invalidated.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; CHECK: Region: %if.then6---%return
 ;
diff --git a/polly/test/ScopInfo/schedule-const-post-dominator-walk-2.ll b/polly/test/ScopInfo/schedule-const-post-dominator-walk-2.ll
index 6e2ed1240b071..4a280cc929e3a 100644
--- a/polly/test/ScopInfo/schedule-const-post-dominator-walk-2.ll
+++ b/polly/test/ScopInfo/schedule-const-post-dominator-walk-2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 
 ; The SCoP contains a loop with multiple exit blocks (BBs after leaving
 ; the loop). The current implementation of deriving their domain derives
diff --git a/polly/test/ScopInfo/schedule-const-post-dominator-walk.ll b/polly/test/ScopInfo/schedule-const-post-dominator-walk.ll
index d0e8a2accaa2c..777c0088c4ddd 100644
--- a/polly/test/ScopInfo/schedule-const-post-dominator-walk.ll
+++ b/polly/test/ScopInfo/schedule-const-post-dominator-walk.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 
 ; The SCoP contains a loop with multiple exit blocks (BBs after leaving
 ; the loop). The current implementation of deriving their domain derives
diff --git a/polly/test/ScopInfo/schedule-constuction-endless-loop1.ll b/polly/test/ScopInfo/schedule-constuction-endless-loop1.ll
index 9ffc30f7360e9..15dea5a7f4dd8 100644
--- a/polly/test/ScopInfo/schedule-constuction-endless-loop1.ll
+++ b/polly/test/ScopInfo/schedule-constuction-endless-loop1.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-detect -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; Check that we do not build a SCoP and do not crash.
 ;
diff --git a/polly/test/ScopInfo/schedule-constuction-endless-loop2.ll b/polly/test/ScopInfo/schedule-constuction-endless-loop2.ll
index 65f2f99b48c1b..9ac6643564f7b 100644
--- a/polly/test/ScopInfo/schedule-constuction-endless-loop2.ll
+++ b/polly/test/ScopInfo/schedule-constuction-endless-loop2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-detect -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; Check that we do not build a SCoP and do not crash.
 ;
diff --git a/polly/test/ScopInfo/schedule-incorrectly-contructed-in-case-of-infinite-loop.ll b/polly/test/ScopInfo/schedule-incorrectly-contructed-in-case-of-infinite-loop.ll
index 7c36f8d7f72e8..1657d2f37d8ba 100644
--- a/polly/test/ScopInfo/schedule-incorrectly-contructed-in-case-of-infinite-loop.ll
+++ b/polly/test/ScopInfo/schedule-incorrectly-contructed-in-case-of-infinite-loop.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-process-unprofitable '-passes=print<polly-function-scops>' -disable-output < %s
+; RUN: opt %loadNPMPolly -polly-process-unprofitable '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s
 ;
 ; This test contains a infinite loop (bb13) and crashed the domain generation
 ; at some point. Just verify it does not anymore.
diff --git a/polly/test/ScopInfo/scop-affine-parameter-ordering.ll b/polly/test/ScopInfo/scop-affine-parameter-ordering.ll
index c8a234e9cbce7..76bb438d43ff7 100644
--- a/polly/test/ScopInfo/scop-affine-parameter-ordering.ll
+++ b/polly/test/ScopInfo/scop-affine-parameter-ordering.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-i128:128-n8:16:32:64-S128"
 target triple = "aarch64--linux-android"
diff --git a/polly/test/ScopInfo/sign_wrapped_set.ll b/polly/test/ScopInfo/sign_wrapped_set.ll
index 93b63df1c5841..135976e7d51c6 100644
--- a/polly/test/ScopInfo/sign_wrapped_set.ll
+++ b/polly/test/ScopInfo/sign_wrapped_set.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-allow-nonaffine -polly-process-unprofitable '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-allow-nonaffine -polly-process-unprofitable '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; CHECK: Domain :=
 ; CHECK-NEXT: [srcHeight] -> { Stmt_for_cond6_preheader_us[i0] : 0 <= i0 <= -3 + srcHeight };
diff --git a/polly/test/ScopInfo/simple_loop_1.ll b/polly/test/ScopInfo/simple_loop_1.ll
index e736f3382d905..1d9f5c2edebcb 100644
--- a/polly/test/ScopInfo/simple_loop_1.ll
+++ b/polly/test/ScopInfo/simple_loop_1.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 
 ; void f(int a[], int N) {
 ; int i;
diff --git a/polly/test/ScopInfo/simple_loop_2.ll b/polly/test/ScopInfo/simple_loop_2.ll
index ae83dd633b96e..877f860ba5a90 100644
--- a/polly/test/ScopInfo/simple_loop_2.ll
+++ b/polly/test/ScopInfo/simple_loop_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 
 ; void f(int a[], int N) {
 ; int i;
diff --git a/polly/test/ScopInfo/simple_loop_unsigned.ll b/polly/test/ScopInfo/simple_loop_unsigned.ll
index c4a96e4381c94..d3834297e2668 100644
--- a/polly/test/ScopInfo/simple_loop_unsigned.ll
+++ b/polly/test/ScopInfo/simple_loop_unsigned.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 
 ; void f(int a[], unsigned N) {
 ; unsigned i;
diff --git a/polly/test/ScopInfo/simple_loop_unsigned_2.ll b/polly/test/ScopInfo/simple_loop_unsigned_2.ll
index 37e907dc006f3..1da6053a8316b 100644
--- a/polly/test/ScopInfo/simple_loop_unsigned_2.ll
+++ b/polly/test/ScopInfo/simple_loop_unsigned_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 
 ; CHECK: Assumed Context:
 ; CHECK-NEXT: [N] -> { : }
diff --git a/polly/test/ScopInfo/simple_loop_unsigned_3.ll b/polly/test/ScopInfo/simple_loop_unsigned_3.ll
index 7f2cf5caa1ce7..0d44bf64ffc18 100644
--- a/polly/test/ScopInfo/simple_loop_unsigned_3.ll
+++ b/polly/test/ScopInfo/simple_loop_unsigned_3.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 
 ; CHECK: Assumed Context:
 ; CHECK-NEXT: [N] -> { : }
diff --git a/polly/test/ScopInfo/simple_nonaffine_loop_not.ll b/polly/test/ScopInfo/simple_nonaffine_loop_not.ll
index 4df0d343b0fc9..f70b3fa3ea21a 100644
--- a/polly/test/ScopInfo/simple_nonaffine_loop_not.ll
+++ b/polly/test/ScopInfo/simple_nonaffine_loop_not.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | not FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | not FileCheck %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 @.str = private unnamed_addr constant [17 x i8] c"Random Value: %d\00", align 1
diff --git a/polly/test/ScopInfo/smax.ll b/polly/test/ScopInfo/smax.ll
index 8968e13192477..3ba2b35e7e503 100644
--- a/polly/test/ScopInfo/smax.ll
+++ b/polly/test/ScopInfo/smax.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 
 target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:64:128-a0:0:32-n32-S64"
 define void @foo(ptr noalias %data, ptr noalias %ptr, i32 %x_pos, i32 %w) {
diff --git a/polly/test/ScopInfo/statistics.ll b/polly/test/ScopInfo/statistics.ll
index 0a294f2016eba..aa72db3065259 100644
--- a/polly/test/ScopInfo/statistics.ll
+++ b/polly/test/ScopInfo/statistics.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -stats -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -stats -disable-output < %s 2>&1 | FileCheck %s
 ; REQUIRES: asserts
 
 ; CHECK-DAG: 4 polly-scops - Maximal number of loops in scops
diff --git a/polly/test/ScopInfo/stmt_split_exit_of_region_stmt.ll b/polly/test/ScopInfo/stmt_split_exit_of_region_stmt.ll
index a46acb090b7fd..54832607f11d5 100644
--- a/polly/test/ScopInfo/stmt_split_exit_of_region_stmt.ll
+++ b/polly/test/ScopInfo/stmt_split_exit_of_region_stmt.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-print-instructions '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-print-instructions '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; CHECK: Statements {
 ; CHECK-NEXT: Stmt_Region__TO__Stmt
diff --git a/polly/test/ScopInfo/stmt_split_no_after_split.ll b/polly/test/ScopInfo/stmt_split_no_after_split.ll
index 3a5ebf0725b10..0a4284bdd34f5 100644
--- a/polly/test/ScopInfo/stmt_split_no_after_split.ll
+++ b/polly/test/ScopInfo/stmt_split_no_after_split.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-print-instructions '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-print-instructions '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; CHECK: Statements {
 ; CHECK-NEXT: Stmt_Stmt
diff --git a/polly/test/ScopInfo/stmt_split_no_dependence.ll b/polly/test/ScopInfo/stmt_split_no_dependence.ll
index 9edd0f0a13e59..ed2180407c68d 100644
--- a/polly/test/ScopInfo/stmt_split_no_dependence.ll
+++ b/polly/test/ScopInfo/stmt_split_no_dependence.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-print-instructions '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-print-instructions '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 
 ; void func(int *A, int *B){
 ; for (int i = 0; i < 1024; i+=1) {
diff --git a/polly/test/ScopInfo/stmt_split_on_store.ll b/polly/test/ScopInfo/stmt_split_on_store.ll
index d645becb19583..f35a07c8d7176 100644
--- a/polly/test/ScopInfo/stmt_split_on_store.ll
+++ b/polly/test/ScopInfo/stmt_split_on_store.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=store -polly-print-instructions '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=store -polly-print-instructions '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 
 ; void func(int *A, int *B){
 ; for (int i = 0; i < 1024; i+=1) {
diff --git a/polly/test/ScopInfo/stmt_split_on_synthesizable.ll b/polly/test/ScopInfo/stmt_split_on_synthesizable.ll
index 1a1ccff4f02d6..41721867f1764 100644
--- a/polly/test/ScopInfo/stmt_split_on_synthesizable.ll
+++ b/polly/test/ScopInfo/stmt_split_on_synthesizable.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-print-instructions '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-print-instructions '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; CHECK: Statements {
 ; CHECK-NEXT: Stmt_Stmt
diff --git a/polly/test/ScopInfo/stmt_split_phi_in_beginning_bb.ll b/polly/test/ScopInfo/stmt_split_phi_in_beginning_bb.ll
index 594b36279d6bc..0521525e272b3 100644
--- a/polly/test/ScopInfo/stmt_split_phi_in_beginning_bb.ll
+++ b/polly/test/ScopInfo/stmt_split_phi_in_beginning_bb.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-print-instructions '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-print-instructions '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; CHECK: Statements {
 ; CHECK-NEXT: Stmt_Stmt
diff --git a/polly/test/ScopInfo/stmt_split_phi_in_stmt.ll b/polly/test/ScopInfo/stmt_split_phi_in_stmt.ll
index 6c9f1c2cb5fd0..82a85aa5f0099 100644
--- a/polly/test/ScopInfo/stmt_split_phi_in_stmt.ll
+++ b/polly/test/ScopInfo/stmt_split_phi_in_stmt.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-print-instructions '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-print-instructions '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; CHECK: Statements {
 ; CHECK-NEXT: Stmt_Stmt
diff --git a/polly/test/ScopInfo/stmt_split_scalar_dependence.ll b/polly/test/ScopInfo/stmt_split_scalar_dependence.ll
index 07abe46ac0399..1f21c0ce7225f 100644
--- a/polly/test/ScopInfo/stmt_split_scalar_dependence.ll
+++ b/polly/test/ScopInfo/stmt_split_scalar_dependence.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-print-instructions '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-print-instructions '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; CHECK: Statements {
 ; CHECK-NEXT: Stmt_Stmt
diff --git a/polly/test/ScopInfo/stmt_split_within_loop.ll b/polly/test/ScopInfo/stmt_split_within_loop.ll
index 9a42ae3a37270..580ffab567846 100644
--- a/polly/test/ScopInfo/stmt_split_within_loop.ll
+++ b/polly/test/ScopInfo/stmt_split_within_loop.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-print-instructions '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-print-instructions '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; CHECK: Statements {
 ; CHECK-NEXT: Stmt_Stmt
diff --git a/polly/test/ScopInfo/stmt_with_read_but_without_sideffect.ll b/polly/test/ScopInfo/stmt_with_read_but_without_sideffect.ll
index ba4801d9a0006..67e8f631312ea 100644
--- a/polly/test/ScopInfo/stmt_with_read_but_without_sideffect.ll
+++ b/polly/test/ScopInfo/stmt_with_read_but_without_sideffect.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-delicm>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<delicm>' -polly-print-delicm -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; The statement Stmt_for_if_else_1 should be removed because it has no
 ; sideeffects. But it has a use of MemRef_tmp21 that must also be
diff --git a/polly/test/ScopInfo/switch-1.ll b/polly/test/ScopInfo/switch-1.ll
index 0c3610185e6e0..0f9e83210661b 100644
--- a/polly/test/ScopInfo/switch-1.ll
+++ b/polly/test/ScopInfo/switch-1.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=AST
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s 2>&1 | FileCheck %s --check-prefix=AST
 ;
 ; void f(int *A, int N) {
 ; for (int i = 0; i < N; i++)
diff --git a/polly/test/ScopInfo/switch-2.ll b/polly/test/ScopInfo/switch-2.ll
index f0056da37955d..9defd41f25231 100644
--- a/polly/test/ScopInfo/switch-2.ll
+++ b/polly/test/ScopInfo/switch-2.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=AST
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s 2>&1 | FileCheck %s --check-prefix=AST
 ;
 ; void f(int *A, int N) {
 ; for (int i = 0; i < N; i++)
diff --git a/polly/test/ScopInfo/switch-3.ll b/polly/test/ScopInfo/switch-3.ll
index a1810bf6ef538..faaa4d0254db9 100644
--- a/polly/test/ScopInfo/switch-3.ll
+++ b/polly/test/ScopInfo/switch-3.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=AST
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s 2>&1 | FileCheck %s --check-prefix=AST
 ;
 ; void f(int *A, int N) {
 ; for (int i = 0; i < N; i++)
diff --git a/polly/test/ScopInfo/switch-4.ll b/polly/test/ScopInfo/switch-4.ll
index 00665fd75cbcd..c82e703a82965 100644
--- a/polly/test/ScopInfo/switch-4.ll
+++ b/polly/test/ScopInfo/switch-4.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=AST
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s 2>&1 | FileCheck %s --check-prefix=AST
 ;
 ; void f(int *A, int N) {
 ; for (int i = 0; i < N; i++)
diff --git a/polly/test/ScopInfo/switch-5.ll b/polly/test/ScopInfo/switch-5.ll
index 2de3695649404..5a49be8d80975 100644
--- a/polly/test/ScopInfo/switch-5.ll
+++ b/polly/test/ScopInfo/switch-5.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=AST
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s 2>&1 | FileCheck %s --check-prefix=AST
 ;
 ; The SCoP contains a loop with multiple exit blocks (BBs after leaving
 ; the loop). The current implementation of deriving their domain derives
diff --git a/polly/test/ScopInfo/switch-6.ll b/polly/test/ScopInfo/switch-6.ll
index b859840ee111f..379981b167039 100644
--- a/polly/test/ScopInfo/switch-6.ll
+++ b/polly/test/ScopInfo/switch-6.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=AST
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s 2>&1 | FileCheck %s --check-prefix=AST
 ;
 ; void f(int *A, int N) {
 ; for (int i = 0; i < N; i++) {
diff --git a/polly/test/ScopInfo/switch-7.ll b/polly/test/ScopInfo/switch-7.ll
index f73d97f70b28d..0c8efc590b9c9 100644
--- a/polly/test/ScopInfo/switch-7.ll
+++ b/polly/test/ScopInfo/switch-7.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=AST
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s 2>&1 | FileCheck %s --check-prefix=AST
 ;
 ; void f(int *A, int c, int N) {
 ; switch (c) {
diff --git a/polly/test/ScopInfo/tempscop-printing.ll b/polly/test/ScopInfo/tempscop-printing.ll
index 4f02176569b73..09cc95e42a584 100644
--- a/polly/test/ScopInfo/tempscop-printing.ll
+++ b/polly/test/ScopInfo/tempscop-printing.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-invariant-load-hoisting=true '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-invariant-load-hoisting=true '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 
 ; void f(long A[], int N, int *init_ptr) {
 ; long i, j;
diff --git a/polly/test/ScopInfo/test-wrapping-in-condition.ll b/polly/test/ScopInfo/test-wrapping-in-condition.ll
index 746350422d6b9..d64bdf985c1d2 100644
--- a/polly/test/ScopInfo/test-wrapping-in-condition.ll
+++ b/polly/test/ScopInfo/test-wrapping-in-condition.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; CHECK: Invalid Context:
 ; CHECK: [N] -> { : N >= 129 }
diff --git a/polly/test/ScopInfo/truncate-1.ll b/polly/test/ScopInfo/truncate-1.ll
index 44222c88dfa77..d531dd8e5ab08 100644
--- a/polly/test/ScopInfo/truncate-1.ll
+++ b/polly/test/ScopInfo/truncate-1.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; void f(char *A, short N) {
 ; for (char i = 0; i < (char)N; i++)
diff --git a/polly/test/ScopInfo/truncate-2.ll b/polly/test/ScopInfo/truncate-2.ll
index c78a5337fdeba..3f5d1faf4c377 100644
--- a/polly/test/ScopInfo/truncate-2.ll
+++ b/polly/test/ScopInfo/truncate-2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; void f(char *A, short N) {
 ; for (short i = 0; i < N; i++)
diff --git a/polly/test/ScopInfo/truncate-3.ll b/polly/test/ScopInfo/truncate-3.ll
index 5a80a873cd476..d20f375b9a2bd 100644
--- a/polly/test/ScopInfo/truncate-3.ll
+++ b/polly/test/ScopInfo/truncate-3.ll
@@ -1,5 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -pass-remarks-analysis="polly-scops" \
-; RUN: -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -pass-remarks-analysis=polly-scops -disable-output < %s 2>&1 | FileCheck %s
 
 ; CHECK: Signed-unsigned restriction: [p] -> { : p <= -129 or p >= 128 }
 
diff --git a/polly/test/ScopInfo/two-loops-one-infinite.ll b/polly/test/ScopInfo/two-loops-one-infinite.ll
index e2723a8a9a2e9..aa2be1003adcc 100644
--- a/polly/test/ScopInfo/two-loops-one-infinite.ll
+++ b/polly/test/ScopInfo/two-loops-one-infinite.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-detect -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; Verify we do not create a SCoP in the presence of infinite loops.
 ;
diff --git a/polly/test/ScopInfo/two-loops-right-after-each-other.ll b/polly/test/ScopInfo/two-loops-right-after-each-other.ll
index 51f3c2d6eb875..163642d9072e2 100644
--- a/polly/test/ScopInfo/two-loops-right-after-each-other.ll
+++ b/polly/test/ScopInfo/two-loops-right-after-each-other.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 
 ; CHECK: Statements {
 ; CHECK-NEXT: Stmt_loop_1
diff --git a/polly/test/ScopInfo/undef_in_cond.ll b/polly/test/ScopInfo/undef_in_cond.ll
index ef117612f6cb3..5fb08f82b3267 100644
--- a/polly/test/ScopInfo/undef_in_cond.ll
+++ b/polly/test/ScopInfo/undef_in_cond.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-detect -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
 define fastcc void @fix_operands() nounwind {
diff --git a/polly/test/ScopInfo/unnamed_nonaffine.ll b/polly/test/ScopInfo/unnamed_nonaffine.ll
index 5b9f980591777..11418499702df 100644
--- a/polly/test/ScopInfo/unnamed_nonaffine.ll
+++ b/polly/test/ScopInfo/unnamed_nonaffine.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly -polly-allow-nonaffine -polly-use-llvm-names=true '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly -polly-allow-nonaffine -polly-use-llvm-names=false '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s -check-prefix=UNNAMED
+; RUN: opt %loadNPMPolly -polly-allow-nonaffine -polly-use-llvm-names=true '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-allow-nonaffine -polly-use-llvm-names=false '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s -check-prefix=UNNAMED
 ;
 ; void f(int *A, int b) {
 ; int x;
diff --git a/polly/test/ScopInfo/unnamed_stmts.ll b/polly/test/ScopInfo/unnamed_stmts.ll
index 163170ce74895..e23b3ae5404b5 100644
--- a/polly/test/ScopInfo/unnamed_stmts.ll
+++ b/polly/test/ScopInfo/unnamed_stmts.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 
 ; This test case verifies that we generate numbered statement names in case
 ; no LLVM-IR names are used in the test case. We also verify, that we
diff --git a/polly/test/ScopInfo/unpredictable_nonscop_loop.ll b/polly/test/ScopInfo/unpredictable_nonscop_loop.ll
index daa1f8c783870..5bc136658ccab 100644
--- a/polly/test/ScopInfo/unpredictable_nonscop_loop.ll
+++ b/polly/test/ScopInfo/unpredictable_nonscop_loop.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s -match-full-lines
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s -match-full-lines
 ; Derived from test-suite/MultiSource/Applications/sgefa/blas.c
 ;
 ; The exit value of %i.0320 in land.rhs is not computable.
diff --git a/polly/test/ScopInfo/unprofitable_scalar-accs.ll b/polly/test/ScopInfo/unprofitable_scalar-accs.ll
index ca8daa4de01a6..3f6bb937ded1a 100644
--- a/polly/test/ScopInfo/unprofitable_scalar-accs.ll
+++ b/polly/test/ScopInfo/unprofitable_scalar-accs.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-process-unprofitable=false -polly-unprofitable-scalar-accs=false '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-process-unprofitable=false -polly-unprofitable-scalar-accs=true '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=HEURISTIC
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-process-unprofitable=false -polly-unprofitable-scalar-accs=false '-passes=polly-custom<scops>' -polly-print-detect -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-process-unprofitable=false -polly-unprofitable-scalar-accs=true '-passes=polly-custom<scops>' -polly-print-detect -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s --check-prefix=HEURISTIC
 
 ; Check the effect of -polly-unprofitable-scalar-accs
 
diff --git a/polly/test/ScopInfo/unsigned-condition.ll b/polly/test/ScopInfo/unsigned-condition.ll
index 0529ded1f6cfb..608b6d6e50a36 100644
--- a/polly/test/ScopInfo/unsigned-condition.ll
+++ b/polly/test/ScopInfo/unsigned-condition.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 
 ; void f(int a[], int N, unsigned P) {
 ; int i;
diff --git a/polly/test/ScopInfo/unsigned-division-1.ll b/polly/test/ScopInfo/unsigned-division-1.ll
index 1c06b55300b67..58d39dc239ac9 100644
--- a/polly/test/ScopInfo/unsigned-division-1.ll
+++ b/polly/test/ScopInfo/unsigned-division-1.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; void f(int *A, unsigned N) {
 ; for (unsigned i = 0; i < N / 2; i++)
diff --git a/polly/test/ScopInfo/unsigned-division-2.ll b/polly/test/ScopInfo/unsigned-division-2.ll
index 153639c42b384..cda666d6f5ebf 100644
--- a/polly/test/ScopInfo/unsigned-division-2.ll
+++ b/polly/test/ScopInfo/unsigned-division-2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; void f(int *A, unsigned N) {
 ; for (unsigned i = 0; i < N / 2 + 3; i++)
diff --git a/polly/test/ScopInfo/unsigned-division-3.ll b/polly/test/ScopInfo/unsigned-division-3.ll
index 34561fc4645cc..50de3c59892e7 100644
--- a/polly/test/ScopInfo/unsigned-division-3.ll
+++ b/polly/test/ScopInfo/unsigned-division-3.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; void f(int *A, unsigned char N) {
 ; for (unsigned i = 0; i <= N / -128; i++)
diff --git a/polly/test/ScopInfo/unsigned-division-4.ll b/polly/test/ScopInfo/unsigned-division-4.ll
index be539b47123bc..4dd75e526407d 100644
--- a/polly/test/ScopInfo/unsigned-division-4.ll
+++ b/polly/test/ScopInfo/unsigned-division-4.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; void f(int *A, unsigned char N) {
 ; for (unsigned i = 0; i < (N / -128) + 3; i++)
diff --git a/polly/test/ScopInfo/unsigned-division-5.ll b/polly/test/ScopInfo/unsigned-division-5.ll
index 61716ecec0d90..fff131292271a 100644
--- a/polly/test/ScopInfo/unsigned-division-5.ll
+++ b/polly/test/ScopInfo/unsigned-division-5.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-invariant-load-hoisting=true '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-invariant-load-hoisting=true '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; void f(int *A, unsigned N) {
 ; for (unsigned i = 0; i < N; i++)
diff --git a/polly/test/ScopInfo/unsigned_wrap_uge.ll b/polly/test/ScopInfo/unsigned_wrap_uge.ll
index d25a9576e863a..f54b9bec6e7df 100644
--- a/polly/test/ScopInfo/unsigned_wrap_uge.ll
+++ b/polly/test/ScopInfo/unsigned_wrap_uge.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; Unsigned wrap-around check.
 ;
diff --git a/polly/test/ScopInfo/unsigned_wrap_ugt.ll b/polly/test/ScopInfo/unsigned_wrap_ugt.ll
index 0310fdde6d26e..20afd17f86793 100644
--- a/polly/test/ScopInfo/unsigned_wrap_ugt.ll
+++ b/polly/test/ScopInfo/unsigned_wrap_ugt.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; Unsigned wrap-around check.
 ;
diff --git a/polly/test/ScopInfo/unsigned_wrap_ule.ll b/polly/test/ScopInfo/unsigned_wrap_ule.ll
index 47bfc6065b1a8..6fa6cc12990a3 100644
--- a/polly/test/ScopInfo/unsigned_wrap_ule.ll
+++ b/polly/test/ScopInfo/unsigned_wrap_ule.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; Unsigned wrap-around check.
 ;
diff --git a/polly/test/ScopInfo/unsigned_wrap_ult.ll b/polly/test/ScopInfo/unsigned_wrap_ult.ll
index 1b73c0d6dd7ee..4a3b604d81f0f 100644
--- a/polly/test/ScopInfo/unsigned_wrap_ult.ll
+++ b/polly/test/ScopInfo/unsigned_wrap_ult.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; Unsigned wrap-around check.
 ;
diff --git a/polly/test/ScopInfo/user_context.ll b/polly/test/ScopInfo/user_context.ll
index 74088120e4015..ce8dd921cec16 100644
--- a/polly/test/ScopInfo/user_context.ll
+++ b/polly/test/ScopInfo/user_context.ll
@@ -1,7 +1,7 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly -polly-context='[N] -> {: N = 1024}' '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=CTX
-; RUN: opt %loadNPMPolly -polly-context='[N,M] -> {: 1 = 0}' '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly -polly-context='[] -> {: 1 = 0}' '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-polly-context=[N] -> {: N = 1024}' '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s --check-prefix=CTX
+; RUN: opt %loadNPMPolly '-polly-context=[N,M] -> {: 1 = 0}' '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-polly-context=[] -> {: 1 = 0}' '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 
 ; void f(int a[], int N) {
 ; int i;
diff --git a/polly/test/ScopInfo/user_provided_assumptions-in-bb-signed-conditional.ll b/polly/test/ScopInfo/user_provided_assumptions-in-bb-signed-conditional.ll
index bd13ba8bb6961..c35ed9060e504 100644
--- a/polly/test/ScopInfo/user_provided_assumptions-in-bb-signed-conditional.ll
+++ b/polly/test/ScopInfo/user_provided_assumptions-in-bb-signed-conditional.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly -pass-remarks-analysis="polly-scops" '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=REMARK
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -pass-remarks-analysis=polly-scops '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s --check-prefix=REMARK
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; REMARK: remark: <unknown>:0:0: Use user assumption: [n, b] -> { : n <= 100 or (b = 0 and n >= 101) }
 ;
diff --git a/polly/test/ScopInfo/user_provided_assumptions-in-bb-signed.ll b/polly/test/ScopInfo/user_provided_assumptions-in-bb-signed.ll
index 45f59170942ed..2afe99fd2c53b 100644
--- a/polly/test/ScopInfo/user_provided_assumptions-in-bb-signed.ll
+++ b/polly/test/ScopInfo/user_provided_assumptions-in-bb-signed.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; CHECK: Context:
 ; CHECK-NEXT: [n] -> { : -9223372036854775808 <= n <= 100 }
diff --git a/polly/test/ScopInfo/user_provided_assumptions-in-bb-unsigned.ll b/polly/test/ScopInfo/user_provided_assumptions-in-bb-unsigned.ll
index fb71c75aa75e4..3479558062671 100644
--- a/polly/test/ScopInfo/user_provided_assumptions-in-bb-unsigned.ll
+++ b/polly/test/ScopInfo/user_provided_assumptions-in-bb-unsigned.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly -pass-remarks-analysis="polly-scops" '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=REMARK
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -pass-remarks-analysis=polly-scops '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s --check-prefix=REMARK
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; REMARK: remark: <unknown>:0:0: SCoP begins here.
 ; REMARK-NEXT: remark: <unknown>:0:0: Use user assumption: [n] -> { : n <= 100 }
diff --git a/polly/test/ScopInfo/user_provided_assumptions.ll b/polly/test/ScopInfo/user_provided_assumptions.ll
index 49b23b1e784dc..0bd99ea3fcb35 100644
--- a/polly/test/ScopInfo/user_provided_assumptions.ll
+++ b/polly/test/ScopInfo/user_provided_assumptions.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly -pass-remarks-analysis="polly-scops" '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=SCOP
+; RUN: opt %loadNPMPolly -pass-remarks-analysis=polly-scops '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s --check-prefix=SCOP
 ;
 ; CHECK: remark: <unknown>:0:0: SCoP begins here.
; CHECK-NEXT: remark: <unknown>:0:0: Use user assumption: [M, N] -> { : N <= 2147483647 - M } diff --git a/polly/test/ScopInfo/user_provided_assumptions_2.ll b/polly/test/ScopInfo/user_provided_assumptions_2.ll index f8643b68cc63f..1499ab98f7369 100644 --- a/polly/test/ScopInfo/user_provided_assumptions_2.ll +++ b/polly/test/ScopInfo/user_provided_assumptions_2.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly -pass-remarks-analysis="polly-scops" '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=SCOP +; RUN: opt %loadNPMPolly -pass-remarks-analysis=polly-scops '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s --check-prefix=SCOP ; ; CHECK: remark: <unknown>:0:0: SCoP begins here. ; CHECK-NEXT: remark: <unknown>:0:0: Use user assumption: { : } diff --git a/polly/test/ScopInfo/user_provided_assumptions_3.ll b/polly/test/ScopInfo/user_provided_assumptions_3.ll index 70f8f359e16cd..aa1f72dddde9d 100644 --- a/polly/test/ScopInfo/user_provided_assumptions_3.ll +++ b/polly/test/ScopInfo/user_provided_assumptions_3.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly -pass-remarks-analysis="polly-scops" '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=SCOP +; RUN: opt %loadNPMPolly -pass-remarks-analysis=polly-scops '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s --check-prefix=SCOP ; ; CHECK: remark: <unknown>:0:0: SCoP begins here. ; CHECK-NEXT: remark: <unknown>:0:0: Use user assumption: [N] -> { : N >= 2 } diff --git a/polly/test/ScopInfo/user_provided_non_dominating_assumptions.ll b/polly/test/ScopInfo/user_provided_non_dominating_assumptions.ll index 3e7883db48fcb..a6eed5df2063e 100644 --- a/polly/test/ScopInfo/user_provided_non_dominating_assumptions.ll +++ b/polly/test/ScopInfo/user_provided_non_dominating_assumptions.ll @@ -1,5 +1,4 @@ -; RUN: opt %loadNPMPolly -pass-remarks-analysis="polly-scops" '-passes=print<polly-function-scops>' \ -; RUN: -polly-precise-inbounds -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -pass-remarks-analysis=polly-scops '-passes=polly-custom<scops>' -polly-print-scops -polly-precise-inbounds -disable-output < %s 2>&1 | FileCheck %s ; ; CHECK: remark: <unknown>:0:0: SCoP begins here. 
; CHECK-NEXT: remark: <unknown>:0:0: Use user assumption: [i, N, M] -> { : N <= i or (N > i and N >= 0) } @@ -18,8 +17,7 @@ ; -; RUN: opt %loadNPMPolly -pass-remarks-analysis="polly-scops" '-passes=print<polly-function-scops>' \ -; RUN: -polly-precise-inbounds -disable-output < %s 2>&1 -pass-remarks-output=%t.yaml +; RUN: opt %loadNPMPolly -pass-remarks-analysis=polly-scops '-passes=polly-custom<scops>' -polly-print-scops -polly-precise-inbounds -disable-output -pass-remarks-output=%t.yaml < %s 2>&1 ; RUN: cat %t.yaml | FileCheck -check-prefix=YAML %s ; YAML: --- !Analysis ; YAML: Pass: polly-scops diff --git a/polly/test/ScopInfo/variant_base_pointer.ll b/polly/test/ScopInfo/variant_base_pointer.ll index 32cb114fab05a..36beaf5f0f016 100644 --- a/polly/test/ScopInfo/variant_base_pointer.ll +++ b/polly/test/ScopInfo/variant_base_pointer.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly -polly-ignore-aliasing -polly-invariant-load-hoisting=true '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s -; RUN: opt %loadNPMPolly -polly-ignore-aliasing -polly-invariant-load-hoisting=true -passes=polly-codegen -disable-output < %s +; RUN: opt %loadNPMPolly -polly-ignore-aliasing -polly-invariant-load-hoisting=true '-passes=polly-custom<scops>' -polly-print-detect -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-ignore-aliasing -polly-invariant-load-hoisting=true '-passes=polly<no-default-opts>' -disable-output < %s ; ; %tmp is added to the list of required hoists by -polly-scops and just ; assumed to be hoisted. Only -polly-scops recognizes it to be unhoistable diff --git a/polly/test/ScopInfo/variant_load_empty_domain.ll b/polly/test/ScopInfo/variant_load_empty_domain.ll index 6a28bd0405fdd..5602c443b25d3 100644 --- a/polly/test/ScopInfo/variant_load_empty_domain.ll +++ b/polly/test/ScopInfo/variant_load_empty_domain.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; ; CHECK: Invariant Accesses: { ; CHECK-NEXT: } diff --git a/polly/test/ScopInfo/wraping_signed_expr_0.ll b/polly/test/ScopInfo/wraping_signed_expr_0.ll index f5f06bfd7d336..3a663f57c2774 100644 --- a/polly/test/ScopInfo/wraping_signed_expr_0.ll +++ b/polly/test/ScopInfo/wraping_signed_expr_0.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; ; void f(int *A, char N, char p) { ; for (char i = 0; i < N; i++) { diff --git a/polly/test/ScopInfo/wraping_signed_expr_1.ll b/polly/test/ScopInfo/wraping_signed_expr_1.ll index e04257acc2010..8963e86bc6157 100644 --- a/polly/test/ScopInfo/wraping_signed_expr_1.ll +++ b/polly/test/ScopInfo/wraping_signed_expr_1.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; ; void f(long *A, long N, long p) { ; for (long i = 0; i < N; i++) diff --git a/polly/test/ScopInfo/wraping_signed_expr_2.ll b/polly/test/ScopInfo/wraping_signed_expr_2.ll index 2511c0d646086..97cb2c05b16a0 100644 --- a/polly/test/ScopInfo/wraping_signed_expr_2.ll +++ 
b/polly/test/ScopInfo/wraping_signed_expr_2.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; ; void f(int *A, int N, int p) { ; for (int i = 0; i < N; i++) diff --git a/polly/test/ScopInfo/wraping_signed_expr_3.ll b/polly/test/ScopInfo/wraping_signed_expr_3.ll index 2106bdf4c0686..50e2eda2ce574 100644 --- a/polly/test/ScopInfo/wraping_signed_expr_3.ll +++ b/polly/test/ScopInfo/wraping_signed_expr_3.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; ; void f(int *A, int N, int p) { ; for (int i = 0; i < N; i++) diff --git a/polly/test/ScopInfo/wraping_signed_expr_4.ll b/polly/test/ScopInfo/wraping_signed_expr_4.ll index 3ea17f6e266bf..4ddb43a01bf24 100644 --- a/polly/test/ScopInfo/wraping_signed_expr_4.ll +++ b/polly/test/ScopInfo/wraping_signed_expr_4.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; ; void f(char *A, char N, char p) { ; for (char i = 0; i < N; i++) diff --git a/polly/test/ScopInfo/wraping_signed_expr_5.ll b/polly/test/ScopInfo/wraping_signed_expr_5.ll index 90706a3d3bc46..440d32bab72a5 100644 --- a/polly/test/ScopInfo/wraping_signed_expr_5.ll +++ b/polly/test/ScopInfo/wraping_signed_expr_5.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; ; We should not generate runtime check for ((int)r1 + (int)r2) as it is known not ; to overflow. However (p + q) can, thus checks are needed. 
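A note on the no-wrap reasoning quoted above (illustrative IR, not taken from the test): operands sign-extended from a narrower type are bounded, so their sum cannot wrap the wider type, while a sum of two arbitrary i32 parameters has no such bound.

  %r1.ext = sext i8 %r1 to i32          ; value in [-128, 127]
  %r2.ext = sext i8 %r2 to i32          ; value in [-128, 127]
  %safe = add i32 %r1.ext, %r2.ext      ; sum in [-256, 254], cannot wrap i32
  %unsafe = add i32 %p, %q              ; unbounded operands, needs a runtime check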
diff --git a/polly/test/ScopInfo/wraping_signed_expr_6.ll b/polly/test/ScopInfo/wraping_signed_expr_6.ll index 9cf67fc101805..7bec9533440fb 100644 --- a/polly/test/ScopInfo/wraping_signed_expr_6.ll +++ b/polly/test/ScopInfo/wraping_signed_expr_6.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; ; CHECK: Invalid Context: ; CHECK: [N] -> { : N >= 129 } diff --git a/polly/test/ScopInfo/wraping_signed_expr_7.ll b/polly/test/ScopInfo/wraping_signed_expr_7.ll index d18d2b2df3e12..2d836e191f858 100644 --- a/polly/test/ScopInfo/wraping_signed_expr_7.ll +++ b/polly/test/ScopInfo/wraping_signed_expr_7.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; ; CHECK: Invalid Context: ; CHECK: [N] -> { : N >= 129 } diff --git a/polly/test/ScopInfo/wraping_signed_expr_slow_1.ll b/polly/test/ScopInfo/wraping_signed_expr_slow_1.ll index 84626861bd39b..4964a123d0be1 100644 --- a/polly/test/ScopInfo/wraping_signed_expr_slow_1.ll +++ b/polly/test/ScopInfo/wraping_signed_expr_slow_1.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; ; This checks that the no-wraps checks will be computed fast as some example ; already showed huge slowdowns even though the inbounds and nsw flags were diff --git a/polly/test/ScopInfo/wraping_signed_expr_slow_2.ll b/polly/test/ScopInfo/wraping_signed_expr_slow_2.ll index b4dd567bafa6b..a6db7c06d072c 100644 --- a/polly/test/ScopInfo/wraping_signed_expr_slow_2.ll +++ b/polly/test/ScopInfo/wraping_signed_expr_slow_2.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; ; This checks that the no-wraps checks will be computed fast as some example ; already showed huge slowdowns even though the inbounds and nsw flags were diff --git a/polly/test/ScopInfo/zero_ext_of_truncate.ll b/polly/test/ScopInfo/zero_ext_of_truncate.ll index cbe4af05169f8..b509951bbf0d5 100644 --- a/polly/test/ScopInfo/zero_ext_of_truncate.ll +++ b/polly/test/ScopInfo/zero_ext_of_truncate.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-invariant-load-hoisting=true '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-invariant-load-hoisting=true '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; ; void f(unsigned *restrict I, unsigned *restrict A, unsigned N, unsigned M) { ; for (unsigned i = 0; i < N; i++) { diff --git a/polly/test/ScopInfo/zero_ext_of_truncate_2.ll b/polly/test/ScopInfo/zero_ext_of_truncate_2.ll index b306045276765..ea3356e01cc9f 100644 --- a/polly/test/ScopInfo/zero_ext_of_truncate_2.ll +++ b/polly/test/ScopInfo/zero_ext_of_truncate_2.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-invariant-load-hoisting=true '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly 
-polly-invariant-load-hoisting=true '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; ; void f(unsigned long *restrict I, unsigned *restrict A, unsigned N) { ; for (unsigned i = 0; i < N; i++) { diff --git a/polly/test/ScopInfo/zero_ext_space_mismatch.ll b/polly/test/ScopInfo/zero_ext_space_mismatch.ll index 3c02ae295b5ba..9fd1afae4b889 100644 --- a/polly/test/ScopInfo/zero_ext_space_mismatch.ll +++ b/polly/test/ScopInfo/zero_ext_space_mismatch.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s ; ; CHECK: Assumed Context: ; CHECK-NEXT: [dim] -> { : dim > 0 } diff --git a/polly/test/ScopInliner/ignore-declares.ll b/polly/test/ScopInliner/ignore-declares.ll index 5c0cfa103f0bf..85198b728a9bb 100644 --- a/polly/test/ScopInliner/ignore-declares.ll +++ b/polly/test/ScopInliner/ignore-declares.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-detect-full-functions '-passes=cgscc(polly-inline),function(print<polly-function-scops>)' -disable-output < %s +; RUN: opt %loadNPMPolly -polly-detect-full-functions '-passes=cgscc(polly-inline),polly-custom<print-scops>' -disable-output < %s ; Check that we do not crash if there are declares. We should skip function ; declarations and not try to query for domtree. diff --git a/polly/test/ScopInliner/invariant-load-func.ll b/polly/test/ScopInliner/invariant-load-func.ll index 58c556a455fb9..6046fc0f38650 100644 --- a/polly/test/ScopInliner/invariant-load-func.ll +++ b/polly/test/ScopInliner/invariant-load-func.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-detect-full-functions -polly-invariant-load-hoisting '-passes=cgscc(polly-inline),function(print<polly-function-scops>)' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-detect-full-functions -polly-invariant-load-hoisting '-passes=cgscc(polly-inline),polly-custom<print-scops>' -disable-output < %s 2>&1 | FileCheck %s ; Check that we inline a function that requires invariant load hoisting ; correctly. diff --git a/polly/test/ScopInliner/simple-inline-loop.ll b/polly/test/ScopInliner/simple-inline-loop.ll index f12798a3d831a..77a5ddda93adc 100644 --- a/polly/test/ScopInliner/simple-inline-loop.ll +++ b/polly/test/ScopInliner/simple-inline-loop.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-detect-full-functions '-passes=cgscc(polly-inline),function(print<polly-function-scops>)' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-detect-full-functions '-passes=cgscc(polly-inline),polly-custom<print-scops>' -disable-output < %s | FileCheck %s ; Check that we get the 2 nested loops by inlining `to_be_inlined` into ; `inline_site`. 
diff --git a/polly/test/Simplify/coalesce_3partials.ll b/polly/test/Simplify/coalesce_3partials.ll index 4112787e51bfa..5411b6e430c66 100644 --- a/polly/test/Simplify/coalesce_3partials.ll +++ b/polly/test/Simplify/coalesce_3partials.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-import-jscop,print<polly-simplify>' -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck -match-full-lines %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<import-jscop;simplify-0>' -polly-print-simplify -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck -match-full-lines %s ; ; Combine 3 partial accesses into one. ; diff --git a/polly/test/Simplify/coalesce_disjointelements.ll b/polly/test/Simplify/coalesce_disjointelements.ll index b140f287e27f7..888daeff39d8d 100644 --- a/polly/test/Simplify/coalesce_disjointelements.ll +++ b/polly/test/Simplify/coalesce_disjointelements.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-import-jscop,print<polly-simplify>' -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck -match-full-lines %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<import-jscop;simplify-0>' -polly-print-simplify -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck -match-full-lines %s ; ; Combine four partial stores into two. ; The stores write to the same array, but never the same element. diff --git a/polly/test/Simplify/coalesce_overlapping.ll b/polly/test/Simplify/coalesce_overlapping.ll index ee716fc12f095..f492222461b34 100644 --- a/polly/test/Simplify/coalesce_overlapping.ll +++ b/polly/test/Simplify/coalesce_overlapping.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-import-jscop,print<polly-simplify>' -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck -match-full-lines %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<import-jscop;simplify-0>' -polly-print-simplify -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck -match-full-lines %s ; ; Combine two partial stores (with overlapping domains) into one. ; diff --git a/polly/test/Simplify/coalesce_partial.ll b/polly/test/Simplify/coalesce_partial.ll index aea691f43e934..4df91d43fc46d 100644 --- a/polly/test/Simplify/coalesce_partial.ll +++ b/polly/test/Simplify/coalesce_partial.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-import-jscop,print<polly-simplify>' -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck -match-full-lines %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<import-jscop;simplify-0>' -polly-print-simplify -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck -match-full-lines %s ; ; Combine two partial stores (with disjoint domains) into one. 
; diff --git a/polly/test/Simplify/dead_access_load.ll b/polly/test/Simplify/dead_access_load.ll index 66f94795ea6e4..399c02381c890 100644 --- a/polly/test/Simplify/dead_access_load.ll +++ b/polly/test/Simplify/dead_access_load.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb "-passes=scop(print<polly-simplify>)" -disable-output -aa-pipeline=basic-aa < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<simplify>' -polly-print-simplify -disable-output -aa-pipeline=basic-aa < %s | FileCheck %s -match-full-lines ; ; Remove a dead load-instruction ; (an load whose result is not used anywhere) diff --git a/polly/test/Simplify/dead_access_phi.ll b/polly/test/Simplify/dead_access_phi.ll index fb40e4cc45b35..9344a284b311a 100644 --- a/polly/test/Simplify/dead_access_phi.ll +++ b/polly/test/Simplify/dead_access_phi.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb "-passes=scop(print<polly-simplify>)" -disable-output -aa-pipeline=basic-aa < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<simplify>' -polly-print-simplify -disable-output -aa-pipeline=basic-aa < %s | FileCheck %s -match-full-lines ; ; Remove a dead PHI write/read pair ; (accesses that are effectively not used) diff --git a/polly/test/Simplify/dead_access_value.ll b/polly/test/Simplify/dead_access_value.ll index a8ff7f28542b7..6db242c97dac0 100644 --- a/polly/test/Simplify/dead_access_value.ll +++ b/polly/test/Simplify/dead_access_value.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb "-passes=scop(print<polly-simplify>)" -disable-output -aa-pipeline=basic-aa < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<simplify>' -polly-print-simplify -disable-output -aa-pipeline=basic-aa < %s | FileCheck %s -match-full-lines ; ; Remove a dead value write/read pair ; (accesses that are effectively not used) diff --git a/polly/test/Simplify/dead_instruction.ll b/polly/test/Simplify/dead_instruction.ll index 81e55e1c7bb30..785b5ba154187 100644 --- a/polly/test/Simplify/dead_instruction.ll +++ b/polly/test/Simplify/dead_instruction.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb "-passes=scop(print<polly-simplify>)" -disable-output -aa-pipeline=basic-aa < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<simplify>' -polly-print-simplify -disable-output -aa-pipeline=basic-aa < %s | FileCheck %s -match-full-lines ; ; Remove a dead instruction ; (an instruction whose result is not used anywhere) diff --git a/polly/test/Simplify/emptyaccessdomain.ll b/polly/test/Simplify/emptyaccessdomain.ll index 9b06cec965a9d..917ae7f7d2c94 100644 --- a/polly/test/Simplify/emptyaccessdomain.ll +++ b/polly/test/Simplify/emptyaccessdomain.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-simplify>' -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;simplify-0>' -polly-print-simplify -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck %s -match-full-lines ; ; for (int j = 0; j < n; j += 1) { ; A[0] = 42.0; diff --git a/polly/test/Simplify/exit_phi_accesses-2.ll b/polly/test/Simplify/exit_phi_accesses-2.ll index 379c7e0ace0a3..d56fed4848ff3 100644 --- 
a/polly/test/Simplify/exit_phi_accesses-2.ll +++ b/polly/test/Simplify/exit_phi_accesses-2.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>,scop(print<polly-simplify>)' -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<simplify>' -polly-print-scops -polly-print-simplify -disable-output < %s | FileCheck %s ; ; The use of %sum.next by %phi counts as an escaping use. ; Don't remove the scalar write of %sum.next. diff --git a/polly/test/Simplify/func-b320a7.ll b/polly/test/Simplify/func-b320a7.ll index 5aa2caba95cfc..65aa9cd28314e 100644 --- a/polly/test/Simplify/func-b320a7.ll +++ b/polly/test/Simplify/func-b320a7.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-simplify>,polly-optree' -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly '-passes=polly-custom<optree;simplify>' -polly-print-simplify -disable-output < %s | FileCheck %s -match-full-lines ; llvm.org/PR47098 ; Use-after-free by reference to Stmt remaining in InstStmtMap after removing it has been removed by Scop::simplifyScop. diff --git a/polly/test/Simplify/gemm.ll b/polly/test/Simplify/gemm.ll index 5120de2db7677..6e3a43e0ebbad 100644 --- a/polly/test/Simplify/gemm.ll +++ b/polly/test/Simplify/gemm.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-simplify>' -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;simplify-0>' -polly-print-simplify -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck %s ; ; void gemm(float A[][1024], float B[][1024], float C[][1024]) { ; for (long i = 0; i < 1024; i++) diff --git a/polly/test/Simplify/nocoalesce_differentvalues.ll b/polly/test/Simplify/nocoalesce_differentvalues.ll index 33d04b2f96de8..cba62549227ae 100644 --- a/polly/test/Simplify/nocoalesce_differentvalues.ll +++ b/polly/test/Simplify/nocoalesce_differentvalues.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-import-jscop,print<polly-simplify>' -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck -match-full-lines %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<import-jscop;simplify-0>' -polly-print-simplify -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck -match-full-lines %s ; ; Do not combine stores that write different values. ; diff --git a/polly/test/Simplify/nocoalesce_elementmismatch.ll b/polly/test/Simplify/nocoalesce_elementmismatch.ll index 608b055e691df..b589d13779e52 100644 --- a/polly/test/Simplify/nocoalesce_elementmismatch.ll +++ b/polly/test/Simplify/nocoalesce_elementmismatch.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-import-jscop,print<polly-simplify>' -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck -match-full-lines %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<import-jscop;simplify-0>' -polly-print-simplify -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck -match-full-lines %s ; ; Do not combine stores that do not write to different elements in the ; same instance. 
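The next test adds one more hazard: a read between the two stores. A minimal fragment (hand-written, not from the test) shows why merging is refused there: the intervening load observes the first store, and a combined store would change what it reads.

  store double 1.0, ptr %A
  %v = load double, ptr %A    ; must observe the 1.0 written above
  store double 2.0, ptr %A    ; coalescing this into the first store
                              ; would make %v read 2.0 instead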
diff --git a/polly/test/Simplify/nocoalesce_readbetween.ll b/polly/test/Simplify/nocoalesce_readbetween.ll index e112b036cd778..b61ad9d8031e0 100644 --- a/polly/test/Simplify/nocoalesce_readbetween.ll +++ b/polly/test/Simplify/nocoalesce_readbetween.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-import-jscop,print<polly-simplify>' -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck -match-full-lines %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<import-jscop;simplify-0>' -polly-print-simplify -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck -match-full-lines %s ; ; Do not combine stores if there is a read between them. ; Note: The read between is unused, so will be removed by markAndSweep. diff --git a/polly/test/Simplify/nocoalesce_writebetween.ll b/polly/test/Simplify/nocoalesce_writebetween.ll index fd5eee52eaf5c..be7d159554034 100644 --- a/polly/test/Simplify/nocoalesce_writebetween.ll +++ b/polly/test/Simplify/nocoalesce_writebetween.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-import-jscop,print<polly-simplify>' -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck -match-full-lines %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<import-jscop;simplify-0>' -polly-print-simplify -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck -match-full-lines %s ; ; Do not combine stores if there is a write between them. ; diff --git a/polly/test/Simplify/notdead_region_exitphi.ll b/polly/test/Simplify/notdead_region_exitphi.ll index 42fafb446cea3..1bd9bfe10a99d 100644 --- a/polly/test/Simplify/notdead_region_exitphi.ll +++ b/polly/test/Simplify/notdead_region_exitphi.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly "-passes=scop(print<polly-simplify>)" -disable-output -aa-pipeline=basic-aa < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly '-passes=polly-custom<simplify>' -polly-print-simplify -disable-output -aa-pipeline=basic-aa < %s | FileCheck %s -match-full-lines ; ; Do not remove dependencies of a phi node in a region's exit block. ; diff --git a/polly/test/Simplify/notdead_region_innerphi.ll b/polly/test/Simplify/notdead_region_innerphi.ll index 966448c9884b2..b59d6dc60b089 100644 --- a/polly/test/Simplify/notdead_region_innerphi.ll +++ b/polly/test/Simplify/notdead_region_innerphi.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly "-passes=scop(print<polly-simplify>)" -disable-output -aa-pipeline=basic-aa < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly '-passes=polly-custom<simplify>' -polly-print-simplify -disable-output -aa-pipeline=basic-aa < %s | FileCheck %s -match-full-lines ; ; Do not remove dependencies of a phi node within a region statement (%phi). 
; diff --git a/polly/test/Simplify/notredundant_region_loop.ll b/polly/test/Simplify/notredundant_region_loop.ll index 88f6c41521739..859bd459f72d6 100644 --- a/polly/test/Simplify/notredundant_region_loop.ll +++ b/polly/test/Simplify/notredundant_region_loop.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-simplify>' -polly-import-jscop-postfix=transformed -polly-allow-nonaffine-loops -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;simplify-0>' -polly-print-simplify -polly-import-jscop-postfix=transformed -polly-allow-nonaffine-loops -disable-output < %s | FileCheck %s -match-full-lines ; ; Do not remove the store in region_entry. It can be executed multiple times ; due to being part of a non-affine loop. diff --git a/polly/test/Simplify/notredundant_region_middle.ll b/polly/test/Simplify/notredundant_region_middle.ll index 43c05436809ba..a742ea889fb1f 100644 --- a/polly/test/Simplify/notredundant_region_middle.ll +++ b/polly/test/Simplify/notredundant_region_middle.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly "-passes=scop(print<polly-simplify>)" -disable-output -aa-pipeline=basic-aa < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly '-passes=polly-custom<simplify>' -polly-print-simplify -disable-output -aa-pipeline=basic-aa < %s | FileCheck %s -match-full-lines ; ; Do not remove redundant stores in the middle of region statements. ; The store in region_true could be removed, but in practice we do try to diff --git a/polly/test/Simplify/notredundant_synthesizable_unknownit.ll b/polly/test/Simplify/notredundant_synthesizable_unknownit.ll index 8a9aec8be9e05..8542b7927f860 100644 --- a/polly/test/Simplify/notredundant_synthesizable_unknownit.ll +++ b/polly/test/Simplify/notredundant_synthesizable_unknownit.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb "-passes=scop(print<polly-simplify>)" -disable-output -aa-pipeline=basic-aa < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<simplify>' -polly-print-simplify -disable-output -aa-pipeline=basic-aa < %s | FileCheck %s -match-full-lines ; ; Do not remove the scalar value write of %i.trunc in inner.for. ; It is used by body. diff --git a/polly/test/Simplify/out-of-scop-use-in-region-entry-phi-node.ll b/polly/test/Simplify/out-of-scop-use-in-region-entry-phi-node.ll index 7218f328f9ca3..06b082c3f81fa 100644 --- a/polly/test/Simplify/out-of-scop-use-in-region-entry-phi-node.ll +++ b/polly/test/Simplify/out-of-scop-use-in-region-entry-phi-node.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>,scop(print<polly-simplify>)' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<simplify>' -polly-print-scops -polly-print-simplify -disable-output < %s 2>&1 | FileCheck %s ; ; %tmp5 must keep the Value WRITE MemoryAccess, because as an incoming value of ; %tmp4, it is an "external use". 
diff --git a/polly/test/Simplify/overwritten.ll b/polly/test/Simplify/overwritten.ll index eccdd8044d073..bc5b2dffd443d 100644 --- a/polly/test/Simplify/overwritten.ll +++ b/polly/test/Simplify/overwritten.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb "-passes=scop(print<polly-simplify>)" -disable-output -aa-pipeline=basic-aa < %s | FileCheck -match-full-lines %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<simplify>' -polly-print-simplify -disable-output -aa-pipeline=basic-aa < %s | FileCheck -match-full-lines %s ; ; Remove a store that is overwritten by another store in the same statement. ; diff --git a/polly/test/Simplify/overwritten_3phi.ll b/polly/test/Simplify/overwritten_3phi.ll index 4cee4f13d26d0..861c9acda3e9c 100644 --- a/polly/test/Simplify/overwritten_3phi.ll +++ b/polly/test/Simplify/overwritten_3phi.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-simplify>' -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck -match-full-lines %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;simplify-0>' -polly-print-simplify -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck -match-full-lines %s ; ; Remove identical writes ; (two stores in the same statement that write the same value to the same diff --git a/polly/test/Simplify/overwritten_3store.ll b/polly/test/Simplify/overwritten_3store.ll index c9f06c85dba53..cfd5a08143d60 100644 --- a/polly/test/Simplify/overwritten_3store.ll +++ b/polly/test/Simplify/overwritten_3store.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb "-passes=scop(print<polly-simplify>)" -disable-output -aa-pipeline=basic-aa < %s | FileCheck -match-full-lines %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<simplify>' -polly-print-simplify -disable-output -aa-pipeline=basic-aa < %s | FileCheck -match-full-lines %s ; ; Remove a store that is overwritten by another store in the same statement. ; Check that even multiple stores are removed. diff --git a/polly/test/Simplify/overwritten_implicit_and_explicit.ll b/polly/test/Simplify/overwritten_implicit_and_explicit.ll index b1b7635e26263..306e726e7808a 100644 --- a/polly/test/Simplify/overwritten_implicit_and_explicit.ll +++ b/polly/test/Simplify/overwritten_implicit_and_explicit.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-simplify>' -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck -match-full-lines %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;simplify-0>' -polly-print-simplify -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck -match-full-lines %s ; ; Remove a store that is overwritten by another store in the same statement. ; Check that this works even if one of the writes is a scalar MemoryKind. diff --git a/polly/test/Simplify/overwritten_loadbetween.ll b/polly/test/Simplify/overwritten_loadbetween.ll index cdca2f11531e7..170838ddb8a1a 100644 --- a/polly/test/Simplify/overwritten_loadbetween.ll +++ b/polly/test/Simplify/overwritten_loadbetween.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly "-passes=scop(print<polly-simplify>)" -disable-output -aa-pipeline=basic-aa < %s | FileCheck -match-full-lines %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<simplify>' -polly-print-simplify -disable-output -aa-pipeline=basic-aa < %s | FileCheck -match-full-lines %s ; ; Do not remove overwrites when the value is read before. 
; diff --git a/polly/test/Simplify/overwritten_scalar.ll b/polly/test/Simplify/overwritten_scalar.ll index 700adb6aed2ec..a1e7da40554d5 100644 --- a/polly/test/Simplify/overwritten_scalar.ll +++ b/polly/test/Simplify/overwritten_scalar.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-simplify>' -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck -match-full-lines %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;simplify-0>' -polly-print-simplify -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck -match-full-lines %s ; ; Remove identical writes ; (two stores in the same statement that write the same value to the same diff --git a/polly/test/Simplify/pass_existence.ll b/polly/test/Simplify/pass_existence.ll index 4d1d800b2a80b..6d9c99f9dc270 100644 --- a/polly/test/Simplify/pass_existence.ll +++ b/polly/test/Simplify/pass_existence.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -disable-output "-passes=scop(print<polly-simplify>)" < %s -aa-pipeline=basic-aa < %s | FileCheck %s +; RUN: opt %loadNPMPolly -disable-output '-passes=polly-custom<simplify>' -polly-print-simplify -aa-pipeline=basic-aa < %s < %s | FileCheck %s ; ; Simple test for the existence of the Simplify pass. ; diff --git a/polly/test/Simplify/phi_in_regionstmt.ll b/polly/test/Simplify/phi_in_regionstmt.ll index 2bb05738955a3..ba1cffee1a0df 100644 --- a/polly/test/Simplify/phi_in_regionstmt.ll +++ b/polly/test/Simplify/phi_in_regionstmt.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly "-passes=scop(print<polly-simplify>)" -disable-output -aa-pipeline=basic-aa < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly '-passes=polly-custom<simplify>' -polly-print-simplify -disable-output -aa-pipeline=basic-aa < %s | FileCheck %s -match-full-lines ; ; The PHINode %cond91.sink.sink.us.sink.6 is in the middle of a region ; statement. 
diff --git a/polly/test/Simplify/pr33323.ll b/polly/test/Simplify/pr33323.ll index 22921d5fba509..5130eb8488ca2 100644 --- a/polly/test/Simplify/pr33323.ll +++ b/polly/test/Simplify/pr33323.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-simplify>' -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;simplify-0>' -polly-print-simplify -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck %s ; ; llvm.org/PR33323 ; diff --git a/polly/test/Simplify/redundant.ll b/polly/test/Simplify/redundant.ll index 540e537460e54..f2489a74eb899 100644 --- a/polly/test/Simplify/redundant.ll +++ b/polly/test/Simplify/redundant.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly "-passes=scop(print<polly-simplify>)" -disable-output -aa-pipeline=basic-aa < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly '-passes=polly-custom<simplify>' -polly-print-simplify -disable-output -aa-pipeline=basic-aa < %s | FileCheck %s -match-full-lines ; ; Remove redundant store (a store that writes the same value already ; at the destination) diff --git a/polly/test/Simplify/redundant_differentindex.ll b/polly/test/Simplify/redundant_differentindex.ll index 5ce25836dedbd..efd20e90ae748 100644 --- a/polly/test/Simplify/redundant_differentindex.ll +++ b/polly/test/Simplify/redundant_differentindex.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly "-passes=scop(print<polly-simplify>)" -disable-output -aa-pipeline=basic-aa < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly '-passes=polly-custom<simplify>' -polly-print-simplify -disable-output -aa-pipeline=basic-aa < %s | FileCheck %s -match-full-lines ; ; A store that has a different index than the load it is storing is ; not redundant. diff --git a/polly/test/Simplify/redundant_partialwrite.ll b/polly/test/Simplify/redundant_partialwrite.ll index ac5ca907fff6f..357b63206b0f5 100644 --- a/polly/test/Simplify/redundant_partialwrite.ll +++ b/polly/test/Simplify/redundant_partialwrite.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadPolly -polly-import-jscop-postfix=transformed -polly-print-import-jscop -polly-print-simplify -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly -polly-import-jscop-postfix=transformed '-passes=polly-custom<import-jscop;simplify>' -polly-print-import-jscop -polly-print-simplify -disable-output < %s | FileCheck %s -match-full-lines ; ; Remove a redundant store, if its partial domain is a subset of the ; read's domain. diff --git a/polly/test/Simplify/redundant_region.ll b/polly/test/Simplify/redundant_region.ll index 927aac6c4af05..c60d28b7039dd 100644 --- a/polly/test/Simplify/redundant_region.ll +++ b/polly/test/Simplify/redundant_region.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-simplify>' -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;simplify-0>' -polly-print-simplify -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck %s -match-full-lines ; ; Remove redundant store (a store that writes the same value already ; at the destination) in a region. 
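For orientation, the core pattern behind these redundant-store tests, as a minimal hand-written fragment (hypothetical, not one of the tests above): a store that writes back the value just loaded from the same location changes nothing, so simplify can drop it.

  %val = load double, ptr %A
  store double %val, ptr %A   ; stores what is already there: redundant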
diff --git a/polly/test/Simplify/redundant_region_scalar.ll b/polly/test/Simplify/redundant_region_scalar.ll index 72d570d46bdce..3de50c04b614f 100644 --- a/polly/test/Simplify/redundant_region_scalar.ll +++ b/polly/test/Simplify/redundant_region_scalar.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-simplify>' -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;simplify-0>' -polly-print-simplify -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck %s -match-full-lines ; ; Remove redundant store (a store that writes the same value already ; at the destination) in a region. diff --git a/polly/test/Simplify/redundant_scalarwrite.ll b/polly/test/Simplify/redundant_scalarwrite.ll index 84cb971be11fd..13ca40f8e1b87 100644 --- a/polly/test/Simplify/redundant_scalarwrite.ll +++ b/polly/test/Simplify/redundant_scalarwrite.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-simplify>' -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;simplify-0>' -polly-print-simplify -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck %s -match-full-lines ; ; Remove redundant scalar stores. ; diff --git a/polly/test/Simplify/redundant_storebetween.ll b/polly/test/Simplify/redundant_storebetween.ll index 6540d7751e469..47d9cfde2d3ce 100644 --- a/polly/test/Simplify/redundant_storebetween.ll +++ b/polly/test/Simplify/redundant_storebetween.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly "-passes=scop(print<polly-simplify>)" -disable-output -aa-pipeline=basic-aa < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly '-passes=polly-custom<simplify>' -polly-print-simplify -disable-output -aa-pipeline=basic-aa < %s | FileCheck %s -match-full-lines ; ; Don't remove store where there is another store to the same target ; in-between them. diff --git a/polly/test/Simplify/scalability1.ll b/polly/test/Simplify/scalability1.ll index c6e36f9dcdefb..969aade275af2 100644 --- a/polly/test/Simplify/scalability1.ll +++ b/polly/test/Simplify/scalability1.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-ignore-inbounds '-passes=print<polly-simplify>' -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly -polly-ignore-inbounds '-passes=polly-custom<simplify>' -polly-print-simplify -disable-output < %s | FileCheck %s -match-full-lines ; ; Test scalability. ; diff --git a/polly/test/Simplify/scalability2.ll b/polly/test/Simplify/scalability2.ll index adcf9eef348a9..7951094867f2f 100644 --- a/polly/test/Simplify/scalability2.ll +++ b/polly/test/Simplify/scalability2.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-ignore-inbounds '-passes=print<polly-simplify>' -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly -polly-ignore-inbounds '-passes=polly-custom<simplify>' -polly-print-simplify -disable-output < %s | FileCheck %s -match-full-lines ; ; Test scalability. 
; diff --git a/polly/test/Simplify/sweep_mapped_phi.ll b/polly/test/Simplify/sweep_mapped_phi.ll index 495d77a22f618..ad41f2566e2b5 100644 --- a/polly/test/Simplify/sweep_mapped_phi.ll +++ b/polly/test/Simplify/sweep_mapped_phi.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-simplify>' -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;simplify-0>' -polly-print-simplify -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck %s -match-full-lines ; ; Map %phi to A[j], so the scalar write in Stmt_for_bodyA can be removed. ; diff --git a/polly/test/Simplify/sweep_mapped_value.ll b/polly/test/Simplify/sweep_mapped_value.ll index c83941a8f0ba5..a50c013ac7917 100644 --- a/polly/test/Simplify/sweep_mapped_value.ll +++ b/polly/test/Simplify/sweep_mapped_value.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-simplify>' -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;simplify-0>' -polly-print-simplify -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck %s -match-full-lines ; ; Map %val to A[j], so the scalar write on Stmt_for_bodyB can be removed. ; diff --git a/polly/test/Simplify/ununsed_read_in_region_entry.ll b/polly/test/Simplify/ununsed_read_in_region_entry.ll index f2436c263a96a..4c05de975fdf8 100644 --- a/polly/test/Simplify/ununsed_read_in_region_entry.ll +++ b/polly/test/Simplify/ununsed_read_in_region_entry.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-simplify>' -disable-output< %s | FileCheck %s -match-full-lines -; RUN: opt %loadNPMPolly '-passes=polly-simplify,polly-codegen' -S < %s | FileCheck %s -check-prefix=CODEGEN +; RUN: opt %loadNPMPolly '-passes=polly-custom<simplify>' -polly-print-simplify -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts;simplify>' -S < %s | FileCheck %s -check-prefix=CODEGEN ; ; for (int i = 0; i < n; i+=1) { ; (void)A[0]; diff --git a/polly/test/Support/Plugins.ll b/polly/test/Support/Plugins.ll index 872a32fad4fed..b75dd872ad404 100644 --- a/polly/test/Support/Plugins.ll +++ b/polly/test/Support/Plugins.ll @@ -1,5 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-prepare,scop(print<polly-ast>)' -S < %s \ -; RUN: | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<prepare;ast>' -polly-print-ast -S < %s | FileCheck %s ; This testcase tests plugin registration. Check-lines below serve to verify ; that the passes actually ran. 
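Stepping back from the individual hunks, the pattern applied throughout these test updates (inferred from the diffs themselves, not from separate documentation of the new syntax) is: printing-pass instantiations such as '-passes=print<polly-function-scops>' or '-passes=scop(print<polly-simplify>)' become a single '-passes=polly-custom<...>' pipeline that names the phases to run, semicolon-separated where there are several (e.g. 'import-jscop;simplify-0'), while the textual dump is requested separately via a '-polly-print-*' flag. Schematically:

  ; old: a dedicated printing pass produces the output
  ; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
  ; new: run the phase, ask for the dump with a flag
  ; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s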
diff --git a/polly/test/Support/exportjson.ll b/polly/test/Support/exportjson.ll index 22cfea23534cb..6bdf5a4c33cf3 100644 --- a/polly/test/Support/exportjson.ll +++ b/polly/test/Support/exportjson.ll @@ -1,6 +1,6 @@ ; RUN: rm -rf %t ; RUN: mkdir -p %t -; RUN: opt %loadNPMPolly -polly-import-jscop-dir=%t -polly -O2 -polly-export -S < %s +; RUN: opt %loadNPMPolly -polly-import-jscop-dir=%t '-passes=polly-custom<export-jscop>' -disable-output < %s ; RUN: FileCheck %s -input-file %t/exportjson___%entry.split---%return.jscop ; ; for (int j = 0; j < n; j += 1) { @@ -9,28 +9,22 @@ ; define void @exportjson(i32 %n, ptr noalias nonnull %A) { entry: - br label %for + br label %entry.split -for: - %j = phi i32 [0, %entry], [%j.inc, %inc] - %j.cmp = icmp slt i32 %j, %n - br i1 %j.cmp, label %body, label %exit +entry.split: + %j.cmp1 = icmp sgt i32 %n, 0 + br i1 %j.cmp1, label %body.lr.ph, label %return - body: - store double 42.0, ptr %A - br label %inc - -inc: - %j.inc = add nuw nsw i32 %j, 1 - br label %for - -exit: +body.lr.ph: + store double 4.200000e+01, ptr %A, align 8 br label %return return: ret void } +attributes #0 = { mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: write) } + ; CHECK: { ; CHECK-NEXT: "arrays": [ diff --git a/polly/test/Support/isl-args.ll b/polly/test/Support/isl-args.ll index 206cb73bfc5ab..6c8b2e97682e8 100644 --- a/polly/test/Support/isl-args.ll +++ b/polly/test/Support/isl-args.ll @@ -1,7 +1,7 @@ -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output -polly-isl-arg=-V < %s | FileCheck %s -match-full-lines --check-prefix=VERSION -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output -polly-isl-arg=-h < %s | FileCheck %s -match-full-lines --check-prefix=HELP -; RUN: not opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output -polly-isl-arg=-asdf < %s 2>&1| FileCheck %s -match-full-lines --check-prefix=UNKNOWN -; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output -polly-isl-arg=--schedule-algorithm=feautrier < %s +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output -polly-isl-arg=-V < %s | FileCheck %s -match-full-lines --check-prefix=VERSION +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output -polly-isl-arg=-h < %s | FileCheck %s -match-full-lines --check-prefix=HELP +; RUN: not opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output -polly-isl-arg=-asdf < %s 2>&1 | FileCheck %s -match-full-lines --check-prefix=UNKNOWN +; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output -polly-isl-arg=--schedule-algorithm=feautrier < %s ; VERSION: isl-{{.*}}-IMath-32 ; HELP: Usage: -polly-isl-arg [OPTION...] 
diff --git a/polly/test/Support/pipelineposition.ll b/polly/test/Support/pipelineposition.ll
index a4506ba1d64ed..1ddfb5879ce16 100644
--- a/polly/test/Support/pipelineposition.ll
+++ b/polly/test/Support/pipelineposition.ll
@@ -1,8 +1,6 @@
-; RUN: opt %loadNPMPolly -O3 -polly -polly-position=early -disable-output -debug-only=polly-scops < %s 2>&1 | FileCheck %s --check-prefix=NOINLINE
-; RUN: opt %loadNPMPolly -O3 -polly -polly-position=early -polly-run-inliner -disable-output -debug-only=polly-scops < %s 2>&1 | FileCheck %s --check-prefix=INLINED1
-; RUN: opt %loadNPMPolly -O3 -polly -polly-position=before-vectorizer -disable-output -debug-only=polly-scops < %s 2>&1 | FileCheck %s --check-prefix=INLINED3
-;
-; REQUIRES: asserts
+; RUN: opt %loadNPMPolly -O3 -polly -polly-position=early -disable-output -polly-print-scops < %s 2>&1 | FileCheck %s --check-prefix=NOINLINE
+; RUN: opt %loadNPMPolly -O3 -polly -polly-position=early -polly-run-inliner -disable-output -polly-print-scops < %s 2>&1 | FileCheck %s --check-prefix=INLINED1
+; RUN: opt %loadNPMPolly -O3 -polly -polly-position=before-vectorizer -disable-output -polly-print-scops < %s 2>&1 | FileCheck %s --check-prefix=INLINED3
 ;
 ; void callee(int n, double A[], int i) {
 ;   for (int j = 0; j < n; j += 1)
diff --git a/polly/test/lit.site.cfg.in b/polly/test/lit.site.cfg.in
index f22063e796def..ca901b8825ced 100644
--- a/polly/test/lit.site.cfg.in
+++ b/polly/test/lit.site.cfg.in
@@ -38,14 +38,10 @@ if config.llvm_polly_link_into_tools == '' or \
    config.llvm_polly_link_into_tools.lower() == 'false' or \
    config.llvm_polly_link_into_tools.lower() == 'notfound' or \
    config.llvm_polly_link_into_tools.lower() == 'llvm_polly_link_into_tools-notfound':
-    config.substitutions.append(('%loadPolly', '-load '
-                                + config.polly_lib_dir + '/LLVMPolly@LLVM_SHLIBEXT@'
-                                + commonOpts ))
     config.substitutions.append(('%loadNPMPolly', '-load-pass-plugin '
                                 + config.polly_lib_dir + '/LLVMPolly@LLVM_SHLIBEXT@'
                                 + commonOpts ))
 else:
-    config.substitutions.append(('%loadPolly', commonOpts ))
     config.substitutions.append(('%loadNPMPolly', commonOpts ))
 
 import lit.llvm
diff --git a/polly/test/polly.ll b/polly/test/polly.ll
index 2e455b39a9cd4..0f5467b0e654d 100644
--- a/polly/test/polly.ll
+++ b/polly/test/polly.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -S < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -S < %s 2>&1 | FileCheck %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
 define void @foo() nounwind {
 start:

From cf11ff9084ca703a2e32e396696007cd0799086f Mon Sep 17 00:00:00 2001
From: Mircea Trofin <mtrofin@google.com>
Date: Mon, 3 Nov 2025 14:35:01 -0800
Subject: [PATCH 094/313] Exclude another StructurizeCFG test from profcheck
 (#166248)

Haven't yet addressed this pass
---
 llvm/utils/profcheck-xfail.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/llvm/utils/profcheck-xfail.txt b/llvm/utils/profcheck-xfail.txt
index 61bc936cd151a..d7af3a7ecbdee 100644
--- a/llvm/utils/profcheck-xfail.txt
+++ b/llvm/utils/profcheck-xfail.txt
@@ -1297,6 +1297,7 @@ Transforms/SimpleLoopUnswitch/trivial-unswitch.ll
 Transforms/SimpleLoopUnswitch/trivial-unswitch-logical-and-or.ll
 Transforms/StackProtector/cross-dso-cfi-stack-chk-fail.ll
 Transforms/StructurizeCFG/AMDGPU/uniform-regions.ll
+Transforms/StructurizeCFG/callbr.ll
 Transforms/StructurizeCFG/hoist-zerocost.ll
 Transforms/StructurizeCFG/loop-break-phi.ll
 Transforms/StructurizeCFG/nested-loop-order.ll

From 01221874e41e0cba9161fda3601d10aa36537512 Mon Sep 17 00:00:00 2001
From: Mircea Trofin <mtrofin@google.com>
Date: Mon, 3 Nov 2025 14:37:41 -0800
Subject: [PATCH 095/313] [SLU][profcheck] Use the original branch weights in
 `buildPartialInvariantUnswitchConditionalBranch` (#164270)

A new branch is created on the same condition as a branch for which we
have a profile. We can reuse that profile in this case.

Issue #147390
---
 .../Transforms/Scalar/SimpleLoopUnswitch.cpp  | 19 ++++-
 .../SimpleLoopUnswitch/partial-unswitch.ll    | 75 +++++++++++--------
 2 files changed, 58 insertions(+), 36 deletions(-)

diff --git a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
index bb6c879f4d47e..239526e85e1fd 100644
--- a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
+++ b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
@@ -337,7 +337,7 @@ static void buildPartialUnswitchConditionalBranch(
 static void buildPartialInvariantUnswitchConditionalBranch(
     BasicBlock &BB, ArrayRef<Value *> ToDuplicate, bool Direction,
     BasicBlock &UnswitchedSucc, BasicBlock &NormalSucc, Loop &L,
-    MemorySSAUpdater *MSSAU) {
+    MemorySSAUpdater *MSSAU, const BranchInst &OriginalBranch) {
   ValueToValueMapTy VMap;
   for (auto *Val : reverse(ToDuplicate)) {
     Instruction *Inst = cast<Instruction>(Val);
@@ -377,8 +377,19 @@ static void buildPartialInvariantUnswitchConditionalBranch(
   IRBuilder<> IRB(&BB);
   IRB.SetCurrentDebugLocation(DebugLoc::getCompilerGenerated());
   Value *Cond = VMap[ToDuplicate[0]];
-  IRB.CreateCondBr(Cond, Direction ? &UnswitchedSucc : &NormalSucc,
-                   Direction ? &NormalSucc : &UnswitchedSucc);
+  // The expectation is that ToDuplicate[0] is the condition used by the
+  // OriginalBranch, in which case we can clone the profile metadata from there.
+  auto *ProfData =
+      !ProfcheckDisableMetadataFixes &&
+              ToDuplicate[0] == skipTrivialSelect(OriginalBranch.getCondition())
+          ? OriginalBranch.getMetadata(LLVMContext::MD_prof)
+          : nullptr;
+  auto *BR =
+      IRB.CreateCondBr(Cond, Direction ? &UnswitchedSucc : &NormalSucc,
+                       Direction ? &NormalSucc : &UnswitchedSucc, ProfData);
+  if (!ProfData)
+    setExplicitlyUnknownBranchWeightsIfProfiled(*BR, *BR->getFunction(),
+                                                DEBUG_TYPE);
 }
 
 /// Rewrite the PHI nodes in an unswitched loop exit basic block.
@@ -2515,7 +2526,7 @@ static void unswitchNontrivialInvariants(
   // the branch in the split block.
if (PartiallyInvariant) buildPartialInvariantUnswitchConditionalBranch( - *SplitBB, Invariants, Direction, *ClonedPH, *LoopPH, L, MSSAU); + *SplitBB, Invariants, Direction, *ClonedPH, *LoopPH, L, MSSAU, *BI); else { buildPartialUnswitchConditionalBranch( *SplitBB, Invariants, Direction, *ClonedPH, *LoopPH, diff --git a/llvm/test/Transforms/SimpleLoopUnswitch/partial-unswitch.ll b/llvm/test/Transforms/SimpleLoopUnswitch/partial-unswitch.ll index 1d8942079ffd8..87161707d9f69 100644 --- a/llvm/test/Transforms/SimpleLoopUnswitch/partial-unswitch.ll +++ b/llvm/test/Transforms/SimpleLoopUnswitch/partial-unswitch.ll @@ -1,14 +1,14 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals ; RUN: opt -passes='loop-mssa(simple-loop-unswitch<nontrivial>),verify<loops>' -S < %s | FileCheck %s declare void @clobber() -define i32 @partial_unswitch_true_successor(ptr %ptr, i32 %N) { +define i32 @partial_unswitch_true_successor(ptr %ptr, i32 %N) !prof !0 { ; CHECK-LABEL: @partial_unswitch_true_successor( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[PTR:%.*]], align 4 ; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 [[TMP0]], 100 -; CHECK-NEXT: br i1 [[TMP1]], label [[ENTRY_SPLIT_US:%.*]], label [[ENTRY_SPLIT:%.*]] +; CHECK-NEXT: br i1 [[TMP1]], label [[ENTRY_SPLIT_US:%.*]], label [[ENTRY_SPLIT:%.*]], !prof [[PROF1:![0-9]+]] ; CHECK: entry.split.us: ; CHECK-NEXT: br label [[LOOP_HEADER_US:%.*]] ; CHECK: loop.header.us: @@ -19,7 +19,7 @@ define i32 @partial_unswitch_true_successor(ptr %ptr, i32 %N) { ; CHECK: loop.latch.us: ; CHECK-NEXT: [[C_US:%.*]] = icmp ult i32 [[IV_US]], [[N:%.*]] ; CHECK-NEXT: [[IV_NEXT_US]] = add i32 [[IV_US]], 1 -; CHECK-NEXT: br i1 [[C_US]], label [[LOOP_HEADER_US]], label [[EXIT_SPLIT_US:%.*]] +; CHECK-NEXT: br i1 [[C_US]], label [[LOOP_HEADER_US]], label [[EXIT_SPLIT_US:%.*]], !prof [[PROF2:![0-9]+]] ; CHECK: exit.split.us: ; CHECK-NEXT: br label [[EXIT:%.*]] ; CHECK: entry.split: @@ -28,7 +28,7 @@ define i32 @partial_unswitch_true_successor(ptr %ptr, i32 %N) { ; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY_SPLIT]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ] ; CHECK-NEXT: [[LV:%.*]] = load i32, ptr [[PTR]], align 4 ; CHECK-NEXT: [[SC:%.*]] = icmp eq i32 [[LV]], 100 -; CHECK-NEXT: br i1 [[SC]], label [[NOCLOBBER:%.*]], label [[CLOBBER:%.*]] +; CHECK-NEXT: br i1 [[SC]], label [[NOCLOBBER:%.*]], label [[CLOBBER:%.*]], !prof [[PROF1]] ; CHECK: noclobber: ; CHECK-NEXT: br label [[LOOP_LATCH]] ; CHECK: clobber: @@ -37,7 +37,7 @@ define i32 @partial_unswitch_true_successor(ptr %ptr, i32 %N) { ; CHECK: loop.latch: ; CHECK-NEXT: [[C:%.*]] = icmp ult i32 [[IV]], [[N]] ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 -; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !prof [[PROF2]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: exit.split: ; CHECK-NEXT: br label [[EXIT]] ; CHECK: exit: @@ -50,7 +50,7 @@ loop.header: %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.latch ] %lv = load i32, ptr %ptr %sc = icmp eq i32 %lv, 100 - br i1 %sc, label %noclobber, label %clobber + br i1 %sc, label %noclobber, label %clobber, !prof !1 noclobber: br label %loop.latch @@ -62,7 +62,7 @@ clobber: loop.latch: %c = icmp ult i32 %iv, %N %iv.next = add i32 %iv, 1 - br i1 %c, label %loop.header, label %exit + br i1 %c, label %loop.header, label %exit, !prof !2 
exit: ret i32 10 @@ -102,7 +102,7 @@ define i32 @partial_unswitch_false_successor(ptr %ptr, i32 %N) { ; CHECK: loop.latch: ; CHECK-NEXT: [[C:%.*]] = icmp ult i32 [[IV]], [[N]] ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 -; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP2:![0-9]+]] +; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: exit.split: ; CHECK-NEXT: br label [[EXIT]] ; CHECK: exit: @@ -171,7 +171,7 @@ define i32 @partial_unswtich_gep_load_icmp(ptr %ptr, i32 %N) { ; CHECK: loop.latch: ; CHECK-NEXT: [[C:%.*]] = icmp ult i32 [[IV]], [[N]] ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 -; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: exit.split: ; CHECK-NEXT: br label [[EXIT]] ; CHECK: exit: @@ -246,7 +246,7 @@ define i32 @partial_unswitch_reduction_phi(ptr %ptr, i32 %N) { ; CHECK-NEXT: [[RED_NEXT]] = phi i32 [ [[ADD_5]], [[CLOBBER]] ], [ [[ADD_10]], [[NOCLOBBER]] ] ; CHECK-NEXT: [[C:%.*]] = icmp ult i32 [[IV]], [[N]] ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 -; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK: exit.split: ; CHECK-NEXT: [[RED_NEXT_LCSSA:%.*]] = phi i32 [ [[RED_NEXT]], [[LOOP_LATCH]] ] ; CHECK-NEXT: br label [[EXIT]] @@ -325,7 +325,7 @@ define i32 @partial_unswitch_true_successor_noclobber(ptr noalias %ptr.1, ptr no ; CHECK: loop.latch: ; CHECK-NEXT: [[C:%.*]] = icmp ult i32 [[IV]], [[N]] ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 -; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: exit.split: ; CHECK-NEXT: br label [[EXIT]] ; CHECK: exit: @@ -637,7 +637,7 @@ define i32 @partial_unswitch_true_successor_preheader_insertion(ptr %ptr, i32 %N ; CHECK: loop.latch: ; CHECK-NEXT: [[C:%.*]] = icmp ult i32 [[IV]], [[N]] ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 -; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_LOOPEXIT_SPLIT:%.*]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_LOOPEXIT_SPLIT:%.*]], !llvm.loop [[LOOP9:![0-9]+]] ; CHECK: exit.loopexit.split: ; CHECK-NEXT: br label [[EXIT_LOOPEXIT]] ; CHECK: exit.loopexit: @@ -713,7 +713,7 @@ define i32 @partial_unswitch_true_successor_insert_point(ptr %ptr, i32 %N) { ; CHECK: loop.latch: ; CHECK-NEXT: [[C:%.*]] = icmp ult i32 [[IV]], [[N]] ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 -; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK: exit.split: ; CHECK-NEXT: br label [[EXIT]] ; CHECK: exit: @@ -784,7 +784,7 @@ define i32 @partial_unswitch_true_successor_hoist_invariant(ptr %ptr, i32 %N) { ; CHECK: loop.latch: ; CHECK-NEXT: [[C:%.*]] = icmp ult i32 [[IV]], [[N]] ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 -; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], 
!llvm.loop [[LOOP11:![0-9]+]] ; CHECK: exit.split: ; CHECK-NEXT: br label [[EXIT]] ; CHECK: exit: @@ -1073,7 +1073,7 @@ define i32 @partial_unswitch_true_to_latch(ptr %ptr, i32 %N) { ; CHECK: loop.latch: ; CHECK-NEXT: [[C:%.*]] = icmp ult i32 [[IV]], [[N]] ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 -; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP12:![0-9]+]] ; CHECK: exit.split: ; CHECK-NEXT: br label [[EXIT]] ; CHECK: exit: @@ -1138,7 +1138,7 @@ define i32 @partial_unswitch_exiting_block_with_multiple_unswitch_candidates(i32 ; CHECK-NEXT: store i32 [[TMP1:%.*]], ptr [[PTR]], align 16 ; CHECK-NEXT: br label [[EXITING]] ; CHECK: exiting: -; CHECK-NEXT: br i1 [[EXIT_COND]], label [[LOOP]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK-NEXT: br i1 [[EXIT_COND]], label [[LOOP]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP13:![0-9]+]] ; CHECK: exit.split: ; CHECK-NEXT: [[RET_VAL:%.*]] = phi i32 [ 1, [[EXITING]] ] ; CHECK-NEXT: br label [[EXIT]] @@ -1249,7 +1249,7 @@ define i32 @partial_unswitch_true_successor_for_cost_calculation(ptr %ptr, i32 % ; CHECK: loop.latch: ; CHECK-NEXT: [[C:%.*]] = icmp ult i32 [[IV]], [[N]] ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 -; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP14:![0-9]+]] ; CHECK: exit.split: ; CHECK-NEXT: br label [[EXIT]] ; CHECK: exit: @@ -1360,7 +1360,7 @@ define i32 @partial_unswitch_true_successor_trunc(ptr %ptr, i32 %N) { ; CHECK: loop.latch: ; CHECK-NEXT: [[C:%.*]] = icmp ult i32 [[IV]], [[N]] ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 -; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP15:![0-9]+]] ; CHECK: exit.split: ; CHECK-NEXT: br label [[EXIT]] ; CHECK: exit: @@ -1425,7 +1425,7 @@ define i32 @partial_unswitch_false_successor_trunc(ptr %ptr, i32 %N) { ; CHECK: loop.latch: ; CHECK-NEXT: [[C:%.*]] = icmp ult i32 [[IV]], [[N]] ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 -; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP13:![0-9]+]] +; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP16:![0-9]+]] ; CHECK: exit.split: ; CHECK-NEXT: br label [[EXIT]] ; CHECK: exit: @@ -1456,15 +1456,26 @@ exit: ret i32 10 } -; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[UNSWITCH_PARTIAL_DISABLE:![0-9]+]]} -; CHECK: [[UNSWITCH_PARTIAL_DISABLE]] = !{!"llvm.loop.unswitch.partial.disable"} -; CHECK: [[LOOP2]] = distinct !{[[LOOP2]], [[UNSWITCH_PARTIAL_DISABLE]]} -; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[UNSWITCH_PARTIAL_DISABLE]]} -; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[UNSWITCH_PARTIAL_DISABLE]]} -; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[UNSWITCH_PARTIAL_DISABLE]]} -; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[UNSWITCH_PARTIAL_DISABLE]]} -; CHECK: [[LOOP7]] = distinct !{[[LOOP7]], [[UNSWITCH_PARTIAL_DISABLE]]} -; CHECK: [[LOOP8]] = distinct !{[[LOOP8]], [[UNSWITCH_PARTIAL_DISABLE]]} -; CHECK: [[LOOP9]] = distinct !{[[LOOP9]], [[UNSWITCH_PARTIAL_DISABLE]]} -; CHECK: [[LOOP10]] = distinct !{[[LOOP10]], [[UNSWITCH_PARTIAL_DISABLE]]} -; CHECK: [[LOOP11]] = distinct !{[[LOOP11]], 
[[UNSWITCH_PARTIAL_DISABLE]]} +!0 = !{!"function_entry_count", i32 10} +!1 = !{!"branch_weights", i32 1000, i32 1} +!2 = !{!"branch_weights", i32 100, i32 3} + +;. +; CHECK: [[META0:![0-9]+]] = !{!"function_entry_count", i32 10} +; CHECK: [[PROF1]] = !{!"branch_weights", i32 1000, i32 1} +; CHECK: [[PROF2]] = !{!"branch_weights", i32 100, i32 3} +; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META4:![0-9]+]]} +; CHECK: [[META4]] = !{!"llvm.loop.unswitch.partial.disable"} +; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META4]]} +; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META4]]} +; CHECK: [[LOOP7]] = distinct !{[[LOOP7]], [[META4]]} +; CHECK: [[LOOP8]] = distinct !{[[LOOP8]], [[META4]]} +; CHECK: [[LOOP9]] = distinct !{[[LOOP9]], [[META4]]} +; CHECK: [[LOOP10]] = distinct !{[[LOOP10]], [[META4]]} +; CHECK: [[LOOP11]] = distinct !{[[LOOP11]], [[META4]]} +; CHECK: [[LOOP12]] = distinct !{[[LOOP12]], [[META4]]} +; CHECK: [[LOOP13]] = distinct !{[[LOOP13]], [[META4]]} +; CHECK: [[LOOP14]] = distinct !{[[LOOP14]], [[META4]]} +; CHECK: [[LOOP15]] = distinct !{[[LOOP15]], [[META4]]} +; CHECK: [[LOOP16]] = distinct !{[[LOOP16]], [[META4]]} +;. From 148a42bdd2f252b4366b79fc518356bb06cacac3 Mon Sep 17 00:00:00 2001 From: Andrew Haberlandt <ndrewh@users.noreply.github.com> Date: Mon, 3 Nov 2025 14:40:01 -0800 Subject: [PATCH 096/313] [sanitizer-common] [Darwin] Provide warnings for common sandbox issues (#165907) We currently do not handle errors in task_set_exc_guard_behavior. If this fails, mmap can unexpectedly crash. We also do not currently provide a clear warning if no external symbolizers are found. rdar://163798535 --- compiler-rt/lib/sanitizer_common/sanitizer_mac.cpp | 12 +++++++++++- .../sanitizer_symbolizer_posix_libcdep.cpp | 7 +++++++ 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_mac.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_mac.cpp index b0a29db908639..90c0b66f81b5b 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_mac.cpp +++ b/compiler-rt/lib/sanitizer_common/sanitizer_mac.cpp @@ -960,7 +960,17 @@ static void DisableMmapExcGuardExceptions() { RTLD_DEFAULT, "task_set_exc_guard_behavior"); if (set_behavior == nullptr) return; const task_exc_guard_behavior_t task_exc_guard_none = 0; - set_behavior(mach_task_self(), task_exc_guard_none); + kern_return_t res = set_behavior(mach_task_self(), task_exc_guard_none); + if (res != KERN_SUCCESS) { + Report( + "WARN: task_set_exc_guard_behavior returned %d (%s), " + "mmap may fail unexpectedly.\n", + res, mach_error_string(res)); + if (res == KERN_DENIED) + Report( + "HINT: Check that task_set_exc_guard_behavior is allowed by " + "sandbox.\n"); + } } static void VerifyInterceptorsWorking(); diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_symbolizer_posix_libcdep.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_symbolizer_posix_libcdep.cpp index f8d821e125b7a..7eb0c9756d64a 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_symbolizer_posix_libcdep.cpp +++ b/compiler-rt/lib/sanitizer_common/sanitizer_symbolizer_posix_libcdep.cpp @@ -505,6 +505,13 @@ static void ChooseSymbolizerTools(IntrusiveList<SymbolizerTool> *list, } # if SANITIZER_APPLE + if (list->empty()) { + Report( + "WARN: No external symbolizers found. Symbols may be missing or " + "unreliable.\n"); + Report( + "HINT: Is PATH set? 
Does sandbox allow file-read of /usr/bin/atos?\n");
+  }
   VReport(2, "Using dladdr symbolizer.\n");
   list->push_back(new (*allocator) DlAddrSymbolizer());
 # endif // SANITIZER_APPLE

From c0cb5133489fa746ae5de43a17991778281fe4c9 Mon Sep 17 00:00:00 2001
From: Sam Clegg <sbc@chromium.org>
Date: Mon, 3 Nov 2025 14:42:05 -0800
Subject: [PATCH 097/313] [lld][WebAssembly] Use writePtrConst helper function
 (#166228)

This is especially important for writing i32 values larger than 2gb,
which need to be encoded as negative SLEB values in the binary. Without
this change, offsets over 2gb are wrongly encoded and cause validation
errors.

Fixes: https://github.com/emscripten-core/emscripten/issues/25706
---
 lld/test/wasm/runtime-relocations-himem.s | 60 +++++++++++++++++++++++
 lld/wasm/InputChunks.cpp                  | 14 ++----
 lld/wasm/SyntheticSections.cpp            |  8 +--
 3 files changed, 66 insertions(+), 16 deletions(-)
 create mode 100644 lld/test/wasm/runtime-relocations-himem.s

diff --git a/lld/test/wasm/runtime-relocations-himem.s b/lld/test/wasm/runtime-relocations-himem.s
new file mode 100644
index 0000000000000..a12a93a6cb933
--- /dev/null
+++ b/lld/test/wasm/runtime-relocations-himem.s
@@ -0,0 +1,60 @@
+## Verifies runtime relocation code for addresses over 2gb works correctly.
+## We have had issues with LEB encoding of addresses over 2gb in i32.const
+## instruction leading to invalid binaries.
+
+# RUN: llvm-mc -filetype=obj -triple=wasm32-unknown-unknown -o %t.o %s
+# RUN: wasm-ld --global-base=2147483648 --experimental-pic --unresolved-symbols=import-dynamic -no-gc-sections --shared-memory --no-entry -o %t.wasm %t.o
+# XUN: obj2yaml %t.wasm | FileCheck %s
+# RUN: llvm-objdump -d --no-show-raw-insn --no-leading-addr %t.wasm | FileCheck %s --
+
+.globl tls_sym
+.globl data_sym
+.globl _start
+.globaltype __tls_base, i32
+
+_start:
+  .functype _start () -> ()
+  global.get __tls_base
+  i32.const tls_sym@TLSREL
+  i32.add
+  drop
+  i32.const data_sym
+  drop
+  end_function
+
+.section tls_sec,"T",@
+.p2align 2
+tls_sym:
+  .int32 0
+  .int32 extern_sym
+  .size tls_sym, 8
+
+.section data_sec,"",@
+.p2align 2
+data_sym:
+  .int32 0
+  .int32 extern_sym
+  .size data_sym, 8
+
+.section .custom_section.target_features,"",@
+  .int8 2
+  .int8 43
+  .int8 7
+  .ascii "atomics"
+  .int8 43
+  .int8 11
+  .ascii "bulk-memory"
+
+# CHECK: <__wasm_apply_data_relocs>:
+# CHECK-EMPTY:
+# CHECK-NEXT: i32.const -2147483636
+# CHECK-NEXT: global.get 0
+# CHECK-NEXT: i32.store 0
+# CHECK-NEXT: end
+
+# CHECK: <__wasm_apply_tls_relocs>:
+# CHECK-EMPTY:
+# CHECK-NEXT: i32.const -2147483644
+# CHECK-NEXT: global.get 0
+# CHECK-NEXT: i32.store 0
+# CHECK-NEXT: end
diff --git a/lld/wasm/InputChunks.cpp b/lld/wasm/InputChunks.cpp
index 44927e7a432bc..14e02e6009318 100644
--- a/lld/wasm/InputChunks.cpp
+++ b/lld/wasm/InputChunks.cpp
@@ -423,8 +423,6 @@ bool InputChunk::generateRelocationCode(raw_ostream &os) const {
   bool is64 = ctx.arg.is64.value_or(false);
   bool generated = false;
-  unsigned opcode_ptr_const = is64 ? WASM_OPCODE_I64_CONST
-                                   : WASM_OPCODE_I32_CONST;
   unsigned opcode_ptr_add = is64 ? WASM_OPCODE_I64_ADD
                                 : WASM_OPCODE_I32_ADD;
@@ -451,8 +449,7 @@ bool InputChunk::generateRelocationCode(raw_ostream &os) const {
            << " output offset=" << offset << "\n");
 
     // Calculate the address at which to apply the relocation
-    writeU8(os, opcode_ptr_const, "CONST");
-    writeSleb128(os, offset, "offset");
+    writePtrConst(os, offset, is64, "offset");
 
     // In PIC mode we need to add the __memory_base
     if (ctx.isPic) {
@@ -466,8 +463,6 @@ bool InputChunk::generateRelocationCode(raw_ostream &os) const {
 
     // Now figure out what we want to store at this location
     bool is64 = relocIs64(rel.Type);
-    unsigned opcode_reloc_const =
-        is64 ? WASM_OPCODE_I64_CONST : WASM_OPCODE_I32_CONST;
     unsigned opcode_reloc_add =
         is64 ? WASM_OPCODE_I64_ADD : WASM_OPCODE_I32_ADD;
     unsigned opcode_reloc_store =
@@ -477,8 +472,7 @@ bool InputChunk::generateRelocationCode(raw_ostream &os) const {
       writeU8(os, WASM_OPCODE_GLOBAL_GET, "GLOBAL_GET");
       writeUleb128(os, sym->getGOTIndex(), "global index");
       if (rel.Addend) {
-        writeU8(os, opcode_reloc_const, "CONST");
-        writeSleb128(os, rel.Addend, "addend");
+        writePtrConst(os, rel.Addend, is64, "addend");
         writeU8(os, opcode_reloc_add, "ADD");
       }
     } else {
@@ -491,8 +485,8 @@ bool InputChunk::generateRelocationCode(raw_ostream &os) const {
         baseSymbol = ctx.sym.tlsBase;
       writeU8(os, WASM_OPCODE_GLOBAL_GET, "GLOBAL_GET");
       writeUleb128(os, baseSymbol->getGlobalIndex(), "base");
-      writeU8(os, opcode_reloc_const, "CONST");
-      writeSleb128(os, file->calcNewValue(rel, tombstone, this), "offset");
+      writePtrConst(os, file->calcNewValue(rel, tombstone, this), is64,
+                    "offset");
       writeU8(os, opcode_reloc_add, "ADD");
     }
 
diff --git a/lld/wasm/SyntheticSections.cpp b/lld/wasm/SyntheticSections.cpp
index e1192706ea913..399a5084e6595 100644
--- a/lld/wasm/SyntheticSections.cpp
+++ b/lld/wasm/SyntheticSections.cpp
@@ -434,8 +434,6 @@ void GlobalSection::addInternalGOTEntry(Symbol *sym) {
 void GlobalSection::generateRelocationCode(raw_ostream &os, bool TLS) const {
   assert(!ctx.arg.extendedConst);
   bool is64 = ctx.arg.is64.value_or(false);
-  unsigned opcode_ptr_const = is64 ? WASM_OPCODE_I64_CONST
-                                   : WASM_OPCODE_I32_CONST;
   unsigned opcode_ptr_add = is64 ? WASM_OPCODE_I64_ADD
                                  : WASM_OPCODE_I32_ADD;
 
@@ -452,8 +450,7 @@ void GlobalSection::generateRelocationCode(raw_ostream &os, bool TLS) const {
       writeUleb128(os, ctx.sym.memoryBase->getGlobalIndex(), "__memory_base");
 
       // Add the virtual address of the data symbol
-      writeU8(os, opcode_ptr_const, "CONST");
-      writeSleb128(os, d->getVA(), "offset");
+      writePtrConst(os, d->getVA(), is64, "offset");
     } else if (auto *f = dyn_cast<FunctionSymbol>(sym)) {
       if (f->isStub)
         continue;
@@ -462,8 +459,7 @@ void GlobalSection::generateRelocationCode(raw_ostream &os, bool TLS) const {
       writeUleb128(os, ctx.sym.tableBase->getGlobalIndex(), "__table_base");
 
       // Add the table index to __table_base
-      writeU8(os, opcode_ptr_const, "CONST");
-      writeSleb128(os, f->getTableIndex(), "offset");
+      writePtrConst(os, f->getTableIndex(), is64, "offset");
     } else {
       assert(isa<UndefinedData>(sym) || isa<SharedData>(sym));
       continue;

From 562e3bfcd45cddc1da133780b401564471c8c66d Mon Sep 17 00:00:00 2001
From: YongKang Zhu <yongzhu@fb.com>
Date: Mon, 3 Nov 2025 14:44:05 -0800
Subject: [PATCH 098/313] [BOLT] Add an option for constant island cloning
 (#165778)

Avoiding constant island cloning helps to reduce app size, especially
for BOLT optimizations where cloning would happen when a function is
split into multiple fragments.
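
For illustration, a size-sensitive build could then opt out of cloning
roughly like this (a sketch: the binary and profile names are
placeholders; only the -clone-constant-island flag added below comes
from this patch):

  llvm-bolt app -o app.bolt -data=perf.fdata -clone-constant-island=false
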
Add an option to disable the cloning; a follow-up (#165787) will
introduce a new pass to handle the "reference too far" errors that may
result from disabling constant island cloning.
---
 bolt/lib/Core/BinaryContext.cpp | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/bolt/lib/Core/BinaryContext.cpp b/bolt/lib/Core/BinaryContext.cpp
index c7cd034a30410..7af32c8c56635 100644
--- a/bolt/lib/Core/BinaryContext.cpp
+++ b/bolt/lib/Core/BinaryContext.cpp
@@ -78,6 +78,11 @@ cl::opt<std::string> CompDirOverride(
              "to *.dwo files."),
     cl::Hidden, cl::init(""), cl::cat(BoltCategory));
 
+static cl::opt<bool> CloneConstantIsland("clone-constant-island",
+                                         cl::desc("clone constant islands"),
+                                         cl::Hidden, cl::init(true),
+                                         cl::ZeroOrMore, cl::cat(BoltCategory));
+
 static cl::opt<bool>
     FailOnInvalidPadding("fail-on-invalid-padding", cl::Hidden, cl::init(false),
                          cl::desc("treat invalid code padding as error"),
@@ -461,7 +466,8 @@ BinaryContext::handleAddressRef(uint64_t Address, BinaryFunction &BF,
   // of dynamic relocs, as we currently do not support cloning them.
   // Notice: we might fail to link because of this, if the original constant
   // island we are referring would be emitted too far away.
-  if (IslandIter->second->hasDynamicRelocationAtIsland()) {
+  if (IslandIter->second->hasDynamicRelocationAtIsland() ||
+      !opts::CloneConstantIsland) {
     MCSymbol *IslandSym =
         IslandIter->second->getOrCreateIslandAccess(Address);
     if (IslandSym)
@@ -469,6 +475,12 @@ BinaryContext::handleAddressRef(uint64_t Address, BinaryFunction &BF,
   } else if (MCSymbol *IslandSym =
                 IslandIter->second->getOrCreateProxyIslandAccess(Address, BF)) {
+    LLVM_DEBUG(
+        dbgs() << "BOLT-DEBUG: clone constant island at address 0x"
+               << Twine::utohexstr(IslandIter->first) << " with size of 0x"
+               << Twine::utohexstr(
+                      IslandIter->second->estimateConstantIslandSize())
+               << " bytes, referenced by " << BF << "\n");
     BF.createIslandDependency(IslandSym, IslandIter->second);
     return std::make_pair(IslandSym, 0);
   }

From 475c632b17a9f9f9ae9428c7621687e255710b7c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?=
 =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?=
 =?UTF-8?q?=E3=83=B3=29?= <clementval@gmail.com>
Date: Mon, 3 Nov 2025 12:47:50 -1000
Subject: [PATCH 099/313] [flang][cuda] Use local scope to avoid duplicate
 definition (#166249)

---
 flang/lib/Optimizer/Builder/IntrinsicCall.cpp | 23 +++++++++++--------
 flang/test/Lower/CUDA/cuda-device-proc.cuf    |  4 ++--
 2 files changed, 16 insertions(+), 11 deletions(-)

diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
index 6be4d9ce0a46c..2db0606d2bc9e 100644
--- a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
+++ b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
@@ -3433,13 +3433,15 @@ IntrinsicLibrary::genBarrierTryWait(mlir::Type resultType,
   builder.setInsertionPointToStart(afterBlock);
   auto llvmPtrTy = mlir::LLVM::LLVMPointerType::get(builder.getContext());
   auto barrier = builder.createConvert(loc, llvmPtrTy, args[0]);
-  mlir::Value ret =
-      mlir::NVVM::InlinePtxOp::create(
-          builder, loc, {resultType}, {barrier, args[1], ns}, {},
-          ".reg .pred p; mbarrier.try_wait.shared.b64 p, [%1], %2, %3; "
-          "selp.b32 %0, 1, 0, p;",
-          {})
-          .getResult(0);
+  mlir::Value ret = mlir::NVVM::InlinePtxOp::create(
+                        builder, loc, {resultType}, {barrier, args[1], ns}, {},
+                        "{\n"
+                        "  .reg .pred p;\n"
+                        "  mbarrier.try_wait.shared.b64 p, [%1], %2, %3;\n"
+                        "  selp.b32 %0, 1, 0, p;\n"
"}", + {}) + .getResult(0); mlir::scf::YieldOp::create(builder, loc, ret); builder.setInsertionPointAfter(whileOp); return whileOp.getResult(0); @@ -3454,8 +3456,11 @@ IntrinsicLibrary::genBarrierTryWaitSleep(mlir::Type resultType, auto barrier = builder.createConvert(loc, llvmPtrTy, args[0]); return mlir::NVVM::InlinePtxOp::create( builder, loc, {resultType}, {barrier, args[1], args[2]}, {}, - ".reg .pred p; mbarrier.try_wait.shared.b64 p, [%1], %2, %3; " - "selp.b32 %0, 1, 0, p;", + "{\n" + " .reg .pred p;\n" + " mbarrier.try_wait.shared.b64 p, [%1], %2, %3;\n" + " selp.b32 %0, 1, 0, p;\n" + "}", {}) .getResult(0); } diff --git a/flang/test/Lower/CUDA/cuda-device-proc.cuf b/flang/test/Lower/CUDA/cuda-device-proc.cuf index 674548b7489e8..ed015df263070 100644 --- a/flang/test/Lower/CUDA/cuda-device-proc.cuf +++ b/flang/test/Lower/CUDA/cuda-device-proc.cuf @@ -519,7 +519,7 @@ end subroutine ! CHECK-LABEL: func.func @_QPtest_barrier_try_wait() ! CHECK: scf.while -! CHECK: %{{.*}} = nvvm.inline_ptx ".reg .pred p; mbarrier.try_wait.shared.b64 p, [%{{.*}}], %{{.*}}, %{{.*}}; selp.b32 %{{.*}}, 1, 0, p;" ro(%{{.*}}, %{{.*}}, %c1000000{{.*}} : !llvm.ptr, i64, i32) -> i32 +! CHECK: %{{.*}} = nvvm.inline_ptx "{\0A .reg .pred p;\0A mbarrier.try_wait.shared.b64 p, [%{{.*}}], %{{.*}}, %{{.*}};\0A selp.b32 %{{.*}}, 1, 0, p;\0A}" ro(%{{.*}}, %{{.*}}, %{{.*}} : !llvm.ptr, i64, i32) -> i32 attributes(global) subroutine test_barrier_try_wait_sleep() integer :: istat @@ -530,7 +530,7 @@ attributes(global) subroutine test_barrier_try_wait_sleep() end subroutine ! CHECK-LABEL: func.func @_QPtest_barrier_try_wait_sleep() -! CHECK: %{{.*}} = nvvm.inline_ptx ".reg .pred p; mbarrier.try_wait.shared.b64 p, [%{{.*}}], %{{.*}}, %{{.*}}; selp.b32 %0, 1, 0, p;" ro(%{{.*}}, %{{.*}}, %{{.*}} : !llvm.ptr, i64, i32) -> i32 +! CHECK: %{{.*}} = nvvm.inline_ptx "{\0A .reg .pred p;\0A mbarrier.try_wait.shared.b64 p, [%{{.*}}], %{{.*}}, %{{.*}};\0A selp.b32 %{{.*}}, 1, 0, p;\0A}" ro(%{{.*}}, %{{.*}}, %{{.*}} : !llvm.ptr, i64, i32) -> i32 attributes(global) subroutine test_tma_bulk_load_c4(a, n) integer(8), shared :: barrier1 From 2a42a85f5bf8202f942a7d638c1abde6dbba0db6 Mon Sep 17 00:00:00 2001 From: Stefan Mada <smada@nvidia.com> Date: Mon, 3 Nov 2025 15:02:59 -0800 Subject: [PATCH 100/313] [MLIR][NVVM] Add support for Convert Ops with rs-rounding mode (#165736) Added NVVM dialect operations for stochastic rounding (.rs) conversions from F32 to various packed floating-point formats. These operations map to existing PTX instructions and LLVM intrinsics. Supported conversions: - F32x2 to F16x2/BF16x2 (with optional relu and satfinite modifiers) - F32x4 to packed F8 formats (E4M3, E5M2) - F32x4 to packed F6 formats (E2M3, E3M2) - F32x4 to packed F4 format (E2M1) All operations support stochastic rounding with randomness provided via an rbits parameter, and optional relu and saturation modifiers. 
--- mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td | 93 ++++++++- mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp | 132 +++++++++++++ .../invalid-convert-stochastic-rounding.mlir | 90 +++++++++ .../nvvm/convert_stochastic_rounding.mlir | 182 ++++++++++++++++++ 4 files changed, 496 insertions(+), 1 deletion(-) create mode 100644 mlir/test/Dialect/LLVMIR/nvvm/invalid-convert-stochastic-rounding.mlir create mode 100644 mlir/test/Target/LLVMIR/nvvm/convert_stochastic_rounding.mlir diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td index ba5e48e4ec9ba..46fdf5441bc13 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td @@ -1589,10 +1589,11 @@ def FPRoundingModeRM : I32EnumAttrCase<"RM", 2, "rm">; def FPRoundingModeRP : I32EnumAttrCase<"RP", 3, "rp">; def FPRoundingModeRZ : I32EnumAttrCase<"RZ", 4, "rz">; def FPRoundingModeRNA : I32EnumAttrCase<"RNA", 5, "rna">; +def FPRoundingModeRS : I32EnumAttrCase<"RS", 6, "rs">; def FPRoundingMode : I32EnumAttr<"FPRoundingMode", "NVVM FPRoundingMode kind", [FPRoundingModeNone, FPRoundingModeRN, FPRoundingModeRM, - FPRoundingModeRP, FPRoundingModeRZ, FPRoundingModeRNA]> { + FPRoundingModeRP, FPRoundingModeRZ, FPRoundingModeRNA, FPRoundingModeRS]> { let genSpecializedAttr = 0; let cppNamespace = "::mlir::NVVM"; } @@ -1906,6 +1907,96 @@ def NVVM_ConvertF6x2ToF16x2Op : def NVVM_ConvertF4x2ToF16x2Op : NVVM_ConvertToFP16x2Op_Base<"F4", I8, "F16">; +//===----------------------------------------------------------------------===// +// NVVM Stochastic Rounding Conversion Ops +//===----------------------------------------------------------------------===// + +// Base class for conversions from F32x2 to FPx2 formats +// (F16x2, BF16x2) +// TODO: In separate PR, add .rn and .rz rounding variants for this conversion +// as currently only support .rs rounding mode +class NVVM_ConvertF32x2ToFPx2OpBase<string dstFormat, string mnemonic, Type dstType> : + NVVM_Op<mnemonic, [Pure, NVVMRequiresSMa<[100, 103]>]>, + Results<(outs dstType:$dst)>, + Arguments<(ins F32:$src_hi, F32:$src_lo, I32:$rbits, + DefaultValuedAttr<FPRoundingModeAttr, "FPRoundingMode::RS">:$rnd, + DefaultValuedAttr<SaturationModeAttr, "SaturationMode::NONE">:$sat, + DefaultValuedAttr<BoolAttr, "false">:$relu)> { + let summary = "Convert two F32 values to packed " # dstFormat # " with stochastic rounding (.rs)"; + let description = [{ + Converts two F32 values to packed }] # dstFormat # [{ format using stochastic + rounding (.rs) mode with randomness provided by the `rbits` parameter. The + `relu` attribute clamps negative results to 0. The `sat` attribute determines + saturation behavior. The `src_hi` and `src_lo` parameters correspond to operands + `a` and `b` in the PTX ISA, respectively. 
+ + [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cvt) + }]; + + let assemblyFormat = "$src_hi `,` $src_lo `,` $rbits attr-dict `:` type($dst)"; + + let hasVerifier = 1; + + let extraClassDeclaration = [{ + llvm::Intrinsic::ID getIntrinsicID(); + }]; + + string llvmBuilder = [{ + auto intId = op.getIntrinsicID(); + $dst = createIntrinsicCall(builder, intId, {$src_hi, $src_lo, $rbits}); + }]; + } + +// F32x2 -> F16x2 with stochastic rounding +def NVVM_ConvertF32x2ToF16x2Op : NVVM_ConvertF32x2ToFPx2OpBase<"f16x2", "convert.f32x2.to.f16x2", VectorOfLengthAndType<[2], [F16]>>; + +// F32x2 -> BF16x2 with stochastic rounding +def NVVM_ConvertF32x2ToBF16x2Op : NVVM_ConvertF32x2ToFPx2OpBase<"bf16x2", "convert.f32x2.to.bf16x2", VectorOfLengthAndType<[2], [BF16]>>; + +// Base class for stochastic rounding conversions from F32x4 to FPx4 formats +// (E4M3x4, E5M2x4, E2M3x4, E3M2x4, E2M1x4) +// These operations always use RS (stochastic rounding) mode with SATFINITE saturation. +class NVVM_ConvertF32x4ToFPx4OpBase<string dstFormat, string mnemonic, Type dstType> : + NVVM_Op<mnemonic, [Pure, NVVMRequiresSMa<[100, 103]>]>, + Results<(outs dstType:$dst)>, + Arguments<(ins VectorOfLengthAndType<[4], [F32]>:$src, I32:$rbits, + DefaultValuedAttr<BoolAttr, "false">:$relu, + TypeAttr:$dstTy)> { + let summary = "Convert vector<4xf32> to packed " # dstFormat # " with stochastic rounding (.rs) and satfinite"; + let description = [{ + Converts a vector<4xf32> to packed }] # dstFormat # [{ format using + stochastic rounding (.rs) mode with SATFINITE saturation. Randomness is + provided by the `rbits` parameter. The `dstTy` attribute specifies the + target floating-point format. The `relu` attribute clamps negative results to 0. + + Note: These operations always use RS rounding mode and SATFINITE saturation mode. 
+ + [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cvt) + }]; + + let assemblyFormat = "$src `,` $rbits attr-dict `:` type($src) `->` type($dst) `(` $dstTy `)`"; + + let hasVerifier = 1; + + let extraClassDeclaration = [{ + llvm::Intrinsic::ID getIntrinsicID(); + }]; + + string llvmBuilder = [{ + auto intId = op.getIntrinsicID(); + $dst = createIntrinsicCall(builder, intId, {$src, $rbits}); + }]; +} + +// F32x4 -> F8x4 with stochastic rounding (supports E4M3FN, E5M2) +def NVVM_ConvertF32x4ToF8x4Op : NVVM_ConvertF32x4ToFPx4OpBase<"f8x4", "convert.f32x4.to.f8x4", VectorOfLengthAndType<[4], [I8]>>; + +// F32x4 -> F6x4 with stochastic rounding (supports E2M3FN, E3M2FN) +def NVVM_ConvertF32x4ToF6x4Op : NVVM_ConvertF32x4ToFPx4OpBase<"f6x4", "convert.f32x4.to.f6x4", VectorOfLengthAndType<[4], [I8]>>; + +// F32x4 -> F4x4 with stochastic rounding (supports E2M1FN) +def NVVM_ConvertF32x4ToF4x4Op : NVVM_ConvertF32x4ToFPx4OpBase<"f4x4", "convert.f32x4.to.f4x4", I16>; + //===----------------------------------------------------------------------===// // NVVM MMA Ops //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp index a5ffb9e77fa9d..12c81629d7e76 100644 --- a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp +++ b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp @@ -365,6 +365,59 @@ LogicalResult ConvertF4x2ToF16x2Op::verify() { return success(); } +//===----------------------------------------------------------------------===// +// Stochastic Rounding Conversion Ops +//===----------------------------------------------------------------------===// + +LogicalResult ConvertF32x2ToF16x2Op::verify() { + if (getRnd() != FPRoundingMode::RS) + return emitOpError("Only RS rounding mode is supported for " + "conversions from f32x2 to f16x2."); + return success(); +} + +LogicalResult ConvertF32x2ToBF16x2Op::verify() { + if (getRnd() != FPRoundingMode::RS) + return emitOpError("Only RS rounding mode is supported for " + "conversions from f32x2 to bf16x2."); + return success(); +} + +LogicalResult ConvertF32x4ToF8x4Op::verify() { + mlir::MLIRContext *ctx = getContext(); + + if (!llvm::isa<mlir::Float8E4M3FNType, mlir::Float8E5M2Type>(getDstTy())) + return emitOpError("Only ") + << mlir::Float8E4M3FNType::get(ctx) << " and " + << mlir::Float8E5M2Type::get(ctx) + << " types are supported for conversions from f32x4 to f8x4."; + + return success(); +} + +LogicalResult ConvertF32x4ToF6x4Op::verify() { + mlir::MLIRContext *ctx = getContext(); + + if (!llvm::isa<mlir::Float6E2M3FNType, mlir::Float6E3M2FNType>(getDstTy())) + return emitOpError("Only ") + << mlir::Float6E2M3FNType::get(ctx) << " and " + << mlir::Float6E3M2FNType::get(ctx) + << " types are supported for conversions from f32x4 to f6x4."; + + return success(); +} + +LogicalResult ConvertF32x4ToF4x4Op::verify() { + mlir::MLIRContext *ctx = getContext(); + + if (!llvm::isa<mlir::Float4E2M1FNType>(getDstTy())) + return emitOpError("Only ") << mlir::Float4E2M1FNType::get(ctx) + << " type is supported for conversions from " + "f32x4 to f4x4."; + + return success(); +} + LogicalResult BulkStoreOp::verify() { if (getInitVal() != 0) return emitOpError("only 0 is supported for initVal, got ") << getInitVal(); @@ -2469,6 +2522,85 @@ Tcgen05CommitOp::getIntrinsicIDAndArgs(Operation &op, return TCGEN05_CP_2CTA(shape_mc, , is_2cta); \ }() 
+llvm::Intrinsic::ID ConvertF32x2ToF16x2Op::getIntrinsicID() { + bool hasRelu = getRelu(); + bool hasSatFinite = (getSat() == NVVM::SaturationMode::SATFINITE); + + if (hasRelu && hasSatFinite) + return llvm::Intrinsic::nvvm_ff2f16x2_rs_relu_satfinite; + if (hasRelu) + return llvm::Intrinsic::nvvm_ff2f16x2_rs_relu; + if (hasSatFinite) + return llvm::Intrinsic::nvvm_ff2f16x2_rs_satfinite; + return llvm::Intrinsic::nvvm_ff2f16x2_rs; +} + +llvm::Intrinsic::ID ConvertF32x2ToBF16x2Op::getIntrinsicID() { + bool hasRelu = getRelu(); + bool hasSatFinite = (getSat() == NVVM::SaturationMode::SATFINITE); + + if (hasRelu && hasSatFinite) + return llvm::Intrinsic::nvvm_ff2bf16x2_rs_relu_satfinite; + if (hasRelu) + return llvm::Intrinsic::nvvm_ff2bf16x2_rs_relu; + if (hasSatFinite) + return llvm::Intrinsic::nvvm_ff2bf16x2_rs_satfinite; + return llvm::Intrinsic::nvvm_ff2bf16x2_rs; +} + +llvm::Intrinsic::ID ConvertF32x4ToF8x4Op::getIntrinsicID() { + mlir::Type dstTy = getDstTy(); + bool hasRelu = getRelu(); + + return llvm::TypeSwitch<mlir::Type, llvm::Intrinsic::ID>(dstTy) + .Case<mlir::Float8E4M3FNType>([&](mlir::Float8E4M3FNType) { + return hasRelu ? llvm::Intrinsic::nvvm_f32x4_to_e4m3x4_rs_relu_satfinite + : llvm::Intrinsic::nvvm_f32x4_to_e4m3x4_rs_satfinite; + }) + .Case<mlir::Float8E5M2Type>([&](mlir::Float8E5M2Type) { + return hasRelu ? llvm::Intrinsic::nvvm_f32x4_to_e5m2x4_rs_relu_satfinite + : llvm::Intrinsic::nvvm_f32x4_to_e5m2x4_rs_satfinite; + }) + .Default([](mlir::Type) { + llvm_unreachable("Invalid F8 type in ConvertF32x4ToF8x4Op"); + return llvm::Intrinsic::not_intrinsic; + }); +} + +llvm::Intrinsic::ID ConvertF32x4ToF6x4Op::getIntrinsicID() { + mlir::Type dstTy = getDstTy(); + bool hasRelu = getRelu(); + + return llvm::TypeSwitch<mlir::Type, llvm::Intrinsic::ID>(dstTy) + .Case<mlir::Float6E2M3FNType>([&](mlir::Float6E2M3FNType) { + return hasRelu ? llvm::Intrinsic::nvvm_f32x4_to_e2m3x4_rs_relu_satfinite + : llvm::Intrinsic::nvvm_f32x4_to_e2m3x4_rs_satfinite; + }) + .Case<mlir::Float6E3M2FNType>([&](mlir::Float6E3M2FNType) { + return hasRelu ? llvm::Intrinsic::nvvm_f32x4_to_e3m2x4_rs_relu_satfinite + : llvm::Intrinsic::nvvm_f32x4_to_e3m2x4_rs_satfinite; + }) + .Default([](mlir::Type) { + llvm_unreachable("Invalid F6 type in ConvertF32x4ToF6x4Op"); + return llvm::Intrinsic::not_intrinsic; + }); +} + +llvm::Intrinsic::ID ConvertF32x4ToF4x4Op::getIntrinsicID() { + mlir::Type dstTy = getDstTy(); + bool hasRelu = getRelu(); + + return llvm::TypeSwitch<mlir::Type, llvm::Intrinsic::ID>(dstTy) + .Case<mlir::Float4E2M1FNType>([&](mlir::Float4E2M1FNType) { + return hasRelu ? 
llvm::Intrinsic::nvvm_f32x4_to_e2m1x4_rs_relu_satfinite + : llvm::Intrinsic::nvvm_f32x4_to_e2m1x4_rs_satfinite; + }) + .Default([](mlir::Type) { + llvm_unreachable("Invalid F4 type in ConvertF32x4ToF4x4Op"); + return llvm::Intrinsic::not_intrinsic; + }); +} + llvm::Intrinsic::ID Tcgen05CpOp::getIntrinsicID(Operation &op) { auto curOp = cast<NVVM::Tcgen05CpOp>(op); bool is2CTA = curOp.getGroup() == CTAGroupKind::CTA_2; diff --git a/mlir/test/Dialect/LLVMIR/nvvm/invalid-convert-stochastic-rounding.mlir b/mlir/test/Dialect/LLVMIR/nvvm/invalid-convert-stochastic-rounding.mlir new file mode 100644 index 0000000000000..35f5e1b3c8ba2 --- /dev/null +++ b/mlir/test/Dialect/LLVMIR/nvvm/invalid-convert-stochastic-rounding.mlir @@ -0,0 +1,90 @@ +// RUN: mlir-opt %s -split-input-file -verify-diagnostics + +// Test invalid target architecture (sm_100 instead of sm_100a) +gpu.module @invalid_arch_sm_100 [#nvvm.target<chip = "sm_100">] { + func.func @convert_rs() { + %f1 = llvm.mlir.constant(1.0 : f32) : f32 + %f2 = llvm.mlir.constant(2.0 : f32) : f32 + %rbits = llvm.mlir.constant(0x12345678 : i32) : i32 + // expected-error@+1 {{'nvvm.convert.f32x2.to.f16x2' op is not supported on sm_100}} + %res = nvvm.convert.f32x2.to.f16x2 %f1, %f2, %rbits : vector<2xf16> + return + } +} + +// ----- + +// Test that operations require stochastic rounding mode +llvm.func @invalid_rnd_mode_f16x2(%srcA : f32, %srcB : f32, %rbits : i32) -> vector<2xf16> { + // expected-error@+1 {{Only RS rounding mode is supported for conversions from f32x2 to f16x2.}} + %res = nvvm.convert.f32x2.to.f16x2 %srcA, %srcB, %rbits {rnd = #nvvm.fp_rnd_mode<rn>} : vector<2xf16> + llvm.return %res : vector<2xf16> +} + +// ----- + +llvm.func @invalid_rnd_mode_bf16x2(%srcA : f32, %srcB : f32, %rbits : i32) -> vector<2xbf16> { + // expected-error@+1 {{Only RS rounding mode is supported for conversions from f32x2 to bf16x2.}} + %res = nvvm.convert.f32x2.to.bf16x2 %srcA, %srcB, %rbits {rnd = #nvvm.fp_rnd_mode<rz>} : vector<2xbf16> + llvm.return %res : vector<2xbf16> +} + +// ----- + +// Test invalid destination types for f8x4 (should only accept f8E4M3FN, f8E5M2) +llvm.func @invalid_dst_type_f8x4_e3m4(%src : vector<4xf32>, %rbits : i32) -> vector<4xi8> { + // expected-error@+1 {{Only 'f8E4M3FN' and 'f8E5M2' types are supported for conversions from f32x4 to f8x4.}} + %res = nvvm.convert.f32x4.to.f8x4 %src, %rbits : vector<4xf32> -> vector<4xi8> (f8E3M4) + llvm.return %res : vector<4xi8> +} + +// ----- + +llvm.func @invalid_dst_type_f8x4_e8m0(%src : vector<4xf32>, %rbits : i32) -> vector<4xi8> { + // expected-error@+1 {{Only 'f8E4M3FN' and 'f8E5M2' types are supported for conversions from f32x4 to f8x4.}} + %res = nvvm.convert.f32x4.to.f8x4 %src, %rbits : vector<4xf32> -> vector<4xi8> (f8E8M0FNU) + llvm.return %res : vector<4xi8> +} + +// ----- + +// Test invalid destination types for f6x4 (should only accept f6E2M3FN, f6E3M2FN) +llvm.func @invalid_dst_type_f6x4_f8(%src : vector<4xf32>, %rbits : i32) -> vector<4xi8> { + // expected-error@+1 {{Only 'f6E2M3FN' and 'f6E3M2FN' types are supported for conversions from f32x4 to f6x4.}} + %res = nvvm.convert.f32x4.to.f6x4 %src, %rbits : vector<4xf32> -> vector<4xi8> (f8E4M3FN) + llvm.return %res : vector<4xi8> +} + +// ----- + +// Test invalid destination type for f4x4 (should only accept f4E2M1FN) +llvm.func @invalid_dst_type_f4x4_f6(%src : vector<4xf32>, %rbits : i32) -> i16 { + // expected-error@+1 {{Only 'f4E2M1FN' type is supported for conversions from f32x4 to f4x4.}} + %res = nvvm.convert.f32x4.to.f4x4 
%src, %rbits : vector<4xf32> -> i16 (f6E2M3FN) + llvm.return %res : i16 +} + +// ----- + +// Test invalid rounding modes for non-stochastic ops +llvm.func @convert_float_to_tf32_rs_not_supported(%src : f32) -> i32 { + // expected-error @below {{Only {rn,rz,rna} rounding modes supported for ConvertFloatToTF32Op.}} + %res = nvvm.convert.float.to.tf32 %src {rnd = #nvvm.fp_rnd_mode<rs>} + llvm.return %res : i32 +} + +// ----- + +llvm.func @convert_f32x2_to_f8x2_rs_not_supported(%a : f32, %b : f32) { + // expected-error @below {{Only RN rounding mode is supported for conversions from f32x2 to 'f8E4M3FN' and 'f8E5M2' types}} + %res = nvvm.convert.f32x2.to.f8x2 %a, %b {rnd = #nvvm.fp_rnd_mode<rs>, sat = #nvvm.sat_mode<satfinite>} : i16 (f8E4M3FN) + llvm.return +} + +// ----- + +llvm.func @convert_bf16x2_to_f8x2_rs_not_supported(%src : vector<2xbf16>) { + // expected-error @below {{Only RZ and RP rounding modes are supported for conversions from bf16x2 to f8x2.}} + %res = nvvm.convert.bf16x2.to.f8x2 %src {rnd = #nvvm.fp_rnd_mode<rs>} : vector<2xbf16> -> i16 (f8E8M0FNU) + llvm.return +} diff --git a/mlir/test/Target/LLVMIR/nvvm/convert_stochastic_rounding.mlir b/mlir/test/Target/LLVMIR/nvvm/convert_stochastic_rounding.mlir new file mode 100644 index 0000000000000..b5bb22350dcd7 --- /dev/null +++ b/mlir/test/Target/LLVMIR/nvvm/convert_stochastic_rounding.mlir @@ -0,0 +1,182 @@ +// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s + +// ----- + +// Test valid architectures work + +// Valid case on sm_100a +gpu.module @valid_f16x2_rs_sm_100a [#nvvm.target<chip = "sm_100a">] { + func.func @convert_rs() { + %f1 = llvm.mlir.constant(1.0 : f32) : f32 + %f2 = llvm.mlir.constant(2.0 : f32) : f32 + %rbits = llvm.mlir.constant(0x12345678 : i32) : i32 + %res = nvvm.convert.f32x2.to.f16x2 %f1, %f2, %rbits : vector<2xf16> + return + } +} + +// Valid case on sm_103a +gpu.module @valid_bf16x2_rs_sm_103a [#nvvm.target<chip = "sm_103a">] { + func.func @convert_rs() { + %f1 = llvm.mlir.constant(1.0 : f32) : f32 + %f2 = llvm.mlir.constant(2.0 : f32) : f32 + %rbits = llvm.mlir.constant(0 : i32) : i32 + %res = nvvm.convert.f32x2.to.bf16x2 %f1, %f2, %rbits : vector<2xbf16> + return + } +} + +// ----- + +// Test F32x2 -> F16x2 with stochastic rounding (.rs) + +// CHECK-LABEL: @convert_f32x2_to_f16x2_rs +llvm.func @convert_f32x2_to_f16x2_rs(%srcA : f32, %srcB : f32, %rbits : i32) -> vector<2xf16> { + // CHECK: %{{.*}} = call <2 x half> @llvm.nvvm.ff2f16x2.rs(float %{{.*}}, float %{{.*}}, i32 %{{.*}}) + %res = nvvm.convert.f32x2.to.f16x2 %srcA, %srcB, %rbits : vector<2xf16> + llvm.return %res : vector<2xf16> +} + +// CHECK-LABEL: @convert_f32x2_to_f16x2_rs_satfinite +llvm.func @convert_f32x2_to_f16x2_rs_satfinite(%srcA : f32, %srcB : f32, %rbits : i32) -> vector<2xf16> { + // CHECK: %{{.*}} = call <2 x half> @llvm.nvvm.ff2f16x2.rs.satfinite(float %{{.*}}, float %{{.*}}, i32 %{{.*}}) + %res = nvvm.convert.f32x2.to.f16x2 %srcA, %srcB, %rbits {sat = #nvvm.sat_mode<satfinite>} : vector<2xf16> + llvm.return %res : vector<2xf16> +} + +// CHECK-LABEL: @convert_f32x2_to_f16x2_rs_relu +llvm.func @convert_f32x2_to_f16x2_rs_relu(%srcA : f32, %srcB : f32, %rbits : i32) -> vector<2xf16> { + // CHECK: %{{.*}} = call <2 x half> @llvm.nvvm.ff2f16x2.rs.relu(float %{{.*}}, float %{{.*}}, i32 %{{.*}}) + %res = nvvm.convert.f32x2.to.f16x2 %srcA, %srcB, %rbits {relu = true} : vector<2xf16> + llvm.return %res : vector<2xf16> +} + +// CHECK-LABEL: @convert_f32x2_to_f16x2_rs_relu_satfinite +llvm.func 
@convert_f32x2_to_f16x2_rs_relu_satfinite(%srcA : f32, %srcB : f32, %rbits : i32) -> vector<2xf16> { + // CHECK: %{{.*}} = call <2 x half> @llvm.nvvm.ff2f16x2.rs.relu.satfinite(float %{{.*}}, float %{{.*}}, i32 %{{.*}}) + %res = nvvm.convert.f32x2.to.f16x2 %srcA, %srcB, %rbits {relu = true, sat = #nvvm.sat_mode<satfinite>} : vector<2xf16> + llvm.return %res : vector<2xf16> +} + +// ----- + +// Test F32x2 -> BF16x2 with stochastic rounding (.rs) + +// CHECK-LABEL: @convert_f32x2_to_bf16x2_rs +llvm.func @convert_f32x2_to_bf16x2_rs(%srcA : f32, %srcB : f32, %rbits : i32) -> vector<2xbf16> { + // CHECK: %{{.*}} = call <2 x bfloat> @llvm.nvvm.ff2bf16x2.rs(float %{{.*}}, float %{{.*}}, i32 %{{.*}}) + %res = nvvm.convert.f32x2.to.bf16x2 %srcA, %srcB, %rbits : vector<2xbf16> + llvm.return %res : vector<2xbf16> +} + +// CHECK-LABEL: @convert_f32x2_to_bf16x2_rs_satfinite +llvm.func @convert_f32x2_to_bf16x2_rs_satfinite(%srcA : f32, %srcB : f32, %rbits : i32) -> vector<2xbf16> { + // CHECK: %{{.*}} = call <2 x bfloat> @llvm.nvvm.ff2bf16x2.rs.satfinite(float %{{.*}}, float %{{.*}}, i32 %{{.*}}) + %res = nvvm.convert.f32x2.to.bf16x2 %srcA, %srcB, %rbits {sat = #nvvm.sat_mode<satfinite>} : vector<2xbf16> + llvm.return %res : vector<2xbf16> +} + +// CHECK-LABEL: @convert_f32x2_to_bf16x2_rs_relu +llvm.func @convert_f32x2_to_bf16x2_rs_relu(%srcA : f32, %srcB : f32, %rbits : i32) -> vector<2xbf16> { + // CHECK: %{{.*}} = call <2 x bfloat> @llvm.nvvm.ff2bf16x2.rs.relu(float %{{.*}}, float %{{.*}}, i32 %{{.*}}) + %res = nvvm.convert.f32x2.to.bf16x2 %srcA, %srcB, %rbits {relu = true} : vector<2xbf16> + llvm.return %res : vector<2xbf16> +} + +// CHECK-LABEL: @convert_f32x2_to_bf16x2_rs_relu_satfinite +llvm.func @convert_f32x2_to_bf16x2_rs_relu_satfinite(%srcA : f32, %srcB : f32, %rbits : i32) -> vector<2xbf16> { + // CHECK: %{{.*}} = call <2 x bfloat> @llvm.nvvm.ff2bf16x2.rs.relu.satfinite(float %{{.*}}, float %{{.*}}, i32 %{{.*}}) + %res = nvvm.convert.f32x2.to.bf16x2 %srcA, %srcB, %rbits {relu = true, sat = #nvvm.sat_mode<satfinite>} : vector<2xbf16> + llvm.return %res : vector<2xbf16> +} + +// ----- + +// Test F32x4 -> F8x4 (E4M3) with stochastic rounding (.rs) + +// CHECK-LABEL: @convert_f32x4_to_f8x4_e4m3_rs +llvm.func @convert_f32x4_to_f8x4_e4m3_rs(%src : vector<4xf32>, %rbits : i32) -> vector<4xi8> { + // CHECK: %{{.*}} = call <4 x i8> @llvm.nvvm.f32x4.to.e4m3x4.rs.satfinite(<4 x float> %{{.*}}, i32 %{{.*}}) + %res = nvvm.convert.f32x4.to.f8x4 %src, %rbits : vector<4xf32> -> vector<4xi8> (f8E4M3FN) + llvm.return %res : vector<4xi8> +} + +// CHECK-LABEL: @convert_f32x4_to_f8x4_e4m3_rs_relu +llvm.func @convert_f32x4_to_f8x4_e4m3_rs_relu(%src : vector<4xf32>, %rbits : i32) -> vector<4xi8> { + // CHECK: %{{.*}} = call <4 x i8> @llvm.nvvm.f32x4.to.e4m3x4.rs.relu.satfinite(<4 x float> %{{.*}}, i32 %{{.*}}) + %res = nvvm.convert.f32x4.to.f8x4 %src, %rbits {relu = true} : vector<4xf32> -> vector<4xi8> (f8E4M3FN) + llvm.return %res : vector<4xi8> +} + +// ----- + +// Test F32x4 -> F8x4 (E5M2) with stochastic rounding (.rs) + +// CHECK-LABEL: @convert_f32x4_to_f8x4_e5m2_rs +llvm.func @convert_f32x4_to_f8x4_e5m2_rs(%src : vector<4xf32>, %rbits : i32) -> vector<4xi8> { + // CHECK: %{{.*}} = call <4 x i8> @llvm.nvvm.f32x4.to.e5m2x4.rs.satfinite(<4 x float> %{{.*}}, i32 %{{.*}}) + %res = nvvm.convert.f32x4.to.f8x4 %src, %rbits : vector<4xf32> -> vector<4xi8> (f8E5M2) + llvm.return %res : vector<4xi8> +} + +// CHECK-LABEL: @convert_f32x4_to_f8x4_e5m2_rs_relu +llvm.func @convert_f32x4_to_f8x4_e5m2_rs_relu(%src : 
vector<4xf32>, %rbits : i32) -> vector<4xi8> { + // CHECK: %{{.*}} = call <4 x i8> @llvm.nvvm.f32x4.to.e5m2x4.rs.relu.satfinite(<4 x float> %{{.*}}, i32 %{{.*}}) + %res = nvvm.convert.f32x4.to.f8x4 %src, %rbits {relu = true} : vector<4xf32> -> vector<4xi8> (f8E5M2) + llvm.return %res : vector<4xi8> +} + +// ----- + +// Test F32x4 -> F6x4 (E2M3) with stochastic rounding (.rs) + +// CHECK-LABEL: @convert_f32x4_to_f6x4_e2m3_rs +llvm.func @convert_f32x4_to_f6x4_e2m3_rs(%src : vector<4xf32>, %rbits : i32) -> vector<4xi8> { + // CHECK: %{{.*}} = call <4 x i8> @llvm.nvvm.f32x4.to.e2m3x4.rs.satfinite(<4 x float> %{{.*}}, i32 %{{.*}}) + %res = nvvm.convert.f32x4.to.f6x4 %src, %rbits : vector<4xf32> -> vector<4xi8> (f6E2M3FN) + llvm.return %res : vector<4xi8> +} + +// CHECK-LABEL: @convert_f32x4_to_f6x4_e2m3_rs_relu +llvm.func @convert_f32x4_to_f6x4_e2m3_rs_relu(%src : vector<4xf32>, %rbits : i32) -> vector<4xi8> { + // CHECK: %{{.*}} = call <4 x i8> @llvm.nvvm.f32x4.to.e2m3x4.rs.relu.satfinite(<4 x float> %{{.*}}, i32 %{{.*}}) + %res = nvvm.convert.f32x4.to.f6x4 %src, %rbits {relu = true} : vector<4xf32> -> vector<4xi8> (f6E2M3FN) + llvm.return %res : vector<4xi8> +} + +// ----- + +// Test F32x4 -> F6x4 (E3M2) with stochastic rounding (.rs) + +// CHECK-LABEL: @convert_f32x4_to_f6x4_e3m2_rs +llvm.func @convert_f32x4_to_f6x4_e3m2_rs(%src : vector<4xf32>, %rbits : i32) -> vector<4xi8> { + // CHECK: %{{.*}} = call <4 x i8> @llvm.nvvm.f32x4.to.e3m2x4.rs.satfinite(<4 x float> %{{.*}}, i32 %{{.*}}) + %res = nvvm.convert.f32x4.to.f6x4 %src, %rbits : vector<4xf32> -> vector<4xi8> (f6E3M2FN) + llvm.return %res : vector<4xi8> +} + +// CHECK-LABEL: @convert_f32x4_to_f6x4_e3m2_rs_relu +llvm.func @convert_f32x4_to_f6x4_e3m2_rs_relu(%src : vector<4xf32>, %rbits : i32) -> vector<4xi8> { + // CHECK: %{{.*}} = call <4 x i8> @llvm.nvvm.f32x4.to.e3m2x4.rs.relu.satfinite(<4 x float> %{{.*}}, i32 %{{.*}}) + %res = nvvm.convert.f32x4.to.f6x4 %src, %rbits {relu = true} : vector<4xf32> -> vector<4xi8> (f6E3M2FN) + llvm.return %res : vector<4xi8> +} + +// ----- + +// Test F32x4 -> F4x4 (E2M1) with stochastic rounding (.rs) + +// CHECK-LABEL: @convert_f32x4_to_f4x4_e2m1_rs +llvm.func @convert_f32x4_to_f4x4_e2m1_rs(%src : vector<4xf32>, %rbits : i32) -> i16 { + // CHECK: %{{.*}} = call i16 @llvm.nvvm.f32x4.to.e2m1x4.rs.satfinite(<4 x float> %{{.*}}, i32 %{{.*}}) + %res = nvvm.convert.f32x4.to.f4x4 %src, %rbits : vector<4xf32> -> i16 (f4E2M1FN) + llvm.return %res : i16 +} + +// CHECK-LABEL: @convert_f32x4_to_f4x4_e2m1_rs_relu +llvm.func @convert_f32x4_to_f4x4_e2m1_rs_relu(%src : vector<4xf32>, %rbits : i32) -> i16 { + // CHECK: %{{.*}} = call i16 @llvm.nvvm.f32x4.to.e2m1x4.rs.relu.satfinite(<4 x float> %{{.*}}, i32 %{{.*}}) + %res = nvvm.convert.f32x4.to.f4x4 %src, %rbits {relu = true} : vector<4xf32> -> i16 (f4E2M1FN) + llvm.return %res : i16 +} + From 615299934489953deaf202cc445ac9f8ad362afc Mon Sep 17 00:00:00 2001 From: Jacek Caban <jacek@codeweavers.com> Date: Tue, 4 Nov 2025 00:04:36 +0100 Subject: [PATCH 101/313] [CodeGen][ARM64EC] Don't treat guest exit thunks as indirect calls (#165885) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Guest exit thunks serve as glue for performing direct calls, so they shouldn’t treat the target as an indirect one. Spotted by @coneco-cy in #165504. 
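
To illustrate the distinction (the IR shapes mirror the updated test below):

  call void @called()   ; direct call: routed through a guest exit thunk,
                        ; checked via __os_arm64x_check_icall
  call void %f()        ; indirect call: still checked via
                        ; __os_arm64x_check_icall_cfg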
---
 .../AArch64/AArch64Arm64ECCallLowering.cpp    | 14 ++----
 llvm/test/CodeGen/AArch64/cfguard-arm64ec.ll  | 49 +++++++++++++++++--
 2 files changed, 50 insertions(+), 13 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp b/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp
index 1169f26a2ae37..97298f9d74171 100644
--- a/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp
@@ -655,16 +655,10 @@ Function *AArch64Arm64ECCallLowering::buildGuestExitThunk(Function *F) {
   BasicBlock *BB = BasicBlock::Create(M->getContext(), "", GuestExit);
   IRBuilder<> B(BB);
 
-  // Load the global symbol as a pointer to the check function.
-  Value *GuardFn;
-  if (cfguard_module_flag == 2 && !F->hasFnAttribute("guard_nocf"))
-    GuardFn = GuardFnCFGlobal;
-  else
-    GuardFn = GuardFnGlobal;
-  LoadInst *GuardCheckLoad = B.CreateLoad(PtrTy, GuardFn);
-
-  // Create new call instruction. The CFGuard check should always be a call,
-  // even if the original CallBase is an Invoke or CallBr instruction.
+  // Create new call instruction. The call check should always be a call,
+  // even if the original CallBase is an Invoke or CallBr instruction.
+  // This is treated as a direct call, so do not use GuardFnCFGlobal.
+  LoadInst *GuardCheckLoad = B.CreateLoad(PtrTy, GuardFnGlobal);
   Function *Thunk = buildExitThunk(F->getFunctionType(), F->getAttributes());
   CallInst *GuardCheck = B.CreateCall(
       GuardFnType, GuardCheckLoad, {F, Thunk});

diff --git a/llvm/test/CodeGen/AArch64/cfguard-arm64ec.ll b/llvm/test/CodeGen/AArch64/cfguard-arm64ec.ll
index bdbc99e2d98b0..75e7ac902274d 100644
--- a/llvm/test/CodeGen/AArch64/cfguard-arm64ec.ll
+++ b/llvm/test/CodeGen/AArch64/cfguard-arm64ec.ll
@@ -2,15 +2,58 @@
 declare void @called()
 declare void @escaped()
-define void @f(ptr %dst) {
+define void @f(ptr %dst, ptr readonly %f) {
   call void @called()
+; CHECK: bl "#called"
   store ptr @escaped, ptr %dst
-  ret void
+  call void %f()
+; CHECK: adrp x10, $iexit_thunk$cdecl$v$v
+; CHECK-NEXT: add x10, x10, :lo12:$iexit_thunk$cdecl$v$v
+; CHECK-NEXT: str x8, [x20]
+; CHECK-NEXT: adrp x8, __os_arm64x_check_icall_cfg
+; CHECK-NEXT: ldr x8, [x8, :lo12:__os_arm64x_check_icall_cfg]
+; CHECK-NEXT: mov x11,
+; CHECK-NEXT: blr x8
+; CHECK-NEXT: blr x11
+  ret void
 }
+; CHECK-LABEL: .def "#called$exit_thunk";
+; CHECK-NEXT: .scl 2;
+; CHECK-NEXT: .type 32;
+; CHECK-NEXT: .endef
+; CHECK-NEXT: .section .wowthk$aa,"xr",discard,"#called$exit_thunk"
+; CHECK-NEXT: .globl "#called$exit_thunk" // -- Begin function #called$exit_thunk
+; CHECK-NEXT: .p2align 2
+; CHECK-NEXT: "#called$exit_thunk": // @"#called$exit_thunk"
+; CHECK-NEXT: .weak_anti_dep called
+; CHECK-NEXT: called = "#called"
+; CHECK-NEXT: .weak_anti_dep "#called"
+; CHECK-NEXT: "#called" = "#called$exit_thunk"
+; CHECK-NEXT: .seh_proc "#called$exit_thunk"
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: str x30, [sp, #-16]!
// 8-byte Folded Spill +; CHECK-NEXT: .seh_save_reg_x x30, 16 +; CHECK-NEXT: .seh_endprologue +; CHECK-NEXT: adrp x8, __os_arm64x_check_icall +; CHECK-NEXT: adrp x11, called +; CHECK-NEXT: add x11, x11, :lo12:called +; CHECK-NEXT: ldr x8, [x8, :lo12:__os_arm64x_check_icall] +; CHECK-NEXT: adrp x10, $iexit_thunk$cdecl$v$v +; CHECK-NEXT: add x10, x10, :lo12:$iexit_thunk$cdecl$v$v +; CHECK-NEXT: blr x8 +; CHECK-NEXT: .seh_startepilogue +; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: .seh_save_reg_x x30, 16 +; CHECK-NEXT: .seh_endepilogue +; CHECK-NEXT: br x11 +; CHECK-NEXT: .seh_endfunclet +; CHECK-NEXT: .seh_endproc + !llvm.module.flags = !{!0} -!0 = !{i32 2, !"cfguard", i32 1} +!0 = !{i32 2, !"cfguard", i32 2} ; CHECK-LABEL: .section .gfids$y,"dr" ; CHECK-NEXT: .symidx escaped +; CHECK-NEXT: .symidx $iexit_thunk$cdecl$v$v ; CHECK-NOT: .symidx From 73ef4dd48fa3ef64a0f291cb822ab66289780b0b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?= =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?= =?UTF-8?q?=E3=83=B3=29?= <clementval@gmail.com> Date: Mon, 3 Nov 2025 13:11:10 -1000 Subject: [PATCH 102/313] [flang][cuda] Add missing semi-colon in inlined ptx (#166254) This would trigger an error in ptxas. --- flang/lib/Optimizer/Builder/IntrinsicCall.cpp | 4 ++-- flang/test/Lower/CUDA/cuda-device-proc.cuf | 16 ++++++++-------- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp index 2db0606d2bc9e..6ebd52dcd42ea 100644 --- a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp +++ b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp @@ -9501,7 +9501,7 @@ void IntrinsicLibrary::genTMABulkS2G(llvm::ArrayRef<fir::ExtendedValue> args) { builder, loc, dst, src, fir::getBase(args[2]), {}, {}); mlir::NVVM::InlinePtxOp::create(builder, loc, mlir::TypeRange{}, {}, {}, - "cp.async.bulk.commit_group", {}); + "cp.async.bulk.commit_group;", {}); mlir::NVVM::CpAsyncBulkWaitGroupOp::create(builder, loc, builder.getI32IntegerAttr(0), {}); } @@ -9517,7 +9517,7 @@ static void genTMABulkStore(fir::FirOpBuilder &builder, mlir::Location loc, mlir::NVVM::CpAsyncBulkSharedCTAToGlobalOp::create(builder, loc, dst, src, size, {}, {}); mlir::NVVM::InlinePtxOp::create(builder, loc, mlir::TypeRange{}, {}, {}, - "cp.async.bulk.commit_group", {}); + "cp.async.bulk.commit_group;", {}); mlir::NVVM::CpAsyncBulkWaitGroupOp::create(builder, loc, builder.getI32IntegerAttr(0), {}); } diff --git a/flang/test/Lower/CUDA/cuda-device-proc.cuf b/flang/test/Lower/CUDA/cuda-device-proc.cuf index ed015df263070..666c394ad6678 100644 --- a/flang/test/Lower/CUDA/cuda-device-proc.cuf +++ b/flang/test/Lower/CUDA/cuda-device-proc.cuf @@ -494,7 +494,7 @@ end subroutine ! CHECK-LABEL: func.func @_QPtest_bulk_s2g ! CHECL: nvvm.cp.async.bulk.global.shared.cta %{{.*}}, %{{.*}}, %{{.*}} : <1>, <3> -! CHECK: nvvm.inline_ptx "cp.async.bulk.commit_group" +! CHECK: nvvm.inline_ptx "cp.async.bulk.commit_group;" ! CHECK: nvvm.cp.async.bulk.wait_group 0 attributes(device) subroutine testAtomicCasLoop(aa, n) @@ -675,7 +675,7 @@ end subroutine ! CHECK-LABEL: func.func @_QPtest_tma_bulk_store_c4 ! CHECK: nvvm.cp.async.bulk.global.shared.cta %{{.*}}, %{{.*}}, %{{.*}} : <1>, <3> -! CHECK: nvvm.inline_ptx "cp.async.bulk.commit_group" +! CHECK: nvvm.inline_ptx "cp.async.bulk.commit_group;" !
CHECK: nvvm.cp.async.bulk.wait_group 0 attributes(global) subroutine test_tma_bulk_store_c8(c, n) @@ -688,7 +688,7 @@ end subroutine ! CHECK-LABEL: func.func @_QPtest_tma_bulk_store_c8 ! CHECK: nvvm.cp.async.bulk.global.shared.cta %{{.*}}, %{{.*}}, %{{.*}} : <1>, <3> -! CHECK: nvvm.inline_ptx "cp.async.bulk.commit_group" +! CHECK: nvvm.inline_ptx "cp.async.bulk.commit_group;" ! CHECK: nvvm.cp.async.bulk.wait_group 0 attributes(global) subroutine test_tma_bulk_store_i4(c, n) @@ -701,7 +701,7 @@ end subroutine ! CHECK-LABEL: func.func @_QPtest_tma_bulk_store_i4 ! CHECK: nvvm.cp.async.bulk.global.shared.cta %{{.*}}, %{{.*}}, %{{.*}} : <1>, <3> -! CHECK: nvvm.inline_ptx "cp.async.bulk.commit_group" +! CHECK: nvvm.inline_ptx "cp.async.bulk.commit_group;" ! CHECK: nvvm.cp.async.bulk.wait_group 0 attributes(global) subroutine test_tma_bulk_store_i8(c, n) @@ -714,7 +714,7 @@ end subroutine ! CHECK-LABEL: func.func @_QPtest_tma_bulk_store_i8 ! CHECK: nvvm.cp.async.bulk.global.shared.cta %{{.*}}, %{{.*}}, %{{.*}} : <1>, <3> -! CHECK: nvvm.inline_ptx "cp.async.bulk.commit_group" +! CHECK: nvvm.inline_ptx "cp.async.bulk.commit_group;" ! CHECK: nvvm.cp.async.bulk.wait_group 0 @@ -728,7 +728,7 @@ end subroutine ! CHECK-LABEL: func.func @_QPtest_tma_bulk_store_r2 ! CHECK: nvvm.cp.async.bulk.global.shared.cta %{{.*}}, %{{.*}}, %{{.*}} : <1>, <3> -! CHECK: nvvm.inline_ptx "cp.async.bulk.commit_group" +! CHECK: nvvm.inline_ptx "cp.async.bulk.commit_group;" ! CHECK: nvvm.cp.async.bulk.wait_group 0 attributes(global) subroutine test_tma_bulk_store_r4(c, n) @@ -741,7 +741,7 @@ end subroutine ! CHECK-LABEL: func.func @_QPtest_tma_bulk_store_r4 ! CHECK: nvvm.cp.async.bulk.global.shared.cta %{{.*}}, %{{.*}}, %{{.*}} : <1>, <3> -! CHECK: nvvm.inline_ptx "cp.async.bulk.commit_group" +! CHECK: nvvm.inline_ptx "cp.async.bulk.commit_group;" ! CHECK: nvvm.cp.async.bulk.wait_group 0 attributes(global) subroutine test_tma_bulk_store_r8(c, n) @@ -754,5 +754,5 @@ end subroutine ! CHECK-LABEL: func.func @_QPtest_tma_bulk_store_r8 ! CHECK: nvvm.cp.async.bulk.global.shared.cta %{{.*}}, %{{.*}}, %{{.*}} : <1>, <3> -! CHECK: nvvm.inline_ptx "cp.async.bulk.commit_group" +! CHECK: nvvm.inline_ptx "cp.async.bulk.commit_group;" ! CHECK: nvvm.cp.async.bulk.wait_group 0 From 285b57b1a68e6738dac047f5f6461f231448b0f5 Mon Sep 17 00:00:00 2001 From: Rafael Auler <rafaelauler@meta.com> Date: Mon, 3 Nov 2025 15:11:29 -0800 Subject: [PATCH 103/313] Update BOLT's README.md example optimization flag (#166251) Drop hfsort in favor of a more modern function reordering algorithm. --- bolt/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bolt/README.md b/bolt/README.md index 902d1eb6e7694..55f742c5019f5 100644 --- a/bolt/README.md +++ b/bolt/README.md @@ -173,7 +173,7 @@ Once you have `perf.fdata` ready, you can use it for optimizations with BOLT. 
Assuming your environment is setup to include the right path, execute `llvm-bolt`: ``` -$ llvm-bolt <executable> -o <executable>.bolt -data=perf.fdata -reorder-blocks=ext-tsp -reorder-functions=hfsort -split-functions -split-all-cold -split-eh -dyno-stats +$ llvm-bolt <executable> -o <executable>.bolt -data=perf.fdata -reorder-blocks=ext-tsp -reorder-functions=cdsort -split-functions -split-all-cold -split-eh -dyno-stats ``` If you do need an updated debug info, then add `-update-debug-sections` option From d4c41b7fa30be06b5250c0d5abc7a26a83420321 Mon Sep 17 00:00:00 2001 From: Matthias Springer <me@m-sp.org> Date: Tue, 4 Nov 2025 08:34:52 +0900 Subject: [PATCH 104/313] [mlir][Transforms] Dialect Conversion: Convert entry block only (#165180) When converting a function, convert only the entry block signature. The remaining block signatures should be converted by the respective branching ops. The `FuncToLLVM` / `ControlFlowToLLVM` patterns already use that design. ```c++ struct BranchOpLowering : public ConvertOpToLLVMPattern<cf::BranchOp> { LogicalResult matchAndRewrite(cf::BranchOp op, OneToNOpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { // Convert successor block. SmallVector<Value> flattenedAdaptor = flattenValues(adaptor.getOperands()); FailureOr<Block *> convertedBlock = getConvertedBlock(rewriter, getTypeConverter(), op, op.getSuccessor(), TypeRange(ValueRange(flattenedAdaptor))); // ... } }; ``` This is consistent with the fact that operations from unreachable blocks are not put on the initial worklist. With this change, parent ops are no longer recursively legalized when inserting a block, simplifying the conversion driver a bit. Note for LLVM integration: If you are seeing failures, make sure to: - Drop `converter.isLegal(&op.getBody())` when checking the legality of a function op. Only the entry block signature / function type should be taken into account. - If you need to convert all reachable blocks and are using `cf` branching ops, add `populateCFStructuralTypeConversionsAndLegality`. - If you need to convert all reachable blocks and are using custom branching ops, implement and populate custom structural type conversion patterns, similar to `populateCFStructuralTypeConversionsAndLegality`. --- .../Transforms/Utils/DialectConversion.cpp | 112 ++++-------------- .../test-legalizer-no-materializations.mlir | 67 +++++++++++ mlir/test/Transforms/test-legalizer.mlir | 39 ------ mlir/test/lib/Dialect/Test/TestPatterns.cpp | 6 +- 4 files changed, 89 insertions(+), 135 deletions(-) create mode 100644 mlir/test/Transforms/test-legalizer-no-materializations.mlir diff --git a/mlir/lib/Transforms/Utils/DialectConversion.cpp b/mlir/lib/Transforms/Utils/DialectConversion.cpp index 3a23bbfd70eac..2fe06970eb568 100644 --- a/mlir/lib/Transforms/Utils/DialectConversion.cpp +++ b/mlir/lib/Transforms/Utils/DialectConversion.cpp @@ -1105,10 +1105,6 @@ struct ConversionPatternRewriterImpl : public RewriterBase::Listener { /// A set of operations that were modified by the current pattern. SetVector<Operation *> patternModifiedOps; - /// A set of blocks that were inserted (newly-created blocks or moved blocks) - /// by the current pattern. - SetVector<Block *> patternInsertedBlocks; - /// A list of unresolved materializations that were created by the current /// pattern. 
DenseSet<UnrealizedConversionCastOp> patternMaterializations; @@ -2046,8 +2042,6 @@ void ConversionPatternRewriterImpl::notifyBlockInserted( if (!config.allowPatternRollback && config.listener) config.listener->notifyBlockInserted(block, previous, previousIt); - patternInsertedBlocks.insert(block); - if (wasDetached) { // If the block was detached, it is most likely a newly created block. if (config.allowPatternRollback) { @@ -2399,17 +2393,12 @@ class OperationLegalizer { bool canApplyPattern(Operation *op, const Pattern &pattern); /// Legalize the resultant IR after successfully applying the given pattern. - LogicalResult legalizePatternResult(Operation *op, const Pattern &pattern, - const RewriterState &curState, - const SetVector<Operation *> &newOps, - const SetVector<Operation *> &modifiedOps, - const SetVector<Block *> &insertedBlocks); - - /// Legalizes the actions registered during the execution of a pattern. LogicalResult - legalizePatternBlockRewrites(Operation *op, - const SetVector<Block *> &insertedBlocks, - const SetVector<Operation *> &newOps); + legalizePatternResult(Operation *op, const Pattern &pattern, + const RewriterState &curState, + const SetVector<Operation *> &newOps, + const SetVector<Operation *> &modifiedOps); + LogicalResult legalizePatternCreatedOperations(const SetVector<Operation *> &newOps); LogicalResult @@ -2608,7 +2597,6 @@ LogicalResult OperationLegalizer::legalizeWithFold(Operation *op) { auto cleanup = llvm::make_scope_exit([&]() { rewriterImpl.patternNewOps.clear(); rewriterImpl.patternModifiedOps.clear(); - rewriterImpl.patternInsertedBlocks.clear(); }); // Upon failure, undo all changes made by the folder. @@ -2662,24 +2650,16 @@ LogicalResult OperationLegalizer::legalizeWithFold(Operation *op) { static void reportNewIrLegalizationFatalError(const Pattern &pattern, const SetVector<Operation *> &newOps, - const SetVector<Operation *> &modifiedOps, - const SetVector<Block *> &insertedBlocks) { + const SetVector<Operation *> &modifiedOps) { auto newOpNames = llvm::map_range( newOps, [](Operation *op) { return op->getName().getStringRef(); }); auto modifiedOpNames = llvm::map_range( modifiedOps, [](Operation *op) { return op->getName().getStringRef(); }); - StringRef detachedBlockStr = "(detached block)"; - auto insertedBlockNames = llvm::map_range(insertedBlocks, [&](Block *block) { - if (block->getParentOp()) - return block->getParentOp()->getName().getStringRef(); - return detachedBlockStr; - }); - llvm::report_fatal_error( - "pattern '" + pattern.getDebugName() + - "' produced IR that could not be legalized. " + "new ops: {" + - llvm::join(newOpNames, ", ") + "}, " + "modified ops: {" + - llvm::join(modifiedOpNames, ", ") + "}, " + "inserted block into ops: {" + - llvm::join(insertedBlockNames, ", ") + "}"); + llvm::report_fatal_error("pattern '" + pattern.getDebugName() + + "' produced IR that could not be legalized. 
" + + "new ops: {" + llvm::join(newOpNames, ", ") + "}, " + + "modified ops: {" + + llvm::join(modifiedOpNames, ", ") + "}"); } LogicalResult OperationLegalizer::legalizeWithPattern(Operation *op) { @@ -2743,7 +2723,6 @@ LogicalResult OperationLegalizer::legalizeWithPattern(Operation *op) { } rewriterImpl.patternNewOps.clear(); rewriterImpl.patternModifiedOps.clear(); - rewriterImpl.patternInsertedBlocks.clear(); LLVM_DEBUG({ logFailure(rewriterImpl.logger, "pattern failed to match"); if (rewriterImpl.config.notifyCallback) { @@ -2777,15 +2756,12 @@ LogicalResult OperationLegalizer::legalizeWithPattern(Operation *op) { SetVector<Operation *> newOps = moveAndReset(rewriterImpl.patternNewOps); SetVector<Operation *> modifiedOps = moveAndReset(rewriterImpl.patternModifiedOps); - SetVector<Block *> insertedBlocks = - moveAndReset(rewriterImpl.patternInsertedBlocks); - auto result = legalizePatternResult(op, pattern, curState, newOps, - modifiedOps, insertedBlocks); + auto result = + legalizePatternResult(op, pattern, curState, newOps, modifiedOps); appliedPatterns.erase(&pattern); if (failed(result)) { if (!rewriterImpl.config.allowPatternRollback) - reportNewIrLegalizationFatalError(pattern, newOps, modifiedOps, - insertedBlocks); + reportNewIrLegalizationFatalError(pattern, newOps, modifiedOps); rewriterImpl.resetState(curState, pattern.getDebugName()); } if (config.listener) @@ -2823,8 +2799,7 @@ bool OperationLegalizer::canApplyPattern(Operation *op, LogicalResult OperationLegalizer::legalizePatternResult( Operation *op, const Pattern &pattern, const RewriterState &curState, const SetVector<Operation *> &newOps, - const SetVector<Operation *> &modifiedOps, - const SetVector<Block *> &insertedBlocks) { + const SetVector<Operation *> &modifiedOps) { [[maybe_unused]] auto &impl = rewriter.getImpl(); assert(impl.pendingRootUpdates.empty() && "dangling root updates"); @@ -2843,8 +2818,7 @@ LogicalResult OperationLegalizer::legalizePatternResult( #endif // MLIR_ENABLE_EXPENSIVE_PATTERN_API_CHECKS // Legalize each of the actions registered during application. - if (failed(legalizePatternBlockRewrites(op, insertedBlocks, newOps)) || - failed(legalizePatternRootUpdates(modifiedOps)) || + if (failed(legalizePatternRootUpdates(modifiedOps)) || failed(legalizePatternCreatedOperations(newOps))) { return failure(); } @@ -2853,53 +2827,6 @@ LogicalResult OperationLegalizer::legalizePatternResult( return success(); } -LogicalResult OperationLegalizer::legalizePatternBlockRewrites( - Operation *op, const SetVector<Block *> &insertedBlocks, - const SetVector<Operation *> &newOps) { - ConversionPatternRewriterImpl &impl = rewriter.getImpl(); - SmallPtrSet<Operation *, 16> alreadyLegalized; - - // If the pattern moved or created any blocks, make sure the types of block - // arguments get legalized. - for (Block *block : insertedBlocks) { - if (impl.erasedBlocks.contains(block)) - continue; - - // Only check blocks outside of the current operation. - Operation *parentOp = block->getParentOp(); - if (!parentOp || parentOp == op || block->getNumArguments() == 0) - continue; - - // If the region of the block has a type converter, try to convert the block - // directly. 
- if (auto *converter = impl.regionToConverter.lookup(block->getParent())) { - std::optional<TypeConverter::SignatureConversion> conversion = - converter->convertBlockSignature(block); - if (!conversion) { - LLVM_DEBUG(logFailure(impl.logger, "failed to convert types of moved " - "block")); - return failure(); - } - impl.applySignatureConversion(block, converter, *conversion); - continue; - } - - // Otherwise, try to legalize the parent operation if it was not generated - // by this pattern. This is because we will attempt to legalize the parent - // operation, and blocks in regions created by this pattern will already be - // legalized later on. - if (!newOps.count(parentOp) && alreadyLegalized.insert(parentOp).second) { - if (failed(legalize(parentOp))) { - LLVM_DEBUG(logFailure( - impl.logger, "operation '{0}'({1}) became illegal after rewrite", - parentOp->getName(), parentOp)); - return failure(); - } - } - } - return success(); -} - LogicalResult OperationLegalizer::legalizePatternCreatedOperations( const SetVector<Operation *> &newOps) { for (Operation *op : newOps) { @@ -3800,10 +3727,11 @@ static LogicalResult convertFuncOpTypes(FunctionOpInterface funcOp, TypeConverter::SignatureConversion result(type.getNumInputs()); SmallVector<Type, 1> newResults; if (failed(typeConverter.convertSignatureArgs(type.getInputs(), result)) || - failed(typeConverter.convertTypes(type.getResults(), newResults)) || - failed(rewriter.convertRegionTypes(&funcOp.getFunctionBody(), - typeConverter, &result))) + failed(typeConverter.convertTypes(type.getResults(), newResults))) return failure(); + if (!funcOp.getFunctionBody().empty()) + rewriter.applySignatureConversion(&funcOp.getFunctionBody().front(), result, + &typeConverter); // Update the function signature in-place. 
auto newType = FunctionType::get(rewriter.getContext(), diff --git a/mlir/test/Transforms/test-legalizer-no-materializations.mlir b/mlir/test/Transforms/test-legalizer-no-materializations.mlir new file mode 100644 index 0000000000000..82dd7422b22b2 --- /dev/null +++ b/mlir/test/Transforms/test-legalizer-no-materializations.mlir @@ -0,0 +1,67 @@ +// RUN: mlir-opt -allow-unregistered-dialect -split-input-file -test-legalize-patterns="allow-pattern-rollback=0 build-materializations=0 attach-debug-materialization-kind=1" -verify-diagnostics %s | FileCheck %s --check-prefix=CHECK-KIND + +// CHECK-LABEL: func @dropped_input_in_use +// CHECK-KIND-LABEL: func @dropped_input_in_use +func.func @dropped_input_in_use(%arg: i16, %arg2: i64) { + // CHECK-NEXT: %[[cast:.*]] = "test.cast"() : () -> i16 + // CHECK-NEXT: "work"(%[[cast]]) : (i16) + // CHECK-KIND-NEXT: %[[cast:.*]] = builtin.unrealized_conversion_cast to i16 {__kind__ = "source"} + // CHECK-KIND-NEXT: "work"(%[[cast]]) : (i16) + // expected-remark@+1 {{op 'work' is not legalizable}} + "work"(%arg) : (i16) -> () +} + +// ----- + +// CHECK-KIND-LABEL: func @test_lookup_without_converter +// CHECK-KIND: %[[producer:.*]] = "test.valid_producer"() : () -> i16 +// CHECK-KIND: %[[cast:.*]] = builtin.unrealized_conversion_cast %[[producer]] : i16 to f64 {__kind__ = "target"} +// CHECK-KIND: "test.valid_consumer"(%[[cast]]) : (f64) -> () +// CHECK-KIND: "test.valid_consumer"(%[[producer]]) : (i16) -> () +func.func @test_lookup_without_converter() { + %0 = "test.replace_with_valid_producer"() {type = i16} : () -> (i64) + "test.replace_with_valid_consumer"(%0) {with_converter} : (i64) -> () + // Make sure that the second "replace_with_valid_consumer" lowering does not + // lookup the materialization that was created for the above op. 
+ "test.replace_with_valid_consumer"(%0) : (i64) -> () + // expected-remark@+1 {{op 'func.return' is not legalizable}} + return +} + +// ----- + +// CHECK-LABEL: func @remap_moved_region_args +func.func @remap_moved_region_args() { + // CHECK-NEXT: return + // CHECK-NEXT: ^bb1(%[[arg0:.*]]: i64, %[[arg1:.*]]: i16, %[[arg2:.*]]: i64, %[[arg3:.*]]: f32): + // CHECK-NEXT: %[[cast1:.*]]:2 = builtin.unrealized_conversion_cast %[[arg3]] : f32 to f16, f16 + // CHECK-NEXT: %[[cast2:.*]] = builtin.unrealized_conversion_cast %[[arg2]] : i64 to f64 + // CHECK-NEXT: %[[cast3:.*]] = builtin.unrealized_conversion_cast %[[arg0]] : i64 to f64 + // CHECK-NEXT: %[[cast4:.*]] = "test.cast"(%[[cast1]]#0, %[[cast1]]#1) : (f16, f16) -> f32 + // CHECK-NEXT: "test.valid"(%[[cast3]], %[[cast2]], %[[cast4]]) : (f64, f64, f32) + "test.region"() ({ + ^bb1(%i0: i64, %unused: i16, %i1: i64, %2: f32): + "test.invalid"(%i0, %i1, %2) : (i64, i64, f32) -> () + }) : () -> () + // expected-remark@+1 {{op 'func.return' is not legalizable}} + return +} + +// ----- + +// CHECK-LABEL: func @remap_cloned_region_args +func.func @remap_cloned_region_args() { + // CHECK-NEXT: return + // CHECK-NEXT: ^bb1(%[[arg0:.*]]: i64, %[[arg1:.*]]: i16, %[[arg2:.*]]: i64, %[[arg3:.*]]: f32): + // CHECK-NEXT: %[[cast1:.*]]:2 = builtin.unrealized_conversion_cast %[[arg3]] : f32 to f16, f16 + // CHECK-NEXT: %[[cast2:.*]] = builtin.unrealized_conversion_cast %[[arg2]] : i64 to f64 + // CHECK-NEXT: %[[cast3:.*]] = builtin.unrealized_conversion_cast %[[arg0]] : i64 to f64 + // CHECK-NEXT: %[[cast4:.*]] = "test.cast"(%[[cast1]]#0, %[[cast1]]#1) : (f16, f16) -> f32 + // CHECK-NEXT: "test.valid"(%[[cast3]], %[[cast2]], %[[cast4]]) : (f64, f64, f32) + "test.region"() ({ + ^bb1(%i0: i64, %unused: i16, %i1: i64, %2: f32): + "test.invalid"(%i0, %i1, %2) : (i64, i64, f32) -> () + }) {legalizer.should_clone} : () -> () + // expected-remark@+1 {{op 'func.return' is not legalizable}} + return +} diff --git a/mlir/test/Transforms/test-legalizer.mlir b/mlir/test/Transforms/test-legalizer.mlir index 94c5bb4e93b06..7c43bb7bface0 100644 --- a/mlir/test/Transforms/test-legalizer.mlir +++ b/mlir/test/Transforms/test-legalizer.mlir @@ -1,7 +1,6 @@ // RUN: mlir-opt -allow-unregistered-dialect -split-input-file -test-legalize-patterns="allow-pattern-rollback=1" -verify-diagnostics %s | FileCheck %s // RUN: mlir-opt -allow-unregistered-dialect -split-input-file -test-legalize-patterns="allow-pattern-rollback=1" -verify-diagnostics -profile-actions-to=- %s | FileCheck %s --check-prefix=CHECK-PROFILER // RUN: mlir-opt -allow-unregistered-dialect -split-input-file -test-legalize-patterns="allow-pattern-rollback=0" -verify-diagnostics %s | FileCheck %s -// RUN: mlir-opt -allow-unregistered-dialect -split-input-file -test-legalize-patterns="allow-pattern-rollback=0 build-materializations=0 attach-debug-materialization-kind=1" -verify-diagnostics %s | FileCheck %s --check-prefix=CHECK-KIND // CHECK-PROFILER: "name": "pass-execution", "cat": "PERF", "ph": "B" // CHECK-PROFILER: "name": "apply-conversion", "cat": "PERF", "ph": "B" @@ -146,36 +145,6 @@ func.func @no_remap_nested() { // ----- -// CHECK-LABEL: func @remap_moved_region_args -func.func @remap_moved_region_args() { - // CHECK-NEXT: return - // CHECK-NEXT: ^bb1(%{{.*}}: f64, %{{.*}}: f64, %{{.*}}: f16, %{{.*}}: f16): - // CHECK-NEXT: "test.cast"{{.*}} : (f16, f16) -> f32 - // CHECK-NEXT: "test.valid"{{.*}} : (f64, f64, f32) - "test.region"() ({ - ^bb1(%i0: i64, %unused: i16, %i1: i64, %2: f32): - "test.invalid"(%i0, 
%i1, %2) : (i64, i64, f32) -> () - }) : () -> () - // expected-remark@+1 {{op 'func.return' is not legalizable}} - return -} - -// ----- - -// CHECK-LABEL: func @remap_cloned_region_args -func.func @remap_cloned_region_args() { - // CHECK-NEXT: return - // CHECK-NEXT: ^bb1(%{{.*}}: f64, %{{.*}}: f64, %{{.*}}: f16, %{{.*}}: f16): - // CHECK-NEXT: "test.cast"{{.*}} : (f16, f16) -> f32 - // CHECK-NEXT: "test.valid"{{.*}} : (f64, f64, f32) - "test.region"() ({ - ^bb1(%i0: i64, %unused: i16, %i1: i64, %2: f32): - "test.invalid"(%i0, %i1, %2) : (i64, i64, f32) -> () - }) {legalizer.should_clone} : () -> () - // expected-remark@+1 {{op 'func.return' is not legalizable}} - return -} - // CHECK-LABEL: func @remap_drop_region func.func @remap_drop_region() { // CHECK-NEXT: return @@ -191,12 +160,9 @@ func.func @remap_drop_region() { // ----- // CHECK-LABEL: func @dropped_input_in_use -// CHECK-KIND-LABEL: func @dropped_input_in_use func.func @dropped_input_in_use(%arg: i16, %arg2: i64) { // CHECK-NEXT: %[[cast:.*]] = "test.cast"() : () -> i16 // CHECK-NEXT: "work"(%[[cast]]) : (i16) - // CHECK-KIND-NEXT: %[[cast:.*]] = builtin.unrealized_conversion_cast to i16 {__kind__ = "source"} - // CHECK-KIND-NEXT: "work"(%[[cast]]) : (i16) // expected-remark@+1 {{op 'work' is not legalizable}} "work"(%arg) : (i16) -> () } @@ -452,11 +418,6 @@ func.func @test_multiple_1_to_n_replacement() { // CHECK: %[[cast:.*]] = "test.cast"(%[[producer]]) : (i16) -> f64 // CHECK: "test.valid_consumer"(%[[cast]]) : (f64) -> () // CHECK: "test.valid_consumer"(%[[producer]]) : (i16) -> () -// CHECK-KIND-LABEL: func @test_lookup_without_converter -// CHECK-KIND: %[[producer:.*]] = "test.valid_producer"() : () -> i16 -// CHECK-KIND: %[[cast:.*]] = builtin.unrealized_conversion_cast %[[producer]] : i16 to f64 {__kind__ = "target"} -// CHECK-KIND: "test.valid_consumer"(%[[cast]]) : (f64) -> () -// CHECK-KIND: "test.valid_consumer"(%[[producer]]) : (i16) -> () func.func @test_lookup_without_converter() { %0 = "test.replace_with_valid_producer"() {type = i16} : () -> (i64) "test.replace_with_valid_consumer"(%0) {with_converter} : (i64) -> () diff --git a/mlir/test/lib/Dialect/Test/TestPatterns.cpp b/mlir/test/lib/Dialect/Test/TestPatterns.cpp index fd2b943ff1296..12edecc113495 100644 --- a/mlir/test/lib/Dialect/Test/TestPatterns.cpp +++ b/mlir/test/lib/Dialect/Test/TestPatterns.cpp @@ -1553,8 +1553,7 @@ struct TestLegalizePatternDriver [](Type type) { return type.isF32(); }); }); target.addDynamicallyLegalOp<func::FuncOp>([&](func::FuncOp op) { - return converter.isSignatureLegal(op.getFunctionType()) && - converter.isLegal(&op.getBody()); + return converter.isSignatureLegal(op.getFunctionType()); }); target.addDynamicallyLegalOp<func::CallOp>( [&](func::CallOp op) { return converter.isLegal(op); }); @@ -2156,8 +2155,7 @@ struct TestTypeConversionDriver recursiveType.getName() == "outer_converted_type"); }); target.addDynamicallyLegalOp<func::FuncOp>([&](func::FuncOp op) { - return converter.isSignatureLegal(op.getFunctionType()) && - converter.isLegal(&op.getBody()); + return converter.isSignatureLegal(op.getFunctionType()); }); target.addDynamicallyLegalOp<TestCastOp>([&](TestCastOp op) { // Allow casts from F64 to F32. 
From 590a2b0a1f3250143d0c5c8c6ab02cba2fcd46ba Mon Sep 17 00:00:00 2001 From: Matt Arsenault <Matthew.Arsenault@amd.com> Date: Mon, 3 Nov 2025 16:00:29 -0800 Subject: [PATCH 105/313] Revert "ARM: Remove unnecessary manual ABI lowering for sincos_stret (#166040)" (#166262) This reverts commit a522ae3ef6e13cb39e7756c151652e03a024b301. The ABI handling doesn't account for matching the C ABI, only implicit sret. --- llvm/lib/Target/ARM/ARMISelLowering.cpp | 39 +++++++++++++++++++++++-- 1 file changed, 37 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index 3a00267395504..6b0653457cbaf 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -9869,12 +9869,32 @@ SDValue ARMTargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const { assert(Subtarget->isTargetDarwin()); Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); + auto PtrVT = getPointerTy(DAG.getDataLayout()); + + MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); // Pair of floats / doubles used to pass the result. Type *RetTy = StructType::get(ArgTy, ArgTy); auto &DL = DAG.getDataLayout(); ArgListTy Args; + bool ShouldUseSRet = getTM().isAPCS_ABI(); + SDValue SRet; + if (ShouldUseSRet) { + // Create stack object for sret. + const uint64_t ByteSize = DL.getTypeAllocSize(RetTy); + const Align StackAlign = DL.getPrefTypeAlign(RetTy); + int FrameIdx = MFI.CreateStackObject(ByteSize, StackAlign, false); + SRet = DAG.getFrameIndex(FrameIdx, getPointerTy(DL)); + + ArgListEntry Entry(SRet, PointerType::getUnqual(RetTy->getContext())); + Entry.IsSExt = false; + Entry.IsZExt = false; + Entry.IsSRet = true; + Args.push_back(Entry); + RetTy = Type::getVoidTy(*DAG.getContext()); + } + Args.emplace_back(Arg, ArgTy); StringRef LibcallName = getLibcallImplName(SincosStret); @@ -9884,10 +9904,25 @@ SDValue ARMTargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const { TargetLowering::CallLoweringInfo CLI(DAG); CLI.setDebugLoc(dl) .setChain(DAG.getEntryNode()) - .setCallee(CC, RetTy, Callee, std::move(Args)); + .setCallee(CC, RetTy, Callee, std::move(Args)) + .setDiscardResult(ShouldUseSRet); std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI); - return CallResult.first; + if (!ShouldUseSRet) + return CallResult.first; + + SDValue LoadSin = + DAG.getLoad(ArgVT, dl, CallResult.second, SRet, MachinePointerInfo()); + + // Address of cos field. 
+ SDValue Add = DAG.getNode(ISD::ADD, dl, PtrVT, SRet, + DAG.getIntPtrConstant(ArgVT.getStoreSize(), dl)); + SDValue LoadCos = + DAG.getLoad(ArgVT, dl, LoadSin.getValue(1), Add, MachinePointerInfo()); + + SDVTList Tys = DAG.getVTList(ArgVT, ArgVT); + return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, + LoadSin.getValue(0), LoadCos.getValue(0)); } SDValue ARMTargetLowering::LowerWindowsDIVLibCall(SDValue Op, SelectionDAG &DAG, From c77b614564c69c7e63363859bf9b37427614eabd Mon Sep 17 00:00:00 2001 From: Matt Arsenault <Matthew.Arsenault@amd.com> Date: Mon, 3 Nov 2025 16:00:54 -0800 Subject: [PATCH 106/313] ARM: Add more ABIs to llvm.sincos test (#166264) Make sure the iOS targets with/without sincos_stret are tested --- llvm/test/CodeGen/ARM/llvm.sincos.ll | 910 +++++++++++++++++++++------ 1 file changed, 735 insertions(+), 175 deletions(-) diff --git a/llvm/test/CodeGen/ARM/llvm.sincos.ll b/llvm/test/CodeGen/ARM/llvm.sincos.ll index 9628405df6bcb..0c2263ee9acbf 100644 --- a/llvm/test/CodeGen/ARM/llvm.sincos.ll +++ b/llvm/test/CodeGen/ARM/llvm.sincos.ll @@ -1,223 +1,783 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 -; RUN: llc -mtriple=thumbv7-gnu-linux < %s | FileCheck -check-prefixes=CHECK %s +; RUN: llc -mtriple=thumbv7-gnu-linux < %s | FileCheck -check-prefix=GNU %s +; RUN: llc -mtriple=armv7-linux-gnueabi -mcpu=cortex-a8 < %s | FileCheck -check-prefix=GNUEABI %s +; RUN: llc -mtriple=armv7-apple-ios6 -mcpu=cortex-a8 < %s | FileCheck -check-prefixes=IOS,IOS-NO-STRET %s +; RUN: llc -mtriple=armv7-apple-ios7 -mcpu=cortex-a8 < %s | FileCheck -check-prefixes=IOS,IOS-WITH-STRET %s define { half, half } @test_sincos_f16(half %a) { -; CHECK-LABEL: test_sincos_f16: -; CHECK: @ %bb.0: -; CHECK-NEXT: push {r4, lr} -; CHECK-NEXT: sub sp, #8 -; CHECK-NEXT: bl __gnu_h2f_ieee -; CHECK-NEXT: add r1, sp, #4 -; CHECK-NEXT: mov r2, sp -; CHECK-NEXT: bl sincosf -; CHECK-NEXT: ldr r0, [sp, #4] -; CHECK-NEXT: bl __gnu_f2h_ieee -; CHECK-NEXT: mov r4, r0 -; CHECK-NEXT: ldr r0, [sp] -; CHECK-NEXT: bl __gnu_f2h_ieee -; CHECK-NEXT: mov r1, r0 -; CHECK-NEXT: mov r0, r4 -; CHECK-NEXT: add sp, #8 -; CHECK-NEXT: pop {r4, pc} +; GNU-LABEL: test_sincos_f16: +; GNU: @ %bb.0: +; GNU-NEXT: push {r4, lr} +; GNU-NEXT: sub sp, #8 +; GNU-NEXT: bl __gnu_h2f_ieee +; GNU-NEXT: add r1, sp, #4 +; GNU-NEXT: mov r2, sp +; GNU-NEXT: bl sincosf +; GNU-NEXT: ldr r0, [sp, #4] +; GNU-NEXT: bl __gnu_f2h_ieee +; GNU-NEXT: mov r4, r0 +; GNU-NEXT: ldr r0, [sp] +; GNU-NEXT: bl __gnu_f2h_ieee +; GNU-NEXT: mov r1, r0 +; GNU-NEXT: mov r0, r4 +; GNU-NEXT: add sp, #8 +; GNU-NEXT: pop {r4, pc} +; +; GNUEABI-LABEL: test_sincos_f16: +; GNUEABI: @ %bb.0: +; GNUEABI-NEXT: .save {r4, lr} +; GNUEABI-NEXT: push {r4, lr} +; GNUEABI-NEXT: .pad #8 +; GNUEABI-NEXT: sub sp, sp, #8 +; GNUEABI-NEXT: bl __gnu_h2f_ieee +; GNUEABI-NEXT: add r1, sp, #4 +; GNUEABI-NEXT: mov r2, sp +; GNUEABI-NEXT: bl sincosf +; GNUEABI-NEXT: ldr r0, [sp, #4] +; GNUEABI-NEXT: bl __gnu_f2h_ieee +; GNUEABI-NEXT: mov r4, r0 +; GNUEABI-NEXT: ldr r0, [sp] +; GNUEABI-NEXT: bl __gnu_f2h_ieee +; GNUEABI-NEXT: mov r1, r0 +; GNUEABI-NEXT: mov r0, r4 +; GNUEABI-NEXT: add sp, sp, #8 +; GNUEABI-NEXT: pop {r4, pc} +; +; IOS-NO-STRET-LABEL: test_sincos_f16: +; IOS-NO-STRET: @ %bb.0: +; IOS-NO-STRET-NEXT: push {r4, r5, lr} +; IOS-NO-STRET-NEXT: bl ___extendhfsf2 +; IOS-NO-STRET-NEXT: mov r4, r0 +; IOS-NO-STRET-NEXT: bl _sinf +; IOS-NO-STRET-NEXT: bl ___truncsfhf2 +; IOS-NO-STRET-NEXT: mov r5, r0 +; IOS-NO-STRET-NEXT: mov r0, r4 +; IOS-NO-STRET-NEXT: bl
_cosf +; IOS-NO-STRET-NEXT: bl ___truncsfhf2 +; IOS-NO-STRET-NEXT: mov r1, r0 +; IOS-NO-STRET-NEXT: mov r0, r5 +; IOS-NO-STRET-NEXT: pop {r4, r5, pc} +; +; IOS-WITH-STRET-LABEL: test_sincos_f16: +; IOS-WITH-STRET: @ %bb.0: +; IOS-WITH-STRET-NEXT: push {r4, r5, lr} +; IOS-WITH-STRET-NEXT: sub sp, sp, #8 +; IOS-WITH-STRET-NEXT: bl ___extendhfsf2 +; IOS-WITH-STRET-NEXT: mov r1, r0 +; IOS-WITH-STRET-NEXT: mov r0, sp +; IOS-WITH-STRET-NEXT: bl ___sincosf_stret +; IOS-WITH-STRET-NEXT: ldm sp, {r0, r4} +; IOS-WITH-STRET-NEXT: bl ___truncsfhf2 +; IOS-WITH-STRET-NEXT: mov r5, r0 +; IOS-WITH-STRET-NEXT: mov r0, r4 +; IOS-WITH-STRET-NEXT: bl ___truncsfhf2 +; IOS-WITH-STRET-NEXT: mov r1, r0 +; IOS-WITH-STRET-NEXT: mov r0, r5 +; IOS-WITH-STRET-NEXT: add sp, sp, #8 +; IOS-WITH-STRET-NEXT: pop {r4, r5, pc} %result = call { half, half } @llvm.sincos.f16(half %a) ret { half, half } %result } define half @test_sincos_f16_only_use_sin(half %a) { -; CHECK-LABEL: test_sincos_f16_only_use_sin: -; CHECK: @ %bb.0: -; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: sub sp, #8 -; CHECK-NEXT: bl __gnu_h2f_ieee -; CHECK-NEXT: add r1, sp, #4 -; CHECK-NEXT: mov r2, sp -; CHECK-NEXT: bl sincosf -; CHECK-NEXT: ldr r0, [sp, #4] -; CHECK-NEXT: bl __gnu_f2h_ieee -; CHECK-NEXT: add sp, #8 -; CHECK-NEXT: pop {r7, pc} +; GNU-LABEL: test_sincos_f16_only_use_sin: +; GNU: @ %bb.0: +; GNU-NEXT: push {r7, lr} +; GNU-NEXT: sub sp, #8 +; GNU-NEXT: bl __gnu_h2f_ieee +; GNU-NEXT: add r1, sp, #4 +; GNU-NEXT: mov r2, sp +; GNU-NEXT: bl sincosf +; GNU-NEXT: ldr r0, [sp, #4] +; GNU-NEXT: bl __gnu_f2h_ieee +; GNU-NEXT: add sp, #8 +; GNU-NEXT: pop {r7, pc} +; +; GNUEABI-LABEL: test_sincos_f16_only_use_sin: +; GNUEABI: @ %bb.0: +; GNUEABI-NEXT: .save {r11, lr} +; GNUEABI-NEXT: push {r11, lr} +; GNUEABI-NEXT: .pad #8 +; GNUEABI-NEXT: sub sp, sp, #8 +; GNUEABI-NEXT: bl __gnu_h2f_ieee +; GNUEABI-NEXT: add r1, sp, #4 +; GNUEABI-NEXT: mov r2, sp +; GNUEABI-NEXT: bl sincosf +; GNUEABI-NEXT: ldr r0, [sp, #4] +; GNUEABI-NEXT: bl __gnu_f2h_ieee +; GNUEABI-NEXT: add sp, sp, #8 +; GNUEABI-NEXT: pop {r11, pc} +; +; IOS-NO-STRET-LABEL: test_sincos_f16_only_use_sin: +; IOS-NO-STRET: @ %bb.0: +; IOS-NO-STRET-NEXT: push {lr} +; IOS-NO-STRET-NEXT: bl ___extendhfsf2 +; IOS-NO-STRET-NEXT: bl _sinf +; IOS-NO-STRET-NEXT: bl ___truncsfhf2 +; IOS-NO-STRET-NEXT: pop {lr} +; IOS-NO-STRET-NEXT: bx lr +; +; IOS-WITH-STRET-LABEL: test_sincos_f16_only_use_sin: +; IOS-WITH-STRET: @ %bb.0: +; IOS-WITH-STRET-NEXT: push {lr} +; IOS-WITH-STRET-NEXT: sub sp, sp, #8 +; IOS-WITH-STRET-NEXT: bl ___extendhfsf2 +; IOS-WITH-STRET-NEXT: mov r1, r0 +; IOS-WITH-STRET-NEXT: mov r0, sp +; IOS-WITH-STRET-NEXT: bl ___sincosf_stret +; IOS-WITH-STRET-NEXT: ldr r0, [sp] +; IOS-WITH-STRET-NEXT: bl ___truncsfhf2 +; IOS-WITH-STRET-NEXT: add sp, sp, #8 +; IOS-WITH-STRET-NEXT: pop {lr} +; IOS-WITH-STRET-NEXT: bx lr %result = call { half, half } @llvm.sincos.f16(half %a) %result.0 = extractvalue { half, half } %result, 0 ret half %result.0 } define half @test_sincos_f16_only_use_cos(half %a) { -; CHECK-LABEL: test_sincos_f16_only_use_cos: -; CHECK: @ %bb.0: -; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: sub sp, #8 -; CHECK-NEXT: bl __gnu_h2f_ieee -; CHECK-NEXT: add r1, sp, #4 -; CHECK-NEXT: mov r2, sp -; CHECK-NEXT: bl sincosf -; CHECK-NEXT: ldr r0, [sp] -; CHECK-NEXT: bl __gnu_f2h_ieee -; CHECK-NEXT: add sp, #8 -; CHECK-NEXT: pop {r7, pc} +; GNU-LABEL: test_sincos_f16_only_use_cos: +; GNU: @ %bb.0: +; GNU-NEXT: push {r7, lr} +; GNU-NEXT: sub sp, #8 +; GNU-NEXT: bl __gnu_h2f_ieee +; GNU-NEXT: add r1, sp, #4 
+; GNU-NEXT: mov r2, sp +; GNU-NEXT: bl sincosf +; GNU-NEXT: ldr r0, [sp] +; GNU-NEXT: bl __gnu_f2h_ieee +; GNU-NEXT: add sp, #8 +; GNU-NEXT: pop {r7, pc} +; +; GNUEABI-LABEL: test_sincos_f16_only_use_cos: +; GNUEABI: @ %bb.0: +; GNUEABI-NEXT: .save {r11, lr} +; GNUEABI-NEXT: push {r11, lr} +; GNUEABI-NEXT: .pad #8 +; GNUEABI-NEXT: sub sp, sp, #8 +; GNUEABI-NEXT: bl __gnu_h2f_ieee +; GNUEABI-NEXT: add r1, sp, #4 +; GNUEABI-NEXT: mov r2, sp +; GNUEABI-NEXT: bl sincosf +; GNUEABI-NEXT: ldr r0, [sp] +; GNUEABI-NEXT: bl __gnu_f2h_ieee +; GNUEABI-NEXT: add sp, sp, #8 +; GNUEABI-NEXT: pop {r11, pc} +; +; IOS-NO-STRET-LABEL: test_sincos_f16_only_use_cos: +; IOS-NO-STRET: @ %bb.0: +; IOS-NO-STRET-NEXT: push {lr} +; IOS-NO-STRET-NEXT: bl ___extendhfsf2 +; IOS-NO-STRET-NEXT: bl _cosf +; IOS-NO-STRET-NEXT: bl ___truncsfhf2 +; IOS-NO-STRET-NEXT: pop {lr} +; IOS-NO-STRET-NEXT: bx lr +; +; IOS-WITH-STRET-LABEL: test_sincos_f16_only_use_cos: +; IOS-WITH-STRET: @ %bb.0: +; IOS-WITH-STRET-NEXT: push {lr} +; IOS-WITH-STRET-NEXT: sub sp, sp, #8 +; IOS-WITH-STRET-NEXT: bl ___extendhfsf2 +; IOS-WITH-STRET-NEXT: mov r1, r0 +; IOS-WITH-STRET-NEXT: mov r0, sp +; IOS-WITH-STRET-NEXT: bl ___sincosf_stret +; IOS-WITH-STRET-NEXT: ldr r0, [sp, #4] +; IOS-WITH-STRET-NEXT: bl ___truncsfhf2 +; IOS-WITH-STRET-NEXT: add sp, sp, #8 +; IOS-WITH-STRET-NEXT: pop {lr} +; IOS-WITH-STRET-NEXT: bx lr %result = call { half, half } @llvm.sincos.f16(half %a) %result.1 = extractvalue { half, half } %result, 1 ret half %result.1 } define { <2 x half>, <2 x half> } @test_sincos_v2f16(<2 x half> %a) { -; CHECK-LABEL: test_sincos_v2f16: -; CHECK: @ %bb.0: -; CHECK-NEXT: push {r4, lr} -; CHECK-NEXT: vpush {d8} -; CHECK-NEXT: sub sp, #24 -; CHECK-NEXT: mov r4, r0 -; CHECK-NEXT: mov r0, r1 -; CHECK-NEXT: bl __gnu_h2f_ieee -; CHECK-NEXT: add r1, sp, #12 -; CHECK-NEXT: add r2, sp, #8 -; CHECK-NEXT: bl sincosf -; CHECK-NEXT: mov r0, r4 -; CHECK-NEXT: bl __gnu_h2f_ieee -; CHECK-NEXT: add r1, sp, #4 -; CHECK-NEXT: mov r2, sp -; CHECK-NEXT: bl sincosf -; CHECK-NEXT: ldr r0, [sp, #12] -; CHECK-NEXT: bl __gnu_f2h_ieee -; CHECK-NEXT: ldr r1, [sp, #4] -; CHECK-NEXT: strh.w r0, [sp, #22] -; CHECK-NEXT: mov r0, r1 -; CHECK-NEXT: bl __gnu_f2h_ieee -; CHECK-NEXT: strh.w r0, [sp, #20] -; CHECK-NEXT: add r0, sp, #20 -; CHECK-NEXT: vld1.32 {d8[0]}, [r0:32] -; CHECK-NEXT: ldr r0, [sp, #8] -; CHECK-NEXT: bl __gnu_f2h_ieee -; CHECK-NEXT: ldr r1, [sp] -; CHECK-NEXT: strh.w r0, [sp, #18] -; CHECK-NEXT: mov r0, r1 -; CHECK-NEXT: bl __gnu_f2h_ieee -; CHECK-NEXT: strh.w r0, [sp, #16] -; CHECK-NEXT: add r0, sp, #16 -; CHECK-NEXT: vmovl.u16 q9, d8 -; CHECK-NEXT: vld1.32 {d16[0]}, [r0:32] -; CHECK-NEXT: vmovl.u16 q8, d16 -; CHECK-NEXT: vmov.32 r0, d18[0] -; CHECK-NEXT: vmov.32 r1, d18[1] -; CHECK-NEXT: vmov.32 r2, d16[0] -; CHECK-NEXT: vmov.32 r3, d16[1] -; CHECK-NEXT: add sp, #24 -; CHECK-NEXT: vpop {d8} -; CHECK-NEXT: pop {r4, pc} +; GNU-LABEL: test_sincos_v2f16: +; GNU: @ %bb.0: +; GNU-NEXT: push {r4, lr} +; GNU-NEXT: vpush {d8} +; GNU-NEXT: sub sp, #24 +; GNU-NEXT: mov r4, r0 +; GNU-NEXT: mov r0, r1 +; GNU-NEXT: bl __gnu_h2f_ieee +; GNU-NEXT: add r1, sp, #12 +; GNU-NEXT: add r2, sp, #8 +; GNU-NEXT: bl sincosf +; GNU-NEXT: mov r0, r4 +; GNU-NEXT: bl __gnu_h2f_ieee +; GNU-NEXT: add r1, sp, #4 +; GNU-NEXT: mov r2, sp +; GNU-NEXT: bl sincosf +; GNU-NEXT: ldr r0, [sp, #12] +; GNU-NEXT: bl __gnu_f2h_ieee +; GNU-NEXT: ldr r1, [sp, #4] +; GNU-NEXT: strh.w r0, [sp, #22] +; GNU-NEXT: mov r0, r1 +; GNU-NEXT: bl __gnu_f2h_ieee +; GNU-NEXT: strh.w r0, [sp, #20] +; GNU-NEXT: add 
r0, sp, #20 +; GNU-NEXT: vld1.32 {d8[0]}, [r0:32] +; GNU-NEXT: ldr r0, [sp, #8] +; GNU-NEXT: bl __gnu_f2h_ieee +; GNU-NEXT: ldr r1, [sp] +; GNU-NEXT: strh.w r0, [sp, #18] +; GNU-NEXT: mov r0, r1 +; GNU-NEXT: bl __gnu_f2h_ieee +; GNU-NEXT: strh.w r0, [sp, #16] +; GNU-NEXT: add r0, sp, #16 +; GNU-NEXT: vmovl.u16 q9, d8 +; GNU-NEXT: vld1.32 {d16[0]}, [r0:32] +; GNU-NEXT: vmovl.u16 q8, d16 +; GNU-NEXT: vmov.32 r0, d18[0] +; GNU-NEXT: vmov.32 r1, d18[1] +; GNU-NEXT: vmov.32 r2, d16[0] +; GNU-NEXT: vmov.32 r3, d16[1] +; GNU-NEXT: add sp, #24 +; GNU-NEXT: vpop {d8} +; GNU-NEXT: pop {r4, pc} +; +; GNUEABI-LABEL: test_sincos_v2f16: +; GNUEABI: @ %bb.0: +; GNUEABI-NEXT: .save {r4, lr} +; GNUEABI-NEXT: push {r4, lr} +; GNUEABI-NEXT: .vsave {d8} +; GNUEABI-NEXT: vpush {d8} +; GNUEABI-NEXT: .pad #24 +; GNUEABI-NEXT: sub sp, sp, #24 +; GNUEABI-NEXT: mov r4, r0 +; GNUEABI-NEXT: mov r0, r1 +; GNUEABI-NEXT: bl __gnu_h2f_ieee +; GNUEABI-NEXT: add r1, sp, #12 +; GNUEABI-NEXT: add r2, sp, #8 +; GNUEABI-NEXT: bl sincosf +; GNUEABI-NEXT: mov r0, r4 +; GNUEABI-NEXT: bl __gnu_h2f_ieee +; GNUEABI-NEXT: add r1, sp, #4 +; GNUEABI-NEXT: mov r2, sp +; GNUEABI-NEXT: bl sincosf +; GNUEABI-NEXT: ldr r0, [sp, #12] +; GNUEABI-NEXT: bl __gnu_f2h_ieee +; GNUEABI-NEXT: ldr r1, [sp, #4] +; GNUEABI-NEXT: strh r0, [sp, #22] +; GNUEABI-NEXT: mov r0, r1 +; GNUEABI-NEXT: bl __gnu_f2h_ieee +; GNUEABI-NEXT: strh r0, [sp, #20] +; GNUEABI-NEXT: add r0, sp, #20 +; GNUEABI-NEXT: vld1.32 {d8[0]}, [r0:32] +; GNUEABI-NEXT: ldr r0, [sp, #8] +; GNUEABI-NEXT: bl __gnu_f2h_ieee +; GNUEABI-NEXT: ldr r1, [sp] +; GNUEABI-NEXT: strh r0, [sp, #18] +; GNUEABI-NEXT: mov r0, r1 +; GNUEABI-NEXT: bl __gnu_f2h_ieee +; GNUEABI-NEXT: strh r0, [sp, #16] +; GNUEABI-NEXT: add r0, sp, #16 +; GNUEABI-NEXT: vmovl.u16 q9, d8 +; GNUEABI-NEXT: vld1.32 {d16[0]}, [r0:32] +; GNUEABI-NEXT: vmovl.u16 q8, d16 +; GNUEABI-NEXT: vmov.32 r0, d18[0] +; GNUEABI-NEXT: vmov.32 r1, d18[1] +; GNUEABI-NEXT: vmov.32 r2, d16[0] +; GNUEABI-NEXT: vmov.32 r3, d16[1] +; GNUEABI-NEXT: add sp, sp, #24 +; GNUEABI-NEXT: vpop {d8} +; GNUEABI-NEXT: pop {r4, pc} +; +; IOS-NO-STRET-LABEL: test_sincos_v2f16: +; IOS-NO-STRET: @ %bb.0: +; IOS-NO-STRET-NEXT: push {r4, r5, lr} +; IOS-NO-STRET-NEXT: vpush {d8} +; IOS-NO-STRET-NEXT: sub sp, sp, #8 +; IOS-NO-STRET-NEXT: mov r5, r0 +; IOS-NO-STRET-NEXT: mov r0, r1 +; IOS-NO-STRET-NEXT: bl ___extendhfsf2 +; IOS-NO-STRET-NEXT: mov r4, r0 +; IOS-NO-STRET-NEXT: bl _sinf +; IOS-NO-STRET-NEXT: bl ___truncsfhf2 +; IOS-NO-STRET-NEXT: strh r0, [sp, #6] +; IOS-NO-STRET-NEXT: mov r0, r5 +; IOS-NO-STRET-NEXT: bl ___extendhfsf2 +; IOS-NO-STRET-NEXT: mov r5, r0 +; IOS-NO-STRET-NEXT: bl _sinf +; IOS-NO-STRET-NEXT: bl ___truncsfhf2 +; IOS-NO-STRET-NEXT: strh r0, [sp, #4] +; IOS-NO-STRET-NEXT: add r0, sp, #4 +; IOS-NO-STRET-NEXT: vld1.32 {d8[0]}, [r0:32] +; IOS-NO-STRET-NEXT: mov r0, r4 +; IOS-NO-STRET-NEXT: bl _cosf +; IOS-NO-STRET-NEXT: bl ___truncsfhf2 +; IOS-NO-STRET-NEXT: strh r0, [sp, #2] +; IOS-NO-STRET-NEXT: mov r0, r5 +; IOS-NO-STRET-NEXT: bl _cosf +; IOS-NO-STRET-NEXT: bl ___truncsfhf2 +; IOS-NO-STRET-NEXT: strh r0, [sp] +; IOS-NO-STRET-NEXT: mov r0, sp +; IOS-NO-STRET-NEXT: vld1.32 {d16[0]}, [r0:32] +; IOS-NO-STRET-NEXT: vmovl.u16 q9, d8 +; IOS-NO-STRET-NEXT: vmovl.u16 q8, d16 +; IOS-NO-STRET-NEXT: vmov.32 r0, d18[0] +; IOS-NO-STRET-NEXT: vmov.32 r1, d18[1] +; IOS-NO-STRET-NEXT: vmov.32 r2, d16[0] +; IOS-NO-STRET-NEXT: vmov.32 r3, d16[1] +; IOS-NO-STRET-NEXT: add sp, sp, #8 +; IOS-NO-STRET-NEXT: vpop {d8} +; IOS-NO-STRET-NEXT: pop {r4, r5, pc} +; +; 
IOS-WITH-STRET-LABEL: test_sincos_v2f16: +; IOS-WITH-STRET: @ %bb.0: +; IOS-WITH-STRET-NEXT: push {r4, r5, lr} +; IOS-WITH-STRET-NEXT: vpush {d8} +; IOS-WITH-STRET-NEXT: sub sp, sp, #24 +; IOS-WITH-STRET-NEXT: mov r4, r0 +; IOS-WITH-STRET-NEXT: mov r0, r1 +; IOS-WITH-STRET-NEXT: bl ___extendhfsf2 +; IOS-WITH-STRET-NEXT: mov r1, r0 +; IOS-WITH-STRET-NEXT: add r0, sp, #8 +; IOS-WITH-STRET-NEXT: bl ___sincosf_stret +; IOS-WITH-STRET-NEXT: mov r0, r4 +; IOS-WITH-STRET-NEXT: bl ___extendhfsf2 +; IOS-WITH-STRET-NEXT: mov r1, r0 +; IOS-WITH-STRET-NEXT: mov r0, sp +; IOS-WITH-STRET-NEXT: bl ___sincosf_stret +; IOS-WITH-STRET-NEXT: ldr r0, [sp, #8] +; IOS-WITH-STRET-NEXT: ldr r4, [sp, #12] +; IOS-WITH-STRET-NEXT: bl ___truncsfhf2 +; IOS-WITH-STRET-NEXT: ldm sp, {r1, r5} +; IOS-WITH-STRET-NEXT: strh r0, [sp, #22] +; IOS-WITH-STRET-NEXT: mov r0, r1 +; IOS-WITH-STRET-NEXT: bl ___truncsfhf2 +; IOS-WITH-STRET-NEXT: strh r0, [sp, #20] +; IOS-WITH-STRET-NEXT: add r0, sp, #20 +; IOS-WITH-STRET-NEXT: vld1.32 {d8[0]}, [r0:32] +; IOS-WITH-STRET-NEXT: mov r0, r4 +; IOS-WITH-STRET-NEXT: bl ___truncsfhf2 +; IOS-WITH-STRET-NEXT: strh r0, [sp, #18] +; IOS-WITH-STRET-NEXT: mov r0, r5 +; IOS-WITH-STRET-NEXT: bl ___truncsfhf2 +; IOS-WITH-STRET-NEXT: strh r0, [sp, #16] +; IOS-WITH-STRET-NEXT: add r0, sp, #16 +; IOS-WITH-STRET-NEXT: vmovl.u16 q9, d8 +; IOS-WITH-STRET-NEXT: vld1.32 {d16[0]}, [r0:32] +; IOS-WITH-STRET-NEXT: vmovl.u16 q8, d16 +; IOS-WITH-STRET-NEXT: vmov.32 r0, d18[0] +; IOS-WITH-STRET-NEXT: vmov.32 r1, d18[1] +; IOS-WITH-STRET-NEXT: vmov.32 r2, d16[0] +; IOS-WITH-STRET-NEXT: vmov.32 r3, d16[1] +; IOS-WITH-STRET-NEXT: add sp, sp, #24 +; IOS-WITH-STRET-NEXT: vpop {d8} +; IOS-WITH-STRET-NEXT: pop {r4, r5, pc} %result = call { <2 x half>, <2 x half> } @llvm.sincos.v2f16(<2 x half> %a) ret { <2 x half>, <2 x half> } %result } define { float, float } @test_sincos_f32(float %a) { -; CHECK-LABEL: test_sincos_f32: -; CHECK: @ %bb.0: -; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: sub sp, #8 -; CHECK-NEXT: add r1, sp, #4 -; CHECK-NEXT: mov r2, sp -; CHECK-NEXT: bl sincosf -; CHECK-NEXT: ldrd r1, r0, [sp], #8 -; CHECK-NEXT: pop {r7, pc} +; GNU-LABEL: test_sincos_f32: +; GNU: @ %bb.0: +; GNU-NEXT: push {r7, lr} +; GNU-NEXT: sub sp, #8 +; GNU-NEXT: add r1, sp, #4 +; GNU-NEXT: mov r2, sp +; GNU-NEXT: bl sincosf +; GNU-NEXT: ldrd r1, r0, [sp], #8 +; GNU-NEXT: pop {r7, pc} +; +; GNUEABI-LABEL: test_sincos_f32: +; GNUEABI: @ %bb.0: +; GNUEABI-NEXT: .save {r11, lr} +; GNUEABI-NEXT: push {r11, lr} +; GNUEABI-NEXT: .pad #8 +; GNUEABI-NEXT: sub sp, sp, #8 +; GNUEABI-NEXT: add r1, sp, #4 +; GNUEABI-NEXT: mov r2, sp +; GNUEABI-NEXT: bl sincosf +; GNUEABI-NEXT: ldr r0, [sp, #4] +; GNUEABI-NEXT: ldr r1, [sp], #8 +; GNUEABI-NEXT: pop {r11, pc} +; +; IOS-NO-STRET-LABEL: test_sincos_f32: +; IOS-NO-STRET: @ %bb.0: +; IOS-NO-STRET-NEXT: push {r4, r5, lr} +; IOS-NO-STRET-NEXT: mov r4, r0 +; IOS-NO-STRET-NEXT: bl _sinf +; IOS-NO-STRET-NEXT: mov r5, r0 +; IOS-NO-STRET-NEXT: mov r0, r4 +; IOS-NO-STRET-NEXT: bl _cosf +; IOS-NO-STRET-NEXT: mov r1, r0 +; IOS-NO-STRET-NEXT: mov r0, r5 +; IOS-NO-STRET-NEXT: pop {r4, r5, pc} +; +; IOS-WITH-STRET-LABEL: test_sincos_f32: +; IOS-WITH-STRET: @ %bb.0: +; IOS-WITH-STRET-NEXT: push {lr} +; IOS-WITH-STRET-NEXT: sub sp, sp, #8 +; IOS-WITH-STRET-NEXT: mov r1, r0 +; IOS-WITH-STRET-NEXT: mov r0, sp +; IOS-WITH-STRET-NEXT: bl ___sincosf_stret +; IOS-WITH-STRET-NEXT: pop {r0, r1} +; IOS-WITH-STRET-NEXT: pop {lr} +; IOS-WITH-STRET-NEXT: bx lr %result = call { float, float } @llvm.sincos.f32(float %a) ret { 
float, float } %result } define { <2 x float>, <2 x float> } @test_sincos_v2f32(<2 x float> %a) { -; CHECK-LABEL: test_sincos_v2f32: -; CHECK: @ %bb.0: -; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: vpush {d8} -; CHECK-NEXT: sub sp, #16 -; CHECK-NEXT: vmov d8, r0, r1 -; CHECK-NEXT: add r1, sp, #4 -; CHECK-NEXT: mov r2, sp -; CHECK-NEXT: vmov r0, s17 -; CHECK-NEXT: bl sincosf -; CHECK-NEXT: vmov r0, s16 -; CHECK-NEXT: add r1, sp, #12 -; CHECK-NEXT: add r2, sp, #8 -; CHECK-NEXT: bl sincosf -; CHECK-NEXT: vldr s1, [sp, #4] -; CHECK-NEXT: vldr s3, [sp] -; CHECK-NEXT: vldr s0, [sp, #12] -; CHECK-NEXT: vldr s2, [sp, #8] -; CHECK-NEXT: vmov r0, r1, d0 -; CHECK-NEXT: vmov r2, r3, d1 -; CHECK-NEXT: add sp, #16 -; CHECK-NEXT: vpop {d8} -; CHECK-NEXT: pop {r7, pc} +; GNU-LABEL: test_sincos_v2f32: +; GNU: @ %bb.0: +; GNU-NEXT: push {r7, lr} +; GNU-NEXT: vpush {d8} +; GNU-NEXT: sub sp, #16 +; GNU-NEXT: vmov d8, r0, r1 +; GNU-NEXT: add r1, sp, #4 +; GNU-NEXT: mov r2, sp +; GNU-NEXT: vmov r0, s17 +; GNU-NEXT: bl sincosf +; GNU-NEXT: vmov r0, s16 +; GNU-NEXT: add r1, sp, #12 +; GNU-NEXT: add r2, sp, #8 +; GNU-NEXT: bl sincosf +; GNU-NEXT: vldr s1, [sp, #4] +; GNU-NEXT: vldr s3, [sp] +; GNU-NEXT: vldr s0, [sp, #12] +; GNU-NEXT: vldr s2, [sp, #8] +; GNU-NEXT: vmov r0, r1, d0 +; GNU-NEXT: vmov r2, r3, d1 +; GNU-NEXT: add sp, #16 +; GNU-NEXT: vpop {d8} +; GNU-NEXT: pop {r7, pc} +; +; GNUEABI-LABEL: test_sincos_v2f32: +; GNUEABI: @ %bb.0: +; GNUEABI-NEXT: .save {r11, lr} +; GNUEABI-NEXT: push {r11, lr} +; GNUEABI-NEXT: .vsave {d8} +; GNUEABI-NEXT: vpush {d8} +; GNUEABI-NEXT: .pad #16 +; GNUEABI-NEXT: sub sp, sp, #16 +; GNUEABI-NEXT: vmov d8, r0, r1 +; GNUEABI-NEXT: add r1, sp, #4 +; GNUEABI-NEXT: mov r2, sp +; GNUEABI-NEXT: vmov r0, s17 +; GNUEABI-NEXT: bl sincosf +; GNUEABI-NEXT: vmov r0, s16 +; GNUEABI-NEXT: add r1, sp, #12 +; GNUEABI-NEXT: add r2, sp, #8 +; GNUEABI-NEXT: bl sincosf +; GNUEABI-NEXT: vldr s1, [sp, #4] +; GNUEABI-NEXT: vldr s3, [sp] +; GNUEABI-NEXT: vldr s0, [sp, #12] +; GNUEABI-NEXT: vldr s2, [sp, #8] +; GNUEABI-NEXT: vmov r0, r1, d0 +; GNUEABI-NEXT: vmov r2, r3, d1 +; GNUEABI-NEXT: add sp, sp, #16 +; GNUEABI-NEXT: vpop {d8} +; GNUEABI-NEXT: pop {r11, pc} +; +; IOS-NO-STRET-LABEL: test_sincos_v2f32: +; IOS-NO-STRET: @ %bb.0: +; IOS-NO-STRET-NEXT: push {r4, r5, r6, r7, lr} +; IOS-NO-STRET-NEXT: vpush {d8} +; IOS-NO-STRET-NEXT: vmov d8, r0, r1 +; IOS-NO-STRET-NEXT: vmov r4, s17 +; IOS-NO-STRET-NEXT: mov r0, r4 +; IOS-NO-STRET-NEXT: bl _sinf +; IOS-NO-STRET-NEXT: mov r5, r0 +; IOS-NO-STRET-NEXT: mov r0, r4 +; IOS-NO-STRET-NEXT: bl _cosf +; IOS-NO-STRET-NEXT: vmov r6, s16 +; IOS-NO-STRET-NEXT: mov r4, r0 +; IOS-NO-STRET-NEXT: mov r0, r6 +; IOS-NO-STRET-NEXT: bl _sinf +; IOS-NO-STRET-NEXT: mov r7, r0 +; IOS-NO-STRET-NEXT: mov r0, r6 +; IOS-NO-STRET-NEXT: bl _cosf +; IOS-NO-STRET-NEXT: mov r2, r0 +; IOS-NO-STRET-NEXT: mov r0, r7 +; IOS-NO-STRET-NEXT: mov r1, r5 +; IOS-NO-STRET-NEXT: mov r3, r4 +; IOS-NO-STRET-NEXT: vpop {d8} +; IOS-NO-STRET-NEXT: pop {r4, r5, r6, r7, pc} +; +; IOS-WITH-STRET-LABEL: test_sincos_v2f32: +; IOS-WITH-STRET: @ %bb.0: +; IOS-WITH-STRET-NEXT: push {lr} +; IOS-WITH-STRET-NEXT: vpush {d8} +; IOS-WITH-STRET-NEXT: sub sp, sp, #16 +; IOS-WITH-STRET-NEXT: vmov d8, r0, r1 +; IOS-WITH-STRET-NEXT: mov r0, sp +; IOS-WITH-STRET-NEXT: vmov r1, s17 +; IOS-WITH-STRET-NEXT: bl ___sincosf_stret +; IOS-WITH-STRET-NEXT: vmov r1, s16 +; IOS-WITH-STRET-NEXT: add r0, sp, #8 +; IOS-WITH-STRET-NEXT: bl ___sincosf_stret +; IOS-WITH-STRET-NEXT: vldr s1, [sp] +; IOS-WITH-STRET-NEXT: vldr s3, [sp, #4] 
+; IOS-WITH-STRET-NEXT: vldr s0, [sp, #8] +; IOS-WITH-STRET-NEXT: vldr s2, [sp, #12] +; IOS-WITH-STRET-NEXT: vmov r0, r1, d0 +; IOS-WITH-STRET-NEXT: vmov r2, r3, d1 +; IOS-WITH-STRET-NEXT: add sp, sp, #16 +; IOS-WITH-STRET-NEXT: vpop {d8} +; IOS-WITH-STRET-NEXT: pop {lr} +; IOS-WITH-STRET-NEXT: bx lr %result = call { <2 x float>, <2 x float> } @llvm.sincos.v2f32(<2 x float> %a) ret { <2 x float>, <2 x float> } %result } define { double, double } @test_sincos_f64(double %a) { -; CHECK-LABEL: test_sincos_f64: -; CHECK: @ %bb.0: -; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: sub sp, #16 -; CHECK-NEXT: add r2, sp, #8 -; CHECK-NEXT: mov r3, sp -; CHECK-NEXT: bl sincos -; CHECK-NEXT: ldrd r0, r1, [sp, #8] -; CHECK-NEXT: ldrd r2, r3, [sp], #16 -; CHECK-NEXT: pop {r7, pc} +; GNU-LABEL: test_sincos_f64: +; GNU: @ %bb.0: +; GNU-NEXT: push {r7, lr} +; GNU-NEXT: sub sp, #16 +; GNU-NEXT: add r2, sp, #8 +; GNU-NEXT: mov r3, sp +; GNU-NEXT: bl sincos +; GNU-NEXT: ldrd r0, r1, [sp, #8] +; GNU-NEXT: ldrd r2, r3, [sp], #16 +; GNU-NEXT: pop {r7, pc} +; +; GNUEABI-LABEL: test_sincos_f64: +; GNUEABI: @ %bb.0: +; GNUEABI-NEXT: .save {r11, lr} +; GNUEABI-NEXT: push {r11, lr} +; GNUEABI-NEXT: .pad #16 +; GNUEABI-NEXT: sub sp, sp, #16 +; GNUEABI-NEXT: add r2, sp, #8 +; GNUEABI-NEXT: mov r3, sp +; GNUEABI-NEXT: bl sincos +; GNUEABI-NEXT: ldm sp, {r2, r3} +; GNUEABI-NEXT: ldr r0, [sp, #8] +; GNUEABI-NEXT: ldr r1, [sp, #12] +; GNUEABI-NEXT: add sp, sp, #16 +; GNUEABI-NEXT: pop {r11, pc} +; +; IOS-NO-STRET-LABEL: test_sincos_f64: +; IOS-NO-STRET: @ %bb.0: +; IOS-NO-STRET-NEXT: push {r4, r5, r6, r7, lr} +; IOS-NO-STRET-NEXT: mov r4, r1 +; IOS-NO-STRET-NEXT: mov r5, r0 +; IOS-NO-STRET-NEXT: bl _sin +; IOS-NO-STRET-NEXT: mov r6, r0 +; IOS-NO-STRET-NEXT: mov r7, r1 +; IOS-NO-STRET-NEXT: mov r0, r5 +; IOS-NO-STRET-NEXT: mov r1, r4 +; IOS-NO-STRET-NEXT: bl _cos +; IOS-NO-STRET-NEXT: mov r2, r0 +; IOS-NO-STRET-NEXT: mov r3, r1 +; IOS-NO-STRET-NEXT: mov r0, r6 +; IOS-NO-STRET-NEXT: mov r1, r7 +; IOS-NO-STRET-NEXT: pop {r4, r5, r6, r7, pc} +; +; IOS-WITH-STRET-LABEL: test_sincos_f64: +; IOS-WITH-STRET: @ %bb.0: +; IOS-WITH-STRET-NEXT: push {lr} +; IOS-WITH-STRET-NEXT: sub sp, sp, #16 +; IOS-WITH-STRET-NEXT: mov r2, r1 +; IOS-WITH-STRET-NEXT: mov r1, r0 +; IOS-WITH-STRET-NEXT: mov r0, sp +; IOS-WITH-STRET-NEXT: bl ___sincos_stret +; IOS-WITH-STRET-NEXT: vldr d16, [sp, #8] +; IOS-WITH-STRET-NEXT: ldm sp, {r0, r1} +; IOS-WITH-STRET-NEXT: vmov r2, r3, d16 +; IOS-WITH-STRET-NEXT: add sp, sp, #16 +; IOS-WITH-STRET-NEXT: pop {lr} +; IOS-WITH-STRET-NEXT: bx lr %result = call { double, double } @llvm.sincos.f64(double %a) ret { double, double } %result } define { <2 x double>, <2 x double> } @test_sincos_v2f64(<2 x double> %a) { -; CHECK-LABEL: test_sincos_v2f64: -; CHECK: @ %bb.0: -; CHECK-NEXT: push {r4, lr} -; CHECK-NEXT: sub sp, #32 -; CHECK-NEXT: mov r1, r3 -; CHECK-NEXT: mov r12, r2 -; CHECK-NEXT: add r2, sp, #24 -; CHECK-NEXT: add r3, sp, #16 -; CHECK-NEXT: mov r4, r0 -; CHECK-NEXT: mov r0, r12 -; CHECK-NEXT: bl sincos -; CHECK-NEXT: ldrd r0, r1, [sp, #40] -; CHECK-NEXT: add r2, sp, #8 -; CHECK-NEXT: mov r3, sp -; CHECK-NEXT: bl sincos -; CHECK-NEXT: vldr d19, [sp, #8] -; CHECK-NEXT: vldr d18, [sp, #24] -; CHECK-NEXT: vldr d17, [sp] -; CHECK-NEXT: vldr d16, [sp, #16] -; CHECK-NEXT: vst1.64 {d18, d19}, [r4]! 
-; CHECK-NEXT: vst1.64 {d16, d17}, [r4] -; CHECK-NEXT: add sp, #32 -; CHECK-NEXT: pop {r4, pc} +; GNU-LABEL: test_sincos_v2f64: +; GNU: @ %bb.0: +; GNU-NEXT: push {r4, lr} +; GNU-NEXT: sub sp, #32 +; GNU-NEXT: mov r1, r3 +; GNU-NEXT: mov r12, r2 +; GNU-NEXT: add r2, sp, #24 +; GNU-NEXT: add r3, sp, #16 +; GNU-NEXT: mov r4, r0 +; GNU-NEXT: mov r0, r12 +; GNU-NEXT: bl sincos +; GNU-NEXT: ldrd r0, r1, [sp, #40] +; GNU-NEXT: add r2, sp, #8 +; GNU-NEXT: mov r3, sp +; GNU-NEXT: bl sincos +; GNU-NEXT: vldr d19, [sp, #8] +; GNU-NEXT: vldr d18, [sp, #24] +; GNU-NEXT: vldr d17, [sp] +; GNU-NEXT: vldr d16, [sp, #16] +; GNU-NEXT: vst1.64 {d18, d19}, [r4]! +; GNU-NEXT: vst1.64 {d16, d17}, [r4] +; GNU-NEXT: add sp, #32 +; GNU-NEXT: pop {r4, pc} +; +; GNUEABI-LABEL: test_sincos_v2f64: +; GNUEABI: @ %bb.0: +; GNUEABI-NEXT: .save {r4, lr} +; GNUEABI-NEXT: push {r4, lr} +; GNUEABI-NEXT: .pad #32 +; GNUEABI-NEXT: sub sp, sp, #32 +; GNUEABI-NEXT: mov r1, r3 +; GNUEABI-NEXT: mov r12, r2 +; GNUEABI-NEXT: add r2, sp, #24 +; GNUEABI-NEXT: add r3, sp, #16 +; GNUEABI-NEXT: mov r4, r0 +; GNUEABI-NEXT: mov r0, r12 +; GNUEABI-NEXT: bl sincos +; GNUEABI-NEXT: ldr r0, [sp, #40] +; GNUEABI-NEXT: add r2, sp, #8 +; GNUEABI-NEXT: ldr r1, [sp, #44] +; GNUEABI-NEXT: mov r3, sp +; GNUEABI-NEXT: bl sincos +; GNUEABI-NEXT: vldr d19, [sp, #8] +; GNUEABI-NEXT: vldr d18, [sp, #24] +; GNUEABI-NEXT: vldr d17, [sp] +; GNUEABI-NEXT: vldr d16, [sp, #16] +; GNUEABI-NEXT: vst1.64 {d18, d19}, [r4]! +; GNUEABI-NEXT: vst1.64 {d16, d17}, [r4] +; GNUEABI-NEXT: add sp, sp, #32 +; GNUEABI-NEXT: pop {r4, pc} +; +; IOS-NO-STRET-LABEL: test_sincos_v2f64: +; IOS-NO-STRET: @ %bb.0: +; IOS-NO-STRET-NEXT: push {r4, r5, r6, r7, r8, r10, r11, lr} +; IOS-NO-STRET-NEXT: vpush {d8, d9, d10, d11} +; IOS-NO-STRET-NEXT: ldr r8, [sp, #64] +; IOS-NO-STRET-NEXT: mov r7, r1 +; IOS-NO-STRET-NEXT: mov r4, r0 +; IOS-NO-STRET-NEXT: mov r0, r3 +; IOS-NO-STRET-NEXT: mov r6, r3 +; IOS-NO-STRET-NEXT: mov r10, r2 +; IOS-NO-STRET-NEXT: mov r1, r8 +; IOS-NO-STRET-NEXT: bl _sin +; IOS-NO-STRET-NEXT: mov r11, r0 +; IOS-NO-STRET-NEXT: mov r5, r1 +; IOS-NO-STRET-NEXT: mov r0, r6 +; IOS-NO-STRET-NEXT: mov r1, r8 +; IOS-NO-STRET-NEXT: bl _cos +; IOS-NO-STRET-NEXT: vmov d9, r0, r1 +; IOS-NO-STRET-NEXT: mov r0, r7 +; IOS-NO-STRET-NEXT: mov r1, r10 +; IOS-NO-STRET-NEXT: vmov d11, r11, r5 +; IOS-NO-STRET-NEXT: bl _sin +; IOS-NO-STRET-NEXT: vmov d10, r0, r1 +; IOS-NO-STRET-NEXT: mov r0, r7 +; IOS-NO-STRET-NEXT: mov r1, r10 +; IOS-NO-STRET-NEXT: bl _cos +; IOS-NO-STRET-NEXT: vmov d8, r0, r1 +; IOS-NO-STRET-NEXT: vst1.32 {d10, d11}, [r4]! +; IOS-NO-STRET-NEXT: vst1.32 {d8, d9}, [r4] +; IOS-NO-STRET-NEXT: vpop {d8, d9, d10, d11} +; IOS-NO-STRET-NEXT: pop {r4, r5, r6, r7, r8, r10, r11, pc} +; +; IOS-WITH-STRET-LABEL: test_sincos_v2f64: +; IOS-WITH-STRET: @ %bb.0: +; IOS-WITH-STRET-NEXT: push {r4, r5, r6, lr} +; IOS-WITH-STRET-NEXT: sub sp, sp, #32 +; IOS-WITH-STRET-NEXT: mov r4, r2 +; IOS-WITH-STRET-NEXT: ldr r2, [sp, #48] +; IOS-WITH-STRET-NEXT: mov r6, r0 +; IOS-WITH-STRET-NEXT: add r0, sp, #16 +; IOS-WITH-STRET-NEXT: mov r5, r1 +; IOS-WITH-STRET-NEXT: mov r1, r3 +; IOS-WITH-STRET-NEXT: bl ___sincos_stret +; IOS-WITH-STRET-NEXT: mov r0, sp +; IOS-WITH-STRET-NEXT: mov r1, r5 +; IOS-WITH-STRET-NEXT: mov r2, r4 +; IOS-WITH-STRET-NEXT: bl ___sincos_stret +; IOS-WITH-STRET-NEXT: vldr d17, [sp, #16] +; IOS-WITH-STRET-NEXT: vldr d16, [sp] +; IOS-WITH-STRET-NEXT: vldr d19, [sp, #24] +; IOS-WITH-STRET-NEXT: vldr d18, [sp, #8] +; IOS-WITH-STRET-NEXT: vst1.32 {d16, d17}, [r6]! 
+; IOS-WITH-STRET-NEXT: vst1.32 {d18, d19}, [r6] +; IOS-WITH-STRET-NEXT: add sp, sp, #32 +; IOS-WITH-STRET-NEXT: pop {r4, r5, r6, pc} %result = call { <2 x double>, <2 x double> } @llvm.sincos.v2f64(<2 x double> %a) ret { <2 x double>, <2 x double> } %result } define { fp128, fp128 } @test_sincos_f128(fp128 %a) { -; CHECK-LABEL: test_sincos_f128: -; CHECK: @ %bb.0: -; CHECK-NEXT: push {r4, r5, r7, lr} -; CHECK-NEXT: sub sp, #40 -; CHECK-NEXT: mov r12, r3 -; CHECK-NEXT: ldr r3, [sp, #56] -; CHECK-NEXT: add.w lr, sp, #8 -; CHECK-NEXT: mov r4, r0 -; CHECK-NEXT: add r0, sp, #24 -; CHECK-NEXT: strd r0, lr, [sp] -; CHECK-NEXT: mov r0, r1 -; CHECK-NEXT: mov r1, r2 -; CHECK-NEXT: mov r2, r12 -; CHECK-NEXT: bl sincosl -; CHECK-NEXT: ldrd r2, r3, [sp, #16] -; CHECK-NEXT: ldrd r12, r1, [sp, #8] -; CHECK-NEXT: str r3, [r4, #28] -; CHECK-NEXT: ldrd r3, r5, [sp, #32] -; CHECK-NEXT: ldrd lr, r0, [sp, #24] -; CHECK-NEXT: strd r1, r2, [r4, #20] -; CHECK-NEXT: add.w r1, r4, #8 -; CHECK-NEXT: stm.w r1, {r3, r5, r12} -; CHECK-NEXT: strd lr, r0, [r4] -; CHECK-NEXT: add sp, #40 -; CHECK-NEXT: pop {r4, r5, r7, pc} +; GNU-LABEL: test_sincos_f128: +; GNU: @ %bb.0: +; GNU-NEXT: push {r4, r5, r7, lr} +; GNU-NEXT: sub sp, #40 +; GNU-NEXT: mov r12, r3 +; GNU-NEXT: ldr r3, [sp, #56] +; GNU-NEXT: add.w lr, sp, #8 +; GNU-NEXT: mov r4, r0 +; GNU-NEXT: add r0, sp, #24 +; GNU-NEXT: strd r0, lr, [sp] +; GNU-NEXT: mov r0, r1 +; GNU-NEXT: mov r1, r2 +; GNU-NEXT: mov r2, r12 +; GNU-NEXT: bl sincosl +; GNU-NEXT: ldrd r2, r3, [sp, #16] +; GNU-NEXT: ldrd r12, r1, [sp, #8] +; GNU-NEXT: str r3, [r4, #28] +; GNU-NEXT: ldrd r3, r5, [sp, #32] +; GNU-NEXT: ldrd lr, r0, [sp, #24] +; GNU-NEXT: strd r1, r2, [r4, #20] +; GNU-NEXT: add.w r1, r4, #8 +; GNU-NEXT: stm.w r1, {r3, r5, r12} +; GNU-NEXT: strd lr, r0, [r4] +; GNU-NEXT: add sp, #40 +; GNU-NEXT: pop {r4, r5, r7, pc} +; +; GNUEABI-LABEL: test_sincos_f128: +; GNUEABI: @ %bb.0: +; GNUEABI-NEXT: .save {r4, r5, r11, lr} +; GNUEABI-NEXT: push {r4, r5, r11, lr} +; GNUEABI-NEXT: .pad #40 +; GNUEABI-NEXT: sub sp, sp, #40 +; GNUEABI-NEXT: mov r12, r3 +; GNUEABI-NEXT: ldr r3, [sp, #56] +; GNUEABI-NEXT: mov r4, r0 +; GNUEABI-NEXT: add r0, sp, #24 +; GNUEABI-NEXT: add r5, sp, #8 +; GNUEABI-NEXT: stm sp, {r0, r5} +; GNUEABI-NEXT: mov r0, r1 +; GNUEABI-NEXT: mov r1, r2 +; GNUEABI-NEXT: mov r2, r12 +; GNUEABI-NEXT: bl sincosl +; GNUEABI-NEXT: add r3, sp, #12 +; GNUEABI-NEXT: ldr r12, [sp, #8] +; GNUEABI-NEXT: ldm r3, {r1, r2, r3} +; GNUEABI-NEXT: str r3, [r4, #28] +; GNUEABI-NEXT: ldr r0, [sp, #32] +; GNUEABI-NEXT: ldr lr, [sp, #24] +; GNUEABI-NEXT: ldr r5, [sp, #28] +; GNUEABI-NEXT: ldr r3, [sp, #36] +; GNUEABI-NEXT: str r2, [r4, #24] +; GNUEABI-NEXT: str r1, [r4, #20] +; GNUEABI-NEXT: add r1, r4, #8 +; GNUEABI-NEXT: stm r1, {r0, r3, r12} +; GNUEABI-NEXT: str r5, [r4, #4] +; GNUEABI-NEXT: str lr, [r4] +; GNUEABI-NEXT: add sp, sp, #40 +; GNUEABI-NEXT: pop {r4, r5, r11, pc} +; +; IOS-LABEL: test_sincos_f128: +; IOS: @ %bb.0: +; IOS-NEXT: push {r4, r5, r6, r7, r8, lr} +; IOS-NEXT: ldr r8, [sp, #24] +; IOS-NEXT: mov r4, r0 +; IOS-NEXT: mov r5, r3 +; IOS-NEXT: mov r6, r2 +; IOS-NEXT: mov r7, r1 +; IOS-NEXT: mov r0, r1 +; IOS-NEXT: mov r1, r2 +; IOS-NEXT: mov r2, r3 +; IOS-NEXT: mov r3, r8 +; IOS-NEXT: bl _cosl +; IOS-NEXT: add r9, r4, #16 +; IOS-NEXT: stm r9, {r0, r1, r2, r3} +; IOS-NEXT: mov r0, r7 +; IOS-NEXT: mov r1, r6 +; IOS-NEXT: mov r2, r5 +; IOS-NEXT: mov r3, r8 +; IOS-NEXT: bl _sinl +; IOS-NEXT: stm r4, {r0, r1, r2, r3} +; IOS-NEXT: pop {r4, r5, r6, r7, r8, pc} %result = call { fp128, fp128 } 
@llvm.sincos.f128(fp128 %a)
  ret { fp128, fp128 } %result
}

From f7fff18ad09680056f028a99a961d4120063c55b Mon Sep 17 00:00:00 2001
From: jeanPerier <jperier@nvidia.com>
Date: Tue, 4 Nov 2025 01:02:36 +0100
Subject: [PATCH 107/313] [mlir][OpenACC] add unstructured attributes for
 acc.loop with early exits (#164990)

The "!$acc loop" directive may be placed above loops with early exits.
Currently, flang lowers loops with early exits to explicit control flow
(this may be revisited when MLIR allows early exits in structured
regions). In such cases, the acc loop directive cannot simply be ignored
during lowering, because it may hold data clauses that should be applied
when reaching that point.

This patch adds an "unstructured" attribute to acc.loop to support that
case. An acc.loop with this attribute may hold data operands but must
have no controls. It is expected that the loop logic is implemented in
its body in a way that the acc dialect may not understand. Such an
acc.loop is just a container, and the loop with early exits is executed
sequentially.
---
 mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td |  9 ++++++++-
 mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp         |  8 ++++++--
 mlir/test/Dialect/OpenACC/invalid.mlir          |  9 +++++++++
 mlir/test/Dialect/OpenACC/ops.mlir              | 14 ++++++++++++++
 4 files changed, 37 insertions(+), 3 deletions(-)

diff --git a/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td b/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td
index 2f4517ddfe754..c689b7e46ea9e 100644
--- a/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td
+++ b/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td
@@ -2557,6 +2557,12 @@ def OpenACC_LoopOp : OpenACC_Op<"loop",
     device-type-aware getter methods. When modifying these operands, the
     corresponding `device_type` attributes must be updated to maintain
     consistency between operands and their target device types.
+
+    The `unstructured` attribute indicates that the loops inside the OpenACC
+    construct contain early exits and cannot be lowered to structured MLIR
+    operations. When this flag is set, the acc.loop should have no induction
+    variables and the loop must be implemented via explicit control flow
+    inside its body.
   }];

   let arguments = (ins
@@ -2590,7 +2596,8 @@ def OpenACC_LoopOp : OpenACC_Op<"loop",
     OptionalAttr<SymbolRefArrayAttr>:$firstprivatizationRecipes,
     Variadic<AnyType>:$reductionOperands,
     OptionalAttr<SymbolRefArrayAttr>:$reductionRecipes,
-    OptionalAttr<OpenACC_CombinedConstructsAttr>:$combined
+    OptionalAttr<OpenACC_CombinedConstructsAttr>:$combined,
+    UnitAttr:$unstructured
   );

   let results = (outs Variadic<AnyType>:$results);

diff --git a/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp b/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp
index 35eba724a9059..b2f1d840f3bca 100644
--- a/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp
+++ b/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp
@@ -3068,8 +3068,12 @@ LogicalResult acc::LoopOp::verify() {
   if (getRegion().empty())
     return emitError("expected non-empty body.");

-  // When it is container-like - it is expected to hold a loop-like operation.
-  if (isContainerLike()) {
+  if (getUnstructured()) {
+    if (!isContainerLike())
+      return emitError(
+          "unstructured acc.loop must not have induction variables");
+  } else if (isContainerLike()) {
+    // When it is container-like - it is expected to hold a loop-like operation.
     // Obtain the maximum collapse count - we use this to check that there
     // are enough loops contained.
    uint64_t collapseCount = getCollapseValue().value_or(1);

diff --git a/mlir/test/Dialect/OpenACC/invalid.mlir b/mlir/test/Dialect/OpenACC/invalid.mlir
index 26b63fbe182ea..0e75894eaeceb 100644
--- a/mlir/test/Dialect/OpenACC/invalid.mlir
+++ b/mlir/test/Dialect/OpenACC/invalid.mlir
@@ -492,6 +492,15 @@ func.func @fct1(%0 : !llvm.ptr) -> () {

 // -----

+%i1 = arith.constant 1 : i32
+%i2 = arith.constant 10 : i32
+// expected-error@+1 {{unstructured acc.loop must not have induction variables}}
+acc.loop control(%iv : i32) = (%i1 : i32) to (%i2 : i32) step (%i1 : i32) {
+  acc.yield
+} attributes {independent = [#acc.device_type<none>], unstructured}
+
+// -----
+
 // expected-error@+1 {{expect at least one of num, dim or static values}}
 acc.loop gang({}) {
   "test.openacc_dummy_op"() : () -> ()

diff --git a/mlir/test/Dialect/OpenACC/ops.mlir b/mlir/test/Dialect/OpenACC/ops.mlir
index 042ee2503cb95..df8ab9b7dd239 100644
--- a/mlir/test/Dialect/OpenACC/ops.mlir
+++ b/mlir/test/Dialect/OpenACC/ops.mlir
@@ -2143,6 +2143,20 @@ func.func @acc_loop_container() {

 // -----

+func.func @acc_unstructured_loop() {
+  acc.loop {
+    acc.yield
+  } attributes {independent = [#acc.device_type<none>], unstructured}
+  return
+}
+
+// CHECK-LABEL: func.func @acc_unstructured_loop
+// CHECK: acc.loop
+// CHECK: acc.yield
+// CHECK: } attributes {independent = [#acc.device_type<none>], unstructured}
+
+// -----
+
 // Test private recipe with data bounds for array slicing
 acc.private.recipe @privatization_memref_slice : memref<10x10xf32> init {
 ^bb0(%arg0: memref<10x10xf32>, %bounds0: !acc.data_bounds_ty, %bounds1: !acc.data_bounds_ty):

From 02d93f7abbd555d93eae94c670fa0aba95758f00 Mon Sep 17 00:00:00 2001
From: barsolo2000 <barsolo@meta.com>
Date: Mon, 3 Nov 2025 17:22:35 -0800
Subject: [PATCH 108/313] [RISCV] Adding `vlenb` register as callee register
 (#165796)

In recent debug sessions we noticed that the GDB debugger shows more of
the stack trace than LLDB. After enabling the unwinding log, the issue
appears to be that the CFA depends on the value of vlenb. Since vlenb
does not change at runtime, its value from frame 0 can be assumed for
all outer frames (see the illustrative sketch below).

Co-authored-by: Bar Soloveychik <barsolo@fb.com>
---
 lldb/source/Plugins/ABI/RISCV/ABISysV_riscv.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/lldb/source/Plugins/ABI/RISCV/ABISysV_riscv.cpp b/lldb/source/Plugins/ABI/RISCV/ABISysV_riscv.cpp
index ff37b48d86ca8..a5547a4699ca9 100644
--- a/lldb/source/Plugins/ABI/RISCV/ABISysV_riscv.cpp
+++ b/lldb/source/Plugins/ABI/RISCV/ABISysV_riscv.cpp
@@ -798,6 +798,8 @@ bool ABISysV_riscv::RegisterIsCalleeSaved(const RegisterInfo *reg_info) {
           .Cases({"f8", "f9", "f18", "f19", "f20", "f21", "f22", "f23"},
                  is_hw_fp)
           .Cases({"f24", "f25", "f26", "f27"}, is_hw_fp)
+          // vlenb is constant and needed for vector unwinding.
+          .Case("vlenb", true)
           .Default(false);

   return is_callee_saved;

From a22d1c2225543aa9ae7882f6b1a97ee7b2c95574 Mon Sep 17 00:00:00 2001
From: Aiden Grossman <aidengrossman@google.com>
Date: Tue, 4 Nov 2025 01:31:18 +0000
Subject: [PATCH 109/313] Revert "[Polly] Introduce PhaseManager and remove
 LPM support (#125442)"

This reverts commit e987ab11a6f3d3965ef26fc42c82db3e8b1d56f5.

This broke premerge:
1. https://lab.llvm.org/staging/#/builders/192/builds/9521
2. https://github.com/llvm/llvm-project/actions/runs/19054182009

Notably, this did not break inside the PR; it is not exactly clear why.
I realize that there is a lot of test churn here, but the changes are
largely in Polly, where commit frequency is much lower, so a reapply of
the patch should be clean.
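
To make the vlenb reasoning from the lldb patch above concrete, here is a
minimal C++ sketch of a vlenb-dependent CFA rule. This is not LLDB code: the
rule shape (CFA = SP + Multiplier * VLENB + Offset), the names, and the
constants are all assumptions chosen for illustration only.

    #include <cstdint>
    #include <iostream>

    // Hypothetical CFA rule of the shape: CFA = SP + Multiplier * VLENB + Offset.
    // Frames that spill RVV registers can yield such vlenb-dependent rules.
    struct CfaRule {
      uint64_t Multiplier;
      uint64_t Offset;
    };

    uint64_t computeCfa(const CfaRule &Rule, uint64_t SP, uint64_t Vlenb) {
      return SP + Rule.Multiplier * Vlenb + Rule.Offset;
    }

    int main() {
      // vlenb = VLEN / 8, e.g. 16 for a 128-bit vector unit. Because the
      // register is constant at runtime, the value captured in frame 0 can be
      // reused when evaluating CFA rules for outer frames; if vlenb is not
      // treated as callee-saved, the unwinder discards it after frame 0 and
      // the backtrace stops early.
      const uint64_t VlenbFromFrame0 = 16;
      const CfaRule Rule{/*Multiplier=*/2, /*Offset=*/32};
      std::cout << std::hex << "CFA = 0x"
                << computeCfa(Rule, /*SP=*/0x7ffff000, VlenbFromFrame0) << '\n';
      return 0;
    }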
--- polly/docs/ReleaseNotes.rst | 4 - polly/include/polly/Canonicalization.h | 8 + polly/include/polly/CodeGen/CodeGeneration.h | 3 - polly/include/polly/CodeGen/IslAst.h | 36 +- polly/include/polly/CodePreparation.h | 6 - polly/include/polly/DeLICM.h | 11 +- polly/include/polly/DeadCodeElimination.h | 13 +- polly/include/polly/DependenceInfo.h | 106 ++++- polly/include/polly/FlattenSchedule.h | 16 +- polly/include/polly/ForwardOpTree.h | 20 +- polly/include/polly/JSONExporter.h | 13 +- polly/include/polly/LinkAllPasses.h | 156 +++++++ polly/include/polly/MaximalStaticExpansion.h | 2 - polly/include/polly/Pass/PhaseManager.h | 127 ------ polly/include/polly/Pass/PollyFunctionPass.h | 32 -- polly/include/polly/Pass/PollyModulePass.h | 30 -- polly/include/polly/PruneUnprofitable.h | 12 +- polly/include/polly/RegisterPasses.h | 2 + polly/include/polly/ScheduleOptimizer.h | 16 +- polly/include/polly/ScopDetection.h | 27 ++ polly/include/polly/ScopGraphPrinter.h | 3 - polly/include/polly/ScopInfo.h | 76 ++++ polly/include/polly/ScopInliner.h | 6 + polly/include/polly/ScopPass.h | 28 ++ polly/include/polly/Simplify.h | 23 +- .../include/polly/Support/DumpFunctionPass.h | 12 + polly/include/polly/Support/DumpModulePass.h | 15 +- polly/include/polly/Support/ScopHelper.h | 8 + polly/lib/Analysis/DependenceInfo.cpp | 220 ++++++++- polly/lib/Analysis/PruneUnprofitable.cpp | 32 +- polly/lib/Analysis/ScopBuilder.cpp | 1 - polly/lib/Analysis/ScopDetection.cpp | 107 +++++ polly/lib/Analysis/ScopGraphPrinter.cpp | 120 ++++- polly/lib/Analysis/ScopInfo.cpp | 228 ++++++++++ polly/lib/Analysis/ScopPass.cpp | 36 ++ polly/lib/CMakeLists.txt | 3 - polly/lib/CodeGen/CodeGeneration.cpp | 72 ++- polly/lib/CodeGen/IslAst.cpp | 101 ++++- polly/lib/Exchange/JSONExporter.cpp | 160 ++++++- polly/lib/Pass/PhaseManager.cpp | 424 ------------------ polly/lib/Pass/PollyFunctionPass.cpp | 22 - polly/lib/Pass/PollyModulePass.cpp | 29 -- polly/lib/Support/DumpFunctionPass.cpp | 41 ++ polly/lib/Support/DumpModulePass.cpp | 47 ++ polly/lib/Support/PollyPasses.def | 25 +- polly/lib/Support/RegisterPasses.cpp | 401 ++++++----------- polly/lib/Support/ScopHelper.cpp | 12 + polly/lib/Transform/Canonicalization.cpp | 65 +++ polly/lib/Transform/CodePreparation.cpp | 69 +++ polly/lib/Transform/DeLICM.cpp | 125 ++++-- polly/lib/Transform/DeadCodeElimination.cpp | 44 +- polly/lib/Transform/FlattenSchedule.cpp | 139 ++++-- polly/lib/Transform/ForwardOpTree.cpp | 131 ++++-- .../lib/Transform/MaximalStaticExpansion.cpp | 81 +++- polly/lib/Transform/ScheduleOptimizer.cpp | 152 ++++++- polly/lib/Transform/ScopInliner.cpp | 46 ++ polly/lib/Transform/Simplify.cpp | 101 ++++- polly/test/CodeGen/20100617.ll | 2 +- polly/test/CodeGen/20100622.ll | 4 +- polly/test/CodeGen/20100707.ll | 2 +- polly/test/CodeGen/20100707_2.ll | 2 +- polly/test/CodeGen/20100708.ll | 2 +- polly/test/CodeGen/20100708_2.ll | 2 +- polly/test/CodeGen/20100713.ll | 2 +- polly/test/CodeGen/20100713_2.ll | 2 +- polly/test/CodeGen/20100717.ll | 2 +- polly/test/CodeGen/20100718-DomInfo-2.ll | 2 +- polly/test/CodeGen/20100718-DomInfo.ll | 2 +- .../CodeGen/20100720-MultipleConditions.ll | 2 +- .../test/CodeGen/20100809-IndependentBlock.ll | 2 +- ...0100811-ScalarDependencyBetweenBrAndCnd.ll | 2 +- polly/test/CodeGen/20101030-Overflow.ll | 2 +- polly/test/CodeGen/20101103-Overflow3.ll | 2 +- polly/test/CodeGen/20101103-signmissmatch.ll | 2 +- .../test/CodeGen/20110226-Ignore-Dead-Code.ll | 2 +- .../test/CodeGen/20110226-PHI-Node-removed.ll | 2 +- 
polly/test/CodeGen/20120316-InvalidCast.ll | 2 +- .../CodeGen/20120403-RHS-type-mismatch.ll | 2 +- polly/test/CodeGen/20130221.ll | 2 +- .../20150328-SCEVExpanderIntroducesNewIV.ll | 2 +- polly/test/CodeGen/Intrinsics/llvm-expect.ll | 2 +- .../do_not_mutate_debug_info.ll | 2 +- .../loop_nest_param_parallel.ll | 2 +- .../single_loop_param_parallel.ll | 4 +- polly/test/CodeGen/MemAccess/bad_alignment.ll | 2 +- .../MemAccess/codegen_address_space.ll | 2 +- .../MemAccess/codegen_constant_offset.ll | 2 +- .../test/CodeGen/MemAccess/codegen_simple.ll | 2 +- .../CodeGen/MemAccess/codegen_simple_float.ll | 2 +- .../CodeGen/MemAccess/codegen_simple_md.ll | 4 +- .../MemAccess/codegen_simple_md_float.ll | 4 +- polly/test/CodeGen/MemAccess/create_arrays.ll | 4 +- .../CodeGen/MemAccess/create_arrays_heap.ll | 4 +- .../default_aligned_new_access_function.ll | 2 +- .../test/CodeGen/MemAccess/different_types.ll | 4 +- polly/test/CodeGen/MemAccess/generate-all.ll | 6 +- .../CodeGen/MemAccess/invariant_base_ptr.ll | 4 +- .../CodeGen/MemAccess/map_scalar_access.ll | 4 +- .../test/CodeGen/MemAccess/multiple_types.ll | 4 +- polly/test/CodeGen/MemAccess/simple.ll | 2 +- .../test/CodeGen/MemAccess/simple_analyze.ll | 2 +- .../MemAccess/update_access_functions.ll | 4 +- .../CodeGen/Metadata/basic_vec_annotate.ll | 2 +- polly/test/CodeGen/OpenMP/alias-metadata.ll | 2 +- .../floord-as-argument-to-subfunction.ll | 2 +- polly/test/CodeGen/OpenMP/inlineasm.ll | 2 +- .../invariant_base_pointer_preloaded.ll | 3 +- ...ant_base_pointer_preloaded_different_bb.ll | 3 +- ...base_pointer_preloaded_pass_only_needed.ll | 3 +- .../invariant_base_pointers_preloaded.ll | 3 +- .../OpenMP/loop-body-references-outer-iv.ll | 4 +- .../loop-body-references-outer-values-2.ll | 4 +- .../loop-body-references-outer-values-3.ll | 4 +- .../loop-body-references-outer-values.ll | 4 +- .../OpenMP/loop-bounds-reference-outer-ids.ll | 4 +- .../test/CodeGen/OpenMP/mapped-phi-access.ll | 2 +- polly/test/CodeGen/OpenMP/matmul-parallel.ll | 4 +- .../CodeGen/OpenMP/new_multidim_access.ll | 8 +- polly/test/CodeGen/OpenMP/recomputed-srem.ll | 3 +- ...ference-argument-from-non-affine-region.ll | 19 +- .../test/CodeGen/OpenMP/reference-other-bb.ll | 2 +- .../OpenMP/reference-preceeding-loop.ll | 4 +- polly/test/CodeGen/OpenMP/reference_latest.ll | 2 +- polly/test/CodeGen/OpenMP/scev-rewriting.ll | 2 +- polly/test/CodeGen/OpenMP/single_loop.ll | 18 +- ...single_loop_with_loop_invariant_baseptr.ll | 4 +- .../CodeGen/OpenMP/single_loop_with_param.ll | 16 +- ...o-parallel-loops-reference-outer-indvar.ll | 4 +- polly/test/CodeGen/PHIInExit.ll | 2 +- .../combine_different_values.ll | 4 +- .../RuntimeDebugBuilder/stmt_tracing.ll | 2 +- polly/test/CodeGen/alias-check-multi-dim.ll | 3 +- .../CodeGen/alias_metadata_too_many_arrays.ll | 3 +- ...aliasing_different_base_and_access_type.ll | 2 +- .../aliasing_different_pointer_types.ll | 2 +- .../aliasing_multidimensional_access.ll | 2 +- .../CodeGen/aliasing_parametric_simple_1.ll | 2 +- .../CodeGen/aliasing_parametric_simple_2.ll | 2 +- polly/test/CodeGen/aliasing_struct_element.ll | 2 +- polly/test/CodeGen/alignment.ll | 2 +- polly/test/CodeGen/annotated_alias_scopes.ll | 2 +- polly/test/CodeGen/blas_sscal_simplified.ll | 2 +- ...code-hosting-and-escape-map-computation.ll | 2 +- polly/test/CodeGen/constant_condition.ll | 2 +- polly/test/CodeGen/create-conditional-scop.ll | 2 +- ...d_instruction_referenced_by_parameter_1.ll | 2 +- ...d_instruction_referenced_by_parameter_2.ll | 2 +- 
polly/test/CodeGen/debug-intrinsics.ll | 8 +- ...nce_problem_after_early_codegen_bailout.ll | 2 +- polly/test/CodeGen/empty_domain_in_context.ll | 2 +- polly/test/CodeGen/entry_with_trivial_phi.ll | 2 +- .../entry_with_trivial_phi_other_bb.ll | 2 +- .../error-stmt-in-non-affine-region.ll | 2 +- ...or_block_contains_invalid_memory_access.ll | 2 +- polly/test/CodeGen/exprModDiv.ll | 7 +- .../hoisted_load_escapes_through_phi.ll | 6 +- polly/test/CodeGen/hoisting_1.ll | 2 +- polly/test/CodeGen/hoisting_2.ll | 2 +- polly/test/CodeGen/inner_scev_sdiv_1.ll | 2 +- polly/test/CodeGen/inner_scev_sdiv_2.ll | 2 +- polly/test/CodeGen/inner_scev_sdiv_3.ll | 2 +- polly/test/CodeGen/inner_scev_sdiv_in_lb.ll | 4 +- .../inner_scev_sdiv_in_lb_invariant.ll | 3 +- polly/test/CodeGen/inner_scev_sdiv_in_rtc.ll | 3 +- polly/test/CodeGen/intrinsics_lifetime.ll | 2 +- polly/test/CodeGen/intrinsics_misc.ll | 2 +- .../inv-load-lnt-crash-wrong-order-2.ll | 3 +- .../inv-load-lnt-crash-wrong-order-3.ll | 3 +- .../CodeGen/inv-load-lnt-crash-wrong-order.ll | 3 +- .../test/CodeGen/invariant-load-dimension.ll | 4 +- ...-load-preload-base-pointer-origin-first.ll | 2 +- .../CodeGen/invariant_cannot_handle_void.ll | 4 +- polly/test/CodeGen/invariant_load.ll | 2 +- .../CodeGen/invariant_load_address_space.ll | 2 +- .../CodeGen/invariant_load_alias_metadata.ll | 3 +- .../CodeGen/invariant_load_base_pointer.ll | 2 +- ...invariant_load_base_pointer_conditional.ll | 2 +- ...variant_load_base_pointer_conditional_2.ll | 6 +- ...ariant_load_canonicalize_array_baseptrs.ll | 4 +- .../test/CodeGen/invariant_load_condition.ll | 2 +- .../invariant_load_different_sized_types.ll | 3 +- polly/test/CodeGen/invariant_load_escaping.ll | 2 +- .../invariant_load_escaping_second_scop.ll | 2 +- .../invariant_load_in_non_affine_subregion.ll | 2 +- polly/test/CodeGen/invariant_load_loop_ub.ll | 2 +- ...ant_load_not_executed_but_in_parameters.ll | 2 +- .../test/CodeGen/invariant_load_outermost.ll | 2 +- ...riant_load_parameters_cyclic_dependence.ll | 4 +- .../CodeGen/invariant_load_ptr_ptr_noalias.ll | 2 +- .../test/CodeGen/invariant_load_scalar_dep.ll | 2 +- ...riant_load_scalar_escape_alloca_sharing.ll | 2 +- ...oads_from_struct_with_different_types_1.ll | 2 +- ...oads_from_struct_with_different_types_2.ll | 2 +- ...invariant_loads_ignore_parameter_bounds.ll | 3 +- .../invariant_verify_function_failed.ll | 2 +- .../invariant_verify_function_failed_2.ll | 4 +- polly/test/CodeGen/issue56692.ll | 2 +- .../large-numbers-in-boundary-context.ll | 2 +- .../test/CodeGen/load_subset_with_context.ll | 2 +- .../loop-invariant-load-type-mismatch.ll | 2 +- polly/test/CodeGen/loop_with_condition.ll | 2 +- polly/test/CodeGen/loop_with_condition_2.ll | 2 +- .../test/CodeGen/loop_with_condition_ineq.ll | 2 +- .../CodeGen/loop_with_condition_nested.ll | 4 +- ..._conditional_entry_edge_split_hard_case.ll | 2 +- polly/test/CodeGen/memcpy_annotations.ll | 2 +- .../multidim-non-matching-typesize-2.ll | 3 +- .../CodeGen/multidim-non-matching-typesize.ll | 3 +- ..._2d_parametric_array_static_loop_bounds.ll | 2 +- polly/test/CodeGen/multidim_alias_check.ll | 2 +- polly/test/CodeGen/multiple-codegens.ll | 4 +- polly/test/CodeGen/multiple-scops-in-a-row.ll | 2 +- .../multiple-types-invariant-load-2.ll | 3 +- .../CodeGen/multiple-types-invariant-load.ll | 3 +- .../multiple_sai_fro_same_base_address.ll | 4 +- polly/test/CodeGen/no-overflow-tracking.ll | 4 +- polly/test/CodeGen/no_guard_bb.ll | 2 +- ...non-affine-dominance-generated-entering.ll | 2 +- 
.../CodeGen/non-affine-exit-node-dominance.ll | 2 +- .../non-affine-phi-node-expansion-2.ll | 3 +- .../non-affine-phi-node-expansion-3.ll | 3 +- .../non-affine-phi-node-expansion-4.ll | 3 +- .../CodeGen/non-affine-phi-node-expansion.ll | 3 +- ...e-region-exit-phi-incoming-synthesize-2.ll | 2 +- ...ine-region-exit-phi-incoming-synthesize.ll | 2 +- .../non-affine-region-implicit-store.ll | 2 +- ...ine-region-phi-references-in-scop-value.ll | 3 +- .../non-affine-subregion-dominance-reuse.ll | 3 +- polly/test/CodeGen/non-affine-switch.ll | 3 +- .../non-affine-synthesized-in-branch.ll | 2 +- polly/test/CodeGen/non-affine-update.ll | 3 +- .../non-hoisted-load-needed-as-base-ptr.ll | 2 +- .../test/CodeGen/non_affine_float_compare.ll | 4 +- .../CodeGen/only_non_affine_error_region.ll | 2 +- polly/test/CodeGen/openmp_limit_threads.ll | 12 +- .../test/CodeGen/out-of-scop-phi-node-use.ll | 2 +- polly/test/CodeGen/param_div_div_div_2.ll | 4 +- polly/test/CodeGen/partial_write_array.ll | 2 +- polly/test/CodeGen/partial_write_emptyset.ll | 2 +- ...l_write_full_write_that_appears_partial.ll | 2 +- .../partial_write_impossible_restriction.ll | 2 +- polly/test/CodeGen/partial_write_in_region.ll | 5 +- .../partial_write_in_region_with_loop.ll | 5 +- .../CodeGen/partial_write_mapped_scalar.ll | 2 +- .../partial_write_mapped_scalar_subregion.ll | 2 +- polly/test/CodeGen/perf_monitoring.ll | 3 +- .../perf_monitoring_cycles_per_scop.ll | 3 +- .../perf_monitoring_trip_counts_per_scop.ll | 3 +- polly/test/CodeGen/phi-defined-before-scop.ll | 2 +- .../phi_after_error_block_outside_of_scop.ll | 2 +- .../test/CodeGen/phi_condition_modeling_1.ll | 2 +- .../test/CodeGen/phi_condition_modeling_2.ll | 2 +- .../test/CodeGen/phi_conditional_simple_1.ll | 4 +- .../phi_in_exit_early_lnt_failure_1.ll | 2 +- .../phi_in_exit_early_lnt_failure_2.ll | 2 +- .../phi_in_exit_early_lnt_failure_3.ll | 2 +- .../phi_in_exit_early_lnt_failure_5.ll | 2 +- polly/test/CodeGen/phi_loop_carried_float.ll | 2 +- .../CodeGen/phi_loop_carried_float_escape.ll | 6 +- polly/test/CodeGen/phi_scalar_simple_1.ll | 2 +- polly/test/CodeGen/phi_scalar_simple_2.ll | 2 +- .../CodeGen/phi_with_multi_exiting_edges_2.ll | 2 +- polly/test/CodeGen/phi_with_one_exit_edge.ll | 2 +- .../CodeGen/pointer-type-expressions-2.ll | 4 +- .../test/CodeGen/pointer-type-expressions.ll | 4 +- .../pointer-type-pointer-type-comparison.ll | 4 +- polly/test/CodeGen/pointer_rem.ll | 4 +- polly/test/CodeGen/pr25241.ll | 2 +- polly/test/CodeGen/ptrtoint_as_parameter.ll | 2 +- polly/test/CodeGen/read-only-scalars.ll | 8 +- polly/test/CodeGen/reduction.ll | 2 +- polly/test/CodeGen/reduction_2.ll | 2 +- polly/test/CodeGen/reduction_simple_binary.ll | 2 +- polly/test/CodeGen/reggen_domtree_crash.ll | 2 +- .../test/CodeGen/region-with-instructions.ll | 2 +- polly/test/CodeGen/region_exiting-domtree.ll | 2 +- .../CodeGen/region_multiexit_partialwrite.ll | 2 +- ...run-time-condition-with-scev-parameters.ll | 4 +- polly/test/CodeGen/run-time-condition.ll | 2 +- .../scalar-references-used-in-scop-compute.ll | 2 +- .../test/CodeGen/scalar-store-from-same-bb.ll | 3 +- polly/test/CodeGen/scalar_codegen_crash.ll | 3 +- polly/test/CodeGen/scev-backedgetaken.ll | 2 +- .../CodeGen/scev-division-invariant-load.ll | 2 +- polly/test/CodeGen/scev.ll | 2 +- .../CodeGen/scev_expansion_in_nonaffine.ll | 3 +- .../CodeGen/scev_looking_through_bitcasts.ll | 2 +- .../CodeGen/scop_expander_insert_point.ll | 3 +- polly/test/CodeGen/scop_expander_segfault.ll | 2 +- ...p_never_executed_runtime_check_location.ll 
| 2 +- polly/test/CodeGen/select-base-pointer.ll | 2 +- polly/test/CodeGen/sequential_loops.ll | 2 +- .../CodeGen/simple_loop_non_single_exit.ll | 2 +- .../CodeGen/simple_loop_non_single_exit_2.ll | 2 +- polly/test/CodeGen/simple_non_single_entry.ll | 2 +- polly/test/CodeGen/simple_nonaffine_loop.ll | 2 +- .../single_do_loop_int_max_iterations.ll | 2 +- .../single_do_loop_int_param_iterations.ll | 2 +- .../single_do_loop_ll_max_iterations.ll | 4 +- .../CodeGen/single_do_loop_one_iteration.ll | 2 +- .../CodeGen/single_do_loop_scev_replace.ll | 2 +- polly/test/CodeGen/single_loop.ll | 2 +- .../CodeGen/single_loop_int_max_iterations.ll | 2 +- .../CodeGen/single_loop_ll_max_iterations.ll | 2 +- .../test/CodeGen/single_loop_one_iteration.ll | 2 +- polly/test/CodeGen/single_loop_param.ll | 2 +- .../CodeGen/single_loop_param_less_equal.ll | 6 +- .../CodeGen/single_loop_param_less_than.ll | 4 +- .../CodeGen/single_loop_zero_iterations.ll | 2 +- polly/test/CodeGen/split_edge_of_exit.ll | 4 +- polly/test/CodeGen/split_edges.ll | 2 +- polly/test/CodeGen/split_edges_2.ll | 2 +- polly/test/CodeGen/srem-in-other-bb.ll | 3 +- .../stack-overflow-in-load-hoisting.ll | 3 +- .../test/CodeGen/stmt_split_no_dependence.ll | 2 +- .../CodeGen/switch-in-non-affine-region.ll | 3 +- .../synthesizable_phi_write_after_loop.ll | 2 +- .../test-invalid-operands-for-select-2.ll | 2 +- .../test-invalid-operands-for-select.ll | 2 +- polly/test/CodeGen/test.ll | 2 +- .../two-loops-right-after-each-other-2.ll | 2 +- .../two-scops-in-row-invalidate-scevs.ll | 2 +- polly/test/CodeGen/two-scops-in-row.ll | 4 +- polly/test/CodeGen/udiv_expansion_position.ll | 2 +- .../CodeGen/uninitialized_scalar_memory.ll | 2 +- .../unpredictable-loop-unsynthesizable.ll | 6 +- .../test/CodeGen/variant_load_empty_domain.ll | 2 +- .../whole-scop-non-affine-subregion.ll | 3 +- polly/test/DeLICM/confused_order.ll | 4 +- ...ontradicting_assumed_context_and_domain.ll | 2 +- polly/test/DeLICM/load-in-cond-inf-loop.ll | 2 +- polly/test/DeLICM/map_memset_zero.ll | 4 +- polly/test/DeLICM/nomap_alreadymapped.ll | 2 +- polly/test/DeLICM/nomap_escaping.ll | 2 +- polly/test/DeLICM/nomap_occupied.ll | 2 +- polly/test/DeLICM/nomap_readonly.ll | 2 +- polly/test/DeLICM/nomap_spuriouswrite.ll | 2 +- polly/test/DeLICM/nomap_storagesize.ll | 2 +- polly/test/DeLICM/nomap_writewrite.ll | 2 +- polly/test/DeLICM/outofquota-reverseDomain.ll | 2 +- polly/test/DeLICM/pass_existence.ll | 6 +- polly/test/DeLICM/pr41656.ll | 2 +- polly/test/DeLICM/pr48783.ll | 2 +- polly/test/DeLICM/reduction.ll | 2 +- .../DeLICM/reduction_constant_selfconflict.ll | 2 +- polly/test/DeLICM/reduction_looprotate.ll | 2 +- .../reduction_looprotate_alwaystaken.ll | 2 +- .../DeLICM/reduction_looprotate_gvnpre.ll | 4 +- .../reduction_looprotate_gvnpre_cond1.ll | 2 +- .../reduction_looprotate_gvnpre_cond2.ll | 2 +- ...reduction_looprotate_gvnpre_nopreheader.ll | 2 +- .../DeLICM/reduction_looprotate_hoisted.ll | 2 +- .../test/DeLICM/reduction_looprotate_licm.ll | 2 +- .../test/DeLICM/reduction_looprotate_licm2.ll | 2 +- .../reduction_looprotate_licm_double_write.ll | 5 +- .../reduction_looprotate_licm_nopreheader.ll | 2 +- .../test/DeLICM/reduction_looprotate_load.ll | 2 +- .../reduction_looprotate_loopguard_gvnpre.ll | 2 +- .../reduction_looprotate_loopguard_licm1.ll | 2 +- .../reduction_looprotate_loopguard_licm2.ll | 2 +- .../reduction_looprotate_loopguard_licm3.ll | 2 +- .../DeLICM/reduction_looprotate_readonly.ll | 2 +- .../reduction_looprotate_synthesizable.ll | 2 +- 
.../test/DeLICM/reduction_looprotate_undef.ll | 2 +- .../test/DeLICM/reduction_overapproximate.ll | 6 +- polly/test/DeLICM/reduction_preheader.ll | 2 +- .../test/DeLICM/reduction_unrelatedunusual.ll | 2 +- polly/test/DeLICM/reject_loadafterstore.ll | 2 +- polly/test/DeLICM/reject_outofquota.ll | 4 +- polly/test/DeLICM/reject_storeafterstore.ll | 2 +- polly/test/DeLICM/reject_storeinsubregion.ll | 2 +- polly/test/DeLICM/reject_unusualstore.ll | 4 +- polly/test/DeLICM/skip_maywrite.ll | 2 +- polly/test/DeLICM/skip_multiaccess.ll | 2 +- polly/test/DeLICM/skip_notinloop.ll | 2 +- polly/test/DeLICM/skip_scalaraccess.ll | 2 +- .../DeadCodeElimination/chained_iterations.ll | 4 +- .../chained_iterations_2.ll | 4 +- polly/test/DeadCodeElimination/computeout.ll | 4 +- .../dead_iteration_elimination.ll | 2 +- .../non-affine-affine-mix.ll | 2 +- polly/test/DeadCodeElimination/non-affine.ll | 2 +- .../test/DeadCodeElimination/null_schedule.ll | 2 +- polly/test/DependenceInfo/computeout.ll | 4 +- .../different_schedule_dimensions.ll | 3 +- polly/test/DependenceInfo/do_pluto_matmult.ll | 4 +- polly/test/DependenceInfo/fine_grain_dep_0.ll | 4 +- .../generate_may_write_dependence_info.ll | 2 +- .../test/DependenceInfo/infeasible_context.ll | 6 +- ...writes_do_not_block_must_writes_for_war.ll | 2 +- .../nonaffine-condition-buildMemoryAccess.ll | 2 +- .../reduction_complex_location.ll | 6 +- ...ndences_equal_non_reduction_dependences.ll | 2 +- .../reduction_dependences_not_null.ll | 2 +- .../reduction_indirect_access.ll | 2 +- ...reduction_and_non_reduction_dependences.ll | 2 +- .../reduction_multiple_loops_array_sum.ll | 6 +- .../reduction_multiple_loops_array_sum_2.ll | 2 +- .../reduction_multiple_loops_array_sum_3.ll | 2 +- .../reduction_multiple_reductions.ll | 2 +- .../reduction_multiple_reductions_2.ll | 2 +- .../reduction_only_reduction_like_access.ll | 2 +- ...lly_escaping_intermediate_in_other_stmt.ll | 2 +- .../reduction_privatization_deps.ll | 2 +- .../reduction_privatization_deps_2.ll | 2 +- .../reduction_privatization_deps_3.ll | 2 +- .../reduction_privatization_deps_4.ll | 2 +- .../reduction_privatization_deps_5.ll | 2 +- .../test/DependenceInfo/reduction_sequence.ll | 2 +- .../DependenceInfo/reduction_simple_iv.ll | 2 +- ...ion_simple_iv_debug_wrapped_dependences.ll | 2 +- .../reduction_simple_privatization_deps_2.ll | 2 +- ...n_simple_privatization_deps_w_parameter.ll | 2 +- ...duction_two_reductions_different_rloops.ll | 2 +- polly/test/DependenceInfo/sequential_loops.ll | 6 +- polly/test/FlattenSchedule/gemm.ll | 2 +- polly/test/ForwardOpTree/atax.ll | 2 +- polly/test/ForwardOpTree/changed-kind.ll | 2 +- .../test/ForwardOpTree/forward_from_region.ll | 2 +- polly/test/ForwardOpTree/forward_hoisted.ll | 2 +- .../test/ForwardOpTree/forward_instruction.ll | 2 +- .../test/ForwardOpTree/forward_into_region.ll | 2 +- .../forward_into_region_redundant_use.ll | 2 +- polly/test/ForwardOpTree/forward_load.ll | 2 +- .../forward_load_differentarray.ll | 2 +- .../forward_load_double_write.ll | 2 +- .../ForwardOpTree/forward_load_fromloop.ll | 2 +- .../ForwardOpTree/forward_load_indirect.ll | 2 +- .../forward_load_memset_after.ll | 2 +- .../forward_load_memset_before.ll | 2 +- .../ForwardOpTree/forward_load_tripleuse.ll | 2 +- .../forward_load_unrelatedunusual.ll | 2 +- polly/test/ForwardOpTree/forward_phi_load.ll | 2 +- polly/test/ForwardOpTree/forward_readonly.ll | 4 +- polly/test/ForwardOpTree/forward_reusue.ll | 2 +- polly/test/ForwardOpTree/forward_store.ll | 2 +- 
.../forward_synthesizable_definloop.ll | 2 +- .../forward_synthesizable_indvar.ll | 2 +- .../forward_synthesizable_useinloop.ll | 2 +- .../test/ForwardOpTree/forward_transitive.ll | 2 +- polly/test/ForwardOpTree/jacobi-1d.ll | 2 +- .../ForwardOpTree/noforward_from_region.ll | 2 +- .../noforward_load_conditional.ll | 2 +- .../noforward_load_writebetween.ll | 2 +- .../ForwardOpTree/noforward_outofquota.ll | 4 +- polly/test/ForwardOpTree/noforward_partial.ll | 2 +- polly/test/ForwardOpTree/noforward_phi.ll | 2 +- .../ForwardOpTree/noforward_selfrefphi.ll | 2 +- .../ForwardOpTree/noforward_sideffects.ll | 2 +- .../noforward_synthesizable_unknownit.ll | 2 +- polly/test/ForwardOpTree/out-of-quota1.ll | 2 +- .../OpenMP/multiple_loops_outer_parallel.ll | 2 +- .../OpenMP/nested_loop_both_parallel.ll | 2 +- .../nested_loop_both_parallel_parametric.ll | 2 +- .../OpenMP/nested_loop_inner_parallel.ll | 2 +- .../OpenMP/nested_loop_outer_parallel.ll | 2 +- .../OpenMP/single_loop_param_non_parallel.ll | 2 +- .../OpenMP/single_loop_param_parallel.ll | 2 +- .../single_loop_param_parallel_computeout.ll | 2 +- .../alias_checks_with_empty_context.ll | 3 +- polly/test/IstAstInfo/alias_simple_1.ll | 10 +- polly/test/IstAstInfo/alias_simple_2.ll | 12 +- polly/test/IstAstInfo/alias_simple_3.ll | 10 +- .../aliasing_arrays_with_identical_base.ll | 4 +- .../aliasing_multiple_alias_groups.ll | 4 +- .../aliasing_parametric_simple_1.ll | 2 +- .../aliasing_parametric_simple_2.ll | 2 +- .../dependence_distance_constant.ll | 2 +- .../IstAstInfo/dependence_distance_minimal.ll | 2 +- .../dependence_distance_multiple_constant.ll | 2 +- .../dependence_distance_parametric.ll | 2 +- .../dependence_distance_parametric_expr.ll | 2 +- .../IstAstInfo/dependence_distance_varying.ll | 2 +- ...pendence_distance_varying_in_outer_loop.ll | 2 +- .../dependence_distance_varying_multiple.ll | 2 +- .../domain_bounded_only_with_context.ll | 2 +- polly/test/IstAstInfo/non_affine_access.ll | 2 +- ...duction_clauses_multidimensional_access.ll | 2 +- ...reduction_clauses_onedimensional_access.ll | 2 +- ...ndences_equal_non_reduction_dependences.ll | 2 +- .../reduction_different_reduction_clauses.ll | 2 +- .../IstAstInfo/reduction_in_one_dimension.ll | 2 +- .../IstAstInfo/reduction_loop_reversal.ll | 2 +- ...ction_modulo_and_loop_reversal_schedule.ll | 2 +- ...ion_modulo_and_loop_reversal_schedule_2.ll | 2 +- .../IstAstInfo/reduction_modulo_schedule.ll | 2 +- ...ion_modulo_schedule_multiple_dimensions.ll | 2 +- ...n_modulo_schedule_multiple_dimensions_2.ll | 2 +- ...n_modulo_schedule_multiple_dimensions_3.ll | 2 +- ...n_modulo_schedule_multiple_dimensions_4.ll | 2 +- ...n_modulo_schedule_multiple_dimensions_5.ll | 2 +- .../reduction_multiple_dimensions.ll | 2 +- .../reduction_multiple_dimensions_2.ll | 2 +- .../reduction_multiple_dimensions_3.ll | 2 +- .../reduction_multiple_dimensions_4.ll | 2 +- polly/test/IstAstInfo/run-time-condition.ll | 2 +- .../runtime_context_with_error_blocks.ll | 2 +- .../IstAstInfo/simple-run-time-condition.ll | 2 +- .../test/IstAstInfo/single_loop_strip_mine.ll | 4 +- .../single_loop_uint_max_iterations.ll | 2 +- .../single_loop_ull_max_iterations.ll | 2 +- .../ImportAccesses-Bad-relation.ll | 2 +- .../ImportAccesses-No-accesses-key.ll | 2 +- .../ImportAccesses-Not-enough-MemAcc.ll | 2 +- .../ImportAccesses-Not-enough-statements.ll | 2 +- .../ImportAccesses-Relation-mispelled.ll | 2 +- .../ImportAccesses-Statements-mispelled.ll | 2 +- ...ImportAccesses-Undeclared-ScopArrayInfo.ll | 2 +- 
.../ImportAccesses-Wrong-number-dimensions.ll | 2 +- .../ImportArrays-Mispelled-type.ll | 2 +- .../ImportArrays-Negative-size.ll | 2 +- .../ImportArrays/ImportArrays-No-name.ll | 2 +- .../ImportArrays/ImportArrays-No-sizes-key.ll | 2 +- .../ImportArrays/ImportArrays-No-type-key.ll | 2 +- .../ImportContext-Context-mispelled.ll | 2 +- .../ImportContext-Not-parameter-set.ll | 2 +- .../ImportContext-Unvalid-Context.ll | 2 +- .../ImportContext-Wrong-dimension.ll | 2 +- .../ImportSchedule-No-schedule-key.ll | 2 +- .../ImportSchedule-Schedule-not-valid.ll | 2 +- .../ImportSchedule-Statements-mispelled.ll | 2 +- .../ImportSchedule-Wrong-number-statements.ll | 2 +- .../load_after_store_same_statement.ll | 4 +- .../read_from_original.ll | 4 +- .../MaximalStaticExpansion/too_many_writes.ll | 4 +- .../working_deps_between_inners.ll | 2 +- .../working_deps_between_inners_phi.ll | 4 +- .../working_expansion.ll | 2 +- ...sion_multiple_dependences_per_statement.ll | 2 +- ...sion_multiple_instruction_per_statement.ll | 2 +- .../working_phi_expansion.ll | 4 +- .../working_phi_two_scalars.ll | 4 +- .../working_value_expansion.ll | 2 +- .../prune_only_scalardeps.ll | 2 +- .../2012-03-16-Empty-Domain.ll | 2 +- .../2013-04-11-Empty-Domain-two.ll | 2 +- .../GreedyFuse/fuse-double.ll | 4 +- .../GreedyFuse/fuse-except-first.ll | 4 +- .../GreedyFuse/fuse-except-third.ll | 4 +- .../GreedyFuse/fuse-inner-carried.ll | 4 +- .../GreedyFuse/fuse-inner-third.ll | 4 +- .../GreedyFuse/fuse-inner.ll | 4 +- .../GreedyFuse/fuse-simple.ll | 4 +- .../GreedyFuse/nofuse-simple.ll | 4 +- .../GreedyFuse/nofuse-with-middle.ll | 4 +- .../ManualOptimization/disable_nonforced.ll | 2 +- .../distribute_heuristic.ll | 4 +- .../distribute_illegal_looploc.ll | 2 +- .../distribute_illegal_pragmaloc.ll | 2 +- .../ManualOptimization/unroll_disable.ll | 2 +- .../ManualOptimization/unroll_double.ll | 2 +- .../ManualOptimization/unroll_full.ll | 2 +- .../ManualOptimization/unroll_heuristic.ll | 4 +- .../ManualOptimization/unroll_partial.ll | 4 +- .../unroll_partial_followup.ll | 8 +- .../ScheduleOptimizer/SIMDInParallelFor.ll | 2 +- polly/test/ScheduleOptimizer/computeout.ll | 4 +- .../ensure-correct-tile-sizes.ll | 7 +- .../focaltech_test_detail_threshold-7bc17e.ll | 2 +- .../full_partial_tile_separation.ll | 2 +- polly/test/ScheduleOptimizer/line-tiling-2.ll | 2 +- polly/test/ScheduleOptimizer/line-tiling.ll | 2 +- .../mat_mul_pattern_data_layout.ll | 11 +- .../mat_mul_pattern_data_layout_2.ll | 10 +- .../ScheduleOptimizer/one-dimensional-band.ll | 2 +- .../ScheduleOptimizer/outer_coincidence.ll | 4 +- ...attern-matching-based-opts-after-delicm.ll | 6 +- ...tern-matching-based-opts-after-delicm_2.ll | 4 +- .../pattern-matching-based-opts.ll | 9 +- .../pattern-matching-based-opts_11.ll | 14 +- .../pattern-matching-based-opts_12.ll | 10 +- .../pattern-matching-based-opts_13.ll | 10 +- .../pattern-matching-based-opts_14.ll | 11 +- .../pattern-matching-based-opts_15.ll | 4 +- .../pattern-matching-based-opts_16.ll | 3 +- .../pattern-matching-based-opts_17.ll | 3 +- .../pattern-matching-based-opts_18.ll | 3 +- .../pattern-matching-based-opts_19.ll | 3 +- .../pattern-matching-based-opts_2.ll | 3 +- .../pattern-matching-based-opts_20.ll | 3 +- .../pattern-matching-based-opts_21.ll | 3 +- .../pattern-matching-based-opts_22.ll | 3 +- .../pattern-matching-based-opts_24.ll | 4 +- .../pattern-matching-based-opts_25.ll | 3 +- .../pattern-matching-based-opts_3.ll | 17 +- .../pattern-matching-based-opts_4.ll | 12 +- .../pattern-matching-based-opts_5.ll | 10 +- 
.../pattern-matching-based-opts_6.ll | 10 +- .../pattern-matching-based-opts_7.ll | 10 +- .../pattern-matching-based-opts_8.ll | 10 +- .../pattern-matching-based-opts_9.ll | 12 +- .../pattern_matching_based_opts_splitmap.ll | 2 +- .../prevectorization-without-tiling.ll | 2 +- .../ScheduleOptimizer/prevectorization.ll | 4 +- .../prevectorization_islbound.ll | 2 +- .../ScheduleOptimizer/rectangular-tiling.ll | 8 +- .../ScheduleOptimizer/schedule_computeout.ll | 2 +- polly/test/ScheduleOptimizer/statistics.ll | 2 +- .../ScheduleOptimizer/tile_after_fusion.ll | 2 +- ...vivid_vbi_gen_sliced-before-llvmreduced.ll | 2 +- .../aliasing_parametric_simple_1.ll | 2 +- .../aliasing_parametric_simple_2.ll | 2 +- polly/test/ScopDetect/aliasing_simple_1.ll | 2 +- polly/test/ScopDetect/aliasing_simple_2.ll | 2 +- polly/test/ScopDetect/base_pointer.ll | 2 +- .../base_pointer_load_setNewAccessRelation.ll | 2 +- .../base_pointer_setNewAccessRelation.ll | 4 +- polly/test/ScopDetect/callbr.ll | 4 +- .../ScopDetect/collective_invariant_loads.ll | 2 +- .../ScopDetect/cross_loop_non_single_exit.ll | 2 +- .../cross_loop_non_single_exit_2.ll | 2 +- ...ependency_to_phi_node_outside_of_region.ll | 2 +- .../test/ScopDetect/detect-full-functions.ll | 2 +- polly/test/ScopDetect/dom-tree-crash.ll | 2 +- polly/test/ScopDetect/dot-scops-npm.ll | 2 +- polly/test/ScopDetect/dot-scops.ll | 2 +- .../ScopDetect/error-block-always-executed.ll | 2 +- .../error-block-referenced-from-scop.ll | 2 +- .../ScopDetect/error-block-unreachable.ll | 2 +- .../ScopDetect/expand-region-correctly-2.ll | 2 +- .../ScopDetect/expand-region-correctly.ll | 2 +- .../test/ScopDetect/ignore_func_flag_regex.ll | 2 +- .../index_from_unpredictable_loop.ll | 4 +- .../index_from_unpredictable_loop2.ll | 4 +- polly/test/ScopDetect/indvars.ll | 2 +- polly/test/ScopDetect/intrinsics_1.ll | 2 +- polly/test/ScopDetect/intrinsics_2.ll | 2 +- polly/test/ScopDetect/intrinsics_3.ll | 2 +- .../ScopDetect/invalid-latch-conditions.ll | 6 +- .../ScopDetect/invalidate_scalar_evolution.ll | 2 +- .../ScopDetect/invariant-load-before-scop.ll | 2 +- polly/test/ScopDetect/keep_going_expansion.ll | 2 +- polly/test/ScopDetect/mod_ref_read_pointer.ll | 4 +- polly/test/ScopDetect/more-than-one-loop.ll | 4 +- .../ScopDetect/multidim-with-undef-size.ll | 2 +- polly/test/ScopDetect/multidim.ll | 2 +- .../ScopDetect/multidim_indirect_access.ll | 2 +- ..._two_accesses_different_delinearization.ll | 2 +- .../ScopDetect/nested_loop_single_exit.ll | 4 +- .../test/ScopDetect/non-affine-conditional.ll | 2 +- .../ScopDetect/non-affine-float-compare.ll | 2 +- ...-affine-loop-condition-dependent-access.ll | 8 +- ...ffine-loop-condition-dependent-access_2.ll | 6 +- ...ffine-loop-condition-dependent-access_3.ll | 6 +- polly/test/ScopDetect/non-affine-loop.ll | 10 +- .../non-beneficial-loops-small-trip-count.ll | 2 +- .../non-constant-add-rec-start-expr.ll | 2 +- .../ScopDetect/non-simple-memory-accesses.ll | 2 +- .../ScopDetect/non_affine_loop_condition.ll | 4 +- polly/test/ScopDetect/only-one-affine-loop.ll | 2 +- polly/test/ScopDetect/only_func_flag.ll | 2 +- polly/test/ScopDetect/only_func_flag_regex.ll | 2 +- .../parametric-multiply-in-scev-2.ll | 2 +- .../ScopDetect/parametric-multiply-in-scev.ll | 2 +- .../phi_with_multi_exiting_edges.ll | 2 +- .../profitability-large-basic-blocks.ll | 10 +- .../profitability-two-nested-loops.ll | 2 +- polly/test/ScopDetect/remove_all_children.ll | 2 +- polly/test/ScopDetect/report-scop-location.ll | 2 +- .../restrict-undef-size-scopdetect.ll | 2 +- 
polly/test/ScopDetect/run_time_alias_check.ll | 2 +- polly/test/ScopDetect/scev_remove_max.ll | 2 +- polly/test/ScopDetect/sequential_loops.ll | 2 +- polly/test/ScopDetect/simple_loop.ll | 2 +- .../simple_loop_non_single_entry.ll | 2 +- .../ScopDetect/simple_loop_non_single_exit.ll | 2 +- .../simple_loop_non_single_exit_2.ll | 2 +- .../ScopDetect/simple_loop_two_phi_nodes.ll | 2 +- .../test/ScopDetect/simple_loop_with_param.ll | 2 +- .../ScopDetect/simple_loop_with_param_2.ll | 2 +- .../ScopDetect/simple_non_single_entry.ll | 2 +- .../ScopDetect/skip_function_attribute.ll | 2 +- .../srem_with_parametric_divisor.ll | 2 +- polly/test/ScopDetect/statistics.ll | 2 +- polly/test/ScopDetect/switch-in-loop-patch.ll | 2 +- .../test/ScopDetect/tlr_is_hoistable_load.ll | 2 +- .../ReportAlias-01.ll | 2 +- .../ScopDetectionDiagnostics/ReportEntry.ll | 2 +- .../ReportFuncCall-01.ll | 2 +- .../ReportIrreducibleRegion.ll | 2 +- .../ReportIrreducibleRegionWithoutDebugLoc.ll | 2 +- .../ReportLoopBound-01.ll | 16 +- .../ReportLoopHasNoExit.ll | 4 +- .../ReportMultipleNonAffineAccesses.ll | 12 +- .../ReportNonAffineAccess-01.ll | 2 +- .../ReportUnprofitable.ll | 8 +- .../ReportUnreachableInExit.ll | 3 +- .../ReportVariantBasePtr-01.ll | 2 +- .../loop_has_multiple_exits.ll | 2 +- .../loop_partially_in_scop-2.ll | 2 +- .../loop_partially_in_scop.ll | 2 +- .../ScopInfo/20110312-Fail-without-basicaa.ll | 2 +- .../20111108-Parameter-not-detected.ll | 2 +- ...03-16-Crash-because-of-unsigned-in-scev.ll | 2 +- .../2015-10-04-Crash-in-domain-generation.ll | 2 +- polly/test/ScopInfo/Alias-0.ll | 4 +- polly/test/ScopInfo/Alias-1.ll | 4 +- polly/test/ScopInfo/Alias-2.ll | 4 +- polly/test/ScopInfo/Alias-3.ll | 4 +- polly/test/ScopInfo/Alias-4.ll | 4 +- .../test/ScopInfo/BoundChecks/single-loop.ll | 4 +- polly/test/ScopInfo/BoundChecks/two-loops.ll | 4 +- polly/test/ScopInfo/NonAffine/div_backedge.ll | 2 +- polly/test/ScopInfo/NonAffine/div_domain.ll | 2 +- ...nt_loads_dependent_in_non_affine_region.ll | 2 +- .../ScopInfo/NonAffine/modulo_backedge.ll | 2 +- .../test/ScopInfo/NonAffine/modulo_domain.ll | 2 +- ...ffine-loop-condition-dependent-access_1.ll | 4 +- ...ffine-loop-condition-dependent-access_2.ll | 6 +- ...ffine-loop-condition-dependent-access_3.ll | 6 +- .../non_affine_access_with_range_2.ll | 2 +- .../ScopInfo/NonAffine/non_affine_but_sdiv.ll | 2 +- .../ScopInfo/NonAffine/non_affine_but_srem.ll | 2 +- .../non_affine_conditional_nested.ll | 2 +- ...ine_conditional_surrounding_affine_loop.ll | 11 +- ...conditional_surrounding_non_affine_loop.ll | 16 +- .../NonAffine/non_affine_float_compare.ll | 2 +- .../NonAffine/non_affine_loop_condition.ll | 6 +- .../NonAffine/non_affine_loop_used_later.ll | 4 +- .../NonAffine/non_affine_parametric_loop.ll | 2 +- .../non_affine_region_guaranteed_non-entry.ll | 2 +- ...whole-scop-non-affine-subregion-in-loop.ll | 2 +- .../aliasing_conditional_alias_groups_1.ll | 2 +- .../aliasing_conditional_alias_groups_2.ll | 2 +- polly/test/ScopInfo/aliasing_dead_access.ll | 2 +- .../aliasing_many_arrays_to_compare.ll | 7 +- ...iasing_many_parameters_not_all_involved.ll | 4 +- .../aliasing_many_read_only_acesses.ll | 2 +- .../aliasing_multiple_alias_groups.ll | 4 +- .../aliasing_with_non_affine_access.ll | 2 +- .../allow-all-parameters-dereferencable.ll | 11 +- polly/test/ScopInfo/assume_gep_bounds.ll | 4 +- polly/test/ScopInfo/assume_gep_bounds_2.ll | 3 +- polly/test/ScopInfo/assume_gep_bounds_many.ll | 3 +- .../avoid_new_parameters_from_geps.ll | 2 +- polly/test/ScopInfo/bool-addrec.ll | 2 
+- .../test/ScopInfo/bounded_loop_assumptions.ll | 2 +- ...ces-loop-scev-with-unknown-iterations-2.ll | 6 +- ...ces-loop-scev-with-unknown-iterations-3.ll | 7 +- ...ences-loop-scev-with-unknown-iterations.ll | 7 +- polly/test/ScopInfo/bug_2010_10_22.ll | 2 +- polly/test/ScopInfo/bug_2011_1_5.ll | 2 +- .../test/ScopInfo/bug_scev_not_fully_eval.ll | 2 +- polly/test/ScopInfo/cfg_consequences.ll | 2 +- .../test/ScopInfo/complex-branch-structure.ll | 3 +- polly/test/ScopInfo/complex-condition.ll | 4 +- polly/test/ScopInfo/complex-expression.ll | 4 +- polly/test/ScopInfo/complex-loop-nesting.ll | 2 +- .../ScopInfo/complex-successor-structure-2.ll | 4 +- .../ScopInfo/complex-successor-structure-3.ll | 3 +- .../ScopInfo/complex-successor-structure.ll | 4 +- .../complex_domain_binary_condition.ll | 3 +- .../ScopInfo/complex_execution_context.ll | 4 +- polly/test/ScopInfo/cond_constant_in_loop.ll | 2 +- polly/test/ScopInfo/cond_in_loop.ll | 2 +- .../ScopInfo/condition-after-error-block-2.ll | 2 +- ...condition-after-error-block-before-scop.ll | 2 +- .../ScopInfo/condtion-after-error-block.ll | 2 +- polly/test/ScopInfo/const_srem_sdiv.ll | 3 +- .../constant-non-integer-branch-condition.ll | 2 +- .../ScopInfo/constant_factor_in_parameter.ll | 4 +- ...stant_functions_outside_scop_as_unknown.ll | 2 +- polly/test/ScopInfo/constant_start_integer.ll | 2 +- polly/test/ScopInfo/debug_call.ll | 2 +- .../delinearize-together-all-data-refs.ll | 2 +- polly/test/ScopInfo/div_by_zero.ll | 2 +- .../do-not-model-error-block-accesses.ll | 2 +- .../eager-binary-and-or-conditions.ll | 4 +- .../early_exit_for_complex_domains.ll | 2 +- polly/test/ScopInfo/error-blocks-1.ll | 2 +- polly/test/ScopInfo/error-blocks-2.ll | 3 +- polly/test/ScopInfo/error-blocks-3.ll | 2 +- polly/test/ScopInfo/escaping_empty_scop.ll | 2 +- polly/test/ScopInfo/exit-phi-1.ll | 4 +- polly/test/ScopInfo/exit-phi-2.ll | 2 +- polly/test/ScopInfo/exit_phi_accesses-2.ll | 2 +- polly/test/ScopInfo/exit_phi_accesses.ll | 2 +- .../ScopInfo/expensive-boundary-context.ll | 3 +- ...onstant_factor_introduces_new_parameter.ll | 4 +- polly/test/ScopInfo/full-function.ll | 6 +- polly/test/ScopInfo/granularity_same_name.ll | 8 +- .../test/ScopInfo/granularity_scalar-indep.ll | 2 +- ...ity_scalar-indep_cross-referencing-phi1.ll | 2 +- ...ity_scalar-indep_cross-referencing-phi2.ll | 2 +- .../granularity_scalar-indep_epilogue.ll | 2 +- .../granularity_scalar-indep_epilogue_last.ll | 2 +- .../granularity_scalar-indep_noepilogue.ll | 2 +- .../granularity_scalar-indep_ordered-2.ll | 2 +- .../granularity_scalar-indep_ordered.ll | 2 +- polly/test/ScopInfo/i1_params.ll | 2 +- polly/test/ScopInfo/infeasible-rtc.ll | 6 +- .../ScopInfo/infeasible_invalid_context.ll | 6 +- polly/test/ScopInfo/int2ptr_ptr2int.ll | 4 +- polly/test/ScopInfo/int2ptr_ptr2int_2.ll | 6 +- polly/test/ScopInfo/integers.ll | 2 +- .../ScopInfo/inter-error-bb-dependence.ll | 2 +- polly/test/ScopInfo/inter_bb_scalar_dep.ll | 3 +- .../intra-non-affine-stmt-phi-node.ll | 3 +- .../ScopInfo/intra_and_inter_bb_scalar_dep.ll | 3 +- polly/test/ScopInfo/intra_bb_scalar_dep.ll | 3 +- polly/test/ScopInfo/intrinsics.ll | 2 +- ..._add_rec_after_invariant_load_remapping.ll | 2 +- .../invalidate_iterator_during_MA_removal.ll | 2 +- .../test/ScopInfo/invariant-load-instlist.ll | 2 +- ...ariant-loads-leave-read-only-statements.ll | 4 +- polly/test/ScopInfo/invariant_load.ll | 2 +- ...load_access_classes_different_base_type.ll | 4 +- ...ss_classes_different_base_type_escaping.ll | 4 +- 
...lasses_different_base_type_same_pointer.ll | 4 +- ...fferent_base_type_same_pointer_escaping.ll | 4 +- .../ScopInfo/invariant_load_addrec_sum.ll | 2 +- .../ScopInfo/invariant_load_base_pointer.ll | 2 +- ...invariant_load_base_pointer_conditional.ll | 2 +- ...ariant_load_base_pointer_in_conditional.ll | 2 +- .../invariant_load_branch_condition.ll | 3 +- ...ariant_load_canonicalize_array_baseptrs.ll | 4 +- ...iant_load_canonicalize_array_baseptrs_2.ll | 4 +- ...iant_load_canonicalize_array_baseptrs_3.ll | 4 +- ...iant_load_canonicalize_array_baseptrs_4.ll | 4 +- ...ant_load_canonicalize_array_baseptrs_4b.ll | 4 +- ...ant_load_canonicalize_array_baseptrs_4c.ll | 4 +- ...iant_load_canonicalize_array_baseptrs_5.ll | 4 +- .../invariant_load_complex_condition.ll | 3 +- .../test/ScopInfo/invariant_load_condition.ll | 2 +- .../invariant_load_dereferenceable.ll | 4 +- ...iant_load_distinct_parameter_valuations.ll | 2 +- .../ScopInfo/invariant_load_in_non_affine.ll | 3 +- polly/test/ScopInfo/invariant_load_loop_ub.ll | 4 +- .../invariant_load_ptr_ptr_noalias.ll | 3 +- .../ScopInfo/invariant_load_scalar_dep.ll | 2 +- .../ScopInfo/invariant_load_stmt_domain.ll | 2 +- .../invariant_load_zext_parameter-2.ll | 4 +- .../ScopInfo/invariant_load_zext_parameter.ll | 4 +- ...load_zextended_in_own_execution_context.ll | 4 +- ...invariant_loads_complicated_dependences.ll | 2 +- .../invariant_loads_cyclic_dependences.ll | 2 +- polly/test/ScopInfo/invariant_loop_bounds.ll | 2 +- ...ariant_same_loop_bound_multiple_times-1.ll | 2 +- ...ariant_same_loop_bound_multiple_times-2.ll | 2 +- polly/test/ScopInfo/isl_aff_out_of_bounds.ll | 2 +- polly/test/ScopInfo/isl_trip_count_01.ll | 2 +- polly/test/ScopInfo/isl_trip_count_02.ll | 2 +- polly/test/ScopInfo/isl_trip_count_03.ll | 2 +- .../isl_trip_count_multiple_exiting_blocks.ll | 2 +- polly/test/ScopInfo/licm_load.ll | 31 +- polly/test/ScopInfo/licm_potential_store.ll | 79 +++- .../ScopInfo/licm_potential_store_mssa.ll | 50 --- polly/test/ScopInfo/licm_reduction_nested.ll | 4 +- .../long-compile-time-alias-analysis.ll | 2 +- .../long-sequence-of-error-blocks-2.ll | 2 +- .../ScopInfo/long-sequence-of-error-blocks.ll | 3 +- .../test/ScopInfo/loop-multiexit-succ-cond.ll | 4 +- polly/test/ScopInfo/loop_affine_bound_0.ll | 4 +- polly/test/ScopInfo/loop_affine_bound_1.ll | 4 +- polly/test/ScopInfo/loop_affine_bound_2.ll | 4 +- polly/test/ScopInfo/loop_carry.ll | 2 +- .../test/ScopInfo/many-scalar-dependences.ll | 2 +- polly/test/ScopInfo/max-loop-depth.ll | 2 +- polly/test/ScopInfo/memcpy-raw-source.ll | 2 +- polly/test/ScopInfo/memcpy.ll | 4 +- polly/test/ScopInfo/memmove.ll | 4 +- polly/test/ScopInfo/memset.ll | 4 +- polly/test/ScopInfo/memset_null.ll | 4 +- .../ScopInfo/mismatching-array-dimensions.ll | 2 +- .../mod_ref_access_pointee_arguments.ll | 6 +- .../mod_ref_read_pointee_arguments.ll | 6 +- polly/test/ScopInfo/mod_ref_read_pointer.ll | 4 +- polly/test/ScopInfo/mod_ref_read_pointers.ll | 6 +- polly/test/ScopInfo/modulo_zext_1.ll | 2 +- polly/test/ScopInfo/modulo_zext_2.ll | 2 +- polly/test/ScopInfo/modulo_zext_3.ll | 2 +- polly/test/ScopInfo/multi-scop.ll | 2 +- .../ScopInfo/multidim_2d-diagonal-matrix.ll | 4 +- .../multidim_2d_outer_parametric_offset.ll | 2 +- ..._2d_parametric_array_static_loop_bounds.ll | 2 +- .../ScopInfo/multidim_2d_with_modref_call.ll | 8 +- .../multidim_2d_with_modref_call_2.ll | 8 +- ..._3d_parametric_array_static_loop_bounds.ll | 2 +- ...idim_fixedsize_different_dimensionality.ll | 2 +- .../multidim_fixedsize_multi_offset.ll | 2 +- 
.../ScopInfo/multidim_fold_constant_dim.ll | 2 +- .../multidim_fold_constant_dim_zero.ll | 2 +- polly/test/ScopInfo/multidim_fortran_2d.ll | 3 +- .../ScopInfo/multidim_fortran_2d_params.ll | 4 +- .../multidim_fortran_2d_with_modref_call.ll | 8 +- polly/test/ScopInfo/multidim_fortran_srem.ll | 2 +- .../test/ScopInfo/multidim_gep_pointercast.ll | 2 +- .../ScopInfo/multidim_gep_pointercast2.ll | 2 +- .../ScopInfo/multidim_invalid_dimension.ll | 2 +- .../multidim_ivs_and_integer_offsets_3d.ll | 2 +- ...multidim_ivs_and_parameteric_offsets_3d.ll | 2 +- .../test/ScopInfo/multidim_many_references.ll | 4 +- .../ScopInfo/multidim_nested_start_integer.ll | 4 +- .../multidim_nested_start_share_parameter.ll | 2 +- polly/test/ScopInfo/multidim_only_ivs_2d.ll | 2 +- polly/test/ScopInfo/multidim_only_ivs_3d.ll | 2 +- .../ScopInfo/multidim_only_ivs_3d_cast.ll | 2 +- .../ScopInfo/multidim_only_ivs_3d_reverse.ll | 2 +- .../ScopInfo/multidim_param_in_subscript-2.ll | 2 +- .../ScopInfo/multidim_param_in_subscript.ll | 2 +- .../multidim_parameter_addrec_product.ll | 2 +- .../multidim_single_and_multidim_array.ll | 16 +- polly/test/ScopInfo/multidim_srem.ll | 2 +- polly/test/ScopInfo/multidim_with_bitcast.ll | 2 +- .../ScopInfo/multiple-binary-or-conditions.ll | 4 +- ...ss-offset-not-dividable-by-element-size.ll | 4 +- .../ScopInfo/multiple-types-non-affine-2.ll | 4 +- .../ScopInfo/multiple-types-non-affine.ll | 4 +- .../multiple-types-non-power-of-two-2.ll | 2 +- .../multiple-types-non-power-of-two.ll | 2 +- .../multiple-types-two-dimensional-2.ll | 4 +- .../multiple-types-two-dimensional.ll | 4 +- polly/test/ScopInfo/multiple-types.ll | 3 +- .../test/ScopInfo/multiple_exiting_blocks.ll | 2 +- .../multiple_exiting_blocks_two_loop.ll | 2 +- polly/test/ScopInfo/multiple_latch_blocks.ll | 2 +- polly/test/ScopInfo/nested-loops.ll | 2 +- .../no-scalar-deps-in-non-affine-subregion.ll | 2 +- polly/test/ScopInfo/non-affine-region-phi.ll | 4 +- .../ScopInfo/non-affine-region-with-loop-2.ll | 2 +- .../ScopInfo/non-affine-region-with-loop.ll | 4 +- polly/test/ScopInfo/non-precise-inv-load-1.ll | 2 +- polly/test/ScopInfo/non-precise-inv-load-2.ll | 2 +- polly/test/ScopInfo/non-precise-inv-load-3.ll | 2 +- polly/test/ScopInfo/non-precise-inv-load-4.ll | 2 +- polly/test/ScopInfo/non-precise-inv-load-5.ll | 2 +- polly/test/ScopInfo/non-precise-inv-load-6.ll | 2 +- polly/test/ScopInfo/non-pure-function-call.ll | 2 +- ...-pure-function-calls-causes-dead-blocks.ll | 2 +- .../test/ScopInfo/non-pure-function-calls.ll | 2 +- polly/test/ScopInfo/non_affine_access.ll | 4 +- polly/test/ScopInfo/non_affine_region_1.ll | 2 +- polly/test/ScopInfo/non_affine_region_2.ll | 2 +- polly/test/ScopInfo/non_affine_region_3.ll | 4 +- polly/test/ScopInfo/non_affine_region_4.ll | 2 +- .../ScopInfo/nonaffine-buildMemoryAccess.ll | 2 +- polly/test/ScopInfo/not-a-reduction.ll | 2 +- polly/test/ScopInfo/opaque-struct.ll | 2 +- ...gion-entry-phi-node-nonaffine-subregion.ll | 2 +- ...ut-of-scop-use-in-region-entry-phi-node.ll | 2 +- .../ScopInfo/parameter-constant-division.ll | 4 +- .../ScopInfo/parameter_in_dead_statement.ll | 6 +- polly/test/ScopInfo/parameter_product.ll | 2 +- .../parameter_with_constant_factor_in_add.ll | 2 +- .../ScopInfo/partially_invariant_load_1.ll | 4 +- .../ScopInfo/partially_invariant_load_2.ll | 2 +- .../test/ScopInfo/phi-in-non-affine-region.ll | 2 +- polly/test/ScopInfo/phi_after_error_block.ll | 2 +- .../test/ScopInfo/phi_condition_modeling_1.ll | 2 +- .../test/ScopInfo/phi_condition_modeling_2.ll | 2 +- 
.../test/ScopInfo/phi_conditional_simple_1.ll | 2 +- polly/test/ScopInfo/phi_loop_carried_float.ll | 2 +- polly/test/ScopInfo/phi_not_grouped_at_top.ll | 2 +- polly/test/ScopInfo/phi_scalar_simple_1.ll | 2 +- polly/test/ScopInfo/phi_scalar_simple_2.ll | 2 +- polly/test/ScopInfo/phi_with_invoke_edge.ll | 2 +- .../ScopInfo/pointer-comparison-no-nsw.ll | 2 +- polly/test/ScopInfo/pointer-comparison.ll | 2 +- .../test/ScopInfo/pointer-type-expressions.ll | 2 +- ...er-used-as-base-pointer-and-scalar-read.ll | 2 +- .../polly-timeout-parameter-bounds.ll | 2 +- polly/test/ScopInfo/pr38218.ll | 2 +- ...eserve-equiv-class-order-in-basic_block.ll | 2 +- .../test/ScopInfo/process_added_dimensions.ll | 2 +- .../test/ScopInfo/pwaff-complexity-bailout.ll | 2 +- polly/test/ScopInfo/ranged_parameter.ll | 2 +- polly/test/ScopInfo/ranged_parameter_2.ll | 3 +- polly/test/ScopInfo/ranged_parameter_wrap.ll | 2 +- .../test/ScopInfo/ranged_parameter_wrap_2.ll | 2 +- .../read-only-scalar-used-in-phi-2.ll | 2 +- .../ScopInfo/read-only-scalar-used-in-phi.ll | 2 +- polly/test/ScopInfo/read-only-scalars.ll | 4 +- polly/test/ScopInfo/read-only-statements.ll | 2 +- .../ScopInfo/reduction_alternating_base.ll | 2 +- ...uction_chain_partially_outside_the_scop.ll | 2 +- .../ScopInfo/reduction_different_index.ll | 2 +- .../ScopInfo/reduction_different_index1.ll | 2 +- .../reduction_disabled_multiplicative.ll | 2 +- polly/test/ScopInfo/reduction_double.ll | 2 +- .../reduction_escaping_intermediate.ll | 2 +- .../reduction_escaping_intermediate_2.ll | 2 +- .../reduction_escaping_intermediate_3.ll | 2 +- polly/test/ScopInfo/reduction_if.ll | 2 +- .../ScopInfo/reduction_indirect_access.ll | 2 +- .../ScopInfo/reduction_indirect_access_2.ll | 2 +- .../reduction_invalid_different_operators.ll | 2 +- .../reduction_invalid_overlapping_accesses.ll | 2 +- .../reduction_long_reduction_chain.ll | 2 +- ...duction_long_reduction_chain_double_use.ll | 2 +- .../reduction_multiple_different_operators.ll | 2 +- .../reduction_multiple_loops_array_sum.ll | 2 +- .../reduction_multiple_loops_array_sum_1.ll | 2 +- .../reduction_multiple_simple_binary.ll | 2 +- .../reduction_non_overlapping_chains.ll | 2 +- .../reduction_only_reduction_like_access.ll | 2 +- polly/test/ScopInfo/reduction_simple_fp.ll | 2 +- .../ScopInfo/reduction_simple_w_constant.ll | 2 +- polly/test/ScopInfo/reduction_simple_w_iv.ll | 2 +- .../ScopInfo/reduction_two_identical_reads.ll | 4 +- .../redundant_parameter_constraint.ll | 2 +- .../test/ScopInfo/region-with-instructions.ll | 2 +- polly/test/ScopInfo/remarks.ll | 3 +- .../required-invariant-loop-bounds.ll | 3 +- .../ScopInfo/restriction_in_dead_block.ll | 2 +- .../run-time-check-many-array-disjuncts.ll | 5 +- .../run-time-check-many-parameters.ll | 2 +- .../run-time-check-many-piecewise-aliasing.ll | 5 +- .../run-time-check-read-only-arrays.ll | 2 +- .../same-base-address-scalar-and-array.ll | 2 +- polly/test/ScopInfo/scalar.ll | 2 +- .../ScopInfo/scalar_dependence_cond_br.ll | 2 +- polly/test/ScopInfo/scalar_to_array.ll | 4 +- .../scev-div-with-evaluatable-divisor.ll | 2 +- polly/test/ScopInfo/scev-invalidated.ll | 2 +- .../schedule-const-post-dominator-walk-2.ll | 2 +- .../schedule-const-post-dominator-walk.ll | 2 +- .../schedule-constuction-endless-loop1.ll | 2 +- .../schedule-constuction-endless-loop2.ll | 2 +- ...tly-contructed-in-case-of-infinite-loop.ll | 2 +- .../scop-affine-parameter-ordering.ll | 2 +- polly/test/ScopInfo/sign_wrapped_set.ll | 2 +- polly/test/ScopInfo/simple_loop_1.ll | 2 +- 
polly/test/ScopInfo/simple_loop_2.ll | 2 +- polly/test/ScopInfo/simple_loop_unsigned.ll | 2 +- polly/test/ScopInfo/simple_loop_unsigned_2.ll | 2 +- polly/test/ScopInfo/simple_loop_unsigned_3.ll | 2 +- .../ScopInfo/simple_nonaffine_loop_not.ll | 2 +- polly/test/ScopInfo/smax.ll | 2 +- polly/test/ScopInfo/statistics.ll | 2 +- .../stmt_split_exit_of_region_stmt.ll | 2 +- .../ScopInfo/stmt_split_no_after_split.ll | 2 +- .../test/ScopInfo/stmt_split_no_dependence.ll | 2 +- polly/test/ScopInfo/stmt_split_on_store.ll | 2 +- .../ScopInfo/stmt_split_on_synthesizable.ll | 2 +- .../stmt_split_phi_in_beginning_bb.ll | 2 +- polly/test/ScopInfo/stmt_split_phi_in_stmt.ll | 2 +- .../ScopInfo/stmt_split_scalar_dependence.ll | 2 +- polly/test/ScopInfo/stmt_split_within_loop.ll | 2 +- .../stmt_with_read_but_without_sideffect.ll | 2 +- polly/test/ScopInfo/switch-1.ll | 4 +- polly/test/ScopInfo/switch-2.ll | 4 +- polly/test/ScopInfo/switch-3.ll | 4 +- polly/test/ScopInfo/switch-4.ll | 4 +- polly/test/ScopInfo/switch-5.ll | 4 +- polly/test/ScopInfo/switch-6.ll | 4 +- polly/test/ScopInfo/switch-7.ll | 4 +- polly/test/ScopInfo/tempscop-printing.ll | 2 +- .../ScopInfo/test-wrapping-in-condition.ll | 4 +- polly/test/ScopInfo/truncate-1.ll | 2 +- polly/test/ScopInfo/truncate-2.ll | 2 +- polly/test/ScopInfo/truncate-3.ll | 3 +- polly/test/ScopInfo/two-loops-one-infinite.ll | 2 +- .../two-loops-right-after-each-other.ll | 2 +- polly/test/ScopInfo/undef_in_cond.ll | 2 +- polly/test/ScopInfo/unnamed_nonaffine.ll | 4 +- polly/test/ScopInfo/unnamed_stmts.ll | 2 +- .../ScopInfo/unpredictable_nonscop_loop.ll | 2 +- .../test/ScopInfo/unprofitable_scalar-accs.ll | 4 +- polly/test/ScopInfo/unsigned-condition.ll | 2 +- polly/test/ScopInfo/unsigned-division-1.ll | 2 +- polly/test/ScopInfo/unsigned-division-2.ll | 2 +- polly/test/ScopInfo/unsigned-division-3.ll | 2 +- polly/test/ScopInfo/unsigned-division-4.ll | 2 +- polly/test/ScopInfo/unsigned-division-5.ll | 2 +- polly/test/ScopInfo/unsigned_wrap_uge.ll | 2 +- polly/test/ScopInfo/unsigned_wrap_ugt.ll | 2 +- polly/test/ScopInfo/unsigned_wrap_ule.ll | 2 +- polly/test/ScopInfo/unsigned_wrap_ult.ll | 2 +- polly/test/ScopInfo/user_context.ll | 8 +- ...ed_assumptions-in-bb-signed-conditional.ll | 4 +- .../user_provided_assumptions-in-bb-signed.ll | 2 +- ...ser_provided_assumptions-in-bb-unsigned.ll | 4 +- .../ScopInfo/user_provided_assumptions.ll | 4 +- .../ScopInfo/user_provided_assumptions_2.ll | 4 +- .../ScopInfo/user_provided_assumptions_3.ll | 4 +- ...ser_provided_non_dominating_assumptions.ll | 6 +- polly/test/ScopInfo/variant_base_pointer.ll | 4 +- .../ScopInfo/variant_load_empty_domain.ll | 2 +- polly/test/ScopInfo/wraping_signed_expr_0.ll | 2 +- polly/test/ScopInfo/wraping_signed_expr_1.ll | 2 +- polly/test/ScopInfo/wraping_signed_expr_2.ll | 2 +- polly/test/ScopInfo/wraping_signed_expr_3.ll | 2 +- polly/test/ScopInfo/wraping_signed_expr_4.ll | 2 +- polly/test/ScopInfo/wraping_signed_expr_5.ll | 2 +- polly/test/ScopInfo/wraping_signed_expr_6.ll | 2 +- polly/test/ScopInfo/wraping_signed_expr_7.ll | 2 +- .../ScopInfo/wraping_signed_expr_slow_1.ll | 2 +- .../ScopInfo/wraping_signed_expr_slow_2.ll | 2 +- polly/test/ScopInfo/zero_ext_of_truncate.ll | 2 +- polly/test/ScopInfo/zero_ext_of_truncate_2.ll | 2 +- .../test/ScopInfo/zero_ext_space_mismatch.ll | 2 +- polly/test/ScopInliner/ignore-declares.ll | 2 +- polly/test/ScopInliner/invariant-load-func.ll | 2 +- polly/test/ScopInliner/simple-inline-loop.ll | 2 +- polly/test/Simplify/coalesce_3partials.ll | 2 +- 
.../Simplify/coalesce_disjointelements.ll | 2 +- polly/test/Simplify/coalesce_overlapping.ll | 2 +- polly/test/Simplify/coalesce_partial.ll | 2 +- polly/test/Simplify/dead_access_load.ll | 2 +- polly/test/Simplify/dead_access_phi.ll | 2 +- polly/test/Simplify/dead_access_value.ll | 2 +- polly/test/Simplify/dead_instruction.ll | 2 +- polly/test/Simplify/emptyaccessdomain.ll | 2 +- polly/test/Simplify/exit_phi_accesses-2.ll | 2 +- polly/test/Simplify/func-b320a7.ll | 2 +- polly/test/Simplify/gemm.ll | 2 +- .../Simplify/nocoalesce_differentvalues.ll | 2 +- .../Simplify/nocoalesce_elementmismatch.ll | 2 +- polly/test/Simplify/nocoalesce_readbetween.ll | 2 +- .../test/Simplify/nocoalesce_writebetween.ll | 2 +- polly/test/Simplify/notdead_region_exitphi.ll | 2 +- .../test/Simplify/notdead_region_innerphi.ll | 2 +- .../test/Simplify/notredundant_region_loop.ll | 2 +- .../Simplify/notredundant_region_middle.ll | 2 +- .../notredundant_synthesizable_unknownit.ll | 2 +- ...ut-of-scop-use-in-region-entry-phi-node.ll | 2 +- polly/test/Simplify/overwritten.ll | 2 +- polly/test/Simplify/overwritten_3phi.ll | 2 +- polly/test/Simplify/overwritten_3store.ll | 2 +- .../overwritten_implicit_and_explicit.ll | 2 +- .../test/Simplify/overwritten_loadbetween.ll | 2 +- polly/test/Simplify/overwritten_scalar.ll | 2 +- polly/test/Simplify/pass_existence.ll | 2 +- polly/test/Simplify/phi_in_regionstmt.ll | 2 +- polly/test/Simplify/pr33323.ll | 2 +- polly/test/Simplify/redundant.ll | 2 +- .../test/Simplify/redundant_differentindex.ll | 2 +- polly/test/Simplify/redundant_partialwrite.ll | 2 +- polly/test/Simplify/redundant_region.ll | 2 +- .../test/Simplify/redundant_region_scalar.ll | 2 +- polly/test/Simplify/redundant_scalarwrite.ll | 2 +- polly/test/Simplify/redundant_storebetween.ll | 2 +- polly/test/Simplify/scalability1.ll | 2 +- polly/test/Simplify/scalability2.ll | 2 +- polly/test/Simplify/sweep_mapped_phi.ll | 2 +- polly/test/Simplify/sweep_mapped_value.ll | 2 +- .../Simplify/ununsed_read_in_region_entry.ll | 4 +- polly/test/Support/Plugins.ll | 3 +- polly/test/Support/exportjson.ll | 24 +- polly/test/Support/isl-args.ll | 8 +- polly/test/Support/pipelineposition.ll | 8 +- polly/test/lit.site.cfg.in | 4 + polly/test/polly.ll | 2 +- 1143 files changed, 4460 insertions(+), 2700 deletions(-) create mode 100644 polly/include/polly/LinkAllPasses.h delete mode 100644 polly/include/polly/Pass/PhaseManager.h delete mode 100644 polly/include/polly/Pass/PollyFunctionPass.h delete mode 100644 polly/include/polly/Pass/PollyModulePass.h delete mode 100644 polly/lib/Pass/PhaseManager.cpp delete mode 100644 polly/lib/Pass/PollyFunctionPass.cpp delete mode 100644 polly/lib/Pass/PollyModulePass.cpp delete mode 100644 polly/test/ScopInfo/licm_potential_store_mssa.ll diff --git a/polly/docs/ReleaseNotes.rst b/polly/docs/ReleaseNotes.rst index 215a802843304..f5ea47b69cf02 100644 --- a/polly/docs/ReleaseNotes.rst +++ b/polly/docs/ReleaseNotes.rst @@ -13,7 +13,3 @@ In Polly |version| the following important changes have been incorporated. * ScopInliner has been updated for the New Pass Manager. - * Polly now is a monolithic pass split into phases. - - * Polly's support for the legacy pass manager has been removed. 
- diff --git a/polly/include/polly/Canonicalization.h b/polly/include/polly/Canonicalization.h index 972b660894a1c..03f277e4e91ba 100644 --- a/polly/include/polly/Canonicalization.h +++ b/polly/include/polly/Canonicalization.h @@ -11,6 +11,12 @@ #include "llvm/Passes/PassBuilder.h" +namespace llvm { +namespace legacy { +class PassManagerBase; +} +} // namespace llvm + namespace polly { /// Schedule a set of canonicalization passes to prepare for Polly. @@ -20,6 +26,8 @@ namespace polly { /// into a canonical form that simplifies the analysis and optimization passes /// of Polly. The set of optimization passes scheduled here is probably not yet /// optimal. TODO: Optimize the set of canonicalization passes. +void registerCanonicalicationPasses(llvm::legacy::PassManagerBase &PM); + llvm::FunctionPassManager buildCanonicalicationPassesForNPM(llvm::ModulePassManager &MPM, llvm::OptimizationLevel Level); diff --git a/polly/include/polly/CodeGen/CodeGeneration.h b/polly/include/polly/CodeGen/CodeGeneration.h index 2340fbe016b49..57aec1d70cc72 100644 --- a/polly/include/polly/CodeGen/CodeGeneration.h +++ b/polly/include/polly/CodeGen/CodeGeneration.h @@ -14,7 +14,6 @@ #include "llvm/IR/PassManager.h" namespace polly { -class IslAstInfo; enum VectorizerChoice { VECTORIZER_NONE, @@ -34,8 +33,6 @@ struct CodeGenerationPass final : PassInfoMixin<CodeGenerationPass> { }; extern bool PerfMonitoring; - -bool runCodeGeneration(Scop &S, llvm::RegionInfo &RI, IslAstInfo &AI); } // namespace polly #endif // POLLY_CODEGENERATION_H diff --git a/polly/include/polly/CodeGen/IslAst.h b/polly/include/polly/CodeGen/IslAst.h index 3e1ff2c8a24da..c99a4957d6b48 100644 --- a/polly/include/polly/CodeGen/IslAst.h +++ b/polly/include/polly/CodeGen/IslAst.h @@ -21,7 +21,6 @@ #ifndef POLLY_ISLAST_H #define POLLY_ISLAST_H -#include "polly/DependenceInfo.h" #include "polly/ScopPass.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/IR/PassManager.h" @@ -173,6 +172,33 @@ struct IslAstAnalysis : AnalysisInfoMixin<IslAstAnalysis> { ScopStandardAnalysisResults &SAR); }; +class IslAstInfoWrapperPass final : public ScopPass { + std::unique_ptr<IslAstInfo> Ast; + +public: + static char ID; + + IslAstInfoWrapperPass() : ScopPass(ID) {} + + IslAstInfo &getAI() { return *Ast; } + const IslAstInfo &getAI() const { return *Ast; } + + /// Build the AST for the given SCoP @p S. + bool runOnScop(Scop &S) override; + + /// Register all analyses and transformation required. + void getAnalysisUsage(AnalysisUsage &AU) const override; + + /// Release the internal memory. + void releaseMemory() override; + + /// Print a source code representation of the program. 
+ void printScop(raw_ostream &OS, Scop &S) const override; +}; + +llvm::Pass *createIslAstInfoWrapperPassPass(); +llvm::Pass *createIslAstInfoPrinterLegacyPass(llvm::raw_ostream &OS); + struct IslAstPrinterPass final : PassInfoMixin<IslAstPrinterPass> { IslAstPrinterPass(raw_ostream &OS) : OS(OS) {} @@ -181,9 +207,11 @@ struct IslAstPrinterPass final : PassInfoMixin<IslAstPrinterPass> { raw_ostream &OS; }; - -std::unique_ptr<IslAstInfo> runIslAstGen(Scop &S, - DependenceAnalysis::Result &DA); } // namespace polly +namespace llvm { +void initializeIslAstInfoWrapperPassPass(llvm::PassRegistry &); +void initializeIslAstInfoPrinterLegacyPassPass(llvm::PassRegistry &); +} // namespace llvm + #endif // POLLY_ISLAST_H diff --git a/polly/include/polly/CodePreparation.h b/polly/include/polly/CodePreparation.h index 1a15e3d4d5a29..c6bc526db209d 100644 --- a/polly/include/polly/CodePreparation.h +++ b/polly/include/polly/CodePreparation.h @@ -15,12 +15,6 @@ #include "llvm/IR/PassManager.h" -namespace llvm { -class DominatorTree; -class LoopInfo; -class RegionInfo; -} // namespace llvm - namespace polly { struct CodePreparationPass final : llvm::PassInfoMixin<CodePreparationPass> { llvm::PreservedAnalyses run(llvm::Function &F, diff --git a/polly/include/polly/DeLICM.h b/polly/include/polly/DeLICM.h index 63fc509e0bd46..0e03c04079480 100644 --- a/polly/include/polly/DeLICM.h +++ b/polly/include/polly/DeLICM.h @@ -21,10 +21,15 @@ #include "isl/isl-noexceptions.h" namespace llvm { +class PassRegistry; +class Pass; class raw_ostream; } // namespace llvm namespace polly { +/// Create a new DeLICM pass instance. +llvm::Pass *createDeLICMWrapperPass(); +llvm::Pass *createDeLICMPrinterLegacyPass(llvm::raw_ostream &OS); struct DeLICMPass final : llvm::PassInfoMixin<DeLICMPass> { DeLICMPass() {} @@ -54,7 +59,11 @@ bool isConflicting(isl::union_set ExistingOccupied, isl::union_map ProposedWrites, llvm::raw_ostream *OS = nullptr, unsigned Indent = 0); -bool runDeLICM(Scop &S); } // namespace polly +namespace llvm { +void initializeDeLICMWrapperPassPass(llvm::PassRegistry &); +void initializeDeLICMPrinterLegacyPassPass(llvm::PassRegistry &); +} // namespace llvm + #endif /* POLLY_DELICM_H */ diff --git a/polly/include/polly/DeadCodeElimination.h b/polly/include/polly/DeadCodeElimination.h index 4d8da56c76eec..d416afa030c56 100644 --- a/polly/include/polly/DeadCodeElimination.h +++ b/polly/include/polly/DeadCodeElimination.h @@ -13,10 +13,16 @@ #ifndef POLLY_DEADCODEELIMINATION_H #define POLLY_DEADCODEELIMINATION_H -#include "polly/DependenceInfo.h" #include "polly/ScopPass.h" +namespace llvm { +class PassRegistry; +class Pass; +class raw_ostream; +} // namespace llvm + namespace polly { +llvm::Pass *createDeadCodeElimWrapperPass(); struct DeadCodeElimPass final : llvm::PassInfoMixin<DeadCodeElimPass> { DeadCodeElimPass() {} @@ -25,7 +31,10 @@ struct DeadCodeElimPass final : llvm::PassInfoMixin<DeadCodeElimPass> { ScopStandardAnalysisResults &SAR, SPMUpdater &U); }; -bool runDeadCodeElim(Scop &S, DependenceAnalysis::Result &DA); } // namespace polly +namespace llvm { +void initializeDeadCodeElimWrapperPassPass(llvm::PassRegistry &); +} // namespace llvm + #endif /* POLLY_DEADCODEELIMINATION_H */ diff --git a/polly/include/polly/DependenceInfo.h b/polly/include/polly/DependenceInfo.h index 88ea468dd5473..d562ad80592f2 100644 --- a/polly/include/polly/DependenceInfo.h +++ b/polly/include/polly/DependenceInfo.h @@ -145,6 +145,7 @@ class Dependences final { friend struct DependenceAnalysis; friend struct 
DependenceInfoPrinterPass; friend class DependenceInfo; + friend class DependenceInfoWrapperPass; /// Destructor that will free internal objects. ~Dependences() { releaseMemory(); } @@ -191,8 +192,6 @@ class Dependences final { const AnalysisLevel Level; }; -extern Dependences::AnalysisLevel OptAnalysisLevel; - struct DependenceAnalysis final : public AnalysisInfoMixin<DependenceAnalysis> { static AnalysisKey Key; struct Result { @@ -233,7 +232,108 @@ struct DependenceInfoPrinterPass final raw_ostream &OS; }; -DependenceAnalysis::Result runDependenceAnalysis(Scop &S); +class DependenceInfo final : public ScopPass { +public: + static char ID; + + /// Construct a new DependenceInfo pass. + DependenceInfo() : ScopPass(ID) {} + + /// Return the dependence information for the current SCoP. + /// + /// @param Level The granularity of dependence analysis result. + /// + /// @return The dependence analysis result + /// + const Dependences &getDependences(Dependences::AnalysisLevel Level); + + /// Recompute dependences from schedule and memory accesses. + const Dependences &recomputeDependences(Dependences::AnalysisLevel Level); + + /// Invalidate the dependence information and recompute it when needed again. + /// May be required when the underlying Scop was changed in a way that would + /// add new dependencies (e.g. between new statement instances inserted into + /// the SCoP) or intentionally break existing ones. It is not required when + /// updating the schedule that conforms to the existing dependencies. + void abandonDependences(); + + /// Compute the dependence information for the SCoP @p S. + bool runOnScop(Scop &S) override; + + /// Print the dependences for the given SCoP to @p OS. + void printScop(raw_ostream &OS, Scop &) const override; + + /// Release the internal memory. + void releaseMemory() override { + for (auto &d : D) + d.reset(); + } + + /// Register all analyses and transformation required. + void getAnalysisUsage(AnalysisUsage &AU) const override; + +private: + Scop *S; + + /// Dependences struct for the current SCoP. + std::unique_ptr<Dependences> D[Dependences::NumAnalysisLevels]; +}; + +llvm::Pass *createDependenceInfoPass(); +llvm::Pass *createDependenceInfoPrinterLegacyPass(llvm::raw_ostream &OS); + +/// Construct a new DependenceInfoWrapper pass. +class DependenceInfoWrapperPass final : public FunctionPass { +public: + static char ID; + + /// Construct a new DependenceInfoWrapper pass. + DependenceInfoWrapperPass() : FunctionPass(ID) {} + + /// Return the dependence information for the given SCoP. + /// + /// @param S SCoP object. + /// @param Level The granularity of dependence analysis result. + /// + /// @return The dependence analysis result + /// + const Dependences &getDependences(Scop *S, Dependences::AnalysisLevel Level); + + /// Recompute dependences from schedule and memory accesses. + const Dependences &recomputeDependences(Scop *S, + Dependences::AnalysisLevel Level); + + /// Compute the dependence information on-the-fly for the function. + bool runOnFunction(Function &F) override; + + /// Print the dependences for the current function to @p OS. + void print(raw_ostream &OS, const Module *M = nullptr) const override; + + /// Release the internal memory. + void releaseMemory() override { ScopToDepsMap.clear(); } + + /// Register all analyses and transformation required. 
+ void getAnalysisUsage(AnalysisUsage &AU) const override; + +private: + using ScopToDepsMapTy = DenseMap<Scop *, std::unique_ptr<Dependences>>; + + /// Scop to Dependence map for the current function. + ScopToDepsMapTy ScopToDepsMap; +}; + +llvm::Pass *createDependenceInfoWrapperPassPass(); +llvm::Pass * +createDependenceInfoPrinterLegacyFunctionPass(llvm::raw_ostream &OS); + } // namespace polly +namespace llvm { +void initializeDependenceInfoPass(llvm::PassRegistry &); +void initializeDependenceInfoPrinterLegacyPassPass(llvm::PassRegistry &); +void initializeDependenceInfoWrapperPassPass(llvm::PassRegistry &); +void initializeDependenceInfoPrinterLegacyFunctionPassPass( + llvm::PassRegistry &); +} // namespace llvm + #endif diff --git a/polly/include/polly/FlattenSchedule.h b/polly/include/polly/FlattenSchedule.h index 154344d2f5c3e..3ef3c304243df 100644 --- a/polly/include/polly/FlattenSchedule.h +++ b/polly/include/polly/FlattenSchedule.h @@ -15,10 +15,20 @@ #ifndef POLLY_FLATTENSCHEDULE_H #define POLLY_FLATTENSCHEDULE_H -namespace polly { -class Scop; +namespace llvm { +class PassRegistry; +class Pass; +class raw_ostream; +} // namespace llvm -void runFlattenSchedulePass(Scop &S); +namespace polly { +llvm::Pass *createFlattenSchedulePass(); +llvm::Pass *createFlattenSchedulePrinterLegacyPass(llvm::raw_ostream &OS); } // namespace polly +namespace llvm { +void initializeFlattenSchedulePass(llvm::PassRegistry &); +void initializeFlattenSchedulePrinterLegacyPassPass(llvm::PassRegistry &); +} // namespace llvm + #endif /* POLLY_FLATTENSCHEDULE_H */ diff --git a/polly/include/polly/ForwardOpTree.h b/polly/include/polly/ForwardOpTree.h index 8b2ece1f08e15..b5da0f513ab78 100644 --- a/polly/include/polly/ForwardOpTree.h +++ b/polly/include/polly/ForwardOpTree.h @@ -15,7 +15,13 @@ #include "polly/ScopPass.h" +namespace llvm { +class PassRegistry; +} // namespace llvm + namespace polly { +llvm::Pass *createForwardOpTreeWrapperPass(); +llvm::Pass *createForwardOpTreePrinterLegacyPass(llvm::raw_ostream &OS); struct ForwardOpTreePass final : llvm::PassInfoMixin<ForwardOpTreePass> { ForwardOpTreePass() {} @@ -35,15 +41,11 @@ struct ForwardOpTreePrinterPass final llvm::raw_ostream &OS; }; -/// Pass that redirects scalar reads to array elements that are known to contain -/// the same value. -/// -/// This reduces the number of scalar accesses and therefore potentially -/// increases the freedom of the scheduler. In the ideal case, all reads of a -/// scalar definition are redirected (We currently do not care about removing -/// the write in this case). This is also useful for the main DeLICM pass as -/// there are less scalars to be mapped. -bool runForwardOpTree(Scop &S); } // namespace polly +namespace llvm { +void initializeForwardOpTreeWrapperPassPass(PassRegistry &); +void initializeForwardOpTreePrinterLegacyPassPass(PassRegistry &); +} // namespace llvm + #endif // POLLY_FORWARDOPTREE_H diff --git a/polly/include/polly/JSONExporter.h b/polly/include/polly/JSONExporter.h index 82a881c737064..958f95ea11404 100644 --- a/polly/include/polly/JSONExporter.h +++ b/polly/include/polly/JSONExporter.h @@ -9,11 +9,13 @@ #ifndef POLLY_JSONEXPORTER_H #define POLLY_JSONEXPORTER_H -#include "polly/DependenceInfo.h" #include "polly/ScopPass.h" #include "llvm/IR/PassManager.h" namespace polly { +llvm::Pass *createJSONExporterPass(); +llvm::Pass *createJSONImporterPass(); +llvm::Pass *createJSONImporterPrinterLegacyPass(llvm::raw_ostream &OS); /// This pass exports a scop to a jscop file. 
The filename is generated from the /// concatenation of the function and scop name. @@ -28,9 +30,12 @@ struct JSONImportPass final : llvm::PassInfoMixin<JSONExportPass> { llvm::PreservedAnalyses run(Scop &, ScopAnalysisManager &, ScopStandardAnalysisResults &, SPMUpdater &); }; - -void runImportJSON(Scop &S, DependenceAnalysis::Result &DA); -void runExportJSON(Scop &S); } // namespace polly +namespace llvm { +void initializeJSONExporterPass(llvm::PassRegistry &); +void initializeJSONImporterPass(llvm::PassRegistry &); +void initializeJSONImporterPrinterLegacyPassPass(llvm::PassRegistry &); +} // namespace llvm + #endif /* POLLY_JSONEXPORTER_H */ diff --git a/polly/include/polly/LinkAllPasses.h b/polly/include/polly/LinkAllPasses.h new file mode 100644 index 0000000000000..9978344c73e9f --- /dev/null +++ b/polly/include/polly/LinkAllPasses.h @@ -0,0 +1,156 @@ +//===- polly/LinkAllPasses.h ----------- Reference All Passes ---*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This header file pulls in all transformation and analysis passes for tools +// like opt and bugpoint that need this functionality. +// +//===----------------------------------------------------------------------===// + +#ifndef POLLY_LINKALLPASSES_H +#define POLLY_LINKALLPASSES_H + +#include "polly/Config/config.h" +#include "polly/Support/DumpFunctionPass.h" +#include "polly/Support/DumpModulePass.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/AlwaysTrue.h" + +namespace llvm { +class Pass; +class PassRegistry; +} // namespace llvm + +namespace polly { +llvm::Pass *createCodePreparationPass(); +llvm::Pass *createScopInlinerPass(); +llvm::Pass *createDeadCodeElimWrapperPass(); +llvm::Pass *createDependenceInfoPass(); +llvm::Pass *createDependenceInfoPrinterLegacyPass(llvm::raw_ostream &OS); +llvm::Pass *createDependenceInfoWrapperPassPass(); +llvm::Pass * +createDependenceInfoPrinterLegacyFunctionPass(llvm::raw_ostream &OS); +llvm::Pass *createDOTOnlyPrinterWrapperPass(); +llvm::Pass *createDOTOnlyViewerWrapperPass(); +llvm::Pass *createDOTPrinterWrapperPass(); +llvm::Pass *createDOTViewerWrapperPass(); +llvm::Pass *createJSONExporterPass(); +llvm::Pass *createJSONImporterPass(); +llvm::Pass *createJSONImporterPrinterLegacyPass(llvm::raw_ostream &OS); +llvm::Pass *createPollyCanonicalizePass(); +llvm::Pass *createScopDetectionWrapperPassPass(); +llvm::Pass *createScopDetectionPrinterLegacyPass(llvm::raw_ostream &OS); +llvm::Pass *createScopInfoRegionPassPass(); +llvm::Pass *createScopInfoPrinterLegacyRegionPass(llvm::raw_ostream &OS); +llvm::Pass *createScopInfoWrapperPassPass(); +llvm::Pass *createScopInfoPrinterLegacyFunctionPass(llvm::raw_ostream &OS); +llvm::Pass *createIslAstInfoWrapperPassPass(); +llvm::Pass *createIslAstInfoPrinterLegacyPass(llvm::raw_ostream &OS); +llvm::Pass *createCodeGenerationPass(); +llvm::Pass *createIslScheduleOptimizerWrapperPass(); +llvm::Pass *createIslScheduleOptimizerPrinterLegacyPass(llvm::raw_ostream &OS); +llvm::Pass *createFlattenSchedulePass(); +llvm::Pass *createFlattenSchedulePrinterLegacyPass(llvm::raw_ostream &OS); +llvm::Pass *createForwardOpTreeWrapperPass(); +llvm::Pass *createForwardOpTreePrinterLegacyPass(llvm::raw_ostream &OS); +llvm::Pass *createDeLICMWrapperPass(); +llvm::Pass 
*createDeLICMPrinterLegacyPass(llvm::raw_ostream &OS); +llvm::Pass *createMaximalStaticExpansionPass(); +llvm::Pass *createSimplifyWrapperPass(int); +llvm::Pass *createSimplifyPrinterLegacyPass(llvm::raw_ostream &OS); +llvm::Pass *createPruneUnprofitableWrapperPass(); + +extern char &CodePreparationID; +} // namespace polly + +namespace { +struct PollyForcePassLinking { + PollyForcePassLinking() { + // We must reference the passes in such a way that compilers will not + // delete them as dead code, even with whole program optimization, yet + // the references are effectively a NO-OP. + if (llvm::getNonFoldableAlwaysTrue()) + return; + + polly::createCodePreparationPass(); + polly::createDeadCodeElimWrapperPass(); + polly::createDependenceInfoPass(); + polly::createDependenceInfoPrinterLegacyPass(llvm::outs()); + polly::createDependenceInfoWrapperPassPass(); + polly::createDependenceInfoPrinterLegacyFunctionPass(llvm::outs()); + polly::createDOTOnlyPrinterWrapperPass(); + polly::createDOTOnlyViewerWrapperPass(); + polly::createDOTPrinterWrapperPass(); + polly::createDOTViewerWrapperPass(); + polly::createJSONExporterPass(); + polly::createJSONImporterPass(); + polly::createJSONImporterPrinterLegacyPass(llvm::outs()); + polly::createScopDetectionWrapperPassPass(); + polly::createScopDetectionPrinterLegacyPass(llvm::outs()); + polly::createScopInfoRegionPassPass(); + polly::createScopInfoPrinterLegacyRegionPass(llvm::outs()); + polly::createScopInfoWrapperPassPass(); + polly::createScopInfoPrinterLegacyFunctionPass(llvm::outs()); + polly::createPollyCanonicalizePass(); + polly::createIslAstInfoWrapperPassPass(); + polly::createIslAstInfoPrinterLegacyPass(llvm::outs()); + polly::createCodeGenerationPass(); + polly::createIslScheduleOptimizerWrapperPass(); + polly::createIslScheduleOptimizerPrinterLegacyPass(llvm::outs()); + polly::createMaximalStaticExpansionPass(); + polly::createFlattenSchedulePass(); + polly::createFlattenSchedulePrinterLegacyPass(llvm::errs()); + polly::createForwardOpTreeWrapperPass(); + polly::createForwardOpTreePrinterLegacyPass(llvm::errs()); + polly::createDeLICMWrapperPass(); + polly::createDeLICMPrinterLegacyPass(llvm::outs()); + polly::createDumpModuleWrapperPass("", true); + polly::createDumpFunctionWrapperPass(""); + polly::createSimplifyWrapperPass(0); + polly::createSimplifyPrinterLegacyPass(llvm::outs()); + polly::createPruneUnprofitableWrapperPass(); + } +} PollyForcePassLinking; // Force link by creating a global definition. 
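The guard in the constructor above is what makes this idiom work: llvm::getNonFoldableAlwaysTrue() (from llvm/Support/AlwaysTrue.h, included at the top of this header) returns true at runtime but is opaque to the optimizer, so the factory calls after the early return are never executed yet can never be proven dead. A minimal sketch of the same pattern, using a hypothetical createFooPass() factory in place of the real Polly list above:

#include "llvm/Support/AlwaysTrue.h"

namespace llvm {
class Pass;
} // namespace llvm

// Hypothetical factory standing in for any createXxxPass() declaration
// above; only the reference matters here, never the returned object.
llvm::Pass *createFooPass();

namespace {
struct ForceFooLinking {
  ForceFooLinking() {
    // Always taken at runtime, but the optimizer cannot fold it away, so
    // the call below survives as a live reference in the object file.
    if (llvm::getNonFoldableAlwaysTrue())
      return;
    (void)createFooPass(); // never executed
  }
} ForceFooLinking; // global constructor forces the link-time reference
} // namespace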
+} // namespace + +namespace llvm { +void initializeCodePreparationPass(llvm::PassRegistry &); +void initializeScopInlinerWrapperPassPass(llvm::PassRegistry &); +void initializeScopDetectionWrapperPassPass(llvm::PassRegistry &); +void initializeScopDetectionPrinterLegacyPassPass(llvm::PassRegistry &); +void initializeScopInfoRegionPassPass(PassRegistry &); +void initializeScopInfoPrinterLegacyRegionPassPass(llvm::PassRegistry &); +void initializeScopInfoWrapperPassPass(PassRegistry &); +void initializeScopInfoPrinterLegacyFunctionPassPass(PassRegistry &); +void initializeDeadCodeElimWrapperPassPass(llvm::PassRegistry &); +void initializeJSONExporterPass(llvm::PassRegistry &); +void initializeJSONImporterPass(llvm::PassRegistry &); +void initializeJSONImporterPrinterLegacyPassPass(llvm::PassRegistry &); +void initializeDependenceInfoPass(llvm::PassRegistry &); +void initializeDependenceInfoPrinterLegacyPassPass(llvm::PassRegistry &); +void initializeDependenceInfoWrapperPassPass(llvm::PassRegistry &); +void initializeDependenceInfoPrinterLegacyFunctionPassPass( + llvm::PassRegistry &); +void initializeIslAstInfoWrapperPassPass(llvm::PassRegistry &); +void initializeIslAstInfoPrinterLegacyPassPass(llvm::PassRegistry &); +void initializeCodeGenerationPass(llvm::PassRegistry &); +void initializeIslScheduleOptimizerWrapperPassPass(llvm::PassRegistry &); +void initializeIslScheduleOptimizerPrinterLegacyPassPass(llvm::PassRegistry &); +void initializeMaximalStaticExpanderWrapperPassPass(llvm::PassRegistry &); +void initializePollyCanonicalizePass(llvm::PassRegistry &); +void initializeFlattenSchedulePass(llvm::PassRegistry &); +void initializeFlattenSchedulePrinterLegacyPassPass(llvm::PassRegistry &); +void initializeForwardOpTreeWrapperPassPass(llvm::PassRegistry &); +void initializeForwardOpTreePrinterLegacyPassPass(PassRegistry &); +void initializeDeLICMWrapperPassPass(llvm::PassRegistry &); +void initializeDeLICMPrinterLegacyPassPass(llvm::PassRegistry &); +void initializeSimplifyWrapperPassPass(llvm::PassRegistry &); +void initializeSimplifyPrinterLegacyPassPass(llvm::PassRegistry &); +void initializePruneUnprofitableWrapperPassPass(llvm::PassRegistry &); +} // namespace llvm + +#endif diff --git a/polly/include/polly/MaximalStaticExpansion.h b/polly/include/polly/MaximalStaticExpansion.h index 1f9fbcb1d6a70..88827b2700887 100644 --- a/polly/include/polly/MaximalStaticExpansion.h +++ b/polly/include/polly/MaximalStaticExpansion.h @@ -14,7 +14,6 @@ #ifndef POLLY_MAXIMALSTATICEXPANSION_H #define POLLY_MAXIMALSTATICEXPANSION_H -#include "polly/DependenceInfo.h" #include "polly/ScopPass.h" #include "llvm/IR/PassManager.h" @@ -38,7 +37,6 @@ struct MaximalStaticExpansionPrinterPass llvm::raw_ostream &OS; }; -void runMaximalStaticExpansion(Scop &S, DependenceAnalysis::Result &DI); } // namespace polly #endif /* POLLY_MAXIMALSTATICEXPANSION_H */ diff --git a/polly/include/polly/Pass/PhaseManager.h b/polly/include/polly/Pass/PhaseManager.h deleted file mode 100644 index 9ff9bbf02d71f..0000000000000 --- a/polly/include/polly/Pass/PhaseManager.h +++ /dev/null @@ -1,127 +0,0 @@ -//===------ PhaseManager.h --------------------------------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// Implements the sequence of operations on SCoPs, called phases. It is itelf -// not a pass in either pass manager, but used from PollyFunctionPass or -// PollyModulePass. -// -//===----------------------------------------------------------------------===// - -#ifndef POLLY_PASS_PHASEMANAGER_H_ -#define POLLY_PASS_PHASEMANAGER_H_ - -#include "polly/DependenceInfo.h" -#include "llvm/ADT/Bitset.h" -#include <stddef.h> - -namespace llvm { -class Function; -class Error; -} // namespace llvm - -namespace polly { - -/// Phases (in execution order) within the Polly pass. -enum class PassPhase { - None, - - Prepare, - - Detection, - PrintDetect, - DotScops, - DotScopsOnly, - ViewScops, - ViewScopsOnly, - - ScopInfo, - PrintScopInfo, - - Flatten, - - Dependences, - PrintDependences, - - ImportJScop, - Simplify0, - Optree, - DeLICM, - Simplify1, - DeadCodeElimination, - MaximumStaticExtension, - PruneUnprofitable, - Optimization, - ExportJScop, - AstGen, - CodeGen, - - PassPhaseFirst = Prepare, - PassPhaseLast = CodeGen -}; - -StringRef getPhaseName(PassPhase Phase); -PassPhase parsePhase(StringRef Name); -bool dependsOnDependenceInfo(PassPhase Phase); - -/// Options for the Polly pass. -class PollyPassOptions { - /// For each Polly phase, whether it should be executed. - /// Since PassPhase::None is unused, bit positions are shifted by one. - llvm::Bitset<static_cast<size_t>(PassPhase::PassPhaseLast) - - static_cast<size_t>(PassPhase::PassPhaseFirst) + 1> - PhaseEnabled; - -public: - bool ViewAll = false; - std::string ViewFilter; - Dependences::AnalysisLevel PrintDepsAnalysisLevel = Dependences::AL_Statement; - - bool isPhaseEnabled(PassPhase Phase) const { - assert(Phase != PassPhase::None); - unsigned BitPos = static_cast<size_t>(Phase) - - static_cast<size_t>(PassPhase::PassPhaseFirst); - return PhaseEnabled[BitPos]; - } - - void setPhaseEnabled(PassPhase Phase, bool Enabled = true) { - assert(Phase != PassPhase::None); - unsigned BitPos = static_cast<size_t>(Phase) - - static_cast<size_t>(PassPhase::PassPhaseFirst); - if (Enabled) - PhaseEnabled.set(BitPos); - else - PhaseEnabled.reset(BitPos); - } - - /// Enable all phases that are necessary for a roundtrip from LLVM-IR back to - /// LLVM-IR. - void enableEnd2End(); - - /// Enabled the default optimization phases. - void enableDefaultOpts(); - - /// Disable all phases following \p Phase. - /// Useful when regression testing that particular phase and everything after - /// it is not of interest. - void disableAfter(PassPhase Phase); - - /// Check whether the options are coherent relative to each other. - llvm::Error checkConsistency() const; -}; - -/// Run Polly and its phases on \p F. -bool runPollyPass(Function &F, llvm::FunctionAnalysisManager &FAM, - PollyPassOptions Opts); -} // namespace polly - -/// Make llvm::enum_seq<PassPhase> work. 
-template <> struct llvm::enum_iteration_traits<polly::PassPhase> { - static constexpr bool is_iterable = true; -}; - -#endif /* POLLY_PASS_PHASEMANAGER_H_ */ diff --git a/polly/include/polly/Pass/PollyFunctionPass.h b/polly/include/polly/Pass/PollyFunctionPass.h deleted file mode 100644 index dd0d4e77d7a80..0000000000000 --- a/polly/include/polly/Pass/PollyFunctionPass.h +++ /dev/null @@ -1,32 +0,0 @@ -//===------ PollyFunctionPass.h - Polly function pass ---------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef POLLY_PASS_POLLYFUNCTIONPASS_H_ -#define POLLY_PASS_POLLYFUNCTIONPASS_H_ - -#include "polly/Pass/PhaseManager.h" -#include "llvm/IR/Analysis.h" -#include "llvm/IR/PassManager.h" -#include <utility> - -namespace polly { - -class PollyFunctionPass : public llvm::PassInfoMixin<PollyFunctionPass> { -public: - PollyFunctionPass() {} - PollyFunctionPass(PollyPassOptions Opts) : Opts(std::move(Opts)) {} - - llvm::PreservedAnalyses run(llvm::Function &F, - llvm::FunctionAnalysisManager &); - -private: - PollyPassOptions Opts; -}; -} // namespace polly - -#endif /* POLLY_PASS_POLLYFUNCTIONPASS_H_ */ diff --git a/polly/include/polly/Pass/PollyModulePass.h b/polly/include/polly/Pass/PollyModulePass.h deleted file mode 100644 index 2214bbf3d143e..0000000000000 --- a/polly/include/polly/Pass/PollyModulePass.h +++ /dev/null @@ -1,30 +0,0 @@ -//===------ PollyModulePass.h - Polly module pass -------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef POLLY_PASS_POLLYMODULEPASS_H_ -#define POLLY_PASS_POLLYMODULEPASS_H_ - -#include "polly/Pass/PhaseManager.h" -#include "llvm/IR/PassManager.h" - -namespace polly { - -class PollyModulePass : public llvm::PassInfoMixin<PollyModulePass> { -public: - PollyModulePass() {} - PollyModulePass(PollyPassOptions Opts) : Opts(std::move(Opts)) {} - - llvm::PreservedAnalyses run(llvm::Module &M, llvm::ModuleAnalysisManager &); - -private: - PollyPassOptions Opts; -}; - -} // namespace polly - -#endif /* POLLY_PASS_POLLYMODULEPASS_H_ */ diff --git a/polly/include/polly/PruneUnprofitable.h b/polly/include/polly/PruneUnprofitable.h index 16b76cc62f1d2..2d285cce69ad4 100644 --- a/polly/include/polly/PruneUnprofitable.h +++ b/polly/include/polly/PruneUnprofitable.h @@ -15,7 +15,13 @@ #include "polly/ScopPass.h" +namespace llvm { +class Pass; +class PassRegistry; +} // namespace llvm + namespace polly { +llvm::Pass *createPruneUnprofitableWrapperPass(); struct PruneUnprofitablePass final : llvm::PassInfoMixin<PruneUnprofitablePass> { @@ -24,8 +30,10 @@ struct PruneUnprofitablePass final llvm::PreservedAnalyses run(Scop &S, ScopAnalysisManager &SAM, ScopStandardAnalysisResults &SAR, SPMUpdater &U); }; - -bool runPruneUnprofitable(Scop &S); } // namespace polly +namespace llvm { +void initializePruneUnprofitableWrapperPassPass(PassRegistry &); +} + #endif // POLLY_PRUNEUNPROFITABLE_H diff --git a/polly/include/polly/RegisterPasses.h b/polly/include/polly/RegisterPasses.h index 7819462cb0c36..3a81e1ba7487d 100644 --- a/polly/include/polly/RegisterPasses.h +++ b/polly/include/polly/RegisterPasses.h @@ -14,6 +14,7 @@ #define POLLY_REGISTER_PASSES_H namespace llvm { +class PassRegistry; class PassBuilder; struct PassPluginLibraryInfo; namespace legacy { @@ -22,6 +23,7 @@ class PassManagerBase; } // namespace llvm namespace polly { +void initializePollyPasses(llvm::PassRegistry &Registry); void registerPollyPasses(llvm::PassBuilder &PB); } // namespace polly diff --git a/polly/include/polly/ScheduleOptimizer.h b/polly/include/polly/ScheduleOptimizer.h index ac45572ba7ed5..3e17eeff49ae3 100644 --- a/polly/include/polly/ScheduleOptimizer.h +++ b/polly/include/polly/ScheduleOptimizer.h @@ -9,10 +9,16 @@ #ifndef POLLY_SCHEDULEOPTIMIZER_H #define POLLY_SCHEDULEOPTIMIZER_H -#include "polly/DependenceInfo.h" #include "polly/ScopPass.h" +namespace llvm { +class Pass; +class PassRegistry; +} // namespace llvm + namespace polly { +llvm::Pass *createIslScheduleOptimizerWrapperPass(); +llvm::Pass *createIslScheduleOptimizerPrinterLegacyPass(llvm::raw_ostream &OS); struct IslScheduleOptimizerPass final : llvm::PassInfoMixin<IslScheduleOptimizerPass> { @@ -32,9 +38,11 @@ struct IslScheduleOptimizerPrinterPass final private: llvm::raw_ostream &OS; }; - -void runIslScheduleOptimizer(Scop &S, llvm::TargetTransformInfo *TTI, - DependenceAnalysis::Result &Deps); } // namespace polly +namespace llvm { +void initializeIslScheduleOptimizerWrapperPassPass(llvm::PassRegistry &); +void initializeIslScheduleOptimizerPrinterLegacyPassPass(llvm::PassRegistry &); +} // namespace llvm + #endif // POLLY_SCHEDULEOPTIMIZER_H diff --git a/polly/include/polly/ScopDetection.h b/polly/include/polly/ScopDetection.h index ded1c88206430..5759f75463284 100644 --- a/polly/include/polly/ScopDetection.h +++ b/polly/include/polly/ScopDetection.h @@ -52,6 +52,7 @@ #include "llvm/Analysis/AliasSetTracker.h" 
#include "llvm/Analysis/RegionInfo.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" +#include "llvm/Pass.h" #include <set> namespace polly { @@ -67,6 +68,7 @@ using llvm::DenseMap; using llvm::DominatorTree; using llvm::Function; using llvm::FunctionAnalysisManager; +using llvm::FunctionPass; using llvm::IntrinsicInst; using llvm::LoopInfo; using llvm::Module; @@ -629,6 +631,31 @@ struct ScopAnalysisPrinterPass final : PassInfoMixin<ScopAnalysisPrinterPass> { raw_ostream &OS; }; + +class ScopDetectionWrapperPass final : public FunctionPass { + std::unique_ptr<ScopDetection> Result; + +public: + ScopDetectionWrapperPass(); + + /// @name FunctionPass interface + ///@{ + static char ID; + void getAnalysisUsage(AnalysisUsage &AU) const override; + void releaseMemory() override; + bool runOnFunction(Function &F) override; + void print(raw_ostream &OS, const Module *M = nullptr) const override; + ///@} + + ScopDetection &getSD() const { return *Result; } +}; + +llvm::Pass *createScopDetectionPrinterLegacyPass(llvm::raw_ostream &OS); } // namespace polly +namespace llvm { +void initializeScopDetectionWrapperPassPass(llvm::PassRegistry &); +void initializeScopDetectionPrinterLegacyPassPass(llvm::PassRegistry &); +} // namespace llvm + #endif // POLLY_SCOPDETECTION_H diff --git a/polly/include/polly/ScopGraphPrinter.h b/polly/include/polly/ScopGraphPrinter.h index c4e669f0c3503..b57732ad3d70d 100644 --- a/polly/include/polly/ScopGraphPrinter.h +++ b/polly/include/polly/ScopGraphPrinter.h @@ -70,9 +70,6 @@ struct DOTGraphTraits<polly::ScopDetection *> : DOTGraphTraits<RegionNode *> { namespace polly { -extern std::string ViewFilter; -extern bool ViewAll; - struct ScopViewer final : llvm::DOTGraphTraitsViewer<ScopAnalysis, false> { ScopViewer() : llvm::DOTGraphTraitsViewer<ScopAnalysis, false>("scops") {} diff --git a/polly/include/polly/ScopInfo.h b/polly/include/polly/ScopInfo.h index 7541ddc21e39f..f700144165d53 100644 --- a/polly/include/polly/ScopInfo.h +++ b/polly/include/polly/ScopInfo.h @@ -23,11 +23,13 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/MapVector.h" #include "llvm/ADT/SetVector.h" +#include "llvm/Analysis/RegionPass.h" #include "llvm/IR/DebugLoc.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/PassManager.h" #include "llvm/IR/ValueHandle.h" +#include "llvm/Pass.h" #include "isl/isl-noexceptions.h" #include <cassert> #include <cstddef> @@ -53,6 +55,8 @@ using llvm::MemIntrinsic; using llvm::PassInfoMixin; using llvm::PHINode; using llvm::RegionNode; +using llvm::RegionPass; +using llvm::RGPassManager; using llvm::SetVector; using llvm::SmallPtrSetImpl; using llvm::SmallVector; @@ -2670,6 +2674,39 @@ class Scop final { /// Print Scop scop to raw_ostream OS. raw_ostream &operator<<(raw_ostream &OS, const Scop &scop); +/// The legacy pass manager's analysis pass to compute scop information +/// for a region. +class ScopInfoRegionPass final : public RegionPass { + /// The Scop pointer which is used to construct a Scop. + std::unique_ptr<Scop> S; + +public: + static char ID; // Pass identification, replacement for typeid + + ScopInfoRegionPass() : RegionPass(ID) {} + ~ScopInfoRegionPass() override = default; + + /// Build Scop object, the Polly IR of static control + /// part for the current SESE-Region. + /// + /// @return If the current region is a valid for a static control part, + /// return the Polly IR representing this static control part, + /// return null otherwise. 
+ Scop *getScop() { return S.get(); } + const Scop *getScop() const { return S.get(); } + + /// Calculate the polyhedral scop information for a given Region. + bool runOnRegion(Region *R, RGPassManager &RGM) override; + + void releaseMemory() override { S.reset(); } + + void print(raw_ostream &O, const Module *M = nullptr) const override; + + void getAnalysisUsage(AnalysisUsage &AU) const override; +}; + +llvm::Pass *createScopInfoPrinterLegacyRegionPass(raw_ostream &OS); + class ScopInfo { public: using RegionToScopMapTy = MapVector<Region *, std::unique_ptr<Scop>>; @@ -2744,6 +2781,45 @@ struct ScopInfoPrinterPass final : PassInfoMixin<ScopInfoPrinterPass> { raw_ostream &Stream; }; + +//===----------------------------------------------------------------------===// +/// The legacy pass manager's analysis pass to compute scop information +/// for the whole function. +/// +/// This pass will maintain a map of the maximal region within a scop to its +/// scop object for all the feasible scops present in a function. +/// This pass is an alternative to the ScopInfoRegionPass in order to avoid a +/// region pass manager. +class ScopInfoWrapperPass final : public FunctionPass { + std::unique_ptr<ScopInfo> Result; + +public: + ScopInfoWrapperPass() : FunctionPass(ID) {} + ~ScopInfoWrapperPass() override = default; + + static char ID; // Pass identification, replacement for typeid + + ScopInfo *getSI() { return Result.get(); } + const ScopInfo *getSI() const { return Result.get(); } + + /// Calculate all the polyhedral scops for a given function. + bool runOnFunction(Function &F) override; + + void releaseMemory() override { Result.reset(); } + + void print(raw_ostream &O, const Module *M = nullptr) const override; + + void getAnalysisUsage(AnalysisUsage &AU) const override; +}; + +llvm::Pass *createScopInfoPrinterLegacyFunctionPass(llvm::raw_ostream &OS); } // end namespace polly +namespace llvm { +void initializeScopInfoRegionPassPass(PassRegistry &); +void initializeScopInfoPrinterLegacyRegionPassPass(PassRegistry &); +void initializeScopInfoWrapperPassPass(PassRegistry &); +void initializeScopInfoPrinterLegacyFunctionPassPass(PassRegistry &); +} // end namespace llvm + #endif // POLLY_SCOPINFO_H diff --git a/polly/include/polly/ScopInliner.h b/polly/include/polly/ScopInliner.h index ae1938f03ac70..014667804330f 100644 --- a/polly/include/polly/ScopInliner.h +++ b/polly/include/polly/ScopInliner.h @@ -23,6 +23,12 @@ class ScopInlinerPass : public llvm::PassInfoMixin<ScopInlinerPass> { llvm::LazyCallGraph &CG, llvm::CGSCCUpdateResult &UR); }; + +llvm::Pass *createScopInlinerWrapperPass(); } // namespace polly +namespace llvm { +void initializeScopInlinerWrapperPassPass(llvm::PassRegistry &); +} + #endif /* POLLY_POLLYINLINER_H */ diff --git a/polly/include/polly/ScopPass.h b/polly/include/polly/ScopPass.h index 80ccd5717f96c..144cfd1364393 100644 --- a/polly/include/polly/ScopPass.h +++ b/polly/include/polly/ScopPass.h @@ -19,6 +19,7 @@ #include "polly/ScopInfo.h" #include "llvm/ADT/PriorityWorklist.h" +#include "llvm/Analysis/RegionPass.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/PassManager.h" #include "llvm/IR/PassManagerImpl.h" @@ -154,6 +155,33 @@ using ScopPassManager = PassManager<Scop, ScopAnalysisManager, ScopStandardAnalysisResults &, SPMUpdater &>; +/// ScopPass - This class adapts the RegionPass interface to allow convenient +/// creation of passes that operate on the Polly IR. Instead of overriding +/// runOnRegion, subclasses override runOnScop. 
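For illustration, a hypothetical subclass of the ScopPass adapter declared just below might look like the following sketch (not part of this patch): it overrides only runOnScop and chains getAnalysisUsage as the adapter requires.

#include "polly/ScopPass.h"
#include "llvm/Support/raw_ostream.h"
#include <iterator>

namespace {
// Hypothetical analysis-only pass: reports the number of statements in
// each SCoP. The ScopPass adapter translates runOnRegion into runOnScop.
class CountStmtsWrapperPass final : public polly::ScopPass {
public:
  static char ID;
  CountStmtsWrapperPass() : polly::ScopPass(ID) {}

  bool runOnScop(polly::Scop &S) override {
    llvm::errs() << S.getRegion().getNameStr() << ": "
                 << std::distance(S.begin(), S.end()) << " statements\n";
    return false; // nothing was modified
  }

  void getAnalysisUsage(llvm::AnalysisUsage &AU) const override {
    polly::ScopPass::getAnalysisUsage(AU); // mandatory for subclasses
    AU.setPreservesAll();
  }
};
char CountStmtsWrapperPass::ID = 0;
} // namespace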
+class ScopPass : public RegionPass { + Scop *S; + +protected: + explicit ScopPass(char &ID) : RegionPass(ID), S(nullptr) {} + + /// runOnScop - This method must be overloaded to perform the + /// desired Polyhedral transformation or analysis. + /// + virtual bool runOnScop(Scop &S) = 0; + + /// Print method for SCoPs. + virtual void printScop(raw_ostream &OS, Scop &S) const {} + + /// getAnalysisUsage - Subclasses that override getAnalysisUsage + /// must call this. + /// + void getAnalysisUsage(AnalysisUsage &AU) const override; + +private: + bool runOnRegion(Region *R, RGPassManager &RGM) override; + void print(raw_ostream &OS, const Module *) const override; +}; + struct ScopStandardAnalysisResults { DominatorTree &DT; ScopInfo &SI; diff --git a/polly/include/polly/Simplify.h b/polly/include/polly/Simplify.h index 4565eb26edaf0..b2aa58d850fae 100644 --- a/polly/include/polly/Simplify.h +++ b/polly/include/polly/Simplify.h @@ -16,6 +16,11 @@ #include "polly/ScopPass.h" #include "llvm/ADT/SmallVector.h" +namespace llvm { +class PassRegistry; +class Pass; +} // namespace llvm + namespace polly { class MemoryAccess; class ScopStmt; @@ -36,6 +41,17 @@ class ScopStmt; /// undefined. llvm::SmallVector<MemoryAccess *, 32> getAccessesInOrder(ScopStmt &Stmt); +/// Create a Simplify pass +/// +/// @param CallNo Disambiguates this instance for when there are multiple +/// instances of this pass in the pass manager. It is used only to +/// keep the statistics apart and has no influence on the +/// simplification itself. +/// +/// @return The Simplify pass. +llvm::Pass *createSimplifyWrapperPass(int CallNo = 0); +llvm::Pass *createSimplifyPrinterLegacyPass(llvm::raw_ostream &OS); + struct SimplifyPass final : PassInfoMixin<SimplifyPass> { SimplifyPass(int CallNo = 0) : CallNo(CallNo) {} @@ -57,8 +73,11 @@ struct SimplifyPrinterPass final : PassInfoMixin<SimplifyPrinterPass> { raw_ostream &OS; int CallNo; }; - -bool runSimplify(Scop &S, int CallNo); } // namespace polly +namespace llvm { +void initializeSimplifyWrapperPassPass(llvm::PassRegistry &); +void initializeSimplifyPrinterLegacyPassPass(llvm::PassRegistry &); +} // namespace llvm + #endif /* POLLY_TRANSFORM_SIMPLIFY_H */ diff --git a/polly/include/polly/Support/DumpFunctionPass.h b/polly/include/polly/Support/DumpFunctionPass.h index af04912ed4fe2..e5c16203adb8f 100644 --- a/polly/include/polly/Support/DumpFunctionPass.h +++ b/polly/include/polly/Support/DumpFunctionPass.h @@ -16,7 +16,13 @@ #include "llvm/IR/PassManager.h" #include <string> +namespace llvm { +class FunctionPass; +class ModulePass; +} // namespace llvm + namespace polly { +llvm::FunctionPass *createDumpFunctionWrapperPass(std::string Suffix); /// A pass that isolates a function into a new Module and writes it into a file. 
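The CallNo parameter documented above exists purely so that -stats output from multiple Simplify instances can be told apart. A short sketch of how a legacy pipeline might schedule the pass twice; the function name and the surrounding pipeline are illustrative only:

#include "polly/Simplify.h"
#include "llvm/IR/LegacyPassManager.h"

// Illustrative only: two Simplify runs in one legacy pipeline, as Polly
// does before and after its mid-pipeline transforms. The distinct CallNo
// values separate the statistics; they do not change the simplification.
void addTwoSimplifyRuns(llvm::legacy::PassManagerBase &PM) {
  PM.add(polly::createSimplifyWrapperPass(/*CallNo=*/0));
  // ... transform passes (e.g. forward-op-tree, DeLICM) go here ...
  PM.add(polly::createSimplifyWrapperPass(/*CallNo=*/1));
}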
struct DumpFunctionPass final : llvm::PassInfoMixin<DumpFunctionPass> { @@ -27,6 +33,12 @@ struct DumpFunctionPass final : llvm::PassInfoMixin<DumpFunctionPass> { llvm::PreservedAnalyses run(llvm::Function &F, llvm::FunctionAnalysisManager &AM); }; + } // namespace polly +namespace llvm { +class PassRegistry; +void initializeDumpFunctionWrapperPassPass(llvm::PassRegistry &); +} // namespace llvm + #endif /* POLLY_SUPPORT_DUMPFUNCTIONPASS_H */ diff --git a/polly/include/polly/Support/DumpModulePass.h b/polly/include/polly/Support/DumpModulePass.h index 6d393a174b19b..c90bbc2484310 100644 --- a/polly/include/polly/Support/DumpModulePass.h +++ b/polly/include/polly/Support/DumpModulePass.h @@ -16,8 +16,12 @@ #include "llvm/IR/PassManager.h" #include <string> +namespace llvm { +class ModulePass; +} // namespace llvm + namespace polly { -/// A pass that prints the module into a file. +/// Create a pass that prints the module into a file. /// /// The meaning of @p Filename depends on @p IsSuffix. If IsSuffix==false, then /// the module is written to the @p Filename. If it is true, the filename is @@ -26,6 +30,10 @@ namespace polly { /// The intent of IsSuffix is to avoid the file being overwritten when /// processing multiple modules and/or with multiple dump passes in the /// pipeline. +llvm::ModulePass *createDumpModuleWrapperPass(std::string Filename, + bool IsSuffix); + +/// A pass that prints the module into a file. struct DumpModulePass final : llvm::PassInfoMixin<DumpModulePass> { std::string Filename; bool IsSuffix; @@ -38,4 +46,9 @@ struct DumpModulePass final : llvm::PassInfoMixin<DumpModulePass> { } // namespace polly +namespace llvm { +class PassRegistry; +void initializeDumpModuleWrapperPassPass(llvm::PassRegistry &); +} // namespace llvm + #endif /* POLLY_SUPPORT_DUMPMODULEPASS_H */ diff --git a/polly/include/polly/Support/ScopHelper.h b/polly/include/polly/Support/ScopHelper.h index 38b731a9f7d8d..75891525ff7b3 100644 --- a/polly/include/polly/Support/ScopHelper.h +++ b/polly/include/polly/Support/ScopHelper.h @@ -358,6 +358,14 @@ namespace polly { void simplifyRegion(llvm::Region *R, llvm::DominatorTree *DT, llvm::LoopInfo *LI, llvm::RegionInfo *RI); +/// Split the entry block of a function to store the newly inserted +/// allocations outside of all Scops. +/// +/// @param EntryBlock The entry block of the current function. +/// @param P The pass that currently running. +/// +void splitEntryBlockForAlloca(llvm::BasicBlock *EntryBlock, llvm::Pass *P); + /// Split the entry block of a function to store the newly inserted /// allocations outside of all Scops. 
/// diff --git a/polly/lib/Analysis/DependenceInfo.cpp b/polly/lib/Analysis/DependenceInfo.cpp index 5183fc5725ece..c620f40ad0724 100644 --- a/polly/lib/Analysis/DependenceInfo.cpp +++ b/polly/lib/Analysis/DependenceInfo.cpp @@ -20,6 +20,7 @@ //===----------------------------------------------------------------------===// // #include "polly/DependenceInfo.h" +#include "polly/LinkAllPasses.h" #include "polly/Options.h" #include "polly/ScopInfo.h" #include "polly/Support/GICHelper.h" @@ -41,10 +42,6 @@ using namespace llvm; #include "polly/Support/PollyDebug.h" #define DEBUG_TYPE "polly-dependence" -namespace polly { -Dependences::AnalysisLevel OptAnalysisLevel; -} - static cl::opt<int> OptComputeOut( "polly-dependences-computeout", cl::desc("Bound the dependence analysis by a maximal amount of " @@ -72,10 +69,9 @@ static cl::opt<enum AnalysisType> OptAnalysisType( "Overapproximation of dependences")), cl::Hidden, cl::init(VALUE_BASED_ANALYSIS), cl::cat(PollyCategory)); -static cl::opt<Dependences::AnalysisLevel, true> XOptAnalysisLevel( +static cl::opt<Dependences::AnalysisLevel> OptAnalysisLevel( "polly-dependences-analysis-level", cl::desc("The level of dependence analysis"), - cl::location(OptAnalysisLevel), cl::values(clEnumValN(Dependences::AL_Statement, "statement-wise", "Statement-level analysis"), clEnumValN(Dependences::AL_Reference, "reference-wise", @@ -885,7 +881,213 @@ DependenceInfoPrinterPass::run(Scop &S, ScopAnalysisManager &SAM, return PreservedAnalyses::all(); } -DependenceAnalysis::Result polly::runDependenceAnalysis(Scop &S) { - DependenceAnalysis::Result Result{S, {}}; - return Result; +const Dependences & +DependenceInfo::getDependences(Dependences::AnalysisLevel Level) { + if (Dependences *d = D[Level].get()) + return *d; + + return recomputeDependences(Level); +} + +const Dependences & +DependenceInfo::recomputeDependences(Dependences::AnalysisLevel Level) { + D[Level].reset(new Dependences(S->getSharedIslCtx(), Level)); + D[Level]->calculateDependences(*S); + return *D[Level]; +} + +void DependenceInfo::abandonDependences() { + for (std::unique_ptr<Dependences> &Deps : D) + Deps.release(); +} + +bool DependenceInfo::runOnScop(Scop &ScopVar) { + S = &ScopVar; + return false; +} + +/// Print the dependences for the given SCoP to @p OS. + +void polly::DependenceInfo::printScop(raw_ostream &OS, Scop &S) const { + if (auto d = D[OptAnalysisLevel].get()) { + d->print(OS); + return; + } + + // Otherwise create the dependences on-the-fly and print it + Dependences D(S.getSharedIslCtx(), OptAnalysisLevel); + D.calculateDependences(S); + D.print(OS); +} + +void DependenceInfo::getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequiredTransitive<ScopInfoRegionPass>(); + AU.setPreservesAll(); +} + +char DependenceInfo::ID = 0; + +Pass *polly::createDependenceInfoPass() { return new DependenceInfo(); } + +INITIALIZE_PASS_BEGIN(DependenceInfo, "polly-dependences", + "Polly - Calculate dependences", false, false); +INITIALIZE_PASS_DEPENDENCY(ScopInfoRegionPass); +INITIALIZE_PASS_END(DependenceInfo, "polly-dependences", + "Polly - Calculate dependences", false, false) + +//===----------------------------------------------------------------------===// + +namespace { +/// Print result from DependenceAnalysis. 
+class DependenceInfoPrinterLegacyPass final : public ScopPass { +public: + static char ID; + + DependenceInfoPrinterLegacyPass() : DependenceInfoPrinterLegacyPass(outs()) {} + + explicit DependenceInfoPrinterLegacyPass(llvm::raw_ostream &OS) + : ScopPass(ID), OS(OS) {} + + bool runOnScop(Scop &S) override { + DependenceInfo &P = getAnalysis<DependenceInfo>(); + + OS << "Printing analysis '" << P.getPassName() << "' for " + << "region: '" << S.getRegion().getNameStr() << "' in function '" + << S.getFunction().getName() << "':\n"; + P.printScop(OS, S); + + return false; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + ScopPass::getAnalysisUsage(AU); + AU.addRequired<DependenceInfo>(); + AU.setPreservesAll(); + } + +private: + llvm::raw_ostream &OS; +}; + +char DependenceInfoPrinterLegacyPass::ID = 0; +} // namespace + +Pass *polly::createDependenceInfoPrinterLegacyPass(raw_ostream &OS) { + return new DependenceInfoPrinterLegacyPass(OS); +} + +INITIALIZE_PASS_BEGIN(DependenceInfoPrinterLegacyPass, + "polly-print-dependences", "Polly - Print dependences", + false, false); +INITIALIZE_PASS_DEPENDENCY(DependenceInfo); +INITIALIZE_PASS_END(DependenceInfoPrinterLegacyPass, "polly-print-dependences", + "Polly - Print dependences", false, false) + +//===----------------------------------------------------------------------===// + +const Dependences & +DependenceInfoWrapperPass::getDependences(Scop *S, + Dependences::AnalysisLevel Level) { + auto It = ScopToDepsMap.find(S); + if (It != ScopToDepsMap.end()) + if (It->second) { + if (It->second->getDependenceLevel() == Level) + return *It->second; + } + return recomputeDependences(S, Level); +} + +const Dependences &DependenceInfoWrapperPass::recomputeDependences( + Scop *S, Dependences::AnalysisLevel Level) { + std::unique_ptr<Dependences> D(new Dependences(S->getSharedIslCtx(), Level)); + D->calculateDependences(*S); + auto Inserted = ScopToDepsMap.insert(std::make_pair(S, std::move(D))); + return *Inserted.first->second; } + +bool DependenceInfoWrapperPass::runOnFunction(Function &F) { + auto &SI = *getAnalysis<ScopInfoWrapperPass>().getSI(); + for (auto &It : SI) { + assert(It.second && "Invalid SCoP object!"); + recomputeDependences(It.second.get(), Dependences::AL_Access); + } + return false; +} + +void DependenceInfoWrapperPass::print(raw_ostream &OS, const Module *M) const { + for (auto &It : ScopToDepsMap) { + assert((It.first && It.second) && "Invalid Scop or Dependence object!\n"); + It.second->print(OS); + } +} + +void DependenceInfoWrapperPass::getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequiredTransitive<ScopInfoWrapperPass>(); + AU.setPreservesAll(); +} + +char DependenceInfoWrapperPass::ID = 0; + +Pass *polly::createDependenceInfoWrapperPassPass() { + return new DependenceInfoWrapperPass(); +} + +INITIALIZE_PASS_BEGIN( + DependenceInfoWrapperPass, "polly-function-dependences", + "Polly - Calculate dependences for all the SCoPs of a function", false, + false) +INITIALIZE_PASS_DEPENDENCY(ScopInfoWrapperPass); +INITIALIZE_PASS_END( + DependenceInfoWrapperPass, "polly-function-dependences", + "Polly - Calculate dependences for all the SCoPs of a function", false, + false) + +//===----------------------------------------------------------------------===// + +namespace { +/// Print result from DependenceInfoWrapperPass. 
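As a usage sketch for the wrapper implemented above (the helper name is hypothetical): a client holding DependenceInfoWrapperPass can query per-SCoP dependences, and getDependences() reuses the cached result unless it was computed at a different analysis level, in which case it recomputes.

#include "polly/DependenceInfo.h"

// Hypothetical client helper: fetch statement-level dependences for one
// SCoP through the function-wide wrapper and check that the computation
// succeeded. hasValidDependences() is false if the dependence analysis
// was aborted, e.g. by the -polly-dependences-computeout bound.
static bool hasStatementDeps(polly::DependenceInfoWrapperPass &DIWP,
                             polly::Scop &S) {
  const polly::Dependences &D =
      DIWP.getDependences(&S, polly::Dependences::AL_Statement);
  return D.hasValidDependences();
}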
+class DependenceInfoPrinterLegacyFunctionPass final : public FunctionPass { +public: + static char ID; + + DependenceInfoPrinterLegacyFunctionPass() + : DependenceInfoPrinterLegacyFunctionPass(outs()) {} + + explicit DependenceInfoPrinterLegacyFunctionPass(llvm::raw_ostream &OS) + : FunctionPass(ID), OS(OS) {} + + bool runOnFunction(Function &F) override { + DependenceInfoWrapperPass &P = getAnalysis<DependenceInfoWrapperPass>(); + + OS << "Printing analysis '" << P.getPassName() << "' for function '" + << F.getName() << "':\n"; + P.print(OS); + + return false; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + FunctionPass::getAnalysisUsage(AU); + AU.addRequired<DependenceInfoWrapperPass>(); + AU.setPreservesAll(); + } + +private: + llvm::raw_ostream &OS; +}; + +char DependenceInfoPrinterLegacyFunctionPass::ID = 0; +} // namespace + +Pass *polly::createDependenceInfoPrinterLegacyFunctionPass(raw_ostream &OS) { + return new DependenceInfoPrinterLegacyFunctionPass(OS); +} + +INITIALIZE_PASS_BEGIN( + DependenceInfoPrinterLegacyFunctionPass, "polly-print-function-dependences", + "Polly - Print dependences for all the SCoPs of a function", false, false); +INITIALIZE_PASS_DEPENDENCY(DependenceInfoWrapperPass); +INITIALIZE_PASS_END(DependenceInfoPrinterLegacyFunctionPass, + "polly-print-function-dependences", + "Polly - Print dependences for all the SCoPs of a function", + false, false) diff --git a/polly/lib/Analysis/PruneUnprofitable.cpp b/polly/lib/Analysis/PruneUnprofitable.cpp index 40cc9178da0f3..f8469c03fe55b 100644 --- a/polly/lib/Analysis/PruneUnprofitable.cpp +++ b/polly/lib/Analysis/PruneUnprofitable.cpp @@ -55,9 +55,8 @@ static void updateStatistics(Scop &S, bool Pruned) { NumAffineLoops += ScopStats.NumAffineLoops; } } -} // namespace -bool polly::runPruneUnprofitable(Scop &S) { +static bool runPruneUnprofitable(Scop &S) { if (PollyProcessUnprofitable) { POLLY_DEBUG( dbgs() << "NOTE: -polly-process-unprofitable active, won't prune " @@ -80,6 +79,35 @@ bool polly::runPruneUnprofitable(Scop &S) { return false; } +class PruneUnprofitableWrapperPass final : public ScopPass { +public: + static char ID; + + explicit PruneUnprofitableWrapperPass() : ScopPass(ID) {} + PruneUnprofitableWrapperPass(const PruneUnprofitableWrapperPass &) = delete; + PruneUnprofitableWrapperPass & + operator=(const PruneUnprofitableWrapperPass &) = delete; + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<ScopInfoRegionPass>(); + AU.setPreservesAll(); + } + + bool runOnScop(Scop &S) override { return runPruneUnprofitable(S); } +}; +} // namespace + +char PruneUnprofitableWrapperPass::ID; + +Pass *polly::createPruneUnprofitableWrapperPass() { + return new PruneUnprofitableWrapperPass(); +} + +INITIALIZE_PASS_BEGIN(PruneUnprofitableWrapperPass, "polly-prune-unprofitable", + "Polly - Prune unprofitable SCoPs", false, false) +INITIALIZE_PASS_END(PruneUnprofitableWrapperPass, "polly-prune-unprofitable", + "Polly - Prune unprofitable SCoPs", false, false) + llvm::PreservedAnalyses PruneUnprofitablePass::run(Scop &S, ScopAnalysisManager &SAM, ScopStandardAnalysisResults &SAR, SPMUpdater &U) { diff --git a/polly/lib/Analysis/ScopBuilder.cpp b/polly/lib/Analysis/ScopBuilder.cpp index 60a1e00916750..67a4c43455809 100644 --- a/polly/lib/Analysis/ScopBuilder.cpp +++ b/polly/lib/Analysis/ScopBuilder.cpp @@ -56,7 +56,6 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" #include <cassert> -#include <deque> using namespace llvm; using namespace polly; 
diff --git a/polly/lib/Analysis/ScopDetection.cpp b/polly/lib/Analysis/ScopDetection.cpp index 29e89348125f2..43ed8636b054b 100644 --- a/polly/lib/Analysis/ScopDetection.cpp +++ b/polly/lib/Analysis/ScopDetection.cpp @@ -44,6 +44,7 @@ //===----------------------------------------------------------------------===// #include "polly/ScopDetection.h" +#include "polly/LinkAllPasses.h" #include "polly/Options.h" #include "polly/ScopDetectionDiagnostic.h" #include "polly/Support/SCEVValidator.h" @@ -74,6 +75,8 @@ #include "llvm/IR/Module.h" #include "llvm/IR/PassManager.h" #include "llvm/IR/Value.h" +#include "llvm/InitializePasses.h" +#include "llvm/Pass.h" #include "llvm/Support/Debug.h" #include "llvm/Support/Regex.h" #include "llvm/Support/raw_ostream.h" @@ -1980,12 +1983,53 @@ void ScopDetection::verifyAnalysis() { verifyRegion(*R); } +bool ScopDetectionWrapperPass::runOnFunction(Function &F) { + auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); + auto &RI = getAnalysis<RegionInfoPass>().getRegionInfo(); + auto &AA = getAnalysis<AAResultsWrapperPass>().getAAResults(); + auto &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE(); + auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); + auto &ORE = getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); + + Result = std::make_unique<ScopDetection>(DT, SE, LI, RI, AA, ORE); + Result->detect(F); + return false; +} + +void ScopDetectionWrapperPass::getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired<LoopInfoWrapperPass>(); + AU.addRequiredTransitive<ScalarEvolutionWrapperPass>(); + AU.addRequired<DominatorTreeWrapperPass>(); + AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); + // We also need AA and RegionInfo when we are verifying analysis. + AU.addRequiredTransitive<AAResultsWrapperPass>(); + AU.addRequiredTransitive<RegionInfoPass>(); + AU.setPreservesAll(); +} + +void ScopDetectionWrapperPass::print(raw_ostream &OS, const Module *) const { + for (const Region *R : Result->ValidRegions) + OS << "Valid Region for Scop: " << R->getNameStr() << '\n'; + + OS << "\n"; +} + +ScopDetectionWrapperPass::ScopDetectionWrapperPass() : FunctionPass(ID) { + // Disable runtime alias checks if we ignore aliasing all together. + if (IgnoreAliasing) + PollyUseRuntimeAliasChecks = false; +} + ScopAnalysis::ScopAnalysis() { // Disable runtime alias checks if we ignore aliasing all together. 
if (IgnoreAliasing) PollyUseRuntimeAliasChecks = false; } +void ScopDetectionWrapperPass::releaseMemory() { Result.reset(); } + +char ScopDetectionWrapperPass::ID; + AnalysisKey ScopAnalysis::Key; ScopDetection ScopAnalysis::run(Function &F, FunctionAnalysisManager &FAM) { @@ -2011,3 +2055,66 @@ PreservedAnalyses ScopAnalysisPrinterPass::run(Function &F, OS << "\n"; return PreservedAnalyses::all(); } + +Pass *polly::createScopDetectionWrapperPassPass() { + return new ScopDetectionWrapperPass(); +} + +INITIALIZE_PASS_BEGIN(ScopDetectionWrapperPass, "polly-detect", + "Polly - Detect static control parts (SCoPs)", false, + false); +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass); +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass); +INITIALIZE_PASS_DEPENDENCY(RegionInfoPass); +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass); +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass); +INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass); +INITIALIZE_PASS_END(ScopDetectionWrapperPass, "polly-detect", + "Polly - Detect static control parts (SCoPs)", false, false) + +//===----------------------------------------------------------------------===// + +namespace { +/// Print result from ScopDetectionWrapperPass. +class ScopDetectionPrinterLegacyPass final : public FunctionPass { +public: + static char ID; + + ScopDetectionPrinterLegacyPass() : ScopDetectionPrinterLegacyPass(outs()) {} + + explicit ScopDetectionPrinterLegacyPass(llvm::raw_ostream &OS) + : FunctionPass(ID), OS(OS) {} + + bool runOnFunction(Function &F) override { + ScopDetectionWrapperPass &P = getAnalysis<ScopDetectionWrapperPass>(); + + OS << "Printing analysis '" << P.getPassName() << "' for function '" + << F.getName() << "':\n"; + P.print(OS); + + return false; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + FunctionPass::getAnalysisUsage(AU); + AU.addRequired<ScopDetectionWrapperPass>(); + AU.setPreservesAll(); + } + +private: + llvm::raw_ostream &OS; +}; + +char ScopDetectionPrinterLegacyPass::ID = 0; +} // namespace + +Pass *polly::createScopDetectionPrinterLegacyPass(raw_ostream &OS) { + return new ScopDetectionPrinterLegacyPass(OS); +} + +INITIALIZE_PASS_BEGIN(ScopDetectionPrinterLegacyPass, "polly-print-detect", + "Polly - Print static control parts (SCoPs)", false, + false); +INITIALIZE_PASS_DEPENDENCY(ScopDetectionWrapperPass); +INITIALIZE_PASS_END(ScopDetectionPrinterLegacyPass, "polly-print-detect", + "Polly - Print static control parts (SCoPs)", false, false) diff --git a/polly/lib/Analysis/ScopGraphPrinter.cpp b/polly/lib/Analysis/ScopGraphPrinter.cpp index 29e212882cefe..eb6c995f0bb91 100644 --- a/polly/lib/Analysis/ScopGraphPrinter.cpp +++ b/polly/lib/Analysis/ScopGraphPrinter.cpp @@ -14,26 +14,20 @@ //===----------------------------------------------------------------------===// #include "polly/ScopGraphPrinter.h" +#include "polly/LinkAllPasses.h" #include "polly/ScopDetection.h" #include "llvm/Support/CommandLine.h" using namespace polly; using namespace llvm; +static cl::opt<std::string> + ViewFilter("polly-view-only", + cl::desc("Only view functions that match this pattern"), + cl::Hidden, cl::init("")); -namespace polly { -std::string ViewFilter; -bool ViewAll; -} // namespace polly - -static cl::opt<std::string, true> - XViewFilter("polly-view-only", - cl::desc("Only view functions that match this pattern"), - cl::location(ViewFilter), cl::Hidden, cl::init("")); - -static cl::opt<bool, true> - XViewAll("polly-view-all", - cl::desc("Also show functions without any scops"), - 
cl::location(ViewAll), cl::Hidden, cl::init(false)); +static cl::opt<bool> ViewAll("polly-view-all", + cl::desc("Also show functions without any scops"), + cl::Hidden, cl::init(false)); namespace llvm { @@ -140,6 +134,104 @@ void DOTGraphTraits<ScopDetection *>::addCustomGraphFeatures( } // namespace llvm +struct ScopDetectionAnalysisGraphTraits { + static ScopDetection *getGraph(ScopDetectionWrapperPass *Analysis) { + return &Analysis->getSD(); + } +}; + +struct ScopViewerWrapperPass + : DOTGraphTraitsViewerWrapperPass<ScopDetectionWrapperPass, false, + ScopDetection *, + ScopDetectionAnalysisGraphTraits> { + static char ID; + ScopViewerWrapperPass() + : DOTGraphTraitsViewerWrapperPass<ScopDetectionWrapperPass, false, + ScopDetection *, + ScopDetectionAnalysisGraphTraits>( + "scops", ID) {} + bool processFunction(Function &F, ScopDetectionWrapperPass &SD) override { + if (ViewFilter != "" && !F.getName().count(ViewFilter)) + return false; + + if (ViewAll) + return true; + + // Check that at least one scop was detected. + return std::distance(SD.getSD().begin(), SD.getSD().end()) > 0; + } +}; +char ScopViewerWrapperPass::ID = 0; + +struct ScopOnlyViewerWrapperPass + : DOTGraphTraitsViewerWrapperPass<ScopDetectionWrapperPass, false, + ScopDetection *, + ScopDetectionAnalysisGraphTraits> { + static char ID; + ScopOnlyViewerWrapperPass() + : DOTGraphTraitsViewerWrapperPass<ScopDetectionWrapperPass, false, + ScopDetection *, + ScopDetectionAnalysisGraphTraits>( + "scopsonly", ID) {} +}; +char ScopOnlyViewerWrapperPass::ID = 0; + +struct ScopPrinterWrapperPass + : DOTGraphTraitsPrinterWrapperPass<ScopDetectionWrapperPass, false, + ScopDetection *, + ScopDetectionAnalysisGraphTraits> { + static char ID; + ScopPrinterWrapperPass() + : DOTGraphTraitsPrinterWrapperPass<ScopDetectionWrapperPass, false, + ScopDetection *, + ScopDetectionAnalysisGraphTraits>( + "scops", ID) {} +}; +char ScopPrinterWrapperPass::ID = 0; + +struct ScopOnlyPrinterWrapperPass + : DOTGraphTraitsPrinterWrapperPass<ScopDetectionWrapperPass, true, + ScopDetection *, + ScopDetectionAnalysisGraphTraits> { + static char ID; + ScopOnlyPrinterWrapperPass() + : DOTGraphTraitsPrinterWrapperPass<ScopDetectionWrapperPass, true, + ScopDetection *, + ScopDetectionAnalysisGraphTraits>( + "scopsonly", ID) {} +}; +char ScopOnlyPrinterWrapperPass::ID = 0; + +static RegisterPass<ScopViewerWrapperPass> X("view-scops", + "Polly - View Scops of function"); + +static RegisterPass<ScopOnlyViewerWrapperPass> + Y("view-scops-only", + "Polly - View Scops of function (with no function bodies)"); + +static RegisterPass<ScopPrinterWrapperPass> + M("dot-scops", "Polly - Print Scops of function"); + +static RegisterPass<ScopOnlyPrinterWrapperPass> + N("dot-scops-only", + "Polly - Print Scops of function (with no function bodies)"); + +Pass *polly::createDOTViewerWrapperPass() { + return new ScopViewerWrapperPass(); +} + +Pass *polly::createDOTOnlyViewerWrapperPass() { + return new ScopOnlyViewerWrapperPass(); +} + +Pass *polly::createDOTPrinterWrapperPass() { + return new ScopPrinterWrapperPass(); +} + +Pass *polly::createDOTOnlyPrinterWrapperPass() { + return new ScopOnlyPrinterWrapperPass(); +} + bool ScopViewer::processFunction(Function &F, const ScopDetection &SD) { if (ViewFilter != "" && !F.getName().count(ViewFilter)) return false; diff --git a/polly/lib/Analysis/ScopInfo.cpp b/polly/lib/Analysis/ScopInfo.cpp index 70e184d3f897f..8c6a2360a249b 100644 --- a/polly/lib/Analysis/ScopInfo.cpp +++ b/polly/lib/Analysis/ScopInfo.cpp @@ -17,6 +17,7 @@ 
//===----------------------------------------------------------------------===// #include "polly/ScopInfo.h" +#include "polly/LinkAllPasses.h" #include "polly/Options.h" #include "polly/ScopBuilder.h" #include "polly/ScopDetection.h" @@ -56,6 +57,7 @@ #include "llvm/IR/PassManager.h" #include "llvm/IR/Type.h" #include "llvm/IR/Value.h" +#include "llvm/InitializePasses.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" @@ -2542,6 +2544,19 @@ raw_ostream &polly::operator<<(raw_ostream &OS, const Scop &scop) { return OS; } +//===----------------------------------------------------------------------===// +void ScopInfoRegionPass::getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired<LoopInfoWrapperPass>(); + AU.addRequired<RegionInfoPass>(); + AU.addRequired<DominatorTreeWrapperPass>(); + AU.addRequiredTransitive<ScalarEvolutionWrapperPass>(); + AU.addRequiredTransitive<ScopDetectionWrapperPass>(); + AU.addRequired<AAResultsWrapperPass>(); + AU.addRequired<AssumptionCacheTracker>(); + AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); + AU.setPreservesAll(); +} + void updateLoopCountStatistic(ScopDetection::LoopStats Stats, Scop::ScopStatistics ScopStats) { assert(Stats.NumLoops == ScopStats.NumAffineLoops + ScopStats.NumBoxedLoops); @@ -2577,6 +2592,112 @@ void updateLoopCountStatistic(ScopDetection::LoopStats Stats, NumSingletonWritesInLoops += ScopStats.NumSingletonWritesInLoops; } +bool ScopInfoRegionPass::runOnRegion(Region *R, RGPassManager &RGM) { + auto &SD = getAnalysis<ScopDetectionWrapperPass>().getSD(); + + if (!SD.isMaxRegionInScop(*R)) + return false; + + Function *F = R->getEntry()->getParent(); + auto &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE(); + auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); + auto &AA = getAnalysis<AAResultsWrapperPass>().getAAResults(); + auto const &DL = F->getParent()->getDataLayout(); + auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); + auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(*F); + auto &ORE = getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); + + ScopBuilder SB(R, AC, AA, DL, DT, LI, SD, SE, ORE); + S = SB.getScop(); // take ownership of scop object + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_STATS) + if (S) { + ScopDetection::LoopStats Stats = + ScopDetection::countBeneficialLoops(&S->getRegion(), SE, LI, 0); + updateLoopCountStatistic(Stats, S->getStatistics()); + } +#endif + + return false; +} + +void ScopInfoRegionPass::print(raw_ostream &OS, const Module *) const { + if (S) + S->print(OS, PollyPrintInstructions); + else + OS << "Invalid Scop!\n"; +} + +char ScopInfoRegionPass::ID = 0; + +Pass *polly::createScopInfoRegionPassPass() { return new ScopInfoRegionPass(); } + +INITIALIZE_PASS_BEGIN(ScopInfoRegionPass, "polly-scops", + "Polly - Create polyhedral description of Scops", false, + false); +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass); +INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker); +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass); +INITIALIZE_PASS_DEPENDENCY(RegionInfoPass); +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass); +INITIALIZE_PASS_DEPENDENCY(ScopDetectionWrapperPass); +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass); +INITIALIZE_PASS_END(ScopInfoRegionPass, "polly-scops", + "Polly - Create polyhedral description of Scops", false, + false) + +//===----------------------------------------------------------------------===// + +namespace { + +/// Print result from 
ScopInfoRegionPass. +class ScopInfoPrinterLegacyRegionPass final : public RegionPass { +public: + static char ID; + + ScopInfoPrinterLegacyRegionPass() : ScopInfoPrinterLegacyRegionPass(outs()) {} + + explicit ScopInfoPrinterLegacyRegionPass(llvm::raw_ostream &OS) + : RegionPass(ID), OS(OS) {} + + bool runOnRegion(Region *R, RGPassManager &RGM) override { + ScopInfoRegionPass &P = getAnalysis<ScopInfoRegionPass>(); + + OS << "Printing analysis '" << P.getPassName() << "' for region: '" + << R->getNameStr() << "' in function '" + << R->getEntry()->getParent()->getName() << "':\n"; + P.print(OS); + + return false; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + RegionPass::getAnalysisUsage(AU); + AU.addRequired<ScopInfoRegionPass>(); + AU.setPreservesAll(); + } + +private: + llvm::raw_ostream &OS; +}; + +char ScopInfoPrinterLegacyRegionPass::ID = 0; +} // namespace + +Pass *polly::createScopInfoPrinterLegacyRegionPass(raw_ostream &OS) { + return new ScopInfoPrinterLegacyRegionPass(OS); +} + +INITIALIZE_PASS_BEGIN(ScopInfoPrinterLegacyRegionPass, "polly-print-scops", + "Polly - Print polyhedral description of Scops", false, + false); +INITIALIZE_PASS_DEPENDENCY(ScopInfoRegionPass); +INITIALIZE_PASS_END(ScopInfoPrinterLegacyRegionPass, "polly-print-scops", + "Polly - Print polyhedral description of Scops", false, + false) + +//===----------------------------------------------------------------------===// + ScopInfo::ScopInfo(const DataLayout &DL, ScopDetection &SD, ScalarEvolution &SE, LoopInfo &LI, AliasAnalysis &AA, DominatorTree &DT, AssumptionCache &AC, OptimizationRemarkEmitter &ORE) @@ -2650,3 +2771,110 @@ PreservedAnalyses ScopInfoPrinterPass::run(Function &F, } return PreservedAnalyses::all(); } + +void ScopInfoWrapperPass::getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired<LoopInfoWrapperPass>(); + AU.addRequired<RegionInfoPass>(); + AU.addRequired<DominatorTreeWrapperPass>(); + AU.addRequiredTransitive<ScalarEvolutionWrapperPass>(); + AU.addRequiredTransitive<ScopDetectionWrapperPass>(); + AU.addRequired<AAResultsWrapperPass>(); + AU.addRequired<AssumptionCacheTracker>(); + AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); + AU.setPreservesAll(); +} + +bool ScopInfoWrapperPass::runOnFunction(Function &F) { + auto &SD = getAnalysis<ScopDetectionWrapperPass>().getSD(); + auto &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE(); + auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); + auto &AA = getAnalysis<AAResultsWrapperPass>().getAAResults(); + auto const &DL = F.getParent()->getDataLayout(); + auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); + auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); + auto &ORE = getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); + + Result.reset(new ScopInfo{DL, SD, SE, LI, AA, DT, AC, ORE}); + return false; +} + +void ScopInfoWrapperPass::print(raw_ostream &OS, const Module *) const { + for (auto &It : *Result) { + if (It.second) + It.second->print(OS, PollyPrintInstructions); + else + OS << "Invalid Scop!\n"; + } +} + +char ScopInfoWrapperPass::ID = 0; + +Pass *polly::createScopInfoWrapperPassPass() { + return new ScopInfoWrapperPass(); +} + +INITIALIZE_PASS_BEGIN( + ScopInfoWrapperPass, "polly-function-scops", + "Polly - Create polyhedral description of all Scops of a function", false, + false); +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass); +INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker); +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass); 
+INITIALIZE_PASS_DEPENDENCY(RegionInfoPass); +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass); +INITIALIZE_PASS_DEPENDENCY(ScopDetectionWrapperPass); +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass); +INITIALIZE_PASS_END( + ScopInfoWrapperPass, "polly-function-scops", + "Polly - Create polyhedral description of all Scops of a function", false, + false) + +//===----------------------------------------------------------------------===// + +namespace { +/// Print result from ScopInfoWrapperPass. +class ScopInfoPrinterLegacyFunctionPass final : public FunctionPass { +public: + static char ID; + + ScopInfoPrinterLegacyFunctionPass() + : ScopInfoPrinterLegacyFunctionPass(outs()) {} + explicit ScopInfoPrinterLegacyFunctionPass(llvm::raw_ostream &OS) + : FunctionPass(ID), OS(OS) {} + + bool runOnFunction(Function &F) override { + ScopInfoWrapperPass &P = getAnalysis<ScopInfoWrapperPass>(); + + OS << "Printing analysis '" << P.getPassName() << "' for function '" + << F.getName() << "':\n"; + P.print(OS); + + return false; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + FunctionPass::getAnalysisUsage(AU); + AU.addRequired<ScopInfoWrapperPass>(); + AU.setPreservesAll(); + } + +private: + llvm::raw_ostream &OS; +}; + +char ScopInfoPrinterLegacyFunctionPass::ID = 0; +} // namespace + +Pass *polly::createScopInfoPrinterLegacyFunctionPass(raw_ostream &OS) { + return new ScopInfoPrinterLegacyFunctionPass(OS); +} + +INITIALIZE_PASS_BEGIN( + ScopInfoPrinterLegacyFunctionPass, "polly-print-function-scops", + "Polly - Print polyhedral description of all Scops of a function", false, + false); +INITIALIZE_PASS_DEPENDENCY(ScopInfoWrapperPass); +INITIALIZE_PASS_END( + ScopInfoPrinterLegacyFunctionPass, "polly-print-function-scops", + "Polly - Print polyhedral description of all Scops of a function", false, + false) diff --git a/polly/lib/Analysis/ScopPass.cpp b/polly/lib/Analysis/ScopPass.cpp index 61417e799cfa5..719cd0f6984e0 100644 --- a/polly/lib/Analysis/ScopPass.cpp +++ b/polly/lib/Analysis/ScopPass.cpp @@ -24,6 +24,42 @@ using namespace llvm; using namespace polly; +bool ScopPass::runOnRegion(Region *R, RGPassManager &RGM) { + S = nullptr; + + if (skipRegion(*R)) + return false; + + if ((S = getAnalysis<ScopInfoRegionPass>().getScop())) + return runOnScop(*S); + + return false; +} + +void ScopPass::print(raw_ostream &OS, const Module *M) const { + if (S) + printScop(OS, *S); +} + +void ScopPass::getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired<ScopInfoRegionPass>(); + + AU.addPreserved<AAResultsWrapperPass>(); + AU.addPreserved<BasicAAWrapperPass>(); + AU.addPreserved<LoopInfoWrapperPass>(); + AU.addPreserved<DominatorTreeWrapperPass>(); + AU.addPreserved<GlobalsAAWrapperPass>(); + AU.addPreserved<ScopDetectionWrapperPass>(); + AU.addPreserved<ScalarEvolutionWrapperPass>(); + AU.addPreserved<SCEVAAWrapperPass>(); + AU.addPreserved<OptimizationRemarkEmitterWrapperPass>(); + AU.addPreserved<LazyBlockFrequencyInfoPass>(); + AU.addPreserved<LazyBranchProbabilityInfoPass>(); + AU.addPreserved<RegionInfoPass>(); + AU.addPreserved<ScopInfoRegionPass>(); + AU.addPreserved<TargetTransformInfoWrapperPass>(); +} + namespace polly { template class OwningInnerAnalysisManagerProxy<ScopAnalysisManager, Function>; } diff --git a/polly/lib/CMakeLists.txt b/polly/lib/CMakeLists.txt index e4f196f151c9e..0ed673815ff34 100644 --- a/polly/lib/CMakeLists.txt +++ b/polly/lib/CMakeLists.txt @@ -60,9 +60,6 @@ add_llvm_pass_plugin(Polly CodeGen/RuntimeDebugBuilder.cpp 
   CodeGen/PerfMonitor.cpp
   Exchange/JSONExporter.cpp
-  Pass/PhaseManager.cpp
-  Pass/PollyFunctionPass.cpp
-  Pass/PollyModulePass.cpp
   Support/GICHelper.cpp
   Support/PollyDebug.cpp
   Support/SCEVAffinator.cpp
diff --git a/polly/lib/CodeGen/CodeGeneration.cpp b/polly/lib/CodeGen/CodeGeneration.cpp
index f2d5a3422849e..2d8b393cc039c 100644
--- a/polly/lib/CodeGen/CodeGeneration.cpp
+++ b/polly/lib/CodeGen/CodeGeneration.cpp
@@ -25,6 +25,7 @@
 #include "polly/CodeGen/PerfMonitor.h"
 #include "polly/CodeGen/Utils.h"
 #include "polly/DependenceInfo.h"
+#include "polly/LinkAllPasses.h"
 #include "polly/Options.h"
 #include "polly/ScopInfo.h"
 #include "polly/Support/ScopHelper.h"
@@ -36,6 +37,7 @@
 #include "llvm/IR/Function.h"
 #include "llvm/IR/PassManager.h"
 #include "llvm/IR/Verifier.h"
+#include "llvm/InitializePasses.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
@@ -312,6 +314,59 @@ static bool generateCode(Scop &S, IslAstInfo &AI, LoopInfo &LI,
   return true;
 }
 
+namespace {
+
+class CodeGeneration final : public ScopPass {
+public:
+  static char ID;
+
+  /// The data layout used.
+  const DataLayout *DL;
+
+  /// @name The analysis passes we need to generate code.
+  ///
+  ///{
+  LoopInfo *LI;
+  IslAstInfo *AI;
+  DominatorTree *DT;
+  ScalarEvolution *SE;
+  RegionInfo *RI;
+  ///}
+
+  CodeGeneration() : ScopPass(ID) {}
+
+  /// Generate LLVM-IR for the SCoP @p S.
+  bool runOnScop(Scop &S) override {
+    AI = &getAnalysis<IslAstInfoWrapperPass>().getAI();
+    LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+    DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+    SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+    DL = &S.getFunction().getDataLayout();
+    RI = &getAnalysis<RegionInfoPass>().getRegionInfo();
+    return generateCode(S, *AI, *LI, *DT, *SE, *RI);
+  }
+
+  /// Register all analyses and transformations required.
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    ScopPass::getAnalysisUsage(AU);
+
+    AU.addRequired<DominatorTreeWrapperPass>();
+    AU.addRequired<IslAstInfoWrapperPass>();
+    AU.addRequired<RegionInfoPass>();
+    AU.addRequired<ScalarEvolutionWrapperPass>();
+    AU.addRequired<ScopDetectionWrapperPass>();
+    AU.addRequired<ScopInfoRegionPass>();
+    AU.addRequired<LoopInfoWrapperPass>();
+
+    AU.addPreserved<DependenceInfo>();
+    AU.addPreserved<IslAstInfoWrapperPass>();
+
+    // FIXME: We do not yet add regions for the newly generated code to the
+    // region tree.
+ } +}; +} // namespace + PreservedAnalyses CodeGenerationPass::run(Scop &S, ScopAnalysisManager &SAM, ScopStandardAnalysisResults &AR, SPMUpdater &U) { @@ -324,6 +379,17 @@ PreservedAnalyses CodeGenerationPass::run(Scop &S, ScopAnalysisManager &SAM, return PreservedAnalyses::all(); } -bool polly::runCodeGeneration(Scop &S, RegionInfo &RI, IslAstInfo &AI) { - return generateCode(S, AI, *S.getLI(), *S.getDT(), *S.getSE(), RI); -} +char CodeGeneration::ID = 1; + +Pass *polly::createCodeGenerationPass() { return new CodeGeneration(); } + +INITIALIZE_PASS_BEGIN(CodeGeneration, "polly-codegen", + "Polly - Create LLVM-IR from SCoPs", false, false); +INITIALIZE_PASS_DEPENDENCY(DependenceInfo); +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass); +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass); +INITIALIZE_PASS_DEPENDENCY(RegionInfoPass); +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass); +INITIALIZE_PASS_DEPENDENCY(ScopDetectionWrapperPass); +INITIALIZE_PASS_END(CodeGeneration, "polly-codegen", + "Polly - Create LLVM-IR from SCoPs", false, false) diff --git a/polly/lib/CodeGen/IslAst.cpp b/polly/lib/CodeGen/IslAst.cpp index 3177cda225f1d..09bacda196742 100644 --- a/polly/lib/CodeGen/IslAst.cpp +++ b/polly/lib/CodeGen/IslAst.cpp @@ -29,6 +29,7 @@ #include "polly/CodeGen/IslAst.h" #include "polly/CodeGen/CodeGeneration.h" #include "polly/DependenceInfo.h" +#include "polly/LinkAllPasses.h" #include "polly/Options.h" #include "polly/ScopDetection.h" #include "polly/ScopInfo.h" @@ -82,11 +83,6 @@ static cl::opt<bool> DetectParallel("polly-ast-detect-parallel", cl::desc("Detect parallelism"), cl::Hidden, cl::cat(PollyCategory)); -static cl::opt<bool> - PollyPrintAst("polly-print-ast", - cl::desc("Print the ISL abstract syntax tree"), - cl::cat(PollyCategory)); - STATISTIC(ScopsProcessed, "Number of SCoPs processed"); STATISTIC(ScopsBeneficial, "Number of beneficial SCoPs"); STATISTIC(BeneficialAffineLoops, "Number of beneficial affine loops"); @@ -780,19 +776,90 @@ PreservedAnalyses IslAstPrinterPass::run(Scop &S, ScopAnalysisManager &SAM, return PreservedAnalyses::all(); } -std::unique_ptr<IslAstInfo> -polly::runIslAstGen(Scop &S, DependenceAnalysis::Result &DA) { - auto GetDeps = [&](Dependences::AnalysisLevel Lvl) -> const Dependences & { - return DA.getDependences(Lvl); +void IslAstInfoWrapperPass::releaseMemory() { Ast.reset(); } + +bool IslAstInfoWrapperPass::runOnScop(Scop &Scop) { + auto GetDeps = [this](Dependences::AnalysisLevel Lvl) -> const Dependences & { + return getAnalysis<DependenceInfo>().getDependences(Lvl); }; - std::unique_ptr<IslAstInfo> Result = runIslAst(S, GetDeps); - if (PollyPrintAst) { - outs() << "Printing analysis 'Polly - Generate an AST of the SCoP (isl)'" - << S.getName() << "' in function '" << S.getFunction().getName() - << "':\n"; - if (Result) - Result->print(llvm::outs()); + Ast = runIslAst(Scop, GetDeps); + + return false; +} + +void IslAstInfoWrapperPass::getAnalysisUsage(AnalysisUsage &AU) const { + // Get the Common analysis usage of ScopPasses. 
+ ScopPass::getAnalysisUsage(AU); + AU.addRequiredTransitive<ScopInfoRegionPass>(); + AU.addRequired<DependenceInfo>(); + + AU.addPreserved<DependenceInfo>(); +} + +void IslAstInfoWrapperPass::printScop(raw_ostream &OS, Scop &S) const { + OS << "Printing analysis 'Polly - Generate an AST of the SCoP (isl)'" + << S.getName() << "' in function '" << S.getFunction().getName() << "':\n"; + if (Ast) + Ast->print(OS); +} + +char IslAstInfoWrapperPass::ID = 0; + +Pass *polly::createIslAstInfoWrapperPassPass() { + return new IslAstInfoWrapperPass(); +} + +INITIALIZE_PASS_BEGIN(IslAstInfoWrapperPass, "polly-ast", + "Polly - Generate an AST of the SCoP (isl)", false, + false); +INITIALIZE_PASS_DEPENDENCY(ScopInfoRegionPass); +INITIALIZE_PASS_DEPENDENCY(DependenceInfo); +INITIALIZE_PASS_END(IslAstInfoWrapperPass, "polly-ast", + "Polly - Generate an AST from the SCoP (isl)", false, false) + +//===----------------------------------------------------------------------===// + +namespace { +/// Print result from IslAstInfoWrapperPass. +class IslAstInfoPrinterLegacyPass final : public ScopPass { +public: + static char ID; + + IslAstInfoPrinterLegacyPass() : IslAstInfoPrinterLegacyPass(outs()) {} + explicit IslAstInfoPrinterLegacyPass(llvm::raw_ostream &OS) + : ScopPass(ID), OS(OS) {} + + bool runOnScop(Scop &S) override { + IslAstInfoWrapperPass &P = getAnalysis<IslAstInfoWrapperPass>(); + + OS << "Printing analysis '" << P.getPassName() << "' for region: '" + << S.getRegion().getNameStr() << "' in function '" + << S.getFunction().getName() << "':\n"; + P.printScop(OS, S); + + return false; } - return Result; + + void getAnalysisUsage(AnalysisUsage &AU) const override { + ScopPass::getAnalysisUsage(AU); + AU.addRequired<IslAstInfoWrapperPass>(); + AU.setPreservesAll(); + } + +private: + llvm::raw_ostream &OS; +}; + +char IslAstInfoPrinterLegacyPass::ID = 0; +} // namespace + +Pass *polly::createIslAstInfoPrinterLegacyPass(raw_ostream &OS) { + return new IslAstInfoPrinterLegacyPass(OS); } + +INITIALIZE_PASS_BEGIN(IslAstInfoPrinterLegacyPass, "polly-print-ast", + "Polly - Print the AST from a SCoP (isl)", false, false); +INITIALIZE_PASS_DEPENDENCY(IslAstInfoWrapperPass); +INITIALIZE_PASS_END(IslAstInfoPrinterLegacyPass, "polly-print-ast", + "Polly - Print the AST from a SCoP (isl)", false, false) diff --git a/polly/lib/Exchange/JSONExporter.cpp b/polly/lib/Exchange/JSONExporter.cpp index 7d30c030aa6e1..dfd63146edb5e 100644 --- a/polly/lib/Exchange/JSONExporter.cpp +++ b/polly/lib/Exchange/JSONExporter.cpp @@ -12,6 +12,7 @@ #include "polly/JSONExporter.h" #include "polly/DependenceInfo.h" +#include "polly/LinkAllPasses.h" #include "polly/Options.h" #include "polly/ScopInfo.h" #include "polly/ScopPass.h" @@ -35,11 +36,6 @@ using namespace polly; #define DEBUG_TYPE "polly-import-jscop" -static cl::opt<bool> - PollyPrintImportJscop("polly-print-import-jscop", - cl::desc("Polly - Print Scop import result"), - cl::cat(PollyCategory)); - STATISTIC(NewAccessMapFound, "Number of updated access functions"); namespace { @@ -54,6 +50,36 @@ static cl::opt<std::string> cl::desc("Postfix to append to the import .jsop files."), cl::Hidden, cl::value_desc("File postfix"), cl::ValueRequired, cl::init(""), cl::cat(PollyCategory)); + +class JSONExporter : public ScopPass { +public: + static char ID; + explicit JSONExporter() : ScopPass(ID) {} + + /// Export the SCoP @p S to a JSON file. + bool runOnScop(Scop &S) override; + + /// Print the SCoP @p S as it is exported. 
+  void printScop(raw_ostream &OS, Scop &S) const override;
+
+  /// Register all analyses and transformations required.
+  void getAnalysisUsage(AnalysisUsage &AU) const override;
+};
+
+class JSONImporter : public ScopPass {
+public:
+  static char ID;
+  std::vector<std::string> NewAccessStrings;
+  explicit JSONImporter() : ScopPass(ID) {}
+  /// Import new access functions for SCoP @p S from a JSON file.
+  bool runOnScop(Scop &S) override;
+
+  /// Print the SCoP @p S and the imported access functions.
+  void printScop(raw_ostream &OS, Scop &S) const override;
+
+  /// Register all analyses and transformations required.
+  void getAnalysisUsage(AnalysisUsage &AU) const override;
+};
 } // namespace
 
 static std::string getFileName(Scop &S, StringRef Suffix = "") {
@@ -716,6 +742,21 @@ static bool importScop(Scop &S, const Dependences &D, const DataLayout &DL,
   return true;
 }
 
+char JSONExporter::ID = 0;
+void JSONExporter::printScop(raw_ostream &OS, Scop &S) const { OS << S; }
+
+bool JSONExporter::runOnScop(Scop &S) {
+  exportScop(S);
+  return false;
+}
+
+void JSONExporter::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.setPreservesAll();
+  AU.addRequired<ScopInfoRegionPass>();
+}
+
+Pass *polly::createJSONExporterPass() { return new JSONExporter(); }
+
 PreservedAnalyses JSONExportPass::run(Scop &S, ScopAnalysisManager &SAM,
                                       ScopStandardAnalysisResults &SAR,
                                       SPMUpdater &) {
@@ -723,6 +764,37 @@ PreservedAnalyses JSONExportPass::run(Scop &S, ScopAnalysisManager &SAM,
   return PreservedAnalyses::all();
 }
 
+char JSONImporter::ID = 0;
+
+void JSONImporter::printScop(raw_ostream &OS, Scop &S) const {
+  OS << S;
+  for (std::vector<std::string>::const_iterator I = NewAccessStrings.begin(),
+                                                E = NewAccessStrings.end();
+       I != E; I++)
+    OS << "New access function '" << *I << "' detected in JSCOP file\n";
+}
+
+bool JSONImporter::runOnScop(Scop &S) {
+  const Dependences &D =
+      getAnalysis<DependenceInfo>().getDependences(Dependences::AL_Statement);
+  const DataLayout &DL = S.getFunction().getParent()->getDataLayout();
+
+  if (!importScop(S, D, DL, &NewAccessStrings))
+    report_fatal_error("Tried to import a malformed jscop file.");
+
+  return false;
+}
+
+void JSONImporter::getAnalysisUsage(AnalysisUsage &AU) const {
+  ScopPass::getAnalysisUsage(AU);
+  AU.addRequired<DependenceInfo>();
+
+  // TODO: JSONImporter should throw away DependenceInfo.
+ AU.addPreserved<DependenceInfo>(); +} + +Pass *polly::createJSONImporterPass() { return new JSONImporter(); } + PreservedAnalyses JSONImportPass::run(Scop &S, ScopAnalysisManager &SAM, ScopStandardAnalysisResults &SAR, SPMUpdater &) { @@ -742,24 +814,68 @@ PreservedAnalyses JSONImportPass::run(Scop &S, ScopAnalysisManager &SAM, return PA; } -void polly::runImportJSON(Scop &S, DependenceAnalysis::Result &DA) { - const Dependences &D = DA.getDependences(Dependences::AL_Statement); - const DataLayout &DL = S.getFunction().getParent()->getDataLayout(); - std::vector<std::string> NewAccessStrings; - if (!importScop(S, D, DL, &NewAccessStrings)) - report_fatal_error("Tried to import a malformed jscop file."); +INITIALIZE_PASS_BEGIN(JSONExporter, "polly-export-jscop", + "Polly - Export Scops as JSON" + " (Writes a .jscop file for each Scop)", + false, false); +INITIALIZE_PASS_DEPENDENCY(DependenceInfo) +INITIALIZE_PASS_END(JSONExporter, "polly-export-jscop", + "Polly - Export Scops as JSON" + " (Writes a .jscop file for each Scop)", + false, false) + +INITIALIZE_PASS_BEGIN(JSONImporter, "polly-import-jscop", + "Polly - Import Scops from JSON" + " (Reads a .jscop file for each Scop)", + false, false); +INITIALIZE_PASS_DEPENDENCY(DependenceInfo) +INITIALIZE_PASS_END(JSONImporter, "polly-import-jscop", + "Polly - Import Scops from JSON" + " (Reads a .jscop file for each Scop)", + false, false) + +//===----------------------------------------------------------------------===// - if (PollyPrintImportJscop) { - outs() - << "Printing analysis 'Polly - Print Scop import result' for region: '" - << S.getRegion().getNameStr() << "' in function '" - << S.getFunction().getName() << "':\n"; - outs() << S; - for (std::vector<std::string>::const_iterator I = NewAccessStrings.begin(), - E = NewAccessStrings.end(); - I != E; I++) - outs() << "New access function '" << *I << "' detected in JSCOP file\n"; +namespace { +/// Print result from JSONImporter. 
+class JSONImporterPrinterLegacyPass final : public ScopPass { +public: + static char ID; + + JSONImporterPrinterLegacyPass() : JSONImporterPrinterLegacyPass(outs()) {} + explicit JSONImporterPrinterLegacyPass(llvm::raw_ostream &OS) + : ScopPass(ID), OS(OS) {} + + bool runOnScop(Scop &S) override { + JSONImporter &P = getAnalysis<JSONImporter>(); + + OS << "Printing analysis '" << P.getPassName() << "' for region: '" + << S.getRegion().getNameStr() << "' in function '" + << S.getFunction().getName() << "':\n"; + P.printScop(OS, S); + + return false; } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + ScopPass::getAnalysisUsage(AU); + AU.addRequired<JSONImporter>(); + AU.setPreservesAll(); + } + +private: + llvm::raw_ostream &OS; +}; + +char JSONImporterPrinterLegacyPass::ID = 0; +} // namespace + +Pass *polly::createJSONImporterPrinterLegacyPass(llvm::raw_ostream &OS) { + return new JSONImporterPrinterLegacyPass(OS); } -void polly::runExportJSON(Scop &S) { exportScop(S); } +INITIALIZE_PASS_BEGIN(JSONImporterPrinterLegacyPass, "polly-print-import-jscop", + "Polly - Print Scop import result", false, false) +INITIALIZE_PASS_DEPENDENCY(JSONImporter) +INITIALIZE_PASS_END(JSONImporterPrinterLegacyPass, "polly-print-import-jscop", + "Polly - Print Scop import result", false, false) diff --git a/polly/lib/Pass/PhaseManager.cpp b/polly/lib/Pass/PhaseManager.cpp deleted file mode 100644 index fb76c811859b8..0000000000000 --- a/polly/lib/Pass/PhaseManager.cpp +++ /dev/null @@ -1,424 +0,0 @@ -//===------ PhaseManager.cpp ------------------------------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "polly/Pass/PhaseManager.h" -#include "polly/CodeGen/CodeGeneration.h" -#include "polly/CodeGen/IslAst.h" -#include "polly/CodePreparation.h" -#include "polly/DeLICM.h" -#include "polly/DeadCodeElimination.h" -#include "polly/DependenceInfo.h" -#include "polly/FlattenSchedule.h" -#include "polly/ForwardOpTree.h" -#include "polly/JSONExporter.h" -#include "polly/MaximalStaticExpansion.h" -#include "polly/PruneUnprofitable.h" -#include "polly/ScheduleOptimizer.h" -#include "polly/ScopDetection.h" -#include "polly/ScopDetectionDiagnostic.h" -#include "polly/ScopGraphPrinter.h" -#include "polly/ScopInfo.h" -#include "polly/Simplify.h" -#include "llvm/Analysis/AssumptionCache.h" -#include "llvm/Analysis/OptimizationRemarkEmitter.h" -#include "llvm/IR/Module.h" - -#define DEBUG_TYPE "polly-pass" - -using namespace polly; -using namespace llvm; - -namespace { - -/// Recurse through all subregions and all regions and add them to RQ. -static void addRegionIntoQueue(Region &R, SmallVector<Region *> &RQ) { - RQ.push_back(&R); - for (const auto &E : R) - addRegionIntoQueue(*E, RQ); -} - -/// The phase pipeline of Polly to be embedded into another pass manager than -/// runs passes on functions. -/// -/// Polly holds state besides LLVM-IR (RegionInfo and ScopInfo) between phases -/// that LLVM pass managers do not consider when scheduling analyses and passes. -/// That is, the ScopInfo must persist between phases that a pass manager must -/// not invalidate to recompute later. 
-class PhaseManager { -private: - Function &F; - FunctionAnalysisManager &FAM; - PollyPassOptions Opts; - -public: - PhaseManager(Function &F, FunctionAnalysisManager &FAM, PollyPassOptions Opts) - : F(F), FAM(FAM), Opts(std::move(Opts)) {} - - /// Execute Polly's phases as indicated by the options. - bool run() { - // Get analyses from the function pass manager. - // These must be preserved during all phases so that if processing one SCoP - // has finished, the next SCoP can still use them. Recomputing is not an - // option because ScopDetection stores references to the old results. - // TODO: CodePreparation doesn't actually need these analysis, it just keeps - // them up-to-date. If they are not computed yet, can also compute after the - // prepare phase. - LoopInfo &LI = FAM.getResult<LoopAnalysis>(F); - DominatorTree &DT = FAM.getResult<DominatorTreeAnalysis>(F); - bool ModifiedIR = false; - - // Phase: prepare - // TODO: Setting ModifiedIR will invalidate any analysis, even if DT, LI are - // preserved. - if (Opts.isPhaseEnabled(PassPhase::Prepare)) { - PreservedAnalyses PA = CodePreparationPass().run(F, FAM); - FAM.invalidate(F, PA); - if (!PA.areAllPreserved()) - ModifiedIR = true; - } - - // Can't do anything without detection - if (!Opts.isPhaseEnabled(PassPhase::Detection)) - return false; - - AAResults &AA = FAM.getResult<AAManager>(F); - ScalarEvolution &SE = FAM.getResult<ScalarEvolutionAnalysis>(F); - OptimizationRemarkEmitter &ORE = - FAM.getResult<OptimizationRemarkEmitterAnalysis>(F); - - // ScopDetection is modifying RegionInfo, do not cache it, nor use a cached - // version. - RegionInfo RI = RegionInfoAnalysis().run(F, FAM); - - // Phase: detection - ScopDetection SD(DT, SE, LI, RI, AA, ORE); - SD.detect(F); - if (Opts.isPhaseEnabled(PassPhase::PrintDetect)) { - outs() << "Detected Scops in Function " << F.getName() << "\n"; - for (const Region *R : SD.ValidRegions) - outs() << "Valid Region for Scop: " << R->getNameStr() << '\n'; - outs() << "\n"; - } - - if (Opts.isPhaseEnabled(PassPhase::DotScops)) - printGraphForFunction(F, &SD, "scops", false); - if (Opts.isPhaseEnabled(PassPhase::DotScopsOnly)) - printGraphForFunction(F, &SD, "scopsonly", true); - - auto ViewScops = [&](const char *Name, bool IsSimply) { - if (Opts.ViewFilter.empty() && !F.getName().count(Opts.ViewFilter)) - return; - - if (Opts.ViewAll || std::distance(SD.begin(), SD.end()) > 0) - viewGraphForFunction(F, &SD, Name, IsSimply); - }; - if (Opts.isPhaseEnabled(PassPhase::ViewScops)) - ViewScops("scops", false); - if (Opts.isPhaseEnabled(PassPhase::ViewScopsOnly)) - ViewScops("scopsonly", true); - - // Phase: scops - AssumptionCache &AC = FAM.getResult<AssumptionAnalysis>(F); - const DataLayout &DL = F.getParent()->getDataLayout(); - ScopInfo Info(DL, SD, SE, LI, AA, DT, AC, ORE); - if (Opts.isPhaseEnabled(PassPhase::PrintScopInfo)) { - if (Region *TLR = RI.getTopLevelRegion()) { - SmallVector<Region *> Regions; - addRegionIntoQueue(*TLR, Regions); - - // reverse iteration because the regression tests expect it. 
- for (Region *R : reverse(Regions)) { - Scop *S = Info.getScop(R); - outs() << "Printing analysis 'Polly - Create polyhedral " - "description of Scops' for region: '" - << R->getNameStr() << "' in function '" << F.getName() - << "':\n"; - if (S) - outs() << *S; - else - outs() << "Invalid Scop!\n"; - } - } - } - - SmallPriorityWorklist<Region *, 4> Worklist; - for (auto &[R, S] : Info) - if (S) - Worklist.insert(R); - - TargetTransformInfo &TTI = FAM.getResult<TargetIRAnalysis>(F); - while (!Worklist.empty()) { - Region *R = Worklist.pop_back_val(); - if (!SD.isMaxRegionInScop(*R, /*Verify=*/false)) - continue; - Scop *S = Info.getScop(R); - - // Phase: flatten - if (Opts.isPhaseEnabled(PassPhase::Flatten)) - runFlattenSchedulePass(*S); - - // Phase: deps - // Actual analysis runs on-demand, so it does not matter whether the phase - // is actually enabled, but use this location to print dependencies. - DependenceAnalysis::Result DA = runDependenceAnalysis(*S); - if (Opts.isPhaseEnabled(PassPhase::PrintDependences)) { - assert(Opts.isPhaseEnabled(PassPhase::Dependences)); - const Dependences &D = DA.getDependences(Opts.PrintDepsAnalysisLevel); - D.print(outs()); - } - - // Phase: import-jscop - if (Opts.isPhaseEnabled(PassPhase::ImportJScop)) - runImportJSON(*S, DA); - - // Phase: simplify-0 - bool ModifiedSinceSimplify = true; - if (Opts.isPhaseEnabled(PassPhase::Simplify0)) { - runSimplify(*S, 0); - ModifiedSinceSimplify = false; - } - - // Phase: optree - if (Opts.isPhaseEnabled(PassPhase::Optree)) { - bool ModifiedByOptree = runForwardOpTree(*S); - ModifiedSinceSimplify |= ModifiedByOptree; - } - - // Phase: delicm - if (Opts.isPhaseEnabled(PassPhase::DeLICM)) { - bool ModifiedByDelicm = runDeLICM(*S); - ModifiedSinceSimplify |= ModifiedByDelicm; - } - - // Phase: simplify-1 - // If we have already run simplify-0, do not re-run it if the SCoP has not - // changed since then. - if (ModifiedSinceSimplify && Opts.isPhaseEnabled(PassPhase::Simplify1)) { - runSimplify(*S, 1); - ModifiedSinceSimplify = false; - } - - // Phase: dce - if (Opts.isPhaseEnabled(PassPhase::DeadCodeElimination)) - runDeadCodeElim(*S, DA); - - // Phase: mse - if (Opts.isPhaseEnabled(PassPhase::MaximumStaticExtension)) - runMaximalStaticExpansion(*S, DA); - - // Phase: prune - if (Opts.isPhaseEnabled(PassPhase::PruneUnprofitable)) - runPruneUnprofitable(*S); - - // Phase: opt-isl - if (Opts.isPhaseEnabled(PassPhase::Optimization)) - runIslScheduleOptimizer(*S, &TTI, DA); - - // Phase: import-jscop - if (Opts.isPhaseEnabled(PassPhase::ExportJScop)) - runExportJSON(*S); - - // Phase: ast - // Cannot run codegen unless ast is enabled - if (!Opts.isPhaseEnabled(PassPhase::AstGen)) - continue; - std::unique_ptr<IslAstInfo> IslAst = runIslAstGen(*S, DA); - - // Phase: codegen - if (!Opts.isPhaseEnabled(PassPhase::CodeGen)) - continue; - bool ModifiedByCodeGen = runCodeGeneration(*S, RI, *IslAst); - if (ModifiedByCodeGen) { - ModifiedIR = true; - - // For all regions, create new polly::Scop objects because the old ones - // refere to invalidated LLVM-IR. 
- // FIXME: Adds all SCoPs again to statistics - Info.recompute(); - } - } - - return ModifiedIR; - } -}; -} // namespace - -StringRef polly::getPhaseName(PassPhase Phase) { - switch (Phase) { - case PassPhase::Prepare: - return "prepare"; - case PassPhase::Detection: - return "detect"; - case PassPhase::PrintDetect: - return "print-detect"; - case PassPhase::DotScops: - return "dot-scops"; - case PassPhase::DotScopsOnly: - return "dot-scops-only"; - case PassPhase::ViewScops: - return "view-scops"; - case PassPhase::ViewScopsOnly: - return "view-scops-only"; - case PassPhase::ScopInfo: - return "scops"; - case PassPhase::PrintScopInfo: - return "print-scops"; - case PassPhase::Flatten: - return "flatten"; - case PassPhase::Dependences: - return "deps"; - case PassPhase::PrintDependences: - return "print-deps"; - case PassPhase::ImportJScop: - return "import-jscop"; - case PassPhase::Simplify0: - return "simplify-0"; - case PassPhase::Optree: - return "optree"; - case PassPhase::DeLICM: - return "delicm"; - case PassPhase::Simplify1: - return "simplify-1"; - case PassPhase::DeadCodeElimination: - return "dce"; - case PassPhase::MaximumStaticExtension: - return "mse"; - case PassPhase::PruneUnprofitable: - return "prune"; - case PassPhase::Optimization: - return "opt-isl"; // "opt" would conflict with the llvm executable - case PassPhase::ExportJScop: - return "export-jscop"; - case PassPhase::AstGen: - return "ast"; - case PassPhase::CodeGen: - return "codegen"; - default: - llvm_unreachable("Unexpected phase"); - } -} - -PassPhase polly::parsePhase(StringRef Name) { - return StringSwitch<PassPhase>(Name) - .Case("prepare", PassPhase::Prepare) - .Case("detect", PassPhase::Detection) - .Case("print-detect", PassPhase::PrintDetect) - .Case("dot-scops", PassPhase::DotScops) - .Case("dot-scops-only", PassPhase::DotScopsOnly) - .Case("view-scops", PassPhase::ViewScops) - .Case("view-scops-only", PassPhase::ViewScopsOnly) - .Case("scops", PassPhase::ScopInfo) - .Case("print-scops", PassPhase::PrintScopInfo) - .Case("flatten", PassPhase::Flatten) - .Case("deps", PassPhase::Dependences) - .Case("print-deps", PassPhase::PrintDependences) - .Case("import-jscop", PassPhase::ImportJScop) - .Case("simplify-0", PassPhase::Simplify0) - .Case("optree", PassPhase::Optree) - .Case("delicm", PassPhase::DeLICM) - .Case("simplify-1", PassPhase::Simplify1) - .Case("dce", PassPhase::DeadCodeElimination) - .Case("mse", PassPhase::MaximumStaticExtension) - .Case("prune", PassPhase::PruneUnprofitable) - .Case("opt-isl", PassPhase::Optimization) - .Case("export-jscop", PassPhase::ExportJScop) - .Case("ast", PassPhase::AstGen) - .Case("codegen", PassPhase::CodeGen) - .Default(PassPhase::None); -} - -bool polly::dependsOnDependenceInfo(PassPhase Phase) { - // Nothing before dep phase can depend on it - if (static_cast<size_t>(Phase) <= static_cast<size_t>(PassPhase::Dependences)) - return false; - - switch (Phase) { - case PassPhase::Simplify0: - case PassPhase::Optree: - case PassPhase::DeLICM: - case PassPhase::Simplify1: - case PassPhase::PruneUnprofitable: - case PassPhase::ImportJScop: - case PassPhase::ExportJScop: - case PassPhase::AstGen: // transitively through codegen - case PassPhase::CodeGen: - return false; - default: - return true; - } -} - -void PollyPassOptions::enableEnd2End() { - setPhaseEnabled(PassPhase::Detection); - setPhaseEnabled(PassPhase::ScopInfo); - setPhaseEnabled(PassPhase::Dependences); - setPhaseEnabled(PassPhase::AstGen); - setPhaseEnabled(PassPhase::CodeGen); -} - -void 
PollyPassOptions::enableDefaultOpts() { - setPhaseEnabled(PassPhase::Prepare); - setPhaseEnabled(PassPhase::Simplify0); - setPhaseEnabled(PassPhase::Optree); - setPhaseEnabled(PassPhase::DeLICM); - setPhaseEnabled(PassPhase::Simplify1); - setPhaseEnabled(PassPhase::PruneUnprofitable); - setPhaseEnabled(PassPhase::Optimization); -} - -void PollyPassOptions::disableAfter(PassPhase Phase) { - assert(Phase != PassPhase::None); - for (PassPhase P : enum_seq_inclusive(Phase, PassPhase::PassPhaseLast)) { - if (P == Phase) - continue; - setPhaseEnabled(P, false); - } -} - -Error PollyPassOptions::checkConsistency() const { - for (PassPhase P : enum_seq_inclusive(PassPhase::PassPhaseFirst, - PassPhase::PassPhaseLast)) { - if (!isPhaseEnabled(P)) - continue; - - // Prepare and Detection have no requirements - if (P == PassPhase::Prepare || P == PassPhase::Detection) - continue; - - if (!isPhaseEnabled(PassPhase::Detection)) - return make_error<StringError>( - formatv("'{0}' requires 'detect' to be enabled", getPhaseName(P)) - .str(), - inconvertibleErrorCode()); - - if (static_cast<size_t>(P) < static_cast<size_t>(PassPhase::ScopInfo)) - continue; - - if (!isPhaseEnabled(PassPhase::ScopInfo)) - return make_error<StringError>( - formatv("'{0}' requires 'scops' to be enabled", getPhaseName(P)) - .str(), - inconvertibleErrorCode()); - - if (dependsOnDependenceInfo(P) && !isPhaseEnabled(PassPhase::Dependences)) - return make_error<StringError>( - formatv("'{0}' requires 'deps' to be enabled", getPhaseName(P)).str(), - inconvertibleErrorCode()); - } - - if (isPhaseEnabled(PassPhase::CodeGen) && !isPhaseEnabled(PassPhase::AstGen)) - return make_error<StringError>("'codegen' requires 'ast' to be enabled", - inconvertibleErrorCode()); - - return Error::success(); -} - -bool polly::runPollyPass(Function &F, FunctionAnalysisManager &FAM, - PollyPassOptions Opts) { - return PhaseManager(F, FAM, std::move(Opts)).run(); -} diff --git a/polly/lib/Pass/PollyFunctionPass.cpp b/polly/lib/Pass/PollyFunctionPass.cpp deleted file mode 100644 index a478e4df2ca20..0000000000000 --- a/polly/lib/Pass/PollyFunctionPass.cpp +++ /dev/null @@ -1,22 +0,0 @@ -//===------ PollyFunctionPass.cpp - Polly function pass ------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "polly/Pass/PollyFunctionPass.h" - -using namespace llvm; -using namespace polly; - -PreservedAnalyses PollyFunctionPass::run(llvm::Function &F, - llvm::FunctionAnalysisManager &FAM) { - bool ModifiedIR = runPollyPass(F, FAM, Opts); - - // Be conservative about preserved analyses. - // FIXME: May also need to invalidate/update Module/CGSCC passes, but cannot - // reach them within a FunctionPassManager. - return ModifiedIR ? PreservedAnalyses::none() : PreservedAnalyses::all(); -} diff --git a/polly/lib/Pass/PollyModulePass.cpp b/polly/lib/Pass/PollyModulePass.cpp deleted file mode 100644 index f56ee672b76af..0000000000000 --- a/polly/lib/Pass/PollyModulePass.cpp +++ /dev/null @@ -1,29 +0,0 @@ -//===------ PollyModulePass.cpp - Polly module pass ----------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "polly/Pass/PollyModulePass.h" -#include "llvm/IR/Module.h" - -using namespace llvm; -using namespace polly; - -PreservedAnalyses PollyModulePass::run(llvm::Module &M, - llvm::ModuleAnalysisManager &MAM) { - FunctionAnalysisManager &FAM = - MAM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager(); - - bool ModifiedAnyIR = false; - for (Function &F : M) { - bool LocalModifiedIR = runPollyPass(F, FAM, Opts); - ModifiedAnyIR |= LocalModifiedIR; - } - - // Be conservative about preserved analyses, especially if parallel functions - // have been outlined. - return ModifiedAnyIR ? PreservedAnalyses::none() : PreservedAnalyses::all(); -} diff --git a/polly/lib/Support/DumpFunctionPass.cpp b/polly/lib/Support/DumpFunctionPass.cpp index 9565e2156aee6..e47b7fe0db966 100644 --- a/polly/lib/Support/DumpFunctionPass.cpp +++ b/polly/lib/Support/DumpFunctionPass.cpp @@ -13,6 +13,7 @@ #include "polly/Support/DumpFunctionPass.h" #include "llvm/IR/Module.h" #include "llvm/IR/PassInstrumentation.h" +#include "llvm/Pass.h" #include "llvm/Support/Debug.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/Path.h" @@ -81,10 +82,50 @@ static void runDumpFunction(llvm::Function &F, StringRef Suffix) { Out->keep(); LLVM_DEBUG(dbgs() << "Dump file " << Dumpfile << " written successfully\n"); } + +class DumpFunctionWrapperPass final : public FunctionPass { +private: + DumpFunctionWrapperPass(const DumpFunctionWrapperPass &) = delete; + const DumpFunctionWrapperPass & + operator=(const DumpFunctionWrapperPass &) = delete; + + std::string Suffix; + +public: + static char ID; + + explicit DumpFunctionWrapperPass() : FunctionPass(ID), Suffix("-dump") {} + + explicit DumpFunctionWrapperPass(std::string Suffix) + : FunctionPass(ID), Suffix(std::move(Suffix)) {} + + /// @name FunctionPass interface + //@{ + void getAnalysisUsage(llvm::AnalysisUsage &AU) const override { + AU.setPreservesAll(); + } + + bool runOnFunction(llvm::Function &F) override { + runDumpFunction(F, Suffix); + return false; + } + //@} +}; + +char DumpFunctionWrapperPass::ID; } // namespace +FunctionPass *polly::createDumpFunctionWrapperPass(std::string Suffix) { + return new DumpFunctionWrapperPass(std::move(Suffix)); +} + llvm::PreservedAnalyses DumpFunctionPass::run(Function &F, FunctionAnalysisManager &AM) { runDumpFunction(F, Suffix); return PreservedAnalyses::all(); } + +INITIALIZE_PASS_BEGIN(DumpFunctionWrapperPass, "polly-dump-function", + "Polly - Dump Function", false, false) +INITIALIZE_PASS_END(DumpFunctionWrapperPass, "polly-dump-function", + "Polly - Dump Function", false, false) diff --git a/polly/lib/Support/DumpModulePass.cpp b/polly/lib/Support/DumpModulePass.cpp index 2eaa0707fe571..c1c27ef6ac757 100644 --- a/polly/lib/Support/DumpModulePass.cpp +++ b/polly/lib/Support/DumpModulePass.cpp @@ -12,6 +12,7 @@ #include "polly/Support/DumpModulePass.h" #include "llvm/IR/Module.h" +#include "llvm/Pass.h" #include "llvm/Support/Debug.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/Path.h" @@ -46,10 +47,56 @@ static void runDumpModule(llvm::Module &M, StringRef Filename, bool IsSuffix) { M.print(Out->os(), nullptr); Out->keep(); } + +class DumpModuleWrapperPass final : public ModulePass { +private: + DumpModuleWrapperPass(const DumpModuleWrapperPass &) = delete; + const DumpModuleWrapperPass & + operator=(const DumpModuleWrapperPass &) = delete; + + 
std::string Filename; + bool IsSuffix; + +public: + static char ID; + + /// This constructor is used e.g. if using opt -polly-dump-module. + /// + /// Provide a default suffix to not overwrite the original file. + explicit DumpModuleWrapperPass() + : ModulePass(ID), Filename("-dump"), IsSuffix(true) {} + + explicit DumpModuleWrapperPass(std::string Filename, bool IsSuffix) + : ModulePass(ID), Filename(std::move(Filename)), IsSuffix(IsSuffix) {} + + /// @name ModulePass interface + //@{ + void getAnalysisUsage(llvm::AnalysisUsage &AU) const override { + AU.setPreservesAll(); + } + + bool runOnModule(llvm::Module &M) override { + runDumpModule(M, Filename, IsSuffix); + return false; + } + //@} +}; + +char DumpModuleWrapperPass::ID; } // namespace +ModulePass *polly::createDumpModuleWrapperPass(std::string Filename, + bool IsSuffix) { + return new DumpModuleWrapperPass(std::move(Filename), IsSuffix); +} + llvm::PreservedAnalyses DumpModulePass::run(llvm::Module &M, llvm::ModuleAnalysisManager &AM) { runDumpModule(M, Filename, IsSuffix); return PreservedAnalyses::all(); } + +INITIALIZE_PASS_BEGIN(DumpModuleWrapperPass, "polly-dump-module", + "Polly - Dump Module", false, false) +INITIALIZE_PASS_END(DumpModuleWrapperPass, "polly-dump-module", + "Polly - Dump Module", false, false) diff --git a/polly/lib/Support/PollyPasses.def b/polly/lib/Support/PollyPasses.def index 496839760a844..2c792a5867100 100644 --- a/polly/lib/Support/PollyPasses.def +++ b/polly/lib/Support/PollyPasses.def @@ -1,10 +1,3 @@ -#ifndef MODULE_PASS -#define MODULE_PASS(NAME, CREATE_PASS, PARSER) -#endif -MODULE_PASS("polly", createModuleToFunctionPassAdaptor(PollyFunctionPass(Opts)), parsePollyDefaultOptions) -MODULE_PASS("polly-custom", createModuleToFunctionPassAdaptor(PollyFunctionPass(Opts)), parsePollyCustomOptions) -#undef MODULE_PASS - #ifndef CGSCC_PASS #define CGSCC_PASS(NAME, CREATE_PASS, PARSER) #endif @@ -19,17 +12,15 @@ FUNCTION_ANALYSIS("polly-function-scops", ScopInfoAnalysis()) #undef FUNCTION_ANALYSIS #ifndef FUNCTION_PASS -#define FUNCTION_PASS(NAME, CREATE_PASS, PARSER) +#define FUNCTION_PASS(NAME, CREATE_PASS) #endif -FUNCTION_PASS("polly-prepare", CodePreparationPass(), parseNoOptions) -FUNCTION_PASS("print<polly-detect>", ScopAnalysisPrinterPass(llvm::errs()), parseNoOptions) -FUNCTION_PASS("print<polly-function-scops>", ScopInfoPrinterPass(llvm::errs()), parseNoOptions) -FUNCTION_PASS("polly-scop-viewer", ScopViewer(), parseNoOptions) -FUNCTION_PASS("polly-scop-only-viewer", ScopOnlyViewer(), parseNoOptions) -FUNCTION_PASS("polly-scop-printer", ScopPrinter(), parseNoOptions) -FUNCTION_PASS("polly-scop-only-printer", ScopOnlyPrinter(), parseNoOptions) -FUNCTION_PASS("polly", PollyFunctionPass(Opts), parsePollyDefaultOptions) -FUNCTION_PASS("polly-custom", PollyFunctionPass(Opts), parsePollyCustomOptions) +FUNCTION_PASS("polly-prepare", CodePreparationPass()) +FUNCTION_PASS("print<polly-detect>", ScopAnalysisPrinterPass(llvm::errs())) +FUNCTION_PASS("print<polly-function-scops>", ScopInfoPrinterPass(llvm::errs())) +FUNCTION_PASS("polly-scop-viewer", ScopViewer()) +FUNCTION_PASS("polly-scop-only-viewer", ScopOnlyViewer()) +FUNCTION_PASS("polly-scop-printer", ScopPrinter()) +FUNCTION_PASS("polly-scop-only-printer", ScopOnlyPrinter()) #undef FUNCTION_PASS #ifndef SCOP_ANALYSIS diff --git a/polly/lib/Support/RegisterPasses.cpp b/polly/lib/Support/RegisterPasses.cpp index 2f1d7a8362349..04f8715502c38 100644 --- a/polly/lib/Support/RegisterPasses.cpp +++ b/polly/lib/Support/RegisterPasses.cpp @@ -28,9 +28,8 
@@ #include "polly/DependenceInfo.h" #include "polly/ForwardOpTree.h" #include "polly/JSONExporter.h" +#include "polly/LinkAllPasses.h" #include "polly/MaximalStaticExpansion.h" -#include "polly/Options.h" -#include "polly/Pass/PollyFunctionPass.h" #include "polly/PruneUnprofitable.h" #include "polly/ScheduleOptimizer.h" #include "polly/ScopDetection.h" @@ -53,8 +52,6 @@ #include "llvm/Transforms/IPO.h" using namespace llvm; -using namespace polly; - namespace cl = llvm::cl; using namespace polly; @@ -204,19 +201,58 @@ static cl::opt<bool> EnablePruneUnprofitable( cl::desc("Bail out on unprofitable SCoPs before rescheduling"), cl::Hidden, cl::init(true), cl::cat(PollyCategory)); -static cl::opt<bool> - PollyPrintDetect("polly-print-detect", - cl::desc("Polly - Print static control parts (SCoPs)"), - cl::cat(PollyCategory)); - -static cl::opt<bool> - PollyPrintScops("polly-print-scops", - cl::desc("Print polyhedral description of all regions"), - cl::cat(PollyCategory)); +namespace { -static cl::opt<bool> PollyPrintDeps("polly-print-deps", - cl::desc("Polly - Print dependences"), - cl::cat(PollyCategory)); +/// Initialize Polly passes when library is loaded. +/// +/// We use the constructor of a statically declared object to initialize the +/// different Polly passes right after the Polly library is loaded. This ensures +/// that the Polly passes are available e.g. in the 'opt' tool. +struct StaticInitializer { + StaticInitializer() { + llvm::PassRegistry &Registry = *llvm::PassRegistry::getPassRegistry(); + polly::initializePollyPasses(Registry); + } +}; +static StaticInitializer InitializeEverything; +} // end of anonymous namespace. + +void initializePollyPasses(llvm::PassRegistry &Registry) { + initializeCodeGenerationPass(Registry); + + initializeCodePreparationPass(Registry); + initializeDeadCodeElimWrapperPassPass(Registry); + initializeDependenceInfoPass(Registry); + initializeDependenceInfoPrinterLegacyPassPass(Registry); + initializeDependenceInfoWrapperPassPass(Registry); + initializeDependenceInfoPrinterLegacyFunctionPassPass(Registry); + initializeJSONExporterPass(Registry); + initializeJSONImporterPass(Registry); + initializeJSONImporterPrinterLegacyPassPass(Registry); + initializeMaximalStaticExpanderWrapperPassPass(Registry); + initializeIslAstInfoWrapperPassPass(Registry); + initializeIslAstInfoPrinterLegacyPassPass(Registry); + initializeIslScheduleOptimizerWrapperPassPass(Registry); + initializeIslScheduleOptimizerPrinterLegacyPassPass(Registry); + initializePollyCanonicalizePass(Registry); + initializeScopDetectionWrapperPassPass(Registry); + initializeScopDetectionPrinterLegacyPassPass(Registry); + initializeScopInlinerWrapperPassPass(Registry); + initializeScopInfoRegionPassPass(Registry); + initializeScopInfoPrinterLegacyRegionPassPass(Registry); + initializeScopInfoWrapperPassPass(Registry); + initializeScopInfoPrinterLegacyFunctionPassPass(Registry); + initializeFlattenSchedulePass(Registry); + initializeFlattenSchedulePrinterLegacyPassPass(Registry); + initializeForwardOpTreeWrapperPassPass(Registry); + initializeForwardOpTreePrinterLegacyPassPass(Registry); + initializeDeLICMWrapperPassPass(Registry); + initializeDeLICMPrinterLegacyPassPass(Registry); + initializeSimplifyWrapperPassPass(Registry); + initializeSimplifyPrinterLegacyPassPass(Registry); + initializeDumpModuleWrapperPassPass(Registry); + initializePruneUnprofitableWrapperPassPass(Registry); +} static bool shouldEnablePollyForOptimization() { return PollyEnabled; } @@ -230,198 +266,6 @@ static bool 
shouldEnablePollyForDiagnostic() { ExportJScop; } -/// Parser of parameters for LoopVectorize pass. -static llvm::Expected<PollyPassOptions> parsePollyOptions(StringRef Params, - bool IsCustom) { - PassPhase PrevPhase = PassPhase::None; - - bool EnableDefaultOpts = !IsCustom; - bool EnableEnd2End = !IsCustom; - std::optional<bool> - PassEnabled[static_cast<size_t>(PassPhase::PassPhaseLast) + 1]; - PassPhase StopAfter = PassPhase::None; - - // Passes enabled using command-line flags (can be overridden using - // 'polly<no-pass>') - if (PollyPrintDetect) - PassEnabled[static_cast<size_t>(PassPhase::PrintDetect)] = true; - if (PollyPrintScops) - PassEnabled[static_cast<size_t>(PassPhase::PrintScopInfo)] = true; - if (PollyPrintDeps) - PassEnabled[static_cast<size_t>(PassPhase::PrintDependences)] = true; - - if (PollyViewer) - PassEnabled[static_cast<size_t>(PassPhase::ViewScops)] = true; - if (PollyOnlyViewer) - PassEnabled[static_cast<size_t>(PassPhase::ViewScopsOnly)] = true; - if (PollyPrinter) - PassEnabled[static_cast<size_t>(PassPhase::DotScops)] = true; - if (PollyOnlyPrinter) - PassEnabled[static_cast<size_t>(PassPhase::DotScopsOnly)] = true; - if (!EnableSimplify) - PassEnabled[static_cast<size_t>(PassPhase::Simplify0)] = false; - if (!EnableForwardOpTree) - PassEnabled[static_cast<size_t>(PassPhase::Optree)] = false; - if (!EnableDeLICM) - PassEnabled[static_cast<size_t>(PassPhase::DeLICM)] = false; - if (!EnableSimplify) - PassEnabled[static_cast<size_t>(PassPhase::Simplify1)] = false; - if (ImportJScop) - PassEnabled[static_cast<size_t>(PassPhase::ImportJScop)] = true; - if (DeadCodeElim) - PassEnabled[static_cast<size_t>(PassPhase::DeadCodeElimination)] = true; - if (FullyIndexedStaticExpansion) - PassEnabled[static_cast<size_t>(PassPhase::MaximumStaticExtension)] = true; - if (!EnablePruneUnprofitable) - PassEnabled[static_cast<size_t>(PassPhase::PruneUnprofitable)] = false; - switch (Optimizer) { - case OPTIMIZER_NONE: - // explicitly switched off - PassEnabled[static_cast<size_t>(PassPhase::Optimization)] = false; - break; - case OPTIMIZER_ISL: - // default: enabled - break; - } - if (ExportJScop) - PassEnabled[static_cast<size_t>(PassPhase::ExportJScop)] = true; - switch (CodeGeneration) { - case CODEGEN_AST: - PassEnabled[static_cast<size_t>(PassPhase::AstGen)] = true; - PassEnabled[static_cast<size_t>(PassPhase::CodeGen)] = false; - break; - case CODEGEN_FULL: - // default: ast and codegen enabled - break; - case CODEGEN_NONE: - PassEnabled[static_cast<size_t>(PassPhase::AstGen)] = false; - PassEnabled[static_cast<size_t>(PassPhase::CodeGen)] = false; - break; - } - - while (!Params.empty()) { - StringRef Param; - std::tie(Param, Params) = Params.split(';'); - auto [ParamName, ParamVal] = Param.split('='); - - if (ParamName == "stopafter") { - StopAfter = parsePhase(ParamVal); - if (StopAfter == PassPhase::None) - return make_error<StringError>( - formatv("invalid stopafter parameter value '{0}'", ParamVal).str(), - inconvertibleErrorCode()); - continue; - } - - if (!ParamVal.empty()) - return make_error<StringError>( - formatv("parameter '{0}' does not take value", ParamName).str(), - inconvertibleErrorCode()); - - bool Enabled = true; - if (ParamName.starts_with("no-")) { - Enabled = false; - ParamName = ParamName.drop_front(3); - } - - if (ParamName == "default-opts") { - EnableDefaultOpts = Enabled; - continue; - } - - if (ParamName == "end2end") { - EnableEnd2End = Enabled; - continue; - } - - PassPhase Phase; - - // Shortcut for both simplifys at the same time - if 
(ParamName == "simplify") { - PassEnabled[static_cast<size_t>(PassPhase::Simplify0)] = Enabled; - PassEnabled[static_cast<size_t>(PassPhase::Simplify1)] = Enabled; - Phase = PassPhase::Simplify0; - } else { - Phase = parsePhase(ParamName); - if (Phase == PassPhase::None) - return make_error<StringError>( - formatv("invalid Polly parameter/phase name '{0}'", ParamName) - .str(), - inconvertibleErrorCode()); - - if (PrevPhase >= Phase) - return make_error<StringError>( - formatv("phases must not be repeated and enumerated in-order: " - "'{0}' listed before '{1}'", - getPhaseName(PrevPhase), getPhaseName(Phase)) - .str(), - inconvertibleErrorCode()); - - PassEnabled[static_cast<size_t>(Phase)] = Enabled; - } - PrevPhase = Phase; - } - - PollyPassOptions Opts; - Opts.ViewAll = ViewAll; - Opts.ViewFilter = ViewFilter; - Opts.PrintDepsAnalysisLevel = OptAnalysisLevel; - - // Implicitly enable dependent phases first. May be overriden explicitly - // on/off later. - for (PassPhase P : llvm::enum_seq_inclusive(PassPhase::PassPhaseFirst, - PassPhase::PassPhaseLast)) { - bool Enabled = PassEnabled[static_cast<size_t>(P)].value_or(false); - if (!Enabled) - continue; - - if (static_cast<size_t>(PassPhase::Detection) < static_cast<size_t>(P)) - Opts.setPhaseEnabled(PassPhase::Detection); - - if (static_cast<size_t>(PassPhase::ScopInfo) < static_cast<size_t>(P)) - Opts.setPhaseEnabled(PassPhase::ScopInfo); - - if (dependsOnDependenceInfo(P)) - Opts.setPhaseEnabled(PassPhase::Dependences); - - if (static_cast<size_t>(PassPhase::AstGen) < static_cast<size_t>(P)) - Opts.setPhaseEnabled(PassPhase::AstGen); - } - - if (EnableEnd2End) - Opts.enableEnd2End(); - - if (EnableDefaultOpts) - Opts.enableDefaultOpts(); - - for (PassPhase P : llvm::enum_seq_inclusive(PassPhase::PassPhaseFirst, - PassPhase::PassPhaseLast)) { - std::optional<bool> Enabled = PassEnabled[static_cast<size_t>(P)]; - - // Apply only if set explicitly. - if (Enabled.has_value()) - Opts.setPhaseEnabled(P, *Enabled); - } - - if (StopAfter != PassPhase::None) - Opts.disableAfter(StopAfter); - - if (Error CheckResult = Opts.checkConsistency()) - return CheckResult; - - return Opts; -} - -static llvm::Expected<PollyPassOptions> -parsePollyDefaultOptions(StringRef Params) { - return parsePollyOptions(Params, false); -} - -static llvm::Expected<PollyPassOptions> -parsePollyCustomOptions(StringRef Params) { - return parsePollyOptions(Params, true); -} - /// Register Polly passes such that they form a polyhedral optimizer. /// /// The individual Polly passes are registered in the pass manager such that @@ -461,12 +305,77 @@ static void buildCommonPollyPipeline(FunctionPassManager &PM, OptimizationLevel Level, bool EnableForOpt) { PassBuilder PB; + ScopPassManager SPM; + + PM.addPass(CodePreparationPass()); + + // TODO add utility passes for the various command line options, once they're + // ported + + if (PollyDetectOnly) { + // Don't add more passes other than the ScopPassManager's detection passes. 
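+    // The adaptor below still runs the SCoP detection analyses on each
+    // function; since SPM is still empty at this point, nothing is
+    // transformed.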
+ PM.addPass(createFunctionToScopPassAdaptor(std::move(SPM))); + return; + } - ExitOnError Err("Inconsistent Polly configuration: "); - PollyPassOptions &&Opts = - Err(parsePollyOptions(StringRef(), /*IsCustom=*/false)); - PM.addPass(PollyFunctionPass(Opts)); + if (PollyViewer) + PM.addPass(ScopViewer()); + if (PollyOnlyViewer) + PM.addPass(ScopOnlyViewer()); + if (PollyPrinter) + PM.addPass(ScopPrinter()); + if (PollyOnlyPrinter) + PM.addPass(ScopOnlyPrinter()); + if (EnableSimplify) + SPM.addPass(SimplifyPass(0)); + if (EnableForwardOpTree) + SPM.addPass(ForwardOpTreePass()); + if (EnableDeLICM) + SPM.addPass(DeLICMPass()); + if (EnableSimplify) + SPM.addPass(SimplifyPass(1)); + + if (ImportJScop) + SPM.addPass(JSONImportPass()); + + if (DeadCodeElim) + SPM.addPass(DeadCodeElimPass()); + + if (FullyIndexedStaticExpansion) + SPM.addPass(MaximalStaticExpansionPass()); + + if (EnablePruneUnprofitable) + SPM.addPass(PruneUnprofitablePass()); + + switch (Optimizer) { + case OPTIMIZER_NONE: + break; /* Do nothing */ + case OPTIMIZER_ISL: + SPM.addPass(IslScheduleOptimizerPass()); + break; + } + + if (ExportJScop) + SPM.addPass(JSONExportPass()); + + if (!EnableForOpt) + return; + + switch (CodeGeneration) { + case CODEGEN_AST: + SPM.addPass( + llvm::RequireAnalysisPass<IslAstAnalysis, Scop, ScopAnalysisManager, + ScopStandardAnalysisResults &, + SPMUpdater &>()); + break; + case CODEGEN_FULL: + SPM.addPass(CodeGenerationPass()); + break; + case CODEGEN_NONE: + break; + } + PM.addPass(createFunctionToScopPassAdaptor(std::move(SPM))); PM.addPass(PB.buildFunctionSimplificationPipeline( Level, llvm::ThinOrFullLTOPhase::None)); // Cleanup @@ -583,9 +492,8 @@ parseCGPipeline(StringRef Name, llvm::CGSCCPassManager &CGPM, return false; } -static llvm::Expected<bool> +static bool parseFunctionPipeline(StringRef Name, FunctionPassManager &FPM, - PassInstrumentationCallbacks *PIC, ArrayRef<PassBuilder::PipelineElement> Pipeline) { if (llvm::parseAnalysisUtilityPasses<OwningScopAnalysisManagerFunctionProxy>( "polly-scop-analyses", Name, FPM)) @@ -597,13 +505,8 @@ parseFunctionPipeline(StringRef Name, FunctionPassManager &FPM, FPM)) \ return true; -#define FUNCTION_PASS(NAME, CREATE_PASS, PARSER) \ - if (PassBuilder::checkParametrizedPassName(Name, NAME)) { \ - auto ExpectedOpts = PassBuilder::parsePassParameters(PARSER, Name, NAME); \ - if (!ExpectedOpts) \ - return ExpectedOpts.takeError(); \ - auto &&Opts = *ExpectedOpts; \ - (void)Opts; \ +#define FUNCTION_PASS(NAME, CREATE_PASS) \ + if (Name == NAME) { \ FPM.addPass(CREATE_PASS); \ return true; \ } @@ -689,28 +592,6 @@ parseTopLevelPipeline(llvm::ModulePassManager &MPM, return true; } -static llvm::Expected<bool> -parseModulePipeline(StringRef Name, llvm::ModulePassManager &MPM, - PassInstrumentationCallbacks *PIC, - ArrayRef<PassBuilder::PipelineElement> Pipeline) { - assert(Pipeline.empty()); - -#define MODULE_PASS(NAME, CREATE_PASS, PARSER) \ - if (PassBuilder::checkParametrizedPassName(Name, NAME)) { \ - auto ExpectedOpts = PassBuilder::parsePassParameters(PARSER, Name, NAME); \ - if (!ExpectedOpts) \ - return ExpectedOpts.takeError(); \ - auto &&Opts = *ExpectedOpts; \ - (void)Opts; \ - MPM.addPass(CREATE_PASS); \ - return true; \ - } - -#include "PollyPasses.def" - - return false; -} - /// Register Polly to be available as an optimizer /// /// @@ -739,36 +620,10 @@ parseModulePipeline(StringRef Name, llvm::ModulePassManager &MPM, /// handle LICMed code to make it useful. 
void registerPollyPasses(PassBuilder &PB) { PassInstrumentationCallbacks *PIC = PB.getPassInstrumentationCallbacks(); - -#define MODULE_PASS(NAME, CREATE_PASS, PARSER) \ - { \ - std::remove_reference_t<decltype(*PARSER(StringRef()))> Opts; \ - (void)Opts; \ - PIC->addClassToPassName(decltype(CREATE_PASS)::name(), NAME); \ - } -#define CGSCC_PASS(NAME, CREATE_PASS, PARSER) \ - { \ - std::remove_reference_t<decltype(*PARSER(StringRef()))> Opts; \ - (void)Opts; \ - PIC->addClassToPassName(decltype(CREATE_PASS)::name(), NAME); \ - } -#define FUNCTION_PASS(NAME, CREATE_PASS, PARSER) \ - { \ - std::remove_reference_t<decltype(*PARSER(StringRef()))> Opts; \ - (void)Opts; \ - PIC->addClassToPassName(decltype(CREATE_PASS)::name(), NAME); \ - } -#include "PollyPasses.def" - PB.registerAnalysisRegistrationCallback([PIC](FunctionAnalysisManager &FAM) { registerFunctionAnalyses(FAM, PIC); }); - PB.registerPipelineParsingCallback( - [PIC](StringRef Name, FunctionPassManager &FPM, - ArrayRef<PassBuilder::PipelineElement> Pipeline) -> bool { - ExitOnError Err("Unable to parse Polly module pass: "); - return Err(parseFunctionPipeline(Name, FPM, PIC, Pipeline)); - }); + PB.registerPipelineParsingCallback(parseFunctionPipeline); PB.registerPipelineParsingCallback( [PIC](StringRef Name, FunctionPassManager &FPM, ArrayRef<PassBuilder::PipelineElement> Pipeline) -> bool { @@ -780,12 +635,6 @@ void registerPollyPasses(PassBuilder &PB) { ExitOnError Err("Unable to parse Polly call graph pass: "); return Err(parseCGPipeline(Name, CGPM, PIC, Pipeline)); }); - PB.registerPipelineParsingCallback( - [PIC](StringRef Name, ModulePassManager &MPM, - ArrayRef<PassBuilder::PipelineElement> Pipeline) -> bool { - ExitOnError Err("Unable to parse Polly module pass: "); - return Err(parseModulePipeline(Name, MPM, PIC, Pipeline)); - }); PB.registerParseTopLevelPipelineCallback( [PIC](llvm::ModulePassManager &MPM, ArrayRef<PassBuilder::PipelineElement> Pipeline) -> bool { diff --git a/polly/lib/Support/ScopHelper.cpp b/polly/lib/Support/ScopHelper.cpp index cf0ec4432f747..a2328d1bbb3cf 100644 --- a/polly/lib/Support/ScopHelper.cpp +++ b/polly/lib/Support/ScopHelper.cpp @@ -206,6 +206,18 @@ void polly::splitEntryBlockForAlloca(BasicBlock *EntryBlock, DominatorTree *DT, splitBlock(EntryBlock, I, DT, LI, RI); } +void polly::splitEntryBlockForAlloca(BasicBlock *EntryBlock, Pass *P) { + auto *DTWP = P->getAnalysisIfAvailable<DominatorTreeWrapperPass>(); + auto *DT = DTWP ? &DTWP->getDomTree() : nullptr; + auto *LIWP = P->getAnalysisIfAvailable<LoopInfoWrapperPass>(); + auto *LI = LIWP ? &LIWP->getLoopInfo() : nullptr; + RegionInfoPass *RIP = P->getAnalysisIfAvailable<RegionInfoPass>(); + RegionInfo *RI = RIP ? &RIP->getRegionInfo() : nullptr; + + // splitBlock updates DT, LI and RI. 
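+  // Any of DT, LI and RI may be null here: getAnalysisIfAvailable only
+  // returns analyses the legacy pass manager has actually computed.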
+  polly::splitEntryBlockForAlloca(EntryBlock, DT, LI, RI);
+}
+
 void polly::recordAssumption(polly::RecordedAssumptionsTy *RecordedAssumptions,
                              polly::AssumptionKind Kind, isl::set Set,
                              DebugLoc Loc, polly::AssumptionSign Sign,
diff --git a/polly/lib/Transform/Canonicalization.cpp b/polly/lib/Transform/Canonicalization.cpp
index cd7195f5374df..1be560e64af40 100644
--- a/polly/lib/Transform/Canonicalization.cpp
+++ b/polly/lib/Transform/Canonicalization.cpp
@@ -13,6 +13,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "polly/Canonicalization.h"
+#include "polly/LinkAllPasses.h"
 #include "polly/Options.h"
 #include "llvm/Analysis/GlobalsModRef.h"
 #include "llvm/Analysis/ProfileSummaryInfo.h"
@@ -38,6 +39,24 @@ static cl::opt<bool>
                  cl::desc("Run an early inliner pass before Polly"), cl::Hidden,
                  cl::cat(PollyCategory));
 
+void polly::registerCanonicalicationPasses(llvm::legacy::PassManagerBase &PM) {
+  bool UseMemSSA = true;
+  PM.add(llvm::createPromoteMemoryToRegisterPass());
+  PM.add(llvm::createEarlyCSEPass(UseMemSSA));
+  PM.add(llvm::createInstructionCombiningPass());
+  PM.add(llvm::createCFGSimplificationPass());
+  PM.add(llvm::createTailCallEliminationPass());
+  PM.add(llvm::createCFGSimplificationPass());
+  PM.add(llvm::createReassociatePass());
+  if (PollyInliner) {
+    PM.add(llvm::createPromoteMemoryToRegisterPass());
+    PM.add(llvm::createCFGSimplificationPass());
+    PM.add(llvm::createInstructionCombiningPass());
+    PM.add(createBarrierNoopPass());
+  }
+  PM.add(llvm::createInstructionCombiningPass());
+}
+
 /// Adapted from llvm::PassBuilder::buildInlinerPipeline
 static ModuleInlinerWrapperPass
 buildInlinePasses(llvm::OptimizationLevel Level) {
@@ -106,3 +125,49 @@ polly::buildCanonicalicationPassesForNPM(llvm::ModulePassManager &MPM,
 
   return FPM;
 }
+
+namespace {
+class PollyCanonicalize final : public ModulePass {
+  PollyCanonicalize(const PollyCanonicalize &) = delete;
+  const PollyCanonicalize &operator=(const PollyCanonicalize &) = delete;
+
+public:
+  static char ID;
+
+  explicit PollyCanonicalize() : ModulePass(ID) {}
+  ~PollyCanonicalize();
+
+  /// @name ModulePass interface.
+ //@{ + void getAnalysisUsage(AnalysisUsage &AU) const override; + void releaseMemory() override; + bool runOnModule(Module &M) override; + void print(raw_ostream &OS, const Module *) const override; + //@} +}; +} // namespace + +PollyCanonicalize::~PollyCanonicalize() {} + +void PollyCanonicalize::getAnalysisUsage(AnalysisUsage &AU) const {} + +void PollyCanonicalize::releaseMemory() {} + +bool PollyCanonicalize::runOnModule(Module &M) { + legacy::PassManager PM; + registerCanonicalicationPasses(PM); + PM.run(M); + + return true; +} + +void PollyCanonicalize::print(raw_ostream &OS, const Module *) const {} + +char PollyCanonicalize::ID = 0; + +Pass *polly::createPollyCanonicalizePass() { return new PollyCanonicalize(); } + +INITIALIZE_PASS_BEGIN(PollyCanonicalize, "polly-canonicalize", + "Polly - Run canonicalization passes", false, false) +INITIALIZE_PASS_END(PollyCanonicalize, "polly-canonicalize", + "Polly - Run canonicalization passes", false, false) diff --git a/polly/lib/Transform/CodePreparation.cpp b/polly/lib/Transform/CodePreparation.cpp index 5b96c865ad80f..d045fb6b62c90 100644 --- a/polly/lib/Transform/CodePreparation.cpp +++ b/polly/lib/Transform/CodePreparation.cpp @@ -16,11 +16,13 @@ //===----------------------------------------------------------------------===// #include "polly/CodePreparation.h" +#include "polly/LinkAllPasses.h" #include "polly/Support/ScopHelper.h" #include "llvm/Analysis/DominanceFrontier.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/RegionInfo.h" #include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/InitializePasses.h" using namespace llvm; using namespace polly; @@ -45,6 +47,32 @@ static bool runCodePreprationImpl(Function &F, DominatorTree *DT, LoopInfo *LI, return true; } +namespace { + +/// Prepare the IR for the scop detection. +/// +class CodePreparation final : public FunctionPass { + CodePreparation(const CodePreparation &) = delete; + const CodePreparation &operator=(const CodePreparation &) = delete; + + void clear(); + +public: + static char ID; + + explicit CodePreparation() : FunctionPass(ID) {} + ~CodePreparation(); + + /// @name FunctionPass interface. 
+ //@{ + void getAnalysisUsage(AnalysisUsage &AU) const override; + void releaseMemory() override; + bool runOnFunction(Function &F) override; + void print(raw_ostream &OS, const Module *) const override; + //@} +}; +} // namespace + PreservedAnalyses CodePreparationPass::run(Function &F, FunctionAnalysisManager &FAM) { auto &DT = FAM.getResult<DominatorTreeAnalysis>(F); @@ -58,3 +86,44 @@ PreservedAnalyses CodePreparationPass::run(Function &F, PA.preserve<LoopAnalysis>(); return PA; } + +void CodePreparation::clear() {} + +CodePreparation::~CodePreparation() { clear(); } + +void CodePreparation::getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired<LoopInfoWrapperPass>(); + + AU.addPreserved<LoopInfoWrapperPass>(); + AU.addPreserved<RegionInfoPass>(); + AU.addPreserved<DominatorTreeWrapperPass>(); + AU.addPreserved<DominanceFrontierWrapperPass>(); +} + +bool CodePreparation::runOnFunction(Function &F) { + if (skipFunction(F)) + return false; + + DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); + LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); + RegionInfo *RI = &getAnalysis<RegionInfoPass>().getRegionInfo(); + + runCodePreprationImpl(F, DT, LI, RI); + + return true; +} + +void CodePreparation::releaseMemory() { clear(); } + +void CodePreparation::print(raw_ostream &OS, const Module *) const {} + +char CodePreparation::ID = 0; +char &polly::CodePreparationID = CodePreparation::ID; + +Pass *polly::createCodePreparationPass() { return new CodePreparation(); } + +INITIALIZE_PASS_BEGIN(CodePreparation, "polly-prepare", + "Polly - Prepare code for polly", false, false) +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) +INITIALIZE_PASS_END(CodePreparation, "polly-prepare", + "Polly - Prepare code for polly", false, false) diff --git a/polly/lib/Transform/DeLICM.cpp b/polly/lib/Transform/DeLICM.cpp index e8f2d951404f3..9a9768afe113e 100644 --- a/polly/lib/Transform/DeLICM.cpp +++ b/polly/lib/Transform/DeLICM.cpp @@ -15,6 +15,7 @@ //===----------------------------------------------------------------------===// #include "polly/DeLICM.h" +#include "polly/LinkAllPasses.h" #include "polly/Options.h" #include "polly/ScopInfo.h" #include "polly/ScopPass.h" @@ -24,6 +25,7 @@ #include "polly/ZoneAlgo.h" #include "llvm/ADT/Statistic.h" #include "llvm/IR/Module.h" +#include "llvm/InitializePasses.h" #include "polly/Support/PollyDebug.h" #define DEBUG_TYPE "polly-delicm" @@ -33,10 +35,6 @@ using namespace llvm; namespace { -static cl::opt<bool> PollyPrintDeLICM("polly-print-delicm", - cl::desc("Polly - Print DeLICM/DePRE"), - cl::cat(PollyCategory)); - cl::opt<int> DelicmMaxOps("polly-delicm-max-ops", cl::desc("Maximum number of isl operations to invest for " @@ -1358,10 +1356,7 @@ class DeLICMImpl final : public ZoneAlgorithm { } /// Return whether at least one transformation been applied. 
- bool isModified() const { - return NumberOfTargetsMapped > 0 || NumberOfMappedValueScalars > 0 || - NumberOfMappedPHIScalars > 0; - } + bool isModified() const { return NumberOfTargetsMapped > 0; } }; static std::unique_ptr<DeLICMImpl> collapseToUnused(Scop &S, LoopInfo &LI) { @@ -1381,7 +1376,7 @@ static std::unique_ptr<DeLICMImpl> collapseToUnused(Scop &S, LoopInfo &LI) { return Impl; } -static std::unique_ptr<DeLICMImpl> runDeLICMImpl(Scop &S, LoopInfo &LI) { +static std::unique_ptr<DeLICMImpl> runDeLICM(Scop &S, LoopInfo &LI) { std::unique_ptr<DeLICMImpl> Impl = collapseToUnused(S, LI); Scop::ScopStatistics ScopStats = S.getStatistics(); @@ -1399,7 +1394,7 @@ static PreservedAnalyses runDeLICMUsingNPM(Scop &S, ScopAnalysisManager &SAM, ScopStandardAnalysisResults &SAR, SPMUpdater &U, raw_ostream *OS) { LoopInfo &LI = SAR.LI; - std::unique_ptr<DeLICMImpl> Impl = runDeLICMImpl(S, LI); + std::unique_ptr<DeLICMImpl> Impl = runDeLICM(S, LI); if (OS) { *OS << "Printing analysis 'Polly - DeLICM/DePRE' for region: '" @@ -1422,8 +1417,88 @@ static PreservedAnalyses runDeLICMUsingNPM(Scop &S, ScopAnalysisManager &SAM, PA.preserveSet<AllAnalysesOn<Loop>>(); return PA; } + +class DeLICMWrapperPass final : public ScopPass { +private: + DeLICMWrapperPass(const DeLICMWrapperPass &) = delete; + const DeLICMWrapperPass &operator=(const DeLICMWrapperPass &) = delete; + + /// The pass implementation, also holding per-scop data. + std::unique_ptr<DeLICMImpl> Impl; + +public: + static char ID; + explicit DeLICMWrapperPass() : ScopPass(ID) {} + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequiredTransitive<ScopInfoRegionPass>(); + AU.addRequired<LoopInfoWrapperPass>(); + AU.setPreservesAll(); + } + + bool runOnScop(Scop &S) override { + // Free resources for previous scop's computation, if not yet done. + releaseMemory(); + + auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); + Impl = runDeLICM(S, LI); + + return Impl->isModified(); + } + + void printScop(raw_ostream &OS, Scop &S) const override { + if (!Impl) + return; + assert(Impl->getScop() == &S); + + OS << "DeLICM result:\n"; + Impl->print(OS); + } + + void releaseMemory() override { Impl.reset(); } +}; + +char DeLICMWrapperPass::ID; + +/// Print result from DeLICMWrapperPass. 
+class DeLICMPrinterLegacyPass final : public ScopPass {
+public:
+  static char ID;
+
+  DeLICMPrinterLegacyPass() : DeLICMPrinterLegacyPass(outs()) {}
+  explicit DeLICMPrinterLegacyPass(llvm::raw_ostream &OS)
+      : ScopPass(ID), OS(OS) {}
+
+  bool runOnScop(Scop &S) override {
+    DeLICMWrapperPass &P = getAnalysis<DeLICMWrapperPass>();
+
+    OS << "Printing analysis '" << P.getPassName() << "' for region: '"
+       << S.getRegion().getNameStr() << "' in function '"
+       << S.getFunction().getName() << "':\n";
+    P.printScop(OS, S);
+
+    return false;
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    ScopPass::getAnalysisUsage(AU);
+    AU.addRequired<DeLICMWrapperPass>();
+    AU.setPreservesAll();
+  }
+
+private:
+  llvm::raw_ostream &OS;
+};
+
+char DeLICMPrinterLegacyPass::ID = 0;
 } // anonymous namespace
 
+Pass *polly::createDeLICMWrapperPass() { return new DeLICMWrapperPass(); }
+
+llvm::Pass *polly::createDeLICMPrinterLegacyPass(llvm::raw_ostream &OS) {
+  return new DeLICMPrinterLegacyPass(OS);
+}
+
 llvm::PreservedAnalyses polly::DeLICMPass::run(Scop &S,
                                                ScopAnalysisManager &SAM,
                                                ScopStandardAnalysisResults &SAR,
@@ -1452,21 +1527,15 @@ bool polly::isConflicting(
   return Knowledge::isConflicting(Existing, Proposed, OS, Indent);
 }
 
-bool polly::runDeLICM(Scop &S) {
-  LoopInfo &LI = *S.getLI();
-  std::unique_ptr<DeLICMImpl> Impl = runDeLICMImpl(S, LI);
-
-  if (PollyPrintDeLICM) {
-    outs() << "Printing analysis 'Polly - DeLICM/DePRE' for region: '"
-           << S.getName() << "' in function '" << S.getFunction().getName()
-           << "':\n";
-    if (Impl) {
-      assert(Impl->getScop() == &S);
-
-      outs() << "DeLICM result:\n";
-      Impl->print(outs());
-    }
-  }
-
-  return Impl->isModified();
-}
+INITIALIZE_PASS_BEGIN(DeLICMWrapperPass, "polly-delicm", "Polly - DeLICM/DePRE",
+                      false, false)
+INITIALIZE_PASS_DEPENDENCY(ScopInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_END(DeLICMWrapperPass, "polly-delicm", "Polly - DeLICM/DePRE",
+                    false, false)
+
+INITIALIZE_PASS_BEGIN(DeLICMPrinterLegacyPass, "polly-print-delicm",
+                      "Polly - Print DeLICM/DePRE", false, false)
+INITIALIZE_PASS_DEPENDENCY(ScopInfoWrapperPass)
+INITIALIZE_PASS_END(DeLICMPrinterLegacyPass, "polly-print-delicm",
+                    "Polly - Print DeLICM/DePRE", false, false)
diff --git a/polly/lib/Transform/DeadCodeElimination.cpp b/polly/lib/Transform/DeadCodeElimination.cpp
index df95e5190431c..5cb89fec09fe8 100644
--- a/polly/lib/Transform/DeadCodeElimination.cpp
+++ b/polly/lib/Transform/DeadCodeElimination.cpp
@@ -33,6 +33,7 @@
 
 #include "polly/DeadCodeElimination.h"
 #include "polly/DependenceInfo.h"
+#include "polly/LinkAllPasses.h"
 #include "polly/Options.h"
 #include "polly/ScopInfo.h"
 #include "llvm/Support/CommandLine.h"
@@ -50,6 +51,20 @@ cl::opt<int> DCEPreciseSteps(
             "before the actual dead code elimination."),
     cl::init(-1), cl::cat(PollyCategory));
 
+class DeadCodeElimWrapperPass final : public ScopPass {
+public:
+  static char ID;
+  explicit DeadCodeElimWrapperPass() : ScopPass(ID) {}
+
+  /// Remove dead iterations from the schedule of @p S.
+  bool runOnScop(Scop &S) override;
+
+  /// Register all analyses and transformations required.
+  void getAnalysisUsage(AnalysisUsage &AU) const override;
+};
+
+char DeadCodeElimWrapperPass::ID = 0;
+
 /// Return the set of live iterations.
 ///
 /// The set of live iterations are all iterations that write to memory and for
@@ -129,19 +144,29 @@ static bool runDeadCodeElimination(Scop &S, int PreciseSteps,
   return S.restrictDomains(Live);
 }
 
-} // namespace
-
-bool polly::runDeadCodeElim(Scop &S, DependenceAnalysis::Result &DA) {
-  const Dependences &Deps = DA.getDependences(Dependences::AL_Statement);
+bool DeadCodeElimWrapperPass::runOnScop(Scop &S) {
+  auto &DI = getAnalysis<DependenceInfo>();
+  const Dependences &Deps = DI.getDependences(Dependences::AL_Statement);
 
   bool Changed = runDeadCodeElimination(S, DCEPreciseSteps, Deps);
 
   // FIXME: We can probably avoid the recomputation of all dependences by
   // updating them explicitly.
   if (Changed)
-    DA.recomputeDependences(Dependences::AL_Statement);
+    DI.recomputeDependences(Dependences::AL_Statement);
 
-  return Changed;
+  return false;
+}
+
+void DeadCodeElimWrapperPass::getAnalysisUsage(AnalysisUsage &AU) const {
+  ScopPass::getAnalysisUsage(AU);
+  AU.addRequired<DependenceInfo>();
+}
+
+} // namespace
+
+Pass *polly::createDeadCodeElimWrapperPass() {
+  return new DeadCodeElimWrapperPass();
 }
 
 llvm::PreservedAnalyses DeadCodeElimPass::run(Scop &S, ScopAnalysisManager &SAM,
@@ -166,3 +191,10 @@ llvm::PreservedAnalyses DeadCodeElimPass::run(Scop &S, ScopAnalysisManager &SAM,
   PA.preserveSet<AllAnalysesOn<Loop>>();
   return PA;
 }
+
+INITIALIZE_PASS_BEGIN(DeadCodeElimWrapperPass, "polly-dce",
+                      "Polly - Remove dead iterations", false, false)
+INITIALIZE_PASS_DEPENDENCY(DependenceInfo)
+INITIALIZE_PASS_DEPENDENCY(ScopInfoRegionPass)
+INITIALIZE_PASS_END(DeadCodeElimWrapperPass, "polly-dce",
+                    "Polly - Remove dead iterations", false, false)
diff --git a/polly/lib/Transform/FlattenSchedule.cpp b/polly/lib/Transform/FlattenSchedule.cpp
index 35a8ce6877036..f514ef359ba07 100644
--- a/polly/lib/Transform/FlattenSchedule.cpp
+++ b/polly/lib/Transform/FlattenSchedule.cpp
@@ -14,7 +14,6 @@
 
 #include "polly/FlattenSchedule.h"
 #include "polly/FlattenAlgo.h"
-#include "polly/Options.h"
 #include "polly/ScopInfo.h"
 #include "polly/ScopPass.h"
 #include "polly/Support/ISLOStream.h"
@@ -27,10 +26,6 @@ using namespace llvm;
 
 namespace {
 
-static cl::opt<bool> PollyPrintFlattenSchedule("polly-print-flatten-schedule",
-                                               cl::desc("A polly pass"),
-                                               cl::cat(PollyCategory));
-
 /// Print a schedule to @p OS.
 ///
 /// Prints the schedule for each statements on a new line.
@@ -39,45 +34,119 @@ void printSchedule(raw_ostream &OS, const isl::union_map &Schedule,
   for (isl::map Map : Schedule.get_map_list())
     OS.indent(indent) << Map << "\n";
 }
-} // namespace
-void polly::runFlattenSchedulePass(Scop &S) {
-  // Keep a reference to isl_ctx to ensure that it is not freed before we free
-  // OldSchedule.
-  auto IslCtx = S.getSharedIslCtx();
+/// Flatten the schedule stored in a polly::Scop.
+class FlattenSchedule final : public ScopPass {
+private:
+  FlattenSchedule(const FlattenSchedule &) = delete;
+  const FlattenSchedule &operator=(const FlattenSchedule &) = delete;
+
+  std::shared_ptr<isl_ctx> IslCtx;
+  isl::union_map OldSchedule;
+
+public:
+  static char ID;
+  explicit FlattenSchedule() : ScopPass(ID) {}
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequiredTransitive<ScopInfoRegionPass>();
+    AU.setPreservesAll();
+  }
+
+  bool runOnScop(Scop &S) override {
+    // Keep a reference to isl_ctx to ensure that it is not freed before we free
+    // OldSchedule.
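+    // (printScop still reads OldSchedule after this method returns; both
+    // members are released in releaseMemory().)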
+ IslCtx = S.getSharedIslCtx(); - POLLY_DEBUG(dbgs() << "Going to flatten old schedule:\n"); - auto OldSchedule = S.getSchedule(); - POLLY_DEBUG(printSchedule(dbgs(), OldSchedule, 2)); + POLLY_DEBUG(dbgs() << "Going to flatten old schedule:\n"); + OldSchedule = S.getSchedule(); + POLLY_DEBUG(printSchedule(dbgs(), OldSchedule, 2)); - auto Domains = S.getDomains(); - auto RestrictedOldSchedule = OldSchedule.intersect_domain(Domains); - POLLY_DEBUG(dbgs() << "Old schedule with domains:\n"); - POLLY_DEBUG(printSchedule(dbgs(), RestrictedOldSchedule, 2)); + auto Domains = S.getDomains(); + auto RestrictedOldSchedule = OldSchedule.intersect_domain(Domains); + POLLY_DEBUG(dbgs() << "Old schedule with domains:\n"); + POLLY_DEBUG(printSchedule(dbgs(), RestrictedOldSchedule, 2)); - auto NewSchedule = flattenSchedule(RestrictedOldSchedule); + auto NewSchedule = flattenSchedule(RestrictedOldSchedule); - POLLY_DEBUG(dbgs() << "Flattened new schedule:\n"); - POLLY_DEBUG(printSchedule(dbgs(), NewSchedule, 2)); + POLLY_DEBUG(dbgs() << "Flattened new schedule:\n"); + POLLY_DEBUG(printSchedule(dbgs(), NewSchedule, 2)); - NewSchedule = NewSchedule.gist_domain(Domains); - POLLY_DEBUG(dbgs() << "Gisted, flattened new schedule:\n"); - POLLY_DEBUG(printSchedule(dbgs(), NewSchedule, 2)); + NewSchedule = NewSchedule.gist_domain(Domains); + POLLY_DEBUG(dbgs() << "Gisted, flattened new schedule:\n"); + POLLY_DEBUG(printSchedule(dbgs(), NewSchedule, 2)); - S.setSchedule(NewSchedule); + S.setSchedule(NewSchedule); + return false; + } - if (PollyPrintFlattenSchedule) { - outs() - << "Printing analysis 'Polly - Print flattened schedule' for region: '" - << S.getRegion().getNameStr() << "' in function '" - << S.getFunction().getName() << "':\n"; + void printScop(raw_ostream &OS, Scop &S) const override { + OS << "Schedule before flattening {\n"; + printSchedule(OS, OldSchedule, 4); + OS << "}\n\n"; - outs() << "Schedule before flattening {\n"; - printSchedule(outs(), OldSchedule, 4); - outs() << "}\n\n"; + OS << "Schedule after flattening {\n"; + printSchedule(OS, S.getSchedule(), 4); + OS << "}\n"; + } - outs() << "Schedule after flattening {\n"; - printSchedule(outs(), S.getSchedule(), 4); - outs() << "}\n"; + void releaseMemory() override { + OldSchedule = {}; + IslCtx.reset(); } +}; + +char FlattenSchedule::ID; + +/// Print result from FlattenSchedule. 
+class FlattenSchedulePrinterLegacyPass final : public ScopPass { +public: + static char ID; + + FlattenSchedulePrinterLegacyPass() + : FlattenSchedulePrinterLegacyPass(outs()) {} + explicit FlattenSchedulePrinterLegacyPass(llvm::raw_ostream &OS) + : ScopPass(ID), OS(OS) {} + + bool runOnScop(Scop &S) override { + FlattenSchedule &P = getAnalysis<FlattenSchedule>(); + + OS << "Printing analysis '" << P.getPassName() << "' for region: '" + << S.getRegion().getNameStr() << "' in function '" + << S.getFunction().getName() << "':\n"; + P.printScop(OS, S); + + return false; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + ScopPass::getAnalysisUsage(AU); + AU.addRequired<FlattenSchedule>(); + AU.setPreservesAll(); + } + +private: + llvm::raw_ostream &OS; +}; + +char FlattenSchedulePrinterLegacyPass::ID = 0; +} // anonymous namespace + +Pass *polly::createFlattenSchedulePass() { return new FlattenSchedule(); } + +Pass *polly::createFlattenSchedulePrinterLegacyPass(llvm::raw_ostream &OS) { + return new FlattenSchedulePrinterLegacyPass(OS); } + +INITIALIZE_PASS_BEGIN(FlattenSchedule, "polly-flatten-schedule", + "Polly - Flatten schedule", false, false) +INITIALIZE_PASS_END(FlattenSchedule, "polly-flatten-schedule", + "Polly - Flatten schedule", false, false) + +INITIALIZE_PASS_BEGIN(FlattenSchedulePrinterLegacyPass, + "polly-print-flatten-schedule", + "Polly - Print flattened schedule", false, false) +INITIALIZE_PASS_DEPENDENCY(FlattenSchedule) +INITIALIZE_PASS_END(FlattenSchedulePrinterLegacyPass, + "polly-print-flatten-schedule", + "Polly - Print flattened schedule", false, false) diff --git a/polly/lib/Transform/ForwardOpTree.cpp b/polly/lib/Transform/ForwardOpTree.cpp index 24d4a4af6e681..e9be6c9cdcc27 100644 --- a/polly/lib/Transform/ForwardOpTree.cpp +++ b/polly/lib/Transform/ForwardOpTree.cpp @@ -28,6 +28,7 @@ #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Value.h" +#include "llvm/InitializePasses.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Compiler.h" @@ -61,11 +62,6 @@ static cl::opt<unsigned> "analysis; 0=no limit"), cl::init(1000000), cl::cat(PollyCategory), cl::Hidden); -static cl::opt<bool> - PollyPrintOptree("polly-print-optree", - cl::desc("Polly - Print forward operand tree result"), - cl::cat(PollyCategory)); - STATISTIC(KnownAnalyzed, "Number of successfully analyzed SCoPs"); STATISTIC(KnownOutOfQuota, "Analyses aborted because max_operations was reached"); @@ -1034,8 +1030,8 @@ class ForwardOpTreeImpl final : ZoneAlgorithm { bool isModified() const { return Modified; } }; -static std::unique_ptr<ForwardOpTreeImpl> runForwardOpTreeImpl(Scop &S, - LoopInfo &LI) { +static std::unique_ptr<ForwardOpTreeImpl> runForwardOpTree(Scop &S, + LoopInfo &LI) { std::unique_ptr<ForwardOpTreeImpl> Impl; { IslMaxOperationsGuard MaxOpGuard(S.getIslCtx().get(), MaxOps, false); @@ -1077,7 +1073,7 @@ runForwardOpTreeUsingNPM(Scop &S, ScopAnalysisManager &SAM, raw_ostream *OS) { LoopInfo &LI = SAR.LI; - std::unique_ptr<ForwardOpTreeImpl> Impl = runForwardOpTreeImpl(S, LI); + std::unique_ptr<ForwardOpTreeImpl> Impl = runForwardOpTree(S, LI); if (OS) { *OS << "Printing analysis 'Polly - Forward operand tree' for region: '" << S.getName() << "' in function '" << S.getFunction().getName() @@ -1098,8 +1094,99 @@ runForwardOpTreeUsingNPM(Scop &S, ScopAnalysisManager &SAM, PA.preserveSet<AllAnalysesOn<Loop>>(); return PA; } + +/// Pass that redirects scalar reads to array elements that are known to 
contain
+/// the same value.
+///
+/// This reduces the number of scalar accesses and therefore potentially
+/// increases the freedom of the scheduler. In the ideal case, all reads of a
+/// scalar definition are redirected (we currently do not care about removing
+/// the write in this case). This is also useful for the main DeLICM pass as
+/// there are fewer scalars to be mapped.
+class ForwardOpTreeWrapperPass final : public ScopPass {
+private:
+  /// The pass implementation, also holding per-scop data.
+  std::unique_ptr<ForwardOpTreeImpl> Impl;
+
+public:
+  static char ID;
+
+  explicit ForwardOpTreeWrapperPass() : ScopPass(ID) {}
+  ForwardOpTreeWrapperPass(const ForwardOpTreeWrapperPass &) = delete;
+  ForwardOpTreeWrapperPass &
+  operator=(const ForwardOpTreeWrapperPass &) = delete;
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequiredTransitive<ScopInfoRegionPass>();
+    AU.addRequired<LoopInfoWrapperPass>();
+    AU.setPreservesAll();
+  }
+
+  bool runOnScop(Scop &S) override {
+    // Free resources for previous SCoP's computation, if not yet done.
+    releaseMemory();
+
+    LoopInfo &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+
+    Impl = runForwardOpTree(S, LI);
+
+    return false;
+  }
+
+  void printScop(raw_ostream &OS, Scop &S) const override {
+    if (!Impl)
+      return;
+
+    assert(Impl->getScop() == &S);
+    Impl->print(OS);
+  }
+
+  void releaseMemory() override { Impl.reset(); }
+}; // class ForwardOpTreeWrapperPass
+
+char ForwardOpTreeWrapperPass::ID;
+
+/// Print result from ForwardOpTreeWrapperPass.
+class ForwardOpTreePrinterLegacyPass final : public ScopPass {
+public:
+  static char ID;
+
+  ForwardOpTreePrinterLegacyPass() : ForwardOpTreePrinterLegacyPass(outs()) {}
+  explicit ForwardOpTreePrinterLegacyPass(llvm::raw_ostream &OS)
+      : ScopPass(ID), OS(OS) {}
+
+  bool runOnScop(Scop &S) override {
+    ForwardOpTreeWrapperPass &P = getAnalysis<ForwardOpTreeWrapperPass>();
+
+    OS << "Printing analysis '" << P.getPassName() << "' for region: '"
+       << S.getRegion().getNameStr() << "' in function '"
+       << S.getFunction().getName() << "':\n";
+    P.printScop(OS, S);
+
+    return false;
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    ScopPass::getAnalysisUsage(AU);
+    AU.addRequired<ForwardOpTreeWrapperPass>();
+    AU.setPreservesAll();
+  }
+
+private:
+  llvm::raw_ostream &OS;
+};
+
+char ForwardOpTreePrinterLegacyPass::ID = 0;
 } // namespace
 
+Pass *polly::createForwardOpTreeWrapperPass() {
+  return new ForwardOpTreeWrapperPass();
+}
+
+Pass *polly::createForwardOpTreePrinterLegacyPass(llvm::raw_ostream &OS) {
+  return new ForwardOpTreePrinterLegacyPass(OS);
+}
+
 llvm::PreservedAnalyses
 ForwardOpTreePass::run(Scop &S, ScopAnalysisManager &SAM,
                        ScopStandardAnalysisResults &SAR,
@@ -1113,20 +1200,14 @@
 ForwardOpTreePrinterPass::run(Scop &S, ScopAnalysisManager &SAM,
                               ScopStandardAnalysisResults &SAR, SPMUpdater &U,
   return runForwardOpTreeUsingNPM(S, SAM, SAR, U, &OS);
 }
 
-bool polly::runForwardOpTree(Scop &S) {
-  LoopInfo &LI = *S.getLI();
-
-  std::unique_ptr<ForwardOpTreeImpl> Impl = runForwardOpTreeImpl(S, LI);
-  if (PollyPrintOptree) {
-    outs() << "Printing analysis 'Polly - Forward operand tree' for region: '"
-           << S.getName() << "' in function '" << S.getFunction().getName()
-           << "':\n";
-    if (Impl) {
-      assert(Impl->getScop() == &S);
-
-      Impl->print(outs());
-    }
-  }
-
-  return Impl->isModified();
-}
+INITIALIZE_PASS_BEGIN(ForwardOpTreeWrapperPass, "polly-optree",
+                      "Polly - Forward operand tree", false, false)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_END(ForwardOpTreeWrapperPass,
"polly-optree", + "Polly - Forward operand tree", false, false) + +INITIALIZE_PASS_BEGIN(ForwardOpTreePrinterLegacyPass, "polly-print-optree", + "Polly - Print forward operand tree result", false, false) +INITIALIZE_PASS_DEPENDENCY(ForwardOpTreeWrapperPass) +INITIALIZE_PASS_END(ForwardOpTreePrinterLegacyPass, "polly-print-optree", + "Polly - Print forward operand tree result", false, false) diff --git a/polly/lib/Transform/MaximalStaticExpansion.cpp b/polly/lib/Transform/MaximalStaticExpansion.cpp index 62a4d251875c5..0719840f74a79 100644 --- a/polly/lib/Transform/MaximalStaticExpansion.cpp +++ b/polly/lib/Transform/MaximalStaticExpansion.cpp @@ -13,13 +13,14 @@ #include "polly/MaximalStaticExpansion.h" #include "polly/DependenceInfo.h" -#include "polly/Options.h" +#include "polly/LinkAllPasses.h" #include "polly/ScopInfo.h" #include "polly/ScopPass.h" #include "polly/Support/ISLTools.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/StringRef.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" +#include "llvm/InitializePasses.h" #include "isl/isl-noexceptions.h" #include "isl/union_map.h" #include <cassert> @@ -34,10 +35,28 @@ using namespace polly; namespace { -static cl::opt<bool> - PollyPrintMSE("polly-print-mse", - cl::desc("Polly - Print Maximal static expansion of SCoP"), - cl::cat(PollyCategory)); +class MaximalStaticExpanderWrapperPass final : public ScopPass { +public: + static char ID; + + explicit MaximalStaticExpanderWrapperPass() : ScopPass(ID) {} + + ~MaximalStaticExpanderWrapperPass() override = default; + + /// Expand the accesses of the SCoP. + /// + /// @param S The SCoP that must be expanded. + bool runOnScop(Scop &S) override; + + /// Print the SCoP. + /// + /// @param OS The stream where to print. + /// @param S The SCop that must be printed. + void printScop(raw_ostream &OS, Scop &S) const override; + + /// Register all analyses and transformations required. + void getAnalysisUsage(AnalysisUsage &AU) const override; +}; #ifndef NDEBUG /// Whether a dimension of a set is bounded (lower and upper) by a constant, @@ -439,8 +458,8 @@ class MaximalStaticExpansionImpl { }; static std::unique_ptr<MaximalStaticExpansionImpl> -runMaximalStaticExpansionImpl(Scop &S, OptimizationRemarkEmitter &ORE, - const Dependences &D) { +runMaximalStaticExpansion(Scop &S, OptimizationRemarkEmitter &ORE, + const Dependences &D) { auto Dependences = D.getDependences(Dependences::TYPE_RAW); std::unique_ptr<MaximalStaticExpansionImpl> Impl = @@ -459,7 +478,7 @@ static PreservedAnalyses runMSEUsingNPM(Scop &S, ScopAnalysisManager &SAM, auto &D = DI.getDependences(Dependences::AL_Reference); std::unique_ptr<MaximalStaticExpansionImpl> Impl = - runMaximalStaticExpansionImpl(S, ORE, D); + runMaximalStaticExpansion(S, ORE, D); if (OS) { *OS << "Printing analysis 'Polly - Maximal static expansion of SCoP' for " @@ -492,24 +511,42 @@ MaximalStaticExpansionPrinterPass::run(Scop &S, ScopAnalysisManager &SAM, return runMSEUsingNPM(S, SAM, SAR, &OS); } -void polly::runMaximalStaticExpansion(Scop &S, DependenceAnalysis::Result &DI) { - OptimizationRemarkEmitter ORE(&S.getFunction()); +char MaximalStaticExpanderWrapperPass::ID = 0; + +bool MaximalStaticExpanderWrapperPass::runOnScop(Scop &S) { + // Get the ORE from OptimizationRemarkEmitterWrapperPass. + OptimizationRemarkEmitter *ORE = + &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); + // Get the RAW Dependences. 
+ auto &DI = getAnalysis<DependenceInfo>(); auto &D = DI.getDependences(Dependences::AL_Reference); std::unique_ptr<MaximalStaticExpansionImpl> Impl = - runMaximalStaticExpansionImpl(S, ORE, D); + runMaximalStaticExpansion(S, *ORE, D); - if (PollyPrintMSE) { - outs() - << "Printing analysis 'Polly - Maximal static expansion of SCoP' for " - "region: '" - << S.getName() << "' in function '" << S.getFunction().getName() - << "':\n"; + return false; +} - if (Impl) { - outs() << "MSE result:\n"; - Impl->print(llvm::outs()); - } - } +void MaximalStaticExpanderWrapperPass::printScop(raw_ostream &OS, + Scop &S) const { + S.print(OS, false); } + +void MaximalStaticExpanderWrapperPass::getAnalysisUsage( + AnalysisUsage &AU) const { + ScopPass::getAnalysisUsage(AU); + AU.addRequired<DependenceInfo>(); + AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); +} + +Pass *polly::createMaximalStaticExpansionPass() { + return new MaximalStaticExpanderWrapperPass(); +} + +INITIALIZE_PASS_BEGIN(MaximalStaticExpanderWrapperPass, "polly-mse", + "Polly - Maximal static expansion of SCoP", false, false); +INITIALIZE_PASS_DEPENDENCY(DependenceInfo); +INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass); +INITIALIZE_PASS_END(MaximalStaticExpanderWrapperPass, "polly-mse", + "Polly - Maximal static expansion of SCoP", false, false) diff --git a/polly/lib/Transform/ScheduleOptimizer.cpp b/polly/lib/Transform/ScheduleOptimizer.cpp index 6acdd6862c4cc..0888ebd7a9362 100644 --- a/polly/lib/Transform/ScheduleOptimizer.cpp +++ b/polly/lib/Transform/ScheduleOptimizer.cpp @@ -57,6 +57,7 @@ #include "llvm/ADT/Sequence.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" +#include "llvm/InitializePasses.h" #include "llvm/Support/CommandLine.h" #include "isl/options.h" @@ -197,10 +198,6 @@ static cl::opt<bool> OptimizedScops( "transformations is applied on the schedule tree"), cl::cat(PollyCategory)); -static cl::opt<bool> PollyPrintOptIsl("polly-print-opt-isl", - cl::desc("A polly pass"), - cl::cat(PollyCategory)); - STATISTIC(ScopsProcessed, "Number of scops processed"); STATISTIC(ScopsRescheduled, "Number of scops rescheduled"); STATISTIC(ScopsOptimized, "Number of scops optimized"); @@ -641,6 +638,34 @@ bool ScheduleTreeOptimizer::isProfitableSchedule(Scop &S, return changed; } +class IslScheduleOptimizerWrapperPass final : public ScopPass { +public: + static char ID; + + explicit IslScheduleOptimizerWrapperPass() : ScopPass(ID) {} + + /// Optimize the schedule of the SCoP @p S. + bool runOnScop(Scop &S) override; + + /// Print the new schedule for the SCoP @p S. + void printScop(raw_ostream &OS, Scop &S) const override; + + /// Register all analyses and transformation required. + void getAnalysisUsage(AnalysisUsage &AU) const override; + + /// Release the internal memory. 
+ void releaseMemory() override { + LastSchedule = {}; + IslCtx.reset(); + } + +private: + std::shared_ptr<isl_ctx> IslCtx; + isl::schedule LastSchedule; +}; + +char IslScheduleOptimizerWrapperPass::ID = 0; + #ifndef NDEBUG static void printSchedule(llvm::raw_ostream &OS, const isl::schedule &Schedule, StringRef Desc) { @@ -708,7 +733,7 @@ static void walkScheduleTreeForStatistics(isl::schedule Schedule, int Version) { &Version); } -static void runIslScheduleOptimizerImpl( +static void runIslScheduleOptimizer( Scop &S, function_ref<const Dependences &(Dependences::AnalysisLevel)> GetDeps, TargetTransformInfo *TTI, OptimizationRemarkEmitter *ORE, @@ -925,6 +950,30 @@ static void runIslScheduleOptimizerImpl( errs() << S; } +bool IslScheduleOptimizerWrapperPass::runOnScop(Scop &S) { + releaseMemory(); + + Function &F = S.getFunction(); + IslCtx = S.getSharedIslCtx(); + + auto getDependences = + [this](Dependences::AnalysisLevel) -> const Dependences & { + return getAnalysis<DependenceInfo>().getDependences( + Dependences::AL_Statement); + }; + OptimizationRemarkEmitter &ORE = + getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); + TargetTransformInfo *TTI = + &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); + + bool DepsChanged = false; + runIslScheduleOptimizer(S, getDependences, TTI, &ORE, LastSchedule, + DepsChanged); + if (DepsChanged) + getAnalysis<DependenceInfo>().abandonDependences(); + return false; +} + static void runScheduleOptimizerPrinter(raw_ostream &OS, isl::schedule LastSchedule) { isl_printer *p; @@ -948,8 +997,36 @@ static void runScheduleOptimizerPrinter(raw_ostream &OS, free(ScheduleStr); } +void IslScheduleOptimizerWrapperPass::printScop(raw_ostream &OS, Scop &) const { + runScheduleOptimizerPrinter(OS, LastSchedule); +} + +void IslScheduleOptimizerWrapperPass::getAnalysisUsage( + AnalysisUsage &AU) const { + ScopPass::getAnalysisUsage(AU); + AU.addRequired<DependenceInfo>(); + AU.addRequired<TargetTransformInfoWrapperPass>(); + AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); + + AU.addPreserved<DependenceInfo>(); + AU.addPreserved<OptimizationRemarkEmitterWrapperPass>(); +} + } // namespace +Pass *polly::createIslScheduleOptimizerWrapperPass() { + return new IslScheduleOptimizerWrapperPass(); +} + +INITIALIZE_PASS_BEGIN(IslScheduleOptimizerWrapperPass, "polly-opt-isl", + "Polly - Optimize schedule of SCoP", false, false); +INITIALIZE_PASS_DEPENDENCY(DependenceInfo); +INITIALIZE_PASS_DEPENDENCY(ScopInfoRegionPass); +INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass); +INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass); +INITIALIZE_PASS_END(IslScheduleOptimizerWrapperPass, "polly-opt-isl", + "Polly - Optimize schedule of SCoP", false, false) + static llvm::PreservedAnalyses runIslScheduleOptimizerUsingNPM(Scop &S, ScopAnalysisManager &SAM, ScopStandardAnalysisResults &SAR, SPMUpdater &U, @@ -962,7 +1039,7 @@ runIslScheduleOptimizerUsingNPM(Scop &S, ScopAnalysisManager &SAM, TargetTransformInfo *TTI = &SAR.TTI; isl::schedule LastSchedule; bool DepsChanged = false; - runIslScheduleOptimizerImpl(S, GetDeps, TTI, &ORE, LastSchedule, DepsChanged); + runIslScheduleOptimizer(S, GetDeps, TTI, &ORE, LastSchedule, DepsChanged); if (DepsChanged) Deps.abandonDependences(); @@ -988,23 +1065,52 @@ IslScheduleOptimizerPrinterPass::run(Scop &S, ScopAnalysisManager &SAM, return runIslScheduleOptimizerUsingNPM(S, SAM, SAR, U, &OS); } -void polly::runIslScheduleOptimizer(Scop &S, TargetTransformInfo *TTI, - DependenceAnalysis::Result &Deps) { 
-  auto GetDeps = [&Deps](Dependences::AnalysisLevel) -> const Dependences & {
-    return Deps.getDependences(Dependences::AL_Statement);
-  };
-  OptimizationRemarkEmitter ORE(&S.getFunction());
-  isl::schedule LastSchedule;
-  bool DepsChanged = false;
-  runIslScheduleOptimizerImpl(S, GetDeps, TTI, &ORE, LastSchedule, DepsChanged);
-  if (DepsChanged)
-    Deps.abandonDependences();
+//===----------------------------------------------------------------------===//
 
-  if (PollyPrintOptIsl) {
-    outs()
-        << "Printing analysis 'Polly - Optimize schedule of SCoP' for region: '"
-        << S.getName() << "' in function '" << S.getFunction().getName()
-        << "':\n";
-    runScheduleOptimizerPrinter(outs(), LastSchedule);
+namespace {
+/// Print result from IslScheduleOptimizerWrapperPass.
+class IslScheduleOptimizerPrinterLegacyPass final : public ScopPass {
+public:
+  static char ID;
+
+  IslScheduleOptimizerPrinterLegacyPass()
+      : IslScheduleOptimizerPrinterLegacyPass(outs()) {}
+  explicit IslScheduleOptimizerPrinterLegacyPass(llvm::raw_ostream &OS)
+      : ScopPass(ID), OS(OS) {}
+
+  bool runOnScop(Scop &S) override {
+    IslScheduleOptimizerWrapperPass &P =
+        getAnalysis<IslScheduleOptimizerWrapperPass>();
+
+    OS << "Printing analysis '" << P.getPassName() << "' for region: '"
+       << S.getRegion().getNameStr() << "' in function '"
+       << S.getFunction().getName() << "':\n";
+    P.printScop(OS, S);
+
+    return false;
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    ScopPass::getAnalysisUsage(AU);
+    AU.addRequired<IslScheduleOptimizerWrapperPass>();
+    AU.setPreservesAll();
   }
+
+private:
+  llvm::raw_ostream &OS;
+};
+
+char IslScheduleOptimizerPrinterLegacyPass::ID = 0;
+} // namespace
+
+Pass *polly::createIslScheduleOptimizerPrinterLegacyPass(raw_ostream &OS) {
+  return new IslScheduleOptimizerPrinterLegacyPass(OS);
 }
+
+INITIALIZE_PASS_BEGIN(IslScheduleOptimizerPrinterLegacyPass,
+                      "polly-print-opt-isl",
+                      "Polly - Print optimizer schedule of SCoP", false, false);
+INITIALIZE_PASS_DEPENDENCY(IslScheduleOptimizerWrapperPass)
+INITIALIZE_PASS_END(IslScheduleOptimizerPrinterLegacyPass,
+                    "polly-print-opt-isl",
+                    "Polly - Print optimizer schedule of SCoP", false, false)
diff --git a/polly/lib/Transform/ScopInliner.cpp b/polly/lib/Transform/ScopInliner.cpp
index 8e7a0dedaf533..c04ba3498339e 100644
--- a/polly/lib/Transform/ScopInliner.cpp
+++ b/polly/lib/Transform/ScopInliner.cpp
@@ -95,7 +95,53 @@ template <typename SCC_t> bool runScopInlinerImpl(Function *F, SCC_t &SCC) {
 
   return Changed;
 }
+
+class ScopInlinerWrapperPass final : public CallGraphSCCPass {
+  using llvm::Pass::doInitialization;
+
+public:
+  static char ID;
+
+  ScopInlinerWrapperPass() : CallGraphSCCPass(ID) {}
+
+  bool doInitialization(CallGraph &CG) override {
+    if (!polly::PollyAllowFullFunction) {
+      report_fatal_error(
+          "Aborting from ScopInliner because it only makes sense to run with "
+          "-polly-allow-full-function. "
+          "The heuristic for ScopInliner checks that the full function is a "
+          "Scop, which happens if and only if polly-allow-full-function is "
+          "enabled.
" + " If not, the entry block is not included in the Scop"); + } + return true; + } + + bool runOnSCC(CallGraphSCC &SCC) override { + Function *F = (*SCC.begin())->getFunction(); + return runScopInlinerImpl(F, SCC); + }; + + void getAnalysisUsage(AnalysisUsage &AU) const override { + CallGraphSCCPass::getAnalysisUsage(AU); + } +}; } // namespace +char ScopInlinerWrapperPass::ID; + +Pass *polly::createScopInlinerWrapperPass() { + ScopInlinerWrapperPass *pass = new ScopInlinerWrapperPass(); + return pass; +} + +INITIALIZE_PASS_BEGIN( + ScopInlinerWrapperPass, "polly-scop-inliner", + "inline functions based on how much of the function is a scop.", false, + false) +INITIALIZE_PASS_END( + ScopInlinerWrapperPass, "polly-scop-inliner", + "inline functions based on how much of the function is a scop.", false, + false) polly::ScopInlinerPass::ScopInlinerPass() { if (!polly::PollyAllowFullFunction) { diff --git a/polly/lib/Transform/Simplify.cpp b/polly/lib/Transform/Simplify.cpp index cf0f8c5ca5ef2..75e91cd1c031a 100644 --- a/polly/lib/Transform/Simplify.cpp +++ b/polly/lib/Transform/Simplify.cpp @@ -11,7 +11,6 @@ //===----------------------------------------------------------------------===// #include "polly/Simplify.h" -#include "polly/Options.h" #include "polly/ScopInfo.h" #include "polly/ScopPass.h" #include "polly/Support/GICHelper.h" @@ -19,6 +18,7 @@ #include "polly/Support/ISLTools.h" #include "polly/Support/VirtualInstruction.h" #include "llvm/ADT/Statistic.h" +#include "llvm/InitializePasses.h" #include "llvm/Support/Debug.h" #include <optional> @@ -30,11 +30,6 @@ using namespace polly; namespace { -static cl::opt<bool> - PollyPrintSimplify("polly-print-simplify", - cl::desc("Polly - Print Simplify actions"), - cl::cat(PollyCategory)); - #define TWO_STATISTICS(VARNAME, DESC) \ static llvm::Statistic VARNAME[2] = { \ {DEBUG_TYPE, #VARNAME "0", DESC " (first)"}, \ @@ -761,6 +756,39 @@ void SimplifyImpl::printScop(raw_ostream &OS, Scop &S) const { printAccesses(OS); } +class SimplifyWrapperPass final : public ScopPass { +public: + static char ID; + int CallNo; + std::optional<SimplifyImpl> Impl; + + explicit SimplifyWrapperPass(int CallNo = 0) : ScopPass(ID), CallNo(CallNo) {} + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequiredTransitive<ScopInfoRegionPass>(); + AU.addRequired<LoopInfoWrapperPass>(); + AU.setPreservesAll(); + } + + bool runOnScop(Scop &S) override { + LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); + + Impl.emplace(CallNo); + Impl->run(S, LI); + + return false; + } + + void printScop(raw_ostream &OS, Scop &S) const override { + if (Impl) + Impl->printScop(OS, S); + } + + void releaseMemory() override { Impl.reset(); } +}; + +char SimplifyWrapperPass::ID; + static llvm::PreservedAnalyses runSimplifyUsingNPM(Scop &S, ScopAnalysisManager &SAM, ScopStandardAnalysisResults &SAR, SPMUpdater &U, int CallNo, @@ -815,15 +843,58 @@ SmallVector<MemoryAccess *, 32> polly::getAccessesInOrder(ScopStmt &Stmt) { return Accesses; } -bool polly::runSimplify(Scop &S, int CallNo) { - SimplifyImpl Impl(CallNo); - Impl.run(S, S.getLI()); - if (PollyPrintSimplify) { - outs() << "Printing analysis 'Polly - Simplify' for region: '" - << S.getName() << "' in function '" << S.getFunction().getName() - << "':\n"; - Impl.printScop(outs(), S); +Pass *polly::createSimplifyWrapperPass(int CallNo) { + return new SimplifyWrapperPass(CallNo); +} + +INITIALIZE_PASS_BEGIN(SimplifyWrapperPass, "polly-simplify", "Polly - Simplify", + false, false) 
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_END(SimplifyWrapperPass, "polly-simplify", "Polly - Simplify",
+                    false, false)
+
+//===----------------------------------------------------------------------===//
+
+namespace {
+/// Print result from SimplifyWrapperPass.
+class SimplifyPrinterLegacyPass final : public ScopPass {
+public:
+  static char ID;
+
+  SimplifyPrinterLegacyPass() : SimplifyPrinterLegacyPass(outs()) {}
+  explicit SimplifyPrinterLegacyPass(llvm::raw_ostream &OS)
+      : ScopPass(ID), OS(OS) {}
+
+  bool runOnScop(Scop &S) override {
+    SimplifyWrapperPass &P = getAnalysis<SimplifyWrapperPass>();
+
+    OS << "Printing analysis '" << P.getPassName() << "' for region: '"
+       << S.getRegion().getNameStr() << "' in function '"
+       << S.getFunction().getName() << "':\n";
+    P.printScop(OS, S);
+
+    return false;
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    ScopPass::getAnalysisUsage(AU);
+    AU.addRequired<SimplifyWrapperPass>();
+    AU.setPreservesAll();
+  }
-  return Impl.isModified();
+
+private:
+  llvm::raw_ostream &OS;
+};
+
+char SimplifyPrinterLegacyPass::ID = 0;
+} // namespace
+
+Pass *polly::createSimplifyPrinterLegacyPass(raw_ostream &OS) {
+  return new SimplifyPrinterLegacyPass(OS);
+}
+
+INITIALIZE_PASS_BEGIN(SimplifyPrinterLegacyPass, "polly-print-simplify",
+                      "Polly - Print Simplify actions", false, false)
+INITIALIZE_PASS_DEPENDENCY(SimplifyWrapperPass)
+INITIALIZE_PASS_END(SimplifyPrinterLegacyPass, "polly-print-simplify",
+                    "Polly - Print Simplify actions", false, false)
diff --git a/polly/test/CodeGen/20100617.ll b/polly/test/CodeGen/20100617.ll
index 7de1b843a5b0a..7229a6e3d5240 100644
--- a/polly/test/CodeGen/20100617.ll
+++ b/polly/test/CodeGen/20100617.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' < %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen < %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
 
 define void @init_array() nounwind {
diff --git a/polly/test/CodeGen/20100622.ll b/polly/test/CodeGen/20100622.ll
index 13a6159d3e7a7..bed737741abba 100644
--- a/polly/test/CodeGen/20100622.ll
+++ b/polly/test/CodeGen/20100622.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -disable-output < %s
-; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s | not FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -disable-output < %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s | not FileCheck %s
 
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32"
diff --git a/polly/test/CodeGen/20100707.ll b/polly/test/CodeGen/20100707.ll
index 6a4763dcb3b76..ee0422e07c4ea 100644
--- a/polly/test/CodeGen/20100707.ll
+++ b/polly/test/CodeGen/20100707.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' < %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen < %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
 
 define void @clause_SetSplitField(i32 %Length) nounwind inlinehint {
diff --git a/polly/test/CodeGen/20100707_2.ll b/polly/test/CodeGen/20100707_2.ll
index 648a06479ae27..a4cd76af9dd3c 100644
--- a/polly/test/CodeGen/20100707_2.ll
+++ b/polly/test/CodeGen/20100707_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' < %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen < %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
 
 @win193 = external global [4 x [36 x double]], align 32 ; <ptr> [#uses=3]
diff --git a/polly/test/CodeGen/20100708.ll b/polly/test/CodeGen/20100708.ll
index 52153d7cfa730..9080451aeae50 100644
--- a/polly/test/CodeGen/20100708.ll
+++ b/polly/test/CodeGen/20100708.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect < %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' < %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
 
 define fastcc void @execute() nounwind {
diff --git a/polly/test/CodeGen/20100708_2.ll b/polly/test/CodeGen/20100708_2.ll
index 075a4947c8e72..51dc9d311f070 100644
--- a/polly/test/CodeGen/20100708_2.ll
+++ b/polly/test/CodeGen/20100708_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' < %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen < %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
 
 define void @init_array() nounwind {
diff --git a/polly/test/CodeGen/20100713.ll b/polly/test/CodeGen/20100713.ll
index 0b0ed7327c8b1..a836795c9907f 100644
--- a/polly/test/CodeGen/20100713.ll
+++ b/polly/test/CodeGen/20100713.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' < %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen < %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
 
 define void @fft_float(i32 %NumSamples) nounwind {
diff --git a/polly/test/CodeGen/20100713_2.ll b/polly/test/CodeGen/20100713_2.ll
index 5681f34152342..28b984bd5900f 100644
--- a/polly/test/CodeGen/20100713_2.ll
+++ b/polly/test/CodeGen/20100713_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' < %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen < %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
 
 define hidden void @luaD_callhook() nounwind {
diff --git a/polly/test/CodeGen/20100717.ll b/polly/test/CodeGen/20100717.ll
index 97ed151410dfb..51c453cfe438e 100644
--- a/polly/test/CodeGen/20100717.ll
+++ b/polly/test/CodeGen/20100717.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -disable-output < %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -disable-output < %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
 
 define void @matrixTranspose(ptr %A) nounwind {
diff --git a/polly/test/CodeGen/20100718-DomInfo-2.ll b/polly/test/CodeGen/20100718-DomInfo-2.ll
index cbee80e44949c..fdac75f1b999f 100644
--- a/polly/test/CodeGen/20100718-DomInfo-2.ll
+++ b/polly/test/CodeGen/20100718-DomInfo-2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -verify-dom-info -disable-output < %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -verify-dom-info -disable-output < %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
 
 define void @getNonAffNeighbour() nounwind {
diff --git a/polly/test/CodeGen/20100718-DomInfo.ll b/polly/test/CodeGen/20100718-DomInfo.ll
index e6fcaf6a9272f..da68eb0dd8fa7 100644
--- a/polly/test/CodeGen/20100718-DomInfo.ll
+++ b/polly/test/CodeGen/20100718-DomInfo.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -verify-dom-info -disable-output < %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -verify-dom-info -disable-output < %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
 
 define void @intrapred_luma_16x16(i32 %predmode) nounwind {
diff --git a/polly/test/CodeGen/20100720-MultipleConditions.ll b/polly/test/CodeGen/20100720-MultipleConditions.ll
index 66c9e2bb0eb5b..3dece4efdcd06 100644
--- a/polly/test/CodeGen/20100720-MultipleConditions.ll
+++ b/polly/test/CodeGen/20100720-MultipleConditions.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s
 
 ;int bar1();
 ;int bar2();
diff --git a/polly/test/CodeGen/20100809-IndependentBlock.ll b/polly/test/CodeGen/20100809-IndependentBlock.ll
index cc3a5087090b4..f45b6544464de 100644
--- a/polly/test/CodeGen/20100809-IndependentBlock.ll
+++ b/polly/test/CodeGen/20100809-IndependentBlock.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -disable-output < %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -disable-output < %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
 define void @cfft2(ptr %x) nounwind {
 entry:
diff --git a/polly/test/CodeGen/20100811-ScalarDependencyBetweenBrAndCnd.ll b/polly/test/CodeGen/20100811-ScalarDependencyBetweenBrAndCnd.ll
index 240c2a49bc46d..82da9d2486423 100644
--- a/polly/test/CodeGen/20100811-ScalarDependencyBetweenBrAndCnd.ll
+++ b/polly/test/CodeGen/20100811-ScalarDependencyBetweenBrAndCnd.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -disable-output < %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -disable-output < %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
 
diff --git a/polly/test/CodeGen/20101030-Overflow.ll b/polly/test/CodeGen/20101030-Overflow.ll
index c199f757ebac5..fecdb9d4fed1e 100644
--- a/polly/test/CodeGen/20101030-Overflow.ll
+++ b/polly/test/CodeGen/20101030-Overflow.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' < %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen < %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
 
 define void @compdecomp() nounwind {
diff --git a/polly/test/CodeGen/20101103-Overflow3.ll b/polly/test/CodeGen/20101103-Overflow3.ll
index e8b425f009723..f1503e25fcc4c 100644
--- a/polly/test/CodeGen/20101103-Overflow3.ll
+++ b/polly/test/CodeGen/20101103-Overflow3.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' < %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen < %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
 define void @Reflection_coefficients(ptr %r) nounwind {
 bb20:
diff --git a/polly/test/CodeGen/20101103-signmissmatch.ll b/polly/test/CodeGen/20101103-signmissmatch.ll
index 0295ee0567208..3d0c929446f45 100644
--- a/polly/test/CodeGen/20101103-signmissmatch.ll
+++ b/polly/test/CodeGen/20101103-signmissmatch.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' < %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen < %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
 
 define void @CleanNet() nounwind {
diff --git a/polly/test/CodeGen/20110226-Ignore-Dead-Code.ll b/polly/test/CodeGen/20110226-Ignore-Dead-Code.ll
index 6913deed23054..0e62e678f0ae2 100644
--- a/polly/test/CodeGen/20110226-Ignore-Dead-Code.ll
+++ b/polly/test/CodeGen/20110226-Ignore-Dead-Code.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' < %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen < %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
 
 define void @main() nounwind {
diff --git a/polly/test/CodeGen/20110226-PHI-Node-removed.ll b/polly/test/CodeGen/20110226-PHI-Node-removed.ll
index a39fced9dbaba..32b018f24e547 100644
--- a/polly/test/CodeGen/20110226-PHI-Node-removed.ll
+++ b/polly/test/CodeGen/20110226-PHI-Node-removed.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' < %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen < %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
 
diff --git a/polly/test/CodeGen/20120316-InvalidCast.ll b/polly/test/CodeGen/20120316-InvalidCast.ll
index a7f709b4a7615..b87a3dc60deaa 100644
--- a/polly/test/CodeGen/20120316-InvalidCast.ll
+++ b/polly/test/CodeGen/20120316-InvalidCast.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -S '-passes=polly<no-default-opts>' < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -S -passes=polly-codegen < %s | FileCheck %s
 
 ; CHECK: polly.start
 
diff --git a/polly/test/CodeGen/20120403-RHS-type-mismatch.ll b/polly/test/CodeGen/20120403-RHS-type-mismatch.ll
index 554384c0e777e..dac78bf04a250 100644
--- a/polly/test/CodeGen/20120403-RHS-type-mismatch.ll
+++ b/polly/test/CodeGen/20120403-RHS-type-mismatch.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' < %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen < %s
 
 ; We just check that this compilation does not crash.
diff --git a/polly/test/CodeGen/20130221.ll b/polly/test/CodeGen/20130221.ll
index 101930e175634..5728a768a3b3b 100644
--- a/polly/test/CodeGen/20130221.ll
+++ b/polly/test/CodeGen/20130221.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 
 define void @list_sequence(ptr %A) {
diff --git a/polly/test/CodeGen/20150328-SCEVExpanderIntroducesNewIV.ll b/polly/test/CodeGen/20150328-SCEVExpanderIntroducesNewIV.ll
index 7ad8cbf963f45..cafd68e508255 100644
--- a/polly/test/CodeGen/20150328-SCEVExpanderIntroducesNewIV.ll
+++ b/polly/test/CodeGen/20150328-SCEVExpanderIntroducesNewIV.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
diff --git a/polly/test/CodeGen/Intrinsics/llvm-expect.ll b/polly/test/CodeGen/Intrinsics/llvm-expect.ll
index ba4ea1565e481..47fd4f07e4678 100644
--- a/polly/test/CodeGen/Intrinsics/llvm-expect.ll
+++ b/polly/test/CodeGen/Intrinsics/llvm-expect.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -S '-passes=polly<no-default-opts>' < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -S -passes=polly-codegen < %s | FileCheck %s
 ;
 ; Check that we generate code without crashing.
 ;
diff --git a/polly/test/CodeGen/LoopParallelMD/do_not_mutate_debug_info.ll b/polly/test/CodeGen/LoopParallelMD/do_not_mutate_debug_info.ll
index a92917f30b724..eb7de01ba862c 100644
--- a/polly/test/CodeGen/LoopParallelMD/do_not_mutate_debug_info.ll
+++ b/polly/test/CodeGen/LoopParallelMD/do_not_mutate_debug_info.ll
@@ -1,6 +1,6 @@
 ; This test checks that we do not accidentally mutate the debug info when
 ; inserting loop parallel metadata.
-; RUN: opt %loadNPMPolly -S -polly '-passes=polly<no-default-opts>' -polly-ast-detect-parallel < %s | FileCheck %s
+; RUN: opt %loadNPMPolly < %s -S -polly -passes=polly-codegen -polly-ast-detect-parallel | FileCheck %s
 
 ; CHECK-NOT: !7 = !{!7}
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/polly/test/CodeGen/LoopParallelMD/loop_nest_param_parallel.ll b/polly/test/CodeGen/LoopParallelMD/loop_nest_param_parallel.ll
index 0d947004aea50..9bb086fa79aed 100644
--- a/polly/test/CodeGen/LoopParallelMD/loop_nest_param_parallel.ll
+++ b/polly/test/CodeGen/LoopParallelMD/loop_nest_param_parallel.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-ast-detect-parallel -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-ast-detect-parallel -S < %s | FileCheck %s
 ;
 ; Check that we mark multiple parallel loops correctly including the memory instructions.
 ;
diff --git a/polly/test/CodeGen/LoopParallelMD/single_loop_param_parallel.ll b/polly/test/CodeGen/LoopParallelMD/single_loop_param_parallel.ll
index 1293cd91da78d..442600cff7a0a 100644
--- a/polly/test/CodeGen/LoopParallelMD/single_loop_param_parallel.ll
+++ b/polly/test/CodeGen/LoopParallelMD/single_loop_param_parallel.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s -check-prefix=SEQUENTIAL
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-ast-detect-parallel -S < %s | FileCheck %s -check-prefix=PARALLEL
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s -check-prefix=SEQUENTIAL
+; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-ast-detect-parallel -S < %s | FileCheck %s -check-prefix=PARALLEL
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
 
 ; This is a trivially parallel loop. We just use it to ensure that we actually
diff --git a/polly/test/CodeGen/MemAccess/bad_alignment.ll b/polly/test/CodeGen/MemAccess/bad_alignment.ll
index be1c64938422c..82fff27dd0eb7 100644
--- a/polly/test/CodeGen/MemAccess/bad_alignment.ll
+++ b/polly/test/CodeGen/MemAccess/bad_alignment.ll
@@ -1,4 +1,4 @@
-; RUN: not --crash opt %loadNPMPolly '-passes=polly-custom<import-jscop>' -disable-output 2>&1 < %s | FileCheck %s
+; RUN: not --crash opt %loadNPMPolly -passes=polly-import-jscop -disable-output 2>&1 < %s | FileCheck %s
 ;
 ; Check that we do not allow to access elements not accessed before because the
 ; alignment information would become invalid.
diff --git a/polly/test/CodeGen/MemAccess/codegen_address_space.ll b/polly/test/CodeGen/MemAccess/codegen_address_space.ll
index 283c8fbd2c249..3360e10529f8e 100644
--- a/polly/test/CodeGen/MemAccess/codegen_address_space.ll
+++ b/polly/test/CodeGen/MemAccess/codegen_address_space.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;codegen>' -polly-import-jscop-postfix=transformed -S < %s | FileCheck %s
+;RUN: opt %loadNPMPolly '-passes=polly-import-jscop,polly-codegen' -polly-import-jscop-postfix=transformed < %s -S | FileCheck %s
 
 ;int A[100];
 ;
diff --git a/polly/test/CodeGen/MemAccess/codegen_constant_offset.ll b/polly/test/CodeGen/MemAccess/codegen_constant_offset.ll
index ce44f2daceaa9..0563ca87eef51 100644
--- a/polly/test/CodeGen/MemAccess/codegen_constant_offset.ll
+++ b/polly/test/CodeGen/MemAccess/codegen_constant_offset.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;codegen>' -polly-import-jscop-postfix=transformed -S < %s | FileCheck %s
+;RUN: opt %loadNPMPolly '-passes=polly-import-jscop,polly-codegen' -polly-import-jscop-postfix=transformed < %s -S | FileCheck %s
 
 ;int A[100];
 ;
diff --git a/polly/test/CodeGen/MemAccess/codegen_simple.ll b/polly/test/CodeGen/MemAccess/codegen_simple.ll
index ab1dca516a9cf..ee0187fe97d25 100644
--- a/polly/test/CodeGen/MemAccess/codegen_simple.ll
+++ b/polly/test/CodeGen/MemAccess/codegen_simple.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;codegen>' -polly-import-jscop-postfix=transformed -S < %s | FileCheck %s
+;RUN: opt %loadNPMPolly '-passes=polly-import-jscop,polly-codegen' -polly-import-jscop-postfix=transformed < %s -S | FileCheck %s
 
 ;int A[100];
 ;
diff --git a/polly/test/CodeGen/MemAccess/codegen_simple_float.ll b/polly/test/CodeGen/MemAccess/codegen_simple_float.ll
index 72f9c2ce61e3c..6970565bf023e 100644
--- a/polly/test/CodeGen/MemAccess/codegen_simple_float.ll
+++ b/polly/test/CodeGen/MemAccess/codegen_simple_float.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;codegen>' -polly-import-jscop-postfix=transformed -S < %s | FileCheck %s
+;RUN: opt %loadNPMPolly '-passes=polly-import-jscop,polly-codegen' -polly-import-jscop-postfix=transformed < %s -S | FileCheck %s
 ;
 ;float A[100];
 ;
diff --git a/polly/test/CodeGen/MemAccess/codegen_simple_md.ll b/polly/test/CodeGen/MemAccess/codegen_simple_md.ll
index a6d9969286fc7..f0896e2bf6093 100644
--- a/polly/test/CodeGen/MemAccess/codegen_simple_md.ll
+++ b/polly/test/CodeGen/MemAccess/codegen_simple_md.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;codegen>' -polly-import-jscop-postfix=transformed+withconst -S < %s | FileCheck -check-prefix=WITHCONST %s
-; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;codegen>' -polly-import-jscop-postfix=transformed+withoutconst -S < %s | FileCheck -check-prefix=WITHOUTCONST %s
+;RUN: opt %loadNPMPolly '-passes=polly-import-jscop,polly-codegen' -polly-import-jscop-postfix=transformed+withconst < %s -S | FileCheck -check-prefix=WITHCONST %s
+;RUN: opt %loadNPMPolly '-passes=polly-import-jscop,polly-codegen' -polly-import-jscop-postfix=transformed+withoutconst < %s -S | FileCheck -check-prefix=WITHOUTCONST %s
 
 ;int A[1040];
 ;
diff --git a/polly/test/CodeGen/MemAccess/codegen_simple_md_float.ll b/polly/test/CodeGen/MemAccess/codegen_simple_md_float.ll
index 568b0ff4ae20a..99fc36996f083 100644
--- a/polly/test/CodeGen/MemAccess/codegen_simple_md_float.ll
+++ b/polly/test/CodeGen/MemAccess/codegen_simple_md_float.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;codegen>' -polly-import-jscop-postfix=transformed+withconst -S < %s | FileCheck -check-prefix=WITHCONST %s
-; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;codegen>' -polly-import-jscop-postfix=transformed+withoutconst -S < %s | FileCheck -check-prefix=WITHOUTCONST %s
+;RUN: opt %loadNPMPolly '-passes=polly-import-jscop,polly-codegen' -polly-import-jscop-postfix=transformed+withconst < %s -S | FileCheck -check-prefix=WITHCONST %s
+;RUN: opt %loadNPMPolly '-passes=polly-import-jscop,polly-codegen' -polly-import-jscop-postfix=transformed+withoutconst < %s -S | FileCheck -check-prefix=WITHOUTCONST %s
 ;
 ;float A[1040];
 ;
diff --git a/polly/test/CodeGen/MemAccess/create_arrays.ll b/polly/test/CodeGen/MemAccess/create_arrays.ll
index 8443e0f7be327..40ae8d6efa95f 100644
--- a/polly/test/CodeGen/MemAccess/create_arrays.ll
+++ b/polly/test/CodeGen/MemAccess/create_arrays.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly -polly-print-scops '-passes=polly-custom<import-jscop>' -polly-print-import-jscop -polly-import-jscop-postfix=transformed -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;codegen>' -polly-import-jscop-postfix=transformed -S < %s 2>&1 | FileCheck %s --check-prefix=CODEGEN
+; RUN: opt %loadPolly -polly-print-scops -polly-print-import-jscop -polly-import-jscop-postfix=transformed -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadPolly -polly-import-jscop -polly-import-jscop-postfix=transformed -polly-codegen -S < %s 2>&1 | FileCheck %s --check-prefix=CODEGEN
 ;
 ; for (i = 0; i < _PB_NI; i++)
 ;   for (j = 0; j < _PB_NJ; j++)
diff --git a/polly/test/CodeGen/MemAccess/create_arrays_heap.ll b/polly/test/CodeGen/MemAccess/create_arrays_heap.ll
index 9c95378a76433..1202d21998c94 100644
--- a/polly/test/CodeGen/MemAccess/create_arrays_heap.ll
+++ b/polly/test/CodeGen/MemAccess/create_arrays_heap.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-print-scops '-passes=polly-custom<import-jscop>' -polly-print-import-jscop -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck %s
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<import-jscop;codegen>' -polly-import-jscop-postfix=transformed -S < %s | FileCheck %s --check-prefix=CODEGEN
+; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-print-scops -polly-print-import-jscop -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck %s
+; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-import-jscop -polly-import-jscop-postfix=transformed -polly-codegen -S < %s | FileCheck %s --check-prefix=CODEGEN
 ;
 ; #define Ni 1056
 ; #define Nj 1056
diff --git a/polly/test/CodeGen/MemAccess/default_aligned_new_access_function.ll b/polly/test/CodeGen/MemAccess/default_aligned_new_access_function.ll
index f08fabd67ef5c..7d8083cc55846 100644
--- a/polly/test/CodeGen/MemAccess/default_aligned_new_access_function.ll
+++ b/polly/test/CodeGen/MemAccess/default_aligned_new_access_function.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop>' -polly-print-import-jscop -disable-output < %s | FileCheck %s
+; RUN: opt %loadPolly -basic-aa -polly-print-import-jscop -disable-output < %s | FileCheck %s
 ;
 ; Check that we allow the new access functions even though they access
 ; different locations than the original ones (but the alignment is the
diff --git a/polly/test/CodeGen/MemAccess/different_types.ll b/polly/test/CodeGen/MemAccess/different_types.ll
index ae6168d235a96..407e72702aa86 100644
--- a/polly/test/CodeGen/MemAccess/different_types.ll
+++ b/polly/test/CodeGen/MemAccess/different_types.ll
@@ -1,4 +1,6 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;codegen>' -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,polly-codegen' \
+; RUN: \
+; RUN:     -S < %s | FileCheck %s
 ;
 ; void foo(float A[], float B[]) {
 ;   for (long i = 0; i < 100; i++)
diff --git a/polly/test/CodeGen/MemAccess/generate-all.ll b/polly/test/CodeGen/MemAccess/generate-all.ll
index 099a3e0670960..7b2286bfc95a9 100644
--- a/polly/test/CodeGen/MemAccess/generate-all.ll
+++ b/polly/test/CodeGen/MemAccess/generate-all.ll
@@ -1,5 +1,7 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-codegen-generate-expressions=false -S < %s | FileCheck %s -check-prefix=SCEV
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-codegen-generate-expressions=true -S < %s | FileCheck %s -check-prefix=ASTEXPR
+; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-codegen-generate-expressions=false \
+; RUN:     -S < %s | FileCheck %s -check-prefix=SCEV
+; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-codegen-generate-expressions=true \
+; RUN:     -S < %s | FileCheck %s -check-prefix=ASTEXPR
 ;
 ; void foo(float A[]) {
 ;   for (long i = 0; i < 100; i++)
diff --git a/polly/test/CodeGen/MemAccess/invariant_base_ptr.ll b/polly/test/CodeGen/MemAccess/invariant_base_ptr.ll
index d8d0df7009685..5c926ac638413 100644
--- a/polly/test/CodeGen/MemAccess/invariant_base_ptr.ll
+++ b/polly/test/CodeGen/MemAccess/invariant_base_ptr.ll
@@ -1,4 +1,6 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;codegen>' -polly-invariant-load-hoisting -S 2>&1 < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,polly-codegen' \
+; RUN:     -polly-invariant-load-hoisting -S \
+; RUN:     2>&1 < %s | FileCheck %s
 
 ; Setting new access functions where the base pointer of the array that is newly
 ; accessed is only loaded within the scop itself caused incorrect code to be
diff --git a/polly/test/CodeGen/MemAccess/map_scalar_access.ll b/polly/test/CodeGen/MemAccess/map_scalar_access.ll
index 4ea21b26ce531..7c845d4a004f4 100644
--- a/polly/test/CodeGen/MemAccess/map_scalar_access.ll
+++ b/polly/test/CodeGen/MemAccess/map_scalar_access.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-import-jscop-postfix=transformed '-passes=polly-custom<import-jscop>' -polly-print-import-jscop -disable-output < %s | FileCheck %s
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-import-jscop-postfix=transformed '-passes=polly-custom<import-jscop;codegen>' -S < %s | FileCheck %s --check-prefix=CODEGEN
+; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-import-jscop-postfix=transformed -polly-print-import-jscop -disable-output < %s | FileCheck %s
+; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-import-jscop-postfix=transformed -polly-import-jscop -polly-codegen -S < %s | FileCheck %s --check-prefix=CODEGEN
 
 define void @map_scalar_access(ptr noalias nonnull %A) {
 entry:
diff --git a/polly/test/CodeGen/MemAccess/multiple_types.ll b/polly/test/CodeGen/MemAccess/multiple_types.ll
index edc3888be364b..7848977ce0310 100644
--- a/polly/test/CodeGen/MemAccess/multiple_types.ll
+++ b/polly/test/CodeGen/MemAccess/multiple_types.ll
@@ -1,4 +1,6 @@
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<import-jscop;codegen>' -polly-allow-differing-element-types -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-import-jscop,polly-codegen' \
+; RUN:     -polly-allow-differing-element-types \
+; RUN:     -S < %s | FileCheck %s
 ;
 ; // Check that accessing one array with different types works.
 ; void multiple_types(char *Short, char *Float, char *Double) {
diff --git a/polly/test/CodeGen/MemAccess/simple.ll b/polly/test/CodeGen/MemAccess/simple.ll
index 63d66f1c925f7..5077e1a1b5a2c 100644
--- a/polly/test/CodeGen/MemAccess/simple.ll
+++ b/polly/test/CodeGen/MemAccess/simple.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop>' -polly-import-jscop-postfix=transformed -stats < %s 2>&1 | FileCheck %s
+;RUN: opt %loadNPMPolly -passes=polly-import-jscop -polly-import-jscop-postfix=transformed -stats < %s 2>&1 | FileCheck %s
 ; REQUIRES: asserts
 
 ;int A[100];
diff --git a/polly/test/CodeGen/MemAccess/simple_analyze.ll b/polly/test/CodeGen/MemAccess/simple_analyze.ll
index f07cb1629ca18..143651b565aff 100644
--- a/polly/test/CodeGen/MemAccess/simple_analyze.ll
+++ b/polly/test/CodeGen/MemAccess/simple_analyze.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop>' -polly-print-import-jscop -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck %s
+;RUN: opt %loadPolly -polly-print-import-jscop -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck %s
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32-n8:16:32"
 
 @A = common global [100 x i32] zeroinitializer, align 4
diff --git a/polly/test/CodeGen/MemAccess/update_access_functions.ll b/polly/test/CodeGen/MemAccess/update_access_functions.ll
index 93f5f186ad6a5..51fa97adb3c37 100644
--- a/polly/test/CodeGen/MemAccess/update_access_functions.ll
+++ b/polly/test/CodeGen/MemAccess/update_access_functions.ll
@@ -1,4 +1,6 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;codegen>' -polly-import-jscop-postfix=transformed -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,polly-codegen' \
+; RUN:     -polly-import-jscop-postfix=transformed \
+; RUN:     < %s -S | FileCheck %s
 
 ; CHECK-LABEL: polly.stmt.loop1:
 ; CHECK-NEXT:    %3 = mul nsw i64 5, %polly.indvar{{[0-9]*}}
diff --git a/polly/test/CodeGen/Metadata/basic_vec_annotate.ll b/polly/test/CodeGen/Metadata/basic_vec_annotate.ll
index 344a6d0990837..ebe91636ea3cc 100644
--- a/polly/test/CodeGen/Metadata/basic_vec_annotate.ll
+++ b/polly/test/CodeGen/Metadata/basic_vec_annotate.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -S '-passes=polly<no-default-opts>' -polly-annotate-metadata-vectorize < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -S -passes=polly-codegen -polly-annotate-metadata-vectorize < %s | FileCheck %s
 
 ; Basic verification of vectorize metadata getting added when "-polly-vectorize-metadata" is
 ; passed.
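The MemAccess hunks compose the JSCoP importer and code generation into a single pipeline, '-passes=polly-import-jscop,polly-codegen'. A condensed sketch of that pattern (hypothetical RUN lines, not part of this patch; the importer reads a JSON description of the transformed SCoP from a file next to the test, and -polly-import-jscop-postfix selects which variant — the exact file-naming scheme below is an assumption, not something these hunks show):

; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,polly-codegen' \
; RUN:     -polly-import-jscop-postfix=transformed -S < %s | FileCheck %s
; Assumed: the importer locates a sibling file along the lines of
;   <function>___<region>.jscop.transformed
; describing the new access functions; polly-codegen then emits code for them.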
diff --git a/polly/test/CodeGen/OpenMP/alias-metadata.ll b/polly/test/CodeGen/OpenMP/alias-metadata.ll
index 541fbdda5a6b9..121f630789892 100644
--- a/polly/test/CodeGen/OpenMP/alias-metadata.ll
+++ b/polly/test/CodeGen/OpenMP/alias-metadata.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-parallel -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-parallel -S < %s | FileCheck %s
 ;
 ; void foo(float *A, float *B) {
 ;   for (long i = 0; i < 1000; i++)
diff --git a/polly/test/CodeGen/OpenMP/floord-as-argument-to-subfunction.ll b/polly/test/CodeGen/OpenMP/floord-as-argument-to-subfunction.ll
index 657921690c74d..7177ae01f0754 100644
--- a/polly/test/CodeGen/OpenMP/floord-as-argument-to-subfunction.ll
+++ b/polly/test/CodeGen/OpenMP/floord-as-argument-to-subfunction.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -passes=polly-opt-isl -polly-opt-max-coefficient=-1 -polly-parallel '-passes=polly<no-default-opts>' -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-opt-isl -polly-opt-max-coefficient=-1 -polly-parallel -passes=polly-codegen -S < %s | FileCheck %s
 ;
 ; Check that we do not crash but generate parallel code
 ;
diff --git a/polly/test/CodeGen/OpenMP/inlineasm.ll b/polly/test/CodeGen/OpenMP/inlineasm.ll
index ac6c7070c1abf..82a73780886e3 100644
--- a/polly/test/CodeGen/OpenMP/inlineasm.ll
+++ b/polly/test/CodeGen/OpenMP/inlineasm.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts;opt-isl>' -polly-parallel -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-opt-isl,polly-codegen' -polly-parallel -S < %s | FileCheck %s
 ; llvm.org/PR51960
 
 ; CHECK-LABEL: define internal void @foo_polly_subfn
diff --git a/polly/test/CodeGen/OpenMP/invariant_base_pointer_preloaded.ll b/polly/test/CodeGen/OpenMP/invariant_base_pointer_preloaded.ll
index 08c0cc7fe37f2..aba3ae78f7783 100644
--- a/polly/test/CodeGen/OpenMP/invariant_base_pointer_preloaded.ll
+++ b/polly/test/CodeGen/OpenMP/invariant_base_pointer_preloaded.ll
@@ -1,4 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=true -polly-parallel -polly-parallel-force -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true -polly-parallel \
+; RUN:     -polly-parallel-force -S < %s | FileCheck %s
 ;
 ; Test to verify that we hand down the preloaded A[0] to the OpenMP subfunction.
 ;
diff --git a/polly/test/CodeGen/OpenMP/invariant_base_pointer_preloaded_different_bb.ll b/polly/test/CodeGen/OpenMP/invariant_base_pointer_preloaded_different_bb.ll
index 8246aaa25b7b2..8cf6148a7b44c 100644
--- a/polly/test/CodeGen/OpenMP/invariant_base_pointer_preloaded_different_bb.ll
+++ b/polly/test/CodeGen/OpenMP/invariant_base_pointer_preloaded_different_bb.ll
@@ -1,4 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=true -polly-parallel -polly-parallel-force -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true -polly-parallel \
+; RUN:     -polly-parallel-force -S < %s | FileCheck %s
 ;
 ; Test to verify that we hand down the preloaded A[0] to the OpenMP subfunction.
 ;
diff --git a/polly/test/CodeGen/OpenMP/invariant_base_pointer_preloaded_pass_only_needed.ll b/polly/test/CodeGen/OpenMP/invariant_base_pointer_preloaded_pass_only_needed.ll
index 0c5208c77768b..823e5cab55ab3 100644
--- a/polly/test/CodeGen/OpenMP/invariant_base_pointer_preloaded_pass_only_needed.ll
+++ b/polly/test/CodeGen/OpenMP/invariant_base_pointer_preloaded_pass_only_needed.ll
@@ -1,4 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=true -polly-parallel -polly-parallel-force -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true -polly-parallel \
+; RUN:     -polly-parallel-force -S < %s | FileCheck %s
 ;
 ; Test to verify that we hand down the preloaded A[0] to the OpenMP subfunction but
 ; not B[0] as it is not needed
diff --git a/polly/test/CodeGen/OpenMP/invariant_base_pointers_preloaded.ll b/polly/test/CodeGen/OpenMP/invariant_base_pointers_preloaded.ll
index fd039e75444b5..5557839e715ed 100644
--- a/polly/test/CodeGen/OpenMP/invariant_base_pointers_preloaded.ll
+++ b/polly/test/CodeGen/OpenMP/invariant_base_pointers_preloaded.ll
@@ -1,4 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=true -polly-parallel -polly-parallel-force -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true -polly-parallel \
+; RUN:     -polly-parallel-force -S < %s | FileCheck %s
 ;
 ; Test to verify that we hand down the preloaded A[0] to the OpenMP subfunction.
 ;
diff --git a/polly/test/CodeGen/OpenMP/loop-body-references-outer-iv.ll b/polly/test/CodeGen/OpenMP/loop-body-references-outer-iv.ll
index fe8b8a3a022bc..a987fac31b743 100644
--- a/polly/test/CodeGen/OpenMP/loop-body-references-outer-iv.ll
+++ b/polly/test/CodeGen/OpenMP/loop-body-references-outer-iv.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s -check-prefix=AST
-; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force '-passes=polly<no-default-opts>' -S -verify-dom-info < %s | FileCheck %s -check-prefix=IR
+; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s -check-prefix=AST
+; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force -passes=polly-codegen -S -verify-dom-info < %s | FileCheck %s -check-prefix=IR
 
 ; This code has failed the scev based code generation as the scev in the scop
 ; contains an AddRecExpr of an outer loop. When generating code, we did not
diff --git a/polly/test/CodeGen/OpenMP/loop-body-references-outer-values-2.ll b/polly/test/CodeGen/OpenMP/loop-body-references-outer-values-2.ll
index d1f48d92e0e75..96c6d900a7a00 100644
--- a/polly/test/CodeGen/OpenMP/loop-body-references-outer-values-2.ll
+++ b/polly/test/CodeGen/OpenMP/loop-body-references-outer-values-2.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s -check-prefix=AST
-; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force '-passes=polly<no-default-opts>' -S -verify-dom-info < %s | FileCheck %s -check-prefix=IR
+; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s -check-prefix=AST
+; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force -passes=polly-codegen -S -verify-dom-info < %s | FileCheck %s -check-prefix=IR
 
 ; AST: #pragma simd
 ; AST: #pragma omp parallel for
diff --git a/polly/test/CodeGen/OpenMP/loop-body-references-outer-values-3.ll b/polly/test/CodeGen/OpenMP/loop-body-references-outer-values-3.ll
index 5b032801c7282..c4ad665c7b6cf 100644
--- a/polly/test/CodeGen/OpenMP/loop-body-references-outer-values-3.ll
+++ b/polly/test/CodeGen/OpenMP/loop-body-references-outer-values-3.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-parallel -polly-parallel-force -polly-invariant-load-hoisting=true '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s -check-prefix=AST
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-parallel -polly-parallel-force -polly-invariant-load-hoisting=true '-passes=polly<no-default-opts>' -S -verify-dom-info < %s | FileCheck %s -check-prefix=IR
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-parallel -polly-parallel-force -polly-invariant-load-hoisting=true '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s -check-prefix=AST
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-parallel -polly-parallel-force -polly-invariant-load-hoisting=true -passes=polly-codegen -S -verify-dom-info < %s | FileCheck %s -check-prefix=IR
 
 ; The interesting part of this test case is the instruction:
 ;   %tmp = bitcast i8* %call to i64**
diff --git a/polly/test/CodeGen/OpenMP/loop-body-references-outer-values.ll b/polly/test/CodeGen/OpenMP/loop-body-references-outer-values.ll
index d612faf7b67c5..82acba8b3c523 100644
--- a/polly/test/CodeGen/OpenMP/loop-body-references-outer-values.ll
+++ b/polly/test/CodeGen/OpenMP/loop-body-references-outer-values.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s -check-prefix=AST
-; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force '-passes=polly<no-default-opts>' -S < %s | FileCheck %s -check-prefix=IR
+; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s -check-prefix=AST
+; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force -passes=polly-codegen -S < %s | FileCheck %s -check-prefix=IR
 
 ; Make sure we correctly forward the reference to 'A' to the OpenMP subfunction.
 ;
diff --git a/polly/test/CodeGen/OpenMP/loop-bounds-reference-outer-ids.ll b/polly/test/CodeGen/OpenMP/loop-bounds-reference-outer-ids.ll
index 213cc2635fb6d..aa44658131bba 100644
--- a/polly/test/CodeGen/OpenMP/loop-bounds-reference-outer-ids.ll
+++ b/polly/test/CodeGen/OpenMP/loop-bounds-reference-outer-ids.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly -polly-parallel '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s -check-prefix=AST
-; RUN: opt %loadNPMPolly -polly-parallel '-passes=polly<no-default-opts>' -S < %s | FileCheck %s -check-prefix=IR
+; RUN: opt %loadNPMPolly -polly-parallel '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s -check-prefix=AST
+; RUN: opt %loadNPMPolly -polly-parallel -passes=polly-codegen -S < %s | FileCheck %s -check-prefix=IR
 ;
 ; float A[100];
 ;
diff --git a/polly/test/CodeGen/OpenMP/mapped-phi-access.ll b/polly/test/CodeGen/OpenMP/mapped-phi-access.ll
index fef23f141eaeb..4deab1af0ccf0 100644
--- a/polly/test/CodeGen/OpenMP/mapped-phi-access.ll
+++ b/polly/test/CodeGen/OpenMP/mapped-phi-access.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-parallel '-passes=polly<no-default-opts;delicm>' -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-parallel '-passes=polly-delicm,polly-codegen' -S < %s | FileCheck %s
 ;
 ; Verify that -polly-parallel can handle mapped scalar MemoryAccesses.
 ;
diff --git a/polly/test/CodeGen/OpenMP/matmul-parallel.ll b/polly/test/CodeGen/OpenMP/matmul-parallel.ll
index fd8ce87b45ae8..43326b29f7ef1 100644
--- a/polly/test/CodeGen/OpenMP/matmul-parallel.ll
+++ b/polly/test/CodeGen/OpenMP/matmul-parallel.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly -polly-parallel '-passes=polly-custom<opt-isl;ast>' -polly-print-ast -disable-output -debug-only=polly-ast < %s 2>&1 | FileCheck --check-prefix=AST %s
-; RUN: opt %loadNPMPolly -polly-parallel '-passes=polly<no-default-opts;opt-isl>' -S < %s | FileCheck --check-prefix=CODEGEN %s
+; RUN: opt %loadNPMPolly -polly-parallel '-passes=polly-opt-isl,print<polly-ast>' -disable-output -debug-only=polly-ast < %s 2>&1 | FileCheck --check-prefix=AST %s
+; RUN: opt %loadNPMPolly -polly-parallel '-passes=polly-opt-isl,polly-codegen' -S < %s | FileCheck --check-prefix=CODEGEN %s
 ; REQUIRES: asserts
 
 ; Parallelization of detected matrix-multiplication.
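A recurring pattern in the OpenMP hunks: AST-level checks move to '-passes=print<polly-ast>', while IR-level checks drive -passes=polly-codegen with -polly-parallel. The parallel loop body is outlined into a subfunction named after the enclosing function, which is what the @foo_polly_subfn and @DoStringSort_polly_subfn checks above match. A sketch of that shape (hypothetical test, assuming SCoP detection and parallelization succeed on the loop):

; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force \
; RUN:     -passes=polly-codegen -S < %s | FileCheck %s
; CHECK: define internal void @par_polly_subfn
define void @par(ptr %A) {
entry:
  br label %loop
loop:
  %i = phi i64 [ 0, %entry ], [ %i.next, %loop ]
  %gep = getelementptr i32, ptr %A, i64 %i
  store i32 0, ptr %gep
  %i.next = add nuw nsw i64 %i, 1
  %cmp = icmp slt i64 %i.next, 1024
  br i1 %cmp, label %loop, label %exit
exit:
  ret void
}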
diff --git a/polly/test/CodeGen/OpenMP/new_multidim_access.ll b/polly/test/CodeGen/OpenMP/new_multidim_access.ll
index 8018acdcb0e6a..5faabb4d20c1a 100644
--- a/polly/test/CodeGen/OpenMP/new_multidim_access.ll
+++ b/polly/test/CodeGen/OpenMP/new_multidim_access.ll
@@ -1,6 +1,10 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop>' -polly-print-import-jscop -disable-output < %s | FileCheck %s
+; RUN: opt %loadPolly -polly-print-import-jscop \
+; RUN:     -disable-output < %s | FileCheck %s
 
-; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;codegen>' -S -polly-parallel < %s | FileCheck %s -check-prefix=IR
+; RUN: opt %loadPolly -polly-import-jscop \
+; RUN:     -polly-codegen -S < %s \
+; RUN:     -polly-parallel \
+; RUN:     | FileCheck %s -check-prefix=IR
 
 ; void new_multidim_access(long n, long m, float A[][m]) {
 ;   for (long i = 0; i < n; i++)
diff --git a/polly/test/CodeGen/OpenMP/recomputed-srem.ll b/polly/test/CodeGen/OpenMP/recomputed-srem.ll
index 99069612cd1d4..b7b3a44610f32 100644
--- a/polly/test/CodeGen/OpenMP/recomputed-srem.ll
+++ b/polly/test/CodeGen/OpenMP/recomputed-srem.ll
@@ -1,4 +1,5 @@
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly<no-default-opts>' -polly-parallel -polly-parallel-force -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -passes=polly-codegen -polly-parallel \
+; RUN:     -polly-parallel-force -S < %s | FileCheck %s
 ;
 ; Test to verify that we pass %rem96 to the parallel subfunction.
 ;
diff --git a/polly/test/CodeGen/OpenMP/reference-argument-from-non-affine-region.ll b/polly/test/CodeGen/OpenMP/reference-argument-from-non-affine-region.ll
index 236362a3e23dc..c207f589e4da0 100644
--- a/polly/test/CodeGen/OpenMP/reference-argument-from-non-affine-region.ll
+++ b/polly/test/CodeGen/OpenMP/reference-argument-from-non-affine-region.ll
@@ -1,8 +1,17 @@
-; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force '-passes=polly<no-default-opts>' -S -verify-dom-info < %s | FileCheck %s -check-prefix=IR
-
-; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force '-passes=polly<no-default-opts>' -polly-scheduling=runtime -S -verify-dom-info < %s | FileCheck %s -check-prefix=IR
-
-; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force '-passes=polly<no-default-opts>' -polly-omp-backend=LLVM -S -verify-dom-info < %s | FileCheck %s -check-prefix=LIBOMP-IR
+; RUN: opt %loadNPMPolly -polly-parallel \
+; RUN:     -polly-parallel-force -passes=polly-codegen \
+; RUN:     -S -verify-dom-info < %s \
+; RUN:     | FileCheck %s -check-prefix=IR
+
+; RUN: opt %loadNPMPolly -polly-parallel \
+; RUN:     -polly-parallel-force -passes=polly-codegen -polly-scheduling=runtime \
+; RUN:     -S -verify-dom-info < %s \
+; RUN:     | FileCheck %s -check-prefix=IR
+
+; RUN: opt %loadNPMPolly -polly-parallel \
+; RUN:     -polly-parallel-force -passes=polly-codegen -polly-omp-backend=LLVM \
+; RUN:     -S -verify-dom-info < %s \
+; RUN:     | FileCheck %s -check-prefix=LIBOMP-IR
 
 ; IR: @GOMP_parallel_loop_runtime_start
diff --git a/polly/test/CodeGen/OpenMP/reference-other-bb.ll b/polly/test/CodeGen/OpenMP/reference-other-bb.ll
index 9925187883173..dbfbd9a905086 100644
--- a/polly/test/CodeGen/OpenMP/reference-other-bb.ll
+++ b/polly/test/CodeGen/OpenMP/reference-other-bb.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force '-passes=polly<no-default-opts>' -S -verify-dom-info < %s | FileCheck %s -check-prefix=IR
+; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force -passes=polly-codegen -S -verify-dom-info < %s | FileCheck %s -check-prefix=IR
 
 ; IR: @foo_polly_subfn
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/polly/test/CodeGen/OpenMP/reference-preceeding-loop.ll b/polly/test/CodeGen/OpenMP/reference-preceeding-loop.ll
index 3738266b558ed..ee43b8aa34a44 100644
--- a/polly/test/CodeGen/OpenMP/reference-preceeding-loop.ll
+++ b/polly/test/CodeGen/OpenMP/reference-preceeding-loop.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s -check-prefix=AST
-; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force '-passes=polly<no-default-opts>' -S -verify-dom-info < %s | FileCheck %s -check-prefix=IR
+; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s -check-prefix=AST
+; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force -passes=polly-codegen -S -verify-dom-info < %s | FileCheck %s -check-prefix=IR
 
 
 ; - Test the case where scalar evolution references a loop that is outside
diff --git a/polly/test/CodeGen/OpenMP/reference_latest.ll b/polly/test/CodeGen/OpenMP/reference_latest.ll
index fb420b06b9afb..7a8cd77bb1571 100644
--- a/polly/test/CodeGen/OpenMP/reference_latest.ll
+++ b/polly/test/CodeGen/OpenMP/reference_latest.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts;delicm;simplify>' -polly-parallel -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-delicm,polly-simplify,polly-codegen' -polly-parallel -S < %s | FileCheck %s
 ;
 ; Test that parallel codegen handles scalars mapped to other arrays.
 ; After mapping "store double %add10" references the array "MemRef2".
diff --git a/polly/test/CodeGen/OpenMP/scev-rewriting.ll b/polly/test/CodeGen/OpenMP/scev-rewriting.ll
index 861a78e4acd7a..9b79f29094482 100644
--- a/polly/test/CodeGen/OpenMP/scev-rewriting.ll
+++ b/polly/test/CodeGen/OpenMP/scev-rewriting.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-vectorizer=stripmine -polly-parallel -polly-parallel-force -polly-process-unprofitable '-passes=polly<no-default-opts>' -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly < %s -polly-vectorizer=stripmine -polly-parallel -polly-parallel-force -polly-process-unprofitable -passes=polly-codegen -S | FileCheck %s
 ; CHECK: define internal void @DoStringSort_polly_subfn
 target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
 target triple = "aarch64-unknown-linux-gnueabi"
diff --git a/polly/test/CodeGen/OpenMP/single_loop.ll b/polly/test/CodeGen/OpenMP/single_loop.ll
index 5e8a58fadd56c..e5aee840ade74 100644
--- a/polly/test/CodeGen/OpenMP/single_loop.ll
+++ b/polly/test/CodeGen/OpenMP/single_loop.ll
@@ -1,14 +1,14 @@
-; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s -check-prefix=AST
-; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force '-passes=polly<no-default-opts>' -S -verify-dom-info < %s | FileCheck %s -check-prefix=IR
+; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s -check-prefix=AST
+; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force -passes=polly-codegen -S -verify-dom-info < %s | FileCheck %s -check-prefix=IR
 
-; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force '-passes=polly-custom<import-jscop;ast>' -polly-print-ast -disable-output < %s | FileCheck %s -check-prefix=AST-STRIDE4
-; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force '-passes=polly-custom<import-jscop;codegen>' -S < %s | FileCheck %s -check-prefix=IR-STRIDE4
+; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force '-passes=polly-import-jscop,print<polly-ast>' -disable-output < %s | FileCheck %s -check-prefix=AST-STRIDE4
+; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force '-passes=polly-import-jscop,polly-codegen' -S < %s | FileCheck %s -check-prefix=IR-STRIDE4
 
-; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force '-passes=polly<no-default-opts>' -polly-omp-backend=LLVM -polly-scheduling=static -polly-scheduling-chunksize=43 -S -verify-dom-info < %s | FileCheck %s -check-prefix=LIBOMP-IR-STATIC-CHUNKED
-; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force '-passes=polly<no-default-opts>' -polly-omp-backend=LLVM -polly-scheduling=static -S -verify-dom-info < %s | FileCheck %s -check-prefix=LIBOMP-IR-STATIC
-; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force '-passes=polly<no-default-opts>' -polly-omp-backend=LLVM -polly-scheduling=dynamic -S -verify-dom-info < %s | FileCheck %s -check-prefix=LIBOMP-IR-DYNAMIC
-; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force '-passes=polly<no-default-opts>' -polly-omp-backend=LLVM -polly-scheduling=dynamic -polly-scheduling-chunksize=4 -S -verify-dom-info < %s | FileCheck %s -check-prefix=LIBOMP-IR-DYNAMIC-FOUR
-; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force '-passes=polly-custom<import-jscop;codegen>' -polly-omp-backend=LLVM -S < %s | FileCheck %s -check-prefix=LIBOMP-IR-STRIDE4
+; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force -passes=polly-codegen -polly-omp-backend=LLVM -polly-scheduling=static -polly-scheduling-chunksize=43 -S -verify-dom-info < %s | FileCheck %s -check-prefix=LIBOMP-IR-STATIC-CHUNKED
+; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force -passes=polly-codegen -polly-omp-backend=LLVM -polly-scheduling=static -S -verify-dom-info < %s | FileCheck %s -check-prefix=LIBOMP-IR-STATIC
+; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force -passes=polly-codegen -polly-omp-backend=LLVM -polly-scheduling=dynamic -S -verify-dom-info < %s | FileCheck %s -check-prefix=LIBOMP-IR-DYNAMIC
+; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force -passes=polly-codegen -polly-omp-backend=LLVM -polly-scheduling=dynamic -polly-scheduling-chunksize=4 -S -verify-dom-info < %s | FileCheck %s -check-prefix=LIBOMP-IR-DYNAMIC-FOUR
+; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force '-passes=polly-import-jscop,polly-codegen' -polly-omp-backend=LLVM -S < %s | FileCheck %s -check-prefix=LIBOMP-IR-STRIDE4
 
 ; This extensive test case tests the creation of the full set of OpenMP calls
 ; as well as the subfunction creation using a trivial loop as example.
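single_loop.ll above enumerates the scheduling knobs these RUN lines exercise: the default GNU OpenMP backend versus -polly-omp-backend=LLVM, combined with -polly-scheduling={static,dynamic,runtime} and -polly-scheduling-chunksize. With runtime scheduling on the GNU backend, the tests in this patch check for @GOMP_parallel_loop_runtime_start; a condensed sketch of that pairing (hypothetical, reusing the trivial loop from the earlier sketch):

; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force \
; RUN:     -passes=polly-codegen -polly-scheduling=runtime -S < %s | FileCheck %s
; CHECK: @GOMP_parallel_loop_runtime_start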
diff --git a/polly/test/CodeGen/OpenMP/single_loop_with_loop_invariant_baseptr.ll b/polly/test/CodeGen/OpenMP/single_loop_with_loop_invariant_baseptr.ll index 95324793f4fa4..c519bfdee7a58 100644 --- a/polly/test/CodeGen/OpenMP/single_loop_with_loop_invariant_baseptr.ll +++ b/polly/test/CodeGen/OpenMP/single_loop_with_loop_invariant_baseptr.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly -aa-pipeline=tbaa -polly-parallel -polly-parallel-force -polly-parallel-force -polly-invariant-load-hoisting=true '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s -check-prefix=AST -; RUN: opt %loadNPMPolly -aa-pipeline=tbaa -polly-parallel -polly-parallel-force -polly-parallel-force -polly-invariant-load-hoisting=true '-passes=polly<no-default-opts>' -S -verify-dom-info < %s | FileCheck %s -check-prefix=IR +; RUN: opt %loadNPMPolly -aa-pipeline=tbaa -polly-parallel -polly-parallel-force -polly-parallel-force -polly-invariant-load-hoisting=true '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s -check-prefix=AST +; RUN: opt %loadNPMPolly -aa-pipeline=tbaa -polly-parallel -polly-parallel-force -polly-parallel-force -polly-invariant-load-hoisting=true -passes=polly-codegen -S -verify-dom-info < %s | FileCheck %s -check-prefix=IR ; #define N 1024 ; float A[N]; diff --git a/polly/test/CodeGen/OpenMP/single_loop_with_param.ll b/polly/test/CodeGen/OpenMP/single_loop_with_param.ll index 7334762f84f6c..f6dfd62d6bcc1 100644 --- a/polly/test/CodeGen/OpenMP/single_loop_with_param.ll +++ b/polly/test/CodeGen/OpenMP/single_loop_with_param.ll @@ -1,8 +1,18 @@ -; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force '-passes=polly<no-default-opts>' -S -verify-dom-info < %s | FileCheck %s -check-prefix=IR +; RUN: opt %loadNPMPolly -polly-parallel \ +; RUN: -polly-parallel-force -passes=polly-codegen \ +; RUN: -S -verify-dom-info < %s \ +; RUN: | FileCheck %s -check-prefix=IR -; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force '-passes=polly<no-default-opts>' -polly-omp-backend=LLVM -S -verify-dom-info < %s | FileCheck %s -check-prefix=LIBOMP-IR +; RUN: opt %loadNPMPolly -polly-parallel \ +; RUN: -polly-parallel-force -passes=polly-codegen -polly-omp-backend=LLVM \ +; RUN: -S -verify-dom-info < %s \ +; RUN: | FileCheck %s -check-prefix=LIBOMP-IR -; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force '-passes=polly<no-default-opts>' -polly-omp-backend=LLVM -polly-scheduling=static -S -verify-dom-info < %s | FileCheck %s -check-prefix=LIBOMP-STATIC-IR +; RUN: opt %loadNPMPolly -polly-parallel \ +; RUN: -polly-parallel-force -passes=polly-codegen -polly-omp-backend=LLVM \ +; RUN: -polly-scheduling=static \ +; RUN: -S -verify-dom-info < %s \ +; RUN: | FileCheck %s -check-prefix=LIBOMP-STATIC-IR ; Ensure the scalars are initialized before the OpenMP code is launched. 
; diff --git a/polly/test/CodeGen/OpenMP/two-parallel-loops-reference-outer-indvar.ll b/polly/test/CodeGen/OpenMP/two-parallel-loops-reference-outer-indvar.ll index 77c1b23a3f76c..934e04461f134 100644 --- a/polly/test/CodeGen/OpenMP/two-parallel-loops-reference-outer-indvar.ll +++ b/polly/test/CodeGen/OpenMP/two-parallel-loops-reference-outer-indvar.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s -check-prefix=AST -; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force '-passes=polly<no-default-opts>' -S -verify-dom-info < %s | FileCheck %s -check-prefix=IR +; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s -check-prefix=AST +; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force -passes=polly-codegen -S -verify-dom-info < %s | FileCheck %s -check-prefix=IR ; This test case verifies that we create correct code even if two OpenMP loops ; share common outer variables. diff --git a/polly/test/CodeGen/PHIInExit.ll b/polly/test/CodeGen/PHIInExit.ll index 39bdac793e8a1..3e0c9d67d5ca8 100644 --- a/polly/test/CodeGen/PHIInExit.ll +++ b/polly/test/CodeGen/PHIInExit.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' < %s +; RUN: opt %loadNPMPolly -passes=polly-codegen < %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" %struct..0__pthread_mutex_s = type { i32, i32, i32, i32, i32, i32, %struct.__pthread_list_t } diff --git a/polly/test/CodeGen/RuntimeDebugBuilder/combine_different_values.ll b/polly/test/CodeGen/RuntimeDebugBuilder/combine_different_values.ll index 9ec9804d35b0d..ccb0d15cfc3d2 100644 --- a/polly/test/CodeGen/RuntimeDebugBuilder/combine_different_values.ll +++ b/polly/test/CodeGen/RuntimeDebugBuilder/combine_different_values.ll @@ -1,4 +1,6 @@ -; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S -polly-codegen-add-debug-printing -polly-ignore-aliasing < %s | FileCheck %s +; RUN: opt %loadNPMPolly -passes=polly-codegen -S \ +; RUN: -polly-codegen-add-debug-printing \ +; RUN: -polly-ignore-aliasing < %s | FileCheck %s ; #define N 10 ; void foo(float A[restrict], double B[restrict], char C[restrict], diff --git a/polly/test/CodeGen/RuntimeDebugBuilder/stmt_tracing.ll b/polly/test/CodeGen/RuntimeDebugBuilder/stmt_tracing.ll index 736c136eeb67c..4ffb7fd6e4621 100644 --- a/polly/test/CodeGen/RuntimeDebugBuilder/stmt_tracing.ll +++ b/polly/test/CodeGen/RuntimeDebugBuilder/stmt_tracing.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-codegen-trace-stmts -polly-codegen-trace-scalars '-passes=polly<no-default-opts>' -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly -polly-codegen-trace-stmts -polly-codegen-trace-scalars -passes=polly-codegen -S < %s | FileCheck %s ; define void @func(i32 %n, ptr %A) { diff --git a/polly/test/CodeGen/alias-check-multi-dim.ll b/polly/test/CodeGen/alias-check-multi-dim.ll index bab2690bddb17..0440bda74b391 100644 --- a/polly/test/CodeGen/alias-check-multi-dim.ll +++ b/polly/test/CodeGen/alias-check-multi-dim.ll @@ -1,4 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly -passes=polly-codegen \ +; RUN: -S < %s | FileCheck %s target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" ; CHECK: sext i32 %indvar.init to i64 diff --git 
a/polly/test/CodeGen/alias_metadata_too_many_arrays.ll b/polly/test/CodeGen/alias_metadata_too_many_arrays.ll index 37ec2d5b748af..4186b8521a535 100644 --- a/polly/test/CodeGen/alias_metadata_too_many_arrays.ll +++ b/polly/test/CodeGen/alias_metadata_too_many_arrays.ll @@ -1,4 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-ignore-aliasing -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-ignore-aliasing -S < %s \ +; RUN: | FileCheck %s ; ; void manyarrays(float A1[], float A2[], float A3[], float A4[], float A5[], ; float A6[], float A7[], float A8[], float A9[]) { diff --git a/polly/test/CodeGen/aliasing_different_base_and_access_type.ll b/polly/test/CodeGen/aliasing_different_base_and_access_type.ll index 7fed270cb51dd..8e1fc3b328355 100644 --- a/polly/test/CodeGen/aliasing_different_base_and_access_type.ll +++ b/polly/test/CodeGen/aliasing_different_base_and_access_type.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -S '-passes=polly<no-default-opts>' < %s | FileCheck %s +; RUN: opt %loadNPMPolly -S -passes=polly-codegen < %s | FileCheck %s ; ; We have to cast %B to "short *" before we create RTCs. ; diff --git a/polly/test/CodeGen/aliasing_different_pointer_types.ll b/polly/test/CodeGen/aliasing_different_pointer_types.ll index 5326af339ddac..e601c22b978da 100644 --- a/polly/test/CodeGen/aliasing_different_pointer_types.ll +++ b/polly/test/CodeGen/aliasing_different_pointer_types.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s ; ; Check that we cast the different pointer types correctly before we compare ; them in the RTC's. We use i8* as max pointer type. diff --git a/polly/test/CodeGen/aliasing_multidimensional_access.ll b/polly/test/CodeGen/aliasing_multidimensional_access.ll index 5d0b40d6b59aa..e1dae03280a0e 100644 --- a/polly/test/CodeGen/aliasing_multidimensional_access.ll +++ b/polly/test/CodeGen/aliasing_multidimensional_access.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -S '-passes=polly<no-default-opts>' < %s | FileCheck %s +; RUN: opt %loadNPMPolly -S -passes=polly-codegen < %s | FileCheck %s ; ; Check that we calculate the maximal access into array A correctly and track the overflow state. 
 ;
diff --git a/polly/test/CodeGen/aliasing_parametric_simple_1.ll b/polly/test/CodeGen/aliasing_parametric_simple_1.ll
index 1b7b85835d795..a79ba2532535d 100644
--- a/polly/test/CodeGen/aliasing_parametric_simple_1.ll
+++ b/polly/test/CodeGen/aliasing_parametric_simple_1.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s
 ;
 ; void jd(int *A, int *B, int c) {
 ; for (int i = 0; i < 1024; i++)
diff --git a/polly/test/CodeGen/aliasing_parametric_simple_2.ll b/polly/test/CodeGen/aliasing_parametric_simple_2.ll
index fa8053ccabbea..efe4af1c9e7c5 100644
--- a/polly/test/CodeGen/aliasing_parametric_simple_2.ll
+++ b/polly/test/CodeGen/aliasing_parametric_simple_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s
 ;
 ; void jd(int *A, int *B, int c) {
 ; for (int i = 0; i < 1024; i++)
diff --git a/polly/test/CodeGen/aliasing_struct_element.ll b/polly/test/CodeGen/aliasing_struct_element.ll
index 4e8570944f6c6..3079e58d7daba 100644
--- a/polly/test/CodeGen/aliasing_struct_element.ll
+++ b/polly/test/CodeGen/aliasing_struct_element.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -S '-passes=polly<no-default-opts>' < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -S -passes=polly-codegen < %s | FileCheck %s
 ;
 ; We should only access (or compute the address of) "the first element" of %S
 ; as it is a single struct not a struct array. The maximal access to S, thus
diff --git a/polly/test/CodeGen/alignment.ll b/polly/test/CodeGen/alignment.ll
index daf7999c8072b..e0f6a959476f6 100644
--- a/polly/test/CodeGen/alignment.ll
+++ b/polly/test/CodeGen/alignment.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s
 ;
 ; Check that the special alignment information is kept
 ;
diff --git a/polly/test/CodeGen/annotated_alias_scopes.ll b/polly/test/CodeGen/annotated_alias_scopes.ll
index 7d2d9038270a9..ada03e0663722 100644
--- a/polly/test/CodeGen/annotated_alias_scopes.ll
+++ b/polly/test/CodeGen/annotated_alias_scopes.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s --check-prefix=SCOPES
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s --check-prefix=SCOPES
 ;
 ; Check that we create alias scopes that indicate the accesses to A, B and C cannot alias in any way.
 ;
diff --git a/polly/test/CodeGen/blas_sscal_simplified.ll b/polly/test/CodeGen/blas_sscal_simplified.ll
index 461af09b5b289..99f2eae9dd8e5 100644
--- a/polly/test/CodeGen/blas_sscal_simplified.ll
+++ b/polly/test/CodeGen/blas_sscal_simplified.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' < %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen < %s
 ;
 ; Regression test for a bug in the runtime check generation.
diff --git a/polly/test/CodeGen/conflict-between-loop-invariant-code-hosting-and-escape-map-computation.ll b/polly/test/CodeGen/conflict-between-loop-invariant-code-hosting-and-escape-map-computation.ll
index 5eb6076892f3e..5dba93373b70b 100644
--- a/polly/test/CodeGen/conflict-between-loop-invariant-code-hosting-and-escape-map-computation.ll
+++ b/polly/test/CodeGen/conflict-between-loop-invariant-code-hosting-and-escape-map-computation.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-process-unprofitable '-passes=polly<no-default-opts>' -disable-output < %s
+; RUN: opt %loadNPMPolly -polly-process-unprofitable -passes=polly-codegen -disable-output < %s
 ;
 ; CHECK: store i32 %tmp14_p_scalar_, ptr %tmp14.s2a
 ; CHECK: %tmp14.final_reload = load i32, ptr %tmp14.s2a
diff --git a/polly/test/CodeGen/constant_condition.ll b/polly/test/CodeGen/constant_condition.ll
index 9d3c5a811b16a..905aa52df5080 100644
--- a/polly/test/CodeGen/constant_condition.ll
+++ b/polly/test/CodeGen/constant_condition.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<prepare;ast>' -polly-print-ast -disable-output < %s 2>&1 | FileCheck %s
+;RUN: opt %loadNPMPolly '-passes=polly-prepare,scop(print<polly-ast>)' -disable-output < %s 2>&1 | FileCheck %s
 ;#include <string.h>
 ;int A[1];
diff --git a/polly/test/CodeGen/create-conditional-scop.ll b/polly/test/CodeGen/create-conditional-scop.ll
index d4df48b757d3d..b8c9a81b71a91 100644
--- a/polly/test/CodeGen/create-conditional-scop.ll
+++ b/polly/test/CodeGen/create-conditional-scop.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=polly<no-default-opts>' -verify-loop-info -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -passes=polly-codegen -verify-loop-info < %s -S | FileCheck %s
 target datalayout = "e-p:32:32:32-i64:64:64-i32:32:32-i16:16:16-i1:32:32-f64:64:64-f32:32:32-a0:0-n32"
diff --git a/polly/test/CodeGen/dead_invariant_load_instruction_referenced_by_parameter_1.ll b/polly/test/CodeGen/dead_invariant_load_instruction_referenced_by_parameter_1.ll
index 31b5e69ae4c6a..dfef4202391d4 100644
--- a/polly/test/CodeGen/dead_invariant_load_instruction_referenced_by_parameter_1.ll
+++ b/polly/test/CodeGen/dead_invariant_load_instruction_referenced_by_parameter_1.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' < %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen < %s
 ;
 ; Check we do not crash even though the dead %tmp8 is referenced by a parameter
 ; and we do not pre-load it (as it is dead).
diff --git a/polly/test/CodeGen/dead_invariant_load_instruction_referenced_by_parameter_2.ll b/polly/test/CodeGen/dead_invariant_load_instruction_referenced_by_parameter_2.ll
index 88b844bea5e4e..fcc6764ce9c21 100644
--- a/polly/test/CodeGen/dead_invariant_load_instruction_referenced_by_parameter_2.ll
+++ b/polly/test/CodeGen/dead_invariant_load_instruction_referenced_by_parameter_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' < %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen < %s
 ;
 ; Check we do not crash even though there is a dead load that is referenced by
 ; a parameter and we do not pre-load it (as it is dead).
diff --git a/polly/test/CodeGen/debug-intrinsics.ll b/polly/test/CodeGen/debug-intrinsics.ll
index f397a4b83d88a..ed4b81a8e3a3c 100644
--- a/polly/test/CodeGen/debug-intrinsics.ll
+++ b/polly/test/CodeGen/debug-intrinsics.ll
@@ -1,6 +1,10 @@
-; RUN: opt %loadNPMPolly -polly-analyze-read-only-scalars=false '-passes=polly<no-default-opts>' -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly \
+; RUN: -polly-analyze-read-only-scalars=false -passes=polly-codegen -S < %s | \
+; RUN: FileCheck %s
 
-; RUN: opt %loadNPMPolly -polly-analyze-read-only-scalars=true '-passes=polly<no-default-opts>' -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly \
+; RUN: -polly-analyze-read-only-scalars=true -passes=polly-codegen -S < %s | \
+; RUN: FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/polly/test/CodeGen/dominance_problem_after_early_codegen_bailout.ll b/polly/test/CodeGen/dominance_problem_after_early_codegen_bailout.ll
index 7f6f128c2cff2..edc03333a358d 100644
--- a/polly/test/CodeGen/dominance_problem_after_early_codegen_bailout.ll
+++ b/polly/test/CodeGen/dominance_problem_after_early_codegen_bailout.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -disable-output < %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -disable-output < %s
 ;
 ; This caused dominance problems at some point as we do bail out during
 ; code generation. Just verify it runs through.
diff --git a/polly/test/CodeGen/empty_domain_in_context.ll b/polly/test/CodeGen/empty_domain_in_context.ll
index f6c39eb0517bc..a2fe805f402e0 100644
--- a/polly/test/CodeGen/empty_domain_in_context.ll
+++ b/polly/test/CodeGen/empty_domain_in_context.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<optree;opt-isl;codegen>' -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-optree,polly-opt-isl,polly-codegen' -S < %s | FileCheck %s
 ;
 ; llvm.org/PR35362
 ; isl codegen does not allow to generate isl_ast_expr from pw_aff which have an
diff --git a/polly/test/CodeGen/entry_with_trivial_phi.ll b/polly/test/CodeGen/entry_with_trivial_phi.ll
index 09570938a9ca1..f2c9da04d6495 100644
--- a/polly/test/CodeGen/entry_with_trivial_phi.ll
+++ b/polly/test/CodeGen/entry_with_trivial_phi.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s
 ;
 ; The entry of this scop's simple region (entry.split => for.end) has an trivial
 ; PHI node. LCSSA may create such PHI nodes. This is a breakdown of this case in
diff --git a/polly/test/CodeGen/entry_with_trivial_phi_other_bb.ll b/polly/test/CodeGen/entry_with_trivial_phi_other_bb.ll
index 7d8ef7acf9435..2f1ec1a7872aa 100644
--- a/polly/test/CodeGen/entry_with_trivial_phi_other_bb.ll
+++ b/polly/test/CodeGen/entry_with_trivial_phi_other_bb.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s
 ;
 ; The entry of this scop's simple region (entry.split => for.end) has an trivial
 ; PHI node that is used in a different of the scop region. LCSSA may create such
diff --git a/polly/test/CodeGen/error-stmt-in-non-affine-region.ll b/polly/test/CodeGen/error-stmt-in-non-affine-region.ll
index c5c11c8ea2f8f..63b6becd19574 100644
--- a/polly/test/CodeGen/error-stmt-in-non-affine-region.ll
+++ b/polly/test/CodeGen/error-stmt-in-non-affine-region.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -S '-passes=polly<no-default-opts>' < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -S -passes=polly-codegen < %s | FileCheck %s
 ; XFAIL: *
 ;
 ; CHECK-LABEL: polly.stmt.if.then:
diff --git a/polly/test/CodeGen/error_block_contains_invalid_memory_access.ll b/polly/test/CodeGen/error_block_contains_invalid_memory_access.ll
index 1e38210c733d9..abec28894f45b 100644
--- a/polly/test/CodeGen/error_block_contains_invalid_memory_access.ll
+++ b/polly/test/CodeGen/error_block_contains_invalid_memory_access.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' < %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen < %s
 ;
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/polly/test/CodeGen/exprModDiv.ll b/polly/test/CodeGen/exprModDiv.ll
index b123e90c07882..c9b419abe3242 100644
--- a/polly/test/CodeGen/exprModDiv.ll
+++ b/polly/test/CodeGen/exprModDiv.ll
@@ -1,5 +1,8 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;codegen>' -S < %s | FileCheck %s
-; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;codegen>' -polly-import-jscop-postfix=pow2 -S < %s | FileCheck %s -check-prefix=POW2
+; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,polly-codegen' \
+; RUN: -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,polly-codegen' \
+; RUN: -polly-import-jscop-postfix=pow2 \
+; RUN: -S < %s | FileCheck %s -check-prefix=POW2
 ;
 ; void exprModDiv(float *A, float *B, float *C, long N, long p) {
 ; for (long i = 0; i < N; i++)
diff --git a/polly/test/CodeGen/hoisted_load_escapes_through_phi.ll b/polly/test/CodeGen/hoisted_load_escapes_through_phi.ll
index c7873baeeaeb7..1ca2413fd5e19 100644
--- a/polly/test/CodeGen/hoisted_load_escapes_through_phi.ll
+++ b/polly/test/CodeGen/hoisted_load_escapes_through_phi.ll
@@ -1,5 +1,7 @@
-; RUN: opt %loadNPMPolly -S '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=false < %s | FileCheck %s
-; RUN: opt %loadNPMPolly -S '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=true < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -S -passes=polly-codegen \
+; RUN: -polly-invariant-load-hoisting=false < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -S -passes=polly-codegen \
+; RUN: -polly-invariant-load-hoisting=true < %s | FileCheck %s
 ;
 ; Check that we generate valid code even if the load of cont_STACKPOINTER is
 ; hoisted in one SCoP and used (through the phi node %tmp2).
diff --git a/polly/test/CodeGen/hoisting_1.ll b/polly/test/CodeGen/hoisting_1.ll
index 31ae969cd3156..aa29bfd7dbcbc 100644
--- a/polly/test/CodeGen/hoisting_1.ll
+++ b/polly/test/CodeGen/hoisting_1.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -aa-pipeline=tbaa '-passes=polly<no-default-opts>' -polly-allow-differing-element-types -disable-output %s
+; RUN: opt %loadNPMPolly -aa-pipeline=tbaa -passes=polly-codegen -polly-allow-differing-element-types -disable-output %s
 ;
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/polly/test/CodeGen/hoisting_2.ll b/polly/test/CodeGen/hoisting_2.ll
index eb6f7ae5ff6d1..1b913f2cb07be 100644
--- a/polly/test/CodeGen/hoisting_2.ll
+++ b/polly/test/CodeGen/hoisting_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -aa-pipeline=tbaa '-passes=polly<no-default-opts>' -polly-allow-differing-element-types -disable-output %s
+; RUN: opt %loadNPMPolly -aa-pipeline=tbaa -passes=polly-codegen -polly-allow-differing-element-types -disable-output %s
 ;
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/polly/test/CodeGen/inner_scev_sdiv_1.ll b/polly/test/CodeGen/inner_scev_sdiv_1.ll
index f7595a6afb0be..d210105c46baf 100644
--- a/polly/test/CodeGen/inner_scev_sdiv_1.ll
+++ b/polly/test/CodeGen/inner_scev_sdiv_1.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -S '-passes=polly<no-default-opts>' < %s
+; RUN: opt %loadNPMPolly -S -passes=polly-codegen < %s
 ;
 ; Excerpt from the test-suite's oggenc reduced using bugpoint.
 ;
diff --git a/polly/test/CodeGen/inner_scev_sdiv_2.ll b/polly/test/CodeGen/inner_scev_sdiv_2.ll
index 247c102834b25..33233fe2fdf17 100644
--- a/polly/test/CodeGen/inner_scev_sdiv_2.ll
+++ b/polly/test/CodeGen/inner_scev_sdiv_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -S '-passes=polly<no-default-opts>' < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -S -passes=polly-codegen < %s | FileCheck %s
 ;
 ; The SCEV expression in this test case refers to a sequence of sdiv
 ; instructions, which are part of different bbs in the SCoP. When code
diff --git a/polly/test/CodeGen/inner_scev_sdiv_3.ll b/polly/test/CodeGen/inner_scev_sdiv_3.ll
index fc1cce41c0f4e..a8c626347efe9 100644
--- a/polly/test/CodeGen/inner_scev_sdiv_3.ll
+++ b/polly/test/CodeGen/inner_scev_sdiv_3.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -S '-passes=polly<no-default-opts>' < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -S -passes=polly-codegen < %s | FileCheck %s
 ;
 ; This test case has a inner SCEV sdiv that will escape the SCoP. Just check we
 ; do not crash and generate valid code.
diff --git a/polly/test/CodeGen/inner_scev_sdiv_in_lb.ll b/polly/test/CodeGen/inner_scev_sdiv_in_lb.ll
index 1ff598a4a021a..31c14e85f253e 100644
--- a/polly/test/CodeGen/inner_scev_sdiv_in_lb.ll
+++ b/polly/test/CodeGen/inner_scev_sdiv_in_lb.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s --check-prefix=CODEGEN
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s --check-prefix=CODEGEN
 ;
 ; CHECK: [N] -> { Stmt_bb11[i0, i1] : i0 < N and i1 >= 0 and 3i1 <= -3 + i0 };
 ; CODEGEN: polly
diff --git a/polly/test/CodeGen/inner_scev_sdiv_in_lb_invariant.ll b/polly/test/CodeGen/inner_scev_sdiv_in_lb_invariant.ll
index 4cd146ddbf62e..b42371b0891e6 100644
--- a/polly/test/CodeGen/inner_scev_sdiv_in_lb_invariant.ll
+++ b/polly/test/CodeGen/inner_scev_sdiv_in_lb_invariant.ll
@@ -1,4 +1,5 @@
-; RUN: opt %loadNPMPolly -S '-passes=polly<no-default-opts>' < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -S -passes=polly-codegen \
+; RUN: < %s | FileCheck %s
 ;
 ; Check that this will not crash our code generation.
 ;
diff --git a/polly/test/CodeGen/inner_scev_sdiv_in_rtc.ll b/polly/test/CodeGen/inner_scev_sdiv_in_rtc.ll
index 586875bbefcbe..45af63402c986 100644
--- a/polly/test/CodeGen/inner_scev_sdiv_in_rtc.ll
+++ b/polly/test/CodeGen/inner_scev_sdiv_in_rtc.ll
@@ -1,4 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen \
+; RUN: -S < %s | FileCheck %s
 ;
 ; This will just check that we generate valid code here.
 ;
diff --git a/polly/test/CodeGen/intrinsics_lifetime.ll b/polly/test/CodeGen/intrinsics_lifetime.ll
index 0f35664eb7e1c..a708548798ebb 100644
--- a/polly/test/CodeGen/intrinsics_lifetime.ll
+++ b/polly/test/CodeGen/intrinsics_lifetime.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=polly<no-default-opts>' -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -passes=polly-codegen -S < %s | FileCheck %s
 ;
 ; Verify that we remove the lifetime markers from everywhere.
 ;
diff --git a/polly/test/CodeGen/intrinsics_misc.ll b/polly/test/CodeGen/intrinsics_misc.ll
index 4a64c1a641182..a643b8accd4e9 100644
--- a/polly/test/CodeGen/intrinsics_misc.ll
+++ b/polly/test/CodeGen/intrinsics_misc.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=polly<no-default-opts>' -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -passes=polly-codegen -S < %s | FileCheck %s
 ;
 ; Verify that we remove the misc intrinsics from the optimized SCoP.
 ;
diff --git a/polly/test/CodeGen/inv-load-lnt-crash-wrong-order-2.ll b/polly/test/CodeGen/inv-load-lnt-crash-wrong-order-2.ll
index 15fe0d9e22416..e7cbf748bea73 100644
--- a/polly/test/CodeGen/inv-load-lnt-crash-wrong-order-2.ll
+++ b/polly/test/CodeGen/inv-load-lnt-crash-wrong-order-2.ll
@@ -1,4 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S -polly-invariant-load-hoisting=true < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S \
+; RUN: -polly-invariant-load-hoisting=true < %s | FileCheck %s
 ;
 ; This crashed our codegen at some point, verify it runs through
 ;
diff --git a/polly/test/CodeGen/inv-load-lnt-crash-wrong-order-3.ll b/polly/test/CodeGen/inv-load-lnt-crash-wrong-order-3.ll
index c1ab026e97701..24e9240c234d1 100644
--- a/polly/test/CodeGen/inv-load-lnt-crash-wrong-order-3.ll
+++ b/polly/test/CodeGen/inv-load-lnt-crash-wrong-order-3.ll
@@ -1,4 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S -polly-invariant-load-hoisting=true < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S \
+; RUN: -polly-invariant-load-hoisting=true < %s | FileCheck %s
 ;
 ; This crashed our codegen at some point, verify it runs through
 ;
diff --git a/polly/test/CodeGen/inv-load-lnt-crash-wrong-order.ll b/polly/test/CodeGen/inv-load-lnt-crash-wrong-order.ll
index f0c833ce1bce1..d1d861e316ee4 100644
--- a/polly/test/CodeGen/inv-load-lnt-crash-wrong-order.ll
+++ b/polly/test/CodeGen/inv-load-lnt-crash-wrong-order.ll
@@ -1,4 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S -polly-invariant-load-hoisting=true < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S \
+; RUN: -polly-invariant-load-hoisting=true < %s | FileCheck %s
 ;
 ; This crashed our codegen at some point, verify it runs through
 ;
diff --git a/polly/test/CodeGen/invariant-load-dimension.ll b/polly/test/CodeGen/invariant-load-dimension.ll
index 13576b9f40455..21e53055c56b0 100644
--- a/polly/test/CodeGen/invariant-load-dimension.ll
+++ b/polly/test/CodeGen/invariant-load-dimension.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly -polly-process-unprofitable -polly-invariant-load-hoisting '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s -check-prefix=SCOPS
-; RUN: opt %loadNPMPolly -S '-passes=polly<no-default-opts>' -polly-process-unprofitable -polly-invariant-load-hoisting < %s | FileCheck %s -check-prefix=CODEGEN
+; RUN: opt %loadNPMPolly -polly-process-unprofitable -polly-invariant-load-hoisting '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s -check-prefix=SCOPS
+; RUN: opt %loadNPMPolly -S < %s -passes=polly-codegen -polly-process-unprofitable -polly-invariant-load-hoisting | FileCheck %s -check-prefix=CODEGEN
 target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n8:16:32-S64"
diff --git a/polly/test/CodeGen/invariant-load-preload-base-pointer-origin-first.ll b/polly/test/CodeGen/invariant-load-preload-base-pointer-origin-first.ll
index d92d97012b33c..1fd9cb81771c6 100644
--- a/polly/test/CodeGen/invariant-load-preload-base-pointer-origin-first.ll
+++ b/polly/test/CodeGen/invariant-load-preload-base-pointer-origin-first.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -S '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=true < %s
+; RUN: opt %loadNPMPolly -S -passes=polly-codegen -polly-invariant-load-hoisting=true < %s
 ;
 ; Check that we generate valid code as we did non preload the base pointer
 ; origin of %tmp4 at some point.
diff --git a/polly/test/CodeGen/invariant_cannot_handle_void.ll b/polly/test/CodeGen/invariant_cannot_handle_void.ll
index f6dcac08dffca..420cb608f9ba4 100644
--- a/polly/test/CodeGen/invariant_cannot_handle_void.ll
+++ b/polly/test/CodeGen/invariant_cannot_handle_void.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly -polly-invariant-load-hoisting=true '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s --check-prefix=SCOP
-; RUN: opt %loadNPMPolly -S '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=true %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-invariant-load-hoisting=true '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=SCOP
+; RUN: opt %loadNPMPolly -S -passes=polly-codegen -polly-invariant-load-hoisting=true %s | FileCheck %s
 ;
 ; The offset of the %tmp1 load wrt. to %buff (62 bytes) is not divisible
 ; by the type size (i32 = 4 bytes), thus we will have to represent %buff
diff --git a/polly/test/CodeGen/invariant_load.ll b/polly/test/CodeGen/invariant_load.ll
index c89da73efc839..2d5e6042ea6a4 100644
--- a/polly/test/CodeGen/invariant_load.ll
+++ b/polly/test/CodeGen/invariant_load.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=true -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true -S < %s | FileCheck %s
 ;
 ; CHECK-LABEL: polly.preload.begin:
 ; CHECK-NEXT: %polly.access.B = getelementptr i32, ptr %B, i64 0
diff --git a/polly/test/CodeGen/invariant_load_address_space.ll b/polly/test/CodeGen/invariant_load_address_space.ll
index 7d5139cc55f88..3d1958e5b8a43 100644
--- a/polly/test/CodeGen/invariant_load_address_space.ll
+++ b/polly/test/CodeGen/invariant_load_address_space.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=true -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true -S < %s | FileCheck %s
 ;
 ; CHECK-LABEL: polly.preload.begin:
 ; CHECK-NEXT: %polly.access.B = getelementptr i32, ptr addrspace(1) %B, i64 0
diff --git a/polly/test/CodeGen/invariant_load_alias_metadata.ll b/polly/test/CodeGen/invariant_load_alias_metadata.ll
index 2a704ee9c576a..252463384a5c8 100644
--- a/polly/test/CodeGen/invariant_load_alias_metadata.ll
+++ b/polly/test/CodeGen/invariant_load_alias_metadata.ll
@@ -1,4 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=true -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true \
+; RUN: -S < %s | FileCheck %s
 ;
 ; This test case checks whether Polly generates alias metadata in case of
 ; the ublas gemm kernel and polly-invariant-load-hoisting.
diff --git a/polly/test/CodeGen/invariant_load_base_pointer.ll b/polly/test/CodeGen/invariant_load_base_pointer.ll
index f6b873994036c..d4ac433475f05 100644
--- a/polly/test/CodeGen/invariant_load_base_pointer.ll
+++ b/polly/test/CodeGen/invariant_load_base_pointer.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=true -polly-ignore-aliasing -polly-process-unprofitable -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true -polly-ignore-aliasing -polly-process-unprofitable -S < %s | FileCheck %s
 ;
 ; CHECK-LABEL: polly.preload.begin:
 ; CHECK-NEXT: %polly.access.BPLoc = getelementptr ptr, ptr %BPLoc, i64 0
diff --git a/polly/test/CodeGen/invariant_load_base_pointer_conditional.ll b/polly/test/CodeGen/invariant_load_base_pointer_conditional.ll
index 4dbcc3b3b049d..06a9a93363ed9 100644
--- a/polly/test/CodeGen/invariant_load_base_pointer_conditional.ll
+++ b/polly/test/CodeGen/invariant_load_base_pointer_conditional.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=true -polly-ignore-aliasing -polly-process-unprofitable -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true -polly-ignore-aliasing -polly-process-unprofitable -S < %s | FileCheck %s
 ;
 ; CHECK-LABEL: polly.preload.begin:
 ; CHECK-NEXT: %0 = sext i32 %N to i64
diff --git a/polly/test/CodeGen/invariant_load_base_pointer_conditional_2.ll b/polly/test/CodeGen/invariant_load_base_pointer_conditional_2.ll
index 39520c8fd8217..66ab9a31b1032 100644
--- a/polly/test/CodeGen/invariant_load_base_pointer_conditional_2.ll
+++ b/polly/test/CodeGen/invariant_load_base_pointer_conditional_2.ll
@@ -1,6 +1,6 @@
-; RUN: opt %loadNPMPolly -polly-invariant-load-hoisting=true '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly -S '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=true < %s | FileCheck %s --check-prefix=IR
-; RUN: opt %loadNPMPolly -S '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=true --polly-overflow-tracking=always < %s | FileCheck %s --check-prefix=IRA
+; RUN: opt %loadNPMPolly -polly-invariant-load-hoisting=true '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -S -passes=polly-codegen -polly-invariant-load-hoisting=true < %s | FileCheck %s --check-prefix=IR
+; RUN: opt %loadNPMPolly -S -passes=polly-codegen -polly-invariant-load-hoisting=true --polly-overflow-tracking=always < %s | FileCheck %s --check-prefix=IRA
 ;
 ; As (p + q) can overflow we have to check that we load from
 ; I[p + q] only if it does not.
diff --git a/polly/test/CodeGen/invariant_load_canonicalize_array_baseptrs.ll b/polly/test/CodeGen/invariant_load_canonicalize_array_baseptrs.ll
index 414ca127a251f..fa904e9b96d34 100644
--- a/polly/test/CodeGen/invariant_load_canonicalize_array_baseptrs.ll
+++ b/polly/test/CodeGen/invariant_load_canonicalize_array_baseptrs.ll
@@ -1,4 +1,6 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S -polly-invariant-load-hoisting < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s \
+; RUN: -polly-invariant-load-hoisting \
+; RUN: | FileCheck %s
 ; CHECK: %polly.access.A = getelementptr ptr, ptr %A, i64 0
 ; CHECK: %polly.access.A.load = load ptr, ptr %polly.access.A
diff --git a/polly/test/CodeGen/invariant_load_condition.ll b/polly/test/CodeGen/invariant_load_condition.ll
index f0782c023378b..36e588329d669 100644
--- a/polly/test/CodeGen/invariant_load_condition.ll
+++ b/polly/test/CodeGen/invariant_load_condition.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-process-unprofitable '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=true -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-process-unprofitable -passes=polly-codegen -polly-invariant-load-hoisting=true -S < %s | FileCheck %s
 ;
 ; CHECK-LABEL: polly.preload.begin:
 ; CHECK-NEXT: %polly.access.C = getelementptr i32, ptr %C, i64 0
diff --git a/polly/test/CodeGen/invariant_load_different_sized_types.ll b/polly/test/CodeGen/invariant_load_different_sized_types.ll
index 034c3587a0708..0a88bb70966d2 100644
--- a/polly/test/CodeGen/invariant_load_different_sized_types.ll
+++ b/polly/test/CodeGen/invariant_load_different_sized_types.ll
@@ -1,4 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=true -S -polly-allow-differing-element-types < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true -S \
+; RUN: -polly-allow-differing-element-types < %s | FileCheck %s
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/polly/test/CodeGen/invariant_load_escaping.ll b/polly/test/CodeGen/invariant_load_escaping.ll
index 85578d3ba0992..416148b72303b 100644
--- a/polly/test/CodeGen/invariant_load_escaping.ll
+++ b/polly/test/CodeGen/invariant_load_escaping.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=true -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true -S < %s | FileCheck %s
 ;
 ; int f(int *A, int *B) {
 ; // Possible aliasing between A and B but if not then *B would be
diff --git a/polly/test/CodeGen/invariant_load_escaping_second_scop.ll b/polly/test/CodeGen/invariant_load_escaping_second_scop.ll
index ff6e9a8e3ddae..906bfc1805d39 100644
--- a/polly/test/CodeGen/invariant_load_escaping_second_scop.ll
+++ b/polly/test/CodeGen/invariant_load_escaping_second_scop.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=true -polly-process-unprofitable -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true -polly-process-unprofitable -S < %s | FileCheck %s
 ;
 ; void fence(void);
 ;
diff --git a/polly/test/CodeGen/invariant_load_in_non_affine_subregion.ll b/polly/test/CodeGen/invariant_load_in_non_affine_subregion.ll
index edd38cab2afba..ab02e639f0d2a 100644
--- a/polly/test/CodeGen/invariant_load_in_non_affine_subregion.ll
+++ b/polly/test/CodeGen/invariant_load_in_non_affine_subregion.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=true -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true -S < %s | FileCheck %s
 ;
 ; This crashed at some point as the invariant load is in a non-affine
 ; subregion. Just check it does not anymore.
diff --git a/polly/test/CodeGen/invariant_load_loop_ub.ll b/polly/test/CodeGen/invariant_load_loop_ub.ll
index 923102440c547..1db27ad8e58ba 100644
--- a/polly/test/CodeGen/invariant_load_loop_ub.ll
+++ b/polly/test/CodeGen/invariant_load_loop_ub.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=true -polly-process-unprofitable -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true -polly-process-unprofitable -S < %s | FileCheck %s
 ;
 ; CHECK: polly.start
 ;
diff --git a/polly/test/CodeGen/invariant_load_not_executed_but_in_parameters.ll b/polly/test/CodeGen/invariant_load_not_executed_but_in_parameters.ll
index 0e381b863fb8b..5a11adcdebbc5 100644
--- a/polly/test/CodeGen/invariant_load_not_executed_but_in_parameters.ll
+++ b/polly/test/CodeGen/invariant_load_not_executed_but_in_parameters.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=true -disable-output < %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true -disable-output < %s
 ;
 ; Check that this does not crash as the invariant load is not executed (thus
 ; not preloaded) but still referenced by one of the parameters.
diff --git a/polly/test/CodeGen/invariant_load_outermost.ll b/polly/test/CodeGen/invariant_load_outermost.ll
index bbbe1f1663964..7e0550fb3be94 100644
--- a/polly/test/CodeGen/invariant_load_outermost.ll
+++ b/polly/test/CodeGen/invariant_load_outermost.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=true -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true -S < %s | FileCheck %s
 ; CHECK: polly.start
diff --git a/polly/test/CodeGen/invariant_load_parameters_cyclic_dependence.ll b/polly/test/CodeGen/invariant_load_parameters_cyclic_dependence.ll
index 9fe343f752d14..abf957b556daa 100644
--- a/polly/test/CodeGen/invariant_load_parameters_cyclic_dependence.ll
+++ b/polly/test/CodeGen/invariant_load_parameters_cyclic_dependence.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s --check-prefix=SCOP
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=true -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s --check-prefix=SCOP
+; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true -S < %s | FileCheck %s
 ;
 ; SCOP: Assumed Context:
 ; SCOP-NEXT: [p_0, tmp4] -> { : }
diff --git a/polly/test/CodeGen/invariant_load_ptr_ptr_noalias.ll b/polly/test/CodeGen/invariant_load_ptr_ptr_noalias.ll
index dc1c2bca4b6e3..b565f1bd5096a 100644
--- a/polly/test/CodeGen/invariant_load_ptr_ptr_noalias.ll
+++ b/polly/test/CodeGen/invariant_load_ptr_ptr_noalias.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-process-unprofitable '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=true -polly-ignore-aliasing -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-process-unprofitable -passes=polly-codegen -polly-invariant-load-hoisting=true -polly-ignore-aliasing -S < %s | FileCheck %s
 ;
 ; CHECK-LABEL: polly.preload.begin:
 ; CHECK: %polly.access.A = getelementptr ptr, ptr %A, i64 42
diff --git a/polly/test/CodeGen/invariant_load_scalar_dep.ll b/polly/test/CodeGen/invariant_load_scalar_dep.ll
index bb60c50b1ab40..ba2999e27984d 100644
--- a/polly/test/CodeGen/invariant_load_scalar_dep.ll
+++ b/polly/test/CodeGen/invariant_load_scalar_dep.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=true -polly-ignore-aliasing -polly-process-unprofitable -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true -polly-ignore-aliasing -polly-process-unprofitable -S < %s | FileCheck %s
 ;
 ; CHECK-LABEL: polly.preload.begin:
 ; CHECK: %polly.access.B = getelementptr i32, ptr %B, i64 0
diff --git a/polly/test/CodeGen/invariant_load_scalar_escape_alloca_sharing.ll b/polly/test/CodeGen/invariant_load_scalar_escape_alloca_sharing.ll
index 87c407e05b972..26c964c9c6a72 100644
--- a/polly/test/CodeGen/invariant_load_scalar_escape_alloca_sharing.ll
+++ b/polly/test/CodeGen/invariant_load_scalar_escape_alloca_sharing.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=true -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true -S < %s | FileCheck %s
 ;
 ; Verify the preloaded %tmp0 is stored and communicated in the same alloca.
 ; In this case, we do not reload %ncol.load from the scalar stack slot, but
diff --git a/polly/test/CodeGen/invariant_loads_from_struct_with_different_types_1.ll b/polly/test/CodeGen/invariant_loads_from_struct_with_different_types_1.ll
index 5e2b28c53019e..6bf11d5697bd7 100644
--- a/polly/test/CodeGen/invariant_loads_from_struct_with_different_types_1.ll
+++ b/polly/test/CodeGen/invariant_loads_from_struct_with_different_types_1.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=true < %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true < %s
 ;
 ; Check we do not crash even though we pre-load values with different types
 ; from the same base pointer.
diff --git a/polly/test/CodeGen/invariant_loads_from_struct_with_different_types_2.ll b/polly/test/CodeGen/invariant_loads_from_struct_with_different_types_2.ll
index 20d9f6d40b7d6..07ce941522459 100644
--- a/polly/test/CodeGen/invariant_loads_from_struct_with_different_types_2.ll
+++ b/polly/test/CodeGen/invariant_loads_from_struct_with_different_types_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=true < %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true < %s
 ;
 ; Check we do not crash even though we pre-load values with different types
 ; from the same base pointer.
diff --git a/polly/test/CodeGen/invariant_loads_ignore_parameter_bounds.ll b/polly/test/CodeGen/invariant_loads_ignore_parameter_bounds.ll
index 51f8a55d1a400..19b30afd33ba7 100644
--- a/polly/test/CodeGen/invariant_loads_ignore_parameter_bounds.ll
+++ b/polly/test/CodeGen/invariant_loads_ignore_parameter_bounds.ll
@@ -1,4 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting -polly-ignore-parameter-bounds -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting \
+; RUN: -polly-ignore-parameter-bounds -S < %s | FileCheck %s
 ; CHECK: polly.preload.begin:
 ; CHECK-NEXT: %global.load = load i32, ptr @global, align 4, !alias.scope !0, !noalias !3
diff --git a/polly/test/CodeGen/invariant_verify_function_failed.ll b/polly/test/CodeGen/invariant_verify_function_failed.ll
index 432c155fdd3ae..1dcc175ebb163 100644
--- a/polly/test/CodeGen/invariant_verify_function_failed.ll
+++ b/polly/test/CodeGen/invariant_verify_function_failed.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-print-detect -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>,scop(polly-codegen)' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; This crashed at some point as the pointer returned by the call
 ; to @__errno_location is invariant and defined in the SCoP but not
diff --git a/polly/test/CodeGen/invariant_verify_function_failed_2.ll b/polly/test/CodeGen/invariant_verify_function_failed_2.ll
index 65ba2cd993193..43b3d99e11a2f 100644
--- a/polly/test/CodeGen/invariant_verify_function_failed_2.ll
+++ b/polly/test/CodeGen/invariant_verify_function_failed_2.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly -S '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s -check-prefix=SCOPS
-; RUN: opt %loadNPMPolly -S '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=true %s | FileCheck %s
+; RUN: opt %loadNPMPolly -S '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s -check-prefix=SCOPS
+; RUN: opt %loadNPMPolly -S -passes=polly-codegen -polly-invariant-load-hoisting=true %s | FileCheck %s
 ;
 ; Check we generate valid code.
diff --git a/polly/test/CodeGen/issue56692.ll b/polly/test/CodeGen/issue56692.ll
index 5e225d73bdcd3..34c4e398e2ac0 100644
--- a/polly/test/CodeGen/issue56692.ll
+++ b/polly/test/CodeGen/issue56692.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force -polly-omp-backend=LLVM -polly-codegen-verify '-passes=polly<no-default-opts>' -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force -polly-omp-backend=LLVM -polly-codegen-verify -passes=polly-codegen -S < %s | FileCheck %s
 ; https://github.com/llvm/llvm-project/issues/56692
 ;
 ; CHECK: call void (ptr, i32, ptr, ...) @__kmpc_fork_call({{.*}}), !dbg ![[OPTLOC:[0-9]+]]
diff --git a/polly/test/CodeGen/large-numbers-in-boundary-context.ll b/polly/test/CodeGen/large-numbers-in-boundary-context.ll
index 4d55273618df6..b228baf9bdf22 100644
--- a/polly/test/CodeGen/large-numbers-in-boundary-context.ll
+++ b/polly/test/CodeGen/large-numbers-in-boundary-context.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -S '-passes=polly<no-default-opts>' < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -S -passes=polly-codegen < %s | FileCheck %s
 ; XFAIL: *
 ;
 ; The boundary context contains a constant that does not fit in 64 bits. Hence,
diff --git a/polly/test/CodeGen/load_subset_with_context.ll b/polly/test/CodeGen/load_subset_with_context.ll
index 33b3d3b72225f..ccd4198b9fe85 100644
--- a/polly/test/CodeGen/load_subset_with_context.ll
+++ b/polly/test/CodeGen/load_subset_with_context.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;codegen>' -polly-import-jscop-postfix=transformed -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,polly-codegen' -polly-import-jscop-postfix=transformed -S < %s | FileCheck %s
 ;
 ; A load must provide a value for every statement instance.
 ; Statement instances not in the SCoP's context are irrelevant.
diff --git a/polly/test/CodeGen/loop-invariant-load-type-mismatch.ll b/polly/test/CodeGen/loop-invariant-load-type-mismatch.ll
index dc0c5517d7ca5..f43247b3e5057 100644
--- a/polly/test/CodeGen/loop-invariant-load-type-mismatch.ll
+++ b/polly/test/CodeGen/loop-invariant-load-type-mismatch.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' < %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen < %s
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
diff --git a/polly/test/CodeGen/loop_with_condition.ll b/polly/test/CodeGen/loop_with_condition.ll
index cf28a4de63f3b..49e312404cca8 100644
--- a/polly/test/CodeGen/loop_with_condition.ll
+++ b/polly/test/CodeGen/loop_with_condition.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s
 ;#include <string.h>
 ;#define N 1024
diff --git a/polly/test/CodeGen/loop_with_condition_2.ll b/polly/test/CodeGen/loop_with_condition_2.ll
index 1d8a8132a79cb..8ae38eeeb4982 100644
--- a/polly/test/CodeGen/loop_with_condition_2.ll
+++ b/polly/test/CodeGen/loop_with_condition_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=polly-custom<ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
 ; Verify that we actually detect this loop as the innermost loop even though
 ; there is a conditional inside.
diff --git a/polly/test/CodeGen/loop_with_condition_ineq.ll b/polly/test/CodeGen/loop_with_condition_ineq.ll
index c222f67ed7836..64019a6090212 100644
--- a/polly/test/CodeGen/loop_with_condition_ineq.ll
+++ b/polly/test/CodeGen/loop_with_condition_ineq.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s
 ;#include <string.h>
 ;#define N 1024
diff --git a/polly/test/CodeGen/loop_with_condition_nested.ll b/polly/test/CodeGen/loop_with_condition_nested.ll
index 32256a7344664..5dcb51dcb91cd 100644
--- a/polly/test/CodeGen/loop_with_condition_nested.ll
+++ b/polly/test/CodeGen/loop_with_condition_nested.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=polly<no-default-opts>' < %s | opt -passes='print<loops>' -disable-output 2>&1 | FileCheck %s -check-prefix=LOOPS
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -passes=polly-codegen < %s | opt -passes='print<loops>' -disable-output 2>&1 | FileCheck %s -check-prefix=LOOPS
 ;#include <string.h>
diff --git a/polly/test/CodeGen/loop_with_conditional_entry_edge_split_hard_case.ll b/polly/test/CodeGen/loop_with_conditional_entry_edge_split_hard_case.ll
index 5d7f67f1f9060..26fe4eb82ae49 100644
--- a/polly/test/CodeGen/loop_with_conditional_entry_edge_split_hard_case.ll
+++ b/polly/test/CodeGen/loop_with_conditional_entry_edge_split_hard_case.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s
 ;
 ; Test case to trigger the hard way of creating a unique entering
 ; edge for the SCoP. It is triggered because the entering edge
diff --git a/polly/test/CodeGen/memcpy_annotations.ll b/polly/test/CodeGen/memcpy_annotations.ll
index c3ffe4abcddd6..501aa8fbea4d6 100644
--- a/polly/test/CodeGen/memcpy_annotations.ll
+++ b/polly/test/CodeGen/memcpy_annotations.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s
 ;
 ; Verify that @llvm.memcpy does not get a !alias.scope annotation.
 ; @llvm.memcpy takes two pointers, it is ambiguous to which the
diff --git a/polly/test/CodeGen/multidim-non-matching-typesize-2.ll b/polly/test/CodeGen/multidim-non-matching-typesize-2.ll
index b084672971855..f63eb18118e77 100644
--- a/polly/test/CodeGen/multidim-non-matching-typesize-2.ll
+++ b/polly/test/CodeGen/multidim-non-matching-typesize-2.ll
@@ -1,4 +1,5 @@
-; RUN: opt %loadNPMPolly --aa-pipeline= '-passes=polly<no-default-opts>' -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -disable-basic-aa -passes=polly-codegen \
+; RUN: -S < %s | FileCheck %s
 ; CHECK: polly
 target datalayout = "e-m:o-p:32:32-f64:32:64-f80:128-n8:16:32-S128"
diff --git a/polly/test/CodeGen/multidim-non-matching-typesize.ll b/polly/test/CodeGen/multidim-non-matching-typesize.ll
index 66a4fdf42bc8e..63e43c83ada5f 100644
--- a/polly/test/CodeGen/multidim-non-matching-typesize.ll
+++ b/polly/test/CodeGen/multidim-non-matching-typesize.ll
@@ -1,4 +1,5 @@
-; RUN: opt %loadNPMPolly --aa-pipeline= '-passes=polly<no-default-opts>' -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -disable-basic-aa -passes=polly-codegen \
+; RUN: -S < %s | FileCheck %s
 target datalayout = "e-m:o-p:32:32-f64:32:64-f80:128-n8:16:32-S128"
diff --git a/polly/test/CodeGen/multidim_2d_parametric_array_static_loop_bounds.ll b/polly/test/CodeGen/multidim_2d_parametric_array_static_loop_bounds.ll
index d3f8b718889e4..86b17573caada 100644
--- a/polly/test/CodeGen/multidim_2d_parametric_array_static_loop_bounds.ll
+++ b/polly/test/CodeGen/multidim_2d_parametric_array_static_loop_bounds.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 ; Derived from the following code:
diff --git a/polly/test/CodeGen/multidim_alias_check.ll b/polly/test/CodeGen/multidim_alias_check.ll
index e85d7c9e7785d..93e34e2fd0fc1 100644
--- a/polly/test/CodeGen/multidim_alias_check.ll
+++ b/polly/test/CodeGen/multidim_alias_check.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' < %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen < %s
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 ; CHECK: %polly.access.sext.A = sext i32 %n to i64
diff --git a/polly/test/CodeGen/multiple-codegens.ll b/polly/test/CodeGen/multiple-codegens.ll
index cb12700bfb561..a63f8a615ff9e 100644
--- a/polly/test/CodeGen/multiple-codegens.ll
+++ b/polly/test/CodeGen/multiple-codegens.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts;opt-isl>,polly<no-default-opts>' -S < %s | FileCheck %s
-; RUN: opt %loadNPMPolly '-passes=function(polly<no-default-opts;opt-isl>),function(polly<no-default-opts>)' -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly "-passes=scop(polly-opt-isl,polly-codegen,polly-codegen)" -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly "-passes=scop(polly-opt-isl,polly-codegen),scop(polly-codegen)" -S < %s | FileCheck %s
 ;
 ; llvm.org/PR34441
 ; Properly handle multiple -polly-scops/-polly-codegen in the same
diff --git a/polly/test/CodeGen/multiple-scops-in-a-row.ll b/polly/test/CodeGen/multiple-scops-in-a-row.ll
index b92359782d999..effae223c152a 100644
--- a/polly/test/CodeGen/multiple-scops-in-a-row.ll
+++ b/polly/test/CodeGen/multiple-scops-in-a-row.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -S '-passes=polly<no-default-opts>' < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -S -passes=polly-codegen < %s | FileCheck %s
 ; This test case has two scops in a row. When code generating the first scop,
 ; the second scop is invalidated. This test case verifies that we do not crash
diff --git a/polly/test/CodeGen/multiple-types-invariant-load-2.ll b/polly/test/CodeGen/multiple-types-invariant-load-2.ll
index 96615079be365..101fcaff0c82e 100644
--- a/polly/test/CodeGen/multiple-types-invariant-load-2.ll
+++ b/polly/test/CodeGen/multiple-types-invariant-load-2.ll
@@ -1,4 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S -polly-allow-differing-element-types < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S \
+; RUN: -polly-allow-differing-element-types < %s | FileCheck %s
 ; CHECK: polly
diff --git a/polly/test/CodeGen/multiple-types-invariant-load.ll b/polly/test/CodeGen/multiple-types-invariant-load.ll
index ca89cb53e09b7..930041eaddaad 100644
--- a/polly/test/CodeGen/multiple-types-invariant-load.ll
+++ b/polly/test/CodeGen/multiple-types-invariant-load.ll
@@ -1,4 +1,5 @@
-; RUN: opt %loadNPMPolly -polly-allow-differing-element-types '-passes=polly<no-default-opts>' -S -polly-invariant-load-hoisting=true < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-allow-differing-element-types -passes=polly-codegen -S \
+; RUN: -polly-invariant-load-hoisting=true < %s | FileCheck %s
 ; CHECK: %polly.access.global.load = getelementptr i32, ptr %global.load, i64 0
 ; CHECK: %polly.access.global.load.load = load i32, ptr %polly.access.global.load
diff --git a/polly/test/CodeGen/multiple_sai_fro_same_base_address.ll b/polly/test/CodeGen/multiple_sai_fro_same_base_address.ll
index 8198108b22059..1e06a7e186bb0 100644
--- a/polly/test/CodeGen/multiple_sai_fro_same_base_address.ll
+++ b/polly/test/CodeGen/multiple_sai_fro_same_base_address.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly -polly-position=before-vectorizer '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s --check-prefix=SCOP
-; RUN: opt %loadNPMPolly -polly-position=before-vectorizer '-passes=polly<no-default-opts>' -S < %s | FileCheck %s --check-prefix=IR
+; RUN: opt %loadNPMPolly -polly-position=before-vectorizer '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=SCOP
+; RUN: opt %loadNPMPolly -polly-position=before-vectorizer -passes=polly-codegen -S < %s | FileCheck %s --check-prefix=IR
 ; The IR has two ScopArrayInfo for the value %next.0. This used to produce two
 ; phi nodes in polly.merge_new_and_old, one illegaly using the result of the
diff --git a/polly/test/CodeGen/no-overflow-tracking.ll b/polly/test/CodeGen/no-overflow-tracking.ll
index f915b5a0772e6..d5ad9a7aef239 100644
--- a/polly/test/CodeGen/no-overflow-tracking.ll
+++ b/polly/test/CodeGen/no-overflow-tracking.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly -polly-invariant-load-hoisting=true '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly -polly-invariant-load-hoisting=true -polly-overflow-tracking=never '-passes=polly<no-default-opts>' -S < %s | FileCheck %s --check-prefix=IR
+; RUN: opt %loadNPMPolly -polly-invariant-load-hoisting=true '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-invariant-load-hoisting=true -polly-overflow-tracking=never -passes=polly-codegen -S < %s | FileCheck %s --check-prefix=IR
 ;
 ; As (p + q) can overflow we have to check that we load from
 ; I[p + q] only if it does not.
diff --git a/polly/test/CodeGen/no_guard_bb.ll b/polly/test/CodeGen/no_guard_bb.ll
index 604c5ac54bcdb..a022083f43a9e 100644
--- a/polly/test/CodeGen/no_guard_bb.ll
+++ b/polly/test/CodeGen/no_guard_bb.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S -verify-dom-info < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S -verify-dom-info < %s | FileCheck %s
 ;
 ; CHECK-NOT: br i1 true, label %polly.{{.*}}, label %polly.{{.*}}
 ;
diff --git a/polly/test/CodeGen/non-affine-dominance-generated-entering.ll b/polly/test/CodeGen/non-affine-dominance-generated-entering.ll
index ebb02a90ffb5d..6015516a3bc49 100644
--- a/polly/test/CodeGen/non-affine-dominance-generated-entering.ll
+++ b/polly/test/CodeGen/non-affine-dominance-generated-entering.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s
 ;
 ; llvm.org/PR25439
 ; Scalar reloads in the generated entering block were not recognized as
diff --git a/polly/test/CodeGen/non-affine-exit-node-dominance.ll b/polly/test/CodeGen/non-affine-exit-node-dominance.ll
index ff9f504295672..0d0f634ed7c16 100644
--- a/polly/test/CodeGen/non-affine-exit-node-dominance.ll
+++ b/polly/test/CodeGen/non-affine-exit-node-dominance.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s
 ;
 ; llvm.org/PR25439
 ; The dominance of the generated non-affine subregion block was based on the
diff --git a/polly/test/CodeGen/non-affine-phi-node-expansion-2.ll b/polly/test/CodeGen/non-affine-phi-node-expansion-2.ll
index 2ad1e75216362..bfa3c156ea75d 100644
--- a/polly/test/CodeGen/non-affine-phi-node-expansion-2.ll
+++ b/polly/test/CodeGen/non-affine-phi-node-expansion-2.ll
@@ -1,4 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen \
+; RUN: -S < %s | FileCheck %s
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/polly/test/CodeGen/non-affine-phi-node-expansion-3.ll b/polly/test/CodeGen/non-affine-phi-node-expansion-3.ll
index 386fe5f9f207f..b9386333a79b4 100644
--- a/polly/test/CodeGen/non-affine-phi-node-expansion-3.ll
+++ b/polly/test/CodeGen/non-affine-phi-node-expansion-3.ll
@@ -1,4 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen \
+; RUN: -S < %s | FileCheck %s
 define void @foo(ptr %A, i1 %cond0, i1 %cond1) {
 entry:
diff --git a/polly/test/CodeGen/non-affine-phi-node-expansion-4.ll b/polly/test/CodeGen/non-affine-phi-node-expansion-4.ll
index 5e5f34d99bde3..6460c427270f4 100644
--- a/polly/test/CodeGen/non-affine-phi-node-expansion-4.ll
+++ b/polly/test/CodeGen/non-affine-phi-node-expansion-4.ll
@@ -1,4 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen \
+; RUN: -S < %s | FileCheck %s
 define void @foo(ptr %A, i1 %cond0, i1 %cond1) {
 entry:
diff --git a/polly/test/CodeGen/non-affine-phi-node-expansion.ll b/polly/test/CodeGen/non-affine-phi-node-expansion.ll
index db9f0d518041b..1b6802f1a4c35 100644
--- a/polly/test/CodeGen/non-affine-phi-node-expansion.ll
+++ b/polly/test/CodeGen/non-affine-phi-node-expansion.ll
@@ -1,4 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen \
+; RUN: -S < %s | FileCheck %s
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 %struct.wombat = type {[4 x i32]}
diff --git a/polly/test/CodeGen/non-affine-region-exit-phi-incoming-synthesize-2.ll b/polly/test/CodeGen/non-affine-region-exit-phi-incoming-synthesize-2.ll
index 096eb8609e1bb..007a4c586aa32 100644
--- a/polly/test/CodeGen/non-affine-region-exit-phi-incoming-synthesize-2.ll
+++ b/polly/test/CodeGen/non-affine-region-exit-phi-incoming-synthesize-2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s
 ;
 ; This caused the code generation to generate invalid code as the same operand
 ; of the PHI node in the non-affine region was synthesized at the wrong place.
diff --git a/polly/test/CodeGen/non-affine-region-exit-phi-incoming-synthesize.ll b/polly/test/CodeGen/non-affine-region-exit-phi-incoming-synthesize.ll
index 2810a8ab5361f..20edbf2bd6c03 100644
--- a/polly/test/CodeGen/non-affine-region-exit-phi-incoming-synthesize.ll
+++ b/polly/test/CodeGen/non-affine-region-exit-phi-incoming-synthesize.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s
 ;
 ; This caused the code generation to generate invalid code as the same BBMap was
 ; used for the whole non-affine region. When %add is synthesized for the
diff --git a/polly/test/CodeGen/non-affine-region-implicit-store.ll b/polly/test/CodeGen/non-affine-region-implicit-store.ll
index cdb2000d90d6b..0ff39d3fe882d 100644
--- a/polly/test/CodeGen/non-affine-region-implicit-store.ll
+++ b/polly/test/CodeGen/non-affine-region-implicit-store.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s
 ;
 ; llvm.org/PR25438
 ; After loop versioning, a dominance check of a non-affine subregion's exit node
diff --git a/polly/test/CodeGen/non-affine-region-phi-references-in-scop-value.ll b/polly/test/CodeGen/non-affine-region-phi-references-in-scop-value.ll
index b4889c76079cc..7df3d8976ea80 100644
--- a/polly/test/CodeGen/non-affine-region-phi-references-in-scop-value.ll
+++ b/polly/test/CodeGen/non-affine-region-phi-references-in-scop-value.ll
@@ -1,4 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-allow-nonaffine-loops -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-allow-nonaffine-loops \
+; RUN: -S < %s | FileCheck %s
 ; This test verifies that values defined in another scop statement and used by
 ; PHI-nodes in non-affine regions are code generated correctly.
diff --git a/polly/test/CodeGen/non-affine-subregion-dominance-reuse.ll b/polly/test/CodeGen/non-affine-subregion-dominance-reuse.ll
index 45465c627f55a..179062dd62d0a 100644
--- a/polly/test/CodeGen/non-affine-subregion-dominance-reuse.ll
+++ b/polly/test/CodeGen/non-affine-subregion-dominance-reuse.ll
@@ -1,4 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S -verify-dom-info < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S -verify-dom-info \
+; RUN: < %s | FileCheck %s
 ;
 ; Check that we do not reuse the B[i-1] GEP created in block S again in
 ; block Q. Hence, we create two GEPs for B[i-1]:
diff --git a/polly/test/CodeGen/non-affine-switch.ll b/polly/test/CodeGen/non-affine-switch.ll
index 90d5efdc3a9f5..427e7e2461f1d 100644
--- a/polly/test/CodeGen/non-affine-switch.ll
+++ b/polly/test/CodeGen/non-affine-switch.ll
@@ -1,4 +1,5 @@
-; RUN: opt %loadNPMPolly -S '-passes=polly<no-default-opts>' < %s | FileCheck %s
+; RUN: opt %loadNPMPolly \
+; RUN: -S -passes=polly-codegen < %s | FileCheck %s
 ;
 ; void f(int *A, int N) {
 ; for (int i = 0; i < N; i++)
diff --git a/polly/test/CodeGen/non-affine-synthesized-in-branch.ll b/polly/test/CodeGen/non-affine-synthesized-in-branch.ll
index 5bb4fd19f4fd1..292c0f2b53941 100644
--- a/polly/test/CodeGen/non-affine-synthesized-in-branch.ll
+++ b/polly/test/CodeGen/non-affine-synthesized-in-branch.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-process-unprofitable '-passes=polly<no-default-opts>' -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-process-unprofitable -passes=polly-codegen -S < %s | FileCheck %s
 ;
 ; llvm.org/PR25412
 ; %synthgep caused %gep to be synthesized in subregion_if which was reused for
diff --git a/polly/test/CodeGen/non-affine-update.ll b/polly/test/CodeGen/non-affine-update.ll
index 582607787eb7d..03f091a405017 100644
--- a/polly/test/CodeGen/non-affine-update.ll
+++ b/polly/test/CodeGen/non-affine-update.ll
@@ -1,4 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;codegen>' -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,polly-codegen' \
+; RUN: -S < %s | FileCheck %s
 ;
 ; void non-affine-update(double A[], double C[], double B[]) {
 ; for (int i = 0; i < 10; i++) {
diff --git a/polly/test/CodeGen/non-hoisted-load-needed-as-base-ptr.ll b/polly/test/CodeGen/non-hoisted-load-needed-as-base-ptr.ll
index eaf74d9c63e0e..153cdb7ed9f6c 100644
--- a/polly/test/CodeGen/non-hoisted-load-needed-as-base-ptr.ll
+++ b/polly/test/CodeGen/non-hoisted-load-needed-as-base-ptr.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -aa-pipeline=tbaa '-passes=polly<no-default-opts>' -disable-output %s
+; RUN: opt %loadNPMPolly -aa-pipeline=tbaa -passes=polly-codegen -disable-output %s
 ;
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/polly/test/CodeGen/non_affine_float_compare.ll b/polly/test/CodeGen/non_affine_float_compare.ll
index 9709e231a4e86..a359b662e6579 100644
--- a/polly/test/CodeGen/non_affine_float_compare.ll
+++ b/polly/test/CodeGen/non_affine_float_compare.ll
@@ -1,4 +1,6 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-allow-nonaffine-branches -S -verify-dom-info < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen \
+; RUN: -polly-allow-nonaffine-branches -S -verify-dom-info \
+; RUN: < %s | FileCheck %s
 ;
 ; void f(float *A) {
 ; for (int i = 0; i < 1024; i++)
diff --git a/polly/test/CodeGen/only_non_affine_error_region.ll b/polly/test/CodeGen/only_non_affine_error_region.ll
index be7a8a23df869..445cef0d6f697 100644
--- a/polly/test/CodeGen/only_non_affine_error_region.ll
+++ b/polly/test/CodeGen/only_non_affine_error_region.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s
 ;
 ; CHECK-NOT: polly.start
 ;
diff --git a/polly/test/CodeGen/openmp_limit_threads.ll b/polly/test/CodeGen/openmp_limit_threads.ll
index 730c57299d569..4c33be3407251 100644
--- a/polly/test/CodeGen/openmp_limit_threads.ll
+++ b/polly/test/CodeGen/openmp_limit_threads.ll
@@ -1,10 +1,10 @@
-; RUN: 
opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-parallel -S < %s | FileCheck %s --check-prefix=AUTO -; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-parallel -polly-num-threads=1 -S < %s | FileCheck %s --check-prefix=ONE -; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-parallel -polly-num-threads=4 -S < %s | FileCheck %s --check-prefix=FOUR +; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-parallel -S < %s | FileCheck %s --check-prefix=AUTO +; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-parallel -polly-num-threads=1 -S < %s | FileCheck %s --check-prefix=ONE +; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-parallel -polly-num-threads=4 -S < %s | FileCheck %s --check-prefix=FOUR -; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-parallel -polly-omp-backend=LLVM -S < %s | FileCheck %s --check-prefix=LIBOMP-AUTO -; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-parallel -polly-omp-backend=LLVM -polly-num-threads=1 -S < %s | FileCheck %s --check-prefix=LIBOMP-ONE -; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-parallel -polly-omp-backend=LLVM -polly-num-threads=4 -S < %s | FileCheck %s --check-prefix=LIBOMP-FOUR +; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-parallel -polly-omp-backend=LLVM -S < %s | FileCheck %s --check-prefix=LIBOMP-AUTO +; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-parallel -polly-omp-backend=LLVM -polly-num-threads=1 -S < %s | FileCheck %s --check-prefix=LIBOMP-ONE +; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-parallel -polly-omp-backend=LLVM -polly-num-threads=4 -S < %s | FileCheck %s --check-prefix=LIBOMP-FOUR ; Ensure that the provided thread numbers are forwarded to the OpenMP calls. ; diff --git a/polly/test/CodeGen/out-of-scop-phi-node-use.ll b/polly/test/CodeGen/out-of-scop-phi-node-use.ll index 8d5f74751af49..dd0a24b14a3b8 100644 --- a/polly/test/CodeGen/out-of-scop-phi-node-use.ll +++ b/polly/test/CodeGen/out-of-scop-phi-node-use.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" diff --git a/polly/test/CodeGen/param_div_div_div_2.ll b/polly/test/CodeGen/param_div_div_div_2.ll index 3ae95020d52dd..8eba6444abb16 100644 --- a/polly/test/CodeGen/param_div_div_div_2.ll +++ b/polly/test/CodeGen/param_div_div_div_2.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s -; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s --check-prefix=IR +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s --check-prefix=IR ; ; Check that we guard the divisions because we moved them and thereby increased ; their domain. 
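A note on the shape of these changes: the RUN-line rewrites in this patch are mechanical, and the recurring mappings (spellings taken verbatim from the hunks in this patch) include:

  '-passes=polly<no-default-opts>'                      ->  -passes=polly-codegen
  '-passes=polly-custom<scops>' -polly-print-scops      ->  '-passes=print<polly-function-scops>'
  '-passes=polly-custom<ast>' -polly-print-ast          ->  '-passes=print<polly-ast>'
  '-passes=polly-custom<detect>' -polly-print-detect    ->  '-passes=print<polly-detect>'
  '-passes=polly-custom<import-jscop;codegen>'          ->  '-passes=polly-import-jscop,polly-codegen'

A representative before/after pair, as seen in the hunks above:

  old: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s
  new: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s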
diff --git a/polly/test/CodeGen/partial_write_array.ll b/polly/test/CodeGen/partial_write_array.ll index fe5fd8cffece7..fad4b21cf3dc8 100644 --- a/polly/test/CodeGen/partial_write_array.ll +++ b/polly/test/CodeGen/partial_write_array.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;codegen>' -polly-import-jscop-postfix=transformed -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,polly-codegen' -polly-import-jscop-postfix=transformed -S < %s | FileCheck %s ; ; Partial write of an array access. ; diff --git a/polly/test/CodeGen/partial_write_emptyset.ll b/polly/test/CodeGen/partial_write_emptyset.ll index d0e5615e4220d..67828808e2fac 100644 --- a/polly/test/CodeGen/partial_write_emptyset.ll +++ b/polly/test/CodeGen/partial_write_emptyset.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;codegen>' -polly-import-jscop-postfix=transformed -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,polly-codegen' -polly-import-jscop-postfix=transformed -S < %s | FileCheck %s ; ; Partial write, where "partial" is the empty set. ; The store is never executed in this case and we do generate it in the diff --git a/polly/test/CodeGen/partial_write_full_write_that_appears_partial.ll b/polly/test/CodeGen/partial_write_full_write_that_appears_partial.ll index a36414297485a..b26bd81b5663b 100644 --- a/polly/test/CodeGen/partial_write_full_write_that_appears_partial.ll +++ b/polly/test/CodeGen/partial_write_full_write_that_appears_partial.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s ; CHECK: polly.stmt.if.then81: ; preds = %polly.stmt.if.end75 ; CHECK-NEXT: store float undef, ptr %fX64, align 4, !alias.scope !0, !noalias !3 diff --git a/polly/test/CodeGen/partial_write_impossible_restriction.ll b/polly/test/CodeGen/partial_write_impossible_restriction.ll index e0069ebc8eae8..7577b137a2750 100644 --- a/polly/test/CodeGen/partial_write_impossible_restriction.ll +++ b/polly/test/CodeGen/partial_write_impossible_restriction.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<import-jscop;codegen>' -polly-import-jscop-postfix=transformed -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-import-jscop,polly-codegen' -polly-import-jscop-postfix=transformed -S < %s | FileCheck %s ; ; The isl scheduler isolates %cond.false into two instances. 
; A partial write access in one of the instances was never executed, diff --git a/polly/test/CodeGen/partial_write_in_region.ll b/polly/test/CodeGen/partial_write_in_region.ll index e7f4225cf9310..7c138c82091e5 100644 --- a/polly/test/CodeGen/partial_write_in_region.ll +++ b/polly/test/CodeGen/partial_write_in_region.ll @@ -1,4 +1,7 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;codegen>' -polly-import-jscop-postfix=transformed -verify-dom-info -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,polly-codegen' \ +; RUN: -polly-import-jscop-postfix=transformed \ +; RUN: -verify-dom-info \ +; RUN: -S < %s | FileCheck %s ; ; void foo(long A[], float B[], float C[]) { ; for (long i = 0; i < 1024; i++) { diff --git a/polly/test/CodeGen/partial_write_in_region_with_loop.ll b/polly/test/CodeGen/partial_write_in_region_with_loop.ll index 85b56fefad809..ba15a7871f431 100644 --- a/polly/test/CodeGen/partial_write_in_region_with_loop.ll +++ b/polly/test/CodeGen/partial_write_in_region_with_loop.ll @@ -1,4 +1,7 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;codegen>' -polly-import-jscop-postfix=transformed -verify-dom-info -polly-allow-nonaffine-loops -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,polly-codegen' \ +; RUN: -polly-import-jscop-postfix=transformed \ +; RUN: -verify-dom-info -polly-allow-nonaffine-loops \ +; RUN: -S < %s | FileCheck %s ; This test verifies that partial writes within non-affine loops are code ; generated correctly. diff --git a/polly/test/CodeGen/partial_write_mapped_scalar.ll b/polly/test/CodeGen/partial_write_mapped_scalar.ll index bb99d4ea086d2..b8c413885cdb0 100644 --- a/polly/test/CodeGen/partial_write_mapped_scalar.ll +++ b/polly/test/CodeGen/partial_write_mapped_scalar.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<import-jscop;codegen>' -polly-import-jscop-postfix=transformed -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-import-jscop,polly-codegen' -polly-import-jscop-postfix=transformed -S < %s | FileCheck %s ; ; Partial write of a (mapped) scalar. ; diff --git a/polly/test/CodeGen/partial_write_mapped_scalar_subregion.ll b/polly/test/CodeGen/partial_write_mapped_scalar_subregion.ll index 37a9d98c6a22e..8c1953a05ad3c 100644 --- a/polly/test/CodeGen/partial_write_mapped_scalar_subregion.ll +++ b/polly/test/CodeGen/partial_write_mapped_scalar_subregion.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;codegen>' -polly-import-jscop-postfix=transformed -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,polly-codegen' -polly-import-jscop-postfix=transformed -S < %s | FileCheck %s ; ; Partial write of a (mapped) scalar in a non-affine subregion. 
; diff --git a/polly/test/CodeGen/perf_monitoring.ll b/polly/test/CodeGen/perf_monitoring.ll index 61f122228c377..4b91e5055c0b1 100644 --- a/polly/test/CodeGen/perf_monitoring.ll +++ b/polly/test/CodeGen/perf_monitoring.ll @@ -1,4 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-codegen-perf-monitoring -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-codegen-perf-monitoring \ +; RUN: -S < %s | FileCheck %s ; void f(long A[], long N) { ; long i; diff --git a/polly/test/CodeGen/perf_monitoring_cycles_per_scop.ll b/polly/test/CodeGen/perf_monitoring_cycles_per_scop.ll index 4c47a12c12904..d5c33d64f3418 100644 --- a/polly/test/CodeGen/perf_monitoring_cycles_per_scop.ll +++ b/polly/test/CodeGen/perf_monitoring_cycles_per_scop.ll @@ -1,4 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-codegen-perf-monitoring -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-codegen-perf-monitoring \ +; RUN: -S < %s | FileCheck %s ; void f(long A[], long N) { ; long i; diff --git a/polly/test/CodeGen/perf_monitoring_trip_counts_per_scop.ll b/polly/test/CodeGen/perf_monitoring_trip_counts_per_scop.ll index 6d09d8bf27ebe..ab99c4d2de062 100644 --- a/polly/test/CodeGen/perf_monitoring_trip_counts_per_scop.ll +++ b/polly/test/CodeGen/perf_monitoring_trip_counts_per_scop.ll @@ -1,4 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-codegen-perf-monitoring -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-codegen-perf-monitoring \ +; RUN: -S < %s | FileCheck %s ; void f(long A[], long N) { ; long i; diff --git a/polly/test/CodeGen/phi-defined-before-scop.ll b/polly/test/CodeGen/phi-defined-before-scop.ll index 2ccd7965bbeaf..447a14e9999c2 100644 --- a/polly/test/CodeGen/phi-defined-before-scop.ll +++ b/polly/test/CodeGen/phi-defined-before-scop.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s ; CHECK-LABEL: polly.merge_new_and_old: ; CHECK-NEXT: %tmp7.ph.merge = phi ptr [ %tmp7.ph.final_reload, %polly.exiting ], [ %tmp7.ph, %bb6.region_exiting ] diff --git a/polly/test/CodeGen/phi_after_error_block_outside_of_scop.ll b/polly/test/CodeGen/phi_after_error_block_outside_of_scop.ll index 1655104b08390..e096aa2f4f8c0 100644 --- a/polly/test/CodeGen/phi_after_error_block_outside_of_scop.ll +++ b/polly/test/CodeGen/phi_after_error_block_outside_of_scop.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s ; Make sure code generation does not break in case an 'error block' is detected ; outside of the scope. In this situation, we should not affect code generation. 
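The partial_write_* tests above all pair '-passes=polly-import-jscop,polly-codegen' with -polly-import-jscop-postfix=transformed. As a sketch of how the pieces fit together (the file-naming scheme is recalled from Polly's JSON importer and should be treated as an assumption, not something this patch states): polly-import-jscop loads a JSON description of the transformed schedule and accesses, named after the function and SCoP with the postfix appended, and polly-codegen then emits code from the imported SCoP rather than the detected one:

  opt %loadNPMPolly '-passes=polly-import-jscop,polly-codegen' \
      -polly-import-jscop-postfix=transformed -S < %s | FileCheck %s

  (expects a sibling file along the lines of <function>___<region>.jscop.transformed)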
diff --git a/polly/test/CodeGen/phi_condition_modeling_1.ll b/polly/test/CodeGen/phi_condition_modeling_1.ll index 1cadac0a5cf73..9d73d8a792558 100644 --- a/polly/test/CodeGen/phi_condition_modeling_1.ll +++ b/polly/test/CodeGen/phi_condition_modeling_1.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -S '-passes=polly<no-default-opts>' < %s | FileCheck %s +; RUN: opt %loadNPMPolly -S -passes=polly-codegen < %s | FileCheck %s ; ; void f(int *A, int c, int N) { ; int tmp; diff --git a/polly/test/CodeGen/phi_condition_modeling_2.ll b/polly/test/CodeGen/phi_condition_modeling_2.ll index 8f2e2a517c96c..2d1364842d735 100644 --- a/polly/test/CodeGen/phi_condition_modeling_2.ll +++ b/polly/test/CodeGen/phi_condition_modeling_2.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -S '-passes=polly<no-default-opts>' < %s | FileCheck %s +; RUN: opt %loadNPMPolly -S -passes=polly-codegen < %s | FileCheck %s ; ; void f(int *A, int c, int N) { ; int tmp; diff --git a/polly/test/CodeGen/phi_conditional_simple_1.ll b/polly/test/CodeGen/phi_conditional_simple_1.ll index 5f0f8de19f223..25bcf2a118ef4 100644 --- a/polly/test/CodeGen/phi_conditional_simple_1.ll +++ b/polly/test/CodeGen/phi_conditional_simple_1.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s --check-prefix=AST -; RUN: opt %loadNPMPolly -S '-passes=polly<no-default-opts>' < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s --check-prefix=AST +; RUN: opt %loadNPMPolly -S -passes=polly-codegen < %s | FileCheck %s ; ; void jd(int *A, int c) { ; for (int i = 0; i < 1024; i++) { diff --git a/polly/test/CodeGen/phi_in_exit_early_lnt_failure_1.ll b/polly/test/CodeGen/phi_in_exit_early_lnt_failure_1.ll index 703e55f15c084..43d29b9ec8649 100644 --- a/polly/test/CodeGen/phi_in_exit_early_lnt_failure_1.ll +++ b/polly/test/CodeGen/phi_in_exit_early_lnt_failure_1.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s ; ; This caused an lnt crash at some point, just verify it will run through. ; diff --git a/polly/test/CodeGen/phi_in_exit_early_lnt_failure_2.ll b/polly/test/CodeGen/phi_in_exit_early_lnt_failure_2.ll index 3d911e0d6a87f..9f28024fcfa0a 100644 --- a/polly/test/CodeGen/phi_in_exit_early_lnt_failure_2.ll +++ b/polly/test/CodeGen/phi_in_exit_early_lnt_failure_2.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s ; ; This caused an lnt crash at some point, just verify it will run through and ; produce the PHI node in the exit we are looking for. diff --git a/polly/test/CodeGen/phi_in_exit_early_lnt_failure_3.ll b/polly/test/CodeGen/phi_in_exit_early_lnt_failure_3.ll index 5f81f52078723..73e99ac0f32c5 100644 --- a/polly/test/CodeGen/phi_in_exit_early_lnt_failure_3.ll +++ b/polly/test/CodeGen/phi_in_exit_early_lnt_failure_3.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s ; ; This caused an lnt crash at some point, just verify it will run through and ; produce the PHI node in the exit we are looking for. 
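phi_conditional_simple_1.ll above shows the common two-RUN pattern after the rewrite: one invocation prints the ISL AST and discards the IR, feeding the AST-prefixed checks, while a second runs codegen and checks the emitted IR:

  opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s --check-prefix=AST
  opt %loadNPMPolly -S -passes=polly-codegen < %s | FileCheck %s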
diff --git a/polly/test/CodeGen/phi_in_exit_early_lnt_failure_5.ll b/polly/test/CodeGen/phi_in_exit_early_lnt_failure_5.ll index abb86e650ce2a..6c9bd56a98722 100644 --- a/polly/test/CodeGen/phi_in_exit_early_lnt_failure_5.ll +++ b/polly/test/CodeGen/phi_in_exit_early_lnt_failure_5.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s ; ; This caused an lnt crash at some point, just verify it will run through and ; produce the PHI node in the exit we are looking for. diff --git a/polly/test/CodeGen/phi_loop_carried_float.ll b/polly/test/CodeGen/phi_loop_carried_float.ll index 47a8a8190c8d9..4cb392d3353d3 100644 --- a/polly/test/CodeGen/phi_loop_carried_float.ll +++ b/polly/test/CodeGen/phi_loop_carried_float.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -S '-passes=polly<no-default-opts>' < %s | FileCheck %s +; RUN: opt %loadNPMPolly -S -passes=polly-codegen < %s | FileCheck %s ; ; float f(float *A, int N) { ; float tmp = 0; diff --git a/polly/test/CodeGen/phi_loop_carried_float_escape.ll b/polly/test/CodeGen/phi_loop_carried_float_escape.ll index 81dd5cecd1878..9fd8ad413128a 100644 --- a/polly/test/CodeGen/phi_loop_carried_float_escape.ll +++ b/polly/test/CodeGen/phi_loop_carried_float_escape.ll @@ -1,6 +1,8 @@ -; RUN: opt %loadNPMPolly -S -polly-analyze-read-only-scalars=false '-passes=polly<no-default-opts>' < %s | FileCheck %s +; RUN: opt %loadNPMPolly -S \ +; RUN: -polly-analyze-read-only-scalars=false -passes=polly-codegen < %s | FileCheck %s -; RUN: opt %loadNPMPolly -S -polly-analyze-read-only-scalars=true '-passes=polly<no-default-opts>' < %s | FileCheck %s +; RUN: opt %loadNPMPolly -S \ +; RUN: -polly-analyze-read-only-scalars=true -passes=polly-codegen < %s | FileCheck %s ; ; float f(float *A, int N) { ; float tmp = 0; diff --git a/polly/test/CodeGen/phi_scalar_simple_1.ll b/polly/test/CodeGen/phi_scalar_simple_1.ll index 6331c24da31b0..80a1c41b83ac0 100644 --- a/polly/test/CodeGen/phi_scalar_simple_1.ll +++ b/polly/test/CodeGen/phi_scalar_simple_1.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -S '-passes=polly<no-default-opts>' < %s | FileCheck %s +; RUN: opt %loadNPMPolly -S -passes=polly-codegen < %s | FileCheck %s ; ; int jd(int *restrict A, int x, int N) { ; for (int i = 1; i < N; i++) diff --git a/polly/test/CodeGen/phi_scalar_simple_2.ll b/polly/test/CodeGen/phi_scalar_simple_2.ll index 0adadf6b90159..614c8acfb9f8e 100644 --- a/polly/test/CodeGen/phi_scalar_simple_2.ll +++ b/polly/test/CodeGen/phi_scalar_simple_2.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -S '-passes=polly<no-default-opts>' < %s | FileCheck %s +; RUN: opt %loadNPMPolly -S -passes=polly-codegen < %s | FileCheck %s ; ; int jd(int *restrict A, int x, int N, int c) { ; for (int i = 0; i < N; i++) diff --git a/polly/test/CodeGen/phi_with_multi_exiting_edges_2.ll b/polly/test/CodeGen/phi_with_multi_exiting_edges_2.ll index 4d6ede638c8f2..7e21666f1db00 100644 --- a/polly/test/CodeGen/phi_with_multi_exiting_edges_2.ll +++ b/polly/test/CodeGen/phi_with_multi_exiting_edges_2.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s ; ; CHECK: polly.merge_new_and_old: ; CHECK: %result.ph.merge = phi float [ %result.ph.final_reload, %polly.exiting ], [ %result.ph, %next.region_exiting ] diff --git a/polly/test/CodeGen/phi_with_one_exit_edge.ll b/polly/test/CodeGen/phi_with_one_exit_edge.ll 
index 4de24fb058c26..36a8684dbc37a 100644 --- a/polly/test/CodeGen/phi_with_one_exit_edge.ll +++ b/polly/test/CodeGen/phi_with_one_exit_edge.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s ; ; ; CHECK: polly.merge_new_and_old: diff --git a/polly/test/CodeGen/pointer-type-expressions-2.ll b/polly/test/CodeGen/pointer-type-expressions-2.ll index 706b01d7f8ca5..918e4c6c9c0b0 100644 --- a/polly/test/CodeGen/pointer-type-expressions-2.ll +++ b/polly/test/CodeGen/pointer-type-expressions-2.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s -; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s -check-prefix=CODEGEN +; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s -check-prefix=CODEGEN target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" define void @foo(ptr %start, ptr %end) { diff --git a/polly/test/CodeGen/pointer-type-expressions.ll b/polly/test/CodeGen/pointer-type-expressions.ll index 2478e2238fd0e..e7feebc163d4b 100644 --- a/polly/test/CodeGen/pointer-type-expressions.ll +++ b/polly/test/CodeGen/pointer-type-expressions.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s -; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s -check-prefix=CODEGEN +; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s -check-prefix=CODEGEN ; void f(int a[], int N, float *P) { ; int i; diff --git a/polly/test/CodeGen/pointer-type-pointer-type-comparison.ll b/polly/test/CodeGen/pointer-type-pointer-type-comparison.ll index cac6f4fdd16f1..9ee050a1e5070 100644 --- a/polly/test/CodeGen/pointer-type-pointer-type-comparison.ll +++ b/polly/test/CodeGen/pointer-type-pointer-type-comparison.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s -; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s -check-prefix=CODEGEN +; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s -check-prefix=CODEGEN ; ; void f(int a[], int N, float *P, float *Q) { diff --git a/polly/test/CodeGen/pointer_rem.ll b/polly/test/CodeGen/pointer_rem.ll index ca5d866ae6cce..b8202318a3eca 100644 --- a/polly/test/CodeGen/pointer_rem.ll +++ b/polly/test/CodeGen/pointer_rem.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly -polly-process-unprofitable '-passes=polly-custom<ast>' -polly-print-scops -polly-print-ast -disable-output -S < %s | FileCheck %s --check-prefix=AST -; RUN: opt %loadNPMPolly -polly-process-unprofitable '-passes=polly-custom<codegen>' -polly-print-scops -S < %s | FileCheck %s --check-prefix=CODEGEN +; RUN: opt %loadNPMPolly -polly-process-unprofitable '-passes=print<polly-function-scops>,scop(print<polly-ast>)' -disable-output -S < %s | FileCheck %s --check-prefix=AST +; RUN: opt %loadNPMPolly -polly-process-unprofitable '-passes=print<polly-function-scops>,scop(polly-codegen)' -S < %s | FileCheck %s --check-prefix=CODEGEN target datalayout = "e-m:e-i64:64-i128:128-n8:16:32:64-S128" target triple 
= "aarch64--linux-gnu" diff --git a/polly/test/CodeGen/pr25241.ll b/polly/test/CodeGen/pr25241.ll index 94be6d7824921..7547b0bbed749 100644 --- a/polly/test/CodeGen/pr25241.ll +++ b/polly/test/CodeGen/pr25241.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s ; PR25241 (https://llvm.org/bugs/show_bug.cgi?id=25241) ; Ensure that synthesized values of a PHI node argument are generated in the diff --git a/polly/test/CodeGen/ptrtoint_as_parameter.ll b/polly/test/CodeGen/ptrtoint_as_parameter.ll index 49a8c38309eb2..a551d810c0802 100644 --- a/polly/test/CodeGen/ptrtoint_as_parameter.ll +++ b/polly/test/CodeGen/ptrtoint_as_parameter.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s ; ; CHECK: if.then260: ; CHECK-NEXT: %p.4 = getelementptr inbounds i8, ptr null, i64 1 diff --git a/polly/test/CodeGen/read-only-scalars.ll b/polly/test/CodeGen/read-only-scalars.ll index 2ae0f9e797bd1..365cbbce495fb 100644 --- a/polly/test/CodeGen/read-only-scalars.ll +++ b/polly/test/CodeGen/read-only-scalars.ll @@ -1,5 +1,9 @@ -; RUN: opt %loadNPMPolly -polly-analyze-read-only-scalars=false '-passes=polly<no-default-opts>' -S < %s | FileCheck %s -; RUN: opt %loadNPMPolly -polly-analyze-read-only-scalars=true '-passes=polly<no-default-opts>' -S < %s | FileCheck %s -check-prefix=SCALAR +; RUN: opt %loadNPMPolly -polly-analyze-read-only-scalars=false -passes=polly-codegen \ +; RUN: \ +; RUN: -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly -polly-analyze-read-only-scalars=true -passes=polly-codegen \ +; RUN: \ +; RUN: -S < %s | FileCheck %s -check-prefix=SCALAR ; CHECK-NOT: alloca diff --git a/polly/test/CodeGen/reduction.ll b/polly/test/CodeGen/reduction.ll index 21d8c0f98b702..8c5f70770a1c5 100644 --- a/polly/test/CodeGen/reduction.ll +++ b/polly/test/CodeGen/reduction.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s 2>&1 | not FileCheck %s +; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s 2>&1 | not FileCheck %s ;#include <string.h> ;#include <stdio.h> diff --git a/polly/test/CodeGen/reduction_2.ll b/polly/test/CodeGen/reduction_2.ll index f9576826b4f77..060a1866870e4 100644 --- a/polly/test/CodeGen/reduction_2.ll +++ b/polly/test/CodeGen/reduction_2.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-invariant-load-hoisting=true '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s --allow-empty +; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-invariant-load-hoisting=true '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s --allow-empty ;#include <string.h> ;#include <stdio.h> diff --git a/polly/test/CodeGen/reduction_simple_binary.ll b/polly/test/CodeGen/reduction_simple_binary.ll index 53cbdf407c954..0fe1085dbbacd 100644 --- a/polly/test/CodeGen/reduction_simple_binary.ll +++ b/polly/test/CodeGen/reduction_simple_binary.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s | FileCheck %s ; ; CHECK: pragma simd reduction ; diff --git a/polly/test/CodeGen/reggen_domtree_crash.ll b/polly/test/CodeGen/reggen_domtree_crash.ll index 9d5ba4c4ff9fb..58c27091a22c3 100644 --- 
a/polly/test/CodeGen/reggen_domtree_crash.ll +++ b/polly/test/CodeGen/reggen_domtree_crash.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-parallel -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-parallel -S < %s | FileCheck %s ; CHECK: define ptr @ham(ptr %arg, i64 %arg1, i1 %arg2) diff --git a/polly/test/CodeGen/region-with-instructions.ll b/polly/test/CodeGen/region-with-instructions.ll index f061ac061e226..e5f7d0f9ef5d6 100644 --- a/polly/test/CodeGen/region-with-instructions.ll +++ b/polly/test/CodeGen/region-with-instructions.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s ; CHECK-LABEL: polly.stmt.bb48: ; CHECK-NEXT: %[[offset:.*]] = shl i64 %polly.indvar, 3 diff --git a/polly/test/CodeGen/region_exiting-domtree.ll b/polly/test/CodeGen/region_exiting-domtree.ll index 16b265c064790..06e0d9df3d951 100644 --- a/polly/test/CodeGen/region_exiting-domtree.ll +++ b/polly/test/CodeGen/region_exiting-domtree.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -verify-dom-info -disable-output < %s +; RUN: opt %loadNPMPolly -passes=polly-codegen -verify-dom-info -disable-output < %s ; Verify that the DominatorTree is preserved correctly for the inserted ; %polly.stmt.exit.exit block, which serves as new exit block for the generated diff --git a/polly/test/CodeGen/region_multiexit_partialwrite.ll b/polly/test/CodeGen/region_multiexit_partialwrite.ll index 9d21d16c9f9cd..39e04dbf93ac7 100644 --- a/polly/test/CodeGen/region_multiexit_partialwrite.ll +++ b/polly/test/CodeGen/region_multiexit_partialwrite.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;codegen>' -polly-import-jscop-postfix=transformed -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,polly-codegen' -polly-import-jscop-postfix=transformed -S < %s | FileCheck %s ; ; This text case has a partial write of PHI in a region-statement. It ; requires that the new PHINode from the region's exiting block is diff --git a/polly/test/CodeGen/run-time-condition-with-scev-parameters.ll b/polly/test/CodeGen/run-time-condition-with-scev-parameters.ll index 7984b7ce80209..4afaab5bbad0a 100644 --- a/polly/test/CodeGen/run-time-condition-with-scev-parameters.ll +++ b/polly/test/CodeGen/run-time-condition-with-scev-parameters.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s --check-prefix=AST -; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s --check-prefix=AST +; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s ; TODO: FIXME: Simplify the context. 
; AST: if (n >= 1 && 0 == n <= -1) diff --git a/polly/test/CodeGen/run-time-condition.ll b/polly/test/CodeGen/run-time-condition.ll index 44d2a4f15b378..914b76f5e0be7 100644 --- a/polly/test/CodeGen/run-time-condition.ll +++ b/polly/test/CodeGen/run-time-condition.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=polly<no-default-opts>' -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -passes=polly-codegen -S < %s | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" diff --git a/polly/test/CodeGen/scalar-references-used-in-scop-compute.ll b/polly/test/CodeGen/scalar-references-used-in-scop-compute.ll index 102ef04128133..77306c1046133 100644 --- a/polly/test/CodeGen/scalar-references-used-in-scop-compute.ll +++ b/polly/test/CodeGen/scalar-references-used-in-scop-compute.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s ; Test the code generation in the presence of a scalar out-of-scop value being ; used from within the SCoP. diff --git a/polly/test/CodeGen/scalar-store-from-same-bb.ll b/polly/test/CodeGen/scalar-store-from-same-bb.ll index 1988f77086c8a..0c1164b245a43 100644 --- a/polly/test/CodeGen/scalar-store-from-same-bb.ll +++ b/polly/test/CodeGen/scalar-store-from-same-bb.ll @@ -1,4 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly \ +; RUN: -passes=polly-codegen -S < %s | FileCheck %s ; This test ensures that the expression N + 1 that is stored in the phi-node ; alloca, is directly computed and not incorrectly transferred through memory. diff --git a/polly/test/CodeGen/scalar_codegen_crash.ll b/polly/test/CodeGen/scalar_codegen_crash.ll index 0179072391a33..375f097283b07 100644 --- a/polly/test/CodeGen/scalar_codegen_crash.ll +++ b/polly/test/CodeGen/scalar_codegen_crash.ll @@ -1,4 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly \ +; RUN: -passes=polly-codegen -S < %s | FileCheck %s ; This test cases used to crash the scalar code generation. Check that we ; can generate code for it. diff --git a/polly/test/CodeGen/scev-backedgetaken.ll b/polly/test/CodeGen/scev-backedgetaken.ll index 09fcfe3e4a09c..e0941690ae489 100644 --- a/polly/test/CodeGen/scev-backedgetaken.ll +++ b/polly/test/CodeGen/scev-backedgetaken.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s ; ; llvm.org/PR48422 ; Use of ScalarEvolution in Codegen not possible because DominatorTree is not updated. diff --git a/polly/test/CodeGen/scev-division-invariant-load.ll b/polly/test/CodeGen/scev-division-invariant-load.ll index 5942ecbe7cee9..70f090eae07b3 100644 --- a/polly/test/CodeGen/scev-division-invariant-load.ll +++ b/polly/test/CodeGen/scev-division-invariant-load.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -S '-passes=polly<no-default-opts>' < %s +; RUN: opt %loadNPMPolly -S -passes=polly-codegen < %s ; ; Check that we generate valid code as we did not use the preloaded ; value of %tmp1 for the access function of the preloaded %tmp4. 
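read-only-scalars.ll above illustrates how a boolean option is covered from a single test: the same codegen pipeline runs once per setting, and the FileCheck prefix selects which expectations apply:

  opt %loadNPMPolly -polly-analyze-read-only-scalars=false -passes=polly-codegen -S < %s | FileCheck %s
  opt %loadNPMPolly -polly-analyze-read-only-scalars=true -passes=polly-codegen -S < %s | FileCheck %s -check-prefix=SCALAR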
diff --git a/polly/test/CodeGen/scev.ll b/polly/test/CodeGen/scev.ll index a09d8c5504b1b..e2b5afda1bfff 100644 --- a/polly/test/CodeGen/scev.ll +++ b/polly/test/CodeGen/scev.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect < %s +; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' < %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" define fastcc void @f () inlinehint align 2 { diff --git a/polly/test/CodeGen/scev_expansion_in_nonaffine.ll b/polly/test/CodeGen/scev_expansion_in_nonaffine.ll index 095c362024a83..0adb0ba7eea81 100644 --- a/polly/test/CodeGen/scev_expansion_in_nonaffine.ll +++ b/polly/test/CodeGen/scev_expansion_in_nonaffine.ll @@ -1,4 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S -polly-invariant-load-hoisting=true < %s | FileCheck %s +; RUN: opt %loadNPMPolly -passes=polly-codegen -S \ +; RUN: -polly-invariant-load-hoisting=true < %s | FileCheck %s ; bugpoint-reduced testcase of MiBench/consumer-lame/quantize-pvt.c from the ; test-suite. diff --git a/polly/test/CodeGen/scev_looking_through_bitcasts.ll b/polly/test/CodeGen/scev_looking_through_bitcasts.ll index 81f4b96d22a37..142e83f820fe7 100644 --- a/polly/test/CodeGen/scev_looking_through_bitcasts.ll +++ b/polly/test/CodeGen/scev_looking_through_bitcasts.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s ; ; Scalar write of bitcasted value. Instead of writing %b of type ; %structty, the SCEV expression looks through the bitcast such that diff --git a/polly/test/CodeGen/scop_expander_insert_point.ll b/polly/test/CodeGen/scop_expander_insert_point.ll index 1cba7567a5e43..fd73132258ddc 100644 --- a/polly/test/CodeGen/scop_expander_insert_point.ll +++ b/polly/test/CodeGen/scop_expander_insert_point.ll @@ -1,4 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S -polly-invariant-load-hoisting=true < %s | FileCheck %s +; RUN: opt %loadNPMPolly -passes=polly-codegen -S \ +; RUN: -polly-invariant-load-hoisting=true < %s | FileCheck %s ; ; CHECK: entry: ; CHECK-NEXT: %outvalue.141.phiops = alloca i64 diff --git a/polly/test/CodeGen/scop_expander_segfault.ll b/polly/test/CodeGen/scop_expander_segfault.ll index 56d37a0175853..d94a1fdfb2c12 100644 --- a/polly/test/CodeGen/scop_expander_segfault.ll +++ b/polly/test/CodeGen/scop_expander_segfault.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S %s | FileCheck %s +; RUN: opt %loadNPMPolly -passes=polly-codegen -S %s | FileCheck %s ; ; This test was extracted from gcc in SPEC2006 and it crashed our code ; generation, or to be more precise, the ScopExpander due to a endless diff --git a/polly/test/CodeGen/scop_never_executed_runtime_check_location.ll b/polly/test/CodeGen/scop_never_executed_runtime_check_location.ll index cdcfe838fa915..9f968e5657c90 100644 --- a/polly/test/CodeGen/scop_never_executed_runtime_check_location.ll +++ b/polly/test/CodeGen/scop_never_executed_runtime_check_location.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s ; Verify that we generate the runtime check code after the conditional branch ; in the SCoP region entering block (here %entry). 
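pointer_rem.ll above uses the one other pipeline shape in this patch, '-passes=print<polly-function-scops>,scop(polly-codegen)'. Reading scop(...) as Polly's nested SCoP pass manager (an inference from the spelling; the patch itself does not explain it), this runs the SCoP printer and codegen over the same detected SCoPs in a single opt invocation:

  opt %loadNPMPolly -polly-process-unprofitable \
      '-passes=print<polly-function-scops>,scop(polly-codegen)' -S < %s | FileCheck %s --check-prefix=CODEGEN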
diff --git a/polly/test/CodeGen/select-base-pointer.ll b/polly/test/CodeGen/select-base-pointer.ll index 144c05b5effba..85be37755c474 100644 --- a/polly/test/CodeGen/select-base-pointer.ll +++ b/polly/test/CodeGen/select-base-pointer.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -aa-pipeline=tbaa '-passes=polly<no-default-opts>' -disable-output %s +; RUN: opt %loadNPMPolly -aa-pipeline=tbaa -passes=polly-codegen -disable-output %s ; ; Check that we do not crash here. ; diff --git a/polly/test/CodeGen/sequential_loops.ll b/polly/test/CodeGen/sequential_loops.ll index eeb3048007859..33a3ee9fbbd47 100644 --- a/polly/test/CodeGen/sequential_loops.ll +++ b/polly/test/CodeGen/sequential_loops.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s ;#include <string.h> ;#define N 1024 diff --git a/polly/test/CodeGen/simple_loop_non_single_exit.ll b/polly/test/CodeGen/simple_loop_non_single_exit.ll index 1b3518bdb0cba..a7e36bc4c7330 100644 --- a/polly/test/CodeGen/simple_loop_non_single_exit.ll +++ b/polly/test/CodeGen/simple_loop_non_single_exit.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s -check-prefix=CHECK-CODE +; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s -check-prefix=CHECK-CODE ; void f(long A[], long N) { ; long i; diff --git a/polly/test/CodeGen/simple_loop_non_single_exit_2.ll b/polly/test/CodeGen/simple_loop_non_single_exit_2.ll index 3af9913e6aa04..22e9da09ef857 100644 --- a/polly/test/CodeGen/simple_loop_non_single_exit_2.ll +++ b/polly/test/CodeGen/simple_loop_non_single_exit_2.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s -check-prefix=CHECK-CODE +; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s -check-prefix=CHECK-CODE ; void f(long A[], long N) { ; long i; diff --git a/polly/test/CodeGen/simple_non_single_entry.ll b/polly/test/CodeGen/simple_non_single_entry.ll index 8800dc7214b06..c33a77ae07939 100644 --- a/polly/test/CodeGen/simple_non_single_entry.ll +++ b/polly/test/CodeGen/simple_non_single_entry.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s -check-prefix=CHECK-CODE +; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s -check-prefix=CHECK-CODE ; void f(long A[], long N) { ; long i; diff --git a/polly/test/CodeGen/simple_nonaffine_loop.ll b/polly/test/CodeGen/simple_nonaffine_loop.ll index 5b1cd1991cd73..bc62047a80a34 100644 --- a/polly/test/CodeGen/simple_nonaffine_loop.ll +++ b/polly/test/CodeGen/simple_nonaffine_loop.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -polly-allow-nonaffine -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -polly-allow-nonaffine -disable-output < %s | FileCheck %s ;#include <stdio.h> ;#include <stdlib.h> diff --git a/polly/test/CodeGen/single_do_loop_int_max_iterations.ll b/polly/test/CodeGen/single_do_loop_int_max_iterations.ll index f0142f726efa4..a65e3a25f035a 100644 --- a/polly/test/CodeGen/single_do_loop_int_max_iterations.ll +++ b/polly/test/CodeGen/single_do_loop_int_max_iterations.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s 
| FileCheck %s ;#define N 20 ;#include "limits.h" diff --git a/polly/test/CodeGen/single_do_loop_int_param_iterations.ll b/polly/test/CodeGen/single_do_loop_int_param_iterations.ll index cc5e7b221026c..acccb48f18a3c 100644 --- a/polly/test/CodeGen/single_do_loop_int_param_iterations.ll +++ b/polly/test/CodeGen/single_do_loop_int_param_iterations.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s ; XFAIL: * ;define N 20 diff --git a/polly/test/CodeGen/single_do_loop_ll_max_iterations.ll b/polly/test/CodeGen/single_do_loop_ll_max_iterations.ll index 1299362369478..7a67f6ba96ce2 100644 --- a/polly/test/CodeGen/single_do_loop_ll_max_iterations.ll +++ b/polly/test/CodeGen/single_do_loop_ll_max_iterations.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s -; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' < %s +; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly -passes=polly-codegen < %s ;#define N 20 ;#include "limits.h" diff --git a/polly/test/CodeGen/single_do_loop_one_iteration.ll b/polly/test/CodeGen/single_do_loop_one_iteration.ll index d025ef2116a40..2d939167b71ee 100644 --- a/polly/test/CodeGen/single_do_loop_one_iteration.ll +++ b/polly/test/CodeGen/single_do_loop_one_iteration.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s ; XFAIL: * ;#define N 20 diff --git a/polly/test/CodeGen/single_do_loop_scev_replace.ll b/polly/test/CodeGen/single_do_loop_scev_replace.ll index b473e266343a3..83c9e9d0324ce 100644 --- a/polly/test/CodeGen/single_do_loop_scev_replace.ll +++ b/polly/test/CodeGen/single_do_loop_scev_replace.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s ;#define N 20 ;#include "limits.h" diff --git a/polly/test/CodeGen/single_loop.ll b/polly/test/CodeGen/single_loop.ll index c04738e6843a0..2db34663e93ce 100644 --- a/polly/test/CodeGen/single_loop.ll +++ b/polly/test/CodeGen/single_loop.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s ;#include <string.h> ;#define N 1024 diff --git a/polly/test/CodeGen/single_loop_int_max_iterations.ll b/polly/test/CodeGen/single_loop_int_max_iterations.ll index 82ec7ffd85462..f83e8823c63df 100644 --- a/polly/test/CodeGen/single_loop_int_max_iterations.ll +++ b/polly/test/CodeGen/single_loop_int_max_iterations.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s ;#define N 20 ;#include "limits.h" diff --git a/polly/test/CodeGen/single_loop_ll_max_iterations.ll b/polly/test/CodeGen/single_loop_ll_max_iterations.ll index 8affb71fad649..1427189d74a7d 100644 --- a/polly/test/CodeGen/single_loop_ll_max_iterations.ll +++ b/polly/test/CodeGen/single_loop_ll_max_iterations.ll @@ -1,4 +1,4 @@ -; RUN: opt 
%loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s ;#include "limits.h" ;#define N 20 diff --git a/polly/test/CodeGen/single_loop_one_iteration.ll b/polly/test/CodeGen/single_loop_one_iteration.ll index 307b8358ff980..1a70d4a879d83 100644 --- a/polly/test/CodeGen/single_loop_one_iteration.ll +++ b/polly/test/CodeGen/single_loop_one_iteration.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s ;#define N 20 ; diff --git a/polly/test/CodeGen/single_loop_param.ll b/polly/test/CodeGen/single_loop_param.ll index 1d78c7a7329d4..44ce1236e9f84 100644 --- a/polly/test/CodeGen/single_loop_param.ll +++ b/polly/test/CodeGen/single_loop_param.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" @A = common global [1024 x i32] zeroinitializer, align 16 ; <ptr> [#uses=3] diff --git a/polly/test/CodeGen/single_loop_param_less_equal.ll b/polly/test/CodeGen/single_loop_param_less_equal.ll index 5fad1d43ae0d7..fda9bfab11b8f 100644 --- a/polly/test/CodeGen/single_loop_param_less_equal.ll +++ b/polly/test/CodeGen/single_loop_param_less_equal.ll @@ -1,6 +1,6 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s -; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s -check-prefix=CODEGEN -; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' < %s | opt -passes='print<loops>' -disable-output 2>&1 | FileCheck %s -check-prefix=LOOPS +; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s -check-prefix=CODEGEN +; RUN: opt %loadNPMPolly -passes=polly-codegen < %s | opt -passes='print<loops>' -disable-output 2>&1 | FileCheck %s -check-prefix=LOOPS target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" @A = common global [1024 x i32] zeroinitializer diff --git a/polly/test/CodeGen/single_loop_param_less_than.ll b/polly/test/CodeGen/single_loop_param_less_than.ll index 75a8cb2094a16..b888c860eacd0 100644 --- a/polly/test/CodeGen/single_loop_param_less_than.ll +++ b/polly/test/CodeGen/single_loop_param_less_than.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s -; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s -check-prefix=CODEGEN +; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s -check-prefix=CODEGEN target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" @A = common global [1024 x i32] zeroinitializer diff --git a/polly/test/CodeGen/single_loop_zero_iterations.ll 
b/polly/test/CodeGen/single_loop_zero_iterations.ll
index 3194dba52190b..b1ce491b5c8a2 100644
--- a/polly/test/CodeGen/single_loop_zero_iterations.ll
+++ b/polly/test/CodeGen/single_loop_zero_iterations.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s -check-prefix=SCALAR --allow-empty
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s -check-prefix=SCALAR --allow-empty
 ;#define N 20
 ;
diff --git a/polly/test/CodeGen/split_edge_of_exit.ll b/polly/test/CodeGen/split_edge_of_exit.ll
index 73d6006a6b621..f4b17e687ada6 100644
--- a/polly/test/CodeGen/split_edge_of_exit.ll
+++ b/polly/test/CodeGen/split_edge_of_exit.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -verify-region-info -disable-output < %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -verify-region-info -disable-output < %s
 ;
 ; This is a scop directly preceded by a region, i.e. the scop's entry is the
 ; region's exit block. This test is to ensure that the RegionInfo is correctly
diff --git a/polly/test/CodeGen/split_edges.ll b/polly/test/CodeGen/split_edges.ll
index 03363f49ce800..b921202285bb2 100644
--- a/polly/test/CodeGen/split_edges.ll
+++ b/polly/test/CodeGen/split_edges.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -verify-region-info -verify-dom-info -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -verify-region-info -verify-dom-info -S < %s | FileCheck %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
 @A = common global [1536 x float] zeroinitializer
diff --git a/polly/test/CodeGen/split_edges_2.ll b/polly/test/CodeGen/split_edges_2.ll
index 59df1618cfd71..8f4d48f5dcb00 100644
--- a/polly/test/CodeGen/split_edges_2.ll
+++ b/polly/test/CodeGen/split_edges_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -verify-region-info -verify-dom-info -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -verify-region-info -verify-dom-info -S < %s | FileCheck %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
 
diff --git a/polly/test/CodeGen/srem-in-other-bb.ll b/polly/test/CodeGen/srem-in-other-bb.ll
index 177d86adb9066..a13a1b6ab98f2 100644
--- a/polly/test/CodeGen/srem-in-other-bb.ll
+++ b/polly/test/CodeGen/srem-in-other-bb.ll
@@ -1,4 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S \
+; RUN: < %s | FileCheck %s
 ;
 ; void pos(float *A, long n) {
 ;   for (long i = 0; i < 100; i++)
diff --git a/polly/test/CodeGen/stack-overflow-in-load-hoisting.ll b/polly/test/CodeGen/stack-overflow-in-load-hoisting.ll
index 5a490b68b9a9f..b49c4e12fe11a 100644
--- a/polly/test/CodeGen/stack-overflow-in-load-hoisting.ll
+++ b/polly/test/CodeGen/stack-overflow-in-load-hoisting.ll
@@ -1,4 +1,5 @@
-; RUN: opt %loadNPMPolly -verify-dom-info '-passes=polly<no-default-opts>' -S -polly-invariant-load-hoisting=true < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -verify-dom-info -passes=polly-codegen -S < %s \
+; RUN: -polly-invariant-load-hoisting=true | FileCheck %s
 ;
 ; This caused an infinite recursion during invariant load hoisting at some
 ; point. Check it does not and we add a "false" runtime check.
diff --git a/polly/test/CodeGen/stmt_split_no_dependence.ll b/polly/test/CodeGen/stmt_split_no_dependence.ll
index d41e4a87bfb65..bb878cc342af8 100644
--- a/polly/test/CodeGen/stmt_split_no_dependence.ll
+++ b/polly/test/CodeGen/stmt_split_no_dependence.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s
 ;
 ; CHECK: store i32 %9, ptr %scevgep, align 4, !alias.scope !3, !noalias !6
 ; CHECK: store i32 %11, ptr %scevgep4, align 4, !alias.scope !6, !noalias !3
diff --git a/polly/test/CodeGen/switch-in-non-affine-region.ll b/polly/test/CodeGen/switch-in-non-affine-region.ll
index 6696efca63f02..1a9e7081bebdc 100644
--- a/polly/test/CodeGen/switch-in-non-affine-region.ll
+++ b/polly/test/CodeGen/switch-in-non-affine-region.ll
@@ -1,4 +1,5 @@
-; RUN: opt %loadNPMPolly -S '-passes=polly<no-default-opts>' < %s | FileCheck %s
+; RUN: opt %loadNPMPolly \
+; RUN: -S -passes=polly-codegen < %s | FileCheck %s
 ;
 ; void f(int *A, int N) {
 ;   for (int i = 0; i < N; i++)
diff --git a/polly/test/CodeGen/synthesizable_phi_write_after_loop.ll b/polly/test/CodeGen/synthesizable_phi_write_after_loop.ll
index 86395f25db1a8..b2a062363eef4 100644
--- a/polly/test/CodeGen/synthesizable_phi_write_after_loop.ll
+++ b/polly/test/CodeGen/synthesizable_phi_write_after_loop.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s
 ;
 ; Check for the correct written value of a scalar phi write whose value is
 ; defined within the loop, but its effective value is its last definition when
diff --git a/polly/test/CodeGen/test-invalid-operands-for-select-2.ll b/polly/test/CodeGen/test-invalid-operands-for-select-2.ll
index b5172badd76dc..5668063c27c8e 100644
--- a/polly/test/CodeGen/test-invalid-operands-for-select-2.ll
+++ b/polly/test/CodeGen/test-invalid-operands-for-select-2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -S '-passes=polly<no-default-opts>' -verify-loop-info < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -S -passes=polly-codegen -verify-loop-info < %s | FileCheck %s
 ;
 ; Check that we do not crash as described here: http://llvm.org/bugs/show_bug.cgi?id=21167
 ;
diff --git a/polly/test/CodeGen/test-invalid-operands-for-select.ll b/polly/test/CodeGen/test-invalid-operands-for-select.ll
index 39cadc78f7e36..fdc98fbb4d9e7 100644
--- a/polly/test/CodeGen/test-invalid-operands-for-select.ll
+++ b/polly/test/CodeGen/test-invalid-operands-for-select.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -S '-passes=polly<no-default-opts>' < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -S -passes=polly-codegen < %s | FileCheck %s
 ;
 ; Check that we do not crash as described here: http://llvm.org/PR21167
 ;
diff --git a/polly/test/CodeGen/test.ll b/polly/test/CodeGen/test.ll
index 7c28ca4860e79..aad998ba2728b 100644
--- a/polly/test/CodeGen/test.ll
+++ b/polly/test/CodeGen/test.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s
 ; XFAIL: *
 
 ;int bar1();
diff --git a/polly/test/CodeGen/two-loops-right-after-each-other-2.ll b/polly/test/CodeGen/two-loops-right-after-each-other-2.ll
index d97a632fc382e..1c68389eaeba8 100644
--- a/polly/test/CodeGen/two-loops-right-after-each-other-2.ll
+++ b/polly/test/CodeGen/two-loops-right-after-each-other-2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s
 
 ; CHECK: polly.merge_new_and_old:
 ; CHECK-NEXT: merge = phi
diff --git a/polly/test/CodeGen/two-scops-in-row-invalidate-scevs.ll b/polly/test/CodeGen/two-scops-in-row-invalidate-scevs.ll
index 845d106d43b0e..4396c38310dce 100644
--- a/polly/test/CodeGen/two-scops-in-row-invalidate-scevs.ll
+++ b/polly/test/CodeGen/two-scops-in-row-invalidate-scevs.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s
 ;
 ; CHECK-LABEL: for.cond:
 ; CHECK: %num.0 = phi i32 [ %add, %for.body15 ], [ 0, %for.cond.pre_entry_bb ]
diff --git a/polly/test/CodeGen/two-scops-in-row.ll b/polly/test/CodeGen/two-scops-in-row.ll
index 4b9d49cb02ec6..dd3f310ef1502 100644
--- a/polly/test/CodeGen/two-scops-in-row.ll
+++ b/polly/test/CodeGen/two-scops-in-row.ll
@@ -1,6 +1,6 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -polly-ignore-aliasing -disable-output < %s | FileCheck %s -check-prefix=SCALAR
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-ignore-aliasing -disable-output < %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -polly-ignore-aliasing -disable-output < %s | FileCheck %s -check-prefix=SCALAR
+; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-ignore-aliasing -disable-output < %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
 ; SCALAR: if (
diff --git a/polly/test/CodeGen/udiv_expansion_position.ll b/polly/test/CodeGen/udiv_expansion_position.ll
index 2a3ba8ae45757..354e3cd180107 100644
--- a/polly/test/CodeGen/udiv_expansion_position.ll
+++ b/polly/test/CodeGen/udiv_expansion_position.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s
 ;
 ; Verify we do not crash when we synthesize code for the udiv in the SCoP.
 ;
diff --git a/polly/test/CodeGen/uninitialized_scalar_memory.ll b/polly/test/CodeGen/uninitialized_scalar_memory.ll
index ad0e6ca7e350b..e08af07e604e8 100644
--- a/polly/test/CodeGen/uninitialized_scalar_memory.ll
+++ b/polly/test/CodeGen/uninitialized_scalar_memory.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -S '-passes=polly<no-default-opts>' < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -S -passes=polly-codegen < %s | FileCheck %s
 ;
 ; Verify we initialize the scalar locations reserved for the incoming phi
 ; values.
diff --git a/polly/test/CodeGen/unpredictable-loop-unsynthesizable.ll b/polly/test/CodeGen/unpredictable-loop-unsynthesizable.ll
index e7f4d601edab5..46706804a81b0 100644
--- a/polly/test/CodeGen/unpredictable-loop-unsynthesizable.ll
+++ b/polly/test/CodeGen/unpredictable-loop-unsynthesizable.ll
@@ -1,5 +1,7 @@
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=true -disable-output < %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' \
+; RUN: -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -passes=polly-codegen \
+; RUN: -polly-invariant-load-hoisting=true -disable-output < %s
 
 ; The loop for.body is a scop with invariant load hoisting, but does not
 ; terminate predictably for ScalarEvolution. The scalar %1 therefore is not
diff --git a/polly/test/CodeGen/variant_load_empty_domain.ll b/polly/test/CodeGen/variant_load_empty_domain.ll
index d1f4450d086e0..6f2d3dc582db3 100644
--- a/polly/test/CodeGen/variant_load_empty_domain.ll
+++ b/polly/test/CodeGen/variant_load_empty_domain.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -disable-output < %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -disable-output < %s
 ;
 ;
 ; void f(int *A) {
diff --git a/polly/test/CodeGen/whole-scop-non-affine-subregion.ll b/polly/test/CodeGen/whole-scop-non-affine-subregion.ll
index 44f6dbcd34d1d..b342b1cb5aa27 100644
--- a/polly/test/CodeGen/whole-scop-non-affine-subregion.ll
+++ b/polly/test/CodeGen/whole-scop-non-affine-subregion.ll
@@ -1,4 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly \
+; RUN: -passes=polly-codegen -S < %s | FileCheck %s
 ; CHECK: polly.start
 
 ; int /* pure */ g()
diff --git a/polly/test/DeLICM/confused_order.ll b/polly/test/DeLICM/confused_order.ll
index de340ef48d16e..0c19eb6aa605a 100644
--- a/polly/test/DeLICM/confused_order.ll
+++ b/polly/test/DeLICM/confused_order.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<delicm>' -polly-print-delicm -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck %s
-; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop>' -polly-import-jscop-postfix=transformed -disable-output -pass-remarks-missed=polly-delicm < %s 2>&1 | FileCheck %s -check-prefix=REMARKS
+; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-delicm>' -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,polly-delicm' -polly-import-jscop-postfix=transformed -disable-output -pass-remarks-missed=polly-delicm < %s 2>&1 | FileCheck %s -check-prefix=REMARKS
 ;
 ; ForwardOptree changes the SCoP and may already map some accesses.
 ; DeLICM must be prepared to encounter implicit reads
diff --git a/polly/test/DeLICM/contradicting_assumed_context_and_domain.ll b/polly/test/DeLICM/contradicting_assumed_context_and_domain.ll
index ba42692febab2..66d9ae889e657 100644
--- a/polly/test/DeLICM/contradicting_assumed_context_and_domain.ll
+++ b/polly/test/DeLICM/contradicting_assumed_context_and_domain.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<delicm>' -polly-print-delicm -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-delicm>' -disable-output < %s | FileCheck %s
 ;
 ; The domain of bb14 contradicts the SCoP's assumptions. This leads to
 ; 'anything goes' inside the statement since it is never executed,
diff --git a/polly/test/DeLICM/load-in-cond-inf-loop.ll b/polly/test/DeLICM/load-in-cond-inf-loop.ll
index 19cc334f70054..a78a4691bb0d5 100644
--- a/polly/test/DeLICM/load-in-cond-inf-loop.ll
+++ b/polly/test/DeLICM/load-in-cond-inf-loop.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<delicm>' -polly-print-delicm -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-delicm>' -disable-output < %s | FileCheck %s
 
 ; When %b is 0, %for.body13 is an infinite loop. In this case the loaded
 ; value %1 is not used anywhere.
diff --git a/polly/test/DeLICM/map_memset_zero.ll b/polly/test/DeLICM/map_memset_zero.ll
index cc4e0ab387d2a..9a8e5989fdad1 100644
--- a/polly/test/DeLICM/map_memset_zero.ll
+++ b/polly/test/DeLICM/map_memset_zero.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<delicm>' -polly-print-delicm -disable-output < %s | FileCheck -match-full-lines %s
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<delicm>' -polly-print-delicm -disable-output < %s | FileCheck -match-full-lines %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-delicm>' -disable-output < %s | FileCheck -match-full-lines %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-delicm>' -disable-output < %s | FileCheck -match-full-lines %s
 ;
 ; Check that PHI mapping works even in the presence of a memset whose
 ; zero value is used.
diff --git a/polly/test/DeLICM/nomap_alreadymapped.ll b/polly/test/DeLICM/nomap_alreadymapped.ll
index 9e49300381b57..da5f4ec24a47e 100644
--- a/polly/test/DeLICM/nomap_alreadymapped.ll
+++ b/polly/test/DeLICM/nomap_alreadymapped.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<delicm>' -polly-print-delicm -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-delicm>' -disable-output < %s | FileCheck %s
 ;
 ; void func(double *A) {
 ;   for (int j = 0; j < 2; j += 1) { /* outer */
diff --git a/polly/test/DeLICM/nomap_escaping.ll b/polly/test/DeLICM/nomap_escaping.ll
index 6460dbdb808fb..60955368fe59c 100644
--- a/polly/test/DeLICM/nomap_escaping.ll
+++ b/polly/test/DeLICM/nomap_escaping.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<delicm>' -polly-print-delicm -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-delicm>' -disable-output < %s | FileCheck %s
 ;
 ; void func(double *A) {
 ;   for (int j = 0; j < 2; j += 1) { /* outer */
diff --git a/polly/test/DeLICM/nomap_occupied.ll b/polly/test/DeLICM/nomap_occupied.ll
index 72eea57b8fdf5..9ba8ce2641231 100644
--- a/polly/test/DeLICM/nomap_occupied.ll
+++ b/polly/test/DeLICM/nomap_occupied.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<delicm>' -polly-print-delicm -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-delicm>' -disable-output < %s | FileCheck %s
 ;
 ; void func(double *A) {
 ;   for (int j = 0; j < 2; j += 1) { /* outer */
diff --git a/polly/test/DeLICM/nomap_readonly.ll b/polly/test/DeLICM/nomap_readonly.ll
index 67bac06f1505f..7a185d336bad3 100644
--- a/polly/test/DeLICM/nomap_readonly.ll
+++ b/polly/test/DeLICM/nomap_readonly.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<delicm>' -polly-print-delicm -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-delicm>' -disable-output < %s | FileCheck %s
 ;
 ; void func(double *A) {
 ;   fsomeval = 21.0 + 21.0;
diff --git a/polly/test/DeLICM/nomap_spuriouswrite.ll b/polly/test/DeLICM/nomap_spuriouswrite.ll
index f3fcb0ccd06e4..0ed7f6ee8e239 100644
--- a/polly/test/DeLICM/nomap_spuriouswrite.ll
+++ b/polly/test/DeLICM/nomap_spuriouswrite.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<delicm>' -polly-print-delicm -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-delicm>' -disable-output < %s | FileCheck %s
 ;
 ; void func(double *A) {
 ;   for (int j = 0; j < 2; j += 1) { /* outer */
diff --git a/polly/test/DeLICM/nomap_storagesize.ll b/polly/test/DeLICM/nomap_storagesize.ll
index 0f2943a5b1417..bf851ac342d20 100644
--- a/polly/test/DeLICM/nomap_storagesize.ll
+++ b/polly/test/DeLICM/nomap_storagesize.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<delicm>' -polly-print-delicm -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-delicm>' -disable-output < %s | FileCheck %s
 ;
 ; void func(float *A) {
 ;   for (int j = 0; j < 2; j += 1) { /* outer */
diff --git a/polly/test/DeLICM/nomap_writewrite.ll b/polly/test/DeLICM/nomap_writewrite.ll
index fc8459a34972c..9fcd52aad743c 100644
--- a/polly/test/DeLICM/nomap_writewrite.ll
+++ b/polly/test/DeLICM/nomap_writewrite.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<delicm>' -polly-print-delicm -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-delicm>' -disable-output < %s | FileCheck %s
 ;
 ; void func(double *A) {
 ;   for (int j = 0; j < 2; j += 1) { /* outer */
diff --git a/polly/test/DeLICM/outofquota-reverseDomain.ll b/polly/test/DeLICM/outofquota-reverseDomain.ll
index d48665bdc29c1..1f7527c841208 100644
--- a/polly/test/DeLICM/outofquota-reverseDomain.ll
+++ b/polly/test/DeLICM/outofquota-reverseDomain.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-delicm-max-ops=1000000 '-passes=polly-custom<delicm>' -polly-print-delicm -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-delicm-max-ops=1000000 '-passes=print<polly-delicm>' -disable-output < %s | FileCheck %s
 ;
 ; This causes an assertion to fail on out-of-quota after 1000000 operations.
 ; (The error was specific to -polly-delicm-max-ops=1000000 and changes
diff --git a/polly/test/DeLICM/pass_existence.ll b/polly/test/DeLICM/pass_existence.ll
index d784655db60f3..64302d9983261 100644
--- a/polly/test/DeLICM/pass_existence.ll
+++ b/polly/test/DeLICM/pass_existence.ll
@@ -1,6 +1,6 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<delicm>' -disable-output < %s
-; RUN: opt %loadNPMPolly '-passes=polly-custom<delicm>' -polly-print-delicm -disable-output < %s | FileCheck %s
-; RUN: opt %loadNPMPolly '-passes=polly-custom<delicm>' -polly-print-delicm -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-delicm -disable-output < %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-delicm>' -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=scop(print<polly-delicm>)' -disable-output < %s | FileCheck %s
 ;
 ; Simple test for the existence of the DeLICM pass.
 ;
diff --git a/polly/test/DeLICM/pr41656.ll b/polly/test/DeLICM/pr41656.ll
index 82799e4fd1ab8..2a92503809a24 100644
--- a/polly/test/DeLICM/pr41656.ll
+++ b/polly/test/DeLICM/pr41656.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<delicm>' -polly-print-scops -polly-print-delicm -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>,scop(print<polly-delicm>)' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; llvm.org/PR41656
 ;
diff --git a/polly/test/DeLICM/pr48783.ll b/polly/test/DeLICM/pr48783.ll
index 10f8b64c3dd2f..deba8bfcc5daf 100644
--- a/polly/test/DeLICM/pr48783.ll
+++ b/polly/test/DeLICM/pr48783.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<delicm>' -polly-print-scops -polly-print-delicm -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>,scop(print<polly-delicm>)' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; llvm.org/PR48783
 ;
diff --git a/polly/test/DeLICM/reduction.ll b/polly/test/DeLICM/reduction.ll
index 5d6531f51d570..29b7a3617300b 100644
--- a/polly/test/DeLICM/reduction.ll
+++ b/polly/test/DeLICM/reduction.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-delicm-partial-writes=true '-passes=polly-custom<delicm>' -polly-print-delicm -disable-output < %s | FileCheck -match-full-lines %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-delicm-partial-writes=true '-passes=print<polly-delicm>' -disable-output < %s | FileCheck -match-full-lines %s
 ;
 ; void func(double *A) {
 ;   for (int j = 0; j < 2; j += 1) { /* outer */
diff --git a/polly/test/DeLICM/reduction_constant_selfconflict.ll b/polly/test/DeLICM/reduction_constant_selfconflict.ll
index 223a429d76343..012e0a0794b2b 100644
--- a/polly/test/DeLICM/reduction_constant_selfconflict.ll
+++ b/polly/test/DeLICM/reduction_constant_selfconflict.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<flatten;delicm>' -polly-delicm-compute-known=true -polly-print-delicm -disable-output < %s | FileCheck %s
+; RUN: opt %loadPolly -polly-flatten-schedule -polly-delicm-compute-known=true -polly-print-delicm -disable-output < %s | FileCheck %s
 ;
 ; void func(double *A) {
 ;   for (int j = 0; j < 2; j += 1) { /* outer */
diff --git a/polly/test/DeLICM/reduction_looprotate.ll b/polly/test/DeLICM/reduction_looprotate.ll
index b8eefe5e57cf8..341cc091f7e18 100644
--- a/polly/test/DeLICM/reduction_looprotate.ll
+++ b/polly/test/DeLICM/reduction_looprotate.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<flatten;delicm>' -polly-print-delicm -disable-output < %s | FileCheck %s
+; RUN: opt %loadPolly -polly-flatten-schedule -polly-print-delicm -disable-output < %s | FileCheck %s
 ;
 ; void func(double *A) {
 ;   for (int j = 0; j < 2; j += 1) { /* outer */
diff --git a/polly/test/DeLICM/reduction_looprotate_alwaystaken.ll b/polly/test/DeLICM/reduction_looprotate_alwaystaken.ll
index 627a4452c3f90..a58eabb4fbd82 100644
--- a/polly/test/DeLICM/reduction_looprotate_alwaystaken.ll
+++ b/polly/test/DeLICM/reduction_looprotate_alwaystaken.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<flatten;delicm>' -polly-delicm-overapproximate-writes=true -polly-delicm-compute-known=true -polly-print-delicm -disable-output < %s | FileCheck %s
+; RUN: opt %loadPolly -polly-flatten-schedule -polly-delicm-overapproximate-writes=true -polly-delicm-compute-known=true -polly-print-delicm -disable-output < %s | FileCheck %s
 ;
 ; Verify that delicm can cope with never taken PHI incoming edges.
 ; The edge %body -> %body_phi is never taken, hence the access MemoryKind::PHI,
diff --git a/polly/test/DeLICM/reduction_looprotate_gvnpre.ll b/polly/test/DeLICM/reduction_looprotate_gvnpre.ll
index 1d3a789f7ce07..5a81441cf0eea 100644
--- a/polly/test/DeLICM/reduction_looprotate_gvnpre.ll
+++ b/polly/test/DeLICM/reduction_looprotate_gvnpre.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<flatten;delicm>' -polly-delicm-overapproximate-writes=true -polly-delicm-compute-known=true -polly-print-delicm -disable-output < %s | FileCheck %s
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<flatten;delicm>' -polly-delicm-partial-writes=true -polly-delicm-compute-known=true -polly-print-delicm -disable-output < %s | FileCheck -check-prefix=PARTIAL %s
+; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-flatten-schedule -polly-delicm-overapproximate-writes=true -polly-delicm-compute-known=true -polly-print-delicm -disable-output < %s | FileCheck %s
+; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-flatten-schedule -polly-delicm-partial-writes=true -polly-delicm-compute-known=true -polly-print-delicm -disable-output < %s | FileCheck -check-prefix=PARTIAL %s
 ;
 ; void func(double *A) {
 ;   for (int j = 0; j < 2; j += 1) { /* outer */
diff --git a/polly/test/DeLICM/reduction_looprotate_gvnpre_cond1.ll b/polly/test/DeLICM/reduction_looprotate_gvnpre_cond1.ll
index 37499cd73020f..d9c5268e631df 100644
--- a/polly/test/DeLICM/reduction_looprotate_gvnpre_cond1.ll
+++ b/polly/test/DeLICM/reduction_looprotate_gvnpre_cond1.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<delicm>' -polly-print-delicm -disable-output < %s | FileCheck %s -match-full-lines
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-delicm>' -disable-output < %s | FileCheck %s -match-full-lines
 ;
 ; Load (but not store) of A[j] hoisted, reduction only over some iterations.
 ;
diff --git a/polly/test/DeLICM/reduction_looprotate_gvnpre_cond2.ll b/polly/test/DeLICM/reduction_looprotate_gvnpre_cond2.ll
index 79a700ff122e2..6a4223f5af655 100644
--- a/polly/test/DeLICM/reduction_looprotate_gvnpre_cond2.ll
+++ b/polly/test/DeLICM/reduction_looprotate_gvnpre_cond2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<delicm>' -polly-print-delicm -disable-output < %s | FileCheck %s -match-full-lines
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-delicm>' -disable-output < %s | FileCheck %s -match-full-lines
 ;
 ; Load (but not store) of A[j] hoisted, reduction not written in all iterations.
 ; FIXME: %join is not mapped because the MemoryKind::Value mapping does not
diff --git a/polly/test/DeLICM/reduction_looprotate_gvnpre_nopreheader.ll b/polly/test/DeLICM/reduction_looprotate_gvnpre_nopreheader.ll
index 7e82daa9f80fc..bf4b8018d5526 100644
--- a/polly/test/DeLICM/reduction_looprotate_gvnpre_nopreheader.ll
+++ b/polly/test/DeLICM/reduction_looprotate_gvnpre_nopreheader.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<delicm>' -polly-print-delicm -disable-output < %s | FileCheck %s -match-full-lines
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-delicm>' -disable-output < %s | FileCheck %s -match-full-lines
 ;
 ; Hoisted reduction load (but not the store) without preheader.
 ;
diff --git a/polly/test/DeLICM/reduction_looprotate_hoisted.ll b/polly/test/DeLICM/reduction_looprotate_hoisted.ll
index 7dc6e0fa9e408..795b94912aa42 100644
--- a/polly/test/DeLICM/reduction_looprotate_hoisted.ll
+++ b/polly/test/DeLICM/reduction_looprotate_hoisted.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-invariant-load-hoisting '-passes=polly-custom<flatten;delicm>' -polly-delicm-overapproximate-writes=true -polly-delicm-compute-known=true -polly-print-delicm -disable-output < %s | FileCheck %s
+; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-invariant-load-hoisting -polly-flatten-schedule -polly-delicm-overapproximate-writes=true -polly-delicm-compute-known=true -polly-print-delicm -disable-output < %s | FileCheck %s
 ;
 ; void func(int *A, int* StartPtr) {
 ;   for (int j = 0; j < 2; j += 1) { /* outer */
diff --git a/polly/test/DeLICM/reduction_looprotate_licm.ll b/polly/test/DeLICM/reduction_looprotate_licm.ll
index a9c55a8f54087..935f31abced30 100644
--- a/polly/test/DeLICM/reduction_looprotate_licm.ll
+++ b/polly/test/DeLICM/reduction_looprotate_licm.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<flatten;delicm>' -polly-delicm-overapproximate-writes=true -polly-delicm-compute-known=true -polly-print-delicm -disable-output < %s | FileCheck %s
+; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-flatten-schedule -polly-delicm-overapproximate-writes=true -polly-delicm-compute-known=true -polly-print-delicm -disable-output < %s | FileCheck %s
 ;
 ; void func(double *A) {
 ;   for (int j = 0; j < 2; j += 1) { /* outer */
diff --git a/polly/test/DeLICM/reduction_looprotate_licm2.ll b/polly/test/DeLICM/reduction_looprotate_licm2.ll
index b98950b71bc85..8b06e7466f20a 100644
--- a/polly/test/DeLICM/reduction_looprotate_licm2.ll
+++ b/polly/test/DeLICM/reduction_looprotate_licm2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<flatten;delicm>' -polly-print-delicm -disable-output < %s | FileCheck %s
+; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-flatten-schedule -polly-print-delicm -disable-output < %s | FileCheck %s
 ;
 ; Use %phi instead of the normal %add; that is, the last iteration will
 ; be ignored such that %phi cannot be written to A[3] in %body.
diff --git a/polly/test/DeLICM/reduction_looprotate_licm_double_write.ll b/polly/test/DeLICM/reduction_looprotate_licm_double_write.ll
index 4424d904b607d..51bb7291a73ed 100644
--- a/polly/test/DeLICM/reduction_looprotate_licm_double_write.ll
+++ b/polly/test/DeLICM/reduction_looprotate_licm_double_write.ll
@@ -1,4 +1,7 @@
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<flatten;delicm>' -polly-delicm-overapproximate-writes=true -polly-delicm-compute-known=true -polly-print-delicm -disable-output < %s | FileCheck %s
+; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-flatten-schedule \
+; RUN: -polly-delicm-overapproximate-writes=true \
+; RUN: -polly-delicm-compute-known=true -polly-print-delicm \
+; RUN: -disable-output < %s | FileCheck %s
 ;
 ; Make sure delicm works even in the case of two stores that store the same value.
 ;
diff --git a/polly/test/DeLICM/reduction_looprotate_licm_nopreheader.ll b/polly/test/DeLICM/reduction_looprotate_licm_nopreheader.ll
index 7d20b8d5c7cbf..027df44e86193 100644
--- a/polly/test/DeLICM/reduction_looprotate_licm_nopreheader.ll
+++ b/polly/test/DeLICM/reduction_looprotate_licm_nopreheader.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<delicm>' -polly-print-delicm -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-delicm>' -disable-output < %s | FileCheck %s
 ;
 ; Register-promoted reduction but without preheader.
 ;
diff --git a/polly/test/DeLICM/reduction_looprotate_load.ll b/polly/test/DeLICM/reduction_looprotate_load.ll
index e288a86f30719..6aa83ae195031 100644
--- a/polly/test/DeLICM/reduction_looprotate_load.ll
+++ b/polly/test/DeLICM/reduction_looprotate_load.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<flatten;delicm>' -polly-delicm-overapproximate-writes=true -polly-delicm-compute-known=true -polly-print-delicm -disable-output < %s | FileCheck %s
+; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-flatten-schedule -polly-delicm-overapproximate-writes=true -polly-delicm-compute-known=true -polly-print-delicm -disable-output < %s | FileCheck %s
 ;
 ; void func(int *A, double* StartPtr) {
 ;   for (int j = 0; j < 2; j += 1) { /* outer */
diff --git a/polly/test/DeLICM/reduction_looprotate_loopguard_gvnpre.ll b/polly/test/DeLICM/reduction_looprotate_loopguard_gvnpre.ll
index 4582f0a36eb5c..4ea3fa53a339a 100644
--- a/polly/test/DeLICM/reduction_looprotate_loopguard_gvnpre.ll
+++ b/polly/test/DeLICM/reduction_looprotate_loopguard_gvnpre.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<delicm>' -polly-print-delicm -disable-output < %s | FileCheck %s -match-full-lines
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-delicm>' -disable-output < %s | FileCheck %s -match-full-lines
 ;
 ; Reduction over parametric number of elements and a loopguard if the
 ; reduction loop is not executed at all. Load hoisted before loop.
diff --git a/polly/test/DeLICM/reduction_looprotate_loopguard_licm1.ll b/polly/test/DeLICM/reduction_looprotate_loopguard_licm1.ll
index 7df2885e01339..2e7abe444ad65 100644
--- a/polly/test/DeLICM/reduction_looprotate_loopguard_licm1.ll
+++ b/polly/test/DeLICM/reduction_looprotate_loopguard_licm1.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<delicm>' -polly-print-delicm -disable-output < %s | FileCheck %s -match-full-lines
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-delicm>' -disable-output < %s | FileCheck %s -match-full-lines
 ;
 ; Reduction over parametric number of elements and a loopguard if the
 ; reduction loop is not executed at all.
diff --git a/polly/test/DeLICM/reduction_looprotate_loopguard_licm2.ll b/polly/test/DeLICM/reduction_looprotate_loopguard_licm2.ll
index a1bd5d3f90fe7..60afdeb5fc97e 100644
--- a/polly/test/DeLICM/reduction_looprotate_loopguard_licm2.ll
+++ b/polly/test/DeLICM/reduction_looprotate_loopguard_licm2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<delicm>' -polly-print-delicm -disable-output < %s | FileCheck %s -match-full-lines
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-delicm>' -disable-output < %s | FileCheck %s -match-full-lines
 ;
 ; Reduction over parametric number of elements and a loopguard if the
 ; reduction loop is not executed at all, such that A[j] is also not written to.
diff --git a/polly/test/DeLICM/reduction_looprotate_loopguard_licm3.ll b/polly/test/DeLICM/reduction_looprotate_loopguard_licm3.ll
index 8329a85ecf13b..e63b457de92db 100644
--- a/polly/test/DeLICM/reduction_looprotate_loopguard_licm3.ll
+++ b/polly/test/DeLICM/reduction_looprotate_loopguard_licm3.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<delicm>' -polly-print-delicm -disable-output < %s | FileCheck %s -match-full-lines
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-delicm>' -disable-output < %s | FileCheck %s -match-full-lines
 ;
 ; Reduction over parametric number of elements and a loopguard if the
 ; reduction loop is not executed at all, such that A[j] is also not accessed.
diff --git a/polly/test/DeLICM/reduction_looprotate_readonly.ll b/polly/test/DeLICM/reduction_looprotate_readonly.ll
index 5227f42ae4824..a9535467b3bde 100644
--- a/polly/test/DeLICM/reduction_looprotate_readonly.ll
+++ b/polly/test/DeLICM/reduction_looprotate_readonly.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<flatten;delicm>' -polly-delicm-overapproximate-writes=true -polly-delicm-compute-known=true -polly-print-delicm -disable-output < %s | FileCheck %s
+; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-flatten-schedule -polly-delicm-overapproximate-writes=true -polly-delicm-compute-known=true -polly-print-delicm -disable-output < %s | FileCheck %s
 ;
 ; void func(double *A, double Start) {
 ;   for (int j = 0; j < 2; j += 1) { /* outer */
diff --git a/polly/test/DeLICM/reduction_looprotate_synthesizable.ll b/polly/test/DeLICM/reduction_looprotate_synthesizable.ll
index 77d823c8ef6d5..3d486910c8612 100644
--- a/polly/test/DeLICM/reduction_looprotate_synthesizable.ll
+++ b/polly/test/DeLICM/reduction_looprotate_synthesizable.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<flatten;delicm>' -polly-delicm-overapproximate-writes=true -polly-delicm-compute-known=true -polly-print-delicm -disable-output < %s | FileCheck %s
+; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-flatten-schedule -polly-delicm-overapproximate-writes=true -polly-delicm-compute-known=true -polly-print-delicm -disable-output < %s | FileCheck %s
 ;
 ; void func(int *A) {
 ;   for (int j = 0; j < 2; j += 1) { /* outer */
diff --git a/polly/test/DeLICM/reduction_looprotate_undef.ll b/polly/test/DeLICM/reduction_looprotate_undef.ll
index f70df6075c2d3..8c0544ed77852 100644
--- a/polly/test/DeLICM/reduction_looprotate_undef.ll
+++ b/polly/test/DeLICM/reduction_looprotate_undef.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<flatten;delicm>' -polly-delicm-overapproximate-writes=true -polly-delicm-compute-known=true -polly-print-delicm -disable-output < %s | FileCheck %s
+; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-flatten-schedule -polly-delicm-overapproximate-writes=true -polly-delicm-compute-known=true -polly-print-delicm -disable-output < %s | FileCheck %s
 ;
 ; void func(int *A) {
 ;   for (int j = 0; j < 2; j += 1) { /* outer */
diff --git a/polly/test/DeLICM/reduction_overapproximate.ll b/polly/test/DeLICM/reduction_overapproximate.ll
index d6cbb70a84a4a..2d33d3a0ece2a 100644
--- a/polly/test/DeLICM/reduction_overapproximate.ll
+++ b/polly/test/DeLICM/reduction_overapproximate.ll
@@ -1,6 +1,6 @@
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<flatten;delicm>' -polly-delicm-compute-known=true -polly-delicm-overapproximate-writes=true -polly-delicm-partial-writes=false -polly-print-delicm -disable-output < %s | FileCheck %s --check-prefix=APPROX
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<flatten;delicm>' -polly-delicm-compute-known=true -polly-delicm-overapproximate-writes=false -polly-delicm-partial-writes=false -polly-print-delicm -disable-output < %s | FileCheck %s --check-prefix=EXACT
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<flatten;delicm>' -polly-delicm-compute-known=true -polly-delicm-partial-writes=true -polly-print-delicm -disable-output < %s | FileCheck %s --check-prefix=PARTIAL
+; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-flatten-schedule -polly-delicm-compute-known=true -polly-delicm-overapproximate-writes=true -polly-delicm-partial-writes=false -polly-print-delicm -disable-output < %s | FileCheck %s --check-prefix=APPROX
+; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-flatten-schedule -polly-delicm-compute-known=true -polly-delicm-overapproximate-writes=false -polly-delicm-partial-writes=false -polly-print-delicm -disable-output < %s | FileCheck %s --check-prefix=EXACT
+; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-flatten-schedule -polly-delicm-compute-known=true -polly-delicm-partial-writes=true -polly-print-delicm -disable-output < %s | FileCheck %s --check-prefix=PARTIAL
 ;
 ; void func(double *A) {
 ;   for (int j = -1; j < 3; j += 1) { /* outer */
diff --git a/polly/test/DeLICM/reduction_preheader.ll b/polly/test/DeLICM/reduction_preheader.ll
index f3ce58b1bc954..c6e3643797c04 100644
--- a/polly/test/DeLICM/reduction_preheader.ll
+++ b/polly/test/DeLICM/reduction_preheader.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<flatten;delicm>' -polly-print-delicm -disable-output < %s | FileCheck %s
+; RUN: opt %loadPolly -polly-flatten-schedule -polly-print-delicm -disable-output < %s | FileCheck %s
 ;
 ; void func(double *A) {
 ;   for (int j = 0; j < 2; j += 1) { /* outer */
diff --git a/polly/test/DeLICM/reduction_unrelatedunusual.ll b/polly/test/DeLICM/reduction_unrelatedunusual.ll
index 542cec71ab855..97826f603e5d4 100644
--- a/polly/test/DeLICM/reduction_unrelatedunusual.ll
+++ b/polly/test/DeLICM/reduction_unrelatedunusual.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-delicm-partial-writes=true '-passes=polly-custom<delicm>' -polly-print-delicm -disable-output < %s | FileCheck -match-full-lines %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-delicm-partial-writes=true '-passes=print<polly-delicm>' -disable-output < %s | FileCheck -match-full-lines %s
 ;
 ; Map %add and %phi to A[j].
 ; The non-analyzable store to C[0] is unrelated and can be ignored.
diff --git a/polly/test/DeLICM/reject_loadafterstore.ll b/polly/test/DeLICM/reject_loadafterstore.ll
index d56b237aa71d9..4460620852a85 100644
--- a/polly/test/DeLICM/reject_loadafterstore.ll
+++ b/polly/test/DeLICM/reject_loadafterstore.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<delicm>' -polly-print-delicm -disable-output -pass-remarks-missed=polly-delicm < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-delicm>' -disable-output -pass-remarks-missed=polly-delicm < %s 2>&1 | FileCheck %s
 ;
 ; void func(double *A) {
 ;   for (int j = 0; j < 2; j += 1) { /* outer */
diff --git a/polly/test/DeLICM/reject_outofquota.ll b/polly/test/DeLICM/reject_outofquota.ll
index 9b7f8e5f97af3..9bc6bf1f23733 100644
--- a/polly/test/DeLICM/reject_outofquota.ll
+++ b/polly/test/DeLICM/reject_outofquota.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<delicm>' -polly-print-delicm -pass-remarks-analysis=polly-delicm -polly-delicm-max-ops=1 -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly '-passes=polly-custom<deps;delicm>' -polly-print-deps -polly-delicm-max-ops=1 -polly-dependences-computeout=0 -disable-output < %s | FileCheck %s -check-prefix=DEP
+; RUN: opt %loadNPMPolly '-passes=print<polly-delicm>' -pass-remarks-analysis=polly-delicm -polly-delicm-max-ops=1 -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-delicm,print<polly-dependences>' -polly-delicm-max-ops=1 -polly-dependences-computeout=0 -disable-output < %s | FileCheck %s -check-prefix=DEP
 ;
 ; void func(double *A) {
 ;   for (int j = 0; j < 2; j += 1) { /* outer */
diff --git a/polly/test/DeLICM/reject_storeafterstore.ll b/polly/test/DeLICM/reject_storeafterstore.ll
index 0fea4d7bb3960..ddd13dad2ed31 100644
--- a/polly/test/DeLICM/reject_storeafterstore.ll
+++ b/polly/test/DeLICM/reject_storeafterstore.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<delicm>' -polly-print-delicm -pass-remarks-missed=polly-delicm -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-delicm>' -pass-remarks-missed=polly-delicm -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; void func(double *A) {
 ;   for (int j = 0; j < 2; j += 1) { /* outer */
diff --git a/polly/test/DeLICM/reject_storeinsubregion.ll b/polly/test/DeLICM/reject_storeinsubregion.ll
index 0b75c16495c5c..c987156b51cd1 100644
--- a/polly/test/DeLICM/reject_storeinsubregion.ll
+++ b/polly/test/DeLICM/reject_storeinsubregion.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<delicm>' -polly-print-delicm -pass-remarks-missed=polly-delicm -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-delicm>' -pass-remarks-missed=polly-delicm -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; void func(double *A) {
 ;   for (int j = 0; j < 2; j += 1) { /* outer */
diff --git a/polly/test/DeLICM/reject_unusualstore.ll b/polly/test/DeLICM/reject_unusualstore.ll
index 311a7351c955b..342888c6654f4 100644
--- a/polly/test/DeLICM/reject_unusualstore.ll
+++ b/polly/test/DeLICM/reject_unusualstore.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<delicm>' -polly-print-delicm -disable-output < %s | FileCheck %s
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<delicm>' -disable-output -stats < %s 2>&1 | FileCheck %s --check-prefix=STATS
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-delicm>' -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -passes=polly-delicm -disable-output -stats < %s 2>&1 | FileCheck %s --check-prefix=STATS
 ; REQUIRES: asserts
 ;
 ; void func(double *A) {
diff --git a/polly/test/DeLICM/skip_maywrite.ll b/polly/test/DeLICM/skip_maywrite.ll
index 14de2b9d0bf84..0d30791cd94e7 100644
--- a/polly/test/DeLICM/skip_maywrite.ll
+++ b/polly/test/DeLICM/skip_maywrite.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<delicm>' -polly-print-delicm -pass-remarks-missed=polly-delicm -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-delicm>' -pass-remarks-missed=polly-delicm -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; void func(double *A) {
 ;   for (int j = 0; j < 2; j += 1) { /* outer */
diff --git a/polly/test/DeLICM/skip_multiaccess.ll b/polly/test/DeLICM/skip_multiaccess.ll
index a213a91343f3d..a7c79f7524630 100644
--- a/polly/test/DeLICM/skip_multiaccess.ll
+++ b/polly/test/DeLICM/skip_multiaccess.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<delicm>' -pass-remarks-missed=polly-delicm -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-delicm -pass-remarks-missed=polly-delicm -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; llvm.org/PR34485
 ; llvm.org/PR34989
diff --git a/polly/test/DeLICM/skip_notinloop.ll b/polly/test/DeLICM/skip_notinloop.ll
index 3a2dede210083..8e265e19aefea 100644
--- a/polly/test/DeLICM/skip_notinloop.ll
+++ b/polly/test/DeLICM/skip_notinloop.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<delicm>' -polly-print-delicm -pass-remarks-missed=polly-delicm -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-delicm>' -pass-remarks-missed=polly-delicm -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; void func(double *A) {
 ;   double phi = 0.0;
diff --git a/polly/test/DeLICM/skip_scalaraccess.ll b/polly/test/DeLICM/skip_scalaraccess.ll
index a0ed9f76a8ca2..2cf13afe11cdf 100644
--- a/polly/test/DeLICM/skip_scalaraccess.ll
+++ b/polly/test/DeLICM/skip_scalaraccess.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<delicm>' -polly-print-delicm -pass-remarks-missed=polly-delicm -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-delicm>' -pass-remarks-missed=polly-delicm -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; void func(double *A) {
 ;   for (int j = 0; j < 2; j += 1) { /* outer */
diff --git a/polly/test/DeadCodeElimination/chained_iterations.ll b/polly/test/DeadCodeElimination/chained_iterations.ll
index f1e47075e2f74..f3bf07bb40d83 100644
--- a/polly/test/DeadCodeElimination/chained_iterations.ll
+++ b/polly/test/DeadCodeElimination/chained_iterations.ll
@@ -1,5 +1,5 @@
-; RUN: opt -S %loadNPMPolly -aa-pipeline=basic-aa -polly-dependences-analysis-type=value-based '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s
-; RUN: opt -S %loadNPMPolly -aa-pipeline=basic-aa -polly-dependences-analysis-type=value-based '-passes=polly-custom<dce;ast>' -polly-print-ast -disable-output < %s | FileCheck %s -check-prefix=CHECK-DCE
+; RUN: opt -S %loadNPMPolly -aa-pipeline=basic-aa -polly-dependences-analysis-type=value-based '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s
+; RUN: opt -S %loadNPMPolly -aa-pipeline=basic-aa -polly-dependences-analysis-type=value-based '-passes=polly-dce,print<polly-ast>' -disable-output < %s | FileCheck %s -check-prefix=CHECK-DCE
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
 ;
 ; for(i = 0; i < 200; i++ )
diff --git a/polly/test/DeadCodeElimination/chained_iterations_2.ll b/polly/test/DeadCodeElimination/chained_iterations_2.ll
index 6ecc07c0f7d21..52f034f0e56ca 100644
--- a/polly/test/DeadCodeElimination/chained_iterations_2.ll
+++ b/polly/test/DeadCodeElimination/chained_iterations_2.ll
@@ -1,5 +1,5 @@
-; RUN: opt -S %loadNPMPolly -aa-pipeline=basic-aa -polly-dependences-analysis-type=value-based '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s
-; RUN: opt -S %loadNPMPolly -aa-pipeline=basic-aa -polly-dependences-analysis-type=value-based '-passes=polly-custom<dce;ast>' -polly-print-ast -disable-output < %s | FileCheck %s -check-prefix=CHECK-DCE
+; RUN: opt -S %loadNPMPolly -aa-pipeline=basic-aa -polly-dependences-analysis-type=value-based '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s
+; RUN: opt -S %loadNPMPolly -aa-pipeline=basic-aa -polly-dependences-analysis-type=value-based '-passes=polly-dce,print<polly-ast>' -disable-output < %s | FileCheck %s -check-prefix=CHECK-DCE
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
 ;
 ; for(i = 0; i < 200; i++ )
diff --git a/polly/test/DeadCodeElimination/computeout.ll b/polly/test/DeadCodeElimination/computeout.ll
index b43142be2a5c8..e54df42ed1db0 100644
--- a/polly/test/DeadCodeElimination/computeout.ll
+++ b/polly/test/DeadCodeElimination/computeout.ll
@@ -1,5 +1,5 @@
-; RUN: opt -S %loadNPMPolly '-passes=polly-custom<dce;ast>' -polly-print-ast < %s | FileCheck %s
-; RUN: opt -S %loadNPMPolly -aa-pipeline=basic-aa '-passes=polly-custom<dce;ast>' -polly-print-ast -polly-dependences-computeout=1 -disable-output < %s | FileCheck %s -check-prefix=TIMEOUT
+; RUN: opt -S %loadNPMPolly "-passes=scop(polly-dce,print<polly-ast>)" < %s | FileCheck %s
+; RUN: opt -S %loadNPMPolly -aa-pipeline=basic-aa "-passes=scop(polly-dce,print<polly-ast>)" -polly-dependences-computeout=1 -disable-output < %s | FileCheck %s -check-prefix=TIMEOUT
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
 
 ; for(i = 0; i < 100; i++ )
diff --git a/polly/test/DeadCodeElimination/dead_iteration_elimination.ll b/polly/test/DeadCodeElimination/dead_iteration_elimination.ll
index 85eea91f99207..c102f60abb659 100644
--- a/polly/test/DeadCodeElimination/dead_iteration_elimination.ll
+++ b/polly/test/DeadCodeElimination/dead_iteration_elimination.ll
@@ -1,4 +1,4 @@
-; RUN: opt -S %loadNPMPolly '-passes=polly-custom<dce;ast>' -polly-print-ast -polly-dependences-analysis-type=value-based -polly-dce-precise-steps=2 < %s | FileCheck %s
+; RUN: opt -S %loadNPMPolly "-passes=scop(polly-dce,print<polly-ast>)" -polly-dependences-analysis-type=value-based -polly-dce-precise-steps=2 < %s | FileCheck %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
 ;
 ; for(i = 0; i < 200; i++ )
diff --git a/polly/test/DeadCodeElimination/non-affine-affine-mix.ll b/polly/test/DeadCodeElimination/non-affine-affine-mix.ll
index 21b7c5cf9583b..36f55476fed23 100644
--- a/polly/test/DeadCodeElimination/non-affine-affine-mix.ll
+++ b/polly/test/DeadCodeElimination/non-affine-affine-mix.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-allow-nonaffine '-passes=polly-custom<dce;ast>' -polly-print-ast -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-allow-nonaffine '-passes=polly-dce,print<polly-ast>' -disable-output < %s | FileCheck %s
 ;
 ; void f(int *A) {
 ;   for (int i = 0; i < 1024; i++)
diff --git a/polly/test/DeadCodeElimination/non-affine.ll b/polly/test/DeadCodeElimination/non-affine.ll
index 86cabe6501393..ef528b4124c66 100644
--- a/polly/test/DeadCodeElimination/non-affine.ll
+++ b/polly/test/DeadCodeElimination/non-affine.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-allow-nonaffine '-passes=polly-custom<dce;ast>' -polly-print-ast -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-allow-nonaffine '-passes=polly-dce,print<polly-ast>' -disable-output < %s | FileCheck %s
 ;
 ; CHECK: for (int c0 = 0; c0 <= 1023; c0 += 1)
 ;
diff --git a/polly/test/DeadCodeElimination/null_schedule.ll b/polly/test/DeadCodeElimination/null_schedule.ll
index 507d690144e01..01d34e95629ba 100644
--- a/polly/test/DeadCodeElimination/null_schedule.ll
+++ b/polly/test/DeadCodeElimination/null_schedule.ll
@@ -1,4 +1,4 @@
-; RUN: opt -S %loadNPMPolly -aa-pipeline=basic-aa -polly-dependences-analysis-type=value-based '-passes=polly-custom<dce;ast>' -polly-print-ast -disable-output < %s | FileCheck %s -check-prefix=CHECK-DCE
+; RUN: opt -S %loadNPMPolly -aa-pipeline=basic-aa -polly-dependences-analysis-type=value-based '-passes=polly-dce,print<polly-ast>' -disable-output < %s | FileCheck %s -check-prefix=CHECK-DCE
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
 ; A[0] = 1;
 ;
diff --git a/polly/test/DependenceInfo/computeout.ll b/polly/test/DependenceInfo/computeout.ll
index 3fdc4008f8474..c2a3456b3dc80 100644
--- a/polly/test/DependenceInfo/computeout.ll
+++ b/polly/test/DependenceInfo/computeout.ll
@@ -1,5 +1,5 @@
-; RUN: opt -S %loadNPMPolly '-passes=polly-custom<deps>' -polly-print-deps -disable-output < %s | FileCheck %s -check-prefix=VALUE
-; RUN: opt -S %loadNPMPolly '-passes=polly-custom<deps>' -polly-print-deps -polly-dependences-computeout=1 -disable-output < %s | FileCheck %s -check-prefix=TIMEOUT
+; RUN: opt -S %loadNPMPolly '-passes=print<polly-dependences>' -disable-output < %s | FileCheck %s -check-prefix=VALUE
+; RUN: opt -S %loadNPMPolly '-passes=print<polly-dependences>' -polly-dependences-computeout=1 -disable-output < %s | FileCheck %s -check-prefix=TIMEOUT
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
 
 ; for(i = 0; i < 100; i++ )
diff --git a/polly/test/DependenceInfo/different_schedule_dimensions.ll b/polly/test/DependenceInfo/different_schedule_dimensions.ll
index 69274f11f567f..f89791f42f9db 100644
--- a/polly/test/DependenceInfo/different_schedule_dimensions.ll
+++ b/polly/test/DependenceInfo/different_schedule_dimensions.ll
@@ -1,4 +1,5 @@
-; RUN: opt -S %loadNPMPolly '-passes=polly-custom<deps>' -polly-print-deps -disable-output < %s | FileCheck %s
+; RUN: opt -S %loadNPMPolly '-passes=print<polly-dependences>' \
+; RUN: -disable-output < %s | FileCheck %s
 
 ; CHECK: RAW dependences:
 ; CHECK: { Stmt_bb9[0] -> Stmt_bb10[0] }
diff --git a/polly/test/DependenceInfo/do_pluto_matmult.ll b/polly/test/DependenceInfo/do_pluto_matmult.ll
index 2a0027bbc034b..b88cf9bf5475c 100644
--- a/polly/test/DependenceInfo/do_pluto_matmult.ll
+++ b/polly/test/DependenceInfo/do_pluto_matmult.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=polly-custom<deps>' -polly-print-deps -polly-dependences-analysis-type=value-based -disable-output < %s | FileCheck %s -check-prefix=VALUE
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=polly-custom<deps>' -polly-print-deps -polly-dependences-analysis-type=memory-based -disable-output < %s | FileCheck %s -check-prefix=MEMORY
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-dependences>' -polly-dependences-analysis-type=value-based -disable-output < %s | FileCheck %s -check-prefix=VALUE
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-dependences>' -polly-dependences-analysis-type=memory-based -disable-output < %s | FileCheck %s -check-prefix=MEMORY
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
 
diff --git a/polly/test/DependenceInfo/fine_grain_dep_0.ll b/polly/test/DependenceInfo/fine_grain_dep_0.ll
index 06a196822c832..5abbf48136891 100644
--- a/polly/test/DependenceInfo/fine_grain_dep_0.ll
+++ b/polly/test/DependenceInfo/fine_grain_dep_0.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<deps>' -polly-print-deps -polly-dependences-analysis-type=value-based -polly-dependences-analysis-level=reference-wise -disable-output < %s | FileCheck %s --check-prefix=REF
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<deps>' -polly-print-deps -polly-dependences-analysis-type=value-based -polly-dependences-analysis-level=access-wise -disable-output < %s | FileCheck %s --check-prefix=ACC
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-dependences>' -polly-dependences-analysis-type=value-based -polly-dependences-analysis-level=reference-wise -disable-output < %s | FileCheck %s --check-prefix=REF
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-dependences>' -polly-dependences-analysis-type=value-based -polly-dependences-analysis-level=access-wise -disable-output < %s | FileCheck %s --check-prefix=ACC
 
 ; REF: RAW dependences:
 ; REF-NEXT: [N] -> { [Stmt_for_body[i0] -> MemRef_b[]] -> [Stmt_for_body[6 + i0] -> MemRef_b[]] : 0 <= i0 <= -13 + N; Stmt_for_body[i0] -> Stmt_for_body[6 + i0] : 0 <= i0 <= -13 + N; Stmt_for_body[i0] -> Stmt_for_body[4 + i0] : 0 <= i0 <= -11 + N; [Stmt_for_body[i0] -> MemRef_a[]] -> [Stmt_for_body[4 + i0] -> MemRef_a[]] : 0 <= i0 <= -11 + N }
diff --git a/polly/test/DependenceInfo/generate_may_write_dependence_info.ll b/polly/test/DependenceInfo/generate_may_write_dependence_info.ll
index 9875257694331..677323495476b 100644
--- a/polly/test/DependenceInfo/generate_may_write_dependence_info.ll
+++ b/polly/test/DependenceInfo/generate_may_write_dependence_info.ll
@@ -1,4 +1,4 @@
-; RUN: opt -S %loadNPMPolly '-passes=polly-custom<deps>' -polly-print-deps -disable-output < %s | FileCheck %s -check-prefix=VALUE
+; RUN: opt -S %loadNPMPolly '-passes=print<polly-dependences>' -disable-output < %s | FileCheck %s -check-prefix=VALUE
 target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
 
 ; for (int i = 0; i < N; i++) {
diff --git a/polly/test/DependenceInfo/infeasible_context.ll b/polly/test/DependenceInfo/infeasible_context.ll
index c9473e614e362..cde3102dc3dc9 100644
--- a/polly/test/DependenceInfo/infeasible_context.ll
+++ b/polly/test/DependenceInfo/infeasible_context.ll
@@ -1,5 +1,7 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-detect -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s -check-prefix=FUNC-SCOP
-; RUN: opt %loadNPMPolly '-passes=polly-custom<deps>' -polly-print-detect -polly-print-deps -disable-output < %s 2>&1 | FileCheck %s -check-prefix=FUNC-DEPS
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 \
+; RUN: | FileCheck %s -check-prefix=FUNC-SCOP
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>,scop(print<polly-dependences>)' -disable-output < %s 2>&1 \
+; RUN: | FileCheck %s -check-prefix=FUNC-DEPS
 ;
 ; FUNC-SCOP-NOT: Statement
 ; FUNC-DEPS-NOT: RAW dependences
diff --git a/polly/test/DependenceInfo/may_writes_do_not_block_must_writes_for_war.ll b/polly/test/DependenceInfo/may_writes_do_not_block_must_writes_for_war.ll
index 92e6cb89b2a27..392a34769cddb 100644
--- a/polly/test/DependenceInfo/may_writes_do_not_block_must_writes_for_war.ll
+++ b/polly/test/DependenceInfo/may_writes_do_not_block_must_writes_for_war.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<deps>' -polly-print-deps -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-dependences>' -disable-output < %s | FileCheck %s
 ;
 ; Verify that the presence of a may-write (S1) between a read (S0) and a
 ; must-write (S2) does not block the generation of RAW dependences. This makes
diff --git a/polly/test/DependenceInfo/nonaffine-condition-buildMemoryAccess.ll b/polly/test/DependenceInfo/nonaffine-condition-buildMemoryAccess.ll
index b14759725dde0..ae5fd3beed399 100644
--- a/polly/test/DependenceInfo/nonaffine-condition-buildMemoryAccess.ll
+++ b/polly/test/DependenceInfo/nonaffine-condition-buildMemoryAccess.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-allow-nonaffine-loops -polly-allow-nonaffine -debug-only=polly-dependence < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-allow-nonaffine-loops -polly-allow-nonaffine -debug-only=polly-dependence < %s 2>&1 | FileCheck %s
 ; REQUIRES: asserts
 
 ; CHECK: MayWriteAccess := [Reduction Type: NONE] [Scalar: 0]
diff --git a/polly/test/DependenceInfo/reduction_complex_location.ll b/polly/test/DependenceInfo/reduction_complex_location.ll
index 45789088e57e4..7722ee974c3fa 100644
--- a/polly/test/DependenceInfo/reduction_complex_location.ll
+++ b/polly/test/DependenceInfo/reduction_complex_location.ll
@@ -1,6 +1,6 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<deps>' -polly-print-deps -disable-output < %s | FileCheck %s
-; RUN: opt %loadNPMPolly '-passes=polly-custom<deps>' -polly-print-deps -polly-dependences-analysis-level=reference-wise -disable-output < %s | FileCheck %s
-; RUN: opt %loadNPMPolly '-passes=polly-custom<deps>' -polly-print-deps -polly-dependences-analysis-level=access-wise -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-dependences>' -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-dependences>' -polly-dependences-analysis-level=reference-wise -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-dependences>' -polly-dependences-analysis-level=access-wise -disable-output < %s | FileCheck %s
 ;
 ; CHECK: RAW dependences:
 ; CHECK-NEXT: { }
diff --git a/polly/test/DependenceInfo/reduction_dependences_equal_non_reduction_dependences.ll b/polly/test/DependenceInfo/reduction_dependences_equal_non_reduction_dependences.ll
index 7923975118bb9..840d1f32dca39 100644
--- a/polly/test/DependenceInfo/reduction_dependences_equal_non_reduction_dependences.ll
+++ b/polly/test/DependenceInfo/reduction_dependences_equal_non_reduction_dependences.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-stmt-granularity=bb '-passes=polly-custom<deps>' -polly-print-deps -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-stmt-granularity=bb '-passes=print<polly-dependences>' -disable-output < %s | FileCheck %s
 ;
 ; This loopnest contains a reduction which imposes the same dependences as the
 ; accesses to the array A. We need to ensure we keep the dependences of A.
diff --git a/polly/test/DependenceInfo/reduction_dependences_not_null.ll b/polly/test/DependenceInfo/reduction_dependences_not_null.ll
index fdcd5f311800d..56d84a9aec6d6 100644
--- a/polly/test/DependenceInfo/reduction_dependences_not_null.ll
+++ b/polly/test/DependenceInfo/reduction_dependences_not_null.ll
@@ -1,7 +1,7 @@
 ; Test that the reduction dependences are always initialised, even in a case
 ; where we have no reduction. If this object is NULL, then isl operations on
 ; it will fail.
-; RUN: opt -S %loadNPMPolly '-passes=polly-custom<deps>' -polly-print-deps -disable-output < %s | FileCheck %s -check-prefix=VALUE
+; RUN: opt -S %loadNPMPolly '-passes=print<polly-dependences>' -disable-output < %s | FileCheck %s -check-prefix=VALUE
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
 
 ; for(i = 0; i < 100; i++ )
diff --git a/polly/test/DependenceInfo/reduction_indirect_access.ll b/polly/test/DependenceInfo/reduction_indirect_access.ll
index 13675ada39b0e..3b4bd9ef04b5a 100644
--- a/polly/test/DependenceInfo/reduction_indirect_access.ll
+++ b/polly/test/DependenceInfo/reduction_indirect_access.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=polly-custom<deps>' -polly-print-deps -polly-allow-nonaffine -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-dependences>' -polly-allow-nonaffine -disable-output < %s | FileCheck %s
 ;
 ; CHECK: Reduction dependences:
 ; CHECK: [N] -> { Stmt_for_body[i0] -> Stmt_for_body[1 + i0] : 0 <= i0 <= -2 + N }
diff --git a/polly/test/DependenceInfo/reduction_mixed_reduction_and_non_reduction_dependences.ll b/polly/test/DependenceInfo/reduction_mixed_reduction_and_non_reduction_dependences.ll
index e6ce425719ca9..76c7fc64ae89c 100644
--- a/polly/test/DependenceInfo/reduction_mixed_reduction_and_non_reduction_dependences.ll
+++ b/polly/test/DependenceInfo/reduction_mixed_reduction_and_non_reduction_dependences.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<deps>' -polly-print-deps -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-dependences>' -disable-output < %s | FileCheck %s
 ;
 ; CHECK: RAW dependences:
 ; CHECK-NEXT: { Stmt_for_body3[i0, i1] -> Stmt_for_body3[i0 + i1, o1] : i0 >= 0 and 0 <= i1 <= 1023 - i0 and i1 <= 1 and 0 < o1 <= 511 }
diff --git a/polly/test/DependenceInfo/reduction_multiple_loops_array_sum.ll b/polly/test/DependenceInfo/reduction_multiple_loops_array_sum.ll
index 820371937a582..02b814a0d7c04 100644
--- a/polly/test/DependenceInfo/reduction_multiple_loops_array_sum.ll
+++ b/polly/test/DependenceInfo/reduction_multiple_loops_array_sum.ll
@@ -1,6 +1,6 @@
-; RUN: opt -aa-pipeline=basic-aa %loadNPMPolly '-passes=polly-custom<deps>' -polly-print-deps -disable-output < %s | FileCheck %s
-; RUN: opt -aa-pipeline=basic-aa %loadNPMPolly '-passes=polly-custom<deps>' -polly-print-deps -polly-dependences-analysis-level=reference-wise -disable-output < %s | FileCheck %s
-; RUN: opt -aa-pipeline=basic-aa %loadNPMPolly '-passes=polly-custom<deps>' -polly-print-deps -polly-dependences-analysis-level=access-wise -disable-output < %s | FileCheck %s
+; RUN: opt -aa-pipeline=basic-aa %loadNPMPolly '-passes=print<polly-dependences>' -disable-output < %s | FileCheck %s
+; RUN: opt -aa-pipeline=basic-aa %loadNPMPolly '-passes=print<polly-dependences>' -polly-dependences-analysis-level=reference-wise -disable-output < %s | FileCheck %s
+; RUN: opt -aa-pipeline=basic-aa %loadNPMPolly '-passes=print<polly-dependences>' -polly-dependences-analysis-level=access-wise -disable-output < %s | FileCheck %s
 ;
 ; Verify that only the inner reduction like accesses cause reduction dependences
 ;
diff --git a/polly/test/DependenceInfo/reduction_multiple_loops_array_sum_2.ll b/polly/test/DependenceInfo/reduction_multiple_loops_array_sum_2.ll
index 9792f791c6989..91bd35deebd06 100644
--- a/polly/test/DependenceInfo/reduction_multiple_loops_array_sum_2.ll
+++ b/polly/test/DependenceInfo/reduction_multiple_loops_array_sum_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<deps>' -polly-print-deps -aa-pipeline=basic-aa -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-dependences>' -aa-pipeline=basic-aa -disable-output < %s | FileCheck %s
 ;
 ; CHECK: RAW dependences:
 ; CHECK-NEXT: { }
diff --git a/polly/test/DependenceInfo/reduction_multiple_loops_array_sum_3.ll b/polly/test/DependenceInfo/reduction_multiple_loops_array_sum_3.ll
index 9bde285c64516..040d513782392 100644
--- a/polly/test/DependenceInfo/reduction_multiple_loops_array_sum_3.ll
+++ b/polly/test/DependenceInfo/reduction_multiple_loops_array_sum_3.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<deps>' -polly-print-deps -aa-pipeline=basic-aa -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-dependences>' -aa-pipeline=basic-aa -disable-output < %s | FileCheck %s
 ;
 ; CHECK: Reduction dependences:
 ; CHECK-NEXT: { Stmt_for_inc[i0, i1] -> Stmt_for_inc[i0, 1 + i1] : 0 <= i0 <= 99 and 0 <= i1 <= 98 }
diff --git a/polly/test/DependenceInfo/reduction_multiple_reductions.ll b/polly/test/DependenceInfo/reduction_multiple_reductions.ll
index ac3adb9065462..527a8cfc3556e 100644
--- a/polly/test/DependenceInfo/reduction_multiple_reductions.ll
+++ b/polly/test/DependenceInfo/reduction_multiple_reductions.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=polly-custom<deps>' -polly-print-deps -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-dependences>' -disable-output < %s | FileCheck %s
 ;
 ; Verify we do not have dependences between the if and the else clause
 ;
diff --git a/polly/test/DependenceInfo/reduction_multiple_reductions_2.ll b/polly/test/DependenceInfo/reduction_multiple_reductions_2.ll
index 16ca85bff9502..fb5fd96a2e426 100644
--- a/polly/test/DependenceInfo/reduction_multiple_reductions_2.ll
+++ b/polly/test/DependenceInfo/reduction_multiple_reductions_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=polly-custom<deps>' -polly-print-deps -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-dependences>' -disable-output < %s | FileCheck %s
 ;
 ;
 ; These are the important RAW dependences, as they need to originate/end in only one iteration:
diff --git a/polly/test/DependenceInfo/reduction_only_reduction_like_access.ll b/polly/test/DependenceInfo/reduction_only_reduction_like_access.ll
index de506a39485cc..3ec3920268b49 100644
--- a/polly/test/DependenceInfo/reduction_only_reduction_like_access.ll
+++ b/polly/test/DependenceInfo/reduction_only_reduction_like_access.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<deps>' -polly-print-deps -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-dependences>' -disable-output < %s | FileCheck %s
 ;
 ; FIXME: Change the comment once we allow different pointers
 ; The statement is "almost" reduction like but should not yield any reduction dependences
diff --git a/polly/test/DependenceInfo/reduction_partially_escaping_intermediate_in_other_stmt.ll b/polly/test/DependenceInfo/reduction_partially_escaping_intermediate_in_other_stmt.ll
index fbf1409a1ba30..23bd8ef25bd7a 100644
--- a/polly/test/DependenceInfo/reduction_partially_escaping_intermediate_in_other_stmt.ll
+++ b/polly/test/DependenceInfo/reduction_partially_escaping_intermediate_in_other_stmt.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<deps>' -polly-print-deps -aa-pipeline=basic-aa -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-dependences>' -aa-pipeline=basic-aa -disable-output < %s | FileCheck %s
 ;
 ; CHECK: Reduction dependences:
 ; CHECK-NEXT: [N] -> { Stmt_for_body3[i0, i1] -> Stmt_for_body3[i0, 1 + i1] : 0 <= i0 <= 1023 and i1 >= 0 and 1024 - N + i0 <= i1 <= 1022 }
diff --git a/polly/test/DependenceInfo/reduction_privatization_deps.ll b/polly/test/DependenceInfo/reduction_privatization_deps.ll
index 0d66f885cd42d..0e0f71737ffd3 100644
--- a/polly/test/DependenceInfo/reduction_privatization_deps.ll
+++ b/polly/test/DependenceInfo/reduction_privatization_deps.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<deps>' -polly-print-deps -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-dependences>' -disable-output < %s | FileCheck %s
 ;
 ; CHECK: RAW dependences:
 ; CHECK-NEXT: { Stmt_S1[i0, i1] -> Stmt_S2[-1 + i0 + i1] : 0 <= i0 <= 1023 and i1 >= 0 and -i0 < i1 <= 1024 - i0 and i1 <= 1023; Stmt_S0[i0] -> Stmt_S1[o0, i0 - o0] : i0 <= 1023 and 0 <= o0 <= i0 }
diff --git a/polly/test/DependenceInfo/reduction_privatization_deps_2.ll b/polly/test/DependenceInfo/reduction_privatization_deps_2.ll
index 81235d6cf02e4..cafa319e2cc7b 100644
--- a/polly/test/DependenceInfo/reduction_privatization_deps_2.ll
+++ b/polly/test/DependenceInfo/reduction_privatization_deps_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<deps>' -polly-print-deps -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-dependences>' -disable-output < %s | FileCheck %s
 ;
 ; We have privatization dependences from a textually later statement to a
 ; textually earlier one, but the dependences still go forward in time.
diff --git a/polly/test/DependenceInfo/reduction_privatization_deps_3.ll b/polly/test/DependenceInfo/reduction_privatization_deps_3.ll index 6b48ab5afd155..d86da92fbcab8 100644 --- a/polly/test/DependenceInfo/reduction_privatization_deps_3.ll +++ b/polly/test/DependenceInfo/reduction_privatization_deps_3.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<deps>' -polly-print-deps -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-dependences>' -disable-output < %s | FileCheck %s ; ; CHECK: RAW dependences: ; CHECK-NEXT: { Stmt_S1[i0] -> Stmt_S3[2 + i0] : 0 <= i0 <= 96; Stmt_S2[i0, i1] -> Stmt_S3[o0] : i1 <= 1 - i0 and -i1 < o0 <= 1 and o0 <= 1 + i0 - i1; Stmt_S3[i0] -> Stmt_S2[o0, 1 - i0] : 0 <= i0 <= 1 and i0 < o0 <= 98 } diff --git a/polly/test/DependenceInfo/reduction_privatization_deps_4.ll b/polly/test/DependenceInfo/reduction_privatization_deps_4.ll index 1fef004c4c47a..d84c04fc309b0 100644 --- a/polly/test/DependenceInfo/reduction_privatization_deps_4.ll +++ b/polly/test/DependenceInfo/reduction_privatization_deps_4.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<deps>' -polly-print-deps -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-dependences>' -disable-output < %s | FileCheck %s ; ; CHECK: RAW dependences: ; CHECK-NEXT: { Stmt_S1[i0] -> Stmt_S2[i0, i0] : 0 <= i0 <= 98; Stmt_S2[i0, i0] -> Stmt_S3[i0] : 0 <= i0 <= 98; Stmt_S3[i0] -> Stmt_S2[o0, i0] : i0 >= 0 and i0 < o0 <= 98; Stmt_S2[i0, i1] -> Stmt_S1[i1] : i0 >= 0 and i0 < i1 <= 98 } diff --git a/polly/test/DependenceInfo/reduction_privatization_deps_5.ll b/polly/test/DependenceInfo/reduction_privatization_deps_5.ll index f40a7c07a3ba4..592c7238c3c59 100644 --- a/polly/test/DependenceInfo/reduction_privatization_deps_5.ll +++ b/polly/test/DependenceInfo/reduction_privatization_deps_5.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<deps>' -polly-print-deps -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-dependences>' -disable-output < %s | FileCheck %s ; ; CHECK: RAW dependences: ; CHECK-NEXT: { Stmt_S1[i0, 0] -> Stmt_S2[i0, 0] : 0 <= i0 <= 98; Stmt_S2[i0, 0] -> Stmt_S1[1 + i0, 0] : 0 <= i0 <= 97 } diff --git a/polly/test/DependenceInfo/reduction_sequence.ll b/polly/test/DependenceInfo/reduction_sequence.ll index d881a99adc226..7ce9d37d395bb 100644 --- a/polly/test/DependenceInfo/reduction_sequence.ll +++ b/polly/test/DependenceInfo/reduction_sequence.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<deps>' -polly-print-deps -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-dependences>' -disable-output < %s | FileCheck %s ; void manyreductions(long *A) { ; for (long i = 0; i < 1024; i++) diff --git a/polly/test/DependenceInfo/reduction_simple_iv.ll b/polly/test/DependenceInfo/reduction_simple_iv.ll index b811d1593ab02..d13d14ecaad92 100644 --- a/polly/test/DependenceInfo/reduction_simple_iv.ll +++ b/polly/test/DependenceInfo/reduction_simple_iv.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<deps>' -polly-print-deps -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-dependences>' -disable-output < %s | FileCheck %s ; ; CHECK: RAW dependences: ; CHECK-NEXT: { } diff --git a/polly/test/DependenceInfo/reduction_simple_iv_debug_wrapped_dependences.ll b/polly/test/DependenceInfo/reduction_simple_iv_debug_wrapped_dependences.ll index 0a5d36f9b9f79..4c97fbb1aacb7 100644 --- 
a/polly/test/DependenceInfo/reduction_simple_iv_debug_wrapped_dependences.ll +++ b/polly/test/DependenceInfo/reduction_simple_iv_debug_wrapped_dependences.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<deps>' -polly-print-deps -debug-only=polly-dependence -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-dependences>' -debug-only=polly-dependence -disable-output < %s 2>&1 | FileCheck %s ; ; REQUIRES: asserts ; diff --git a/polly/test/DependenceInfo/reduction_simple_privatization_deps_2.ll b/polly/test/DependenceInfo/reduction_simple_privatization_deps_2.ll index 90f9d76ef57b2..804005cf72a72 100644 --- a/polly/test/DependenceInfo/reduction_simple_privatization_deps_2.ll +++ b/polly/test/DependenceInfo/reduction_simple_privatization_deps_2.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<deps>' -polly-print-deps -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-dependences>' -disable-output < %s | FileCheck %s ; ; CHECK: RAW dependences: ; CHECK-NEXT: { Stmt_S1[i0, i1] -> Stmt_S2[i0] : 0 <= i0 <= 99 and 0 <= i1 <= 99; Stmt_S0[i0] -> Stmt_S1[i0, o1] : 0 <= i0 <= 99 and 0 <= o1 <= 99; Stmt_S2[i0] -> Stmt_S0[1 + i0] : 0 <= i0 <= 98 } diff --git a/polly/test/DependenceInfo/reduction_simple_privatization_deps_w_parameter.ll b/polly/test/DependenceInfo/reduction_simple_privatization_deps_w_parameter.ll index 2b194bbb51988..9596827b4cbbf 100644 --- a/polly/test/DependenceInfo/reduction_simple_privatization_deps_w_parameter.ll +++ b/polly/test/DependenceInfo/reduction_simple_privatization_deps_w_parameter.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<deps>' -polly-print-deps -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-dependences>' -disable-output < %s | FileCheck %s ; ; CHECK: RAW dependences: ; CHECK-NEXT: [N] -> { Stmt_S1[i0] -> Stmt_S2[] : N >= 11 and 0 <= i0 <= 1023; Stmt_S0[] -> Stmt_S1[o0] : N >= 11 and 0 <= o0 <= 1023 } diff --git a/polly/test/DependenceInfo/reduction_two_reductions_different_rloops.ll b/polly/test/DependenceInfo/reduction_two_reductions_different_rloops.ll index 70d5bdf64059d..d67683d11a4b3 100644 --- a/polly/test/DependenceInfo/reduction_two_reductions_different_rloops.ll +++ b/polly/test/DependenceInfo/reduction_two_reductions_different_rloops.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-stmt-granularity=bb '-passes=polly-custom<deps>' -polly-print-deps -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-stmt-granularity=bb '-passes=print<polly-dependences>' -disable-output < %s | FileCheck %s ; ; CHECK: RAW dependences: ; CHECK-NEXT: { } diff --git a/polly/test/DependenceInfo/sequential_loops.ll b/polly/test/DependenceInfo/sequential_loops.ll index 023c2d4f29f37..6ae7200303321 100644 --- a/polly/test/DependenceInfo/sequential_loops.ll +++ b/polly/test/DependenceInfo/sequential_loops.ll @@ -1,6 +1,6 @@ -; RUN: opt -S %loadNPMPolly -aa-pipeline=basic-aa '-passes=polly-custom<deps>' -polly-print-deps -polly-dependences-analysis-type=value-based -disable-output < %s | FileCheck %s -check-prefix=VALUE -; RUN: opt -S %loadNPMPolly -aa-pipeline=basic-aa '-passes=polly-custom<deps>' -polly-print-deps -polly-dependences-analysis-type=memory-based -disable-output < %s | FileCheck %s -check-prefix=MEMORY -; RUN: opt -S %loadNPMPolly -aa-pipeline=basic-aa '-passes=polly-custom<deps>' -polly-print-deps -polly-dependences-analysis-type=value-based 
-polly-dependences-analysis-level=access-wise -disable-output < %s | FileCheck %s -check-prefix=VALUE_ACCESS +; RUN: opt -S %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-dependences>' -polly-dependences-analysis-type=value-based -disable-output < %s | FileCheck %s -check-prefix=VALUE +; RUN: opt -S %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-dependences>' -polly-dependences-analysis-type=memory-based -disable-output < %s | FileCheck %s -check-prefix=MEMORY +; RUN: opt -S %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-dependences>' -polly-dependences-analysis-type=value-based -polly-dependences-analysis-level=access-wise -disable-output < %s | FileCheck %s -check-prefix=VALUE_ACCESS ; VALUE: RAW dependences: ; VALUE-NEXT: { } diff --git a/polly/test/FlattenSchedule/gemm.ll b/polly/test/FlattenSchedule/gemm.ll index 11dc40599bb0e..b20293bd315a3 100644 --- a/polly/test/FlattenSchedule/gemm.ll +++ b/polly/test/FlattenSchedule/gemm.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<flatten>' -polly-print-flatten-schedule -disable-output < %s | FileCheck %s +; RUN: opt %loadPolly -polly-print-flatten-schedule -disable-output < %s | FileCheck %s ; ; dgemm kernel ; C := alpha*A*B + beta*C diff --git a/polly/test/ForwardOpTree/atax.ll b/polly/test/ForwardOpTree/atax.ll index 3dfe3fa0aa8e6..6c81fb12e8cdc 100644 --- a/polly/test/ForwardOpTree/atax.ll +++ b/polly/test/ForwardOpTree/atax.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-optree-normalize-phi=true '-passes=polly-custom<optree>' -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-optree-normalize-phi=true '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" diff --git a/polly/test/ForwardOpTree/changed-kind.ll b/polly/test/ForwardOpTree/changed-kind.ll index ec8869da3ae57..b9081f3734044 100644 --- a/polly/test/ForwardOpTree/changed-kind.ll +++ b/polly/test/ForwardOpTree/changed-kind.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<optree>' -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines ; In the code below, %0 is known to be equal to the content of @c (constant 0). ; Thus, in order to save a scalar dependency, forward-optree replaces diff --git a/polly/test/ForwardOpTree/forward_from_region.ll b/polly/test/ForwardOpTree/forward_from_region.ll index de47bc4df0076..767a580dccf95 100644 --- a/polly/test/ForwardOpTree/forward_from_region.ll +++ b/polly/test/ForwardOpTree/forward_from_region.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<optree>' -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines ; ; Move instructions from region statements. 
; diff --git a/polly/test/ForwardOpTree/forward_hoisted.ll b/polly/test/ForwardOpTree/forward_hoisted.ll index 39f99545b01ac..5d0b0a884b761 100644 --- a/polly/test/ForwardOpTree/forward_hoisted.ll +++ b/polly/test/ForwardOpTree/forward_hoisted.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-invariant-load-hoisting=true '-passes=polly-custom<optree>' -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly -polly-invariant-load-hoisting=true '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines ; ; Move %val to %bodyB, so %bodyA can be removed (by -polly-simplify). ; This involves making the load-hoisted %val1 available in %bodyB. diff --git a/polly/test/ForwardOpTree/forward_instruction.ll b/polly/test/ForwardOpTree/forward_instruction.ll index a9f5d3d85ac0a..50a9b07b8a05b 100644 --- a/polly/test/ForwardOpTree/forward_instruction.ll +++ b/polly/test/ForwardOpTree/forward_instruction.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<optree>' -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines ; ; Move %val to %bodyB, so %bodyA can be removed (by -polly-simplify) ; diff --git a/polly/test/ForwardOpTree/forward_into_region.ll b/polly/test/ForwardOpTree/forward_into_region.ll index 2279a89cfaeb7..ef71b11dc5716 100644 --- a/polly/test/ForwardOpTree/forward_into_region.ll +++ b/polly/test/ForwardOpTree/forward_into_region.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<optree>' -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines ; ; Move instructions to region statements. ; diff --git a/polly/test/ForwardOpTree/forward_into_region_redundant_use.ll b/polly/test/ForwardOpTree/forward_into_region_redundant_use.ll index f7901e1ccf8fd..1c585446ae63a 100644 --- a/polly/test/ForwardOpTree/forward_into_region_redundant_use.ll +++ b/polly/test/ForwardOpTree/forward_into_region_redundant_use.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-invariant-load-hoisting=true '-passes=polly-custom<optree>' -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly -polly-invariant-load-hoisting=true '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines ; define void @foo(ptr %A, i32 %p, ptr %B) { diff --git a/polly/test/ForwardOpTree/forward_load.ll b/polly/test/ForwardOpTree/forward_load.ll index 860e603ef47d2..0bba41833fb19 100644 --- a/polly/test/ForwardOpTree/forward_load.ll +++ b/polly/test/ForwardOpTree/forward_load.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<optree>' -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly "-passes=scop(print<polly-optree>)" -disable-output < %s | FileCheck %s -match-full-lines ; ; Rematerialize a load.
; diff --git a/polly/test/ForwardOpTree/forward_load_differentarray.ll b/polly/test/ForwardOpTree/forward_load_differentarray.ll index 24b008cfae384..364bf3ef37133 100644 --- a/polly/test/ForwardOpTree/forward_load_differentarray.ll +++ b/polly/test/ForwardOpTree/forward_load_differentarray.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<optree>' -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines ; ; To forward %val, B[j] cannot be reused in bodyC because it is overwritten ; between. Verify that instead the alternative C[j] is used. diff --git a/polly/test/ForwardOpTree/forward_load_double_write.ll b/polly/test/ForwardOpTree/forward_load_double_write.ll index 522e803b2d0a0..4c30c7f8da56f 100644 --- a/polly/test/ForwardOpTree/forward_load_double_write.ll +++ b/polly/test/ForwardOpTree/forward_load_double_write.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<optree>' -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines ; ; Rematerialize a load even in case two writes of identical values are in ; one scop statement. diff --git a/polly/test/ForwardOpTree/forward_load_fromloop.ll b/polly/test/ForwardOpTree/forward_load_fromloop.ll index 5c64221d882b9..1494e872a8942 100644 --- a/polly/test/ForwardOpTree/forward_load_fromloop.ll +++ b/polly/test/ForwardOpTree/forward_load_fromloop.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<optree>' -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines ; ; Forward the LoadInst %val into %bodyB. %val is executed multiple times, ; so we must get the last loaded value. diff --git a/polly/test/ForwardOpTree/forward_load_indirect.ll b/polly/test/ForwardOpTree/forward_load_indirect.ll index 5b06c357f02ba..51ce94d267277 100644 --- a/polly/test/ForwardOpTree/forward_load_indirect.ll +++ b/polly/test/ForwardOpTree/forward_load_indirect.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<optree>' -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines ; ; Forward an operand tree consisting of a speculatable instruction (%add) ; and a load (%val). diff --git a/polly/test/ForwardOpTree/forward_load_memset_after.ll b/polly/test/ForwardOpTree/forward_load_memset_after.ll index b889783d531e6..bd2cad411eccf 100644 --- a/polly/test/ForwardOpTree/forward_load_memset_after.ll +++ b/polly/test/ForwardOpTree/forward_load_memset_after.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<optree>' -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines ; ; Rematerialize a load in the presence of a non-store WRITE access.
; diff --git a/polly/test/ForwardOpTree/forward_load_memset_before.ll b/polly/test/ForwardOpTree/forward_load_memset_before.ll index c8f0e0e5814fb..3e89dea37775c 100644 --- a/polly/test/ForwardOpTree/forward_load_memset_before.ll +++ b/polly/test/ForwardOpTree/forward_load_memset_before.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<optree>' -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines ; ; Rematerialize a load in the presence of a non-store WRITE access. ; diff --git a/polly/test/ForwardOpTree/forward_load_tripleuse.ll b/polly/test/ForwardOpTree/forward_load_tripleuse.ll index df57bf70cc53b..7526a8313945d 100644 --- a/polly/test/ForwardOpTree/forward_load_tripleuse.ll +++ b/polly/test/ForwardOpTree/forward_load_tripleuse.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<optree>' -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-optree>,polly-codegen' -disable-output < %s | FileCheck %s -match-full-lines ; ; %val1 is used three times: Twice by its own operand tree of %val2 and once ; more by the store in %bodyB. diff --git a/polly/test/ForwardOpTree/forward_load_unrelatedunusual.ll b/polly/test/ForwardOpTree/forward_load_unrelatedunusual.ll index ba84a1a16748f..daf289d8b0da1 100644 --- a/polly/test/ForwardOpTree/forward_load_unrelatedunusual.ll +++ b/polly/test/ForwardOpTree/forward_load_unrelatedunusual.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<optree>' -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines ; ; Rematerialize a load. ; The non-analyzable store to C[0] is unrelated and can be ignored. diff --git a/polly/test/ForwardOpTree/forward_phi_load.ll b/polly/test/ForwardOpTree/forward_phi_load.ll index c763af4269c89..1457aa96e2de7 100644 --- a/polly/test/ForwardOpTree/forward_phi_load.ll +++ b/polly/test/ForwardOpTree/forward_phi_load.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-optree-normalize-phi=true '-passes=polly-custom<optree>' -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly -polly-optree-normalize-phi=true '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines ; ; Rematerialize a load. 
; diff --git a/polly/test/ForwardOpTree/forward_readonly.ll b/polly/test/ForwardOpTree/forward_readonly.ll index 69c7f10be4e56..646121c4efeff 100644 --- a/polly/test/ForwardOpTree/forward_readonly.ll +++ b/polly/test/ForwardOpTree/forward_readonly.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly -polly-analyze-read-only-scalars=true '-passes=polly-custom<optree>' -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines -check-prefixes=STATS,MODEL -; RUN: opt %loadNPMPolly -polly-analyze-read-only-scalars=false '-passes=polly-custom<optree>' -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines -check-prefixes=STATS,NOMODEL +; RUN: opt %loadNPMPolly -polly-analyze-read-only-scalars=true '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines -check-prefixes=STATS,MODEL +; RUN: opt %loadNPMPolly -polly-analyze-read-only-scalars=false '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines -check-prefixes=STATS,NOMODEL ; ; Move %val to %bodyB, so %bodyA can be removed (by -polly-simplify) ; diff --git a/polly/test/ForwardOpTree/forward_reusue.ll b/polly/test/ForwardOpTree/forward_reusue.ll index e39e7b51dc689..d8ad31782ecb9 100644 --- a/polly/test/ForwardOpTree/forward_reusue.ll +++ b/polly/test/ForwardOpTree/forward_reusue.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<optree>' -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines ; ; Move operand tree without duplicating values used multiple times. ; diff --git a/polly/test/ForwardOpTree/forward_store.ll b/polly/test/ForwardOpTree/forward_store.ll index 8cd6e2446ff93..17cb8b395eb30 100644 --- a/polly/test/ForwardOpTree/forward_store.ll +++ b/polly/test/ForwardOpTree/forward_store.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<optree>' -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines ; ; Rematerialize a load. ; diff --git a/polly/test/ForwardOpTree/forward_synthesizable_definloop.ll b/polly/test/ForwardOpTree/forward_synthesizable_definloop.ll index f70965f3c5d1b..57b68180bb121 100644 --- a/polly/test/ForwardOpTree/forward_synthesizable_definloop.ll +++ b/polly/test/ForwardOpTree/forward_synthesizable_definloop.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<optree>' -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines ; ; Copy %val to bodyB, assuming the exit value of %i. ; diff --git a/polly/test/ForwardOpTree/forward_synthesizable_indvar.ll b/polly/test/ForwardOpTree/forward_synthesizable_indvar.ll index c95c45856ac36..b4828e4c2c423 100644 --- a/polly/test/ForwardOpTree/forward_synthesizable_indvar.ll +++ b/polly/test/ForwardOpTree/forward_synthesizable_indvar.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<optree>' -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines ; ; Test support for (synthesizable) induction variables.
; diff --git a/polly/test/ForwardOpTree/forward_synthesizable_useinloop.ll b/polly/test/ForwardOpTree/forward_synthesizable_useinloop.ll index 14fb8d8dcc0ab..3228bb60d2ca2 100644 --- a/polly/test/ForwardOpTree/forward_synthesizable_useinloop.ll +++ b/polly/test/ForwardOpTree/forward_synthesizable_useinloop.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<optree>' -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines ; ; Synthesizable values defined outside of a loop can be used ; inside the loop. diff --git a/polly/test/ForwardOpTree/forward_transitive.ll b/polly/test/ForwardOpTree/forward_transitive.ll index 7b55d9e0cf9b2..aacf1358648f5 100644 --- a/polly/test/ForwardOpTree/forward_transitive.ll +++ b/polly/test/ForwardOpTree/forward_transitive.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<optree>' -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines ; ; Move %v and %val to %bodyB, so %bodyA can be removed (by -polly-simplify) ; diff --git a/polly/test/ForwardOpTree/jacobi-1d.ll b/polly/test/ForwardOpTree/jacobi-1d.ll index 3bc504d88c0eb..cb035bb749c7b 100644 --- a/polly/test/ForwardOpTree/jacobi-1d.ll +++ b/polly/test/ForwardOpTree/jacobi-1d.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-optree-normalize-phi=true '-passes=polly-custom<optree>' -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-optree-normalize-phi=true '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" diff --git a/polly/test/ForwardOpTree/noforward_from_region.ll b/polly/test/ForwardOpTree/noforward_from_region.ll index 0729241c3f7d9..bd5864c25f543 100644 --- a/polly/test/ForwardOpTree/noforward_from_region.ll +++ b/polly/test/ForwardOpTree/noforward_from_region.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<optree>' -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines ; ; Ensure we do not move instructions from region statements in case the ; instruction to move loads from an array which is also written to from diff --git a/polly/test/ForwardOpTree/noforward_load_conditional.ll b/polly/test/ForwardOpTree/noforward_load_conditional.ll index d33ef99ae6bed..5474e740de800 100644 --- a/polly/test/ForwardOpTree/noforward_load_conditional.ll +++ b/polly/test/ForwardOpTree/noforward_load_conditional.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<optree>' -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines ; ; B[j] is overwritten by at least one statement between the ; definition of %val and its use. Hence, it cannot be forwarded. 
diff --git a/polly/test/ForwardOpTree/noforward_load_writebetween.ll b/polly/test/ForwardOpTree/noforward_load_writebetween.ll index e7deb381de87a..697c940be4fdd 100644 --- a/polly/test/ForwardOpTree/noforward_load_writebetween.ll +++ b/polly/test/ForwardOpTree/noforward_load_writebetween.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<optree>' -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines ; ; Cannot rematerialize %val from B[0] at bodyC because B[0] has been ; overwritten in bodyB. diff --git a/polly/test/ForwardOpTree/noforward_outofquota.ll b/polly/test/ForwardOpTree/noforward_outofquota.ll index 5e30cf88de4cf..306bb8d7558d1 100644 --- a/polly/test/ForwardOpTree/noforward_outofquota.ll +++ b/polly/test/ForwardOpTree/noforward_outofquota.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly -polly-optree-max-ops=1 '-passes=polly-custom<optree>' -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines -; RUN: opt %loadNPMPolly -polly-optree-max-ops=1 '-passes=polly-custom<optree>' -disable-output -stats < %s 2>&1 | FileCheck %s -match-full-lines -check-prefix=STATS +; RUN: opt %loadNPMPolly -polly-optree-max-ops=1 '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly -polly-optree-max-ops=1 -passes=polly-optree -disable-output -stats < %s 2>&1 | FileCheck %s -match-full-lines -check-prefix=STATS ; REQUIRES: asserts ; ; for (int j = 0; j < n; j += 1) { diff --git a/polly/test/ForwardOpTree/noforward_partial.ll b/polly/test/ForwardOpTree/noforward_partial.ll index f95bb77f70b67..edb5d34801cc5 100644 --- a/polly/test/ForwardOpTree/noforward_partial.ll +++ b/polly/test/ForwardOpTree/noforward_partial.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<optree>' -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines ; ; Not the entire operand tree can be forwarded, ; some scalar dependencies would remain. diff --git a/polly/test/ForwardOpTree/noforward_phi.ll b/polly/test/ForwardOpTree/noforward_phi.ll index 025fe64724151..755abad4336ef 100644 --- a/polly/test/ForwardOpTree/noforward_phi.ll +++ b/polly/test/ForwardOpTree/noforward_phi.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<optree>' -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines ; ; Do not move PHI nodes. ; diff --git a/polly/test/ForwardOpTree/noforward_selfrefphi.ll b/polly/test/ForwardOpTree/noforward_selfrefphi.ll index 8b30137858243..be7e82f726331 100644 --- a/polly/test/ForwardOpTree/noforward_selfrefphi.ll +++ b/polly/test/ForwardOpTree/noforward_selfrefphi.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-optree-normalize-phi=true '-passes=polly-custom<optree>' -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly -polly-optree-normalize-phi=true '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines ; ; Contains a self-referencing PHINode that would require a ; transitive closure to handle. 
diff --git a/polly/test/ForwardOpTree/noforward_sideffects.ll b/polly/test/ForwardOpTree/noforward_sideffects.ll index 179b02a259025..c01b72a1c1420 100644 --- a/polly/test/ForwardOpTree/noforward_sideffects.ll +++ b/polly/test/ForwardOpTree/noforward_sideffects.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<optree>' -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines ; ; Do not forward instructions with side-effects (here: function call). ; diff --git a/polly/test/ForwardOpTree/noforward_synthesizable_unknownit.ll b/polly/test/ForwardOpTree/noforward_synthesizable_unknownit.ll index 6baec6d9e1c6c..776d848072a23 100644 --- a/polly/test/ForwardOpTree/noforward_synthesizable_unknownit.ll +++ b/polly/test/ForwardOpTree/noforward_synthesizable_unknownit.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<optree>' -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines ; ; Do not try to forward %i.trunc, it is not synthesizable in %body. ; diff --git a/polly/test/ForwardOpTree/out-of-quota1.ll b/polly/test/ForwardOpTree/out-of-quota1.ll index 95df49a5c061a..ee3e32698dd02 100644 --- a/polly/test/ForwardOpTree/out-of-quota1.ll +++ b/polly/test/ForwardOpTree/out-of-quota1.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<optree>' -polly-print-optree -disable-output %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-optree>' -disable-output %s | FileCheck %s ; This used to loop infinitely because of UINT_MAX returned by ISL on out-of-quota. 
diff --git a/polly/test/IstAstInfo/OpenMP/multiple_loops_outer_parallel.ll b/polly/test/IstAstInfo/OpenMP/multiple_loops_outer_parallel.ll index a5102b3557f0c..ec1ccdce94508 100644 --- a/polly/test/IstAstInfo/OpenMP/multiple_loops_outer_parallel.ll +++ b/polly/test/IstAstInfo/OpenMP/multiple_loops_outer_parallel.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -polly-parallel -polly-parallel-force -disable-output < %s | FileCheck %s +; RUN: opt %loadPolly -polly-print-ast -polly-parallel -polly-parallel-force -disable-output < %s | FileCheck %s ; ; void jd(int *A) { ; CHECK: #pragma omp parallel for diff --git a/polly/test/IstAstInfo/OpenMP/nested_loop_both_parallel.ll b/polly/test/IstAstInfo/OpenMP/nested_loop_both_parallel.ll index d086b59f97a5a..9c00690605408 100644 --- a/polly/test/IstAstInfo/OpenMP/nested_loop_both_parallel.ll +++ b/polly/test/IstAstInfo/OpenMP/nested_loop_both_parallel.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -polly-parallel -disable-output < %s | FileCheck %s +; RUN: opt %loadPolly -polly-print-ast -polly-parallel -disable-output < %s | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" ; for (i = 0; i < 1024; i++) diff --git a/polly/test/IstAstInfo/OpenMP/nested_loop_both_parallel_parametric.ll b/polly/test/IstAstInfo/OpenMP/nested_loop_both_parallel_parametric.ll index 49a6b0531de56..356762a2ae5b9 100644 --- a/polly/test/IstAstInfo/OpenMP/nested_loop_both_parallel_parametric.ll +++ b/polly/test/IstAstInfo/OpenMP/nested_loop_both_parallel_parametric.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -polly-parallel -polly-parallel-force -disable-output < %s | FileCheck %s +; RUN: opt %loadPolly -polly-print-ast -polly-parallel -polly-parallel-force -disable-output < %s | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" ; int A[1024][1024]; ; void bar(int n) { diff --git a/polly/test/IstAstInfo/OpenMP/nested_loop_inner_parallel.ll b/polly/test/IstAstInfo/OpenMP/nested_loop_inner_parallel.ll index d2d7917b08528..066fc39def6ac 100644 --- a/polly/test/IstAstInfo/OpenMP/nested_loop_inner_parallel.ll +++ b/polly/test/IstAstInfo/OpenMP/nested_loop_inner_parallel.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -polly-parallel -polly-parallel-force -disable-output < %s | FileCheck %s +; RUN: opt %loadPolly -polly-print-ast -polly-parallel -polly-parallel-force -disable-output < %s | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" ; for (i = 0; i < n; i++) diff --git a/polly/test/IstAstInfo/OpenMP/nested_loop_outer_parallel.ll b/polly/test/IstAstInfo/OpenMP/nested_loop_outer_parallel.ll index c03189a211256..77dd55cb7605e 100644 --- a/polly/test/IstAstInfo/OpenMP/nested_loop_outer_parallel.ll +++ b/polly/test/IstAstInfo/OpenMP/nested_loop_outer_parallel.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -polly-parallel -disable-output < %s | FileCheck %s +; RUN: opt %loadPolly -polly-print-ast -polly-parallel -disable-output < %s | FileCheck %s target datalayout = 
"e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" ; for (i = 0; i < n; i++) diff --git a/polly/test/IstAstInfo/OpenMP/single_loop_param_non_parallel.ll b/polly/test/IstAstInfo/OpenMP/single_loop_param_non_parallel.ll index 6829211cc76b9..b61ebc9379b7f 100644 --- a/polly/test/IstAstInfo/OpenMP/single_loop_param_non_parallel.ll +++ b/polly/test/IstAstInfo/OpenMP/single_loop_param_non_parallel.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -polly-parallel -disable-output < %s | FileCheck %s +; RUN: opt %loadPolly -polly-print-ast -polly-parallel -disable-output < %s | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" ; for (i = 0; i < n; i++) diff --git a/polly/test/IstAstInfo/OpenMP/single_loop_param_parallel.ll b/polly/test/IstAstInfo/OpenMP/single_loop_param_parallel.ll index 7199a337d8a4f..5c92a91681867 100644 --- a/polly/test/IstAstInfo/OpenMP/single_loop_param_parallel.ll +++ b/polly/test/IstAstInfo/OpenMP/single_loop_param_parallel.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -polly-parallel -polly-parallel-force -disable-output < %s | FileCheck %s +; RUN: opt %loadPolly -polly-print-ast -polly-parallel -polly-parallel-force -disable-output < %s | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" ; for (i = 0; i < n; i++) diff --git a/polly/test/IstAstInfo/OpenMP/single_loop_param_parallel_computeout.ll b/polly/test/IstAstInfo/OpenMP/single_loop_param_parallel_computeout.ll index 41d35bfdb3631..352d879199675 100644 --- a/polly/test/IstAstInfo/OpenMP/single_loop_param_parallel_computeout.ll +++ b/polly/test/IstAstInfo/OpenMP/single_loop_param_parallel_computeout.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -polly-parallel -polly-dependences-computeout=1 -disable-output < %s | FileCheck %s +; RUN: opt %loadPolly -polly-print-ast -polly-parallel -polly-dependences-computeout=1 -disable-output < %s | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" ; for (i = 0; i < n; i++) diff --git a/polly/test/IstAstInfo/alias_checks_with_empty_context.ll b/polly/test/IstAstInfo/alias_checks_with_empty_context.ll index 356269cefad36..81c29536010b6 100644 --- a/polly/test/IstAstInfo/alias_checks_with_empty_context.ll +++ b/polly/test/IstAstInfo/alias_checks_with_empty_context.ll @@ -1,4 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s \ +; RUN: | FileCheck %s target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" diff --git a/polly/test/IstAstInfo/alias_simple_1.ll b/polly/test/IstAstInfo/alias_simple_1.ll index 039c5f74fabfe..904f55dc32ce4 100644 --- a/polly/test/IstAstInfo/alias_simple_1.ll +++ b/polly/test/IstAstInfo/alias_simple_1.ll @@ -1,8 +1,8 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -aa-pipeline= -disable-output < %s | FileCheck %s --check-prefix=NOAA -; RUN: opt %loadNPMPolly 
'-passes=polly-custom<ast>' -polly-print-ast -aa-pipeline=basic-aa -disable-output < %s | FileCheck %s --check-prefix=BASI -; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -aa-pipeline=tbaa -disable-output < %s | FileCheck %s --check-prefix=TBAA -; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -aa-pipeline=scev-aa -disable-output < %s | FileCheck %s --check-prefix=SCEV -; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -aa-pipeline=globals-aa -disable-output < %s | FileCheck %s --check-prefix=GLOB +; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -aa-pipeline= -disable-output < %s | FileCheck %s --check-prefix=NOAA +; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -aa-pipeline=basic-aa -disable-output < %s | FileCheck %s --check-prefix=BASI +; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -aa-pipeline=tbaa -disable-output < %s | FileCheck %s --check-prefix=TBAA +; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -aa-pipeline=scev-aa -disable-output < %s | FileCheck %s --check-prefix=SCEV +; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -aa-pipeline=globals-aa -disable-output < %s | FileCheck %s --check-prefix=GLOB ; ; int A[1024]; ; diff --git a/polly/test/IstAstInfo/alias_simple_2.ll b/polly/test/IstAstInfo/alias_simple_2.ll index 1783a04f02be9..5fae579995b23 100644 --- a/polly/test/IstAstInfo/alias_simple_2.ll +++ b/polly/test/IstAstInfo/alias_simple_2.ll @@ -1,9 +1,9 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -aa-pipeline= -disable-output < %s | FileCheck %s --check-prefix=NOAA -; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -aa-pipeline=basic-aa -disable-output < %s | FileCheck %s --check-prefix=BASI -; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -aa-pipeline=tbaa -disable-output < %s | FileCheck %s --check-prefix=TBAA -; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -aa-pipeline=scev-aa -disable-output < %s | FileCheck %s --check-prefix=SCEV -; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -aa-pipeline=globals-aa -disable-output < %s | FileCheck %s --check-prefix=GLOB -; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -aa-pipeline=globals-aa -polly-allow-nonaffine -disable-output < %s | FileCheck %s --check-prefix=NONAFFINE +; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -aa-pipeline= -disable-output < %s | FileCheck %s --check-prefix=NOAA +; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -aa-pipeline=basic-aa -disable-output < %s | FileCheck %s --check-prefix=BASI +; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -aa-pipeline=tbaa -disable-output < %s | FileCheck %s --check-prefix=TBAA +; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -aa-pipeline=scev-aa -disable-output < %s | FileCheck %s --check-prefix=SCEV +; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -aa-pipeline=globals-aa -disable-output < %s | FileCheck %s --check-prefix=GLOB +; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -aa-pipeline=globals-aa -polly-allow-nonaffine -disable-output < %s | FileCheck %s --check-prefix=NONAFFINE ; ; int A[1024], B[1024]; ; diff --git a/polly/test/IstAstInfo/alias_simple_3.ll b/polly/test/IstAstInfo/alias_simple_3.ll index 8d507fb82cb2d..8599c29934744 100644 --- a/polly/test/IstAstInfo/alias_simple_3.ll +++ b/polly/test/IstAstInfo/alias_simple_3.ll @@ -1,8 +1,8 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -aa-pipeline= 
-disable-output < %s | FileCheck %s --check-prefix=NOAA -; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -aa-pipeline=basic-aa -disable-output < %s | FileCheck %s --check-prefix=BASI -; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -aa-pipeline=tbaa -disable-output < %s | FileCheck %s --check-prefix=TBAA -; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -aa-pipeline=scev-aa -disable-output < %s | FileCheck %s --check-prefix=SCEV -; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -aa-pipeline=globals-aa -disable-output < %s | FileCheck %s --check-prefix=GLOB +; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -aa-pipeline= -disable-output < %s | FileCheck %s --check-prefix=NOAA +; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -aa-pipeline=basic-aa -disable-output < %s | FileCheck %s --check-prefix=BASI +; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -aa-pipeline=tbaa -disable-output < %s | FileCheck %s --check-prefix=TBAA +; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -aa-pipeline=scev-aa -disable-output < %s | FileCheck %s --check-prefix=SCEV +; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -aa-pipeline=globals-aa -disable-output < %s | FileCheck %s --check-prefix=GLOB ; ; int A[1024]; ; float B[1024]; diff --git a/polly/test/IstAstInfo/aliasing_arrays_with_identical_base.ll b/polly/test/IstAstInfo/aliasing_arrays_with_identical_base.ll index 01b5372917358..dc21dc1f96a48 100644 --- a/polly/test/IstAstInfo/aliasing_arrays_with_identical_base.ll +++ b/polly/test/IstAstInfo/aliasing_arrays_with_identical_base.ll @@ -1,4 +1,6 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -disable-output -polly-invariant-load-hoisting < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s \ +; RUN: -polly-invariant-load-hoisting \ +; RUN: | FileCheck %s ; CHECK: if (1 && 1 && (&MemRef_X[1] <= &MemRef_BaseA[0] || &MemRef_BaseA[1024] <= &MemRef_X[0]) && (&MemRef_X[1] <= &MemRef_BaseB[0] || &MemRef_BaseB[1024] <= &MemRef_X[0])) diff --git a/polly/test/IstAstInfo/aliasing_multiple_alias_groups.ll b/polly/test/IstAstInfo/aliasing_multiple_alias_groups.ll index 3835c23fecddb..8d4adfa405f07 100644 --- a/polly/test/IstAstInfo/aliasing_multiple_alias_groups.ll +++ b/polly/test/IstAstInfo/aliasing_multiple_alias_groups.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -aa-pipeline= -disable-output < %s | FileCheck %s --check-prefix=NOAA -; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -aa-pipeline=tbaa -disable-output < %s | FileCheck %s --check-prefix=TBAA +; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -aa-pipeline= -disable-output < %s | FileCheck %s --check-prefix=NOAA +; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -aa-pipeline=tbaa -disable-output < %s | FileCheck %s --check-prefix=TBAA ; ; void jd(int *Int0, int *Int1, float *Float0, float *Float1) { ; for (int i = 0; i < 1024; i++) { diff --git a/polly/test/IstAstInfo/aliasing_parametric_simple_1.ll b/polly/test/IstAstInfo/aliasing_parametric_simple_1.ll index 71bac9a2bb141..be37b27b6e375 100644 --- a/polly/test/IstAstInfo/aliasing_parametric_simple_1.ll +++ b/polly/test/IstAstInfo/aliasing_parametric_simple_1.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -disable-output %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output %s | FileCheck %s ; ; 
void jd(int *A, int *B, int c) { ; for (int i = 0; i < 1024; i++) diff --git a/polly/test/IstAstInfo/aliasing_parametric_simple_2.ll b/polly/test/IstAstInfo/aliasing_parametric_simple_2.ll index e5ece1f57a85e..15550583340db 100644 --- a/polly/test/IstAstInfo/aliasing_parametric_simple_2.ll +++ b/polly/test/IstAstInfo/aliasing_parametric_simple_2.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s ; ; void jd(int *A, int *B, int c) { ; for (int i = 0; i < 1024; i++) diff --git a/polly/test/IstAstInfo/dependence_distance_constant.ll b/polly/test/IstAstInfo/dependence_distance_constant.ll index 43b13eef9a95b..9b7fb93f2f676 100644 --- a/polly/test/IstAstInfo/dependence_distance_constant.ll +++ b/polly/test/IstAstInfo/dependence_distance_constant.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s +; RUN: opt %loadPolly -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s ; ; void f(int *A, int N) { ; CHECK: #pragma minimal dependence distance: 1 diff --git a/polly/test/IstAstInfo/dependence_distance_minimal.ll b/polly/test/IstAstInfo/dependence_distance_minimal.ll index 35a503ce7eb8d..d69cc3f9fc3f8 100644 --- a/polly/test/IstAstInfo/dependence_distance_minimal.ll +++ b/polly/test/IstAstInfo/dependence_distance_minimal.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s | FileCheck %s ; ; The minimal dependence distance of the innermost loop should be 1 instead of 250. 
 ; CHECK: #pragma minimal dependence distance: 1
diff --git a/polly/test/IstAstInfo/dependence_distance_multiple_constant.ll b/polly/test/IstAstInfo/dependence_distance_multiple_constant.ll
index a7de5c4876385..bc21e9e07ad89 100644
--- a/polly/test/IstAstInfo/dependence_distance_multiple_constant.ll
+++ b/polly/test/IstAstInfo/dependence_distance_multiple_constant.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-stmt-granularity=bb '-passes=print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
 ;
 ; void f(int *restrict A, int *restrict B, int N) {
 ; CHECK: #pragma minimal dependence distance: 5
diff --git a/polly/test/IstAstInfo/dependence_distance_parametric.ll b/polly/test/IstAstInfo/dependence_distance_parametric.ll
index fa05e4c889031..fa569a8386b86 100644
--- a/polly/test/IstAstInfo/dependence_distance_parametric.ll
+++ b/polly/test/IstAstInfo/dependence_distance_parametric.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
 ;
 ; void f(int *A, int N, int c) {
 ; CHECK: #pragma minimal dependence distance: 1
diff --git a/polly/test/IstAstInfo/dependence_distance_parametric_expr.ll b/polly/test/IstAstInfo/dependence_distance_parametric_expr.ll
index 73f74b3bce0b1..7f280e0c542ca 100644
--- a/polly/test/IstAstInfo/dependence_distance_parametric_expr.ll
+++ b/polly/test/IstAstInfo/dependence_distance_parametric_expr.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
 ;
 ; void f(int *A, int N, int c, int v) {
 ; CHECK: #pragma minimal dependence distance: 1
diff --git a/polly/test/IstAstInfo/dependence_distance_varying.ll b/polly/test/IstAstInfo/dependence_distance_varying.ll
index e908954536600..d609c2f210f8d 100644
--- a/polly/test/IstAstInfo/dependence_distance_varying.ll
+++ b/polly/test/IstAstInfo/dependence_distance_varying.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
 ;
 ; void f(int *A, int N) {
 ; CHECK: #pragma minimal dependence distance: -(N % 2) + 2
diff --git a/polly/test/IstAstInfo/dependence_distance_varying_in_outer_loop.ll b/polly/test/IstAstInfo/dependence_distance_varying_in_outer_loop.ll
index 1668fc0515441..8ed3220353c1b 100644
--- a/polly/test/IstAstInfo/dependence_distance_varying_in_outer_loop.ll
+++ b/polly/test/IstAstInfo/dependence_distance_varying_in_outer_loop.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-canonicalize,print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
 ;
 ; void f(int *restrict A, int *restrict sum) {
 ; CHECK: #pragma minimal dependence distance: 1
diff --git a/polly/test/IstAstInfo/dependence_distance_varying_multiple.ll b/polly/test/IstAstInfo/dependence_distance_varying_multiple.ll
index 0d0aa8bea31d8..73768e9c308a4 100644
--- a/polly/test/IstAstInfo/dependence_distance_varying_multiple.ll
+++ b/polly/test/IstAstInfo/dependence_distance_varying_multiple.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-stmt-granularity=bb '-passes=print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
 ;
 ; void f(int *restrict A, int *restrict B, int *restrict C, int *restrict D,
 ;        int *restrict E, int N) {
diff --git a/polly/test/IstAstInfo/domain_bounded_only_with_context.ll b/polly/test/IstAstInfo/domain_bounded_only_with_context.ll
index 2ed94e59e8087..e2cf0bd9c0df2 100644
--- a/polly/test/IstAstInfo/domain_bounded_only_with_context.ll
+++ b/polly/test/IstAstInfo/domain_bounded_only_with_context.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s

 ; CHECK: {
 ; CHECK-NEXT: if (p <= -1 || p >= 1)
diff --git a/polly/test/IstAstInfo/non_affine_access.ll b/polly/test/IstAstInfo/non_affine_access.ll
index a285a8f032f5e..98e8d2db959f8 100644
--- a/polly/test/IstAstInfo/non_affine_access.ll
+++ b/polly/test/IstAstInfo/non_affine_access.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -polly-ast-print-accesses -polly-allow-nonaffine -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -polly-ast-print-accesses -polly-allow-nonaffine -disable-output < %s | FileCheck %s
 ;
 ; void non_affine_access(float A[]) {
 ; for (long i = 0; i < 1024; i++)
diff --git a/polly/test/IstAstInfo/reduction_clauses_multidimensional_access.ll b/polly/test/IstAstInfo/reduction_clauses_multidimensional_access.ll
index 3fefc74efbef0..697b6ca50d444 100644
--- a/polly/test/IstAstInfo/reduction_clauses_multidimensional_access.ll
+++ b/polly/test/IstAstInfo/reduction_clauses_multidimensional_access.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
 ;
 ; CHECK: #pragma known-parallel reduction (^ : MemRef_sum)
 ; void f(int N, int M, int P, int sum[P][M]) {
diff --git a/polly/test/IstAstInfo/reduction_clauses_onedimensional_access.ll b/polly/test/IstAstInfo/reduction_clauses_onedimensional_access.ll
index 41bd178c73c2a..c20a7d6db13c9 100644
--- a/polly/test/IstAstInfo/reduction_clauses_onedimensional_access.ll
+++ b/polly/test/IstAstInfo/reduction_clauses_onedimensional_access.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
 ;
 ; CHECK: #pragma known-parallel reduction (^ : MemRef_sum)
 ; void f(int N, int M, int *sum) {
diff --git a/polly/test/IstAstInfo/reduction_dependences_equal_non_reduction_dependences.ll b/polly/test/IstAstInfo/reduction_dependences_equal_non_reduction_dependences.ll
index 5aa8a0c244423..e6092f0b068f8 100644
--- a/polly/test/IstAstInfo/reduction_dependences_equal_non_reduction_dependences.ll
+++ b/polly/test/IstAstInfo/reduction_dependences_equal_non_reduction_dependences.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=polly-custom<ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
 ;
 ; This loopnest contains a reduction which imposes the same dependences as the
 ; accesses to the array A. We need to ensure we do __not__ parallelize anything
diff --git a/polly/test/IstAstInfo/reduction_different_reduction_clauses.ll b/polly/test/IstAstInfo/reduction_different_reduction_clauses.ll
index 91f7c9d9601bc..14de70f9357c3 100644
--- a/polly/test/IstAstInfo/reduction_different_reduction_clauses.ll
+++ b/polly/test/IstAstInfo/reduction_different_reduction_clauses.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=polly-custom<ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
 ;
 ; CHECK: #pragma simd reduction (+ : MemRef_sum{{[1,2]}}, MemRef_sum{{[1,2]}}) reduction (* : MemRef_prod) reduction (| : MemRef_or) reduction (& : MemRef_and)
 ; CHECK: #pragma known-parallel reduction (+ : MemRef_sum{{[1,2]}}, MemRef_sum{{[1,2]}}) reduction (* : MemRef_prod) reduction (| : MemRef_or) reduction (& : MemRef_and)
diff --git a/polly/test/IstAstInfo/reduction_in_one_dimension.ll b/polly/test/IstAstInfo/reduction_in_one_dimension.ll
index d0173bcd978ca..797115b6f8d70 100644
--- a/polly/test/IstAstInfo/reduction_in_one_dimension.ll
+++ b/polly/test/IstAstInfo/reduction_in_one_dimension.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
 ;
 ; Verify that we won't privatize anything in the outer dimension
 ;
diff --git a/polly/test/IstAstInfo/reduction_loop_reversal.ll b/polly/test/IstAstInfo/reduction_loop_reversal.ll
index d010e26f739a6..d30119787d8e0 100644
--- a/polly/test/IstAstInfo/reduction_loop_reversal.ll
+++ b/polly/test/IstAstInfo/reduction_loop_reversal.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
 ;
 ; CHECK-NOT: #pragma simd{{\s*$}}
 ; CHECK: #pragma simd reduction
diff --git a/polly/test/IstAstInfo/reduction_modulo_and_loop_reversal_schedule.ll b/polly/test/IstAstInfo/reduction_modulo_and_loop_reversal_schedule.ll
index 7f78badfcb93c..15fca884c2b63 100644
--- a/polly/test/IstAstInfo/reduction_modulo_and_loop_reversal_schedule.ll
+++ b/polly/test/IstAstInfo/reduction_modulo_and_loop_reversal_schedule.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
 ;
 ; CHECK: #pragma known-parallel reduction (+ : MemRef_A)
 ; CHECK-NEXT: for (int c0 = 0; c0 <= 2; c0 += 1) {
diff --git a/polly/test/IstAstInfo/reduction_modulo_and_loop_reversal_schedule_2.ll b/polly/test/IstAstInfo/reduction_modulo_and_loop_reversal_schedule_2.ll
index 42e9c3b19eb1b..44e9aa4d1e569 100644
--- a/polly/test/IstAstInfo/reduction_modulo_and_loop_reversal_schedule_2.ll
+++ b/polly/test/IstAstInfo/reduction_modulo_and_loop_reversal_schedule_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
 ;
 ; CHECK: #pragma known-parallel reduction
 ; CHECK: for (int c0 = 0; c0 <= 2; c0 += 1) {
diff --git a/polly/test/IstAstInfo/reduction_modulo_schedule.ll b/polly/test/IstAstInfo/reduction_modulo_schedule.ll
index 8bdd5299986eb..c39ffa591484d 100644
--- a/polly/test/IstAstInfo/reduction_modulo_schedule.ll
+++ b/polly/test/IstAstInfo/reduction_modulo_schedule.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
 ;
 ; CHECK: #pragma known-parallel reduction (+ : MemRef_A)
 ; CHECK-NEXT: for (int c0 = 0; c0 <= 2; c0 += 1) {
diff --git a/polly/test/IstAstInfo/reduction_modulo_schedule_multiple_dimensions.ll b/polly/test/IstAstInfo/reduction_modulo_schedule_multiple_dimensions.ll
index 4811069e4f399..266753555cab1 100644
--- a/polly/test/IstAstInfo/reduction_modulo_schedule_multiple_dimensions.ll
+++ b/polly/test/IstAstInfo/reduction_modulo_schedule_multiple_dimensions.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
 ;
 ; CHECK: #pragma known-parallel
 ; CHECK: for (int c0 = 0; c0 <= 1; c0 += 1)
diff --git a/polly/test/IstAstInfo/reduction_modulo_schedule_multiple_dimensions_2.ll b/polly/test/IstAstInfo/reduction_modulo_schedule_multiple_dimensions_2.ll
index 4f5ac24a0b005..d7f9029fd347a 100644
--- a/polly/test/IstAstInfo/reduction_modulo_schedule_multiple_dimensions_2.ll
+++ b/polly/test/IstAstInfo/reduction_modulo_schedule_multiple_dimensions_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
 ;
 ; Verify that the outer dimension doesn't carry reduction dependences
 ;
diff --git a/polly/test/IstAstInfo/reduction_modulo_schedule_multiple_dimensions_3.ll b/polly/test/IstAstInfo/reduction_modulo_schedule_multiple_dimensions_3.ll
index 472a04847ec95..f18060a2e20a8 100644
--- a/polly/test/IstAstInfo/reduction_modulo_schedule_multiple_dimensions_3.ll
+++ b/polly/test/IstAstInfo/reduction_modulo_schedule_multiple_dimensions_3.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
 ;
 ; Verify that the outer dimension doesn't carry reduction dependences
 ;
diff --git a/polly/test/IstAstInfo/reduction_modulo_schedule_multiple_dimensions_4.ll b/polly/test/IstAstInfo/reduction_modulo_schedule_multiple_dimensions_4.ll
index 2cc911d78234b..8e2a590c5f57c 100644
--- a/polly/test/IstAstInfo/reduction_modulo_schedule_multiple_dimensions_4.ll
+++ b/polly/test/IstAstInfo/reduction_modulo_schedule_multiple_dimensions_4.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
 ;
 ; Verify that the outer dimension doesn't carry reduction dependences
 ;
diff --git a/polly/test/IstAstInfo/reduction_modulo_schedule_multiple_dimensions_5.ll b/polly/test/IstAstInfo/reduction_modulo_schedule_multiple_dimensions_5.ll
index 1b2d0eb75c12c..b889db4819cd5 100644
--- a/polly/test/IstAstInfo/reduction_modulo_schedule_multiple_dimensions_5.ll
+++ b/polly/test/IstAstInfo/reduction_modulo_schedule_multiple_dimensions_5.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
 ;
 ; Verify that only the outer dimension needs privatization
 ;
diff --git a/polly/test/IstAstInfo/reduction_multiple_dimensions.ll b/polly/test/IstAstInfo/reduction_multiple_dimensions.ll
index 884cea7918031..2a8fd7a4f670e 100644
--- a/polly/test/IstAstInfo/reduction_multiple_dimensions.ll
+++ b/polly/test/IstAstInfo/reduction_multiple_dimensions.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
 ;
 ; CHECK-NOT:#pragma known-parallel reduction
 ; CHECK: #pragma known-parallel
diff --git a/polly/test/IstAstInfo/reduction_multiple_dimensions_2.ll b/polly/test/IstAstInfo/reduction_multiple_dimensions_2.ll
index 013a7d4f3ad27..25f2fa597e34e 100644
--- a/polly/test/IstAstInfo/reduction_multiple_dimensions_2.ll
+++ b/polly/test/IstAstInfo/reduction_multiple_dimensions_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
 ;
 ; CHECK-NOT:#pragma known-parallel reduction
 ; CHECK: #pragma known-parallel
diff --git a/polly/test/IstAstInfo/reduction_multiple_dimensions_3.ll b/polly/test/IstAstInfo/reduction_multiple_dimensions_3.ll
index 2dc6d8680b36a..0d6be9a9da9bf 100644
--- a/polly/test/IstAstInfo/reduction_multiple_dimensions_3.ll
+++ b/polly/test/IstAstInfo/reduction_multiple_dimensions_3.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
 ;
 ; CHECK-NOT:#pragma known-parallel reduction
 ; CHECK: #pragma known-parallel
diff --git a/polly/test/IstAstInfo/reduction_multiple_dimensions_4.ll b/polly/test/IstAstInfo/reduction_multiple_dimensions_4.ll
index dcd75945d25a8..8b537513cc8d7 100644
--- a/polly/test/IstAstInfo/reduction_multiple_dimensions_4.ll
+++ b/polly/test/IstAstInfo/reduction_multiple_dimensions_4.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
 ;
 ; CHECK-NOT:#pragma known-parallel reduction
 ; CHECK: #pragma known-parallel
diff --git a/polly/test/IstAstInfo/run-time-condition.ll b/polly/test/IstAstInfo/run-time-condition.ll
index 67fc4b74571da..44d3534f651ce 100644
--- a/polly/test/IstAstInfo/run-time-condition.ll
+++ b/polly/test/IstAstInfo/run-time-condition.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s

 ; for (i = 0; i < 1024; i++)
 ; A[i] = B[i];
diff --git a/polly/test/IstAstInfo/runtime_context_with_error_blocks.ll b/polly/test/IstAstInfo/runtime_context_with_error_blocks.ll
index d674f429c0d48..aef509a865b6a 100644
--- a/polly/test/IstAstInfo/runtime_context_with_error_blocks.ll
+++ b/polly/test/IstAstInfo/runtime_context_with_error_blocks.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -polly-invariant-load-hoisting=true -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -polly-invariant-load-hoisting=true -disable-output < %s | FileCheck %s
 ;
 ; Verify we do not simplify the runtime check to "true" due to the domain
 ; constraints as the test contains an error block that influenced the domains
diff --git a/polly/test/IstAstInfo/simple-run-time-condition.ll b/polly/test/IstAstInfo/simple-run-time-condition.ll
index 73a7c596cea0b..488cd180b899a 100644
--- a/polly/test/IstAstInfo/simple-run-time-condition.ll
+++ b/polly/test/IstAstInfo/simple-run-time-condition.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -polly-precise-inbounds -polly-precise-fold-accesses -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -polly-precise-inbounds -polly-precise-fold-accesses -disable-output < %s | FileCheck %s

 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"

diff --git a/polly/test/IstAstInfo/single_loop_strip_mine.ll b/polly/test/IstAstInfo/single_loop_strip_mine.ll
index f546972fb370c..afe6179188c01 100644
--- a/polly/test/IstAstInfo/single_loop_strip_mine.ll
+++ b/polly/test/IstAstInfo/single_loop_strip_mine.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-ast-print-accesses -polly-ast-detect-parallel '-passes=polly-custom<import-jscop;ast>' -polly-print-ast -disable-output < %s | FileCheck %s -check-prefix=CHECK-VECTOR
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-ast-print-accesses -polly-ast-detect-parallel '-passes=polly-import-jscop,print<polly-ast>' -disable-output < %s | FileCheck %s -check-prefix=CHECK-VECTOR

 ; for (i = 0; i < 1024; i++)
 ; A[i] = B[i];
diff --git a/polly/test/IstAstInfo/single_loop_uint_max_iterations.ll b/polly/test/IstAstInfo/single_loop_uint_max_iterations.ll
index c9ae9e8f4e52e..f614f90fc3fc9 100644
--- a/polly/test/IstAstInfo/single_loop_uint_max_iterations.ll
+++ b/polly/test/IstAstInfo/single_loop_uint_max_iterations.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s
 ; XFAIL: *

 ;#include "limits.h"
diff --git a/polly/test/IstAstInfo/single_loop_ull_max_iterations.ll b/polly/test/IstAstInfo/single_loop_ull_max_iterations.ll
index 45227160e8699..e91ea13278692 100644
--- a/polly/test/IstAstInfo/single_loop_ull_max_iterations.ll
+++ b/polly/test/IstAstInfo/single_loop_ull_max_iterations.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s
 ; XFAIL: *

 ;#include "limits.h"
diff --git a/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Bad-relation.ll b/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Bad-relation.ll
index 28b6a7ca12799..49a962592bb9d 100644
--- a/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Bad-relation.ll
+++ b/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Bad-relation.ll
@@ -1,4 +1,4 @@
-; RUN: not --crash opt %loadNPMPolly '-passes=polly-custom<import-jscop;ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s
+; RUN: not --crash opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s
 ;
 ; CHECK: expecting other token
 ;
diff --git a/polly/test/JSONExporter/ImportAccesses/ImportAccesses-No-accesses-key.ll b/polly/test/JSONExporter/ImportAccesses/ImportAccesses-No-accesses-key.ll
index f19a632815795..749b962b260f5 100644
--- a/polly/test/JSONExporter/ImportAccesses/ImportAccesses-No-accesses-key.ll
+++ b/polly/test/JSONExporter/ImportAccesses/ImportAccesses-No-accesses-key.ll
@@ -1,4 +1,4 @@
-; RUN: not --crash opt %loadNPMPolly '-passes=polly-custom<import-jscop;ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s
+; RUN: not --crash opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s
 ;
 ; CHECK: Statement from JScop file has no key name 'accesses' for index 1.
 ;
diff --git a/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Not-enough-MemAcc.ll b/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Not-enough-MemAcc.ll
index 77b9acfbb0989..1d97e3ebca625 100644
--- a/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Not-enough-MemAcc.ll
+++ b/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Not-enough-MemAcc.ll
@@ -1,4 +1,4 @@
-; RUN: not --crash opt %loadNPMPolly '-passes=polly-custom<import-jscop;ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s
+; RUN: not --crash opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s
 ;
 ; CHECK: The number of memory accesses in the JSop file and the number of memory accesses differ for index 0.
 ;
diff --git a/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Not-enough-statements.ll b/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Not-enough-statements.ll
index 0a06ff671c298..f4b739398f9f6 100644
--- a/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Not-enough-statements.ll
+++ b/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Not-enough-statements.ll
@@ -1,4 +1,4 @@
-; RUN: not --crash opt %loadNPMPolly '-passes=polly-custom<import-jscop;ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s
+; RUN: not --crash opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s
 ;
 ; CHECK: The number of indices and the number of statements differ.
 ;
diff --git a/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Relation-mispelled.ll b/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Relation-mispelled.ll
index 35b7af098ae42..1f5cda3518a2f 100644
--- a/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Relation-mispelled.ll
+++ b/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Relation-mispelled.ll
@@ -1,4 +1,4 @@
-; RUN: not --crash opt %loadNPMPolly '-passes=polly-custom<import-jscop;ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s
+; RUN: not --crash opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s
 ;
 ; CHECK: Memory access number 0 has no key name 'relation' for statement number 1.
 ;
diff --git a/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Statements-mispelled.ll b/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Statements-mispelled.ll
index 109665a85c607..0c750849b51eb 100644
--- a/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Statements-mispelled.ll
+++ b/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Statements-mispelled.ll
@@ -1,4 +1,4 @@
-; RUN: not --crash opt %loadNPMPolly '-passes=polly-custom<import-jscop;ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s
+; RUN: not --crash opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s
 ;
 ; CHECK: JScop file has no key name 'statements'.
 ;
diff --git a/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Undeclared-ScopArrayInfo.ll b/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Undeclared-ScopArrayInfo.ll
index f345d1c31796e..d8c9c3f4ab2ea 100644
--- a/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Undeclared-ScopArrayInfo.ll
+++ b/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Undeclared-ScopArrayInfo.ll
@@ -1,4 +1,4 @@
-; RUN: not --crash opt %loadNPMPolly '-passes=polly-custom<import-jscop;ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s
+; RUN: not --crash opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s
 ;
 ; CHECK: JScop file contains access function with undeclared ScopArrayInfo
 ;
diff --git a/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Wrong-number-dimensions.ll b/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Wrong-number-dimensions.ll
index a66d5c8c69b55..f8d7cb8c1453e 100644
--- a/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Wrong-number-dimensions.ll
+++ b/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Wrong-number-dimensions.ll
@@ -1,4 +1,4 @@
-; RUN: not --crash opt %loadNPMPolly '-passes=polly-custom<import-jscop;ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s
+; RUN: not --crash opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s
 ;
 ; CHECK: JScop file changes the number of parameter dimensions.
 ;
diff --git a/polly/test/JSONExporter/ImportArrays/ImportArrays-Mispelled-type.ll b/polly/test/JSONExporter/ImportArrays/ImportArrays-Mispelled-type.ll
index ae0b4edffb5fc..6e13a5e413d76 100644
--- a/polly/test/JSONExporter/ImportArrays/ImportArrays-Mispelled-type.ll
+++ b/polly/test/JSONExporter/ImportArrays/ImportArrays-Mispelled-type.ll
@@ -1,4 +1,4 @@
-; RUN: not --crash opt %loadNPMPolly '-passes=polly-custom<import-jscop>' -polly-import-jscop-postfix=transformed -disable-output < %s 2>&1 | FileCheck %s
+; RUN: not --crash opt %loadNPMPolly -passes=polly-import-jscop -polly-import-jscop-postfix=transformed -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; CHECK: Array has not a valid type.
 ;
diff --git a/polly/test/JSONExporter/ImportArrays/ImportArrays-Negative-size.ll b/polly/test/JSONExporter/ImportArrays/ImportArrays-Negative-size.ll
index 6c434e15a38d2..7f6578776e0bd 100644
--- a/polly/test/JSONExporter/ImportArrays/ImportArrays-Negative-size.ll
+++ b/polly/test/JSONExporter/ImportArrays/ImportArrays-Negative-size.ll
@@ -1,4 +1,4 @@
-; RUN: not --crash opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<import-jscop>' -polly-import-jscop-postfix=transformed -disable-output < %s 2>&1 | FileCheck %s
+; RUN: not --crash opt %loadNPMPolly -polly-stmt-granularity=bb -passes=polly-import-jscop -polly-import-jscop-postfix=transformed -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; #define Ni 1056
 ; #define Nj 1056
diff --git a/polly/test/JSONExporter/ImportArrays/ImportArrays-No-name.ll b/polly/test/JSONExporter/ImportArrays/ImportArrays-No-name.ll
index b004c4725176a..e698bdc488c2c 100644
--- a/polly/test/JSONExporter/ImportArrays/ImportArrays-No-name.ll
+++ b/polly/test/JSONExporter/ImportArrays/ImportArrays-No-name.ll
@@ -1,4 +1,4 @@
-; RUN: not --crash opt %loadNPMPolly '-passes=polly-custom<import-jscop>' -polly-import-jscop-postfix=transformed -disable-output < %s 2>&1 | FileCheck %s
+; RUN: not --crash opt %loadNPMPolly -passes=polly-import-jscop -polly-import-jscop-postfix=transformed -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; CHECK: Array has no key 'name'.
 ;
diff --git a/polly/test/JSONExporter/ImportArrays/ImportArrays-No-sizes-key.ll b/polly/test/JSONExporter/ImportArrays/ImportArrays-No-sizes-key.ll
index 5f62a457f63eb..f130b6556e3e5 100644
--- a/polly/test/JSONExporter/ImportArrays/ImportArrays-No-sizes-key.ll
+++ b/polly/test/JSONExporter/ImportArrays/ImportArrays-No-sizes-key.ll
@@ -1,4 +1,4 @@
-; RUN: not --crash opt %loadNPMPolly '-passes=polly-custom<import-jscop>' -polly-import-jscop-postfix=transformed -disable-output < %s 2>&1 | FileCheck %s
+; RUN: not --crash opt %loadNPMPolly -passes=polly-import-jscop -polly-import-jscop-postfix=transformed -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; CHECK: Array has no key 'sizes'.
 ;
diff --git a/polly/test/JSONExporter/ImportArrays/ImportArrays-No-type-key.ll b/polly/test/JSONExporter/ImportArrays/ImportArrays-No-type-key.ll
index 029fde10f5a4a..68d2e50c6730d 100644
--- a/polly/test/JSONExporter/ImportArrays/ImportArrays-No-type-key.ll
+++ b/polly/test/JSONExporter/ImportArrays/ImportArrays-No-type-key.ll
@@ -1,4 +1,4 @@
-; RUN: not --crash opt %loadNPMPolly '-passes=polly-custom<import-jscop>' -polly-import-jscop-postfix=transformed -disable-output < %s 2>&1 | FileCheck %s
+; RUN: not --crash opt %loadNPMPolly -passes=polly-import-jscop -polly-import-jscop-postfix=transformed -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; CHECK: Array has no key 'type'.
 ;
diff --git a/polly/test/JSONExporter/ImportContext/ImportContext-Context-mispelled.ll b/polly/test/JSONExporter/ImportContext/ImportContext-Context-mispelled.ll
index 9ac371b655146..94c77dc2a0138 100644
--- a/polly/test/JSONExporter/ImportContext/ImportContext-Context-mispelled.ll
+++ b/polly/test/JSONExporter/ImportContext/ImportContext-Context-mispelled.ll
@@ -1,4 +1,4 @@
-; RUN: not --crash opt %loadNPMPolly '-passes=polly-custom<import-jscop;ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s
+; RUN: not --crash opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s
 ;
 ; CHECK: JScop file has no key named 'context'.
 ;
diff --git a/polly/test/JSONExporter/ImportContext/ImportContext-Not-parameter-set.ll b/polly/test/JSONExporter/ImportContext/ImportContext-Not-parameter-set.ll
index 82afcd95c871f..c20d5c02d662e 100644
--- a/polly/test/JSONExporter/ImportContext/ImportContext-Not-parameter-set.ll
+++ b/polly/test/JSONExporter/ImportContext/ImportContext-Not-parameter-set.ll
@@ -1,4 +1,4 @@
-; RUN: not --crash opt %loadNPMPolly '-passes=polly-custom<import-jscop;ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s
+; RUN: not --crash opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s
 ;
 ; CHECK: The isl_set is not a parameter set.
 ;
diff --git a/polly/test/JSONExporter/ImportContext/ImportContext-Unvalid-Context.ll b/polly/test/JSONExporter/ImportContext/ImportContext-Unvalid-Context.ll
index 0308452c6f955..92f4d61212e93 100644
--- a/polly/test/JSONExporter/ImportContext/ImportContext-Unvalid-Context.ll
+++ b/polly/test/JSONExporter/ImportContext/ImportContext-Unvalid-Context.ll
@@ -1,4 +1,4 @@
-; RUN: not --crash opt %loadNPMPolly '-passes=polly-custom<import-jscop;ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s
+; RUN: not --crash opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s
 ;
 ; CHECK: unexpected isl_token
 ;
diff --git a/polly/test/JSONExporter/ImportContext/ImportContext-Wrong-dimension.ll b/polly/test/JSONExporter/ImportContext/ImportContext-Wrong-dimension.ll
index debb9bc604110..89668d8d573b1 100644
--- a/polly/test/JSONExporter/ImportContext/ImportContext-Wrong-dimension.ll
+++ b/polly/test/JSONExporter/ImportContext/ImportContext-Wrong-dimension.ll
@@ -1,4 +1,4 @@
-; RUN: not --crash opt %loadNPMPolly '-passes=polly-custom<import-jscop;ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s
+; RUN: not --crash opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s
 ;
 ; CHECK: Imported context has the wrong number of parameters : Found 2 Expected 1
 ;
diff --git a/polly/test/JSONExporter/ImportSchedule/ImportSchedule-No-schedule-key.ll b/polly/test/JSONExporter/ImportSchedule/ImportSchedule-No-schedule-key.ll
index 6eee0056ba0b5..efe15c14ce90d 100644
--- a/polly/test/JSONExporter/ImportSchedule/ImportSchedule-No-schedule-key.ll
+++ b/polly/test/JSONExporter/ImportSchedule/ImportSchedule-No-schedule-key.ll
@@ -1,4 +1,4 @@
-; RUN: not --crash opt %loadNPMPolly '-passes=polly-custom<import-jscop;ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s
+; RUN: not --crash opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s
 ;
 ; CHECK: Statement 0 has no 'schedule' key.
 ;
diff --git a/polly/test/JSONExporter/ImportSchedule/ImportSchedule-Schedule-not-valid.ll b/polly/test/JSONExporter/ImportSchedule/ImportSchedule-Schedule-not-valid.ll
index 59feb0085e6de..db516f6d7d335 100644
--- a/polly/test/JSONExporter/ImportSchedule/ImportSchedule-Schedule-not-valid.ll
+++ b/polly/test/JSONExporter/ImportSchedule/ImportSchedule-Schedule-not-valid.ll
@@ -1,4 +1,4 @@
-; RUN: not --crash opt %loadNPMPolly '-passes=polly-custom<import-jscop;ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s
+; RUN: not --crash opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s
 ;
 ; CHECK: expecting other token
 ;
diff --git a/polly/test/JSONExporter/ImportSchedule/ImportSchedule-Statements-mispelled.ll b/polly/test/JSONExporter/ImportSchedule/ImportSchedule-Statements-mispelled.ll
index 78d5243d34e00..b93c984d7d9dd 100644
--- a/polly/test/JSONExporter/ImportSchedule/ImportSchedule-Statements-mispelled.ll
+++ b/polly/test/JSONExporter/ImportSchedule/ImportSchedule-Statements-mispelled.ll
@@ -1,4 +1,4 @@
-; RUN: not --crash opt %loadNPMPolly '-passes=polly-custom<import-jscop;ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s
+; RUN: not --crash opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s
 ;
 ; CHECK: JScop file has no key name 'statements'.
 ;
diff --git a/polly/test/JSONExporter/ImportSchedule/ImportSchedule-Wrong-number-statements.ll b/polly/test/JSONExporter/ImportSchedule/ImportSchedule-Wrong-number-statements.ll
index 877547c8f317f..3fa14c64cd639 100644
--- a/polly/test/JSONExporter/ImportSchedule/ImportSchedule-Wrong-number-statements.ll
+++ b/polly/test/JSONExporter/ImportSchedule/ImportSchedule-Wrong-number-statements.ll
@@ -1,4 +1,4 @@
-; RUN: not --crash opt %loadNPMPolly '-passes=polly-custom<import-jscop;ast>' -polly-print-ast -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s
+; RUN: not --crash opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s
 ;
 ; CHECK: The number of indices and the number of statements differ.
 ;
diff --git a/polly/test/MaximalStaticExpansion/load_after_store_same_statement.ll b/polly/test/MaximalStaticExpansion/load_after_store_same_statement.ll
index 9f999204f59bf..1d81ff7ef2dc8 100644
--- a/polly/test/MaximalStaticExpansion/load_after_store_same_statement.ll
+++ b/polly/test/MaximalStaticExpansion/load_after_store_same_statement.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<mse>' -polly-print-mse -disable-output < %s | FileCheck %s
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<mse>' -polly-print-mse -pass-remarks-analysis=polly-mse -disable-output < %s 2>&1 | FileCheck %s --check-prefix=MSE
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb "-passes=scop(print<polly-mse>)" -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb "-passes=scop(print<polly-mse>)" -pass-remarks-analysis="polly-mse" -disable-output < %s 2>&1 | FileCheck %s --check-prefix=MSE
 ;
 ; Verify that the expansion of an array with load after store in a same statement is not done.
 ;
diff --git a/polly/test/MaximalStaticExpansion/read_from_original.ll b/polly/test/MaximalStaticExpansion/read_from_original.ll
index 1a733c113626d..57017381c661a 100644
--- a/polly/test/MaximalStaticExpansion/read_from_original.ll
+++ b/polly/test/MaximalStaticExpansion/read_from_original.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<mse>' -polly-print-mse -disable-output < %s | FileCheck %s
-; RUN: opt %loadNPMPolly '-passes=polly-custom<mse>' -polly-print-mse -pass-remarks-analysis=polly-mse -disable-output < %s 2>&1 | FileCheck %s --check-prefix=MSE
+; RUN: opt %loadNPMPolly "-passes=scop(print<polly-mse>)" -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly "-passes=scop(print<polly-mse>)" -pass-remarks-analysis="polly-mse" -disable-output < %s 2>&1 | FileCheck %s --check-prefix=MSE
 ;
 ; Verify that Polly detects problems and does not expand the array
 ;
diff --git a/polly/test/MaximalStaticExpansion/too_many_writes.ll b/polly/test/MaximalStaticExpansion/too_many_writes.ll
index a7aa162aa83da..7e33de17a1749 100644
--- a/polly/test/MaximalStaticExpansion/too_many_writes.ll
+++ b/polly/test/MaximalStaticExpansion/too_many_writes.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<mse>' -polly-print-mse -disable-output < %s | FileCheck %s
-; RUN: opt %loadNPMPolly '-passes=polly-custom<mse>' -polly-print-mse -pass-remarks-analysis=polly-mse -disable-output < %s 2>&1 | FileCheck %s --check-prefix=MSE
+; RUN: opt %loadNPMPolly "-passes=scop(print<polly-mse>)" -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly "-passes=scop(print<polly-mse>)" -pass-remarks-analysis="polly-mse" -disable-output < %s 2>&1 | FileCheck %s --check-prefix=MSE
 ;
 ; Verify that Polly detects problems and does not expand the array
 ;
diff --git a/polly/test/MaximalStaticExpansion/working_deps_between_inners.ll b/polly/test/MaximalStaticExpansion/working_deps_between_inners.ll
index 06e08c43e3492..355fc02600d54 100644
--- a/polly/test/MaximalStaticExpansion/working_deps_between_inners.ll
+++ b/polly/test/MaximalStaticExpansion/working_deps_between_inners.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<mse>' -polly-print-mse -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly "-passes=scop(print<polly-mse>)" -disable-output < %s | FileCheck %s
 ;
 ; Verify that the accesses are correctly expanded for MemoryKind::Array
 ;
diff --git a/polly/test/MaximalStaticExpansion/working_deps_between_inners_phi.ll b/polly/test/MaximalStaticExpansion/working_deps_between_inners_phi.ll
index 076f47143dbcc..930539547cc97 100644
--- a/polly/test/MaximalStaticExpansion/working_deps_between_inners_phi.ll
+++ b/polly/test/MaximalStaticExpansion/working_deps_between_inners_phi.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<mse>' -polly-print-mse -disable-output < %s | FileCheck %s
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<mse>' -polly-print-mse -pass-remarks-analysis=polly-mse -disable-output < %s 2>&1 | FileCheck %s --check-prefix=MSE
+; RUN: opt %loadNPMPolly "-passes=scop(print<polly-mse>)" -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb "-passes=scop(print<polly-mse>)" -pass-remarks-analysis="polly-mse" -disable-output < %s 2>&1 | FileCheck %s --check-prefix=MSE
 ;
 ; Verify that the accesses are correctly expanded for MemoryKind::Array and MemoryKind::PHI.
 ; tmp_06_phi is not expanded because it need copy in.
diff --git a/polly/test/MaximalStaticExpansion/working_expansion.ll b/polly/test/MaximalStaticExpansion/working_expansion.ll
index 2b040f3f1f4e3..a055e50225e91 100644
--- a/polly/test/MaximalStaticExpansion/working_expansion.ll
+++ b/polly/test/MaximalStaticExpansion/working_expansion.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<mse>' -polly-print-mse -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly "-passes=scop(print<polly-mse>)" -disable-output < %s | FileCheck %s
 ;
 ; Verify that the accesses are correctly expanded for MemoryKind::Array
 ;
diff --git a/polly/test/MaximalStaticExpansion/working_expansion_multiple_dependences_per_statement.ll b/polly/test/MaximalStaticExpansion/working_expansion_multiple_dependences_per_statement.ll
index f863c0e1d6edf..77338c9aac200 100644
--- a/polly/test/MaximalStaticExpansion/working_expansion_multiple_dependences_per_statement.ll
+++ b/polly/test/MaximalStaticExpansion/working_expansion_multiple_dependences_per_statement.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<mse>' -polly-print-mse -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb "-passes=scop(print<polly-mse>)" -disable-output < %s | FileCheck %s
 ;
 ; Verify that the accesses are correctly expanded
 ;
diff --git a/polly/test/MaximalStaticExpansion/working_expansion_multiple_instruction_per_statement.ll b/polly/test/MaximalStaticExpansion/working_expansion_multiple_instruction_per_statement.ll
index a823bdb4e7682..9cfa5536072b7 100644
--- a/polly/test/MaximalStaticExpansion/working_expansion_multiple_instruction_per_statement.ll
+++ b/polly/test/MaximalStaticExpansion/working_expansion_multiple_instruction_per_statement.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<mse>' -polly-print-mse -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb "-passes=scop(print<polly-mse>)" -disable-output < %s | FileCheck %s
 ;
 ; Verify that the accesses are correctly expanded
 ;
diff --git a/polly/test/MaximalStaticExpansion/working_phi_expansion.ll b/polly/test/MaximalStaticExpansion/working_phi_expansion.ll
index 0898f99c896d4..63e4d48046275 100644
--- a/polly/test/MaximalStaticExpansion/working_phi_expansion.ll
+++ b/polly/test/MaximalStaticExpansion/working_phi_expansion.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<mse>' -polly-print-mse -disable-output < %s | FileCheck %s
-; RUN: opt %loadNPMPolly '-passes=polly-custom<mse>' -polly-print-mse -pass-remarks-analysis=polly-mse -disable-output < %s 2>&1 | FileCheck %s --check-prefix=MSE
+; RUN: opt %loadNPMPolly "-passes=scop(print<polly-mse>)" -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly "-passes=scop(print<polly-mse>)" -pass-remarks-analysis="polly-mse" -disable-output < %s 2>&1 | FileCheck %s --check-prefix=MSE
 ;
 ; Verify that the accesses are correctly expanded for MemoryKind::PHI
 ; tmp_04 is not expanded because it need copy-in.
diff --git a/polly/test/MaximalStaticExpansion/working_phi_two_scalars.ll b/polly/test/MaximalStaticExpansion/working_phi_two_scalars.ll
index 2a332ba7ce77b..87bd57abab8d1 100644
--- a/polly/test/MaximalStaticExpansion/working_phi_two_scalars.ll
+++ b/polly/test/MaximalStaticExpansion/working_phi_two_scalars.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<mse>' -polly-print-mse -disable-output < %s | FileCheck %s
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<mse>' -polly-print-mse -pass-remarks-analysis=polly-mse -disable-output < %s 2>&1 | FileCheck %s --check-prefix=MSE
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb "-passes=scop(print<polly-mse>)" -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb "-passes=scop(print<polly-mse>)" -pass-remarks-analysis="polly-mse" -disable-output < %s 2>&1 | FileCheck %s --check-prefix=MSE
 ;
 ; Verify that the accesses are correctly expanded for MemoryKind::PHI
 ; tmp_05 and tmp2_06 are not expanded because they need copy-in.
diff --git a/polly/test/MaximalStaticExpansion/working_value_expansion.ll b/polly/test/MaximalStaticExpansion/working_value_expansion.ll
index 77f20bb163a8b..cc28a78c38671 100644
--- a/polly/test/MaximalStaticExpansion/working_value_expansion.ll
+++ b/polly/test/MaximalStaticExpansion/working_value_expansion.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<mse>' -polly-print-mse -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly "-passes=scop(print<polly-mse>)" -disable-output < %s | FileCheck %s
 ;
 ; Verify that the accesses are correctly expanded for MemoryKind::Value
 ;
diff --git a/polly/test/PruneUnprofitable/prune_only_scalardeps.ll b/polly/test/PruneUnprofitable/prune_only_scalardeps.ll
index b4524c21a35ee..9cc2aecf002dd 100644
--- a/polly/test/PruneUnprofitable/prune_only_scalardeps.ll
+++ b/polly/test/PruneUnprofitable/prune_only_scalardeps.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-process-unprofitable=false -polly-unprofitable-scalar-accs=false '-passes=polly-custom<prune>' -disable-output -stats < %s 2>&1 | FileCheck -match-full-lines %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-process-unprofitable=false -polly-unprofitable-scalar-accs=false "-passes=scop(polly-prune-unprofitable)" -disable-output -stats < %s 2>&1 | FileCheck -match-full-lines %s
 ; REQUIRES: asserts
 ;
 ; Skip this SCoP for having scalar dependencies between all statements,
diff --git a/polly/test/ScheduleOptimizer/2012-03-16-Empty-Domain.ll b/polly/test/ScheduleOptimizer/2012-03-16-Empty-Domain.ll
index c8c006c94d1d4..38facb1688c46 100644
--- a/polly/test/ScheduleOptimizer/2012-03-16-Empty-Domain.ll
+++ b/polly/test/ScheduleOptimizer/2012-03-16-Empty-Domain.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<opt-isl>' -S < %s
+; RUN: opt %loadNPMPolly -passes=polly-opt-isl -S < %s
 target datalayout = "e-p:32:32:32-i64:64:64-i32:32:32-i16:16:16-i1:32:32-f64:64:64-f32:32:32-a0:0-n32"

 define void @sdbout_label() nounwind {
diff --git a/polly/test/ScheduleOptimizer/2013-04-11-Empty-Domain-two.ll b/polly/test/ScheduleOptimizer/2013-04-11-Empty-Domain-two.ll
index 23033faa380af..835986049899b 100644
--- a/polly/test/ScheduleOptimizer/2013-04-11-Empty-Domain-two.ll
+++ b/polly/test/ScheduleOptimizer/2013-04-11-Empty-Domain-two.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<opt-isl>' -S < %s
+; RUN: opt %loadNPMPolly -passes=polly-opt-isl -S < %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"

 ; Check that we handle statements with an empty iteration domain correctly.
diff --git a/polly/test/ScheduleOptimizer/GreedyFuse/fuse-double.ll b/polly/test/ScheduleOptimizer/GreedyFuse/fuse-double.ll
index fdaed3c543673..5e4ce8225a236 100644
--- a/polly/test/ScheduleOptimizer/GreedyFuse/fuse-double.ll
+++ b/polly/test/ScheduleOptimizer/GreedyFuse/fuse-double.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly -polly-reschedule=0 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=polly-custom<opt-isl>' -polly-print-opt-isl -disable-output < %s | FileCheck %s
-; RUN: opt %loadNPMPolly -polly-reschedule=1 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=polly-custom<opt-isl>' -polly-print-opt-isl -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-reschedule=0 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-reschedule=1 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s

 define void @func(i32 %n, ptr noalias nonnull %A, ptr noalias nonnull %B) {
 entry:
diff --git a/polly/test/ScheduleOptimizer/GreedyFuse/fuse-except-first.ll b/polly/test/ScheduleOptimizer/GreedyFuse/fuse-except-first.ll
index 65d495722c2bd..de4c387a1d879 100644
--- a/polly/test/ScheduleOptimizer/GreedyFuse/fuse-except-first.ll
+++ b/polly/test/ScheduleOptimizer/GreedyFuse/fuse-except-first.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly -polly-reschedule=0 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=polly-custom<opt-isl>' -polly-print-opt-isl -disable-output < %s | FileCheck %s --check-prefixes=CHECK,RAW
-; RUN: opt %loadNPMPolly -polly-reschedule=1 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=polly-custom<opt-isl>' -polly-print-opt-isl -disable-output < %s | FileCheck %s --check-prefixes=CHECK,OPT
+; RUN: opt %loadNPMPolly -polly-reschedule=0 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s --check-prefixes=CHECK,RAW
+; RUN: opt %loadNPMPolly -polly-reschedule=1 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s --check-prefixes=CHECK,OPT

 define void @func(i32 %n, ptr noalias nonnull %A, ptr noalias nonnull %B, i32 %k) {
 entry:
diff --git a/polly/test/ScheduleOptimizer/GreedyFuse/fuse-except-third.ll b/polly/test/ScheduleOptimizer/GreedyFuse/fuse-except-third.ll
index 06d55f46a977f..91bd549c3c7e4 100644
--- a/polly/test/ScheduleOptimizer/GreedyFuse/fuse-except-third.ll
+++ b/polly/test/ScheduleOptimizer/GreedyFuse/fuse-except-third.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly -polly-reschedule=0 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=polly-custom<opt-isl>' -polly-print-opt-isl -disable-output < %s | FileCheck %s --check-prefixes=CHECK,RAW
-; RUN: opt %loadNPMPolly -polly-reschedule=1 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=polly-custom<opt-isl>' -polly-print-opt-isl -disable-output < %s | FileCheck %s --check-prefixes=CHECK
+; RUN: opt %loadNPMPolly -polly-reschedule=0 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s --check-prefixes=CHECK,RAW
+; RUN: opt %loadNPMPolly -polly-reschedule=1 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s --check-prefixes=CHECK

 define void @func(i32 %n, ptr noalias nonnull %A, ptr noalias nonnull %B, i32 %k) {
 entry:
diff --git a/polly/test/ScheduleOptimizer/GreedyFuse/fuse-inner-carried.ll b/polly/test/ScheduleOptimizer/GreedyFuse/fuse-inner-carried.ll
index 0af703ccf5ffe..8b69d9e12c0fe 100644
--- a/polly/test/ScheduleOptimizer/GreedyFuse/fuse-inner-carried.ll
+++ b/polly/test/ScheduleOptimizer/GreedyFuse/fuse-inner-carried.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly -polly-reschedule=0 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=polly-custom<opt-isl>' -polly-print-opt-isl -disable-output < %s | FileCheck %s --check-prefixes=CHECK,RAW
-; RUN: opt %loadNPMPolly -polly-reschedule=1 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=polly-custom<opt-isl>' -polly-print-opt-isl -disable-output < %s | FileCheck %s --check-prefixes=CHECK,OPT
+; RUN: opt %loadNPMPolly -polly-reschedule=0 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s --check-prefixes=CHECK,RAW
+; RUN: opt %loadNPMPolly -polly-reschedule=1 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s --check-prefixes=CHECK,OPT

 define void @func(i32 %n, ptr noalias nonnull %A) {
 entry:
diff --git a/polly/test/ScheduleOptimizer/GreedyFuse/fuse-inner-third.ll b/polly/test/ScheduleOptimizer/GreedyFuse/fuse-inner-third.ll
index ca6840b900e7f..49d1124740340 100644
--- a/polly/test/ScheduleOptimizer/GreedyFuse/fuse-inner-third.ll
+++ b/polly/test/ScheduleOptimizer/GreedyFuse/fuse-inner-third.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly -polly-reschedule=0 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=polly-custom<opt-isl>' -polly-print-opt-isl -disable-output < %s | FileCheck %s --check-prefixes=CHECK,RAW
-; RUN: opt %loadNPMPolly -polly-reschedule=1 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=polly-custom<opt-isl>' -polly-print-opt-isl -disable-output < %s | FileCheck %s --check-prefixes=CHECK
+; RUN: opt %loadNPMPolly -polly-reschedule=0 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s --check-prefixes=CHECK,RAW
+; RUN: opt %loadNPMPolly -polly-reschedule=1 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s --check-prefixes=CHECK

 define void @func(i32 %n, ptr noalias nonnull %A, ptr noalias nonnull %B, i32 %k) {
 entry:
diff --git a/polly/test/ScheduleOptimizer/GreedyFuse/fuse-inner.ll b/polly/test/ScheduleOptimizer/GreedyFuse/fuse-inner.ll
index f96e4baba71eb..a449a2fda9ba3 100644
--- a/polly/test/ScheduleOptimizer/GreedyFuse/fuse-inner.ll
+++ b/polly/test/ScheduleOptimizer/GreedyFuse/fuse-inner.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly -polly-reschedule=0 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=polly-custom<opt-isl>' -polly-print-opt-isl -disable-output < %s | FileCheck %s
-; RUN: opt %loadNPMPolly -polly-reschedule=1 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=polly-custom<opt-isl>' -polly-print-opt-isl -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-reschedule=0 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-reschedule=1 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s

 define void @func(i32 %n, ptr noalias nonnull %A) {
 entry:
diff --git a/polly/test/ScheduleOptimizer/GreedyFuse/fuse-simple.ll b/polly/test/ScheduleOptimizer/GreedyFuse/fuse-simple.ll
index 229d13aaf1a4d..798e9b9a7c14f 100644
--- a/polly/test/ScheduleOptimizer/GreedyFuse/fuse-simple.ll
+++ b/polly/test/ScheduleOptimizer/GreedyFuse/fuse-simple.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly -polly-reschedule=0 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=polly-custom<opt-isl>' -polly-print-opt-isl -disable-output < %s | FileCheck %s
-; RUN: opt %loadNPMPolly -polly-reschedule=1 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=polly-custom<opt-isl>' -polly-print-opt-isl -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-reschedule=0 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-reschedule=1 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s

 define void @func(i32 %n, ptr noalias nonnull %A) {
 entry:
diff --git a/polly/test/ScheduleOptimizer/GreedyFuse/nofuse-simple.ll b/polly/test/ScheduleOptimizer/GreedyFuse/nofuse-simple.ll
index 9bc9a25ac588e..4d0ccc988a5cc 100644
--- a/polly/test/ScheduleOptimizer/GreedyFuse/nofuse-simple.ll
+++ b/polly/test/ScheduleOptimizer/GreedyFuse/nofuse-simple.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly -polly-reschedule=0 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=polly-custom<opt-isl>' -polly-print-opt-isl -disable-output < %s | FileCheck %s
-; RUN: opt %loadNPMPolly -polly-reschedule=1 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=polly-custom<opt-isl>' -polly-print-opt-isl -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-reschedule=0 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-reschedule=1 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s

 ; This could theoretically be fused by adjusting the offset of the second loop by %k (instead of relying on schedule dimensions).
diff --git a/polly/test/ScheduleOptimizer/GreedyFuse/nofuse-with-middle.ll b/polly/test/ScheduleOptimizer/GreedyFuse/nofuse-with-middle.ll
index 5b0cefbe686f6..bf470b91a7022 100644
--- a/polly/test/ScheduleOptimizer/GreedyFuse/nofuse-with-middle.ll
+++ b/polly/test/ScheduleOptimizer/GreedyFuse/nofuse-with-middle.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly -polly-reschedule=0 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=polly-custom<opt-isl>' -polly-print-opt-isl -disable-output < %s | FileCheck %s
-; RUN: opt %loadNPMPolly -polly-reschedule=1 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=polly-custom<opt-isl>' -polly-print-opt-isl -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-reschedule=0 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-reschedule=1 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s

 define void @func(i32 %n, ptr noalias nonnull %A, ptr noalias nonnull %B, i32 %k) {
 entry:
diff --git a/polly/test/ScheduleOptimizer/ManualOptimization/disable_nonforced.ll b/polly/test/ScheduleOptimizer/ManualOptimization/disable_nonforced.ll
index 2225f05f6717d..b0f75dd50ef83 100644
--- a/polly/test/ScheduleOptimizer/ManualOptimization/disable_nonforced.ll
+++ b/polly/test/ScheduleOptimizer/ManualOptimization/disable_nonforced.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<opt-isl>' -polly-print-opt-isl -disable-output < %s | FileCheck %s -match-full-lines
+; RUN: opt %loadNPMPolly '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s -match-full-lines
 ;
 ; Check that the disable_nonforced metadata is honored; optimization
 ; heuristics/rescheduling must not be applied.
diff --git a/polly/test/ScheduleOptimizer/ManualOptimization/distribute_heuristic.ll b/polly/test/ScheduleOptimizer/ManualOptimization/distribute_heuristic.ll
index 4add219214aa3..900360d7533f8 100644
--- a/polly/test/ScheduleOptimizer/ManualOptimization/distribute_heuristic.ll
+++ b/polly/test/ScheduleOptimizer/ManualOptimization/distribute_heuristic.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly -polly-reschedule=0 -polly-pragma-based-opts=1 '-passes=polly-custom<opt-isl>' -polly-print-opt-isl -disable-output < %s | FileCheck %s --match-full-lines --check-prefix=ON
-; RUN: opt %loadNPMPolly -polly-reschedule=0 -polly-pragma-based-opts=0 '-passes=polly-custom<opt-isl>' -polly-print-opt-isl -disable-output < %s | FileCheck %s --match-full-lines --check-prefix=OFF
+; RUN: opt %loadNPMPolly -polly-reschedule=0 -polly-pragma-based-opts=1 '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s --match-full-lines --check-prefix=ON
+; RUN: opt %loadNPMPolly -polly-reschedule=0 -polly-pragma-based-opts=0 '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s --match-full-lines --check-prefix=OFF
 ;
 define void @func(i32 %n, ptr noalias nonnull %A, ptr noalias nonnull %B) {
 entry:
diff --git a/polly/test/ScheduleOptimizer/ManualOptimization/distribute_illegal_looploc.ll b/polly/test/ScheduleOptimizer/ManualOptimization/distribute_illegal_looploc.ll
index d59f9e58e2785..d45b62433dbbc 100644
--- a/polly/test/ScheduleOptimizer/ManualOptimization/distribute_illegal_looploc.ll
+++ b/polly/test/ScheduleOptimizer/ManualOptimization/distribute_illegal_looploc.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<opt-isl>' -polly-reschedule=0 -polly-pragma-based-opts=1 -disable-output < %s 2>&1 | FileCheck %s --match-full-lines
+; RUN: opt %loadNPMPolly -passes=polly-opt-isl -polly-reschedule=0 -polly-pragma-based-opts=1 -disable-output < %s 2>&1 | FileCheck %s --match-full-lines
 ;
 ; CHECK: warning: distribute_illegal.c:2:3: not applying loop fission/distribution: cannot ensure semantic equivalence due to possible dependency violations
 ;
diff --git a/polly/test/ScheduleOptimizer/ManualOptimization/distribute_illegal_pragmaloc.ll b/polly/test/ScheduleOptimizer/ManualOptimization/distribute_illegal_pragmaloc.ll
index a1caaf5db5a61..d835e66693fb4 100644
--- a/polly/test/ScheduleOptimizer/ManualOptimization/distribute_illegal_pragmaloc.ll
+++ b/polly/test/ScheduleOptimizer/ManualOptimization/distribute_illegal_pragmaloc.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<opt-isl>' -polly-reschedule=0 -polly-pragma-based-opts=1 -disable-output < %s 2>&1 | FileCheck %s --match-full-lines
+; RUN: opt %loadNPMPolly -passes=polly-opt-isl -polly-reschedule=0 -polly-pragma-based-opts=1 -disable-output < %s 2>&1 | FileCheck %s --match-full-lines
 ;
 ; CHECK: warning: distribute_illegal.c:1:42: not applying loop fission/distribution: cannot ensure semantic equivalence due to possible dependency violations
 ;
diff --git a/polly/test/ScheduleOptimizer/ManualOptimization/unroll_disable.ll b/polly/test/ScheduleOptimizer/ManualOptimization/unroll_disable.ll
index b05710203fd37..a5781a7f60365 100644
--- a/polly/test/ScheduleOptimizer/ManualOptimization/unroll_disable.ll
+++ b/polly/test/ScheduleOptimizer/ManualOptimization/unroll_disable.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-pragma-based-opts=1 '-passes=polly-custom<opt-isl>' -polly-print-opt-isl -disable-output < %s | FileCheck %s --match-full-lines
+; RUN: opt %loadNPMPolly -polly-pragma-based-opts=1 '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s --match-full-lines
 ;
 ; Override unroll metadata with llvm.loop.unroll.disable.
 ;
diff --git a/polly/test/ScheduleOptimizer/ManualOptimization/unroll_double.ll b/polly/test/ScheduleOptimizer/ManualOptimization/unroll_double.ll
index 8992bc942646e..cccf136a1c4ac 100644
--- a/polly/test/ScheduleOptimizer/ManualOptimization/unroll_double.ll
+++ b/polly/test/ScheduleOptimizer/ManualOptimization/unroll_double.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<opt-isl>' -polly-print-opt-isl -disable-output < %s | FileCheck %s --match-full-lines
+; RUN: opt %loadNPMPolly '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s --match-full-lines
 ;
 ; Apply two loop transformations. First partial, then full unrolling.
 ;
diff --git a/polly/test/ScheduleOptimizer/ManualOptimization/unroll_full.ll b/polly/test/ScheduleOptimizer/ManualOptimization/unroll_full.ll
index 7bea96f791a80..4d499078a4364 100644
--- a/polly/test/ScheduleOptimizer/ManualOptimization/unroll_full.ll
+++ b/polly/test/ScheduleOptimizer/ManualOptimization/unroll_full.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<opt-isl>' -polly-print-opt-isl -disable-output < %s | FileCheck %s --match-full-lines
+; RUN: opt %loadNPMPolly '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s --match-full-lines
 ;
 ; Full unroll of a loop with 5 iterations.
 ;
diff --git a/polly/test/ScheduleOptimizer/ManualOptimization/unroll_heuristic.ll b/polly/test/ScheduleOptimizer/ManualOptimization/unroll_heuristic.ll
index 34a6f486e646c..d67472ab86936 100644
--- a/polly/test/ScheduleOptimizer/ManualOptimization/unroll_heuristic.ll
+++ b/polly/test/ScheduleOptimizer/ManualOptimization/unroll_heuristic.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly -polly-pragma-based-opts=1 '-passes=polly-custom<opt-isl>' -polly-print-opt-isl -disable-output < %s | FileCheck %s --match-full-lines
-; RUN: opt %loadNPMPolly -polly-pragma-based-opts=0 '-passes=polly-custom<opt-isl>' -polly-print-opt-isl -disable-output < %s | FileCheck %s --match-full-lines
+; RUN: opt %loadNPMPolly -polly-pragma-based-opts=1 '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s --match-full-lines
+; RUN: opt %loadNPMPolly -polly-pragma-based-opts=0 '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s --match-full-lines
 ;
 ; Unrolling with heuristic factor.
 ; Currently not supported and expected to be handled by LLVM's unroll pass.
diff --git a/polly/test/ScheduleOptimizer/ManualOptimization/unroll_partial.ll b/polly/test/ScheduleOptimizer/ManualOptimization/unroll_partial.ll
index ce2281372a20d..90101b4fde390 100644
--- a/polly/test/ScheduleOptimizer/ManualOptimization/unroll_partial.ll
+++ b/polly/test/ScheduleOptimizer/ManualOptimization/unroll_partial.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly -polly-pragma-based-opts=1 '-passes=polly-custom<opt-isl>' -polly-print-opt-isl -disable-output < %s | FileCheck %s --match-full-lines
-; RUN: opt %loadNPMPolly -polly-pragma-based-opts=0 '-passes=polly-custom<opt-isl>' -polly-print-opt-isl -disable-output < %s | FileCheck %s --check-prefix=OFF --match-full-lines
+; RUN: opt %loadNPMPolly -polly-pragma-based-opts=1 '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s --match-full-lines
+; RUN: opt %loadNPMPolly -polly-pragma-based-opts=0 '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s --check-prefix=OFF --match-full-lines
 ;
 ; Partial unroll by a factor of 4.
 ;
diff --git a/polly/test/ScheduleOptimizer/ManualOptimization/unroll_partial_followup.ll b/polly/test/ScheduleOptimizer/ManualOptimization/unroll_partial_followup.ll
index f6810ba6c48fb..4cfa3fb911515 100644
--- a/polly/test/ScheduleOptimizer/ManualOptimization/unroll_partial_followup.ll
+++ b/polly/test/ScheduleOptimizer/ManualOptimization/unroll_partial_followup.ll
@@ -1,6 +1,6 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<opt-isl>' -polly-print-opt-isl -disable-output < %s | FileCheck %s --check-prefix=OPT --match-full-lines
-; RUN: opt %loadNPMPolly '-passes=polly-custom<opt-isl;ast>' -polly-print-ast -disable-output < %s | FileCheck %s --check-prefix=AST --match-full-lines
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts;opt-isl>' -S < %s | FileCheck %s --check-prefix=CODEGEN
+; RUN: opt %loadNPMPolly '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s --check-prefix=OPT --match-full-lines
+; RUN: opt %loadNPMPolly '-passes=polly-opt-isl,print<polly-ast>' -disable-output < %s | FileCheck %s --check-prefix=AST --match-full-lines
+; RUN: opt %loadNPMPolly '-passes=scop(polly-opt-isl,polly-codegen),simplifycfg' -S < %s | FileCheck %s --check-prefix=CODEGEN
 ;
 ; Partial unroll by a factor of 4.
 ;
@@ -54,6 +54,6 @@ return:
 
 ; AST-NEXT: for (int c0 = 0; c0 < n; c0 += 4) {
 
-; CODEGEN: br i1 %polly.loop_cond, label %polly.loop_header, label %polly.loop_exit, !llvm.loop ![[LOOPID:[0-9]+]]
+; CODEGEN: br i1 %polly.loop_cond, label %polly.loop_header, label %polly.exiting, !llvm.loop ![[LOOPID:[0-9]+]]
 ; CODEGEN: ![[LOOPID]] = distinct !{![[LOOPID]], ![[LOOPNAME:[0-9]+]]}
 ; CODEGEN: ![[LOOPNAME]] = !{!"llvm.loop.id", !"This-is-the-unrolled-loop"}
diff --git a/polly/test/ScheduleOptimizer/SIMDInParallelFor.ll b/polly/test/ScheduleOptimizer/SIMDInParallelFor.ll
index b03d475dd42ee..3f6f50e34775d 100644
--- a/polly/test/ScheduleOptimizer/SIMDInParallelFor.ll
+++ b/polly/test/ScheduleOptimizer/SIMDInParallelFor.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-parallel -polly-vectorizer=stripmine -passes=polly-codegen-verify '-passes=polly-custom<opt-isl;ast;codegen>' -polly-print-ast -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-parallel -polly-vectorizer=stripmine -passes=polly-codegen-verify '-passes=polly-opt-isl,print<polly-ast>,polly-codegen' -disable-output < %s | FileCheck %s
 ;
 ; Check that there are no nested #pragma omp parallel for inside a
 ; #pragma omp parallel for loop.
diff --git a/polly/test/ScheduleOptimizer/computeout.ll b/polly/test/ScheduleOptimizer/computeout.ll
index 6f34f4efc0a6d..a3286b481ffb3 100644
--- a/polly/test/ScheduleOptimizer/computeout.ll
+++ b/polly/test/ScheduleOptimizer/computeout.ll
@@ -1,5 +1,5 @@
-; RUN: opt -S %loadNPMPolly '-passes=polly-custom<opt-isl;ast>' -polly-print-ast -polly-isl-arg=--no-schedule-serialize-sccs -disable-output < %s | FileCheck %s
-; RUN: opt -S %loadNPMPolly '-passes=polly-custom<opt-isl;ast>' -polly-print-ast -polly-isl-arg=--no-schedule-serialize-sccs -polly-dependences-computeout=1 -disable-output < %s | FileCheck %s -check-prefix=TIMEOUT
+; RUN: opt -S %loadNPMPolly "-passes=scop(polly-opt-isl,print<polly-ast>)" -polly-isl-arg=--no-schedule-serialize-sccs -disable-output < %s | FileCheck %s
+; RUN: opt -S %loadNPMPolly "-passes=scop(polly-opt-isl,print<polly-ast>)" -polly-isl-arg=--no-schedule-serialize-sccs -polly-dependences-computeout=1 -disable-output < %s | FileCheck %s -check-prefix=TIMEOUT
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
 
 ; for(i = 0; i < 100; i++ )
diff --git a/polly/test/ScheduleOptimizer/ensure-correct-tile-sizes.ll b/polly/test/ScheduleOptimizer/ensure-correct-tile-sizes.ll
index 4be0b948d09a0..928ee858ae6d2 100644
--- a/polly/test/ScheduleOptimizer/ensure-correct-tile-sizes.ll
+++ b/polly/test/ScheduleOptimizer/ensure-correct-tile-sizes.ll
@@ -1,4 +1,9 @@
-; RUN: opt %loadNPMPolly -polly-process-unprofitable -polly-remarks-minimal '-passes=polly-custom<opt-isl;ast>' -polly-print-ast -polly-pattern-matching-based-opts=true -polly-target-throughput-vector-fma=1 -polly-target-latency-vector-fma=1 -polly-target-vector-register-bitwidth=4096 -polly-target-1st-cache-level-associativity=3 -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-process-unprofitable -polly-remarks-minimal \
+; RUN: '-passes=polly-opt-isl,print<polly-ast>' -polly-pattern-matching-based-opts=true \
+; RUN: -polly-target-throughput-vector-fma=1 \
+; RUN: -polly-target-latency-vector-fma=1 \
+; RUN: -polly-target-vector-register-bitwidth=4096 \
+; RUN: -polly-target-1st-cache-level-associativity=3 -disable-output < %s | FileCheck %s
 ;
 ; /* Test that Polly does not crash due to configurations that can lead to
 ; incorrect tile size computations.
diff --git a/polly/test/ScheduleOptimizer/focaltech_test_detail_threshold-7bc17e.ll b/polly/test/ScheduleOptimizer/focaltech_test_detail_threshold-7bc17e.ll
index 548a8aa94afbf..b533cb870bdcb 100644
--- a/polly/test/ScheduleOptimizer/focaltech_test_detail_threshold-7bc17e.ll
+++ b/polly/test/ScheduleOptimizer/focaltech_test_detail_threshold-7bc17e.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<opt-isl>' -polly-print-opt-isl -polly-vectorizer=stripmine -polly-invariant-load-hoisting -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly "-passes=scop(print<polly-opt-isl>)" -polly-vectorizer=stripmine -polly-invariant-load-hoisting -disable-output < %s | FileCheck %s
 ;
 ; llvm.org/PR46578
 ;
diff --git a/polly/test/ScheduleOptimizer/full_partial_tile_separation.ll b/polly/test/ScheduleOptimizer/full_partial_tile_separation.ll
index 6de5e3a606aa3..3dd579ed736f7 100644
--- a/polly/test/ScheduleOptimizer/full_partial_tile_separation.ll
+++ b/polly/test/ScheduleOptimizer/full_partial_tile_separation.ll
@@ -1,4 +1,4 @@
-; RUN: opt -S %loadNPMPolly -polly-pattern-matching-based-opts=false -polly-vectorizer=stripmine '-passes=polly-custom<opt-isl;ast>' -polly-print-ast -disable-output < %s | FileCheck %s
+; RUN: opt -S %loadNPMPolly -polly-pattern-matching-based-opts=false -polly-vectorizer=stripmine '-passes=polly-opt-isl,print<polly-ast>' -disable-output < %s | FileCheck %s
 ; CHECK: // 1st level tiling - Tiles
 ; CHECK-NEXT: #pragma known-parallel
 ; CHECK-NEXT: for (int c0 = 0; c0 <= floord(ni - 1, 32); c0 += 1)
diff --git a/polly/test/ScheduleOptimizer/line-tiling-2.ll b/polly/test/ScheduleOptimizer/line-tiling-2.ll
index 6256adfcd6917..3a2c566d19d3d 100644
--- a/polly/test/ScheduleOptimizer/line-tiling-2.ll
+++ b/polly/test/ScheduleOptimizer/line-tiling-2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-tile-sizes=1,64 '-passes=polly-custom<opt-isl;ast>' -polly-print-ast -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-tile-sizes=1,64 '-passes=polly-opt-isl,print<polly-ast>' -disable-output < %s | FileCheck %s
 
 ; CHECK: for (int c0 = 0; c0 <= 1023; c0 += 1)
 ; CHECK: for (int c1 = 0; c1 <= 7; c1 += 1)
diff --git a/polly/test/ScheduleOptimizer/line-tiling.ll b/polly/test/ScheduleOptimizer/line-tiling.ll
index 51e02594aa880..0dbdeff4742b9 100644
--- a/polly/test/ScheduleOptimizer/line-tiling.ll
+++ b/polly/test/ScheduleOptimizer/line-tiling.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-tile-sizes=64,1 '-passes=polly-custom<opt-isl;ast>' -polly-print-ast -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-tile-sizes=64,1 '-passes=polly-opt-isl,print<polly-ast>' -disable-output < %s | FileCheck %s
 
 ; CHECK: for (int c0 = 0; c0 <= 15; c0 += 1)
 ; CHECK: for (int c1 = 0; c1 <= 511; c1 += 1)
diff --git a/polly/test/ScheduleOptimizer/mat_mul_pattern_data_layout.ll b/polly/test/ScheduleOptimizer/mat_mul_pattern_data_layout.ll
index 79deedc7cd830..8f270b94617fe 100644
--- a/polly/test/ScheduleOptimizer/mat_mul_pattern_data_layout.ll
+++ b/polly/test/ScheduleOptimizer/mat_mul_pattern_data_layout.ll
@@ -1,4 +1,13 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<opt-isl>' -polly-pattern-matching-based-opts=true -polly-target-throughput-vector-fma=1 -polly-target-latency-vector-fma=8 -polly-target-1st-cache-level-associativity=8 -polly-target-2nd-cache-level-associativity=8 -polly-target-1st-cache-level-size=32768 -polly-target-2nd-cache-level-size=262144 -polly-optimized-scops -polly-target-vector-register-bitwidth=256 -disable-output < %s
+; RUN: opt %loadNPMPolly -passes=polly-opt-isl -polly-pattern-matching-based-opts=true \
+; RUN: -polly-target-throughput-vector-fma=1 \
+; RUN: -polly-target-latency-vector-fma=8 \
+; RUN: -polly-target-1st-cache-level-associativity=8 \
+; RUN: -polly-target-2nd-cache-level-associativity=8 \
+; RUN: -polly-target-1st-cache-level-size=32768 \
+; RUN: -polly-target-2nd-cache-level-size=262144 \
+; RUN: -polly-optimized-scops \
+; RUN: -polly-target-vector-register-bitwidth=256 \
+; RUN: -disable-output < %s
 ;
 ; /* C := alpha*A*B + beta*C */
 ; for (i = 0; i < _PB_NI; i++)
diff --git a/polly/test/ScheduleOptimizer/mat_mul_pattern_data_layout_2.ll b/polly/test/ScheduleOptimizer/mat_mul_pattern_data_layout_2.ll
index e3ae1a02bd347..de1c815f92350 100644
--- a/polly/test/ScheduleOptimizer/mat_mul_pattern_data_layout_2.ll
+++ b/polly/test/ScheduleOptimizer/mat_mul_pattern_data_layout_2.ll
@@ -1,4 +1,12 @@
-; RUN: opt %loadNPMPolly -polly-pattern-matching-based-opts=true -polly-target-throughput-vector-fma=1 -polly-target-latency-vector-fma=8 -polly-target-1st-cache-level-associativity=8 -polly-target-2nd-cache-level-associativity=8 -polly-target-1st-cache-level-size=32768 -polly-target-2nd-cache-level-size=262144 -polly-target-vector-register-bitwidth=256 '-passes=polly-custom<opt-isl;ast>' -polly-print-ast -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-pattern-matching-based-opts=true \
+; RUN: -polly-target-throughput-vector-fma=1 \
+; RUN: -polly-target-latency-vector-fma=8 \
+; RUN: -polly-target-1st-cache-level-associativity=8 \
+; RUN: -polly-target-2nd-cache-level-associativity=8 \
+; RUN: -polly-target-1st-cache-level-size=32768 \
+; RUN: -polly-target-2nd-cache-level-size=262144 \
+; RUN: -polly-target-vector-register-bitwidth=256 \
+; RUN: '-passes=polly-opt-isl,print<polly-ast>' -disable-output < %s | FileCheck %s
 ;
 ; /* C := alpha*A*B + beta*C */
 ; /* _PB_NK % Kc != 0 */
diff --git a/polly/test/ScheduleOptimizer/one-dimensional-band.ll b/polly/test/ScheduleOptimizer/one-dimensional-band.ll
index f37f1e5119a9f..a097d4a43cfd2 100644
--- a/polly/test/ScheduleOptimizer/one-dimensional-band.ll
+++ b/polly/test/ScheduleOptimizer/one-dimensional-band.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<opt-isl;ast>' -polly-print-ast -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-opt-isl,print<polly-ast>' -disable-output < %s | FileCheck %s
 ;
 ; void jacobi1d(long T, long N, float *A, float *B) {
 ; long t, i, j;
diff --git a/polly/test/ScheduleOptimizer/outer_coincidence.ll b/polly/test/ScheduleOptimizer/outer_coincidence.ll
index e0a7a63cda80d..7c1af80c9ffae 100644
--- a/polly/test/ScheduleOptimizer/outer_coincidence.ll
+++ b/polly/test/ScheduleOptimizer/outer_coincidence.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly -polly-tiling=0 -polly-parallel -polly-opt-outer-coincidence=no '-passes=polly-custom<opt-isl;ast>' -polly-print-ast -disable-output < %s | FileCheck %s
-; RUN: opt %loadNPMPolly -polly-tiling=0 -polly-parallel -polly-opt-outer-coincidence=yes '-passes=polly-custom<opt-isl;ast>' -polly-print-ast -disable-output < %s | FileCheck %s --check-prefix=OUTER
+; RUN: opt %loadNPMPolly -polly-tiling=0 -polly-parallel -polly-opt-outer-coincidence=no '-passes=polly-opt-isl,print<polly-ast>' -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-tiling=0 -polly-parallel -polly-opt-outer-coincidence=yes '-passes=polly-opt-isl,print<polly-ast>' -disable-output < %s | FileCheck %s --check-prefix=OUTER
 
 ; By skewing, the diagonal can be made parallel. ISL does this when
 ; the 'outer_coincidence' option is enabled.
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts-after-delicm.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts-after-delicm.ll
index 84f1ca0dba652..a19b93d9915dd 100644
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts-after-delicm.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts-after-delicm.ll
@@ -1,4 +1,8 @@
-; RUN: opt %loadNPMPolly -polly-pattern-matching-based-opts=true '-passes=polly-custom<optree;delicm;simplify;opt-isl>' -polly-tc-opt=true -debug -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly \
+; RUN: -polly-pattern-matching-based-opts=true \
+; RUN: '-passes=polly-optree,polly-delicm,polly-simplify,polly-opt-isl' \
+; RUN: -polly-tc-opt=true -debug -disable-output < %s 2>&1 \
+; RUN: | FileCheck %s
 ; REQUIRES: asserts
 
 ; Check that the pattern matching detects the matrix multiplication pattern
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts-after-delicm_2.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts-after-delicm_2.ll
index 72fb4f1b4e41c..4ef0605a0ba75 100644
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts-after-delicm_2.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts-after-delicm_2.ll
@@ -1,4 +1,6 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<delicm;simplify-1;opt-isl>' -polly-pattern-matching-based-opts=true -polly-tc-opt=true -debug -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-delicm,polly-simplify,polly-opt-isl' \
+; RUN: -polly-pattern-matching-based-opts=true \
+; RUN: -polly-tc-opt=true -debug -disable-output < %s 2>&1 | FileCheck %s
 ; REQUIRES: asserts
 ;
 ; Check that the pattern matching detects the tensor contraction pattern
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts.ll
index 933b2d4d258e7..09118e252233b 100644
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts.ll
@@ -1,7 +1,8 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<opt-isl>' -polly-pattern-matching-based-opts=false -debug -polly-tc-opt -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly '-passes=polly-custom<opt-isl>' -polly-pattern-matching-based-opts=true -debug -polly-tc-opt -disable-output < %s 2>&1 | FileCheck %s --check-prefix=PATTERN-MATCHING-OPTS
-; RUN: opt %loadNPMPolly '-passes=polly-custom<opt-isl;ast>' -polly-print-ast -polly-pattern-matching-based-opts=true -polly-ast-detect-parallel -disable-output < %s | FileCheck %s --check-prefix=PARALLEL-AST
-; RUN: opt %loadNPMPolly '-passes=polly-custom<opt-isl>' -polly-pattern-matching-based-opts=true -stats -disable-output < %s 2>&1 | FileCheck %s --check-prefix=STATS -match-full-lines
+; RUN: opt %loadNPMPolly -passes=polly-opt-isl -polly-pattern-matching-based-opts=false \
+; RUN: -debug -polly-tc-opt -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-opt-isl -polly-pattern-matching-based-opts=true -debug -polly-tc-opt -disable-output < %s 2>&1 | FileCheck %s --check-prefix=PATTERN-MATCHING-OPTS
+; RUN: opt %loadNPMPolly '-passes=polly-opt-isl,print<polly-ast>' -polly-pattern-matching-based-opts=true -polly-ast-detect-parallel -disable-output < %s | FileCheck %s --check-prefix=PARALLEL-AST
+; RUN: opt %loadNPMPolly -passes=polly-opt-isl -polly-pattern-matching-based-opts=true -stats -disable-output < %s 2>&1 | FileCheck %s --check-prefix=STATS -match-full-lines
 ; REQUIRES: asserts
 ;
 ; /* C := alpha*A*B + beta*C */
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_11.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_11.ll
index 03e23038877e5..b771d1f87537e 100644
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_11.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_11.ll
@@ -1,4 +1,16 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;opt-isl>' -polly-import-jscop-postfix=transformed -polly-pattern-matching-based-opts=true -polly-target-throughput-vector-fma=1 -polly-target-latency-vector-fma=8 -polly-target-1st-cache-level-associativity=8 -polly-target-2nd-cache-level-associativity=8 -polly-target-1st-cache-level-size=32768 -polly-target-vector-register-bitwidth=256 -polly-target-2nd-cache-level-size=262144 -debug -polly-tc-opt=true -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,polly-opt-isl' \
+; RUN: -polly-import-jscop-postfix=transformed \
+; RUN: -polly-pattern-matching-based-opts=true \
+; RUN: -polly-target-throughput-vector-fma=1 \
+; RUN: -polly-target-latency-vector-fma=8 \
+; RUN: -polly-target-1st-cache-level-associativity=8 \
+; RUN: -polly-target-2nd-cache-level-associativity=8 \
+; RUN: -polly-target-1st-cache-level-size=32768 \
+; RUN: -polly-target-vector-register-bitwidth=256 \
+; RUN: -polly-target-2nd-cache-level-size=262144 \
+; RUN: -debug \
+; RUN: -polly-tc-opt=true -disable-output < %s 2>&1 \
+; RUN: | FileCheck %s
 ; REQUIRES: asserts
 ;
 ; Check that the pattern matching detects the matrix multiplication pattern
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_12.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_12.ll
index 4e174e3c9723d..238f6dd798e68 100644
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_12.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_12.ll
@@ -1,4 +1,12 @@
-; RUN: opt %loadNPMPolly -polly-pattern-matching-based-opts=true -polly-target-throughput-vector-fma=1 -polly-target-latency-vector-fma=8 -polly-target-1st-cache-level-associativity=8 -polly-target-2nd-cache-level-associativity=8 -polly-target-1st-cache-level-size=32768 -polly-target-vector-register-bitwidth=256 -polly-target-2nd-cache-level-size=262144 '-passes=polly-custom<opt-isl>' -disable-output < %s
+; RUN: opt %loadNPMPolly -polly-pattern-matching-based-opts=true \
+; RUN: -polly-target-throughput-vector-fma=1 \
+; RUN: -polly-target-latency-vector-fma=8 \
+; RUN: -polly-target-1st-cache-level-associativity=8 \
+; RUN: -polly-target-2nd-cache-level-associativity=8 \
+; RUN: -polly-target-1st-cache-level-size=32768 \
+; RUN: -polly-target-vector-register-bitwidth=256 \
+; RUN: -polly-target-2nd-cache-level-size=262144 \
+; RUN: -passes=polly-opt-isl -disable-output < %s
 ;
 ; Test whether isolation works as expected.
 ;
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_13.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_13.ll
index c3d8b6ed3fee5..0e4540eb7ba3c 100644
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_13.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_13.ll
@@ -1,4 +1,12 @@
-; RUN: opt %loadNPMPolly -polly-pattern-matching-based-opts=true -polly-target-throughput-vector-fma=2 -polly-target-latency-vector-fma=8 -polly-target-1st-cache-level-associativity=8 -polly-target-2nd-cache-level-associativity=8 -polly-target-1st-cache-level-size=32768 -polly-target-vector-register-bitwidth=128 -polly-target-2nd-cache-level-size=262144 '-passes=polly-custom<opt-isl;ast>' -polly-print-ast -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-pattern-matching-based-opts=true \
+; RUN: -polly-target-throughput-vector-fma=2 \
+; RUN: -polly-target-latency-vector-fma=8 \
+; RUN: -polly-target-1st-cache-level-associativity=8 \
+; RUN: -polly-target-2nd-cache-level-associativity=8 \
+; RUN: -polly-target-1st-cache-level-size=32768 \
+; RUN: -polly-target-vector-register-bitwidth=128 \
+; RUN: -polly-target-2nd-cache-level-size=262144 \
+; RUN: '-passes=polly-opt-isl,print<polly-ast>' -disable-output < %s | FileCheck %s
 ;
 ; Test whether isolation works as expected.
 ;
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_14.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_14.ll
index 3705c3fd27ed9..9678ad83ff048 100644
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_14.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_14.ll
@@ -1,4 +1,13 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;opt-isl;ast;codegen>' -polly-target-throughput-vector-fma=1 -polly-target-latency-vector-fma=8 -polly-target-1st-cache-level-associativity=8 -polly-target-2nd-cache-level-associativity=8 -polly-target-1st-cache-level-size=32768 -polly-target-vector-register-bitwidth=256 -polly-target-2nd-cache-level-size=262144 -polly-import-jscop-postfix=transformed -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,polly-opt-isl,polly-codegen' \
+; RUN: -polly-target-throughput-vector-fma=1 \
+; RUN: -polly-target-latency-vector-fma=8 \
+; RUN: -polly-target-1st-cache-level-associativity=8 \
+; RUN: -polly-target-2nd-cache-level-associativity=8 \
+; RUN: -polly-target-1st-cache-level-size=32768 \
+; RUN: -polly-target-vector-register-bitwidth=256 \
+; RUN: -polly-target-2nd-cache-level-size=262144 \
+; RUN: -polly-import-jscop-postfix=transformed -S < %s \
+; RUN: | FileCheck %s
 ;
 ; Check that we disable the Loop Vectorizer.
 ;
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_15.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_15.ll
index 7ada105828b27..e74884d59c311 100644
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_15.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_15.ll
@@ -1,4 +1,6 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<opt-isl>' -polly-pattern-matching-based-opts=true -debug-only=polly-opt-isl -disable-output -polly-tc-opt=true < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-opt-isl -polly-pattern-matching-based-opts=true \
+; RUN: -debug-only=polly-opt-isl -disable-output \
+; RUN: -polly-tc-opt=true < %s 2>&1 | FileCheck %s
 ; REQUIRES: asserts
 ;
 ; for (i = 0; i < _PB_NI; i++)
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_16.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_16.ll
index 6647380b2d070..9c99a090b69e7 100644
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_16.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_16.ll
@@ -1,4 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<opt-isl>' -polly-pattern-matching-based-opts=true -polly-tc-opt=true -debug -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-opt-isl -polly-pattern-matching-based-opts=true \
+; RUN: -polly-tc-opt=true -debug -disable-output < %s 2>&1 | FileCheck %s
 ; REQUIRES: asserts
 ;
 ; for (i = 0; i < 1024; i++)
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_17.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_17.ll
index fba77d5e4f82d..8e14035ce8629 100644
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_17.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_17.ll
@@ -1,4 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<opt-isl>' -polly-pattern-matching-based-opts=true -polly-tc-opt=true -debug -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-opt-isl -polly-pattern-matching-based-opts=true \
+; RUN: -polly-tc-opt=true -debug -disable-output < %s 2>&1 | FileCheck %s
 ; REQUIRES: asserts
 ;
 ; for (i = 0; i < 32; i++)
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_18.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_18.ll
index 488436064ae83..4f562c306f96a 100644
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_18.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_18.ll
@@ -1,4 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<opt-isl>' -polly-pattern-matching-based-opts=true -polly-tc-opt=true -debug -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-opt-isl -polly-pattern-matching-based-opts=true \
+; RUN: -polly-tc-opt=true -debug -disable-output < %s 2>&1 | FileCheck %s
 ; REQUIRES: asserts
 ;
 ; for (i = 0; i < 32; i++)
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_19.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_19.ll
index c7a5d475bef31..32ded897d4ff9 100644
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_19.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_19.ll
@@ -1,4 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<opt-isl>' -polly-pattern-matching-based-opts=true -polly-tc-opt=true -debug -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-opt-isl -polly-pattern-matching-based-opts=true \
+; RUN: -polly-tc-opt=true -debug -disable-output < %s 2>&1 | FileCheck %s
 ; REQUIRES: asserts
 ;
 ; for (i = 0; i < 8; i++)
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_2.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_2.ll
index 1dba8bece8072..f0c0177da84b0 100644
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_2.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_2.ll
@@ -1,4 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<opt-isl>' -polly-pattern-matching-based-opts=true -polly-tc-opt=true -debug -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-opt-isl -polly-pattern-matching-based-opts=true \
+; RUN: -polly-tc-opt=true -debug -disable-output < %s 2>&1 | FileCheck %s
 ; REQUIRES: asserts
 ;
 ; /* C := alpha*A*B + beta*C */
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_20.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_20.ll
index 3656a9457cef2..155177bdfade0 100644
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_20.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_20.ll
@@ -1,4 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<opt-isl>' -polly-pattern-matching-based-opts=true -polly-tc-opt=true -debug -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-opt-isl -polly-pattern-matching-based-opts=true \
+; RUN: -polly-tc-opt=true -debug -disable-output < %s 2>&1 | FileCheck %s
 ; REQUIRES: asserts
 ;
 ; for (i = 0; i < 16; i++)
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_21.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_21.ll
index bd0cb054957af..3d21ac3859a7e 100644
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_21.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_21.ll
@@ -1,4 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<opt-isl>' -polly-pattern-matching-based-opts=true -polly-tc-opt=true -debug -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-opt-isl -polly-pattern-matching-based-opts=true \
+; RUN: -polly-tc-opt=true -debug -disable-output < %s 2>&1 | FileCheck %s
 ; REQUIRES: asserts
 ;
 ; for (int i = 0; i < 32; i++)
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_22.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_22.ll
index 6e6788be2973f..00a4bf885aef8 100644
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_22.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_22.ll
@@ -1,4 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<opt-isl>' -polly-pattern-matching-based-opts=true -polly-tc-opt=true -debug -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-opt-isl -polly-pattern-matching-based-opts=true \
+; RUN: -polly-tc-opt=true -debug -disable-output < %s 2>&1 | FileCheck %s
 ; REQUIRES: asserts
 ;
 ; for (int i = 0; i < 32; i++)
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_24.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_24.ll
index 82356ae0a398d..bfe5c5249a3a8 100644
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_24.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_24.ll
@@ -1,4 +1,6 @@
-; RUN: opt %loadNPMPolly -polly-reschedule=0 '-passes=polly-custom<opt-isl>' -polly-pattern-matching-based-opts=true -polly-tc-opt=true -debug -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-reschedule=0 -passes=polly-opt-isl \
+; RUN: -polly-pattern-matching-based-opts=true -polly-tc-opt=true \
+; RUN: -debug -disable-output < %s 2>&1 | FileCheck %s
 ; REQUIRES: asserts
 ;
 ; for (i = 0; i < 1024; i++)
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_25.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_25.ll
index ea28bb8c0bdb6..a2e1ced3e6320 100644
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_25.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_25.ll
@@ -1,4 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<opt-isl>' -polly-pattern-matching-based-opts=true -polly-tc-opt=true -debug -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-opt-isl -polly-pattern-matching-based-opts=true \
+; RUN: -polly-tc-opt=true -debug -disable-output < %s 2>&1 | FileCheck %s
 ; REQUIRES: asserts
 ;
 ; for (int i = 0; i < 32; i++)
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_3.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_3.ll
index f80d63cd4d66c..9844d377e609d 100644
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_3.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_3.ll
@@ -1,6 +1,19 @@
-; RUN: opt %loadNPMPolly -polly-pattern-matching-based-opts=true -polly-target-throughput-vector-fma=1 -polly-target-latency-vector-fma=8 -polly-target-1st-cache-level-size=0 -polly-target-vector-register-bitwidth=256 '-passes=polly-custom<opt-isl;ast>' -polly-print-ast -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-pattern-matching-based-opts=true \
+; RUN: -polly-target-throughput-vector-fma=1 \
+; RUN: -polly-target-latency-vector-fma=8 \
+; RUN: -polly-target-1st-cache-level-size=0 \
+; RUN: -polly-target-vector-register-bitwidth=256 \
+; RUN: '-passes=polly-opt-isl,print<polly-ast>' -disable-output < %s 2>&1 | FileCheck %s
 
-; RUN: opt %loadNPMPolly -polly-pattern-matching-based-opts=true -polly-target-throughput-vector-fma=1 -polly-target-latency-vector-fma=8 -polly-target-1st-cache-level-associativity=8 -polly-target-2nd-cache-level-associativity=8 -polly-target-1st-cache-level-size=32768 -polly-target-vector-register-bitwidth=256 -polly-target-2nd-cache-level-size=262144 '-passes=polly-custom<opt-isl;ast>' -polly-print-ast -disable-output < %s 2>&1 | FileCheck %s --check-prefix=EXTRACTION-OF-MACRO-KERNEL
+; RUN: opt %loadNPMPolly -polly-pattern-matching-based-opts=true \
+; RUN: -polly-target-throughput-vector-fma=1 \
+; RUN: -polly-target-latency-vector-fma=8 \
+; RUN: -polly-target-1st-cache-level-associativity=8 \
+; RUN: -polly-target-2nd-cache-level-associativity=8 \
+; RUN: -polly-target-1st-cache-level-size=32768 \
+; RUN: -polly-target-vector-register-bitwidth=256 \
+; RUN: -polly-target-2nd-cache-level-size=262144 \
+; RUN: '-passes=polly-opt-isl,print<polly-ast>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=EXTRACTION-OF-MACRO-KERNEL
 ;
 ; /* C := alpha*A*B + beta*C */
 ; for (i = 0; i < _PB_NI; i++)
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_4.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_4.ll
index 100b17e2ccd21..250641d57bac5 100644
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_4.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_4.ll
@@ -1,5 +1,13 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<opt-isl>' -polly-pattern-matching-based-opts=true -debug -polly-tc-opt=true -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly '-passes=polly-custom<opt-isl;ast>' -polly-print-ast -polly-pattern-matching-based-opts=true -polly-target-throughput-vector-fma=1 -polly-target-latency-vector-fma=8 -polly-target-1st-cache-level-size=32768 -polly-target-vector-register-bitwidth=256 -polly-target-2nd-cache-level-size=262144 -polly-tc-opt=true -disable-output < %s | FileCheck %s --check-prefix=PATTERN-MATCHING-OPTS
+; RUN: opt %loadNPMPolly -passes=polly-opt-isl -polly-pattern-matching-based-opts=true \
+; RUN: -debug -polly-tc-opt=true -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-opt-isl,print<polly-ast>' -polly-pattern-matching-based-opts=true \
+; RUN: -polly-target-throughput-vector-fma=1 \
+; RUN: -polly-target-latency-vector-fma=8 \
+; RUN: -polly-target-1st-cache-level-size=32768 \
+; RUN: -polly-target-vector-register-bitwidth=256 \
+; RUN: -polly-target-2nd-cache-level-size=262144 \
+; RUN: -polly-tc-opt=true -disable-output < %s | \
+; RUN: FileCheck %s --check-prefix=PATTERN-MATCHING-OPTS
 ; REQUIRES: asserts
 ;
 ; C := A * B + C
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_5.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_5.ll
index 050af1b2377d3..ad2c195ba1e8e 100644
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_5.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_5.ll
@@ -1,4 +1,12 @@
-; RUN: opt %loadNPMPolly -polly-pattern-matching-based-opts=true -polly-target-throughput-vector-fma=1 -polly-target-latency-vector-fma=8 -polly-target-1st-cache-level-associativity=8 -polly-target-2nd-cache-level-associativity=8 -polly-target-1st-cache-level-size=32768 -polly-target-vector-register-bitwidth=256 -polly-target-2nd-cache-level-size=262144 '-passes=polly-custom<opt-isl;ast>' -polly-print-ast -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-pattern-matching-based-opts=true \
+; RUN: -polly-target-throughput-vector-fma=1 \
+; RUN: -polly-target-latency-vector-fma=8 \
+; RUN: -polly-target-1st-cache-level-associativity=8 \
+; RUN: -polly-target-2nd-cache-level-associativity=8 \
+; RUN: -polly-target-1st-cache-level-size=32768 \
+; RUN: -polly-target-vector-register-bitwidth=256 \
+; RUN: -polly-target-2nd-cache-level-size=262144 \
+; RUN: '-passes=polly-opt-isl,print<polly-ast>' -disable-output < %s | FileCheck %s
 ;
 ; opt %loadNPMPolly -passes=polly-opt-isl -polly-pattern-matching-based-opts=true \
 ; -polly-target-throughput-vector-fma=1 \
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_6.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_6.ll
index ba1ddfef6a4e4..1d3cdbdbfdd85 100644
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_6.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_6.ll
@@ -1,4 +1,12 @@
-; RUN: opt %loadNPMPolly -polly-pattern-matching-based-opts=true -polly-target-throughput-vector-fma=1 -polly-target-latency-vector-fma=8 -polly-target-1st-cache-level-associativity=8 -polly-target-2nd-cache-level-associativity=8 -polly-target-1st-cache-level-size=32768 -polly-target-vector-register-bitwidth=256 -polly-target-2nd-cache-level-size=262144 '-passes=polly-custom<opt-isl;ast>' -polly-print-ast -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-pattern-matching-based-opts=true \
+; RUN: -polly-target-throughput-vector-fma=1 \
+; RUN: -polly-target-latency-vector-fma=8 \
+; RUN: -polly-target-1st-cache-level-associativity=8 \
+; RUN: -polly-target-2nd-cache-level-associativity=8 \
+; RUN: -polly-target-1st-cache-level-size=32768 \
+; RUN: -polly-target-vector-register-bitwidth=256 \
+; RUN: -polly-target-2nd-cache-level-size=262144 \
+; RUN: '-passes=polly-opt-isl,print<polly-ast>' -disable-output < %s | FileCheck %s
 ;
 ; opt %loadNPMPolly -passes=polly-opt-isl -polly-pattern-matching-based-opts=true \
 ; -polly-target-throughput-vector-fma=1 \
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_7.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_7.ll
index e50b3a0a3f2b0..59eaa4a0928e9 100644
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_7.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_7.ll
@@ -1,4 +1,12 @@
-; RUN: opt %loadNPMPolly -polly-pattern-matching-based-opts=true -polly-target-throughput-vector-fma=1 -polly-target-latency-vector-fma=8 -polly-target-1st-cache-level-associativity=8 -polly-target-2nd-cache-level-associativity=8 -polly-target-1st-cache-level-size=32768 -polly-target-vector-register-bitwidth=256 -polly-target-2nd-cache-level-size=262144 '-passes=polly-custom<opt-isl;ast>' -polly-print-ast -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-pattern-matching-based-opts=true \
+; RUN: -polly-target-throughput-vector-fma=1 \
+; RUN: -polly-target-latency-vector-fma=8 \
+; RUN: -polly-target-1st-cache-level-associativity=8 \
+; RUN: -polly-target-2nd-cache-level-associativity=8 \
+; RUN: -polly-target-1st-cache-level-size=32768 \
+; RUN: -polly-target-vector-register-bitwidth=256 \
+; RUN: -polly-target-2nd-cache-level-size=262144 \
+; RUN: '-passes=polly-opt-isl,print<polly-ast>' -disable-output < %s | FileCheck %s
 ;
 ; /* C := A * B + C */
 ; /* Elements of the matrices A, B, C have the float type. */
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_8.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_8.ll
index 3f57fe8cf6c73..2544d502a2dc5 100644
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_8.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_8.ll
@@ -1,4 +1,12 @@
-; RUN: opt %loadNPMPolly -polly-pattern-matching-based-opts=true -polly-target-throughput-vector-fma=1 -polly-target-latency-vector-fma=8 -polly-target-1st-cache-level-associativity=8 -polly-target-2nd-cache-level-associativity=8 -polly-target-1st-cache-level-size=32768 -polly-target-vector-register-bitwidth=256 -polly-target-2nd-cache-level-size=262144 '-passes=polly-custom<opt-isl;ast>' -polly-print-ast -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-pattern-matching-based-opts=true \
+; RUN: -polly-target-throughput-vector-fma=1 \
+; RUN: -polly-target-latency-vector-fma=8 \
+; RUN: -polly-target-1st-cache-level-associativity=8 \
+; RUN: -polly-target-2nd-cache-level-associativity=8 \
+; RUN: -polly-target-1st-cache-level-size=32768 \
+; RUN: -polly-target-vector-register-bitwidth=256 \
+; RUN: -polly-target-2nd-cache-level-size=262144 \
+; RUN: '-passes=polly-opt-isl,print<polly-ast>' -disable-output < %s | FileCheck %s
 ;
 ; /* C := A * B + C */
 ; /* Elements of the matrices B, C have the double type. */
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_9.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_9.ll
index b87ed4fb1ec3c..85c143562f5af 100644
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_9.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_9.ll
@@ -1,6 +1,14 @@
-; RUN: opt %loadNPMPolly -polly-pattern-matching-based-opts=true -polly-target-throughput-vector-fma=1 -polly-target-latency-vector-fma=8 -polly-target-1st-cache-level-associativity=8 -polly-target-2nd-cache-level-associativity=8 -polly-target-1st-cache-level-size=32768 -polly-target-vector-register-bitwidth=256 -polly-target-2nd-cache-level-size=262144 '-passes=polly-custom<opt-isl>' -disable-output < %s
+; RUN: opt %loadNPMPolly -polly-pattern-matching-based-opts=true \
+; RUN: -polly-target-throughput-vector-fma=1 \
+; RUN: -polly-target-latency-vector-fma=8 \
+; RUN: -polly-target-1st-cache-level-associativity=8 \
+; RUN: -polly-target-2nd-cache-level-associativity=8 \
+; RUN: -polly-target-1st-cache-level-size=32768 \
+; RUN: -polly-target-vector-register-bitwidth=256 \
+; RUN: -polly-target-2nd-cache-level-size=262144 \
+; RUN: -passes=polly-opt-isl -disable-output < %s
 ;
-; RUN: opt %loadNPMPolly '-passes=polly-custom<deps>' -polly-print-deps -disable-output < %s | FileCheck %s --check-prefix=DEPENDENCES
+; RUN: opt %loadNPMPolly '-passes=print<polly-dependences>' -disable-output < %s | FileCheck %s --check-prefix=DEPENDENCES
 ;
 ; /* C := A * B + C */
 ; /* Elements of the matrices A, B, C have the char type. */
diff --git a/polly/test/ScheduleOptimizer/pattern_matching_based_opts_splitmap.ll b/polly/test/ScheduleOptimizer/pattern_matching_based_opts_splitmap.ll
index 98c1db6d36fbe..64285891a16c7 100644
--- a/polly/test/ScheduleOptimizer/pattern_matching_based_opts_splitmap.ll
+++ b/polly/test/ScheduleOptimizer/pattern_matching_based_opts_splitmap.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-import-jscop-postfix=transformed '-passes=polly-custom<import-jscop;opt-isl>' -debug-only=polly-opt-isl -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-import-jscop -polly-import-jscop-postfix=transformed -passes=polly-opt-isl -debug-only=polly-opt-isl -disable-output < %s 2>&1 | FileCheck %s
 ; REQUIRES: asserts
 ;
 ; void pattern_matching_based_opts_splitmap(double C[static const restrict 2][2], double A[static const restrict 2][784], double B[static const restrict 784][2]) {
diff --git a/polly/test/ScheduleOptimizer/prevectorization-without-tiling.ll b/polly/test/ScheduleOptimizer/prevectorization-without-tiling.ll
index 4784dc88cd307..1c6d289744e39 100644
--- a/polly/test/ScheduleOptimizer/prevectorization-without-tiling.ll
+++ b/polly/test/ScheduleOptimizer/prevectorization-without-tiling.ll
@@ -1,4 +1,4 @@
-; RUN: opt -S %loadNPMPolly -aa-pipeline=basic-aa -polly-tiling=false -polly-pattern-matching-based-opts=false -polly-vectorizer=stripmine '-passes=polly-custom<opt-isl;ast>' -polly-print-ast -disable-output < %s | FileCheck %s
+; RUN: opt -S %loadNPMPolly -aa-pipeline=basic-aa -polly-tiling=false -polly-pattern-matching-based-opts=false -polly-vectorizer=stripmine '-passes=polly-opt-isl,print<polly-ast>' -disable-output < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
 @C = common global [1536 x [1536 x float]] zeroinitializer, align 16
diff --git a/polly/test/ScheduleOptimizer/prevectorization.ll b/polly/test/ScheduleOptimizer/prevectorization.ll
index 6d1592c4ba8fa..1ff20d165ce5e 100644
--- a/polly/test/ScheduleOptimizer/prevectorization.ll
+++ b/polly/test/ScheduleOptimizer/prevectorization.ll
@@ -1,5 +1,5 @@
-; RUN: opt -S %loadNPMPolly -aa-pipeline=basic-aa -polly-pattern-matching-based-opts=false -polly-vectorizer=stripmine '-passes=polly-custom<opt-isl;ast>' -polly-print-ast -disable-output < %s | FileCheck %s
-; RUN: opt -S %loadNPMPolly -aa-pipeline=basic-aa -polly-pattern-matching-based-opts=false -polly-vectorizer=stripmine -polly-prevect-width=16 '-passes=polly-custom<opt-isl;ast>' -polly-print-ast -disable-output < %s | FileCheck %s -check-prefix=VEC16
+; RUN: opt -S %loadNPMPolly -aa-pipeline=basic-aa -polly-pattern-matching-based-opts=false -polly-vectorizer=stripmine '-passes=polly-opt-isl,print<polly-ast>' -disable-output < %s | FileCheck %s
+; RUN: opt -S %loadNPMPolly -aa-pipeline=basic-aa -polly-pattern-matching-based-opts=false -polly-vectorizer=stripmine -polly-prevect-width=16 '-passes=polly-opt-isl,print<polly-ast>' -disable-output < %s | FileCheck %s -check-prefix=VEC16
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
diff --git a/polly/test/ScheduleOptimizer/prevectorization_islbound.ll b/polly/test/ScheduleOptimizer/prevectorization_islbound.ll
index f346e5365b198..0bc3c2cf642e8 100644
--- a/polly/test/ScheduleOptimizer/prevectorization_islbound.ll
+++ b/polly/test/ScheduleOptimizer/prevectorization_islbound.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -S -polly-vectorizer=stripmine '-passes=polly-custom<opt-isl>' -polly-debug -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -S -polly-vectorizer=stripmine -passes=polly-opt-isl -polly-debug -disable-output < %s 2>&1 | FileCheck %s
 ; REQUIRES: asserts
 
 define void @ham(ptr %arg, ptr %arg1, i32 %arg2, i32 %arg3, ptr %arg4, i32 %arg5, i32 %arg6) {
diff --git a/polly/test/ScheduleOptimizer/rectangular-tiling.ll b/polly/test/ScheduleOptimizer/rectangular-tiling.ll
index 3fd4907909419..e1d768b351d7d 100644
--- a/polly/test/ScheduleOptimizer/rectangular-tiling.ll
+++ b/polly/test/ScheduleOptimizer/rectangular-tiling.ll
@@ -1,7 +1,7 @@
-; RUN: opt %loadNPMPolly -polly-tile-sizes=256,16 '-passes=polly-custom<opt-isl;ast>' -polly-print-ast -disable-output < %s | FileCheck %s
-; RUN: opt %loadNPMPolly -polly-tile-sizes=256,16 -polly-tiling=false '-passes=polly-custom<opt-isl;ast>' -polly-print-ast -disable-output < %s | FileCheck %s --check-prefix=NOTILING
-; RUN: opt %loadNPMPolly -polly-tile-sizes=256,16 -polly-2nd-level-tiling -polly-2nd-level-tile-sizes=16,8 '-passes=polly-custom<opt-isl;ast>' -polly-print-ast -disable-output < %s | FileCheck %s --check-prefix=TWOLEVEL
-; RUN: opt %loadNPMPolly -polly-tile-sizes=256,16 -polly-2nd-level-tiling -polly-2nd-level-tile-sizes=16,8 -polly-register-tiling '-passes=polly-custom<opt-isl;ast>' -polly-print-ast -disable-output < %s | FileCheck %s --check-prefix=TWO-PLUS-REGISTER
+; RUN: opt %loadNPMPolly -polly-tile-sizes=256,16 '-passes=polly-opt-isl,print<polly-ast>' -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-tile-sizes=256,16 -polly-tiling=false '-passes=polly-opt-isl,print<polly-ast>' -disable-output < %s | FileCheck %s --check-prefix=NOTILING
+; RUN: opt %loadNPMPolly -polly-tile-sizes=256,16 -polly-2nd-level-tiling -polly-2nd-level-tile-sizes=16,8 '-passes=polly-opt-isl,print<polly-ast>' -disable-output < %s | FileCheck %s --check-prefix=TWOLEVEL
+; RUN: opt %loadNPMPolly -polly-tile-sizes=256,16 -polly-2nd-level-tiling -polly-2nd-level-tile-sizes=16,8 -polly-register-tiling '-passes=polly-opt-isl,print<polly-ast>' -disable-output < %s | FileCheck %s --check-prefix=TWO-PLUS-REGISTER
 
 ; CHECK: // 1st level tiling - Tiles
 ; CHECK: for (int c0 = 0; c0 <= 3; c0 += 1)
diff --git a/polly/test/ScheduleOptimizer/schedule_computeout.ll b/polly/test/ScheduleOptimizer/schedule_computeout.ll
index 1ee8a90473bd3..1e1359e3ecc6a 100644
--- a/polly/test/ScheduleOptimizer/schedule_computeout.ll
+++ b/polly/test/ScheduleOptimizer/schedule_computeout.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -S '-passes=polly-custom<optree;delicm;opt-isl>' -polly-schedule-computeout=10000 -debug-only=polly-opt-isl < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -S -passes=polly-optree -passes=polly-delicm -passes=polly-opt-isl -polly-schedule-computeout=10000 -debug-only="polly-opt-isl" < %s 2>&1 | FileCheck %s
 ; REQUIRES: asserts
 
 ; Bailout if the computations of schedule compute exceeds the max scheduling quota.
diff --git a/polly/test/ScheduleOptimizer/statistics.ll b/polly/test/ScheduleOptimizer/statistics.ll
index bb705ac6abf38..84eb59341d273 100644
--- a/polly/test/ScheduleOptimizer/statistics.ll
+++ b/polly/test/ScheduleOptimizer/statistics.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<opt-isl>' -stats -disable-output < %s 2>&1 | FileCheck %s -match-full-lines
+; RUN: opt %loadNPMPolly -passes=polly-opt-isl -stats -disable-output < %s 2>&1 | FileCheck %s -match-full-lines
 ; REQUIRES: asserts
 
diff --git a/polly/test/ScheduleOptimizer/tile_after_fusion.ll b/polly/test/ScheduleOptimizer/tile_after_fusion.ll
index e3d7c24ebef77..50a46d66176ea 100644
--- a/polly/test/ScheduleOptimizer/tile_after_fusion.ll
+++ b/polly/test/ScheduleOptimizer/tile_after_fusion.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-isl-arg=--no-schedule-serialize-sccs '-passes=polly-custom<opt-isl;ast>' -polly-print-ast -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-isl-arg=--no-schedule-serialize-sccs '-passes=polly-opt-isl,print<polly-ast>' -disable-output < %s | FileCheck %s
 ;
 ;
 ; void tf(int C[256][256][256], int A0[256][256][256], int A1[256][256][256]) {
diff --git a/polly/test/ScheduleOptimizer/vivid-vbi-gen-vivid_vbi_gen_sliced-before-llvmreduced.ll b/polly/test/ScheduleOptimizer/vivid-vbi-gen-vivid_vbi_gen_sliced-before-llvmreduced.ll
index bb472b9c3763f..e59a31665d77b 100644
--- a/polly/test/ScheduleOptimizer/vivid-vbi-gen-vivid_vbi_gen_sliced-before-llvmreduced.ll
+++ b/polly/test/ScheduleOptimizer/vivid-vbi-gen-vivid_vbi_gen_sliced-before-llvmreduced.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-vectorizer=stripmine -polly-isl-arg=--no-schedule-serialize-sccs -polly-tiling=0 '-passes=polly-custom<opt-isl>' -polly-print-opt-isl -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-vectorizer=stripmine -polly-isl-arg=--no-schedule-serialize-sccs -polly-tiling=0 '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s
 
 ; isl_schedule_node_band_sink may sink into multiple children.
 ; https://llvm.org/PR52637
diff --git a/polly/test/ScopDetect/aliasing_parametric_simple_1.ll b/polly/test/ScopDetect/aliasing_parametric_simple_1.ll
index d83c822371b6e..cee1c06cf7aa0 100644
--- a/polly/test/ScopDetect/aliasing_parametric_simple_1.ll
+++ b/polly/test/ScopDetect/aliasing_parametric_simple_1.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; CHECK: Valid Region for Scop:
 ;
diff --git a/polly/test/ScopDetect/aliasing_parametric_simple_2.ll b/polly/test/ScopDetect/aliasing_parametric_simple_2.ll
index 63c9addd0b6e1..5506b3c626cfd 100644
--- a/polly/test/ScopDetect/aliasing_parametric_simple_2.ll
+++ b/polly/test/ScopDetect/aliasing_parametric_simple_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; CHECK: Valid Region for Scop:
 ;
diff --git a/polly/test/ScopDetect/aliasing_simple_1.ll b/polly/test/ScopDetect/aliasing_simple_1.ll
index ea8a7688f3d25..5f43ec1856a7f 100644
--- a/polly/test/ScopDetect/aliasing_simple_1.ll
+++ b/polly/test/ScopDetect/aliasing_simple_1.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; CHECK: Valid Region for Scop:
 ;
diff --git a/polly/test/ScopDetect/aliasing_simple_2.ll b/polly/test/ScopDetect/aliasing_simple_2.ll
index df68289ff7352..e853dfcc64485 100644
--- a/polly/test/ScopDetect/aliasing_simple_2.ll
+++ b/polly/test/ScopDetect/aliasing_simple_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; CHECK: Valid Region for Scop:
 ;
diff --git a/polly/test/ScopDetect/base_pointer.ll b/polly/test/ScopDetect/base_pointer.ll
index 0f0e219bd90d1..e500f9bc20bc6 100644
--- a/polly/test/ScopDetect/base_pointer.ll
+++ b/polly/test/ScopDetect/base_pointer.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly --aa-pipeline= -polly-invariant-load-hoisting=true '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s | FileCheck %s
+; RUN: opt %loadPolly -disable-basic-aa -polly-invariant-load-hoisting=true -polly-print-detect -disable-output < %s | FileCheck %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
 
diff --git a/polly/test/ScopDetect/base_pointer_load_setNewAccessRelation.ll b/polly/test/ScopDetect/base_pointer_load_setNewAccessRelation.ll
index b00ec77679063..eeb9e11f812c3 100644
--- a/polly/test/ScopDetect/base_pointer_load_setNewAccessRelation.ll
+++ b/polly/test/ScopDetect/base_pointer_load_setNewAccessRelation.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-ignore-aliasing -polly-invariant-load-hoisting=true '-passes=polly<no-default-opts;import-jscop>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-ignore-aliasing -polly-invariant-load-hoisting=true '-passes=print<polly-function-scops>,scop(polly-import-jscop,polly-codegen)' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; This violated an assertion in setNewAccessRelation that assumed base pointers
 ; to be load-hoisted. Without this assertion, it codegen would generate invalid
diff --git a/polly/test/ScopDetect/base_pointer_setNewAccessRelation.ll b/polly/test/ScopDetect/base_pointer_setNewAccessRelation.ll
index 1cd04b639fc99..16976e6313275 100644
--- a/polly/test/ScopDetect/base_pointer_setNewAccessRelation.ll
+++ b/polly/test/ScopDetect/base_pointer_setNewAccessRelation.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts;import-jscop>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s --allow-empty
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>,scop(polly-import-jscop,polly-codegen)' -disable-output < %s 2>&1 | FileCheck %s --allow-empty
 ;
 ; Polly codegen used to generate invalid code (referring to %ptr from the
 ; original region) when regeneration of the access function is necessary.
@@ -35,5 +35,3 @@ exit:
 
 ; CHECK-NOT: Valid Region for Scop
-; CHECK: Detected Scops in Function base_pointer_is_inst_inside_invariant_1
-; CHECK-NOT: Valid Region for Scop
diff --git a/polly/test/ScopDetect/callbr.ll b/polly/test/ScopDetect/callbr.ll
index 4200339a04a13..4182974693678 100644
--- a/polly/test/ScopDetect/callbr.ll
+++ b/polly/test/ScopDetect/callbr.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -polly-detect-track-failures -disable-output -pass-remarks-missed=polly-detect < %s 2>&1 | FileCheck %s --check-prefix=REMARK
-; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -polly-detect-track-failures -disable-output -stats < %s 2>&1 | FileCheck %s --check-prefix=STAT
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -polly-detect-track-failures -disable-output -pass-remarks-missed=polly-detect < %s 2>&1 | FileCheck %s --check-prefix=REMARK
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -polly-detect-track-failures -disable-output -stats < %s 2>&1 | FileCheck %s --check-prefix=STAT
 ; REQUIRES: asserts
 
 ; REMARK: Branch from indirect terminator.
diff --git a/polly/test/ScopDetect/collective_invariant_loads.ll b/polly/test/ScopDetect/collective_invariant_loads.ll
index f5263e4e4c40a..f451bccec706f 100644
--- a/polly/test/ScopDetect/collective_invariant_loads.ll
+++ b/polly/test/ScopDetect/collective_invariant_loads.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting -disable-output< %s 2>&1 | FileCheck %s
 
 ;CHECK: Function: test_init_chpl
 ;CHECK-NEXT: Region: %bb1---%bb16
diff --git a/polly/test/ScopDetect/cross_loop_non_single_exit.ll b/polly/test/ScopDetect/cross_loop_non_single_exit.ll
index d7605c36d449c..fe3922174c07c 100644
--- a/polly/test/ScopDetect/cross_loop_non_single_exit.ll
+++ b/polly/test/ScopDetect/cross_loop_non_single_exit.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
 
 ; void f(long A[], long N) {
 ;   long i;
diff --git a/polly/test/ScopDetect/cross_loop_non_single_exit_2.ll b/polly/test/ScopDetect/cross_loop_non_single_exit_2.ll
index c3a2ad4791ba7..4cac173932a6f 100644
--- a/polly/test/ScopDetect/cross_loop_non_single_exit_2.ll
+++ b/polly/test/ScopDetect/cross_loop_non_single_exit_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
 
 ; void f(long A[], long N) {
 ;   long i;
diff --git a/polly/test/ScopDetect/dependency_to_phi_node_outside_of_region.ll b/polly/test/ScopDetect/dependency_to_phi_node_outside_of_region.ll
index e896e18589e94..7d7476471bb6e 100644
--- a/polly/test/ScopDetect/dependency_to_phi_node_outside_of_region.ll
+++ b/polly/test/ScopDetect/dependency_to_phi_node_outside_of_region.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
 
 define void @f(ptr %A, i64 %N, i64 %M) nounwind {
diff --git a/polly/test/ScopDetect/detect-full-functions.ll b/polly/test/ScopDetect/detect-full-functions.ll
index adad0e89ffa42..178ef32827cab 100644
--- a/polly/test/ScopDetect/detect-full-functions.ll
+++ b/polly/test/ScopDetect/detect-full-functions.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -polly-process-unprofitable=false -disable-output -polly-detect-full-functions < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -polly-process-unprofitable=false -disable-output -polly-detect-full-functions < %s 2>&1 | FileCheck %s
 
 ; Verify if a simple function with basic block not part of loop doesn't crash with polly-process-unprofitable=false and polly-detect-full-functions flags.
 
diff --git a/polly/test/ScopDetect/dom-tree-crash.ll b/polly/test/ScopDetect/dom-tree-crash.ll
index 0f670ca230824..efc732c50e177 100644
--- a/polly/test/ScopDetect/dom-tree-crash.ll
+++ b/polly/test/ScopDetect/dom-tree-crash.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
 
 ; CHECK: Detected Scops in Function foo
 
diff --git a/polly/test/ScopDetect/dot-scops-npm.ll b/polly/test/ScopDetect/dot-scops-npm.ll
index de1f52813475a..d14bf8a23a166 100644
--- a/polly/test/ScopDetect/dot-scops-npm.ll
+++ b/polly/test/ScopDetect/dot-scops-npm.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-dot -disable-output < %s
+; RUN: opt %loadNPMPolly '-passes=polly-scop-printer' -disable-output < %s
 ; RUN: FileCheck %s -input-file=scops.func_npm.dot
 ;
 ; Check that the ScopPrinter does not crash.
diff --git a/polly/test/ScopDetect/dot-scops.ll b/polly/test/ScopDetect/dot-scops.ll
index a719d21300b15..63163b23617cf 100644
--- a/polly/test/ScopDetect/dot-scops.ll
+++ b/polly/test/ScopDetect/dot-scops.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>,polly-scop-printer' -disable-output < %s
 ;
 ; Check that the ScopPrinter does not crash.
 ; ScopPrinter needs the ScopDetection pass, which should depend on
diff --git a/polly/test/ScopDetect/error-block-always-executed.ll b/polly/test/ScopDetect/error-block-always-executed.ll
index 0e82e37d10095..20d02b1c1ae0b 100644
--- a/polly/test/ScopDetect/error-block-always-executed.ll
+++ b/polly/test/ScopDetect/error-block-always-executed.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; CHECK-NOT: Valid Region for Scop:
 
diff --git a/polly/test/ScopDetect/error-block-referenced-from-scop.ll b/polly/test/ScopDetect/error-block-referenced-from-scop.ll
index 338fe20679bcf..6c66f6df14af5 100644
--- a/polly/test/ScopDetect/error-block-referenced-from-scop.ll
+++ b/polly/test/ScopDetect/error-block-referenced-from-scop.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; CHECK-NOT: Valid Region for Scop:
 
diff --git a/polly/test/ScopDetect/error-block-unreachable.ll b/polly/test/ScopDetect/error-block-unreachable.ll
index 85f248da9be18..6ba7698a972bb 100644
--- a/polly/test/ScopDetect/error-block-unreachable.ll
+++ b/polly/test/ScopDetect/error-block-unreachable.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s
 
 ; Verify that the scop detection does not crash on inputs with unreachable
 ; blocks. Earlier we crashed when detecting error blocks.
diff --git a/polly/test/ScopDetect/expand-region-correctly-2.ll b/polly/test/ScopDetect/expand-region-correctly-2.ll
index 43fdda8321cbe..a5c9626d28361 100644
--- a/polly/test/ScopDetect/expand-region-correctly-2.ll
+++ b/polly/test/ScopDetect/expand-region-correctly-2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; CHECK: Valid Region for Scop: if.end.1631 => for.cond.1647.outer
 ;
diff --git a/polly/test/ScopDetect/expand-region-correctly.ll b/polly/test/ScopDetect/expand-region-correctly.ll
index b4caac4478d1d..a8c90c08fde0c 100644
--- a/polly/test/ScopDetect/expand-region-correctly.ll
+++ b/polly/test/ScopDetect/expand-region-correctly.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
 
 ; CHECK: Valid Region for Scop: if.end.1631 => for.cond.1647.outer
 
diff --git a/polly/test/ScopDetect/ignore_func_flag_regex.ll b/polly/test/ScopDetect/ignore_func_flag_regex.ll
index ef1c66686251a..a75e705995a75 100644
--- a/polly/test/ScopDetect/ignore_func_flag_regex.ll
+++ b/polly/test/ScopDetect/ignore_func_flag_regex.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-polly-ignore-func=f.*,g.*' '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-ignore-func=f.*,g.* '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; Check that the flag `-polly-ignore-func` works with regexes.
 ;
diff --git a/polly/test/ScopDetect/index_from_unpredictable_loop.ll b/polly/test/ScopDetect/index_from_unpredictable_loop.ll
index a6f7079f68407..f6d6cfab0eede 100644
--- a/polly/test/ScopDetect/index_from_unpredictable_loop.ll
+++ b/polly/test/ScopDetect/index_from_unpredictable_loop.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s | FileCheck %s --check-prefix=AFFINE
-; RUN: opt %loadNPMPolly -polly-allow-nonaffine '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s | FileCheck %s --check-prefix=NONAFFINE
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s | FileCheck %s --check-prefix=AFFINE
+; RUN: opt %loadNPMPolly -polly-allow-nonaffine '-passes=print<polly-function-scops>' -disable-output < %s | FileCheck %s --check-prefix=NONAFFINE
 
 ; The SCoP contains a loop with multiple exit blocks (BBs after leaving
 ; the loop). The current implementation of deriving their domain derives
diff --git a/polly/test/ScopDetect/index_from_unpredictable_loop2.ll b/polly/test/ScopDetect/index_from_unpredictable_loop2.ll
index be76e0b138933..16d47619b0ff2 100644
--- a/polly/test/ScopDetect/index_from_unpredictable_loop2.ll
+++ b/polly/test/ScopDetect/index_from_unpredictable_loop2.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s | FileCheck %s --check-prefix=AFFINE
-; RUN: opt %loadNPMPolly -polly-allow-nonaffine '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s | FileCheck %s --check-prefix=NONAFFINE
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s | FileCheck %s --check-prefix=AFFINE
+; RUN: opt %loadNPMPolly -polly-allow-nonaffine '-passes=print<polly-function-scops>' -disable-output < %s | FileCheck %s --check-prefix=NONAFFINE
 
 ; The SCoP contains a loop with multiple exit blocks (BBs after leaving
 ; the loop). The current implementation of deriving their domain derives
diff --git a/polly/test/ScopDetect/indvars.ll b/polly/test/ScopDetect/indvars.ll
index e45e4fb016155..3fbc4d65bbe20 100644
--- a/polly/test/ScopDetect/indvars.ll
+++ b/polly/test/ScopDetect/indvars.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>,scop(polly-codegen)' -disable-output < %s 2>&1 | FileCheck %s
 ;
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
 
diff --git a/polly/test/ScopDetect/intrinsics_1.ll b/polly/test/ScopDetect/intrinsics_1.ll
index 43fa4ca619ed7..58c9197f7f799 100644
--- a/polly/test/ScopDetect/intrinsics_1.ll
+++ b/polly/test/ScopDetect/intrinsics_1.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; CHECK: Valid Region for Scop: for.cond => for.end
 ;
diff --git a/polly/test/ScopDetect/intrinsics_2.ll b/polly/test/ScopDetect/intrinsics_2.ll
index b4cc3df7c746b..f71016e6d04cd 100644
--- a/polly/test/ScopDetect/intrinsics_2.ll
+++ b/polly/test/ScopDetect/intrinsics_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; Verify that we allow the lifetime markers for the tmp array.
 ;
diff --git a/polly/test/ScopDetect/intrinsics_3.ll b/polly/test/ScopDetect/intrinsics_3.ll
index 08fdee573ba0f..579d5bd481d44 100644
--- a/polly/test/ScopDetect/intrinsics_3.ll
+++ b/polly/test/ScopDetect/intrinsics_3.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; Verify that we allow the misc intrinsics.
 ;
diff --git a/polly/test/ScopDetect/invalid-latch-conditions.ll b/polly/test/ScopDetect/invalid-latch-conditions.ll
index c7d7c51e7d220..db4898c9c7bd7 100644
--- a/polly/test/ScopDetect/invalid-latch-conditions.ll
+++ b/polly/test/ScopDetect/invalid-latch-conditions.ll
@@ -1,6 +1,6 @@
-; RUN: opt %loadNPMPolly -polly-process-unprofitable=false '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly -polly-allow-nonaffine-loops '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s --check-prefix=NALOOPS
-; RUN: opt %loadNPMPolly -polly-allow-nonaffine-loops -polly-process-unprofitable=false '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s --check-prefix=PROFIT
+; RUN: opt %loadNPMPolly -polly-process-unprofitable=false '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-allow-nonaffine-loops '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=NALOOPS
+; RUN: opt %loadNPMPolly -polly-allow-nonaffine-loops -polly-process-unprofitable=false '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=PROFIT
 
 ; The latch conditions of the outer loop are not affine, thus the loop cannot
 ; handled by the domain generation and needs to be overapproximated.
diff --git a/polly/test/ScopDetect/invalidate_scalar_evolution.ll b/polly/test/ScopDetect/invalidate_scalar_evolution.ll
index 977918eb5168d..ddef510ad4d9f 100644
--- a/polly/test/ScopDetect/invalidate_scalar_evolution.ll
+++ b/polly/test/ScopDetect/invalidate_scalar_evolution.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s -check-prefix=PHI
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s -check-prefix=PHI
 
 ; void f(long A[], long N) {
 ;   long i;
diff --git a/polly/test/ScopDetect/invariant-load-before-scop.ll b/polly/test/ScopDetect/invariant-load-before-scop.ll
index 932c218170caf..10479643959cb 100644
--- a/polly/test/ScopDetect/invariant-load-before-scop.ll
+++ b/polly/test/ScopDetect/invariant-load-before-scop.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s -match-full-lines
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s -match-full-lines
 ;
 ; The LoadInst %.b761 is defined outside the SCoP, hence is always constant
 ; within it. It is no "required invariant load".
diff --git a/polly/test/ScopDetect/keep_going_expansion.ll b/polly/test/ScopDetect/keep_going_expansion.ll
index efd81c695ca0d..074aae9ae95c9 100644
--- a/polly/test/ScopDetect/keep_going_expansion.ll
+++ b/polly/test/ScopDetect/keep_going_expansion.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-detect-track-failures -polly-detect-keep-going '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-detect-track-failures -polly-detect-keep-going '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
diff --git a/polly/test/ScopDetect/mod_ref_read_pointer.ll b/polly/test/ScopDetect/mod_ref_read_pointer.ll
index c7972cc47a68d..64535d85f2ab1 100644
--- a/polly/test/ScopDetect/mod_ref_read_pointer.ll
+++ b/polly/test/ScopDetect/mod_ref_read_pointer.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-modref-calls '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s -check-prefix=MODREF
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-modref-calls '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s -check-prefix=MODREF
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; CHECK-NOT: Valid Region for Scop: for.body => for.end
 ; MODREF: Valid Region for Scop: for.body => for.end
diff --git a/polly/test/ScopDetect/more-than-one-loop.ll b/polly/test/ScopDetect/more-than-one-loop.ll
index 1835342812b1f..30090652326d2 100644
--- a/polly/test/ScopDetect/more-than-one-loop.ll
+++ b/polly/test/ScopDetect/more-than-one-loop.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly -polly-process-unprofitable=false '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly -polly-process-unprofitable=true '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-process-unprofitable=false '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-process-unprofitable=true '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
 
 ; CHECK: Valid Region for Scop:
 
diff --git a/polly/test/ScopDetect/multidim-with-undef-size.ll b/polly/test/ScopDetect/multidim-with-undef-size.ll
index e89cea98ad21a..2a5f8b15534fa 100644
--- a/polly/test/ScopDetect/multidim-with-undef-size.ll
+++ b/polly/test/ScopDetect/multidim-with-undef-size.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
 ; CHECK: Valid Region for Scop: bb14 => bb17
diff --git a/polly/test/ScopDetect/multidim.ll b/polly/test/ScopDetect/multidim.ll
index cbe7d0708b853..91202373263f0 100644
--- a/polly/test/ScopDetect/multidim.ll
+++ b/polly/test/ScopDetect/multidim.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
 ; CHECK: Valid Region for Scop: bb19 => bb20
diff --git a/polly/test/ScopDetect/multidim_indirect_access.ll b/polly/test/ScopDetect/multidim_indirect_access.ll
index 4af37ba064558..a9cd446d27670 100644
--- a/polly/test/ScopDetect/multidim_indirect_access.ll
+++ b/polly/test/ScopDetect/multidim_indirect_access.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; Check that we will recognize this SCoP.
 ;
diff --git a/polly/test/ScopDetect/multidim_two_accesses_different_delinearization.ll b/polly/test/ScopDetect/multidim_two_accesses_different_delinearization.ll
index 0286642f3c7a7..9c91fbfbe0b64 100644
--- a/polly/test/ScopDetect/multidim_two_accesses_different_delinearization.ll
+++ b/polly/test/ScopDetect/multidim_two_accesses_different_delinearization.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 
 ; Derived from the following code:
diff --git a/polly/test/ScopDetect/nested_loop_single_exit.ll b/polly/test/ScopDetect/nested_loop_single_exit.ll
index 89071df596807..a0742112b6e12 100644
--- a/polly/test/ScopDetect/nested_loop_single_exit.ll
+++ b/polly/test/ScopDetect/nested_loop_single_exit.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -disable-output < %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -disable-output < %s
 
 ; void f(long A[], long N) {
 ;   long i, j;
diff --git a/polly/test/ScopDetect/non-affine-conditional.ll b/polly/test/ScopDetect/non-affine-conditional.ll
index b20828d9a7679..e74619cd87756 100644
--- a/polly/test/ScopDetect/non-affine-conditional.ll
+++ b/polly/test/ScopDetect/non-affine-conditional.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-allow-nonaffine-branches '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-allow-nonaffine-branches '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; void f(int *A) {
 ;   for (int i = 0; i < 1024; i++)
diff --git a/polly/test/ScopDetect/non-affine-float-compare.ll b/polly/test/ScopDetect/non-affine-float-compare.ll
index 77427397bac9d..9326cd4290380 100644
--- a/polly/test/ScopDetect/non-affine-float-compare.ll
+++ b/polly/test/ScopDetect/non-affine-float-compare.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-allow-nonaffine-branches '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-allow-nonaffine-branches '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; void f(float *A) {
 ;   for (int i = 0; i < 1024; i++)
diff --git a/polly/test/ScopDetect/non-affine-loop-condition-dependent-access.ll b/polly/test/ScopDetect/non-affine-loop-condition-dependent-access.ll
index f6ae9fe8dd544..1ab6b35ae93f1 100644
--- a/polly/test/ScopDetect/non-affine-loop-condition-dependent-access.ll
+++ b/polly/test/ScopDetect/non-affine-loop-condition-dependent-access.ll
@@ -1,7 +1,7 @@
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=false '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s --check-prefix=REJECTNONAFFINELOOPS
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s --check-prefix=ALLOWNONAFFINELOOPS
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true -polly-allow-nonaffine '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s --check-prefix=ALLOWNONAFFINELOOPSANDACCESSES
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true -polly-allow-nonaffine -polly-process-unprofitable=false '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s --check-prefix=PROFIT
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=false '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=REJECTNONAFFINELOOPS
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=ALLOWNONAFFINELOOPS
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true -polly-allow-nonaffine '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=ALLOWNONAFFINELOOPSANDACCESSES
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true -polly-allow-nonaffine -polly-process-unprofitable=false '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=PROFIT
 ;
 ; Here we have a non-affine loop but also a non-affine access which should
 ; be rejected as long as -polly-allow-nonaffine isn't given.
diff --git a/polly/test/ScopDetect/non-affine-loop-condition-dependent-access_2.ll b/polly/test/ScopDetect/non-affine-loop-condition-dependent-access_2.ll
index 23c1765caecac..921f6ab535499 100644
--- a/polly/test/ScopDetect/non-affine-loop-condition-dependent-access_2.ll
+++ b/polly/test/ScopDetect/non-affine-loop-condition-dependent-access_2.ll
@@ -1,6 +1,6 @@
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=false '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s --check-prefix=REJECTNONAFFINELOOPS
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s --check-prefix=ALLOWNONAFFINELOOPS
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true -polly-allow-nonaffine '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s --check-prefix=ALLOWNONAFFINELOOPSANDACCESSES
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=false '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=REJECTNONAFFINELOOPS
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=ALLOWNONAFFINELOOPS
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true -polly-allow-nonaffine '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=ALLOWNONAFFINELOOPSANDACCESSES
 ;
 ; Here we have a non-affine loop (in the context of the loop nest)
 ; and also a non-affine access (A[k]). While we can always detect the
diff --git a/polly/test/ScopDetect/non-affine-loop-condition-dependent-access_3.ll b/polly/test/ScopDetect/non-affine-loop-condition-dependent-access_3.ll
index 6e239a6570668..78774d92e0a46 100644
--- a/polly/test/ScopDetect/non-affine-loop-condition-dependent-access_3.ll
+++ b/polly/test/ScopDetect/non-affine-loop-condition-dependent-access_3.ll
@@ -1,6 +1,6 @@
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=false '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s --check-prefix=REJECTNONAFFINELOOPS
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s --check-prefix=ALLOWNONAFFINELOOPS
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true -polly-allow-nonaffine '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s --check-prefix=ALLOWNONAFFINELOOPSANDACCESSES
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=false '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=REJECTNONAFFINELOOPS
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=ALLOWNONAFFINELOOPS
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true -polly-allow-nonaffine '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=ALLOWNONAFFINELOOPSANDACCESSES
 ;
 ; Here we have a non-affine loop (in the context of the loop nest)
 ; and also a non-affine access (A[k]). While we can always detect the
diff --git a/polly/test/ScopDetect/non-affine-loop.ll b/polly/test/ScopDetect/non-affine-loop.ll
index dd675ccec5999..5136b3b8779b1 100644
--- a/polly/test/ScopDetect/non-affine-loop.ll
+++ b/polly/test/ScopDetect/non-affine-loop.ll
@@ -1,8 +1,8 @@
-; RUN: opt %loadNPMPolly -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=false '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s --check-prefix=REJECTNONAFFINELOOPS
-; RUN: opt %loadNPMPolly -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s --check-prefix=ALLOWNONAFFINELOOPS
-; RUN: opt %loadNPMPolly -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=false -polly-allow-nonaffine '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s --check-prefix=ALLOWNONAFFINEREGIONSANDACCESSES
-; RUN: opt %loadNPMPolly -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true -polly-allow-nonaffine '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s --check-prefix=ALLOWNONAFFINELOOPSANDACCESSES
-; RUN: opt %loadNPMPolly -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true -polly-allow-nonaffine -polly-process-unprofitable=false '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s --check-prefix=PROFIT
+; RUN: opt %loadNPMPolly -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=false '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=REJECTNONAFFINELOOPS
+; RUN: opt %loadNPMPolly -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=ALLOWNONAFFINELOOPS
+; RUN: opt %loadNPMPolly -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=false -polly-allow-nonaffine '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=ALLOWNONAFFINEREGIONSANDACCESSES
+; RUN: opt %loadNPMPolly -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true -polly-allow-nonaffine '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=ALLOWNONAFFINELOOPSANDACCESSES
+; RUN: opt %loadNPMPolly -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true -polly-allow-nonaffine -polly-process-unprofitable=false '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=PROFIT
 ;
 ; This function/region does contain a loop, however it is non-affine, hence the access
 ; A[i] is also. Furthermore, it is the only loop, thus when we over approximate
diff --git a/polly/test/ScopDetect/non-beneficial-loops-small-trip-count.ll b/polly/test/ScopDetect/non-beneficial-loops-small-trip-count.ll
index 63b1cdb420b71..fd52c5df7b27e 100644
--- a/polly/test/ScopDetect/non-beneficial-loops-small-trip-count.ll
+++ b/polly/test/ScopDetect/non-beneficial-loops-small-trip-count.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-process-unprofitable=false '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-process-unprofitable=false '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; CHECK-NOT: Valid
 ;
diff --git a/polly/test/ScopDetect/non-constant-add-rec-start-expr.ll b/polly/test/ScopDetect/non-constant-add-rec-start-expr.ll
index ff4ad3218ffa5..d0c1f7a613332 100644
--- a/polly/test/ScopDetect/non-constant-add-rec-start-expr.ll
+++ b/polly/test/ScopDetect/non-constant-add-rec-start-expr.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
 
 ; CHECK: Valid Region for Scop: bb11 => bb25
 
diff --git a/polly/test/ScopDetect/non-simple-memory-accesses.ll b/polly/test/ScopDetect/non-simple-memory-accesses.ll
index 5b9ed2b2ecae7..bdc48984f9961 100644
--- a/polly/test/ScopDetect/non-simple-memory-accesses.ll
+++ b/polly/test/ScopDetect/non-simple-memory-accesses.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; Verify that we do not model atomic memory accesses. We did not reason about
 ; how to handle them correctly and the Alias Set Tracker models some of them
diff --git a/polly/test/ScopDetect/non_affine_loop_condition.ll b/polly/test/ScopDetect/non_affine_loop_condition.ll
index 3c487374c1973..63bd7b3a2f1f2 100644
--- a/polly/test/ScopDetect/non_affine_loop_condition.ll
+++ b/polly/test/ScopDetect/non_affine_loop_condition.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly -polly-allow-nonaffine-loops '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly -polly-allow-nonaffine-loops -polly-process-unprofitable=false '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s --check-prefix=PROFIT
+; RUN: opt %loadNPMPolly -polly-allow-nonaffine-loops '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-allow-nonaffine-loops -polly-process-unprofitable=false '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=PROFIT
 ;
 ; void f(int *A) {
 ;   for (int i = 0; i < 1024; i++) {
diff --git a/polly/test/ScopDetect/only-one-affine-loop.ll b/polly/test/ScopDetect/only-one-affine-loop.ll
index a8ce5bc636833..1d36f4df35bc3 100644
--- a/polly/test/ScopDetect/only-one-affine-loop.ll
+++ b/polly/test/ScopDetect/only-one-affine-loop.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-process-unprofitable=false -polly-allow-nonaffine-loops '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-process-unprofitable=false -polly-allow-nonaffine-loops '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; Even if we allow non-affine loops we can only model the outermost loop, all
 ; other loops are boxed in non-affine regions. However, the inner loops can be
diff --git a/polly/test/ScopDetect/only_func_flag.ll b/polly/test/ScopDetect/only_func_flag.ll
index f4f35048fa8a0..4742375fec5cf 100644
--- a/polly/test/ScopDetect/only_func_flag.ll
+++ b/polly/test/ScopDetect/only_func_flag.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-only-func=f,g '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-only-func=f,g '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; Check that the flag `-polly-only-func` limits analysis to `f` and `g`.
 ;
diff --git a/polly/test/ScopDetect/only_func_flag_regex.ll b/polly/test/ScopDetect/only_func_flag_regex.ll
index f180fa765f4b0..2ad22c9f7a7f5 100644
--- a/polly/test/ScopDetect/only_func_flag_regex.ll
+++ b/polly/test/ScopDetect/only_func_flag_regex.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-polly-only-func=f.*,g.*' '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-only-func=f.*,g.* '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; Check that the flag `-polly-only-func` works with regexes.
 ;
diff --git a/polly/test/ScopDetect/parametric-multiply-in-scev-2.ll b/polly/test/ScopDetect/parametric-multiply-in-scev-2.ll
index 71d1ba0accd32..271825a58c399 100644
--- a/polly/test/ScopDetect/parametric-multiply-in-scev-2.ll
+++ b/polly/test/ScopDetect/parametric-multiply-in-scev-2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
 
 ; CHECK-NOT: Valid Region
 
diff --git a/polly/test/ScopDetect/parametric-multiply-in-scev.ll b/polly/test/ScopDetect/parametric-multiply-in-scev.ll
index 6768c969a7428..2ab8997c63331 100644
--- a/polly/test/ScopDetect/parametric-multiply-in-scev.ll
+++ b/polly/test/ScopDetect/parametric-multiply-in-scev.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
 
 ; foo(float *A, long n, long k) {
 ; if (true)
diff --git a/polly/test/ScopDetect/phi_with_multi_exiting_edges.ll b/polly/test/ScopDetect/phi_with_multi_exiting_edges.ll
index 2e16b75ee3106..248bb43aacd98 100644
--- a/polly/test/ScopDetect/phi_with_multi_exiting_edges.ll
+++ b/polly/test/ScopDetect/phi_with_multi_exiting_edges.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; Region with an exit node that has a PHI node multiple incoming edges from
 ; inside the region. Motivation for supporting such cases in Polly.
diff --git a/polly/test/ScopDetect/profitability-large-basic-blocks.ll b/polly/test/ScopDetect/profitability-large-basic-blocks.ll
index ac27016e3622d..d74185b45c752 100644
--- a/polly/test/ScopDetect/profitability-large-basic-blocks.ll
+++ b/polly/test/ScopDetect/profitability-large-basic-blocks.ll
@@ -1,8 +1,12 @@
-; RUN: opt %loadNPMPolly -polly-process-unprofitable=false -polly-detect-profitability-min-per-loop-insts=40 '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s -check-prefix=PROFITABLE
+; RUN: opt %loadNPMPolly -polly-process-unprofitable=false \
+; RUN: -polly-detect-profitability-min-per-loop-insts=40 \
+; RUN: '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s -check-prefix=PROFITABLE
 
-; RUN: opt %loadNPMPolly -polly-process-unprofitable=true '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s -check-prefix=PROFITABLE
+; RUN: opt %loadNPMPolly -polly-process-unprofitable=true \
+; RUN: '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s -check-prefix=PROFITABLE
 
-; RUN: opt %loadNPMPolly -polly-process-unprofitable=false '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s -check-prefix=UNPROFITABLE
+; RUN: opt %loadNPMPolly -polly-process-unprofitable=false \
+; RUN: '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s -check-prefix=UNPROFITABLE
 
 ; UNPROFITABLE-NOT: Valid Region for Scop:
 ; PROFITABLE: Valid Region for Scop:
diff --git a/polly/test/ScopDetect/profitability-two-nested-loops.ll b/polly/test/ScopDetect/profitability-two-nested-loops.ll
index 80379bcc5d412..0291d3be452a1 100644
--- a/polly/test/ScopDetect/profitability-two-nested-loops.ll
+++ b/polly/test/ScopDetect/profitability-two-nested-loops.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
 
 ; CHECK: Valid Region for Scop: next => bb3
 ;
diff --git a/polly/test/ScopDetect/remove_all_children.ll b/polly/test/ScopDetect/remove_all_children.ll
index 1c77d730ed418..d95e9bde0b384 100644
--- a/polly/test/ScopDetect/remove_all_children.ll
+++ b/polly/test/ScopDetect/remove_all_children.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 
diff --git a/polly/test/ScopDetect/report-scop-location.ll b/polly/test/ScopDetect/report-scop-location.ll
index 530a22f9ac3d4..5e4c38db5e53c 100644
--- a/polly/test/ScopDetect/report-scop-location.ll
+++ b/polly/test/ScopDetect/report-scop-location.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -polly-report -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -polly-report -disable-output < %s 2>&1 | FileCheck %s
 target datalayout = "e-i64:64-f80:128-s:64-n8:16:32:64-S128"
 
 ; Function Attrs: nounwind uwtable
diff --git a/polly/test/ScopDetect/restrict-undef-size-scopdetect.ll b/polly/test/ScopDetect/restrict-undef-size-scopdetect.ll
index 2ade0a97a5991..f49190b33ccf7 100644
--- a/polly/test/ScopDetect/restrict-undef-size-scopdetect.ll
+++ b/polly/test/ScopDetect/restrict-undef-size-scopdetect.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
 ; CHECK-NOT: Valid Region for Scop:
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/polly/test/ScopDetect/run_time_alias_check.ll b/polly/test/ScopDetect/run_time_alias_check.ll
index 6f327e318082c..74cbedb34e5c6 100644
--- a/polly/test/ScopDetect/run_time_alias_check.ll
+++ b/polly/test/ScopDetect/run_time_alias_check.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
 
diff --git a/polly/test/ScopDetect/scev_remove_max.ll b/polly/test/ScopDetect/scev_remove_max.ll
index 4f03845795c9c..f76c832ff08f5 100644
--- a/polly/test/ScopDetect/scev_remove_max.ll
+++ b/polly/test/ScopDetect/scev_remove_max.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect < %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' < %s
 
 ; This test case helps to determine whether SCEVRemoveMax::remove produces
 ; an infinite loop and a segmentation fault, if it processes, for example,
diff --git a/polly/test/ScopDetect/sequential_loops.ll b/polly/test/ScopDetect/sequential_loops.ll
index 338a9ae6b6b0e..4a84f356f3e81 100644
--- a/polly/test/ScopDetect/sequential_loops.ll
+++ b/polly/test/ScopDetect/sequential_loops.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
 
diff --git a/polly/test/ScopDetect/simple_loop.ll b/polly/test/ScopDetect/simple_loop.ll
index 5da4898517e22..33823b21fb8fb 100644
--- a/polly/test/ScopDetect/simple_loop.ll
+++ b/polly/test/ScopDetect/simple_loop.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
 
 ; void f(long A[], long N) {
 ;   long i;
diff --git a/polly/test/ScopDetect/simple_loop_non_single_entry.ll b/polly/test/ScopDetect/simple_loop_non_single_entry.ll
index 00e11ab252e73..1bba2c21c7473 100644
--- a/polly/test/ScopDetect/simple_loop_non_single_entry.ll
+++ b/polly/test/ScopDetect/simple_loop_non_single_entry.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
 
 ; void f(long A[], long N) {
 ;   long i;
diff --git a/polly/test/ScopDetect/simple_loop_non_single_exit.ll b/polly/test/ScopDetect/simple_loop_non_single_exit.ll
index 9f75b80f58cef..93ec84e911c5d 100644
--- a/polly/test/ScopDetect/simple_loop_non_single_exit.ll
+++ b/polly/test/ScopDetect/simple_loop_non_single_exit.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
 
 ; void f(long A[], long N) {
 ;   long i;
diff --git a/polly/test/ScopDetect/simple_loop_non_single_exit_2.ll b/polly/test/ScopDetect/simple_loop_non_single_exit_2.ll
index c6ce482403400..33b0d8d7d6fc0 100644
--- a/polly/test/ScopDetect/simple_loop_non_single_exit_2.ll
+++ b/polly/test/ScopDetect/simple_loop_non_single_exit_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
 
 ; void f(long A[], long N) {
 ;   long i;
diff --git a/polly/test/ScopDetect/simple_loop_two_phi_nodes.ll b/polly/test/ScopDetect/simple_loop_two_phi_nodes.ll
index c90c4915e866d..9b47b7c946caf 100644
--- a/polly/test/ScopDetect/simple_loop_two_phi_nodes.ll
+++ b/polly/test/ScopDetect/simple_loop_two_phi_nodes.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
 
 ; void f(long A[], long N) {
 ;   long i;
diff --git a/polly/test/ScopDetect/simple_loop_with_param.ll b/polly/test/ScopDetect/simple_loop_with_param.ll
index 67f677892313c..4a0a3adab661d 100644
--- a/polly/test/ScopDetect/simple_loop_with_param.ll
+++ b/polly/test/ScopDetect/simple_loop_with_param.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s -check-prefix=PHI
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s -check-prefix=PHI
 
 ; void f(long A[], long N, long *init_ptr) {
 ;   long i, j;
diff --git a/polly/test/ScopDetect/simple_loop_with_param_2.ll b/polly/test/ScopDetect/simple_loop_with_param_2.ll
index 9e7b55efc48d9..670936b6fee80 100644
--- a/polly/test/ScopDetect/simple_loop_with_param_2.ll
+++ b/polly/test/ScopDetect/simple_loop_with_param_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
 
 ; void f(long A[], int N, int *init_ptr) {
 ;   long i, j;
diff --git a/polly/test/ScopDetect/simple_non_single_entry.ll b/polly/test/ScopDetect/simple_non_single_entry.ll
index e56c022aa5466..6ace3b636019b 100644
--- a/polly/test/ScopDetect/simple_non_single_entry.ll
+++ b/polly/test/ScopDetect/simple_non_single_entry.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
 
 ; void f(long A[], long N) {
 ;   long i;
diff --git a/polly/test/ScopDetect/skip_function_attribute.ll b/polly/test/ScopDetect/skip_function_attribute.ll
index 789942a950051..2150a3e8c35dd 100644
--- a/polly/test/ScopDetect/skip_function_attribute.ll
+++ b/polly/test/ScopDetect/skip_function_attribute.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; Verify polly skips this function
 ;
diff --git a/polly/test/ScopDetect/srem_with_parametric_divisor.ll b/polly/test/ScopDetect/srem_with_parametric_divisor.ll
index 471602968055e..66c3b045f62a4 100644
--- a/polly/test/ScopDetect/srem_with_parametric_divisor.ll
+++ b/polly/test/ScopDetect/srem_with_parametric_divisor.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; CHECK-NOT: Valid Region for Scop:
 ;
diff --git a/polly/test/ScopDetect/statistics.ll b/polly/test/ScopDetect/statistics.ll
index 5d87599da29f7..a1dcebec63ff8 100644
--- a/polly/test/ScopDetect/statistics.ll
+++ b/polly/test/ScopDetect/statistics.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -stats -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -stats -disable-output < %s 2>&1 | FileCheck %s
 
 ; REQUIRES: asserts
 
diff --git a/polly/test/ScopDetect/switch-in-loop-patch.ll b/polly/test/ScopDetect/switch-in-loop-patch.ll
index 1e825f4950afa..2f9b670384db2 100644
--- a/polly/test/ScopDetect/switch-in-loop-patch.ll
+++ b/polly/test/ScopDetect/switch-in-loop-patch.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
 
 ; CHECK-NOT: Valid
 
diff --git a/polly/test/ScopDetect/tlr_is_hoistable_load.ll b/polly/test/ScopDetect/tlr_is_hoistable_load.ll
index 24a3f55a519e2..5c33522f62325 100644
--- a/polly/test/ScopDetect/tlr_is_hoistable_load.ll
+++ b/polly/test/ScopDetect/tlr_is_hoistable_load.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-invariant-load-hoisting -polly-detect-full-functions '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadPolly -polly-invariant-load-hoisting -polly-detect-full-functions -polly-print-scops -disable-output < %s | FileCheck %s
 ;
 ; This testcase checks for compatibility of the -detect-full-functions
 ; flag in combination with the -invariant-load-hoisting option. More
diff --git a/polly/test/ScopDetectionDiagnostics/ReportAlias-01.ll b/polly/test/ScopDetectionDiagnostics/ReportAlias-01.ll
index e7245d80b60ed..4ae86a940e0c8 100644
--- a/polly/test/ScopDetectionDiagnostics/ReportAlias-01.ll
+++ b/polly/test/ScopDetectionDiagnostics/ReportAlias-01.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-use-runtime-alias-checks=false -pass-remarks-missed=polly-detect -polly-detect-track-failures '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-use-runtime-alias-checks=false -pass-remarks-missed="polly-detect" -polly-detect-track-failures '-passes=print<polly-detect>' -disable-output < %s 2>&1| FileCheck %s
 
 ;void f(int A[], int B[]) {
 ; for (int i=0; i<42; i++)
diff --git a/polly/test/ScopDetectionDiagnostics/ReportEntry.ll b/polly/test/ScopDetectionDiagnostics/ReportEntry.ll
index 2a0b281073f59..adb14b5b017d4 100644
--- a/polly/test/ScopDetectionDiagnostics/ReportEntry.ll
+++ b/polly/test/ScopDetectionDiagnostics/ReportEntry.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -pass-remarks-missed=polly-detect -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -pass-remarks-missed="polly-detect" -disable-output < %s 2>&1| FileCheck %s
 
 ; CHECK: remark: <unknown>:0:0: Scop contains function entry (not yet supported).
 
diff --git a/polly/test/ScopDetectionDiagnostics/ReportFuncCall-01.ll b/polly/test/ScopDetectionDiagnostics/ReportFuncCall-01.ll
index fc4c1fbcef484..428a7cf855f6e 100644
--- a/polly/test/ScopDetectionDiagnostics/ReportFuncCall-01.ll
+++ b/polly/test/ScopDetectionDiagnostics/ReportFuncCall-01.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -pass-remarks-missed=polly-detect -polly-detect-track-failures '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -pass-remarks-missed="polly-detect" -polly-detect-track-failures '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
 
 ; #define N 1024
 ; double invalidCall(double A[N]);
diff --git a/polly/test/ScopDetectionDiagnostics/ReportIrreducibleRegion.ll b/polly/test/ScopDetectionDiagnostics/ReportIrreducibleRegion.ll
index 7a540d606eadf..30e5fb9fdeba8 100644
--- a/polly/test/ScopDetectionDiagnostics/ReportIrreducibleRegion.ll
+++ b/polly/test/ScopDetectionDiagnostics/ReportIrreducibleRegion.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -pass-remarks-missed=polly-detect -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -pass-remarks-missed="polly-detect" -disable-output < %s 2>&1| FileCheck %s
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
 ;void foo(int a, int b) {
diff --git a/polly/test/ScopDetectionDiagnostics/ReportIrreducibleRegionWithoutDebugLoc.ll b/polly/test/ScopDetectionDiagnostics/ReportIrreducibleRegionWithoutDebugLoc.ll
index 512366f1bc7ce..2bc515e0ae5e1 100644
--- a/polly/test/ScopDetectionDiagnostics/ReportIrreducibleRegionWithoutDebugLoc.ll
+++ b/polly/test/ScopDetectionDiagnostics/ReportIrreducibleRegionWithoutDebugLoc.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -pass-remarks-missed=polly-detect -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -pass-remarks-missed="polly-detect" -disable-output < %s 2>&1| FileCheck %s
 
 ; CHECK: remark: <unknown>:0:0: Irreducible region encountered in control flow.
 
diff --git a/polly/test/ScopDetectionDiagnostics/ReportLoopBound-01.ll b/polly/test/ScopDetectionDiagnostics/ReportLoopBound-01.ll
index e844aea24ac26..a96b64e4e0d54 100644
--- a/polly/test/ScopDetectionDiagnostics/ReportLoopBound-01.ll
+++ b/polly/test/ScopDetectionDiagnostics/ReportLoopBound-01.ll
@@ -1,6 +1,16 @@
-; RUN: opt %loadNPMPolly -pass-remarks-missed=polly-detect -polly-detect-track-failures -polly-allow-nonaffine-loops=false '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s --check-prefix=REJECTNONAFFINELOOPS
-; RUN: opt %loadNPMPolly -pass-remarks-missed=polly-detect -polly-detect-track-failures -polly-allow-nonaffine-loops=true '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s --check-prefix=ALLOWNONAFFINELOOPS
-; RUN: opt %loadNPMPolly -pass-remarks-missed=polly-detect -polly-process-unprofitable=false -polly-detect-track-failures -polly-allow-nonaffine-loops=true -polly-allow-nonaffine '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s --check-prefix=ALLOWNONAFFINEALL
+; RUN: opt %loadNPMPolly \
+; RUN: -pass-remarks-missed="polly-detect" -polly-detect-track-failures \
+; RUN: -polly-allow-nonaffine-loops=false '-passes=print<polly-detect>' -disable-output \
+; RUN: < %s 2>&1| FileCheck %s --check-prefix=REJECTNONAFFINELOOPS
+; RUN: opt %loadNPMPolly \
+; RUN: -pass-remarks-missed="polly-detect" -polly-detect-track-failures \
+; RUN: -polly-allow-nonaffine-loops=true '-passes=print<polly-detect>' -disable-output \
+; RUN: < %s 2>&1| FileCheck %s --check-prefix=ALLOWNONAFFINELOOPS
+; RUN: opt %loadNPMPolly -pass-remarks-missed="polly-detect" \
+; RUN: -polly-process-unprofitable=false \
+; RUN: -polly-detect-track-failures -polly-allow-nonaffine-loops=true \
+; RUN: -polly-allow-nonaffine '-passes=print<polly-detect>' -disable-output < %s 2>&1 \
+; RUN: | FileCheck %s --check-prefix=ALLOWNONAFFINEALL
 
 ; void f(int A[], int n) {
 ;   for (int i = 0; i < A[n+i]; i++)
diff --git a/polly/test/ScopDetectionDiagnostics/ReportLoopHasNoExit.ll b/polly/test/ScopDetectionDiagnostics/ReportLoopHasNoExit.ll
index d80911cc0ec9a..6156efaea1909 100644
--- a/polly/test/ScopDetectionDiagnostics/ReportLoopHasNoExit.ll
+++ b/polly/test/ScopDetectionDiagnostics/ReportLoopHasNoExit.ll
@@ -4,8 +4,8 @@
 ; the PostDominatorTree. Infinite loops are postdominated only by the virtual
 ; root, which causes them not to appear in regions in ScopDetection anymore.
 
-; RUN: opt %loadNPMPolly -pass-remarks-missed=polly-detect -polly-allow-nonaffine-loops '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly -pass-remarks-missed=polly-detect -polly-allow-nonaffine-loops=false '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -pass-remarks-missed="polly-detect" -polly-allow-nonaffine-loops '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -pass-remarks-missed="polly-detect" -polly-allow-nonaffine-loops=false '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
 
 ; void func (int param0, int N, int *A)
 ; {
diff --git a/polly/test/ScopDetectionDiagnostics/ReportMultipleNonAffineAccesses.ll b/polly/test/ScopDetectionDiagnostics/ReportMultipleNonAffineAccesses.ll
index d8c2916cc23bb..dd95bd6ede715 100644
--- a/polly/test/ScopDetectionDiagnostics/ReportMultipleNonAffineAccesses.ll
+++ b/polly/test/ScopDetectionDiagnostics/ReportMultipleNonAffineAccesses.ll
@@ -1,9 +1,9 @@
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -pass-remarks-missed=polly-detect -polly-detect-track-failures '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -pass-remarks-missed=polly-detect -polly-detect-track-failures '-passes=polly-custom<detect>' -polly-print-detect -polly-delinearize=false -polly-detect-keep-going -disable-output < %s 2>&1 | FileCheck %s -check-prefix=ALL
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -pass-remarks-missed=polly-detect -polly-detect-track-failures '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s -check-prefix=DELIN
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -pass-remarks-missed=polly-detect -polly-detect-track-failures '-passes=polly-custom<detect>' -polly-print-detect -polly-detect-keep-going -disable-output < %s 2>&1 | FileCheck %s -check-prefix=DELIN-ALL
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -pass-remarks-missed=polly-detect -polly-detect-track-failures '-passes=polly-custom<detect>' -polly-print-detect -polly-allow-nonaffine -disable-output < %s 2>&1 | FileCheck %s -check-prefix=NONAFFINE
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -pass-remarks-missed=polly-detect -polly-detect-track-failures '-passes=polly-custom<detect>' -polly-print-detect -polly-allow-nonaffine -disable-output < %s 2>&1 | FileCheck %s -check-prefix=NONAFFINE
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -pass-remarks-missed="polly-detect" -polly-detect-track-failures '-passes=print<polly-detect>' -disable-output < %s 2>&1| FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -pass-remarks-missed="polly-detect" -polly-detect-track-failures '-passes=print<polly-detect>' -polly-delinearize=false -polly-detect-keep-going -disable-output < %s 2>&1| FileCheck %s -check-prefix=ALL
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -pass-remarks-missed="polly-detect" -polly-detect-track-failures '-passes=print<polly-detect>' -disable-output < %s 2>&1| FileCheck %s -check-prefix=DELIN
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -pass-remarks-missed="polly-detect" -polly-detect-track-failures '-passes=print<polly-detect>' -polly-detect-keep-going -disable-output < %s 2>&1| FileCheck %s -check-prefix=DELIN-ALL
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -pass-remarks-missed="polly-detect" -polly-detect-track-failures '-passes=print<polly-detect>' -polly-allow-nonaffine -disable-output < %s 2>&1| FileCheck %s -check-prefix=NONAFFINE
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -pass-remarks-missed="polly-detect" -polly-detect-track-failures '-passes=print<polly-detect>' -polly-allow-nonaffine -disable-output < %s 2>&1| FileCheck %s -check-prefix=NONAFFINE
 
 ; 1 void manyaccesses(float A[restrict], long n, float B[restrict][n])
 ; 2 {
diff --git a/polly/test/ScopDetectionDiagnostics/ReportNonAffineAccess-01.ll b/polly/test/ScopDetectionDiagnostics/ReportNonAffineAccess-01.ll
index ee0aa743f434b..13ac9d5ace2d3 100644
--- a/polly/test/ScopDetectionDiagnostics/ReportNonAffineAccess-01.ll
+++ b/polly/test/ScopDetectionDiagnostics/ReportNonAffineAccess-01.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -pass-remarks-missed=polly-detect -polly-detect-track-failures '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -pass-remarks-missed="polly-detect" -polly-detect-track-failures '-passes=print<polly-detect>' -disable-output < %s 2>&1| FileCheck %s
 
 ; void f(int A[]) {
 ; for(int i=0; i<42; ++i)
diff --git a/polly/test/ScopDetectionDiagnostics/ReportUnprofitable.ll b/polly/test/ScopDetectionDiagnostics/ReportUnprofitable.ll
index ad2c813c4b7ce..93e9e8b14038b 100644
--- a/polly/test/ScopDetectionDiagnostics/ReportUnprofitable.ll
+++ b/polly/test/ScopDetectionDiagnostics/ReportUnprofitable.ll
@@ -1,6 +1,10 @@
-; RUN: opt %loadNPMPolly -pass-remarks-missed=polly-detect -polly-detect-track-failures '-passes=polly-custom<detect>' -polly-print-detect -disable-output -polly-process-unprofitable=false < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -pass-remarks-missed="polly-detect" \
+; RUN: -polly-detect-track-failures '-passes=print<polly-detect>' -disable-output \
+; RUN: -polly-process-unprofitable=false < %s 2>&1| FileCheck %s
 
-; RUN: opt %loadNPMPolly -pass-remarks-missed=polly-detect -polly-detect-track-failures '-passes=polly-custom<detect>' -polly-print-detect -disable-output -polly-process-unprofitable=false -pass-remarks-output=%t.yaml < %s 2>&1
+; RUN: opt %loadNPMPolly -pass-remarks-missed="polly-detect" \
+; RUN: -polly-detect-track-failures '-passes=print<polly-detect>' -disable-output \
+; RUN: -polly-process-unprofitable=false < %s 2>&1 -pass-remarks-output=%t.yaml
 ; RUN: cat %t.yaml | FileCheck -check-prefix=YAML %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/polly/test/ScopDetectionDiagnostics/ReportUnreachableInExit.ll b/polly/test/ScopDetectionDiagnostics/ReportUnreachableInExit.ll
index d97032c8f8eaf..d110cfefc27dd 100644
--- a/polly/test/ScopDetectionDiagnostics/ReportUnreachableInExit.ll
+++ b/polly/test/ScopDetectionDiagnostics/ReportUnreachableInExit.ll
@@ -1,4 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output -pass-remarks-missed=polly-detect < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s \
+; RUN: -pass-remarks-missed="polly-detect" 2>&1 | FileCheck %s
 
 ; void f(long A[], long N) {
 ; long i;
diff --git a/polly/test/ScopDetectionDiagnostics/ReportVariantBasePtr-01.ll b/polly/test/ScopDetectionDiagnostics/ReportVariantBasePtr-01.ll
index 7a5025c0c2fbe..5f296fae9532b 100644
--- a/polly/test/ScopDetectionDiagnostics/ReportVariantBasePtr-01.ll
+++ b/polly/test/ScopDetectionDiagnostics/ReportVariantBasePtr-01.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -pass-remarks-missed=polly-detect -polly-detect-track-failures '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -pass-remarks-missed="polly-detect" -polly-detect-track-failures '-passes=print<polly-detect>' -disable-output < %s 2>&1| FileCheck %s
 
 ; struct b {
 ; double **b;
diff --git a/polly/test/ScopDetectionDiagnostics/loop_has_multiple_exits.ll b/polly/test/ScopDetectionDiagnostics/loop_has_multiple_exits.ll
index e15c045907ddf..3cdeed13ec285 100644
--- a/polly/test/ScopDetectionDiagnostics/loop_has_multiple_exits.ll
+++ b/polly/test/ScopDetectionDiagnostics/loop_has_multiple_exits.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -pass-remarks-missed=polly-detect -polly-detect-track-failures '-passes=polly-custom<detect>' -polly-print-detect -disable-output 2>&1 < %s | FileCheck %s -match-full-lines
+; RUN: opt %loadNPMPolly -pass-remarks-missed="polly-detect" -polly-detect-track-failures '-passes=print<polly-detect>' -disable-output 2>&1 < %s | FileCheck %s -match-full-lines
 ;
 ; Derived from test-suite/MultiSource/Benchmarks/BitBench/uuencode/uuencode.c
 ;
diff --git a/polly/test/ScopDetectionDiagnostics/loop_partially_in_scop-2.ll b/polly/test/ScopDetectionDiagnostics/loop_partially_in_scop-2.ll
index b5918d9f7a2d4..4a9a200d67dfd 100644
--- a/polly/test/ScopDetectionDiagnostics/loop_partially_in_scop-2.ll
+++ b/polly/test/ScopDetectionDiagnostics/loop_partially_in_scop-2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -pass-remarks-missed=polly-detect -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -pass-remarks-missed="polly-detect" -disable-output < %s 2>&1| FileCheck %s
 
 ; CHECK: remark: <unknown>:0:0: Loop cannot be handled because not all latches are part of loop region.
 
diff --git a/polly/test/ScopDetectionDiagnostics/loop_partially_in_scop.ll b/polly/test/ScopDetectionDiagnostics/loop_partially_in_scop.ll
index 502abf8dab6d7..61ff033d9f934 100644
--- a/polly/test/ScopDetectionDiagnostics/loop_partially_in_scop.ll
+++ b/polly/test/ScopDetectionDiagnostics/loop_partially_in_scop.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -pass-remarks-missed=polly-detect -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -pass-remarks-missed="polly-detect" -disable-output < %s 2>&1| FileCheck %s
 
 ; CHECK: remark: <unknown>:0:0: Loop cannot be handled because not all latches are part of loop region.
 ; CHECK: remark: <unknown>:0:0: Loop cannot be handled because not all latches are part of loop region.
diff --git a/polly/test/ScopInfo/20110312-Fail-without-basicaa.ll b/polly/test/ScopInfo/20110312-Fail-without-basicaa.ll
index accb562771819..c5efec3f50c58 100644
--- a/polly/test/ScopInfo/20110312-Fail-without-basicaa.ll
+++ b/polly/test/ScopInfo/20110312-Fail-without-basicaa.ll
@@ -1,5 +1,5 @@
 ; This should be run without alias analysis enabled.
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32-n8:16:32"
 
 define i32 @main() nounwind {
diff --git a/polly/test/ScopInfo/20111108-Parameter-not-detected.ll b/polly/test/ScopInfo/20111108-Parameter-not-detected.ll
index 57ae977a1a13f..81c7efb963652 100644
--- a/polly/test/ScopInfo/20111108-Parameter-not-detected.ll
+++ b/polly/test/ScopInfo/20111108-Parameter-not-detected.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 
 declare void @foo()
diff --git a/polly/test/ScopInfo/2012-03-16-Crash-because-of-unsigned-in-scev.ll b/polly/test/ScopInfo/2012-03-16-Crash-because-of-unsigned-in-scev.ll
index 3cb63cc4f952c..5abf8ff29ef85 100644
--- a/polly/test/ScopInfo/2012-03-16-Crash-because-of-unsigned-in-scev.ll
+++ b/polly/test/ScopInfo/2012-03-16-Crash-because-of-unsigned-in-scev.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 
 target datalayout = "e-p:32:32:32-i64:64:64-i32:32:32-i16:16:16-i1:32:32-f64:64:64-f32:32:32-a0:0-n32"
 
diff --git a/polly/test/ScopInfo/2015-10-04-Crash-in-domain-generation.ll b/polly/test/ScopInfo/2015-10-04-Crash-in-domain-generation.ll
index 668fcd8fabcaf..d16ba453f9815 100644
--- a/polly/test/ScopInfo/2015-10-04-Crash-in-domain-generation.ll
+++ b/polly/test/ScopInfo/2015-10-04-Crash-in-domain-generation.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-allow-nonaffine-loops '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s
+; RUN: opt %loadNPMPolly -polly-allow-nonaffine-loops '-passes=print<polly-function-scops>' -disable-output < %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
diff --git a/polly/test/ScopInfo/Alias-0.ll b/polly/test/ScopInfo/Alias-0.ll
index 50c1b65727eaf..ebbe744627ef8 100644
--- a/polly/test/ScopInfo/Alias-0.ll
+++ b/polly/test/ScopInfo/Alias-0.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output -stats < %s 2>&1 | FileCheck %s --check-prefix=RTA
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-use-runtime-alias-checks=false -disable-output -stats < %s 2>&1 | FileCheck %s --check-prefix=NORTA
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s -stats 2>&1 | FileCheck %s --check-prefix=RTA
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-use-runtime-alias-checks=false -disable-output < %s -stats 2>&1 | FileCheck %s --check-prefix=NORTA
 ; REQUIRES: asserts
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
diff --git a/polly/test/ScopInfo/Alias-1.ll b/polly/test/ScopInfo/Alias-1.ll
index 15fd6c936fc47..b1711c25857d0 100644
--- a/polly/test/ScopInfo/Alias-1.ll
+++ b/polly/test/ScopInfo/Alias-1.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output -stats < %s 2>&1 | FileCheck %s --check-prefix=RTA
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-use-runtime-alias-checks=false -disable-output -stats < %s 2>&1 | FileCheck %s --check-prefix=NORTA
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s -stats 2>&1 | FileCheck %s --check-prefix=RTA
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-use-runtime-alias-checks=false -disable-output < %s -stats 2>&1 | FileCheck %s --check-prefix=NORTA
 ; REQUIRES: asserts
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
diff --git a/polly/test/ScopInfo/Alias-2.ll b/polly/test/ScopInfo/Alias-2.ll
index 598ad0fe8cf1c..b94f130c94ebd 100644
--- a/polly/test/ScopInfo/Alias-2.ll
+++ b/polly/test/ScopInfo/Alias-2.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output -stats < %s 2>&1 | FileCheck %s --check-prefix=RTA
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-use-runtime-alias-checks=false -disable-output -stats < %s 2>&1 | FileCheck %s --check-prefix=NORTA
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s -stats 2>&1 | FileCheck %s --check-prefix=RTA
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-use-runtime-alias-checks=false -disable-output < %s -stats 2>&1 | FileCheck %s --check-prefix=NORTA
 ; REQUIRES: asserts
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
diff --git a/polly/test/ScopInfo/Alias-3.ll b/polly/test/ScopInfo/Alias-3.ll
index 388a2defec395..af7816546b4ab 100644
--- a/polly/test/ScopInfo/Alias-3.ll
+++ b/polly/test/ScopInfo/Alias-3.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output -stats < %s 2>&1 | FileCheck %s --check-prefix=RTA
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-use-runtime-alias-checks=false -disable-output -stats < %s 2>&1 | FileCheck %s --check-prefix=NORTA
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s -stats 2>&1 | FileCheck %s --check-prefix=RTA
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-use-runtime-alias-checks=false -disable-output < %s -stats 2>&1 | FileCheck %s --check-prefix=NORTA
 ; REQUIRES: asserts
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
diff --git a/polly/test/ScopInfo/Alias-4.ll b/polly/test/ScopInfo/Alias-4.ll
index e9f4f95a9997f..fe651c87b241c 100644
--- a/polly/test/ScopInfo/Alias-4.ll
+++ b/polly/test/ScopInfo/Alias-4.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly -aa-pipeline= '-passes=polly-custom<scops>' -polly-print-detect -polly-print-scops -disable-output -stats < %s 2>&1 | FileCheck %s --check-prefix=RTA
-; RUN: opt %loadNPMPolly -aa-pipeline= '-passes=polly-custom<scops>' -polly-print-detect -polly-print-scops -polly-use-runtime-alias-checks=false -disable-output -stats < %s 2>&1 | FileCheck %s --check-prefix=NORTA
+; RUN: opt %loadNPMPolly -aa-pipeline= '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s -stats 2>&1 | FileCheck %s --check-prefix=RTA
+; RUN: opt %loadNPMPolly -aa-pipeline= '-passes=print<polly-detect>,print<polly-function-scops>' -polly-use-runtime-alias-checks=false -disable-output < %s -stats 2>&1 | FileCheck %s --check-prefix=NORTA
 ; REQUIRES: asserts
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
diff --git a/polly/test/ScopInfo/BoundChecks/single-loop.ll b/polly/test/ScopInfo/BoundChecks/single-loop.ll
index d44c18cf49e36..0b69beaaf3f9c 100644
--- a/polly/test/ScopInfo/BoundChecks/single-loop.ll
+++ b/polly/test/ScopInfo/BoundChecks/single-loop.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s 2>&1 | FileCheck %s --check-prefix=AST
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=AST
 ;
 ; This only works after the post-dominator tree has been fixed.
 ;
diff --git a/polly/test/ScopInfo/BoundChecks/two-loops.ll b/polly/test/ScopInfo/BoundChecks/two-loops.ll
index 9034f75f13792..f2ba17d33c0ea 100644
--- a/polly/test/ScopInfo/BoundChecks/two-loops.ll
+++ b/polly/test/ScopInfo/BoundChecks/two-loops.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s 2>&1 | FileCheck %s --check-prefix=AST
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=AST
 ;
 ; This only works after the post-dominator tree has fixed.
 ; XFAIL: *
diff --git a/polly/test/ScopInfo/NonAffine/div_backedge.ll b/polly/test/ScopInfo/NonAffine/div_backedge.ll
index e8edad9494075..3b0c673ece38b 100644
--- a/polly/test/ScopInfo/NonAffine/div_backedge.ll
+++ b/polly/test/ScopInfo/NonAffine/div_backedge.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; void foo(float *A) {
 ; for (long i = 1;; i++) {
diff --git a/polly/test/ScopInfo/NonAffine/div_domain.ll b/polly/test/ScopInfo/NonAffine/div_domain.ll
index c195bb42dac9f..34a5cecdfe3df 100644
--- a/polly/test/ScopInfo/NonAffine/div_domain.ll
+++ b/polly/test/ScopInfo/NonAffine/div_domain.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; void foo(float *A) {
 ; for (long i = 0; i < 16; i++) {
diff --git a/polly/test/ScopInfo/NonAffine/invariant_loads_dependent_in_non_affine_region.ll b/polly/test/ScopInfo/NonAffine/invariant_loads_dependent_in_non_affine_region.ll
index 31ecdaa0ef3e4..7d02fae7f98f3 100644
--- a/polly/test/ScopInfo/NonAffine/invariant_loads_dependent_in_non_affine_region.ll
+++ b/polly/test/ScopInfo/NonAffine/invariant_loads_dependent_in_non_affine_region.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-invariant-load-hoisting=true '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-invariant-load-hoisting=true '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; void f(int *A, int *B, int *C) {
 ; for (int i = 0; i < 1000; i++)
diff --git a/polly/test/ScopInfo/NonAffine/modulo_backedge.ll b/polly/test/ScopInfo/NonAffine/modulo_backedge.ll
index e0cd1e51a095c..d5c808d9021f2 100644
--- a/polly/test/ScopInfo/NonAffine/modulo_backedge.ll
+++ b/polly/test/ScopInfo/NonAffine/modulo_backedge.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; CHECK: Domain :=
 ; CHECK: { Stmt_for_body[i0] : 0 <= i0 <= 6 };
diff --git a/polly/test/ScopInfo/NonAffine/modulo_domain.ll b/polly/test/ScopInfo/NonAffine/modulo_domain.ll
index 53bbe15799e61..13fe53f11633d 100644
--- a/polly/test/ScopInfo/NonAffine/modulo_domain.ll
+++ b/polly/test/ScopInfo/NonAffine/modulo_domain.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; TODO: The new domain generation cannot handle modulo domain constraints,
 ; hence modulo handling has been disabled completely. Once this is
diff --git a/polly/test/ScopInfo/NonAffine/non-affine-loop-condition-dependent-access_1.ll b/polly/test/ScopInfo/NonAffine/non-affine-loop-condition-dependent-access_1.ll
index 7d34ef9644b5a..2b8427d74ec84 100644
--- a/polly/test/ScopInfo/NonAffine/non-affine-loop-condition-dependent-access_1.ll
+++ b/polly/test/ScopInfo/NonAffine/non-affine-loop-condition-dependent-access_1.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true '-passes=polly-custom<scops>' -polly-print-detect -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s -check-prefix=SCALAR
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true -polly-process-unprofitable=false '-passes=polly-custom<scops>' -polly-print-detect -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s -check-prefix=PROFIT
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s -check-prefix=SCALAR
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true -polly-process-unprofitable=false '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s -check-prefix=PROFIT
 ;
 ; SCALAR: Function: f
 ; SCALAR-NEXT: Region: %bb1---%bb13
diff --git a/polly/test/ScopInfo/NonAffine/non-affine-loop-condition-dependent-access_2.ll b/polly/test/ScopInfo/NonAffine/non-affine-loop-condition-dependent-access_2.ll
index a40afdde1237f..30f756e81e474 100644
--- a/polly/test/ScopInfo/NonAffine/non-affine-loop-condition-dependent-access_2.ll
+++ b/polly/test/ScopInfo/NonAffine/non-affine-loop-condition-dependent-access_2.ll
@@ -1,6 +1,6 @@
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=false '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s --check-prefix=INNERMOST
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s --check-prefix=INNERMOST
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true -polly-allow-nonaffine '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s --check-prefix=ALL
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=false '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=INNERMOST
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=INNERMOST
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true -polly-allow-nonaffine '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=ALL
 ;
 ; Here we have a non-affine loop (in the context of the loop nest)
 ; and also a non-affine access (A[k]). While we can always model the
diff --git a/polly/test/ScopInfo/NonAffine/non-affine-loop-condition-dependent-access_3.ll b/polly/test/ScopInfo/NonAffine/non-affine-loop-condition-dependent-access_3.ll
index f3678d3245f57..6dacd719862ef 100644
--- a/polly/test/ScopInfo/NonAffine/non-affine-loop-condition-dependent-access_3.ll
+++ b/polly/test/ScopInfo/NonAffine/non-affine-loop-condition-dependent-access_3.ll
@@ -1,6 +1,6 @@
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=false '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s --check-prefix=INNERMOST
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s --check-prefix=INNERMOST
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true -polly-allow-nonaffine '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s --check-prefix=ALL
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=false '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=INNERMOST
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=INNERMOST
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true -polly-allow-nonaffine '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=ALL
 ;
 ; Here we have a non-affine loop (in the context of the loop nest)
 ; and also a non-affine access (A[k]). While we can always model the
diff --git a/polly/test/ScopInfo/NonAffine/non_affine_access_with_range_2.ll b/polly/test/ScopInfo/NonAffine/non_affine_access_with_range_2.ll
index 85a1081159d59..8a13f791ed6de 100644
--- a/polly/test/ScopInfo/NonAffine/non_affine_access_with_range_2.ll
+++ b/polly/test/ScopInfo/NonAffine/non_affine_access_with_range_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-allow-nonaffine '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-allow-nonaffine '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; void f(int *A) {
 ; for (int i = 0; i < 128; i++)
diff --git a/polly/test/ScopInfo/NonAffine/non_affine_but_sdiv.ll b/polly/test/ScopInfo/NonAffine/non_affine_but_sdiv.ll
index 65513a5d9d1fb..1e70d2c9db87e 100644
--- a/polly/test/ScopInfo/NonAffine/non_affine_but_sdiv.ll
+++ b/polly/test/ScopInfo/NonAffine/non_affine_but_sdiv.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; CHECK: Statements {
 ; CHECK-NEXT: Stmt_for_body
diff --git a/polly/test/ScopInfo/NonAffine/non_affine_but_srem.ll b/polly/test/ScopInfo/NonAffine/non_affine_but_srem.ll
index 0185774d6274c..dcfaa9280dcb8 100644
--- a/polly/test/ScopInfo/NonAffine/non_affine_but_srem.ll
+++ b/polly/test/ScopInfo/NonAffine/non_affine_but_srem.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; void pos(float *A, long n) {
 ; for (long i = 0; i < 100; i++)
diff --git a/polly/test/ScopInfo/NonAffine/non_affine_conditional_nested.ll b/polly/test/ScopInfo/NonAffine/non_affine_conditional_nested.ll
index ab47dc0b78260..24bfe60502163 100644
--- a/polly/test/ScopInfo/NonAffine/non_affine_conditional_nested.ll
+++ b/polly/test/ScopInfo/NonAffine/non_affine_conditional_nested.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-allow-nonaffine-branches '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-allow-nonaffine-branches '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; void f(int *A) {
 ; for (int i = 0; i < 1024; i++)
diff --git a/polly/test/ScopInfo/NonAffine/non_affine_conditional_surrounding_affine_loop.ll b/polly/test/ScopInfo/NonAffine/non_affine_conditional_surrounding_affine_loop.ll
index 51a7d54562780..931ad36d15f34 100644
--- a/polly/test/ScopInfo/NonAffine/non_affine_conditional_surrounding_affine_loop.ll
+++ b/polly/test/ScopInfo/NonAffine/non_affine_conditional_surrounding_affine_loop.ll
@@ -1,5 +1,12 @@
-; RUN: opt %loadNPMPolly -polly-allow-nonaffine-branches -polly-invariant-load-hoisting=true -polly-allow-nonaffine-loops=true '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s --check-prefix=INNERMOST
-; RUN: opt %loadNPMPolly -polly-allow-nonaffine -polly-invariant-load-hoisting=true -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s --check-prefix=ALL
+; RUN: opt %loadNPMPolly -polly-allow-nonaffine-branches \
+; RUN: -polly-invariant-load-hoisting=true \
+; RUN: -polly-allow-nonaffine-loops=true \
+; RUN: '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=INNERMOST
+; RUN: opt %loadNPMPolly -polly-allow-nonaffine \
+; RUN: -polly-invariant-load-hoisting=true \
+; RUN: -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true \
+; RUN: '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s \
+; RUN: --check-prefix=ALL
 ;
 ; Negative test for INNERMOST.
 ; At the moment we will optimistically assume A[i] in the conditional before the inner
diff --git a/polly/test/ScopInfo/NonAffine/non_affine_conditional_surrounding_non_affine_loop.ll b/polly/test/ScopInfo/NonAffine/non_affine_conditional_surrounding_non_affine_loop.ll
index b1f7e65e9dd25..37b51cebd74d5 100644
--- a/polly/test/ScopInfo/NonAffine/non_affine_conditional_surrounding_non_affine_loop.ll
+++ b/polly/test/ScopInfo/NonAffine/non_affine_conditional_surrounding_non_affine_loop.ll
@@ -1,6 +1,16 @@
-; RUN: opt %loadNPMPolly -polly-allow-nonaffine-branches -polly-invariant-load-hoisting=true -polly-allow-nonaffine-loops=true '-passes=polly-custom<scops>' -polly-print-detect -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s --check-prefix=INNERMOST
-; RUN: opt %loadNPMPolly -polly-allow-nonaffine -polly-invariant-load-hoisting=true -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true '-passes=polly-custom<scops>' -polly-print-detect -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s --check-prefix=ALL
-; RUN: opt %loadNPMPolly -polly-allow-nonaffine -polly-invariant-load-hoisting=true -polly-process-unprofitable=false -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true '-passes=polly-custom<scops>' -polly-print-detect -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s --check-prefix=PROFIT
+; RUN: opt %loadNPMPolly -polly-allow-nonaffine-branches \
+; RUN: -polly-invariant-load-hoisting=true \
+; RUN: -polly-allow-nonaffine-loops=true \
+; RUN: '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=INNERMOST
+; RUN: opt %loadNPMPolly -polly-allow-nonaffine \
+; RUN: -polly-invariant-load-hoisting=true \
+; RUN: -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true \
+; RUN: '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=ALL
+; RUN: opt %loadNPMPolly -polly-allow-nonaffine \
+; RUN: -polly-invariant-load-hoisting=true \
+; RUN: -polly-process-unprofitable=false \
+; RUN: -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true \
+; RUN: '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=PROFIT
 ;
 ; Negative test for INNERMOST.
 ; At the moment we will optimistically assume A[i] in the conditional before the inner
diff --git a/polly/test/ScopInfo/NonAffine/non_affine_float_compare.ll b/polly/test/ScopInfo/NonAffine/non_affine_float_compare.ll
index ac77dfb7454d3..7bfd7f86efcdb 100644
--- a/polly/test/ScopInfo/NonAffine/non_affine_float_compare.ll
+++ b/polly/test/ScopInfo/NonAffine/non_affine_float_compare.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-allow-nonaffine-branches '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-allow-nonaffine-branches '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; void f(float *A) {
 ; for (int i = 0; i < 1024; i++)
diff --git a/polly/test/ScopInfo/NonAffine/non_affine_loop_condition.ll b/polly/test/ScopInfo/NonAffine/non_affine_loop_condition.ll
index db08544aa559c..fc779d544e62f 100644
--- a/polly/test/ScopInfo/NonAffine/non_affine_loop_condition.ll
+++ b/polly/test/ScopInfo/NonAffine/non_affine_loop_condition.ll
@@ -1,6 +1,6 @@
-; RUN: opt %loadNPMPolly -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops '-passes=polly-custom<scops>' -polly-print-detect -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops -polly-process-unprofitable=false '-passes=polly-custom<scops>' -polly-print-detect -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s --check-prefix=PROFIT
-; RUN: opt %loadNPMPolly -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops -polly-detect-reductions=false '-passes=polly-custom<scops>' -polly-print-detect -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s -check-prefix=NO-REDUCTION
+; RUN: opt %loadNPMPolly -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops -polly-process-unprofitable=false '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=PROFIT
+; RUN: opt %loadNPMPolly -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops -polly-detect-reductions=false '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s -check-prefix=NO-REDUCTION
 ;
 ; void f(int *A, int *C) {
 ; for (int i = 0; i < 1024; i++) {
diff --git a/polly/test/ScopInfo/NonAffine/non_affine_loop_used_later.ll b/polly/test/ScopInfo/NonAffine/non_affine_loop_used_later.ll
index cde2dc495d549..63ff354d7e5f7 100644
--- a/polly/test/ScopInfo/NonAffine/non_affine_loop_used_later.ll
+++ b/polly/test/ScopInfo/NonAffine/non_affine_loop_used_later.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly -polly-allow-nonaffine -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops '-passes=polly-custom<scops>' -polly-print-detect -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly -polly-allow-nonaffine -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops -polly-unprofitable-scalar-accs=true -polly-process-unprofitable=false '-passes=polly-custom<scops>' -polly-print-detect -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s --check-prefix=PROFIT
+; RUN: opt %loadNPMPolly -polly-allow-nonaffine -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-allow-nonaffine -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops -polly-unprofitable-scalar-accs=true -polly-process-unprofitable=false '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=PROFIT
 ;
 ; Verify that we over approximate the read access of A[j] in the last statement as j is
 ; computed in a non-affine loop we do not model.
diff --git a/polly/test/ScopInfo/NonAffine/non_affine_parametric_loop.ll b/polly/test/ScopInfo/NonAffine/non_affine_parametric_loop.ll
index ce4cc6189d45c..d33befe2c66e0 100644
--- a/polly/test/ScopInfo/NonAffine/non_affine_parametric_loop.ll
+++ b/polly/test/ScopInfo/NonAffine/non_affine_parametric_loop.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 
 ; void foo(long n, double A[], int INDEX[]) {
diff --git a/polly/test/ScopInfo/NonAffine/non_affine_region_guaranteed_non-entry.ll b/polly/test/ScopInfo/NonAffine/non_affine_region_guaranteed_non-entry.ll
index b46ce87a45e2d..77c2df48d6514 100644
--- a/polly/test/ScopInfo/NonAffine/non_affine_region_guaranteed_non-entry.ll
+++ b/polly/test/ScopInfo/NonAffine/non_affine_region_guaranteed_non-entry.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-allow-nonaffine-loops -polly-detect '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-allow-nonaffine-loops -polly-detect '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 
 ; The SCoP contains a loop with multiple exit blocks (BBs after leaving
 ; the loop). The current implementation of deriving their domain derives
diff --git a/polly/test/ScopInfo/NonAffine/whole-scop-non-affine-subregion-in-loop.ll b/polly/test/ScopInfo/NonAffine/whole-scop-non-affine-subregion-in-loop.ll
index 58e5ccd9b6e36..9ed340d1d304b 100644
--- a/polly/test/ScopInfo/NonAffine/whole-scop-non-affine-subregion-in-loop.ll
+++ b/polly/test/ScopInfo/NonAffine/whole-scop-non-affine-subregion-in-loop.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s
 ;
 ; Regression test that triggered a memory leak at some point (24947).
 ;
diff --git a/polly/test/ScopInfo/aliasing_conditional_alias_groups_1.ll b/polly/test/ScopInfo/aliasing_conditional_alias_groups_1.ll
index d94fc5f8a8823..cbd024ba7a392 100644
--- a/polly/test/ScopInfo/aliasing_conditional_alias_groups_1.ll
+++ b/polly/test/ScopInfo/aliasing_conditional_alias_groups_1.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; Check that there is no alias group because we either access A or B never both.
 ;
diff --git a/polly/test/ScopInfo/aliasing_conditional_alias_groups_2.ll b/polly/test/ScopInfo/aliasing_conditional_alias_groups_2.ll
index df7f75dd8d95e..3858d8a7bb1d6 100644
--- a/polly/test/ScopInfo/aliasing_conditional_alias_groups_2.ll
+++ b/polly/test/ScopInfo/aliasing_conditional_alias_groups_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; Check that we create two alias groups since the minimal/maximal accesses
 ; depend on %b.
diff --git a/polly/test/ScopInfo/aliasing_dead_access.ll b/polly/test/ScopInfo/aliasing_dead_access.ll
index 0ebc39c0e5a78..7baa3dce1f9db 100644
--- a/polly/test/ScopInfo/aliasing_dead_access.ll
+++ b/polly/test/ScopInfo/aliasing_dead_access.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-detect -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; Check that we do not create a SCoP if there is no statement executed.
 ;
diff --git a/polly/test/ScopInfo/aliasing_many_arrays_to_compare.ll b/polly/test/ScopInfo/aliasing_many_arrays_to_compare.ll
index 8e5bab661e18c..7265aab22a490 100644
--- a/polly/test/ScopInfo/aliasing_many_arrays_to_compare.ll
+++ b/polly/test/ScopInfo/aliasing_many_arrays_to_compare.ll
@@ -1,5 +1,8 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-detect -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s --check-prefix=FOUND
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-detect -polly-print-scops -disable-output -polly-rtc-max-arrays-per-group=3 < %s 2>&1 | FileCheck %s --check-prefix=IGNORED
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output \
+; RUN: < %s 2>&1 | FileCheck %s --check-prefix=FOUND
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output \
+; RUN: -polly-rtc-max-arrays-per-group=3 < %s 2>&1 | FileCheck %s \
+; RUN: --check-prefix=IGNORED
 ;
 ; FOUND: Function: foo
 ; IGNORED-NOT: Function: foo
diff --git a/polly/test/ScopInfo/aliasing_many_parameters_not_all_involved.ll b/polly/test/ScopInfo/aliasing_many_parameters_not_all_involved.ll
index aec6ea0bf1441..c7592bcb09fcf 100644
--- a/polly/test/ScopInfo/aliasing_many_parameters_not_all_involved.ll
+++ b/polly/test/ScopInfo/aliasing_many_parameters_not_all_involved.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly -polly-analysis-computeout=0 '-passes=polly-custom<scops>' -polly-print-scops -polly-rtc-max-parameters=8 -disable-output < %s | FileCheck %s --check-prefix=MAX8
-; RUN: opt %loadNPMPolly -polly-analysis-computeout=0 '-passes=polly-custom<scops>' -polly-print-scops -polly-rtc-max-parameters=7 -disable-output < %s | FileCheck %s --check-prefix=MAX7
+; RUN: opt %loadNPMPolly -polly-analysis-computeout=0 '-passes=print<polly-function-scops>' -polly-rtc-max-parameters=8 -disable-output < %s | FileCheck %s --check-prefix=MAX8
+; RUN: opt %loadNPMPolly -polly-analysis-computeout=0 '-passes=print<polly-function-scops>' -polly-rtc-max-parameters=7 -disable-output < %s | FileCheck %s --check-prefix=MAX7
 ;
 ; Check that we allow this SCoP even though it has 10 parameters involved in possibly aliasing accesses.
 ; However, only 7 are involved in accesses through B, 8 through C and none in accesses through A.
diff --git a/polly/test/ScopInfo/aliasing_many_read_only_acesses.ll b/polly/test/ScopInfo/aliasing_many_read_only_acesses.ll
index a7dbe0baeae5d..d66a10bc511b1 100644
--- a/polly/test/ScopInfo/aliasing_many_read_only_acesses.ll
+++ b/polly/test/ScopInfo/aliasing_many_read_only_acesses.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; CHECK: Assumed Context:
 ; CHECK-NEXT: { : }
diff --git a/polly/test/ScopInfo/aliasing_multiple_alias_groups.ll b/polly/test/ScopInfo/aliasing_multiple_alias_groups.ll
index db54a1687b4d5..9943802ec8595 100644
--- a/polly/test/ScopInfo/aliasing_multiple_alias_groups.ll
+++ b/polly/test/ScopInfo/aliasing_multiple_alias_groups.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output -aa-pipeline= < %s 2>&1 | FileCheck %s --check-prefix=NOAA
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output -aa-pipeline=tbaa < %s 2>&1 | FileCheck %s --check-prefix=TBAA
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output -aa-pipeline= < %s 2>&1 | FileCheck %s --check-prefix=NOAA
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output -aa-pipeline=tbaa < %s 2>&1 | FileCheck %s --check-prefix=TBAA
 ;
 ; void jd(int *Int0, int *Int1, float *Float0, float *Float1) {
 ; for (int i = 0; i < 1024; i++) {
diff --git a/polly/test/ScopInfo/aliasing_with_non_affine_access.ll b/polly/test/ScopInfo/aliasing_with_non_affine_access.ll
index 0001b8adb41e1..900d5d40d96f5 100644
--- a/polly/test/ScopInfo/aliasing_with_non_affine_access.ll
+++ b/polly/test/ScopInfo/aliasing_with_non_affine_access.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -polly-process-unprofitable -polly-allow-nonaffine -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -polly-process-unprofitable -polly-allow-nonaffine -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; @test1
 ; Make sure we generate the correct aliasing check for a fixed-size memset operation.
diff --git a/polly/test/ScopInfo/allow-all-parameters-dereferencable.ll b/polly/test/ScopInfo/allow-all-parameters-dereferencable.ll
index 93253b7e65d4a..70c3c56fb3112 100644
--- a/polly/test/ScopInfo/allow-all-parameters-dereferencable.ll
+++ b/polly/test/ScopInfo/allow-all-parameters-dereferencable.ll
@@ -1,9 +1,14 @@
-; RUN: opt %loadNPMPolly -disable-output -polly-invariant-load-hoisting -polly-allow-dereference-of-all-function-parameters '-passes=polly-custom<scops>' -polly-print-scops < %s 2>&1 | FileCheck %s --check-prefix=SCOP
+; RUN: opt %loadNPMPolly -disable-output -polly-invariant-load-hoisting \
+; RUN: -polly-allow-dereference-of-all-function-parameters \
+; RUN: '-passes=print<polly-function-scops>' < %s 2>&1 | FileCheck %s --check-prefix=SCOP
 
-; RUN: opt %loadNPMPolly -S -polly-invariant-load-hoisting '-passes=polly<no-default-opts>' < %s 2>&1 | FileCheck %s --check-prefix=CODE-RTC
+; RUN: opt %loadNPMPolly -S -polly-invariant-load-hoisting \
+; RUN: -passes=polly-codegen < %s 2>&1 | FileCheck %s --check-prefix=CODE-RTC
 
-; RUN: opt %loadNPMPolly -S -polly-invariant-load-hoisting -polly-allow-dereference-of-all-function-parameters '-passes=polly<no-default-opts>' < %s 2>&1 | FileCheck %s --check-prefix=CODE
+; RUN: opt %loadNPMPolly -S -polly-invariant-load-hoisting \
+; RUN: -polly-allow-dereference-of-all-function-parameters \
+; RUN: -passes=polly-codegen < %s 2>&1 | FileCheck %s --check-prefix=CODE
 
 ; SCOP: Function: hoge
 ; SCOP-NEXT: Region: %bb15---%bb37
 
diff --git a/polly/test/ScopInfo/assume_gep_bounds.ll b/polly/test/ScopInfo/assume_gep_bounds.ll
index 994d49e5b887f..bd14e3868d525 100644
--- a/polly/test/ScopInfo/assume_gep_bounds.ll
+++ b/polly/test/ScopInfo/assume_gep_bounds.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 
 ; void foo(float A[][20][30], long n, long m, long p) {
 ; for (long i = 0; i < n; i++)
diff --git a/polly/test/ScopInfo/assume_gep_bounds_2.ll b/polly/test/ScopInfo/assume_gep_bounds_2.ll
index be43be598bd3d..7a8c1870abe25 100644
--- a/polly/test/ScopInfo/assume_gep_bounds_2.ll
+++ b/polly/test/ScopInfo/assume_gep_bounds_2.ll
@@ -1,4 +1,5 @@
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=polly-custom<scops>' -polly-print-scops -disable-output -polly-precise-inbounds < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 \
+; RUN: -polly-precise-inbounds | FileCheck %s
 ;
 ; void foo(float A[restrict][20], float B[restrict][20], long n, long m,
 ; long p) {
diff --git a/polly/test/ScopInfo/assume_gep_bounds_many.ll b/polly/test/ScopInfo/assume_gep_bounds_many.ll
index cfd9008741c3a..01fc12cd7f108 100644
--- a/polly/test/ScopInfo/assume_gep_bounds_many.ll
+++ b/polly/test/ScopInfo/assume_gep_bounds_many.ll
@@ -1,4 +1,5 @@
-; RUN: opt %loadNPMPolly -disable-output '-passes=polly-custom<scops>' -polly-print-scops -polly-ignore-aliasing < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -disable-output '-passes=print<polly-function-scops>' -polly-ignore-aliasing \
+; RUN: < %s 2>&1 | FileCheck %s
 
 ; CHECK: Assumed Context:
 ; CHECK-NEXT: [n1_a, n1_b, n1_c, n1_d, n2_a, n2_b, n2_c, n2_d, n3_a, n3_b, n3_c, n3_d, n4_a, n4_b, n4_c, n4_d, n5_a, n5_b, n5_c, n5_d, n6_a, n6_b, n6_c, n6_d, n7_a, n7_b, n7_c, n7_d, n8_a, n8_b, n8_c, n8_d, n9_a, n9_b, n9_c, n9_d, p1_b, p1_c, p1_d, p2_b, p2_c, p2_d, p3_b, p3_c, p3_d, p4_b, p4_c, p4_d, p5_b, p5_c, p5_d, p6_b, p6_c, p6_d, p7_b, p7_c, p7_d, p8_b, p8_c, p8_d, p9_b, p9_c, p9_d] -> { : p1_b >= n1_b and p1_c >= n1_c and p1_d >= n1_d and p2_b >= n2_b and p2_c >= n2_c and p2_d >= n2_d and p3_b >= n3_b and p3_c >= n3_c and p3_d >= n3_d and p4_b >= n4_b and p4_c >= n4_c and p4_d >= n4_d and p5_b >= n5_b and p5_c >= n5_c and p5_d >= n5_d and p6_b >= n6_b and p6_c >= n6_c and p6_d >= n6_d and p7_b >= n7_b and p7_c >= n7_c and p7_d >= n7_d and p8_b >= n8_b and p8_c >= n8_c and p8_d >= n8_d and p9_b >= n9_b and p9_c >= n9_c and p9_d >= n9_d }
diff --git a/polly/test/ScopInfo/avoid_new_parameters_from_geps.ll b/polly/test/ScopInfo/avoid_new_parameters_from_geps.ll
index b3aa7686d3010..3fb7a1329c745 100644
--- a/polly/test/ScopInfo/avoid_new_parameters_from_geps.ll
+++ b/polly/test/ScopInfo/avoid_new_parameters_from_geps.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; Check that we do no introduce a parameter here that is actually not needed.
 ;
diff --git a/polly/test/ScopInfo/bool-addrec.ll b/polly/test/ScopInfo/bool-addrec.ll
index 01c6d52c30f76..81fcade08f65a 100644
--- a/polly/test/ScopInfo/bool-addrec.ll
+++ b/polly/test/ScopInfo/bool-addrec.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -disable-output '-passes=polly-custom<ast>' -polly-print-ast -polly-process-unprofitable < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -disable-output '-passes=print<polly-ast>' -polly-process-unprofitable < %s 2>&1 | FileCheck %s
 
 ; CHECK: for (int c0 = 0; c0 <= 19999; c0 += 1) {
 ; CHECK-NEXT: if (c0 % 2 == 0)
diff --git a/polly/test/ScopInfo/bounded_loop_assumptions.ll b/polly/test/ScopInfo/bounded_loop_assumptions.ll
index 21ba391f4fc1a..5628092de7765 100644
--- a/polly/test/ScopInfo/bounded_loop_assumptions.ll
+++ b/polly/test/ScopInfo/bounded_loop_assumptions.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; The assumed context is tricky here as the equality test for the inner loop
 ; allows an "unbounded" loop trip count. We assume that does not happen, thus
diff --git a/polly/test/ScopInfo/branch-references-loop-scev-with-unknown-iterations-2.ll b/polly/test/ScopInfo/branch-references-loop-scev-with-unknown-iterations-2.ll
index d25a8e666b525..83743e4e4ecc7 100644
--- a/polly/test/ScopInfo/branch-references-loop-scev-with-unknown-iterations-2.ll
+++ b/polly/test/ScopInfo/branch-references-loop-scev-with-unknown-iterations-2.ll
@@ -1,6 +1,8 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s -check-prefix=DETECT
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | \
+; RUN: FileCheck %s -check-prefix=DETECT
 
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s -check-prefix=SCOP
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | \
+; RUN: FileCheck %s -check-prefix=SCOP
 
 ; DETECT: Valid Region for Scop: loop => barrier
 ; DETECT-NEXT: Valid Region for Scop: branch => end
diff --git a/polly/test/ScopInfo/branch-references-loop-scev-with-unknown-iterations-3.ll b/polly/test/ScopInfo/branch-references-loop-scev-with-unknown-iterations-3.ll
index 91aa96e0f3501..9685ba37a49a1 100644
--- a/polly/test/ScopInfo/branch-references-loop-scev-with-unknown-iterations-3.ll
+++ b/polly/test/ScopInfo/branch-references-loop-scev-with-unknown-iterations-3.ll
@@ -1,5 +1,8 @@
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s -check-prefix=NONAFFINE
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<scops>' -polly-print-scops -disable-output -polly-allow-nonaffine-branches=false < %s 2>&1 | FileCheck %s -check-prefix=NO-NONEAFFINE
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | \
+; RUN: FileCheck %s -check-prefix=NONAFFINE
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -disable-output \
+; RUN: -polly-allow-nonaffine-branches=false < %s 2>&1 | \
+; RUN: FileCheck %s -check-prefix=NO-NONEAFFINE
 
 ; NONAFFINE: Statements {
 ; NONAFFINE-NEXT: Stmt_loop
diff --git a/polly/test/ScopInfo/branch-references-loop-scev-with-unknown-iterations.ll b/polly/test/ScopInfo/branch-references-loop-scev-with-unknown-iterations.ll
index 22a60c764eb4d..f41e6500fb30a 100644
--- a/polly/test/ScopInfo/branch-references-loop-scev-with-unknown-iterations.ll
+++ b/polly/test/ScopInfo/branch-references-loop-scev-with-unknown-iterations.ll
@@ -1,5 +1,8 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-detect -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s -check-prefix=NONAFFINE
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-detect -polly-print-scops -disable-output -polly-allow-nonaffine-branches=false < %s 2>&1 | FileCheck %s -check-prefix=NO-NONEAFFINE
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 | \
+; RUN: FileCheck %s -check-prefix=NONAFFINE
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output \
+; RUN: -polly-allow-nonaffine-branches=false < %s 2>&1 | \
+; RUN: FileCheck %s -check-prefix=NO-NONEAFFINE
 
 
 ; NONAFFINE-NOT: Statements
diff --git a/polly/test/ScopInfo/bug_2010_10_22.ll b/polly/test/ScopInfo/bug_2010_10_22.ll
index 1d248891dfd09..71e7051922b53 100644
--- a/polly/test/ScopInfo/bug_2010_10_22.ll
+++ b/polly/test/ScopInfo/bug_2010_10_22.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
 
diff --git a/polly/test/ScopInfo/bug_2011_1_5.ll b/polly/test/ScopInfo/bug_2011_1_5.ll
index 7c76c3eaa565a..f4a24e06f46ae 100644
--- a/polly/test/ScopInfo/bug_2011_1_5.ll
+++ b/polly/test/ScopInfo/bug_2011_1_5.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s
 
 ; Bug description: Alias Analysis thinks IntToPtrInst aliases with alloca instructions created by IndependentBlocks Pass.
 ; This will trigger the assertion when we are verifying the SCoP after IndependentBlocks.
diff --git a/polly/test/ScopInfo/bug_scev_not_fully_eval.ll b/polly/test/ScopInfo/bug_scev_not_fully_eval.ll
index 6e1ef2339a81d..ed6bbafdac1f0 100644
--- a/polly/test/ScopInfo/bug_scev_not_fully_eval.ll
+++ b/polly/test/ScopInfo/bug_scev_not_fully_eval.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | not FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | not FileCheck %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
 
 @edge.8265 = external global [72 x i32], align 32 ; <ptr> [#uses=1]
diff --git a/polly/test/ScopInfo/cfg_consequences.ll b/polly/test/ScopInfo/cfg_consequences.ll
index 2b702e235ca6c..9161d3db4167a 100644
--- a/polly/test/ScopInfo/cfg_consequences.ll
+++ b/polly/test/ScopInfo/cfg_consequences.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; void consequences(int *A, int bool_cond, int lhs, int rhs) {
 ;
diff --git a/polly/test/ScopInfo/complex-branch-structure.ll b/polly/test/ScopInfo/complex-branch-structure.ll
index f48089afb93b9..de79c2226e68d 100644
--- a/polly/test/ScopInfo/complex-branch-structure.ll
+++ b/polly/test/ScopInfo/complex-branch-structure.ll
@@ -1,4 +1,5 @@
-; RUN: opt %loadNPMPolly -pass-remarks-analysis=polly-scops '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -pass-remarks-analysis="polly-scops" '-passes=print<polly-function-scops>' \
+; RUN: -disable-output < %s 2>&1 | FileCheck %s
 
 ; We build a scop of the following form to check that the domain construction
 ; does not take a huge amount of time, but that we instead just bail out.
diff --git a/polly/test/ScopInfo/complex-condition.ll b/polly/test/ScopInfo/complex-condition.ll index 9164959c1f6dc..c3b8d2bb0ef88 100644 --- a/polly/test/ScopInfo/complex-condition.ll +++ b/polly/test/ScopInfo/complex-condition.ll @@ -1,4 +1,6 @@ -; RUN: opt %loadNPMPolly -pass-remarks-analysis=polly-scops '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -pass-remarks-analysis="polly-scops" '-passes=print<polly-function-scops>' \ +; RUN: -polly-invariant-load-hoisting=true \ +; RUN: -disable-output < %s 2>&1 | FileCheck %s ; ; CHECK: Low complexity assumption: { : false } ; diff --git a/polly/test/ScopInfo/complex-expression.ll b/polly/test/ScopInfo/complex-expression.ll index 456edb04e0c2b..4a2a1d2a64a6d 100644 --- a/polly/test/ScopInfo/complex-expression.ll +++ b/polly/test/ScopInfo/complex-expression.ll @@ -1,4 +1,6 @@ -; RUN: opt %loadNPMPolly -pass-remarks-analysis=polly-scops '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -pass-remarks-analysis="polly-scops" '-passes=print<polly-function-scops>' \ +; RUN: -polly-invariant-load-hoisting=true \ +; RUN: -disable-output < %s 2>&1 | FileCheck %s ; ; This test case has an SCEVSMax expression with a very high arity. The ; piecewise affine function we would create for it would have a huge amount of diff --git a/polly/test/ScopInfo/complex-loop-nesting.ll b/polly/test/ScopInfo/complex-loop-nesting.ll index 4ffd8689f1a4a..36cb078f19fff 100644 --- a/polly/test/ScopInfo/complex-loop-nesting.ll +++ b/polly/test/ScopInfo/complex-loop-nesting.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s ; ; The SCoP contains a loop with multiple exit blocks (BBs after leaving ; the loop). The current implementation of deriving their domain derives diff --git a/polly/test/ScopInfo/complex-successor-structure-2.ll b/polly/test/ScopInfo/complex-successor-structure-2.ll index 32425d7598bc9..f4a78bf753853 100644 --- a/polly/test/ScopInfo/complex-successor-structure-2.ll +++ b/polly/test/ScopInfo/complex-successor-structure-2.ll @@ -1,4 +1,6 @@ -; RUN: opt %loadNPMPolly -pass-remarks-analysis=polly-scops '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -pass-remarks-analysis="polly-scops" '-passes=print<polly-function-scops>' \ +; RUN: -polly-invariant-load-hoisting=true \ +; RUN: -disable-output < %s 2>&1 | FileCheck %s ; We build a scop for the region for.body->B13. The CFG is of the following ; form and the branch conditions are build from "smax" SCEVs. 
However, in diff --git a/polly/test/ScopInfo/complex-successor-structure-3.ll b/polly/test/ScopInfo/complex-successor-structure-3.ll index c01eca534bcf1..6da1fe3a8b9f3 100644 --- a/polly/test/ScopInfo/complex-successor-structure-3.ll +++ b/polly/test/ScopInfo/complex-successor-structure-3.ll @@ -1,4 +1,5 @@ -; RUN: opt %loadNPMPolly -disable-output '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -disable-output '-passes=print<polly-function-scops>' \ +; RUN: -polly-invariant-load-hoisting=true < %s 2>&1 | FileCheck %s ; ; Check that propagation of domains from A(X) to A(X+1) will keep the ; domains small and concise. diff --git a/polly/test/ScopInfo/complex-successor-structure.ll b/polly/test/ScopInfo/complex-successor-structure.ll index 1b39f4cf192eb..6c87ba3e98505 100644 --- a/polly/test/ScopInfo/complex-successor-structure.ll +++ b/polly/test/ScopInfo/complex-successor-structure.ll @@ -1,4 +1,6 @@ -; RUN: opt %loadNPMPolly -pass-remarks-analysis=polly-scops '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -pass-remarks-analysis="polly-scops" '-passes=print<polly-function-scops>' \ +; RUN: -polly-invariant-load-hoisting=true \ +; RUN: -disable-output < %s 2>&1 | FileCheck %s ; We build a scop from the region for.body->B13. The CFG is of the ; following form. The test checks that the condition construction does not take diff --git a/polly/test/ScopInfo/complex_domain_binary_condition.ll b/polly/test/ScopInfo/complex_domain_binary_condition.ll index 42a114eaa6ec1..6e28c9dfee06a 100644 --- a/polly/test/ScopInfo/complex_domain_binary_condition.ll +++ b/polly/test/ScopInfo/complex_domain_binary_condition.ll @@ -1,4 +1,5 @@ -; RUN: opt %loadNPMPolly -pass-remarks-analysis=polly-scops '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -pass-remarks-analysis="polly-scops" '-passes=print<polly-function-scops>' \ +; RUN: -disable-output < %s 2>&1 | FileCheck %s ; ; CHECK: Low complexity assumption: { : false } ; diff --git a/polly/test/ScopInfo/complex_execution_context.ll b/polly/test/ScopInfo/complex_execution_context.ll index 9896fba8904b8..9880a1dd67d19 100644 --- a/polly/test/ScopInfo/complex_execution_context.ll +++ b/polly/test/ScopInfo/complex_execution_context.ll @@ -1,4 +1,6 @@ -; RUN: opt %loadNPMPolly -pass-remarks-analysis=polly-scops '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -pass-remarks-analysis="polly-scops" '-passes=print<polly-function-scops>' \ +; RUN: -polly-invariant-load-hoisting=true \ +; RUN: -disable-output < %s 2>&1 | FileCheck %s ; ; CHECK: Low complexity assumption: ; diff --git a/polly/test/ScopInfo/cond_constant_in_loop.ll b/polly/test/ScopInfo/cond_constant_in_loop.ll index ecc2767fd6ecd..552fddc6ff08c 100644 --- a/polly/test/ScopInfo/cond_constant_in_loop.ll +++ b/polly/test/ScopInfo/cond_constant_in_loop.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s ;void f(long a[], long N, long M) { ; long i, j, k; diff --git a/polly/test/ScopInfo/cond_in_loop.ll b/polly/test/ScopInfo/cond_in_loop.ll index 
0f31904133719..c06dcd955bac1 100644 --- a/polly/test/ScopInfo/cond_in_loop.ll +++ b/polly/test/ScopInfo/cond_in_loop.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s ;void f(long a[], long N, long M) { ; long i, j, k; diff --git a/polly/test/ScopInfo/condition-after-error-block-2.ll b/polly/test/ScopInfo/condition-after-error-block-2.ll index 257b2ede236d9..8c4b2170ad69b 100644 --- a/polly/test/ScopInfo/condition-after-error-block-2.ll +++ b/polly/test/ScopInfo/condition-after-error-block-2.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s ; Verify that we do not allow PHI nodes such as %phi, if they reference an error ; block and are used by anything else than a terminator instruction. diff --git a/polly/test/ScopInfo/condition-after-error-block-before-scop.ll b/polly/test/ScopInfo/condition-after-error-block-before-scop.ll index d86b48ed24963..d5069da916fa1 100644 --- a/polly/test/ScopInfo/condition-after-error-block-before-scop.ll +++ b/polly/test/ScopInfo/condition-after-error-block-before-scop.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s 2>&1 | FileCheck %s target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" diff --git a/polly/test/ScopInfo/condtion-after-error-block.ll b/polly/test/ScopInfo/condtion-after-error-block.ll index 8ad98b4a4a78e..d9de4fc40a208 100644 --- a/polly/test/ScopInfo/condtion-after-error-block.ll +++ b/polly/test/ScopInfo/condtion-after-error-block.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s ; Verify that we allow scops containing uniform branch conditions, where all ; but one incoming block comes from an error condition. 
diff --git a/polly/test/ScopInfo/const_srem_sdiv.ll b/polly/test/ScopInfo/const_srem_sdiv.ll index b50c4bd910dda..b4c2f119fe053 100644 --- a/polly/test/ScopInfo/const_srem_sdiv.ll +++ b/polly/test/ScopInfo/const_srem_sdiv.ll @@ -1,4 +1,5 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<scops>' -polly-print-scops -disable-output -polly-invariant-load-hoisting=true < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -disable-output \ +; RUN: -polly-invariant-load-hoisting=true < %s 2>&1 | FileCheck %s ; ; See http://research.microsoft.com/pubs/151917/divmodnote-letter.pdf ; diff --git a/polly/test/ScopInfo/constant-non-integer-branch-condition.ll b/polly/test/ScopInfo/constant-non-integer-branch-condition.ll index f09f82f32c93a..86dd94e3371b2 100644 --- a/polly/test/ScopInfo/constant-non-integer-branch-condition.ll +++ b/polly/test/ScopInfo/constant-non-integer-branch-condition.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s ; ; At some point this caused a problem in the domain generation as we ; assumed any constant branch condition to be valid. However, only constant diff --git a/polly/test/ScopInfo/constant_factor_in_parameter.ll b/polly/test/ScopInfo/constant_factor_in_parameter.ll index 26c73bd72271b..b58d413e074e7 100644 --- a/polly/test/ScopInfo/constant_factor_in_parameter.ll +++ b/polly/test/ScopInfo/constant_factor_in_parameter.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly -disable-output '-passes=polly-custom<scops>' -polly-print-scops < %s 2>&1 | FileCheck %s -; RUN: opt %loadNPMPolly -disable-output '-passes=polly-custom<scops>' -polly-print-scops < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -disable-output '-passes=print<polly-function-scops>' < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -disable-output '-passes=print<polly-function-scops>' < %s 2>&1 | FileCheck %s ; ; Check that the constant part of the N * M * 4 expression is not part of the ; parameter but explicit in the access function. 
This can avoid existentially diff --git a/polly/test/ScopInfo/constant_functions_outside_scop_as_unknown.ll b/polly/test/ScopInfo/constant_functions_outside_scop_as_unknown.ll index 762132f9edd78..62e6cd4641de1 100644 --- a/polly/test/ScopInfo/constant_functions_outside_scop_as_unknown.ll +++ b/polly/test/ScopInfo/constant_functions_outside_scop_as_unknown.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-process-unprofitable '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-process-unprofitable '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64" diff --git a/polly/test/ScopInfo/constant_start_integer.ll b/polly/test/ScopInfo/constant_start_integer.ll index 6d17288b28227..8991f8250f0b7 100644 --- a/polly/test/ScopInfo/constant_start_integer.ll +++ b/polly/test/ScopInfo/constant_start_integer.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" ; void foo(float *input) { diff --git a/polly/test/ScopInfo/debug_call.ll b/polly/test/ScopInfo/debug_call.ll index 63c1baca5accc..a6761ecebe6a7 100644 --- a/polly/test/ScopInfo/debug_call.ll +++ b/polly/test/ScopInfo/debug_call.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-debug-func=dbg_printf '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly -polly-debug-func=dbg_printf '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s -match-full-lines ; ; Check that the call to dbg_printf is accepted as a debug-function. 
; diff --git a/polly/test/ScopInfo/delinearize-together-all-data-refs.ll b/polly/test/ScopInfo/delinearize-together-all-data-refs.ll index 7126fb95cd00c..676c8a27e5749 100644 --- a/polly/test/ScopInfo/delinearize-together-all-data-refs.ll +++ b/polly/test/ScopInfo/delinearize-together-all-data-refs.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s ; void foo(long n, long m, long o, double A[n][m][o]) { ; for (long i = 0; i < n-3; i++) diff --git a/polly/test/ScopInfo/div_by_zero.ll b/polly/test/ScopInfo/div_by_zero.ll index 62a13de7ceac0..aecd16833b84e 100644 --- a/polly/test/ScopInfo/div_by_zero.ll +++ b/polly/test/ScopInfo/div_by_zero.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-detect -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s ; ; void f(int *A, int N) { ; for (int i = 0; i < N; i++) diff --git a/polly/test/ScopInfo/do-not-model-error-block-accesses.ll b/polly/test/ScopInfo/do-not-model-error-block-accesses.ll index 333175b417ade..a3ca59563ab1f 100644 --- a/polly/test/ScopInfo/do-not-model-error-block-accesses.ll +++ b/polly/test/ScopInfo/do-not-model-error-block-accesses.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s ; Check that we do not crash on this input. Earlier this indeed crashed as ; we tried to model the access functions in an error block. diff --git a/polly/test/ScopInfo/eager-binary-and-or-conditions.ll b/polly/test/ScopInfo/eager-binary-and-or-conditions.ll index b111851939d06..a988b3f8c2b01 100644 --- a/polly/test/ScopInfo/eager-binary-and-or-conditions.ll +++ b/polly/test/ScopInfo/eager-binary-and-or-conditions.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s -; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -disable-output < %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -passes=polly-codegen -disable-output < %s ; ; void or(float *A, long n, long m) { ; for (long i = 0; i < 100; i++) { diff --git a/polly/test/ScopInfo/early_exit_for_complex_domains.ll b/polly/test/ScopInfo/early_exit_for_complex_domains.ll index 3ee6ff7889c84..9a1edcbfb7796 100644 --- a/polly/test/ScopInfo/early_exit_for_complex_domains.ll +++ b/polly/test/ScopInfo/early_exit_for_complex_domains.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s ; ; Check we do not crash. 
; diff --git a/polly/test/ScopInfo/error-blocks-1.ll b/polly/test/ScopInfo/error-blocks-1.ll index 902ea15752980..047b095a95947 100644 --- a/polly/test/ScopInfo/error-blocks-1.ll +++ b/polly/test/ScopInfo/error-blocks-1.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s ; ; CHECK: Context: ; CHECK-NEXT: [N] -> { : -2147483648 <= N <= 2147483647 } diff --git a/polly/test/ScopInfo/error-blocks-2.ll b/polly/test/ScopInfo/error-blocks-2.ll index 613b00a1a9ba7..6fa12947540c0 100644 --- a/polly/test/ScopInfo/error-blocks-2.ll +++ b/polly/test/ScopInfo/error-blocks-2.ll @@ -1,4 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output -polly-invariant-load-hoisting=true < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output \ +; RUN: -polly-invariant-load-hoisting=true < %s 2>&1 | FileCheck %s ; ; CHECK: Invariant Accesses: { ; CHECK-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0] diff --git a/polly/test/ScopInfo/error-blocks-3.ll b/polly/test/ScopInfo/error-blocks-3.ll index 9521037888075..e7643601356db 100644 --- a/polly/test/ScopInfo/error-blocks-3.ll +++ b/polly/test/ScopInfo/error-blocks-3.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-detect-keep-going -polly-allow-nonaffine -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-detect-keep-going -polly-allow-nonaffine -disable-output < %s | FileCheck %s ; ; The instruction ; diff --git a/polly/test/ScopInfo/escaping_empty_scop.ll b/polly/test/ScopInfo/escaping_empty_scop.ll index d47b2865b4ee0..2efaef3fb99b8 100644 --- a/polly/test/ScopInfo/escaping_empty_scop.ll +++ b/polly/test/ScopInfo/escaping_empty_scop.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s ; ; void g(); ; int f(int *A) { diff --git a/polly/test/ScopInfo/exit-phi-1.ll b/polly/test/ScopInfo/exit-phi-1.ll index 21f13cf4f4e4d..cbd6c280e8caa 100644 --- a/polly/test/ScopInfo/exit-phi-1.ll +++ b/polly/test/ScopInfo/exit-phi-1.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly<no-default-opts>' -S < %s 2>&1 | FileCheck %s --check-prefix=CODEGEN +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -passes=polly-codegen -S < %s 2>&1 | FileCheck %s --check-prefix=CODEGEN ; ; Check for correct code generation of exit PHIs, even if the same PHI value ; is used again inside the the SCoP. 
diff --git a/polly/test/ScopInfo/exit-phi-2.ll b/polly/test/ScopInfo/exit-phi-2.ll index b8da9ab5b64f9..695c617b14c1f 100644 --- a/polly/test/ScopInfo/exit-phi-2.ll +++ b/polly/test/ScopInfo/exit-phi-2.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s ; ; Check that there is no MK_ExitPHI READ access. ; diff --git a/polly/test/ScopInfo/exit_phi_accesses-2.ll b/polly/test/ScopInfo/exit_phi_accesses-2.ll index 928b564c7cef5..b3b7cb1c65993 100644 --- a/polly/test/ScopInfo/exit_phi_accesses-2.ll +++ b/polly/test/ScopInfo/exit_phi_accesses-2.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s ; CHECK-LABEL: Function: foo ; diff --git a/polly/test/ScopInfo/exit_phi_accesses.ll b/polly/test/ScopInfo/exit_phi_accesses.ll index a54ca4a185ae2..77b038ec8e4af 100644 --- a/polly/test/ScopInfo/exit_phi_accesses.ll +++ b/polly/test/ScopInfo/exit_phi_accesses.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s ; Check that PHI nodes only create PHI access and nothing else (e.g. unnecessary ; SCALAR accesses). In this case, for a PHI in the exit node, hence there is no diff --git a/polly/test/ScopInfo/expensive-boundary-context.ll b/polly/test/ScopInfo/expensive-boundary-context.ll index c0d2dcd16289d..95212f83acdca 100644 --- a/polly/test/ScopInfo/expensive-boundary-context.ll +++ b/polly/test/ScopInfo/expensive-boundary-context.ll @@ -1,4 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-detect -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output \ +; RUN: < %s 2>&1 | FileCheck %s ; CHECK-NOT: Assumed Context: target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" diff --git a/polly/test/ScopInfo/extract_constant_factor_introduces_new_parameter.ll b/polly/test/ScopInfo/extract_constant_factor_introduces_new_parameter.ll index 2f446b630168a..5e833e7ae0f4f 100644 --- a/polly/test/ScopInfo/extract_constant_factor_introduces_new_parameter.ll +++ b/polly/test/ScopInfo/extract_constant_factor_introduces_new_parameter.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s +; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s ; CHECK: Valid Region for Scop: bb10 => bb16 diff --git a/polly/test/ScopInfo/full-function.ll b/polly/test/ScopInfo/full-function.ll index 20cb137181697..596c3d0af66a9 100644 --- a/polly/test/ScopInfo/full-function.ll +++ b/polly/test/ScopInfo/full-function.ll @@ -1,5 +1,7 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output 
-polly-detect-full-functions < %s 2>&1 | FileCheck %s -check-prefix=FULL -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s -check-prefix=WITHOUT-FULL +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output -polly-detect-full-functions < %s 2>&1 \ +; RUN: | FileCheck %s -check-prefix=FULL +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 \ +; RUN: | FileCheck %s -check-prefix=WITHOUT-FULL ; FULL: Region: %bb---FunctionExit ; FULL: Statements { diff --git a/polly/test/ScopInfo/granularity_same_name.ll b/polly/test/ScopInfo/granularity_same_name.ll index 638b09879ce39..17f75fbf8a979 100644 --- a/polly/test/ScopInfo/granularity_same_name.ll +++ b/polly/test/ScopInfo/granularity_same_name.ll @@ -1,7 +1,7 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-use-llvm-names=0 '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s -match-full-lines -check-prefix=IDX -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-use-llvm-names=1 '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s -match-full-lines -check-prefix=BB -; RUN: opt %loadNPMPolly -polly-stmt-granularity=scalar-indep -polly-use-llvm-names=0 '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s -match-full-lines -check-prefix=IDX -; RUN: opt %loadNPMPolly -polly-stmt-granularity=scalar-indep -polly-use-llvm-names=1 '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s -match-full-lines -check-prefix=BB +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-use-llvm-names=0 '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s -match-full-lines -check-prefix=IDX +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-use-llvm-names=1 '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s -match-full-lines -check-prefix=BB +; RUN: opt %loadNPMPolly -polly-stmt-granularity=scalar-indep -polly-use-llvm-names=0 '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s -match-full-lines -check-prefix=IDX +; RUN: opt %loadNPMPolly -polly-stmt-granularity=scalar-indep -polly-use-llvm-names=1 '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s -match-full-lines -check-prefix=BB ; ; Check that the statement has the same name, regardless of how the ; basic block is split into multiple statements. diff --git a/polly/test/ScopInfo/granularity_scalar-indep.ll b/polly/test/ScopInfo/granularity_scalar-indep.ll index f4d864d2c6543..5c4484f9d4579 100644 --- a/polly/test/ScopInfo/granularity_scalar-indep.ll +++ b/polly/test/ScopInfo/granularity_scalar-indep.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=scalar-indep -polly-print-instructions '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly -polly-stmt-granularity=scalar-indep -polly-print-instructions '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s -match-full-lines ; ; Split a block into two independent statements that share no scalar. 
; This case has the instructions of the two statements interleaved, such that diff --git a/polly/test/ScopInfo/granularity_scalar-indep_cross-referencing-phi1.ll b/polly/test/ScopInfo/granularity_scalar-indep_cross-referencing-phi1.ll index f2c37f6293d62..7ae0d961b38fb 100644 --- a/polly/test/ScopInfo/granularity_scalar-indep_cross-referencing-phi1.ll +++ b/polly/test/ScopInfo/granularity_scalar-indep_cross-referencing-phi1.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=scalar-indep -polly-print-instructions '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly -polly-stmt-granularity=scalar-indep -polly-print-instructions '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s -match-full-lines ; ; Two PHIs, cross-referencing each other. The PHI READs must be carried-out ; before the PHI WRITEs to ensure that the value when entering the block is diff --git a/polly/test/ScopInfo/granularity_scalar-indep_cross-referencing-phi2.ll b/polly/test/ScopInfo/granularity_scalar-indep_cross-referencing-phi2.ll index f7bd882da96e2..7839e51c163ae 100644 --- a/polly/test/ScopInfo/granularity_scalar-indep_cross-referencing-phi2.ll +++ b/polly/test/ScopInfo/granularity_scalar-indep_cross-referencing-phi2.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=scalar-indep -polly-print-instructions '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly -polly-stmt-granularity=scalar-indep -polly-print-instructions '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s -match-full-lines ; ; Two PHIs, cross-referencing each other. The PHI READs must be carried-out ; before the PHI WRITEs to ensure that the value when entering the block is diff --git a/polly/test/ScopInfo/granularity_scalar-indep_epilogue.ll b/polly/test/ScopInfo/granularity_scalar-indep_epilogue.ll index 80aa9fb6deb7c..8643e85e05593 100644 --- a/polly/test/ScopInfo/granularity_scalar-indep_epilogue.ll +++ b/polly/test/ScopInfo/granularity_scalar-indep_epilogue.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=scalar-indep -polly-print-instructions '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly -polly-stmt-granularity=scalar-indep -polly-print-instructions '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s -match-full-lines ; ; Split a block into two independent statements that share no scalar. ; This case has an independent statement just for PHI writes. 
diff --git a/polly/test/ScopInfo/granularity_scalar-indep_epilogue_last.ll b/polly/test/ScopInfo/granularity_scalar-indep_epilogue_last.ll index 66ef9fa9429e9..bc71cbe45cd98 100644 --- a/polly/test/ScopInfo/granularity_scalar-indep_epilogue_last.ll +++ b/polly/test/ScopInfo/granularity_scalar-indep_epilogue_last.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=scalar-indep -polly-print-instructions '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly -polly-stmt-granularity=scalar-indep -polly-print-instructions '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s -match-full-lines ; ; Check that the PHI Write of value that is defined in the same basic ; block is in the statement where it is defined. diff --git a/polly/test/ScopInfo/granularity_scalar-indep_noepilogue.ll b/polly/test/ScopInfo/granularity_scalar-indep_noepilogue.ll index 3837219e5d818..f3864bac519b9 100644 --- a/polly/test/ScopInfo/granularity_scalar-indep_noepilogue.ll +++ b/polly/test/ScopInfo/granularity_scalar-indep_noepilogue.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=scalar-indep -polly-print-instructions '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly -polly-stmt-granularity=scalar-indep -polly-print-instructions '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s -match-full-lines ; ; This case has no explicit epilogue for PHI writes because it would ; have a scalar dependency to the previous statement. diff --git a/polly/test/ScopInfo/granularity_scalar-indep_ordered-2.ll b/polly/test/ScopInfo/granularity_scalar-indep_ordered-2.ll index c43ad76d079d8..43101a8a0abfc 100644 --- a/polly/test/ScopInfo/granularity_scalar-indep_ordered-2.ll +++ b/polly/test/ScopInfo/granularity_scalar-indep_ordered-2.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=scalar-indep -polly-print-instructions '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly -polly-stmt-granularity=scalar-indep -polly-print-instructions '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s -match-full-lines ; ; This case should be split into two statements because {X[0], Y[0]} ; and {A[0], B[0]} do not intersect. diff --git a/polly/test/ScopInfo/granularity_scalar-indep_ordered.ll b/polly/test/ScopInfo/granularity_scalar-indep_ordered.ll index cfa7739d743f7..4974f7e9b28ca 100644 --- a/polly/test/ScopInfo/granularity_scalar-indep_ordered.ll +++ b/polly/test/ScopInfo/granularity_scalar-indep_ordered.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=scalar-indep -polly-print-instructions '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly -polly-stmt-granularity=scalar-indep -polly-print-instructions '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s -match-full-lines ; ; This case cannot be split into two statements because the order of ; loads and store would be violated. 
diff --git a/polly/test/ScopInfo/i1_params.ll b/polly/test/ScopInfo/i1_params.ll index cf5b533c02682..be3e287372017 100644 --- a/polly/test/ScopInfo/i1_params.ll +++ b/polly/test/ScopInfo/i1_params.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s ; ; Check that both a signed as well as an unsigned extended i1 parameter ; is represented correctly. diff --git a/polly/test/ScopInfo/infeasible-rtc.ll b/polly/test/ScopInfo/infeasible-rtc.ll index 9221ddf5fc910..7a0bfe0fa4d84 100644 --- a/polly/test/ScopInfo/infeasible-rtc.ll +++ b/polly/test/ScopInfo/infeasible-rtc.ll @@ -1,6 +1,8 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s -check-prefix=DETECT +; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 \ +; RUN: | FileCheck %s -check-prefix=DETECT -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s -check-prefix=SCOPS +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 \ +; RUN: | FileCheck %s -check-prefix=SCOPS target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" diff --git a/polly/test/ScopInfo/infeasible_invalid_context.ll b/polly/test/ScopInfo/infeasible_invalid_context.ll index 7ab6477460721..006901ab05b79 100644 --- a/polly/test/ScopInfo/infeasible_invalid_context.ll +++ b/polly/test/ScopInfo/infeasible_invalid_context.ll @@ -1,6 +1,8 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s -check-prefix=DETECT +; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 \ +; RUN: | FileCheck %s -check-prefix=DETECT -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-detect -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s -check-prefix=SCOPS +; RUN: opt %loadNPMPolly '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 \ +; RUN: | FileCheck %s -check-prefix=SCOPS ; DETECT: Valid Region for Scop: if.end116 => for.inc216 ; SCOPS-NOT: Statements diff --git a/polly/test/ScopInfo/int2ptr_ptr2int.ll b/polly/test/ScopInfo/int2ptr_ptr2int.ll index adefe794561c2..578015aeecdc5 100644 --- a/polly/test/ScopInfo/int2ptr_ptr2int.ll +++ b/polly/test/ScopInfo/int2ptr_ptr2int.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s -; RUN: opt %loadNPMPolly -S '-passes=polly<no-default-opts>' < %s 2>&1 | FileCheck %s --check-prefix=IR +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -S -passes=polly-codegen < %s 2>&1 | FileCheck %s --check-prefix=IR ; ; void f(long *A, long *ptr, long val) { ; for (long i = 0; i < 100; i++) { diff --git a/polly/test/ScopInfo/int2ptr_ptr2int_2.ll b/polly/test/ScopInfo/int2ptr_ptr2int_2.ll index a88fcdc0f9b12..627524c0327dd 100644 --- a/polly/test/ScopInfo/int2ptr_ptr2int_2.ll +++ b/polly/test/ScopInfo/int2ptr_ptr2int_2.ll @@ -1,5 +1,7 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | 
FileCheck %s -; RUN: opt %loadNPMPolly -S '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=true < %s 2>&1 | FileCheck %s --check-prefix=IR +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' \ +; RUN: -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -S -passes=polly-codegen \ +; RUN: -polly-invariant-load-hoisting=true < %s 2>&1 | FileCheck %s --check-prefix=IR ; ; void f(long *A, long *B, long *ptr, long val) { ; for (long i = 0; i < 100; i++) { diff --git a/polly/test/ScopInfo/integers.ll b/polly/test/ScopInfo/integers.ll index 5f89243be0e3b..4f6d1117e2bcc 100644 --- a/polly/test/ScopInfo/integers.ll +++ b/polly/test/ScopInfo/integers.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s ; Check that we correctly convert integers to isl values. diff --git a/polly/test/ScopInfo/inter-error-bb-dependence.ll b/polly/test/ScopInfo/inter-error-bb-dependence.ll index 0829f34be9791..761fcbbe3435e 100644 --- a/polly/test/ScopInfo/inter-error-bb-dependence.ll +++ b/polly/test/ScopInfo/inter-error-bb-dependence.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -pass-remarks-analysis=polly-scops '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 > /dev/null | FileCheck %s +; RUN: opt %loadNPMPolly -pass-remarks-analysis="polly-scops" '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 > /dev/null | FileCheck %s ; ; Error statements (%bb33) do not require their uses to be verified. ; In this case it uses %tmp32 from %bb31 which is not available because diff --git a/polly/test/ScopInfo/inter_bb_scalar_dep.ll b/polly/test/ScopInfo/inter_bb_scalar_dep.ll index f6406640dd2d8..7313618b082bc 100644 --- a/polly/test/ScopInfo/inter_bb_scalar_dep.ll +++ b/polly/test/ScopInfo/inter_bb_scalar_dep.ll @@ -1,4 +1,5 @@ -; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-function-scops>' \ +; RUN: -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s ; void f(long A[], int N, int *init_ptr) { ; long i, j; diff --git a/polly/test/ScopInfo/intra-non-affine-stmt-phi-node.ll b/polly/test/ScopInfo/intra-non-affine-stmt-phi-node.ll index 3150204cd9549..d2ed3c17fe9dd 100644 --- a/polly/test/ScopInfo/intra-non-affine-stmt-phi-node.ll +++ b/polly/test/ScopInfo/intra-non-affine-stmt-phi-node.ll @@ -1,4 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output \ +; RUN: < %s 2>&1 | FileCheck %s ; CHECK: Statements { ; CHECK-NEXT: Stmt_loop__TO__backedge diff --git a/polly/test/ScopInfo/intra_and_inter_bb_scalar_dep.ll b/polly/test/ScopInfo/intra_and_inter_bb_scalar_dep.ll index b0b63658caa55..b3286cd2a7240 100644 --- a/polly/test/ScopInfo/intra_and_inter_bb_scalar_dep.ll +++ b/polly/test/ScopInfo/intra_and_inter_bb_scalar_dep.ll @@ -1,4 +1,5 @@ -; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=polly-custom<scops>' -polly-print-scops -disable-output -polly-invariant-load-hoisting=true < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa 
'-passes=print<polly-function-scops>' -disable-output \ +; RUN: -polly-invariant-load-hoisting=true < %s 2>&1 | FileCheck %s ; void f(long A[], int N, int *init_ptr) { ; long i, j; diff --git a/polly/test/ScopInfo/intra_bb_scalar_dep.ll b/polly/test/ScopInfo/intra_bb_scalar_dep.ll index 0ef6b2d35106b..86855e7499a51 100644 --- a/polly/test/ScopInfo/intra_bb_scalar_dep.ll +++ b/polly/test/ScopInfo/intra_bb_scalar_dep.ll @@ -1,4 +1,5 @@ -; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=polly-custom<scops>' -polly-print-scops -disable-output -polly-invariant-load-hoisting=true < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-function-scops>' -disable-output \ +; RUN: -polly-invariant-load-hoisting=true < %s 2>&1 | FileCheck %s ; void f(long A[], int N, int *init_ptr) { ; long i, j; diff --git a/polly/test/ScopInfo/intrinsics.ll b/polly/test/ScopInfo/intrinsics.ll index e17d06f753a21..e6d9e733e35bf 100644 --- a/polly/test/ScopInfo/intrinsics.ll +++ b/polly/test/ScopInfo/intrinsics.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-print-instructions -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-print-instructions -disable-output < %s 2>&1 | FileCheck %s ; ; Verify that we remove the ignored intrinsics from the instruction list. ; diff --git a/polly/test/ScopInfo/invalid_add_rec_after_invariant_load_remapping.ll b/polly/test/ScopInfo/invalid_add_rec_after_invariant_load_remapping.ll index d3439d8d33662..723942668d8c2 100644 --- a/polly/test/ScopInfo/invalid_add_rec_after_invariant_load_remapping.ll +++ b/polly/test/ScopInfo/invalid_add_rec_after_invariant_load_remapping.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s ; ; This crashed at some point as we place %1 and %4 in the same equivalence class ; for invariant loads and when we remap SCEVs to use %4 instead of %1 AddRec SCEVs diff --git a/polly/test/ScopInfo/invalidate_iterator_during_MA_removal.ll b/polly/test/ScopInfo/invalidate_iterator_during_MA_removal.ll index ff5b0f601d03f..c493c22af32d9 100644 --- a/polly/test/ScopInfo/invalidate_iterator_during_MA_removal.ll +++ b/polly/test/ScopInfo/invalidate_iterator_during_MA_removal.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s ; ; Check that no invalidated iterator is accessed while elements from ; the list of MemoryAccesses are removed. diff --git a/polly/test/ScopInfo/invariant-load-instlist.ll b/polly/test/ScopInfo/invariant-load-instlist.ll index 1ec36e6d9d1b9..ecb80e4054c35 100644 --- a/polly/test/ScopInfo/invariant-load-instlist.ll +++ b/polly/test/ScopInfo/invariant-load-instlist.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s ; The load is a required invariant load and at the same time used in a store. ; Polly used to add two MemoryAccesses for it which caused an assertion to fail. 
diff --git a/polly/test/ScopInfo/invariant-loads-leave-read-only-statements.ll b/polly/test/ScopInfo/invariant-loads-leave-read-only-statements.ll index 2d14287d4df44..89eac6ce69a11 100644 --- a/polly/test/ScopInfo/invariant-loads-leave-read-only-statements.ll +++ b/polly/test/ScopInfo/invariant-loads-leave-read-only-statements.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s -; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=true -disable-output < %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true -disable-output < %s ; CHECK: Statements { ; CHECK-NEXT: Stmt_L_4 diff --git a/polly/test/ScopInfo/invariant_load.ll b/polly/test/ScopInfo/invariant_load.ll index 8974b7f7fb8cb..9dc064276c40f 100644 --- a/polly/test/ScopInfo/invariant_load.ll +++ b/polly/test/ScopInfo/invariant_load.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s ; ; CHECK: Invariant Accesses: ; CHECK-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0] diff --git a/polly/test/ScopInfo/invariant_load_access_classes_different_base_type.ll b/polly/test/ScopInfo/invariant_load_access_classes_different_base_type.ll index 7b5a7591813a6..40aa3098683b3 100644 --- a/polly/test/ScopInfo/invariant_load_access_classes_different_base_type.ll +++ b/polly/test/ScopInfo/invariant_load_access_classes_different_base_type.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s -; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=true -S < %s 2>&1 | FileCheck %s --check-prefix=CODEGEN +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true -S < %s 2>&1 | FileCheck %s --check-prefix=CODEGEN ; ; struct { ; int a; diff --git a/polly/test/ScopInfo/invariant_load_access_classes_different_base_type_escaping.ll b/polly/test/ScopInfo/invariant_load_access_classes_different_base_type_escaping.ll index 0c2f57dfcb1c3..287676024079c 100644 --- a/polly/test/ScopInfo/invariant_load_access_classes_different_base_type_escaping.ll +++ b/polly/test/ScopInfo/invariant_load_access_classes_different_base_type_escaping.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s -; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=true -S < %s 2>&1 | FileCheck %s --check-prefix=CODEGEN +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true -S < %s 2>&1 | FileCheck %s --check-prefix=CODEGEN ; ; struct { ; int a; diff --git 
a/polly/test/ScopInfo/invariant_load_access_classes_different_base_type_same_pointer.ll b/polly/test/ScopInfo/invariant_load_access_classes_different_base_type_same_pointer.ll index 865bd789db6fb..cb745b4920b82 100644 --- a/polly/test/ScopInfo/invariant_load_access_classes_different_base_type_same_pointer.ll +++ b/polly/test/ScopInfo/invariant_load_access_classes_different_base_type_same_pointer.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s -; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=true -S < %s 2>&1 | FileCheck %s --check-prefix=CODEGEN +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true -S < %s 2>&1 | FileCheck %s --check-prefix=CODEGEN ; ; int U; ; void f(int *A) { diff --git a/polly/test/ScopInfo/invariant_load_access_classes_different_base_type_same_pointer_escaping.ll b/polly/test/ScopInfo/invariant_load_access_classes_different_base_type_same_pointer_escaping.ll index f63fe9cc1f7c6..fa5429d4803a8 100644 --- a/polly/test/ScopInfo/invariant_load_access_classes_different_base_type_same_pointer_escaping.ll +++ b/polly/test/ScopInfo/invariant_load_access_classes_different_base_type_same_pointer_escaping.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s -; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=true -S < %s 2>&1 | FileCheck %s --check-prefix=CODEGEN +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true -S < %s 2>&1 | FileCheck %s --check-prefix=CODEGEN ; ; int U; ; int f(int *A) { diff --git a/polly/test/ScopInfo/invariant_load_addrec_sum.ll b/polly/test/ScopInfo/invariant_load_addrec_sum.ll index e70aa80ae6009..2e639f7d5e331 100644 --- a/polly/test/ScopInfo/invariant_load_addrec_sum.ll +++ b/polly/test/ScopInfo/invariant_load_addrec_sum.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -polly-ignore-aliasing -polly-process-unprofitable -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -polly-ignore-aliasing -polly-process-unprofitable -disable-output < %s 2>&1 | FileCheck %s ; ; CHECK: Region: %entry.split---%if.end ; CHECK: Invariant Accesses: { diff --git a/polly/test/ScopInfo/invariant_load_base_pointer.ll b/polly/test/ScopInfo/invariant_load_base_pointer.ll index 1176d1ca9db85..f2539af97a0b7 100644 --- a/polly/test/ScopInfo/invariant_load_base_pointer.ll +++ b/polly/test/ScopInfo/invariant_load_base_pointer.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -polly-ignore-aliasing -polly-process-unprofitable -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -polly-ignore-aliasing -polly-process-unprofitable -disable-output < %s 2>&1 | FileCheck %s ; ; CHECK: Invariant Accesses: 
; CHECK-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0] diff --git a/polly/test/ScopInfo/invariant_load_base_pointer_conditional.ll b/polly/test/ScopInfo/invariant_load_base_pointer_conditional.ll index 81fd3b9559f43..f854b1f48ea92 100644 --- a/polly/test/ScopInfo/invariant_load_base_pointer_conditional.ll +++ b/polly/test/ScopInfo/invariant_load_base_pointer_conditional.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -polly-ignore-aliasing -polly-process-unprofitable -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -polly-ignore-aliasing -polly-process-unprofitable -disable-output < %s 2>&1 | FileCheck %s ; ; CHECK: Invariant Accesses: ; CHECK-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0] diff --git a/polly/test/ScopInfo/invariant_load_base_pointer_in_conditional.ll b/polly/test/ScopInfo/invariant_load_base_pointer_in_conditional.ll index 7313176aceed7..5a9c5c6cabbe6 100644 --- a/polly/test/ScopInfo/invariant_load_base_pointer_in_conditional.ll +++ b/polly/test/ScopInfo/invariant_load_base_pointer_in_conditional.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -polly-ignore-aliasing -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -polly-ignore-aliasing -disable-output < %s 2>&1 | FileCheck %s ; ; CHECK: Invariant Accesses: ; CHECK-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0] diff --git a/polly/test/ScopInfo/invariant_load_branch_condition.ll b/polly/test/ScopInfo/invariant_load_branch_condition.ll index f6cadffe311e8..d12750c30ba98 100644 --- a/polly/test/ScopInfo/invariant_load_branch_condition.ll +++ b/polly/test/ScopInfo/invariant_load_branch_condition.ll @@ -1,4 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output -polly-invariant-load-hoisting < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output \ +; RUN: -polly-invariant-load-hoisting < %s 2>&1 | FileCheck %s ; CHECK: Invariant Accesses: { ; CHECK-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0] diff --git a/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs.ll b/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs.ll index 76cc55767caca..34d50a18663c4 100644 --- a/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs.ll +++ b/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs.ll @@ -1,4 +1,6 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output -polly-invariant-load-hoisting < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 \ +; RUN: -polly-invariant-load-hoisting \ +; RUN: | FileCheck %s ; CHECK: Stmt_body1 ; CHECK-NEXT: Domain := diff --git a/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs_2.ll b/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs_2.ll index 9cc9391b6bc25..51f3cf6c095ac 100644 --- a/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs_2.ll +++ b/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs_2.ll @@ -1,4 +1,6 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output -polly-invariant-load-hoisting < %s 2>&1 | 
FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 \ +; RUN: -polly-invariant-load-hoisting \ +; RUN: | FileCheck %s ; Make sure we choose a canonical element that is not the first invariant load, ; but the first that is an array base pointer. diff --git a/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs_3.ll b/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs_3.ll index 7f609f9a54689..3a742bbccdf19 100644 --- a/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs_3.ll +++ b/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs_3.ll @@ -1,4 +1,6 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output -polly-invariant-load-hoisting < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 \ +; RUN: -polly-invariant-load-hoisting \ +; RUN: | FileCheck %s ; Verify that we canonicalize accesses even tough one of the accesses (even ; the canonical base) has a partial execution context. This is correct as diff --git a/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs_4.ll b/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs_4.ll index 216e0760987cd..6bd8b3146e871 100644 --- a/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs_4.ll +++ b/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs_4.ll @@ -1,4 +1,6 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output -polly-invariant-load-hoisting < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 \ +; RUN: -polly-invariant-load-hoisting \ +; RUN: | FileCheck %s ; Verify that a delinearized and a not delinearized access are not ; canonicalized. diff --git a/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs_4b.ll b/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs_4b.ll index 5da3d0ceb2d0f..cb7e5646fc2b0 100644 --- a/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs_4b.ll +++ b/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs_4b.ll @@ -1,4 +1,6 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output -polly-invariant-load-hoisting < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 \ +; RUN: -polly-invariant-load-hoisting \ +; RUN: | FileCheck %s ; Verify that two arrays delinearized with different sizes are not coalesced. diff --git a/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs_4c.ll b/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs_4c.ll index b71a092a2d468..6f7fbacc089cb 100644 --- a/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs_4c.ll +++ b/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs_4c.ll @@ -1,4 +1,6 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output -polly-invariant-load-hoisting < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 \ +; RUN: -polly-invariant-load-hoisting \ +; RUN: | FileCheck %s ; Verify that arrays with different element types are not coalesced. 
diff --git a/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs_5.ll b/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs_5.ll index 2c4683ea5ce96..445832822bdf0 100644 --- a/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs_5.ll +++ b/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs_5.ll @@ -1,4 +1,6 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output -polly-invariant-load-hoisting < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 \ +; RUN: -polly-invariant-load-hoisting \ +; RUN: | FileCheck %s ; Verify that nested arrays with invariant base pointers are handled correctly. ; Specifically, we currently do not canonicalize arrays where some accesses are diff --git a/polly/test/ScopInfo/invariant_load_complex_condition.ll b/polly/test/ScopInfo/invariant_load_complex_condition.ll index e6ea032004a96..11e7088d68dbd 100644 --- a/polly/test/ScopInfo/invariant_load_complex_condition.ll +++ b/polly/test/ScopInfo/invariant_load_complex_condition.ll @@ -1,4 +1,5 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -S '-passes=polly-custom<scops>' -polly-print-scops -disable-output -polly-invariant-load-hoisting=true < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -S '-passes=print<polly-function-scops>' -disable-output \ +; RUN: -polly-invariant-load-hoisting=true < %s 2>&1 | FileCheck %s target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" diff --git a/polly/test/ScopInfo/invariant_load_condition.ll b/polly/test/ScopInfo/invariant_load_condition.ll index 8b1dc8be87c86..c7d7b3c9ba611 100644 --- a/polly/test/ScopInfo/invariant_load_condition.ll +++ b/polly/test/ScopInfo/invariant_load_condition.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-process-unprofitable '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-process-unprofitable '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s ; ; CHECK: Invariant Accesses: ; CHECK-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0] diff --git a/polly/test/ScopInfo/invariant_load_dereferenceable.ll b/polly/test/ScopInfo/invariant_load_dereferenceable.ll index fc5527c48c411..526bdc6ddb3bd 100644 --- a/polly/test/ScopInfo/invariant_load_dereferenceable.ll +++ b/polly/test/ScopInfo/invariant_load_dereferenceable.ll @@ -1,4 +1,6 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<detect;scops>' -polly-print-detect -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-detect>,print<polly-function-scops>' \ +; RUN: -polly-invariant-load-hoisting=true \ +; RUN: -disable-output < %s 2>&1 | FileCheck %s ; CHECK-NOT: Function: foo_undereferanceable diff --git a/polly/test/ScopInfo/invariant_load_distinct_parameter_valuations.ll b/polly/test/ScopInfo/invariant_load_distinct_parameter_valuations.ll index b5525a8e2639e..eb148063320e7 100644 --- a/polly/test/ScopInfo/invariant_load_distinct_parameter_valuations.ll +++ b/polly/test/ScopInfo/invariant_load_distinct_parameter_valuations.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly 
'-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s ; ; Check that we do not consolidate the invariant loads to smp[order - 1] and ; smp[order - 2] in the blocks %0 and %16. While they have the same pointer diff --git a/polly/test/ScopInfo/invariant_load_in_non_affine.ll b/polly/test/ScopInfo/invariant_load_in_non_affine.ll index 69a7932fd3f58..5261113f5a0cf 100644 --- a/polly/test/ScopInfo/invariant_load_in_non_affine.ll +++ b/polly/test/ScopInfo/invariant_load_in_non_affine.ll @@ -1,4 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output -polly-invariant-load-hoisting=true < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output \ +; RUN: -polly-invariant-load-hoisting=true < %s 2>&1 | FileCheck %s ; ; CHECK-NOT: Valid Region for Scop ; diff --git a/polly/test/ScopInfo/invariant_load_loop_ub.ll b/polly/test/ScopInfo/invariant_load_loop_ub.ll index 9258d75f6e294..ee889e6c4d5a1 100644 --- a/polly/test/ScopInfo/invariant_load_loop_ub.ll +++ b/polly/test/ScopInfo/invariant_load_loop_ub.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -polly-process-unprofitable -disable-output < %s 2>&1 | FileCheck %s -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -polly-process-unprofitable -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -polly-process-unprofitable -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -polly-process-unprofitable -disable-output < %s 2>&1 | FileCheck %s ; ; CHECK: Invariant Accesses: ; CHECK-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0] diff --git a/polly/test/ScopInfo/invariant_load_ptr_ptr_noalias.ll b/polly/test/ScopInfo/invariant_load_ptr_ptr_noalias.ll index 50b0103b73efb..6af7caecc0b37 100644 --- a/polly/test/ScopInfo/invariant_load_ptr_ptr_noalias.ll +++ b/polly/test/ScopInfo/invariant_load_ptr_ptr_noalias.ll @@ -1,4 +1,5 @@ -; RUN: opt %loadNPMPolly -aa-pipeline=tbaa '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -polly-ignore-aliasing -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -aa-pipeline=tbaa '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -polly-ignore-aliasing \ +; RUN: -disable-output < %s 2>&1 | FileCheck %s ; ; Note: The order of the invariant accesses is important because A is the ; base pointer of tmp3 and we will generate code in the same order as diff --git a/polly/test/ScopInfo/invariant_load_scalar_dep.ll b/polly/test/ScopInfo/invariant_load_scalar_dep.ll index ae1423e1e5f05..319f24bdcb920 100644 --- a/polly/test/ScopInfo/invariant_load_scalar_dep.ll +++ b/polly/test/ScopInfo/invariant_load_scalar_dep.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-process-unprofitable '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-process-unprofitable '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s ; ; CHECK: Invariant Accesses: ; CHECK-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0] diff --git 
a/polly/test/ScopInfo/invariant_load_stmt_domain.ll b/polly/test/ScopInfo/invariant_load_stmt_domain.ll index 8062d875b1174..715948062c055 100644 --- a/polly/test/ScopInfo/invariant_load_stmt_domain.ll +++ b/polly/test/ScopInfo/invariant_load_stmt_domain.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s ; This test case verifies that the statement domain of the invariant access ; is the universe. In earlier versions of Polly, we accidentally computed an diff --git a/polly/test/ScopInfo/invariant_load_zext_parameter-2.ll b/polly/test/ScopInfo/invariant_load_zext_parameter-2.ll index 9ee4a54168a68..a6108320d5608 100644 --- a/polly/test/ScopInfo/invariant_load_zext_parameter-2.ll +++ b/polly/test/ScopInfo/invariant_load_zext_parameter-2.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly -scalar-evolution-max-value-compare-depth=3 '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s -; RUN: opt %loadNPMPolly -scalar-evolution-max-value-compare-depth=3 '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=true -disable-output < %s +; RUN: opt %loadNPMPolly -scalar-evolution-max-value-compare-depth=3 '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -scalar-evolution-max-value-compare-depth=3 -passes=polly-codegen -polly-invariant-load-hoisting=true -disable-output < %s ; ; Stress test for the code generation of invariant accesses. ; diff --git a/polly/test/ScopInfo/invariant_load_zext_parameter.ll b/polly/test/ScopInfo/invariant_load_zext_parameter.ll index 5bd2c51d86fa6..e3c183aab5e26 100644 --- a/polly/test/ScopInfo/invariant_load_zext_parameter.ll +++ b/polly/test/ScopInfo/invariant_load_zext_parameter.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s -; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=true -S < %s 2>&1 | FileCheck %s --check-prefix=CODEGEN +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true -S < %s 2>&1 | FileCheck %s --check-prefix=CODEGEN ; ; void f(int *I0, int *I1, int *V) { ; for (int i = 0; i < 1000; i++) { diff --git a/polly/test/ScopInfo/invariant_load_zextended_in_own_execution_context.ll b/polly/test/ScopInfo/invariant_load_zextended_in_own_execution_context.ll index 426c14c191dd1..b5168e912ed74 100644 --- a/polly/test/ScopInfo/invariant_load_zextended_in_own_execution_context.ll +++ b/polly/test/ScopInfo/invariant_load_zextended_in_own_execution_context.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s -; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=true -disable-output < %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly 
-passes=polly-codegen -polly-invariant-load-hoisting=true -disable-output < %s ; ; CHECK: Execution Context: [p_0_loaded_from_currpc] -> { : } ; diff --git a/polly/test/ScopInfo/invariant_loads_complicated_dependences.ll b/polly/test/ScopInfo/invariant_loads_complicated_dependences.ll index 77f74df7d7b21..85360821078dc 100644 --- a/polly/test/ScopInfo/invariant_loads_complicated_dependences.ll +++ b/polly/test/ScopInfo/invariant_loads_complicated_dependences.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s ; ; CHECK: Invariant Accesses: { ; CHECK-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0] diff --git a/polly/test/ScopInfo/invariant_loads_cyclic_dependences.ll b/polly/test/ScopInfo/invariant_loads_cyclic_dependences.ll index f18534d5bee24..134eac22bff5c 100644 --- a/polly/test/ScopInfo/invariant_loads_cyclic_dependences.ll +++ b/polly/test/ScopInfo/invariant_loads_cyclic_dependences.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-detect -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-detect>,print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s ; ; Negative test. If we assume UB[*V] to be invariant we get a cyclic ; dependence in the invariant loads that needs to be resolved by diff --git a/polly/test/ScopInfo/invariant_loop_bounds.ll b/polly/test/ScopInfo/invariant_loop_bounds.ll index dcf7f50eb27c4..f22199cfe4942 100644 --- a/polly/test/ScopInfo/invariant_loop_bounds.ll +++ b/polly/test/ScopInfo/invariant_loop_bounds.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s ; ; CHECK: Invariant Accesses: { ; CHECK-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0] diff --git a/polly/test/ScopInfo/invariant_same_loop_bound_multiple_times-1.ll b/polly/test/ScopInfo/invariant_same_loop_bound_multiple_times-1.ll index df5798638ba7c..e3292b4e4aefa 100644 --- a/polly/test/ScopInfo/invariant_same_loop_bound_multiple_times-1.ll +++ b/polly/test/ScopInfo/invariant_same_loop_bound_multiple_times-1.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s ; ; Verify that we only have one parameter and one invariant load for all ; three loads that occur in the region but actually access the same diff --git a/polly/test/ScopInfo/invariant_same_loop_bound_multiple_times-2.ll b/polly/test/ScopInfo/invariant_same_loop_bound_multiple_times-2.ll index 3d8c232c75970..d69438de5817f 100644 --- a/polly/test/ScopInfo/invariant_same_loop_bound_multiple_times-2.ll +++ b/polly/test/ScopInfo/invariant_same_loop_bound_multiple_times-2.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true 
-disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s ; ; Verify that we only have one parameter and one invariant load for all ; three loads that occur in the region but actually access the same diff --git a/polly/test/ScopInfo/isl_aff_out_of_bounds.ll b/polly/test/ScopInfo/isl_aff_out_of_bounds.ll index 965531f20b01d..2df96faf76249 100644 --- a/polly/test/ScopInfo/isl_aff_out_of_bounds.ll +++ b/polly/test/ScopInfo/isl_aff_out_of_bounds.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect < %s 2>&1 +; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' < %s 2>&1 ; Used to fail with: ; ../../isl/isl_aff.c:591: position out of bounds diff --git a/polly/test/ScopInfo/isl_trip_count_01.ll b/polly/test/ScopInfo/isl_trip_count_01.ll index 79621ce64bbcc..480b6e9574a66 100644 --- a/polly/test/ScopInfo/isl_trip_count_01.ll +++ b/polly/test/ScopInfo/isl_trip_count_01.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s ; ; CHECK: [M, N] -> { Stmt_while_body[i0] : i0 > 0 and 4i0 <= -M + N; Stmt_while_body[0] }; ; diff --git a/polly/test/ScopInfo/isl_trip_count_02.ll b/polly/test/ScopInfo/isl_trip_count_02.ll index 3052299277844..b78fb838edd0f 100644 --- a/polly/test/ScopInfo/isl_trip_count_02.ll +++ b/polly/test/ScopInfo/isl_trip_count_02.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-detect -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s ; ; TODO: We do not allow unbounded loops at the moment. ; diff --git a/polly/test/ScopInfo/isl_trip_count_03.ll b/polly/test/ScopInfo/isl_trip_count_03.ll index 52fde263d6898..96df05f89bcff 100644 --- a/polly/test/ScopInfo/isl_trip_count_03.ll +++ b/polly/test/ScopInfo/isl_trip_count_03.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s ; ; Test comes from a bug (15771) or better a feature request. It was not allowed ; in Polly in the old domain generation as ScalarEvolution cannot figure out the diff --git a/polly/test/ScopInfo/isl_trip_count_multiple_exiting_blocks.ll b/polly/test/ScopInfo/isl_trip_count_multiple_exiting_blocks.ll index 657b8f6dc64e1..fd310ececaa38 100644 --- a/polly/test/ScopInfo/isl_trip_count_multiple_exiting_blocks.ll +++ b/polly/test/ScopInfo/isl_trip_count_multiple_exiting_blocks.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s ; ; The SCoP contains a loop with multiple exit blocks (BBs after leaving ; the loop). 
The current implementation of deriving their domain derives diff --git a/polly/test/ScopInfo/licm_load.ll b/polly/test/ScopInfo/licm_load.ll index 8f1cf4fa8fd91..ade640976d007 100644 --- a/polly/test/ScopInfo/licm_load.ll +++ b/polly/test/ScopInfo/licm_load.ll @@ -1,4 +1,7 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<prepare;scops>' -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -passes='loop(loop-rotate,indvars),polly-prepare,print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 \ +; RUN: | FileCheck %s +; RUN: opt %loadNPMPolly -passes='loop-mssa(loop-rotate,indvars,licm),polly-prepare,print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 \ +; RUN: | FileCheck %s ; ; void foo(int n, float A[static const restrict n], ; float B[static const restrict n], int j) { @@ -11,30 +14,26 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" define void @foo(i32 %n, ptr noalias nonnull %A, ptr noalias nonnull %B, i32 %j) { entry: %tmp = sext i32 %n to i64 - %cmp1 = icmp slt i64 0, %tmp - br i1 %cmp1, label %for.body.lr.ph, label %for.end + br label %for.cond -for.body.lr.ph: ; preds = %entry +for.cond: ; preds = %for.inc, %entry + %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc ], [ 0, %entry ] + %cmp = icmp slt i64 %indvars.iv, %tmp + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond %idxprom = sext i32 %j to i64 %arrayidx = getelementptr inbounds float, ptr %B, i64 %idxprom %tmp2 = load i32, ptr %arrayidx, align 4 - br label %for.body - -for.body: ; preds = %for.body.lr.ph, %for.inc - %indvars.iv2 = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.inc ] - %arrayidx2 = getelementptr inbounds float, ptr %A, i64 %indvars.iv2 + %arrayidx2 = getelementptr inbounds float, ptr %A, i64 %indvars.iv store i32 %tmp2, ptr %arrayidx2, align 4 br label %for.inc for.inc: ; preds = %for.body - %indvars.iv.next = add nuw nsw i64 %indvars.iv2, 1 - %exitcond = icmp ne i64 %indvars.iv.next, %tmp - br i1 %exitcond, label %for.body, label %for.cond.for.end_crit_edge - -for.cond.for.end_crit_edge: ; preds = %for.inc - br label %for.end + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + br label %for.cond -for.end: ; preds = %for.cond.for.end_crit_edge, %entry +for.end: ; preds = %for.cond ret void } diff --git a/polly/test/ScopInfo/licm_potential_store.ll b/polly/test/ScopInfo/licm_potential_store.ll index cbd8e410ed7c8..8a36ee84313a2 100644 --- a/polly/test/ScopInfo/licm_potential_store.ll +++ b/polly/test/ScopInfo/licm_potential_store.ll @@ -1,4 +1,10 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<prepare;scops>' -polly-print-scops -tailcallopt -disable-output < %s 2>&1 | FileCheck %s --check-prefix=NOLICM +; RUN: opt %loadNPMPolly -passes='sroa,instcombine,simplifycfg,reassociate,loop(loop-rotate),instcombine,indvars,polly-prepare,print<polly-function-scops>' \ +; RUN: -tailcallopt -disable-output < %s 2>&1 \ +; RUN: | FileCheck %s --check-prefix=NOLICM + +; RUN: opt %loadNPMPolly -passes='sroa,instcombine,simplifycfg,reassociate,loop(loop-rotate),instcombine,indvars,loop-mssa(licm),polly-prepare,print<polly-function-scops>' \ +; RUN: -tailcallopt -disable-output < %s 2>&1 \ +; RUN: | FileCheck %s --check-prefix=LICM ; void foo(int n, float A[static const restrict n], float x) { ; // (0) @@ -11,40 +17,67 @@ ; // (4) ; } +; LICM: Statements ; NOLICM: Statements target datalayout = 
"e-m:e-i64:64-f80:128-n8:16:32:64-S128" define void @foo(i32 %n, ptr noalias nonnull %A, float %x) { entry: - %smax = call i32 @llvm.smax.i32(i32 %n, i32 0) - %0 = add nuw i32 %smax, 1 - br label %for.cond.1.preheader + %n.addr = alloca i32, align 4 + %A.addr = alloca ptr, align 8 + %x.addr = alloca float, align 4 + %i = alloca i32, align 4 + %j = alloca i32, align 4 + store i32 %n, ptr %n.addr, align 4 + store ptr %A, ptr %A.addr, align 8 + store float %x, ptr %x.addr, align 4 + %tmp = load i32, ptr %n.addr, align 4 + %tmp1 = zext i32 %tmp to i64 + store i32 0, ptr %i, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc.4, %entry + %tmp2 = load i32, ptr %i, align 4 + %cmp = icmp slt i32 %tmp2, 5 + br i1 %cmp, label %for.body, label %for.end.6 -for.cond.1.preheader: ; preds = %entry, %for.end - %i.05 = phi i32 [ 0, %entry ], [ %add5, %for.end ] - %x.addr.04 = phi float [ %x, %entry ], [ %x.addr.1.lcssa, %for.end ] +for.body: ; preds = %for.cond + store i32 0, ptr %j, align 4 br label %for.cond.1 -for.cond.1: ; preds = %for.cond.1, %for.cond.1.preheader - %x.addr.1 = phi float [ 7.000000e+00, %for.cond.1 ], [ %x.addr.04, %for.cond.1.preheader ] - %j.0 = phi i32 [ %add, %for.cond.1 ], [ 0, %for.cond.1.preheader ] - %add = add nuw i32 %j.0, 1 - %exitcond = icmp ne i32 %add, %0 - br i1 %exitcond, label %for.cond.1, label %for.end +for.cond.1: ; preds = %for.inc, %for.body + %tmp3 = load i32, ptr %j, align 4 + %tmp4 = load i32, ptr %n.addr, align 4 + %cmp2 = icmp slt i32 %tmp3, %tmp4 + br i1 %cmp2, label %for.body.3, label %for.end + +for.body.3: ; preds = %for.cond.1 + store float 7.000000e+00, ptr %x.addr, align 4 + br label %for.inc + +for.inc: ; preds = %for.body.3 + %tmp5 = load i32, ptr %j, align 4 + %add = add nsw i32 %tmp5, 1 + store i32 %add, ptr %j, align 4 + br label %for.cond.1 for.end: ; preds = %for.cond.1 - %x.addr.1.lcssa = phi float [ %x.addr.1, %for.cond.1 ] - store float %x.addr.1.lcssa, ptr %A, align 4 - %add5 = add nuw nsw i32 %i.05, 1 - %exitcond6 = icmp ne i32 %add5, 5 - br i1 %exitcond6, label %for.cond.1.preheader, label %for.end.6 + %tmp6 = load float, ptr %x.addr, align 4 + %tmp7 = load ptr, ptr %A.addr, align 8 + store float %tmp6, ptr %tmp7, align 4 + br label %for.inc.4 + +for.inc.4: ; preds = %for.end + %tmp8 = load i32, ptr %i, align 4 + %add5 = add nsw i32 %tmp8, 1 + store i32 %add5, ptr %i, align 4 + br label %for.cond -for.end.6: ; preds = %for.end +for.end.6: ; preds = %for.cond ret void } -; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) -declare i32 @llvm.smax.i32(i32, i32) #0 - -attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +; CHECK: Statements { +; CHECK: Stmt_for_end +; CHECK: } diff --git a/polly/test/ScopInfo/licm_potential_store_mssa.ll b/polly/test/ScopInfo/licm_potential_store_mssa.ll deleted file mode 100644 index ce785d622fcb3..0000000000000 --- a/polly/test/ScopInfo/licm_potential_store_mssa.ll +++ /dev/null @@ -1,50 +0,0 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<prepare;scops>' -polly-print-scops -tailcallopt -disable-output < %s 2>&1 | FileCheck %s --check-prefix=LICM - -; void foo(int n, float A[static const restrict n], float x) { -; // (0) -; for (int i = 0; i < 5; i += 1) { -; for (int j = 0; j < n; j += 1) { -; x = 7; // (1) -; } -; A[0] = x; // (3) -; } -; // (4) -; } - -; LICM: Statements - -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" - -define void @foo(i32 %n, ptr noalias nonnull %A, float %x) { -entry: - %smax = call 
i32 @llvm.smax.i32(i32 %n, i32 0) - br label %for.cond.1.preheader - -for.cond.1.preheader: ; preds = %for.end, %entry - %i.05 = phi i32 [ 0, %entry ], [ %add5, %for.end ] - %x.addr.04 = phi float [ %x, %entry ], [ %x.addr.1.lcssa, %for.end ] - br label %for.cond.1 - -for.cond.1: ; preds = %for.cond.1, %for.cond.1.preheader - %x.addr.1 = phi float [ 7.000000e+00, %for.cond.1 ], [ %x.addr.04, %for.cond.1.preheader ] - %j.0 = phi i32 [ %add, %for.cond.1 ], [ 0, %for.cond.1.preheader ] - %add = add nuw i32 %j.0, 1 - %exitcond.not = icmp eq i32 %j.0, %smax - br i1 %exitcond.not, label %for.end, label %for.cond.1 - -for.end: ; preds = %for.cond.1 - %x.addr.1.lcssa = phi float [ %x.addr.1, %for.cond.1 ] - %add5 = add nuw nsw i32 %i.05, 1 - %exitcond6.not = icmp eq i32 %add5, 5 - br i1 %exitcond6.not, label %for.end.6, label %for.cond.1.preheader - -for.end.6: ; preds = %for.end - %x.addr.1.lcssa.lcssa = phi float [ %x.addr.1.lcssa, %for.end ] - store float %x.addr.1.lcssa.lcssa, ptr %A, align 4 - ret void -} - -; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) -declare i32 @llvm.smax.i32(i32, i32) #0 - -attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } diff --git a/polly/test/ScopInfo/licm_reduction_nested.ll b/polly/test/ScopInfo/licm_reduction_nested.ll index 50625b2ddabde..c1676033fa909 100644 --- a/polly/test/ScopInfo/licm_reduction_nested.ll +++ b/polly/test/ScopInfo/licm_reduction_nested.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -loop-rotate -indvars '-passes=polly-custom<prepare;scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s -; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -loop-rotate -indvars -licm '-passes=polly-custom<prepare;scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -loop-rotate -indvars -passes=polly-prepare '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -loop-rotate -indvars -licm -passes=polly-prepare '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s ; ; XFAIL: * ; diff --git a/polly/test/ScopInfo/long-compile-time-alias-analysis.ll b/polly/test/ScopInfo/long-compile-time-alias-analysis.ll index 8225bd04fce63..f102518da5261 100644 --- a/polly/test/ScopInfo/long-compile-time-alias-analysis.ll +++ b/polly/test/ScopInfo/long-compile-time-alias-analysis.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s ; Verify that the compilation of this test case does not take infinite time. 
 ; At some point Polly tried to model this test case and got stuck in
diff --git a/polly/test/ScopInfo/long-sequence-of-error-blocks-2.ll b/polly/test/ScopInfo/long-sequence-of-error-blocks-2.ll
index 064a0d3e700b9..e32748a4bbb57 100644
--- a/polly/test/ScopInfo/long-sequence-of-error-blocks-2.ll
+++ b/polly/test/ScopInfo/long-sequence-of-error-blocks-2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-detect -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
diff --git a/polly/test/ScopInfo/long-sequence-of-error-blocks.ll b/polly/test/ScopInfo/long-sequence-of-error-blocks.ll
index edaadd61dc020..b32b87b5c3f3a 100644
--- a/polly/test/ScopInfo/long-sequence-of-error-blocks.ll
+++ b/polly/test/ScopInfo/long-sequence-of-error-blocks.ll
@@ -1,4 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output -polly-invariant-load-hoisting=true < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output \
+; RUN: -polly-invariant-load-hoisting=true < %s 2>&1 | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
diff --git a/polly/test/ScopInfo/loop-multiexit-succ-cond.ll b/polly/test/ScopInfo/loop-multiexit-succ-cond.ll
index 391f0ec8c0f59..431c907857fec 100644
--- a/polly/test/ScopInfo/loop-multiexit-succ-cond.ll
+++ b/polly/test/ScopInfo/loop-multiexit-succ-cond.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s 2>&1 | FileCheck %s --check-prefix=IR
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s 2>&1 | FileCheck %s --check-prefix=IR
 ;
 ; The SCoP contains a loop with multiple exit blocks (BBs after leaving
 ; the loop). The current implementation of deriving their domain derives
diff --git a/polly/test/ScopInfo/loop_affine_bound_0.ll b/polly/test/ScopInfo/loop_affine_bound_0.ll
index fcd56613fc095..918d4099740ce 100644
--- a/polly/test/ScopInfo/loop_affine_bound_0.ll
+++ b/polly/test/ScopInfo/loop_affine_bound_0.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 
 ; void f(long a[][128], long N, long M) {
 ; long i, j;
diff --git a/polly/test/ScopInfo/loop_affine_bound_1.ll b/polly/test/ScopInfo/loop_affine_bound_1.ll
index 392509871a9b7..8f7a87f1c5ac4 100644
--- a/polly/test/ScopInfo/loop_affine_bound_1.ll
+++ b/polly/test/ScopInfo/loop_affine_bound_1.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 
 ;void f(long a[][128], long N, long M) {
 ; long i, j;
diff --git a/polly/test/ScopInfo/loop_affine_bound_2.ll b/polly/test/ScopInfo/loop_affine_bound_2.ll
index 665dc1ad244d9..2d9f997a0767f 100644
--- a/polly/test/ScopInfo/loop_affine_bound_2.ll
+++ b/polly/test/ScopInfo/loop_affine_bound_2.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 
 ; void f(long a[][128], long N, long M) {
 ; long i, j;
diff --git a/polly/test/ScopInfo/loop_carry.ll b/polly/test/ScopInfo/loop_carry.ll
index 579f43d874577..20ebbfbc8b49c 100644
--- a/polly/test/ScopInfo/loop_carry.ll
+++ b/polly/test/ScopInfo/loop_carry.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-stmt-granularity=bb '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
 
diff --git a/polly/test/ScopInfo/many-scalar-dependences.ll b/polly/test/ScopInfo/many-scalar-dependences.ll
index ddad36065a5c8..5b003325ef0fb 100644
--- a/polly/test/ScopInfo/many-scalar-dependences.ll
+++ b/polly/test/ScopInfo/many-scalar-dependences.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; void f(float a[100][100]) {
 ; float x;
diff --git a/polly/test/ScopInfo/max-loop-depth.ll b/polly/test/ScopInfo/max-loop-depth.ll
index f33933210247d..71e9c02aa8dcc 100644
--- a/polly/test/ScopInfo/max-loop-depth.ll
+++ b/polly/test/ScopInfo/max-loop-depth.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; void bar();
 ; void foo(int *A, int *B, long int N, long int M) {
diff --git a/polly/test/ScopInfo/memcpy-raw-source.ll b/polly/test/ScopInfo/memcpy-raw-source.ll
index 149a2fcfea772..6c45b0d41b76b 100644
--- a/polly/test/ScopInfo/memcpy-raw-source.ll
+++ b/polly/test/ScopInfo/memcpy-raw-source.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa,scoped-noalias-aa,tbaa '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa,scoped-noalias-aa,tbaa '-passes=print<polly-function-scops>' -disable-output < %s
 ;
 ; Ensure that ScopInfo's alias analysis llvm.memcpy for,
 ; like the AliasSetTracker, preserves bitcasts.
diff --git a/polly/test/ScopInfo/memcpy.ll b/polly/test/ScopInfo/memcpy.ll
index 6b7a9e2edffbe..95c455f097b21 100644
--- a/polly/test/ScopInfo/memcpy.ll
+++ b/polly/test/ScopInfo/memcpy.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-differing-element-types '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly -S -aa-pipeline=basic-aa -polly-allow-differing-element-types '-passes=polly<no-default-opts>' < %s 2>&1 | FileCheck --check-prefix=IR %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-differing-element-types '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -S -aa-pipeline=basic-aa -polly-allow-differing-element-types -passes=polly-codegen < %s 2>&1 | FileCheck --check-prefix=IR %s
 ;
 ; CHECK: Arrays {
 ; CHECK-NEXT: i8 MemRef_A[*]; // Element size 1
diff --git a/polly/test/ScopInfo/memmove.ll b/polly/test/ScopInfo/memmove.ll
index aba886b59d1d5..8ff471a11cd17 100644
--- a/polly/test/ScopInfo/memmove.ll
+++ b/polly/test/ScopInfo/memmove.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-differing-element-types '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly -S -aa-pipeline=basic-aa -polly-allow-differing-element-types '-passes=polly<no-default-opts>' < %s 2>&1 | FileCheck --check-prefix=IR %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-differing-element-types '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -S -aa-pipeline=basic-aa -polly-allow-differing-element-types -passes=polly-codegen < %s 2>&1 | FileCheck --check-prefix=IR %s
 ;
 ; CHECK: Arrays {
 ; CHECK-NEXT: i8 MemRef_A[*]; // Element size 1
diff --git a/polly/test/ScopInfo/memset.ll b/polly/test/ScopInfo/memset.ll
index 7eaec7bd1ad6a..89b0487728210 100644
--- a/polly/test/ScopInfo/memset.ll
+++ b/polly/test/ScopInfo/memset.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly -polly-allow-differing-element-types '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly -S -polly-allow-differing-element-types '-passes=polly<no-default-opts>' < %s 2>&1 | FileCheck --check-prefix=IR %s
+; RUN: opt %loadNPMPolly -polly-allow-differing-element-types '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -S -polly-allow-differing-element-types -passes=polly-codegen < %s 2>&1 | FileCheck --check-prefix=IR %s
 ;
 ; CHECK: Arrays {
 ; CHECK-NEXT: i8 MemRef_A[*]; // Element size 1
diff --git a/polly/test/ScopInfo/memset_null.ll b/polly/test/ScopInfo/memset_null.ll
index 7bd3e90b3aa82..9755cf1129e68 100644
--- a/polly/test/ScopInfo/memset_null.ll
+++ b/polly/test/ScopInfo/memset_null.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly -polly-allow-modref-calls '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly -polly-allow-modref-calls -S '-passes=polly<no-default-opts>' < %s
+; RUN: opt %loadNPMPolly -polly-allow-modref-calls '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-allow-modref-calls -S -passes=polly-codegen < %s
 ;
 ; Verify we can handle a memset to "null" and that we do not model it.
 ; TODO: FIXME: We could use the undefined memset to optimize the code further,
diff --git a/polly/test/ScopInfo/mismatching-array-dimensions.ll b/polly/test/ScopInfo/mismatching-array-dimensions.ll
index cd12421344f7f..f825cbff1ec56 100644
--- a/polly/test/ScopInfo/mismatching-array-dimensions.ll
+++ b/polly/test/ScopInfo/mismatching-array-dimensions.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 
 ; CHECK-NOT: AssumedContext
 
diff --git a/polly/test/ScopInfo/mod_ref_access_pointee_arguments.ll b/polly/test/ScopInfo/mod_ref_access_pointee_arguments.ll
index 1e289425e86d7..6bc5f8d8eb73f 100644
--- a/polly/test/ScopInfo/mod_ref_access_pointee_arguments.ll
+++ b/polly/test/ScopInfo/mod_ref_access_pointee_arguments.ll
@@ -1,5 +1,7 @@
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-stmt-granularity=bb '-passes=polly-custom<scops>' -polly-print-scops -polly-allow-modref-calls -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-stmt-granularity=bb '-passes=polly<no-default-opts>' -polly-allow-modref-calls -disable-output < %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -polly-allow-modref-calls \
+; RUN: -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-stmt-granularity=bb -passes=polly-codegen -polly-allow-modref-calls \
+; RUN: -disable-output < %s
 ;
 ; Verify that we model the may-write access of the prefetch intrinsic
 ; correctly, thus that A is accessed by it but B is not.
diff --git a/polly/test/ScopInfo/mod_ref_read_pointee_arguments.ll b/polly/test/ScopInfo/mod_ref_read_pointee_arguments.ll
index 0b6e64da437fd..21322bc648f8e 100644
--- a/polly/test/ScopInfo/mod_ref_read_pointee_arguments.ll
+++ b/polly/test/ScopInfo/mod_ref_read_pointee_arguments.ll
@@ -1,5 +1,7 @@
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-stmt-granularity=bb '-passes=polly-custom<scops>' -polly-print-scops -polly-allow-modref-calls -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=polly<no-default-opts>' -disable-output -polly-allow-modref-calls < %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -polly-allow-modref-calls \
+; RUN: -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -passes=polly-codegen -disable-output \
+; RUN: -polly-allow-modref-calls < %s
 ;
 ; Verify that we model the read access of the gcread intrinsic
 ; correctly, thus that A is read by it but B is not.
diff --git a/polly/test/ScopInfo/mod_ref_read_pointer.ll b/polly/test/ScopInfo/mod_ref_read_pointer.ll
index 25d59d9f7fd16..25e56a08a961b 100644
--- a/polly/test/ScopInfo/mod_ref_read_pointer.ll
+++ b/polly/test/ScopInfo/mod_ref_read_pointer.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-modref-calls '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-modref-calls '-passes=polly<no-default-opts>' -disable-output < %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-modref-calls '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-modref-calls -passes=polly-codegen -disable-output < %s
 ;
 ; Check that we assume the call to func has a read on the whole A array.
 ;
diff --git a/polly/test/ScopInfo/mod_ref_read_pointers.ll b/polly/test/ScopInfo/mod_ref_read_pointers.ll
index f8cbb084aefe8..5cc96cf3a06eb 100644
--- a/polly/test/ScopInfo/mod_ref_read_pointers.ll
+++ b/polly/test/ScopInfo/mod_ref_read_pointers.ll
@@ -1,5 +1,7 @@
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=polly-custom<scops>' -polly-print-scops -polly-allow-modref-calls -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=polly<no-default-opts>' -disable-output -polly-allow-modref-calls < %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-function-scops>' -polly-allow-modref-calls \
+; RUN: -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -passes=polly-codegen -disable-output \
+; RUN: -polly-allow-modref-calls < %s
 ;
 ; Check that the call to func will "read" not only the A array but also the
 ; B array. The reason is the readonly annotation of func.
diff --git a/polly/test/ScopInfo/modulo_zext_1.ll b/polly/test/ScopInfo/modulo_zext_1.ll
index a9b53d53aea7e..0a8957da4931a 100644
--- a/polly/test/ScopInfo/modulo_zext_1.ll
+++ b/polly/test/ScopInfo/modulo_zext_1.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; CHECK: Assumed Context:
 ; CHECK-NEXT: [N] -> { : }
diff --git a/polly/test/ScopInfo/modulo_zext_2.ll b/polly/test/ScopInfo/modulo_zext_2.ll
index f86ddcea9fe2b..7af2411e7e8c4 100644
--- a/polly/test/ScopInfo/modulo_zext_2.ll
+++ b/polly/test/ScopInfo/modulo_zext_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; CHECK: Assumed Context:
 ; CHECK-NEXT: [N] -> { : }
diff --git a/polly/test/ScopInfo/modulo_zext_3.ll b/polly/test/ScopInfo/modulo_zext_3.ll
index 21596d16a6e14..1dac723aa2c23 100644
--- a/polly/test/ScopInfo/modulo_zext_3.ll
+++ b/polly/test/ScopInfo/modulo_zext_3.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; CHECK: Assumed Context:
 ; CHECK-NEXT: [N] -> { : }
diff --git a/polly/test/ScopInfo/multi-scop.ll b/polly/test/ScopInfo/multi-scop.ll
index 8647d89c91d7a..c6dc1f201efa2 100644
--- a/polly/test/ScopInfo/multi-scop.ll
+++ b/polly/test/ScopInfo/multi-scop.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-detect -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
 ; This test case contains two scops.
diff --git a/polly/test/ScopInfo/multidim_2d-diagonal-matrix.ll b/polly/test/ScopInfo/multidim_2d-diagonal-matrix.ll
index 8785458e42f2c..bd46532d87f10 100644
--- a/polly/test/ScopInfo/multidim_2d-diagonal-matrix.ll
+++ b/polly/test/ScopInfo/multidim_2d-diagonal-matrix.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 
 ; Derived from the following code:
diff --git a/polly/test/ScopInfo/multidim_2d_outer_parametric_offset.ll b/polly/test/ScopInfo/multidim_2d_outer_parametric_offset.ll
index 5de07bad6bd06..cdd46304c932b 100644
--- a/polly/test/ScopInfo/multidim_2d_outer_parametric_offset.ll
+++ b/polly/test/ScopInfo/multidim_2d_outer_parametric_offset.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 
 ; Derived from the following code:
diff --git a/polly/test/ScopInfo/multidim_2d_parametric_array_static_loop_bounds.ll b/polly/test/ScopInfo/multidim_2d_parametric_array_static_loop_bounds.ll
index 984f41cd1e9bf..0b735b9106189 100644
--- a/polly/test/ScopInfo/multidim_2d_parametric_array_static_loop_bounds.ll
+++ b/polly/test/ScopInfo/multidim_2d_parametric_array_static_loop_bounds.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 
 ; Derived from the following code:
diff --git a/polly/test/ScopInfo/multidim_2d_with_modref_call.ll b/polly/test/ScopInfo/multidim_2d_with_modref_call.ll
index 96b822ad4aa86..befca87972c19 100644
--- a/polly/test/ScopInfo/multidim_2d_with_modref_call.ll
+++ b/polly/test/ScopInfo/multidim_2d_with_modref_call.ll
@@ -1,5 +1,9 @@
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<scops>' -polly-print-scops -polly-allow-modref-calls -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<scops>' -polly-print-scops -polly-allow-nonaffine -polly-invariant-load-hoisting=true -polly-allow-modref-calls -disable-output < %s 2>&1 | FileCheck %s --check-prefix=NONAFFINE
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -polly-allow-modref-calls \
+; RUN: -polly-invariant-load-hoisting=true \
+; RUN: -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -polly-allow-nonaffine \
+; RUN: -polly-invariant-load-hoisting=true \
+; RUN: -polly-allow-modref-calls -disable-output < %s 2>&1 | FileCheck %s --check-prefix=NONAFFINE
 
 ; TODO: We should delinearize the accesses despite the use in a call to a
 ; readonly function. For now we verify we do not delinearize them though.
diff --git a/polly/test/ScopInfo/multidim_2d_with_modref_call_2.ll b/polly/test/ScopInfo/multidim_2d_with_modref_call_2.ll
index c04cc200e06bd..cceb5353d74c0 100644
--- a/polly/test/ScopInfo/multidim_2d_with_modref_call_2.ll
+++ b/polly/test/ScopInfo/multidim_2d_with_modref_call_2.ll
@@ -1,5 +1,9 @@
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<scops>' -polly-print-scops -polly-allow-modref-calls -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<scops>' -polly-print-scops -polly-allow-nonaffine -polly-invariant-load-hoisting=true -polly-allow-modref-calls -disable-output < %s 2>&1 | FileCheck %s --check-prefix=NONAFFINE
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -polly-allow-modref-calls \
+; RUN: -polly-invariant-load-hoisting=true \
+; RUN: -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -polly-allow-nonaffine \
+; RUN: -polly-invariant-load-hoisting=true \
+; RUN: -polly-allow-modref-calls -disable-output < %s 2>&1 | FileCheck %s --check-prefix=NONAFFINE
 
 ; TODO: We should delinearize the accesses despite the use in a call to a
 ; readonly function. For now we verify we do not delinearize them though.
diff --git a/polly/test/ScopInfo/multidim_3d_parametric_array_static_loop_bounds.ll b/polly/test/ScopInfo/multidim_3d_parametric_array_static_loop_bounds.ll
index 2abd37c9f82d0..c957dd10ed652 100644
--- a/polly/test/ScopInfo/multidim_3d_parametric_array_static_loop_bounds.ll
+++ b/polly/test/ScopInfo/multidim_3d_parametric_array_static_loop_bounds.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 
 ; void foo(long n, long m, long o, double A[n][m][o]) {
diff --git a/polly/test/ScopInfo/multidim_fixedsize_different_dimensionality.ll b/polly/test/ScopInfo/multidim_fixedsize_different_dimensionality.ll
index 47cbc0bb1c534..4a1ee3b1af51d 100644
--- a/polly/test/ScopInfo/multidim_fixedsize_different_dimensionality.ll
+++ b/polly/test/ScopInfo/multidim_fixedsize_different_dimensionality.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; #define N 400
 ;
diff --git a/polly/test/ScopInfo/multidim_fixedsize_multi_offset.ll b/polly/test/ScopInfo/multidim_fixedsize_multi_offset.ll
index e82869616d63c..9a6d8fbe12755 100644
--- a/polly/test/ScopInfo/multidim_fixedsize_multi_offset.ll
+++ b/polly/test/ScopInfo/multidim_fixedsize_multi_offset.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; CHECK: Context:
 ; CHECK-NEXT: { : }
diff --git a/polly/test/ScopInfo/multidim_fold_constant_dim.ll b/polly/test/ScopInfo/multidim_fold_constant_dim.ll
index dde847bb8d4d7..9f47694022868 100644
--- a/polly/test/ScopInfo/multidim_fold_constant_dim.ll
+++ b/polly/test/ScopInfo/multidim_fold_constant_dim.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; struct com {
 ; double Real;
diff --git a/polly/test/ScopInfo/multidim_fold_constant_dim_zero.ll b/polly/test/ScopInfo/multidim_fold_constant_dim_zero.ll
index 84222f73b7c6d..5778126ad8f17 100644
--- a/polly/test/ScopInfo/multidim_fold_constant_dim_zero.ll
+++ b/polly/test/ScopInfo/multidim_fold_constant_dim_zero.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<scops>' -polly-print-scops -debug -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -debug -disable-output < %s 2>&1 | FileCheck %s
 
 ; REQUIRES: asserts
 
diff --git a/polly/test/ScopInfo/multidim_fortran_2d.ll b/polly/test/ScopInfo/multidim_fortran_2d.ll
index 10314606a8123..e5b005f17dcc7 100644
--- a/polly/test/ScopInfo/multidim_fortran_2d.ll
+++ b/polly/test/ScopInfo/multidim_fortran_2d.ll
@@ -1,4 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' \
+; RUN: -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s
 
 ; subroutine init_array(ni, nj, pi, pj, a)
 ; implicit none
diff --git a/polly/test/ScopInfo/multidim_fortran_2d_params.ll b/polly/test/ScopInfo/multidim_fortran_2d_params.ll
index 992df969f9cc2..a7f7ebc130362 100644
--- a/polly/test/ScopInfo/multidim_fortran_2d_params.ll
+++ b/polly/test/ScopInfo/multidim_fortran_2d_params.ll
@@ -1,4 +1,6 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output -polly-precise-fold-accesses -polly-invariant-load-hoisting=true < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output \
+; RUN: -polly-precise-fold-accesses \
+; RUN: -polly-invariant-load-hoisting=true < %s 2>&1 | FileCheck %s
 
 ; subroutine init_array(ni, nj, pi, pj, a)
 ; implicit none
diff --git a/polly/test/ScopInfo/multidim_fortran_2d_with_modref_call.ll b/polly/test/ScopInfo/multidim_fortran_2d_with_modref_call.ll
index 79fd4c286745e..5f3080a12fdbe 100644
--- a/polly/test/ScopInfo/multidim_fortran_2d_with_modref_call.ll
+++ b/polly/test/ScopInfo/multidim_fortran_2d_with_modref_call.ll
@@ -1,5 +1,9 @@
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<scops>' -polly-print-scops -polly-allow-modref-calls -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<scops>' -polly-print-scops -polly-allow-nonaffine -polly-invariant-load-hoisting=true -polly-allow-modref-calls -disable-output < %s 2>&1 | FileCheck %s --check-prefix=NONAFFINE
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -polly-allow-modref-calls \
+; RUN: -polly-invariant-load-hoisting=true \
+; RUN: -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -polly-allow-nonaffine \
+; RUN: -polly-invariant-load-hoisting=true \
+; RUN: -polly-allow-modref-calls -disable-output < %s 2>&1 | FileCheck %s --check-prefix=NONAFFINE
 
 ; TODO: We should delinearize the accesses despite the use in a call to a
 ; readonly function. For now we verify we do not delinearize them though.
diff --git a/polly/test/ScopInfo/multidim_fortran_srem.ll b/polly/test/ScopInfo/multidim_fortran_srem.ll
index 62ff184f7a6b6..31cc633fa65c6 100644
--- a/polly/test/ScopInfo/multidim_fortran_srem.ll
+++ b/polly/test/ScopInfo/multidim_fortran_srem.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 target datalayout = "e-p:64:64:64-S128-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f16:16:16-f32:32:32-f64:64:64-f128:128:128-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
 
 ; CHECK: Statements {
diff --git a/polly/test/ScopInfo/multidim_gep_pointercast.ll b/polly/test/ScopInfo/multidim_gep_pointercast.ll
index aa7932fb737f0..fd8048b11f14b 100644
--- a/polly/test/ScopInfo/multidim_gep_pointercast.ll
+++ b/polly/test/ScopInfo/multidim_gep_pointercast.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; The load access to A has a pointer-bitcast to another elements size before the
 ; GetElementPtr. Verify that we do not the GEP delinearization because it
diff --git a/polly/test/ScopInfo/multidim_gep_pointercast2.ll b/polly/test/ScopInfo/multidim_gep_pointercast2.ll
index 0475506fa9f1a..9daae4b1ce3db 100644
--- a/polly/test/ScopInfo/multidim_gep_pointercast2.ll
+++ b/polly/test/ScopInfo/multidim_gep_pointercast2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; Verify that we do not use the GetElementPtr information to delinearize A
 ; because of the cast in-between. Use the single-dimensional modeling instead.
diff --git a/polly/test/ScopInfo/multidim_invalid_dimension.ll b/polly/test/ScopInfo/multidim_invalid_dimension.ll
index 1cf79f1bd8de1..e1ec2e1ce3be0 100644
--- a/polly/test/ScopInfo/multidim_invalid_dimension.ll
+++ b/polly/test/ScopInfo/multidim_invalid_dimension.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s | FileCheck %s
 
 target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
 target triple = "aarch64-unknown-linux-gnueabi"
diff --git a/polly/test/ScopInfo/multidim_ivs_and_integer_offsets_3d.ll b/polly/test/ScopInfo/multidim_ivs_and_integer_offsets_3d.ll
index 7779748c8c7f6..92b42a9e7a870 100644
--- a/polly/test/ScopInfo/multidim_ivs_and_integer_offsets_3d.ll
+++ b/polly/test/ScopInfo/multidim_ivs_and_integer_offsets_3d.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 
 ; void foo(long n, long m, long o, double A[n][m][o]) {
diff --git a/polly/test/ScopInfo/multidim_ivs_and_parameteric_offsets_3d.ll b/polly/test/ScopInfo/multidim_ivs_and_parameteric_offsets_3d.ll
index 49e0a9b60657b..261cba1e68aad 100644
--- a/polly/test/ScopInfo/multidim_ivs_and_parameteric_offsets_3d.ll
+++ b/polly/test/ScopInfo/multidim_ivs_and_parameteric_offsets_3d.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-precise-fold-accesses '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-precise-fold-accesses '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 
 ; void foo(long n, long m, long o, double A[n][m][o], long p, long q, long r) {
diff --git a/polly/test/ScopInfo/multidim_many_references.ll b/polly/test/ScopInfo/multidim_many_references.ll
index a4edc9e725ac4..f0f1c2b1f39db 100644
--- a/polly/test/ScopInfo/multidim_many_references.ll
+++ b/polly/test/ScopInfo/multidim_many_references.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-ignore-aliasing -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-ignore-aliasing -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-ignore-aliasing -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-ignore-aliasing -disable-output < %s 2>&1 | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
diff --git a/polly/test/ScopInfo/multidim_nested_start_integer.ll b/polly/test/ScopInfo/multidim_nested_start_integer.ll
index c98aece41a9e1..6ee9798a050d7 100644
--- a/polly/test/ScopInfo/multidim_nested_start_integer.ll
+++ b/polly/test/ScopInfo/multidim_nested_start_integer.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 
 ; void foo(long n, long m, long o, double A[n][m][o]) {
diff --git a/polly/test/ScopInfo/multidim_nested_start_share_parameter.ll b/polly/test/ScopInfo/multidim_nested_start_share_parameter.ll
index 12c8d97f5d63b..e238bddf4783b 100644
--- a/polly/test/ScopInfo/multidim_nested_start_share_parameter.ll
+++ b/polly/test/ScopInfo/multidim_nested_start_share_parameter.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 
 ; void foo(long n, long m, long o, double A[n][m][o]) {
diff --git a/polly/test/ScopInfo/multidim_only_ivs_2d.ll b/polly/test/ScopInfo/multidim_only_ivs_2d.ll
index a9685d12eb178..33b321716edc3 100644
--- a/polly/test/ScopInfo/multidim_only_ivs_2d.ll
+++ b/polly/test/ScopInfo/multidim_only_ivs_2d.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 
 ; Derived from the following code:
diff --git a/polly/test/ScopInfo/multidim_only_ivs_3d.ll b/polly/test/ScopInfo/multidim_only_ivs_3d.ll
index bb9c302eaf06a..39ea4243d9426 100644
--- a/polly/test/ScopInfo/multidim_only_ivs_3d.ll
+++ b/polly/test/ScopInfo/multidim_only_ivs_3d.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 
 ; void foo(long n, long m, long o, double A[n][m][o]) {
diff --git a/polly/test/ScopInfo/multidim_only_ivs_3d_cast.ll b/polly/test/ScopInfo/multidim_only_ivs_3d_cast.ll
index 7f0c8b12be9ba..7f7f7f91067e2 100644
--- a/polly/test/ScopInfo/multidim_only_ivs_3d_cast.ll
+++ b/polly/test/ScopInfo/multidim_only_ivs_3d_cast.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 
 ; void foo(int n, int m, int o, double A[n][m][o]) {
 ;
diff --git a/polly/test/ScopInfo/multidim_only_ivs_3d_reverse.ll b/polly/test/ScopInfo/multidim_only_ivs_3d_reverse.ll
index 797a037a6770e..1675110ffd6f1 100644
--- a/polly/test/ScopInfo/multidim_only_ivs_3d_reverse.ll
+++ b/polly/test/ScopInfo/multidim_only_ivs_3d_reverse.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 
 ; This test case checks for array access functions where the order in which the
diff --git a/polly/test/ScopInfo/multidim_param_in_subscript-2.ll b/polly/test/ScopInfo/multidim_param_in_subscript-2.ll
index 3a21702b36727..da9827fd5f2c6 100644
--- a/polly/test/ScopInfo/multidim_param_in_subscript-2.ll
+++ b/polly/test/ScopInfo/multidim_param_in_subscript-2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-precise-fold-accesses '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-precise-fold-accesses '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; void foo(long n, long m, float A[][n][m]) {
 ; for (long i = 0; i < 100; i++)
diff --git a/polly/test/ScopInfo/multidim_param_in_subscript.ll b/polly/test/ScopInfo/multidim_param_in_subscript.ll
index cc3fa87c8ba04..c86b5f0ae2386 100644
--- a/polly/test/ScopInfo/multidim_param_in_subscript.ll
+++ b/polly/test/ScopInfo/multidim_param_in_subscript.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ;
 ; void foo(long n, float A[][n]) {
diff --git a/polly/test/ScopInfo/multidim_parameter_addrec_product.ll b/polly/test/ScopInfo/multidim_parameter_addrec_product.ll
index 117671ddc6a22..da563a05560cd 100644
--- a/polly/test/ScopInfo/multidim_parameter_addrec_product.ll
+++ b/polly/test/ScopInfo/multidim_parameter_addrec_product.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-invariant-load-hoisting=true '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-invariant-load-hoisting=true '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; void foo(float *A, long *p) {
 ; for (long i = 0; i < 100; i++)
diff --git a/polly/test/ScopInfo/multidim_single_and_multidim_array.ll b/polly/test/ScopInfo/multidim_single_and_multidim_array.ll
index 5ebe0daaec470..7059e5396987b 100644
--- a/polly/test/ScopInfo/multidim_single_and_multidim_array.ll
+++ b/polly/test/ScopInfo/multidim_single_and_multidim_array.ll
@@ -1,11 +1,11 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-delinearize=false -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-delinearize=false -polly-allow-nonaffine -disable-output < %s 2>&1 | FileCheck %s --check-prefix=NONAFFINE
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s --check-prefix=DELIN
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-allow-nonaffine -disable-output < %s 2>&1 | FileCheck %s --check-prefix=DELIN
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-delinearize=false -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly
'-passes=polly-custom<scops>' -polly-print-scops -polly-delinearize=false -polly-allow-nonaffine -disable-output < %s 2>&1 | FileCheck %s --check-prefix=NONAFFINE -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s --check-prefix=DELIN -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-allow-nonaffine -disable-output < %s 2>&1 | FileCheck %s --check-prefix=DELIN +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-delinearize=false -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-delinearize=false -polly-allow-nonaffine -disable-output < %s 2>&1 | FileCheck %s --check-prefix=NONAFFINE +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=DELIN +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-allow-nonaffine -disable-output < %s 2>&1 | FileCheck %s --check-prefix=DELIN +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-delinearize=false -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-delinearize=false -polly-allow-nonaffine -disable-output < %s 2>&1 | FileCheck %s --check-prefix=NONAFFINE +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=DELIN +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-allow-nonaffine -disable-output < %s 2>&1 | FileCheck %s --check-prefix=DELIN target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" diff --git a/polly/test/ScopInfo/multidim_srem.ll b/polly/test/ScopInfo/multidim_srem.ll index 5c1b0ea7e6150..88c8c6af648e0 100644 --- a/polly/test/ScopInfo/multidim_srem.ll +++ b/polly/test/ScopInfo/multidim_srem.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s ; ; void foo(long n, float A[][n][n]) { ; for (long i = 0; i < 200; i++) diff --git a/polly/test/ScopInfo/multidim_with_bitcast.ll b/polly/test/ScopInfo/multidim_with_bitcast.ll index 941ec637dba3d..0ab9c2d93ff46 100644 --- a/polly/test/ScopInfo/multidim_with_bitcast.ll +++ b/polly/test/ScopInfo/multidim_with_bitcast.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" diff --git a/polly/test/ScopInfo/multiple-binary-or-conditions.ll b/polly/test/ScopInfo/multiple-binary-or-conditions.ll index ecfc0012fd59f..65416e6fffda3 100644 --- a/polly/test/ScopInfo/multiple-binary-or-conditions.ll +++ b/polly/test/ScopInfo/multiple-binary-or-conditions.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s -; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -disable-output < %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -passes=polly-codegen -disable-output < %s ; ; void or(float *A, long n, long m) { ; for (long i = 0; i < 100; i++) { 
diff --git a/polly/test/ScopInfo/multiple-types-access-offset-not-dividable-by-element-size.ll b/polly/test/ScopInfo/multiple-types-access-offset-not-dividable-by-element-size.ll index 9ae664fd497c8..910e624adb50a 100644 --- a/polly/test/ScopInfo/multiple-types-access-offset-not-dividable-by-element-size.ll +++ b/polly/test/ScopInfo/multiple-types-access-offset-not-dividable-by-element-size.ll @@ -1,4 +1,6 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<scops>' -polly-print-scops -pass-remarks-analysis=polly-scops -polly-allow-differing-element-types -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -pass-remarks-analysis="polly-scops" \ +; RUN: -polly-allow-differing-element-types \ +; RUN: -disable-output < %s 2>&1 | FileCheck %s ; ; // For the following accesses the offset expression from the base pointer ; // is not always a multiple of the type size. diff --git a/polly/test/ScopInfo/multiple-types-non-affine-2.ll b/polly/test/ScopInfo/multiple-types-non-affine-2.ll index 6530dbf8d75be..cb0630da1b2e6 100644 --- a/polly/test/ScopInfo/multiple-types-non-affine-2.ll +++ b/polly/test/ScopInfo/multiple-types-non-affine-2.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-allow-differing-element-types '-passes=polly-custom<scops>' -polly-print-scops -polly-allow-nonaffine -disable-output < %s 2>&1 | FileCheck %s -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-allow-differing-element-types '-passes=polly<no-default-opts>' -polly-allow-nonaffine -disable-output +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-allow-differing-element-types '-passes=print<polly-function-scops>' -polly-allow-nonaffine -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-allow-differing-element-types -passes=polly-codegen -polly-allow-nonaffine -disable-output ; ; // Check that accessing one array with different types works, ; // even though some accesses are non-affine. diff --git a/polly/test/ScopInfo/multiple-types-non-affine.ll b/polly/test/ScopInfo/multiple-types-non-affine.ll index 7f5f995fd6d26..7349c5ae48ba2 100644 --- a/polly/test/ScopInfo/multiple-types-non-affine.ll +++ b/polly/test/ScopInfo/multiple-types-non-affine.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-allow-differing-element-types '-passes=polly-custom<scops>' -polly-print-scops -polly-allow-nonaffine -disable-output < %s 2>&1 | FileCheck %s -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-allow-differing-element-types '-passes=polly<no-default-opts>' -polly-allow-nonaffine -disable-output +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-allow-differing-element-types '-passes=print<polly-function-scops>' -polly-allow-nonaffine -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-allow-differing-element-types -passes=polly-codegen -polly-allow-nonaffine -disable-output ; ; // Check that accessing one array with different types works, ; // even though some accesses are non-affine. 
diff --git a/polly/test/ScopInfo/multiple-types-non-power-of-two-2.ll b/polly/test/ScopInfo/multiple-types-non-power-of-two-2.ll index 5890a5a2ea3bf..df280c88f8668 100644 --- a/polly/test/ScopInfo/multiple-types-non-power-of-two-2.ll +++ b/polly/test/ScopInfo/multiple-types-non-power-of-two-2.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-allow-differing-element-types -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-allow-differing-element-types -disable-output < %s 2>&1 | FileCheck %s ; ; void multiple_types(i8 *A) { ; for (long i = 0; i < 100; i++) { diff --git a/polly/test/ScopInfo/multiple-types-non-power-of-two.ll b/polly/test/ScopInfo/multiple-types-non-power-of-two.ll index 3e8390aad300f..b9494187d0ff3 100644 --- a/polly/test/ScopInfo/multiple-types-non-power-of-two.ll +++ b/polly/test/ScopInfo/multiple-types-non-power-of-two.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-allow-differing-element-types -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-allow-differing-element-types -disable-output < %s 2>&1 | FileCheck %s ; ; void multiple_types(i8 *A) { ; for (long i = 0; i < 100; i++) { diff --git a/polly/test/ScopInfo/multiple-types-two-dimensional-2.ll b/polly/test/ScopInfo/multiple-types-two-dimensional-2.ll index 4e71f9b5dd66b..e971ccc0ba448 100644 --- a/polly/test/ScopInfo/multiple-types-two-dimensional-2.ll +++ b/polly/test/ScopInfo/multiple-types-two-dimensional-2.ll @@ -1,4 +1,6 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -pass-remarks-analysis=polly-scops -polly-allow-differing-element-types -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -pass-remarks-analysis="polly-scops" \ +; RUN: -polly-allow-differing-element-types \ +; RUN: -disable-output < %s 2>&1 | FileCheck %s ; ; ; void foo(long n, long m, char A[][m]) { diff --git a/polly/test/ScopInfo/multiple-types-two-dimensional.ll b/polly/test/ScopInfo/multiple-types-two-dimensional.ll index 9899fe4bde7ed..34179508cae89 100644 --- a/polly/test/ScopInfo/multiple-types-two-dimensional.ll +++ b/polly/test/ScopInfo/multiple-types-two-dimensional.ll @@ -1,4 +1,6 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -pass-remarks-analysis=polly-scops -polly-allow-differing-element-types -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -pass-remarks-analysis="polly-scops" \ +; RUN: -polly-allow-differing-element-types \ +; RUN: -disable-output < %s 2>&1 | FileCheck %s ; ; void foo(long n, long m, char A[][m]) { ; for (long i = 0; i < n; i++) diff --git a/polly/test/ScopInfo/multiple-types.ll b/polly/test/ScopInfo/multiple-types.ll index 753386575d33a..84d7d3349e29d 100644 --- a/polly/test/ScopInfo/multiple-types.ll +++ b/polly/test/ScopInfo/multiple-types.ll @@ -1,4 +1,5 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<scops>' -polly-print-scops -polly-allow-differing-element-types -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' \ +; RUN: -polly-allow-differing-element-types -disable-output < %s 2>&1 | FileCheck %s ; ; // Check that accessing one array with different types works. 
; void multiple_types(char *Short, char *Float, char *Double) { diff --git a/polly/test/ScopInfo/multiple_exiting_blocks.ll b/polly/test/ScopInfo/multiple_exiting_blocks.ll index 218e5c4108c90..b0c425ee62cc4 100644 --- a/polly/test/ScopInfo/multiple_exiting_blocks.ll +++ b/polly/test/ScopInfo/multiple_exiting_blocks.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s ; ; The SCoP contains a loop with multiple exit blocks (BBs after leaving ; the loop). The current implementation of deriving their domain derives diff --git a/polly/test/ScopInfo/multiple_exiting_blocks_two_loop.ll b/polly/test/ScopInfo/multiple_exiting_blocks_two_loop.ll index d3a70fdb96130..ff0ec47be1c58 100644 --- a/polly/test/ScopInfo/multiple_exiting_blocks_two_loop.ll +++ b/polly/test/ScopInfo/multiple_exiting_blocks_two_loop.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s ; ; The SCoP contains a loop with multiple exit blocks (BBs after leaving ; the loop). The current implementation of deriving their domain derives diff --git a/polly/test/ScopInfo/multiple_latch_blocks.ll b/polly/test/ScopInfo/multiple_latch_blocks.ll index 0aa25f4ad70f6..e5085daa2ca16 100644 --- a/polly/test/ScopInfo/multiple_latch_blocks.ll +++ b/polly/test/ScopInfo/multiple_latch_blocks.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s ; ; CHECK: Domain := ; CHECK: [N, P] -> { Stmt_if_end[i0] : 0 <= i0 < N and (i0 > P or i0 < P) }; diff --git a/polly/test/ScopInfo/nested-loops.ll b/polly/test/ScopInfo/nested-loops.ll index 7998a3896d9d4..91002979f4fa4 100644 --- a/polly/test/ScopInfo/nested-loops.ll +++ b/polly/test/ScopInfo/nested-loops.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128" diff --git a/polly/test/ScopInfo/no-scalar-deps-in-non-affine-subregion.ll b/polly/test/ScopInfo/no-scalar-deps-in-non-affine-subregion.ll index f1ad40baf33ea..df010846bed20 100644 --- a/polly/test/ScopInfo/no-scalar-deps-in-non-affine-subregion.ll +++ b/polly/test/ScopInfo/no-scalar-deps-in-non-affine-subregion.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s ; ; Check that we do not generate any scalar dependences regarding x. 
It is ; defined and used on the non-affine subregion only, thus we do not need diff --git a/polly/test/ScopInfo/non-affine-region-phi.ll b/polly/test/ScopInfo/non-affine-region-phi.ll index 0248004c27f50..3fb655e60f1c0 100644 --- a/polly/test/ScopInfo/non-affine-region-phi.ll +++ b/polly/test/ScopInfo/non-affine-region-phi.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly -polly-allow-nonaffine '-passes=polly-custom<scops>' -polly-print-scops -S < %s 2>&1 | FileCheck %s --check-prefix=CODE -; RUN: opt %loadNPMPolly -polly-allow-nonaffine '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-allow-nonaffine -passes=polly-codegen -S < %s 2>&1 | FileCheck %s --check-prefix=CODE +; RUN: opt %loadNPMPolly -polly-allow-nonaffine '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s ; ; Verify there is a phi in the non-affine region but it is not represented in ; the SCoP as all operands as well as the uses are inside the region too. diff --git a/polly/test/ScopInfo/non-affine-region-with-loop-2.ll b/polly/test/ScopInfo/non-affine-region-with-loop-2.ll index 158fe772c6d29..4c3ca4d21447d 100644 --- a/polly/test/ScopInfo/non-affine-region-with-loop-2.ll +++ b/polly/test/ScopInfo/non-affine-region-with-loop-2.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-allow-nonaffine-loops '-passes=polly-custom<detect>' -polly-print-detect -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-allow-nonaffine-loops '-passes=print<polly-detect>,print<polly-function-scops>,scop(polly-codegen)' -disable-output < %s 2>&1 | FileCheck %s ; ; CHECK: Stmt_loop3 ; CHECK: Domain := diff --git a/polly/test/ScopInfo/non-affine-region-with-loop.ll b/polly/test/ScopInfo/non-affine-region-with-loop.ll index bcb542f2cbf70..f4c028ac23409 100644 --- a/polly/test/ScopInfo/non-affine-region-with-loop.ll +++ b/polly/test/ScopInfo/non-affine-region-with-loop.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly -polly-allow-nonaffine-loops '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s -; RUN: opt %loadNPMPolly -polly-allow-nonaffine-loops '-passes=polly<no-default-opts>' -disable-output +; RUN: opt %loadNPMPolly -polly-allow-nonaffine-loops '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-allow-nonaffine-loops -passes=polly-codegen -disable-output ; ; CHECK: Domain := ; CHECK-NEXT: { Stmt_loop2__TO__loop[] }; diff --git a/polly/test/ScopInfo/non-precise-inv-load-1.ll b/polly/test/ScopInfo/non-precise-inv-load-1.ll index d100b514a0be3..d55344b355f13 100644 --- a/polly/test/ScopInfo/non-precise-inv-load-1.ll +++ b/polly/test/ScopInfo/non-precise-inv-load-1.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s ; ; Verify we do hoist the invariant access to I with an execution context ; as the address computation might wrap in the original but not in our diff --git a/polly/test/ScopInfo/non-precise-inv-load-2.ll b/polly/test/ScopInfo/non-precise-inv-load-2.ll index fad8fcd918446..79ef3b88cb4f0 100644 --- a/polly/test/ScopInfo/non-precise-inv-load-2.ll +++ b/polly/test/ScopInfo/non-precise-inv-load-2.ll @@ -1,4 +1,4 @@ -; RUN:
opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s ; ; ; CHECK: Invariant Accesses: { diff --git a/polly/test/ScopInfo/non-precise-inv-load-3.ll b/polly/test/ScopInfo/non-precise-inv-load-3.ll index d032644c9e5ff..aa92847661165 100644 --- a/polly/test/ScopInfo/non-precise-inv-load-3.ll +++ b/polly/test/ScopInfo/non-precise-inv-load-3.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s ; ; CHECK: Invariant Accesses: { ; CHECK-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0] diff --git a/polly/test/ScopInfo/non-precise-inv-load-4.ll b/polly/test/ScopInfo/non-precise-inv-load-4.ll index c1ba7ddc62584..2a2241cb5a993 100644 --- a/polly/test/ScopInfo/non-precise-inv-load-4.ll +++ b/polly/test/ScopInfo/non-precise-inv-load-4.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s ; ; Verify we hoist I[0] without execution context even though it ; is executed in a statement with an invalid domain. diff --git a/polly/test/ScopInfo/non-precise-inv-load-5.ll b/polly/test/ScopInfo/non-precise-inv-load-5.ll index c188b5f74b1e9..a414c7c0fed17 100644 --- a/polly/test/ScopInfo/non-precise-inv-load-5.ll +++ b/polly/test/ScopInfo/non-precise-inv-load-5.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s ; ; Verify we do not hoist I[c] without execution context because it ; is executed in a statement with an invalid domain and it depends diff --git a/polly/test/ScopInfo/non-precise-inv-load-6.ll b/polly/test/ScopInfo/non-precise-inv-load-6.ll index b1c19745f1424..1300617f00eeb 100644 --- a/polly/test/ScopInfo/non-precise-inv-load-6.ll +++ b/polly/test/ScopInfo/non-precise-inv-load-6.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s ; ; Check that we model the execution context correctly. 
; diff --git a/polly/test/ScopInfo/non-pure-function-call.ll b/polly/test/ScopInfo/non-pure-function-call.ll index ad69141a12c66..81d43db5c3522 100644 --- a/polly/test/ScopInfo/non-pure-function-call.ll +++ b/polly/test/ScopInfo/non-pure-function-call.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s ; ; CHECK: Assumed Context: ; CHECK-NEXT: [N] -> { : } diff --git a/polly/test/ScopInfo/non-pure-function-calls-causes-dead-blocks.ll b/polly/test/ScopInfo/non-pure-function-calls-causes-dead-blocks.ll index 38e1c03a35227..6cbb41041be88 100644 --- a/polly/test/ScopInfo/non-pure-function-calls-causes-dead-blocks.ll +++ b/polly/test/ScopInfo/non-pure-function-calls-causes-dead-blocks.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s ; ; Error blocks are skipped during SCoP detection. We skip them during ; SCoP formation too as they might contain instructions we can not handle. diff --git a/polly/test/ScopInfo/non-pure-function-calls.ll b/polly/test/ScopInfo/non-pure-function-calls.ll index d45c32ede7088..f97644052272d 100644 --- a/polly/test/ScopInfo/non-pure-function-calls.ll +++ b/polly/test/ScopInfo/non-pure-function-calls.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s ; ; Allow the user to define function names that are treated as ; error functions and assumed not to be executed. diff --git a/polly/test/ScopInfo/non_affine_access.ll b/polly/test/ScopInfo/non_affine_access.ll index 0f5d9e7c43e4e..0338edf053297 100644 --- a/polly/test/ScopInfo/non_affine_access.ll +++ b/polly/test/ScopInfo/non_affine_access.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-detect -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-detect -polly-print-scops -polly-allow-nonaffine -disable-output < %s 2>&1 | FileCheck %s -check-prefix=NONAFFINE +; RUN: opt %loadNPMPolly '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-detect>,print<polly-function-scops>' -polly-allow-nonaffine -disable-output < %s 2>&1 | FileCheck %s -check-prefix=NONAFFINE target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" ; void foo(long *A) { diff --git a/polly/test/ScopInfo/non_affine_region_1.ll b/polly/test/ScopInfo/non_affine_region_1.ll index 5934962f81567..8980a711b325d 100644 --- a/polly/test/ScopInfo/non_affine_region_1.ll +++ b/polly/test/ScopInfo/non_affine_region_1.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-allow-nonaffine '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-allow-nonaffine '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s ; ; Verify only the incoming scalar x is modeled as a read in the non-affine ; region. 
diff --git a/polly/test/ScopInfo/non_affine_region_2.ll b/polly/test/ScopInfo/non_affine_region_2.ll index aa083616cac8e..b2e072f7a3bfa 100644 --- a/polly/test/ScopInfo/non_affine_region_2.ll +++ b/polly/test/ScopInfo/non_affine_region_2.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s ; ; Verify the scalar x defined in a non-affine subregion is written as it ; escapes the region. In this test the two conditionals inside the region diff --git a/polly/test/ScopInfo/non_affine_region_3.ll b/polly/test/ScopInfo/non_affine_region_3.ll index b7c4c1b9bd545..d850cb5c95aad 100644 --- a/polly/test/ScopInfo/non_affine_region_3.ll +++ b/polly/test/ScopInfo/non_affine_region_3.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s ; ; Verify the scalar x defined in a non-affine subregion is written as it ; escapes the region. In this test the two conditionals inside the region diff --git a/polly/test/ScopInfo/non_affine_region_4.ll b/polly/test/ScopInfo/non_affine_region_4.ll index 12cda0a53fb3b..c5309734a668e 100644 --- a/polly/test/ScopInfo/non_affine_region_4.ll +++ b/polly/test/ScopInfo/non_affine_region_4.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s ; ; Verify that both scalars (x and y) are properly written in the non-affine ; region and read afterwards. 
diff --git a/polly/test/ScopInfo/nonaffine-buildMemoryAccess.ll b/polly/test/ScopInfo/nonaffine-buildMemoryAccess.ll index a52aae0d59168..b1ce00f0df94e 100644 --- a/polly/test/ScopInfo/nonaffine-buildMemoryAccess.ll +++ b/polly/test/ScopInfo/nonaffine-buildMemoryAccess.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-allow-nonaffine-loops '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-allow-nonaffine-loops '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s ; ; CHECK: Domain := ; CHECK-NEXT: { Stmt_while_cond_i__TO__while_end_i[] }; diff --git a/polly/test/ScopInfo/not-a-reduction.ll b/polly/test/ScopInfo/not-a-reduction.ll index 84f6564ae4a2e..3a961b2dc1719 100644 --- a/polly/test/ScopInfo/not-a-reduction.ll +++ b/polly/test/ScopInfo/not-a-reduction.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | not FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | not FileCheck %s ;#define TYPE float ;#define NUM 4 diff --git a/polly/test/ScopInfo/opaque-struct.ll b/polly/test/ScopInfo/opaque-struct.ll index 23b9d3caf741d..f4f79525069e5 100644 --- a/polly/test/ScopInfo/opaque-struct.ll +++ b/polly/test/ScopInfo/opaque-struct.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s ; ; Check that we do not crash with unsized (opaque) types. ; diff --git a/polly/test/ScopInfo/out-of-scop-use-in-region-entry-phi-node-nonaffine-subregion.ll b/polly/test/ScopInfo/out-of-scop-use-in-region-entry-phi-node-nonaffine-subregion.ll index e069ccac55340..eed27b1c4d9dd 100644 --- a/polly/test/ScopInfo/out-of-scop-use-in-region-entry-phi-node-nonaffine-subregion.ll +++ b/polly/test/ScopInfo/out-of-scop-use-in-region-entry-phi-node-nonaffine-subregion.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s 2>&1 | FileCheck %s ; ; Check whether %newval is identified as escaping value, even though it is used ; in a phi that is in the region. Non-affine subregion case. 
diff --git a/polly/test/ScopInfo/out-of-scop-use-in-region-entry-phi-node.ll b/polly/test/ScopInfo/out-of-scop-use-in-region-entry-phi-node.ll index 27ea11a23a3fe..44da399e704d8 100644 --- a/polly/test/ScopInfo/out-of-scop-use-in-region-entry-phi-node.ll +++ b/polly/test/ScopInfo/out-of-scop-use-in-region-entry-phi-node.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s ; CHECK: MustWriteAccess := [Reduction Type: NONE] [Scalar: 1] ; CHECK-NEXT: [p_0] -> { Stmt_bb3[] -> MemRef_tmp5[] }; diff --git a/polly/test/ScopInfo/parameter-constant-division.ll b/polly/test/ScopInfo/parameter-constant-division.ll index aaad0dfb2ee60..e5dd359158b8b 100644 --- a/polly/test/ScopInfo/parameter-constant-division.ll +++ b/polly/test/ScopInfo/parameter-constant-division.ll @@ -1,4 +1,6 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' \ +; RUN: -polly-invariant-load-hoisting=true \ +; RUN: -disable-output < %s 2>&1 | FileCheck %s ; ; CHECK: Invariant Accesses: { ; CHECK-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0] diff --git a/polly/test/ScopInfo/parameter_in_dead_statement.ll b/polly/test/ScopInfo/parameter_in_dead_statement.ll index 444f9a9c24b4e..b295f17f628af 100644 --- a/polly/test/ScopInfo/parameter_in_dead_statement.ll +++ b/polly/test/ScopInfo/parameter_in_dead_statement.ll @@ -1,5 +1,7 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output -polly-invariant-load-hoisting=true < %s 2>&1 | FileCheck %s -; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -S -polly-invariant-load-hoisting=true < %s 2>&1 | FileCheck %s --check-prefix=IR +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output \ +; RUN: -polly-invariant-load-hoisting=true < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -passes=polly-codegen -S \ +; RUN: -polly-invariant-load-hoisting=true < %s 2>&1 | FileCheck %s --check-prefix=IR ; ; Verify we do not create assumptions based on the parameter p_1 which is the ; load %0 and due to error-assumptions not "part of the SCoP". 
diff --git a/polly/test/ScopInfo/parameter_product.ll b/polly/test/ScopInfo/parameter_product.ll index 9e6e3d0e1446e..2fe16f9d95f6d 100644 --- a/polly/test/ScopInfo/parameter_product.ll +++ b/polly/test/ScopInfo/parameter_product.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s ; ; int n, m; ; void foo(char* __restrict a) diff --git a/polly/test/ScopInfo/parameter_with_constant_factor_in_add.ll b/polly/test/ScopInfo/parameter_with_constant_factor_in_add.ll index 20986d17b8f0d..6544aaec76f74 100644 --- a/polly/test/ScopInfo/parameter_with_constant_factor_in_add.ll +++ b/polly/test/ScopInfo/parameter_with_constant_factor_in_add.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s ; ; Check that the access function of the store is simple and concise ; diff --git a/polly/test/ScopInfo/partially_invariant_load_1.ll b/polly/test/ScopInfo/partially_invariant_load_1.ll index 8d62f156a4394..f3923f6127cdd 100644 --- a/polly/test/ScopInfo/partially_invariant_load_1.ll +++ b/polly/test/ScopInfo/partially_invariant_load_1.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s -; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts>' -polly-invariant-load-hoisting=true -S < %s 2>&1 | FileCheck %s --check-prefix=IR +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true -S < %s 2>&1 | FileCheck %s --check-prefix=IR ; ; CHECK: Invariant Accesses: { ; CHECK-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0] diff --git a/polly/test/ScopInfo/partially_invariant_load_2.ll b/polly/test/ScopInfo/partially_invariant_load_2.ll index 48580907b2f0b..d0d74ad99e09b 100644 --- a/polly/test/ScopInfo/partially_invariant_load_2.ll +++ b/polly/test/ScopInfo/partially_invariant_load_2.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s ; ; Check that we do not try to preload *I and assume p != 42. ; diff --git a/polly/test/ScopInfo/phi-in-non-affine-region.ll b/polly/test/ScopInfo/phi-in-non-affine-region.ll index 6d98a6813862e..fbbc158b566bb 100644 --- a/polly/test/ScopInfo/phi-in-non-affine-region.ll +++ b/polly/test/ScopInfo/phi-in-non-affine-region.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s ; Verify that 'tmp' is stored in bb1 and read by bb3, as it is needed as ; incoming value for the tmp11 PHI node. 
diff --git a/polly/test/ScopInfo/phi_after_error_block.ll b/polly/test/ScopInfo/phi_after_error_block.ll index 251be099c1f49..a1eadff3e9717 100644 --- a/polly/test/ScopInfo/phi_after_error_block.ll +++ b/polly/test/ScopInfo/phi_after_error_block.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s declare void @bar() diff --git a/polly/test/ScopInfo/phi_condition_modeling_1.ll b/polly/test/ScopInfo/phi_condition_modeling_1.ll index bd5c51e968ff5..a889ec96a4b12 100644 --- a/polly/test/ScopInfo/phi_condition_modeling_1.ll +++ b/polly/test/ScopInfo/phi_condition_modeling_1.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s ; ; void f(int *A, int c, int N) { ; int tmp; diff --git a/polly/test/ScopInfo/phi_condition_modeling_2.ll b/polly/test/ScopInfo/phi_condition_modeling_2.ll index 281b8d33b7756..b56b77e1f4534 100644 --- a/polly/test/ScopInfo/phi_condition_modeling_2.ll +++ b/polly/test/ScopInfo/phi_condition_modeling_2.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s ; ; void f(int *A, int c, int N) { ; int tmp; diff --git a/polly/test/ScopInfo/phi_conditional_simple_1.ll b/polly/test/ScopInfo/phi_conditional_simple_1.ll index 6d7f0e9484113..14fdc38201bc8 100644 --- a/polly/test/ScopInfo/phi_conditional_simple_1.ll +++ b/polly/test/ScopInfo/phi_conditional_simple_1.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s ; ; void jd(int *A, int c) { ; for (int i = 0; i < 1024; i++) { diff --git a/polly/test/ScopInfo/phi_loop_carried_float.ll b/polly/test/ScopInfo/phi_loop_carried_float.ll index 2e62dcd5799a3..76e5507f24b06 100644 --- a/polly/test/ScopInfo/phi_loop_carried_float.ll +++ b/polly/test/ScopInfo/phi_loop_carried_float.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s ; ; float f(float *A, int N) { ; float tmp = 0; diff --git a/polly/test/ScopInfo/phi_not_grouped_at_top.ll b/polly/test/ScopInfo/phi_not_grouped_at_top.ll index 57d02f24f781b..c97d9a27b24b7 100644 --- a/polly/test/ScopInfo/phi_not_grouped_at_top.ll +++ b/polly/test/ScopInfo/phi_not_grouped_at_top.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<prepare>' -disable-output < %s +; RUN: opt %loadNPMPolly -passes=polly-prepare -disable-output < %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" declare i32 @funa() align 2 diff --git a/polly/test/ScopInfo/phi_scalar_simple_1.ll b/polly/test/ScopInfo/phi_scalar_simple_1.ll index 600c94e1d9b4c..ffd1a37f8a79f 100644 --- a/polly/test/ScopInfo/phi_scalar_simple_1.ll +++ 
b/polly/test/ScopInfo/phi_scalar_simple_1.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s ; ; The assumed context should be empty since the <nsw> flags on the IV ; increments already guarantee that there is no wrap in the loop trip diff --git a/polly/test/ScopInfo/phi_scalar_simple_2.ll b/polly/test/ScopInfo/phi_scalar_simple_2.ll index d3353ddc5e4e8..0d6d9029c61c3 100644 --- a/polly/test/ScopInfo/phi_scalar_simple_2.ll +++ b/polly/test/ScopInfo/phi_scalar_simple_2.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s ; ; int jd(int *restrict A, int x, int N, int c) { ; for (int i = 0; i < N; i++) diff --git a/polly/test/ScopInfo/phi_with_invoke_edge.ll b/polly/test/ScopInfo/phi_with_invoke_edge.ll index 1b01a98fca06a..9c98ec0c603cf 100644 --- a/polly/test/ScopInfo/phi_with_invoke_edge.ll +++ b/polly/test/ScopInfo/phi_with_invoke_edge.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s +; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" declare i32 @generic_personality_v0(i32, i64, ptr, ptr) diff --git a/polly/test/ScopInfo/pointer-comparison-no-nsw.ll b/polly/test/ScopInfo/pointer-comparison-no-nsw.ll index 1b983ace1b6a4..18ba18c69f1f9 100644 --- a/polly/test/ScopInfo/pointer-comparison-no-nsw.ll +++ b/polly/test/ScopInfo/pointer-comparison-no-nsw.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s ; ; void f(int *A, int *B) { ; while (A != B) { diff --git a/polly/test/ScopInfo/pointer-comparison.ll b/polly/test/ScopInfo/pointer-comparison.ll index f80c4978669c4..846640ac630ff 100644 --- a/polly/test/ScopInfo/pointer-comparison.ll +++ b/polly/test/ScopInfo/pointer-comparison.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s ; ; TODO: FIXME: Investigate why we need a InvalidContext here. 
; diff --git a/polly/test/ScopInfo/pointer-type-expressions.ll b/polly/test/ScopInfo/pointer-type-expressions.ll index 0fdd0bea6f219..89dce6536a107 100644 --- a/polly/test/ScopInfo/pointer-type-expressions.ll +++ b/polly/test/ScopInfo/pointer-type-expressions.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s ; void f(int a[], int N, float *P) { ; int i; diff --git a/polly/test/ScopInfo/pointer-used-as-base-pointer-and-scalar-read.ll b/polly/test/ScopInfo/pointer-used-as-base-pointer-and-scalar-read.ll index 8ad531d93d290..7b6d0d542581b 100644 --- a/polly/test/ScopInfo/pointer-used-as-base-pointer-and-scalar-read.ll +++ b/polly/test/ScopInfo/pointer-used-as-base-pointer-and-scalar-read.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s ; In this test case we pass a pointer %A into a PHI node and also use this ; pointer as base pointer of an array store. As a result, we get both scalar diff --git a/polly/test/ScopInfo/polly-timeout-parameter-bounds.ll b/polly/test/ScopInfo/polly-timeout-parameter-bounds.ll index 7dfa1ec7905ba..13087a517501a 100644 --- a/polly/test/ScopInfo/polly-timeout-parameter-bounds.ll +++ b/polly/test/ScopInfo/polly-timeout-parameter-bounds.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s ; CHECK: Statements { ; CHECK-NEXT: Stmt_bb9 diff --git a/polly/test/ScopInfo/pr38218.ll b/polly/test/ScopInfo/pr38218.ll index 2c22b1464876d..74103f9a2ac38 100644 --- a/polly/test/ScopInfo/pr38218.ll +++ b/polly/test/ScopInfo/pr38218.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s ; ; This code causes the SCoP to be rejected because of an ERRORBLOCK ; assumption and made Polly crash (llvm.org/PR38219).
diff --git a/polly/test/ScopInfo/preserve-equiv-class-order-in-basic_block.ll b/polly/test/ScopInfo/preserve-equiv-class-order-in-basic_block.ll index 800b0339a1422..33fa0126aa30e 100644 --- a/polly/test/ScopInfo/preserve-equiv-class-order-in-basic_block.ll +++ b/polly/test/ScopInfo/preserve-equiv-class-order-in-basic_block.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=scalar-indep -polly-print-instructions '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly -polly-stmt-granularity=scalar-indep -polly-print-instructions '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s -match-full-lines target datalayout = "e-m:w-i64:64-f80:128-n8:16:32:64-S128" diff --git a/polly/test/ScopInfo/process_added_dimensions.ll b/polly/test/ScopInfo/process_added_dimensions.ll index 9cb932eeef18a..2d06f4b995976 100644 --- a/polly/test/ScopInfo/process_added_dimensions.ll +++ b/polly/test/ScopInfo/process_added_dimensions.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s ; CHECK: Context: ; CHECK-NEXT: { : } diff --git a/polly/test/ScopInfo/pwaff-complexity-bailout.ll b/polly/test/ScopInfo/pwaff-complexity-bailout.ll index 62909f8c3e4c5..931e08fb8f2fc 100644 --- a/polly/test/ScopInfo/pwaff-complexity-bailout.ll +++ b/polly/test/ScopInfo/pwaff-complexity-bailout.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops '-pass-remarks-analysis=.*' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -pass-remarks-analysis=.* -disable-output < %s 2>&1 | FileCheck %s ; Make sure we hit the complexity bailout, and don't crash. 
; CHECK: Low complexity assumption: { : false } diff --git a/polly/test/ScopInfo/ranged_parameter.ll b/polly/test/ScopInfo/ranged_parameter.ll index a6e51c7f2048c..03562b1fd1245 100644 --- a/polly/test/ScopInfo/ranged_parameter.ll +++ b/polly/test/ScopInfo/ranged_parameter.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s ; ; Check that the constraints on the parameter derived from the ; range metadata (see bottom of the file) are present: diff --git a/polly/test/ScopInfo/ranged_parameter_2.ll b/polly/test/ScopInfo/ranged_parameter_2.ll index 554dd6e38cd00..18cbbf3b87cd6 100644 --- a/polly/test/ScopInfo/ranged_parameter_2.ll +++ b/polly/test/ScopInfo/ranged_parameter_2.ll @@ -1,4 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output -polly-allow-nonaffine -polly-invariant-load-hoisting=true -debug < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output -polly-allow-nonaffine -polly-invariant-load-hoisting=true < %s \ +; RUN: -debug 2>&1 | FileCheck %s ; REQUIRES: asserts diff --git a/polly/test/ScopInfo/ranged_parameter_wrap.ll b/polly/test/ScopInfo/ranged_parameter_wrap.ll index 7ae15c34c94c6..d236eeeefc11c 100644 --- a/polly/test/ScopInfo/ranged_parameter_wrap.ll +++ b/polly/test/ScopInfo/ranged_parameter_wrap.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s ; ; Check that the constraints on the parameter derived from the ; __wrapping__ range metadata (see bottom of the file) are present: diff --git a/polly/test/ScopInfo/ranged_parameter_wrap_2.ll b/polly/test/ScopInfo/ranged_parameter_wrap_2.ll index 00c3caa9c50ce..fc0a737a5edbe 100644 --- a/polly/test/ScopInfo/ranged_parameter_wrap_2.ll +++ b/polly/test/ScopInfo/ranged_parameter_wrap_2.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s ; ; Check that the context is built fast and does not explode due to us ; combining a large number of non-convex ranges. 
Instead, after a certain diff --git a/polly/test/ScopInfo/read-only-scalar-used-in-phi-2.ll b/polly/test/ScopInfo/read-only-scalar-used-in-phi-2.ll index 528dbb102ecb0..7e6f2406a0ac8 100644 --- a/polly/test/ScopInfo/read-only-scalar-used-in-phi-2.ll +++ b/polly/test/ScopInfo/read-only-scalar-used-in-phi-2.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s ; ; float foo(float sum, float A[]) { ; diff --git a/polly/test/ScopInfo/read-only-scalar-used-in-phi.ll b/polly/test/ScopInfo/read-only-scalar-used-in-phi.ll index 6bc1fe71f35f2..18e6c1fac9e15 100644 --- a/polly/test/ScopInfo/read-only-scalar-used-in-phi.ll +++ b/polly/test/ScopInfo/read-only-scalar-used-in-phi.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s ; ; float foo(float sum, float A[]) { ; diff --git a/polly/test/ScopInfo/read-only-scalars.ll b/polly/test/ScopInfo/read-only-scalars.ll index 7c78d621930c5..f04163e480284 100644 --- a/polly/test/ScopInfo/read-only-scalars.ll +++ b/polly/test/ScopInfo/read-only-scalars.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-analyze-read-only-scalars=false '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-analyze-read-only-scalars=true '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s -check-prefix=SCALARS +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-analyze-read-only-scalars=false '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-analyze-read-only-scalars=true '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s -check-prefix=SCALARS ; CHECK-NOT: Memref_scalar diff --git a/polly/test/ScopInfo/read-only-statements.ll b/polly/test/ScopInfo/read-only-statements.ll index c1cb618a45f64..7bac53a2b6b51 100644 --- a/polly/test/ScopInfo/read-only-statements.ll +++ b/polly/test/ScopInfo/read-only-statements.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s ; ; Check we remove read only statements. 
; diff --git a/polly/test/ScopInfo/reduction_alternating_base.ll b/polly/test/ScopInfo/reduction_alternating_base.ll index 474c6ac64ffc1..e38ff6046ac01 100644 --- a/polly/test/ScopInfo/reduction_alternating_base.ll +++ b/polly/test/ScopInfo/reduction_alternating_base.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s ; ; ; void f(int *A) { diff --git a/polly/test/ScopInfo/reduction_chain_partially_outside_the_scop.ll b/polly/test/ScopInfo/reduction_chain_partially_outside_the_scop.ll index e91eeaf544a05..17f9dc57f2823 100644 --- a/polly/test/ScopInfo/reduction_chain_partially_outside_the_scop.ll +++ b/polly/test/ScopInfo/reduction_chain_partially_outside_the_scop.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s ; ; CHECK: Reduction Type: NONE ; diff --git a/polly/test/ScopInfo/reduction_different_index.ll b/polly/test/ScopInfo/reduction_different_index.ll index 5c169f71f4fe8..d2786d5fd6779 100644 --- a/polly/test/ScopInfo/reduction_different_index.ll +++ b/polly/test/ScopInfo/reduction_different_index.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s ; Verify if the following case is not detected as reduction. ; ; void f(int *A,int *sum) { diff --git a/polly/test/ScopInfo/reduction_different_index1.ll b/polly/test/ScopInfo/reduction_different_index1.ll index 93ab77be84de9..710ae3e74f21a 100644 --- a/polly/test/ScopInfo/reduction_different_index1.ll +++ b/polly/test/ScopInfo/reduction_different_index1.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s ; Verify if the following case is not detected as reduction. 
 ;
 ; void f(int *A, int *sum, int i1, int i2) {
diff --git a/polly/test/ScopInfo/reduction_disabled_multiplicative.ll b/polly/test/ScopInfo/reduction_disabled_multiplicative.ll
index 618e4d3ab3f98..61228e075dabe 100644
--- a/polly/test/ScopInfo/reduction_disabled_multiplicative.ll
+++ b/polly/test/ScopInfo/reduction_disabled_multiplicative.ll
@@ -1,4 +1,4 @@
-; RUN: opt -aa-pipeline=basic-aa %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<scops>' -polly-print-scops -polly-disable-multiplicative-reductions -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt -aa-pipeline=basic-aa %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -polly-disable-multiplicative-reductions -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; CHECK: ReadAccess := [Reduction Type: +
 ; CHECK: { Stmt_for_body[i0] -> MemRef_sum[0] };
diff --git a/polly/test/ScopInfo/reduction_double.ll b/polly/test/ScopInfo/reduction_double.ll
index a7721d1b42e46..d126d3d833ee1 100644
--- a/polly/test/ScopInfo/reduction_double.ll
+++ b/polly/test/ScopInfo/reduction_double.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output -polly-allow-nonaffine < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-function-scops>' -disable-output -polly-allow-nonaffine < %s 2>&1 | FileCheck %s
 ;
 ; Verify if two independent reductions in same loop is detected
 ;
diff --git a/polly/test/ScopInfo/reduction_escaping_intermediate.ll b/polly/test/ScopInfo/reduction_escaping_intermediate.ll
index 86923458ee773..c66a8be0852fa 100644
--- a/polly/test/ScopInfo/reduction_escaping_intermediate.ll
+++ b/polly/test/ScopInfo/reduction_escaping_intermediate.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; void f(int N, int * restrict sums, int * restrict escape) {
 ; int i, j;
diff --git a/polly/test/ScopInfo/reduction_escaping_intermediate_2.ll b/polly/test/ScopInfo/reduction_escaping_intermediate_2.ll
index 641d2e7337e77..c574d315b2fe1 100644
--- a/polly/test/ScopInfo/reduction_escaping_intermediate_2.ll
+++ b/polly/test/ScopInfo/reduction_escaping_intermediate_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-stmt-granularity=bb '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; void f(int N, int * restrict sums, int * restrict escape) {
 ; int i, j;
diff --git a/polly/test/ScopInfo/reduction_escaping_intermediate_3.ll b/polly/test/ScopInfo/reduction_escaping_intermediate_3.ll
index dd2a76ebbd368..92a071ea1c372 100644
--- a/polly/test/ScopInfo/reduction_escaping_intermediate_3.ll
+++ b/polly/test/ScopInfo/reduction_escaping_intermediate_3.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; void f(int N, int * restrict sums, int * restrict escape) {
 ; int i, j;
diff --git a/polly/test/ScopInfo/reduction_if.ll b/polly/test/ScopInfo/reduction_if.ll
index 53a62a3b857e9..4f7d3681e0a0b 100644
--- a/polly/test/ScopInfo/reduction_if.ll
+++ b/polly/test/ScopInfo/reduction_if.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output -polly-allow-nonaffine < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-function-scops>' -disable-output -polly-allow-nonaffine < %s 2>&1 | FileCheck %s
 ;
 ; Verify if reduction spread across multiple blocks in a single scop statement are detected
 ;
diff --git a/polly/test/ScopInfo/reduction_indirect_access.ll b/polly/test/ScopInfo/reduction_indirect_access.ll
index cb54cd9581368..7acac4b150f40 100644
--- a/polly/test/ScopInfo/reduction_indirect_access.ll
+++ b/polly/test/ScopInfo/reduction_indirect_access.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-allow-nonaffine -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-function-scops>' -polly-allow-nonaffine -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; CHECK: Reduction Type: NONE
 ; CHECK: MemRef_INDICES[i0]
diff --git a/polly/test/ScopInfo/reduction_indirect_access_2.ll b/polly/test/ScopInfo/reduction_indirect_access_2.ll
index 5642a8470f124..331953991d86c 100644
--- a/polly/test/ScopInfo/reduction_indirect_access_2.ll
+++ b/polly/test/ScopInfo/reduction_indirect_access_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output -polly-allow-nonaffine < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-function-scops>' -disable-output -polly-allow-nonaffine < %s 2>&1 | FileCheck %s
 ;
 ; Validate that the accesses to INDICES[i] is not part of a reduction.
 ;
diff --git a/polly/test/ScopInfo/reduction_invalid_different_operators.ll b/polly/test/ScopInfo/reduction_invalid_different_operators.ll
index 9e6b3cd431083..9846f1029c087 100644
--- a/polly/test/ScopInfo/reduction_invalid_different_operators.ll
+++ b/polly/test/ScopInfo/reduction_invalid_different_operators.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; int f() {
 ; int i, sum = 0, sth = 0;
diff --git a/polly/test/ScopInfo/reduction_invalid_overlapping_accesses.ll b/polly/test/ScopInfo/reduction_invalid_overlapping_accesses.ll
index 7ae7d8ed3ffa2..4d70e53304556 100644
--- a/polly/test/ScopInfo/reduction_invalid_overlapping_accesses.ll
+++ b/polly/test/ScopInfo/reduction_invalid_overlapping_accesses.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; void f(int *sums) {
 ; int i, j;
diff --git a/polly/test/ScopInfo/reduction_long_reduction_chain.ll b/polly/test/ScopInfo/reduction_long_reduction_chain.ll
index 6f2f48005bdac..62ae1fef187b6 100644
--- a/polly/test/ScopInfo/reduction_long_reduction_chain.ll
+++ b/polly/test/ScopInfo/reduction_long_reduction_chain.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; CHECK: Reduction Type: +
 ; CHECK: MemRef_sum
diff --git a/polly/test/ScopInfo/reduction_long_reduction_chain_double_use.ll b/polly/test/ScopInfo/reduction_long_reduction_chain_double_use.ll
index 2fd71c28d5211..7ca46fa9535ac 100644
--- a/polly/test/ScopInfo/reduction_long_reduction_chain_double_use.ll
+++ b/polly/test/ScopInfo/reduction_long_reduction_chain_double_use.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; Sum is added twice in the statement. Hence no reduction.
 ; CHECK: Reduction Type: NONE
diff --git a/polly/test/ScopInfo/reduction_multiple_different_operators.ll b/polly/test/ScopInfo/reduction_multiple_different_operators.ll
index 4f049a3505b09..b77c72a291744 100644
--- a/polly/test/ScopInfo/reduction_multiple_different_operators.ll
+++ b/polly/test/ScopInfo/reduction_multiple_different_operators.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; Should not be identified as reduction as there are different operations
 ; involved on sum (multiplication followed by addition)
diff --git a/polly/test/ScopInfo/reduction_multiple_loops_array_sum.ll b/polly/test/ScopInfo/reduction_multiple_loops_array_sum.ll
index 0d016674ffc08..800eb2043dc62 100644
--- a/polly/test/ScopInfo/reduction_multiple_loops_array_sum.ll
+++ b/polly/test/ScopInfo/reduction_multiple_loops_array_sum.ll
@@ -1,4 +1,4 @@
-; RUN: opt -aa-pipeline=basic-aa %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt -aa-pipeline=basic-aa %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; CHECK: Stmt_for_body
 ; CHECK: Reduction Type: *
diff --git a/polly/test/ScopInfo/reduction_multiple_loops_array_sum_1.ll b/polly/test/ScopInfo/reduction_multiple_loops_array_sum_1.ll
index 568513aedfa10..49ebdcb044988 100644
--- a/polly/test/ScopInfo/reduction_multiple_loops_array_sum_1.ll
+++ b/polly/test/ScopInfo/reduction_multiple_loops_array_sum_1.ll
@@ -1,4 +1,4 @@
-; RUN: opt -aa-pipeline=basic-aa %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt -aa-pipeline=basic-aa %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; CHECK: Stmt_for_body
 ; CHECK: Reduction Type: NONE
diff --git a/polly/test/ScopInfo/reduction_multiple_simple_binary.ll b/polly/test/ScopInfo/reduction_multiple_simple_binary.ll
index 0ac50b3b92c47..77b71f4df301b 100644
--- a/polly/test/ScopInfo/reduction_multiple_simple_binary.ll
+++ b/polly/test/ScopInfo/reduction_multiple_simple_binary.ll
@@ -1,4 +1,4 @@
-; RUN: opt -aa-pipeline=basic-aa %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt -aa-pipeline=basic-aa %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; CHECK: ReadAccess := [Reduction Type: NONE
 ; CHECK: { Stmt_for_body[i0] -> MemRef_A[1 + i0] };
diff --git a/polly/test/ScopInfo/reduction_non_overlapping_chains.ll b/polly/test/ScopInfo/reduction_non_overlapping_chains.ll
index f01b641b17f64..61aaa051e49d1 100644
--- a/polly/test/ScopInfo/reduction_non_overlapping_chains.ll
+++ b/polly/test/ScopInfo/reduction_non_overlapping_chains.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; CHECK: Reduction Type: +
 ; CHECK: Reduction Type: +
diff --git a/polly/test/ScopInfo/reduction_only_reduction_like_access.ll b/polly/test/ScopInfo/reduction_only_reduction_like_access.ll
index 51685dca8b7da..fb6d236764b74 100644
--- a/polly/test/ScopInfo/reduction_only_reduction_like_access.ll
+++ b/polly/test/ScopInfo/reduction_only_reduction_like_access.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; CHECK: Reduction Type: +
 ;
diff --git a/polly/test/ScopInfo/reduction_simple_fp.ll b/polly/test/ScopInfo/reduction_simple_fp.ll
index 67139bba2fded..aa4cd00f39f59 100644
--- a/polly/test/ScopInfo/reduction_simple_fp.ll
+++ b/polly/test/ScopInfo/reduction_simple_fp.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; CHECK: Function: f_no_fast_math
 ; CHECK: Reduction Type: NONE
diff --git a/polly/test/ScopInfo/reduction_simple_w_constant.ll b/polly/test/ScopInfo/reduction_simple_w_constant.ll
index c17184624c066..e385b66f9db21 100644
--- a/polly/test/ScopInfo/reduction_simple_w_constant.ll
+++ b/polly/test/ScopInfo/reduction_simple_w_constant.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; CHECK: Reduction Type: +
 ;
diff --git a/polly/test/ScopInfo/reduction_simple_w_iv.ll b/polly/test/ScopInfo/reduction_simple_w_iv.ll
index 7cc50bfe78906..e22eccbb2831d 100644
--- a/polly/test/ScopInfo/reduction_simple_w_iv.ll
+++ b/polly/test/ScopInfo/reduction_simple_w_iv.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; CHECK: Reduction Type: +
 ;
diff --git a/polly/test/ScopInfo/reduction_two_identical_reads.ll b/polly/test/ScopInfo/reduction_two_identical_reads.ll
index 35cb9dfcdb122..8f00954f7efc3 100644
--- a/polly/test/ScopInfo/reduction_two_identical_reads.ll
+++ b/polly/test/ScopInfo/reduction_two_identical_reads.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; CHECK: Reduction Type: NONE
 ;
diff --git a/polly/test/ScopInfo/redundant_parameter_constraint.ll b/polly/test/ScopInfo/redundant_parameter_constraint.ll
index 7512da420af0e..ad71f1f59e18b 100644
--- a/polly/test/ScopInfo/redundant_parameter_constraint.ll
+++ b/polly/test/ScopInfo/redundant_parameter_constraint.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; The constraint that r2 has to be bigger than r1 is implicitly contained in
 ; the domain, hence we do not want to see it explicitly.
diff --git a/polly/test/ScopInfo/region-with-instructions.ll b/polly/test/ScopInfo/region-with-instructions.ll
index 38d58c97e1b05..d4720511b7aad 100644
--- a/polly/test/ScopInfo/region-with-instructions.ll
+++ b/polly/test/ScopInfo/region-with-instructions.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -polly-print-instructions -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-print-instructions -disable-output < %s 2>&1 | FileCheck %s
 ; CHECK: Statements {
 ; CHECK: Stmt_bb46
diff --git a/polly/test/ScopInfo/remarks.ll b/polly/test/ScopInfo/remarks.ll
index 2d6ace988659d..10cc57aa27a14 100644
--- a/polly/test/ScopInfo/remarks.ll
+++ b/polly/test/ScopInfo/remarks.ll
@@ -1,4 +1,5 @@
-; RUN: opt %loadNPMPolly -pass-remarks-analysis=polly-scops '-passes=polly-custom<scops>' -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -pass-remarks-analysis="polly-scops" '-passes=print<polly-function-scops>' \
+; RUN: -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; CHECK: remark: test/ScopInfo/remarks.c:4:7: SCoP begins here.
 ; CHECK: remark: test/ScopInfo/remarks.c:9:15: Inbounds assumption: [N, M, Debug] -> { : M <= 100 }
diff --git a/polly/test/ScopInfo/required-invariant-loop-bounds.ll b/polly/test/ScopInfo/required-invariant-loop-bounds.ll
index 3bb5bfb0765e3..abf0b0e23855c 100644
--- a/polly/test/ScopInfo/required-invariant-loop-bounds.ll
+++ b/polly/test/ScopInfo/required-invariant-loop-bounds.ll
@@ -1,4 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output -polly-invariant-load-hoisting=true < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output \
+; RUN: -polly-invariant-load-hoisting=true < %s 2>&1 | FileCheck %s
 ;
 ; CHECK: Invariant Accesses: {
 ; CHECK-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0]
diff --git a/polly/test/ScopInfo/restriction_in_dead_block.ll b/polly/test/ScopInfo/restriction_in_dead_block.ll
index dd6115c421d0c..487c585cb9d9c 100644
--- a/polly/test/ScopInfo/restriction_in_dead_block.ll
+++ b/polly/test/ScopInfo/restriction_in_dead_block.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; Verify we do not generate an empty invalid context only because the wrap
 ; in the second conditional will always happen if the block is executed.
diff --git a/polly/test/ScopInfo/run-time-check-many-array-disjuncts.ll b/polly/test/ScopInfo/run-time-check-many-array-disjuncts.ll
index e8df1eccd5945..702b7dc5e0049 100644
--- a/polly/test/ScopInfo/run-time-check-many-array-disjuncts.ll
+++ b/polly/test/ScopInfo/run-time-check-many-array-disjuncts.ll
@@ -1,5 +1,6 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s -check-prefix=DETECT
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-detect -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 \
+; RUN: | FileCheck %s -check-prefix=DETECT
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; DETECT: Valid Region for Scop: bb124 => bb176
 ;
diff --git a/polly/test/ScopInfo/run-time-check-many-parameters.ll b/polly/test/ScopInfo/run-time-check-many-parameters.ll
index 2a8853322f1d5..559c38d2682ef 100644
--- a/polly/test/ScopInfo/run-time-check-many-parameters.ll
+++ b/polly/test/ScopInfo/run-time-check-many-parameters.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-detect -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; A valid Scop would print the list of it's statements, we check that we do not
 ; see that list.
diff --git a/polly/test/ScopInfo/run-time-check-many-piecewise-aliasing.ll b/polly/test/ScopInfo/run-time-check-many-piecewise-aliasing.ll
index 5e71e7a9d2a46..3cf4c40bdb60f 100644
--- a/polly/test/ScopInfo/run-time-check-many-piecewise-aliasing.ll
+++ b/polly/test/ScopInfo/run-time-check-many-piecewise-aliasing.ll
@@ -1,5 +1,6 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<detect>' -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s -check-prefix=DETECT
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-detect -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 \
+; RUN: | FileCheck %s -check-prefix=DETECT
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; DETECT: Valid Region for Scop: for => return
 ;
diff --git a/polly/test/ScopInfo/run-time-check-read-only-arrays.ll b/polly/test/ScopInfo/run-time-check-read-only-arrays.ll
index 286f878f935f4..51ab81476d542 100644
--- a/polly/test/ScopInfo/run-time-check-read-only-arrays.ll
+++ b/polly/test/ScopInfo/run-time-check-read-only-arrays.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; void foo(float *A, float *B, float *C, long N) {
 ; for (long i = 0; i < N; i++)
diff --git a/polly/test/ScopInfo/same-base-address-scalar-and-array.ll b/polly/test/ScopInfo/same-base-address-scalar-and-array.ll
index 9f4d6f5895aeb..dd809ba156c79 100644
--- a/polly/test/ScopInfo/same-base-address-scalar-and-array.ll
+++ b/polly/test/ScopInfo/same-base-address-scalar-and-array.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; Verify we introduce two ScopArrayInfo objects (or virtual arrays) for the %out variable
 ; as it is used as a memory base pointer (%0) but also as a scalar (%out.addr.0.lcssa).
diff --git a/polly/test/ScopInfo/scalar.ll b/polly/test/ScopInfo/scalar.ll
index db8371d96b118..812d2fddc3c8e 100644
--- a/polly/test/ScopInfo/scalar.ll
+++ b/polly/test/ScopInfo/scalar.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
diff --git a/polly/test/ScopInfo/scalar_dependence_cond_br.ll b/polly/test/ScopInfo/scalar_dependence_cond_br.ll
index a09bdaf06844e..59549f3dbbad5 100644
--- a/polly/test/ScopInfo/scalar_dependence_cond_br.ll
+++ b/polly/test/ScopInfo/scalar_dependence_cond_br.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; void f(int *A, int c, int d) {
 ; for (int i = 0; i < 1024; i++)
diff --git a/polly/test/ScopInfo/scalar_to_array.ll b/polly/test/ScopInfo/scalar_to_array.ll
index e71c515fa2d35..3f61d0d723046 100644
--- a/polly/test/ScopInfo/scalar_to_array.ll
+++ b/polly/test/ScopInfo/scalar_to_array.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ; ModuleID = 'scalar_to_array.ll'
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
diff --git a/polly/test/ScopInfo/scev-div-with-evaluatable-divisor.ll b/polly/test/ScopInfo/scev-div-with-evaluatable-divisor.ll
index 66c50dcbe13f3..fa0c81fe9a48e 100644
--- a/polly/test/ScopInfo/scev-div-with-evaluatable-divisor.ll
+++ b/polly/test/ScopInfo/scev-div-with-evaluatable-divisor.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ; Derived from test-suite/SingleSource/UnitTests/Vector/SSE/sse.stepfft.c
diff --git a/polly/test/ScopInfo/scev-invalidated.ll b/polly/test/ScopInfo/scev-invalidated.ll
index e0956df0b1e84..6b9efd4b37c7d 100644
--- a/polly/test/ScopInfo/scev-invalidated.ll
+++ b/polly/test/ScopInfo/scev-invalidated.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; CHECK: Region: %if.then6---%return
 ;
diff --git a/polly/test/ScopInfo/schedule-const-post-dominator-walk-2.ll b/polly/test/ScopInfo/schedule-const-post-dominator-walk-2.ll
index 4a280cc929e3a..6e2ed1240b071 100644
--- a/polly/test/ScopInfo/schedule-const-post-dominator-walk-2.ll
+++ b/polly/test/ScopInfo/schedule-const-post-dominator-walk-2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ; The SCoP contains a loop with multiple exit blocks (BBs after leaving
 ; the loop). The current implementation of deriving their domain derives
diff --git a/polly/test/ScopInfo/schedule-const-post-dominator-walk.ll b/polly/test/ScopInfo/schedule-const-post-dominator-walk.ll
index 777c0088c4ddd..d0e8a2accaa2c 100644
--- a/polly/test/ScopInfo/schedule-const-post-dominator-walk.ll
+++ b/polly/test/ScopInfo/schedule-const-post-dominator-walk.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ; The SCoP contains a loop with multiple exit blocks (BBs after leaving
 ; the loop). The current implementation of deriving their domain derives
diff --git a/polly/test/ScopInfo/schedule-constuction-endless-loop1.ll b/polly/test/ScopInfo/schedule-constuction-endless-loop1.ll
index 15dea5a7f4dd8..9ffc30f7360e9 100644
--- a/polly/test/ScopInfo/schedule-constuction-endless-loop1.ll
+++ b/polly/test/ScopInfo/schedule-constuction-endless-loop1.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-detect -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; Check that we do not build a SCoP and do not crash.
 ;
diff --git a/polly/test/ScopInfo/schedule-constuction-endless-loop2.ll b/polly/test/ScopInfo/schedule-constuction-endless-loop2.ll
index 9ac6643564f7b..65f2f99b48c1b 100644
--- a/polly/test/ScopInfo/schedule-constuction-endless-loop2.ll
+++ b/polly/test/ScopInfo/schedule-constuction-endless-loop2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-detect -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; Check that we do not build a SCoP and do not crash.
 ;
diff --git a/polly/test/ScopInfo/schedule-incorrectly-contructed-in-case-of-infinite-loop.ll b/polly/test/ScopInfo/schedule-incorrectly-contructed-in-case-of-infinite-loop.ll
index 1657d2f37d8ba..7c36f8d7f72e8 100644
--- a/polly/test/ScopInfo/schedule-incorrectly-contructed-in-case-of-infinite-loop.ll
+++ b/polly/test/ScopInfo/schedule-incorrectly-contructed-in-case-of-infinite-loop.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-process-unprofitable '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s
+; RUN: opt %loadNPMPolly -polly-process-unprofitable '-passes=print<polly-function-scops>' -disable-output < %s
 ;
 ; This test contains a infinite loop (bb13) and crashed the domain generation
 ; at some point. Just verify it does not anymore.
diff --git a/polly/test/ScopInfo/scop-affine-parameter-ordering.ll b/polly/test/ScopInfo/scop-affine-parameter-ordering.ll
index 76bb438d43ff7..c8a234e9cbce7 100644
--- a/polly/test/ScopInfo/scop-affine-parameter-ordering.ll
+++ b/polly/test/ScopInfo/scop-affine-parameter-ordering.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 target datalayout = "e-m:e-i64:64-i128:128-n8:16:32:64-S128"
 target triple = "aarch64--linux-android"
diff --git a/polly/test/ScopInfo/sign_wrapped_set.ll b/polly/test/ScopInfo/sign_wrapped_set.ll
index 135976e7d51c6..93b63df1c5841 100644
--- a/polly/test/ScopInfo/sign_wrapped_set.ll
+++ b/polly/test/ScopInfo/sign_wrapped_set.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-allow-nonaffine -polly-process-unprofitable '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-allow-nonaffine -polly-process-unprofitable '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; CHECK: Domain :=
 ; CHECK-NEXT: [srcHeight] -> { Stmt_for_cond6_preheader_us[i0] : 0 <= i0 <= -3 + srcHeight };
diff --git a/polly/test/ScopInfo/simple_loop_1.ll b/polly/test/ScopInfo/simple_loop_1.ll
index 1d9f5c2edebcb..e736f3382d905 100644
--- a/polly/test/ScopInfo/simple_loop_1.ll
+++ b/polly/test/ScopInfo/simple_loop_1.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ; void f(int a[], int N) {
 ; int i;
diff --git a/polly/test/ScopInfo/simple_loop_2.ll b/polly/test/ScopInfo/simple_loop_2.ll
index 877f860ba5a90..ae83dd633b96e 100644
--- a/polly/test/ScopInfo/simple_loop_2.ll
+++ b/polly/test/ScopInfo/simple_loop_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ; void f(int a[], int N) {
 ; int i;
diff --git a/polly/test/ScopInfo/simple_loop_unsigned.ll b/polly/test/ScopInfo/simple_loop_unsigned.ll
index d3834297e2668..c4a96e4381c94 100644
--- a/polly/test/ScopInfo/simple_loop_unsigned.ll
+++ b/polly/test/ScopInfo/simple_loop_unsigned.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ; void f(int a[], unsigned N) {
 ; unsigned i;
diff --git a/polly/test/ScopInfo/simple_loop_unsigned_2.ll b/polly/test/ScopInfo/simple_loop_unsigned_2.ll
index 1da6053a8316b..37e907dc006f3 100644
--- a/polly/test/ScopInfo/simple_loop_unsigned_2.ll
+++ b/polly/test/ScopInfo/simple_loop_unsigned_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ; CHECK: Assumed Context:
 ; CHECK-NEXT: [N] -> { : }
diff --git a/polly/test/ScopInfo/simple_loop_unsigned_3.ll b/polly/test/ScopInfo/simple_loop_unsigned_3.ll
index 0d44bf64ffc18..7f2cf5caa1ce7 100644
--- a/polly/test/ScopInfo/simple_loop_unsigned_3.ll
+++ b/polly/test/ScopInfo/simple_loop_unsigned_3.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ; CHECK: Assumed Context:
 ; CHECK-NEXT: [N] -> { : }
diff --git a/polly/test/ScopInfo/simple_nonaffine_loop_not.ll b/polly/test/ScopInfo/simple_nonaffine_loop_not.ll
index f70b3fa3ea21a..4df0d343b0fc9 100644
--- a/polly/test/ScopInfo/simple_nonaffine_loop_not.ll
+++ b/polly/test/ScopInfo/simple_nonaffine_loop_not.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | not FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | not FileCheck %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 @.str = private unnamed_addr constant [17 x i8] c"Random Value: %d\00", align 1
diff --git a/polly/test/ScopInfo/smax.ll b/polly/test/ScopInfo/smax.ll
index 3ba2b35e7e503..8968e13192477 100644
--- a/polly/test/ScopInfo/smax.ll
+++ b/polly/test/ScopInfo/smax.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:64:128-a0:0:32-n32-S64"
 define void @foo(ptr noalias %data, ptr noalias %ptr, i32 %x_pos, i32 %w) {
diff --git a/polly/test/ScopInfo/statistics.ll b/polly/test/ScopInfo/statistics.ll
index aa72db3065259..0a294f2016eba 100644
--- a/polly/test/ScopInfo/statistics.ll
+++ b/polly/test/ScopInfo/statistics.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -stats -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -stats -disable-output < %s 2>&1 | FileCheck %s
 ; REQUIRES: asserts
 ; CHECK-DAG: 4 polly-scops - Maximal number of loops in scops
diff --git a/polly/test/ScopInfo/stmt_split_exit_of_region_stmt.ll b/polly/test/ScopInfo/stmt_split_exit_of_region_stmt.ll
index 54832607f11d5..a46acb090b7fd 100644
--- a/polly/test/ScopInfo/stmt_split_exit_of_region_stmt.ll
+++ b/polly/test/ScopInfo/stmt_split_exit_of_region_stmt.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-print-instructions '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-print-instructions '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; CHECK: Statements {
 ; CHECK-NEXT: Stmt_Region__TO__Stmt
diff --git a/polly/test/ScopInfo/stmt_split_no_after_split.ll b/polly/test/ScopInfo/stmt_split_no_after_split.ll
index 0a4284bdd34f5..3a5ebf0725b10 100644
--- a/polly/test/ScopInfo/stmt_split_no_after_split.ll
+++ b/polly/test/ScopInfo/stmt_split_no_after_split.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-print-instructions '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-print-instructions '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; CHECK: Statements {
 ; CHECK-NEXT: Stmt_Stmt
diff --git a/polly/test/ScopInfo/stmt_split_no_dependence.ll b/polly/test/ScopInfo/stmt_split_no_dependence.ll
index ed2180407c68d..9edd0f0a13e59 100644
--- a/polly/test/ScopInfo/stmt_split_no_dependence.ll
+++ b/polly/test/ScopInfo/stmt_split_no_dependence.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-print-instructions '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-print-instructions '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ; void func(int *A, int *B){
 ; for (int i = 0; i < 1024; i+=1) {
diff --git a/polly/test/ScopInfo/stmt_split_on_store.ll b/polly/test/ScopInfo/stmt_split_on_store.ll
index f35a07c8d7176..d645becb19583 100644
--- a/polly/test/ScopInfo/stmt_split_on_store.ll
+++ b/polly/test/ScopInfo/stmt_split_on_store.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=store -polly-print-instructions '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=store -polly-print-instructions '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ; void func(int *A, int *B){
 ; for (int i = 0; i < 1024; i+=1) {
diff --git a/polly/test/ScopInfo/stmt_split_on_synthesizable.ll b/polly/test/ScopInfo/stmt_split_on_synthesizable.ll
index 41721867f1764..1a1ccff4f02d6 100644
--- a/polly/test/ScopInfo/stmt_split_on_synthesizable.ll
+++ b/polly/test/ScopInfo/stmt_split_on_synthesizable.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-print-instructions '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-print-instructions '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; CHECK: Statements {
 ; CHECK-NEXT: Stmt_Stmt
diff --git a/polly/test/ScopInfo/stmt_split_phi_in_beginning_bb.ll b/polly/test/ScopInfo/stmt_split_phi_in_beginning_bb.ll
index 0521525e272b3..594b36279d6bc 100644
--- a/polly/test/ScopInfo/stmt_split_phi_in_beginning_bb.ll
+++ b/polly/test/ScopInfo/stmt_split_phi_in_beginning_bb.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-print-instructions '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-print-instructions '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; CHECK: Statements {
 ; CHECK-NEXT: Stmt_Stmt
diff --git a/polly/test/ScopInfo/stmt_split_phi_in_stmt.ll b/polly/test/ScopInfo/stmt_split_phi_in_stmt.ll
index 82a85aa5f0099..6c9f1c2cb5fd0 100644
--- a/polly/test/ScopInfo/stmt_split_phi_in_stmt.ll
+++ b/polly/test/ScopInfo/stmt_split_phi_in_stmt.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-print-instructions '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-print-instructions '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; CHECK: Statements {
 ; CHECK-NEXT: Stmt_Stmt
diff --git a/polly/test/ScopInfo/stmt_split_scalar_dependence.ll b/polly/test/ScopInfo/stmt_split_scalar_dependence.ll
index 1f21c0ce7225f..07abe46ac0399 100644
--- a/polly/test/ScopInfo/stmt_split_scalar_dependence.ll
+++ b/polly/test/ScopInfo/stmt_split_scalar_dependence.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-print-instructions '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-print-instructions '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; CHECK: Statements {
 ; CHECK-NEXT: Stmt_Stmt
diff --git a/polly/test/ScopInfo/stmt_split_within_loop.ll b/polly/test/ScopInfo/stmt_split_within_loop.ll
index 580ffab567846..9a42ae3a37270 100644
--- a/polly/test/ScopInfo/stmt_split_within_loop.ll
+++ b/polly/test/ScopInfo/stmt_split_within_loop.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-print-instructions '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-print-instructions '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; CHECK: Statements {
 ; CHECK-NEXT: Stmt_Stmt
diff --git a/polly/test/ScopInfo/stmt_with_read_but_without_sideffect.ll b/polly/test/ScopInfo/stmt_with_read_but_without_sideffect.ll
index 67e8f631312ea..ba4801d9a0006 100644
--- a/polly/test/ScopInfo/stmt_with_read_but_without_sideffect.ll
+++ b/polly/test/ScopInfo/stmt_with_read_but_without_sideffect.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<delicm>' -polly-print-delicm -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-delicm>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; The statement Stmt_for_if_else_1 should be removed because it has no
 ; sideeffects. But it has a use of MemRef_tmp21 that must also be
diff --git a/polly/test/ScopInfo/switch-1.ll b/polly/test/ScopInfo/switch-1.ll
index 0f9e83210661b..0c3610185e6e0 100644
--- a/polly/test/ScopInfo/switch-1.ll
+++ b/polly/test/ScopInfo/switch-1.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s 2>&1 | FileCheck %s --check-prefix=AST
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=AST
 ;
 ; void f(int *A, int N) {
 ; for (int i = 0; i < N; i++)
diff --git a/polly/test/ScopInfo/switch-2.ll b/polly/test/ScopInfo/switch-2.ll
index 9defd41f25231..f0056da37955d 100644
--- a/polly/test/ScopInfo/switch-2.ll
+++ b/polly/test/ScopInfo/switch-2.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s 2>&1 | FileCheck %s --check-prefix=AST
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=AST
 ;
 ; void f(int *A, int N) {
 ; for (int i = 0; i < N; i++)
diff --git a/polly/test/ScopInfo/switch-3.ll b/polly/test/ScopInfo/switch-3.ll
index faaa4d0254db9..a1810bf6ef538 100644
--- a/polly/test/ScopInfo/switch-3.ll
+++ b/polly/test/ScopInfo/switch-3.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s 2>&1 | FileCheck %s --check-prefix=AST
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=AST
 ;
 ; void f(int *A, int N) {
 ; for (int i = 0; i < N; i++)
diff --git a/polly/test/ScopInfo/switch-4.ll b/polly/test/ScopInfo/switch-4.ll
index c82e703a82965..00665fd75cbcd 100644
--- a/polly/test/ScopInfo/switch-4.ll
+++ b/polly/test/ScopInfo/switch-4.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s 2>&1 | FileCheck %s --check-prefix=AST
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=AST
 ;
 ; void f(int *A, int N) {
 ; for (int i = 0; i < N; i++)
diff --git a/polly/test/ScopInfo/switch-5.ll b/polly/test/ScopInfo/switch-5.ll
index 5a49be8d80975..2de3695649404 100644
--- a/polly/test/ScopInfo/switch-5.ll
+++ b/polly/test/ScopInfo/switch-5.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s 2>&1 | FileCheck %s --check-prefix=AST
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=AST
 ;
 ; The SCoP contains a loop with multiple exit blocks (BBs after leaving
 ; the loop). The current implementation of deriving their domain derives
diff --git a/polly/test/ScopInfo/switch-6.ll b/polly/test/ScopInfo/switch-6.ll
index 379981b167039..b859840ee111f 100644
--- a/polly/test/ScopInfo/switch-6.ll
+++ b/polly/test/ScopInfo/switch-6.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s 2>&1 | FileCheck %s --check-prefix=AST
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=AST
 ;
 ; void f(int *A, int N) {
 ; for (int i = 0; i < N; i++) {
diff --git a/polly/test/ScopInfo/switch-7.ll b/polly/test/ScopInfo/switch-7.ll
index 0c8efc590b9c9..f73d97f70b28d 100644
--- a/polly/test/ScopInfo/switch-7.ll
+++ b/polly/test/ScopInfo/switch-7.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly '-passes=polly-custom<ast>' -polly-print-ast -disable-output < %s 2>&1 | FileCheck %s --check-prefix=AST
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=AST
 ;
 ; void f(int *A, int c, int N) {
 ; switch (c) {
diff --git a/polly/test/ScopInfo/tempscop-printing.ll b/polly/test/ScopInfo/tempscop-printing.ll
index 09cc95e42a584..4f02176569b73 100644
--- a/polly/test/ScopInfo/tempscop-printing.ll
+++ b/polly/test/ScopInfo/tempscop-printing.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-invariant-load-hoisting=true '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-invariant-load-hoisting=true '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ; void f(long A[], int N, int *init_ptr) {
 ; long i, j;
diff --git a/polly/test/ScopInfo/test-wrapping-in-condition.ll b/polly/test/ScopInfo/test-wrapping-in-condition.ll
index d64bdf985c1d2..746350422d6b9 100644
--- a/polly/test/ScopInfo/test-wrapping-in-condition.ll
+++ b/polly/test/ScopInfo/test-wrapping-in-condition.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; CHECK: Invalid Context:
 ; CHECK: [N] -> { : N >= 129 }
diff --git a/polly/test/ScopInfo/truncate-1.ll b/polly/test/ScopInfo/truncate-1.ll
index d531dd8e5ab08..44222c88dfa77 100644
--- a/polly/test/ScopInfo/truncate-1.ll
+++ b/polly/test/ScopInfo/truncate-1.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; void f(char *A, short N) {
 ; for (char i = 0; i < (char)N; i++)
diff --git a/polly/test/ScopInfo/truncate-2.ll b/polly/test/ScopInfo/truncate-2.ll
index 3f5d1faf4c377..c78a5337fdeba 100644
--- a/polly/test/ScopInfo/truncate-2.ll
+++ b/polly/test/ScopInfo/truncate-2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; void f(char *A, short N) {
 ; for (short i = 0; i < N; i++)
diff --git a/polly/test/ScopInfo/truncate-3.ll b/polly/test/ScopInfo/truncate-3.ll
index d20f375b9a2bd..5a80a873cd476 100644
--- a/polly/test/ScopInfo/truncate-3.ll
+++ b/polly/test/ScopInfo/truncate-3.ll
@@ -1,4 +1,5 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -pass-remarks-analysis=polly-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -pass-remarks-analysis="polly-scops" \
+; RUN: -disable-output < %s 2>&1 | FileCheck %s
 ; CHECK: Signed-unsigned restriction: [p] -> { : p <= -129 or p >= 128 }
diff --git a/polly/test/ScopInfo/two-loops-one-infinite.ll b/polly/test/ScopInfo/two-loops-one-infinite.ll
index aa2be1003adcc..e2723a8a9a2e9 100644
--- a/polly/test/ScopInfo/two-loops-one-infinite.ll
+++ b/polly/test/ScopInfo/two-loops-one-infinite.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-detect -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; Verify we do not create a SCoP in the presence of infinite loops.
 ;
diff --git a/polly/test/ScopInfo/two-loops-right-after-each-other.ll b/polly/test/ScopInfo/two-loops-right-after-each-other.ll
index 163642d9072e2..51f3c2d6eb875 100644
--- a/polly/test/ScopInfo/two-loops-right-after-each-other.ll
+++ b/polly/test/ScopInfo/two-loops-right-after-each-other.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ; CHECK: Statements {
 ; CHECK-NEXT: Stmt_loop_1
diff --git a/polly/test/ScopInfo/undef_in_cond.ll b/polly/test/ScopInfo/undef_in_cond.ll
index 5fb08f82b3267..ef117612f6cb3 100644
--- a/polly/test/ScopInfo/undef_in_cond.ll
+++ b/polly/test/ScopInfo/undef_in_cond.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-detect -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
 define fastcc void @fix_operands() nounwind {
diff --git a/polly/test/ScopInfo/unnamed_nonaffine.ll b/polly/test/ScopInfo/unnamed_nonaffine.ll
index 11418499702df..5b9f980591777 100644
--- a/polly/test/ScopInfo/unnamed_nonaffine.ll
+++ b/polly/test/ScopInfo/unnamed_nonaffine.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly -polly-allow-nonaffine -polly-use-llvm-names=true '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly -polly-allow-nonaffine -polly-use-llvm-names=false '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s -check-prefix=UNNAMED
+; RUN: opt %loadNPMPolly -polly-allow-nonaffine -polly-use-llvm-names=true '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-allow-nonaffine -polly-use-llvm-names=false '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s -check-prefix=UNNAMED
 ;
 ; void f(int *A, int b) {
 ; int x;
diff --git a/polly/test/ScopInfo/unnamed_stmts.ll b/polly/test/ScopInfo/unnamed_stmts.ll
index e23b3ae5404b5..163170ce74895 100644
--- a/polly/test/ScopInfo/unnamed_stmts.ll
+++ b/polly/test/ScopInfo/unnamed_stmts.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ; This test case verifies that we generate numbered statement names in case
 ; no LLVM-IR names are used in the test case. We also verify, that we
diff --git a/polly/test/ScopInfo/unpredictable_nonscop_loop.ll b/polly/test/ScopInfo/unpredictable_nonscop_loop.ll
index 5bc136658ccab..daa1f8c783870 100644
--- a/polly/test/ScopInfo/unpredictable_nonscop_loop.ll
+++ b/polly/test/ScopInfo/unpredictable_nonscop_loop.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s -match-full-lines
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s -match-full-lines
 ; Derived from test-suite/MultiSource/Applications/sgefa/blas.c
 ;
 ; The exit value of %i.0320 in land.rhs is not computable.
diff --git a/polly/test/ScopInfo/unprofitable_scalar-accs.ll b/polly/test/ScopInfo/unprofitable_scalar-accs.ll
index 3f6bb937ded1a..ca8daa4de01a6 100644
--- a/polly/test/ScopInfo/unprofitable_scalar-accs.ll
+++ b/polly/test/ScopInfo/unprofitable_scalar-accs.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-process-unprofitable=false -polly-unprofitable-scalar-accs=false '-passes=polly-custom<scops>' -polly-print-detect -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-process-unprofitable=false -polly-unprofitable-scalar-accs=true '-passes=polly-custom<scops>' -polly-print-detect -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s --check-prefix=HEURISTIC
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-process-unprofitable=false -polly-unprofitable-scalar-accs=false '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-process-unprofitable=false -polly-unprofitable-scalar-accs=true '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=HEURISTIC
 ; Check the effect of -polly-unprofitable-scalar-accs
diff --git a/polly/test/ScopInfo/unsigned-condition.ll b/polly/test/ScopInfo/unsigned-condition.ll
index 608b6d6e50a36..0529ded1f6cfb 100644
--- a/polly/test/ScopInfo/unsigned-condition.ll
+++ b/polly/test/ScopInfo/unsigned-condition.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ; void f(int a[], int N, unsigned P) {
 ; int i;
diff --git a/polly/test/ScopInfo/unsigned-division-1.ll b/polly/test/ScopInfo/unsigned-division-1.ll
index 58d39dc239ac9..1c06b55300b67 100644
--- a/polly/test/ScopInfo/unsigned-division-1.ll
+++ b/polly/test/ScopInfo/unsigned-division-1.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; void f(int *A, unsigned N) {
 ; for (unsigned i = 0; i < N / 2; i++)
diff --git a/polly/test/ScopInfo/unsigned-division-2.ll b/polly/test/ScopInfo/unsigned-division-2.ll
index cda666d6f5ebf..153639c42b384 100644
--- a/polly/test/ScopInfo/unsigned-division-2.ll
+++ b/polly/test/ScopInfo/unsigned-division-2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; void f(int *A, unsigned N) {
 ; for (unsigned i = 0; i < N / 2 + 3; i++)
diff --git a/polly/test/ScopInfo/unsigned-division-3.ll b/polly/test/ScopInfo/unsigned-division-3.ll
index 50de3c59892e7..34561fc4645cc 100644
--- a/polly/test/ScopInfo/unsigned-division-3.ll
+++ b/polly/test/ScopInfo/unsigned-division-3.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; void f(int *A, unsigned char N) {
 ; for (unsigned i = 0; i <= N / -128; i++)
diff --git a/polly/test/ScopInfo/unsigned-division-4.ll b/polly/test/ScopInfo/unsigned-division-4.ll
index 4dd75e526407d..be539b47123bc 100644
--- a/polly/test/ScopInfo/unsigned-division-4.ll
+++ b/polly/test/ScopInfo/unsigned-division-4.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; void f(int *A, unsigned char N) {
 ; for (unsigned i = 0; i < (N / -128) + 3; i++)
diff --git a/polly/test/ScopInfo/unsigned-division-5.ll b/polly/test/ScopInfo/unsigned-division-5.ll
index fff131292271a..61716ecec0d90 100644
--- a/polly/test/ScopInfo/unsigned-division-5.ll
+++ b/polly/test/ScopInfo/unsigned-division-5.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -polly-invariant-load-hoisting=true '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-invariant-load-hoisting=true '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; void f(int *A, unsigned N) {
 ; for (unsigned i = 0; i < N; i++)
diff --git a/polly/test/ScopInfo/unsigned_wrap_uge.ll b/polly/test/ScopInfo/unsigned_wrap_uge.ll
index f54b9bec6e7df..d25a9576e863a 100644
--- a/polly/test/ScopInfo/unsigned_wrap_uge.ll
+++ b/polly/test/ScopInfo/unsigned_wrap_uge.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; Unsigned wrap-around check.
 ;
diff --git a/polly/test/ScopInfo/unsigned_wrap_ugt.ll b/polly/test/ScopInfo/unsigned_wrap_ugt.ll
index 20afd17f86793..0310fdde6d26e 100644
--- a/polly/test/ScopInfo/unsigned_wrap_ugt.ll
+++ b/polly/test/ScopInfo/unsigned_wrap_ugt.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; Unsigned wrap-around check.
 ;
diff --git a/polly/test/ScopInfo/unsigned_wrap_ule.ll b/polly/test/ScopInfo/unsigned_wrap_ule.ll
index 6fa6cc12990a3..47bfc6065b1a8 100644
--- a/polly/test/ScopInfo/unsigned_wrap_ule.ll
+++ b/polly/test/ScopInfo/unsigned_wrap_ule.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
 ;
 ; Unsigned wrap-around check.
; diff --git a/polly/test/ScopInfo/unsigned_wrap_ult.ll b/polly/test/ScopInfo/unsigned_wrap_ult.ll index 4a3b604d81f0f..1b73c0d6dd7ee 100644 --- a/polly/test/ScopInfo/unsigned_wrap_ult.ll +++ b/polly/test/ScopInfo/unsigned_wrap_ult.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s ; ; Unsigned wrap-around check. ; diff --git a/polly/test/ScopInfo/user_context.ll b/polly/test/ScopInfo/user_context.ll index ce8dd921cec16..74088120e4015 100644 --- a/polly/test/ScopInfo/user_context.ll +++ b/polly/test/ScopInfo/user_context.ll @@ -1,7 +1,7 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s -; RUN: opt %loadNPMPolly '-polly-context=[N] -> {: N = 1024}' '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s --check-prefix=CTX -; RUN: opt %loadNPMPolly '-polly-context=[N,M] -> {: 1 = 0}' '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s -; RUN: opt %loadNPMPolly '-polly-context=[] -> {: 1 = 0}' '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-context='[N] -> {: N = 1024}' '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=CTX +; RUN: opt %loadNPMPolly -polly-context='[N,M] -> {: 1 = 0}' '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-context='[] -> {: 1 = 0}' '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s ; void f(int a[], int N) { ; int i; diff --git a/polly/test/ScopInfo/user_provided_assumptions-in-bb-signed-conditional.ll b/polly/test/ScopInfo/user_provided_assumptions-in-bb-signed-conditional.ll index c35ed9060e504..bd13ba8bb6961 100644 --- a/polly/test/ScopInfo/user_provided_assumptions-in-bb-signed-conditional.ll +++ b/polly/test/ScopInfo/user_provided_assumptions-in-bb-signed-conditional.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly -pass-remarks-analysis=polly-scops '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s --check-prefix=REMARK -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -pass-remarks-analysis="polly-scops" '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=REMARK +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s ; ; REMARK: remark: <unknown>:0:0: Use user assumption: [n, b] -> { : n <= 100 or (b = 0 and n >= 101) } ; diff --git a/polly/test/ScopInfo/user_provided_assumptions-in-bb-signed.ll b/polly/test/ScopInfo/user_provided_assumptions-in-bb-signed.ll index 2afe99fd2c53b..45f59170942ed 100644 --- a/polly/test/ScopInfo/user_provided_assumptions-in-bb-signed.ll +++ b/polly/test/ScopInfo/user_provided_assumptions-in-bb-signed.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s ; ; CHECK: Context: ; CHECK-NEXT: [n] -> { : 
-9223372036854775808 <= n <= 100 } diff --git a/polly/test/ScopInfo/user_provided_assumptions-in-bb-unsigned.ll b/polly/test/ScopInfo/user_provided_assumptions-in-bb-unsigned.ll index 3479558062671..fb71c75aa75e4 100644 --- a/polly/test/ScopInfo/user_provided_assumptions-in-bb-unsigned.ll +++ b/polly/test/ScopInfo/user_provided_assumptions-in-bb-unsigned.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly -pass-remarks-analysis=polly-scops '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s --check-prefix=REMARK -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -pass-remarks-analysis="polly-scops" '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=REMARK +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s ; ; REMARK: remark: <unknown>:0:0: SCoP begins here. ; REMARK-NEXT: remark: <unknown>:0:0: Use user assumption: [n] -> { : n <= 100 } diff --git a/polly/test/ScopInfo/user_provided_assumptions.ll b/polly/test/ScopInfo/user_provided_assumptions.ll index 0bd99ea3fcb35..49b23b1e784dc 100644 --- a/polly/test/ScopInfo/user_provided_assumptions.ll +++ b/polly/test/ScopInfo/user_provided_assumptions.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly -pass-remarks-analysis=polly-scops '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s --check-prefix=SCOP +; RUN: opt %loadNPMPolly -pass-remarks-analysis="polly-scops" '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=SCOP ; ; CHECK: remark: <unknown>:0:0: SCoP begins here. ; CHECK-NEXT: remark: <unknown>:0:0: Use user assumption: [M, N] -> { : N <= 2147483647 - M } diff --git a/polly/test/ScopInfo/user_provided_assumptions_2.ll b/polly/test/ScopInfo/user_provided_assumptions_2.ll index 1499ab98f7369..f8643b68cc63f 100644 --- a/polly/test/ScopInfo/user_provided_assumptions_2.ll +++ b/polly/test/ScopInfo/user_provided_assumptions_2.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly -pass-remarks-analysis=polly-scops '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s --check-prefix=SCOP +; RUN: opt %loadNPMPolly -pass-remarks-analysis="polly-scops" '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=SCOP ; ; CHECK: remark: <unknown>:0:0: SCoP begins here. 
; CHECK-NEXT: remark: <unknown>:0:0: Use user assumption: { : } diff --git a/polly/test/ScopInfo/user_provided_assumptions_3.ll b/polly/test/ScopInfo/user_provided_assumptions_3.ll index aa1f72dddde9d..70f8f359e16cd 100644 --- a/polly/test/ScopInfo/user_provided_assumptions_3.ll +++ b/polly/test/ScopInfo/user_provided_assumptions_3.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly -pass-remarks-analysis=polly-scops '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s --check-prefix=SCOP +; RUN: opt %loadNPMPolly -pass-remarks-analysis="polly-scops" '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=SCOP ; ; CHECK: remark: <unknown>:0:0: SCoP begins here. ; CHECK-NEXT: remark: <unknown>:0:0: Use user assumption: [N] -> { : N >= 2 } diff --git a/polly/test/ScopInfo/user_provided_non_dominating_assumptions.ll b/polly/test/ScopInfo/user_provided_non_dominating_assumptions.ll index a6eed5df2063e..3e7883db48fcb 100644 --- a/polly/test/ScopInfo/user_provided_non_dominating_assumptions.ll +++ b/polly/test/ScopInfo/user_provided_non_dominating_assumptions.ll @@ -1,4 +1,5 @@ -; RUN: opt %loadNPMPolly -pass-remarks-analysis=polly-scops '-passes=polly-custom<scops>' -polly-print-scops -polly-precise-inbounds -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -pass-remarks-analysis="polly-scops" '-passes=print<polly-function-scops>' \ +; RUN: -polly-precise-inbounds -disable-output < %s 2>&1 | FileCheck %s ; ; CHECK: remark: <unknown>:0:0: SCoP begins here. ; CHECK-NEXT: remark: <unknown>:0:0: Use user assumption: [i, N, M] -> { : N <= i or (N > i and N >= 0) } @@ -17,7 +18,8 @@ ; -; RUN: opt %loadNPMPolly -pass-remarks-analysis=polly-scops '-passes=polly-custom<scops>' -polly-print-scops -polly-precise-inbounds -disable-output -pass-remarks-output=%t.yaml < %s 2>&1 +; RUN: opt %loadNPMPolly -pass-remarks-analysis="polly-scops" '-passes=print<polly-function-scops>' \ +; RUN: -polly-precise-inbounds -disable-output < %s 2>&1 -pass-remarks-output=%t.yaml ; RUN: cat %t.yaml | FileCheck -check-prefix=YAML %s ; YAML: --- !Analysis ; YAML: Pass: polly-scops diff --git a/polly/test/ScopInfo/variant_base_pointer.ll b/polly/test/ScopInfo/variant_base_pointer.ll index 36beaf5f0f016..32cb114fab05a 100644 --- a/polly/test/ScopInfo/variant_base_pointer.ll +++ b/polly/test/ScopInfo/variant_base_pointer.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly -polly-ignore-aliasing -polly-invariant-load-hoisting=true '-passes=polly-custom<scops>' -polly-print-detect -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s -; RUN: opt %loadNPMPolly -polly-ignore-aliasing -polly-invariant-load-hoisting=true '-passes=polly<no-default-opts>' -disable-output < %s +; RUN: opt %loadNPMPolly -polly-ignore-aliasing -polly-invariant-load-hoisting=true '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-ignore-aliasing -polly-invariant-load-hoisting=true -passes=polly-codegen -disable-output < %s ; ; %tmp is added to the list of required hoists by -polly-scops and just ; assumed to be hoisted. 
Only -polly-scops recognizes it to be unhoistable diff --git a/polly/test/ScopInfo/variant_load_empty_domain.ll b/polly/test/ScopInfo/variant_load_empty_domain.ll index 5602c443b25d3..6a28bd0405fdd 100644 --- a/polly/test/ScopInfo/variant_load_empty_domain.ll +++ b/polly/test/ScopInfo/variant_load_empty_domain.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s ; ; CHECK: Invariant Accesses: { ; CHECK-NEXT: } diff --git a/polly/test/ScopInfo/wraping_signed_expr_0.ll b/polly/test/ScopInfo/wraping_signed_expr_0.ll index 3a663f57c2774..f5f06bfd7d336 100644 --- a/polly/test/ScopInfo/wraping_signed_expr_0.ll +++ b/polly/test/ScopInfo/wraping_signed_expr_0.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s ; ; void f(int *A, char N, char p) { ; for (char i = 0; i < N; i++) { diff --git a/polly/test/ScopInfo/wraping_signed_expr_1.ll b/polly/test/ScopInfo/wraping_signed_expr_1.ll index 8963e86bc6157..e04257acc2010 100644 --- a/polly/test/ScopInfo/wraping_signed_expr_1.ll +++ b/polly/test/ScopInfo/wraping_signed_expr_1.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s ; ; void f(long *A, long N, long p) { ; for (long i = 0; i < N; i++) diff --git a/polly/test/ScopInfo/wraping_signed_expr_2.ll b/polly/test/ScopInfo/wraping_signed_expr_2.ll index 97cb2c05b16a0..2511c0d646086 100644 --- a/polly/test/ScopInfo/wraping_signed_expr_2.ll +++ b/polly/test/ScopInfo/wraping_signed_expr_2.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s ; ; void f(int *A, int N, int p) { ; for (int i = 0; i < N; i++) diff --git a/polly/test/ScopInfo/wraping_signed_expr_3.ll b/polly/test/ScopInfo/wraping_signed_expr_3.ll index 50e2eda2ce574..2106bdf4c0686 100644 --- a/polly/test/ScopInfo/wraping_signed_expr_3.ll +++ b/polly/test/ScopInfo/wraping_signed_expr_3.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s ; ; void f(int *A, int N, int p) { ; for (int i = 0; i < N; i++) diff --git a/polly/test/ScopInfo/wraping_signed_expr_4.ll b/polly/test/ScopInfo/wraping_signed_expr_4.ll index 4ddb43a01bf24..3ea17f6e266bf 100644 --- a/polly/test/ScopInfo/wraping_signed_expr_4.ll +++ b/polly/test/ScopInfo/wraping_signed_expr_4.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s ; ; void f(char *A, char N, char p) { ; for (char i = 0; i < N; i++) diff --git a/polly/test/ScopInfo/wraping_signed_expr_5.ll b/polly/test/ScopInfo/wraping_signed_expr_5.ll index 440d32bab72a5..90706a3d3bc46 100644 --- 
a/polly/test/ScopInfo/wraping_signed_expr_5.ll +++ b/polly/test/ScopInfo/wraping_signed_expr_5.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s ; ; We should not generate runtime check for ((int)r1 + (int)r2) as it is known not ; to overflow. However (p + q) can, thus checks are needed. diff --git a/polly/test/ScopInfo/wraping_signed_expr_6.ll b/polly/test/ScopInfo/wraping_signed_expr_6.ll index 7bec9533440fb..9cf67fc101805 100644 --- a/polly/test/ScopInfo/wraping_signed_expr_6.ll +++ b/polly/test/ScopInfo/wraping_signed_expr_6.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s ; ; CHECK: Invalid Context: ; CHECK: [N] -> { : N >= 129 } diff --git a/polly/test/ScopInfo/wraping_signed_expr_7.ll b/polly/test/ScopInfo/wraping_signed_expr_7.ll index 2d836e191f858..d18d2b2df3e12 100644 --- a/polly/test/ScopInfo/wraping_signed_expr_7.ll +++ b/polly/test/ScopInfo/wraping_signed_expr_7.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s ; ; CHECK: Invalid Context: ; CHECK: [N] -> { : N >= 129 } diff --git a/polly/test/ScopInfo/wraping_signed_expr_slow_1.ll b/polly/test/ScopInfo/wraping_signed_expr_slow_1.ll index 4964a123d0be1..84626861bd39b 100644 --- a/polly/test/ScopInfo/wraping_signed_expr_slow_1.ll +++ b/polly/test/ScopInfo/wraping_signed_expr_slow_1.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s ; ; This checks that the no-wraps checks will be computed fast as some example ; already showed huge slowdowns even though the inbounds and nsw flags were diff --git a/polly/test/ScopInfo/wraping_signed_expr_slow_2.ll b/polly/test/ScopInfo/wraping_signed_expr_slow_2.ll index a6db7c06d072c..b4dd567bafa6b 100644 --- a/polly/test/ScopInfo/wraping_signed_expr_slow_2.ll +++ b/polly/test/ScopInfo/wraping_signed_expr_slow_2.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s ; ; This checks that the no-wraps checks will be computed fast as some example ; already showed huge slowdowns even though the inbounds and nsw flags were diff --git a/polly/test/ScopInfo/zero_ext_of_truncate.ll b/polly/test/ScopInfo/zero_ext_of_truncate.ll index b509951bbf0d5..cbe4af05169f8 100644 --- a/polly/test/ScopInfo/zero_ext_of_truncate.ll +++ b/polly/test/ScopInfo/zero_ext_of_truncate.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-invariant-load-hoisting=true '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-invariant-load-hoisting=true '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s ; ; void f(unsigned *restrict I, unsigned *restrict A, unsigned N, unsigned M) { ; for (unsigned i = 0; i < 
N; i++) { diff --git a/polly/test/ScopInfo/zero_ext_of_truncate_2.ll b/polly/test/ScopInfo/zero_ext_of_truncate_2.ll index ea3356e01cc9f..b306045276765 100644 --- a/polly/test/ScopInfo/zero_ext_of_truncate_2.ll +++ b/polly/test/ScopInfo/zero_ext_of_truncate_2.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-invariant-load-hoisting=true '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-invariant-load-hoisting=true '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s ; ; void f(unsigned long *restrict I, unsigned *restrict A, unsigned N) { ; for (unsigned i = 0; i < N; i++) { diff --git a/polly/test/ScopInfo/zero_ext_space_mismatch.ll b/polly/test/ScopInfo/zero_ext_space_mismatch.ll index 9fd1afae4b889..3c02ae295b5ba 100644 --- a/polly/test/ScopInfo/zero_ext_space_mismatch.ll +++ b/polly/test/ScopInfo/zero_ext_space_mismatch.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s ; ; CHECK: Assumed Context: ; CHECK-NEXT: [dim] -> { : dim > 0 } diff --git a/polly/test/ScopInliner/ignore-declares.ll b/polly/test/ScopInliner/ignore-declares.ll index 85198b728a9bb..5c0cfa103f0bf 100644 --- a/polly/test/ScopInliner/ignore-declares.ll +++ b/polly/test/ScopInliner/ignore-declares.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-detect-full-functions '-passes=cgscc(polly-inline),polly-custom<print-scops>' -disable-output < %s +; RUN: opt %loadNPMPolly -polly-detect-full-functions '-passes=cgscc(polly-inline),function(print<polly-function-scops>)' -disable-output < %s ; Check that we do not crash if there are declares. We should skip function ; declarations and not try to query for domtree. diff --git a/polly/test/ScopInliner/invariant-load-func.ll b/polly/test/ScopInliner/invariant-load-func.ll index 6046fc0f38650..58c556a455fb9 100644 --- a/polly/test/ScopInliner/invariant-load-func.ll +++ b/polly/test/ScopInliner/invariant-load-func.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-detect-full-functions -polly-invariant-load-hoisting '-passes=cgscc(polly-inline),polly-custom<print-scops>' -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-detect-full-functions -polly-invariant-load-hoisting '-passes=cgscc(polly-inline),function(print<polly-function-scops>)' -disable-output < %s 2>&1 | FileCheck %s ; Check that we inline a function that requires invariant load hoisting ; correctly. diff --git a/polly/test/ScopInliner/simple-inline-loop.ll b/polly/test/ScopInliner/simple-inline-loop.ll index 77a5ddda93adc..f12798a3d831a 100644 --- a/polly/test/ScopInliner/simple-inline-loop.ll +++ b/polly/test/ScopInliner/simple-inline-loop.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-detect-full-functions '-passes=cgscc(polly-inline),polly-custom<print-scops>' -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly -polly-detect-full-functions '-passes=cgscc(polly-inline),function(print<polly-function-scops>)' -disable-output < %s 2>&1 | FileCheck %s ; Check that we get the 2 nested loops by inlining `to_be_inlined` into ; `inline_site`. 
diff --git a/polly/test/Simplify/coalesce_3partials.ll b/polly/test/Simplify/coalesce_3partials.ll index 5411b6e430c66..4112787e51bfa 100644 --- a/polly/test/Simplify/coalesce_3partials.ll +++ b/polly/test/Simplify/coalesce_3partials.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<import-jscop;simplify-0>' -polly-print-simplify -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck -match-full-lines %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-import-jscop,print<polly-simplify>' -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck -match-full-lines %s ; ; Combine 3 partial accesses into one. ; diff --git a/polly/test/Simplify/coalesce_disjointelements.ll b/polly/test/Simplify/coalesce_disjointelements.ll index 888daeff39d8d..b140f287e27f7 100644 --- a/polly/test/Simplify/coalesce_disjointelements.ll +++ b/polly/test/Simplify/coalesce_disjointelements.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<import-jscop;simplify-0>' -polly-print-simplify -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck -match-full-lines %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-import-jscop,print<polly-simplify>' -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck -match-full-lines %s ; ; Combine four partial stores into two. ; The stores write to the same array, but never the same element. diff --git a/polly/test/Simplify/coalesce_overlapping.ll b/polly/test/Simplify/coalesce_overlapping.ll index f492222461b34..ee716fc12f095 100644 --- a/polly/test/Simplify/coalesce_overlapping.ll +++ b/polly/test/Simplify/coalesce_overlapping.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<import-jscop;simplify-0>' -polly-print-simplify -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck -match-full-lines %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-import-jscop,print<polly-simplify>' -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck -match-full-lines %s ; ; Combine two partial stores (with overlapping domains) into one. ; diff --git a/polly/test/Simplify/coalesce_partial.ll b/polly/test/Simplify/coalesce_partial.ll index 4df91d43fc46d..aea691f43e934 100644 --- a/polly/test/Simplify/coalesce_partial.ll +++ b/polly/test/Simplify/coalesce_partial.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<import-jscop;simplify-0>' -polly-print-simplify -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck -match-full-lines %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-import-jscop,print<polly-simplify>' -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck -match-full-lines %s ; ; Combine two partial stores (with disjoint domains) into one. 
; diff --git a/polly/test/Simplify/dead_access_load.ll b/polly/test/Simplify/dead_access_load.ll index 399c02381c890..66f94795ea6e4 100644 --- a/polly/test/Simplify/dead_access_load.ll +++ b/polly/test/Simplify/dead_access_load.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<simplify>' -polly-print-simplify -disable-output -aa-pipeline=basic-aa < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb "-passes=scop(print<polly-simplify>)" -disable-output -aa-pipeline=basic-aa < %s | FileCheck %s -match-full-lines ; ; Remove a dead load-instruction ; (an load whose result is not used anywhere) diff --git a/polly/test/Simplify/dead_access_phi.ll b/polly/test/Simplify/dead_access_phi.ll index 9344a284b311a..fb40e4cc45b35 100644 --- a/polly/test/Simplify/dead_access_phi.ll +++ b/polly/test/Simplify/dead_access_phi.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<simplify>' -polly-print-simplify -disable-output -aa-pipeline=basic-aa < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb "-passes=scop(print<polly-simplify>)" -disable-output -aa-pipeline=basic-aa < %s | FileCheck %s -match-full-lines ; ; Remove a dead PHI write/read pair ; (accesses that are effectively not used) diff --git a/polly/test/Simplify/dead_access_value.ll b/polly/test/Simplify/dead_access_value.ll index 6db242c97dac0..a8ff7f28542b7 100644 --- a/polly/test/Simplify/dead_access_value.ll +++ b/polly/test/Simplify/dead_access_value.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<simplify>' -polly-print-simplify -disable-output -aa-pipeline=basic-aa < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb "-passes=scop(print<polly-simplify>)" -disable-output -aa-pipeline=basic-aa < %s | FileCheck %s -match-full-lines ; ; Remove a dead value write/read pair ; (accesses that are effectively not used) diff --git a/polly/test/Simplify/dead_instruction.ll b/polly/test/Simplify/dead_instruction.ll index 785b5ba154187..81e55e1c7bb30 100644 --- a/polly/test/Simplify/dead_instruction.ll +++ b/polly/test/Simplify/dead_instruction.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<simplify>' -polly-print-simplify -disable-output -aa-pipeline=basic-aa < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb "-passes=scop(print<polly-simplify>)" -disable-output -aa-pipeline=basic-aa < %s | FileCheck %s -match-full-lines ; ; Remove a dead instruction ; (an instruction whose result is not used anywhere) diff --git a/polly/test/Simplify/emptyaccessdomain.ll b/polly/test/Simplify/emptyaccessdomain.ll index 917ae7f7d2c94..9b06cec965a9d 100644 --- a/polly/test/Simplify/emptyaccessdomain.ll +++ b/polly/test/Simplify/emptyaccessdomain.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;simplify-0>' -polly-print-simplify -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-simplify>' -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck %s -match-full-lines ; ; for (int j = 0; j < n; j += 1) { ; A[0] = 42.0; diff --git a/polly/test/Simplify/exit_phi_accesses-2.ll b/polly/test/Simplify/exit_phi_accesses-2.ll index d56fed4848ff3..379c7e0ace0a3 100644 --- 
a/polly/test/Simplify/exit_phi_accesses-2.ll +++ b/polly/test/Simplify/exit_phi_accesses-2.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<simplify>' -polly-print-scops -polly-print-simplify -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>,scop(print<polly-simplify>)' -disable-output < %s | FileCheck %s ; ; The use of %sum.next by %phi counts as an escaping use. ; Don't remove the scalar write of %sum.next. diff --git a/polly/test/Simplify/func-b320a7.ll b/polly/test/Simplify/func-b320a7.ll index 65aa9cd28314e..5aa2caba95cfc 100644 --- a/polly/test/Simplify/func-b320a7.ll +++ b/polly/test/Simplify/func-b320a7.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<optree;simplify>' -polly-print-simplify -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly '-passes=print<polly-simplify>,polly-optree' -disable-output < %s | FileCheck %s -match-full-lines ; llvm.org/PR47098 ; Use-after-free by reference to Stmt remaining in InstStmtMap after removing it has been removed by Scop::simplifyScop. diff --git a/polly/test/Simplify/gemm.ll b/polly/test/Simplify/gemm.ll index 6e3a43e0ebbad..5120de2db7677 100644 --- a/polly/test/Simplify/gemm.ll +++ b/polly/test/Simplify/gemm.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;simplify-0>' -polly-print-simplify -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-simplify>' -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck %s ; ; void gemm(float A[][1024], float B[][1024], float C[][1024]) { ; for (long i = 0; i < 1024; i++) diff --git a/polly/test/Simplify/nocoalesce_differentvalues.ll b/polly/test/Simplify/nocoalesce_differentvalues.ll index cba62549227ae..33d04b2f96de8 100644 --- a/polly/test/Simplify/nocoalesce_differentvalues.ll +++ b/polly/test/Simplify/nocoalesce_differentvalues.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<import-jscop;simplify-0>' -polly-print-simplify -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck -match-full-lines %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-import-jscop,print<polly-simplify>' -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck -match-full-lines %s ; ; Do not combine stores that write different values. ; diff --git a/polly/test/Simplify/nocoalesce_elementmismatch.ll b/polly/test/Simplify/nocoalesce_elementmismatch.ll index b589d13779e52..608b055e691df 100644 --- a/polly/test/Simplify/nocoalesce_elementmismatch.ll +++ b/polly/test/Simplify/nocoalesce_elementmismatch.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<import-jscop;simplify-0>' -polly-print-simplify -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck -match-full-lines %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-import-jscop,print<polly-simplify>' -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck -match-full-lines %s ; ; Do not combine stores that do not write to different elements in the ; same instance. 
diff --git a/polly/test/Simplify/nocoalesce_readbetween.ll b/polly/test/Simplify/nocoalesce_readbetween.ll index b61ad9d8031e0..e112b036cd778 100644 --- a/polly/test/Simplify/nocoalesce_readbetween.ll +++ b/polly/test/Simplify/nocoalesce_readbetween.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<import-jscop;simplify-0>' -polly-print-simplify -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck -match-full-lines %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-import-jscop,print<polly-simplify>' -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck -match-full-lines %s ; ; Do not combine stores if there is a read between them. ; Note: The read between is unused, so will be removed by markAndSweep. diff --git a/polly/test/Simplify/nocoalesce_writebetween.ll b/polly/test/Simplify/nocoalesce_writebetween.ll index be7d159554034..fd5eee52eaf5c 100644 --- a/polly/test/Simplify/nocoalesce_writebetween.ll +++ b/polly/test/Simplify/nocoalesce_writebetween.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<import-jscop;simplify-0>' -polly-print-simplify -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck -match-full-lines %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-import-jscop,print<polly-simplify>' -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck -match-full-lines %s ; ; Do not combine stores if there is a write between them. ; diff --git a/polly/test/Simplify/notdead_region_exitphi.ll b/polly/test/Simplify/notdead_region_exitphi.ll index 1bd9bfe10a99d..42fafb446cea3 100644 --- a/polly/test/Simplify/notdead_region_exitphi.ll +++ b/polly/test/Simplify/notdead_region_exitphi.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<simplify>' -polly-print-simplify -disable-output -aa-pipeline=basic-aa < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly "-passes=scop(print<polly-simplify>)" -disable-output -aa-pipeline=basic-aa < %s | FileCheck %s -match-full-lines ; ; Do not remove dependencies of a phi node in a region's exit block. ; diff --git a/polly/test/Simplify/notdead_region_innerphi.ll b/polly/test/Simplify/notdead_region_innerphi.ll index b59d6dc60b089..966448c9884b2 100644 --- a/polly/test/Simplify/notdead_region_innerphi.ll +++ b/polly/test/Simplify/notdead_region_innerphi.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<simplify>' -polly-print-simplify -disable-output -aa-pipeline=basic-aa < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly "-passes=scop(print<polly-simplify>)" -disable-output -aa-pipeline=basic-aa < %s | FileCheck %s -match-full-lines ; ; Do not remove dependencies of a phi node within a region statement (%phi). 
; diff --git a/polly/test/Simplify/notredundant_region_loop.ll b/polly/test/Simplify/notredundant_region_loop.ll index 859bd459f72d6..88f6c41521739 100644 --- a/polly/test/Simplify/notredundant_region_loop.ll +++ b/polly/test/Simplify/notredundant_region_loop.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;simplify-0>' -polly-print-simplify -polly-import-jscop-postfix=transformed -polly-allow-nonaffine-loops -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-simplify>' -polly-import-jscop-postfix=transformed -polly-allow-nonaffine-loops -disable-output < %s | FileCheck %s -match-full-lines ; ; Do not remove the store in region_entry. It can be executed multiple times ; due to being part of a non-affine loop. diff --git a/polly/test/Simplify/notredundant_region_middle.ll b/polly/test/Simplify/notredundant_region_middle.ll index a742ea889fb1f..43c05436809ba 100644 --- a/polly/test/Simplify/notredundant_region_middle.ll +++ b/polly/test/Simplify/notredundant_region_middle.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<simplify>' -polly-print-simplify -disable-output -aa-pipeline=basic-aa < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly "-passes=scop(print<polly-simplify>)" -disable-output -aa-pipeline=basic-aa < %s | FileCheck %s -match-full-lines ; ; Do not remove redundant stores in the middle of region statements. ; The store in region_true could be removed, but in practice we do try to diff --git a/polly/test/Simplify/notredundant_synthesizable_unknownit.ll b/polly/test/Simplify/notredundant_synthesizable_unknownit.ll index 8542b7927f860..8a9aec8be9e05 100644 --- a/polly/test/Simplify/notredundant_synthesizable_unknownit.ll +++ b/polly/test/Simplify/notredundant_synthesizable_unknownit.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<simplify>' -polly-print-simplify -disable-output -aa-pipeline=basic-aa < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb "-passes=scop(print<polly-simplify>)" -disable-output -aa-pipeline=basic-aa < %s | FileCheck %s -match-full-lines ; ; Do not remove the scalar value write of %i.trunc in inner.for. ; It is used by body. diff --git a/polly/test/Simplify/out-of-scop-use-in-region-entry-phi-node.ll b/polly/test/Simplify/out-of-scop-use-in-region-entry-phi-node.ll index 06b082c3f81fa..7218f328f9ca3 100644 --- a/polly/test/Simplify/out-of-scop-use-in-region-entry-phi-node.ll +++ b/polly/test/Simplify/out-of-scop-use-in-region-entry-phi-node.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<simplify>' -polly-print-scops -polly-print-simplify -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>,scop(print<polly-simplify>)' -disable-output < %s 2>&1 | FileCheck %s ; ; %tmp5 must keep the Value WRITE MemoryAccess, because as an incoming value of ; %tmp4, it is an "external use". 
diff --git a/polly/test/Simplify/overwritten.ll b/polly/test/Simplify/overwritten.ll index bc5b2dffd443d..eccdd8044d073 100644 --- a/polly/test/Simplify/overwritten.ll +++ b/polly/test/Simplify/overwritten.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<simplify>' -polly-print-simplify -disable-output -aa-pipeline=basic-aa < %s | FileCheck -match-full-lines %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb "-passes=scop(print<polly-simplify>)" -disable-output -aa-pipeline=basic-aa < %s | FileCheck -match-full-lines %s ; ; Remove a store that is overwritten by another store in the same statement. ; diff --git a/polly/test/Simplify/overwritten_3phi.ll b/polly/test/Simplify/overwritten_3phi.ll index 861c9acda3e9c..4cee4f13d26d0 100644 --- a/polly/test/Simplify/overwritten_3phi.ll +++ b/polly/test/Simplify/overwritten_3phi.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;simplify-0>' -polly-print-simplify -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck -match-full-lines %s +; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-simplify>' -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck -match-full-lines %s ; ; Remove identical writes ; (two stores in the same statement that write the same value to the same diff --git a/polly/test/Simplify/overwritten_3store.ll b/polly/test/Simplify/overwritten_3store.ll index cfd5a08143d60..c9f06c85dba53 100644 --- a/polly/test/Simplify/overwritten_3store.ll +++ b/polly/test/Simplify/overwritten_3store.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-custom<simplify>' -polly-print-simplify -disable-output -aa-pipeline=basic-aa < %s | FileCheck -match-full-lines %s +; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb "-passes=scop(print<polly-simplify>)" -disable-output -aa-pipeline=basic-aa < %s | FileCheck -match-full-lines %s ; ; Remove a store that is overwritten by another store in the same statement. ; Check that even multiple stores are removed. diff --git a/polly/test/Simplify/overwritten_implicit_and_explicit.ll b/polly/test/Simplify/overwritten_implicit_and_explicit.ll index 306e726e7808a..b1b7635e26263 100644 --- a/polly/test/Simplify/overwritten_implicit_and_explicit.ll +++ b/polly/test/Simplify/overwritten_implicit_and_explicit.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;simplify-0>' -polly-print-simplify -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck -match-full-lines %s +; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-simplify>' -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck -match-full-lines %s ; ; Remove a store that is overwritten by another store in the same statement. ; Check that this works even if one of the writes is a scalar MemoryKind. diff --git a/polly/test/Simplify/overwritten_loadbetween.ll b/polly/test/Simplify/overwritten_loadbetween.ll index 170838ddb8a1a..cdca2f11531e7 100644 --- a/polly/test/Simplify/overwritten_loadbetween.ll +++ b/polly/test/Simplify/overwritten_loadbetween.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<simplify>' -polly-print-simplify -disable-output -aa-pipeline=basic-aa < %s | FileCheck -match-full-lines %s +; RUN: opt %loadNPMPolly "-passes=scop(print<polly-simplify>)" -disable-output -aa-pipeline=basic-aa < %s | FileCheck -match-full-lines %s ; ; Do not remove overwrites when the value is read before. 
; diff --git a/polly/test/Simplify/overwritten_scalar.ll b/polly/test/Simplify/overwritten_scalar.ll index a1e7da40554d5..700adb6aed2ec 100644 --- a/polly/test/Simplify/overwritten_scalar.ll +++ b/polly/test/Simplify/overwritten_scalar.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;simplify-0>' -polly-print-simplify -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck -match-full-lines %s +; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-simplify>' -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck -match-full-lines %s ; ; Remove identical writes ; (two stores in the same statement that write the same value to the same diff --git a/polly/test/Simplify/pass_existence.ll b/polly/test/Simplify/pass_existence.ll index 6d9c99f9dc270..4d1d800b2a80b 100644 --- a/polly/test/Simplify/pass_existence.ll +++ b/polly/test/Simplify/pass_existence.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -disable-output '-passes=polly-custom<simplify>' -polly-print-simplify -aa-pipeline=basic-aa < %s < %s | FileCheck %s +; RUN: opt %loadNPMPolly -disable-output "-passes=scop(print<polly-simplify>)" < %s -aa-pipeline=basic-aa < %s | FileCheck %s ; ; Simple test for the existence of the Simplify pass. ; diff --git a/polly/test/Simplify/phi_in_regionstmt.ll b/polly/test/Simplify/phi_in_regionstmt.ll index ba1cffee1a0df..2bb05738955a3 100644 --- a/polly/test/Simplify/phi_in_regionstmt.ll +++ b/polly/test/Simplify/phi_in_regionstmt.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<simplify>' -polly-print-simplify -disable-output -aa-pipeline=basic-aa < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly "-passes=scop(print<polly-simplify>)" -disable-output -aa-pipeline=basic-aa < %s | FileCheck %s -match-full-lines ; ; The PHINode %cond91.sink.sink.us.sink.6 is in the middle of a region ; statement. 
diff --git a/polly/test/Simplify/pr33323.ll b/polly/test/Simplify/pr33323.ll index 5130eb8488ca2..22921d5fba509 100644 --- a/polly/test/Simplify/pr33323.ll +++ b/polly/test/Simplify/pr33323.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;simplify-0>' -polly-print-simplify -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-simplify>' -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck %s ; ; llvm.org/PR33323 ; diff --git a/polly/test/Simplify/redundant.ll b/polly/test/Simplify/redundant.ll index f2489a74eb899..540e537460e54 100644 --- a/polly/test/Simplify/redundant.ll +++ b/polly/test/Simplify/redundant.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<simplify>' -polly-print-simplify -disable-output -aa-pipeline=basic-aa < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly "-passes=scop(print<polly-simplify>)" -disable-output -aa-pipeline=basic-aa < %s | FileCheck %s -match-full-lines ; ; Remove redundant store (a store that writes the same value already ; at the destination) diff --git a/polly/test/Simplify/redundant_differentindex.ll b/polly/test/Simplify/redundant_differentindex.ll index efd20e90ae748..5ce25836dedbd 100644 --- a/polly/test/Simplify/redundant_differentindex.ll +++ b/polly/test/Simplify/redundant_differentindex.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<simplify>' -polly-print-simplify -disable-output -aa-pipeline=basic-aa < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly "-passes=scop(print<polly-simplify>)" -disable-output -aa-pipeline=basic-aa < %s | FileCheck %s -match-full-lines ; ; A store that has a different index than the load it is storing is ; not redundant. diff --git a/polly/test/Simplify/redundant_partialwrite.ll b/polly/test/Simplify/redundant_partialwrite.ll index 357b63206b0f5..ac5ca907fff6f 100644 --- a/polly/test/Simplify/redundant_partialwrite.ll +++ b/polly/test/Simplify/redundant_partialwrite.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-import-jscop-postfix=transformed '-passes=polly-custom<import-jscop;simplify>' -polly-print-import-jscop -polly-print-simplify -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadPolly -polly-import-jscop-postfix=transformed -polly-print-import-jscop -polly-print-simplify -disable-output < %s | FileCheck %s -match-full-lines ; ; Remove a redundant store, if its partial domain is a subset of the ; read's domain. diff --git a/polly/test/Simplify/redundant_region.ll b/polly/test/Simplify/redundant_region.ll index c60d28b7039dd..927aac6c4af05 100644 --- a/polly/test/Simplify/redundant_region.ll +++ b/polly/test/Simplify/redundant_region.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;simplify-0>' -polly-print-simplify -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-simplify>' -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck %s -match-full-lines ; ; Remove redundant store (a store that writes the same value already ; at the destination) in a region. 
diff --git a/polly/test/Simplify/redundant_region_scalar.ll b/polly/test/Simplify/redundant_region_scalar.ll index 3de50c04b614f..72d570d46bdce 100644 --- a/polly/test/Simplify/redundant_region_scalar.ll +++ b/polly/test/Simplify/redundant_region_scalar.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;simplify-0>' -polly-print-simplify -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-simplify>' -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck %s -match-full-lines ; ; Remove redundant store (a store that writes the same value already ; at the destination) in a region. diff --git a/polly/test/Simplify/redundant_scalarwrite.ll b/polly/test/Simplify/redundant_scalarwrite.ll index 13ca40f8e1b87..84cb971be11fd 100644 --- a/polly/test/Simplify/redundant_scalarwrite.ll +++ b/polly/test/Simplify/redundant_scalarwrite.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;simplify-0>' -polly-print-simplify -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-simplify>' -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck %s -match-full-lines ; ; Remove redundant scalar stores. ; diff --git a/polly/test/Simplify/redundant_storebetween.ll b/polly/test/Simplify/redundant_storebetween.ll index 47d9cfde2d3ce..6540d7751e469 100644 --- a/polly/test/Simplify/redundant_storebetween.ll +++ b/polly/test/Simplify/redundant_storebetween.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<simplify>' -polly-print-simplify -disable-output -aa-pipeline=basic-aa < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly "-passes=scop(print<polly-simplify>)" -disable-output -aa-pipeline=basic-aa < %s | FileCheck %s -match-full-lines ; ; Don't remove store where there is another store to the same target ; in-between them. diff --git a/polly/test/Simplify/scalability1.ll b/polly/test/Simplify/scalability1.ll index 969aade275af2..c6e36f9dcdefb 100644 --- a/polly/test/Simplify/scalability1.ll +++ b/polly/test/Simplify/scalability1.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-ignore-inbounds '-passes=polly-custom<simplify>' -polly-print-simplify -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly -polly-ignore-inbounds '-passes=print<polly-simplify>' -disable-output < %s | FileCheck %s -match-full-lines ; ; Test scalability. ; diff --git a/polly/test/Simplify/scalability2.ll b/polly/test/Simplify/scalability2.ll index 7951094867f2f..adcf9eef348a9 100644 --- a/polly/test/Simplify/scalability2.ll +++ b/polly/test/Simplify/scalability2.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly -polly-ignore-inbounds '-passes=polly-custom<simplify>' -polly-print-simplify -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly -polly-ignore-inbounds '-passes=print<polly-simplify>' -disable-output < %s | FileCheck %s -match-full-lines ; ; Test scalability. 
; diff --git a/polly/test/Simplify/sweep_mapped_phi.ll b/polly/test/Simplify/sweep_mapped_phi.ll index ad41f2566e2b5..495d77a22f618 100644 --- a/polly/test/Simplify/sweep_mapped_phi.ll +++ b/polly/test/Simplify/sweep_mapped_phi.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;simplify-0>' -polly-print-simplify -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-simplify>' -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck %s -match-full-lines ; ; Map %phi to A[j], so the scalar write in Stmt_for_bodyA can be removed. ; diff --git a/polly/test/Simplify/sweep_mapped_value.ll b/polly/test/Simplify/sweep_mapped_value.ll index a50c013ac7917..c83941a8f0ba5 100644 --- a/polly/test/Simplify/sweep_mapped_value.ll +++ b/polly/test/Simplify/sweep_mapped_value.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<import-jscop;simplify-0>' -polly-print-simplify -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-simplify>' -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck %s -match-full-lines ; ; Map %val to A[j], so the scalar write on Stmt_for_bodyB can be removed. ; diff --git a/polly/test/Simplify/ununsed_read_in_region_entry.ll b/polly/test/Simplify/ununsed_read_in_region_entry.ll index 4c05de975fdf8..f2436c263a96a 100644 --- a/polly/test/Simplify/ununsed_read_in_region_entry.ll +++ b/polly/test/Simplify/ununsed_read_in_region_entry.ll @@ -1,5 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<simplify>' -polly-print-simplify -disable-output < %s | FileCheck %s -match-full-lines -; RUN: opt %loadNPMPolly '-passes=polly<no-default-opts;simplify>' -S < %s | FileCheck %s -check-prefix=CODEGEN +; RUN: opt %loadNPMPolly '-passes=print<polly-simplify>' -disable-output< %s | FileCheck %s -match-full-lines +; RUN: opt %loadNPMPolly '-passes=polly-simplify,polly-codegen' -S < %s | FileCheck %s -check-prefix=CODEGEN ; ; for (int i = 0; i < n; i+=1) { ; (void)A[0]; diff --git a/polly/test/Support/Plugins.ll b/polly/test/Support/Plugins.ll index b75dd872ad404..872a32fad4fed 100644 --- a/polly/test/Support/Plugins.ll +++ b/polly/test/Support/Plugins.ll @@ -1,4 +1,5 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<prepare;ast>' -polly-print-ast -S < %s | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=polly-prepare,scop(print<polly-ast>)' -S < %s \ +; RUN: | FileCheck %s ; This testcase tests plugin registration. Check-lines below serve to verify ; that the passes actually ran. 
diff --git a/polly/test/Support/exportjson.ll b/polly/test/Support/exportjson.ll index 6bdf5a4c33cf3..22cfea23534cb 100644 --- a/polly/test/Support/exportjson.ll +++ b/polly/test/Support/exportjson.ll @@ -1,6 +1,6 @@ ; RUN: rm -rf %t ; RUN: mkdir -p %t -; RUN: opt %loadNPMPolly -polly-import-jscop-dir=%t '-passes=polly-custom<export-jscop>' -disable-output < %s +; RUN: opt %loadNPMPolly -polly-import-jscop-dir=%t -polly -O2 -polly-export -S < %s ; RUN: FileCheck %s -input-file %t/exportjson___%entry.split---%return.jscop ; ; for (int j = 0; j < n; j += 1) { @@ -9,22 +9,28 @@ ; define void @exportjson(i32 %n, ptr noalias nonnull %A) { entry: - br label %entry.split + br label %for -entry.split: - %j.cmp1 = icmp sgt i32 %n, 0 - br i1 %j.cmp1, label %body.lr.ph, label %return +for: + %j = phi i32 [0, %entry], [%j.inc, %inc] + %j.cmp = icmp slt i32 %j, %n + br i1 %j.cmp, label %body, label %exit -body.lr.ph: - store double 4.200000e+01, ptr %A, align 8 + body: + store double 42.0, ptr %A + br label %inc + +inc: + %j.inc = add nuw nsw i32 %j, 1 + br label %for + +exit: br label %return return: ret void } -attributes #0 = { mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: write) } - ; CHECK: { ; CHECK-NEXT: "arrays": [ diff --git a/polly/test/Support/isl-args.ll b/polly/test/Support/isl-args.ll index 6c8b2e97682e8..206cb73bfc5ab 100644 --- a/polly/test/Support/isl-args.ll +++ b/polly/test/Support/isl-args.ll @@ -1,7 +1,7 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output -polly-isl-arg=-V < %s | FileCheck %s -match-full-lines --check-prefix=VERSION -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output -polly-isl-arg=-h < %s | FileCheck %s -match-full-lines --check-prefix=HELP -; RUN: not opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output -polly-isl-arg=-asdf < %s 2>&1 | FileCheck %s -match-full-lines --check-prefix=UNKNOWN -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -disable-output -polly-isl-arg=--schedule-algorithm=feautrier < %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output -polly-isl-arg=-V < %s | FileCheck %s -match-full-lines --check-prefix=VERSION +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output -polly-isl-arg=-h < %s | FileCheck %s -match-full-lines --check-prefix=HELP +; RUN: not opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output -polly-isl-arg=-asdf < %s 2>&1| FileCheck %s -match-full-lines --check-prefix=UNKNOWN +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output -polly-isl-arg=--schedule-algorithm=feautrier < %s ; VERSION: isl-{{.*}}-IMath-32 ; HELP: Usage: -polly-isl-arg [OPTION...] 
diff --git a/polly/test/Support/pipelineposition.ll b/polly/test/Support/pipelineposition.ll index 1ddfb5879ce16..a4506ba1d64ed 100644 --- a/polly/test/Support/pipelineposition.ll +++ b/polly/test/Support/pipelineposition.ll @@ -1,6 +1,8 @@ -; RUN: opt %loadNPMPolly -O3 -polly -polly-position=early -disable-output -polly-print-scops < %s 2>&1 | FileCheck %s --check-prefix=NOINLINE -; RUN: opt %loadNPMPolly -O3 -polly -polly-position=early -polly-run-inliner -disable-output -polly-print-scops < %s 2>&1 | FileCheck %s --check-prefix=INLINED1 -; RUN: opt %loadNPMPolly -O3 -polly -polly-position=before-vectorizer -disable-output -polly-print-scops < %s 2>&1 | FileCheck %s --check-prefix=INLINED3 +; RUN: opt %loadNPMPolly -O3 -polly -polly-position=early -disable-output -debug-only=polly-scops < %s 2>&1 | FileCheck %s --check-prefix=NOINLINE +; RUN: opt %loadNPMPolly -O3 -polly -polly-position=early -polly-run-inliner -disable-output -debug-only=polly-scops < %s 2>&1 | FileCheck %s --check-prefix=INLINED1 +; RUN: opt %loadNPMPolly -O3 -polly -polly-position=before-vectorizer -disable-output -debug-only=polly-scops < %s 2>&1 | FileCheck %s --check-prefix=INLINED3 +; +; REQUIRES: asserts ; ; void callee(int n, double A[], int i) { ; for (int j = 0; j < n; j += 1) diff --git a/polly/test/lit.site.cfg.in b/polly/test/lit.site.cfg.in index ca901b8825ced..f22063e796def 100644 --- a/polly/test/lit.site.cfg.in +++ b/polly/test/lit.site.cfg.in @@ -38,10 +38,14 @@ if config.llvm_polly_link_into_tools == '' or \ config.llvm_polly_link_into_tools.lower() == 'false' or \ config.llvm_polly_link_into_tools.lower() == 'notfound' or \ config.llvm_polly_link_into_tools.lower() == 'llvm_polly_link_into_tools-notfound': + config.substitutions.append(('%loadPolly', '-load ' + + config.polly_lib_dir + '/LLVMPolly@LLVM_SHLIBEXT@' + + commonOpts )) config.substitutions.append(('%loadNPMPolly', '-load-pass-plugin ' + config.polly_lib_dir + '/LLVMPolly@LLVM_SHLIBEXT@' + commonOpts )) else: + config.substitutions.append(('%loadPolly', commonOpts )) config.substitutions.append(('%loadNPMPolly', commonOpts )) import lit.llvm diff --git a/polly/test/polly.ll b/polly/test/polly.ll index 0f5467b0e654d..2e455b39a9cd4 100644 --- a/polly/test/polly.ll +++ b/polly/test/polly.ll @@ -1,4 +1,4 @@ -; RUN: opt %loadNPMPolly '-passes=polly-custom<scops>' -polly-print-scops -S < %s 2>&1 | FileCheck %s +; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -S < %s 2>&1 | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" define void @foo() nounwind { start: From bc08e69959ecefecc7ea41b648a659aa19c458c8 Mon Sep 17 00:00:00 2001 From: Naveen Seth Hanig <naveen.hanig@outlook.com> Date: Tue, 4 Nov 2025 07:24:56 +0530 Subject: [PATCH 110/313] [clang][modules] Fix crash in enum visibility lookup for C++20 header units (#166272) Fixes #165445. Fixes a crash when `ASTWriter::GenerateNameLookupTable` processes enum constants from C++20 header units. The special handling for enum constants, introduced in fccc6ee, doesn't account for declarations whose owning module is a C++20 header unit. It calls `isNamedModule()` on the result of `getTopLevelOwningNamedModule()`, which returns null for header units, causing a null pointer dereference. 
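As an illustration, here is a minimal sketch of the failure mode, reduced from the ASTDeclContextNameLookupTrait hunk below; the simplified control flow is an assumption made for readability, not the literal Clang code:

  // Reduced sketch (assumption: simplified from ASTWriter's name-lookup
  // table generation; the real condition is in the hunk below).
  if (auto *ECD = dyn_cast<EnumConstantDecl>(D)) {
    // For a C++20 header unit, getTopLevelOwningNamedModule() returns
    // nullptr, so the old guard dereferenced a null pointer:
    //   ECD->getOwningModule() &&
    //   ECD->getTopLevelOwningNamedModule()->isNamedModule()  // crash
    // The fix tests the pointer itself, which is null-safe and lets the
    // separate getOwningModule() check be dropped:
    if (DC.isFileContext() && ECD->getTopLevelOwningNamedModule()) {
      // ... enum-constant handling ...
    }
  }

The new regression test below exercises exactly this path by making the same enum visible both through a header unit and through named module interfaces.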
--- clang/lib/Serialization/ASTWriter.cpp | 3 +- ...rash-enum-visibility-with-header-unit.cppm | 46 +++++++++++++++++++ 2 files changed, 47 insertions(+), 2 deletions(-) create mode 100644 clang/test/Modules/crash-enum-visibility-with-header-unit.cppm diff --git a/clang/lib/Serialization/ASTWriter.cpp b/clang/lib/Serialization/ASTWriter.cpp index 3ac338e013deb..b1fd151790d96 100644 --- a/clang/lib/Serialization/ASTWriter.cpp +++ b/clang/lib/Serialization/ASTWriter.cpp @@ -4374,8 +4374,7 @@ class ASTDeclContextNameLookupTrait // parent of parent. We DON'T remove the enum constant from its parent. So // we don't need to care about merging problems here. if (auto *ECD = dyn_cast<EnumConstantDecl>(D); - ECD && DC.isFileContext() && ECD->getOwningModule() && - ECD->getTopLevelOwningNamedModule()->isNamedModule()) { + ECD && DC.isFileContext() && ECD->getTopLevelOwningNamedModule()) { if (llvm::all_of( DC.noload_lookup( cast<EnumDecl>(ECD->getDeclContext())->getDeclName()), diff --git a/clang/test/Modules/crash-enum-visibility-with-header-unit.cppm b/clang/test/Modules/crash-enum-visibility-with-header-unit.cppm new file mode 100644 index 0000000000000..90c57796dcf7e --- /dev/null +++ b/clang/test/Modules/crash-enum-visibility-with-header-unit.cppm @@ -0,0 +1,46 @@ +// Fixes #165445 + +// RUN: rm -rf %t +// RUN: mkdir -p %t +// RUN: split-file %s %t +// +// RUN: %clang_cc1 -std=c++20 -x c++-user-header %t/header.h \ +// RUN: -emit-header-unit -o %t/header.pcm +// +// RUN: %clang_cc1 -std=c++20 %t/A.cppm -fmodule-file=%t/header.pcm \ +// RUN: -emit-module-interface -o %t/A.pcm +// +// RUN: %clang_cc1 -std=c++20 %t/B.cppm -fmodule-file=%t/header.pcm \ +// RUN: -emit-module-interface -o %t/B.pcm +// +// RUN: %clang_cc1 -std=c++20 %t/use.cpp \ +// RUN: -fmodule-file=A=%t/A.pcm -fmodule-file=B=%t/B.pcm \ +// RUN: -fmodule-file=%t/header.pcm \ +// RUN: -verify -fsyntax-only + +//--- enum.h +enum E { Value }; + +//--- header.h +#include "enum.h" + +//--- A.cppm +module; +#include "enum.h" +export module A; + +auto e = Value; + +//--- B.cppm +export module B; +import "header.h"; + +auto e = Value; + +//--- use.cpp +// expected-no-diagnostics +import A; +import B; +#include "enum.h" + +auto e = Value; From 03e78a9976cf7a313c841732432002cbb4146d10 Mon Sep 17 00:00:00 2001 From: Qiu Chaofan <qcf@ecnelises.com> Date: Tue, 4 Nov 2025 10:18:53 +0800 Subject: [PATCH 111/313] [Clang][Sema] Check null after ExtractTypeForDeductionGuide (#165776) --- clang/docs/ReleaseNotes.rst | 1 + clang/lib/Sema/SemaTemplateDeductionGuide.cpp | 3 ++- clang/test/SemaTemplate/ctad.cpp | 12 ++++++++++++ 3 files changed, 15 insertions(+), 1 deletion(-) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index db695d86b5416..2c575cd1d4289 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -505,6 +505,7 @@ Bug Fixes to C++ Support nontrivial member when another member has an initializer. (#GH81774) - Fixed a template depth issue when parsing lambdas inside a type constraint. (#GH162092) - Diagnose unresolved overload sets in non-dependent compound requirements. (#GH51246) (#GH97753) +- Fix a crash when extracting unavailable member type from alias in template deduction. 
(#GH165560) Bug Fixes to AST Handling ^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/clang/lib/Sema/SemaTemplateDeductionGuide.cpp b/clang/lib/Sema/SemaTemplateDeductionGuide.cpp index ad50600f6399c..bfcd3978817ca 100644 --- a/clang/lib/Sema/SemaTemplateDeductionGuide.cpp +++ b/clang/lib/Sema/SemaTemplateDeductionGuide.cpp @@ -659,7 +659,8 @@ struct ConvertConstructorToDeductionGuideTransform { SemaRef, MaterializedTypedefs, NestedPattern, TransformingOuterPatterns ? &Args : nullptr) .transform(NewDI); - + if (!NewDI) + return nullptr; // Resolving a wording defect, we also inherit default arguments from the // constructor. ExprResult NewDefArg; diff --git a/clang/test/SemaTemplate/ctad.cpp b/clang/test/SemaTemplate/ctad.cpp index 1a575ea527006..60603f0c963a5 100644 --- a/clang/test/SemaTemplate/ctad.cpp +++ b/clang/test/SemaTemplate/ctad.cpp @@ -104,3 +104,15 @@ namespace ConvertDeducedTemplateArgument { auto x = C(D<A::B>()); } + +namespace pr165560 { +template <class T, class> struct S { + using A = T; + template <class> struct I { // expected-note{{candidate function template not viable: requires 1 argument, but 0 were provided}} \ + // expected-note{{implicit deduction guide declared as 'template <class> I(pr165560::S<int, int>::I<type-parameter-0-0>) -> pr165560::S<int, int>::I<type-parameter-0-0>'}} + I(typename A::F) {} // expected-error{{type 'A' (aka 'int') cannot be used prior to '::' because it has no members}} + }; +}; +S<int, int>::I i; // expected-error{{no viable constructor or deduction guide for deduction of template arguments of 'S<int, int>::I'}} \ + // expected-note{{while building implicit deduction guide first needed here}} +} From 749a0e1e410b4c47de68dee295ad2d08283050c4 Mon Sep 17 00:00:00 2001 From: Matt Arsenault <Matthew.Arsenault@amd.com> Date: Mon, 3 Nov 2025 18:19:58 -0800 Subject: [PATCH 112/313] update_llc_test_checks: Recognize thumbv7k triples (#166270) --- llvm/utils/UpdateTestChecks/asm.py | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/utils/UpdateTestChecks/asm.py b/llvm/utils/UpdateTestChecks/asm.py index 469e27facedb0..61f0d679f806d 100644 --- a/llvm/utils/UpdateTestChecks/asm.py +++ b/llvm/utils/UpdateTestChecks/asm.py @@ -576,6 +576,7 @@ def get_run_handler(triple): "armv7-apple-ios": (scrub_asm_arm_eabi, ASM_FUNCTION_ARM_IOS_RE), "armv7-apple-darwin": (scrub_asm_arm_eabi, ASM_FUNCTION_ARM_DARWIN_RE), "armv7k-apple-watchos": (scrub_asm_arm_eabi, ASM_FUNCTION_ARM_DARWIN_RE), + "thumbv7k-apple-watchos": (scrub_asm_arm_eabi, ASM_FUNCTION_ARM_DARWIN_RE), "thumb": (scrub_asm_arm_eabi, ASM_FUNCTION_ARM_RE), "thumb-macho": (scrub_asm_arm_eabi, ASM_FUNCTION_ARM_MACHO_RE), "thumbv5-macho": (scrub_asm_arm_eabi, ASM_FUNCTION_ARM_MACHO_RE), From 4d98ee2a22242bbe12f2fdb13ee49be697546259 Mon Sep 17 00:00:00 2001 From: Matt Arsenault <Matthew.Arsenault@amd.com> Date: Mon, 3 Nov 2025 18:20:24 -0800 Subject: [PATCH 113/313] ARM: Add watchos run line to llvm.sincos test (#166271) --- llvm/test/CodeGen/ARM/llvm.sincos.ll | 221 +++++++++++++++++++++++++++ 1 file changed, 221 insertions(+) diff --git a/llvm/test/CodeGen/ARM/llvm.sincos.ll b/llvm/test/CodeGen/ARM/llvm.sincos.ll index 0c2263ee9acbf..1448fac8d864f 100644 --- a/llvm/test/CodeGen/ARM/llvm.sincos.ll +++ b/llvm/test/CodeGen/ARM/llvm.sincos.ll @@ -3,6 +3,7 @@ ; RUN: llc -mtriple=armv7-linux-gnueabi -mcpu=cortex-a8 < %s | FileCheck -check-prefix=GNUEABI %s ; RUN: llc -mtriple=armv7-apple-ios6 -mcpu=cortex-a8 < %s | FileCheck -check-prefixes=IOS,IOS-NO-STRET %s ; RUN: llc -mtriple=armv7-apple-ios7 
-mcpu=cortex-a8 < %s | FileCheck -check-prefixes=IOS,IOS-WITH-STRET %s +; RUN: llc -mtriple=thumbv7k-apple-watchos2.0 < %s | FileCheck -check-prefix=WATCHABI %s define { half, half } @test_sincos_f16(half %a) { ; GNU-LABEL: test_sincos_f16: @@ -75,6 +76,23 @@ define { half, half } @test_sincos_f16(half %a) { ; IOS-WITH-STRET-NEXT: mov r0, r5 ; IOS-WITH-STRET-NEXT: add sp, sp, #8 ; IOS-WITH-STRET-NEXT: pop {r4, r5, pc} +; +; WATCHABI-LABEL: test_sincos_f16: +; WATCHABI: .cfi_startproc +; WATCHABI-NEXT: @ %bb.0: +; WATCHABI-NEXT: push {r7, lr} +; WATCHABI-NEXT: .cfi_def_cfa_offset 8 +; WATCHABI-NEXT: .cfi_offset lr, -4 +; WATCHABI-NEXT: .cfi_offset r7, -8 +; WATCHABI-NEXT: sub sp, #8 +; WATCHABI-NEXT: .cfi_def_cfa_offset 16 +; WATCHABI-NEXT: vcvtb.f32.f16 s0, s0 +; WATCHABI-NEXT: bl ___sincosf_stret +; WATCHABI-NEXT: vcvtb.f16.f32 s0, s0 +; WATCHABI-NEXT: vcvtb.f16.f32 s1, s1 +; WATCHABI-NEXT: add sp, #8 +; WATCHABI-NEXT: pop {r7, pc} +; WATCHABI-NEXT: .cfi_endproc %result = call { half, half } @llvm.sincos.f16(half %a) ret { half, half } %result } @@ -130,6 +148,22 @@ define half @test_sincos_f16_only_use_sin(half %a) { ; IOS-WITH-STRET-NEXT: add sp, sp, #8 ; IOS-WITH-STRET-NEXT: pop {lr} ; IOS-WITH-STRET-NEXT: bx lr +; +; WATCHABI-LABEL: test_sincos_f16_only_use_sin: +; WATCHABI: .cfi_startproc +; WATCHABI-NEXT: @ %bb.0: +; WATCHABI-NEXT: push {r7, lr} +; WATCHABI-NEXT: .cfi_def_cfa_offset 8 +; WATCHABI-NEXT: .cfi_offset lr, -4 +; WATCHABI-NEXT: .cfi_offset r7, -8 +; WATCHABI-NEXT: sub sp, #8 +; WATCHABI-NEXT: .cfi_def_cfa_offset 16 +; WATCHABI-NEXT: vcvtb.f32.f16 s0, s0 +; WATCHABI-NEXT: bl ___sincosf_stret +; WATCHABI-NEXT: vcvtb.f16.f32 s0, s0 +; WATCHABI-NEXT: add sp, #8 +; WATCHABI-NEXT: pop {r7, pc} +; WATCHABI-NEXT: .cfi_endproc %result = call { half, half } @llvm.sincos.f16(half %a) %result.0 = extractvalue { half, half } %result, 0 ret half %result.0 @@ -186,6 +220,22 @@ define half @test_sincos_f16_only_use_cos(half %a) { ; IOS-WITH-STRET-NEXT: add sp, sp, #8 ; IOS-WITH-STRET-NEXT: pop {lr} ; IOS-WITH-STRET-NEXT: bx lr +; +; WATCHABI-LABEL: test_sincos_f16_only_use_cos: +; WATCHABI: .cfi_startproc +; WATCHABI-NEXT: @ %bb.0: +; WATCHABI-NEXT: push {r7, lr} +; WATCHABI-NEXT: .cfi_def_cfa_offset 8 +; WATCHABI-NEXT: .cfi_offset lr, -4 +; WATCHABI-NEXT: .cfi_offset r7, -8 +; WATCHABI-NEXT: sub sp, #8 +; WATCHABI-NEXT: .cfi_def_cfa_offset 16 +; WATCHABI-NEXT: vcvtb.f32.f16 s0, s0 +; WATCHABI-NEXT: bl ___sincosf_stret +; WATCHABI-NEXT: vcvtb.f16.f32 s0, s1 +; WATCHABI-NEXT: add sp, #8 +; WATCHABI-NEXT: pop {r7, pc} +; WATCHABI-NEXT: .cfi_endproc %result = call { half, half } @llvm.sincos.f16(half %a) %result.1 = extractvalue { half, half } %result, 1 ret half %result.1 @@ -366,6 +416,54 @@ define { <2 x half>, <2 x half> } @test_sincos_v2f16(<2 x half> %a) { ; IOS-WITH-STRET-NEXT: add sp, sp, #24 ; IOS-WITH-STRET-NEXT: vpop {d8} ; IOS-WITH-STRET-NEXT: pop {r4, r5, pc} +; +; WATCHABI-LABEL: test_sincos_v2f16: +; WATCHABI: .cfi_startproc +; WATCHABI-NEXT: @ %bb.0: +; WATCHABI-NEXT: push {r7, lr} +; WATCHABI-NEXT: .cfi_def_cfa_offset 8 +; WATCHABI-NEXT: .cfi_offset lr, -4 +; WATCHABI-NEXT: .cfi_offset r7, -8 +; WATCHABI-NEXT: vpush {d10} +; WATCHABI-NEXT: .cfi_def_cfa_offset 16 +; WATCHABI-NEXT: vpush {d8} +; WATCHABI-NEXT: .cfi_def_cfa_offset 24 +; WATCHABI-NEXT: .cfi_offset d10, -16 +; WATCHABI-NEXT: .cfi_offset d8, -24 +; WATCHABI-NEXT: sub sp, #8 +; WATCHABI-NEXT: .cfi_def_cfa_offset 32 +; WATCHABI-NEXT: vmov.f32 s16, s0 +; WATCHABI-NEXT: vcvtb.f32.f16 s0, s1 +; WATCHABI-NEXT: bl 
___sincosf_stret +; WATCHABI-NEXT: vcvtb.f16.f32 s0, s0 +; WATCHABI-NEXT: vcvtb.f32.f16 s4, s16 +; WATCHABI-NEXT: vmov r0, s0 +; WATCHABI-NEXT: vmov.f32 s0, s4 +; WATCHABI-NEXT: vmov.f32 s20, s1 +; WATCHABI-NEXT: strh.w r0, [sp, #6] +; WATCHABI-NEXT: bl ___sincosf_stret +; WATCHABI-NEXT: vcvtb.f16.f32 s0, s0 +; WATCHABI-NEXT: vmov r0, s0 +; WATCHABI-NEXT: vcvtb.f16.f32 s0, s20 +; WATCHABI-NEXT: strh.w r0, [sp, #4] +; WATCHABI-NEXT: add r0, sp, #4 +; WATCHABI-NEXT: vld1.32 {d16[0]}, [r0:32] +; WATCHABI-NEXT: vmov r0, s0 +; WATCHABI-NEXT: vcvtb.f16.f32 s0, s1 +; WATCHABI-NEXT: strh.w r0, [sp, #2] +; WATCHABI-NEXT: vmov r0, s0 +; WATCHABI-NEXT: vmovl.u16 q0, d16 +; WATCHABI-NEXT: strh.w r0, [sp] +; WATCHABI-NEXT: mov r0, sp +; WATCHABI-NEXT: vld1.32 {d18[0]}, [r0:32] +; WATCHABI-NEXT: vmovl.u16 q1, d18 +; WATCHABI-NEXT: vmov.f32 s2, s4 +; WATCHABI-NEXT: vmov.f32 s3, s5 +; WATCHABI-NEXT: add sp, #8 +; WATCHABI-NEXT: vpop {d8} +; WATCHABI-NEXT: vpop {d10} +; WATCHABI-NEXT: pop {r7, pc} +; WATCHABI-NEXT: .cfi_endproc %result = call { <2 x half>, <2 x half> } @llvm.sincos.v2f16(<2 x half> %a) ret { <2 x half>, <2 x half> } %result } @@ -416,6 +514,20 @@ define { float, float } @test_sincos_f32(float %a) { ; IOS-WITH-STRET-NEXT: pop {r0, r1} ; IOS-WITH-STRET-NEXT: pop {lr} ; IOS-WITH-STRET-NEXT: bx lr +; +; WATCHABI-LABEL: test_sincos_f32: +; WATCHABI: .cfi_startproc +; WATCHABI-NEXT: @ %bb.0: +; WATCHABI-NEXT: push {r7, lr} +; WATCHABI-NEXT: .cfi_def_cfa_offset 8 +; WATCHABI-NEXT: .cfi_offset lr, -4 +; WATCHABI-NEXT: .cfi_offset r7, -8 +; WATCHABI-NEXT: sub sp, #8 +; WATCHABI-NEXT: .cfi_def_cfa_offset 16 +; WATCHABI-NEXT: bl ___sincosf_stret +; WATCHABI-NEXT: add sp, #8 +; WATCHABI-NEXT: pop {r7, pc} +; WATCHABI-NEXT: .cfi_endproc %result = call { float, float } @llvm.sincos.f32(float %a) ret { float, float } %result } @@ -519,6 +631,33 @@ define { <2 x float>, <2 x float> } @test_sincos_v2f32(<2 x float> %a) { ; IOS-WITH-STRET-NEXT: vpop {d8} ; IOS-WITH-STRET-NEXT: pop {lr} ; IOS-WITH-STRET-NEXT: bx lr +; +; WATCHABI-LABEL: test_sincos_v2f32: +; WATCHABI: .cfi_startproc +; WATCHABI-NEXT: @ %bb.0: +; WATCHABI-NEXT: push {r7, lr} +; WATCHABI-NEXT: .cfi_def_cfa_offset 8 +; WATCHABI-NEXT: .cfi_offset lr, -4 +; WATCHABI-NEXT: .cfi_offset r7, -8 +; WATCHABI-NEXT: vpush {d8, d9, d10} +; WATCHABI-NEXT: .cfi_def_cfa_offset 32 +; WATCHABI-NEXT: .cfi_offset d10, -16 +; WATCHABI-NEXT: .cfi_offset d9, -24 +; WATCHABI-NEXT: .cfi_offset d8, -32 +; WATCHABI-NEXT: vmov.f64 d8, d0 +; WATCHABI-NEXT: vmov.f32 s0, s17 +; WATCHABI-NEXT: bl ___sincosf_stret +; WATCHABI-NEXT: vmov.f32 s19, s0 +; WATCHABI-NEXT: vmov.f32 s0, s16 +; WATCHABI-NEXT: vmov.f32 s21, s1 +; WATCHABI-NEXT: bl ___sincosf_stret +; WATCHABI-NEXT: vmov.f32 s20, s1 +; WATCHABI-NEXT: vmov.f32 s18, s0 +; WATCHABI-NEXT: vmov.f64 d1, d10 +; WATCHABI-NEXT: vmov.f64 d0, d9 +; WATCHABI-NEXT: vpop {d8, d9, d10} +; WATCHABI-NEXT: pop {r7, pc} +; WATCHABI-NEXT: .cfi_endproc %result = call { <2 x float>, <2 x float> } @llvm.sincos.v2f32(<2 x float> %a) ret { <2 x float>, <2 x float> } %result } @@ -581,6 +720,20 @@ define { double, double } @test_sincos_f64(double %a) { ; IOS-WITH-STRET-NEXT: add sp, sp, #16 ; IOS-WITH-STRET-NEXT: pop {lr} ; IOS-WITH-STRET-NEXT: bx lr +; +; WATCHABI-LABEL: test_sincos_f64: +; WATCHABI: .cfi_startproc +; WATCHABI-NEXT: @ %bb.0: +; WATCHABI-NEXT: push {r7, lr} +; WATCHABI-NEXT: .cfi_def_cfa_offset 8 +; WATCHABI-NEXT: .cfi_offset lr, -4 +; WATCHABI-NEXT: .cfi_offset r7, -8 +; WATCHABI-NEXT: sub sp, #8 +; WATCHABI-NEXT: 
.cfi_def_cfa_offset 16 +; WATCHABI-NEXT: bl ___sincos_stret +; WATCHABI-NEXT: add sp, #8 +; WATCHABI-NEXT: pop {r7, pc} +; WATCHABI-NEXT: .cfi_endproc %result = call { double, double } @llvm.sincos.f64(double %a) ret { double, double } %result } @@ -692,6 +845,39 @@ define { <2 x double>, <2 x double> } @test_sincos_v2f64(<2 x double> %a) { ; IOS-WITH-STRET-NEXT: vst1.32 {d18, d19}, [r6] ; IOS-WITH-STRET-NEXT: add sp, sp, #32 ; IOS-WITH-STRET-NEXT: pop {r4, r5, r6, pc} +; +; WATCHABI-LABEL: test_sincos_v2f64: +; WATCHABI: .cfi_startproc +; WATCHABI-NEXT: @ %bb.0: +; WATCHABI-NEXT: push {r7, lr} +; WATCHABI-NEXT: .cfi_def_cfa_offset 8 +; WATCHABI-NEXT: .cfi_offset lr, -4 +; WATCHABI-NEXT: .cfi_offset r7, -8 +; WATCHABI-NEXT: vpush {d8, d9, d10, d11, d12, d13} +; WATCHABI-NEXT: .cfi_def_cfa_offset 56 +; WATCHABI-NEXT: .cfi_offset d13, -16 +; WATCHABI-NEXT: .cfi_offset d12, -24 +; WATCHABI-NEXT: .cfi_offset d11, -32 +; WATCHABI-NEXT: .cfi_offset d10, -40 +; WATCHABI-NEXT: .cfi_offset d9, -48 +; WATCHABI-NEXT: .cfi_offset d8, -56 +; WATCHABI-NEXT: sub sp, #8 +; WATCHABI-NEXT: .cfi_def_cfa_offset 64 +; WATCHABI-NEXT: vorr q4, q0, q0 +; WATCHABI-NEXT: vorr d0, d9, d9 +; WATCHABI-NEXT: bl ___sincos_stret +; WATCHABI-NEXT: vorr d11, d0, d0 +; WATCHABI-NEXT: vorr d0, d8, d8 +; WATCHABI-NEXT: vorr d13, d1, d1 +; WATCHABI-NEXT: bl ___sincos_stret +; WATCHABI-NEXT: vorr d12, d1, d1 +; WATCHABI-NEXT: vorr d10, d0, d0 +; WATCHABI-NEXT: vorr q1, q6, q6 +; WATCHABI-NEXT: vorr q0, q5, q5 +; WATCHABI-NEXT: add sp, #8 +; WATCHABI-NEXT: vpop {d8, d9, d10, d11, d12, d13} +; WATCHABI-NEXT: pop {r7, pc} +; WATCHABI-NEXT: .cfi_endproc %result = call { <2 x double>, <2 x double> } @llvm.sincos.v2f64(<2 x double> %a) ret { <2 x double>, <2 x double> } %result } @@ -778,6 +964,41 @@ define { fp128, fp128 } @test_sincos_f128(fp128 %a) { ; IOS-NEXT: bl _sinl ; IOS-NEXT: stm r4, {r0, r1, r2, r3} ; IOS-NEXT: pop {r4, r5, r6, r7, r8, pc} +; +; WATCHABI-LABEL: test_sincos_f128: +; WATCHABI: .cfi_startproc +; WATCHABI-NEXT: @ %bb.0: +; WATCHABI-NEXT: push.w {r4, r5, r6, r7, r8, lr} +; WATCHABI-NEXT: .cfi_def_cfa_offset 24 +; WATCHABI-NEXT: .cfi_offset lr, -4 +; WATCHABI-NEXT: .cfi_offset r7, -8 +; WATCHABI-NEXT: .cfi_offset r6, -12 +; WATCHABI-NEXT: .cfi_offset r5, -16 +; WATCHABI-NEXT: .cfi_offset r4, -20 +; WATCHABI-NEXT: .cfi_offset r8, -24 +; WATCHABI-NEXT: sub sp, #8 +; WATCHABI-NEXT: .cfi_def_cfa_offset 32 +; WATCHABI-NEXT: ldr.w r8, [sp, #32] +; WATCHABI-NEXT: mov r4, r0 +; WATCHABI-NEXT: mov r5, r3 +; WATCHABI-NEXT: mov r6, r2 +; WATCHABI-NEXT: mov r7, r1 +; WATCHABI-NEXT: mov r0, r1 +; WATCHABI-NEXT: mov r1, r2 +; WATCHABI-NEXT: mov r2, r3 +; WATCHABI-NEXT: mov r3, r8 +; WATCHABI-NEXT: bl _cosl +; WATCHABI-NEXT: add.w r9, r4, #16 +; WATCHABI-NEXT: stm.w r9, {r0, r1, r2, r3} +; WATCHABI-NEXT: mov r0, r7 +; WATCHABI-NEXT: mov r1, r6 +; WATCHABI-NEXT: mov r2, r5 +; WATCHABI-NEXT: mov r3, r8 +; WATCHABI-NEXT: bl _sinl +; WATCHABI-NEXT: stm r4!, {r0, r1, r2, r3} +; WATCHABI-NEXT: add sp, #8 +; WATCHABI-NEXT: pop.w {r4, r5, r6, r7, r8, pc} +; WATCHABI-NEXT: .cfi_endproc %result = call { fp128, fp128 } @llvm.sincos.f16(fp128 %a) ret { fp128, fp128 } %result } From 70ff2c9a32ec2fa123869be9ca6efdd2f24fca1f Mon Sep 17 00:00:00 2001 From: Min-Yih Hsu <min.hsu@sifive.com> Date: Mon, 3 Nov 2025 18:28:33 -0800 Subject: [PATCH 114/313] [doc][RISCV] Add XSfvfexp* and XSfvfexpa* into RISCVUsage.rst (#166198) They were introduced in #164349 --- llvm/docs/RISCVUsage.rst | 6 ++++++ 1 file changed, 6 insertions(+) diff --git 
a/llvm/docs/RISCVUsage.rst b/llvm/docs/RISCVUsage.rst index 49184e3104868..d03f383a92b3b 100644 --- a/llvm/docs/RISCVUsage.rst +++ b/llvm/docs/RISCVUsage.rst @@ -406,6 +406,12 @@ The current vendor extensions supported are: ``XSfvcp`` LLVM implements `version 1.1.0 of the SiFive Vector Coprocessor Interface (VCIX) Software Specification <https://sifive.cdn.prismic.io/sifive/Zn3m1R5LeNNTwnLS_vcix-spec-software-v1p1.pdf>`__ by SiFive. All instructions are prefixed with `sf.vc.` as described in the specification, and the riscv-toolchain-convention document linked above. +``Xsfvfexp16e``, ``Xsfvfbfexp16e``, and ``Xsfvfexp32e`` + LLVM implements `version 0.5 of the Vector Exponential Extension Specification <https://www.sifive.com/document-file/exponential-function-instruction-xsfvfexp32e-xsfvf>`__ by SiFive. All instructions are prefixed with `sf.` as described in the specification linked above. + +``Xsfvfexpa`` and ``Xsfvfexpa64e`` + LLVM implements `version 0.2 of the Vector Exponential Approximation Extension Specification <https://www.sifive.com/document-file/exponential-approximation-instruction-xsfvfexpa-ex>`__ by SiFive. All instructions are prefixed with `sf.` as described in the specification linked above. + ``XSfvqmaccdod``, ``XSfvqmaccqoq`` LLVM implements `version 1.1.0 of the SiFive Int8 Matrix Multiplication Extensions Specification <https://sifive.cdn.prismic.io/sifive/1a2ad85b-d818-49f7-ba83-f51f1731edbe_int8-matmul-spec.pdf>`__ by SiFive. All instructions are prefixed with `sf.` as described in the specification linked above. From c63cb50080fb9c8a0650234cb711823db306f5c6 Mon Sep 17 00:00:00 2001 From: Jianjian Guan <jacquesguan@me.com> Date: Tue, 4 Nov 2025 10:53:17 +0800 Subject: [PATCH 115/313] [RISCV][GISel] Support select indexed vector load store intrinsics (#165876) --- .../RISCV/GISel/RISCVInstructionSelector.cpp | 125 +- .../RISCV/GlobalISel/rvv/vloxei-rv64.ll | 1341 +++++ .../CodeGen/RISCV/GlobalISel/rvv/vloxei.ll | 5100 +++++++++++++++++ .../RISCV/GlobalISel/rvv/vluxei-rv64.ll | 1341 +++++ .../CodeGen/RISCV/GlobalISel/rvv/vluxei.ll | 5100 +++++++++++++++++ .../RISCV/GlobalISel/rvv/vsoxei-rv64.ll | 1293 +++++ .../CodeGen/RISCV/GlobalISel/rvv/vsoxei.ll | 4881 ++++++++++++++++ .../RISCV/GlobalISel/rvv/vsuxei-rv64.ll | 1310 +++++ .../CodeGen/RISCV/GlobalISel/rvv/vsuxei.ll | 4881 ++++++++++++++++ 9 files changed, 25368 insertions(+), 4 deletions(-) create mode 100644 llvm/test/CodeGen/RISCV/GlobalISel/rvv/vloxei-rv64.ll create mode 100644 llvm/test/CodeGen/RISCV/GlobalISel/rvv/vloxei.ll create mode 100644 llvm/test/CodeGen/RISCV/GlobalISel/rvv/vluxei-rv64.ll create mode 100644 llvm/test/CodeGen/RISCV/GlobalISel/rvv/vluxei.ll create mode 100644 llvm/test/CodeGen/RISCV/GlobalISel/rvv/vsoxei-rv64.ll create mode 100644 llvm/test/CodeGen/RISCV/GlobalISel/rvv/vsoxei.ll create mode 100644 llvm/test/CodeGen/RISCV/GlobalISel/rvv/vsuxei-rv64.ll create mode 100644 llvm/test/CodeGen/RISCV/GlobalISel/rvv/vsuxei.ll diff --git a/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp b/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp index 282cf5d681685..3d5a55c631301 100644 --- a/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp +++ b/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp @@ -95,7 +95,8 @@ class RISCVInstructionSelector : public InstructionSelector { void addVectorLoadStoreOperands(MachineInstr &I, SmallVectorImpl<SrcOp> &SrcOps, unsigned &CurOp, bool IsMasked, - bool IsStrided) const; + bool IsStridedOrIndexed, + LLT *IndexVT = nullptr) const; bool 
selectIntrinsicWithSideEffects(MachineInstr &I, MachineIRBuilder &MIB) const; @@ -722,15 +723,17 @@ static unsigned selectRegImmLoadStoreOp(unsigned GenericOpc, unsigned OpSize) { void RISCVInstructionSelector::addVectorLoadStoreOperands( MachineInstr &I, SmallVectorImpl<SrcOp> &SrcOps, unsigned &CurOp, - bool IsMasked, bool IsStrided) const { + bool IsMasked, bool IsStridedOrIndexed, LLT *IndexVT) const { // Base Pointer auto PtrReg = I.getOperand(CurOp++).getReg(); SrcOps.push_back(PtrReg); - // Stride - if (IsStrided) { + // Stride or Index + if (IsStridedOrIndexed) { auto StrideReg = I.getOperand(CurOp++).getReg(); SrcOps.push_back(StrideReg); + if (IndexVT) + *IndexVT = MRI->getType(StrideReg); } // Mask @@ -805,6 +808,70 @@ bool RISCVInstructionSelector::selectIntrinsicWithSideEffects( I.eraseFromParent(); return constrainSelectedInstRegOperands(*PseudoMI, TII, TRI, RBI); } + case Intrinsic::riscv_vloxei: + case Intrinsic::riscv_vloxei_mask: + case Intrinsic::riscv_vluxei: + case Intrinsic::riscv_vluxei_mask: { + bool IsMasked = IntrinID == Intrinsic::riscv_vloxei_mask || + IntrinID == Intrinsic::riscv_vluxei_mask; + bool IsOrdered = IntrinID == Intrinsic::riscv_vloxei || + IntrinID == Intrinsic::riscv_vloxei_mask; + LLT VT = MRI->getType(I.getOperand(0).getReg()); + unsigned Log2SEW = Log2_32(VT.getScalarSizeInBits()); + + // Result vector + const Register DstReg = I.getOperand(0).getReg(); + + // Sources + bool HasPassthruOperand = IntrinID != Intrinsic::riscv_vlm; + unsigned CurOp = 2; + SmallVector<SrcOp, 4> SrcOps; // Source registers. + + // Passthru + if (HasPassthruOperand) { + auto PassthruReg = I.getOperand(CurOp++).getReg(); + SrcOps.push_back(PassthruReg); + } else { + // Use NoRegister if there is no specified passthru. + SrcOps.push_back(Register()); + } + LLT IndexVT; + addVectorLoadStoreOperands(I, SrcOps, CurOp, IsMasked, true, &IndexVT); + + RISCVVType::VLMUL LMUL = RISCVTargetLowering::getLMUL(getMVTForLLT(VT)); + RISCVVType::VLMUL IndexLMUL = + RISCVTargetLowering::getLMUL(getMVTForLLT(IndexVT)); + unsigned IndexLog2EEW = Log2_32(IndexVT.getScalarSizeInBits()); + if (IndexLog2EEW == 6 && !Subtarget->is64Bit()) { + reportFatalUsageError("The V extension does not support EEW=64 for index " + "values when XLEN=32"); + } + const RISCV::VLX_VSXPseudo *P = RISCV::getVLXPseudo( + IsMasked, IsOrdered, IndexLog2EEW, static_cast<unsigned>(LMUL), + static_cast<unsigned>(IndexLMUL)); + + auto PseudoMI = MIB.buildInstr(P->Pseudo, {DstReg}, SrcOps); + + // Select VL + auto VLOpFn = renderVLOp(I.getOperand(CurOp++)); + for (auto &RenderFn : *VLOpFn) + RenderFn(PseudoMI); + + // SEW + PseudoMI.addImm(Log2SEW); + + // Policy + uint64_t Policy = RISCVVType::MASK_AGNOSTIC; + if (IsMasked) + Policy = I.getOperand(CurOp++).getImm(); + PseudoMI.addImm(Policy); + + // Memref + PseudoMI.cloneMemRefs(I); + + I.eraseFromParent(); + return constrainSelectedInstRegOperands(*PseudoMI, TII, TRI, RBI); + } case Intrinsic::riscv_vsm: case Intrinsic::riscv_vse: case Intrinsic::riscv_vse_mask: @@ -847,6 +914,56 @@ bool RISCVInstructionSelector::selectIntrinsicWithSideEffects( I.eraseFromParent(); return constrainSelectedInstRegOperands(*PseudoMI, TII, TRI, RBI); } + case Intrinsic::riscv_vsoxei: + case Intrinsic::riscv_vsoxei_mask: + case Intrinsic::riscv_vsuxei: + case Intrinsic::riscv_vsuxei_mask: { + bool IsMasked = IntrinID == Intrinsic::riscv_vsoxei_mask || + IntrinID == Intrinsic::riscv_vsuxei_mask; + bool IsOrdered = IntrinID == Intrinsic::riscv_vsoxei || + IntrinID == 
Intrinsic::riscv_vsoxei_mask; + LLT VT = MRI->getType(I.getOperand(1).getReg()); + unsigned Log2SEW = Log2_32(VT.getScalarSizeInBits()); + + // Sources + unsigned CurOp = 1; + SmallVector<SrcOp, 4> SrcOps; // Source registers. + + // Store value + auto PassthruReg = I.getOperand(CurOp++).getReg(); + SrcOps.push_back(PassthruReg); + + LLT IndexVT; + addVectorLoadStoreOperands(I, SrcOps, CurOp, IsMasked, true, &IndexVT); + + RISCVVType::VLMUL LMUL = RISCVTargetLowering::getLMUL(getMVTForLLT(VT)); + RISCVVType::VLMUL IndexLMUL = + RISCVTargetLowering::getLMUL(getMVTForLLT(IndexVT)); + unsigned IndexLog2EEW = Log2_32(IndexVT.getScalarSizeInBits()); + if (IndexLog2EEW == 6 && !Subtarget->is64Bit()) { + reportFatalUsageError("The V extension does not support EEW=64 for index " + "values when XLEN=32"); + } + const RISCV::VLX_VSXPseudo *P = RISCV::getVSXPseudo( + IsMasked, IsOrdered, IndexLog2EEW, static_cast<unsigned>(LMUL), + static_cast<unsigned>(IndexLMUL)); + + auto PseudoMI = MIB.buildInstr(P->Pseudo, {}, SrcOps); + + // Select VL + auto VLOpFn = renderVLOp(I.getOperand(CurOp++)); + for (auto &RenderFn : *VLOpFn) + RenderFn(PseudoMI); + + // SEW + PseudoMI.addImm(Log2SEW); + + // Memref + PseudoMI.cloneMemRefs(I); + + I.eraseFromParent(); + return constrainSelectedInstRegOperands(*PseudoMI, TII, TRI, RBI); + } } } diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vloxei-rv64.ll b/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vloxei-rv64.ll new file mode 100644 index 0000000000000..5cb55f15c7c8c --- /dev/null +++ b/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vloxei-rv64.ll @@ -0,0 +1,1341 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv64 -mattr=+v,+zvfhmin,+zvfbfmin -global-isel -verify-machineinstrs \ +; RUN: < %s | FileCheck %s + +; The intrinsics are not supported with RV32. 
+ +declare <vscale x 1 x i8> @llvm.riscv.vloxei.nxv1i8.nxv1i64( + <vscale x 1 x i8>, + ptr, + <vscale x 1 x i64>, + i64); + +define <vscale x 1 x i8> @intrinsic_vloxei_v_nxv1i8_nxv1i8_nxv1i64(ptr %0, <vscale x 1 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv1i8_nxv1i8_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vloxei64.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i8> @llvm.riscv.vloxei.nxv1i8.nxv1i64( + <vscale x 1 x i8> poison, + ptr %0, + <vscale x 1 x i64> %1, + i64 %2) + + ret <vscale x 1 x i8> %a +} + +declare <vscale x 1 x i8> @llvm.riscv.vloxei.mask.nxv1i8.nxv1i64( + <vscale x 1 x i8>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64, + i64); + +define <vscale x 1 x i8> @intrinsic_vloxei_mask_v_nxv1i8_nxv1i8_nxv1i64(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1i8_nxv1i8_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, mu +; CHECK-NEXT: vloxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i8> @llvm.riscv.vloxei.mask.nxv1i8.nxv1i64( + <vscale x 1 x i8> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 1 x i8> %a +} + +declare <vscale x 2 x i8> @llvm.riscv.vloxei.nxv2i8.nxv2i64( + <vscale x 2 x i8>, + ptr, + <vscale x 2 x i64>, + i64); + +define <vscale x 2 x i8> @intrinsic_vloxei_v_nxv2i8_nxv2i8_nxv2i64(ptr %0, <vscale x 2 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv2i8_nxv2i8_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vloxei64.v v10, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i8> @llvm.riscv.vloxei.nxv2i8.nxv2i64( + <vscale x 2 x i8> poison, + ptr %0, + <vscale x 2 x i64> %1, + i64 %2) + + ret <vscale x 2 x i8> %a +} + +declare <vscale x 2 x i8> @llvm.riscv.vloxei.mask.nxv2i8.nxv2i64( + <vscale x 2 x i8>, + ptr, + <vscale x 2 x i64>, + <vscale x 2 x i1>, + i64, + i64); + +define <vscale x 2 x i8> @intrinsic_vloxei_mask_v_nxv2i8_nxv2i8_nxv2i64(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2i8_nxv2i8_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, mu +; CHECK-NEXT: vloxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i8> @llvm.riscv.vloxei.mask.nxv2i8.nxv2i64( + <vscale x 2 x i8> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 2 x i8> %a +} + +declare <vscale x 4 x i8> @llvm.riscv.vloxei.nxv4i8.nxv4i64( + <vscale x 4 x i8>, + ptr, + <vscale x 4 x i64>, + i64); + +define <vscale x 4 x i8> @intrinsic_vloxei_v_nxv4i8_nxv4i8_nxv4i64(ptr %0, <vscale x 4 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv4i8_nxv4i8_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vloxei64.v v12, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i8> @llvm.riscv.vloxei.nxv4i8.nxv4i64( + <vscale x 4 x i8> poison, + ptr %0, + <vscale x 4 x i64> %1, + i64 %2) + + ret <vscale x 4 x i8> %a +} + +declare <vscale x 4 x i8> @llvm.riscv.vloxei.mask.nxv4i8.nxv4i64( + <vscale x 4 x i8>, + ptr, + <vscale x 4 x i64>, + <vscale x 4 x i1>, + 
i64, + i64); + +define <vscale x 4 x i8> @intrinsic_vloxei_mask_v_nxv4i8_nxv4i8_nxv4i64(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4i8_nxv4i8_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, mu +; CHECK-NEXT: vloxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i8> @llvm.riscv.vloxei.mask.nxv4i8.nxv4i64( + <vscale x 4 x i8> %0, + ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 4 x i8> %a +} + +declare <vscale x 8 x i8> @llvm.riscv.vloxei.nxv8i8.nxv8i64( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i64>, + i64); + +define <vscale x 8 x i8> @intrinsic_vloxei_v_nxv8i8_nxv8i8_nxv8i64(ptr %0, <vscale x 8 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv8i8_nxv8i8_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vloxei64.v v16, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i8> @llvm.riscv.vloxei.nxv8i8.nxv8i64( + <vscale x 8 x i8> poison, + ptr %0, + <vscale x 8 x i64> %1, + i64 %2) + + ret <vscale x 8 x i8> %a +} + +declare <vscale x 8 x i8> @llvm.riscv.vloxei.mask.nxv8i8.nxv8i64( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i64>, + <vscale x 8 x i1>, + i64, + i64); + +define <vscale x 8 x i8> @intrinsic_vloxei_mask_v_nxv8i8_nxv8i8_nxv8i64(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8i8_nxv8i8_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, mu +; CHECK-NEXT: vloxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i8> @llvm.riscv.vloxei.mask.nxv8i8.nxv8i64( + <vscale x 8 x i8> %0, + ptr %1, + <vscale x 8 x i64> %2, + <vscale x 8 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 8 x i8> %a +} + +declare <vscale x 1 x i16> @llvm.riscv.vloxei.nxv1i16.nxv1i64( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i64>, + i64); + +define <vscale x 1 x i16> @intrinsic_vloxei_v_nxv1i16_nxv1i16_nxv1i64(ptr %0, <vscale x 1 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv1i16_nxv1i16_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vloxei64.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i16> @llvm.riscv.vloxei.nxv1i16.nxv1i64( + <vscale x 1 x i16> poison, + ptr %0, + <vscale x 1 x i64> %1, + i64 %2) + + ret <vscale x 1 x i16> %a +} + +declare <vscale x 1 x i16> @llvm.riscv.vloxei.mask.nxv1i16.nxv1i64( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64, + i64); + +define <vscale x 1 x i16> @intrinsic_vloxei_mask_v_nxv1i16_nxv1i16_nxv1i64(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1i16_nxv1i16_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, mu +; CHECK-NEXT: vloxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i16> @llvm.riscv.vloxei.mask.nxv1i16.nxv1i64( + <vscale x 1 x i16> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 1 x i16> %a +} + +declare <vscale x 2 x i16> @llvm.riscv.vloxei.nxv2i16.nxv2i64( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i64>, + i64); + +define <vscale x 2 x i16> 
@intrinsic_vloxei_v_nxv2i16_nxv2i16_nxv2i64(ptr %0, <vscale x 2 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv2i16_nxv2i16_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vloxei64.v v10, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i16> @llvm.riscv.vloxei.nxv2i16.nxv2i64( + <vscale x 2 x i16> poison, + ptr %0, + <vscale x 2 x i64> %1, + i64 %2) + + ret <vscale x 2 x i16> %a +} + +declare <vscale x 2 x i16> @llvm.riscv.vloxei.mask.nxv2i16.nxv2i64( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i64>, + <vscale x 2 x i1>, + i64, + i64); + +define <vscale x 2 x i16> @intrinsic_vloxei_mask_v_nxv2i16_nxv2i16_nxv2i64(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2i16_nxv2i16_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, mu +; CHECK-NEXT: vloxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i16> @llvm.riscv.vloxei.mask.nxv2i16.nxv2i64( + <vscale x 2 x i16> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 2 x i16> %a +} + +declare <vscale x 4 x i16> @llvm.riscv.vloxei.nxv4i16.nxv4i64( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i64>, + i64); + +define <vscale x 4 x i16> @intrinsic_vloxei_v_nxv4i16_nxv4i16_nxv4i64(ptr %0, <vscale x 4 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv4i16_nxv4i16_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vloxei64.v v12, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i16> @llvm.riscv.vloxei.nxv4i16.nxv4i64( + <vscale x 4 x i16> poison, + ptr %0, + <vscale x 4 x i64> %1, + i64 %2) + + ret <vscale x 4 x i16> %a +} + +declare <vscale x 4 x i16> @llvm.riscv.vloxei.mask.nxv4i16.nxv4i64( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i64>, + <vscale x 4 x i1>, + i64, + i64); + +define <vscale x 4 x i16> @intrinsic_vloxei_mask_v_nxv4i16_nxv4i16_nxv4i64(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4i16_nxv4i16_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, mu +; CHECK-NEXT: vloxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i16> @llvm.riscv.vloxei.mask.nxv4i16.nxv4i64( + <vscale x 4 x i16> %0, + ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 4 x i16> %a +} + +declare <vscale x 8 x i16> @llvm.riscv.vloxei.nxv8i16.nxv8i64( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i64>, + i64); + +define <vscale x 8 x i16> @intrinsic_vloxei_v_nxv8i16_nxv8i16_nxv8i64(ptr %0, <vscale x 8 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv8i16_nxv8i16_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vloxei64.v v16, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i16> @llvm.riscv.vloxei.nxv8i16.nxv8i64( + <vscale x 8 x i16> poison, + ptr %0, + <vscale x 8 x i64> %1, + i64 %2) + + ret <vscale x 8 x i16> %a +} + +declare <vscale x 8 x i16> @llvm.riscv.vloxei.mask.nxv8i16.nxv8i64( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i64>, + <vscale x 8 x i1>, + i64, + i64); + +define <vscale x 8 x i16> 
@intrinsic_vloxei_mask_v_nxv8i16_nxv8i16_nxv8i64(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8i16_nxv8i16_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, mu +; CHECK-NEXT: vloxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i16> @llvm.riscv.vloxei.mask.nxv8i16.nxv8i64( + <vscale x 8 x i16> %0, + ptr %1, + <vscale x 8 x i64> %2, + <vscale x 8 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 8 x i16> %a +} + +declare <vscale x 1 x i32> @llvm.riscv.vloxei.nxv1i32.nxv1i64( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i64>, + i64); + +define <vscale x 1 x i32> @intrinsic_vloxei_v_nxv1i32_nxv1i32_nxv1i64(ptr %0, <vscale x 1 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv1i32_nxv1i32_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vloxei64.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i32> @llvm.riscv.vloxei.nxv1i32.nxv1i64( + <vscale x 1 x i32> poison, + ptr %0, + <vscale x 1 x i64> %1, + i64 %2) + + ret <vscale x 1 x i32> %a +} + +declare <vscale x 1 x i32> @llvm.riscv.vloxei.mask.nxv1i32.nxv1i64( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64, + i64); + +define <vscale x 1 x i32> @intrinsic_vloxei_mask_v_nxv1i32_nxv1i32_nxv1i64(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1i32_nxv1i32_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; CHECK-NEXT: vloxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i32> @llvm.riscv.vloxei.mask.nxv1i32.nxv1i64( + <vscale x 1 x i32> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 1 x i32> %a +} + +declare <vscale x 2 x i32> @llvm.riscv.vloxei.nxv2i32.nxv2i64( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i64>, + i64); + +define <vscale x 2 x i32> @intrinsic_vloxei_v_nxv2i32_nxv2i32_nxv2i64(ptr %0, <vscale x 2 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv2i32_nxv2i32_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vloxei64.v v10, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i32> @llvm.riscv.vloxei.nxv2i32.nxv2i64( + <vscale x 2 x i32> poison, + ptr %0, + <vscale x 2 x i64> %1, + i64 %2) + + ret <vscale x 2 x i32> %a +} + +declare <vscale x 2 x i32> @llvm.riscv.vloxei.mask.nxv2i32.nxv2i64( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i64>, + <vscale x 2 x i1>, + i64, + i64); + +define <vscale x 2 x i32> @intrinsic_vloxei_mask_v_nxv2i32_nxv2i32_nxv2i64(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2i32_nxv2i32_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, mu +; CHECK-NEXT: vloxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i32> @llvm.riscv.vloxei.mask.nxv2i32.nxv2i64( + <vscale x 2 x i32> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 2 x i32> %a +} + +declare <vscale x 4 x i32> @llvm.riscv.vloxei.nxv4i32.nxv4i64( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i64>, + i64); + +define <vscale x 4 x i32> 
@intrinsic_vloxei_v_nxv4i32_nxv4i32_nxv4i64(ptr %0, <vscale x 4 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv4i32_nxv4i32_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vloxei64.v v12, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i32> @llvm.riscv.vloxei.nxv4i32.nxv4i64( + <vscale x 4 x i32> poison, + ptr %0, + <vscale x 4 x i64> %1, + i64 %2) + + ret <vscale x 4 x i32> %a +} + +declare <vscale x 4 x i32> @llvm.riscv.vloxei.mask.nxv4i32.nxv4i64( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i64>, + <vscale x 4 x i1>, + i64, + i64); + +define <vscale x 4 x i32> @intrinsic_vloxei_mask_v_nxv4i32_nxv4i32_nxv4i64(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4i32_nxv4i32_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, mu +; CHECK-NEXT: vloxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i32> @llvm.riscv.vloxei.mask.nxv4i32.nxv4i64( + <vscale x 4 x i32> %0, + ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 4 x i32> %a +} + +declare <vscale x 8 x i32> @llvm.riscv.vloxei.nxv8i32.nxv8i64( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i64>, + i64); + +define <vscale x 8 x i32> @intrinsic_vloxei_v_nxv8i32_nxv8i32_nxv8i64(ptr %0, <vscale x 8 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv8i32_nxv8i32_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vloxei64.v v16, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i32> @llvm.riscv.vloxei.nxv8i32.nxv8i64( + <vscale x 8 x i32> poison, + ptr %0, + <vscale x 8 x i64> %1, + i64 %2) + + ret <vscale x 8 x i32> %a +} + +declare <vscale x 8 x i32> @llvm.riscv.vloxei.mask.nxv8i32.nxv8i64( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i64>, + <vscale x 8 x i1>, + i64, + i64); + +define <vscale x 8 x i32> @intrinsic_vloxei_mask_v_nxv8i32_nxv8i32_nxv8i64(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8i32_nxv8i32_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, mu +; CHECK-NEXT: vloxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i32> @llvm.riscv.vloxei.mask.nxv8i32.nxv8i64( + <vscale x 8 x i32> %0, + ptr %1, + <vscale x 8 x i64> %2, + <vscale x 8 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 8 x i32> %a +} + +declare <vscale x 1 x i64> @llvm.riscv.vloxei.nxv1i64.nxv1i64( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i64>, + i64); + +define <vscale x 1 x i64> @intrinsic_vloxei_v_nxv1i64_nxv1i64_nxv1i64(ptr %0, <vscale x 1 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv1i64_nxv1i64_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vloxei64.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i64> @llvm.riscv.vloxei.nxv1i64.nxv1i64( + <vscale x 1 x i64> poison, + ptr %0, + <vscale x 1 x i64> %1, + i64 %2) + + ret <vscale x 1 x i64> %a +} + +declare <vscale x 1 x i64> @llvm.riscv.vloxei.mask.nxv1i64.nxv1i64( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64, + i64); + +define <vscale x 1 x i64> @intrinsic_vloxei_mask_v_nxv1i64_nxv1i64_nxv1i64(<vscale x 1 x i64> %0, ptr 
%1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1i64_nxv1i64_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, mu +; CHECK-NEXT: vloxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i64> @llvm.riscv.vloxei.mask.nxv1i64.nxv1i64( + <vscale x 1 x i64> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 1 x i64> %a +} + +declare <vscale x 2 x i64> @llvm.riscv.vloxei.nxv2i64.nxv2i64( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i64>, + i64); + +define <vscale x 2 x i64> @intrinsic_vloxei_v_nxv2i64_nxv2i64_nxv2i64(ptr %0, <vscale x 2 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv2i64_nxv2i64_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vloxei64.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i64> @llvm.riscv.vloxei.nxv2i64.nxv2i64( + <vscale x 2 x i64> poison, + ptr %0, + <vscale x 2 x i64> %1, + i64 %2) + + ret <vscale x 2 x i64> %a +} + +declare <vscale x 2 x i64> @llvm.riscv.vloxei.mask.nxv2i64.nxv2i64( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i64>, + <vscale x 2 x i1>, + i64, + i64); + +define <vscale x 2 x i64> @intrinsic_vloxei_mask_v_nxv2i64_nxv2i64_nxv2i64(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2i64_nxv2i64_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, mu +; CHECK-NEXT: vloxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i64> @llvm.riscv.vloxei.mask.nxv2i64.nxv2i64( + <vscale x 2 x i64> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 2 x i64> %a +} + +declare <vscale x 4 x i64> @llvm.riscv.vloxei.nxv4i64.nxv4i64( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i64>, + i64); + +define <vscale x 4 x i64> @intrinsic_vloxei_v_nxv4i64_nxv4i64_nxv4i64(ptr %0, <vscale x 4 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv4i64_nxv4i64_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vloxei64.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i64> @llvm.riscv.vloxei.nxv4i64.nxv4i64( + <vscale x 4 x i64> poison, + ptr %0, + <vscale x 4 x i64> %1, + i64 %2) + + ret <vscale x 4 x i64> %a +} + +declare <vscale x 4 x i64> @llvm.riscv.vloxei.mask.nxv4i64.nxv4i64( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i64>, + <vscale x 4 x i1>, + i64, + i64); + +define <vscale x 4 x i64> @intrinsic_vloxei_mask_v_nxv4i64_nxv4i64_nxv4i64(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4i64_nxv4i64_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, mu +; CHECK-NEXT: vloxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i64> @llvm.riscv.vloxei.mask.nxv4i64.nxv4i64( + <vscale x 4 x i64> %0, + ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 4 x i64> %a +} + +declare <vscale x 8 x i64> @llvm.riscv.vloxei.nxv8i64.nxv8i64( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i64>, + i64); + +define <vscale x 8 x i64> @intrinsic_vloxei_v_nxv8i64_nxv8i64_nxv8i64(ptr %0, <vscale x 8 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv8i64_nxv8i64_nxv8i64: +; 
CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vloxei64.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i64> @llvm.riscv.vloxei.nxv8i64.nxv8i64( + <vscale x 8 x i64> poison, + ptr %0, + <vscale x 8 x i64> %1, + i64 %2) + + ret <vscale x 8 x i64> %a +} + +declare <vscale x 8 x i64> @llvm.riscv.vloxei.mask.nxv8i64.nxv8i64( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i64>, + <vscale x 8 x i1>, + i64, + i64); + +define <vscale x 8 x i64> @intrinsic_vloxei_mask_v_nxv8i64_nxv8i64_nxv8i64(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8i64_nxv8i64_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, mu +; CHECK-NEXT: vloxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i64> @llvm.riscv.vloxei.mask.nxv8i64.nxv8i64( + <vscale x 8 x i64> %0, + ptr %1, + <vscale x 8 x i64> %2, + <vscale x 8 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 8 x i64> %a +} + +declare <vscale x 1 x half> @llvm.riscv.vloxei.nxv1f16.nxv1i64( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i64>, + i64); + +define <vscale x 1 x half> @intrinsic_vloxei_v_nxv1f16_nxv1f16_nxv1i64(ptr %0, <vscale x 1 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv1f16_nxv1f16_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vloxei64.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x half> @llvm.riscv.vloxei.nxv1f16.nxv1i64( + <vscale x 1 x half> poison, + ptr %0, + <vscale x 1 x i64> %1, + i64 %2) + + ret <vscale x 1 x half> %a +} + +declare <vscale x 1 x half> @llvm.riscv.vloxei.mask.nxv1f16.nxv1i64( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64, + i64); + +define <vscale x 1 x half> @intrinsic_vloxei_mask_v_nxv1f16_nxv1f16_nxv1i64(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1f16_nxv1f16_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, mu +; CHECK-NEXT: vloxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x half> @llvm.riscv.vloxei.mask.nxv1f16.nxv1i64( + <vscale x 1 x half> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 1 x half> %a +} + +declare <vscale x 2 x half> @llvm.riscv.vloxei.nxv2f16.nxv2i64( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i64>, + i64); + +define <vscale x 2 x half> @intrinsic_vloxei_v_nxv2f16_nxv2f16_nxv2i64(ptr %0, <vscale x 2 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv2f16_nxv2f16_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vloxei64.v v10, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x half> @llvm.riscv.vloxei.nxv2f16.nxv2i64( + <vscale x 2 x half> poison, + ptr %0, + <vscale x 2 x i64> %1, + i64 %2) + + ret <vscale x 2 x half> %a +} + +declare <vscale x 2 x half> @llvm.riscv.vloxei.mask.nxv2f16.nxv2i64( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i64>, + <vscale x 2 x i1>, + i64, + i64); + +define <vscale x 2 x half> @intrinsic_vloxei_mask_v_nxv2f16_nxv2f16_nxv2i64(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2f16_nxv2f16_nxv2i64: +; 
CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, mu +; CHECK-NEXT: vloxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x half> @llvm.riscv.vloxei.mask.nxv2f16.nxv2i64( + <vscale x 2 x half> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 2 x half> %a +} + +declare <vscale x 4 x half> @llvm.riscv.vloxei.nxv4f16.nxv4i64( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i64>, + i64); + +define <vscale x 4 x half> @intrinsic_vloxei_v_nxv4f16_nxv4f16_nxv4i64(ptr %0, <vscale x 4 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv4f16_nxv4f16_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vloxei64.v v12, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x half> @llvm.riscv.vloxei.nxv4f16.nxv4i64( + <vscale x 4 x half> poison, + ptr %0, + <vscale x 4 x i64> %1, + i64 %2) + + ret <vscale x 4 x half> %a +} + +declare <vscale x 4 x half> @llvm.riscv.vloxei.mask.nxv4f16.nxv4i64( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i64>, + <vscale x 4 x i1>, + i64, + i64); + +define <vscale x 4 x half> @intrinsic_vloxei_mask_v_nxv4f16_nxv4f16_nxv4i64(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4f16_nxv4f16_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, mu +; CHECK-NEXT: vloxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x half> @llvm.riscv.vloxei.mask.nxv4f16.nxv4i64( + <vscale x 4 x half> %0, + ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 4 x half> %a +} + +declare <vscale x 8 x half> @llvm.riscv.vloxei.nxv8f16.nxv8i64( + <vscale x 8 x half>, + ptr, + <vscale x 8 x i64>, + i64); + +define <vscale x 8 x half> @intrinsic_vloxei_v_nxv8f16_nxv8f16_nxv8i64(ptr %0, <vscale x 8 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv8f16_nxv8f16_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vloxei64.v v16, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x half> @llvm.riscv.vloxei.nxv8f16.nxv8i64( + <vscale x 8 x half> poison, + ptr %0, + <vscale x 8 x i64> %1, + i64 %2) + + ret <vscale x 8 x half> %a +} + +declare <vscale x 8 x half> @llvm.riscv.vloxei.mask.nxv8f16.nxv8i64( + <vscale x 8 x half>, + ptr, + <vscale x 8 x i64>, + <vscale x 8 x i1>, + i64, + i64); + +define <vscale x 8 x half> @intrinsic_vloxei_mask_v_nxv8f16_nxv8f16_nxv8i64(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8f16_nxv8f16_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, mu +; CHECK-NEXT: vloxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x half> @llvm.riscv.vloxei.mask.nxv8f16.nxv8i64( + <vscale x 8 x half> %0, + ptr %1, + <vscale x 8 x i64> %2, + <vscale x 8 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 8 x half> %a +} + +declare <vscale x 1 x float> @llvm.riscv.vloxei.nxv1f32.nxv1i64( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i64>, + i64); + +define <vscale x 1 x float> @intrinsic_vloxei_v_nxv1f32_nxv1f32_nxv1i64(ptr %0, <vscale x 1 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv1f32_nxv1f32_nxv1i64: +; CHECK: # %bb.0: # %entry +; 
CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vloxei64.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x float> @llvm.riscv.vloxei.nxv1f32.nxv1i64( + <vscale x 1 x float> poison, + ptr %0, + <vscale x 1 x i64> %1, + i64 %2) + + ret <vscale x 1 x float> %a +} + +declare <vscale x 1 x float> @llvm.riscv.vloxei.mask.nxv1f32.nxv1i64( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64, + i64); + +define <vscale x 1 x float> @intrinsic_vloxei_mask_v_nxv1f32_nxv1f32_nxv1i64(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1f32_nxv1f32_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; CHECK-NEXT: vloxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x float> @llvm.riscv.vloxei.mask.nxv1f32.nxv1i64( + <vscale x 1 x float> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 1 x float> %a +} + +declare <vscale x 2 x float> @llvm.riscv.vloxei.nxv2f32.nxv2i64( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i64>, + i64); + +define <vscale x 2 x float> @intrinsic_vloxei_v_nxv2f32_nxv2f32_nxv2i64(ptr %0, <vscale x 2 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv2f32_nxv2f32_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vloxei64.v v10, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x float> @llvm.riscv.vloxei.nxv2f32.nxv2i64( + <vscale x 2 x float> poison, + ptr %0, + <vscale x 2 x i64> %1, + i64 %2) + + ret <vscale x 2 x float> %a +} + +declare <vscale x 2 x float> @llvm.riscv.vloxei.mask.nxv2f32.nxv2i64( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i64>, + <vscale x 2 x i1>, + i64, + i64); + +define <vscale x 2 x float> @intrinsic_vloxei_mask_v_nxv2f32_nxv2f32_nxv2i64(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2f32_nxv2f32_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, mu +; CHECK-NEXT: vloxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x float> @llvm.riscv.vloxei.mask.nxv2f32.nxv2i64( + <vscale x 2 x float> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 2 x float> %a +} + +declare <vscale x 4 x float> @llvm.riscv.vloxei.nxv4f32.nxv4i64( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i64>, + i64); + +define <vscale x 4 x float> @intrinsic_vloxei_v_nxv4f32_nxv4f32_nxv4i64(ptr %0, <vscale x 4 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv4f32_nxv4f32_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vloxei64.v v12, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x float> @llvm.riscv.vloxei.nxv4f32.nxv4i64( + <vscale x 4 x float> poison, + ptr %0, + <vscale x 4 x i64> %1, + i64 %2) + + ret <vscale x 4 x float> %a +} + +declare <vscale x 4 x float> @llvm.riscv.vloxei.mask.nxv4f32.nxv4i64( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i64>, + <vscale x 4 x i1>, + i64, + i64); + +define <vscale x 4 x float> @intrinsic_vloxei_mask_v_nxv4f32_nxv4f32_nxv4i64(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: 
intrinsic_vloxei_mask_v_nxv4f32_nxv4f32_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, mu +; CHECK-NEXT: vloxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x float> @llvm.riscv.vloxei.mask.nxv4f32.nxv4i64( + <vscale x 4 x float> %0, + ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 4 x float> %a +} + +declare <vscale x 8 x float> @llvm.riscv.vloxei.nxv8f32.nxv8i64( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i64>, + i64); + +define <vscale x 8 x float> @intrinsic_vloxei_v_nxv8f32_nxv8f32_nxv8i64(ptr %0, <vscale x 8 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv8f32_nxv8f32_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vloxei64.v v16, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x float> @llvm.riscv.vloxei.nxv8f32.nxv8i64( + <vscale x 8 x float> poison, + ptr %0, + <vscale x 8 x i64> %1, + i64 %2) + + ret <vscale x 8 x float> %a +} + +declare <vscale x 8 x float> @llvm.riscv.vloxei.mask.nxv8f32.nxv8i64( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i64>, + <vscale x 8 x i1>, + i64, + i64); + +define <vscale x 8 x float> @intrinsic_vloxei_mask_v_nxv8f32_nxv8f32_nxv8i64(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8f32_nxv8f32_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, mu +; CHECK-NEXT: vloxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x float> @llvm.riscv.vloxei.mask.nxv8f32.nxv8i64( + <vscale x 8 x float> %0, + ptr %1, + <vscale x 8 x i64> %2, + <vscale x 8 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 8 x float> %a +} + +declare <vscale x 1 x double> @llvm.riscv.vloxei.nxv1f64.nxv1i64( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i64>, + i64); + +define <vscale x 1 x double> @intrinsic_vloxei_v_nxv1f64_nxv1f64_nxv1i64(ptr %0, <vscale x 1 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv1f64_nxv1f64_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vloxei64.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x double> @llvm.riscv.vloxei.nxv1f64.nxv1i64( + <vscale x 1 x double> poison, + ptr %0, + <vscale x 1 x i64> %1, + i64 %2) + + ret <vscale x 1 x double> %a +} + +declare <vscale x 1 x double> @llvm.riscv.vloxei.mask.nxv1f64.nxv1i64( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64, + i64); + +define <vscale x 1 x double> @intrinsic_vloxei_mask_v_nxv1f64_nxv1f64_nxv1i64(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1f64_nxv1f64_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, mu +; CHECK-NEXT: vloxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x double> @llvm.riscv.vloxei.mask.nxv1f64.nxv1i64( + <vscale x 1 x double> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 1 x double> %a +} + +declare <vscale x 2 x double> @llvm.riscv.vloxei.nxv2f64.nxv2i64( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i64>, + i64); + +define <vscale x 2 x double> @intrinsic_vloxei_v_nxv2f64_nxv2f64_nxv2i64(ptr %0, <vscale x 2 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: 
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
+; CHECK-NEXT:    vloxei64.v v8, (a0), v8
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 2 x double> @llvm.riscv.vloxei.nxv2f64.nxv2i64(
+    <vscale x 2 x double> poison,
+    ptr %0,
+    <vscale x 2 x i64> %1,
+    i64 %2)
+
+  ret <vscale x 2 x double> %a
+}
+
+declare <vscale x 2 x double> @llvm.riscv.vloxei.mask.nxv2f64.nxv2i64(
+  <vscale x 2 x double>,
+  ptr,
+  <vscale x 2 x i64>,
+  <vscale x 2 x i1>,
+  i64,
+  i64);
+
+define <vscale x 2 x double> @intrinsic_vloxei_mask_v_nxv2f64_nxv2f64_nxv2i64(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2f64_nxv2f64_nxv2i64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, mu
+; CHECK-NEXT:    vloxei64.v v8, (a0), v10, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 2 x double> @llvm.riscv.vloxei.mask.nxv2f64.nxv2i64(
+    <vscale x 2 x double> %0,
+    ptr %1,
+    <vscale x 2 x i64> %2,
+    <vscale x 2 x i1> %3,
+    i64 %4, i64 1)
+
+  ret <vscale x 2 x double> %a
+}
+
+declare <vscale x 4 x double> @llvm.riscv.vloxei.nxv4f64.nxv4i64(
+  <vscale x 4 x double>,
+  ptr,
+  <vscale x 4 x i64>,
+  i64);
+
+define <vscale x 4 x double> @intrinsic_vloxei_v_nxv4f64_nxv4f64_nxv4i64(ptr %0, <vscale x 4 x i64> %1, i64 %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv4f64_nxv4f64_nxv4i64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
+; CHECK-NEXT:    vloxei64.v v8, (a0), v8
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 4 x double> @llvm.riscv.vloxei.nxv4f64.nxv4i64(
+    <vscale x 4 x double> poison,
+    ptr %0,
+    <vscale x 4 x i64> %1,
+    i64 %2)
+
+  ret <vscale x 4 x double> %a
+}
+
+declare <vscale x 4 x double> @llvm.riscv.vloxei.mask.nxv4f64.nxv4i64(
+  <vscale x 4 x double>,
+  ptr,
+  <vscale x 4 x i64>,
+  <vscale x 4 x i1>,
+  i64,
+  i64);
+
+define <vscale x 4 x double> @intrinsic_vloxei_mask_v_nxv4f64_nxv4f64_nxv4i64(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4f64_nxv4f64_nxv4i64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, mu
+; CHECK-NEXT:    vloxei64.v v8, (a0), v12, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 4 x double> @llvm.riscv.vloxei.mask.nxv4f64.nxv4i64(
+    <vscale x 4 x double> %0,
+    ptr %1,
+    <vscale x 4 x i64> %2,
+    <vscale x 4 x i1> %3,
+    i64 %4, i64 1)
+
+  ret <vscale x 4 x double> %a
+}
+
+declare <vscale x 8 x double> @llvm.riscv.vloxei.nxv8f64.nxv8i64(
+  <vscale x 8 x double>,
+  ptr,
+  <vscale x 8 x i64>,
+  i64);
+
+define <vscale x 8 x double> @intrinsic_vloxei_v_nxv8f64_nxv8f64_nxv8i64(ptr %0, <vscale x 8 x i64> %1, i64 %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv8f64_nxv8f64_nxv8i64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; CHECK-NEXT:    vloxei64.v v8, (a0), v8
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 8 x double> @llvm.riscv.vloxei.nxv8f64.nxv8i64(
+    <vscale x 8 x double> poison,
+    ptr %0,
+    <vscale x 8 x i64> %1,
+    i64 %2)
+
+  ret <vscale x 8 x double> %a
+}
+
+declare <vscale x 8 x double> @llvm.riscv.vloxei.mask.nxv8f64.nxv8i64(
+  <vscale x 8 x double>,
+  ptr,
+  <vscale x 8 x i64>,
+  <vscale x 8 x i1>,
+  i64,
+  i64);
+
+define <vscale x 8 x double> @intrinsic_vloxei_mask_v_nxv8f64_nxv8f64_nxv8i64(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8f64_nxv8f64_nxv8i64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, mu
+; CHECK-NEXT:    vloxei64.v v8, (a0), v16, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 8 x double> @llvm.riscv.vloxei.mask.nxv8f64.nxv8i64(
+    <vscale x 8 x double> %0,
+    ptr %1,
+    <vscale x 8 x i64> %2,
+    <vscale x 8 x i1> %3,
+    i64 %4, i64 1)
+
+  ret <vscale x 8 x double> %a
+}
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vloxei.ll b/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vloxei.ll
new file mode 100644
index 0000000000000..fafd45b7579e8
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vloxei.ll
@@ -0,0 +1,5100 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zvfhmin,+zvfbfmin \
+; RUN:   -global-isel -verify-machineinstrs -target-abi=ilp32d | FileCheck %s
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zvfhmin,+zvfbfmin \
+; RUN:   -global-isel -verify-machineinstrs -target-abi=lp64d | FileCheck %s
+
+declare <vscale x 1 x i8> @llvm.riscv.vloxei.nxv1i8.nxv1i32(
+  <vscale x 1 x i8>,
+  ptr,
+  <vscale x 1 x i32>,
+  iXLen);
+
+define <vscale x 1 x i8> @intrinsic_vloxei_v_nxv1i8_nxv1i8_nxv1i32(ptr %0, <vscale x 1 x i32> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv1i8_nxv1i8_nxv1i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma
+; CHECK-NEXT:    vloxei32.v v9, (a0), v8
+; CHECK-NEXT:    vmv1r.v v8, v9
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 1 x i8> @llvm.riscv.vloxei.nxv1i8.nxv1i32(
+    <vscale x 1 x i8> poison,
+    ptr %0,
+    <vscale x 1 x i32> %1,
+    iXLen %2)
+
+  ret <vscale x 1 x i8> %a
+}
+
+declare <vscale x 1 x i8> @llvm.riscv.vloxei.mask.nxv1i8.nxv1i32(
+  <vscale x 1 x i8>,
+  ptr,
+  <vscale x 1 x i32>,
+  <vscale x 1 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 1 x i8> @intrinsic_vloxei_mask_v_nxv1i8_nxv1i8_nxv1i32(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1i8_nxv1i8_nxv1i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, mu
+; CHECK-NEXT:    vloxei32.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 1 x i8> @llvm.riscv.vloxei.mask.nxv1i8.nxv1i32(
+    <vscale x 1 x i8> %0,
+    ptr %1,
+    <vscale x 1 x i32> %2,
+    <vscale x 1 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 1 x i8> %a
+}
+
+declare <vscale x 2 x i8> @llvm.riscv.vloxei.nxv2i8.nxv2i32(
+  <vscale x 2 x i8>,
+  ptr,
+  <vscale x 2 x i32>,
+  iXLen);
+
+define <vscale x 2 x i8> @intrinsic_vloxei_v_nxv2i8_nxv2i8_nxv2i32(ptr %0, <vscale x 2 x i32> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv2i8_nxv2i8_nxv2i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma
+; CHECK-NEXT:    vloxei32.v v9, (a0), v8
+; CHECK-NEXT:    vmv1r.v v8, v9
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 2 x i8> @llvm.riscv.vloxei.nxv2i8.nxv2i32(
+    <vscale x 2 x i8> poison,
+    ptr %0,
+    <vscale x 2 x i32> %1,
+    iXLen %2)
+
+  ret <vscale x 2 x i8> %a
+}
+
+declare <vscale x 2 x i8> @llvm.riscv.vloxei.mask.nxv2i8.nxv2i32(
+  <vscale x 2 x i8>,
+  ptr,
+  <vscale x 2 x i32>,
+  <vscale x 2 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 2 x i8> @intrinsic_vloxei_mask_v_nxv2i8_nxv2i8_nxv2i32(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2i8_nxv2i8_nxv2i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, mu
+; CHECK-NEXT:    vloxei32.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 2 x i8> @llvm.riscv.vloxei.mask.nxv2i8.nxv2i32(
+    <vscale x 2 x i8> %0,
+    ptr %1,
+    <vscale x 2 x i32> %2,
+    <vscale x 2 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 2 x i8> %a
+}
+
+declare <vscale x 4 x i8> @llvm.riscv.vloxei.nxv4i8.nxv4i32(
+  <vscale x 4 x i8>,
+  ptr,
+  <vscale x 4 x i32>,
+  iXLen);
+
+define <vscale x 4 x i8> @intrinsic_vloxei_v_nxv4i8_nxv4i8_nxv4i32(ptr %0, <vscale x 4 x i32> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv4i8_nxv4i8_nxv4i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
+; CHECK-NEXT:    vloxei32.v v10, (a0), v8
+; CHECK-NEXT:    vmv1r.v v8, v10
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 4 x i8> @llvm.riscv.vloxei.nxv4i8.nxv4i32(
+    <vscale x 4 x i8> poison,
+    ptr %0,
+    <vscale x 4 x i32> %1,
+    iXLen %2)
+
+  ret <vscale x 4 x i8> %a
+}
+
+declare <vscale x 4 x i8> @llvm.riscv.vloxei.mask.nxv4i8.nxv4i32(
+  <vscale x 4 x i8>,
+  ptr,
+  <vscale x 4 x i32>,
+  <vscale x 4 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 4 x i8> @intrinsic_vloxei_mask_v_nxv4i8_nxv4i8_nxv4i32(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4i8_nxv4i8_nxv4i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, mu
+; CHECK-NEXT:    vloxei32.v v8, (a0), v10, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 4 x i8> @llvm.riscv.vloxei.mask.nxv4i8.nxv4i32(
+    <vscale x 4 x i8> %0,
+    ptr %1,
+    <vscale x 4 x i32> %2,
+    <vscale x 4 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 4 x i8> %a
+}
+
+declare <vscale x 8 x i8> @llvm.riscv.vloxei.nxv8i8.nxv8i32(
+  <vscale x 8 x i8>,
+  ptr,
+  <vscale x 8 x i32>,
+  iXLen);
+
+define <vscale x 8 x i8> @intrinsic_vloxei_v_nxv8i8_nxv8i8_nxv8i32(ptr %0, <vscale x 8 x i32> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv8i8_nxv8i8_nxv8i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-NEXT:    vloxei32.v v12, (a0), v8
+; CHECK-NEXT:    vmv.v.v v8, v12
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 8 x i8> @llvm.riscv.vloxei.nxv8i8.nxv8i32(
+    <vscale x 8 x i8> poison,
+    ptr %0,
+    <vscale x 8 x i32> %1,
+    iXLen %2)
+
+  ret <vscale x 8 x i8> %a
+}
+
+declare <vscale x 8 x i8> @llvm.riscv.vloxei.mask.nxv8i8.nxv8i32(
+  <vscale x 8 x i8>,
+  ptr,
+  <vscale x 8 x i32>,
+  <vscale x 8 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 8 x i8> @intrinsic_vloxei_mask_v_nxv8i8_nxv8i8_nxv8i32(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8i8_nxv8i8_nxv8i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, mu
+; CHECK-NEXT:    vloxei32.v v8, (a0), v12, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 8 x i8> @llvm.riscv.vloxei.mask.nxv8i8.nxv8i32(
+    <vscale x 8 x i8> %0,
+    ptr %1,
+    <vscale x 8 x i32> %2,
+    <vscale x 8 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 8 x i8> %a
+}
+
+declare <vscale x 16 x i8> @llvm.riscv.vloxei.nxv16i8.nxv16i32(
+  <vscale x 16 x i8>,
+  ptr,
+  <vscale x 16 x i32>,
+  iXLen);
+
+define <vscale x 16 x i8> @intrinsic_vloxei_v_nxv16i8_nxv16i8_nxv16i32(ptr %0, <vscale x 16 x i32> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv16i8_nxv16i8_nxv16i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
+; CHECK-NEXT:    vloxei32.v v16, (a0), v8
+; CHECK-NEXT:    vmv.v.v v8, v16
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 16 x i8> @llvm.riscv.vloxei.nxv16i8.nxv16i32(
+    <vscale x 16 x i8> poison,
+    ptr %0,
+    <vscale x 16 x i32> %1,
+    iXLen %2)
+
+  ret <vscale x 16 x i8> %a
+}
+
+declare <vscale x 16 x i8> @llvm.riscv.vloxei.mask.nxv16i8.nxv16i32(
+  <vscale x 16 x i8>,
+  ptr,
+  <vscale x 16 x i32>,
+  <vscale x 16 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 16 x i8> @intrinsic_vloxei_mask_v_nxv16i8_nxv16i8_nxv16i32(<vscale x 16 x i8> %0, ptr %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv16i8_nxv16i8_nxv16i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, mu
+; CHECK-NEXT:    vloxei32.v v8, (a0), v16, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 16 x i8> @llvm.riscv.vloxei.mask.nxv16i8.nxv16i32(
+    <vscale x 16 x i8> %0,
+    ptr %1,
+    <vscale x 16 x i32> %2,
+    <vscale x 16 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 16 x i8> %a
+}
+
+declare <vscale x 1 x i16> @llvm.riscv.vloxei.nxv1i16.nxv1i32(
+  <vscale x 1 x i16>,
+  ptr,
+  <vscale x 1 x i32>,
+  iXLen);
+
+define <vscale x 1 x i16> @intrinsic_vloxei_v_nxv1i16_nxv1i16_nxv1i32(ptr %0, <vscale x 1 x i32> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv1i16_nxv1i16_nxv1i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma
+; CHECK-NEXT:    vloxei32.v v9, (a0), v8
+; CHECK-NEXT:    vmv1r.v v8, v9
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 1 x i16> @llvm.riscv.vloxei.nxv1i16.nxv1i32(
+    <vscale x 1 x i16> poison,
+    ptr %0,
+    <vscale x 1 x i32> %1,
+    iXLen %2)
+
+  ret <vscale x 1 x i16> %a
+}
+
+declare <vscale x 1 x i16> @llvm.riscv.vloxei.mask.nxv1i16.nxv1i32(
+  <vscale x 1 x i16>,
+  ptr,
+  <vscale x 1 x i32>,
+  <vscale x 1 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 1 x i16> @intrinsic_vloxei_mask_v_nxv1i16_nxv1i16_nxv1i32(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1i16_nxv1i16_nxv1i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, mu
+; CHECK-NEXT:    vloxei32.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 1 x i16> @llvm.riscv.vloxei.mask.nxv1i16.nxv1i32(
+    <vscale x 1 x i16> %0,
+    ptr %1,
+    <vscale x 1 x i32> %2,
+    <vscale x 1 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 1 x i16> %a
+}
+
+declare <vscale x 2 x i16> @llvm.riscv.vloxei.nxv2i16.nxv2i32(
+  <vscale x 2 x i16>,
+  ptr,
+  <vscale x 2 x i32>,
+  iXLen);
+
+define <vscale x 2 x i16> @intrinsic_vloxei_v_nxv2i16_nxv2i16_nxv2i32(ptr %0, <vscale x 2 x i32> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv2i16_nxv2i16_nxv2i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma
+; CHECK-NEXT:    vloxei32.v v9, (a0), v8
+; CHECK-NEXT:    vmv1r.v v8, v9
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 2 x i16> @llvm.riscv.vloxei.nxv2i16.nxv2i32(
+    <vscale x 2 x i16> poison,
+    ptr %0,
+    <vscale x 2 x i32> %1,
+    iXLen %2)
+
+  ret <vscale x 2 x i16> %a
+}
+
+declare <vscale x 2 x i16> @llvm.riscv.vloxei.mask.nxv2i16.nxv2i32(
+  <vscale x 2 x i16>,
+  ptr,
+  <vscale x 2 x i32>,
+  <vscale x 2 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 2 x i16> @intrinsic_vloxei_mask_v_nxv2i16_nxv2i16_nxv2i32(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2i16_nxv2i16_nxv2i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, mu
+; CHECK-NEXT:    vloxei32.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 2 x i16> @llvm.riscv.vloxei.mask.nxv2i16.nxv2i32(
+    <vscale x 2 x i16> %0,
+    ptr %1,
+    <vscale x 2 x i32> %2,
+    <vscale x 2 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 2 x i16> %a
+}
+
+declare <vscale x 4 x i16> @llvm.riscv.vloxei.nxv4i16.nxv4i32(
+  <vscale x 4 x i16>,
+  ptr,
+  <vscale x 4 x i32>,
+  iXLen);
+
+define <vscale x 4 x i16> @intrinsic_vloxei_v_nxv4i16_nxv4i16_nxv4i32(ptr %0, <vscale x 4 x i32> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv4i16_nxv4i16_nxv4i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
+; CHECK-NEXT:    vloxei32.v v10, (a0), v8
+; CHECK-NEXT:    vmv.v.v v8, v10
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 4 x i16> @llvm.riscv.vloxei.nxv4i16.nxv4i32(
+    <vscale x 4 x i16> poison,
+    ptr %0,
+    <vscale x 4 x i32> %1,
+    iXLen %2)
+
+  ret <vscale x 4 x i16> %a
+}
+
+declare <vscale x 4 x i16> @llvm.riscv.vloxei.mask.nxv4i16.nxv4i32(
+  <vscale x 4 x i16>,
+  ptr,
+  <vscale x 4 x i32>,
+  <vscale x 4 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 4 x i16> @intrinsic_vloxei_mask_v_nxv4i16_nxv4i16_nxv4i32(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4i16_nxv4i16_nxv4i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, mu
+; CHECK-NEXT:    vloxei32.v v8, (a0), v10, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 4 x i16> @llvm.riscv.vloxei.mask.nxv4i16.nxv4i32(
+    <vscale x 4 x i16> %0,
+    ptr %1,
+    <vscale x 4 x i32> %2,
+    <vscale x 4 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 4 x i16> %a
+}
+
+declare <vscale x 8 x i16> @llvm.riscv.vloxei.nxv8i16.nxv8i32(
+  <vscale x 8 x i16>,
+  ptr,
+  <vscale x 8 x i32>,
+  iXLen);
+
+define <vscale x 8 x i16> @intrinsic_vloxei_v_nxv8i16_nxv8i16_nxv8i32(ptr %0, <vscale x 8 x i32> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv8i16_nxv8i16_nxv8i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
+; CHECK-NEXT:    vloxei32.v v12, (a0), v8
+; CHECK-NEXT:    vmv.v.v v8, v12
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 8 x i16> @llvm.riscv.vloxei.nxv8i16.nxv8i32(
+    <vscale x 8 x i16> poison,
+    ptr %0,
+    <vscale x 8 x i32> %1,
+    iXLen %2)
+
+  ret <vscale x 8 x i16> %a
+}
+
+declare <vscale x 8 x i16> @llvm.riscv.vloxei.mask.nxv8i16.nxv8i32(
+  <vscale x 8 x i16>,
+  ptr,
+  <vscale x 8 x i32>,
+  <vscale x 8 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 8 x i16> @intrinsic_vloxei_mask_v_nxv8i16_nxv8i16_nxv8i32(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8i16_nxv8i16_nxv8i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, mu
+; CHECK-NEXT:    vloxei32.v v8, (a0), v12, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 8 x i16> @llvm.riscv.vloxei.mask.nxv8i16.nxv8i32(
+    <vscale x 8 x i16> %0,
+    ptr %1,
+    <vscale x 8 x i32> %2,
+    <vscale x 8 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 8 x i16> %a
+}
+
+declare <vscale x 16 x i16> @llvm.riscv.vloxei.nxv16i16.nxv16i32(
+  <vscale x 16 x i16>,
+  ptr,
+  <vscale x 16 x i32>,
+  iXLen);
+
+define <vscale x 16 x i16> @intrinsic_vloxei_v_nxv16i16_nxv16i16_nxv16i32(ptr %0, <vscale x 16 x i32> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv16i16_nxv16i16_nxv16i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
+; CHECK-NEXT:    vloxei32.v v16, (a0), v8
+; CHECK-NEXT:    vmv.v.v v8, v16
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 16 x i16> @llvm.riscv.vloxei.nxv16i16.nxv16i32(
+    <vscale x 16 x i16> poison,
+    ptr %0,
+    <vscale x 16 x i32> %1,
+    iXLen %2)
+
+  ret <vscale x 16 x i16> %a
+}
+
+declare <vscale x 16 x i16> @llvm.riscv.vloxei.mask.nxv16i16.nxv16i32(
+  <vscale x 16 x i16>,
+  ptr,
+  <vscale x 16 x i32>,
+  <vscale x 16 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 16 x i16> @intrinsic_vloxei_mask_v_nxv16i16_nxv16i16_nxv16i32(<vscale x 16 x i16> %0, ptr %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv16i16_nxv16i16_nxv16i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, mu
+; CHECK-NEXT:    vloxei32.v v8, (a0), v16, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 16 x i16> @llvm.riscv.vloxei.mask.nxv16i16.nxv16i32(
+    <vscale x 16 x i16> %0,
+    ptr %1,
+    <vscale x 16 x i32> %2,
+    <vscale x 16 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 16 x i16> %a
+}
+
+declare <vscale x 1 x i32> @llvm.riscv.vloxei.nxv1i32.nxv1i32(
+  <vscale x 1 x i32>,
+  ptr,
+  <vscale x 1 x i32>,
+  iXLen);
+
+define <vscale x 1 x i32> @intrinsic_vloxei_v_nxv1i32_nxv1i32_nxv1i32(ptr %0, <vscale x 1 x i32> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv1i32_nxv1i32_nxv1i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma
+; CHECK-NEXT:    vloxei32.v v8, (a0), v8
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 1 x i32> @llvm.riscv.vloxei.nxv1i32.nxv1i32(
+    <vscale x 1 x i32> poison,
+    ptr %0,
+    <vscale x 1 x i32> %1,
+    iXLen %2)
+
+  ret <vscale x 1 x i32> %a
+}
+
+declare <vscale x 1 x i32> @llvm.riscv.vloxei.mask.nxv1i32.nxv1i32(
+  <vscale x 1 x i32>,
+  ptr,
+  <vscale x 1 x i32>,
+  <vscale x 1 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 1 x i32> @intrinsic_vloxei_mask_v_nxv1i32_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1i32_nxv1i32_nxv1i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, mu
+; CHECK-NEXT:    vloxei32.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 1 x i32> @llvm.riscv.vloxei.mask.nxv1i32.nxv1i32(
+    <vscale x 1 x i32> %0,
+    ptr %1,
+    <vscale x 1 x i32> %2,
+    <vscale x 1 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 1 x i32> %a
+}
+
+declare <vscale x 2 x i32> @llvm.riscv.vloxei.nxv2i32.nxv2i32(
+  <vscale x 2 x i32>,
+  ptr,
+  <vscale x 2 x i32>,
+  iXLen);
+
+define <vscale x 2 x i32> @intrinsic_vloxei_v_nxv2i32_nxv2i32_nxv2i32(ptr %0, <vscale x 2 x i32> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv2i32_nxv2i32_nxv2i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
+; CHECK-NEXT:    vloxei32.v v8, (a0), v8
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 2 x i32> @llvm.riscv.vloxei.nxv2i32.nxv2i32(
+    <vscale x 2 x i32> poison,
+    ptr %0,
+    <vscale x 2 x i32> %1,
+    iXLen %2)
+
+  ret <vscale x 2 x i32> %a
+}
+
+declare <vscale x 2 x i32> @llvm.riscv.vloxei.mask.nxv2i32.nxv2i32(
+  <vscale x 2 x i32>,
+  ptr,
+  <vscale x 2 x i32>,
+  <vscale x 2 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 2 x i32> @intrinsic_vloxei_mask_v_nxv2i32_nxv2i32_nxv2i32(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2i32_nxv2i32_nxv2i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, mu
+; CHECK-NEXT:    vloxei32.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 2 x i32> @llvm.riscv.vloxei.mask.nxv2i32.nxv2i32(
+    <vscale x 2 x i32> %0,
+    ptr %1,
+    <vscale x 2 x i32> %2,
+    <vscale x 2 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 2 x i32> %a
+}
+
+declare <vscale x 4 x i32> @llvm.riscv.vloxei.nxv4i32.nxv4i32(
+  <vscale x 4 x i32>,
+  ptr,
+  <vscale x 4 x i32>,
+  iXLen);
+
+define <vscale x 4 x i32> @intrinsic_vloxei_v_nxv4i32_nxv4i32_nxv4i32(ptr %0, <vscale x 4 x i32> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv4i32_nxv4i32_nxv4i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
+; CHECK-NEXT:    vloxei32.v v8, (a0), v8
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 4 x i32> @llvm.riscv.vloxei.nxv4i32.nxv4i32(
+    <vscale x 4 x i32> poison,
+    ptr %0,
+    <vscale x 4 x i32> %1,
+    iXLen %2)
+
+  ret <vscale x 4 x i32> %a
+}
+
+declare <vscale x 4 x i32> @llvm.riscv.vloxei.mask.nxv4i32.nxv4i32(
+  <vscale x 4 x i32>,
+  ptr,
+  <vscale x 4 x i32>,
+  <vscale x 4 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 4 x i32> @intrinsic_vloxei_mask_v_nxv4i32_nxv4i32_nxv4i32(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4i32_nxv4i32_nxv4i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, mu
+; CHECK-NEXT:    vloxei32.v v8, (a0), v10, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 4 x i32> @llvm.riscv.vloxei.mask.nxv4i32.nxv4i32(
+    <vscale x 4 x i32> %0,
+    ptr %1,
+    <vscale x 4 x i32> %2,
+    <vscale x 4 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 4 x i32> %a
+}
+
+declare <vscale x 8 x i32> @llvm.riscv.vloxei.nxv8i32.nxv8i32(
+  <vscale x 8 x i32>,
+  ptr,
+  <vscale x 8 x i32>,
+  iXLen);
+
+define <vscale x 8 x i32> @intrinsic_vloxei_v_nxv8i32_nxv8i32_nxv8i32(ptr %0, <vscale x 8 x i32> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv8i32_nxv8i32_nxv8i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
+; CHECK-NEXT:    vloxei32.v v8, (a0), v8
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 8 x i32> @llvm.riscv.vloxei.nxv8i32.nxv8i32(
+    <vscale x 8 x i32> poison,
+    ptr %0,
+    <vscale x 8 x i32> %1,
+    iXLen %2)
+
+  ret <vscale x 8 x i32> %a
+}
+
+declare <vscale x 8 x i32> @llvm.riscv.vloxei.mask.nxv8i32.nxv8i32(
+  <vscale x 8 x i32>,
+  ptr,
+  <vscale x 8 x i32>,
+  <vscale x 8 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 8 x i32> @intrinsic_vloxei_mask_v_nxv8i32_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8i32_nxv8i32_nxv8i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, mu
+; CHECK-NEXT:    vloxei32.v v8, (a0), v12, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 8 x i32> @llvm.riscv.vloxei.mask.nxv8i32.nxv8i32(
+    <vscale x 8 x i32> %0,
+    ptr %1,
+    <vscale x 8 x i32> %2,
+    <vscale x 8 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 8 x i32> %a
+}
+
+declare <vscale x 16 x i32> @llvm.riscv.vloxei.nxv16i32.nxv16i32(
+  <vscale x 16 x i32>,
+  ptr,
+  <vscale x 16 x i32>,
+  iXLen);
+
+define <vscale x 16 x i32> @intrinsic_vloxei_v_nxv16i32_nxv16i32_nxv16i32(ptr %0, <vscale x 16 x i32> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv16i32_nxv16i32_nxv16i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
+; CHECK-NEXT:    vloxei32.v v8, (a0), v8
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 16 x i32> @llvm.riscv.vloxei.nxv16i32.nxv16i32(
+    <vscale x 16 x i32> poison,
+    ptr %0,
+    <vscale x 16 x i32> %1,
+    iXLen %2)
+
+  ret <vscale x 16 x i32> %a
+}
+
+declare <vscale x 16 x i32> @llvm.riscv.vloxei.mask.nxv16i32.nxv16i32(
+  <vscale x 16 x i32>,
+  ptr,
+  <vscale x 16 x i32>,
+  <vscale x 16 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 16 x i32> @intrinsic_vloxei_mask_v_nxv16i32_nxv16i32_nxv16i32(<vscale x 16 x i32> %0, ptr %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv16i32_nxv16i32_nxv16i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, mu
+; CHECK-NEXT:    vloxei32.v v8, (a0), v16, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 16 x i32> @llvm.riscv.vloxei.mask.nxv16i32.nxv16i32(
+    <vscale x 16 x i32> %0,
+    ptr %1,
+    <vscale x 16 x i32> %2,
+    <vscale x 16 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 16 x i32> %a
+}
+
+declare <vscale x 1 x i64> @llvm.riscv.vloxei.nxv1i64.nxv1i32(
+  <vscale x 1 x i64>,
+  ptr,
+  <vscale x 1 x i32>,
+  iXLen);
+
+define <vscale x 1 x i64> @intrinsic_vloxei_v_nxv1i64_nxv1i64_nxv1i32(ptr %0, <vscale x 1 x i32> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv1i64_nxv1i64_nxv1i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, ma
+; CHECK-NEXT:    vloxei32.v v9, (a0), v8
+; CHECK-NEXT:    vmv.v.v v8, v9
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 1 x i64> @llvm.riscv.vloxei.nxv1i64.nxv1i32(
+    <vscale x 1 x i64> poison,
+    ptr %0,
+    <vscale x 1 x i32> %1,
+    iXLen %2)
+
+  ret <vscale x 1 x i64> %a
+}
+
+declare <vscale x 1 x i64> @llvm.riscv.vloxei.mask.nxv1i64.nxv1i32(
+  <vscale x 1 x i64>,
+  ptr,
+  <vscale x 1 x i32>,
+  <vscale x 1 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 1 x i64> @intrinsic_vloxei_mask_v_nxv1i64_nxv1i64_nxv1i32(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1i64_nxv1i64_nxv1i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, mu
+; CHECK-NEXT:    vloxei32.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 1 x i64> @llvm.riscv.vloxei.mask.nxv1i64.nxv1i32(
+    <vscale x 1 x i64> %0,
+    ptr %1,
+    <vscale x 1 x i32> %2,
+    <vscale x 1 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 1 x i64> %a
+}
+
+declare <vscale x 2 x i64> @llvm.riscv.vloxei.nxv2i64.nxv2i32(
+  <vscale x 2 x i64>,
+  ptr,
+  <vscale x 2 x i32>,
+  iXLen);
+
+define <vscale x 2 x i64> @intrinsic_vloxei_v_nxv2i64_nxv2i64_nxv2i32(ptr %0, <vscale x 2 x i32> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv2i64_nxv2i64_nxv2i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
+; CHECK-NEXT:    vloxei32.v v8, (a0), v10
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 2 x i64> @llvm.riscv.vloxei.nxv2i64.nxv2i32(
+    <vscale x 2 x i64> poison,
+    ptr %0,
+    <vscale x 2 x i32> %1,
+    iXLen %2)
+
+  ret <vscale x 2 x i64> %a
+}
+
+declare <vscale x 2 x i64> @llvm.riscv.vloxei.mask.nxv2i64.nxv2i32(
+  <vscale x 2 x i64>,
+  ptr,
+  <vscale x 2 x i32>,
+  <vscale x 2 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 2 x i64> @intrinsic_vloxei_mask_v_nxv2i64_nxv2i64_nxv2i32(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2i64_nxv2i64_nxv2i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, mu
+; CHECK-NEXT:    vloxei32.v v8, (a0), v10, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 2 x i64> @llvm.riscv.vloxei.mask.nxv2i64.nxv2i32(
+    <vscale x 2 x i64> %0,
+    ptr %1,
+    <vscale x 2 x i32> %2,
+    <vscale x 2 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 2 x i64> %a
+}
+
+declare <vscale x 4 x i64> @llvm.riscv.vloxei.nxv4i64.nxv4i32(
+  <vscale x 4 x i64>,
+  ptr,
+  <vscale x 4 x i32>,
+  iXLen);
+
+define <vscale x 4 x i64> @intrinsic_vloxei_v_nxv4i64_nxv4i64_nxv4i32(ptr %0, <vscale x 4 x i32> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv4i64_nxv4i64_nxv4i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
+; CHECK-NEXT:    vloxei32.v v8, (a0), v12
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 4 x i64> @llvm.riscv.vloxei.nxv4i64.nxv4i32(
+    <vscale x 4 x i64> poison,
+    ptr %0,
+    <vscale x 4 x i32> %1,
+    iXLen %2)
+
+  ret <vscale x 4 x i64> %a
+}
+
+declare <vscale x 4 x i64> @llvm.riscv.vloxei.mask.nxv4i64.nxv4i32(
+  <vscale x 4 x i64>,
+  ptr,
+  <vscale x 4 x i32>,
+  <vscale x 4 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 4 x i64> @intrinsic_vloxei_mask_v_nxv4i64_nxv4i64_nxv4i32(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4i64_nxv4i64_nxv4i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, mu
+; CHECK-NEXT:    vloxei32.v v8, (a0), v12, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 4 x i64> @llvm.riscv.vloxei.mask.nxv4i64.nxv4i32(
+    <vscale x 4 x i64> %0,
+    ptr %1,
+    <vscale x 4 x i32> %2,
+    <vscale x 4 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 4 x i64> %a
+}
+
+declare <vscale x 8 x i64> @llvm.riscv.vloxei.nxv8i64.nxv8i32(
+  <vscale x 8 x i64>,
+  ptr,
+  <vscale x 8 x i32>,
+  iXLen);
+
+define <vscale x 8 x i64> @intrinsic_vloxei_v_nxv8i64_nxv8i64_nxv8i32(ptr %0, <vscale x 8 x i32> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv8i64_nxv8i64_nxv8i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
+; CHECK-NEXT:    vmv4r.v v16, v8
+; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; CHECK-NEXT:    vloxei32.v v8, (a0), v16
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 8 x i64> @llvm.riscv.vloxei.nxv8i64.nxv8i32(
+    <vscale x 8 x i64> poison,
+    ptr %0,
+    <vscale x 8 x i32> %1,
+    iXLen %2)
+
+  ret <vscale x 8 x i64> %a
+}
+
+declare <vscale x 8 x i64> @llvm.riscv.vloxei.mask.nxv8i64.nxv8i32(
+  <vscale x 8 x i64>,
+  ptr,
+  <vscale x 8 x i32>,
+  <vscale x 8 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 8 x i64> @intrinsic_vloxei_mask_v_nxv8i64_nxv8i64_nxv8i32(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8i64_nxv8i64_nxv8i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, mu
+; CHECK-NEXT:    vloxei32.v v8, (a0), v16, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 8 x i64> @llvm.riscv.vloxei.mask.nxv8i64.nxv8i32(
+    <vscale x 8 x i64> %0,
+    ptr %1,
+    <vscale x 8 x i32> %2,
+    <vscale x 8 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 8 x i64> %a
+}
+
+declare <vscale x 1 x half> @llvm.riscv.vloxei.nxv1f16.nxv1i32(
+  <vscale x 1 x half>,
+  ptr,
+  <vscale x 1 x i32>,
+  iXLen);
+
+define <vscale x 1 x half> @intrinsic_vloxei_v_nxv1f16_nxv1f16_nxv1i32(ptr %0, <vscale x 1 x i32> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv1f16_nxv1f16_nxv1i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma
+; CHECK-NEXT:    vloxei32.v v9, (a0), v8
+; CHECK-NEXT:    vmv1r.v v8, v9
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 1 x half> @llvm.riscv.vloxei.nxv1f16.nxv1i32(
+    <vscale x 1 x half> poison,
+    ptr %0,
+    <vscale x 1 x i32> %1,
+    iXLen %2)
+
+  ret <vscale x 1 x half> %a
+}
+
+declare <vscale x 1 x half> @llvm.riscv.vloxei.mask.nxv1f16.nxv1i32(
+  <vscale x 1 x half>,
+  ptr,
+  <vscale x 1 x i32>,
+  <vscale x 1 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 1 x half> @intrinsic_vloxei_mask_v_nxv1f16_nxv1f16_nxv1i32(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1f16_nxv1f16_nxv1i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, mu
+; CHECK-NEXT:    vloxei32.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 1 x half> @llvm.riscv.vloxei.mask.nxv1f16.nxv1i32(
+    <vscale x 1 x half> %0,
+    ptr %1,
+    <vscale x 1 x i32> %2,
+    <vscale x 1 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 1 x half> %a
+}
+
+declare <vscale x 2 x half> @llvm.riscv.vloxei.nxv2f16.nxv2i32(
+  <vscale x 2 x half>,
+  ptr,
+  <vscale x 2 x i32>,
+  iXLen);
+
+define <vscale x 2 x half> @intrinsic_vloxei_v_nxv2f16_nxv2f16_nxv2i32(ptr %0, <vscale x 2 x i32> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv2f16_nxv2f16_nxv2i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma
+; CHECK-NEXT:    vloxei32.v v9, (a0), v8
+; CHECK-NEXT:    vmv1r.v v8, v9
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 2 x half> @llvm.riscv.vloxei.nxv2f16.nxv2i32(
+    <vscale x 2 x half> poison,
+    ptr %0,
+    <vscale x 2 x i32> %1,
+    iXLen %2)
+
+  ret <vscale x 2 x half> %a
+}
+
+declare <vscale x 2 x half> @llvm.riscv.vloxei.mask.nxv2f16.nxv2i32(
+  <vscale x 2 x half>,
+  ptr,
+  <vscale x 2 x i32>,
+  <vscale x 2 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 2 x half> @intrinsic_vloxei_mask_v_nxv2f16_nxv2f16_nxv2i32(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2f16_nxv2f16_nxv2i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, mu
+; CHECK-NEXT:    vloxei32.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 2 x half> @llvm.riscv.vloxei.mask.nxv2f16.nxv2i32(
+    <vscale x 2 x half> %0,
+    ptr %1,
+    <vscale x 2 x i32> %2,
+    <vscale x 2 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 2 x half> %a
+}
+
+declare <vscale x 4 x half> @llvm.riscv.vloxei.nxv4f16.nxv4i32(
+  <vscale x 4 x half>,
+  ptr,
+  <vscale x 4 x i32>,
+  iXLen);
+
+define <vscale x 4 x half> @intrinsic_vloxei_v_nxv4f16_nxv4f16_nxv4i32(ptr %0, <vscale x 4 x i32> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv4f16_nxv4f16_nxv4i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
+; CHECK-NEXT:    vloxei32.v v10, (a0), v8
+; CHECK-NEXT:    vmv.v.v v8, v10
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 4 x half> @llvm.riscv.vloxei.nxv4f16.nxv4i32(
+    <vscale x 4 x half> poison,
+    ptr %0,
+    <vscale x 4 x i32> %1,
+    iXLen %2)
+
+  ret <vscale x 4 x half> %a
+}
+
+declare <vscale x 4 x half> @llvm.riscv.vloxei.mask.nxv4f16.nxv4i32(
+  <vscale x 4 x half>,
+  ptr,
+  <vscale x 4 x i32>,
+  <vscale x 4 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 4 x half> @intrinsic_vloxei_mask_v_nxv4f16_nxv4f16_nxv4i32(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4f16_nxv4f16_nxv4i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, mu
+; CHECK-NEXT:    vloxei32.v v8, (a0), v10, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 4 x half> @llvm.riscv.vloxei.mask.nxv4f16.nxv4i32(
+    <vscale x 4 x half> %0,
+    ptr %1,
+    <vscale x 4 x i32> %2,
+    <vscale x 4 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 4 x half> %a
+}
+
+declare <vscale x 8 x half> @llvm.riscv.vloxei.nxv8f16.nxv8i32(
+  <vscale x 8 x half>,
+  ptr,
+  <vscale x 8 x i32>,
+  iXLen);
+
+define <vscale x 8 x half> @intrinsic_vloxei_v_nxv8f16_nxv8f16_nxv8i32(ptr %0, <vscale x 8 x i32> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv8f16_nxv8f16_nxv8i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
+; CHECK-NEXT:    vloxei32.v v12, (a0), v8
+; CHECK-NEXT:    vmv.v.v v8, v12
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 8 x half> @llvm.riscv.vloxei.nxv8f16.nxv8i32(
+    <vscale x 8 x half> poison,
+    ptr %0,
+    <vscale x 8 x i32> %1,
+    iXLen %2)
+
+  ret <vscale x 8 x half> %a
+}
+
+declare <vscale x 8 x half> @llvm.riscv.vloxei.mask.nxv8f16.nxv8i32(
+  <vscale x 8 x half>,
+  ptr,
+  <vscale x 8 x i32>,
+  <vscale x 8 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 8 x half> @intrinsic_vloxei_mask_v_nxv8f16_nxv8f16_nxv8i32(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8f16_nxv8f16_nxv8i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, mu
+; CHECK-NEXT:    vloxei32.v v8, (a0), v12, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 8 x half> @llvm.riscv.vloxei.mask.nxv8f16.nxv8i32(
+    <vscale x 8 x half> %0,
+    ptr %1,
+    <vscale x 8 x i32> %2,
+    <vscale x 8 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 8 x half> %a
+}
+
+declare <vscale x 16 x half> @llvm.riscv.vloxei.nxv16f16.nxv16i32(
+  <vscale x 16 x half>,
+  ptr,
+  <vscale x 16 x i32>,
+  iXLen);
+
+define <vscale x 16 x half> @intrinsic_vloxei_v_nxv16f16_nxv16f16_nxv16i32(ptr %0, <vscale x 16 x i32> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv16f16_nxv16f16_nxv16i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
+; CHECK-NEXT:    vloxei32.v v16, (a0), v8
+; CHECK-NEXT:    vmv.v.v v8, v16
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 16 x half> @llvm.riscv.vloxei.nxv16f16.nxv16i32(
+    <vscale x 16 x half> poison,
+    ptr %0,
+    <vscale x 16 x i32> %1,
+    iXLen %2)
+
+  ret <vscale x 16 x half> %a
+}
+
+declare <vscale x 16 x half> @llvm.riscv.vloxei.mask.nxv16f16.nxv16i32(
+  <vscale x 16 x half>,
+  ptr,
+  <vscale x 16 x i32>,
+  <vscale x 16 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 16 x half> @intrinsic_vloxei_mask_v_nxv16f16_nxv16f16_nxv16i32(<vscale x 16 x half> %0, ptr %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv16f16_nxv16f16_nxv16i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, mu
+; CHECK-NEXT:    vloxei32.v v8, (a0), v16, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 16 x half> @llvm.riscv.vloxei.mask.nxv16f16.nxv16i32(
+    <vscale x 16 x half> %0,
+    ptr %1,
+    <vscale x 16 x i32> %2,
+    <vscale x 16 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 16 x half> %a
+}
+
+declare <vscale x 1 x float> @llvm.riscv.vloxei.nxv1f32.nxv1i32(
+  <vscale x 1 x float>,
+  ptr,
+  <vscale x 1 x i32>,
+  iXLen);
+
+define <vscale x 1 x float> @intrinsic_vloxei_v_nxv1f32_nxv1f32_nxv1i32(ptr %0, <vscale x 1 x i32> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv1f32_nxv1f32_nxv1i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma
+; CHECK-NEXT:    vloxei32.v v8, (a0), v8
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 1 x float> @llvm.riscv.vloxei.nxv1f32.nxv1i32(
+    <vscale x 1 x float> poison,
+    ptr %0,
+    <vscale x 1 x i32> %1,
+    iXLen %2)
+
+  ret <vscale x 1 x float> %a
+}
+
+declare <vscale x 1 x float> @llvm.riscv.vloxei.mask.nxv1f32.nxv1i32(
+  <vscale x 1 x float>,
+  ptr,
+  <vscale x 1 x i32>,
+  <vscale x 1 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 1 x float> @intrinsic_vloxei_mask_v_nxv1f32_nxv1f32_nxv1i32(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1f32_nxv1f32_nxv1i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, mu
+; CHECK-NEXT:    vloxei32.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 1 x float> @llvm.riscv.vloxei.mask.nxv1f32.nxv1i32(
+    <vscale x 1 x float> %0,
+    ptr %1,
+    <vscale x 1 x i32> %2,
+    <vscale x 1 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 1 x float> %a
+}
+
+declare <vscale x 2 x float> @llvm.riscv.vloxei.nxv2f32.nxv2i32(
+  <vscale x 2 x float>,
+  ptr,
+  <vscale x 2 x i32>,
+  iXLen);
+
+define <vscale x 2 x float> @intrinsic_vloxei_v_nxv2f32_nxv2f32_nxv2i32(ptr %0, <vscale x 2 x i32> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv2f32_nxv2f32_nxv2i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
+; CHECK-NEXT:    vloxei32.v v8, (a0), v8
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 2 x float> @llvm.riscv.vloxei.nxv2f32.nxv2i32(
+    <vscale x 2 x float> poison,
+    ptr %0,
+    <vscale x 2 x i32> %1,
+    iXLen %2)
+
+  ret <vscale x 2 x float> %a
+}
+
+declare <vscale x 2 x float> @llvm.riscv.vloxei.mask.nxv2f32.nxv2i32(
+  <vscale x 2 x float>,
+  ptr,
+  <vscale x 2 x i32>,
+  <vscale x 2 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 2 x float> @intrinsic_vloxei_mask_v_nxv2f32_nxv2f32_nxv2i32(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2f32_nxv2f32_nxv2i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, mu
+; CHECK-NEXT:    vloxei32.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 2 x float> @llvm.riscv.vloxei.mask.nxv2f32.nxv2i32(
+    <vscale x 2 x float> %0,
+    ptr %1,
+    <vscale x 2 x i32> %2,
+    <vscale x 2 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 2 x float> %a
+}
+
+declare <vscale x 4 x float> @llvm.riscv.vloxei.nxv4f32.nxv4i32(
+  <vscale x 4 x float>,
+  ptr,
+  <vscale x 4 x i32>,
+  iXLen);
+
+define <vscale x 4 x float> @intrinsic_vloxei_v_nxv4f32_nxv4f32_nxv4i32(ptr %0, <vscale x 4 x i32> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv4f32_nxv4f32_nxv4i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
+; CHECK-NEXT:    vloxei32.v v8, (a0), v8
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 4 x float> @llvm.riscv.vloxei.nxv4f32.nxv4i32(
+    <vscale x 4 x float> poison,
+    ptr %0,
+    <vscale x 4 x i32> %1,
+    iXLen %2)
+
+  ret <vscale x 4 x float> %a
+}
+
+declare <vscale x 4 x float> @llvm.riscv.vloxei.mask.nxv4f32.nxv4i32(
+  <vscale x 4 x float>,
+  ptr,
+  <vscale x 4 x i32>,
+  <vscale x 4 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 4 x float> @intrinsic_vloxei_mask_v_nxv4f32_nxv4f32_nxv4i32(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4f32_nxv4f32_nxv4i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, mu
+; CHECK-NEXT:    vloxei32.v v8, (a0), v10, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 4 x float> @llvm.riscv.vloxei.mask.nxv4f32.nxv4i32(
+    <vscale x 4 x float> %0,
+    ptr %1,
+    <vscale x 4 x i32> %2,
+    <vscale x 4 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 4 x float> %a
+}
+
+declare <vscale x 8 x float> @llvm.riscv.vloxei.nxv8f32.nxv8i32(
+  <vscale x 8 x float>,
+  ptr,
+  <vscale x 8 x i32>,
+  iXLen);
+
+define <vscale x 8 x float> @intrinsic_vloxei_v_nxv8f32_nxv8f32_nxv8i32(ptr %0, <vscale x 8 x i32> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv8f32_nxv8f32_nxv8i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
+; CHECK-NEXT:    vloxei32.v v8, (a0), v8
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 8 x float> @llvm.riscv.vloxei.nxv8f32.nxv8i32(
+    <vscale x 8 x float> poison,
+    ptr %0,
+    <vscale x 8 x i32> %1,
+    iXLen %2)
+
+  ret <vscale x 8 x float> %a
+}
+
+declare <vscale x 8 x float> @llvm.riscv.vloxei.mask.nxv8f32.nxv8i32(
+  <vscale x 8 x float>,
+  ptr,
+  <vscale x 8 x i32>,
+  <vscale x 8 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 8 x float> @intrinsic_vloxei_mask_v_nxv8f32_nxv8f32_nxv8i32(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8f32_nxv8f32_nxv8i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, mu
+; CHECK-NEXT:    vloxei32.v v8, (a0), v12, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 8 x float> @llvm.riscv.vloxei.mask.nxv8f32.nxv8i32(
+    <vscale x 8 x float> %0,
+    ptr %1,
+    <vscale x 8 x i32> %2,
+    <vscale x 8 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 8 x float> %a
+}
+
+declare <vscale x 16 x float> @llvm.riscv.vloxei.nxv16f32.nxv16i32(
+  <vscale x 16 x float>,
+  ptr,
+  <vscale x 16 x i32>,
+  iXLen);
+
+define <vscale x 16 x float> @intrinsic_vloxei_v_nxv16f32_nxv16f32_nxv16i32(ptr %0, <vscale x 16 x i32> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv16f32_nxv16f32_nxv16i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
+; CHECK-NEXT:    vloxei32.v v8, (a0), v8
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 16 x float> @llvm.riscv.vloxei.nxv16f32.nxv16i32(
+    <vscale x 16 x float> poison,
+    ptr %0,
+    <vscale x 16 x i32> %1,
+    iXLen %2)
+
+  ret <vscale x 16 x float> %a
+}
+
+declare <vscale x 16 x float> @llvm.riscv.vloxei.mask.nxv16f32.nxv16i32(
+  <vscale x 16 x float>,
+  ptr,
+  <vscale x 16 x i32>,
+  <vscale x 16 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 16 x float> @intrinsic_vloxei_mask_v_nxv16f32_nxv16f32_nxv16i32(<vscale x 16 x float> %0, ptr %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv16f32_nxv16f32_nxv16i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, mu
+; CHECK-NEXT:    vloxei32.v v8, (a0), v16, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 16 x float> @llvm.riscv.vloxei.mask.nxv16f32.nxv16i32(
+    <vscale x 16 x float> %0,
+    ptr %1,
+    <vscale x 16 x i32> %2,
+    <vscale x 16 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 16 x float> %a
+}
+
+declare <vscale x 1 x double> @llvm.riscv.vloxei.nxv1f64.nxv1i32(
+  <vscale x 1 x double>,
+  ptr,
+  <vscale x 1 x i32>,
+  iXLen);
+
+define <vscale x 1 x double> @intrinsic_vloxei_v_nxv1f64_nxv1f64_nxv1i32(ptr %0, <vscale x 1 x i32> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv1f64_nxv1f64_nxv1i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, ma
+; CHECK-NEXT:    vloxei32.v v9, (a0), v8
+; CHECK-NEXT:    vmv.v.v v8, v9
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 1 x double> @llvm.riscv.vloxei.nxv1f64.nxv1i32(
+    <vscale x 1 x double> poison,
+    ptr %0,
+    <vscale x 1 x i32> %1,
+    iXLen %2)
+
+  ret <vscale x 1 x double> %a
+}
+
+declare <vscale x 1 x double> @llvm.riscv.vloxei.mask.nxv1f64.nxv1i32(
+  <vscale x 1 x double>,
+  ptr,
+  <vscale x 1 x i32>,
+  <vscale x 1 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 1 x double> @intrinsic_vloxei_mask_v_nxv1f64_nxv1f64_nxv1i32(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1f64_nxv1f64_nxv1i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, mu
+; CHECK-NEXT:    vloxei32.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 1 x double> @llvm.riscv.vloxei.mask.nxv1f64.nxv1i32(
+    <vscale x 1 x double> %0,
+    ptr %1,
+    <vscale x 1 x i32> %2,
+    <vscale x 1 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 1 x double> %a
+}
+
+declare <vscale x 2 x double> @llvm.riscv.vloxei.nxv2f64.nxv2i32(
+  <vscale x 2 x double>,
+  ptr,
+  <vscale x 2 x i32>,
+  iXLen);
+
+define <vscale x 2 x double> @intrinsic_vloxei_v_nxv2f64_nxv2f64_nxv2i32(ptr %0, <vscale x 2 x i32> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv2f64_nxv2f64_nxv2i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
+; CHECK-NEXT:    vmv1r.v v10, v8
+; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
+; CHECK-NEXT:    vloxei32.v v8, (a0), v10
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 2 x double> @llvm.riscv.vloxei.nxv2f64.nxv2i32(
+    <vscale x 2 x double> poison,
+    ptr %0,
+    <vscale x 2 x i32> %1,
+    iXLen %2)
+
+  ret <vscale x 2 x double> %a
+}
+
+declare <vscale x 2 x double> @llvm.riscv.vloxei.mask.nxv2f64.nxv2i32(
+  <vscale x 2 x double>,
+  ptr,
+  <vscale x 2 x i32>,
+  <vscale x 2 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 2 x double> @intrinsic_vloxei_mask_v_nxv2f64_nxv2f64_nxv2i32(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2f64_nxv2f64_nxv2i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, mu
+; CHECK-NEXT:    vloxei32.v v8, (a0), v10, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 2 x double> @llvm.riscv.vloxei.mask.nxv2f64.nxv2i32(
+    <vscale x 2 x double> %0,
+    ptr %1,
+    <vscale x 2 x i32> %2,
+    <vscale x 2 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 2 x double> %a
+}
+
+declare <vscale x 4 x double> @llvm.riscv.vloxei.nxv4f64.nxv4i32(
+  <vscale x 4 x double>,
+  ptr,
+  <vscale x 4 x i32>,
+  iXLen);
+
+define <vscale x 4 x double> @intrinsic_vloxei_v_nxv4f64_nxv4f64_nxv4i32(ptr %0, <vscale x 4 x i32> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv4f64_nxv4f64_nxv4i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
+; CHECK-NEXT:    vmv2r.v v12, v8
+; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
+; CHECK-NEXT:    vloxei32.v v8, (a0), v12
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 4 x double> @llvm.riscv.vloxei.nxv4f64.nxv4i32(
+    <vscale x 4 x double> poison,
+    ptr %0,
+    <vscale x 4 x i32> %1,
+    iXLen %2)
+
+  ret <vscale x 4 x double> %a
+}
+
+declare <vscale x 4 x double> @llvm.riscv.vloxei.mask.nxv4f64.nxv4i32(
+  <vscale x 4 x double>,
+  ptr,
+  <vscale x 4 x i32>,
+  <vscale x 4 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 4 x double> @intrinsic_vloxei_mask_v_nxv4f64_nxv4f64_nxv4i32(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4f64_nxv4f64_nxv4i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, mu
+; CHECK-NEXT:    vloxei32.v v8, (a0), v12, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 4 x double> @llvm.riscv.vloxei.mask.nxv4f64.nxv4i32(
+    <vscale x 4 x double> %0,
+    ptr %1,
+    <vscale x 4 x i32> %2,
+    <vscale x 4 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 4 x double> %a
+}
+
+declare <vscale x 8 x double> @llvm.riscv.vloxei.nxv8f64.nxv8i32(
+  <vscale x 8 x double>,
+  ptr,
+  <vscale x 8 x i32>,
+  iXLen);
+
+define <vscale x 8 x double> @intrinsic_vloxei_v_nxv8f64_nxv8f64_nxv8i32(ptr %0, <vscale x 8 x i32> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv8f64_nxv8f64_nxv8i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
+; CHECK-NEXT:    vmv4r.v v16, v8
+; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; CHECK-NEXT:    vloxei32.v v8, (a0), v16
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 8 x double> @llvm.riscv.vloxei.nxv8f64.nxv8i32(
+    <vscale x 8 x double> poison,
+    ptr %0,
+    <vscale x 8 x i32> %1,
+    iXLen %2)
+
+  ret <vscale x 8 x double> %a
+}
+
+declare <vscale x 8 x double> @llvm.riscv.vloxei.mask.nxv8f64.nxv8i32(
+  <vscale x 8 x double>,
+  ptr,
+  <vscale x 8 x i32>,
+  <vscale x 8 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 8 x double> @intrinsic_vloxei_mask_v_nxv8f64_nxv8f64_nxv8i32(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8f64_nxv8f64_nxv8i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, mu
+; CHECK-NEXT:    vloxei32.v v8, (a0), v16, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 8 x double> @llvm.riscv.vloxei.mask.nxv8f64.nxv8i32(
+    <vscale x 8 x double> %0,
+    ptr %1,
+    <vscale x 8 x i32> %2,
+    <vscale x 8 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 8 x double> %a
+}
+
+declare <vscale x 1 x i8> @llvm.riscv.vloxei.nxv1i8.nxv1i16(
+  <vscale x 1 x i8>,
+  ptr,
+  <vscale x 1 x i16>,
+  iXLen);
+
+define <vscale x 1 x i8> @intrinsic_vloxei_v_nxv1i8_nxv1i8_nxv1i16(ptr %0, <vscale x 1 x i16> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv1i8_nxv1i8_nxv1i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma
+; CHECK-NEXT:    vloxei16.v v9, (a0), v8
+; CHECK-NEXT:    vmv1r.v v8, v9
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 1 x i8> @llvm.riscv.vloxei.nxv1i8.nxv1i16(
+    <vscale x 1 x i8> poison,
+    ptr %0,
+    <vscale x 1 x i16> %1,
+    iXLen %2)
+
+  ret <vscale x 1 x i8> %a
+}
+
+declare <vscale x 1 x i8> @llvm.riscv.vloxei.mask.nxv1i8.nxv1i16(
+  <vscale x 1 x i8>,
+  ptr,
+  <vscale x 1 x i16>,
+  <vscale x 1 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 1 x i8> @intrinsic_vloxei_mask_v_nxv1i8_nxv1i8_nxv1i16(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1i8_nxv1i8_nxv1i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, mu
+; CHECK-NEXT:    vloxei16.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 1 x i8> @llvm.riscv.vloxei.mask.nxv1i8.nxv1i16(
+    <vscale x 1 x i8> %0,
+    ptr %1,
+    <vscale x 1 x i16> %2,
+    <vscale x 1 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 1 x i8> %a
+}
+
+declare <vscale x 2 x i8> @llvm.riscv.vloxei.nxv2i8.nxv2i16(
+  <vscale x 2 x i8>,
+  ptr,
+  <vscale x 2 x i16>,
+  iXLen);
+
+define <vscale x 2 x i8> @intrinsic_vloxei_v_nxv2i8_nxv2i8_nxv2i16(ptr %0, <vscale x 2 x i16> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv2i8_nxv2i8_nxv2i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma
+; CHECK-NEXT:    vloxei16.v v9, (a0), v8
+; CHECK-NEXT:    vmv1r.v v8, v9
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 2 x i8> @llvm.riscv.vloxei.nxv2i8.nxv2i16(
+    <vscale x 2 x i8> poison,
+    ptr %0,
+    <vscale x 2 x i16> %1,
+    iXLen %2)
+
+  ret <vscale x 2 x i8> %a
+}
+
+declare <vscale x 2 x i8> @llvm.riscv.vloxei.mask.nxv2i8.nxv2i16(
+  <vscale x 2 x i8>,
+  ptr,
+  <vscale x 2 x i16>,
+  <vscale x 2 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 2 x i8> @intrinsic_vloxei_mask_v_nxv2i8_nxv2i8_nxv2i16(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2i8_nxv2i8_nxv2i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, mu
+; CHECK-NEXT:    vloxei16.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 2 x i8> @llvm.riscv.vloxei.mask.nxv2i8.nxv2i16(
+    <vscale x 2 x i8> %0,
+    ptr %1,
+    <vscale x 2 x i16> %2,
+    <vscale x 2 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 2 x i8> %a
+}
+
+declare <vscale x 4 x i8> @llvm.riscv.vloxei.nxv4i8.nxv4i16(
+  <vscale x 4 x i8>,
+  ptr,
+  <vscale x 4 x i16>,
+  iXLen);
+
+define <vscale x 4 x i8> @intrinsic_vloxei_v_nxv4i8_nxv4i8_nxv4i16(ptr %0, <vscale x 4 x i16> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv4i8_nxv4i8_nxv4i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
+; CHECK-NEXT:    vloxei16.v v9, (a0), v8
+; CHECK-NEXT:    vmv1r.v v8, v9
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 4 x i8> @llvm.riscv.vloxei.nxv4i8.nxv4i16(
+    <vscale x 4 x i8> poison,
+    ptr %0,
+    <vscale x 4 x i16> %1,
+    iXLen %2)
+
+  ret <vscale x 4 x i8> %a
+}
+
+declare <vscale x 4 x i8> @llvm.riscv.vloxei.mask.nxv4i8.nxv4i16(
+  <vscale x 4 x i8>,
+  ptr,
+  <vscale x 4 x i16>,
+  <vscale x 4 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 4 x i8> @intrinsic_vloxei_mask_v_nxv4i8_nxv4i8_nxv4i16(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4i8_nxv4i8_nxv4i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, mu
+; CHECK-NEXT:    vloxei16.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 4 x i8> @llvm.riscv.vloxei.mask.nxv4i8.nxv4i16(
+    <vscale x 4 x i8> %0,
+    ptr %1,
+    <vscale x 4 x i16> %2,
+    <vscale x 4 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 4 x i8> %a
+}
+
+declare <vscale x 8 x i8> @llvm.riscv.vloxei.nxv8i8.nxv8i16(
+  <vscale x 8 x i8>,
+  ptr,
+  <vscale x 8 x i16>,
+  iXLen);
+
+define <vscale x 8 x i8> @intrinsic_vloxei_v_nxv8i8_nxv8i8_nxv8i16(ptr %0, <vscale x 8 x i16> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv8i8_nxv8i8_nxv8i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-NEXT:    vloxei16.v v10, (a0), v8
+; CHECK-NEXT:    vmv.v.v v8, v10
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 8 x i8> @llvm.riscv.vloxei.nxv8i8.nxv8i16(
+    <vscale x 8 x i8> poison,
+    ptr %0,
+    <vscale x 8 x i16> %1,
+    iXLen %2)
+
+  ret <vscale x 8 x i8> %a
+}
+
+declare <vscale x 8 x i8> @llvm.riscv.vloxei.mask.nxv8i8.nxv8i16(
+  <vscale x 8 x i8>,
+  ptr,
+  <vscale x 8 x i16>,
+  <vscale x 8 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 8 x i8> @intrinsic_vloxei_mask_v_nxv8i8_nxv8i8_nxv8i16(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8i8_nxv8i8_nxv8i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, mu
+; CHECK-NEXT:    vloxei16.v v8, (a0), v10, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 8 x i8> @llvm.riscv.vloxei.mask.nxv8i8.nxv8i16(
+    <vscale x 8 x i8> %0,
+    ptr %1,
+    <vscale x 8 x i16> %2,
+    <vscale x 8 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 8 x i8> %a
+}
+
+declare <vscale x 16 x i8> @llvm.riscv.vloxei.nxv16i8.nxv16i16(
+  <vscale x 16 x i8>,
+  ptr,
+  <vscale x 16 x i16>,
+  iXLen);
+
+define <vscale x 16 x i8> @intrinsic_vloxei_v_nxv16i8_nxv16i8_nxv16i16(ptr %0, <vscale x 16 x i16> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv16i8_nxv16i8_nxv16i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
+; CHECK-NEXT:    vloxei16.v v12, (a0), v8
+; CHECK-NEXT:    vmv.v.v v8, v12
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 16 x i8> @llvm.riscv.vloxei.nxv16i8.nxv16i16(
+    <vscale x 16 x i8> poison,
+    ptr %0,
+    <vscale x 16 x i16> %1,
+    iXLen %2)
+
+  ret <vscale x 16 x i8> %a
+}
+
+declare <vscale x 16 x i8> @llvm.riscv.vloxei.mask.nxv16i8.nxv16i16(
+  <vscale x 16 x i8>,
+  ptr,
+  <vscale x 16 x i16>,
+  <vscale x 16 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 16 x i8> @intrinsic_vloxei_mask_v_nxv16i8_nxv16i8_nxv16i16(<vscale x 16 x i8> %0, ptr %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv16i8_nxv16i8_nxv16i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, mu
+; CHECK-NEXT:    vloxei16.v v8, (a0), v12, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 16 x i8> @llvm.riscv.vloxei.mask.nxv16i8.nxv16i16(
+    <vscale x 16 x i8> %0,
+    ptr %1,
+    <vscale x 16 x i16> %2,
+    <vscale x 16 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 16 x i8> %a
+}
+
+declare <vscale x 32 x i8> @llvm.riscv.vloxei.nxv32i8.nxv32i16(
+  <vscale x 32 x i8>,
+  ptr,
+  <vscale x 32 x i16>,
+  iXLen);
+
+define <vscale x 32 x i8> @intrinsic_vloxei_v_nxv32i8_nxv32i8_nxv32i16(ptr %0, <vscale x 32 x i16> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_v_nxv32i8_nxv32i8_nxv32i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, ma
+; CHECK-NEXT:    vloxei16.v v16, (a0), v8
+; CHECK-NEXT:    vmv.v.v v8, v16
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 32 x i8> @llvm.riscv.vloxei.nxv32i8.nxv32i16(
+    <vscale x 32 x i8> poison,
+    ptr %0,
+    <vscale x 32 x i16> %1,
+    iXLen %2)
+
+  ret <vscale x 32 x i8> %a
+}
+
+declare <vscale x 32 x i8> @llvm.riscv.vloxei.mask.nxv32i8.nxv32i16(
+  <vscale x 32 x i8>,
+  ptr,
+  <vscale x 32 x i16>,
+  <vscale x 32 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 32 x i8> @intrinsic_vloxei_mask_v_nxv32i8_nxv32i8_nxv32i16(<vscale x 32 x i8> %0, ptr %1, <vscale x 32 x i16> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv32i8_nxv32i8_nxv32i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, mu
+; CHECK-NEXT:    vloxei16.v v8, (a0), v16, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 32 x i8> @llvm.riscv.vloxei.mask.nxv32i8.nxv32i16(
@llvm.riscv.vloxei.mask.nxv32i8.nxv32i16( + <vscale x 32 x i8> %0, + ptr %1, + <vscale x 32 x i16> %2, + <vscale x 32 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 32 x i8> %a +} + +declare <vscale x 1 x i16> @llvm.riscv.vloxei.nxv1i16.nxv1i16( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i16>, + iXLen); + +define <vscale x 1 x i16> @intrinsic_vloxei_v_nxv1i16_nxv1i16_nxv1i16(ptr %0, <vscale x 1 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv1i16_nxv1i16_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vloxei16.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i16> @llvm.riscv.vloxei.nxv1i16.nxv1i16( + <vscale x 1 x i16> poison, + ptr %0, + <vscale x 1 x i16> %1, + iXLen %2) + + ret <vscale x 1 x i16> %a +} + +declare <vscale x 1 x i16> @llvm.riscv.vloxei.mask.nxv1i16.nxv1i16( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i16>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x i16> @intrinsic_vloxei_mask_v_nxv1i16_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1i16_nxv1i16_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, mu +; CHECK-NEXT: vloxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i16> @llvm.riscv.vloxei.mask.nxv1i16.nxv1i16( + <vscale x 1 x i16> %0, + ptr %1, + <vscale x 1 x i16> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x i16> %a +} + +declare <vscale x 2 x i16> @llvm.riscv.vloxei.nxv2i16.nxv2i16( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i16>, + iXLen); + +define <vscale x 2 x i16> @intrinsic_vloxei_v_nxv2i16_nxv2i16_nxv2i16(ptr %0, <vscale x 2 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv2i16_nxv2i16_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vloxei16.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i16> @llvm.riscv.vloxei.nxv2i16.nxv2i16( + <vscale x 2 x i16> poison, + ptr %0, + <vscale x 2 x i16> %1, + iXLen %2) + + ret <vscale x 2 x i16> %a +} + +declare <vscale x 2 x i16> @llvm.riscv.vloxei.mask.nxv2i16.nxv2i16( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i16>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x i16> @intrinsic_vloxei_mask_v_nxv2i16_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2i16_nxv2i16_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, mu +; CHECK-NEXT: vloxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i16> @llvm.riscv.vloxei.mask.nxv2i16.nxv2i16( + <vscale x 2 x i16> %0, + ptr %1, + <vscale x 2 x i16> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x i16> %a +} + +declare <vscale x 4 x i16> @llvm.riscv.vloxei.nxv4i16.nxv4i16( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i16>, + iXLen); + +define <vscale x 4 x i16> @intrinsic_vloxei_v_nxv4i16_nxv4i16_nxv4i16(ptr %0, <vscale x 4 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv4i16_nxv4i16_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vloxei16.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i16> @llvm.riscv.vloxei.nxv4i16.nxv4i16( + <vscale x 4 x i16> poison, + ptr %0, + <vscale x 4 x i16> 
%1, + iXLen %2) + + ret <vscale x 4 x i16> %a +} + +declare <vscale x 4 x i16> @llvm.riscv.vloxei.mask.nxv4i16.nxv4i16( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i16>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x i16> @intrinsic_vloxei_mask_v_nxv4i16_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4i16_nxv4i16_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, mu +; CHECK-NEXT: vloxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i16> @llvm.riscv.vloxei.mask.nxv4i16.nxv4i16( + <vscale x 4 x i16> %0, + ptr %1, + <vscale x 4 x i16> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x i16> %a +} + +declare <vscale x 8 x i16> @llvm.riscv.vloxei.nxv8i16.nxv8i16( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i16>, + iXLen); + +define <vscale x 8 x i16> @intrinsic_vloxei_v_nxv8i16_nxv8i16_nxv8i16(ptr %0, <vscale x 8 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv8i16_nxv8i16_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vloxei16.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i16> @llvm.riscv.vloxei.nxv8i16.nxv8i16( + <vscale x 8 x i16> poison, + ptr %0, + <vscale x 8 x i16> %1, + iXLen %2) + + ret <vscale x 8 x i16> %a +} + +declare <vscale x 8 x i16> @llvm.riscv.vloxei.mask.nxv8i16.nxv8i16( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i16>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x i16> @intrinsic_vloxei_mask_v_nxv8i16_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8i16_nxv8i16_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, mu +; CHECK-NEXT: vloxei16.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i16> @llvm.riscv.vloxei.mask.nxv8i16.nxv8i16( + <vscale x 8 x i16> %0, + ptr %1, + <vscale x 8 x i16> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x i16> %a +} + +declare <vscale x 16 x i16> @llvm.riscv.vloxei.nxv16i16.nxv16i16( + <vscale x 16 x i16>, + ptr, + <vscale x 16 x i16>, + iXLen); + +define <vscale x 16 x i16> @intrinsic_vloxei_v_nxv16i16_nxv16i16_nxv16i16(ptr %0, <vscale x 16 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv16i16_nxv16i16_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vloxei16.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i16> @llvm.riscv.vloxei.nxv16i16.nxv16i16( + <vscale x 16 x i16> poison, + ptr %0, + <vscale x 16 x i16> %1, + iXLen %2) + + ret <vscale x 16 x i16> %a +} + +declare <vscale x 16 x i16> @llvm.riscv.vloxei.mask.nxv16i16.nxv16i16( + <vscale x 16 x i16>, + ptr, + <vscale x 16 x i16>, + <vscale x 16 x i1>, + iXLen, + iXLen); + +define <vscale x 16 x i16> @intrinsic_vloxei_mask_v_nxv16i16_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, ptr %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv16i16_nxv16i16_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, mu +; CHECK-NEXT: vloxei16.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i16> @llvm.riscv.vloxei.mask.nxv16i16.nxv16i16( + <vscale x 16 x i16> %0, + ptr %1, + <vscale x 16 x i16> 
%2, + <vscale x 16 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 16 x i16> %a +} + +declare <vscale x 32 x i16> @llvm.riscv.vloxei.nxv32i16.nxv32i16( + <vscale x 32 x i16>, + ptr, + <vscale x 32 x i16>, + iXLen); + +define <vscale x 32 x i16> @intrinsic_vloxei_v_nxv32i16_nxv32i16_nxv32i16(ptr %0, <vscale x 32 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv32i16_nxv32i16_nxv32i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vloxei16.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 32 x i16> @llvm.riscv.vloxei.nxv32i16.nxv32i16( + <vscale x 32 x i16> poison, + ptr %0, + <vscale x 32 x i16> %1, + iXLen %2) + + ret <vscale x 32 x i16> %a +} + +declare <vscale x 32 x i16> @llvm.riscv.vloxei.mask.nxv32i16.nxv32i16( + <vscale x 32 x i16>, + ptr, + <vscale x 32 x i16>, + <vscale x 32 x i1>, + iXLen, + iXLen); + +define <vscale x 32 x i16> @intrinsic_vloxei_mask_v_nxv32i16_nxv32i16_nxv32i16(<vscale x 32 x i16> %0, ptr %1, <vscale x 32 x i16> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv32i16_nxv32i16_nxv32i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, mu +; CHECK-NEXT: vloxei16.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 32 x i16> @llvm.riscv.vloxei.mask.nxv32i16.nxv32i16( + <vscale x 32 x i16> %0, + ptr %1, + <vscale x 32 x i16> %2, + <vscale x 32 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 32 x i16> %a +} + +declare <vscale x 1 x i32> @llvm.riscv.vloxei.nxv1i32.nxv1i16( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i16>, + iXLen); + +define <vscale x 1 x i32> @intrinsic_vloxei_v_nxv1i32_nxv1i32_nxv1i16(ptr %0, <vscale x 1 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv1i32_nxv1i32_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vloxei16.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i32> @llvm.riscv.vloxei.nxv1i32.nxv1i16( + <vscale x 1 x i32> poison, + ptr %0, + <vscale x 1 x i16> %1, + iXLen %2) + + ret <vscale x 1 x i32> %a +} + +declare <vscale x 1 x i32> @llvm.riscv.vloxei.mask.nxv1i32.nxv1i16( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i16>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x i32> @intrinsic_vloxei_mask_v_nxv1i32_nxv1i32_nxv1i16(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1i32_nxv1i32_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; CHECK-NEXT: vloxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i32> @llvm.riscv.vloxei.mask.nxv1i32.nxv1i16( + <vscale x 1 x i32> %0, + ptr %1, + <vscale x 1 x i16> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x i32> %a +} + +declare <vscale x 2 x i32> @llvm.riscv.vloxei.nxv2i32.nxv2i16( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i16>, + iXLen); + +define <vscale x 2 x i32> @intrinsic_vloxei_v_nxv2i32_nxv2i32_nxv2i16(ptr %0, <vscale x 2 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv2i32_nxv2i32_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vloxei16.v v9, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i32> @llvm.riscv.vloxei.nxv2i32.nxv2i16( + <vscale x 2 x i32> poison, + ptr %0, + <vscale x 2 x i16> 
%1, + iXLen %2) + + ret <vscale x 2 x i32> %a +} + +declare <vscale x 2 x i32> @llvm.riscv.vloxei.mask.nxv2i32.nxv2i16( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i16>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x i32> @intrinsic_vloxei_mask_v_nxv2i32_nxv2i32_nxv2i16(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2i32_nxv2i32_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, mu +; CHECK-NEXT: vloxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i32> @llvm.riscv.vloxei.mask.nxv2i32.nxv2i16( + <vscale x 2 x i32> %0, + ptr %1, + <vscale x 2 x i16> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x i32> %a +} + +declare <vscale x 4 x i32> @llvm.riscv.vloxei.nxv4i32.nxv4i16( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i16>, + iXLen); + +define <vscale x 4 x i32> @intrinsic_vloxei_v_nxv4i32_nxv4i32_nxv4i16(ptr %0, <vscale x 4 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv4i32_nxv4i32_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vloxei16.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i32> @llvm.riscv.vloxei.nxv4i32.nxv4i16( + <vscale x 4 x i32> poison, + ptr %0, + <vscale x 4 x i16> %1, + iXLen %2) + + ret <vscale x 4 x i32> %a +} + +declare <vscale x 4 x i32> @llvm.riscv.vloxei.mask.nxv4i32.nxv4i16( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i16>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x i32> @intrinsic_vloxei_mask_v_nxv4i32_nxv4i32_nxv4i16(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4i32_nxv4i32_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, mu +; CHECK-NEXT: vloxei16.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i32> @llvm.riscv.vloxei.mask.nxv4i32.nxv4i16( + <vscale x 4 x i32> %0, + ptr %1, + <vscale x 4 x i16> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x i32> %a +} + +declare <vscale x 8 x i32> @llvm.riscv.vloxei.nxv8i32.nxv8i16( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i16>, + iXLen); + +define <vscale x 8 x i32> @intrinsic_vloxei_v_nxv8i32_nxv8i32_nxv8i16(ptr %0, <vscale x 8 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv8i32_nxv8i32_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vloxei16.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i32> @llvm.riscv.vloxei.nxv8i32.nxv8i16( + <vscale x 8 x i32> poison, + ptr %0, + <vscale x 8 x i16> %1, + iXLen %2) + + ret <vscale x 8 x i32> %a +} + +declare <vscale x 8 x i32> @llvm.riscv.vloxei.mask.nxv8i32.nxv8i16( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i16>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x i32> @intrinsic_vloxei_mask_v_nxv8i32_nxv8i32_nxv8i16(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8i32_nxv8i32_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, mu +; CHECK-NEXT: vloxei16.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = 
call <vscale x 8 x i32> @llvm.riscv.vloxei.mask.nxv8i32.nxv8i16( + <vscale x 8 x i32> %0, + ptr %1, + <vscale x 8 x i16> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x i32> %a +} + +declare <vscale x 16 x i32> @llvm.riscv.vloxei.nxv16i32.nxv16i16( + <vscale x 16 x i32>, + ptr, + <vscale x 16 x i16>, + iXLen); + +define <vscale x 16 x i32> @intrinsic_vloxei_v_nxv16i32_nxv16i32_nxv16i16(ptr %0, <vscale x 16 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv16i32_nxv16i32_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv4r.v v16, v8 +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vloxei16.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i32> @llvm.riscv.vloxei.nxv16i32.nxv16i16( + <vscale x 16 x i32> poison, + ptr %0, + <vscale x 16 x i16> %1, + iXLen %2) + + ret <vscale x 16 x i32> %a +} + +declare <vscale x 16 x i32> @llvm.riscv.vloxei.mask.nxv16i32.nxv16i16( + <vscale x 16 x i32>, + ptr, + <vscale x 16 x i16>, + <vscale x 16 x i1>, + iXLen, + iXLen); + +define <vscale x 16 x i32> @intrinsic_vloxei_mask_v_nxv16i32_nxv16i32_nxv16i16(<vscale x 16 x i32> %0, ptr %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv16i32_nxv16i32_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, mu +; CHECK-NEXT: vloxei16.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i32> @llvm.riscv.vloxei.mask.nxv16i32.nxv16i16( + <vscale x 16 x i32> %0, + ptr %1, + <vscale x 16 x i16> %2, + <vscale x 16 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 16 x i32> %a +} + +declare <vscale x 1 x i64> @llvm.riscv.vloxei.nxv1i64.nxv1i16( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i16>, + iXLen); + +define <vscale x 1 x i64> @intrinsic_vloxei_v_nxv1i64_nxv1i64_nxv1i16(ptr %0, <vscale x 1 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv1i64_nxv1i64_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vloxei16.v v9, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i64> @llvm.riscv.vloxei.nxv1i64.nxv1i16( + <vscale x 1 x i64> poison, + ptr %0, + <vscale x 1 x i16> %1, + iXLen %2) + + ret <vscale x 1 x i64> %a +} + +declare <vscale x 1 x i64> @llvm.riscv.vloxei.mask.nxv1i64.nxv1i16( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i16>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x i64> @intrinsic_vloxei_mask_v_nxv1i64_nxv1i64_nxv1i16(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1i64_nxv1i64_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, mu +; CHECK-NEXT: vloxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i64> @llvm.riscv.vloxei.mask.nxv1i64.nxv1i16( + <vscale x 1 x i64> %0, + ptr %1, + <vscale x 1 x i16> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x i64> %a +} + +declare <vscale x 2 x i64> @llvm.riscv.vloxei.nxv2i64.nxv2i16( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i16>, + iXLen); + +define <vscale x 2 x i64> @intrinsic_vloxei_v_nxv2i64_nxv2i64_nxv2i16(ptr %0, <vscale x 2 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv2i64_nxv2i64_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v 
v10, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vloxei16.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i64> @llvm.riscv.vloxei.nxv2i64.nxv2i16( + <vscale x 2 x i64> poison, + ptr %0, + <vscale x 2 x i16> %1, + iXLen %2) + + ret <vscale x 2 x i64> %a +} + +declare <vscale x 2 x i64> @llvm.riscv.vloxei.mask.nxv2i64.nxv2i16( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i16>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x i64> @intrinsic_vloxei_mask_v_nxv2i64_nxv2i64_nxv2i16(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2i64_nxv2i64_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, mu +; CHECK-NEXT: vloxei16.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i64> @llvm.riscv.vloxei.mask.nxv2i64.nxv2i16( + <vscale x 2 x i64> %0, + ptr %1, + <vscale x 2 x i16> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x i64> %a +} + +declare <vscale x 4 x i64> @llvm.riscv.vloxei.nxv4i64.nxv4i16( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i16>, + iXLen); + +define <vscale x 4 x i64> @intrinsic_vloxei_v_nxv4i64_nxv4i64_nxv4i16(ptr %0, <vscale x 4 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv4i64_nxv4i64_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vloxei16.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i64> @llvm.riscv.vloxei.nxv4i64.nxv4i16( + <vscale x 4 x i64> poison, + ptr %0, + <vscale x 4 x i16> %1, + iXLen %2) + + ret <vscale x 4 x i64> %a +} + +declare <vscale x 4 x i64> @llvm.riscv.vloxei.mask.nxv4i64.nxv4i16( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i16>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x i64> @intrinsic_vloxei_mask_v_nxv4i64_nxv4i64_nxv4i16(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4i64_nxv4i64_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, mu +; CHECK-NEXT: vloxei16.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i64> @llvm.riscv.vloxei.mask.nxv4i64.nxv4i16( + <vscale x 4 x i64> %0, + ptr %1, + <vscale x 4 x i16> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x i64> %a +} + +declare <vscale x 8 x i64> @llvm.riscv.vloxei.nxv8i64.nxv8i16( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i16>, + iXLen); + +define <vscale x 8 x i64> @intrinsic_vloxei_v_nxv8i64_nxv8i64_nxv8i16(ptr %0, <vscale x 8 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv8i64_nxv8i64_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv2r.v v16, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vloxei16.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i64> @llvm.riscv.vloxei.nxv8i64.nxv8i16( + <vscale x 8 x i64> poison, + ptr %0, + <vscale x 8 x i16> %1, + iXLen %2) + + ret <vscale x 8 x i64> %a +} + +declare <vscale x 8 x i64> @llvm.riscv.vloxei.mask.nxv8i64.nxv8i16( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i16>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x i64> @intrinsic_vloxei_mask_v_nxv8i64_nxv8i64_nxv8i16(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x 
i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8i64_nxv8i64_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, mu +; CHECK-NEXT: vloxei16.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i64> @llvm.riscv.vloxei.mask.nxv8i64.nxv8i16( + <vscale x 8 x i64> %0, + ptr %1, + <vscale x 8 x i16> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x i64> %a +} + +declare <vscale x 1 x half> @llvm.riscv.vloxei.nxv1f16.nxv1i16( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i16>, + iXLen); + +define <vscale x 1 x half> @intrinsic_vloxei_v_nxv1f16_nxv1f16_nxv1i16(ptr %0, <vscale x 1 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv1f16_nxv1f16_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vloxei16.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x half> @llvm.riscv.vloxei.nxv1f16.nxv1i16( + <vscale x 1 x half> poison, + ptr %0, + <vscale x 1 x i16> %1, + iXLen %2) + + ret <vscale x 1 x half> %a +} + +declare <vscale x 1 x half> @llvm.riscv.vloxei.mask.nxv1f16.nxv1i16( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i16>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x half> @intrinsic_vloxei_mask_v_nxv1f16_nxv1f16_nxv1i16(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1f16_nxv1f16_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, mu +; CHECK-NEXT: vloxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x half> @llvm.riscv.vloxei.mask.nxv1f16.nxv1i16( + <vscale x 1 x half> %0, + ptr %1, + <vscale x 1 x i16> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x half> %a +} + +declare <vscale x 2 x half> @llvm.riscv.vloxei.nxv2f16.nxv2i16( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i16>, + iXLen); + +define <vscale x 2 x half> @intrinsic_vloxei_v_nxv2f16_nxv2f16_nxv2i16(ptr %0, <vscale x 2 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv2f16_nxv2f16_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vloxei16.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x half> @llvm.riscv.vloxei.nxv2f16.nxv2i16( + <vscale x 2 x half> poison, + ptr %0, + <vscale x 2 x i16> %1, + iXLen %2) + + ret <vscale x 2 x half> %a +} + +declare <vscale x 2 x half> @llvm.riscv.vloxei.mask.nxv2f16.nxv2i16( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i16>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x half> @intrinsic_vloxei_mask_v_nxv2f16_nxv2f16_nxv2i16(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2f16_nxv2f16_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, mu +; CHECK-NEXT: vloxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x half> @llvm.riscv.vloxei.mask.nxv2f16.nxv2i16( + <vscale x 2 x half> %0, + ptr %1, + <vscale x 2 x i16> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x half> %a +} + +declare <vscale x 4 x half> @llvm.riscv.vloxei.nxv4f16.nxv4i16( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i16>, + iXLen); + +define <vscale x 4 x half> @intrinsic_vloxei_v_nxv4f16_nxv4f16_nxv4i16(ptr %0, <vscale x 4 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: 
intrinsic_vloxei_v_nxv4f16_nxv4f16_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vloxei16.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x half> @llvm.riscv.vloxei.nxv4f16.nxv4i16( + <vscale x 4 x half> poison, + ptr %0, + <vscale x 4 x i16> %1, + iXLen %2) + + ret <vscale x 4 x half> %a +} + +declare <vscale x 4 x half> @llvm.riscv.vloxei.mask.nxv4f16.nxv4i16( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i16>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x half> @intrinsic_vloxei_mask_v_nxv4f16_nxv4f16_nxv4i16(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4f16_nxv4f16_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, mu +; CHECK-NEXT: vloxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x half> @llvm.riscv.vloxei.mask.nxv4f16.nxv4i16( + <vscale x 4 x half> %0, + ptr %1, + <vscale x 4 x i16> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x half> %a +} + +declare <vscale x 8 x half> @llvm.riscv.vloxei.nxv8f16.nxv8i16( + <vscale x 8 x half>, + ptr, + <vscale x 8 x i16>, + iXLen); + +define <vscale x 8 x half> @intrinsic_vloxei_v_nxv8f16_nxv8f16_nxv8i16(ptr %0, <vscale x 8 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv8f16_nxv8f16_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vloxei16.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x half> @llvm.riscv.vloxei.nxv8f16.nxv8i16( + <vscale x 8 x half> poison, + ptr %0, + <vscale x 8 x i16> %1, + iXLen %2) + + ret <vscale x 8 x half> %a +} + +declare <vscale x 8 x half> @llvm.riscv.vloxei.mask.nxv8f16.nxv8i16( + <vscale x 8 x half>, + ptr, + <vscale x 8 x i16>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x half> @intrinsic_vloxei_mask_v_nxv8f16_nxv8f16_nxv8i16(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8f16_nxv8f16_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, mu +; CHECK-NEXT: vloxei16.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x half> @llvm.riscv.vloxei.mask.nxv8f16.nxv8i16( + <vscale x 8 x half> %0, + ptr %1, + <vscale x 8 x i16> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x half> %a +} + +declare <vscale x 16 x half> @llvm.riscv.vloxei.nxv16f16.nxv16i16( + <vscale x 16 x half>, + ptr, + <vscale x 16 x i16>, + iXLen); + +define <vscale x 16 x half> @intrinsic_vloxei_v_nxv16f16_nxv16f16_nxv16i16(ptr %0, <vscale x 16 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv16f16_nxv16f16_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vloxei16.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x half> @llvm.riscv.vloxei.nxv16f16.nxv16i16( + <vscale x 16 x half> poison, + ptr %0, + <vscale x 16 x i16> %1, + iXLen %2) + + ret <vscale x 16 x half> %a +} + +declare <vscale x 16 x half> @llvm.riscv.vloxei.mask.nxv16f16.nxv16i16( + <vscale x 16 x half>, + ptr, + <vscale x 16 x i16>, + <vscale x 16 x i1>, + iXLen, + iXLen); + +define <vscale x 16 x half> @intrinsic_vloxei_mask_v_nxv16f16_nxv16f16_nxv16i16(<vscale x 16 x half> %0, ptr %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; 
CHECK-LABEL: intrinsic_vloxei_mask_v_nxv16f16_nxv16f16_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, mu +; CHECK-NEXT: vloxei16.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x half> @llvm.riscv.vloxei.mask.nxv16f16.nxv16i16( + <vscale x 16 x half> %0, + ptr %1, + <vscale x 16 x i16> %2, + <vscale x 16 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 16 x half> %a +} + +declare <vscale x 32 x half> @llvm.riscv.vloxei.nxv32f16.nxv32i16( + <vscale x 32 x half>, + ptr, + <vscale x 32 x i16>, + iXLen); + +define <vscale x 32 x half> @intrinsic_vloxei_v_nxv32f16_nxv32f16_nxv32i16(ptr %0, <vscale x 32 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv32f16_nxv32f16_nxv32i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vloxei16.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 32 x half> @llvm.riscv.vloxei.nxv32f16.nxv32i16( + <vscale x 32 x half> poison, + ptr %0, + <vscale x 32 x i16> %1, + iXLen %2) + + ret <vscale x 32 x half> %a +} + +declare <vscale x 32 x half> @llvm.riscv.vloxei.mask.nxv32f16.nxv32i16( + <vscale x 32 x half>, + ptr, + <vscale x 32 x i16>, + <vscale x 32 x i1>, + iXLen, + iXLen); + +define <vscale x 32 x half> @intrinsic_vloxei_mask_v_nxv32f16_nxv32f16_nxv32i16(<vscale x 32 x half> %0, ptr %1, <vscale x 32 x i16> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv32f16_nxv32f16_nxv32i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, mu +; CHECK-NEXT: vloxei16.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 32 x half> @llvm.riscv.vloxei.mask.nxv32f16.nxv32i16( + <vscale x 32 x half> %0, + ptr %1, + <vscale x 32 x i16> %2, + <vscale x 32 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 32 x half> %a +} + +declare <vscale x 1 x float> @llvm.riscv.vloxei.nxv1f32.nxv1i16( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i16>, + iXLen); + +define <vscale x 1 x float> @intrinsic_vloxei_v_nxv1f32_nxv1f32_nxv1i16(ptr %0, <vscale x 1 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv1f32_nxv1f32_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vloxei16.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x float> @llvm.riscv.vloxei.nxv1f32.nxv1i16( + <vscale x 1 x float> poison, + ptr %0, + <vscale x 1 x i16> %1, + iXLen %2) + + ret <vscale x 1 x float> %a +} + +declare <vscale x 1 x float> @llvm.riscv.vloxei.mask.nxv1f32.nxv1i16( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i16>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x float> @intrinsic_vloxei_mask_v_nxv1f32_nxv1f32_nxv1i16(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1f32_nxv1f32_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; CHECK-NEXT: vloxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x float> @llvm.riscv.vloxei.mask.nxv1f32.nxv1i16( + <vscale x 1 x float> %0, + ptr %1, + <vscale x 1 x i16> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x float> %a +} + +declare <vscale x 2 x float> @llvm.riscv.vloxei.nxv2f32.nxv2i16( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i16>, + iXLen); + +define <vscale x 2 x float> @intrinsic_vloxei_v_nxv2f32_nxv2f32_nxv2i16(ptr %0, <vscale 
x 2 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv2f32_nxv2f32_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vloxei16.v v9, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x float> @llvm.riscv.vloxei.nxv2f32.nxv2i16( + <vscale x 2 x float> poison, + ptr %0, + <vscale x 2 x i16> %1, + iXLen %2) + + ret <vscale x 2 x float> %a +} + +declare <vscale x 2 x float> @llvm.riscv.vloxei.mask.nxv2f32.nxv2i16( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i16>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x float> @intrinsic_vloxei_mask_v_nxv2f32_nxv2f32_nxv2i16(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2f32_nxv2f32_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, mu +; CHECK-NEXT: vloxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x float> @llvm.riscv.vloxei.mask.nxv2f32.nxv2i16( + <vscale x 2 x float> %0, + ptr %1, + <vscale x 2 x i16> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x float> %a +} + +declare <vscale x 4 x float> @llvm.riscv.vloxei.nxv4f32.nxv4i16( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i16>, + iXLen); + +define <vscale x 4 x float> @intrinsic_vloxei_v_nxv4f32_nxv4f32_nxv4i16(ptr %0, <vscale x 4 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv4f32_nxv4f32_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vloxei16.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x float> @llvm.riscv.vloxei.nxv4f32.nxv4i16( + <vscale x 4 x float> poison, + ptr %0, + <vscale x 4 x i16> %1, + iXLen %2) + + ret <vscale x 4 x float> %a +} + +declare <vscale x 4 x float> @llvm.riscv.vloxei.mask.nxv4f32.nxv4i16( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i16>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x float> @intrinsic_vloxei_mask_v_nxv4f32_nxv4f32_nxv4i16(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4f32_nxv4f32_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, mu +; CHECK-NEXT: vloxei16.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x float> @llvm.riscv.vloxei.mask.nxv4f32.nxv4i16( + <vscale x 4 x float> %0, + ptr %1, + <vscale x 4 x i16> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x float> %a +} + +declare <vscale x 8 x float> @llvm.riscv.vloxei.nxv8f32.nxv8i16( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i16>, + iXLen); + +define <vscale x 8 x float> @intrinsic_vloxei_v_nxv8f32_nxv8f32_nxv8i16(ptr %0, <vscale x 8 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv8f32_nxv8f32_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vloxei16.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x float> @llvm.riscv.vloxei.nxv8f32.nxv8i16( + <vscale x 8 x float> poison, + ptr %0, + <vscale x 8 x i16> %1, + iXLen %2) + + ret <vscale x 8 x float> %a +} + +declare <vscale x 8 x float> @llvm.riscv.vloxei.mask.nxv8f32.nxv8i16( + <vscale x 8 x float>, + ptr, + 
<vscale x 8 x i16>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x float> @intrinsic_vloxei_mask_v_nxv8f32_nxv8f32_nxv8i16(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8f32_nxv8f32_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, mu +; CHECK-NEXT: vloxei16.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x float> @llvm.riscv.vloxei.mask.nxv8f32.nxv8i16( + <vscale x 8 x float> %0, + ptr %1, + <vscale x 8 x i16> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x float> %a +} + +declare <vscale x 16 x float> @llvm.riscv.vloxei.nxv16f32.nxv16i16( + <vscale x 16 x float>, + ptr, + <vscale x 16 x i16>, + iXLen); + +define <vscale x 16 x float> @intrinsic_vloxei_v_nxv16f32_nxv16f32_nxv16i16(ptr %0, <vscale x 16 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv16f32_nxv16f32_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv4r.v v16, v8 +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vloxei16.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x float> @llvm.riscv.vloxei.nxv16f32.nxv16i16( + <vscale x 16 x float> poison, + ptr %0, + <vscale x 16 x i16> %1, + iXLen %2) + + ret <vscale x 16 x float> %a +} + +declare <vscale x 16 x float> @llvm.riscv.vloxei.mask.nxv16f32.nxv16i16( + <vscale x 16 x float>, + ptr, + <vscale x 16 x i16>, + <vscale x 16 x i1>, + iXLen, + iXLen); + +define <vscale x 16 x float> @intrinsic_vloxei_mask_v_nxv16f32_nxv16f32_nxv16i16(<vscale x 16 x float> %0, ptr %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv16f32_nxv16f32_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, mu +; CHECK-NEXT: vloxei16.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x float> @llvm.riscv.vloxei.mask.nxv16f32.nxv16i16( + <vscale x 16 x float> %0, + ptr %1, + <vscale x 16 x i16> %2, + <vscale x 16 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 16 x float> %a +} + +declare <vscale x 1 x double> @llvm.riscv.vloxei.nxv1f64.nxv1i16( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i16>, + iXLen); + +define <vscale x 1 x double> @intrinsic_vloxei_v_nxv1f64_nxv1f64_nxv1i16(ptr %0, <vscale x 1 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv1f64_nxv1f64_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vloxei16.v v9, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x double> @llvm.riscv.vloxei.nxv1f64.nxv1i16( + <vscale x 1 x double> poison, + ptr %0, + <vscale x 1 x i16> %1, + iXLen %2) + + ret <vscale x 1 x double> %a +} + +declare <vscale x 1 x double> @llvm.riscv.vloxei.mask.nxv1f64.nxv1i16( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i16>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x double> @intrinsic_vloxei_mask_v_nxv1f64_nxv1f64_nxv1i16(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1f64_nxv1f64_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, mu +; CHECK-NEXT: vloxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x double> @llvm.riscv.vloxei.mask.nxv1f64.nxv1i16( + <vscale x 1 x double> 
%0, + ptr %1, + <vscale x 1 x i16> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x double> %a +} + +declare <vscale x 2 x double> @llvm.riscv.vloxei.nxv2f64.nxv2i16( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i16>, + iXLen); + +define <vscale x 2 x double> @intrinsic_vloxei_v_nxv2f64_nxv2f64_nxv2i16(ptr %0, <vscale x 2 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv2f64_nxv2f64_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vloxei16.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x double> @llvm.riscv.vloxei.nxv2f64.nxv2i16( + <vscale x 2 x double> poison, + ptr %0, + <vscale x 2 x i16> %1, + iXLen %2) + + ret <vscale x 2 x double> %a +} + +declare <vscale x 2 x double> @llvm.riscv.vloxei.mask.nxv2f64.nxv2i16( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i16>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x double> @intrinsic_vloxei_mask_v_nxv2f64_nxv2f64_nxv2i16(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2f64_nxv2f64_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, mu +; CHECK-NEXT: vloxei16.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x double> @llvm.riscv.vloxei.mask.nxv2f64.nxv2i16( + <vscale x 2 x double> %0, + ptr %1, + <vscale x 2 x i16> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x double> %a +} + +declare <vscale x 4 x double> @llvm.riscv.vloxei.nxv4f64.nxv4i16( + <vscale x 4 x double>, + ptr, + <vscale x 4 x i16>, + iXLen); + +define <vscale x 4 x double> @intrinsic_vloxei_v_nxv4f64_nxv4f64_nxv4i16(ptr %0, <vscale x 4 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv4f64_nxv4f64_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vloxei16.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x double> @llvm.riscv.vloxei.nxv4f64.nxv4i16( + <vscale x 4 x double> poison, + ptr %0, + <vscale x 4 x i16> %1, + iXLen %2) + + ret <vscale x 4 x double> %a +} + +declare <vscale x 4 x double> @llvm.riscv.vloxei.mask.nxv4f64.nxv4i16( + <vscale x 4 x double>, + ptr, + <vscale x 4 x i16>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x double> @intrinsic_vloxei_mask_v_nxv4f64_nxv4f64_nxv4i16(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4f64_nxv4f64_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, mu +; CHECK-NEXT: vloxei16.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x double> @llvm.riscv.vloxei.mask.nxv4f64.nxv4i16( + <vscale x 4 x double> %0, + ptr %1, + <vscale x 4 x i16> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x double> %a +} + +declare <vscale x 8 x double> @llvm.riscv.vloxei.nxv8f64.nxv8i16( + <vscale x 8 x double>, + ptr, + <vscale x 8 x i16>, + iXLen); + +define <vscale x 8 x double> @intrinsic_vloxei_v_nxv8f64_nxv8f64_nxv8i16(ptr %0, <vscale x 8 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv8f64_nxv8f64_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; 
CHECK-NEXT: vmv2r.v v16, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vloxei16.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x double> @llvm.riscv.vloxei.nxv8f64.nxv8i16( + <vscale x 8 x double> poison, + ptr %0, + <vscale x 8 x i16> %1, + iXLen %2) + + ret <vscale x 8 x double> %a +} + +declare <vscale x 8 x double> @llvm.riscv.vloxei.mask.nxv8f64.nxv8i16( + <vscale x 8 x double>, + ptr, + <vscale x 8 x i16>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x double> @intrinsic_vloxei_mask_v_nxv8f64_nxv8f64_nxv8i16(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8f64_nxv8f64_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, mu +; CHECK-NEXT: vloxei16.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x double> @llvm.riscv.vloxei.mask.nxv8f64.nxv8i16( + <vscale x 8 x double> %0, + ptr %1, + <vscale x 8 x i16> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x double> %a +} + +declare <vscale x 1 x i8> @llvm.riscv.vloxei.nxv1i8.nxv1i8( + <vscale x 1 x i8>, + ptr, + <vscale x 1 x i8>, + iXLen); + +define <vscale x 1 x i8> @intrinsic_vloxei_v_nxv1i8_nxv1i8_nxv1i8(ptr %0, <vscale x 1 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv1i8_nxv1i8_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vloxei8.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i8> @llvm.riscv.vloxei.nxv1i8.nxv1i8( + <vscale x 1 x i8> poison, + ptr %0, + <vscale x 1 x i8> %1, + iXLen %2) + + ret <vscale x 1 x i8> %a +} + +declare <vscale x 1 x i8> @llvm.riscv.vloxei.mask.nxv1i8.nxv1i8( + <vscale x 1 x i8>, + ptr, + <vscale x 1 x i8>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x i8> @intrinsic_vloxei_mask_v_nxv1i8_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1i8_nxv1i8_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i8> @llvm.riscv.vloxei.mask.nxv1i8.nxv1i8( + <vscale x 1 x i8> %0, + ptr %1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x i8> %a +} + +declare <vscale x 2 x i8> @llvm.riscv.vloxei.nxv2i8.nxv2i8( + <vscale x 2 x i8>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define <vscale x 2 x i8> @intrinsic_vloxei_v_nxv2i8_nxv2i8_nxv2i8(ptr %0, <vscale x 2 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv2i8_nxv2i8_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vloxei8.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i8> @llvm.riscv.vloxei.nxv2i8.nxv2i8( + <vscale x 2 x i8> poison, + ptr %0, + <vscale x 2 x i8> %1, + iXLen %2) + + ret <vscale x 2 x i8> %a +} + +declare <vscale x 2 x i8> @llvm.riscv.vloxei.mask.nxv2i8.nxv2i8( + <vscale x 2 x i8>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x i8> @intrinsic_vloxei_mask_v_nxv2i8_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2i8_nxv2i8_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, mu +; CHECK-NEXT: 
vloxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i8> @llvm.riscv.vloxei.mask.nxv2i8.nxv2i8( + <vscale x 2 x i8> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x i8> %a +} + +declare <vscale x 4 x i8> @llvm.riscv.vloxei.nxv4i8.nxv4i8( + <vscale x 4 x i8>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define <vscale x 4 x i8> @intrinsic_vloxei_v_nxv4i8_nxv4i8_nxv4i8(ptr %0, <vscale x 4 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv4i8_nxv4i8_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vloxei8.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i8> @llvm.riscv.vloxei.nxv4i8.nxv4i8( + <vscale x 4 x i8> poison, + ptr %0, + <vscale x 4 x i8> %1, + iXLen %2) + + ret <vscale x 4 x i8> %a +} + +declare <vscale x 4 x i8> @llvm.riscv.vloxei.mask.nxv4i8.nxv4i8( + <vscale x 4 x i8>, + ptr, + <vscale x 4 x i8>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x i8> @intrinsic_vloxei_mask_v_nxv4i8_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4i8_nxv4i8_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i8> @llvm.riscv.vloxei.mask.nxv4i8.nxv4i8( + <vscale x 4 x i8> %0, + ptr %1, + <vscale x 4 x i8> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x i8> %a +} + +declare <vscale x 8 x i8> @llvm.riscv.vloxei.nxv8i8.nxv8i8( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i8>, + iXLen); + +define <vscale x 8 x i8> @intrinsic_vloxei_v_nxv8i8_nxv8i8_nxv8i8(ptr %0, <vscale x 8 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv8i8_nxv8i8_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vloxei8.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i8> @llvm.riscv.vloxei.nxv8i8.nxv8i8( + <vscale x 8 x i8> poison, + ptr %0, + <vscale x 8 x i8> %1, + iXLen %2) + + ret <vscale x 8 x i8> %a +} + +declare <vscale x 8 x i8> @llvm.riscv.vloxei.mask.nxv8i8.nxv8i8( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i8>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x i8> @intrinsic_vloxei_mask_v_nxv8i8_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8i8_nxv8i8_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i8> @llvm.riscv.vloxei.mask.nxv8i8.nxv8i8( + <vscale x 8 x i8> %0, + ptr %1, + <vscale x 8 x i8> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x i8> %a +} + +declare <vscale x 16 x i8> @llvm.riscv.vloxei.nxv16i8.nxv16i8( + <vscale x 16 x i8>, + ptr, + <vscale x 16 x i8>, + iXLen); + +define <vscale x 16 x i8> @intrinsic_vloxei_v_nxv16i8_nxv16i8_nxv16i8(ptr %0, <vscale x 16 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv16i8_nxv16i8_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma +; CHECK-NEXT: vloxei8.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i8> @llvm.riscv.vloxei.nxv16i8.nxv16i8( + <vscale x 16 x i8> poison, + ptr %0, + <vscale x 16 x i8> %1, + iXLen 
%2) + + ret <vscale x 16 x i8> %a +} + +declare <vscale x 16 x i8> @llvm.riscv.vloxei.mask.nxv16i8.nxv16i8( + <vscale x 16 x i8>, + ptr, + <vscale x 16 x i8>, + <vscale x 16 x i1>, + iXLen, + iXLen); + +define <vscale x 16 x i8> @intrinsic_vloxei_mask_v_nxv16i8_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, ptr %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv16i8_nxv16i8_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i8> @llvm.riscv.vloxei.mask.nxv16i8.nxv16i8( + <vscale x 16 x i8> %0, + ptr %1, + <vscale x 16 x i8> %2, + <vscale x 16 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 16 x i8> %a +} + +declare <vscale x 32 x i8> @llvm.riscv.vloxei.nxv32i8.nxv32i8( + <vscale x 32 x i8>, + ptr, + <vscale x 32 x i8>, + iXLen); + +define <vscale x 32 x i8> @intrinsic_vloxei_v_nxv32i8_nxv32i8_nxv32i8(ptr %0, <vscale x 32 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv32i8_nxv32i8_nxv32i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma +; CHECK-NEXT: vloxei8.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 32 x i8> @llvm.riscv.vloxei.nxv32i8.nxv32i8( + <vscale x 32 x i8> poison, + ptr %0, + <vscale x 32 x i8> %1, + iXLen %2) + + ret <vscale x 32 x i8> %a +} + +declare <vscale x 32 x i8> @llvm.riscv.vloxei.mask.nxv32i8.nxv32i8( + <vscale x 32 x i8>, + ptr, + <vscale x 32 x i8>, + <vscale x 32 x i1>, + iXLen, + iXLen); + +define <vscale x 32 x i8> @intrinsic_vloxei_mask_v_nxv32i8_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, ptr %1, <vscale x 32 x i8> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv32i8_nxv32i8_nxv32i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 32 x i8> @llvm.riscv.vloxei.mask.nxv32i8.nxv32i8( + <vscale x 32 x i8> %0, + ptr %1, + <vscale x 32 x i8> %2, + <vscale x 32 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 32 x i8> %a +} + +declare <vscale x 64 x i8> @llvm.riscv.vloxei.nxv64i8.nxv64i8( + <vscale x 64 x i8>, + ptr, + <vscale x 64 x i8>, + iXLen); + +define <vscale x 64 x i8> @intrinsic_vloxei_v_nxv64i8_nxv64i8_nxv64i8(ptr %0, <vscale x 64 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv64i8_nxv64i8_nxv64i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma +; CHECK-NEXT: vloxei8.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 64 x i8> @llvm.riscv.vloxei.nxv64i8.nxv64i8( + <vscale x 64 x i8> poison, + ptr %0, + <vscale x 64 x i8> %1, + iXLen %2) + + ret <vscale x 64 x i8> %a +} + +declare <vscale x 64 x i8> @llvm.riscv.vloxei.mask.nxv64i8.nxv64i8( + <vscale x 64 x i8>, + ptr, + <vscale x 64 x i8>, + <vscale x 64 x i1>, + iXLen, + iXLen); + +define <vscale x 64 x i8> @intrinsic_vloxei_mask_v_nxv64i8_nxv64i8_nxv64i8(<vscale x 64 x i8> %0, ptr %1, <vscale x 64 x i8> %2, <vscale x 64 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv64i8_nxv64i8_nxv64i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 64 x i8> @llvm.riscv.vloxei.mask.nxv64i8.nxv64i8( + <vscale x 64 x i8> %0, + ptr %1, + <vscale x 64 x i8> %2, + <vscale x 64 x i1> %3, + iXLen %4, iXLen 1) + + 
ret <vscale x 64 x i8> %a +} + +declare <vscale x 1 x i16> @llvm.riscv.vloxei.nxv1i16.nxv1i8( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i8>, + iXLen); + +define <vscale x 1 x i16> @intrinsic_vloxei_v_nxv1i16_nxv1i16_nxv1i8(ptr %0, <vscale x 1 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv1i16_nxv1i16_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vloxei8.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i16> @llvm.riscv.vloxei.nxv1i16.nxv1i8( + <vscale x 1 x i16> poison, + ptr %0, + <vscale x 1 x i8> %1, + iXLen %2) + + ret <vscale x 1 x i16> %a +} + +declare <vscale x 1 x i16> @llvm.riscv.vloxei.mask.nxv1i16.nxv1i8( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i8>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x i16> @intrinsic_vloxei_mask_v_nxv1i16_nxv1i16_nxv1i8(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1i16_nxv1i16_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i16> @llvm.riscv.vloxei.mask.nxv1i16.nxv1i8( + <vscale x 1 x i16> %0, + ptr %1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x i16> %a +} + +declare <vscale x 2 x i16> @llvm.riscv.vloxei.nxv2i16.nxv2i8( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define <vscale x 2 x i16> @intrinsic_vloxei_v_nxv2i16_nxv2i16_nxv2i8(ptr %0, <vscale x 2 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv2i16_nxv2i16_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vloxei8.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i16> @llvm.riscv.vloxei.nxv2i16.nxv2i8( + <vscale x 2 x i16> poison, + ptr %0, + <vscale x 2 x i8> %1, + iXLen %2) + + ret <vscale x 2 x i16> %a +} + +declare <vscale x 2 x i16> @llvm.riscv.vloxei.mask.nxv2i16.nxv2i8( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x i16> @intrinsic_vloxei_mask_v_nxv2i16_nxv2i16_nxv2i8(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2i16_nxv2i16_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i16> @llvm.riscv.vloxei.mask.nxv2i16.nxv2i8( + <vscale x 2 x i16> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x i16> %a +} + +declare <vscale x 4 x i16> @llvm.riscv.vloxei.nxv4i16.nxv4i8( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define <vscale x 4 x i16> @intrinsic_vloxei_v_nxv4i16_nxv4i16_nxv4i8(ptr %0, <vscale x 4 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv4i16_nxv4i16_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vloxei8.v v9, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i16> @llvm.riscv.vloxei.nxv4i16.nxv4i8( + <vscale x 4 x i16> poison, + ptr %0, + <vscale x 4 x i8> %1, + iXLen %2) + + ret <vscale x 4 x i16> %a +} + +declare <vscale x 4 x i16> 
@llvm.riscv.vloxei.mask.nxv4i16.nxv4i8( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i8>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x i16> @intrinsic_vloxei_mask_v_nxv4i16_nxv4i16_nxv4i8(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4i16_nxv4i16_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i16> @llvm.riscv.vloxei.mask.nxv4i16.nxv4i8( + <vscale x 4 x i16> %0, + ptr %1, + <vscale x 4 x i8> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x i16> %a +} + +declare <vscale x 8 x i16> @llvm.riscv.vloxei.nxv8i16.nxv8i8( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i8>, + iXLen); + +define <vscale x 8 x i16> @intrinsic_vloxei_v_nxv8i16_nxv8i16_nxv8i8(ptr %0, <vscale x 8 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv8i16_nxv8i16_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vloxei8.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i16> @llvm.riscv.vloxei.nxv8i16.nxv8i8( + <vscale x 8 x i16> poison, + ptr %0, + <vscale x 8 x i8> %1, + iXLen %2) + + ret <vscale x 8 x i16> %a +} + +declare <vscale x 8 x i16> @llvm.riscv.vloxei.mask.nxv8i16.nxv8i8( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i8>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x i16> @intrinsic_vloxei_mask_v_nxv8i16_nxv8i16_nxv8i8(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8i16_nxv8i16_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i16> @llvm.riscv.vloxei.mask.nxv8i16.nxv8i8( + <vscale x 8 x i16> %0, + ptr %1, + <vscale x 8 x i8> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x i16> %a +} + +declare <vscale x 16 x i16> @llvm.riscv.vloxei.nxv16i16.nxv16i8( + <vscale x 16 x i16>, + ptr, + <vscale x 16 x i8>, + iXLen); + +define <vscale x 16 x i16> @intrinsic_vloxei_v_nxv16i16_nxv16i16_nxv16i8(ptr %0, <vscale x 16 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv16i16_nxv16i16_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vloxei8.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i16> @llvm.riscv.vloxei.nxv16i16.nxv16i8( + <vscale x 16 x i16> poison, + ptr %0, + <vscale x 16 x i8> %1, + iXLen %2) + + ret <vscale x 16 x i16> %a +} + +declare <vscale x 16 x i16> @llvm.riscv.vloxei.mask.nxv16i16.nxv16i8( + <vscale x 16 x i16>, + ptr, + <vscale x 16 x i8>, + <vscale x 16 x i1>, + iXLen, + iXLen); + +define <vscale x 16 x i16> @intrinsic_vloxei_mask_v_nxv16i16_nxv16i16_nxv16i8(<vscale x 16 x i16> %0, ptr %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv16i16_nxv16i16_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i16> @llvm.riscv.vloxei.mask.nxv16i16.nxv16i8( + <vscale x 16 x 
i16> %0, + ptr %1, + <vscale x 16 x i8> %2, + <vscale x 16 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 16 x i16> %a +} + +declare <vscale x 32 x i16> @llvm.riscv.vloxei.nxv32i16.nxv32i8( + <vscale x 32 x i16>, + ptr, + <vscale x 32 x i8>, + iXLen); + +define <vscale x 32 x i16> @intrinsic_vloxei_v_nxv32i16_nxv32i16_nxv32i8(ptr %0, <vscale x 32 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv32i16_nxv32i16_nxv32i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv4r.v v16, v8 +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vloxei8.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 32 x i16> @llvm.riscv.vloxei.nxv32i16.nxv32i8( + <vscale x 32 x i16> poison, + ptr %0, + <vscale x 32 x i8> %1, + iXLen %2) + + ret <vscale x 32 x i16> %a +} + +declare <vscale x 32 x i16> @llvm.riscv.vloxei.mask.nxv32i16.nxv32i8( + <vscale x 32 x i16>, + ptr, + <vscale x 32 x i8>, + <vscale x 32 x i1>, + iXLen, + iXLen); + +define <vscale x 32 x i16> @intrinsic_vloxei_mask_v_nxv32i16_nxv32i16_nxv32i8(<vscale x 32 x i16> %0, ptr %1, <vscale x 32 x i8> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv32i16_nxv32i16_nxv32i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 32 x i16> @llvm.riscv.vloxei.mask.nxv32i16.nxv32i8( + <vscale x 32 x i16> %0, + ptr %1, + <vscale x 32 x i8> %2, + <vscale x 32 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 32 x i16> %a +} + +declare <vscale x 1 x i32> @llvm.riscv.vloxei.nxv1i32.nxv1i8( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i8>, + iXLen); + +define <vscale x 1 x i32> @intrinsic_vloxei_v_nxv1i32_nxv1i32_nxv1i8(ptr %0, <vscale x 1 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv1i32_nxv1i32_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vloxei8.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i32> @llvm.riscv.vloxei.nxv1i32.nxv1i8( + <vscale x 1 x i32> poison, + ptr %0, + <vscale x 1 x i8> %1, + iXLen %2) + + ret <vscale x 1 x i32> %a +} + +declare <vscale x 1 x i32> @llvm.riscv.vloxei.mask.nxv1i32.nxv1i8( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i8>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x i32> @intrinsic_vloxei_mask_v_nxv1i32_nxv1i32_nxv1i8(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1i32_nxv1i32_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i32> @llvm.riscv.vloxei.mask.nxv1i32.nxv1i8( + <vscale x 1 x i32> %0, + ptr %1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x i32> %a +} + +declare <vscale x 2 x i32> @llvm.riscv.vloxei.nxv2i32.nxv2i8( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define <vscale x 2 x i32> @intrinsic_vloxei_v_nxv2i32_nxv2i32_nxv2i8(ptr %0, <vscale x 2 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv2i32_nxv2i32_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vloxei8.v v9, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i32> 
@llvm.riscv.vloxei.nxv2i32.nxv2i8( + <vscale x 2 x i32> poison, + ptr %0, + <vscale x 2 x i8> %1, + iXLen %2) + + ret <vscale x 2 x i32> %a +} + +declare <vscale x 2 x i32> @llvm.riscv.vloxei.mask.nxv2i32.nxv2i8( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x i32> @intrinsic_vloxei_mask_v_nxv2i32_nxv2i32_nxv2i8(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2i32_nxv2i32_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i32> @llvm.riscv.vloxei.mask.nxv2i32.nxv2i8( + <vscale x 2 x i32> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x i32> %a +} + +declare <vscale x 4 x i32> @llvm.riscv.vloxei.nxv4i32.nxv4i8( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define <vscale x 4 x i32> @intrinsic_vloxei_v_nxv4i32_nxv4i32_nxv4i8(ptr %0, <vscale x 4 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv4i32_nxv4i32_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vloxei8.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i32> @llvm.riscv.vloxei.nxv4i32.nxv4i8( + <vscale x 4 x i32> poison, + ptr %0, + <vscale x 4 x i8> %1, + iXLen %2) + + ret <vscale x 4 x i32> %a +} + +declare <vscale x 4 x i32> @llvm.riscv.vloxei.mask.nxv4i32.nxv4i8( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i8>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x i32> @intrinsic_vloxei_mask_v_nxv4i32_nxv4i32_nxv4i8(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4i32_nxv4i32_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i32> @llvm.riscv.vloxei.mask.nxv4i32.nxv4i8( + <vscale x 4 x i32> %0, + ptr %1, + <vscale x 4 x i8> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x i32> %a +} + +declare <vscale x 8 x i32> @llvm.riscv.vloxei.nxv8i32.nxv8i8( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i8>, + iXLen); + +define <vscale x 8 x i32> @intrinsic_vloxei_v_nxv8i32_nxv8i32_nxv8i8(ptr %0, <vscale x 8 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv8i32_nxv8i32_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vloxei8.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i32> @llvm.riscv.vloxei.nxv8i32.nxv8i8( + <vscale x 8 x i32> poison, + ptr %0, + <vscale x 8 x i8> %1, + iXLen %2) + + ret <vscale x 8 x i32> %a +} + +declare <vscale x 8 x i32> @llvm.riscv.vloxei.mask.nxv8i32.nxv8i8( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i8>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x i32> @intrinsic_vloxei_mask_v_nxv8i32_nxv8i32_nxv8i8(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8i32_nxv8i32_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, mu +; CHECK-NEXT: vloxei8.v 
v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i32> @llvm.riscv.vloxei.mask.nxv8i32.nxv8i8( + <vscale x 8 x i32> %0, + ptr %1, + <vscale x 8 x i8> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x i32> %a +} + +declare <vscale x 16 x i32> @llvm.riscv.vloxei.nxv16i32.nxv16i8( + <vscale x 16 x i32>, + ptr, + <vscale x 16 x i8>, + iXLen); + +define <vscale x 16 x i32> @intrinsic_vloxei_v_nxv16i32_nxv16i32_nxv16i8(ptr %0, <vscale x 16 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv16i32_nxv16i32_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv2r.v v16, v8 +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vloxei8.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i32> @llvm.riscv.vloxei.nxv16i32.nxv16i8( + <vscale x 16 x i32> poison, + ptr %0, + <vscale x 16 x i8> %1, + iXLen %2) + + ret <vscale x 16 x i32> %a +} + +declare <vscale x 16 x i32> @llvm.riscv.vloxei.mask.nxv16i32.nxv16i8( + <vscale x 16 x i32>, + ptr, + <vscale x 16 x i8>, + <vscale x 16 x i1>, + iXLen, + iXLen); + +define <vscale x 16 x i32> @intrinsic_vloxei_mask_v_nxv16i32_nxv16i32_nxv16i8(<vscale x 16 x i32> %0, ptr %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv16i32_nxv16i32_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i32> @llvm.riscv.vloxei.mask.nxv16i32.nxv16i8( + <vscale x 16 x i32> %0, + ptr %1, + <vscale x 16 x i8> %2, + <vscale x 16 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 16 x i32> %a +} + +declare <vscale x 1 x i64> @llvm.riscv.vloxei.nxv1i64.nxv1i8( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i8>, + iXLen); + +define <vscale x 1 x i64> @intrinsic_vloxei_v_nxv1i64_nxv1i64_nxv1i8(ptr %0, <vscale x 1 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv1i64_nxv1i64_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vloxei8.v v9, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i64> @llvm.riscv.vloxei.nxv1i64.nxv1i8( + <vscale x 1 x i64> poison, + ptr %0, + <vscale x 1 x i8> %1, + iXLen %2) + + ret <vscale x 1 x i64> %a +} + +declare <vscale x 1 x i64> @llvm.riscv.vloxei.mask.nxv1i64.nxv1i8( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i8>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x i64> @intrinsic_vloxei_mask_v_nxv1i64_nxv1i64_nxv1i8(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1i64_nxv1i64_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i64> @llvm.riscv.vloxei.mask.nxv1i64.nxv1i8( + <vscale x 1 x i64> %0, + ptr %1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x i64> %a +} + +declare <vscale x 2 x i64> @llvm.riscv.vloxei.nxv2i64.nxv2i8( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define <vscale x 2 x i64> @intrinsic_vloxei_v_nxv2i64_nxv2i64_nxv2i8(ptr %0, <vscale x 2 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv2i64_nxv2i64_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; 
CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vloxei8.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i64> @llvm.riscv.vloxei.nxv2i64.nxv2i8( + <vscale x 2 x i64> poison, + ptr %0, + <vscale x 2 x i8> %1, + iXLen %2) + + ret <vscale x 2 x i64> %a +} + +declare <vscale x 2 x i64> @llvm.riscv.vloxei.mask.nxv2i64.nxv2i8( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x i64> @intrinsic_vloxei_mask_v_nxv2i64_nxv2i64_nxv2i8(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2i64_nxv2i64_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i64> @llvm.riscv.vloxei.mask.nxv2i64.nxv2i8( + <vscale x 2 x i64> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x i64> %a +} + +declare <vscale x 4 x i64> @llvm.riscv.vloxei.nxv4i64.nxv4i8( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define <vscale x 4 x i64> @intrinsic_vloxei_v_nxv4i64_nxv4i64_nxv4i8(ptr %0, <vscale x 4 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv4i64_nxv4i64_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vloxei8.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i64> @llvm.riscv.vloxei.nxv4i64.nxv4i8( + <vscale x 4 x i64> poison, + ptr %0, + <vscale x 4 x i8> %1, + iXLen %2) + + ret <vscale x 4 x i64> %a +} + +declare <vscale x 4 x i64> @llvm.riscv.vloxei.mask.nxv4i64.nxv4i8( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i8>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x i64> @intrinsic_vloxei_mask_v_nxv4i64_nxv4i64_nxv4i8(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4i64_nxv4i64_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i64> @llvm.riscv.vloxei.mask.nxv4i64.nxv4i8( + <vscale x 4 x i64> %0, + ptr %1, + <vscale x 4 x i8> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x i64> %a +} + +declare <vscale x 8 x i64> @llvm.riscv.vloxei.nxv8i64.nxv8i8( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i8>, + iXLen); + +define <vscale x 8 x i64> @intrinsic_vloxei_v_nxv8i64_nxv8i64_nxv8i8(ptr %0, <vscale x 8 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv8i64_nxv8i64_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v16, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vloxei8.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i64> @llvm.riscv.vloxei.nxv8i64.nxv8i8( + <vscale x 8 x i64> poison, + ptr %0, + <vscale x 8 x i8> %1, + iXLen %2) + + ret <vscale x 8 x i64> %a +} + +declare <vscale x 8 x i64> @llvm.riscv.vloxei.mask.nxv8i64.nxv8i8( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i8>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x i64> @intrinsic_vloxei_mask_v_nxv8i64_nxv8i64_nxv8i8(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) 
nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8i64_nxv8i64_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i64> @llvm.riscv.vloxei.mask.nxv8i64.nxv8i8( + <vscale x 8 x i64> %0, + ptr %1, + <vscale x 8 x i8> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x i64> %a +} + +declare <vscale x 1 x half> @llvm.riscv.vloxei.nxv1f16.nxv1i8( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i8>, + iXLen); + +define <vscale x 1 x half> @intrinsic_vloxei_v_nxv1f16_nxv1f16_nxv1i8(ptr %0, <vscale x 1 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv1f16_nxv1f16_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vloxei8.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x half> @llvm.riscv.vloxei.nxv1f16.nxv1i8( + <vscale x 1 x half> poison, + ptr %0, + <vscale x 1 x i8> %1, + iXLen %2) + + ret <vscale x 1 x half> %a +} + +declare <vscale x 1 x half> @llvm.riscv.vloxei.mask.nxv1f16.nxv1i8( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i8>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x half> @intrinsic_vloxei_mask_v_nxv1f16_nxv1f16_nxv1i8(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1f16_nxv1f16_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x half> @llvm.riscv.vloxei.mask.nxv1f16.nxv1i8( + <vscale x 1 x half> %0, + ptr %1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x half> %a +} + +declare <vscale x 2 x half> @llvm.riscv.vloxei.nxv2f16.nxv2i8( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define <vscale x 2 x half> @intrinsic_vloxei_v_nxv2f16_nxv2f16_nxv2i8(ptr %0, <vscale x 2 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv2f16_nxv2f16_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vloxei8.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x half> @llvm.riscv.vloxei.nxv2f16.nxv2i8( + <vscale x 2 x half> poison, + ptr %0, + <vscale x 2 x i8> %1, + iXLen %2) + + ret <vscale x 2 x half> %a +} + +declare <vscale x 2 x half> @llvm.riscv.vloxei.mask.nxv2f16.nxv2i8( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x half> @intrinsic_vloxei_mask_v_nxv2f16_nxv2f16_nxv2i8(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2f16_nxv2f16_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x half> @llvm.riscv.vloxei.mask.nxv2f16.nxv2i8( + <vscale x 2 x half> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x half> %a +} + +declare <vscale x 4 x half> @llvm.riscv.vloxei.nxv4f16.nxv4i8( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define <vscale x 4 x half> @intrinsic_vloxei_v_nxv4f16_nxv4f16_nxv4i8(ptr %0, <vscale x 4 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: 
intrinsic_vloxei_v_nxv4f16_nxv4f16_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vloxei8.v v9, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x half> @llvm.riscv.vloxei.nxv4f16.nxv4i8( + <vscale x 4 x half> poison, + ptr %0, + <vscale x 4 x i8> %1, + iXLen %2) + + ret <vscale x 4 x half> %a +} + +declare <vscale x 4 x half> @llvm.riscv.vloxei.mask.nxv4f16.nxv4i8( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i8>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x half> @intrinsic_vloxei_mask_v_nxv4f16_nxv4f16_nxv4i8(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4f16_nxv4f16_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x half> @llvm.riscv.vloxei.mask.nxv4f16.nxv4i8( + <vscale x 4 x half> %0, + ptr %1, + <vscale x 4 x i8> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x half> %a +} + +declare <vscale x 8 x half> @llvm.riscv.vloxei.nxv8f16.nxv8i8( + <vscale x 8 x half>, + ptr, + <vscale x 8 x i8>, + iXLen); + +define <vscale x 8 x half> @intrinsic_vloxei_v_nxv8f16_nxv8f16_nxv8i8(ptr %0, <vscale x 8 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv8f16_nxv8f16_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vloxei8.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x half> @llvm.riscv.vloxei.nxv8f16.nxv8i8( + <vscale x 8 x half> poison, + ptr %0, + <vscale x 8 x i8> %1, + iXLen %2) + + ret <vscale x 8 x half> %a +} + +declare <vscale x 8 x half> @llvm.riscv.vloxei.mask.nxv8f16.nxv8i8( + <vscale x 8 x half>, + ptr, + <vscale x 8 x i8>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x half> @intrinsic_vloxei_mask_v_nxv8f16_nxv8f16_nxv8i8(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8f16_nxv8f16_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x half> @llvm.riscv.vloxei.mask.nxv8f16.nxv8i8( + <vscale x 8 x half> %0, + ptr %1, + <vscale x 8 x i8> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x half> %a +} + +declare <vscale x 16 x half> @llvm.riscv.vloxei.nxv16f16.nxv16i8( + <vscale x 16 x half>, + ptr, + <vscale x 16 x i8>, + iXLen); + +define <vscale x 16 x half> @intrinsic_vloxei_v_nxv16f16_nxv16f16_nxv16i8(ptr %0, <vscale x 16 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv16f16_nxv16f16_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vloxei8.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x half> @llvm.riscv.vloxei.nxv16f16.nxv16i8( + <vscale x 16 x half> poison, + ptr %0, + <vscale x 16 x i8> %1, + iXLen %2) + + ret <vscale x 16 x half> %a +} + +declare <vscale x 16 x half> @llvm.riscv.vloxei.mask.nxv16f16.nxv16i8( + <vscale x 16 x half>, + ptr, + <vscale x 16 x i8>, + <vscale x 16 x i1>, + iXLen, + iXLen); + +define <vscale x 16 x half> 
@intrinsic_vloxei_mask_v_nxv16f16_nxv16f16_nxv16i8(<vscale x 16 x half> %0, ptr %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv16f16_nxv16f16_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x half> @llvm.riscv.vloxei.mask.nxv16f16.nxv16i8( + <vscale x 16 x half> %0, + ptr %1, + <vscale x 16 x i8> %2, + <vscale x 16 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 16 x half> %a +} + +declare <vscale x 32 x half> @llvm.riscv.vloxei.nxv32f16.nxv32i8( + <vscale x 32 x half>, + ptr, + <vscale x 32 x i8>, + iXLen); + +define <vscale x 32 x half> @intrinsic_vloxei_v_nxv32f16_nxv32f16_nxv32i8(ptr %0, <vscale x 32 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv32f16_nxv32f16_nxv32i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv4r.v v16, v8 +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vloxei8.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 32 x half> @llvm.riscv.vloxei.nxv32f16.nxv32i8( + <vscale x 32 x half> poison, + ptr %0, + <vscale x 32 x i8> %1, + iXLen %2) + + ret <vscale x 32 x half> %a +} + +declare <vscale x 32 x half> @llvm.riscv.vloxei.mask.nxv32f16.nxv32i8( + <vscale x 32 x half>, + ptr, + <vscale x 32 x i8>, + <vscale x 32 x i1>, + iXLen, + iXLen); + +define <vscale x 32 x half> @intrinsic_vloxei_mask_v_nxv32f16_nxv32f16_nxv32i8(<vscale x 32 x half> %0, ptr %1, <vscale x 32 x i8> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv32f16_nxv32f16_nxv32i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 32 x half> @llvm.riscv.vloxei.mask.nxv32f16.nxv32i8( + <vscale x 32 x half> %0, + ptr %1, + <vscale x 32 x i8> %2, + <vscale x 32 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 32 x half> %a +} + +declare <vscale x 1 x float> @llvm.riscv.vloxei.nxv1f32.nxv1i8( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i8>, + iXLen); + +define <vscale x 1 x float> @intrinsic_vloxei_v_nxv1f32_nxv1f32_nxv1i8(ptr %0, <vscale x 1 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv1f32_nxv1f32_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vloxei8.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x float> @llvm.riscv.vloxei.nxv1f32.nxv1i8( + <vscale x 1 x float> poison, + ptr %0, + <vscale x 1 x i8> %1, + iXLen %2) + + ret <vscale x 1 x float> %a +} + +declare <vscale x 1 x float> @llvm.riscv.vloxei.mask.nxv1f32.nxv1i8( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i8>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x float> @intrinsic_vloxei_mask_v_nxv1f32_nxv1f32_nxv1i8(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1f32_nxv1f32_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x float> @llvm.riscv.vloxei.mask.nxv1f32.nxv1i8( + <vscale x 1 x float> %0, + ptr %1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x float> %a +} + +declare <vscale x 2 
x float> @llvm.riscv.vloxei.nxv2f32.nxv2i8( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define <vscale x 2 x float> @intrinsic_vloxei_v_nxv2f32_nxv2f32_nxv2i8(ptr %0, <vscale x 2 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv2f32_nxv2f32_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vloxei8.v v9, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x float> @llvm.riscv.vloxei.nxv2f32.nxv2i8( + <vscale x 2 x float> poison, + ptr %0, + <vscale x 2 x i8> %1, + iXLen %2) + + ret <vscale x 2 x float> %a +} + +declare <vscale x 2 x float> @llvm.riscv.vloxei.mask.nxv2f32.nxv2i8( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x float> @intrinsic_vloxei_mask_v_nxv2f32_nxv2f32_nxv2i8(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2f32_nxv2f32_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x float> @llvm.riscv.vloxei.mask.nxv2f32.nxv2i8( + <vscale x 2 x float> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x float> %a +} + +declare <vscale x 4 x float> @llvm.riscv.vloxei.nxv4f32.nxv4i8( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define <vscale x 4 x float> @intrinsic_vloxei_v_nxv4f32_nxv4f32_nxv4i8(ptr %0, <vscale x 4 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv4f32_nxv4f32_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vloxei8.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x float> @llvm.riscv.vloxei.nxv4f32.nxv4i8( + <vscale x 4 x float> poison, + ptr %0, + <vscale x 4 x i8> %1, + iXLen %2) + + ret <vscale x 4 x float> %a +} + +declare <vscale x 4 x float> @llvm.riscv.vloxei.mask.nxv4f32.nxv4i8( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i8>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x float> @intrinsic_vloxei_mask_v_nxv4f32_nxv4f32_nxv4i8(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4f32_nxv4f32_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x float> @llvm.riscv.vloxei.mask.nxv4f32.nxv4i8( + <vscale x 4 x float> %0, + ptr %1, + <vscale x 4 x i8> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x float> %a +} + +declare <vscale x 8 x float> @llvm.riscv.vloxei.nxv8f32.nxv8i8( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i8>, + iXLen); + +define <vscale x 8 x float> @intrinsic_vloxei_v_nxv8f32_nxv8f32_nxv8i8(ptr %0, <vscale x 8 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv8f32_nxv8f32_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vloxei8.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x float> @llvm.riscv.vloxei.nxv8f32.nxv8i8( + <vscale x 8 x float> poison, + ptr %0, + <vscale x 8 x 
i8> %1, + iXLen %2) + + ret <vscale x 8 x float> %a +} + +declare <vscale x 8 x float> @llvm.riscv.vloxei.mask.nxv8f32.nxv8i8( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i8>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x float> @intrinsic_vloxei_mask_v_nxv8f32_nxv8f32_nxv8i8(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8f32_nxv8f32_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x float> @llvm.riscv.vloxei.mask.nxv8f32.nxv8i8( + <vscale x 8 x float> %0, + ptr %1, + <vscale x 8 x i8> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x float> %a +} + +declare <vscale x 16 x float> @llvm.riscv.vloxei.nxv16f32.nxv16i8( + <vscale x 16 x float>, + ptr, + <vscale x 16 x i8>, + iXLen); + +define <vscale x 16 x float> @intrinsic_vloxei_v_nxv16f32_nxv16f32_nxv16i8(ptr %0, <vscale x 16 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv16f32_nxv16f32_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv2r.v v16, v8 +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vloxei8.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x float> @llvm.riscv.vloxei.nxv16f32.nxv16i8( + <vscale x 16 x float> poison, + ptr %0, + <vscale x 16 x i8> %1, + iXLen %2) + + ret <vscale x 16 x float> %a +} + +declare <vscale x 16 x float> @llvm.riscv.vloxei.mask.nxv16f32.nxv16i8( + <vscale x 16 x float>, + ptr, + <vscale x 16 x i8>, + <vscale x 16 x i1>, + iXLen, + iXLen); + +define <vscale x 16 x float> @intrinsic_vloxei_mask_v_nxv16f32_nxv16f32_nxv16i8(<vscale x 16 x float> %0, ptr %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv16f32_nxv16f32_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x float> @llvm.riscv.vloxei.mask.nxv16f32.nxv16i8( + <vscale x 16 x float> %0, + ptr %1, + <vscale x 16 x i8> %2, + <vscale x 16 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 16 x float> %a +} + +declare <vscale x 1 x double> @llvm.riscv.vloxei.nxv1f64.nxv1i8( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i8>, + iXLen); + +define <vscale x 1 x double> @intrinsic_vloxei_v_nxv1f64_nxv1f64_nxv1i8(ptr %0, <vscale x 1 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv1f64_nxv1f64_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vloxei8.v v9, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x double> @llvm.riscv.vloxei.nxv1f64.nxv1i8( + <vscale x 1 x double> poison, + ptr %0, + <vscale x 1 x i8> %1, + iXLen %2) + + ret <vscale x 1 x double> %a +} + +declare <vscale x 1 x double> @llvm.riscv.vloxei.mask.nxv1f64.nxv1i8( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i8>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x double> @intrinsic_vloxei_mask_v_nxv1f64_nxv1f64_nxv1i8(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1f64_nxv1f64_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v9, v0.t +; 
CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x double> @llvm.riscv.vloxei.mask.nxv1f64.nxv1i8( + <vscale x 1 x double> %0, + ptr %1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x double> %a +} + +declare <vscale x 2 x double> @llvm.riscv.vloxei.nxv2f64.nxv2i8( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define <vscale x 2 x double> @intrinsic_vloxei_v_nxv2f64_nxv2f64_nxv2i8(ptr %0, <vscale x 2 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv2f64_nxv2f64_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vloxei8.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x double> @llvm.riscv.vloxei.nxv2f64.nxv2i8( + <vscale x 2 x double> poison, + ptr %0, + <vscale x 2 x i8> %1, + iXLen %2) + + ret <vscale x 2 x double> %a +} + +declare <vscale x 2 x double> @llvm.riscv.vloxei.mask.nxv2f64.nxv2i8( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x double> @intrinsic_vloxei_mask_v_nxv2f64_nxv2f64_nxv2i8(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2f64_nxv2f64_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x double> @llvm.riscv.vloxei.mask.nxv2f64.nxv2i8( + <vscale x 2 x double> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x double> %a +} + +declare <vscale x 4 x double> @llvm.riscv.vloxei.nxv4f64.nxv4i8( + <vscale x 4 x double>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define <vscale x 4 x double> @intrinsic_vloxei_v_nxv4f64_nxv4f64_nxv4i8(ptr %0, <vscale x 4 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv4f64_nxv4f64_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vloxei8.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x double> @llvm.riscv.vloxei.nxv4f64.nxv4i8( + <vscale x 4 x double> poison, + ptr %0, + <vscale x 4 x i8> %1, + iXLen %2) + + ret <vscale x 4 x double> %a +} + +declare <vscale x 4 x double> @llvm.riscv.vloxei.mask.nxv4f64.nxv4i8( + <vscale x 4 x double>, + ptr, + <vscale x 4 x i8>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x double> @intrinsic_vloxei_mask_v_nxv4f64_nxv4f64_nxv4i8(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4f64_nxv4f64_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x double> @llvm.riscv.vloxei.mask.nxv4f64.nxv4i8( + <vscale x 4 x double> %0, + ptr %1, + <vscale x 4 x i8> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x double> %a +} + +declare <vscale x 8 x double> @llvm.riscv.vloxei.nxv8f64.nxv8i8( + <vscale x 8 x double>, + ptr, + <vscale x 8 x i8>, + iXLen); + +define <vscale x 8 x double> @intrinsic_vloxei_v_nxv8f64_nxv8f64_nxv8i8(ptr %0, <vscale x 8 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv8f64_nxv8f64_nxv8i8: +; 
CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v16, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vloxei8.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x double> @llvm.riscv.vloxei.nxv8f64.nxv8i8( + <vscale x 8 x double> poison, + ptr %0, + <vscale x 8 x i8> %1, + iXLen %2) + + ret <vscale x 8 x double> %a +} + +declare <vscale x 8 x double> @llvm.riscv.vloxei.mask.nxv8f64.nxv8i8( + <vscale x 8 x double>, + ptr, + <vscale x 8 x i8>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x double> @intrinsic_vloxei_mask_v_nxv8f64_nxv8f64_nxv8i8(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8f64_nxv8f64_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x double> @llvm.riscv.vloxei.mask.nxv8f64.nxv8i8( + <vscale x 8 x double> %0, + ptr %1, + <vscale x 8 x i8> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x double> %a +} diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vluxei-rv64.ll b/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vluxei-rv64.ll new file mode 100644 index 0000000000000..916af2556c6a8 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vluxei-rv64.ll @@ -0,0 +1,1341 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv64 -mattr=+v,+zvfhmin,+zvfbfmin -global-isel -verify-machineinstrs \ +; RUN: < %s | FileCheck %s + +; The intrinsics are not supported with RV32. + +declare <vscale x 1 x i8> @llvm.riscv.vluxei.nxv1i8.nxv1i64( + <vscale x 1 x i8>, + ptr, + <vscale x 1 x i64>, + i64); + +define <vscale x 1 x i8> @intrinsic_vluxei_v_nxv1i8_nxv1i8_nxv1i64(ptr %0, <vscale x 1 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1i8_nxv1i8_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vluxei64.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i8> @llvm.riscv.vluxei.nxv1i8.nxv1i64( + <vscale x 1 x i8> poison, + ptr %0, + <vscale x 1 x i64> %1, + i64 %2) + + ret <vscale x 1 x i8> %a +} + +declare <vscale x 1 x i8> @llvm.riscv.vluxei.mask.nxv1i8.nxv1i64( + <vscale x 1 x i8>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64, + i64); + +define <vscale x 1 x i8> @intrinsic_vluxei_mask_v_nxv1i8_nxv1i8_nxv1i64(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1i8_nxv1i8_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i8> @llvm.riscv.vluxei.mask.nxv1i8.nxv1i64( + <vscale x 1 x i8> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 1 x i8> %a +} + +declare <vscale x 2 x i8> @llvm.riscv.vluxei.nxv2i8.nxv2i64( + <vscale x 2 x i8>, + ptr, + <vscale x 2 x i64>, + i64); + +define <vscale x 2 x i8> @intrinsic_vluxei_v_nxv2i8_nxv2i8_nxv2i64(ptr %0, <vscale x 2 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2i8_nxv2i8_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vluxei64.v v10, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: ret 
+entry: + %a = call <vscale x 2 x i8> @llvm.riscv.vluxei.nxv2i8.nxv2i64( + <vscale x 2 x i8> poison, + ptr %0, + <vscale x 2 x i64> %1, + i64 %2) + + ret <vscale x 2 x i8> %a +} + +declare <vscale x 2 x i8> @llvm.riscv.vluxei.mask.nxv2i8.nxv2i64( + <vscale x 2 x i8>, + ptr, + <vscale x 2 x i64>, + <vscale x 2 x i1>, + i64, + i64); + +define <vscale x 2 x i8> @intrinsic_vluxei_mask_v_nxv2i8_nxv2i8_nxv2i64(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2i8_nxv2i8_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i8> @llvm.riscv.vluxei.mask.nxv2i8.nxv2i64( + <vscale x 2 x i8> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 2 x i8> %a +} + +declare <vscale x 4 x i8> @llvm.riscv.vluxei.nxv4i8.nxv4i64( + <vscale x 4 x i8>, + ptr, + <vscale x 4 x i64>, + i64); + +define <vscale x 4 x i8> @intrinsic_vluxei_v_nxv4i8_nxv4i8_nxv4i64(ptr %0, <vscale x 4 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4i8_nxv4i8_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vluxei64.v v12, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i8> @llvm.riscv.vluxei.nxv4i8.nxv4i64( + <vscale x 4 x i8> poison, + ptr %0, + <vscale x 4 x i64> %1, + i64 %2) + + ret <vscale x 4 x i8> %a +} + +declare <vscale x 4 x i8> @llvm.riscv.vluxei.mask.nxv4i8.nxv4i64( + <vscale x 4 x i8>, + ptr, + <vscale x 4 x i64>, + <vscale x 4 x i1>, + i64, + i64); + +define <vscale x 4 x i8> @intrinsic_vluxei_mask_v_nxv4i8_nxv4i8_nxv4i64(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4i8_nxv4i8_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i8> @llvm.riscv.vluxei.mask.nxv4i8.nxv4i64( + <vscale x 4 x i8> %0, + ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 4 x i8> %a +} + +declare <vscale x 8 x i8> @llvm.riscv.vluxei.nxv8i8.nxv8i64( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i64>, + i64); + +define <vscale x 8 x i8> @intrinsic_vluxei_v_nxv8i8_nxv8i8_nxv8i64(ptr %0, <vscale x 8 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8i8_nxv8i8_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vluxei64.v v16, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i8> @llvm.riscv.vluxei.nxv8i8.nxv8i64( + <vscale x 8 x i8> poison, + ptr %0, + <vscale x 8 x i64> %1, + i64 %2) + + ret <vscale x 8 x i8> %a +} + +declare <vscale x 8 x i8> @llvm.riscv.vluxei.mask.nxv8i8.nxv8i64( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i64>, + <vscale x 8 x i1>, + i64, + i64); + +define <vscale x 8 x i8> @intrinsic_vluxei_mask_v_nxv8i8_nxv8i8_nxv8i64(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8i8_nxv8i8_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i8> @llvm.riscv.vluxei.mask.nxv8i8.nxv8i64( + 
<vscale x 8 x i8> %0, + ptr %1, + <vscale x 8 x i64> %2, + <vscale x 8 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 8 x i8> %a +} + +declare <vscale x 1 x i16> @llvm.riscv.vluxei.nxv1i16.nxv1i64( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i64>, + i64); + +define <vscale x 1 x i16> @intrinsic_vluxei_v_nxv1i16_nxv1i16_nxv1i64(ptr %0, <vscale x 1 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1i16_nxv1i16_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vluxei64.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i16> @llvm.riscv.vluxei.nxv1i16.nxv1i64( + <vscale x 1 x i16> poison, + ptr %0, + <vscale x 1 x i64> %1, + i64 %2) + + ret <vscale x 1 x i16> %a +} + +declare <vscale x 1 x i16> @llvm.riscv.vluxei.mask.nxv1i16.nxv1i64( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64, + i64); + +define <vscale x 1 x i16> @intrinsic_vluxei_mask_v_nxv1i16_nxv1i16_nxv1i64(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1i16_nxv1i16_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i16> @llvm.riscv.vluxei.mask.nxv1i16.nxv1i64( + <vscale x 1 x i16> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 1 x i16> %a +} + +declare <vscale x 2 x i16> @llvm.riscv.vluxei.nxv2i16.nxv2i64( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i64>, + i64); + +define <vscale x 2 x i16> @intrinsic_vluxei_v_nxv2i16_nxv2i16_nxv2i64(ptr %0, <vscale x 2 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2i16_nxv2i16_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vluxei64.v v10, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i16> @llvm.riscv.vluxei.nxv2i16.nxv2i64( + <vscale x 2 x i16> poison, + ptr %0, + <vscale x 2 x i64> %1, + i64 %2) + + ret <vscale x 2 x i16> %a +} + +declare <vscale x 2 x i16> @llvm.riscv.vluxei.mask.nxv2i16.nxv2i64( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i64>, + <vscale x 2 x i1>, + i64, + i64); + +define <vscale x 2 x i16> @intrinsic_vluxei_mask_v_nxv2i16_nxv2i16_nxv2i64(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2i16_nxv2i16_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i16> @llvm.riscv.vluxei.mask.nxv2i16.nxv2i64( + <vscale x 2 x i16> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 2 x i16> %a +} + +declare <vscale x 4 x i16> @llvm.riscv.vluxei.nxv4i16.nxv4i64( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i64>, + i64); + +define <vscale x 4 x i16> @intrinsic_vluxei_v_nxv4i16_nxv4i16_nxv4i64(ptr %0, <vscale x 4 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4i16_nxv4i16_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vluxei64.v v12, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i16> @llvm.riscv.vluxei.nxv4i16.nxv4i64( + <vscale x 4 x i16> poison, + ptr %0, + <vscale x 4 x 
i64> %1, + i64 %2) + + ret <vscale x 4 x i16> %a +} + +declare <vscale x 4 x i16> @llvm.riscv.vluxei.mask.nxv4i16.nxv4i64( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i64>, + <vscale x 4 x i1>, + i64, + i64); + +define <vscale x 4 x i16> @intrinsic_vluxei_mask_v_nxv4i16_nxv4i16_nxv4i64(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4i16_nxv4i16_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i16> @llvm.riscv.vluxei.mask.nxv4i16.nxv4i64( + <vscale x 4 x i16> %0, + ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 4 x i16> %a +} + +declare <vscale x 8 x i16> @llvm.riscv.vluxei.nxv8i16.nxv8i64( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i64>, + i64); + +define <vscale x 8 x i16> @intrinsic_vluxei_v_nxv8i16_nxv8i16_nxv8i64(ptr %0, <vscale x 8 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8i16_nxv8i16_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vluxei64.v v16, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i16> @llvm.riscv.vluxei.nxv8i16.nxv8i64( + <vscale x 8 x i16> poison, + ptr %0, + <vscale x 8 x i64> %1, + i64 %2) + + ret <vscale x 8 x i16> %a +} + +declare <vscale x 8 x i16> @llvm.riscv.vluxei.mask.nxv8i16.nxv8i64( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i64>, + <vscale x 8 x i1>, + i64, + i64); + +define <vscale x 8 x i16> @intrinsic_vluxei_mask_v_nxv8i16_nxv8i16_nxv8i64(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8i16_nxv8i16_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i16> @llvm.riscv.vluxei.mask.nxv8i16.nxv8i64( + <vscale x 8 x i16> %0, + ptr %1, + <vscale x 8 x i64> %2, + <vscale x 8 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 8 x i16> %a +} + +declare <vscale x 1 x i32> @llvm.riscv.vluxei.nxv1i32.nxv1i64( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i64>, + i64); + +define <vscale x 1 x i32> @intrinsic_vluxei_v_nxv1i32_nxv1i32_nxv1i64(ptr %0, <vscale x 1 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1i32_nxv1i32_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vluxei64.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i32> @llvm.riscv.vluxei.nxv1i32.nxv1i64( + <vscale x 1 x i32> poison, + ptr %0, + <vscale x 1 x i64> %1, + i64 %2) + + ret <vscale x 1 x i32> %a +} + +declare <vscale x 1 x i32> @llvm.riscv.vluxei.mask.nxv1i32.nxv1i64( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64, + i64); + +define <vscale x 1 x i32> @intrinsic_vluxei_mask_v_nxv1i32_nxv1i32_nxv1i64(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1i32_nxv1i32_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i32> @llvm.riscv.vluxei.mask.nxv1i32.nxv1i64( + <vscale x 1 x i32> %0, + ptr %1, + <vscale x 1 x i64> %2, + 
<vscale x 1 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 1 x i32> %a +} + +declare <vscale x 2 x i32> @llvm.riscv.vluxei.nxv2i32.nxv2i64( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i64>, + i64); + +define <vscale x 2 x i32> @intrinsic_vluxei_v_nxv2i32_nxv2i32_nxv2i64(ptr %0, <vscale x 2 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2i32_nxv2i32_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vluxei64.v v10, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i32> @llvm.riscv.vluxei.nxv2i32.nxv2i64( + <vscale x 2 x i32> poison, + ptr %0, + <vscale x 2 x i64> %1, + i64 %2) + + ret <vscale x 2 x i32> %a +} + +declare <vscale x 2 x i32> @llvm.riscv.vluxei.mask.nxv2i32.nxv2i64( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i64>, + <vscale x 2 x i1>, + i64, + i64); + +define <vscale x 2 x i32> @intrinsic_vluxei_mask_v_nxv2i32_nxv2i32_nxv2i64(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2i32_nxv2i32_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i32> @llvm.riscv.vluxei.mask.nxv2i32.nxv2i64( + <vscale x 2 x i32> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 2 x i32> %a +} + +declare <vscale x 4 x i32> @llvm.riscv.vluxei.nxv4i32.nxv4i64( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i64>, + i64); + +define <vscale x 4 x i32> @intrinsic_vluxei_v_nxv4i32_nxv4i32_nxv4i64(ptr %0, <vscale x 4 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4i32_nxv4i32_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vluxei64.v v12, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i32> @llvm.riscv.vluxei.nxv4i32.nxv4i64( + <vscale x 4 x i32> poison, + ptr %0, + <vscale x 4 x i64> %1, + i64 %2) + + ret <vscale x 4 x i32> %a +} + +declare <vscale x 4 x i32> @llvm.riscv.vluxei.mask.nxv4i32.nxv4i64( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i64>, + <vscale x 4 x i1>, + i64, + i64); + +define <vscale x 4 x i32> @intrinsic_vluxei_mask_v_nxv4i32_nxv4i32_nxv4i64(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4i32_nxv4i32_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i32> @llvm.riscv.vluxei.mask.nxv4i32.nxv4i64( + <vscale x 4 x i32> %0, + ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 4 x i32> %a +} + +declare <vscale x 8 x i32> @llvm.riscv.vluxei.nxv8i32.nxv8i64( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i64>, + i64); + +define <vscale x 8 x i32> @intrinsic_vluxei_v_nxv8i32_nxv8i32_nxv8i64(ptr %0, <vscale x 8 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8i32_nxv8i32_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vluxei64.v v16, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i32> @llvm.riscv.vluxei.nxv8i32.nxv8i64( + <vscale x 8 x i32> poison, + ptr %0, + <vscale x 8 x i64> %1, + i64 %2) + + ret <vscale x 8 x i32> %a +} + 
+declare <vscale x 8 x i32> @llvm.riscv.vluxei.mask.nxv8i32.nxv8i64( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i64>, + <vscale x 8 x i1>, + i64, + i64); + +define <vscale x 8 x i32> @intrinsic_vluxei_mask_v_nxv8i32_nxv8i32_nxv8i64(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8i32_nxv8i32_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i32> @llvm.riscv.vluxei.mask.nxv8i32.nxv8i64( + <vscale x 8 x i32> %0, + ptr %1, + <vscale x 8 x i64> %2, + <vscale x 8 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 8 x i32> %a +} + +declare <vscale x 1 x i64> @llvm.riscv.vluxei.nxv1i64.nxv1i64( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i64>, + i64); + +define <vscale x 1 x i64> @intrinsic_vluxei_v_nxv1i64_nxv1i64_nxv1i64(ptr %0, <vscale x 1 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1i64_nxv1i64_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vluxei64.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i64> @llvm.riscv.vluxei.nxv1i64.nxv1i64( + <vscale x 1 x i64> poison, + ptr %0, + <vscale x 1 x i64> %1, + i64 %2) + + ret <vscale x 1 x i64> %a +} + +declare <vscale x 1 x i64> @llvm.riscv.vluxei.mask.nxv1i64.nxv1i64( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64, + i64); + +define <vscale x 1 x i64> @intrinsic_vluxei_mask_v_nxv1i64_nxv1i64_nxv1i64(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1i64_nxv1i64_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i64> @llvm.riscv.vluxei.mask.nxv1i64.nxv1i64( + <vscale x 1 x i64> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 1 x i64> %a +} + +declare <vscale x 2 x i64> @llvm.riscv.vluxei.nxv2i64.nxv2i64( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i64>, + i64); + +define <vscale x 2 x i64> @intrinsic_vluxei_v_nxv2i64_nxv2i64_nxv2i64(ptr %0, <vscale x 2 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2i64_nxv2i64_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vluxei64.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i64> @llvm.riscv.vluxei.nxv2i64.nxv2i64( + <vscale x 2 x i64> poison, + ptr %0, + <vscale x 2 x i64> %1, + i64 %2) + + ret <vscale x 2 x i64> %a +} + +declare <vscale x 2 x i64> @llvm.riscv.vluxei.mask.nxv2i64.nxv2i64( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i64>, + <vscale x 2 x i1>, + i64, + i64); + +define <vscale x 2 x i64> @intrinsic_vluxei_mask_v_nxv2i64_nxv2i64_nxv2i64(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2i64_nxv2i64_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i64> @llvm.riscv.vluxei.mask.nxv2i64.nxv2i64( + <vscale x 2 x i64> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 2 x i64> %a +} + +declare <vscale x 4 x i64> 
@llvm.riscv.vluxei.nxv4i64.nxv4i64( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i64>, + i64); + +define <vscale x 4 x i64> @intrinsic_vluxei_v_nxv4i64_nxv4i64_nxv4i64(ptr %0, <vscale x 4 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4i64_nxv4i64_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vluxei64.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i64> @llvm.riscv.vluxei.nxv4i64.nxv4i64( + <vscale x 4 x i64> poison, + ptr %0, + <vscale x 4 x i64> %1, + i64 %2) + + ret <vscale x 4 x i64> %a +} + +declare <vscale x 4 x i64> @llvm.riscv.vluxei.mask.nxv4i64.nxv4i64( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i64>, + <vscale x 4 x i1>, + i64, + i64); + +define <vscale x 4 x i64> @intrinsic_vluxei_mask_v_nxv4i64_nxv4i64_nxv4i64(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4i64_nxv4i64_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i64> @llvm.riscv.vluxei.mask.nxv4i64.nxv4i64( + <vscale x 4 x i64> %0, + ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 4 x i64> %a +} + +declare <vscale x 8 x i64> @llvm.riscv.vluxei.nxv8i64.nxv8i64( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i64>, + i64); + +define <vscale x 8 x i64> @intrinsic_vluxei_v_nxv8i64_nxv8i64_nxv8i64(ptr %0, <vscale x 8 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8i64_nxv8i64_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vluxei64.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i64> @llvm.riscv.vluxei.nxv8i64.nxv8i64( + <vscale x 8 x i64> poison, + ptr %0, + <vscale x 8 x i64> %1, + i64 %2) + + ret <vscale x 8 x i64> %a +} + +declare <vscale x 8 x i64> @llvm.riscv.vluxei.mask.nxv8i64.nxv8i64( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i64>, + <vscale x 8 x i1>, + i64, + i64); + +define <vscale x 8 x i64> @intrinsic_vluxei_mask_v_nxv8i64_nxv8i64_nxv8i64(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8i64_nxv8i64_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i64> @llvm.riscv.vluxei.mask.nxv8i64.nxv8i64( + <vscale x 8 x i64> %0, + ptr %1, + <vscale x 8 x i64> %2, + <vscale x 8 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 8 x i64> %a +} + +declare <vscale x 1 x half> @llvm.riscv.vluxei.nxv1f16.nxv1i64( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i64>, + i64); + +define <vscale x 1 x half> @intrinsic_vluxei_v_nxv1f16_nxv1f16_nxv1i64(ptr %0, <vscale x 1 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1f16_nxv1f16_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vluxei64.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x half> @llvm.riscv.vluxei.nxv1f16.nxv1i64( + <vscale x 1 x half> poison, + ptr %0, + <vscale x 1 x i64> %1, + i64 %2) + + ret <vscale x 1 x half> %a +} + +declare <vscale x 1 x half> @llvm.riscv.vluxei.mask.nxv1f16.nxv1i64( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64, + i64); + 
+define <vscale x 1 x half> @intrinsic_vluxei_mask_v_nxv1f16_nxv1f16_nxv1i64(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1f16_nxv1f16_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x half> @llvm.riscv.vluxei.mask.nxv1f16.nxv1i64( + <vscale x 1 x half> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 1 x half> %a +} + +declare <vscale x 2 x half> @llvm.riscv.vluxei.nxv2f16.nxv2i64( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i64>, + i64); + +define <vscale x 2 x half> @intrinsic_vluxei_v_nxv2f16_nxv2f16_nxv2i64(ptr %0, <vscale x 2 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2f16_nxv2f16_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vluxei64.v v10, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x half> @llvm.riscv.vluxei.nxv2f16.nxv2i64( + <vscale x 2 x half> poison, + ptr %0, + <vscale x 2 x i64> %1, + i64 %2) + + ret <vscale x 2 x half> %a +} + +declare <vscale x 2 x half> @llvm.riscv.vluxei.mask.nxv2f16.nxv2i64( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i64>, + <vscale x 2 x i1>, + i64, + i64); + +define <vscale x 2 x half> @intrinsic_vluxei_mask_v_nxv2f16_nxv2f16_nxv2i64(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2f16_nxv2f16_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x half> @llvm.riscv.vluxei.mask.nxv2f16.nxv2i64( + <vscale x 2 x half> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 2 x half> %a +} + +declare <vscale x 4 x half> @llvm.riscv.vluxei.nxv4f16.nxv4i64( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i64>, + i64); + +define <vscale x 4 x half> @intrinsic_vluxei_v_nxv4f16_nxv4f16_nxv4i64(ptr %0, <vscale x 4 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4f16_nxv4f16_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vluxei64.v v12, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x half> @llvm.riscv.vluxei.nxv4f16.nxv4i64( + <vscale x 4 x half> poison, + ptr %0, + <vscale x 4 x i64> %1, + i64 %2) + + ret <vscale x 4 x half> %a +} + +declare <vscale x 4 x half> @llvm.riscv.vluxei.mask.nxv4f16.nxv4i64( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i64>, + <vscale x 4 x i1>, + i64, + i64); + +define <vscale x 4 x half> @intrinsic_vluxei_mask_v_nxv4f16_nxv4f16_nxv4i64(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4f16_nxv4f16_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x half> @llvm.riscv.vluxei.mask.nxv4f16.nxv4i64( + <vscale x 4 x half> %0, + ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 4 x half> %a +} + +declare <vscale x 8 x half> @llvm.riscv.vluxei.nxv8f16.nxv8i64( + <vscale x 8 x half>, + ptr, + <vscale x 8 
x i64>, + i64); + +define <vscale x 8 x half> @intrinsic_vluxei_v_nxv8f16_nxv8f16_nxv8i64(ptr %0, <vscale x 8 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8f16_nxv8f16_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vluxei64.v v16, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x half> @llvm.riscv.vluxei.nxv8f16.nxv8i64( + <vscale x 8 x half> poison, + ptr %0, + <vscale x 8 x i64> %1, + i64 %2) + + ret <vscale x 8 x half> %a +} + +declare <vscale x 8 x half> @llvm.riscv.vluxei.mask.nxv8f16.nxv8i64( + <vscale x 8 x half>, + ptr, + <vscale x 8 x i64>, + <vscale x 8 x i1>, + i64, + i64); + +define <vscale x 8 x half> @intrinsic_vluxei_mask_v_nxv8f16_nxv8f16_nxv8i64(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8f16_nxv8f16_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x half> @llvm.riscv.vluxei.mask.nxv8f16.nxv8i64( + <vscale x 8 x half> %0, + ptr %1, + <vscale x 8 x i64> %2, + <vscale x 8 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 8 x half> %a +} + +declare <vscale x 1 x float> @llvm.riscv.vluxei.nxv1f32.nxv1i64( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i64>, + i64); + +define <vscale x 1 x float> @intrinsic_vluxei_v_nxv1f32_nxv1f32_nxv1i64(ptr %0, <vscale x 1 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1f32_nxv1f32_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vluxei64.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x float> @llvm.riscv.vluxei.nxv1f32.nxv1i64( + <vscale x 1 x float> poison, + ptr %0, + <vscale x 1 x i64> %1, + i64 %2) + + ret <vscale x 1 x float> %a +} + +declare <vscale x 1 x float> @llvm.riscv.vluxei.mask.nxv1f32.nxv1i64( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64, + i64); + +define <vscale x 1 x float> @intrinsic_vluxei_mask_v_nxv1f32_nxv1f32_nxv1i64(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1f32_nxv1f32_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x float> @llvm.riscv.vluxei.mask.nxv1f32.nxv1i64( + <vscale x 1 x float> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 1 x float> %a +} + +declare <vscale x 2 x float> @llvm.riscv.vluxei.nxv2f32.nxv2i64( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i64>, + i64); + +define <vscale x 2 x float> @intrinsic_vluxei_v_nxv2f32_nxv2f32_nxv2i64(ptr %0, <vscale x 2 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2f32_nxv2f32_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vluxei64.v v10, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x float> @llvm.riscv.vluxei.nxv2f32.nxv2i64( + <vscale x 2 x float> poison, + ptr %0, + <vscale x 2 x i64> %1, + i64 %2) + + ret <vscale x 2 x float> %a +} + +declare <vscale x 2 x float> @llvm.riscv.vluxei.mask.nxv2f32.nxv2i64( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i64>, + <vscale x 2 
x i1>, + i64, + i64); + +define <vscale x 2 x float> @intrinsic_vluxei_mask_v_nxv2f32_nxv2f32_nxv2i64(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2f32_nxv2f32_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x float> @llvm.riscv.vluxei.mask.nxv2f32.nxv2i64( + <vscale x 2 x float> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 2 x float> %a +} + +declare <vscale x 4 x float> @llvm.riscv.vluxei.nxv4f32.nxv4i64( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i64>, + i64); + +define <vscale x 4 x float> @intrinsic_vluxei_v_nxv4f32_nxv4f32_nxv4i64(ptr %0, <vscale x 4 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4f32_nxv4f32_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vluxei64.v v12, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x float> @llvm.riscv.vluxei.nxv4f32.nxv4i64( + <vscale x 4 x float> poison, + ptr %0, + <vscale x 4 x i64> %1, + i64 %2) + + ret <vscale x 4 x float> %a +} + +declare <vscale x 4 x float> @llvm.riscv.vluxei.mask.nxv4f32.nxv4i64( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i64>, + <vscale x 4 x i1>, + i64, + i64); + +define <vscale x 4 x float> @intrinsic_vluxei_mask_v_nxv4f32_nxv4f32_nxv4i64(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4f32_nxv4f32_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x float> @llvm.riscv.vluxei.mask.nxv4f32.nxv4i64( + <vscale x 4 x float> %0, + ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 4 x float> %a +} + +declare <vscale x 8 x float> @llvm.riscv.vluxei.nxv8f32.nxv8i64( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i64>, + i64); + +define <vscale x 8 x float> @intrinsic_vluxei_v_nxv8f32_nxv8f32_nxv8i64(ptr %0, <vscale x 8 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8f32_nxv8f32_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vluxei64.v v16, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x float> @llvm.riscv.vluxei.nxv8f32.nxv8i64( + <vscale x 8 x float> poison, + ptr %0, + <vscale x 8 x i64> %1, + i64 %2) + + ret <vscale x 8 x float> %a +} + +declare <vscale x 8 x float> @llvm.riscv.vluxei.mask.nxv8f32.nxv8i64( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i64>, + <vscale x 8 x i1>, + i64, + i64); + +define <vscale x 8 x float> @intrinsic_vluxei_mask_v_nxv8f32_nxv8f32_nxv8i64(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8f32_nxv8f32_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x float> @llvm.riscv.vluxei.mask.nxv8f32.nxv8i64( + <vscale x 8 x float> %0, + ptr %1, + <vscale x 8 x i64> %2, + <vscale x 8 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 8 x float> %a +} + +declare <vscale x 1 x double> 
@llvm.riscv.vluxei.nxv1f64.nxv1i64( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i64>, + i64); + +define <vscale x 1 x double> @intrinsic_vluxei_v_nxv1f64_nxv1f64_nxv1i64(ptr %0, <vscale x 1 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1f64_nxv1f64_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vluxei64.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x double> @llvm.riscv.vluxei.nxv1f64.nxv1i64( + <vscale x 1 x double> poison, + ptr %0, + <vscale x 1 x i64> %1, + i64 %2) + + ret <vscale x 1 x double> %a +} + +declare <vscale x 1 x double> @llvm.riscv.vluxei.mask.nxv1f64.nxv1i64( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64, + i64); + +define <vscale x 1 x double> @intrinsic_vluxei_mask_v_nxv1f64_nxv1f64_nxv1i64(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1f64_nxv1f64_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x double> @llvm.riscv.vluxei.mask.nxv1f64.nxv1i64( + <vscale x 1 x double> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 1 x double> %a +} + +declare <vscale x 2 x double> @llvm.riscv.vluxei.nxv2f64.nxv2i64( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i64>, + i64); + +define <vscale x 2 x double> @intrinsic_vluxei_v_nxv2f64_nxv2f64_nxv2i64(ptr %0, <vscale x 2 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2f64_nxv2f64_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vluxei64.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x double> @llvm.riscv.vluxei.nxv2f64.nxv2i64( + <vscale x 2 x double> poison, + ptr %0, + <vscale x 2 x i64> %1, + i64 %2) + + ret <vscale x 2 x double> %a +} + +declare <vscale x 2 x double> @llvm.riscv.vluxei.mask.nxv2f64.nxv2i64( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i64>, + <vscale x 2 x i1>, + i64, + i64); + +define <vscale x 2 x double> @intrinsic_vluxei_mask_v_nxv2f64_nxv2f64_nxv2i64(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2f64_nxv2f64_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x double> @llvm.riscv.vluxei.mask.nxv2f64.nxv2i64( + <vscale x 2 x double> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 2 x double> %a +} + +declare <vscale x 4 x double> @llvm.riscv.vluxei.nxv4f64.nxv4i64( + <vscale x 4 x double>, + ptr, + <vscale x 4 x i64>, + i64); + +define <vscale x 4 x double> @intrinsic_vluxei_v_nxv4f64_nxv4f64_nxv4i64(ptr %0, <vscale x 4 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4f64_nxv4f64_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vluxei64.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x double> @llvm.riscv.vluxei.nxv4f64.nxv4i64( + <vscale x 4 x double> poison, + ptr %0, + <vscale x 4 x i64> %1, + i64 %2) + + ret <vscale x 4 x double> %a +} + +declare <vscale x 4 x double> @llvm.riscv.vluxei.mask.nxv4f64.nxv4i64( + <vscale x 4 x double>, + ptr, + 
<vscale x 4 x i64>, + <vscale x 4 x i1>, + i64, + i64); + +define <vscale x 4 x double> @intrinsic_vluxei_mask_v_nxv4f64_nxv4f64_nxv4i64(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4f64_nxv4f64_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x double> @llvm.riscv.vluxei.mask.nxv4f64.nxv4i64( + <vscale x 4 x double> %0, + ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 4 x double> %a +} + +declare <vscale x 8 x double> @llvm.riscv.vluxei.nxv8f64.nxv8i64( + <vscale x 8 x double>, + ptr, + <vscale x 8 x i64>, + i64); + +define <vscale x 8 x double> @intrinsic_vluxei_v_nxv8f64_nxv8f64_nxv8i64(ptr %0, <vscale x 8 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8f64_nxv8f64_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vluxei64.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x double> @llvm.riscv.vluxei.nxv8f64.nxv8i64( + <vscale x 8 x double> poison, + ptr %0, + <vscale x 8 x i64> %1, + i64 %2) + + ret <vscale x 8 x double> %a +} + +declare <vscale x 8 x double> @llvm.riscv.vluxei.mask.nxv8f64.nxv8i64( + <vscale x 8 x double>, + ptr, + <vscale x 8 x i64>, + <vscale x 8 x i1>, + i64, + i64); + +define <vscale x 8 x double> @intrinsic_vluxei_mask_v_nxv8f64_nxv8f64_nxv8i64(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8f64_nxv8f64_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x double> @llvm.riscv.vluxei.mask.nxv8f64.nxv8i64( + <vscale x 8 x double> %0, + ptr %1, + <vscale x 8 x i64> %2, + <vscale x 8 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 8 x double> %a +} diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vluxei.ll b/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vluxei.ll new file mode 100644 index 0000000000000..8dd32a1d640dc --- /dev/null +++ b/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vluxei.ll @@ -0,0 +1,5100 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zvfhmin,+zvfbfmin \ +; RUN: -global-isel -verify-machineinstrs -target-abi=ilp32d | FileCheck %s +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zvfhmin,+zvfbfmin \ +; RUN: -global-isel -verify-machineinstrs -target-abi=lp64d | FileCheck %s + +declare <vscale x 1 x i8> @llvm.riscv.vluxei.nxv1i8.nxv1i32( + <vscale x 1 x i8>, + ptr, + <vscale x 1 x i32>, + iXLen); + +define <vscale x 1 x i8> @intrinsic_vluxei_v_nxv1i8_nxv1i8_nxv1i32(ptr %0, <vscale x 1 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1i8_nxv1i8_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vluxei32.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i8> @llvm.riscv.vluxei.nxv1i8.nxv1i32( + <vscale x 1 x i8> poison, + ptr %0, + <vscale x 1 x i32> %1, + iXLen %2) + + ret <vscale x 1 x i8> %a +} + +declare <vscale x 1 x i8> @llvm.riscv.vluxei.mask.nxv1i8.nxv1i32( + <vscale x 1 x i8>, + ptr, + <vscale x 1 x i32>, + <vscale x 1 x i1>, + iXLen, + iXLen); + 
+define <vscale x 1 x i8> @intrinsic_vluxei_mask_v_nxv1i8_nxv1i8_nxv1i32(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1i8_nxv1i8_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i8> @llvm.riscv.vluxei.mask.nxv1i8.nxv1i32( + <vscale x 1 x i8> %0, + ptr %1, + <vscale x 1 x i32> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x i8> %a +} + +declare <vscale x 2 x i8> @llvm.riscv.vluxei.nxv2i8.nxv2i32( + <vscale x 2 x i8>, + ptr, + <vscale x 2 x i32>, + iXLen); + +define <vscale x 2 x i8> @intrinsic_vluxei_v_nxv2i8_nxv2i8_nxv2i32(ptr %0, <vscale x 2 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2i8_nxv2i8_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vluxei32.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i8> @llvm.riscv.vluxei.nxv2i8.nxv2i32( + <vscale x 2 x i8> poison, + ptr %0, + <vscale x 2 x i32> %1, + iXLen %2) + + ret <vscale x 2 x i8> %a +} + +declare <vscale x 2 x i8> @llvm.riscv.vluxei.mask.nxv2i8.nxv2i32( + <vscale x 2 x i8>, + ptr, + <vscale x 2 x i32>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x i8> @intrinsic_vluxei_mask_v_nxv2i8_nxv2i8_nxv2i32(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2i8_nxv2i8_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i8> @llvm.riscv.vluxei.mask.nxv2i8.nxv2i32( + <vscale x 2 x i8> %0, + ptr %1, + <vscale x 2 x i32> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x i8> %a +} + +declare <vscale x 4 x i8> @llvm.riscv.vluxei.nxv4i8.nxv4i32( + <vscale x 4 x i8>, + ptr, + <vscale x 4 x i32>, + iXLen); + +define <vscale x 4 x i8> @intrinsic_vluxei_v_nxv4i8_nxv4i8_nxv4i32(ptr %0, <vscale x 4 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4i8_nxv4i8_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vluxei32.v v10, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i8> @llvm.riscv.vluxei.nxv4i8.nxv4i32( + <vscale x 4 x i8> poison, + ptr %0, + <vscale x 4 x i32> %1, + iXLen %2) + + ret <vscale x 4 x i8> %a +} + +declare <vscale x 4 x i8> @llvm.riscv.vluxei.mask.nxv4i8.nxv4i32( + <vscale x 4 x i8>, + ptr, + <vscale x 4 x i32>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x i8> @intrinsic_vluxei_mask_v_nxv4i8_nxv4i8_nxv4i32(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4i8_nxv4i8_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i8> @llvm.riscv.vluxei.mask.nxv4i8.nxv4i32( + <vscale x 4 x i8> %0, + ptr %1, + <vscale x 4 x i32> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x i8> %a +} + +declare <vscale x 8 x i8> @llvm.riscv.vluxei.nxv8i8.nxv8i32( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i32>, + iXLen); + +define <vscale x 8 x i8> 
@intrinsic_vluxei_v_nxv8i8_nxv8i8_nxv8i32(ptr %0, <vscale x 8 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8i8_nxv8i8_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vluxei32.v v12, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i8> @llvm.riscv.vluxei.nxv8i8.nxv8i32( + <vscale x 8 x i8> poison, + ptr %0, + <vscale x 8 x i32> %1, + iXLen %2) + + ret <vscale x 8 x i8> %a +} + +declare <vscale x 8 x i8> @llvm.riscv.vluxei.mask.nxv8i8.nxv8i32( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i32>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x i8> @intrinsic_vluxei_mask_v_nxv8i8_nxv8i8_nxv8i32(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8i8_nxv8i8_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i8> @llvm.riscv.vluxei.mask.nxv8i8.nxv8i32( + <vscale x 8 x i8> %0, + ptr %1, + <vscale x 8 x i32> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x i8> %a +} + +declare <vscale x 16 x i8> @llvm.riscv.vluxei.nxv16i8.nxv16i32( + <vscale x 16 x i8>, + ptr, + <vscale x 16 x i32>, + iXLen); + +define <vscale x 16 x i8> @intrinsic_vluxei_v_nxv16i8_nxv16i8_nxv16i32(ptr %0, <vscale x 16 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv16i8_nxv16i8_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma +; CHECK-NEXT: vluxei32.v v16, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i8> @llvm.riscv.vluxei.nxv16i8.nxv16i32( + <vscale x 16 x i8> poison, + ptr %0, + <vscale x 16 x i32> %1, + iXLen %2) + + ret <vscale x 16 x i8> %a +} + +declare <vscale x 16 x i8> @llvm.riscv.vluxei.mask.nxv16i8.nxv16i32( + <vscale x 16 x i8>, + ptr, + <vscale x 16 x i32>, + <vscale x 16 x i1>, + iXLen, + iXLen); + +define <vscale x 16 x i8> @intrinsic_vluxei_mask_v_nxv16i8_nxv16i8_nxv16i32(<vscale x 16 x i8> %0, ptr %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv16i8_nxv16i8_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i8> @llvm.riscv.vluxei.mask.nxv16i8.nxv16i32( + <vscale x 16 x i8> %0, + ptr %1, + <vscale x 16 x i32> %2, + <vscale x 16 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 16 x i8> %a +} + +declare <vscale x 1 x i16> @llvm.riscv.vluxei.nxv1i16.nxv1i32( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i32>, + iXLen); + +define <vscale x 1 x i16> @intrinsic_vluxei_v_nxv1i16_nxv1i16_nxv1i32(ptr %0, <vscale x 1 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1i16_nxv1i16_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vluxei32.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i16> @llvm.riscv.vluxei.nxv1i16.nxv1i32( + <vscale x 1 x i16> poison, + ptr %0, + <vscale x 1 x i32> %1, + iXLen %2) + + ret <vscale x 1 x i16> %a +} + +declare <vscale x 1 x i16> @llvm.riscv.vluxei.mask.nxv1i16.nxv1i32( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i32>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x i16> 
@intrinsic_vluxei_mask_v_nxv1i16_nxv1i16_nxv1i32(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1i16_nxv1i16_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i16> @llvm.riscv.vluxei.mask.nxv1i16.nxv1i32( + <vscale x 1 x i16> %0, + ptr %1, + <vscale x 1 x i32> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x i16> %a +} + +declare <vscale x 2 x i16> @llvm.riscv.vluxei.nxv2i16.nxv2i32( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i32>, + iXLen); + +define <vscale x 2 x i16> @intrinsic_vluxei_v_nxv2i16_nxv2i16_nxv2i32(ptr %0, <vscale x 2 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2i16_nxv2i16_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vluxei32.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i16> @llvm.riscv.vluxei.nxv2i16.nxv2i32( + <vscale x 2 x i16> poison, + ptr %0, + <vscale x 2 x i32> %1, + iXLen %2) + + ret <vscale x 2 x i16> %a +} + +declare <vscale x 2 x i16> @llvm.riscv.vluxei.mask.nxv2i16.nxv2i32( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i32>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x i16> @intrinsic_vluxei_mask_v_nxv2i16_nxv2i16_nxv2i32(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2i16_nxv2i16_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i16> @llvm.riscv.vluxei.mask.nxv2i16.nxv2i32( + <vscale x 2 x i16> %0, + ptr %1, + <vscale x 2 x i32> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x i16> %a +} + +declare <vscale x 4 x i16> @llvm.riscv.vluxei.nxv4i16.nxv4i32( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i32>, + iXLen); + +define <vscale x 4 x i16> @intrinsic_vluxei_v_nxv4i16_nxv4i16_nxv4i32(ptr %0, <vscale x 4 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4i16_nxv4i16_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vluxei32.v v10, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i16> @llvm.riscv.vluxei.nxv4i16.nxv4i32( + <vscale x 4 x i16> poison, + ptr %0, + <vscale x 4 x i32> %1, + iXLen %2) + + ret <vscale x 4 x i16> %a +} + +declare <vscale x 4 x i16> @llvm.riscv.vluxei.mask.nxv4i16.nxv4i32( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i32>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x i16> @intrinsic_vluxei_mask_v_nxv4i16_nxv4i16_nxv4i32(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4i16_nxv4i16_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i16> @llvm.riscv.vluxei.mask.nxv4i16.nxv4i32( + <vscale x 4 x i16> %0, + ptr %1, + <vscale x 4 x i32> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x i16> %a +} + +declare <vscale x 8 x i16> @llvm.riscv.vluxei.nxv8i16.nxv8i32( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i32>, + iXLen); + 
+define <vscale x 8 x i16> @intrinsic_vluxei_v_nxv8i16_nxv8i16_nxv8i32(ptr %0, <vscale x 8 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8i16_nxv8i16_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vluxei32.v v12, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i16> @llvm.riscv.vluxei.nxv8i16.nxv8i32( + <vscale x 8 x i16> poison, + ptr %0, + <vscale x 8 x i32> %1, + iXLen %2) + + ret <vscale x 8 x i16> %a +} + +declare <vscale x 8 x i16> @llvm.riscv.vluxei.mask.nxv8i16.nxv8i32( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i32>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x i16> @intrinsic_vluxei_mask_v_nxv8i16_nxv8i16_nxv8i32(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8i16_nxv8i16_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i16> @llvm.riscv.vluxei.mask.nxv8i16.nxv8i32( + <vscale x 8 x i16> %0, + ptr %1, + <vscale x 8 x i32> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x i16> %a +} + +declare <vscale x 16 x i16> @llvm.riscv.vluxei.nxv16i16.nxv16i32( + <vscale x 16 x i16>, + ptr, + <vscale x 16 x i32>, + iXLen); + +define <vscale x 16 x i16> @intrinsic_vluxei_v_nxv16i16_nxv16i16_nxv16i32(ptr %0, <vscale x 16 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv16i16_nxv16i16_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vluxei32.v v16, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i16> @llvm.riscv.vluxei.nxv16i16.nxv16i32( + <vscale x 16 x i16> poison, + ptr %0, + <vscale x 16 x i32> %1, + iXLen %2) + + ret <vscale x 16 x i16> %a +} + +declare <vscale x 16 x i16> @llvm.riscv.vluxei.mask.nxv16i16.nxv16i32( + <vscale x 16 x i16>, + ptr, + <vscale x 16 x i32>, + <vscale x 16 x i1>, + iXLen, + iXLen); + +define <vscale x 16 x i16> @intrinsic_vluxei_mask_v_nxv16i16_nxv16i16_nxv16i32(<vscale x 16 x i16> %0, ptr %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv16i16_nxv16i16_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i16> @llvm.riscv.vluxei.mask.nxv16i16.nxv16i32( + <vscale x 16 x i16> %0, + ptr %1, + <vscale x 16 x i32> %2, + <vscale x 16 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 16 x i16> %a +} + +declare <vscale x 1 x i32> @llvm.riscv.vluxei.nxv1i32.nxv1i32( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i32>, + iXLen); + +define <vscale x 1 x i32> @intrinsic_vluxei_v_nxv1i32_nxv1i32_nxv1i32(ptr %0, <vscale x 1 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1i32_nxv1i32_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vluxei32.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i32> @llvm.riscv.vluxei.nxv1i32.nxv1i32( + <vscale x 1 x i32> poison, + ptr %0, + <vscale x 1 x i32> %1, + iXLen %2) + + ret <vscale x 1 x i32> %a +} + +declare <vscale x 1 x i32> @llvm.riscv.vluxei.mask.nxv1i32.nxv1i32( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i32>, + <vscale x 1 x i1>, + iXLen, + 
iXLen); + +define <vscale x 1 x i32> @intrinsic_vluxei_mask_v_nxv1i32_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1i32_nxv1i32_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i32> @llvm.riscv.vluxei.mask.nxv1i32.nxv1i32( + <vscale x 1 x i32> %0, + ptr %1, + <vscale x 1 x i32> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x i32> %a +} + +declare <vscale x 2 x i32> @llvm.riscv.vluxei.nxv2i32.nxv2i32( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i32>, + iXLen); + +define <vscale x 2 x i32> @intrinsic_vluxei_v_nxv2i32_nxv2i32_nxv2i32(ptr %0, <vscale x 2 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2i32_nxv2i32_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vluxei32.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i32> @llvm.riscv.vluxei.nxv2i32.nxv2i32( + <vscale x 2 x i32> poison, + ptr %0, + <vscale x 2 x i32> %1, + iXLen %2) + + ret <vscale x 2 x i32> %a +} + +declare <vscale x 2 x i32> @llvm.riscv.vluxei.mask.nxv2i32.nxv2i32( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i32>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x i32> @intrinsic_vluxei_mask_v_nxv2i32_nxv2i32_nxv2i32(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2i32_nxv2i32_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i32> @llvm.riscv.vluxei.mask.nxv2i32.nxv2i32( + <vscale x 2 x i32> %0, + ptr %1, + <vscale x 2 x i32> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x i32> %a +} + +declare <vscale x 4 x i32> @llvm.riscv.vluxei.nxv4i32.nxv4i32( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i32>, + iXLen); + +define <vscale x 4 x i32> @intrinsic_vluxei_v_nxv4i32_nxv4i32_nxv4i32(ptr %0, <vscale x 4 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4i32_nxv4i32_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vluxei32.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i32> @llvm.riscv.vluxei.nxv4i32.nxv4i32( + <vscale x 4 x i32> poison, + ptr %0, + <vscale x 4 x i32> %1, + iXLen %2) + + ret <vscale x 4 x i32> %a +} + +declare <vscale x 4 x i32> @llvm.riscv.vluxei.mask.nxv4i32.nxv4i32( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i32>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x i32> @intrinsic_vluxei_mask_v_nxv4i32_nxv4i32_nxv4i32(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4i32_nxv4i32_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i32> @llvm.riscv.vluxei.mask.nxv4i32.nxv4i32( + <vscale x 4 x i32> %0, + ptr %1, + <vscale x 4 x i32> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x i32> %a +} + +declare <vscale x 8 x i32> @llvm.riscv.vluxei.nxv8i32.nxv8i32( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i32>, + iXLen); + +define <vscale x 8 x i32> 
@intrinsic_vluxei_v_nxv8i32_nxv8i32_nxv8i32(ptr %0, <vscale x 8 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8i32_nxv8i32_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vluxei32.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i32> @llvm.riscv.vluxei.nxv8i32.nxv8i32( + <vscale x 8 x i32> poison, + ptr %0, + <vscale x 8 x i32> %1, + iXLen %2) + + ret <vscale x 8 x i32> %a +} + +declare <vscale x 8 x i32> @llvm.riscv.vluxei.mask.nxv8i32.nxv8i32( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i32>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x i32> @intrinsic_vluxei_mask_v_nxv8i32_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8i32_nxv8i32_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i32> @llvm.riscv.vluxei.mask.nxv8i32.nxv8i32( + <vscale x 8 x i32> %0, + ptr %1, + <vscale x 8 x i32> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x i32> %a +} + +declare <vscale x 16 x i32> @llvm.riscv.vluxei.nxv16i32.nxv16i32( + <vscale x 16 x i32>, + ptr, + <vscale x 16 x i32>, + iXLen); + +define <vscale x 16 x i32> @intrinsic_vluxei_v_nxv16i32_nxv16i32_nxv16i32(ptr %0, <vscale x 16 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv16i32_nxv16i32_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vluxei32.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i32> @llvm.riscv.vluxei.nxv16i32.nxv16i32( + <vscale x 16 x i32> poison, + ptr %0, + <vscale x 16 x i32> %1, + iXLen %2) + + ret <vscale x 16 x i32> %a +} + +declare <vscale x 16 x i32> @llvm.riscv.vluxei.mask.nxv16i32.nxv16i32( + <vscale x 16 x i32>, + ptr, + <vscale x 16 x i32>, + <vscale x 16 x i1>, + iXLen, + iXLen); + +define <vscale x 16 x i32> @intrinsic_vluxei_mask_v_nxv16i32_nxv16i32_nxv16i32(<vscale x 16 x i32> %0, ptr %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv16i32_nxv16i32_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i32> @llvm.riscv.vluxei.mask.nxv16i32.nxv16i32( + <vscale x 16 x i32> %0, + ptr %1, + <vscale x 16 x i32> %2, + <vscale x 16 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 16 x i32> %a +} + +declare <vscale x 1 x i64> @llvm.riscv.vluxei.nxv1i64.nxv1i32( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i32>, + iXLen); + +define <vscale x 1 x i64> @intrinsic_vluxei_v_nxv1i64_nxv1i64_nxv1i32(ptr %0, <vscale x 1 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1i64_nxv1i64_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vluxei32.v v9, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i64> @llvm.riscv.vluxei.nxv1i64.nxv1i32( + <vscale x 1 x i64> poison, + ptr %0, + <vscale x 1 x i32> %1, + iXLen %2) + + ret <vscale x 1 x i64> %a +} + +declare <vscale x 1 x i64> @llvm.riscv.vluxei.mask.nxv1i64.nxv1i32( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i32>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x i64> 
@intrinsic_vluxei_mask_v_nxv1i64_nxv1i64_nxv1i32(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1i64_nxv1i64_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i64> @llvm.riscv.vluxei.mask.nxv1i64.nxv1i32( + <vscale x 1 x i64> %0, + ptr %1, + <vscale x 1 x i32> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x i64> %a +} + +declare <vscale x 2 x i64> @llvm.riscv.vluxei.nxv2i64.nxv2i32( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i32>, + iXLen); + +define <vscale x 2 x i64> @intrinsic_vluxei_v_nxv2i64_nxv2i64_nxv2i32(ptr %0, <vscale x 2 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2i64_nxv2i64_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vluxei32.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i64> @llvm.riscv.vluxei.nxv2i64.nxv2i32( + <vscale x 2 x i64> poison, + ptr %0, + <vscale x 2 x i32> %1, + iXLen %2) + + ret <vscale x 2 x i64> %a +} + +declare <vscale x 2 x i64> @llvm.riscv.vluxei.mask.nxv2i64.nxv2i32( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i32>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x i64> @intrinsic_vluxei_mask_v_nxv2i64_nxv2i64_nxv2i32(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2i64_nxv2i64_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i64> @llvm.riscv.vluxei.mask.nxv2i64.nxv2i32( + <vscale x 2 x i64> %0, + ptr %1, + <vscale x 2 x i32> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x i64> %a +} + +declare <vscale x 4 x i64> @llvm.riscv.vluxei.nxv4i64.nxv4i32( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i32>, + iXLen); + +define <vscale x 4 x i64> @intrinsic_vluxei_v_nxv4i64_nxv4i64_nxv4i32(ptr %0, <vscale x 4 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4i64_nxv4i64_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vluxei32.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i64> @llvm.riscv.vluxei.nxv4i64.nxv4i32( + <vscale x 4 x i64> poison, + ptr %0, + <vscale x 4 x i32> %1, + iXLen %2) + + ret <vscale x 4 x i64> %a +} + +declare <vscale x 4 x i64> @llvm.riscv.vluxei.mask.nxv4i64.nxv4i32( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i32>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x i64> @intrinsic_vluxei_mask_v_nxv4i64_nxv4i64_nxv4i32(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4i64_nxv4i64_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i64> @llvm.riscv.vluxei.mask.nxv4i64.nxv4i32( + <vscale x 4 x i64> %0, + ptr %1, + <vscale x 4 x i32> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x i64> %a +} + +declare <vscale x 8 x i64> 
@llvm.riscv.vluxei.nxv8i64.nxv8i32( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i32>, + iXLen); + +define <vscale x 8 x i64> @intrinsic_vluxei_v_nxv8i64_nxv8i64_nxv8i32(ptr %0, <vscale x 8 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8i64_nxv8i64_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv4r.v v16, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vluxei32.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i64> @llvm.riscv.vluxei.nxv8i64.nxv8i32( + <vscale x 8 x i64> poison, + ptr %0, + <vscale x 8 x i32> %1, + iXLen %2) + + ret <vscale x 8 x i64> %a +} + +declare <vscale x 8 x i64> @llvm.riscv.vluxei.mask.nxv8i64.nxv8i32( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i32>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x i64> @intrinsic_vluxei_mask_v_nxv8i64_nxv8i64_nxv8i32(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8i64_nxv8i64_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i64> @llvm.riscv.vluxei.mask.nxv8i64.nxv8i32( + <vscale x 8 x i64> %0, + ptr %1, + <vscale x 8 x i32> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x i64> %a +} + +declare <vscale x 1 x half> @llvm.riscv.vluxei.nxv1f16.nxv1i32( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i32>, + iXLen); + +define <vscale x 1 x half> @intrinsic_vluxei_v_nxv1f16_nxv1f16_nxv1i32(ptr %0, <vscale x 1 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1f16_nxv1f16_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vluxei32.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x half> @llvm.riscv.vluxei.nxv1f16.nxv1i32( + <vscale x 1 x half> poison, + ptr %0, + <vscale x 1 x i32> %1, + iXLen %2) + + ret <vscale x 1 x half> %a +} + +declare <vscale x 1 x half> @llvm.riscv.vluxei.mask.nxv1f16.nxv1i32( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i32>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x half> @intrinsic_vluxei_mask_v_nxv1f16_nxv1f16_nxv1i32(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1f16_nxv1f16_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x half> @llvm.riscv.vluxei.mask.nxv1f16.nxv1i32( + <vscale x 1 x half> %0, + ptr %1, + <vscale x 1 x i32> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x half> %a +} + +declare <vscale x 2 x half> @llvm.riscv.vluxei.nxv2f16.nxv2i32( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i32>, + iXLen); + +define <vscale x 2 x half> @intrinsic_vluxei_v_nxv2f16_nxv2f16_nxv2i32(ptr %0, <vscale x 2 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2f16_nxv2f16_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vluxei32.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x half> @llvm.riscv.vluxei.nxv2f16.nxv2i32( + <vscale x 2 x half> poison, + ptr %0, + <vscale x 2 x i32> %1, + iXLen %2) + + ret <vscale x 2 x half> %a +} + 
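+; Note on the recurring structure of these tests (a summary comment added for
+; readability; the CHECK lines themselves come from update_llc_test_checks.py):
+; unmasked tests pass a poison passthru, so the expected vsetvli uses the
+; "ta, ma" form, while masked tests pass the policy operand iXLen 1 (tail
+; agnostic, mask undisturbed), matching "ta, mu". Where the index EEW differs
+; from the data EEW, the destination and the index operand occupy distinct
+; register groups, which is why some expected sequences carry an extra
+; vmv1r.v or vmv.v.v copy.
+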
+declare <vscale x 2 x half> @llvm.riscv.vluxei.mask.nxv2f16.nxv2i32( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i32>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x half> @intrinsic_vluxei_mask_v_nxv2f16_nxv2f16_nxv2i32(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2f16_nxv2f16_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x half> @llvm.riscv.vluxei.mask.nxv2f16.nxv2i32( + <vscale x 2 x half> %0, + ptr %1, + <vscale x 2 x i32> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x half> %a +} + +declare <vscale x 4 x half> @llvm.riscv.vluxei.nxv4f16.nxv4i32( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i32>, + iXLen); + +define <vscale x 4 x half> @intrinsic_vluxei_v_nxv4f16_nxv4f16_nxv4i32(ptr %0, <vscale x 4 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4f16_nxv4f16_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vluxei32.v v10, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x half> @llvm.riscv.vluxei.nxv4f16.nxv4i32( + <vscale x 4 x half> poison, + ptr %0, + <vscale x 4 x i32> %1, + iXLen %2) + + ret <vscale x 4 x half> %a +} + +declare <vscale x 4 x half> @llvm.riscv.vluxei.mask.nxv4f16.nxv4i32( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i32>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x half> @intrinsic_vluxei_mask_v_nxv4f16_nxv4f16_nxv4i32(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4f16_nxv4f16_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x half> @llvm.riscv.vluxei.mask.nxv4f16.nxv4i32( + <vscale x 4 x half> %0, + ptr %1, + <vscale x 4 x i32> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x half> %a +} + +declare <vscale x 8 x half> @llvm.riscv.vluxei.nxv8f16.nxv8i32( + <vscale x 8 x half>, + ptr, + <vscale x 8 x i32>, + iXLen); + +define <vscale x 8 x half> @intrinsic_vluxei_v_nxv8f16_nxv8f16_nxv8i32(ptr %0, <vscale x 8 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8f16_nxv8f16_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vluxei32.v v12, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x half> @llvm.riscv.vluxei.nxv8f16.nxv8i32( + <vscale x 8 x half> poison, + ptr %0, + <vscale x 8 x i32> %1, + iXLen %2) + + ret <vscale x 8 x half> %a +} + +declare <vscale x 8 x half> @llvm.riscv.vluxei.mask.nxv8f16.nxv8i32( + <vscale x 8 x half>, + ptr, + <vscale x 8 x i32>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x half> @intrinsic_vluxei_mask_v_nxv8f16_nxv8f16_nxv8i32(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8f16_nxv8f16_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x half> @llvm.riscv.vluxei.mask.nxv8f16.nxv8i32( + <vscale x 8 x half> %0, + ptr %1, + <vscale x 8 x 
i32> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x half> %a +} + +declare <vscale x 16 x half> @llvm.riscv.vluxei.nxv16f16.nxv16i32( + <vscale x 16 x half>, + ptr, + <vscale x 16 x i32>, + iXLen); + +define <vscale x 16 x half> @intrinsic_vluxei_v_nxv16f16_nxv16f16_nxv16i32(ptr %0, <vscale x 16 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv16f16_nxv16f16_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vluxei32.v v16, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x half> @llvm.riscv.vluxei.nxv16f16.nxv16i32( + <vscale x 16 x half> poison, + ptr %0, + <vscale x 16 x i32> %1, + iXLen %2) + + ret <vscale x 16 x half> %a +} + +declare <vscale x 16 x half> @llvm.riscv.vluxei.mask.nxv16f16.nxv16i32( + <vscale x 16 x half>, + ptr, + <vscale x 16 x i32>, + <vscale x 16 x i1>, + iXLen, + iXLen); + +define <vscale x 16 x half> @intrinsic_vluxei_mask_v_nxv16f16_nxv16f16_nxv16i32(<vscale x 16 x half> %0, ptr %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv16f16_nxv16f16_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x half> @llvm.riscv.vluxei.mask.nxv16f16.nxv16i32( + <vscale x 16 x half> %0, + ptr %1, + <vscale x 16 x i32> %2, + <vscale x 16 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 16 x half> %a +} + +declare <vscale x 1 x float> @llvm.riscv.vluxei.nxv1f32.nxv1i32( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i32>, + iXLen); + +define <vscale x 1 x float> @intrinsic_vluxei_v_nxv1f32_nxv1f32_nxv1i32(ptr %0, <vscale x 1 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1f32_nxv1f32_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vluxei32.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x float> @llvm.riscv.vluxei.nxv1f32.nxv1i32( + <vscale x 1 x float> poison, + ptr %0, + <vscale x 1 x i32> %1, + iXLen %2) + + ret <vscale x 1 x float> %a +} + +declare <vscale x 1 x float> @llvm.riscv.vluxei.mask.nxv1f32.nxv1i32( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i32>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x float> @intrinsic_vluxei_mask_v_nxv1f32_nxv1f32_nxv1i32(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1f32_nxv1f32_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x float> @llvm.riscv.vluxei.mask.nxv1f32.nxv1i32( + <vscale x 1 x float> %0, + ptr %1, + <vscale x 1 x i32> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x float> %a +} + +declare <vscale x 2 x float> @llvm.riscv.vluxei.nxv2f32.nxv2i32( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i32>, + iXLen); + +define <vscale x 2 x float> @intrinsic_vluxei_v_nxv2f32_nxv2f32_nxv2i32(ptr %0, <vscale x 2 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2f32_nxv2f32_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vluxei32.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x float> @llvm.riscv.vluxei.nxv2f32.nxv2i32( + <vscale x 2 x float> poison, + ptr 
%0, + <vscale x 2 x i32> %1, + iXLen %2) + + ret <vscale x 2 x float> %a +} + +declare <vscale x 2 x float> @llvm.riscv.vluxei.mask.nxv2f32.nxv2i32( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i32>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x float> @intrinsic_vluxei_mask_v_nxv2f32_nxv2f32_nxv2i32(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2f32_nxv2f32_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x float> @llvm.riscv.vluxei.mask.nxv2f32.nxv2i32( + <vscale x 2 x float> %0, + ptr %1, + <vscale x 2 x i32> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x float> %a +} + +declare <vscale x 4 x float> @llvm.riscv.vluxei.nxv4f32.nxv4i32( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i32>, + iXLen); + +define <vscale x 4 x float> @intrinsic_vluxei_v_nxv4f32_nxv4f32_nxv4i32(ptr %0, <vscale x 4 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4f32_nxv4f32_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vluxei32.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x float> @llvm.riscv.vluxei.nxv4f32.nxv4i32( + <vscale x 4 x float> poison, + ptr %0, + <vscale x 4 x i32> %1, + iXLen %2) + + ret <vscale x 4 x float> %a +} + +declare <vscale x 4 x float> @llvm.riscv.vluxei.mask.nxv4f32.nxv4i32( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i32>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x float> @intrinsic_vluxei_mask_v_nxv4f32_nxv4f32_nxv4i32(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4f32_nxv4f32_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x float> @llvm.riscv.vluxei.mask.nxv4f32.nxv4i32( + <vscale x 4 x float> %0, + ptr %1, + <vscale x 4 x i32> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x float> %a +} + +declare <vscale x 8 x float> @llvm.riscv.vluxei.nxv8f32.nxv8i32( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i32>, + iXLen); + +define <vscale x 8 x float> @intrinsic_vluxei_v_nxv8f32_nxv8f32_nxv8i32(ptr %0, <vscale x 8 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8f32_nxv8f32_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vluxei32.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x float> @llvm.riscv.vluxei.nxv8f32.nxv8i32( + <vscale x 8 x float> poison, + ptr %0, + <vscale x 8 x i32> %1, + iXLen %2) + + ret <vscale x 8 x float> %a +} + +declare <vscale x 8 x float> @llvm.riscv.vluxei.mask.nxv8f32.nxv8i32( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i32>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x float> @intrinsic_vluxei_mask_v_nxv8f32_nxv8f32_nxv8i32(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8f32_nxv8f32_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x float> @llvm.riscv.vluxei.mask.nxv8f32.nxv8i32( + 
<vscale x 8 x float> %0, + ptr %1, + <vscale x 8 x i32> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x float> %a +} + +declare <vscale x 16 x float> @llvm.riscv.vluxei.nxv16f32.nxv16i32( + <vscale x 16 x float>, + ptr, + <vscale x 16 x i32>, + iXLen); + +define <vscale x 16 x float> @intrinsic_vluxei_v_nxv16f32_nxv16f32_nxv16i32(ptr %0, <vscale x 16 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv16f32_nxv16f32_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vluxei32.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x float> @llvm.riscv.vluxei.nxv16f32.nxv16i32( + <vscale x 16 x float> poison, + ptr %0, + <vscale x 16 x i32> %1, + iXLen %2) + + ret <vscale x 16 x float> %a +} + +declare <vscale x 16 x float> @llvm.riscv.vluxei.mask.nxv16f32.nxv16i32( + <vscale x 16 x float>, + ptr, + <vscale x 16 x i32>, + <vscale x 16 x i1>, + iXLen, + iXLen); + +define <vscale x 16 x float> @intrinsic_vluxei_mask_v_nxv16f32_nxv16f32_nxv16i32(<vscale x 16 x float> %0, ptr %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv16f32_nxv16f32_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x float> @llvm.riscv.vluxei.mask.nxv16f32.nxv16i32( + <vscale x 16 x float> %0, + ptr %1, + <vscale x 16 x i32> %2, + <vscale x 16 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 16 x float> %a +} + +declare <vscale x 1 x double> @llvm.riscv.vluxei.nxv1f64.nxv1i32( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i32>, + iXLen); + +define <vscale x 1 x double> @intrinsic_vluxei_v_nxv1f64_nxv1f64_nxv1i32(ptr %0, <vscale x 1 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1f64_nxv1f64_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vluxei32.v v9, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x double> @llvm.riscv.vluxei.nxv1f64.nxv1i32( + <vscale x 1 x double> poison, + ptr %0, + <vscale x 1 x i32> %1, + iXLen %2) + + ret <vscale x 1 x double> %a +} + +declare <vscale x 1 x double> @llvm.riscv.vluxei.mask.nxv1f64.nxv1i32( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i32>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x double> @intrinsic_vluxei_mask_v_nxv1f64_nxv1f64_nxv1i32(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1f64_nxv1f64_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x double> @llvm.riscv.vluxei.mask.nxv1f64.nxv1i32( + <vscale x 1 x double> %0, + ptr %1, + <vscale x 1 x i32> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x double> %a +} + +declare <vscale x 2 x double> @llvm.riscv.vluxei.nxv2f64.nxv2i32( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i32>, + iXLen); + +define <vscale x 2 x double> @intrinsic_vluxei_v_nxv2f64_nxv2f64_nxv2i32(ptr %0, <vscale x 2 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2f64_nxv2f64_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: 
vluxei32.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x double> @llvm.riscv.vluxei.nxv2f64.nxv2i32( + <vscale x 2 x double> poison, + ptr %0, + <vscale x 2 x i32> %1, + iXLen %2) + + ret <vscale x 2 x double> %a +} + +declare <vscale x 2 x double> @llvm.riscv.vluxei.mask.nxv2f64.nxv2i32( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i32>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x double> @intrinsic_vluxei_mask_v_nxv2f64_nxv2f64_nxv2i32(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2f64_nxv2f64_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x double> @llvm.riscv.vluxei.mask.nxv2f64.nxv2i32( + <vscale x 2 x double> %0, + ptr %1, + <vscale x 2 x i32> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x double> %a +} + +declare <vscale x 4 x double> @llvm.riscv.vluxei.nxv4f64.nxv4i32( + <vscale x 4 x double>, + ptr, + <vscale x 4 x i32>, + iXLen); + +define <vscale x 4 x double> @intrinsic_vluxei_v_nxv4f64_nxv4f64_nxv4i32(ptr %0, <vscale x 4 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4f64_nxv4f64_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vluxei32.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x double> @llvm.riscv.vluxei.nxv4f64.nxv4i32( + <vscale x 4 x double> poison, + ptr %0, + <vscale x 4 x i32> %1, + iXLen %2) + + ret <vscale x 4 x double> %a +} + +declare <vscale x 4 x double> @llvm.riscv.vluxei.mask.nxv4f64.nxv4i32( + <vscale x 4 x double>, + ptr, + <vscale x 4 x i32>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x double> @intrinsic_vluxei_mask_v_nxv4f64_nxv4f64_nxv4i32(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4f64_nxv4f64_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x double> @llvm.riscv.vluxei.mask.nxv4f64.nxv4i32( + <vscale x 4 x double> %0, + ptr %1, + <vscale x 4 x i32> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x double> %a +} + +declare <vscale x 8 x double> @llvm.riscv.vluxei.nxv8f64.nxv8i32( + <vscale x 8 x double>, + ptr, + <vscale x 8 x i32>, + iXLen); + +define <vscale x 8 x double> @intrinsic_vluxei_v_nxv8f64_nxv8f64_nxv8i32(ptr %0, <vscale x 8 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8f64_nxv8f64_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv4r.v v16, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vluxei32.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x double> @llvm.riscv.vluxei.nxv8f64.nxv8i32( + <vscale x 8 x double> poison, + ptr %0, + <vscale x 8 x i32> %1, + iXLen %2) + + ret <vscale x 8 x double> %a +} + +declare <vscale x 8 x double> @llvm.riscv.vluxei.mask.nxv8f64.nxv8i32( + <vscale x 8 x double>, + ptr, + <vscale x 8 x i32>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x double> @intrinsic_vluxei_mask_v_nxv8f64_nxv8f64_nxv8i32(<vscale x 8 x double> %0, ptr %1, <vscale x 
8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8f64_nxv8f64_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x double> @llvm.riscv.vluxei.mask.nxv8f64.nxv8i32( + <vscale x 8 x double> %0, + ptr %1, + <vscale x 8 x i32> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x double> %a +} + +declare <vscale x 1 x i8> @llvm.riscv.vluxei.nxv1i8.nxv1i16( + <vscale x 1 x i8>, + ptr, + <vscale x 1 x i16>, + iXLen); + +define <vscale x 1 x i8> @intrinsic_vluxei_v_nxv1i8_nxv1i8_nxv1i16(ptr %0, <vscale x 1 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1i8_nxv1i8_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vluxei16.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i8> @llvm.riscv.vluxei.nxv1i8.nxv1i16( + <vscale x 1 x i8> poison, + ptr %0, + <vscale x 1 x i16> %1, + iXLen %2) + + ret <vscale x 1 x i8> %a +} + +declare <vscale x 1 x i8> @llvm.riscv.vluxei.mask.nxv1i8.nxv1i16( + <vscale x 1 x i8>, + ptr, + <vscale x 1 x i16>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x i8> @intrinsic_vluxei_mask_v_nxv1i8_nxv1i8_nxv1i16(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1i8_nxv1i8_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i8> @llvm.riscv.vluxei.mask.nxv1i8.nxv1i16( + <vscale x 1 x i8> %0, + ptr %1, + <vscale x 1 x i16> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x i8> %a +} + +declare <vscale x 2 x i8> @llvm.riscv.vluxei.nxv2i8.nxv2i16( + <vscale x 2 x i8>, + ptr, + <vscale x 2 x i16>, + iXLen); + +define <vscale x 2 x i8> @intrinsic_vluxei_v_nxv2i8_nxv2i8_nxv2i16(ptr %0, <vscale x 2 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2i8_nxv2i8_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vluxei16.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i8> @llvm.riscv.vluxei.nxv2i8.nxv2i16( + <vscale x 2 x i8> poison, + ptr %0, + <vscale x 2 x i16> %1, + iXLen %2) + + ret <vscale x 2 x i8> %a +} + +declare <vscale x 2 x i8> @llvm.riscv.vluxei.mask.nxv2i8.nxv2i16( + <vscale x 2 x i8>, + ptr, + <vscale x 2 x i16>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x i8> @intrinsic_vluxei_mask_v_nxv2i8_nxv2i8_nxv2i16(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2i8_nxv2i8_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i8> @llvm.riscv.vluxei.mask.nxv2i8.nxv2i16( + <vscale x 2 x i8> %0, + ptr %1, + <vscale x 2 x i16> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x i8> %a +} + +declare <vscale x 4 x i8> @llvm.riscv.vluxei.nxv4i8.nxv4i16( + <vscale x 4 x i8>, + ptr, + <vscale x 4 x i16>, + iXLen); + +define <vscale x 4 x i8> @intrinsic_vluxei_v_nxv4i8_nxv4i8_nxv4i16(ptr %0, <vscale x 4 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: 
intrinsic_vluxei_v_nxv4i8_nxv4i8_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vluxei16.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i8> @llvm.riscv.vluxei.nxv4i8.nxv4i16( + <vscale x 4 x i8> poison, + ptr %0, + <vscale x 4 x i16> %1, + iXLen %2) + + ret <vscale x 4 x i8> %a +} + +declare <vscale x 4 x i8> @llvm.riscv.vluxei.mask.nxv4i8.nxv4i16( + <vscale x 4 x i8>, + ptr, + <vscale x 4 x i16>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x i8> @intrinsic_vluxei_mask_v_nxv4i8_nxv4i8_nxv4i16(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4i8_nxv4i8_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i8> @llvm.riscv.vluxei.mask.nxv4i8.nxv4i16( + <vscale x 4 x i8> %0, + ptr %1, + <vscale x 4 x i16> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x i8> %a +} + +declare <vscale x 8 x i8> @llvm.riscv.vluxei.nxv8i8.nxv8i16( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i16>, + iXLen); + +define <vscale x 8 x i8> @intrinsic_vluxei_v_nxv8i8_nxv8i8_nxv8i16(ptr %0, <vscale x 8 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8i8_nxv8i8_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vluxei16.v v10, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i8> @llvm.riscv.vluxei.nxv8i8.nxv8i16( + <vscale x 8 x i8> poison, + ptr %0, + <vscale x 8 x i16> %1, + iXLen %2) + + ret <vscale x 8 x i8> %a +} + +declare <vscale x 8 x i8> @llvm.riscv.vluxei.mask.nxv8i8.nxv8i16( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i16>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x i8> @intrinsic_vluxei_mask_v_nxv8i8_nxv8i8_nxv8i16(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8i8_nxv8i8_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i8> @llvm.riscv.vluxei.mask.nxv8i8.nxv8i16( + <vscale x 8 x i8> %0, + ptr %1, + <vscale x 8 x i16> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x i8> %a +} + +declare <vscale x 16 x i8> @llvm.riscv.vluxei.nxv16i8.nxv16i16( + <vscale x 16 x i8>, + ptr, + <vscale x 16 x i16>, + iXLen); + +define <vscale x 16 x i8> @intrinsic_vluxei_v_nxv16i8_nxv16i8_nxv16i16(ptr %0, <vscale x 16 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv16i8_nxv16i8_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma +; CHECK-NEXT: vluxei16.v v12, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i8> @llvm.riscv.vluxei.nxv16i8.nxv16i16( + <vscale x 16 x i8> poison, + ptr %0, + <vscale x 16 x i16> %1, + iXLen %2) + + ret <vscale x 16 x i8> %a +} + +declare <vscale x 16 x i8> @llvm.riscv.vluxei.mask.nxv16i8.nxv16i16( + <vscale x 16 x i8>, + ptr, + <vscale x 16 x i16>, + <vscale x 16 x i1>, + iXLen, + iXLen); + +define <vscale x 16 x i8> @intrinsic_vluxei_mask_v_nxv16i8_nxv16i8_nxv16i16(<vscale x 16 x i8> %0, ptr %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; 
CHECK-LABEL: intrinsic_vluxei_mask_v_nxv16i8_nxv16i8_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i8> @llvm.riscv.vluxei.mask.nxv16i8.nxv16i16( + <vscale x 16 x i8> %0, + ptr %1, + <vscale x 16 x i16> %2, + <vscale x 16 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 16 x i8> %a +} + +declare <vscale x 32 x i8> @llvm.riscv.vluxei.nxv32i8.nxv32i16( + <vscale x 32 x i8>, + ptr, + <vscale x 32 x i16>, + iXLen); + +define <vscale x 32 x i8> @intrinsic_vluxei_v_nxv32i8_nxv32i8_nxv32i16(ptr %0, <vscale x 32 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv32i8_nxv32i8_nxv32i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma +; CHECK-NEXT: vluxei16.v v16, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 32 x i8> @llvm.riscv.vluxei.nxv32i8.nxv32i16( + <vscale x 32 x i8> poison, + ptr %0, + <vscale x 32 x i16> %1, + iXLen %2) + + ret <vscale x 32 x i8> %a +} + +declare <vscale x 32 x i8> @llvm.riscv.vluxei.mask.nxv32i8.nxv32i16( + <vscale x 32 x i8>, + ptr, + <vscale x 32 x i16>, + <vscale x 32 x i1>, + iXLen, + iXLen); + +define <vscale x 32 x i8> @intrinsic_vluxei_mask_v_nxv32i8_nxv32i8_nxv32i16(<vscale x 32 x i8> %0, ptr %1, <vscale x 32 x i16> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv32i8_nxv32i8_nxv32i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 32 x i8> @llvm.riscv.vluxei.mask.nxv32i8.nxv32i16( + <vscale x 32 x i8> %0, + ptr %1, + <vscale x 32 x i16> %2, + <vscale x 32 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 32 x i8> %a +} + +declare <vscale x 1 x i16> @llvm.riscv.vluxei.nxv1i16.nxv1i16( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i16>, + iXLen); + +define <vscale x 1 x i16> @intrinsic_vluxei_v_nxv1i16_nxv1i16_nxv1i16(ptr %0, <vscale x 1 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1i16_nxv1i16_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vluxei16.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i16> @llvm.riscv.vluxei.nxv1i16.nxv1i16( + <vscale x 1 x i16> poison, + ptr %0, + <vscale x 1 x i16> %1, + iXLen %2) + + ret <vscale x 1 x i16> %a +} + +declare <vscale x 1 x i16> @llvm.riscv.vluxei.mask.nxv1i16.nxv1i16( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i16>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x i16> @intrinsic_vluxei_mask_v_nxv1i16_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1i16_nxv1i16_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i16> @llvm.riscv.vluxei.mask.nxv1i16.nxv1i16( + <vscale x 1 x i16> %0, + ptr %1, + <vscale x 1 x i16> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x i16> %a +} + +declare <vscale x 2 x i16> @llvm.riscv.vluxei.nxv2i16.nxv2i16( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i16>, + iXLen); + +define <vscale x 2 x i16> @intrinsic_vluxei_v_nxv2i16_nxv2i16_nxv2i16(ptr %0, <vscale x 2 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: 
intrinsic_vluxei_v_nxv2i16_nxv2i16_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vluxei16.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i16> @llvm.riscv.vluxei.nxv2i16.nxv2i16( + <vscale x 2 x i16> poison, + ptr %0, + <vscale x 2 x i16> %1, + iXLen %2) + + ret <vscale x 2 x i16> %a +} + +declare <vscale x 2 x i16> @llvm.riscv.vluxei.mask.nxv2i16.nxv2i16( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i16>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x i16> @intrinsic_vluxei_mask_v_nxv2i16_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2i16_nxv2i16_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i16> @llvm.riscv.vluxei.mask.nxv2i16.nxv2i16( + <vscale x 2 x i16> %0, + ptr %1, + <vscale x 2 x i16> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x i16> %a +} + +declare <vscale x 4 x i16> @llvm.riscv.vluxei.nxv4i16.nxv4i16( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i16>, + iXLen); + +define <vscale x 4 x i16> @intrinsic_vluxei_v_nxv4i16_nxv4i16_nxv4i16(ptr %0, <vscale x 4 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4i16_nxv4i16_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vluxei16.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i16> @llvm.riscv.vluxei.nxv4i16.nxv4i16( + <vscale x 4 x i16> poison, + ptr %0, + <vscale x 4 x i16> %1, + iXLen %2) + + ret <vscale x 4 x i16> %a +} + +declare <vscale x 4 x i16> @llvm.riscv.vluxei.mask.nxv4i16.nxv4i16( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i16>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x i16> @intrinsic_vluxei_mask_v_nxv4i16_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4i16_nxv4i16_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i16> @llvm.riscv.vluxei.mask.nxv4i16.nxv4i16( + <vscale x 4 x i16> %0, + ptr %1, + <vscale x 4 x i16> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x i16> %a +} + +declare <vscale x 8 x i16> @llvm.riscv.vluxei.nxv8i16.nxv8i16( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i16>, + iXLen); + +define <vscale x 8 x i16> @intrinsic_vluxei_v_nxv8i16_nxv8i16_nxv8i16(ptr %0, <vscale x 8 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8i16_nxv8i16_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vluxei16.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i16> @llvm.riscv.vluxei.nxv8i16.nxv8i16( + <vscale x 8 x i16> poison, + ptr %0, + <vscale x 8 x i16> %1, + iXLen %2) + + ret <vscale x 8 x i16> %a +} + +declare <vscale x 8 x i16> @llvm.riscv.vluxei.mask.nxv8i16.nxv8i16( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i16>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x i16> @intrinsic_vluxei_mask_v_nxv8i16_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8i16_nxv8i16_nxv8i16: +; 
CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i16> @llvm.riscv.vluxei.mask.nxv8i16.nxv8i16( + <vscale x 8 x i16> %0, + ptr %1, + <vscale x 8 x i16> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x i16> %a +} + +declare <vscale x 16 x i16> @llvm.riscv.vluxei.nxv16i16.nxv16i16( + <vscale x 16 x i16>, + ptr, + <vscale x 16 x i16>, + iXLen); + +define <vscale x 16 x i16> @intrinsic_vluxei_v_nxv16i16_nxv16i16_nxv16i16(ptr %0, <vscale x 16 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv16i16_nxv16i16_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vluxei16.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i16> @llvm.riscv.vluxei.nxv16i16.nxv16i16( + <vscale x 16 x i16> poison, + ptr %0, + <vscale x 16 x i16> %1, + iXLen %2) + + ret <vscale x 16 x i16> %a +} + +declare <vscale x 16 x i16> @llvm.riscv.vluxei.mask.nxv16i16.nxv16i16( + <vscale x 16 x i16>, + ptr, + <vscale x 16 x i16>, + <vscale x 16 x i1>, + iXLen, + iXLen); + +define <vscale x 16 x i16> @intrinsic_vluxei_mask_v_nxv16i16_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, ptr %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv16i16_nxv16i16_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i16> @llvm.riscv.vluxei.mask.nxv16i16.nxv16i16( + <vscale x 16 x i16> %0, + ptr %1, + <vscale x 16 x i16> %2, + <vscale x 16 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 16 x i16> %a +} + +declare <vscale x 32 x i16> @llvm.riscv.vluxei.nxv32i16.nxv32i16( + <vscale x 32 x i16>, + ptr, + <vscale x 32 x i16>, + iXLen); + +define <vscale x 32 x i16> @intrinsic_vluxei_v_nxv32i16_nxv32i16_nxv32i16(ptr %0, <vscale x 32 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv32i16_nxv32i16_nxv32i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vluxei16.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 32 x i16> @llvm.riscv.vluxei.nxv32i16.nxv32i16( + <vscale x 32 x i16> poison, + ptr %0, + <vscale x 32 x i16> %1, + iXLen %2) + + ret <vscale x 32 x i16> %a +} + +declare <vscale x 32 x i16> @llvm.riscv.vluxei.mask.nxv32i16.nxv32i16( + <vscale x 32 x i16>, + ptr, + <vscale x 32 x i16>, + <vscale x 32 x i1>, + iXLen, + iXLen); + +define <vscale x 32 x i16> @intrinsic_vluxei_mask_v_nxv32i16_nxv32i16_nxv32i16(<vscale x 32 x i16> %0, ptr %1, <vscale x 32 x i16> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv32i16_nxv32i16_nxv32i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 32 x i16> @llvm.riscv.vluxei.mask.nxv32i16.nxv32i16( + <vscale x 32 x i16> %0, + ptr %1, + <vscale x 32 x i16> %2, + <vscale x 32 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 32 x i16> %a +} + +declare <vscale x 1 x i32> @llvm.riscv.vluxei.nxv1i32.nxv1i16( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i16>, + iXLen); + +define <vscale x 1 x i32> @intrinsic_vluxei_v_nxv1i32_nxv1i32_nxv1i16(ptr %0, <vscale x 1 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1i32_nxv1i32_nxv1i16: +; CHECK: # 
%bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vluxei16.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i32> @llvm.riscv.vluxei.nxv1i32.nxv1i16( + <vscale x 1 x i32> poison, + ptr %0, + <vscale x 1 x i16> %1, + iXLen %2) + + ret <vscale x 1 x i32> %a +} + +declare <vscale x 1 x i32> @llvm.riscv.vluxei.mask.nxv1i32.nxv1i16( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i16>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x i32> @intrinsic_vluxei_mask_v_nxv1i32_nxv1i32_nxv1i16(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1i32_nxv1i32_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i32> @llvm.riscv.vluxei.mask.nxv1i32.nxv1i16( + <vscale x 1 x i32> %0, + ptr %1, + <vscale x 1 x i16> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x i32> %a +} + +declare <vscale x 2 x i32> @llvm.riscv.vluxei.nxv2i32.nxv2i16( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i16>, + iXLen); + +define <vscale x 2 x i32> @intrinsic_vluxei_v_nxv2i32_nxv2i32_nxv2i16(ptr %0, <vscale x 2 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2i32_nxv2i32_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vluxei16.v v9, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i32> @llvm.riscv.vluxei.nxv2i32.nxv2i16( + <vscale x 2 x i32> poison, + ptr %0, + <vscale x 2 x i16> %1, + iXLen %2) + + ret <vscale x 2 x i32> %a +} + +declare <vscale x 2 x i32> @llvm.riscv.vluxei.mask.nxv2i32.nxv2i16( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i16>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x i32> @intrinsic_vluxei_mask_v_nxv2i32_nxv2i32_nxv2i16(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2i32_nxv2i32_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i32> @llvm.riscv.vluxei.mask.nxv2i32.nxv2i16( + <vscale x 2 x i32> %0, + ptr %1, + <vscale x 2 x i16> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x i32> %a +} + +declare <vscale x 4 x i32> @llvm.riscv.vluxei.nxv4i32.nxv4i16( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i16>, + iXLen); + +define <vscale x 4 x i32> @intrinsic_vluxei_v_nxv4i32_nxv4i32_nxv4i16(ptr %0, <vscale x 4 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4i32_nxv4i32_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vluxei16.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i32> @llvm.riscv.vluxei.nxv4i32.nxv4i16( + <vscale x 4 x i32> poison, + ptr %0, + <vscale x 4 x i16> %1, + iXLen %2) + + ret <vscale x 4 x i32> %a +} + +declare <vscale x 4 x i32> @llvm.riscv.vluxei.mask.nxv4i32.nxv4i16( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i16>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x i32> @intrinsic_vluxei_mask_v_nxv4i32_nxv4i32_nxv4i16(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen 
%4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4i32_nxv4i32_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i32> @llvm.riscv.vluxei.mask.nxv4i32.nxv4i16( + <vscale x 4 x i32> %0, + ptr %1, + <vscale x 4 x i16> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x i32> %a +} + +declare <vscale x 8 x i32> @llvm.riscv.vluxei.nxv8i32.nxv8i16( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i16>, + iXLen); + +define <vscale x 8 x i32> @intrinsic_vluxei_v_nxv8i32_nxv8i32_nxv8i16(ptr %0, <vscale x 8 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8i32_nxv8i32_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vluxei16.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i32> @llvm.riscv.vluxei.nxv8i32.nxv8i16( + <vscale x 8 x i32> poison, + ptr %0, + <vscale x 8 x i16> %1, + iXLen %2) + + ret <vscale x 8 x i32> %a +} + +declare <vscale x 8 x i32> @llvm.riscv.vluxei.mask.nxv8i32.nxv8i16( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i16>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x i32> @intrinsic_vluxei_mask_v_nxv8i32_nxv8i32_nxv8i16(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8i32_nxv8i32_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i32> @llvm.riscv.vluxei.mask.nxv8i32.nxv8i16( + <vscale x 8 x i32> %0, + ptr %1, + <vscale x 8 x i16> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x i32> %a +} + +declare <vscale x 16 x i32> @llvm.riscv.vluxei.nxv16i32.nxv16i16( + <vscale x 16 x i32>, + ptr, + <vscale x 16 x i16>, + iXLen); + +define <vscale x 16 x i32> @intrinsic_vluxei_v_nxv16i32_nxv16i32_nxv16i16(ptr %0, <vscale x 16 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv16i32_nxv16i32_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv4r.v v16, v8 +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vluxei16.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i32> @llvm.riscv.vluxei.nxv16i32.nxv16i16( + <vscale x 16 x i32> poison, + ptr %0, + <vscale x 16 x i16> %1, + iXLen %2) + + ret <vscale x 16 x i32> %a +} + +declare <vscale x 16 x i32> @llvm.riscv.vluxei.mask.nxv16i32.nxv16i16( + <vscale x 16 x i32>, + ptr, + <vscale x 16 x i16>, + <vscale x 16 x i1>, + iXLen, + iXLen); + +define <vscale x 16 x i32> @intrinsic_vluxei_mask_v_nxv16i32_nxv16i32_nxv16i16(<vscale x 16 x i32> %0, ptr %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv16i32_nxv16i32_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i32> @llvm.riscv.vluxei.mask.nxv16i32.nxv16i16( + <vscale x 16 x i32> %0, + ptr %1, + <vscale x 16 x i16> %2, + <vscale x 16 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 16 x i32> %a +} + +declare <vscale x 1 x i64> @llvm.riscv.vluxei.nxv1i64.nxv1i16( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i16>, + iXLen); 
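+ +; Note: the masked variants in this file pass a trailing policy operand (iXLen 1, i.e. tail agnostic), which is why their vsetvli uses "ta, mu" while the unmasked forms (which take a poison passthru) use "ta, ma". When the data EEW is wider than the index EEW and the destination register group would overlap the index operand, the index vector is first copied out of the way (vmv1r.v/vmv2r.v/vmv4r.v) to satisfy the RVV register-group overlap constraint.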
+ +define <vscale x 1 x i64> @intrinsic_vluxei_v_nxv1i64_nxv1i64_nxv1i16(ptr %0, <vscale x 1 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1i64_nxv1i64_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vluxei16.v v9, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i64> @llvm.riscv.vluxei.nxv1i64.nxv1i16( + <vscale x 1 x i64> poison, + ptr %0, + <vscale x 1 x i16> %1, + iXLen %2) + + ret <vscale x 1 x i64> %a +} + +declare <vscale x 1 x i64> @llvm.riscv.vluxei.mask.nxv1i64.nxv1i16( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i16>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x i64> @intrinsic_vluxei_mask_v_nxv1i64_nxv1i64_nxv1i16(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1i64_nxv1i64_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i64> @llvm.riscv.vluxei.mask.nxv1i64.nxv1i16( + <vscale x 1 x i64> %0, + ptr %1, + <vscale x 1 x i16> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x i64> %a +} + +declare <vscale x 2 x i64> @llvm.riscv.vluxei.nxv2i64.nxv2i16( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i16>, + iXLen); + +define <vscale x 2 x i64> @intrinsic_vluxei_v_nxv2i64_nxv2i64_nxv2i16(ptr %0, <vscale x 2 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2i64_nxv2i64_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vluxei16.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i64> @llvm.riscv.vluxei.nxv2i64.nxv2i16( + <vscale x 2 x i64> poison, + ptr %0, + <vscale x 2 x i16> %1, + iXLen %2) + + ret <vscale x 2 x i64> %a +} + +declare <vscale x 2 x i64> @llvm.riscv.vluxei.mask.nxv2i64.nxv2i16( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i16>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x i64> @intrinsic_vluxei_mask_v_nxv2i64_nxv2i64_nxv2i16(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2i64_nxv2i64_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i64> @llvm.riscv.vluxei.mask.nxv2i64.nxv2i16( + <vscale x 2 x i64> %0, + ptr %1, + <vscale x 2 x i16> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x i64> %a +} + +declare <vscale x 4 x i64> @llvm.riscv.vluxei.nxv4i64.nxv4i16( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i16>, + iXLen); + +define <vscale x 4 x i64> @intrinsic_vluxei_v_nxv4i64_nxv4i64_nxv4i16(ptr %0, <vscale x 4 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4i64_nxv4i64_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vluxei16.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i64> @llvm.riscv.vluxei.nxv4i64.nxv4i16( + <vscale x 4 x i64> poison, + ptr %0, + <vscale x 4 x i16> %1, + iXLen %2) + + ret <vscale x 4 x i64> %a +} + +declare <vscale x 4 x i64> @llvm.riscv.vluxei.mask.nxv4i64.nxv4i16( + 
<vscale x 4 x i64>, + ptr, + <vscale x 4 x i16>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x i64> @intrinsic_vluxei_mask_v_nxv4i64_nxv4i64_nxv4i16(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4i64_nxv4i64_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i64> @llvm.riscv.vluxei.mask.nxv4i64.nxv4i16( + <vscale x 4 x i64> %0, + ptr %1, + <vscale x 4 x i16> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x i64> %a +} + +declare <vscale x 8 x i64> @llvm.riscv.vluxei.nxv8i64.nxv8i16( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i16>, + iXLen); + +define <vscale x 8 x i64> @intrinsic_vluxei_v_nxv8i64_nxv8i64_nxv8i16(ptr %0, <vscale x 8 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8i64_nxv8i64_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv2r.v v16, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vluxei16.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i64> @llvm.riscv.vluxei.nxv8i64.nxv8i16( + <vscale x 8 x i64> poison, + ptr %0, + <vscale x 8 x i16> %1, + iXLen %2) + + ret <vscale x 8 x i64> %a +} + +declare <vscale x 8 x i64> @llvm.riscv.vluxei.mask.nxv8i64.nxv8i16( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i16>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x i64> @intrinsic_vluxei_mask_v_nxv8i64_nxv8i64_nxv8i16(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8i64_nxv8i64_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i64> @llvm.riscv.vluxei.mask.nxv8i64.nxv8i16( + <vscale x 8 x i64> %0, + ptr %1, + <vscale x 8 x i16> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x i64> %a +} + +declare <vscale x 1 x half> @llvm.riscv.vluxei.nxv1f16.nxv1i16( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i16>, + iXLen); + +define <vscale x 1 x half> @intrinsic_vluxei_v_nxv1f16_nxv1f16_nxv1i16(ptr %0, <vscale x 1 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1f16_nxv1f16_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vluxei16.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x half> @llvm.riscv.vluxei.nxv1f16.nxv1i16( + <vscale x 1 x half> poison, + ptr %0, + <vscale x 1 x i16> %1, + iXLen %2) + + ret <vscale x 1 x half> %a +} + +declare <vscale x 1 x half> @llvm.riscv.vluxei.mask.nxv1f16.nxv1i16( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i16>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x half> @intrinsic_vluxei_mask_v_nxv1f16_nxv1f16_nxv1i16(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1f16_nxv1f16_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x half> @llvm.riscv.vluxei.mask.nxv1f16.nxv1i16( + <vscale x 1 x half> %0, + ptr %1, + <vscale x 1 x i16> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 
x half> %a +} + +declare <vscale x 2 x half> @llvm.riscv.vluxei.nxv2f16.nxv2i16( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i16>, + iXLen); + +define <vscale x 2 x half> @intrinsic_vluxei_v_nxv2f16_nxv2f16_nxv2i16(ptr %0, <vscale x 2 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2f16_nxv2f16_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vluxei16.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x half> @llvm.riscv.vluxei.nxv2f16.nxv2i16( + <vscale x 2 x half> poison, + ptr %0, + <vscale x 2 x i16> %1, + iXLen %2) + + ret <vscale x 2 x half> %a +} + +declare <vscale x 2 x half> @llvm.riscv.vluxei.mask.nxv2f16.nxv2i16( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i16>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x half> @intrinsic_vluxei_mask_v_nxv2f16_nxv2f16_nxv2i16(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2f16_nxv2f16_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x half> @llvm.riscv.vluxei.mask.nxv2f16.nxv2i16( + <vscale x 2 x half> %0, + ptr %1, + <vscale x 2 x i16> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x half> %a +} + +declare <vscale x 4 x half> @llvm.riscv.vluxei.nxv4f16.nxv4i16( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i16>, + iXLen); + +define <vscale x 4 x half> @intrinsic_vluxei_v_nxv4f16_nxv4f16_nxv4i16(ptr %0, <vscale x 4 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4f16_nxv4f16_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vluxei16.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x half> @llvm.riscv.vluxei.nxv4f16.nxv4i16( + <vscale x 4 x half> poison, + ptr %0, + <vscale x 4 x i16> %1, + iXLen %2) + + ret <vscale x 4 x half> %a +} + +declare <vscale x 4 x half> @llvm.riscv.vluxei.mask.nxv4f16.nxv4i16( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i16>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x half> @intrinsic_vluxei_mask_v_nxv4f16_nxv4f16_nxv4i16(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4f16_nxv4f16_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x half> @llvm.riscv.vluxei.mask.nxv4f16.nxv4i16( + <vscale x 4 x half> %0, + ptr %1, + <vscale x 4 x i16> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x half> %a +} + +declare <vscale x 8 x half> @llvm.riscv.vluxei.nxv8f16.nxv8i16( + <vscale x 8 x half>, + ptr, + <vscale x 8 x i16>, + iXLen); + +define <vscale x 8 x half> @intrinsic_vluxei_v_nxv8f16_nxv8f16_nxv8i16(ptr %0, <vscale x 8 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8f16_nxv8f16_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vluxei16.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x half> @llvm.riscv.vluxei.nxv8f16.nxv8i16( + <vscale x 8 x half> poison, + ptr %0, + <vscale x 8 x i16> %1, + iXLen %2) + + ret <vscale x 8 x half> %a +} + +declare <vscale x 8 x half> @llvm.riscv.vluxei.mask.nxv8f16.nxv8i16( + <vscale x 8 x 
half>, + ptr, + <vscale x 8 x i16>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x half> @intrinsic_vluxei_mask_v_nxv8f16_nxv8f16_nxv8i16(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8f16_nxv8f16_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x half> @llvm.riscv.vluxei.mask.nxv8f16.nxv8i16( + <vscale x 8 x half> %0, + ptr %1, + <vscale x 8 x i16> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x half> %a +} + +declare <vscale x 16 x half> @llvm.riscv.vluxei.nxv16f16.nxv16i16( + <vscale x 16 x half>, + ptr, + <vscale x 16 x i16>, + iXLen); + +define <vscale x 16 x half> @intrinsic_vluxei_v_nxv16f16_nxv16f16_nxv16i16(ptr %0, <vscale x 16 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv16f16_nxv16f16_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vluxei16.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x half> @llvm.riscv.vluxei.nxv16f16.nxv16i16( + <vscale x 16 x half> poison, + ptr %0, + <vscale x 16 x i16> %1, + iXLen %2) + + ret <vscale x 16 x half> %a +} + +declare <vscale x 16 x half> @llvm.riscv.vluxei.mask.nxv16f16.nxv16i16( + <vscale x 16 x half>, + ptr, + <vscale x 16 x i16>, + <vscale x 16 x i1>, + iXLen, + iXLen); + +define <vscale x 16 x half> @intrinsic_vluxei_mask_v_nxv16f16_nxv16f16_nxv16i16(<vscale x 16 x half> %0, ptr %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv16f16_nxv16f16_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x half> @llvm.riscv.vluxei.mask.nxv16f16.nxv16i16( + <vscale x 16 x half> %0, + ptr %1, + <vscale x 16 x i16> %2, + <vscale x 16 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 16 x half> %a +} + +declare <vscale x 32 x half> @llvm.riscv.vluxei.nxv32f16.nxv32i16( + <vscale x 32 x half>, + ptr, + <vscale x 32 x i16>, + iXLen); + +define <vscale x 32 x half> @intrinsic_vluxei_v_nxv32f16_nxv32f16_nxv32i16(ptr %0, <vscale x 32 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv32f16_nxv32f16_nxv32i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vluxei16.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 32 x half> @llvm.riscv.vluxei.nxv32f16.nxv32i16( + <vscale x 32 x half> poison, + ptr %0, + <vscale x 32 x i16> %1, + iXLen %2) + + ret <vscale x 32 x half> %a +} + +declare <vscale x 32 x half> @llvm.riscv.vluxei.mask.nxv32f16.nxv32i16( + <vscale x 32 x half>, + ptr, + <vscale x 32 x i16>, + <vscale x 32 x i1>, + iXLen, + iXLen); + +define <vscale x 32 x half> @intrinsic_vluxei_mask_v_nxv32f16_nxv32f16_nxv32i16(<vscale x 32 x half> %0, ptr %1, <vscale x 32 x i16> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv32f16_nxv32f16_nxv32i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 32 x half> @llvm.riscv.vluxei.mask.nxv32f16.nxv32i16( + <vscale x 32 x half> %0, + ptr %1, + <vscale x 32 x i16> %2, + <vscale x 32 x i1> %3, + iXLen %4, iXLen 1) + + ret 
<vscale x 32 x half> %a +} + +declare <vscale x 1 x float> @llvm.riscv.vluxei.nxv1f32.nxv1i16( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i16>, + iXLen); + +define <vscale x 1 x float> @intrinsic_vluxei_v_nxv1f32_nxv1f32_nxv1i16(ptr %0, <vscale x 1 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1f32_nxv1f32_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vluxei16.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x float> @llvm.riscv.vluxei.nxv1f32.nxv1i16( + <vscale x 1 x float> poison, + ptr %0, + <vscale x 1 x i16> %1, + iXLen %2) + + ret <vscale x 1 x float> %a +} + +declare <vscale x 1 x float> @llvm.riscv.vluxei.mask.nxv1f32.nxv1i16( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i16>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x float> @intrinsic_vluxei_mask_v_nxv1f32_nxv1f32_nxv1i16(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1f32_nxv1f32_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x float> @llvm.riscv.vluxei.mask.nxv1f32.nxv1i16( + <vscale x 1 x float> %0, + ptr %1, + <vscale x 1 x i16> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x float> %a +} + +declare <vscale x 2 x float> @llvm.riscv.vluxei.nxv2f32.nxv2i16( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i16>, + iXLen); + +define <vscale x 2 x float> @intrinsic_vluxei_v_nxv2f32_nxv2f32_nxv2i16(ptr %0, <vscale x 2 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2f32_nxv2f32_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vluxei16.v v9, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x float> @llvm.riscv.vluxei.nxv2f32.nxv2i16( + <vscale x 2 x float> poison, + ptr %0, + <vscale x 2 x i16> %1, + iXLen %2) + + ret <vscale x 2 x float> %a +} + +declare <vscale x 2 x float> @llvm.riscv.vluxei.mask.nxv2f32.nxv2i16( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i16>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x float> @intrinsic_vluxei_mask_v_nxv2f32_nxv2f32_nxv2i16(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2f32_nxv2f32_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x float> @llvm.riscv.vluxei.mask.nxv2f32.nxv2i16( + <vscale x 2 x float> %0, + ptr %1, + <vscale x 2 x i16> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x float> %a +} + +declare <vscale x 4 x float> @llvm.riscv.vluxei.nxv4f32.nxv4i16( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i16>, + iXLen); + +define <vscale x 4 x float> @intrinsic_vluxei_v_nxv4f32_nxv4f32_nxv4i16(ptr %0, <vscale x 4 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4f32_nxv4f32_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vluxei16.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x float> @llvm.riscv.vluxei.nxv4f32.nxv4i16( + <vscale x 4 x 
float> poison, + ptr %0, + <vscale x 4 x i16> %1, + iXLen %2) + + ret <vscale x 4 x float> %a +} + +declare <vscale x 4 x float> @llvm.riscv.vluxei.mask.nxv4f32.nxv4i16( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i16>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x float> @intrinsic_vluxei_mask_v_nxv4f32_nxv4f32_nxv4i16(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4f32_nxv4f32_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x float> @llvm.riscv.vluxei.mask.nxv4f32.nxv4i16( + <vscale x 4 x float> %0, + ptr %1, + <vscale x 4 x i16> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x float> %a +} + +declare <vscale x 8 x float> @llvm.riscv.vluxei.nxv8f32.nxv8i16( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i16>, + iXLen); + +define <vscale x 8 x float> @intrinsic_vluxei_v_nxv8f32_nxv8f32_nxv8i16(ptr %0, <vscale x 8 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8f32_nxv8f32_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vluxei16.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x float> @llvm.riscv.vluxei.nxv8f32.nxv8i16( + <vscale x 8 x float> poison, + ptr %0, + <vscale x 8 x i16> %1, + iXLen %2) + + ret <vscale x 8 x float> %a +} + +declare <vscale x 8 x float> @llvm.riscv.vluxei.mask.nxv8f32.nxv8i16( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i16>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x float> @intrinsic_vluxei_mask_v_nxv8f32_nxv8f32_nxv8i16(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8f32_nxv8f32_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x float> @llvm.riscv.vluxei.mask.nxv8f32.nxv8i16( + <vscale x 8 x float> %0, + ptr %1, + <vscale x 8 x i16> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x float> %a +} + +declare <vscale x 16 x float> @llvm.riscv.vluxei.nxv16f32.nxv16i16( + <vscale x 16 x float>, + ptr, + <vscale x 16 x i16>, + iXLen); + +define <vscale x 16 x float> @intrinsic_vluxei_v_nxv16f32_nxv16f32_nxv16i16(ptr %0, <vscale x 16 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv16f32_nxv16f32_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv4r.v v16, v8 +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vluxei16.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x float> @llvm.riscv.vluxei.nxv16f32.nxv16i16( + <vscale x 16 x float> poison, + ptr %0, + <vscale x 16 x i16> %1, + iXLen %2) + + ret <vscale x 16 x float> %a +} + +declare <vscale x 16 x float> @llvm.riscv.vluxei.mask.nxv16f32.nxv16i16( + <vscale x 16 x float>, + ptr, + <vscale x 16 x i16>, + <vscale x 16 x i1>, + iXLen, + iXLen); + +define <vscale x 16 x float> @intrinsic_vluxei_mask_v_nxv16f32_nxv16f32_nxv16i16(<vscale x 16 x float> %0, ptr %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv16f32_nxv16f32_nxv16i16: +; CHECK: # 
%bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x float> @llvm.riscv.vluxei.mask.nxv16f32.nxv16i16( + <vscale x 16 x float> %0, + ptr %1, + <vscale x 16 x i16> %2, + <vscale x 16 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 16 x float> %a +} + +declare <vscale x 1 x double> @llvm.riscv.vluxei.nxv1f64.nxv1i16( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i16>, + iXLen); + +define <vscale x 1 x double> @intrinsic_vluxei_v_nxv1f64_nxv1f64_nxv1i16(ptr %0, <vscale x 1 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1f64_nxv1f64_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vluxei16.v v9, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x double> @llvm.riscv.vluxei.nxv1f64.nxv1i16( + <vscale x 1 x double> poison, + ptr %0, + <vscale x 1 x i16> %1, + iXLen %2) + + ret <vscale x 1 x double> %a +} + +declare <vscale x 1 x double> @llvm.riscv.vluxei.mask.nxv1f64.nxv1i16( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i16>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x double> @intrinsic_vluxei_mask_v_nxv1f64_nxv1f64_nxv1i16(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1f64_nxv1f64_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x double> @llvm.riscv.vluxei.mask.nxv1f64.nxv1i16( + <vscale x 1 x double> %0, + ptr %1, + <vscale x 1 x i16> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x double> %a +} + +declare <vscale x 2 x double> @llvm.riscv.vluxei.nxv2f64.nxv2i16( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i16>, + iXLen); + +define <vscale x 2 x double> @intrinsic_vluxei_v_nxv2f64_nxv2f64_nxv2i16(ptr %0, <vscale x 2 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2f64_nxv2f64_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vluxei16.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x double> @llvm.riscv.vluxei.nxv2f64.nxv2i16( + <vscale x 2 x double> poison, + ptr %0, + <vscale x 2 x i16> %1, + iXLen %2) + + ret <vscale x 2 x double> %a +} + +declare <vscale x 2 x double> @llvm.riscv.vluxei.mask.nxv2f64.nxv2i16( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i16>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x double> @intrinsic_vluxei_mask_v_nxv2f64_nxv2f64_nxv2i16(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2f64_nxv2f64_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x double> @llvm.riscv.vluxei.mask.nxv2f64.nxv2i16( + <vscale x 2 x double> %0, + ptr %1, + <vscale x 2 x i16> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x double> %a +} + +declare <vscale x 4 x double> @llvm.riscv.vluxei.nxv4f64.nxv4i16( + <vscale x 4 x double>, + ptr, + <vscale x 4 x i16>, + iXLen); + +define <vscale x 4 x double> @intrinsic_vluxei_v_nxv4f64_nxv4f64_nxv4i16(ptr %0, 
<vscale x 4 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4f64_nxv4f64_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vluxei16.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x double> @llvm.riscv.vluxei.nxv4f64.nxv4i16( + <vscale x 4 x double> poison, + ptr %0, + <vscale x 4 x i16> %1, + iXLen %2) + + ret <vscale x 4 x double> %a +} + +declare <vscale x 4 x double> @llvm.riscv.vluxei.mask.nxv4f64.nxv4i16( + <vscale x 4 x double>, + ptr, + <vscale x 4 x i16>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x double> @intrinsic_vluxei_mask_v_nxv4f64_nxv4f64_nxv4i16(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4f64_nxv4f64_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x double> @llvm.riscv.vluxei.mask.nxv4f64.nxv4i16( + <vscale x 4 x double> %0, + ptr %1, + <vscale x 4 x i16> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x double> %a +} + +declare <vscale x 8 x double> @llvm.riscv.vluxei.nxv8f64.nxv8i16( + <vscale x 8 x double>, + ptr, + <vscale x 8 x i16>, + iXLen); + +define <vscale x 8 x double> @intrinsic_vluxei_v_nxv8f64_nxv8f64_nxv8i16(ptr %0, <vscale x 8 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8f64_nxv8f64_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv2r.v v16, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vluxei16.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x double> @llvm.riscv.vluxei.nxv8f64.nxv8i16( + <vscale x 8 x double> poison, + ptr %0, + <vscale x 8 x i16> %1, + iXLen %2) + + ret <vscale x 8 x double> %a +} + +declare <vscale x 8 x double> @llvm.riscv.vluxei.mask.nxv8f64.nxv8i16( + <vscale x 8 x double>, + ptr, + <vscale x 8 x i16>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x double> @intrinsic_vluxei_mask_v_nxv8f64_nxv8f64_nxv8i16(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8f64_nxv8f64_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x double> @llvm.riscv.vluxei.mask.nxv8f64.nxv8i16( + <vscale x 8 x double> %0, + ptr %1, + <vscale x 8 x i16> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x double> %a +} + +declare <vscale x 1 x i8> @llvm.riscv.vluxei.nxv1i8.nxv1i8( + <vscale x 1 x i8>, + ptr, + <vscale x 1 x i8>, + iXLen); + +define <vscale x 1 x i8> @intrinsic_vluxei_v_nxv1i8_nxv1i8_nxv1i8(ptr %0, <vscale x 1 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1i8_nxv1i8_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vluxei8.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i8> @llvm.riscv.vluxei.nxv1i8.nxv1i8( + <vscale x 1 x i8> poison, + ptr %0, + <vscale x 1 x i8> %1, + iXLen %2) + + ret <vscale x 1 x i8> %a +} + +declare <vscale x 1 x i8> @llvm.riscv.vluxei.mask.nxv1i8.nxv1i8( + <vscale x 1 x i8>, + ptr, + <vscale x 1 x i8>, + <vscale x 1 x 
i1>, + iXLen, + iXLen); + +define <vscale x 1 x i8> @intrinsic_vluxei_mask_v_nxv1i8_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1i8_nxv1i8_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i8> @llvm.riscv.vluxei.mask.nxv1i8.nxv1i8( + <vscale x 1 x i8> %0, + ptr %1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x i8> %a +} + +declare <vscale x 2 x i8> @llvm.riscv.vluxei.nxv2i8.nxv2i8( + <vscale x 2 x i8>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define <vscale x 2 x i8> @intrinsic_vluxei_v_nxv2i8_nxv2i8_nxv2i8(ptr %0, <vscale x 2 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2i8_nxv2i8_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vluxei8.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i8> @llvm.riscv.vluxei.nxv2i8.nxv2i8( + <vscale x 2 x i8> poison, + ptr %0, + <vscale x 2 x i8> %1, + iXLen %2) + + ret <vscale x 2 x i8> %a +} + +declare <vscale x 2 x i8> @llvm.riscv.vluxei.mask.nxv2i8.nxv2i8( + <vscale x 2 x i8>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x i8> @intrinsic_vluxei_mask_v_nxv2i8_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2i8_nxv2i8_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i8> @llvm.riscv.vluxei.mask.nxv2i8.nxv2i8( + <vscale x 2 x i8> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x i8> %a +} + +declare <vscale x 4 x i8> @llvm.riscv.vluxei.nxv4i8.nxv4i8( + <vscale x 4 x i8>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define <vscale x 4 x i8> @intrinsic_vluxei_v_nxv4i8_nxv4i8_nxv4i8(ptr %0, <vscale x 4 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4i8_nxv4i8_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vluxei8.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i8> @llvm.riscv.vluxei.nxv4i8.nxv4i8( + <vscale x 4 x i8> poison, + ptr %0, + <vscale x 4 x i8> %1, + iXLen %2) + + ret <vscale x 4 x i8> %a +} + +declare <vscale x 4 x i8> @llvm.riscv.vluxei.mask.nxv4i8.nxv4i8( + <vscale x 4 x i8>, + ptr, + <vscale x 4 x i8>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x i8> @intrinsic_vluxei_mask_v_nxv4i8_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4i8_nxv4i8_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i8> @llvm.riscv.vluxei.mask.nxv4i8.nxv4i8( + <vscale x 4 x i8> %0, + ptr %1, + <vscale x 4 x i8> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x i8> %a +} + +declare <vscale x 8 x i8> @llvm.riscv.vluxei.nxv8i8.nxv8i8( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i8>, + iXLen); + +define <vscale x 8 x i8> @intrinsic_vluxei_v_nxv8i8_nxv8i8_nxv8i8(ptr %0, <vscale x 8 x i8> %1, iXLen %2) nounwind { +; 
CHECK-LABEL: intrinsic_vluxei_v_nxv8i8_nxv8i8_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vluxei8.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i8> @llvm.riscv.vluxei.nxv8i8.nxv8i8( + <vscale x 8 x i8> poison, + ptr %0, + <vscale x 8 x i8> %1, + iXLen %2) + + ret <vscale x 8 x i8> %a +} + +declare <vscale x 8 x i8> @llvm.riscv.vluxei.mask.nxv8i8.nxv8i8( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i8>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x i8> @intrinsic_vluxei_mask_v_nxv8i8_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8i8_nxv8i8_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i8> @llvm.riscv.vluxei.mask.nxv8i8.nxv8i8( + <vscale x 8 x i8> %0, + ptr %1, + <vscale x 8 x i8> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x i8> %a +} + +declare <vscale x 16 x i8> @llvm.riscv.vluxei.nxv16i8.nxv16i8( + <vscale x 16 x i8>, + ptr, + <vscale x 16 x i8>, + iXLen); + +define <vscale x 16 x i8> @intrinsic_vluxei_v_nxv16i8_nxv16i8_nxv16i8(ptr %0, <vscale x 16 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv16i8_nxv16i8_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma +; CHECK-NEXT: vluxei8.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i8> @llvm.riscv.vluxei.nxv16i8.nxv16i8( + <vscale x 16 x i8> poison, + ptr %0, + <vscale x 16 x i8> %1, + iXLen %2) + + ret <vscale x 16 x i8> %a +} + +declare <vscale x 16 x i8> @llvm.riscv.vluxei.mask.nxv16i8.nxv16i8( + <vscale x 16 x i8>, + ptr, + <vscale x 16 x i8>, + <vscale x 16 x i1>, + iXLen, + iXLen); + +define <vscale x 16 x i8> @intrinsic_vluxei_mask_v_nxv16i8_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, ptr %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv16i8_nxv16i8_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i8> @llvm.riscv.vluxei.mask.nxv16i8.nxv16i8( + <vscale x 16 x i8> %0, + ptr %1, + <vscale x 16 x i8> %2, + <vscale x 16 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 16 x i8> %a +} + +declare <vscale x 32 x i8> @llvm.riscv.vluxei.nxv32i8.nxv32i8( + <vscale x 32 x i8>, + ptr, + <vscale x 32 x i8>, + iXLen); + +define <vscale x 32 x i8> @intrinsic_vluxei_v_nxv32i8_nxv32i8_nxv32i8(ptr %0, <vscale x 32 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv32i8_nxv32i8_nxv32i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma +; CHECK-NEXT: vluxei8.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 32 x i8> @llvm.riscv.vluxei.nxv32i8.nxv32i8( + <vscale x 32 x i8> poison, + ptr %0, + <vscale x 32 x i8> %1, + iXLen %2) + + ret <vscale x 32 x i8> %a +} + +declare <vscale x 32 x i8> @llvm.riscv.vluxei.mask.nxv32i8.nxv32i8( + <vscale x 32 x i8>, + ptr, + <vscale x 32 x i8>, + <vscale x 32 x i1>, + iXLen, + iXLen); + +define <vscale x 32 x i8> @intrinsic_vluxei_mask_v_nxv32i8_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, ptr %1, <vscale x 32 x i8> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv32i8_nxv32i8_nxv32i8: +; CHECK: # %bb.0: # 
%entry +; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 32 x i8> @llvm.riscv.vluxei.mask.nxv32i8.nxv32i8( + <vscale x 32 x i8> %0, + ptr %1, + <vscale x 32 x i8> %2, + <vscale x 32 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 32 x i8> %a +} + +declare <vscale x 64 x i8> @llvm.riscv.vluxei.nxv64i8.nxv64i8( + <vscale x 64 x i8>, + ptr, + <vscale x 64 x i8>, + iXLen); + +define <vscale x 64 x i8> @intrinsic_vluxei_v_nxv64i8_nxv64i8_nxv64i8(ptr %0, <vscale x 64 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv64i8_nxv64i8_nxv64i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma +; CHECK-NEXT: vluxei8.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 64 x i8> @llvm.riscv.vluxei.nxv64i8.nxv64i8( + <vscale x 64 x i8> poison, + ptr %0, + <vscale x 64 x i8> %1, + iXLen %2) + + ret <vscale x 64 x i8> %a +} + +declare <vscale x 64 x i8> @llvm.riscv.vluxei.mask.nxv64i8.nxv64i8( + <vscale x 64 x i8>, + ptr, + <vscale x 64 x i8>, + <vscale x 64 x i1>, + iXLen, + iXLen); + +define <vscale x 64 x i8> @intrinsic_vluxei_mask_v_nxv64i8_nxv64i8_nxv64i8(<vscale x 64 x i8> %0, ptr %1, <vscale x 64 x i8> %2, <vscale x 64 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv64i8_nxv64i8_nxv64i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 64 x i8> @llvm.riscv.vluxei.mask.nxv64i8.nxv64i8( + <vscale x 64 x i8> %0, + ptr %1, + <vscale x 64 x i8> %2, + <vscale x 64 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 64 x i8> %a +} + +declare <vscale x 1 x i16> @llvm.riscv.vluxei.nxv1i16.nxv1i8( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i8>, + iXLen); + +define <vscale x 1 x i16> @intrinsic_vluxei_v_nxv1i16_nxv1i16_nxv1i8(ptr %0, <vscale x 1 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1i16_nxv1i16_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vluxei8.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i16> @llvm.riscv.vluxei.nxv1i16.nxv1i8( + <vscale x 1 x i16> poison, + ptr %0, + <vscale x 1 x i8> %1, + iXLen %2) + + ret <vscale x 1 x i16> %a +} + +declare <vscale x 1 x i16> @llvm.riscv.vluxei.mask.nxv1i16.nxv1i8( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i8>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x i16> @intrinsic_vluxei_mask_v_nxv1i16_nxv1i16_nxv1i8(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1i16_nxv1i16_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i16> @llvm.riscv.vluxei.mask.nxv1i16.nxv1i8( + <vscale x 1 x i16> %0, + ptr %1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x i16> %a +} + +declare <vscale x 2 x i16> @llvm.riscv.vluxei.nxv2i16.nxv2i8( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define <vscale x 2 x i16> @intrinsic_vluxei_v_nxv2i16_nxv2i16_nxv2i8(ptr %0, <vscale x 2 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2i16_nxv2i16_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vluxei8.v v9, 
(a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i16> @llvm.riscv.vluxei.nxv2i16.nxv2i8( + <vscale x 2 x i16> poison, + ptr %0, + <vscale x 2 x i8> %1, + iXLen %2) + + ret <vscale x 2 x i16> %a +} + +declare <vscale x 2 x i16> @llvm.riscv.vluxei.mask.nxv2i16.nxv2i8( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x i16> @intrinsic_vluxei_mask_v_nxv2i16_nxv2i16_nxv2i8(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2i16_nxv2i16_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i16> @llvm.riscv.vluxei.mask.nxv2i16.nxv2i8( + <vscale x 2 x i16> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x i16> %a +} + +declare <vscale x 4 x i16> @llvm.riscv.vluxei.nxv4i16.nxv4i8( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define <vscale x 4 x i16> @intrinsic_vluxei_v_nxv4i16_nxv4i16_nxv4i8(ptr %0, <vscale x 4 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4i16_nxv4i16_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vluxei8.v v9, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i16> @llvm.riscv.vluxei.nxv4i16.nxv4i8( + <vscale x 4 x i16> poison, + ptr %0, + <vscale x 4 x i8> %1, + iXLen %2) + + ret <vscale x 4 x i16> %a +} + +declare <vscale x 4 x i16> @llvm.riscv.vluxei.mask.nxv4i16.nxv4i8( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i8>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x i16> @intrinsic_vluxei_mask_v_nxv4i16_nxv4i16_nxv4i8(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4i16_nxv4i16_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i16> @llvm.riscv.vluxei.mask.nxv4i16.nxv4i8( + <vscale x 4 x i16> %0, + ptr %1, + <vscale x 4 x i8> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x i16> %a +} + +declare <vscale x 8 x i16> @llvm.riscv.vluxei.nxv8i16.nxv8i8( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i8>, + iXLen); + +define <vscale x 8 x i16> @intrinsic_vluxei_v_nxv8i16_nxv8i16_nxv8i8(ptr %0, <vscale x 8 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8i16_nxv8i16_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vluxei8.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i16> @llvm.riscv.vluxei.nxv8i16.nxv8i8( + <vscale x 8 x i16> poison, + ptr %0, + <vscale x 8 x i8> %1, + iXLen %2) + + ret <vscale x 8 x i16> %a +} + +declare <vscale x 8 x i16> @llvm.riscv.vluxei.mask.nxv8i16.nxv8i8( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i8>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x i16> @intrinsic_vluxei_mask_v_nxv8i16_nxv8i16_nxv8i8(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8i16_nxv8i16_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli 
zero, a1, e16, m2, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i16> @llvm.riscv.vluxei.mask.nxv8i16.nxv8i8( + <vscale x 8 x i16> %0, + ptr %1, + <vscale x 8 x i8> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x i16> %a +} + +declare <vscale x 16 x i16> @llvm.riscv.vluxei.nxv16i16.nxv16i8( + <vscale x 16 x i16>, + ptr, + <vscale x 16 x i8>, + iXLen); + +define <vscale x 16 x i16> @intrinsic_vluxei_v_nxv16i16_nxv16i16_nxv16i8(ptr %0, <vscale x 16 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv16i16_nxv16i16_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vluxei8.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i16> @llvm.riscv.vluxei.nxv16i16.nxv16i8( + <vscale x 16 x i16> poison, + ptr %0, + <vscale x 16 x i8> %1, + iXLen %2) + + ret <vscale x 16 x i16> %a +} + +declare <vscale x 16 x i16> @llvm.riscv.vluxei.mask.nxv16i16.nxv16i8( + <vscale x 16 x i16>, + ptr, + <vscale x 16 x i8>, + <vscale x 16 x i1>, + iXLen, + iXLen); + +define <vscale x 16 x i16> @intrinsic_vluxei_mask_v_nxv16i16_nxv16i16_nxv16i8(<vscale x 16 x i16> %0, ptr %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv16i16_nxv16i16_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i16> @llvm.riscv.vluxei.mask.nxv16i16.nxv16i8( + <vscale x 16 x i16> %0, + ptr %1, + <vscale x 16 x i8> %2, + <vscale x 16 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 16 x i16> %a +} + +declare <vscale x 32 x i16> @llvm.riscv.vluxei.nxv32i16.nxv32i8( + <vscale x 32 x i16>, + ptr, + <vscale x 32 x i8>, + iXLen); + +define <vscale x 32 x i16> @intrinsic_vluxei_v_nxv32i16_nxv32i16_nxv32i8(ptr %0, <vscale x 32 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv32i16_nxv32i16_nxv32i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv4r.v v16, v8 +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vluxei8.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 32 x i16> @llvm.riscv.vluxei.nxv32i16.nxv32i8( + <vscale x 32 x i16> poison, + ptr %0, + <vscale x 32 x i8> %1, + iXLen %2) + + ret <vscale x 32 x i16> %a +} + +declare <vscale x 32 x i16> @llvm.riscv.vluxei.mask.nxv32i16.nxv32i8( + <vscale x 32 x i16>, + ptr, + <vscale x 32 x i8>, + <vscale x 32 x i1>, + iXLen, + iXLen); + +define <vscale x 32 x i16> @intrinsic_vluxei_mask_v_nxv32i16_nxv32i16_nxv32i8(<vscale x 32 x i16> %0, ptr %1, <vscale x 32 x i8> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv32i16_nxv32i16_nxv32i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 32 x i16> @llvm.riscv.vluxei.mask.nxv32i16.nxv32i8( + <vscale x 32 x i16> %0, + ptr %1, + <vscale x 32 x i8> %2, + <vscale x 32 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 32 x i16> %a +} + +declare <vscale x 1 x i32> @llvm.riscv.vluxei.nxv1i32.nxv1i8( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i8>, + iXLen); + +define <vscale x 1 x i32> @intrinsic_vluxei_v_nxv1i32_nxv1i32_nxv1i8(ptr %0, <vscale x 1 x i8> %1, iXLen %2) nounwind { +; 
CHECK-LABEL: intrinsic_vluxei_v_nxv1i32_nxv1i32_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vluxei8.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i32> @llvm.riscv.vluxei.nxv1i32.nxv1i8( + <vscale x 1 x i32> poison, + ptr %0, + <vscale x 1 x i8> %1, + iXLen %2) + + ret <vscale x 1 x i32> %a +} + +declare <vscale x 1 x i32> @llvm.riscv.vluxei.mask.nxv1i32.nxv1i8( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i8>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x i32> @intrinsic_vluxei_mask_v_nxv1i32_nxv1i32_nxv1i8(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1i32_nxv1i32_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i32> @llvm.riscv.vluxei.mask.nxv1i32.nxv1i8( + <vscale x 1 x i32> %0, + ptr %1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x i32> %a +} + +declare <vscale x 2 x i32> @llvm.riscv.vluxei.nxv2i32.nxv2i8( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define <vscale x 2 x i32> @intrinsic_vluxei_v_nxv2i32_nxv2i32_nxv2i8(ptr %0, <vscale x 2 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2i32_nxv2i32_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vluxei8.v v9, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i32> @llvm.riscv.vluxei.nxv2i32.nxv2i8( + <vscale x 2 x i32> poison, + ptr %0, + <vscale x 2 x i8> %1, + iXLen %2) + + ret <vscale x 2 x i32> %a +} + +declare <vscale x 2 x i32> @llvm.riscv.vluxei.mask.nxv2i32.nxv2i8( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x i32> @intrinsic_vluxei_mask_v_nxv2i32_nxv2i32_nxv2i8(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2i32_nxv2i32_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i32> @llvm.riscv.vluxei.mask.nxv2i32.nxv2i8( + <vscale x 2 x i32> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x i32> %a +} + +declare <vscale x 4 x i32> @llvm.riscv.vluxei.nxv4i32.nxv4i8( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define <vscale x 4 x i32> @intrinsic_vluxei_v_nxv4i32_nxv4i32_nxv4i8(ptr %0, <vscale x 4 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4i32_nxv4i32_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vluxei8.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i32> @llvm.riscv.vluxei.nxv4i32.nxv4i8( + <vscale x 4 x i32> poison, + ptr %0, + <vscale x 4 x i8> %1, + iXLen %2) + + ret <vscale x 4 x i32> %a +} + +declare <vscale x 4 x i32> @llvm.riscv.vluxei.mask.nxv4i32.nxv4i8( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i8>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x i32> @intrinsic_vluxei_mask_v_nxv4i32_nxv4i32_nxv4i8(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i8> 
%2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4i32_nxv4i32_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i32> @llvm.riscv.vluxei.mask.nxv4i32.nxv4i8( + <vscale x 4 x i32> %0, + ptr %1, + <vscale x 4 x i8> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x i32> %a +} + +declare <vscale x 8 x i32> @llvm.riscv.vluxei.nxv8i32.nxv8i8( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i8>, + iXLen); + +define <vscale x 8 x i32> @intrinsic_vluxei_v_nxv8i32_nxv8i32_nxv8i8(ptr %0, <vscale x 8 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8i32_nxv8i32_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vluxei8.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i32> @llvm.riscv.vluxei.nxv8i32.nxv8i8( + <vscale x 8 x i32> poison, + ptr %0, + <vscale x 8 x i8> %1, + iXLen %2) + + ret <vscale x 8 x i32> %a +} + +declare <vscale x 8 x i32> @llvm.riscv.vluxei.mask.nxv8i32.nxv8i8( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i8>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x i32> @intrinsic_vluxei_mask_v_nxv8i32_nxv8i32_nxv8i8(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8i32_nxv8i32_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i32> @llvm.riscv.vluxei.mask.nxv8i32.nxv8i8( + <vscale x 8 x i32> %0, + ptr %1, + <vscale x 8 x i8> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x i32> %a +} + +declare <vscale x 16 x i32> @llvm.riscv.vluxei.nxv16i32.nxv16i8( + <vscale x 16 x i32>, + ptr, + <vscale x 16 x i8>, + iXLen); + +define <vscale x 16 x i32> @intrinsic_vluxei_v_nxv16i32_nxv16i32_nxv16i8(ptr %0, <vscale x 16 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv16i32_nxv16i32_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv2r.v v16, v8 +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vluxei8.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i32> @llvm.riscv.vluxei.nxv16i32.nxv16i8( + <vscale x 16 x i32> poison, + ptr %0, + <vscale x 16 x i8> %1, + iXLen %2) + + ret <vscale x 16 x i32> %a +} + +declare <vscale x 16 x i32> @llvm.riscv.vluxei.mask.nxv16i32.nxv16i8( + <vscale x 16 x i32>, + ptr, + <vscale x 16 x i8>, + <vscale x 16 x i1>, + iXLen, + iXLen); + +define <vscale x 16 x i32> @intrinsic_vluxei_mask_v_nxv16i32_nxv16i32_nxv16i8(<vscale x 16 x i32> %0, ptr %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv16i32_nxv16i32_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i32> @llvm.riscv.vluxei.mask.nxv16i32.nxv16i8( + <vscale x 16 x i32> %0, + ptr %1, + <vscale x 16 x i8> %2, + <vscale x 16 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 16 x i32> %a +} + +declare <vscale x 1 x i64> @llvm.riscv.vluxei.nxv1i64.nxv1i8( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i8>, + iXLen); + 
+define <vscale x 1 x i64> @intrinsic_vluxei_v_nxv1i64_nxv1i64_nxv1i8(ptr %0, <vscale x 1 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1i64_nxv1i64_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vluxei8.v v9, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i64> @llvm.riscv.vluxei.nxv1i64.nxv1i8( + <vscale x 1 x i64> poison, + ptr %0, + <vscale x 1 x i8> %1, + iXLen %2) + + ret <vscale x 1 x i64> %a +} + +declare <vscale x 1 x i64> @llvm.riscv.vluxei.mask.nxv1i64.nxv1i8( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i8>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x i64> @intrinsic_vluxei_mask_v_nxv1i64_nxv1i64_nxv1i8(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1i64_nxv1i64_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i64> @llvm.riscv.vluxei.mask.nxv1i64.nxv1i8( + <vscale x 1 x i64> %0, + ptr %1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x i64> %a +} + +declare <vscale x 2 x i64> @llvm.riscv.vluxei.nxv2i64.nxv2i8( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define <vscale x 2 x i64> @intrinsic_vluxei_v_nxv2i64_nxv2i64_nxv2i8(ptr %0, <vscale x 2 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2i64_nxv2i64_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vluxei8.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i64> @llvm.riscv.vluxei.nxv2i64.nxv2i8( + <vscale x 2 x i64> poison, + ptr %0, + <vscale x 2 x i8> %1, + iXLen %2) + + ret <vscale x 2 x i64> %a +} + +declare <vscale x 2 x i64> @llvm.riscv.vluxei.mask.nxv2i64.nxv2i8( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x i64> @intrinsic_vluxei_mask_v_nxv2i64_nxv2i64_nxv2i8(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2i64_nxv2i64_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i64> @llvm.riscv.vluxei.mask.nxv2i64.nxv2i8( + <vscale x 2 x i64> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x i64> %a +} + +declare <vscale x 4 x i64> @llvm.riscv.vluxei.nxv4i64.nxv4i8( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define <vscale x 4 x i64> @intrinsic_vluxei_v_nxv4i64_nxv4i64_nxv4i8(ptr %0, <vscale x 4 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4i64_nxv4i64_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vluxei8.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i64> @llvm.riscv.vluxei.nxv4i64.nxv4i8( + <vscale x 4 x i64> poison, + ptr %0, + <vscale x 4 x i8> %1, + iXLen %2) + + ret <vscale x 4 x i64> %a +} + +declare <vscale x 4 x i64> @llvm.riscv.vluxei.mask.nxv4i64.nxv4i8( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x 
i8>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x i64> @intrinsic_vluxei_mask_v_nxv4i64_nxv4i64_nxv4i8(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4i64_nxv4i64_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i64> @llvm.riscv.vluxei.mask.nxv4i64.nxv4i8( + <vscale x 4 x i64> %0, + ptr %1, + <vscale x 4 x i8> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x i64> %a +} + +declare <vscale x 8 x i64> @llvm.riscv.vluxei.nxv8i64.nxv8i8( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i8>, + iXLen); + +define <vscale x 8 x i64> @intrinsic_vluxei_v_nxv8i64_nxv8i64_nxv8i8(ptr %0, <vscale x 8 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8i64_nxv8i64_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v16, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vluxei8.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i64> @llvm.riscv.vluxei.nxv8i64.nxv8i8( + <vscale x 8 x i64> poison, + ptr %0, + <vscale x 8 x i8> %1, + iXLen %2) + + ret <vscale x 8 x i64> %a +} + +declare <vscale x 8 x i64> @llvm.riscv.vluxei.mask.nxv8i64.nxv8i8( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i8>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x i64> @intrinsic_vluxei_mask_v_nxv8i64_nxv8i64_nxv8i8(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8i64_nxv8i64_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i64> @llvm.riscv.vluxei.mask.nxv8i64.nxv8i8( + <vscale x 8 x i64> %0, + ptr %1, + <vscale x 8 x i8> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x i64> %a +} + +declare <vscale x 1 x half> @llvm.riscv.vluxei.nxv1f16.nxv1i8( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i8>, + iXLen); + +define <vscale x 1 x half> @intrinsic_vluxei_v_nxv1f16_nxv1f16_nxv1i8(ptr %0, <vscale x 1 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1f16_nxv1f16_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vluxei8.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x half> @llvm.riscv.vluxei.nxv1f16.nxv1i8( + <vscale x 1 x half> poison, + ptr %0, + <vscale x 1 x i8> %1, + iXLen %2) + + ret <vscale x 1 x half> %a +} + +declare <vscale x 1 x half> @llvm.riscv.vluxei.mask.nxv1f16.nxv1i8( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i8>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x half> @intrinsic_vluxei_mask_v_nxv1f16_nxv1f16_nxv1i8(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1f16_nxv1f16_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x half> @llvm.riscv.vluxei.mask.nxv1f16.nxv1i8( + <vscale x 1 x half> %0, + ptr %1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x half> %a +} + +declare <vscale x 2 x half> 
@llvm.riscv.vluxei.nxv2f16.nxv2i8( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define <vscale x 2 x half> @intrinsic_vluxei_v_nxv2f16_nxv2f16_nxv2i8(ptr %0, <vscale x 2 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2f16_nxv2f16_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vluxei8.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x half> @llvm.riscv.vluxei.nxv2f16.nxv2i8( + <vscale x 2 x half> poison, + ptr %0, + <vscale x 2 x i8> %1, + iXLen %2) + + ret <vscale x 2 x half> %a +} + +declare <vscale x 2 x half> @llvm.riscv.vluxei.mask.nxv2f16.nxv2i8( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x half> @intrinsic_vluxei_mask_v_nxv2f16_nxv2f16_nxv2i8(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2f16_nxv2f16_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x half> @llvm.riscv.vluxei.mask.nxv2f16.nxv2i8( + <vscale x 2 x half> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x half> %a +} + +declare <vscale x 4 x half> @llvm.riscv.vluxei.nxv4f16.nxv4i8( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define <vscale x 4 x half> @intrinsic_vluxei_v_nxv4f16_nxv4f16_nxv4i8(ptr %0, <vscale x 4 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4f16_nxv4f16_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vluxei8.v v9, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x half> @llvm.riscv.vluxei.nxv4f16.nxv4i8( + <vscale x 4 x half> poison, + ptr %0, + <vscale x 4 x i8> %1, + iXLen %2) + + ret <vscale x 4 x half> %a +} + +declare <vscale x 4 x half> @llvm.riscv.vluxei.mask.nxv4f16.nxv4i8( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i8>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x half> @intrinsic_vluxei_mask_v_nxv4f16_nxv4f16_nxv4i8(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4f16_nxv4f16_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x half> @llvm.riscv.vluxei.mask.nxv4f16.nxv4i8( + <vscale x 4 x half> %0, + ptr %1, + <vscale x 4 x i8> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x half> %a +} + +declare <vscale x 8 x half> @llvm.riscv.vluxei.nxv8f16.nxv8i8( + <vscale x 8 x half>, + ptr, + <vscale x 8 x i8>, + iXLen); + +define <vscale x 8 x half> @intrinsic_vluxei_v_nxv8f16_nxv8f16_nxv8i8(ptr %0, <vscale x 8 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8f16_nxv8f16_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vluxei8.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x half> @llvm.riscv.vluxei.nxv8f16.nxv8i8( + <vscale x 8 x half> poison, + ptr %0, + <vscale x 8 x i8> %1, + iXLen %2) + + ret <vscale x 8 x half> %a +} + +declare <vscale x 8 x half> 
@llvm.riscv.vluxei.mask.nxv8f16.nxv8i8( + <vscale x 8 x half>, + ptr, + <vscale x 8 x i8>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x half> @intrinsic_vluxei_mask_v_nxv8f16_nxv8f16_nxv8i8(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8f16_nxv8f16_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x half> @llvm.riscv.vluxei.mask.nxv8f16.nxv8i8( + <vscale x 8 x half> %0, + ptr %1, + <vscale x 8 x i8> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x half> %a +} + +declare <vscale x 16 x half> @llvm.riscv.vluxei.nxv16f16.nxv16i8( + <vscale x 16 x half>, + ptr, + <vscale x 16 x i8>, + iXLen); + +define <vscale x 16 x half> @intrinsic_vluxei_v_nxv16f16_nxv16f16_nxv16i8(ptr %0, <vscale x 16 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv16f16_nxv16f16_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vluxei8.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x half> @llvm.riscv.vluxei.nxv16f16.nxv16i8( + <vscale x 16 x half> poison, + ptr %0, + <vscale x 16 x i8> %1, + iXLen %2) + + ret <vscale x 16 x half> %a +} + +declare <vscale x 16 x half> @llvm.riscv.vluxei.mask.nxv16f16.nxv16i8( + <vscale x 16 x half>, + ptr, + <vscale x 16 x i8>, + <vscale x 16 x i1>, + iXLen, + iXLen); + +define <vscale x 16 x half> @intrinsic_vluxei_mask_v_nxv16f16_nxv16f16_nxv16i8(<vscale x 16 x half> %0, ptr %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv16f16_nxv16f16_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x half> @llvm.riscv.vluxei.mask.nxv16f16.nxv16i8( + <vscale x 16 x half> %0, + ptr %1, + <vscale x 16 x i8> %2, + <vscale x 16 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 16 x half> %a +} + +declare <vscale x 32 x half> @llvm.riscv.vluxei.nxv32f16.nxv32i8( + <vscale x 32 x half>, + ptr, + <vscale x 32 x i8>, + iXLen); + +define <vscale x 32 x half> @intrinsic_vluxei_v_nxv32f16_nxv32f16_nxv32i8(ptr %0, <vscale x 32 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv32f16_nxv32f16_nxv32i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv4r.v v16, v8 +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vluxei8.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 32 x half> @llvm.riscv.vluxei.nxv32f16.nxv32i8( + <vscale x 32 x half> poison, + ptr %0, + <vscale x 32 x i8> %1, + iXLen %2) + + ret <vscale x 32 x half> %a +} + +declare <vscale x 32 x half> @llvm.riscv.vluxei.mask.nxv32f16.nxv32i8( + <vscale x 32 x half>, + ptr, + <vscale x 32 x i8>, + <vscale x 32 x i1>, + iXLen, + iXLen); + +define <vscale x 32 x half> @intrinsic_vluxei_mask_v_nxv32f16_nxv32f16_nxv32i8(<vscale x 32 x half> %0, ptr %1, <vscale x 32 x i8> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv32f16_nxv32f16_nxv32i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call 
<vscale x 32 x half> @llvm.riscv.vluxei.mask.nxv32f16.nxv32i8( + <vscale x 32 x half> %0, + ptr %1, + <vscale x 32 x i8> %2, + <vscale x 32 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 32 x half> %a +} + +declare <vscale x 1 x float> @llvm.riscv.vluxei.nxv1f32.nxv1i8( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i8>, + iXLen); + +define <vscale x 1 x float> @intrinsic_vluxei_v_nxv1f32_nxv1f32_nxv1i8(ptr %0, <vscale x 1 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1f32_nxv1f32_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vluxei8.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x float> @llvm.riscv.vluxei.nxv1f32.nxv1i8( + <vscale x 1 x float> poison, + ptr %0, + <vscale x 1 x i8> %1, + iXLen %2) + + ret <vscale x 1 x float> %a +} + +declare <vscale x 1 x float> @llvm.riscv.vluxei.mask.nxv1f32.nxv1i8( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i8>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x float> @intrinsic_vluxei_mask_v_nxv1f32_nxv1f32_nxv1i8(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1f32_nxv1f32_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x float> @llvm.riscv.vluxei.mask.nxv1f32.nxv1i8( + <vscale x 1 x float> %0, + ptr %1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x float> %a +} + +declare <vscale x 2 x float> @llvm.riscv.vluxei.nxv2f32.nxv2i8( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define <vscale x 2 x float> @intrinsic_vluxei_v_nxv2f32_nxv2f32_nxv2i8(ptr %0, <vscale x 2 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2f32_nxv2f32_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vluxei8.v v9, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x float> @llvm.riscv.vluxei.nxv2f32.nxv2i8( + <vscale x 2 x float> poison, + ptr %0, + <vscale x 2 x i8> %1, + iXLen %2) + + ret <vscale x 2 x float> %a +} + +declare <vscale x 2 x float> @llvm.riscv.vluxei.mask.nxv2f32.nxv2i8( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x float> @intrinsic_vluxei_mask_v_nxv2f32_nxv2f32_nxv2i8(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2f32_nxv2f32_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x float> @llvm.riscv.vluxei.mask.nxv2f32.nxv2i8( + <vscale x 2 x float> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x float> %a +} + +declare <vscale x 4 x float> @llvm.riscv.vluxei.nxv4f32.nxv4i8( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define <vscale x 4 x float> @intrinsic_vluxei_v_nxv4f32_nxv4f32_nxv4i8(ptr %0, <vscale x 4 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4f32_nxv4f32_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; 
CHECK-NEXT: vluxei8.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x float> @llvm.riscv.vluxei.nxv4f32.nxv4i8( + <vscale x 4 x float> poison, + ptr %0, + <vscale x 4 x i8> %1, + iXLen %2) + + ret <vscale x 4 x float> %a +} + +declare <vscale x 4 x float> @llvm.riscv.vluxei.mask.nxv4f32.nxv4i8( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i8>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x float> @intrinsic_vluxei_mask_v_nxv4f32_nxv4f32_nxv4i8(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4f32_nxv4f32_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x float> @llvm.riscv.vluxei.mask.nxv4f32.nxv4i8( + <vscale x 4 x float> %0, + ptr %1, + <vscale x 4 x i8> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x float> %a +} + +declare <vscale x 8 x float> @llvm.riscv.vluxei.nxv8f32.nxv8i8( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i8>, + iXLen); + +define <vscale x 8 x float> @intrinsic_vluxei_v_nxv8f32_nxv8f32_nxv8i8(ptr %0, <vscale x 8 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8f32_nxv8f32_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vluxei8.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x float> @llvm.riscv.vluxei.nxv8f32.nxv8i8( + <vscale x 8 x float> poison, + ptr %0, + <vscale x 8 x i8> %1, + iXLen %2) + + ret <vscale x 8 x float> %a +} + +declare <vscale x 8 x float> @llvm.riscv.vluxei.mask.nxv8f32.nxv8i8( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i8>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x float> @intrinsic_vluxei_mask_v_nxv8f32_nxv8f32_nxv8i8(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8f32_nxv8f32_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x float> @llvm.riscv.vluxei.mask.nxv8f32.nxv8i8( + <vscale x 8 x float> %0, + ptr %1, + <vscale x 8 x i8> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x float> %a +} + +declare <vscale x 16 x float> @llvm.riscv.vluxei.nxv16f32.nxv16i8( + <vscale x 16 x float>, + ptr, + <vscale x 16 x i8>, + iXLen); + +define <vscale x 16 x float> @intrinsic_vluxei_v_nxv16f32_nxv16f32_nxv16i8(ptr %0, <vscale x 16 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv16f32_nxv16f32_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv2r.v v16, v8 +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vluxei8.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x float> @llvm.riscv.vluxei.nxv16f32.nxv16i8( + <vscale x 16 x float> poison, + ptr %0, + <vscale x 16 x i8> %1, + iXLen %2) + + ret <vscale x 16 x float> %a +} + +declare <vscale x 16 x float> @llvm.riscv.vluxei.mask.nxv16f32.nxv16i8( + <vscale x 16 x float>, + ptr, + <vscale x 16 x i8>, + <vscale x 16 x i1>, + iXLen, + iXLen); + +define <vscale x 16 x float> @intrinsic_vluxei_mask_v_nxv16f32_nxv16f32_nxv16i8(<vscale x 16 x float> %0, ptr %1, <vscale x 16 x i8> %2, <vscale x 16 x 
i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv16f32_nxv16f32_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x float> @llvm.riscv.vluxei.mask.nxv16f32.nxv16i8( + <vscale x 16 x float> %0, + ptr %1, + <vscale x 16 x i8> %2, + <vscale x 16 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 16 x float> %a +} + +declare <vscale x 1 x double> @llvm.riscv.vluxei.nxv1f64.nxv1i8( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i8>, + iXLen); + +define <vscale x 1 x double> @intrinsic_vluxei_v_nxv1f64_nxv1f64_nxv1i8(ptr %0, <vscale x 1 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1f64_nxv1f64_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vluxei8.v v9, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x double> @llvm.riscv.vluxei.nxv1f64.nxv1i8( + <vscale x 1 x double> poison, + ptr %0, + <vscale x 1 x i8> %1, + iXLen %2) + + ret <vscale x 1 x double> %a +} + +declare <vscale x 1 x double> @llvm.riscv.vluxei.mask.nxv1f64.nxv1i8( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i8>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x double> @intrinsic_vluxei_mask_v_nxv1f64_nxv1f64_nxv1i8(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1f64_nxv1f64_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x double> @llvm.riscv.vluxei.mask.nxv1f64.nxv1i8( + <vscale x 1 x double> %0, + ptr %1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x double> %a +} + +declare <vscale x 2 x double> @llvm.riscv.vluxei.nxv2f64.nxv2i8( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define <vscale x 2 x double> @intrinsic_vluxei_v_nxv2f64_nxv2f64_nxv2i8(ptr %0, <vscale x 2 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2f64_nxv2f64_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vluxei8.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x double> @llvm.riscv.vluxei.nxv2f64.nxv2i8( + <vscale x 2 x double> poison, + ptr %0, + <vscale x 2 x i8> %1, + iXLen %2) + + ret <vscale x 2 x double> %a +} + +declare <vscale x 2 x double> @llvm.riscv.vluxei.mask.nxv2f64.nxv2i8( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x double> @intrinsic_vluxei_mask_v_nxv2f64_nxv2f64_nxv2i8(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2f64_nxv2f64_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x double> @llvm.riscv.vluxei.mask.nxv2f64.nxv2i8( + <vscale x 2 x double> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x double> %a +} + +declare <vscale x 4 x double> @llvm.riscv.vluxei.nxv4f64.nxv4i8( + <vscale x 4 x double>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define 
<vscale x 4 x double> @intrinsic_vluxei_v_nxv4f64_nxv4f64_nxv4i8(ptr %0, <vscale x 4 x i8> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vluxei_v_nxv4f64_nxv4f64_nxv4i8:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
+; CHECK-NEXT:    vmv1r.v v12, v8
+; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
+; CHECK-NEXT:    vluxei8.v v8, (a0), v12
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 4 x double> @llvm.riscv.vluxei.nxv4f64.nxv4i8(
+    <vscale x 4 x double> poison,
+    ptr %0,
+    <vscale x 4 x i8> %1,
+    iXLen %2)
+
+  ret <vscale x 4 x double> %a
+}
+
+declare <vscale x 4 x double> @llvm.riscv.vluxei.mask.nxv4f64.nxv4i8(
+  <vscale x 4 x double>,
+  ptr,
+  <vscale x 4 x i8>,
+  <vscale x 4 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 4 x double> @intrinsic_vluxei_mask_v_nxv4f64_nxv4f64_nxv4i8(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4f64_nxv4f64_nxv4i8:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, mu
+; CHECK-NEXT:    vluxei8.v v8, (a0), v12, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 4 x double> @llvm.riscv.vluxei.mask.nxv4f64.nxv4i8(
+    <vscale x 4 x double> %0,
+    ptr %1,
+    <vscale x 4 x i8> %2,
+    <vscale x 4 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 4 x double> %a
+}
+
+declare <vscale x 8 x double> @llvm.riscv.vluxei.nxv8f64.nxv8i8(
+  <vscale x 8 x double>,
+  ptr,
+  <vscale x 8 x i8>,
+  iXLen);
+
+define <vscale x 8 x double> @intrinsic_vluxei_v_nxv8f64_nxv8f64_nxv8i8(ptr %0, <vscale x 8 x i8> %1, iXLen %2) nounwind {
+; CHECK-LABEL: intrinsic_vluxei_v_nxv8f64_nxv8f64_nxv8i8:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
+; CHECK-NEXT:    vmv1r.v v16, v8
+; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; CHECK-NEXT:    vluxei8.v v8, (a0), v16
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 8 x double> @llvm.riscv.vluxei.nxv8f64.nxv8i8(
+    <vscale x 8 x double> poison,
+    ptr %0,
+    <vscale x 8 x i8> %1,
+    iXLen %2)
+
+  ret <vscale x 8 x double> %a
+}
+
+declare <vscale x 8 x double> @llvm.riscv.vluxei.mask.nxv8f64.nxv8i8(
+  <vscale x 8 x double>,
+  ptr,
+  <vscale x 8 x i8>,
+  <vscale x 8 x i1>,
+  iXLen,
+  iXLen);
+
+define <vscale x 8 x double> @intrinsic_vluxei_mask_v_nxv8f64_nxv8f64_nxv8i8(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8f64_nxv8f64_nxv8i8:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, mu
+; CHECK-NEXT:    vluxei8.v v8, (a0), v16, v0.t
+; CHECK-NEXT:    ret
+entry:
+  %a = call <vscale x 8 x double> @llvm.riscv.vluxei.mask.nxv8f64.nxv8i8(
+    <vscale x 8 x double> %0,
+    ptr %1,
+    <vscale x 8 x i8> %2,
+    <vscale x 8 x i1> %3,
+    iXLen %4, iXLen 1)
+
+  ret <vscale x 8 x double> %a
+}
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vsoxei-rv64.ll b/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vsoxei-rv64.ll
new file mode 100644
index 0000000000000..4963d91a14988
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vsoxei-rv64.ll
@@ -0,0 +1,1293 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv64 -mattr=+v,+zvfhmin,+zvfbfmin -global-isel -verify-machineinstrs \
+; RUN:   < %s | FileCheck %s
+
+; The intrinsics are not supported with RV32.
+ +declare void @llvm.riscv.vsoxei.nxv1i8.nxv1i64( + <vscale x 1 x i8>, + ptr, + <vscale x 1 x i64>, + i64); + +define void @intrinsic_vsoxei_v_nxv1i8_nxv1i8_nxv1i64(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1i8_nxv1i8_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv1i8.nxv1i64( + <vscale x 1 x i8> %0, + ptr %1, + <vscale x 1 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1i8.nxv1i64( + <vscale x 1 x i8>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64); + +define void @intrinsic_vsoxei_mask_v_nxv1i8_nxv1i8_nxv1i64(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1i8_nxv1i8_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv1i8.nxv1i64( + <vscale x 1 x i8> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv2i8.nxv2i64( + <vscale x 2 x i8>, + ptr, + <vscale x 2 x i64>, + i64); + +define void @intrinsic_vsoxei_v_nxv2i8_nxv2i8_nxv2i64(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2i8_nxv2i8_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv2i8.nxv2i64( + <vscale x 2 x i8> %0, + ptr %1, + <vscale x 2 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2i8.nxv2i64( + <vscale x 2 x i8>, + ptr, + <vscale x 2 x i64>, + <vscale x 2 x i1>, + i64); + +define void @intrinsic_vsoxei_mask_v_nxv2i8_nxv2i8_nxv2i64(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2i8_nxv2i8_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv2i8.nxv2i64( + <vscale x 2 x i8> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv4i8.nxv4i64( + <vscale x 4 x i8>, + ptr, + <vscale x 4 x i64>, + i64); + +define void @intrinsic_vsoxei_v_nxv4i8_nxv4i8_nxv4i64(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4i8_nxv4i8_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv4i8.nxv4i64( + <vscale x 4 x i8> %0, + ptr %1, + <vscale x 4 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4i8.nxv4i64( + <vscale x 4 x i8>, + ptr, + <vscale x 4 x i64>, + <vscale x 4 x i1>, + i64); + +define void @intrinsic_vsoxei_mask_v_nxv4i8_nxv4i8_nxv4i64(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4i8_nxv4i8_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call 
void @llvm.riscv.vsoxei.mask.nxv4i8.nxv4i64( + <vscale x 4 x i8> %0, + ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv8i8.nxv8i64( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i64>, + i64); + +define void @intrinsic_vsoxei_v_nxv8i8_nxv8i8_nxv8i64(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8i8_nxv8i8_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv8i8.nxv8i64( + <vscale x 8 x i8> %0, + ptr %1, + <vscale x 8 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8i8.nxv8i64( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i64>, + <vscale x 8 x i1>, + i64); + +define void @intrinsic_vsoxei_mask_v_nxv8i8_nxv8i8_nxv8i64(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8i8_nxv8i8_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv8i8.nxv8i64( + <vscale x 8 x i8> %0, + ptr %1, + <vscale x 8 x i64> %2, + <vscale x 8 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv1i16.nxv1i64( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i64>, + i64); + +define void @intrinsic_vsoxei_v_nxv1i16_nxv1i16_nxv1i64(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1i16_nxv1i16_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv1i16.nxv1i64( + <vscale x 1 x i16> %0, + ptr %1, + <vscale x 1 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1i16.nxv1i64( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64); + +define void @intrinsic_vsoxei_mask_v_nxv1i16_nxv1i16_nxv1i64(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1i16_nxv1i16_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv1i16.nxv1i64( + <vscale x 1 x i16> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv2i16.nxv2i64( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i64>, + i64); + +define void @intrinsic_vsoxei_v_nxv2i16_nxv2i16_nxv2i64(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2i16_nxv2i16_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv2i16.nxv2i64( + <vscale x 2 x i16> %0, + ptr %1, + <vscale x 2 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2i16.nxv2i64( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i64>, + <vscale x 2 x i1>, + i64); + +define void @intrinsic_vsoxei_mask_v_nxv2i16_nxv2i16_nxv2i64(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: 
intrinsic_vsoxei_mask_v_nxv2i16_nxv2i16_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv2i16.nxv2i64( + <vscale x 2 x i16> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv4i16.nxv4i64( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i64>, + i64); + +define void @intrinsic_vsoxei_v_nxv4i16_nxv4i16_nxv4i64(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4i16_nxv4i16_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv4i16.nxv4i64( + <vscale x 4 x i16> %0, + ptr %1, + <vscale x 4 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4i16.nxv4i64( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i64>, + <vscale x 4 x i1>, + i64); + +define void @intrinsic_vsoxei_mask_v_nxv4i16_nxv4i16_nxv4i64(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4i16_nxv4i16_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv4i16.nxv4i64( + <vscale x 4 x i16> %0, + ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv8i16.nxv8i64( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i64>, + i64); + +define void @intrinsic_vsoxei_v_nxv8i16_nxv8i16_nxv8i64(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8i16_nxv8i16_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv8i16.nxv8i64( + <vscale x 8 x i16> %0, + ptr %1, + <vscale x 8 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8i16.nxv8i64( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i64>, + <vscale x 8 x i1>, + i64); + +define void @intrinsic_vsoxei_mask_v_nxv8i16_nxv8i16_nxv8i64(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8i16_nxv8i16_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv8i16.nxv8i64( + <vscale x 8 x i16> %0, + ptr %1, + <vscale x 8 x i64> %2, + <vscale x 8 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv1i32.nxv1i64( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i64>, + i64); + +define void @intrinsic_vsoxei_v_nxv1i32_nxv1i32_nxv1i64(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1i32_nxv1i32_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv1i32.nxv1i64( + <vscale x 1 x i32> %0, + ptr %1, + <vscale x 1 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1i32.nxv1i64( + <vscale x 1 x i32>, + ptr, + <vscale x 1 
x i64>, + <vscale x 1 x i1>, + i64); + +define void @intrinsic_vsoxei_mask_v_nxv1i32_nxv1i32_nxv1i64(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1i32_nxv1i32_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv1i32.nxv1i64( + <vscale x 1 x i32> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv2i32.nxv2i64( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i64>, + i64); + +define void @intrinsic_vsoxei_v_nxv2i32_nxv2i32_nxv2i64(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2i32_nxv2i32_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv2i32.nxv2i64( + <vscale x 2 x i32> %0, + ptr %1, + <vscale x 2 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2i32.nxv2i64( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i64>, + <vscale x 2 x i1>, + i64); + +define void @intrinsic_vsoxei_mask_v_nxv2i32_nxv2i32_nxv2i64(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2i32_nxv2i32_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv2i32.nxv2i64( + <vscale x 2 x i32> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv4i32.nxv4i64( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i64>, + i64); + +define void @intrinsic_vsoxei_v_nxv4i32_nxv4i32_nxv4i64(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4i32_nxv4i32_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv4i32.nxv4i64( + <vscale x 4 x i32> %0, + ptr %1, + <vscale x 4 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4i32.nxv4i64( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i64>, + <vscale x 4 x i1>, + i64); + +define void @intrinsic_vsoxei_mask_v_nxv4i32_nxv4i32_nxv4i64(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4i32_nxv4i32_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv4i32.nxv4i64( + <vscale x 4 x i32> %0, + ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv8i32.nxv8i64( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i64>, + i64); + +define void @intrinsic_vsoxei_v_nxv8i32_nxv8i32_nxv8i64(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8i32_nxv8i32_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void 
@llvm.riscv.vsoxei.nxv8i32.nxv8i64( + <vscale x 8 x i32> %0, + ptr %1, + <vscale x 8 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8i32.nxv8i64( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i64>, + <vscale x 8 x i1>, + i64); + +define void @intrinsic_vsoxei_mask_v_nxv8i32_nxv8i32_nxv8i64(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8i32_nxv8i32_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv8i32.nxv8i64( + <vscale x 8 x i32> %0, + ptr %1, + <vscale x 8 x i64> %2, + <vscale x 8 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv1i64.nxv1i64( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i64>, + i64); + +define void @intrinsic_vsoxei_v_nxv1i64_nxv1i64_nxv1i64(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1i64_nxv1i64_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv1i64.nxv1i64( + <vscale x 1 x i64> %0, + ptr %1, + <vscale x 1 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1i64.nxv1i64( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64); + +define void @intrinsic_vsoxei_mask_v_nxv1i64_nxv1i64_nxv1i64(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1i64_nxv1i64_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv1i64.nxv1i64( + <vscale x 1 x i64> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv2i64.nxv2i64( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i64>, + i64); + +define void @intrinsic_vsoxei_v_nxv2i64_nxv2i64_nxv2i64(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2i64_nxv2i64_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv2i64.nxv2i64( + <vscale x 2 x i64> %0, + ptr %1, + <vscale x 2 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2i64.nxv2i64( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i64>, + <vscale x 2 x i1>, + i64); + +define void @intrinsic_vsoxei_mask_v_nxv2i64_nxv2i64_nxv2i64(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2i64_nxv2i64_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv2i64.nxv2i64( + <vscale x 2 x i64> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv4i64.nxv4i64( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i64>, + i64); + +define void @intrinsic_vsoxei_v_nxv4i64_nxv4i64_nxv4i64(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i64> %2, i64 %3) nounwind { +; 
CHECK-LABEL: intrinsic_vsoxei_v_nxv4i64_nxv4i64_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv4i64.nxv4i64( + <vscale x 4 x i64> %0, + ptr %1, + <vscale x 4 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4i64.nxv4i64( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i64>, + <vscale x 4 x i1>, + i64); + +define void @intrinsic_vsoxei_mask_v_nxv4i64_nxv4i64_nxv4i64(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4i64_nxv4i64_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv4i64.nxv4i64( + <vscale x 4 x i64> %0, + ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv8i64.nxv8i64( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i64>, + i64); + +define void @intrinsic_vsoxei_v_nxv8i64_nxv8i64_nxv8i64(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8i64_nxv8i64_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv8i64.nxv8i64( + <vscale x 8 x i64> %0, + ptr %1, + <vscale x 8 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8i64.nxv8i64( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i64>, + <vscale x 8 x i1>, + i64); + +define void @intrinsic_vsoxei_mask_v_nxv8i64_nxv8i64_nxv8i64(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8i64_nxv8i64_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv8i64.nxv8i64( + <vscale x 8 x i64> %0, + ptr %1, + <vscale x 8 x i64> %2, + <vscale x 8 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv1f16.nxv1i64( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i64>, + i64); + +define void @intrinsic_vsoxei_v_nxv1f16_nxv1f16_nxv1i64(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1f16_nxv1f16_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv1f16.nxv1i64( + <vscale x 1 x half> %0, + ptr %1, + <vscale x 1 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1f16.nxv1i64( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64); + +define void @intrinsic_vsoxei_mask_v_nxv1f16_nxv1f16_nxv1i64(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1f16_nxv1f16_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv1f16.nxv1i64( + <vscale x 1 x half> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> %3, + i64 %4) + + ret void +} + +declare void 
@llvm.riscv.vsoxei.nxv2f16.nxv2i64( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i64>, + i64); + +define void @intrinsic_vsoxei_v_nxv2f16_nxv2f16_nxv2i64(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2f16_nxv2f16_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv2f16.nxv2i64( + <vscale x 2 x half> %0, + ptr %1, + <vscale x 2 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2f16.nxv2i64( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i64>, + <vscale x 2 x i1>, + i64); + +define void @intrinsic_vsoxei_mask_v_nxv2f16_nxv2f16_nxv2i64(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2f16_nxv2f16_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv2f16.nxv2i64( + <vscale x 2 x half> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv4f16.nxv4i64( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i64>, + i64); + +define void @intrinsic_vsoxei_v_nxv4f16_nxv4f16_nxv4i64(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4f16_nxv4f16_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv4f16.nxv4i64( + <vscale x 4 x half> %0, + ptr %1, + <vscale x 4 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4f16.nxv4i64( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i64>, + <vscale x 4 x i1>, + i64); + +define void @intrinsic_vsoxei_mask_v_nxv4f16_nxv4f16_nxv4i64(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4f16_nxv4f16_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv4f16.nxv4i64( + <vscale x 4 x half> %0, + ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv8f16.nxv8i64( + <vscale x 8 x half>, + ptr, + <vscale x 8 x i64>, + i64); + +define void @intrinsic_vsoxei_v_nxv8f16_nxv8f16_nxv8i64(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8f16_nxv8f16_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv8f16.nxv8i64( + <vscale x 8 x half> %0, + ptr %1, + <vscale x 8 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8f16.nxv8i64( + <vscale x 8 x half>, + ptr, + <vscale x 8 x i64>, + <vscale x 8 x i1>, + i64); + +define void @intrinsic_vsoxei_mask_v_nxv8f16_nxv8f16_nxv8i64(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8f16_nxv8f16_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: 
vsoxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv8f16.nxv8i64( + <vscale x 8 x half> %0, + ptr %1, + <vscale x 8 x i64> %2, + <vscale x 8 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv1f32.nxv1i64( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i64>, + i64); + +define void @intrinsic_vsoxei_v_nxv1f32_nxv1f32_nxv1i64(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1f32_nxv1f32_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv1f32.nxv1i64( + <vscale x 1 x float> %0, + ptr %1, + <vscale x 1 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1f32.nxv1i64( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64); + +define void @intrinsic_vsoxei_mask_v_nxv1f32_nxv1f32_nxv1i64(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1f32_nxv1f32_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv1f32.nxv1i64( + <vscale x 1 x float> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv2f32.nxv2i64( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i64>, + i64); + +define void @intrinsic_vsoxei_v_nxv2f32_nxv2f32_nxv2i64(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2f32_nxv2f32_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv2f32.nxv2i64( + <vscale x 2 x float> %0, + ptr %1, + <vscale x 2 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2f32.nxv2i64( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i64>, + <vscale x 2 x i1>, + i64); + +define void @intrinsic_vsoxei_mask_v_nxv2f32_nxv2f32_nxv2i64(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2f32_nxv2f32_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv2f32.nxv2i64( + <vscale x 2 x float> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv4f32.nxv4i64( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i64>, + i64); + +define void @intrinsic_vsoxei_v_nxv4f32_nxv4f32_nxv4i64(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4f32_nxv4f32_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv4f32.nxv4i64( + <vscale x 4 x float> %0, + ptr %1, + <vscale x 4 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4f32.nxv4i64( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i64>, + <vscale x 4 x i1>, + i64); + +define void @intrinsic_vsoxei_mask_v_nxv4f32_nxv4f32_nxv4i64(<vscale 
x 4 x float> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4f32_nxv4f32_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv4f32.nxv4i64( + <vscale x 4 x float> %0, + ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv8f32.nxv8i64( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i64>, + i64); + +define void @intrinsic_vsoxei_v_nxv8f32_nxv8f32_nxv8i64(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8f32_nxv8f32_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv8f32.nxv8i64( + <vscale x 8 x float> %0, + ptr %1, + <vscale x 8 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8f32.nxv8i64( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i64>, + <vscale x 8 x i1>, + i64); + +define void @intrinsic_vsoxei_mask_v_nxv8f32_nxv8f32_nxv8i64(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8f32_nxv8f32_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv8f32.nxv8i64( + <vscale x 8 x float> %0, + ptr %1, + <vscale x 8 x i64> %2, + <vscale x 8 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv1f64.nxv1i64( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i64>, + i64); + +define void @intrinsic_vsoxei_v_nxv1f64_nxv1f64_nxv1i64(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1f64_nxv1f64_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv1f64.nxv1i64( + <vscale x 1 x double> %0, + ptr %1, + <vscale x 1 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1f64.nxv1i64( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64); + +define void @intrinsic_vsoxei_mask_v_nxv1f64_nxv1f64_nxv1i64(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1f64_nxv1f64_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv1f64.nxv1i64( + <vscale x 1 x double> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv2f64.nxv2i64( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i64>, + i64); + +define void @intrinsic_vsoxei_v_nxv2f64_nxv2f64_nxv2i64(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2f64_nxv2f64_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv2f64.nxv2i64( + <vscale x 2 x double> %0, + ptr %1, + 
    <vscale x 2 x i64> %2,
+    i64 %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.mask.nxv2f64.nxv2i64(
+  <vscale x 2 x double>,
+  ptr,
+  <vscale x 2 x i64>,
+  <vscale x 2 x i1>,
+  i64);
+
+define void @intrinsic_vsoxei_mask_v_nxv2f64_nxv2f64_nxv2i64(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2f64_nxv2f64_nxv2i64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
+; CHECK-NEXT:    vsoxei64.v v8, (a0), v10, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.mask.nxv2f64.nxv2i64(
+    <vscale x 2 x double> %0,
+    ptr %1,
+    <vscale x 2 x i64> %2,
+    <vscale x 2 x i1> %3,
+    i64 %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.nxv4f64.nxv4i64(
+  <vscale x 4 x double>,
+  ptr,
+  <vscale x 4 x i64>,
+  i64);
+
+define void @intrinsic_vsoxei_v_nxv4f64_nxv4f64_nxv4i64(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i64> %2, i64 %3) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_v_nxv4f64_nxv4f64_nxv4i64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
+; CHECK-NEXT:    vsoxei64.v v8, (a0), v12
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.nxv4f64.nxv4i64(
+    <vscale x 4 x double> %0,
+    ptr %1,
+    <vscale x 4 x i64> %2,
+    i64 %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.mask.nxv4f64.nxv4i64(
+  <vscale x 4 x double>,
+  ptr,
+  <vscale x 4 x i64>,
+  <vscale x 4 x i1>,
+  i64);
+
+define void @intrinsic_vsoxei_mask_v_nxv4f64_nxv4f64_nxv4i64(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4f64_nxv4f64_nxv4i64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
+; CHECK-NEXT:    vsoxei64.v v8, (a0), v12, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.mask.nxv4f64.nxv4i64(
+    <vscale x 4 x double> %0,
+    ptr %1,
+    <vscale x 4 x i64> %2,
+    <vscale x 4 x i1> %3,
+    i64 %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.nxv8f64.nxv8i64(
+  <vscale x 8 x double>,
+  ptr,
+  <vscale x 8 x i64>,
+  i64);
+
+define void @intrinsic_vsoxei_v_nxv8f64_nxv8f64_nxv8i64(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i64> %2, i64 %3) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_v_nxv8f64_nxv8f64_nxv8i64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; CHECK-NEXT:    vsoxei64.v v8, (a0), v16
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.nxv8f64.nxv8i64(
+    <vscale x 8 x double> %0,
+    ptr %1,
+    <vscale x 8 x i64> %2,
+    i64 %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.mask.nxv8f64.nxv8i64(
+  <vscale x 8 x double>,
+  ptr,
+  <vscale x 8 x i64>,
+  <vscale x 8 x i1>,
+  i64);
+
+define void @intrinsic_vsoxei_mask_v_nxv8f64_nxv8f64_nxv8i64(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8f64_nxv8f64_nxv8i64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; CHECK-NEXT:    vsoxei64.v v8, (a0), v16, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.mask.nxv8f64.nxv8i64(
+    <vscale x 8 x double> %0,
+    ptr %1,
+    <vscale x 8 x i64> %2,
+    <vscale x 8 x i1> %3,
+    i64 %4)
+
+  ret void
+}
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vsoxei.ll b/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vsoxei.ll
new file mode 100644
index 0000000000000..7ea2e1734e5a2
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vsoxei.ll
@@ -0,0 +1,4881 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zvfhmin,+zvfbfmin \
+; RUN:   -global-isel -verify-machineinstrs -target-abi=ilp32d | FileCheck %s
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zvfhmin,+zvfbfmin \
+; RUN:   -global-isel -verify-machineinstrs -target-abi=lp64d | FileCheck %s
+
+declare void @llvm.riscv.vsoxei.nxv1i8.nxv1i32(
+  <vscale x 1 x i8>,
+  ptr,
+  <vscale x 1 x i32>,
+  iXLen);
+
+define void @intrinsic_vsoxei_v_nxv1i8_nxv1i8_nxv1i32(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i32> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_v_nxv1i8_nxv1i8_nxv1i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma
+; CHECK-NEXT:    vsoxei32.v v8, (a0), v9
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.nxv1i8.nxv1i32(
+    <vscale x 1 x i8> %0,
+    ptr %1,
+    <vscale x 1 x i32> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.mask.nxv1i8.nxv1i32(
+  <vscale x 1 x i8>,
+  ptr,
+  <vscale x 1 x i32>,
+  <vscale x 1 x i1>,
+  iXLen);
+
+define void @intrinsic_vsoxei_mask_v_nxv1i8_nxv1i8_nxv1i32(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1i8_nxv1i8_nxv1i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma
+; CHECK-NEXT:    vsoxei32.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.mask.nxv1i8.nxv1i32(
+    <vscale x 1 x i8> %0,
+    ptr %1,
+    <vscale x 1 x i32> %2,
+    <vscale x 1 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.nxv2i8.nxv2i32(
+  <vscale x 2 x i8>,
+  ptr,
+  <vscale x 2 x i32>,
+  iXLen);
+
+define void @intrinsic_vsoxei_v_nxv2i8_nxv2i8_nxv2i32(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i32> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_v_nxv2i8_nxv2i8_nxv2i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma
+; CHECK-NEXT:    vsoxei32.v v8, (a0), v9
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.nxv2i8.nxv2i32(
+    <vscale x 2 x i8> %0,
+    ptr %1,
+    <vscale x 2 x i32> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.mask.nxv2i8.nxv2i32(
+  <vscale x 2 x i8>,
+  ptr,
+  <vscale x 2 x i32>,
+  <vscale x 2 x i1>,
+  iXLen);
+
+define void @intrinsic_vsoxei_mask_v_nxv2i8_nxv2i8_nxv2i32(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2i8_nxv2i8_nxv2i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma
+; CHECK-NEXT:    vsoxei32.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.mask.nxv2i8.nxv2i32(
+    <vscale x 2 x i8> %0,
+    ptr %1,
+    <vscale x 2 x i32> %2,
+    <vscale x 2 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.nxv4i8.nxv4i32(
+  <vscale x 4 x i8>,
+  ptr,
+  <vscale x 4 x i32>,
+  iXLen);
+
+define void @intrinsic_vsoxei_v_nxv4i8_nxv4i8_nxv4i32(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i32> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_v_nxv4i8_nxv4i8_nxv4i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
+; CHECK-NEXT:    vsoxei32.v v8, (a0), v10
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.nxv4i8.nxv4i32(
+    <vscale x 4 x i8> %0,
+    ptr %1,
+    <vscale x 4 x i32> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.mask.nxv4i8.nxv4i32(
+  <vscale x 4 x i8>,
+  ptr,
+  <vscale x 4 x 
i32>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv4i8_nxv4i8_nxv4i32(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4i8_nxv4i8_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv4i8.nxv4i32( + <vscale x 4 x i8> %0, + ptr %1, + <vscale x 4 x i32> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv8i8.nxv8i32( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv8i8_nxv8i8_nxv8i32(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8i8_nxv8i8_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv8i8.nxv8i32( + <vscale x 8 x i8> %0, + ptr %1, + <vscale x 8 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8i8.nxv8i32( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i32>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv8i8_nxv8i8_nxv8i32(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8i8_nxv8i8_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv8i8.nxv8i32( + <vscale x 8 x i8> %0, + ptr %1, + <vscale x 8 x i32> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv16i8.nxv16i32( + <vscale x 16 x i8>, + ptr, + <vscale x 16 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv16i8_nxv16i8_nxv16i32(<vscale x 16 x i8> %0, ptr %1, <vscale x 16 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv16i8_nxv16i8_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv16i8.nxv16i32( + <vscale x 16 x i8> %0, + ptr %1, + <vscale x 16 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv16i8.nxv16i32( + <vscale x 16 x i8>, + ptr, + <vscale x 16 x i32>, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv16i8_nxv16i8_nxv16i32(<vscale x 16 x i8> %0, ptr %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv16i8_nxv16i8_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv16i8.nxv16i32( + <vscale x 16 x i8> %0, + ptr %1, + <vscale x 16 x i32> %2, + <vscale x 16 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv1i16.nxv1i32( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv1i16_nxv1i16_nxv1i32(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1i16_nxv1i16_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + 
call void @llvm.riscv.vsoxei.nxv1i16.nxv1i32( + <vscale x 1 x i16> %0, + ptr %1, + <vscale x 1 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1i16.nxv1i32( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i32>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv1i16_nxv1i16_nxv1i32(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1i16_nxv1i16_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv1i16.nxv1i32( + <vscale x 1 x i16> %0, + ptr %1, + <vscale x 1 x i32> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv2i16.nxv2i32( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv2i16_nxv2i16_nxv2i32(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2i16_nxv2i16_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv2i16.nxv2i32( + <vscale x 2 x i16> %0, + ptr %1, + <vscale x 2 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2i16.nxv2i32( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i32>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv2i16_nxv2i16_nxv2i32(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2i16_nxv2i16_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv2i16.nxv2i32( + <vscale x 2 x i16> %0, + ptr %1, + <vscale x 2 x i32> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv4i16.nxv4i32( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv4i16_nxv4i16_nxv4i32(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4i16_nxv4i16_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv4i16.nxv4i32( + <vscale x 4 x i16> %0, + ptr %1, + <vscale x 4 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4i16.nxv4i32( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i32>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv4i16_nxv4i16_nxv4i32(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4i16_nxv4i16_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv4i16.nxv4i32( + <vscale x 4 x i16> %0, + ptr %1, + <vscale x 4 x i32> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv8i16.nxv8i32( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv8i16_nxv8i16_nxv8i32(<vscale x 8 x i16> %0, ptr %1, 
<vscale x 8 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8i16_nxv8i16_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv8i16.nxv8i32( + <vscale x 8 x i16> %0, + ptr %1, + <vscale x 8 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8i16.nxv8i32( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i32>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv8i16_nxv8i16_nxv8i32(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8i16_nxv8i16_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv8i16.nxv8i32( + <vscale x 8 x i16> %0, + ptr %1, + <vscale x 8 x i32> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv16i16.nxv16i32( + <vscale x 16 x i16>, + ptr, + <vscale x 16 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv16i16_nxv16i16_nxv16i32(<vscale x 16 x i16> %0, ptr %1, <vscale x 16 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv16i16_nxv16i16_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv16i16.nxv16i32( + <vscale x 16 x i16> %0, + ptr %1, + <vscale x 16 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv16i16.nxv16i32( + <vscale x 16 x i16>, + ptr, + <vscale x 16 x i32>, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv16i16_nxv16i16_nxv16i32(<vscale x 16 x i16> %0, ptr %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv16i16_nxv16i16_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv16i16.nxv16i32( + <vscale x 16 x i16> %0, + ptr %1, + <vscale x 16 x i32> %2, + <vscale x 16 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv1i32.nxv1i32( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv1i32_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1i32_nxv1i32_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv1i32.nxv1i32( + <vscale x 1 x i32> %0, + ptr %1, + <vscale x 1 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1i32.nxv1i32( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i32>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv1i32_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1i32_nxv1i32_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv1i32.nxv1i32( + <vscale x 1 x i32> %0, + ptr 
%1, + <vscale x 1 x i32> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv2i32.nxv2i32( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv2i32_nxv2i32_nxv2i32(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2i32_nxv2i32_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv2i32.nxv2i32( + <vscale x 2 x i32> %0, + ptr %1, + <vscale x 2 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2i32.nxv2i32( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i32>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv2i32_nxv2i32_nxv2i32(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2i32_nxv2i32_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv2i32.nxv2i32( + <vscale x 2 x i32> %0, + ptr %1, + <vscale x 2 x i32> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv4i32.nxv4i32( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv4i32_nxv4i32_nxv4i32(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4i32_nxv4i32_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv4i32.nxv4i32( + <vscale x 4 x i32> %0, + ptr %1, + <vscale x 4 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4i32.nxv4i32( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i32>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv4i32_nxv4i32_nxv4i32(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4i32_nxv4i32_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv4i32.nxv4i32( + <vscale x 4 x i32> %0, + ptr %1, + <vscale x 4 x i32> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv8i32.nxv8i32( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv8i32_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8i32_nxv8i32_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv8i32.nxv8i32( + <vscale x 8 x i32> %0, + ptr %1, + <vscale x 8 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8i32.nxv8i32( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i32>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv8i32_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: 
intrinsic_vsoxei_mask_v_nxv8i32_nxv8i32_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv8i32.nxv8i32( + <vscale x 8 x i32> %0, + ptr %1, + <vscale x 8 x i32> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv16i32.nxv16i32( + <vscale x 16 x i32>, + ptr, + <vscale x 16 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv16i32_nxv16i32_nxv16i32(<vscale x 16 x i32> %0, ptr %1, <vscale x 16 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv16i32_nxv16i32_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv16i32.nxv16i32( + <vscale x 16 x i32> %0, + ptr %1, + <vscale x 16 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv16i32.nxv16i32( + <vscale x 16 x i32>, + ptr, + <vscale x 16 x i32>, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv16i32_nxv16i32_nxv16i32(<vscale x 16 x i32> %0, ptr %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv16i32_nxv16i32_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv16i32.nxv16i32( + <vscale x 16 x i32> %0, + ptr %1, + <vscale x 16 x i32> %2, + <vscale x 16 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv1i64.nxv1i32( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv1i64_nxv1i64_nxv1i32(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1i64_nxv1i64_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv1i64.nxv1i32( + <vscale x 1 x i64> %0, + ptr %1, + <vscale x 1 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1i64.nxv1i32( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i32>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv1i64_nxv1i64_nxv1i32(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1i64_nxv1i64_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv1i64.nxv1i32( + <vscale x 1 x i64> %0, + ptr %1, + <vscale x 1 x i32> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv2i64.nxv2i32( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv2i64_nxv2i64_nxv2i32(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2i64_nxv2i64_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv2i64.nxv2i32( + <vscale x 2 x i64> %0, + ptr %1, + <vscale x 2 x i32> %2, + iXLen %3) + + ret void +} + +declare void 
@llvm.riscv.vsoxei.mask.nxv2i64.nxv2i32( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i32>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv2i64_nxv2i64_nxv2i32(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2i64_nxv2i64_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv2i64.nxv2i32( + <vscale x 2 x i64> %0, + ptr %1, + <vscale x 2 x i32> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv4i64.nxv4i32( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv4i64_nxv4i64_nxv4i32(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4i64_nxv4i64_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv4i64.nxv4i32( + <vscale x 4 x i64> %0, + ptr %1, + <vscale x 4 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4i64.nxv4i32( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i32>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv4i64_nxv4i64_nxv4i32(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4i64_nxv4i64_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv4i64.nxv4i32( + <vscale x 4 x i64> %0, + ptr %1, + <vscale x 4 x i32> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv8i64.nxv8i32( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv8i64_nxv8i64_nxv8i32(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8i64_nxv8i64_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv8i64.nxv8i32( + <vscale x 8 x i64> %0, + ptr %1, + <vscale x 8 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8i64.nxv8i32( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i32>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv8i64_nxv8i64_nxv8i32(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8i64_nxv8i64_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv8i64.nxv8i32( + <vscale x 8 x i64> %0, + ptr %1, + <vscale x 8 x i32> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv1f16.nxv1i32( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv1f16_nxv1f16_nxv1i32(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1f16_nxv1f16_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: 
vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv1f16.nxv1i32( + <vscale x 1 x half> %0, + ptr %1, + <vscale x 1 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1f16.nxv1i32( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i32>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv1f16_nxv1f16_nxv1i32(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1f16_nxv1f16_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv1f16.nxv1i32( + <vscale x 1 x half> %0, + ptr %1, + <vscale x 1 x i32> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv2f16.nxv2i32( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv2f16_nxv2f16_nxv2i32(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2f16_nxv2f16_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv2f16.nxv2i32( + <vscale x 2 x half> %0, + ptr %1, + <vscale x 2 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2f16.nxv2i32( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i32>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv2f16_nxv2f16_nxv2i32(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2f16_nxv2f16_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv2f16.nxv2i32( + <vscale x 2 x half> %0, + ptr %1, + <vscale x 2 x i32> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv4f16.nxv4i32( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv4f16_nxv4f16_nxv4i32(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4f16_nxv4f16_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv4f16.nxv4i32( + <vscale x 4 x half> %0, + ptr %1, + <vscale x 4 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4f16.nxv4i32( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i32>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv4f16_nxv4f16_nxv4i32(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4f16_nxv4f16_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv4f16.nxv4i32( + <vscale x 4 x half> %0, + ptr %1, + <vscale x 4 x i32> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv8f16.nxv8i32( + <vscale x 8 x half>, + ptr, + 
<vscale x 8 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv8f16_nxv8f16_nxv8i32(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8f16_nxv8f16_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv8f16.nxv8i32( + <vscale x 8 x half> %0, + ptr %1, + <vscale x 8 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8f16.nxv8i32( + <vscale x 8 x half>, + ptr, + <vscale x 8 x i32>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv8f16_nxv8f16_nxv8i32(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8f16_nxv8f16_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv8f16.nxv8i32( + <vscale x 8 x half> %0, + ptr %1, + <vscale x 8 x i32> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv16f16.nxv16i32( + <vscale x 16 x half>, + ptr, + <vscale x 16 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv16f16_nxv16f16_nxv16i32(<vscale x 16 x half> %0, ptr %1, <vscale x 16 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv16f16_nxv16f16_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv16f16.nxv16i32( + <vscale x 16 x half> %0, + ptr %1, + <vscale x 16 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv16f16.nxv16i32( + <vscale x 16 x half>, + ptr, + <vscale x 16 x i32>, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv16f16_nxv16f16_nxv16i32(<vscale x 16 x half> %0, ptr %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv16f16_nxv16f16_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv16f16.nxv16i32( + <vscale x 16 x half> %0, + ptr %1, + <vscale x 16 x i32> %2, + <vscale x 16 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv1f32.nxv1i32( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv1f32_nxv1f32_nxv1i32(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1f32_nxv1f32_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv1f32.nxv1i32( + <vscale x 1 x float> %0, + ptr %1, + <vscale x 1 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1f32.nxv1i32( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i32>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv1f32_nxv1f32_nxv1i32(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1f32_nxv1f32_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: 
vsoxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv1f32.nxv1i32( + <vscale x 1 x float> %0, + ptr %1, + <vscale x 1 x i32> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv2f32.nxv2i32( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv2f32_nxv2f32_nxv2i32(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2f32_nxv2f32_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv2f32.nxv2i32( + <vscale x 2 x float> %0, + ptr %1, + <vscale x 2 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2f32.nxv2i32( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i32>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv2f32_nxv2f32_nxv2i32(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2f32_nxv2f32_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv2f32.nxv2i32( + <vscale x 2 x float> %0, + ptr %1, + <vscale x 2 x i32> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv4f32.nxv4i32( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv4f32_nxv4f32_nxv4i32(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4f32_nxv4f32_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv4f32.nxv4i32( + <vscale x 4 x float> %0, + ptr %1, + <vscale x 4 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4f32.nxv4i32( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i32>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv4f32_nxv4f32_nxv4i32(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4f32_nxv4f32_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv4f32.nxv4i32( + <vscale x 4 x float> %0, + ptr %1, + <vscale x 4 x i32> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv8f32.nxv8i32( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv8f32_nxv8f32_nxv8i32(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8f32_nxv8f32_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv8f32.nxv8i32( + <vscale x 8 x float> %0, + ptr %1, + <vscale x 8 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8f32.nxv8i32( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i32>, + <vscale x 8 x i1>, + iXLen); + +define void 
+define void @intrinsic_vsoxei_mask_v_nxv8f32_nxv8f32_nxv8i32(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8f32_nxv8f32_nxv8i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
+; CHECK-NEXT:    vsoxei32.v v8, (a0), v12, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.mask.nxv8f32.nxv8i32(
+    <vscale x 8 x float> %0,
+    ptr %1,
+    <vscale x 8 x i32> %2,
+    <vscale x 8 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.nxv16f32.nxv16i32(
+  <vscale x 16 x float>,
+  ptr,
+  <vscale x 16 x i32>,
+  iXLen);
+
+define void @intrinsic_vsoxei_v_nxv16f32_nxv16f32_nxv16i32(<vscale x 16 x float> %0, ptr %1, <vscale x 16 x i32> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_v_nxv16f32_nxv16f32_nxv16i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
+; CHECK-NEXT:    vsoxei32.v v8, (a0), v16
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.nxv16f32.nxv16i32(
+    <vscale x 16 x float> %0,
+    ptr %1,
+    <vscale x 16 x i32> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.mask.nxv16f32.nxv16i32(
+  <vscale x 16 x float>,
+  ptr,
+  <vscale x 16 x i32>,
+  <vscale x 16 x i1>,
+  iXLen);
+
+define void @intrinsic_vsoxei_mask_v_nxv16f32_nxv16f32_nxv16i32(<vscale x 16 x float> %0, ptr %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv16f32_nxv16f32_nxv16i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
+; CHECK-NEXT:    vsoxei32.v v8, (a0), v16, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.mask.nxv16f32.nxv16i32(
+    <vscale x 16 x float> %0,
+    ptr %1,
+    <vscale x 16 x i32> %2,
+    <vscale x 16 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.nxv1f64.nxv1i32(
+  <vscale x 1 x double>,
+  ptr,
+  <vscale x 1 x i32>,
+  iXLen);
+
+define void @intrinsic_vsoxei_v_nxv1f64_nxv1f64_nxv1i32(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i32> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_v_nxv1f64_nxv1f64_nxv1i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, ma
+; CHECK-NEXT:    vsoxei32.v v8, (a0), v9
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.nxv1f64.nxv1i32(
+    <vscale x 1 x double> %0,
+    ptr %1,
+    <vscale x 1 x i32> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.mask.nxv1f64.nxv1i32(
+  <vscale x 1 x double>,
+  ptr,
+  <vscale x 1 x i32>,
+  <vscale x 1 x i1>,
+  iXLen);
+
+define void @intrinsic_vsoxei_mask_v_nxv1f64_nxv1f64_nxv1i32(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1f64_nxv1f64_nxv1i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, ma
+; CHECK-NEXT:    vsoxei32.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.mask.nxv1f64.nxv1i32(
+    <vscale x 1 x double> %0,
+    ptr %1,
+    <vscale x 1 x i32> %2,
+    <vscale x 1 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.nxv2f64.nxv2i32(
+  <vscale x 2 x double>,
+  ptr,
+  <vscale x 2 x i32>,
+  iXLen);
+
+define void @intrinsic_vsoxei_v_nxv2f64_nxv2f64_nxv2i32(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i32> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_v_nxv2f64_nxv2f64_nxv2i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
+; CHECK-NEXT:    vsoxei32.v v8, (a0), v10
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.nxv2f64.nxv2i32(
+    <vscale x 2 x double> %0,
+    ptr %1,
+    <vscale x 2 x i32> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.mask.nxv2f64.nxv2i32(
+  <vscale x 2 x double>,
+  ptr,
+  <vscale x 2 x i32>,
+  <vscale x 2 x i1>,
+  iXLen);
+
+define void @intrinsic_vsoxei_mask_v_nxv2f64_nxv2f64_nxv2i32(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2f64_nxv2f64_nxv2i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
+; CHECK-NEXT:    vsoxei32.v v8, (a0), v10, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.mask.nxv2f64.nxv2i32(
+    <vscale x 2 x double> %0,
+    ptr %1,
+    <vscale x 2 x i32> %2,
+    <vscale x 2 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.nxv4f64.nxv4i32(
+  <vscale x 4 x double>,
+  ptr,
+  <vscale x 4 x i32>,
+  iXLen);
+
+define void @intrinsic_vsoxei_v_nxv4f64_nxv4f64_nxv4i32(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i32> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_v_nxv4f64_nxv4f64_nxv4i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
+; CHECK-NEXT:    vsoxei32.v v8, (a0), v12
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.nxv4f64.nxv4i32(
+    <vscale x 4 x double> %0,
+    ptr %1,
+    <vscale x 4 x i32> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.mask.nxv4f64.nxv4i32(
+  <vscale x 4 x double>,
+  ptr,
+  <vscale x 4 x i32>,
+  <vscale x 4 x i1>,
+  iXLen);
+
+define void @intrinsic_vsoxei_mask_v_nxv4f64_nxv4f64_nxv4i32(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4f64_nxv4f64_nxv4i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
+; CHECK-NEXT:    vsoxei32.v v8, (a0), v12, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.mask.nxv4f64.nxv4i32(
+    <vscale x 4 x double> %0,
+    ptr %1,
+    <vscale x 4 x i32> %2,
+    <vscale x 4 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.nxv8f64.nxv8i32(
+  <vscale x 8 x double>,
+  ptr,
+  <vscale x 8 x i32>,
+  iXLen);
+
+define void @intrinsic_vsoxei_v_nxv8f64_nxv8f64_nxv8i32(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i32> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_v_nxv8f64_nxv8f64_nxv8i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; CHECK-NEXT:    vsoxei32.v v8, (a0), v16
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.nxv8f64.nxv8i32(
+    <vscale x 8 x double> %0,
+    ptr %1,
+    <vscale x 8 x i32> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.mask.nxv8f64.nxv8i32(
+  <vscale x 8 x double>,
+  ptr,
+  <vscale x 8 x i32>,
+  <vscale x 8 x i1>,
+  iXLen);
+
+define void @intrinsic_vsoxei_mask_v_nxv8f64_nxv8f64_nxv8i32(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8f64_nxv8f64_nxv8i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; CHECK-NEXT:    vsoxei32.v v8, (a0), v16, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.mask.nxv8f64.nxv8i32(
+    <vscale x 8 x double> %0,
+    ptr %1,
+    <vscale x 8 x i32> %2,
+    <vscale x 8 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.nxv1i8.nxv1i16(
+  <vscale x 1 x i8>,
+  ptr,
+  <vscale x 1 x i16>,
+  iXLen);
+
+define void @intrinsic_vsoxei_v_nxv1i8_nxv1i8_nxv1i16(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i16> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_v_nxv1i8_nxv1i8_nxv1i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma
+; CHECK-NEXT:    vsoxei16.v v8, (a0), v9
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.nxv1i8.nxv1i16(
+    <vscale x 1 x i8> %0,
+    ptr %1,
+    <vscale x 1 x i16> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.mask.nxv1i8.nxv1i16(
+  <vscale x 1 x i8>,
+  ptr,
+  <vscale x 1 x i16>,
+  <vscale x 1 x i1>,
+  iXLen);
+
+define void @intrinsic_vsoxei_mask_v_nxv1i8_nxv1i8_nxv1i16(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1i8_nxv1i8_nxv1i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma
+; CHECK-NEXT:    vsoxei16.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.mask.nxv1i8.nxv1i16(
+    <vscale x 1 x i8> %0,
+    ptr %1,
+    <vscale x 1 x i16> %2,
+    <vscale x 1 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.nxv2i8.nxv2i16(
+  <vscale x 2 x i8>,
+  ptr,
+  <vscale x 2 x i16>,
+  iXLen);
+
+define void @intrinsic_vsoxei_v_nxv2i8_nxv2i8_nxv2i16(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i16> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_v_nxv2i8_nxv2i8_nxv2i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma
+; CHECK-NEXT:    vsoxei16.v v8, (a0), v9
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.nxv2i8.nxv2i16(
+    <vscale x 2 x i8> %0,
+    ptr %1,
+    <vscale x 2 x i16> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.mask.nxv2i8.nxv2i16(
+  <vscale x 2 x i8>,
+  ptr,
+  <vscale x 2 x i16>,
+  <vscale x 2 x i1>,
+  iXLen);
+
+define void @intrinsic_vsoxei_mask_v_nxv2i8_nxv2i8_nxv2i16(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2i8_nxv2i8_nxv2i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma
+; CHECK-NEXT:    vsoxei16.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.mask.nxv2i8.nxv2i16(
+    <vscale x 2 x i8> %0,
+    ptr %1,
+    <vscale x 2 x i16> %2,
+    <vscale x 2 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.nxv4i8.nxv4i16(
+  <vscale x 4 x i8>,
+  ptr,
+  <vscale x 4 x i16>,
+  iXLen);
+
+define void @intrinsic_vsoxei_v_nxv4i8_nxv4i8_nxv4i16(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i16> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_v_nxv4i8_nxv4i8_nxv4i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
+; CHECK-NEXT:    vsoxei16.v v8, (a0), v9
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.nxv4i8.nxv4i16(
+    <vscale x 4 x i8> %0,
+    ptr %1,
+    <vscale x 4 x i16> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.mask.nxv4i8.nxv4i16(
+  <vscale x 4 x i8>,
+  ptr,
+  <vscale x 4 x i16>,
+  <vscale x 4 x i1>,
+  iXLen);
+
+define void @intrinsic_vsoxei_mask_v_nxv4i8_nxv4i8_nxv4i16(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4i8_nxv4i8_nxv4i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
+; CHECK-NEXT:    vsoxei16.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.mask.nxv4i8.nxv4i16(
+    <vscale x 4 x i8> %0,
+    ptr %1,
+    <vscale x 4 x i16> %2,
+    <vscale x 4 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.nxv8i8.nxv8i16(
+  <vscale x 8 x i8>,
+  ptr,
+  <vscale x 8 x i16>,
+  iXLen);
+
+define void @intrinsic_vsoxei_v_nxv8i8_nxv8i8_nxv8i16(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i16> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_v_nxv8i8_nxv8i8_nxv8i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-NEXT:    vsoxei16.v v8, (a0), v10
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.nxv8i8.nxv8i16(
+    <vscale x 8 x i8> %0,
+    ptr %1,
+    <vscale x 8 x i16> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.mask.nxv8i8.nxv8i16(
+  <vscale x 8 x i8>,
+  ptr,
+  <vscale x 8 x i16>,
+  <vscale x 8 x i1>,
+  iXLen);
+
+define void @intrinsic_vsoxei_mask_v_nxv8i8_nxv8i8_nxv8i16(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8i8_nxv8i8_nxv8i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-NEXT:    vsoxei16.v v8, (a0), v10, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.mask.nxv8i8.nxv8i16(
+    <vscale x 8 x i8> %0,
+    ptr %1,
+    <vscale x 8 x i16> %2,
+    <vscale x 8 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.nxv16i8.nxv16i16(
+  <vscale x 16 x i8>,
+  ptr,
+  <vscale x 16 x i16>,
+  iXLen);
+
+define void @intrinsic_vsoxei_v_nxv16i8_nxv16i8_nxv16i16(<vscale x 16 x i8> %0, ptr %1, <vscale x 16 x i16> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_v_nxv16i8_nxv16i8_nxv16i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
+; CHECK-NEXT:    vsoxei16.v v8, (a0), v12
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.nxv16i8.nxv16i16(
+    <vscale x 16 x i8> %0,
+    ptr %1,
+    <vscale x 16 x i16> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.mask.nxv16i8.nxv16i16(
+  <vscale x 16 x i8>,
+  ptr,
+  <vscale x 16 x i16>,
+  <vscale x 16 x i1>,
+  iXLen);
+
+define void @intrinsic_vsoxei_mask_v_nxv16i8_nxv16i8_nxv16i16(<vscale x 16 x i8> %0, ptr %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv16i8_nxv16i8_nxv16i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
+; CHECK-NEXT:    vsoxei16.v v8, (a0), v12, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.mask.nxv16i8.nxv16i16(
+    <vscale x 16 x i8> %0,
+    ptr %1,
+    <vscale x 16 x i16> %2,
+    <vscale x 16 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.nxv32i8.nxv32i16(
+  <vscale x 32 x i8>,
+  ptr,
+  <vscale x 32 x i16>,
+  iXLen);
+
+define void @intrinsic_vsoxei_v_nxv32i8_nxv32i8_nxv32i16(<vscale x 32 x i8> %0, ptr %1, <vscale x 32 x i16> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_v_nxv32i8_nxv32i8_nxv32i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, ma
+; CHECK-NEXT:    vsoxei16.v v8, (a0), v16
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.nxv32i8.nxv32i16(
+    <vscale x 32 x i8> %0,
+    ptr %1,
+    <vscale x 32 x i16> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.mask.nxv32i8.nxv32i16(
+  <vscale x 32 x i8>,
+  ptr,
+  <vscale x 32 x i16>,
+  <vscale x 32 x i1>,
+  iXLen);
+
+define void @intrinsic_vsoxei_mask_v_nxv32i8_nxv32i8_nxv32i16(<vscale x 32 x i8> %0, ptr %1, <vscale x 32 x i16> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv32i8_nxv32i8_nxv32i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, ma
+; CHECK-NEXT:    vsoxei16.v v8, (a0), v16, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.mask.nxv32i8.nxv32i16(
+    <vscale x 32 x i8> %0,
+    ptr %1,
+    <vscale x 32 x i16> %2,
+    <vscale x 32 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.nxv1i16.nxv1i16(
+  <vscale x 1 x i16>,
+  ptr,
+  <vscale x 1 x i16>,
+  iXLen);
+
+define void @intrinsic_vsoxei_v_nxv1i16_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i16> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_v_nxv1i16_nxv1i16_nxv1i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma
+; CHECK-NEXT:    vsoxei16.v v8, (a0), v9
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.nxv1i16.nxv1i16(
+    <vscale x 1 x i16> %0,
+    ptr %1,
+    <vscale x 1 x i16> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.mask.nxv1i16.nxv1i16(
+  <vscale x 1 x i16>,
+  ptr,
+  <vscale x 1 x i16>,
+  <vscale x 1 x i1>,
+  iXLen);
+
+define void @intrinsic_vsoxei_mask_v_nxv1i16_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1i16_nxv1i16_nxv1i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma
+; CHECK-NEXT:    vsoxei16.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.mask.nxv1i16.nxv1i16(
+    <vscale x 1 x i16> %0,
+    ptr %1,
+    <vscale x 1 x i16> %2,
+    <vscale x 1 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.nxv2i16.nxv2i16(
+  <vscale x 2 x i16>,
+  ptr,
+  <vscale x 2 x i16>,
+  iXLen);
+
+define void @intrinsic_vsoxei_v_nxv2i16_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i16> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_v_nxv2i16_nxv2i16_nxv2i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma
+; CHECK-NEXT:    vsoxei16.v v8, (a0), v9
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.nxv2i16.nxv2i16(
+    <vscale x 2 x i16> %0,
+    ptr %1,
+    <vscale x 2 x i16> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.mask.nxv2i16.nxv2i16(
+  <vscale x 2 x i16>,
+  ptr,
+  <vscale x 2 x i16>,
+  <vscale x 2 x i1>,
+  iXLen);
+
+define void @intrinsic_vsoxei_mask_v_nxv2i16_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2i16_nxv2i16_nxv2i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma
+; CHECK-NEXT:    vsoxei16.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.mask.nxv2i16.nxv2i16(
+    <vscale x 2 x i16> %0,
+    ptr %1,
+    <vscale x 2 x i16> %2,
+    <vscale x 2 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.nxv4i16.nxv4i16(
+  <vscale x 4 x i16>,
+  ptr,
+  <vscale x 4 x i16>,
+  iXLen);
+
+define void @intrinsic_vsoxei_v_nxv4i16_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i16> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_v_nxv4i16_nxv4i16_nxv4i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
+; CHECK-NEXT:    vsoxei16.v v8, (a0), v9
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.nxv4i16.nxv4i16(
+    <vscale x 4 x i16> %0,
+    ptr %1,
+    <vscale x 4 x i16> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.mask.nxv4i16.nxv4i16(
+  <vscale x 4 x i16>,
+  ptr,
+  <vscale x 4 x i16>,
+  <vscale x 4 x i1>,
+  iXLen);
+
+define void @intrinsic_vsoxei_mask_v_nxv4i16_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4i16_nxv4i16_nxv4i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
+; CHECK-NEXT:    vsoxei16.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.mask.nxv4i16.nxv4i16(
+    <vscale x 4 x i16> %0,
+    ptr %1,
+    <vscale x 4 x i16> %2,
+    <vscale x 4 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.nxv8i16.nxv8i16(
+  <vscale x 8 x i16>,
+  ptr,
+  <vscale x 8 x i16>,
+  iXLen);
+
+define void @intrinsic_vsoxei_v_nxv8i16_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i16> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_v_nxv8i16_nxv8i16_nxv8i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
+; CHECK-NEXT:    vsoxei16.v v8, (a0), v10
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.nxv8i16.nxv8i16(
+    <vscale x 8 x i16> %0,
+    ptr %1,
+    <vscale x 8 x i16> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.mask.nxv8i16.nxv8i16(
+  <vscale x 8 x i16>,
+  ptr,
+  <vscale x 8 x i16>,
+  <vscale x 8 x i1>,
+  iXLen);
+
+define void @intrinsic_vsoxei_mask_v_nxv8i16_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8i16_nxv8i16_nxv8i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
+; CHECK-NEXT:    vsoxei16.v v8, (a0), v10, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.mask.nxv8i16.nxv8i16(
+    <vscale x 8 x i16> %0,
+    ptr %1,
+    <vscale x 8 x i16> %2,
+    <vscale x 8 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.nxv16i16.nxv16i16(
+  <vscale x 16 x i16>,
+  ptr,
+  <vscale x 16 x i16>,
+  iXLen);
+
+define void @intrinsic_vsoxei_v_nxv16i16_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, ptr %1, <vscale x 16 x i16> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_v_nxv16i16_nxv16i16_nxv16i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
+; CHECK-NEXT:    vsoxei16.v v8, (a0), v12
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.nxv16i16.nxv16i16(
+    <vscale x 16 x i16> %0,
+    ptr %1,
+    <vscale x 16 x i16> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.mask.nxv16i16.nxv16i16(
+  <vscale x 16 x i16>,
+  ptr,
+  <vscale x 16 x i16>,
+  <vscale x 16 x i1>,
+  iXLen);
+
+define void @intrinsic_vsoxei_mask_v_nxv16i16_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, ptr %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv16i16_nxv16i16_nxv16i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
+; CHECK-NEXT:    vsoxei16.v v8, (a0), v12, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.mask.nxv16i16.nxv16i16(
+    <vscale x 16 x i16> %0,
+    ptr %1,
+    <vscale x 16 x i16> %2,
+    <vscale x 16 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.nxv32i16.nxv32i16(
+  <vscale x 32 x i16>,
+  ptr,
+  <vscale x 32 x i16>,
+  iXLen);
+
+define void @intrinsic_vsoxei_v_nxv32i16_nxv32i16_nxv32i16(<vscale x 32 x i16> %0, ptr %1, <vscale x 32 x i16> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_v_nxv32i16_nxv32i16_nxv32i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
+; CHECK-NEXT:    vsoxei16.v v8, (a0), v16
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.nxv32i16.nxv32i16(
+    <vscale x 32 x i16> %0,
+    ptr %1,
+    <vscale x 32 x i16> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.mask.nxv32i16.nxv32i16(
+  <vscale x 32 x i16>,
+  ptr,
+  <vscale x 32 x i16>,
+  <vscale x 32 x i1>,
+  iXLen);
+
+define void @intrinsic_vsoxei_mask_v_nxv32i16_nxv32i16_nxv32i16(<vscale x 32 x i16> %0, ptr %1, <vscale x 32 x i16> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv32i16_nxv32i16_nxv32i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
+; CHECK-NEXT:    vsoxei16.v v8, (a0), v16, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.mask.nxv32i16.nxv32i16(
+    <vscale x 32 x i16> %0,
+    ptr %1,
+    <vscale x 32 x i16> %2,
+    <vscale x 32 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.nxv1i32.nxv1i16(
+  <vscale x 1 x i32>,
+  ptr,
+  <vscale x 1 x i16>,
+  iXLen);
+
+define void @intrinsic_vsoxei_v_nxv1i32_nxv1i32_nxv1i16(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i16> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_v_nxv1i32_nxv1i32_nxv1i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma
+; CHECK-NEXT:    vsoxei16.v v8, (a0), v9
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.nxv1i32.nxv1i16(
+    <vscale x 1 x i32> %0,
+    ptr %1,
+    <vscale x 1 x i16> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.mask.nxv1i32.nxv1i16(
+  <vscale x 1 x i32>,
+  ptr,
+  <vscale x 1 x i16>,
+  <vscale x 1 x i1>,
+  iXLen);
+
+define void @intrinsic_vsoxei_mask_v_nxv1i32_nxv1i32_nxv1i16(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1i32_nxv1i32_nxv1i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma
+; CHECK-NEXT:    vsoxei16.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.mask.nxv1i32.nxv1i16(
+    <vscale x 1 x i32> %0,
+    ptr %1,
+    <vscale x 1 x i16> %2,
+    <vscale x 1 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.nxv2i32.nxv2i16(
+  <vscale x 2 x i32>,
+  ptr,
+  <vscale x 2 x i16>,
+  iXLen);
+
+define void @intrinsic_vsoxei_v_nxv2i32_nxv2i32_nxv2i16(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i16> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_v_nxv2i32_nxv2i32_nxv2i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
+; CHECK-NEXT:    vsoxei16.v v8, (a0), v9
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.nxv2i32.nxv2i16(
+    <vscale x 2 x i32> %0,
+    ptr %1,
+    <vscale x 2 x i16> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.mask.nxv2i32.nxv2i16(
+  <vscale x 2 x i32>,
+  ptr,
+  <vscale x 2 x i16>,
+  <vscale x 2 x i1>,
+  iXLen);
+
+define void @intrinsic_vsoxei_mask_v_nxv2i32_nxv2i32_nxv2i16(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2i32_nxv2i32_nxv2i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
+; CHECK-NEXT:    vsoxei16.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.mask.nxv2i32.nxv2i16(
+    <vscale x 2 x i32> %0,
+    ptr %1,
+    <vscale x 2 x i16> %2,
+    <vscale x 2 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.nxv4i32.nxv4i16(
+  <vscale x 4 x i32>,
+  ptr,
+  <vscale x 4 x i16>,
+  iXLen);
+
+define void @intrinsic_vsoxei_v_nxv4i32_nxv4i32_nxv4i16(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i16> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_v_nxv4i32_nxv4i32_nxv4i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
+; CHECK-NEXT:    vsoxei16.v v8, (a0), v10
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.nxv4i32.nxv4i16(
+    <vscale x 4 x i32> %0,
+    ptr %1,
+    <vscale x 4 x i16> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.mask.nxv4i32.nxv4i16(
+  <vscale x 4 x i32>,
+  ptr,
+  <vscale x 4 x i16>,
+  <vscale x 4 x i1>,
+  iXLen);
+
+define void @intrinsic_vsoxei_mask_v_nxv4i32_nxv4i32_nxv4i16(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4i32_nxv4i32_nxv4i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
+; CHECK-NEXT:    vsoxei16.v v8, (a0), v10, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.mask.nxv4i32.nxv4i16(
+    <vscale x 4 x i32> %0,
+    ptr %1,
+    <vscale x 4 x i16> %2,
+    <vscale x 4 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.nxv8i32.nxv8i16(
+  <vscale x 8 x i32>,
+  ptr,
+  <vscale x 8 x i16>,
+  iXLen);
+
+define void @intrinsic_vsoxei_v_nxv8i32_nxv8i32_nxv8i16(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i16> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_v_nxv8i32_nxv8i32_nxv8i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
+; CHECK-NEXT:    vsoxei16.v v8, (a0), v12
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.nxv8i32.nxv8i16(
+    <vscale x 8 x i32> %0,
+    ptr %1,
+    <vscale x 8 x i16> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.mask.nxv8i32.nxv8i16(
+  <vscale x 8 x i32>,
+  ptr,
+  <vscale x 8 x i16>,
+  <vscale x 8 x i1>,
+  iXLen);
+
+define void @intrinsic_vsoxei_mask_v_nxv8i32_nxv8i32_nxv8i16(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8i32_nxv8i32_nxv8i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
+; CHECK-NEXT:    vsoxei16.v v8, (a0), v12, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.mask.nxv8i32.nxv8i16(
+    <vscale x 8 x i32> %0,
+    ptr %1,
+    <vscale x 8 x i16> %2,
+    <vscale x 8 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.nxv16i32.nxv16i16(
+  <vscale x 16 x i32>,
+  ptr,
+  <vscale x 16 x i16>,
+  iXLen);
+
+define void @intrinsic_vsoxei_v_nxv16i32_nxv16i32_nxv16i16(<vscale x 16 x i32> %0, ptr %1, <vscale x 16 x i16> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_v_nxv16i32_nxv16i32_nxv16i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
+; CHECK-NEXT:    vsoxei16.v v8, (a0), v16
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.nxv16i32.nxv16i16(
+    <vscale x 16 x i32> %0,
+    ptr %1,
+    <vscale x 16 x i16> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.mask.nxv16i32.nxv16i16(
+  <vscale x 16 x i32>,
+  ptr,
+  <vscale x 16 x i16>,
+  <vscale x 16 x i1>,
+  iXLen);
+
+define void @intrinsic_vsoxei_mask_v_nxv16i32_nxv16i32_nxv16i16(<vscale x 16 x i32> %0, ptr %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv16i32_nxv16i32_nxv16i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
+; CHECK-NEXT:    vsoxei16.v v8, (a0), v16, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.mask.nxv16i32.nxv16i16(
+    <vscale x 16 x i32> %0,
+    ptr %1,
+    <vscale x 16 x i16> %2,
+    <vscale x 16 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.nxv1i64.nxv1i16(
+  <vscale x 1 x i64>,
+  ptr,
+  <vscale x 1 x i16>,
+  iXLen);
+
+define void @intrinsic_vsoxei_v_nxv1i64_nxv1i64_nxv1i16(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i16> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_v_nxv1i64_nxv1i64_nxv1i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, ma
+; CHECK-NEXT:    vsoxei16.v v8, (a0), v9
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.nxv1i64.nxv1i16(
+    <vscale x 1 x i64> %0,
+    ptr %1,
+    <vscale x 1 x i16> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.mask.nxv1i64.nxv1i16(
+  <vscale x 1 x i64>,
+  ptr,
+  <vscale x 1 x i16>,
+  <vscale x 1 x i1>,
+  iXLen);
+
+define void @intrinsic_vsoxei_mask_v_nxv1i64_nxv1i64_nxv1i16(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1i64_nxv1i64_nxv1i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, ma
+; CHECK-NEXT:    vsoxei16.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.mask.nxv1i64.nxv1i16(
+    <vscale x 1 x i64> %0,
+    ptr %1,
+    <vscale x 1 x i16> %2,
+    <vscale x 1 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.nxv2i64.nxv2i16(
+  <vscale x 2 x i64>,
+  ptr,
+  <vscale x 2 x i16>,
+  iXLen);
+
+define void @intrinsic_vsoxei_v_nxv2i64_nxv2i64_nxv2i16(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i16> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_v_nxv2i64_nxv2i64_nxv2i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
+; CHECK-NEXT:    vsoxei16.v v8, (a0), v10
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.nxv2i64.nxv2i16(
+    <vscale x 2 x i64> %0,
+    ptr %1,
+    <vscale x 2 x i16> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.mask.nxv2i64.nxv2i16(
+  <vscale x 2 x i64>,
+  ptr,
+  <vscale x 2 x i16>,
+  <vscale x 2 x i1>,
+  iXLen);
+
+define void @intrinsic_vsoxei_mask_v_nxv2i64_nxv2i64_nxv2i16(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2i64_nxv2i64_nxv2i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
+; CHECK-NEXT:    vsoxei16.v v8, (a0), v10, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.mask.nxv2i64.nxv2i16(
+    <vscale x 2 x i64> %0,
+    ptr %1,
+    <vscale x 2 x i16> %2,
+    <vscale x 2 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.nxv4i64.nxv4i16(
+  <vscale x 4 x i64>,
+  ptr,
+  <vscale x 4 x i16>,
+  iXLen);
+
+define void @intrinsic_vsoxei_v_nxv4i64_nxv4i64_nxv4i16(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i16> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_v_nxv4i64_nxv4i64_nxv4i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
+; CHECK-NEXT:    vsoxei16.v v8, (a0), v12
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.nxv4i64.nxv4i16(
+    <vscale x 4 x i64> %0,
+    ptr %1,
+    <vscale x 4 x i16> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.mask.nxv4i64.nxv4i16(
+  <vscale x 4 x i64>,
+  ptr,
+  <vscale x 4 x i16>,
+  <vscale x 4 x i1>,
+  iXLen);
+
+define void @intrinsic_vsoxei_mask_v_nxv4i64_nxv4i64_nxv4i16(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4i64_nxv4i64_nxv4i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
+; CHECK-NEXT:    vsoxei16.v v8, (a0), v12, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.mask.nxv4i64.nxv4i16(
+    <vscale x 4 x i64> %0,
+    ptr %1,
+    <vscale x 4 x i16> %2,
+    <vscale x 4 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.nxv8i64.nxv8i16(
+  <vscale x 8 x i64>,
+  ptr,
+  <vscale x 8 x i16>,
+  iXLen);
+
+define void @intrinsic_vsoxei_v_nxv8i64_nxv8i64_nxv8i16(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i16> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_v_nxv8i64_nxv8i64_nxv8i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; CHECK-NEXT:    vsoxei16.v v8, (a0), v16
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.nxv8i64.nxv8i16(
+    <vscale x 8 x i64> %0,
+    ptr %1,
+    <vscale x 8 x i16> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.mask.nxv8i64.nxv8i16(
+  <vscale x 8 x i64>,
+  ptr,
+  <vscale x 8 x i16>,
+  <vscale x 8 x i1>,
+  iXLen);
+
+define void @intrinsic_vsoxei_mask_v_nxv8i64_nxv8i64_nxv8i16(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8i64_nxv8i64_nxv8i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; CHECK-NEXT:    vsoxei16.v v8, (a0), v16, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.mask.nxv8i64.nxv8i16(
+    <vscale x 8 x i64> %0,
+    ptr %1,
+    <vscale x 8 x i16> %2,
+    <vscale x 8 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.nxv1f16.nxv1i16(
+  <vscale x 1 x half>,
+  ptr,
+  <vscale x 1 x i16>,
+  iXLen);
+
+define void @intrinsic_vsoxei_v_nxv1f16_nxv1f16_nxv1i16(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i16> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_v_nxv1f16_nxv1f16_nxv1i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma
+; CHECK-NEXT:    vsoxei16.v v8, (a0), v9
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.nxv1f16.nxv1i16(
+    <vscale x 1 x half> %0,
+    ptr %1,
+    <vscale x 1 x i16> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.mask.nxv1f16.nxv1i16(
+  <vscale x 1 x half>,
+  ptr,
+  <vscale x 1 x i16>,
+  <vscale x 1 x i1>,
+  iXLen);
+
+define void @intrinsic_vsoxei_mask_v_nxv1f16_nxv1f16_nxv1i16(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1f16_nxv1f16_nxv1i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma
+; CHECK-NEXT:    vsoxei16.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.mask.nxv1f16.nxv1i16(
+    <vscale x 1 x half> %0,
+    ptr %1,
+    <vscale x 1 x i16> %2,
+    <vscale x 1 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.nxv2f16.nxv2i16(
+  <vscale x 2 x half>,
+  ptr,
+  <vscale x 2 x i16>,
+  iXLen);
+
+define void @intrinsic_vsoxei_v_nxv2f16_nxv2f16_nxv2i16(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i16> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_v_nxv2f16_nxv2f16_nxv2i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma
+; CHECK-NEXT:    vsoxei16.v v8, (a0), v9
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.nxv2f16.nxv2i16(
+    <vscale x 2 x half> %0,
+    ptr %1,
+    <vscale x 2 x i16> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.mask.nxv2f16.nxv2i16(
+  <vscale x 2 x half>,
+  ptr,
+  <vscale x 2 x i16>,
+  <vscale x 2 x i1>,
+  iXLen);
+
+define void @intrinsic_vsoxei_mask_v_nxv2f16_nxv2f16_nxv2i16(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2f16_nxv2f16_nxv2i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma
+; CHECK-NEXT:    vsoxei16.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.mask.nxv2f16.nxv2i16(
+    <vscale x 2 x half> %0,
+    ptr %1,
+    <vscale x 2 x i16> %2,
+    <vscale x 2 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.nxv4f16.nxv4i16(
+  <vscale x 4 x half>,
+  ptr,
+  <vscale x 4 x i16>,
+  iXLen);
+
+define void @intrinsic_vsoxei_v_nxv4f16_nxv4f16_nxv4i16(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i16> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_v_nxv4f16_nxv4f16_nxv4i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
+; CHECK-NEXT:    vsoxei16.v v8, (a0), v9
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.nxv4f16.nxv4i16(
+    <vscale x 4 x half> %0,
+    ptr %1,
+    <vscale x 4 x i16> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.mask.nxv4f16.nxv4i16(
+  <vscale x 4 x half>,
+  ptr,
+  <vscale x 4 x i16>,
+  <vscale x 4 x i1>,
+  iXLen);
+
+define void @intrinsic_vsoxei_mask_v_nxv4f16_nxv4f16_nxv4i16(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4f16_nxv4f16_nxv4i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
+; CHECK-NEXT:    vsoxei16.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.mask.nxv4f16.nxv4i16(
+    <vscale x 4 x half> %0,
+    ptr %1,
+    <vscale x 4 x i16> %2,
+    <vscale x 4 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.nxv8f16.nxv8i16(
+  <vscale x 8 x half>,
+  ptr,
+  <vscale x 8 x i16>,
+  iXLen);
+
+define void @intrinsic_vsoxei_v_nxv8f16_nxv8f16_nxv8i16(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i16> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_v_nxv8f16_nxv8f16_nxv8i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
+; CHECK-NEXT:    vsoxei16.v v8, (a0), v10
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.nxv8f16.nxv8i16(
+    <vscale x 8 x half> %0,
+    ptr %1,
+    <vscale x 8 x i16> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.mask.nxv8f16.nxv8i16(
+  <vscale x 8 x half>,
+  ptr,
+  <vscale x 8 x i16>,
+  <vscale x 8 x i1>,
+  iXLen);
+
+define void @intrinsic_vsoxei_mask_v_nxv8f16_nxv8f16_nxv8i16(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8f16_nxv8f16_nxv8i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
+; CHECK-NEXT:    vsoxei16.v v8, (a0), v10, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.mask.nxv8f16.nxv8i16(
+    <vscale x 8 x half> %0,
+    ptr %1,
+    <vscale x 8 x i16> %2,
+    <vscale x 8 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.nxv16f16.nxv16i16(
+  <vscale x 16 x half>,
+  ptr,
+  <vscale x 16 x i16>,
+  iXLen);
+
+define void @intrinsic_vsoxei_v_nxv16f16_nxv16f16_nxv16i16(<vscale x 16 x half> %0, ptr %1, <vscale x 16 x i16> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_v_nxv16f16_nxv16f16_nxv16i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
+; CHECK-NEXT:    vsoxei16.v v8, (a0), v12
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.nxv16f16.nxv16i16(
+    <vscale x 16 x half> %0,
+    ptr %1,
+    <vscale x 16 x i16> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.mask.nxv16f16.nxv16i16(
+  <vscale x 16 x half>,
+  ptr,
+  <vscale x 16 x i16>,
+  <vscale x 16 x i1>,
+  iXLen);
+
+define void @intrinsic_vsoxei_mask_v_nxv16f16_nxv16f16_nxv16i16(<vscale x 16 x half> %0, ptr %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv16f16_nxv16f16_nxv16i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
+; CHECK-NEXT:    vsoxei16.v v8, (a0), v12, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.mask.nxv16f16.nxv16i16(
+    <vscale x 16 x half> %0,
+    ptr %1,
+    <vscale x 16 x i16> %2,
+    <vscale x 16 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.nxv32f16.nxv32i16(
+  <vscale x 32 x half>,
+  ptr,
+  <vscale x 32 x i16>,
+  iXLen);
+
+define void @intrinsic_vsoxei_v_nxv32f16_nxv32f16_nxv32i16(<vscale x 32 x half> %0, ptr %1, <vscale x 32 x i16> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_v_nxv32f16_nxv32f16_nxv32i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
+; CHECK-NEXT:    vsoxei16.v v8, (a0), v16
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.nxv32f16.nxv32i16(
+    <vscale x 32 x half> %0,
+    ptr %1,
+    <vscale x 32 x i16> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.mask.nxv32f16.nxv32i16(
+  <vscale x 32 x half>,
+  ptr,
+  <vscale x 32 x i16>,
+  <vscale x 32 x i1>,
+  iXLen);
+
+define void @intrinsic_vsoxei_mask_v_nxv32f16_nxv32f16_nxv32i16(<vscale x 32 x half> %0, ptr %1, <vscale x 32 x i16> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv32f16_nxv32f16_nxv32i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
+; CHECK-NEXT:    vsoxei16.v v8, (a0), v16, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.mask.nxv32f16.nxv32i16(
+    <vscale x 32 x half> %0,
+    ptr %1,
+    <vscale x 32 x i16> %2,
+    <vscale x 32 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.nxv1f32.nxv1i16(
+  <vscale x 1 x float>,
+  ptr,
+  <vscale x 1 x i16>,
+  iXLen);
+
+define void @intrinsic_vsoxei_v_nxv1f32_nxv1f32_nxv1i16(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i16> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_v_nxv1f32_nxv1f32_nxv1i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma
+; CHECK-NEXT:    vsoxei16.v v8, (a0), v9
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.nxv1f32.nxv1i16(
+    <vscale x 1 x float> %0,
+    ptr %1,
+    <vscale x 1 x i16> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.mask.nxv1f32.nxv1i16(
+  <vscale x 1 x float>,
+  ptr,
+  <vscale x 1 x i16>,
+  <vscale x 1 x i1>,
+  iXLen);
+
+define void @intrinsic_vsoxei_mask_v_nxv1f32_nxv1f32_nxv1i16(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1f32_nxv1f32_nxv1i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma
+; CHECK-NEXT:    vsoxei16.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.mask.nxv1f32.nxv1i16(
+    <vscale x 1 x float> %0,
+    ptr %1,
+    <vscale x 1 x i16> %2,
+    <vscale x 1 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.nxv2f32.nxv2i16(
+  <vscale x 2 x float>,
+  ptr,
+  <vscale x 2 x i16>,
+  iXLen);
+
+define void @intrinsic_vsoxei_v_nxv2f32_nxv2f32_nxv2i16(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i16> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_v_nxv2f32_nxv2f32_nxv2i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
+; CHECK-NEXT:    vsoxei16.v v8, (a0), v9
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.nxv2f32.nxv2i16(
+    <vscale x 2 x float> %0,
+    ptr %1,
+    <vscale x 2 x i16> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.mask.nxv2f32.nxv2i16(
+  <vscale x 2 x float>,
+  ptr,
+  <vscale x 2 x i16>,
+  <vscale x 2 x i1>,
+  iXLen);
+
+define void @intrinsic_vsoxei_mask_v_nxv2f32_nxv2f32_nxv2i16(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2f32_nxv2f32_nxv2i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
+; CHECK-NEXT:    vsoxei16.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.mask.nxv2f32.nxv2i16(
+    <vscale x 2 x float> %0,
+    ptr %1,
+    <vscale x 2 x i16> %2,
+    <vscale x 2 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.nxv4f32.nxv4i16(
+  <vscale x 4 x float>,
+  ptr,
+  <vscale x 4 x i16>,
+  iXLen);
+
+define void @intrinsic_vsoxei_v_nxv4f32_nxv4f32_nxv4i16(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i16> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_v_nxv4f32_nxv4f32_nxv4i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
+; CHECK-NEXT:    vsoxei16.v v8, (a0), v10
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.nxv4f32.nxv4i16(
+    <vscale x 4 x float> %0,
+    ptr %1,
+    <vscale x 4 x i16> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.mask.nxv4f32.nxv4i16(
+  <vscale x 4 x float>,
+  ptr,
+  <vscale x 4 x i16>,
+  <vscale x 4 x i1>,
+  iXLen);
+
+define void @intrinsic_vsoxei_mask_v_nxv4f32_nxv4f32_nxv4i16(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4f32_nxv4f32_nxv4i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
+; CHECK-NEXT:    vsoxei16.v v8, (a0), v10, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.mask.nxv4f32.nxv4i16(
+    <vscale x 4 x float> %0,
+    ptr %1,
+    <vscale x 4 x i16> %2,
+    <vscale x 4 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.nxv8f32.nxv8i16(
+  <vscale x 8 x float>,
+  ptr,
+  <vscale x 8 x i16>,
+  iXLen);
+
+define void @intrinsic_vsoxei_v_nxv8f32_nxv8f32_nxv8i16(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i16> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_v_nxv8f32_nxv8f32_nxv8i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
+; CHECK-NEXT:    vsoxei16.v v8, (a0), v12
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.nxv8f32.nxv8i16(
+    <vscale x 8 x float> %0,
+    ptr %1,
+    <vscale x 8 x i16> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.mask.nxv8f32.nxv8i16(
+  <vscale x 8 x float>,
+  ptr,
+  <vscale x 8 x i16>,
+  <vscale x 8 x i1>,
+  iXLen);
+
+define void @intrinsic_vsoxei_mask_v_nxv8f32_nxv8f32_nxv8i16(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8f32_nxv8f32_nxv8i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
+; CHECK-NEXT:    vsoxei16.v v8, (a0), v12, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.mask.nxv8f32.nxv8i16(
+    <vscale x 8 x float> %0,
+    ptr %1,
+    <vscale x 8 x i16> %2,
+    <vscale x 8 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.nxv16f32.nxv16i16(
+  <vscale x 16 x float>,
+  ptr,
+  <vscale x 16 x i16>,
+  iXLen);
+
+define void @intrinsic_vsoxei_v_nxv16f32_nxv16f32_nxv16i16(<vscale x 16 x float> %0, ptr %1, <vscale x 16 x i16> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_v_nxv16f32_nxv16f32_nxv16i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
+; CHECK-NEXT:    vsoxei16.v v8, (a0), v16
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.nxv16f32.nxv16i16(
+    <vscale x 16 x float> %0,
+    ptr %1,
+    <vscale x 16 x i16> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.mask.nxv16f32.nxv16i16(
+  <vscale x 16 x float>,
+  ptr,
+  <vscale x 16 x i16>,
+  <vscale x 16 x i1>,
+  iXLen);
+
+define void @intrinsic_vsoxei_mask_v_nxv16f32_nxv16f32_nxv16i16(<vscale x 16 x float> %0, ptr %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv16f32_nxv16f32_nxv16i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
+; CHECK-NEXT:    vsoxei16.v v8, (a0), v16, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.mask.nxv16f32.nxv16i16(
+    <vscale x 16 x float> %0,
+    ptr %1,
+    <vscale x 16 x i16> %2,
+    <vscale x 16 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.nxv1f64.nxv1i16(
+  <vscale x 1 x double>,
+  ptr,
+  <vscale x 1 x i16>,
+  iXLen);
+
+define void @intrinsic_vsoxei_v_nxv1f64_nxv1f64_nxv1i16(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i16> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_v_nxv1f64_nxv1f64_nxv1i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, ma
+; CHECK-NEXT:    vsoxei16.v v8, (a0), v9
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.nxv1f64.nxv1i16(
+    <vscale x 1 x double> %0,
+    ptr %1,
+    <vscale x 1 x i16> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.mask.nxv1f64.nxv1i16(
+  <vscale x 1 x double>,
+  ptr,
+  <vscale x 1 x i16>,
+  <vscale x 1 x i1>,
+  iXLen);
+
+define void @intrinsic_vsoxei_mask_v_nxv1f64_nxv1f64_nxv1i16(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1f64_nxv1f64_nxv1i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, ma
+; CHECK-NEXT:    vsoxei16.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.mask.nxv1f64.nxv1i16(
+    <vscale x 1 x double> %0,
+    ptr %1,
+    <vscale x 1 x i16> %2,
+    <vscale x 1 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.nxv2f64.nxv2i16(
+  <vscale x 2 x double>,
+  ptr,
+  <vscale x 2 x i16>,
+  iXLen);
+
+define void @intrinsic_vsoxei_v_nxv2f64_nxv2f64_nxv2i16(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i16> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_v_nxv2f64_nxv2f64_nxv2i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
+; CHECK-NEXT:    vsoxei16.v v8, (a0), v10
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.nxv2f64.nxv2i16(
+    <vscale x 2 x double> %0,
+    ptr %1,
+    <vscale x 2 x i16> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.mask.nxv2f64.nxv2i16(
+  <vscale x 2 x double>,
+  ptr,
+  <vscale x 2 x i16>,
+  <vscale x 2 x i1>,
+  iXLen);
+
+define void @intrinsic_vsoxei_mask_v_nxv2f64_nxv2f64_nxv2i16(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2f64_nxv2f64_nxv2i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
+; CHECK-NEXT:    vsoxei16.v v8, (a0), v10, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.mask.nxv2f64.nxv2i16(
+    <vscale x 2 x double> %0,
+    ptr %1,
+    <vscale x 2 x i16> %2,
+    <vscale x 2 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.nxv4f64.nxv4i16(
+  <vscale x 4 x double>,
+  ptr,
+  <vscale x 4 x i16>,
+  iXLen);
+
+define void @intrinsic_vsoxei_v_nxv4f64_nxv4f64_nxv4i16(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i16> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_v_nxv4f64_nxv4f64_nxv4i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
+; CHECK-NEXT:    vsoxei16.v v8, (a0), v12
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.nxv4f64.nxv4i16(
+    <vscale x 4 x double> %0,
+    ptr %1,
+    <vscale x 4 x i16> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.mask.nxv4f64.nxv4i16(
+  <vscale x 4 x double>,
+  ptr,
+  <vscale x 4 x i16>,
+  <vscale x 4 x i1>,
+  iXLen);
+
+define void @intrinsic_vsoxei_mask_v_nxv4f64_nxv4f64_nxv4i16(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4f64_nxv4f64_nxv4i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma
+; CHECK-NEXT:    vsoxei16.v v8, (a0), v12, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.mask.nxv4f64.nxv4i16(
+    <vscale x 4 x double> %0,
+    ptr %1,
+    <vscale x 4 x i16> %2,
+    <vscale x 4 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.nxv8f64.nxv8i16(
+  <vscale x 8 x double>,
+  ptr,
+  <vscale x 8 x i16>,
+  iXLen);
+
+define void @intrinsic_vsoxei_v_nxv8f64_nxv8f64_nxv8i16(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i16> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_v_nxv8f64_nxv8f64_nxv8i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; CHECK-NEXT:    vsoxei16.v v8, (a0), v16
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.nxv8f64.nxv8i16(
+    <vscale x 8 x double> %0,
+    ptr %1,
+    <vscale x 8 x i16> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.mask.nxv8f64.nxv8i16(
+  <vscale x 8 x double>,
+  ptr,
+  <vscale x 8 x i16>,
+  <vscale x 8 x i1>,
+  iXLen);
+
+define void @intrinsic_vsoxei_mask_v_nxv8f64_nxv8f64_nxv8i16(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8f64_nxv8f64_nxv8i16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; CHECK-NEXT:    vsoxei16.v v8, (a0), v16, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.mask.nxv8f64.nxv8i16(
+    <vscale x 8 x double> %0,
+    ptr %1,
+    <vscale x 8 x i16> %2,
+    <vscale x 8 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.nxv1i8.nxv1i8(
+  <vscale x 1 x i8>,
+  ptr,
+  <vscale x 1 x i8>,
+  iXLen);
+
+define void @intrinsic_vsoxei_v_nxv1i8_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i8> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_v_nxv1i8_nxv1i8_nxv1i8:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma
+; CHECK-NEXT:    vsoxei8.v v8, (a0), v9
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.nxv1i8.nxv1i8(
+    <vscale x 1 x i8> %0,
+    ptr %1,
+    <vscale x 1 x i8> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.mask.nxv1i8.nxv1i8(
+  <vscale x 1 x i8>,
+  ptr,
+  <vscale x 1 x i8>,
+  <vscale x 1 x i1>,
+  iXLen);
+
+define void @intrinsic_vsoxei_mask_v_nxv1i8_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1i8_nxv1i8_nxv1i8:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma
+; CHECK-NEXT:    vsoxei8.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.mask.nxv1i8.nxv1i8(
+    <vscale x 1 x i8> %0,
+    ptr %1,
+    <vscale x 1 x i8> %2,
+    <vscale x 1 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.nxv2i8.nxv2i8(
+  <vscale x 2 x i8>,
+  ptr,
+  <vscale x 2 x i8>,
+  iXLen);
+
+define void @intrinsic_vsoxei_v_nxv2i8_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i8> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_v_nxv2i8_nxv2i8_nxv2i8:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma
+; CHECK-NEXT:    vsoxei8.v v8, (a0), v9
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.nxv2i8.nxv2i8(
+    <vscale x 2 x i8> %0,
+    ptr %1,
+    <vscale x 2 x i8> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.mask.nxv2i8.nxv2i8(
+  <vscale x 2 x i8>,
+  ptr,
+  <vscale x 2 x i8>,
+  <vscale x 2 x i1>,
+  iXLen);
+
+define void @intrinsic_vsoxei_mask_v_nxv2i8_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2i8_nxv2i8_nxv2i8:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma
+; CHECK-NEXT:    vsoxei8.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.mask.nxv2i8.nxv2i8(
+    <vscale x 2 x i8> %0,
+    ptr %1,
+    <vscale x 2 x i8> %2,
+    <vscale x 2 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.nxv4i8.nxv4i8(
+  <vscale x 4 x i8>,
+  ptr,
+  <vscale x 4 x i8>,
+  iXLen);
+
+define void @intrinsic_vsoxei_v_nxv4i8_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i8> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_v_nxv4i8_nxv4i8_nxv4i8:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
+; CHECK-NEXT:    vsoxei8.v v8, (a0), v9
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.nxv4i8.nxv4i8(
+    <vscale x 4 x i8> %0,
+    ptr %1,
+    <vscale x 4 x i8> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.mask.nxv4i8.nxv4i8(
+  <vscale x 4 x i8>,
+  ptr,
+  <vscale x 4 x i8>,
+  <vscale x 4 x i1>,
+  iXLen);
+
+define void @intrinsic_vsoxei_mask_v_nxv4i8_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4i8_nxv4i8_nxv4i8:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
+; CHECK-NEXT:    vsoxei8.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.mask.nxv4i8.nxv4i8(
+    <vscale x 4 x i8> %0,
+    ptr %1,
+    <vscale x 4 x i8> %2,
+    <vscale x 4 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.nxv8i8.nxv8i8(
+  <vscale x 8 x i8>,
+  ptr,
+  <vscale x 8 x i8>,
+  iXLen);
+
+define void @intrinsic_vsoxei_v_nxv8i8_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i8> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_v_nxv8i8_nxv8i8_nxv8i8:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-NEXT:    vsoxei8.v v8, (a0), v9
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.nxv8i8.nxv8i8(
+    <vscale x 8 x i8> %0,
+    ptr %1,
+    <vscale x 8 x i8> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.mask.nxv8i8.nxv8i8(
+  <vscale x 8 x i8>,
+  ptr,
+  <vscale x 8 x i8>,
+  <vscale x 8 x i1>,
+  iXLen);
+
+define void @intrinsic_vsoxei_mask_v_nxv8i8_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8i8_nxv8i8_nxv8i8:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-NEXT:    vsoxei8.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.mask.nxv8i8.nxv8i8(
+    <vscale x 8 x i8> %0,
+    ptr %1,
+    <vscale x 8 x i8> %2,
+    <vscale x 8 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.nxv16i8.nxv16i8(
+  <vscale x 16 x i8>,
+  ptr,
+  <vscale x 16 x i8>,
+  iXLen);
+
+define void @intrinsic_vsoxei_v_nxv16i8_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, ptr %1, <vscale x 16 x i8> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_v_nxv16i8_nxv16i8_nxv16i8:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
+; CHECK-NEXT:    vsoxei8.v v8, (a0), v10
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.nxv16i8.nxv16i8(
+    <vscale x 16 x i8> %0,
+    ptr %1,
+    <vscale x 16 x i8> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.mask.nxv16i8.nxv16i8(
+  <vscale x 16 x i8>,
+  ptr,
+  <vscale x 16 x i8>,
+  <vscale x 16 x i1>,
+  iXLen);
+
+define void @intrinsic_vsoxei_mask_v_nxv16i8_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, ptr %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv16i8_nxv16i8_nxv16i8:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
+; CHECK-NEXT:    vsoxei8.v v8, (a0), v10, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.mask.nxv16i8.nxv16i8(
+    <vscale x 16 x i8> %0,
+    ptr %1,
+    <vscale x 16 x i8> %2,
+    <vscale x 16 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.nxv32i8.nxv32i8(
+  <vscale x 32 x i8>,
+  ptr,
+  <vscale x 32 x i8>,
+  iXLen);
+
+define void @intrinsic_vsoxei_v_nxv32i8_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, ptr %1, <vscale x 32 x i8> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_v_nxv32i8_nxv32i8_nxv32i8:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, ma
+; CHECK-NEXT:    vsoxei8.v v8, (a0), v12
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.nxv32i8.nxv32i8(
+    <vscale x 32 x i8> %0,
+    ptr %1,
+    <vscale x 32 x i8> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsoxei.mask.nxv32i8.nxv32i8(
+  <vscale x 32 x i8>,
+  ptr,
+  <vscale x 32 x i8>,
+  <vscale x 32 x i1>,
+  iXLen);
+
+define void @intrinsic_vsoxei_mask_v_nxv32i8_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, ptr %1, <vscale x 32 x i8> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv32i8_nxv32i8_nxv32i8:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, ma
+; CHECK-NEXT:    vsoxei8.v v8, (a0), v12, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsoxei.mask.nxv32i8.nxv32i8(
+    <vscale x 32 x i8> %0,
+    ptr %1,
+    <vscale x 32 x i8> %2,
+    <vscale x 32 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
@llvm.riscv.vsoxei.nxv64i8.nxv64i8( + <vscale x 64 x i8>, + ptr, + <vscale x 64 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv64i8_nxv64i8_nxv64i8(<vscale x 64 x i8> %0, ptr %1, <vscale x 64 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv64i8_nxv64i8_nxv64i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv64i8.nxv64i8( + <vscale x 64 x i8> %0, + ptr %1, + <vscale x 64 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv64i8.nxv64i8( + <vscale x 64 x i8>, + ptr, + <vscale x 64 x i8>, + <vscale x 64 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv64i8_nxv64i8_nxv64i8(<vscale x 64 x i8> %0, ptr %1, <vscale x 64 x i8> %2, <vscale x 64 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv64i8_nxv64i8_nxv64i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv64i8.nxv64i8( + <vscale x 64 x i8> %0, + ptr %1, + <vscale x 64 x i8> %2, + <vscale x 64 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv1i16.nxv1i8( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv1i16_nxv1i16_nxv1i8(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1i16_nxv1i16_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv1i16.nxv1i8( + <vscale x 1 x i16> %0, + ptr %1, + <vscale x 1 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1i16.nxv1i8( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i8>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv1i16_nxv1i16_nxv1i8(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1i16_nxv1i16_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv1i16.nxv1i8( + <vscale x 1 x i16> %0, + ptr %1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv2i16.nxv2i8( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv2i16_nxv2i16_nxv2i8(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2i16_nxv2i16_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv2i16.nxv2i8( + <vscale x 2 x i16> %0, + ptr %1, + <vscale x 2 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2i16.nxv2i8( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv2i16_nxv2i16_nxv2i8(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2i16_nxv2i16_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), 
v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv2i16.nxv2i8( + <vscale x 2 x i16> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv4i16.nxv4i8( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv4i16_nxv4i16_nxv4i8(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4i16_nxv4i16_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv4i16.nxv4i8( + <vscale x 4 x i16> %0, + ptr %1, + <vscale x 4 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4i16.nxv4i8( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i8>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv4i16_nxv4i16_nxv4i8(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4i16_nxv4i16_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv4i16.nxv4i8( + <vscale x 4 x i16> %0, + ptr %1, + <vscale x 4 x i8> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv8i16.nxv8i8( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv8i16_nxv8i16_nxv8i8(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8i16_nxv8i16_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv8i16.nxv8i8( + <vscale x 8 x i16> %0, + ptr %1, + <vscale x 8 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8i16.nxv8i8( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i8>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv8i16_nxv8i16_nxv8i8(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8i16_nxv8i16_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv8i16.nxv8i8( + <vscale x 8 x i16> %0, + ptr %1, + <vscale x 8 x i8> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv16i16.nxv16i8( + <vscale x 16 x i16>, + ptr, + <vscale x 16 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv16i16_nxv16i16_nxv16i8(<vscale x 16 x i16> %0, ptr %1, <vscale x 16 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv16i16_nxv16i16_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv16i16.nxv16i8( + <vscale x 16 x i16> %0, + ptr %1, + <vscale x 16 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv16i16.nxv16i8( + <vscale x 16 x i16>, + ptr, + <vscale x 16 x i8>, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv16i16_nxv16i16_nxv16i8(<vscale x 16 x i16> %0, ptr %1, <vscale x 16 x i8> 
%2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv16i16_nxv16i16_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv16i16.nxv16i8( + <vscale x 16 x i16> %0, + ptr %1, + <vscale x 16 x i8> %2, + <vscale x 16 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv32i16.nxv32i8( + <vscale x 32 x i16>, + ptr, + <vscale x 32 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv32i16_nxv32i16_nxv32i8(<vscale x 32 x i16> %0, ptr %1, <vscale x 32 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv32i16_nxv32i16_nxv32i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv32i16.nxv32i8( + <vscale x 32 x i16> %0, + ptr %1, + <vscale x 32 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv32i16.nxv32i8( + <vscale x 32 x i16>, + ptr, + <vscale x 32 x i8>, + <vscale x 32 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv32i16_nxv32i16_nxv32i8(<vscale x 32 x i16> %0, ptr %1, <vscale x 32 x i8> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv32i16_nxv32i16_nxv32i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv32i16.nxv32i8( + <vscale x 32 x i16> %0, + ptr %1, + <vscale x 32 x i8> %2, + <vscale x 32 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv1i32.nxv1i8( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv1i32_nxv1i32_nxv1i8(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1i32_nxv1i32_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv1i32.nxv1i8( + <vscale x 1 x i32> %0, + ptr %1, + <vscale x 1 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1i32.nxv1i8( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i8>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv1i32_nxv1i32_nxv1i8(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1i32_nxv1i32_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv1i32.nxv1i8( + <vscale x 1 x i32> %0, + ptr %1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv2i32.nxv2i8( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv2i32_nxv2i32_nxv2i8(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2i32_nxv2i32_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv2i32.nxv2i8( + <vscale x 2 x i32> %0, + ptr %1, + <vscale x 2 x i8> %2, + iXLen %3) + + ret void +} + 
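+; A short note on the pattern these autogenerated pairs follow (a reading aid,
+; not part of the checked output): the vsetvli in the CHECK lines carries the
+; data SEW/LMUL (e8..e64, mf8..m8) taken from the stored value's type, while
+; the "ei8" suffix on vsoxei8.v fixes the index EEW at 8 bits. The index
+; operand is therefore allocated after the data register group (v9, v10, v12,
+; v16 as the data LMUL grows) and occupies EMUL = (8/SEW)*LMUL registers. The
+; masked variants pass the mask in v0 and are checked via the trailing v0.t.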
+declare void @llvm.riscv.vsoxei.mask.nxv2i32.nxv2i8( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv2i32_nxv2i32_nxv2i8(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2i32_nxv2i32_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv2i32.nxv2i8( + <vscale x 2 x i32> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv4i32.nxv4i8( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv4i32_nxv4i32_nxv4i8(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4i32_nxv4i32_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv4i32.nxv4i8( + <vscale x 4 x i32> %0, + ptr %1, + <vscale x 4 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4i32.nxv4i8( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i8>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv4i32_nxv4i32_nxv4i8(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4i32_nxv4i32_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv4i32.nxv4i8( + <vscale x 4 x i32> %0, + ptr %1, + <vscale x 4 x i8> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv8i32.nxv8i8( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv8i32_nxv8i32_nxv8i8(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8i32_nxv8i32_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv8i32.nxv8i8( + <vscale x 8 x i32> %0, + ptr %1, + <vscale x 8 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8i32.nxv8i8( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i8>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv8i32_nxv8i32_nxv8i8(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8i32_nxv8i32_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv8i32.nxv8i8( + <vscale x 8 x i32> %0, + ptr %1, + <vscale x 8 x i8> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv16i32.nxv16i8( + <vscale x 16 x i32>, + ptr, + <vscale x 16 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv16i32_nxv16i32_nxv16i8(<vscale x 16 x i32> %0, ptr %1, <vscale x 16 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv16i32_nxv16i32_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, 
ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv16i32.nxv16i8( + <vscale x 16 x i32> %0, + ptr %1, + <vscale x 16 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv16i32.nxv16i8( + <vscale x 16 x i32>, + ptr, + <vscale x 16 x i8>, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv16i32_nxv16i32_nxv16i8(<vscale x 16 x i32> %0, ptr %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv16i32_nxv16i32_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv16i32.nxv16i8( + <vscale x 16 x i32> %0, + ptr %1, + <vscale x 16 x i8> %2, + <vscale x 16 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv1i64.nxv1i8( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv1i64_nxv1i64_nxv1i8(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1i64_nxv1i64_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv1i64.nxv1i8( + <vscale x 1 x i64> %0, + ptr %1, + <vscale x 1 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1i64.nxv1i8( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i8>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv1i64_nxv1i64_nxv1i8(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1i64_nxv1i64_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv1i64.nxv1i8( + <vscale x 1 x i64> %0, + ptr %1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv2i64.nxv2i8( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv2i64_nxv2i64_nxv2i8(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2i64_nxv2i64_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv2i64.nxv2i8( + <vscale x 2 x i64> %0, + ptr %1, + <vscale x 2 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2i64.nxv2i8( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv2i64_nxv2i64_nxv2i8(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2i64_nxv2i64_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv2i64.nxv2i8( + <vscale x 2 x i64> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv4i64.nxv4i8( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define void 
@intrinsic_vsoxei_v_nxv4i64_nxv4i64_nxv4i8(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4i64_nxv4i64_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv4i64.nxv4i8( + <vscale x 4 x i64> %0, + ptr %1, + <vscale x 4 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4i64.nxv4i8( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i8>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv4i64_nxv4i64_nxv4i8(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4i64_nxv4i64_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv4i64.nxv4i8( + <vscale x 4 x i64> %0, + ptr %1, + <vscale x 4 x i8> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv8i64.nxv8i8( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv8i64_nxv8i64_nxv8i8(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8i64_nxv8i64_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv8i64.nxv8i8( + <vscale x 8 x i64> %0, + ptr %1, + <vscale x 8 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8i64.nxv8i8( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i8>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv8i64_nxv8i64_nxv8i8(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8i64_nxv8i64_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv8i64.nxv8i8( + <vscale x 8 x i64> %0, + ptr %1, + <vscale x 8 x i8> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv1f16.nxv1i8( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv1f16_nxv1f16_nxv1i8(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1f16_nxv1f16_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv1f16.nxv1i8( + <vscale x 1 x half> %0, + ptr %1, + <vscale x 1 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1f16.nxv1i8( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i8>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv1f16_nxv1f16_nxv1i8(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1f16_nxv1f16_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv1f16.nxv1i8( + <vscale x 1 x half> %0, + ptr 
%1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv2f16.nxv2i8( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv2f16_nxv2f16_nxv2i8(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2f16_nxv2f16_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv2f16.nxv2i8( + <vscale x 2 x half> %0, + ptr %1, + <vscale x 2 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2f16.nxv2i8( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv2f16_nxv2f16_nxv2i8(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2f16_nxv2f16_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv2f16.nxv2i8( + <vscale x 2 x half> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv4f16.nxv4i8( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv4f16_nxv4f16_nxv4i8(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4f16_nxv4f16_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv4f16.nxv4i8( + <vscale x 4 x half> %0, + ptr %1, + <vscale x 4 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4f16.nxv4i8( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i8>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv4f16_nxv4f16_nxv4i8(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4f16_nxv4f16_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv4f16.nxv4i8( + <vscale x 4 x half> %0, + ptr %1, + <vscale x 4 x i8> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv8f16.nxv8i8( + <vscale x 8 x half>, + ptr, + <vscale x 8 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv8f16_nxv8f16_nxv8i8(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8f16_nxv8f16_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv8f16.nxv8i8( + <vscale x 8 x half> %0, + ptr %1, + <vscale x 8 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8f16.nxv8i8( + <vscale x 8 x half>, + ptr, + <vscale x 8 x i8>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv8f16_nxv8f16_nxv8i8(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8f16_nxv8f16_nxv8i8: +; CHECK: # %bb.0: 
# %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv8f16.nxv8i8( + <vscale x 8 x half> %0, + ptr %1, + <vscale x 8 x i8> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv16f16.nxv16i8( + <vscale x 16 x half>, + ptr, + <vscale x 16 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv16f16_nxv16f16_nxv16i8(<vscale x 16 x half> %0, ptr %1, <vscale x 16 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv16f16_nxv16f16_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv16f16.nxv16i8( + <vscale x 16 x half> %0, + ptr %1, + <vscale x 16 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv16f16.nxv16i8( + <vscale x 16 x half>, + ptr, + <vscale x 16 x i8>, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv16f16_nxv16f16_nxv16i8(<vscale x 16 x half> %0, ptr %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv16f16_nxv16f16_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv16f16.nxv16i8( + <vscale x 16 x half> %0, + ptr %1, + <vscale x 16 x i8> %2, + <vscale x 16 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv32f16.nxv32i8( + <vscale x 32 x half>, + ptr, + <vscale x 32 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv32f16_nxv32f16_nxv32i8(<vscale x 32 x half> %0, ptr %1, <vscale x 32 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv32f16_nxv32f16_nxv32i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv32f16.nxv32i8( + <vscale x 32 x half> %0, + ptr %1, + <vscale x 32 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv32f16.nxv32i8( + <vscale x 32 x half>, + ptr, + <vscale x 32 x i8>, + <vscale x 32 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv32f16_nxv32f16_nxv32i8(<vscale x 32 x half> %0, ptr %1, <vscale x 32 x i8> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv32f16_nxv32f16_nxv32i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv32f16.nxv32i8( + <vscale x 32 x half> %0, + ptr %1, + <vscale x 32 x i8> %2, + <vscale x 32 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv1f32.nxv1i8( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv1f32_nxv1f32_nxv1i8(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1f32_nxv1f32_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv1f32.nxv1i8( + <vscale x 1 x float> %0, + ptr %1, + <vscale x 1 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1f32.nxv1i8( + <vscale x 1 x float>, + ptr, + 
<vscale x 1 x i8>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv1f32_nxv1f32_nxv1i8(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1f32_nxv1f32_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv1f32.nxv1i8( + <vscale x 1 x float> %0, + ptr %1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv2f32.nxv2i8( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv2f32_nxv2f32_nxv2i8(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2f32_nxv2f32_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv2f32.nxv2i8( + <vscale x 2 x float> %0, + ptr %1, + <vscale x 2 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2f32.nxv2i8( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv2f32_nxv2f32_nxv2i8(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2f32_nxv2f32_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv2f32.nxv2i8( + <vscale x 2 x float> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv4f32.nxv4i8( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv4f32_nxv4f32_nxv4i8(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4f32_nxv4f32_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv4f32.nxv4i8( + <vscale x 4 x float> %0, + ptr %1, + <vscale x 4 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4f32.nxv4i8( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i8>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv4f32_nxv4f32_nxv4i8(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4f32_nxv4f32_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv4f32.nxv4i8( + <vscale x 4 x float> %0, + ptr %1, + <vscale x 4 x i8> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv8f32.nxv8i8( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv8f32_nxv8f32_nxv8i8(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8f32_nxv8f32_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v12 +; CHECK-NEXT: ret 
+entry: + call void @llvm.riscv.vsoxei.nxv8f32.nxv8i8( + <vscale x 8 x float> %0, + ptr %1, + <vscale x 8 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8f32.nxv8i8( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i8>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv8f32_nxv8f32_nxv8i8(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8f32_nxv8f32_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv8f32.nxv8i8( + <vscale x 8 x float> %0, + ptr %1, + <vscale x 8 x i8> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv16f32.nxv16i8( + <vscale x 16 x float>, + ptr, + <vscale x 16 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv16f32_nxv16f32_nxv16i8(<vscale x 16 x float> %0, ptr %1, <vscale x 16 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv16f32_nxv16f32_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv16f32.nxv16i8( + <vscale x 16 x float> %0, + ptr %1, + <vscale x 16 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv16f32.nxv16i8( + <vscale x 16 x float>, + ptr, + <vscale x 16 x i8>, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv16f32_nxv16f32_nxv16i8(<vscale x 16 x float> %0, ptr %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv16f32_nxv16f32_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv16f32.nxv16i8( + <vscale x 16 x float> %0, + ptr %1, + <vscale x 16 x i8> %2, + <vscale x 16 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv1f64.nxv1i8( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv1f64_nxv1f64_nxv1i8(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1f64_nxv1f64_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv1f64.nxv1i8( + <vscale x 1 x double> %0, + ptr %1, + <vscale x 1 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1f64.nxv1i8( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i8>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv1f64_nxv1f64_nxv1i8(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1f64_nxv1f64_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv1f64.nxv1i8( + <vscale x 1 x double> %0, + ptr %1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv2f64.nxv2i8( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define void 
@intrinsic_vsoxei_v_nxv2f64_nxv2f64_nxv2i8(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2f64_nxv2f64_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv2f64.nxv2i8( + <vscale x 2 x double> %0, + ptr %1, + <vscale x 2 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2f64.nxv2i8( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv2f64_nxv2f64_nxv2i8(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2f64_nxv2f64_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv2f64.nxv2i8( + <vscale x 2 x double> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv4f64.nxv4i8( + <vscale x 4 x double>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv4f64_nxv4f64_nxv4i8(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4f64_nxv4f64_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv4f64.nxv4i8( + <vscale x 4 x double> %0, + ptr %1, + <vscale x 4 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4f64.nxv4i8( + <vscale x 4 x double>, + ptr, + <vscale x 4 x i8>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv4f64_nxv4f64_nxv4i8(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4f64_nxv4f64_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv4f64.nxv4i8( + <vscale x 4 x double> %0, + ptr %1, + <vscale x 4 x i8> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv8f64.nxv8i8( + <vscale x 8 x double>, + ptr, + <vscale x 8 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv8f64_nxv8f64_nxv8i8(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8f64_nxv8f64_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv8f64.nxv8i8( + <vscale x 8 x double> %0, + ptr %1, + <vscale x 8 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8f64.nxv8i8( + <vscale x 8 x double>, + ptr, + <vscale x 8 x i8>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv8f64_nxv8f64_nxv8i8(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8f64_nxv8f64_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void 
@llvm.riscv.vsoxei.mask.nxv8f64.nxv8i8( + <vscale x 8 x double> %0, + ptr %1, + <vscale x 8 x i8> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vsuxei-rv64.ll b/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vsuxei-rv64.ll new file mode 100644 index 0000000000000..9bd272a368d20 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vsuxei-rv64.ll @@ -0,0 +1,1310 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv64 -mattr=+v,+zvfh,+zvfbfmin -global-isel -verify-machineinstrs \ +; RUN: < %s | FileCheck %s + +; The intrinsics are not supported with RV32. + +declare void @llvm.riscv.vsuxei.nxv1i8.nxv1i64( + <vscale x 1 x i8>, + ptr, + <vscale x 1 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv1i8_nxv1i8_nxv1i64(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1i8_nxv1i8_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv1i8.nxv1i64( + <vscale x 1 x i8> %0, + ptr %1, + <vscale x 1 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv1i8.nxv1i64( + <vscale x 1 x i8>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64); + +define void @intrinsic_vsuxei_mask_v_nxv1i8_nxv1i8_nxv1i64(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1i8_nxv1i8_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv1i8.nxv1i64( + <vscale x 1 x i8> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> %3, + i64 %4) + + ret void +} + +define void @intrinsic_vsuxei_allonesmask_v_nxv1i8_nxv1i8_nxv1i64(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_allonesmask_v_nxv1i8_nxv1i8_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv1i8.nxv1i64( + <vscale x 1 x i8> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> splat (i1 true), + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv2i8.nxv2i64( + <vscale x 2 x i8>, + ptr, + <vscale x 2 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv2i8_nxv2i8_nxv2i64(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2i8_nxv2i8_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv2i8.nxv2i64( + <vscale x 2 x i8> %0, + ptr %1, + <vscale x 2 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2i8.nxv2i64( + <vscale x 2 x i8>, + ptr, + <vscale x 2 x i64>, + <vscale x 2 x i1>, + i64); + +define void @intrinsic_vsuxei_mask_v_nxv2i8_nxv2i8_nxv2i64(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2i8_nxv2i8_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void 
@llvm.riscv.vsuxei.mask.nxv2i8.nxv2i64( + <vscale x 2 x i8> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv4i8.nxv4i64( + <vscale x 4 x i8>, + ptr, + <vscale x 4 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv4i8_nxv4i8_nxv4i64(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4i8_nxv4i8_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv4i8.nxv4i64( + <vscale x 4 x i8> %0, + ptr %1, + <vscale x 4 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4i8.nxv4i64( + <vscale x 4 x i8>, + ptr, + <vscale x 4 x i64>, + <vscale x 4 x i1>, + i64); + +define void @intrinsic_vsuxei_mask_v_nxv4i8_nxv4i8_nxv4i64(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4i8_nxv4i8_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv4i8.nxv4i64( + <vscale x 4 x i8> %0, + ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv8i8.nxv8i64( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv8i8_nxv8i8_nxv8i64(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8i8_nxv8i8_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv8i8.nxv8i64( + <vscale x 8 x i8> %0, + ptr %1, + <vscale x 8 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8i8.nxv8i64( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i64>, + <vscale x 8 x i1>, + i64); + +define void @intrinsic_vsuxei_mask_v_nxv8i8_nxv8i8_nxv8i64(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8i8_nxv8i8_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv8i8.nxv8i64( + <vscale x 8 x i8> %0, + ptr %1, + <vscale x 8 x i64> %2, + <vscale x 8 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv1i16.nxv1i64( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv1i16_nxv1i16_nxv1i64(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1i16_nxv1i16_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv1i16.nxv1i64( + <vscale x 1 x i16> %0, + ptr %1, + <vscale x 1 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv1i16.nxv1i64( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64); + +define void @intrinsic_vsuxei_mask_v_nxv1i16_nxv1i16_nxv1i64(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: 
intrinsic_vsuxei_mask_v_nxv1i16_nxv1i16_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv1i16.nxv1i64( + <vscale x 1 x i16> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv2i16.nxv2i64( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv2i16_nxv2i16_nxv2i64(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2i16_nxv2i16_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv2i16.nxv2i64( + <vscale x 2 x i16> %0, + ptr %1, + <vscale x 2 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2i16.nxv2i64( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i64>, + <vscale x 2 x i1>, + i64); + +define void @intrinsic_vsuxei_mask_v_nxv2i16_nxv2i16_nxv2i64(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2i16_nxv2i16_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv2i16.nxv2i64( + <vscale x 2 x i16> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv4i16.nxv4i64( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv4i16_nxv4i16_nxv4i64(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4i16_nxv4i16_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv4i16.nxv4i64( + <vscale x 4 x i16> %0, + ptr %1, + <vscale x 4 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4i16.nxv4i64( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i64>, + <vscale x 4 x i1>, + i64); + +define void @intrinsic_vsuxei_mask_v_nxv4i16_nxv4i16_nxv4i64(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4i16_nxv4i16_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv4i16.nxv4i64( + <vscale x 4 x i16> %0, + ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv8i16.nxv8i64( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv8i16_nxv8i16_nxv8i64(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8i16_nxv8i16_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv8i16.nxv8i64( + <vscale x 8 x i16> %0, + ptr %1, + <vscale x 8 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8i16.nxv8i64( + <vscale x 8 x i16>, + ptr, + <vscale x 8 
x i64>, + <vscale x 8 x i1>, + i64); + +define void @intrinsic_vsuxei_mask_v_nxv8i16_nxv8i16_nxv8i64(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8i16_nxv8i16_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv8i16.nxv8i64( + <vscale x 8 x i16> %0, + ptr %1, + <vscale x 8 x i64> %2, + <vscale x 8 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv1i32.nxv1i64( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv1i32_nxv1i32_nxv1i64(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1i32_nxv1i32_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv1i32.nxv1i64( + <vscale x 1 x i32> %0, + ptr %1, + <vscale x 1 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv1i32.nxv1i64( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64); + +define void @intrinsic_vsuxei_mask_v_nxv1i32_nxv1i32_nxv1i64(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1i32_nxv1i32_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv1i32.nxv1i64( + <vscale x 1 x i32> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv2i32.nxv2i64( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv2i32_nxv2i32_nxv2i64(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2i32_nxv2i32_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv2i32.nxv2i64( + <vscale x 2 x i32> %0, + ptr %1, + <vscale x 2 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2i32.nxv2i64( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i64>, + <vscale x 2 x i1>, + i64); + +define void @intrinsic_vsuxei_mask_v_nxv2i32_nxv2i32_nxv2i64(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2i32_nxv2i32_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv2i32.nxv2i64( + <vscale x 2 x i32> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv4i32.nxv4i64( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv4i32_nxv4i32_nxv4i64(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4i32_nxv4i32_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void 
@llvm.riscv.vsuxei.nxv4i32.nxv4i64( + <vscale x 4 x i32> %0, + ptr %1, + <vscale x 4 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4i32.nxv4i64( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i64>, + <vscale x 4 x i1>, + i64); + +define void @intrinsic_vsuxei_mask_v_nxv4i32_nxv4i32_nxv4i64(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4i32_nxv4i32_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv4i32.nxv4i64( + <vscale x 4 x i32> %0, + ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv8i32.nxv8i64( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv8i32_nxv8i32_nxv8i64(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8i32_nxv8i32_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv8i32.nxv8i64( + <vscale x 8 x i32> %0, + ptr %1, + <vscale x 8 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8i32.nxv8i64( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i64>, + <vscale x 8 x i1>, + i64); + +define void @intrinsic_vsuxei_mask_v_nxv8i32_nxv8i32_nxv8i64(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8i32_nxv8i32_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv8i32.nxv8i64( + <vscale x 8 x i32> %0, + ptr %1, + <vscale x 8 x i64> %2, + <vscale x 8 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv1i64.nxv1i64( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv1i64_nxv1i64_nxv1i64(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1i64_nxv1i64_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv1i64.nxv1i64( + <vscale x 1 x i64> %0, + ptr %1, + <vscale x 1 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv1i64.nxv1i64( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64); + +define void @intrinsic_vsuxei_mask_v_nxv1i64_nxv1i64_nxv1i64(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1i64_nxv1i64_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv1i64.nxv1i64( + <vscale x 1 x i64> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv2i64.nxv2i64( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv2i64_nxv2i64_nxv2i64(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i64> %2, i64 %3) nounwind { +; 
CHECK-LABEL: intrinsic_vsuxei_v_nxv2i64_nxv2i64_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv2i64.nxv2i64( + <vscale x 2 x i64> %0, + ptr %1, + <vscale x 2 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2i64.nxv2i64( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i64>, + <vscale x 2 x i1>, + i64); + +define void @intrinsic_vsuxei_mask_v_nxv2i64_nxv2i64_nxv2i64(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2i64_nxv2i64_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv2i64.nxv2i64( + <vscale x 2 x i64> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv4i64.nxv4i64( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv4i64_nxv4i64_nxv4i64(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4i64_nxv4i64_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv4i64.nxv4i64( + <vscale x 4 x i64> %0, + ptr %1, + <vscale x 4 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4i64.nxv4i64( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i64>, + <vscale x 4 x i1>, + i64); + +define void @intrinsic_vsuxei_mask_v_nxv4i64_nxv4i64_nxv4i64(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4i64_nxv4i64_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv4i64.nxv4i64( + <vscale x 4 x i64> %0, + ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv8i64.nxv8i64( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv8i64_nxv8i64_nxv8i64(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8i64_nxv8i64_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv8i64.nxv8i64( + <vscale x 8 x i64> %0, + ptr %1, + <vscale x 8 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8i64.nxv8i64( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i64>, + <vscale x 8 x i1>, + i64); + +define void @intrinsic_vsuxei_mask_v_nxv8i64_nxv8i64_nxv8i64(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8i64_nxv8i64_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv8i64.nxv8i64( + <vscale x 8 x i64> %0, + ptr %1, + <vscale x 8 x i64> %2, + <vscale x 8 x i1> %3, + i64 %4) + + ret void +} + +declare void 
@llvm.riscv.vsuxei.nxv1f16.nxv1i64( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv1f16_nxv1f16_nxv1i64(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1f16_nxv1f16_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv1f16.nxv1i64( + <vscale x 1 x half> %0, + ptr %1, + <vscale x 1 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv1f16.nxv1i64( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64); + +define void @intrinsic_vsuxei_mask_v_nxv1f16_nxv1f16_nxv1i64(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1f16_nxv1f16_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv1f16.nxv1i64( + <vscale x 1 x half> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv2f16.nxv2i64( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv2f16_nxv2f16_nxv2i64(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2f16_nxv2f16_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv2f16.nxv2i64( + <vscale x 2 x half> %0, + ptr %1, + <vscale x 2 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2f16.nxv2i64( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i64>, + <vscale x 2 x i1>, + i64); + +define void @intrinsic_vsuxei_mask_v_nxv2f16_nxv2f16_nxv2i64(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2f16_nxv2f16_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv2f16.nxv2i64( + <vscale x 2 x half> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv4f16.nxv4i64( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv4f16_nxv4f16_nxv4i64(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4f16_nxv4f16_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv4f16.nxv4i64( + <vscale x 4 x half> %0, + ptr %1, + <vscale x 4 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4f16.nxv4i64( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i64>, + <vscale x 4 x i1>, + i64); + +define void @intrinsic_vsuxei_mask_v_nxv4f16_nxv4f16_nxv4i64(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4f16_nxv4f16_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: 
vsuxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv4f16.nxv4i64( + <vscale x 4 x half> %0, + ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv8f16.nxv8i64( + <vscale x 8 x half>, + ptr, + <vscale x 8 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv8f16_nxv8f16_nxv8i64(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8f16_nxv8f16_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv8f16.nxv8i64( + <vscale x 8 x half> %0, + ptr %1, + <vscale x 8 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8f16.nxv8i64( + <vscale x 8 x half>, + ptr, + <vscale x 8 x i64>, + <vscale x 8 x i1>, + i64); + +define void @intrinsic_vsuxei_mask_v_nxv8f16_nxv8f16_nxv8i64(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8f16_nxv8f16_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv8f16.nxv8i64( + <vscale x 8 x half> %0, + ptr %1, + <vscale x 8 x i64> %2, + <vscale x 8 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv1f32.nxv1i64( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv1f32_nxv1f32_nxv1i64(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1f32_nxv1f32_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv1f32.nxv1i64( + <vscale x 1 x float> %0, + ptr %1, + <vscale x 1 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv1f32.nxv1i64( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64); + +define void @intrinsic_vsuxei_mask_v_nxv1f32_nxv1f32_nxv1i64(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1f32_nxv1f32_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv1f32.nxv1i64( + <vscale x 1 x float> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv2f32.nxv2i64( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv2f32_nxv2f32_nxv2i64(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2f32_nxv2f32_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv2f32.nxv2i64( + <vscale x 2 x float> %0, + ptr %1, + <vscale x 2 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2f32.nxv2i64( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i64>, + <vscale x 2 x i1>, + i64); + +define void @intrinsic_vsuxei_mask_v_nxv2f32_nxv2f32_nxv2i64(<vscale x 2 x 
float> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2f32_nxv2f32_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv2f32.nxv2i64( + <vscale x 2 x float> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv4f32.nxv4i64( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv4f32_nxv4f32_nxv4i64(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4f32_nxv4f32_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv4f32.nxv4i64( + <vscale x 4 x float> %0, + ptr %1, + <vscale x 4 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4f32.nxv4i64( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i64>, + <vscale x 4 x i1>, + i64); + +define void @intrinsic_vsuxei_mask_v_nxv4f32_nxv4f32_nxv4i64(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4f32_nxv4f32_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv4f32.nxv4i64( + <vscale x 4 x float> %0, + ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv8f32.nxv8i64( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv8f32_nxv8f32_nxv8i64(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8f32_nxv8f32_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv8f32.nxv8i64( + <vscale x 8 x float> %0, + ptr %1, + <vscale x 8 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8f32.nxv8i64( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i64>, + <vscale x 8 x i1>, + i64); + +define void @intrinsic_vsuxei_mask_v_nxv8f32_nxv8f32_nxv8i64(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8f32_nxv8f32_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv8f32.nxv8i64( + <vscale x 8 x float> %0, + ptr %1, + <vscale x 8 x i64> %2, + <vscale x 8 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv1f64.nxv1i64( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv1f64_nxv1f64_nxv1i64(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1f64_nxv1f64_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv1f64.nxv1i64( + <vscale x 1 x double> %0, + ptr %1, + <vscale x 1 x 
i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv1f64.nxv1i64( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64); + +define void @intrinsic_vsuxei_mask_v_nxv1f64_nxv1f64_nxv1i64(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1f64_nxv1f64_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv1f64.nxv1i64( + <vscale x 1 x double> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv2f64.nxv2i64( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv2f64_nxv2f64_nxv2i64(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2f64_nxv2f64_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv2f64.nxv2i64( + <vscale x 2 x double> %0, + ptr %1, + <vscale x 2 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2f64.nxv2i64( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i64>, + <vscale x 2 x i1>, + i64); + +define void @intrinsic_vsuxei_mask_v_nxv2f64_nxv2f64_nxv2i64(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2f64_nxv2f64_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv2f64.nxv2i64( + <vscale x 2 x double> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv4f64.nxv4i64( + <vscale x 4 x double>, + ptr, + <vscale x 4 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv4f64_nxv4f64_nxv4i64(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4f64_nxv4f64_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv4f64.nxv4i64( + <vscale x 4 x double> %0, + ptr %1, + <vscale x 4 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4f64.nxv4i64( + <vscale x 4 x double>, + ptr, + <vscale x 4 x i64>, + <vscale x 4 x i1>, + i64); + +define void @intrinsic_vsuxei_mask_v_nxv4f64_nxv4f64_nxv4i64(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4f64_nxv4f64_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv4f64.nxv4i64( + <vscale x 4 x double> %0, + ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv8f64.nxv8i64( + <vscale x 8 x double>, + ptr, + <vscale x 8 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv8f64_nxv8f64_nxv8i64(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: 
intrinsic_vsuxei_v_nxv8f64_nxv8f64_nxv8i64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; CHECK-NEXT:    vsuxei64.v v8, (a0), v16
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.nxv8f64.nxv8i64(
+    <vscale x 8 x double> %0,
+    ptr %1,
+    <vscale x 8 x i64> %2,
+    i64 %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.mask.nxv8f64.nxv8i64(
+  <vscale x 8 x double>,
+  ptr,
+  <vscale x 8 x i64>,
+  <vscale x 8 x i1>,
+  i64);
+
+define void @intrinsic_vsuxei_mask_v_nxv8f64_nxv8f64_nxv8i64(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8f64_nxv8f64_nxv8i64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; CHECK-NEXT:    vsuxei64.v v8, (a0), v16, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.mask.nxv8f64.nxv8i64(
+    <vscale x 8 x double> %0,
+    ptr %1,
+    <vscale x 8 x i64> %2,
+    <vscale x 8 x i1> %3,
+    i64 %4)
+
+  ret void
+}
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vsuxei.ll b/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vsuxei.ll
new file mode 100644
index 0000000000000..7cd15454d40b9
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vsuxei.ll
@@ -0,0 +1,4881 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zvfhmin,+zvfbfmin \
+; RUN:   -global-isel -verify-machineinstrs -target-abi=ilp32d | FileCheck %s
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zvfhmin,+zvfbfmin \
+; RUN:   -global-isel -verify-machineinstrs -target-abi=lp64d | FileCheck %s
+
+declare void @llvm.riscv.vsuxei.nxv1i8.nxv1i32(
+  <vscale x 1 x i8>,
+  ptr,
+  <vscale x 1 x i32>,
+  iXLen);
+
+define void @intrinsic_vsuxei_v_nxv1i8_nxv1i8_nxv1i32(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i32> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_v_nxv1i8_nxv1i8_nxv1i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma
+; CHECK-NEXT:    vsuxei32.v v8, (a0), v9
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.nxv1i8.nxv1i32(
+    <vscale x 1 x i8> %0,
+    ptr %1,
+    <vscale x 1 x i32> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.mask.nxv1i8.nxv1i32(
+  <vscale x 1 x i8>,
+  ptr,
+  <vscale x 1 x i32>,
+  <vscale x 1 x i1>,
+  iXLen);
+
+define void @intrinsic_vsuxei_mask_v_nxv1i8_nxv1i8_nxv1i32(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1i8_nxv1i8_nxv1i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma
+; CHECK-NEXT:    vsuxei32.v v8, (a0), v9, v0.t
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.mask.nxv1i8.nxv1i32(
+    <vscale x 1 x i8> %0,
+    ptr %1,
+    <vscale x 1 x i32> %2,
+    <vscale x 1 x i1> %3,
+    iXLen %4)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.nxv2i8.nxv2i32(
+  <vscale x 2 x i8>,
+  ptr,
+  <vscale x 2 x i32>,
+  iXLen);
+
+define void @intrinsic_vsuxei_v_nxv2i8_nxv2i8_nxv2i32(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i32> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_v_nxv2i8_nxv2i8_nxv2i32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma
+; CHECK-NEXT:    vsuxei32.v v8, (a0), v9
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.riscv.vsuxei.nxv2i8.nxv2i32(
+    <vscale x 2 x i8> %0,
+    ptr %1,
+    <vscale x 2 x i32> %2,
+    iXLen %3)
+
+  ret void
+}
+
+declare void @llvm.riscv.vsuxei.mask.nxv2i8.nxv2i32(
+
<vscale x 2 x i8>, + ptr, + <vscale x 2 x i32>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv2i8_nxv2i8_nxv2i32(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2i8_nxv2i8_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv2i8.nxv2i32( + <vscale x 2 x i8> %0, + ptr %1, + <vscale x 2 x i32> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv4i8.nxv4i32( + <vscale x 4 x i8>, + ptr, + <vscale x 4 x i32>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv4i8_nxv4i8_nxv4i32(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4i8_nxv4i8_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv4i8.nxv4i32( + <vscale x 4 x i8> %0, + ptr %1, + <vscale x 4 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4i8.nxv4i32( + <vscale x 4 x i8>, + ptr, + <vscale x 4 x i32>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv4i8_nxv4i8_nxv4i32(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4i8_nxv4i8_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv4i8.nxv4i32( + <vscale x 4 x i8> %0, + ptr %1, + <vscale x 4 x i32> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv8i8.nxv8i32( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i32>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv8i8_nxv8i8_nxv8i32(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8i8_nxv8i8_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv8i8.nxv8i32( + <vscale x 8 x i8> %0, + ptr %1, + <vscale x 8 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8i8.nxv8i32( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i32>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv8i8_nxv8i8_nxv8i32(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8i8_nxv8i8_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv8i8.nxv8i32( + <vscale x 8 x i8> %0, + ptr %1, + <vscale x 8 x i32> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv16i8.nxv16i32( + <vscale x 16 x i8>, + ptr, + <vscale x 16 x i32>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv16i8_nxv16i8_nxv16i32(<vscale x 16 x i8> %0, ptr %1, <vscale x 16 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv16i8_nxv16i8_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v16 +; CHECK-NEXT: ret 
+entry: + call void @llvm.riscv.vsuxei.nxv16i8.nxv16i32( + <vscale x 16 x i8> %0, + ptr %1, + <vscale x 16 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv16i8.nxv16i32( + <vscale x 16 x i8>, + ptr, + <vscale x 16 x i32>, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv16i8_nxv16i8_nxv16i32(<vscale x 16 x i8> %0, ptr %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv16i8_nxv16i8_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv16i8.nxv16i32( + <vscale x 16 x i8> %0, + ptr %1, + <vscale x 16 x i32> %2, + <vscale x 16 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv1i16.nxv1i32( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i32>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv1i16_nxv1i16_nxv1i32(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1i16_nxv1i16_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv1i16.nxv1i32( + <vscale x 1 x i16> %0, + ptr %1, + <vscale x 1 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv1i16.nxv1i32( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i32>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv1i16_nxv1i16_nxv1i32(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1i16_nxv1i16_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv1i16.nxv1i32( + <vscale x 1 x i16> %0, + ptr %1, + <vscale x 1 x i32> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv2i16.nxv2i32( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i32>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv2i16_nxv2i16_nxv2i32(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2i16_nxv2i16_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv2i16.nxv2i32( + <vscale x 2 x i16> %0, + ptr %1, + <vscale x 2 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2i16.nxv2i32( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i32>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv2i16_nxv2i16_nxv2i32(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2i16_nxv2i16_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv2i16.nxv2i32( + <vscale x 2 x i16> %0, + ptr %1, + <vscale x 2 x i32> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv4i16.nxv4i32( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i32>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv4i16_nxv4i16_nxv4i32(<vscale x 4 x 
i16> %0, ptr %1, <vscale x 4 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4i16_nxv4i16_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv4i16.nxv4i32( + <vscale x 4 x i16> %0, + ptr %1, + <vscale x 4 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4i16.nxv4i32( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i32>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv4i16_nxv4i16_nxv4i32(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4i16_nxv4i16_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv4i16.nxv4i32( + <vscale x 4 x i16> %0, + ptr %1, + <vscale x 4 x i32> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv8i16.nxv8i32( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i32>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv8i16_nxv8i16_nxv8i32(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8i16_nxv8i16_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv8i16.nxv8i32( + <vscale x 8 x i16> %0, + ptr %1, + <vscale x 8 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8i16.nxv8i32( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i32>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv8i16_nxv8i16_nxv8i32(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8i16_nxv8i16_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv8i16.nxv8i32( + <vscale x 8 x i16> %0, + ptr %1, + <vscale x 8 x i32> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv16i16.nxv16i32( + <vscale x 16 x i16>, + ptr, + <vscale x 16 x i32>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv16i16_nxv16i16_nxv16i32(<vscale x 16 x i16> %0, ptr %1, <vscale x 16 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv16i16_nxv16i16_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv16i16.nxv16i32( + <vscale x 16 x i16> %0, + ptr %1, + <vscale x 16 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv16i16.nxv16i32( + <vscale x 16 x i16>, + ptr, + <vscale x 16 x i32>, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv16i16_nxv16i16_nxv16i32(<vscale x 16 x i16> %0, ptr %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv16i16_nxv16i16_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv16i16.nxv16i32( + <vscale x 16 x 
i16> %0, + ptr %1, + <vscale x 16 x i32> %2, + <vscale x 16 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv1i32.nxv1i32( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i32>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv1i32_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1i32_nxv1i32_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv1i32.nxv1i32( + <vscale x 1 x i32> %0, + ptr %1, + <vscale x 1 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv1i32.nxv1i32( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i32>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv1i32_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1i32_nxv1i32_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv1i32.nxv1i32( + <vscale x 1 x i32> %0, + ptr %1, + <vscale x 1 x i32> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv2i32.nxv2i32( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i32>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv2i32_nxv2i32_nxv2i32(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2i32_nxv2i32_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv2i32.nxv2i32( + <vscale x 2 x i32> %0, + ptr %1, + <vscale x 2 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2i32.nxv2i32( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i32>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv2i32_nxv2i32_nxv2i32(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2i32_nxv2i32_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv2i32.nxv2i32( + <vscale x 2 x i32> %0, + ptr %1, + <vscale x 2 x i32> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv4i32.nxv4i32( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i32>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv4i32_nxv4i32_nxv4i32(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4i32_nxv4i32_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv4i32.nxv4i32( + <vscale x 4 x i32> %0, + ptr %1, + <vscale x 4 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4i32.nxv4i32( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i32>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv4i32_nxv4i32_nxv4i32(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: 
intrinsic_vsuxei_mask_v_nxv4i32_nxv4i32_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv4i32.nxv4i32( + <vscale x 4 x i32> %0, + ptr %1, + <vscale x 4 x i32> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv8i32.nxv8i32( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i32>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv8i32_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8i32_nxv8i32_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv8i32.nxv8i32( + <vscale x 8 x i32> %0, + ptr %1, + <vscale x 8 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8i32.nxv8i32( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i32>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv8i32_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8i32_nxv8i32_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv8i32.nxv8i32( + <vscale x 8 x i32> %0, + ptr %1, + <vscale x 8 x i32> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv16i32.nxv16i32( + <vscale x 16 x i32>, + ptr, + <vscale x 16 x i32>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv16i32_nxv16i32_nxv16i32(<vscale x 16 x i32> %0, ptr %1, <vscale x 16 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv16i32_nxv16i32_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv16i32.nxv16i32( + <vscale x 16 x i32> %0, + ptr %1, + <vscale x 16 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv16i32.nxv16i32( + <vscale x 16 x i32>, + ptr, + <vscale x 16 x i32>, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv16i32_nxv16i32_nxv16i32(<vscale x 16 x i32> %0, ptr %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv16i32_nxv16i32_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv16i32.nxv16i32( + <vscale x 16 x i32> %0, + ptr %1, + <vscale x 16 x i32> %2, + <vscale x 16 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv1i64.nxv1i32( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i32>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv1i64_nxv1i64_nxv1i32(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1i64_nxv1i64_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv1i64.nxv1i32( + <vscale x 1 x i64> %0, + ptr %1, + <vscale x 1 x i32> %2, + iXLen %3) + + ret void +} + +declare void 
@llvm.riscv.vsuxei.mask.nxv1i64.nxv1i32( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i32>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv1i64_nxv1i64_nxv1i32(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1i64_nxv1i64_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv1i64.nxv1i32( + <vscale x 1 x i64> %0, + ptr %1, + <vscale x 1 x i32> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv2i64.nxv2i32( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i32>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv2i64_nxv2i64_nxv2i32(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2i64_nxv2i64_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv2i64.nxv2i32( + <vscale x 2 x i64> %0, + ptr %1, + <vscale x 2 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2i64.nxv2i32( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i32>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv2i64_nxv2i64_nxv2i32(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2i64_nxv2i64_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv2i64.nxv2i32( + <vscale x 2 x i64> %0, + ptr %1, + <vscale x 2 x i32> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv4i64.nxv4i32( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i32>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv4i64_nxv4i64_nxv4i32(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4i64_nxv4i64_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv4i64.nxv4i32( + <vscale x 4 x i64> %0, + ptr %1, + <vscale x 4 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4i64.nxv4i32( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i32>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv4i64_nxv4i64_nxv4i32(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4i64_nxv4i64_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv4i64.nxv4i32( + <vscale x 4 x i64> %0, + ptr %1, + <vscale x 4 x i32> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv8i64.nxv8i32( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i32>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv8i64_nxv8i64_nxv8i32(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8i64_nxv8i64_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli 
zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv8i64.nxv8i32( + <vscale x 8 x i64> %0, + ptr %1, + <vscale x 8 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8i64.nxv8i32( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i32>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv8i64_nxv8i64_nxv8i32(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8i64_nxv8i64_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv8i64.nxv8i32( + <vscale x 8 x i64> %0, + ptr %1, + <vscale x 8 x i32> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv1f16.nxv1i32( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i32>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv1f16_nxv1f16_nxv1i32(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1f16_nxv1f16_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv1f16.nxv1i32( + <vscale x 1 x half> %0, + ptr %1, + <vscale x 1 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv1f16.nxv1i32( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i32>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv1f16_nxv1f16_nxv1i32(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1f16_nxv1f16_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv1f16.nxv1i32( + <vscale x 1 x half> %0, + ptr %1, + <vscale x 1 x i32> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv2f16.nxv2i32( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i32>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv2f16_nxv2f16_nxv2i32(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2f16_nxv2f16_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv2f16.nxv2i32( + <vscale x 2 x half> %0, + ptr %1, + <vscale x 2 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2f16.nxv2i32( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i32>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv2f16_nxv2f16_nxv2i32(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2f16_nxv2f16_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv2f16.nxv2i32( + <vscale x 2 x half> %0, + ptr %1, + <vscale x 2 x i32> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv4f16.nxv4i32( + <vscale x 4 x half>, + ptr, + <vscale x 4 x 
i32>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv4f16_nxv4f16_nxv4i32(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4f16_nxv4f16_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv4f16.nxv4i32( + <vscale x 4 x half> %0, + ptr %1, + <vscale x 4 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4f16.nxv4i32( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i32>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv4f16_nxv4f16_nxv4i32(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4f16_nxv4f16_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv4f16.nxv4i32( + <vscale x 4 x half> %0, + ptr %1, + <vscale x 4 x i32> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv8f16.nxv8i32( + <vscale x 8 x half>, + ptr, + <vscale x 8 x i32>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv8f16_nxv8f16_nxv8i32(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8f16_nxv8f16_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv8f16.nxv8i32( + <vscale x 8 x half> %0, + ptr %1, + <vscale x 8 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8f16.nxv8i32( + <vscale x 8 x half>, + ptr, + <vscale x 8 x i32>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv8f16_nxv8f16_nxv8i32(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8f16_nxv8f16_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv8f16.nxv8i32( + <vscale x 8 x half> %0, + ptr %1, + <vscale x 8 x i32> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv16f16.nxv16i32( + <vscale x 16 x half>, + ptr, + <vscale x 16 x i32>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv16f16_nxv16f16_nxv16i32(<vscale x 16 x half> %0, ptr %1, <vscale x 16 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv16f16_nxv16f16_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv16f16.nxv16i32( + <vscale x 16 x half> %0, + ptr %1, + <vscale x 16 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv16f16.nxv16i32( + <vscale x 16 x half>, + ptr, + <vscale x 16 x i32>, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv16f16_nxv16f16_nxv16i32(<vscale x 16 x half> %0, ptr %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv16f16_nxv16f16_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v16, 
v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv16f16.nxv16i32( + <vscale x 16 x half> %0, + ptr %1, + <vscale x 16 x i32> %2, + <vscale x 16 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv1f32.nxv1i32( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i32>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv1f32_nxv1f32_nxv1i32(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1f32_nxv1f32_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv1f32.nxv1i32( + <vscale x 1 x float> %0, + ptr %1, + <vscale x 1 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv1f32.nxv1i32( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i32>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv1f32_nxv1f32_nxv1i32(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1f32_nxv1f32_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv1f32.nxv1i32( + <vscale x 1 x float> %0, + ptr %1, + <vscale x 1 x i32> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv2f32.nxv2i32( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i32>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv2f32_nxv2f32_nxv2i32(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2f32_nxv2f32_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv2f32.nxv2i32( + <vscale x 2 x float> %0, + ptr %1, + <vscale x 2 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2f32.nxv2i32( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i32>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv2f32_nxv2f32_nxv2i32(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2f32_nxv2f32_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv2f32.nxv2i32( + <vscale x 2 x float> %0, + ptr %1, + <vscale x 2 x i32> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv4f32.nxv4i32( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i32>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv4f32_nxv4f32_nxv4i32(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4f32_nxv4f32_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv4f32.nxv4i32( + <vscale x 4 x float> %0, + ptr %1, + <vscale x 4 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4f32.nxv4i32( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i32>, + <vscale x 4 x i1>, + iXLen); + +define void 
@intrinsic_vsuxei_mask_v_nxv4f32_nxv4f32_nxv4i32(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4f32_nxv4f32_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv4f32.nxv4i32( + <vscale x 4 x float> %0, + ptr %1, + <vscale x 4 x i32> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv8f32.nxv8i32( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i32>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv8f32_nxv8f32_nxv8i32(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8f32_nxv8f32_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv8f32.nxv8i32( + <vscale x 8 x float> %0, + ptr %1, + <vscale x 8 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8f32.nxv8i32( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i32>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv8f32_nxv8f32_nxv8i32(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8f32_nxv8f32_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv8f32.nxv8i32( + <vscale x 8 x float> %0, + ptr %1, + <vscale x 8 x i32> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv16f32.nxv16i32( + <vscale x 16 x float>, + ptr, + <vscale x 16 x i32>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv16f32_nxv16f32_nxv16i32(<vscale x 16 x float> %0, ptr %1, <vscale x 16 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv16f32_nxv16f32_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv16f32.nxv16i32( + <vscale x 16 x float> %0, + ptr %1, + <vscale x 16 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv16f32.nxv16i32( + <vscale x 16 x float>, + ptr, + <vscale x 16 x i32>, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv16f32_nxv16f32_nxv16i32(<vscale x 16 x float> %0, ptr %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv16f32_nxv16f32_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv16f32.nxv16i32( + <vscale x 16 x float> %0, + ptr %1, + <vscale x 16 x i32> %2, + <vscale x 16 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv1f64.nxv1i32( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i32>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv1f64_nxv1f64_nxv1i32(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1f64_nxv1f64_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v9 +; 
CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv1f64.nxv1i32( + <vscale x 1 x double> %0, + ptr %1, + <vscale x 1 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv1f64.nxv1i32( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i32>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv1f64_nxv1f64_nxv1i32(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1f64_nxv1f64_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv1f64.nxv1i32( + <vscale x 1 x double> %0, + ptr %1, + <vscale x 1 x i32> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv2f64.nxv2i32( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i32>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv2f64_nxv2f64_nxv2i32(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2f64_nxv2f64_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv2f64.nxv2i32( + <vscale x 2 x double> %0, + ptr %1, + <vscale x 2 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2f64.nxv2i32( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i32>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv2f64_nxv2f64_nxv2i32(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2f64_nxv2f64_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv2f64.nxv2i32( + <vscale x 2 x double> %0, + ptr %1, + <vscale x 2 x i32> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv4f64.nxv4i32( + <vscale x 4 x double>, + ptr, + <vscale x 4 x i32>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv4f64_nxv4f64_nxv4i32(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4f64_nxv4f64_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv4f64.nxv4i32( + <vscale x 4 x double> %0, + ptr %1, + <vscale x 4 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4f64.nxv4i32( + <vscale x 4 x double>, + ptr, + <vscale x 4 x i32>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv4f64_nxv4f64_nxv4i32(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4f64_nxv4f64_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv4f64.nxv4i32( + <vscale x 4 x double> %0, + ptr %1, + <vscale x 4 x i32> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv8f64.nxv8i32( + <vscale x 8 x double>, + ptr, + <vscale x 8 x i32>, + iXLen); + +define void 
@intrinsic_vsuxei_v_nxv8f64_nxv8f64_nxv8i32(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8f64_nxv8f64_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv8f64.nxv8i32( + <vscale x 8 x double> %0, + ptr %1, + <vscale x 8 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8f64.nxv8i32( + <vscale x 8 x double>, + ptr, + <vscale x 8 x i32>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv8f64_nxv8f64_nxv8i32(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8f64_nxv8f64_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv8f64.nxv8i32( + <vscale x 8 x double> %0, + ptr %1, + <vscale x 8 x i32> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv1i8.nxv1i16( + <vscale x 1 x i8>, + ptr, + <vscale x 1 x i16>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv1i8_nxv1i8_nxv1i16(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1i8_nxv1i8_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv1i8.nxv1i16( + <vscale x 1 x i8> %0, + ptr %1, + <vscale x 1 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv1i8.nxv1i16( + <vscale x 1 x i8>, + ptr, + <vscale x 1 x i16>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv1i8_nxv1i8_nxv1i16(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1i8_nxv1i8_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv1i8.nxv1i16( + <vscale x 1 x i8> %0, + ptr %1, + <vscale x 1 x i16> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv2i8.nxv2i16( + <vscale x 2 x i8>, + ptr, + <vscale x 2 x i16>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv2i8_nxv2i8_nxv2i16(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2i8_nxv2i8_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv2i8.nxv2i16( + <vscale x 2 x i8> %0, + ptr %1, + <vscale x 2 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2i8.nxv2i16( + <vscale x 2 x i8>, + ptr, + <vscale x 2 x i16>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv2i8_nxv2i8_nxv2i16(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2i8_nxv2i8_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv2i8.nxv2i16( + <vscale x 2 x 
i8> %0, + ptr %1, + <vscale x 2 x i16> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv4i8.nxv4i16( + <vscale x 4 x i8>, + ptr, + <vscale x 4 x i16>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv4i8_nxv4i8_nxv4i16(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4i8_nxv4i8_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv4i8.nxv4i16( + <vscale x 4 x i8> %0, + ptr %1, + <vscale x 4 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4i8.nxv4i16( + <vscale x 4 x i8>, + ptr, + <vscale x 4 x i16>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv4i8_nxv4i8_nxv4i16(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4i8_nxv4i8_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv4i8.nxv4i16( + <vscale x 4 x i8> %0, + ptr %1, + <vscale x 4 x i16> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv8i8.nxv8i16( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i16>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv8i8_nxv8i8_nxv8i16(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8i8_nxv8i8_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv8i8.nxv8i16( + <vscale x 8 x i8> %0, + ptr %1, + <vscale x 8 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8i8.nxv8i16( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i16>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv8i8_nxv8i8_nxv8i16(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8i8_nxv8i8_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv8i8.nxv8i16( + <vscale x 8 x i8> %0, + ptr %1, + <vscale x 8 x i16> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv16i8.nxv16i16( + <vscale x 16 x i8>, + ptr, + <vscale x 16 x i16>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv16i8_nxv16i8_nxv16i16(<vscale x 16 x i8> %0, ptr %1, <vscale x 16 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv16i8_nxv16i8_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv16i8.nxv16i16( + <vscale x 16 x i8> %0, + ptr %1, + <vscale x 16 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv16i8.nxv16i16( + <vscale x 16 x i8>, + ptr, + <vscale x 16 x i16>, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv16i8_nxv16i8_nxv16i16(<vscale x 16 x i8> %0, ptr %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv16i8_nxv16i8_nxv16i16: 
+; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv16i8.nxv16i16( + <vscale x 16 x i8> %0, + ptr %1, + <vscale x 16 x i16> %2, + <vscale x 16 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv32i8.nxv32i16( + <vscale x 32 x i8>, + ptr, + <vscale x 32 x i16>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv32i8_nxv32i8_nxv32i16(<vscale x 32 x i8> %0, ptr %1, <vscale x 32 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv32i8_nxv32i8_nxv32i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv32i8.nxv32i16( + <vscale x 32 x i8> %0, + ptr %1, + <vscale x 32 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv32i8.nxv32i16( + <vscale x 32 x i8>, + ptr, + <vscale x 32 x i16>, + <vscale x 32 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv32i8_nxv32i8_nxv32i16(<vscale x 32 x i8> %0, ptr %1, <vscale x 32 x i16> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv32i8_nxv32i8_nxv32i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv32i8.nxv32i16( + <vscale x 32 x i8> %0, + ptr %1, + <vscale x 32 x i16> %2, + <vscale x 32 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv1i16.nxv1i16( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i16>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv1i16_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1i16_nxv1i16_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv1i16.nxv1i16( + <vscale x 1 x i16> %0, + ptr %1, + <vscale x 1 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv1i16.nxv1i16( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i16>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv1i16_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1i16_nxv1i16_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv1i16.nxv1i16( + <vscale x 1 x i16> %0, + ptr %1, + <vscale x 1 x i16> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv2i16.nxv2i16( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i16>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv2i16_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2i16_nxv2i16_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv2i16.nxv2i16( + <vscale x 2 x i16> %0, + ptr %1, + <vscale x 2 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2i16.nxv2i16( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x 
i16>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv2i16_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2i16_nxv2i16_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv2i16.nxv2i16( + <vscale x 2 x i16> %0, + ptr %1, + <vscale x 2 x i16> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv4i16.nxv4i16( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i16>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv4i16_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4i16_nxv4i16_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv4i16.nxv4i16( + <vscale x 4 x i16> %0, + ptr %1, + <vscale x 4 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4i16.nxv4i16( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i16>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv4i16_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4i16_nxv4i16_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv4i16.nxv4i16( + <vscale x 4 x i16> %0, + ptr %1, + <vscale x 4 x i16> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv8i16.nxv8i16( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i16>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv8i16_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8i16_nxv8i16_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv8i16.nxv8i16( + <vscale x 8 x i16> %0, + ptr %1, + <vscale x 8 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8i16.nxv8i16( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i16>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv8i16_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8i16_nxv8i16_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv8i16.nxv8i16( + <vscale x 8 x i16> %0, + ptr %1, + <vscale x 8 x i16> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv16i16.nxv16i16( + <vscale x 16 x i16>, + ptr, + <vscale x 16 x i16>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv16i16_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, ptr %1, <vscale x 16 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv16i16_nxv16i16_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v12 +; 
CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv16i16.nxv16i16( + <vscale x 16 x i16> %0, + ptr %1, + <vscale x 16 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv16i16.nxv16i16( + <vscale x 16 x i16>, + ptr, + <vscale x 16 x i16>, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv16i16_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, ptr %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv16i16_nxv16i16_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv16i16.nxv16i16( + <vscale x 16 x i16> %0, + ptr %1, + <vscale x 16 x i16> %2, + <vscale x 16 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv32i16.nxv32i16( + <vscale x 32 x i16>, + ptr, + <vscale x 32 x i16>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv32i16_nxv32i16_nxv32i16(<vscale x 32 x i16> %0, ptr %1, <vscale x 32 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv32i16_nxv32i16_nxv32i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv32i16.nxv32i16( + <vscale x 32 x i16> %0, + ptr %1, + <vscale x 32 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv32i16.nxv32i16( + <vscale x 32 x i16>, + ptr, + <vscale x 32 x i16>, + <vscale x 32 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv32i16_nxv32i16_nxv32i16(<vscale x 32 x i16> %0, ptr %1, <vscale x 32 x i16> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv32i16_nxv32i16_nxv32i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv32i16.nxv32i16( + <vscale x 32 x i16> %0, + ptr %1, + <vscale x 32 x i16> %2, + <vscale x 32 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv1i32.nxv1i16( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i16>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv1i32_nxv1i32_nxv1i16(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1i32_nxv1i32_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv1i32.nxv1i16( + <vscale x 1 x i32> %0, + ptr %1, + <vscale x 1 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv1i32.nxv1i16( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i16>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv1i32_nxv1i32_nxv1i16(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1i32_nxv1i32_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv1i32.nxv1i16( + <vscale x 1 x i32> %0, + ptr %1, + <vscale x 1 x i16> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv2i32.nxv2i16( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i16>, + iXLen); + +define 
void @intrinsic_vsuxei_v_nxv2i32_nxv2i32_nxv2i16(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2i32_nxv2i32_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv2i32.nxv2i16( + <vscale x 2 x i32> %0, + ptr %1, + <vscale x 2 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2i32.nxv2i16( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i16>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv2i32_nxv2i32_nxv2i16(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2i32_nxv2i32_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv2i32.nxv2i16( + <vscale x 2 x i32> %0, + ptr %1, + <vscale x 2 x i16> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv4i32.nxv4i16( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i16>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv4i32_nxv4i32_nxv4i16(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4i32_nxv4i32_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv4i32.nxv4i16( + <vscale x 4 x i32> %0, + ptr %1, + <vscale x 4 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4i32.nxv4i16( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i16>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv4i32_nxv4i32_nxv4i16(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4i32_nxv4i32_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv4i32.nxv4i16( + <vscale x 4 x i32> %0, + ptr %1, + <vscale x 4 x i16> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv8i32.nxv8i16( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i16>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv8i32_nxv8i32_nxv8i16(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8i32_nxv8i32_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv8i32.nxv8i16( + <vscale x 8 x i32> %0, + ptr %1, + <vscale x 8 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8i32.nxv8i16( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i16>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv8i32_nxv8i32_nxv8i16(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8i32_nxv8i32_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void 
@llvm.riscv.vsuxei.mask.nxv8i32.nxv8i16( + <vscale x 8 x i32> %0, + ptr %1, + <vscale x 8 x i16> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv16i32.nxv16i16( + <vscale x 16 x i32>, + ptr, + <vscale x 16 x i16>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv16i32_nxv16i32_nxv16i16(<vscale x 16 x i32> %0, ptr %1, <vscale x 16 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv16i32_nxv16i32_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv16i32.nxv16i16( + <vscale x 16 x i32> %0, + ptr %1, + <vscale x 16 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv16i32.nxv16i16( + <vscale x 16 x i32>, + ptr, + <vscale x 16 x i16>, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv16i32_nxv16i32_nxv16i16(<vscale x 16 x i32> %0, ptr %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv16i32_nxv16i32_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv16i32.nxv16i16( + <vscale x 16 x i32> %0, + ptr %1, + <vscale x 16 x i16> %2, + <vscale x 16 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv1i64.nxv1i16( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i16>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv1i64_nxv1i64_nxv1i16(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1i64_nxv1i64_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv1i64.nxv1i16( + <vscale x 1 x i64> %0, + ptr %1, + <vscale x 1 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv1i64.nxv1i16( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i16>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv1i64_nxv1i64_nxv1i16(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1i64_nxv1i64_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv1i64.nxv1i16( + <vscale x 1 x i64> %0, + ptr %1, + <vscale x 1 x i16> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv2i64.nxv2i16( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i16>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv2i64_nxv2i64_nxv2i16(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2i64_nxv2i64_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv2i64.nxv2i16( + <vscale x 2 x i64> %0, + ptr %1, + <vscale x 2 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2i64.nxv2i16( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i16>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv2i64_nxv2i64_nxv2i16(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x 
i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2i64_nxv2i64_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv2i64.nxv2i16( + <vscale x 2 x i64> %0, + ptr %1, + <vscale x 2 x i16> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv4i64.nxv4i16( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i16>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv4i64_nxv4i64_nxv4i16(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4i64_nxv4i64_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv4i64.nxv4i16( + <vscale x 4 x i64> %0, + ptr %1, + <vscale x 4 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4i64.nxv4i16( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i16>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv4i64_nxv4i64_nxv4i16(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4i64_nxv4i64_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv4i64.nxv4i16( + <vscale x 4 x i64> %0, + ptr %1, + <vscale x 4 x i16> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv8i64.nxv8i16( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i16>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv8i64_nxv8i64_nxv8i16(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8i64_nxv8i64_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv8i64.nxv8i16( + <vscale x 8 x i64> %0, + ptr %1, + <vscale x 8 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8i64.nxv8i16( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i16>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv8i64_nxv8i64_nxv8i16(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8i64_nxv8i64_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv8i64.nxv8i16( + <vscale x 8 x i64> %0, + ptr %1, + <vscale x 8 x i16> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv1f16.nxv1i16( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i16>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv1f16_nxv1f16_nxv1i16(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1f16_nxv1f16_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv1f16.nxv1i16( + <vscale x 1 x half> %0, + ptr %1, + <vscale x 1 x i16> %2, + iXLen %3) + + ret void 
+} + +declare void @llvm.riscv.vsuxei.mask.nxv1f16.nxv1i16( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i16>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv1f16_nxv1f16_nxv1i16(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1f16_nxv1f16_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv1f16.nxv1i16( + <vscale x 1 x half> %0, + ptr %1, + <vscale x 1 x i16> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv2f16.nxv2i16( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i16>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv2f16_nxv2f16_nxv2i16(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2f16_nxv2f16_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv2f16.nxv2i16( + <vscale x 2 x half> %0, + ptr %1, + <vscale x 2 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2f16.nxv2i16( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i16>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv2f16_nxv2f16_nxv2i16(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2f16_nxv2f16_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv2f16.nxv2i16( + <vscale x 2 x half> %0, + ptr %1, + <vscale x 2 x i16> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv4f16.nxv4i16( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i16>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv4f16_nxv4f16_nxv4i16(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4f16_nxv4f16_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv4f16.nxv4i16( + <vscale x 4 x half> %0, + ptr %1, + <vscale x 4 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4f16.nxv4i16( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i16>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv4f16_nxv4f16_nxv4i16(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4f16_nxv4f16_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv4f16.nxv4i16( + <vscale x 4 x half> %0, + ptr %1, + <vscale x 4 x i16> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv8f16.nxv8i16( + <vscale x 8 x half>, + ptr, + <vscale x 8 x i16>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv8f16_nxv8f16_nxv8i16(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8f16_nxv8f16_nxv8i16: +; CHECK: # 
%bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv8f16.nxv8i16( + <vscale x 8 x half> %0, + ptr %1, + <vscale x 8 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8f16.nxv8i16( + <vscale x 8 x half>, + ptr, + <vscale x 8 x i16>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv8f16_nxv8f16_nxv8i16(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8f16_nxv8f16_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv8f16.nxv8i16( + <vscale x 8 x half> %0, + ptr %1, + <vscale x 8 x i16> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv16f16.nxv16i16( + <vscale x 16 x half>, + ptr, + <vscale x 16 x i16>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv16f16_nxv16f16_nxv16i16(<vscale x 16 x half> %0, ptr %1, <vscale x 16 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv16f16_nxv16f16_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv16f16.nxv16i16( + <vscale x 16 x half> %0, + ptr %1, + <vscale x 16 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv16f16.nxv16i16( + <vscale x 16 x half>, + ptr, + <vscale x 16 x i16>, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv16f16_nxv16f16_nxv16i16(<vscale x 16 x half> %0, ptr %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv16f16_nxv16f16_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv16f16.nxv16i16( + <vscale x 16 x half> %0, + ptr %1, + <vscale x 16 x i16> %2, + <vscale x 16 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv32f16.nxv32i16( + <vscale x 32 x half>, + ptr, + <vscale x 32 x i16>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv32f16_nxv32f16_nxv32i16(<vscale x 32 x half> %0, ptr %1, <vscale x 32 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv32f16_nxv32f16_nxv32i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv32f16.nxv32i16( + <vscale x 32 x half> %0, + ptr %1, + <vscale x 32 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv32f16.nxv32i16( + <vscale x 32 x half>, + ptr, + <vscale x 32 x i16>, + <vscale x 32 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv32f16_nxv32f16_nxv32i16(<vscale x 32 x half> %0, ptr %1, <vscale x 32 x i16> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv32f16_nxv32f16_nxv32i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv32f16.nxv32i16( + <vscale x 32 x half> %0, + ptr %1, + <vscale x 32 x i16> %2, + <vscale x 32 x i1> %3, + iXLen %4) + 
+ ret void +} + +declare void @llvm.riscv.vsuxei.nxv1f32.nxv1i16( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i16>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv1f32_nxv1f32_nxv1i16(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1f32_nxv1f32_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv1f32.nxv1i16( + <vscale x 1 x float> %0, + ptr %1, + <vscale x 1 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv1f32.nxv1i16( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i16>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv1f32_nxv1f32_nxv1i16(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1f32_nxv1f32_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv1f32.nxv1i16( + <vscale x 1 x float> %0, + ptr %1, + <vscale x 1 x i16> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv2f32.nxv2i16( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i16>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv2f32_nxv2f32_nxv2i16(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2f32_nxv2f32_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv2f32.nxv2i16( + <vscale x 2 x float> %0, + ptr %1, + <vscale x 2 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2f32.nxv2i16( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i16>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv2f32_nxv2f32_nxv2i16(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2f32_nxv2f32_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv2f32.nxv2i16( + <vscale x 2 x float> %0, + ptr %1, + <vscale x 2 x i16> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv4f32.nxv4i16( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i16>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv4f32_nxv4f32_nxv4i16(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4f32_nxv4f32_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv4f32.nxv4i16( + <vscale x 4 x float> %0, + ptr %1, + <vscale x 4 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4f32.nxv4i16( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i16>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv4f32_nxv4f32_nxv4i16(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4f32_nxv4f32_nxv4i16: +; CHECK: # %bb.0: # 
%entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv4f32.nxv4i16( + <vscale x 4 x float> %0, + ptr %1, + <vscale x 4 x i16> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv8f32.nxv8i16( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i16>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv8f32_nxv8f32_nxv8i16(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8f32_nxv8f32_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv8f32.nxv8i16( + <vscale x 8 x float> %0, + ptr %1, + <vscale x 8 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8f32.nxv8i16( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i16>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv8f32_nxv8f32_nxv8i16(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8f32_nxv8f32_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv8f32.nxv8i16( + <vscale x 8 x float> %0, + ptr %1, + <vscale x 8 x i16> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv16f32.nxv16i16( + <vscale x 16 x float>, + ptr, + <vscale x 16 x i16>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv16f32_nxv16f32_nxv16i16(<vscale x 16 x float> %0, ptr %1, <vscale x 16 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv16f32_nxv16f32_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv16f32.nxv16i16( + <vscale x 16 x float> %0, + ptr %1, + <vscale x 16 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv16f32.nxv16i16( + <vscale x 16 x float>, + ptr, + <vscale x 16 x i16>, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv16f32_nxv16f32_nxv16i16(<vscale x 16 x float> %0, ptr %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv16f32_nxv16f32_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv16f32.nxv16i16( + <vscale x 16 x float> %0, + ptr %1, + <vscale x 16 x i16> %2, + <vscale x 16 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv1f64.nxv1i16( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i16>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv1f64_nxv1f64_nxv1i16(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1f64_nxv1f64_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv1f64.nxv1i16( + <vscale x 1 x double> %0, + ptr %1, + <vscale x 1 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv1f64.nxv1i16( + <vscale x 1 
x double>, + ptr, + <vscale x 1 x i16>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv1f64_nxv1f64_nxv1i16(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1f64_nxv1f64_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv1f64.nxv1i16( + <vscale x 1 x double> %0, + ptr %1, + <vscale x 1 x i16> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv2f64.nxv2i16( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i16>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv2f64_nxv2f64_nxv2i16(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2f64_nxv2f64_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv2f64.nxv2i16( + <vscale x 2 x double> %0, + ptr %1, + <vscale x 2 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2f64.nxv2i16( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i16>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv2f64_nxv2f64_nxv2i16(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2f64_nxv2f64_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv2f64.nxv2i16( + <vscale x 2 x double> %0, + ptr %1, + <vscale x 2 x i16> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv4f64.nxv4i16( + <vscale x 4 x double>, + ptr, + <vscale x 4 x i16>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv4f64_nxv4f64_nxv4i16(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4f64_nxv4f64_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv4f64.nxv4i16( + <vscale x 4 x double> %0, + ptr %1, + <vscale x 4 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4f64.nxv4i16( + <vscale x 4 x double>, + ptr, + <vscale x 4 x i16>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv4f64_nxv4f64_nxv4i16(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4f64_nxv4f64_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv4f64.nxv4i16( + <vscale x 4 x double> %0, + ptr %1, + <vscale x 4 x i16> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv8f64.nxv8i16( + <vscale x 8 x double>, + ptr, + <vscale x 8 x i16>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv8f64_nxv8f64_nxv8i16(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8f64_nxv8f64_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli 
zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv8f64.nxv8i16( + <vscale x 8 x double> %0, + ptr %1, + <vscale x 8 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8f64.nxv8i16( + <vscale x 8 x double>, + ptr, + <vscale x 8 x i16>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv8f64_nxv8f64_nxv8i16(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8f64_nxv8f64_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv8f64.nxv8i16( + <vscale x 8 x double> %0, + ptr %1, + <vscale x 8 x i16> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv1i8.nxv1i8( + <vscale x 1 x i8>, + ptr, + <vscale x 1 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv1i8_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1i8_nxv1i8_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv1i8.nxv1i8( + <vscale x 1 x i8> %0, + ptr %1, + <vscale x 1 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv1i8.nxv1i8( + <vscale x 1 x i8>, + ptr, + <vscale x 1 x i8>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv1i8_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1i8_nxv1i8_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv1i8.nxv1i8( + <vscale x 1 x i8> %0, + ptr %1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv2i8.nxv2i8( + <vscale x 2 x i8>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv2i8_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2i8_nxv2i8_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv2i8.nxv2i8( + <vscale x 2 x i8> %0, + ptr %1, + <vscale x 2 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2i8.nxv2i8( + <vscale x 2 x i8>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv2i8_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2i8_nxv2i8_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv2i8.nxv2i8( + <vscale x 2 x i8> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv4i8.nxv4i8( + <vscale x 4 x i8>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define void 
@intrinsic_vsuxei_v_nxv4i8_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4i8_nxv4i8_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv4i8.nxv4i8( + <vscale x 4 x i8> %0, + ptr %1, + <vscale x 4 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4i8.nxv4i8( + <vscale x 4 x i8>, + ptr, + <vscale x 4 x i8>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv4i8_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4i8_nxv4i8_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv4i8.nxv4i8( + <vscale x 4 x i8> %0, + ptr %1, + <vscale x 4 x i8> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv8i8.nxv8i8( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv8i8_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8i8_nxv8i8_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv8i8.nxv8i8( + <vscale x 8 x i8> %0, + ptr %1, + <vscale x 8 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8i8.nxv8i8( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i8>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv8i8_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8i8_nxv8i8_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv8i8.nxv8i8( + <vscale x 8 x i8> %0, + ptr %1, + <vscale x 8 x i8> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv16i8.nxv16i8( + <vscale x 16 x i8>, + ptr, + <vscale x 16 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv16i8_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, ptr %1, <vscale x 16 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv16i8_nxv16i8_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv16i8.nxv16i8( + <vscale x 16 x i8> %0, + ptr %1, + <vscale x 16 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv16i8.nxv16i8( + <vscale x 16 x i8>, + ptr, + <vscale x 16 x i8>, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv16i8_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, ptr %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv16i8_nxv16i8_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv16i8.nxv16i8( + <vscale x 16 x i8> %0, + ptr %1, + <vscale x 16 x i8> %2, + 
<vscale x 16 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv32i8.nxv32i8( + <vscale x 32 x i8>, + ptr, + <vscale x 32 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv32i8_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, ptr %1, <vscale x 32 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv32i8_nxv32i8_nxv32i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv32i8.nxv32i8( + <vscale x 32 x i8> %0, + ptr %1, + <vscale x 32 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv32i8.nxv32i8( + <vscale x 32 x i8>, + ptr, + <vscale x 32 x i8>, + <vscale x 32 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv32i8_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, ptr %1, <vscale x 32 x i8> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv32i8_nxv32i8_nxv32i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv32i8.nxv32i8( + <vscale x 32 x i8> %0, + ptr %1, + <vscale x 32 x i8> %2, + <vscale x 32 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv64i8.nxv64i8( + <vscale x 64 x i8>, + ptr, + <vscale x 64 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv64i8_nxv64i8_nxv64i8(<vscale x 64 x i8> %0, ptr %1, <vscale x 64 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv64i8_nxv64i8_nxv64i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv64i8.nxv64i8( + <vscale x 64 x i8> %0, + ptr %1, + <vscale x 64 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv64i8.nxv64i8( + <vscale x 64 x i8>, + ptr, + <vscale x 64 x i8>, + <vscale x 64 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv64i8_nxv64i8_nxv64i8(<vscale x 64 x i8> %0, ptr %1, <vscale x 64 x i8> %2, <vscale x 64 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv64i8_nxv64i8_nxv64i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv64i8.nxv64i8( + <vscale x 64 x i8> %0, + ptr %1, + <vscale x 64 x i8> %2, + <vscale x 64 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv1i16.nxv1i8( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv1i16_nxv1i16_nxv1i8(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1i16_nxv1i16_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv1i16.nxv1i8( + <vscale x 1 x i16> %0, + ptr %1, + <vscale x 1 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv1i16.nxv1i8( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i8>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv1i16_nxv1i16_nxv1i8(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1i16_nxv1i16_nxv1i8: +; CHECK: # %bb.0: # %entry +; 
CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv1i16.nxv1i8( + <vscale x 1 x i16> %0, + ptr %1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv2i16.nxv2i8( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv2i16_nxv2i16_nxv2i8(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2i16_nxv2i16_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv2i16.nxv2i8( + <vscale x 2 x i16> %0, + ptr %1, + <vscale x 2 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2i16.nxv2i8( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv2i16_nxv2i16_nxv2i8(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2i16_nxv2i16_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv2i16.nxv2i8( + <vscale x 2 x i16> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv4i16.nxv4i8( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv4i16_nxv4i16_nxv4i8(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4i16_nxv4i16_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv4i16.nxv4i8( + <vscale x 4 x i16> %0, + ptr %1, + <vscale x 4 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4i16.nxv4i8( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i8>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv4i16_nxv4i16_nxv4i8(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4i16_nxv4i16_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv4i16.nxv4i8( + <vscale x 4 x i16> %0, + ptr %1, + <vscale x 4 x i8> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv8i16.nxv8i8( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv8i16_nxv8i16_nxv8i8(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8i16_nxv8i16_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv8i16.nxv8i8( + <vscale x 8 x i16> %0, + ptr %1, + <vscale x 8 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8i16.nxv8i8( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i8>, + <vscale x 8 x i1>, + iXLen); + +define void 
@intrinsic_vsuxei_mask_v_nxv8i16_nxv8i16_nxv8i8(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8i16_nxv8i16_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv8i16.nxv8i8( + <vscale x 8 x i16> %0, + ptr %1, + <vscale x 8 x i8> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv16i16.nxv16i8( + <vscale x 16 x i16>, + ptr, + <vscale x 16 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv16i16_nxv16i16_nxv16i8(<vscale x 16 x i16> %0, ptr %1, <vscale x 16 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv16i16_nxv16i16_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv16i16.nxv16i8( + <vscale x 16 x i16> %0, + ptr %1, + <vscale x 16 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv16i16.nxv16i8( + <vscale x 16 x i16>, + ptr, + <vscale x 16 x i8>, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv16i16_nxv16i16_nxv16i8(<vscale x 16 x i16> %0, ptr %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv16i16_nxv16i16_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv16i16.nxv16i8( + <vscale x 16 x i16> %0, + ptr %1, + <vscale x 16 x i8> %2, + <vscale x 16 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv32i16.nxv32i8( + <vscale x 32 x i16>, + ptr, + <vscale x 32 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv32i16_nxv32i16_nxv32i8(<vscale x 32 x i16> %0, ptr %1, <vscale x 32 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv32i16_nxv32i16_nxv32i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv32i16.nxv32i8( + <vscale x 32 x i16> %0, + ptr %1, + <vscale x 32 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv32i16.nxv32i8( + <vscale x 32 x i16>, + ptr, + <vscale x 32 x i8>, + <vscale x 32 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv32i16_nxv32i16_nxv32i8(<vscale x 32 x i16> %0, ptr %1, <vscale x 32 x i8> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv32i16_nxv32i16_nxv32i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv32i16.nxv32i8( + <vscale x 32 x i16> %0, + ptr %1, + <vscale x 32 x i8> %2, + <vscale x 32 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv1i32.nxv1i8( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv1i32_nxv1i32_nxv1i8(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1i32_nxv1i32_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void 
@llvm.riscv.vsuxei.nxv1i32.nxv1i8( + <vscale x 1 x i32> %0, + ptr %1, + <vscale x 1 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv1i32.nxv1i8( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i8>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv1i32_nxv1i32_nxv1i8(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1i32_nxv1i32_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv1i32.nxv1i8( + <vscale x 1 x i32> %0, + ptr %1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv2i32.nxv2i8( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv2i32_nxv2i32_nxv2i8(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2i32_nxv2i32_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv2i32.nxv2i8( + <vscale x 2 x i32> %0, + ptr %1, + <vscale x 2 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2i32.nxv2i8( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv2i32_nxv2i32_nxv2i8(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2i32_nxv2i32_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv2i32.nxv2i8( + <vscale x 2 x i32> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv4i32.nxv4i8( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv4i32_nxv4i32_nxv4i8(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4i32_nxv4i32_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv4i32.nxv4i8( + <vscale x 4 x i32> %0, + ptr %1, + <vscale x 4 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4i32.nxv4i8( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i8>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv4i32_nxv4i32_nxv4i8(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4i32_nxv4i32_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv4i32.nxv4i8( + <vscale x 4 x i32> %0, + ptr %1, + <vscale x 4 x i8> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv8i32.nxv8i8( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv8i32_nxv8i32_nxv8i8(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: 
intrinsic_vsuxei_v_nxv8i32_nxv8i32_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv8i32.nxv8i8( + <vscale x 8 x i32> %0, + ptr %1, + <vscale x 8 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8i32.nxv8i8( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i8>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv8i32_nxv8i32_nxv8i8(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8i32_nxv8i32_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv8i32.nxv8i8( + <vscale x 8 x i32> %0, + ptr %1, + <vscale x 8 x i8> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv16i32.nxv16i8( + <vscale x 16 x i32>, + ptr, + <vscale x 16 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv16i32_nxv16i32_nxv16i8(<vscale x 16 x i32> %0, ptr %1, <vscale x 16 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv16i32_nxv16i32_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv16i32.nxv16i8( + <vscale x 16 x i32> %0, + ptr %1, + <vscale x 16 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv16i32.nxv16i8( + <vscale x 16 x i32>, + ptr, + <vscale x 16 x i8>, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv16i32_nxv16i32_nxv16i8(<vscale x 16 x i32> %0, ptr %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv16i32_nxv16i32_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv16i32.nxv16i8( + <vscale x 16 x i32> %0, + ptr %1, + <vscale x 16 x i8> %2, + <vscale x 16 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv1i64.nxv1i8( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv1i64_nxv1i64_nxv1i8(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1i64_nxv1i64_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv1i64.nxv1i8( + <vscale x 1 x i64> %0, + ptr %1, + <vscale x 1 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv1i64.nxv1i8( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i8>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv1i64_nxv1i64_nxv1i8(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1i64_nxv1i64_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv1i64.nxv1i8( + <vscale x 1 x i64> %0, + ptr %1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void 
@llvm.riscv.vsuxei.nxv2i64.nxv2i8( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv2i64_nxv2i64_nxv2i8(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2i64_nxv2i64_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv2i64.nxv2i8( + <vscale x 2 x i64> %0, + ptr %1, + <vscale x 2 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2i64.nxv2i8( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv2i64_nxv2i64_nxv2i8(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2i64_nxv2i64_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv2i64.nxv2i8( + <vscale x 2 x i64> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv4i64.nxv4i8( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv4i64_nxv4i64_nxv4i8(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4i64_nxv4i64_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv4i64.nxv4i8( + <vscale x 4 x i64> %0, + ptr %1, + <vscale x 4 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4i64.nxv4i8( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i8>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv4i64_nxv4i64_nxv4i8(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4i64_nxv4i64_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv4i64.nxv4i8( + <vscale x 4 x i64> %0, + ptr %1, + <vscale x 4 x i8> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv8i64.nxv8i8( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv8i64_nxv8i64_nxv8i8(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8i64_nxv8i64_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv8i64.nxv8i8( + <vscale x 8 x i64> %0, + ptr %1, + <vscale x 8 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8i64.nxv8i8( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i8>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv8i64_nxv8i64_nxv8i8(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8i64_nxv8i64_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v16, v0.t +; 
CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv8i64.nxv8i8( + <vscale x 8 x i64> %0, + ptr %1, + <vscale x 8 x i8> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv1f16.nxv1i8( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv1f16_nxv1f16_nxv1i8(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1f16_nxv1f16_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv1f16.nxv1i8( + <vscale x 1 x half> %0, + ptr %1, + <vscale x 1 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv1f16.nxv1i8( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i8>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv1f16_nxv1f16_nxv1i8(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1f16_nxv1f16_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv1f16.nxv1i8( + <vscale x 1 x half> %0, + ptr %1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv2f16.nxv2i8( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv2f16_nxv2f16_nxv2i8(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2f16_nxv2f16_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv2f16.nxv2i8( + <vscale x 2 x half> %0, + ptr %1, + <vscale x 2 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2f16.nxv2i8( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv2f16_nxv2f16_nxv2i8(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2f16_nxv2f16_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv2f16.nxv2i8( + <vscale x 2 x half> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv4f16.nxv4i8( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv4f16_nxv4f16_nxv4i8(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4f16_nxv4f16_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv4f16.nxv4i8( + <vscale x 4 x half> %0, + ptr %1, + <vscale x 4 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4f16.nxv4i8( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i8>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv4f16_nxv4f16_nxv4i8(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> 
%3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4f16_nxv4f16_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv4f16.nxv4i8( + <vscale x 4 x half> %0, + ptr %1, + <vscale x 4 x i8> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv8f16.nxv8i8( + <vscale x 8 x half>, + ptr, + <vscale x 8 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv8f16_nxv8f16_nxv8i8(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8f16_nxv8f16_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv8f16.nxv8i8( + <vscale x 8 x half> %0, + ptr %1, + <vscale x 8 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8f16.nxv8i8( + <vscale x 8 x half>, + ptr, + <vscale x 8 x i8>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv8f16_nxv8f16_nxv8i8(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8f16_nxv8f16_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv8f16.nxv8i8( + <vscale x 8 x half> %0, + ptr %1, + <vscale x 8 x i8> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv16f16.nxv16i8( + <vscale x 16 x half>, + ptr, + <vscale x 16 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv16f16_nxv16f16_nxv16i8(<vscale x 16 x half> %0, ptr %1, <vscale x 16 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv16f16_nxv16f16_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv16f16.nxv16i8( + <vscale x 16 x half> %0, + ptr %1, + <vscale x 16 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv16f16.nxv16i8( + <vscale x 16 x half>, + ptr, + <vscale x 16 x i8>, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv16f16_nxv16f16_nxv16i8(<vscale x 16 x half> %0, ptr %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv16f16_nxv16f16_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv16f16.nxv16i8( + <vscale x 16 x half> %0, + ptr %1, + <vscale x 16 x i8> %2, + <vscale x 16 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv32f16.nxv32i8( + <vscale x 32 x half>, + ptr, + <vscale x 32 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv32f16_nxv32f16_nxv32i8(<vscale x 32 x half> %0, ptr %1, <vscale x 32 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv32f16_nxv32f16_nxv32i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv32f16.nxv32i8( + <vscale x 32 x half> %0, + ptr %1, + <vscale x 32 x i8> %2, + iXLen %3) + + ret void +} + 
+declare void @llvm.riscv.vsuxei.mask.nxv32f16.nxv32i8( + <vscale x 32 x half>, + ptr, + <vscale x 32 x i8>, + <vscale x 32 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv32f16_nxv32f16_nxv32i8(<vscale x 32 x half> %0, ptr %1, <vscale x 32 x i8> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv32f16_nxv32f16_nxv32i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv32f16.nxv32i8( + <vscale x 32 x half> %0, + ptr %1, + <vscale x 32 x i8> %2, + <vscale x 32 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv1f32.nxv1i8( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv1f32_nxv1f32_nxv1i8(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1f32_nxv1f32_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv1f32.nxv1i8( + <vscale x 1 x float> %0, + ptr %1, + <vscale x 1 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv1f32.nxv1i8( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i8>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv1f32_nxv1f32_nxv1i8(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1f32_nxv1f32_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv1f32.nxv1i8( + <vscale x 1 x float> %0, + ptr %1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv2f32.nxv2i8( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv2f32_nxv2f32_nxv2i8(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2f32_nxv2f32_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv2f32.nxv2i8( + <vscale x 2 x float> %0, + ptr %1, + <vscale x 2 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2f32.nxv2i8( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv2f32_nxv2f32_nxv2i8(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2f32_nxv2f32_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv2f32.nxv2i8( + <vscale x 2 x float> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv4f32.nxv4i8( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv4f32_nxv4f32_nxv4i8(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4f32_nxv4f32_nxv4i8: +; CHECK: # %bb.0: # %entry +; 
CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv4f32.nxv4i8( + <vscale x 4 x float> %0, + ptr %1, + <vscale x 4 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4f32.nxv4i8( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i8>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv4f32_nxv4f32_nxv4i8(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4f32_nxv4f32_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv4f32.nxv4i8( + <vscale x 4 x float> %0, + ptr %1, + <vscale x 4 x i8> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv8f32.nxv8i8( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv8f32_nxv8f32_nxv8i8(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8f32_nxv8f32_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv8f32.nxv8i8( + <vscale x 8 x float> %0, + ptr %1, + <vscale x 8 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8f32.nxv8i8( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i8>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv8f32_nxv8f32_nxv8i8(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8f32_nxv8f32_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv8f32.nxv8i8( + <vscale x 8 x float> %0, + ptr %1, + <vscale x 8 x i8> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv16f32.nxv16i8( + <vscale x 16 x float>, + ptr, + <vscale x 16 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv16f32_nxv16f32_nxv16i8(<vscale x 16 x float> %0, ptr %1, <vscale x 16 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv16f32_nxv16f32_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv16f32.nxv16i8( + <vscale x 16 x float> %0, + ptr %1, + <vscale x 16 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv16f32.nxv16i8( + <vscale x 16 x float>, + ptr, + <vscale x 16 x i8>, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv16f32_nxv16f32_nxv16i8(<vscale x 16 x float> %0, ptr %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv16f32_nxv16f32_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv16f32.nxv16i8( + <vscale x 16 x float> %0, + ptr %1, + <vscale x 16 x i8> %2, + <vscale x 16 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv1f64.nxv1i8( + <vscale x 1 x 
double>, + ptr, + <vscale x 1 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv1f64_nxv1f64_nxv1i8(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1f64_nxv1f64_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv1f64.nxv1i8( + <vscale x 1 x double> %0, + ptr %1, + <vscale x 1 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv1f64.nxv1i8( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i8>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv1f64_nxv1f64_nxv1i8(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1f64_nxv1f64_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv1f64.nxv1i8( + <vscale x 1 x double> %0, + ptr %1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv2f64.nxv2i8( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv2f64_nxv2f64_nxv2i8(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2f64_nxv2f64_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv2f64.nxv2i8( + <vscale x 2 x double> %0, + ptr %1, + <vscale x 2 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2f64.nxv2i8( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv2f64_nxv2f64_nxv2i8(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2f64_nxv2f64_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv2f64.nxv2i8( + <vscale x 2 x double> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv4f64.nxv4i8( + <vscale x 4 x double>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv4f64_nxv4f64_nxv4i8(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4f64_nxv4f64_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv4f64.nxv4i8( + <vscale x 4 x double> %0, + ptr %1, + <vscale x 4 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4f64.nxv4i8( + <vscale x 4 x double>, + ptr, + <vscale x 4 x i8>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv4f64_nxv4f64_nxv4i8(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4f64_nxv4f64_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v12, v0.t +; 
CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv4f64.nxv4i8( + <vscale x 4 x double> %0, + ptr %1, + <vscale x 4 x i8> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv8f64.nxv8i8( + <vscale x 8 x double>, + ptr, + <vscale x 8 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv8f64_nxv8f64_nxv8i8(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8f64_nxv8f64_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv8f64.nxv8i8( + <vscale x 8 x double> %0, + ptr %1, + <vscale x 8 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8f64.nxv8i8( + <vscale x 8 x double>, + ptr, + <vscale x 8 x i8>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv8f64_nxv8f64_nxv8i8(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8f64_nxv8f64_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv8f64.nxv8i8( + <vscale x 8 x double> %0, + ptr %1, + <vscale x 8 x i8> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} From d470bfe0d46480ebf7f4a2e1dd37005894acb298 Mon Sep 17 00:00:00 2001 From: camc <69519329+camc@users.noreply.github.com> Date: Tue, 4 Nov 2025 03:12:27 +0000 Subject: [PATCH 116/313] [clang] Mark labels referenced when used in named break or continue (#166033) Fixes #166013 Marks labels that appear in a c2y named break or continue statement as referenced to fix false-positive unused diagnostics. --------- Co-authored-by: camc <pushy-crop-cartel@duck.com> --- clang/docs/ReleaseNotes.rst | 2 ++ clang/lib/Sema/SemaStmt.cpp | 3 +++ clang/test/Sema/labeled-break-continue.c | 18 +++++++++++++++--- 3 files changed, 20 insertions(+), 3 deletions(-) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 2c575cd1d4289..2e2c5198fa8f5 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -395,6 +395,8 @@ Improvements to Clang's diagnostics that were previously incorrectly accepted in case of other irrelevant conditions are now consistently diagnosed, identical to C++ mode. +- Fix false-positive unused label diagnostic when a label is used in a named break + or continue (#GH166013) - Clang now emits a diagnostic in case `vector_size` or `ext_vector_type` attributes are used with a negative size (#GH165463). 
diff --git a/clang/lib/Sema/SemaStmt.cpp b/clang/lib/Sema/SemaStmt.cpp index f39896336053e..5b3ef1adf38e3 100644 --- a/clang/lib/Sema/SemaStmt.cpp +++ b/clang/lib/Sema/SemaStmt.cpp @@ -3281,6 +3281,9 @@ static Scope *FindLabeledBreakContinueScope(Sema &S, Scope *CurScope, SourceLocation LabelLoc, bool IsContinue) { assert(Target && "not a named break/continue?"); + + Target->markUsed(S.Context); + Scope *Found = nullptr; for (Scope *Scope = CurScope; Scope; Scope = Scope->getParent()) { if (Scope->isFunctionScope()) diff --git a/clang/test/Sema/labeled-break-continue.c b/clang/test/Sema/labeled-break-continue.c index 78f81c484c3d5..6b4adc23dca8d 100644 --- a/clang/test/Sema/labeled-break-continue.c +++ b/clang/test/Sema/labeled-break-continue.c @@ -1,6 +1,6 @@ -// RUN: %clang_cc1 -std=c2y -verify -fsyntax-only -fblocks %s -// RUN: %clang_cc1 -std=c23 -verify -fsyntax-only -fblocks -fnamed-loops %s -// RUN: %clang_cc1 -x c++ -verify -fsyntax-only -fblocks -fnamed-loops %s +// RUN: %clang_cc1 -std=c2y -verify -Wunused -fsyntax-only -fblocks %s +// RUN: %clang_cc1 -std=c23 -verify -Wunused -fsyntax-only -fblocks -fnamed-loops %s +// RUN: %clang_cc1 -x c++ -verify -Wunused -fsyntax-only -fblocks -fnamed-loops %s void f1() { l1: while (true) { @@ -159,3 +159,15 @@ void f7() { continue d; // expected-error {{'continue' label does not name an enclosing loop}} } } + +void f8() { + l1: // no-warning + while (true) { + break l1; + } + + l2: // no-warning + while (true) { + continue l2; + } +} From 2b45efe92013e1372a2d49f550dd226ff351c594 Mon Sep 17 00:00:00 2001 From: choikwa <5455710+choikwa@users.noreply.github.com> Date: Mon, 3 Nov 2025 22:21:26 -0500 Subject: [PATCH 117/313] [AMDGPU] NFC, move testcase, only test output of promote-alloca with vector-combine (#166289) --- .../extract-insert-chain-to-shuffles.ll | 567 ++++++++++++++++++ 1 file changed, 567 insertions(+) create mode 100644 llvm/test/Transforms/VectorCombine/AMDGPU/extract-insert-chain-to-shuffles.ll diff --git a/llvm/test/Transforms/VectorCombine/AMDGPU/extract-insert-chain-to-shuffles.ll b/llvm/test/Transforms/VectorCombine/AMDGPU/extract-insert-chain-to-shuffles.ll new file mode 100644 index 0000000000000..4b551fad5b43a --- /dev/null +++ b/llvm/test/Transforms/VectorCombine/AMDGPU/extract-insert-chain-to-shuffles.ll @@ -0,0 +1,567 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; RUN: opt -S -mtriple=amdgcn-- -mcpu=gfx1100 -passes=vector-combine < %s | FileCheck -check-prefix=OPT %s + +; Generated from amdgpu-promote-alloca on array of vectors +; VectorCombiner should recognize chain of extract-insert vectors +; and turn them into one or two shuffles +define amdgpu_kernel void @extract_insert_chain_to_shuffles(<16 x i8> %in, <16 x i8> %add, ptr addrspace(3) %out) #0 { +; OPT-LABEL: define amdgpu_kernel void @extract_insert_chain_to_shuffles( +; OPT-SAME: <16 x i8> [[IN:%.*]], <16 x i8> [[ADD:%.*]], ptr addrspace(3) [[OUT:%.*]]) #[[ATTR0:[0-9]+]] { +; OPT-NEXT: [[ENTRY:.*:]] +; OPT-NEXT: [[ALLOCA:%.*]] = freeze <128 x i8> poison +; OPT-NEXT: [[TMP0:%.*]] = shufflevector <16 x i8> [[IN]], <16 x i8> poison, <128 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, 
i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison> +; OPT-NEXT: [[TMP1:%.*]] = shufflevector <128 x i8> [[ALLOCA]], <128 x i8> [[TMP0]], <128 x i32> <i32 128, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127> +; OPT-NEXT: [[TMP2:%.*]] = extractelement <16 x i8> [[IN]], i64 1 +; OPT-NEXT: [[TMP3:%.*]] = insertelement <128 x i8> [[TMP1]], i8 [[TMP2]], i32 1 +; OPT-NEXT: [[TMP4:%.*]] = extractelement <16 x i8> [[IN]], i64 2 +; OPT-NEXT: [[TMP5:%.*]] = insertelement <128 x i8> [[TMP3]], i8 [[TMP4]], i32 2 +; OPT-NEXT: [[TMP6:%.*]] = extractelement <16 x i8> [[IN]], i64 3 +; OPT-NEXT: [[TMP7:%.*]] = insertelement <128 x i8> [[TMP5]], i8 [[TMP6]], i32 3 +; OPT-NEXT: [[TMP8:%.*]] = extractelement <16 x i8> [[IN]], i64 4 +; OPT-NEXT: [[TMP9:%.*]] = insertelement <128 x i8> [[TMP7]], i8 [[TMP8]], i32 4 +; OPT-NEXT: [[TMP10:%.*]] = extractelement <16 x i8> [[IN]], i64 5 +; OPT-NEXT: [[TMP11:%.*]] = insertelement <128 x i8> [[TMP9]], i8 [[TMP10]], i32 5 +; OPT-NEXT: [[TMP12:%.*]] = extractelement <16 x i8> [[IN]], i64 6 +; OPT-NEXT: [[TMP13:%.*]] = insertelement <128 x i8> [[TMP11]], i8 [[TMP12]], i32 6 +; OPT-NEXT: [[TMP14:%.*]] = extractelement <16 x i8> [[IN]], i64 7 +; OPT-NEXT: [[TMP15:%.*]] = insertelement <128 x i8> [[TMP13]], i8 [[TMP14]], i32 7 +; OPT-NEXT: [[TMP16:%.*]] = extractelement <16 x i8> [[IN]], i64 8 +; OPT-NEXT: [[TMP17:%.*]] = insertelement <128 x i8> [[TMP15]], i8 [[TMP16]], i32 8 +; OPT-NEXT: [[TMP18:%.*]] = 
extractelement <16 x i8> [[IN]], i64 9 +; OPT-NEXT: [[TMP19:%.*]] = insertelement <128 x i8> [[TMP17]], i8 [[TMP18]], i32 9 +; OPT-NEXT: [[TMP20:%.*]] = extractelement <16 x i8> [[IN]], i64 10 +; OPT-NEXT: [[TMP21:%.*]] = insertelement <128 x i8> [[TMP19]], i8 [[TMP20]], i32 10 +; OPT-NEXT: [[TMP22:%.*]] = extractelement <16 x i8> [[IN]], i64 11 +; OPT-NEXT: [[TMP23:%.*]] = insertelement <128 x i8> [[TMP21]], i8 [[TMP22]], i32 11 +; OPT-NEXT: [[TMP24:%.*]] = extractelement <16 x i8> [[IN]], i64 12 +; OPT-NEXT: [[TMP25:%.*]] = insertelement <128 x i8> [[TMP23]], i8 [[TMP24]], i32 12 +; OPT-NEXT: [[TMP26:%.*]] = extractelement <16 x i8> [[IN]], i64 13 +; OPT-NEXT: [[TMP27:%.*]] = insertelement <128 x i8> [[TMP25]], i8 [[TMP26]], i32 13 +; OPT-NEXT: [[TMP28:%.*]] = extractelement <16 x i8> [[IN]], i64 14 +; OPT-NEXT: [[TMP29:%.*]] = insertelement <128 x i8> [[TMP27]], i8 [[TMP28]], i32 14 +; OPT-NEXT: [[TMP30:%.*]] = extractelement <16 x i8> [[IN]], i64 15 +; OPT-NEXT: [[TMP31:%.*]] = insertelement <128 x i8> [[TMP29]], i8 [[TMP30]], i32 15 +; OPT-NEXT: [[TMP32:%.*]] = shufflevector <16 x i8> [[IN]], <16 x i8> poison, <128 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison> +; OPT-NEXT: [[TMP33:%.*]] = shufflevector <128 x i8> [[TMP31]], <128 x i8> [[TMP32]], <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 128, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, 
i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127> +; OPT-NEXT: [[TMP34:%.*]] = extractelement <16 x i8> [[IN]], i64 1 +; OPT-NEXT: [[TMP35:%.*]] = insertelement <128 x i8> [[TMP33]], i8 [[TMP34]], i32 17 +; OPT-NEXT: [[TMP36:%.*]] = extractelement <16 x i8> [[IN]], i64 2 +; OPT-NEXT: [[TMP37:%.*]] = insertelement <128 x i8> [[TMP35]], i8 [[TMP36]], i32 18 +; OPT-NEXT: [[TMP38:%.*]] = extractelement <16 x i8> [[IN]], i64 3 +; OPT-NEXT: [[TMP39:%.*]] = insertelement <128 x i8> [[TMP37]], i8 [[TMP38]], i32 19 +; OPT-NEXT: [[TMP40:%.*]] = extractelement <16 x i8> [[IN]], i64 4 +; OPT-NEXT: [[TMP41:%.*]] = insertelement <128 x i8> [[TMP39]], i8 [[TMP40]], i32 20 +; OPT-NEXT: [[TMP42:%.*]] = extractelement <16 x i8> [[IN]], i64 5 +; OPT-NEXT: [[TMP43:%.*]] = insertelement <128 x i8> [[TMP41]], i8 [[TMP42]], i32 21 +; OPT-NEXT: [[TMP44:%.*]] = extractelement <16 x i8> [[IN]], i64 6 +; OPT-NEXT: [[TMP45:%.*]] = insertelement <128 x i8> [[TMP43]], i8 [[TMP44]], i32 22 +; OPT-NEXT: [[TMP46:%.*]] = extractelement <16 x i8> [[IN]], i64 7 +; OPT-NEXT: [[TMP47:%.*]] = insertelement <128 x i8> [[TMP45]], i8 [[TMP46]], i32 23 +; OPT-NEXT: [[TMP48:%.*]] = extractelement <16 x i8> [[IN]], i64 8 +; OPT-NEXT: [[TMP49:%.*]] = insertelement <128 x i8> [[TMP47]], i8 [[TMP48]], i32 24 +; OPT-NEXT: [[TMP50:%.*]] = extractelement <16 x i8> [[IN]], i64 9 +; OPT-NEXT: [[TMP51:%.*]] = insertelement <128 x i8> [[TMP49]], i8 [[TMP50]], i32 25 +; OPT-NEXT: [[TMP52:%.*]] = extractelement <16 x i8> [[IN]], i64 10 +; OPT-NEXT: [[TMP53:%.*]] = insertelement <128 x i8> [[TMP51]], i8 [[TMP52]], i32 26 +; OPT-NEXT: [[TMP54:%.*]] = extractelement <16 x i8> [[IN]], i64 11 +; OPT-NEXT: [[TMP55:%.*]] = insertelement <128 x i8> [[TMP53]], i8 [[TMP54]], i32 27 +; OPT-NEXT: [[TMP56:%.*]] = extractelement <16 x i8> [[IN]], i64 12 +; OPT-NEXT: [[TMP57:%.*]] = insertelement <128 x i8> [[TMP55]], i8 [[TMP56]], i32 28 +; OPT-NEXT: [[TMP58:%.*]] = extractelement <16 x i8> [[IN]], i64 13 +; OPT-NEXT: [[TMP59:%.*]] = insertelement <128 x i8> [[TMP57]], i8 [[TMP58]], i32 29 +; OPT-NEXT: [[TMP60:%.*]] = extractelement <16 x i8> [[IN]], i64 14 +; OPT-NEXT: [[TMP61:%.*]] = insertelement <128 x i8> [[TMP59]], i8 [[TMP60]], i32 30 +; OPT-NEXT: [[TMP62:%.*]] = extractelement <16 x i8> [[IN]], i64 15 +; OPT-NEXT: [[TMP63:%.*]] = insertelement <128 x i8> [[TMP61]], i8 [[TMP62]], i32 31 +; OPT-NEXT: [[TMP64:%.*]] = shufflevector <16 x i8> [[IN]], <16 x i8> poison, <128 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 
poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison> +; OPT-NEXT: [[TMP65:%.*]] = shufflevector <128 x i8> [[TMP63]], <128 x i8> [[TMP64]], <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 128, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127> +; OPT-NEXT: [[TMP66:%.*]] = extractelement <16 x i8> [[IN]], i64 1 +; OPT-NEXT: [[TMP67:%.*]] = insertelement <128 x i8> [[TMP65]], i8 [[TMP66]], i32 33 +; OPT-NEXT: [[TMP68:%.*]] = extractelement <16 x i8> [[IN]], i64 2 +; OPT-NEXT: [[TMP69:%.*]] = insertelement <128 x i8> [[TMP67]], i8 [[TMP68]], i32 34 +; OPT-NEXT: [[TMP70:%.*]] = extractelement <16 x i8> [[IN]], i64 3 +; OPT-NEXT: [[TMP71:%.*]] = insertelement <128 x i8> [[TMP69]], i8 [[TMP70]], i32 35 +; OPT-NEXT: [[TMP72:%.*]] = extractelement <16 x i8> [[IN]], i64 4 +; OPT-NEXT: [[TMP73:%.*]] = insertelement <128 x i8> [[TMP71]], i8 [[TMP72]], i32 36 +; OPT-NEXT: [[TMP74:%.*]] = extractelement <16 x i8> [[IN]], i64 5 +; OPT-NEXT: [[TMP75:%.*]] = insertelement <128 x i8> [[TMP73]], i8 [[TMP74]], i32 37 +; OPT-NEXT: [[TMP76:%.*]] = extractelement <16 x i8> [[IN]], i64 6 +; OPT-NEXT: [[TMP77:%.*]] = insertelement <128 x i8> [[TMP75]], i8 [[TMP76]], i32 38 +; OPT-NEXT: [[TMP78:%.*]] = extractelement <16 x i8> [[IN]], i64 7 +; OPT-NEXT: [[TMP79:%.*]] = insertelement <128 x i8> [[TMP77]], i8 [[TMP78]], i32 39 +; OPT-NEXT: [[TMP80:%.*]] = extractelement <16 x i8> [[IN]], i64 8 +; OPT-NEXT: [[TMP81:%.*]] = insertelement <128 x i8> [[TMP79]], i8 [[TMP80]], i32 40 +; OPT-NEXT: [[TMP82:%.*]] = extractelement <16 x i8> [[IN]], i64 9 +; OPT-NEXT: [[TMP83:%.*]] = insertelement <128 x i8> [[TMP81]], i8 [[TMP82]], i32 41 +; OPT-NEXT: [[TMP84:%.*]] = extractelement <16 x i8> [[IN]], i64 10 +; OPT-NEXT: [[TMP85:%.*]] = insertelement <128 x i8> [[TMP83]], i8 [[TMP84]], i32 42 +; OPT-NEXT: [[TMP86:%.*]] = extractelement <16 x i8> [[IN]], i64 11 +; OPT-NEXT: [[TMP87:%.*]] = insertelement <128 x i8> [[TMP85]], i8 [[TMP86]], i32 43 +; OPT-NEXT: [[TMP88:%.*]] = extractelement <16 x i8> 
[[IN]], i64 12 +; OPT-NEXT: [[TMP89:%.*]] = insertelement <128 x i8> [[TMP87]], i8 [[TMP88]], i32 44 +; OPT-NEXT: [[TMP90:%.*]] = extractelement <16 x i8> [[IN]], i64 13 +; OPT-NEXT: [[TMP91:%.*]] = insertelement <128 x i8> [[TMP89]], i8 [[TMP90]], i32 45 +; OPT-NEXT: [[TMP92:%.*]] = extractelement <16 x i8> [[IN]], i64 14 +; OPT-NEXT: [[TMP93:%.*]] = insertelement <128 x i8> [[TMP91]], i8 [[TMP92]], i32 46 +; OPT-NEXT: [[TMP94:%.*]] = extractelement <16 x i8> [[IN]], i64 15 +; OPT-NEXT: [[TMP95:%.*]] = insertelement <128 x i8> [[TMP93]], i8 [[TMP94]], i32 47 +; OPT-NEXT: [[TMP96:%.*]] = shufflevector <16 x i8> [[IN]], <16 x i8> poison, <128 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison> +; OPT-NEXT: [[TMP97:%.*]] = shufflevector <128 x i8> [[TMP95]], <128 x i8> [[TMP96]], <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 128, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127> +; OPT-NEXT: [[TMP98:%.*]] = extractelement <16 x i8> [[IN]], i64 1 +; OPT-NEXT: [[TMP99:%.*]] = insertelement <128 x i8> [[TMP97]], i8 [[TMP98]], i32 49 +; OPT-NEXT: [[TMP100:%.*]] = extractelement <16 x i8> [[IN]], i64 2 
+; OPT-NEXT: [[TMP101:%.*]] = insertelement <128 x i8> [[TMP99]], i8 [[TMP100]], i32 50 +; OPT-NEXT: [[TMP102:%.*]] = extractelement <16 x i8> [[IN]], i64 3 +; OPT-NEXT: [[TMP103:%.*]] = insertelement <128 x i8> [[TMP101]], i8 [[TMP102]], i32 51 +; OPT-NEXT: [[TMP104:%.*]] = extractelement <16 x i8> [[IN]], i64 4 +; OPT-NEXT: [[TMP105:%.*]] = insertelement <128 x i8> [[TMP103]], i8 [[TMP104]], i32 52 +; OPT-NEXT: [[TMP106:%.*]] = extractelement <16 x i8> [[IN]], i64 5 +; OPT-NEXT: [[TMP107:%.*]] = insertelement <128 x i8> [[TMP105]], i8 [[TMP106]], i32 53 +; OPT-NEXT: [[TMP108:%.*]] = extractelement <16 x i8> [[IN]], i64 6 +; OPT-NEXT: [[TMP109:%.*]] = insertelement <128 x i8> [[TMP107]], i8 [[TMP108]], i32 54 +; OPT-NEXT: [[TMP110:%.*]] = extractelement <16 x i8> [[IN]], i64 7 +; OPT-NEXT: [[TMP111:%.*]] = insertelement <128 x i8> [[TMP109]], i8 [[TMP110]], i32 55 +; OPT-NEXT: [[TMP112:%.*]] = extractelement <16 x i8> [[IN]], i64 8 +; OPT-NEXT: [[TMP113:%.*]] = insertelement <128 x i8> [[TMP111]], i8 [[TMP112]], i32 56 +; OPT-NEXT: [[TMP114:%.*]] = extractelement <16 x i8> [[IN]], i64 9 +; OPT-NEXT: [[TMP115:%.*]] = insertelement <128 x i8> [[TMP113]], i8 [[TMP114]], i32 57 +; OPT-NEXT: [[TMP116:%.*]] = extractelement <16 x i8> [[IN]], i64 10 +; OPT-NEXT: [[TMP117:%.*]] = insertelement <128 x i8> [[TMP115]], i8 [[TMP116]], i32 58 +; OPT-NEXT: [[TMP118:%.*]] = extractelement <16 x i8> [[IN]], i64 11 +; OPT-NEXT: [[TMP119:%.*]] = insertelement <128 x i8> [[TMP117]], i8 [[TMP118]], i32 59 +; OPT-NEXT: [[TMP120:%.*]] = extractelement <16 x i8> [[IN]], i64 12 +; OPT-NEXT: [[TMP121:%.*]] = insertelement <128 x i8> [[TMP119]], i8 [[TMP120]], i32 60 +; OPT-NEXT: [[TMP122:%.*]] = extractelement <16 x i8> [[IN]], i64 13 +; OPT-NEXT: [[TMP123:%.*]] = insertelement <128 x i8> [[TMP121]], i8 [[TMP122]], i32 61 +; OPT-NEXT: [[TMP124:%.*]] = extractelement <16 x i8> [[IN]], i64 14 +; OPT-NEXT: [[TMP125:%.*]] = insertelement <128 x i8> [[TMP123]], i8 [[TMP124]], i32 62 +; OPT-NEXT: [[TMP126:%.*]] = extractelement <16 x i8> [[IN]], i64 15 +; OPT-NEXT: [[TMP127:%.*]] = insertelement <128 x i8> [[TMP125]], i8 [[TMP126]], i32 63 +; OPT-NEXT: [[TMP128:%.*]] = shufflevector <16 x i8> [[IN]], <16 x i8> poison, <128 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 
poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison> +; OPT-NEXT: [[TMP129:%.*]] = shufflevector <128 x i8> [[TMP127]], <128 x i8> [[TMP128]], <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 128, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127> +; OPT-NEXT: [[TMP130:%.*]] = extractelement <16 x i8> [[IN]], i64 1 +; OPT-NEXT: [[TMP131:%.*]] = insertelement <128 x i8> [[TMP129]], i8 [[TMP130]], i32 65 +; OPT-NEXT: [[TMP132:%.*]] = extractelement <16 x i8> [[IN]], i64 2 +; OPT-NEXT: [[TMP133:%.*]] = insertelement <128 x i8> [[TMP131]], i8 [[TMP132]], i32 66 +; OPT-NEXT: [[TMP134:%.*]] = extractelement <16 x i8> [[IN]], i64 3 +; OPT-NEXT: [[TMP135:%.*]] = insertelement <128 x i8> [[TMP133]], i8 [[TMP134]], i32 67 +; OPT-NEXT: [[TMP136:%.*]] = extractelement <16 x i8> [[IN]], i64 4 +; OPT-NEXT: [[TMP137:%.*]] = insertelement <128 x i8> [[TMP135]], i8 [[TMP136]], i32 68 +; OPT-NEXT: [[TMP138:%.*]] = extractelement <16 x i8> [[IN]], i64 5 +; OPT-NEXT: [[TMP139:%.*]] = insertelement <128 x i8> [[TMP137]], i8 [[TMP138]], i32 69 +; OPT-NEXT: [[TMP140:%.*]] = extractelement <16 x i8> [[IN]], i64 6 +; OPT-NEXT: [[TMP141:%.*]] = insertelement <128 x i8> [[TMP139]], i8 [[TMP140]], i32 70 +; OPT-NEXT: [[TMP142:%.*]] = extractelement <16 x i8> [[IN]], i64 7 +; OPT-NEXT: [[TMP143:%.*]] = insertelement <128 x i8> [[TMP141]], i8 [[TMP142]], i32 71 +; OPT-NEXT: [[TMP144:%.*]] = extractelement <16 x i8> [[IN]], i64 8 +; OPT-NEXT: [[TMP145:%.*]] = insertelement <128 x i8> [[TMP143]], i8 [[TMP144]], i32 72 +; OPT-NEXT: [[TMP146:%.*]] = extractelement <16 x i8> [[IN]], i64 9 +; OPT-NEXT: [[TMP147:%.*]] = insertelement <128 x i8> [[TMP145]], i8 [[TMP146]], i32 73 +; OPT-NEXT: [[TMP148:%.*]] = extractelement <16 x i8> [[IN]], i64 10 +; OPT-NEXT: [[TMP149:%.*]] = insertelement <128 x i8> [[TMP147]], i8 [[TMP148]], i32 74 +; OPT-NEXT: [[TMP150:%.*]] = extractelement <16 x i8> [[IN]], i64 11 +; OPT-NEXT: [[TMP151:%.*]] = insertelement <128 x i8> [[TMP149]], i8 [[TMP150]], i32 75 +; OPT-NEXT: [[TMP152:%.*]] = extractelement <16 x i8> [[IN]], i64 12 +; OPT-NEXT: [[TMP153:%.*]] = insertelement <128 x i8> [[TMP151]], i8 [[TMP152]], i32 76 +; OPT-NEXT: [[TMP154:%.*]] = extractelement <16 x i8> [[IN]], i64 13 +; OPT-NEXT: [[TMP155:%.*]] = insertelement <128 x i8> [[TMP153]], i8 [[TMP154]], i32 77 +; OPT-NEXT: [[TMP156:%.*]] = extractelement <16 x i8> [[IN]], i64 14 +; OPT-NEXT: [[TMP157:%.*]] = insertelement <128 x i8> 
[[TMP155]], i8 [[TMP156]], i32 78 +; OPT-NEXT: [[TMP158:%.*]] = extractelement <16 x i8> [[IN]], i64 15 +; OPT-NEXT: [[TMP159:%.*]] = insertelement <128 x i8> [[TMP157]], i8 [[TMP158]], i32 79 +; OPT-NEXT: [[TMP160:%.*]] = shufflevector <16 x i8> [[IN]], <16 x i8> poison, <128 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison> +; OPT-NEXT: [[TMP161:%.*]] = shufflevector <128 x i8> [[TMP159]], <128 x i8> [[TMP160]], <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 128, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127> +; OPT-NEXT: [[TMP162:%.*]] = extractelement <16 x i8> [[IN]], i64 1 +; OPT-NEXT: [[TMP163:%.*]] = insertelement <128 x i8> [[TMP161]], i8 [[TMP162]], i32 81 +; OPT-NEXT: [[TMP164:%.*]] = extractelement <16 x i8> [[IN]], i64 2 +; OPT-NEXT: [[TMP165:%.*]] = insertelement <128 x i8> [[TMP163]], i8 [[TMP164]], i32 82 +; OPT-NEXT: [[TMP166:%.*]] = extractelement <16 x i8> [[IN]], i64 3 +; OPT-NEXT: [[TMP167:%.*]] = insertelement <128 x i8> [[TMP165]], i8 [[TMP166]], i32 83 +; OPT-NEXT: [[TMP168:%.*]] = extractelement <16 x i8> [[IN]], i64 4 +; OPT-NEXT: [[TMP169:%.*]] = insertelement <128 
x i8> [[TMP167]], i8 [[TMP168]], i32 84 +; OPT-NEXT: [[TMP170:%.*]] = extractelement <16 x i8> [[IN]], i64 5 +; OPT-NEXT: [[TMP171:%.*]] = insertelement <128 x i8> [[TMP169]], i8 [[TMP170]], i32 85 +; OPT-NEXT: [[TMP172:%.*]] = extractelement <16 x i8> [[IN]], i64 6 +; OPT-NEXT: [[TMP173:%.*]] = insertelement <128 x i8> [[TMP171]], i8 [[TMP172]], i32 86 +; OPT-NEXT: [[TMP174:%.*]] = extractelement <16 x i8> [[IN]], i64 7 +; OPT-NEXT: [[TMP175:%.*]] = insertelement <128 x i8> [[TMP173]], i8 [[TMP174]], i32 87 +; OPT-NEXT: [[TMP176:%.*]] = extractelement <16 x i8> [[IN]], i64 8 +; OPT-NEXT: [[TMP177:%.*]] = insertelement <128 x i8> [[TMP175]], i8 [[TMP176]], i32 88 +; OPT-NEXT: [[TMP178:%.*]] = extractelement <16 x i8> [[IN]], i64 9 +; OPT-NEXT: [[TMP179:%.*]] = insertelement <128 x i8> [[TMP177]], i8 [[TMP178]], i32 89 +; OPT-NEXT: [[TMP180:%.*]] = extractelement <16 x i8> [[IN]], i64 10 +; OPT-NEXT: [[TMP181:%.*]] = insertelement <128 x i8> [[TMP179]], i8 [[TMP180]], i32 90 +; OPT-NEXT: [[TMP182:%.*]] = extractelement <16 x i8> [[IN]], i64 11 +; OPT-NEXT: [[TMP183:%.*]] = insertelement <128 x i8> [[TMP181]], i8 [[TMP182]], i32 91 +; OPT-NEXT: [[TMP184:%.*]] = extractelement <16 x i8> [[IN]], i64 12 +; OPT-NEXT: [[TMP185:%.*]] = insertelement <128 x i8> [[TMP183]], i8 [[TMP184]], i32 92 +; OPT-NEXT: [[TMP186:%.*]] = extractelement <16 x i8> [[IN]], i64 13 +; OPT-NEXT: [[TMP187:%.*]] = insertelement <128 x i8> [[TMP185]], i8 [[TMP186]], i32 93 +; OPT-NEXT: [[TMP188:%.*]] = extractelement <16 x i8> [[IN]], i64 14 +; OPT-NEXT: [[TMP189:%.*]] = insertelement <128 x i8> [[TMP187]], i8 [[TMP188]], i32 94 +; OPT-NEXT: [[TMP190:%.*]] = extractelement <16 x i8> [[IN]], i64 15 +; OPT-NEXT: [[TMP191:%.*]] = insertelement <128 x i8> [[TMP189]], i8 [[TMP190]], i32 95 +; OPT-NEXT: [[TMP192:%.*]] = shufflevector <16 x i8> [[IN]], <16 x i8> poison, <128 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison> +; OPT-NEXT: [[TMP193:%.*]] = shufflevector <128 x i8> [[TMP191]], <128 x i8> [[TMP192]], <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, 
i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 128, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127> +; OPT-NEXT: [[TMP194:%.*]] = extractelement <16 x i8> [[IN]], i64 1 +; OPT-NEXT: [[TMP195:%.*]] = insertelement <128 x i8> [[TMP193]], i8 [[TMP194]], i32 97 +; OPT-NEXT: [[TMP196:%.*]] = extractelement <16 x i8> [[IN]], i64 2 +; OPT-NEXT: [[TMP197:%.*]] = insertelement <128 x i8> [[TMP195]], i8 [[TMP196]], i32 98 +; OPT-NEXT: [[TMP198:%.*]] = extractelement <16 x i8> [[IN]], i64 3 +; OPT-NEXT: [[TMP199:%.*]] = insertelement <128 x i8> [[TMP197]], i8 [[TMP198]], i32 99 +; OPT-NEXT: [[TMP200:%.*]] = extractelement <16 x i8> [[IN]], i64 4 +; OPT-NEXT: [[TMP201:%.*]] = insertelement <128 x i8> [[TMP199]], i8 [[TMP200]], i32 100 +; OPT-NEXT: [[TMP202:%.*]] = extractelement <16 x i8> [[IN]], i64 5 +; OPT-NEXT: [[TMP203:%.*]] = insertelement <128 x i8> [[TMP201]], i8 [[TMP202]], i32 101 +; OPT-NEXT: [[TMP204:%.*]] = extractelement <16 x i8> [[IN]], i64 6 +; OPT-NEXT: [[TMP205:%.*]] = insertelement <128 x i8> [[TMP203]], i8 [[TMP204]], i32 102 +; OPT-NEXT: [[TMP206:%.*]] = extractelement <16 x i8> [[IN]], i64 7 +; OPT-NEXT: [[TMP207:%.*]] = insertelement <128 x i8> [[TMP205]], i8 [[TMP206]], i32 103 +; OPT-NEXT: [[TMP208:%.*]] = extractelement <16 x i8> [[IN]], i64 8 +; OPT-NEXT: [[TMP209:%.*]] = insertelement <128 x i8> [[TMP207]], i8 [[TMP208]], i32 104 +; OPT-NEXT: [[TMP210:%.*]] = extractelement <16 x i8> [[IN]], i64 9 +; OPT-NEXT: [[TMP211:%.*]] = insertelement <128 x i8> [[TMP209]], i8 [[TMP210]], i32 105 +; OPT-NEXT: [[TMP212:%.*]] = extractelement <16 x i8> [[IN]], i64 10 +; OPT-NEXT: [[TMP213:%.*]] = insertelement <128 x i8> [[TMP211]], i8 [[TMP212]], i32 106 +; OPT-NEXT: [[TMP214:%.*]] = extractelement <16 x i8> [[IN]], i64 11 +; OPT-NEXT: [[TMP215:%.*]] = insertelement <128 x i8> [[TMP213]], i8 [[TMP214]], i32 107 +; OPT-NEXT: [[TMP216:%.*]] = extractelement <16 x i8> [[IN]], i64 12 +; OPT-NEXT: [[TMP217:%.*]] = insertelement <128 x i8> [[TMP215]], i8 [[TMP216]], i32 108 +; OPT-NEXT: [[TMP218:%.*]] = extractelement <16 x i8> [[IN]], i64 13 +; OPT-NEXT: [[TMP219:%.*]] = insertelement <128 x i8> [[TMP217]], i8 [[TMP218]], i32 109 +; OPT-NEXT: [[TMP220:%.*]] = extractelement <16 x i8> [[IN]], i64 14 +; OPT-NEXT: [[TMP221:%.*]] = insertelement <128 x i8> [[TMP219]], i8 [[TMP220]], i32 110 +; OPT-NEXT: [[TMP222:%.*]] = extractelement <16 x i8> [[IN]], i64 15 +; OPT-NEXT: [[TMP223:%.*]] = insertelement <128 x i8> [[TMP221]], i8 [[TMP222]], i32 111 +; OPT-NEXT: [[TMP224:%.*]] = shufflevector <16 x i8> [[IN]], <16 x i8> poison, <128 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 
poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison> +; OPT-NEXT: [[TMP225:%.*]] = shufflevector <128 x i8> [[TMP223]], <128 x i8> [[TMP224]], <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 128, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127> +; OPT-NEXT: [[TMP226:%.*]] = extractelement <16 x i8> [[IN]], i64 1 +; OPT-NEXT: [[TMP227:%.*]] = insertelement <128 x i8> [[TMP225]], i8 [[TMP226]], i32 113 +; OPT-NEXT: [[TMP228:%.*]] = extractelement <16 x i8> [[IN]], i64 2 +; OPT-NEXT: [[TMP229:%.*]] = insertelement <128 x i8> [[TMP227]], i8 [[TMP228]], i32 114 +; OPT-NEXT: [[TMP230:%.*]] = extractelement <16 x i8> [[IN]], i64 3 +; OPT-NEXT: [[TMP231:%.*]] = insertelement <128 x i8> [[TMP229]], i8 [[TMP230]], i32 115 +; OPT-NEXT: [[TMP232:%.*]] = extractelement <16 x i8> [[IN]], i64 4 +; OPT-NEXT: [[TMP233:%.*]] = insertelement <128 x i8> [[TMP231]], i8 [[TMP232]], i32 116 +; OPT-NEXT: [[TMP234:%.*]] = extractelement <16 x i8> [[IN]], i64 5 +; OPT-NEXT: [[TMP235:%.*]] = insertelement <128 x i8> [[TMP233]], i8 [[TMP234]], i32 117 +; OPT-NEXT: [[TMP236:%.*]] = extractelement <16 x i8> [[IN]], i64 6 +; OPT-NEXT: [[TMP237:%.*]] = insertelement <128 x i8> [[TMP235]], i8 [[TMP236]], i32 
118 +; OPT-NEXT: [[TMP238:%.*]] = extractelement <16 x i8> [[IN]], i64 7 +; OPT-NEXT: [[TMP239:%.*]] = insertelement <128 x i8> [[TMP237]], i8 [[TMP238]], i32 119 +; OPT-NEXT: [[TMP240:%.*]] = extractelement <16 x i8> [[IN]], i64 8 +; OPT-NEXT: [[TMP241:%.*]] = insertelement <128 x i8> [[TMP239]], i8 [[TMP240]], i32 120 +; OPT-NEXT: [[TMP242:%.*]] = extractelement <16 x i8> [[IN]], i64 9 +; OPT-NEXT: [[TMP243:%.*]] = insertelement <128 x i8> [[TMP241]], i8 [[TMP242]], i32 121 +; OPT-NEXT: [[TMP244:%.*]] = extractelement <16 x i8> [[IN]], i64 10 +; OPT-NEXT: [[TMP245:%.*]] = insertelement <128 x i8> [[TMP243]], i8 [[TMP244]], i32 122 +; OPT-NEXT: [[TMP246:%.*]] = extractelement <16 x i8> [[IN]], i64 11 +; OPT-NEXT: [[TMP247:%.*]] = insertelement <128 x i8> [[TMP245]], i8 [[TMP246]], i32 123 +; OPT-NEXT: [[TMP248:%.*]] = extractelement <16 x i8> [[IN]], i64 12 +; OPT-NEXT: [[TMP249:%.*]] = insertelement <128 x i8> [[TMP247]], i8 [[TMP248]], i32 124 +; OPT-NEXT: [[TMP250:%.*]] = extractelement <16 x i8> [[IN]], i64 13 +; OPT-NEXT: [[TMP251:%.*]] = insertelement <128 x i8> [[TMP249]], i8 [[TMP250]], i32 125 +; OPT-NEXT: [[TMP252:%.*]] = extractelement <16 x i8> [[IN]], i64 14 +; OPT-NEXT: [[TMP253:%.*]] = insertelement <128 x i8> [[TMP251]], i8 [[TMP252]], i32 126 +; OPT-NEXT: [[TMP254:%.*]] = extractelement <16 x i8> [[IN]], i64 15 +; OPT-NEXT: [[TMP255:%.*]] = insertelement <128 x i8> [[TMP253]], i8 [[TMP254]], i32 127 +; OPT-NEXT: [[TMP256:%.*]] = shufflevector <16 x i8> [[IN]], <16 x i8> poison, <16 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison> +; OPT-NEXT: [[TMP257:%.*]] = insertelement <16 x i8> [[TMP256]], i8 [[TMP162]], i64 1 +; OPT-NEXT: [[TMP258:%.*]] = insertelement <16 x i8> [[TMP257]], i8 [[TMP164]], i64 2 +; OPT-NEXT: [[TMP259:%.*]] = insertelement <16 x i8> [[TMP258]], i8 [[TMP166]], i64 3 +; OPT-NEXT: [[TMP260:%.*]] = insertelement <16 x i8> [[TMP259]], i8 [[TMP168]], i64 4 +; OPT-NEXT: [[TMP261:%.*]] = insertelement <16 x i8> [[TMP260]], i8 [[TMP170]], i64 5 +; OPT-NEXT: [[TMP262:%.*]] = insertelement <16 x i8> [[TMP261]], i8 [[TMP172]], i64 6 +; OPT-NEXT: [[TMP263:%.*]] = insertelement <16 x i8> [[TMP262]], i8 [[TMP174]], i64 7 +; OPT-NEXT: [[TMP264:%.*]] = insertelement <16 x i8> [[TMP263]], i8 [[TMP176]], i64 8 +; OPT-NEXT: [[TMP265:%.*]] = insertelement <16 x i8> [[TMP264]], i8 [[TMP178]], i64 9 +; OPT-NEXT: [[TMP266:%.*]] = insertelement <16 x i8> [[TMP265]], i8 [[TMP180]], i64 10 +; OPT-NEXT: [[TMP267:%.*]] = insertelement <16 x i8> [[TMP266]], i8 [[TMP182]], i64 11 +; OPT-NEXT: [[TMP268:%.*]] = insertelement <16 x i8> [[TMP267]], i8 [[TMP184]], i64 12 +; OPT-NEXT: [[TMP269:%.*]] = insertelement <16 x i8> [[TMP268]], i8 [[TMP186]], i64 13 +; OPT-NEXT: [[TMP270:%.*]] = insertelement <16 x i8> [[TMP269]], i8 [[TMP188]], i64 14 +; OPT-NEXT: [[TMP271:%.*]] = shufflevector <16 x i8> [[TMP270]], <16 x i8> [[IN]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 31> +; OPT-NEXT: [[SUM:%.*]] = add <16 x i8> [[TMP271]], [[ADD]] +; OPT-NEXT: store <16 x i8> [[SUM]], ptr addrspace(3) [[OUT]], align 16 +; OPT-NEXT: ret void +; +entry: + %alloca = freeze <128 x i8> poison + %0 = extractelement <16 x i8> %in, i64 0 + %1 = insertelement <128 x i8> %alloca, i8 %0, i32 0 + %2 = extractelement <16 x i8> %in, i64 1 + %3 = insertelement <128 x i8> %1, i8 %2, i32 1 + %4 
= extractelement <16 x i8> %in, i64 2 + %5 = insertelement <128 x i8> %3, i8 %4, i32 2 + %6 = extractelement <16 x i8> %in, i64 3 + %7 = insertelement <128 x i8> %5, i8 %6, i32 3 + %8 = extractelement <16 x i8> %in, i64 4 + %9 = insertelement <128 x i8> %7, i8 %8, i32 4 + %10 = extractelement <16 x i8> %in, i64 5 + %11 = insertelement <128 x i8> %9, i8 %10, i32 5 + %12 = extractelement <16 x i8> %in, i64 6 + %13 = insertelement <128 x i8> %11, i8 %12, i32 6 + %14 = extractelement <16 x i8> %in, i64 7 + %15 = insertelement <128 x i8> %13, i8 %14, i32 7 + %16 = extractelement <16 x i8> %in, i64 8 + %17 = insertelement <128 x i8> %15, i8 %16, i32 8 + %18 = extractelement <16 x i8> %in, i64 9 + %19 = insertelement <128 x i8> %17, i8 %18, i32 9 + %20 = extractelement <16 x i8> %in, i64 10 + %21 = insertelement <128 x i8> %19, i8 %20, i32 10 + %22 = extractelement <16 x i8> %in, i64 11 + %23 = insertelement <128 x i8> %21, i8 %22, i32 11 + %24 = extractelement <16 x i8> %in, i64 12 + %25 = insertelement <128 x i8> %23, i8 %24, i32 12 + %26 = extractelement <16 x i8> %in, i64 13 + %27 = insertelement <128 x i8> %25, i8 %26, i32 13 + %28 = extractelement <16 x i8> %in, i64 14 + %29 = insertelement <128 x i8> %27, i8 %28, i32 14 + %30 = extractelement <16 x i8> %in, i64 15 + %31 = insertelement <128 x i8> %29, i8 %30, i32 15 + %32 = extractelement <16 x i8> %in, i64 0 + %33 = insertelement <128 x i8> %31, i8 %32, i32 16 + %34 = extractelement <16 x i8> %in, i64 1 + %35 = insertelement <128 x i8> %33, i8 %34, i32 17 + %36 = extractelement <16 x i8> %in, i64 2 + %37 = insertelement <128 x i8> %35, i8 %36, i32 18 + %38 = extractelement <16 x i8> %in, i64 3 + %39 = insertelement <128 x i8> %37, i8 %38, i32 19 + %40 = extractelement <16 x i8> %in, i64 4 + %41 = insertelement <128 x i8> %39, i8 %40, i32 20 + %42 = extractelement <16 x i8> %in, i64 5 + %43 = insertelement <128 x i8> %41, i8 %42, i32 21 + %44 = extractelement <16 x i8> %in, i64 6 + %45 = insertelement <128 x i8> %43, i8 %44, i32 22 + %46 = extractelement <16 x i8> %in, i64 7 + %47 = insertelement <128 x i8> %45, i8 %46, i32 23 + %48 = extractelement <16 x i8> %in, i64 8 + %49 = insertelement <128 x i8> %47, i8 %48, i32 24 + %50 = extractelement <16 x i8> %in, i64 9 + %51 = insertelement <128 x i8> %49, i8 %50, i32 25 + %52 = extractelement <16 x i8> %in, i64 10 + %53 = insertelement <128 x i8> %51, i8 %52, i32 26 + %54 = extractelement <16 x i8> %in, i64 11 + %55 = insertelement <128 x i8> %53, i8 %54, i32 27 + %56 = extractelement <16 x i8> %in, i64 12 + %57 = insertelement <128 x i8> %55, i8 %56, i32 28 + %58 = extractelement <16 x i8> %in, i64 13 + %59 = insertelement <128 x i8> %57, i8 %58, i32 29 + %60 = extractelement <16 x i8> %in, i64 14 + %61 = insertelement <128 x i8> %59, i8 %60, i32 30 + %62 = extractelement <16 x i8> %in, i64 15 + %63 = insertelement <128 x i8> %61, i8 %62, i32 31 + %64 = extractelement <16 x i8> %in, i64 0 + %65 = insertelement <128 x i8> %63, i8 %64, i32 32 + %66 = extractelement <16 x i8> %in, i64 1 + %67 = insertelement <128 x i8> %65, i8 %66, i32 33 + %68 = extractelement <16 x i8> %in, i64 2 + %69 = insertelement <128 x i8> %67, i8 %68, i32 34 + %70 = extractelement <16 x i8> %in, i64 3 + %71 = insertelement <128 x i8> %69, i8 %70, i32 35 + %72 = extractelement <16 x i8> %in, i64 4 + %73 = insertelement <128 x i8> %71, i8 %72, i32 36 + %74 = extractelement <16 x i8> %in, i64 5 + %75 = insertelement <128 x i8> %73, i8 %74, i32 37 + %76 = extractelement <16 x i8> %in, i64 6 + %77 = insertelement <128 x i8> 
%75, i8 %76, i32 38 + %78 = extractelement <16 x i8> %in, i64 7 + %79 = insertelement <128 x i8> %77, i8 %78, i32 39 + %80 = extractelement <16 x i8> %in, i64 8 + %81 = insertelement <128 x i8> %79, i8 %80, i32 40 + %82 = extractelement <16 x i8> %in, i64 9 + %83 = insertelement <128 x i8> %81, i8 %82, i32 41 + %84 = extractelement <16 x i8> %in, i64 10 + %85 = insertelement <128 x i8> %83, i8 %84, i32 42 + %86 = extractelement <16 x i8> %in, i64 11 + %87 = insertelement <128 x i8> %85, i8 %86, i32 43 + %88 = extractelement <16 x i8> %in, i64 12 + %89 = insertelement <128 x i8> %87, i8 %88, i32 44 + %90 = extractelement <16 x i8> %in, i64 13 + %91 = insertelement <128 x i8> %89, i8 %90, i32 45 + %92 = extractelement <16 x i8> %in, i64 14 + %93 = insertelement <128 x i8> %91, i8 %92, i32 46 + %94 = extractelement <16 x i8> %in, i64 15 + %95 = insertelement <128 x i8> %93, i8 %94, i32 47 + %96 = extractelement <16 x i8> %in, i64 0 + %97 = insertelement <128 x i8> %95, i8 %96, i32 48 + %98 = extractelement <16 x i8> %in, i64 1 + %99 = insertelement <128 x i8> %97, i8 %98, i32 49 + %100 = extractelement <16 x i8> %in, i64 2 + %101 = insertelement <128 x i8> %99, i8 %100, i32 50 + %102 = extractelement <16 x i8> %in, i64 3 + %103 = insertelement <128 x i8> %101, i8 %102, i32 51 + %104 = extractelement <16 x i8> %in, i64 4 + %105 = insertelement <128 x i8> %103, i8 %104, i32 52 + %106 = extractelement <16 x i8> %in, i64 5 + %107 = insertelement <128 x i8> %105, i8 %106, i32 53 + %108 = extractelement <16 x i8> %in, i64 6 + %109 = insertelement <128 x i8> %107, i8 %108, i32 54 + %110 = extractelement <16 x i8> %in, i64 7 + %111 = insertelement <128 x i8> %109, i8 %110, i32 55 + %112 = extractelement <16 x i8> %in, i64 8 + %113 = insertelement <128 x i8> %111, i8 %112, i32 56 + %114 = extractelement <16 x i8> %in, i64 9 + %115 = insertelement <128 x i8> %113, i8 %114, i32 57 + %116 = extractelement <16 x i8> %in, i64 10 + %117 = insertelement <128 x i8> %115, i8 %116, i32 58 + %118 = extractelement <16 x i8> %in, i64 11 + %119 = insertelement <128 x i8> %117, i8 %118, i32 59 + %120 = extractelement <16 x i8> %in, i64 12 + %121 = insertelement <128 x i8> %119, i8 %120, i32 60 + %122 = extractelement <16 x i8> %in, i64 13 + %123 = insertelement <128 x i8> %121, i8 %122, i32 61 + %124 = extractelement <16 x i8> %in, i64 14 + %125 = insertelement <128 x i8> %123, i8 %124, i32 62 + %126 = extractelement <16 x i8> %in, i64 15 + %127 = insertelement <128 x i8> %125, i8 %126, i32 63 + %128 = extractelement <16 x i8> %in, i64 0 + %129 = insertelement <128 x i8> %127, i8 %128, i32 64 + %130 = extractelement <16 x i8> %in, i64 1 + %131 = insertelement <128 x i8> %129, i8 %130, i32 65 + %132 = extractelement <16 x i8> %in, i64 2 + %133 = insertelement <128 x i8> %131, i8 %132, i32 66 + %134 = extractelement <16 x i8> %in, i64 3 + %135 = insertelement <128 x i8> %133, i8 %134, i32 67 + %136 = extractelement <16 x i8> %in, i64 4 + %137 = insertelement <128 x i8> %135, i8 %136, i32 68 + %138 = extractelement <16 x i8> %in, i64 5 + %139 = insertelement <128 x i8> %137, i8 %138, i32 69 + %140 = extractelement <16 x i8> %in, i64 6 + %141 = insertelement <128 x i8> %139, i8 %140, i32 70 + %142 = extractelement <16 x i8> %in, i64 7 + %143 = insertelement <128 x i8> %141, i8 %142, i32 71 + %144 = extractelement <16 x i8> %in, i64 8 + %145 = insertelement <128 x i8> %143, i8 %144, i32 72 + %146 = extractelement <16 x i8> %in, i64 9 + %147 = insertelement <128 x i8> %145, i8 %146, i32 73 + %148 = extractelement <16 x 
i8> %in, i64 10 + %149 = insertelement <128 x i8> %147, i8 %148, i32 74 + %150 = extractelement <16 x i8> %in, i64 11 + %151 = insertelement <128 x i8> %149, i8 %150, i32 75 + %152 = extractelement <16 x i8> %in, i64 12 + %153 = insertelement <128 x i8> %151, i8 %152, i32 76 + %154 = extractelement <16 x i8> %in, i64 13 + %155 = insertelement <128 x i8> %153, i8 %154, i32 77 + %156 = extractelement <16 x i8> %in, i64 14 + %157 = insertelement <128 x i8> %155, i8 %156, i32 78 + %158 = extractelement <16 x i8> %in, i64 15 + %159 = insertelement <128 x i8> %157, i8 %158, i32 79 + %160 = extractelement <16 x i8> %in, i64 0 + %161 = insertelement <128 x i8> %159, i8 %160, i32 80 + %162 = extractelement <16 x i8> %in, i64 1 + %163 = insertelement <128 x i8> %161, i8 %162, i32 81 + %164 = extractelement <16 x i8> %in, i64 2 + %165 = insertelement <128 x i8> %163, i8 %164, i32 82 + %166 = extractelement <16 x i8> %in, i64 3 + %167 = insertelement <128 x i8> %165, i8 %166, i32 83 + %168 = extractelement <16 x i8> %in, i64 4 + %169 = insertelement <128 x i8> %167, i8 %168, i32 84 + %170 = extractelement <16 x i8> %in, i64 5 + %171 = insertelement <128 x i8> %169, i8 %170, i32 85 + %172 = extractelement <16 x i8> %in, i64 6 + %173 = insertelement <128 x i8> %171, i8 %172, i32 86 + %174 = extractelement <16 x i8> %in, i64 7 + %175 = insertelement <128 x i8> %173, i8 %174, i32 87 + %176 = extractelement <16 x i8> %in, i64 8 + %177 = insertelement <128 x i8> %175, i8 %176, i32 88 + %178 = extractelement <16 x i8> %in, i64 9 + %179 = insertelement <128 x i8> %177, i8 %178, i32 89 + %180 = extractelement <16 x i8> %in, i64 10 + %181 = insertelement <128 x i8> %179, i8 %180, i32 90 + %182 = extractelement <16 x i8> %in, i64 11 + %183 = insertelement <128 x i8> %181, i8 %182, i32 91 + %184 = extractelement <16 x i8> %in, i64 12 + %185 = insertelement <128 x i8> %183, i8 %184, i32 92 + %186 = extractelement <16 x i8> %in, i64 13 + %187 = insertelement <128 x i8> %185, i8 %186, i32 93 + %188 = extractelement <16 x i8> %in, i64 14 + %189 = insertelement <128 x i8> %187, i8 %188, i32 94 + %190 = extractelement <16 x i8> %in, i64 15 + %191 = insertelement <128 x i8> %189, i8 %190, i32 95 + %192 = extractelement <16 x i8> %in, i64 0 + %193 = insertelement <128 x i8> %191, i8 %192, i32 96 + %194 = extractelement <16 x i8> %in, i64 1 + %195 = insertelement <128 x i8> %193, i8 %194, i32 97 + %196 = extractelement <16 x i8> %in, i64 2 + %197 = insertelement <128 x i8> %195, i8 %196, i32 98 + %198 = extractelement <16 x i8> %in, i64 3 + %199 = insertelement <128 x i8> %197, i8 %198, i32 99 + %200 = extractelement <16 x i8> %in, i64 4 + %201 = insertelement <128 x i8> %199, i8 %200, i32 100 + %202 = extractelement <16 x i8> %in, i64 5 + %203 = insertelement <128 x i8> %201, i8 %202, i32 101 + %204 = extractelement <16 x i8> %in, i64 6 + %205 = insertelement <128 x i8> %203, i8 %204, i32 102 + %206 = extractelement <16 x i8> %in, i64 7 + %207 = insertelement <128 x i8> %205, i8 %206, i32 103 + %208 = extractelement <16 x i8> %in, i64 8 + %209 = insertelement <128 x i8> %207, i8 %208, i32 104 + %210 = extractelement <16 x i8> %in, i64 9 + %211 = insertelement <128 x i8> %209, i8 %210, i32 105 + %212 = extractelement <16 x i8> %in, i64 10 + %213 = insertelement <128 x i8> %211, i8 %212, i32 106 + %214 = extractelement <16 x i8> %in, i64 11 + %215 = insertelement <128 x i8> %213, i8 %214, i32 107 + %216 = extractelement <16 x i8> %in, i64 12 + %217 = insertelement <128 x i8> %215, i8 %216, i32 108 + %218 = extractelement 
<16 x i8> %in, i64 13 + %219 = insertelement <128 x i8> %217, i8 %218, i32 109 + %220 = extractelement <16 x i8> %in, i64 14 + %221 = insertelement <128 x i8> %219, i8 %220, i32 110 + %222 = extractelement <16 x i8> %in, i64 15 + %223 = insertelement <128 x i8> %221, i8 %222, i32 111 + %224 = extractelement <16 x i8> %in, i64 0 + %225 = insertelement <128 x i8> %223, i8 %224, i32 112 + %226 = extractelement <16 x i8> %in, i64 1 + %227 = insertelement <128 x i8> %225, i8 %226, i32 113 + %228 = extractelement <16 x i8> %in, i64 2 + %229 = insertelement <128 x i8> %227, i8 %228, i32 114 + %230 = extractelement <16 x i8> %in, i64 3 + %231 = insertelement <128 x i8> %229, i8 %230, i32 115 + %232 = extractelement <16 x i8> %in, i64 4 + %233 = insertelement <128 x i8> %231, i8 %232, i32 116 + %234 = extractelement <16 x i8> %in, i64 5 + %235 = insertelement <128 x i8> %233, i8 %234, i32 117 + %236 = extractelement <16 x i8> %in, i64 6 + %237 = insertelement <128 x i8> %235, i8 %236, i32 118 + %238 = extractelement <16 x i8> %in, i64 7 + %239 = insertelement <128 x i8> %237, i8 %238, i32 119 + %240 = extractelement <16 x i8> %in, i64 8 + %241 = insertelement <128 x i8> %239, i8 %240, i32 120 + %242 = extractelement <16 x i8> %in, i64 9 + %243 = insertelement <128 x i8> %241, i8 %242, i32 121 + %244 = extractelement <16 x i8> %in, i64 10 + %245 = insertelement <128 x i8> %243, i8 %244, i32 122 + %246 = extractelement <16 x i8> %in, i64 11 + %247 = insertelement <128 x i8> %245, i8 %246, i32 123 + %248 = extractelement <16 x i8> %in, i64 12 + %249 = insertelement <128 x i8> %247, i8 %248, i32 124 + %250 = extractelement <16 x i8> %in, i64 13 + %251 = insertelement <128 x i8> %249, i8 %250, i32 125 + %252 = extractelement <16 x i8> %in, i64 14 + %253 = insertelement <128 x i8> %251, i8 %252, i32 126 + %254 = extractelement <16 x i8> %in, i64 15 + %255 = insertelement <128 x i8> %253, i8 %254, i32 127 + %256 = insertelement <16 x i8> poison, i8 %160, i64 0 + %257 = insertelement <16 x i8> %256, i8 %162, i64 1 + %258 = insertelement <16 x i8> %257, i8 %164, i64 2 + %259 = insertelement <16 x i8> %258, i8 %166, i64 3 + %260 = insertelement <16 x i8> %259, i8 %168, i64 4 + %261 = insertelement <16 x i8> %260, i8 %170, i64 5 + %262 = insertelement <16 x i8> %261, i8 %172, i64 6 + %263 = insertelement <16 x i8> %262, i8 %174, i64 7 + %264 = insertelement <16 x i8> %263, i8 %176, i64 8 + %265 = insertelement <16 x i8> %264, i8 %178, i64 9 + %266 = insertelement <16 x i8> %265, i8 %180, i64 10 + %267 = insertelement <16 x i8> %266, i8 %182, i64 11 + %268 = insertelement <16 x i8> %267, i8 %184, i64 12 + %269 = insertelement <16 x i8> %268, i8 %186, i64 13 + %270 = insertelement <16 x i8> %269, i8 %188, i64 14 + %271 = insertelement <16 x i8> %270, i8 %190, i64 15 + %sum = add <16 x i8> %271, %add + store <16 x i8> %sum, ptr addrspace(3) %out, align 16 + ret void +} + +attributes #0 = { "amdgpu-waves-per-eu"="2,2" } From bb563b196f0e70b2790cdfe2619fbd34f273b508 Mon Sep 17 00:00:00 2001 From: Kelvin Li <kli@ca.ibm.com> Date: Mon, 3 Nov 2025 22:58:56 -0500 Subject: [PATCH 118/313] [OpenMP][AIX] Not to create symbolic links to libomp.so on AIX (#165890) On AIX, it generates libomp for both static and dynamic. There is no need to create symbolic links to libomp.so. 
--------- Co-authored-by: Xing Xue <xingxue@outlook.com> --- openmp/runtime/src/CMakeLists.txt | 40 ++++++++++++++++++++----------- 1 file changed, 26 insertions(+), 14 deletions(-) diff --git a/openmp/runtime/src/CMakeLists.txt b/openmp/runtime/src/CMakeLists.txt index 6ac047a833fe5..5dd7f4b33612d 100644 --- a/openmp/runtime/src/CMakeLists.txt +++ b/openmp/runtime/src/CMakeLists.txt @@ -254,23 +254,35 @@ set(LIBOMP_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR} PARENT_SCOPE) # Add symbolic links to libomp if(NOT WIN32) - add_custom_command(TARGET omp POST_BUILD - COMMAND ${CMAKE_COMMAND} -E create_symlink ${LIBOMP_LIB_FILE} - libgomp${LIBOMP_LIBRARY_SUFFIX} - COMMAND ${CMAKE_COMMAND} -E create_symlink ${LIBOMP_LIB_FILE} - libiomp5${LIBOMP_LIBRARY_SUFFIX} - WORKING_DIRECTORY ${LIBOMP_LIBRARY_DIR} - ) - if(LIBOMP_ENABLE_SHARED) - if(APPLE) - set(VERSIONED_LIBGOMP_NAME libgomp.1${LIBOMP_LIBRARY_SUFFIX}) - else() - set(VERSIONED_LIBGOMP_NAME libgomp${LIBOMP_LIBRARY_SUFFIX}.1) - endif() + if(AIX) + # On AIX, libomp.a is the name for both static and shared objects. + set(LIBOMP_AIX_SUFFIX ${CMAKE_STATIC_LIBRARY_SUFFIX}) add_custom_command(TARGET omp POST_BUILD - COMMAND ${CMAKE_COMMAND} -E create_symlink ${LIBOMP_LIB_FILE} ${VERSIONED_LIBGOMP_NAME} + COMMAND ${CMAKE_COMMAND} -E create_symlink + ${LIBOMP_LIB_NAME}${LIBOMP_AIX_SUFFIX} libgomp${LIBOMP_AIX_SUFFIX} + COMMAND ${CMAKE_COMMAND} -E create_symlink + ${LIBOMP_LIB_NAME}${LIBOMP_AIX_SUFFIX} libiomp5${LIBOMP_AIX_SUFFIX} WORKING_DIRECTORY ${LIBOMP_LIBRARY_DIR} ) + else() + add_custom_command(TARGET omp POST_BUILD + COMMAND ${CMAKE_COMMAND} -E create_symlink ${LIBOMP_LIB_FILE} + libiomp5${LIBOMP_LIBRARY_SUFFIX} + COMMAND ${CMAKE_COMMAND} -E create_symlink ${LIBOMP_LIB_FILE} + libgomp${LIBOMP_LIBRARY_SUFFIX} + WORKING_DIRECTORY ${LIBOMP_LIBRARY_DIR} + ) + if(LIBOMP_ENABLE_SHARED) + if(APPLE) + set(VERSIONED_LIBGOMP_NAME libgomp.1${LIBOMP_LIBRARY_SUFFIX}) + else() + set(VERSIONED_LIBGOMP_NAME libgomp${LIBOMP_LIBRARY_SUFFIX}.1) + endif() + add_custom_command(TARGET omp POST_BUILD + COMMAND ${CMAKE_COMMAND} -E create_symlink ${LIBOMP_LIB_FILE} ${VERSIONED_LIBGOMP_NAME} + WORKING_DIRECTORY ${LIBOMP_LIBRARY_DIR} + ) + endif() endif() endif() From 00ee53cc7b555ce408e6cd86378e3112bbdb0db8 Mon Sep 17 00:00:00 2001 From: Shoreshen <372660931@qq.com> Date: Tue, 4 Nov 2025 12:26:17 +0800 Subject: [PATCH 119/313] [Attributor] Propagate alignment through ptrmask (#150158) Propagate alignment through ptrmask based on potential constant values of mask and align of ptr. 
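To make the propagation rule concrete: the alignment implied by the mask is 1 shifted left by the smallest trailing-zero count across the mask's potential constant values, and the ptrmask result keeps at least the alignment already proven for the source pointer. Below is a minimal standalone C++ sketch of that arithmetic; the helper name is illustrative only, and the Value::MaxAlignmentExponent clamp from the patch is omitted for brevity.

  #include <algorithm>
  #include <cstdint>
  #include <initializer_list>

  // Alignment of a ptrmask result, given the mask's possible (nonzero)
  // constant values and the alignment already known for the pointer.
  static uint64_t alignAfterPtrmask(std::initializer_list<uint64_t> MaskValues,
                                    uint64_t PtrAlign) {
    unsigned MinTZ = 64;
    for (uint64_t M : MaskValues)
      MinTZ = std::min(MinTZ, unsigned(__builtin_ctzll(M))); // trailing zeros
    uint64_t FromMask = uint64_t(1) << MinTZ; // alignment the mask guarantees
    return std::max(FromMask, PtrAlign);      // keep what the pointer had
  }

For example, masks {-32, -8, -16} all have at least 3 trailing zero bits, so a pointer known to be align 2 yields max(1 << 3, 2) = align 8, matching the `align 8` that the new align-ptrmask.ll test expects on such a ptrmask call.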
--------- Co-authored-by: Shilei Tian <i@tianshilei.me> --- llvm/include/llvm/Transforms/IPO/Attributor.h | 11 + .../Transforms/IPO/AttributorAttributes.cpp | 61 +++++ .../X86/min-legal-vector-width.ll | 230 ++++++++++++------ .../Transforms/Attributor/align-ptrmask.ll | 206 ++++++++++++++++ .../Transforms/OpenMP/parallel_deletion.ll | 14 +- 5 files changed, 440 insertions(+), 82 deletions(-) create mode 100644 llvm/test/Transforms/Attributor/align-ptrmask.ll diff --git a/llvm/include/llvm/Transforms/IPO/Attributor.h b/llvm/include/llvm/Transforms/IPO/Attributor.h index a013f27766051..8c0342ae5cf12 100644 --- a/llvm/include/llvm/Transforms/IPO/Attributor.h +++ b/llvm/include/llvm/Transforms/IPO/Attributor.h @@ -5339,6 +5339,17 @@ struct AAPotentialConstantValues return nullptr; } + /// Return the minimum trailing zeros of potential constants + unsigned getAssumedMinTrailingZeros() const { + unsigned TrailingZeros = getAssumedSet().begin()->getBitWidth() + 1; + for (const APInt &It : getAssumedSet()) { + if (It.countTrailingZeros() < TrailingZeros) + TrailingZeros = It.countTrailingZeros(); + } + if (TrailingZeros > getAssumedSet().begin()->getBitWidth()) + return 0; + return TrailingZeros; + } /// See AbstractAttribute::getName() StringRef getName() const override { return "AAPotentialConstantValues"; } diff --git a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp index 5ed47aec08b25..a6ac7610a2c7a 100644 --- a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp +++ b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp @@ -5185,6 +5185,7 @@ struct AADereferenceableCallSiteReturned final // ------------------------ Align Argument Attribute ------------------------ namespace { + static unsigned getKnownAlignForUse(Attributor &A, AAAlign &QueryingAA, Value &AssociatedValue, const Use *U, const Instruction *I, bool &TrackUse) { @@ -5200,6 +5201,28 @@ static unsigned getKnownAlignForUse(Attributor &A, AAAlign &QueryingAA, TrackUse = true; return 0; } + if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) + switch (II->getIntrinsicID()) { + case Intrinsic::ptrmask: { + // Is it appropriate to pull attribute in initialization? 
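+ // If the mask's constant values alone already guarantee the known
+ // alignment of the ptrmask result, this use adds no alignment
+ // requirement on the underlying pointer; otherwise the result's
+ // known alignment is what this use implies for it.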
+ const auto *ConstVals = A.getAAFor<AAPotentialConstantValues>( + QueryingAA, IRPosition::value(*II->getOperand(1)), DepClassTy::NONE); + const auto *AlignAA = A.getAAFor<AAAlign>( + QueryingAA, IRPosition::value(*II), DepClassTy::NONE); + if (ConstVals && ConstVals->isValidState() && ConstVals->isAtFixpoint()) { + unsigned ShiftValue = std::min(ConstVals->getAssumedMinTrailingZeros(), + Value::MaxAlignmentExponent); + Align ConstAlign(UINT64_C(1) << ShiftValue); + if (ConstAlign >= AlignAA->getKnownAlign()) + return Align(1).value(); + } + if (AlignAA) + return AlignAA->getKnownAlign().value(); + break; + } + default: + break; + } MaybeAlign MA; if (const auto *CB = dyn_cast<CallBase>(I)) { @@ -5499,6 +5522,44 @@ struct AAAlignCallSiteReturned final AAAlignCallSiteReturned(const IRPosition &IRP, Attributor &A) : Base(IRP, A) {} + ChangeStatus updateImpl(Attributor &A) override { + Instruction *I = getIRPosition().getCtxI(); + if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) { + switch (II->getIntrinsicID()) { + case Intrinsic::ptrmask: { + Align Alignment; + bool Valid = false; + + const auto *ConstVals = A.getAAFor<AAPotentialConstantValues>( + *this, IRPosition::value(*II->getOperand(1)), DepClassTy::REQUIRED); + if (ConstVals && ConstVals->isValidState()) { + unsigned ShiftValue = + std::min(ConstVals->getAssumedMinTrailingZeros(), + Value::MaxAlignmentExponent); + Alignment = Align(UINT64_C(1) << ShiftValue); + Valid = true; + } + + const auto *AlignAA = + A.getAAFor<AAAlign>(*this, IRPosition::value(*(II->getOperand(0))), + DepClassTy::REQUIRED); + if (AlignAA && AlignAA->isValidState()) { + Alignment = std::max(AlignAA->getAssumedAlign(), Alignment); + Valid = true; + } + + if (Valid) + return clampStateAndIndicateChange<StateType>( + this->getState(), + std::min(this->getAssumedAlign(), Alignment).value()); + break; + } + default: + break; + } + } + return Base::updateImpl(A); + }; /// See AbstractAttribute::trackStatistics() void trackStatistics() const override { STATS_DECLTRACK_CS_ATTR(align); } }; diff --git a/llvm/test/Transforms/Attributor/ArgumentPromotion/X86/min-legal-vector-width.ll b/llvm/test/Transforms/Attributor/ArgumentPromotion/X86/min-legal-vector-width.ll index 649e9467c0318..fffe50fde1e50 100644 --- a/llvm/test/Transforms/Attributor/ArgumentPromotion/X86/min-legal-vector-width.ll +++ b/llvm/test/Transforms/Attributor/ArgumentPromotion/X86/min-legal-vector-width.ll @@ -9,15 +9,25 @@ target triple = "x86_64-unknown-linux-gnu" ; This should promote define internal fastcc void @callee_avx512_legal512_prefer512_call_avx512_legal512_prefer512(ptr %arg, ptr readonly %arg1) #0 { ; -; CHECK: Function Attrs: inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable -; CHECK-LABEL: define {{[^@]+}}@callee_avx512_legal512_prefer512_call_avx512_legal512_prefer512 -; CHECK-SAME: (ptr noalias nofree noundef nonnull writeonly align 64 captures(none) dereferenceable(64) [[ARG:%.*]], <8 x i64> [[TMP0:%.*]]) #[[ATTR0:[0-9]+]] { -; CHECK-NEXT: bb: -; CHECK-NEXT: [[ARG1_PRIV:%.*]] = alloca <8 x i64>, align 64 -; CHECK-NEXT: store <8 x i64> [[TMP0]], ptr [[ARG1_PRIV]], align 64 -; CHECK-NEXT: [[TMP:%.*]] = load <8 x i64>, ptr [[ARG1_PRIV]], align 64 -; CHECK-NEXT: store <8 x i64> [[TMP]], ptr [[ARG]], align 64 -; CHECK-NEXT: ret void +; TUNIT: Function Attrs: inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable +; TUNIT-LABEL: define 
{{[^@]+}}@callee_avx512_legal512_prefer512_call_avx512_legal512_prefer512 +; TUNIT-SAME: (ptr noalias nofree noundef nonnull writeonly align 64 captures(none) dereferenceable(64) [[ARG:%.*]], <8 x i64> [[TMP0:%.*]]) #[[ATTR0:[0-9]+]] { +; TUNIT-NEXT: bb: +; TUNIT-NEXT: [[ARG1_PRIV:%.*]] = alloca <8 x i64>, align 64 +; TUNIT-NEXT: store <8 x i64> [[TMP0]], ptr [[ARG1_PRIV]], align 64 +; TUNIT-NEXT: [[TMP:%.*]] = load <8 x i64>, ptr [[ARG1_PRIV]], align 64 +; TUNIT-NEXT: store <8 x i64> [[TMP]], ptr [[ARG]], align 64 +; TUNIT-NEXT: ret void +; +; CGSCC: Function Attrs: inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable +; CGSCC-LABEL: define {{[^@]+}}@callee_avx512_legal512_prefer512_call_avx512_legal512_prefer512 +; CGSCC-SAME: (ptr noalias nofree noundef nonnull writeonly align 64 captures(none) dereferenceable(64) [[ARG:%.*]], <8 x i64> [[TMP0:%.*]]) #[[ATTR0:[0-9]+]] { +; CGSCC-NEXT: bb: +; CGSCC-NEXT: [[ARG1_PRIV:%.*]] = alloca <8 x i64>, align 64 +; CGSCC-NEXT: store <8 x i64> [[TMP0]], ptr [[ARG1_PRIV]], align 64 +; CGSCC-NEXT: [[TMP:%.*]] = load <8 x i64>, ptr [[ARG1_PRIV]], align 64, !invariant.load [[META0:![0-9]+]] +; CGSCC-NEXT: store <8 x i64> [[TMP]], ptr [[ARG]], align 64 +; CGSCC-NEXT: ret void ; bb: %tmp = load <8 x i64>, ptr %arg1 @@ -66,15 +76,25 @@ bb: ; This should promote define internal fastcc void @callee_avx512_legal512_prefer256_call_avx512_legal512_prefer256(ptr %arg, ptr readonly %arg1) #1 { ; -; CHECK: Function Attrs: inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable -; CHECK-LABEL: define {{[^@]+}}@callee_avx512_legal512_prefer256_call_avx512_legal512_prefer256 -; CHECK-SAME: (ptr noalias nofree noundef nonnull writeonly align 64 captures(none) dereferenceable(64) [[ARG:%.*]], <8 x i64> [[TMP0:%.*]]) #[[ATTR1:[0-9]+]] { -; CHECK-NEXT: bb: -; CHECK-NEXT: [[ARG1_PRIV:%.*]] = alloca <8 x i64>, align 64 -; CHECK-NEXT: store <8 x i64> [[TMP0]], ptr [[ARG1_PRIV]], align 64 -; CHECK-NEXT: [[TMP:%.*]] = load <8 x i64>, ptr [[ARG1_PRIV]], align 64 -; CHECK-NEXT: store <8 x i64> [[TMP]], ptr [[ARG]], align 64 -; CHECK-NEXT: ret void +; TUNIT: Function Attrs: inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable +; TUNIT-LABEL: define {{[^@]+}}@callee_avx512_legal512_prefer256_call_avx512_legal512_prefer256 +; TUNIT-SAME: (ptr noalias nofree noundef nonnull writeonly align 64 captures(none) dereferenceable(64) [[ARG:%.*]], <8 x i64> [[TMP0:%.*]]) #[[ATTR1:[0-9]+]] { +; TUNIT-NEXT: bb: +; TUNIT-NEXT: [[ARG1_PRIV:%.*]] = alloca <8 x i64>, align 64 +; TUNIT-NEXT: store <8 x i64> [[TMP0]], ptr [[ARG1_PRIV]], align 64 +; TUNIT-NEXT: [[TMP:%.*]] = load <8 x i64>, ptr [[ARG1_PRIV]], align 64 +; TUNIT-NEXT: store <8 x i64> [[TMP]], ptr [[ARG]], align 64 +; TUNIT-NEXT: ret void +; +; CGSCC: Function Attrs: inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable +; CGSCC-LABEL: define {{[^@]+}}@callee_avx512_legal512_prefer256_call_avx512_legal512_prefer256 +; CGSCC-SAME: (ptr noalias nofree noundef nonnull writeonly align 64 captures(none) dereferenceable(64) [[ARG:%.*]], <8 x i64> [[TMP0:%.*]]) #[[ATTR1:[0-9]+]] { +; CGSCC-NEXT: bb: +; CGSCC-NEXT: [[ARG1_PRIV:%.*]] = alloca <8 x i64>, align 64 +; CGSCC-NEXT: store <8 x i64> [[TMP0]], ptr [[ARG1_PRIV]], align 64 +; CGSCC-NEXT: [[TMP:%.*]] = load <8 x i64>, ptr [[ARG1_PRIV]], align 64, !invariant.load [[META0]] +; CGSCC-NEXT: store <8 x 
i64> [[TMP]], ptr [[ARG]], align 64 +; CGSCC-NEXT: ret void ; bb: %tmp = load <8 x i64>, ptr %arg1 @@ -123,15 +143,25 @@ bb: ; This should promote define internal fastcc void @callee_avx512_legal512_prefer512_call_avx512_legal512_prefer256(ptr %arg, ptr readonly %arg1) #1 { ; -; CHECK: Function Attrs: inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable -; CHECK-LABEL: define {{[^@]+}}@callee_avx512_legal512_prefer512_call_avx512_legal512_prefer256 -; CHECK-SAME: (ptr noalias nofree noundef nonnull writeonly align 64 captures(none) dereferenceable(64) [[ARG:%.*]], <8 x i64> [[TMP0:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: bb: -; CHECK-NEXT: [[ARG1_PRIV:%.*]] = alloca <8 x i64>, align 64 -; CHECK-NEXT: store <8 x i64> [[TMP0]], ptr [[ARG1_PRIV]], align 64 -; CHECK-NEXT: [[TMP:%.*]] = load <8 x i64>, ptr [[ARG1_PRIV]], align 64 -; CHECK-NEXT: store <8 x i64> [[TMP]], ptr [[ARG]], align 64 -; CHECK-NEXT: ret void +; TUNIT: Function Attrs: inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable +; TUNIT-LABEL: define {{[^@]+}}@callee_avx512_legal512_prefer512_call_avx512_legal512_prefer256 +; TUNIT-SAME: (ptr noalias nofree noundef nonnull writeonly align 64 captures(none) dereferenceable(64) [[ARG:%.*]], <8 x i64> [[TMP0:%.*]]) #[[ATTR1]] { +; TUNIT-NEXT: bb: +; TUNIT-NEXT: [[ARG1_PRIV:%.*]] = alloca <8 x i64>, align 64 +; TUNIT-NEXT: store <8 x i64> [[TMP0]], ptr [[ARG1_PRIV]], align 64 +; TUNIT-NEXT: [[TMP:%.*]] = load <8 x i64>, ptr [[ARG1_PRIV]], align 64 +; TUNIT-NEXT: store <8 x i64> [[TMP]], ptr [[ARG]], align 64 +; TUNIT-NEXT: ret void +; +; CGSCC: Function Attrs: inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable +; CGSCC-LABEL: define {{[^@]+}}@callee_avx512_legal512_prefer512_call_avx512_legal512_prefer256 +; CGSCC-SAME: (ptr noalias nofree noundef nonnull writeonly align 64 captures(none) dereferenceable(64) [[ARG:%.*]], <8 x i64> [[TMP0:%.*]]) #[[ATTR1]] { +; CGSCC-NEXT: bb: +; CGSCC-NEXT: [[ARG1_PRIV:%.*]] = alloca <8 x i64>, align 64 +; CGSCC-NEXT: store <8 x i64> [[TMP0]], ptr [[ARG1_PRIV]], align 64 +; CGSCC-NEXT: [[TMP:%.*]] = load <8 x i64>, ptr [[ARG1_PRIV]], align 64, !invariant.load [[META0]] +; CGSCC-NEXT: store <8 x i64> [[TMP]], ptr [[ARG]], align 64 +; CGSCC-NEXT: ret void ; bb: %tmp = load <8 x i64>, ptr %arg1 @@ -180,15 +210,25 @@ bb: ; This should promote define internal fastcc void @callee_avx512_legal512_prefer256_call_avx512_legal512_prefer512(ptr %arg, ptr readonly %arg1) #0 { ; -; CHECK: Function Attrs: inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable -; CHECK-LABEL: define {{[^@]+}}@callee_avx512_legal512_prefer256_call_avx512_legal512_prefer512 -; CHECK-SAME: (ptr noalias nofree noundef nonnull writeonly align 64 captures(none) dereferenceable(64) [[ARG:%.*]], <8 x i64> [[TMP0:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: bb: -; CHECK-NEXT: [[ARG1_PRIV:%.*]] = alloca <8 x i64>, align 64 -; CHECK-NEXT: store <8 x i64> [[TMP0]], ptr [[ARG1_PRIV]], align 64 -; CHECK-NEXT: [[TMP:%.*]] = load <8 x i64>, ptr [[ARG1_PRIV]], align 64 -; CHECK-NEXT: store <8 x i64> [[TMP]], ptr [[ARG]], align 64 -; CHECK-NEXT: ret void +; TUNIT: Function Attrs: inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable +; TUNIT-LABEL: define {{[^@]+}}@callee_avx512_legal512_prefer256_call_avx512_legal512_prefer512 +; TUNIT-SAME: (ptr noalias nofree noundef 
nonnull writeonly align 64 captures(none) dereferenceable(64) [[ARG:%.*]], <8 x i64> [[TMP0:%.*]]) #[[ATTR0]] { +; TUNIT-NEXT: bb: +; TUNIT-NEXT: [[ARG1_PRIV:%.*]] = alloca <8 x i64>, align 64 +; TUNIT-NEXT: store <8 x i64> [[TMP0]], ptr [[ARG1_PRIV]], align 64 +; TUNIT-NEXT: [[TMP:%.*]] = load <8 x i64>, ptr [[ARG1_PRIV]], align 64 +; TUNIT-NEXT: store <8 x i64> [[TMP]], ptr [[ARG]], align 64 +; TUNIT-NEXT: ret void +; +; CGSCC: Function Attrs: inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable +; CGSCC-LABEL: define {{[^@]+}}@callee_avx512_legal512_prefer256_call_avx512_legal512_prefer512 +; CGSCC-SAME: (ptr noalias nofree noundef nonnull writeonly align 64 captures(none) dereferenceable(64) [[ARG:%.*]], <8 x i64> [[TMP0:%.*]]) #[[ATTR0]] { +; CGSCC-NEXT: bb: +; CGSCC-NEXT: [[ARG1_PRIV:%.*]] = alloca <8 x i64>, align 64 +; CGSCC-NEXT: store <8 x i64> [[TMP0]], ptr [[ARG1_PRIV]], align 64 +; CGSCC-NEXT: [[TMP:%.*]] = load <8 x i64>, ptr [[ARG1_PRIV]], align 64, !invariant.load [[META0]] +; CGSCC-NEXT: store <8 x i64> [[TMP]], ptr [[ARG]], align 64 +; CGSCC-NEXT: ret void ; bb: %tmp = load <8 x i64>, ptr %arg1 @@ -237,13 +277,21 @@ bb: ; This should not promote define internal fastcc void @callee_avx512_legal256_prefer256_call_avx512_legal512_prefer256(ptr %arg, ptr readonly %arg1) #1 { ; -; CHECK: Function Attrs: inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable -; CHECK-LABEL: define {{[^@]+}}@callee_avx512_legal256_prefer256_call_avx512_legal512_prefer256 -; CHECK-SAME: (ptr noalias nofree noundef nonnull writeonly align 64 captures(none) dereferenceable(64) [[ARG:%.*]], ptr noalias nofree noundef nonnull readonly align 64 captures(none) dereferenceable(64) [[ARG1:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: bb: -; CHECK-NEXT: [[TMP:%.*]] = load <8 x i64>, ptr [[ARG1]], align 64 -; CHECK-NEXT: store <8 x i64> [[TMP]], ptr [[ARG]], align 64 -; CHECK-NEXT: ret void +; TUNIT: Function Attrs: inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable +; TUNIT-LABEL: define {{[^@]+}}@callee_avx512_legal256_prefer256_call_avx512_legal512_prefer256 +; TUNIT-SAME: (ptr noalias nofree noundef nonnull writeonly align 64 captures(none) dereferenceable(64) [[ARG:%.*]], ptr noalias nofree noundef nonnull readonly align 64 captures(none) dereferenceable(64) [[ARG1:%.*]]) #[[ATTR1]] { +; TUNIT-NEXT: bb: +; TUNIT-NEXT: [[TMP:%.*]] = load <8 x i64>, ptr [[ARG1]], align 64 +; TUNIT-NEXT: store <8 x i64> [[TMP]], ptr [[ARG]], align 64 +; TUNIT-NEXT: ret void +; +; CGSCC: Function Attrs: inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable +; CGSCC-LABEL: define {{[^@]+}}@callee_avx512_legal256_prefer256_call_avx512_legal512_prefer256 +; CGSCC-SAME: (ptr noalias nofree noundef nonnull writeonly align 64 captures(none) dereferenceable(64) [[ARG:%.*]], ptr noalias nofree noundef nonnull readonly align 64 captures(none) dereferenceable(64) [[ARG1:%.*]]) #[[ATTR1]] { +; CGSCC-NEXT: bb: +; CGSCC-NEXT: [[TMP:%.*]] = load <8 x i64>, ptr [[ARG1]], align 64, !invariant.load [[META0]] +; CGSCC-NEXT: store <8 x i64> [[TMP]], ptr [[ARG]], align 64 +; CGSCC-NEXT: ret void ; bb: %tmp = load <8 x i64>, ptr %arg1 @@ -290,13 +338,21 @@ bb: ; This should not promote define internal fastcc void @callee_avx512_legal512_prefer256_call_avx512_legal256_prefer256(ptr %arg, ptr readonly %arg1) #2 { ; -; CHECK: Function Attrs: inlinehint mustprogress 
nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable -; CHECK-LABEL: define {{[^@]+}}@callee_avx512_legal512_prefer256_call_avx512_legal256_prefer256 -; CHECK-SAME: (ptr noalias nofree noundef nonnull writeonly align 64 captures(none) dereferenceable(64) [[ARG:%.*]], ptr noalias nofree noundef nonnull readonly align 64 captures(none) dereferenceable(64) [[ARG1:%.*]]) #[[ATTR2:[0-9]+]] { -; CHECK-NEXT: bb: -; CHECK-NEXT: [[TMP:%.*]] = load <8 x i64>, ptr [[ARG1]], align 64 -; CHECK-NEXT: store <8 x i64> [[TMP]], ptr [[ARG]], align 64 -; CHECK-NEXT: ret void +; TUNIT: Function Attrs: inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable +; TUNIT-LABEL: define {{[^@]+}}@callee_avx512_legal512_prefer256_call_avx512_legal256_prefer256 +; TUNIT-SAME: (ptr noalias nofree noundef nonnull writeonly align 64 captures(none) dereferenceable(64) [[ARG:%.*]], ptr noalias nofree noundef nonnull readonly align 64 captures(none) dereferenceable(64) [[ARG1:%.*]]) #[[ATTR2]] { +; TUNIT-NEXT: bb: +; TUNIT-NEXT: [[TMP:%.*]] = load <8 x i64>, ptr [[ARG1]], align 64 +; TUNIT-NEXT: store <8 x i64> [[TMP]], ptr [[ARG]], align 64 +; TUNIT-NEXT: ret void +; +; CGSCC: Function Attrs: inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable +; CGSCC-LABEL: define {{[^@]+}}@callee_avx512_legal512_prefer256_call_avx512_legal256_prefer256 +; CGSCC-SAME: (ptr noalias nofree noundef nonnull writeonly align 64 captures(none) dereferenceable(64) [[ARG:%.*]], ptr noalias nofree noundef nonnull readonly align 64 captures(none) dereferenceable(64) [[ARG1:%.*]]) #[[ATTR2]] { +; CGSCC-NEXT: bb: +; CGSCC-NEXT: [[TMP:%.*]] = load <8 x i64>, ptr [[ARG1]], align 64, !invariant.load [[META0]] +; CGSCC-NEXT: store <8 x i64> [[TMP]], ptr [[ARG]], align 64 +; CGSCC-NEXT: ret void ; bb: %tmp = load <8 x i64>, ptr %arg1 @@ -343,15 +399,25 @@ bb: ; This should promote define internal fastcc void @callee_avx2_legal256_prefer256_call_avx2_legal512_prefer256(ptr %arg, ptr readonly %arg1) #3 { ; -; CHECK: Function Attrs: inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable -; CHECK-LABEL: define {{[^@]+}}@callee_avx2_legal256_prefer256_call_avx2_legal512_prefer256 -; CHECK-SAME: (ptr noalias nofree noundef nonnull writeonly align 64 captures(none) dereferenceable(64) [[ARG:%.*]], <8 x i64> [[TMP0:%.*]]) #[[ATTR3:[0-9]+]] { -; CHECK-NEXT: bb: -; CHECK-NEXT: [[ARG1_PRIV:%.*]] = alloca <8 x i64>, align 64 -; CHECK-NEXT: store <8 x i64> [[TMP0]], ptr [[ARG1_PRIV]], align 64 -; CHECK-NEXT: [[TMP:%.*]] = load <8 x i64>, ptr [[ARG1_PRIV]], align 64 -; CHECK-NEXT: store <8 x i64> [[TMP]], ptr [[ARG]], align 64 -; CHECK-NEXT: ret void +; TUNIT: Function Attrs: inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable +; TUNIT-LABEL: define {{[^@]+}}@callee_avx2_legal256_prefer256_call_avx2_legal512_prefer256 +; TUNIT-SAME: (ptr noalias nofree noundef nonnull writeonly align 64 captures(none) dereferenceable(64) [[ARG:%.*]], <8 x i64> [[TMP0:%.*]]) #[[ATTR3:[0-9]+]] { +; TUNIT-NEXT: bb: +; TUNIT-NEXT: [[ARG1_PRIV:%.*]] = alloca <8 x i64>, align 64 +; TUNIT-NEXT: store <8 x i64> [[TMP0]], ptr [[ARG1_PRIV]], align 64 +; TUNIT-NEXT: [[TMP:%.*]] = load <8 x i64>, ptr [[ARG1_PRIV]], align 64 +; TUNIT-NEXT: store <8 x i64> [[TMP]], ptr [[ARG]], align 64 +; TUNIT-NEXT: ret void +; +; CGSCC: Function Attrs: inlinehint mustprogress nofree norecurse nosync 
nounwind willreturn memory(argmem: readwrite) uwtable +; CGSCC-LABEL: define {{[^@]+}}@callee_avx2_legal256_prefer256_call_avx2_legal512_prefer256 +; CGSCC-SAME: (ptr noalias nofree noundef nonnull writeonly align 64 captures(none) dereferenceable(64) [[ARG:%.*]], <8 x i64> [[TMP0:%.*]]) #[[ATTR3:[0-9]+]] { +; CGSCC-NEXT: bb: +; CGSCC-NEXT: [[ARG1_PRIV:%.*]] = alloca <8 x i64>, align 64 +; CGSCC-NEXT: store <8 x i64> [[TMP0]], ptr [[ARG1_PRIV]], align 64 +; CGSCC-NEXT: [[TMP:%.*]] = load <8 x i64>, ptr [[ARG1_PRIV]], align 64, !invariant.load [[META0]] +; CGSCC-NEXT: store <8 x i64> [[TMP]], ptr [[ARG]], align 64 +; CGSCC-NEXT: ret void ; bb: %tmp = load <8 x i64>, ptr %arg1 @@ -400,15 +466,25 @@ bb: ; This should promote define internal fastcc void @callee_avx2_legal512_prefer256_call_avx2_legal256_prefer256(ptr %arg, ptr readonly %arg1) #4 { ; -; CHECK: Function Attrs: inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable -; CHECK-LABEL: define {{[^@]+}}@callee_avx2_legal512_prefer256_call_avx2_legal256_prefer256 -; CHECK-SAME: (ptr noalias nofree noundef nonnull writeonly align 64 captures(none) dereferenceable(64) [[ARG:%.*]], <8 x i64> [[TMP0:%.*]]) #[[ATTR3]] { -; CHECK-NEXT: bb: -; CHECK-NEXT: [[ARG1_PRIV:%.*]] = alloca <8 x i64>, align 64 -; CHECK-NEXT: store <8 x i64> [[TMP0]], ptr [[ARG1_PRIV]], align 64 -; CHECK-NEXT: [[TMP:%.*]] = load <8 x i64>, ptr [[ARG1_PRIV]], align 64 -; CHECK-NEXT: store <8 x i64> [[TMP]], ptr [[ARG]], align 64 -; CHECK-NEXT: ret void +; TUNIT: Function Attrs: inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable +; TUNIT-LABEL: define {{[^@]+}}@callee_avx2_legal512_prefer256_call_avx2_legal256_prefer256 +; TUNIT-SAME: (ptr noalias nofree noundef nonnull writeonly align 64 captures(none) dereferenceable(64) [[ARG:%.*]], <8 x i64> [[TMP0:%.*]]) #[[ATTR3]] { +; TUNIT-NEXT: bb: +; TUNIT-NEXT: [[ARG1_PRIV:%.*]] = alloca <8 x i64>, align 64 +; TUNIT-NEXT: store <8 x i64> [[TMP0]], ptr [[ARG1_PRIV]], align 64 +; TUNIT-NEXT: [[TMP:%.*]] = load <8 x i64>, ptr [[ARG1_PRIV]], align 64 +; TUNIT-NEXT: store <8 x i64> [[TMP]], ptr [[ARG]], align 64 +; TUNIT-NEXT: ret void +; +; CGSCC: Function Attrs: inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable +; CGSCC-LABEL: define {{[^@]+}}@callee_avx2_legal512_prefer256_call_avx2_legal256_prefer256 +; CGSCC-SAME: (ptr noalias nofree noundef nonnull writeonly align 64 captures(none) dereferenceable(64) [[ARG:%.*]], <8 x i64> [[TMP0:%.*]]) #[[ATTR3]] { +; CGSCC-NEXT: bb: +; CGSCC-NEXT: [[ARG1_PRIV:%.*]] = alloca <8 x i64>, align 64 +; CGSCC-NEXT: store <8 x i64> [[TMP0]], ptr [[ARG1_PRIV]], align 64 +; CGSCC-NEXT: [[TMP:%.*]] = load <8 x i64>, ptr [[ARG1_PRIV]], align 64, !invariant.load [[META0]] +; CGSCC-NEXT: store <8 x i64> [[TMP]], ptr [[ARG]], align 64 +; CGSCC-NEXT: ret void ; bb: %tmp = load <8 x i64>, ptr %arg1 @@ -464,6 +540,14 @@ attributes #3 = { inlinehint norecurse nounwind uwtable "target-features"="+avx2 attributes #4 = { inlinehint norecurse nounwind uwtable "target-features"="+avx2" "min-legal-vector-width"="256" "prefer-vector-width"="256" } attributes #5 = { argmemonly nounwind } ;. 
+; CGSCC: attributes #[[ATTR0]] = { inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable "min-legal-vector-width"="512" "prefer-vector-width"="512" "target-features"="+avx512vl" } +; CGSCC: attributes #[[ATTR1]] = { inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable "min-legal-vector-width"="512" "prefer-vector-width"="256" "target-features"="+avx512vl" } +; CGSCC: attributes #[[ATTR2]] = { inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable "min-legal-vector-width"="256" "prefer-vector-width"="256" "target-features"="+avx512vl" } +; CGSCC: attributes #[[ATTR3]] = { inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable "min-legal-vector-width"="512" "prefer-vector-width"="256" "target-features"="+avx2" } +; CGSCC: attributes #[[ATTR4:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: write) } +; CGSCC: attributes #[[ATTR5]] = { nofree willreturn memory(write) } +; CGSCC: attributes #[[ATTR6]] = { nofree nounwind willreturn } +;. ; TUNIT: attributes #[[ATTR0]] = { inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable "min-legal-vector-width"="512" "prefer-vector-width"="512" "target-features"="+avx512vl" } ; TUNIT: attributes #[[ATTR1]] = { inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable "min-legal-vector-width"="512" "prefer-vector-width"="256" "target-features"="+avx512vl" } ; TUNIT: attributes #[[ATTR2]] = { inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable "min-legal-vector-width"="256" "prefer-vector-width"="256" "target-features"="+avx512vl" } @@ -472,11 +556,7 @@ attributes #5 = { argmemonly nounwind } ; TUNIT: attributes #[[ATTR5]] = { nofree willreturn memory(write) } ; TUNIT: attributes #[[ATTR6]] = { nofree nosync nounwind willreturn } ;. -; CGSCC: attributes #[[ATTR0]] = { inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable "min-legal-vector-width"="512" "prefer-vector-width"="512" "target-features"="+avx512vl" } -; CGSCC: attributes #[[ATTR1]] = { inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable "min-legal-vector-width"="512" "prefer-vector-width"="256" "target-features"="+avx512vl" } -; CGSCC: attributes #[[ATTR2]] = { inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable "min-legal-vector-width"="256" "prefer-vector-width"="256" "target-features"="+avx512vl" } -; CGSCC: attributes #[[ATTR3]] = { inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable "min-legal-vector-width"="512" "prefer-vector-width"="256" "target-features"="+avx2" } -; CGSCC: attributes #[[ATTR4:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: write) } -; CGSCC: attributes #[[ATTR5]] = { nofree willreturn memory(write) } -; CGSCC: attributes #[[ATTR6]] = { nofree nounwind willreturn } +; CGSCC: [[META0]] = !{} ;. +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; CHECK: {{.*}} diff --git a/llvm/test/Transforms/Attributor/align-ptrmask.ll b/llvm/test/Transforms/Attributor/align-ptrmask.ll new file mode 100644 index 0000000000000..008f5e1b8a46e --- /dev/null +++ b/llvm/test/Transforms/Attributor/align-ptrmask.ll @@ -0,0 +1,206 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -passes=attributor -S < %s | FileCheck %s + +define ptr @align_ptrmask_back_no_prop(ptr align 2 %x, i1 %cmp1, i1 %cmp2) { +; CHECK-LABEL: define noundef nonnull align 8 dereferenceable(4) ptr @align_ptrmask_back_no_prop( +; CHECK-SAME: ptr nofree writeonly align 2 [[X:%.*]], i1 [[CMP1:%.*]], i1 [[CMP2:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[CMP1]], i64 -32, i64 -8 +; CHECK-NEXT: [[SEL1:%.*]] = select i1 [[CMP2]], i64 [[SEL]], i64 -16 +; CHECK-NEXT: [[P:%.*]] = tail call noundef nonnull align 8 dereferenceable(4) ptr @llvm.ptrmask.p0.i64(ptr [[X]], i64 noundef [[SEL1]]) #[[ATTR4:[0-9]+]] +; CHECK-NEXT: store float 1.000000e+00, ptr [[P]], align 8 +; CHECK-NEXT: ret ptr [[P]] +; + %sel = select i1 %cmp1, i64 -32, i64 -8 + %sel1 = select i1 %cmp2, i64 %sel, i64 -16 + %p = tail call ptr @llvm.ptrmask.p0.i64(ptr %x, i64 %sel1) + store float 1.0, ptr %p, align 8 + ret ptr %p +} + +define ptr @align_ptrmask_back_prop(ptr align 2 %x, i1 %cmp1, i1 %cmp2) { +; CHECK-LABEL: define noundef nonnull align 16 dereferenceable(4) ptr @align_ptrmask_back_prop( +; CHECK-SAME: ptr nofree writeonly align 16 [[X:%.*]], i1 [[CMP1:%.*]], i1 [[CMP2:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[CMP1]], i64 -32, i64 -8 +; CHECK-NEXT: [[SEL1:%.*]] = select i1 [[CMP2]], i64 [[SEL]], i64 -16 +; CHECK-NEXT: [[P:%.*]] = tail call noundef nonnull align 16 dereferenceable(4) ptr @llvm.ptrmask.p0.i64(ptr [[X]], i64 noundef [[SEL1]]) #[[ATTR4]] +; CHECK-NEXT: store float 1.000000e+00, ptr [[P]], align 16 +; CHECK-NEXT: ret ptr [[P]] +; + %sel = select i1 %cmp1, i64 -32, i64 -8 + %sel1 = select i1 %cmp2, i64 %sel, i64 -16 + %p = tail call ptr @llvm.ptrmask.p0.i64(ptr %x, i64 %sel1) + store float 1.0, ptr %p, align 16 + ret ptr %p +} + +define ptr @align_ptrmask_forward_mask(ptr align 2 %x, i1 %cmp1, i1 %cmp2) { +; CHECK-LABEL: define align 8 ptr @align_ptrmask_forward_mask( +; CHECK-SAME: ptr nofree readnone align 2 [[X:%.*]], i1 [[CMP1:%.*]], i1 [[CMP2:%.*]]) #[[ATTR1:[0-9]+]] { +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[CMP1]], i64 -32, i64 -8 +; CHECK-NEXT: [[SEL1:%.*]] = select i1 [[CMP2]], i64 [[SEL]], i64 -16 +; CHECK-NEXT: [[P:%.*]] = tail call align 8 ptr @llvm.ptrmask.p0.i64(ptr [[X]], i64 noundef [[SEL1]]) #[[ATTR4]] +; CHECK-NEXT: ret ptr [[P]] +; + %sel = select i1 %cmp1, i64 -32, i64 -8 + %sel1 = select i1 %cmp2, i64 %sel, i64 -16 + %p = tail call ptr @llvm.ptrmask.p0.i64(ptr %x, i64 %sel1) + ret ptr %p +} + +define ptr @align_ptrmask_forward_ptr(ptr align 16 %x, i1 %cmp1, i1 %cmp2) { +; CHECK-LABEL: define align 16 ptr @align_ptrmask_forward_ptr( +; CHECK-SAME: ptr nofree readnone align 16 [[X:%.*]], i1 [[CMP1:%.*]], i1 [[CMP2:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[CMP1]], i64 -32, i64 -8 +; CHECK-NEXT: [[SEL1:%.*]] = select i1 [[CMP2]], i64 [[SEL]], i64 -16 +; CHECK-NEXT: [[P:%.*]] = tail call align 16 ptr @llvm.ptrmask.p0.i64(ptr [[X]], i64 noundef [[SEL1]]) #[[ATTR4]] +; CHECK-NEXT: ret ptr [[P]] +; + %sel = select i1 %cmp1, i64 -32, i64 -8 + %sel1 = select i1 %cmp2, i64 %sel, i64 -16 + %p = tail call ptr @llvm.ptrmask.p0.i64(ptr %x, i64 
%sel1) + ret ptr %p +} + +define ptr @align_ptrmask_forward_nonconst_mask(ptr align 8 %x, i64 %y, i1 %cmp1, i1 %cmp2) { +; CHECK-LABEL: define align 8 ptr @align_ptrmask_forward_nonconst_mask( +; CHECK-SAME: ptr nofree readnone align 8 [[X:%.*]], i64 [[Y:%.*]], i1 [[CMP1:%.*]], i1 [[CMP2:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[CMP1]], i64 -32, i64 [[Y]] +; CHECK-NEXT: [[SEL1:%.*]] = select i1 [[CMP2]], i64 [[SEL]], i64 -16 +; CHECK-NEXT: [[P:%.*]] = tail call align 8 ptr @llvm.ptrmask.p0.i64(ptr [[X]], i64 [[SEL1]]) #[[ATTR4]] +; CHECK-NEXT: ret ptr [[P]] +; + %sel = select i1 %cmp1, i64 -32, i64 %y + %sel1 = select i1 %cmp2, i64 %sel, i64 -16 + %p = tail call ptr @llvm.ptrmask.p0.i64(ptr %x, i64 %sel1) + ret ptr %p +} + +define ptr @align_ptrmask_back_nonconst_mask(ptr align 4 %x, i64 %y, i1 %cmp1, i1 %cmp2) { +; CHECK-LABEL: define noundef nonnull align 8 dereferenceable(4) ptr @align_ptrmask_back_nonconst_mask( +; CHECK-SAME: ptr nofree writeonly align 8 [[X:%.*]], i64 [[Y:%.*]], i1 [[CMP1:%.*]], i1 [[CMP2:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[CMP1]], i64 -32, i64 [[Y]] +; CHECK-NEXT: [[SEL1:%.*]] = select i1 [[CMP2]], i64 [[SEL]], i64 -16 +; CHECK-NEXT: [[P:%.*]] = tail call noundef nonnull align 8 dereferenceable(4) ptr @llvm.ptrmask.p0.i64(ptr [[X]], i64 [[SEL1]]) #[[ATTR4]] +; CHECK-NEXT: store float 1.000000e+00, ptr [[P]], align 8 +; CHECK-NEXT: ret ptr [[P]] +; + %sel = select i1 %cmp1, i64 -32, i64 %y + %sel1 = select i1 %cmp2, i64 %sel, i64 -16 + %p = tail call ptr @llvm.ptrmask.p0.i64(ptr %x, i64 %sel1) + store float 1.0, ptr %p, align 8 + ret ptr %p +} + +define ptr @align_ptrmask_back_const_back_noprop(ptr align 4 %x, i64 %y, i1 %cmp1, i1 %cmp2) { +; CHECK-LABEL: define noundef nonnull align 8 dereferenceable(4) ptr @align_ptrmask_back_const_back_noprop( +; CHECK-SAME: ptr nofree writeonly align 4 [[X:%.*]], i64 [[Y:%.*]], i1 [[CMP1:%.*]], i1 [[CMP2:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[P:%.*]] = tail call noundef nonnull align 8 dereferenceable(4) ptr @llvm.ptrmask.p0.i64(ptr [[X]], i64 noundef -8) #[[ATTR4]] +; CHECK-NEXT: store float 1.000000e+00, ptr [[P]], align 8 +; CHECK-NEXT: ret ptr [[P]] +; + %p = tail call ptr @llvm.ptrmask.p0.i64(ptr %x, i64 -8) + store float 1.0, ptr %p, align 8 + ret ptr %p +} + +define ptr @align_ptrmask_back_const_back_prop(ptr align 4 %x, i64 %y, i1 %cmp1, i1 %cmp2) { +; CHECK-LABEL: define noundef nonnull align 8 dereferenceable(4) ptr @align_ptrmask_back_const_back_prop( +; CHECK-SAME: ptr nofree writeonly align 8 [[X:%.*]], i64 [[Y:%.*]], i1 [[CMP1:%.*]], i1 [[CMP2:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[P:%.*]] = tail call noundef nonnull align 8 dereferenceable(4) ptr @llvm.ptrmask.p0.i64(ptr [[X]], i64 noundef -2) #[[ATTR4]] +; CHECK-NEXT: store float 1.000000e+00, ptr [[P]], align 8 +; CHECK-NEXT: ret ptr [[P]] +; + %p = tail call ptr @llvm.ptrmask.p0.i64(ptr %x, i64 -2) + store float 1.0, ptr %p, align 8 + ret ptr %p +} + +define ptr @align_ptrmask_back_const_forward_mask(ptr align 4 %x, i64 %y, i1 %cmp1, i1 %cmp2) { +; CHECK-LABEL: define align 8 ptr @align_ptrmask_back_const_forward_mask( +; CHECK-SAME: ptr nofree readnone align 4 [[X:%.*]], i64 [[Y:%.*]], i1 [[CMP1:%.*]], i1 [[CMP2:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[P:%.*]] = tail call align 8 ptr @llvm.ptrmask.p0.i64(ptr [[X]], i64 noundef -8) #[[ATTR4]] +; CHECK-NEXT: ret ptr [[P]] +; + %p = tail call ptr @llvm.ptrmask.p0.i64(ptr %x, i64 -8) + ret ptr %p +} + +define ptr @align_ptrmask_back_const_forward_ptr(ptr align 16 %x, i64 %y, i1 
%cmp1, i1 %cmp2) { +; CHECK-LABEL: define align 16 ptr @align_ptrmask_back_const_forward_ptr( +; CHECK-SAME: ptr nofree readnone align 16 [[X:%.*]], i64 [[Y:%.*]], i1 [[CMP1:%.*]], i1 [[CMP2:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[P:%.*]] = tail call align 16 ptr @llvm.ptrmask.p0.i64(ptr [[X]], i64 noundef -8) #[[ATTR4]] +; CHECK-NEXT: ret ptr [[P]] +; + %p = tail call ptr @llvm.ptrmask.p0.i64(ptr %x, i64 -8) + ret ptr %p +} + +; FIXME: The store will create AAAlign for %ptr1, +; but the attribute didn't propagate through extractelement, need propagate +define <2 x ptr> @ptrmask_v2p0_v2i64(<2 x ptr> align 2 %ptr, i64 %a) { +; CHECK-LABEL: define <2 x ptr> @ptrmask_v2p0_v2i64( +; CHECK-SAME: <2 x ptr> align 2 [[PTR:%.*]], i64 [[A:%.*]]) #[[ATTR2:[0-9]+]] { +; CHECK-NEXT: [[RESULT:%.*]] = call <2 x ptr> @llvm.ptrmask.v2p0.v2i64(<2 x ptr> [[PTR]], <2 x i64> noundef splat (i64 -8)) #[[ATTR4]] +; CHECK-NEXT: [[PTR1:%.*]] = extractelement <2 x ptr> [[RESULT]], i32 0 +; CHECK-NEXT: [[PTR2:%.*]] = extractelement <2 x ptr> [[RESULT]], i32 1 +; CHECK-NEXT: store i64 [[A]], ptr [[PTR1]], align 16 +; CHECK-NEXT: store i64 [[A]], ptr [[PTR2]], align 16 +; CHECK-NEXT: ret <2 x ptr> [[RESULT]] +; + %result = call <2 x ptr> @llvm.ptrmask.v2p0.v2i64(<2 x ptr> %ptr, <2 x i64> splat(i64 -8)) + %ptr1 = extractelement <2 x ptr> %result, i32 0 + %ptr2 = extractelement <2 x ptr> %result, i32 1 + store i64 %a, ptr %ptr1, align 16 + store i64 %a, ptr %ptr2, align 16 + ret <2 x ptr> %result +} + +define ptr @align_ptrmask_forward_mask_positive(ptr align 4 %x, i64 %y, i1 %cmp1, i1 %cmp2) { +; CHECK-LABEL: define align 4 ptr @align_ptrmask_forward_mask_positive( +; CHECK-SAME: ptr nofree readnone align 4 [[X:%.*]], i64 [[Y:%.*]], i1 [[CMP1:%.*]], i1 [[CMP2:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[P:%.*]] = tail call align 4 ptr @llvm.ptrmask.p0.i64(ptr [[X]], i64 noundef 2) #[[ATTR4]] +; CHECK-NEXT: ret ptr [[P]] +; + %p = tail call ptr @llvm.ptrmask.p0.i64(ptr %x, i64 2) + ret ptr %p +} + +define ptr @align_ptrmask_forward_mask_poison(ptr align 4 %x, i64 %y, i1 %cmp1, i1 %cmp2) { +; CHECK-LABEL: define align 4 ptr @align_ptrmask_forward_mask_poison( +; CHECK-SAME: ptr nofree readnone align 4 [[X:%.*]], i64 [[Y:%.*]], i1 [[CMP1:%.*]], i1 [[CMP2:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[P:%.*]] = tail call align 4 ptr @llvm.ptrmask.p0.i64(ptr [[X]], i64 poison) #[[ATTR4]] +; CHECK-NEXT: ret ptr [[P]] +; + %p = tail call ptr @llvm.ptrmask.p0.i64(ptr %x, i64 poison) + ret ptr %p +} + +define ptr @align_ptrmask_forward_mask_max(ptr align 4 %x, i64 %y, i1 %cmp1, i1 %cmp2) { +; CHECK-LABEL: define align 4294967296 ptr @align_ptrmask_forward_mask_max( +; CHECK-SAME: ptr nofree readnone align 4 [[X:%.*]], i64 [[Y:%.*]], i1 [[CMP1:%.*]], i1 [[CMP2:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[P:%.*]] = tail call align 4294967296 ptr @llvm.ptrmask.p0.i64(ptr [[X]], i64 noundef -4294967296) #[[ATTR4]] +; CHECK-NEXT: ret ptr [[P]] +; + %p = tail call ptr @llvm.ptrmask.p0.i64(ptr %x, i64 -4294967296) + ret ptr %p +} + +define ptr @align_ptrmask_forward_mask_max_plus_one(ptr align 4 %x, i64 %y, i1 %cmp1, i1 %cmp2) { +; CHECK-LABEL: define align 4294967296 ptr @align_ptrmask_forward_mask_max_plus_one( +; CHECK-SAME: ptr nofree readnone align 4 [[X:%.*]], i64 [[Y:%.*]], i1 [[CMP1:%.*]], i1 [[CMP2:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[P:%.*]] = tail call align 4294967296 ptr @llvm.ptrmask.p0.i64(ptr [[X]], i64 noundef -8589934592) #[[ATTR4]] +; CHECK-NEXT: ret ptr [[P]] +; + %p = tail call ptr @llvm.ptrmask.p0.i64(ptr %x, i64 -8589934592) + ret ptr %p +} 
+ +define ptr @align_ptrmask_back_callsite(ptr align 4 %x, i64 %y, i1 %cmp1, i1 %cmp2) { +; CHECK-LABEL: define align 16 ptr @align_ptrmask_back_callsite( +; CHECK-SAME: ptr nofree readnone align 16 [[X:%.*]], i64 [[Y:%.*]], i1 [[CMP1:%.*]], i1 [[CMP2:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[P:%.*]] = tail call align 16 ptr @llvm.ptrmask.p0.i64(ptr [[X]], i64 noundef -4) #[[ATTR4]] +; CHECK-NEXT: ret ptr [[P]] +; + %p = tail call align 16 ptr @llvm.ptrmask.p0.i64(ptr %x, i64 -4) + ret ptr %p +} diff --git a/llvm/test/Transforms/OpenMP/parallel_deletion.ll b/llvm/test/Transforms/OpenMP/parallel_deletion.ll index 67970c41f765e..0b6c4f32772f5 100644 --- a/llvm/test/Transforms/OpenMP/parallel_deletion.ll +++ b/llvm/test/Transforms/OpenMP/parallel_deletion.ll @@ -385,7 +385,7 @@ define internal void @.omp_outlined..4(ptr noalias %.global_tid., ptr noalias %. ; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..4 ; CHECK-SAME: (ptr noalias nofree noundef nonnull readonly align 4 captures(none) dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], ptr noalias nofree readnone captures(none) [[DOTBOUND_TID_:%.*]], ptr nofree noundef nonnull align 4 captures(none) dereferenceable(4) [[A:%.*]]) { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4 +; CHECK-NEXT: [[TMP:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !invariant.load [[META1:![0-9]+]] ; CHECK-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_master(ptr noundef nonnull @[[GLOB0]], i32 [[TMP]]) ; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[TMP2]], label [[OMP_IF_END:%.*]], label [[OMP_IF_THEN:%.*]] @@ -458,7 +458,7 @@ define internal void @.omp_outlined..5(ptr noalias %.global_tid., ptr noalias %. ; CHECK-SAME: (ptr noalias nofree readonly captures(none) [[DOTGLOBAL_TID_:%.*]], ptr noalias nofree readnone captures(none) [[DOTBOUND_TID_:%.*]], ptr nofree noundef nonnull align 4 captures(none) dereferenceable(4) [[A:%.*]]) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(ptr noundef nonnull @[[GLOB0]]) #[[ATTR19]] -; CHECK-NEXT: [[TMP:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4 +; CHECK-NEXT: [[TMP:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !invariant.load [[META1]] ; CHECK-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_single(ptr noundef nonnull @[[GLOB0]], i32 [[TMP]]) ; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[TMP2]], label [[OMP_IF_END:%.*]], label [[OMP_IF_THEN:%.*]] @@ -534,7 +534,7 @@ define internal void @.omp_outlined..6(ptr noalias %.global_tid., ptr noalias %. 
; CHECK-NEXT: call void @llvm.lifetime.start.p0(ptr noundef nonnull align 4 [[A1]]) #[[ATTR20:[0-9]+]] ; CHECK-NEXT: store i32 1, ptr [[A1]], align 4 ; CHECK-NEXT: store ptr [[A1]], ptr [[DOTOMP_REDUCTION_RED_LIST]], align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !invariant.load [[META1]] ; CHECK-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_reduce_nowait(ptr noundef nonnull @[[GLOB2:[0-9]+]], i32 [[TMP2]], i32 noundef 1, i64 noundef 8, ptr noundef nonnull align 8 [[DOTOMP_REDUCTION_RED_LIST]], ptr noundef nonnull @.omp.reduction.reduction_func, ptr noundef nonnull @.gomp_critical_user_.reduction.var) ; CHECK-NEXT: switch i32 [[TMP4]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [ ; CHECK-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] @@ -646,10 +646,10 @@ define internal void @.omp.reduction.reduction_func(ptr %arg, ptr %arg1) { ; CHECK-LABEL: define {{[^@]+}}@.omp.reduction.reduction_func ; CHECK-SAME: (ptr nofree noundef nonnull readonly align 8 captures(none) dereferenceable(8) [[ARG:%.*]], ptr nofree noundef nonnull readonly align 8 captures(none) dereferenceable(8) [[ARG1:%.*]]) #[[ATTR10:[0-9]+]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[ARG1]], align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[ARG]], align 8 -; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP4]], align 4 -; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP2]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[ARG1]], align 8, !invariant.load [[META1]] +; CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[ARG]], align 8, !invariant.load [[META1]] +; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP4]], align 4, !invariant.load [[META1]] +; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP2]], align 4, !invariant.load [[META1]] ; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP5]], [[TMP6]] ; CHECK-NEXT: store i32 [[ADD]], ptr [[TMP4]], align 4 ; CHECK-NEXT: ret void From a3a99c3996ffa2abf7e2b1e4abeaa933830f2ac3 Mon Sep 17 00:00:00 2001 From: quic-k <kushpal@qti.qualcomm.com> Date: Tue, 4 Nov 2025 10:05:31 +0530 Subject: [PATCH 120/313] [compiler-rt][x86] Don't use assert.h when building without a libc (#165384) fixes https://github.com/llvm/llvm-project/issues/164932 Signed-off-by: Kushal Pal <kushpal@qti.qualcomm.com> Co-authored-by: Saleem Abdulrasool <compnerd@compnerd.org> --- compiler-rt/lib/builtins/cpu_model/x86.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/compiler-rt/lib/builtins/cpu_model/x86.c b/compiler-rt/lib/builtins/cpu_model/x86.c index c21b2bad1d212..45b7055abf454 100644 --- a/compiler-rt/lib/builtins/cpu_model/x86.c +++ b/compiler-rt/lib/builtins/cpu_model/x86.c @@ -21,7 +21,9 @@ #if defined(__GNUC__) || defined(__clang__) || defined(_MSC_VER) +#if __STDC_HOSTED__ #include <assert.h> +#endif // __STDC_HOSTED__ #if (defined(__GNUC__) || defined(__clang__)) && !defined(_MSC_VER) #include <cpuid.h> @@ -245,8 +247,8 @@ struct __processor_model { unsigned int __cpu_features[1]; } __cpu_model = {0, 0, 0, {0}}; -static_assert(sizeof(__cpu_model) == 16, - "Wrong size of __cpu_model will result in ABI break"); +_Static_assert(sizeof(__cpu_model) == 16, + "Wrong size of __cpu_model will result in ABI break"); // This code is copied from lib/Support/Host.cpp. // Changes to either file should be mirrored in the other. 
@@ -1200,8 +1202,8 @@ int CONSTRUCTOR_ATTRIBUTE __cpu_indicator_init(void) { unsigned Vendor; unsigned Model, Family; unsigned Features[(CPU_FEATURE_MAX + 31) / 32] = {0}; - static_assert(sizeof(Features) / sizeof(Features[0]) == 4, ""); - static_assert(sizeof(__cpu_features2) / sizeof(__cpu_features2[0]) == 3, ""); + _Static_assert(sizeof(Features) / sizeof(Features[0]) == 4, ""); + _Static_assert(sizeof(__cpu_features2) / sizeof(__cpu_features2[0]) == 3, ""); // This function needs to run just once. if (__cpu_model.__cpu_vendor) @@ -1234,9 +1236,11 @@ int CONSTRUCTOR_ATTRIBUTE __cpu_indicator_init(void) { } else __cpu_model.__cpu_vendor = VENDOR_OTHER; +#if __STDC_HOSTED__ assert(__cpu_model.__cpu_vendor < VENDOR_MAX); assert(__cpu_model.__cpu_type < CPU_TYPE_MAX); assert(__cpu_model.__cpu_subtype < CPU_SUBTYPE_MAX); +#endif // __STDC_HOSTED__ return 0; } From 57730f6cdb32654c54e0cd4d535ab7a3b68ef7ce Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?= =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?= =?UTF-8?q?=E3=83=B3=29?= <clementval@gmail.com> Date: Mon, 3 Nov 2025 18:37:50 -1000 Subject: [PATCH 121/313] [flang][cuda] Switch to inline ptx for barrier_arrive (#166261) --- flang/lib/Optimizer/Builder/IntrinsicCall.cpp | 13 ++++++------- flang/test/Lower/CUDA/cuda-device-proc.cuf | 2 +- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp index 6ebd52dcd42ea..2fefad1447132 100644 --- a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp +++ b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp @@ -3386,13 +3386,12 @@ IntrinsicLibrary::genBarrierArriveCnt(mlir::Type resultType, assert(args.size() == 2); mlir::Value barrier = convertPtrToNVVMSpace( builder, loc, args[0], mlir::NVVM::NVVMMemorySpace::Shared); - mlir::Value token = fir::AllocaOp::create(builder, loc, resultType); - // TODO: the MBarrierArriveExpectTxOp is not taking the state argument and - // currently just the sink symbol `_`. - // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-arrive - mlir::NVVM::MBarrierArriveExpectTxOp::create(builder, loc, barrier, args[1], - {}); - return fir::LoadOp::create(builder, loc, token); + return mlir::NVVM::InlinePtxOp::create(builder, loc, {resultType}, + {barrier, args[1]}, {}, + "mbarrier.arrive.expect_tx.release." + "cta.shared::cta.b64 %0, [%1], %2;", + {}) + .getResult(0); } // BARRIER_INIT (CUDA) diff --git a/flang/test/Lower/CUDA/cuda-device-proc.cuf b/flang/test/Lower/CUDA/cuda-device-proc.cuf index 666c394ad6678..038aa0a058277 100644 --- a/flang/test/Lower/CUDA/cuda-device-proc.cuf +++ b/flang/test/Lower/CUDA/cuda-device-proc.cuf @@ -444,7 +444,7 @@ end subroutine ! CHECK: %[[LLVM_PTR:.*]] = fir.convert %[[DECL_SHARED]]#0 : (!fir.ref<i64>) -> !llvm.ptr ! CHECK: %[[SHARED_PTR:.*]] = llvm.addrspacecast %[[LLVM_PTR]] : !llvm.ptr to !llvm.ptr<3> -! CHECK: nvvm.mbarrier.arrive.expect_tx %[[SHARED_PTR]], %{{.*}} : !llvm.ptr<3>, i32 +! 
CHECK: %{{.*}} = nvvm.inline_ptx "mbarrier.arrive.expect_tx.release.cta.shared::cta.b64 %{{.*}}, [%{{.*}}], %{{.*}};" ro(%{{.*}}, %{{.*}} : !llvm.ptr<3>, i32) -> i64 attributes(global) subroutine test_fence() From 95d6caa5d43688d60874cd16eb6c06605000f2a5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?= =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?= =?UTF-8?q?=E3=83=B3=29?= <clementval@gmail.com> Date: Mon, 3 Nov 2025 18:40:18 -1000 Subject: [PATCH 122/313] [flang][cuda] Add interfaces and lowering for atomicaddvector (#166275) --- .../flang/Optimizer/Builder/IntrinsicCall.h | 2 + flang/lib/Optimizer/Builder/IntrinsicCall.cpp | 49 +++++++++++++++++++ flang/module/cudadevice.f90 | 16 ++++++ flang/test/Lower/CUDA/cuda-atomicadd.cuf | 19 +++++++ 4 files changed, 86 insertions(+) create mode 100644 flang/test/Lower/CUDA/cuda-atomicadd.cuf diff --git a/flang/include/flang/Optimizer/Builder/IntrinsicCall.h b/flang/include/flang/Optimizer/Builder/IntrinsicCall.h index 9f15ce68eb3d5..bbdef481a2085 100644 --- a/flang/include/flang/Optimizer/Builder/IntrinsicCall.h +++ b/flang/include/flang/Optimizer/Builder/IntrinsicCall.h @@ -190,6 +190,8 @@ struct IntrinsicLibrary { mlir::Value genAtomicAdd(mlir::Type, llvm::ArrayRef<mlir::Value>); fir::ExtendedValue genAtomicAddR2(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>); + fir::ExtendedValue genAtomicAddVector(mlir::Type, + llvm::ArrayRef<fir::ExtendedValue>); mlir::Value genAtomicAnd(mlir::Type, llvm::ArrayRef<mlir::Value>); fir::ExtendedValue genAtomicCas(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>); diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp index 2fefad1447132..b9ea8b125b780 100644 --- a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp +++ b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp @@ -290,6 +290,14 @@ static constexpr IntrinsicHandler handlers[]{ {"atan2pi", &I::genAtanpi}, {"atand", &I::genAtand}, {"atanpi", &I::genAtanpi}, + {"atomicadd_r2x2", + &I::genAtomicAddVector, + {{{"a", asAddr}, {"v", asAddr}}}, + false}, + {"atomicadd_r4x2", + &I::genAtomicAddVector, + {{{"a", asAddr}, {"v", asAddr}}}, + false}, {"atomicaddd", &I::genAtomicAdd, {{{"a", asAddr}, {"v", asValue}}}, false}, {"atomicaddf", &I::genAtomicAdd, {{{"a", asAddr}, {"v", asValue}}}, false}, {"atomicaddi", &I::genAtomicAdd, {{{"a", asAddr}, {"v", asValue}}}, false}, @@ -3168,6 +3176,47 @@ IntrinsicLibrary::genAtomicAddR2(mlir::Type resultType, mlir::ArrayRef<int64_t>{0}); } +fir::ExtendedValue +IntrinsicLibrary::genAtomicAddVector(mlir::Type resultType, + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 2); + mlir::Value res = fir::AllocaOp::create( + builder, loc, fir::SequenceType::get({2}, resultType)); + mlir::Value a = fir::getBase(args[0]); + if (mlir::isa<fir::BaseBoxType>(a.getType())) { + a = fir::BoxAddrOp::create(builder, loc, a); + } + auto vecTy = mlir::VectorType::get({2}, resultType); + auto refTy = fir::ReferenceType::get(resultType); + mlir::Type i32Ty = builder.getI32Type(); + mlir::Type idxTy = builder.getIndexType(); + mlir::Value zero = builder.createIntegerConstant(loc, idxTy, 0); + mlir::Value one = builder.createIntegerConstant(loc, idxTy, 1); + mlir::Value v1Coord = fir::CoordinateOp::create(builder, loc, refTy, + fir::getBase(args[1]), zero); + mlir::Value v2Coord = fir::CoordinateOp::create(builder, loc, refTy, + fir::getBase(args[1]), one); + mlir::Value v1 = fir::LoadOp::create(builder, loc, v1Coord); + 
mlir::Value v2 = fir::LoadOp::create(builder, loc, v2Coord);
+ mlir::Value undef = mlir::LLVM::UndefOp::create(builder, loc, vecTy);
+ mlir::Value vec1 = mlir::LLVM::InsertElementOp::create(
+ builder, loc, undef, v1, builder.createIntegerConstant(loc, i32Ty, 0));
+ mlir::Value vec2 = mlir::LLVM::InsertElementOp::create(
+ builder, loc, vec1, v2, builder.createIntegerConstant(loc, i32Ty, 1));
+ mlir::Value add =
+ genAtomBinOp(builder, loc, mlir::LLVM::AtomicBinOp::fadd, a, vec2);
+ mlir::Value r1 = mlir::LLVM::ExtractElementOp::create(
+ builder, loc, add, builder.createIntegerConstant(loc, i32Ty, 0));
+ mlir::Value r2 = mlir::LLVM::ExtractElementOp::create(
+ builder, loc, add, builder.createIntegerConstant(loc, i32Ty, 1));
+ mlir::Value c1 = fir::CoordinateOp::create(builder, loc, refTy, res, zero);
+ mlir::Value c2 = fir::CoordinateOp::create(builder, loc, refTy, res, one);
+ fir::StoreOp::create(builder, loc, r1, c1);
+ fir::StoreOp::create(builder, loc, r2, c2);
+ mlir::Value ext = builder.createIntegerConstant(loc, idxTy, 2);
+ return fir::ArrayBoxValue(res, {ext});
+}
+
 mlir::Value IntrinsicLibrary::genAtomicSub(mlir::Type resultType,
 llvm::ArrayRef<mlir::Value> args) {
 assert(args.size() == 2);
diff --git a/flang/module/cudadevice.f90 b/flang/module/cudadevice.f90
index 7a764b589dc56..b1aef95cba8c9 100644
--- a/flang/module/cudadevice.f90
+++ b/flang/module/cudadevice.f90
@@ -1178,6 +1178,22 @@ attributes(device) pure integer(4) function atomicaddr2(address, val)
 end function
 end interface
+ interface atomicaddvector
+ attributes(device) pure function atomicadd_r2x2(address, val) result(z)
+ !dir$ ignore_tkr (rd) address, (d) val
+ real(2), dimension(2), intent(inout) :: address
+ real(2), dimension(2), intent(in) :: val
+ real(2), dimension(2) :: z
+ end function
+
+ attributes(device) pure function atomicadd_r4x2(address, val) result(z)
+ !dir$ ignore_tkr (rd) address, (d) val
+ real(4), dimension(2), intent(inout) :: address
+ real(4), dimension(2), intent(in) :: val
+ real(4), dimension(2) :: z
+ end function
+ end interface
+
 interface atomicsub
 attributes(device) pure integer function atomicsubi(address, val)
 !dir$ ignore_tkr (d) address, (d) val
diff --git a/flang/test/Lower/CUDA/cuda-atomicadd.cuf b/flang/test/Lower/CUDA/cuda-atomicadd.cuf
new file mode 100644
index 0000000000000..baa6cdb3d5869
--- /dev/null
+++ b/flang/test/Lower/CUDA/cuda-atomicadd.cuf
@@ -0,0 +1,19 @@
+! RUN: bbc -emit-hlfir -fcuda %s -o - | FileCheck %s
+
+! Test CUDA Fortran atomicadd functions available in the cudadevice module
+
+attributes(global) subroutine atomicaddvector_r2()
+ real(2), device :: a(2), tmp1(2), tmp2(2)
+ tmp1 = atomicAddVector(a, tmp2)
+end subroutine
+
+! CHECK-LABEL: func.func @_QPatomicaddvector_r2() attributes {cuf.proc_attr = #cuf.cuda_proc<global>}
+! CHECK: llvm.atomicrmw fadd %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, vector<2xf16>
+
+attributes(global) subroutine atomicaddvector_r4()
+ real(4), device :: a(2), tmp1(2), tmp2(2)
+ tmp1 = atomicAddVector(a, tmp2)
+end subroutine
+
+! CHECK-LABEL: func.func @_QPatomicaddvector_r4() attributes {cuf.proc_attr = #cuf.cuda_proc<global>}
+!
CHECK: llvm.atomicrmw fadd %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, vector<2xf32> From 993a38fa539d23f83711a0e07d3cc40a0947ec7e Mon Sep 17 00:00:00 2001 From: Lee Wei <lee10202013@gmail.com> Date: Mon, 3 Nov 2025 20:52:51 -0800 Subject: [PATCH 123/313] [MLIR][Affine] Extend getVectorReductionOp to support xor/maxnumf/minnumf (#163310) This PR extends the `getVectorReductionOp` function, which is used by the affine vectorizer, to also recognize and support `xor/maxnumf/minnumf` reduction operations. --- .../Affine/Analysis/AffineAnalysis.cpp | 5 +- mlir/lib/Dialect/Vector/IR/VectorOps.cpp | 10 +- .../Conversion/ConvertToSPIRV/vector.mlir | 36 +++++++ .../SuperVectorize/vectorize_reduction.mlir | 100 ++++++++++++++++++ 4 files changed, 148 insertions(+), 3 deletions(-) diff --git a/mlir/lib/Dialect/Affine/Analysis/AffineAnalysis.cpp b/mlir/lib/Dialect/Affine/Analysis/AffineAnalysis.cpp index 4d2d8738aa4ad..3d1a73417d1ea 100644 --- a/mlir/lib/Dialect/Affine/Analysis/AffineAnalysis.cpp +++ b/mlir/lib/Dialect/Affine/Analysis/AffineAnalysis.cpp @@ -66,9 +66,10 @@ static Value getSupportedReduction(AffineForOp forOp, unsigned pos, .Case([](arith::MaxSIOp) { return arith::AtomicRMWKind::maxs; }) .Case([](arith::MinUIOp) { return arith::AtomicRMWKind::minu; }) .Case([](arith::MaxUIOp) { return arith::AtomicRMWKind::maxu; }) + .Case([](arith::XOrIOp) { return arith::AtomicRMWKind::xori; }) + .Case([](arith::MaxNumFOp) { return arith::AtomicRMWKind::maxnumf; }) + .Case([](arith::MinNumFOp) { return arith::AtomicRMWKind::minnumf; }) .Default([](Operation *) -> std::optional<arith::AtomicRMWKind> { - // TODO: AtomicRMW supports other kinds of reductions this is - // currently not detecting, add those when the need arises. return std::nullopt; }); if (!maybeKind) diff --git a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp index ae3423c40040d..daef0ba02100a 100644 --- a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp +++ b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp @@ -717,7 +717,15 @@ Value mlir::vector::getVectorReductionOp(arith::AtomicRMWKind op, case arith::AtomicRMWKind::ori: return vector::ReductionOp::create(builder, vector.getLoc(), CombiningKind::OR, vector); - // TODO: Add remaining reduction operations. 
+ case arith::AtomicRMWKind::minnumf: + return vector::ReductionOp::create(builder, vector.getLoc(), + CombiningKind::MINNUMF, vector); + case arith::AtomicRMWKind::maxnumf: + return vector::ReductionOp::create(builder, vector.getLoc(), + CombiningKind::MAXNUMF, vector); + case arith::AtomicRMWKind::xori: + return vector::ReductionOp::create(builder, vector.getLoc(), + CombiningKind::XOR, vector); default: (void)emitOptionalError(loc, "Reduction operation type not supported"); break; diff --git a/mlir/test/Conversion/ConvertToSPIRV/vector.mlir b/mlir/test/Conversion/ConvertToSPIRV/vector.mlir index a75f30d57fa74..cd8cfc8736915 100644 --- a/mlir/test/Conversion/ConvertToSPIRV/vector.mlir +++ b/mlir/test/Conversion/ConvertToSPIRV/vector.mlir @@ -275,6 +275,42 @@ func.func @reduction_minimumf(%v : vector<3xf32>, %s: f32) -> f32 { // ----- +// CHECK-LABEL: spirv.func @reduction_minnumf( +// CHECK-SAME: %[[V:.*]]: vector<3xf32>, +// CHECK-SAME: %[[S:.*]]: f32) -> f32 "None" { +// CHECK: %[[S0:.*]] = spirv.CompositeExtract %[[V]][0 : i32] : vector<3xf32> +// CHECK: %[[S1:.*]] = spirv.CompositeExtract %[[V]][1 : i32] : vector<3xf32> +// CHECK: %[[S2:.*]] = spirv.CompositeExtract %[[V]][2 : i32] : vector<3xf32> +// CHECK: %[[MIN0:.*]] = spirv.GL.FMin %[[S0]], %[[S1]] : f32 +// CHECK: %[[MIN1:.*]] = spirv.GL.FMin %[[MIN0]], %[[S2]] : f32 +// CHECK: %[[MIN2:.*]] = spirv.GL.FMin %[[MIN1]], %[[S]] : f32 +// CHECK: spirv.ReturnValue %[[MIN2]] : f32 +// CHECK: } +func.func @reduction_minnumf(%v : vector<3xf32>, %s: f32) -> f32 { + %reduce = vector.reduction <minnumf>, %v, %s : vector<3xf32> into f32 + return %reduce : f32 +} + +// ----- + +// CHECK-LABEL: spirv.func @reduction_maxnumf( +// CHECK-SAME: %[[V:.*]]: vector<3xf32>, +// CHECK-SAME: %[[S:.*]]: f32) -> f32 "None" { +// CHECK: %[[S0:.*]] = spirv.CompositeExtract %[[V]][0 : i32] : vector<3xf32> +// CHECK: %[[S1:.*]] = spirv.CompositeExtract %[[V]][1 : i32] : vector<3xf32> +// CHECK: %[[S2:.*]] = spirv.CompositeExtract %[[V]][2 : i32] : vector<3xf32> +// CHECK: %[[MAX0:.*]] = spirv.GL.FMax %[[S0]], %[[S1]] : f32 +// CHECK: %[[MAX1:.*]] = spirv.GL.FMax %[[MAX0]], %[[S2]] : f32 +// CHECK: %[[MAX2:.*]] = spirv.GL.FMax %[[MAX1]], %[[S]] : f32 +// CHECK: spirv.ReturnValue %[[MAX2]] : f32 +// CHECK: } +func.func @reduction_maxnumf(%v : vector<3xf32>, %s: f32) -> f32 { + %reduce = vector.reduction <maxnumf>, %v, %s : vector<3xf32> into f32 + return %reduce : f32 +} + +// ----- + // CHECK-LABEL: func @reduction_maxsi // CHECK-SAME: (%[[V:.+]]: vector<3xi32>, %[[S:.+]]: i32) // CHECK: %[[S0:.+]] = spirv.CompositeExtract %[[V]][0 : i32] : vector<3xi32> diff --git a/mlir/test/Dialect/Affine/SuperVectorize/vectorize_reduction.mlir b/mlir/test/Dialect/Affine/SuperVectorize/vectorize_reduction.mlir index b616632a6fe24..b062736575ad7 100644 --- a/mlir/test/Dialect/Affine/SuperVectorize/vectorize_reduction.mlir +++ b/mlir/test/Dialect/Affine/SuperVectorize/vectorize_reduction.mlir @@ -243,6 +243,106 @@ func.func @vecdim_reduction_ori(%in: memref<256x512xi32>, %out: memref<256xi32>) // CHECK: affine.store %[[final_red]], %{{.*}} : memref<256xi32> // CHECK: } +// ----- + +func.func @vecdim_reduction_xori(%in: memref<256x512xi32>, %out: memref<256xi32>) { + %cst = arith.constant 0 : i32 + affine.for %i = 0 to 256 { + %final_red = affine.for %j = 0 to 512 iter_args(%red_iter = %cst) -> (i32) { + %ld = affine.load %in[%i, %j] : memref<256x512xi32> + %xor = arith.xori %red_iter, %ld : i32 + affine.yield %xor : i32 + } + affine.store %final_red, %out[%i] : 
memref<256xi32> + } + return +} + +// CHECK-LABEL: func.func @vecdim_reduction_xori( +// CHECK-SAME: %[[input:.*]]: memref<256x512xi32>, +// CHECK-SAME: %[[output:.*]]: memref<256xi32>) { +// CHECK: %[[cst:.*]] = arith.constant 0 : i32 +// CHECK: affine.for %{{.*}} = 0 to 256 { +// CHECK: %[[vzero:.*]] = arith.constant dense<0> : vector<128xi32> +// CHECK: %[[vred:.*]] = affine.for %{{.*}} = 0 to 512 step 128 iter_args(%[[red_iter:.*]] = %[[vzero]]) -> (vector<128xi32>) { +// CHECK: %[[poison:.*]] = ub.poison : i32 +// CHECK: %[[ld:.*]] = vector.transfer_read %[[input]]{{\[}}%{{.*}}, %{{.*}}], %[[poison]] : memref<256x512xi32>, vector<128xi32> +// CHECK: %[[xor:.*]] = arith.xori %[[red_iter]], %[[ld]] : vector<128xi32> +// CHECK: affine.yield %[[xor]] : vector<128xi32> +// CHECK: } +// CHECK: %[[final_red:.*]] = vector.reduction <xor>, %[[vred]] : vector<128xi32> into i32 +// CHECK: affine.store %[[final_red]], %[[output]]{{\[}}%{{.*}}] : memref<256xi32> +// CHECK: } +// CHECK: return +// CHECK: } + +// ----- + +func.func @vecdim_reduction_minnumf(%in: memref<256x512xf32>, %out: memref<256xf32>) { + %cst = arith.constant 0xFF800000 : f32 + affine.for %i = 0 to 256 { + %final_red = affine.for %j = 0 to 512 iter_args(%red_iter = %cst) -> (f32) { + %ld = affine.load %in[%i, %j] : memref<256x512xf32> + %min = arith.minnumf %red_iter, %ld : f32 + affine.yield %min : f32 + } + affine.store %final_red, %out[%i] : memref<256xf32> + } + return +} + +// CHECK-LABEL: func.func @vecdim_reduction_minnumf( +// CHECK-SAME: %[[input:.*]]: memref<256x512xf32>, +// CHECK-SAME: %[[output:.*]]: memref<256xf32>) { +// CHECK: %[[cst:.*]] = arith.constant 0xFF800000 : f32 +// CHECK: affine.for %{{.*}} = 0 to 256 { +// CHECK: %[[vzero:.*]] = arith.constant dense<0x7FC00000> : vector<128xf32> +// CHECK: %[[vred:.*]] = affine.for %{{.*}} = 0 to 512 step 128 iter_args(%[[red_iter:.*]] = %[[vzero]]) -> (vector<128xf32>) { +// CHECK: %[[poison:.*]] = ub.poison : f32 +// CHECK: %[[ld:.*]] = vector.transfer_read %[[input]]{{\[}}%{{.*}}, %{{.*}}], %[[poison]] : memref<256x512xf32>, vector<128xf32> +// CHECK: %[[min:.*]] = arith.minnumf %[[red_iter]], %[[ld]] : vector<128xf32> +// CHECK: affine.yield %[[min]] : vector<128xf32> +// CHECK: } +// CHECK: %[[red_scalar:.*]] = vector.reduction <minnumf>, %[[vred]] : vector<128xf32> into f32 +// CHECK: %[[final_red:.*]] = arith.minnumf %[[red_scalar]], %[[cst]] : f32 +// CHECK: affine.store %[[final_red]], %[[output]]{{\[}}%{{.*}}] : memref<256xf32> +// CHECK: } +// CHECK: return +// CHECK: } + +// ----- + +func.func @vecdim_reduction_maxnumf(%in: memref<256x512xf32>, %out: memref<256xf32>) { + %cst = arith.constant 0xFF800000 : f32 + affine.for %i = 0 to 256 { + %final_red = affine.for %j = 0 to 512 iter_args(%red_iter = %cst) -> (f32) { + %ld = affine.load %in[%i, %j] : memref<256x512xf32> + %max = arith.maxnumf %red_iter, %ld : f32 + affine.yield %max : f32 + } + affine.store %final_red, %out[%i] : memref<256xf32> + } + return +} + +// CHECK-LABEL: func.func @vecdim_reduction_maxnumf( +// CHECK-SAME: %[[input:.*]]: memref<256x512xf32>, +// CHECK-SAME: %[[output:.*]]: memref<256xf32>) { +// CHECK: %[[cst:.*]] = arith.constant 0xFF800000 : f32 +// CHECK: affine.for %{{.*}} = 0 to 256 { +// CHECK: %[[vzero:.*]] = arith.constant dense<0xFFC00000> : vector<128xf32> +// CHECK: %[[vred:.*]] = affine.for %{{.*}} = 0 to 512 step 128 iter_args(%[[red_iter:.*]] = %[[vzero]]) -> (vector<128xf32>) { +// CHECK: %[[poison:.*]] = ub.poison : f32 +// CHECK: %[[ld:.*]] = vector.transfer_read 
%[[input]]{{\[}}%{{.*}}, %{{.*}}], %[[poison]] : memref<256x512xf32>, vector<128xf32>
+// CHECK: %[[max:.*]] = arith.maxnumf %[[red_iter]], %[[ld]] : vector<128xf32>
+// CHECK: affine.yield %[[max]] : vector<128xf32>
+// CHECK: }
+// CHECK: %[[red_scalar:.*]] = vector.reduction <maxnumf>, %[[vred]] : vector<128xf32> into f32
+// CHECK: %[[final_red:.*]] = arith.maxnumf %[[red_scalar]], %[[cst]] : f32
+// CHECK: affine.store %[[final_red]], %[[output]]{{\[}}%{{.*}}] : memref<256xf32>
+// CHECK: }
+// CHECK: return
+// CHECK: }

// -----

From 93e860e694770f52a9eeecda88ba11173c291ef8 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Mon, 3 Nov 2025 21:09:56 -0800
Subject: [PATCH 124/313] IR: Remove null UseList checks in hasNUses methods
 (#165929)

There do not appear to be any cases where this is used. This does introduce
an odd asymmetry where use_empty is not equivalent to hasNUses(0).

---
 llvm/lib/IR/Value.cpp | 8 -------
 llvm/unittests/IR/ConstantsTest.cpp | 36 +++++++++++++++++++++--------
 2 files changed, 26 insertions(+), 18 deletions(-)

diff --git a/llvm/lib/IR/Value.cpp b/llvm/lib/IR/Value.cpp
index b775cbb0c7920..95d61a987f6c1 100644
--- a/llvm/lib/IR/Value.cpp
+++ b/llvm/lib/IR/Value.cpp
@@ -148,18 +148,10 @@ void Value::destroyValueName() {
 }

 bool Value::hasNUses(unsigned N) const {
- if (!UseList)
- return N == 0;
-
- // TODO: Disallow for ConstantData and remove !UseList check?
 return hasNItems(use_begin(), use_end(), N);
 }

 bool Value::hasNUsesOrMore(unsigned N) const {
- // TODO: Disallow for ConstantData and remove !UseList check?
- if (!UseList)
- return N == 0;
-
 return hasNItemsOrMore(use_begin(), use_end(), N);
 }

diff --git a/llvm/unittests/IR/ConstantsTest.cpp b/llvm/unittests/IR/ConstantsTest.cpp
index 6376165cbe766..9cb9e1236b2d1 100644
--- a/llvm/unittests/IR/ConstantsTest.cpp
+++ b/llvm/unittests/IR/ConstantsTest.cpp
@@ -29,13 +29,8 @@ TEST(ConstantsTest, UseCounts) {
 EXPECT_TRUE(Zero->use_empty());
 EXPECT_EQ(Zero->getNumUses(), 0u);
- EXPECT_TRUE(Zero->hasNUses(0));
 EXPECT_FALSE(Zero->hasOneUse());
 EXPECT_FALSE(Zero->hasOneUser());
- EXPECT_FALSE(Zero->hasNUses(1));
- EXPECT_FALSE(Zero->hasNUsesOrMore(1));
- EXPECT_FALSE(Zero->hasNUses(2));
- EXPECT_FALSE(Zero->hasNUsesOrMore(2));

 std::unique_ptr<Module> M(new Module("MyModule", Context));

@@ -50,15 +45,36 @@ TEST(ConstantsTest, UseCounts) {
 // Still looks like use_empty with uses.
EXPECT_TRUE(Zero->use_empty()); EXPECT_EQ(Zero->getNumUses(), 0u); - EXPECT_TRUE(Zero->hasNUses(0)); EXPECT_FALSE(Zero->hasOneUse()); EXPECT_FALSE(Zero->hasOneUser()); - EXPECT_FALSE(Zero->hasNUses(1)); - EXPECT_FALSE(Zero->hasNUsesOrMore(1)); - EXPECT_FALSE(Zero->hasNUses(2)); - EXPECT_FALSE(Zero->hasNUsesOrMore(2)); } +#ifdef GTEST_HAS_DEATH_TEST +#ifndef NDEBUG + +TEST(ConstantsTest, hasNUsesInvalid) { + LLVMContext Context; + Type *Int32Ty = Type::getInt32Ty(Context); + Constant *Zero = ConstantInt::get(Int32Ty, 0); + std::unique_ptr<Module> M(new Module("MyModule", Context)); + + // Introduce some uses + new GlobalVariable(*M, Int32Ty, /*isConstant=*/false, + GlobalValue::ExternalLinkage, /*Initializer=*/Zero, + "gv_user0"); + new GlobalVariable(*M, Int32Ty, /*isConstant=*/false, + GlobalValue::ExternalLinkage, /*Initializer=*/Zero, + "gv_user1"); + + for (int I = 0; I != 3; ++I) { + EXPECT_DEATH(Zero->hasNUses(I), "hasUseList\\(\\)"); + EXPECT_DEATH(Zero->hasNUsesOrMore(I), "hasUseList\\(\\)"); + } +} + +#endif +#endif + TEST(ConstantsTest, Integer_i1) { LLVMContext Context; IntegerType *Int1 = IntegerType::get(Context, 1); From 0ba7bfc34fa292a34b31f6338464744b5cad58a3 Mon Sep 17 00:00:00 2001 From: Yingying Wang <3171290993@qq.com> Date: Tue, 4 Nov 2025 13:25:49 +0800 Subject: [PATCH 125/313] [DFAJumpThreading] Enable DFAJumpThread by default. (#157646) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We recommend setting `dfa-jump-thread` to be enabled by default. It’s a mature optimization that’s been supported since GCC 9.1.0. At the `-O2` opt level, both the GCC and ICX compilers have this optimization enabled by default. Once it’s enabled, we saw a **13% performance improvement** in the CoreMark benchmark on the X86 platform (Intel i9-11900K Rocket Lake), and even a **15% increase** on the KunMingHu FPGA. Additionally, we verified the correctness of this pass using SPEC 2017. 
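For reference, the shape of code this pass targets is a switch-in-a-loop
state machine. The sketch below is written for this note; the function name,
state values, and accumulator are illustrative assumptions, not taken from
CoreMark or SPEC:

```cpp
// Each case assigns a compile-time-constant next state, so DFA jump
// threading can thread every case directly to its known successor case,
// bypassing the loop-header switch dispatch on the following iteration.
int run_state_machine(const int *input, int n) {
  int state = 0, acc = 0;
  for (int i = 0; i < n; ++i) {
    switch (state) {
    case 0: acc += input[i]; state = 1; break;  // always followed by case 1
    case 1: acc -= input[i]; state = 2; break;  // always followed by case 2
    default: acc ^= input[i]; state = 0; break; // always followed by case 0
    }
  }
  return acc;
}
```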
Co-authored-by: YinZd <104915588+nothiny@users.noreply.github.com> Co-authored-by: ict-ql <168183727+ict-ql@users.noreply.github.com> Co-authored-by: Lin Wang <wanglulin@ict.ac.cn> --- llvm/lib/Passes/PassBuilderPipelines.cpp | 2 +- llvm/test/Other/new-pm-defaults.ll | 1 + llvm/test/Other/new-pm-thinlto-postlink-defaults.ll | 1 + llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll | 1 + llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll | 1 + llvm/test/Other/new-pm-thinlto-prelink-defaults.ll | 1 + llvm/test/Other/new-pm-thinlto-prelink-pgo-defaults.ll | 1 + llvm/test/Other/new-pm-thinlto-prelink-samplepgo-defaults.ll | 1 + 8 files changed, 8 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp index bd03ac090721c..3f41618b18fcf 100644 --- a/llvm/lib/Passes/PassBuilderPipelines.cpp +++ b/llvm/lib/Passes/PassBuilderPipelines.cpp @@ -228,7 +228,7 @@ static cl::opt<bool> EnableLoopHeaderDuplication( static cl::opt<bool> EnableDFAJumpThreading("enable-dfa-jump-thread", cl::desc("Enable DFA jump threading"), - cl::init(false), cl::Hidden); + cl::init(true), cl::Hidden); static cl::opt<bool> EnableHotColdSplit("hot-cold-split", diff --git a/llvm/test/Other/new-pm-defaults.ll b/llvm/test/Other/new-pm-defaults.ll index 65b96c8b8ef5d..62975a3cf8ac4 100644 --- a/llvm/test/Other/new-pm-defaults.ll +++ b/llvm/test/Other/new-pm-defaults.ll @@ -208,6 +208,7 @@ ; CHECK-O-NEXT: Running analysis: DemandedBitsAnalysis ; CHECK-O-NEXT: Running pass: InstCombinePass ; CHECK-EP-PEEPHOLE-NEXT: Running pass: NoOpFunctionPass +; CHECK-O23SZ-NEXT: Running pass: DFAJumpThreadingPass ; CHECK-O23SZ-NEXT: Running pass: JumpThreadingPass ; CHECK-O23SZ-NEXT: Running analysis: LazyValueAnalysis ; CHECK-O23SZ-NEXT: Running pass: CorrelatedValuePropagationPass diff --git a/llvm/test/Other/new-pm-thinlto-postlink-defaults.ll b/llvm/test/Other/new-pm-thinlto-postlink-defaults.ll index 3a0fffe426da1..012a1ab5802b5 100644 --- a/llvm/test/Other/new-pm-thinlto-postlink-defaults.ll +++ b/llvm/test/Other/new-pm-thinlto-postlink-defaults.ll @@ -133,6 +133,7 @@ ; CHECK-O-NEXT: Running pass: BDCEPass ; CHECK-O-NEXT: Running analysis: DemandedBitsAnalysis ; CHECK-O-NEXT: Running pass: InstCombinePass +; CHECK-O23SZ-NEXT: Running pass: DFAJumpThreadingPass ; CHECK-O23SZ-NEXT: Running pass: JumpThreadingPass ; CHECK-O23SZ-NEXT: Running analysis: LazyValueAnalysis ; CHECK-O23SZ-NEXT: Running pass: CorrelatedValuePropagationPass diff --git a/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll b/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll index 4623edcaf6656..e021ff3124b60 100644 --- a/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll +++ b/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll @@ -118,6 +118,7 @@ ; CHECK-O-NEXT: Running pass: BDCEPass ; CHECK-O-NEXT: Running analysis: DemandedBitsAnalysis ; CHECK-O-NEXT: Running pass: InstCombinePass +; CHECK-O23SZ-NEXT: Running pass: DFAJumpThreadingPass ; CHECK-O23SZ-NEXT: Running pass: JumpThreadingPass ; CHECK-O23SZ-NEXT: Running analysis: LazyValueAnalysis ; CHECK-O23SZ-NEXT: Running pass: CorrelatedValuePropagationPass diff --git a/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll b/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll index 590afd925e841..20f94bc2e0f6c 100644 --- a/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll +++ b/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll @@ -127,6 +127,7 @@ ; CHECK-O-NEXT: Running 
pass: BDCEPass ; CHECK-O-NEXT: Running analysis: DemandedBitsAnalysis ; CHECK-O-NEXT: Running pass: InstCombinePass +; CHECK-O23SZ-NEXT: Running pass: DFAJumpThreadingPass ; CHECK-O23SZ-NEXT: Running pass: JumpThreadingPass ; CHECK-O23SZ-NEXT: Running analysis: LazyValueAnalysis ; CHECK-O23SZ-NEXT: Running pass: CorrelatedValuePropagationPass diff --git a/llvm/test/Other/new-pm-thinlto-prelink-defaults.ll b/llvm/test/Other/new-pm-thinlto-prelink-defaults.ll index dd6acd2c51ee7..b61edc805108d 100644 --- a/llvm/test/Other/new-pm-thinlto-prelink-defaults.ll +++ b/llvm/test/Other/new-pm-thinlto-prelink-defaults.ll @@ -165,6 +165,7 @@ ; CHECK-O-NEXT: Running pass: BDCEPass ; CHECK-O-NEXT: Running analysis: DemandedBitsAnalysis ; CHECK-O-NEXT: Running pass: InstCombinePass +; CHECK-O23SZ-NEXT: Running pass: DFAJumpThreadingPass ; CHECK-O23SZ-NEXT: Running pass: JumpThreadingPass ; CHECK-O23SZ-NEXT: Running analysis: LazyValueAnalysis ; CHECK-O23SZ-NEXT: Running pass: CorrelatedValuePropagationPass diff --git a/llvm/test/Other/new-pm-thinlto-prelink-pgo-defaults.ll b/llvm/test/Other/new-pm-thinlto-prelink-pgo-defaults.ll index ee054527e20bd..acf8c053d0f82 100644 --- a/llvm/test/Other/new-pm-thinlto-prelink-pgo-defaults.ll +++ b/llvm/test/Other/new-pm-thinlto-prelink-pgo-defaults.ll @@ -167,6 +167,7 @@ ; CHECK-O-NEXT: Running pass: BDCEPass ; CHECK-O-NEXT: Running analysis: DemandedBitsAnalysis ; CHECK-O-NEXT: Running pass: InstCombinePass +; CHECK-O23SZ-NEXT: Running pass: DFAJumpThreadingPass ; CHECK-O23SZ-NEXT: Running pass: JumpThreadingPass ; CHECK-O23SZ-NEXT: Running analysis: LazyValueAnalysis ; CHECK-O23SZ-NEXT: Running pass: CorrelatedValuePropagationPass diff --git a/llvm/test/Other/new-pm-thinlto-prelink-samplepgo-defaults.ll b/llvm/test/Other/new-pm-thinlto-prelink-samplepgo-defaults.ll index fd95e94f3c8b9..6b3c5ca7605de 100644 --- a/llvm/test/Other/new-pm-thinlto-prelink-samplepgo-defaults.ll +++ b/llvm/test/Other/new-pm-thinlto-prelink-samplepgo-defaults.ll @@ -131,6 +131,7 @@ ; CHECK-O-NEXT: Running pass: BDCEPass ; CHECK-O-NEXT: Running analysis: DemandedBitsAnalysis ; CHECK-O-NEXT: Running pass: InstCombinePass +; CHECK-O23SZ-NEXT: Running pass: DFAJumpThreadingPass ; CHECK-O23SZ-NEXT: Running pass: JumpThreadingPass ; CHECK-O23SZ-NEXT: Running analysis: LazyValueAnalysis ; CHECK-O23SZ-NEXT: Running pass: CorrelatedValuePropagationPass From 52fdcf94a39a811476654e23176c6ffa69ff7287 Mon Sep 17 00:00:00 2001 From: Piotr Fusik <p.fusik@samsung.com> Date: Tue, 4 Nov 2025 07:55:31 +0100 Subject: [PATCH 126/313] [RISCV][NFC] Match `3/5/9 * 3/5/9 * 2^N` without a loop (#165547) #158851 matches `3/5/9 * 3/5/9` with a `switch`. Reuse it for the shifted case to improve compilation time. 
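As a worked example (the multiplier 200 is chosen purely for illustration and
does not appear in the patch): `countr_zero(200)` gives `Shift = 3`,
`200 >> 3 = 25` hits the `5 * 5` case, and the result is two shXadd-style
operations followed by a single shift. A C++ sketch of the equivalent
arithmetic:

```cpp
#include <cstdint>

// 200 * x decomposed as 5 * 5 * 2^3, mirroring sh2add + sh2add + slli.
uint64_t mul200(uint64_t x) {
  uint64_t m = (x << 2) + x; // sh2add: m = 5 * x
  m = (m << 2) + m;          // sh2add: m = 25 * x
  return m << 3;             // slli:   200 * x
}
```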
--- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 99 +++++++++------------ 1 file changed, 43 insertions(+), 56 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index e0cf739f67d9b..c56ce3fd2a5a4 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -16495,6 +16495,35 @@ static SDValue expandMulToAddOrSubOfShl(SDNode *N, SelectionDAG &DAG, return DAG.getNode(Op, DL, VT, Shift1, Shift2); } +static SDValue getShlAddShlAdd(SDNode *N, SelectionDAG &DAG, unsigned ShX, + unsigned ShY) { + SDLoc DL(N); + EVT VT = N->getValueType(0); + SDValue X = N->getOperand(0); + SDValue Mul359 = DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X, + DAG.getConstant(ShY, DL, VT), X); + return DAG.getNode(RISCVISD::SHL_ADD, DL, VT, Mul359, + DAG.getConstant(ShX, DL, VT), Mul359); +} + +static SDValue expandMulToShlAddShlAdd(SDNode *N, SelectionDAG &DAG, + uint64_t MulAmt) { + switch (MulAmt) { + case 5 * 3: + return getShlAddShlAdd(N, DAG, 2, 1); + case 9 * 3: + return getShlAddShlAdd(N, DAG, 3, 1); + case 5 * 5: + return getShlAddShlAdd(N, DAG, 2, 2); + case 9 * 5: + return getShlAddShlAdd(N, DAG, 3, 2); + case 9 * 9: + return getShlAddShlAdd(N, DAG, 3, 3); + default: + return SDValue(); + } +} + // Try to expand a scalar multiply to a faster sequence. static SDValue expandMul(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, @@ -16524,18 +16553,17 @@ static SDValue expandMul(SDNode *N, SelectionDAG &DAG, if (Subtarget.hasVendorXqciac() && isInt<12>(CNode->getSExtValue())) return SDValue(); - // WARNING: The code below is knowingly incorrect with regards to undef semantics. - // We're adding additional uses of X here, and in principle, we should be freezing - // X before doing so. However, adding freeze here causes real regressions, and no - // other target properly freezes X in these cases either. - SDValue X = N->getOperand(0); - + // WARNING: The code below is knowingly incorrect with regards to undef + // semantics. We're adding additional uses of X here, and in principle, we + // should be freezing X before doing so. However, adding freeze here causes + // real regressions, and no other target properly freezes X in these cases + // either. if (Subtarget.hasShlAdd(3)) { + SDValue X = N->getOperand(0); int Shift; if (int ShXAmount = isShifted359(MulAmt, Shift)) { // 3/5/9 * 2^N -> shl (shXadd X, X), N SDLoc DL(N); - SDValue X = N->getOperand(0); // Put the shift first if we can fold a zext into the shift forming // a slli.uw. if (X.getOpcode() == ISD::AND && isa<ConstantSDNode>(X.getOperand(1)) && @@ -16554,38 +16582,8 @@ static SDValue expandMul(SDNode *N, SelectionDAG &DAG, } // 3/5/9 * 3/5/9 -> shXadd (shYadd X, X), (shYadd X, X) - int ShX; - int ShY; - switch (MulAmt) { - case 3 * 5: - ShY = 1; - ShX = 2; - break; - case 3 * 9: - ShY = 1; - ShX = 3; - break; - case 5 * 5: - ShX = ShY = 2; - break; - case 5 * 9: - ShY = 2; - ShX = 3; - break; - case 9 * 9: - ShX = ShY = 3; - break; - default: - ShX = ShY = 0; - break; - } - if (ShX) { - SDLoc DL(N); - SDValue Mul359 = DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X, - DAG.getConstant(ShY, DL, VT), X); - return DAG.getNode(RISCVISD::SHL_ADD, DL, VT, Mul359, - DAG.getConstant(ShX, DL, VT), Mul359); - } + if (SDValue V = expandMulToShlAddShlAdd(N, DAG, MulAmt)) + return V; // If this is a power 2 + 2/4/8, we can use a shift followed by a single // shXadd. 
First check if this a sum of two power of 2s because that's @@ -16648,23 +16646,12 @@ static SDValue expandMul(SDNode *N, SelectionDAG &DAG, } } - for (uint64_t Divisor : {3, 5, 9}) { - if (MulAmt % Divisor != 0) - continue; - uint64_t MulAmt2 = MulAmt / Divisor; - // 3/5/9 * 3/5/9 * 2^N - In particular, this covers multiples - // of 25 which happen to be quite common. - if (int ShBAmount = isShifted359(MulAmt2, Shift)) { - SDLoc DL(N); - SDValue Mul359A = - DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X, - DAG.getConstant(Log2_64(Divisor - 1), DL, VT), X); - SDValue Mul359B = - DAG.getNode(RISCVISD::SHL_ADD, DL, VT, Mul359A, - DAG.getConstant(ShBAmount, DL, VT), Mul359A); - return DAG.getNode(ISD::SHL, DL, VT, Mul359B, - DAG.getConstant(Shift, DL, VT)); - } + // 3/5/9 * 3/5/9 * 2^N - In particular, this covers multiples + // of 25 which happen to be quite common. + Shift = llvm::countr_zero(MulAmt); + if (SDValue V = expandMulToShlAddShlAdd(N, DAG, MulAmt >> Shift)) { + SDLoc DL(N); + return DAG.getNode(ISD::SHL, DL, VT, V, DAG.getConstant(Shift, DL, VT)); } } From 546a783d81d31a6a2d83a8b92c88223756a16c8d Mon Sep 17 00:00:00 2001 From: shore <372660931@qq.com> Date: Tue, 4 Nov 2025 15:01:18 +0800 Subject: [PATCH 127/313] [Attributor] Fix sanitizer for getAssumedMinTrailingZeros() --- llvm/include/llvm/Transforms/IPO/Attributor.h | 2 ++ llvm/test/lit.cfg.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/llvm/include/llvm/Transforms/IPO/Attributor.h b/llvm/include/llvm/Transforms/IPO/Attributor.h index 8c0342ae5cf12..eb35e3644bd02 100644 --- a/llvm/include/llvm/Transforms/IPO/Attributor.h +++ b/llvm/include/llvm/Transforms/IPO/Attributor.h @@ -5341,6 +5341,8 @@ struct AAPotentialConstantValues /// Return the minimum trailing zeros of potential constants unsigned getAssumedMinTrailingZeros() const { + if (!isValidState() || getAssumedSet().empty()) + return 0; unsigned TrailingZeros = getAssumedSet().begin()->getBitWidth() + 1; for (const APInt &It : getAssumedSet()) { if (It.countTrailingZeros() < TrailingZeros) diff --git a/llvm/test/lit.cfg.py b/llvm/test/lit.cfg.py index cadf781b409be..89f1ca6935cff 100644 --- a/llvm/test/lit.cfg.py +++ b/llvm/test/lit.cfg.py @@ -584,7 +584,7 @@ def have_cxx_shared_library(): print("could not exec llvm-readobj") return False - readobj_out = readobj_cmd.stdout.read().decode("ascii") + readobj_out = readobj_cmd.stdout.read().decode("utf-8") readobj_cmd.wait() regex = re.compile(r"(libc\+\+|libstdc\+\+|msvcp).*\.(so|dylib|dll)") From 04619db16ba840f1af2ad66f1d3eda9dc086a059 Mon Sep 17 00:00:00 2001 From: quic_hchandel <hchandel@qti.qualcomm.com> Date: Tue, 4 Nov 2025 12:38:50 +0530 Subject: [PATCH 128/313] [RISCV] Add short forward branch support for `mul` instruction (#166300) --- .../Target/RISCV/RISCVExpandPseudoInsts.cpp | 2 + llvm/lib/Target/RISCV/RISCVFeatures.td | 5 + llvm/lib/Target/RISCV/RISCVInstrInfo.cpp | 4 + llvm/lib/Target/RISCV/RISCVInstrInfoSFB.td | 1 + llvm/test/CodeGen/RISCV/features-info.ll | 1 + .../RISCV/short-forward-branch-opt-mul.ll | 156 ++++++++++++++++++ 6 files changed, 169 insertions(+) create mode 100644 llvm/test/CodeGen/RISCV/short-forward-branch-opt-mul.ll diff --git a/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp b/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp index 526675a682d86..b0453fc57c053 100644 --- a/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp +++ b/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp @@ -131,6 +131,7 @@ bool RISCVExpandPseudo::expandMI(MachineBasicBlock &MBB, case 
RISCV::PseudoCCMAXU: case RISCV::PseudoCCMIN: case RISCV::PseudoCCMINU: + case RISCV::PseudoCCMUL: case RISCV::PseudoCCADDW: case RISCV::PseudoCCSUBW: case RISCV::PseudoCCSLL: @@ -237,6 +238,7 @@ bool RISCVExpandPseudo::expandCCOp(MachineBasicBlock &MBB, case RISCV::PseudoCCMIN: NewOpc = RISCV::MIN; break; case RISCV::PseudoCCMAXU: NewOpc = RISCV::MAXU; break; case RISCV::PseudoCCMINU: NewOpc = RISCV::MINU; break; + case RISCV::PseudoCCMUL: NewOpc = RISCV::MUL; break; case RISCV::PseudoCCADDI: NewOpc = RISCV::ADDI; break; case RISCV::PseudoCCSLLI: NewOpc = RISCV::SLLI; break; case RISCV::PseudoCCSRLI: NewOpc = RISCV::SRLI; break; diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td index cfee6ab22d4ff..5b72334f58d45 100644 --- a/llvm/lib/Target/RISCV/RISCVFeatures.td +++ b/llvm/lib/Target/RISCV/RISCVFeatures.td @@ -1856,6 +1856,11 @@ def TuneShortForwardBranchIMinMax "true", "Enable short forward branch optimization for min,max instructions in Zbb", [TuneShortForwardBranchOpt]>; +def TuneShortForwardBranchIMul + : SubtargetFeature<"short-forward-branch-i-mul", "HasShortForwardBranchIMul", + "true", "Enable short forward branch optimization for mul instruction", + [TuneShortForwardBranchOpt]>; + // Some subtargets require a S2V transfer buffer to move scalars into vectors. // FIXME: Forming .vx/.vf/.wx/.wf can reduce register pressure. def TuneNoSinkSplatOperands diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp index c9df787e0012d..b8ab70bd9e386 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp @@ -1703,6 +1703,7 @@ unsigned getPredicatedOpcode(unsigned Opcode) { case RISCV::MAXU: return RISCV::PseudoCCMAXU; case RISCV::MIN: return RISCV::PseudoCCMIN; case RISCV::MINU: return RISCV::PseudoCCMINU; + case RISCV::MUL: return RISCV::PseudoCCMUL; case RISCV::ADDI: return RISCV::PseudoCCADDI; case RISCV::SLLI: return RISCV::PseudoCCSLLI; @@ -1754,6 +1755,9 @@ static MachineInstr *canFoldAsPredicatedOp(Register Reg, MI->getOpcode() == RISCV::MINU || MI->getOpcode() == RISCV::MAXU)) return nullptr; + if (!STI.hasShortForwardBranchIMul() && MI->getOpcode() == RISCV::MUL) + return nullptr; + // Check if MI can be predicated and folded into the CCMOV. if (getPredicatedOpcode(MI->getOpcode()) == RISCV::INSTRUCTION_LIST_END) return nullptr; diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoSFB.td b/llvm/lib/Target/RISCV/RISCVInstrInfoSFB.td index 5a67a5aaba293..494b1c9f98839 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoSFB.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoSFB.td @@ -110,6 +110,7 @@ def PseudoCCMAX : SFBALU_rr; def PseudoCCMIN : SFBALU_rr; def PseudoCCMAXU : SFBALU_rr; def PseudoCCMINU : SFBALU_rr; +def PseudoCCMUL : SFBALU_rr; def PseudoCCADDI : SFBALU_ri; def PseudoCCANDI : SFBALU_ri; diff --git a/llvm/test/CodeGen/RISCV/features-info.ll b/llvm/test/CodeGen/RISCV/features-info.ll index 988d0490afeb6..cf44af608542c 100644 --- a/llvm/test/CodeGen/RISCV/features-info.ll +++ b/llvm/test/CodeGen/RISCV/features-info.ll @@ -137,6 +137,7 @@ ; CHECK-NEXT: shifted-zextw-fusion - Enable SLLI+SRLI to be fused when computing (shifted) word zero extension. ; CHECK-NEXT: shlcofideleg - 'Shlcofideleg' (Delegating LCOFI Interrupts to VS-mode). ; CHECK-NEXT: short-forward-branch-i-minmax - Enable short forward branch optimization for min,max instructions in Zbb. +; CHECK-NEXT: short-forward-branch-i-mul - Enable short forward branch optimization for mul instruction. 
; CHECK-NEXT: short-forward-branch-opt - Enable short forward branch optimization. ; CHECK-NEXT: shtvala - 'Shtvala' (htval provides all needed values). ; CHECK-NEXT: shvsatpa - 'Shvsatpa' (vsatp supports all modes supported by satp). diff --git a/llvm/test/CodeGen/RISCV/short-forward-branch-opt-mul.ll b/llvm/test/CodeGen/RISCV/short-forward-branch-opt-mul.ll new file mode 100644 index 0000000000000..3f780fddafcce --- /dev/null +++ b/llvm/test/CodeGen/RISCV/short-forward-branch-opt-mul.ll @@ -0,0 +1,156 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc < %s -mtriple=riscv32 -mattr=+m | FileCheck %s --check-prefixes=RV32I-M +; RUN: llc < %s -mtriple=riscv64 -mattr=+m | FileCheck %s --check-prefixes=RV64I-M +; RUN: llc < %s -mtriple=riscv32 -mattr=+m,+short-forward-branch-opt | \ +; RUN: FileCheck %s --check-prefixes=RV32I-SFB-M +; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+short-forward-branch-opt | \ +; RUN: FileCheck %s --check-prefixes=RV64I-SFB-M +; RUN: llc < %s -mtriple=riscv32 -mattr=+m,+short-forward-branch-i-mul | \ +; RUN: FileCheck %s --check-prefixes=RV32I-SFBIMul-M +; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+short-forward-branch-i-mul | \ +; RUN: FileCheck %s --check-prefixes=RV64I-SFBIMul-M + +define i32 @select_example_mul_i32(i32 %a, i32 %b, i1 zeroext %x, i32 %y) { +; RV32I-M-LABEL: select_example_mul_i32: +; RV32I-M: # %bb.0: # %entry +; RV32I-M-NEXT: beqz a2, .LBB0_2 +; RV32I-M-NEXT: # %bb.1: +; RV32I-M-NEXT: mul a1, a0, a3 +; RV32I-M-NEXT: .LBB0_2: # %entry +; RV32I-M-NEXT: mv a0, a1 +; RV32I-M-NEXT: ret +; +; RV64I-M-LABEL: select_example_mul_i32: +; RV64I-M: # %bb.0: # %entry +; RV64I-M-NEXT: beqz a2, .LBB0_2 +; RV64I-M-NEXT: # %bb.1: +; RV64I-M-NEXT: mulw a1, a0, a3 +; RV64I-M-NEXT: .LBB0_2: # %entry +; RV64I-M-NEXT: mv a0, a1 +; RV64I-M-NEXT: ret +; +; RV32I-SFB-M-LABEL: select_example_mul_i32: +; RV32I-SFB-M: # %bb.0: # %entry +; RV32I-SFB-M-NEXT: mul a0, a0, a3 +; RV32I-SFB-M-NEXT: bnez a2, .LBB0_2 +; RV32I-SFB-M-NEXT: # %bb.1: # %entry +; RV32I-SFB-M-NEXT: mv a0, a1 +; RV32I-SFB-M-NEXT: .LBB0_2: # %entry +; RV32I-SFB-M-NEXT: ret +; +; RV64I-SFB-M-LABEL: select_example_mul_i32: +; RV64I-SFB-M: # %bb.0: # %entry +; RV64I-SFB-M-NEXT: mulw a0, a0, a3 +; RV64I-SFB-M-NEXT: bnez a2, .LBB0_2 +; RV64I-SFB-M-NEXT: # %bb.1: # %entry +; RV64I-SFB-M-NEXT: mv a0, a1 +; RV64I-SFB-M-NEXT: .LBB0_2: # %entry +; RV64I-SFB-M-NEXT: ret +; +; RV32I-SFBIMul-M-LABEL: select_example_mul_i32: +; RV32I-SFBIMul-M: # %bb.0: # %entry +; RV32I-SFBIMul-M-NEXT: beqz a2, .LBB0_2 +; RV32I-SFBIMul-M-NEXT: # %bb.1: # %entry +; RV32I-SFBIMul-M-NEXT: mul a1, a0, a3 +; RV32I-SFBIMul-M-NEXT: .LBB0_2: # %entry +; RV32I-SFBIMul-M-NEXT: mv a0, a1 +; RV32I-SFBIMul-M-NEXT: ret +; +; RV64I-SFBIMul-M-LABEL: select_example_mul_i32: +; RV64I-SFBIMul-M: # %bb.0: # %entry +; RV64I-SFBIMul-M-NEXT: mulw a0, a0, a3 +; RV64I-SFBIMul-M-NEXT: bnez a2, .LBB0_2 +; RV64I-SFBIMul-M-NEXT: # %bb.1: # %entry +; RV64I-SFBIMul-M-NEXT: mv a0, a1 +; RV64I-SFBIMul-M-NEXT: .LBB0_2: # %entry +; RV64I-SFBIMul-M-NEXT: ret +entry: + %res = mul i32 %a, %y + %sel = select i1 %x, i32 %res, i32 %b + ret i32 %sel +} + +define i64 @select_example_mul_i64(i64 %a, i64 %b, i1 zeroext %x, i64 %y) { +; RV32I-M-LABEL: select_example_mul_i64: +; RV32I-M: # %bb.0: # %entry +; RV32I-M-NEXT: beqz a4, .LBB1_2 +; RV32I-M-NEXT: # %bb.1: +; RV32I-M-NEXT: mul a2, a0, a6 +; RV32I-M-NEXT: mulhu a3, a0, a5 +; RV32I-M-NEXT: mul a1, a1, a5 +; RV32I-M-NEXT: add a2, a3, a2 +; RV32I-M-NEXT: add a3, 
a2, a1 +; RV32I-M-NEXT: mul a2, a0, a5 +; RV32I-M-NEXT: .LBB1_2: # %entry +; RV32I-M-NEXT: mv a0, a2 +; RV32I-M-NEXT: mv a1, a3 +; RV32I-M-NEXT: ret +; +; RV64I-M-LABEL: select_example_mul_i64: +; RV64I-M: # %bb.0: # %entry +; RV64I-M-NEXT: beqz a2, .LBB1_2 +; RV64I-M-NEXT: # %bb.1: +; RV64I-M-NEXT: mul a1, a0, a3 +; RV64I-M-NEXT: .LBB1_2: # %entry +; RV64I-M-NEXT: mv a0, a1 +; RV64I-M-NEXT: ret +; +; RV32I-SFB-M-LABEL: select_example_mul_i64: +; RV32I-SFB-M: # %bb.0: # %entry +; RV32I-SFB-M-NEXT: mul a6, a0, a6 +; RV32I-SFB-M-NEXT: mulhu a7, a0, a5 +; RV32I-SFB-M-NEXT: mul a1, a1, a5 +; RV32I-SFB-M-NEXT: mul a0, a0, a5 +; RV32I-SFB-M-NEXT: add a6, a7, a6 +; RV32I-SFB-M-NEXT: beqz a4, .LBB1_2 +; RV32I-SFB-M-NEXT: # %bb.1: # %entry +; RV32I-SFB-M-NEXT: add a3, a6, a1 +; RV32I-SFB-M-NEXT: .LBB1_2: # %entry +; RV32I-SFB-M-NEXT: bnez a4, .LBB1_4 +; RV32I-SFB-M-NEXT: # %bb.3: # %entry +; RV32I-SFB-M-NEXT: mv a0, a2 +; RV32I-SFB-M-NEXT: .LBB1_4: # %entry +; RV32I-SFB-M-NEXT: mv a1, a3 +; RV32I-SFB-M-NEXT: ret +; +; RV64I-SFB-M-LABEL: select_example_mul_i64: +; RV64I-SFB-M: # %bb.0: # %entry +; RV64I-SFB-M-NEXT: mul a0, a0, a3 +; RV64I-SFB-M-NEXT: bnez a2, .LBB1_2 +; RV64I-SFB-M-NEXT: # %bb.1: # %entry +; RV64I-SFB-M-NEXT: mv a0, a1 +; RV64I-SFB-M-NEXT: .LBB1_2: # %entry +; RV64I-SFB-M-NEXT: ret +; +; RV32I-SFBIMul-M-LABEL: select_example_mul_i64: +; RV32I-SFBIMul-M: # %bb.0: # %entry +; RV32I-SFBIMul-M-NEXT: mul a6, a0, a6 +; RV32I-SFBIMul-M-NEXT: mulhu a7, a0, a5 +; RV32I-SFBIMul-M-NEXT: mul a1, a1, a5 +; RV32I-SFBIMul-M-NEXT: add a6, a7, a6 +; RV32I-SFBIMul-M-NEXT: beqz a4, .LBB1_2 +; RV32I-SFBIMul-M-NEXT: # %bb.1: # %entry +; RV32I-SFBIMul-M-NEXT: add a3, a6, a1 +; RV32I-SFBIMul-M-NEXT: .LBB1_2: # %entry +; RV32I-SFBIMul-M-NEXT: beqz a4, .LBB1_4 +; RV32I-SFBIMul-M-NEXT: # %bb.3: # %entry +; RV32I-SFBIMul-M-NEXT: mul a2, a0, a5 +; RV32I-SFBIMul-M-NEXT: .LBB1_4: # %entry +; RV32I-SFBIMul-M-NEXT: mv a0, a2 +; RV32I-SFBIMul-M-NEXT: mv a1, a3 +; RV32I-SFBIMul-M-NEXT: ret +; +; RV64I-SFBIMul-M-LABEL: select_example_mul_i64: +; RV64I-SFBIMul-M: # %bb.0: # %entry +; RV64I-SFBIMul-M-NEXT: beqz a2, .LBB1_2 +; RV64I-SFBIMul-M-NEXT: # %bb.1: # %entry +; RV64I-SFBIMul-M-NEXT: mul a1, a0, a3 +; RV64I-SFBIMul-M-NEXT: .LBB1_2: # %entry +; RV64I-SFBIMul-M-NEXT: mv a0, a1 +; RV64I-SFBIMul-M-NEXT: ret +entry: + %res = mul i64 %a, %y + %sel = select i1 %x, i64 %res, i64 %b + ret i64 %sel +} + From 25a592cc63f0a252b8ae980271d0767a342a0a77 Mon Sep 17 00:00:00 2001 From: Srinivasa Ravi <srinivasar@nvidia.com> Date: Tue, 4 Nov 2025 12:50:58 +0530 Subject: [PATCH 129/313] [MLIR][NVVM] Update redux.sync op (#166125) This change: - Updates the `redux.sync` NVVM Op input and output type constraints. - Adds a verifier for the Op to prevent stack dumps and hitting `llvm_unreachable` in certain invalid usage scenarios. Instead, we gracefully error out with an informative message now. 
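For illustration, a use that the new verifier accepts could look like the
following (a minimal sketch in the syntax of the tests added below; the
function and value names are hypothetical):

  llvm.func @redux_sync_valid(%value: f32, %mask_and_clamp: i32) {
    // abs/nan and the fmin/fmax kinds are only legal with f32 values.
    %res = nvvm.redux.sync fmax %value, %mask_and_clamp {abs = true, nan = true}: f32 -> f32
    llvm.return
  }

Integer kinds on f32 values, fmin/fmax on i32, abs/nan on i32, and
mismatched result/operand types are now rejected with the diagnostics
exercised in redux-sync-invalid.mlir.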
--- mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td | 8 +-- mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp | 37 ++++++++++++++ .../Dialect/NVVM/NVVMToLLVMIRTranslation.cpp | 3 -- .../LLVMIR/nvvm/redux-sync-invalid.mlir | 49 +++++++++++++++++++ 4 files changed, 91 insertions(+), 6 deletions(-) create mode 100644 mlir/test/Target/LLVMIR/nvvm/redux-sync-invalid.mlir diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td index 46fdf5441bc13..9be108d5d1056 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td @@ -476,9 +476,9 @@ def ReduxKind : I32EnumAttr<"ReduxKind", "NVVM redux kind", def ReduxKindAttr : EnumAttr<NVVM_Dialect, ReduxKind, "redux_kind">; def NVVM_ReduxOp : - NVVM_Op<"redux.sync", [NVVMRequiresSM<80>]>, - Results<(outs LLVM_Type:$res)>, - Arguments<(ins LLVM_Type:$val, + NVVM_Op<"redux.sync", [NVVMRequiresSM<80>, AllTypesMatch<["res", "val"]>]>, + Results<(outs AnyTypeOf<[I32, F32]>:$res)>, + Arguments<(ins AnyTypeOf<[I32, F32]>:$val, ReduxKindAttr:$kind, I32:$mask_and_clamp, DefaultValuedAttr<BoolAttr, "false">:$abs, @@ -496,6 +496,8 @@ def NVVM_ReduxOp : [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions-redux-sync) }]; + let hasVerifier = 1; + string llvmBuilder = [{ auto intId = getReduxIntrinsicId($_resultType, $kind, $abs, $nan); $res = createIntrinsicCall(builder, intId, {$val, $mask_and_clamp}); diff --git a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp index 12c81629d7e76..f2e55f255ceac 100644 --- a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp +++ b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp @@ -1630,6 +1630,43 @@ LogicalResult NVVM::ClusterLaunchControlQueryCancelOp::verify() { return success(); } +LogicalResult NVVM::ReduxOp::verify() { + mlir::Type reduxType = getType(); + + if (!reduxType.isF32()) { + if (getAbs()) + return emitOpError("abs attribute is supported only for f32 type"); + if (getNan()) + return emitOpError("nan attribute is supported only for f32 type"); + } + + NVVM::ReduxKind kind = getKind(); + switch (kind) { + case NVVM::ReduxKind::ADD: + case NVVM::ReduxKind::AND: + case NVVM::ReduxKind::OR: + case NVVM::ReduxKind::XOR: + case NVVM::ReduxKind::MAX: + case NVVM::ReduxKind::MIN: + case NVVM::ReduxKind::UMAX: + case NVVM::ReduxKind::UMIN: + if (!reduxType.isInteger(32)) + return emitOpError("'") + << stringifyEnum(kind) << "' redux kind unsupported with " + << reduxType << " type. Only supported type is 'i32'."; + break; + case NVVM::ReduxKind::FMIN: + case NVVM::ReduxKind::FMAX: + if (!reduxType.isF32()) + return emitOpError("'") + << stringifyEnum(kind) << "' redux kind unsupported with " + << reduxType << " type. Only supported type is 'f32'."; + break; + } + + return success(); +} + /// Packs the given `field` into the `result`. /// The `result` is 64-bits and each `field` can be 32-bits or narrower. 
static llvm::Value *
diff --git a/mlir/lib/Target/LLVMIR/Dialect/NVVM/NVVMToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/NVVM/NVVMToLLVMIRTranslation.cpp
index 3d86b09b32538..0964e1b8c5ef3 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/NVVM/NVVMToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/NVVM/NVVMToLLVMIRTranslation.cpp
@@ -36,9 +36,6 @@ using mlir::LLVM::detail::createIntrinsicCall;
 static llvm::Intrinsic::ID getReduxIntrinsicId(llvm::Type *resultType,
                                                NVVM::ReduxKind kind,
                                                bool hasAbs, bool hasNaN) {
-  if (!(resultType->isIntegerTy(32) || resultType->isFloatTy()))
-    llvm_unreachable("unsupported data type for redux");
-
   switch (kind) {
   case NVVM::ReduxKind::ADD:
     return llvm::Intrinsic::nvvm_redux_sync_add;
diff --git a/mlir/test/Target/LLVMIR/nvvm/redux-sync-invalid.mlir b/mlir/test/Target/LLVMIR/nvvm/redux-sync-invalid.mlir
new file mode 100644
index 0000000000000..a8a743006fbf8
--- /dev/null
+++ b/mlir/test/Target/LLVMIR/nvvm/redux-sync-invalid.mlir
@@ -0,0 +1,49 @@
+// RUN: mlir-translate -verify-diagnostics -split-input-file -mlir-to-llvmir %s
+
+// -----
+
+llvm.func @redux_sync_i32_with_abs(%value: i32, %offset: i32) {
+  // expected-error@+1 {{abs attribute is supported only for f32 type}}
+  %res = nvvm.redux.sync add %value, %offset {abs = true}: i32 -> i32
+  llvm.return
+}
+
+// -----
+
+llvm.func @redux_sync_i32_with_nan(%value: i32, %offset: i32) {
+  // expected-error@+1 {{nan attribute is supported only for f32 type}}
+  %res = nvvm.redux.sync add %value, %offset {nan = true}: i32 -> i32
+  llvm.return
+}
+
+// -----
+
+llvm.func @redux_sync_f32_with_invalid_kind_add(%value: f32, %offset: i32) {
+  // expected-error@+1 {{'add' redux kind unsupported with 'f32' type. Only supported type is 'i32'.}}
+  %res = nvvm.redux.sync add %value, %offset: f32 -> f32
+  llvm.return
+}
+
+// -----
+
+llvm.func @redux_sync_f32_with_invalid_kind_and(%value: f32, %offset: i32) {
+  // expected-error@+1 {{'and' redux kind unsupported with 'f32' type. Only supported type is 'i32'.}}
+  %res = nvvm.redux.sync and %value, %offset: f32 -> f32
+  llvm.return
+}
+
+// -----
+
+llvm.func @redux_sync_i32_with_invalid_kind_fmin(%value: i32, %offset: i32) {
+  // expected-error@+1 {{'fmin' redux kind unsupported with 'i32' type. Only supported type is 'f32'.}}
+  %res = nvvm.redux.sync fmin %value, %offset: i32 -> i32
+  llvm.return
+}
+
+// -----
+
+llvm.func @redux_sync_non_matching_types(%value: i32, %offset: i32) {
+  // expected-error@+1 {{failed to verify that all of {res, val} have same type}}
+  %res = nvvm.redux.sync add %value, %offset: i32 -> f32
+  llvm.return
+}
From 6ad25c5912fcf13b44fcc03bd6a66dc33348cd68 Mon Sep 17 00:00:00 2001
From: David Green <david.green@arm.com>
Date: Tue, 4 Nov 2025 07:50:51 +0000
Subject: [PATCH 130/313] [AArch64] Improve the cost model for extending mull (#125651)

We already have cost model code for detecting extending mull multiplies
of the form `mul(ext, ext)`. Since it was added, the codegen for mull has
improved; this patch attempts to catch the cost model up.

The main idea is to incorporate extends of larger sizes. A vector
`v8i32 mul(zext(v8i8), zext(v8i8))` will be code-generated as
`zext(v8i16 mul(zext(v8i8), zext(v8i8)))`, i.e. umull+ushll+ushll2, so
the total cost should be about 3 if each instruction costs 1. Where
exactly we attribute the costs is debatable; this patch opts to set the
cost of the extend to 0 (or to the cost of whatever part of the extend
is not folded into the mull), while the mul gets the cost of the mull
plus the extra extends.
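As a worked example (an illustrative IR sketch, not one of the updated
tests; the function name is hypothetical), the costs now break down as:

  define <8 x i32> @umull_sketch(<8 x i8> %a, <8 x i8> %b) {
    %za = zext <8 x i8> %a to <8 x i32>   ; cost 0: absorbed by the umull
    %zb = zext <8 x i8> %b to <8 x i32>   ; cost 0: absorbed by the umull
    %m = mul <8 x i32> %za, %zb           ; cost ~3: umull + ushll + ushll2
    ret <8 x i32> %m
  }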
isWideningInstruction is split into two functions for the two types of
operands it supports: isSingleExtWideningInstruction now handles addw
instructions that extend the second operand, while
isBinExtWideningInstruction handles instructions like addl that extend
both operands.
---
 .../AArch64/AArch64TargetTransformInfo.cpp    | 158 ++++++--
 .../AArch64/AArch64TargetTransformInfo.h      |  14 +-
 .../CostModel/AArch64/arith-widening.ll       | 252 ++++++-------
 .../AArch64/fully-unrolled-cost.ll            |   2 +-
 .../partial-reduce-dot-product-mixed.ll       | 116 +++---
 .../partial-reduce-dot-product-neon.ll        |  16 +-
 .../AArch64/partial-reduce-dot-product.ll     | 351 +++++++-----------
 .../SLPVectorizer/AArch64/vecreduceadd.ll     |   4 +-
 8 files changed, 460 insertions(+), 453 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 5b5565afd62b1..10f2c80edc1b3 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -3007,9 +3007,9 @@ AArch64TTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
   llvm_unreachable("Unsupported register kind");
 }
 
-bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode,
-                                           ArrayRef<const Value *> Args,
-                                           Type *SrcOverrideTy) const {
+bool AArch64TTIImpl::isSingleExtWideningInstruction(
+    unsigned Opcode, Type *DstTy, ArrayRef<const Value *> Args,
+    Type *SrcOverrideTy) const {
   // A helper that returns a vector type from the given type. The number of
   // elements in type Ty determines the vector width.
   auto toVectorTy = [&](Type *ArgTy) {
@@ -3027,48 +3027,29 @@ bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode,
       (DstEltSize != 16 && DstEltSize != 32 && DstEltSize != 64))
     return false;
 
-  // Determine if the operation has a widening variant. We consider both the
-  // "long" (e.g., usubl) and "wide" (e.g., usubw) versions of the
-  // instructions.
-  //
-  // TODO: Add additional widening operations (e.g., shl, etc.) once we
-  // verify that their extending operands are eliminated during code
-  // generation.
   Type *SrcTy = SrcOverrideTy;
   switch (Opcode) {
-  case Instruction::Add: // UADDL(2), SADDL(2), UADDW(2), SADDW(2).
-  case Instruction::Sub: // USUBL(2), SSUBL(2), USUBW(2), SSUBW(2).
+  case Instruction::Add: // UADDW(2), SADDW(2).
+  case Instruction::Sub: { // USUBW(2), SSUBW(2).
     // The second operand needs to be an extend
     if (isa<SExtInst>(Args[1]) || isa<ZExtInst>(Args[1])) {
       if (!SrcTy)
         SrcTy =
             toVectorTy(cast<Instruction>(Args[1])->getOperand(0)->getType());
-    } else
+      break;
+    }
+
+    if (Opcode == Instruction::Sub)
       return false;
-    break;
-  case Instruction::Mul: { // SMULL(2), UMULL(2)
-    // Both operands need to be extends of the same type.
-    if ((isa<SExtInst>(Args[0]) && isa<SExtInst>(Args[1])) ||
-        (isa<ZExtInst>(Args[0]) && isa<ZExtInst>(Args[1]))) {
+
+    // UADDW(2), SADDW(2) can be commuted.
+    if (isa<SExtInst>(Args[0]) || isa<ZExtInst>(Args[0])) {
       if (!SrcTy)
         SrcTy =
             toVectorTy(cast<Instruction>(Args[0])->getOperand(0)->getType());
-    } else if (isa<ZExtInst>(Args[0]) || isa<ZExtInst>(Args[1])) {
-      // If one of the operands is a Zext and the other has enough zero bits to
-      // be treated as unsigned, we can still general a umull, meaning the zext
-      // is free.
-      KnownBits Known =
-          computeKnownBits(isa<ZExtInst>(Args[0]) ?
Args[1] : Args[0], DL); - if (Args[0]->getType()->getScalarSizeInBits() - - Known.Zero.countLeadingOnes() > - DstTy->getScalarSizeInBits() / 2) - return false; - if (!SrcTy) - SrcTy = toVectorTy(Type::getIntNTy(DstTy->getContext(), - DstTy->getScalarSizeInBits() / 2)); - } else - return false; - break; + break; + } + return false; } default: return false; @@ -3099,6 +3080,73 @@ bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode, return NumDstEls == NumSrcEls && 2 * SrcElTySize == DstEltSize; } +Type *AArch64TTIImpl::isBinExtWideningInstruction(unsigned Opcode, Type *DstTy, + ArrayRef<const Value *> Args, + Type *SrcOverrideTy) const { + if (Opcode != Instruction::Add && Opcode != Instruction::Sub && + Opcode != Instruction::Mul) + return nullptr; + + // Exit early if DstTy is not a vector type whose elements are one of [i16, + // i32, i64]. SVE doesn't generally have the same set of instructions to + // perform an extend with the add/sub/mul. There are SMULLB style + // instructions, but they operate on top/bottom, requiring some sort of lane + // interleaving to be used with zext/sext. + unsigned DstEltSize = DstTy->getScalarSizeInBits(); + if (!useNeonVector(DstTy) || Args.size() != 2 || + (DstEltSize != 16 && DstEltSize != 32 && DstEltSize != 64)) + return nullptr; + + auto getScalarSizeWithOverride = [&](const Value *V) { + if (SrcOverrideTy) + return SrcOverrideTy->getScalarSizeInBits(); + return cast<Instruction>(V) + ->getOperand(0) + ->getType() + ->getScalarSizeInBits(); + }; + + unsigned MaxEltSize = 0; + if ((isa<SExtInst>(Args[0]) && isa<SExtInst>(Args[1])) || + (isa<ZExtInst>(Args[0]) && isa<ZExtInst>(Args[1]))) { + unsigned EltSize0 = getScalarSizeWithOverride(Args[0]); + unsigned EltSize1 = getScalarSizeWithOverride(Args[1]); + MaxEltSize = std::max(EltSize0, EltSize1); + } else if (isa<SExtInst, ZExtInst>(Args[0]) && + isa<SExtInst, ZExtInst>(Args[1])) { + unsigned EltSize0 = getScalarSizeWithOverride(Args[0]); + unsigned EltSize1 = getScalarSizeWithOverride(Args[1]); + // mul(sext, zext) will become smull(sext, zext) if the extends are large + // enough. + if (EltSize0 >= DstEltSize / 2 || EltSize1 >= DstEltSize / 2) + return nullptr; + MaxEltSize = DstEltSize / 2; + } else if (Opcode == Instruction::Mul && + (isa<ZExtInst>(Args[0]) || isa<ZExtInst>(Args[1]))) { + // If one of the operands is a Zext and the other has enough zero bits + // to be treated as unsigned, we can still generate a umull, meaning the + // zext is free. + KnownBits Known = + computeKnownBits(isa<ZExtInst>(Args[0]) ? Args[1] : Args[0], DL); + if (Args[0]->getType()->getScalarSizeInBits() - + Known.Zero.countLeadingOnes() > + DstTy->getScalarSizeInBits() / 2) + return nullptr; + + MaxEltSize = + getScalarSizeWithOverride(isa<ZExtInst>(Args[0]) ? 
Args[0] : Args[1]); + } else + return nullptr; + + if (MaxEltSize * 2 > DstEltSize) + return nullptr; + + Type *ExtTy = DstTy->getWithNewBitWidth(MaxEltSize * 2); + if (ExtTy->getPrimitiveSizeInBits() <= 64) + return nullptr; + return ExtTy; +} + // s/urhadd instructions implement the following pattern, making the // extends free: // %x = add ((zext i8 -> i16), 1) @@ -3159,7 +3207,24 @@ InstructionCost AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, if (I && I->hasOneUser()) { auto *SingleUser = cast<Instruction>(*I->user_begin()); SmallVector<const Value *, 4> Operands(SingleUser->operand_values()); - if (isWideningInstruction(Dst, SingleUser->getOpcode(), Operands, Src)) { + if (Type *ExtTy = isBinExtWideningInstruction( + SingleUser->getOpcode(), Dst, Operands, + Src != I->getOperand(0)->getType() ? Src : nullptr)) { + // The cost from Src->Src*2 needs to be added if required, the cost from + // Src*2->ExtTy is free. + if (ExtTy->getScalarSizeInBits() > Src->getScalarSizeInBits() * 2) { + Type *DoubleSrcTy = + Src->getWithNewBitWidth(Src->getScalarSizeInBits() * 2); + return getCastInstrCost(Opcode, DoubleSrcTy, Src, + TTI::CastContextHint::None, CostKind); + } + + return 0; + } + + if (isSingleExtWideningInstruction( + SingleUser->getOpcode(), Dst, Operands, + Src != I->getOperand(0)->getType() ? Src : nullptr)) { // For adds only count the second operand as free if both operands are // extends but not the same operation. (i.e both operands are not free in // add(sext, zext)). @@ -3168,8 +3233,11 @@ InstructionCost AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, (isa<CastInst>(SingleUser->getOperand(1)) && cast<CastInst>(SingleUser->getOperand(1))->getOpcode() == Opcode)) return 0; - } else // Others are free so long as isWideningInstruction returned true. + } else { + // Others are free so long as isSingleExtWideningInstruction + // returned true. return 0; + } } // The cast will be free for the s/urhadd instructions @@ -4148,6 +4216,18 @@ InstructionCost AArch64TTIImpl::getArithmeticInstrCost( })) return *PromotedCost; + // If the operation is a widening instruction (smull or umull) and both + // operands are extends the cost can be cheaper by considering that the + // operation will operate on the narrowest type size possible (double the + // largest input size) and a further extend. + if (Type *ExtTy = isBinExtWideningInstruction(Opcode, Ty, Args)) { + if (ExtTy != Ty) + return getArithmeticInstrCost(Opcode, ExtTy, CostKind) + + getCastInstrCost(Instruction::ZExt, Ty, ExtTy, + TTI::CastContextHint::None, CostKind); + return LT.first; + } + switch (ISD) { default: return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, @@ -4381,10 +4461,8 @@ InstructionCost AArch64TTIImpl::getArithmeticInstrCost( // - two 2-cost i64 inserts, and // - two 1-cost muls. // So, for a v2i64 with LT.First = 1 the cost is 14, and for a v4i64 with - // LT.first = 2 the cost is 28. If both operands are extensions it will not - // need to scalarize so the cost can be cheaper (smull or umull). - // so the cost can be cheaper (smull or umull). - if (LT.second != MVT::v2i64 || isWideningInstruction(Ty, Opcode, Args)) + // LT.first = 2 the cost is 28. 
+ if (LT.second != MVT::v2i64) return LT.first; return cast<VectorType>(Ty)->getElementCount().getKnownMinValue() * (getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind) + diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h index b39546a9a381d..e3b0a1bec53ec 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -59,9 +59,17 @@ class AArch64TTIImpl final : public BasicTTIImplBase<AArch64TTIImpl> { VECTOR_LDST_FOUR_ELEMENTS }; - bool isWideningInstruction(Type *DstTy, unsigned Opcode, - ArrayRef<const Value *> Args, - Type *SrcOverrideTy = nullptr) const; + /// Given a add/sub/mul operation, detect a widening addl/subl/mull pattern + /// where both operands can be treated like extends. Returns the minimal type + /// needed to compute the operation. + Type *isBinExtWideningInstruction(unsigned Opcode, Type *DstTy, + ArrayRef<const Value *> Args, + Type *SrcOverrideTy = nullptr) const; + /// Given a add/sub operation with a single extend operand, detect a + /// widening addw/subw pattern. + bool isSingleExtWideningInstruction(unsigned Opcode, Type *DstTy, + ArrayRef<const Value *> Args, + Type *SrcOverrideTy = nullptr) const; // A helper function called by 'getVectorInstrCost'. // diff --git a/llvm/test/Analysis/CostModel/AArch64/arith-widening.ll b/llvm/test/Analysis/CostModel/AArch64/arith-widening.ll index 7e1588f427be4..76f73e43a2355 100644 --- a/llvm/test/Analysis/CostModel/AArch64/arith-widening.ll +++ b/llvm/test/Analysis/CostModel/AArch64/arith-widening.ll @@ -325,14 +325,14 @@ define void @extaddv4(<4 x i8> %i8, <4 x i16> %i16, <4 x i32> %i32, <4 x i64> %i ; CHECK-NEXT: Cost Model: Found costs of 1 for: %azl_16_32 = add <4 x i32> %zl1_16_32, %zl2_16_32 ; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %sw_16_64 = sext <4 x i16> %i16 to <4 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %asw_16_64 = add <4 x i64> %i64, %sw_16_64 -; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %sl1_16_64 = sext <4 x i16> %i16 to <4 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %sl2_16_64 = sext <4 x i16> %i16 to <4 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %asl_16_64 = add <4 x i64> %sl1_16_64, %sl2_16_64 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_16_64 = sext <4 x i16> %i16 to <4 x i64> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl2_16_64 = sext <4 x i16> %i16 to <4 x i64> +; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %asl_16_64 = add <4 x i64> %sl1_16_64, %sl2_16_64 ; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %zw_16_64 = zext <4 x i16> %i16 to <4 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %azw_16_64 = add <4 x i64> %i64, %zw_16_64 -; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %zl1_16_64 = zext <4 x i16> %i16 to <4 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %zl2_16_64 = zext <4 x i16> %i16 to <4 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %azl_16_64 = add <4 x i64> %zl1_16_64, %zl2_16_64 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl1_16_64 = zext <4 x i16> %i16 to <4 x i64> +; CHECK-NEXT: Cost 
Model: Found costs of 0 for: %zl2_16_64 = zext <4 x i16> %i16 to <4 x i64> +; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %azl_16_64 = add <4 x i64> %zl1_16_64, %zl2_16_64 ; CHECK-NEXT: Cost Model: Found costs of 0 for: %sw_32_64 = sext <4 x i32> %i32 to <4 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %asw_32_64 = add <4 x i64> %i64, %sw_32_64 ; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_32_64 = sext <4 x i32> %i32 to <4 x i64> @@ -434,24 +434,24 @@ define void @extaddv8(<8 x i8> %i8, <8 x i16> %i16, <8 x i32> %i32, <8 x i64> %i ; CHECK-NEXT: Cost Model: Found costs of 1 for: %azl_8_16 = add <8 x i16> %zl1_8_16, %zl2_8_16 ; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %sw_8_32 = sext <8 x i8> %i8 to <8 x i32> ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %asw_8_32 = add <8 x i32> %i32, %sw_8_32 -; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %sl1_8_32 = sext <8 x i8> %i8 to <8 x i32> -; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %sl2_8_32 = sext <8 x i8> %i8 to <8 x i32> -; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %asl_8_32 = add <8 x i32> %sl1_8_32, %sl2_8_32 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_8_32 = sext <8 x i8> %i8 to <8 x i32> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl2_8_32 = sext <8 x i8> %i8 to <8 x i32> +; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %asl_8_32 = add <8 x i32> %sl1_8_32, %sl2_8_32 ; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %zw_8_32 = zext <8 x i8> %i8 to <8 x i32> ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %azw_8_32 = add <8 x i32> %i32, %zw_8_32 -; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %zl1_8_32 = zext <8 x i8> %i8 to <8 x i32> -; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %zl2_8_32 = zext <8 x i8> %i8 to <8 x i32> -; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %azl_8_32 = add <8 x i32> %zl1_8_32, %zl2_8_32 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl1_8_32 = zext <8 x i8> %i8 to <8 x i32> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl2_8_32 = zext <8 x i8> %i8 to <8 x i32> +; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %azl_8_32 = add <8 x i32> %zl1_8_32, %zl2_8_32 ; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %sw_8_64 = sext <8 x i8> %i8 to <8 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %asw_8_64 = add <8 x i64> %i64, %sw_8_64 -; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %sl1_8_64 = sext <8 x i8> %i8 to <8 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %sl2_8_64 = sext <8 x i8> %i8 to <8 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %asl_8_64 = add <8 x i64> %sl1_8_64, %sl2_8_64 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_8_64 = sext <8 x i8> %i8 to <8 x i64> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl2_8_64 = sext <8 x i8> %i8 to <8 x i64> +; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %asl_8_64 = add <8 x i64> %sl1_8_64, %sl2_8_64 ; CHECK-NEXT: Cost Model: 
Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %zw_8_64 = zext <8 x i8> %i8 to <8 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %azw_8_64 = add <8 x i64> %i64, %zw_8_64 -; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %zl1_8_64 = zext <8 x i8> %i8 to <8 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %zl2_8_64 = zext <8 x i8> %i8 to <8 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %azl_8_64 = add <8 x i64> %zl1_8_64, %zl2_8_64 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl1_8_64 = zext <8 x i8> %i8 to <8 x i64> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl2_8_64 = zext <8 x i8> %i8 to <8 x i64> +; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %azl_8_64 = add <8 x i64> %zl1_8_64, %zl2_8_64 ; CHECK-NEXT: Cost Model: Found costs of 0 for: %sw_16_32 = sext <8 x i16> %i16 to <8 x i32> ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %asw_16_32 = add <8 x i32> %i32, %sw_16_32 ; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_16_32 = sext <8 x i16> %i16 to <8 x i32> @@ -464,14 +464,14 @@ define void @extaddv8(<8 x i8> %i8, <8 x i16> %i16, <8 x i32> %i32, <8 x i64> %i ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %azl_16_32 = add <8 x i32> %zl1_16_32, %zl2_16_32 ; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %sw_16_64 = sext <8 x i16> %i16 to <8 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %asw_16_64 = add <8 x i64> %i64, %sw_16_64 -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %sl1_16_64 = sext <8 x i16> %i16 to <8 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %sl2_16_64 = sext <8 x i16> %i16 to <8 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %asl_16_64 = add <8 x i64> %sl1_16_64, %sl2_16_64 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_16_64 = sext <8 x i16> %i16 to <8 x i64> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl2_16_64 = sext <8 x i16> %i16 to <8 x i64> +; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %asl_16_64 = add <8 x i64> %sl1_16_64, %sl2_16_64 ; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %zw_16_64 = zext <8 x i16> %i16 to <8 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %azw_16_64 = add <8 x i64> %i64, %zw_16_64 -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %zl1_16_64 = zext <8 x i16> %i16 to <8 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %zl2_16_64 = zext <8 x i16> %i16 to <8 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %azl_16_64 = add <8 x i64> %zl1_16_64, %zl2_16_64 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl1_16_64 = zext <8 x i16> %i16 to <8 x i64> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl2_16_64 = zext <8 x i16> %i16 to <8 x i64> +; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %azl_16_64 = add <8 x i64> %zl1_16_64, %zl2_16_64 ; CHECK-NEXT: Cost Model: Found costs of 0 for: %sw_32_64 = sext <8 x i32> %i32 to <8 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: 
%asw_32_64 = add <8 x i64> %i64, %sw_32_64 ; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_32_64 = sext <8 x i32> %i32 to <8 x i64> @@ -573,24 +573,24 @@ define void @extaddv16(<16 x i8> %i8, <16 x i16> %i16, <16 x i32> %i32, <16 x i6 ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %azl_8_16 = add <16 x i16> %zl1_8_16, %zl2_8_16 ; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %sw_8_32 = sext <16 x i8> %i8 to <16 x i32> ; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %asw_8_32 = add <16 x i32> %i32, %sw_8_32 -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %sl1_8_32 = sext <16 x i8> %i8 to <16 x i32> -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %sl2_8_32 = sext <16 x i8> %i8 to <16 x i32> -; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %asl_8_32 = add <16 x i32> %sl1_8_32, %sl2_8_32 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_8_32 = sext <16 x i8> %i8 to <16 x i32> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl2_8_32 = sext <16 x i8> %i8 to <16 x i32> +; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %asl_8_32 = add <16 x i32> %sl1_8_32, %sl2_8_32 ; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %zw_8_32 = zext <16 x i8> %i8 to <16 x i32> ; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %azw_8_32 = add <16 x i32> %i32, %zw_8_32 -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %zl1_8_32 = zext <16 x i8> %i8 to <16 x i32> -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %zl2_8_32 = zext <16 x i8> %i8 to <16 x i32> -; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %azl_8_32 = add <16 x i32> %zl1_8_32, %zl2_8_32 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl1_8_32 = zext <16 x i8> %i8 to <16 x i32> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl2_8_32 = zext <16 x i8> %i8 to <16 x i32> +; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %azl_8_32 = add <16 x i32> %zl1_8_32, %zl2_8_32 ; CHECK-NEXT: Cost Model: Found costs of RThru:15 CodeSize:1 Lat:1 SizeLat:1 for: %sw_8_64 = sext <16 x i8> %i8 to <16 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %asw_8_64 = add <16 x i64> %i64, %sw_8_64 -; CHECK-NEXT: Cost Model: Found costs of RThru:15 CodeSize:1 Lat:1 SizeLat:1 for: %sl1_8_64 = sext <16 x i8> %i8 to <16 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:15 CodeSize:1 Lat:1 SizeLat:1 for: %sl2_8_64 = sext <16 x i8> %i8 to <16 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %asl_8_64 = add <16 x i64> %sl1_8_64, %sl2_8_64 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_8_64 = sext <16 x i8> %i8 to <16 x i64> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl2_8_64 = sext <16 x i8> %i8 to <16 x i64> +; CHECK-NEXT: Cost Model: Found costs of RThru:14 CodeSize:1 Lat:1 SizeLat:1 for: %asl_8_64 = add <16 x i64> %sl1_8_64, %sl2_8_64 ; CHECK-NEXT: Cost Model: Found costs of RThru:15 CodeSize:1 Lat:1 SizeLat:1 for: %zw_8_64 = zext <16 x i8> %i8 to <16 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %azw_8_64 = add <16 x i64> %i64, %zw_8_64 -; CHECK-NEXT: Cost Model: Found costs of RThru:15 CodeSize:1 Lat:1 SizeLat:1 for: 
%zl1_8_64 = zext <16 x i8> %i8 to <16 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:15 CodeSize:1 Lat:1 SizeLat:1 for: %zl2_8_64 = zext <16 x i8> %i8 to <16 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %azl_8_64 = add <16 x i64> %zl1_8_64, %zl2_8_64 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl1_8_64 = zext <16 x i8> %i8 to <16 x i64> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl2_8_64 = zext <16 x i8> %i8 to <16 x i64> +; CHECK-NEXT: Cost Model: Found costs of RThru:14 CodeSize:1 Lat:1 SizeLat:1 for: %azl_8_64 = add <16 x i64> %zl1_8_64, %zl2_8_64 ; CHECK-NEXT: Cost Model: Found costs of 0 for: %sw_16_32 = sext <16 x i16> %i16 to <16 x i32> ; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %asw_16_32 = add <16 x i32> %i32, %sw_16_32 ; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_16_32 = sext <16 x i16> %i16 to <16 x i32> @@ -603,14 +603,14 @@ define void @extaddv16(<16 x i8> %i8, <16 x i16> %i16, <16 x i32> %i32, <16 x i6 ; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %azl_16_32 = add <16 x i32> %zl1_16_32, %zl2_16_32 ; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %sw_16_64 = sext <16 x i16> %i16 to <16 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %asw_16_64 = add <16 x i64> %i64, %sw_16_64 -; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %sl1_16_64 = sext <16 x i16> %i16 to <16 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %sl2_16_64 = sext <16 x i16> %i16 to <16 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %asl_16_64 = add <16 x i64> %sl1_16_64, %sl2_16_64 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_16_64 = sext <16 x i16> %i16 to <16 x i64> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl2_16_64 = sext <16 x i16> %i16 to <16 x i64> +; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %asl_16_64 = add <16 x i64> %sl1_16_64, %sl2_16_64 ; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %zw_16_64 = zext <16 x i16> %i16 to <16 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %azw_16_64 = add <16 x i64> %i64, %zw_16_64 -; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %zl1_16_64 = zext <16 x i16> %i16 to <16 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %zl2_16_64 = zext <16 x i16> %i16 to <16 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %azl_16_64 = add <16 x i64> %zl1_16_64, %zl2_16_64 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl1_16_64 = zext <16 x i16> %i16 to <16 x i64> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl2_16_64 = zext <16 x i16> %i16 to <16 x i64> +; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %azl_16_64 = add <16 x i64> %zl1_16_64, %zl2_16_64 ; CHECK-NEXT: Cost Model: Found costs of 0 for: %sw_32_64 = sext <16 x i32> %i32 to <16 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %asw_32_64 = add <16 x i64> %i64, %sw_32_64 ; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_32_64 = sext <16 x i32> %i32 to <16 x i64> @@ -1020,14 +1020,14 @@ define void @extsubv4(<4 x i8> %i8, <4 x i16> %i16, <4 x i32> %i32, <4 x i64> %i ; 
CHECK-NEXT: Cost Model: Found costs of 1 for: %azl_16_32 = sub <4 x i32> %zl1_16_32, %zl2_16_32 ; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %sw_16_64 = sext <4 x i16> %i16 to <4 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %asw_16_64 = sub <4 x i64> %i64, %sw_16_64 -; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %sl1_16_64 = sext <4 x i16> %i16 to <4 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %sl2_16_64 = sext <4 x i16> %i16 to <4 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %asl_16_64 = sub <4 x i64> %sl1_16_64, %sl2_16_64 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_16_64 = sext <4 x i16> %i16 to <4 x i64> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl2_16_64 = sext <4 x i16> %i16 to <4 x i64> +; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %asl_16_64 = sub <4 x i64> %sl1_16_64, %sl2_16_64 ; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %zw_16_64 = zext <4 x i16> %i16 to <4 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %azw_16_64 = sub <4 x i64> %i64, %zw_16_64 -; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %zl1_16_64 = zext <4 x i16> %i16 to <4 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %zl2_16_64 = zext <4 x i16> %i16 to <4 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %azl_16_64 = sub <4 x i64> %zl1_16_64, %zl2_16_64 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl1_16_64 = zext <4 x i16> %i16 to <4 x i64> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl2_16_64 = zext <4 x i16> %i16 to <4 x i64> +; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %azl_16_64 = sub <4 x i64> %zl1_16_64, %zl2_16_64 ; CHECK-NEXT: Cost Model: Found costs of 0 for: %sw_32_64 = sext <4 x i32> %i32 to <4 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %asw_32_64 = sub <4 x i64> %i64, %sw_32_64 ; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_32_64 = sext <4 x i32> %i32 to <4 x i64> @@ -1129,24 +1129,24 @@ define void @extsubv8(<8 x i8> %i8, <8 x i16> %i16, <8 x i32> %i32, <8 x i64> %i ; CHECK-NEXT: Cost Model: Found costs of 1 for: %azl_8_16 = sub <8 x i16> %zl1_8_16, %zl2_8_16 ; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %sw_8_32 = sext <8 x i8> %i8 to <8 x i32> ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %asw_8_32 = sub <8 x i32> %i32, %sw_8_32 -; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %sl1_8_32 = sext <8 x i8> %i8 to <8 x i32> -; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %sl2_8_32 = sext <8 x i8> %i8 to <8 x i32> -; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %asl_8_32 = sub <8 x i32> %sl1_8_32, %sl2_8_32 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_8_32 = sext <8 x i8> %i8 to <8 x i32> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl2_8_32 = sext <8 x i8> %i8 to <8 x i32> +; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %asl_8_32 = sub <8 x i32> %sl1_8_32, %sl2_8_32 ; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %zw_8_32 
= zext <8 x i8> %i8 to <8 x i32> ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %azw_8_32 = sub <8 x i32> %i32, %zw_8_32 -; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %zl1_8_32 = zext <8 x i8> %i8 to <8 x i32> -; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %zl2_8_32 = zext <8 x i8> %i8 to <8 x i32> -; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %azl_8_32 = sub <8 x i32> %zl1_8_32, %zl2_8_32 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl1_8_32 = zext <8 x i8> %i8 to <8 x i32> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl2_8_32 = zext <8 x i8> %i8 to <8 x i32> +; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %azl_8_32 = sub <8 x i32> %zl1_8_32, %zl2_8_32 ; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %sw_8_64 = sext <8 x i8> %i8 to <8 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %asw_8_64 = sub <8 x i64> %i64, %sw_8_64 -; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %sl1_8_64 = sext <8 x i8> %i8 to <8 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %sl2_8_64 = sext <8 x i8> %i8 to <8 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %asl_8_64 = sub <8 x i64> %sl1_8_64, %sl2_8_64 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_8_64 = sext <8 x i8> %i8 to <8 x i64> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl2_8_64 = sext <8 x i8> %i8 to <8 x i64> +; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %asl_8_64 = sub <8 x i64> %sl1_8_64, %sl2_8_64 ; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %zw_8_64 = zext <8 x i8> %i8 to <8 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %azw_8_64 = sub <8 x i64> %i64, %zw_8_64 -; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %zl1_8_64 = zext <8 x i8> %i8 to <8 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %zl2_8_64 = zext <8 x i8> %i8 to <8 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %azl_8_64 = sub <8 x i64> %zl1_8_64, %zl2_8_64 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl1_8_64 = zext <8 x i8> %i8 to <8 x i64> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl2_8_64 = zext <8 x i8> %i8 to <8 x i64> +; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %azl_8_64 = sub <8 x i64> %zl1_8_64, %zl2_8_64 ; CHECK-NEXT: Cost Model: Found costs of 0 for: %sw_16_32 = sext <8 x i16> %i16 to <8 x i32> ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %asw_16_32 = sub <8 x i32> %i32, %sw_16_32 ; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_16_32 = sext <8 x i16> %i16 to <8 x i32> @@ -1159,14 +1159,14 @@ define void @extsubv8(<8 x i8> %i8, <8 x i16> %i16, <8 x i32> %i32, <8 x i64> %i ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %azl_16_32 = sub <8 x i32> %zl1_16_32, %zl2_16_32 ; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %sw_16_64 = sext <8 x i16> %i16 to <8 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %asw_16_64 = sub <8 x i64> %i64, %sw_16_64 -; CHECK-NEXT: Cost Model: Found 
costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %sl1_16_64 = sext <8 x i16> %i16 to <8 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %sl2_16_64 = sext <8 x i16> %i16 to <8 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %asl_16_64 = sub <8 x i64> %sl1_16_64, %sl2_16_64 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_16_64 = sext <8 x i16> %i16 to <8 x i64> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl2_16_64 = sext <8 x i16> %i16 to <8 x i64> +; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %asl_16_64 = sub <8 x i64> %sl1_16_64, %sl2_16_64 ; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %zw_16_64 = zext <8 x i16> %i16 to <8 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %azw_16_64 = sub <8 x i64> %i64, %zw_16_64 -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %zl1_16_64 = zext <8 x i16> %i16 to <8 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %zl2_16_64 = zext <8 x i16> %i16 to <8 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %azl_16_64 = sub <8 x i64> %zl1_16_64, %zl2_16_64 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl1_16_64 = zext <8 x i16> %i16 to <8 x i64> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl2_16_64 = zext <8 x i16> %i16 to <8 x i64> +; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %azl_16_64 = sub <8 x i64> %zl1_16_64, %zl2_16_64 ; CHECK-NEXT: Cost Model: Found costs of 0 for: %sw_32_64 = sext <8 x i32> %i32 to <8 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %asw_32_64 = sub <8 x i64> %i64, %sw_32_64 ; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_32_64 = sext <8 x i32> %i32 to <8 x i64> @@ -1268,24 +1268,24 @@ define void @extsubv16(<16 x i8> %i8, <16 x i16> %i16, <16 x i32> %i32, <16 x i6 ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %azl_8_16 = sub <16 x i16> %zl1_8_16, %zl2_8_16 ; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %sw_8_32 = sext <16 x i8> %i8 to <16 x i32> ; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %asw_8_32 = sub <16 x i32> %i32, %sw_8_32 -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %sl1_8_32 = sext <16 x i8> %i8 to <16 x i32> -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %sl2_8_32 = sext <16 x i8> %i8 to <16 x i32> -; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %asl_8_32 = sub <16 x i32> %sl1_8_32, %sl2_8_32 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_8_32 = sext <16 x i8> %i8 to <16 x i32> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl2_8_32 = sext <16 x i8> %i8 to <16 x i32> +; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %asl_8_32 = sub <16 x i32> %sl1_8_32, %sl2_8_32 ; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %zw_8_32 = zext <16 x i8> %i8 to <16 x i32> ; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %azw_8_32 = sub <16 x i32> %i32, %zw_8_32 -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %zl1_8_32 = zext <16 x i8> %i8 to <16 x i32> -; CHECK-NEXT: Cost Model: Found costs of 
RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %zl2_8_32 = zext <16 x i8> %i8 to <16 x i32>
-; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %azl_8_32 = sub <16 x i32> %zl1_8_32, %zl2_8_32
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl1_8_32 = zext <16 x i8> %i8 to <16 x i32>
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl2_8_32 = zext <16 x i8> %i8 to <16 x i32>
+; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %azl_8_32 = sub <16 x i32> %zl1_8_32, %zl2_8_32
; CHECK-NEXT: Cost Model: Found costs of RThru:15 CodeSize:1 Lat:1 SizeLat:1 for: %sw_8_64 = sext <16 x i8> %i8 to <16 x i64>
; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %asw_8_64 = sub <16 x i64> %i64, %sw_8_64
-; CHECK-NEXT: Cost Model: Found costs of RThru:15 CodeSize:1 Lat:1 SizeLat:1 for: %sl1_8_64 = sext <16 x i8> %i8 to <16 x i64>
-; CHECK-NEXT: Cost Model: Found costs of RThru:15 CodeSize:1 Lat:1 SizeLat:1 for: %sl2_8_64 = sext <16 x i8> %i8 to <16 x i64>
-; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %asl_8_64 = sub <16 x i64> %sl1_8_64, %sl2_8_64
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_8_64 = sext <16 x i8> %i8 to <16 x i64>
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl2_8_64 = sext <16 x i8> %i8 to <16 x i64>
+; CHECK-NEXT: Cost Model: Found costs of RThru:14 CodeSize:1 Lat:1 SizeLat:1 for: %asl_8_64 = sub <16 x i64> %sl1_8_64, %sl2_8_64
; CHECK-NEXT: Cost Model: Found costs of RThru:15 CodeSize:1 Lat:1 SizeLat:1 for: %zw_8_64 = zext <16 x i8> %i8 to <16 x i64>
; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %azw_8_64 = sub <16 x i64> %i64, %zw_8_64
-; CHECK-NEXT: Cost Model: Found costs of RThru:15 CodeSize:1 Lat:1 SizeLat:1 for: %zl1_8_64 = zext <16 x i8> %i8 to <16 x i64>
-; CHECK-NEXT: Cost Model: Found costs of RThru:15 CodeSize:1 Lat:1 SizeLat:1 for: %zl2_8_64 = zext <16 x i8> %i8 to <16 x i64>
-; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %azl_8_64 = sub <16 x i64> %zl1_8_64, %zl2_8_64
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl1_8_64 = zext <16 x i8> %i8 to <16 x i64>
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl2_8_64 = zext <16 x i8> %i8 to <16 x i64>
+; CHECK-NEXT: Cost Model: Found costs of RThru:14 CodeSize:1 Lat:1 SizeLat:1 for: %azl_8_64 = sub <16 x i64> %zl1_8_64, %zl2_8_64
; CHECK-NEXT: Cost Model: Found costs of 0 for: %sw_16_32 = sext <16 x i16> %i16 to <16 x i32>
; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %asw_16_32 = sub <16 x i32> %i32, %sw_16_32
; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_16_32 = sext <16 x i16> %i16 to <16 x i32>
@@ -1298,14 +1298,14 @@ define void @extsubv16(<16 x i8> %i8, <16 x i16> %i16, <16 x i32> %i32, <16 x i6
; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %azl_16_32 = sub <16 x i32> %zl1_16_32, %zl2_16_32
; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %sw_16_64 = sext <16 x i16> %i16 to <16 x i64>
; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %asw_16_64 = sub <16 x i64> %i64, %sw_16_64
-; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %sl1_16_64 = sext <16 x i16> %i16 to <16 x i64>
-; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %sl2_16_64 = sext <16 x i16> %i16 to <16 x i64>
-; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %asl_16_64 = sub <16 x i64> %sl1_16_64, %sl2_16_64
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_16_64 = sext <16 x i16> %i16 to <16 x i64>
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl2_16_64 = sext <16 x i16> %i16 to <16 x i64>
+; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %asl_16_64 = sub <16 x i64> %sl1_16_64, %sl2_16_64
; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %zw_16_64 = zext <16 x i16> %i16 to <16 x i64>
; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %azw_16_64 = sub <16 x i64> %i64, %zw_16_64
-; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %zl1_16_64 = zext <16 x i16> %i16 to <16 x i64>
-; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %zl2_16_64 = zext <16 x i16> %i16 to <16 x i64>
-; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %azl_16_64 = sub <16 x i64> %zl1_16_64, %zl2_16_64
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl1_16_64 = zext <16 x i16> %i16 to <16 x i64>
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl2_16_64 = zext <16 x i16> %i16 to <16 x i64>
+; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %azl_16_64 = sub <16 x i64> %zl1_16_64, %zl2_16_64
; CHECK-NEXT: Cost Model: Found costs of 0 for: %sw_32_64 = sext <16 x i32> %i32 to <16 x i64>
; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %asw_32_64 = sub <16 x i64> %i64, %sw_32_64
; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_32_64 = sext <16 x i32> %i32 to <16 x i64>
@@ -1715,14 +1715,14 @@ define void @extmulv4(<4 x i8> %i8, <4 x i16> %i16, <4 x i32> %i32, <4 x i64> %i
; CHECK-NEXT: Cost Model: Found costs of 1 for: %azl_16_32 = mul <4 x i32> %zl1_16_32, %zl2_16_32
; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %sw_16_64 = sext <4 x i16> %i16 to <4 x i64>
; CHECK-NEXT: Cost Model: Found costs of RThru:28 CodeSize:1 Lat:1 SizeLat:1 for: %asw_16_64 = mul <4 x i64> %i64, %sw_16_64
-; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %sl1_16_64 = sext <4 x i16> %i16 to <4 x i64>
-; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %sl2_16_64 = sext <4 x i16> %i16 to <4 x i64>
-; CHECK-NEXT: Cost Model: Found costs of RThru:28 CodeSize:1 Lat:1 SizeLat:1 for: %asl_16_64 = mul <4 x i64> %sl1_16_64, %sl2_16_64
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_16_64 = sext <4 x i16> %i16 to <4 x i64>
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl2_16_64 = sext <4 x i16> %i16 to <4 x i64>
+; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %asl_16_64 = mul <4 x i64> %sl1_16_64, %sl2_16_64
; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %zw_16_64 = zext <4 x i16> %i16 to <4 x i64>
; CHECK-NEXT: Cost Model: Found costs of RThru:28 CodeSize:1 Lat:1 SizeLat:1 for: %azw_16_64 = mul <4 x i64> %i64, %zw_16_64
-; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %zl1_16_64 = zext <4 x i16> %i16 to <4 x i64>
-; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %zl2_16_64 = zext <4 x i16> %i16 to <4 x i64>
-; CHECK-NEXT: Cost Model: Found costs of RThru:28 CodeSize:1 Lat:1 SizeLat:1 for: %azl_16_64 = mul <4 x i64> %zl1_16_64, %zl2_16_64
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl1_16_64 = zext <4 x i16> %i16 to <4 x i64>
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl2_16_64 = zext <4 x i16> %i16 to <4 x i64>
+; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %azl_16_64 = mul <4 x i64> %zl1_16_64, %zl2_16_64
; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %sw_32_64 = sext <4 x i32> %i32 to <4 x i64>
; CHECK-NEXT: Cost Model: Found costs of RThru:28 CodeSize:1 Lat:1 SizeLat:1 for: %asw_32_64 = mul <4 x i64> %i64, %sw_32_64
; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_32_64 = sext <4 x i32> %i32 to <4 x i64>
@@ -1824,24 +1824,24 @@ define void @extmulv8(<8 x i8> %i8, <8 x i16> %i16, <8 x i32> %i32, <8 x i64> %i
; CHECK-NEXT: Cost Model: Found costs of 1 for: %azl_8_16 = mul <8 x i16> %zl1_8_16, %zl2_8_16
; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %sw_8_32 = sext <8 x i8> %i8 to <8 x i32>
; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %asw_8_32 = mul <8 x i32> %i32, %sw_8_32
-; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %sl1_8_32 = sext <8 x i8> %i8 to <8 x i32>
-; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %sl2_8_32 = sext <8 x i8> %i8 to <8 x i32>
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %asl_8_32 = mul <8 x i32> %sl1_8_32, %sl2_8_32
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_8_32 = sext <8 x i8> %i8 to <8 x i32>
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl2_8_32 = sext <8 x i8> %i8 to <8 x i32>
+; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %asl_8_32 = mul <8 x i32> %sl1_8_32, %sl2_8_32
; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %zw_8_32 = zext <8 x i8> %i8 to <8 x i32>
; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %azw_8_32 = mul <8 x i32> %i32, %zw_8_32
-; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %zl1_8_32 = zext <8 x i8> %i8 to <8 x i32>
-; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %zl2_8_32 = zext <8 x i8> %i8 to <8 x i32>
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %azl_8_32 = mul <8 x i32> %zl1_8_32, %zl2_8_32
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl1_8_32 = zext <8 x i8> %i8 to <8 x i32>
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl2_8_32 = zext <8 x i8> %i8 to <8 x i32>
+; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %azl_8_32 = mul <8 x i32> %zl1_8_32, %zl2_8_32
; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %sw_8_64 = sext <8 x i8> %i8 to <8 x i64>
; CHECK-NEXT: Cost Model: Found costs of RThru:56 CodeSize:1 Lat:1 SizeLat:1 for: %asw_8_64 = mul <8 x i64> %i64, %sw_8_64
-; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %sl1_8_64 = sext <8 x i8> %i8 to <8 x i64>
-; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %sl2_8_64 = sext <8 x i8> %i8 to <8 x i64>
-; CHECK-NEXT: Cost Model: Found costs of RThru:56 CodeSize:1 Lat:1 SizeLat:1 for: %asl_8_64 = mul <8 x i64> %sl1_8_64, %sl2_8_64
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_8_64 = sext <8 x i8> %i8 to <8 x i64>
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl2_8_64 = sext <8 x i8> %i8 to <8 x i64>
+; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %asl_8_64 = mul <8 x i64> %sl1_8_64, %sl2_8_64
; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %zw_8_64 = zext <8 x i8> %i8 to <8 x i64>
; CHECK-NEXT: Cost Model: Found costs of RThru:56 CodeSize:1 Lat:1 SizeLat:1 for: %azw_8_64 = mul <8 x i64> %i64, %zw_8_64
-; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %zl1_8_64 = zext <8 x i8> %i8 to <8 x i64>
-; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %zl2_8_64 = zext <8 x i8> %i8 to <8 x i64>
-; CHECK-NEXT: Cost Model: Found costs of RThru:56 CodeSize:1 Lat:1 SizeLat:1 for: %azl_8_64 = mul <8 x i64> %zl1_8_64, %zl2_8_64
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl1_8_64 = zext <8 x i8> %i8 to <8 x i64>
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl2_8_64 = zext <8 x i8> %i8 to <8 x i64>
+; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %azl_8_64 = mul <8 x i64> %zl1_8_64, %zl2_8_64
; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %sw_16_32 = sext <8 x i16> %i16 to <8 x i32>
; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %asw_16_32 = mul <8 x i32> %i32, %sw_16_32
; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_16_32 = sext <8 x i16> %i16 to <8 x i32>
@@ -1854,14 +1854,14 @@ define void @extmulv8(<8 x i8> %i8, <8 x i16> %i16, <8 x i32> %i32, <8 x i64> %i
; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %azl_16_32 = mul <8 x i32> %zl1_16_32, %zl2_16_32
; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %sw_16_64 = sext <8 x i16> %i16 to <8 x i64>
; CHECK-NEXT: Cost Model: Found costs of RThru:56 CodeSize:1 Lat:1 SizeLat:1 for: %asw_16_64 = mul <8 x i64> %i64, %sw_16_64
-; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %sl1_16_64 = sext <8 x i16> %i16 to <8 x i64>
-; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %sl2_16_64 = sext <8 x i16> %i16 to <8 x i64>
-; CHECK-NEXT: Cost Model: Found costs of RThru:56 CodeSize:1 Lat:1 SizeLat:1 for: %asl_16_64 = mul <8 x i64> %sl1_16_64, %sl2_16_64
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_16_64 = sext <8 x i16> %i16 to <8 x i64>
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl2_16_64 = sext <8 x i16> %i16 to <8 x i64>
+; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %asl_16_64 = mul <8 x i64> %sl1_16_64, %sl2_16_64
; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %zw_16_64 = zext <8 x i16> %i16 to <8 x i64>
; CHECK-NEXT: Cost Model: Found costs of RThru:56 CodeSize:1 Lat:1 SizeLat:1 for: %azw_16_64 = mul <8 x i64> %i64, %zw_16_64
-; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %zl1_16_64 = zext <8 x i16> %i16 to <8 x i64>
-; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %zl2_16_64 = zext <8 x i16> %i16 to <8 x i64>
-; CHECK-NEXT: Cost Model: Found costs of RThru:56 CodeSize:1 Lat:1 SizeLat:1 for: %azl_16_64 = mul <8 x i64> %zl1_16_64, %zl2_16_64
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl1_16_64 = zext <8 x i16> %i16 to <8 x i64>
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl2_16_64 = zext <8 x i16> %i16 to <8 x i64>
+; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %azl_16_64 = mul <8 x i64> %zl1_16_64, %zl2_16_64
; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %sw_32_64 = sext <8 x i32> %i32 to <8 x i64>
; CHECK-NEXT: Cost Model: Found costs of RThru:56 CodeSize:1 Lat:1 SizeLat:1 for: %asw_32_64 = mul <8 x i64> %i64, %sw_32_64
; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_32_64 = sext <8 x i32> %i32 to <8 x i64>
@@ -1963,24 +1963,24 @@ define void @extmulv16(<16 x i8> %i8, <16 x i16> %i16, <16 x i32> %i32, <16 x i6
; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %azl_8_16 = mul <16 x i16> %zl1_8_16, %zl2_8_16
; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %sw_8_32 = sext <16 x i8> %i8 to <16 x i32>
; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %asw_8_32 = mul <16 x i32> %i32, %sw_8_32
-; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %sl1_8_32 = sext <16 x i8> %i8 to <16 x i32>
-; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %sl2_8_32 = sext <16 x i8> %i8 to <16 x i32>
-; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %asl_8_32 = mul <16 x i32> %sl1_8_32, %sl2_8_32
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_8_32 = sext <16 x i8> %i8 to <16 x i32>
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl2_8_32 = sext <16 x i8> %i8 to <16 x i32>
+; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %asl_8_32 = mul <16 x i32> %sl1_8_32, %sl2_8_32
; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %zw_8_32 = zext <16 x i8> %i8 to <16 x i32>
; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %azw_8_32 = mul <16 x i32> %i32, %zw_8_32
-; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %zl1_8_32 = zext <16 x i8> %i8 to <16 x i32>
-; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %zl2_8_32 = zext <16 x i8> %i8 to <16 x i32>
-; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %azl_8_32 = mul <16 x i32> %zl1_8_32, %zl2_8_32
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl1_8_32 = zext <16 x i8> %i8 to <16 x i32>
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl2_8_32 = zext <16 x i8> %i8 to <16 x i32>
+; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %azl_8_32 = mul <16 x i32> %zl1_8_32, %zl2_8_32
; CHECK-NEXT: Cost Model: Found costs of RThru:15 CodeSize:1 Lat:1 SizeLat:1 for: %sw_8_64 = sext <16 x i8> %i8 to <16 x i64>
; CHECK-NEXT: Cost Model: Found costs of RThru:112 CodeSize:1 Lat:1 SizeLat:1 for: %asw_8_64 = mul <16 x i64> %i64, %sw_8_64
-; CHECK-NEXT: Cost Model: Found costs of RThru:15 CodeSize:1 Lat:1 SizeLat:1 for: %sl1_8_64 = sext <16 x i8> %i8 to <16 x i64>
-; CHECK-NEXT: Cost Model: Found costs of RThru:15 CodeSize:1 Lat:1 SizeLat:1 for: %sl2_8_64 = sext <16 x i8> %i8 to <16 x i64>
-; CHECK-NEXT: Cost Model: Found costs of RThru:112 CodeSize:1 Lat:1 SizeLat:1 for: %asl_8_64 = mul <16 x i64> %sl1_8_64, %sl2_8_64
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_8_64 = sext <16 x i8> %i8 to <16 x i64>
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl2_8_64 = sext <16 x i8> %i8 to <16 x i64>
+; CHECK-NEXT: Cost Model: Found costs of RThru:14 CodeSize:1 Lat:1 SizeLat:1 for: %asl_8_64 = mul <16 x i64> %sl1_8_64, %sl2_8_64
; CHECK-NEXT: Cost Model: Found costs of RThru:15 CodeSize:1 Lat:1 SizeLat:1 for: %zw_8_64 = zext <16 x i8> %i8 to <16 x i64>
; CHECK-NEXT: Cost Model: Found costs of RThru:112 CodeSize:1 Lat:1 SizeLat:1 for: %azw_8_64 = mul <16 x i64> %i64, %zw_8_64
-; CHECK-NEXT: Cost Model: Found costs of RThru:15 CodeSize:1 Lat:1 SizeLat:1 for: %zl1_8_64 = zext <16 x i8> %i8 to <16 x i64>
-; CHECK-NEXT: Cost Model: Found costs of RThru:15 CodeSize:1 Lat:1 SizeLat:1 for: %zl2_8_64 = zext <16 x i8> %i8 to <16 x i64>
-; CHECK-NEXT: Cost Model: Found costs of RThru:112 CodeSize:1 Lat:1 SizeLat:1 for: %azl_8_64 = mul <16 x i64> %zl1_8_64, %zl2_8_64
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl1_8_64 = zext <16 x i8> %i8 to <16 x i64>
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl2_8_64 = zext <16 x i8> %i8 to <16 x i64>
+; CHECK-NEXT: Cost Model: Found costs of RThru:14 CodeSize:1 Lat:1 SizeLat:1 for: %azl_8_64 = mul <16 x i64> %zl1_8_64, %zl2_8_64
; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %sw_16_32 = sext <16 x i16> %i16 to <16 x i32>
; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %asw_16_32 = mul <16 x i32> %i32, %sw_16_32
; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_16_32 = sext <16 x i16> %i16 to <16 x i32>
@@ -1993,14 +1993,14 @@ define void @extmulv16(<16 x i8> %i8, <16 x i16> %i16, <16 x i32> %i32, <16 x i6
; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %azl_16_32 = mul <16 x i32> %zl1_16_32, %zl2_16_32
; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %sw_16_64 = sext <16 x i16> %i16 to <16 x i64>
; CHECK-NEXT: Cost Model: Found costs of RThru:112 CodeSize:1 Lat:1 SizeLat:1 for: %asw_16_64 = mul <16 x i64> %i64, %sw_16_64
-; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %sl1_16_64 = sext <16 x i16> %i16 to <16 x i64>
-; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %sl2_16_64 = sext <16 x i16> %i16 to <16 x i64>
-; CHECK-NEXT: Cost Model: Found costs of RThru:112 CodeSize:1 Lat:1 SizeLat:1 for: %asl_16_64 = mul <16 x i64> %sl1_16_64, %sl2_16_64
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_16_64 = sext <16 x i16> %i16 to <16 x i64>
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl2_16_64 = sext <16 x i16> %i16 to <16 x i64>
+; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %asl_16_64 = mul <16 x i64> %sl1_16_64, %sl2_16_64
; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %zw_16_64 = zext <16 x i16> %i16 to <16 x i64>
; CHECK-NEXT: Cost Model: Found costs of RThru:112 CodeSize:1 Lat:1 SizeLat:1 for: %azw_16_64 = mul <16 x i64> %i64, %zw_16_64
-; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %zl1_16_64 = zext <16 x i16> %i16 to <16 x i64>
-; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %zl2_16_64 = zext <16 x i16> %i16 to <16 x i64>
-; CHECK-NEXT: Cost Model: Found costs of RThru:112 CodeSize:1 Lat:1 SizeLat:1 for: %azl_16_64 = mul <16 x i64> %zl1_16_64, %zl2_16_64
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl1_16_64 = zext <16 x i16> %i16 to <16 x i64>
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl2_16_64 = zext <16 x i16> %i16 to <16 x i64>
+; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %azl_16_64 = mul <16 x i64> %zl1_16_64, %zl2_16_64
; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %sw_32_64 = sext <16 x i32> %i32 to <16 x i64>
; CHECK-NEXT: Cost Model: Found costs of RThru:112 CodeSize:1 Lat:1 SizeLat:1 for: %asw_32_64 = mul <16 x i64> %i64, %sw_32_64
; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_32_64 = sext <16 x i32> %i32 to <16 x i64>
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll
index 199203a9f5cb0..1164778c19070 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll
@@ -82,7 +82,7 @@ define i64 @test_two_ivs(ptr %a, ptr %b, i64 %start) #0 {
; CHECK-NEXT: Cost of 0 for VF 8: induction instruction %j.iv = phi i64 [ %start, %entry ], [ %j.iv.next, %for.body ]
; CHECK-NEXT: Cost of 1 for VF 8: exit condition instruction %exitcond.not = icmp eq i64 %i.iv.next, 16
; CHECK-NEXT: Cost of 0 for VF 8: EMIT vp<{{.+}}> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
-; CHECK: Cost for VF 8: 27
+; CHECK: Cost for VF 8: 16
; CHECK-NEXT: Cost of 0 for VF 16: induction instruction %i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ]
; CHECK-NEXT: Cost of 0 for VF 16: induction instruction %j.iv = phi i64 [ %start, %entry ], [ %j.iv.next, %for.body ]
; CHECK-NEXT: Cost of 0 for VF 16: EMIT vp<{{.+}}> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-mixed.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-mixed.ll
index 26e630f969ef3..0ee6b52a2450b 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-mixed.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-mixed.ll
@@ -55,44 +55,36 @@ define i32 @sudot(ptr %a, ptr %b) #0 {
; CHECK-NOI8MM-NEXT: entry:
; CHECK-NOI8MM-NEXT: br label [[VECTOR_PH:%.*]]
; CHECK-NOI8MM: vector.ph:
-; CHECK-NOI8MM-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NOI8MM-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 16
-; CHECK-NOI8MM-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
-; CHECK-NOI8MM-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
; CHECK-NOI8MM-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-NOI8MM: vector.body:
; CHECK-NOI8MM-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NOI8MM-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NOI8MM-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP20:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NOI8MM-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NOI8MM-NEXT: [[VEC_PHI1:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ]
; CHECK-NOI8MM-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
-; CHECK-NOI8MM-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NOI8MM-NEXT: [[TMP9:%.*]] = shl nuw i64 [[TMP8]], 3
-; CHECK-NOI8MM-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[TMP6]], i64 [[TMP9]]
-; CHECK-NOI8MM-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP6]], align 1
-; CHECK-NOI8MM-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i8>, ptr [[TMP10]], align 1
-; CHECK-NOI8MM-NEXT: [[TMP11:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32>
-; CHECK-NOI8MM-NEXT: [[TMP7:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32>
+; CHECK-NOI8MM-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[TMP6]], i32 16
+; CHECK-NOI8MM-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP6]], align 1
+; CHECK-NOI8MM-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
+; CHECK-NOI8MM-NEXT: [[TMP2:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-NOI8MM-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
; CHECK-NOI8MM-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
-; CHECK-NOI8MM-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NOI8MM-NEXT: [[TMP16:%.*]] = shl nuw i64 [[TMP15]], 3
-; CHECK-NOI8MM-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[TMP13]], i64 [[TMP16]]
-; CHECK-NOI8MM-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 8 x i8>, ptr [[TMP13]], align 1
-; CHECK-NOI8MM-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 8 x i8>, ptr [[TMP17]], align 1
-; CHECK-NOI8MM-NEXT: [[TMP12:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD3]] to <vscale x 8 x i32>
-; CHECK-NOI8MM-NEXT: [[TMP21:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD4]] to <vscale x 8 x i32>
-; CHECK-NOI8MM-NEXT: [[TMP14:%.*]] = mul <vscale x 8 x i32> [[TMP12]], [[TMP11]]
-; CHECK-NOI8MM-NEXT: [[TMP22:%.*]] = mul <vscale x 8 x i32> [[TMP21]], [[TMP7]]
-; CHECK-NOI8MM-NEXT: [[TMP18]] = add <vscale x 8 x i32> [[TMP14]], [[VEC_PHI]]
-; CHECK-NOI8MM-NEXT: [[TMP20]] = add <vscale x 8 x i32> [[TMP22]], [[VEC_PHI1]]
-; CHECK-NOI8MM-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
-; CHECK-NOI8MM-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NOI8MM-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NOI8MM-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP13]], i32 16
+; CHECK-NOI8MM-NEXT: [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP13]], align 1
+; CHECK-NOI8MM-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
+; CHECK-NOI8MM-NEXT: [[TMP14:%.*]] = sext <16 x i8> [[WIDE_LOAD3]] to <16 x i32>
+; CHECK-NOI8MM-NEXT: [[TMP7:%.*]] = sext <16 x i8> [[WIDE_LOAD4]] to <16 x i32>
+; CHECK-NOI8MM-NEXT: [[TMP8:%.*]] = mul <16 x i32> [[TMP14]], [[TMP2]]
+; CHECK-NOI8MM-NEXT: [[TMP9:%.*]] = mul <16 x i32> [[TMP7]], [[TMP3]]
+; CHECK-NOI8MM-NEXT: [[TMP10]] = add <16 x i32> [[TMP8]], [[VEC_PHI]]
+; CHECK-NOI8MM-NEXT: [[TMP11]] = add <16 x i32> [[TMP9]], [[VEC_PHI1]]
+; CHECK-NOI8MM-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
+; CHECK-NOI8MM-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; CHECK-NOI8MM-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK-NOI8MM: middle.block:
-; CHECK-NOI8MM-NEXT: [[BIN_RDX:%.*]] = add <vscale x 8 x i32> [[TMP20]], [[TMP18]]
-; CHECK-NOI8MM-NEXT: [[TMP19:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> [[BIN_RDX]])
-; CHECK-NOI8MM-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
-; CHECK-NOI8MM-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH:%.*]]
-; CHECK-NOI8MM: scalar.ph:
+; CHECK-NOI8MM-NEXT: [[BIN_RDX:%.*]] = add <16 x i32> [[TMP11]], [[TMP10]]
+; CHECK-NOI8MM-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[BIN_RDX]])
+; CHECK-NOI8MM-NEXT: br label [[FOR_EXIT:%.*]]
+; CHECK-NOI8MM: for.exit:
+; CHECK-NOI8MM-NEXT: ret i32 [[TMP15]]
;
entry:
br label %for.body
@@ -166,44 +158,36 @@ define i32 @usdot(ptr %a, ptr %b) #0 {
; CHECK-NOI8MM-NEXT: entry:
; CHECK-NOI8MM-NEXT: br label [[VECTOR_PH:%.*]]
; CHECK-NOI8MM: vector.ph:
-; CHECK-NOI8MM-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NOI8MM-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 16
-; CHECK-NOI8MM-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
-; CHECK-NOI8MM-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
; CHECK-NOI8MM-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-NOI8MM: vector.body:
; CHECK-NOI8MM-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NOI8MM-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NOI8MM-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP20:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NOI8MM-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NOI8MM-NEXT: [[VEC_PHI1:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ]
; CHECK-NOI8MM-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
-; CHECK-NOI8MM-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NOI8MM-NEXT: [[TMP9:%.*]] = shl nuw i64 [[TMP8]], 3
-; CHECK-NOI8MM-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[TMP6]], i64 [[TMP9]]
-; CHECK-NOI8MM-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP6]], align 1
-; CHECK-NOI8MM-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i8>, ptr [[TMP10]], align 1
-; CHECK-NOI8MM-NEXT: [[TMP11:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32>
-; CHECK-NOI8MM-NEXT: [[TMP7:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32>
+; CHECK-NOI8MM-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[TMP6]], i32 16
+; CHECK-NOI8MM-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP6]], align 1
+; CHECK-NOI8MM-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
+; CHECK-NOI8MM-NEXT: [[TMP2:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-NOI8MM-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
; CHECK-NOI8MM-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
-; CHECK-NOI8MM-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NOI8MM-NEXT: [[TMP16:%.*]] = shl nuw i64 [[TMP15]], 3
-; CHECK-NOI8MM-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[TMP13]], i64 [[TMP16]]
-; CHECK-NOI8MM-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 8 x i8>, ptr [[TMP13]], align 1
-; CHECK-NOI8MM-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 8 x i8>, ptr [[TMP17]], align 1
-; CHECK-NOI8MM-NEXT: [[TMP12:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD3]] to <vscale x 8 x i32>
-; CHECK-NOI8MM-NEXT: [[TMP21:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD4]] to <vscale x 8 x i32>
-; CHECK-NOI8MM-NEXT: [[TMP14:%.*]] = mul <vscale x 8 x i32> [[TMP12]], [[TMP11]]
-; CHECK-NOI8MM-NEXT: [[TMP22:%.*]] = mul <vscale x 8 x i32> [[TMP21]], [[TMP7]]
-; CHECK-NOI8MM-NEXT: [[TMP18]] = add <vscale x 8 x i32> [[TMP14]], [[VEC_PHI]]
-; CHECK-NOI8MM-NEXT: [[TMP20]] = add <vscale x 8 x i32> [[TMP22]], [[VEC_PHI1]]
-; CHECK-NOI8MM-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
-; CHECK-NOI8MM-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NOI8MM-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NOI8MM-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP13]], i32 16
+; CHECK-NOI8MM-NEXT: [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP13]], align 1
+; CHECK-NOI8MM-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
+; CHECK-NOI8MM-NEXT: [[TMP14:%.*]] = zext <16 x i8> [[WIDE_LOAD3]] to <16 x i32>
+; CHECK-NOI8MM-NEXT: [[TMP7:%.*]] = zext <16 x i8> [[WIDE_LOAD4]] to <16 x i32>
+; CHECK-NOI8MM-NEXT: [[TMP8:%.*]] = mul <16 x i32> [[TMP14]], [[TMP2]]
+; CHECK-NOI8MM-NEXT: [[TMP9:%.*]] = mul <16 x i32> [[TMP7]], [[TMP3]]
+; CHECK-NOI8MM-NEXT: [[TMP10]] = add <16 x i32> [[TMP8]], [[VEC_PHI]]
+; CHECK-NOI8MM-NEXT: [[TMP11]] = add <16 x i32> [[TMP9]], [[VEC_PHI1]]
+; CHECK-NOI8MM-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
+; CHECK-NOI8MM-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; CHECK-NOI8MM-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
; CHECK-NOI8MM: middle.block:
-; CHECK-NOI8MM-NEXT: [[BIN_RDX:%.*]] = add <vscale x 8 x i32> [[TMP20]], [[TMP18]]
-; CHECK-NOI8MM-NEXT: [[TMP19:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> [[BIN_RDX]])
-; CHECK-NOI8MM-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
-; CHECK-NOI8MM-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH:%.*]]
-; CHECK-NOI8MM: scalar.ph:
+; CHECK-NOI8MM-NEXT: [[BIN_RDX:%.*]] = add <16 x i32> [[TMP11]], [[TMP10]]
+; CHECK-NOI8MM-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[BIN_RDX]])
+; CHECK-NOI8MM-NEXT: br label [[FOR_EXIT:%.*]]
+; CHECK-NOI8MM: for.exit:
+; CHECK-NOI8MM-NEXT: ret i32 [[TMP15]]
;
entry:
br label %for.body
@@ -292,7 +276,7 @@ define i32 @sudot_neon(ptr %a, ptr %b) #1 {
; CHECK-NOI8MM-NEXT: [[TMP13]] = add <16 x i32> [[TMP11]], [[VEC_PHI1]]
; CHECK-NOI8MM-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
; CHECK-NOI8MM-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; CHECK-NOI8MM-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-NOI8MM-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK-NOI8MM: middle.block:
; CHECK-NOI8MM-NEXT: [[BIN_RDX:%.*]] = add <16 x i32> [[TMP13]], [[TMP12]]
; CHECK-NOI8MM-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[BIN_RDX]])
@@ -387,7 +371,7 @@ define i32 @usdot_neon(ptr %a, ptr %b) #1 {
; CHECK-NOI8MM-NEXT: [[TMP13]] = add <16 x i32> [[TMP11]], [[VEC_PHI1]]
; CHECK-NOI8MM-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
; CHECK-NOI8MM-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; CHECK-NOI8MM-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK-NOI8MM-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
; CHECK-NOI8MM: middle.block:
; CHECK-NOI8MM-NEXT: [[BIN_RDX:%.*]] = add <16 x i32> [[TMP13]], [[TMP12]]
; CHECK-NOI8MM-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[BIN_RDX]])
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll
index b84763142b686..e74830700776c 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll
@@ -512,17 +512,23 @@ define i32 @not_dotp_not_loop_carried(ptr %a, ptr %b) {
; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-INTERLEAVED: vector.body:
; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[VECTOR_RECUR:%.*]] = phi <16 x i32> [ <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0>, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
+; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 16
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
+; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP4]], i32 16
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP4]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP7]] = mul <16 x i32> [[TMP6]], [[TMP3]]
-; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = shufflevector <16 x i32> [[VECTOR_RECUR]], <16 x i32> [[TMP7]], <16 x i32> <i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30>
-; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = add <16 x i32> [[TMP7]], [[TMP8]]
-; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = zext <16 x i8> [[WIDE_LOAD3]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = mul <16 x i32> [[TMP6]], [[TMP3]]
+; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = mul <16 x i32> [[TMP12]], [[TMP8]]
+; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = shufflevector <16 x i32> [[TMP7]], <16 x i32> [[TMP13]], <16 x i32> <i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30>
+; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = add <16 x i32> [[TMP13]], [[TMP14]]
+; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; CHECK-INTERLEAVED-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK-INTERLEAVED: middle.block:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
index 4636c1b63da82..d77ca9875bf01 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
@@ -758,132 +758,87 @@ define i32 @not_dotp_not_loop_carried(ptr %a, ptr %b) #0 {
; CHECK-INTERLEAVE1-NEXT: entry:
; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_PH:%.*]]
; CHECK-INTERLEAVE1: vector.ph:
-; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 8
-; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
-; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = mul nuw i32 [[TMP6]], 8
-; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = sub i32 [[TMP7]], 1
-; CHECK-INTERLEAVE1-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 8 x i32> poison, i32 0, i32 [[TMP8]]
; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-INTERLEAVE1: vector.body:
; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVE1-NEXT: [[VECTOR_RECUR:%.*]] = phi <vscale x 8 x i32> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VECTOR_RECUR:%.*]] = phi <16 x i32> [ <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0>, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP10]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP10]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x i8>, ptr [[TMP13]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP15:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i32>
-; CHECK-INTERLEAVE1-NEXT: [[TMP16]] = mul <vscale x 8 x i32> [[TMP15]], [[TMP12]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = call <vscale x 8 x i32> @llvm.vector.splice.nxv8i32(<vscale x 8 x i32> [[VECTOR_RECUR]], <vscale x 8 x i32> [[TMP16]], i32 -1)
-; CHECK-INTERLEAVE1-NEXT: [[TMP18:%.*]] = add <vscale x 8 x i32> [[TMP16]], [[TMP17]]
-; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP13]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[TMP4]] = mul <16 x i32> [[TMP3]], [[TMP1]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = shufflevector <16 x i32> [[VECTOR_RECUR]], <16 x i32> [[TMP4]], <16 x i32> <i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30>
+; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = add <16 x i32> [[TMP4]], [[TMP5]]
+; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK-INTERLEAVE1: middle.block:
-; CHECK-INTERLEAVE1-NEXT: [[TMP20:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-INTERLEAVE1-NEXT: [[TMP21:%.*]] = mul nuw i32 [[TMP20]], 8
-; CHECK-INTERLEAVE1-NEXT: [[TMP22:%.*]] = sub i32 [[TMP21]], 1
-; CHECK-INTERLEAVE1-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <vscale x 8 x i32> [[TMP16]], i32 [[TMP22]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP24:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-INTERLEAVE1-NEXT: [[TMP25:%.*]] = mul nuw i32 [[TMP24]], 8
-; CHECK-INTERLEAVE1-NEXT: [[TMP26:%.*]] = sub i32 [[TMP25]], 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP27:%.*]] = extractelement <vscale x 8 x i32> [[TMP18]], i32 [[TMP26]]
-; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
-; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH:%.*]]
-; CHECK-INTERLEAVE1: scalar.ph:
+; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = extractelement <16 x i32> [[TMP6]], i32 15
+; CHECK-INTERLEAVE1-NEXT: br label [[FOR_EXIT:%.*]]
+; CHECK-INTERLEAVE1: for.exit:
+; CHECK-INTERLEAVE1-NEXT: ret i32 [[TMP8]]
;
; CHECK-INTERLEAVED-LABEL: define i32 @not_dotp_not_loop_carried(
; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
; CHECK-INTERLEAVED-NEXT: entry:
; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_PH:%.*]]
; CHECK-INTERLEAVED: vector.ph:
-; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 16
-; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
-; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-INTERLEAVED: vector.body:
; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
-; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = shl nuw i64 [[TMP12]], 3
-; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[TMP10]], i64 [[TMP13]]
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP10]], align 1
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x i8>, ptr [[TMP14]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[TMP10]], i32 16
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP10]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
-; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = shl nuw i64 [[TMP19]], 3
-; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[TMP17]], i64 [[TMP20]]
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i8>, ptr [[TMP17]], align 1
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 8 x i8>, ptr [[TMP21]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP23:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD3]] to <vscale x 8 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP24:%.*]] = mul <vscale x 8 x i32> [[TMP22]], [[TMP15]]
-; CHECK-INTERLEAVED-NEXT: [[TMP25:%.*]] = mul <vscale x 8 x i32> [[TMP23]], [[TMP16]]
-; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = call <vscale x 8 x i32> @llvm.vector.splice.nxv8i32(<vscale x 8 x i32> [[TMP24]], <vscale x 8 x i32> [[TMP25]], i32 -1)
-; CHECK-INTERLEAVED-NEXT: [[TMP27:%.*]] = add <vscale x 8 x i32> [[TMP25]], [[TMP26]]
-; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
-; CHECK-INTERLEAVED-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-INTERLEAVED-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP17]], i32 16
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP17]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = zext <16 x i8> [[WIDE_LOAD3]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = mul <16 x i32> [[TMP6]], [[TMP2]]
+; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = mul <16 x i32> [[TMP7]], [[TMP3]]
+; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = shufflevector <16 x i32> [[TMP8]], <16 x i32> [[TMP9]], <16 x i32> <i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30>
+; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = add <16 x i32> [[TMP9]], [[TMP14]]
+; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
+; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; CHECK-INTERLEAVED-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK-INTERLEAVED: middle.block:
-; CHECK-INTERLEAVED-NEXT: [[TMP29:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-INTERLEAVED-NEXT: [[TMP30:%.*]] = mul nuw i32 [[TMP29]], 8
-; CHECK-INTERLEAVED-NEXT: [[TMP31:%.*]] = sub i32 [[TMP30]], 1
-; CHECK-INTERLEAVED-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <vscale x 8 x i32> [[TMP25]], i32 [[TMP31]]
-; CHECK-INTERLEAVED-NEXT: [[TMP33:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-INTERLEAVED-NEXT: [[TMP34:%.*]] = mul nuw i32 [[TMP33]], 8
-; CHECK-INTERLEAVED-NEXT: [[TMP35:%.*]] = sub i32 [[TMP34]], 1
-; CHECK-INTERLEAVED-NEXT: [[TMP32:%.*]] = extractelement <vscale x 8 x i32> [[TMP27]], i32 [[TMP35]]
-; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
-; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH:%.*]]
-; CHECK-INTERLEAVED: scalar.ph:
+; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = extractelement <16 x i32> [[TMP11]], i32 15
+; CHECK-INTERLEAVED-NEXT: br label [[FOR_EXIT:%.*]]
+; CHECK-INTERLEAVED: for.exit:
+; CHECK-INTERLEAVED-NEXT: ret i32 [[TMP13]]
;
; CHECK-MAXBW-LABEL: define i32 @not_dotp_not_loop_carried(
; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
; CHECK-MAXBW-NEXT: entry:
; CHECK-MAXBW-NEXT: br label [[VECTOR_PH:%.*]]
; CHECK-MAXBW: vector.ph:
-; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 8
-; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
-; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
-; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = mul nuw i32 [[TMP6]], 8
-; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = sub i32 [[TMP7]], 1
-; CHECK-MAXBW-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 8 x i32> poison, i32 0, i32 [[TMP8]]
; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-MAXBW: vector.body:
; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-MAXBW-NEXT: [[VECTOR_RECUR:%.*]] = phi <vscale x 8 x i32> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[TMP25:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[VECTOR_RECUR:%.*]] = phi <16 x i32> [ <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0>, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
-; CHECK-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x i8>, ptr [[TMP10]], align 1
-; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i32>
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP10]], align 1
+; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
; CHECK-MAXBW-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
-; CHECK-MAXBW-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 8 x i8>, ptr [[TMP17]], align 1
-; CHECK-MAXBW-NEXT: [[TMP23:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD3]] to <vscale x 8 x i32>
-; CHECK-MAXBW-NEXT: [[TMP25]] = mul <vscale x 8 x i32> [[TMP23]], [[TMP16]]
-; CHECK-MAXBW-NEXT: [[TMP26:%.*]] = call <vscale x 8 x i32> @llvm.vector.splice.nxv8i32(<vscale x 8 x i32> [[VECTOR_RECUR]], <vscale x 8 x i32> [[TMP25]], i32 -1)
-; CHECK-MAXBW-NEXT: [[TMP27:%.*]] = add <vscale x 8 x i32> [[TMP25]], [[TMP26]]
-; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
-; CHECK-MAXBW-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-MAXBW-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP17]], align 1
+; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
+; CHECK-MAXBW-NEXT: [[TMP4]] = mul <16 x i32> [[TMP3]], [[TMP1]]
+; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = shufflevector <16 x i32> [[VECTOR_RECUR]], <16 x i32> [[TMP4]], <16 x i32> <i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30>
+; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = add <16 x i32> [[TMP4]], [[TMP5]]
+; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; CHECK-MAXBW-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
; CHECK-MAXBW: middle.block:
-; CHECK-MAXBW-NEXT: [[TMP20:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-MAXBW-NEXT: [[TMP21:%.*]] = mul nuw i32 [[TMP20]], 8
-; CHECK-MAXBW-NEXT: [[TMP22:%.*]] = sub i32 [[TMP21]], 1
-; CHECK-MAXBW-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <vscale x 8 x i32> [[TMP25]], i32 [[TMP22]]
-; CHECK-MAXBW-NEXT: [[TMP24:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-MAXBW-NEXT: [[TMP30:%.*]] = mul nuw i32 [[TMP24]], 8
-; CHECK-MAXBW-NEXT: [[TMP31:%.*]] = sub i32 [[TMP30]], 1
-; CHECK-MAXBW-NEXT: [[TMP29:%.*]] = extractelement <vscale x 8 x i32> [[TMP27]], i32 [[TMP31]]
-; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
-; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH:%.*]]
-; CHECK-MAXBW: scalar.ph:
+; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = extractelement <16 x i32> [[TMP6]], i32 15
+; CHECK-MAXBW-NEXT: br label [[FOR_EXIT:%.*]]
+; CHECK-MAXBW: for.exit:
+; CHECK-MAXBW-NEXT: ret i32 [[TMP8]]
;
entry:
br label %for.body
@@ -930,7 +885,7 @@ define i32 @not_dotp_not_phi(ptr %a, ptr %b) #0 {
; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = add <vscale x 8 x i32> [[TMP16]], [[TMP15]]
; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
; CHECK-INTERLEAVE1-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
; CHECK-INTERLEAVE1: middle.block:
; CHECK-INTERLEAVE1-NEXT: [[TMP23:%.*]] = call i32 @llvm.vscale.i32()
; CHECK-INTERLEAVE1-NEXT: [[TMP24:%.*]] = mul nuw i32 [[TMP23]], 8
@@ -968,7 +923,7 @@ define i32 @not_dotp_not_phi(ptr %a, ptr %b) #0 {
; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = add <vscale x 8 x i32> [[TMP30]], [[TMP22]]
; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
; CHECK-INTERLEAVED-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-INTERLEAVED-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
; CHECK-INTERLEAVED: middle.block:
; CHECK-INTERLEAVED-NEXT: [[TMP27:%.*]] = call i32 @llvm.vscale.i32()
; CHECK-INTERLEAVED-NEXT: [[TMP28:%.*]] = mul nuw i32 [[TMP27]], 8
@@ -1000,7 +955,7 @@ define i32 @not_dotp_not_phi(ptr %a, ptr %b) #0 {
; CHECK-MAXBW-NEXT: [[TMP21:%.*]] = add <vscale x 8 x i32> [[TMP20]], [[TMP19]]
; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
; CHECK-MAXBW-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-MAXBW-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK-MAXBW-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; CHECK-MAXBW: middle.block:
; CHECK-MAXBW-NEXT: [[TMP23:%.*]] = call i32 @llvm.vscale.i32()
; CHECK-MAXBW-NEXT: [[TMP24:%.*]] = mul nuw i32 [[TMP23]], 8
@@ -1085,7 +1040,7 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) #0 {
; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE13]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP22]])
; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-INTERLEAVE1-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
; CHECK-INTERLEAVE1: middle.block:
; CHECK-INTERLEAVE1-NEXT: [[TMP24:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE13]])
; CHECK-INTERLEAVE1-NEXT: [[TMP25:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE10]])
@@ -1183,7 +1138,7 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) #0 {
; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE29]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP42]])
; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
; CHECK-INTERLEAVED-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-INTERLEAVED-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
; CHECK-INTERLEAVED: middle.block:
; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[PARTIAL_REDUCE29]], [[PARTIAL_REDUCE28]]
; CHECK-INTERLEAVED-NEXT: [[TMP44:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]])
@@ -1253,7 +1208,7 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) #0 {
; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE13]] = call <vscale x 4 x i32> @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[TMP26]])
; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
; CHECK-MAXBW-NEXT: [[TMP74:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-MAXBW-NEXT: br i1 [[TMP74]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
+; CHECK-MAXBW-NEXT: br i1 [[TMP74]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; CHECK-MAXBW: middle.block:
; CHECK-MAXBW-NEXT: [[TMP28:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE13]])
; CHECK-MAXBW-NEXT: [[TMP29:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE10]])
@@ -1350,7 +1305,7 @@ define i32 @dotp_predicated(i64 %N, ptr %a, ptr %b) #0 {
; CHECK-INTERLEAVE1-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP9]])
; CHECK-INTERLEAVE1-NEXT: [[TMP20:%.*]] = extractelement <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0
; CHECK-INTERLEAVE1-NEXT: [[TMP21:%.*]] = xor i1 [[TMP20]], true
-; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
; CHECK-INTERLEAVE1: middle.block:
; CHECK-INTERLEAVE1-NEXT: [[TMP22:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP19]])
; CHECK-INTERLEAVE1-NEXT: br label [[EXIT:%.*]]
@@ -1388,7 +1343,7 @@ define i32 @dotp_predicated(i64 %N, ptr %a, ptr %b) #0 {
; CHECK-INTERLEAVED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP9]])
; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = extractelement <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0
; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = xor i1 [[TMP20]], true
-; CHECK-INTERLEAVED-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
; CHECK-INTERLEAVED: middle.block:
; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP19]])
; CHECK-INTERLEAVED-NEXT: br label [[EXIT:%.*]]
@@ -1426,7 +1381,7 @@ define i32 @dotp_predicated(i64 %N, ptr %a, ptr %b) #0 {
; CHECK-MAXBW-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX]], i64 [[TMP9]])
; CHECK-MAXBW-NEXT: [[TMP19:%.*]] = extractelement <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0
; CHECK-MAXBW-NEXT: [[TMP20:%.*]] = xor i1 [[TMP19]], true
-; CHECK-MAXBW-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
+; CHECK-MAXBW-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
; CHECK-MAXBW: middle.block:
; CHECK-MAXBW-NEXT: [[TMP21:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE]])
; CHECK-MAXBW-NEXT: br label [[EXIT:%.*]]
@@ -1461,82 +1416,66 @@ define i32 @not_dotp_extend_user(ptr %a, ptr %b) #0 {
; CHECK-INTERLEAVE1-NEXT: entry:
; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_PH:%.*]]
; CHECK-INTERLEAVE1: vector.ph:
-; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = mul nuw i64 [[TMP8]], 4
-; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP10]]
-; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-INTERLEAVE1: vector.body:
; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP3]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i8>, ptr [[TMP6]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD1]] to <vscale x 4 x i32>
-; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = mul <vscale x 4 x i32> [[TMP12]], [[TMP9]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP14]] = add <vscale x 4 x i32> [[TMP13]], [[VEC_PHI]]
-; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP6]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = mul <16 x i32> [[TMP9]], [[TMP1]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP5]] = add <16 x i32> [[TMP4]], [[VEC_PHI]]
+; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
; CHECK-INTERLEAVE1: middle.block:
-; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP14]])
-; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-INTERLEAVE1-NEXT: [[TMP18:%.*]] = mul nuw i32 [[TMP17]], 4
-; CHECK-INTERLEAVE1-NEXT: [[TMP19:%.*]] = sub i32 [[TMP18]], 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP20:%.*]] = extractelement <vscale x 4 x i32> [[TMP12]], i32 [[TMP19]]
-; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
-; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH:%.*]]
-; CHECK-INTERLEAVE1: scalar.ph:
+; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP5]])
+; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = extractelement <16 x i32> [[TMP9]], i32 15
+; CHECK-INTERLEAVE1-NEXT: br label [[FOR_EXIT:%.*]]
+; CHECK-INTERLEAVE1: for.exit:
+; CHECK-INTERLEAVE1-NEXT: [[RESULT:%.*]] = add i32 [[TMP7]], [[TMP8]]
+; CHECK-INTERLEAVE1-NEXT: ret i32 [[RESULT]]
;
; CHECK-INTERLEAVED-LABEL: define i32 @not_dotp_extend_user(
; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
; CHECK-INTERLEAVED-NEXT: entry:
; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_PH:%.*]]
; CHECK-INTERLEAVED: vector.ph:
-; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = mul nuw i64 [[TMP6]], 8
-; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP7]]
-; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-INTERLEAVED: vector.body:
; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP23:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP24:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
-; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = shl nuw i64 [[TMP15]], 2
-; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP10]]
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP3]], align 1
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 4 x i8>, ptr [[TMP11]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD2]] to <vscale x 4 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[TMP3]], i32 16
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
-; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = shl nuw i64 [[TMP16]], 2
-; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[TMP8]], i64 [[TMP17]]
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 4 x i8>, ptr [[TMP8]], align 1
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 4 x i8>, ptr [[TMP18]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD3]] to <vscale x 4 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD4]] to <vscale x 4 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = mul <vscale x 4 x i32> [[TMP19]], [[TMP12]]
-; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = mul <vscale x 4 x i32> [[TMP20]], [[TMP13]]
-; CHECK-INTERLEAVED-NEXT: [[TMP23]] = add <vscale x 4 x i32> [[TMP21]], [[VEC_PHI]]
-; CHECK-INTERLEAVED-NEXT: [[TMP24]] = add <vscale x 4 x i32> [[TMP22]], [[VEC_PHI1]]
-; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP7]]
-; CHECK-INTERLEAVED-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-INTERLEAVED-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP8]], i32 16
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD3]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = zext <16 x i8> [[WIDE_LOAD4]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = mul <16 x i32> [[TMP6]], [[TMP2]]
+; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = mul <16 x i32> [[TMP7]], [[TMP4]]
+; CHECK-INTERLEAVED-NEXT: [[TMP10]] = add <16 x i32> [[TMP15]], [[VEC_PHI]]
+; CHECK-INTERLEAVED-NEXT: [[TMP11]] = add <16 x i32> [[TMP9]], [[VEC_PHI1]]
+; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
+; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; CHECK-INTERLEAVED-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
; CHECK-INTERLEAVED: middle.block:
-; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <vscale x 4 x i32> [[TMP24]], [[TMP23]]
-; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[BIN_RDX]])
-; CHECK-INTERLEAVED-NEXT: [[TMP27:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-INTERLEAVED-NEXT: [[TMP28:%.*]] = mul nuw i32 [[TMP27]], 4
-; CHECK-INTERLEAVED-NEXT: [[TMP29:%.*]] = sub i32 [[TMP28]], 1
-; CHECK-INTERLEAVED-NEXT: [[TMP30:%.*]] = extractelement <vscale x 4 x i32> [[TMP20]], i32 [[TMP29]]
-; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
-; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH:%.*]]
-; CHECK-INTERLEAVED: scalar.ph:
+; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <16 x i32> [[TMP11]], [[TMP10]]
+; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[BIN_RDX]])
+; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = extractelement <16 x i32> [[TMP7]], i32 15
+; CHECK-INTERLEAVED-NEXT: br label [[FOR_EXIT:%.*]]
+; CHECK-INTERLEAVED: for.exit:
+; CHECK-INTERLEAVED-NEXT: [[RESULT:%.*]] = add i32 [[TMP13]], [[TMP14]]
+; CHECK-INTERLEAVED-NEXT: ret i32 [[RESULT]]
;
; CHECK-MAXBW-LABEL: define i32 @not_dotp_extend_user(
; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
@@ -1561,7 +1500,7 @@ define i32 @not_dotp_extend_user(ptr %a, ptr %b) #0 {
; CHECK-MAXBW-NEXT: [[TMP24]] = add <vscale x 8 x i32> [[TMP22]], [[VEC_PHI1]]
; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
; CHECK-MAXBW-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-MAXBW-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
+; CHECK-MAXBW-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
; CHECK-MAXBW: middle.block:
; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> [[TMP24]])
; CHECK-MAXBW-NEXT: [[TMP17:%.*]] = call i32 @llvm.vscale.i32()
@@ -1616,7 +1555,7 @@ define i64 @dotp_cost_disagreement(ptr %a, ptr %b) #0 {
; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v16i64(<2 x i64> [[VEC_PHI]], <16 x i64> [[TMP5]])
; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 32
-; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
+; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
; CHECK-INTERLEAVE1: middle.block:
; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[PARTIAL_REDUCE]])
; CHECK-INTERLEAVE1-NEXT: br label [[SCALAR_PH:%.*]]
@@ -1651,7 +1590,7 @@ define i64 @dotp_cost_disagreement(ptr %a, ptr %b) #0 {
; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE5]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v16i64(<2 x i64> [[VEC_PHI1]], <16 x i64> [[TMP10]])
; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 32
-; CHECK-INTERLEAVED-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
; CHECK-INTERLEAVED: middle.block:
; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <2 x i64> [[PARTIAL_REDUCE5]], [[PARTIAL_REDUCE]]
; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[BIN_RDX]])
@@ -1685,7 +1624,7 @@ define i64 @dotp_cost_disagreement(ptr %a, ptr %b) #0 {
; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call <vscale x 2 x i64> @llvm.vector.partial.reduce.add.nxv2i64.nxv16i64(<vscale x 2 x i64> [[VEC_PHI]], <vscale x 16 x i64> [[TMP9]])
; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
; CHECK-MAXBW-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-MAXBW-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
+; CHECK-MAXBW-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
; CHECK-MAXBW: middle.block:
; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> [[PARTIAL_REDUCE]])
; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 41, [[N_VEC]]
@@ -1803,7 +1742,7 @@ define void @not_dotp_not_phi2(ptr %matrix, i32 %n) #0 {
; CHECK-INTERLEAVED-NEXT: [[TMP23]] = add i32 [[TMP21]], [[TMP15]]
; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; CHECK-INTERLEAVED-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-INTERLEAVED-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
; CHECK-INTERLEAVED: middle.block:
; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add i32 [[TMP23]], [[TMP22]]
; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
@@ -1915,7 +1854,7 @@ define i64 @not_dotp_ext_outside_plan(ptr %a, i16 %b, i64 %n) #0 {
; CHECK-INTERLEAVE1-NEXT: [[TMP5]] = add <8 x i64> [[TMP4]], [[VEC_PHI]]
; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
+;
CHECK-INTERLEAVE1-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] ; CHECK-INTERLEAVE1: middle.block: ; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP5]]) ; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] @@ -1953,7 +1892,7 @@ define i64 @not_dotp_ext_outside_plan(ptr %a, i16 %b, i64 %n) #0 { ; CHECK-INTERLEAVED-NEXT: [[TMP9]] = add <8 x i64> [[TMP7]], [[VEC_PHI1]] ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVED-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] ; CHECK-INTERLEAVED: middle.block: ; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <8 x i64> [[TMP9]], [[TMP8]] ; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[BIN_RDX]]) @@ -1968,31 +1907,27 @@ define i64 @not_dotp_ext_outside_plan(ptr %a, i16 %b, i64 %n) #0 { ; CHECK-MAXBW-NEXT: br i1 [[CMP]], label [[EXIT:%.*]], label [[FOR_PH:%.*]] ; CHECK-MAXBW: for.ph: ; CHECK-MAXBW-NEXT: [[EXT_B:%.*]] = zext i16 [[B]] to i64 -; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2 -; CHECK-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]] +; CHECK-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 8 ; CHECK-MAXBW-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK-MAXBW: vector.ph: -; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4 -; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] +; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 8 ; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] -; CHECK-MAXBW-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[EXT_B]], i64 0 -; CHECK-MAXBW-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer +; CHECK-MAXBW-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i64> poison, i64 [[EXT_B]], i64 0 +; CHECK-MAXBW-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT]], <8 x i64> poison, <8 x i32> zeroinitializer ; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-MAXBW: vector.body: ; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <8 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ] ; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i16, ptr [[A]], i64 [[INDEX]] -; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i16>, ptr [[TMP7]], align 2 -; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = zext <vscale x 4 x i16> [[WIDE_LOAD]] to <vscale x 4 x i64> -; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = mul nuw nsw <vscale x 4 x i64> [[TMP9]], [[BROADCAST_SPLAT]] -; CHECK-MAXBW-NEXT: [[TMP11]] = add <vscale x 4 x i64> [[TMP10]], [[VEC_PHI]] -; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] 
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[TMP7]], align 2 +; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = zext <8 x i16> [[WIDE_LOAD]] to <8 x i64> +; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = mul nuw nsw <8 x i64> [[TMP1]], [[BROADCAST_SPLAT]] +; CHECK-MAXBW-NEXT: [[TMP3]] = add <8 x i64> [[TMP2]], [[VEC_PHI]] +; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-MAXBW-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]] +; CHECK-MAXBW-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] ; CHECK-MAXBW: middle.block: -; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = call i64 @llvm.vector.reduce.add.nxv4i64(<vscale x 4 x i64> [[TMP11]]) +; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP3]]) ; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK-MAXBW: scalar.ph: @@ -2048,7 +1983,7 @@ define i64 @not_dotp_ext_outside_plan2(ptr %a, i16 %b, i64 %n) #0 { ; CHECK-INTERLEAVE1-NEXT: [[TMP5]] = add <8 x i64> [[TMP4]], [[VEC_PHI]] ; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] ; CHECK-INTERLEAVE1: middle.block: ; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP5]]) ; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] @@ -2086,7 +2021,7 @@ define i64 @not_dotp_ext_outside_plan2(ptr %a, i16 %b, i64 %n) #0 { ; CHECK-INTERLEAVED-NEXT: [[TMP9]] = add <8 x i64> [[TMP7]], [[VEC_PHI1]] ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVED-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] ; CHECK-INTERLEAVED: middle.block: ; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <8 x i64> [[TMP9]], [[TMP8]] ; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[BIN_RDX]]) @@ -2101,31 +2036,27 @@ define i64 @not_dotp_ext_outside_plan2(ptr %a, i16 %b, i64 %n) #0 { ; CHECK-MAXBW-NEXT: br i1 [[CMP]], label [[EXIT:%.*]], label [[FOR_PH:%.*]] ; CHECK-MAXBW: for.ph: ; CHECK-MAXBW-NEXT: [[EXT_B:%.*]] = zext i16 [[B]] to i64 -; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2 -; CHECK-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]] +; CHECK-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 8 ; CHECK-MAXBW-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK-MAXBW: vector.ph: -; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4 -; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] +; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 8 ; CHECK-MAXBW-NEXT: 
[[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] -; CHECK-MAXBW-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[EXT_B]], i64 0 -; CHECK-MAXBW-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer +; CHECK-MAXBW-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i64> poison, i64 [[EXT_B]], i64 0 +; CHECK-MAXBW-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT]], <8 x i64> poison, <8 x i32> zeroinitializer ; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-MAXBW: vector.body: ; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <8 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ] ; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i16, ptr [[A]], i64 [[INDEX]] -; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i16>, ptr [[TMP7]], align 2 -; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = zext <vscale x 4 x i16> [[WIDE_LOAD]] to <vscale x 4 x i64> -; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = mul nuw nsw <vscale x 4 x i64> [[BROADCAST_SPLAT]], [[TMP9]] -; CHECK-MAXBW-NEXT: [[TMP11]] = add <vscale x 4 x i64> [[TMP10]], [[VEC_PHI]] -; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] +; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[TMP7]], align 2 +; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = zext <8 x i16> [[WIDE_LOAD]] to <8 x i64> +; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = mul nuw nsw <8 x i64> [[BROADCAST_SPLAT]], [[TMP1]] +; CHECK-MAXBW-NEXT: [[TMP3]] = add <8 x i64> [[TMP2]], [[VEC_PHI]] +; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-MAXBW-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]] +; CHECK-MAXBW-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] ; CHECK-MAXBW: middle.block: -; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = call i64 @llvm.vector.reduce.add.nxv4i64(<vscale x 4 x i64> [[TMP11]]) +; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP3]]) ; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK-MAXBW: scalar.ph: @@ -2186,7 +2117,7 @@ define dso_local i32 @not_dotp_vscale1(ptr %a, ptr %b, i32 %n, i64 %cost) #0 { ; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v16i64(<2 x i64> [[VEC_PHI]], <16 x i64> [[TMP10]]) ; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-INTERLEAVE1-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] ; CHECK-INTERLEAVE1: middle.block: ; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[PARTIAL_REDUCE]]) ; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] @@ -2232,7 +2163,7 @@ define dso_local i32 
@not_dotp_vscale1(ptr %a, ptr %b, i32 %n, i64 %cost) #0 { ; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE6]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v16i64(<2 x i64> [[VEC_PHI1]], <16 x i64> [[TMP12]]) ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 ; CHECK-INTERLEAVED-NEXT: [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVED-NEXT: br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]] +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] ; CHECK-INTERLEAVED: middle.block: ; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <2 x i64> [[PARTIAL_REDUCE6]], [[PARTIAL_REDUCE]] ; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[BIN_RDX]]) @@ -2274,7 +2205,7 @@ define dso_local i32 @not_dotp_vscale1(ptr %a, ptr %b, i32 %n, i64 %cost) #0 { ; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call <vscale x 2 x i64> @llvm.vector.partial.reduce.add.nxv2i64.nxv16i64(<vscale x 2 x i64> [[VEC_PHI]], <vscale x 16 x i64> [[TMP11]]) ; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]] ; CHECK-MAXBW-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-MAXBW-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]] +; CHECK-MAXBW-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] ; CHECK-MAXBW: middle.block: ; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> [[PARTIAL_REDUCE]]) ; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] @@ -2396,7 +2327,7 @@ define dso_local void @not_dotp_high_register_pressure(ptr %a, ptr %b, ptr %sum, ; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE21]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP34]]) ; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-INTERLEAVE1-NEXT: [[TMP37:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP37]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP37]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] ; CHECK-INTERLEAVE1: middle.block: ; CHECK-INTERLEAVE1-NEXT: [[TMP36:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE21]]) ; CHECK-INTERLEAVE1-NEXT: [[TMP45:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE20]]) @@ -2496,7 +2427,7 @@ define dso_local void @not_dotp_high_register_pressure(ptr %a, ptr %b, ptr %sum, ; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE21]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP34]]) ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-INTERLEAVED-NEXT: [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVED-NEXT: br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]] +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]] ; CHECK-INTERLEAVED: middle.block: ; CHECK-INTERLEAVED-NEXT: [[TMP36:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE21]]) ; CHECK-INTERLEAVED-NEXT: [[TMP37:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE20]]) 
@@ -2596,7 +2527,7 @@ define dso_local void @not_dotp_high_register_pressure(ptr %a, ptr %b, ptr %sum, ; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE21]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP34]]) ; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-MAXBW-NEXT: [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-MAXBW-NEXT: br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]] +; CHECK-MAXBW-NEXT: br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]] ; CHECK-MAXBW: middle.block: ; CHECK-MAXBW-NEXT: [[TMP36:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE21]]) ; CHECK-MAXBW-NEXT: [[TMP37:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE20]]) diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/vecreduceadd.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/vecreduceadd.ll index c1a87f0c5f907..577efcbbac012 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/vecreduceadd.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/vecreduceadd.ll @@ -930,7 +930,7 @@ entry: ; COST-LABEL: Function: mla_v8i8_i32 -; COST: Cost: '-18' +; COST: Cost: '-24' define i32 @mla_v8i8_i32(ptr %x, ptr %y) "target-features"="+dotprod" { ; CHECK-LABEL: @mla_v8i8_i32( ; CHECK-NEXT: entry: @@ -1009,7 +1009,7 @@ entry: ; COST-LABEL: Function: mla_v16i8_i32 -; COST: Cost: '-40' +; COST: Cost: '-52' define i32 @mla_v16i8_i32(ptr %x, ptr %y) "target-features"="+dotprod" { ; CHECK-LABEL: @mla_v16i8_i32( ; CHECK-NEXT: entry: From f74e90961f51c9437461007c89b037be41e4e887 Mon Sep 17 00:00:00 2001 From: Maya Amrami <62667278+amrami@users.noreply.github.com> Date: Tue, 4 Nov 2025 10:36:11 +0200 Subject: [PATCH 131/313] [mlir][memref]: Collapse strided unit dim even if strides are dynamic (#157330) --- mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp | 10 +++++----- mlir/test/Dialect/MemRef/ops.mlir | 7 ++++++- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp b/mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp index 1c21a2f270da6..e271ac58db327 100644 --- a/mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp +++ b/mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp @@ -2568,6 +2568,11 @@ computeCollapsedLayoutMap(MemRefType srcType, auto trailingReassocs = ArrayRef<int64_t>(reassoc).drop_front(); auto stride = SaturatedInteger::wrap(resultStrides[resultStrideIndex--]); for (int64_t idx : llvm::reverse(trailingReassocs)) { + // Dimensions of size 1 should be skipped, because their strides are + // meaningless and could have any arbitrary value. + if (srcShape[idx - 1] == 1) + continue; + stride = stride * SaturatedInteger::wrap(srcShape[idx]); // Both source and result stride must have the same static value. In that @@ -2582,11 +2587,6 @@ computeCollapsedLayoutMap(MemRefType srcType, if (strict && (stride.saturated || srcStride.saturated)) return failure(); - // Dimensions of size 1 should be skipped, because their strides are - // meaningless and could have any arbitrary value. 
-    if (srcShape[idx - 1] == 1)
-      continue;
-
     if (!stride.saturated && !srcStride.saturated && stride != srcStride)
       return failure();
   }
diff --git a/mlir/test/Dialect/MemRef/ops.mlir b/mlir/test/Dialect/MemRef/ops.mlir
index a90c9505a8405..b1db99bb3ad08 100644
--- a/mlir/test/Dialect/MemRef/ops.mlir
+++ b/mlir/test/Dialect/MemRef/ops.mlir
@@ -440,7 +440,8 @@ func.func @expand_collapse_shape_dynamic(%arg0: memref<?x?x?xf32>,
                                          %arg4: index,
                                          %arg5: index,
                                          %arg6: index,
-                                         %arg7: memref<4x?x4xf32>) {
+                                         %arg7: memref<4x?x4xf32>,
+                                         %arg8: memref<1x1x18x?xsi8, strided<[?, ?, ?, 1], offset: ?>>) {
 // CHECK: memref.collapse_shape {{.*}} {{\[}}[0, 1], [2]]
 // CHECK-SAME: memref<?x?x?xf32> into memref<?x?xf32>
   %0 = memref.collapse_shape %arg0 [[0, 1], [2]] :
@@ -489,6 +490,10 @@ func.func @expand_collapse_shape_dynamic(%arg0: memref<?x?x?xf32>,
 // CHECK: memref.expand_shape {{.*}} {{\[}}[0, 1], [2], [3, 4]]
   %4 = memref.expand_shape %arg7 [[0, 1], [2], [3, 4]] output_shape [2, 2, %arg4, 2, 2] : memref<4x?x4xf32> into memref<2x2x?x2x2xf32>
+
+// CHECK: memref.collapse_shape {{.*}} {{\[}}[0, 1], [2], [3]]
+// CHECK-SAME: memref<1x1x18x?xsi8, strided<[?, ?, ?, 1], offset: ?>> into memref<1x18x?xsi8, strided<[?, ?, 1], offset: ?>>
+  %5 = memref.collapse_shape %arg8 [[0, 1], [2], [3]] : memref<1x1x18x?xsi8, strided<[?, ?, ?, 1], offset: ?>> into memref<1x18x?xsi8, strided<[?, ?, 1], offset: ?>>
   return
 }

From c02bdd466a1c22221bc6de3b6817945c90979351 Mon Sep 17 00:00:00 2001
From: Robert Imschweiler <robert.imschweiler@amd.com>
Date: Tue, 4 Nov 2025 10:22:13 +0100
Subject: [PATCH 132/313] [AMDGPU] Fix handling of FP in cs.chain functions
 (#161194)

In case there is a dynamic alloca or an alloca that is not in the entry
block, cs.chain functions do not set up an FP but are reported to need
one. This results in a failed assertion in
`SIFrameLowering::emitPrologue()` (Assertion `(!HasFP || FPSaved) &&
"Needed to save FP but didn't save it anywhere"' failed.)

This commit changes `hasFPImpl` so that the need for an SP in a
cs.chain function no longer directly implies the need for an FP.

This LLVM defect was identified via the AMD Fuzzing project.
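For illustration, here is a minimal IR sketch of the failure mode, along the lines of the `test_alloca` case added below (function and label names are illustrative): the alloca outside the entry block makes the function need a stack pointer, which `hasFPImpl` previously took to imply a frame pointer as well.

```llvm
; Sketch: an amdgpu_cs_chain function whose only alloca sits outside the
; entry block. An SP is required, but no FP must be reported, because
; cs.chain functions never save one.
define amdgpu_cs_chain void @nonentry_alloca() {
entry:
  br label %body

body:                                   ; alloca is not in the entry block
  %v = alloca i32, align 4, addrspace(5)
  store i32 0, ptr addrspace(5) %v, align 4
  ret void
}
```

The `test_alloca*` functions in the new test file below exercise exactly this shape, plus variants with calls and dynamically sized allocas.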
--- llvm/lib/Target/AMDGPU/SIFrameLowering.cpp | 4 +- .../AMDGPU/amdgpu-cs-chain-fp-nosave.ll | 519 ++++++++++++++++++ 2 files changed, 522 insertions(+), 1 deletion(-) create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-fp-nosave.ll diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp index 5c39f7a3d6daa..aa5ea77f17291 100644 --- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -2170,7 +2170,9 @@ bool SIFrameLowering::hasFPImpl(const MachineFunction &MF) const { return MFI.getStackSize() != 0; } - return frameTriviallyRequiresSP(MFI) || MFI.isFrameAddressTaken() || + return (frameTriviallyRequiresSP(MFI) && + !MF.getInfo<SIMachineFunctionInfo>()->isChainFunction()) || + MFI.isFrameAddressTaken() || MF.getSubtarget<GCNSubtarget>().getRegisterInfo()->hasStackRealignment( MF) || mayReserveScratchForCWSR(MF) || diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-fp-nosave.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-fp-nosave.ll new file mode 100644 index 0000000000000..06150e4277e9a --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-fp-nosave.ll @@ -0,0 +1,519 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s 2>&1 | FileCheck -check-prefix=GFX12 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s 2>&1 | FileCheck -check-prefix=GFX942 %s + +; These situations are "special" in that they either have an alloca that is not +; in the entry block or that they have a dynamic alloca. Both situations affect +; prolog/epilog generation. + +declare amdgpu_gfx void @foo() + +define amdgpu_cs_chain void @test_alloca() { +; GFX12-LABEL: test_alloca: +; GFX12: ; %bb.0: ; %.entry +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-NEXT: s_mov_b32 s32, 16 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_mov_b32 s0, s32 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_add_co_i32 s32, s0, 0x200 +; GFX12-NEXT: scratch_store_b32 off, v0, s0 +; GFX12-NEXT: s_endpgm +; +; GFX942-LABEL: test_alloca: +; GFX942: ; %bb.0: ; %.entry +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_mov_b32 s32, 16 +; GFX942-NEXT: s_mov_b32 s0, s32 +; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: s_add_i32 s32, s0, 0x400 +; GFX942-NEXT: scratch_store_dword off, v0, s0 +; GFX942-NEXT: s_endpgm +.entry: + br label %SW_C + +SW_C: ; preds = %.entry + %v = alloca i32, i32 1, align 4, addrspace(5) + store i32 0, ptr addrspace(5) %v, align 4 + ret void +} + +define amdgpu_cs_chain void @test_alloca_var_uniform(i32 inreg %count) { +; GFX12-LABEL: test_alloca_var_uniform: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b32 s0, s0, 2 +; GFX12-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_add_co_i32 s0, s0, 15 +; GFX12-NEXT: s_mov_b32 s32, 16 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_b32 s0, s0, -16 +; GFX12-NEXT: s_mov_b32 s1, s32 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_lshl_b32 s0, s0, 5 +; GFX12-NEXT: scratch_store_b32 off, v0, s1 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_add_co_i32 s32, s1, s0 +; GFX12-NEXT: s_endpgm +; 
+; GFX942-LABEL: test_alloca_var_uniform: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_lshl_b32 s0, s0, 2 +; GFX942-NEXT: s_add_i32 s0, s0, 15 +; GFX942-NEXT: s_mov_b32 s32, 16 +; GFX942-NEXT: s_and_b32 s0, s0, -16 +; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: s_lshl_b32 s0, s0, 6 +; GFX942-NEXT: s_mov_b32 s1, s32 +; GFX942-NEXT: s_add_i32 s32, s1, s0 +; GFX942-NEXT: scratch_store_dword off, v0, s1 +; GFX942-NEXT: s_endpgm + %v = alloca i32, i32 %count, align 4, addrspace(5) + store i32 0, ptr addrspace(5) %v, align 4 + ret void +} + +define amdgpu_cs_chain void @test_alloca_var(i32 %count) { +; GFX12-LABEL: test_alloca_var: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_lshl_add_u32 v0, v8, 2, 15 +; GFX12-NEXT: s_mov_b32 s1, exec_lo +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: s_mov_b32 s32, 16 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_and_b32 v1, -16, v0 +; GFX12-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_ctz_i32_b32 s2, s1 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_readlane_b32 s3, v1, s2 +; GFX12-NEXT: s_bitset0_b32 s1, s2 +; GFX12-NEXT: s_max_u32 s0, s0, s3 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u32 s1, 0 +; GFX12-NEXT: s_cbranch_scc1 .LBB2_1 +; GFX12-NEXT: ; %bb.2: +; GFX12-NEXT: s_mov_b32 s1, s32 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_lshl_add_u32 v1, s0, 5, s1 +; GFX12-NEXT: scratch_store_b32 off, v0, s1 +; GFX12-NEXT: v_readfirstlane_b32 s32, v1 +; GFX12-NEXT: s_endpgm +; +; GFX942-LABEL: test_alloca_var: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_lshl_add_u32 v0, v8, 2, 15 +; GFX942-NEXT: v_and_b32_e32 v1, -16, v0 +; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: s_mov_b64 s[0:1], exec +; GFX942-NEXT: s_mov_b32 s2, 0 +; GFX942-NEXT: s_mov_b32 s32, 16 +; GFX942-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_ff1_i32_b64 s3, s[0:1] +; GFX942-NEXT: v_readlane_b32 s4, v1, s3 +; GFX942-NEXT: s_bitset0_b64 s[0:1], s3 +; GFX942-NEXT: s_max_u32 s2, s2, s4 +; GFX942-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX942-NEXT: s_cbranch_scc1 .LBB2_1 +; GFX942-NEXT: ; %bb.2: +; GFX942-NEXT: s_mov_b32 s0, s32 +; GFX942-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-NEXT: v_lshl_add_u32 v1, s2, 6, v1 +; GFX942-NEXT: scratch_store_dword off, v0, s0 +; GFX942-NEXT: v_readfirstlane_b32 s32, v1 +; GFX942-NEXT: s_endpgm + %v = alloca i32, i32 %count, align 4, addrspace(5) + store i32 0, ptr addrspace(5) %v, align 4 + ret void +} + +define amdgpu_cs_chain void @test_alloca_and_call() { +; GFX12-LABEL: test_alloca_and_call: +; GFX12: ; %bb.0: ; %.entry +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_getpc_b64 s[0:1] +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_sext_i32_i16 s1, s1 +; GFX12-NEXT: s_add_co_u32 s0, s0, foo@gotpcrel32@lo+12 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_add_co_ci_u32 s1, s1, foo@gotpcrel32@hi+24 +; GFX12-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX12-NEXT: s_mov_b32 s32, 16 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_mov_b32 s2, s32 
+; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_add_co_i32 s32, s2, 0x200 +; GFX12-NEXT: scratch_store_b32 off, v0, s2 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX12-NEXT: s_endpgm +; +; GFX942-LABEL: test_alloca_and_call: +; GFX942: ; %bb.0: ; %.entry +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_getpc_b64 s[0:1] +; GFX942-NEXT: s_add_u32 s0, s0, foo@gotpcrel32@lo+4 +; GFX942-NEXT: s_addc_u32 s1, s1, foo@gotpcrel32@hi+12 +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX942-NEXT: s_mov_b32 s32, 16 +; GFX942-NEXT: s_mov_b32 s2, s32 +; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: s_add_i32 s32, s2, 0x400 +; GFX942-NEXT: scratch_store_dword off, v0, s2 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX942-NEXT: s_endpgm +.entry: + br label %SW_C + +SW_C: ; preds = %.entry + %v = alloca i32, i32 1, align 4, addrspace(5) + store i32 0, ptr addrspace(5) %v, align 4 + call amdgpu_gfx void @foo() + ret void +} + +define amdgpu_cs_chain void @test_alloca_and_call_var_uniform(i32 inreg %count) { +; GFX12-LABEL: test_alloca_and_call_var_uniform: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_getpc_b64 s[2:3] +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_sext_i32_i16 s3, s3 +; GFX12-NEXT: s_add_co_u32 s2, s2, foo@gotpcrel32@lo+12 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_add_co_ci_u32 s3, s3, foo@gotpcrel32@hi+24 +; GFX12-NEXT: s_lshl_b32 s0, s0, 2 +; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX12-NEXT: s_add_co_i32 s0, s0, 15 +; GFX12-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-NEXT: s_mov_b32 s32, 16 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_b32 s0, s0, -16 +; GFX12-NEXT: s_mov_b32 s1, s32 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_lshl_b32 s0, s0, 5 +; GFX12-NEXT: scratch_store_b32 off, v0, s1 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_add_co_i32 s32, s1, s0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX12-NEXT: s_endpgm +; +; GFX942-LABEL: test_alloca_and_call_var_uniform: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_lshl_b32 s0, s0, 2 +; GFX942-NEXT: s_add_i32 s0, s0, 15 +; GFX942-NEXT: s_and_b32 s0, s0, -16 +; GFX942-NEXT: s_lshl_b32 s2, s0, 6 +; GFX942-NEXT: s_getpc_b64 s[0:1] +; GFX942-NEXT: s_add_u32 s0, s0, foo@gotpcrel32@lo+4 +; GFX942-NEXT: s_addc_u32 s1, s1, foo@gotpcrel32@hi+12 +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX942-NEXT: s_mov_b32 s32, 16 +; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: s_mov_b32 s3, s32 +; GFX942-NEXT: s_add_i32 s32, s3, s2 +; GFX942-NEXT: scratch_store_dword off, v0, s3 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX942-NEXT: s_endpgm + %v = alloca i32, i32 %count, align 4, addrspace(5) + store i32 0, ptr addrspace(5) %v, align 4 + call amdgpu_gfx void @foo() + ret void +} + +define amdgpu_cs_chain void @test_alloca_and_call_var(i32 %count) { +; GFX12-LABEL: test_alloca_and_call_var: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_lshl_add_u32 v0, v8, 2, 15 +; GFX12-NEXT: s_mov_b32 
s1, exec_lo +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: s_mov_b32 s32, 16 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_and_b32 v1, -16, v0 +; GFX12-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_ctz_i32_b32 s2, s1 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_readlane_b32 s3, v1, s2 +; GFX12-NEXT: s_bitset0_b32 s1, s2 +; GFX12-NEXT: s_max_u32 s0, s0, s3 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u32 s1, 0 +; GFX12-NEXT: s_cbranch_scc1 .LBB5_1 +; GFX12-NEXT: ; %bb.2: +; GFX12-NEXT: s_getpc_b64 s[2:3] +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_sext_i32_i16 s3, s3 +; GFX12-NEXT: s_add_co_u32 s2, s2, foo@gotpcrel32@lo+12 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_add_co_ci_u32 s3, s3, foo@gotpcrel32@hi+24 +; GFX12-NEXT: s_mov_b32 s1, s32 +; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX12-NEXT: v_lshl_add_u32 v1, s0, 5, s1 +; GFX12-NEXT: scratch_store_b32 off, v0, s1 +; GFX12-NEXT: v_readfirstlane_b32 s32, v1 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_wait_alu 0xf1ff +; GFX12-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX12-NEXT: s_endpgm +; +; GFX942-LABEL: test_alloca_and_call_var: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_lshl_add_u32 v0, v8, 2, 15 +; GFX942-NEXT: v_and_b32_e32 v1, -16, v0 +; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: s_mov_b64 s[0:1], exec +; GFX942-NEXT: s_mov_b32 s2, 0 +; GFX942-NEXT: s_mov_b32 s32, 16 +; GFX942-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_ff1_i32_b64 s3, s[0:1] +; GFX942-NEXT: v_readlane_b32 s4, v1, s3 +; GFX942-NEXT: s_bitset0_b64 s[0:1], s3 +; GFX942-NEXT: s_max_u32 s2, s2, s4 +; GFX942-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX942-NEXT: s_cbranch_scc1 .LBB5_1 +; GFX942-NEXT: ; %bb.2: +; GFX942-NEXT: s_getpc_b64 s[0:1] +; GFX942-NEXT: s_add_u32 s0, s0, foo@gotpcrel32@lo+4 +; GFX942-NEXT: s_addc_u32 s1, s1, foo@gotpcrel32@hi+12 +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX942-NEXT: s_mov_b32 s3, s32 +; GFX942-NEXT: v_mov_b32_e32 v1, s3 +; GFX942-NEXT: v_lshl_add_u32 v1, s2, 6, v1 +; GFX942-NEXT: scratch_store_dword off, v0, s3 +; GFX942-NEXT: v_readfirstlane_b32 s32, v1 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX942-NEXT: s_endpgm + %v = alloca i32, i32 %count, align 4, addrspace(5) + store i32 0, ptr addrspace(5) %v, align 4 + call amdgpu_gfx void @foo() + ret void +} + +define amdgpu_cs_chain void @test_call_and_alloca() { +; GFX12-LABEL: test_call_and_alloca: +; GFX12: ; %bb.0: ; %.entry +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_getpc_b64 s[0:1] +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_sext_i32_i16 s1, s1 +; GFX12-NEXT: s_add_co_u32 s0, s0, foo@gotpcrel32@lo+12 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_add_co_ci_u32 s1, s1, foo@gotpcrel32@hi+24 +; GFX12-NEXT: s_mov_b32 s32, 16 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX12-NEXT: s_mov_b32 s4, s32 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_add_co_i32 s32, s4, 0x200 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX12-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-NEXT: scratch_store_b32 off, v0, s4 +; GFX12-NEXT: s_endpgm +; +; GFX942-LABEL: 
test_call_and_alloca: +; GFX942: ; %bb.0: ; %.entry +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_getpc_b64 s[0:1] +; GFX942-NEXT: s_add_u32 s0, s0, foo@gotpcrel32@lo+4 +; GFX942-NEXT: s_addc_u32 s1, s1, foo@gotpcrel32@hi+12 +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX942-NEXT: s_mov_b32 s32, 16 +; GFX942-NEXT: s_mov_b32 s4, s32 +; GFX942-NEXT: s_add_i32 s32, s4, 0x400 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: scratch_store_dword off, v0, s4 +; GFX942-NEXT: s_endpgm +.entry: + br label %SW_C + +SW_C: ; preds = %.entry + %v = alloca i32, i32 1, align 4, addrspace(5) + call amdgpu_gfx void @foo() + store i32 0, ptr addrspace(5) %v, align 4 + ret void +} + +define amdgpu_cs_chain void @test_call_and_alloca_var_uniform(i32 inreg %count) { +; GFX12-LABEL: test_call_and_alloca_var_uniform: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_getpc_b64 s[2:3] +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_sext_i32_i16 s3, s3 +; GFX12-NEXT: s_add_co_u32 s2, s2, foo@gotpcrel32@lo+12 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_add_co_ci_u32 s3, s3, foo@gotpcrel32@hi+24 +; GFX12-NEXT: s_lshl_b32 s0, s0, 2 +; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX12-NEXT: s_add_co_i32 s0, s0, 15 +; GFX12-NEXT: s_mov_b32 s32, 16 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_b32 s0, s0, -16 +; GFX12-NEXT: s_mov_b32 s4, s32 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_lshl_b32 s0, s0, 5 +; GFX12-NEXT: v_mov_b32_e32 v40, 0 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_add_co_i32 s32, s4, s0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX12-NEXT: scratch_store_b32 off, v40, s4 +; GFX12-NEXT: s_endpgm +; +; GFX942-LABEL: test_call_and_alloca_var_uniform: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_lshl_b32 s0, s0, 2 +; GFX942-NEXT: s_add_i32 s0, s0, 15 +; GFX942-NEXT: s_and_b32 s0, s0, -16 +; GFX942-NEXT: s_lshl_b32 s2, s0, 6 +; GFX942-NEXT: s_getpc_b64 s[0:1] +; GFX942-NEXT: s_add_u32 s0, s0, foo@gotpcrel32@lo+4 +; GFX942-NEXT: s_addc_u32 s1, s1, foo@gotpcrel32@hi+12 +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX942-NEXT: s_mov_b32 s32, 16 +; GFX942-NEXT: s_mov_b32 s4, s32 +; GFX942-NEXT: v_mov_b32_e32 v40, 0 +; GFX942-NEXT: s_add_i32 s32, s4, s2 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX942-NEXT: scratch_store_dword off, v40, s4 +; GFX942-NEXT: s_endpgm + %v = alloca i32, i32 %count, align 4, addrspace(5) + call amdgpu_gfx void @foo() + store i32 0, ptr addrspace(5) %v, align 4 + ret void +} + +define amdgpu_cs_chain void @test_call_and_alloca_var(i32 %count) { +; GFX12-LABEL: test_call_and_alloca_var: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_lshl_add_u32 v0, v8, 2, 15 +; GFX12-NEXT: v_mov_b32_e32 v40, 0 +; GFX12-NEXT: s_mov_b32 s1, exec_lo +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: s_mov_b32 s32, 16 +; GFX12-NEXT: v_and_b32_e32 v0, -16, v0 +; GFX12-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: 
s_ctz_i32_b32 s2, s1 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_readlane_b32 s3, v0, s2 +; GFX12-NEXT: s_bitset0_b32 s1, s2 +; GFX12-NEXT: s_max_u32 s0, s0, s3 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u32 s1, 0 +; GFX12-NEXT: s_cbranch_scc1 .LBB8_1 +; GFX12-NEXT: ; %bb.2: +; GFX12-NEXT: s_getpc_b64 s[2:3] +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_sext_i32_i16 s3, s3 +; GFX12-NEXT: s_add_co_u32 s2, s2, foo@gotpcrel32@lo+12 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_add_co_ci_u32 s3, s3, foo@gotpcrel32@hi+24 +; GFX12-NEXT: s_mov_b32 s4, s32 +; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX12-NEXT: v_lshl_add_u32 v0, s0, 5, s4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_readfirstlane_b32 s32, v0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_wait_alu 0xf1ff +; GFX12-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX12-NEXT: scratch_store_b32 off, v40, s4 +; GFX12-NEXT: s_endpgm +; +; GFX942-LABEL: test_call_and_alloca_var: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_lshl_add_u32 v0, v8, 2, 15 +; GFX942-NEXT: v_and_b32_e32 v0, -16, v0 +; GFX942-NEXT: v_mov_b32_e32 v40, 0 +; GFX942-NEXT: s_mov_b64 s[0:1], exec +; GFX942-NEXT: s_mov_b32 s2, 0 +; GFX942-NEXT: s_mov_b32 s32, 16 +; GFX942-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_ff1_i32_b64 s3, s[0:1] +; GFX942-NEXT: v_readlane_b32 s4, v0, s3 +; GFX942-NEXT: s_bitset0_b64 s[0:1], s3 +; GFX942-NEXT: s_max_u32 s2, s2, s4 +; GFX942-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX942-NEXT: s_cbranch_scc1 .LBB8_1 +; GFX942-NEXT: ; %bb.2: +; GFX942-NEXT: s_getpc_b64 s[0:1] +; GFX942-NEXT: s_add_u32 s0, s0, foo@gotpcrel32@lo+4 +; GFX942-NEXT: s_addc_u32 s1, s1, foo@gotpcrel32@hi+12 +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX942-NEXT: s_mov_b32 s4, s32 +; GFX942-NEXT: v_mov_b32_e32 v0, s4 +; GFX942-NEXT: v_lshl_add_u32 v0, s2, 6, v0 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_readfirstlane_b32 s32, v0 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX942-NEXT: scratch_store_dword off, v40, s4 +; GFX942-NEXT: s_endpgm + %v = alloca i32, i32 %count, align 4, addrspace(5) + call amdgpu_gfx void @foo() + store i32 0, ptr addrspace(5) %v, align 4 + ret void +} From bb4ed55acdbc7f48bc978147189e8106e3ea42f8 Mon Sep 17 00:00:00 2001 From: Michael Buch <michaelbuch12@gmail.com> Date: Tue, 4 Nov 2025 09:25:28 +0000 Subject: [PATCH 133/313] [lldb][Runtime] Move VerboseTrapFrameRecognizer into CPPLanguageRuntime (#166157) https://github.com/llvm/llvm-project/pull/165996 is adding a Clang dependency to Target because we're moving some of the functionality of the VerboseTrapFrameRecognizer into libClang. To avoid adding this dependency this patch moves VerboseTrapFrameRecognizer into the CPPLanguageRuntime. Most of the frame recognizers already live in the various runtime plugins. An alternative discussed was to create a common `CLanguageRuntime` whose currently sole responsibility was to register the `VerboseTrapFrameRecognizer` and `AssertStackFrameRecognizer`. The main issue I ran into here was frame recognizers aren't uniqued in the target. Currently this only manifests when re-running a target, which re-triggers all the recognizer registration (added a test with a FIXME for this). 
If we had a common `CLanguageRuntime` that `CPPLanguageRuntime` and
`ObjCLanguageRuntime` inherited from, I didn't find a great way to avoid
registering the recognizer multiple times. We can't just call_once on it
because we do want the recognizers to be re-registered for new targets
in the same debugger session. If the recognizers were stored in
something like a UniqueVector in the Target, then we wouldn't have that
issue. But currently that's not the case, and it would take a bit of
refactoring to de-dupe the recognizers.

There may very well be solutions I haven't considered, but I wasn't
very happy with any of the things I've tried so far. So in the end I
just moved this to the C++ runtime for now in order to unblock
https://github.com/llvm/llvm-project/pull/165996.

The C++ language runtime is always available (even for C targets) if
the C++ language plugin is available, which it should be unless someone
is using an LLDB with the C++ plugin compiled out. But at that point
numerous things wouldn't work even when debugging just C.
---
 .../LanguageRuntime/CPlusPlus/CMakeLists.txt  |  1 +
 .../CPlusPlus/CPPLanguageRuntime.cpp          |  6 ++-
 .../CPlusPlus}/VerboseTrapFrameRecognizer.cpp |  2 +-
 .../CPlusPlus}/VerboseTrapFrameRecognizer.h   | 14 +++--
 lldb/source/Target/CMakeLists.txt             |  1 -
 lldb/source/Target/Process.cpp                |  2 -
 .../Shell/Recognizer/Inputs/verbose_trap.m    |  4 ++
 .../Shell/Recognizer/registration-unique.test | 54 +++++++++++++++++++
 .../Shell/Recognizer/verbose_trap-objc.test   | 12 +++++
 9 files changed, 88 insertions(+), 8 deletions(-)
 rename lldb/source/{Target => Plugins/LanguageRuntime/CPlusPlus}/VerboseTrapFrameRecognizer.cpp (98%)
 rename lldb/{include/lldb/Target => source/Plugins/LanguageRuntime/CPlusPlus}/VerboseTrapFrameRecognizer.h (63%)
 create mode 100644 lldb/test/Shell/Recognizer/Inputs/verbose_trap.m
 create mode 100644 lldb/test/Shell/Recognizer/registration-unique.test
 create mode 100644 lldb/test/Shell/Recognizer/verbose_trap-objc.test

diff --git a/lldb/source/Plugins/LanguageRuntime/CPlusPlus/CMakeLists.txt b/lldb/source/Plugins/LanguageRuntime/CPlusPlus/CMakeLists.txt
index 1717b0a896669..a27bceffe2e3a 100644
--- a/lldb/source/Plugins/LanguageRuntime/CPlusPlus/CMakeLists.txt
+++ b/lldb/source/Plugins/LanguageRuntime/CPlusPlus/CMakeLists.txt
@@ -1,5 +1,6 @@
 add_lldb_library(lldbPluginCPPRuntime
   CPPLanguageRuntime.cpp
+  VerboseTrapFrameRecognizer.cpp

   LINK_LIBS
     lldbCore
diff --git a/lldb/source/Plugins/LanguageRuntime/CPlusPlus/CPPLanguageRuntime.cpp b/lldb/source/Plugins/LanguageRuntime/CPlusPlus/CPPLanguageRuntime.cpp
index 21a5ebe53073a..913678b629f2f 100644
--- a/lldb/source/Plugins/LanguageRuntime/CPlusPlus/CPPLanguageRuntime.cpp
+++ b/lldb/source/Plugins/LanguageRuntime/CPlusPlus/CPPLanguageRuntime.cpp
@@ -12,6 +12,7 @@
 #include <memory>

 #include "CPPLanguageRuntime.h"
+#include "VerboseTrapFrameRecognizer.h"

 #include "llvm/ADT/StringRef.h"

@@ -107,12 +108,15 @@ class LibCXXFrameRecognizer : public StackFrameRecognizer {

 CPPLanguageRuntime::CPPLanguageRuntime(Process *process)
     : LanguageRuntime(process) {
-  if (process)
+  if (process) {
     process->GetTarget().GetFrameRecognizerManager().AddRecognizer(
         StackFrameRecognizerSP(new LibCXXFrameRecognizer()), {},
         std::make_shared<RegularExpression>("^std::__[^:]*::"),
         /*mangling_preference=*/Mangled::ePreferDemangledWithoutArguments,
         /*first_instruction_only=*/false);
+
+    RegisterVerboseTrapFrameRecognizer(*process);
+  }
 }

 bool CPPLanguageRuntime::IsAllowedRuntimeValue(ConstString name) {
diff --git
a/lldb/source/Target/VerboseTrapFrameRecognizer.cpp b/lldb/source/Plugins/LanguageRuntime/CPlusPlus/VerboseTrapFrameRecognizer.cpp similarity index 98% rename from lldb/source/Target/VerboseTrapFrameRecognizer.cpp rename to lldb/source/Plugins/LanguageRuntime/CPlusPlus/VerboseTrapFrameRecognizer.cpp index 03ab58b8c59a9..730aba5b42a3e 100644 --- a/lldb/source/Target/VerboseTrapFrameRecognizer.cpp +++ b/lldb/source/Plugins/LanguageRuntime/CPlusPlus/VerboseTrapFrameRecognizer.cpp @@ -1,4 +1,4 @@ -#include "lldb/Target/VerboseTrapFrameRecognizer.h" +#include "VerboseTrapFrameRecognizer.h" #include "lldb/Core/Module.h" #include "lldb/Symbol/Function.h" diff --git a/lldb/include/lldb/Target/VerboseTrapFrameRecognizer.h b/lldb/source/Plugins/LanguageRuntime/CPlusPlus/VerboseTrapFrameRecognizer.h similarity index 63% rename from lldb/include/lldb/Target/VerboseTrapFrameRecognizer.h rename to lldb/source/Plugins/LanguageRuntime/CPlusPlus/VerboseTrapFrameRecognizer.h index 7e045760a28be..7d7020f63c8d2 100644 --- a/lldb/include/lldb/Target/VerboseTrapFrameRecognizer.h +++ b/lldb/source/Plugins/LanguageRuntime/CPlusPlus/VerboseTrapFrameRecognizer.h @@ -1,5 +1,13 @@ -#ifndef LLDB_TARGET_VERBOSETRAPFRAMERECOGNIZER_H -#define LLDB_TARGET_VERBOSETRAPFRAMERECOGNIZER_H +//===-- VerboseTrapFrameRecognizer.h --------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLDB_SOURCE_PLUGINS_LANGUAGERUNTIME_C_PLUS_PLUS_VERBOSETRAPFRAMERECOGNIZER_H +#define LLDB_SOURCE_PLUGINS_LANGUAGERUNTIME_C_PLUS_PLUS_VERBOSETRAPFRAMERECOGNIZER_H #include "lldb/Target/StackFrameRecognizer.h" @@ -36,4 +44,4 @@ class VerboseTrapFrameRecognizer : public StackFrameRecognizer { } // namespace lldb_private -#endif // LLDB_TARGET_VERBOSETRAPFRAMERECOGNIZER_H +#endif // LLDB_SOURCE_PLUGINS_LANGUAGERUNTIME_C_PLUS_PLUS_VERBOSETRAPFRAMERECOGNIZER_H diff --git a/lldb/source/Target/CMakeLists.txt b/lldb/source/Target/CMakeLists.txt index b7788e80eecac..8e6d51efad1f3 100644 --- a/lldb/source/Target/CMakeLists.txt +++ b/lldb/source/Target/CMakeLists.txt @@ -80,7 +80,6 @@ add_lldb_library(lldbTarget UnixSignals.cpp UnwindAssembly.cpp UnwindLLDB.cpp - VerboseTrapFrameRecognizer.cpp ADDITIONAL_HEADER_DIRS ${LLDB_INCLUDE_DIR}/lldb/Target diff --git a/lldb/source/Target/Process.cpp b/lldb/source/Target/Process.cpp index fb9e7eb5ed1bd..42ce198a283da 100644 --- a/lldb/source/Target/Process.cpp +++ b/lldb/source/Target/Process.cpp @@ -65,7 +65,6 @@ #include "lldb/Target/ThreadPlanCallFunction.h" #include "lldb/Target/ThreadPlanStack.h" #include "lldb/Target/UnixSignals.h" -#include "lldb/Target/VerboseTrapFrameRecognizer.h" #include "lldb/Utility/AddressableBits.h" #include "lldb/Utility/Event.h" #include "lldb/Utility/LLDBLog.h" @@ -513,7 +512,6 @@ Process::Process(lldb::TargetSP target_sp, ListenerSP listener_sp, // We should have a plugin do the registration instead, for example, a // common C LanguageRuntime plugin. 
RegisterAssertFrameRecognizer(this);
-  RegisterVerboseTrapFrameRecognizer(*this);
 }

 Process::~Process() {
diff --git a/lldb/test/Shell/Recognizer/Inputs/verbose_trap.m b/lldb/test/Shell/Recognizer/Inputs/verbose_trap.m
new file mode 100644
index 0000000000000..83a829a8c2fdd
--- /dev/null
+++ b/lldb/test/Shell/Recognizer/Inputs/verbose_trap.m
@@ -0,0 +1,4 @@
+int main() {
+  __builtin_verbose_trap("Foo", "Bar");
+  return 0;
+}
diff --git a/lldb/test/Shell/Recognizer/registration-unique.test b/lldb/test/Shell/Recognizer/registration-unique.test
new file mode 100644
index 0000000000000..f6052bb57c940
--- /dev/null
+++ b/lldb/test/Shell/Recognizer/registration-unique.test
@@ -0,0 +1,54 @@
+# Checks that the recognizers that should work across language runtimes
+# are only registered once with the target.
+
+# RUN: split-file %s %t
+
+# RUN: %clang_host %t/main.cpp -g -o %t/cpp.out
+# RUN: %lldb -b -s %t/commands.input %t/cpp.out | FileCheck %s
+
+# RUN: %clang_host -x objective-c++ %t/main.mm -g -o %t/objcxx.out
+# RUN: %lldb -b -s %t/commands.input %t/objcxx.out | FileCheck %s
+
+# RUN: %clang_host %t/main.c -g -o %t/c.out
+# RUN: %lldb -b -s %t/commands.input %t/c.out | FileCheck %s
+
+# RUN: %clang_host -x objective-c %t/main.m -g -o %t/objc.out
+# RUN: %lldb -b -s %t/commands.input %t/objc.out | FileCheck %s
+
+#--- main.m
+int main() {}
+
+#--- main.c
+int main() {}
+
+#--- main.mm
+int main() {}
+
+#--- main.cpp
+int main() {}
+
+#--- commands.input
+
+b main
+frame recognizer list
+run
+frame recognizer list
+continue
+run
+frame recognizer list
+
+# CHECK: frame recognizer list
+# CHECK-NEXT: no matching results found.
+
+# CHECK: frame recognizer list
+# CHECK: Verbose Trap StackFrame Recognizer
+# CHECK: Assert StackFrame Recognizer
+# CHECK-NOT: Verbose Trap StackFrame Recognizer
+# CHECK-NOT: Assert StackFrame Recognizer
+
+# FIXME: avoid duplicate frame recognizers in the target: https://github.com/llvm/llvm-project/issues/166341
+# CHECK: frame recognizer list
+# CHECK: Verbose Trap StackFrame Recognizer
+# CHECK: Assert StackFrame Recognizer
+# CHECK: Verbose Trap StackFrame Recognizer
+# CHECK: Assert StackFrame Recognizer
diff --git a/lldb/test/Shell/Recognizer/verbose_trap-objc.test b/lldb/test/Shell/Recognizer/verbose_trap-objc.test
new file mode 100644
index 0000000000000..789e79c9542c5
--- /dev/null
+++ b/lldb/test/Shell/Recognizer/verbose_trap-objc.test
@@ -0,0 +1,12 @@
+# REQUIRES: system-darwin
+#
+# RUN: %clangxx_host -x objective-c -g %S/Inputs/verbose_trap.m -o %t.out
+# RUN: %lldb -b -s %s %t.out | FileCheck %s
+
+run
+# CHECK: thread #{{.*}}stop reason = Foo: Bar
+frame info
+# CHECK: frame #{{.*}}`main at verbose_trap.m
+frame recognizer info 0
+# CHECK: frame 0 is recognized by Verbose Trap StackFrame Recognizer
+q

From 9ee9fb0d4090ffb3799ddb731394e403dbda2fb3 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Tue, 4 Nov 2025 09:37:05 +0000
Subject: [PATCH 134/313] [X86] narrowBitOpRMW - add handling for single bit
 insertion patterns (REAPPLIED) (#166337)

Insertion of a single bit into a large integer is typically
canonicalized to "(X & ~(1 << ShAmt)) | (InsertBit << ShAmt)", which can
be simplified to modify the i32 block as a BTR followed by an
OR((i32)InsertBit << (ShAmt % 32)).

We must ensure that the InsertBit is zero apart from the LSB so we can
cheaply truncate it to work with the i32 block like the simpler BT
patterns.
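For reference, the IR shape this fold targets looks roughly like the following hand-written sketch (not code from the patch; the `init_eq_i128` tests updated below exercise the same pattern, and the `and` that bounds the shift amount stands in for whatever lets KnownBits prove the shift is in range):

```llvm
; RMW single-bit insertion on an illegal i128 type:
;   (X & ~(1 << ShAmt)) | (InsertBit << ShAmt)
; InsertBit is zero except possibly its LSB and the shift amount is known
; to be in bounds, so the fold may rewrite the RMW to touch only the i32
; block containing the bit.
define void @insert_bit_i128(ptr %word, i32 %position, i1 %bit) {
  %pos = and i32 %position, 127            ; keep the shift amount in bounds
  %amt = zext i32 %pos to i128
  %old = load i128, ptr %word
  %mask = shl i128 1, %amt                 ; 1 << ShAmt
  %notmask = xor i128 %mask, -1            ; ~(1 << ShAmt)
  %clear = and i128 %old, %notmask         ; X & ~(1 << ShAmt)
  %bitext = zext i1 %bit to i128
  %ins = shl i128 %bitext, %amt            ; InsertBit << ShAmt
  %new = or i128 %clear, %ins
  store i128 %new, ptr %word
  ret void
}
```

After the fold, only the 4-byte block containing bit `%pos` is loaded and stored, rather than the full 16-byte value.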
REAPPLIED from #165742 which was reverted as part of a chain of commits due to a sanitizer regression that should have been fixed by #166160 --- llvm/lib/Target/X86/X86ISelLowering.cpp | 43 +- llvm/test/CodeGen/X86/bittest-big-integer.ll | 952 ++----------------- 2 files changed, 114 insertions(+), 881 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index b97b5089cb0a3..d4a4d4339f7e1 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -53349,7 +53349,8 @@ static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG, } // Look for a RMW operation that only touches one bit of a larger than legal -// type and fold it to a BTC/BTR/BTS pattern acting on a single i32 sub value. +// type and fold it to a BTC/BTR/BTS or bit insertion pattern acting on a single +// i32 sub value. static SDValue narrowBitOpRMW(StoreSDNode *St, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget) { @@ -53375,14 +53376,20 @@ static SDValue narrowBitOpRMW(StoreSDNode *St, const SDLoc &DL, // BTR: X & ~(1 << ShAmt) // BTS: X | (1 << ShAmt) // BTC: X ^ (1 << ShAmt) - SDValue ShAmt; + // + // BitInsert: (X & ~(1 << ShAmt)) | (InsertBit << ShAmt) + SDValue InsertBit, ShAmt; if (!StoredVal.hasOneUse() || !(sd_match(StoredVal, m_And(m_Specific(LoadVal), m_Not(m_Shl(m_One(), m_Value(ShAmt))))) || sd_match(StoredVal, m_Or(m_Specific(LoadVal), m_Shl(m_One(), m_Value(ShAmt)))) || sd_match(StoredVal, - m_Xor(m_Specific(LoadVal), m_Shl(m_One(), m_Value(ShAmt)))))) + m_Xor(m_Specific(LoadVal), m_Shl(m_One(), m_Value(ShAmt)))) || + sd_match(StoredVal, + m_Or(m_And(m_Specific(LoadVal), + m_Not(m_Shl(m_One(), m_Value(ShAmt)))), + m_Shl(m_Value(InsertBit), m_Deferred(ShAmt)))))) return SDValue(); // Ensure the shift amount is in bounds. @@ -53390,6 +53397,13 @@ static SDValue narrowBitOpRMW(StoreSDNode *St, const SDLoc &DL, if (KnownAmt.getMaxValue().uge(VT.getSizeInBits())) return SDValue(); + // If we're inserting a bit then it must be the LSB. + if (InsertBit) { + KnownBits KnownInsert = DAG.computeKnownBits(InsertBit); + if (KnownInsert.countMinLeadingZeros() < (VT.getSizeInBits() - 1)) + return SDValue(); + } + // Split the shift into an alignment shift that moves the active i32 block to // the bottom bits for truncation and a modulo shift that can act on the i32. EVT AmtVT = ShAmt.getValueType(); @@ -53397,6 +53411,7 @@ static SDValue narrowBitOpRMW(StoreSDNode *St, const SDLoc &DL, DAG.getSignedConstant(-32LL, DL, AmtVT)); SDValue ModuloAmt = DAG.getNode(ISD::AND, DL, AmtVT, ShAmt, DAG.getConstant(31, DL, AmtVT)); + ModuloAmt = DAG.getZExtOrTrunc(ModuloAmt, DL, MVT::i8); // Compute the byte offset for the i32 block that is changed by the RMW. // combineTruncate will adjust the load for us in a similar way. 
@@ -53411,13 +53426,23 @@ static SDValue narrowBitOpRMW(StoreSDNode *St, const SDLoc &DL, SDValue X = DAG.getNode(ISD::SRL, DL, VT, LoadVal, AlignAmt); X = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, X); - SDValue Mask = - DAG.getNode(ISD::SHL, DL, MVT::i32, DAG.getConstant(1, DL, MVT::i32), - DAG.getZExtOrTrunc(ModuloAmt, DL, MVT::i8)); - if (StoredVal.getOpcode() == ISD::AND) - Mask = DAG.getNOT(DL, Mask, MVT::i32); + SDValue Mask = DAG.getNode(ISD::SHL, DL, MVT::i32, + DAG.getConstant(1, DL, MVT::i32), ModuloAmt); + + SDValue Res; + if (InsertBit) { + SDValue BitMask = + DAG.getNode(ISD::SHL, DL, MVT::i32, + DAG.getZExtOrTrunc(InsertBit, DL, MVT::i32), ModuloAmt); + Res = + DAG.getNode(ISD::AND, DL, MVT::i32, X, DAG.getNOT(DL, Mask, MVT::i32)); + Res = DAG.getNode(ISD::OR, DL, MVT::i32, Res, BitMask); + } else { + if (StoredVal.getOpcode() == ISD::AND) + Mask = DAG.getNOT(DL, Mask, MVT::i32); + Res = DAG.getNode(StoredVal.getOpcode(), DL, MVT::i32, X, Mask); + } - SDValue Res = DAG.getNode(StoredVal.getOpcode(), DL, MVT::i32, X, Mask); return DAG.getStore(St->getChain(), DL, Res, NewPtr, St->getPointerInfo(), Align(), St->getMemOperand()->getFlags()); } diff --git a/llvm/test/CodeGen/X86/bittest-big-integer.ll b/llvm/test/CodeGen/X86/bittest-big-integer.ll index c311ab869c311..87a54a0b9148d 100644 --- a/llvm/test/CodeGen/X86/bittest-big-integer.ll +++ b/llvm/test/CodeGen/X86/bittest-big-integer.ll @@ -356,41 +356,20 @@ define i1 @init_eq_i64(ptr %word, i32 %position, i1 zeroext %value) nounwind { ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl $1, %edx -; X86-NEXT: xorl %esi, %esi -; X86-NEXT: shldl %cl, %edx, %esi -; X86-NEXT: shll %cl, %edx -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; X86-NEXT: xorl %edi, %edi -; X86-NEXT: shldl %cl, %eax, %edi -; X86-NEXT: shll %cl, %eax -; X86-NEXT: testb $32, %cl -; X86-NEXT: je .LBB9_2 -; X86-NEXT: # %bb.1: -; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl $0, %edx -; X86-NEXT: .LBB9_2: -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: notl %esi -; X86-NEXT: notl %edx -; X86-NEXT: je .LBB9_4 -; X86-NEXT: # %bb.3: -; X86-NEXT: movl %eax, %edi -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: .LBB9_4: -; X86-NEXT: andl 4(%ebx), %esi -; X86-NEXT: orl %edi, %esi -; X86-NEXT: andl (%ebx), %edx -; X86-NEXT: orl %eax, %edx -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: andl $32, %eax -; X86-NEXT: shrl $3, %eax -; X86-NEXT: movl (%ebx,%eax), %eax -; X86-NEXT: btl %ecx, %eax +; X86-NEXT: movl %ecx, %esi +; X86-NEXT: andl $32, %esi +; X86-NEXT: shrl $3, %esi +; X86-NEXT: movl (%edx,%esi), %edi +; X86-NEXT: btl %ecx, %edi ; X86-NEXT: setae %al -; X86-NEXT: movl %edx, (%ebx) -; X86-NEXT: movl %esi, 4(%ebx) +; X86-NEXT: btrl %ecx, %edi +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NEXT: shll %cl, %ebx +; X86-NEXT: orl %edi, %ebx +; X86-NEXT: movl %ebx, (%edx,%esi) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx @@ -600,208 +579,55 @@ define i1 @set_ne_i128(ptr %word, i32 %position) nounwind { define i1 @init_eq_i128(ptr %word, i32 %position, i1 zeroext %value) nounwind { ; X86-LABEL: init_eq_i128: ; X86: # %bb.0: -; X86-NEXT: pushl %ebp -; X86-NEXT: movl %esp, %ebp ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: andl $-16, %esp -; X86-NEXT: subl $96, %esp -; X86-NEXT: movl 12(%ebp), %ecx -; X86-NEXT: movzbl 16(%ebp), %ebx -; X86-NEXT: movl $0, 
{{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $1, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: shrb $3, %al -; X86-NEXT: andb $12, %al -; X86-NEXT: negb %al -; X86-NEXT: movsbl %al, %eax -; X86-NEXT: movl 64(%esp,%eax), %edx -; X86-NEXT: movl 68(%esp,%eax), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %eax, %esi -; X86-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-NEXT: movzbl %bl, %eax -; X86-NEXT: movl 72(%esp,%esi), %ebx -; X86-NEXT: movl 76(%esp,%esi), %esi -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl %ebx, %edi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, %edi -; X86-NEXT: shldl %cl, %ebx, %esi -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: shll %cl, %ebx -; X86-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: notl %edi -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl (%esp), %ecx # 4-byte Reload -; X86-NEXT: movl 36(%esp,%ecx), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 40(%esp,%ecx), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 12(%ebp), %ecx -; X86-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NEXT: shldl %cl, %eax, %edx -; X86-NEXT: movl 8(%ebp), %eax -; X86-NEXT: andl 8(%eax), %edi -; X86-NEXT: orl %edx, %edi -; X86-NEXT: notl %esi -; X86-NEXT: movl (%esp), %eax # 4-byte Reload -; X86-NEXT: movl 44(%esp,%eax), %eax -; X86-NEXT: movl 12(%ebp), %ecx -; X86-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl 8(%ebp), %ecx -; X86-NEXT: andl 12(%ecx), %esi -; X86-NEXT: orl %eax, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: notl %ebx -; X86-NEXT: movl (%esp), %eax # 4-byte Reload -; X86-NEXT: movl 32(%esp,%eax), %edx -; X86-NEXT: movl %edx, (%esp) # 4-byte Spill -; X86-NEXT: movl 12(%ebp), %ecx -; X86-NEXT: shll %cl, %edx -; X86-NEXT: movl 8(%ebp), %eax -; X86-NEXT: andl (%eax), %ebx -; X86-NEXT: orl %edx, %ebx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: notl %edx -; X86-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl (%esp), %esi # 4-byte Reload -; X86-NEXT: shldl %cl, %esi, %eax -; X86-NEXT: movl 8(%ebp), %ecx -; X86-NEXT: andl 4(%ecx), %edx -; X86-NEXT: orl %eax, %edx -; X86-NEXT: movl 12(%ebp), %esi -; X86-NEXT: movl %esi, %eax -; X86-NEXT: andl $96, %eax -; X86-NEXT: shrl $3, %eax -; X86-NEXT: movl (%ecx,%eax), %eax -; X86-NEXT: btl %esi, %eax -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: movl %edi, 8(%ecx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl %ecx, 12(%eax) -; X86-NEXT: movl %ebx, (%eax) -; X86-NEXT: movl %edx, 4(%eax) +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl 
%ecx, %esi +; X86-NEXT: andl $96, %esi +; X86-NEXT: shrl $3, %esi +; X86-NEXT: movl (%edx,%esi), %edi +; X86-NEXT: btl %ecx, %edi ; X86-NEXT: setae %al -; X86-NEXT: leal -12(%ebp), %esp +; X86-NEXT: btrl %ecx, %edi +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NEXT: shll %cl, %ebx +; X86-NEXT: orl %edi, %ebx +; X86-NEXT: movl %ebx, (%edx,%esi) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx -; X86-NEXT: popl %ebp ; X86-NEXT: retl ; ; SSE-LABEL: init_eq_i128: ; SSE: # %bb.0: ; SSE-NEXT: movl %esi, %ecx -; SSE-NEXT: movl $1, %esi -; SSE-NEXT: xorl %r8d, %r8d -; SSE-NEXT: shldq %cl, %rsi, %r8 -; SSE-NEXT: shlq %cl, %rsi -; SSE-NEXT: movl %edx, %eax -; SSE-NEXT: xorl %edx, %edx -; SSE-NEXT: shldq %cl, %rax, %rdx -; SSE-NEXT: shlq %cl, %rax -; SSE-NEXT: xorl %r9d, %r9d -; SSE-NEXT: testb $64, %cl -; SSE-NEXT: cmovneq %rsi, %r8 -; SSE-NEXT: cmovneq %r9, %rsi -; SSE-NEXT: notq %r8 -; SSE-NEXT: cmovneq %rax, %rdx -; SSE-NEXT: cmovneq %r9, %rax -; SSE-NEXT: notq %rsi -; SSE-NEXT: andq 8(%rdi), %r8 -; SSE-NEXT: orq %rdx, %r8 -; SSE-NEXT: andq (%rdi), %rsi -; SSE-NEXT: orq %rax, %rsi -; SSE-NEXT: movl %ecx, %eax -; SSE-NEXT: andl $96, %eax -; SSE-NEXT: shrl $3, %eax -; SSE-NEXT: movl (%rdi,%rax), %eax -; SSE-NEXT: btl %ecx, %eax +; SSE-NEXT: andl $96, %esi +; SSE-NEXT: shrl $3, %esi +; SSE-NEXT: movl (%rdi,%rsi), %r8d +; SSE-NEXT: btl %ecx, %r8d ; SSE-NEXT: setae %al -; SSE-NEXT: movq %rsi, (%rdi) -; SSE-NEXT: movq %r8, 8(%rdi) +; SSE-NEXT: shll %cl, %edx +; SSE-NEXT: btrl %ecx, %r8d +; SSE-NEXT: orl %r8d, %edx +; SSE-NEXT: movl %edx, (%rdi,%rsi) ; SSE-NEXT: retq ; -; AVX2-LABEL: init_eq_i128: -; AVX2: # %bb.0: -; AVX2-NEXT: movl %esi, %ecx -; AVX2-NEXT: movl $1, %eax -; AVX2-NEXT: xorl %esi, %esi -; AVX2-NEXT: shldq %cl, %rax, %rsi -; AVX2-NEXT: movl %edx, %edx -; AVX2-NEXT: xorl %r8d, %r8d -; AVX2-NEXT: shldq %cl, %rdx, %r8 -; AVX2-NEXT: xorl %r9d, %r9d -; AVX2-NEXT: shlxq %rcx, %rax, %rax -; AVX2-NEXT: testb $64, %cl -; AVX2-NEXT: cmovneq %rax, %rsi -; AVX2-NEXT: cmovneq %r9, %rax -; AVX2-NEXT: shlxq %rcx, %rdx, %rdx -; AVX2-NEXT: cmovneq %rdx, %r8 -; AVX2-NEXT: cmovneq %r9, %rdx -; AVX2-NEXT: andnq 8(%rdi), %rsi, %rsi -; AVX2-NEXT: orq %r8, %rsi -; AVX2-NEXT: andnq (%rdi), %rax, %r8 -; AVX2-NEXT: orq %rdx, %r8 -; AVX2-NEXT: movl %ecx, %eax -; AVX2-NEXT: andl $96, %eax -; AVX2-NEXT: shrl $3, %eax -; AVX2-NEXT: movl (%rdi,%rax), %eax -; AVX2-NEXT: btl %ecx, %eax -; AVX2-NEXT: setae %al -; AVX2-NEXT: movq %r8, (%rdi) -; AVX2-NEXT: movq %rsi, 8(%rdi) -; AVX2-NEXT: retq -; -; AVX512-LABEL: init_eq_i128: -; AVX512: # %bb.0: -; AVX512-NEXT: movl %esi, %ecx -; AVX512-NEXT: movl $1, %eax -; AVX512-NEXT: xorl %esi, %esi -; AVX512-NEXT: shldq %cl, %rax, %rsi -; AVX512-NEXT: xorl %r8d, %r8d -; AVX512-NEXT: shlxq %rcx, %rax, %rax -; AVX512-NEXT: movl %edx, %edx -; AVX512-NEXT: xorl %r9d, %r9d -; AVX512-NEXT: shldq %cl, %rdx, %r9 -; AVX512-NEXT: testb $64, %cl -; AVX512-NEXT: cmovneq %rax, %rsi -; AVX512-NEXT: cmovneq %r8, %rax -; AVX512-NEXT: shlxq %rcx, %rdx, %rdx -; AVX512-NEXT: cmovneq %rdx, %r9 -; AVX512-NEXT: cmovneq %r8, %rdx -; AVX512-NEXT: andnq 8(%rdi), %rsi, %rsi -; AVX512-NEXT: orq %r9, %rsi -; AVX512-NEXT: andnq (%rdi), %rax, %r8 -; AVX512-NEXT: orq %rdx, %r8 -; AVX512-NEXT: movl %ecx, %eax -; AVX512-NEXT: andl $96, %eax -; AVX512-NEXT: shrl $3, %eax -; AVX512-NEXT: movl (%rdi,%rax), %eax -; AVX512-NEXT: btl %ecx, %eax -; AVX512-NEXT: setae %al -; AVX512-NEXT: movq %r8, (%rdi) -; AVX512-NEXT: movq %rsi, 8(%rdi) -; 
AVX512-NEXT: retq +; AVX-LABEL: init_eq_i128: +; AVX: # %bb.0: +; AVX-NEXT: movl %esi, %ecx +; AVX-NEXT: andl $96, %ecx +; AVX-NEXT: shrl $3, %ecx +; AVX-NEXT: movl (%rdi,%rcx), %r8d +; AVX-NEXT: btl %esi, %r8d +; AVX-NEXT: setae %al +; AVX-NEXT: btrl %esi, %r8d +; AVX-NEXT: shlxl %esi, %edx, %edx +; AVX-NEXT: orl %r8d, %edx +; AVX-NEXT: movl %edx, (%rdi,%rcx) +; AVX-NEXT: retq %rem = and i32 %position, 127 %ofs = zext nneg i32 %rem to i128 %bit = shl nuw i128 1, %ofs @@ -977,673 +803,55 @@ define i1 @set_ne_i512(ptr %word, i32 %position) nounwind { define i1 @init_eq_i512(ptr %word, i32 %position, i1 zeroext %value) nounwind { ; X86-LABEL: init_eq_i512: ; X86: # %bb.0: -; X86-NEXT: pushl %ebp -; X86-NEXT: movl %esp, %ebp ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: andl $-16, %esp -; X86-NEXT: subl $352, %esp # imm = 0x160 -; X86-NEXT: movl 12(%ebp), %ecx -; X86-NEXT: movl %ecx, %edx -; X86-NEXT: shrl $3, %edx -; X86-NEXT: andl $60, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: subl %edx, %eax -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $1, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl 56(%eax), %esi -; X86-NEXT: movl 60(%eax), %ebx -; X86-NEXT: movl 52(%eax), %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 48(%eax), %edi -; X86-NEXT: movl 44(%eax), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 40(%eax), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 36(%eax), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 32(%eax), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 28(%eax), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 24(%eax), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 20(%eax), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 16(%eax), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 12(%eax), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 8(%eax), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 
(%eax), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 4(%eax), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movzbl 16(%ebp), %eax -; X86-NEXT: movzbl %al, %eax -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: andl $31, %ecx -; X86-NEXT: shldl %cl, %esi, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edi, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: shldl %cl, %ebx, %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: shldl %cl, %edx, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: shldl %cl, %esi, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shll %cl, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: notl %ebx -; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: subl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) 
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl 56(%eax), %esi -; X86-NEXT: movl 60(%eax), %edi -; X86-NEXT: shldl %cl, %esi, %edi -; X86-NEXT: movl 8(%ebp), %edx -; X86-NEXT: andl 60(%edx), %ebx -; X86-NEXT: orl %edi, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: notl %ebx -; X86-NEXT: movl 52(%eax), %edi -; X86-NEXT: shldl %cl, %edi, %esi -; X86-NEXT: andl 56(%edx), %ebx -; X86-NEXT: orl %esi, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: notl %ebx -; X86-NEXT: movl 48(%eax), %esi -; X86-NEXT: shldl %cl, %esi, %edi -; X86-NEXT: andl 52(%edx), %ebx -; X86-NEXT: orl %edi, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: notl %ebx -; X86-NEXT: movl 44(%eax), %edi -; X86-NEXT: shldl %cl, %edi, %esi -; X86-NEXT: andl 48(%edx), %ebx -; X86-NEXT: orl %esi, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: notl %ebx -; X86-NEXT: movl 40(%eax), %esi -; X86-NEXT: shldl %cl, %esi, %edi -; X86-NEXT: andl 44(%edx), %ebx -; X86-NEXT: orl %edi, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: notl %ebx -; X86-NEXT: movl 36(%eax), %edi -; X86-NEXT: shldl %cl, %edi, %esi -; X86-NEXT: andl 40(%edx), %ebx -; X86-NEXT: orl %esi, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: notl %ebx -; X86-NEXT: movl 32(%eax), %esi -; X86-NEXT: shldl %cl, %esi, %edi -; X86-NEXT: andl 36(%edx), %ebx -; X86-NEXT: orl %edi, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: notl %ebx -; X86-NEXT: movl 28(%eax), %edi -; X86-NEXT: shldl %cl, %edi, %esi -; X86-NEXT: andl 32(%edx), %ebx -; X86-NEXT: orl %esi, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: notl %ebx -; X86-NEXT: movl 24(%eax), %esi -; X86-NEXT: shldl %cl, %esi, %edi -; X86-NEXT: andl 28(%edx), %ebx -; X86-NEXT: orl %edi, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: notl %ebx -; X86-NEXT: movl 20(%eax), %edi -; X86-NEXT: shldl %cl, %edi, %esi -; X86-NEXT: andl 24(%edx), %ebx -; X86-NEXT: orl %esi, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: notl %ebx -; X86-NEXT: movl 16(%eax), %esi -; X86-NEXT: shldl %cl, %esi, %edi -; X86-NEXT: andl 
20(%edx), %ebx -; X86-NEXT: orl %edi, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: notl %ebx -; X86-NEXT: movl 12(%eax), %edi -; X86-NEXT: shldl %cl, %edi, %esi -; X86-NEXT: andl 16(%edx), %ebx -; X86-NEXT: orl %esi, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: notl %ebx -; X86-NEXT: movl 8(%eax), %esi -; X86-NEXT: shldl %cl, %esi, %edi -; X86-NEXT: andl 12(%edx), %ebx -; X86-NEXT: orl %edi, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: notl %ebx -; X86-NEXT: movl 4(%eax), %edi -; X86-NEXT: shldl %cl, %edi, %esi -; X86-NEXT: andl 8(%edx), %ebx -; X86-NEXT: orl %esi, %ebx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: notl %esi -; X86-NEXT: movl (%eax), %eax -; X86-NEXT: shldl %cl, %eax, %edi -; X86-NEXT: andl 4(%edx), %esi -; X86-NEXT: orl %edi, %esi -; X86-NEXT: movl %esi, %edi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: notl %esi -; X86-NEXT: shll %cl, %eax -; X86-NEXT: andl (%edx), %esi -; X86-NEXT: orl %eax, %esi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl (%edx,%eax), %eax -; X86-NEXT: btl %ecx, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 60(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 56(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 52(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 48(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 44(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 40(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 36(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 32(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 28(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 24(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 20(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 16(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 12(%edx) -; X86-NEXT: movl %ebx, 8(%edx) -; X86-NEXT: movl %edi, 4(%edx) -; X86-NEXT: movl %esi, (%edx) +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, %esi +; X86-NEXT: shrl $3, %esi +; X86-NEXT: andl $60, %esi +; X86-NEXT: movl (%edx,%esi), %edi +; X86-NEXT: btl %ecx, %edi ; X86-NEXT: setae %al -; X86-NEXT: leal -12(%ebp), %esp +; X86-NEXT: btrl %ecx, %edi +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NEXT: shll %cl, %ebx +; X86-NEXT: orl %edi, %ebx +; X86-NEXT: movl %ebx, (%edx,%esi) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx -; X86-NEXT: popl %ebp ; X86-NEXT: retl ; ; SSE-LABEL: init_eq_i512: ; SSE: # %bb.0: -; SSE-NEXT: pushq %rbp -; SSE-NEXT: pushq %r15 -; SSE-NEXT: pushq %r14 -; SSE-NEXT: pushq %r13 -; 
SSE-NEXT: pushq %r12 -; SSE-NEXT: pushq %rbx -; SSE-NEXT: subq $168, %rsp -; SSE-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE-NEXT: xorps %xmm0, %xmm0 -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movq $0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movq $1, {{[0-9]+}}(%rsp) ; SSE-NEXT: movl %esi, %ecx -; SSE-NEXT: andl $63, %ecx -; SSE-NEXT: movl %esi, %eax -; SSE-NEXT: shrl $3, %eax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: # kill: def $eax killed $eax killed $rax -; SSE-NEXT: andl $56, %eax -; SSE-NEXT: negl %eax -; SSE-NEXT: movslq %eax, %r12 -; SSE-NEXT: movq 136(%rsp,%r12), %r9 -; SSE-NEXT: movq 144(%rsp,%r12), %rax -; SSE-NEXT: movq %rax, %rsi -; SSE-NEXT: shldq %cl, %r9, %rsi -; SSE-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 152(%rsp,%r12), %r11 -; SSE-NEXT: shldq %cl, %rax, %r11 -; SSE-NEXT: movq 120(%rsp,%r12), %r10 -; SSE-NEXT: movq 128(%rsp,%r12), %rax -; SSE-NEXT: movq %rax, %rbx -; SSE-NEXT: shldq %cl, %r10, %rbx -; SSE-NEXT: shldq %cl, %rax, %r9 -; SSE-NEXT: movq 104(%rsp,%r12), %r14 -; SSE-NEXT: movq 112(%rsp,%r12), %rax -; SSE-NEXT: movq %rax, %r15 -; SSE-NEXT: shldq %cl, %r14, %r15 -; SSE-NEXT: shldq %cl, %rax, %r10 -; SSE-NEXT: movq 96(%rsp,%r12), %rax -; SSE-NEXT: movq %rax, %r13 -; SSE-NEXT: shlq %cl, %r13 -; SSE-NEXT: shldq %cl, %rax, %r14 -; SSE-NEXT: movl %edx, %eax -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movq %rax, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movq $0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movq 8(%rsp,%r12), %r8 -; SSE-NEXT: movq 16(%rsp,%r12), %rsi -; SSE-NEXT: movq %rsi, %rbp -; SSE-NEXT: shldq %cl, %r8, %rbp -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE-NEXT: notq %rax -; SSE-NEXT: andq 48(%rdi), %rax -; SSE-NEXT: orq %rbp, %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: notq %rbx -; SSE-NEXT: notq %r11 -; SSE-NEXT: movq 24(%rsp,%r12), %rax -; SSE-NEXT: shldq %cl, %rsi, %rax -; SSE-NEXT: movq -8(%rsp,%r12), %rbp -; SSE-NEXT: movq (%rsp,%r12), %rdx -; SSE-NEXT: movq %rdx, %rsi -; SSE-NEXT: shldq %cl, %rbp, %rsi -; SSE-NEXT: andq 56(%rdi), %r11 -; SSE-NEXT: andq 32(%rdi), %rbx -; SSE-NEXT: orq %rax, %r11 -; SSE-NEXT: orq %rsi, %rbx -; SSE-NEXT: notq %r15 -; SSE-NEXT: shldq %cl, %rdx, %r8 -; SSE-NEXT: notq %r9 -; SSE-NEXT: andq 40(%rdi), %r9 -; SSE-NEXT: orq %r8, %r9 -; SSE-NEXT: movq -24(%rsp,%r12), %rax -; SSE-NEXT: movq -16(%rsp,%r12), %rdx -; SSE-NEXT: movq %rdx, %rsi -; SSE-NEXT: shldq %cl, %rax, %rsi -; SSE-NEXT: andq 16(%rdi), %r15 -; SSE-NEXT: orq %rsi, %r15 -; SSE-NEXT: shldq %cl, %rdx, %rbp -; SSE-NEXT: notq %r10 -; SSE-NEXT: notq %r13 -; SSE-NEXT: movq -32(%rsp,%r12), %rdx -; SSE-NEXT: movq %rdx, %rsi -; SSE-NEXT: shlq %cl, %rsi -; SSE-NEXT: andq 24(%rdi), %r10 -; SSE-NEXT: andq (%rdi), %r13 -; SSE-NEXT: orq %rbp, %r10 -; SSE-NEXT: orq %rsi, %r13 -; SSE-NEXT: notq %r14 -; SSE-NEXT: # kill: def $cl killed $cl killed $ecx -; SSE-NEXT: shldq %cl, %rdx, %rax -; SSE-NEXT: andq 
8(%rdi), %r14 -; SSE-NEXT: orq %rax, %r14 -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE-NEXT: andl $60, %eax -; SSE-NEXT: movl (%rdi,%rax), %eax -; SSE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload -; SSE-NEXT: btl %ecx, %eax -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE-NEXT: movq %rax, 48(%rdi) -; SSE-NEXT: movq %r11, 56(%rdi) -; SSE-NEXT: movq %rbx, 32(%rdi) -; SSE-NEXT: movq %r9, 40(%rdi) -; SSE-NEXT: movq %r15, 16(%rdi) -; SSE-NEXT: movq %r10, 24(%rdi) -; SSE-NEXT: movq %r13, (%rdi) -; SSE-NEXT: movq %r14, 8(%rdi) +; SSE-NEXT: shrl $3, %esi +; SSE-NEXT: andl $60, %esi +; SSE-NEXT: movl (%rdi,%rsi), %r8d +; SSE-NEXT: btl %ecx, %r8d ; SSE-NEXT: setae %al -; SSE-NEXT: addq $168, %rsp -; SSE-NEXT: popq %rbx -; SSE-NEXT: popq %r12 -; SSE-NEXT: popq %r13 -; SSE-NEXT: popq %r14 -; SSE-NEXT: popq %r15 -; SSE-NEXT: popq %rbp +; SSE-NEXT: shll %cl, %edx +; SSE-NEXT: btrl %ecx, %r8d +; SSE-NEXT: orl %r8d, %edx +; SSE-NEXT: movl %edx, (%rdi,%rsi) ; SSE-NEXT: retq ; -; AVX2-LABEL: init_eq_i512: -; AVX2: # %bb.0: -; AVX2-NEXT: pushq %rbp -; AVX2-NEXT: pushq %r15 -; AVX2-NEXT: pushq %r14 -; AVX2-NEXT: pushq %r13 -; AVX2-NEXT: pushq %r12 -; AVX2-NEXT: pushq %rbx -; AVX2-NEXT: subq $184, %rsp -; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovss {{.*#+}} xmm1 = [1,0,0,0] -; AVX2-NEXT: vmovups %ymm1, {{[0-9]+}}(%rsp) -; AVX2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX2-NEXT: movl %esi, %ecx -; AVX2-NEXT: andl $63, %ecx -; AVX2-NEXT: movl %esi, %ebx -; AVX2-NEXT: shrl $3, %ebx -; AVX2-NEXT: movl %ebx, %eax -; AVX2-NEXT: andl $56, %eax -; AVX2-NEXT: negl %eax -; AVX2-NEXT: movslq %eax, %r11 -; AVX2-NEXT: movq 128(%rsp,%r11), %r15 -; AVX2-NEXT: movq 136(%rsp,%r11), %rax -; AVX2-NEXT: movq %rax, %rsi -; AVX2-NEXT: shldq %cl, %r15, %rsi -; AVX2-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 120(%rsp,%r11), %r8 -; AVX2-NEXT: shldq %cl, %r8, %r15 -; AVX2-NEXT: movq 144(%rsp,%r11), %r14 -; AVX2-NEXT: movq 152(%rsp,%r11), %rsi -; AVX2-NEXT: movq %rsi, %r9 -; AVX2-NEXT: shldq %cl, %r14, %r9 -; AVX2-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: shldq %cl, %rax, %r14 -; AVX2-NEXT: movq 112(%rsp,%r11), %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 160(%rsp,%r11), %r13 -; AVX2-NEXT: movq 168(%rsp,%r11), %r12 -; AVX2-NEXT: shldq %cl, %r13, %r12 -; AVX2-NEXT: shldq %cl, %rsi, %r13 -; AVX2-NEXT: shldq %cl, %rax, %r8 -; AVX2-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movl %edx, %eax -; AVX2-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vmovups %xmm1, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: movq %rax, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: movq $0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: movq 24(%rsp,%r11), %rbp -; AVX2-NEXT: movq 32(%rsp,%r11), %rdx -; AVX2-NEXT: movq %rdx, %rax -; AVX2-NEXT: shldq %cl, %rbp, %rax -; AVX2-NEXT: movq 40(%rsp,%r11), %r10 -; AVX2-NEXT: shldq %cl, %rdx, %r10 -; AVX2-NEXT: movq 8(%rsp,%r11), %r9 -; AVX2-NEXT: movq 16(%rsp,%r11), %rdx -; AVX2-NEXT: movq %rdx, %r8 -; AVX2-NEXT: shldq %cl, %r9, %r8 -; AVX2-NEXT: shldq %cl, %rdx, %rbp -; AVX2-NEXT: andnq 48(%rdi), %r13, %r13 -; AVX2-NEXT: orq %rax, %r13 -; AVX2-NEXT: movq 
-8(%rsp,%r11), %rax -; AVX2-NEXT: movq (%rsp,%r11), %rdx -; AVX2-NEXT: movq %rdx, %rsi -; AVX2-NEXT: shldq %cl, %rax, %rsi -; AVX2-NEXT: shldq %cl, %rdx, %r9 -; AVX2-NEXT: andnq 56(%rdi), %r12, %r12 -; AVX2-NEXT: andnq 32(%rdi), %r14, %r14 -; AVX2-NEXT: orq %r10, %r12 -; AVX2-NEXT: orq %r8, %r14 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload -; AVX2-NEXT: andnq 40(%rdi), %rdx, %rdx -; AVX2-NEXT: orq %rbp, %rdx -; AVX2-NEXT: shlxq %rcx, {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload -; AVX2-NEXT: movq -16(%rsp,%r11), %r10 -; AVX2-NEXT: shlxq %rcx, %r10, %r11 -; AVX2-NEXT: # kill: def $cl killed $cl killed $rcx -; AVX2-NEXT: shldq %cl, %r10, %rax -; AVX2-NEXT: andnq 16(%rdi), %r15, %rcx -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload -; AVX2-NEXT: andnq 24(%rdi), %r10, %r10 -; AVX2-NEXT: orq %rsi, %rcx -; AVX2-NEXT: orq %r9, %r10 -; AVX2-NEXT: andnq (%rdi), %r8, %rsi -; AVX2-NEXT: orq %r11, %rsi -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX2-NEXT: andnq 8(%rdi), %r8, %r8 -; AVX2-NEXT: orq %rax, %r8 -; AVX2-NEXT: andl $60, %ebx -; AVX2-NEXT: movl (%rdi,%rbx), %eax -; AVX2-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %r9d # 4-byte Reload -; AVX2-NEXT: btl %r9d, %eax -; AVX2-NEXT: movq %r13, 48(%rdi) -; AVX2-NEXT: movq %r12, 56(%rdi) -; AVX2-NEXT: movq %r14, 32(%rdi) -; AVX2-NEXT: movq %rdx, 40(%rdi) -; AVX2-NEXT: movq %rcx, 16(%rdi) -; AVX2-NEXT: movq %r10, 24(%rdi) -; AVX2-NEXT: movq %rsi, (%rdi) -; AVX2-NEXT: movq %r8, 8(%rdi) -; AVX2-NEXT: setae %al -; AVX2-NEXT: addq $184, %rsp -; AVX2-NEXT: popq %rbx -; AVX2-NEXT: popq %r12 -; AVX2-NEXT: popq %r13 -; AVX2-NEXT: popq %r14 -; AVX2-NEXT: popq %r15 -; AVX2-NEXT: popq %rbp -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: init_eq_i512: -; AVX512: # %bb.0: -; AVX512-NEXT: pushq %rbp -; AVX512-NEXT: pushq %r15 -; AVX512-NEXT: pushq %r14 -; AVX512-NEXT: pushq %r13 -; AVX512-NEXT: pushq %r12 -; AVX512-NEXT: pushq %rbx -; AVX512-NEXT: subq $168, %rsp -; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovaps {{.*#+}} xmm1 = [1,0,0,0] -; AVX512-NEXT: vmovups %ymm1, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512-NEXT: movl %esi, %ecx -; AVX512-NEXT: andl $63, %ecx -; AVX512-NEXT: movl %esi, %r10d -; AVX512-NEXT: shrl $3, %r10d -; AVX512-NEXT: movl %r10d, %r8d -; AVX512-NEXT: andl $56, %r8d -; AVX512-NEXT: negl %r8d -; AVX512-NEXT: movslq %r8d, %r9 -; AVX512-NEXT: movq 112(%rsp,%r9), %r11 -; AVX512-NEXT: movq 120(%rsp,%r9), %r14 -; AVX512-NEXT: movq %r14, %rax -; AVX512-NEXT: shldq %cl, %r11, %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 104(%rsp,%r9), %rax -; AVX512-NEXT: shldq %cl, %rax, %r11 -; AVX512-NEXT: movq 128(%rsp,%r9), %r15 -; AVX512-NEXT: movq 136(%rsp,%r9), %rbp -; AVX512-NEXT: movq %rbp, %rbx -; AVX512-NEXT: shldq %cl, %r15, %rbx -; AVX512-NEXT: shldq %cl, %r14, %r15 -; AVX512-NEXT: movq 144(%rsp,%r9), %r13 -; AVX512-NEXT: movq 152(%rsp,%r9), %r12 -; AVX512-NEXT: shldq %cl, %r13, %r12 -; AVX512-NEXT: movq 96(%rsp,%r9), %r14 -; AVX512-NEXT: shldq %cl, %rbp, %r13 -; AVX512-NEXT: shldq %cl, %r14, %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movl %edx, %edx -; AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX512-NEXT: vmovups %xmm1, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, 
-{{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: movq %rdx, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: movq $0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movq 8(%rsp,%r9), %r8 -; AVX512-NEXT: movq 16(%rsp,%r9), %rax -; AVX512-NEXT: movq %rax, %rbp -; AVX512-NEXT: shldq %cl, %r8, %rbp -; AVX512-NEXT: andnq 48(%rdi), %r13, %r13 -; AVX512-NEXT: orq %rbp, %r13 -; AVX512-NEXT: movq 24(%rsp,%r9), %rbp -; AVX512-NEXT: shldq %cl, %rax, %rbp -; AVX512-NEXT: movq -8(%rsp,%r9), %rax -; AVX512-NEXT: movq (%rsp,%r9), %rsi -; AVX512-NEXT: movq %rsi, %rdx -; AVX512-NEXT: shldq %cl, %rax, %rdx -; AVX512-NEXT: andnq 56(%rdi), %r12, %r12 -; AVX512-NEXT: orq %rbp, %r12 -; AVX512-NEXT: andnq 32(%rdi), %r15, %r15 -; AVX512-NEXT: orq %rdx, %r15 -; AVX512-NEXT: shldq %cl, %rsi, %r8 -; AVX512-NEXT: movq -24(%rsp,%r9), %rdx -; AVX512-NEXT: movq -16(%rsp,%r9), %rsi -; AVX512-NEXT: movq %rsi, %rbp -; AVX512-NEXT: shldq %cl, %rdx, %rbp -; AVX512-NEXT: andnq 40(%rdi), %rbx, %rbx -; AVX512-NEXT: orq %r8, %rbx -; AVX512-NEXT: andnq 16(%rdi), %r11, %r8 -; AVX512-NEXT: orq %rbp, %r8 -; AVX512-NEXT: shlxq %rcx, %r14, %r11 -; AVX512-NEXT: movq -32(%rsp,%r9), %r9 -; AVX512-NEXT: shldq %cl, %rsi, %rax -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload -; AVX512-NEXT: andnq 24(%rdi), %rsi, %rsi -; AVX512-NEXT: orq %rax, %rsi -; AVX512-NEXT: shlxq %rcx, %r9, %rax -; AVX512-NEXT: # kill: def $cl killed $cl killed $rcx -; AVX512-NEXT: shldq %cl, %r9, %rdx -; AVX512-NEXT: andnq (%rdi), %r11, %rcx -; AVX512-NEXT: orq %rax, %rcx -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT: andnq 8(%rdi), %rax, %rax -; AVX512-NEXT: orq %rdx, %rax -; AVX512-NEXT: andl $60, %r10d -; AVX512-NEXT: movl (%rdi,%r10), %edx -; AVX512-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %r9d # 4-byte Reload -; AVX512-NEXT: btl %r9d, %edx -; AVX512-NEXT: movq %r13, 48(%rdi) -; AVX512-NEXT: movq %r12, 56(%rdi) -; AVX512-NEXT: movq %r15, 32(%rdi) -; AVX512-NEXT: movq %rbx, 40(%rdi) -; AVX512-NEXT: movq %r8, 16(%rdi) -; AVX512-NEXT: movq %rsi, 24(%rdi) -; AVX512-NEXT: movq %rcx, (%rdi) -; AVX512-NEXT: movq %rax, 8(%rdi) -; AVX512-NEXT: setae %al -; AVX512-NEXT: addq $168, %rsp -; AVX512-NEXT: popq %rbx -; AVX512-NEXT: popq %r12 -; AVX512-NEXT: popq %r13 -; AVX512-NEXT: popq %r14 -; AVX512-NEXT: popq %r15 -; AVX512-NEXT: popq %rbp -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX-LABEL: init_eq_i512: +; AVX: # %bb.0: +; AVX-NEXT: movl %esi, %ecx +; AVX-NEXT: shrl $3, %ecx +; AVX-NEXT: andl $60, %ecx +; AVX-NEXT: movl (%rdi,%rcx), %r8d +; AVX-NEXT: btl %esi, %r8d +; AVX-NEXT: setae %al +; AVX-NEXT: btrl %esi, %r8d +; AVX-NEXT: shlxl %esi, %edx, %edx +; AVX-NEXT: orl %r8d, %edx +; AVX-NEXT: movl %edx, (%rdi,%rcx) +; AVX-NEXT: retq %rem = and i32 %position, 511 %ofs = zext nneg i32 %rem to i512 %bit = shl nuw i512 1, %ofs From 84cc2b0ebc2b28e0474c9aa46264f4c20adcc959 Mon Sep 17 00:00:00 2001 From: Tomohiro Kashiwada <kikairoya@gmail.com> Date: Tue, 4 Nov 2025 18:37:58 +0900 Subject: [PATCH 135/313] [LIT] replace `lit.util.mkdir` with `pathlib.Path.mkdir` (#163948) `lit.util.mkdir` and `lit.util.mkdir_p` were written during the Python 2.x era. Since modern `pathlib` functions have similar functionality, we can simply use those instead. If you encounter a path length issue after this change, the registry value `LongPathsEnabled` must be set as described in https://learn.microsoft.com/en-us/windows/win32/fileio/maximum-file-path-limitation . 
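For reference, the replacement idiom is just the standard library (a minimal sketch; the path below is made up):

```
import pathlib

out = pathlib.Path("build") / "test-output"

# Equivalent of the old lit.util.mkdir_p(out): creates missing parents and
# tolerates an already-existing directory.
out.mkdir(parents=True, exist_ok=True)

# Equivalent of the old lit.util.mkdir(out): a single level, where
# exist_ok=True lets the OS decide whether the target already exists
# instead of lit pre-checking with os.path.exists() (see the Cygwin
# background below).
out.mkdir(exist_ok=True)
```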
Note that the Python runtime is already marked as a `longPathAware` executable.

**Background:**

On Cygwin, a file named `file_name.exe` can be accessed without the suffix, simply as `file_name`, as shown below:

```
$ echo > file_name.exe
$ file file_name.exe
file_name.exe: very short file (no magic)
$ file file_name
file_name: very short file (no magic)
```

In this situation, while running `mkdir file_name` works as intended, checking for the existence of the target before calling `mkdir` incorrectly reports that it already exists and thus skips the directory creation.

```
$ test -e file_name && echo exists
exists
$ mkdir file_name && echo ok
ok
$ file file_name
file_name: directory
```

Therefore, the existence pre-check should be skipped on Cygwin. Instead of adding a workaround, I refactored them.
---
 llvm/utils/lit/lit/TestRunner.py              | 15 ++++---
 llvm/utils/lit/lit/util.py                    | 40 +++----------------
 .../shtest-glob/example_dir1.input/empty      |  0
 .../shtest-glob/example_dir2.input/empty      |  0
 .../tests/Inputs/shtest-glob/glob-mkdir.txt   |  5 +++
 llvm/utils/lit/tests/shtest-glob.py           |  9 +++--
 6 files changed, 22 insertions(+), 47 deletions(-)
 create mode 100644 llvm/utils/lit/tests/Inputs/shtest-glob/example_dir1.input/empty
 create mode 100644 llvm/utils/lit/tests/Inputs/shtest-glob/example_dir2.input/empty

diff --git a/llvm/utils/lit/lit/TestRunner.py b/llvm/utils/lit/lit/TestRunner.py
index 3176b1a257434..64148c6098327 100644
--- a/llvm/utils/lit/lit/TestRunner.py
+++ b/llvm/utils/lit/lit/TestRunner.py
@@ -462,16 +462,15 @@ def executeBuiltinMkdir(cmd, cmd_shenv):
     stderr = StringIO()
     exitCode = 0
     for dir in args:
-        cwd = cmd_shenv.cwd
-        dir = to_unicode(dir) if kIsWindows else to_bytes(dir)
-        cwd = to_unicode(cwd) if kIsWindows else to_bytes(cwd)
-        if not os.path.isabs(dir):
-            dir = lit.util.abs_path_preserve_drive(os.path.join(cwd, dir))
+        dir = pathlib.Path(dir)
+        cwd = pathlib.Path(to_unicode(cmd_shenv.cwd))
+        if not dir.is_absolute():
+            dir = lit.util.abs_path_preserve_drive(cwd / dir)
         if parent:
-            lit.util.mkdir_p(dir)
+            dir.mkdir(parents=True, exist_ok=True)
         else:
             try:
-                lit.util.mkdir(dir)
+                dir.mkdir(exist_ok=True)
             except OSError as err:
                 stderr.write("Error: 'mkdir' command failed, %s\n" % str(err))
                 exitCode = 1
@@ -2411,7 +2410,7 @@ def runOnce(
         return out, err, exitCode, timeoutInfo, status

     # Create the output directory if it does not already exist.
-    lit.util.mkdir_p(os.path.dirname(tmpBase))
+    pathlib.Path(tmpBase).parent.mkdir(parents=True, exist_ok=True)

     # Re-run failed tests up to test.allowed_retries times.
     execdir = os.path.dirname(test.getExecPath())
diff --git a/llvm/utils/lit/lit/util.py b/llvm/utils/lit/lit/util.py
index ce4c3c2df3436..e4e031b3e0898 100644
--- a/llvm/utils/lit/lit/util.py
+++ b/llvm/utils/lit/lit/util.py
@@ -5,6 +5,7 @@
 import math
 import numbers
 import os
+import pathlib
 import platform
 import re
 import signal
@@ -131,48 +132,17 @@ def abs_path_preserve_drive(path):
         # Since Python 3.8, os.path.realpath resolves sustitute drives,
         # so we should not use it. In Python 3.7, os.path.realpath
         # was implemented as os.path.abspath.
+        if isinstance(path, pathlib.Path):
+            return path.absolute()
         return os.path.abspath(path)
     else:
         # On UNIX, the current directory always has symbolic links resolved,
         # so any program accepting relative paths cannot preserve symbolic
         # links in paths and we should always use os.path.realpath.
+ if isinstance(path, pathlib.Path): + return path.resolve() return os.path.realpath(path) -def mkdir(path): - try: - if platform.system() == "Windows": - from ctypes import windll - from ctypes import GetLastError, WinError - - path = os.path.abspath(path) - # Make sure that the path uses backslashes here, in case - # python would have happened to use forward slashes, as the - # NT path format only supports backslashes. - path = path.replace("/", "\\") - NTPath = to_unicode(r"\\?\%s" % path) - if not windll.kernel32.CreateDirectoryW(NTPath, None): - raise WinError(GetLastError()) - else: - os.mkdir(path) - except OSError: - e = sys.exc_info()[1] - # ignore EEXIST, which may occur during a race condition - if e.errno != errno.EEXIST: - raise - - -def mkdir_p(path): - """mkdir_p(path) - Make the "path" directory, if it does not exist; this - will also make directories for any missing parent directories.""" - if not path or os.path.exists(path): - return - - parent = os.path.dirname(path) - if parent != path: - mkdir_p(parent) - - mkdir(path) - def listdir_files(dirname, suffixes=None, exclude_filenames=None, prefixes=None): """Yields files in a directory. diff --git a/llvm/utils/lit/tests/Inputs/shtest-glob/example_dir1.input/empty b/llvm/utils/lit/tests/Inputs/shtest-glob/example_dir1.input/empty new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/llvm/utils/lit/tests/Inputs/shtest-glob/example_dir2.input/empty b/llvm/utils/lit/tests/Inputs/shtest-glob/example_dir2.input/empty new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/llvm/utils/lit/tests/Inputs/shtest-glob/glob-mkdir.txt b/llvm/utils/lit/tests/Inputs/shtest-glob/glob-mkdir.txt index d1329f5dbfaae..71972411bc4cf 100644 --- a/llvm/utils/lit/tests/Inputs/shtest-glob/glob-mkdir.txt +++ b/llvm/utils/lit/tests/Inputs/shtest-glob/glob-mkdir.txt @@ -1,2 +1,7 @@ ## Tests glob pattern handling in the mkdir command. + +## This mkdir should fail because the `example_file*.input`s are regular files. # RUN: not mkdir %S/example_file*.input + +## This mkdir should succeed (so RUN should fail) because the `example_dir*.input`s that already exist are directories. +# RUN: not mkdir %S/example_dir*.input diff --git a/llvm/utils/lit/tests/shtest-glob.py b/llvm/utils/lit/tests/shtest-glob.py index ae90f31907d49..aa4705b634a7d 100644 --- a/llvm/utils/lit/tests/shtest-glob.py +++ b/llvm/utils/lit/tests/shtest-glob.py @@ -1,12 +1,13 @@ ## Tests glob pattern handling in echo command. # RUN: not %{lit} -a -v %{inputs}/shtest-glob \ -# RUN: | FileCheck -dump-input=fail -match-full-lines %s -# +# RUN: | FileCheck -dump-input=fail -match-full-lines --implicit-check-not=Error: %s # END. # CHECK: UNRESOLVED: shtest-glob :: glob-echo.txt ({{[^)]*}}) # CHECK: TypeError: string argument expected, got 'GlobItem' -# CHECK: FAIL: shtest-glob :: glob-mkdir.txt ({{[^)]*}}) -# CHECK: # error: command failed with exit status: 1 +# CHECK: FAIL: shtest-glob :: glob-mkdir.txt ({{[^)]*}}) +# CHECK: # | Error: 'mkdir' command failed, {{.*}}example_file1.input' +# CHECK-NEXT: # | Error: 'mkdir' command failed, {{.*}}example_file2.input' +# CHECK: # error: command failed with exit status: 1 From dec6e7380c691af755efe3f80e53590b743c497c Mon Sep 17 00:00:00 2001 From: Tomohiro Kashiwada <kikairoya@gmail.com> Date: Tue, 4 Nov 2025 18:38:20 +0900 Subject: [PATCH 136/313] [LIT][Cygwin] Mark `ulimit -f` unsupported for Cygwin (#165849) Cygwin doesn't support `ulimit -f` because Windows doesn't provide such functionality. 
---
 llvm/utils/lit/tests/shtest-ulimit-nondarwin.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/utils/lit/tests/shtest-ulimit-nondarwin.py b/llvm/utils/lit/tests/shtest-ulimit-nondarwin.py
index 286fd3d7e173e..d81cde0159792 100644
--- a/llvm/utils/lit/tests/shtest-ulimit-nondarwin.py
+++ b/llvm/utils/lit/tests/shtest-ulimit-nondarwin.py
@@ -2,7 +2,7 @@

 # ulimit does not work on non-POSIX platforms.
 # These tests are specific to options that Darwin does not support.
-# UNSUPPORTED: system-windows, system-darwin, system-aix, system-solaris
+# UNSUPPORTED: system-windows, system-cygwin, system-darwin, system-aix, system-solaris

 # RUN: not %{lit} -a -v %{inputs}/shtest-ulimit-nondarwin | FileCheck %s

From 73e3d788906a6e82394ac41d5c9baf918740bb0c Mon Sep 17 00:00:00 2001
From: David Spickett <david.spickett@linaro.org>
Date: Tue, 4 Nov 2025 10:00:33 +0000
Subject: [PATCH 137/313] [flang][test] Fix gcc-triple test when default
 target is not x86_64

By adding --target to the first command.

Test added by #165886.
---
 flang/test/Driver/gcc-triple.f90 | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/flang/test/Driver/gcc-triple.f90 b/flang/test/Driver/gcc-triple.f90
index 027d78a7c5046..324311febf157 100644
--- a/flang/test/Driver/gcc-triple.f90
+++ b/flang/test/Driver/gcc-triple.f90
@@ -2,7 +2,7 @@

 !! Test that --gcc-triple option is working as expected.

-! RUN: %flang -v --sysroot=%S/Inputs/fedora_39_tree 2>&1 | FileCheck %s --dump-input=always --check-prefix=DEFAULT_TRIPLE
+! RUN: %flang --target=x86_64-linux-gnu -v --sysroot=%S/Inputs/fedora_39_tree 2>&1 | FileCheck %s --dump-input=always --check-prefix=DEFAULT_TRIPLE
 ! DEFAULT_TRIPLE: {{^}}Found candidate GCC installation:
 ! DEFAULT_TRIPLE: fedora_39_tree/usr/lib/gcc/x86_64-linux-gnu/13
 ! DEFAULT_TRIPLE: {{^}}Found candidate GCC installation:

From 0cae0af520f7f3c60367ea4a2b38a32a35fa6c27 Mon Sep 17 00:00:00 2001
From: Ramkumar Ramachandra <ramkumar.ramachandra@codasip.com>
Date: Tue, 4 Nov 2025 10:04:57 +0000
Subject: [PATCH 138/313] [VPlan] Shorten insert-idiom in sinkScalarOperands
 (NFC) (#166343)

To follow up on a post-commit review.
---
 .../Transforms/Vectorize/VPlanTransforms.cpp  | 39 +++++++++----------
 1 file changed, 18 insertions(+), 21 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 9d9bb14530539..e8e87aa8a4f3c 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -154,27 +154,32 @@ static bool sinkScalarOperands(VPlan &Plan) {
   bool ScalarVFOnly = Plan.hasScalarVFOnly();
   bool Changed = false;

-  auto IsValidSinkCandidate = [ScalarVFOnly](VPBasicBlock *SinkTo,
-                                             VPSingleDefRecipe *Candidate) {
-    // We only know how to duplicate VPReplicateRecipes and
-    // VPScalarIVStepsRecipes for now.
+  SetVector<std::pair<VPBasicBlock *, VPSingleDefRecipe *>> WorkList;
+  auto InsertIfValidSinkCandidate = [ScalarVFOnly, &WorkList](
+                                        VPBasicBlock *SinkTo, VPValue *Op) {
+    auto *Candidate =
+        dyn_cast_or_null<VPSingleDefRecipe>(Op->getDefiningRecipe());
+    if (!Candidate)
+      return;
+
+    // We only know how to sink VPReplicateRecipes and VPScalarIVStepsRecipes
+    // for now.
if (!isa<VPReplicateRecipe, VPScalarIVStepsRecipe>(Candidate)) - return false; + return; if (Candidate->getParent() == SinkTo || Candidate->mayHaveSideEffects() || Candidate->mayReadOrWriteMemory()) - return false; + return; if (auto *RepR = dyn_cast<VPReplicateRecipe>(Candidate)) if (!ScalarVFOnly && RepR->isSingleScalar()) - return false; + return; - return true; + WorkList.insert({SinkTo, Candidate}); }; // First, collect the operands of all recipes in replicate blocks as seeds for // sinking. - SetVector<std::pair<VPBasicBlock *, VPSingleDefRecipe *>> WorkList; for (VPRegionBlock *VPR : VPBlockUtils::blocksOnly<VPRegionBlock>(Iter)) { VPBasicBlock *EntryVPBB = VPR->getEntryBasicBlock(); if (!VPR->isReplicator() || EntryVPBB->getSuccessors().size() != 2) @@ -182,14 +187,9 @@ static bool sinkScalarOperands(VPlan &Plan) { VPBasicBlock *VPBB = cast<VPBasicBlock>(EntryVPBB->getSuccessors().front()); if (VPBB->getSingleSuccessor() != VPR->getExitingBasicBlock()) continue; - for (auto &Recipe : *VPBB) { - for (VPValue *Op : Recipe.operands()) { - if (auto *Def = - dyn_cast_or_null<VPSingleDefRecipe>(Op->getDefiningRecipe())) - if (IsValidSinkCandidate(VPBB, Def)) - WorkList.insert({VPBB, Def}); - } - } + for (auto &Recipe : *VPBB) + for (VPValue *Op : Recipe.operands()) + InsertIfValidSinkCandidate(VPBB, Op); } // Try to sink each replicate or scalar IV steps recipe in the worklist. @@ -234,10 +234,7 @@ static bool sinkScalarOperands(VPlan &Plan) { } SinkCandidate->moveBefore(*SinkTo, SinkTo->getFirstNonPhi()); for (VPValue *Op : SinkCandidate->operands()) - if (auto *Def = - dyn_cast_or_null<VPSingleDefRecipe>(Op->getDefiningRecipe())) - if (IsValidSinkCandidate(SinkTo, Def)) - WorkList.insert({SinkTo, Def}); + InsertIfValidSinkCandidate(SinkTo, Op); Changed = true; } return Changed; From ab487b6378cfd2f59b25eb945f07c5074b5217b8 Mon Sep 17 00:00:00 2001 From: Andrew Ng <andrew.ng@sony.com> Date: Tue, 4 Nov 2025 10:07:54 +0000 Subject: [PATCH 139/313] [BitcodeReader][NFC] Tidy getEnableSplitLTOUnitAndUnifiedFlag (#165732) --- llvm/lib/Bitcode/Reader/BitcodeReader.cpp | 34 ++++++----------------- 1 file changed, 9 insertions(+), 25 deletions(-) diff --git a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp index 466dcb02696f4..1bdf3e9f684f5 100644 --- a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp +++ b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp @@ -8566,16 +8566,13 @@ Expected<std::unique_ptr<ModuleSummaryIndex>> BitcodeModule::getSummary() { } static Expected<std::pair<bool, bool>> -getEnableSplitLTOUnitAndUnifiedFlag(BitstreamCursor &Stream, - unsigned ID, - BitcodeLTOInfo <OInfo) { +getEnableSplitLTOUnitAndUnifiedFlag(BitstreamCursor &Stream, unsigned ID) { if (Error Err = Stream.EnterSubBlock(ID)) return std::move(Err); - SmallVector<uint64_t, 64> Record; + SmallVector<uint64_t, 64> Record; while (true) { BitstreamEntry Entry; - std::pair<bool, bool> Result = {false,false}; if (Error E = Stream.advanceSkippingSubblocks().moveInto(Entry)) return std::move(E); @@ -8584,8 +8581,8 @@ getEnableSplitLTOUnitAndUnifiedFlag(BitstreamCursor &Stream, case BitstreamEntry::Error: return error("Malformed block"); case BitstreamEntry::EndBlock: { - // If no flags record found, set both flags to false. - return Result; + // If no flags record found, return both flags as false. + return std::make_pair(false, false); } case BitstreamEntry::Record: // The interesting case. 
@@ -8607,9 +8604,7 @@ getEnableSplitLTOUnitAndUnifiedFlag(BitstreamCursor &Stream, bool EnableSplitLTOUnit = Flags & 0x8; bool UnifiedLTO = Flags & 0x200; - Result = {EnableSplitLTOUnit, UnifiedLTO}; - - return Result; + return std::make_pair(EnableSplitLTOUnit, UnifiedLTO); } } } @@ -8638,26 +8633,15 @@ Expected<BitcodeLTOInfo> BitcodeModule::getLTOInfo() { /*EnableSplitLTOUnit=*/false, /*UnifiedLTO=*/false}; case BitstreamEntry::SubBlock: - if (Entry.ID == bitc::GLOBALVAL_SUMMARY_BLOCK_ID) { - BitcodeLTOInfo LTOInfo; + if (Entry.ID == bitc::GLOBALVAL_SUMMARY_BLOCK_ID || + Entry.ID == bitc::FULL_LTO_GLOBALVAL_SUMMARY_BLOCK_ID) { Expected<std::pair<bool, bool>> Flags = - getEnableSplitLTOUnitAndUnifiedFlag(Stream, Entry.ID, LTOInfo); + getEnableSplitLTOUnitAndUnifiedFlag(Stream, Entry.ID); if (!Flags) return Flags.takeError(); - std::tie(LTOInfo.EnableSplitLTOUnit, LTOInfo.UnifiedLTO) = Flags.get(); - LTOInfo.IsThinLTO = true; - LTOInfo.HasSummary = true; - return LTOInfo; - } - - if (Entry.ID == bitc::FULL_LTO_GLOBALVAL_SUMMARY_BLOCK_ID) { BitcodeLTOInfo LTOInfo; - Expected<std::pair<bool, bool>> Flags = - getEnableSplitLTOUnitAndUnifiedFlag(Stream, Entry.ID, LTOInfo); - if (!Flags) - return Flags.takeError(); std::tie(LTOInfo.EnableSplitLTOUnit, LTOInfo.UnifiedLTO) = Flags.get(); - LTOInfo.IsThinLTO = false; + LTOInfo.IsThinLTO = (Entry.ID == bitc::GLOBALVAL_SUMMARY_BLOCK_ID); LTOInfo.HasSummary = true; return LTOInfo; } From 97947f150f55727c5ae0998218260561e28b946e Mon Sep 17 00:00:00 2001 From: Srinivasa Ravi <srinivasar@nvidia.com> Date: Tue, 4 Nov 2025 15:38:59 +0530 Subject: [PATCH 140/313] [MLIR][NVVM] Update Op verifiers to prevent ungraceful exits (#165677) Updates the following Ops to prevent ungraceful exits with a stack-dump in certain cases of incorrect usages, and instead gracefully error out with a more informative error message: - `tcgen05.ld` - `shfl.sync` --- mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td | 9 +++- mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp | 46 +++++++++++++++---- mlir/test/Dialect/LLVMIR/invalid.mlir | 6 +-- .../Target/LLVMIR/nvvm/shfl-sync-invalid.mlir | 22 +++++++++ .../LLVMIR/nvvm/tcgen05-ld-invalid.mlir | 9 ++++ 5 files changed, 78 insertions(+), 14 deletions(-) create mode 100644 mlir/test/Target/LLVMIR/nvvm/shfl-sync-invalid.mlir create mode 100644 mlir/test/Target/LLVMIR/nvvm/tcgen05-ld-invalid.mlir diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td index 9be108d5d1056..5f87e5c07e56e 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td @@ -1328,9 +1328,9 @@ def ShflKindAttr : EnumAttr<NVVM_Dialect, ShflKind, "shfl_kind">; def NVVM_ShflOp : NVVM_Op<"shfl.sync", [NVVMRequiresSM<30>]>, - Results<(outs LLVM_Type:$res)>, + Results<(outs AnyTypeOf<[I32, F32, LLVMStructType]>:$res)>, Arguments<(ins I32:$thread_mask, - LLVM_Type:$val, + AnyTypeOf<[I32, F32]>:$val, I32:$offset, I32:$mask_and_clamp, ShflKindAttr:$kind, @@ -1346,6 +1346,11 @@ def NVVM_ShflOp : a mask for logically splitting warps into sub-segments and an upper bound for clamping the source lane index. + The `return_value_and_is_valid` unit attribute can be specified to indicate + that the return value is a two-element struct, where the first element is + the result value and the second element is a predicate indicating if the + computed source lane index is valid. 
+ [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-shfl-sync) }]; string llvmBuilder = [{ diff --git a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp index f2e55f255ceac..262d9b753a2d7 100644 --- a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp +++ b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp @@ -920,15 +920,40 @@ LogicalResult MmaOp::verify() { } LogicalResult ShflOp::verify() { - if (!(*this)->getAttrOfType<UnitAttr>("return_value_and_is_valid")) - return success(); - auto type = llvm::dyn_cast<LLVM::LLVMStructType>(getType()); - auto elementType = (type && type.getBody().size() == 2) - ? llvm::dyn_cast<IntegerType>(type.getBody()[1]) - : nullptr; - if (!elementType || elementType.getWidth() != 1) - return emitError("expected return type to be a two-element struct with " - "i1 as the second element"); + auto returnStructType = llvm::dyn_cast<LLVM::LLVMStructType>(getType()); + + auto verifyTypeError = [&](Twine desc, Type expectedType, + Type actualType) -> LogicalResult { + return emitOpError("expected " + desc + " to be of type ") + << expectedType << " but got " << actualType << " instead"; + }; + + if (returnStructType) { + if (!getReturnValueAndIsValid()) + return emitOpError("\"return_value_and_is_valid\" attribute must be " + "specified when the return type is a struct type"); + + if (returnStructType.getBody().size() != 2) + return emitOpError("expected return type to be a two-element struct"); + + llvm::ArrayRef<Type> returnStruct = returnStructType.getBody(); + auto resultType = returnStruct[0]; + if (resultType != getVal().getType()) + return verifyTypeError("first element in the returned struct", + getVal().getType(), resultType); + + auto predicateType = returnStruct[1]; + if (!predicateType.isInteger(1)) + return verifyTypeError("second element in the returned struct", + mlir::IntegerType::get(getContext(), 1), + predicateType); + } else { + if (getReturnValueAndIsValid()) + return emitOpError("expected return type to be a two-element struct"); + + if (getType() != getVal().getType()) + return verifyTypeError("return type", getVal().getType(), getType()); + } return success(); } @@ -2677,6 +2702,9 @@ LogicalResult Tcgen05LdOp::verify() { if (getShape() == NVVM::Tcgen05LdStShape::SHAPE_16X32BX2 && !getOffset()) result = emitError("shape 16x32bx2 requires offset argument"); + if (getShape() != NVVM::Tcgen05LdStShape::SHAPE_16X32BX2 && getOffset()) + result = emitError("offset argument is only supported for shape 16x32bx2"); + auto resTy = getRes().getType(); unsigned resLen = isa<VectorType>(resTy) ? 
llvm::cast<VectorType>(resTy).getNumElements() diff --git a/mlir/test/Dialect/LLVMIR/invalid.mlir b/mlir/test/Dialect/LLVMIR/invalid.mlir index aaf9f8024bfbe..49b6342aea538 100644 --- a/mlir/test/Dialect/LLVMIR/invalid.mlir +++ b/mlir/test/Dialect/LLVMIR/invalid.mlir @@ -664,21 +664,21 @@ func.func @zero_non_llvm_type() { // ----- func.func @nvvm_invalid_shfl_pred_1(%arg0 : i32, %arg1 : i32, %arg2 : i32, %arg3 : i32) { - // expected-error@+1 {{expected return type to be a two-element struct with i1 as the second element}} + // expected-error@+1 {{expected return type to be a two-element struct}} %0 = nvvm.shfl.sync bfly %arg0, %arg3, %arg1, %arg2 {return_value_and_is_valid} : i32 -> i32 } // ----- func.func @nvvm_invalid_shfl_pred_2(%arg0 : i32, %arg1 : i32, %arg2 : i32, %arg3 : i32) { - // expected-error@+1 {{expected return type to be a two-element struct with i1 as the second element}} + // expected-error@+1 {{expected return type to be a two-element struct}} %0 = nvvm.shfl.sync bfly %arg0, %arg3, %arg1, %arg2 {return_value_and_is_valid} : i32 -> !llvm.struct<(i32)> } // ----- func.func @nvvm_invalid_shfl_pred_3(%arg0 : i32, %arg1 : i32, %arg2 : i32, %arg3 : i32) { - // expected-error@+1 {{expected return type to be a two-element struct with i1 as the second element}} + // expected-error@+1 {{expected second element in the returned struct to be of type 'i1' but got 'i32' instead}} %0 = nvvm.shfl.sync bfly %arg0, %arg3, %arg1, %arg2 {return_value_and_is_valid} : i32 -> !llvm.struct<(i32, i32)> } diff --git a/mlir/test/Target/LLVMIR/nvvm/shfl-sync-invalid.mlir b/mlir/test/Target/LLVMIR/nvvm/shfl-sync-invalid.mlir new file mode 100644 index 0000000000000..f2ccfe71a3f23 --- /dev/null +++ b/mlir/test/Target/LLVMIR/nvvm/shfl-sync-invalid.mlir @@ -0,0 +1,22 @@ +// RUN: mlir-translate -verify-diagnostics -split-input-file -mlir-to-llvmir %s + +// ----- + +func.func @nvvm_invalid_shfl_pred(%arg0 : i32, %arg1 : f32, %arg2 : i32, %arg3 : i32) { + // expected-error@+1 {{"return_value_and_is_valid" attribute must be specified when the return type is a struct type}} + %0 = nvvm.shfl.sync bfly %arg0, %arg1, %arg2, %arg3 : f32 -> !llvm.struct<(f32, i1)> +} + +// ----- + +func.func @nvvm_invalid_shfl_invalid_return_type_1(%arg0 : i32, %arg1 : f32, %arg2 : i32, %arg3 : i32) { + // expected-error@+1 {{expected return type to be of type 'f32' but got 'i32' instead}} + %0 = nvvm.shfl.sync bfly %arg0, %arg1, %arg2, %arg3 : f32 -> i32 +} + +// ----- + +func.func @nvvm_invalid_shfl_invalid_return_type_2(%arg0 : i32, %arg1 : f32, %arg2 : i32, %arg3 : i32) { + // expected-error@+1 {{expected first element in the returned struct to be of type 'f32' but got 'i32' instead}} + %0 = nvvm.shfl.sync bfly %arg0, %arg1, %arg2, %arg3 {return_value_and_is_valid} : f32 -> !llvm.struct<(i32, i1)> +} diff --git a/mlir/test/Target/LLVMIR/nvvm/tcgen05-ld-invalid.mlir b/mlir/test/Target/LLVMIR/nvvm/tcgen05-ld-invalid.mlir new file mode 100644 index 0000000000000..1b93f20c15b99 --- /dev/null +++ b/mlir/test/Target/LLVMIR/nvvm/tcgen05-ld-invalid.mlir @@ -0,0 +1,9 @@ +// RUN: mlir-translate -verify-diagnostics -split-input-file -mlir-to-llvmir %s + +// ----- + +llvm.func @nvvm_tcgen05_ld_32x32b_offset(%tmemAddr : !llvm.ptr<6>, %offset : i64) -> () { + // expected-error@+1 {{offset argument is only supported for shape 16x32bx2}} + %ldv2 = nvvm.tcgen05.ld %tmemAddr, %offset { pack, shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>} : vector<2 x i32> + llvm.return +} From f7be258c28939b25f5f9ec07174ace0d81ca3eea Mon Sep 17 00:00:00 2001 
From: Jinjie Huang <huangjinjie@bytedance.com> Date: Tue, 4 Nov 2025 18:27:53 +0800 Subject: [PATCH 141/313] [BOLT][NFC] Clean up the outdated option --write-dwp in doc (#166150) Since the "--write-dwp" option was removed in [PR](https://github.com/llvm/llvm-project/pull/100771), this patch cleans up the corresponding documentation and test so they no longer mislead users. --- bolt/docs/CommandLineArgumentReference.md | 5 ----- bolt/test/X86/dwarf4-ftypes-dwp-input-dwp-output.test | 5 +++-- 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/bolt/docs/CommandLineArgumentReference.md b/bolt/docs/CommandLineArgumentReference.md index 43ceceee7de45..7c6e01d669b74 100644 --- a/bolt/docs/CommandLineArgumentReference.md +++ b/bolt/docs/CommandLineArgumentReference.md @@ -381,11 +381,6 @@ Set verbosity level for diagnostic output -- `--write-dwp` - - Output a single dwarf package file (dwp) instead of multiple non-relocatable - dwarf object files (dwo). - ### BOLT optimization options: - `--align-blocks` diff --git a/bolt/test/X86/dwarf4-ftypes-dwp-input-dwp-output.test b/bolt/test/X86/dwarf4-ftypes-dwp-input-dwp-output.test index 673e86bb1533a..a08e352d605fe 100644 --- a/bolt/test/X86/dwarf4-ftypes-dwp-input-dwp-output.test +++ b/bolt/test/X86/dwarf4-ftypes-dwp-input-dwp-output.test @@ -1,4 +1,4 @@ -# UNSUPPORTED: true +# REQUIRES: system-linux ; RUN: rm -rf %t ; RUN: mkdir %t ; RUN: cd %t @@ -8,7 +8,8 @@ ; RUN: llvm-dwp -e main.exe -o main.exe.dwp ; RUN: llvm-dwarfdump --show-form --verbose --debug-types main.exe.dwp | FileCheck -check-prefix=PRE-BOLT %s ; RUN: llvm-dwarfdump --show-form --verbose --debug-tu-index main.exe.dwp | FileCheck -check-prefix=PRE-BOLT-DWP-TU-INDEX %s -; RUN: llvm-bolt main.exe -o main.exe.bolt --update-debug-sections --write-dwp +; RUN: llvm-bolt main.exe -o main.exe.bolt --update-debug-sections +; RUN: llvm-dwp -e main.exe.bolt -o main.exe.bolt.dwp ; RUN: llvm-dwarfdump --show-form --verbose --debug-types main.exe.bolt.dwp | FileCheck -check-prefix=BOLT %s ; RUN: llvm-dwarfdump --show-form --verbose --debug-tu-index main.exe.bolt.dwp | FileCheck -check-prefix=BOLT-DWP-TU-INDEX %s From 5b2f9b53bdb348393bba221c5f69bfac179092c8 Mon Sep 17 00:00:00 2001 From: kper <kevin.per@protonmail.com> Date: Tue, 4 Nov 2025 11:35:40 +0100 Subject: [PATCH 142/313] [SimplifyCFG]: Switch on umin replaces default (#164097) A switch on `umin(a, C)` whose default is unreachable can eliminate the default case by making `C`'s destination the new default. Proof: https://alive2.llvm.org/ce/z/_N6nfs Fixes: https://github.com/llvm/llvm-project/issues/162111 --- llvm/include/llvm/IR/Instructions.h | 5 + llvm/lib/IR/Instructions.cpp | 10 + llvm/lib/Transforms/Utils/SimplifyCFG.cpp | 78 ++++++ .../Transforms/SimplifyCFG/switch-umin.ll | 246 ++++++++++++++++++ 4 files changed, 339 insertions(+) create mode 100644 llvm/test/Transforms/SimplifyCFG/switch-umin.ll diff --git a/llvm/include/llvm/IR/Instructions.h b/llvm/include/llvm/IR/Instructions.h index 27930bbc651bd..8bd060ae8f485 100644 --- a/llvm/include/llvm/IR/Instructions.h +++ b/llvm/include/llvm/IR/Instructions.h @@ -3556,6 +3556,11 @@ class SwitchInstProfUpdateWrapper { /// correspondent branch weight. LLVM_ABI SwitchInst::CaseIt removeCase(SwitchInst::CaseIt I); + /// Replace the default destination with the given case's destination. + /// Delegates to the underlying SwitchInst::setDefaultDest and transfers the + /// case's branch weight to the default.
+ LLVM_ABI void replaceDefaultDest(SwitchInst::CaseIt I); + /// Delegate the call to the underlying SwitchInst::addCase() and set the /// specified branch weight for the added case. LLVM_ABI void addCase(ConstantInt *OnVal, BasicBlock *Dest, CaseWeightOpt W); diff --git a/llvm/lib/IR/Instructions.cpp b/llvm/lib/IR/Instructions.cpp index 3b8fde8aff45f..cd39970f5111f 100644 --- a/llvm/lib/IR/Instructions.cpp +++ b/llvm/lib/IR/Instructions.cpp @@ -4171,6 +4171,16 @@ SwitchInstProfUpdateWrapper::removeCase(SwitchInst::CaseIt I) { return SI.removeCase(I); } +void SwitchInstProfUpdateWrapper::replaceDefaultDest(SwitchInst::CaseIt I) { + auto *DestBlock = I->getCaseSuccessor(); + if (Weights) { + auto Weight = getSuccessorWeight(I->getCaseIndex() + 1); + (*Weights)[0] = Weight.value(); + } + + SI.setDefaultDest(DestBlock); +} + void SwitchInstProfUpdateWrapper::addCase( ConstantInt *OnVal, BasicBlock *Dest, SwitchInstProfUpdateWrapper::CaseWeightOpt W) { diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp index cbc604e87cf1a..bb733277e0fad 100644 --- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp @@ -7570,6 +7570,81 @@ static bool reduceSwitchRange(SwitchInst *SI, IRBuilder<> &Builder, return true; } +/// Tries to transform the switch when the condition is umin with a constant. +/// In that case, the default branch can be replaced by the constant's branch. +/// This method also removes dead cases when the simplification cannot replace +/// the default branch. +/// +/// For example: +/// switch(umin(a, 3)) { +/// case 0: +/// case 1: +/// case 2: +/// case 3: +/// case 4: +/// // ... +/// default: +/// unreachable +/// } +/// +/// Transforms into: +/// +/// switch(a) { +/// case 0: +/// case 1: +/// case 2: +/// default: +/// // This is case 3 +/// } +static bool simplifySwitchWhenUMin(SwitchInst *SI, DomTreeUpdater *DTU) { + Value *A; + ConstantInt *Constant; + + if (!match(SI->getCondition(), m_UMin(m_Value(A), m_ConstantInt(Constant)))) + return false; + + SmallVector<DominatorTree::UpdateType> Updates; + SwitchInstProfUpdateWrapper SIW(*SI); + BasicBlock *BB = SIW->getParent(); + + // Dead cases are removed even when the simplification fails. + // A case is dead when its value is higher than the Constant. + for (auto I = SI->case_begin(), E = SI->case_end(); I != E;) { + if (!I->getCaseValue()->getValue().ugt(Constant->getValue())) { + ++I; + continue; + } + BasicBlock *DeadCaseBB = I->getCaseSuccessor(); + DeadCaseBB->removePredecessor(BB); + Updates.push_back({DominatorTree::Delete, BB, DeadCaseBB}); + I = SIW->removeCase(I); + E = SIW->case_end(); + } + + auto Case = SI->findCaseValue(Constant); + // If the case value is not found, `findCaseValue` returns the default case. + // In this scenario, since there is no explicit `case 3:`, the simplification + // fails. The simplification also fails when the switch’s default destination + // is reachable. + if (!SI->defaultDestUnreachable() || Case == SI->case_default()) { + if (DTU) + DTU->applyUpdates(Updates); + return !Updates.empty(); + } + + BasicBlock *Unreachable = SI->getDefaultDest(); + SIW.replaceDefaultDest(Case); + SIW.removeCase(Case); + SIW->setCondition(A); + + Updates.push_back({DominatorTree::Delete, BB, Unreachable}); + + if (DTU) + DTU->applyUpdates(Updates); + + return true; +} + /// Tries to transform switch of powers of two to reduce switch range. 
/// For example, switch like: /// switch (C) { case 1: case 2: case 64: case 128: } @@ -8037,6 +8112,9 @@ bool SimplifyCFGOpt::simplifySwitch(SwitchInst *SI, IRBuilder<> &Builder) { if (simplifyDuplicateSwitchArms(SI, DTU)) return requestResimplify(); + if (simplifySwitchWhenUMin(SI, DTU)) + return requestResimplify(); + return false; } diff --git a/llvm/test/Transforms/SimplifyCFG/switch-umin.ll b/llvm/test/Transforms/SimplifyCFG/switch-umin.ll new file mode 100644 index 0000000000000..44665365dc222 --- /dev/null +++ b/llvm/test/Transforms/SimplifyCFG/switch-umin.ll @@ -0,0 +1,246 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; RUN: opt -S -passes=simplifycfg < %s | FileCheck %s + +declare void @a() +declare void @b() +declare void @c() +declare void @d() + +define void @switch_replace_default(i32 %x) { +; CHECK-LABEL: define void @switch_replace_default( +; CHECK-SAME: i32 [[X:%.*]]) { +; CHECK-NEXT: [[MIN:%.*]] = call i32 @llvm.umin.i32(i32 [[X]], i32 3) +; CHECK-NEXT: switch i32 [[X]], label %[[COMMON_RET:.*]] [ +; CHECK-NEXT: i32 0, label %[[CASE0:.*]] +; CHECK-NEXT: i32 1, label %[[CASE1:.*]] +; CHECK-NEXT: i32 2, label %[[CASE2:.*]] +; CHECK-NEXT: ], !prof [[PROF0:![0-9]+]] +; CHECK: [[COMMON_RET]]: +; CHECK-NEXT: ret void +; CHECK: [[CASE0]]: +; CHECK-NEXT: call void @a() +; CHECK-NEXT: br label %[[COMMON_RET]] +; CHECK: [[CASE1]]: +; CHECK-NEXT: call void @b() +; CHECK-NEXT: br label %[[COMMON_RET]] +; CHECK: [[CASE2]]: +; CHECK-NEXT: call void @c() +; CHECK-NEXT: br label %[[COMMON_RET]] +; + %min = call i32 @llvm.umin.i32(i32 %x, i32 3) + switch i32 %min, label %unreachable [ + i32 0, label %case0 + i32 1, label %case1 + i32 2, label %case2 + i32 3, label %case3 + ], !prof !0 + +case0: + call void @a() + ret void + +case1: + call void @b() + ret void + +case2: + call void @c() + ret void + +case3: + ret void + +unreachable: + unreachable +} + +define void @switch_replace_default_and_remove_dead_cases(i32 %x) { +; CHECK-LABEL: define void @switch_replace_default_and_remove_dead_cases( +; CHECK-SAME: i32 [[X:%.*]]) { +; CHECK-NEXT: [[MIN:%.*]] = call i32 @llvm.umin.i32(i32 [[X]], i32 3) +; CHECK-NEXT: switch i32 [[X]], label %[[COMMON_RET:.*]] [ +; CHECK-NEXT: i32 2, label %[[CASE2:.*]] +; CHECK-NEXT: i32 1, label %[[CASE1:.*]] +; CHECK-NEXT: ] +; CHECK: [[COMMON_RET]]: +; CHECK-NEXT: ret void +; CHECK: [[CASE1]]: +; CHECK-NEXT: call void @b() +; CHECK-NEXT: br label %[[COMMON_RET]] +; CHECK: [[CASE2]]: +; CHECK-NEXT: call void @c() +; CHECK-NEXT: br label %[[COMMON_RET]] +; + %min = call i32 @llvm.umin.i32(i32 %x, i32 3) + switch i32 %min, label %unreachable [ + i32 4, label %case4 + i32 1, label %case1 + i32 2, label %case2 + i32 3, label %case3 + ] + +case4: + call void @a() + ret void + +case1: + call void @b() + ret void + +case2: + call void @c() + ret void + +case3: + ret void + +unreachable: + unreachable +} + +define void @switch_replace_default_when_holes(i32 %x) { +; CHECK-LABEL: define void @switch_replace_default_when_holes( +; CHECK-SAME: i32 [[X:%.*]]) { +; CHECK-NEXT: [[MIN:%.*]] = call i32 @llvm.umin.i32(i32 [[X]], i32 3) +; CHECK-NEXT: switch i32 [[X]], label %[[COMMON_RET:.*]] [ +; CHECK-NEXT: i32 1, label %[[CASE1:.*]] +; CHECK-NEXT: i32 2, label %[[CASE2:.*]] +; CHECK-NEXT: ] +; CHECK: [[COMMON_RET]]: +; CHECK-NEXT: ret void +; CHECK: [[CASE1]]: +; CHECK-NEXT: call void @b() +; CHECK-NEXT: br label %[[COMMON_RET]] +; CHECK: [[CASE2]]: +; CHECK-NEXT: call void @c() +; CHECK-NEXT: br label %[[COMMON_RET]] +; 
+ %min = call i32 @llvm.umin.i32(i32 %x, i32 3) + switch i32 %min, label %unreachable [ + i32 1, label %case1 + i32 2, label %case2 + i32 3, label %case3 + ] + +case1: + call void @b() + ret void + +case2: + call void @c() + ret void + +case3: + ret void + +unreachable: + unreachable +} + +define void @do_not_switch_replace_default(i32 %x, i32 %y) { +; CHECK-LABEL: define void @do_not_switch_replace_default( +; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) { +; CHECK-NEXT: [[MIN:%.*]] = call i32 @llvm.umin.i32(i32 [[X]], i32 [[Y]]) +; CHECK-NEXT: switch i32 [[MIN]], label %[[UNREACHABLE:.*]] [ +; CHECK-NEXT: i32 0, label %[[CASE0:.*]] +; CHECK-NEXT: i32 1, label %[[CASE1:.*]] +; CHECK-NEXT: i32 2, label %[[CASE2:.*]] +; CHECK-NEXT: i32 3, label %[[COMMON_RET:.*]] +; CHECK-NEXT: ] +; CHECK: [[COMMON_RET]]: +; CHECK-NEXT: ret void +; CHECK: [[CASE0]]: +; CHECK-NEXT: call void @a() +; CHECK-NEXT: br label %[[COMMON_RET]] +; CHECK: [[CASE1]]: +; CHECK-NEXT: call void @b() +; CHECK-NEXT: br label %[[COMMON_RET]] +; CHECK: [[CASE2]]: +; CHECK-NEXT: call void @c() +; CHECK-NEXT: br label %[[COMMON_RET]] +; CHECK: [[UNREACHABLE]]: +; CHECK-NEXT: unreachable +; + %min = call i32 @llvm.umin.i32(i32 %x, i32 %y) + switch i32 %min, label %unreachable [ + i32 0, label %case0 + i32 1, label %case1 + i32 2, label %case2 + i32 3, label %case3 + ] + +case0: + call void @a() + ret void + +case1: + call void @b() + ret void + +case2: + call void @c() + ret void + +case3: + ret void + +unreachable: + unreachable +} + +define void @do_not_replace_switch_default_but_remove_dead_cases(i32 %x) { +; CHECK-LABEL: define void @do_not_replace_switch_default_but_remove_dead_cases( +; CHECK-SAME: i32 [[X:%.*]]) { +; CHECK-NEXT: [[MIN:%.*]] = call i32 @llvm.umin.i32(i32 [[X]], i32 3) +; CHECK-NEXT: switch i32 [[MIN]], label %[[CASE0:.*]] [ +; CHECK-NEXT: i32 3, label %[[COMMON_RET:.*]] +; CHECK-NEXT: i32 1, label %[[CASE1:.*]] +; CHECK-NEXT: i32 2, label %[[CASE2:.*]] +; CHECK-NEXT: ] +; CHECK: [[COMMON_RET]]: +; CHECK-NEXT: ret void +; CHECK: [[CASE0]]: +; CHECK-NEXT: call void @a() +; CHECK-NEXT: br label %[[COMMON_RET]] +; CHECK: [[CASE1]]: +; CHECK-NEXT: call void @b() +; CHECK-NEXT: br label %[[COMMON_RET]] +; CHECK: [[CASE2]]: +; CHECK-NEXT: call void @c() +; CHECK-NEXT: br label %[[COMMON_RET]] +; + %min = call i32 @llvm.umin.i32(i32 %x, i32 3) + switch i32 %min, label %case0 [ ; default is reachable, therefore simplification not triggered + i32 0, label %case0 + i32 1, label %case1 + i32 2, label %case2 + i32 3, label %case3 + i32 4, label %case4 + ] + +case0: + call void @a() + ret void + +case1: + call void @b() + ret void + +case2: + call void @c() + ret void + +case3: + ret void + +case4: + call void @d() + ret void + +} + + +!0 = !{!"branch_weights", i32 1, i32 2, i32 3, i32 99, i32 5} +;. +; CHECK: [[PROF0]] = !{!"branch_weights", i32 5, i32 2, i32 3, i32 99} +;. 
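For a quick end-to-end view of the simplification above, here is a minimal before/after sketch in LLVM IR, distilled from the new tests; `@sink`, `@clamped`, and the block names are illustrative only, not part of the patch:

```
; Before: the default is unreachable, so every %x >= 3 is clamped by the
; umin onto case 3. simplifySwitchWhenUMin drops the umin, makes the
; constant's destination (%clamp) the new default, and switches on %x.
declare i32 @llvm.umin.i32(i32, i32)
declare void @sink(i32)

define void @clamped(i32 %x) {
entry:
  %min = call i32 @llvm.umin.i32(i32 %x, i32 3)
  switch i32 %min, label %unreachable [
    i32 0, label %zero
    i32 3, label %clamp
  ]

zero:
  call void @sink(i32 0)
  ret void

clamp:
  call void @sink(i32 3)
  ret void

unreachable:
  unreachable
}
```

After `-passes=simplifycfg` the switch keys on `%x` with `%clamp` as its default destination; later cleanups may reduce the remaining two-way switch further, for example to a compare and branch.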
From c4ac31dc52e016a413ac9e2fa98252c6ff643e3c Mon Sep 17 00:00:00 2001 From: Simon Pilgrim <llvm-dev@redking.me.uk> Date: Tue, 4 Nov 2025 10:48:49 +0000 Subject: [PATCH 143/313] [X86] narrowBitOpRMW - use reachesChainWithoutSideEffects instead of direct chain matching (#165870) This will allow us to match RMW load/store chains through TokenFactor nodes if there are additional loads in the chain before the store --- llvm/lib/Target/X86/X86ISelLowering.cpp | 40 +++-- llvm/test/CodeGen/X86/bittest-big-integer.ll | 152 ++++--------------- 2 files changed, 46 insertions(+), 146 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index d4a4d4339f7e1..6edf0185df813 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -53355,21 +53355,11 @@ static SDValue narrowBitOpRMW(StoreSDNode *St, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget) { using namespace SDPatternMatch; - - // Only handle normal stores and its chain was a matching normal load. - auto *Ld = dyn_cast<LoadSDNode>(St->getChain()); - if (!ISD::isNormalStore(St) || !St->isSimple() || !Ld || - !ISD::isNormalLoad(Ld) || !Ld->isSimple() || - Ld->getBasePtr() != St->getBasePtr() || - Ld->getOffset() != St->getOffset()) - return SDValue(); - - SDValue LoadVal(Ld, 0); SDValue StoredVal = St->getValue(); EVT VT = StoredVal.getValueType(); - // Only narrow larger than legal scalar integers. - if (!VT.isScalarInteger() || + // Only narrow normal stores of larger than legal scalar integers. + if (!ISD::isNormalStore(St) || !St->isSimple() || !VT.isScalarInteger() || VT.getSizeInBits() <= (Subtarget.is64Bit() ? 64 : 32)) return SDValue(); @@ -53378,18 +53368,26 @@ static SDValue narrowBitOpRMW(StoreSDNode *St, const SDLoc &DL, // BTC: X ^ (1 << ShAmt) // // BitInsert: (X & ~(1 << ShAmt)) | (InsertBit << ShAmt) - SDValue InsertBit, ShAmt; + SDValue SrcVal, InsertBit, ShAmt; if (!StoredVal.hasOneUse() || - !(sd_match(StoredVal, m_And(m_Specific(LoadVal), + !(sd_match(StoredVal, m_And(m_Value(SrcVal), m_Not(m_Shl(m_One(), m_Value(ShAmt))))) || sd_match(StoredVal, - m_Or(m_Specific(LoadVal), m_Shl(m_One(), m_Value(ShAmt)))) || + m_Or(m_Value(SrcVal), m_Shl(m_One(), m_Value(ShAmt)))) || sd_match(StoredVal, - m_Xor(m_Specific(LoadVal), m_Shl(m_One(), m_Value(ShAmt)))) || - sd_match(StoredVal, - m_Or(m_And(m_Specific(LoadVal), - m_Not(m_Shl(m_One(), m_Value(ShAmt)))), - m_Shl(m_Value(InsertBit), m_Deferred(ShAmt)))))) + m_Xor(m_Value(SrcVal), m_Shl(m_One(), m_Value(ShAmt)))) || + sd_match( + StoredVal, + m_Or(m_And(m_Value(SrcVal), m_Not(m_Shl(m_One(), m_Value(ShAmt)))), + m_Shl(m_Value(InsertBit), m_Deferred(ShAmt)))))) + return SDValue(); + + // SrcVal must be a matching normal load further up the chain. + auto *Ld = dyn_cast<LoadSDNode>(SrcVal); + if (!Ld || !ISD::isNormalLoad(Ld) || !Ld->isSimple() || + Ld->getBasePtr() != St->getBasePtr() || + Ld->getOffset() != St->getOffset() || + !St->getChain().reachesChainWithoutSideEffects(SDValue(Ld, 1))) return SDValue(); // Ensure the shift amount is in bounds. @@ -53423,7 +53421,7 @@ static SDValue narrowBitOpRMW(StoreSDNode *St, const SDLoc &DL, SDNodeFlags::NoUnsignedWrap); // Reconstruct the BTC/BTR/BTS pattern for the i32 block and store. 
- SDValue X = DAG.getNode(ISD::SRL, DL, VT, LoadVal, AlignAmt); + SDValue X = DAG.getNode(ISD::SRL, DL, VT, SrcVal, AlignAmt); X = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, X); SDValue Mask = DAG.getNode(ISD::SHL, DL, MVT::i32, diff --git a/llvm/test/CodeGen/X86/bittest-big-integer.ll b/llvm/test/CodeGen/X86/bittest-big-integer.ll index 87a54a0b9148d..c197a83835506 100644 --- a/llvm/test/CodeGen/X86/bittest-big-integer.ll +++ b/llvm/test/CodeGen/X86/bittest-big-integer.ll @@ -1029,144 +1029,46 @@ define i1 @complement_cmpz_i128(ptr %word, i32 %position) nounwind { define i32 @reset_multiload_i128(ptr %word, i32 %position, ptr %p) nounwind { ; X86-LABEL: reset_multiload_i128: ; X86: # %bb.0: -; X86-NEXT: pushl %ebp -; X86-NEXT: movl %esp, %ebp ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: andl $-16, %esp -; X86-NEXT: subl $64, %esp -; X86-NEXT: movl 12(%ebp), %ecx -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $1, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: shrb $3, %al -; X86-NEXT: andb $12, %al -; X86-NEXT: negb %al -; X86-NEXT: movsbl %al, %eax -; X86-NEXT: movl 40(%esp,%eax), %edx -; X86-NEXT: movl 44(%esp,%eax), %esi -; X86-NEXT: shldl %cl, %edx, %esi -; X86-NEXT: movl 32(%esp,%eax), %edi -; X86-NEXT: movl 36(%esp,%eax), %ebx -; X86-NEXT: shldl %cl, %ebx, %edx -; X86-NEXT: shldl %cl, %edi, %ebx -; X86-NEXT: notl %ebx -; X86-NEXT: movl 16(%ebp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl (%eax), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 8(%ebp), %eax -; X86-NEXT: andl %ebx, 4(%eax) -; X86-NEXT: shll %cl, %edi -; X86-NEXT: notl %edi -; X86-NEXT: movl %ecx, %ebx -; X86-NEXT: andl $96, %ebx -; X86-NEXT: shrl $3, %ebx -; X86-NEXT: movl (%eax,%ebx), %ebx -; X86-NEXT: andl %edi, (%eax) -; X86-NEXT: notl %esi -; X86-NEXT: andl %esi, 12(%eax) -; X86-NEXT: notl %edx -; X86-NEXT: andl %edx, 8(%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: btl %ecx, %ebx +; X86-NEXT: movl %edx, %esi +; X86-NEXT: andl $96, %esi +; X86-NEXT: shrl $3, %esi +; X86-NEXT: movl (%ecx,%esi), %edi +; X86-NEXT: movl %edi, %ebx +; X86-NEXT: btrl %edx, %ebx +; X86-NEXT: btl %edx, %edi +; X86-NEXT: movl %ebx, (%ecx,%esi) ; X86-NEXT: jae .LBB22_2 ; X86-NEXT: # %bb.1: ; X86-NEXT: xorl %eax, %eax ; X86-NEXT: .LBB22_2: -; X86-NEXT: leal -12(%ebp), %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx -; X86-NEXT: popl %ebp ; X86-NEXT: retl ; -; SSE-LABEL: reset_multiload_i128: -; SSE: # %bb.0: -; SSE-NEXT: movl %esi, %ecx -; SSE-NEXT: movl $1, %esi -; SSE-NEXT: xorl %r8d, %r8d -; SSE-NEXT: shldq %cl, %rsi, %r8 -; SSE-NEXT: xorl %eax, %eax -; SSE-NEXT: shlq %cl, %rsi -; SSE-NEXT: testb $64, %cl -; SSE-NEXT: cmovneq %rsi, %r8 -; SSE-NEXT: cmovneq %rax, %rsi -; SSE-NEXT: notq %r8 -; SSE-NEXT: notq %rsi -; SSE-NEXT: movl %ecx, %r9d -; SSE-NEXT: andl $96, %r9d -; SSE-NEXT: shrl $3, %r9d -; SSE-NEXT: movl (%rdi,%r9), %r9d -; SSE-NEXT: btl %ecx, %r9d -; SSE-NEXT: jb .LBB22_2 -; SSE-NEXT: # %bb.1: -; SSE-NEXT: movl (%rdx), %eax -; SSE-NEXT: .LBB22_2: -; SSE-NEXT: andq %rsi, (%rdi) -; SSE-NEXT: andq %r8, 8(%rdi) -; SSE-NEXT: # kill: def $eax killed $eax 
killed $rax -; SSE-NEXT: retq -; -; AVX2-LABEL: reset_multiload_i128: -; AVX2: # %bb.0: -; AVX2-NEXT: movl %esi, %ecx -; AVX2-NEXT: xorl %eax, %eax -; AVX2-NEXT: movl $1, %r8d -; AVX2-NEXT: xorl %esi, %esi -; AVX2-NEXT: shldq %cl, %r8, %rsi -; AVX2-NEXT: shlxq %rcx, %r8, %r8 -; AVX2-NEXT: testb $64, %cl -; AVX2-NEXT: cmovneq %r8, %rsi -; AVX2-NEXT: cmovneq %rax, %r8 -; AVX2-NEXT: notq %rsi -; AVX2-NEXT: notq %r8 -; AVX2-NEXT: movl %ecx, %r9d -; AVX2-NEXT: andl $96, %r9d -; AVX2-NEXT: shrl $3, %r9d -; AVX2-NEXT: movl (%rdi,%r9), %r9d -; AVX2-NEXT: btl %ecx, %r9d -; AVX2-NEXT: jb .LBB22_2 -; AVX2-NEXT: # %bb.1: -; AVX2-NEXT: movl (%rdx), %eax -; AVX2-NEXT: .LBB22_2: -; AVX2-NEXT: andq %r8, (%rdi) -; AVX2-NEXT: andq %rsi, 8(%rdi) -; AVX2-NEXT: # kill: def $eax killed $eax killed $rax -; AVX2-NEXT: retq -; -; AVX512-LABEL: reset_multiload_i128: -; AVX512: # %bb.0: -; AVX512-NEXT: movl %esi, %ecx -; AVX512-NEXT: movl $1, %r8d -; AVX512-NEXT: xorl %esi, %esi -; AVX512-NEXT: shldq %cl, %r8, %rsi -; AVX512-NEXT: xorl %eax, %eax -; AVX512-NEXT: shlxq %rcx, %r8, %r8 -; AVX512-NEXT: testb $64, %cl -; AVX512-NEXT: cmovneq %r8, %rsi -; AVX512-NEXT: cmovneq %rax, %r8 -; AVX512-NEXT: notq %rsi -; AVX512-NEXT: notq %r8 -; AVX512-NEXT: movl %ecx, %r9d -; AVX512-NEXT: andl $96, %r9d -; AVX512-NEXT: shrl $3, %r9d -; AVX512-NEXT: movl (%rdi,%r9), %r9d -; AVX512-NEXT: btl %ecx, %r9d -; AVX512-NEXT: jb .LBB22_2 -; AVX512-NEXT: # %bb.1: -; AVX512-NEXT: movl (%rdx), %eax -; AVX512-NEXT: .LBB22_2: -; AVX512-NEXT: andq %r8, (%rdi) -; AVX512-NEXT: andq %rsi, 8(%rdi) -; AVX512-NEXT: # kill: def $eax killed $eax killed $rax -; AVX512-NEXT: retq +; X64-LABEL: reset_multiload_i128: +; X64: # %bb.0: +; X64-NEXT: movl %esi, %ecx +; X64-NEXT: andl $96, %ecx +; X64-NEXT: shrl $3, %ecx +; X64-NEXT: movl (%rdi,%rcx), %r9d +; X64-NEXT: movl %r9d, %r8d +; X64-NEXT: btrl %esi, %r8d +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: btl %esi, %r9d +; X64-NEXT: jb .LBB22_2 +; X64-NEXT: # %bb.1: +; X64-NEXT: movl (%rdx), %eax +; X64-NEXT: .LBB22_2: +; X64-NEXT: movl %r8d, (%rdi,%rcx) +; X64-NEXT: retq %rem = and i32 %position, 127 %ofs = zext nneg i32 %rem to i128 %bit = shl nuw i128 1, %ofs From 85c78274358717e4d5d019a801decba5c1add484 Mon Sep 17 00:00:00 2001 From: Dan Blackwell <dan_blackwell@apple.com> Date: Tue, 4 Nov 2025 11:03:59 +0000 Subject: [PATCH 144/313] [ASan][Test-Only] Allow read of size 2 in strcmp.c test (#166179) Occasionally this test fails on Darwin due to `CHECK: READ of size 1` not matching the actual output `CHECK: READ of size 2`. This can happen when the memory before the string `s1` happens to match the first character of `s2`. This patch allows for `READ of size 2` to pass in order to account for the above circumstances. 
rdar://151317947 --- compiler-rt/test/asan/TestCases/strcmp.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/compiler-rt/test/asan/TestCases/strcmp.c b/compiler-rt/test/asan/TestCases/strcmp.c index 417bd491ebe02..2b31e64768c42 100644 --- a/compiler-rt/test/asan/TestCases/strcmp.c +++ b/compiler-rt/test/asan/TestCases/strcmp.c @@ -14,6 +14,8 @@ int main(int argc, char **argv) { assert(strcmp(s1 - 1, s2)); // CHECK: {{.*ERROR: AddressSanitizer: stack-buffer-underflow on address}} - // CHECK: READ of size 1 + // Very rarely `s1[-1]` happens to be '1', resulting in `strcmp` needing to + // check 2 bytes before failing, rather than 1 - this should still pass + // CHECK: READ of size {{[12]}} return 0; } From f8656ed4557500345ba29128d9ec85ef1a637240 Mon Sep 17 00:00:00 2001 From: Michael Buch <michaelbuch12@gmail.com> Date: Tue, 4 Nov 2025 11:37:56 +0000 Subject: [PATCH 145/313] [llvm][dwarfdump] Add --filter-child-tag option to filter by DWARF tags (#165720) This patch adds a new option `--filter-child-tag` (`-t` for short), which makes dwarfdump only dump children whose DWARF tag is in the list of tags specified by the user. Motivating examples are: * dumping all global variables in a CU * dumping all non-static data members of a structure * dumping all module import declarations of a CU * etc. Tags not known to dwarfdump simply never match any child DIE. Note, this flag only takes effect when `--show-children` is set (either explicitly or implicitly); when children are not dumped, the flag is ignored. Example: ``` $ builds/release/bin/llvm-dwarfdump -t DW_TAG_structure_type a.out.dSYM ... 0x0000000c: DW_TAG_compile_unit DW_AT_producer ("clang version 22.0.0git (git@github.com:Michael137/llvm-project.git 737da3347c2fb01dd403420cf83e9b8fbea32618)") DW_AT_language (DW_LANG_C11) ... 0x0000002a: DW_TAG_structure_type DW_AT_APPLE_block (true) DW_AT_byte_size (0x20) 0x00000067: DW_TAG_structure_type DW_AT_APPLE_block (true) DW_AT_name ("__block_descriptor") DW_AT_byte_size (0x10) ... ``` ``` $ builds/release/bin/llvm-dwarfdump -t DW_TAG_structure_type -t DW_TAG_member a.out.dSYM ... 0x0000000c: DW_TAG_compile_unit DW_AT_producer ("clang version 22.0.0git (git@github.com:Michael137/llvm-project.git 737da3347c2fb01dd403420cf83e9b8fbea32618)") DW_AT_language (DW_LANG_C11) DW_AT_name ("macro.c") ... 0x0000002a: DW_TAG_structure_type DW_AT_APPLE_block (true) DW_AT_byte_size (0x20) 0x0000002c: DW_TAG_member DW_AT_name ("__isa") DW_AT_type (0x00000051 "void *") DW_AT_data_member_location (0x00) 0x00000033: DW_TAG_member DW_AT_name ("__flags") DW_AT_type (0x00000052 "int") DW_AT_data_member_location (0x08) 0x0000003a: DW_TAG_member DW_AT_name ("__reserved") DW_AT_type (0x00000052 "int") DW_AT_data_member_location (0x0c) 0x00000041: DW_TAG_member DW_AT_name ("__FuncPtr") DW_AT_type (0x00000056 "void (*)(int)") DW_AT_data_member_location (0x10) 0x00000048: DW_TAG_member DW_AT_name ("__descriptor") DW_AT_type (0x00000062 "__block_descriptor *") DW_AT_alignment (8) DW_AT_data_member_location (0x18) 0x00000067: DW_TAG_structure_type DW_AT_APPLE_block (true) DW_AT_name ("__block_descriptor") DW_AT_byte_size (0x10) 0x0000006a: DW_TAG_member DW_AT_name ("reserved") DW_AT_type (0x00000079 "unsigned long") DW_AT_data_member_location (0x00) 0x00000071: DW_TAG_member DW_AT_name ("Size") DW_AT_type (0x00000079 "unsigned long") DW_AT_data_member_location (0x08) ...
``` --- llvm/docs/CommandGuide/llvm-dwarfdump.rst | 9 ++ llvm/include/llvm/DebugInfo/DIContext.h | 2 + llvm/lib/DebugInfo/DWARF/DWARFDie.cpp | 4 +- .../llvm-dwarfdump/X86/filter-child-tag.yaml | 136 ++++++++++++++++++ llvm/tools/llvm-dwarfdump/llvm-dwarfdump.cpp | 18 +++ 5 files changed, 168 insertions(+), 1 deletion(-) create mode 100644 llvm/test/tools/llvm-dwarfdump/X86/filter-child-tag.yaml diff --git a/llvm/docs/CommandGuide/llvm-dwarfdump.rst b/llvm/docs/CommandGuide/llvm-dwarfdump.rst index 137830259eb64..dfc0431f07826 100644 --- a/llvm/docs/CommandGuide/llvm-dwarfdump.rst +++ b/llvm/docs/CommandGuide/llvm-dwarfdump.rst @@ -134,6 +134,15 @@ OPTIONS Abbreviate the description of type unit entries. +.. option:: -t, --filter-child-tag + + Only dump children whose DWARF tag is one of the specified tags. + Example usage: + + .. code-block:: c + + llvm-dwarfdump -t DW_TAG_structure_type -t DW_TAG_member -c + .. option:: -x, --regex Treat any <name> strings as regular expressions when searching diff --git a/llvm/include/llvm/DebugInfo/DIContext.h b/llvm/include/llvm/DebugInfo/DIContext.h index e7e87bbfebf38..b404c92e71836 100644 --- a/llvm/include/llvm/DebugInfo/DIContext.h +++ b/llvm/include/llvm/DebugInfo/DIContext.h @@ -211,6 +211,8 @@ struct DIDumpOptions { bool ShowAggregateErrors = false; bool PrintRegisterOnly = false; std::string JsonErrSummaryFile; + /// List of DWARF tags to filter children by. + llvm::SmallVector<unsigned, 0> FilterChildTag; std::function<llvm::StringRef(uint64_t DwarfRegNum, bool IsEH)> GetNameForDWARFReg; diff --git a/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp b/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp index 6c78ef05e1b61..7496c5a084da4 100644 --- a/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp +++ b/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp @@ -704,7 +704,9 @@ void DWARFDie::dump(raw_ostream &OS, unsigned Indent, DIDumpOptions ChildDumpOpts = DumpOpts; ChildDumpOpts.ShowParents = false; while (Child) { - Child.dump(OS, Indent + 2, ChildDumpOpts); + if (DumpOpts.FilterChildTag.empty() || + llvm::is_contained(DumpOpts.FilterChildTag, Child.getTag())) + Child.dump(OS, Indent + 2, ChildDumpOpts); Child = Child.getSibling(); } } diff --git a/llvm/test/tools/llvm-dwarfdump/X86/filter-child-tag.yaml b/llvm/test/tools/llvm-dwarfdump/X86/filter-child-tag.yaml new file mode 100644 index 0000000000000..2a8c37da80e64 --- /dev/null +++ b/llvm/test/tools/llvm-dwarfdump/X86/filter-child-tag.yaml @@ -0,0 +1,136 @@ +## Tests the --filter-child-tag (-t) option. 
+ +# RUN: yaml2obj %s -o %t.o + +# RUN: llvm-dwarfdump %t.o --filter-child-tag=DW_TAG_structure_type | FileCheck %s --check-prefix=ONLY_STRUCT + +# ONLY_STRUCT: DW_TAG_compile_unit +# ONLY_STRUCT-NOT: DW_TAG_namespace +# ONLY_STRUCT-NOT: DW_TAG_structure_type + +# RUN: llvm-dwarfdump %t.o -t DW_TAG_structure_type -t DW_TAG_namespace | \ +# RUN: FileCheck %s --check-prefix=STRUCT_AND_NS --implicit-check-not=DW_TAG_subprogram --implicit-check-not=DW_TAG_member + +# STRUCT_AND_NS: DW_TAG_compile_unit +# STRUCT_AND_NS: DW_TAG_namespace +# STRUCT_AND_NS: DW_TAG_structure_type +# STRUCT_AND_NS: DW_TAG_structure_type + +# RUN: llvm-dwarfdump %t.o -c --name=Foo -t DW_TAG_member | \ +# RUN: FileCheck %s --check-prefix=FOO_MEM --implicit-check-not=DW_TAG_compile_unit --implicit-check-not=DW_TAG_subprogram --implicit-check-not=DW_TAG_namespace + +# FOO_MEM: DW_TAG_structure_type +# FOO_MEM: DW_TAG_member +# FOO_MEM: DW_TAG_member +# FOO_MEM: DW_TAG_member +# FOO_MEM-NOT: DW_TAG_structure_type +# FOO_MEM-NOT: DW_TAG_member + +# RUN: llvm-dwarfdump %t.o -c --name=Foo -t not_a_tag -t DW_TAG_member | \ +# RUN: FileCheck %s --check-prefix=SINGLE_INVALID_TAG --implicit-check-not=DW_TAG_compile_unit --implicit-check-not=DW_TAG_subprogram --implicit-check-not=DW_TAG_namespace + +# SINGLE_INVALID_TAG: DW_TAG_structure_type +# SINGLE_INVALID_TAG: DW_TAG_member +# SINGLE_INVALID_TAG: DW_TAG_member +# SINGLE_INVALID_TAG: DW_TAG_member +# SINGLE_INVALID_TAG-NOT: DW_TAG_structure_type +# SINGLE_INVALID_TAG-NOT: DW_TAG_member + +# RUN: llvm-dwarfdump %t.o -c --name=Foo -t not_a_tag | \ +# RUN: FileCheck %s --check-prefix=ONLY_INVALID_TAGS --implicit-check-not=DW_TAG_compile_unit --implicit-check-not=DW_TAG_subprogram --implicit-check-not=DW_TAG_namespace --implicit-check-not=DW_TAG_member + +# ONLY_INVALID_TAGS: DW_TAG_structure_type +# ONLY_INVALID_TAGS-NOT: DW_TAG_structure_type + +# RUN: llvm-dwarfdump %t.o -c -p --name=Foo -t DW_TAG_member | \ +# RUN: FileCheck %s --check-prefix=FOO_MEM_WITH_PARENT --implicit-check-not=DW_TAG_subprogram + +# FOO_MEM_WITH_PARENT: DW_TAG_compile_unit +# FOO_MEM_WITH_PARENT: DW_TAG_namespace +# FOO_MEM_WITH_PARENT: DW_TAG_structure_type +# FOO_MEM_WITH_PARENT: DW_TAG_member +# FOO_MEM_WITH_PARENT: DW_TAG_member +# FOO_MEM_WITH_PARENT: DW_TAG_member +# FOO_MEM_WITH_PARENT-NOT: DW_TAG_structure_type +# FOO_MEM_WITH_PARENT-NOT: DW_TAG_member + +## Not specifying --show-children ignores the --filter-child-tag option. 
+# RUN: llvm-dwarfdump %t.o --name=Foo -t DW_TAG_member 2>&1 | FileCheck %s --check-prefix=NO_SHOW_CHILDREN + +# NO_SHOW_CHILDREN: DW_TAG_structure_type + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_EXEC + Machine: EM_X86_64 +DWARF: + debug_abbrev: + - Table: + - Tag: DW_TAG_compile_unit + Children: DW_CHILDREN_yes + Attributes: + - Attribute: DW_AT_producer + Form: DW_FORM_string + - Tag: DW_TAG_namespace + Children: DW_CHILDREN_yes + Attributes: + - Attribute: DW_AT_name + Form: DW_FORM_string + - Tag: DW_TAG_structure_type + Children: DW_CHILDREN_yes + Attributes: + - Attribute: DW_AT_name + Form: DW_FORM_string + - Tag: DW_TAG_member + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_name + Form: DW_FORM_string + - Tag: DW_TAG_subprogram + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_name + Form: DW_FORM_string + debug_info: + - Version: 5 + UnitType: DW_UT_compile + Entries: + - AbbrCode: 1 + Values: + - CStr: handwritten + - AbbrCode: 2 + Values: + - CStr: ns + - AbbrCode: 3 + Values: + - CStr: Foo + - AbbrCode: 4 + Values: + - CStr: mem1 + - AbbrCode: 4 + Values: + - CStr: mem2 + - AbbrCode: 4 + Values: + - CStr: mem3 + - AbbrCode: 3 + Values: + - CStr: NestedInFoo + - AbbrCode: 4 + Values: + - CStr: NestedMem1 + - AbbrCode: 4 + Values: + - CStr: NestedMem2 + - AbbrCode: 5 + Values: + - CStr: NestedFunc + - AbbrCode: 0x0 + - AbbrCode: 5 + Values: + - CStr: FooFunc + - AbbrCode: 0x0 + - AbbrCode: 0x0 + - AbbrCode: 0x0 diff --git a/llvm/tools/llvm-dwarfdump/llvm-dwarfdump.cpp b/llvm/tools/llvm-dwarfdump/llvm-dwarfdump.cpp index 11eb58ea911df..6f120f93700f6 100644 --- a/llvm/tools/llvm-dwarfdump/llvm-dwarfdump.cpp +++ b/llvm/tools/llvm-dwarfdump/llvm-dwarfdump.cpp @@ -14,6 +14,7 @@ #include "llvm/ADT/MapVector.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/SmallVectorExtras.h" #include "llvm/ADT/StringSet.h" #include "llvm/DebugInfo/DIContext.h" #include "llvm/DebugInfo/DWARF/DWARFAcceleratorTable.h" @@ -242,6 +243,15 @@ static opt<bool> cat(DwarfDumpCategory)); static alias ShowParentsAlias("p", desc("Alias for --show-parents."), aliasopt(ShowParents), cl::NotHidden); + +static list<std::string> FilterChildTag( + "filter-child-tag", + desc("When --show-children is specified, show only DIEs with the " + "specified DWARF tags."), + value_desc("list of DWARF tags"), cat(DwarfDumpCategory)); +static alias FilterChildTagAlias("t", desc("Alias for --filter-child-tag."), + aliasopt(FilterChildTag), cl::NotHidden); + static opt<bool> ShowForm("show-form", desc("Show DWARF form types after the DWARF attribute types."), @@ -330,6 +340,13 @@ static cl::extrahelp /// @} //===----------------------------------------------------------------------===// +static llvm::SmallVector<unsigned> +makeTagVector(const list<std::string> &TagStrings) { + return llvm::map_to_vector(TagStrings, [](const std::string &Tag) { + return llvm::dwarf::getTag(Tag); + }); +} + static void error(Error Err) { if (!Err) return; @@ -356,6 +373,7 @@ static DIDumpOptions getDumpOpts(DWARFContext &C) { DumpOpts.ShowAddresses = !Diff; DumpOpts.ShowChildren = ShowChildren; DumpOpts.ShowParents = ShowParents; + DumpOpts.FilterChildTag = makeTagVector(FilterChildTag); DumpOpts.ShowForm = ShowForm; DumpOpts.SummarizeTypes = SummarizeTypes; DumpOpts.Verbose = Verbose; From c80faaefe10fcf3bac1c41b78e038d8d7533dd71 Mon Sep 17 00:00:00 2001 From: Michael Buch <michaelbuch12@gmail.com> Date: Tue, 4 Nov 2025 11:33:11 +0000 Subject: [PATCH 
146/313] [lldb][test] registration-unique.test: don't hardcode the order of recognizer registration These might get registered in a different order on different platforms. Use `CHECK-DAG` to account for that. Attempts to fix the failure seen on the x86 Linux bots: ``` ******************** TEST 'lldb-shell :: Recognizer/registration-unique.test' FAILED ******************** Exit Code: 1 Command Output (stdout): -- RUN: at line 4 lit-file C:\Users\tcwg\llvm-worker\lldb-aarch64-windows\llvm-project\lldb\test\Shell\Recognizer\registration-unique.test C:\Users\tcwg\llvm-worker\lldb-aarch64-windows\build\tools\lldb\test\Shell\Recognizer\Output\registration-unique.test.tmp executed command: split-file 'C:\Users\tcwg\llvm-worker\lldb-aarch64-windows\llvm-project\lldb\test\Shell\Recognizer\registration-unique.test' 'C:\Users\tcwg\llvm-worker\lldb-aarch64-windows\build\tools\lldb\test\Shell\Recognizer\Output\registration-unique.test.tmp' note: command had no output on stdout or stderr RUN: at line 6 \users\tcwg\llvm-worker\lldb-aarch64-windows\build\bin\clang.exe --target=specify-a-target-or-use-a-_host-substitution --target=aarch64-pc-windows-msvc -fmodules-cache-path=C:/Users/tcwg/llvm-worker/lldb-aarch64-windows/build/lldb-test-build.noindex/module-cache-clang\lldb-shell C:\Users\tcwg\llvm-worker\lldb-aarch64-windows\build\tools\lldb\test\Shell\Recognizer\Output\registration-unique.test.tmp/main.cpp -g -o C:\Users\tcwg\llvm-worker\lldb-aarch64-windows\build\tools\lldb\test\Shell\Recognizer\Output\registration-unique.test.tmp/cpp.out executed command: 'c:\users\tcwg\llvm-worker\lldb-aarch64-windows\build\bin\clang.exe' --target=specify-a-target-or-use-a-_host-substitution --target=aarch64-pc-windows-msvc '-fmodules-cache-path=C:/Users/tcwg/llvm-worker/lldb-aarch64-windows/build/lldb-test-build.noindex/module-cache-clang\lldb-shell' 'C:\Users\tcwg\llvm-worker\lldb-aarch64-windows\build\tools\lldb\test\Shell\Recognizer\Output\registration-unique.test.tmp/main.cpp' -g -o 'C:\Users\tcwg\llvm-worker\lldb-aarch64-windows\build\tools\lldb\test\Shell\Recognizer\Output\registration-unique.test.tmp/cpp.out' .---command stderr------------ | clang: warning: argument unused during compilation: '-fmodules-cache-path=C:/Users/tcwg/llvm-worker/lldb-aarch64-windows/build/lldb-test-build.noindex/module-cache-clang\lldb-shell' [-Wunused-command-line-argument] `----------------------------- RUN: at line 7 \users\tcwg\llvm-worker\lldb-aarch64-windows\build\bin\lldb.exe --no-lldbinit -S C:/Users/tcwg/llvm-worker/lldb-aarch64-windows/build/tools/lldb\test\Shell\lit-lldb-init-quiet -b -s C:\Users\tcwg\llvm-worker\lldb-aarch64-windows\build\tools\lldb\test\Shell\Recognizer\Output\registration-unique.test.tmp/commands.input C:\Users\tcwg\llvm-worker\lldb-aarch64-windows\build\tools\lldb\test\Shell\Recognizer\Output\registration-unique.test.tmp/cpp.out | c:\users\tcwg\llvm-worker\lldb-aarch64-windows\build\bin\filecheck.exe C:\Users\tcwg\llvm-worker\lldb-aarch64-windows\llvm-project\lldb\test\Shell\Recognizer\registration-unique.test executed command: 'c:\users\tcwg\llvm-worker\lldb-aarch64-windows\build\bin\lldb.exe' --no-lldbinit -S 'C:/Users/tcwg/llvm-worker/lldb-aarch64-windows/build/tools/lldb\test\Shell\lit-lldb-init-quiet' -b -s 'C:\Users\tcwg\llvm-worker\lldb-aarch64-windows\build\tools\lldb\test\Shell\Recognizer\Output\registration-unique.test.tmp/commands.input' 'C:\Users\tcwg\llvm-worker\lldb-aarch64-windows\build\tools\lldb\test\Shell\Recognizer\Output\registration-unique.test.tmp/cpp.out' note: command had no 
output on stdout or stderr executed command: 'c:\users\tcwg\llvm-worker\lldb-aarch64-windows\build\bin\filecheck.exe' 'C:\Users\tcwg\llvm-worker\lldb-aarch64-windows\llvm-project\lldb\test\Shell\Recognizer\registration-unique.test' .---command stderr------------ | C:\Users\tcwg\llvm-worker\lldb-aarch64-windows\llvm-project\lldb\test\Shell\Recognizer\registration-unique.test:45:10: error: CHECK: expected string not found in input | # CHECK: Assert StackFrame Recognizer | ^ | <stdin>:20:38: note: scanning from here | 1: Verbose Trap StackFrame Recognizer, demangled symbol regex ^__clang_trap_msg | ^ | <stdin>:34:10: note: possible intended match here | 3: Verbose Trap StackFrame Recognizer, demangled symbol regex ^__clang_trap_msg | ^ | | Input file: <stdin> | Check file: C:\Users\tcwg\llvm-worker\lldb-aarch64-windows\llvm-project\lldb\test\Shell\Recognizer\registration-unique.test | | -dump-input=help explains the following input dump. | | Input was: | <<<<<< ``` --- lldb/test/Shell/Recognizer/registration-unique.test | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/lldb/test/Shell/Recognizer/registration-unique.test b/lldb/test/Shell/Recognizer/registration-unique.test index f6052bb57c940..bc1e4a6ea235b 100644 --- a/lldb/test/Shell/Recognizer/registration-unique.test +++ b/lldb/test/Shell/Recognizer/registration-unique.test @@ -41,14 +41,14 @@ frame recognizer list # CHECK-NEXT: no matching results found. # CHECK: frame recognizer list -# CHECK: Verbose Trap StackFrame Recognizer -# CHECK: Assert StackFrame Recognizer +# CHECK-DAG: Verbose Trap StackFrame Recognizer +# CHECK-DAG: Assert StackFrame Recognizer # CHECK-NOT: Verbose Trap StackFrame Recognizer # CHECK-NOT: Assert StackFrame Recognizer # FIXME: avoid duplicate frame recognizers in the target: https://github.com/llvm/llvm-project/issues/166341 # CHECK: frame recognizer list -# CHECK: Verbose Trap StackFrame Recognizer -# CHECK: Assert StackFrame Recognizer -# CHECK: Verbose Trap StackFrame Recognizer -# CHECK: Assert StackFrame Recognizer +# CHECK-DAG: Verbose Trap StackFrame Recognizer +# CHECK-DAG: Assert StackFrame Recognizer +# CHECK-DAG: Verbose Trap StackFrame Recognizer +# CHECK-DAG: Assert StackFrame Recognizer From 0c653514cc2f7e1005fadc330a3f3f904a5a7a52 Mon Sep 17 00:00:00 2001 From: Pablo Antonio Martinez <pamartin@amd.com> Date: Tue, 4 Nov 2025 12:47:18 +0100 Subject: [PATCH 147/313] [mlir][rocdl] Add GlobalLoadAsyncToLDS operation (#165374) Adds `global.load.async.to.lds` op to rocdl, supporting `b8`, `b32`, `b64` and `b128`. The op is lowered to the appropriate `llvm.amdgcn.global.load.async.to.lds.bXX` intrinsic. 
This is available on gfx1250+ --- mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td | 32 ++++++++++++++++++++ mlir/test/Dialect/LLVMIR/rocdl.mlir | 13 ++++++++ mlir/test/Target/LLVMIR/rocdl.mlir | 13 ++++++++ 3 files changed, 58 insertions(+) diff --git a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td index 5241f9a6f2b43..921fdf36a59b0 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td @@ -692,6 +692,38 @@ def ROCDL_GlobalLoadLDSOp : }]; } +//===---------------------------------------------------------------------===// +// Async load to LDS intrinsic (available in GFX1250) +//===---------------------------------------------------------------------===// + +foreach bitsVal = [8, 32, 64, 128] in { + defvar bitsStr = "b" # !cast<string>(bitsVal); + def ROCDL_GlobalLoadAsyncToLDS # !toupper(bitsStr) # Op : + ROCDL_IntrOp<"global.load.async.to.lds." # bitsStr, [], [], [], 0, 0, 1, 0, [2, 3], ["offset", "aux"]> { + dag args = (ins Arg<ROCDLGlobalBuffer, "", [MemRead]>:$globalPtr, + Arg<ROCDLBufferLDS, "", [MemWrite]>:$ldsPtr, + I32Attr:$offset, + I32Attr:$aux); + let arguments = !con(args, baseArgs); + let assemblyFormat = [{ + $globalPtr `,` $ldsPtr `,` $offset `,` $aux + attr-dict `:` type($globalPtr) `,` type($ldsPtr) + }]; + let description = [{ + Asynchronously loads }] # !cast<string>(bitsVal) # [{ bits of data from a global memory pointer + to a Local Data Share (LDS) pointer. + + Available on gfx1250+. + }]; + + let extraClassDefinition = [{ + ::llvm::SmallVector<::mlir::Value> $cppClass::getAccessedOperands() { + return {getGlobalPtr(), getLdsPtr()}; + } + }]; + } +} + //===---------------------------------------------------------------------===// // Tensor load/store intrinsics (available in GFX1250) //===---------------------------------------------------------------------===// diff --git a/mlir/test/Dialect/LLVMIR/rocdl.mlir b/mlir/test/Dialect/LLVMIR/rocdl.mlir index e703600c71c8e..5e857599b65ea 100644 --- a/mlir/test/Dialect/LLVMIR/rocdl.mlir +++ b/mlir/test/Dialect/LLVMIR/rocdl.mlir @@ -664,6 +664,19 @@ llvm.func @rocdl.global.load.lds(%src : !llvm.ptr<1>, %dst: !llvm.ptr<3>) { llvm.return } +llvm.func @rocdl.global.load.async.to.lds(%src : !llvm.ptr<1>, %dst: !llvm.ptr<3>) { + // CHECK-LABEL @rocdl.global.load.async.to.lds + // CHECK: rocdl.global.load.async.to.lds.b8 %{{.*}}, %{{.*}}, 0, 0 + // CHECK: rocdl.global.load.async.to.lds.b32 %{{.*}}, %{{.*}}, 0, 0 + // CHECK: rocdl.global.load.async.to.lds.b64 %{{.*}}, %{{.*}}, 0, 0 + // CHECK: rocdl.global.load.async.to.lds.b128 %{{.*}}, %{{.*}}, 0, 0 + rocdl.global.load.async.to.lds.b8 %src, %dst, 0, 0 : <1>, <3> + rocdl.global.load.async.to.lds.b32 %src, %dst, 0, 0 : <1>, <3> + rocdl.global.load.async.to.lds.b64 %src, %dst, 0, 0 : <1>, <3> + rocdl.global.load.async.to.lds.b128 %src, %dst, 0, 0 : <1>, <3> + llvm.return +} + // CHECK-LABEL @rocdl.tensor.load.to.lds llvm.func @rocdl.tensor.load.to.lds(%dgroup0 : vector<4xi32>, %dgroup1 : vector<8xi32>, %dgroup2 : vector<4xi32>, %dgroup3 : vector<4xi32>) { diff --git a/mlir/test/Target/LLVMIR/rocdl.mlir b/mlir/test/Target/LLVMIR/rocdl.mlir index 8a848221a50dd..3fbd9e0567948 100644 --- a/mlir/test/Target/LLVMIR/rocdl.mlir +++ b/mlir/test/Target/LLVMIR/rocdl.mlir @@ -1040,6 +1040,19 @@ llvm.func @rocdl.global.load.lds(%src : !llvm.ptr<1>, %dst: !llvm.ptr<3>) { llvm.return } +// CHECK-LABEL: rocdl.global.load.async.to.lds +llvm.func @rocdl.global.load.async.to.lds(%src : !llvm.ptr<1>, %dst: 
!llvm.ptr<3>) { + // CHECK: call void @llvm.amdgcn.global.load.async.to.lds.b8 + rocdl.global.load.async.to.lds.b8 %src, %dst, 0, 0 : !llvm.ptr<1>, !llvm.ptr<3> + // CHECK: call void @llvm.amdgcn.global.load.async.to.lds.b32 + rocdl.global.load.async.to.lds.b32 %src, %dst, 0, 0 : !llvm.ptr<1>, !llvm.ptr<3> + // CHECK: call void @llvm.amdgcn.global.load.async.to.lds.b64 + rocdl.global.load.async.to.lds.b64 %src, %dst, 0, 0 : !llvm.ptr<1>, !llvm.ptr<3> + // CHECK: call void @llvm.amdgcn.global.load.async.to.lds.b128 + rocdl.global.load.async.to.lds.b128 %src, %dst, 0, 0 : !llvm.ptr<1>, !llvm.ptr<3> + llvm.return +} + // CHECK-LABEL: rocdl.tensor.load.to.lds llvm.func @rocdl.tensor.load.to.lds(%dgroup0 : vector<4xi32>, %dgroup1 : vector<8xi32>, %dgroup2 : vector<4xi32>, %dgroup3 : vector<4xi32>) { From 99a1fcad5de62d849cc5ac31a8b3412ead5b5518 Mon Sep 17 00:00:00 2001 From: Jay Foad <jay.foad@amd.com> Date: Tue, 4 Nov 2025 11:59:43 +0000 Subject: [PATCH 148/313] [UTC] Update AMDGPU asm regexp for private functions (#166169) Since #163011 changed AMDGPU to use ELF mangling, the regexp failed to match private functions because of the inconsistent presence/absence of the .L prefix on the first line of the function e.g.: ``` .Lfoo: ; @foo ``` --- llvm/test/CodeGen/AMDGPU/private-function.ll | 16 ++++++++++++++++ llvm/utils/UpdateTestChecks/asm.py | 4 ++-- 2 files changed, 18 insertions(+), 2 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/private-function.ll diff --git a/llvm/test/CodeGen/AMDGPU/private-function.ll b/llvm/test/CodeGen/AMDGPU/private-function.ll new file mode 100644 index 0000000000000..8eefc9dfc5d7e --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/private-function.ll @@ -0,0 +1,16 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck %s + +define private void @foo() { +; CHECK-LABEL: foo: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_wait_loadcnt_dscnt 0x0 +; CHECK-NEXT: s_wait_expcnt 0x0 +; CHECK-NEXT: s_wait_samplecnt 0x0 +; CHECK-NEXT: s_wait_bvhcnt 0x0 +; CHECK-NEXT: s_wait_kmcnt 0x0 +; CHECK-NEXT: s_setpc_b64 s[30:31] + ret void +} + +@var = global ptr @foo diff --git a/llvm/utils/UpdateTestChecks/asm.py b/llvm/utils/UpdateTestChecks/asm.py index 61f0d679f806d..82377862885c8 100644 --- a/llvm/utils/UpdateTestChecks/asm.py +++ b/llvm/utils/UpdateTestChecks/asm.py @@ -51,9 +51,9 @@ class string: ) ASM_FUNCTION_AMDGPU_RE = re.compile( - r"\.type\s+_?(?P<func>[^,\n]+),@function\n" + r"\.type\s+(_|\.L)?(?P<func>[^,\n]+),@function\n" r"(^\s*\.amdgpu_hsa_kernel (?P=func)\n)?" 
- r'^_?(?P=func):(?:[ \t]*;+[ \t]*@"?(?P=func)"?)?\n' + r'^(_|\.L)?(?P=func):(?:[ \t]*;+[ \t]*@"?(?P=func)"?)?\n' r"(?P<body>.*?)\n" # (body of the function) # This list is incomplete r"^\s*(\.Lfunc_end[0-9]+:\n|\.section)", From f59beca459481554aef0d069f4ef4a0a66880593 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim <llvm-dev@redking.me.uk> Date: Tue, 4 Nov 2025 12:00:24 +0000 Subject: [PATCH 149/313] [X86] bittest-big-integer.ll - add test showing dependent BTC/BT/BTS sequence on same i128 all with unknown bit indices (#166351) --- llvm/test/CodeGen/X86/bittest-big-integer.ll | 234 +++++++++++++++++++ 1 file changed, 234 insertions(+) diff --git a/llvm/test/CodeGen/X86/bittest-big-integer.ll b/llvm/test/CodeGen/X86/bittest-big-integer.ll index c197a83835506..040ae65a33251 100644 --- a/llvm/test/CodeGen/X86/bittest-big-integer.ll +++ b/llvm/test/CodeGen/X86/bittest-big-integer.ll @@ -1082,3 +1082,237 @@ define i32 @reset_multiload_i128(ptr %word, i32 %position, ptr %p) nounwind { %ret = select i1 %cmp, i32 %sel, i32 0 ret i32 %ret } + +; BTC/BT/BTS sequence on same i128 +define i1 @sequence_i128(ptr %word, i32 %pos0, i32 %pos1, i32 %pos2) nounwind { +; X86-LABEL: sequence_i128: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $144, %esp +; X86-NEXT: movb 20(%ebp), %ch +; X86-NEXT: movb 12(%ebp), %cl +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $1, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: shrb $3, %al +; X86-NEXT: andb $12, %al +; X86-NEXT: negb %al +; X86-NEXT: movsbl %al, %eax +; X86-NEXT: movl 56(%esp,%eax), %edx +; X86-NEXT: movl 60(%esp,%eax), %esi +; X86-NEXT: shldl %cl, %edx, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 48(%esp,%eax), %edi +; X86-NEXT: movl 52(%esp,%eax), %ebx +; X86-NEXT: shldl %cl, %ebx, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %edi, %ebx +; X86-NEXT: shll %cl, %edi +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $1, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movb %ch, %al +; X86-NEXT: shrb $3, %al +; X86-NEXT: andb $12, %al +; X86-NEXT: negb %al +; X86-NEXT: movsbl %al, %eax +; X86-NEXT: movl 84(%esp,%eax), %edx +; X86-NEXT: movl 88(%esp,%eax), %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movzbl 20(%ebp), %ecx +; X86-NEXT: shldl %cl, %edx, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 80(%esp,%eax), %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 92(%esp,%eax), %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: shldl %cl, %esi, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: movl %esi, %eax +; X86-NEXT: shll %cl, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, 
%esi, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: xorl 8(%eax), %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: xorl 12(%eax), %esi +; X86-NEXT: xorl (%eax), %edi +; X86-NEXT: xorl 4(%eax), %ebx +; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl 16(%ebp), %eax +; X86-NEXT: # kill: def $al killed $al killed $eax +; X86-NEXT: andb $96, %al +; X86-NEXT: shrb $3, %al +; X86-NEXT: movzbl %al, %eax +; X86-NEXT: movl 96(%esp,%eax), %eax +; X86-NEXT: movl 16(%ebp), %ecx +; X86-NEXT: btl %ecx, %eax +; X86-NEXT: setae %al +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-NEXT: movl 8(%ebp), %ecx +; X86-NEXT: movl %edx, 8(%ecx) +; X86-NEXT: movl %esi, 12(%ecx) +; X86-NEXT: movl %edi, (%ecx) +; X86-NEXT: movl %ebx, 4(%ecx) +; X86-NEXT: leal -12(%ebp), %esp +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp +; X86-NEXT: retl +; +; SSE-LABEL: sequence_i128: +; SSE: # %bb.0: +; SSE-NEXT: movl %ecx, %eax +; SSE-NEXT: movl %esi, %ecx +; SSE-NEXT: movl $1, %r8d +; SSE-NEXT: xorl %esi, %esi +; SSE-NEXT: shldq %cl, %r8, %rsi +; SSE-NEXT: movl $1, %r9d +; SSE-NEXT: shlq %cl, %r9 +; SSE-NEXT: xorl %r11d, %r11d +; SSE-NEXT: testb $64, %cl +; SSE-NEXT: cmovneq %r9, %rsi +; SSE-NEXT: cmovneq %r11, %r9 +; SSE-NEXT: xorl %r10d, %r10d +; SSE-NEXT: movl %eax, %ecx +; SSE-NEXT: shldq %cl, %r8, %r10 +; SSE-NEXT: shlq %cl, %r8 +; SSE-NEXT: testb $64, %al +; SSE-NEXT: cmovneq %r8, %r10 +; SSE-NEXT: cmovneq %r11, %r8 +; SSE-NEXT: xorq 8(%rdi), %rsi +; SSE-NEXT: xorq (%rdi), %r9 +; SSE-NEXT: movl %edx, %ecx +; SSE-NEXT: andb $32, %cl +; SSE-NEXT: movq %r9, %rax +; SSE-NEXT: shrdq %cl, %rsi, %rax +; SSE-NEXT: movq %rsi, %r11 +; SSE-NEXT: shrq %cl, %r11 +; SSE-NEXT: testb $64, %dl +; SSE-NEXT: cmoveq %rax, %r11 +; SSE-NEXT: btl %edx, %r11d +; SSE-NEXT: setae %al +; SSE-NEXT: orq %r10, %rsi +; SSE-NEXT: orq %r8, %r9 +; SSE-NEXT: movq %r9, (%rdi) +; SSE-NEXT: movq %rsi, 8(%rdi) +; SSE-NEXT: retq +; +; AVX2-LABEL: sequence_i128: +; AVX2: # %bb.0: +; AVX2-NEXT: movl %ecx, %eax +; AVX2-NEXT: movl %esi, %ecx +; AVX2-NEXT: xorl %r9d, %r9d +; AVX2-NEXT: movl $1, %r10d +; AVX2-NEXT: xorl %esi, %esi +; AVX2-NEXT: shldq %cl, %r10, %rsi +; AVX2-NEXT: shlxq %rcx, %r10, %r8 +; AVX2-NEXT: testb $64, %cl +; AVX2-NEXT: cmovneq %r8, %rsi +; AVX2-NEXT: cmovneq %r9, %r8 +; AVX2-NEXT: xorl %r11d, %r11d +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: shldq %cl, %r10, %r11 +; AVX2-NEXT: shlxq %rax, %r10, %r10 +; AVX2-NEXT: testb $64, %al +; AVX2-NEXT: cmovneq %r10, %r11 +; AVX2-NEXT: cmovneq %r9, %r10 +; AVX2-NEXT: xorq 8(%rdi), %rsi +; AVX2-NEXT: xorq (%rdi), %r8 +; AVX2-NEXT: movl %edx, %ecx +; AVX2-NEXT: andb $32, %cl +; AVX2-NEXT: movq %r8, %rax +; AVX2-NEXT: shrdq %cl, %rsi, %rax +; AVX2-NEXT: shrxq %rcx, %rsi, %rcx +; AVX2-NEXT: testb $64, %dl +; AVX2-NEXT: cmoveq %rax, %rcx +; AVX2-NEXT: btl %edx, %ecx +; AVX2-NEXT: 
setae %al +; AVX2-NEXT: orq %r11, %rsi +; AVX2-NEXT: orq %r10, %r8 +; AVX2-NEXT: movq %r8, (%rdi) +; AVX2-NEXT: movq %rsi, 8(%rdi) +; AVX2-NEXT: retq +; +; AVX512-LABEL: sequence_i128: +; AVX512: # %bb.0: +; AVX512-NEXT: movl %ecx, %eax +; AVX512-NEXT: movl %esi, %ecx +; AVX512-NEXT: movl $1, %r9d +; AVX512-NEXT: xorl %esi, %esi +; AVX512-NEXT: shldq %cl, %r9, %rsi +; AVX512-NEXT: xorl %r10d, %r10d +; AVX512-NEXT: shlxq %rcx, %r9, %r8 +; AVX512-NEXT: testb $64, %cl +; AVX512-NEXT: cmovneq %r8, %rsi +; AVX512-NEXT: cmovneq %r10, %r8 +; AVX512-NEXT: xorl %r11d, %r11d +; AVX512-NEXT: movl %eax, %ecx +; AVX512-NEXT: shldq %cl, %r9, %r11 +; AVX512-NEXT: shlxq %rax, %r9, %r9 +; AVX512-NEXT: testb $64, %al +; AVX512-NEXT: cmovneq %r9, %r11 +; AVX512-NEXT: cmovneq %r10, %r9 +; AVX512-NEXT: xorq 8(%rdi), %rsi +; AVX512-NEXT: xorq (%rdi), %r8 +; AVX512-NEXT: movl %edx, %ecx +; AVX512-NEXT: andb $32, %cl +; AVX512-NEXT: movq %r8, %rax +; AVX512-NEXT: shrdq %cl, %rsi, %rax +; AVX512-NEXT: shrxq %rcx, %rsi, %rcx +; AVX512-NEXT: testb $64, %dl +; AVX512-NEXT: cmoveq %rax, %rcx +; AVX512-NEXT: btl %edx, %ecx +; AVX512-NEXT: setae %al +; AVX512-NEXT: orq %r11, %rsi +; AVX512-NEXT: orq %r9, %r8 +; AVX512-NEXT: movq %r8, (%rdi) +; AVX512-NEXT: movq %rsi, 8(%rdi) +; AVX512-NEXT: retq + %rem0 = and i32 %pos0, 127 + %rem1 = and i32 %pos1, 127 + %rem2 = and i32 %pos2, 127 + %ofs0 = zext nneg i32 %rem0 to i128 + %ofs1 = zext nneg i32 %rem1 to i128 + %ofs2 = zext nneg i32 %rem2 to i128 + %bit0 = shl nuw i128 1, %ofs0 + %bit1 = shl nuw i128 1, %ofs1 + %bit2 = shl nuw i128 1, %ofs2 + %ld = load i128, ptr %word + %res0 = xor i128 %ld, %bit0 + %test1 = and i128 %res0, %bit1 + %cmp1 = icmp eq i128 %test1, 0 + %res2 = or i128 %res0, %bit2 + store i128 %res2, ptr %word + ret i1 %cmp1 +} From f037f413506af9e32898e102f391175a3a2852ef Mon Sep 17 00:00:00 2001 From: Jay Foad <jay.foad@amd.com> Date: Tue, 4 Nov 2025 12:00:44 +0000 Subject: [PATCH 150/313] [IR] Add new function attribute nocreateundeforpoison (#164809) Also add a corresponding intrinsic property that can be used to mark intrinsics that do not introduce poison, for example simple arithmetic intrinsics that propagate poison just like a simple arithmetic instruction. As a smoke test this patch adds the new property to llvm.amdgcn.fmul.legacy. 
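
For illustration, a minimal sketch of how the new attribute surfaces in emitted IR; the declaration below is only an example, but the attribute list mirrors the updated expectation in clang/test/CodeGen/builtin-sqrt.c from this patch:

```llvm
; llvm.sqrt propagates poison from its operand but never creates a fresh
; undef/poison value itself, so after this change it carries
; nocreateundeforpoison alongside the existing default intrinsic attributes.
declare float @llvm.sqrt.f32(float) #0

attributes #0 = { nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none) }
```

Analyses can then treat such a call like a plain arithmetic instruction when reasoning about poison, which is what the simplified ValueTracking logic in this patch relies on.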
--- clang/test/CodeGen/X86/math-builtins.c | 322 +++++++++--------- clang/test/CodeGen/builtin-sqrt.c | 2 +- clang/test/CodeGen/libcalls.c | 20 +- clang/test/CodeGen/math-libcalls.c | 232 ++++++------- .../cl20-device-side-enqueue-attributes.cl | 2 +- llvm/docs/LangRef.rst | 5 + llvm/include/llvm/Bitcode/LLVMBitCodes.h | 1 + llvm/include/llvm/IR/Attributes.td | 5 + llvm/include/llvm/IR/Intrinsics.td | 85 +++-- llvm/lib/Analysis/ValueTracking.cpp | 76 +---- llvm/lib/Bitcode/Reader/BitcodeReader.cpp | 2 + llvm/lib/Bitcode/Writer/BitcodeWriter.cpp | 2 + llvm/lib/Transforms/Utils/CodeExtractor.cpp | 1 + llvm/test/Bitcode/attributes.ll | 6 + .../AArch64/replace-with-veclib-armpl.ll | 3 +- .../AMDGPU/amdgpu-simplify-libcall-rootn.ll | 2 +- .../replace-with-veclib-sleef-scalable.ll | 2 +- llvm/test/Feature/intrinsics.ll | 5 +- llvm/test/Linker/drop-attribute.ll | 2 +- llvm/test/Transforms/Attributor/nofree.ll | 6 +- llvm/test/Transforms/Attributor/nosync.ll | 2 +- llvm/test/Transforms/Attributor/willreturn.ll | 16 +- .../EarlyCSE/replace-calls-def-attrs.ll | 2 +- llvm/test/Transforms/LoopIdiom/basic.ll | 2 +- .../AArch64/expand-exp.ll | 2 +- .../Transforms/SimplifyCFG/rangereduce.ll | 2 +- .../remove-attributes-from-intrinsics.ll | 2 +- .../TableGen/Basic/CodeGenIntrinsics.cpp | 2 + llvm/utils/TableGen/Basic/CodeGenIntrinsics.h | 3 + .../utils/TableGen/Basic/IntrinsicEmitter.cpp | 8 +- .../test/Target/LLVMIR/llvmir-intrinsics.mlir | 2 +- 31 files changed, 406 insertions(+), 418 deletions(-) diff --git a/clang/test/CodeGen/X86/math-builtins.c b/clang/test/CodeGen/X86/math-builtins.c index a56f8ba1ee385..c7cd9ffdd6966 100644 --- a/clang/test/CodeGen/X86/math-builtins.c +++ b/clang/test/CodeGen/X86/math-builtins.c @@ -118,36 +118,36 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) { __builtin_copysign(f,f); __builtin_copysignf(f,f); __builtin_copysignl(f,f); __builtin_copysignf128(f,f); -// NO__ERRNO: declare double @llvm.copysign.f64(double, double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare float @llvm.copysign.f32(float, float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare x86_fp80 @llvm.copysign.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare fp128 @llvm.copysign.f128(fp128, fp128) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare double @llvm.copysign.f64(double, double) [[READNONE_INTRINSIC:#[0-9]+]] -// HAS_ERRNO: declare float @llvm.copysign.f32(float, float) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare x86_fp80 @llvm.copysign.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare fp128 @llvm.copysign.f128(fp128, fp128) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double @llvm.copysign.f64(double, double) [[READNONE_INTRINSIC2:#[0-9]+]] +// NO__ERRNO: declare float @llvm.copysign.f32(float, float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 @llvm.copysign.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare fp128 @llvm.copysign.f128(fp128, fp128) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare double @llvm.copysign.f64(double, double) [[READNONE_INTRINSIC2:#[0-9]+]] +// HAS_ERRNO: declare float @llvm.copysign.f32(float, float) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare x86_fp80 @llvm.copysign.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare fp128 @llvm.copysign.f128(fp128, fp128) [[READNONE_INTRINSIC2]] __builtin_fabs(f); __builtin_fabsf(f); __builtin_fabsl(f); __builtin_fabsf128(f); -// NO__ERRNO: declare double @llvm.fabs.f64(double) [[READNONE_INTRINSIC]] -// 
NO__ERRNO: declare float @llvm.fabs.f32(float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare x86_fp80 @llvm.fabs.f80(x86_fp80) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare fp128 @llvm.fabs.f128(fp128) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare double @llvm.fabs.f64(double) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare float @llvm.fabs.f32(float) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare x86_fp80 @llvm.fabs.f80(x86_fp80) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare fp128 @llvm.fabs.f128(fp128) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double @llvm.fabs.f64(double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare float @llvm.fabs.f32(float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 @llvm.fabs.f80(x86_fp80) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare fp128 @llvm.fabs.f128(fp128) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare double @llvm.fabs.f64(double) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare float @llvm.fabs.f32(float) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare x86_fp80 @llvm.fabs.f80(x86_fp80) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare fp128 @llvm.fabs.f128(fp128) [[READNONE_INTRINSIC2]] __builtin_frexp(f,i); __builtin_frexpf(f,i); __builtin_frexpl(f,i); __builtin_frexpf128(f,i); -// NO__ERRNO: declare { double, i32 } @llvm.frexp.f64.i32(double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare { float, i32 } @llvm.frexp.f32.i32(float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare { x86_fp80, i32 } @llvm.frexp.f80.i32(x86_fp80) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare { fp128, i32 } @llvm.frexp.f128.i32(fp128) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare { double, i32 } @llvm.frexp.f64.i32(double) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare { float, i32 } @llvm.frexp.f32.i32(float) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare { x86_fp80, i32 } @llvm.frexp.f80.i32(x86_fp80) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare { fp128, i32 } @llvm.frexp.f128.i32(fp128) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare { double, i32 } @llvm.frexp.f64.i32(double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare { float, i32 } @llvm.frexp.f32.i32(float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare { x86_fp80, i32 } @llvm.frexp.f80.i32(x86_fp80) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare { fp128, i32 } @llvm.frexp.f128.i32(fp128) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare { double, i32 } @llvm.frexp.f64.i32(double) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare { float, i32 } @llvm.frexp.f32.i32(float) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare { x86_fp80, i32 } @llvm.frexp.f80.i32(x86_fp80) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare { fp128, i32 } @llvm.frexp.f128.i32(fp128) [[READNONE_INTRINSIC2]] __builtin_huge_val(); __builtin_huge_valf(); __builtin_huge_vall(); __builtin_huge_valf128(); @@ -165,10 +165,10 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) { __builtin_ldexp(f,f); __builtin_ldexpf(f,f); __builtin_ldexpl(f,f); __builtin_ldexpf128(f,f); -// NO__ERRNO: declare double @llvm.ldexp.f64.i32(double, i32) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare float @llvm.ldexp.f32.i32(float, i32) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare x86_fp80 @llvm.ldexp.f80.i32(x86_fp80, i32) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare fp128 @llvm.ldexp.f128.i32(fp128, i32) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double @llvm.ldexp.f64.i32(double, i32) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare float @llvm.ldexp.f32.i32(float, i32) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 
@llvm.ldexp.f80.i32(x86_fp80, i32) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare fp128 @llvm.ldexp.f128.i32(fp128, i32) [[READNONE_INTRINSIC2]] // HAS_ERRNO: declare double @ldexp(double noundef, i32 noundef) [[NOT_READNONE]] // HAS_ERRNO: declare float @ldexpf(float noundef, i32 noundef) [[NOT_READNONE]] // HAS_ERRNO: declare x86_fp80 @ldexpl(x86_fp80 noundef, i32 noundef) [[NOT_READNONE]] @@ -180,7 +180,7 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) { // NO__ERRNO: declare { float, float } @llvm.modf.f32(float) [[READNONE_INTRINSIC]] // NO__ERRNO: declare { x86_fp80, x86_fp80 } @llvm.modf.f80(x86_fp80) [[READNONE_INTRINSIC]] // NO__ERRNO: declare fp128 @modff128(fp128 noundef, ptr noundef) [[NOT_READNONE:#[0-9]+]] -// HAS_ERRNO: declare { double, double } @llvm.modf.f64(double) [[READNONE_INTRINSIC]] +// HAS_ERRNO: declare { double, double } @llvm.modf.f64(double) [[READNONE_INTRINSIC:#[0-9]+]] // HAS_ERRNO: declare { float, float } @llvm.modf.f32(float) [[READNONE_INTRINSIC]] // HAS_ERRNO: declare { x86_fp80, x86_fp80 } @llvm.modf.f80(x86_fp80) [[READNONE_INTRINSIC]] // HAS_ERRNO: declare fp128 @modff128(fp128 noundef, ptr noundef) [[NOT_READNONE]] @@ -209,10 +209,10 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) { __builtin_pow(f,f); __builtin_powf(f,f); __builtin_powl(f,f); __builtin_powf128(f,f); -// NO__ERRNO: declare double @llvm.pow.f64(double, double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare float @llvm.pow.f32(float, float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare x86_fp80 @llvm.pow.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare fp128 @llvm.pow.f128(fp128, fp128) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double @llvm.pow.f64(double, double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare float @llvm.pow.f32(float, float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 @llvm.pow.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare fp128 @llvm.pow.f128(fp128, fp128) [[READNONE_INTRINSIC2]] // HAS_ERRNO: declare double @pow(double noundef, double noundef) [[NOT_READNONE]] // HAS_ERRNO: declare float @powf(float noundef, float noundef) [[NOT_READNONE]] // HAS_ERRNO: declare x86_fp80 @powl(x86_fp80 noundef, x86_fp80 noundef) [[NOT_READNONE]] @@ -220,12 +220,12 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) { __builtin_powi(f,f); __builtin_powif(f,f); __builtin_powil(f,f); -// NO__ERRNO: declare double @llvm.powi.f64.i32(double, i32) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare float @llvm.powi.f32.i32(float, i32) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare x86_fp80 @llvm.powi.f80.i32(x86_fp80, i32) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare double @llvm.powi.f64.i32(double, i32) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare float @llvm.powi.f32.i32(float, i32) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare x86_fp80 @llvm.powi.f80.i32(x86_fp80, i32) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double @llvm.powi.f64.i32(double, i32) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare float @llvm.powi.f32.i32(float, i32) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 @llvm.powi.f80.i32(x86_fp80, i32) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare double @llvm.powi.f64.i32(double, i32) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare float @llvm.powi.f32.i32(float, i32) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare x86_fp80 @llvm.powi.f80.i32(x86_fp80, i32) [[READNONE_INTRINSIC2]] /* math */ __builtin_acos(f); 
__builtin_acosf(f); __builtin_acosl(f); __builtin_acosf128(f); @@ -307,21 +307,21 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) { __builtin_ceil(f); __builtin_ceilf(f); __builtin_ceill(f); __builtin_ceilf128(f); -// NO__ERRNO: declare double @llvm.ceil.f64(double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare float @llvm.ceil.f32(float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare x86_fp80 @llvm.ceil.f80(x86_fp80) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare fp128 @llvm.ceil.f128(fp128) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare double @llvm.ceil.f64(double) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare float @llvm.ceil.f32(float) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare x86_fp80 @llvm.ceil.f80(x86_fp80) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare fp128 @llvm.ceil.f128(fp128) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double @llvm.ceil.f64(double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare float @llvm.ceil.f32(float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 @llvm.ceil.f80(x86_fp80) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare fp128 @llvm.ceil.f128(fp128) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare double @llvm.ceil.f64(double) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare float @llvm.ceil.f32(float) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare x86_fp80 @llvm.ceil.f80(x86_fp80) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare fp128 @llvm.ceil.f128(fp128) [[READNONE_INTRINSIC2]] __builtin_cos(f); __builtin_cosf(f); __builtin_cosl(f); __builtin_cosf128(f); -// NO__ERRNO: declare double @llvm.cos.f64(double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare float @llvm.cos.f32(float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare x86_fp80 @llvm.cos.f80(x86_fp80) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare fp128 @llvm.cos.f128(fp128) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double @llvm.cos.f64(double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare float @llvm.cos.f32(float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 @llvm.cos.f80(x86_fp80) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare fp128 @llvm.cos.f128(fp128) [[READNONE_INTRINSIC2]] // HAS_ERRNO: declare double @cos(double noundef) [[NOT_READNONE]] // HAS_ERRNO: declare float @cosf(float noundef) [[NOT_READNONE]] // HAS_ERRNO: declare x86_fp80 @cosl(x86_fp80 noundef) [[NOT_READNONE]] @@ -362,10 +362,10 @@ __builtin_erfc(f); __builtin_erfcf(f); __builtin_erfcl(f); __builtin_ __builtin_exp(f); __builtin_expf(f); __builtin_expl(f); __builtin_expf128(f); -// NO__ERRNO: declare double @llvm.exp.f64(double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare float @llvm.exp.f32(float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare x86_fp80 @llvm.exp.f80(x86_fp80) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare fp128 @llvm.exp.f128(fp128) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double @llvm.exp.f64(double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare float @llvm.exp.f32(float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 @llvm.exp.f80(x86_fp80) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare fp128 @llvm.exp.f128(fp128) [[READNONE_INTRINSIC2]] // HAS_ERRNO: declare double @exp(double noundef) [[NOT_READNONE]] // HAS_ERRNO: declare float @expf(float noundef) [[NOT_READNONE]] // HAS_ERRNO: declare x86_fp80 @expl(x86_fp80 noundef) [[NOT_READNONE]] @@ -373,10 +373,10 @@ __builtin_exp(f); __builtin_expf(f); __builtin_expl(f); __builtin_e __builtin_exp2(f); __builtin_exp2f(f); __builtin_exp2l(f); __builtin_exp2f128(f); -// NO__ERRNO: declare double 
@llvm.exp2.f64(double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare float @llvm.exp2.f32(float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare x86_fp80 @llvm.exp2.f80(x86_fp80) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare fp128 @llvm.exp2.f128(fp128) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double @llvm.exp2.f64(double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare float @llvm.exp2.f32(float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 @llvm.exp2.f80(x86_fp80) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare fp128 @llvm.exp2.f128(fp128) [[READNONE_INTRINSIC2]] // HAS_ERRNO: declare double @exp2(double noundef) [[NOT_READNONE]] // HAS_ERRNO: declare float @exp2f(float noundef) [[NOT_READNONE]] // HAS_ERRNO: declare x86_fp80 @exp2l(x86_fp80 noundef) [[NOT_READNONE]] @@ -384,10 +384,10 @@ __builtin_exp2(f); __builtin_exp2f(f); __builtin_exp2l(f); __builtin_ __builtin_exp10(f); __builtin_exp10f(f); __builtin_exp10l(f); __builtin_exp10f128(f); -// NO__ERRNO: declare double @llvm.exp10.f64(double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare float @llvm.exp10.f32(float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare x86_fp80 @llvm.exp10.f80(x86_fp80) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare fp128 @llvm.exp10.f128(fp128) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double @llvm.exp10.f64(double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare float @llvm.exp10.f32(float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 @llvm.exp10.f80(x86_fp80) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare fp128 @llvm.exp10.f128(fp128) [[READNONE_INTRINSIC2]] // HAS_ERRNO: declare double @exp10(double noundef) [[NOT_READNONE]] // HAS_ERRNO: declare float @exp10f(float noundef) [[NOT_READNONE]] // HAS_ERRNO: declare x86_fp80 @exp10l(x86_fp80 noundef) [[NOT_READNONE]] @@ -417,22 +417,22 @@ __builtin_fdim(f,f); __builtin_fdimf(f,f); __builtin_fdiml(f,f); __bu __builtin_floor(f); __builtin_floorf(f); __builtin_floorl(f); __builtin_floorf128(f); -// NO__ERRNO: declare double @llvm.floor.f64(double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare float @llvm.floor.f32(float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare x86_fp80 @llvm.floor.f80(x86_fp80) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare fp128 @llvm.floor.f128(fp128) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare double @llvm.floor.f64(double) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare float @llvm.floor.f32(float) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare x86_fp80 @llvm.floor.f80(x86_fp80) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare fp128 @llvm.floor.f128(fp128) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double @llvm.floor.f64(double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare float @llvm.floor.f32(float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 @llvm.floor.f80(x86_fp80) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare fp128 @llvm.floor.f128(fp128) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare double @llvm.floor.f64(double) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare float @llvm.floor.f32(float) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare x86_fp80 @llvm.floor.f80(x86_fp80) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare fp128 @llvm.floor.f128(fp128) [[READNONE_INTRINSIC2]] __builtin_fma(f,f,f); __builtin_fmaf(f,f,f); __builtin_fmal(f,f,f); __builtin_fmaf128(f,f,f); __builtin_fmaf16(f,f,f); -// NO__ERRNO: declare double @llvm.fma.f64(double, double, double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare float @llvm.fma.f32(float, float, float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare x86_fp80 
@llvm.fma.f80(x86_fp80, x86_fp80, x86_fp80) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare fp128 @llvm.fma.f128(fp128, fp128, fp128) [[READNONE_INTRINSIC]] -// NO__ERRONO: declare half @llvm.fma.f16(half, half, half) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double @llvm.fma.f64(double, double, double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare float @llvm.fma.f32(float, float, float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 @llvm.fma.f80(x86_fp80, x86_fp80, x86_fp80) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare fp128 @llvm.fma.f128(fp128, fp128, fp128) [[READNONE_INTRINSIC2]] +// NO__ERRONO: declare half @llvm.fma.f16(half, half, half) [[READNONE_INTRINSIC2]] // HAS_ERRNO: declare double @fma(double noundef, double noundef, double noundef) [[NOT_READNONE]] // HAS_ERRNO: declare float @fmaf(float noundef, float noundef, float noundef) [[NOT_READNONE]] // HAS_ERRNO: declare x86_fp80 @fmal(x86_fp80 noundef, x86_fp80 noundef, x86_fp80 noundef) [[NOT_READNONE]] @@ -454,25 +454,25 @@ __builtin_fma(f,f,f); __builtin_fmaf(f,f,f); __builtin_fmal(f,f,f); __builtin_fmax(f,f); __builtin_fmaxf(f,f); __builtin_fmaxl(f,f); __builtin_fmaxf128(f,f); -// NO__ERRNO: declare double @llvm.maxnum.f64(double, double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare float @llvm.maxnum.f32(float, float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare x86_fp80 @llvm.maxnum.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare fp128 @llvm.maxnum.f128(fp128, fp128) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare double @llvm.maxnum.f64(double, double) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare float @llvm.maxnum.f32(float, float) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare x86_fp80 @llvm.maxnum.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare fp128 @llvm.maxnum.f128(fp128, fp128) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double @llvm.maxnum.f64(double, double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare float @llvm.maxnum.f32(float, float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 @llvm.maxnum.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare fp128 @llvm.maxnum.f128(fp128, fp128) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare double @llvm.maxnum.f64(double, double) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare float @llvm.maxnum.f32(float, float) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare x86_fp80 @llvm.maxnum.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare fp128 @llvm.maxnum.f128(fp128, fp128) [[READNONE_INTRINSIC2]] __builtin_fmin(f,f); __builtin_fminf(f,f); __builtin_fminl(f,f); __builtin_fminf128(f,f); -// NO__ERRNO: declare double @llvm.minnum.f64(double, double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare float @llvm.minnum.f32(float, float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare x86_fp80 @llvm.minnum.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare fp128 @llvm.minnum.f128(fp128, fp128) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare double @llvm.minnum.f64(double, double) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare float @llvm.minnum.f32(float, float) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare x86_fp80 @llvm.minnum.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare fp128 @llvm.minnum.f128(fp128, fp128) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double @llvm.minnum.f64(double, double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare float @llvm.minnum.f32(float, float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 
@llvm.minnum.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare fp128 @llvm.minnum.f128(fp128, fp128) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare double @llvm.minnum.f64(double, double) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare float @llvm.minnum.f32(float, float) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare x86_fp80 @llvm.minnum.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare fp128 @llvm.minnum.f128(fp128, fp128) [[READNONE_INTRINSIC2]] __builtin_hypot(f,f); __builtin_hypotf(f,f); __builtin_hypotl(f,f); __builtin_hypotf128(f,f); @@ -509,10 +509,10 @@ __builtin_lgamma(f); __builtin_lgammaf(f); __builtin_lgammal(f); __builti __builtin_llrint(f); __builtin_llrintf(f); __builtin_llrintl(f); __builtin_llrintf128(f); -// NO__ERRNO: declare i64 @llvm.llrint.i64.f64(double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare i64 @llvm.llrint.i64.f32(float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare i64 @llvm.llrint.i64.f80(x86_fp80) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare i64 @llvm.llrint.i64.f128(fp128) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare i64 @llvm.llrint.i64.f64(double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare i64 @llvm.llrint.i64.f32(float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare i64 @llvm.llrint.i64.f80(x86_fp80) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare i64 @llvm.llrint.i64.f128(fp128) [[READNONE_INTRINSIC2]] // HAS_ERRNO: declare i64 @llrint(double noundef) [[NOT_READNONE]] // HAS_ERRNO: declare i64 @llrintf(float noundef) [[NOT_READNONE]] // HAS_ERRNO: declare i64 @llrintl(x86_fp80 noundef) [[NOT_READNONE]] @@ -520,10 +520,10 @@ __builtin_llrint(f); __builtin_llrintf(f); __builtin_llrintl(f); __builti __builtin_llround(f); __builtin_llroundf(f); __builtin_llroundl(f); __builtin_llroundf128(f); -// NO__ERRNO: declare i64 @llvm.llround.i64.f64(double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare i64 @llvm.llround.i64.f32(float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare i64 @llvm.llround.i64.f80(x86_fp80) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare i64 @llvm.llround.i64.f128(fp128) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare i64 @llvm.llround.i64.f64(double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare i64 @llvm.llround.i64.f32(float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare i64 @llvm.llround.i64.f80(x86_fp80) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare i64 @llvm.llround.i64.f128(fp128) [[READNONE_INTRINSIC2]] // HAS_ERRNO: declare i64 @llround(double noundef) [[NOT_READNONE]] // HAS_ERRNO: declare i64 @llroundf(float noundef) [[NOT_READNONE]] // HAS_ERRNO: declare i64 @llroundl(x86_fp80 noundef) [[NOT_READNONE]] @@ -531,10 +531,10 @@ __builtin_llround(f); __builtin_llroundf(f); __builtin_llroundl(f); __built __builtin_log(f); __builtin_logf(f); __builtin_logl(f); __builtin_logf128(f); -// NO__ERRNO: declare double @llvm.log.f64(double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare float @llvm.log.f32(float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare x86_fp80 @llvm.log.f80(x86_fp80) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare fp128 @llvm.log.f128(fp128) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double @llvm.log.f64(double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare float @llvm.log.f32(float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 @llvm.log.f80(x86_fp80) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare fp128 @llvm.log.f128(fp128) [[READNONE_INTRINSIC2]] // HAS_ERRNO: declare double @log(double noundef) [[NOT_READNONE]] // HAS_ERRNO: declare float @logf(float 
noundef) [[NOT_READNONE]] // HAS_ERRNO: declare x86_fp80 @logl(x86_fp80 noundef) [[NOT_READNONE]] @@ -542,10 +542,10 @@ __builtin_log(f); __builtin_logf(f); __builtin_logl(f); __builtin_l __builtin_log10(f); __builtin_log10f(f); __builtin_log10l(f); __builtin_log10f128(f); -// NO__ERRNO: declare double @llvm.log10.f64(double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare float @llvm.log10.f32(float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare x86_fp80 @llvm.log10.f80(x86_fp80) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare fp128 @llvm.log10.f128(fp128) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double @llvm.log10.f64(double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare float @llvm.log10.f32(float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 @llvm.log10.f80(x86_fp80) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare fp128 @llvm.log10.f128(fp128) [[READNONE_INTRINSIC2]] // HAS_ERRNO: declare double @log10(double noundef) [[NOT_READNONE]] // HAS_ERRNO: declare float @log10f(float noundef) [[NOT_READNONE]] // HAS_ERRNO: declare x86_fp80 @log10l(x86_fp80 noundef) [[NOT_READNONE]] @@ -564,10 +564,10 @@ __builtin_log1p(f); __builtin_log1pf(f); __builtin_log1pl(f); __builtin __builtin_log2(f); __builtin_log2f(f); __builtin_log2l(f); __builtin_log2f128(f); -// NO__ERRNO: declare double @llvm.log2.f64(double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare float @llvm.log2.f32(float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare x86_fp80 @llvm.log2.f80(x86_fp80) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare fp128 @llvm.log2.f128(fp128) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double @llvm.log2.f64(double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare float @llvm.log2.f32(float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 @llvm.log2.f80(x86_fp80) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare fp128 @llvm.log2.f128(fp128) [[READNONE_INTRINSIC2]] // HAS_ERRNO: declare double @log2(double noundef) [[NOT_READNONE]] // HAS_ERRNO: declare float @log2f(float noundef) [[NOT_READNONE]] // HAS_ERRNO: declare x86_fp80 @log2l(x86_fp80 noundef) [[NOT_READNONE]] @@ -586,10 +586,10 @@ __builtin_logb(f); __builtin_logbf(f); __builtin_logbl(f); __builtin_ __builtin_lrint(f); __builtin_lrintf(f); __builtin_lrintl(f); __builtin_lrintf128(f); -// NO__ERRNO: declare i64 @llvm.lrint.i64.f64(double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare i64 @llvm.lrint.i64.f32(float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare i64 @llvm.lrint.i64.f80(x86_fp80) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare i64 @llvm.lrint.i64.f128(fp128) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare i64 @llvm.lrint.i64.f64(double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare i64 @llvm.lrint.i64.f32(float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare i64 @llvm.lrint.i64.f80(x86_fp80) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare i64 @llvm.lrint.i64.f128(fp128) [[READNONE_INTRINSIC2]] // HAS_ERRNO: declare i64 @lrint(double noundef) [[NOT_READNONE]] // HAS_ERRNO: declare i64 @lrintf(float noundef) [[NOT_READNONE]] // HAS_ERRNO: declare i64 @lrintl(x86_fp80 noundef) [[NOT_READNONE]] @@ -597,10 +597,10 @@ __builtin_lrint(f); __builtin_lrintf(f); __builtin_lrintl(f); __builtin __builtin_lround(f); __builtin_lroundf(f); __builtin_lroundl(f); __builtin_lroundf128(f); -// NO__ERRNO: declare i64 @llvm.lround.i64.f64(double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare i64 @llvm.lround.i64.f32(float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare i64 @llvm.lround.i64.f80(x86_fp80) [[READNONE_INTRINSIC]] -// NO__ERRNO: 
declare i64 @llvm.lround.i64.f128(fp128) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare i64 @llvm.lround.i64.f64(double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare i64 @llvm.lround.i64.f32(float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare i64 @llvm.lround.i64.f80(x86_fp80) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare i64 @llvm.lround.i64.f128(fp128) [[READNONE_INTRINSIC2]] // HAS_ERRNO: declare i64 @lround(double noundef) [[NOT_READNONE]] // HAS_ERRNO: declare i64 @lroundf(float noundef) [[NOT_READNONE]] // HAS_ERRNO: declare i64 @lroundl(x86_fp80 noundef) [[NOT_READNONE]] @@ -608,14 +608,14 @@ __builtin_lround(f); __builtin_lroundf(f); __builtin_lroundl(f); __built __builtin_nearbyint(f); __builtin_nearbyintf(f); __builtin_nearbyintl(f); __builtin_nearbyintf128(f); -// NO__ERRNO: declare double @llvm.nearbyint.f64(double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare float @llvm.nearbyint.f32(float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare x86_fp80 @llvm.nearbyint.f80(x86_fp80) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare fp128 @llvm.nearbyint.f128(fp128) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare double @llvm.nearbyint.f64(double) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare float @llvm.nearbyint.f32(float) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare x86_fp80 @llvm.nearbyint.f80(x86_fp80) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare fp128 @llvm.nearbyint.f128(fp128) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double @llvm.nearbyint.f64(double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare float @llvm.nearbyint.f32(float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 @llvm.nearbyint.f80(x86_fp80) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare fp128 @llvm.nearbyint.f128(fp128) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare double @llvm.nearbyint.f64(double) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare float @llvm.nearbyint.f32(float) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare x86_fp80 @llvm.nearbyint.f80(x86_fp80) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare fp128 @llvm.nearbyint.f128(fp128) [[READNONE_INTRINSIC2]] __builtin_nextafter(f,f); __builtin_nextafterf(f,f); __builtin_nextafterl(f,f); __builtin_nextafterf128(f,f); @@ -663,25 +663,25 @@ __builtin_remquo(f,f,i); __builtin_remquof(f,f,i); __builtin_remquol(f,f,i); __ __builtin_rint(f); __builtin_rintf(f); __builtin_rintl(f); __builtin_rintf128(f); -// NO__ERRNO: declare double @llvm.rint.f64(double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare float @llvm.rint.f32(float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare x86_fp80 @llvm.rint.f80(x86_fp80) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare fp128 @llvm.rint.f128(fp128) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare double @llvm.rint.f64(double) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare float @llvm.rint.f32(float) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare x86_fp80 @llvm.rint.f80(x86_fp80) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare fp128 @llvm.rint.f128(fp128) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double @llvm.rint.f64(double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare float @llvm.rint.f32(float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 @llvm.rint.f80(x86_fp80) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare fp128 @llvm.rint.f128(fp128) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare double @llvm.rint.f64(double) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare float @llvm.rint.f32(float) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare x86_fp80 @llvm.rint.f80(x86_fp80) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: 
declare fp128 @llvm.rint.f128(fp128) [[READNONE_INTRINSIC2]] __builtin_round(f); __builtin_roundf(f); __builtin_roundl(f); __builtin_roundf128(f); -// NO__ERRNO: declare double @llvm.round.f64(double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare float @llvm.round.f32(float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare x86_fp80 @llvm.round.f80(x86_fp80) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare fp128 @llvm.round.f128(fp128) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare double @llvm.round.f64(double) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare float @llvm.round.f32(float) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare x86_fp80 @llvm.round.f80(x86_fp80) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare fp128 @llvm.round.f128(fp128) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double @llvm.round.f64(double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare float @llvm.round.f32(float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 @llvm.round.f80(x86_fp80) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare fp128 @llvm.round.f128(fp128) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare double @llvm.round.f64(double) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare float @llvm.round.f32(float) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare x86_fp80 @llvm.round.f80(x86_fp80) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare fp128 @llvm.round.f128(fp128) [[READNONE_INTRINSIC2]] __builtin_scalbln(f,f); __builtin_scalblnf(f,f); __builtin_scalblnl(f,f); __builtin_scalblnf128(f,f); @@ -707,10 +707,10 @@ __builtin_scalbn(f,f); __builtin_scalbnf(f,f); __builtin_scalbnl(f,f); __ __builtin_sin(f); __builtin_sinf(f); __builtin_sinl(f); __builtin_sinf128(f); -// NO__ERRNO: declare double @llvm.sin.f64(double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare float @llvm.sin.f32(float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare x86_fp80 @llvm.sin.f80(x86_fp80) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare fp128 @llvm.sin.f128(fp128) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double @llvm.sin.f64(double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare float @llvm.sin.f32(float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 @llvm.sin.f80(x86_fp80) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare fp128 @llvm.sin.f128(fp128) [[READNONE_INTRINSIC2]] // HAS_ERRNO: declare double @sin(double noundef) [[NOT_READNONE]] // HAS_ERRNO: declare float @sinf(float noundef) [[NOT_READNONE]] // HAS_ERRNO: declare x86_fp80 @sinl(x86_fp80 noundef) [[NOT_READNONE]] @@ -747,10 +747,10 @@ __builtin_sincospi(f,d,d); __builtin_sincospif(f,fp,fp); __builtin_sincospil(f,l __builtin_sqrt(f); __builtin_sqrtf(f); __builtin_sqrtl(f); __builtin_sqrtf128(f); -// NO__ERRNO: declare double @llvm.sqrt.f64(double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare float @llvm.sqrt.f32(float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare x86_fp80 @llvm.sqrt.f80(x86_fp80) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare fp128 @llvm.sqrt.f128(fp128) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double @llvm.sqrt.f64(double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare float @llvm.sqrt.f32(float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 @llvm.sqrt.f80(x86_fp80) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare fp128 @llvm.sqrt.f128(fp128) [[READNONE_INTRINSIC2]] // HAS_ERRNO: declare double @sqrt(double noundef) [[NOT_READNONE]] // HAS_ERRNO: declare float @sqrtf(float noundef) [[NOT_READNONE]] // HAS_ERRNO: declare x86_fp80 @sqrtl(x86_fp80 noundef) [[NOT_READNONE]] @@ -791,22 +791,24 @@ __builtin_tgamma(f); __builtin_tgammaf(f); 
__builtin_tgammal(f); __builti __builtin_trunc(f); __builtin_truncf(f); __builtin_truncl(f); __builtin_truncf128(f); -// NO__ERRNO: declare double @llvm.trunc.f64(double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare float @llvm.trunc.f32(float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare x86_fp80 @llvm.trunc.f80(x86_fp80) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare fp128 @llvm.trunc.f128(fp128) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare double @llvm.trunc.f64(double) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare float @llvm.trunc.f32(float) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare x86_fp80 @llvm.trunc.f80(x86_fp80) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare fp128 @llvm.trunc.f128(fp128) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double @llvm.trunc.f64(double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare float @llvm.trunc.f32(float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 @llvm.trunc.f80(x86_fp80) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare fp128 @llvm.trunc.f128(fp128) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare double @llvm.trunc.f64(double) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare float @llvm.trunc.f32(float) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare x86_fp80 @llvm.trunc.f80(x86_fp80) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare fp128 @llvm.trunc.f128(fp128) [[READNONE_INTRINSIC2]] }; // NO__ERRNO: attributes [[READNONE_INTRINSIC]] = { {{.*}}memory(none){{.*}} } +// NO__ERRNO: attributes [[READNONE_INTRINSIC2]] = { {{.*}}memory(none){{.*}} } // NO__ERRNO: attributes [[NOT_READNONE]] = { nounwind {{.*}} } // NO__ERRNO: attributes [[PURE]] = { {{.*}}memory(read){{.*}} } // NO__ERRNO: attributes [[READNONE]] = { {{.*}}memory(none){{.*}} } // HAS_ERRNO: attributes [[NOT_READNONE]] = { nounwind {{.*}} } +// HAS_ERRNO: attributes [[READNONE_INTRINSIC2]] = { {{.*}}memory(none){{.*}} } // HAS_ERRNO: attributes [[READNONE_INTRINSIC]] = { {{.*}}memory(none){{.*}} } // HAS_ERRNO: attributes [[PURE]] = { {{.*}}memory(read){{.*}} } // HAS_ERRNO: attributes [[READNONE]] = { {{.*}}memory(none){{.*}} } diff --git a/clang/test/CodeGen/builtin-sqrt.c b/clang/test/CodeGen/builtin-sqrt.c index 2313a68d2d0e2..3ebf2ac91ccdf 100644 --- a/clang/test/CodeGen/builtin-sqrt.c +++ b/clang/test/CodeGen/builtin-sqrt.c @@ -11,5 +11,5 @@ float foo(float X) { // HAS_ERRNO-NOT: attributes [[ATTR]] = {{{.*}} memory(none) // NO_ERRNO: declare float @llvm.sqrt.f32(float) [[ATTR:#[0-9]+]] -// NO_ERRNO: attributes [[ATTR]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +// NO_ERRNO: attributes [[ATTR]] = { nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none) } diff --git a/clang/test/CodeGen/libcalls.c b/clang/test/CodeGen/libcalls.c index 1e4b06e34aaf9..923719f3ec8e4 100644 --- a/clang/test/CodeGen/libcalls.c +++ b/clang/test/CodeGen/libcalls.c @@ -74,9 +74,9 @@ void test_fma(float a0, double a1, long double a2) { // CHECK-YES: declare float @fmaf(float noundef, float noundef, float noundef) // CHECK-YES: declare double @fma(double noundef, double noundef, double noundef) // CHECK-YES: declare x86_fp80 @fmal(x86_fp80 noundef, x86_fp80 noundef, x86_fp80 noundef) -// CHECK-NO: declare float @llvm.fma.f32(float, float, float) [[NUW_RN2:#[0-9]+]] -// CHECK-NO: declare double @llvm.fma.f64(double, double, double) [[NUW_RN2]] -// CHECK-NO: declare x86_fp80 @llvm.fma.f80(x86_fp80, x86_fp80, x86_fp80) [[NUW_RN2]] +// CHECK-NO: declare float @llvm.fma.f32(float, float, float) [[NUW_RNI]] +// CHECK-NO: declare 
double @llvm.fma.f64(double, double, double) [[NUW_RNI]] +// CHECK-NO: declare x86_fp80 @llvm.fma.f80(x86_fp80, x86_fp80, x86_fp80) [[NUW_RNI]] // Just checking to make sure these library functions are marked readnone void test_builtins(double d, float f, long double ld) { @@ -85,9 +85,9 @@ void test_builtins(double d, float f, long double ld) { double atan_ = atan(d); long double atanl_ = atanl(ld); float atanf_ = atanf(f); -// CHECK-NO: declare double @llvm.atan.f64(double) [[NUW_RNI:#[0-9]+]] -// CHECK-NO: declare x86_fp80 @llvm.atan.f80(x86_fp80) [[NUW_RNI]] -// CHECK-NO: declare float @llvm.atan.f32(float) [[NUW_RNI]] +// CHECK-NO: declare double @llvm.atan.f64(double) [[NUW_RN2:#[0-9]+]] +// CHECK-NO: declare x86_fp80 @llvm.atan.f80(x86_fp80) [[NUW_RN2]] +// CHECK-NO: declare float @llvm.atan.f32(float) [[NUW_RN2]] // CHECK-YES: declare double @atan(double noundef) [[NUW:#[0-9]+]] // CHECK-YES: declare x86_fp80 @atanl(x86_fp80 noundef) [[NUW]] // CHECK-YES: declare float @atanf(float noundef) [[NUW]] @@ -95,9 +95,9 @@ void test_builtins(double d, float f, long double ld) { double atan2_ = atan2(d, 2); long double atan2l_ = atan2l(ld, ld); float atan2f_ = atan2f(f, f); -// CHECK-NO: declare double @llvm.atan2.f64(double, double) [[NUW_RNI]] -// CHECK-NO: declare x86_fp80 @llvm.atan2.f80(x86_fp80, x86_fp80) [[NUW_RNI]] -// CHECK-NO: declare float @llvm.atan2.f32(float, float) [[NUW_RNI]] +// CHECK-NO: declare double @llvm.atan2.f64(double, double) [[NUW_RN2]] +// CHECK-NO: declare x86_fp80 @llvm.atan2.f80(x86_fp80, x86_fp80) [[NUW_RN2]] +// CHECK-NO: declare float @llvm.atan2.f32(float, float) [[NUW_RN2]] // CHECK-YES: declare double @atan2(double noundef, double noundef) [[NUW]] // CHECK-YES: declare x86_fp80 @atan2l(x86_fp80 noundef, x86_fp80 noundef) [[NUW]] // CHECK-YES: declare float @atan2f(float noundef, float noundef) [[NUW]] @@ -124,4 +124,4 @@ void test_builtins(double d, float f, long double ld) { } // CHECK-YES: attributes [[NUW]] = { nounwind "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+cx8,+x87" } -// CHECK-NO-DAG: attributes [[NUW_RNI]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +// CHECK-NO-DAG: attributes [[NUW_RNI]] = { nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none) } diff --git a/clang/test/CodeGen/math-libcalls.c b/clang/test/CodeGen/math-libcalls.c index ad297828f48ed..d4cd6f86b3c51 100644 --- a/clang/test/CodeGen/math-libcalls.c +++ b/clang/test/CodeGen/math-libcalls.c @@ -35,27 +35,27 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) { copysign(f,f); copysignf(f,f);copysignl(f,f); - // NO__ERRNO: declare double @llvm.copysign.f64(double, double) [[READNONE_INTRINSIC]] - // NO__ERRNO: declare float @llvm.copysign.f32(float, float) [[READNONE_INTRINSIC]] - // NO__ERRNO: declare x86_fp80 @llvm.copysign.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC]] - // HAS_ERRNO: declare double @llvm.copysign.f64(double, double) [[READNONE_INTRINSIC:#[0-9]+]] - // HAS_ERRNO: declare float @llvm.copysign.f32(float, float) [[READNONE_INTRINSIC]] - // HAS_ERRNO: declare x86_fp80 @llvm.copysign.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC]] - // HAS_MAYTRAP: declare double @llvm.copysign.f64(double, double) [[READNONE_INTRINSIC:#[0-9]+]] - // HAS_MAYTRAP: declare float @llvm.copysign.f32(float, float) [[READNONE_INTRINSIC]] - // HAS_MAYTRAP: declare x86_fp80 @llvm.copysign.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC]] + // NO__ERRNO: 
declare double @llvm.copysign.f64(double, double) [[READNONE_INTRINSIC2:#[0-9]+]] + // NO__ERRNO: declare float @llvm.copysign.f32(float, float) [[READNONE_INTRINSIC2]] + // NO__ERRNO: declare x86_fp80 @llvm.copysign.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC2]] + // HAS_ERRNO: declare double @llvm.copysign.f64(double, double) [[READNONE_INTRINSIC2:#[0-9]+]] + // HAS_ERRNO: declare float @llvm.copysign.f32(float, float) [[READNONE_INTRINSIC2]] + // HAS_ERRNO: declare x86_fp80 @llvm.copysign.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC2]] + // HAS_MAYTRAP: declare double @llvm.copysign.f64(double, double) [[READNONE_INTRINSIC2:#[0-9]+]] + // HAS_MAYTRAP: declare float @llvm.copysign.f32(float, float) [[READNONE_INTRINSIC2]] + // HAS_MAYTRAP: declare x86_fp80 @llvm.copysign.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC2]] fabs(f); fabsf(f); fabsl(f); - // NO__ERRNO: declare double @llvm.fabs.f64(double) [[READNONE_INTRINSIC]] - // NO__ERRNO: declare float @llvm.fabs.f32(float) [[READNONE_INTRINSIC]] - // NO__ERRNO: declare x86_fp80 @llvm.fabs.f80(x86_fp80) [[READNONE_INTRINSIC]] - // HAS_ERRNO: declare double @llvm.fabs.f64(double) [[READNONE_INTRINSIC]] - // HAS_ERRNO: declare float @llvm.fabs.f32(float) [[READNONE_INTRINSIC]] - // HAS_ERRNO: declare x86_fp80 @llvm.fabs.f80(x86_fp80) [[READNONE_INTRINSIC]] - // HAS_MAYTRAP: declare double @llvm.fabs.f64(double) [[READNONE_INTRINSIC]] - // HAS_MAYTRAP: declare float @llvm.fabs.f32(float) [[READNONE_INTRINSIC]] - // HAS_MAYTRAP: declare x86_fp80 @llvm.fabs.f80(x86_fp80) [[READNONE_INTRINSIC]] + // NO__ERRNO: declare double @llvm.fabs.f64(double) [[READNONE_INTRINSIC2]] + // NO__ERRNO: declare float @llvm.fabs.f32(float) [[READNONE_INTRINSIC2]] + // NO__ERRNO: declare x86_fp80 @llvm.fabs.f80(x86_fp80) [[READNONE_INTRINSIC2]] + // HAS_ERRNO: declare double @llvm.fabs.f64(double) [[READNONE_INTRINSIC2]] + // HAS_ERRNO: declare float @llvm.fabs.f32(float) [[READNONE_INTRINSIC2]] + // HAS_ERRNO: declare x86_fp80 @llvm.fabs.f80(x86_fp80) [[READNONE_INTRINSIC2]] + // HAS_MAYTRAP: declare double @llvm.fabs.f64(double) [[READNONE_INTRINSIC2]] + // HAS_MAYTRAP: declare float @llvm.fabs.f32(float) [[READNONE_INTRINSIC2]] + // HAS_MAYTRAP: declare x86_fp80 @llvm.fabs.f80(x86_fp80) [[READNONE_INTRINSIC2]] frexp(f,i); frexpf(f,i); frexpl(f,i); @@ -86,7 +86,7 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) { // NO__ERRNO: declare { double, double } @llvm.modf.f64(double) [[READNONE_INTRINSIC]] // NO__ERRNO: declare { float, float } @llvm.modf.f32(float) [[READNONE_INTRINSIC]] // NO__ERRNO: declare { x86_fp80, x86_fp80 } @llvm.modf.f80(x86_fp80) [[READNONE_INTRINSIC]] - // HAS_ERRNO: declare { double, double } @llvm.modf.f64(double) [[READNONE_INTRINSIC]] + // HAS_ERRNO: declare { double, double } @llvm.modf.f64(double) [[READNONE_INTRINSIC:#[0-9]+]] // HAS_ERRNO: declare { float, float } @llvm.modf.f32(float) [[READNONE_INTRINSIC]] // HAS_ERRNO: declare { x86_fp80, x86_fp80 } @llvm.modf.f80(x86_fp80) [[READNONE_INTRINSIC]] // HAS_MAYTRAP: declare double @modf(double noundef, ptr noundef) [[NOT_READNONE]] @@ -107,9 +107,9 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) { pow(f,f); powf(f,f); powl(f,f); -// NO__ERRNO: declare double @llvm.pow.f64(double, double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare float @llvm.pow.f32(float, float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare x86_fp80 @llvm.pow.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double 
@llvm.pow.f64(double, double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare float @llvm.pow.f32(float, float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 @llvm.pow.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC2]] // HAS_ERRNO: declare double @pow(double noundef, double noundef) [[NOT_READNONE]] // HAS_ERRNO: declare float @powf(float noundef, float noundef) [[NOT_READNONE]] // HAS_ERRNO: declare x86_fp80 @powl(x86_fp80 noundef, x86_fp80 noundef) [[NOT_READNONE]] @@ -206,21 +206,21 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) { ceil(f); ceilf(f); ceill(f); -// NO__ERRNO: declare double @llvm.ceil.f64(double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare float @llvm.ceil.f32(float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare x86_fp80 @llvm.ceil.f80(x86_fp80) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare double @llvm.ceil.f64(double) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare float @llvm.ceil.f32(float) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare x86_fp80 @llvm.ceil.f80(x86_fp80) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double @llvm.ceil.f64(double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare float @llvm.ceil.f32(float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 @llvm.ceil.f80(x86_fp80) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare double @llvm.ceil.f64(double) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare float @llvm.ceil.f32(float) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare x86_fp80 @llvm.ceil.f80(x86_fp80) [[READNONE_INTRINSIC2]] // HAS_MAYTRAP: declare double @llvm.experimental.constrained.ceil.f64( // HAS_MAYTRAP: declare float @llvm.experimental.constrained.ceil.f32( // HAS_MAYTRAP: declare x86_fp80 @llvm.experimental.constrained.ceil.f80( cos(f); cosf(f); cosl(f); -// NO__ERRNO: declare double @llvm.cos.f64(double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare float @llvm.cos.f32(float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare x86_fp80 @llvm.cos.f80(x86_fp80) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double @llvm.cos.f64(double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare float @llvm.cos.f32(float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 @llvm.cos.f80(x86_fp80) [[READNONE_INTRINSIC2]] // HAS_ERRNO: declare double @cos(double noundef) [[NOT_READNONE]] // HAS_ERRNO: declare float @cosf(float noundef) [[NOT_READNONE]] // HAS_ERRNO: declare x86_fp80 @cosl(x86_fp80 noundef) [[NOT_READNONE]] @@ -266,9 +266,9 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) { exp(f); expf(f); expl(f); -// NO__ERRNO: declare double @llvm.exp.f64(double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare float @llvm.exp.f32(float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare x86_fp80 @llvm.exp.f80(x86_fp80) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double @llvm.exp.f64(double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare float @llvm.exp.f32(float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 @llvm.exp.f80(x86_fp80) [[READNONE_INTRINSIC2]] // HAS_ERRNO: declare double @exp(double noundef) [[NOT_READNONE]] // HAS_ERRNO: declare float @expf(float noundef) [[NOT_READNONE]] // HAS_ERRNO: declare x86_fp80 @expl(x86_fp80 noundef) [[NOT_READNONE]] @@ -278,9 +278,9 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) { exp2(f); exp2f(f); exp2l(f); -// NO__ERRNO: declare double @llvm.exp2.f64(double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare float @llvm.exp2.f32(float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare x86_fp80 
@llvm.exp2.f80(x86_fp80) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double @llvm.exp2.f64(double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare float @llvm.exp2.f32(float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 @llvm.exp2.f80(x86_fp80) [[READNONE_INTRINSIC2]] // HAS_ERRNO: declare double @exp2(double noundef) [[NOT_READNONE]] // HAS_ERRNO: declare float @exp2f(float noundef) [[NOT_READNONE]] // HAS_ERRNO: declare x86_fp80 @exp2l(x86_fp80 noundef) [[NOT_READNONE]] @@ -314,21 +314,21 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) { floor(f); floorf(f); floorl(f); -// NO__ERRNO: declare double @llvm.floor.f64(double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare float @llvm.floor.f32(float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare x86_fp80 @llvm.floor.f80(x86_fp80) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare double @llvm.floor.f64(double) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare float @llvm.floor.f32(float) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare x86_fp80 @llvm.floor.f80(x86_fp80) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double @llvm.floor.f64(double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare float @llvm.floor.f32(float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 @llvm.floor.f80(x86_fp80) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare double @llvm.floor.f64(double) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare float @llvm.floor.f32(float) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare x86_fp80 @llvm.floor.f80(x86_fp80) [[READNONE_INTRINSIC2]] // HAS_MAYTRAP: declare double @llvm.experimental.constrained.floor.f64 // HAS_MAYTRAP: declare float @llvm.experimental.constrained.floor.f32( // HAS_MAYTRAP: declare x86_fp80 @llvm.experimental.constrained.floor.f80( fma(f,f,f); fmaf(f,f,f); fmal(f,f,f); -// NO__ERRNO: declare double @llvm.fma.f64(double, double, double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare float @llvm.fma.f32(float, float, float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare x86_fp80 @llvm.fma.f80(x86_fp80, x86_fp80, x86_fp80) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double @llvm.fma.f64(double, double, double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare float @llvm.fma.f32(float, float, float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 @llvm.fma.f80(x86_fp80, x86_fp80, x86_fp80) [[READNONE_INTRINSIC2]] // HAS_ERRNO: declare double @fma(double noundef, double noundef, double noundef) [[NOT_READNONE]] // HAS_ERRNO: declare float @fmaf(float noundef, float noundef, float noundef) [[NOT_READNONE]] // HAS_ERRNO: declare x86_fp80 @fmal(x86_fp80 noundef, x86_fp80 noundef, x86_fp80 noundef) [[NOT_READNONE]] @@ -350,39 +350,39 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) { fmax(f,f); fmaxf(f,f); fmaxl(f,f); -// NO__ERRNO: declare double @llvm.maxnum.f64(double, double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare float @llvm.maxnum.f32(float, float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare x86_fp80 @llvm.maxnum.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare double @llvm.maxnum.f64(double, double) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare float @llvm.maxnum.f32(float, float) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare x86_fp80 @llvm.maxnum.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double @llvm.maxnum.f64(double, double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare float @llvm.maxnum.f32(float, float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 
@llvm.maxnum.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare double @llvm.maxnum.f64(double, double) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare float @llvm.maxnum.f32(float, float) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare x86_fp80 @llvm.maxnum.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC2]] // HAS_MAYTRAP: declare double @llvm.experimental.constrained.maxnum.f64( // HAS_MAYTRAP: declare float @llvm.experimental.constrained.maxnum.f32( // HAS_MAYTRAP: declare x86_fp80 @llvm.experimental.constrained.maxnum.f80( fmin(f,f); fminf(f,f); fminl(f,f); -// NO__ERRNO: declare double @llvm.minnum.f64(double, double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare float @llvm.minnum.f32(float, float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare x86_fp80 @llvm.minnum.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare double @llvm.minnum.f64(double, double) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare float @llvm.minnum.f32(float, float) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare x86_fp80 @llvm.minnum.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double @llvm.minnum.f64(double, double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare float @llvm.minnum.f32(float, float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 @llvm.minnum.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare double @llvm.minnum.f64(double, double) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare float @llvm.minnum.f32(float, float) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare x86_fp80 @llvm.minnum.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC2]] // HAS_MAYTRAP: declare double @llvm.experimental.constrained.minnum.f64( // HAS_MAYTRAP: declare float @llvm.experimental.constrained.minnum.f32( // HAS_MAYTRAP: declare x86_fp80 @llvm.experimental.constrained.minnum.f80( fmaximum_num(*d,*d); fmaximum_numf(f,f); fmaximum_numl(*l,*l); -// COMMON: declare double @llvm.maximumnum.f64(double, double) [[READNONE_INTRINSIC]] -// COMMON: declare float @llvm.maximumnum.f32(float, float) [[READNONE_INTRINSIC]] -// COMMON: declare x86_fp80 @llvm.maximumnum.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC]] +// COMMON: declare double @llvm.maximumnum.f64(double, double) [[READNONE_INTRINSIC2]] +// COMMON: declare float @llvm.maximumnum.f32(float, float) [[READNONE_INTRINSIC2]] +// COMMON: declare x86_fp80 @llvm.maximumnum.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC2]] fminimum_num(*d,*d); fminimum_numf(f,f); fminimum_numl(*l,*l); -// COMMON: declare double @llvm.minimumnum.f64(double, double) [[READNONE_INTRINSIC]] -// COMMON: declare float @llvm.minimumnum.f32(float, float) [[READNONE_INTRINSIC]] -// COMMON: declare x86_fp80 @llvm.minimumnum.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC]] +// COMMON: declare double @llvm.minimumnum.f64(double, double) [[READNONE_INTRINSIC2]] +// COMMON: declare float @llvm.minimumnum.f32(float, float) [[READNONE_INTRINSIC2]] +// COMMON: declare x86_fp80 @llvm.minimumnum.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC2]] hypot(f,f); hypotf(f,f); hypotl(f,f); @@ -422,9 +422,9 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) { llrint(f); llrintf(f); llrintl(f); -// NO__ERRNO: declare i64 @llvm.llrint.i64.f64(double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare i64 @llvm.llrint.i64.f32(float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare i64 @llvm.llrint.i64.f80(x86_fp80) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare i64 @llvm.llrint.i64.f64(double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: 
declare i64 @llvm.llrint.i64.f32(float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare i64 @llvm.llrint.i64.f80(x86_fp80) [[READNONE_INTRINSIC2]] // HAS_ERRNO: declare i64 @llrint(double noundef) [[NOT_READNONE]] // HAS_ERRNO: declare i64 @llrintf(float noundef) [[NOT_READNONE]] // HAS_ERRNO: declare i64 @llrintl(x86_fp80 noundef) [[NOT_READNONE]] @@ -434,9 +434,9 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) { llround(f); llroundf(f); llroundl(f); -// NO__ERRNO: declare i64 @llvm.llround.i64.f64(double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare i64 @llvm.llround.i64.f32(float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare i64 @llvm.llround.i64.f80(x86_fp80) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare i64 @llvm.llround.i64.f64(double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare i64 @llvm.llround.i64.f32(float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare i64 @llvm.llround.i64.f80(x86_fp80) [[READNONE_INTRINSIC2]] // HAS_ERRNO: declare i64 @llround(double noundef) [[NOT_READNONE]] // HAS_ERRNO: declare i64 @llroundf(float noundef) [[NOT_READNONE]] // HAS_ERRNO: declare i64 @llroundl(x86_fp80 noundef) [[NOT_READNONE]] @@ -446,9 +446,9 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) { log(f); logf(f); logl(f); -// NO__ERRNO: declare double @llvm.log.f64(double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare float @llvm.log.f32(float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare x86_fp80 @llvm.log.f80(x86_fp80) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double @llvm.log.f64(double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare float @llvm.log.f32(float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 @llvm.log.f80(x86_fp80) [[READNONE_INTRINSIC2]] // HAS_ERRNO: declare double @log(double noundef) [[NOT_READNONE]] // HAS_ERRNO: declare float @logf(float noundef) [[NOT_READNONE]] // HAS_ERRNO: declare x86_fp80 @logl(x86_fp80 noundef) [[NOT_READNONE]] @@ -458,9 +458,9 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) { log10(f); log10f(f); log10l(f); -// NO__ERRNO: declare double @llvm.log10.f64(double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare float @llvm.log10.f32(float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare x86_fp80 @llvm.log10.f80(x86_fp80) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double @llvm.log10.f64(double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare float @llvm.log10.f32(float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 @llvm.log10.f80(x86_fp80) [[READNONE_INTRINSIC2]] // HAS_ERRNO: declare double @log10(double noundef) [[NOT_READNONE]] // HAS_ERRNO: declare float @log10f(float noundef) [[NOT_READNONE]] // HAS_ERRNO: declare x86_fp80 @log10l(x86_fp80 noundef) [[NOT_READNONE]] @@ -482,9 +482,9 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) { log2(f); log2f(f); log2l(f); -// NO__ERRNO: declare double @llvm.log2.f64(double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare float @llvm.log2.f32(float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare x86_fp80 @llvm.log2.f80(x86_fp80) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double @llvm.log2.f64(double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare float @llvm.log2.f32(float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 @llvm.log2.f80(x86_fp80) [[READNONE_INTRINSIC2]] // HAS_ERRNO: declare double @log2(double noundef) [[NOT_READNONE]] // HAS_ERRNO: declare float @log2f(float noundef) [[NOT_READNONE]] // HAS_ERRNO: declare x86_fp80 
@log2l(x86_fp80 noundef) [[NOT_READNONE]] @@ -506,9 +506,9 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) { lrint(f); lrintf(f); lrintl(f); -// NO__ERRNO: declare i64 @llvm.lrint.i64.f64(double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare i64 @llvm.lrint.i64.f32(float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare i64 @llvm.lrint.i64.f80(x86_fp80) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare i64 @llvm.lrint.i64.f64(double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare i64 @llvm.lrint.i64.f32(float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare i64 @llvm.lrint.i64.f80(x86_fp80) [[READNONE_INTRINSIC2]] // HAS_ERRNO: declare i64 @lrint(double noundef) [[NOT_READNONE]] // HAS_ERRNO: declare i64 @lrintf(float noundef) [[NOT_READNONE]] // HAS_ERRNO: declare i64 @lrintl(x86_fp80 noundef) [[NOT_READNONE]] @@ -518,9 +518,9 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) { lround(f); lroundf(f); lroundl(f); -// NO__ERRNO: declare i64 @llvm.lround.i64.f64(double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare i64 @llvm.lround.i64.f32(float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare i64 @llvm.lround.i64.f80(x86_fp80) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare i64 @llvm.lround.i64.f64(double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare i64 @llvm.lround.i64.f32(float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare i64 @llvm.lround.i64.f80(x86_fp80) [[READNONE_INTRINSIC2]] // HAS_ERRNO: declare i64 @lround(double noundef) [[NOT_READNONE]] // HAS_ERRNO: declare i64 @lroundf(float noundef) [[NOT_READNONE]] // HAS_ERRNO: declare i64 @lroundl(x86_fp80 noundef) [[NOT_READNONE]] @@ -530,12 +530,12 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) { nearbyint(f); nearbyintf(f); nearbyintl(f); -// NO__ERRNO: declare double @llvm.nearbyint.f64(double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare float @llvm.nearbyint.f32(float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare x86_fp80 @llvm.nearbyint.f80(x86_fp80) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare double @llvm.nearbyint.f64(double) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare float @llvm.nearbyint.f32(float) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare x86_fp80 @llvm.nearbyint.f80(x86_fp80) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double @llvm.nearbyint.f64(double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare float @llvm.nearbyint.f32(float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 @llvm.nearbyint.f80(x86_fp80) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare double @llvm.nearbyint.f64(double) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare float @llvm.nearbyint.f32(float) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare x86_fp80 @llvm.nearbyint.f80(x86_fp80) [[READNONE_INTRINSIC2]] // HAS_MAYTRAP: declare double @llvm.experimental.constrained.nearbyint.f64( // HAS_MAYTRAP: declare float @llvm.experimental.constrained.nearbyint.f32( // HAS_MAYTRAP: declare x86_fp80 @llvm.experimental.constrained.nearbyint.f80( @@ -590,24 +590,24 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) { rint(f); rintf(f); rintl(f); -// NO__ERRNO: declare double @llvm.rint.f64(double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare float @llvm.rint.f32(float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare x86_fp80 @llvm.rint.f80(x86_fp80) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare double @llvm.rint.f64(double) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare float @llvm.rint.f32(float) [[READNONE_INTRINSIC]] -// 
HAS_ERRNO: declare x86_fp80 @llvm.rint.f80(x86_fp80) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double @llvm.rint.f64(double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare float @llvm.rint.f32(float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 @llvm.rint.f80(x86_fp80) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare double @llvm.rint.f64(double) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare float @llvm.rint.f32(float) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare x86_fp80 @llvm.rint.f80(x86_fp80) [[READNONE_INTRINSIC2]] // HAS_MAYTRAP: declare double @llvm.experimental.constrained.rint.f64( // HAS_MAYTRAP: declare float @llvm.experimental.constrained.rint.f32( // HAS_MAYTRAP: declare x86_fp80 @llvm.experimental.constrained.rint.f80( round(f); roundf(f); roundl(f); -// NO__ERRNO: declare double @llvm.round.f64(double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare float @llvm.round.f32(float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare x86_fp80 @llvm.round.f80(x86_fp80) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare double @llvm.round.f64(double) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare float @llvm.round.f32(float) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare x86_fp80 @llvm.round.f80(x86_fp80) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double @llvm.round.f64(double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare float @llvm.round.f32(float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 @llvm.round.f80(x86_fp80) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare double @llvm.round.f64(double) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare float @llvm.round.f32(float) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare x86_fp80 @llvm.round.f80(x86_fp80) [[READNONE_INTRINSIC2]] // HAS_MAYTRAP: declare double @llvm.experimental.constrained.round.f64( // HAS_MAYTRAP: declare float @llvm.experimental.constrained.round.f32( // HAS_MAYTRAP: declare x86_fp80 @llvm.experimental.constrained.round.f80( @@ -638,9 +638,9 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) { sin(f); sinf(f); sinl(f); -// NO__ERRNO: declare double @llvm.sin.f64(double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare float @llvm.sin.f32(float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare x86_fp80 @llvm.sin.f80(x86_fp80) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double @llvm.sin.f64(double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare float @llvm.sin.f32(float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 @llvm.sin.f80(x86_fp80) [[READNONE_INTRINSIC2]] // HAS_ERRNO: declare double @sin(double noundef) [[NOT_READNONE]] // HAS_ERRNO: declare float @sinf(float noundef) [[NOT_READNONE]] // HAS_ERRNO: declare x86_fp80 @sinl(x86_fp80 noundef) [[NOT_READNONE]] @@ -674,9 +674,9 @@ sincos(f, d, d); sincosf(f, fp, fp); sincosl(f, l, l); sqrt(f); sqrtf(f); sqrtl(f); -// NO__ERRNO: declare double @llvm.sqrt.f64(double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare float @llvm.sqrt.f32(float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare x86_fp80 @llvm.sqrt.f80(x86_fp80) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double @llvm.sqrt.f64(double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare float @llvm.sqrt.f32(float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 @llvm.sqrt.f80(x86_fp80) [[READNONE_INTRINSIC2]] // HAS_ERRNO: declare double @sqrt(double noundef) [[NOT_READNONE]] // HAS_ERRNO: declare float @sqrtf(float noundef) [[NOT_READNONE]] // HAS_ERRNO: declare x86_fp80 @sqrtl(x86_fp80 noundef) [[NOT_READNONE]] @@ -722,20 +722,22 @@ sincos(f, d, d); 
sincosf(f, fp, fp); sincosl(f, l, l); trunc(f); truncf(f); truncl(f); -// NO__ERRNO: declare double @llvm.trunc.f64(double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare float @llvm.trunc.f32(float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare x86_fp80 @llvm.trunc.f80(x86_fp80) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare double @llvm.trunc.f64(double) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare float @llvm.trunc.f32(float) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare x86_fp80 @llvm.trunc.f80(x86_fp80) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double @llvm.trunc.f64(double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare float @llvm.trunc.f32(float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 @llvm.trunc.f80(x86_fp80) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare double @llvm.trunc.f64(double) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare float @llvm.trunc.f32(float) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare x86_fp80 @llvm.trunc.f80(x86_fp80) [[READNONE_INTRINSIC2]] }; // NO__ERRNO: attributes [[READNONE_INTRINSIC]] = { {{.*}}memory(none){{.*}} } +// NO__ERRNO: attributes [[READNONE_INTRINSIC2]] = { {{.*}}memory(none){{.*}} } // NO__ERRNO: attributes [[NOT_READNONE]] = { nounwind {{.*}} } // NO__ERRNO: attributes [[READNONE]] = { {{.*}}memory(none){{.*}} } // NO__ERRNO: attributes [[READONLY]] = { {{.*}}memory(read){{.*}} } // HAS_ERRNO: attributes [[NOT_READNONE]] = { nounwind {{.*}} } +// HAS_ERRNO: attributes [[READNONE_INTRINSIC2]] = { {{.*}}memory(none){{.*}} } // HAS_ERRNO: attributes [[READNONE_INTRINSIC]] = { {{.*}}memory(none){{.*}} } // HAS_ERRNO: attributes [[READONLY]] = { {{.*}}memory(read){{.*}} } // HAS_ERRNO: attributes [[READNONE]] = { {{.*}}memory(none){{.*}} } diff --git a/clang/test/CodeGenOpenCL/cl20-device-side-enqueue-attributes.cl b/clang/test/CodeGenOpenCL/cl20-device-side-enqueue-attributes.cl index ea1f734391614..5cbf6452d4c85 100644 --- a/clang/test/CodeGenOpenCL/cl20-device-side-enqueue-attributes.cl +++ b/clang/test/CodeGenOpenCL/cl20-device-side-enqueue-attributes.cl @@ -199,7 +199,7 @@ kernel void device_side_enqueue(global float *a, global float *b, int i) { // SPIR32: attributes #[[ATTR0]] = { convergent noinline norecurse nounwind optnone "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" } // SPIR32: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) } // SPIR32: attributes #[[ATTR2]] = { convergent noinline nounwind optnone "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" } -// SPIR32: attributes #[[ATTR3:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +// SPIR32: attributes #[[ATTR3:[0-9]+]] = { nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none) } // SPIR32: attributes #[[ATTR4]] = { convergent nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" } // SPIR32: attributes #[[ATTR5]] = { convergent nounwind "uniform-work-group-size"="true" } //. diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst index 3c089b5a0ba79..b9507a2d054fe 100644 --- a/llvm/docs/LangRef.rst +++ b/llvm/docs/LangRef.rst @@ -2741,6 +2741,11 @@ For example: ``"nooutline"`` This attribute indicates that outlining passes should not modify the function. 
+``nocreateundeforpoison`` + This attribute indicates that the result of the function (prior to + application of return attributes/metadata) will not be undef or poison, + provided that no argument is undef or poison. If this guarantee is + violated, the behavior is undefined. Call Site Attributes ---------------------- diff --git a/llvm/include/llvm/Bitcode/LLVMBitCodes.h b/llvm/include/llvm/Bitcode/LLVMBitCodes.h index 464f475098ec5..b0c5beae631ce 100644 --- a/llvm/include/llvm/Bitcode/LLVMBitCodes.h +++ b/llvm/include/llvm/Bitcode/LLVMBitCodes.h @@ -801,6 +801,7 @@ enum AttributeKindCodes { ATTR_KIND_CAPTURES = 102, ATTR_KIND_DEAD_ON_RETURN = 103, ATTR_KIND_SANITIZE_ALLOC_TOKEN = 104, + ATTR_KIND_NO_CREATE_UNDEF_OR_POISON = 105, }; enum ComdatSelectionKindCodes { diff --git a/llvm/include/llvm/IR/Attributes.td b/llvm/include/llvm/IR/Attributes.td index 8ce2b1bea8fac..c086a39616249 100644 --- a/llvm/include/llvm/IR/Attributes.td +++ b/llvm/include/llvm/IR/Attributes.td @@ -183,6 +183,11 @@ def NoCallback : EnumAttr<"nocallback", IntersectAnd, [FnAttr]>; /// Specify how the pointer may be captured. def Captures : IntAttr<"captures", IntersectCustom, [ParamAttr]>; +/// Result will not be undef or poison if all arguments are not undef and not +/// poison. +def NoCreateUndefOrPoison + : EnumAttr<"nocreateundeforpoison", IntersectAnd, [FnAttr]>; + /// Function is not a source of divergence. def NoDivergenceSource : EnumAttr<"nodivergencesource", IntersectAnd, [FnAttr]>; diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td index 4d59ee8676b9e..6a079f62dd9cf 100644 --- a/llvm/include/llvm/IR/Intrinsics.td +++ b/llvm/include/llvm/IR/Intrinsics.td @@ -186,6 +186,10 @@ def IntrSpeculatable : IntrinsicProperty; // defined by the hasSideEffects property of the TableGen Instruction class. def IntrHasSideEffects : IntrinsicProperty; +// Result will not be undef or poison if all arguments are not undef and not +// poison. +def IntrNoCreateUndefOrPoison : IntrinsicProperty; + //===----------------------------------------------------------------------===// // IIT constants and utils //===----------------------------------------------------------------------===// @@ -1039,7 +1043,7 @@ def int_experimental_memset_pattern // FIXME: Add version of these floating point intrinsics which allow non-default // rounding modes and FP exception handling. -let IntrProperties = [IntrNoMem, IntrSpeculatable] in { +let IntrProperties = [IntrNoMem, IntrSpeculatable, IntrNoCreateUndefOrPoison] in { def int_fma : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>]>; @@ -1052,16 +1056,8 @@ let IntrProperties = [IntrNoMem, IntrSpeculatable] in { // environment so they can be treated as readnone.
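Note that the new LangRef contract is conditional: the callee may still be passed undef or poison, in which case the attribute promises nothing. A minimal IR sketch of the contract, using a hypothetical function @widget (the name, signature, and the noundef parameter are illustrative, not part of the patch):

```llvm
; Hypothetical helper; nocreateundeforpoison is the attribute added above.
declare i32 @widget(i32, i32) nocreateundeforpoison

define i1 @caller(i32 noundef %a) {
  ; %a is noundef (so neither undef nor poison inside this function) and
  ; the constant 7 is well-defined, so the attribute guarantees %r is
  ; neither undef nor poison.
  %r = call i32 @widget(i32 %a, i32 7)
  %c = icmp eq i32 %r, 0
  ; Branching on %c therefore cannot be a branch on poison.
  br i1 %c, label %yes, label %no
yes:
  ret i1 true
no:
  ret i1 false
}
```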
def int_sqrt : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>; def int_powi : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, llvm_anyint_ty]>; - def int_asin : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>; - def int_acos : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>; - def int_atan : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>; - def int_atan2 : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>]>; def int_sin : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>; def int_cos : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>; - def int_tan : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>; - def int_sinh : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>; - def int_cosh : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>; - def int_tanh : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>; def int_pow : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>]>; def int_log : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>; @@ -1080,12 +1076,6 @@ let IntrProperties = [IntrNoMem, IntrSpeculatable] in { def int_nearbyint : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>; def int_round : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>; def int_roundeven : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>; - def int_sincos : DefaultAttrsIntrinsic<[LLVMMatchType<0>, LLVMMatchType<0>], - [llvm_anyfloat_ty]>; - def int_sincospi : DefaultAttrsIntrinsic<[LLVMMatchType<0>, LLVMMatchType<0>], - [llvm_anyfloat_ty]>; - def int_modf : DefaultAttrsIntrinsic<[LLVMMatchType<0>, LLVMMatchType<0>], - [llvm_anyfloat_ty]>; // Truncate a floating point number with a specific rounding mode def int_fptrunc_round : DefaultAttrsIntrinsic<[ llvm_anyfloat_ty ], @@ -1097,6 +1087,8 @@ let IntrProperties = [IntrNoMem, IntrSpeculatable] in { def int_arithmetic_fence : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>; + // If the value doesn't fit, an unspecified value is returned, but this + // is not poison, so we can still mark these as IntrNoCreateUndefOrPoison. def int_lround : DefaultAttrsIntrinsic<[llvm_anyint_ty], [llvm_anyfloat_ty]>; def int_llround : DefaultAttrsIntrinsic<[llvm_anyint_ty], [llvm_anyfloat_ty]>; def int_lrint : DefaultAttrsIntrinsic<[llvm_anyint_ty], [llvm_anyfloat_ty]>; @@ -1110,29 +1102,50 @@ let IntrProperties = [IntrNoMem, IntrSpeculatable] in { def int_frexp : DefaultAttrsIntrinsic<[llvm_anyfloat_ty, llvm_anyint_ty], [LLVMMatchType<0>]>; } +// TODO: Move all of these into the IntrNoCreateUndefOrPoison case above. +let IntrProperties = [IntrNoMem, IntrSpeculatable] in { + // These functions do not read memory, but are sensitive to the + // rounding mode. LLVM purposely does not model changes to the FP + // environment so they can be treated as readnone.
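The comment on the l(l)round/l(l)rint family deserves a concrete reading: an out-of-range input produces an unspecified value, not poison, which is why the new property still applies. A hedged sketch (the i32/f32 instantiation is just one legal mangling of the polymorphic intrinsic, and @round_it is a made-up wrapper):

```llvm
declare i32 @llvm.lround.i32.f32(float)

define i32 @round_it(float noundef %f) {
  ; %f is well-defined, so %r is a well-defined i32 even when %f does
  ; not fit in 32 bits; its exact value is merely unspecified then.
  %r = call i32 @llvm.lround.i32.f32(float %f)
  ret i32 %r
}
```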
+ def int_asin : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>; + def int_acos : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>; + def int_atan : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>; + def int_atan2 : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>]>; + def int_tan : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>; + def int_sinh : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>; + def int_cosh : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>; + def int_tanh : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>; + def int_sincos : DefaultAttrsIntrinsic<[LLVMMatchType<0>, LLVMMatchType<0>], + [llvm_anyfloat_ty]>; + def int_sincospi : DefaultAttrsIntrinsic<[LLVMMatchType<0>, LLVMMatchType<0>], + [llvm_anyfloat_ty]>; + def int_modf : DefaultAttrsIntrinsic<[LLVMMatchType<0>, LLVMMatchType<0>], + [llvm_anyfloat_ty]>; +} + def int_minnum : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>], - [IntrNoMem, IntrSpeculatable, Commutative] + [IntrNoMem, IntrSpeculatable, Commutative, IntrNoCreateUndefOrPoison] >; def int_maxnum : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>], - [IntrNoMem, IntrSpeculatable, Commutative] + [IntrNoMem, IntrSpeculatable, Commutative, IntrNoCreateUndefOrPoison] >; def int_minimum : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>], - [IntrNoMem, IntrSpeculatable, Commutative] + [IntrNoMem, IntrSpeculatable, Commutative, IntrNoCreateUndefOrPoison] >; def int_maximum : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>], - [IntrNoMem, IntrSpeculatable, Commutative] + [IntrNoMem, IntrSpeculatable, Commutative, IntrNoCreateUndefOrPoison] >; def int_minimumnum : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>], - [IntrNoMem, IntrSpeculatable, Commutative] + [IntrNoMem, IntrSpeculatable, Commutative, IntrNoCreateUndefOrPoison] >; def int_maximumnum : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>], - [IntrNoMem, IntrSpeculatable, Commutative] + [IntrNoMem, IntrSpeculatable, Commutative, IntrNoCreateUndefOrPoison] >; // Internal interface for object size checking @@ -1164,7 +1177,7 @@ let IntrProperties = [IntrInaccessibleMemOnly] in { def int_is_fpclass : DefaultAttrsIntrinsic<[LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>], [llvm_anyfloat_ty, llvm_i32_ty], - [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<1>>]>; + [IntrNoMem, IntrSpeculatable, IntrNoCreateUndefOrPoison, ImmArg<ArgIndex<1>>]>; //===--------------- Constrained Floating Point Intrinsics ----------------===// // @@ -1406,7 +1419,7 @@ def int_expect_with_probability : DefaultAttrsIntrinsic<[llvm_anyint_ty], // // None of these intrinsics accesses memory at all. -let IntrProperties = [IntrNoMem, IntrSpeculatable] in { +let IntrProperties = [IntrNoMem, IntrSpeculatable, IntrNoCreateUndefOrPoison] in { def int_bswap: DefaultAttrsIntrinsic<[llvm_anyint_ty], [LLVMMatchType<0>]>; def int_ctpop: DefaultAttrsIntrinsic<[llvm_anyint_ty], [LLVMMatchType<0>]>; def int_bitreverse : DefaultAttrsIntrinsic<[llvm_anyint_ty], [LLVMMatchType<0>]>; @@ -1521,7 +1534,7 @@ def int_adjust_trampoline : DefaultAttrsIntrinsic< // // Expose the carry flag from add operations on two integrals. 
-let IntrProperties = [IntrNoMem, IntrSpeculatable] in { +let IntrProperties = [IntrNoMem, IntrSpeculatable, IntrNoCreateUndefOrPoison] in { def int_sadd_with_overflow : DefaultAttrsIntrinsic<[llvm_anyint_ty, LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>], [LLVMMatchType<0>, LLVMMatchType<0>]>; @@ -1547,16 +1560,16 @@ let IntrProperties = [IntrNoMem, IntrSpeculatable] in { // def int_sadd_sat : DefaultAttrsIntrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, LLVMMatchType<0>], - [IntrNoMem, IntrSpeculatable, Commutative]>; + [IntrNoMem, IntrSpeculatable, IntrNoCreateUndefOrPoison, Commutative]>; def int_uadd_sat : DefaultAttrsIntrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, LLVMMatchType<0>], - [IntrNoMem, IntrSpeculatable, Commutative]>; + [IntrNoMem, IntrSpeculatable, IntrNoCreateUndefOrPoison, Commutative]>; def int_ssub_sat : DefaultAttrsIntrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, LLVMMatchType<0>], - [IntrNoMem, IntrSpeculatable]>; + [IntrNoMem, IntrSpeculatable, IntrNoCreateUndefOrPoison]>; def int_usub_sat : DefaultAttrsIntrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, LLVMMatchType<0>], - [IntrNoMem, IntrSpeculatable]>; + [IntrNoMem, IntrSpeculatable, IntrNoCreateUndefOrPoison]>; def int_sshl_sat : DefaultAttrsIntrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]>; @@ -1611,22 +1624,22 @@ def int_abs : DefaultAttrsIntrinsic< def int_smax : DefaultAttrsIntrinsic< [llvm_anyint_ty], [LLVMMatchType<0>, LLVMMatchType<0>], - [IntrNoMem, IntrSpeculatable]>; + [IntrNoMem, IntrSpeculatable, IntrNoCreateUndefOrPoison]>; def int_smin : DefaultAttrsIntrinsic< [llvm_anyint_ty], [LLVMMatchType<0>, LLVMMatchType<0>], - [IntrNoMem, IntrSpeculatable]>; + [IntrNoMem, IntrSpeculatable, IntrNoCreateUndefOrPoison]>; def int_umax : DefaultAttrsIntrinsic< [llvm_anyint_ty], [LLVMMatchType<0>, LLVMMatchType<0>], - [IntrNoMem, IntrSpeculatable]>; + [IntrNoMem, IntrSpeculatable, IntrNoCreateUndefOrPoison]>; def int_umin : DefaultAttrsIntrinsic< [llvm_anyint_ty], [LLVMMatchType<0>, LLVMMatchType<0>], - [IntrNoMem, IntrSpeculatable]>; + [IntrNoMem, IntrSpeculatable, IntrNoCreateUndefOrPoison]>; def int_scmp : DefaultAttrsIntrinsic< [llvm_anyint_ty], [llvm_anyint_ty, LLVMMatchType<1>], - [IntrNoMem, IntrSpeculatable, Range<RetIndex, -1, 2>]>; + [IntrNoMem, IntrSpeculatable, IntrNoCreateUndefOrPoison, Range<RetIndex, -1, 2>]>; def int_ucmp : DefaultAttrsIntrinsic< [llvm_anyint_ty], [llvm_anyint_ty, LLVMMatchType<1>], - [IntrNoMem, IntrSpeculatable, Range<RetIndex, -1, 2>]>; + [IntrNoMem, IntrSpeculatable, IntrNoCreateUndefOrPoison, Range<RetIndex, -1, 2>]>; //===------------------------- Memory Use Markers -------------------------===// // @@ -1868,7 +1881,7 @@ def int_convert_from_fp16 : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [llvm_i16_ } // Saturating floating point to integer intrinsics -let IntrProperties = [IntrNoMem, IntrSpeculatable] in { +let IntrProperties = [IntrNoMem, IntrSpeculatable, IntrNoCreateUndefOrPoison] in { def int_fptoui_sat : DefaultAttrsIntrinsic<[llvm_anyint_ty], [llvm_anyfloat_ty]>; def int_fptosi_sat : DefaultAttrsIntrinsic<[llvm_anyint_ty], [llvm_anyfloat_ty]>; } @@ -1890,7 +1903,7 @@ def int_fake_use : DefaultAttrsIntrinsic<[], [llvm_vararg_ty], // First argument must be pointer or vector of pointer. This is checked by the // verifier. 
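Since the with.overflow and saturating intrinsics above now carry IntrNoCreateUndefOrPoison, their results are well-defined whenever their inputs are, so consumers of the overflow bit need no defensive freeze. A small illustrative example (@checked_add is a hypothetical function, not from the patch):

```llvm
declare { i32, i1 } @llvm.sadd.with.overflow.i32(i32, i32)

define i32 @checked_add(i32 noundef %a, i32 noundef %b) {
  ; Well-defined inputs yield a well-defined {sum, overflow} pair.
  %pair = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %a, i32 %b)
  %sum = extractvalue { i32, i1 } %pair, 0
  %ovf = extractvalue { i32, i1 } %pair, 1
  ; Safe to branch directly on the overflow flag.
  br i1 %ovf, label %sat, label %ok
sat:
  ret i32 2147483647 ; saturate at INT32_MAX on overflow
ok:
  ret i32 %sum
}
```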
def int_ptrmask: DefaultAttrsIntrinsic<[llvm_any_ty], [LLVMMatchType<0>, llvm_anyint_ty], - [IntrNoMem, IntrSpeculatable]>; + [IntrNoMem, IntrSpeculatable, IntrNoCreateUndefOrPoison]>; // Intrinsic to wrap a thread local variable. def int_threadlocal_address : DefaultAttrsIntrinsic<[llvm_anyptr_ty], [LLVMMatchType<0>], diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp index 0a72076f51824..523374bdc472f 100644 --- a/llvm/lib/Analysis/ValueTracking.cpp +++ b/llvm/lib/Analysis/ValueTracking.cpp @@ -7419,84 +7419,20 @@ static bool canCreateUndefOrPoison(const Operator *Op, UndefPoisonKind Kind, if (cast<ConstantInt>(II->getArgOperand(1))->isNullValue()) return false; break; - case Intrinsic::ctpop: - case Intrinsic::bswap: - case Intrinsic::bitreverse: - case Intrinsic::fshl: - case Intrinsic::fshr: - case Intrinsic::smax: - case Intrinsic::smin: - case Intrinsic::scmp: - case Intrinsic::umax: - case Intrinsic::umin: - case Intrinsic::ucmp: - case Intrinsic::ptrmask: - case Intrinsic::fptoui_sat: - case Intrinsic::fptosi_sat: - case Intrinsic::sadd_with_overflow: - case Intrinsic::ssub_with_overflow: - case Intrinsic::smul_with_overflow: - case Intrinsic::uadd_with_overflow: - case Intrinsic::usub_with_overflow: - case Intrinsic::umul_with_overflow: - case Intrinsic::sadd_sat: - case Intrinsic::uadd_sat: - case Intrinsic::ssub_sat: - case Intrinsic::usub_sat: - return false; case Intrinsic::sshl_sat: case Intrinsic::ushl_sat: - return includesPoison(Kind) && - !shiftAmountKnownInRange(II->getArgOperand(1)); - case Intrinsic::fma: - case Intrinsic::fmuladd: - case Intrinsic::sqrt: - case Intrinsic::powi: - case Intrinsic::sin: - case Intrinsic::cos: - case Intrinsic::pow: - case Intrinsic::log: - case Intrinsic::log10: - case Intrinsic::log2: - case Intrinsic::exp: - case Intrinsic::exp2: - case Intrinsic::exp10: - case Intrinsic::fabs: - case Intrinsic::copysign: - case Intrinsic::floor: - case Intrinsic::ceil: - case Intrinsic::trunc: - case Intrinsic::rint: - case Intrinsic::nearbyint: - case Intrinsic::round: - case Intrinsic::roundeven: - case Intrinsic::fptrunc_round: - case Intrinsic::canonicalize: - case Intrinsic::arithmetic_fence: - case Intrinsic::minnum: - case Intrinsic::maxnum: - case Intrinsic::minimum: - case Intrinsic::maximum: - case Intrinsic::minimumnum: - case Intrinsic::maximumnum: - case Intrinsic::is_fpclass: - case Intrinsic::ldexp: - case Intrinsic::frexp: - return false; - case Intrinsic::lround: - case Intrinsic::llround: - case Intrinsic::lrint: - case Intrinsic::llrint: - // If the value doesn't fit an unspecified value is returned (but this - // is not poison). 
- return false; + if (!includesPoison(Kind) || + shiftAmountKnownInRange(II->getArgOperand(1))) + return false; + break; } } [[fallthrough]]; case Instruction::CallBr: case Instruction::Invoke: { const auto *CB = cast<CallBase>(Op); - return !CB->hasRetAttr(Attribute::NoUndef); + return !CB->hasRetAttr(Attribute::NoUndef) && + !CB->hasFnAttr(Attribute::NoCreateUndefOrPoison); } case Instruction::InsertElement: case Instruction::ExtractElement: { diff --git a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp index 1bdf3e9f684f5..8930d64de5e37 100644 --- a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp +++ b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp @@ -2257,6 +2257,8 @@ static Attribute::AttrKind getAttrFromCode(uint64_t Code) { return Attribute::Captures; case bitc::ATTR_KIND_DEAD_ON_RETURN: return Attribute::DeadOnReturn; + case bitc::ATTR_KIND_NO_CREATE_UNDEF_OR_POISON: + return Attribute::NoCreateUndefOrPoison; } } diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp index f17656c7c3b03..76494c792ac7b 100644 --- a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp +++ b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp @@ -956,6 +956,8 @@ static uint64_t getAttrKindEncoding(Attribute::AttrKind Kind) { return bitc::ATTR_KIND_CAPTURES; case Attribute::DeadOnReturn: return bitc::ATTR_KIND_DEAD_ON_RETURN; + case Attribute::NoCreateUndefOrPoison: + return bitc::ATTR_KIND_NO_CREATE_UNDEF_OR_POISON; case Attribute::EndAttrKinds: llvm_unreachable("Can not encode end-attribute kinds marker."); case Attribute::None: diff --git a/llvm/lib/Transforms/Utils/CodeExtractor.cpp b/llvm/lib/Transforms/Utils/CodeExtractor.cpp index 5ba6f95f5fae8..608661583c3db 100644 --- a/llvm/lib/Transforms/Utils/CodeExtractor.cpp +++ b/llvm/lib/Transforms/Utils/CodeExtractor.cpp @@ -933,6 +933,7 @@ Function *CodeExtractor::constructFunctionDeclaration( case Attribute::CoroDestroyOnlyWhenComplete: case Attribute::CoroElideSafe: case Attribute::NoDivergenceSource: + case Attribute::NoCreateUndefOrPoison: continue; // Those attributes should be safe to propagate to the extracted function. 
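The ValueTracking change above is the consumer side: canCreateUndefOrPoison for a call now also returns false when the callee carries the new function attribute, not only when the call site has a noundef return. The two guarantees differ in strength; a contrast sketch with two hypothetical declarations (@always_defined and @defined_if_args_are are illustrative names only):

```llvm
; noundef on the return: the result is never undef or poison,
; unconditionally (returning such a value would be immediate UB).
declare noundef i32 @always_defined(i32)
; nocreateundeforpoison: the result is well-defined only if every
; argument is well-defined.
declare i32 @defined_if_args_are(i32) nocreateundeforpoison

define i32 @demo(i32 %maybe_poison) {
  ; %r1 may be assumed well-defined regardless of the argument.
  %r1 = call i32 @always_defined(i32 %maybe_poison)
  ; %r2 is only as well-defined as %maybe_poison; ValueTracking must
  ; still reason about the argument.
  %r2 = call i32 @defined_if_args_are(i32 %maybe_poison)
  %s = add i32 %r1, %r2
  ret i32 %s
}
```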
case Attribute::AlwaysInline: diff --git a/llvm/test/Bitcode/attributes.ll b/llvm/test/Bitcode/attributes.ll index aef7810fe2c3b..107a98aebeeb8 100644 --- a/llvm/test/Bitcode/attributes.ll +++ b/llvm/test/Bitcode/attributes.ll @@ -521,6 +521,11 @@ define void @f_sanitize_alloc_token() sanitize_alloc_token { ret void; } +; CHECK: define void @f_no_create_undef_or_poison() #56 +define void @f_no_create_undef_or_poison() nocreateundeforpoison { + ret void; +} + ; CHECK: define void @f87() [[FNRETTHUNKEXTERN:#[0-9]+]] define void @f87() fn_ret_thunk_extern { ret void } @@ -633,6 +638,7 @@ define void @dead_on_return(ptr dead_on_return %p) { ; CHECK: attributes #53 = { sanitize_realtime } ; CHECK: attributes #54 = { sanitize_realtime_blocking } ; CHECK: attributes #55 = { sanitize_alloc_token } +; CHECK: attributes #56 = { nocreateundeforpoison } ; CHECK: attributes [[FNRETTHUNKEXTERN]] = { fn_ret_thunk_extern } ; CHECK: attributes [[SKIPPROFILE]] = { skipprofile } ; CHECK: attributes [[OPTDEBUG]] = { optdebug } diff --git a/llvm/test/CodeGen/AArch64/replace-with-veclib-armpl.ll b/llvm/test/CodeGen/AArch64/replace-with-veclib-armpl.ll index 71c6380177b3a..8a0ac6d4ace7a 100644 --- a/llvm/test/CodeGen/AArch64/replace-with-veclib-armpl.ll +++ b/llvm/test/CodeGen/AArch64/replace-with-veclib-armpl.ll @@ -780,6 +780,7 @@ define <vscale x 4 x float> @llvm_tanh_vscale_f32(<vscale x 4 x float> %in) #0 { attributes #0 = { "target-features"="+sve" } ;. -; CHECK: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +; CHECK: attributes #[[ATTR0:[0-9]+]] = { nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none) } ; CHECK: attributes #[[ATTR1]] = { "target-features"="+sve" } +; CHECK: attributes #[[ATTR2:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } ;. diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-rootn.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-rootn.ll index f6ae516faf2f7..89d039406f375 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-rootn.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-rootn.ll @@ -1489,7 +1489,7 @@ attributes #2 = { noinline } !0 = !{float 3.0} ;. ; CHECK: attributes #[[ATTR0]] = { strictfp } -; CHECK: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +; CHECK: attributes #[[ATTR1:[0-9]+]] = { nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none) } ; CHECK: attributes #[[ATTR2:[0-9]+]] = { nounwind memory(read) } ; CHECK: attributes #[[ATTR3]] = { noinline } ; CHECK: attributes #[[ATTR4]] = { nobuiltin } diff --git a/llvm/test/CodeGen/RISCV/replace-with-veclib-sleef-scalable.ll b/llvm/test/CodeGen/RISCV/replace-with-veclib-sleef-scalable.ll index c489bc3681876..aa63552eb4b63 100644 --- a/llvm/test/CodeGen/RISCV/replace-with-veclib-sleef-scalable.ll +++ b/llvm/test/CodeGen/RISCV/replace-with-veclib-sleef-scalable.ll @@ -488,5 +488,5 @@ declare <vscale x 2 x double> @llvm.trunc.nxv2f64(<vscale x 2 x double>) declare <vscale x 4 x float> @llvm.trunc.nxv4f32(<vscale x 4 x float>) ;. 
; CHECK: attributes #[[ATTR0]] = { "target-features"="+v" } -; CHECK: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) "target-features"="+v" } +; CHECK: attributes #[[ATTR1:[0-9]+]] = { nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none) "target-features"="+v" } ;. diff --git a/llvm/test/Feature/intrinsics.ll b/llvm/test/Feature/intrinsics.ll index b6abc0fff6db7..a2da8f29116e1 100644 --- a/llvm/test/Feature/intrinsics.ll +++ b/llvm/test/Feature/intrinsics.ll @@ -64,7 +64,7 @@ define void @libm() { ; FIXME: test ALL the intrinsics in this file. -; CHECK: declare void @llvm.trap() #1 +; CHECK: declare void @llvm.trap() #2 declare void @llvm.trap() define void @trap() { @@ -72,5 +72,4 @@ define void @trap() { ret void } -; CHECK: attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } -; CHECK: attributes #1 = { cold noreturn nounwind memory(inaccessiblemem: write) } +; CHECK: attributes #2 = { cold noreturn nounwind memory(inaccessiblemem: write) } diff --git a/llvm/test/Linker/drop-attribute.ll b/llvm/test/Linker/drop-attribute.ll index 9be95a89109b4..3d4c13c2ffc75 100644 --- a/llvm/test/Linker/drop-attribute.ll +++ b/llvm/test/Linker/drop-attribute.ll @@ -39,7 +39,7 @@ define void @test_nocallback_definition() nocallback { declare void @test_nocallback_call_site() ; Test that checks that nocallback attribute on an intrinsic is NOT dropped. -; CHECK: ; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn +; CHECK: ; Function Attrs: nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn ; CHECK-NEXT: declare float @llvm.sqrt.f32(float) #0 declare float @llvm.sqrt.f32(float) nocallback diff --git a/llvm/test/Transforms/Attributor/nofree.ll b/llvm/test/Transforms/Attributor/nofree.ll index 2a9d5d91ae053..94aa79aa327f4 100644 --- a/llvm/test/Transforms/Attributor/nofree.ll +++ b/llvm/test/Transforms/Attributor/nofree.ll @@ -238,7 +238,7 @@ define void @call_both() #0 { ; TEST 10 (positive case) ; Call intrinsic function -; CHECK: Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) +; CHECK: Function Attrs: nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none) declare float @llvm.floor.f32(float) define void @call_floor(float %a) #0 { @@ -489,7 +489,7 @@ attributes #2 = { nobuiltin nounwind } ; TUNIT: attributes #[[ATTR3]] = { mustprogress nofree noinline norecurse nosync nounwind willreturn memory(none) uwtable } ; TUNIT: attributes #[[ATTR4]] = { mustprogress nofree noinline nosync nounwind willreturn memory(none) uwtable } ; TUNIT: attributes #[[ATTR5:[0-9]+]] = { nofree noinline nounwind memory(none) uwtable } -; TUNIT: attributes #[[ATTR6:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +; TUNIT: attributes #[[ATTR6:[0-9]+]] = { nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none) } ; TUNIT: attributes #[[ATTR7]] = { nofree nounwind } ; TUNIT: attributes #[[ATTR8]] = { nobuiltin nofree nounwind } ; TUNIT: attributes #[[ATTR9]] = { nosync memory(none) } @@ -506,7 +506,7 @@ attributes #2 = { nobuiltin nounwind } ; CGSCC: attributes #[[ATTR3]] = { mustprogress nofree noinline norecurse nosync nounwind willreturn memory(none) uwtable } ; CGSCC: attributes #[[ATTR4:[0-9]+]] = { nofree noinline nounwind memory(none) uwtable } ; CGSCC: attributes #[[ATTR5]] = { mustprogress 
nofree noinline nosync nounwind willreturn memory(none) uwtable } -; CGSCC: attributes #[[ATTR6:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +; CGSCC: attributes #[[ATTR6:[0-9]+]] = { nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none) } ; CGSCC: attributes #[[ATTR7]] = { nofree nounwind } ; CGSCC: attributes #[[ATTR8]] = { nobuiltin nofree nounwind } ; CGSCC: attributes #[[ATTR9]] = { nosync memory(none) } diff --git a/llvm/test/Transforms/Attributor/nosync.ll b/llvm/test/Transforms/Attributor/nosync.ll index 7ef46e8e94c9e..c15bd775ddb4d 100644 --- a/llvm/test/Transforms/Attributor/nosync.ll +++ b/llvm/test/Transforms/Attributor/nosync.ll @@ -454,7 +454,7 @@ define void @nosync_convergent_callee_test() { ; CHECK: attributes #[[ATTR14:[0-9]+]] = { convergent memory(none) } ; CHECK: attributes #[[ATTR15]] = { memory(none) } ; CHECK: attributes #[[ATTR16]] = { nounwind } -; CHECK: attributes #[[ATTR17:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +; CHECK: attributes #[[ATTR17:[0-9]+]] = { nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none) } ; CHECK: attributes #[[ATTR18]] = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) } ; CHECK: attributes #[[ATTR19]] = { nosync memory(none) } ; CHECK: attributes #[[ATTR20]] = { nofree nounwind } diff --git a/llvm/test/Transforms/Attributor/willreturn.ll b/llvm/test/Transforms/Attributor/willreturn.ll index b7ac7fc2970b0..d65480b05759a 100644 --- a/llvm/test/Transforms/Attributor/willreturn.ll +++ b/llvm/test/Transforms/Attributor/willreturn.ll @@ -276,7 +276,7 @@ define void @conditional_exit(i32 %0, ptr nocapture readonly %1) local_unnamed_a ; TEST 6 (positive case) ; Call intrinsic function -; CHECK: Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) +; CHECK: Function Attrs: nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none) declare float @llvm.floor.f32(float) define void @call_floor(float %a) #0 { @@ -425,7 +425,7 @@ define i32 @loop_constant_trip_count(ptr nocapture readonly %0) #0 { ; CHECK-NEXT: [[TMP4:%.*]] = phi i64 [ 0, [[TMP1:%.*]] ], [ [[TMP9:%.*]], [[TMP3]] ] ; CHECK-NEXT: [[TMP5:%.*]] = phi i32 [ 0, [[TMP1]] ], [ [[TMP8]], [[TMP3]] ] ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4 +; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4, !invariant.load [[META0:![0-9]+]] ; CHECK-NEXT: [[TMP8]] = add nsw i32 [[TMP7]], [[TMP5]] ; CHECK-NEXT: [[TMP9]] = add nuw nsw i64 [[TMP4]], 1 ; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[TMP9]], 10 @@ -472,7 +472,7 @@ define i32 @loop_trip_count_unbound(i32 %0, i32 %1, ptr nocapture readonly %2, i ; CHECK-NEXT: [[TMP10:%.*]] = phi i32 [ [[TMP14]], [[TMP8]] ], [ 0, [[TMP4]] ] ; CHECK-NEXT: [[TMP11:%.*]] = zext i32 [[TMP9]] to i64 ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i64 [[TMP11]] -; CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr [[TMP12]], align 4 +; CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr [[TMP12]], align 4, !invariant.load [[META0]] ; CHECK-NEXT: [[TMP14]] = add nsw i32 [[TMP13]], [[TMP10]] ; CHECK-NEXT: [[TMP15]] = add i32 [[TMP9]], [[TMP3]] ; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i32 [[TMP15]], [[TMP1]] @@ -522,7 +522,7 @@ define i32 @loop_trip_dec(i32 %0, ptr nocapture readonly %1) local_unnamed_addr ; 
CHECK-NEXT: [[TMP7:%.*]] = phi i64 [ [[TMP5]], [[TMP4]] ], [ [[TMP12:%.*]], [[TMP6]] ] ; CHECK-NEXT: [[TMP8:%.*]] = phi i32 [ 0, [[TMP4]] ], [ [[TMP11:%.*]], [[TMP6]] ] ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 [[TMP7]] -; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4 +; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4, !invariant.load [[META0]] ; CHECK-NEXT: [[TMP11]] = add nsw i32 [[TMP10]], [[TMP8]] ; CHECK-NEXT: [[TMP12]] = add nsw i64 [[TMP7]], -1 ; CHECK-NEXT: [[TMP13:%.*]] = icmp sgt i64 [[TMP7]], 0 @@ -1294,7 +1294,7 @@ attributes #1 = { uwtable noinline } ; TUNIT: attributes #[[ATTR5]] = { noreturn } ; TUNIT: attributes #[[ATTR6]] = { noinline noreturn nounwind uwtable } ; TUNIT: attributes #[[ATTR7]] = { noinline nounwind uwtable } -; TUNIT: attributes #[[ATTR8:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +; TUNIT: attributes #[[ATTR8:[0-9]+]] = { nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none) } ; TUNIT: attributes #[[ATTR9:[0-9]+]] = { norecurse willreturn } ; TUNIT: attributes #[[ATTR10]] = { mustprogress noinline nounwind willreturn uwtable } ; TUNIT: attributes #[[ATTR11:[0-9]+]] = { noinline willreturn uwtable } @@ -1332,7 +1332,7 @@ attributes #1 = { uwtable noinline } ; CGSCC: attributes #[[ATTR5]] = { noreturn } ; CGSCC: attributes #[[ATTR6]] = { noinline noreturn nounwind uwtable } ; CGSCC: attributes #[[ATTR7]] = { noinline nounwind uwtable } -; CGSCC: attributes #[[ATTR8:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +; CGSCC: attributes #[[ATTR8:[0-9]+]] = { nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none) } ; CGSCC: attributes #[[ATTR9:[0-9]+]] = { norecurse willreturn } ; CGSCC: attributes #[[ATTR10]] = { mustprogress noinline nounwind willreturn uwtable } ; CGSCC: attributes #[[ATTR11:[0-9]+]] = { noinline willreturn uwtable } @@ -1364,3 +1364,7 @@ attributes #1 = { uwtable noinline } ; CGSCC: attributes #[[ATTR37]] = { nosync willreturn memory(read) } ; CGSCC: attributes #[[ATTR38]] = { willreturn memory(read) } ;. +; TUNIT: [[META0]] = !{} +;. +; CGSCC: [[META0]] = !{} +;. diff --git a/llvm/test/Transforms/EarlyCSE/replace-calls-def-attrs.ll b/llvm/test/Transforms/EarlyCSE/replace-calls-def-attrs.ll index cf871e5714bf5..1dbffd962a638 100644 --- a/llvm/test/Transforms/EarlyCSE/replace-calls-def-attrs.ll +++ b/llvm/test/Transforms/EarlyCSE/replace-calls-def-attrs.ll @@ -262,7 +262,7 @@ define i32 @commutative_intrinsic_intersection_failure(i32 %arg, i32 %arg1) { } ;. 
-; CHECK: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +; CHECK: attributes #[[ATTR0:[0-9]+]] = { nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none) } ; CHECK: attributes #[[ATTR1]] = { memory(none) } ; CHECK: attributes #[[ATTR2]] = { memory(read) } ; CHECK: attributes #[[ATTR3]] = { alwaysinline memory(none) } diff --git a/llvm/test/Transforms/LoopIdiom/basic.ll b/llvm/test/Transforms/LoopIdiom/basic.ll index e8ea912246728..9deccf5352ea8 100644 --- a/llvm/test/Transforms/LoopIdiom/basic.ll +++ b/llvm/test/Transforms/LoopIdiom/basic.ll @@ -1620,5 +1620,5 @@ define noalias ptr @_ZN8CMSPULog9beginImplEja(ptr nocapture writeonly %0) local_ ; CHECK: attributes #[[ATTR1:[0-9]+]] = { nounwind } ; CHECK: attributes #[[ATTR2:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) } ; CHECK: attributes #[[ATTR3:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: write) } -; CHECK: attributes #[[ATTR4:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +; CHECK: attributes #[[ATTR4:[0-9]+]] = { nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none) } ;. diff --git a/llvm/test/Transforms/PreISelIntrinsicLowering/AArch64/expand-exp.ll b/llvm/test/Transforms/PreISelIntrinsicLowering/AArch64/expand-exp.ll index 09f583f9242d5..3416584729317 100644 --- a/llvm/test/Transforms/PreISelIntrinsicLowering/AArch64/expand-exp.ll +++ b/llvm/test/Transforms/PreISelIntrinsicLowering/AArch64/expand-exp.ll @@ -38,5 +38,5 @@ define <4 x float> @fixed_vec_exp(<4 x float> %input) { declare <4 x float> @llvm.exp.v4f32(<4 x float>) #0 declare <vscale x 4 x float> @llvm.exp.nxv4f32(<vscale x 4 x float>) #0 -; CHECK: attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +; CHECK: attributes #0 = { nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none) } attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } diff --git a/llvm/test/Transforms/SimplifyCFG/rangereduce.ll b/llvm/test/Transforms/SimplifyCFG/rangereduce.ll index d1fba91d1e505..169803f7aa012 100644 --- a/llvm/test/Transforms/SimplifyCFG/rangereduce.ll +++ b/llvm/test/Transforms/SimplifyCFG/rangereduce.ll @@ -321,7 +321,7 @@ three: !1 = !{!"branch_weights", i32 5, i32 7, i32 11, i32 13, i32 17} ;. ; CHECK: attributes #[[ATTR0:[0-9]+]] = { optsize } -; CHECK: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +; CHECK: attributes #[[ATTR1:[0-9]+]] = { nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none) } ;. 
; CHECK: [[META0:![0-9]+]] = !{!"function_entry_count", i32 100} ; CHECK: [[PROF1]] = !{!"branch_weights", i32 48, i32 5} diff --git a/llvm/test/tools/llvm-reduce/remove-attributes-from-intrinsics.ll b/llvm/test/tools/llvm-reduce/remove-attributes-from-intrinsics.ll index a6ace2246fa8e..b800f9aa97c8f 100644 --- a/llvm/test/tools/llvm-reduce/remove-attributes-from-intrinsics.ll +++ b/llvm/test/tools/llvm-reduce/remove-attributes-from-intrinsics.ll @@ -26,7 +26,7 @@ define i32 @t(i32 %a) { ; CHECK-ALL: declare i32 @llvm.uadd.sat.i32(i32, i32) #0 declare i32 @llvm.uadd.sat.i32(i32, i32) #0 -; CHECK-ALL: attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +; CHECK-ALL: attributes #0 = { nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none) } ; CHECK-INTERESTINGNESS: attributes #1 = { ; CHECK-INTERESTINGNESS-SAME: "arg4" diff --git a/llvm/utils/TableGen/Basic/CodeGenIntrinsics.cpp b/llvm/utils/TableGen/Basic/CodeGenIntrinsics.cpp index cd866469792a2..ff894853b9771 100644 --- a/llvm/utils/TableGen/Basic/CodeGenIntrinsics.cpp +++ b/llvm/utils/TableGen/Basic/CodeGenIntrinsics.cpp @@ -407,6 +407,8 @@ void CodeGenIntrinsic::setProperty(const Record *R) { hasSideEffects = true; else if (R->getName() == "IntrStrictFP") isStrictFP = true; + else if (R->getName() == "IntrNoCreateUndefOrPoison") + isNoCreateUndefOrPoison = true; else if (R->isSubClassOf("NoCapture")) { unsigned ArgNo = R->getValueAsInt("ArgNo"); addArgAttribute(ArgNo, NoCapture); diff --git a/llvm/utils/TableGen/Basic/CodeGenIntrinsics.h b/llvm/utils/TableGen/Basic/CodeGenIntrinsics.h index 2e86149514f46..15e803c4feba1 100644 --- a/llvm/utils/TableGen/Basic/CodeGenIntrinsics.h +++ b/llvm/utils/TableGen/Basic/CodeGenIntrinsics.h @@ -114,6 +114,9 @@ struct CodeGenIntrinsic { // True if the intrinsic is marked as strictfp. bool isStrictFP = false; + // True if the intrinsic is marked as IntrNoCreateUndefOrPoison. 
+ bool isNoCreateUndefOrPoison = false; + enum ArgAttrKind { NoCapture, NoAlias, diff --git a/llvm/utils/TableGen/Basic/IntrinsicEmitter.cpp b/llvm/utils/TableGen/Basic/IntrinsicEmitter.cpp index 75dffb18fca5a..452d2b08f25c3 100644 --- a/llvm/utils/TableGen/Basic/IntrinsicEmitter.cpp +++ b/llvm/utils/TableGen/Basic/IntrinsicEmitter.cpp @@ -421,7 +421,8 @@ static bool compareFnAttributes(const CodeGenIntrinsic *L, return std::tie(I->canThrow, I->isNoDuplicate, I->isNoMerge, I->isNoReturn, I->isNoCallback, I->isNoSync, I->isNoFree, I->isWillReturn, I->isCold, I->isConvergent, I->isSpeculatable, - I->hasSideEffects, I->isStrictFP); + I->hasSideEffects, I->isStrictFP, + I->isNoCreateUndefOrPoison); }; auto TieL = TieBoolAttributes(L); @@ -446,7 +447,8 @@ static bool hasFnAttributes(const CodeGenIntrinsic &Int) { return !Int.canThrow || Int.isNoReturn || Int.isNoCallback || Int.isNoSync || Int.isNoFree || Int.isWillReturn || Int.isCold || Int.isNoDuplicate || Int.isNoMerge || Int.isConvergent || Int.isSpeculatable || - Int.isStrictFP || getEffectiveME(Int) != MemoryEffects::unknown(); + Int.isStrictFP || Int.isNoCreateUndefOrPoison || + getEffectiveME(Int) != MemoryEffects::unknown(); } namespace { @@ -605,6 +607,8 @@ static AttributeSet getIntrinsicFnAttributeSet(LLVMContext &C, unsigned ID) { addAttribute("Speculatable"); if (Int.isStrictFP) addAttribute("StrictFP"); + if (Int.isNoCreateUndefOrPoison) + addAttribute("NoCreateUndefOrPoison"); const MemoryEffects ME = getEffectiveME(Int); if (ME != MemoryEffects::unknown()) { diff --git a/mlir/test/Target/LLVMIR/llvmir-intrinsics.mlir b/mlir/test/Target/LLVMIR/llvmir-intrinsics.mlir index 60bd24a27868e..1e4cf8d4589cb 100644 --- a/mlir/test/Target/LLVMIR/llvmir-intrinsics.mlir +++ b/mlir/test/Target/LLVMIR/llvmir-intrinsics.mlir @@ -1308,7 +1308,7 @@ llvm.func @experimental_constrained_fpext(%s: f32, %v: vector<4xf32>) { // CHECK-DAG: declare float @llvm.cos.f32(float) // CHECK-DAG: declare <8 x float> @llvm.cos.v8f32(<8 x float>) #0 // CHECK-DAG: declare { float, float } @llvm.sincos.f32(float) -// CHECK-DAG: declare { <8 x float>, <8 x float> } @llvm.sincos.v8f32(<8 x float>) #0 +// CHECK-DAG: declare { <8 x float>, <8 x float> } @llvm.sincos.v8f32(<8 x float>) // CHECK-DAG: declare float @llvm.copysign.f32(float, float) // CHECK-DAG: declare float @llvm.rint.f32(float) // CHECK-DAG: declare double @llvm.rint.f64(double) From 0a95a86634a8a8d2e95a390828e345058b4a723f Mon Sep 17 00:00:00 2001 From: Ramkumar Ramachandra <ramkumar.ramachandra@codasip.com> Date: Tue, 4 Nov 2025 12:02:58 +0000 Subject: [PATCH 151/313] [VPlan] Fix first-lane comment in sinkScalarOperands (NFC) (#166347) To follow up on a post-commit review. --- llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index e8e87aa8a4f3c..2588c878d8472 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -198,8 +198,8 @@ static bool sinkScalarOperands(VPlan &Plan) { VPSingleDefRecipe *SinkCandidate; std::tie(SinkTo, SinkCandidate) = WorkList[I]; - // All recipe users of the sink candidate must be in the same block SinkTo - // or all users outside of SinkTo must have only their first lane used. In + // All recipe users of SinkCandidate must be in the same block SinkTo or all + // users outside of SinkTo must only use the first lane of SinkCandidate. 
In // the latter case, we need to duplicate SinkCandidate. auto UsersOutsideSinkTo = make_filter_range(SinkCandidate->users(), [SinkTo](VPUser *U) { From dbce71382c65450edd7e02d62268e8b7dfcb7ed6 Mon Sep 17 00:00:00 2001 From: Jay Foad <jay.foad@amd.com> Date: Tue, 4 Nov 2025 12:03:16 +0000 Subject: [PATCH 152/313] [AMDGPU] Skip debug instructions when eliminating S_SET_GPR_IDX_ON/OFF (#160715) --- llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp | 2 +- .../CodeGen/AMDGPU/set-gpr-idx-peephole.mir | 22 +++++++++++++++++++ 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp index 7431e111ec862..abefa32b8f802 100644 --- a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp +++ b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp @@ -296,7 +296,7 @@ bool SIPreEmitPeephole::optimizeSetGPR(MachineInstr &First, for (MachineBasicBlock::instr_iterator I = std::next(First.getIterator()), E = MI.getIterator(); I != E; ++I) { - if (I->isBundle()) + if (I->isBundle() || I->isDebugInstr()) continue; switch (I->getOpcode()) { case AMDGPU::S_SET_GPR_IDX_MODE: diff --git a/llvm/test/CodeGen/AMDGPU/set-gpr-idx-peephole.mir b/llvm/test/CodeGen/AMDGPU/set-gpr-idx-peephole.mir index 002d43f937837..131656975ec40 100644 --- a/llvm/test/CodeGen/AMDGPU/set-gpr-idx-peephole.mir +++ b/llvm/test/CodeGen/AMDGPU/set-gpr-idx-peephole.mir @@ -1,5 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass si-pre-emit-peephole -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s -implicit-check-not=S_SET_GPR_IDX +# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass si-pre-emit-peephole -verify-machineinstrs -o - %s -debugify-and-strip-all-safe | FileCheck -check-prefix=GCN %s -implicit-check-not=S_SET_GPR_IDX # RUN: llc -mtriple=amdgcn -mcpu=gfx900 -passes si-pre-emit-peephole -o - %s | FileCheck -check-prefix=GCN %s -implicit-check-not=S_SET_GPR_IDX --- @@ -40,6 +41,27 @@ body: | S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode ... 
+--- +name: meta_in_between +body: | + bb.0: + ; GCN-LABEL: name: meta_in_between + ; GCN: S_SET_GPR_IDX_ON $sgpr2, 1, implicit-def $m0, implicit-def $mode, implicit undef $m0, implicit $mode + ; GCN-NEXT: $vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; GCN-NEXT: KILL $sgpr0 + ; GCN-NEXT: $sgpr0 = IMPLICIT_DEF + ; GCN-NEXT: $vgpr15 = V_MOV_B32_indirect_read undef $vgpr0, implicit $exec, implicit $m0, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; GCN-NEXT: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode + S_SET_GPR_IDX_ON $sgpr2, 1, implicit-def $m0, implicit-def $mode, implicit undef $m0, implicit $mode + $vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode + KILL $sgpr0 + $sgpr0 = IMPLICIT_DEF + S_SET_GPR_IDX_ON killed $sgpr2, 1, implicit-def $m0, implicit-def $mode, implicit undef $m0, implicit $mode + $vgpr15 = V_MOV_B32_indirect_read undef $vgpr0, implicit $exec, implicit $m0, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode +... + --- name: valu_write_in_between body: | From 8636c40f4e5451fd9a6b64c22abe0e8d561f7c89 Mon Sep 17 00:00:00 2001 From: Michael Buch <michaelbuch12@gmail.com> Date: Tue, 4 Nov 2025 12:01:15 +0000 Subject: [PATCH 153/313] [lldb][test] Skip registration-unique.test on Windows Fails with: ``` RUN: at line 4 split-file C:\Users\tcwg\llvm-worker\lldb-aarch64-windows\llvm-project\lldb\test\Shell\Recognizer\registration-unique.test C:\Users\tcwg\llvm-worker\lldb-aarch64-windows\build\tools\lldb\test\Shell\Recognizer\Output\registration-unique.test.tmp executed command: split-file 'C:\Users\tcwg\llvm-worker\lldb-aarch64-windows\llvm-project\lldb\test\Shell\Recognizer\registration-unique.test' 'C:\Users\tcwg\llvm-worker\lldb-aarch64-windows\build\tools\lldb\test\Shell\Recognizer\Output\registration-unique.test.tmp' note: command had no output on stdout or stderr RUN: at line 6 c:\users\tcwg\llvm-worker\lldb-aarch64-windows\build\bin\clang.exe --target=specify-a-target-or-use-a-_host-substitution --target=aarch64-pc-windows-msvc -fmodules-cache-path=C:/Users/tcwg/llvm-worker/lldb-aarch64-windows/build/lldb-test-build.noindex/module-cache-clang\lldb-shell C:\Users\tcwg\llvm-worker\lldb-aarch64-windows\build\tools\lldb\test\Shell\Recognizer\Output\registration-unique.test.tmp/main.cpp -g -o C:\Users\tcwg\llvm-worker\lldb-aarch64-windows\build\tools\lldb\test\Shell\Recognizer\Output\registration-unique.test.tmp/cpp.out executed command: 'c:\users\tcwg\llvm-worker\lldb-aarch64-windows\build\bin\clang.exe' --target=specify-a-target-or-use-a-_host-substitution --target=aarch64-pc-windows-msvc '-fmodules-cache-path=C:/Users/tcwg/llvm-worker/lldb-aarch64-windows/build/lldb-test-build.noindex/module-cache-clang\lldb-shell' 'C:\Users\tcwg\llvm-worker\lldb-aarch64-windows\build\tools\lldb\test\Shell\Recognizer\Output\registration-unique.test.tmp/main.cpp' -g -o 'C:\Users\tcwg\llvm-worker\lldb-aarch64-windows\build\tools\lldb\test\Shell\Recognizer\Output\registration-unique.test.tmp/cpp.out' .---command stderr------------ | clang: warning: argument 
unused during compilation: '-fmodules-cache-path=C:/Users/tcwg/llvm-worker/lldb-aarch64-windows/build/lldb-test-build.noindex/module-cache-clang\lldb-shell' [-Wunused-command-line-argument] `----------------------------- RUN: at line 7 c:\users\tcwg\llvm-worker\lldb-aarch64-windows\build\bin\lldb.exe --no-lldbinit -S C:/Users/tcwg/llvm-worker/lldb-aarch64-windows/build/tools/lldb\test\Shell\lit-lldb-init-quiet -b -s C:\Users\tcwg\llvm-worker\lldb-aarch64-windows\build\tools\lldb\test\Shell\Recognizer\Output\registration-unique.test.tmp/commands.input C:\Users\tcwg\llvm-worker\lldb-aarch64-windows\build\tools\lldb\test\Shell\Recognizer\Output\registration-unique.test.tmp/cpp.out | c:\users\tcwg\llvm-worker\lldb-aarch64-windows\build\bin\filecheck.exe C:\Users\tcwg\llvm-worker\lldb-aarch64-windows\llvm-project\lldb\test\Shell\Recognizer\registration-unique.test executed command: 'c:\users\tcwg\llvm-worker\lldb-aarch64-windows\build\bin\lldb.exe' --no-lldbinit -S 'C:/Users/tcwg/llvm-worker/lldb-aarch64-windows/build/tools/lldb\test\Shell\lit-lldb-init-quiet' -b -s 'C:\Users\tcwg\llvm-worker\lldb-aarch64-windows\build\tools\lldb\test\Shell\Recognizer\Output\registration-unique.test.tmp/commands.input' 'C:\Users\tcwg\llvm-worker\lldb-aarch64-windows\build\tools\lldb\test\Shell\Recognizer\Output\registration-unique.test.tmp/cpp.out' note: command had no output on stdout or stderr executed command: 'c:\users\tcwg\llvm-worker\lldb-aarch64-windows\build\bin\filecheck.exe' 'C:\Users\tcwg\llvm-worker\lldb-aarch64-windows\llvm-project\lldb\test\Shell\Recognizer\registration-unique.test' .---command stderr------------ | C:\Users\tcwg\llvm-worker\lldb-aarch64-windows\llvm-project\lldb\test\Shell\Recognizer\registration-unique.test:45:10: error: CHECK: expected string not found in input | # CHECK: Assert StackFrame Recognizer | ^ | <stdin>:20:38: note: scanning from here | 1: Verbose Trap StackFrame Recognizer, demangled symbol regex ^__clang_trap_msg | ^ | <stdin>:34:10: note: possible intended match here | 3: Verbose Trap StackFrame Recognizer, demangled symbol regex ^__clang_trap_msg | ^ | | Input file: <stdin> | Check file: C:\Users\tcwg\llvm-worker\lldb-aarch64-windows\llvm-project\lldb\test\Shell\Recognizer\registration-unique.test | | -dump-input=help explains the following input dump. ``` --- lldb/test/Shell/Recognizer/registration-unique.test | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lldb/test/Shell/Recognizer/registration-unique.test b/lldb/test/Shell/Recognizer/registration-unique.test index bc1e4a6ea235b..34400d9a27575 100644 --- a/lldb/test/Shell/Recognizer/registration-unique.test +++ b/lldb/test/Shell/Recognizer/registration-unique.test @@ -1,3 +1,5 @@ +# UNSUPPORTED: system-windows + # Checks that the recognizers that should work across language runtimes # are only registered once with the target. 
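The fix itself is just the lit directive added above. As a generic sketch of the mechanism (a hypothetical test file, not the one touched here), lit matches `UNSUPPORTED:` expressions against the feature set it computes for the host, and `system-windows` is one of the features it defines automatically on Windows, so a header like the following skips the whole file there and runs it everywhere else:

```llvm
; Hypothetical .ll test skeleton; lit evaluates the directive below against
; the host's feature list before executing any RUN line.
; UNSUPPORTED: system-windows
; RUN: llc %s -o /dev/null

define void @f() {
  ret void
}
```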
From 51269e220da64637b780791a28fb187cbc36084d Mon Sep 17 00:00:00 2001 From: Haojian Wu <hokein.wu@gmail.com> Date: Tue, 4 Nov 2025 13:01:08 +0100 Subject: [PATCH 154/313] [bazel] Fix bazel build for bb4ed55acdbc7f48bc978147189e8106e3ea42f8 --- utils/bazel/llvm-project-overlay/lldb/source/Plugins/BUILD.bazel | 1 + 1 file changed, 1 insertion(+) diff --git a/utils/bazel/llvm-project-overlay/lldb/source/Plugins/BUILD.bazel b/utils/bazel/llvm-project-overlay/lldb/source/Plugins/BUILD.bazel index ae9ff2878e03b..3ded8bdf89514 100644 --- a/utils/bazel/llvm-project-overlay/lldb/source/Plugins/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/lldb/source/Plugins/BUILD.bazel @@ -80,6 +80,7 @@ cc_library( hdrs = glob(["LanguageRuntime/CPlusPlus/*.h"]), includes = [".."], deps = [ + "//clang:codegen", "//lldb:CoreHeaders", "//lldb:Headers", "//lldb:SymbolHeaders", From bea31dd373e3f053f0b3f1862c6b106831e1f25d Mon Sep 17 00:00:00 2001 From: Karlo Basioli <basioli@google.com> Date: Tue, 4 Nov 2025 13:09:32 +0100 Subject: [PATCH 155/313] Fix bazel build issue after #166157 (#166358) --- utils/bazel/llvm-project-overlay/lldb/source/Plugins/BUILD.bazel | 1 + 1 file changed, 1 insertion(+) diff --git a/utils/bazel/llvm-project-overlay/lldb/source/Plugins/BUILD.bazel b/utils/bazel/llvm-project-overlay/lldb/source/Plugins/BUILD.bazel index 3ded8bdf89514..81d4894e2bf1e 100644 --- a/utils/bazel/llvm-project-overlay/lldb/source/Plugins/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/lldb/source/Plugins/BUILD.bazel @@ -87,6 +87,7 @@ cc_library( "//lldb:TargetHeaders", "//lldb:Utility", "//llvm:Support", + "//clang:codegen", ], ) From 8cb0c0ce1be95e0d0a9b2f366844d1a4e11d5268 Mon Sep 17 00:00:00 2001 From: Karlo Basioli <basioli@google.com> Date: Tue, 4 Nov 2025 13:11:21 +0100 Subject: [PATCH 156/313] Revert "Fix bazel build issue after #166157" (#166359) Reverts llvm/llvm-project#166358 as this was fixed by 51269e220da64637b780791a28fb187cbc36084d --- utils/bazel/llvm-project-overlay/lldb/source/Plugins/BUILD.bazel | 1 - 1 file changed, 1 deletion(-) diff --git a/utils/bazel/llvm-project-overlay/lldb/source/Plugins/BUILD.bazel b/utils/bazel/llvm-project-overlay/lldb/source/Plugins/BUILD.bazel index 81d4894e2bf1e..3ded8bdf89514 100644 --- a/utils/bazel/llvm-project-overlay/lldb/source/Plugins/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/lldb/source/Plugins/BUILD.bazel @@ -87,7 +87,6 @@ cc_library( "//lldb:TargetHeaders", "//lldb:Utility", "//llvm:Support", - "//clang:codegen", ], ) From 28a20b4af94465155f64228746adc07a82368687 Mon Sep 17 00:00:00 2001 From: Julian Nagele <j.nagele@apple.com> Date: Tue, 4 Nov 2025 12:39:04 +0000 Subject: [PATCH 157/313] [VectorCombine] Avoid inserting freeze when scalarizing extend-extract if all extracts would lead to UB on poison. (#164683) This change aims to avoid inserting a freeze instruction between the load and bitcast when scalarizing extend-extract. This is particularly useful in combination with https://github.com/llvm/llvm-project/pull/164682, which can then potentially further scalarize, provided there is no freeze. 
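To make the transformation concrete, here is a hand-written IR sketch modelled on the tests added below (function and value names are invented, not taken from the patch). Because every lane of the extend is extracted and each extract flows into a `noundef` return, poison in %src would already be immediate UB, so the scalarized form may bitcast the source directly instead of freezing it first. The byte layout assumes a little-endian target such as the AArch64 configuration the tests use:

```llvm
; Sketch of the scalarized output when no freeze is required.
define noundef i32 @sum_lanes(<4 x i8> %src) {
  %bc = bitcast <4 x i8> %src to i32   ; no freeze of %src needed
  %b0 = and i32 %bc, 255               ; lane 0 (low byte)
  %s1 = lshr i32 %bc, 8
  %b1 = and i32 %s1, 255               ; lane 1
  %s2 = lshr i32 %bc, 16
  %b2 = and i32 %s2, 255               ; lane 2
  %b3 = lshr i32 %bc, 24               ; lane 3 (high byte)
  %a0 = add i32 %b0, %b1
  %a1 = add i32 %a0, %b2
  %a2 = add i32 %a1, %b3
  ret i32 %a2
}
```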
alive2 proof: https://alive2.llvm.org/ce/z/W-GD88 --- .../Transforms/Vectorize/VectorCombine.cpp | 27 ++- .../VectorCombine/AArch64/ext-extract.ll | 186 ++++++++++++++++++ 2 files changed, 211 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp index d6eb00da11dc8..27a8bbd5776be 100644 --- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp +++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp @@ -2017,8 +2017,31 @@ bool VectorCombine::scalarizeExtExtract(Instruction &I) { Value *ScalarV = Ext->getOperand(0); if (!isGuaranteedNotToBePoison(ScalarV, &AC, dyn_cast<Instruction>(ScalarV), - &DT)) - ScalarV = Builder.CreateFreeze(ScalarV); + &DT)) { + // Check whether all lanes are extracted, all extracts trigger UB + // on poison, and the last extract (and hence all previous ones) + // are guaranteed to execute if Ext executes. If so, we do not + // need to insert a freeze. + SmallDenseSet<ConstantInt *, 8> ExtractedLanes; + bool AllExtractsTriggerUB = true; + ExtractElementInst *LastExtract = nullptr; + BasicBlock *ExtBB = Ext->getParent(); + for (User *U : Ext->users()) { + auto *Extract = cast<ExtractElementInst>(U); + if (Extract->getParent() != ExtBB || !programUndefinedIfPoison(Extract)) { + AllExtractsTriggerUB = false; + break; + } + ExtractedLanes.insert(cast<ConstantInt>(Extract->getIndexOperand())); + if (!LastExtract || LastExtract->comesBefore(Extract)) + LastExtract = Extract; + } + if (ExtractedLanes.size() != DstTy->getNumElements() || + !AllExtractsTriggerUB || + !isGuaranteedToTransferExecutionToSuccessor(Ext->getIterator(), + LastExtract->getIterator())) + ScalarV = Builder.CreateFreeze(ScalarV); + } ScalarV = Builder.CreateBitCast( ScalarV, IntegerType::get(SrcTy->getContext(), DL->getTypeSizeInBits(SrcTy))); diff --git a/llvm/test/Transforms/VectorCombine/AArch64/ext-extract.ll b/llvm/test/Transforms/VectorCombine/AArch64/ext-extract.ll index 60700412686ea..e7b11cdf8475e 100644 --- a/llvm/test/Transforms/VectorCombine/AArch64/ext-extract.ll +++ b/llvm/test/Transforms/VectorCombine/AArch64/ext-extract.ll @@ -346,3 +346,189 @@ entry: call void @use.i32(i32 %ext.3) ret void } + +define noundef i32 @zext_v4i8_all_lanes_used_no_freeze(<4 x i8> %src) { +; CHECK-LABEL: define noundef i32 @zext_v4i8_all_lanes_used_no_freeze( +; CHECK-SAME: <4 x i8> [[SRC:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i8> [[SRC]] to i32 +; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[TMP0]], 24 +; CHECK-NEXT: [[TMP2:%.*]] = lshr i32 [[TMP0]], 16 +; CHECK-NEXT: [[TMP3:%.*]] = and i32 [[TMP2]], 255 +; CHECK-NEXT: [[TMP4:%.*]] = lshr i32 [[TMP0]], 8 +; CHECK-NEXT: [[TMP5:%.*]] = and i32 [[TMP4]], 255 +; CHECK-NEXT: [[TMP6:%.*]] = and i32 [[TMP0]], 255 +; CHECK-NEXT: [[EXT:%.*]] = zext nneg <4 x i8> [[SRC]] to <4 x i32> +; CHECK-NEXT: [[EXT_0:%.*]] = extractelement <4 x i32> [[EXT]], i64 0 +; CHECK-NEXT: [[EXT_1:%.*]] = extractelement <4 x i32> [[EXT]], i64 1 +; CHECK-NEXT: [[EXT_2:%.*]] = extractelement <4 x i32> [[EXT]], i64 2 +; CHECK-NEXT: [[EXT_3:%.*]] = extractelement <4 x i32> [[EXT]], i64 3 +; CHECK-NEXT: [[ADD1:%.*]] = add i32 [[TMP6]], [[TMP5]] +; CHECK-NEXT: [[ADD2:%.*]] = add i32 [[ADD1]], [[TMP3]] +; CHECK-NEXT: [[ADD3:%.*]] = add i32 [[ADD2]], [[TMP1]] +; CHECK-NEXT: ret i32 [[ADD3]] +; +entry: + %ext = zext nneg <4 x i8> %src to <4 x i32> + %ext.0 = extractelement <4 x i32> %ext, i64 0 + %ext.1 = extractelement <4 x i32> %ext, i64 1 + %ext.2 = extractelement <4 x i32> %ext,
i64 2 + %ext.3 = extractelement <4 x i32> %ext, i64 3 + + %add1 = add i32 %ext.0, %ext.1 + %add2 = add i32 %add1, %ext.2 + %add3 = add i32 %add2, %ext.3 + ret i32 %add3 +} + +define noundef i32 @zext_v4i8_not_all_lanes_used(<4 x i8> %src) { +; CHECK-LABEL: define noundef i32 @zext_v4i8_not_all_lanes_used( +; CHECK-SAME: <4 x i8> [[SRC:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP2:%.*]] = freeze <4 x i8> [[SRC]] +; CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i8> [[TMP2]] to i32 +; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[TMP0]], 24 +; CHECK-NEXT: [[TMP4:%.*]] = lshr i32 [[TMP0]], 8 +; CHECK-NEXT: [[TMP5:%.*]] = and i32 [[TMP4]], 255 +; CHECK-NEXT: [[TMP6:%.*]] = and i32 [[TMP0]], 255 +; CHECK-NEXT: [[EXT:%.*]] = zext nneg <4 x i8> [[SRC]] to <4 x i32> +; CHECK-NEXT: [[EXT_0:%.*]] = extractelement <4 x i32> [[EXT]], i64 0 +; CHECK-NEXT: [[EXT_1:%.*]] = extractelement <4 x i32> [[EXT]], i64 1 +; CHECK-NEXT: [[EXT_3:%.*]] = extractelement <4 x i32> [[EXT]], i64 3 +; CHECK-NEXT: [[ADD2:%.*]] = add i32 [[TMP6]], [[TMP5]] +; CHECK-NEXT: [[ADD3:%.*]] = add i32 [[ADD2]], [[TMP1]] +; CHECK-NEXT: ret i32 [[ADD3]] +; +entry: + %ext = zext nneg <4 x i8> %src to <4 x i32> + %ext.0 = extractelement <4 x i32> %ext, i64 0 + %ext.1 = extractelement <4 x i32> %ext, i64 1 + %ext.3 = extractelement <4 x i32> %ext, i64 3 + + %add1 = add i32 %ext.0, %ext.1 + %add2 = add i32 %add1, %ext.3 + ret i32 %add2 +} + +define i32 @zext_v4i8_all_lanes_used_no_ub(<4 x i8> %src) { +; CHECK-LABEL: define i32 @zext_v4i8_all_lanes_used_no_ub( +; CHECK-SAME: <4 x i8> [[SRC:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = freeze <4 x i8> [[SRC]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i8> [[TMP0]] to i32 +; CHECK-NEXT: [[TMP2:%.*]] = lshr i32 [[TMP1]], 24 +; CHECK-NEXT: [[TMP3:%.*]] = lshr i32 [[TMP1]], 16 +; CHECK-NEXT: [[TMP4:%.*]] = and i32 [[TMP3]], 255 +; CHECK-NEXT: [[TMP5:%.*]] = lshr i32 [[TMP1]], 8 +; CHECK-NEXT: [[TMP6:%.*]] = and i32 [[TMP5]], 255 +; CHECK-NEXT: [[TMP7:%.*]] = and i32 [[TMP1]], 255 +; CHECK-NEXT: [[EXT:%.*]] = zext nneg <4 x i8> [[SRC]] to <4 x i32> +; CHECK-NEXT: [[EXT_0:%.*]] = extractelement <4 x i32> [[EXT]], i64 0 +; CHECK-NEXT: [[EXT_1:%.*]] = extractelement <4 x i32> [[EXT]], i64 1 +; CHECK-NEXT: [[EXT_2:%.*]] = extractelement <4 x i32> [[EXT]], i64 2 +; CHECK-NEXT: [[EXT_3:%.*]] = extractelement <4 x i32> [[EXT]], i64 3 +; CHECK-NEXT: [[ADD1:%.*]] = add i32 [[TMP7]], [[TMP6]] +; CHECK-NEXT: [[ADD2:%.*]] = add i32 [[ADD1]], [[TMP4]] +; CHECK-NEXT: [[ADD3:%.*]] = add i32 [[ADD2]], [[TMP2]] +; CHECK-NEXT: ret i32 [[ADD3]] +; +entry: + %ext = zext nneg <4 x i8> %src to <4 x i32> + %ext.0 = extractelement <4 x i32> %ext, i64 0 + %ext.1 = extractelement <4 x i32> %ext, i64 1 + %ext.2 = extractelement <4 x i32> %ext, i64 2 + %ext.3 = extractelement <4 x i32> %ext, i64 3 + + %add1 = add i32 %ext.0, %ext.1 + %add2 = add i32 %add1, %ext.2 + %add3 = add i32 %add2, %ext.3 + ret i32 %add3 +} + +define noundef i32 @zext_v4i8_extracts_different_blocks(<4 x i8> %src, i1 %cond) { +; CHECK-LABEL: define noundef i32 @zext_v4i8_extracts_different_blocks( +; CHECK-SAME: <4 x i8> [[SRC:%.*]], i1 [[COND:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = freeze <4 x i8> [[SRC]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i8> [[TMP0]] to i32 +; CHECK-NEXT: [[TMP2:%.*]] = lshr i32 [[TMP1]], 24 +; CHECK-NEXT: [[TMP3:%.*]] = lshr i32 [[TMP1]], 16 +; CHECK-NEXT: [[TMP4:%.*]] = and i32 [[TMP3]], 255 +; CHECK-NEXT: [[TMP5:%.*]] = lshr i32 [[TMP1]], 8 +; CHECK-NEXT: [[TMP6:%.*]] = 
and i32 [[TMP5]], 255 +; CHECK-NEXT: [[TMP7:%.*]] = and i32 [[TMP1]], 255 +; CHECK-NEXT: [[EXT:%.*]] = zext nneg <4 x i8> [[SRC]] to <4 x i32> +; CHECK-NEXT: [[EXT_0:%.*]] = extractelement <4 x i32> [[EXT]], i64 0 +; CHECK-NEXT: [[EXT_1:%.*]] = extractelement <4 x i32> [[EXT]], i64 1 +; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]] +; CHECK: [[THEN]]: +; CHECK-NEXT: [[EXT_2:%.*]] = extractelement <4 x i32> [[EXT]], i64 2 +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[ELSE]]: +; CHECK-NEXT: [[EXT_3:%.*]] = extractelement <4 x i32> [[EXT]], i64 3 +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[PHI:%.*]] = phi i32 [ [[TMP4]], %[[THEN]] ], [ [[TMP2]], %[[ELSE]] ] +; CHECK-NEXT: [[ADD1:%.*]] = add i32 [[TMP7]], [[TMP6]] +; CHECK-NEXT: [[ADD2:%.*]] = add i32 [[ADD1]], [[PHI]] +; CHECK-NEXT: ret i32 [[ADD2]] +; +entry: + %ext = zext nneg <4 x i8> %src to <4 x i32> + %ext.0 = extractelement <4 x i32> %ext, i64 0 + %ext.1 = extractelement <4 x i32> %ext, i64 1 + br i1 %cond, label %then, label %else + +then: + %ext.2 = extractelement <4 x i32> %ext, i64 2 + br label %exit + +else: + %ext.3 = extractelement <4 x i32> %ext, i64 3 + br label %exit + +exit: + %phi = phi i32 [ %ext.2, %then ], [ %ext.3, %else ] + %add1 = add i32 %ext.0, %ext.1 + %add2 = add i32 %add1, %phi + ret i32 %add2 +} + + +declare void @may_throw() willreturn + +define noundef i32 @zext_v4i8_throwing_call_between(<4 x i8> %src) { +; CHECK-LABEL: define noundef i32 @zext_v4i8_throwing_call_between( +; CHECK-SAME: <4 x i8> [[SRC:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = freeze <4 x i8> [[SRC]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i8> [[TMP0]] to i32 +; CHECK-NEXT: [[TMP2:%.*]] = lshr i32 [[TMP1]], 24 +; CHECK-NEXT: [[TMP3:%.*]] = lshr i32 [[TMP1]], 16 +; CHECK-NEXT: [[TMP4:%.*]] = and i32 [[TMP3]], 255 +; CHECK-NEXT: [[TMP5:%.*]] = lshr i32 [[TMP1]], 8 +; CHECK-NEXT: [[TMP6:%.*]] = and i32 [[TMP5]], 255 +; CHECK-NEXT: [[TMP7:%.*]] = and i32 [[TMP1]], 255 +; CHECK-NEXT: [[EXT:%.*]] = zext nneg <4 x i8> [[SRC]] to <4 x i32> +; CHECK-NEXT: [[EXT_0:%.*]] = extractelement <4 x i32> [[EXT]], i64 0 +; CHECK-NEXT: [[EXT_1:%.*]] = extractelement <4 x i32> [[EXT]], i64 1 +; CHECK-NEXT: [[EXT_2:%.*]] = extractelement <4 x i32> [[EXT]], i64 2 +; CHECK-NEXT: call void @may_throw() +; CHECK-NEXT: [[EXT_3:%.*]] = extractelement <4 x i32> [[EXT]], i64 3 +; CHECK-NEXT: [[ADD1:%.*]] = add i32 [[TMP7]], [[TMP6]] +; CHECK-NEXT: [[ADD2:%.*]] = add i32 [[ADD1]], [[TMP4]] +; CHECK-NEXT: [[ADD3:%.*]] = add i32 [[ADD2]], [[TMP2]] +; CHECK-NEXT: ret i32 [[ADD3]] +; +entry: + %ext = zext nneg <4 x i8> %src to <4 x i32> + %ext.0 = extractelement <4 x i32> %ext, i64 0 + %ext.1 = extractelement <4 x i32> %ext, i64 1 + %ext.2 = extractelement <4 x i32> %ext, i64 2 + call void @may_throw() + %ext.3 = extractelement <4 x i32> %ext, i64 3 + %add1 = add i32 %ext.0, %ext.1 + %add2 = add i32 %add1, %ext.2 + %add3 = add i32 %add2, %ext.3 + ret i32 %add3 +} From 0307147105d569e45f1d5a7e81d128e90fd648be Mon Sep 17 00:00:00 2001 From: Alex Voicu <alexandru.voicu@amd.com> Date: Tue, 4 Nov 2025 14:45:53 +0200 Subject: [PATCH 158/313] [NFC][SPIRV] Add AMDGCN SPIR-V specific defaults to the BE (#165815) AMDGCN flavoured SPIR-V has slightly different defaults from what the BE adopts: it assumes all extensions are enabled, and expects nonsemantic debug info to be generated. 
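As a rough usage sketch (a hypothetical reduced test, not one of the files touched below), none of the usual opt-in flags are needed once the AMD vendor triple is used; the triple alone is expected to select SPIR-V 1.6, enable all extensions, and emit nonsemantic debug info:

```llvm
; REQUIRES: spirv-tools
; Hypothetical test: no --spirv-ext=all and no --spv-emit-nonsemantic-debug-info.
; RUN: llc -O0 -mtriple=spirv64-amd-amdhsa %s -o - --filetype=obj | spirv-dis | FileCheck %s
; CHECK: Version: 1.6

define spir_func void @empty() {
  ret void
}
```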
Furthermore, it is necessary to encode in the resulting SPIR-V binary that what was generated was AMDGCN flavoured, which we do by setting the Generator Version to `UINT16_MAX` (which matches what we expect to see at reverse translation). We will register this generator version at <https://github.com/KhronosGroup/SPIRV-Headers>. This is a preliminary patch out of a series of patches that are needed for adopting the BE for AMDGCN flavoured SPIR-V generation. --- llvm/lib/MC/SPIRVObjectWriter.cpp | 7 +++++-- llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp | 11 +++++++++-- llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp | 3 ++- .../CodeGen/SPIRV/debug-info/debug-type-pointer.ll | 4 +++- llvm/test/CodeGen/SPIRV/extensions/SPV_EXT_optnone.ll | 2 ++ .../CodeGen/SPIRV/extensions/enable-all-extensions.ll | 1 + .../SPIRV/physical-layout/generator-magic-number.ll | 2 ++ .../CodeGen/SPIRV/physical-layout/spirv-version.ll | 2 ++ 8 files changed, 26 insertions(+), 6 deletions(-) diff --git a/llvm/lib/MC/SPIRVObjectWriter.cpp b/llvm/lib/MC/SPIRVObjectWriter.cpp index 5e3713778286f..d693ea33d8d7b 100644 --- a/llvm/lib/MC/SPIRVObjectWriter.cpp +++ b/llvm/lib/MC/SPIRVObjectWriter.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "llvm/MC/MCAssembler.h" +#include "llvm/MC/MCContext.h" #include "llvm/MC/MCSPIRVObjectWriter.h" #include "llvm/MC/MCSection.h" #include "llvm/MC/MCValue.h" @@ -17,8 +18,10 @@ using namespace llvm; void SPIRVObjectWriter::writeHeader(const MCAssembler &Asm) { constexpr uint32_t MagicNumber = 0x07230203; constexpr uint32_t GeneratorID = 43; - constexpr uint32_t GeneratorMagicNumber = - (GeneratorID << 16) | (LLVM_VERSION_MAJOR); + const uint32_t GeneratorMagicNumber = + Asm.getContext().getTargetTriple().getVendor() == Triple::AMD + ? UINT16_MAX + : ((GeneratorID << 16) | (LLVM_VERSION_MAJOR)); constexpr uint32_t Schema = 0; W.write<uint32_t>(MagicNumber); diff --git a/llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp b/llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp index ba09692fec515..ad6c9cd421b7c 100644 --- a/llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp @@ -70,7 +70,6 @@ SPIRVSubtarget::SPIRVSubtarget(const Triple &TT, const std::string &CPU, SPIRVVersion = VersionTuple(1, 3); break; case Triple::SPIRVSubArch_v14: - default: SPIRVVersion = VersionTuple(1, 4); break; case Triple::SPIRVSubArch_v15: @@ -79,13 +78,19 @@ SPIRVSubtarget::SPIRVSubtarget(const Triple &TT, const std::string &CPU, case Triple::SPIRVSubArch_v16: SPIRVVersion = VersionTuple(1, 6); break; + default: + if (TT.getVendor() == Triple::AMD) + SPIRVVersion = VersionTuple(1, 6); + else + SPIRVVersion = VersionTuple(1, 4); } OpenCLVersion = VersionTuple(2, 2); // Set the environment based on the target triple. if (TargetTriple.getOS() == Triple::Vulkan) Env = Shader; - else if (TargetTriple.getEnvironment() == Triple::OpenCL) + else if (TargetTriple.getEnvironment() == Triple::OpenCL || + TargetTriple.getVendor() == Triple::AMD) Env = Kernel; else Env = Unknown; @@ -93,6 +98,8 @@ SPIRVSubtarget::SPIRVSubtarget(const Triple &TT, const std::string &CPU, // Set the default extensions based on the target triple. if (TargetTriple.getVendor() == Triple::Intel) Extensions.insert(SPIRV::Extension::SPV_INTEL_function_pointers); + if (TargetTriple.getVendor() == Triple::AMD) + Extensions = SPIRVExtensionsParser::getValidExtensions(TargetTriple); // The order of initialization is important. 
initAvailableExtensions(Extensions); diff --git a/llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp b/llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp index 5ba035682238b..2951a4bc695e2 100644 --- a/llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp @@ -244,7 +244,8 @@ static cl::opt<bool> SPVEnableNonSemanticDI( cl::Optional, cl::init(false)); void SPIRVPassConfig::addPreEmitPass() { - if (SPVEnableNonSemanticDI) { + if (SPVEnableNonSemanticDI || + getSPIRVTargetMachine().getTargetTriple().getVendor() == Triple::AMD) { addPass(createSPIRVEmitNonSemanticDIPass(&getTM<SPIRVTargetMachine>())); } } diff --git a/llvm/test/CodeGen/SPIRV/debug-info/debug-type-pointer.ll b/llvm/test/CodeGen/SPIRV/debug-info/debug-type-pointer.ll index ec4884ff643cb..3e0d0cc4cd8e2 100644 --- a/llvm/test/CodeGen/SPIRV/debug-info/debug-type-pointer.ll +++ b/llvm/test/CodeGen/SPIRV/debug-info/debug-type-pointer.ll @@ -1,7 +1,9 @@ ; RUN: llc --verify-machineinstrs --spv-emit-nonsemantic-debug-info --spirv-ext=+SPV_KHR_non_semantic_info --print-after=spirv-nonsemantic-debug-info -O0 -mtriple=spirv64-unknown-unknown %s -o - 2>&1 | FileCheck %s --check-prefix=CHECK-MIR +; RUN: llc --verify-machineinstrs --print-after=spirv-nonsemantic-debug-info -O0 -mtriple=spirv64-amd-amdhsa %s -o - 2>&1 | FileCheck %s --check-prefix=CHECK-MIR ; RUN: llc --verify-machineinstrs --spv-emit-nonsemantic-debug-info --spirv-ext=+SPV_KHR_non_semantic_info -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV +; RUN: llc --verify-machineinstrs -O0 -mtriple=spirv64-amd-amdhsa %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV ; RUN: llc --verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_KHR_non_semantic_info %s -o - | FileCheck %s --check-prefix=CHECK-OPTION -; TODO(#109287): When type is void * the spirv-val raises an error when DebugInfoNone is set as <id> Base Type argument of DebugTypePointer. +; TODO(#109287): When type is void * the spirv-val raises an error when DebugInfoNone is set as <id> Base Type argument of DebugTypePointer. 
; DISABLED: %if spirv-tools %{ llc --verify-machineinstrs --spv-emit-nonsemantic-debug-info --spirv-ext=+SPV_KHR_non_semantic_info -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; CHECK-MIR-DAG: [[i32type:%[0-9]+\:type]] = OpTypeInt 32, 0 diff --git a/llvm/test/CodeGen/SPIRV/extensions/SPV_EXT_optnone.ll b/llvm/test/CodeGen/SPIRV/extensions/SPV_EXT_optnone.ll index b1a555a52f40d..6b4e35e997124 100644 --- a/llvm/test/CodeGen/SPIRV/extensions/SPV_EXT_optnone.ll +++ b/llvm/test/CodeGen/SPIRV/extensions/SPV_EXT_optnone.ll @@ -7,6 +7,8 @@ ; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_EXT_optnone,+SPV_INTEL_optnone %s -o - | FileCheck %s --check-prefixes=CHECK-TWO-EXTENSIONS ; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=all %s -o - | FileCheck %s --check-prefixes=CHECK-ALL-EXTENSIONS +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-amd-amdhsa %s -o - | FileCheck %s --check-prefixes=CHECK-ALL-EXTENSIONS + ; CHECK-EXTENSION: OpCapability OptNoneEXT ; CHECK-EXTENSION: OpExtension "SPV_EXT_optnone" ; CHECK-NO-EXTENSION-NOT: OpCapability OptNoneINTEL diff --git a/llvm/test/CodeGen/SPIRV/extensions/enable-all-extensions.ll b/llvm/test/CodeGen/SPIRV/extensions/enable-all-extensions.ll index f745794e11de1..15905dd1894e2 100644 --- a/llvm/test/CodeGen/SPIRV/extensions/enable-all-extensions.ll +++ b/llvm/test/CodeGen/SPIRV/extensions/enable-all-extensions.ll @@ -1,4 +1,5 @@ ; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown --spirv-ext=all %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-amd-amdhsa %s -o - | FileCheck %s define i6 @getConstantI6() { ret i6 2 diff --git a/llvm/test/CodeGen/SPIRV/physical-layout/generator-magic-number.ll b/llvm/test/CodeGen/SPIRV/physical-layout/generator-magic-number.ll index afffd9e69b454..11e7d006c5ecf 100644 --- a/llvm/test/CodeGen/SPIRV/physical-layout/generator-magic-number.ll +++ b/llvm/test/CodeGen/SPIRV/physical-layout/generator-magic-number.ll @@ -1,4 +1,6 @@ ; REQUIRES: spirv-tools ; RUN: llc -O0 -mtriple=spirv-unknown-unknown %s -o - --filetype=obj | spirv-dis | FileCheck %s +; RUN: llc -O0 -mtriple=spirv64-amd-amdhsa %s -o - --filetype=obj | spirv-dis | FileCheck --check-prefix=AMDGCNSPIRV %s ; CHECK: Generator: {{.*}}{{43|LLVM SPIR-V Backend}}{{.*}} +; AMDGCNSPIRV: Generator: {{.*}}{{65535|LLVM SPIR-V Backend}}{{.*}} diff --git a/llvm/test/CodeGen/SPIRV/physical-layout/spirv-version.ll b/llvm/test/CodeGen/SPIRV/physical-layout/spirv-version.ll index 686c1e97257ad..49ee9931d1126 100644 --- a/llvm/test/CodeGen/SPIRV/physical-layout/spirv-version.ll +++ b/llvm/test/CodeGen/SPIRV/physical-layout/spirv-version.ll @@ -6,6 +6,7 @@ ; RUN: llc -O0 -mtriple=spirv64v1.4-unknown-unknown %s -o - --filetype=obj | spirv-dis | FileCheck %s --check-prefix=CHECK-SPIRV14 ; RUN: llc -O0 -mtriple=spirv64v1.5-unknown-unknown %s -o - --filetype=obj | spirv-dis | FileCheck %s --check-prefix=CHECK-SPIRV15 ; RUN: llc -O0 -mtriple=spirv64v1.6-unknown-unknown %s -o - --filetype=obj | spirv-dis | FileCheck %s --check-prefix=CHECK-SPIRV16 +; RUN: llc -O0 -mtriple=spirv64-amd-amdhsa %s -o - --filetype=obj | spirv-dis | FileCheck %s --check-prefix=AMDGCNSPIRV ; CHECK-SPIRV10: Version: 1.0 ; CHECK-SPIRV11: Version: 1.1 @@ -14,3 +15,4 @@ ; CHECK-SPIRV14: Version: 1.4 ; CHECK-SPIRV15: Version: 1.5 ; CHECK-SPIRV16: Version: 1.6 +; AMDGCNSPIRV: Version: 1.6 From 747050bcceca18d32dc1140461984ec2c30ae96a Mon Sep 17 00:00:00 2001 From: Dmitry 
Chigarev <dmitry.chigarev@intel.com> Date: Tue, 4 Nov 2025 13:52:23 +0100 Subject: [PATCH 159/313] [MLIR][XeGPU][VectorToXeGPU] Lower vector.load/store/transfer_read/transfer_write to new offsets syntax (#162095) Changes the `VectorToXeGPU` pass to generate `xegpu.load_nd/store_nd` ops using the new syntax, where offsets are specified at the load/store op level. ```mlir // from this %desc = xegpu.create_nd_tdesc %src[%off1, %off2]: memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16> %res = xegpu.load_nd %desc : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16> // to this %desc = xegpu.create_nd_tdesc %src: memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16> %res = xegpu.load_nd %desc[%off1, %off2] : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16> ``` In order to support cases with dimension reduction at the `create_nd_tdesc` level (e.g. `memref<8x8x16xf16> -> tensor_desc<8x16xf16>`), it was decided to insert a memref.subview that collapses the source shape to 2D, for example: ```mlir // input: %0 = vector.load %source[%off0, %off1, %off2] : memref<8x16x32xf32>, vector<8x16xf32> // --vector-to-xegpu (old) %tdesc = xegpu.create_nd_tdesc %source[%off0, %off1, %off2] : memref<8x16x32xf32> -> tdesc<8x32xf32> %vec = xegpu.load_nd %tdesc // --vector-to-xegpu (new) %collapsed = memref.subview %source[%off0, 0, 0] [1, 16, 32] [1, 1, 1] : memref<8x16x32xf32> -> memref<16x32xf32, strided<[32, 1], offset: ?>> %tdesc = xegpu.create_nd_tdesc %collapsed : memref<16x32xf32, ...> -> tdesc<8x32xf32> %vec = xegpu.load_nd %tdesc[%off1, %off2] ``` <details><summary>Why do we need to change that?</summary> ```mlir // reduce dim and apply all 3 offsets at load_nd %desc = xegpu.create_nd_tdesc %source : memref<8x16x32xf32> -> !xegpu.tensor_desc<16x32xf32> // error: xegpu.load_nd len(offsets) != desc.rank %res = xegpu.load_nd %desc[%off, %off, %off] : !xegpu.tensor_desc<16x32xf32> -> vector<8x16xf32> ``` </details> --------- Signed-off-by: dchigarev <dmitry.chigarev@intel.com> --- .../VectorToXeGPU/VectorToXeGPU.cpp | 174 +++++++++++------- mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp | 12 +- .../VectorToXeGPU/load-to-xegpu.mlir | 39 ++-- .../VectorToXeGPU/store-to-xegpu.mlir | 39 ++-- .../VectorToXeGPU/transfer-read-to-xegpu.mlir | 53 +++--- .../transfer-write-to-xegpu.mlir | 51 +++-- 6 files changed, 196 insertions(+), 172 deletions(-) diff --git a/mlir/lib/Conversion/VectorToXeGPU/VectorToXeGPU.cpp b/mlir/lib/Conversion/VectorToXeGPU/VectorToXeGPU.cpp index 91c1aa55fdb4e..abea84f6b01fe 100644 --- a/mlir/lib/Conversion/VectorToXeGPU/VectorToXeGPU.cpp +++ b/mlir/lib/Conversion/VectorToXeGPU/VectorToXeGPU.cpp @@ -97,57 +97,23 @@ static LogicalResult transferPreconditions(PatternRewriter &rewriter, return success(); } -static xegpu::CreateNdDescOp -createNdDescriptor(PatternRewriter &rewriter, Location loc, - xegpu::TensorDescType descType, TypedValue<MemRefType> src, - Operation::operand_range offsets) { +static xegpu::CreateNdDescOp createNdDescriptor(PatternRewriter &rewriter, + Location loc, + xegpu::TensorDescType descType, + TypedValue<MemRefType> src) { MemRefType srcTy = src.getType(); auto [strides, offset] = srcTy.getStridesAndOffset(); xegpu::CreateNdDescOp ndDesc; if (srcTy.hasStaticShape()) { - ndDesc = xegpu::CreateNdDescOp::create(rewriter, loc, descType, src, - getAsOpFoldResult(offsets)); + ndDesc = xegpu::CreateNdDescOp::create(rewriter, loc, descType, src); } else { // In case of any dynamic shapes, source's shape and strides have to be // explicitly provided.
- SmallVector<Value> sourceDims; - unsigned srcRank = srcTy.getRank(); - for (unsigned i = 0; i < srcRank; ++i) - sourceDims.push_back(memref::DimOp::create(rewriter, loc, src, i)); - - SmallVector<int64_t> constOffsets; - SmallVector<Value> dynOffsets; - for (Value offset : offsets) { - std::optional<int64_t> staticVal = getConstantIntValue(offset); - if (!staticVal) - dynOffsets.push_back(offset); - constOffsets.push_back(staticVal.value_or(ShapedType::kDynamic)); - } - - SmallVector<Value> dynShapes; - for (auto [idx, shape] : llvm::enumerate(srcTy.getShape())) { - if (shape == ShapedType::kDynamic) - dynShapes.push_back(sourceDims[idx]); - } - - // Compute strides in reverse order. - SmallVector<Value> dynStrides; - Value accStride = arith::ConstantIndexOp::create(rewriter, loc, 1); - // Last stride is guaranteed to be static and unit. - for (int i = static_cast<int>(strides.size()) - 2; i >= 0; --i) { - accStride = - arith::MulIOp::create(rewriter, loc, accStride, sourceDims[i + 1]); - if (strides[i] == ShapedType::kDynamic) - dynStrides.push_back(accStride); - } - std::reverse(dynStrides.begin(), dynStrides.end()); - - ndDesc = xegpu::CreateNdDescOp::create( - rewriter, loc, descType, src, dynOffsets, dynShapes, dynStrides, - DenseI64ArrayAttr::get(rewriter.getContext(), constOffsets), - DenseI64ArrayAttr::get(rewriter.getContext(), srcTy.getShape()), - DenseI64ArrayAttr::get(rewriter.getContext(), strides)); + auto meta = memref::ExtractStridedMetadataOp::create(rewriter, loc, src); + ndDesc = xegpu::CreateNdDescOp::create(rewriter, loc, descType, src, + meta.getConstifiedMixedSizes(), + meta.getConstifiedMixedStrides()); } return ndDesc; @@ -392,6 +358,62 @@ static Value computeOffsets(PatternRewriter &rewriter, OpType gatScatOp, .getResult(); } +// Collapses shapes of an nD memref to the target rank while applying offsets for +// the collapsed dimensions. Returns the new memref value and the remaining +// offsets for the last targetRank dimensions.
For example: +// input: %memref = memref<2x4x8x32xf32>, offsets=[%i0, %i1, %i2, %i3], +// output: %memref[%i0, %i1, 0, 0] -> memref<8x32xf32>, offsets: [%i2, %i3] +static std::pair<Value, SmallVector<OpFoldResult>> +convertMemrefAndOffsetsToTargetRank(PatternRewriter &rewriter, Location loc, + Value memref, + SmallVector<OpFoldResult> offsets, + int64_t targetRank) { + auto memrefType = cast<MemRefType>(memref.getType()); + unsigned rank = memrefType.getRank(); + + if (rank <= targetRank) + return {memref, offsets}; + + int64_t numCombinedDims = rank - targetRank; + SmallVector<OpFoldResult> subviewOffsets; + SmallVector<OpFoldResult> subviewSizes; + SmallVector<OpFoldResult> subviewStrides; + + // For the combined dimensions: use the provided offsets, size=1, stride=1 + for (unsigned i = 0; i < numCombinedDims; ++i) { + subviewOffsets.push_back(offsets[i]); + subviewSizes.push_back(rewriter.getI64IntegerAttr(1)); + subviewStrides.push_back(rewriter.getI64IntegerAttr(1)); + } + + // For the last targetRank dimensions: offset=0, use full size, stride=1 + SmallVector<int64_t> resultShape; + auto originalShape = memrefType.getShape(); + auto meta = memref::ExtractStridedMetadataOp::create(rewriter, loc, memref); + for (unsigned i = numCombinedDims; i < rank; ++i) { + subviewOffsets.push_back(rewriter.getI64IntegerAttr(0)); + if (ShapedType::isDynamic(originalShape[i])) { + subviewSizes.push_back(meta.getSizes()[i]); + resultShape.push_back(ShapedType::kDynamic); + } else { + subviewSizes.push_back(rewriter.getI64IntegerAttr(originalShape[i])); + resultShape.push_back(originalShape[i]); + } + subviewStrides.push_back(rewriter.getI64IntegerAttr(1)); + } + + auto resultType = memref::SubViewOp::inferRankReducedResultType( + resultShape, memrefType, subviewOffsets, subviewSizes, subviewStrides); + auto subviewOp = + memref::SubViewOp::create(rewriter, loc, resultType, memref, + subviewOffsets, subviewSizes, subviewStrides); + + // Return the remaining offsets for the last targetRank dimensions + SmallVector<OpFoldResult> newOffsets(offsets.begin() + numCombinedDims, + offsets.end()); + return {subviewOp.getResult(), newOffsets}; +} + template < typename OpType, typename = std::enable_if_t<llvm::is_one_of< @@ -523,18 +545,19 @@ struct TransferReadLowering : public OpRewritePattern<vector::TransferReadOp> { descShape, elementType, /*array_length=*/1, /*boundary_check=*/isOutOfBounds, xegpu::MemorySpace::Global); - xegpu::CreateNdDescOp ndDesc = - createNdDescriptor(rewriter, loc, descType, - dyn_cast<TypedValue<MemRefType>>(readOp.getBase()), - readOp.getIndices()); - DenseI64ArrayAttr transposeAttr = !isTransposeLoad ? nullptr : DenseI64ArrayAttr::get(rewriter.getContext(), ArrayRef<int64_t>{1, 0}); + auto [src, indices] = convertMemrefAndOffsetsToTargetRank( + rewriter, loc, readOp.getBase(), getAsOpFoldResult(readOp.getIndices()), + vecTy.getRank()); // By default, no specific caching policy is assigned. 
xegpu::CachePolicyAttr hint = nullptr; - auto loadOp = xegpu::LoadNdOp::create(rewriter, loc, vecTy, ndDesc, + xegpu::CreateNdDescOp ndDesc = createNdDescriptor( + rewriter, loc, descType, dyn_cast<TypedValue<MemRefType>>(src)); + + auto loadOp = xegpu::LoadNdOp::create(rewriter, loc, vecTy, ndDesc, indices, /*packed=*/nullptr, transposeAttr, /*l1_hint=*/hint, /*l2_hint=*/hint, /*l3_hint=*/hint); @@ -575,21 +598,23 @@ struct TransferWriteLowering if (!map.isMinorIdentity()) return rewriter.notifyMatchFailure(writeOp, "Expects identity map"); + auto [src, indices] = convertMemrefAndOffsetsToTargetRank( + rewriter, loc, writeOp.getBase(), + getAsOpFoldResult(writeOp.getIndices()), vecTy.getRank()); + auto descType = xegpu::TensorDescType::get( vecTy.getShape(), vecTy.getElementType(), /*array_length=*/1, /*boundary_check=*/writeOp.hasOutOfBoundsDim(), xegpu::MemorySpace::Global); - xegpu::CreateNdDescOp ndDesc = - createNdDescriptor(rewriter, loc, descType, - dyn_cast<TypedValue<MemRefType>>(writeOp.getBase()), - writeOp.getIndices()); - // By default, no specific caching policy is assigned. xegpu::CachePolicyAttr hint = nullptr; - auto storeOp = - xegpu::StoreNdOp::create(rewriter, loc, writeOp.getVector(), ndDesc, - /*l1_hint=*/hint, - /*l2_hint=*/hint, /*l3_hint=*/hint); + xegpu::CreateNdDescOp ndDesc = createNdDescriptor( + rewriter, loc, descType, dyn_cast<TypedValue<MemRefType>>(src)); + + auto storeOp = xegpu::StoreNdOp::create(rewriter, loc, writeOp.getVector(), + ndDesc, indices, + /*l1_hint=*/hint, + /*l2_hint=*/hint, /*l3_hint=*/hint); rewriter.replaceOp(writeOp, storeOp); return success(); @@ -674,19 +699,24 @@ struct LoadLowering : public OpRewritePattern<vector::LoadOp> { // Boundary check is available only for block instructions. bool boundaryCheck = vecTy.getRank() > 1; + // By default, no specific caching policy is assigned. + xegpu::CachePolicyAttr hint = nullptr; + + auto [src, indices] = convertMemrefAndOffsetsToTargetRank( + rewriter, loc, loadOp.getBase(), getAsOpFoldResult(loadOp.getIndices()), + vecTy.getRank()); auto descType = xegpu::TensorDescType::get( vecTy.getShape(), vecTy.getElementType(), /*array_length=*/1, boundaryCheck, xegpu::MemorySpace::Global); - xegpu::CreateNdDescOp ndDesc = createNdDescriptor( - rewriter, loc, descType, loadOp.getBase(), loadOp.getIndices()); - // By default, no specific caching policy is assigned. - xegpu::CachePolicyAttr hint = nullptr; - auto loadNdOp = xegpu::LoadNdOp::create( - rewriter, loc, vecTy, ndDesc, /*packed=*/nullptr, /*transpose=*/nullptr, - /*l1_hint=*/hint, - /*l2_hint=*/hint, /*l3_hint=*/hint); + xegpu::CreateNdDescOp ndDesc = createNdDescriptor( + rewriter, loc, descType, dyn_cast<TypedValue<MemRefType>>(src)); + auto loadNdOp = + xegpu::LoadNdOp::create(rewriter, loc, vecTy, ndDesc, indices, + /*packed=*/nullptr, /*transpose=*/nullptr, + /*l1_hint=*/hint, + /*l2_hint=*/hint, /*l3_hint=*/hint); rewriter.replaceOp(loadOp, loadNdOp); return success(); @@ -708,18 +738,24 @@ struct StoreLowering : public OpRewritePattern<vector::StoreOp> { // Boundary check is available only for block instructions. 
bool boundaryCheck = vecTy.getRank() > 1; + auto [src, indices] = convertMemrefAndOffsetsToTargetRank( + rewriter, loc, storeOp.getBase(), + getAsOpFoldResult(storeOp.getIndices()), vecTy.getRank()); + auto descType = xegpu::TensorDescType::get( vecTy.getShape(), vecTy.getElementType(), /*array_length=*/1, boundaryCheck, xegpu::MemorySpace::Global); - xegpu::CreateNdDescOp ndDesc = createNdDescriptor( - rewriter, loc, descType, storeOp.getBase(), storeOp.getIndices()); // By default, no specific caching policy is assigned. xegpu::CachePolicyAttr hint = nullptr; + xegpu::CreateNdDescOp ndDesc = createNdDescriptor( + rewriter, loc, descType, dyn_cast<TypedValue<MemRefType>>(src)); + auto storeNdOp = - xegpu::StoreNdOp::create(rewriter, loc, vector, ndDesc, + xegpu::StoreNdOp::create(rewriter, loc, vector, ndDesc, indices, /*l1_hint=*/hint, /*l2_hint=*/hint, /*l3_hint=*/hint); + rewriter.replaceOp(storeOp, storeNdOp); return success(); diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp index 7b6c4b6c2c813..c8f5c86c03686 100644 --- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp +++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp @@ -280,8 +280,10 @@ void CreateNdDescOp::build(OpBuilder &builder, OperationState &state, auto [memrefStrides, _] = memrefTy.getStridesAndOffset(); // if shape and strides are from Memref, we don't need attributes for them - // to keep the IR print clean. - if (staticShape == memrefShape && staticStrides == memrefStrides) { + // to keep the IR print clean (only do so for the full-static case, otherwise + // the printer would fail trying to print an empty array-attr). + if (staticShape == memrefShape && staticStrides == memrefStrides && + dynamicShape.empty() && dynamicStrides.empty()) { staticShapeAttr = DenseI64ArrayAttr(); staticStridesAttr = DenseI64ArrayAttr(); } @@ -342,8 +344,10 @@ void CreateNdDescOp::build(OpBuilder &builder, OperationState &state, auto [memrefStrides, _] = memrefTy.getStridesAndOffset(); // if shape and strides are from Memref, we don't need attributes for them - // to keep the IR print clean. - if (staticShape == memrefShape && staticStrides == memrefStrides) { + // to keep the IR print clean (only do so for the full-static case, otherwise + // the printer would fail trying to print an empty array-attr).
+ if (staticShape == memrefShape && staticStrides == memrefStrides && + dynamicShape.empty() && dynamicStrides.empty()) { staticShapeAttr = DenseI64ArrayAttr(); staticStridesAttr = DenseI64ArrayAttr(); } diff --git a/mlir/test/Conversion/VectorToXeGPU/load-to-xegpu.mlir b/mlir/test/Conversion/VectorToXeGPU/load-to-xegpu.mlir index 9908205f07c92..ae5141db16c09 100644 --- a/mlir/test/Conversion/VectorToXeGPU/load-to-xegpu.mlir +++ b/mlir/test/Conversion/VectorToXeGPU/load-to-xegpu.mlir @@ -9,11 +9,12 @@ func.func @load_1D_vector(%source: memref<8x16x32xf32>, %offset: index) -> vecto // CHECK-LABEL: @load_1D_vector( // CHECK-SAME: %[[SRC:.+]]: memref<8x16x32xf32>, // CHECK-SAME: %[[OFFSET:.+]]: index +// CHECK: %[[COLLAPSED:.+]] = memref.subview %[[SRC]][%[[OFFSET]], %[[OFFSET]], 0] // CHECK: %[[DESC:.+]] = xegpu.create_nd_tdesc -// CHECK-SAME: %[[SRC]][%[[OFFSET]], %[[OFFSET]], %[[OFFSET]]] -// CHECK-SAME: memref<8x16x32xf32> -> !xegpu.tensor_desc<8xf32, +// CHECK-SAME: %[[COLLAPSED]] +// CHECK-SAME: memref<32xf32, strided<[1], offset: ?>> -> !xegpu.tensor_desc<8xf32, // CHECK-SAME: boundary_check = false -// CHECK: %[[VEC:.+]] = xegpu.load_nd %[[DESC]]{{.*}}-> vector<8xf32> +// CHECK: %[[VEC:.+]] = xegpu.load_nd %[[DESC]][%[[OFFSET]]]{{.*}}-> vector<8xf32> // CHECK: return %[[VEC]] // ----- @@ -28,35 +29,29 @@ func.func @load_2D_vector(%source: memref<8x16x32xf32>, // CHECK-LABEL: @load_2D_vector( // CHECK-SAME: %[[SRC:.+]]: memref<8x16x32xf32>, // CHECK-SAME: %[[OFFSET:.+]]: index +// CHECK: %[[COLLAPSED:.+]] = memref.subview %[[SRC]][%[[OFFSET]], 0, 0] // CHECK: %[[DESC:.+]] = xegpu.create_nd_tdesc -// CHECK-SAME: %[[SRC]][%[[OFFSET]], %[[OFFSET]], %[[OFFSET]]] -// CHECK-SAME: memref<8x16x32xf32> -> !xegpu.tensor_desc<8x16xf32> -// CHECK: %[[VEC:.+]] = xegpu.load_nd %[[DESC]]{{.*}}-> vector<8x16xf32> +// CHECK-SAME: %[[COLLAPSED]] +// CHECK-SAME: memref<16x32xf32, strided<[32, 1], offset: ?>> -> !xegpu.tensor_desc<8x16xf32> +// CHECK: %[[VEC:.+]] = xegpu.load_nd %[[DESC]][%[[OFFSET]], %[[OFFSET]]]{{.*}}-> vector<8x16xf32> // CHECK: return %[[VEC]] // ----- func.func @load_dynamic_source(%source: memref<?x?x?xf32>, - %offset: index) -> vector<8x16xf32> { - %0 = vector.load %source[%offset, %offset, %offset] + %i: index, %j: index, %k: index) -> vector<8x16xf32> { + %0 = vector.load %source[%i, %j, %k] : memref<?x?x?xf32>, vector<8x16xf32> return %0 : vector<8x16xf32> } // CHECK-LABEL: @load_dynamic_source( // CHECK-SAME: %[[SRC:.+]]: memref<?x?x?xf32>, -// CHECK-SAME: %[[OFFSET:.+]]: index -// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index -// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index -// CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index -// CHECK-DAG: %[[DIM_0:.+]] = memref.dim %[[SRC]], %[[C0]] -// CHECK-DAG: %[[DIM_1:.+]] = memref.dim %[[SRC]], %[[C1]] -// CHECK-DAG: %[[DIM_2:.+]] = memref.dim %[[SRC]], %[[C2]] -// CHECK: %[[DIM_0_STRIDE:.+]] = arith.muli %[[DIM_2]], %[[DIM_1]] -// CHECK: %[[DESC:.+]] = xegpu.create_nd_tdesc %[[SRC]][%[[OFFSET]], %[[OFFSET]], %[[OFFSET]]] -// CHECK-SAME: , shape : [%[[DIM_0]], %[[DIM_1]], %[[DIM_2]]], strides : [%[[DIM_0_STRIDE]], %[[DIM_2]], 1] -// CHECK-SAME: memref<?x?x?xf32> -> !xegpu.tensor_desc<8x16xf32> -// CHECK: %[[VEC:.+]] = xegpu.load_nd %[[DESC]]{{.*}}-> vector<8x16xf32> +// CHECK-SAME: %[[OFF0:.+]]: index, %[[OFF1:.+]]: index, %[[OFF2:.+]]: index +// CHECK: %[[COLLAPSED:.+]] = memref.subview %[[SRC]][%[[OFF0]], 0, 0] +// CHECK: {{.*}} %[[SIZES:.+]]:2, %[[STRIDES:.+]]:2 = memref.extract_strided_metadata %[[COLLAPSED]] +// CHECK: 
%[[DESC:.+]] = xegpu.create_nd_tdesc %[[COLLAPSED]] +// CHECK: %[[VEC:.+]] = xegpu.load_nd %[[DESC]][%[[OFF1]], %[[OFF2]]]{{.*}}-> vector<8x16xf32> // CHECK: return %[[VEC]] // ----- @@ -72,9 +67,9 @@ func.func @load_out_of_bounds(%source: memref<7x15xf32>, // CHECK-SAME: %[[SRC:.+]]: memref<7x15xf32>, // CHECK-SAME: %[[OFFSET:.+]]: index // CHECK: %[[DESC:.+]] = xegpu.create_nd_tdesc -// CHECK-SAME: %[[SRC]][%[[OFFSET]], %[[OFFSET]]] +// CHECK-SAME: %[[SRC]] // CHECK-SAME: memref<7x15xf32> -> !xegpu.tensor_desc<8x16xf32> -// CHECK: %[[VEC:.+]] = xegpu.load_nd %[[DESC]]{{.*}}-> vector<8x16xf32> +// CHECK: %[[VEC:.+]] = xegpu.load_nd %[[DESC]][%[[OFFSET]], %[[OFFSET]]]{{.*}}-> vector<8x16xf32> // CHECK: return %[[VEC]] // ----- diff --git a/mlir/test/Conversion/VectorToXeGPU/store-to-xegpu.mlir b/mlir/test/Conversion/VectorToXeGPU/store-to-xegpu.mlir index 2c498dcc2a071..1a10d917623cc 100644 --- a/mlir/test/Conversion/VectorToXeGPU/store-to-xegpu.mlir +++ b/mlir/test/Conversion/VectorToXeGPU/store-to-xegpu.mlir @@ -11,11 +11,12 @@ func.func @store_1D_vector(%vec: vector<8xf32>, // CHECK-SAME: %[[VEC:.+]]: vector<8xf32>, // CHECK-SAME: %[[SRC:.+]]: memref<8x16x32xf32>, // CHECK-SAME: %[[OFFSET:.+]]: index +// CHECK: %[[COLLAPSED:.+]] = memref.subview %[[SRC]][%[[OFFSET]], %[[OFFSET]], 0] // CHECK: %[[DESC:.+]] = xegpu.create_nd_tdesc -// CHECK-SAME: %[[SRC]][%[[OFFSET]], %[[OFFSET]], %[[OFFSET]]] -// CHECK-SAME: memref<8x16x32xf32> -> !xegpu.tensor_desc<8xf32, +// CHECK-SAME: %[[COLLAPSED]] +// CHECK-SAME: memref<32xf32, strided<[1], offset: ?>> -> !xegpu.tensor_desc<8xf32, // CHECK-SAME: boundary_check = false -// CHECK: xegpu.store_nd %[[VEC]], %[[DESC]] : vector<8xf32> +// CHECK: xegpu.store_nd %[[VEC]], %[[DESC]][%[[OFFSET]]] : vector<8xf32> // ----- @@ -30,16 +31,17 @@ func.func @store_2D_vector(%vec: vector<8x16xf32>, // CHECK-SAME: %[[VEC:.+]]: vector<8x16xf32>, // CHECK-SAME: %[[SRC:.+]]: memref<8x16x32xf32>, // CHECK-SAME: %[[OFFSET:.+]]: index +// CHECK: %[[COLLAPSED:.+]] = memref.subview %[[SRC]][%[[OFFSET]], 0, 0] // CHECK: %[[DESC:.+]] = xegpu.create_nd_tdesc -// CHECK-SAME: %[[SRC]][%[[OFFSET]], %[[OFFSET]], %[[OFFSET]]] -// CHECK-SAME: memref<8x16x32xf32> -> !xegpu.tensor_desc<8x16xf32> -// CHECK: xegpu.store_nd %[[VEC]], %[[DESC]] : vector<8x16xf32> +// CHECK-SAME: %[[COLLAPSED]] +// CHECK-SAME: memref<16x32xf32, strided<[32, 1], offset: ?>> -> !xegpu.tensor_desc<8x16xf32> +// CHECK: xegpu.store_nd %[[VEC]], %[[DESC]][%[[OFFSET]], %[[OFFSET]]] : vector<8x16xf32> // ----- func.func @store_dynamic_source(%vec: vector<8x16xf32>, - %source: memref<?x?x?xf32>, %offset: index) { - vector.store %vec, %source[%offset, %offset, %offset] + %source: memref<?x?x?xf32>, %i: index, %j: index, %k: index) { + vector.store %vec, %source[%i, %j, %k] : memref<?x?x?xf32>, vector<8x16xf32> return } @@ -47,18 +49,11 @@ func.func @store_dynamic_source(%vec: vector<8x16xf32>, // CHECK-LABEL: @store_dynamic_source( // CHECK-SAME: %[[VEC:.+]]: vector<8x16xf32>, // CHECK-SAME: %[[SRC:.+]]: memref<?x?x?xf32>, -// CHECK-SAME: %[[OFFSET:.+]]: index -// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index -// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index -// CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index -// CHECK-DAG: %[[DIM_0:.+]] = memref.dim %[[SRC]], %[[C0]] -// CHECK-DAG: %[[DIM_1:.+]] = memref.dim %[[SRC]], %[[C1]] -// CHECK-DAG: %[[DIM_2:.+]] = memref.dim %[[SRC]], %[[C2]] -// CHECK: %[[DIM_0_STRIDE:.+]] = arith.muli %[[DIM_2]], %[[DIM_1]] -// CHECK: %[[DESC:.+]] = xegpu.create_nd_tdesc 
%[[SRC]][%[[OFFSET]], %[[OFFSET]], %[[OFFSET]]] -// CHECK-SAME: , shape : [%[[DIM_0]], %[[DIM_1]], %[[DIM_2]]], strides : [%[[DIM_0_STRIDE]], %[[DIM_2]], 1] -// CHECK-SAME: memref<?x?x?xf32> -> !xegpu.tensor_desc<8x16xf32> -// CHECK: xegpu.store_nd %[[VEC]], %[[DESC]] : vector<8x16xf32> +// CHECK-SAME: %[[OFF0:.+]]: index, %[[OFF1:.+]]: index, %[[OFF2:.+]]: index +// CHECK: %[[COLLAPSED:.+]] = memref.subview %[[SRC]][%[[OFF0]], 0, 0] +// CHECK: {{.*}} %[[SIZES:.+]]:2, %[[STRIDES:.+]]:2 = memref.extract_strided_metadata %[[COLLAPSED]] +// CHECK: %[[DESC:.+]] = xegpu.create_nd_tdesc %[[COLLAPSED]] +// CHECK: xegpu.store_nd %[[VEC]], %[[DESC]][%[[OFF1]], %[[OFF2]]] : vector<8x16xf32> // ----- @@ -74,9 +69,9 @@ func.func @store_out_of_bounds(%vec: vector<8x16xf32>, // CHECK-SAME: %[[SRC:.+]]: memref<7x64xf32>, // CHECK-SAME: %[[OFFSET:.+]]: index // CHECK: %[[DESC:.+]] = xegpu.create_nd_tdesc -// CHECK-SAME: %[[SRC]][%[[OFFSET]], %[[OFFSET]]] +// CHECK-SAME: %[[SRC]] // CHECK-SAME: memref<7x64xf32> -> !xegpu.tensor_desc<8x16xf32> -// CHECK: xegpu.store_nd %[[VEC]], %[[DESC]] : vector<8x16xf32> +// CHECK: xegpu.store_nd %[[VEC]], %[[DESC]][%[[OFFSET]], %[[OFFSET]]] : vector<8x16xf32> // ----- diff --git a/mlir/test/Conversion/VectorToXeGPU/transfer-read-to-xegpu.mlir b/mlir/test/Conversion/VectorToXeGPU/transfer-read-to-xegpu.mlir index c4ca79af1bd9a..c87a5304babfe 100644 --- a/mlir/test/Conversion/VectorToXeGPU/transfer-read-to-xegpu.mlir +++ b/mlir/test/Conversion/VectorToXeGPU/transfer-read-to-xegpu.mlir @@ -12,11 +12,12 @@ gpu.func @load_1D_vector(%source: memref<8x16x32xf32>, %offset: index) -> vector // LOAD-ND-LABEL: @load_1D_vector( // LOAD-ND-SAME: %[[SRC:.+]]: memref<8x16x32xf32>, // LOAD-ND-SAME: %[[OFFSET:.+]]: index +// LOAD-ND: %[[COLLAPSED:.+]] = memref.subview %[[SRC]][%[[OFFSET]], %[[OFFSET]], 0] // LOAD-ND: %[[DESC:.+]] = xegpu.create_nd_tdesc -// LOAD-ND-SAME: %[[SRC]][%[[OFFSET]], %[[OFFSET]], %[[OFFSET]]] -// LOAD-ND-SAME: memref<8x16x32xf32> -> !xegpu.tensor_desc<8xf32, +// LOAD-ND-SAME: %[[COLLAPSED]] +// LOAD-ND-SAME: memref<32xf32, strided<[1], offset: ?>> -> !xegpu.tensor_desc<8xf32, // LOAD-ND-SAME: boundary_check = false -// LOAD-ND: %[[VEC:.+]] = xegpu.load_nd %[[DESC]]{{.*}}-> vector<8xf32> +// LOAD-ND: %[[VEC:.+]] = xegpu.load_nd %[[DESC]][%[[OFFSET]]]{{.*}}-> vector<8xf32> // LOAD-ND: return %[[VEC]] // LOAD-GATHER-LABEL: @load_1D_vector( @@ -46,11 +47,12 @@ gpu.func @load_2D_vector(%source: memref<8x16x32xf32>, // LOAD-ND-LABEL: @load_2D_vector( // LOAD-ND-SAME: %[[SRC:.+]]: memref<8x16x32xf32>, // LOAD-ND-SAME: %[[OFFSET:.+]]: index +// LOAD-ND: %[[COLLAPSED:.+]] = memref.subview %[[SRC]][%[[OFFSET]], 0, 0] // LOAD-ND: %[[DESC:.+]] = xegpu.create_nd_tdesc -// LOAD-ND-SAME: %[[SRC]][%[[OFFSET]], %[[OFFSET]], %[[OFFSET]]] -// LOAD-ND-SAME: memref<8x16x32xf32> -> !xegpu.tensor_desc<8x16xf32, +// LOAD-ND-SAME: %[[COLLAPSED]] +// LOAD-ND-SAME: memref<16x32xf32, strided<[32, 1], offset: ?>> -> !xegpu.tensor_desc<8x16xf32, // LOAD-ND-SAME: boundary_check = false -// LOAD-ND: %[[VEC:.+]] = xegpu.load_nd %[[DESC]]{{.*}}-> vector<8x16xf32> +// LOAD-ND: %[[VEC:.+]] = xegpu.load_nd %[[DESC]][%[[OFFSET]], %[[OFFSET]]]{{.*}}-> vector<8x16xf32> // LOAD-ND: return %[[VEC]] // LOAD-GATHER-LABEL: @load_2D_vector( @@ -83,9 +85,9 @@ gpu.func @load_zero_pad_out_of_bounds(%source: memref<32x64xf32>, // LOAD-ND-LABEL: @load_zero_pad_out_of_bounds( // LOAD-ND-SAME: %[[SRC:.+]]: memref<32x64xf32>, // LOAD-ND-SAME: %[[OFFSET:.+]]: index -// LOAD-ND: %[[DESC:.+]] = xegpu.create_nd_tdesc 
%[[SRC]][%[[OFFSET]], %[[OFFSET]]] +// LOAD-ND: %[[DESC:.+]] = xegpu.create_nd_tdesc %[[SRC]] // LOAD-ND-SAME: memref<32x64xf32> -> !xegpu.tensor_desc<8x16xf32> -// LOAD-ND: %[[VEC:.+]] = xegpu.load_nd %[[DESC]]{{.*}}-> vector<8x16xf32> +// LOAD-ND: %[[VEC:.+]] = xegpu.load_nd %[[DESC]][%[[OFFSET]], %[[OFFSET]]]{{.*}}-> vector<8x16xf32> // LOAD-ND: return %[[VEC]] // LOAD-GATHER-LABEL: @load_zero_pad_out_of_bounds( @@ -109,9 +111,9 @@ gpu.func @load_transposed(%source: memref<32x64xf32>, // LOAD-ND-SAME: %[[SRC:.+]]: memref<32x64xf32>, // LOAD-ND-SAME: %[[OFFSET1:.+]]: index, // LOAD-ND-SAME: %[[OFFSET2:.+]]: index -// LOAD-ND: %[[DESC:.+]] = xegpu.create_nd_tdesc %[[SRC]][%[[OFFSET1]], %[[OFFSET2]]] +// LOAD-ND: %[[DESC:.+]] = xegpu.create_nd_tdesc %[[SRC]] // LOAD-ND-SAME: memref<32x64xf32> -> !xegpu.tensor_desc<16x8xf32 -// LOAD-ND: %[[VEC:.+]] = xegpu.load_nd %[[DESC]] <{transpose = array<i64: 1, 0>}> +// LOAD-ND: %[[VEC:.+]] = xegpu.load_nd %[[DESC]][%[[OFFSET1]], %[[OFFSET2]]] <{transpose = array<i64: 1, 0>}> // LOAD-ND-SAME: -> vector<8x16xf32> // LOAD-ND: return %[[VEC]] @@ -143,16 +145,11 @@ gpu.func @load_dynamic_source(%source: memref<?x?x?xf32>, } // LOAD-ND-LABEL: @load_dynamic_source( // LOAD-ND-SAME: %[[SRC:.+]]: memref<?x?x?xf32>, -// LOAD-ND-SAME: %[[OFFSET:.+]]: index -// LOAD-ND: %[[C2:.+]] = arith.constant 2 : index -// LOAD-ND: %[[C1:.+]] = arith.constant 1 : index -// LOAD-ND: %[[C0:.+]] = arith.constant 0 : index -// LOAD-ND-DAG: %[[DIM_0:.+]] = memref.dim %[[SRC]], %[[C0]] -// LOAD-ND-DAG: %[[DIM_1:.+]] = memref.dim %[[SRC]], %[[C1]] -// LOAD-ND-DAG: %[[DIM_2:.+]] = memref.dim %[[SRC]], %[[C2]] -// LOAD-ND: %[[DIM_0_STRIDE:.+]] = arith.muli %[[DIM_2]], %[[DIM_1]] -// LOAD-ND: %[[DESC:.+]] = xegpu.create_nd_tdesc %[[SRC]][%[[OFFSET:.+]], %[[OFFSET:.+]], %[[OFFSET:.+]]] -// LOAD-ND: %[[VEC:.+]] = xegpu.load_nd %[[DESC]]{{.*}}-> vector<8x16xf32> +// LOAD-ND-SAME: %[[OFF0:.+]]: index, %[[OFF1:.+]]: index, %[[OFF2:.+]]: index +// LOAD-ND: %[[COLLAPSED:.+]] = memref.subview %[[SRC]][%[[OFF0]], 0, 0] +// LOAD-ND: {{.*}} %[[SIZES:.+]]:2, %[[STRIDES:.+]]:2 = memref.extract_strided_metadata %[[COLLAPSED]] +// LOAD-ND: %[[DESC:.+]] = xegpu.create_nd_tdesc %[[COLLAPSED]] +// LOAD-ND: %[[VEC:.+]] = xegpu.load_nd %[[DESC]][%[[OFF1]], %[[OFF2]]]{{.*}}-> vector<8x16xf32> // LOAD-ND: return %[[VEC]] @@ -184,10 +181,11 @@ gpu.func @load_dynamic_source2(%source: memref<?x8x16xf32>, } // LOAD-ND-LABEL: @load_dynamic_source2( -// LOAD-ND-DAG: %[[C0:.+]] = arith.constant 0 : index -// LOAD-ND-DAG: %[[DIM:.+]] = memref.dim %{{.*}}, %[[C0]] : memref<?x8x16xf32> -// LOAD-ND: %[[DESC:.+]] = xegpu.create_nd_tdesc %{{.*}}[%{{.*}}, %{{.*}}, %{{.*}}], shape : [%[[DIM]], 8, 16], strides : [128, 16, 1] : memref<?x8x16xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr<boundary_check = false>> -// LOAD-ND: %[[VEC:.+]] = xegpu.load_nd %[[DESC]] : !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr<boundary_check = false>> -> vector<8x16xf32> +// LOAD-ND-SAME: %[[SRC:.+]]: memref<?x8x16xf32>, +// LOAD-ND-SAME: %[[OFF0:.+]]: index, %[[OFF1:.+]]: index, %[[OFF2:.+]]: index +// LOAD-ND: %[[COLLAPSED:.+]] = memref.subview %[[SRC]][%[[OFF0]], 0, 0] +// LOAD-ND: %[[DESC:.+]] = xegpu.create_nd_tdesc %{{.*}} : memref<8x16xf32, strided<[16, 1], offset: ?>> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr<boundary_check = false>> +// LOAD-ND: %[[VEC:.+]] = xegpu.load_nd %[[DESC]][%{{.*}}, %{{.*}}] : !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr<boundary_check = false>> -> 
vector<8x16xf32> // LOAD-ND: return %[[VEC]] : vector<8x16xf32> // LOAD-GATHER-LABEL: @load_dynamic_source2( @@ -418,11 +416,12 @@ gpu.func @load_from_subview(%source: memref<4096x4096xf16>, %off1: index, %off2: // LOAD-ND-SAME: %[[SRC:.+]]: memref<4096x4096xf16>, // LOAD-ND-SAME: %[[OFF1:.+]]: index, %[[OFF2:.+]]: index // LOAD-ND: %[[SUBVIEW:.+]] = memref.subview %[[SRC]][%[[OFF1]], %[[OFF2]]] [256, 256] [1, 1] : memref<4096x4096xf16> to memref<256x256xf16, strided<[4096, 1], offset: ?>> +// LOAD-ND: %[[COLLAPSED:.+]] = memref.subview %[[SUBVIEW]][%[[OFF2]], 0] // LOAD-ND: %[[DESC:.+]] = xegpu.create_nd_tdesc -// LOAD-ND-SAME: %[[SUBVIEW]][%[[OFF2]], %[[OFF2]]] -// LOAD-ND-SAME: memref<256x256xf16, strided<[4096, 1], offset: ?>> -> !xegpu.tensor_desc<8xf16, +// LOAD-ND-SAME: %[[COLLAPSED]] +// LOAD-ND-SAME: memref<256xf16, strided<[1], offset: ?>> -> !xegpu.tensor_desc<8xf16, // LOAD-ND-SAME: boundary_check = false -// LOAD-ND: %[[VEC:.+]] = xegpu.load_nd %[[DESC]]{{.*}}-> vector<8xf16> +// LOAD-ND: %[[VEC:.+]] = xegpu.load_nd %[[DESC]][%[[OFF2]]]{{.*}}-> vector<8xf16> // LOAD-ND: return %[[VEC]] // LOAD-GATHER-LABEL: @load_from_subview( diff --git a/mlir/test/Conversion/VectorToXeGPU/transfer-write-to-xegpu.mlir b/mlir/test/Conversion/VectorToXeGPU/transfer-write-to-xegpu.mlir index fcfc9414da4f6..43a1a7206e2cc 100644 --- a/mlir/test/Conversion/VectorToXeGPU/transfer-write-to-xegpu.mlir +++ b/mlir/test/Conversion/VectorToXeGPU/transfer-write-to-xegpu.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-opt %s --xevm-attach-target='module=xevm.* O=3 chip=pvc' -convert-vector-to-xegpu -split-input-file | FileCheck %s --check-prefix=STORE-ND +// RUN: mlir-opt %s --xevm-attach-target='module=xevm_* O=3 chip=pvc' -convert-vector-to-xegpu -split-input-file | FileCheck %s --check-prefix=STORE-ND // RUN: mlir-opt %s -convert-vector-to-xegpu -split-input-file | FileCheck %s --check-prefix=STORE-SCATTER @@ -15,11 +15,12 @@ gpu.func @store_1D_vector(%vec: vector<8xf32>, // STORE-ND-SAME: %[[VEC:.+]]: vector<8xf32>, // STORE-ND-SAME: %[[SRC:.+]]: memref<8x16x32xf32>, // STORE-ND-SAME: %[[OFFSET:.+]]: index +// STORE-ND: %[[COLLAPSED:.+]] = memref.subview %[[SRC]][%[[OFFSET]], %[[OFFSET]], 0] // STORE-ND: %[[DESC:.+]] = xegpu.create_nd_tdesc -// STORE-ND-SAME: %[[SRC]][%[[OFFSET]], %[[OFFSET]], %[[OFFSET]]] -// STORE-ND-SAME: memref<8x16x32xf32> -> !xegpu.tensor_desc<8xf32, +// STORE-ND-SAME: %[[COLLAPSED]] +// STORE-ND-SAME: memref<32xf32, strided<[1], offset: ?>> -> !xegpu.tensor_desc<8xf32, // STORE-ND-SAME: boundary_check = false -// STORE-ND: xegpu.store_nd %[[VEC]], %[[DESC]] : vector<8xf32> +// STORE-ND: xegpu.store_nd %[[VEC]], %[[DESC]][%[[OFFSET]]] : vector<8xf32> // STORE-SCATTER-LABEL: @store_1D_vector( // STORE-SCATTER-SAME: %[[VEC:.+]]: vector<8xf32>, @@ -49,11 +50,12 @@ gpu.func @store_2D_vector(%vec: vector<8x16xf32>, // STORE-ND-SAME: %[[VEC:.+]]: vector<8x16xf32>, // STORE-ND-SAME: %[[SRC:.+]]: memref<8x16x32xf32>, // STORE-ND-SAME: %[[OFFSET:.+]]: index +// STORE-ND: %[[COLLAPSED:.+]] = memref.subview %[[SRC]][%[[OFFSET]], 0, 0] // STORE-ND: %[[DESC:.+]] = xegpu.create_nd_tdesc -// STORE-ND-SAME: %[[SRC]][%[[OFFSET]], %[[OFFSET]], %[[OFFSET]]] -// STORE-ND-SAME: memref<8x16x32xf32> -> !xegpu.tensor_desc<8x16xf32, +// STORE-ND-SAME: %[[COLLAPSED]] +// STORE-ND-SAME: memref<16x32xf32, strided<[32, 1], offset: ?>> -> !xegpu.tensor_desc<8x16xf32, // STORE-ND-SAME: boundary_check = false -// STORE-ND: xegpu.store_nd %[[VEC]], %[[DESC]] : vector<8x16xf32> +// STORE-ND: xegpu.store_nd %[[VEC]], 
%[[DESC]][%[[OFFSET]], %[[OFFSET]]] : vector<8x16xf32> // STORE-SCATTER-LABEL: @store_2D_vector( // STORE-SCATTER-SAME: %[[VEC:.+]]: vector<8x16xf32>, @@ -73,8 +75,8 @@ gpu.func @store_2D_vector(%vec: vector<8x16xf32>, // ----- gpu.module @xevm_module { gpu.func @store_dynamic_source(%vec: vector<8x16xf32>, - %source: memref<?x?x?xf32>, %offset: index) { - vector.transfer_write %vec, %source[%offset, %offset, %offset] + %source: memref<?x?x?xf32>, %i: index, %j: index, %k: index) { + vector.transfer_write %vec, %source[%i, %j, %k] {in_bounds = [true, true]} : vector<8x16xf32>, memref<?x?x?xf32> gpu.return @@ -83,18 +85,11 @@ gpu.func @store_dynamic_source(%vec: vector<8x16xf32>, // STORE-ND-LABEL: @store_dynamic_source( // STORE-ND-SAME: %[[VEC:.+]]: vector<8x16xf32>, // STORE-ND-SAME: %[[SRC:.+]]: memref<?x?x?xf32>, -// STORE-ND-SAME: %[[OFFSET:.+]]: index -// STORE-ND-DAG: %[[C0:.+]] = arith.constant 0 : index -// STORE-ND-DAG: %[[C1:.+]] = arith.constant 1 : index -// STORE-ND-DAG: %[[C2:.+]] = arith.constant 2 : index -// STORE-ND-DAG: %[[DIM_0:.+]] = memref.dim %[[SRC]], %[[C0]] -// STORE-ND-DAG: %[[DIM_1:.+]] = memref.dim %[[SRC]], %[[C1]] -// STORE-ND-DAG: %[[DIM_2:.+]] = memref.dim %[[SRC]], %[[C2]] -// STORE-ND: %[[DIM_0_STRIDE:.+]] = arith.muli %[[DIM_2]], %[[DIM_1]] -// STORE-ND: %[[DESC:.+]] = xegpu.create_nd_tdesc %[[SRC]][%[[OFFSET]], %[[OFFSET]], %[[OFFSET]]] -// STORE-ND-SAME: , shape : [%[[DIM_0]], %[[DIM_1]], %[[DIM_2]]], strides : [%[[DIM_0_STRIDE]], %[[DIM_2]], 1] -// STORE-ND-SAME: memref<?x?x?xf32> -> !xegpu.tensor_desc<8x16xf32 -// STORE-ND: xegpu.store_nd %[[VEC]], %[[DESC]] : vector<8x16xf32> +// STORE-ND-SAME: %[[OFF0:.+]]: index, %[[OFF1:.+]]: index, %[[OFF2:.+]]: index +// STORE-ND: %[[COLLAPSED:.+]] = memref.subview %[[SRC]][%[[OFF0]], 0, 0] +// STORE-ND: {{.*}} %[[SIZES:.+]]:2, %[[STRIDES:.+]]:2 = memref.extract_strided_metadata %[[COLLAPSED]] +// STORE-ND: %[[DESC:.+]] = xegpu.create_nd_tdesc %[[COLLAPSED]] +// STORE-ND: xegpu.store_nd %[[VEC]], %[[DESC]][%[[OFF1]], %[[OFF2]]] : vector<8x16xf32> // STORE-SCATTER-LABEL: @store_dynamic_source( // STORE-SCATTER-SAME: %[[VEC:.+]]: vector<8x16xf32>, @@ -126,9 +121,9 @@ gpu.func @store_out_of_bounds(%vec: vector<8x16xf32>, // STORE-ND-SAME: %[[SRC:.+]]: memref<7x64xf32>, // STORE-ND-SAME: %[[OFFSET:.+]]: index // STORE-ND: %[[DESC:.+]] = xegpu.create_nd_tdesc -// STORE-ND-SAME: %[[SRC]][%[[OFFSET]], %[[OFFSET]]] +// STORE-ND-SAME: %[[SRC]] // STORE-ND-SAME: memref<7x64xf32> -> !xegpu.tensor_desc<8x16xf32> -// STORE-ND: xegpu.store_nd %[[VEC]], %[[DESC]] : vector<8x16xf32> +// STORE-ND: xegpu.store_nd %[[VEC]], %[[DESC]][%[[OFFSET]], %[[OFFSET]]] : vector<8x16xf32> // STORE-SCATTER-LABEL: @store_out_of_bounds( // STORE-SCATTER: vector.transfer_write @@ -298,13 +293,13 @@ gpu.func @store_to_subview(%vec: vector<8xf16>, // STORE-ND-SAME: %[[VEC:.+]]: vector<8xf16>, // STORE-ND-SAME: %[[SRC:.+]]: memref<4096x4096xf16>, // STORE-ND-SAME: %[[OFF1:.+]]: index, %[[OFF2:.+]]: index -// STORE-ND: %[[SUBVIEW:.+]] = memref.subview %[[SRC]][%[[OFF1]], %[[OFF2]]] [256, 256] [1, 1] -// STORE-ND-SAME: : memref<4096x4096xf16> to memref<256x256xf16, strided<[4096, 1], offset: ?>> +// STORE-ND: %[[SUBVIEW:.+]] = memref.subview %[[SRC]][%[[OFF1]], %[[OFF2]]] [256, 256] [1, 1] : memref<4096x4096xf16> to memref<256x256xf16, strided<[4096, 1], offset: ?>> +// STORE-ND: %[[COLLAPSED:.+]] = memref.subview %[[SUBVIEW]][%[[OFF2]], 0] // STORE-ND: %[[DESC:.+]] = xegpu.create_nd_tdesc -// STORE-ND-SAME: %[[SUBVIEW]][%[[OFF2]], %[[OFF2]]] -// 
STORE-ND-SAME: memref<256x256xf16, strided<[4096, 1], offset: ?>> -> !xegpu.tensor_desc<8xf16,
+// STORE-ND-SAME: %[[COLLAPSED]]
+// STORE-ND-SAME: memref<256xf16, strided<[1], offset: ?>> -> !xegpu.tensor_desc<8xf16,
 // STORE-ND-SAME: boundary_check = false
-// STORE-ND: xegpu.store_nd %[[VEC]], %[[DESC]] : vector<8xf16>
+// STORE-ND: xegpu.store_nd %[[VEC]], %[[DESC]][%[[OFF2]]] : vector<8xf16>

 // STORE-SCATTER-LABEL: @store_to_subview(
 // STORE-SCATTER-SAME: %[[VEC:.+]]: vector<8xf16>,

From 8a84b285f67cb778493e225dc9699d902921e7b0 Mon Sep 17 00:00:00 2001
From: Yingwei Zheng <dtcxzyw2333@gmail.com>
Date: Tue, 4 Nov 2025 20:55:33 +0800
Subject: [PATCH 160/313] [SimplifyCFG] Eliminate dead edges of switches
 according to the domain of conditions (#165748)

In simplifycfg/cvp/sccp, we eliminate dead edges of switches according
to the knownbits/range info of conditions. However, these approximations
may not meet real-world needs when the domain of condition values is
sparse. For example, if the condition can only be either -3 or 3, we
cannot prove that the condition never evaluates to 1 (knownbits:
???????1, range: [-3, 4)).

This patch adds a helper function `collectPossibleValues` to enumerate
all the possible values of V. To fix the motivating issue,
`eliminateDeadSwitchCases` now uses the result to remove dead edges.

Note: In
https://discourse.llvm.org/t/missed-optimization-due-to-overflow-check/88700
I proposed a new value lattice kind to represent such values, but I
found it hard to apply because the transition becomes much more
complicated.

Compile-time impact looks neutral:
https://llvm-compile-time-tracker.com/compare.php?from=32d6b2139a6c8f79e074e8c6cfe0cc9e79c4c0c8&to=e47c26e3f1bf9eb062684dda4fafce58438e994b&stat=instructions:u

This patch removes a lot of dead error-handling code:
https://github.com/dtcxzyw/llvm-opt-benchmark/pull/3012

Closes https://github.com/llvm/llvm-project/issues/165179.
---
 llvm/include/llvm/Analysis/ValueTracking.h    |  10 ++
 llvm/lib/Analysis/ValueTracking.cpp           |  52 ++++++++
 llvm/lib/Transforms/Utils/SimplifyCFG.cpp     |  67 ++++++----
 llvm/test/CodeGen/AArch64/arm64-ccmp.ll       |   4 +-
 llvm/test/CodeGen/Hexagon/vect/zext-v4i1.ll   |   6 +-
 ...-on-const-select.ll => switch-on-const.ll} | 126 ++++++++++++++++++
 .../Transforms/SimplifyCFG/switch_mask.ll     |   1 +
 .../Transforms/SimplifyCFG/switch_undef.ll    |   7 +-
 8 files changed, 237 insertions(+), 36 deletions(-)
 rename llvm/test/Transforms/SimplifyCFG/{switch-on-const-select.ll => switch-on-const.ll} (54%)

diff --git a/llvm/include/llvm/Analysis/ValueTracking.h b/llvm/include/llvm/Analysis/ValueTracking.h
index af218ba564081..093309cb8bbee 100644
--- a/llvm/include/llvm/Analysis/ValueTracking.h
+++ b/llvm/include/llvm/Analysis/ValueTracking.h
@@ -1024,6 +1024,16 @@ findValuesAffectedByCondition(Value *Cond, bool IsAssume,
 LLVM_ABI Value *stripNullTest(Value *V);
 LLVM_ABI const Value *stripNullTest(const Value *V);
 
+/// Enumerates all possible values of V and inserts them into the set \p
+/// Constants. If \p AllowUndefOrPoison is false, it fails when V may contain
+/// undef/poison elements. Returns true if the result is complete. Otherwise,
+/// the result is incomplete (more than MaxCount values).
+/// NOTE: The constant values are not distinct.
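+/// Illustrative example (restating the motivating case from the commit
+/// message above): for
+///   %v = select i1 %c, i32 -3, i32 3
+/// the enumeration yields {-3, 3}, which lets a caller prove that a switch
+/// case on 1 is dead even though knownbits (???????1) and the constant
+/// range [-3, 4) cannot rule it out.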
+LLVM_ABI bool +collectPossibleValues(const Value *V, + SmallPtrSetImpl<const Constant *> &Constants, + unsigned MaxCount, bool AllowUndefOrPoison = true); + } // end namespace llvm #endif // LLVM_ANALYSIS_VALUETRACKING_H diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp index 523374bdc472f..789a98366cead 100644 --- a/llvm/lib/Analysis/ValueTracking.cpp +++ b/llvm/lib/Analysis/ValueTracking.cpp @@ -10341,3 +10341,55 @@ const Value *llvm::stripNullTest(const Value *V) { Value *llvm::stripNullTest(Value *V) { return const_cast<Value *>(stripNullTest(const_cast<const Value *>(V))); } + +bool llvm::collectPossibleValues(const Value *V, + SmallPtrSetImpl<const Constant *> &Constants, + unsigned MaxCount, bool AllowUndefOrPoison) { + SmallPtrSet<const Instruction *, 8> Visited; + SmallVector<const Instruction *, 8> Worklist; + auto Push = [&](const Value *V) -> bool { + if (auto *C = dyn_cast<Constant>(V)) { + if (!AllowUndefOrPoison && !isGuaranteedNotToBeUndefOrPoison(C)) + return false; + // Check existence first to avoid unnecessary allocations. + if (Constants.contains(C)) + return true; + if (Constants.size() == MaxCount) + return false; + Constants.insert(C); + return true; + } + + if (auto *Inst = dyn_cast<Instruction>(V)) { + if (Visited.insert(Inst).second) + Worklist.push_back(Inst); + return true; + } + return false; + }; + if (!Push(V)) + return false; + while (!Worklist.empty()) { + const Instruction *CurInst = Worklist.pop_back_val(); + switch (CurInst->getOpcode()) { + case Instruction::Select: + if (!Push(CurInst->getOperand(1))) + return false; + if (!Push(CurInst->getOperand(2))) + return false; + break; + case Instruction::PHI: + for (Value *IncomingValue : cast<PHINode>(CurInst)->incoming_values()) { + // Fast path for recurrence PHI. 
+ if (IncomingValue == CurInst) + continue; + if (!Push(IncomingValue)) + return false; + } + break; + default: + return false; + } + } + return true; +} diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp index bb733277e0fad..532511dcf91b0 100644 --- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp @@ -6020,6 +6020,8 @@ static bool eliminateDeadSwitchCases(SwitchInst *SI, DomTreeUpdater *DTU, const DataLayout &DL) { Value *Cond = SI->getCondition(); KnownBits Known = computeKnownBits(Cond, DL, AC, SI); + SmallPtrSet<const Constant *, 4> KnownValues; + bool IsKnownValuesValid = collectPossibleValues(Cond, KnownValues, 4); // We can also eliminate cases by determining that their values are outside of // the limited range of the condition based on how many significant (non-sign) @@ -6039,15 +6041,18 @@ static bool eliminateDeadSwitchCases(SwitchInst *SI, DomTreeUpdater *DTU, UniqueSuccessors.push_back(Successor); ++It->second; } - const APInt &CaseVal = Case.getCaseValue()->getValue(); + ConstantInt *CaseC = Case.getCaseValue(); + const APInt &CaseVal = CaseC->getValue(); if (Known.Zero.intersects(CaseVal) || !Known.One.isSubsetOf(CaseVal) || - (CaseVal.getSignificantBits() > MaxSignificantBitsInCond)) { - DeadCases.push_back(Case.getCaseValue()); + (CaseVal.getSignificantBits() > MaxSignificantBitsInCond) || + (IsKnownValuesValid && !KnownValues.contains(CaseC))) { + DeadCases.push_back(CaseC); if (DTU) --NumPerSuccessorCases[Successor]; LLVM_DEBUG(dbgs() << "SimplifyCFG: switch case " << CaseVal << " is dead.\n"); - } + } else if (IsKnownValuesValid) + KnownValues.erase(CaseC); } // If we can prove that the cases must cover all possible values, the @@ -6058,33 +6063,41 @@ static bool eliminateDeadSwitchCases(SwitchInst *SI, DomTreeUpdater *DTU, const unsigned NumUnknownBits = Known.getBitWidth() - (Known.Zero | Known.One).popcount(); assert(NumUnknownBits <= Known.getBitWidth()); - if (HasDefault && DeadCases.empty() && - NumUnknownBits < 64 /* avoid overflow */) { - uint64_t AllNumCases = 1ULL << NumUnknownBits; - if (SI->getNumCases() == AllNumCases) { + if (HasDefault && DeadCases.empty()) { + if (IsKnownValuesValid && all_of(KnownValues, IsaPred<UndefValue>)) { createUnreachableSwitchDefault(SI, DTU); return true; } - // When only one case value is missing, replace default with that case. - // Eliminating the default branch will provide more opportunities for - // optimization, such as lookup tables. 
- if (SI->getNumCases() == AllNumCases - 1) { - assert(NumUnknownBits > 1 && "Should be canonicalized to a branch"); - IntegerType *CondTy = cast<IntegerType>(Cond->getType()); - if (CondTy->getIntegerBitWidth() > 64 || - !DL.fitsInLegalInteger(CondTy->getIntegerBitWidth())) - return false; - uint64_t MissingCaseVal = 0; - for (const auto &Case : SI->cases()) - MissingCaseVal ^= Case.getCaseValue()->getValue().getLimitedValue(); - auto *MissingCase = - cast<ConstantInt>(ConstantInt::get(Cond->getType(), MissingCaseVal)); - SwitchInstProfUpdateWrapper SIW(*SI); - SIW.addCase(MissingCase, SI->getDefaultDest(), SIW.getSuccessorWeight(0)); - createUnreachableSwitchDefault(SI, DTU, /*RemoveOrigDefaultBlock*/ false); - SIW.setSuccessorWeight(0, 0); - return true; + if (NumUnknownBits < 64 /* avoid overflow */) { + uint64_t AllNumCases = 1ULL << NumUnknownBits; + if (SI->getNumCases() == AllNumCases) { + createUnreachableSwitchDefault(SI, DTU); + return true; + } + // When only one case value is missing, replace default with that case. + // Eliminating the default branch will provide more opportunities for + // optimization, such as lookup tables. + if (SI->getNumCases() == AllNumCases - 1) { + assert(NumUnknownBits > 1 && "Should be canonicalized to a branch"); + IntegerType *CondTy = cast<IntegerType>(Cond->getType()); + if (CondTy->getIntegerBitWidth() > 64 || + !DL.fitsInLegalInteger(CondTy->getIntegerBitWidth())) + return false; + + uint64_t MissingCaseVal = 0; + for (const auto &Case : SI->cases()) + MissingCaseVal ^= Case.getCaseValue()->getValue().getLimitedValue(); + auto *MissingCase = cast<ConstantInt>( + ConstantInt::get(Cond->getType(), MissingCaseVal)); + SwitchInstProfUpdateWrapper SIW(*SI); + SIW.addCase(MissingCase, SI->getDefaultDest(), + SIW.getSuccessorWeight(0)); + createUnreachableSwitchDefault(SI, DTU, + /*RemoveOrigDefaultBlock*/ false); + SIW.setSuccessorWeight(0, 0); + return true; + } } } diff --git a/llvm/test/CodeGen/AArch64/arm64-ccmp.ll b/llvm/test/CodeGen/AArch64/arm64-ccmp.ll index cad5df0d9655e..68ab8902767b3 100644 --- a/llvm/test/CodeGen/AArch64/arm64-ccmp.ll +++ b/llvm/test/CodeGen/AArch64/arm64-ccmp.ll @@ -430,12 +430,12 @@ declare i32 @foo() ; Test case distilled from 126.gcc. ; The phi in sw.bb.i.i gets multiple operands for the %entry predecessor. 
-define void @build_modify_expr() nounwind ssp { +define void @build_modify_expr(i32 %cond) nounwind ssp { ; CHECK-LABEL: build_modify_expr: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: ret entry: - switch i32 undef, label %sw.bb.i.i [ + switch i32 %cond, label %sw.bb.i.i [ i32 69, label %if.end85 i32 70, label %if.end85 i32 71, label %if.end85 diff --git a/llvm/test/CodeGen/Hexagon/vect/zext-v4i1.ll b/llvm/test/CodeGen/Hexagon/vect/zext-v4i1.ll index 559bb68741e12..930cf8152b756 100644 --- a/llvm/test/CodeGen/Hexagon/vect/zext-v4i1.ll +++ b/llvm/test/CodeGen/Hexagon/vect/zext-v4i1.ll @@ -6,11 +6,11 @@ target datalayout = "e-m:e-p:32:32:32-a:0-n16:32-i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-v32:32:32-v64:64:64-v512:512:512-v1024:1024:1024-v2048:2048:2048" target triple = "hexagon" -define i32 @fred(ptr %a0) #0 { +define i32 @fred(ptr %a0, i32 %cond) #0 { ; CHECK-LABEL: fred: ; CHECK: // %bb.0: // %b0 ; CHECK-NEXT: { -; CHECK-NEXT: if (p0) jump:nt .LBB0_2 +; CHECK-NEXT: p0 = cmp.eq(r1,#5); if (!p0.new) jump:t .LBB0_2 ; CHECK-NEXT: } ; CHECK-NEXT: // %bb.1: // %b2 ; CHECK-NEXT: { @@ -40,7 +40,7 @@ define i32 @fred(ptr %a0) #0 { ; CHECK-NEXT: jumpr r31 ; CHECK-NEXT: } b0: - switch i32 undef, label %b14 [ + switch i32 %cond, label %b14 [ i32 5, label %b2 i32 3, label %b1 ] diff --git a/llvm/test/Transforms/SimplifyCFG/switch-on-const-select.ll b/llvm/test/Transforms/SimplifyCFG/switch-on-const.ll similarity index 54% rename from llvm/test/Transforms/SimplifyCFG/switch-on-const-select.ll rename to llvm/test/Transforms/SimplifyCFG/switch-on-const.ll index e8b58639c13dd..1ab1b5e8bd838 100644 --- a/llvm/test/Transforms/SimplifyCFG/switch-on-const-select.ll +++ b/llvm/test/Transforms/SimplifyCFG/switch-on-const.ll @@ -154,6 +154,132 @@ bees: unreachable } +define void @pr165179(i1 %cond) { +; CHECK-LABEL: @pr165179( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[COND:%.*]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] +; CHECK: if.then: +; CHECK-NEXT: tail call void @bees.a() #[[ATTR0]] +; CHECK-NEXT: br label [[SWITCHBB:%.*]] +; CHECK: if.else: +; CHECK-NEXT: tail call void @bees.b() #[[ATTR0]] +; CHECK-NEXT: br label [[SWITCHBB]] +; CHECK: exit: +; CHECK-NEXT: tail call void @bees.a() #[[ATTR0]] +; CHECK-NEXT: ret void +; +entry: + br i1 %cond, label %if.then, label %if.else + +if.then: + tail call void @bees.a() nounwind + br label %switchbb + +if.else: + tail call void @bees.b() nounwind + br label %switchbb + +switchbb: + %cond1 = phi i32 [ 1, %if.else ], [ -1, %if.then ] + switch i32 %cond1, label %default [ + i32 1, label %exit + i32 -1, label %exit + ] + +exit: + tail call void @bees.a() nounwind + ret void + +default: + tail call void @bees.b() nounwind + ret void +} + +define void @switch_remove_dead_case_phi(i1 %cond1, i1 %cond2) { +; CHECK-LABEL: @switch_remove_dead_case_phi( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[COND1:%.*]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] +; CHECK: if.then: +; CHECK-NEXT: tail call void @bees.a() #[[ATTR0]] +; CHECK-NEXT: br i1 [[COND2:%.*]], label [[SWITCHBB:%.*]], label [[IF_ELSE]] +; CHECK: if.else: +; CHECK-NEXT: [[PHI:%.*]] = phi i32 [ 3, [[ENTRY:%.*]] ], [ -1, [[IF_THEN]] ] +; CHECK-NEXT: tail call void @bees.b() #[[ATTR0]] +; CHECK-NEXT: br label [[SWITCHBB]] +; CHECK: switchbb: +; CHECK-NEXT: [[COND:%.*]] = phi i32 [ [[PHI]], [[IF_ELSE]] ], [ 5, [[IF_THEN]] ] +; CHECK-NEXT: [[COND3:%.*]] = icmp eq i32 [[COND]], -1 +; CHECK-NEXT: br i1 [[COND3]], label [[EXIT:%.*]], label [[DEFAULT:%.*]] +; CHECK: common.ret: +; CHECK-NEXT: ret 
void +; CHECK: exit: +; CHECK-NEXT: tail call void @bees.a() #[[ATTR0]] +; CHECK-NEXT: br label [[COMMON_RET:%.*]] +; CHECK: default: +; CHECK-NEXT: tail call void @bees.b() #[[ATTR0]] +; CHECK-NEXT: br label [[COMMON_RET]] +; +entry: + br i1 %cond1, label %if.then, label %if.else + +if.then: + tail call void @bees.a() nounwind + br i1 %cond2, label %switchbb, label %if.else + +if.else: + %phi = phi i32 [ 3, %entry ], [ -1, %if.then ] + tail call void @bees.b() nounwind + br label %switchbb + +switchbb: + %cond = phi i32 [ %phi, %if.else ], [ 5, %if.then ] + switch i32 %cond, label %default [ + i32 1, label %exit + i32 -1, label %exit + ] + +exit: + tail call void @bees.a() nounwind + ret void + +default: + tail call void @bees.b() nounwind + ret void +} + +define void @switch_remove_dead_case_select(i1 %cond1, i1 %cond2) { +; CHECK-LABEL: @switch_remove_dead_case_select( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[X:%.*]] = select i1 [[COND1:%.*]], i32 -1, i32 3 +; CHECK-NEXT: [[Y:%.*]] = select i1 [[COND2:%.*]], i32 [[X]], i32 5 +; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[Y]], -1 +; CHECK-NEXT: br i1 [[COND]], label [[EXIT:%.*]], label [[DEFAULT:%.*]] +; CHECK: common.ret: +; CHECK-NEXT: ret void +; CHECK: exit: +; CHECK-NEXT: tail call void @bees.a() #[[ATTR0]] +; CHECK-NEXT: br label [[COMMON_RET:%.*]] +; CHECK: default: +; CHECK-NEXT: tail call void @bees.b() #[[ATTR0]] +; CHECK-NEXT: br label [[COMMON_RET]] +; +entry: + %x = select i1 %cond1, i32 -1, i32 3 + %y = select i1 %cond2, i32 %x, i32 5 + switch i32 %y, label %default [ + i32 1, label %exit + i32 -1, label %exit + ] + +exit: + tail call void @bees.a() nounwind + ret void + +default: + tail call void @bees.b() nounwind + ret void +} + declare void @llvm.trap() nounwind noreturn declare void @bees.a() nounwind declare void @bees.b() nounwind diff --git a/llvm/test/Transforms/SimplifyCFG/switch_mask.ll b/llvm/test/Transforms/SimplifyCFG/switch_mask.ll index f8bcbc057a7ae..428c18fc18e3d 100644 --- a/llvm/test/Transforms/SimplifyCFG/switch_mask.ll +++ b/llvm/test/Transforms/SimplifyCFG/switch_mask.ll @@ -221,6 +221,7 @@ define i1 @pr88607() { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[COND:%.*]] = select i1 false, i32 4, i32 1 ; CHECK-NEXT: [[SPEC_SELECT:%.*]] = select i1 false, i32 2, i32 [[COND]] +; CHECK-NEXT: [[COND1:%.*]] = icmp eq i32 [[SPEC_SELECT]], 1 ; CHECK-NEXT: ret i1 false ; entry: diff --git a/llvm/test/Transforms/SimplifyCFG/switch_undef.ll b/llvm/test/Transforms/SimplifyCFG/switch_undef.ll index 88a729b7d941a..4de5ea948ed27 100644 --- a/llvm/test/Transforms/SimplifyCFG/switch_undef.ll +++ b/llvm/test/Transforms/SimplifyCFG/switch_undef.ll @@ -5,12 +5,11 @@ define void @f6() #0 { ; CHECK-LABEL: @f6( ; CHECK-NEXT: entry: -; CHECK-NEXT: br label [[FOR_COND_I:%.*]] -; CHECK: for.cond.i: +; CHECK-NEXT: br label [[F1_EXIT_I:%.*]] +; CHECK: f1.exit.i: ; CHECK-NEXT: [[TOBOOL7_I:%.*]] = icmp ne i16 1, 0 -; CHECK-NEXT: br label [[FOR_COND_I]] +; CHECK-NEXT: br label [[F1_EXIT_I]] ; - entry: br label %for.cond.i From f771f1ee2697ee4c4d86de060f6ec8c7b4041b84 Mon Sep 17 00:00:00 2001 From: Michael Buch <michaelbuch12@gmail.com> Date: Tue, 4 Nov 2025 13:11:52 +0000 Subject: [PATCH 161/313] [llvm-dwarfdump][CMake] Link against BinaryFormat (#166364) In https://github.com/llvm/llvm-project/pull/165720 we started using a DWARF API (`llvm::dwarf::getTag`) from `BinaryFormat`. This patch makes dwarfdump link against the necessary LLVM component. 
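For reference, the dependency comes from a call of roughly this shape (a
minimal sketch: the wrapper function here is hypothetical, and the `unsigned`
return type and `DW_TAG_invalid` sentinel are assumptions based on the
`llvm/BinaryFormat/Dwarf.h` header):

```
#include "llvm/ADT/StringRef.h"
#include "llvm/BinaryFormat/Dwarf.h"

// Maps a tag name such as "DW_TAG_compile_unit" to its numeric DW_TAG value.
// The definition lives in libLLVMBinaryFormat, so any user must link that
// component -- hence the LLVM_LINK_COMPONENTS change below.
static unsigned lookupTag(llvm::StringRef Name) {
  unsigned Tag = llvm::dwarf::getTag(Name);
  return Tag; // llvm::dwarf::DW_TAG_invalid when Name is not a known tag.
}
```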
This fixes the following linker error that started occurring on some of
the bots:
```
[7758/8172] Linking CXX executable bin/llvm-dwarfdump
FAILED: bin/llvm-dwarfdump
: && /usr/bin/c++ -fPIC -fno-semantic-interposition -fvisibility-inlines-hidden -Werror=date-time -Wall -Wextra -Wno-unused-parameter -Wwrite-strings -Wcast-qual -Wno-missing-field-initializers -pedantic -Wno-long-long -Wimplicit-fallthrough -Wno-uninitialized -Wno-nonnull -Wno-class-memaccess -Wno-redundant-move -Wno-pessimizing-move -Wno-array-bounds -Wno-stringop-overread -Wno-noexcept-type -Wdelete-non-virtual-dtor -Wsuggest-override -Wno-comment -Wno-misleading-indentation -Wctad-maybe-unsupported -fdiagnostics-color -ffunction-sections -fdata-sections -O3 -DNDEBUG -Wl,-rpath-link,/home/botworker/bbot/amdgpu-offload-ubuntu-22-cmake-build-only/build/./lib -Wl,--gc-sections tools/llvm-dwarfdump/CMakeFiles/llvm-dwarfdump.dir/SectionSizes.cpp.o tools/llvm-dwarfdump/CMakeFiles/llvm-dwarfdump.dir/Statistics.cpp.o tools/llvm-dwarfdump/CMakeFiles/llvm-dwarfdump.dir/llvm-dwarfdump.cpp.o -o bin/llvm-dwarfdump -Wl,-rpath,"\$ORIGIN/../lib:/home/botworker/bbot/amdgpu-offload-ubuntu-22-cmake-build-only/build/lib:" lib/libLLVMAMDGPUDesc.so.22.0git lib/libLLVMSPIRVDesc.so.22.0git lib/libLLVMX86Desc.so.22.0git lib/libLLVMAMDGPUInfo.so.22.0git lib/libLLVMSPIRVInfo.so.22.0git lib/libLLVMX86Info.so.22.0git lib/libLLVMDebugInfoDWARF.so.22.0git lib/libLLVMObject.so.22.0git lib/libLLVMMC.so.22.0git lib/libLLVMDebugInfoDWARFLowLevel.so.22.0git lib/libLLVMTargetParser.so.22.0git lib/libLLVMSupport.so.22.0git -Wl,-rpath-link,/home/botworker/bbot/amdgpu-offload-ubuntu-22-cmake-build-only/build/lib && :
/usr/bin/ld: tools/llvm-dwarfdump/CMakeFiles/llvm-dwarfdump.dir/llvm-dwarfdump.cpp.o: undefined reference to symbol '_ZN4llvm5dwarf6getTagENS_9StringRefE'
/usr/bin/ld: /home/botworker/bbot/amdgpu-offload-ubuntu-22-cmake-build-only/build/./lib/libLLVMBinaryFormat.so.22.0git: error adding symbols: DSO missing from command line
collect2: error: ld returned 1 exit status
```
---
 llvm/tools/llvm-dwarfdump/CMakeLists.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/llvm/tools/llvm-dwarfdump/CMakeLists.txt b/llvm/tools/llvm-dwarfdump/CMakeLists.txt
index aeb1b8f14d830..7a0adf32e938c 100644
--- a/llvm/tools/llvm-dwarfdump/CMakeLists.txt
+++ b/llvm/tools/llvm-dwarfdump/CMakeLists.txt
@@ -1,4 +1,5 @@
 set(LLVM_LINK_COMPONENTS
+  BinaryFormat
   DebugInfoDWARF
   DebugInfoDWARFLowLevel
   AllTargetsDescs

From 2e89b71906307d7394be503e07939a42d4449d51 Mon Sep 17 00:00:00 2001
From: Aaron Ballman <aaron@aaronballman.com>
Date: Tue, 4 Nov 2025 08:17:50 -0500
Subject: [PATCH 162/313] [C23] Correctly handle missing embed with -MG
 (#166188)

-MG is supposed to suppress "file not found" diagnostics and instead
treat those as generated files for the purposes of dependency scanning.
Clang was previously emitting the diagnostic instead of emitting the
name of the embedded file.

Fixes #165632
---
 clang/docs/ReleaseNotes.rst           |  2 ++
 clang/lib/Frontend/DependencyFile.cpp | 11 +++++++++++
 clang/lib/Lex/PPDirectives.cpp        |  2 +-
 clang/test/Driver/mg.c                |  4 +++-
 4 files changed, 17 insertions(+), 2 deletions(-)

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 2e2c5198fa8f5..3f57ddc92d5e8 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -209,6 +209,8 @@ C23 Feature Support
   `WG14 N2710 <https://www.open-std.org/jtc1/sc22/wg14/www/docs/n2710.htm>`_.
- Fixed accepting as compatible unnamed tag types with the same fields within the same translation unit but from different types. +- ``-MG`` now silences the "file not found" errors with ``#embed`` when + scanning for dependencies and encountering an unknown file. #GH165632 Non-comprehensive list of changes in this release ------------------------------------------------- diff --git a/clang/lib/Frontend/DependencyFile.cpp b/clang/lib/Frontend/DependencyFile.cpp index 15fa7de35df97..93e012b163878 100644 --- a/clang/lib/Frontend/DependencyFile.cpp +++ b/clang/lib/Frontend/DependencyFile.cpp @@ -75,6 +75,17 @@ struct DepCollectorPPCallbacks : public PPCallbacks { /*IsMissing*/ false); } + bool EmbedFileNotFound(StringRef FileName) override { + DepCollector.maybeAddDependency( + llvm::sys::path::remove_leading_dotslash(FileName), + /*FromModule=*/false, + /*IsSystem=*/false, + /*IsModuleFile=*/false, + /*IsMissing=*/true); + // Return true to silence the file not found diagnostic. + return true; + } + void InclusionDirective(SourceLocation HashLoc, const Token &IncludeTok, StringRef FileName, bool IsAngled, CharSourceRange FilenameRange, diff --git a/clang/lib/Lex/PPDirectives.cpp b/clang/lib/Lex/PPDirectives.cpp index 6a5e5d4bad3a6..891c8ab7f3155 100644 --- a/clang/lib/Lex/PPDirectives.cpp +++ b/clang/lib/Lex/PPDirectives.cpp @@ -4018,7 +4018,7 @@ void Preprocessor::HandleEmbedDirective(SourceLocation HashLoc, Token &EmbedTok, this->LookupEmbedFile(Filename, isAngled, true, LookupFromFile); if (!MaybeFileRef) { // could not find file - if (Callbacks && Callbacks->EmbedFileNotFound(OriginalFilename)) { + if (Callbacks && Callbacks->EmbedFileNotFound(Filename)) { return; } Diag(FilenameTok, diag::err_pp_file_not_found) << Filename; diff --git a/clang/test/Driver/mg.c b/clang/test/Driver/mg.c index 82d8a6084e5e0..b7458a08698d3 100644 --- a/clang/test/Driver/mg.c +++ b/clang/test/Driver/mg.c @@ -1,5 +1,7 @@ -// RUN: %clang -M -MG -include nonexistent-preinclude.h %s | FileCheck %s +// RUN: %clang -M -MG -include nonexistent-preinclude.h -std=c23 %s | FileCheck %s // CHECK: nonexistent-preinclude.h // CHECK: nonexistent-ppinclude.h +// CHECK: nonexistent-embed #include "nonexistent-ppinclude.h" +#embed "nonexistent-embed" From 89c26170394824c3b636dad0b799256848179fb4 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim <llvm-dev@redking.me.uk> Date: Tue, 4 Nov 2025 13:20:26 +0000 Subject: [PATCH 163/313] [X86] bittest-big-integer.ll - add test showing multiple uses of the RMW store chain AND its stored value (#166366) --- llvm/test/CodeGen/X86/bittest-big-integer.ll | 263 +++++++++++++++++++ 1 file changed, 263 insertions(+) diff --git a/llvm/test/CodeGen/X86/bittest-big-integer.ll b/llvm/test/CodeGen/X86/bittest-big-integer.ll index 040ae65a33251..bcb14fd25b975 100644 --- a/llvm/test/CodeGen/X86/bittest-big-integer.ll +++ b/llvm/test/CodeGen/X86/bittest-big-integer.ll @@ -1083,6 +1083,269 @@ define i32 @reset_multiload_i128(ptr %word, i32 %position, ptr %p) nounwind { ret i32 %ret } +; Multiple uses of the store chain AND stored value +define i32 @chain_reset_i256(ptr %p0, ptr %p1, ptr %p2, i32 %position) nounwind { +; X86-LABEL: chain_reset_i256: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $112, %esp +; X86-NEXT: movzbl 20(%ebp), %ecx +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; 
X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $1, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: shrb $3, %al +; X86-NEXT: andb $28, %al +; X86-NEXT: negb %al +; X86-NEXT: movsbl %al, %eax +; X86-NEXT: movl 72(%esp,%eax), %edx +; X86-NEXT: movl 76(%esp,%eax), %edi +; X86-NEXT: movl %edi, %esi +; X86-NEXT: shldl %cl, %edx, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 68(%esp,%eax), %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %esi, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 80(%esp,%eax), %edx +; X86-NEXT: movl 84(%esp,%eax), %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %edx, %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %edi, %edx +; X86-NEXT: movl 64(%esp,%eax), %edi +; X86-NEXT: movl 88(%esp,%eax), %esi +; X86-NEXT: movl 92(%esp,%eax), %eax +; X86-NEXT: shldl %cl, %esi, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: shldl %cl, %ebx, %esi +; X86-NEXT: shldl %cl, %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: shll %cl, %edi +; X86-NEXT: movl %edi, %ecx +; X86-NEXT: movl %esi, %ebx +; X86-NEXT: notl %ebx +; X86-NEXT: notl %eax +; X86-NEXT: notl %edx +; X86-NEXT: notl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: notl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: notl %edi +; X86-NEXT: notl %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: notl %esi +; X86-NEXT: movl 8(%ebp), %ecx +; X86-NEXT: andl 12(%ecx), %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: andl 8(%ecx), %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: andl 20(%ecx), %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl 16(%ecx), %edx +; X86-NEXT: andl 28(%ecx), %eax +; X86-NEXT: andl 24(%ecx), %ebx +; X86-NEXT: andl 4(%ecx), %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: andl (%ecx), %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %ebx, 24(%ecx) +; X86-NEXT: movl %eax, 28(%ecx) +; X86-NEXT: movl %edx, 16(%ecx) +; X86-NEXT: movl %edi, 20(%ecx) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: movl %edi, 8(%ecx) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: movl %edi, 12(%ecx) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: movl %esi, (%ecx) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: movl %esi, 4(%ecx) +; X86-NEXT: orl %edi, %eax +; 
X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT: orl %eax, %esi +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X86-NEXT: movl 12(%ebp), %eax +; X86-NEXT: movl (%eax), %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: movl %edi, (%eax) +; X86-NEXT: orl %edi, %edx +; X86-NEXT: orl %ebx, %edx +; X86-NEXT: orl %esi, %edx +; X86-NEXT: movl 16(%ebp), %eax +; X86-NEXT: movl (%eax), %eax +; X86-NEXT: jne .LBB23_2 +; X86-NEXT: # %bb.1: +; X86-NEXT: addl %ecx, %eax +; X86-NEXT: .LBB23_2: +; X86-NEXT: leal -12(%ebp), %esp +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp +; X86-NEXT: retl +; +; SSE-LABEL: chain_reset_i256: +; SSE: # %bb.0: +; SSE-NEXT: xorps %xmm0, %xmm0 +; SSE-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movq $0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movq $1, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movl %ecx, %eax +; SSE-NEXT: shrb $3, %al +; SSE-NEXT: andb $24, %al +; SSE-NEXT: negb %al +; SSE-NEXT: movsbq %al, %r10 +; SSE-NEXT: movq -24(%rsp,%r10), %r8 +; SSE-NEXT: movq -16(%rsp,%r10), %rax +; SSE-NEXT: shldq %cl, %r8, %rax +; SSE-NEXT: movq -32(%rsp,%r10), %r9 +; SSE-NEXT: shldq %cl, %r9, %r8 +; SSE-NEXT: movq -40(%rsp,%r10), %r10 +; SSE-NEXT: shldq %cl, %r10, %r9 +; SSE-NEXT: # kill: def $cl killed $cl killed $ecx +; SSE-NEXT: shlq %cl, %r10 +; SSE-NEXT: notq %r8 +; SSE-NEXT: notq %rax +; SSE-NEXT: notq %r10 +; SSE-NEXT: notq %r9 +; SSE-NEXT: andq 24(%rdi), %rax +; SSE-NEXT: andq 16(%rdi), %r8 +; SSE-NEXT: andq 8(%rdi), %r9 +; SSE-NEXT: andq (%rdi), %r10 +; SSE-NEXT: movq %r8, 16(%rdi) +; SSE-NEXT: movq %rax, 24(%rdi) +; SSE-NEXT: movq %r10, (%rdi) +; SSE-NEXT: movq %r9, 8(%rdi) +; SSE-NEXT: orq %rax, %r9 +; SSE-NEXT: orq %r10, %r8 +; SSE-NEXT: movl (%rsi), %eax +; SSE-NEXT: movl %r10d, (%rsi) +; SSE-NEXT: movl (%rdx), %ecx +; SSE-NEXT: addl %ecx, %eax +; SSE-NEXT: orq %r9, %r8 +; SSE-NEXT: cmovnel %ecx, %eax +; SSE-NEXT: retq +; +; AVX2-LABEL: chain_reset_i256: +; AVX2: # %bb.0: +; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx +; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovss {{.*#+}} xmm0 = [1,0,0,0] +; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: movl %ecx, %eax +; AVX2-NEXT: shrb $3, %al +; AVX2-NEXT: andb $24, %al +; AVX2-NEXT: negb %al +; AVX2-NEXT: movsbq %al, %rax +; AVX2-NEXT: movq -32(%rsp,%rax), %r8 +; AVX2-NEXT: movq -24(%rsp,%rax), %r9 +; AVX2-NEXT: movq %r9, %r10 +; AVX2-NEXT: shldq %cl, %r8, %r10 +; AVX2-NEXT: movq -40(%rsp,%rax), %r11 +; AVX2-NEXT: movq -16(%rsp,%rax), %rax +; AVX2-NEXT: shldq %cl, %r9, %rax +; AVX2-NEXT: shldq %cl, %r11, %r8 +; AVX2-NEXT: andnq 24(%rdi), %rax, %rax +; AVX2-NEXT: andnq 16(%rdi), %r10, %r9 +; AVX2-NEXT: andnq 8(%rdi), %r8, %r8 +; AVX2-NEXT: shlxq %rcx, %r11, %rcx +; AVX2-NEXT: andnq (%rdi), %rcx, %rcx +; AVX2-NEXT: movq %r9, 16(%rdi) +; AVX2-NEXT: movq %rax, 24(%rdi) +; AVX2-NEXT: movq %rcx, (%rdi) +; AVX2-NEXT: movq %r8, 8(%rdi) +; AVX2-NEXT: orq %rax, %r8 +; AVX2-NEXT: orq %rcx, %r9 +; AVX2-NEXT: movl (%rsi), %eax +; AVX2-NEXT: movl %ecx, (%rsi) +; AVX2-NEXT: movl (%rdx), %ecx +; AVX2-NEXT: addl %ecx, %eax +; AVX2-NEXT: orq %r8, %r9 +; AVX2-NEXT: cmovnel %ecx, %eax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: chain_reset_i256: +; AVX512: # %bb.0: +; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vmovups %ymm0, 
-{{[0-9]+}}(%rsp)
+; AVX512-NEXT:    vmovaps {{.*#+}} xmm0 = [1,0,0,0]
+; AVX512-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT:    # kill: def $ecx killed $ecx def $rcx
+; AVX512-NEXT:    movl %ecx, %eax
+; AVX512-NEXT:    shrb $3, %al
+; AVX512-NEXT:    andb $24, %al
+; AVX512-NEXT:    negb %al
+; AVX512-NEXT:    movsbq %al, %rax
+; AVX512-NEXT:    movq -40(%rsp,%rax), %r8
+; AVX512-NEXT:    movq -32(%rsp,%rax), %r9
+; AVX512-NEXT:    movq -24(%rsp,%rax), %r10
+; AVX512-NEXT:    movq %r10, %r11
+; AVX512-NEXT:    shldq %cl, %r9, %r11
+; AVX512-NEXT:    movq -16(%rsp,%rax), %rax
+; AVX512-NEXT:    shldq %cl, %r10, %rax
+; AVX512-NEXT:    shlxq %rcx, %r8, %r10
+; AVX512-NEXT:    # kill: def $cl killed $cl killed $rcx
+; AVX512-NEXT:    shldq %cl, %r8, %r9
+; AVX512-NEXT:    andnq 24(%rdi), %rax, %rax
+; AVX512-NEXT:    andnq 16(%rdi), %r11, %rcx
+; AVX512-NEXT:    andnq 8(%rdi), %r9, %r8
+; AVX512-NEXT:    andnq (%rdi), %r10, %r9
+; AVX512-NEXT:    movq %rcx, 16(%rdi)
+; AVX512-NEXT:    movq %rax, 24(%rdi)
+; AVX512-NEXT:    movq %r9, (%rdi)
+; AVX512-NEXT:    movq %r8, 8(%rdi)
+; AVX512-NEXT:    orq %rax, %r8
+; AVX512-NEXT:    orq %r9, %rcx
+; AVX512-NEXT:    movl (%rsi), %eax
+; AVX512-NEXT:    movl %r9d, (%rsi)
+; AVX512-NEXT:    movl (%rdx), %edx
+; AVX512-NEXT:    addl %edx, %eax
+; AVX512-NEXT:    orq %r8, %rcx
+; AVX512-NEXT:    cmovnel %edx, %eax
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %rem = and i32 %position, 255
+  %ofs = zext nneg i32 %rem to i256
+  %bit = shl nuw i256 1, %ofs
+  %ld0 = load i256, ptr %p0
+  %msk = xor i256 %bit, -1
+  %res = and i256 %ld0, %msk
+  store i256 %res, ptr %p0
+  %cmp = icmp ne i256 %res, 0
+  %ld1 = load i32, ptr %p1
+  %trunc = trunc i256 %res to i32
+  store i32 %trunc, ptr %p1
+  %ld2 = load i32, ptr %p2
+  %add = add i32 %ld1, %ld2
+  %sel = select i1 %cmp, i32 %ld2, i32 %add
+  ret i32 %sel
+}
+
 ; BTC/BT/BTS sequence on same i128
 define i1 @sequence_i128(ptr %word, i32 %pos0, i32 %pos1, i32 %pos2) nounwind {
 ; X86-LABEL: sequence_i128:

From f771f1ee2697ee4c4d86de060f6ec8c7b4041b84 Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell@arm.com>
Date: Tue, 4 Nov 2025 13:39:20 +0000
Subject: [PATCH 164/313] [AArch64][SME] Support `aarch64-split-sve-objects`
 with VLAs/realignment (#163816)

This was left out of the original patch (#142392) to simplify the
initial implementation. However, after refactoring the SVE
prologue/epilogue code in #162253, it's not much of an extension to
support this case.

The main change here is that, when restoring the SP from the FP for the
SVE restores, we may need an additional frame offset to move from the
start of the ZPR callee-saves to the start of the PPR callee-saves.

This patch also fixes a previously latent bug where we'd add the
`RealignmentPadding` when allocating the PPR locals, then again for the
ZPR locals. This was unnecessary as the stack only needs to be
realigned after all SVE allocations.
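In rough C++ terms, the new restore sequence looks like this (a simplified
sketch of the epilogue change in this diff, with CFI updates and the
non-split paths omitted; the names are those used in
AArch64PrologueEpilogue.cpp):

```
// With split SVE objects, the ZPR saves sit below the hazard padding while
// the PPR saves sit directly below the frame record, so the epilogue now
// adjusts the SP in two steps relative to the frame pointer.

// Step 1: SP -> start of the ZPR callee-save area. The fixed part of the
// offset is subtracted via a temporary register first (see moveSPBelowFP).
StackOffset FPOffsetZPR =
    -SVECalleeSavesSize - PPR.LocalsSize -
    StackOffset::getFixed(AFI->getCalleeSaveBaseToFrameRecordOffset());
moveSPBelowFP(ZPRRange.Begin, FPOffsetZPR);

// Step 2 (split layout only): after the ZPR reloads, SP -> start of the PPR
// callee-save area, a purely scalable offset below the FP.
if (PPR.CalleeSavesSize && SVELayout == SVEStackLayout::Split)
  emitFrameOffset(MBB, ZPRRange.End, DL, AArch64::SP, AArch64::FP,
                  -PPR.CalleeSavesSize, TII, MachineInstr::FrameDestroy);
```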
---
 .../Target/AArch64/AArch64FrameLowering.cpp   |    8 +-
 .../AArch64/AArch64PrologueEpilogue.cpp       |   63 +-
 .../Target/AArch64/AArch64PrologueEpilogue.h  |    4 +
 .../CodeGen/AArch64/framelayout-split-sve.mir |   49 +-
 .../AArch64/split-sve-stack-frame-layout.ll   |  338 ++-
 llvm/test/CodeGen/AArch64/stack-hazard.ll     | 1828 ++++++++++-------
 6 files changed, 1491 insertions(+), 799 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
index 0f7b34c36055f..3ee4d58ca892c 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -2380,13 +2380,6 @@ void AArch64FrameLowering::determineStackHazardSlot(
     return;
   }
 
-  const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
-  if (MFI.hasVarSizedObjects() || TRI->hasStackRealignment(MF)) {
-    LLVM_DEBUG(dbgs() << "SplitSVEObjects is not supported with variable "
-                         "sized objects or realignment\n");
-    return;
-  }
-
   // If another calling convention is explicitly set FPRs can't be promoted to
   // ZPR callee-saves.
   if (!is_contained({CallingConv::C, CallingConv::Fast,
@@ -2402,6 +2395,7 @@ void AArch64FrameLowering::determineStackHazardSlot(
   assert(Subtarget.isSVEorStreamingSVEAvailable() &&
          "Expected SVE to be available for PPRs");
 
+  const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
   // With SplitSVEObjects the CS hazard padding is placed between the
   // PPRs and ZPRs. If there are any FPR CS there would be a hazard between
   // them and the CS GRPs. Avoid this by promoting all FPR CS to ZPRs.
diff --git a/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp b/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp
index 45b7120112af2..4df4d54e60c95 100644
--- a/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp
+++ b/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp
@@ -805,7 +805,7 @@ void AArch64PrologueEmitter::emitPrologue() {
     CFAOffset += SVEAllocs.BeforePPRs;
     assert(PPRRange.End == ZPRRange.Begin &&
            "Expected ZPR callee saves after PPR locals");
-    allocateStackSpace(PPRRange.End, RealignmentPadding, SVEAllocs.AfterPPRs,
+    allocateStackSpace(PPRRange.End, 0, SVEAllocs.AfterPPRs,
                        EmitAsyncCFI && !HasFP, CFAOffset,
                        MFI.hasVarSizedObjects() || SVEAllocs.AfterZPRs);
     CFAOffset += SVEAllocs.AfterPPRs;
@@ -1318,6 +1318,26 @@ AArch64EpilogueEmitter::AArch64EpilogueEmitter(MachineFunction &MF,
   SEHEpilogueStartI = MBB.end();
 }
 
+void AArch64EpilogueEmitter::moveSPBelowFP(MachineBasicBlock::iterator MBBI,
+                                           StackOffset Offset) {
+  // Other combinations could be supported, but are not currently needed.
+  assert(Offset.getScalable() < 0 && Offset.getFixed() <= 0 &&
+         "expected negative offset (with optional fixed portion)");
+  Register Base = AArch64::FP;
+  if (int64_t FixedOffset = Offset.getFixed()) {
+    // If we have a negative fixed offset, we need to subtract it in a
+    // temporary register first (to avoid briefly deallocating the scalable
+    // portion of the offset).
+ Base = MF.getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass); + emitFrameOffset(MBB, MBBI, DL, Base, AArch64::FP, + StackOffset::getFixed(FixedOffset), TII, + MachineInstr::FrameDestroy); + } + emitFrameOffset(MBB, MBBI, DL, AArch64::SP, Base, + StackOffset::getScalable(Offset.getScalable()), TII, + MachineInstr::FrameDestroy); +} + void AArch64EpilogueEmitter::emitEpilogue() { MachineBasicBlock::iterator EpilogueEndI = MBB.getLastNonDebugInstr(); if (MBB.end() != EpilogueEndI) { @@ -1418,6 +1438,7 @@ void AArch64EpilogueEmitter::emitEpilogue() { AfterCSRPopSize += ProloguePopSize; } } + // Move past the restores of the callee-saved registers. // If we plan on combining the sp bump of the local stack size and the callee // save stack size, we might need to adjust the CSR save and restore offsets. @@ -1483,7 +1504,6 @@ void AArch64EpilogueEmitter::emitEpilogue() { StackOffset SVECalleeSavesSize = ZPR.CalleeSavesSize + PPR.CalleeSavesSize; SVEStackAllocations SVEAllocs = getSVEStackAllocations({PPR, ZPR}); - MachineBasicBlock::iterator RestoreBegin = ZPRRange.Begin; // Deallocate the SVE area. if (SVELayout == SVEStackLayout::CalleeSavesAboveFrameRecord) { @@ -1510,28 +1530,25 @@ void AArch64EpilogueEmitter::emitEpilogue() { (AFI->isStackRealigned() || MFI.hasVarSizedObjects()) ? AArch64::FP : AArch64::SP; if (SVECalleeSavesSize && BaseForSVEDealloc == AArch64::FP) { - // TODO: Support stack realigment and variable-sized objects. - assert( - SVELayout != SVEStackLayout::Split && - "unexpected stack realignment or variable sized objects with split " - "SVE stack objects"); - - Register CalleeSaveBase = AArch64::FP; - if (int64_t CalleeSaveBaseOffset = - AFI->getCalleeSaveBaseToFrameRecordOffset()) { - // If we have have an non-zero offset to the non-SVE CS base we need to - // compute the base address by subtracting the offest in a temporary - // register first (to avoid briefly deallocating the SVE CS). - CalleeSaveBase = MBB.getParent()->getRegInfo().createVirtualRegister( - &AArch64::GPR64RegClass); - emitFrameOffset(MBB, RestoreBegin, DL, CalleeSaveBase, AArch64::FP, - StackOffset::getFixed(-CalleeSaveBaseOffset), TII, - MachineInstr::FrameDestroy); + if (ZPR.CalleeSavesSize || SVELayout != SVEStackLayout::Split) { + // The offset from the frame-pointer to the start of the ZPR saves. + StackOffset FPOffsetZPR = + -SVECalleeSavesSize - PPR.LocalsSize - + StackOffset::getFixed(AFI->getCalleeSaveBaseToFrameRecordOffset()); + // Deallocate the stack space space by moving the SP to the start of the + // ZPR/PPR callee-save area. + moveSPBelowFP(ZPRRange.Begin, FPOffsetZPR); + } + // With split SVE, the predicates are stored in a separate area above the + // ZPR saves, so we must adjust the stack to the start of the PPRs. + if (PPR.CalleeSavesSize && SVELayout == SVEStackLayout::Split) { + // The offset from the frame-pointer to the start of the PPR saves. + StackOffset FPOffsetPPR = -PPR.CalleeSavesSize; + // Move to the start of the PPR area. + assert(!FPOffsetPPR.getFixed() && "expected only scalable offset"); + emitFrameOffset(MBB, ZPRRange.End, DL, AArch64::SP, AArch64::FP, + FPOffsetPPR, TII, MachineInstr::FrameDestroy); } - // The code below will deallocate the stack space space by moving the SP - // to the start of the SVE callee-save area. 
- emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, CalleeSaveBase, - -SVECalleeSavesSize, TII, MachineInstr::FrameDestroy); } else if (BaseForSVEDealloc == AArch64::SP) { auto NonSVELocals = StackOffset::getFixed(NumBytes); auto CFAOffset = NonSVELocals + StackOffset::getFixed(PrologueSaveSize) + diff --git a/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.h b/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.h index 6e0e28324a0ac..7f297b5d337b0 100644 --- a/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.h +++ b/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.h @@ -180,6 +180,10 @@ class AArch64EpilogueEmitter final : public AArch64PrologueEpilogueCommon { private: bool shouldCombineCSRLocalStackBump(uint64_t StackBumpBytes) const; + /// A helper for moving the SP to a negative offset from the FP, without + /// deallocating any stack in the range FP to FP + Offset. + void moveSPBelowFP(MachineBasicBlock::iterator MBBI, StackOffset Offset); + void emitSwiftAsyncContextFramePointer(MachineBasicBlock::iterator MBBI, const DebugLoc &DL) const; diff --git a/llvm/test/CodeGen/AArch64/framelayout-split-sve.mir b/llvm/test/CodeGen/AArch64/framelayout-split-sve.mir index f535e0fe8b387..bb7ffb47d8dfe 100644 --- a/llvm/test/CodeGen/AArch64/framelayout-split-sve.mir +++ b/llvm/test/CodeGen/AArch64/framelayout-split-sve.mir @@ -162,63 +162,54 @@ body: | RET_ReallyLR # CHECK-LABEL: name: test_allocate_split_sve_realigned -# CHECK: stackSize: 2080 +# CHECK: stackSize: 1056 # CHECK: bb.0.entry: # CHECK: liveins: $z0, $p0, $lr -# CHECK: $sp = frame-setup SUBXri $sp, 1040, 0 -# CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_offset 1040 -# CHECK-NEXT: frame-setup STRXui killed $fp, $sp, 128 :: (store (s64) into %stack.5) -# CHECK-NEXT: frame-setup STRXui killed $lr, $sp, 129 :: (store (s64) into %stack.4) -# CHECK-NEXT: $fp = frame-setup ADDXri $sp, 1024, 0 +# CHECK: early-clobber $sp = frame-setup STPXpre killed $fp, killed $lr, $sp, -2 :: (store (s64) into %stack.5), (store (s64) into %stack.4) +# CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_offset 16 +# CHECK-NEXT: $fp = frame-setup ADDXri $sp, 0, 0 # CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa $w29, 16 # CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $w30, -8 # CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $w29, -16 -# CHECK-NEXT: $[[TMP:x[0-9]+]] = frame-setup SUBXri $sp, 1040, 0 -# CHECK-NEXT: $[[TMP]] = frame-setup ADDVL_XXI $[[TMP]], -2, implicit $vg -# CHECK-NEXT: $sp = frame-setup ANDXri killed $x9, 7930 +# CHECK-NEXT: $[[TMP:x[0-9]+]] = frame-setup SUBXri $sp, 2064, 0 +# CHECK-NEXT: $[[TMP]] = frame-setup ADDVL_XXI $x9, -3, implicit $vg +# CHECK-NEXT: $sp = frame-setup ANDXri killed $[[TMP]], 7930 # # CHECK-NEXT: $x8 = SUBXri $fp, 1024, 0 # CHECK-NEXT: $x8 = ADDPL_XXI $x8, -1, implicit $vg -# CHECK-NEXT: STR_ZXI $z0, killed $x8, -1 :: (store (<vscale x 1 x s128>) into %stack.0) -# CHECK-NEXT: $x8 = SUBXri $fp, 1024, 0 -# CHECK-NEXT: STR_PXI $p0, killed $x8, -15 :: (store (<vscale x 1 x s16>) into %stack.1) +# CHECK-NEXT: STR_ZXI $z0, killed $x8, -2 :: (store (<vscale x 1 x s128>) into %stack.0) +# CHECK-NEXT: STR_PXI $p0, $fp, -6 :: (store (<vscale x 1 x s16>) into %stack.1) # -# CHECK-NEXT: $sp = frame-destroy SUBXri $fp, 1024, 0 -# CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa $wsp, 1040 -# CHECK-NEXT: $lr = frame-destroy LDRXui $sp, 129 :: (load (s64) from %stack.4) -# CHECK-NEXT: $fp = frame-destroy LDRXui $sp, 128 :: (load (s64) from %stack.5) -# CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 1040, 0 +# CHECK-NEXT: $sp = 
frame-destroy ADDXri $fp, 0, 0 +# CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa $wsp, 16 +# CHECK-NEXT: early-clobber $sp, $fp, $lr = frame-destroy LDPXpost $sp, 2 :: (load (s64) from %stack.5), (load (s64) from %stack.4) # CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa_offset 0 # CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $w30 # CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $w29 # CHECK-NEXT: RET_ReallyLR # ASM-LABEL: test_allocate_split_sve_realigned -# ASM: sub sp, sp, #1040 -# ASM-NEXT: .cfi_def_cfa_offset 1040 -# ASM-NEXT: str x29, [sp, #1024] -# ASM-NEXT: str x30, [sp, #1032] -# ASM-NEXT: add x29, sp, #1024 +# ASM: stp x29, x30, [sp, #-16]! +# ASM-NEXT: .cfi_def_cfa_offset 16 +# ASM-NEXT: mov x29, sp # ASM-NEXT: .cfi_def_cfa w29, 16 # ASM-NEXT: .cfi_offset w30, -8 # ASM-NEXT: .cfi_offset w29, -16 # -# ASM: sub sp, x29, #1024 -# ASM-NEXT: .cfi_def_cfa wsp, 1040 -# ASM-NEXT: ldr x30, [sp, #1032] -# ASM-NEXT: ldr x29, [sp, #1024] -# ASM-NEXT: add sp, sp, #1040 +# ASM: mov sp, x29 +# ASM-NEXT: .cfi_def_cfa wsp, 16 +# ASM-NEXT: ldp x29, x30, [sp], #16 # ASM-NEXT: .cfi_def_cfa_offset 0 # ASM-NEXT: .cfi_restore w30 # ASM-NEXT: .cfi_restore w29 -# UNWINDINFO: DW_CFA_def_cfa_offset: +1040 +# UNWINDINFO: DW_CFA_def_cfa_offset: +16 # UNWINDINFO: DW_CFA_def_cfa: reg29 +16 # UNWINDINFO-NEXT: DW_CFA_offset: reg30 -8 # UNWINDINFO-NEXT: DW_CFA_offset: reg29 -16 # -# UNWINDINFO: DW_CFA_def_cfa: reg31 +1040 +# UNWINDINFO: DW_CFA_def_cfa: reg31 +16 # UNWINDINFO: DW_CFA_def_cfa_offset: +0 # UNWINDINFO-NEXT: DW_CFA_restore: reg30 # UNWINDINFO-NEXT: DW_CFA_restore: reg29 diff --git a/llvm/test/CodeGen/AArch64/split-sve-stack-frame-layout.ll b/llvm/test/CodeGen/AArch64/split-sve-stack-frame-layout.ll index c13dd33865c37..f65aec6665cec 100644 --- a/llvm/test/CodeGen/AArch64/split-sve-stack-frame-layout.ll +++ b/llvm/test/CodeGen/AArch64/split-sve-stack-frame-layout.ll @@ -737,36 +737,23 @@ entry: } declare ptr @memset(ptr, i32, i32) -; FIXME: aarch64-split-sve-objects is currently not supported in this function -; as it requires stack reealignment (for the 32-byte aligned alloca). -; GPR CSRs -; <hazard padding> -; FPR CSRs -; <hazrd padding> -; <SVE locals (PPRs and ZPRs)> <--- hazard between PPRs and ZPRs here! -; <realignment padding> -; -> sp define void @zpr_and_ppr_local_realignment(<vscale x 16 x i1> %pred, <vscale x 16 x i8> %vector, i64 %gpr) "aarch64_pstate_sm_compatible" { ; CHECK-LABEL: zpr_and_ppr_local_realignment: ; CHECK: // %bb.0: -; CHECK-NEXT: sub sp, sp, #1040 -; CHECK-NEXT: sub x9, sp, #1040 -; CHECK-NEXT: str x29, [sp, #1024] // 8-byte Folded Spill -; CHECK-NEXT: add x29, sp, #1024 +; CHECK-NEXT: stp x29, x30, [sp, #-16]! 
// 16-byte Folded Spill +; CHECK-NEXT: sub x9, sp, #2064 +; CHECK-NEXT: mov x29, sp ; CHECK-NEXT: addvl x9, x9, #-2 -; CHECK-NEXT: str x30, [sp, #1032] // 8-byte Folded Spill ; CHECK-NEXT: and sp, x9, #0xffffffffffffffe0 ; CHECK-NEXT: .cfi_def_cfa w29, 16 ; CHECK-NEXT: .cfi_offset w30, -8 ; CHECK-NEXT: .cfi_offset w29, -16 ; CHECK-NEXT: sub x8, x29, #1024 -; CHECK-NEXT: str p0, [x8, #-1, mul vl] +; CHECK-NEXT: str p0, [x29, #-1, mul vl] ; CHECK-NEXT: str z0, [x8, #-2, mul vl] ; CHECK-NEXT: str x0, [sp] -; CHECK-NEXT: sub sp, x29, #1024 -; CHECK-NEXT: ldr x30, [sp, #1032] // 8-byte Folded Reload -; CHECK-NEXT: ldr x29, [sp, #1024] // 8-byte Folded Reload -; CHECK-NEXT: add sp, sp, #1040 +; CHECK-NEXT: mov sp, x29 +; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload ; CHECK-NEXT: ret %ppr_local = alloca <vscale x 16 x i1> %zpr_local = alloca <vscale x 16 x i8> @@ -805,3 +792,316 @@ define void @zpr_and_ppr_local_stack_probing(<vscale x 16 x i1> %pred, <vscale x store volatile i64 %gpr, ptr %gpr_local ret void } + +; Only PPR callee-saves + a VLA +; Expect: No hazard padding. Frame pointer (x29), p4-p6 callee saves allocated +; with `addvl #-1`, PPR saves restored using frame pointer `addvl sp, x29, #-1`. +define aarch64_sve_vector_pcs void @only_ppr_csr_vla(i64 %n) { +; CHECK-LABEL: only_ppr_csr_vla: +; CHECK: // %bb.0: +; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill +; CHECK-NEXT: str x19, [sp, #16] // 8-byte Folded Spill +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p6, [sp, #5, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p5, [sp, #6, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov x19, sp +; CHECK-NEXT: .cfi_def_cfa w29, 32 +; CHECK-NEXT: .cfi_offset w19, -16 +; CHECK-NEXT: .cfi_offset w30, -24 +; CHECK-NEXT: .cfi_offset w29, -32 +; CHECK-NEXT: add x9, x0, #15 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: and x9, x9, #0xfffffffffffffff0 +; CHECK-NEXT: sub x8, x8, x9 +; CHECK-NEXT: mov sp, x8 +; CHECK-NEXT: // fake_use: $x8 +; CHECK-NEXT: //APP +; CHECK-NEXT: //NO_APP +; CHECK-NEXT: addvl sp, x29, #-1 +; CHECK-NEXT: ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: mov sp, x29 +; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload +; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; CHECK-NEXT: ret + %alloc = alloca i8, i64 %n, align 1 + call void (...) @llvm.fake.use(ptr %alloc) + tail call void asm sideeffect "", "~{p4},~{p5},~{p6}"() + ret void +} + +; Only ZPR callee-saves + a VLA +; Expect: Hazard padding, Frame pointer (x29), z8-z10 callee saves allocated +; with `addvl #-3`. ZPR saves restored from `FP - 1024 + addvl #-3`. 
+define aarch64_sve_vector_pcs void @only_zpr_csr_vla(i64 %n) { +; CHECK-LABEL: only_zpr_csr_vla: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #1056 +; CHECK-NEXT: str x29, [sp, #1024] // 8-byte Folded Spill +; CHECK-NEXT: add x29, sp, #1024 +; CHECK-NEXT: str x30, [sp, #1032] // 8-byte Folded Spill +; CHECK-NEXT: str x19, [sp, #1040] // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-3 +; CHECK-NEXT: str z10, [sp] // 16-byte Folded Spill +; CHECK-NEXT: str z9, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z8, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #1024 +; CHECK-NEXT: mov x19, sp +; CHECK-NEXT: .cfi_def_cfa w29, 32 +; CHECK-NEXT: .cfi_offset w19, -16 +; CHECK-NEXT: .cfi_offset w30, -24 +; CHECK-NEXT: .cfi_offset w29, -32 +; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x78, 0x1e, 0x22, 0x11, 0xe0, 0x77, 0x22 // $d8 @ cfa - 8 * VG - 1056 +; CHECK-NEXT: .cfi_escape 0x10, 0x49, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x70, 0x1e, 0x22, 0x11, 0xe0, 0x77, 0x22 // $d9 @ cfa - 16 * VG - 1056 +; CHECK-NEXT: .cfi_escape 0x10, 0x4a, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x68, 0x1e, 0x22, 0x11, 0xe0, 0x77, 0x22 // $d10 @ cfa - 24 * VG - 1056 +; CHECK-NEXT: add x9, x0, #15 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: and x9, x9, #0xfffffffffffffff0 +; CHECK-NEXT: sub x8, x8, x9 +; CHECK-NEXT: mov sp, x8 +; CHECK-NEXT: // fake_use: $x8 +; CHECK-NEXT: sub x8, x29, #1024 +; CHECK-NEXT: //APP +; CHECK-NEXT: //NO_APP +; CHECK-NEXT: addvl sp, x8, #-3 +; CHECK-NEXT: ldr z10, [sp] // 16-byte Folded Reload +; CHECK-NEXT: ldr z9, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z8, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: sub sp, x29, #1024 +; CHECK-NEXT: ldr x19, [sp, #1040] // 8-byte Folded Reload +; CHECK-NEXT: ldr x30, [sp, #1032] // 8-byte Folded Reload +; CHECK-NEXT: ldr x29, [sp, #1024] // 8-byte Folded Reload +; CHECK-NEXT: add sp, sp, #1056 +; CHECK-NEXT: ret + %alloc = alloca i8, i64 %n, align 1 + call void (...) @llvm.fake.use(ptr %alloc) + tail call void asm sideeffect "", "~{z8},~{z9},~{z10}"() + ret void +} + +; PPR+ZPR callee-saves + a VLA +; Expect: Hazard padding, Frame pointer (x29), PPR (p4-p6) and ZPR (z8-z10) +; callee-saves allocated separately, with hazard padding of 1024 between the +; areas. ZPR callee saves restored by `FP - 1024 + addvl #-4`, PPR callee saves +; restored by `FP + addvl #-1`. +define aarch64_sve_vector_pcs void @zpr_ppr_csr_vla(i64 %n) { +; CHECK-LABEL: zpr_ppr_csr_vla: +; CHECK: // %bb.0: +; CHECK-NEXT: stp x29, x30, [sp, #-32]! 
// 16-byte Folded Spill
+; CHECK-NEXT: str x19, [sp, #16] // 8-byte Folded Spill
+; CHECK-NEXT: mov x29, sp
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: str p6, [sp, #5, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p5, [sp, #6, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: sub sp, sp, #1024
+; CHECK-NEXT: addvl sp, sp, #-3
+; CHECK-NEXT: str z10, [sp] // 16-byte Folded Spill
+; CHECK-NEXT: str z9, [sp, #1, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z8, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: sub sp, sp, #1024
+; CHECK-NEXT: mov x19, sp
+; CHECK-NEXT: .cfi_def_cfa w29, 32
+; CHECK-NEXT: .cfi_offset w19, -16
+; CHECK-NEXT: .cfi_offset w30, -24
+; CHECK-NEXT: .cfi_offset w29, -32
+; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x70, 0x1e, 0x22, 0x11, 0xe0, 0x77, 0x22 // $d8 @ cfa - 16 * VG - 1056
+; CHECK-NEXT: .cfi_escape 0x10, 0x49, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x68, 0x1e, 0x22, 0x11, 0xe0, 0x77, 0x22 // $d9 @ cfa - 24 * VG - 1056
+; CHECK-NEXT: .cfi_escape 0x10, 0x4a, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x60, 0x1e, 0x22, 0x11, 0xe0, 0x77, 0x22 // $d10 @ cfa - 32 * VG - 1056
+; CHECK-NEXT: add x9, x0, #15
+; CHECK-NEXT: mov x8, sp
+; CHECK-NEXT: and x9, x9, #0xfffffffffffffff0
+; CHECK-NEXT: sub x8, x8, x9
+; CHECK-NEXT: mov sp, x8
+; CHECK-NEXT: // fake_use: $x8
+; CHECK-NEXT: sub x8, x29, #1024
+; CHECK-NEXT: //APP
+; CHECK-NEXT: //NO_APP
+; CHECK-NEXT: addvl sp, x8, #-4
+; CHECK-NEXT: ldr z10, [sp] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z9, [sp, #1, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z8, [sp, #2, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: addvl sp, x29, #-1
+; CHECK-NEXT: ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: mov sp, x29
+; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload
+; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
+; CHECK-NEXT: ret
+ %alloc = alloca i8, i64 %n, align 1
+ call void (...) @llvm.fake.use(ptr %alloc)
+ tail call void asm sideeffect "", "~{p4},~{p5},~{p6},~{z8},~{z9},~{z10}"()
+ ret void
+}
+
+; Only PPR callee-saves (and ZPR/PPR locals) + a VLA
+; Expect: Hazard padding, Frame pointer (x29), PPR (p4-p6) callee-saves, with
+; hazard padding after the PPR callee saves (1024) and after the FPR local area
+; (1024) -- coalesced to 2048. Only PPRs restored by moving the SP to
+; `FP + addvl #-1`.
+define void @sve_locals_only_ppr_csr_vla(i64 %n, <vscale x 16 x i1> %pred, <vscale x 16 x i8> %vector) {
+; CHECK-LABEL: sve_locals_only_ppr_csr_vla:
+; CHECK: // %bb.0:
+; CHECK-NEXT: stp x29, x30, [sp, #-32]!
// 16-byte Folded Spill +; CHECK-NEXT: str x19, [sp, #16] // 8-byte Folded Spill +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p6, [sp, #5, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p5, [sp, #6, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #2048 +; CHECK-NEXT: addvl sp, sp, #-2 +; CHECK-NEXT: mov x19, sp +; CHECK-NEXT: .cfi_def_cfa w29, 32 +; CHECK-NEXT: .cfi_offset w19, -16 +; CHECK-NEXT: .cfi_offset w30, -24 +; CHECK-NEXT: .cfi_offset w29, -32 +; CHECK-NEXT: add x9, x0, #15 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: and x9, x9, #0xfffffffffffffff0 +; CHECK-NEXT: sub x8, x8, x9 +; CHECK-NEXT: mov sp, x8 +; CHECK-NEXT: // fake_use: $x8 +; CHECK-NEXT: sub x8, x29, #1024 +; CHECK-NEXT: //APP +; CHECK-NEXT: //NO_APP +; CHECK-NEXT: str p0, [x29, #-9, mul vl] +; CHECK-NEXT: str z0, [x8, #-3, mul vl] +; CHECK-NEXT: addvl sp, x29, #-1 +; CHECK-NEXT: ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: mov sp, x29 +; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload +; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; CHECK-NEXT: ret + %alloc = alloca i8, i64 %n, align 1 + %ppr_local = alloca <vscale x 16 x i1> + %zpr_local = alloca <vscale x 16 x i8> + tail call void asm sideeffect "", "~{p4},~{p5},~{p6}"() + call void (...) @llvm.fake.use(ptr %alloc) + store volatile <vscale x 16 x i1> %pred, ptr %ppr_local + store volatile <vscale x 16 x i8> %vector, ptr %zpr_local + ret void +} + +; Only ZPR callee-saves (and ZPR/PPR locals) + a VLA +; Expect: Hazard padding, Frame pointer (x29), ZPR (z8-z10) callee-saves, with +; hazard padding before the ZPR callee saves (1024) and after the ZPR local area +; (1024). Only ZPRs restored by moving the SP to `FP - 1024 + addvl #-4`. +define void @sve_locals_only_zpr_csr_vla(i64 %n, <vscale x 16 x i1> %pred, <vscale x 16 x i8> %vector) { +; CHECK-LABEL: sve_locals_only_zpr_csr_vla: +; CHECK: // %bb.0: +; CHECK-NEXT: stp x29, x30, [sp, #-32]! 
// 16-byte Folded Spill +; CHECK-NEXT: str x19, [sp, #16] // 8-byte Folded Spill +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: sub sp, sp, #1024 +; CHECK-NEXT: addvl sp, sp, #-4 +; CHECK-NEXT: str z10, [sp] // 16-byte Folded Spill +; CHECK-NEXT: str z9, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z8, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #1024 +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: mov x19, sp +; CHECK-NEXT: .cfi_def_cfa w29, 32 +; CHECK-NEXT: .cfi_offset w19, -16 +; CHECK-NEXT: .cfi_offset w30, -24 +; CHECK-NEXT: .cfi_offset w29, -32 +; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x70, 0x1e, 0x22, 0x11, 0xe0, 0x77, 0x22 // $d8 @ cfa - 16 * VG - 1056 +; CHECK-NEXT: .cfi_escape 0x10, 0x49, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x68, 0x1e, 0x22, 0x11, 0xe0, 0x77, 0x22 // $d9 @ cfa - 24 * VG - 1056 +; CHECK-NEXT: .cfi_escape 0x10, 0x4a, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x60, 0x1e, 0x22, 0x11, 0xe0, 0x77, 0x22 // $d10 @ cfa - 32 * VG - 1056 +; CHECK-NEXT: add x9, x0, #15 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: and x9, x9, #0xfffffffffffffff0 +; CHECK-NEXT: sub x8, x8, x9 +; CHECK-NEXT: mov sp, x8 +; CHECK-NEXT: // fake_use: $x8 +; CHECK-NEXT: sub x8, x29, #1024 +; CHECK-NEXT: //APP +; CHECK-NEXT: //NO_APP +; CHECK-NEXT: str p0, [x29, #-1, mul vl] +; CHECK-NEXT: str z0, [x8, #-5, mul vl] +; CHECK-NEXT: addvl sp, x8, #-4 +; CHECK-NEXT: ldr z10, [sp] // 16-byte Folded Reload +; CHECK-NEXT: ldr z9, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z8, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: mov sp, x29 +; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload +; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; CHECK-NEXT: ret + %alloc = alloca i8, i64 %n, align 1 + %ppr_local = alloca <vscale x 16 x i1> + %zpr_local = alloca <vscale x 16 x i8> + tail call void asm sideeffect "", "~{z8},~{z9},~{z10}"() + call void (...) @llvm.fake.use(ptr %alloc) + store volatile <vscale x 16 x i1> %pred, ptr %ppr_local + store volatile <vscale x 16 x i8> %vector, ptr %zpr_local + ret void +} + +; PPR+ZPR callee-saves (and ZPR/PPR locals) + a VLA +; Expect: Hazard padding, Frame pointer (x29), PPR (p4-p6) and ZPR (z8-z10) +; callee-saves, with hazard padding before the ZPR callee saves (1024) and after +; the ZPR local area (1024). ZPRs restored by moving the SP to +; `FP - 1024 + addvl #-5`, PPRs restored by moving SP to `FP + addvl #-1`. +define void @sve_locals_zpr_ppr_csr_vla(i64 %n, <vscale x 16 x i1> %pred, <vscale x 16 x i8> %vector) { +; CHECK-LABEL: sve_locals_zpr_ppr_csr_vla: +; CHECK: // %bb.0: +; CHECK-NEXT: stp x29, x30, [sp, #-32]! 
// 16-byte Folded Spill +; CHECK-NEXT: str x19, [sp, #16] // 8-byte Folded Spill +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p6, [sp, #5, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p5, [sp, #6, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #1024 +; CHECK-NEXT: addvl sp, sp, #-4 +; CHECK-NEXT: str z10, [sp] // 16-byte Folded Spill +; CHECK-NEXT: str z9, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z8, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #1024 +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: mov x19, sp +; CHECK-NEXT: .cfi_def_cfa w29, 32 +; CHECK-NEXT: .cfi_offset w19, -16 +; CHECK-NEXT: .cfi_offset w30, -24 +; CHECK-NEXT: .cfi_offset w29, -32 +; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x68, 0x1e, 0x22, 0x11, 0xe0, 0x77, 0x22 // $d8 @ cfa - 24 * VG - 1056 +; CHECK-NEXT: .cfi_escape 0x10, 0x49, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x60, 0x1e, 0x22, 0x11, 0xe0, 0x77, 0x22 // $d9 @ cfa - 32 * VG - 1056 +; CHECK-NEXT: .cfi_escape 0x10, 0x4a, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x58, 0x1e, 0x22, 0x11, 0xe0, 0x77, 0x22 // $d10 @ cfa - 40 * VG - 1056 +; CHECK-NEXT: add x9, x0, #15 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: and x9, x9, #0xfffffffffffffff0 +; CHECK-NEXT: sub x8, x8, x9 +; CHECK-NEXT: mov sp, x8 +; CHECK-NEXT: // fake_use: $x8 +; CHECK-NEXT: sub x8, x29, #1024 +; CHECK-NEXT: //APP +; CHECK-NEXT: //NO_APP +; CHECK-NEXT: str p0, [x29, #-9, mul vl] +; CHECK-NEXT: str z0, [x8, #-6, mul vl] +; CHECK-NEXT: addvl sp, x8, #-5 +; CHECK-NEXT: ldr z10, [sp] // 16-byte Folded Reload +; CHECK-NEXT: ldr z9, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z8, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: addvl sp, x29, #-1 +; CHECK-NEXT: ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: mov sp, x29 +; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload +; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; CHECK-NEXT: ret + %alloc = alloca i8, i64 %n, align 1 + %ppr_local = alloca <vscale x 16 x i1> + %zpr_local = alloca <vscale x 16 x i8> + tail call void asm sideeffect "", "~{p4},~{p5},~{p6},~{z8},~{z9},~{z10}"() + call void (...) @llvm.fake.use(ptr %alloc) + store volatile <vscale x 16 x i1> %pred, ptr %ppr_local + store volatile <vscale x 16 x i8> %vector, ptr %zpr_local + ret void +} diff --git a/llvm/test/CodeGen/AArch64/stack-hazard.ll b/llvm/test/CodeGen/AArch64/stack-hazard.ll index bdee359487ce6..70874761b82ab 100644 --- a/llvm/test/CodeGen/AArch64/stack-hazard.ll +++ b/llvm/test/CodeGen/AArch64/stack-hazard.ll @@ -3512,14 +3512,13 @@ define i32 @svecc_call_dynamic_alloca(<4 x i16> %P0, i32 %P1, i32 %P2, <vscale x ; ; CHECK64-LABEL: svecc_call_dynamic_alloca: ; CHECK64: // %bb.0: // %entry -; CHECK64-NEXT: sub sp, sp, #128 -; CHECK64-NEXT: .cfi_def_cfa_offset 128 +; CHECK64-NEXT: stp x29, x30, [sp, #-64]! 
// 16-byte Folded Spill +; CHECK64-NEXT: .cfi_def_cfa_offset 64 ; CHECK64-NEXT: cntd x9 -; CHECK64-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill -; CHECK64-NEXT: stp x9, x28, [sp, #80] // 16-byte Folded Spill -; CHECK64-NEXT: stp x27, x26, [sp, #96] // 16-byte Folded Spill -; CHECK64-NEXT: stp x20, x19, [sp, #112] // 16-byte Folded Spill -; CHECK64-NEXT: add x29, sp, #64 +; CHECK64-NEXT: stp x27, x26, [sp, #32] // 16-byte Folded Spill +; CHECK64-NEXT: stp x9, x28, [sp, #16] // 16-byte Folded Spill +; CHECK64-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill +; CHECK64-NEXT: mov x29, sp ; CHECK64-NEXT: .cfi_def_cfa w29, 64 ; CHECK64-NEXT: .cfi_offset w19, -8 ; CHECK64-NEXT: .cfi_offset w20, -16 @@ -3529,7 +3528,7 @@ define i32 @svecc_call_dynamic_alloca(<4 x i16> %P0, i32 %P1, i32 %P2, <vscale x ; CHECK64-NEXT: .cfi_offset vg, -48 ; CHECK64-NEXT: .cfi_offset w30, -56 ; CHECK64-NEXT: .cfi_offset w29, -64 -; CHECK64-NEXT: addvl sp, sp, #-18 +; CHECK64-NEXT: addvl sp, sp, #-2 ; CHECK64-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill ; CHECK64-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill ; CHECK64-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill @@ -3542,30 +3541,32 @@ define i32 @svecc_call_dynamic_alloca(<4 x i16> %P0, i32 %P1, i32 %P2, <vscale x ; CHECK64-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill ; CHECK64-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill ; CHECK64-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill -; CHECK64-NEXT: str z23, [sp, #2, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z22, [sp, #3, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z21, [sp, #4, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z20, [sp, #5, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z19, [sp, #6, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z18, [sp, #7, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z17, [sp, #8, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z16, [sp, #9, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z15, [sp, #10, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z14, [sp, #11, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z13, [sp, #12, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z12, [sp, #13, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z11, [sp, #14, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: .cfi_escape 0x10, 0x48, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x78, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d8 @ cfa - 8 * IncomingVG - 128 -; CHECK64-NEXT: .cfi_escape 0x10, 0x49, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x70, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d9 @ cfa - 16 * IncomingVG - 128 -; CHECK64-NEXT: .cfi_escape 0x10, 0x4a, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x68, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d10 @ cfa - 24 * IncomingVG - 128 -; CHECK64-NEXT: .cfi_escape 0x10, 0x4b, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x60, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d11 @ cfa - 32 * IncomingVG - 128 -; CHECK64-NEXT: .cfi_escape 0x10, 0x4c, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x58, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d12 @ cfa - 40 * IncomingVG - 128 -; CHECK64-NEXT: .cfi_escape 0x10, 0x4d, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x50, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d13 @ cfa - 48 * IncomingVG - 128 -; 
CHECK64-NEXT: .cfi_escape 0x10, 0x4e, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x48, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d14 @ cfa - 56 * IncomingVG - 128 -; CHECK64-NEXT: .cfi_escape 0x10, 0x4f, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x40, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d15 @ cfa - 64 * IncomingVG - 128 +; CHECK64-NEXT: sub sp, sp, #64 +; CHECK64-NEXT: addvl sp, sp, #-16 +; CHECK64-NEXT: str z23, [sp] // 16-byte Folded Spill +; CHECK64-NEXT: str z22, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z21, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z20, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z19, [sp, #4, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z18, [sp, #5, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z17, [sp, #6, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z16, [sp, #7, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z15, [sp, #8, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z14, [sp, #9, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z13, [sp, #10, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z12, [sp, #11, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z8, [sp, #15, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: .cfi_escape 0x10, 0x48, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x68, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d8 @ cfa - 24 * IncomingVG - 128 +; CHECK64-NEXT: .cfi_escape 0x10, 0x49, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x60, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d9 @ cfa - 32 * IncomingVG - 128 +; CHECK64-NEXT: .cfi_escape 0x10, 0x4a, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x58, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d10 @ cfa - 40 * IncomingVG - 128 +; CHECK64-NEXT: .cfi_escape 0x10, 0x4b, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x50, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d11 @ cfa - 48 * IncomingVG - 128 +; CHECK64-NEXT: .cfi_escape 0x10, 0x4c, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x48, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d12 @ cfa - 56 * IncomingVG - 128 +; CHECK64-NEXT: .cfi_escape 0x10, 0x4d, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x40, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d13 @ cfa - 64 * IncomingVG - 128 +; CHECK64-NEXT: .cfi_escape 0x10, 0x4e, 0x0e, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0xb8, 0x7f, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d14 @ cfa - 72 * IncomingVG - 128 +; CHECK64-NEXT: .cfi_escape 0x10, 0x4f, 0x0e, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0xb0, 0x7f, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d15 @ cfa - 80 * IncomingVG - 128 ; CHECK64-NEXT: sub sp, sp, #64 ; CHECK64-NEXT: mov x19, sp ; CHECK64-NEXT: mov w2, w1 @@ -3595,22 +3596,31 @@ define i32 @svecc_call_dynamic_alloca(<4 x i16> %P0, i32 %P1, i32 %P2, <vscale x ; CHECK64-NEXT: sub x8, x29, #64 ; CHECK64-NEXT: movk w0, #59491, lsl #16 ; CHECK64-NEXT: addvl sp, x8, #-18 -; CHECK64-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: 
ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z23, [sp] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z22, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z21, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z20, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z19, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z18, [sp, #5, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z17, [sp, #6, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z16, [sp, #7, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z15, [sp, #8, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z14, [sp, #9, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z13, [sp, #10, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z12, [sp, #11, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z8, [sp, #15, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: addvl sp, x29, #-2 +; CHECK64-NEXT: .cfi_restore z8 +; CHECK64-NEXT: .cfi_restore z9 +; CHECK64-NEXT: .cfi_restore z10 +; CHECK64-NEXT: .cfi_restore z11 +; CHECK64-NEXT: .cfi_restore z12 +; CHECK64-NEXT: .cfi_restore z13 +; CHECK64-NEXT: .cfi_restore z14 +; CHECK64-NEXT: .cfi_restore z15 ; CHECK64-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload ; CHECK64-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload ; CHECK64-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload @@ -3623,21 +3633,12 @@ define i32 @svecc_call_dynamic_alloca(<4 x i16> %P0, i32 %P1, i32 %P2, <vscale x ; CHECK64-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload ; CHECK64-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload ; CHECK64-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload -; CHECK64-NEXT: .cfi_restore z8 -; CHECK64-NEXT: .cfi_restore z9 -; CHECK64-NEXT: .cfi_restore z10 -; CHECK64-NEXT: .cfi_restore z11 -; CHECK64-NEXT: .cfi_restore z12 -; CHECK64-NEXT: .cfi_restore z13 -; CHECK64-NEXT: .cfi_restore z14 -; CHECK64-NEXT: .cfi_restore z15 -; CHECK64-NEXT: sub sp, x29, #64 -; CHECK64-NEXT: .cfi_def_cfa wsp, 128 -; CHECK64-NEXT: ldp x20, x19, [sp, #112] // 16-byte Folded Reload -; CHECK64-NEXT: ldr x28, [sp, #88] // 8-byte Folded Reload -; CHECK64-NEXT: ldp x27, x26, [sp, #96] // 16-byte Folded Reload -; CHECK64-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload -; CHECK64-NEXT: add sp, sp, #128 +; CHECK64-NEXT: mov sp, x29 +; CHECK64-NEXT: .cfi_def_cfa wsp, 64 +; CHECK64-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload +; CHECK64-NEXT: ldr x28, [sp, #24] // 8-byte Folded Reload +; CHECK64-NEXT: ldp x27, x26, [sp, #32] // 16-byte Folded Reload +; CHECK64-NEXT: ldp x29, x30, [sp], #64 // 16-byte Folded Reload ; CHECK64-NEXT: .cfi_def_cfa_offset 0 ; CHECK64-NEXT: .cfi_restore w19 ; CHECK64-NEXT: .cfi_restore w20 
@@ -3649,305 +3650,444 @@ define i32 @svecc_call_dynamic_alloca(<4 x i16> %P0, i32 %P1, i32 %P2, <vscale x ; CHECK64-NEXT: .cfi_restore w29 ; CHECK64-NEXT: ret ; -; CHECK1024-LABEL: svecc_call_dynamic_alloca: -; CHECK1024: // %bb.0: // %entry -; CHECK1024-NEXT: sub sp, sp, #1088 -; CHECK1024-NEXT: .cfi_def_cfa_offset 1088 -; CHECK1024-NEXT: cntd x9 -; CHECK1024-NEXT: str x29, [sp, #1024] // 8-byte Folded Spill -; CHECK1024-NEXT: str x30, [sp, #1032] // 8-byte Folded Spill -; CHECK1024-NEXT: str x9, [sp, #1040] // 8-byte Folded Spill -; CHECK1024-NEXT: str x28, [sp, #1048] // 8-byte Folded Spill -; CHECK1024-NEXT: str x27, [sp, #1056] // 8-byte Folded Spill -; CHECK1024-NEXT: str x26, [sp, #1064] // 8-byte Folded Spill -; CHECK1024-NEXT: str x20, [sp, #1072] // 8-byte Folded Spill -; CHECK1024-NEXT: str x19, [sp, #1080] // 8-byte Folded Spill -; CHECK1024-NEXT: add x29, sp, #1024 -; CHECK1024-NEXT: .cfi_def_cfa w29, 64 -; CHECK1024-NEXT: .cfi_offset w19, -8 -; CHECK1024-NEXT: .cfi_offset w20, -16 -; CHECK1024-NEXT: .cfi_offset w26, -24 -; CHECK1024-NEXT: .cfi_offset w27, -32 -; CHECK1024-NEXT: .cfi_offset w28, -40 -; CHECK1024-NEXT: .cfi_offset vg, -48 -; CHECK1024-NEXT: .cfi_offset w30, -56 -; CHECK1024-NEXT: .cfi_offset w29, -64 -; CHECK1024-NEXT: addvl sp, sp, #-18 -; CHECK1024-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str z23, [sp, #2, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z22, [sp, #3, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z21, [sp, #4, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z20, [sp, #5, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z19, [sp, #6, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z18, [sp, #7, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z17, [sp, #8, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z16, [sp, #9, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z15, [sp, #10, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z14, [sp, #11, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z13, [sp, #12, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z12, [sp, #13, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z11, [sp, #14, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: .cfi_escape 0x10, 0x48, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x78, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d8 @ cfa - 8 * IncomingVG - 1088 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x49, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x70, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d9 @ cfa - 16 * IncomingVG - 1088 -; 
CHECK1024-NEXT: .cfi_escape 0x10, 0x4a, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x68, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d10 @ cfa - 24 * IncomingVG - 1088 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x4b, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x60, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d11 @ cfa - 32 * IncomingVG - 1088 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x4c, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x58, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d12 @ cfa - 40 * IncomingVG - 1088 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x4d, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x50, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d13 @ cfa - 48 * IncomingVG - 1088 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x4e, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x48, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d14 @ cfa - 56 * IncomingVG - 1088 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x4f, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x40, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d15 @ cfa - 64 * IncomingVG - 1088 -; CHECK1024-NEXT: sub sp, sp, #1024 -; CHECK1024-NEXT: mov x19, sp -; CHECK1024-NEXT: mov w2, w1 -; CHECK1024-NEXT: mov w8, w0 -; CHECK1024-NEXT: bl __arm_sme_state -; CHECK1024-NEXT: mov w8, w8 -; CHECK1024-NEXT: mov x9, sp -; CHECK1024-NEXT: mov x20, x0 -; CHECK1024-NEXT: add x8, x8, #15 -; CHECK1024-NEXT: and x8, x8, #0x1fffffff0 -; CHECK1024-NEXT: sub x8, x9, x8 -; CHECK1024-NEXT: mov sp, x8 -; CHECK1024-NEXT: //APP -; CHECK1024-NEXT: //NO_APP -; CHECK1024-NEXT: tbz w20, #0, .LBB35_2 -; CHECK1024-NEXT: // %bb.1: // %entry -; CHECK1024-NEXT: smstop sm -; CHECK1024-NEXT: .LBB35_2: // %entry -; CHECK1024-NEXT: mov x0, x8 -; CHECK1024-NEXT: mov w1, #45 // =0x2d -; CHECK1024-NEXT: bl memset -; CHECK1024-NEXT: tbz w20, #0, .LBB35_4 -; CHECK1024-NEXT: // %bb.3: // %entry -; CHECK1024-NEXT: smstart sm -; CHECK1024-NEXT: .LBB35_4: // %entry -; CHECK1024-NEXT: mov w0, #22647 // =0x5877 -; CHECK1024-NEXT: sub x8, x29, #1024 -; CHECK1024-NEXT: movk w0, #59491, lsl #16 -; CHECK1024-NEXT: addvl sp, x8, #-18 -; CHECK1024-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload -; 
CHECK1024-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: .cfi_restore z8 -; CHECK1024-NEXT: .cfi_restore z9 -; CHECK1024-NEXT: .cfi_restore z10 -; CHECK1024-NEXT: .cfi_restore z11 -; CHECK1024-NEXT: .cfi_restore z12 -; CHECK1024-NEXT: .cfi_restore z13 -; CHECK1024-NEXT: .cfi_restore z14 -; CHECK1024-NEXT: .cfi_restore z15 -; CHECK1024-NEXT: sub sp, x29, #1024 -; CHECK1024-NEXT: .cfi_def_cfa wsp, 1088 -; CHECK1024-NEXT: ldr x19, [sp, #1080] // 8-byte Folded Reload -; CHECK1024-NEXT: ldr x20, [sp, #1072] // 8-byte Folded Reload -; CHECK1024-NEXT: ldr x26, [sp, #1064] // 8-byte Folded Reload -; CHECK1024-NEXT: ldr x27, [sp, #1056] // 8-byte Folded Reload -; CHECK1024-NEXT: ldr x28, [sp, #1048] // 8-byte Folded Reload -; CHECK1024-NEXT: ldr x30, [sp, #1032] // 8-byte Folded Reload -; CHECK1024-NEXT: ldr x29, [sp, #1024] // 8-byte Folded Reload -; CHECK1024-NEXT: add sp, sp, #1088 -; CHECK1024-NEXT: .cfi_def_cfa_offset 0 -; CHECK1024-NEXT: .cfi_restore w19 -; CHECK1024-NEXT: .cfi_restore w20 -; CHECK1024-NEXT: .cfi_restore w26 -; CHECK1024-NEXT: .cfi_restore w27 -; CHECK1024-NEXT: .cfi_restore w28 -; CHECK1024-NEXT: .cfi_restore vg -; CHECK1024-NEXT: .cfi_restore w30 -; CHECK1024-NEXT: .cfi_restore w29 -; CHECK1024-NEXT: ret -entry: - %ptr = alloca i8, i32 %P1 - tail call void asm sideeffect "", "~{x0},~{x28},~{x27},~{x3}"() #2 - %call = call ptr @memset(ptr noundef nonnull %ptr, i32 noundef 45, i32 noundef %P2) - ret i32 -396142473 -} - - -define i32 @svecc_call_realign(<4 x i16> %P0, i32 %P1, i32 %P2, <vscale x 16 x i8> %P3, i16 %P4) "aarch64_pstate_sm_compatible" { -; CHECK0-LABEL: svecc_call_realign: -; CHECK0: // %bb.0: // %entry -; CHECK0-NEXT: stp x29, x30, [sp, #-64]! 
// 16-byte Folded Spill -; CHECK0-NEXT: .cfi_def_cfa_offset 64 -; CHECK0-NEXT: cntd x9 -; CHECK0-NEXT: stp x28, x27, [sp, #32] // 16-byte Folded Spill -; CHECK0-NEXT: str x9, [sp, #16] // 8-byte Folded Spill -; CHECK0-NEXT: stp x26, x19, [sp, #48] // 16-byte Folded Spill -; CHECK0-NEXT: mov x29, sp -; CHECK0-NEXT: .cfi_def_cfa w29, 64 -; CHECK0-NEXT: .cfi_offset w19, -8 -; CHECK0-NEXT: .cfi_offset w26, -16 -; CHECK0-NEXT: .cfi_offset w27, -24 -; CHECK0-NEXT: .cfi_offset w28, -32 -; CHECK0-NEXT: .cfi_offset vg, -48 -; CHECK0-NEXT: .cfi_offset w30, -56 -; CHECK0-NEXT: .cfi_offset w29, -64 -; CHECK0-NEXT: addvl sp, sp, #-18 -; CHECK0-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill -; CHECK0-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill -; CHECK0-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill -; CHECK0-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill -; CHECK0-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill -; CHECK0-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill -; CHECK0-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill -; CHECK0-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill -; CHECK0-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill -; CHECK0-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill -; CHECK0-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill -; CHECK0-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill -; CHECK0-NEXT: str z23, [sp, #2, mul vl] // 16-byte Folded Spill -; CHECK0-NEXT: str z22, [sp, #3, mul vl] // 16-byte Folded Spill -; CHECK0-NEXT: str z21, [sp, #4, mul vl] // 16-byte Folded Spill -; CHECK0-NEXT: str z20, [sp, #5, mul vl] // 16-byte Folded Spill -; CHECK0-NEXT: str z19, [sp, #6, mul vl] // 16-byte Folded Spill -; CHECK0-NEXT: str z18, [sp, #7, mul vl] // 16-byte Folded Spill -; CHECK0-NEXT: str z17, [sp, #8, mul vl] // 16-byte Folded Spill -; CHECK0-NEXT: str z16, [sp, #9, mul vl] // 16-byte Folded Spill -; CHECK0-NEXT: str z15, [sp, #10, mul vl] // 16-byte Folded Spill -; CHECK0-NEXT: str z14, [sp, #11, mul vl] // 16-byte Folded Spill -; CHECK0-NEXT: str z13, [sp, #12, mul vl] // 16-byte Folded Spill -; CHECK0-NEXT: str z12, [sp, #13, mul vl] // 16-byte Folded Spill -; CHECK0-NEXT: str z11, [sp, #14, mul vl] // 16-byte Folded Spill -; CHECK0-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill -; CHECK0-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill -; CHECK0-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill -; CHECK0-NEXT: .cfi_escape 0x10, 0x48, 0x0c, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x78, 0x1e, 0x22, 0x11, 0x40, 0x22 // $d8 @ cfa - 8 * IncomingVG - 64 -; CHECK0-NEXT: .cfi_escape 0x10, 0x49, 0x0c, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x70, 0x1e, 0x22, 0x11, 0x40, 0x22 // $d9 @ cfa - 16 * IncomingVG - 64 -; CHECK0-NEXT: .cfi_escape 0x10, 0x4a, 0x0c, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x68, 0x1e, 0x22, 0x11, 0x40, 0x22 // $d10 @ cfa - 24 * IncomingVG - 64 -; CHECK0-NEXT: .cfi_escape 0x10, 0x4b, 0x0c, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x60, 0x1e, 0x22, 0x11, 0x40, 0x22 // $d11 @ cfa - 32 * IncomingVG - 64 -; CHECK0-NEXT: .cfi_escape 0x10, 0x4c, 0x0c, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x58, 0x1e, 0x22, 0x11, 0x40, 0x22 // $d12 @ cfa - 40 * IncomingVG - 64 -; CHECK0-NEXT: .cfi_escape 0x10, 0x4d, 0x0c, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x50, 0x1e, 0x22, 0x11, 0x40, 0x22 // $d13 @ cfa - 48 * IncomingVG - 64 -; CHECK0-NEXT: .cfi_escape 0x10, 0x4e, 0x0c, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x48, 0x1e, 0x22, 0x11, 0x40, 0x22 // $d14 @ cfa - 56 * IncomingVG - 64 -; 
CHECK0-NEXT: .cfi_escape 0x10, 0x4f, 0x0c, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x40, 0x1e, 0x22, 0x11, 0x40, 0x22 // $d15 @ cfa - 64 * IncomingVG - 64 -; CHECK0-NEXT: sub x9, sp, #1024 -; CHECK0-NEXT: and sp, x9, #0xffffffffffffffe0 -; CHECK0-NEXT: mov w2, w1 -; CHECK0-NEXT: bl __arm_sme_state -; CHECK0-NEXT: mov x19, x0 -; CHECK0-NEXT: //APP -; CHECK0-NEXT: //NO_APP -; CHECK0-NEXT: tbz w19, #0, .LBB36_2 -; CHECK0-NEXT: // %bb.1: // %entry -; CHECK0-NEXT: smstop sm -; CHECK0-NEXT: .LBB36_2: // %entry -; CHECK0-NEXT: mov x0, sp -; CHECK0-NEXT: mov w1, #45 // =0x2d -; CHECK0-NEXT: bl memset -; CHECK0-NEXT: tbz w19, #0, .LBB36_4 -; CHECK0-NEXT: // %bb.3: // %entry -; CHECK0-NEXT: smstart sm -; CHECK0-NEXT: .LBB36_4: // %entry -; CHECK0-NEXT: mov w0, #22647 // =0x5877 -; CHECK0-NEXT: movk w0, #59491, lsl #16 -; CHECK0-NEXT: addvl sp, x29, #-18 -; CHECK0-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload -; CHECK0-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload -; CHECK0-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload -; CHECK0-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload -; CHECK0-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload -; CHECK0-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload -; CHECK0-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload -; CHECK0-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload -; CHECK0-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload -; CHECK0-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload -; CHECK0-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload -; CHECK0-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload -; CHECK0-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload -; CHECK0-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload -; CHECK0-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload -; CHECK0-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload -; CHECK0-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload -; CHECK0-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload -; CHECK0-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload -; CHECK0-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload -; CHECK0-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload -; CHECK0-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload -; CHECK0-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload -; CHECK0-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload -; CHECK0-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload -; CHECK0-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload -; CHECK0-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload -; CHECK0-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload -; CHECK0-NEXT: .cfi_restore z8 -; CHECK0-NEXT: .cfi_restore z9 -; CHECK0-NEXT: .cfi_restore z10 -; CHECK0-NEXT: .cfi_restore z11 -; CHECK0-NEXT: .cfi_restore z12 -; CHECK0-NEXT: .cfi_restore z13 -; CHECK0-NEXT: .cfi_restore z14 -; CHECK0-NEXT: .cfi_restore z15 -; CHECK0-NEXT: mov sp, x29 -; CHECK0-NEXT: .cfi_def_cfa wsp, 64 -; CHECK0-NEXT: ldp x26, x19, [sp, #48] // 16-byte Folded Reload -; CHECK0-NEXT: ldp x28, x27, [sp, #32] // 16-byte Folded Reload -; CHECK0-NEXT: ldp x29, x30, [sp], #64 // 16-byte Folded Reload -; CHECK0-NEXT: .cfi_def_cfa_offset 0 -; CHECK0-NEXT: .cfi_restore w19 -; CHECK0-NEXT: .cfi_restore w26 -; CHECK0-NEXT: .cfi_restore w27 -; CHECK0-NEXT: .cfi_restore w28 -; CHECK0-NEXT: .cfi_restore vg -; CHECK0-NEXT: .cfi_restore w30 -; CHECK0-NEXT: .cfi_restore w29 -; CHECK0-NEXT: ret -; -; CHECK64-LABEL: 
svecc_call_realign: -; CHECK64: // %bb.0: // %entry -; CHECK64-NEXT: sub sp, sp, #128 -; CHECK64-NEXT: .cfi_def_cfa_offset 128 -; CHECK64-NEXT: cntd x9 -; CHECK64-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill -; CHECK64-NEXT: stp x9, x28, [sp, #80] // 16-byte Folded Spill -; CHECK64-NEXT: stp x27, x26, [sp, #96] // 16-byte Folded Spill -; CHECK64-NEXT: str x19, [sp, #112] // 8-byte Folded Spill -; CHECK64-NEXT: add x29, sp, #64 -; CHECK64-NEXT: .cfi_def_cfa w29, 64 -; CHECK64-NEXT: .cfi_offset w19, -16 -; CHECK64-NEXT: .cfi_offset w26, -24 -; CHECK64-NEXT: .cfi_offset w27, -32 -; CHECK64-NEXT: .cfi_offset w28, -40 +; CHECK1024-NOSPLITSVE-LABEL: svecc_call_dynamic_alloca: +; CHECK1024-NOSPLITSVE: // %bb.0: // %entry +; CHECK1024-NOSPLITSVE-NEXT: sub sp, sp, #1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_def_cfa_offset 1088 +; CHECK1024-NOSPLITSVE-NEXT: cntd x9 +; CHECK1024-NOSPLITSVE-NEXT: str x29, [sp, #1024] // 8-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str x30, [sp, #1032] // 8-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str x9, [sp, #1040] // 8-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str x28, [sp, #1048] // 8-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str x27, [sp, #1056] // 8-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str x26, [sp, #1064] // 8-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str x20, [sp, #1072] // 8-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str x19, [sp, #1080] // 8-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: add x29, sp, #1024 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_def_cfa w29, 64 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w19, -8 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w20, -16 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w26, -24 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w27, -32 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w28, -40 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset vg, -48 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w30, -56 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w29, -64 +; CHECK1024-NOSPLITSVE-NEXT: addvl sp, sp, #-18 +; CHECK1024-NOSPLITSVE-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z23, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z22, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z21, [sp, #4, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z20, [sp, #5, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z19, [sp, #6, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z18, [sp, #7, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z17, [sp, #8, mul vl] // 16-byte Folded Spill +; 
CHECK1024-NOSPLITSVE-NEXT: str z16, [sp, #9, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z15, [sp, #10, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z14, [sp, #11, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z13, [sp, #12, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z12, [sp, #13, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z11, [sp, #14, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x48, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x78, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d8 @ cfa - 8 * IncomingVG - 1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x49, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x70, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d9 @ cfa - 16 * IncomingVG - 1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4a, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x68, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d10 @ cfa - 24 * IncomingVG - 1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4b, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x60, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d11 @ cfa - 32 * IncomingVG - 1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4c, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x58, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d12 @ cfa - 40 * IncomingVG - 1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4d, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x50, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d13 @ cfa - 48 * IncomingVG - 1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4e, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x48, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d14 @ cfa - 56 * IncomingVG - 1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4f, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x40, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d15 @ cfa - 64 * IncomingVG - 1088 +; CHECK1024-NOSPLITSVE-NEXT: sub sp, sp, #1024 +; CHECK1024-NOSPLITSVE-NEXT: mov x19, sp +; CHECK1024-NOSPLITSVE-NEXT: mov w2, w1 +; CHECK1024-NOSPLITSVE-NEXT: mov w8, w0 +; CHECK1024-NOSPLITSVE-NEXT: bl __arm_sme_state +; CHECK1024-NOSPLITSVE-NEXT: mov w8, w8 +; CHECK1024-NOSPLITSVE-NEXT: mov x9, sp +; CHECK1024-NOSPLITSVE-NEXT: mov x20, x0 +; CHECK1024-NOSPLITSVE-NEXT: add x8, x8, #15 +; CHECK1024-NOSPLITSVE-NEXT: and x8, x8, #0x1fffffff0 +; CHECK1024-NOSPLITSVE-NEXT: sub x8, x9, x8 +; CHECK1024-NOSPLITSVE-NEXT: mov sp, x8 +; CHECK1024-NOSPLITSVE-NEXT: //APP +; CHECK1024-NOSPLITSVE-NEXT: //NO_APP +; CHECK1024-NOSPLITSVE-NEXT: tbz w20, #0, .LBB35_2 +; CHECK1024-NOSPLITSVE-NEXT: // %bb.1: // %entry +; CHECK1024-NOSPLITSVE-NEXT: smstop sm +; CHECK1024-NOSPLITSVE-NEXT: .LBB35_2: // %entry +; CHECK1024-NOSPLITSVE-NEXT: mov x0, x8 +; CHECK1024-NOSPLITSVE-NEXT: mov w1, #45 // =0x2d +; CHECK1024-NOSPLITSVE-NEXT: bl memset +; CHECK1024-NOSPLITSVE-NEXT: tbz w20, #0, .LBB35_4 +; CHECK1024-NOSPLITSVE-NEXT: // %bb.3: // %entry +; CHECK1024-NOSPLITSVE-NEXT: smstart sm +; CHECK1024-NOSPLITSVE-NEXT: .LBB35_4: // %entry +; CHECK1024-NOSPLITSVE-NEXT: mov w0, #22647 // =0x5877 +; CHECK1024-NOSPLITSVE-NEXT: sub x8, x29, #1024 +; CHECK1024-NOSPLITSVE-NEXT: movk w0, #59491, lsl #16 +; CHECK1024-NOSPLITSVE-NEXT: addvl sp, x8, #-18 +; CHECK1024-NOSPLITSVE-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload +; 
CHECK1024-NOSPLITSVE-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z8 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z9 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z10 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z11 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z12 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z13 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z14 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z15 +; CHECK1024-NOSPLITSVE-NEXT: sub sp, x29, #1024 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_def_cfa wsp, 1088 +; CHECK1024-NOSPLITSVE-NEXT: ldr x19, [sp, #1080] // 8-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr x20, [sp, #1072] // 8-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr x26, [sp, #1064] // 8-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr x27, [sp, #1056] // 8-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr x28, [sp, #1048] // 8-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr x30, [sp, #1032] // 8-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr x29, [sp, #1024] // 8-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: add sp, sp, #1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_def_cfa_offset 0 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore w19 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore w20 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore w26 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore w27 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore w28 +; CHECK1024-NOSPLITSVE-NEXT: 
.cfi_restore vg +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore w30 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore w29 +; CHECK1024-NOSPLITSVE-NEXT: ret +; +; CHECK1024-SPLITSVE-LABEL: svecc_call_dynamic_alloca: +; CHECK1024-SPLITSVE: // %bb.0: // %entry +; CHECK1024-SPLITSVE-NEXT: stp x29, x30, [sp, #-64]! // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: .cfi_def_cfa_offset 64 +; CHECK1024-SPLITSVE-NEXT: cntd x9 +; CHECK1024-SPLITSVE-NEXT: stp x27, x26, [sp, #32] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: stp x9, x28, [sp, #16] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: mov x29, sp +; CHECK1024-SPLITSVE-NEXT: .cfi_def_cfa w29, 64 +; CHECK1024-SPLITSVE-NEXT: .cfi_offset w19, -8 +; CHECK1024-SPLITSVE-NEXT: .cfi_offset w20, -16 +; CHECK1024-SPLITSVE-NEXT: .cfi_offset w26, -24 +; CHECK1024-SPLITSVE-NEXT: .cfi_offset w27, -32 +; CHECK1024-SPLITSVE-NEXT: .cfi_offset w28, -40 +; CHECK1024-SPLITSVE-NEXT: .cfi_offset vg, -48 +; CHECK1024-SPLITSVE-NEXT: .cfi_offset w30, -56 +; CHECK1024-SPLITSVE-NEXT: .cfi_offset w29, -64 +; CHECK1024-SPLITSVE-NEXT: addvl sp, sp, #-2 +; CHECK1024-SPLITSVE-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: sub sp, sp, #1024 +; CHECK1024-SPLITSVE-NEXT: addvl sp, sp, #-16 +; CHECK1024-SPLITSVE-NEXT: str z23, [sp] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z22, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z21, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z20, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z19, [sp, #4, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z18, [sp, #5, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z17, [sp, #6, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z16, [sp, #7, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z15, [sp, #8, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z14, [sp, #9, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z13, [sp, #10, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z12, [sp, #11, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z8, [sp, #15, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x48, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x68, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 
// $d8 @ cfa - 24 * IncomingVG - 1088 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x49, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x60, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d9 @ cfa - 32 * IncomingVG - 1088 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4a, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x58, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d10 @ cfa - 40 * IncomingVG - 1088 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4b, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x50, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d11 @ cfa - 48 * IncomingVG - 1088 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4c, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x48, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d12 @ cfa - 56 * IncomingVG - 1088 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4d, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x40, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d13 @ cfa - 64 * IncomingVG - 1088 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4e, 0x0e, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0xb8, 0x7f, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d14 @ cfa - 72 * IncomingVG - 1088 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4f, 0x0e, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0xb0, 0x7f, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d15 @ cfa - 80 * IncomingVG - 1088 +; CHECK1024-SPLITSVE-NEXT: sub sp, sp, #1024 +; CHECK1024-SPLITSVE-NEXT: mov x19, sp +; CHECK1024-SPLITSVE-NEXT: mov w2, w1 +; CHECK1024-SPLITSVE-NEXT: mov w8, w0 +; CHECK1024-SPLITSVE-NEXT: bl __arm_sme_state +; CHECK1024-SPLITSVE-NEXT: mov w8, w8 +; CHECK1024-SPLITSVE-NEXT: mov x9, sp +; CHECK1024-SPLITSVE-NEXT: mov x20, x0 +; CHECK1024-SPLITSVE-NEXT: add x8, x8, #15 +; CHECK1024-SPLITSVE-NEXT: and x8, x8, #0x1fffffff0 +; CHECK1024-SPLITSVE-NEXT: sub x8, x9, x8 +; CHECK1024-SPLITSVE-NEXT: mov sp, x8 +; CHECK1024-SPLITSVE-NEXT: //APP +; CHECK1024-SPLITSVE-NEXT: //NO_APP +; CHECK1024-SPLITSVE-NEXT: tbz w20, #0, .LBB35_2 +; CHECK1024-SPLITSVE-NEXT: // %bb.1: // %entry +; CHECK1024-SPLITSVE-NEXT: smstop sm +; CHECK1024-SPLITSVE-NEXT: .LBB35_2: // %entry +; CHECK1024-SPLITSVE-NEXT: mov x0, x8 +; CHECK1024-SPLITSVE-NEXT: mov w1, #45 // =0x2d +; CHECK1024-SPLITSVE-NEXT: bl memset +; CHECK1024-SPLITSVE-NEXT: tbz w20, #0, .LBB35_4 +; CHECK1024-SPLITSVE-NEXT: // %bb.3: // %entry +; CHECK1024-SPLITSVE-NEXT: smstart sm +; CHECK1024-SPLITSVE-NEXT: .LBB35_4: // %entry +; CHECK1024-SPLITSVE-NEXT: mov w0, #22647 // =0x5877 +; CHECK1024-SPLITSVE-NEXT: sub x8, x29, #1024 +; CHECK1024-SPLITSVE-NEXT: movk w0, #59491, lsl #16 +; CHECK1024-SPLITSVE-NEXT: addvl sp, x8, #-18 +; CHECK1024-SPLITSVE-NEXT: ldr z23, [sp] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z22, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z21, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z20, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z19, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z18, [sp, #5, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z17, [sp, #6, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z16, [sp, #7, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z15, [sp, #8, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z14, [sp, #9, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z13, [sp, #10, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z12, [sp, #11, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded 
Reload +; CHECK1024-SPLITSVE-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z8, [sp, #15, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: addvl sp, x29, #-2 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore z8 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore z9 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore z10 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore z11 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore z12 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore z13 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore z14 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore z15 +; CHECK1024-SPLITSVE-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: mov sp, x29 +; CHECK1024-SPLITSVE-NEXT: .cfi_def_cfa wsp, 64 +; CHECK1024-SPLITSVE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr x28, [sp, #24] // 8-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldp x27, x26, [sp, #32] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldp x29, x30, [sp], #64 // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: .cfi_def_cfa_offset 0 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore w19 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore w20 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore w26 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore w27 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore w28 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore vg +; CHECK1024-SPLITSVE-NEXT: .cfi_restore w30 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore w29 +; CHECK1024-SPLITSVE-NEXT: ret +entry: + %ptr = alloca i8, i32 %P1 + tail call void asm sideeffect "", "~{x0},~{x28},~{x27},~{x3}"() #2 + %call = call ptr @memset(ptr noundef nonnull %ptr, i32 noundef 45, i32 noundef %P2) + ret i32 -396142473 +} + + +define i32 @svecc_call_realign(<4 x i16> %P0, i32 %P1, i32 %P2, <vscale x 16 x i8> %P3, i16 %P4) "aarch64_pstate_sm_compatible" { +; CHECK0-LABEL: svecc_call_realign: +; CHECK0: // %bb.0: // %entry +; CHECK0-NEXT: stp x29, x30, [sp, #-64]! 
// 16-byte Folded Spill +; CHECK0-NEXT: .cfi_def_cfa_offset 64 +; CHECK0-NEXT: cntd x9 +; CHECK0-NEXT: stp x28, x27, [sp, #32] // 16-byte Folded Spill +; CHECK0-NEXT: str x9, [sp, #16] // 8-byte Folded Spill +; CHECK0-NEXT: stp x26, x19, [sp, #48] // 16-byte Folded Spill +; CHECK0-NEXT: mov x29, sp +; CHECK0-NEXT: .cfi_def_cfa w29, 64 +; CHECK0-NEXT: .cfi_offset w19, -8 +; CHECK0-NEXT: .cfi_offset w26, -16 +; CHECK0-NEXT: .cfi_offset w27, -24 +; CHECK0-NEXT: .cfi_offset w28, -32 +; CHECK0-NEXT: .cfi_offset vg, -48 +; CHECK0-NEXT: .cfi_offset w30, -56 +; CHECK0-NEXT: .cfi_offset w29, -64 +; CHECK0-NEXT: addvl sp, sp, #-18 +; CHECK0-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill +; CHECK0-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill +; CHECK0-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill +; CHECK0-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK0-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill +; CHECK0-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill +; CHECK0-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill +; CHECK0-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill +; CHECK0-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill +; CHECK0-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill +; CHECK0-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill +; CHECK0-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill +; CHECK0-NEXT: str z23, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z22, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z21, [sp, #4, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z20, [sp, #5, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z19, [sp, #6, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z18, [sp, #7, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z17, [sp, #8, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z16, [sp, #9, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z15, [sp, #10, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z14, [sp, #11, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z13, [sp, #12, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z12, [sp, #13, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z11, [sp, #14, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: .cfi_escape 0x10, 0x48, 0x0c, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x78, 0x1e, 0x22, 0x11, 0x40, 0x22 // $d8 @ cfa - 8 * IncomingVG - 64 +; CHECK0-NEXT: .cfi_escape 0x10, 0x49, 0x0c, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x70, 0x1e, 0x22, 0x11, 0x40, 0x22 // $d9 @ cfa - 16 * IncomingVG - 64 +; CHECK0-NEXT: .cfi_escape 0x10, 0x4a, 0x0c, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x68, 0x1e, 0x22, 0x11, 0x40, 0x22 // $d10 @ cfa - 24 * IncomingVG - 64 +; CHECK0-NEXT: .cfi_escape 0x10, 0x4b, 0x0c, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x60, 0x1e, 0x22, 0x11, 0x40, 0x22 // $d11 @ cfa - 32 * IncomingVG - 64 +; CHECK0-NEXT: .cfi_escape 0x10, 0x4c, 0x0c, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x58, 0x1e, 0x22, 0x11, 0x40, 0x22 // $d12 @ cfa - 40 * IncomingVG - 64 +; CHECK0-NEXT: .cfi_escape 0x10, 0x4d, 0x0c, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x50, 0x1e, 0x22, 0x11, 0x40, 0x22 // $d13 @ cfa - 48 * IncomingVG - 64 +; CHECK0-NEXT: .cfi_escape 0x10, 0x4e, 0x0c, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x48, 0x1e, 0x22, 0x11, 0x40, 0x22 // $d14 @ cfa - 56 * IncomingVG - 64 +; 
CHECK0-NEXT: .cfi_escape 0x10, 0x4f, 0x0c, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x40, 0x1e, 0x22, 0x11, 0x40, 0x22 // $d15 @ cfa - 64 * IncomingVG - 64 +; CHECK0-NEXT: sub x9, sp, #1024 +; CHECK0-NEXT: and sp, x9, #0xffffffffffffffe0 +; CHECK0-NEXT: mov w2, w1 +; CHECK0-NEXT: bl __arm_sme_state +; CHECK0-NEXT: mov x19, x0 +; CHECK0-NEXT: //APP +; CHECK0-NEXT: //NO_APP +; CHECK0-NEXT: tbz w19, #0, .LBB36_2 +; CHECK0-NEXT: // %bb.1: // %entry +; CHECK0-NEXT: smstop sm +; CHECK0-NEXT: .LBB36_2: // %entry +; CHECK0-NEXT: mov x0, sp +; CHECK0-NEXT: mov w1, #45 // =0x2d +; CHECK0-NEXT: bl memset +; CHECK0-NEXT: tbz w19, #0, .LBB36_4 +; CHECK0-NEXT: // %bb.3: // %entry +; CHECK0-NEXT: smstart sm +; CHECK0-NEXT: .LBB36_4: // %entry +; CHECK0-NEXT: mov w0, #22647 // =0x5877 +; CHECK0-NEXT: movk w0, #59491, lsl #16 +; CHECK0-NEXT: addvl sp, x29, #-18 +; CHECK0-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload +; CHECK0-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK0-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK0-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK0-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload +; CHECK0-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload +; CHECK0-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload +; CHECK0-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload +; CHECK0-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload +; CHECK0-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload +; CHECK0-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload +; CHECK0-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload +; CHECK0-NEXT: .cfi_restore z8 +; CHECK0-NEXT: .cfi_restore z9 +; CHECK0-NEXT: .cfi_restore z10 +; CHECK0-NEXT: .cfi_restore z11 +; CHECK0-NEXT: .cfi_restore z12 +; CHECK0-NEXT: .cfi_restore z13 +; CHECK0-NEXT: .cfi_restore z14 +; CHECK0-NEXT: .cfi_restore z15 +; CHECK0-NEXT: mov sp, x29 +; CHECK0-NEXT: .cfi_def_cfa wsp, 64 +; CHECK0-NEXT: ldp x26, x19, [sp, #48] // 16-byte Folded Reload +; CHECK0-NEXT: ldp x28, x27, [sp, #32] // 16-byte Folded Reload +; CHECK0-NEXT: ldp x29, x30, [sp], #64 // 16-byte Folded Reload +; CHECK0-NEXT: .cfi_def_cfa_offset 0 +; CHECK0-NEXT: .cfi_restore w19 +; CHECK0-NEXT: .cfi_restore w26 +; CHECK0-NEXT: .cfi_restore w27 +; CHECK0-NEXT: .cfi_restore w28 +; CHECK0-NEXT: .cfi_restore vg +; CHECK0-NEXT: .cfi_restore w30 +; CHECK0-NEXT: .cfi_restore w29 +; CHECK0-NEXT: ret +; +; CHECK64-LABEL: 
svecc_call_realign: +; CHECK64: // %bb.0: // %entry +; CHECK64-NEXT: stp x29, x30, [sp, #-64]! // 16-byte Folded Spill +; CHECK64-NEXT: .cfi_def_cfa_offset 64 +; CHECK64-NEXT: cntd x9 +; CHECK64-NEXT: stp x28, x27, [sp, #32] // 16-byte Folded Spill +; CHECK64-NEXT: str x9, [sp, #16] // 8-byte Folded Spill +; CHECK64-NEXT: stp x26, x19, [sp, #48] // 16-byte Folded Spill +; CHECK64-NEXT: mov x29, sp +; CHECK64-NEXT: .cfi_def_cfa w29, 64 +; CHECK64-NEXT: .cfi_offset w19, -8 +; CHECK64-NEXT: .cfi_offset w26, -16 +; CHECK64-NEXT: .cfi_offset w27, -24 +; CHECK64-NEXT: .cfi_offset w28, -32 ; CHECK64-NEXT: .cfi_offset vg, -48 ; CHECK64-NEXT: .cfi_offset w30, -56 ; CHECK64-NEXT: .cfi_offset w29, -64 -; CHECK64-NEXT: addvl sp, sp, #-18 +; CHECK64-NEXT: addvl sp, sp, #-2 ; CHECK64-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill ; CHECK64-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill ; CHECK64-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill @@ -3960,30 +4100,32 @@ define i32 @svecc_call_realign(<4 x i16> %P0, i32 %P1, i32 %P2, <vscale x 16 x i ; CHECK64-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill ; CHECK64-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill ; CHECK64-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill -; CHECK64-NEXT: str z23, [sp, #2, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z22, [sp, #3, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z21, [sp, #4, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z20, [sp, #5, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z19, [sp, #6, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z18, [sp, #7, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z17, [sp, #8, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z16, [sp, #9, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z15, [sp, #10, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z14, [sp, #11, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z13, [sp, #12, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z12, [sp, #13, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z11, [sp, #14, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: .cfi_escape 0x10, 0x48, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x78, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d8 @ cfa - 8 * IncomingVG - 128 -; CHECK64-NEXT: .cfi_escape 0x10, 0x49, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x70, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d9 @ cfa - 16 * IncomingVG - 128 -; CHECK64-NEXT: .cfi_escape 0x10, 0x4a, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x68, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d10 @ cfa - 24 * IncomingVG - 128 -; CHECK64-NEXT: .cfi_escape 0x10, 0x4b, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x60, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d11 @ cfa - 32 * IncomingVG - 128 -; CHECK64-NEXT: .cfi_escape 0x10, 0x4c, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x58, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d12 @ cfa - 40 * IncomingVG - 128 -; CHECK64-NEXT: .cfi_escape 0x10, 0x4d, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x50, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d13 @ cfa - 48 * IncomingVG - 128 -; CHECK64-NEXT: .cfi_escape 0x10, 0x4e, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x48, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d14 @ cfa - 56 * IncomingVG - 128 -; CHECK64-NEXT: .cfi_escape 0x10, 0x4f, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 
0x11, 0x40, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d15 @ cfa - 64 * IncomingVG - 128 +; CHECK64-NEXT: sub sp, sp, #64 +; CHECK64-NEXT: addvl sp, sp, #-16 +; CHECK64-NEXT: str z23, [sp] // 16-byte Folded Spill +; CHECK64-NEXT: str z22, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z21, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z20, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z19, [sp, #4, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z18, [sp, #5, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z17, [sp, #6, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z16, [sp, #7, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z15, [sp, #8, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z14, [sp, #9, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z13, [sp, #10, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z12, [sp, #11, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z8, [sp, #15, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: .cfi_escape 0x10, 0x48, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x68, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d8 @ cfa - 24 * IncomingVG - 128 +; CHECK64-NEXT: .cfi_escape 0x10, 0x49, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x60, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d9 @ cfa - 32 * IncomingVG - 128 +; CHECK64-NEXT: .cfi_escape 0x10, 0x4a, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x58, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d10 @ cfa - 40 * IncomingVG - 128 +; CHECK64-NEXT: .cfi_escape 0x10, 0x4b, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x50, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d11 @ cfa - 48 * IncomingVG - 128 +; CHECK64-NEXT: .cfi_escape 0x10, 0x4c, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x48, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d12 @ cfa - 56 * IncomingVG - 128 +; CHECK64-NEXT: .cfi_escape 0x10, 0x4d, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x40, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d13 @ cfa - 64 * IncomingVG - 128 +; CHECK64-NEXT: .cfi_escape 0x10, 0x4e, 0x0e, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0xb8, 0x7f, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d14 @ cfa - 72 * IncomingVG - 128 +; CHECK64-NEXT: .cfi_escape 0x10, 0x4f, 0x0e, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0xb0, 0x7f, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d15 @ cfa - 80 * IncomingVG - 128 ; CHECK64-NEXT: sub x9, sp, #1088 ; CHECK64-NEXT: and sp, x9, #0xffffffffffffffe0 ; CHECK64-NEXT: mov w2, w1 @@ -4006,22 +4148,31 @@ define i32 @svecc_call_realign(<4 x i16> %P0, i32 %P1, i32 %P2, <vscale x 16 x i ; CHECK64-NEXT: sub x8, x29, #64 ; CHECK64-NEXT: movk w0, #59491, lsl #16 ; CHECK64-NEXT: addvl sp, x8, #-18 -; CHECK64-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z13, 
[sp, #12, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z23, [sp] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z22, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z21, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z20, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z19, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z18, [sp, #5, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z17, [sp, #6, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z16, [sp, #7, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z15, [sp, #8, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z14, [sp, #9, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z13, [sp, #10, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z12, [sp, #11, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z8, [sp, #15, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: addvl sp, x29, #-2 +; CHECK64-NEXT: .cfi_restore z8 +; CHECK64-NEXT: .cfi_restore z9 +; CHECK64-NEXT: .cfi_restore z10 +; CHECK64-NEXT: .cfi_restore z11 +; CHECK64-NEXT: .cfi_restore z12 +; CHECK64-NEXT: .cfi_restore z13 +; CHECK64-NEXT: .cfi_restore z14 +; CHECK64-NEXT: .cfi_restore z15 ; CHECK64-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload ; CHECK64-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload ; CHECK64-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload @@ -4034,20 +4185,11 @@ define i32 @svecc_call_realign(<4 x i16> %P0, i32 %P1, i32 %P2, <vscale x 16 x i ; CHECK64-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload ; CHECK64-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload ; CHECK64-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload -; CHECK64-NEXT: .cfi_restore z8 -; CHECK64-NEXT: .cfi_restore z9 -; CHECK64-NEXT: .cfi_restore z10 -; CHECK64-NEXT: .cfi_restore z11 -; CHECK64-NEXT: .cfi_restore z12 -; CHECK64-NEXT: .cfi_restore z13 -; CHECK64-NEXT: .cfi_restore z14 -; CHECK64-NEXT: .cfi_restore z15 -; CHECK64-NEXT: sub sp, x29, #64 -; CHECK64-NEXT: .cfi_def_cfa wsp, 128 -; CHECK64-NEXT: ldp x26, x19, [sp, #104] // 16-byte Folded Reload -; CHECK64-NEXT: ldp x28, x27, [sp, #88] // 16-byte Folded Reload -; CHECK64-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload -; CHECK64-NEXT: add sp, sp, #128 +; CHECK64-NEXT: mov sp, x29 +; CHECK64-NEXT: .cfi_def_cfa wsp, 64 +; CHECK64-NEXT: ldp x26, x19, [sp, #48] // 16-byte Folded Reload +; CHECK64-NEXT: ldp x28, x27, [sp, #32] // 16-byte Folded Reload +; CHECK64-NEXT: ldp x29, x30, [sp], #64 // 16-byte Folded Reload ; CHECK64-NEXT: .cfi_def_cfa_offset 0 ; CHECK64-NEXT: .cfi_restore w19 ; CHECK64-NEXT: .cfi_restore w26 @@ -4058,140 +4200,270 @@ define i32 @svecc_call_realign(<4 x i16> %P0, i32 %P1, i32 %P2, <vscale x 16 x i ; CHECK64-NEXT: .cfi_restore w29 ; CHECK64-NEXT: ret ; -; CHECK1024-LABEL: svecc_call_realign: -; CHECK1024: // %bb.0: // %entry -; CHECK1024-NEXT: sub sp, sp, #1088 -; CHECK1024-NEXT: .cfi_def_cfa_offset 1088 -; CHECK1024-NEXT: 
cntd x9 -; CHECK1024-NEXT: str x29, [sp, #1024] // 8-byte Folded Spill -; CHECK1024-NEXT: str x30, [sp, #1032] // 8-byte Folded Spill -; CHECK1024-NEXT: str x9, [sp, #1040] // 8-byte Folded Spill -; CHECK1024-NEXT: str x28, [sp, #1048] // 8-byte Folded Spill -; CHECK1024-NEXT: str x27, [sp, #1056] // 8-byte Folded Spill -; CHECK1024-NEXT: str x26, [sp, #1064] // 8-byte Folded Spill -; CHECK1024-NEXT: str x19, [sp, #1072] // 8-byte Folded Spill -; CHECK1024-NEXT: add x29, sp, #1024 -; CHECK1024-NEXT: .cfi_def_cfa w29, 64 -; CHECK1024-NEXT: .cfi_offset w19, -16 -; CHECK1024-NEXT: .cfi_offset w26, -24 -; CHECK1024-NEXT: .cfi_offset w27, -32 -; CHECK1024-NEXT: .cfi_offset w28, -40 -; CHECK1024-NEXT: .cfi_offset vg, -48 -; CHECK1024-NEXT: .cfi_offset w30, -56 -; CHECK1024-NEXT: .cfi_offset w29, -64 -; CHECK1024-NEXT: addvl sp, sp, #-18 -; CHECK1024-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str z23, [sp, #2, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z22, [sp, #3, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z21, [sp, #4, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z20, [sp, #5, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z19, [sp, #6, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z18, [sp, #7, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z17, [sp, #8, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z16, [sp, #9, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z15, [sp, #10, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z14, [sp, #11, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z13, [sp, #12, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z12, [sp, #13, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z11, [sp, #14, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: .cfi_escape 0x10, 0x48, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x78, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d8 @ cfa - 8 * IncomingVG - 1088 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x49, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x70, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d9 @ cfa - 16 * IncomingVG - 1088 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x4a, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x68, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d10 @ cfa - 24 * IncomingVG - 1088 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x4b, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x60, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d11 @ cfa - 32 * IncomingVG - 1088 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x4c, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x58, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // 
$d12 @ cfa - 40 * IncomingVG - 1088 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x4d, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x50, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d13 @ cfa - 48 * IncomingVG - 1088 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x4e, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x48, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d14 @ cfa - 56 * IncomingVG - 1088 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x4f, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x40, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d15 @ cfa - 64 * IncomingVG - 1088 -; CHECK1024-NEXT: sub x9, sp, #2048 -; CHECK1024-NEXT: and sp, x9, #0xffffffffffffffe0 -; CHECK1024-NEXT: mov w2, w1 -; CHECK1024-NEXT: bl __arm_sme_state -; CHECK1024-NEXT: mov x19, x0 -; CHECK1024-NEXT: //APP -; CHECK1024-NEXT: //NO_APP -; CHECK1024-NEXT: tbz w19, #0, .LBB36_2 -; CHECK1024-NEXT: // %bb.1: // %entry -; CHECK1024-NEXT: smstop sm -; CHECK1024-NEXT: .LBB36_2: // %entry -; CHECK1024-NEXT: mov x0, sp -; CHECK1024-NEXT: mov w1, #45 // =0x2d -; CHECK1024-NEXT: bl memset -; CHECK1024-NEXT: tbz w19, #0, .LBB36_4 -; CHECK1024-NEXT: // %bb.3: // %entry -; CHECK1024-NEXT: smstart sm -; CHECK1024-NEXT: .LBB36_4: // %entry -; CHECK1024-NEXT: mov w0, #22647 // =0x5877 -; CHECK1024-NEXT: sub x8, x29, #1024 -; CHECK1024-NEXT: movk w0, #59491, lsl #16 -; CHECK1024-NEXT: addvl sp, x8, #-18 -; CHECK1024-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: .cfi_restore z8 -; CHECK1024-NEXT: .cfi_restore z9 -; CHECK1024-NEXT: .cfi_restore z10 -; CHECK1024-NEXT: .cfi_restore z11 -; CHECK1024-NEXT: .cfi_restore z12 -; CHECK1024-NEXT: .cfi_restore z13 -; CHECK1024-NEXT: .cfi_restore z14 -; CHECK1024-NEXT: 
.cfi_restore z15 -; CHECK1024-NEXT: sub sp, x29, #1024 -; CHECK1024-NEXT: .cfi_def_cfa wsp, 1088 -; CHECK1024-NEXT: ldr x19, [sp, #1072] // 8-byte Folded Reload -; CHECK1024-NEXT: ldr x26, [sp, #1064] // 8-byte Folded Reload -; CHECK1024-NEXT: ldr x27, [sp, #1056] // 8-byte Folded Reload -; CHECK1024-NEXT: ldr x28, [sp, #1048] // 8-byte Folded Reload -; CHECK1024-NEXT: ldr x30, [sp, #1032] // 8-byte Folded Reload -; CHECK1024-NEXT: ldr x29, [sp, #1024] // 8-byte Folded Reload -; CHECK1024-NEXT: add sp, sp, #1088 -; CHECK1024-NEXT: .cfi_def_cfa_offset 0 -; CHECK1024-NEXT: .cfi_restore w19 -; CHECK1024-NEXT: .cfi_restore w26 -; CHECK1024-NEXT: .cfi_restore w27 -; CHECK1024-NEXT: .cfi_restore w28 -; CHECK1024-NEXT: .cfi_restore vg -; CHECK1024-NEXT: .cfi_restore w30 -; CHECK1024-NEXT: .cfi_restore w29 -; CHECK1024-NEXT: ret +; CHECK1024-NOSPLITSVE-LABEL: svecc_call_realign: +; CHECK1024-NOSPLITSVE: // %bb.0: // %entry +; CHECK1024-NOSPLITSVE-NEXT: sub sp, sp, #1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_def_cfa_offset 1088 +; CHECK1024-NOSPLITSVE-NEXT: cntd x9 +; CHECK1024-NOSPLITSVE-NEXT: str x29, [sp, #1024] // 8-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str x30, [sp, #1032] // 8-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str x9, [sp, #1040] // 8-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str x28, [sp, #1048] // 8-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str x27, [sp, #1056] // 8-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str x26, [sp, #1064] // 8-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str x19, [sp, #1072] // 8-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: add x29, sp, #1024 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_def_cfa w29, 64 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w19, -16 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w26, -24 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w27, -32 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w28, -40 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset vg, -48 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w30, -56 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w29, -64 +; CHECK1024-NOSPLITSVE-NEXT: addvl sp, sp, #-18 +; CHECK1024-NOSPLITSVE-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z23, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z22, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z21, [sp, #4, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z20, [sp, #5, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z19, [sp, #6, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z18, [sp, #7, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str 
z17, [sp, #8, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z16, [sp, #9, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z15, [sp, #10, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z14, [sp, #11, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z13, [sp, #12, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z12, [sp, #13, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z11, [sp, #14, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x48, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x78, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d8 @ cfa - 8 * IncomingVG - 1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x49, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x70, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d9 @ cfa - 16 * IncomingVG - 1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4a, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x68, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d10 @ cfa - 24 * IncomingVG - 1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4b, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x60, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d11 @ cfa - 32 * IncomingVG - 1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4c, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x58, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d12 @ cfa - 40 * IncomingVG - 1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4d, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x50, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d13 @ cfa - 48 * IncomingVG - 1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4e, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x48, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d14 @ cfa - 56 * IncomingVG - 1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4f, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x40, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d15 @ cfa - 64 * IncomingVG - 1088 +; CHECK1024-NOSPLITSVE-NEXT: sub x9, sp, #2048 +; CHECK1024-NOSPLITSVE-NEXT: and sp, x9, #0xffffffffffffffe0 +; CHECK1024-NOSPLITSVE-NEXT: mov w2, w1 +; CHECK1024-NOSPLITSVE-NEXT: bl __arm_sme_state +; CHECK1024-NOSPLITSVE-NEXT: mov x19, x0 +; CHECK1024-NOSPLITSVE-NEXT: //APP +; CHECK1024-NOSPLITSVE-NEXT: //NO_APP +; CHECK1024-NOSPLITSVE-NEXT: tbz w19, #0, .LBB36_2 +; CHECK1024-NOSPLITSVE-NEXT: // %bb.1: // %entry +; CHECK1024-NOSPLITSVE-NEXT: smstop sm +; CHECK1024-NOSPLITSVE-NEXT: .LBB36_2: // %entry +; CHECK1024-NOSPLITSVE-NEXT: mov x0, sp +; CHECK1024-NOSPLITSVE-NEXT: mov w1, #45 // =0x2d +; CHECK1024-NOSPLITSVE-NEXT: bl memset +; CHECK1024-NOSPLITSVE-NEXT: tbz w19, #0, .LBB36_4 +; CHECK1024-NOSPLITSVE-NEXT: // %bb.3: // %entry +; CHECK1024-NOSPLITSVE-NEXT: smstart sm +; CHECK1024-NOSPLITSVE-NEXT: .LBB36_4: // %entry +; CHECK1024-NOSPLITSVE-NEXT: mov w0, #22647 // =0x5877 +; CHECK1024-NOSPLITSVE-NEXT: sub x8, x29, #1024 +; CHECK1024-NOSPLITSVE-NEXT: movk w0, #59491, lsl #16 +; CHECK1024-NOSPLITSVE-NEXT: addvl sp, x8, #-18 +; CHECK1024-NOSPLITSVE-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload +; 
CHECK1024-NOSPLITSVE-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z8 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z9 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z10 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z11 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z12 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z13 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z14 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z15 +; CHECK1024-NOSPLITSVE-NEXT: sub sp, x29, #1024 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_def_cfa wsp, 1088 +; CHECK1024-NOSPLITSVE-NEXT: ldr x19, [sp, #1072] // 8-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr x26, [sp, #1064] // 8-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr x27, [sp, #1056] // 8-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr x28, [sp, #1048] // 8-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr x30, [sp, #1032] // 8-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr x29, [sp, #1024] // 8-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: add sp, sp, #1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_def_cfa_offset 0 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore w19 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore w26 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore w27 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore w28 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore vg +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore w30 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore w29 +; CHECK1024-NOSPLITSVE-NEXT: ret +; +; CHECK1024-SPLITSVE-LABEL: svecc_call_realign: +; CHECK1024-SPLITSVE: // %bb.0: // %entry +; CHECK1024-SPLITSVE-NEXT: stp x29, x30, [sp, #-64]! 
// 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: .cfi_def_cfa_offset 64 +; CHECK1024-SPLITSVE-NEXT: cntd x9 +; CHECK1024-SPLITSVE-NEXT: stp x28, x27, [sp, #32] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str x9, [sp, #16] // 8-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: stp x26, x19, [sp, #48] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: mov x29, sp +; CHECK1024-SPLITSVE-NEXT: .cfi_def_cfa w29, 64 +; CHECK1024-SPLITSVE-NEXT: .cfi_offset w19, -8 +; CHECK1024-SPLITSVE-NEXT: .cfi_offset w26, -16 +; CHECK1024-SPLITSVE-NEXT: .cfi_offset w27, -24 +; CHECK1024-SPLITSVE-NEXT: .cfi_offset w28, -32 +; CHECK1024-SPLITSVE-NEXT: .cfi_offset vg, -48 +; CHECK1024-SPLITSVE-NEXT: .cfi_offset w30, -56 +; CHECK1024-SPLITSVE-NEXT: .cfi_offset w29, -64 +; CHECK1024-SPLITSVE-NEXT: addvl sp, sp, #-2 +; CHECK1024-SPLITSVE-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: sub sp, sp, #1024 +; CHECK1024-SPLITSVE-NEXT: addvl sp, sp, #-16 +; CHECK1024-SPLITSVE-NEXT: str z23, [sp] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z22, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z21, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z20, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z19, [sp, #4, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z18, [sp, #5, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z17, [sp, #6, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z16, [sp, #7, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z15, [sp, #8, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z14, [sp, #9, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z13, [sp, #10, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z12, [sp, #11, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z8, [sp, #15, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x48, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x68, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d8 @ cfa - 24 * IncomingVG - 1088 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x49, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x60, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d9 @ cfa - 32 * IncomingVG - 1088 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4a, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x58, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d10 
@ cfa - 40 * IncomingVG - 1088 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4b, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x50, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d11 @ cfa - 48 * IncomingVG - 1088 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4c, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x48, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d12 @ cfa - 56 * IncomingVG - 1088 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4d, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x40, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d13 @ cfa - 64 * IncomingVG - 1088 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4e, 0x0e, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0xb8, 0x7f, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d14 @ cfa - 72 * IncomingVG - 1088 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4f, 0x0e, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0xb0, 0x7f, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d15 @ cfa - 80 * IncomingVG - 1088 +; CHECK1024-SPLITSVE-NEXT: sub x9, sp, #2048 +; CHECK1024-SPLITSVE-NEXT: and sp, x9, #0xffffffffffffffe0 +; CHECK1024-SPLITSVE-NEXT: mov w2, w1 +; CHECK1024-SPLITSVE-NEXT: bl __arm_sme_state +; CHECK1024-SPLITSVE-NEXT: mov x19, x0 +; CHECK1024-SPLITSVE-NEXT: //APP +; CHECK1024-SPLITSVE-NEXT: //NO_APP +; CHECK1024-SPLITSVE-NEXT: tbz w19, #0, .LBB36_2 +; CHECK1024-SPLITSVE-NEXT: // %bb.1: // %entry +; CHECK1024-SPLITSVE-NEXT: smstop sm +; CHECK1024-SPLITSVE-NEXT: .LBB36_2: // %entry +; CHECK1024-SPLITSVE-NEXT: mov x0, sp +; CHECK1024-SPLITSVE-NEXT: mov w1, #45 // =0x2d +; CHECK1024-SPLITSVE-NEXT: bl memset +; CHECK1024-SPLITSVE-NEXT: tbz w19, #0, .LBB36_4 +; CHECK1024-SPLITSVE-NEXT: // %bb.3: // %entry +; CHECK1024-SPLITSVE-NEXT: smstart sm +; CHECK1024-SPLITSVE-NEXT: .LBB36_4: // %entry +; CHECK1024-SPLITSVE-NEXT: mov w0, #22647 // =0x5877 +; CHECK1024-SPLITSVE-NEXT: sub x8, x29, #1024 +; CHECK1024-SPLITSVE-NEXT: movk w0, #59491, lsl #16 +; CHECK1024-SPLITSVE-NEXT: addvl sp, x8, #-18 +; CHECK1024-SPLITSVE-NEXT: ldr z23, [sp] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z22, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z21, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z20, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z19, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z18, [sp, #5, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z17, [sp, #6, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z16, [sp, #7, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z15, [sp, #8, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z14, [sp, #9, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z13, [sp, #10, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z12, [sp, #11, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z8, [sp, #15, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: addvl sp, x29, #-2 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore z8 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore z9 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore z10 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore z11 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore z12 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore z13 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore z14 +; 
CHECK1024-SPLITSVE-NEXT: .cfi_restore z15 +; CHECK1024-SPLITSVE-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: mov sp, x29 +; CHECK1024-SPLITSVE-NEXT: .cfi_def_cfa wsp, 64 +; CHECK1024-SPLITSVE-NEXT: ldp x26, x19, [sp, #48] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldp x28, x27, [sp, #32] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldp x29, x30, [sp], #64 // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: .cfi_def_cfa_offset 0 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore w19 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore w26 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore w27 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore w28 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore vg +; CHECK1024-SPLITSVE-NEXT: .cfi_restore w30 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore w29 +; CHECK1024-SPLITSVE-NEXT: ret entry: %ptr = alloca i8, i32 1000, align 32 tail call void asm sideeffect "", "~{x0},~{x28},~{x27},~{x3}"() #2 @@ -4311,13 +4583,12 @@ define i32 @svecc_call_dynamic_and_scalable_alloca(<4 x i16> %P0, i32 %P1, i32 % ; ; CHECK64-LABEL: svecc_call_dynamic_and_scalable_alloca: ; CHECK64: // %bb.0: // %entry -; CHECK64-NEXT: sub sp, sp, #128 -; CHECK64-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill -; CHECK64-NEXT: add x29, sp, #64 -; CHECK64-NEXT: stp x28, x27, [sp, #80] // 16-byte Folded Spill -; CHECK64-NEXT: stp x26, x20, [sp, #96] // 16-byte Folded Spill -; CHECK64-NEXT: str x19, [sp, #112] // 8-byte Folded Spill -; CHECK64-NEXT: addvl sp, sp, #-18 +; CHECK64-NEXT: stp x29, x30, [sp, #-64]! 
// 16-byte Folded Spill +; CHECK64-NEXT: str x28, [sp, #16] // 8-byte Folded Spill +; CHECK64-NEXT: mov x29, sp +; CHECK64-NEXT: stp x27, x26, [sp, #32] // 16-byte Folded Spill +; CHECK64-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill +; CHECK64-NEXT: addvl sp, sp, #-2 ; CHECK64-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill ; CHECK64-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill ; CHECK64-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill @@ -4330,41 +4601,43 @@ define i32 @svecc_call_dynamic_and_scalable_alloca(<4 x i16> %P0, i32 %P1, i32 % ; CHECK64-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill ; CHECK64-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill ; CHECK64-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill -; CHECK64-NEXT: str z23, [sp, #2, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z22, [sp, #3, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z21, [sp, #4, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z20, [sp, #5, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z19, [sp, #6, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z18, [sp, #7, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z17, [sp, #8, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z16, [sp, #9, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z15, [sp, #10, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z14, [sp, #11, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z13, [sp, #12, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z12, [sp, #13, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z11, [sp, #14, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: sub sp, sp, #64 +; CHECK64-NEXT: addvl sp, sp, #-16 +; CHECK64-NEXT: str z23, [sp] // 16-byte Folded Spill +; CHECK64-NEXT: str z22, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z21, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z20, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z19, [sp, #4, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z18, [sp, #5, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z17, [sp, #6, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z16, [sp, #7, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z15, [sp, #8, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z14, [sp, #9, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z13, [sp, #10, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z12, [sp, #11, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z8, [sp, #15, mul vl] // 16-byte Folded Spill ; CHECK64-NEXT: sub sp, sp, #112 ; CHECK64-NEXT: addvl sp, sp, #-1 ; CHECK64-NEXT: mov x19, sp ; CHECK64-NEXT: .cfi_def_cfa w29, 64 -; CHECK64-NEXT: .cfi_offset w19, -16 -; CHECK64-NEXT: .cfi_offset w20, -24 -; CHECK64-NEXT: .cfi_offset w26, -32 -; CHECK64-NEXT: .cfi_offset w27, -40 +; CHECK64-NEXT: .cfi_offset w19, -8 +; CHECK64-NEXT: .cfi_offset w20, -16 +; CHECK64-NEXT: .cfi_offset w26, -24 +; CHECK64-NEXT: .cfi_offset w27, -32 ; CHECK64-NEXT: .cfi_offset w28, -48 ; CHECK64-NEXT: .cfi_offset w30, -56 ; CHECK64-NEXT: .cfi_offset w29, -64 -; CHECK64-NEXT: 
.cfi_escape 0x10, 0x48, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x78, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d8 @ cfa - 8 * VG - 128 -; CHECK64-NEXT: .cfi_escape 0x10, 0x49, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x70, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d9 @ cfa - 16 * VG - 128 -; CHECK64-NEXT: .cfi_escape 0x10, 0x4a, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x68, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d10 @ cfa - 24 * VG - 128 -; CHECK64-NEXT: .cfi_escape 0x10, 0x4b, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x60, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d11 @ cfa - 32 * VG - 128 -; CHECK64-NEXT: .cfi_escape 0x10, 0x4c, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x58, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d12 @ cfa - 40 * VG - 128 -; CHECK64-NEXT: .cfi_escape 0x10, 0x4d, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x50, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d13 @ cfa - 48 * VG - 128 -; CHECK64-NEXT: .cfi_escape 0x10, 0x4e, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x48, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d14 @ cfa - 56 * VG - 128 -; CHECK64-NEXT: .cfi_escape 0x10, 0x4f, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x40, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d15 @ cfa - 64 * VG - 128 +; CHECK64-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x68, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d8 @ cfa - 24 * VG - 128 +; CHECK64-NEXT: .cfi_escape 0x10, 0x49, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x60, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d9 @ cfa - 32 * VG - 128 +; CHECK64-NEXT: .cfi_escape 0x10, 0x4a, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x58, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d10 @ cfa - 40 * VG - 128 +; CHECK64-NEXT: .cfi_escape 0x10, 0x4b, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x50, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d11 @ cfa - 48 * VG - 128 +; CHECK64-NEXT: .cfi_escape 0x10, 0x4c, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x48, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d12 @ cfa - 56 * VG - 128 +; CHECK64-NEXT: .cfi_escape 0x10, 0x4d, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x40, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d13 @ cfa - 64 * VG - 128 +; CHECK64-NEXT: .cfi_escape 0x10, 0x4e, 0x0c, 0x92, 0x2e, 0x00, 0x11, 0xb8, 0x7f, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d14 @ cfa - 72 * VG - 128 +; CHECK64-NEXT: .cfi_escape 0x10, 0x4f, 0x0c, 0x92, 0x2e, 0x00, 0x11, 0xb0, 0x7f, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d15 @ cfa - 80 * VG - 128 ; CHECK64-NEXT: // kill: def $w0 killed $w0 def $x0 ; CHECK64-NEXT: ubfiz x8, x0, #2, #32 ; CHECK64-NEXT: mov x9, sp @@ -4385,22 +4658,23 @@ define i32 @svecc_call_dynamic_and_scalable_alloca(<4 x i16> %P0, i32 %P1, i32 % ; CHECK64-NEXT: sub x8, x29, #64 ; CHECK64-NEXT: movk w0, #59491, lsl #16 ; CHECK64-NEXT: addvl sp, x8, #-18 -; CHECK64-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload -; 
CHECK64-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z23, [sp] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z22, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z21, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z20, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z19, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z18, [sp, #5, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z17, [sp, #6, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z16, [sp, #7, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z15, [sp, #8, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z14, [sp, #9, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z13, [sp, #10, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z12, [sp, #11, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z8, [sp, #15, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: addvl sp, x29, #-2 ; CHECK64-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload ; CHECK64-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload ; CHECK64-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload @@ -4413,131 +4687,243 @@ define i32 @svecc_call_dynamic_and_scalable_alloca(<4 x i16> %P0, i32 %P1, i32 % ; CHECK64-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload ; CHECK64-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload ; CHECK64-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload -; CHECK64-NEXT: sub sp, x29, #64 -; CHECK64-NEXT: ldp x20, x19, [sp, #104] // 16-byte Folded Reload -; CHECK64-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload -; CHECK64-NEXT: ldp x27, x26, [sp, #88] // 16-byte Folded Reload -; CHECK64-NEXT: ldp x30, x28, [sp, #72] // 16-byte Folded Reload -; CHECK64-NEXT: add sp, sp, #128 +; CHECK64-NEXT: mov sp, x29 +; CHECK64-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload +; CHECK64-NEXT: ldr x28, [sp, #16] // 8-byte Folded Reload +; CHECK64-NEXT: ldp x27, x26, [sp, #32] // 16-byte Folded Reload +; CHECK64-NEXT: ldp x29, x30, [sp], #64 // 16-byte Folded Reload ; CHECK64-NEXT: ret ; -; CHECK1024-LABEL: svecc_call_dynamic_and_scalable_alloca: -; CHECK1024: // %bb.0: // %entry -; CHECK1024-NEXT: sub sp, sp, #1088 -; CHECK1024-NEXT: str x29, [sp, #1024] // 8-byte Folded Spill -; CHECK1024-NEXT: add x29, sp, #1024 -; CHECK1024-NEXT: str x30, [sp, #1032] // 8-byte Folded Spill -; CHECK1024-NEXT: str x28, [sp, #1040] // 8-byte Folded Spill -; CHECK1024-NEXT: str x27, [sp, #1048] // 8-byte Folded Spill -; CHECK1024-NEXT: str x26, [sp, #1056] // 8-byte Folded Spill -; CHECK1024-NEXT: str x20, [sp, #1064] // 8-byte Folded Spill -; CHECK1024-NEXT: str x19, [sp, #1072] // 8-byte Folded Spill -; CHECK1024-NEXT: addvl sp, sp, #-18 -; CHECK1024-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p8, [sp, #11, mul 
vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str z23, [sp, #2, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z22, [sp, #3, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z21, [sp, #4, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z20, [sp, #5, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z19, [sp, #6, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z18, [sp, #7, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z17, [sp, #8, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z16, [sp, #9, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z15, [sp, #10, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z14, [sp, #11, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z13, [sp, #12, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z12, [sp, #13, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z11, [sp, #14, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: sub sp, sp, #1072 -; CHECK1024-NEXT: addvl sp, sp, #-1 -; CHECK1024-NEXT: mov x19, sp -; CHECK1024-NEXT: .cfi_def_cfa w29, 64 -; CHECK1024-NEXT: .cfi_offset w19, -16 -; CHECK1024-NEXT: .cfi_offset w20, -24 -; CHECK1024-NEXT: .cfi_offset w26, -32 -; CHECK1024-NEXT: .cfi_offset w27, -40 -; CHECK1024-NEXT: .cfi_offset w28, -48 -; CHECK1024-NEXT: .cfi_offset w30, -56 -; CHECK1024-NEXT: .cfi_offset w29, -64 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x78, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d8 @ cfa - 8 * VG - 1088 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x49, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x70, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d9 @ cfa - 16 * VG - 1088 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x4a, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x68, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d10 @ cfa - 24 * VG - 1088 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x4b, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x60, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d11 @ cfa - 32 * VG - 1088 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x4c, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x58, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d12 @ cfa - 40 * VG - 1088 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x4d, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x50, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d13 @ cfa - 48 * VG - 1088 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x4e, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x48, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d14 @ cfa - 56 * VG - 1088 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x4f, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x40, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d15 @ cfa - 64 * VG - 1088 -; CHECK1024-NEXT: // kill: def $w0 killed $w0 def $x0 -; CHECK1024-NEXT: ubfiz x8, x0, #2, #32 -; CHECK1024-NEXT: mov x9, sp -; CHECK1024-NEXT: add x8, x8, #15 -; CHECK1024-NEXT: and x8, x8, #0x7fffffff0 -; CHECK1024-NEXT: sub x20, x9, x8 -; CHECK1024-NEXT: mov sp, x20 -; CHECK1024-NEXT: //APP -; CHECK1024-NEXT: //NO_APP -; CHECK1024-NEXT: add x0, x19, #8 -; CHECK1024-NEXT: bl bar -; CHECK1024-NEXT: sub x0, x29, #1024 -; CHECK1024-NEXT: addvl x0, x0, #-19 -; CHECK1024-NEXT: bl bar -; CHECK1024-NEXT: mov x0, x20 -; CHECK1024-NEXT: bl bar -; CHECK1024-NEXT: 
mov w0, #22647 // =0x5877 -; CHECK1024-NEXT: sub x8, x29, #1024 -; CHECK1024-NEXT: movk w0, #59491, lsl #16 -; CHECK1024-NEXT: addvl sp, x8, #-18 -; CHECK1024-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: sub sp, x29, #1024 -; CHECK1024-NEXT: ldr x19, [sp, #1072] // 8-byte Folded Reload -; CHECK1024-NEXT: ldr x20, [sp, #1064] // 8-byte Folded Reload -; CHECK1024-NEXT: ldr x26, [sp, #1056] // 8-byte Folded Reload -; CHECK1024-NEXT: ldr x27, [sp, #1048] // 8-byte Folded Reload -; CHECK1024-NEXT: ldr x28, [sp, #1040] // 8-byte Folded Reload -; CHECK1024-NEXT: ldr x30, [sp, #1032] // 8-byte Folded Reload -; CHECK1024-NEXT: ldr x29, [sp, #1024] // 8-byte Folded Reload -; CHECK1024-NEXT: add sp, sp, #1088 -; CHECK1024-NEXT: ret +; CHECK1024-NOSPLITSVE-LABEL: svecc_call_dynamic_and_scalable_alloca: +; CHECK1024-NOSPLITSVE: // %bb.0: // %entry +; CHECK1024-NOSPLITSVE-NEXT: sub sp, sp, #1088 +; CHECK1024-NOSPLITSVE-NEXT: str x29, [sp, #1024] // 8-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: add x29, sp, #1024 +; CHECK1024-NOSPLITSVE-NEXT: str x30, [sp, #1032] // 8-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str x28, [sp, #1040] // 8-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str x27, [sp, #1048] // 8-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str x26, [sp, #1056] // 8-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str x20, [sp, #1064] // 8-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str x19, [sp, #1072] // 8-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: addvl sp, sp, #-18 +; CHECK1024-NOSPLITSVE-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p14, [sp, #5, 
mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z23, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z22, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z21, [sp, #4, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z20, [sp, #5, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z19, [sp, #6, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z18, [sp, #7, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z17, [sp, #8, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z16, [sp, #9, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z15, [sp, #10, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z14, [sp, #11, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z13, [sp, #12, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z12, [sp, #13, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z11, [sp, #14, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: sub sp, sp, #1072 +; CHECK1024-NOSPLITSVE-NEXT: addvl sp, sp, #-1 +; CHECK1024-NOSPLITSVE-NEXT: mov x19, sp +; CHECK1024-NOSPLITSVE-NEXT: .cfi_def_cfa w29, 64 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w19, -16 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w20, -24 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w26, -32 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w27, -40 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w28, -48 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w30, -56 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w29, -64 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x78, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d8 @ cfa - 8 * VG - 1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x49, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x70, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d9 @ cfa - 16 * VG - 1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4a, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x68, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d10 @ cfa - 24 * VG - 1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4b, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x60, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d11 @ cfa - 32 * VG - 1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4c, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x58, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d12 @ cfa - 40 * VG - 1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4d, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x50, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 
// $d13 @ cfa - 48 * VG - 1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4e, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x48, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d14 @ cfa - 56 * VG - 1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4f, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x40, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d15 @ cfa - 64 * VG - 1088 +; CHECK1024-NOSPLITSVE-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK1024-NOSPLITSVE-NEXT: ubfiz x8, x0, #2, #32 +; CHECK1024-NOSPLITSVE-NEXT: mov x9, sp +; CHECK1024-NOSPLITSVE-NEXT: add x8, x8, #15 +; CHECK1024-NOSPLITSVE-NEXT: and x8, x8, #0x7fffffff0 +; CHECK1024-NOSPLITSVE-NEXT: sub x20, x9, x8 +; CHECK1024-NOSPLITSVE-NEXT: mov sp, x20 +; CHECK1024-NOSPLITSVE-NEXT: //APP +; CHECK1024-NOSPLITSVE-NEXT: //NO_APP +; CHECK1024-NOSPLITSVE-NEXT: add x0, x19, #8 +; CHECK1024-NOSPLITSVE-NEXT: bl bar +; CHECK1024-NOSPLITSVE-NEXT: sub x0, x29, #1024 +; CHECK1024-NOSPLITSVE-NEXT: addvl x0, x0, #-19 +; CHECK1024-NOSPLITSVE-NEXT: bl bar +; CHECK1024-NOSPLITSVE-NEXT: mov x0, x20 +; CHECK1024-NOSPLITSVE-NEXT: bl bar +; CHECK1024-NOSPLITSVE-NEXT: mov w0, #22647 // =0x5877 +; CHECK1024-NOSPLITSVE-NEXT: sub x8, x29, #1024 +; CHECK1024-NOSPLITSVE-NEXT: movk w0, #59491, lsl #16 +; CHECK1024-NOSPLITSVE-NEXT: addvl sp, x8, #-18 +; CHECK1024-NOSPLITSVE-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload +; 
CHECK1024-NOSPLITSVE-NEXT: sub sp, x29, #1024 +; CHECK1024-NOSPLITSVE-NEXT: ldr x19, [sp, #1072] // 8-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr x20, [sp, #1064] // 8-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr x26, [sp, #1056] // 8-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr x27, [sp, #1048] // 8-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr x28, [sp, #1040] // 8-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr x30, [sp, #1032] // 8-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr x29, [sp, #1024] // 8-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: add sp, sp, #1088 +; CHECK1024-NOSPLITSVE-NEXT: ret +; +; CHECK1024-SPLITSVE-LABEL: svecc_call_dynamic_and_scalable_alloca: +; CHECK1024-SPLITSVE: // %bb.0: // %entry +; CHECK1024-SPLITSVE-NEXT: stp x29, x30, [sp, #-64]! // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str x28, [sp, #16] // 8-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: mov x29, sp +; CHECK1024-SPLITSVE-NEXT: stp x27, x26, [sp, #32] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: addvl sp, sp, #-2 +; CHECK1024-SPLITSVE-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: sub sp, sp, #1024 +; CHECK1024-SPLITSVE-NEXT: addvl sp, sp, #-16 +; CHECK1024-SPLITSVE-NEXT: str z23, [sp] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z22, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z21, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z20, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z19, [sp, #4, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z18, [sp, #5, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z17, [sp, #6, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z16, [sp, #7, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z15, [sp, #8, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z14, [sp, #9, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z13, [sp, #10, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z12, [sp, #11, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z8, [sp, #15, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: sub sp, sp, #1072 +; CHECK1024-SPLITSVE-NEXT: addvl sp, sp, #-1 +; CHECK1024-SPLITSVE-NEXT: mov x19, sp +; 
CHECK1024-SPLITSVE-NEXT: .cfi_def_cfa w29, 64 +; CHECK1024-SPLITSVE-NEXT: .cfi_offset w19, -8 +; CHECK1024-SPLITSVE-NEXT: .cfi_offset w20, -16 +; CHECK1024-SPLITSVE-NEXT: .cfi_offset w26, -24 +; CHECK1024-SPLITSVE-NEXT: .cfi_offset w27, -32 +; CHECK1024-SPLITSVE-NEXT: .cfi_offset w28, -48 +; CHECK1024-SPLITSVE-NEXT: .cfi_offset w30, -56 +; CHECK1024-SPLITSVE-NEXT: .cfi_offset w29, -64 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x68, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d8 @ cfa - 24 * VG - 1088 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x49, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x60, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d9 @ cfa - 32 * VG - 1088 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4a, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x58, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d10 @ cfa - 40 * VG - 1088 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4b, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x50, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d11 @ cfa - 48 * VG - 1088 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4c, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x48, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d12 @ cfa - 56 * VG - 1088 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4d, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x40, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d13 @ cfa - 64 * VG - 1088 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4e, 0x0c, 0x92, 0x2e, 0x00, 0x11, 0xb8, 0x7f, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d14 @ cfa - 72 * VG - 1088 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4f, 0x0c, 0x92, 0x2e, 0x00, 0x11, 0xb0, 0x7f, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d15 @ cfa - 80 * VG - 1088 +; CHECK1024-SPLITSVE-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK1024-SPLITSVE-NEXT: ubfiz x8, x0, #2, #32 +; CHECK1024-SPLITSVE-NEXT: mov x9, sp +; CHECK1024-SPLITSVE-NEXT: add x8, x8, #15 +; CHECK1024-SPLITSVE-NEXT: and x8, x8, #0x7fffffff0 +; CHECK1024-SPLITSVE-NEXT: sub x20, x9, x8 +; CHECK1024-SPLITSVE-NEXT: mov sp, x20 +; CHECK1024-SPLITSVE-NEXT: //APP +; CHECK1024-SPLITSVE-NEXT: //NO_APP +; CHECK1024-SPLITSVE-NEXT: add x0, x19, #8 +; CHECK1024-SPLITSVE-NEXT: bl bar +; CHECK1024-SPLITSVE-NEXT: sub x0, x29, #1024 +; CHECK1024-SPLITSVE-NEXT: addvl x0, x0, #-19 +; CHECK1024-SPLITSVE-NEXT: bl bar +; CHECK1024-SPLITSVE-NEXT: mov x0, x20 +; CHECK1024-SPLITSVE-NEXT: bl bar +; CHECK1024-SPLITSVE-NEXT: mov w0, #22647 // =0x5877 +; CHECK1024-SPLITSVE-NEXT: sub x8, x29, #1024 +; CHECK1024-SPLITSVE-NEXT: movk w0, #59491, lsl #16 +; CHECK1024-SPLITSVE-NEXT: addvl sp, x8, #-18 +; CHECK1024-SPLITSVE-NEXT: ldr z23, [sp] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z22, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z21, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z20, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z19, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z18, [sp, #5, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z17, [sp, #6, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z16, [sp, #7, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z15, [sp, #8, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z14, [sp, #9, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z13, [sp, #10, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z12, [sp, #11, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload +; 
CHECK1024-SPLITSVE-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z8, [sp, #15, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: addvl sp, x29, #-2 +; CHECK1024-SPLITSVE-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: mov sp, x29 +; CHECK1024-SPLITSVE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr x28, [sp, #16] // 8-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldp x27, x26, [sp, #32] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldp x29, x30, [sp], #64 // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ret entry: %a = alloca i32, i32 10 %b = alloca <vscale x 4 x i32> From 389aa46a99405c5c003c309ce1b0defbdebf6b23 Mon Sep 17 00:00:00 2001 From: Fabian Mora <fmora.dev@gmail.com> Date: Tue, 4 Nov 2025 09:12:35 -0500 Subject: [PATCH 165/313] [mlir][DataFlow] Add visitBlockTransfer hook to dense analyses (#166263) Add a customizable `visitBlockTransfer` method to dense forward and backward dataflow analyses, allowing subclasses to customize lattice propagation behavior along control flow edges between blocks. The default implementation preserves the existing join/meet semantics. This change mirrors the existing structure of both dense dataflow classes, where propagation across `RegionBranchOpInterface` ops and callables can already be customized by subclasses. The use case motivating this change is dense liveness analysis. Currently, without the customization hook, the block transfer function produces incorrect results. The issue is that the current logic doesn't remove the successor block arguments from the live set, as it only meets the successor state with the predecessor state (i.e., set union). With this change it is now possible to compute the correct result by specifying the correct logic in `visitBlockTransfer`. Signed-off-by: Fabian Mora <fabian.mora-cordero@amd.com> --- .../mlir/Analysis/DataFlow/DenseAnalysis.h | 69 +++++++++++++++++++ mlir/lib/Analysis/DataFlow/DenseAnalysis.cpp | 9 ++- 2 files changed, 75 insertions(+), 3 deletions(-) diff --git a/mlir/include/mlir/Analysis/DataFlow/DenseAnalysis.h b/mlir/include/mlir/Analysis/DataFlow/DenseAnalysis.h index 3c87c453a4cf0..5b7b45fdd1d58 100644 --- a/mlir/include/mlir/Analysis/DataFlow/DenseAnalysis.h +++ b/mlir/include/mlir/Analysis/DataFlow/DenseAnalysis.h @@ -127,6 +127,18 @@ class AbstractDenseForwardDataFlowAnalysis : public DataFlowAnalysis { /// them into the same equivalent class.
virtual void buildOperationEquivalentLatticeAnchor(Operation *op) {} + /// Visit a block and propagate the dense lattice forward along the control + /// flow edge from predecessor to block. `point` corresponds to the program + /// point before `block`. The default implementation merges in the state from + /// the predecessor's terminator. + virtual void visitBlockTransfer(Block *block, ProgramPoint *point, + Block *predecessor, + const AbstractDenseLattice &before, + AbstractDenseLattice *after) { + // Merge in the state from the predecessor's terminator. + join(after, before); + } + /// Propagate the dense lattice forward along the control flow edge from /// `regionFrom` to `regionTo` regions of the `branch` operation. `nullopt` /// values correspond to control flow branches originating at or targeting the @@ -259,6 +271,22 @@ class DenseForwardDataFlowAnalysis branch, regionFrom, regionTo, before, after); } + /// Hook for customizing the behavior of lattice propagation along the control + /// flow edges between blocks. The control flows from `predecessor` to + /// `block`. The lattice is propagated forward along this edge. The lattices + /// are as follows: + /// - `before` is the lattice at the end of the predecessor block; + /// - `after` is the lattice at the beginning of the block. + /// By default, the `after` state is simply joined with the `before` state. + /// Concrete analyses can override this behavior or delegate to the parent + /// call for the default behavior. + virtual void visitBlockTransfer(Block *block, ProgramPoint *point, + Block *predecessor, const LatticeT &before, + LatticeT *after) { + AbstractDenseForwardDataFlowAnalysis::visitBlockTransfer( + block, point, predecessor, before, after); + } + protected: /// Get the dense lattice on this lattice anchor. LatticeT *getLattice(LatticeAnchor anchor) override { @@ -306,6 +334,13 @@ class DenseForwardDataFlowAnalysis static_cast<const LatticeT &>(before), static_cast<LatticeT *>(after)); } + void visitBlockTransfer(Block *block, ProgramPoint *point, Block *predecessor, + const AbstractDenseLattice &before, + AbstractDenseLattice *after) final { + visitBlockTransfer(block, point, predecessor, + static_cast<const LatticeT &>(before), + static_cast<LatticeT *>(after)); + } }; //===----------------------------------------------------------------------===// @@ -388,6 +423,17 @@ class AbstractDenseBackwardDataFlowAnalysis : public DataFlowAnalysis { /// them into the same equivalent class. virtual void buildOperationEquivalentLatticeAnchor(Operation *op) {} + /// Visit a block and propagate the dense lattice backward along the control + /// flow edge from successor to block. `point` corresponds to the program + /// point after `block`. The default implementation merges in the state from + /// the successor's first operation or the block itself when empty. + virtual void visitBlockTransfer(Block *block, ProgramPoint *point, + Block *successor, + const AbstractDenseLattice &after, + AbstractDenseLattice *before) { + meet(before, after); + } + /// Propagate the dense lattice backwards along the control flow edge from /// `regionFrom` to `regionTo` regions of the `branch` operation. `nullopt` /// values correspond to control flow branches originating at or targeting the @@ -531,6 +577,22 @@ class DenseBackwardDataFlowAnalysis branch, regionFrom, regionTo, after, before); } + /// Hook for customizing the behavior of lattice propagation along the control + /// flow edges between blocks. 
The control flows from `successor` to + /// `block`. The lattice is propagated back along this edge. The lattices + /// are as follows: + /// - `after` is the lattice at the beginning of the successor block; + /// - `before` is the lattice at the end of the block. + /// By default, the `before` state is simply met with the `after` state. + /// Concrete analyses can override this behavior or delegate to the parent + /// call for the default behavior. + virtual void visitBlockTransfer(Block *block, ProgramPoint *point, + Block *successor, const LatticeT &after, + LatticeT *before) { + AbstractDenseBackwardDataFlowAnalysis::visitBlockTransfer( + block, point, successor, after, before); + } + protected: /// Get the dense lattice at the given lattice anchor. LatticeT *getLattice(LatticeAnchor anchor) override { @@ -577,6 +639,13 @@ class DenseBackwardDataFlowAnalysis static_cast<const LatticeT &>(after), static_cast<LatticeT *>(before)); } + void visitBlockTransfer(Block *block, ProgramPoint *point, Block *successor, + const AbstractDenseLattice &after, + AbstractDenseLattice *before) final { + visitBlockTransfer(block, point, successor, + static_cast<const LatticeT &>(after), + static_cast<LatticeT *>(before)); + } }; } // end namespace dataflow diff --git a/mlir/lib/Analysis/DataFlow/DenseAnalysis.cpp b/mlir/lib/Analysis/DataFlow/DenseAnalysis.cpp index 0682e5f26785a..22bc0b32a9bd1 100644 --- a/mlir/lib/Analysis/DataFlow/DenseAnalysis.cpp +++ b/mlir/lib/Analysis/DataFlow/DenseAnalysis.cpp @@ -266,9 +266,10 @@ void AbstractDenseForwardDataFlowAnalysis::visitBlock(Block *block) { } LDBG() << " Joining state from predecessor " << predecessor; + const AbstractDenseLattice &before = *getLatticeFor( + point, getProgramPointAfter(predecessor->getTerminator())); // Merge in the state from the predecessor's terminator. - join(after, *getLatticeFor( - point, getProgramPointAfter(predecessor->getTerminator()))); + visitBlockTransfer(block, point, predecessor, before, after); } } @@ -614,7 +615,9 @@ void AbstractDenseBackwardDataFlowAnalysis::visitBlock(Block *block) { LDBG() << " Meeting state from successor " << successor; // Merge in the state from the successor: either the first operation, or the // block itself when empty. - meet(before, *getLatticeFor(point, getProgramPointBefore(successor))); + visitBlockTransfer(block, point, successor, + *getLatticeFor(point, getProgramPointBefore(successor)), + before); } } From ed45c0571eb35339f7c3562edbb3b27d67594acd Mon Sep 17 00:00:00 2001 From: Tuomas Kärnä <tuomas.karna@intel.com> Date: Tue, 4 Nov 2025 16:22:00 +0200 Subject: [PATCH 166/313] [MLIR][XeGPU] fix load/store/prefetch op offset verifier (#166137) The verifier of the `xegpu.{load/store/prefetch}_nd` ops fails if `offset` is a mix of static and dynamic values, e.g. `offset = [0, %c0]`. In this case the length of the dynamic offsets is 1, so the check `offsetSize != tDescRank` (= 2) fails. Instead, we should check the length of `getMixedOffsets()`.
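To see why counting only the dynamic operands undercounts, here is a minimal, self-contained C++ sketch of the corrected rank check; the helper name `offsetsRankValid` is invented for illustration, and only `llvm::ArrayRef` and `mlir::OpFoldResult` are assumed from the upstream headers:

#include <cstdint>
#include "llvm/ADT/ArrayRef.h"
#include "mlir/IR/OpDefinition.h" // declares mlir::OpFoldResult

// getMixedOffsets() folds the static (attribute) and dynamic (SSA value)
// offsets into a single OpFoldResult list, so a mixed list such as
// [0, %c0] has size 2, whereas counting only the dynamic operands (the
// old check) reports 1 and spuriously mismatches a rank-2 descriptor.
static bool offsetsRankValid(llvm::ArrayRef<mlir::OpFoldResult> mixedOffsets,
                             int64_t tDescRank) {
  int64_t offsetSize = static_cast<int64_t>(mixedOffsets.size());
  // Offsets are optional; when present, their rank must match the rank
  // of the tensor descriptor.
  return offsetSize == 0 || offsetSize == tDescRank;
}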
--- mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp | 21 ++++++--------------- mlir/test/Dialect/XeGPU/ops.mlir | 9 +++++++++ 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp index c8f5c86c03686..fb51077b5dff3 100644 --- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp +++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp @@ -498,11 +498,8 @@ LogicalResult PrefetchNdOp::verify() { return emitOpError("invalid l3_hint: ") << getL3HintAttr(); int64_t tDescRank = tdescTy.getRank(); - int64_t offsetSize = static_cast<int64_t>(getOffsets().size()); - int64_t constOffsetSize = - getConstOffsetsAttr() ? getConstOffsetsAttr().size() : 0; - if (((offsetSize != 0) && (offsetSize != tDescRank)) || - ((constOffsetSize != 0) && (constOffsetSize != tDescRank))) + int64_t offsetSize = getMixedOffsets().size(); + if (offsetSize != 0 && offsetSize != tDescRank) return emitOpError( "Mismatched ranks between offsets and tensor descriptor"); @@ -623,11 +620,8 @@ LogicalResult LoadNdOp::verify() { << tdescTy; int64_t tDescRank = tdescTy.getRank(); - int64_t offsetSize = static_cast<int64_t>(getOffsets().size()); - int64_t constOffsetSize = - getConstOffsetsAttr() ? getConstOffsetsAttr().size() : 0; - if (((offsetSize != 0) && (offsetSize != tDescRank)) || - ((constOffsetSize != 0) && (constOffsetSize != tDescRank))) + int64_t offsetSize = getMixedOffsets().size(); + if (offsetSize != 0 && offsetSize != tDescRank) return emitOpError( "Mismatched ranks between offsets and tensor descriptor"); @@ -717,11 +711,8 @@ LogicalResult StoreNdOp::verify() { << dstTy; int64_t tDescRank = dstTy.getRank(); - int64_t offsetSize = static_cast<int64_t>(getOffsets().size()); - int64_t constOffsetSize = - getConstOffsetsAttr() ? 
getConstOffsetsAttr().size() : 0; - if (((offsetSize != 0) && (offsetSize != tDescRank)) || - ((constOffsetSize != 0) && (constOffsetSize != tDescRank))) + int64_t offsetSize = getMixedOffsets().size(); + if (offsetSize != 0 && offsetSize != tDescRank) return emitOpError( "Mismatched ranks between offsets and tensor descriptor"); diff --git a/mlir/test/Dialect/XeGPU/ops.mlir b/mlir/test/Dialect/XeGPU/ops.mlir index 0a10f6814ae96..9b3829664108d 100644 --- a/mlir/test/Dialect/XeGPU/ops.mlir +++ b/mlir/test/Dialect/XeGPU/ops.mlir @@ -278,6 +278,15 @@ gpu.func @subgroup_load_nd_offset_1(%src: memref<24x32xf32>, %x : index, %y : in gpu.return } +// CHECK: func @subgroup_load_nd_offset_2(%[[arg0:.*]]: memref<24x32xf32>, %arg1: index) { +gpu.func @subgroup_load_nd_offset_2(%src: memref<24x32xf32>, %x : index) { + // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0 : memref<24x32xf32> -> !xegpu.tensor_desc<16x8xf32> + %1 = xegpu.create_nd_tdesc %src : memref<24x32xf32> -> !xegpu.tensor_desc<16x8xf32> + // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]][%arg1, 0] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose = array<i64: 1, 0>}> : !xegpu.tensor_desc<16x8xf32> -> vector<8x16xf32> + %2 = xegpu.load_nd %1[%x, 0] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose = array<i64: 1, 0>}> : !xegpu.tensor_desc<16x8xf32> -> vector<8x16xf32> + gpu.return +} + // CHECK: func @simt_load_nd_8(%[[arg0:.*]]: memref<24x32xf32>) { gpu.func @simt_load_nd_8(%src: memref<24x32xf32>) { // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<16x8xf32> From a01e4da6d64320bf308ef1596ec4d630772616c9 Mon Sep 17 00:00:00 2001 From: Lei Huang <lei@ca.ibm.com> Date: Tue, 4 Nov 2025 09:29:26 -0500 Subject: [PATCH 167/313] [PowerPC] Ensure correct codegen for MMA functions for cpu=future (#165791) Update MMA tests to add run lines for `cpu=future` to ensure MMA functionality is not broken with the new `wacc` register classes introduced. A previous commit added defs for using the new `wacc` registers; this just adds testing and fixes a few patterns that were missing.
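To make the fixed pattern concrete, here is a short, hedged C-level sketch: the clang builtin `__builtin_mma_xvf64gernp` lowers to the `llvm.ppc.mma.xvf64gernp` intrinsic whose ISA-future pattern the `.td` hunk below corrects (selecting the `wacc`-based XVF64GERWNP instead of XVF64GERNP). The function name is invented, and compiling it assumes a clang with MMA enabled and the experimental `future` CPU:

#include <altivec.h>

// xvf64gernp performs one f64 GER (outer-product) step with a negative
// multiply and positive accumulate, folding the product of the f64 data
// in `pair` and the data in `vc` into the 512-bit accumulator *acc.
void xvf64gernp_step(__vector_quad *acc, __vector_pair pair,
                     vector unsigned char vc) {
  __builtin_mma_xvf64gernp(acc, pair, vc);
}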
--- llvm/lib/Target/PowerPC/PPCInstrMMA.td | 4 +- .../CodeGen/PowerPC/mma-acc-copy-hints.ll | 91 ++ llvm/test/CodeGen/PowerPC/mma-acc-memops.ll | 170 +++ llvm/test/CodeGen/PowerPC/mma-acc-spill.ll | 102 ++ .../mma-integer-based-outer-product.ll | 166 +++ llvm/test/CodeGen/PowerPC/mma-intrinsics.ll | 517 +++++++ .../test/CodeGen/PowerPC/mma-outer-product.ll | 1266 +++++++++++++++++ llvm/test/CodeGen/PowerPC/mma-phi-accs.ll | 202 +++ .../PowerPC/peephole-mma-phi-liveness.ll | 39 + 9 files changed, 2555 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/PowerPC/PPCInstrMMA.td b/llvm/lib/Target/PowerPC/PPCInstrMMA.td index b38dd4ae948c6..fc3cde3f464bb 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrMMA.td +++ b/llvm/lib/Target/PowerPC/PPCInstrMMA.td @@ -202,7 +202,7 @@ multiclass ACC_UM_M244_XO46<bits<6> opcode, bits<8> xo, dag IOL, string asmbase, RegConstraint<"@earlyclobber $AT">; def PM#NAME#WPP : MMIRR_XX3Form_XY4P2_XAB6< - opcode, !or(xo, 0x20), (outs acc:$AT), + opcode, !or(xo, 0x20), (outs wacc:$AT), !con((ins wacc:$ATi), !con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK, u2imm:$PMSK))), !strconcat("pm"#asmbase#"pp ", asmstr#", $XMSK, $YMSK, $PMSK"), @@ -765,7 +765,7 @@ let Predicates = [MMA, IsISAFuture] in { def : Pat<(v512i1 (int_ppc_mma_xvf64gerpn v512i1:$ATi, v256i1:$XA, v16i8:$XB)), (XVF64GERWPN $ATi, $XA, RCCp.BToVSRC)>; def : Pat<(v512i1 (int_ppc_mma_xvf64gernp v512i1:$ATi, v256i1:$XA, v16i8:$XB)), - (XVF64GERNP $ATi, $XA, RCCp.BToVSRC)>; + (XVF64GERWNP $ATi, $XA, RCCp.BToVSRC)>; def : Pat<(v512i1 (int_ppc_mma_xvf64gernn v512i1:$ATi, v256i1:$XA, v16i8:$XB)), (XVF64GERWNN $ATi, $XA, RCCp.BToVSRC)>; diff --git a/llvm/test/CodeGen/PowerPC/mma-acc-copy-hints.ll b/llvm/test/CodeGen/PowerPC/mma-acc-copy-hints.ll index 7e2f744ac1d71..94121f09e36be 100644 --- a/llvm/test/CodeGen/PowerPC/mma-acc-copy-hints.ll +++ b/llvm/test/CodeGen/PowerPC/mma-acc-copy-hints.ll @@ -5,6 +5,12 @@ ; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \ ; RUN: -mcpu=pwr10 -ppc-asm-full-reg-names \ ; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-BE +; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \ +; RUN: -mcpu=future -ppc-asm-full-reg-names \ +; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-LE-WACC +; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \ +; RUN: -mcpu=future -ppc-asm-full-reg-names \ +; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-BE-WACC define void @testMultiply(ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b, ptr nocapture noundef writeonly %c) local_unnamed_addr #0 { ; CHECK-LABEL: testMultiply: @@ -91,6 +97,91 @@ define void @testMultiply(ptr nocapture noundef readonly %a, ptr nocapture nound ; CHECK-BE-NEXT: ld r30, -16(r1) ; CHECK-BE-NEXT: mtlr r0 ; CHECK-BE-NEXT: blr +; +; CHECK-LE-WACC-LABEL: testMultiply: +; CHECK-LE-WACC: # %bb.0: # %entry +; CHECK-LE-WACC-NEXT: mflr r0 +; CHECK-LE-WACC-NEXT: std r30, -16(r1) +; CHECK-LE-WACC-NEXT: std r0, 16(r1) +; CHECK-LE-WACC-NEXT: clrldi r0, r1, 59 +; CHECK-LE-WACC-NEXT: subfic r0, r0, -128 +; CHECK-LE-WACC-NEXT: mr r30, r1 +; CHECK-LE-WACC-NEXT: stdux r1, r1, r0 +; CHECK-LE-WACC-NEXT: stxv v30, -64(r30) # 16-byte Folded Spill +; CHECK-LE-WACC-NEXT: stxv v31, -48(r30) # 16-byte Folded Spill +; CHECK-LE-WACC-NEXT: lxv v31, 0(r3) +; CHECK-LE-WACC-NEXT: lxv v30, 0(r4) +; CHECK-LE-WACC-NEXT: addi r3, r1, 32 +; CHECK-LE-WACC-NEXT: std r29, -24(r30) # 8-byte Folded Spill +; CHECK-LE-WACC-NEXT: vmr v2, v31 +; 
CHECK-LE-WACC-NEXT: vmr v3, v30 +; CHECK-LE-WACC-NEXT: mr r29, r5 +; CHECK-LE-WACC-NEXT: bl _Z15buildVectorPairPu13__vector_pairDv16_hS0_@notoc +; CHECK-LE-WACC-NEXT: dmxxsetaccz wacc0 +; CHECK-LE-WACC-NEXT: xvf32gerpp wacc0, v31, v30 +; CHECK-LE-WACC-NEXT: lxv vs0, 48(r1) +; CHECK-LE-WACC-NEXT: lxv vs1, 32(r1) +; CHECK-LE-WACC-NEXT: xvf32gerpp wacc0, vs1, vs0 +; CHECK-LE-WACC-NEXT: dmxxextfdmr512 vsp36, vsp34, wacc0, 0 +; CHECK-LE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp36, vsp34, 0 +; CHECK-LE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-LE-WACC-NEXT: stxv v5, 0(r29) +; CHECK-LE-WACC-NEXT: pstxv v4, 8(r29), 0 +; CHECK-LE-WACC-NEXT: stxv v3, 16(r29) +; CHECK-LE-WACC-NEXT: pstxv v2, 24(r29), 0 +; CHECK-LE-WACC-NEXT: lxv v31, -48(r30) # 16-byte Folded Reload +; CHECK-LE-WACC-NEXT: lxv v30, -64(r30) # 16-byte Folded Reload +; CHECK-LE-WACC-NEXT: ld r29, -24(r30) # 8-byte Folded Reload +; CHECK-LE-WACC-NEXT: mr r1, r30 +; CHECK-LE-WACC-NEXT: ld r0, 16(r1) +; CHECK-LE-WACC-NEXT: ld r30, -16(r1) +; CHECK-LE-WACC-NEXT: mtlr r0 +; CHECK-LE-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: testMultiply: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: mflr r0 +; CHECK-BE-WACC-NEXT: std r30, -16(r1) +; CHECK-BE-WACC-NEXT: std r0, 16(r1) +; CHECK-BE-WACC-NEXT: clrldi r0, r1, 59 +; CHECK-BE-WACC-NEXT: subfic r0, r0, -224 +; CHECK-BE-WACC-NEXT: mr r30, r1 +; CHECK-BE-WACC-NEXT: stdux r1, r1, r0 +; CHECK-BE-WACC-NEXT: stxv v30, -64(r30) # 16-byte Folded Spill +; CHECK-BE-WACC-NEXT: stxv v31, -48(r30) # 16-byte Folded Spill +; CHECK-BE-WACC-NEXT: lxv v31, 0(r3) +; CHECK-BE-WACC-NEXT: lxv v30, 0(r4) +; CHECK-BE-WACC-NEXT: addi r3, r1, 128 +; CHECK-BE-WACC-NEXT: std r29, -24(r30) # 8-byte Folded Spill +; CHECK-BE-WACC-NEXT: vmr v2, v31 +; CHECK-BE-WACC-NEXT: vmr v3, v30 +; CHECK-BE-WACC-NEXT: mr r29, r5 +; CHECK-BE-WACC-NEXT: bl _Z15buildVectorPairPu13__vector_pairDv16_hS0_ +; CHECK-BE-WACC-NEXT: nop +; CHECK-BE-WACC-NEXT: dmxxsetaccz wacc0 +; CHECK-BE-WACC-NEXT: xvf32gerpp wacc0, v31, v30 +; CHECK-BE-WACC-NEXT: lxv vs0, 128(r1) +; CHECK-BE-WACC-NEXT: lxv vs1, 144(r1) +; CHECK-BE-WACC-NEXT: xvf32gerpp wacc0, vs0, vs1 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: vmr v1, v2 +; CHECK-BE-WACC-NEXT: vmr v7, v4 +; CHECK-BE-WACC-NEXT: vmr v0, v3 +; CHECK-BE-WACC-NEXT: vmr v6, v5 +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp38, vsp32, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v2, 0(r29) +; CHECK-BE-WACC-NEXT: pstxv v3, 8(r29), 0 +; CHECK-BE-WACC-NEXT: stxv v4, 16(r29) +; CHECK-BE-WACC-NEXT: pstxv v5, 24(r29), 0 +; CHECK-BE-WACC-NEXT: lxv v31, -48(r30) # 16-byte Folded Reload +; CHECK-BE-WACC-NEXT: lxv v30, -64(r30) # 16-byte Folded Reload +; CHECK-BE-WACC-NEXT: ld r29, -24(r30) # 8-byte Folded Reload +; CHECK-BE-WACC-NEXT: mr r1, r30 +; CHECK-BE-WACC-NEXT: ld r0, 16(r1) +; CHECK-BE-WACC-NEXT: ld r30, -16(r1) +; CHECK-BE-WACC-NEXT: mtlr r0 +; CHECK-BE-WACC-NEXT: blr entry: %vP = alloca <256 x i1>, align 32 call void @llvm.lifetime.start.p0(i64 32, ptr nonnull %vP) diff --git a/llvm/test/CodeGen/PowerPC/mma-acc-memops.ll b/llvm/test/CodeGen/PowerPC/mma-acc-memops.ll index 059d60a9608f8..bc5d5bed36e9b 100644 --- a/llvm/test/CodeGen/PowerPC/mma-acc-memops.ll +++ b/llvm/test/CodeGen/PowerPC/mma-acc-memops.ll @@ -3,10 +3,18 @@ ; RUN: -mcpu=pwr10 -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \ ; RUN: -disable-auto-paired-vec-st=false < %s | FileCheck %s \ ; RUN: --check-prefix=LE-PAIRED +; RUN: llc -verify-machineinstrs 
-mtriple=powerpc64le-unknown-linux-gnu \ +; RUN: -mcpu=future -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \ +; RUN: -disable-auto-paired-vec-st=false < %s | FileCheck %s \ +; RUN: --check-prefix=LE-PAIRED-WACC ; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \ ; RUN: -mcpu=pwr10 -ppc-asm-full-reg-names \ ; RUN: -ppc-vsr-nums-as-vr -disable-auto-paired-vec-st=false < %s | \ ; RUN: FileCheck %s --check-prefix=BE-PAIRED +; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \ +; RUN: -mcpu=future -ppc-asm-full-reg-names \ +; RUN: -ppc-vsr-nums-as-vr -disable-auto-paired-vec-st=false < %s | \ +; RUN: FileCheck %s --check-prefix=BE-PAIRED-WACC ; RUN: llc -verify-machineinstrs -mcpu=pwr9 -ppc-vsr-nums-as-vr \ ; RUN: -ppc-asm-full-reg-names -mtriple=powerpc64le-unknown-linux-gnu < %s \ ; RUN: | FileCheck %s --check-prefix=LE-PWR9 @@ -36,6 +44,20 @@ define dso_local void @testLdSt(i64 %SrcIdx, i64 %DstIdx) { ; LE-PAIRED-NEXT: pstxv vs3, f@PCREL+128(0), 1 ; LE-PAIRED-NEXT: blr ; +; LE-PAIRED-WACC-LABEL: testLdSt: +; LE-PAIRED-WACC: # %bb.0: # %entry +; LE-PAIRED-WACC-NEXT: plxv v3, f@PCREL+64(0), 1 +; LE-PAIRED-WACC-NEXT: plxv v5, f@PCREL+96(0), 1 +; LE-PAIRED-WACC-NEXT: plxv v2, f@PCREL+80(0), 1 +; LE-PAIRED-WACC-NEXT: plxv v4, f@PCREL+112(0), 1 +; LE-PAIRED-WACC-NEXT: dmxxinstdmr512 wacc0, vsp36, vsp34, 0 +; LE-PAIRED-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; LE-PAIRED-WACC-NEXT: pstxv v4, f@PCREL+176(0), 1 +; LE-PAIRED-WACC-NEXT: pstxv v5, f@PCREL+160(0), 1 +; LE-PAIRED-WACC-NEXT: pstxv v2, f@PCREL+144(0), 1 +; LE-PAIRED-WACC-NEXT: pstxv v3, f@PCREL+128(0), 1 +; LE-PAIRED-WACC-NEXT: blr +; ; BE-PAIRED-LABEL: testLdSt: ; BE-PAIRED: # %bb.0: # %entry ; BE-PAIRED-NEXT: addis r3, r2, f@toc@ha @@ -50,6 +72,22 @@ define dso_local void @testLdSt(i64 %SrcIdx, i64 %DstIdx) { ; BE-PAIRED-NEXT: stxv vs2, 160(r3) ; BE-PAIRED-NEXT: blr ; +; BE-PAIRED-WACC-LABEL: testLdSt: +; BE-PAIRED-WACC: # %bb.0: # %entry +; BE-PAIRED-WACC-NEXT: addis r3, r2, f@toc@ha +; BE-PAIRED-WACC-NEXT: addi r3, r3, f@toc@l +; BE-PAIRED-WACC-NEXT: lxv v3, 112(r3) +; BE-PAIRED-WACC-NEXT: lxv v5, 80(r3) +; BE-PAIRED-WACC-NEXT: lxv v2, 96(r3) +; BE-PAIRED-WACC-NEXT: lxv v4, 64(r3) +; BE-PAIRED-WACC-NEXT: dmxxinstdmr512 wacc0, vsp36, vsp34, 0 +; BE-PAIRED-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; BE-PAIRED-WACC-NEXT: stxv v5, 176(r3) +; BE-PAIRED-WACC-NEXT: stxv v4, 160(r3) +; BE-PAIRED-WACC-NEXT: stxv v3, 144(r3) +; BE-PAIRED-WACC-NEXT: stxv v2, 128(r3) +; BE-PAIRED-WACC-NEXT: blr +; ; LE-PWR9-LABEL: testLdSt: ; LE-PWR9: # %bb.0: # %entry ; LE-PWR9-NEXT: addis r3, r2, f@toc@ha @@ -147,6 +185,25 @@ define dso_local void @testXLdSt(i64 %SrcIdx, i64 %DstIdx) { ; LE-PAIRED-NEXT: stxv vs2, 16(r4) ; LE-PAIRED-NEXT: blr ; +; LE-PAIRED-WACC-LABEL: testXLdSt: +; LE-PAIRED-WACC: # %bb.0: # %entry +; LE-PAIRED-WACC-NEXT: paddi r5, 0, f@PCREL, 1 +; LE-PAIRED-WACC-NEXT: sldi r3, r3, 6 +; LE-PAIRED-WACC-NEXT: add r6, r5, r3 +; LE-PAIRED-WACC-NEXT: lxvx v3, r5, r3 +; LE-PAIRED-WACC-NEXT: lxv v2, 16(r6) +; LE-PAIRED-WACC-NEXT: lxv v5, 32(r6) +; LE-PAIRED-WACC-NEXT: lxv v4, 48(r6) +; LE-PAIRED-WACC-NEXT: sldi r3, r4, 6 +; LE-PAIRED-WACC-NEXT: add r4, r5, r3 +; LE-PAIRED-WACC-NEXT: dmxxinstdmr512 wacc0, vsp36, vsp34, 0 +; LE-PAIRED-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; LE-PAIRED-WACC-NEXT: stxvx v3, r5, r3 +; LE-PAIRED-WACC-NEXT: stxv v4, 48(r4) +; LE-PAIRED-WACC-NEXT: stxv v5, 32(r4) +; LE-PAIRED-WACC-NEXT: stxv v2, 16(r4) +; LE-PAIRED-WACC-NEXT: blr +; ; BE-PAIRED-LABEL: testXLdSt: 
; BE-PAIRED: # %bb.0: # %entry ; BE-PAIRED-NEXT: addis r5, r2, f@toc@ha @@ -165,6 +222,26 @@ define dso_local void @testXLdSt(i64 %SrcIdx, i64 %DstIdx) { ; BE-PAIRED-NEXT: stxv vs2, 32(r4) ; BE-PAIRED-NEXT: blr ; +; BE-PAIRED-WACC-LABEL: testXLdSt: +; BE-PAIRED-WACC: # %bb.0: # %entry +; BE-PAIRED-WACC-NEXT: addis r5, r2, f@toc@ha +; BE-PAIRED-WACC-NEXT: addi r5, r5, f@toc@l +; BE-PAIRED-WACC-NEXT: sldi r3, r3, 6 +; BE-PAIRED-WACC-NEXT: add r6, r5, r3 +; BE-PAIRED-WACC-NEXT: lxvx v2, r5, r3 +; BE-PAIRED-WACC-NEXT: lxv v5, 48(r6) +; BE-PAIRED-WACC-NEXT: lxv v3, 16(r6) +; BE-PAIRED-WACC-NEXT: lxv v4, 32(r6) +; BE-PAIRED-WACC-NEXT: sldi r3, r4, 6 +; BE-PAIRED-WACC-NEXT: add r4, r5, r3 +; BE-PAIRED-WACC-NEXT: dmxxinstdmr512 wacc0, vsp34, vsp36, 0 +; BE-PAIRED-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; BE-PAIRED-WACC-NEXT: stxvx v2, r5, r3 +; BE-PAIRED-WACC-NEXT: stxv v5, 48(r4) +; BE-PAIRED-WACC-NEXT: stxv v4, 32(r4) +; BE-PAIRED-WACC-NEXT: stxv v3, 16(r4) +; BE-PAIRED-WACC-NEXT: blr +; ; LE-PWR9-LABEL: testXLdSt: ; LE-PWR9: # %bb.0: # %entry ; LE-PWR9-NEXT: addis r5, r2, f@toc@ha @@ -263,6 +340,20 @@ define dso_local void @testUnalignedLdSt() { ; LE-PAIRED-NEXT: pstxv vs3, f@PCREL+19(0), 1 ; LE-PAIRED-NEXT: blr ; +; LE-PAIRED-WACC-LABEL: testUnalignedLdSt: +; LE-PAIRED-WACC: # %bb.0: # %entry +; LE-PAIRED-WACC-NEXT: plxv v3, f@PCREL+11(0), 1 +; LE-PAIRED-WACC-NEXT: plxv v5, f@PCREL+43(0), 1 +; LE-PAIRED-WACC-NEXT: plxv v2, f@PCREL+27(0), 1 +; LE-PAIRED-WACC-NEXT: plxv v4, f@PCREL+59(0), 1 +; LE-PAIRED-WACC-NEXT: dmxxinstdmr512 wacc0, vsp36, vsp34, 0 +; LE-PAIRED-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; LE-PAIRED-WACC-NEXT: pstxv v4, f@PCREL+67(0), 1 +; LE-PAIRED-WACC-NEXT: pstxv v5, f@PCREL+51(0), 1 +; LE-PAIRED-WACC-NEXT: pstxv v2, f@PCREL+35(0), 1 +; LE-PAIRED-WACC-NEXT: pstxv v3, f@PCREL+19(0), 1 +; LE-PAIRED-WACC-NEXT: blr +; ; BE-PAIRED-LABEL: testUnalignedLdSt: ; BE-PAIRED: # %bb.0: # %entry ; BE-PAIRED-NEXT: addis r3, r2, f@toc@ha @@ -277,6 +368,22 @@ define dso_local void @testUnalignedLdSt() { ; BE-PAIRED-NEXT: pstxv vs2, 51(r3), 0 ; BE-PAIRED-NEXT: blr ; +; BE-PAIRED-WACC-LABEL: testUnalignedLdSt: +; BE-PAIRED-WACC: # %bb.0: # %entry +; BE-PAIRED-WACC-NEXT: addis r3, r2, f@toc@ha +; BE-PAIRED-WACC-NEXT: addi r3, r3, f@toc@l +; BE-PAIRED-WACC-NEXT: plxv v3, 59(r3), 0 +; BE-PAIRED-WACC-NEXT: plxv v5, 27(r3), 0 +; BE-PAIRED-WACC-NEXT: plxv v2, 43(r3), 0 +; BE-PAIRED-WACC-NEXT: plxv v4, 11(r3), 0 +; BE-PAIRED-WACC-NEXT: dmxxinstdmr512 wacc0, vsp36, vsp34, 0 +; BE-PAIRED-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; BE-PAIRED-WACC-NEXT: pstxv v5, 67(r3), 0 +; BE-PAIRED-WACC-NEXT: pstxv v4, 51(r3), 0 +; BE-PAIRED-WACC-NEXT: pstxv v3, 35(r3), 0 +; BE-PAIRED-WACC-NEXT: pstxv v2, 19(r3), 0 +; BE-PAIRED-WACC-NEXT: blr +; ; LE-PWR9-LABEL: testUnalignedLdSt: ; LE-PWR9: # %bb.0: # %entry ; LE-PWR9-NEXT: addis r3, r2, f@toc@ha @@ -381,6 +488,14 @@ define dso_local void @testLdStPair(i64 %SrcIdx, i64 %DstIdx) { ; LE-PAIRED-NEXT: pstxv vs1, g@PCREL+64(0), 1 ; LE-PAIRED-NEXT: blr ; +; LE-PAIRED-WACC-LABEL: testLdStPair: +; LE-PAIRED-WACC: # %bb.0: # %entry +; LE-PAIRED-WACC-NEXT: plxv vs0, g@PCREL+48(0), 1 +; LE-PAIRED-WACC-NEXT: plxv vs1, g@PCREL+32(0), 1 +; LE-PAIRED-WACC-NEXT: pstxv vs0, g@PCREL+80(0), 1 +; LE-PAIRED-WACC-NEXT: pstxv vs1, g@PCREL+64(0), 1 +; LE-PAIRED-WACC-NEXT: blr +; ; BE-PAIRED-LABEL: testLdStPair: ; BE-PAIRED: # %bb.0: # %entry ; BE-PAIRED-NEXT: addis r3, r2, g@toc@ha @@ -391,6 +506,16 @@ define dso_local void @testLdStPair(i64 %SrcIdx, i64 %DstIdx) 
{ ; BE-PAIRED-NEXT: stxv vs0, 64(r3) ; BE-PAIRED-NEXT: blr ; +; BE-PAIRED-WACC-LABEL: testLdStPair: +; BE-PAIRED-WACC: # %bb.0: # %entry +; BE-PAIRED-WACC-NEXT: addis r3, r2, g@toc@ha +; BE-PAIRED-WACC-NEXT: addi r3, r3, g@toc@l +; BE-PAIRED-WACC-NEXT: lxv vs0, 48(r3) +; BE-PAIRED-WACC-NEXT: lxv vs1, 32(r3) +; BE-PAIRED-WACC-NEXT: stxv vs0, 80(r3) +; BE-PAIRED-WACC-NEXT: stxv vs1, 64(r3) +; BE-PAIRED-WACC-NEXT: blr +; ; LE-PWR9-LABEL: testLdStPair: ; LE-PWR9: # %bb.0: # %entry ; LE-PWR9-NEXT: addis r3, r2, g@toc@ha @@ -460,6 +585,19 @@ define dso_local void @testXLdStPair(i64 %SrcIdx, i64 %DstIdx) { ; LE-PAIRED-NEXT: stxv vs1, 16(r4) ; LE-PAIRED-NEXT: blr ; +; LE-PAIRED-WACC-LABEL: testXLdStPair: +; LE-PAIRED-WACC: # %bb.0: # %entry +; LE-PAIRED-WACC-NEXT: sldi r3, r3, 5 +; LE-PAIRED-WACC-NEXT: paddi r5, 0, g@PCREL, 1 +; LE-PAIRED-WACC-NEXT: add r6, r5, r3 +; LE-PAIRED-WACC-NEXT: lxvx vs0, r5, r3 +; LE-PAIRED-WACC-NEXT: lxv vs1, 16(r6) +; LE-PAIRED-WACC-NEXT: sldi r3, r4, 5 +; LE-PAIRED-WACC-NEXT: add r4, r5, r3 +; LE-PAIRED-WACC-NEXT: stxvx vs0, r5, r3 +; LE-PAIRED-WACC-NEXT: stxv vs1, 16(r4) +; LE-PAIRED-WACC-NEXT: blr +; ; BE-PAIRED-LABEL: testXLdStPair: ; BE-PAIRED: # %bb.0: # %entry ; BE-PAIRED-NEXT: addis r5, r2, g@toc@ha @@ -474,6 +612,20 @@ define dso_local void @testXLdStPair(i64 %SrcIdx, i64 %DstIdx) { ; BE-PAIRED-NEXT: stxv vs1, 16(r4) ; BE-PAIRED-NEXT: blr ; +; BE-PAIRED-WACC-LABEL: testXLdStPair: +; BE-PAIRED-WACC: # %bb.0: # %entry +; BE-PAIRED-WACC-NEXT: addis r5, r2, g@toc@ha +; BE-PAIRED-WACC-NEXT: sldi r3, r3, 5 +; BE-PAIRED-WACC-NEXT: addi r5, r5, g@toc@l +; BE-PAIRED-WACC-NEXT: add r6, r5, r3 +; BE-PAIRED-WACC-NEXT: lxvx vs0, r5, r3 +; BE-PAIRED-WACC-NEXT: lxv vs1, 16(r6) +; BE-PAIRED-WACC-NEXT: sldi r3, r4, 5 +; BE-PAIRED-WACC-NEXT: add r4, r5, r3 +; BE-PAIRED-WACC-NEXT: stxvx vs0, r5, r3 +; BE-PAIRED-WACC-NEXT: stxv vs1, 16(r4) +; BE-PAIRED-WACC-NEXT: blr +; ; LE-PWR9-LABEL: testXLdStPair: ; LE-PWR9: # %bb.0: # %entry ; LE-PWR9-NEXT: addis r5, r2, g@toc@ha @@ -548,6 +700,14 @@ define dso_local void @testUnalignedLdStPair() { ; LE-PAIRED-NEXT: pstxv vs1, g@PCREL+19(0), 1 ; LE-PAIRED-NEXT: blr ; +; LE-PAIRED-WACC-LABEL: testUnalignedLdStPair: +; LE-PAIRED-WACC: # %bb.0: # %entry +; LE-PAIRED-WACC-NEXT: plxv vs0, g@PCREL+27(0), 1 +; LE-PAIRED-WACC-NEXT: plxv vs1, g@PCREL+11(0), 1 +; LE-PAIRED-WACC-NEXT: pstxv vs0, g@PCREL+35(0), 1 +; LE-PAIRED-WACC-NEXT: pstxv vs1, g@PCREL+19(0), 1 +; LE-PAIRED-WACC-NEXT: blr +; ; BE-PAIRED-LABEL: testUnalignedLdStPair: ; BE-PAIRED: # %bb.0: # %entry ; BE-PAIRED-NEXT: addis r3, r2, g@toc@ha @@ -558,6 +718,16 @@ define dso_local void @testUnalignedLdStPair() { ; BE-PAIRED-NEXT: pstxv vs0, 19(r3), 0 ; BE-PAIRED-NEXT: blr ; +; BE-PAIRED-WACC-LABEL: testUnalignedLdStPair: +; BE-PAIRED-WACC: # %bb.0: # %entry +; BE-PAIRED-WACC-NEXT: addis r3, r2, g@toc@ha +; BE-PAIRED-WACC-NEXT: addi r3, r3, g@toc@l +; BE-PAIRED-WACC-NEXT: plxv vs0, 27(r3), 0 +; BE-PAIRED-WACC-NEXT: plxv vs1, 11(r3), 0 +; BE-PAIRED-WACC-NEXT: pstxv vs0, 35(r3), 0 +; BE-PAIRED-WACC-NEXT: pstxv vs1, 19(r3), 0 +; BE-PAIRED-WACC-NEXT: blr +; ; LE-PWR9-LABEL: testUnalignedLdStPair: ; LE-PWR9: # %bb.0: # %entry ; LE-PWR9-NEXT: addis r3, r2, g@toc@ha diff --git a/llvm/test/CodeGen/PowerPC/mma-acc-spill.ll b/llvm/test/CodeGen/PowerPC/mma-acc-spill.ll index abc65bed5bf6c..9db8ba1c9eb09 100644 --- a/llvm/test/CodeGen/PowerPC/mma-acc-spill.ll +++ b/llvm/test/CodeGen/PowerPC/mma-acc-spill.ll @@ -13,6 +13,13 @@ ; RUN: -mcpu=pwr11 -ppc-asm-full-reg-names -disable-auto-paired-vec-st=false \ 
; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-BE +; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \ +; RUN: -mcpu=future -ppc-asm-full-reg-names -disable-auto-paired-vec-st=false \ +; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-LE-WACC +; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \ +; RUN: -mcpu=future -ppc-asm-full-reg-names -disable-auto-paired-vec-st=false \ +; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-BE-WACC + declare <512 x i1> @llvm.ppc.mma.xvf16ger2pp(<512 x i1>, <16 x i8>, <16 x i8>) declare <512 x i1> @llvm.ppc.mma.assemble.acc(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) declare void @foo() @@ -119,6 +126,101 @@ define void @intrinsics1(<16 x i8> %vc1, <16 x i8> %vc2, <16 x i8> %vc3, <16 x i ; CHECK-BE-NEXT: ld r0, 16(r1) ; CHECK-BE-NEXT: mtlr r0 ; CHECK-BE-NEXT: blr +; +; CHECK-LE-WACC-LABEL: intrinsics1: +; CHECK-LE-WACC: # %bb.0: +; CHECK-LE-WACC-NEXT: mflr r0 +; CHECK-LE-WACC-NEXT: std r0, 16(r1) +; CHECK-LE-WACC-NEXT: stdu r1, -176(r1) +; CHECK-LE-WACC-NEXT: .cfi_def_cfa_offset 176 +; CHECK-LE-WACC-NEXT: .cfi_offset lr, 16 +; CHECK-LE-WACC-NEXT: .cfi_offset r30, -16 +; CHECK-LE-WACC-NEXT: .cfi_offset v28, -80 +; CHECK-LE-WACC-NEXT: .cfi_offset v29, -64 +; CHECK-LE-WACC-NEXT: .cfi_offset v30, -48 +; CHECK-LE-WACC-NEXT: .cfi_offset v31, -32 +; CHECK-LE-WACC-NEXT: stxv v28, 96(r1) # 16-byte Folded Spill +; CHECK-LE-WACC-NEXT: stxv v29, 112(r1) # 16-byte Folded Spill +; CHECK-LE-WACC-NEXT: stxv v30, 128(r1) # 16-byte Folded Spill +; CHECK-LE-WACC-NEXT: stxv v31, 144(r1) # 16-byte Folded Spill +; CHECK-LE-WACC-NEXT: vmr v31, v5 +; CHECK-LE-WACC-NEXT: vmr v29, v3 +; CHECK-LE-WACC-NEXT: vmr v30, v4 +; CHECK-LE-WACC-NEXT: vmr v28, v2 +; CHECK-LE-WACC-NEXT: std r30, 160(r1) # 8-byte Folded Spill +; CHECK-LE-WACC-NEXT: ld r30, 272(r1) +; CHECK-LE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp60, vsp62, 0 +; CHECK-LE-WACC-NEXT: xvf16ger2pp wacc0, v2, v4 +; CHECK-LE-WACC-NEXT: dmxxextfdmr512 vsp36, vsp34, wacc0, 0 +; CHECK-LE-WACC-NEXT: stxvp vsp36, 64(r1) +; CHECK-LE-WACC-NEXT: stxvp vsp34, 32(r1) +; CHECK-LE-WACC-NEXT: bl foo@notoc +; CHECK-LE-WACC-NEXT: lxvp vsp34, 64(r1) +; CHECK-LE-WACC-NEXT: lxvp vsp36, 32(r1) +; CHECK-LE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp34, vsp36, 0 +; CHECK-LE-WACC-NEXT: xvf16ger2pp wacc0, v28, v30 +; CHECK-LE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-LE-WACC-NEXT: stxv v4, 48(r30) +; CHECK-LE-WACC-NEXT: stxv v5, 32(r30) +; CHECK-LE-WACC-NEXT: stxv v2, 16(r30) +; CHECK-LE-WACC-NEXT: stxv v3, 0(r30) +; CHECK-LE-WACC-NEXT: lxv v31, 144(r1) # 16-byte Folded Reload +; CHECK-LE-WACC-NEXT: lxv v30, 128(r1) # 16-byte Folded Reload +; CHECK-LE-WACC-NEXT: lxv v29, 112(r1) # 16-byte Folded Reload +; CHECK-LE-WACC-NEXT: lxv v28, 96(r1) # 16-byte Folded Reload +; CHECK-LE-WACC-NEXT: ld r30, 160(r1) # 8-byte Folded Reload +; CHECK-LE-WACC-NEXT: addi r1, r1, 176 +; CHECK-LE-WACC-NEXT: ld r0, 16(r1) +; CHECK-LE-WACC-NEXT: mtlr r0 +; CHECK-LE-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: intrinsics1: +; CHECK-BE-WACC: # %bb.0: +; CHECK-BE-WACC-NEXT: mflr r0 +; CHECK-BE-WACC-NEXT: std r0, 16(r1) +; CHECK-BE-WACC-NEXT: stdu r1, -256(r1) +; CHECK-BE-WACC-NEXT: .cfi_def_cfa_offset 256 +; CHECK-BE-WACC-NEXT: .cfi_offset lr, 16 +; CHECK-BE-WACC-NEXT: .cfi_offset r30, -16 +; CHECK-BE-WACC-NEXT: .cfi_offset v28, -80 +; CHECK-BE-WACC-NEXT: .cfi_offset v29, -64 +; CHECK-BE-WACC-NEXT: .cfi_offset v30, -48 +; CHECK-BE-WACC-NEXT: .cfi_offset v31, -32 +; 
CHECK-BE-WACC-NEXT: stxv v28, 176(r1) # 16-byte Folded Spill +; CHECK-BE-WACC-NEXT: stxv v29, 192(r1) # 16-byte Folded Spill +; CHECK-BE-WACC-NEXT: stxv v30, 208(r1) # 16-byte Folded Spill +; CHECK-BE-WACC-NEXT: stxv v31, 224(r1) # 16-byte Folded Spill +; CHECK-BE-WACC-NEXT: vmr v31, v5 +; CHECK-BE-WACC-NEXT: vmr v29, v3 +; CHECK-BE-WACC-NEXT: vmr v30, v4 +; CHECK-BE-WACC-NEXT: vmr v28, v2 +; CHECK-BE-WACC-NEXT: std r30, 240(r1) # 8-byte Folded Spill +; CHECK-BE-WACC-NEXT: ld r30, 368(r1) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp60, vsp62, 0 +; CHECK-BE-WACC-NEXT: xvf16ger2pp wacc0, v2, v4 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp36, vsp34, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxvp vsp36, 112(r1) +; CHECK-BE-WACC-NEXT: stxvp vsp34, 144(r1) +; CHECK-BE-WACC-NEXT: bl foo +; CHECK-BE-WACC-NEXT: nop +; CHECK-BE-WACC-NEXT: lxvp vsp34, 112(r1) +; CHECK-BE-WACC-NEXT: lxvp vsp36, 144(r1) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp34, vsp36, 0 +; CHECK-BE-WACC-NEXT: xvf16ger2pp wacc0, v28, v30 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r30) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r30) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r30) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r30) +; CHECK-BE-WACC-NEXT: lxv v31, 224(r1) # 16-byte Folded Reload +; CHECK-BE-WACC-NEXT: lxv v30, 208(r1) # 16-byte Folded Reload +; CHECK-BE-WACC-NEXT: lxv v29, 192(r1) # 16-byte Folded Reload +; CHECK-BE-WACC-NEXT: lxv v28, 176(r1) # 16-byte Folded Reload +; CHECK-BE-WACC-NEXT: ld r30, 240(r1) # 8-byte Folded Reload +; CHECK-BE-WACC-NEXT: addi r1, r1, 256 +; CHECK-BE-WACC-NEXT: ld r0, 16(r1) +; CHECK-BE-WACC-NEXT: mtlr r0 +; CHECK-BE-WACC-NEXT: blr %1 = tail call <512 x i1> @llvm.ppc.mma.assemble.acc(<16 x i8> %vc1, <16 x i8> %vc2, <16 x i8> %vc3, <16 x i8> %vc4) %2 = tail call <512 x i1> @llvm.ppc.mma.xvf16ger2pp(<512 x i1> %1, <16 x i8> %vc1, <16 x i8> %vc3) tail call void @foo() diff --git a/llvm/test/CodeGen/PowerPC/mma-integer-based-outer-product.ll b/llvm/test/CodeGen/PowerPC/mma-integer-based-outer-product.ll index e932aec2c7134..7b36fa4f64f71 100644 --- a/llvm/test/CodeGen/PowerPC/mma-integer-based-outer-product.ll +++ b/llvm/test/CodeGen/PowerPC/mma-integer-based-outer-product.ll @@ -5,6 +5,12 @@ ; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \ ; RUN: -mcpu=pwr10 -ppc-asm-full-reg-names \ ; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-BE +; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \ +; RUN: -mcpu=future -ppc-asm-full-reg-names \ +; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-LE-WACC +; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \ +; RUN: -mcpu=future -ppc-asm-full-reg-names \ +; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-BE-WACC ; Function Attrs: nofree nounwind writeonly define dso_local void @test1(ptr nocapture readnone %vqp, ptr nocapture readnone %vpp, <16 x i8> %vc, ptr nocapture %resp) { @@ -27,6 +33,26 @@ define dso_local void @test1(ptr nocapture readnone %vqp, ptr nocapture readnone ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-LE-WACC-LABEL: test1: +; CHECK-LE-WACC: # %bb.0: # %entry +; CHECK-LE-WACC-NEXT: xvi16ger2 wacc0, v2, v2 +; CHECK-LE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-LE-WACC-NEXT: stxv v4, 48(r7) +; CHECK-LE-WACC-NEXT: stxv v5, 32(r7) +; CHECK-LE-WACC-NEXT: stxv v2, 16(r7) +; CHECK-LE-WACC-NEXT: stxv v3, 0(r7) +; CHECK-LE-WACC-NEXT: blr +; 
+; CHECK-BE-WACC-LABEL: test1: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: xvi16ger2 wacc0, v2, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = tail call <512 x i1> @llvm.ppc.mma.xvi16ger2(<16 x i8> %vc, <16 x i8> %vc) store <512 x i1> %0, ptr %resp, align 64 @@ -57,6 +83,26 @@ define dso_local void @test2(ptr nocapture readnone %vqp, ptr nocapture readnone ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-LE-WACC-LABEL: test2: +; CHECK-LE-WACC: # %bb.0: # %entry +; CHECK-LE-WACC-NEXT: pmxvi16ger2 wacc0, v2, v2, 0, 0, 0 +; CHECK-LE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-LE-WACC-NEXT: stxv v4, 48(r7) +; CHECK-LE-WACC-NEXT: stxv v5, 32(r7) +; CHECK-LE-WACC-NEXT: stxv v2, 16(r7) +; CHECK-LE-WACC-NEXT: stxv v3, 0(r7) +; CHECK-LE-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test2: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: pmxvi16ger2 wacc0, v2, v2, 0, 0, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = tail call <512 x i1> @llvm.ppc.mma.pmxvi16ger2(<16 x i8> %vc, <16 x i8> %vc, i32 0, i32 0, i32 0) store <512 x i1> %0, ptr %resp, align 64 @@ -97,6 +143,36 @@ define dso_local void @test3(ptr nocapture readonly %vqp, ptr nocapture readnone ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-LE-WACC-LABEL: test3: +; CHECK-LE-WACC: # %bb.0: # %entry +; CHECK-LE-WACC-NEXT: lxv v5, 0(r3) +; CHECK-LE-WACC-NEXT: lxv v1, 32(r3) +; CHECK-LE-WACC-NEXT: lxv v4, 16(r3) +; CHECK-LE-WACC-NEXT: lxv v0, 48(r3) +; CHECK-LE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-LE-WACC-NEXT: xvi8ger4spp wacc0, v2, v2 +; CHECK-LE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-LE-WACC-NEXT: stxv v4, 48(r7) +; CHECK-LE-WACC-NEXT: stxv v5, 32(r7) +; CHECK-LE-WACC-NEXT: stxv v2, 16(r7) +; CHECK-LE-WACC-NEXT: stxv v3, 0(r7) +; CHECK-LE-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test3: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: xvi8ger4spp wacc0, v2, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <512 x i1> @llvm.ppc.mma.xvi8ger4spp(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc) @@ -138,6 +214,36 @@ define dso_local void @test4(ptr nocapture readonly %vqp, ptr nocapture readnone ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-LE-WACC-LABEL: test4: +; CHECK-LE-WACC: # %bb.0: # %entry +; CHECK-LE-WACC-NEXT: lxv v5, 0(r3) +; CHECK-LE-WACC-NEXT: lxv v1, 32(r3) +; CHECK-LE-WACC-NEXT: lxv v4, 16(r3) +; CHECK-LE-WACC-NEXT: lxv v0, 48(r3) +; CHECK-LE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-LE-WACC-NEXT: 
xvi16ger2pp wacc0, v2, v2 +; CHECK-LE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-LE-WACC-NEXT: stxv v4, 48(r7) +; CHECK-LE-WACC-NEXT: stxv v5, 32(r7) +; CHECK-LE-WACC-NEXT: stxv v2, 16(r7) +; CHECK-LE-WACC-NEXT: stxv v3, 0(r7) +; CHECK-LE-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test4: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: xvi16ger2pp wacc0, v2, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <512 x i1> @llvm.ppc.mma.xvi16ger2pp(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc) @@ -179,6 +285,36 @@ define dso_local void @test5(ptr nocapture readonly %vqp, ptr nocapture readnone ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-LE-WACC-LABEL: test5: +; CHECK-LE-WACC: # %bb.0: # %entry +; CHECK-LE-WACC-NEXT: lxv v5, 0(r3) +; CHECK-LE-WACC-NEXT: lxv v1, 32(r3) +; CHECK-LE-WACC-NEXT: lxv v4, 16(r3) +; CHECK-LE-WACC-NEXT: lxv v0, 48(r3) +; CHECK-LE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-LE-WACC-NEXT: pmxvi8ger4spp wacc0, v2, v2, 0, 0, 0 +; CHECK-LE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-LE-WACC-NEXT: stxv v4, 48(r7) +; CHECK-LE-WACC-NEXT: stxv v5, 32(r7) +; CHECK-LE-WACC-NEXT: stxv v2, 16(r7) +; CHECK-LE-WACC-NEXT: stxv v3, 0(r7) +; CHECK-LE-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test5: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: pmxvi8ger4spp wacc0, v2, v2, 0, 0, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <512 x i1> @llvm.ppc.mma.pmxvi8ger4spp(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc, i32 0, i32 0, i32 0) @@ -220,6 +356,36 @@ define dso_local void @test6(ptr nocapture readonly %vqp, ptr nocapture readnone ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-LE-WACC-LABEL: test6: +; CHECK-LE-WACC: # %bb.0: # %entry +; CHECK-LE-WACC-NEXT: lxv v5, 0(r3) +; CHECK-LE-WACC-NEXT: lxv v1, 32(r3) +; CHECK-LE-WACC-NEXT: lxv v4, 16(r3) +; CHECK-LE-WACC-NEXT: lxv v0, 48(r3) +; CHECK-LE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-LE-WACC-NEXT: pmxvi16ger2pp wacc0, v2, v2, 0, 0, 0 +; CHECK-LE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-LE-WACC-NEXT: stxv v4, 48(r7) +; CHECK-LE-WACC-NEXT: stxv v5, 32(r7) +; CHECK-LE-WACC-NEXT: stxv v2, 16(r7) +; CHECK-LE-WACC-NEXT: stxv v3, 0(r7) +; CHECK-LE-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test6: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 
wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: pmxvi16ger2pp wacc0, v2, v2, 0, 0, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <512 x i1> @llvm.ppc.mma.pmxvi16ger2pp(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc, i32 0, i32 0, i32 0) diff --git a/llvm/test/CodeGen/PowerPC/mma-intrinsics.ll b/llvm/test/CodeGen/PowerPC/mma-intrinsics.ll index 8fbc9d785796d..3505cbb197bf9 100644 --- a/llvm/test/CodeGen/PowerPC/mma-intrinsics.ll +++ b/llvm/test/CodeGen/PowerPC/mma-intrinsics.ll @@ -5,6 +5,12 @@ ; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \ ; RUN: -mcpu=pwr10 -ppc-asm-full-reg-names \ ; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-BE +; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \ +; RUN: -mcpu=future -ppc-asm-full-reg-names \ +; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-WACC +; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \ +; RUN: -mcpu=future -ppc-asm-full-reg-names \ +; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-BE-WACC ; assemble_acc declare <512 x i1> @llvm.ppc.mma.assemble.acc(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) @@ -32,6 +38,28 @@ define void @ass_acc(ptr %ptr, <16 x i8> %vc) { ; CHECK-BE-NEXT: stxv vs3, 48(r3) ; CHECK-BE-NEXT: stxv vs2, 32(r3) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: ass_acc: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: vmr v3, v2 +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp34, vsp34, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r3) +; CHECK-WACC-NEXT: stxv v5, 32(r3) +; CHECK-WACC-NEXT: stxv v2, 16(r3) +; CHECK-WACC-NEXT: stxv v3, 0(r3) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: ass_acc: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: vmr v3, v2 +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp34, vsp34, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r3) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r3) +; CHECK-BE-WACC-NEXT: blr entry: %0 = tail call <512 x i1> @llvm.ppc.mma.assemble.acc(<16 x i8> %vc, <16 x i8> %vc, <16 x i8> %vc, <16 x i8> %vc) store <512 x i1> %0, ptr %ptr, align 64 @@ -66,6 +94,28 @@ define void @int_xxmtacc(ptr %ptr, <16 x i8> %vc) { ; CHECK-BE-NEXT: stxv vs3, 48(r3) ; CHECK-BE-NEXT: stxv vs2, 32(r3) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: int_xxmtacc: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: vmr v3, v2 +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp34, vsp34, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r3) +; CHECK-WACC-NEXT: stxv v5, 32(r3) +; CHECK-WACC-NEXT: stxv v2, 16(r3) +; CHECK-WACC-NEXT: stxv v3, 0(r3) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: int_xxmtacc: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: vmr v3, v2 +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp34, vsp34, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r3) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r3) +; CHECK-BE-WACC-NEXT: blr entry: ; One xxmtacc is generated from the call 
to assemble.acc then one xxmtacc is ; generated from the call to xxmtacc then one xxmfacc is generated for the store @@ -101,6 +151,28 @@ define void @int_xxmfacc(ptr %ptr, <16 x i8> %vc) { ; CHECK-BE-NEXT: stxv vs3, 48(r3) ; CHECK-BE-NEXT: stxv vs2, 32(r3) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: int_xxmfacc: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: vmr v3, v2 +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp34, vsp34, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r3) +; CHECK-WACC-NEXT: stxv v5, 32(r3) +; CHECK-WACC-NEXT: stxv v2, 16(r3) +; CHECK-WACC-NEXT: stxv v3, 0(r3) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: int_xxmfacc: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: vmr v3, v2 +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp34, vsp34, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r3) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r3) +; CHECK-BE-WACC-NEXT: blr entry: ; One xxmtacc is generated from the call to assemble.acc then one xxmfacc is ; generated from the call to xxmfacc then one xxmfacc is generated for the store @@ -132,6 +204,26 @@ define void @int_xxsetaccz(ptr %ptr) { ; CHECK-BE-NEXT: stxv vs3, 48(r3) ; CHECK-BE-NEXT: stxv vs2, 32(r3) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: int_xxsetaccz: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: dmxxsetaccz wacc0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r3) +; CHECK-WACC-NEXT: stxv v5, 32(r3) +; CHECK-WACC-NEXT: stxv v2, 16(r3) +; CHECK-WACC-NEXT: stxv v3, 0(r3) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: int_xxsetaccz: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: dmxxsetaccz wacc0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r3) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r3) +; CHECK-BE-WACC-NEXT: blr entry: %0 = tail call <512 x i1> @llvm.ppc.mma.xxsetaccz() store <512 x i1> %0, ptr %ptr, align 64 @@ -160,6 +252,26 @@ define void @disass_acc(ptr %ptr1, ptr %ptr2, ptr %ptr3, ptr %ptr4) { ; CHECK-BE-NEXT: stxv vs2, 0(r5) ; CHECK-BE-NEXT: stxv vs3, 0(r6) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: disass_acc: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: dmxxsetaccz wacc0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v5, 0(r3) +; CHECK-WACC-NEXT: stxv v4, 0(r4) +; CHECK-WACC-NEXT: stxv v3, 0(r5) +; CHECK-WACC-NEXT: stxv v2, 0(r6) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: disass_acc: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: dmxxsetaccz wacc0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v2, 0(r3) +; CHECK-BE-WACC-NEXT: stxv v3, 0(r4) +; CHECK-BE-WACC-NEXT: stxv v4, 0(r5) +; CHECK-BE-WACC-NEXT: stxv v5, 0(r6) +; CHECK-BE-WACC-NEXT: blr entry: %0 = tail call <512 x i1> @llvm.ppc.mma.xxsetaccz() %1 = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.ppc.mma.disassemble.acc(<512 x i1> %0) @@ -219,6 +331,50 @@ define void @testBranch(ptr %ptr, <16 x i8> %vc, i32 %val) { ; CHECK-BE-NEXT: stxv vs3, 48(r3) ; CHECK-BE-NEXT: stxv vs2, 32(r3) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: testBranch: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: cmplwi r7, 0 +; CHECK-WACC-NEXT: beq cr0, .LBB5_2 +; CHECK-WACC-NEXT: # %bb.1: 
# %if.then +; CHECK-WACC-NEXT: dmxxsetaccz wacc0 +; CHECK-WACC-NEXT: b .LBB5_3 +; CHECK-WACC-NEXT: .LBB5_2: # %if.else +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: xvi4ger8pp wacc0, v2, v2 +; CHECK-WACC-NEXT: .LBB5_3: # %if.end +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r3) +; CHECK-WACC-NEXT: stxv v5, 32(r3) +; CHECK-WACC-NEXT: stxv v2, 16(r3) +; CHECK-WACC-NEXT: stxv v3, 0(r3) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: testBranch: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: cmplwi r7, 0 +; CHECK-BE-WACC-NEXT: beq cr0, .LBB5_2 +; CHECK-BE-WACC-NEXT: # %bb.1: # %if.then +; CHECK-BE-WACC-NEXT: dmxxsetaccz wacc0 +; CHECK-BE-WACC-NEXT: b .LBB5_3 +; CHECK-BE-WACC-NEXT: .LBB5_2: # %if.else +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: xvi4ger8pp wacc0, v2, v2 +; CHECK-BE-WACC-NEXT: .LBB5_3: # %if.end +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r3) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r3) +; CHECK-BE-WACC-NEXT: blr entry: %tobool = icmp eq i32 %val, 0 br i1 %tobool, label %if.else, label %if.then @@ -273,6 +429,36 @@ define void @testcse(ptr %res, <16 x i8> %vc) { ; CHECK-BE-NEXT: stxv vs3, 112(r3) ; CHECK-BE-NEXT: stxv vs2, 96(r3) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: testcse: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: dmxxsetaccz wacc0 +; CHECK-WACC-NEXT: xvf32gerpp wacc0, v2, v2 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r3) +; CHECK-WACC-NEXT: stxv v5, 32(r3) +; CHECK-WACC-NEXT: stxv v2, 16(r3) +; CHECK-WACC-NEXT: stxv v3, 0(r3) +; CHECK-WACC-NEXT: stxv v4, 112(r3) +; CHECK-WACC-NEXT: stxv v5, 96(r3) +; CHECK-WACC-NEXT: stxv v2, 80(r3) +; CHECK-WACC-NEXT: stxv v3, 64(r3) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: testcse: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: dmxxsetaccz wacc0 +; CHECK-BE-WACC-NEXT: xvf32gerpp wacc0, v2, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r3) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r3) +; CHECK-BE-WACC-NEXT: stxv v5, 112(r3) +; CHECK-BE-WACC-NEXT: stxv v4, 96(r3) +; CHECK-BE-WACC-NEXT: stxv v3, 80(r3) +; CHECK-BE-WACC-NEXT: stxv v2, 64(r3) +; CHECK-BE-WACC-NEXT: blr entry: %0 = call <512 x i1> @llvm.ppc.mma.xxsetaccz() %1 = call <512 x i1> @llvm.ppc.mma.xxsetaccz() @@ -320,6 +506,42 @@ define void @testcse2(ptr %res, <16 x i8> %vc) { ; CHECK-BE-NEXT: stxv vs3, 112(r3) ; CHECK-BE-NEXT: stxv vs2, 96(r3) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: testcse2: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: dmxxsetaccz wacc1 +; CHECK-WACC-NEXT: dmxxsetaccz wacc0 +; CHECK-WACC-NEXT: xvf32gerpp wacc1, v2, v2 +; CHECK-WACC-NEXT: xvf32gerpn wacc0, v2, v2 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc1, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r3) +; CHECK-WACC-NEXT: stxv v5, 32(r3) +; CHECK-WACC-NEXT: stxv v2, 16(r3) +; CHECK-WACC-NEXT: stxv v3, 0(r3) +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 
0 +; CHECK-WACC-NEXT: stxv v4, 112(r3) +; CHECK-WACC-NEXT: stxv v5, 96(r3) +; CHECK-WACC-NEXT: stxv v2, 80(r3) +; CHECK-WACC-NEXT: stxv v3, 64(r3) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: testcse2: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: dmxxsetaccz wacc1 +; CHECK-BE-WACC-NEXT: dmxxsetaccz wacc0 +; CHECK-BE-WACC-NEXT: xvf32gerpp wacc1, v2, v2 +; CHECK-BE-WACC-NEXT: xvf32gerpn wacc0, v2, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc1, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r3) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 112(r3) +; CHECK-BE-WACC-NEXT: stxv v4, 96(r3) +; CHECK-BE-WACC-NEXT: stxv v3, 80(r3) +; CHECK-BE-WACC-NEXT: stxv v2, 64(r3) +; CHECK-BE-WACC-NEXT: blr entry: %0 = call <512 x i1> @llvm.ppc.mma.xxsetaccz() %1 = call <512 x i1> @llvm.ppc.mma.xxsetaccz() @@ -367,6 +589,42 @@ define void @testcse3(ptr %res, <16 x i8> %vc) { ; CHECK-BE-NEXT: stxv vs3, 112(r3) ; CHECK-BE-NEXT: stxv vs2, 96(r3) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: testcse3: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: dmxxsetaccz wacc1 +; CHECK-WACC-NEXT: dmxxsetaccz wacc0 +; CHECK-WACC-NEXT: xvf32gerpp wacc1, v2, v2 +; CHECK-WACC-NEXT: xvf32gerpn wacc0, v2, v2 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc1, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r3) +; CHECK-WACC-NEXT: stxv v5, 32(r3) +; CHECK-WACC-NEXT: stxv v2, 16(r3) +; CHECK-WACC-NEXT: stxv v3, 0(r3) +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 112(r3) +; CHECK-WACC-NEXT: stxv v5, 96(r3) +; CHECK-WACC-NEXT: stxv v2, 80(r3) +; CHECK-WACC-NEXT: stxv v3, 64(r3) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: testcse3: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: dmxxsetaccz wacc1 +; CHECK-BE-WACC-NEXT: dmxxsetaccz wacc0 +; CHECK-BE-WACC-NEXT: xvf32gerpp wacc1, v2, v2 +; CHECK-BE-WACC-NEXT: xvf32gerpn wacc0, v2, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc1, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r3) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 112(r3) +; CHECK-BE-WACC-NEXT: stxv v4, 96(r3) +; CHECK-BE-WACC-NEXT: stxv v3, 80(r3) +; CHECK-BE-WACC-NEXT: stxv v2, 64(r3) +; CHECK-BE-WACC-NEXT: blr entry: %0 = call <512 x i1> @llvm.ppc.mma.xxsetaccz() %1 = call <512 x i1> @llvm.ppc.mma.xvf32gerpp(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc) @@ -475,6 +733,104 @@ define void @testcse4(ptr %res, i32 %lim, ptr %vc) { ; CHECK-BE-NEXT: bdnz .LBB9_2 ; CHECK-BE-NEXT: # %bb.3: # %for.cond.cleanup ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: testcse4: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: cmpwi r4, 1 +; CHECK-WACC-NEXT: bltlr cr0 +; CHECK-WACC-NEXT: # %bb.1: # %for.body.preheader +; CHECK-WACC-NEXT: clrldi r4, r4, 32 +; CHECK-WACC-NEXT: mtctr r4 +; CHECK-WACC-NEXT: li r4, 0 +; CHECK-WACC-NEXT: li r6, 0 +; CHECK-WACC-NEXT: .p2align 4 +; CHECK-WACC-NEXT: .LBB9_2: # %for.body +; CHECK-WACC-NEXT: # +; CHECK-WACC-NEXT: rldic r7, r6, 4, 28 +; CHECK-WACC-NEXT: add r8, r5, r7 +; CHECK-WACC-NEXT: lxvx vs0, r5, r7 +; CHECK-WACC-NEXT: lxv vs1, 16(r8) +; CHECK-WACC-NEXT: dmxxsetaccz wacc2 +; CHECK-WACC-NEXT: dmxxsetaccz wacc1 +; CHECK-WACC-NEXT: dmxxsetaccz wacc0 +; CHECK-WACC-NEXT: xvf32gerpp wacc2, vs0, vs1 +; 
CHECK-WACC-NEXT: lxv vs0, 32(r8) +; CHECK-WACC-NEXT: lxv vs1, 48(r8) +; CHECK-WACC-NEXT: rldic r7, r4, 6, 26 +; CHECK-WACC-NEXT: addi r4, r4, 3 +; CHECK-WACC-NEXT: addi r6, r6, 6 +; CHECK-WACC-NEXT: xvf32gerpn wacc1, vs0, vs1 +; CHECK-WACC-NEXT: lxv vs0, 64(r8) +; CHECK-WACC-NEXT: lxv vs1, 80(r8) +; CHECK-WACC-NEXT: add r8, r3, r7 +; CHECK-WACC-NEXT: xvf32gernp wacc0, vs0, vs1 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc2, 0 +; CHECK-WACC-NEXT: stxvx v3, r3, r7 +; CHECK-WACC-NEXT: stxv v4, 48(r8) +; CHECK-WACC-NEXT: stxv v5, 32(r8) +; CHECK-WACC-NEXT: stxv v2, 16(r8) +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc1, 0 +; CHECK-WACC-NEXT: stxv v4, 112(r8) +; CHECK-WACC-NEXT: stxv v5, 96(r8) +; CHECK-WACC-NEXT: stxv v2, 80(r8) +; CHECK-WACC-NEXT: stxv v3, 64(r8) +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 176(r8) +; CHECK-WACC-NEXT: stxv v5, 160(r8) +; CHECK-WACC-NEXT: stxv v2, 144(r8) +; CHECK-WACC-NEXT: stxv v3, 128(r8) +; CHECK-WACC-NEXT: bdnz .LBB9_2 +; CHECK-WACC-NEXT: # %bb.3: # %for.cond.cleanup +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: testcse4: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: cmpwi r4, 1 +; CHECK-BE-WACC-NEXT: bltlr cr0 +; CHECK-BE-WACC-NEXT: # %bb.1: # %for.body.preheader +; CHECK-BE-WACC-NEXT: clrldi r4, r4, 32 +; CHECK-BE-WACC-NEXT: mtctr r4 +; CHECK-BE-WACC-NEXT: li r4, 0 +; CHECK-BE-WACC-NEXT: li r6, 0 +; CHECK-BE-WACC-NEXT: .p2align 4 +; CHECK-BE-WACC-NEXT: .LBB9_2: # %for.body +; CHECK-BE-WACC-NEXT: # +; CHECK-BE-WACC-NEXT: rldic r7, r6, 4, 28 +; CHECK-BE-WACC-NEXT: add r8, r5, r7 +; CHECK-BE-WACC-NEXT: lxvx vs0, r5, r7 +; CHECK-BE-WACC-NEXT: lxv vs1, 16(r8) +; CHECK-BE-WACC-NEXT: dmxxsetaccz wacc2 +; CHECK-BE-WACC-NEXT: dmxxsetaccz wacc1 +; CHECK-BE-WACC-NEXT: dmxxsetaccz wacc0 +; CHECK-BE-WACC-NEXT: xvf32gerpp wacc2, vs0, vs1 +; CHECK-BE-WACC-NEXT: lxv vs0, 32(r8) +; CHECK-BE-WACC-NEXT: lxv vs1, 48(r8) +; CHECK-BE-WACC-NEXT: rldic r7, r4, 6, 26 +; CHECK-BE-WACC-NEXT: addi r4, r4, 3 +; CHECK-BE-WACC-NEXT: addi r6, r6, 6 +; CHECK-BE-WACC-NEXT: xvf32gerpn wacc1, vs0, vs1 +; CHECK-BE-WACC-NEXT: lxv vs0, 64(r8) +; CHECK-BE-WACC-NEXT: lxv vs1, 80(r8) +; CHECK-BE-WACC-NEXT: add r8, r3, r7 +; CHECK-BE-WACC-NEXT: xvf32gernp wacc0, vs0, vs1 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc2, 0 +; CHECK-BE-WACC-NEXT: stxvx v2, r3, r7 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r8) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r8) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r8) +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc1, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 112(r8) +; CHECK-BE-WACC-NEXT: stxv v4, 96(r8) +; CHECK-BE-WACC-NEXT: stxv v3, 80(r8) +; CHECK-BE-WACC-NEXT: stxv v2, 64(r8) +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 176(r8) +; CHECK-BE-WACC-NEXT: stxv v4, 160(r8) +; CHECK-BE-WACC-NEXT: stxv v3, 144(r8) +; CHECK-BE-WACC-NEXT: stxv v2, 128(r8) +; CHECK-BE-WACC-NEXT: bdnz .LBB9_2 +; CHECK-BE-WACC-NEXT: # %bb.3: # %for.cond.cleanup +; CHECK-BE-WACC-NEXT: blr entry: %cmp55 = icmp sgt i32 %lim, 0 br i1 %cmp55, label %for.body.preheader, label %for.cond.cleanup @@ -600,6 +956,71 @@ define void @testRedundantPrimeUnprime(ptr %dst, <16 x i8> %vc) nounwind { ; CHECK-BE-NEXT: ld r0, 16(r1) ; CHECK-BE-NEXT: mtlr r0 ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: testRedundantPrimeUnprime: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: mflr r0 +; CHECK-WACC-NEXT: std r30, -16(r1) # 8-byte Folded Spill +; CHECK-WACC-NEXT: std r0, 16(r1) +; CHECK-WACC-NEXT: 
stdu r1, -112(r1) +; CHECK-WACC-NEXT: dmxxsetaccz wacc0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp36, vsp32, wacc0, 0 +; CHECK-WACC-NEXT: stxv v0, 48(r3) +; CHECK-WACC-NEXT: stxv v1, 32(r3) +; CHECK-WACC-NEXT: stxv v4, 16(r3) +; CHECK-WACC-NEXT: stxv v5, 0(r3) +; CHECK-WACC-NEXT: xvf32gerpp wacc0, v2, v2 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp36, vsp34, wacc0, 0 +; CHECK-WACC-NEXT: mr r30, r3 +; CHECK-WACC-NEXT: stxvp vsp36, 64(r1) +; CHECK-WACC-NEXT: stxvp vsp34, 32(r1) +; CHECK-WACC-NEXT: bl testRedundantPrimeUnprimeF@notoc +; CHECK-WACC-NEXT: lxvp vsp34, 64(r1) +; CHECK-WACC-NEXT: lxvp vsp36, 32(r1) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp34, vsp36, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 112(r30) +; CHECK-WACC-NEXT: stxv v5, 96(r30) +; CHECK-WACC-NEXT: stxv v2, 80(r30) +; CHECK-WACC-NEXT: stxv v3, 64(r30) +; CHECK-WACC-NEXT: addi r1, r1, 112 +; CHECK-WACC-NEXT: ld r0, 16(r1) +; CHECK-WACC-NEXT: ld r30, -16(r1) # 8-byte Folded Reload +; CHECK-WACC-NEXT: mtlr r0 +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: testRedundantPrimeUnprime: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: mflr r0 +; CHECK-BE-WACC-NEXT: std r0, 16(r1) +; CHECK-BE-WACC-NEXT: stdu r1, -192(r1) +; CHECK-BE-WACC-NEXT: dmxxsetaccz wacc0 +; CHECK-BE-WACC-NEXT: std r30, 176(r1) # 8-byte Folded Spill +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp36, vsp32, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v1, 48(r3) +; CHECK-BE-WACC-NEXT: stxv v0, 32(r3) +; CHECK-BE-WACC-NEXT: stxv v5, 16(r3) +; CHECK-BE-WACC-NEXT: stxv v4, 0(r3) +; CHECK-BE-WACC-NEXT: xvf32gerpp wacc0, v2, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp36, vsp34, wacc0, 0 +; CHECK-BE-WACC-NEXT: mr r30, r3 +; CHECK-BE-WACC-NEXT: stxvp vsp36, 112(r1) +; CHECK-BE-WACC-NEXT: stxvp vsp34, 144(r1) +; CHECK-BE-WACC-NEXT: bl testRedundantPrimeUnprimeF +; CHECK-BE-WACC-NEXT: nop +; CHECK-BE-WACC-NEXT: lxvp vsp34, 112(r1) +; CHECK-BE-WACC-NEXT: lxvp vsp36, 144(r1) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp34, vsp36, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 112(r30) +; CHECK-BE-WACC-NEXT: stxv v4, 96(r30) +; CHECK-BE-WACC-NEXT: stxv v3, 80(r30) +; CHECK-BE-WACC-NEXT: stxv v2, 64(r30) +; CHECK-BE-WACC-NEXT: ld r30, 176(r1) # 8-byte Folded Reload +; CHECK-BE-WACC-NEXT: addi r1, r1, 192 +; CHECK-BE-WACC-NEXT: ld r0, 16(r1) +; CHECK-BE-WACC-NEXT: mtlr r0 +; CHECK-BE-WACC-NEXT: blr entry: %0 = tail call <512 x i1> @llvm.ppc.mma.xxsetaccz() store <512 x i1> %0, ptr %dst, align 64 @@ -646,6 +1067,38 @@ define void @test_ldst_1(ptr nocapture readonly %vqp, ptr %vpp, <16 x i8> %vc, p ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test_ldst_1: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: plxvp vsp36, 8(r4), 0 +; CHECK-WACC-NEXT: pmxvf64gernn wacc0, vsp36, v2, 0, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test_ldst_1: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 
0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: plxvp vsp36, 8(r4), 0 +; CHECK-BE-WACC-NEXT: pmxvf64gernn wacc0, vsp36, v2, 0, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = getelementptr i8, ptr %vpp, i64 8 @@ -688,6 +1141,38 @@ define void @test_ldst_2(ptr nocapture readonly %vqp, ptr %vpp, <16 x i8> %vc, p ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test_ldst_2: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: lxvp vsp36, 0(r4) +; CHECK-WACC-NEXT: xvf64gernp wacc0, vsp36, v2 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test_ldst_2: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: lxvp vsp36, 0(r4) +; CHECK-BE-WACC-NEXT: xvf64gernp wacc0, vsp36, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <256 x i1> @llvm.ppc.vsx.lxvp(ptr %vpp) @@ -729,6 +1214,38 @@ define void @test_ldst_3(ptr nocapture readonly %vqp, i64 %offs, ptr %vpp, <16 x ; CHECK-BE-NEXT: stxv vs3, 48(r9) ; CHECK-BE-NEXT: stxv vs2, 32(r9) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test_ldst_3: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: lxvp vsp36, 0(r5) +; CHECK-WACC-NEXT: xvf64gernp wacc0, vsp36, v2 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r9) +; CHECK-WACC-NEXT: stxv v5, 32(r9) +; CHECK-WACC-NEXT: stxv v2, 16(r9) +; CHECK-WACC-NEXT: stxv v3, 0(r9) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test_ldst_3: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: lxvp vsp36, 0(r5) +; CHECK-BE-WACC-NEXT: xvf64gernp wacc0, vsp36, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r9) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r9) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r9) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r9) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <256 x i1> @llvm.ppc.vsx.lxvp(ptr %vpp) diff --git a/llvm/test/CodeGen/PowerPC/mma-outer-product.ll 
b/llvm/test/CodeGen/PowerPC/mma-outer-product.ll index ac6ad41633492..ff860b8d6ff22 100644 --- a/llvm/test/CodeGen/PowerPC/mma-outer-product.ll +++ b/llvm/test/CodeGen/PowerPC/mma-outer-product.ll @@ -5,6 +5,12 @@ ; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \ ; RUN: -mcpu=pwr10 -enable-subreg-liveness -ppc-asm-full-reg-names \ ; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-BE +; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \ +; RUN: -mcpu=future -enable-subreg-liveness -ppc-asm-full-reg-names \ +; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-WACC +; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \ +; RUN: -mcpu=future -enable-subreg-liveness -ppc-asm-full-reg-names \ +; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-BE-WACC declare <512 x i1> @llvm.ppc.mma.assemble.acc(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) declare <256 x i1> @llvm.ppc.vsx.assemble.pair(<16 x i8>, <16 x i8>) @@ -56,6 +62,46 @@ define void @intrinsics1(<16 x i8> %vc1, <16 x i8> %vc2, <16 x i8> %vc3, <16 x i ; CHECK-BE-NEXT: stxv vs3, 48(r3) ; CHECK-BE-NEXT: stxv vs2, 32(r3) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: intrinsics1: +; CHECK-WACC: # %bb.0: +; CHECK-WACC-NEXT: vmr v1, v4 +; CHECK-WACC-NEXT: vmr v4, v3 +; CHECK-WACC-NEXT: vmr v0, v2 +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: xvi4ger8pp wacc0, v2, v4 +; CHECK-WACC-NEXT: ld r3, 96(r1) +; CHECK-WACC-NEXT: xvf16ger2pp wacc0, v0, v1 +; CHECK-WACC-NEXT: vmr v3, v2 +; CHECK-WACC-NEXT: vmr v2, v5 +; CHECK-WACC-NEXT: pmxvf32gerpn wacc0, v4, v5, 0, 0 +; CHECK-WACC-NEXT: pmxvf64gernp wacc0, vsp34, v0, 0, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r3) +; CHECK-WACC-NEXT: stxv v5, 32(r3) +; CHECK-WACC-NEXT: stxv v2, 16(r3) +; CHECK-WACC-NEXT: stxv v3, 0(r3) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: intrinsics1: +; CHECK-BE-WACC: # %bb.0: +; CHECK-BE-WACC-NEXT: vmr v1, v4 +; CHECK-BE-WACC-NEXT: vmr v4, v3 +; CHECK-BE-WACC-NEXT: vmr v0, v2 +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: xvi4ger8pp wacc0, v2, v4 +; CHECK-BE-WACC-NEXT: ld r3, 112(r1) +; CHECK-BE-WACC-NEXT: xvf16ger2pp wacc0, v0, v1 +; CHECK-BE-WACC-NEXT: vmr v3, v2 +; CHECK-BE-WACC-NEXT: vmr v2, v5 +; CHECK-BE-WACC-NEXT: pmxvf32gerpn wacc0, v4, v5, 0, 0 +; CHECK-BE-WACC-NEXT: pmxvf64gernp wacc0, vsp34, v0, 0, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r3) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r3) +; CHECK-BE-WACC-NEXT: blr %1 = tail call <512 x i1> @llvm.ppc.mma.assemble.acc(<16 x i8> %vc1, <16 x i8> %vc3, <16 x i8> %vc2, <16 x i8> %vc4) %2 = tail call <512 x i1> @llvm.ppc.mma.xvi4ger8pp(<512 x i1> %1, <16 x i8> %vc1, <16 x i8> %vc2) %3 = tail call <512 x i1> @llvm.ppc.mma.xvf16ger2pp(<512 x i1> %2, <16 x i8> %vc1, <16 x i8> %vc3) @@ -115,6 +161,46 @@ define void @intrinsics2(ptr %ptr1, ptr %ptr2, ptr %ptr3, ptr %ptr4, ptr %ptr) { ; CHECK-BE-NEXT: stxv vs2, 0(r5) ; CHECK-BE-NEXT: stxv vs3, 0(r6) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: intrinsics2: +; CHECK-WACC: # %bb.0: +; CHECK-WACC-NEXT: lxv v2, 0(r3) +; CHECK-WACC-NEXT: lxv v4, 0(r5) +; CHECK-WACC-NEXT: lxv v3, 0(r4) +; CHECK-WACC-NEXT: lxv v5, 0(r6) +; CHECK-WACC-NEXT: vmr v1, v2 +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp34, vsp36, 0 +; CHECK-WACC-NEXT: 
xvi8ger4pp wacc0, v2, v3 +; CHECK-WACC-NEXT: xvf16ger2pn wacc0, v2, v4 +; CHECK-WACC-NEXT: vmr v0, v5 +; CHECK-WACC-NEXT: pmxvf32gernn wacc0, v3, v5, 0, 0 +; CHECK-WACC-NEXT: pmxvf64gernn wacc0, vsp32, v2, 0, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v5, 0(r3) +; CHECK-WACC-NEXT: stxv v4, 0(r4) +; CHECK-WACC-NEXT: stxv v3, 0(r5) +; CHECK-WACC-NEXT: stxv v2, 0(r6) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: intrinsics2: +; CHECK-BE-WACC: # %bb.0: +; CHECK-BE-WACC-NEXT: lxv v2, 0(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 0(r5) +; CHECK-BE-WACC-NEXT: lxv v3, 0(r4) +; CHECK-BE-WACC-NEXT: lxv v5, 0(r6) +; CHECK-BE-WACC-NEXT: vmr v1, v2 +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp34, vsp36, 0 +; CHECK-BE-WACC-NEXT: xvi8ger4pp wacc0, v2, v3 +; CHECK-BE-WACC-NEXT: xvf16ger2pn wacc0, v2, v4 +; CHECK-BE-WACC-NEXT: vmr v0, v5 +; CHECK-BE-WACC-NEXT: pmxvf32gernn wacc0, v3, v5, 0, 0 +; CHECK-BE-WACC-NEXT: pmxvf64gernn wacc0, vsp32, v2, 0, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v2, 0(r3) +; CHECK-BE-WACC-NEXT: stxv v3, 0(r4) +; CHECK-BE-WACC-NEXT: stxv v4, 0(r5) +; CHECK-BE-WACC-NEXT: stxv v5, 0(r6) +; CHECK-BE-WACC-NEXT: blr %vc1 = load <16 x i8>, ptr %ptr1, align 16 %vc2 = load <16 x i8>, ptr %ptr2, align 16 %vc3 = load <16 x i8>, ptr %ptr3, align 16 @@ -157,6 +243,26 @@ define void @test1(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test1: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: xvi4ger8 wacc0, v2, v2 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test1: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: xvi4ger8 wacc0, v2, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = tail call <512 x i1> @llvm.ppc.mma.xvi4ger8(<16 x i8> %vc, <16 x i8> %vc) store <512 x i1> %0, ptr %resp, align 64 @@ -196,6 +302,36 @@ define void @test2(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test2: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: xvi4ger8pp wacc0, v2, v2 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test2: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: xvi4ger8pp wacc0, v2, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: 
stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <512 x i1> @llvm.ppc.mma.xvi4ger8pp(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc) @@ -226,6 +362,26 @@ define void @test3(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test3: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: pmxvi4ger8 wacc0, v2, v2, 0, 0, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test3: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: pmxvi4ger8 wacc0, v2, v2, 0, 0, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = tail call <512 x i1> @llvm.ppc.mma.pmxvi4ger8(<16 x i8> %vc, <16 x i8> %vc, i32 0, i32 0, i32 0) store <512 x i1> %0, ptr %resp, align 64 @@ -265,6 +421,36 @@ define void @test4(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test4: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: pmxvi4ger8pp wacc0, v2, v2, 0, 0, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test4: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: pmxvi4ger8pp wacc0, v2, v2, 0, 0, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <512 x i1> @llvm.ppc.mma.pmxvi4ger8pp(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc, i32 0, i32 0, i32 0) @@ -295,6 +481,26 @@ define void @test5(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test5: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: xvi8ger4 wacc0, v2, v2 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test5: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: xvi8ger4 wacc0, v2, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; 
CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = tail call <512 x i1> @llvm.ppc.mma.xvi8ger4(<16 x i8> %vc, <16 x i8> %vc) store <512 x i1> %0, ptr %resp, align 64 @@ -334,6 +540,36 @@ define void @test6(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test6: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: xvi8ger4pp wacc0, v2, v2 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test6: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: xvi8ger4pp wacc0, v2, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <512 x i1> @llvm.ppc.mma.xvi8ger4pp(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc) @@ -364,6 +600,26 @@ define void @test7(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test7: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: pmxvi8ger4 wacc0, v2, v2, 0, 0, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test7: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: pmxvi8ger4 wacc0, v2, v2, 0, 0, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = tail call <512 x i1> @llvm.ppc.mma.pmxvi8ger4(<16 x i8> %vc, <16 x i8> %vc, i32 0, i32 0, i32 0) store <512 x i1> %0, ptr %resp, align 64 @@ -403,6 +659,36 @@ define void @test8(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test8: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: pmxvi8ger4pp wacc0, v2, v2, 0, 0, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test8: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 
32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: pmxvi8ger4pp wacc0, v2, v2, 0, 0, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <512 x i1> @llvm.ppc.mma.pmxvi8ger4pp(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc, i32 0, i32 0, i32 0) @@ -433,6 +719,26 @@ define void @test9(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test9: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: xvi16ger2s wacc0, v2, v2 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test9: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: xvi16ger2s wacc0, v2, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = tail call <512 x i1> @llvm.ppc.mma.xvi16ger2s(<16 x i8> %vc, <16 x i8> %vc) store <512 x i1> %0, ptr %resp, align 64 @@ -472,6 +778,36 @@ define void @test10(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test10: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: xvi16ger2spp wacc0, v2, v2 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test10: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: xvi16ger2spp wacc0, v2, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <512 x i1> @llvm.ppc.mma.xvi16ger2spp(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc) @@ -502,6 +838,26 @@ define void @test11(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test11: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: pmxvi16ger2s wacc0, v2, v2, 0, 0, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; 
CHECK-BE-WACC-LABEL: test11: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: pmxvi16ger2s wacc0, v2, v2, 0, 0, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = tail call <512 x i1> @llvm.ppc.mma.pmxvi16ger2s(<16 x i8> %vc, <16 x i8> %vc, i32 0, i32 0, i32 0) store <512 x i1> %0, ptr %resp, align 64 @@ -541,6 +897,36 @@ define void @test12(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test12: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: pmxvi16ger2spp wacc0, v2, v2, 0, 0, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test12: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: pmxvi16ger2spp wacc0, v2, v2, 0, 0, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <512 x i1> @llvm.ppc.mma.pmxvi16ger2spp(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc, i32 0, i32 0, i32 0) @@ -571,6 +957,26 @@ define void @test13(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test13: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: xvf16ger2 wacc0, v2, v2 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test13: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: xvf16ger2 wacc0, v2, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = tail call <512 x i1> @llvm.ppc.mma.xvf16ger2(<16 x i8> %vc, <16 x i8> %vc) store <512 x i1> %0, ptr %resp, align 64 @@ -610,6 +1016,36 @@ define void @test14(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test14: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: xvf16ger2pp wacc0, v2, v2 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; 
CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test14: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: xvf16ger2pp wacc0, v2, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <512 x i1> @llvm.ppc.mma.xvf16ger2pp(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc) @@ -650,6 +1086,36 @@ define void @test15(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test15: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: xvf16ger2pn wacc0, v2, v2 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test15: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: xvf16ger2pn wacc0, v2, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <512 x i1> @llvm.ppc.mma.xvf16ger2pn(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc) @@ -690,6 +1156,36 @@ define void @test16(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test16: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: xvf16ger2np wacc0, v2, v2 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test16: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: xvf16ger2np wacc0, v2, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; 
CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <512 x i1> @llvm.ppc.mma.xvf16ger2np(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc) @@ -730,6 +1226,36 @@ define void @test17(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test17: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: xvf16ger2nn wacc0, v2, v2 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test17: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: xvf16ger2nn wacc0, v2, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <512 x i1> @llvm.ppc.mma.xvf16ger2nn(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc) @@ -760,6 +1286,26 @@ define void @test18(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test18: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: pmxvf16ger2 wacc0, v2, v2, 0, 0, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test18: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: pmxvf16ger2 wacc0, v2, v2, 0, 0, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = tail call <512 x i1> @llvm.ppc.mma.pmxvf16ger2(<16 x i8> %vc, <16 x i8> %vc, i32 0, i32 0, i32 0) store <512 x i1> %0, ptr %resp, align 64 @@ -799,6 +1345,36 @@ define void @test19(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test19: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: pmxvf16ger2pp wacc0, v2, v2, 0, 0, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test19: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 
16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: pmxvf16ger2pp wacc0, v2, v2, 0, 0, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <512 x i1> @llvm.ppc.mma.pmxvf16ger2pp(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc, i32 0, i32 0, i32 0) @@ -839,6 +1415,36 @@ define void @test20(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test20: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: pmxvf16ger2pn wacc0, v2, v2, 0, 0, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test20: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: pmxvf16ger2pn wacc0, v2, v2, 0, 0, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <512 x i1> @llvm.ppc.mma.pmxvf16ger2pn(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc, i32 0, i32 0, i32 0) @@ -879,6 +1485,36 @@ define void @test21(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test21: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: pmxvf16ger2np wacc0, v2, v2, 0, 0, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test21: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: pmxvf16ger2np wacc0, v2, v2, 0, 0, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <512 x i1> @llvm.ppc.mma.pmxvf16ger2np(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc, i32 0, 
i32 0, i32 0) @@ -919,6 +1555,36 @@ define void @test22(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test22: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: pmxvf16ger2nn wacc0, v2, v2, 0, 0, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test22: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: pmxvf16ger2nn wacc0, v2, v2, 0, 0, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <512 x i1> @llvm.ppc.mma.pmxvf16ger2nn(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc, i32 0, i32 0, i32 0) @@ -949,6 +1615,26 @@ define void @test23(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test23: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: xvf32ger wacc0, v2, v2 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test23: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: xvf32ger wacc0, v2, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = tail call <512 x i1> @llvm.ppc.mma.xvf32ger(<16 x i8> %vc, <16 x i8> %vc) store <512 x i1> %0, ptr %resp, align 64 @@ -988,6 +1674,36 @@ define void @test24(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test24: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: xvf32gerpp wacc0, v2, v2 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test24: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: xvf32gerpp wacc0, v2, v2 +; CHECK-BE-WACC-NEXT: 
dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <512 x i1> @llvm.ppc.mma.xvf32gerpp(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc) @@ -1028,6 +1744,36 @@ define void @test25(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test25: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: xvf32gerpn wacc0, v2, v2 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test25: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: xvf32gerpn wacc0, v2, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <512 x i1> @llvm.ppc.mma.xvf32gerpn(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc) @@ -1068,6 +1814,36 @@ define void @test26(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test26: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: xvf32gernp wacc0, v2, v2 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test26: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: xvf32gernp wacc0, v2, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <512 x i1> @llvm.ppc.mma.xvf32gernp(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc) @@ -1108,6 +1884,36 @@ define void @test27(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test27: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; 
CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: xvf32gernn wacc0, v2, v2 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test27: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: xvf32gernn wacc0, v2, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <512 x i1> @llvm.ppc.mma.xvf32gernn(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc) @@ -1138,6 +1944,26 @@ define void @test28(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test28: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: pmxvf32ger wacc0, v2, v2, 0, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test28: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: pmxvf32ger wacc0, v2, v2, 0, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = tail call <512 x i1> @llvm.ppc.mma.pmxvf32ger(<16 x i8> %vc, <16 x i8> %vc, i32 0, i32 0) store <512 x i1> %0, ptr %resp, align 64 @@ -1177,6 +2003,36 @@ define void @test29(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test29: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: pmxvf32gerpp wacc0, v2, v2, 0, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test29: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: pmxvf32gerpp wacc0, v2, v2, 0, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <512 x i1> @llvm.ppc.mma.pmxvf32gerpp(<512 x 
i1> %0, <16 x i8> %vc, <16 x i8> %vc, i32 0, i32 0) @@ -1217,6 +2073,36 @@ define void @test30(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test30: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: pmxvf32gerpn wacc0, v2, v2, 0, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test30: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: pmxvf32gerpn wacc0, v2, v2, 0, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <512 x i1> @llvm.ppc.mma.pmxvf32gerpn(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc, i32 0, i32 0) @@ -1257,6 +2143,36 @@ define void @test31(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test31: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: pmxvf32gernp wacc0, v2, v2, 0, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test31: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: pmxvf32gernp wacc0, v2, v2, 0, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <512 x i1> @llvm.ppc.mma.pmxvf32gernp(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc, i32 0, i32 0) @@ -1297,6 +2213,36 @@ define void @test32(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test32: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: pmxvf32gernn wacc0, v2, v2, 0, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv 
v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test32: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: pmxvf32gernn wacc0, v2, v2, 0, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <512 x i1> @llvm.ppc.mma.pmxvf32gernn(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc, i32 0, i32 0) @@ -1331,6 +2277,30 @@ define void @test33(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test33: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v4, 16(r4) +; CHECK-WACC-NEXT: lxv v5, 0(r4) +; CHECK-WACC-NEXT: xvf64ger wacc0, vsp36, v2 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test33: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v4, 0(r4) +; CHECK-BE-WACC-NEXT: lxv v5, 16(r4) +; CHECK-BE-WACC-NEXT: xvf64ger wacc0, vsp36, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <256 x i1>, ptr %vpp, align 32 %1 = tail call <512 x i1> @llvm.ppc.mma.xvf64ger(<256 x i1> %0, <16 x i8> %vc) @@ -1375,6 +2345,40 @@ define void @test34(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test34: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: lxv v4, 16(r4) +; CHECK-WACC-NEXT: lxv v5, 0(r4) +; CHECK-WACC-NEXT: xvf64gerpp wacc0, vsp36, v2 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test34: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: lxv v4, 0(r4) +; CHECK-BE-WACC-NEXT: lxv v5, 16(r4) +; CHECK-BE-WACC-NEXT: xvf64gerpp wacc0, vsp36, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = load <256 x i1>, ptr %vpp, 
align 32 @@ -1420,6 +2424,40 @@ define void @test35(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test35: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: lxv v4, 16(r4) +; CHECK-WACC-NEXT: lxv v5, 0(r4) +; CHECK-WACC-NEXT: xvf64gerpn wacc0, vsp36, v2 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test35: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: lxv v4, 0(r4) +; CHECK-BE-WACC-NEXT: lxv v5, 16(r4) +; CHECK-BE-WACC-NEXT: xvf64gerpn wacc0, vsp36, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = load <256 x i1>, ptr %vpp, align 32 @@ -1465,6 +2503,40 @@ define void @test36(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test36: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: lxv v4, 16(r4) +; CHECK-WACC-NEXT: lxv v5, 0(r4) +; CHECK-WACC-NEXT: xvf64gernp wacc0, vsp36, v2 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test36: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: lxv v4, 0(r4) +; CHECK-BE-WACC-NEXT: lxv v5, 16(r4) +; CHECK-BE-WACC-NEXT: xvf64gernp wacc0, vsp36, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = load <256 x i1>, ptr %vpp, align 32 @@ -1510,6 +2582,40 @@ define void @test37(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test37: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: lxv v4, 16(r4) +; 
CHECK-WACC-NEXT: lxv v5, 0(r4) +; CHECK-WACC-NEXT: xvf64gernn wacc0, vsp36, v2 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test37: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: lxv v4, 0(r4) +; CHECK-BE-WACC-NEXT: lxv v5, 16(r4) +; CHECK-BE-WACC-NEXT: xvf64gernn wacc0, vsp36, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = load <256 x i1>, ptr %vpp, align 32 @@ -1545,6 +2651,30 @@ define void @test38(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test38: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v4, 16(r4) +; CHECK-WACC-NEXT: lxv v5, 0(r4) +; CHECK-WACC-NEXT: pmxvf64ger wacc0, vsp36, v2, 0, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test38: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v4, 0(r4) +; CHECK-BE-WACC-NEXT: lxv v5, 16(r4) +; CHECK-BE-WACC-NEXT: pmxvf64ger wacc0, vsp36, v2, 0, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <256 x i1>, ptr %vpp, align 32 %1 = tail call <512 x i1> @llvm.ppc.mma.pmxvf64ger(<256 x i1> %0, <16 x i8> %vc, i32 0, i32 0) @@ -1589,6 +2719,40 @@ define void @test39(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test39: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: lxv v4, 16(r4) +; CHECK-WACC-NEXT: lxv v5, 0(r4) +; CHECK-WACC-NEXT: pmxvf64gerpp wacc0, vsp36, v2, 0, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test39: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: lxv v4, 0(r4) +; CHECK-BE-WACC-NEXT: lxv v5, 16(r4) +; CHECK-BE-WACC-NEXT: pmxvf64gerpp wacc0, vsp36, v2, 0, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; 
CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = load <256 x i1>, ptr %vpp, align 32 @@ -1634,6 +2798,40 @@ define void @test40(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test40: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: lxv v4, 16(r4) +; CHECK-WACC-NEXT: lxv v5, 0(r4) +; CHECK-WACC-NEXT: pmxvf64gerpn wacc0, vsp36, v2, 0, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test40: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: lxv v4, 0(r4) +; CHECK-BE-WACC-NEXT: lxv v5, 16(r4) +; CHECK-BE-WACC-NEXT: pmxvf64gerpn wacc0, vsp36, v2, 0, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = load <256 x i1>, ptr %vpp, align 32 @@ -1679,6 +2877,40 @@ define void @test41(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test41: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: lxv v4, 16(r4) +; CHECK-WACC-NEXT: lxv v5, 0(r4) +; CHECK-WACC-NEXT: pmxvf64gernp wacc0, vsp36, v2, 0, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test41: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: lxv v4, 0(r4) +; CHECK-BE-WACC-NEXT: lxv v5, 16(r4) +; CHECK-BE-WACC-NEXT: pmxvf64gernp wacc0, vsp36, v2, 0, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = load <256 x i1>, ptr %vpp, align 32 @@ -1724,6 +2956,40 @@ define void @test42(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test42: +; CHECK-WACC: # %bb.0: # 
%entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: lxv v4, 16(r4) +; CHECK-WACC-NEXT: lxv v5, 0(r4) +; CHECK-WACC-NEXT: pmxvf64gernn wacc0, vsp36, v2, 0, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test42: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: lxv v4, 0(r4) +; CHECK-BE-WACC-NEXT: lxv v5, 16(r4) +; CHECK-BE-WACC-NEXT: pmxvf64gernn wacc0, vsp36, v2, 0, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = load <256 x i1>, ptr %vpp, align 32 diff --git a/llvm/test/CodeGen/PowerPC/mma-phi-accs.ll b/llvm/test/CodeGen/PowerPC/mma-phi-accs.ll index 89e5147aecc5f..37d0e69b3beaa 100644 --- a/llvm/test/CodeGen/PowerPC/mma-phi-accs.ll +++ b/llvm/test/CodeGen/PowerPC/mma-phi-accs.ll @@ -5,6 +5,12 @@ ; RUN: llc -O3 -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \ ; RUN: -mcpu=pwr10 -ppc-asm-full-reg-names \ ; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-BE +; RUN: llc -O3 -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \ +; RUN: -mcpu=future -ppc-asm-full-reg-names \ +; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-WACC +; RUN: llc -O3 -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \ +; RUN: -mcpu=future -ppc-asm-full-reg-names \ +; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-BE-WACC declare <256 x i1> @llvm.ppc.vsx.assemble.pair(<16 x i8>, <16 x i8>) declare <512 x i1> @llvm.ppc.mma.xxsetaccz() @@ -64,6 +70,60 @@ define void @testPHI1(ptr %Dst, ptr %Src, i32 signext %Len) { ; CHECK-BE-NEXT: stxv vs2, 32(r3) ; CHECK-BE-NEXT: stxv vs3, 48(r3) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: testPHI1: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: cmpwi r5, 3 +; CHECK-WACC-NEXT: dmxxsetaccz wacc0 +; CHECK-WACC-NEXT: blt cr0, .LBB0_3 +; CHECK-WACC-NEXT: # %bb.1: # %for.body.preheader +; CHECK-WACC-NEXT: clrldi r5, r5, 32 +; CHECK-WACC-NEXT: addi r5, r5, -2 +; CHECK-WACC-NEXT: lxv v2, 0(r4) +; CHECK-WACC-NEXT: lxv v3, 16(r4) +; CHECK-WACC-NEXT: mtctr r5 +; CHECK-WACC-NEXT: addi r4, r4, 32 +; CHECK-WACC-NEXT: .p2align 4 +; CHECK-WACC-NEXT: .LBB0_2: # %for.body +; CHECK-WACC-NEXT: # +; CHECK-WACC-NEXT: lxv vs0, 0(r4) +; CHECK-WACC-NEXT: addi r4, r4, 16 +; CHECK-WACC-NEXT: xvf64gerpp wacc0, vsp34, vs0 +; CHECK-WACC-NEXT: bdnz .LBB0_2 +; CHECK-WACC-NEXT: .LBB0_3: # %for.cond.cleanup +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v5, 0(r3) +; CHECK-WACC-NEXT: stxv v4, 16(r3) +; CHECK-WACC-NEXT: stxv v3, 32(r3) +; CHECK-WACC-NEXT: stxv v2, 48(r3) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: testPHI1: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: cmpwi r5, 3 +; CHECK-BE-WACC-NEXT: dmxxsetaccz wacc0 +; 
CHECK-BE-WACC-NEXT: blt cr0, .LBB0_3 +; CHECK-BE-WACC-NEXT: # %bb.1: # %for.body.preheader +; CHECK-BE-WACC-NEXT: clrldi r5, r5, 32 +; CHECK-BE-WACC-NEXT: addi r5, r5, -2 +; CHECK-BE-WACC-NEXT: lxv v2, 0(r4) +; CHECK-BE-WACC-NEXT: lxv v3, 16(r4) +; CHECK-BE-WACC-NEXT: mtctr r5 +; CHECK-BE-WACC-NEXT: addi r4, r4, 32 +; CHECK-BE-WACC-NEXT: .p2align 4 +; CHECK-BE-WACC-NEXT: .LBB0_2: # %for.body +; CHECK-BE-WACC-NEXT: # +; CHECK-BE-WACC-NEXT: lxv vs0, 0(r4) +; CHECK-BE-WACC-NEXT: addi r4, r4, 16 +; CHECK-BE-WACC-NEXT: xvf64gerpp wacc0, vsp34, vs0 +; CHECK-BE-WACC-NEXT: bdnz .LBB0_2 +; CHECK-BE-WACC-NEXT: .LBB0_3: # %for.cond.cleanup +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v2, 0(r3) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r3) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: stxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <16 x i8>, ptr %Src, align 16 %arrayidx1 = getelementptr inbounds <16 x i8>, ptr %Src, i64 1 @@ -161,6 +221,62 @@ define dso_local void @testPHI2(ptr %Dst, ptr %Src, i32 signext %Len) { ; CHECK-BE-NEXT: stxv vs2, 32(r3) ; CHECK-BE-NEXT: stxv vs3, 48(r3) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: testPHI2: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v2, 0(r4) +; CHECK-WACC-NEXT: lxv v3, 16(r4) +; CHECK-WACC-NEXT: lxv vs0, 32(r4) +; CHECK-WACC-NEXT: cmpwi r5, 4 +; CHECK-WACC-NEXT: xvf64ger wacc0, vsp34, vs0 +; CHECK-WACC-NEXT: blt cr0, .LBB1_3 +; CHECK-WACC-NEXT: # %bb.1: # %for.body.preheader +; CHECK-WACC-NEXT: clrldi r5, r5, 32 +; CHECK-WACC-NEXT: addi r5, r5, -3 +; CHECK-WACC-NEXT: mtctr r5 +; CHECK-WACC-NEXT: addi r4, r4, 48 +; CHECK-WACC-NEXT: .p2align 4 +; CHECK-WACC-NEXT: .LBB1_2: # %for.body +; CHECK-WACC-NEXT: # +; CHECK-WACC-NEXT: lxv vs0, 0(r4) +; CHECK-WACC-NEXT: addi r4, r4, 16 +; CHECK-WACC-NEXT: xvf64gerpp wacc0, vsp34, vs0 +; CHECK-WACC-NEXT: bdnz .LBB1_2 +; CHECK-WACC-NEXT: .LBB1_3: # %for.cond.cleanup +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v5, 0(r3) +; CHECK-WACC-NEXT: stxv v4, 16(r3) +; CHECK-WACC-NEXT: stxv v3, 32(r3) +; CHECK-WACC-NEXT: stxv v2, 48(r3) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: testPHI2: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v2, 0(r4) +; CHECK-BE-WACC-NEXT: lxv v3, 16(r4) +; CHECK-BE-WACC-NEXT: lxv vs0, 32(r4) +; CHECK-BE-WACC-NEXT: cmpwi r5, 4 +; CHECK-BE-WACC-NEXT: xvf64ger wacc0, vsp34, vs0 +; CHECK-BE-WACC-NEXT: blt cr0, .LBB1_3 +; CHECK-BE-WACC-NEXT: # %bb.1: # %for.body.preheader +; CHECK-BE-WACC-NEXT: clrldi r5, r5, 32 +; CHECK-BE-WACC-NEXT: addi r5, r5, -3 +; CHECK-BE-WACC-NEXT: mtctr r5 +; CHECK-BE-WACC-NEXT: addi r4, r4, 48 +; CHECK-BE-WACC-NEXT: .p2align 4 +; CHECK-BE-WACC-NEXT: .LBB1_2: # %for.body +; CHECK-BE-WACC-NEXT: # +; CHECK-BE-WACC-NEXT: lxv vs0, 0(r4) +; CHECK-BE-WACC-NEXT: addi r4, r4, 16 +; CHECK-BE-WACC-NEXT: xvf64gerpp wacc0, vsp34, vs0 +; CHECK-BE-WACC-NEXT: bdnz .LBB1_2 +; CHECK-BE-WACC-NEXT: .LBB1_3: # %for.cond.cleanup +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v2, 0(r3) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r3) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: stxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <16 x i8>, ptr %Src, align 16 %arrayidx1 = getelementptr inbounds <16 x i8>, ptr %Src, i64 1 @@ -229,6 +345,28 @@ define void @testImplicitDef(ptr %ptr) { ; CHECK-BE-NEXT: xxmfacc acc0 ; CHECK-BE-NEXT: stxv vs3, 0(r3) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: testImplicitDef: +; 
CHECK-WACC: # %bb.0: # %label1 +; CHECK-WACC-NEXT: # implicit-def: $wacc0 +; CHECK-WACC-NEXT: bc 12, 4*cr5+lt, .LBB2_2 +; CHECK-WACC-NEXT: # %bb.1: # %label2 +; CHECK-WACC-NEXT: xvf64gerpp wacc0, vsp34, vs0 +; CHECK-WACC-NEXT: .LBB2_2: # %label3 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v2, 0(r3) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: testImplicitDef: +; CHECK-BE-WACC: # %bb.0: # %label1 +; CHECK-BE-WACC-NEXT: # implicit-def: $wacc0 +; CHECK-BE-WACC-NEXT: bc 12, 4*cr5+lt, .LBB2_2 +; CHECK-BE-WACC-NEXT: # %bb.1: # %label2 +; CHECK-BE-WACC-NEXT: xvf64gerpp wacc0, vsp34, vs0 +; CHECK-BE-WACC-NEXT: .LBB2_2: # %label3 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 0(r3) +; CHECK-BE-WACC-NEXT: blr label1: br i1 undef, label %label3, label %label2 @@ -312,6 +450,70 @@ define dso_local signext i32 @testNestedPHI(i32 signext %cond, i32 signext %coun ; CHECK-BE-NEXT: stxv vs3, 48(r5) ; CHECK-BE-NEXT: stxv vs2, 32(r5) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: testNestedPHI: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: cmplwi r3, 0 +; CHECK-WACC-NEXT: beq cr0, .LBB3_2 +; CHECK-WACC-NEXT: # %bb.1: # %if.then +; CHECK-WACC-NEXT: xvf32gernp wacc0, v2, v2 +; CHECK-WACC-NEXT: cmpwi r4, 1 +; CHECK-WACC-NEXT: bge cr0, .LBB3_3 +; CHECK-WACC-NEXT: b .LBB3_5 +; CHECK-WACC-NEXT: .LBB3_2: +; CHECK-WACC-NEXT: # implicit-def: $wacc0 +; CHECK-WACC-NEXT: cmpwi r4, 1 +; CHECK-WACC-NEXT: blt cr0, .LBB3_5 +; CHECK-WACC-NEXT: .LBB3_3: # %for.body.preheader +; CHECK-WACC-NEXT: addi r3, r4, -1 +; CHECK-WACC-NEXT: clrldi r3, r3, 32 +; CHECK-WACC-NEXT: addi r3, r3, 1 +; CHECK-WACC-NEXT: mtctr r3 +; CHECK-WACC-NEXT: .p2align 4 +; CHECK-WACC-NEXT: .LBB3_4: # %for.body +; CHECK-WACC-NEXT: # +; CHECK-WACC-NEXT: xvf32gernp wacc0, v2, v2 +; CHECK-WACC-NEXT: bdnz .LBB3_4 +; CHECK-WACC-NEXT: .LBB3_5: # %for.cond.cleanup +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: li r3, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r5) +; CHECK-WACC-NEXT: stxv v5, 32(r5) +; CHECK-WACC-NEXT: stxv v2, 16(r5) +; CHECK-WACC-NEXT: stxv v3, 0(r5) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: testNestedPHI: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: cmplwi r3, 0 +; CHECK-BE-WACC-NEXT: beq cr0, .LBB3_2 +; CHECK-BE-WACC-NEXT: # %bb.1: # %if.then +; CHECK-BE-WACC-NEXT: xvf32gernp wacc0, v2, v2 +; CHECK-BE-WACC-NEXT: cmpwi r4, 1 +; CHECK-BE-WACC-NEXT: bge cr0, .LBB3_3 +; CHECK-BE-WACC-NEXT: b .LBB3_5 +; CHECK-BE-WACC-NEXT: .LBB3_2: +; CHECK-BE-WACC-NEXT: # implicit-def: $wacc0 +; CHECK-BE-WACC-NEXT: cmpwi r4, 1 +; CHECK-BE-WACC-NEXT: blt cr0, .LBB3_5 +; CHECK-BE-WACC-NEXT: .LBB3_3: # %for.body.preheader +; CHECK-BE-WACC-NEXT: addi r3, r4, -1 +; CHECK-BE-WACC-NEXT: clrldi r3, r3, 32 +; CHECK-BE-WACC-NEXT: addi r3, r3, 1 +; CHECK-BE-WACC-NEXT: mtctr r3 +; CHECK-BE-WACC-NEXT: .p2align 4 +; CHECK-BE-WACC-NEXT: .LBB3_4: # %for.body +; CHECK-BE-WACC-NEXT: # +; CHECK-BE-WACC-NEXT: xvf32gernp wacc0, v2, v2 +; CHECK-BE-WACC-NEXT: bdnz .LBB3_4 +; CHECK-BE-WACC-NEXT: .LBB3_5: # %for.cond.cleanup +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: li r3, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r5) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r5) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r5) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r5) +; CHECK-BE-WACC-NEXT: blr entry: %tobool.not = icmp eq i32 %cond, 0 br i1 %tobool.not, label %if.end, label %if.then diff --git a/llvm/test/CodeGen/PowerPC/peephole-mma-phi-liveness.ll 
b/llvm/test/CodeGen/PowerPC/peephole-mma-phi-liveness.ll
index 291cf97fd009e..929bf5f61dd90 100644
--- a/llvm/test/CodeGen/PowerPC/peephole-mma-phi-liveness.ll
+++ b/llvm/test/CodeGen/PowerPC/peephole-mma-phi-liveness.ll
@@ -1,5 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc -verify-machineinstrs -mcpu=ppc -mtriple=powerpc64-ibm-aix < %s | FileCheck %s
+; RUN: llc -verify-machineinstrs -mcpu=future \
+; RUN:   -mtriple=powerpc64-ibm-aix < %s | FileCheck %s --check-prefix=CHECK-WACC

 target datalayout = "E-m:a-Fi64-i64:64-n32:64-S128-v256:256:256-v512:512:512"

@@ -38,6 +40,43 @@ define void @baz(i64 %arg) local_unnamed_addr #0 {
 ; CHECK-NEXT:    xxswapd 0, 0
 ; CHECK-NEXT:    stxv 0, 0(3)
 ; CHECK-NEXT:    blr
+;
+; CHECK-WACC-LABEL: baz:
+; CHECK-WACC:       # %bb.0: # %bb
+; CHECK-WACC-NEXT:    dmxxextfdmr512 34, 36, 0, 0
+; CHECK-WACC-NEXT:    xxmrgld 1, 34, 36
+; CHECK-WACC-NEXT:    xxswapd 2, 1
+; CHECK-WACC-NEXT:    xxlxor 0, 0, 0
+; CHECK-WACC-NEXT:    xvnegdp 1, 1
+; CHECK-WACC-NEXT:    xvnegdp 2, 2
+; CHECK-WACC-NEXT:    xvsubdp 1, 1, 0
+; CHECK-WACC-NEXT:    xvsubdp 2, 2, 37
+; CHECK-WACC-NEXT:    xvmuldp 1, 1, 0
+; CHECK-WACC-NEXT:    xvmuldp 2, 2, 0
+; CHECK-WACC-NEXT:    xvmaddadp 1, 0, 0
+; CHECK-WACC-NEXT:    xvmaddadp 2, 0, 0
+; CHECK-WACC-NEXT:    stxv 1, 0(3)
+; CHECK-WACC-NEXT:    stxv 2, 0(3)
+; CHECK-WACC-NEXT:    # implicit-def: $wacc0
+; CHECK-WACC-NEXT:    bc 12, 20, L..BB0_2
+; CHECK-WACC-NEXT:  # %bb.1: # %bb10
+; CHECK-WACC-NEXT:    xvf64gerpp 0, 34, 0
+; CHECK-WACC-NEXT:  L..BB0_2: # %bb12
+; CHECK-WACC-NEXT:    cmpdi 3, 0
+; CHECK-WACC-NEXT:    .align 4
+; CHECK-WACC-NEXT:  L..BB0_3: # %bb13
+; CHECK-WACC-NEXT:    #
+; CHECK-WACC-NEXT:    bc 4, 2, L..BB0_3
+; CHECK-WACC-NEXT:  # %bb.4: # %bb14
+; CHECK-WACC-NEXT:    dmxxextfdmr512 34, 36, 0, 0
+; CHECK-WACC-NEXT:    xxlxor 0, 0, 0
+; CHECK-WACC-NEXT:    xvsubdp 1, 0, 35
+; CHECK-WACC-NEXT:    xxlxor 2, 2, 2
+; CHECK-WACC-NEXT:    xvmaddadp 2, 1, 2
+; CHECK-WACC-NEXT:    xvadddp 0, 2, 0
+; CHECK-WACC-NEXT:    xxswapd 0, 0
+; CHECK-WACC-NEXT:    stxv 0, 0(3)
+; CHECK-WACC-NEXT:    blr
 bb:
   %call = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.ppc.mma.disassemble.acc(<512 x i1> poison)
   %extractvalue = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %call, 0

From 2237a18f25dc93b46d478c9c7da6a514362cb6e3 Mon Sep 17 00:00:00 2001
From: Alex Voicu <alexandru.voicu@amd.com>
Date: Tue, 4 Nov 2025 16:37:16 +0200
Subject: [PATCH 168/313] [SPIRV] Enable OpenCL max_work_group_size translation
 via `SPV_INTEL_kernel_attributes` (#165891)

This adds backend (BE) support for the
[`SPV_INTEL_kernel_attributes`](https://github.khronos.org/SPIRV-Registry/extensions/INTEL/SPV_INTEL_kernel_attributes.html)
extension, which is needed to encode the useful `max_work_group_size`
kernel attribute via `OpExecutionMode MaxWorkgroupSizeINTEL`. That is the
only Execution Mode added by the extension for which this patch adds full
processing; future patches will add the remaining Execution Modes and
Capabilities.

The test is adapted from the equivalent Translator test; it depends on
#165815.
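To make the encoding concrete, here is a minimal sketch of the mapping this
patch enables. It mirrors the `Dim2` kernel from the test added below, so the
metadata shape, the compile command, and the resulting execution mode come
from this patch rather than being invented:

```llvm
; A SPIR kernel annotated with the max_work_group_size metadata.
define spir_kernel void @Dim2() !max_work_group_size !1 {
  ret void
}
!1 = !{i32 8, i32 4}

; Compiled with:
;   llc -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_INTEL_kernel_attributes
; the backend emits (%N standing in for Dim2's entry-point id):
;   OpCapability KernelAttributesINTEL
;   OpExtension "SPV_INTEL_kernel_attributes"
;   OpExecutionMode %N MaxWorkgroupSizeINTEL 8 4 1
```

Note how the unspecified third dimension is padded with 1, matching the
`3, 1` arguments passed to `outputExecutionModeFromMDNode` in
`SPIRVAsmPrinter.cpp` (plausibly the operand count and the default value).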
--- llvm/docs/SPIRVUsage.rst | 6 ++-- llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp | 5 +++ llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp | 4 ++- llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp | 4 +++ .../lib/Target/SPIRV/SPIRVSymbolicOperands.td | 16 +++++++++- .../max_work_group_size.ll | 32 +++++++++++++++++++ 6 files changed, 63 insertions(+), 4 deletions(-) create mode 100644 llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_kernel_attributes/max_work_group_size.ll diff --git a/llvm/docs/SPIRVUsage.rst b/llvm/docs/SPIRVUsage.rst index 85eeabf10244a..99f56a5cbc63a 100644 --- a/llvm/docs/SPIRVUsage.rst +++ b/llvm/docs/SPIRVUsage.rst @@ -187,6 +187,8 @@ Below is a list of supported SPIR-V extensions, sorted alphabetically by their e - Adds decorations that can be applied to global (module scope) variables. * - ``SPV_INTEL_global_variable_fpga_decorations`` - Adds decorations that can be applied to global (module scope) variables to help code generation for FPGA devices. + * - ``SPV_INTEL_kernel_attributes`` + - Adds execution modes that can be applied to entry points to inform scheduling. * - ``SPV_INTEL_media_block_io`` - Adds additional subgroup block read and write functionality that allow applications to flexibly specify the width and height of the block to read from or write to a 2D image. * - ``SPV_INTEL_memory_access_aliasing`` @@ -226,9 +228,9 @@ Below is a list of supported SPIR-V extensions, sorted alphabetically by their e * - ``SPV_INTEL_fp_max_error`` - Adds the ability to specify the maximum error for floating-point operations. * - ``SPV_INTEL_ternary_bitwise_function`` - - Adds a bitwise instruction on three operands and a look-up table index for specifying the bitwise operation to perform. + - Adds a bitwise instruction on three operands and a look-up table index for specifying the bitwise operation to perform. * - ``SPV_INTEL_subgroup_matrix_multiply_accumulate`` - - Adds an instruction to compute the matrix product of an M x K matrix with a K x N matrix and then add an M x N matrix. + - Adds an instruction to compute the matrix product of an M x K matrix with a K x N matrix and then add an M x N matrix. * - ``SPV_INTEL_int4`` - Adds support for 4-bit integer type, and allow this type to be used in cooperative matrices. 
* - ``SPV_KHR_float_controls2`` diff --git a/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp b/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp index 640b014646f36..0175f2fb3698b 100644 --- a/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp @@ -577,6 +577,11 @@ void SPIRVAsmPrinter::outputExecutionMode(const Module &M) { if (MDNode *Node = F.getMetadata("intel_reqd_sub_group_size")) outputExecutionModeFromMDNode(FReg, Node, SPIRV::ExecutionMode::SubgroupSize, 0, 0); + if (MDNode *Node = F.getMetadata("max_work_group_size")) { + if (ST->canUseExtension(SPIRV::Extension::SPV_INTEL_kernel_attributes)) + outputExecutionModeFromMDNode( + FReg, Node, SPIRV::ExecutionMode::MaxWorkgroupSizeINTEL, 3, 1); + } if (MDNode *Node = F.getMetadata("vec_type_hint")) { MCInst Inst; Inst.setOpcode(SPIRV::OpExecutionMode); diff --git a/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp b/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp index 96f5dee21bc2a..f0558ebcb6681 100644 --- a/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp @@ -155,7 +155,9 @@ static const std::map<std::string, SPIRV::Extension::Extension, std::less<>> {"SPV_INTEL_predicated_io", SPIRV::Extension::Extension::SPV_INTEL_predicated_io}, {"SPV_KHR_maximal_reconvergence", - SPIRV::Extension::Extension::SPV_KHR_maximal_reconvergence}}; + SPIRV::Extension::Extension::SPV_KHR_maximal_reconvergence}, + {"SPV_INTEL_kernel_attributes", + SPIRV::Extension::Extension::SPV_INTEL_kernel_attributes}}; bool SPIRVExtensionsParser::parse(cl::Option &O, StringRef ArgName, StringRef ArgValue, diff --git a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp index db036a55ee6c6..d154a06c6f313 100644 --- a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp @@ -2180,6 +2180,10 @@ static void collectReqs(const Module &M, SPIRV::ModuleAnalysisInfo &MAI, MAI.Reqs.getAndAddRequirements( SPIRV::OperandCategory::ExecutionModeOperand, SPIRV::ExecutionMode::SubgroupSize, ST); + if (F.getMetadata("max_work_group_size")) + MAI.Reqs.getAndAddRequirements( + SPIRV::OperandCategory::ExecutionModeOperand, + SPIRV::ExecutionMode::MaxWorkgroupSizeINTEL, ST); if (F.getMetadata("vec_type_hint")) MAI.Reqs.getAndAddRequirements( SPIRV::OperandCategory::ExecutionModeOperand, diff --git a/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td b/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td index 7d08b29a51a6e..267118364c371 100644 --- a/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td +++ b/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td @@ -587,6 +587,11 @@ defm CooperativeMatrixBFloat16ComponentTypeINTEL : CapabilityOperand<6437, 0, 0, defm RoundToInfinityINTEL : CapabilityOperand<5582, 0, 0, [SPV_INTEL_float_controls2], []>; defm FloatingPointModeINTEL : CapabilityOperand<5583, 0, 0, [SPV_INTEL_float_controls2], []>; defm FunctionFloatControlINTEL : CapabilityOperand<5821, 0, 0, [SPV_INTEL_float_controls2], []>; +defm KernelAttributesINTEL : CapabilityOperand<5892, 0, 0, [SPV_INTEL_kernel_attributes], [Kernel]>; +// TODO-SPIRV: add these once they are used / tested. 
+// defm FPGAKernelAttributesINTEL : CapabilityOperand<5897, 0, 0, [SPV_INTEL_kernel_attributes], [Kernel]>; +// defm FPGAKernelAttributesv2INTEL : CapabilityOperand<6161, 0, 0, [SPV_INTEL_kernel_attributes], [Kernel]>; +// END TODO-SPIRV defm LongCompositesINTEL : CapabilityOperand<6089, 0, 0, [SPV_INTEL_long_composites], []>; defm BindlessImagesINTEL : CapabilityOperand<6528, 0, 0, [SPV_INTEL_bindless_images], []>; defm MemoryAccessAliasingINTEL : CapabilityOperand<5910, 0, 0, [SPV_INTEL_memory_access_aliasing], []>; @@ -805,6 +810,15 @@ defm RoundingModeRTPINTEL : ExecutionModeOperand<5620, [RoundToInfinityINTEL]>; defm RoundingModeRTNINTEL : ExecutionModeOperand<5621, [RoundToInfinityINTEL]>; defm FloatingPointModeALTINTEL : ExecutionModeOperand<5622, [FloatingPointModeINTEL]>; defm FloatingPointModeIEEEINTEL : ExecutionModeOperand<5623, [FloatingPointModeINTEL]>; +defm MaxWorkgroupSizeINTEL : ExecutionModeOperand<5893, [KernelAttributesINTEL]>; +// TODO-SPIRV: Add the following once they are used / tested. +// defm MaxWorkDimINTEL : ExecutionModeOperand<5894, [KernelAttributesINTEL]>; +// defm NoGlobalOffsetINTEL : ExecutionModeOperand<5895, [KernelAttributesINTEL]>; +// defm NumSIMDWorkitemsINTEL : ExecutionModeOperand<5896, [FPGAKernelAttributesINTEL]>; +// defm SchedulerTargetFmaxMhzINTEL : ExecutionModeOperand<5903, [FPGAKernelAttributesINTEL]>; +// defm StreamingInterfaceINTEL : ExecutionModeOperand<6154, [FPGAKernelAttributesv2INTEL]>; +// defm RegisterMapInterfaceINTEL : ExecutionModeOperand<6160, [FPGAKernelAttributesv2INTEL]>; +// END TODO-SPIRV defm FPFastMathDefault : ExecutionModeOperand<6028, [FloatControls2]>; defm MaximallyReconvergesKHR : ExecutionModeOperand<6023, [Shader]>; @@ -1919,7 +1933,7 @@ defm GenericCastToPtr : SpecConstantOpOperandsOperand<122, [], [Kernel]>; defm PtrCastToGeneric : SpecConstantOpOperandsOperand<121, [], [Kernel]>; defm Bitcast : SpecConstantOpOperandsOperand<124, [], []>; defm QuantizeToF16 : SpecConstantOpOperandsOperand<116, [], [Shader]>; -// Arithmetic +// Arithmetic defm SNegate : SpecConstantOpOperandsOperand<126, [], []>; defm Not : SpecConstantOpOperandsOperand<200, [], []>; defm IAdd : SpecConstantOpOperandsOperand<128, [], []>; diff --git a/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_kernel_attributes/max_work_group_size.ll b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_kernel_attributes/max_work_group_size.ll new file mode 100644 index 0000000000000..717771c965496 --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_kernel_attributes/max_work_group_size.ll @@ -0,0 +1,32 @@ +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_INTEL_kernel_attributes %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-amd-amdhsa %s -o - | FileCheck %s +; %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_INTEL_kernel_attributes %s -o - -filetype=obj | spirv-val %} +; %if spirv-tools %{ llc -O0 -mtriple=spirv64-amd-amdhsa %s -o - -filetype=obj | spirv-val %} + +; CHECK: OpCapability KernelAttributesINTEL +; CHECK: OpExtension "SPV_INTEL_kernel_attributes" +; CHECK: OpEntryPoint {{.*}} %[[DIM1:[0-9]+]] "Dim1" +; CHECK: OpEntryPoint {{.*}} %[[DIM2:[0-9]+]] "Dim2" +; CHECK: OpEntryPoint {{.*}} %[[DIM3:[0-9]+]] "Dim3" +; CHECK: OpExecutionMode %[[DIM1]] MaxWorkgroupSizeINTEL 4 1 1 +; CHECK: OpExecutionMode %[[DIM2]] MaxWorkgroupSizeINTEL 8 4 1 +; CHECK: OpExecutionMode %[[DIM3]] MaxWorkgroupSizeINTEL 16 8 4 +; CHECK: %[[DIM1]] = OpFunction +; CHECK: 
%[[DIM2]] = OpFunction
+; CHECK: %[[DIM3]] = OpFunction
+
+define spir_kernel void @Dim1() !max_work_group_size !0 {
+  ret void
+}
+
+define spir_kernel void @Dim2() !max_work_group_size !1 {
+  ret void
+}
+
+define spir_kernel void @Dim3() !max_work_group_size !2 {
+  ret void
+}
+
+!0 = !{i32 4}
+!1 = !{i32 8, i32 4}
+!2 = !{i32 16, i32 8, i32 4}

From bdf02486a3b148ef4302d6948fd8eb76bd037a04 Mon Sep 17 00:00:00 2001
From: Manuel Carrasco <Manuel.Carrasco@amd.com>
Date: Tue, 4 Nov 2025 14:38:14 +0000
Subject: [PATCH 169/313] [clang][Driver] Fix crash in --offload-new-driver
 and -save-temps. (#165606)

`clang -x hip foo.c --offload-arch=amdgcnspirv --offload-new-driver
-save-temps` was crashing with the following error:

```
/usr/bin/ld: input file 'foo-x86_64-unknown-linux-gnu.o' is the same as output file
build/bin/clang-linker-wrapper: error: 'ld' failed
```

The `LinkerWrapperJobAction` [is created](https://github.com/llvm/llvm-project/blob/957598f71bd8baa029d886e59ed9aed60e6e9bb9/clang/lib/Driver/Driver.cpp#L4888)
with `types::TY_Object`, which makes `Driver::GetNamedOutputPath` assign the
same name as the assembler's output, thus causing the crash.
---
 clang/lib/Driver/Driver.cpp                         | 13 ++++++++++---
 clang/test/Driver/hip-spirv-translator-new-driver.c |  9 +++++++++
 2 files changed, 19 insertions(+), 3 deletions(-)
 create mode 100644 clang/test/Driver/hip-spirv-translator-new-driver.c

diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp
index 51618d17a4180..6f6a35b4c8c17 100644
--- a/clang/lib/Driver/Driver.cpp
+++ b/clang/lib/Driver/Driver.cpp
@@ -6463,9 +6463,16 @@ const char *Driver::GetNamedOutputPath(Compilation &C, const JobAction &JA,
            (JA.getOffloadingDeviceKind() == Action::OFK_OpenMP && TC &&
             TC->getTriple().isAMDGPU()));
   };
-  if (!AtTopLevel && JA.getType() == types::TY_LLVM_BC &&
-      (C.getArgs().hasArg(options::OPT_emit_llvm) ||
-       IsAMDRDCInCompilePhase(JA, C.getArgs())))
+
+  // The linker wrapper may not support the input and output files to be the
+  // same one, and without it -save-temps can fail.
+  bool IsLinkerWrapper =
+      JA.getType() == types::TY_Object && isa<LinkerWrapperJobAction>(JA);
+  bool IsEmitBitcode = JA.getType() == types::TY_LLVM_BC &&
+                       (C.getArgs().hasArg(options::OPT_emit_llvm) ||
+                        IsAMDRDCInCompilePhase(JA, C.getArgs()));
+
+  if (!AtTopLevel && (IsLinkerWrapper || IsEmitBitcode))
     Suffixed += ".tmp";
   Suffixed += '.';
   Suffixed += Suffix;

diff --git a/clang/test/Driver/hip-spirv-translator-new-driver.c b/clang/test/Driver/hip-spirv-translator-new-driver.c
new file mode 100644
index 0000000000000..315a74635b9b3
--- /dev/null
+++ b/clang/test/Driver/hip-spirv-translator-new-driver.c
@@ -0,0 +1,9 @@
+// The --offload-new-driver was crashing when using -save-temps due to a failure in clang-linker-wrapper.
+// The input and output files cannot be the same.
+ +// RUN: %clang --offload-new-driver -### -save-temps -nogpuinc -nogpulib \ +// RUN: --offload-arch=amdgcnspirv -x hip %s 2>&1 \ +// RUN: | FileCheck %s + +// CHECK-NOT: {{".*clang-linker-wrapper.*"}} {{.*}} "-o" "[[OUTPUT_FILE:.*.o]]" {{.*}}"[[OUTPUT_FILE]]" +// CHECK: {{".*clang-linker-wrapper.*"}} {{.*}} "-o" {{".*.tmp.o"}} From 718a3b268fcff1f985d9ee04a53fb1da4569416a Mon Sep 17 00:00:00 2001 From: YongKang Zhu <yongzhu@fb.com> Date: Tue, 4 Nov 2025 06:49:04 -0800 Subject: [PATCH 170/313] [BOLT][AArch64] Run LDR relaxation (#165787) Replace the current `ADRRelaxationPass` with `AArch64RelaxationPass`, which, besides the existing ADR relaxation, will also run LDR relaxation that for now only handles these two forms of LDR instructions: `ldr Xt, [label]` and `ldr Wt, [label]`. --- bolt/include/bolt/Core/MCPlusBuilder.h | 23 ++++ ...laxationPass.h => AArch64RelaxationPass.h} | 22 ++-- bolt/include/bolt/Passes/FixRelaxationPass.h | 2 +- ...tionPass.cpp => AArch64RelaxationPass.cpp} | 47 ++++--- bolt/lib/Passes/CMakeLists.txt | 2 +- bolt/lib/Rewrite/BinaryPassManager.cpp | 12 +- .../Target/AArch64/AArch64MCPlusBuilder.cpp | 42 ++++++ bolt/test/AArch64/ldr-relaxation.s | 122 ++++++++++++++++++ 8 files changed, 232 insertions(+), 40 deletions(-) rename bolt/include/bolt/Passes/{ADRRelaxationPass.h => AArch64RelaxationPass.h} (51%) rename bolt/lib/Passes/{ADRRelaxationPass.cpp => AArch64RelaxationPass.cpp} (67%) create mode 100644 bolt/test/AArch64/ldr-relaxation.s diff --git a/bolt/include/bolt/Core/MCPlusBuilder.h b/bolt/include/bolt/Core/MCPlusBuilder.h index d666c10885ad5..5e349cd69fb43 100644 --- a/bolt/include/bolt/Core/MCPlusBuilder.h +++ b/bolt/include/bolt/Core/MCPlusBuilder.h @@ -840,6 +840,16 @@ class MCPlusBuilder { return false; } + virtual bool isLDRWl(const MCInst &Inst) const { + llvm_unreachable("not implemented"); + return false; + } + + virtual bool isLDRXl(const MCInst &Inst) const { + llvm_unreachable("not implemented"); + return false; + } + virtual bool isMOVW(const MCInst &Inst) const { llvm_unreachable("not implemented"); return false; @@ -1789,6 +1799,19 @@ class MCPlusBuilder { llvm_unreachable("not implemented"); } + /// Take \p LDRInst and return ADRP+LDR instruction sequence - for + /// + /// ldr x0, [label] + /// + /// the following sequence will be generated: + /// + /// adrp x0, PageBase(label) + /// ldr x0, [x0, PageOffset(label)] + virtual InstructionListType createAdrpLdr(const MCInst &LDRInst, + MCContext *Ctx) const { + llvm_unreachable("not implemented"); + } + /// Return not 0 if the instruction CurInst, in combination with the recent /// history of disassembled instructions supplied by [Begin, End), is a linker /// generated veneer/stub that needs patching. This happens in AArch64 when diff --git a/bolt/include/bolt/Passes/ADRRelaxationPass.h b/bolt/include/bolt/Passes/AArch64RelaxationPass.h similarity index 51% rename from bolt/include/bolt/Passes/ADRRelaxationPass.h rename to bolt/include/bolt/Passes/AArch64RelaxationPass.h index b9f92dec7f03b..b9185a1e34388 100644 --- a/bolt/include/bolt/Passes/ADRRelaxationPass.h +++ b/bolt/include/bolt/Passes/AArch64RelaxationPass.h @@ -1,4 +1,4 @@ -//===- bolt/Passes/ADRRelaxationPass.h --------------------------*- C++ -*-===// +//===- bolt/Passes/AArch64RelaxationPass.h ----------------------*- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
@@ -6,29 +6,29 @@ // //===----------------------------------------------------------------------===// // -// This file declares the ADRRelaxationPass class, which replaces AArch64 -// non-local ADR instructions with ADRP + ADD due to small offset range of ADR -// instruction (+- 1MB) which could be easily overflowed after BOLT -// optimizations. Such problems are usually connected with errata 843419 -// https://developer.arm.com/documentation/epm048406/2100/ +// This file declares the AArch64RelaxationPass class, which replaces AArch64 +// non-local ADR/LDR instructions with ADRP + ADD/LDR due to small offset +// range of ADR and LDR instruction (+- 1MB) which could be easily overflowed +// after BOLT optimizations. Such problems are usually connected with errata +// 843419: https://developer.arm.com/documentation/epm048406/2100/ // The linker could replace ADRP instruction with ADR in some cases. // //===----------------------------------------------------------------------===// -#ifndef BOLT_PASSES_ADRRELAXATIONPASS_H -#define BOLT_PASSES_ADRRELAXATIONPASS_H +#ifndef BOLT_PASSES_AARCH64RELAXATIONPASS_H +#define BOLT_PASSES_AARCH64RELAXATIONPASS_H #include "bolt/Passes/BinaryPasses.h" namespace llvm { namespace bolt { -class ADRRelaxationPass : public BinaryFunctionPass { +class AArch64RelaxationPass : public BinaryFunctionPass { public: - explicit ADRRelaxationPass(const cl::opt<bool> &PrintPass) + explicit AArch64RelaxationPass(const cl::opt<bool> &PrintPass) : BinaryFunctionPass(PrintPass) {} - const char *getName() const override { return "adr-relaxation"; } + const char *getName() const override { return "aarch64-relaxation"; } /// Pass entry point Error runOnFunctions(BinaryContext &BC) override; diff --git a/bolt/include/bolt/Passes/FixRelaxationPass.h b/bolt/include/bolt/Passes/FixRelaxationPass.h index 50b64480aa62e..cf5a8a1fcb134 100644 --- a/bolt/include/bolt/Passes/FixRelaxationPass.h +++ b/bolt/include/bolt/Passes/FixRelaxationPass.h @@ -1,4 +1,4 @@ -//===- bolt/Passes/ADRRelaxationPass.h --------------------------*- C++ -*-===// +//===- bolt/Passes/FixRelaxationPass.h --------------------------*- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/bolt/lib/Passes/ADRRelaxationPass.cpp b/bolt/lib/Passes/AArch64RelaxationPass.cpp similarity index 67% rename from bolt/lib/Passes/ADRRelaxationPass.cpp rename to bolt/lib/Passes/AArch64RelaxationPass.cpp index c3954c94a7f92..610adad58cfcb 100644 --- a/bolt/lib/Passes/ADRRelaxationPass.cpp +++ b/bolt/lib/Passes/AArch64RelaxationPass.cpp @@ -1,4 +1,4 @@ -//===- bolt/Passes/ADRRelaxationPass.cpp ----------------------------------===// +//===- bolt/Passes/AArch64RelaxationPass.cpp ------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -6,11 +6,11 @@ // //===----------------------------------------------------------------------===// // -// This file implements the ADRRelaxationPass class. +// This file implements the AArch64RelaxationPass class. 
// //===----------------------------------------------------------------------===// -#include "bolt/Passes/ADRRelaxationPass.h" +#include "bolt/Passes/AArch64RelaxationPass.h" #include "bolt/Core/ParallelUtilities.h" #include "bolt/Utils/CommandLineOpts.h" #include <iterator> @@ -20,10 +20,10 @@ using namespace llvm; namespace opts { extern cl::OptionCategory BoltCategory; -static cl::opt<bool> - AdrPassOpt("adr-relaxation", - cl::desc("Replace ARM non-local ADR instructions with ADRP"), - cl::init(true), cl::cat(BoltCategory), cl::ReallyHidden); +static cl::opt<bool> AArch64PassOpt( + "aarch64-relaxation", + cl::desc("Replace ARM non-local ADR/LDR instructions with ADRP"), + cl::init(true), cl::cat(BoltCategory), cl::ReallyHidden); } // namespace opts namespace llvm { @@ -35,7 +35,7 @@ namespace bolt { // jobs and checking the exit flag after it. static bool PassFailed = false; -void ADRRelaxationPass::runOnFunction(BinaryFunction &BF) { +void AArch64RelaxationPass::runOnFunction(BinaryFunction &BF) { if (PassFailed) return; @@ -43,10 +43,13 @@ void ADRRelaxationPass::runOnFunction(BinaryFunction &BF) { for (BinaryBasicBlock &BB : BF) { for (auto It = BB.begin(); It != BB.end(); ++It) { MCInst &Inst = *It; - if (!BC.MIB->isADR(Inst)) + bool IsADR = BC.MIB->isADR(Inst); + + // TODO: Handle other types of LDR (literal, PC-relative) instructions. + if (!IsADR && !BC.MIB->isLDRXl(Inst) && !BC.MIB->isLDRWl(Inst)) continue; - const MCSymbol *Symbol = BC.MIB->getTargetSymbol(Inst); + const MCSymbol *Symbol = BC.MIB->getTargetSymbol(Inst, IsADR ? 0 : 1); if (!Symbol) continue; @@ -56,25 +59,27 @@ void ADRRelaxationPass::runOnFunction(BinaryFunction &BF) { continue; } - // Don't relax ADR if it points to the same function and is in the main - // fragment and BF initial size is < 1MB. + // Don't relax ADR/LDR if it points to the same function and is in the + // main fragment and BF initial size is < 1MB. const unsigned OneMB = 0x100000; if (BF.getSize() < OneMB) { BinaryFunction *TargetBF = BC.getFunctionForSymbol(Symbol); if (TargetBF == &BF && !BB.isSplit()) continue; - // No relaxation needed if ADR references a basic block in the same + // No relaxation needed if ADR/LDR references a basic block in the same // fragment. if (BinaryBasicBlock *TargetBB = BF.getBasicBlockForLabel(Symbol)) if (BB.getFragmentNum() == TargetBB->getFragmentNum()) continue; } - InstructionListType AdrpAdd; + InstructionListType AdrpMaterialization; { auto L = BC.scopeLock(); - AdrpAdd = BC.MIB->undoAdrpAddRelaxation(Inst, BC.Ctx.get()); + AdrpMaterialization = + IsADR ? BC.MIB->undoAdrpAddRelaxation(Inst, BC.Ctx.get()) + : BC.MIB->createAdrpLdr(Inst, BC.Ctx.get()); } if (It != BB.begin() && BC.MIB->isNoop(*std::prev(It))) { @@ -88,18 +93,18 @@ void ADRRelaxationPass::runOnFunction(BinaryFunction &BF) { // invalidate this offset, so we have to rely on linker-inserted NOP to // replace it with ADRP, and abort if it is not present. auto L = BC.scopeLock(); - BC.errs() << "BOLT-ERROR: cannot relax ADR in non-simple function " - << BF << '\n'; + BC.errs() << "BOLT-ERROR: cannot relax " << (IsADR ? 
"ADR" : "LDR") + << " in non-simple function " << BF << '\n'; PassFailed = true; return; } - It = BB.replaceInstruction(It, AdrpAdd); + It = BB.replaceInstruction(It, AdrpMaterialization); } } } -Error ADRRelaxationPass::runOnFunctions(BinaryContext &BC) { - if (!opts::AdrPassOpt || !BC.HasRelocations) +Error AArch64RelaxationPass::runOnFunctions(BinaryContext &BC) { + if (!opts::AArch64PassOpt || !BC.HasRelocations) return Error::success(); ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) { @@ -108,7 +113,7 @@ Error ADRRelaxationPass::runOnFunctions(BinaryContext &BC) { ParallelUtilities::runOnEachFunction( BC, ParallelUtilities::SchedulingPolicy::SP_TRIVIAL, WorkFun, nullptr, - "ADRRelaxationPass"); + "AArch64RelaxationPass"); if (PassFailed) return createFatalBOLTError(""); diff --git a/bolt/lib/Passes/CMakeLists.txt b/bolt/lib/Passes/CMakeLists.txt index d7519518f186f..3197e62faad21 100644 --- a/bolt/lib/Passes/CMakeLists.txt +++ b/bolt/lib/Passes/CMakeLists.txt @@ -1,5 +1,5 @@ add_llvm_library(LLVMBOLTPasses - ADRRelaxationPass.cpp + AArch64RelaxationPass.cpp Aligner.cpp AllocCombiner.cpp AsmDump.cpp diff --git a/bolt/lib/Rewrite/BinaryPassManager.cpp b/bolt/lib/Rewrite/BinaryPassManager.cpp index 782137e807662..1a0f6d75d63e8 100644 --- a/bolt/lib/Rewrite/BinaryPassManager.cpp +++ b/bolt/lib/Rewrite/BinaryPassManager.cpp @@ -7,7 +7,7 @@ //===----------------------------------------------------------------------===// #include "bolt/Rewrite/BinaryPassManager.h" -#include "bolt/Passes/ADRRelaxationPass.h" +#include "bolt/Passes/AArch64RelaxationPass.h" #include "bolt/Passes/Aligner.h" #include "bolt/Passes/AllocCombiner.h" #include "bolt/Passes/AsmDump.h" @@ -129,10 +129,10 @@ static cl::opt<bool> PrintJTFootprintReduction( cl::desc("print function after jt-footprint-reduction pass"), cl::Hidden, cl::cat(BoltOptCategory)); -static cl::opt<bool> - PrintAdrRelaxation("print-adr-relaxation", - cl::desc("print functions after ADR Relaxation pass"), - cl::Hidden, cl::cat(BoltOptCategory)); +static cl::opt<bool> PrintAArch64Relaxation( + "print-adr-ldr-relaxation", + cl::desc("print functions after ADR/LDR Relaxation pass"), cl::Hidden, + cl::cat(BoltOptCategory)); static cl::opt<bool> PrintLongJmp("print-longjmp", @@ -517,7 +517,7 @@ Error BinaryFunctionPassManager::runAllPasses(BinaryContext &BC) { if (BC.isAArch64()) { Manager.registerPass( - std::make_unique<ADRRelaxationPass>(PrintAdrRelaxation)); + std::make_unique<AArch64RelaxationPass>(PrintAArch64Relaxation)); // Tighten branches according to offset differences between branch and // targets. 
No extra instructions after this pass, otherwise we may have diff --git a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp index 7769162d67eaf..8a496c566b06b 100644 --- a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp +++ b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp @@ -142,6 +142,7 @@ static InstructionListType createIncMemory(MCPhysReg RegTo, MCPhysReg RegTmp) { atomicAdd(Insts.back(), RegTo, RegTmp); return Insts; } + class AArch64MCPlusBuilder : public MCPlusBuilder { public: using MCPlusBuilder::MCPlusBuilder; @@ -583,6 +584,14 @@ class AArch64MCPlusBuilder : public MCPlusBuilder { return Inst.getOpcode() == AArch64::ADDXri; } + bool isLDRWl(const MCInst &Inst) const override { + return Inst.getOpcode() == AArch64::LDRWl; + } + + bool isLDRXl(const MCInst &Inst) const override { + return Inst.getOpcode() == AArch64::LDRXl; + } + MCPhysReg getADRReg(const MCInst &Inst) const { assert((isADR(Inst) || isADRP(Inst)) && "Not an ADR instruction"); assert(MCPlus::getNumPrimeOperands(Inst) != 0 && @@ -602,6 +611,39 @@ class AArch64MCPlusBuilder : public MCPlusBuilder { return materializeAddress(Target, Ctx, Reg, Addend); } + InstructionListType createAdrpLdr(const MCInst &LDRInst, + MCContext *Ctx) const override { + assert((isLDRXl(LDRInst) || isLDRWl(LDRInst)) && + "LDR (literal, 32 or 64-bit integer load) instruction expected"); + assert(LDRInst.getOperand(0).isReg() && + "unexpected operand in LDR instruction"); + const MCPhysReg DataReg = LDRInst.getOperand(0).getReg(); + const MCPhysReg AddrReg = + isLDRXl(LDRInst) ? DataReg + : (MCPhysReg)RegInfo->getMatchingSuperReg( + DataReg, AArch64::sub_32, + &RegInfo->getRegClass(AArch64::GPR64RegClassID)); + const MCSymbol *Target = getTargetSymbol(LDRInst, 1); + assert(Target && "missing target symbol in LDR instruction"); + + InstructionListType Insts(2); + Insts[0].setOpcode(AArch64::ADRP); + Insts[0].clear(); + Insts[0].addOperand(MCOperand::createReg(AddrReg)); + Insts[0].addOperand(MCOperand::createImm(0)); + setOperandToSymbolRef(Insts[0], /* OpNum */ 1, Target, 0, Ctx, + ELF::R_AARCH64_NONE); + Insts[1].setOpcode(isLDRXl(LDRInst) ? AArch64::LDRXui : AArch64::LDRWui); + Insts[1].clear(); + Insts[1].addOperand(MCOperand::createReg(DataReg)); + Insts[1].addOperand(MCOperand::createReg(AddrReg)); + Insts[1].addOperand(MCOperand::createImm(0)); + Insts[1].addOperand(MCOperand::createImm(0)); + setOperandToSymbolRef(Insts[1], /* OpNum */ 2, Target, 0, Ctx, + ELF::R_AARCH64_ADD_ABS_LO12_NC); + return Insts; + } + bool isTB(const MCInst &Inst) const { return (Inst.getOpcode() == AArch64::TBNZW || Inst.getOpcode() == AArch64::TBNZX || diff --git a/bolt/test/AArch64/ldr-relaxation.s b/bolt/test/AArch64/ldr-relaxation.s new file mode 100644 index 0000000000000..7632504a01635 --- /dev/null +++ b/bolt/test/AArch64/ldr-relaxation.s @@ -0,0 +1,122 @@ +## Check that LDR relaxation will fail since LDR is inside a non-simple +## function and there is no NOP next to it. 
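+## (BOLT cannot insert new instructions into a non-simple function, so
+## relaxation has to overwrite a linker-inserted NOP next to the LDR; when no
+## such NOP exists, the pass reports an error, as checked below.)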
+ +# RUN: llvm-mc -filetype=obj -triple aarch64-unknown-unknown \ +# RUN: --defsym FAIL=1 %s -o %t.o +# RUN: %clang %cflags %t.o -o %t.so -Wl,-q +# RUN: not llvm-bolt %t.so -o %t.bolt 2>&1 | FileCheck %s --check-prefix=FAIL + +# FAIL: BOLT-ERROR: cannot relax LDR in non-simple function _start + +.ifdef FAIL + .text + .global _start + .type _start, %function +_start: + .cfi_startproc + br x2 + ldr x0, _foo + ret + .cfi_endproc +.size _start, .-_start +.endif + +## Check that LDR relaxation is not needed since the reference is not far away. + +# RUN: llvm-mc -filetype=obj -triple aarch64-unknown-unknown \ +# RUN: --defsym NOT_NEEDED=1 %s -o %t.o +# RUN: %clang %cflags %t.o -o %t.so -Wl,-q +# RUN: llvm-bolt %t.so -o %t.bolt +# RUN: llvm-objdump -d %t.bolt | FileCheck %s --check-prefix=NOT_NEEDED + +# NOT_NEEDED: <_start> +# NOT_NEEDED-NEXT: ldr + +.ifdef NOT_NEEDED + .text + .global _start + .type _start, %function +_start: + .cfi_startproc + ldr x0, _start + ret + .cfi_endproc +.size _start, .-_start +.endif + +## Check that LDR relaxation is done in a simple function, where NOP will +## be inserted as needed. + +# RUN: llvm-mc -filetype=obj -triple aarch64-unknown-unknown \ +# RUN: --defsym RELAX_SIMPLE=1 %s -o %t.o +# RUN: %clang %cflags %t.o -o %t.so -Wl,-q +# RUN: llvm-bolt %t.so -o %t.bolt +# RUN: llvm-objdump -d %t.bolt | FileCheck %s --check-prefix=RELAX + +# RELAX: adrp +# RELAX-NEXT: ldr + +.ifdef RELAX_SIMPLE + .text + .global _start + .type _start, %function +_start: + .cfi_startproc + ldr x0, _foo + ret + .cfi_endproc +.size _start, .-_start +.endif + +## Check that LDR relaxation is done in a non-simple function, where NOP +## exists next to LDR. + +# RUN: llvm-mc -filetype=obj -triple aarch64-unknown-unknown \ +# RUN: --defsym RELAX_NON_SIMPLE=1 %s -o %t.o +# RUN: %clang %cflags %t.o -o %t.so -Wl,-q +# RUN: llvm-bolt %t.so -o %t.bolt +# RUN: llvm-objdump -d %t.bolt | FileCheck %s --check-prefix=RELAX + +.ifdef RELAX_NON_SIMPLE + .text + .global _start + .type _start, %function +_start: + .cfi_startproc + br x2 + ldr x0, _foo + nop + ret + .cfi_endproc +.size _start, .-_start +.endif + +## Check LDR relaxation works on loading W (low 32-bit of X) registers. 
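+## The relaxed sequence computes the address in the 64-bit super-register
+## (adrp x0) while the load keeps its 32-bit destination (ldr w0).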
+
+# RUN: llvm-mc -filetype=obj -triple aarch64-unknown-unknown \
+# RUN:   --defsym RELAX_SIMPLE_WREG=1 %s -o %t.o
+# RUN: %clang %cflags %t.o -o %t.so -Wl,-q
+# RUN: llvm-bolt %t.so -o %t.bolt
+# RUN: llvm-objdump -d %t.bolt | FileCheck %s --check-prefix=RELAXW
+
+# RELAXW: adrp x0
+# RELAXW-NEXT: ldr w0
+
+.ifdef RELAX_SIMPLE_WREG
+  .text
+  .global _start
+  .type _start, %function
+_start:
+  .cfi_startproc
+  ldr w0, _foo
+  ret
+  .cfi_endproc
+.size _start, .-_start
+.endif
+
+  .section .text_cold
+  .global _foo
+  .align 3
+_foo:
+  .long 0x12345678
+.size _foo, .-_foo

From b25868157df732208ced87e1e4703311f6011e61 Mon Sep 17 00:00:00 2001
From: zhijian lin <zhijian@ca.ibm.com>
Date: Tue, 4 Nov 2025 09:53:45 -0500
Subject: [PATCH 171/313] [NFC] add LLVM_ABI to function getMemcmp declaration (#166192)

Per the discussion at
https://github.com/llvm/llvm-project/pull/153600#discussion_r2356071934,
add LLVM_ABI to the getMemcmp function declaration.

---
 llvm/include/llvm/CodeGen/SelectionDAG.h | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h
index 1a5ffb38f2568..0dd4f23c6d85f 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAG.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAG.h
@@ -1260,9 +1260,15 @@ class SelectionDAG {
   /// stack arguments from being clobbered.
   LLVM_ABI SDValue getStackArgumentTokenFactor(SDValue Chain);
 
-  std::pair<SDValue, SDValue> getMemcmp(SDValue Chain, const SDLoc &dl,
-                                        SDValue Dst, SDValue Src, SDValue Size,
-                                        const CallInst *CI);
+  /// Lower a memcmp operation into a target library call and return the
+  /// resulting chain and call result as SelectionDAG SDValues.
+  LLVM_ABI std::pair<SDValue, SDValue> getMemcmp(SDValue Chain, const SDLoc &dl,
+                                                 SDValue Dst, SDValue Src,
+                                                 SDValue Size,
+                                                 const CallInst *CI);
+
+  /// Lower a strlen operation into a target library call and return the
+  /// resulting chain and call result as SelectionDAG SDValues.
   LLVM_ABI std::pair<SDValue, SDValue> getStrlen(SDValue Chain, const SDLoc &dl,
                                                  SDValue Src, const CallInst *CI);

From 3170345e20b77a917a61d02119986a53cbe08486 Mon Sep 17 00:00:00 2001
From: Tomer Shafir <tomer.shafir8@gmail.com>
Date: Tue, 4 Nov 2025 17:05:58 +0200
Subject: [PATCH 172/313] [X86] Fix LEA compression on 64-bit (#166334)

NDD ADD is only supported on 64-bit, but `LEA32` has
`Requires<[Not64BitMode]>`. The reason it doesn't fail upstream is that
the predicate check is commented out in `X86MCInstLower.cpp`:

```
  // FIXME: Enable feature predicate checks once all the test pass.
  // X86_MC::verifyInstructionPredicates(MI->getOpcode(),
  //                                     Subtarget->getFeatureBits());
```

Introduced by: https://github.com/llvm/llvm-project/pull/158254

---
 llvm/lib/Target/X86/X86CompressEVEX.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Target/X86/X86CompressEVEX.cpp b/llvm/lib/Target/X86/X86CompressEVEX.cpp
index c0c7f5adf06ef..ddbd10d8f7eda 100644
--- a/llvm/lib/Target/X86/X86CompressEVEX.cpp
+++ b/llvm/lib/Target/X86/X86CompressEVEX.cpp
@@ -272,7 +272,7 @@ static bool CompressEVEXImpl(MachineInstr &MI, MachineBasicBlock &MBB,
     const MachineOperand &Src2 = MI.getOperand(2);
     bool Is32BitReg = Opc == X86::ADD32ri_ND || Opc == X86::ADD32rr_ND;
     const MCInstrDesc &NewDesc =
-        ST.getInstrInfo()->get(Is32BitReg ? X86::LEA32r : X86::LEA64r);
+        ST.getInstrInfo()->get(Is32BitReg ?
X86::LEA64_32r : X86::LEA64r);
     if (Is32BitReg)
       Src1 = getX86SubSuperRegister(Src1, 64);
     MachineInstrBuilder MIB = BuildMI(MBB, MI, MI.getDebugLoc(), NewDesc, Dst)

From 5ba746d9a0f7850abbbd5d6f7c5fabcfe0a22e45 Mon Sep 17 00:00:00 2001
From: Rahul Joshi <rjoshi@nvidia.com>
Date: Tue, 4 Nov 2025 07:10:53 -0800
Subject: [PATCH 173/313] [NFC][TableGen] Use namespace qualifier to define `RecordKeeperImpl` (#166220)

---
 llvm/lib/TableGen/Record.cpp | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/llvm/lib/TableGen/Record.cpp b/llvm/lib/TableGen/Record.cpp
index afce803f3f568..8ad20b45f5e16 100644
--- a/llvm/lib/TableGen/Record.cpp
+++ b/llvm/lib/TableGen/Record.cpp
@@ -46,12 +46,11 @@ using namespace llvm;
 // Context
 //===----------------------------------------------------------------------===//
 
-namespace llvm::detail {
 /// This class represents the internal implementation of the RecordKeeper.
 /// It contains all of the contextual static state of the Record classes. It is
 /// kept out-of-line to simplify dependencies, and also make it easier for
 /// internal classes to access the uniquer state of the keeper.
-struct RecordKeeperImpl {
+struct detail::RecordKeeperImpl {
   RecordKeeperImpl(RecordKeeper &RK)
       : SharedBitRecTy(RK), SharedIntRecTy(RK), SharedStringRecTy(RK),
         SharedDagRecTy(RK), AnyRecord(RK, {}), TheUnsetInit(RK),
@@ -99,7 +98,6 @@ struct RecordKeeperImpl {
 
   void dumpAllocationStats(raw_ostream &OS) const;
 };
-} // namespace llvm::detail
 
 void detail::RecordKeeperImpl::dumpAllocationStats(raw_ostream &OS) const {
   // Dump memory allocation related stats.

From a2495ff991bdad98abca022e89eb93d5fb13a915 Mon Sep 17 00:00:00 2001
From: Rahul Joshi <rjoshi@nvidia.com>
Date: Tue, 4 Nov 2025 07:11:26 -0800
Subject: [PATCH 174/313] [NFC][TableGen] Emit empty lines after/before namespace scope (#166217)

Emit an empty line after a namespace scope is opened and before it is
closed. Adjust the DirectiveEmitter empty-line emission in response, to
avoid a lot of unit test changes.

---
 llvm/include/llvm/TableGen/CodeGenHelpers.h   |  5 ++--
 llvm/test/TableGen/directive1.td              |  2 ++
 llvm/test/TableGen/directive2.td              |  3 +++
 .../utils/TableGen/Basic/DirectiveEmitter.cpp | 25 ++++++-------------
 mlir/test/mlir-tblgen/cpp-class-comments.td   |  4 +++
 5 files changed, 20 insertions(+), 19 deletions(-)

diff --git a/llvm/include/llvm/TableGen/CodeGenHelpers.h b/llvm/include/llvm/TableGen/CodeGenHelpers.h
index e22c6d4f6d390..95866e306b5ff 100644
--- a/llvm/include/llvm/TableGen/CodeGenHelpers.h
+++ b/llvm/include/llvm/TableGen/CodeGenHelpers.h
@@ -20,6 +20,7 @@
 #include <string>
 
 namespace llvm {
+
 // Simple RAII helper for emitting ifdef-undef-endif scope.
 class IfDefEmitter {
 public:
@@ -57,7 +58,7 @@ class NamespaceEmitter {
   NamespaceEmitter(raw_ostream &OS, StringRef NameUntrimmed)
       : Name(trim(NameUntrimmed).str()), OS(OS) {
     if (!Name.empty())
-      OS << "namespace " << Name << " {\n";
+      OS << "namespace " << Name << " {\n\n";
   }
 
   ~NamespaceEmitter() { close(); }
@@ -65,7 +66,7 @@ class NamespaceEmitter {
   // Explicit function to close the namespace scopes.
void close() { if (!Closed && !Name.empty()) - OS << "} // namespace " << Name << "\n"; + OS << "\n} // namespace " << Name << "\n"; Closed = true; } diff --git a/llvm/test/TableGen/directive1.td b/llvm/test/TableGen/directive1.td index 475faf9254157..8648651f3d714 100644 --- a/llvm/test/TableGen/directive1.td +++ b/llvm/test/TableGen/directive1.td @@ -61,6 +61,7 @@ def TDL_DirA : Directive<[Spelling<"dira">]> { // CHECK-NEXT: #include <utility> // CHECK-EMPTY: // CHECK-NEXT: namespace llvm { +// CHECK-EMPTY: // CHECK-NEXT: namespace tdl { // CHECK-EMPTY: // CHECK-NEXT: LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE(); @@ -176,6 +177,7 @@ def TDL_DirA : Directive<[Spelling<"dira">]> { // CHECK-NEXT: template <> struct enum_iteration_traits<tdl::Clause> { // CHECK-NEXT: static constexpr bool is_iterable = true; // CHECK-NEXT: }; +// CHECK-EMPTY: // CHECK-NEXT: } // namespace llvm // CHECK-EMPTY: // CHECK-NEXT: #endif // LLVM_Tdl_INC diff --git a/llvm/test/TableGen/directive2.td b/llvm/test/TableGen/directive2.td index ccc09446b4465..96022d7647440 100644 --- a/llvm/test/TableGen/directive2.td +++ b/llvm/test/TableGen/directive2.td @@ -54,6 +54,7 @@ def TDL_DirA : Directive<[Spelling<"dira">]> { // CHECK-NEXT: #include <utility> // CHECK-EMPTY: // CHECK-NEXT: namespace llvm { +// CHECK-EMPTY: // CHECK-NEXT: namespace tdl { // CHECK-EMPTY: // CHECK-NEXT: enum class Association { @@ -132,6 +133,7 @@ def TDL_DirA : Directive<[Spelling<"dira">]> { // CHECK-NEXT: LLVM_ABI Association getDirectiveAssociation(Directive D); // CHECK-NEXT: LLVM_ABI Category getDirectiveCategory(Directive D); // CHECK-NEXT: LLVM_ABI SourceLanguage getDirectiveLanguages(Directive D); +// CHECK-EMPTY: // CHECK-NEXT: } // namespace tdl // CHECK-EMPTY: // CHECK-NEXT: template <> struct enum_iteration_traits<tdl::Association> { @@ -149,6 +151,7 @@ def TDL_DirA : Directive<[Spelling<"dira">]> { // CHECK-NEXT: template <> struct enum_iteration_traits<tdl::Clause> { // CHECK-NEXT: static constexpr bool is_iterable = true; // CHECK-NEXT: }; +// CHECK-EMPTY: // CHECK-NEXT: } // namespace llvm // CHECK-EMPTY: // CHECK-NEXT: #endif // LLVM_Tdl_INC diff --git a/llvm/utils/TableGen/Basic/DirectiveEmitter.cpp b/llvm/utils/TableGen/Basic/DirectiveEmitter.cpp index 3c6ff1132230b..d33bf45595e2e 100644 --- a/llvm/utils/TableGen/Basic/DirectiveEmitter.cpp +++ b/llvm/utils/TableGen/Basic/DirectiveEmitter.cpp @@ -81,6 +81,7 @@ static void generateEnumExports(ArrayRef<const Record *> Records, std::string N = getIdentifierName(R, Prefix); OS << "constexpr auto " << N << " = " << Enum << "::" << N << ";\n"; } + OS << "\n"; } // Generate enum class. Entries are emitted in the order in which they appear @@ -88,7 +89,6 @@ static void generateEnumExports(ArrayRef<const Record *> Records, static void generateEnumClass(ArrayRef<const Record *> Records, raw_ostream &OS, StringRef Enum, StringRef Prefix, bool ExportEnums) { - OS << "\n"; OS << "enum class " << Enum << " {\n"; if (!Records.empty()) { std::string N; @@ -104,17 +104,15 @@ static void generateEnumClass(ArrayRef<const Record *> Records, raw_ostream &OS, OS << "};\n"; OS << "\n"; OS << "static constexpr std::size_t " << Enum - << "_enumSize = " << Records.size() << ";\n"; + << "_enumSize = " << Records.size() << ";\n\n"; // Make the enum values available in the defined namespace. This allows us to // write something like Enum_X if we have a `using namespace <CppNamespace>`. 
// At the same time we do not loose the strong type guarantees of the enum // class, that is we cannot pass an unsigned as Directive without an explicit // cast. - if (ExportEnums) { - OS << "\n"; + if (ExportEnums) generateEnumExports(Records, OS, Enum, Prefix); - } } // Generate enum class with values corresponding to different bit positions. @@ -127,7 +125,6 @@ static void generateEnumBitmask(ArrayRef<const Record *> Records, StringRef Type = Records.size() <= 32 ? "uint32_t" : "uint64_t"; StringRef TypeSuffix = Records.size() <= 32 ? "U" : "ULL"; - OS << "\n"; OS << "enum class " << Enum << " : " << Type << " {\n"; std::string LastName; for (auto [I, R] : llvm::enumerate(Records)) { @@ -138,17 +135,15 @@ static void generateEnumBitmask(ArrayRef<const Record *> Records, OS << "};\n"; OS << "\n"; OS << "static constexpr std::size_t " << Enum - << "_enumSize = " << Records.size() << ";\n"; + << "_enumSize = " << Records.size() << ";\n\n"; // Make the enum values available in the defined namespace. This allows us to // write something like Enum_X if we have a `using namespace <CppNamespace>`. // At the same time we do not loose the strong type guarantees of the enum // class, that is we cannot pass an unsigned as Directive without an explicit // cast. - if (ExportEnums) { - OS << "\n"; + if (ExportEnums) generateEnumExports(Records, OS, Enum, Prefix); - } } // Generate enums for values that clauses can take. @@ -170,7 +165,6 @@ static void generateClauseEnumVal(ArrayRef<const Record *> Records, return; } - OS << "\n"; OS << "enum class " << Enum << " {\n"; for (const EnumVal Val : ClauseVals) OS << " " << Val.getRecordName() << "=" << Val.getValue() << ",\n"; @@ -182,6 +176,7 @@ static void generateClauseEnumVal(ArrayRef<const Record *> Records, OS << "constexpr auto " << CV->getName() << " = " << Enum << "::" << CV->getName() << ";\n"; } + OS << "\n"; EnumHelperFuncs += (Twine("LLVM_ABI ") + Twine(Enum) + Twine(" get") + Twine(Enum) + Twine("(StringRef Str);\n")) .str(); @@ -284,7 +279,7 @@ static void emitDirectivesDecl(const RecordKeeper &Records, raw_ostream &OS) { NamespaceEmitter DirLangNS(OS, DirLang.getCppNamespace()); if (DirLang.hasEnableBitmaskEnumInNamespace()) - OS << "\nLLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE();\n"; + OS << "LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE();\n\n"; // Emit Directive associations std::vector<const Record *> Associations; @@ -315,7 +310,6 @@ static void emitDirectivesDecl(const RecordKeeper &Records, raw_ostream &OS) { generateClauseEnumVal(DirLang.getClauses(), OS, DirLang, EnumHelperFuncs); // Generic function signatures - OS << "\n"; OS << "// Enumeration helper functions\n"; OS << "LLVM_ABI std::pair<Directive, directive::VersionRange> get" << Lang @@ -353,10 +347,7 @@ static void emitDirectivesDecl(const RecordKeeper &Records, raw_ostream &OS) { OS << "LLVM_ABI Association getDirectiveAssociation(Directive D);\n"; OS << "LLVM_ABI Category getDirectiveCategory(Directive D);\n"; OS << "LLVM_ABI SourceLanguage getDirectiveLanguages(Directive D);\n"; - if (EnumHelperFuncs.length() > 0) { - OS << EnumHelperFuncs; - OS << "\n"; - } + OS << EnumHelperFuncs; DirLangNS.close(); diff --git a/mlir/test/mlir-tblgen/cpp-class-comments.td b/mlir/test/mlir-tblgen/cpp-class-comments.td index 9dcf975e45286..0d3445d6647af 100644 --- a/mlir/test/mlir-tblgen/cpp-class-comments.td +++ b/mlir/test/mlir-tblgen/cpp-class-comments.td @@ -36,6 +36,7 @@ def A_SomeOp1 : Op<A_Dialect, "some_op1", []>{ let cppNamespace = "OP1"; // OP: namespace OP1 +// OP-EMPTY: // OP-NEXT: /// 
Some Op1 summary line1 // OP-NEXT: /// summary line2 // OP-NEXT: /// Some Op1 description @@ -97,6 +98,7 @@ def EncodingTrait : AttrInterface<"EncodingTrait"> { let methods = [ ]; // ATTR-INTERFACE: namespace mlir::a::traits { +// ATTR-INTERFACE-EMPTY: // ATTR-INTERFACE-NEXT: /// Common trait for all layouts. // ATTR-INTERFACE-NEXT: class EncodingTrait; } @@ -104,6 +106,7 @@ def EncodingTrait : AttrInterface<"EncodingTrait"> { def SimpleEncodingTrait : AttrInterface<"SimpleEncodingTrait"> { let cppNamespace = "a::traits"; // ATTR-INTERFACE: namespace a::traits { +// ATTR-INTERFACE-EMPTY: // ATTR-INTERFACE-NEXT: class SimpleEncodingTrait; } @@ -114,6 +117,7 @@ def SimpleOpInterface : OpInterface<"SimpleOpInterface"> { Simple Op Interface description }]; // OP-INTERFACE: namespace a::traits { +// OP-INTERFACE-EMPTY: // OP-INTERFACE-NEXT: /// Simple Op Interface description // OP-INTERFACE-NEXT: class SimpleOpInterface; } From c2269c842d28c28c87d41c4afc7b858038e73ad4 Mon Sep 17 00:00:00 2001 From: Kazu Hirata <kazu@google.com> Date: Tue, 4 Nov 2025 07:12:29 -0800 Subject: [PATCH 175/313] [ADT] Move llvm::to_address to STLForwardCompat.h (NFC) (#166315) This patch moves llvm::to_address to STLForwardCompat.h, a collection of backports from C++20 and beyond. --- llvm/include/llvm/ADT/STLExtras.h | 10 ---------- llvm/include/llvm/ADT/STLForwardCompat.h | 10 ++++++++++ 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/llvm/include/llvm/ADT/STLExtras.h b/llvm/include/llvm/ADT/STLExtras.h index 8de8eb5b86640..af0e4a36be1b1 100644 --- a/llvm/include/llvm/ADT/STLExtras.h +++ b/llvm/include/llvm/ADT/STLExtras.h @@ -2600,16 +2600,6 @@ bool hasNItemsOrLess(ContainerTy &&C, unsigned N) { return hasNItemsOrLess(adl_begin(C), adl_end(C), N); } -/// Returns a raw pointer that represents the same address as the argument. -/// -/// This implementation can be removed once we move to C++20 where it's defined -/// as std::to_address(). -/// -/// The std::pointer_traits<>::to_address(p) variations of these overloads has -/// not been implemented. -template <class Ptr> auto to_address(const Ptr &P) { return P.operator->(); } -template <class T> constexpr T *to_address(T *P) { return P; } - // Detect incomplete types, relying on the fact that their size is unknown. namespace detail { template <typename T> using has_sizeof = decltype(sizeof(T)); diff --git a/llvm/include/llvm/ADT/STLForwardCompat.h b/llvm/include/llvm/ADT/STLForwardCompat.h index e02694f043fbb..ad94cdede9288 100644 --- a/llvm/include/llvm/ADT/STLForwardCompat.h +++ b/llvm/include/llvm/ADT/STLForwardCompat.h @@ -134,6 +134,16 @@ struct identity // NOLINT(readability-identifier-naming) } }; +/// Returns a raw pointer that represents the same address as the argument. +/// +/// This implementation can be removed once we move to C++20 where it's defined +/// as std::to_address(). +/// +/// The std::pointer_traits<>::to_address(p) variations of these overloads has +/// not been implemented. 
+template <class Ptr> auto to_address(const Ptr &P) { return P.operator->(); }
+template <class T> constexpr T *to_address(T *P) { return P; }
+
 //===----------------------------------------------------------------------===//
 // Features from C++23
 //===----------------------------------------------------------------------===//

From 370058777be2282fe18d62315adafdda3960d012 Mon Sep 17 00:00:00 2001
From: Kazu Hirata <kazu@google.com>
Date: Tue, 4 Nov 2025 07:12:37 -0800
Subject: [PATCH 176/313] [BinaryFormat] Remove redundant declarations (NFC) (#166316)

In C++17, static constexpr members are implicitly inline, so they no
longer require an out-of-line definition.

Once we remove the redundant declarations, Minidump.cpp becomes
effectively empty, so this patch removes the file.

Identified with readability-redundant-declaration.

---
 llvm/lib/BinaryFormat/CMakeLists.txt |  1 -
 llvm/lib/BinaryFormat/Minidump.cpp   | 14 --------------
 2 files changed, 15 deletions(-)
 delete mode 100644 llvm/lib/BinaryFormat/Minidump.cpp

diff --git a/llvm/lib/BinaryFormat/CMakeLists.txt b/llvm/lib/BinaryFormat/CMakeLists.txt
index 4b2debb7ae236..0c8af1e7a4565 100644
--- a/llvm/lib/BinaryFormat/CMakeLists.txt
+++ b/llvm/lib/BinaryFormat/CMakeLists.txt
@@ -6,7 +6,6 @@ add_llvm_component_library(LLVMBinaryFormat
   ELF.cpp
   MachO.cpp
   Magic.cpp
-  Minidump.cpp
   MsgPackDocument.cpp
   MsgPackDocumentYAML.cpp
   MsgPackReader.cpp
diff --git a/llvm/lib/BinaryFormat/Minidump.cpp b/llvm/lib/BinaryFormat/Minidump.cpp
deleted file mode 100644
index b618fb1570126..0000000000000
--- a/llvm/lib/BinaryFormat/Minidump.cpp
+++ /dev/null
@@ -1,14 +0,0 @@
-//===-- Minidump.cpp - Minidump constants and structures ---------*- C++-*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/BinaryFormat/Minidump.h"
-
-using namespace llvm::minidump;
-
-constexpr uint32_t Header::MagicSignature;
-constexpr uint16_t Header::MagicVersion;

From 502742b5386836f152bb4642c0505274ef08c2d6 Mon Sep 17 00:00:00 2001
From: Kazu Hirata <kazu@google.com>
Date: Tue, 4 Nov 2025 07:12:45 -0800
Subject: [PATCH 177/313] [llvm] Proofread MergeFunctions.rst (#166317)

---
 llvm/docs/MergeFunctions.rst | 36 ++++++++++++++++++------------------
 1 file changed, 18 insertions(+), 18 deletions(-)

diff --git a/llvm/docs/MergeFunctions.rst b/llvm/docs/MergeFunctions.rst
index d43b9c3a89091..d64c846687bae 100644
--- a/llvm/docs/MergeFunctions.rst
+++ b/llvm/docs/MergeFunctions.rst
@@ -8,9 +8,9 @@ MergeFunctions pass, how it works
 Introduction
 ============
 Sometimes code contains equal functions, or functions that do exactly the same
-thing even though they are non-equal on the IR level (e.g.: multiplication on 2
-and 'shl 1'). This can happen for several reasons: mainly, the usage of
-templates and automatic code generators. Though, sometimes the user itself could
+thing even though they are non-equal on the IR level (e.g., multiplication on 2
+and ``shl 1``). This can happen for several reasons: mainly, the usage of
+templates and automatic code generators. However, sometimes the user itself could
 write the same thing twice :-)
 
 The main purpose of this pass is to recognize such functions and merge them.
@@ -20,21 +20,21 @@ describes the algorithm used to compare functions and explains how we could combine equal functions correctly to keep the module valid. -Material is brought in a top-down form, so the reader could start to learn pass +The material is presented in a top-down form, so the reader could start to learn pass from high level ideas and end with low-level algorithm details, thus preparing him or her for reading the sources. The main goal is to describe the algorithm and logic here and the concept. If you *don't want* to read the source code, but want to understand pass algorithms, this document is good for you. The author tries not to repeat the -source-code and covers only common cases to avoid the cases of needing to +source code and covers only common cases to avoid the cases of needing to update this document after any minor code changes. What should I know to be able to follow along with this document? ----------------------------------------------------------------- -The reader should be familiar with common compile-engineering principles and +The reader should be familiar with common compiler-engineering principles and LLVM code fundamentals. In this article, we assume the reader is familiar with `Single Static Assignment <http://en.wikipedia.org/wiki/Static_single_assignment_form>`_ @@ -99,7 +99,7 @@ and a ``void*`` as equal. This is just an example; more possible details are described a bit below. As another example, the reader may imagine two more functions. The first -function performs a multiplication by 2, while the second one performs an +function performs a multiplication by 2, while the second one performs a logical left shift by 1. Possible solutions @@ -131,7 +131,7 @@ access lookup? The answer is: "yes". Random-access """"""""""""" How can this be done? Just convert each function to a number, and gather -all of them in a special hash-table. Functions with equal hashes are equal. +all of them in a special hash table. Functions with equal hashes are equal. Good hashing means, that every function part must be taken into account. That means we have to convert every function part into some number, and then add it into the hash. The lookup-up time would be small, but such an approach adds some @@ -175,7 +175,7 @@ merged with each other. It is defined as: ``std::set<FunctionNode> FnTree;`` -Here ``FunctionNode`` is a wrapper for ``llvm::Function`` class, with +Here, ``FunctionNode`` is a wrapper for ``llvm::Function`` class, with an implemented “<” operator among the functions set (below we explain how it works exactly; this is a key point in fast functions comparison). @@ -207,7 +207,7 @@ from method. Comparison and logarithmical search """"""""""""""""""""""""""""""""""" Let's recall our task: for every function *F* from module *M*, we have to find -equal functions *F`* in the shortest time possible , and merge them into a +equal functions *F`* in the shortest time possible and merge them into a single function. Defining total ordering among the functions set allows us to organize @@ -225,7 +225,7 @@ possible values: 1, left is *greater* than right. -Of course it means, that we have to maintain +Of course, it means that we have to maintain *strict and non-strict order relation properties*: * reflexivity (``a <= a``, ``a == a``, ``a >= a``), @@ -235,7 +235,7 @@ Of course it means, that we have to maintain As mentioned before, the comparison routine consists of "sub-comparison-routines", with each of them also consisting of -"sub-comparison-routines", and so on. 
Finally, it ends up with primitive +"sub-comparison-routines", and so on. Finally, it ends up with a primitive comparison. Below, we will use the following operations: @@ -275,7 +275,7 @@ A brief look at the source code tells us that the comparison starts in the “``int FunctionComparator::compare(void)``” method. 1. The first parts to be compared are the function's attributes and some -properties that is outside the “attributes” term, but still could make the +properties that are outside the “attributes” term, but still could make the function different without changing its body. This part of the comparison is usually done within simple *cmpNumbers* or *cmpFlags* operations (e.g. ``cmpFlags(F1->hasGC(), F2->hasGC())``). Below is a full list of function's @@ -365,7 +365,7 @@ comparing them as numbers. 7. Complex types (structures, arrays, etc.). Follow complex objects comparison technique (see the very first paragraph of this chapter). Both *left* and *right* are to be expanded and their element types will be checked the same -way. If we get -1 or 1 on some stage, return it. Otherwise return 0. +way. If we get -1 or 1 on some stage, return it. Otherwise, return 0. 8. Steps 1-6 describe all the possible cases, if we passed steps 1-6 and didn't get any conclusions, then invoke ``llvm_unreachable``, since it's quite an @@ -445,7 +445,7 @@ How to implement cmpValues? but, in general, we need to implement antisymmetric relation. As mentioned above, to understand what is *less*, we can use order in which we meet values. If both values have the same order in a function (met at the same -time), we then treat values as *associated*. Otherwise – it depends on who was +time), we then treat values as *associated*. Otherwise, it depends on who was first. Every time we run the top-level compare method, we initialize two identical @@ -623,7 +623,7 @@ to use ``accumulateConstantOffset`` method. So, if we get constant offset for both left and right *GEPs*, then compare it as numbers, and return comparison result. -Otherwise treat it like a regular operation (see previous paragraph). +Otherwise, treat it like a regular operation (see previous paragraph). cmpOperation ------------ @@ -742,7 +742,7 @@ We call ``writeThunkOrAlias(Function *F, Function *G)``. Here we try to replace referenced anywhere, * function should come with external, local or weak linkage. -Otherwise we write thunk: some wrapper that has *G's* interface and calls *F*, +Otherwise, we write thunk: some wrapper that has *G's* interface and calls *F*, so *G* could be replaced with this wrapper. *writeAlias* @@ -772,7 +772,7 @@ As it written in method comments: “Replace G with a simple tail call to bitcast(F). Also replace direct uses of G with bitcast(F). Deletes G.” -In general it does the same as usual when we want to replace callee, except the +In general, it does the same as usual when we want to replace callee, except the first point: 1. We generate tail call wrapper around *F*, but with an interface that allows using From 50faea28fb93c5938391fdc0a2cfd70b28280537 Mon Sep 17 00:00:00 2001 From: Kazu Hirata <kazu@google.com> Date: Tue, 4 Nov 2025 07:12:53 -0800 Subject: [PATCH 178/313] [llvm] Use conventional enum declarations (NFC) (#166318) This patch replaces: using Foo = enum { A, B, C }; with the more conventional: enum Foo { A, B, C }; These two enum declaration styles are not identical, but their difference does not matter in these .cpp files. 
With the "using Foo" style, the enum is unnamed and cannot be forward-declared, whereas the conventional style creates a named enum that can be. Since these changes are confined to .cpp files, this distinction has no practical impact here. --- llvm/lib/Target/AArch64/AArch64FastISel.cpp | 5 +---- llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp | 2 +- llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp | 2 +- llvm/lib/Target/ARM/ARMFastISel.cpp | 2 +- llvm/lib/Target/Mips/MipsFastISel.cpp | 2 +- llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp | 2 +- llvm/lib/Target/X86/X86VZeroUpper.cpp | 2 +- llvm/unittests/ADT/FallibleIteratorTest.cpp | 4 ++-- 8 files changed, 9 insertions(+), 12 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64FastISel.cpp b/llvm/lib/Target/AArch64/AArch64FastISel.cpp index cf344980cbaae..18e246e5af57d 100644 --- a/llvm/lib/Target/AArch64/AArch64FastISel.cpp +++ b/llvm/lib/Target/AArch64/AArch64FastISel.cpp @@ -81,10 +81,7 @@ namespace { class AArch64FastISel final : public FastISel { class Address { public: - using BaseKind = enum { - RegBase, - FrameIndexBase - }; + enum BaseKind { RegBase, FrameIndexBase }; private: BaseKind Kind = RegBase; diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp index 52cc4ca5a955c..1a14629fb66b3 100644 --- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp +++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp @@ -435,7 +435,7 @@ void GCNHazardRecognizer::RecedeCycle() { // Helper Functions //===----------------------------------------------------------------------===// -using HazardFnResult = enum { HazardFound, HazardExpired, NoHazardFound }; +enum HazardFnResult { HazardFound, HazardExpired, NoHazardFound }; using IsExpiredFn = function_ref<bool(const MachineInstr &, int WaitStates)>; using GetNumWaitStatesFn = function_ref<unsigned int(const MachineInstr &)>; diff --git a/llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp b/llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp index 959ce6904ce4d..1682abbdea169 100644 --- a/llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp +++ b/llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp @@ -43,7 +43,7 @@ class GCNNSAReassignImpl { bool run(MachineFunction &MF); private: - using NSA_Status = enum { + enum NSA_Status { NOT_NSA, // Not an NSA instruction FIXED, // NSA which we cannot modify NON_CONTIGUOUS, // NSA with non-sequential address which we can try diff --git a/llvm/lib/Target/ARM/ARMFastISel.cpp b/llvm/lib/Target/ARM/ARMFastISel.cpp index 14e1160e70dae..88d3b6f7d5bb9 100644 --- a/llvm/lib/Target/ARM/ARMFastISel.cpp +++ b/llvm/lib/Target/ARM/ARMFastISel.cpp @@ -86,7 +86,7 @@ namespace { // All possible address modes, plus some. class Address { public: - using BaseKind = enum { RegBase, FrameIndexBase }; + enum BaseKind { RegBase, FrameIndexBase }; private: BaseKind Kind = RegBase; diff --git a/llvm/lib/Target/Mips/MipsFastISel.cpp b/llvm/lib/Target/Mips/MipsFastISel.cpp index df0c8c13fa38d..06210b6b91b93 100644 --- a/llvm/lib/Target/Mips/MipsFastISel.cpp +++ b/llvm/lib/Target/Mips/MipsFastISel.cpp @@ -82,7 +82,7 @@ class MipsFastISel final : public FastISel { // All possible address modes. 
class Address { public: - using BaseKind = enum { RegBase, FrameIndexBase }; + enum BaseKind { RegBase, FrameIndexBase }; private: BaseKind Kind = RegBase; diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp index 2666342d0c7b9..66ed8b078b808 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp @@ -46,7 +46,7 @@ class WebAssemblyFastISel final : public FastISel { // All possible address modes. class Address { public: - using BaseKind = enum { RegBase, FrameIndexBase }; + enum BaseKind { RegBase, FrameIndexBase }; private: BaseKind Kind = RegBase; diff --git a/llvm/lib/Target/X86/X86VZeroUpper.cpp b/llvm/lib/Target/X86/X86VZeroUpper.cpp index f6f7e92d98578..2f28ab36aa193 100644 --- a/llvm/lib/Target/X86/X86VZeroUpper.cpp +++ b/llvm/lib/Target/X86/X86VZeroUpper.cpp @@ -66,7 +66,7 @@ namespace { MachineBasicBlock &MBB); void addDirtySuccessor(MachineBasicBlock &MBB); - using BlockExitState = enum { PASS_THROUGH, EXITS_CLEAN, EXITS_DIRTY }; + enum BlockExitState { PASS_THROUGH, EXITS_CLEAN, EXITS_DIRTY }; static const char* getBlockExitStateName(BlockExitState ST); diff --git a/llvm/unittests/ADT/FallibleIteratorTest.cpp b/llvm/unittests/ADT/FallibleIteratorTest.cpp index d3389744ffbfe..c17aa0393dfcb 100644 --- a/llvm/unittests/ADT/FallibleIteratorTest.cpp +++ b/llvm/unittests/ADT/FallibleIteratorTest.cpp @@ -19,8 +19,8 @@ using namespace llvm; namespace { -using ItemValid = enum { ValidItem, InvalidItem }; -using LinkValid = enum { ValidLink, InvalidLink }; +enum ItemValid { ValidItem, InvalidItem }; +enum LinkValid { ValidLink, InvalidLink }; class Item { public: From bcb3d2f5122276ed9969fe2b2ef4428652800377 Mon Sep 17 00:00:00 2001 From: Hans Wennborg <hans@hanshq.net> Date: Tue, 4 Nov 2025 16:41:30 +0100 Subject: [PATCH 179/313] build_llvm_release.bat fixes (#166385) Some followups after #131687 switched to the "runtimes build". - The `check-sanitizer` build target doesn't exist in the runtimes build; use `check-runtimes` instead. - ASan is not supported on 32-bit windows. Pass `-DCOMPILER_RT_BUILD_SANITIZERS=OFF` - `check-runtimes` includes the orcjit tests, which never passed on windows; build with `-DCOMPILER_RT_BUILD_ORC=OFF` - Various asan and libfuzzer tests fail; suppress them with `LIT_FILTER_OUT` --- llvm/utils/release/build_llvm_release.bat | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/llvm/utils/release/build_llvm_release.bat b/llvm/utils/release/build_llvm_release.bat index 001339f2a8f05..0764c7af86c0a 100644 --- a/llvm/utils/release/build_llvm_release.bat +++ b/llvm/utils/release/build_llvm_release.bat @@ -1,6 +1,9 @@ @echo off -setlocal enabledelayedexpansion +REM Filter out tests that are known to fail. +set "LIT_FILTER_OUT=gh110231.cpp|crt_initializers.cpp|init-order-atexit.cpp|use_after_return_linkage.cpp|initialization-bug.cpp|initialization-bug-no-global.cpp|trace-malloc-unbalanced.test|trace-malloc-2.test|TraceMallocTest" + +setlocal enabledelayedexpansion goto begin :usage @@ -24,6 +27,7 @@ echo. 
echo Example: build_llvm_release.bat --version 15.0.0 --x86 --x64 exit /b 1 + :begin ::============================================================================== @@ -163,7 +167,8 @@ set common_cmake_flags=^ -DCMAKE_CXX_FLAGS="%common_compiler_flags%" ^ -DLLVM_ENABLE_RPMALLOC=ON ^ -DLLVM_ENABLE_PROJECTS="clang;clang-tools-extra;lld" ^ - -DLLVM_ENABLE_RUNTIMES="compiler-rt;openmp" + -DLLVM_ENABLE_RUNTIMES="compiler-rt;openmp" ^ + -DCOMPILER_RT_BUILD_ORC=OFF if "%force-msvc%" == "" ( where /q clang-cl @@ -215,6 +220,7 @@ set "stage0_bin_dir=%build_dir%/build32_stage0/bin" set cmake_flags=^ %common_cmake_flags% ^ -DLLVM_ENABLE_RPMALLOC=OFF ^ + -DCOMPILER_RT_BUILD_SANITIZERS=OFF ^ -DPython3_ROOT_DIR=%PYTHONHOME% ^ -DLIBXML2_INCLUDE_DIR=%libxmldir%/include/libxml2 ^ -DLIBXML2_LIBRARIES=%libxmldir%/lib/libxml2s.lib @@ -224,7 +230,7 @@ ninja || ninja || ninja || exit /b 1 REM ninja check-llvm || ninja check-llvm || ninja check-llvm || exit /b 1 REM ninja check-clang || ninja check-clang || ninja check-clang || exit /b 1 ninja check-lld || ninja check-lld || ninja check-lld || exit /b 1 -ninja check-sanitizer || ninja check-sanitizer || ninja check-sanitizer || exit /b 1 +ninja check-runtimes || ninja check-runtimes || ninja check-runtimes || exit /b 1 REM ninja check-clang-tools || ninja check-clang-tools || ninja check-clang-tools || exit /b 1 cd.. @@ -233,6 +239,7 @@ REM with forward slash. set all_cmake_flags=^ %cmake_flags% ^ -DLLVM_ENABLE_PROJECTS="clang;clang-tools-extra;lld;lldb;" ^ + -DCOMPILER_RT_BUILD_SANITIZERS=OFF ^ %common_lldb_flags% ^ -DPYTHON_HOME=%PYTHONHOME% ^ -DCMAKE_C_COMPILER=%stage0_bin_dir%/clang-cl.exe ^ @@ -249,7 +256,7 @@ ninja || ninja || ninja || exit /b 1 REM ninja check-llvm || ninja check-llvm || ninja check-llvm || exit /b 1 REM ninja check-clang || ninja check-clang || ninja check-clang || exit /b 1 ninja check-lld || ninja check-lld || ninja check-lld || exit /b 1 -ninja check-sanitizer || ninja check-sanitizer || ninja check-sanitizer || exit /b 1 +ninja check-runtimes || ninja check-runtimes || ninja check-runtimes || exit /b 1 REM ninja check-clang-tools || ninja check-clang-tools || ninja check-clang-tools || exit /b 1 ninja package || exit /b 1 cd .. From 2286118e6f2cda56b78d2e6b0193dd6f0ca7b7ea Mon Sep 17 00:00:00 2001 From: Alex Voicu <alexandru.voicu@amd.com> Date: Tue, 4 Nov 2025 18:10:26 +0200 Subject: [PATCH 180/313] [SPIRV] Enable `bfloat16` arithmetic (#166031) Enable the `SPV_INTEL_bfloat16_arithmetic` extension, which allows arithmetic, relational and `OpExtInst` instructions to take `bfloat16` arguments. This patch only adds support to arithmetic and relational ops. The extension itself is rather fresh, but `bfloat16` is ubiquitous at this point and not supporting these ops is limiting. 
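For illustration, with `SPV_KHR_bfloat16` and `SPV_INTEL_bfloat16_arithmetic`
enabled, IR such as the following (a minimal sketch; the tests added below
cover the full set of arithmetic and relational ops) now lowers to `OpFAdd`
on a 16-bit bfloat `OpTypeFloat` instead of being rejected:

```llvm
define spir_func bfloat @add(bfloat %x, bfloat %y) {
entry:
  %r = fadd bfloat %x, %y
  ret bfloat %r
}
```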
--- llvm/docs/SPIRVUsage.rst | 2 + llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp | 16 +- llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp | 2 + llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp | 61 ++- .../lib/Target/SPIRV/SPIRVSymbolicOperands.td | 3 + .../bfloat16-arithmetic.ll | 142 +++++++ .../bfloat16-relational.ll | 376 ++++++++++++++++++ 7 files changed, 595 insertions(+), 7 deletions(-) create mode 100644 llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_bfloat16_arithmetic/bfloat16-arithmetic.ll create mode 100644 llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_bfloat16_arithmetic/bfloat16-relational.ll diff --git a/llvm/docs/SPIRVUsage.rst b/llvm/docs/SPIRVUsage.rst index 99f56a5cbc63a..749961356d23e 100644 --- a/llvm/docs/SPIRVUsage.rst +++ b/llvm/docs/SPIRVUsage.rst @@ -173,6 +173,8 @@ Below is a list of supported SPIR-V extensions, sorted alphabetically by their e - Allows generating arbitrary width integer types. * - ``SPV_INTEL_bindless_images`` - Adds instructions to convert convert unsigned integer handles to images, samplers and sampled images. + * - ``SPV_INTEL_bfloat16_arithmetic`` + - Allows the use of 16-bit bfloat16 values in arithmetic and relational operators. * - ``SPV_INTEL_bfloat16_conversion`` - Adds instructions to convert between single-precision 32-bit floating-point values and 16-bit bfloat16 values. * - ``SPV_INTEL_cache_controls`` diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp index 1fc90d0852aad..4fd220481cedb 100644 --- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp +++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp @@ -294,6 +294,10 @@ void IRTranslator::addMachineCFGPred(CFGEdge Edge, MachineBasicBlock *NewPred) { MachinePreds[Edge].push_back(NewPred); } +static bool targetSupportsBF16Type(const MachineFunction *MF) { + return MF->getTarget().getTargetTriple().isSPIRV(); +} + static bool containsBF16Type(const User &U) { // BF16 cannot currently be represented by LLT, to avoid miscompiles we // prevent any instructions using them. FIXME: This can be removed once LLT @@ -306,7 +310,7 @@ static bool containsBF16Type(const User &U) { bool IRTranslator::translateBinaryOp(unsigned Opcode, const User &U, MachineIRBuilder &MIRBuilder) { - if (containsBF16Type(U)) + if (containsBF16Type(U) && !targetSupportsBF16Type(MF)) return false; // Get or create a virtual register for each value. 
@@ -328,7 +332,7 @@ bool IRTranslator::translateBinaryOp(unsigned Opcode, const User &U, bool IRTranslator::translateUnaryOp(unsigned Opcode, const User &U, MachineIRBuilder &MIRBuilder) { - if (containsBF16Type(U)) + if (containsBF16Type(U) && !targetSupportsBF16Type(MF)) return false; Register Op0 = getOrCreateVReg(*U.getOperand(0)); @@ -348,7 +352,7 @@ bool IRTranslator::translateFNeg(const User &U, MachineIRBuilder &MIRBuilder) { bool IRTranslator::translateCompare(const User &U, MachineIRBuilder &MIRBuilder) { - if (containsBF16Type(U)) + if (containsBF16Type(U) && !targetSupportsBF16Type(MF)) return false; auto *CI = cast<CmpInst>(&U); @@ -1569,7 +1573,7 @@ bool IRTranslator::translateBitCast(const User &U, bool IRTranslator::translateCast(unsigned Opcode, const User &U, MachineIRBuilder &MIRBuilder) { - if (containsBF16Type(U)) + if (containsBF16Type(U) && !targetSupportsBF16Type(MF)) return false; uint32_t Flags = 0; @@ -2688,7 +2692,7 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID, bool IRTranslator::translateInlineAsm(const CallBase &CB, MachineIRBuilder &MIRBuilder) { - if (containsBF16Type(CB)) + if (containsBF16Type(CB) && !targetSupportsBF16Type(MF)) return false; const InlineAsmLowering *ALI = MF->getSubtarget().getInlineAsmLowering(); @@ -2779,7 +2783,7 @@ bool IRTranslator::translateCallBase(const CallBase &CB, } bool IRTranslator::translateCall(const User &U, MachineIRBuilder &MIRBuilder) { - if (!MF->getTarget().getTargetTriple().isSPIRV() && containsBF16Type(U)) + if (containsBF16Type(U) && !targetSupportsBF16Type(MF)) return false; const CallInst &CI = cast<CallInst>(U); diff --git a/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp b/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp index f0558ebcb6681..43b2869cecdf7 100644 --- a/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp @@ -107,6 +107,8 @@ static const std::map<std::string, SPIRV::Extension::Extension, std::less<>> SPIRV::Extension::Extension::SPV_INTEL_inline_assembly}, {"SPV_INTEL_bindless_images", SPIRV::Extension::Extension::SPV_INTEL_bindless_images}, + {"SPV_INTEL_bfloat16_arithmetic", + SPIRV::Extension::Extension::SPV_INTEL_bfloat16_arithmetic}, {"SPV_INTEL_bfloat16_conversion", SPIRV::Extension::Extension::SPV_INTEL_bfloat16_conversion}, {"SPV_KHR_subgroup_rotate", diff --git a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp index d154a06c6f313..e5ac76c405841 100644 --- a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp @@ -1435,6 +1435,8 @@ void addInstrRequirements(const MachineInstr &MI, addPrintfRequirements(MI, Reqs, ST); break; } + // TODO: handle bfloat16 extended instructions when + // SPV_INTEL_bfloat16_arithmetic is enabled. 
break; } case SPIRV::OpAliasDomainDeclINTEL: @@ -2060,7 +2062,64 @@ void addInstrRequirements(const MachineInstr &MI, Reqs.addCapability(SPIRV::Capability::PredicatedIOINTEL); break; } - + case SPIRV::OpFAddS: + case SPIRV::OpFSubS: + case SPIRV::OpFMulS: + case SPIRV::OpFDivS: + case SPIRV::OpFRemS: + case SPIRV::OpFMod: + case SPIRV::OpFNegate: + case SPIRV::OpFAddV: + case SPIRV::OpFSubV: + case SPIRV::OpFMulV: + case SPIRV::OpFDivV: + case SPIRV::OpFRemV: + case SPIRV::OpFNegateV: { + const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo(); + SPIRVType *TypeDef = MRI.getVRegDef(MI.getOperand(1).getReg()); + if (TypeDef->getOpcode() == SPIRV::OpTypeVector) + TypeDef = MRI.getVRegDef(TypeDef->getOperand(1).getReg()); + if (isBFloat16Type(TypeDef)) { + if (!ST.canUseExtension(SPIRV::Extension::SPV_INTEL_bfloat16_arithmetic)) + report_fatal_error( + "Arithmetic instructions with bfloat16 arguments require the " + "following SPIR-V extension: SPV_INTEL_bfloat16_arithmetic", + false); + Reqs.addExtension(SPIRV::Extension::SPV_INTEL_bfloat16_arithmetic); + Reqs.addCapability(SPIRV::Capability::BFloat16ArithmeticINTEL); + } + break; + } + case SPIRV::OpOrdered: + case SPIRV::OpUnordered: + case SPIRV::OpFOrdEqual: + case SPIRV::OpFOrdNotEqual: + case SPIRV::OpFOrdLessThan: + case SPIRV::OpFOrdLessThanEqual: + case SPIRV::OpFOrdGreaterThan: + case SPIRV::OpFOrdGreaterThanEqual: + case SPIRV::OpFUnordEqual: + case SPIRV::OpFUnordNotEqual: + case SPIRV::OpFUnordLessThan: + case SPIRV::OpFUnordLessThanEqual: + case SPIRV::OpFUnordGreaterThan: + case SPIRV::OpFUnordGreaterThanEqual: { + const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo(); + MachineInstr *OperandDef = MRI.getVRegDef(MI.getOperand(2).getReg()); + SPIRVType *TypeDef = MRI.getVRegDef(OperandDef->getOperand(1).getReg()); + if (TypeDef->getOpcode() == SPIRV::OpTypeVector) + TypeDef = MRI.getVRegDef(TypeDef->getOperand(1).getReg()); + if (isBFloat16Type(TypeDef)) { + if (!ST.canUseExtension(SPIRV::Extension::SPV_INTEL_bfloat16_arithmetic)) + report_fatal_error( + "Relational instructions with bfloat16 arguments require the " + "following SPIR-V extension: SPV_INTEL_bfloat16_arithmetic", + false); + Reqs.addExtension(SPIRV::Extension::SPV_INTEL_bfloat16_arithmetic); + Reqs.addCapability(SPIRV::Capability::BFloat16ArithmeticINTEL); + } + break; + } default: break; } diff --git a/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td b/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td index 267118364c371..1b4b29bbb160a 100644 --- a/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td +++ b/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td @@ -387,6 +387,8 @@ defm SPV_INTEL_tensor_float32_conversion : ExtensionOperand<125, [EnvOpenCL]>; defm SPV_KHR_bfloat16 : ExtensionOperand<126, [EnvVulkan, EnvOpenCL]>; defm SPV_INTEL_predicated_io : ExtensionOperand<127, [EnvOpenCL]>; defm SPV_KHR_maximal_reconvergence : ExtensionOperand<128, [EnvVulkan]>; +defm SPV_INTEL_bfloat16_arithmetic + : ExtensionOperand<129, [EnvVulkan, EnvOpenCL]>; //===----------------------------------------------------------------------===// // Multiclass used to define Capabilities enum values and at the same time @@ -570,6 +572,7 @@ defm AtomicFloat64MinMaxEXT : CapabilityOperand<5613, 0, 0, [SPV_EXT_shader_atom defm VariableLengthArrayINTEL : CapabilityOperand<5817, 0, 0, [SPV_INTEL_variable_length_array], []>; defm GroupUniformArithmeticKHR : CapabilityOperand<6400, 0, 0, [SPV_KHR_uniform_group_instructions], []>; defm USMStorageClassesINTEL : CapabilityOperand<5935, 0, 0, 
[SPV_INTEL_usm_storage_classes], [Kernel]>; +defm BFloat16ArithmeticINTEL : CapabilityOperand<6226, 0, 0, [SPV_INTEL_bfloat16_arithmetic], []>; defm BFloat16ConversionINTEL : CapabilityOperand<6115, 0, 0, [SPV_INTEL_bfloat16_conversion], []>; defm GlobalVariableHostAccessINTEL : CapabilityOperand<6187, 0, 0, [SPV_INTEL_global_variable_host_access], []>; defm HostAccessINTEL : CapabilityOperand<6188, 0, 0, [SPV_INTEL_global_variable_host_access], []>; diff --git a/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_bfloat16_arithmetic/bfloat16-arithmetic.ll b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_bfloat16_arithmetic/bfloat16-arithmetic.ll new file mode 100644 index 0000000000000..4cabddb94df25 --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_bfloat16_arithmetic/bfloat16-arithmetic.ll @@ -0,0 +1,142 @@ +; RUN: not llc -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_KHR_bfloat16 %s -o %t.spvt 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_INTEL_bfloat16_arithmetic,+SPV_KHR_bfloat16 %s -o - | FileCheck %s +; TODO: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_INTEL_bfloat16_arithmetic,+SPV_KHR_bfloat16 %s -o - -filetype=obj | spirv-val %} + +; CHECK-ERROR: LLVM ERROR: Arithmetic instructions with bfloat16 arguments require the following SPIR-V extension: SPV_INTEL_bfloat16_arithmetic + +; CHECK-DAG: OpCapability BFloat16TypeKHR +; CHECK-DAG: OpCapability BFloat16ArithmeticINTEL +; CHECK-DAG: OpExtension "SPV_KHR_bfloat16" +; CHECK-DAG: OpExtension "SPV_INTEL_bfloat16_arithmetic" +; CHECK-DAG: OpName [[NEG:%.*]] "neg" +; CHECK-DAG: OpName [[NEGV:%.*]] "negv" +; CHECK-DAG: OpName [[ADD:%.*]] "add" +; CHECK-DAG: OpName [[ADDV:%.*]] "addv" +; CHECK-DAG: OpName [[SUB:%.*]] "sub" +; CHECK-DAG: OpName [[SUBV:%.*]] "subv" +; CHECK-DAG: OpName [[MUL:%.*]] "mul" +; CHECK-DAG: OpName [[MULV:%.*]] "mulv" +; CHECK-DAG: OpName [[DIV:%.*]] "div" +; CHECK-DAG: OpName [[DIVV:%.*]] "divv" +; CHECK-DAG: OpName [[REM:%.*]] "rem" +; CHECK-DAG: OpName [[REMV:%.*]] "remv" +; CHECK: [[BFLOAT:%.*]] = OpTypeFloat 16 0 +; CHECK: [[BFLOATV:%.*]] = OpTypeVector [[BFLOAT]] 4 + +; CHECK-DAG: [[NEG]] = OpFunction [[BFLOAT]] +; CHECK: [[X:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-DAG: [[R:%.*]] = OpFNegate [[BFLOAT]] [[X]] +define spir_func bfloat @neg(bfloat %x) { +entry: + %r = fneg bfloat %x + ret bfloat %r +} + +; CHECK-DAG: [[NEGV]] = OpFunction [[BFLOATV]] +; CHECK: [[X:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-DAG: [[R:%.*]] = OpFNegate [[BFLOATV]] [[X]] +define spir_func <4 x bfloat> @negv(<4 x bfloat> %x) { +entry: + %r = fneg <4 x bfloat> %x + ret <4 x bfloat> %r +} + +; CHECK-DAG: [[ADD]] = OpFunction [[BFLOAT]] +; CHECK: [[X:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK: [[Y:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-DAG: [[R:%.*]] = OpFAdd [[BFLOAT]] [[X]] [[Y]] +define spir_func bfloat @add(bfloat %x, bfloat %y) { +entry: + %r = fadd bfloat %x, %y + ret bfloat %r +} + +; CHECK-DAG: [[ADDV]] = OpFunction [[BFLOATV]] +; CHECK: [[X:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK: [[Y:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-DAG: [[R:%.*]] = OpFAdd [[BFLOATV]] [[X]] [[Y]] +define spir_func <4 x bfloat> @addv(<4 x bfloat> %x, <4 x bfloat> %y) { +entry: + %r = fadd <4 x bfloat> %x, %y + ret <4 x bfloat> %r +} + +; CHECK-DAG: [[SUB]] = OpFunction [[BFLOAT]] +; CHECK: [[X:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK: [[Y:%.*]] = OpFunctionParameter 
[[BFLOAT]] +; CHECK-DAG: [[R:%.*]] = OpFSub [[BFLOAT]] [[X]] [[Y]] +define spir_func bfloat @sub(bfloat %x, bfloat %y) { +entry: + %r = fsub bfloat %x, %y + ret bfloat %r +} + +; CHECK-DAG: [[SUBV]] = OpFunction [[BFLOATV]] +; CHECK: [[X:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK: [[Y:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-DAG: [[R:%.*]] = OpFSub [[BFLOATV]] [[X]] [[Y]] +define spir_func <4 x bfloat> @subv(<4 x bfloat> %x, <4 x bfloat> %y) { +entry: + %r = fsub <4 x bfloat> %x, %y + ret <4 x bfloat> %r +} + +; CHECK-DAG: [[MUL]] = OpFunction [[BFLOAT]] +; CHECK: [[X:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK: [[Y:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-DAG: [[R:%.*]] = OpFMul [[BFLOAT]] [[X]] [[Y]] +define spir_func bfloat @mul(bfloat %x, bfloat %y) { +entry: + %r = fmul bfloat %x, %y + ret bfloat %r +} + +; CHECK-DAG: [[MULV]] = OpFunction [[BFLOATV]] +; CHECK: [[X:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK: [[Y:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-DAG: [[R:%.*]] = OpFMul [[BFLOATV]] [[X]] [[Y]] +define spir_func <4 x bfloat> @mulv(<4 x bfloat> %x, <4 x bfloat> %y) { +entry: + %r = fmul <4 x bfloat> %x, %y + ret <4 x bfloat> %r +} + +; CHECK-DAG: [[DIV]] = OpFunction [[BFLOAT]] +; CHECK: [[X:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK: [[Y:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-DAG: [[R:%.*]] = OpFDiv [[BFLOAT]] [[X]] [[Y]] +define spir_func bfloat @div(bfloat %x, bfloat %y) { +entry: + %r = fdiv bfloat %x, %y + ret bfloat %r +} + +; CHECK-DAG: [[DIVV]] = OpFunction [[BFLOATV]] +; CHECK: [[X:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK: [[Y:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-DAG: [[R:%.*]] = OpFDiv [[BFLOATV]] [[X]] [[Y]] +define spir_func <4 x bfloat> @divv(<4 x bfloat> %x, <4 x bfloat> %y) { +entry: + %r = fdiv <4 x bfloat> %x, %y + ret <4 x bfloat> %r +} + +; CHECK-DAG: [[REM]] = OpFunction [[BFLOAT]] +; CHECK: [[X:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK: [[Y:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-DAG: [[R:%.*]] = OpFRem [[BFLOAT]] [[X]] [[Y]] +define spir_func bfloat @rem(bfloat %x, bfloat %y) { +entry: + %r = frem bfloat %x, %y + ret bfloat %r +} + +; CHECK-DAG: [[REMV]] = OpFunction [[BFLOATV]] +; CHECK: [[X:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK: [[Y:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-DAG: [[R:%.*]] = OpFRem [[BFLOATV]] [[X]] [[Y]] +define spir_func <4 x bfloat> @remv(<4 x bfloat> %x, <4 x bfloat> %y) { +entry: + %r = frem <4 x bfloat> %x, %y + ret <4 x bfloat> %r +} diff --git a/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_bfloat16_arithmetic/bfloat16-relational.ll b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_bfloat16_arithmetic/bfloat16-relational.ll new file mode 100644 index 0000000000000..3774791d58f87 --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_bfloat16_arithmetic/bfloat16-relational.ll @@ -0,0 +1,376 @@ +; RUN: not llc -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_KHR_bfloat16 %s -o %t.spvt 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_INTEL_bfloat16_arithmetic,+SPV_KHR_bfloat16 %s -o - | FileCheck %s +; TODO: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_INTEL_bfloat16_arithmetic,+SPV_KHR_bfloat16 %s -o - -filetype=obj | spirv-val %} + +; CHECK-ERROR: LLVM ERROR: Relational instructions with bfloat16 arguments require the following SPIR-V extension: SPV_INTEL_bfloat16_arithmetic + +; CHECK-DAG: OpCapability 
BFloat16TypeKHR +; CHECK-DAG: OpCapability BFloat16ArithmeticINTEL +; CHECK-DAG: OpExtension "SPV_KHR_bfloat16" +; CHECK-DAG: OpExtension "SPV_INTEL_bfloat16_arithmetic" +; CHECK-DAG: OpName [[UEQ:%.*]] "test_ueq" +; CHECK-DAG: OpName [[OEQ:%.*]] "test_oeq" +; CHECK-DAG: OpName [[UNE:%.*]] "test_une" +; CHECK-DAG: OpName [[ONE:%.*]] "test_one" +; CHECK-DAG: OpName [[ULT:%.*]] "test_ult" +; CHECK-DAG: OpName [[OLT:%.*]] "test_olt" +; CHECK-DAG: OpName [[ULE:%.*]] "test_ule" +; CHECK-DAG: OpName [[OLE:%.*]] "test_ole" +; CHECK-DAG: OpName [[UGT:%.*]] "test_ugt" +; CHECK-DAG: OpName [[OGT:%.*]] "test_ogt" +; CHECK-DAG: OpName [[UGE:%.*]] "test_uge" +; CHECK-DAG: OpName [[OGE:%.*]] "test_oge" +; CHECK-DAG: OpName [[UNO:%.*]] "test_uno" +; CHECK-DAG: OpName [[ORD:%.*]] "test_ord" +; CHECK-DAG: OpName [[v3UEQ:%.*]] "test_v3_ueq" +; CHECK-DAG: OpName [[v3OEQ:%.*]] "test_v3_oeq" +; CHECK-DAG: OpName [[v3UNE:%.*]] "test_v3_une" +; CHECK-DAG: OpName [[v3ONE:%.*]] "test_v3_one" +; CHECK-DAG: OpName [[v3ULT:%.*]] "test_v3_ult" +; CHECK-DAG: OpName [[v3OLT:%.*]] "test_v3_olt" +; CHECK-DAG: OpName [[v3ULE:%.*]] "test_v3_ule" +; CHECK-DAG: OpName [[v3OLE:%.*]] "test_v3_ole" +; CHECK-DAG: OpName [[v3UGT:%.*]] "test_v3_ugt" +; CHECK-DAG: OpName [[v3OGT:%.*]] "test_v3_ogt" +; CHECK-DAG: OpName [[v3UGE:%.*]] "test_v3_uge" +; CHECK-DAG: OpName [[v3OGE:%.*]] "test_v3_oge" +; CHECK-DAG: OpName [[v3UNO:%.*]] "test_v3_uno" +; CHECK-DAG: OpName [[v3ORD:%.*]] "test_v3_ord" +; CHECK: [[BFLOAT:%.*]] = OpTypeFloat 16 0 +; CHECK: [[BFLOATV:%.*]] = OpTypeVector [[BFLOAT]] 3 + +; CHECK: [[UEQ]] = OpFunction +; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpFUnordEqual {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define i1 @test_ueq(bfloat %a, bfloat %b) { + %r = fcmp ueq bfloat %a, %b + ret i1 %r +} + +; CHECK: [[OEQ]] = OpFunction +; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpFOrdEqual {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define i1 @test_oeq(bfloat %a, bfloat %b) { + %r = fcmp oeq bfloat %a, %b + ret i1 %r +} + +; CHECK: [[UNE]] = OpFunction +; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpFUnordNotEqual {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define i1 @test_une(bfloat %a, bfloat %b) { + %r = fcmp une bfloat %a, %b + ret i1 %r +} + +; CHECK: [[ONE]] = OpFunction +; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpFOrdNotEqual {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define i1 @test_one(bfloat %a, bfloat %b) { + %r = fcmp one bfloat %a, %b + ret i1 %r +} + +; CHECK: [[ULT]] = OpFunction +; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpFUnordLessThan {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define i1 @test_ult(bfloat %a, bfloat %b) { + %r = fcmp ult bfloat %a, %b + ret i1 %r +} + +; CHECK: [[OLT]] = OpFunction +; 
CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpFOrdLessThan {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define i1 @test_olt(bfloat %a, bfloat %b) { + %r = fcmp olt bfloat %a, %b + ret i1 %r +} + +; CHECK: [[ULE]] = OpFunction +; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpFUnordLessThanEqual {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define i1 @test_ule(bfloat %a, bfloat %b) { + %r = fcmp ule bfloat %a, %b + ret i1 %r +} + +; CHECK: [[OLE]] = OpFunction +; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpFOrdLessThanEqual {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define i1 @test_ole(bfloat %a, bfloat %b) { + %r = fcmp ole bfloat %a, %b + ret i1 %r +} + +; CHECK: [[UGT]] = OpFunction +; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpFUnordGreaterThan {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define i1 @test_ugt(bfloat %a, bfloat %b) { + %r = fcmp ugt bfloat %a, %b + ret i1 %r +} + +; CHECK: [[OGT]] = OpFunction +; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpFOrdGreaterThan {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define i1 @test_ogt(bfloat %a, bfloat %b) { + %r = fcmp ogt bfloat %a, %b + ret i1 %r +} + +; CHECK: [[UGE]] = OpFunction +; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpFUnordGreaterThanEqual {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define i1 @test_uge(bfloat %a, bfloat %b) { + %r = fcmp uge bfloat %a, %b + ret i1 %r +} + +; CHECK: [[OGE]] = OpFunction +; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpFOrdGreaterThanEqual {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define i1 @test_oge(bfloat %a, bfloat %b) { + %r = fcmp oge bfloat %a, %b + ret i1 %r +} + +; CHECK: [[ORD]] = OpFunction +; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpOrdered {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define i1 @test_ord(bfloat %a, bfloat %b) { + %r = fcmp ord bfloat %a, %b + ret i1 %r +} + +; CHECK: [[UNO]] = OpFunction +; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpUnordered {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define i1 @test_uno(bfloat %a, bfloat %b) { + %r = fcmp uno bfloat %a, %b + ret i1 %r +} + +; CHECK: [[v3UEQ]] = OpFunction +; CHECK-NEXT: [[A:%.*]] = 
OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpFUnordEqual {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define <3 x i1> @test_v3_ueq(<3 x bfloat> %a, <3 x bfloat> %b) { + %r = fcmp ueq <3 x bfloat> %a, %b + ret <3 x i1> %r +} + +; CHECK: [[v3OEQ]] = OpFunction +; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpFOrdEqual {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define <3 x i1> @test_v3_oeq(<3 x bfloat> %a, <3 x bfloat> %b) { + %r = fcmp oeq <3 x bfloat> %a, %b + ret <3 x i1> %r +} + +; CHECK: [[v3UNE]] = OpFunction +; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpFUnordNotEqual {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define <3 x i1> @test_v3_une(<3 x bfloat> %a, <3 x bfloat> %b) { + %r = fcmp une <3 x bfloat> %a, %b + ret <3 x i1> %r +} + +; CHECK: [[v3ONE]] = OpFunction +; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpFOrdNotEqual {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define <3 x i1> @test_v3_one(<3 x bfloat> %a, <3 x bfloat> %b) { + %r = fcmp one <3 x bfloat> %a, %b + ret <3 x i1> %r +} + +; CHECK: [[v3ULT]] = OpFunction +; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpFUnordLessThan {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define <3 x i1> @test_v3_ult(<3 x bfloat> %a, <3 x bfloat> %b) { + %r = fcmp ult <3 x bfloat> %a, %b + ret <3 x i1> %r +} + +; CHECK: [[v3OLT]] = OpFunction +; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpFOrdLessThan {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define <3 x i1> @test_v3_olt(<3 x bfloat> %a, <3 x bfloat> %b) { + %r = fcmp olt <3 x bfloat> %a, %b + ret <3 x i1> %r +} + +; CHECK: [[v3ULE]] = OpFunction +; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpFUnordLessThanEqual {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define <3 x i1> @test_v3_ule(<3 x bfloat> %a, <3 x bfloat> %b) { + %r = fcmp ule <3 x bfloat> %a, %b + ret <3 x i1> %r +} + +; CHECK: [[v3OLE]] = OpFunction +; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpFOrdLessThanEqual {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define <3 x i1> @test_v3_ole(<3 x bfloat> %a, <3 x bfloat> %b) { + %r = fcmp ole <3 x bfloat> %a, %b + ret <3 x i1> %r +} + +; CHECK: [[v3UGT]] = OpFunction +; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = 
OpFUnordGreaterThan {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define <3 x i1> @test_v3_ugt(<3 x bfloat> %a, <3 x bfloat> %b) { + %r = fcmp ugt <3 x bfloat> %a, %b + ret <3 x i1> %r +} + +; CHECK: [[v3OGT]] = OpFunction +; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpFOrdGreaterThan {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define <3 x i1> @test_v3_ogt(<3 x bfloat> %a, <3 x bfloat> %b) { + %r = fcmp ogt <3 x bfloat> %a, %b + ret <3 x i1> %r +} + +; CHECK: [[v3UGE]] = OpFunction +; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpFUnordGreaterThanEqual {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define <3 x i1> @test_v3_uge(<3 x bfloat> %a, <3 x bfloat> %b) { + %r = fcmp uge <3 x bfloat> %a, %b + ret <3 x i1> %r +} + +; CHECK: [[v3OGE]] = OpFunction +; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpFOrdGreaterThanEqual {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define <3 x i1> @test_v3_oge(<3 x bfloat> %a, <3 x bfloat> %b) { + %r = fcmp oge <3 x bfloat> %a, %b + ret <3 x i1> %r +} + +; CHECK: [[v3ORD]] = OpFunction +; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpOrdered {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define <3 x i1> @test_v3_ord(<3 x bfloat> %a, <3 x bfloat> %b) { + %r = fcmp ord <3 x bfloat> %a, %b + ret <3 x i1> %r +} + +; CHECK: [[v3UNO]] = OpFunction +; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpUnordered {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define <3 x i1> @test_v3_uno(<3 x bfloat> %a, <3 x bfloat> %b) { + %r = fcmp uno <3 x bfloat> %a, %b + ret <3 x i1> %r +} From 71022d1ed6f1446fde4ca13f21259c5e550af0f7 Mon Sep 17 00:00:00 2001 From: Sirraide <aeternalmail@gmail.com> Date: Tue, 4 Nov 2025 17:13:23 +0100 Subject: [PATCH 181/313] [Clang] [Docs] Add some CMake example code for linking against libclang (#166268) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Though we have a few code examples in our documentation that show how to *use* libclang, we never actually show how to *link* against it. I myself mostly figured this out through trial and error some time ago, and I’ve since had to explain it to others on several occasions, so I thought adding some very minimal CMake example code might be helpful. --- clang/docs/LibClang.rst | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/clang/docs/LibClang.rst b/clang/docs/LibClang.rst index e747022b9c173..6c62bcb5f8c29 100644 --- a/clang/docs/LibClang.rst +++ b/clang/docs/LibClang.rst @@ -38,6 +38,7 @@ Code example .. 
code-block:: cpp + // main.cpp #include <clang-c/Index.h> #include <iostream> @@ -57,6 +58,22 @@ Code example CXCursor cursor = clang_getTranslationUnitCursor(unit); //Obtain a cursor at the root of the translation unit } +.. code-block:: cmake + + # CMakeLists.txt + cmake_minimum_required(VERSION 3.20) + project(my_clang_tool VERSION 0.1.0) + + # This will find the default system installation of Clang; if you want to + # use a different build of clang, pass -DClang_DIR=/foobar/lib/cmake/clang + # to the CMake configure command, where /foobar is the build directory where + # you built Clang. + find_package(Clang CONFIG REQUIRED) + + add_executable(my_clang_tool main.cpp) + target_include_directories(my_clang_tool PRIVATE ${CLANG_INCLUDE_DIRS}) + target_link_libraries(my_clang_tool PRIVATE libclang) + Visiting elements of an AST ~~~~~~~~~~~~~~~~~~~~~~~~~~~ The elements of an AST can be recursively visited with pre-order traversal with ``clang_visitChildren``. @@ -283,6 +300,7 @@ Complete example code .. code-block:: cpp + // main.cpp #include <clang-c/Index.h> #include <iostream> @@ -356,6 +374,21 @@ Complete example code ); } +.. code-block:: cmake + + # CMakeLists.txt + cmake_minimum_required(VERSION 3.20) + project(my_clang_tool VERSION 0.1.0) + + # This will find the default system installation of Clang; if you want to + # use a different build of clang, pass -DClang_DIR=/foobar/lib/cmake/clang + # to the CMake configure command, where /foobar is the build directory where + # you built Clang. + find_package(Clang CONFIG REQUIRED) + + add_executable(my_clang_tool main.cpp) + target_include_directories(my_clang_tool PRIVATE ${CLANG_INCLUDE_DIRS}) + target_link_libraries(my_clang_tool PRIVATE libclang) .. _Index.h: https://github.com/llvm/llvm-project/blob/main/clang/include/clang-c/Index.h From 6c563dc6a2127e3f7dd8e957093e57bd3ba35f5b Mon Sep 17 00:00:00 2001 From: Dmitry Chigarev <dmitry.chigarev@intel.com> Date: Tue, 4 Nov 2025 17:19:47 +0100 Subject: [PATCH 182/313] [mlir][XeGPU] Add optional layout attribute to LoadGather StoreScatter ops (#163414) As [suggested here](https://github.com/llvm/llvm-project/pull/163071#discussion_r2427229637), the PR adds an optional layout attribute for `LoadGather` and `StoreScatter` ops. For the load op, the attribute describes the layout of the result (e.g. `layout_result_0`); for the store op, it describes the layout of the vector-to-store operand (e.g. `layout_operand_0`). The PR also reworks the `propagate-layout` pass to take permanent layout attributes into account and back-propagate them accordingly. The helper utility function `getDistributeLayoutAttr` is reworked to return either `layout_operand/result_0` or `layout` for load/store ops (depending on which one is set). After an offline discussion, we decided that the overall layout-utilities API is confusing, since it tries to mix permanent and temporary layouts; it will need to be reworked in the future.
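For illustration, here is a minimal sketch of what the new permanent layout attribute looks like in IR. The value names, op shapes, and layout values below are assumptions, chosen to mirror the regression tests further down in this patch:

```mlir
// Hypothetical example: the layout is attached to the load/store ops
// themselves instead of living only in the temporary layout_result_0 /
// layout_operand_0 attributes.
%v = xegpu.load %src[%offsets], %mask <{layout = #xegpu.layout<lane_layout = [16], lane_data = [1]>}>
  : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16xf16>
xegpu.store %v, %dst[%offsets], %mask <{layout = #xegpu.layout<lane_layout = [16], lane_data = [1]>}>
  : vector<16xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1>
```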
--------- Signed-off-by: dchigarev <dmitry.chigarev@intel.com> --- .../include/mlir/Dialect/XeGPU/IR/XeGPUOps.td | 24 +++++-- .../mlir/Dialect/XeGPU/Utils/XeGPUUtils.h | 6 +- .../VectorToXeGPU/VectorToXeGPU.cpp | 12 ++-- mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp | 41 ++++++++++-- .../XeGPU/Transforms/XeGPUPropagateLayout.cpp | 15 +++-- .../Dialect/XeGPU/Transforms/XeGPUUnroll.cpp | 12 +++- .../Transforms/XeGPUWgToSgDistribute.cpp | 16 ++--- mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp | 65 +++++++++++++++++-- mlir/test/Dialect/XeGPU/propagate-layout.mlir | 40 ++++++++++++ .../XeGPU/xegpu-wg-to-sg-unify-ops.mlir | 2 +- 10 files changed, 200 insertions(+), 33 deletions(-) diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td index 426377fcf598f..689ebd0d1179a 100644 --- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td +++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td @@ -843,7 +843,8 @@ def XeGPU_LoadGatherOp : XeGPU_Op<"load", [MemoryEffects<[MemRead]>]> { AnyTypeOf<[XeGPU_MaskType, I1]>:$mask, OptionalAttr<I64Attr>:$chunk_size, OptionalAttr<XeGPU_CacheHintAttr>:$l1_hint, OptionalAttr<XeGPU_CacheHintAttr>:$l2_hint, - OptionalAttr<XeGPU_CacheHintAttr>:$l3_hint); + OptionalAttr<XeGPU_CacheHintAttr>:$l3_hint, + OptionalAttr<XeGPU_LayoutAttr>:$layout); let results = (outs AnyTypeOf<[XeGPU_ValueType, XeGPU_ScalarType]>:$value); let extraClassDeclaration = extraBaseClassDeclaration # [{ @@ -895,7 +896,14 @@ def XeGPU_LoadGatherOp : XeGPU_Op<"load", [MemoryEffects<[MemRead]>]> { "IntegerAttr": $chunk_size, "xegpu::CachePolicyAttr": $l1_hint, "xegpu::CachePolicyAttr": $l2_hint, - "xegpu::CachePolicyAttr": $l3_hint)> + "xegpu::CachePolicyAttr": $l3_hint)>, + OpBuilder<(ins "Type": $value, "Value": $source, + "ArrayRef<OpFoldResult>": $offsets, "Value": $mask, + "IntegerAttr": $chunk_size, + "xegpu::CachePolicyAttr": $l1_hint, + "xegpu::CachePolicyAttr": $l2_hint, + "xegpu::CachePolicyAttr": $l3_hint, + "xegpu::LayoutAttr": $layout)> ]; let hasVerifier = 1; @@ -979,7 +987,8 @@ def XeGPU_StoreScatterOp : XeGPU_Op<"store", [MemoryEffects<[MemWrite]>]> { AnyTypeOf<[XeGPU_MaskType, I1]>:$mask, OptionalAttr<I64Attr>:$chunk_size, OptionalAttr<XeGPU_CacheHintAttr>:$l1_hint, OptionalAttr<XeGPU_CacheHintAttr>:$l2_hint, - OptionalAttr<XeGPU_CacheHintAttr>:$l3_hint); + OptionalAttr<XeGPU_CacheHintAttr>:$l3_hint, + OptionalAttr<XeGPU_LayoutAttr>:$layout); let extraClassDeclaration = extraBaseClassDeclaration#[{ Type getDestType() { @@ -1030,7 +1039,14 @@ def XeGPU_StoreScatterOp : XeGPU_Op<"store", [MemoryEffects<[MemWrite]>]> { "IntegerAttr": $chunk_size, "xegpu::CachePolicyAttr": $l1_hint, "xegpu::CachePolicyAttr": $l2_hint, - "xegpu::CachePolicyAttr": $l3_hint)> + "xegpu::CachePolicyAttr": $l3_hint)>, + OpBuilder<(ins "Value": $value, "Value": $dest, + "ArrayRef<OpFoldResult>": $offsets, "Value": $mask, + "IntegerAttr": $chunk_size, + "xegpu::CachePolicyAttr": $l1_hint, + "xegpu::CachePolicyAttr": $l2_hint, + "xegpu::CachePolicyAttr": $l3_hint, + "xegpu::LayoutAttr": $layout)> ]; let hasVerifier = 1; diff --git a/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h b/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h index 04cfd58d846a7..620a2fe43d682 100644 --- a/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h +++ b/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h @@ -104,11 +104,15 @@ void removeLayoutAttrs(Operation *op); /// Sets the DistributeLayoutAttr for a given OpOperand or OpResult by attaching /// it to the owner's dictionary attributes +/// 
If `respectPermLayout` is true the existing permanent layout +/// attribute will be kept and assigned to the attribute dict instead +/// of the provided layout. template <typename T, typename = std::enable_if_t<std::is_same_v<T, OpOperand> || std::is_same_v<T, OpResult>>> void setDistributeLayoutAttr(const T &operandOrResult, - const DistributeLayoutAttr layout); + const DistributeLayoutAttr layout, + bool respectPermLayout = false); /// Set the DistributeLayoutAttr for each OpOperand and OpResult of the given /// operation. If the operation contains regions, it is also applied recursively diff --git a/mlir/lib/Conversion/VectorToXeGPU/VectorToXeGPU.cpp b/mlir/lib/Conversion/VectorToXeGPU/VectorToXeGPU.cpp index abea84f6b01fe..1b4d1a42614ea 100644 --- a/mlir/lib/Conversion/VectorToXeGPU/VectorToXeGPU.cpp +++ b/mlir/lib/Conversion/VectorToXeGPU/VectorToXeGPU.cpp @@ -457,7 +457,8 @@ static LogicalResult lowerToScatteredLoadOp(vector::TransferReadOp readOp, /*chunk_size=*/IntegerAttr{}, /*l1_hint=*/xegpu::CachePolicyAttr{}, /*l2_hint=*/xegpu::CachePolicyAttr{}, - /*l3_hint=*/xegpu::CachePolicyAttr{}); + /*l3_hint=*/xegpu::CachePolicyAttr{}, + /*layout=*/nullptr); rewriter.replaceOp(readOp, gatherOp.getResult()); return success(); @@ -491,7 +492,8 @@ static LogicalResult lowerToScatteredStoreOp(vector::TransferWriteOp writeOp, /*chunk_size=*/IntegerAttr{}, /*l1_hint=*/xegpu::CachePolicyAttr{}, /*l2_hint=*/xegpu::CachePolicyAttr{}, - /*l3_hint=*/xegpu::CachePolicyAttr{}); + /*l3_hint=*/xegpu::CachePolicyAttr{}, + /*layout=*/nullptr); rewriter.eraseOp(writeOp); return success(); } @@ -646,7 +648,8 @@ struct GatherLowering : public OpRewritePattern<vector::GatherOp> { /*chunk_size=*/IntegerAttr{}, /*l1_hint=*/xegpu::CachePolicyAttr{}, /*l2_hint=*/xegpu::CachePolicyAttr{}, - /*l3_hint=*/xegpu::CachePolicyAttr{}); + /*l3_hint=*/xegpu::CachePolicyAttr{}, + /*layout=*/nullptr); auto selectOp = arith::SelectOp::create(rewriter, loc, gatherOp.getMask(), @@ -680,7 +683,8 @@ struct ScatterLowering : public OpRewritePattern<vector::ScatterOp> { /*chunk_size=*/IntegerAttr{}, /*l1_hint=*/xegpu::CachePolicyAttr{}, /*l2_hint=*/xegpu::CachePolicyAttr{}, - /*l3_hint=*/xegpu::CachePolicyAttr{}); + /*l3_hint=*/xegpu::CachePolicyAttr{}, + /*layout=*/nullptr); rewriter.eraseOp(scatterOp); return success(); } diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp index fb51077b5dff3..4dd10bedc6d84 100644 --- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp +++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp @@ -876,7 +876,7 @@ void LoadGatherOp::build(OpBuilder &builder, OperationState &state, xegpu::CachePolicyAttr l2_hint, xegpu::CachePolicyAttr l3_hint) { build(builder, state, valueType, source, Value(), mask, IntegerAttr(), - l1_hint, l2_hint, l3_hint); + l1_hint, l2_hint, l3_hint, /*layout=*/nullptr); } void LoadGatherOp::build(OpBuilder &builder, OperationState &state, @@ -892,7 +892,24 @@ void LoadGatherOp::build(OpBuilder &builder, OperationState &state, auto offset = vector::FromElementsOp::create(builder, loc, type, values); build(builder, state, valueType, source, offset, mask, chunk_size, l1_hint, - l2_hint, l3_hint); + l2_hint, l3_hint, /*layout=*/nullptr); +} + +void LoadGatherOp::build(OpBuilder &builder, OperationState &state, + Type valueType, Value source, + ArrayRef<OpFoldResult> offsets, Value mask, + IntegerAttr chunk_size, xegpu::CachePolicyAttr l1_hint, + xegpu::CachePolicyAttr l2_hint, + xegpu::CachePolicyAttr l3_hint, + xegpu::LayoutAttr layout) { + auto loc =
source.getLoc(); + int64_t size = static_cast<int64_t>(offsets.size()); + auto type = VectorType::get(size, builder.getIndexType()); + auto values = getValueOrCreateConstantIndexOp(builder, loc, offsets); + auto offset = vector::FromElementsOp::create(builder, loc, type, values); + + build(builder, state, valueType, source, offset, mask, chunk_size, l1_hint, + l2_hint, l3_hint, layout); } //===----------------------------------------------------------------------===// @@ -943,7 +960,7 @@ void StoreScatterOp::build(OpBuilder &builder, OperationState &state, xegpu::CachePolicyAttr l2_hint, xegpu::CachePolicyAttr l3_hint) { build(builder, state, value, dest, Value(), mask, IntegerAttr(), l1_hint, - l2_hint, l3_hint); + l2_hint, l3_hint, /*layout=*/nullptr); } void StoreScatterOp::build(OpBuilder &builder, OperationState &state, @@ -961,7 +978,23 @@ void StoreScatterOp::build(OpBuilder &builder, OperationState &state, // Call the correct builder overload that does not expect result types. build(builder, state, value, dest, offset, mask, chunk_size, l1_hint, l2_hint, - l3_hint); + l3_hint, /*layout=*/nullptr); +} + +void StoreScatterOp::build( + OpBuilder &builder, OperationState &state, Value value, Value dest, + ArrayRef<OpFoldResult> offsets, Value mask, IntegerAttr chunk_size, + xegpu::CachePolicyAttr l1_hint, xegpu::CachePolicyAttr l2_hint, + xegpu::CachePolicyAttr l3_hint, xegpu::LayoutAttr layout) { + auto loc = dest.getLoc(); + int64_t size = static_cast<int64_t>(offsets.size()); + auto type = VectorType::get(size, builder.getIndexType()); + auto values = getValueOrCreateConstantIndexOp(builder, loc, offsets); + auto offset = vector::FromElementsOp::create(builder, loc, type, values); + + // Call the correct builder overload that does not expect result types. + build(builder, state, value, dest, offset, mask, chunk_size, l1_hint, l2_hint, + l3_hint, layout); } //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp index 90eae871a5ef3..14c49e7f45706 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp @@ -904,9 +904,16 @@ void LayoutInfoPropagation::visitStoreScatterOp( if (dstTdescTy.getChunkSizeAsInt() > 1) instData.push_back(chunkSize); } - LayoutInfo payloadLayout = getDefaultSIMTLayoutInfo( - payloadTy, uArch, instData, uArch->getGeneralPackedFormatBitSize(), - /*scattered=*/true); + + LayoutInfo payloadLayout; + + if (auto layout = storeScatter.getLayoutAttr()) { + payloadLayout = LayoutInfo(layout); + } else { + payloadLayout = getDefaultSIMTLayoutInfo( + payloadTy, uArch, instData, uArch->getGeneralPackedFormatBitSize(), + /*scattered=*/true); + } LayoutInfo maskLayout = getDefaultSIMTLayoutInfo(storeScatter->getContext(), 1, subgroupSize); @@ -1041,7 +1048,7 @@ static LogicalResult updateOp(mlir::OpBuilder &builder, mlir::Operation *op, } // If the result is a vector type, add a temporary layout attribute to the // op. 
- xegpu::setDistributeLayoutAttr(result, layout); + xegpu::setDistributeLayoutAttr(result, layout, /*respectPermLayout*/ true); } return success(); } diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp index e6e71cc29a80a..c3bf9606693a8 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp @@ -678,12 +678,16 @@ struct UnrollLoadGatherOpWithOffset pack(offsets, convertedOffsetTypes, *targetShape, loc, rewriter); } + auto layout = dyn_cast_if_present<xegpu::LayoutAttr>(op.getLayoutAttr()); + if (layout) + layout = layout.dropInstData(); + SmallVector<Value> newOps; for (auto [o, m] : llvm::zip(convertedOffsets, convertedMasks)) { auto newOp = xegpu::LoadGatherOp::create( rewriter, loc, newValueTy, op.getSource(), o, m, rewriter.getI64IntegerAttr(chunkSize), op.getL1HintAttr(), - op.getL2HintAttr(), op.getL3HintAttr()); + op.getL2HintAttr(), op.getL3HintAttr(), layout); newOps.push_back(newOp); } @@ -774,12 +778,16 @@ struct UnrollStoreScatterOpWithOffsets SmallVector<Value> convertedValues = pack(op.getValue(), convertedValTypes, *targetShape, loc, rewriter); + auto layout = dyn_cast_if_present<xegpu::LayoutAttr>(op.getLayoutAttr()); + if (layout) + layout = layout.dropInstData(); + for (auto [v, o, m] : llvm::zip(convertedValues, convertedOffsets, convertedMasks)) { xegpu::StoreScatterOp::create(rewriter, loc, v, op.getDest(), o, m, rewriter.getI64IntegerAttr(chunkSize), op.getL1HintAttr(), op.getL2HintAttr(), - op.getL3HintAttr()); + op.getL3HintAttr(), layout); } rewriter.eraseOp(op); diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp index 79eea55c8b78a..d12a04df5c46c 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp @@ -889,8 +889,8 @@ struct WgToSgLoadGatherOpWithOffset return failure(); ArrayRef<int64_t> wgShape = resultType.getShape(); - xegpu::DistributeLayoutAttr layout = - xegpu::getDistributeLayoutAttr(op.getResult()); + xegpu::LayoutAttr layout = dyn_cast_if_present<xegpu::LayoutAttr>( + xegpu::getDistributeLayoutAttr(op.getResult())); if (!layout || !layout.isForWorkgroup()) return failure(); @@ -915,9 +915,8 @@ struct WgToSgLoadGatherOpWithOffset llvm::zip(adaptor.getOffsets(), adaptor.getMask())) { auto newLoadOp = xegpu::LoadGatherOp::create( rewriter, loc, newTy, op.getSource(), offsets, mask, chunkSizeAttr, - op.getL1HintAttr(), op.getL2HintAttr(), op.getL3HintAttr()); - xegpu::setDistributeLayoutAttr(newLoadOp->getResult(0), - layout.dropSgLayoutAndData()); + op.getL1HintAttr(), op.getL2HintAttr(), op.getL3HintAttr(), + layout.dropSgLayoutAndData()); newLoadOps.push_back(newLoadOp); } rewriter.replaceOpWithMultiple(op, {newLoadOps}); @@ -942,8 +941,8 @@ struct WgToSgStoreScatterOpWithOffset if (!valueType) return failure(); - xegpu::DistributeLayoutAttr layout = - xegpu::getDistributeLayoutAttr(op.getOperand(0)); + xegpu::LayoutAttr layout = dyn_cast_if_present<xegpu::LayoutAttr>( + xegpu::getDistributeLayoutAttr(op.getOperand(0))); if (!layout || !layout.isForWorkgroup()) return failure(); @@ -965,7 +964,8 @@ struct WgToSgStoreScatterOpWithOffset adaptor.getValue(), adaptor.getOffsets(), adaptor.getMask())) { auto store = xegpu::StoreScatterOp::create( rewriter, loc, val, op.getDest(), offs, mask, chunkSizeAttr, - op.getL1HintAttr(), op.getL2HintAttr(), op.getL3HintAttr()); + 
op.getL1HintAttr(), op.getL2HintAttr(), op.getL3HintAttr(), + layout.dropSgLayoutAndData()); // Update the layout attribute to drop sg_layout and sg_data. if (!layout.getEffectiveLaneLayoutAsInt().empty() || !layout.getEffectiveInstDataAsInt().empty()) { diff --git a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp index a38993e0c55b1..d575a415a3035 100644 --- a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp +++ b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp @@ -144,6 +144,11 @@ xegpu::DistributeLayoutAttr xegpu::getDistributeLayoutAttr(const Value value) { std::string layoutName = getLayoutName(result); if (defOp->hasAttr(layoutName)) return defOp->getAttrOfType<xegpu::DistributeLayoutAttr>(layoutName); + + // check for "permanent" layout only after "temporary" layout name lookup + // for backward compatibility + if (auto loadGatherOp = dyn_cast<xegpu::LoadGatherOp>(defOp)) + return loadGatherOp.getLayoutAttr(); } if (auto arg = dyn_cast<BlockArgument>(value)) { @@ -171,27 +176,77 @@ xegpu::getDistributeLayoutAttr(const OpOperand &opr) { std::string layoutName = xegpu::getLayoutName(opr); if (op->hasAttr(layoutName)) return op->getAttrOfType<xegpu::DistributeLayoutAttr>(layoutName); + + // check for "permanent" layout only after "temporary" layout name lookup + if (auto storeScatterOp = dyn_cast<xegpu::StoreScatterOp>(op)) + if (auto layout = storeScatterOp.getLayoutAttr()) + return layout; + return getDistributeLayoutAttr(opr.get()); } +// Returns the permanent layout attribute for the given result if it's +// available on the defining op. Otherwise returns the provided layout. +xegpu::DistributeLayoutAttr +maybePickPermanentLayout(xegpu::DistributeLayoutAttr layout, + const OpResult &result, mlir::Operation *owner, + const std::string &name) { + xegpu::DistributeLayoutAttr candidate = layout; + + if (auto loadOp = dyn_cast<xegpu::LoadGatherOp>(owner)) { + if (auto perm = loadOp.getLayoutAttr()) + candidate = perm; + } + + return candidate; +} + +// Returns the permanent layout attribute for the given operand if it's +// available on the defining op. Otherwise returns the provided layout.
+xegpu::DistributeLayoutAttr +maybePickPermanentLayout(xegpu::DistributeLayoutAttr layout, + const OpOperand &operand, mlir::Operation *owner, + const std::string &name) { + xegpu::DistributeLayoutAttr candidate = layout; + unsigned idx = const_cast<OpOperand &>(operand).getOperandNumber(); + + if (auto storeOp = dyn_cast<xegpu::StoreScatterOp>(owner)) { + if (idx == 0) { + if (auto perm = storeOp.getLayoutAttr()) + candidate = perm; + } + } + + return candidate; +} + template <typename T, typename> void xegpu::setDistributeLayoutAttr(const T &operandOrResult, - const DistributeLayoutAttr layout) { + const DistributeLayoutAttr layout, + bool respectPermLayout) { Operation *owner = operandOrResult.getOwner(); std::string name = xegpu::getLayoutName(operandOrResult); - if (layout && !owner->hasAttrOfType<DistributeLayoutAttr>(name)) - owner->setAttr(name, layout); + + if (owner->hasAttrOfType<DistributeLayoutAttr>(name)) + return; + + DistributeLayoutAttr candidate = layout; + if (respectPermLayout) + candidate = maybePickPermanentLayout(layout, operandOrResult, owner, name); + + if (candidate) + owner->setAttr(name, candidate); } // Explicit instantiation for OpResult template void xegpu::setDistributeLayoutAttr<mlir::OpResult>( const mlir::OpResult &result, - const mlir::xegpu::DistributeLayoutAttr layout); + const mlir::xegpu::DistributeLayoutAttr layout, bool respectPermLayout); // Explicit instantiation for OpOperand template void xegpu::setDistributeLayoutAttr<mlir::OpOperand>( const mlir::OpOperand &operand, - const mlir::xegpu::DistributeLayoutAttr layout); + const mlir::xegpu::DistributeLayoutAttr layout, bool respectPermLayout); void xegpu::setDistributeLayoutAttrs( Operation *op, function_ref<DistributeLayoutAttr(Value)> getLayoutImpl) { diff --git a/mlir/test/Dialect/XeGPU/propagate-layout.mlir b/mlir/test/Dialect/XeGPU/propagate-layout.mlir index 543e119d81d88..61e315d0d2080 100644 --- a/mlir/test/Dialect/XeGPU/propagate-layout.mlir +++ b/mlir/test/Dialect/XeGPU/propagate-layout.mlir @@ -215,6 +215,46 @@ func.func @scatter_ops(%src: memref<256xf16>) { } // ----- gpu.module @test { +// CHECK-LABEL: func.func @scatter_ops_custom_perm_layout( +// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<256xf16>) { +// CHECK: %[[MASK:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<true> : vector<16xi1> +// CHECK: %[[OFFSETS:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<12> : vector<16xindex> +// CHECK: %[[LOAD_VEC:.*]] = xegpu.load %[[ARG0]][%[[OFFSETS]]], %[[MASK]] +// CHECK-SAME: {layout_result_0 = #xegpu.layout<lane_layout = [8], lane_data = [1]>} : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16xf16> +// CHECK: %[[ADD_RES:.*]] = arith.addf %[[LOAD_VEC]], %[[LOAD_VEC]] {layout_result_0 = #xegpu.layout<lane_layout = [8], lane_data = [1]>} : vector<16xf16> +// CHECK: xegpu.store %[[ADD_RES]], %[[ARG0]][%[[OFFSETS]]], %[[MASK]] +// CHECK-SAME <{layout = #xegpu.layout<lane_layout = [8], lane_data = [1]>}> : vector<16xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1> +func.func @scatter_ops_custom_perm_layout(%src: memref<256xf16>) { + %1 = arith.constant dense<1>: vector<16xi1> + %offset = arith.constant dense<12> : vector<16xindex> + %3 = xegpu.load %src[%offset], %1 : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16xf16> + %4 = arith.addf %3, %3 : vector<16xf16> + xegpu.store %4, %src[%offset], %1 <{layout = #xegpu.layout<lane_layout = [8], lane_data = [1]>}> : 
vector<16xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1> + return +} +} +// ----- +gpu.module @test { +// CHECK-LABEL: func.func @scatter_ops_preserve_load_perm_layout( +// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<256xf16>) { +// CHECK: %[[MASK:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<true> : vector<16xi1> +// CHECK: %[[OFFSETS:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<12> : vector<16xindex> +// CHECK: %[[LOAD_VEC:.*]] = xegpu.load %[[ARG0]][%[[OFFSETS]]], %[[MASK]] <{layout = #xegpu.layout<lane_layout = [16], lane_data = [1]>}> +// CHECK-SAME: {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16xf16> +// CHECK: %[[ADD_RES:.*]] = arith.addf %[[LOAD_VEC]], %[[LOAD_VEC]] {layout_result_0 = #xegpu.layout<lane_layout = [8], lane_data = [1]>} : vector<16xf16> +// CHECK: xegpu.store %[[ADD_RES]], %[[ARG0]][%[[OFFSETS]]], %[[MASK]] +// CHECK-SAME <{layout = #xegpu.layout<lane_layout = [8], lane_data = [1]>}> : vector<16xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1> +func.func @scatter_ops_preserve_load_perm_layout(%src: memref<256xf16>) { + %1 = arith.constant dense<1>: vector<16xi1> + %offset = arith.constant dense<12> : vector<16xindex> + %3 = xegpu.load %src[%offset], %1 <{layout = #xegpu.layout<lane_layout = [16], lane_data = [1]>}> : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16xf16> + %4 = arith.addf %3, %3 : vector<16xf16> + xegpu.store %4, %src[%offset], %1 <{layout = #xegpu.layout<lane_layout = [8], lane_data = [1]>}> : vector<16xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1> + return +} +} +// ----- +gpu.module @test { // CHECK-LABEL: func.func @vector_bitcast_i16_to_f16( // CHECK: %[[LOAD0:.*]] = xegpu.load_nd %{{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} // CHECK-SAME: !xegpu.tensor_desc<8x16xi16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<8x16xi16> diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir index 52acde4dffc2e..8d98fcfd0d2c2 100644 --- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir +++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir @@ -285,7 +285,7 @@ gpu.module @test_distribution { // CHECK: %[[VAL:.*]] = arith.constant {layout_result_0 = #xegpu.layout<inst_data = [8]>} dense<2.550000e+01> : vector<8xf16> // CHECK: %[[CST:.*]] = arith.constant {layout_result_0 = #xegpu.layout<inst_data = [8]>} dense<0> : vector<8xindex> // CHECK: %[[MASK:.*]] = arith.constant {layout_result_0 = #xegpu.layout<inst_data = [8]>} dense<true> : vector<8xi1> - // CHECK: xegpu.store %[[VAL]], %[[ARG0]][%[[CST]]], %[[MASK]] <{chunk_size = 1 : i64, l1_hint = #xegpu.cache_hint<cached>}> + // CHECK: xegpu.store %[[VAL]], %[[ARG0]][%[[CST]]], %[[MASK]] <{chunk_size = 1 : i64, l1_hint = #xegpu.cache_hint<cached>, layout = #xegpu.layout<inst_data = [8]>}> // CHECK-SAME: {layout_operand_0 = #xegpu.layout<inst_data = [8]>, layout_operand_2 = #xegpu.layout<inst_data = [8]>, // CHECK-SAME: layout_operand_3 = #xegpu.layout<inst_data = [8]>} // CHECK-SAME: : vector<8xf16>, memref<256xf16>, vector<8xindex>, vector<8xi1> From a50d036c0bf456f4d2b6d6a19f5e45f61860994c Mon Sep 17 00:00:00 2001 From: Manuel Carrasco <Manuel.Carrasco@amd.com> Date: Tue, 4 Nov 2025 16:21:47 +0000 Subject: [PATCH 183/313] [NFC] [Build Fix] Fix 
failing test case due to missing host arch. (#166392) This fixes a typo introduced in #165606 which makes the test case fail. --------- Co-authored-by: Joseph Huber <huberjn@outlook.com> --- clang/test/Driver/hip-spirv-translator-new-driver.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/test/Driver/hip-spirv-translator-new-driver.c b/clang/test/Driver/hip-spirv-translator-new-driver.c index 315a74635b9b3..67d894e2eb506 100644 --- a/clang/test/Driver/hip-spirv-translator-new-driver.c +++ b/clang/test/Driver/hip-spirv-translator-new-driver.c @@ -2,7 +2,7 @@ // The input and output files cannot be the same. // RUN: %clang --offload-new-driver -### -save-temps -nogpuinc -nogpulib \ -// RUN: --offload-arch=amdgcnspirv -x hip %s 2>&1 \ +// RUN: --target=x86_64-unknown-linux-gnu --offload-arch=amdgcnspirv -x hip %s 2>&1 \ // RUN: | FileCheck %s // CHECK-NOT: {{".*clang-linker-wrapper.*"}} {{.*}} "-o" "[[OUTPUT_FILE:.*.o]]" {{.*}}"[[OUTPUT_FILE]]" From 4749bf56a65e38ee7b05ac7f9fe261aab6cb5bc6 Mon Sep 17 00:00:00 2001 From: Michael Buch <michaelbuch12@gmail.com> Date: Tue, 4 Nov 2025 16:24:24 +0000 Subject: [PATCH 184/313] [lldb] When starting in a hidden frame, don't skip over hidden frames when navigating up/down (#166394) When stopped in a hidden frame (either because we selected the hidden frame or hit a breakpoint inside it), a user is most likely interested in exploring the immediate frames around it. But currently, issuing `up`/`down` commands will unconditionally skip over all hidden frames. This patch makes it so `up`/`down` commands don't skip hidden frames if the frame we started in was a hidden frame. --- lldb/source/Commands/CommandObjectFrame.cpp | 52 +++++++++++-------- .../API/commands/frame/select-hidden/Makefile | 3 ++ .../select-hidden/TestNavigateHiddenFrame.py | 32 ++++++++++++ .../API/commands/frame/select-hidden/main.cpp | 13 +++++ 4 files changed, 78 insertions(+), 22 deletions(-) create mode 100644 lldb/test/API/commands/frame/select-hidden/Makefile create mode 100644 lldb/test/API/commands/frame/select-hidden/TestNavigateHiddenFrame.py create mode 100644 lldb/test/API/commands/frame/select-hidden/main.cpp diff --git a/lldb/source/Commands/CommandObjectFrame.cpp b/lldb/source/Commands/CommandObjectFrame.cpp index 88a02dce35b9d..9133359fbf537 100644 --- a/lldb/source/Commands/CommandObjectFrame.cpp +++ b/lldb/source/Commands/CommandObjectFrame.cpp @@ -265,6 +265,29 @@ class CommandObjectFrameSelect : public CommandObjectParsed { Options *GetOptions() override { return &m_options; } +private: + void SkipHiddenFrames(Thread &thread, uint32_t frame_idx) { + uint32_t candidate_idx = frame_idx; + const unsigned max_depth = 12; + for (unsigned num_try = 0; num_try < max_depth; ++num_try) { + if (candidate_idx == 0 && *m_options.relative_frame_offset == -1) { + candidate_idx = UINT32_MAX; + break; + } + candidate_idx += *m_options.relative_frame_offset; + if (auto candidate_sp = thread.GetStackFrameAtIndex(candidate_idx)) { + if (candidate_sp->IsHidden()) + continue; + // Now candidate_idx is the first non-hidden frame.
+ break; + } + candidate_idx = UINT32_MAX; + break; + }; + if (candidate_idx != UINT32_MAX) + m_options.relative_frame_offset = candidate_idx - frame_idx; + } + protected: void DoExecute(Args &command, CommandReturnObject &result) override { // No need to check "thread" for validity as eCommandRequiresThread ensures @@ -278,28 +301,13 @@ class CommandObjectFrameSelect : public CommandObjectParsed { if (frame_idx == UINT32_MAX) frame_idx = 0; - // If moving up/down by one, skip over hidden frames. - if (*m_options.relative_frame_offset == 1 || - *m_options.relative_frame_offset == -1) { - uint32_t candidate_idx = frame_idx; - const unsigned max_depth = 12; - for (unsigned num_try = 0; num_try < max_depth; ++num_try) { - if (candidate_idx == 0 && *m_options.relative_frame_offset == -1) { - candidate_idx = UINT32_MAX; - break; - } - candidate_idx += *m_options.relative_frame_offset; - if (auto candidate_sp = thread->GetStackFrameAtIndex(candidate_idx)) { - if (candidate_sp->IsHidden()) - continue; - // Now candidate_idx is the first non-hidden frame. - break; - } - candidate_idx = UINT32_MAX; - break; - }; - if (candidate_idx != UINT32_MAX) - m_options.relative_frame_offset = candidate_idx - frame_idx; + // If moving up/down by one, skip over hidden frames, unless we started + // in a hidden frame. + if ((*m_options.relative_frame_offset == 1 || + *m_options.relative_frame_offset == -1)) { + if (auto current_frame_sp = thread->GetStackFrameAtIndex(frame_idx); + !current_frame_sp->IsHidden()) + SkipHiddenFrames(*thread, frame_idx); } if (*m_options.relative_frame_offset < 0) { diff --git a/lldb/test/API/commands/frame/select-hidden/Makefile b/lldb/test/API/commands/frame/select-hidden/Makefile new file mode 100644 index 0000000000000..99998b20bcb05 --- /dev/null +++ b/lldb/test/API/commands/frame/select-hidden/Makefile @@ -0,0 +1,3 @@ +CXX_SOURCES := main.cpp + +include Makefile.rules diff --git a/lldb/test/API/commands/frame/select-hidden/TestNavigateHiddenFrame.py b/lldb/test/API/commands/frame/select-hidden/TestNavigateHiddenFrame.py new file mode 100644 index 0000000000000..698447b552877 --- /dev/null +++ b/lldb/test/API/commands/frame/select-hidden/TestNavigateHiddenFrame.py @@ -0,0 +1,32 @@ +import lldb +from lldbsuite.test.decorators import * +from lldbsuite.test.lldbtest import * +from lldbsuite.test import lldbutil + + +class NavigateHiddenFrameTestCase(TestBase): + NO_DEBUG_INFO_TESTCASE = True + + @add_test_categories(["libc++"]) + def test(self): + """Test going up/down a backtrace but we started in a hidden frame.""" + self.build() + (target, process, thread, bkpt) = lldbutil.run_to_source_breakpoint( + self, "Break here", lldb.SBFileSpec("main.cpp") + ) + # up + self.assertIn("__impl2", thread.selected_frame.GetFunctionName()) + self.expect("up") + self.assertIn("__impl1", thread.selected_frame.GetFunctionName()) + self.expect("up") + self.assertIn("__impl", thread.selected_frame.GetFunctionName()) + self.expect("up") + self.assertIn("non_impl", thread.selected_frame.GetFunctionName()) + + # Back down again. 
+ self.expect("down") + self.assertIn("__impl", thread.selected_frame.GetFunctionName()) + self.expect("down") + self.assertIn("__impl1", thread.selected_frame.GetFunctionName()) + self.expect("down") + self.assertIn("__impl2", thread.selected_frame.GetFunctionName()) diff --git a/lldb/test/API/commands/frame/select-hidden/main.cpp b/lldb/test/API/commands/frame/select-hidden/main.cpp new file mode 100644 index 0000000000000..dc97abb6323a4 --- /dev/null +++ b/lldb/test/API/commands/frame/select-hidden/main.cpp @@ -0,0 +1,13 @@ +namespace std { +namespace __1 { +static const char *__impl2() { return "Break here"; } +static const char *__impl1() { return __impl2(); } +static const char *__impl() { return __impl1(); } +static const char *non_impl() { return __impl(); } +} // namespace __1 +} // namespace std + +int main() { + std::__1::non_impl(); + __builtin_debugtrap(); +} From ed7d6c3511df7f5d1dbf52579740f7f4e4ada4f9 Mon Sep 17 00:00:00 2001 From: Kazu Hirata <kazu@google.com> Date: Tue, 4 Nov 2025 08:28:51 -0800 Subject: [PATCH 185/313] [ADT] Deprecate a soft-deprecated APInt constructor (#166314) This patch deprecates an APInt constructor that has been soft-deprecated via comments since: commit 7a16288157efc5fb85fbe3b8b4c37071da7609a6 Author: Jeffrey Yasskin <jyasskin@google.com> Date: Mon Jul 18 21:45:40 2011 +0000 This patch updates a small number of remaining uses. --- lldb/source/Utility/RegisterValue.cpp | 8 +++++--- llvm/include/llvm/ADT/APInt.h | 1 + polly/lib/Support/GICHelper.cpp | 2 +- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/lldb/source/Utility/RegisterValue.cpp b/lldb/source/Utility/RegisterValue.cpp index 12c349a143c0f..8b2af4e3d4f0e 100644 --- a/lldb/source/Utility/RegisterValue.cpp +++ b/lldb/source/Utility/RegisterValue.cpp @@ -206,7 +206,7 @@ Status RegisterValue::SetValueFromData(const RegisterInfo ®_info, int128.x[0] = data2; int128.x[1] = data1; } - SetUInt128(llvm::APInt(128, 2, int128.x)); + SetUInt128(llvm::APInt(128, int128.x)); } break; case eEncodingIEEE754: @@ -596,8 +596,10 @@ llvm::APInt RegisterValue::GetAsUInt128(const llvm::APInt &fail_value, case 8: case 16: return llvm::APInt( - BITWIDTH_INT128, NUM_OF_WORDS_INT128, - (reinterpret_cast<const type128 *>(buffer.bytes.data()))->x); + BITWIDTH_INT128, + llvm::ArrayRef( + (reinterpret_cast<const type128 *>(buffer.bytes.data()))->x, + NUM_OF_WORDS_INT128)); } } break; } diff --git a/llvm/include/llvm/ADT/APInt.h b/llvm/include/llvm/ADT/APInt.h index 26283d2437d48..fdb3b84b73a1f 100644 --- a/llvm/include/llvm/ADT/APInt.h +++ b/llvm/include/llvm/ADT/APInt.h @@ -154,6 +154,7 @@ class [[nodiscard]] APInt { /// Once all uses of this constructor are migrated to other constructors, /// consider marking this overload ""= delete" to prevent calls from being /// incorrectly bound to the APInt(unsigned, uint64_t, bool) constructor. + [[deprecated("Use other constructors of APInt")]] LLVM_ABI APInt(unsigned numBits, unsigned numWords, const uint64_t bigVal[]); /// Construct an APInt from a string representation. 
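To make the deprecation concrete, here is a minimal, self-contained sketch of the migration this patch applies in its other hunks; the function name and word values are illustrative:

```cpp
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"

llvm::APInt makeWideValue() {
  uint64_t Words[2] = {0x0123456789abcdefULL, 0xfedcba9876543210ULL};
  // Deprecated: APInt(unsigned numBits, unsigned numWords,
  //                   const uint64_t bigVal[])
  //   llvm::APInt Old(128, 2, Words);
  // Preferred: pass the words as an ArrayRef, so the call cannot be
  // incorrectly bound to the APInt(unsigned, uint64_t, bool) constructor.
  return llvm::APInt(128, llvm::ArrayRef(Words, 2));
}
```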
diff --git a/polly/lib/Support/GICHelper.cpp b/polly/lib/Support/GICHelper.cpp index 027e0194732f4..948bb6a9b9614 100644 --- a/polly/lib/Support/GICHelper.cpp +++ b/polly/lib/Support/GICHelper.cpp @@ -59,7 +59,7 @@ APInt polly::APIntFromVal(__isl_take isl_val *Val) { Data = (uint64_t *)malloc(NumChunks * ChunkSize); isl_val_get_abs_num_chunks(Val, ChunkSize, Data); int NumBits = CHAR_BIT * ChunkSize * NumChunks; - APInt A(NumBits, NumChunks, Data); + APInt A(NumBits, ArrayRef(Data, NumChunks)); // As isl provides only an interface to obtain data that describes the // absolute value of an isl_val, A at this point always contains a positive From 78769d51c6b183b6d88fbd1ef825fff7cf3aad21 Mon Sep 17 00:00:00 2001 From: Adrian Prantl <aprantl@apple.com> Date: Tue, 4 Nov 2025 08:29:47 -0800 Subject: [PATCH 186/313] [LLDB] Don't check for libcxx if LLDB_ENFORCE_STRICT_TEST_REQUIREMENTS is off --- lldb/test/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lldb/test/CMakeLists.txt b/lldb/test/CMakeLists.txt index 513d1ec493ee1..818dff58aceeb 100644 --- a/lldb/test/CMakeLists.txt +++ b/lldb/test/CMakeLists.txt @@ -202,7 +202,7 @@ if(TARGET clang) else() # We require libcxx for the test suite, so if we aren't building it, # provide a helpful error about how to resolve the situation. - if(NOT LLDB_HAS_LIBCXX) + if(LLDB_ENFORCE_STRICT_TEST_REQUIREMENTS AND NOT LLDB_HAS_LIBCXX) message(SEND_ERROR "LLDB test suite requires libc++, but it is currently disabled. " "Please add `libcxx` to `LLVM_ENABLE_RUNTIMES` or disable tests via " From cc3ad201ecd50bcc85d1488084dda079c0671652 Mon Sep 17 00:00:00 2001 From: Marco Borgeaud <marco.borgeaud@sonarsource.com> Date: Tue, 4 Nov 2025 17:39:23 +0100 Subject: [PATCH 187/313] [analyzer] Revert incorrect LazyCompoundVal changes (#163461) Reverts #115917 and its follow-up #116840. Fixes #153782 and introduces regression tests. Reopens #114270. --- clang/lib/StaticAnalyzer/Core/RegionStore.cpp | 48 ----------- .../test/Analysis/NewDelete-checker-test.cpp | 80 ++++++++++++++++++- clang/test/Analysis/ctor-trivial-copy.cpp | 62 ++++++++++---- clang/test/Analysis/explain-svals.cpp | 2 +- clang/test/Analysis/iterator-modeling.cpp | 2 - ...-modeling-aggressive-std-find-modeling.cpp | 10 --- .../test/Analysis/stl-algorithm-modeling.cpp | 10 --- clang/test/Analysis/store-dump-orders.cpp | 2 +- clang/test/Analysis/taint-generic.cpp | 6 +- .../test/Analysis/template-param-objects.cpp | 2 +- 10 files changed, 131 insertions(+), 93 deletions(-) diff --git a/clang/lib/StaticAnalyzer/Core/RegionStore.cpp b/clang/lib/StaticAnalyzer/Core/RegionStore.cpp index 2838533c1a406..4f4824a3616ce 100644 --- a/clang/lib/StaticAnalyzer/Core/RegionStore.cpp +++ b/clang/lib/StaticAnalyzer/Core/RegionStore.cpp @@ -714,11 +714,6 @@ class RegionStoreManager : public StoreManager { return getBinding(getRegionBindings(S), L, T); } - std::optional<SVal> getUniqueDefaultBinding(RegionBindingsConstRef B, - const TypedValueRegion *R) const; - std::optional<SVal> - getUniqueDefaultBinding(nonloc::LazyCompoundVal LCV) const; - std::optional<SVal> getDefaultBinding(Store S, const MemRegion *R) override { RegionBindingsRef B = getRegionBindings(S); // Default bindings are always applied over a base region so look up the @@ -2465,11 +2460,6 @@ SVal RegionStoreManager::getBindingForStruct(RegionBindingsConstRef B, // behavior doesn't depend on the struct layout. // This way even an empty struct can carry taint, no matter if creduce drops // the last field member or not.
- - // Try to avoid creating a LCV if it would anyways just refer to a single - // default binding. - if (std::optional<SVal> Val = getUniqueDefaultBinding(B, R)) - return *Val; return createLazyBinding(B, R); } @@ -2757,50 +2747,12 @@ RegionStoreManager::bindVector(LimitedRegionBindingsConstRef B, return NewB; } -std::optional<SVal> -RegionStoreManager::getUniqueDefaultBinding(RegionBindingsConstRef B, - const TypedValueRegion *R) const { - if (R != R->getBaseRegion()) - return std::nullopt; - - const auto *Cluster = B.lookup(R); - if (!Cluster || !llvm::hasSingleElement(*Cluster)) - return std::nullopt; - - const auto [Key, Value] = *Cluster->begin(); - return Key.isDirect() ? std::optional<SVal>{} : Value; -} - -std::optional<SVal> -RegionStoreManager::getUniqueDefaultBinding(nonloc::LazyCompoundVal LCV) const { - auto B = getRegionBindings(LCV.getStore()); - return getUniqueDefaultBinding(B, LCV.getRegion()); -} - std::optional<LimitedRegionBindingsRef> RegionStoreManager::tryBindSmallStruct( LimitedRegionBindingsConstRef B, const TypedValueRegion *R, const RecordDecl *RD, nonloc::LazyCompoundVal LCV) { if (B.hasExhaustedBindingLimit()) return B.withValuesEscaped(LCV); - // If we try to copy a Conjured value representing the value of the whole - // struct, don't try to element-wise copy each field. - // That would unnecessarily bind Derived symbols slicing off the subregion for - // the field from the whole Conjured symbol. - // - // struct Window { int width; int height; }; - // Window getWindow(); <-- opaque fn. - // Window w = getWindow(); <-- conjures a new Window. - // Window w2 = w; <-- trivial copy "w", calling "tryBindSmallStruct" - // - // We should not end up with a new Store for "w2" like this: - // Direct [ 0..31]: Derived{Conj{}, w.width} - // Direct [32..63]: Derived{Conj{}, w.height} - // Instead, we should just bind that Conjured value instead. 
- if (std::optional<SVal> Val = getUniqueDefaultBinding(LCV)) { - return B.addBinding(BindingKey::Make(R, BindingKey::Default), Val.value()); - } - FieldVector Fields; if (const CXXRecordDecl *Class = dyn_cast<CXXRecordDecl>(RD)) diff --git a/clang/test/Analysis/NewDelete-checker-test.cpp b/clang/test/Analysis/NewDelete-checker-test.cpp index c417b9c2ac97e..fd831cc0985cc 100644 --- a/clang/test/Analysis/NewDelete-checker-test.cpp +++ b/clang/test/Analysis/NewDelete-checker-test.cpp @@ -3,13 +3,13 @@ // RUN: -analyzer-checker=core \ // RUN: -analyzer-checker=cplusplus.NewDelete // -// RUN: %clang_analyze_cc1 -DLEAKS -std=c++11 -fblocks %s \ +// RUN: %clang_analyze_cc1 -std=c++11 -fblocks %s \ // RUN: -verify=expected,newdelete,leak \ // RUN: -analyzer-checker=core \ // RUN: -analyzer-checker=cplusplus.NewDelete \ // RUN: -analyzer-checker=cplusplus.NewDeleteLeaks // -// RUN: %clang_analyze_cc1 -std=c++11 -fblocks -verify %s \ +// RUN: %clang_analyze_cc1 -std=c++11 -fblocks %s \ // RUN: -verify=expected,leak \ // RUN: -analyzer-checker=core \ // RUN: -analyzer-checker=cplusplus.NewDeleteLeaks @@ -19,13 +19,13 @@ // RUN: -analyzer-checker=core \ // RUN: -analyzer-checker=cplusplus.NewDelete // -// RUN: %clang_analyze_cc1 -DLEAKS -std=c++17 -fblocks %s \ +// RUN: %clang_analyze_cc1 -std=c++17 -fblocks %s \ // RUN: -verify=expected,newdelete,leak \ // RUN: -analyzer-checker=core \ // RUN: -analyzer-checker=cplusplus.NewDelete \ // RUN: -analyzer-checker=cplusplus.NewDeleteLeaks // -// RUN: %clang_analyze_cc1 -std=c++17 -fblocks -verify %s \ +// RUN: %clang_analyze_cc1 -std=c++17 -fblocks %s \ // RUN: -verify=expected,leak,inspection \ // RUN: -analyzer-checker=core \ // RUN: -analyzer-checker=cplusplus.NewDeleteLeaks \ @@ -503,3 +503,75 @@ namespace optional_union { custom_union_t a; } // leak-warning{{Potential leak of memory pointed to by 'a.present.q'}} } + +namespace gh153782 { + +// Ensure we do not regress on the following use case. + +namespace mutually_exclusive_test_case_1 { +struct StorageWrapper { + // Imagine the destructor and copy constructor both call a reset() function (among other things). + ~StorageWrapper() { delete parts; } + StorageWrapper(StorageWrapper const&) = default; + + // Mind that there is no `parts = other.parts` assignment -- this is the bug we would like to find. + void operator=(StorageWrapper&& other) { delete parts; } // newdelete-warning{{Attempt to release already released memory}} + + // Not provided, typically would do `parts = new long`. + StorageWrapper(); + + long* parts; +}; + +void test_non_trivial_struct_assignment() { + StorageWrapper* object = new StorageWrapper[]{StorageWrapper()}; + object[0] = StorageWrapper(); // This assignment leads to the double-free. +} +} // mutually_exclusive_test_case_1 + +namespace mutually_exclusive_test_case_2 { +struct StorageWrapper { + // Imagine the destructor and copy constructor both call a reset() function (among other things). + ~StorageWrapper() { delete parts; } + StorageWrapper(StorageWrapper const&) = default; + + // Mind that there is no `parts = other.parts` assignment -- this is the bug we would like to find. + void operator=(StorageWrapper&& other) { delete parts; } + + // Not provided, typically would do `parts = new long`. + StorageWrapper(); + + long* parts; +}; + +void test_non_trivial_struct_assignment() { + StorageWrapper* object = new StorageWrapper[]{StorageWrapper()}; + // object[0] = StorageWrapper(); // Remove the source of double free to make the potential leak appear. 
+} // leak-warning{{Potential leak of memory pointed to by 'object'}} +} // mutually_exclusive_test_case_2 + +namespace mutually_exclusive_test_case_3 { +struct StorageWrapper { + // Imagine the destructor and copy constructor both call a reset() function (among other things). + ~StorageWrapper() { delete parts; } + StorageWrapper(StorageWrapper const&) = default; + + // Mind that there is no `parts = other.parts` assignment -- this is the bug we would like to find. + void operator=(StorageWrapper&& other) { delete parts; } // newdelete-warning{{Attempt to release already released memory}} + + // Not provided, typically would do `parts = new long`. + StorageWrapper(); + + long* parts; +}; + +struct TestDoubleFreeWithInitializerList { + StorageWrapper* Object; + TestDoubleFreeWithInitializerList() + : Object(new StorageWrapper[]{StorageWrapper()}) { + Object[0] = StorageWrapper(); // This assignment leads to the double-free. + } +}; +} // mutually_exclusive_test_case_3 + +} // namespace gh153782 diff --git a/clang/test/Analysis/ctor-trivial-copy.cpp b/clang/test/Analysis/ctor-trivial-copy.cpp index 940ff9ba3ed9c..44990fc631d6d 100644 --- a/clang/test/Analysis/ctor-trivial-copy.cpp +++ b/clang/test/Analysis/ctor-trivial-copy.cpp @@ -5,8 +5,6 @@ void clang_analyzer_printState(); template <typename T> void clang_analyzer_dump_lref(T& param); template <typename T> void clang_analyzer_dump_val(T param); -template <typename T> void clang_analyzer_denote(T param, const char *name); -template <typename T> void clang_analyzer_express(T param); template <typename T> T conjure(); template <typename... Ts> void nop(const Ts &... args) {} @@ -42,10 +40,16 @@ void test_assign_return() { namespace trivial_struct_copy { void _01_empty_structs() { - clang_analyzer_dump_val(conjure<empty>()); // expected-warning {{conj_$}} + clang_analyzer_dump_val(conjure<empty>()); // expected-warning {{lazyCompoundVal}} empty Empty = conjure<empty>(); empty Empty2 = Empty; empty Empty3 = Empty2; + // All of these should refer to the exact same LCV, because all of + // these trivial copies refer to the original conjured value. + // There were Unknown before: + clang_analyzer_dump_val(Empty); // expected-warning {{lazyCompoundVal}} + clang_analyzer_dump_val(Empty2); // expected-warning {{lazyCompoundVal}} + clang_analyzer_dump_val(Empty3); // expected-warning {{lazyCompoundVal}} // We only have binding for the original Empty object, because copying empty // objects is a no-op in the performTrivialCopy. This is fine, because empty @@ -67,20 +71,18 @@ void _01_empty_structs() { } void _02_structs_with_members() { - clang_analyzer_dump_val(conjure<aggr>()); // expected-warning {{conj_$}} + clang_analyzer_dump_val(conjure<aggr>()); // expected-warning {{lazyCompoundVal}} aggr Aggr = conjure<aggr>(); aggr Aggr2 = Aggr; aggr Aggr3 = Aggr2; - // All of these should refer to the exact same symbol, because all of + // All of these should refer to the exact same LCV, because all of // these trivial copies refer to the original conjured value. - clang_analyzer_denote(Aggr, "$Aggr"); - clang_analyzer_express(Aggr); // expected-warning {{$Aggr}} - clang_analyzer_express(Aggr2); // expected-warning {{$Aggr}} - clang_analyzer_express(Aggr3); // expected-warning {{$Aggr}} - - // We should have the same Conjured symbol for "Aggr", "Aggr2" and "Aggr3". - // We used to have Derived symbols for the individual fields that were - // copied as part of copying the whole struct. 
+ clang_analyzer_dump_val(Aggr); // expected-warning {{lazyCompoundVal}} + clang_analyzer_dump_val(Aggr2); // expected-warning {{lazyCompoundVal}} + clang_analyzer_dump_val(Aggr3); // expected-warning {{lazyCompoundVal}} + + // We have fields in the struct we copy, thus we also have the entries for the copies + // (and for all of their fields). clang_analyzer_printState(); // CHECK: "store": { "pointer": "0x{{[0-9a-f]+}}", "items": [ // CHECK-NEXT: { "cluster": "GlobalInternalSpaceRegion", "pointer": "0x{{[0-9a-f]+}}", "items": [ @@ -93,10 +95,12 @@ void _02_structs_with_members() { // CHECK-NEXT: { "kind": "Default", "offset": 0, "value": "[[AGGR_CONJ:conj_\$[0-9]+{int, LC[0-9]+, S[0-9]+, #[0-9]+}]]" } // CHECK-NEXT: ]}, // CHECK-NEXT: { "cluster": "Aggr2", "pointer": "0x{{[0-9a-f]+}}", "items": [ - // CHECK-NEXT: { "kind": "Default", "offset": 0, "value": "[[AGGR_CONJ]]" } + // CHECK-NEXT: { "kind": "Direct", "offset": 0, "value": "derived_${{[0-9]+}}{[[AGGR_CONJ]],Aggr.x}" }, + // CHECK-NEXT: { "kind": "Direct", "offset": 32, "value": "derived_${{[0-9]+}}{[[AGGR_CONJ]],Aggr.y}" } // CHECK-NEXT: ]}, // CHECK-NEXT: { "cluster": "Aggr3", "pointer": "0x{{[0-9a-f]+}}", "items": [ - // CHECK-NEXT: { "kind": "Default", "offset": 0, "value": "[[AGGR_CONJ]]" } + // CHECK-NEXT: { "kind": "Direct", "offset": 0, "value": "derived_${{[0-9]+}}{[[AGGR_CONJ]],Aggr.x}" }, + // CHECK-NEXT: { "kind": "Direct", "offset": 32, "value": "derived_${{[0-9]+}}{[[AGGR_CONJ]],Aggr.y}" } // CHECK-NEXT: ]} // CHECK-NEXT: ]}, @@ -113,3 +117,31 @@ void entrypoint() { } } // namespace trivial_struct_copy + +namespace gh153782 { + +// Ensure we do not regress on the following use cases. +// The assumption made on a field in `setPtr` should apply to the returned copy in `func`. +struct Status { int error; }; +Status getError(); + +Status setPtr(int **outptr, int* ptr) { + Status e = getError(); + if (e.error != 0) return e; // When assuming the error field is non-zero, + *outptr = ptr; // this is not executed + return e; +} + +int func() { + int *ptr = nullptr; + int x = 42; + if (setPtr(&ptr, &x).error == 0) { + // The assumption made in get() SHOULD match the assumption about + // the returned value, hence the engine SHOULD NOT assume ptr is null. 
+ clang_analyzer_dump_val(ptr); // expected-warning {{&x}} + return *ptr; + } + return 0; +} + +} // namespace gh153782 diff --git a/clang/test/Analysis/explain-svals.cpp b/clang/test/Analysis/explain-svals.cpp index dfc650223c9e7..9474aa7c7dbb1 100644 --- a/clang/test/Analysis/explain-svals.cpp +++ b/clang/test/Analysis/explain-svals.cpp @@ -99,7 +99,7 @@ class C { } // end of anonymous namespace void test_6() { - clang_analyzer_explain(conjure_S()); // expected-warning-re{{{{^symbol of type 'int' conjured at CFG element 'conjure_S\(\) \(CXXRecordTypedCall, \+0\)'$}}}} + clang_analyzer_explain(conjure_S()); // expected-warning-re{{{{^lazily frozen compound value of 1st parameter of function 'clang_analyzer_explain\(\)'$}}}} clang_analyzer_explain(conjure_S().z); // expected-warning-re{{{{^value derived from \(symbol of type 'int' conjured at CFG element 'conjure_S\(\) \(CXXRecordTypedCall, \)'\) for field 'z' of temporary object constructed at statement 'conjure_S\(\)'$}}}} } diff --git a/clang/test/Analysis/iterator-modeling.cpp b/clang/test/Analysis/iterator-modeling.cpp index 78882da4431fd..f1538839d06c8 100644 --- a/clang/test/Analysis/iterator-modeling.cpp +++ b/clang/test/Analysis/iterator-modeling.cpp @@ -2035,7 +2035,6 @@ void print_state(std::vector<int> &V) { // CHECK: "checker_messages": [ // CHECK: { "checker": "alpha.cplusplus.IteratorModeling", "messages": [ // CHECK-NEXT: "Iterator Positions :", - // CHECK-NEXT: "conj_$[[#]]{int, LC[[#]], S[[#]], #[[#]]} : Valid ; Container == SymRegion{reg_$[[#]]<std::vector<int> & V>} ; Offset == conj_$[[#]]{long, LC[[#]], S[[#]], #[[#]]}", // CHECK-NEXT: "i0 : Valid ; Container == SymRegion{reg_$[[#]]<std::vector<int> & V>} ; Offset == conj_$[[#]]{long, LC[[#]], S[[#]], #[[#]]}" // CHECK-NEXT: ]} @@ -2046,7 +2045,6 @@ void print_state(std::vector<int> &V) { // CHECK: "checker_messages": [ // CHECK: { "checker": "alpha.cplusplus.IteratorModeling", "messages": [ // CHECK-NEXT: "Iterator Positions :", - // CHECK-NEXT: "conj_$[[#]]{int, LC[[#]], S[[#]], #[[#]]} : Valid ; Container == SymRegion{reg_$[[#]]<std::vector<int> & V>} ; Offset == conj_$[[#]]{long, LC[[#]], S[[#]], #[[#]]}", // CHECK-NEXT: "i1 : Valid ; Container == SymRegion{reg_$[[#]]<std::vector<int> & V>} ; Offset == conj_$[[#]]{long, LC[[#]], S[[#]], #[[#]]}" // CHECK-NEXT: ]} diff --git a/clang/test/Analysis/stl-algorithm-modeling-aggressive-std-find-modeling.cpp b/clang/test/Analysis/stl-algorithm-modeling-aggressive-std-find-modeling.cpp index 191af95cd2b9c..98301cf7274fc 100644 --- a/clang/test/Analysis/stl-algorithm-modeling-aggressive-std-find-modeling.cpp +++ b/clang/test/Analysis/stl-algorithm-modeling-aggressive-std-find-modeling.cpp @@ -4,16 +4,6 @@ // RUN: -analyzer-config alpha.cplusplus.STLAlgorithmModeling:AggressiveStdFindModeling=true\ // RUN: -verify -// STLAlgorithmModeling and DebugIteratorModeling are probably bugged because -// these tests started failing after we just directly copy the symbol -// representing the value of a variable instead of creating a LazyCompoundVal -// of that single conjured value. -// In theory, it shouldn't matter if we eagerly copy the value that we would -// "load" from the LCV once requested or just directly binding the backing symbol. -// Yet, these tests fail, so there is likely messed up how/what the checker -// metadata is associated with. 
-// XFAIL: * - #include "Inputs/system-header-simulator-cxx.h" void clang_analyzer_eval(bool); diff --git a/clang/test/Analysis/stl-algorithm-modeling.cpp b/clang/test/Analysis/stl-algorithm-modeling.cpp index f7029c79b0942..5549c24a8c220 100644 --- a/clang/test/Analysis/stl-algorithm-modeling.cpp +++ b/clang/test/Analysis/stl-algorithm-modeling.cpp @@ -3,16 +3,6 @@ // RUN: -analyzer-config aggressive-binary-operation-simplification=true\ // RUN: -verify -// STLAlgorithmModeling and DebugIteratorModeling are probably bugged because -// these tests started failing after we just directly copy the symbol -// representing the value of a variable instead of creating a LazyCompoundVal -// of that single conjured value. -// In theory, it shouldn't matter if we eagerly copy the value that we would -// "load" from the LCV once requested or just directly binding the backing symbol. -// Yet, these tests fail, so there is likely messed up how/what the checker -// metadata is associated with. -// XFAIL: * - #include "Inputs/system-header-simulator-cxx.h" void clang_analyzer_eval(bool); diff --git a/clang/test/Analysis/store-dump-orders.cpp b/clang/test/Analysis/store-dump-orders.cpp index dbe93f1c5183a..d99f581f00fe1 100644 --- a/clang/test/Analysis/store-dump-orders.cpp +++ b/clang/test/Analysis/store-dump-orders.cpp @@ -41,7 +41,7 @@ void test_output(int n) { // CHECK-NEXT: { "kind": "Default", "offset": 0, "value": "conj_$ // CHECK-NEXT: ]}, // CHECK-NEXT: { "cluster": "objfirst", "pointer": "0x{{[0-9a-f]+}}", "items": [ - // CHECK-NEXT: { "kind": "Default", "offset": 0, "value": "conj_$ + // CHECK-NEXT: { "kind": "Default", "offset": 0, "value": "lazyCompoundVal // CHECK-NEXT: { "kind": "Direct", "offset": 320, "value": "1 S32b" }, // CHECK-NEXT: { "kind": "Direct", "offset": 352, "value": "2 S32b" }, // CHECK-NEXT: { "kind": "Direct", "offset": 384, "value": "3 S32b" } diff --git a/clang/test/Analysis/taint-generic.cpp b/clang/test/Analysis/taint-generic.cpp index fc7c37300d3fc..4b8d9ab68ff84 100644 --- a/clang/test/Analysis/taint-generic.cpp +++ b/clang/test/Analysis/taint-generic.cpp @@ -158,7 +158,11 @@ void top() { clang_analyzer_isTainted(E); // expected-warning {{NO}} Aggr A = mySource1<Aggr>(); - clang_analyzer_isTainted(A); // expected-warning {{YES}} + // FIXME Ideally, both A and A.data should be tainted. However, the + // implementation used by e5ac9145ba29 ([analyzer][taint] Recognize + // tainted LazyCompoundVals (4/4) (#115919), 2024-11-15) led to FPs and + // FNs in various scenarios and had to be reverted to fix #153782. + clang_analyzer_isTainted(A); // expected-warning {{NO}} clang_analyzer_isTainted(A.data); // expected-warning {{YES}} } } // namespace gh114270 diff --git a/clang/test/Analysis/template-param-objects.cpp b/clang/test/Analysis/template-param-objects.cpp index b065f8756d4d8..dde95fa62cb65 100644 --- a/clang/test/Analysis/template-param-objects.cpp +++ b/clang/test/Analysis/template-param-objects.cpp @@ -11,7 +11,7 @@ bool operator ==(Box lhs, Box rhs) { return lhs.value == rhs.value; } template <Box V> void dumps() { - clang_analyzer_dump(V); // expected-warning {{Unknown}} + clang_analyzer_dump(V); // expected-warning {{lazyCompoundVal}} clang_analyzer_dump(&V); // expected-warning {{Unknown}} clang_analyzer_dump(V.value); // expected-warning {{Unknown}} FIXME: It should be '6 S32b'. 
   clang_analyzer_dump(&V.value); // expected-warning {{Unknown}}

From 2dc0fa1000d651c902edad7d206785de7efc75f3 Mon Sep 17 00:00:00 2001
From: Michael Kruse <llvm-project@meinersbur.de>
Date: Tue, 4 Nov 2025 17:43:17 +0100
Subject: [PATCH 188/313] [Flang] Nested directives are comments (#166348)

Directives cannot be nested. A directive sentinel that appears within
another directive should be ignored, with the line instead falling back
to being treated as an ordinary comment.

Fixes: #165874
---
 flang/lib/Parser/prescan.cpp                  | 2 +-
 flang/test/Parser/OpenMP/nested-directive.f90 | 7 +++++++
 2 files changed, 8 insertions(+), 1 deletion(-)
 create mode 100644 flang/test/Parser/OpenMP/nested-directive.f90

diff --git a/flang/lib/Parser/prescan.cpp b/flang/lib/Parser/prescan.cpp
index efce8fc3d2e35..8cccd84f9fa19 100644
--- a/flang/lib/Parser/prescan.cpp
+++ b/flang/lib/Parser/prescan.cpp
@@ -557,7 +557,7 @@ bool Prescanner::MustSkipToEndOfLine() const {
     return true; // skip over ignored columns in right margin (73:80)
   } else if (*at_ == '!' && !inCharLiteral_ &&
       (!inFixedForm_ || tabInCurrentLine_ || column_ != 6)) {
-    return !IsCompilerDirectiveSentinel(at_ + 1);
+    return InCompilerDirective() || !IsCompilerDirectiveSentinel(at_ + 1);
   } else {
     return false;
   }
diff --git a/flang/test/Parser/OpenMP/nested-directive.f90 b/flang/test/Parser/OpenMP/nested-directive.f90
new file mode 100644
index 0000000000000..2a10bbe666bb8
--- /dev/null
+++ b/flang/test/Parser/OpenMP/nested-directive.f90
@@ -0,0 +1,7 @@
+! RUN: %flang_fc1 -fdebug-unparse -fopenmp %s 2>&1 | FileCheck %s --match-full-lines
+
+subroutine func
+  implicit none
+! CHECK: !$OMP NOTHING
+  !$omp nothing !$omp Cannot nest directives inside directives; must be interpreted as a comment
+end subroutine func

From 89ec96b8b4f4a3115689b045cd64afae1c28044e Mon Sep 17 00:00:00 2001
From: Tim Corringham <timothy.corringham@amd.com>
Date: Tue, 4 Nov 2025 17:04:39 +0000
Subject: [PATCH 189/313] [HLSL] Implement the f16tof32() intrinsic (#165860)

Implement the f16tof32() intrinsic, including DXIL and SPIRV codegen,
and associated tests.
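For readers unfamiliar with the legacy conversion, the per-lane semantics that
the DXIL LegacyF16ToF32 op implements can be sketched in plain C++. This is a
hedged reference model only (it assumes IEEE-754 binary16/binary32 layouts and
is not the actual DXIL or SPIR-V lowering; the function name is illustrative):

#include <cstdint>
#include <cstring>

// Interpret the low 16 bits of X as an IEEE-754 binary16 value and widen
// it to binary32. The upper 16 bits are ignored, matching the legacy
// f16tof32 behavior of only converting the low half-word.
float F16ToF32Reference(uint32_t X) {
  uint32_t H = X & 0xFFFFu;
  uint32_t Sign = (H & 0x8000u) << 16; // sign moves from bit 15 to bit 31
  uint32_t Exp = (H >> 10) & 0x1Fu;
  uint32_t Mant = H & 0x3FFu;
  uint32_t Bits;
  if (Exp == 0x1Fu) {
    Bits = Sign | 0x7F800000u | (Mant << 13); // Inf/NaN: keep the payload
  } else if (Exp != 0) {
    Bits = Sign | ((Exp + 112u) << 23) | (Mant << 13); // rebias 15 -> 127
  } else if (Mant != 0) {
    unsigned Shift = 0; // subnormal half: renormalize into a normal float
    while (!(Mant & 0x400u)) {
      Mant <<= 1;
      ++Shift;
    }
    Bits = Sign | ((113u - Shift) << 23) | ((Mant & 0x3FFu) << 13);
  } else {
    Bits = Sign; // signed zero
  }
  float F;
  std::memcpy(&F, &Bits, sizeof(F));
  return F;
}

On SPIR-V targets the codegen below reaches the same result via the GLSL
UnpackHalf2x16 extended instruction and an extract of lane 0, which avoids
requiring the Int16 and Float16 capabilities.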
Fixes #99112 --------- Co-authored-by: Tim Corringham <tcorring@amd.com> --- clang/include/clang/Basic/Builtins.td | 6 + clang/lib/CodeGen/CGHLSLBuiltins.cpp | 54 +++++++ .../lib/Headers/hlsl/hlsl_alias_intrinsics.h | 21 +++ clang/lib/Sema/SemaHLSL.cpp | 57 ++++++-- .../builtins/f16tof32-builtin.hlsl | 30 ++++ clang/test/CodeGenHLSL/builtins/f16tof32.hlsl | 30 ++++ .../SemaHLSL/BuiltIns/f16tof32-errors.hlsl | 134 ++++++++++++++++++ llvm/include/llvm/IR/IntrinsicsDirectX.td | 3 + llvm/include/llvm/IR/IntrinsicsSPIRV.td | 3 + llvm/lib/Target/DirectX/DXIL.td | 9 ++ .../DirectX/DirectXTargetTransformInfo.cpp | 8 +- .../Target/SPIRV/SPIRVInstructionSelector.cpp | 4 + llvm/test/CodeGen/DirectX/f16tof32.ll | 57 ++++++++ llvm/test/CodeGen/SPIRV/unpackfloat2x16.ll | 18 +++ 14 files changed, 419 insertions(+), 15 deletions(-) create mode 100644 clang/test/CodeGenHLSL/builtins/f16tof32-builtin.hlsl create mode 100644 clang/test/CodeGenHLSL/builtins/f16tof32.hlsl create mode 100644 clang/test/SemaHLSL/BuiltIns/f16tof32-errors.hlsl create mode 100644 llvm/test/CodeGen/DirectX/f16tof32.ll create mode 100644 llvm/test/CodeGen/SPIRV/unpackfloat2x16.ll diff --git a/clang/include/clang/Basic/Builtins.td b/clang/include/clang/Basic/Builtins.td index 2b400b012d6ed..0275447e1090a 100644 --- a/clang/include/clang/Basic/Builtins.td +++ b/clang/include/clang/Basic/Builtins.td @@ -5235,6 +5235,12 @@ def HLSLGetSpirvSpecConstant : LangBuiltin<"HLSL_LANG">, HLSLScalarTemplate { let Prototype = "T(unsigned int, T)"; } +def HLSLF16ToF32 : LangBuiltin<"HLSL_LANG"> { + let Spellings = ["__builtin_hlsl_elementwise_f16tof32"]; + let Attributes = [NoThrow, Const, CustomTypeChecking]; + let Prototype = "void(...)"; +} + // Builtins for XRay. def XRayCustomEvent : Builtin { let Spellings = ["__xray_customevent"]; diff --git a/clang/lib/CodeGen/CGHLSLBuiltins.cpp b/clang/lib/CodeGen/CGHLSLBuiltins.cpp index fbf4a5722caed..b6928ce7d9c44 100644 --- a/clang/lib/CodeGen/CGHLSLBuiltins.cpp +++ b/clang/lib/CodeGen/CGHLSLBuiltins.cpp @@ -160,6 +160,57 @@ static Value *handleHlslSplitdouble(const CallExpr *E, CodeGenFunction *CGF) { return LastInst; } +static Value *handleElementwiseF16ToF32(CodeGenFunction &CGF, + const CallExpr *E) { + Value *Op0 = CGF.EmitScalarExpr(E->getArg(0)); + QualType Op0Ty = E->getArg(0)->getType(); + llvm::Type *ResType = CGF.FloatTy; + uint64_t NumElements = 0; + if (Op0->getType()->isVectorTy()) { + NumElements = + E->getArg(0)->getType()->castAs<clang::VectorType>()->getNumElements(); + ResType = + llvm::VectorType::get(ResType, ElementCount::getFixed(NumElements)); + } + if (!Op0Ty->hasUnsignedIntegerRepresentation()) + llvm_unreachable( + "f16tof32 operand must have an unsigned int representation"); + + if (CGF.CGM.getTriple().isDXIL()) + return CGF.Builder.CreateIntrinsic(ResType, Intrinsic::dx_legacyf16tof32, + ArrayRef<Value *>{Op0}, nullptr, + "hlsl.f16tof32"); + + if (CGF.CGM.getTriple().isSPIRV()) { + // We use the SPIRV UnpackHalf2x16 operation to avoid the need for the + // Int16 and Float16 capabilities + auto UnpackType = + llvm::VectorType::get(CGF.FloatTy, ElementCount::getFixed(2)); + if (NumElements == 0) { + // a scalar input - simply extract the first element of the unpacked + // vector + Value *Unpack = CGF.Builder.CreateIntrinsic( + UnpackType, Intrinsic::spv_unpackhalf2x16, ArrayRef<Value *>{Op0}); + return CGF.Builder.CreateExtractElement(Unpack, (uint64_t)0); + } else { + // a vector input - build a congruent output vector by iterating through + // the input vector calling 
unpackhalf2x16 for each element
+      Value *Result = PoisonValue::get(ResType);
+      for (uint64_t i = 0; i < NumElements; i++) {
+        Value *InVal = CGF.Builder.CreateExtractElement(Op0, i);
+        Value *Unpack = CGF.Builder.CreateIntrinsic(
+            UnpackType, Intrinsic::spv_unpackhalf2x16,
+            ArrayRef<Value *>{InVal});
+        Value *Res = CGF.Builder.CreateExtractElement(Unpack, (uint64_t)0);
+        Result = CGF.Builder.CreateInsertElement(Result, Res, i);
+      }
+      return Result;
+    }
+  }
+
+  llvm_unreachable("Intrinsic F16ToF32 not supported by target architecture");
+}
+
 static Value *emitBufferStride(CodeGenFunction *CGF, const Expr *HandleExpr,
                                LValue &Stride) {
   // Figure out the stride of the buffer elements from the handle type.
@@ -579,6 +630,9 @@ Value *CodeGenFunction::EmitHLSLBuiltinExpr(unsigned BuiltinID,
         /*ReturnType=*/X->getType(), CGM.getHLSLRuntime().getDegreesIntrinsic(),
         ArrayRef<Value *>{X}, nullptr, "hlsl.degrees");
   }
+  case Builtin::BI__builtin_hlsl_elementwise_f16tof32: {
+    return handleElementwiseF16ToF32(*this, E);
+  }
   case Builtin::BI__builtin_hlsl_elementwise_frac: {
     Value *Op0 = EmitScalarExpr(E->getArg(0));
     if (!E->getArg(0)->getType()->hasFloatingRepresentation())
diff --git a/clang/lib/Headers/hlsl/hlsl_alias_intrinsics.h b/clang/lib/Headers/hlsl/hlsl_alias_intrinsics.h
index a918af39e4074..4c5861c2c5f9d 100644
--- a/clang/lib/Headers/hlsl/hlsl_alias_intrinsics.h
+++ b/clang/lib/Headers/hlsl/hlsl_alias_intrinsics.h
@@ -1052,6 +1052,27 @@ float3 exp2(float3);
 _HLSL_BUILTIN_ALIAS(__builtin_elementwise_exp2)
 float4 exp2(float4);
 
+//===----------------------------------------------------------------------===//
+// f16tof32 builtins
+//===----------------------------------------------------------------------===//
+
+/// \fn float f16tof32(uint x)
+/// \brief Returns the half value stored in the low 16 bits of the uint
+/// argument, converted to a float.
+/// \param x The uint containing two half values.
+///
+/// The float value of the half value found in the low 16 bits of the \a x
+/// parameter.
+ +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_f16tof32) +float f16tof32(uint); +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_f16tof32) +float2 f16tof32(uint2); +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_f16tof32) +float3 f16tof32(uint3); +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_f16tof32) +float4 f16tof32(uint4); + //===----------------------------------------------------------------------===// // firstbithigh builtins //===----------------------------------------------------------------------===// diff --git a/clang/lib/Sema/SemaHLSL.cpp b/clang/lib/Sema/SemaHLSL.cpp index 94a490a8f68dc..b9707f0036765 100644 --- a/clang/lib/Sema/SemaHLSL.cpp +++ b/clang/lib/Sema/SemaHLSL.cpp @@ -2802,6 +2802,23 @@ static bool CheckUnsignedIntRepresentation(Sema *S, SourceLocation Loc, return false; } +static bool CheckExpectedBitWidth(Sema *S, CallExpr *TheCall, + unsigned ArgOrdinal, unsigned Width) { + QualType ArgTy = TheCall->getArg(0)->getType(); + if (auto *VTy = ArgTy->getAs<VectorType>()) + ArgTy = VTy->getElementType(); + // ensure arg type has expected bit width + uint64_t ElementBitCount = + S->getASTContext().getTypeSizeInChars(ArgTy).getQuantity() * 8; + if (ElementBitCount != Width) { + S->Diag(TheCall->getArg(0)->getBeginLoc(), + diag::err_integer_incorrect_bit_count) + << Width << ElementBitCount; + return true; + } + return false; +} + static void SetElementTypeAsReturnType(Sema *S, CallExpr *TheCall, QualType ReturnType) { auto *VecTyA = TheCall->getArg(0)->getType()->getAs<VectorType>(); @@ -2961,24 +2978,16 @@ bool SemaHLSL::CheckBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall) { CheckUnsignedIntVecRepresentation)) return true; - auto *VTy = TheCall->getArg(0)->getType()->getAs<VectorType>(); // ensure arg integers are 32-bits - uint64_t ElementBitCount = getASTContext() - .getTypeSizeInChars(VTy->getElementType()) - .getQuantity() * - 8; - if (ElementBitCount != 32) { - SemaRef.Diag(TheCall->getBeginLoc(), - diag::err_integer_incorrect_bit_count) - << 32 << ElementBitCount; + if (CheckExpectedBitWidth(&SemaRef, TheCall, 0, 32)) return true; - } // ensure both args are vectors of total bit size of a multiple of 64 + auto *VTy = TheCall->getArg(0)->getType()->getAs<VectorType>(); int NumElementsArg = VTy->getNumElements(); if (NumElementsArg != 2 && NumElementsArg != 4) { SemaRef.Diag(TheCall->getBeginLoc(), diag::err_vector_incorrect_bit_count) - << 1 /*a multiple of*/ << 64 << NumElementsArg * ElementBitCount; + << 1 /*a multiple of*/ << 64 << NumElementsArg * 32; return true; } @@ -3295,7 +3304,7 @@ bool SemaHLSL::CheckBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall) { break; } // Note these are llvm builtins that we want to catch invalid intrinsic - // generation. Normal handling of these builitns will occur elsewhere. + // generation. Normal handling of these builtins will occur elsewhere. 
case Builtin::BI__builtin_elementwise_bitreverse: { // does not include a check for number of arguments // because that is done previously @@ -3405,6 +3414,30 @@ bool SemaHLSL::CheckBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall) { } break; } + case Builtin::BI__builtin_hlsl_elementwise_f16tof32: { + if (SemaRef.checkArgCount(TheCall, 1)) + return true; + if (CheckAllArgTypesAreCorrect(&SemaRef, TheCall, + CheckUnsignedIntRepresentation)) + return true; + // ensure arg integers are 32 bits + if (CheckExpectedBitWidth(&SemaRef, TheCall, 0, 32)) + return true; + // check it wasn't a bool type + QualType ArgTy = TheCall->getArg(0)->getType(); + if (auto *VTy = ArgTy->getAs<VectorType>()) + ArgTy = VTy->getElementType(); + if (ArgTy->isBooleanType()) { + SemaRef.Diag(TheCall->getArg(0)->getBeginLoc(), + diag::err_builtin_invalid_arg_type) + << 1 << /* scalar or vector of */ 5 << /* unsigned int */ 3 + << /* no fp */ 0 << TheCall->getArg(0)->getType(); + return true; + } + + SetElementTypeAsReturnType(&SemaRef, TheCall, getASTContext().FloatTy); + break; + } } return false; } diff --git a/clang/test/CodeGenHLSL/builtins/f16tof32-builtin.hlsl b/clang/test/CodeGenHLSL/builtins/f16tof32-builtin.hlsl new file mode 100644 index 0000000000000..65dba664bb5ea --- /dev/null +++ b/clang/test/CodeGenHLSL/builtins/f16tof32-builtin.hlsl @@ -0,0 +1,30 @@ +// RUN: %clang_cc1 -std=hlsl202x -finclude-default-header -x hlsl -triple \ +// RUN: dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \ +// RUN: -o - | FileCheck %s + +// CHECK: define hidden noundef nofpclass(nan inf) float +// CHECK: %hlsl.f16tof32 = call reassoc nnan ninf nsz arcp afn float @llvm.dx.legacyf16tof32.i32(i32 %0) +// CHECK: ret float %hlsl.f16tof32 +// CHECK: declare float @llvm.dx.legacyf16tof32.i32(i32) +float test_scalar(uint p0) { return __builtin_hlsl_elementwise_f16tof32(p0); } + +// CHECK: define hidden noundef nofpclass(nan inf) <2 x float> +// CHECK: %hlsl.f16tof32 = call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.dx.legacyf16tof32.v2i32(<2 x i32> %0) +// CHECK: ret <2 x float> %hlsl.f16tof32 +// CHECK: declare <2 x float> @llvm.dx.legacyf16tof32.v2i32(<2 x i32>) +float2 test_uint2(uint2 p0) { return __builtin_hlsl_elementwise_f16tof32(p0); } + +// CHECK: define hidden noundef nofpclass(nan inf) <3 x float> @_Z10test_uint3Dv3_j(<3 x i32> noundef %p0) #0 { +// CHECK: %hlsl.f16tof32 = call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.dx.legacyf16tof32.v3i32(<3 x i32> %0) +// CHECK: ret <3 x float> %hlsl.f16tof32 +// CHECK: declare <3 x float> @llvm.dx.legacyf16tof32.v3i32(<3 x i32>) +float3 test_uint3(uint3 p0) { return __builtin_hlsl_elementwise_f16tof32(p0); } + +// CHECK: define hidden noundef nofpclass(nan inf) <4 x float> @_Z10test_uint4Dv4_j(<4 x i32> noundef %p0) #0 { +// CHECK: %hlsl.f16tof32 = call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.dx.legacyf16tof32.v4i32(<4 x i32> %0) +// CHECK: ret <4 x float> %hlsl.f16tof32 +// CHECK: declare <4 x float> @llvm.dx.legacyf16tof32.v4i32(<4 x i32>) +float4 test_uint4(uint4 p0) { return __builtin_hlsl_elementwise_f16tof32(p0); } + + + diff --git a/clang/test/CodeGenHLSL/builtins/f16tof32.hlsl b/clang/test/CodeGenHLSL/builtins/f16tof32.hlsl new file mode 100644 index 0000000000000..b68bc197f16c5 --- /dev/null +++ b/clang/test/CodeGenHLSL/builtins/f16tof32.hlsl @@ -0,0 +1,30 @@ +// RUN: %clang_cc1 -std=hlsl202x -finclude-default-header -x hlsl -triple \ +// RUN: dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \ +// RUN: -o - | 
FileCheck %s + +// CHECK: define hidden noundef nofpclass(nan inf) float +// CHECK: %hlsl.f16tof32 = call reassoc nnan ninf nsz arcp afn float @llvm.dx.legacyf16tof32.i32(i32 %0) +// CHECK: ret float %hlsl.f16tof32 +// CHECK: declare float @llvm.dx.legacyf16tof32.i32(i32) +float test_scalar(uint p0) { return f16tof32(p0); } + +// CHECK: define hidden noundef nofpclass(nan inf) <2 x float> +// CHECK: %hlsl.f16tof32 = call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.dx.legacyf16tof32.v2i32(<2 x i32> %0) +// CHECK: ret <2 x float> %hlsl.f16tof32 +// CHECK: declare <2 x float> @llvm.dx.legacyf16tof32.v2i32(<2 x i32>) +float2 test_uint2(uint2 p0) { return f16tof32(p0); } + +// CHECK: define hidden noundef nofpclass(nan inf) <3 x float> @_Z10test_uint3Dv3_j(<3 x i32> noundef %p0) #0 { +// CHECK: %hlsl.f16tof32 = call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.dx.legacyf16tof32.v3i32(<3 x i32> %0) +// CHECK: ret <3 x float> %hlsl.f16tof32 +// CHECK: declare <3 x float> @llvm.dx.legacyf16tof32.v3i32(<3 x i32>) +float3 test_uint3(uint3 p0) { return f16tof32(p0); } + +// CHECK: define hidden noundef nofpclass(nan inf) <4 x float> @_Z10test_uint4Dv4_j(<4 x i32> noundef %p0) #0 { +// CHECK: %hlsl.f16tof32 = call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.dx.legacyf16tof32.v4i32(<4 x i32> %0) +// CHECK: ret <4 x float> %hlsl.f16tof32 +// CHECK: declare <4 x float> @llvm.dx.legacyf16tof32.v4i32(<4 x i32>) +float4 test_uint4(uint4 p0) { return f16tof32(p0); } + + + diff --git a/clang/test/SemaHLSL/BuiltIns/f16tof32-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/f16tof32-errors.hlsl new file mode 100644 index 0000000000000..8f2f9308ed966 --- /dev/null +++ b/clang/test/SemaHLSL/BuiltIns/f16tof32-errors.hlsl @@ -0,0 +1,134 @@ +// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.6-library %s -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify + +float builtin_f16tof32_too_few_arg() { + return __builtin_hlsl_elementwise_f16tof32(); + // expected-error@-1 {{too few arguments to function call, expected 1, have 0}} + // expected-note@hlsl/hlsl_alias_intrinsics.h:* 4 {{candidate function not viable: requires 1 argument, but 0 were provided}} +} + +float builtin_f16tof32_too_many_arg(uint p0) { + return __builtin_hlsl_elementwise_f16tof32(p0, p0); + // expected-error@-1 {{too many arguments to function call, expected 1, have 2}} + // expected-note@hlsl/hlsl_alias_intrinsics.h:* 4 {{candidate function not viable: requires 1 argument, but 2 were provided}} +} + +float builtin_f16tof32_bool(bool p0) { + return __builtin_hlsl_elementwise_f16tof32(p0); + // expected-error@-1 {{1st argument must be a scalar or vector of unsigned integer types (was 'bool')}} +} + +float builtin_f16tof32_bool4(bool4 p0) { + return __builtin_hlsl_elementwise_f16tof32(p0); + // expected-error@-1 {{1st argument must be a scalar or vector of unsigned integer types (was 'bool4' (aka 'vector<bool, 4>')}} +} + +float builtin_f16tof32_short(short p0) { + return __builtin_hlsl_elementwise_f16tof32(p0); + // expected-error@-1 {{1st argument must be a scalar or vector of unsigned integer types (was 'short')}} +} + +float builtin_f16tof32_unsigned_short(unsigned short p0) { + return __builtin_hlsl_elementwise_f16tof32(p0); + // expected-error@-1 {{incorrect number of bits in integer (expected 32 bits, have 16)}} +} + +float builtin_f16tof32_int(int p0) { + return __builtin_hlsl_elementwise_f16tof32(p0); + // expected-error@-1 {{1st argument must be a scalar or vector of unsigned integer types 
(was 'int')}} +} + +float builtin_f16tof32_int64_t(long p0) { + return __builtin_hlsl_elementwise_f16tof32(p0); + // expected-error@-1 {{1st argument must be a scalar or vector of unsigned integer types (was 'long')}} +} + +float2 builtin_f16tof32_int2_to_float2_promotion(int2 p0) { + return __builtin_hlsl_elementwise_f16tof32(p0); + // expected-error@-1 {{1st argument must be a scalar or vector of unsigned integer types (was 'int2' (aka 'vector<int, 2>'))}} +} + +float builtin_f16tof32_half(half p0) { + return __builtin_hlsl_elementwise_f16tof32(p0); + // expected-error@-1 {{1st argument must be a scalar or vector of unsigned integer types (was 'half')}} +} + +float builtin_f16tof32_half4(half4 p0) { + return __builtin_hlsl_elementwise_f16tof32(p0); + // expected-error@-1 {{1st argument must be a scalar or vector of unsigned integer types (was 'half4' (aka 'vector<half, 4>'))}} +} + +float builtin_f16tof32_float(float p0) { + return __builtin_hlsl_elementwise_f16tof32(p0); + // expected-error@-1 {{1st argument must be a scalar or vector of unsigned integer types (was 'float')}} +} + +float builtin_f16tof32_double(double p0) { + return __builtin_hlsl_elementwise_f16tof32(p0); + // expected-error@-1 {{1st argument must be a scalar or vector of unsigned integer types (was 'double')}} +} + +float f16tof32_too_few_arg() { + return f16tof32(); + // expected-error@-1 {{no matching function for call to 'f16tof32'}} +} + +float f16tof32_too_many_arg(uint p0) { + return f16tof32(p0, p0); + // expected-error@-1 {{no matching function for call to 'f16tof32'}} +} + +float f16tof32_bool(bool p0) { + return f16tof32(p0); + // expected-error@-1 {{1st argument must be a scalar or vector of unsigned integer types (was 'bool')}} +} + +float f16tof32_bool3(bool3 p0) { + return f16tof32(p0); + // expected-error@-1 {{1st argument must be a scalar or vector of unsigned integer types (was 'bool3' (aka 'vector<bool, 3>'))}} +} + + +float f16tof32_int16_t(short p0) { + return f16tof32(p0); + // expected-error@-1 {{1st argument must be a scalar or vector of unsigned integer types (was 'short')}} +} + +float f16tof32_int16_t(unsigned short p0) { + return f16tof32(p0); + // expected-error@-1 {{incorrect number of bits in integer (expected 32 bits, have 16)}} +} + +float f16tof32_int(int p0) { + return f16tof32(p0); + // expected-error@-1 {{1st argument must be a scalar or vector of unsigned integer types (was 'int')}} +} + +float f16tof32_int64_t(long p0) { + return f16tof32(p0); + // expected-error@-1 {{1st argument must be a scalar or vector of unsigned integer types (was 'long')}} +} + +float2 f16tof32_int2_to_float2_promotion(int3 p0) { + return f16tof32(p0); + // expected-error@-1 {{1st argument must be a scalar or vector of unsigned integer types (was 'int3' (aka 'vector<int, 3>'))}} +} + +float f16tof32_half(half p0) { + return f16tof32(p0); + // expected-error@-1 {{1st argument must be a scalar or vector of unsigned integer types (was 'half')}} +} + +float f16tof32_half2(half2 p0) { + return f16tof32(p0); + // expected-error@-1 {{1st argument must be a scalar or vector of unsigned integer types (was 'half2' (aka 'vector<half, 2>'))}} +} + +float f16tof32_float(float p0) { + return f16tof32(p0); + // expected-error@-1 {{1st argument must be a scalar or vector of unsigned integer types (was 'float')}} +} + +float f16tof32_double(double p0) { + return f16tof32(p0); + // expected-error@-1 {{1st argument must be a scalar or vector of unsigned integer types (was 'double')}} +} diff --git 
a/llvm/include/llvm/IR/IntrinsicsDirectX.td b/llvm/include/llvm/IR/IntrinsicsDirectX.td index d6b85630eb979..9924b905aee63 100644 --- a/llvm/include/llvm/IR/IntrinsicsDirectX.td +++ b/llvm/include/llvm/IR/IntrinsicsDirectX.td @@ -140,6 +140,9 @@ def int_dx_isinf : DefaultAttrsIntrinsic<[LLVMScalarOrSameVectorWidth<0, llvm_i1 def int_dx_isnan : DefaultAttrsIntrinsic<[LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>], [llvm_anyfloat_ty], [IntrNoMem]>; +def int_dx_legacyf16tof32 : DefaultAttrsIntrinsic<[LLVMScalarOrSameVectorWidth<0, llvm_float_ty>], + [llvm_anyint_ty], [IntrNoMem]>; + def int_dx_lerp : DefaultAttrsIntrinsic<[LLVMMatchType<0>], [llvm_anyfloat_ty, LLVMMatchType<0>,LLVMMatchType<0>], [IntrNoMem]>; diff --git a/llvm/include/llvm/IR/IntrinsicsSPIRV.td b/llvm/include/llvm/IR/IntrinsicsSPIRV.td index bc51fb639fd75..f39c6cda2c579 100644 --- a/llvm/include/llvm/IR/IntrinsicsSPIRV.td +++ b/llvm/include/llvm/IR/IntrinsicsSPIRV.td @@ -200,4 +200,7 @@ def int_spv_resource_nonuniformindex def int_spv_generic_cast_to_ptr_explicit : DefaultAttrsIntrinsic<[llvm_anyptr_ty], [generic_ptr_ty], [IntrNoMem, NoUndef<RetIndex>]>; + + def int_spv_unpackhalf2x16 : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [llvm_i32_ty], [IntrNoMem]>; + } diff --git a/llvm/lib/Target/DirectX/DXIL.td b/llvm/lib/Target/DirectX/DXIL.td index 7ae500a55b92d..67437f6969b27 100644 --- a/llvm/lib/Target/DirectX/DXIL.td +++ b/llvm/lib/Target/DirectX/DXIL.td @@ -1079,6 +1079,15 @@ def WaveActiveOp : DXILOp<119, waveActiveOp> { let attributes = [Attributes<DXIL1_0, []>]; } +def LegacyF16ToF32 : DXILOp<131, legacyF16ToF32> { + let Doc = "returns the float16 stored in the low-half of the uint converted " + "to a float"; + let intrinsics = [IntrinSelect<int_dx_legacyf16tof32>]; + let arguments = [Int32Ty]; + let result = FloatTy; + let stages = [Stages<DXIL1_0, [all_stages]>]; +} + def WaveAllBitCount : DXILOp<135, waveAllOp> { let Doc = "returns the count of bits set to 1 across the wave"; let intrinsics = [IntrinSelect<int_dx_wave_active_countbits>]; diff --git a/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.cpp b/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.cpp index 60dfd9650937c..6cacbf6564db2 100644 --- a/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.cpp +++ b/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.cpp @@ -29,11 +29,12 @@ bool DirectXTTIImpl::isTargetIntrinsicWithOverloadTypeAtArg(Intrinsic::ID ID, int OpdIdx) const { switch (ID) { case Intrinsic::dx_asdouble: - case Intrinsic::dx_isinf: - case Intrinsic::dx_isnan: case Intrinsic::dx_firstbitlow: - case Intrinsic::dx_firstbituhigh: case Intrinsic::dx_firstbitshigh: + case Intrinsic::dx_firstbituhigh: + case Intrinsic::dx_isinf: + case Intrinsic::dx_isnan: + case Intrinsic::dx_legacyf16tof32: return OpdIdx == 0; default: return OpdIdx == -1; @@ -50,6 +51,7 @@ bool DirectXTTIImpl::isTargetIntrinsicTriviallyScalarizable( case Intrinsic::dx_frac: case Intrinsic::dx_isinf: case Intrinsic::dx_isnan: + case Intrinsic::dx_legacyf16tof32: case Intrinsic::dx_rsqrt: case Intrinsic::dx_saturate: case Intrinsic::dx_splitdouble: diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp index 3f0424f436c72..245e5a2894604 100644 --- a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp @@ -3516,6 +3516,10 @@ bool SPIRVInstructionSelector::selectIntrinsic(Register ResVReg, case Intrinsic::spv_resource_nonuniformindex: { return 
selectResourceNonUniformIndex(ResVReg, ResType, I);
   }
+  case Intrinsic::spv_unpackhalf2x16: {
+    return selectExtInst(ResVReg, ResType, I, GL::UnpackHalf2x16);
+  }
+
   default: {
     std::string DiagMsg;
     raw_string_ostream OS(DiagMsg);
diff --git a/llvm/test/CodeGen/DirectX/f16tof32.ll b/llvm/test/CodeGen/DirectX/f16tof32.ll
new file mode 100644
index 0000000000000..edc5c1942e8bd
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/f16tof32.ll
@@ -0,0 +1,57 @@
+; RUN: opt -S -dxil-intrinsic-expansion -scalarizer -dxil-op-lower -mtriple=dxil-pc-shadermodel6.9-library %s | FileCheck %s
+
+define hidden noundef nofpclass(nan inf) float @_Z11test_scalarj(i32 noundef %p0) local_unnamed_addr #0 {
+entry:
+  ; CHECK: [[UINT:%.*]] = call float @dx.op.legacyF16ToF32(i32 131, i32 %p0)
+  ; CHECK: ret float [[UINT]]
+  %hlsl.f16tof32 = tail call reassoc nnan ninf nsz arcp afn float @llvm.dx.legacyf16tof32.i32(i32 %p0)
+  ret float %hlsl.f16tof32
+}
+
+define hidden noundef nofpclass(nan inf) <2 x float> @_Z10test_uint2Dv2_j(<2 x i32> noundef %p0) local_unnamed_addr #0 {
+entry:
+  ; CHECK: [[UINT2_0:%.*]] = extractelement <2 x i32> %p0, i64 0
+  ; CHECK: [[FLOAT_0:%.*]] = call float @dx.op.legacyF16ToF32(i32 131, i32 [[UINT2_0]])
+  ; CHECK: [[UINT2_1:%.*]] = extractelement <2 x i32> %p0, i64 1
+  ; CHECK: [[FLOAT_1:%.*]] = call float @dx.op.legacyF16ToF32(i32 131, i32 [[UINT2_1]])
+  ; CHECK: [[FLOAT2_0:%.*]] = insertelement <2 x float> poison, float [[FLOAT_0]], i64 0
+  ; CHECK: [[FLOAT2_1:%.*]] = insertelement <2 x float> [[FLOAT2_0]], float [[FLOAT_1]], i64 1
+  ; CHECK: ret <2 x float> [[FLOAT2_1]]
+  %hlsl.f16tof32 = tail call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.dx.legacyf16tof32.v2i32(<2 x i32> %p0)
+  ret <2 x float> %hlsl.f16tof32
+}
+
+define hidden noundef nofpclass(nan inf) <3 x float> @_Z10test_uint3Dv3_j(<3 x i32> noundef %p0) local_unnamed_addr #0 {
+entry:
+  ; CHECK: [[UINT3_0:%.*]] = extractelement <3 x i32> %p0, i64 0
+  ; CHECK: [[FLOAT_0:%.*]] = call float @dx.op.legacyF16ToF32(i32 131, i32 [[UINT3_0]])
+  ; CHECK: [[UINT3_1:%.*]] = extractelement <3 x i32> %p0, i64 1
+  ; CHECK: [[FLOAT_1:%.*]] = call float @dx.op.legacyF16ToF32(i32 131, i32 [[UINT3_1]])
+  ; CHECK: [[UINT3_2:%.*]] = extractelement <3 x i32> %p0, i64 2
+  ; CHECK: [[FLOAT_2:%.*]] = call float @dx.op.legacyF16ToF32(i32 131, i32 [[UINT3_2]])
+  ; CHECK: [[FLOAT3_0:%.*]] = insertelement <3 x float> poison, float [[FLOAT_0]], i64 0
+  ; CHECK: [[FLOAT3_1:%.*]] = insertelement <3 x float> [[FLOAT3_0]], float [[FLOAT_1]], i64 1
+  ; CHECK: [[FLOAT3_2:%.*]] = insertelement <3 x float> [[FLOAT3_1]], float [[FLOAT_2]], i64 2
+  ; CHECK: ret <3 x float> [[FLOAT3_2]]
+  %hlsl.f16tof32 = tail call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.dx.legacyf16tof32.v3i32(<3 x i32> %p0)
+  ret <3 x float> %hlsl.f16tof32
+}
+
+define hidden noundef nofpclass(nan inf) <4 x float> @_Z10test_uint4Dv4_j(<4 x i32> noundef %p0) local_unnamed_addr #0 {
+entry:
+  ; CHECK: [[UINT4_0:%.*]] = extractelement <4 x i32> %p0, i64 0
+  ; CHECK: [[FLOAT_0:%.*]] = call float @dx.op.legacyF16ToF32(i32 131, i32 [[UINT4_0]])
+  ; CHECK: [[UINT4_1:%.*]] = extractelement <4 x i32> %p0, i64 1
+  ; CHECK: [[FLOAT_1:%.*]] = call float @dx.op.legacyF16ToF32(i32 131, i32 [[UINT4_1]])
+  ; CHECK: [[UINT4_2:%.*]] = extractelement <4 x i32> %p0, i64 2
+  ; CHECK: [[FLOAT_2:%.*]] = call float @dx.op.legacyF16ToF32(i32 131, i32 [[UINT4_2]])
+  ; CHECK: [[UINT4_3:%.*]] = extractelement <4 x i32> %p0, i64 3
+  ; CHECK: [[FLOAT_3:%.*]] = call float @dx.op.legacyF16ToF32(i32 131, i32 [[UINT4_3]])
+  ; CHECK: [[FLOAT4_0:%.*]] = insertelement <4 x float> poison, float [[FLOAT_0]], i64 0
+  ; CHECK: [[FLOAT4_1:%.*]] = insertelement <4 x float> [[FLOAT4_0]], float [[FLOAT_1]], i64 1
+  ; CHECK: [[FLOAT4_2:%.*]] = insertelement <4 x float> [[FLOAT4_1]], float [[FLOAT_2]], i64 2
+  ; CHECK: [[FLOAT4_3:%.*]] = insertelement <4 x float> [[FLOAT4_2]], float [[FLOAT_3]], i64 3
+  ; CHECK: ret <4 x float> [[FLOAT4_3]]
+  %hlsl.f16tof32 = tail call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.dx.legacyf16tof32.v4i32(<4 x i32> %p0)
+  ret <4 x float> %hlsl.f16tof32
+}
diff --git a/llvm/test/CodeGen/SPIRV/unpackfloat2x16.ll b/llvm/test/CodeGen/SPIRV/unpackfloat2x16.ll
new file mode 100644
index 0000000000000..6a9ce4515f5c0
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/unpackfloat2x16.ll
@@ -0,0 +1,18 @@
+; RUN: llc -O0 -verify-machineinstrs -mtriple=spirv-unknown-vulkan %s -o - | FileCheck %s
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-vulkan %s -o - -filetype=obj | spirv-val %}
+
+; CHECK-DAG: [[SET:%.*]] = OpExtInstImport "GLSL.std.450"
+; CHECK-DAG: [[UINT:%.*]] = OpTypeInt 32 0
+; CHECK-DAG: [[FLOAT:%.*]] = OpTypeFloat 32
+; CHECK-DAG: [[FLOAT2:%.*]] = OpTypeVector [[FLOAT]] 2
+
+; CHECK: [[P0:%.*]] = OpFunctionParameter [[UINT]]
+; CHECK: [[UNPACK2:%.*]] = OpExtInst [[FLOAT2]] [[SET]] UnpackHalf2x16 [[P0]]
+; CHECK: [[UNPACK:%.*]] = OpCompositeExtract [[FLOAT]] [[UNPACK2]] 0
+; CHECK: OpReturnValue [[UNPACK]]
+define hidden spir_func noundef nofpclass(nan inf) float @_Z9test_funcj(i32 noundef %0) local_unnamed_addr #0 {
+  %2 = tail call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.spv.unpackhalf2x16.v2f32(i32 %0)
+  %3 = extractelement <2 x float> %2, i64 0
+  ret float %3
+}

From 83d1599871b2bf7cf8a093c272367a141767a895 Mon Sep 17 00:00:00 2001
From: John Harrison <harjohn@google.com>
Date: Tue, 4 Nov 2025 09:04:56 -0800
Subject: [PATCH 190/313] [lldb-dap] Addressing orphaned processes in tests.
 (#166205)

In lldb-dap tests, we sometimes spawn subprocesses directly but do not
always correctly clean them up. This can cause some tests, like
`TestDAP_disconnect.test_attach`, to hang and not properly respect
timeouts.

To fix this, I am passing the `lldbtest.Base.spawnSubprocess` helper to
the adapter client so it can be used to spawn subprocesses in a way
that we can ensure they're cleaned up.
---
 .../test/tools/lldb-dap/dap_server.py         | 75 ++++++++++---------
 .../test/tools/lldb-dap/lldbdap_testcase.py   |  1 +
 .../lldb-dap/disconnect/TestDAP_disconnect.py |  9 +--
 .../tools/lldb-dap/server/TestDAP_server.py   |  3 +-
 4 files changed, 45 insertions(+), 43 deletions(-)

diff --git a/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py b/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py
index d892c01f0bc71..ac550962cfb85 100644
--- a/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py
+++ b/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py
@@ -32,6 +32,10 @@
 # timeout by a factor of 10 if ASAN is enabled.
 DEFAULT_TIMEOUT = 10 * (10 if ("ASAN_OPTIONS" in os.environ) else 1)
 
+# See lldbtest.Base.spawnSubprocess, which should help ensure any processes
+# created by the DAP client are terminated correctly when the test ends.
+SpawnHelperCallback = Callable[[str, List[str], List[str]], subprocess.Popen] + ## DAP type references @@ -191,14 +195,16 @@ def __init__( self, recv: BinaryIO, send: BinaryIO, - init_commands: list[str], - log_file: Optional[TextIO] = None, + init_commands: Optional[List[str]] = None, + log_file: Optional[str] = None, + spawn_helper: Optional[SpawnHelperCallback] = None, ): # For debugging test failures, try setting `trace_file = sys.stderr`. self.trace_file: Optional[TextIO] = None self.log_file = log_file self.send = send self.recv = recv + self.spawn_helper = spawn_helper # Packets that have been received and processed but have not yet been # requested by a test case. @@ -211,7 +217,7 @@ def __init__( self._recv_thread = threading.Thread(target=self._read_packet_thread) # session state - self.init_commands = init_commands + self.init_commands = init_commands if init_commands else [] self.exit_status: Optional[int] = None self.capabilities: Dict = {} self.initialized: bool = False @@ -310,11 +316,6 @@ def collect_output( output += self.get_output(category, clear=clear) return output - def _enqueue_recv_packet(self, packet: Optional[ProtocolMessage]): - with self.recv_condition: - self.recv_packets.append(packet) - self.recv_condition.notify() - def _handle_recv_packet(self, packet: Optional[ProtocolMessage]) -> bool: """Handles an incoming packet. @@ -460,22 +461,11 @@ def _handle_reverse_request(self, request: Request) -> None: self.reverse_requests.append(request) arguments = request.get("arguments") if request["command"] == "runInTerminal" and arguments is not None: - in_shell = arguments.get("argsCanBeInterpretedByShell", False) - print("spawning...", arguments["args"]) - proc = subprocess.Popen( - arguments["args"], - env=arguments.get("env", {}), - cwd=arguments.get("cwd", None), - stdin=subprocess.DEVNULL, - stdout=sys.stderr, - stderr=sys.stderr, - shell=in_shell, - ) - body = {} - if in_shell: - body["shellProcessId"] = proc.pid - else: - body["processId"] = proc.pid + assert self.spawn_helper is not None, "Not configured to spawn subprocesses" + [exe, *args] = arguments["args"] + env = [f"{k}={v}" for k, v in arguments.get("env", {}).items()] + proc = self.spawn_helper(exe, args, env) + body = {"processId": proc.pid} self.send_packet( { "type": "response", @@ -1501,12 +1491,14 @@ def request_setInstructionBreakpoints(self, memory_reference=[]): class DebugAdapterServer(DebugCommunication): def __init__( self, + *, executable: Optional[str] = None, connection: Optional[str] = None, - init_commands: list[str] = [], - log_file: Optional[TextIO] = None, - env: Optional[dict[str, str]] = None, - additional_args: list[str] = [], + init_commands: Optional[list[str]] = None, + log_file: Optional[str] = None, + env: Optional[Dict[str, str]] = None, + additional_args: Optional[List[str]] = None, + spawn_helper: Optional[SpawnHelperCallback] = None, ): self.process = None self.connection = None @@ -1532,13 +1524,21 @@ def __init__( s = socket.create_connection((host.strip("[]"), int(port))) else: raise ValueError("invalid connection: {}".format(connection)) - DebugCommunication.__init__( - self, s.makefile("rb"), s.makefile("wb"), init_commands, log_file + super().__init__( + s.makefile("rb"), + s.makefile("wb"), + init_commands, + log_file, + spawn_helper, ) self.connection = connection else: - DebugCommunication.__init__( - self, self.process.stdout, self.process.stdin, init_commands, log_file + super().__init__( + self.process.stdout, + self.process.stdin, + init_commands, + 
log_file, + spawn_helper, ) @classmethod @@ -1546,14 +1546,14 @@ def launch( cls, *, executable: str, - env: Optional[dict[str, str]] = None, - log_file: Optional[TextIO] = None, + env: Optional[Dict[str, str]] = None, + log_file: Optional[str] = None, connection: Optional[str] = None, connection_timeout: Optional[int] = None, - additional_args: list[str] = [], + additional_args: Optional[List[str]] = None, ) -> tuple[subprocess.Popen, Optional[str]]: adapter_env = os.environ.copy() - if env is not None: + if env: adapter_env.update(env) if log_file: @@ -1561,7 +1561,8 @@ def launch( args = [executable] # Add additional arguments first (like --no-lldbinit) - args.extend(additional_args) + if additional_args: + args.extend(additional_args) if connection is not None: args.append("--connection") diff --git a/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/lldbdap_testcase.py b/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/lldbdap_testcase.py index c6c4a3e2a4e1e..71ca60ebe8d34 100644 --- a/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/lldbdap_testcase.py +++ b/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/lldbdap_testcase.py @@ -39,6 +39,7 @@ def create_debug_adapter( log_file=log_file_path, env=lldbDAPEnv, additional_args=additional_args or [], + spawn_helper=self.spawnSubprocess, ) def build_and_create_debug_adapter( diff --git a/lldb/test/API/tools/lldb-dap/disconnect/TestDAP_disconnect.py b/lldb/test/API/tools/lldb-dap/disconnect/TestDAP_disconnect.py index 09e3f62f0eead..19f88d88c2ff4 100644 --- a/lldb/test/API/tools/lldb-dap/disconnect/TestDAP_disconnect.py +++ b/lldb/test/API/tools/lldb-dap/disconnect/TestDAP_disconnect.py @@ -3,17 +3,15 @@ """ -import dap_server from lldbsuite.test.decorators import * from lldbsuite.test.lldbtest import * from lldbsuite.test import lldbutil import lldbdap_testcase -import subprocess import time import os -class TestDAP_launch(lldbdap_testcase.DAPTestCaseBase): +class TestDAP_disconnect(lldbdap_testcase.DAPTestCaseBase): source = "main.cpp" def disconnect_and_assert_no_output_printed(self): @@ -67,10 +65,11 @@ def test_attach(self): lambda: self.run_platform_command("rm %s" % (sync_file_path)) ) - self.process = subprocess.Popen([program, sync_file_path]) + proc = self.spawnSubprocess(program, [sync_file_path]) lldbutil.wait_for_file_on_target(self, sync_file_path) - self.attach(pid=self.process.pid, disconnectAutomatically=False) + self.attach(pid=proc.pid, disconnectAutomatically=False, stopOnEntry=True) + self.continue_to_next_stop() response = self.dap_server.request_evaluate("wait_for_attach = false;") self.assertTrue(response["success"]) diff --git a/lldb/test/API/tools/lldb-dap/server/TestDAP_server.py b/lldb/test/API/tools/lldb-dap/server/TestDAP_server.py index 12b321cf42778..3c53cf2ed3460 100644 --- a/lldb/test/API/tools/lldb-dap/server/TestDAP_server.py +++ b/lldb/test/API/tools/lldb-dap/server/TestDAP_server.py @@ -37,7 +37,7 @@ def cleanup(): def run_debug_session(self, connection, name, sleep_seconds_in_middle=None): self.dap_server = dap_server.DebugAdapterServer( - connection=connection, + connection=connection, spawn_helper=self.spawnSubprocess ) program = self.getBuildArtifact("a.out") source = "main.c" @@ -94,6 +94,7 @@ def test_server_interrupt(self): (process, connection) = self.start_server(connection="listen://localhost:0") self.dap_server = dap_server.DebugAdapterServer( connection=connection, + spawn_helper=self.spawnSubprocess, ) program = self.getBuildArtifact("a.out") source = "main.c" From 
af9a4263a1a209953a1d339ef781a954e31268ff Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo@fhahn.com>
Date: Tue, 4 Nov 2025 17:08:12 +0000
Subject: [PATCH 191/313] [LAA] Only use inbounds/nusw in isNoWrap if the GEP
 is dereferenced. (#161445)

Update isNoWrap to only use the inbounds/nusw flags from GEPs that are
guaranteed to be dereferenced on every iteration. This fixes a case
where we incorrectly determine no dependence.

I think the issue is isolated to code that evaluates the resulting
AddRec at BTC; just using it to compute the distance between accesses
should still be fine: if the access does not execute in a given
iteration, there's no dependence in that iteration. But isolating the
code is not straightforward, so be conservative for now.

The practical impact should be very minor (only one loop changed across
a corpus with 27k modules from large C/C++ workloads).

Fixes https://github.com/llvm/llvm-project/issues/160912.

PR: https://github.com/llvm/llvm-project/pull/161445
---
 .../llvm/Analysis/LoopAccessAnalysis.h        |  2 +-
 llvm/lib/Analysis/LoopAccessAnalysis.cpp      | 44 ++++++---
 llvm/lib/Analysis/VectorUtils.cpp             | 11 ++-
 .../AArch64/AArch64TargetTransformInfo.cpp    | 10 +-
 .../lib/Target/ARM/ARMTargetTransformInfo.cpp |  9 +-
 .../Transforms/Scalar/LoopLoadElimination.cpp | 16 ++--
 .../Vectorize/LoopVectorizationLegality.cpp   |  5 +-
 .../inbounds-gep-in-predicated-blocks.ll      | 17 +++-
 .../RISCV/masked_gather_scatter.ll            | 40 +++++---
 .../x86-interleaved-accesses-masked-group.ll  | 91 +++++++++++++++++--
 10 files changed, 188 insertions(+), 57 deletions(-)

diff --git a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
index 84b4ad7c1d5a9..c85ef3e131068 100644
--- a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
+++ b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
@@ -893,7 +893,7 @@ replaceSymbolicStrideSCEV(PredicatedScalarEvolution &PSE,
 /// result of this function is undefined.
 LLVM_ABI std::optional<int64_t>
 getPtrStride(PredicatedScalarEvolution &PSE, Type *AccessTy, Value *Ptr,
-             const Loop *Lp,
+             const Loop *Lp, const DominatorTree &DT,
              const DenseMap<Value *, const SCEV *> &StridesMap =
                  DenseMap<Value *, const SCEV *>(),
             bool Assume = false, bool ShouldCheckWrap = true);
diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
index e27a9b1c44014..5d88e5f54e3d6 100644
--- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
@@ -806,11 +806,11 @@ class AccessAnalysis {
   typedef SmallVector<MemAccessInfo, 8> MemAccessInfoList;
 
   AccessAnalysis(const Loop *TheLoop, AAResults *AA, const LoopInfo *LI,
-                 MemoryDepChecker::DepCandidates &DA,
+                 DominatorTree &DT, MemoryDepChecker::DepCandidates &DA,
                  PredicatedScalarEvolution &PSE,
                  SmallPtrSetImpl<MDNode *> &LoopAliasScopes)
-      : TheLoop(TheLoop), BAA(*AA), AST(BAA), LI(LI), DepCands(DA), PSE(PSE),
-        LoopAliasScopes(LoopAliasScopes) {
+      : TheLoop(TheLoop), BAA(*AA), AST(BAA), LI(LI), DT(DT), DepCands(DA),
+        PSE(PSE), LoopAliasScopes(LoopAliasScopes) {
     // We're analyzing dependences across loop iterations.
     BAA.enableCrossIterationMode();
   }
@@ -934,6 +934,9 @@ class AccessAnalysis {
   /// The LoopInfo of the loop being checked.
   const LoopInfo *LI;
 
+  /// The dominator tree of the function.
+  DominatorTree &DT;
+
   /// Sets of potentially dependent accesses - members of one set share an
   /// underlying pointer. The set "CheckDeps" identfies which sets really need a
   /// dependence check.
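To make the motivating case concrete, here is a hedged sketch of a loop shape
affected by this change (hypothetical function and names, not taken from the
test suite). The GEP feeding the store sits in a predicated block, so the fact
that an inbounds GEP produces poison on wrap proves nothing about iterations
in which the store never executes:

// Minimal sketch: because the store (and thus the inbounds GEP) only runs
// when Cond[I] is true, wrapping arithmetic in skipped iterations would
// never trigger poison-dependent UB, so inbounds alone cannot establish
// no-wrap across the whole iteration space.
void conditionalStore(int *A, const bool *Cond, int N) {
  for (int I = 0; I < N; ++I)
    if (Cond[I])   // %gep = getelementptr inbounds i32, ptr %A, i64 %I
      A[I] = 0;    // is only dereferenced inside this predicated block
}

The hunk below implements exactly this guard: the GEP's flags are trusted only
when the loop body is a single block or some load/store user of the GEP is in
a block that needs no predication.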
@@ -1015,6 +1018,7 @@ getStrideFromAddRec(const SCEVAddRecExpr *AR, const Loop *Lp, Type *AccessTy, /// informating from the IR pointer value to determine no-wrap. static bool isNoWrap(PredicatedScalarEvolution &PSE, const SCEVAddRecExpr *AR, Value *Ptr, Type *AccessTy, const Loop *L, bool Assume, + const DominatorTree &DT, std::optional<int64_t> Stride = std::nullopt) { // FIXME: This should probably only return true for NUW. if (AR->getNoWrapFlags(SCEV::NoWrapMask)) @@ -1029,8 +1033,18 @@ static bool isNoWrap(PredicatedScalarEvolution &PSE, const SCEVAddRecExpr *AR, // case, the GEP would be poison and any memory access dependent on it would // be immediate UB when executed. if (auto *GEP = dyn_cast_if_present<GetElementPtrInst>(Ptr); - GEP && GEP->hasNoUnsignedSignedWrap()) - return true; + GEP && GEP->hasNoUnsignedSignedWrap()) { + // For the above reasoning to apply, the pointer must be dereferenced in + // every iteration. + if (L->getHeader() == L->getLoopLatch() || + any_of(GEP->users(), [L, &DT, GEP](User *U) { + if (getLoadStorePointerOperand(U) != GEP) + return false; + BasicBlock *UserBB = cast<Instruction>(U)->getParent(); + return !LoopAccessInfo::blockNeedsPredication(UserBB, L, &DT); + })) + return true; + } if (!Stride) Stride = getStrideFromAddRec(AR, L, AccessTy, Ptr, PSE); @@ -1293,7 +1307,7 @@ bool AccessAnalysis::createCheckForAccess( } if (!isNoWrap(PSE, AR, RTCheckPtrs.size() == 1 ? Ptr : nullptr, AccessTy, - TheLoop, Assume)) + TheLoop, Assume, DT)) return false; } @@ -1606,7 +1620,7 @@ void AccessAnalysis::processMemAccesses() { /// Check whether the access through \p Ptr has a constant stride. std::optional<int64_t> llvm::getPtrStride(PredicatedScalarEvolution &PSE, Type *AccessTy, Value *Ptr, - const Loop *Lp, + const Loop *Lp, const DominatorTree &DT, const DenseMap<Value *, const SCEV *> &StridesMap, bool Assume, bool ShouldCheckWrap) { const SCEV *PtrScev = replaceSymbolicStrideSCEV(PSE, StridesMap, Ptr); @@ -1630,7 +1644,7 @@ llvm::getPtrStride(PredicatedScalarEvolution &PSE, Type *AccessTy, Value *Ptr, if (!ShouldCheckWrap || !Stride) return Stride; - if (isNoWrap(PSE, AR, Ptr, AccessTy, Lp, Assume, Stride)) + if (isNoWrap(PSE, AR, Ptr, AccessTy, Lp, Assume, DT, Stride)) return Stride; LLVM_DEBUG( @@ -2047,10 +2061,10 @@ MemoryDepChecker::getDependenceDistanceStrideAndSize( BPtr->getType()->getPointerAddressSpace()) return MemoryDepChecker::Dependence::Unknown; - std::optional<int64_t> StrideAPtr = - getPtrStride(PSE, ATy, APtr, InnermostLoop, SymbolicStrides, true, true); - std::optional<int64_t> StrideBPtr = - getPtrStride(PSE, BTy, BPtr, InnermostLoop, SymbolicStrides, true, true); + std::optional<int64_t> StrideAPtr = getPtrStride( + PSE, ATy, APtr, InnermostLoop, *DT, SymbolicStrides, true, true); + std::optional<int64_t> StrideBPtr = getPtrStride( + PSE, BTy, BPtr, InnermostLoop, *DT, SymbolicStrides, true, true); const SCEV *Src = PSE.getSCEV(APtr); const SCEV *Sink = PSE.getSCEV(BPtr); @@ -2627,7 +2641,8 @@ bool LoopAccessInfo::analyzeLoop(AAResults *AA, const LoopInfo *LI, } MemoryDepChecker::DepCandidates DepCands; - AccessAnalysis Accesses(TheLoop, AA, LI, DepCands, *PSE, LoopAliasScopes); + AccessAnalysis Accesses(TheLoop, AA, LI, *DT, DepCands, *PSE, + LoopAliasScopes); // Holds the analyzed pointers. We don't want to call getUnderlyingObjects // multiple times on the same object. 
If the ptr is accessed twice, once @@ -2691,7 +2706,8 @@ bool LoopAccessInfo::analyzeLoop(AAResults *AA, const LoopInfo *LI, bool IsReadOnlyPtr = false; Type *AccessTy = getLoadStoreType(LD); if (Seen.insert({Ptr, AccessTy}).second || - !getPtrStride(*PSE, AccessTy, Ptr, TheLoop, SymbolicStrides)) { + !getPtrStride(*PSE, AccessTy, Ptr, TheLoop, *DT, SymbolicStrides, false, + true)) { ++NumReads; IsReadOnlyPtr = true; } diff --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp index 091d94843698c..977ed59e09243 100644 --- a/llvm/lib/Analysis/VectorUtils.cpp +++ b/llvm/lib/Analysis/VectorUtils.cpp @@ -1387,9 +1387,9 @@ void InterleavedAccessInfo::collectConstStrideAccesses( // wrap around the address space we would do a memory access at nullptr // even without the transformation. The wrapping checks are therefore // deferred until after we've formed the interleaved groups. - int64_t Stride = - getPtrStride(PSE, ElementTy, Ptr, TheLoop, Strides, - /*Assume=*/true, /*ShouldCheckWrap=*/false).value_or(0); + int64_t Stride = getPtrStride(PSE, ElementTy, Ptr, TheLoop, *DT, Strides, + /*Assume=*/true, /*ShouldCheckWrap=*/false) + .value_or(0); const SCEV *Scev = replaceSymbolicStrideSCEV(PSE, Strides, Ptr); AccessStrideInfo[&I] = StrideDescriptor(Stride, Scev, Size, @@ -1643,8 +1643,9 @@ void InterleavedAccessInfo::analyzeInterleaving( assert(Member && "Group member does not exist"); Value *MemberPtr = getLoadStorePointerOperand(Member); Type *AccessTy = getLoadStoreType(Member); - if (getPtrStride(PSE, AccessTy, MemberPtr, TheLoop, Strides, - /*Assume=*/false, /*ShouldCheckWrap=*/true).value_or(0)) + if (getPtrStride(PSE, AccessTy, MemberPtr, TheLoop, *DT, Strides, + /*Assume=*/false, /*ShouldCheckWrap=*/true) + .value_or(0)) return false; LLVM_DEBUG(dbgs() << "LV: Invalidate candidate interleaved group due to " << FirstOrLast diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index 10f2c80edc1b3..197aae6e03cb1 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -6207,7 +6207,8 @@ AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, } static bool containsDecreasingPointers(Loop *TheLoop, - PredicatedScalarEvolution *PSE) { + PredicatedScalarEvolution *PSE, + const DominatorTree &DT) { const auto &Strides = DenseMap<Value *, const SCEV *>(); for (BasicBlock *BB : TheLoop->blocks()) { // Scan the instructions in the block and look for addresses that are @@ -6216,8 +6217,8 @@ static bool containsDecreasingPointers(Loop *TheLoop, if (isa<LoadInst>(&I) || isa<StoreInst>(&I)) { Value *Ptr = getLoadStorePointerOperand(&I); Type *AccessTy = getLoadStoreType(&I); - if (getPtrStride(*PSE, AccessTy, Ptr, TheLoop, Strides, /*Assume=*/true, - /*ShouldCheckWrap=*/false) + if (getPtrStride(*PSE, AccessTy, Ptr, TheLoop, DT, Strides, + /*Assume=*/true, /*ShouldCheckWrap=*/false) .value_or(0) < 0) return true; } @@ -6262,7 +6263,8 @@ bool AArch64TTIImpl::preferPredicateOverEpilogue(TailFoldingInfo *TFI) const { // negative strides. This will require extra work to reverse the loop // predicate, which may be expensive. 
if (containsDecreasingPointers(TFI->LVL->getLoop(), - TFI->LVL->getPredicatedScalarEvolution())) + TFI->LVL->getPredicatedScalarEvolution(), + *TFI->LVL->getDominatorTree())) Required |= TailFoldingOpts::Reverse; if (Required == TailFoldingOpts::Disabled) Required |= TailFoldingOpts::Simple; diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp index 9b250e6cac3ab..24f58a68c345d 100644 --- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp +++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp @@ -2448,7 +2448,8 @@ static bool canTailPredicateInstruction(Instruction &I, int &ICmpCount) { // static bool canTailPredicateLoop(Loop *L, LoopInfo *LI, ScalarEvolution &SE, const DataLayout &DL, - const LoopAccessInfo *LAI) { + const LoopAccessInfo *LAI, + const DominatorTree &DT) { LLVM_DEBUG(dbgs() << "Tail-predication: checking allowed instructions\n"); // If there are live-out values, it is probably a reduction. We can predicate @@ -2498,7 +2499,8 @@ static bool canTailPredicateLoop(Loop *L, LoopInfo *LI, ScalarEvolution &SE, if (isa<StoreInst>(I) || isa<LoadInst>(I)) { Value *Ptr = getLoadStorePointerOperand(&I); Type *AccessTy = getLoadStoreType(&I); - int64_t NextStride = getPtrStride(PSE, AccessTy, Ptr, L).value_or(0); + int64_t NextStride = + getPtrStride(PSE, AccessTy, Ptr, L, DT).value_or(0); if (NextStride == 1) { // TODO: for now only allow consecutive strides of 1. We could support // other strides as long as it is uniform, but let's keep it simple @@ -2585,7 +2587,8 @@ bool ARMTTIImpl::preferPredicateOverEpilogue(TailFoldingInfo *TFI) const { return false; } - return canTailPredicateLoop(L, LI, *SE, DL, LVL->getLAI()); + return canTailPredicateLoop(L, LI, *SE, DL, LVL->getLAI(), + *LVL->getDominatorTree()); } TailFoldingStyle diff --git a/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp b/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp index a8839981e5478..1b770be3909a9 100644 --- a/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp +++ b/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp @@ -89,8 +89,8 @@ struct StoreToLoadForwardingCandidate { /// Return true if the dependence from the store to the load has an /// absolute distance of one. /// E.g. A[i+1] = A[i] (or A[i-1] = A[i] for descending loop) - bool isDependenceDistanceOfOne(PredicatedScalarEvolution &PSE, - Loop *L) const { + bool isDependenceDistanceOfOne(PredicatedScalarEvolution &PSE, Loop *L, + const DominatorTree &DT) const { Value *LoadPtr = Load->getPointerOperand(); Value *StorePtr = Store->getPointerOperand(); Type *LoadType = getLoadStoreType(Load); @@ -102,8 +102,10 @@ struct StoreToLoadForwardingCandidate { DL.getTypeSizeInBits(getLoadStoreType(Store)) && "Should be a known dependence"); - int64_t StrideLoad = getPtrStride(PSE, LoadType, LoadPtr, L).value_or(0); - int64_t StrideStore = getPtrStride(PSE, LoadType, StorePtr, L).value_or(0); + int64_t StrideLoad = + getPtrStride(PSE, LoadType, LoadPtr, L, DT).value_or(0); + int64_t StrideStore = + getPtrStride(PSE, LoadType, StorePtr, L, DT).value_or(0); if (!StrideLoad || !StrideStore || StrideLoad != StrideStore) return false; @@ -287,8 +289,8 @@ class LoadEliminationForLoop { // so deciding which one forwards is easy. The later one forwards as // long as they both have a dependence distance of one to the load. 
if (Cand.Store->getParent() == OtherCand->Store->getParent() && - Cand.isDependenceDistanceOfOne(PSE, L) && - OtherCand->isDependenceDistanceOfOne(PSE, L)) { + Cand.isDependenceDistanceOfOne(PSE, L, *DT) && + OtherCand->isDependenceDistanceOfOne(PSE, L, *DT)) { // They are in the same block, the later one will forward to the load. if (getInstrIndex(OtherCand->Store) < getInstrIndex(Cand.Store)) OtherCand = &Cand; @@ -538,7 +540,7 @@ class LoadEliminationForLoop { // Check whether the SCEV difference is the same as the induction step, // thus we load the value in the next iteration. - if (!Cand.isDependenceDistanceOfOne(PSE, L)) + if (!Cand.isDependenceDistanceOfOne(PSE, L, *DT)) continue; assert(isa<SCEVAddRecExpr>(PSE.getSCEV(Cand.Load->getPointerOperand())) && diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp index fdfff16132093..03112c67dda7b 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp @@ -462,8 +462,9 @@ int LoopVectorizationLegality::isConsecutivePtr(Type *AccessTy, bool CanAddPredicate = !llvm::shouldOptimizeForSize( TheLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass); - int Stride = getPtrStride(PSE, AccessTy, Ptr, TheLoop, Strides, - CanAddPredicate, false).value_or(0); + int Stride = getPtrStride(PSE, AccessTy, Ptr, TheLoop, *DT, Strides, + CanAddPredicate, false) + .value_or(0); if (Stride == 1 || Stride == -1) return Stride; return 0; diff --git a/llvm/test/Analysis/LoopAccessAnalysis/inbounds-gep-in-predicated-blocks.ll b/llvm/test/Analysis/LoopAccessAnalysis/inbounds-gep-in-predicated-blocks.ll index 4c2a9c3f29f02..d90a97f1651e6 100644 --- a/llvm/test/Analysis/LoopAccessAnalysis/inbounds-gep-in-predicated-blocks.ll +++ b/llvm/test/Analysis/LoopAccessAnalysis/inbounds-gep-in-predicated-blocks.ll @@ -10,7 +10,7 @@ ; s0 += (1ULL << 62) + 1; ; s1 += (1ULL << 62) + 2; ; } -; FIXME: We cannot use inbounds on idx.0, idx.1 to infer no-wrap (and determine +; We cannot use inbounds on idx.0, idx.1 to infer no-wrap (and determine ; there are no dependences), as the pointers are not dereferenced in all loop iterations. define void @test_inbounds_gep_used_in_predicated_block(ptr %A, i64 %n) { ; CHECK-LABEL: 'test_inbounds_gep_used_in_predicated_block' @@ -19,9 +19,14 @@ define void @test_inbounds_gep_used_in_predicated_block(ptr %A, i64 %n) { ; CHECK-NEXT: Dependences: ; CHECK-NEXT: Run-time memory checks: ; CHECK-NEXT: Grouped accesses: +; CHECK-NEXT: Group GRP0: +; CHECK-NEXT: (Low: %A High: (-4611686018427387705 + %A)) +; CHECK-NEXT: Member: {%A,+,4611686018427387906}<%loop.header> +; CHECK-NEXT: Member: {%A,+,4611686018427387905}<%loop.header> ; CHECK-EMPTY: ; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop. ; CHECK-NEXT: SCEV assumptions: +; CHECK-NEXT: {%A,+,4611686018427387906}<%loop.header> Added Flags: <nusw> ; CHECK-EMPTY: ; CHECK-NEXT: Expressions re-written: ; @@ -63,9 +68,14 @@ define void @test_inbounds_gep_used_in_predicated_block_stored_value_operand(ptr ; CHECK-NEXT: Dependences: ; CHECK-NEXT: Run-time memory checks: ; CHECK-NEXT: Grouped accesses: +; CHECK-NEXT: Group GRP0: +; CHECK-NEXT: (Low: %A High: (-4611686018427387705 + %A)) +; CHECK-NEXT: Member: {%A,+,4611686018427387906}<%loop.header> +; CHECK-NEXT: Member: {%A,+,4611686018427387905}<%loop.header> ; CHECK-EMPTY: ; CHECK-NEXT: Non vectorizable stores to invariant address were found in loop. 
; CHECK-NEXT: SCEV assumptions: +; CHECK-NEXT: {%A,+,4611686018427387906}<%loop.header> Added Flags: <nusw> ; CHECK-EMPTY: ; CHECK-NEXT: Expressions re-written: ; @@ -109,9 +119,14 @@ define void @test_inbounds_gep_used_in_predicated_block_non_memop_user(ptr %A, i ; CHECK-NEXT: Dependences: ; CHECK-NEXT: Run-time memory checks: ; CHECK-NEXT: Grouped accesses: +; CHECK-NEXT: Group GRP0: +; CHECK-NEXT: (Low: %A High: (-4611686018427387705 + %A)) +; CHECK-NEXT: Member: {%A,+,4611686018427387906}<%loop.header> +; CHECK-NEXT: Member: {%A,+,4611686018427387905}<%loop.header> ; CHECK-EMPTY: ; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop. ; CHECK-NEXT: SCEV assumptions: +; CHECK-NEXT: {%A,+,4611686018427387906}<%loop.header> Added Flags: <nusw> ; CHECK-EMPTY: ; CHECK-NEXT: Expressions re-written: ; diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/masked_gather_scatter.ll b/llvm/test/Transforms/LoopVectorize/RISCV/masked_gather_scatter.ll index 89819f2be4967..1cbec47d72203 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/masked_gather_scatter.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/masked_gather_scatter.ll @@ -17,18 +17,33 @@ define void @foo4(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea ; RV32-LABEL: @foo4( ; RV32-NEXT: entry: ; RV32-NEXT: br label [[VECTOR_MEMCHECK:%.*]] +; RV32: vector.scevcheck: +; RV32-NEXT: [[MUL:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 128, i32 624) +; RV32-NEXT: [[MUL_RESULT:%.*]] = extractvalue { i32, i1 } [[MUL]], 0 +; RV32-NEXT: [[MUL_OVERFLOW:%.*]] = extractvalue { i32, i1 } [[MUL]], 1 +; RV32-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[A:%.*]], i32 [[MUL_RESULT]] +; RV32-NEXT: [[TMP1:%.*]] = icmp ult ptr [[TMP0]], [[A]] +; RV32-NEXT: [[TMP2:%.*]] = or i1 [[TMP1]], [[MUL_OVERFLOW]] +; RV32-NEXT: [[MUL1:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 256, i32 624) +; RV32-NEXT: [[MUL_RESULT2:%.*]] = extractvalue { i32, i1 } [[MUL1]], 0 +; RV32-NEXT: [[MUL_OVERFLOW3:%.*]] = extractvalue { i32, i1 } [[MUL1]], 1 +; RV32-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[B:%.*]], i32 [[MUL_RESULT2]] +; RV32-NEXT: [[TMP4:%.*]] = icmp ult ptr [[TMP3]], [[B]] +; RV32-NEXT: [[TMP5:%.*]] = or i1 [[TMP4]], [[MUL_OVERFLOW3]] +; RV32-NEXT: [[TMP6:%.*]] = or i1 [[TMP2]], [[TMP5]] +; RV32-NEXT: br i1 [[TMP6]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK1:%.*]] ; RV32: vector.memcheck: -; RV32-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i32 79880 ; RV32-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[TRIGGER:%.*]], i32 39940 -; RV32-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[B:%.*]], i32 159752 -; RV32-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[A]], [[SCEVGEP1]] -; RV32-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[TRIGGER]], [[SCEVGEP]] +; RV32-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[A]], i32 79880 +; RV32-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[B]], i32 159752 +; RV32-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[TRIGGER]], [[SCEVGEP]] +; RV32-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[A]], [[SCEVGEP1]] ; RV32-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] ; RV32-NEXT: [[BOUND03:%.*]] = icmp ult ptr [[A]], [[SCEVGEP2]] ; RV32-NEXT: [[BOUND14:%.*]] = icmp ult ptr [[B]], [[SCEVGEP]] ; RV32-NEXT: [[FOUND_CONFLICT5:%.*]] = and i1 [[BOUND03]], [[BOUND14]] ; RV32-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT5]] -; RV32-NEXT: br i1 [[CONFLICT_RDX]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; RV32-NEXT: br i1 [[CONFLICT_RDX]], label 
[[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; RV32: vector.ph: ; RV32-NEXT: [[TMP7:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64() ; RV32-NEXT: [[TMP9:%.*]] = mul <vscale x 2 x i64> [[TMP7]], splat (i64 16) @@ -43,25 +58,26 @@ define void @foo4(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea ; RV32-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP11]], i64 0 ; RV32-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer ; RV32-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], <vscale x 2 x i64> [[VEC_IND]] -; RV32-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x i32> @llvm.vp.gather.nxv2i32.nxv2p0(<vscale x 2 x ptr> align 4 [[TMP13]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP10]]), !alias.scope [[META0:![0-9]+]] +; RV32-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x i32> @llvm.vp.gather.nxv2i32.nxv2p0(<vscale x 2 x ptr> align 4 [[TMP13]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP10]]), !alias.scope [[META0:![0-9]+]], !noalias [[META3:![0-9]+]] ; RV32-NEXT: [[TMP14:%.*]] = icmp slt <vscale x 2 x i32> [[WIDE_MASKED_GATHER]], splat (i32 100) ; RV32-NEXT: [[TMP15:%.*]] = shl nuw nsw <vscale x 2 x i64> [[VEC_IND]], splat (i64 1) ; RV32-NEXT: [[TMP16:%.*]] = getelementptr inbounds double, ptr [[B]], <vscale x 2 x i64> [[TMP15]] -; RV32-NEXT: [[WIDE_MASKED_GATHER6:%.*]] = call <vscale x 2 x double> @llvm.vp.gather.nxv2f64.nxv2p0(<vscale x 2 x ptr> align 8 [[TMP16]], <vscale x 2 x i1> [[TMP14]], i32 [[TMP10]]), !alias.scope [[META3:![0-9]+]] +; RV32-NEXT: [[WIDE_MASKED_GATHER6:%.*]] = call <vscale x 2 x double> @llvm.vp.gather.nxv2f64.nxv2p0(<vscale x 2 x ptr> align 8 [[TMP16]], <vscale x 2 x i1> [[TMP14]], i32 [[TMP10]]), !alias.scope [[META5:![0-9]+]] ; RV32-NEXT: [[TMP17:%.*]] = sitofp <vscale x 2 x i32> [[WIDE_MASKED_GATHER]] to <vscale x 2 x double> ; RV32-NEXT: [[TMP18:%.*]] = fadd <vscale x 2 x double> [[WIDE_MASKED_GATHER6]], [[TMP17]] ; RV32-NEXT: [[TMP19:%.*]] = getelementptr inbounds double, ptr [[A]], <vscale x 2 x i64> [[VEC_IND]] -; RV32-NEXT: call void @llvm.vp.scatter.nxv2f64.nxv2p0(<vscale x 2 x double> [[TMP18]], <vscale x 2 x ptr> align 8 [[TMP19]], <vscale x 2 x i1> [[TMP14]], i32 [[TMP10]]), !alias.scope [[META5:![0-9]+]], !noalias [[META7:![0-9]+]] +; RV32-NEXT: call void @llvm.vp.scatter.nxv2f64.nxv2p0(<vscale x 2 x double> [[TMP18]], <vscale x 2 x ptr> align 8 [[TMP19]], <vscale x 2 x i1> [[TMP14]], i32 [[TMP10]]), !alias.scope [[META3]], !noalias [[META5]] ; RV32-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP8]] ; RV32-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[DOTSPLAT]] ; RV32-NEXT: [[TMP24:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; RV32-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; RV32-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; RV32: middle.block: ; RV32-NEXT: br label [[FOR_END:%.*]] ; RV32: scalar.ph: +; RV32-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[VECTOR_MEMCHECK1]] ] ; RV32-NEXT: br label [[FOR_BODY:%.*]] ; RV32: for.body: -; RV32-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] +; RV32-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] ; RV32-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, 
ptr [[TRIGGER]], i64 [[INDVARS_IV]] ; RV32-NEXT: [[TMP21:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 ; RV32-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP21]], 100 @@ -78,7 +94,7 @@ define void @foo4(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea ; RV32: for.inc: ; RV32-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 16 ; RV32-NEXT: [[CMP:%.*]] = icmp ult i64 [[INDVARS_IV_NEXT]], 10000 -; RV32-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP12:![0-9]+]] +; RV32-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP10:![0-9]+]] ; RV32: for.end: ; RV32-NEXT: ret void ; @@ -146,7 +162,7 @@ define void @foo4(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea ; RV64: for.inc: ; RV64-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 16 ; RV64-NEXT: [[CMP:%.*]] = icmp ult i64 [[INDVARS_IV_NEXT]], 10000 -; RV64-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP12:![0-9]+]] +; RV64-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP11:![0-9]+]] ; RV64: for.end: ; RV64-NEXT: ret void ; diff --git a/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll b/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll index efcc0005acaa3..f9570405ecabc 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll @@ -711,17 +711,92 @@ define dso_local void @masked_strided3_optsize_unknown_tc(ptr noalias nocapture ; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT1]], <8 x i32> poison, <8 x i32> zeroinitializer ; ENABLED_MASKED_STRIDED-NEXT: br label [[VECTOR_BODY:%.*]] ; ENABLED_MASKED_STRIDED: vector.body: -; ENABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; ENABLED_MASKED_STRIDED-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; ENABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE16:%.*]] ] +; ENABLED_MASKED_STRIDED-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE16]] ] ; ENABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = icmp ule <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]] ; ENABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = icmp ugt <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]] ; ENABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = select <8 x i1> [[TMP0]], <8 x i1> [[TMP1]], <8 x i1> zeroinitializer -; ENABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = mul i32 [[INDEX]], 3 -; ENABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[P:%.*]], i32 [[TMP2]] -; ENABLED_MASKED_STRIDED-NEXT: [[INTERLEAVED_MASK:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> poison, <24 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7> -; ENABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = and <24 x i1> [[INTERLEAVED_MASK]], <i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 
false, i1 true, i1 false, i1 false> -; ENABLED_MASKED_STRIDED-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <24 x i8> @llvm.masked.load.v24i8.p0(ptr align 1 [[TMP3]], <24 x i1> [[TMP5]], <24 x i8> poison) -; ENABLED_MASKED_STRIDED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <24 x i8> [[WIDE_MASKED_VEC]], <24 x i8> poison, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21> +; ENABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = mul nsw <8 x i32> [[VEC_IND]], splat (i32 3) +; ENABLED_MASKED_STRIDED-NEXT: [[TMP51:%.*]] = extractelement <8 x i1> [[TMP4]], i64 0 +; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP51]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] +; ENABLED_MASKED_STRIDED: pred.load.if: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = extractelement <8 x i32> [[TMP3]], i64 0 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP52:%.*]] = getelementptr inbounds i8, ptr [[P:%.*]], i32 [[TMP5]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP53:%.*]] = load i8, ptr [[TMP52]], align 1 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP8:%.*]] = insertelement <8 x i8> poison, i8 [[TMP53]], i64 0 +; ENABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE]] +; ENABLED_MASKED_STRIDED: pred.load.continue: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP9:%.*]] = phi <8 x i8> [ poison, [[VECTOR_BODY]] ], [ [[TMP8]], [[PRED_LOAD_IF]] ] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP10:%.*]] = extractelement <8 x i1> [[TMP4]], i64 1 +; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP10]], label [[PRED_LOAD_IF3:%.*]], label [[PRED_LOAD_CONTINUE4:%.*]] +; ENABLED_MASKED_STRIDED: pred.load.if3: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP11:%.*]] = extractelement <8 x i32> [[TMP3]], i64 1 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[P]], i32 [[TMP11]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP13:%.*]] = load i8, ptr [[TMP12]], align 1 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP14:%.*]] = insertelement <8 x i8> [[TMP9]], i8 [[TMP13]], i64 1 +; ENABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE4]] +; ENABLED_MASKED_STRIDED: pred.load.continue4: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP15:%.*]] = phi <8 x i8> [ [[TMP9]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP14]], [[PRED_LOAD_IF3]] ] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP16:%.*]] = extractelement <8 x i1> [[TMP4]], i64 2 +; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP16]], label [[PRED_LOAD_IF5:%.*]], label [[PRED_LOAD_CONTINUE6:%.*]] +; ENABLED_MASKED_STRIDED: pred.load.if5: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP17:%.*]] = extractelement <8 x i32> [[TMP3]], i64 2 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[P]], i32 [[TMP17]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP20:%.*]] = insertelement <8 x i8> [[TMP15]], i8 [[TMP19]], i64 2 +; ENABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE6]] +; ENABLED_MASKED_STRIDED: pred.load.continue6: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP21:%.*]] = phi <8 x i8> [ [[TMP15]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP20]], [[PRED_LOAD_IF5]] ] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP22:%.*]] = extractelement <8 x i1> [[TMP4]], i64 3 +; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP22]], label [[PRED_LOAD_IF7:%.*]], label [[PRED_LOAD_CONTINUE8:%.*]] +; ENABLED_MASKED_STRIDED: pred.load.if7: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP23:%.*]] = extractelement <8 x i32> [[TMP3]], i64 3 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[P]], i32 [[TMP23]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP25:%.*]] = load i8, ptr [[TMP24]], align 1 +; 
ENABLED_MASKED_STRIDED-NEXT: [[TMP26:%.*]] = insertelement <8 x i8> [[TMP21]], i8 [[TMP25]], i64 3 +; ENABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE8]] +; ENABLED_MASKED_STRIDED: pred.load.continue8: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP27:%.*]] = phi <8 x i8> [ [[TMP21]], [[PRED_LOAD_CONTINUE6]] ], [ [[TMP26]], [[PRED_LOAD_IF7]] ] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP28:%.*]] = extractelement <8 x i1> [[TMP4]], i64 4 +; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP28]], label [[PRED_LOAD_IF9:%.*]], label [[PRED_LOAD_CONTINUE10:%.*]] +; ENABLED_MASKED_STRIDED: pred.load.if9: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP29:%.*]] = extractelement <8 x i32> [[TMP3]], i64 4 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[P]], i32 [[TMP29]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP31:%.*]] = load i8, ptr [[TMP30]], align 1 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP32:%.*]] = insertelement <8 x i8> [[TMP27]], i8 [[TMP31]], i64 4 +; ENABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE10]] +; ENABLED_MASKED_STRIDED: pred.load.continue10: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP33:%.*]] = phi <8 x i8> [ [[TMP27]], [[PRED_LOAD_CONTINUE8]] ], [ [[TMP32]], [[PRED_LOAD_IF9]] ] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP34:%.*]] = extractelement <8 x i1> [[TMP4]], i64 5 +; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP34]], label [[PRED_LOAD_IF11:%.*]], label [[PRED_LOAD_CONTINUE12:%.*]] +; ENABLED_MASKED_STRIDED: pred.load.if11: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP35:%.*]] = extractelement <8 x i32> [[TMP3]], i64 5 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP36:%.*]] = getelementptr inbounds i8, ptr [[P]], i32 [[TMP35]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP37:%.*]] = load i8, ptr [[TMP36]], align 1 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP38:%.*]] = insertelement <8 x i8> [[TMP33]], i8 [[TMP37]], i64 5 +; ENABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE12]] +; ENABLED_MASKED_STRIDED: pred.load.continue12: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP39:%.*]] = phi <8 x i8> [ [[TMP33]], [[PRED_LOAD_CONTINUE10]] ], [ [[TMP38]], [[PRED_LOAD_IF11]] ] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP40:%.*]] = extractelement <8 x i1> [[TMP4]], i64 6 +; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP40]], label [[PRED_LOAD_IF13:%.*]], label [[PRED_LOAD_CONTINUE14:%.*]] +; ENABLED_MASKED_STRIDED: pred.load.if13: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP41:%.*]] = extractelement <8 x i32> [[TMP3]], i64 6 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP42:%.*]] = getelementptr inbounds i8, ptr [[P]], i32 [[TMP41]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP43:%.*]] = load i8, ptr [[TMP42]], align 1 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP44:%.*]] = insertelement <8 x i8> [[TMP39]], i8 [[TMP43]], i64 6 +; ENABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE14]] +; ENABLED_MASKED_STRIDED: pred.load.continue14: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP45:%.*]] = phi <8 x i8> [ [[TMP39]], [[PRED_LOAD_CONTINUE12]] ], [ [[TMP44]], [[PRED_LOAD_IF13]] ] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP46:%.*]] = extractelement <8 x i1> [[TMP4]], i64 7 +; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP46]], label [[PRED_LOAD_IF15:%.*]], label [[PRED_LOAD_CONTINUE16]] +; ENABLED_MASKED_STRIDED: pred.load.if15: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP47:%.*]] = extractelement <8 x i32> [[TMP3]], i64 7 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP48:%.*]] = getelementptr inbounds i8, ptr [[P]], i32 [[TMP47]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP49:%.*]] = load i8, ptr [[TMP48]], align 1 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP50:%.*]] = insertelement <8 x i8> [[TMP45]], i8 [[TMP49]], i64 7 
+; ENABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE16]] +; ENABLED_MASKED_STRIDED: pred.load.continue16: +; ENABLED_MASKED_STRIDED-NEXT: [[STRIDED_VEC:%.*]] = phi <8 x i8> [ [[TMP45]], [[PRED_LOAD_CONTINUE14]] ], [ [[TMP50]], [[PRED_LOAD_IF15]] ] ; ENABLED_MASKED_STRIDED-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[Q:%.*]], i32 [[INDEX]] ; ENABLED_MASKED_STRIDED-NEXT: call void @llvm.masked.store.v8i8.p0(<8 x i8> [[STRIDED_VEC]], ptr align 1 [[TMP6]], <8 x i1> [[TMP4]]) ; ENABLED_MASKED_STRIDED-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8 From 37825ad4f6c5d7477fa1f8ed8b00aec186a8ceb9 Mon Sep 17 00:00:00 2001 From: Ivan Kelarev <ivan.kelarev@intel.com> Date: Tue, 4 Nov 2025 09:20:01 -0800 Subject: [PATCH 192/313] [LoopUnroll] Prevent LoopFullUnrollPass from performing partial unrolling when trip counts are unknown (#165013) Currently, `LoopFullUnrollPass` incorrectly performs partial unrolling when `#pragma unroll` is specified and both `TripCount` and `MaxTripCount` are unknown. This patch adds a check to prevent partial unrolling when the `OnlyFullUnroll` parameter is true and both trip count values are zero. --- llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp | 3 +- .../LoopUnroll/full-unroll-avoid-partial.ll | 29 +++++++++++++++++++ 2 files changed, 31 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp index 2bda9d83236e8..802ae4e9c28e3 100644 --- a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp +++ b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp @@ -1327,7 +1327,8 @@ tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE, } // Do not attempt partial/runtime unrolling in FullLoopUnrolling - if (OnlyFullUnroll && (UP.Count < TripCount || UP.Count < MaxTripCount)) { + if (OnlyFullUnroll && ((!TripCount && !MaxTripCount) || + UP.Count < TripCount || UP.Count < MaxTripCount)) { LLVM_DEBUG( dbgs() << "Not attempting partial/runtime unroll in FullLoopUnroll.\n"); return LoopUnrollResult::Unmodified; } diff --git a/llvm/test/Transforms/LoopUnroll/full-unroll-avoid-partial.ll b/llvm/test/Transforms/LoopUnroll/full-unroll-avoid-partial.ll index 7f266a754d1bc..314cf38baae04 100644 --- a/llvm/test/Transforms/LoopUnroll/full-unroll-avoid-partial.ll +++ b/llvm/test/Transforms/LoopUnroll/full-unroll-avoid-partial.ll @@ -85,6 +85,35 @@ for.body: ; preds = %for.body.preheader, br i1 %exitcond, label %for.body, label %for.cond.cleanup.loopexit, !llvm.loop !3 } +; LOOP-UNROLL-LABEL: Loop Unroll: F[pragma_unroll_count2] Loop %for.body +; LOOP-UNROLL-NEXT: Loop Size = 4 +; LOOP-UNROLL-NEXT: Exiting block %for.body: TripCount=0, TripMultiple=1, BreakoutTrip=1 +; LOOP-UNROLL-NEXT: Trying runtime unrolling on Loop: +; LOOP-UNROLL-NEXT: Loop at depth 1 containing: %for.body<header><exiting>,%for.cond<latch> +; LOOP-UNROLL-NEXT: Using epilog remainder. +; LOOP-UNROLL-NEXT: Loop latch not terminated by a conditional branch. +; LOOP-UNROLL-NEXT: UNROLLING loop %for.body by 5!
+ +; LOOP-UNROLL-FULL-LABEL: Loop Unroll: F[pragma_unroll_count2] Loop %for.body +; LOOP-UNROLL-FULL-NEXT: Loop Size = 4 +; LOOP-UNROLL-FULL-NEXT: Not attempting partial/runtime unroll in FullLoopUnroll +define void @pragma_unroll_count2(i64 %n) { +entry: + br label %for.body + +for.body: ; preds = %for.cond, %entry + %i = phi i64 [ 0, %entry ], [ %inc, %for.cond ] + %cmp = icmp ult i64 %i, %n + br i1 %cmp, label %for.cond, label %for.cond.cleanup + +for.cond: ; preds = %for.body + %inc = add i64 %i, 8 + br label %for.body, !llvm.loop !3 + +for.cond.cleanup: ; preds = %for.body + ret void +} + ; LOOP-UNROLL: llvm.loop.unroll.disable ; LOOP-UNROLL-FULL: llvm.loop.unroll.enable !0 = !{!"llvm.loop.unroll.enable"} From 67ce4aba26172cda8fed90077e3393e98c22d2d2 Mon Sep 17 00:00:00 2001 From: Paul Kirth <paulkirth@google.com> Date: Tue, 4 Nov 2025 09:24:27 -0800 Subject: [PATCH 193/313] [llvm][mustache] Use single pass when tokenizing (#159196) The old implementation used many string searches over the same portions of the strings. This version sacrifices some API niceness for perf wins. | Metric | Baseline | Single-Pass | Change | | --- | --- | --- | --- | | Time (ms) | 36\.09 | 35\.78 | \-0.86% | | Cycles | 35\.3M | 35\.0M | \-0.79% | | Instructions | 86\.7M | 85\.8M | \-1.03% | | Branch Misses | 116K | 114K | \-1.91% | | Cache Misses | 244K | 232K | \-4.98% | --- llvm/lib/Support/Mustache.cpp | 184 +++++++++++++--------------------- 1 file changed, 71 insertions(+), 113 deletions(-) diff --git a/llvm/lib/Support/Mustache.cpp b/llvm/lib/Support/Mustache.cpp index 708e79d39cd21..24e3105c5e8a9 100644 --- a/llvm/lib/Support/Mustache.cpp +++ b/llvm/lib/Support/Mustache.cpp @@ -368,141 +368,99 @@ struct Tag { llvm_unreachable("Unknown json::Value::Kind"); } -static Tag findNextTag(StringRef Template, size_t StartPos, StringRef Open, - StringRef Close) { - const StringLiteral TripleOpen("{{{"); - const StringLiteral TripleClose("}}}"); - - size_t NormalOpenPos = Template.find(Open, StartPos); - size_t TripleOpenPos = Template.find(TripleOpen, StartPos); - - Tag Result; - - // Determine which tag comes first. - if (TripleOpenPos != StringRef::npos && - (NormalOpenPos == StringRef::npos || TripleOpenPos <= NormalOpenPos)) { - // Found a triple mustache tag. - size_t EndPos = - Template.find(TripleClose, TripleOpenPos + TripleOpen.size()); - if (EndPos == StringRef::npos) - return Result; // No closing tag found. - - Result.TagKind = Tag::Kind::Triple; - Result.StartPosition = TripleOpenPos; - size_t ContentStart = TripleOpenPos + TripleOpen.size(); - Result.Content = Template.substr(ContentStart, EndPos - ContentStart); - Result.FullMatch = Template.substr( - TripleOpenPos, (EndPos + TripleClose.size()) - TripleOpenPos); - } else if (NormalOpenPos != StringRef::npos) { - // Found a normal mustache tag. - size_t EndPos = Template.find(Close, NormalOpenPos + Open.size()); - if (EndPos == StringRef::npos) - return Result; // No closing tag found. 
- - Result.TagKind = Tag::Kind::Normal; - Result.StartPosition = NormalOpenPos; - size_t ContentStart = NormalOpenPos + Open.size(); - Result.Content = Template.substr(ContentStart, EndPos - ContentStart); - Result.FullMatch = - Template.substr(NormalOpenPos, (EndPos + Close.size()) - NormalOpenPos); - } - - return Result; -} - -static std::optional<std::pair<StringRef, StringRef>> -processTag(const Tag &T, SmallVectorImpl<Token> &Tokens, MustacheContext &Ctx) { - LLVM_DEBUG(dbgs() << "[Tag] " << T.FullMatch << ", Content: " << T.Content - << ", Kind: " << tagKindToString(T.TagKind) << "\n"); - if (T.TagKind == Tag::Kind::Triple) { - Tokens.emplace_back(T.FullMatch, Ctx.Saver.save("&" + T.Content), '&', Ctx); - return std::nullopt; - } - StringRef Interpolated = T.Content; - if (!Interpolated.trim().starts_with("=")) { - char Front = Interpolated.empty() ? ' ' : Interpolated.trim().front(); - Tokens.emplace_back(T.FullMatch, Interpolated, Front, Ctx); - return std::nullopt; - } - Tokens.emplace_back(T.FullMatch, Interpolated, '=', Ctx); - StringRef DelimSpec = Interpolated.trim(); - DelimSpec = DelimSpec.drop_front(1); - DelimSpec = DelimSpec.take_until([](char C) { return C == '='; }); - DelimSpec = DelimSpec.trim(); - - std::pair<StringRef, StringRef> Ret = DelimSpec.split(' '); - LLVM_DEBUG(dbgs() << "[Set Delimiter] NewOpen: " << Ret.first - << ", NewClose: " << Ret.second << "\n"); - return Ret; -} - // Simple tokenizer that splits the template into tokens. -// The mustache spec allows {{{ }}} to unescape variables, -// but we don't support that here. An unescape variable -// is represented only by {{& variable}}. static SmallVector<Token> tokenize(StringRef Template, MustacheContext &Ctx) { LLVM_DEBUG(dbgs() << "[Tokenize Template] \"" << Template << "\"\n"); SmallVector<Token> Tokens; SmallString<8> Open("{{"); SmallString<8> Close("}}"); - size_t Start = 0; + size_t Cursor = 0; + size_t TextStart = 0; - while (Start < Template.size()) { - LLVM_DEBUG(dbgs() << "[Tokenize Loop] Start:" << Start << ", Open:'" << Open - << "', Close:'" << Close << "'\n"); - Tag T = findNextTag(Template, Start, Open, Close); + const StringLiteral TripleOpen("{{{"); + const StringLiteral TripleClose("}}}"); - if (T.TagKind == Tag::Kind::None) { - // No more tags, the rest is text. - Tokens.emplace_back(Template.substr(Start)); - break; + while (Cursor < Template.size()) { + StringRef TemplateSuffix = Template.substr(Cursor); + StringRef TagOpen, TagClose; + Tag::Kind Kind; + + // Determine which tag we've encountered. + if (TemplateSuffix.starts_with(TripleOpen)) { + Kind = Tag::Kind::Triple; + TagOpen = TripleOpen; + TagClose = TripleClose; + } else if (TemplateSuffix.starts_with(Open)) { + Kind = Tag::Kind::Normal; + TagOpen = Open; + TagClose = Close; + } else { + // Not at a tag, continue scanning. + ++Cursor; + continue; } - // Add the text before the tag. - if (T.StartPosition > Start) { - StringRef Text = Template.substr(Start, T.StartPosition - Start); - Tokens.emplace_back(Text); + // Found a tag, first add the preceding text. + if (Cursor > TextStart) + Tokens.emplace_back(Template.slice(TextStart, Cursor)); + + // Find the closing tag. + size_t EndPos = Template.find(TagClose, Cursor + TagOpen.size()); + if (EndPos == StringRef::npos) { + // No closing tag, the rest is text. 
+ Tokens.emplace_back(Template.substr(Cursor)); + TextStart = Cursor = Template.size(); + break; } - if (auto NewDelims = processTag(T, Tokens, Ctx)) { - std::tie(Open, Close) = *NewDelims; + // Extract tag content and full match. + size_t ContentStart = Cursor + TagOpen.size(); + StringRef Content = Template.substr(ContentStart, EndPos - ContentStart); + StringRef FullMatch = + Template.substr(Cursor, (EndPos + TagClose.size()) - Cursor); + + // Process the tag (inlined logic from processTag). + LLVM_DEBUG(dbgs() << "[Tag] " << FullMatch << ", Content: " << Content + << ", Kind: " << tagKindToString(Kind) << "\n"); + if (Kind == Tag::Kind::Triple) { + Tokens.emplace_back(FullMatch, Ctx.Saver.save("&" + Content), '&', Ctx); + } else { // Normal Tag + StringRef Interpolated = Content; + if (!Interpolated.trim().starts_with("=")) { + char Front = Interpolated.empty() ? ' ' : Interpolated.trim().front(); + Tokens.emplace_back(FullMatch, Interpolated, Front, Ctx); + } else { // Set Delimiter + Tokens.emplace_back(FullMatch, Interpolated, '=', Ctx); + StringRef DelimSpec = Interpolated.trim(); + DelimSpec = DelimSpec.drop_front(1); + DelimSpec = DelimSpec.take_until([](char C) { return C == '='; }); + DelimSpec = DelimSpec.trim(); + + auto [NewOpen, NewClose] = DelimSpec.split(' '); + LLVM_DEBUG(dbgs() << "[Set Delimiter] NewOpen: " << NewOpen + << ", NewClose: " << NewClose << "\n"); + Open = NewOpen; + Close = NewClose; + } } - // Move past the tag. - Start = T.StartPosition + T.FullMatch.size(); + // Move past the tag for the next iteration. + Cursor += FullMatch.size(); + TextStart = Cursor; } - // Fix up white spaces for: - // - open sections - // - inverted sections - // - close sections - // - comments - // - // This loop attempts to find standalone tokens and tries to trim out - // the surrounding whitespace. - // For example: - // if you have the template string - // {{#section}} \n Example \n{{/section}} - // The output should would be - // For example: - // \n Example \n + // Add any remaining text after the last tag. + if (TextStart < Template.size()) + Tokens.emplace_back(Template.substr(TextStart)); + + // Fix up white spaces for standalone tags. size_t LastIdx = Tokens.size() - 1; for (size_t Idx = 0, End = Tokens.size(); Idx < End; ++Idx) { Token &CurrentToken = Tokens[Idx]; Token::Type CurrentType = CurrentToken.getType(); - // Check if token type requires cleanup. - bool RequiresCleanUp = requiresCleanUp(CurrentType); - - if (!RequiresCleanUp) + if (!requiresCleanUp(CurrentType)) continue; - // We adjust the token body if there's no text behind or ahead. - // A token is considered to have no text ahead if the right of the previous - // token is a newline followed by spaces. - // A token is considered to have no text behind if the left of the next - // token is spaces followed by a newline. - // eg. 
- // "Line 1\n {{#section}} \n Line 2 \n {{/section}} \n Line 3" bool HasTextBehind = hasTextBehind(Idx, Tokens); bool HasTextAhead = hasTextAhead(Idx, Tokens); From 0ce03c2be4c43d19e4b63d805b13838f56621f3c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?= =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?= =?UTF-8?q?=E3=83=B3=29?= <clementval@gmail.com> Date: Tue, 4 Nov 2025 07:33:09 -1000 Subject: [PATCH 194/313] [flang][cuda] Add interface and lowering for atomicadd_r4x2 and atomicadd_r4x4 (#166308) --- .../flang/Optimizer/Builder/IntrinsicCall.h | 1 + flang/lib/Optimizer/Builder/IntrinsicCall.cpp | 71 +++++++++++-------- flang/module/cudadevice.f90 | 20 +++++- flang/test/Lower/CUDA/cuda-atomicadd.cuf | 24 +++++-- 4 files changed, 83 insertions(+), 33 deletions(-) diff --git a/flang/include/flang/Optimizer/Builder/IntrinsicCall.h b/flang/include/flang/Optimizer/Builder/IntrinsicCall.h index bbdef481a2085..b64419f5ae6da 100644 --- a/flang/include/flang/Optimizer/Builder/IntrinsicCall.h +++ b/flang/include/flang/Optimizer/Builder/IntrinsicCall.h @@ -190,6 +190,7 @@ struct IntrinsicLibrary { mlir::Value genAtomicAdd(mlir::Type, llvm::ArrayRef<mlir::Value>); fir::ExtendedValue genAtomicAddR2(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>); + template <int extent> fir::ExtendedValue genAtomicAddVector(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>); mlir::Value genAtomicAnd(mlir::Type, llvm::ArrayRef<mlir::Value>); diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp index b9ea8b125b780..3156c8cb4332c 100644 --- a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp +++ b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp @@ -290,12 +290,12 @@ static constexpr IntrinsicHandler handlers[]{ {"atan2pi", &I::genAtanpi}, {"atand", &I::genAtand}, {"atanpi", &I::genAtanpi}, - {"atomicadd_r2x2", - &I::genAtomicAddVector, + {"atomicadd_r4x2", + &I::genAtomicAddVector<2>, {{{"a", asAddr}, {"v", asAddr}}}, false}, - {"atomicadd_r4x2", - &I::genAtomicAddVector, + {"atomicadd_r4x4", + &I::genAtomicAddVector<4>, {{{"a", asAddr}, {"v", asAddr}}}, false}, {"atomicaddd", &I::genAtomicAdd, {{{"a", asAddr}, {"v", asValue}}}, false}, @@ -306,6 +306,14 @@ static constexpr IntrinsicHandler handlers[]{ &I::genAtomicAddR2, {{{"a", asAddr}, {"v", asAddr}}}, false}, + {"atomicaddvector_r2x2", + &I::genAtomicAddVector<2>, + {{{"a", asAddr}, {"v", asAddr}}}, + false}, + {"atomicaddvector_r4x2", + &I::genAtomicAddVector<2>, + {{{"a", asAddr}, {"v", asAddr}}}, + false}, {"atomicandi", &I::genAtomicAnd, {{{"a", asAddr}, {"v", asValue}}}, false}, {"atomiccasd", &I::genAtomicCas, @@ -3176,44 +3184,51 @@ IntrinsicLibrary::genAtomicAddR2(mlir::Type resultType, mlir::ArrayRef<int64_t>{0}); } +template <int extent> fir::ExtendedValue IntrinsicLibrary::genAtomicAddVector(mlir::Type resultType, llvm::ArrayRef<fir::ExtendedValue> args) { assert(args.size() == 2); mlir::Value res = fir::AllocaOp::create( - builder, loc, fir::SequenceType::get({2}, resultType)); + builder, loc, fir::SequenceType::get({extent}, resultType)); mlir::Value a = fir::getBase(args[0]); if (mlir::isa<fir::BaseBoxType>(a.getType())) { a = fir::BoxAddrOp::create(builder, loc, a); } - auto vecTy = mlir::VectorType::get({2}, resultType); + auto vecTy = mlir::VectorType::get({extent}, resultType); auto refTy = fir::ReferenceType::get(resultType); mlir::Type i32Ty = builder.getI32Type(); mlir::Type idxTy = builder.getIndexType(); - mlir::Value zero = 
builder.createIntegerConstant(loc, idxTy, 0); - mlir::Value one = builder.createIntegerConstant(loc, idxTy, 1); - mlir::Value v1Coord = fir::CoordinateOp::create(builder, loc, refTy, - fir::getBase(args[1]), zero); - mlir::Value v2Coord = fir::CoordinateOp::create(builder, loc, refTy, - fir::getBase(args[1]), one); - mlir::Value v1 = fir::LoadOp::create(builder, loc, v1Coord); - mlir::Value v2 = fir::LoadOp::create(builder, loc, v2Coord); + + // Extract the values from the array. + llvm::SmallVector<mlir::Value> values; + for (unsigned i = 0; i < extent; ++i) { + mlir::Value pos = builder.createIntegerConstant(loc, idxTy, i); + mlir::Value coord = fir::CoordinateOp::create(builder, loc, refTy, + fir::getBase(args[1]), pos); + mlir::Value value = fir::LoadOp::create(builder, loc, coord); + values.push_back(value); + } + // Pack extracted values into a vector to call the atomic add. mlir::Value undef = mlir::LLVM::UndefOp::create(builder, loc, vecTy); - mlir::Value vec1 = mlir::LLVM::InsertElementOp::create( - builder, loc, undef, v1, builder.createIntegerConstant(loc, i32Ty, 0)); - mlir::Value vec2 = mlir::LLVM::InsertElementOp::create( - builder, loc, vec1, v2, builder.createIntegerConstant(loc, i32Ty, 1)); + for (unsigned i = 0; i < extent; ++i) { + mlir::Value insert = mlir::LLVM::InsertElementOp::create( + builder, loc, undef, values[i], + builder.createIntegerConstant(loc, i32Ty, i)); + undef = insert; + } + // Atomic operation with a vector of values. mlir::Value add = - genAtomBinOp(builder, loc, mlir::LLVM::AtomicBinOp::fadd, a, vec2); - mlir::Value r1 = mlir::LLVM::ExtractElementOp::create( - builder, loc, add, builder.createIntegerConstant(loc, i32Ty, 0)); - mlir::Value r2 = mlir::LLVM::ExtractElementOp::create( - builder, loc, add, builder.createIntegerConstant(loc, i32Ty, 1)); - mlir::Value c1 = fir::CoordinateOp::create(builder, loc, refTy, res, zero); - mlir::Value c2 = fir::CoordinateOp::create(builder, loc, refTy, res, one); - fir::StoreOp::create(builder, loc, r1, c1); - fir::StoreOp::create(builder, loc, r2, c2); - mlir::Value ext = builder.createIntegerConstant(loc, idxTy, 2); + genAtomBinOp(builder, loc, mlir::LLVM::AtomicBinOp::fadd, a, undef); + // Store results in the result array. 
+ for (unsigned i = 0; i < extent; ++i) { + mlir::Value r = mlir::LLVM::ExtractElementOp::create( + builder, loc, add, builder.createIntegerConstant(loc, i32Ty, i)); + mlir::Value c = fir::CoordinateOp::create( + builder, loc, refTy, res, builder.createIntegerConstant(loc, idxTy, i)); + fir::StoreOp::create(builder, loc, r, c); + } + mlir::Value ext = builder.createIntegerConstant(loc, idxTy, extent); return fir::ArrayBoxValue(res, {ext}); } diff --git a/flang/module/cudadevice.f90 b/flang/module/cudadevice.f90 index b1aef95cba8c9..27097193aaa9b 100644 --- a/flang/module/cudadevice.f90 +++ b/flang/module/cudadevice.f90 @@ -1179,13 +1179,22 @@ attributes(device) pure integer(4) function atomicaddr2(address, val) end interface interface atomicaddvector - attributes(device) pure function atomicadd_r2x2(address, val) result(z) + attributes(device) pure function atomicaddvector_r2x2(address, val) result(z) !dir$ ignore_tkr (rd) address, (d) val real(2), dimension(2), intent(inout) :: address real(2), dimension(2), intent(in) :: val real(2), dimension(2) :: z end function + attributes(device) pure function atomicaddvector_r4x2(address, val) result(z) + !dir$ ignore_tkr (rd) address, (d) val + real(4), dimension(2), intent(inout) :: address + real(4), dimension(2), intent(in) :: val + real(4), dimension(2) :: z + end function + end interface + + interface atomicaddreal4x2 attributes(device) pure function atomicadd_r4x2(address, val) result(z) !dir$ ignore_tkr (rd) address, (d) val real(4), dimension(2), intent(inout) :: address @@ -1194,6 +1203,15 @@ attributes(device) pure function atomicadd_r4x2(address, val) result(z) end function end interface + interface atomicaddreal4x4 + attributes(device) pure function atomicadd_r4x4(address, val) result(z) + !dir$ ignore_tkr (rd) address, (d) val + real(4), dimension(4), intent(inout) :: address + real(4), dimension(4), intent(in) :: val + real(4), dimension(4) :: z + end function + end interface + interface atomicsub attributes(device) pure integer function atomicsubi(address, val) !dir$ ignore_tkr (d) address, (d) val diff --git a/flang/test/Lower/CUDA/cuda-atomicadd.cuf b/flang/test/Lower/CUDA/cuda-atomicadd.cuf index baa6cdb3d5869..6669b4afa291d 100644 --- a/flang/test/Lower/CUDA/cuda-atomicadd.cuf +++ b/flang/test/Lower/CUDA/cuda-atomicadd.cuf @@ -2,18 +2,34 @@ ! Test CUDA Fortran atmoicadd functions available cudadevice module -attributes(global) subroutine atomicaddvector_r2() +attributes(global) subroutine test_atomicaddvector_r2() real(2), device :: a(2), tmp1(2), tmp2(2) tmp1 = atomicAddVector(a, tmp2) end subroutine -! CHECK-LABEL: func.func @_QPatomicaddvector_r2() attributes {cuf.proc_attr = #cuf.cuda_proc<global>} +! CHECK-LABEL: func.func @_QPtest_atomicaddvector_r2() attributes {cuf.proc_attr = #cuf.cuda_proc<global>} ! CHECK: llvm.atomicrmw fadd %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, vector<2xf16> -attributes(global) subroutine atomicaddvector_r4() +attributes(global) subroutine test_atomicaddvector_r4() real(4), device :: a(2), tmp1(2), tmp2(2) tmp1 = atomicAddVector(a, tmp2) end subroutine -! CHECK-LABEL: func.func @_QPatomicaddvector_r4() attributes {cuf.proc_attr = #cuf.cuda_proc<global>} +! CHECK-LABEL: func.func @_QPtest_atomicaddvector_r4() attributes {cuf.proc_attr = #cuf.cuda_proc<global>} ! CHECK: llvm.atomicrmw fadd %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, vector<2xf32> + +attributes(global) subroutine test_atomicadd_r2x4() + real(4), device :: a(2), tmp1(2), tmp2(2) + tmp1 = atomicaddreal4x2(a, tmp2) +end subroutine + +! 
CHECK-LABEL: func.func @_QPtest_atomicadd_r2x4() attributes {cuf.proc_attr = #cuf.cuda_proc<global>} +! CHECK: llvm.atomicrmw fadd %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, vector<2xf32> + +attributes(global) subroutine test_atomicadd_r4x4() + real(4), device :: a(4), tmp1(4), tmp2(4) + tmp1 = atomicaddreal4x4(a, tmp2) +end subroutine + +! CHECK-LABEL: func.func @_QPtest_atomicadd_r4x4() attributes {cuf.proc_attr = #cuf.cuda_proc<global>} +! CHECK: llvm.atomicrmw fadd %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, vector<4xf32> From 6601c384d36e050ae007c691b2d0b9c479edbdf6 Mon Sep 17 00:00:00 2001 From: Greg Clayton <gclayton@fb.com> Date: Tue, 4 Nov 2025 09:36:54 -0800 Subject: [PATCH 195/313] Fix getting section info in large mach-o files. (#165940) Mach-O has 32-bit file offsets in the MachO::section_64 structs. dSYM files can contain sections whose start offset exceeds UINT32_MAX, which means the MachO::section_64.offset will get truncated. We can calculate when this happens and properly adjust the section offset to be 64-bit safe. This means tools can get the correct section contents for large dSYM files and allows tools that parse DWARF, like llvm-gsymutil, to load and convert these files correctly. --- llvm/include/llvm/Object/MachO.h | 2 +- llvm/lib/Object/MachOObjectFile.cpp | 26 ++++++++++++++++++++++++-- 2 files changed, 25 insertions(+), 3 deletions(-) diff --git a/llvm/include/llvm/Object/MachO.h b/llvm/include/llvm/Object/MachO.h index 01e7c6b07dd36..f4c1e30b097ee 100644 --- a/llvm/include/llvm/Object/MachO.h +++ b/llvm/include/llvm/Object/MachO.h @@ -447,7 +447,7 @@ class LLVM_ABI MachOObjectFile : public ObjectFile { uint64_t getSectionAddress(DataRefImpl Sec) const override; uint64_t getSectionIndex(DataRefImpl Sec) const override; uint64_t getSectionSize(DataRefImpl Sec) const override; - ArrayRef<uint8_t> getSectionContents(uint32_t Offset, uint64_t Size) const; + ArrayRef<uint8_t> getSectionContents(uint64_t Offset, uint64_t Size) const; Expected<ArrayRef<uint8_t>> getSectionContents(DataRefImpl Sec) const override; uint64_t getSectionAlignment(DataRefImpl Sec) const override; diff --git a/llvm/lib/Object/MachOObjectFile.cpp b/llvm/lib/Object/MachOObjectFile.cpp index e09dc947c2779..c2f4560c06c0d 100644 --- a/llvm/lib/Object/MachOObjectFile.cpp +++ b/llvm/lib/Object/MachOObjectFile.cpp @@ -1978,20 +1978,42 @@ uint64_t MachOObjectFile::getSectionSize(DataRefImpl Sec) const { return SectSize; } -ArrayRef<uint8_t> MachOObjectFile::getSectionContents(uint32_t Offset, +ArrayRef<uint8_t> MachOObjectFile::getSectionContents(uint64_t Offset, uint64_t Size) const { return arrayRefFromStringRef(getData().substr(Offset, Size)); } Expected<ArrayRef<uint8_t>> MachOObjectFile::getSectionContents(DataRefImpl Sec) const { - uint32_t Offset; + uint64_t Offset; uint64_t Size; if (is64Bit()) { MachO::section_64 Sect = getSection64(Sec); Offset = Sect.offset; Size = Sect.size; + // Check for large mach-o files where the section contents might exceed + // 4GB. MachO::section_64 objects only have 32 bit file offsets to the + // section contents and can overflow in dSYM files. We can track this and + // adjust the section offset to be 64 bit safe. If sections overflow then + // section ordering is enforced. If sections are not ordered, then an error + // will be returned stopping invalid section data from being returned.
+ uint64_t PrevTrueOffset = 0; + uint64_t SectOffsetAdjust = 0; + for (uint32_t SectIdx = 0; SectIdx < Sec.d.a; ++SectIdx) { + MachO::section_64 CurrSect = + getStruct<MachO::section_64>(*this, Sections[SectIdx]); + uint64_t CurrTrueOffset = (uint64_t)CurrSect.offset + SectOffsetAdjust; + if ((SectOffsetAdjust > 0) && (PrevTrueOffset > CurrTrueOffset)) + return malformedError("section data exceeds 4GB and section file " + "offsets are not ordered"); + const uint64_t EndSectFileOffset = + (uint64_t)CurrSect.offset + CurrSect.size; + if (EndSectFileOffset > UINT32_MAX) + SectOffsetAdjust += EndSectFileOffset & 0xFFFFFFFF00000000ull; + PrevTrueOffset = CurrTrueOffset; + } + Offset += SectOffsetAdjust; } else { MachO::section Sect = getSection(Sec); Offset = Sect.offset; From ecd67a7a95b3ec1a8798ff8a50d6668ec0106a28 Mon Sep 17 00:00:00 2001 From: Amr Hesham <amr96@programmer.net> Date: Tue, 4 Nov 2025 18:40:42 +0100 Subject: [PATCH 196/313] [CIR] Upstream CXXDefaultArgExpr for AggregateExpr (#165991) Upstream the CXXDefaultArgExpr support for AggregateExpr --- clang/lib/CIR/CodeGen/CIRGenExprAggregate.cpp | 4 +- clang/test/CIR/CodeGen/struct.cpp | 44 +++++++++++++++++++ 2 files changed, 46 insertions(+), 2 deletions(-) diff --git a/clang/lib/CIR/CodeGen/CIRGenExprAggregate.cpp b/clang/lib/CIR/CodeGen/CIRGenExprAggregate.cpp index 3d3030ca87e2a..201fb73983155 100644 --- a/clang/lib/CIR/CodeGen/CIRGenExprAggregate.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenExprAggregate.cpp @@ -343,8 +343,8 @@ class AggExprEmitter : public StmtVisitor<AggExprEmitter> { cgf.cgm.errorNYI(e->getSourceRange(), "AggExprEmitter: VisitNoInitExpr"); } void VisitCXXDefaultArgExpr(CXXDefaultArgExpr *dae) { - cgf.cgm.errorNYI(dae->getSourceRange(), - "AggExprEmitter: VisitCXXDefaultArgExpr"); + CIRGenFunction::CXXDefaultArgExprScope scope(cgf, dae); + Visit(dae->getExpr()); } void VisitCXXInheritedCtorInitExpr(const CXXInheritedCtorInitExpr *e) { cgf.cgm.errorNYI(e->getSourceRange(), diff --git a/clang/test/CIR/CodeGen/struct.cpp b/clang/test/CIR/CodeGen/struct.cpp index c8db71498e477..ee543001025e7 100644 --- a/clang/test/CIR/CodeGen/struct.cpp +++ b/clang/test/CIR/CodeGen/struct.cpp @@ -344,3 +344,47 @@ void struct_with_const_member_expr() { // OGCG: %[[BF_SET:.*]] = or i8 %[[BF_CLEAR]], 0 // OGCG: store i8 %[[BF_SET]], ptr %[[REF_ADDR]], align 4 // OGCG: store i32 0, ptr %[[A_ADDR]], align 4 + +void function_arg_with_default_value(CompleteS a = {1, 2}) {} + +// CIR: %[[ARG_ADDR:.*]] = cir.alloca !rec_CompleteS, !cir.ptr<!rec_CompleteS>, ["a", init] +// CIR: cir.store %{{.*}}, %[[ARG_ADDR]] : !rec_CompleteS, !cir.ptr<!rec_CompleteS> + +// LLVM: %[[ARG_ADDR:.*]] = alloca %struct.CompleteS, i64 1, align 4 +// LLVM: store %struct.CompleteS %{{.*}}, ptr %[[ARG_ADDR]], align 4 + +// OGCG: %[[ARG_ADDR:.*]] = alloca %struct.CompleteS, align 4 +// OGCG: store i64 %{{.*}}, ptr %[[ARG_ADDR]], align 4 + +void calling_function_with_default_values() { + function_arg_with_default_value(); +} + +// CIR: %[[AGG_ADDR:.*]] = cir.alloca !rec_CompleteS, !cir.ptr<!rec_CompleteS>, ["agg.tmp0"] +// CIR: %[[ELEM_0_PTR:.*]] = cir.get_member %[[AGG_ADDR]][0] {name = "a"} : !cir.ptr<!rec_CompleteS> -> !cir.ptr<!s32i> +// CIR: %[[CONST_1:.*]] = cir.const #cir.int<1> : !s32i +// CIR: cir.store{{.*}} %[[CONST_1]], %[[ELEM_0_PTR]] : !s32i, !cir.ptr<!s32i> +// CIR: %[[ELEM_1_PTR:.*]] = cir.get_member %[[AGG_ADDR]][1] {name = "b"} : !cir.ptr<!rec_CompleteS> -> !cir.ptr<!s8i> +// CIR: %[[CONST_2:.*]] = cir.const #cir.int<2> : !s32i +// CIR: %[[CONST_2_I8:.*]] 
= cir.cast integral %[[CONST_2]] : !s32i -> !s8i
+// CIR: cir.store{{.*}} %[[CONST_2_I8]], %[[ELEM_1_PTR]] : !s8i, !cir.ptr<!s8i>
+// CIR: %[[TMP_AGG:.*]] = cir.load{{.*}} %[[AGG_ADDR]] : !cir.ptr<!rec_CompleteS>, !rec_CompleteS
+// CIR: cir.call @_Z31function_arg_with_default_value9CompleteS(%[[TMP_AGG]]) : (!rec_CompleteS) -> ()
+
+// TODO(CIR): the difference between the CIR LLVM and OGCG output is due to the lack of calling convention lowering.
+
+// LLVM: %[[AGG_ADDR:.*]] = alloca %struct.CompleteS, i64 1, align 4
+// LLVM: %[[ELEM_0_PTR:.*]] = getelementptr %struct.CompleteS, ptr %[[AGG_ADDR]], i32 0, i32 0
+// LLVM: store i32 1, ptr %[[ELEM_0_PTR]], align 4
+// LLVM: %[[ELEM_1_PTR:.*]] = getelementptr %struct.CompleteS, ptr %[[AGG_ADDR]], i32 0, i32 1
+// LLVM: store i8 2, ptr %[[ELEM_1_PTR]], align 4
+// LLVM: %[[TMP_AGG:.*]] = load %struct.CompleteS, ptr %[[AGG_ADDR]], align 4
+// LLVM: call void @_Z31function_arg_with_default_value9CompleteS(%struct.CompleteS %[[TMP_AGG]])
+
+// OGCG: %[[AGG_ADDR:.*]] = alloca %struct.CompleteS, align 4
+// OGCG: %[[ELEM_0_PTR:.*]] = getelementptr inbounds nuw %struct.CompleteS, ptr %[[AGG_ADDR]], i32 0, i32 0
+// OGCG: store i32 1, ptr %[[ELEM_0_PTR]], align 4
+// OGCG: %[[ELEM_1_PTR:.*]] = getelementptr inbounds nuw %struct.CompleteS, ptr %[[AGG_ADDR]], i32 0, i32 1
+// OGCG: store i8 2, ptr %[[ELEM_1_PTR]], align 4
+// OGCG: %[[TMP_AGG:.*]] = load i64, ptr %[[AGG_ADDR]], align 4
+// OGCG: call void @_Z31function_arg_with_default_value9CompleteS(i64 %[[TMP_AGG]])

From a02e5740119a4d13542126b124f2c464b23738d4 Mon Sep 17 00:00:00 2001
From: Marcell Leleszi <59964679+mleleszi@users.noreply.github.com>
Date: Tue, 4 Nov 2025 18:41:01 +0100
Subject: [PATCH 197/313] [libc] Add faccessat entrypoints for aarch64 and riscv (#165869)

Add faccessat entrypoints for aarch64 and riscv Linux. Entrypoints are
removed if the faccessat2 syscall is not available.

---
 libc/config/linux/aarch64/entrypoints.txt | 3 +--
 libc/config/linux/aarch64/exclude.txt     | 8 ++++++++
 libc/config/linux/riscv/entrypoints.txt   | 1 +
 libc/config/linux/riscv/exclude.txt       | 8 ++++++++
 libc/config/linux/x86_64/exclude.txt      | 1 +
 5 files changed, 19 insertions(+), 2 deletions(-)
 create mode 100644 libc/config/linux/aarch64/exclude.txt
 create mode 100644 libc/config/linux/riscv/exclude.txt

diff --git a/libc/config/linux/aarch64/entrypoints.txt b/libc/config/linux/aarch64/entrypoints.txt
index 714120a79e39a..e0dd15b803253 100644
--- a/libc/config/linux/aarch64/entrypoints.txt
+++ b/libc/config/linux/aarch64/entrypoints.txt
@@ -325,8 +325,7 @@ set(TARGET_LIBC_ENTRYPOINTS
     libc.src.unistd.dup2
     libc.src.unistd.dup3
     libc.src.unistd.execve
-    # Disabled while SYS_faccessat2 is unavailable on the buildbot.
- # libc.src.unistd.faccessat + libc.src.unistd.faccessat libc.src.unistd.fchdir libc.src.unistd.fpathconf libc.src.unistd.fsync diff --git a/libc/config/linux/aarch64/exclude.txt b/libc/config/linux/aarch64/exclude.txt new file mode 100644 index 0000000000000..f2f553f78933c --- /dev/null +++ b/libc/config/linux/aarch64/exclude.txt @@ -0,0 +1,8 @@ +include(CheckSymbolExists) +check_symbol_exists(SYS_faccessat2 "sys/syscall.h" HAVE_SYS_FACCESSAT2) +if(NOT HAVE_SYS_FACCESSAT2) + message(VERBOSE "unistd.faccessat excluded from build, faccessat2 syscall is not available on the system") + list(APPEND TARGET_LLVMLIBC_REMOVED_ENTRYPOINTS + libc.src.unistd.faccessat + ) +endif() diff --git a/libc/config/linux/riscv/entrypoints.txt b/libc/config/linux/riscv/entrypoints.txt index f6bbb346d10e5..0d031d8844f13 100644 --- a/libc/config/linux/riscv/entrypoints.txt +++ b/libc/config/linux/riscv/entrypoints.txt @@ -329,6 +329,7 @@ set(TARGET_LIBC_ENTRYPOINTS libc.src.unistd.dup2 libc.src.unistd.dup3 libc.src.unistd.execve + libc.src.unistd.faccessat libc.src.unistd.fchdir libc.src.unistd.fpathconf libc.src.unistd.fsync diff --git a/libc/config/linux/riscv/exclude.txt b/libc/config/linux/riscv/exclude.txt new file mode 100644 index 0000000000000..f2f553f78933c --- /dev/null +++ b/libc/config/linux/riscv/exclude.txt @@ -0,0 +1,8 @@ +include(CheckSymbolExists) +check_symbol_exists(SYS_faccessat2 "sys/syscall.h" HAVE_SYS_FACCESSAT2) +if(NOT HAVE_SYS_FACCESSAT2) + message(VERBOSE "unistd.faccessat excluded from build, faccessat2 syscall is not available on the system") + list(APPEND TARGET_LLVMLIBC_REMOVED_ENTRYPOINTS + libc.src.unistd.faccessat + ) +endif() diff --git a/libc/config/linux/x86_64/exclude.txt b/libc/config/linux/x86_64/exclude.txt index a0686310d21ac..31b60a9c3497c 100644 --- a/libc/config/linux/x86_64/exclude.txt +++ b/libc/config/linux/x86_64/exclude.txt @@ -23,6 +23,7 @@ endif() include(CheckSymbolExists) check_symbol_exists(SYS_faccessat2 "sys/syscall.h" HAVE_SYS_FACCESSAT2) if(NOT HAVE_SYS_FACCESSAT2) + message(VERBOSE "unistd.faccessat excluded from build, faccessat2 syscall is not available on the system") list(APPEND TARGET_LLVMLIBC_REMOVED_ENTRYPOINTS libc.src.unistd.faccessat ) From 4ce58833d3653f0b15d5458b8430ec8cf25fdc16 Mon Sep 17 00:00:00 2001 From: Yingwei Zheng <dtcxzyw2333@gmail.com> Date: Wed, 5 Nov 2025 01:43:05 +0800 Subject: [PATCH 198/313] [SimplifyCFG] Fix value enumeration of a full range (#166379) ConstantRange uses `[-1, -1)` as the canonical form of a full set. Therefore, the `for (APInt I = Lower; I != Upper; ++I)` idiom doesn't work for full ranges. This patch fixes the value enumeration in `ConstantComparesGatherer` to prevent missing values for full sets. Closes https://github.com/llvm/llvm-project/issues/166369. 
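To make the failure mode concrete, here is a standalone sketch (using
`uint8_t` in place of `APInt`, so nothing here is code from this patch):
with a half-open wrapped range, `Lower == Upper` denotes the full set, so
the old `for` idiom exits before its first iteration, while the do/while
form visits every value exactly once, assuming the range is known to be
non-empty as it is at this call site.

```
#include <cstdint>
#include <vector>

// Enumerate a non-empty wrapped range [Lower, Upper) over uint8_t.
std::vector<uint8_t> enumerateNonEmptyRange(uint8_t Lower, uint8_t Upper) {
  std::vector<uint8_t> Vals;
  // Buggy idiom: for (uint8_t I = Lower; I != Upper; ++I) runs zero times
  // when Lower == Upper, the canonical encoding of the full set.
  uint8_t I = Lower;
  do
    Vals.push_back(I);  // the body runs at least once
  while (++I != Upper); // wraps modulo 256 and stops after all 256 values
  return Vals;
}
```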
---
 llvm/lib/Transforms/Utils/SimplifyCFG.cpp    |  4 ++-
 llvm/test/Transforms/SimplifyCFG/pr166369.ll | 37 ++++++++++++++++++++
 2 files changed, 40 insertions(+), 1 deletion(-)
 create mode 100644 llvm/test/Transforms/SimplifyCFG/pr166369.ll

diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
index 532511dcf91b0..3a3e3ade20212 100644
--- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -778,8 +778,10 @@ struct ConstantComparesGatherer {
       return false;

     // Add all values from the range to the set
-    for (APInt Tmp = Span.getLower(); Tmp != Span.getUpper(); ++Tmp)
+    APInt Tmp = Span.getLower();
+    do
       Vals.push_back(ConstantInt::get(I->getContext(), Tmp));
+    while (++Tmp != Span.getUpper());
     UsedICmps++;
     return true;
diff --git a/llvm/test/Transforms/SimplifyCFG/pr166369.ll b/llvm/test/Transforms/SimplifyCFG/pr166369.ll
new file mode 100644
index 0000000000000..c0a85c0293dd8
--- /dev/null
+++ b/llvm/test/Transforms/SimplifyCFG/pr166369.ll
@@ -0,0 +1,37 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -S -passes=simplifycfg < %s | FileCheck %s
+
+; Make sure we handle full-set ranges correctly.
+define void @test_i1() {
+; CHECK-LABEL: define void @test_i1() {
+; CHECK-NEXT:  [[BB:.*:]]
+; CHECK-NEXT:    ret void
+;
+bb:
+  %icmp = icmp ugt i1 false, true
+  br label %bb5
+
+bb5:
+  %select = select i1 %icmp, i1 %icmp, i1 false
+  br i1 %select, label %bb5, label %bb6
+
+bb6:
+  ret void
+}
+
+define void @test_i3() {
+; CHECK-LABEL: define void @test_i3() {
+; CHECK-NEXT:  [[BB:.*:]]
+; CHECK-NEXT:    ret void
+;
+bb:
+  %icmp = icmp ugt i3 0, 7
+  br label %bb5
+
+bb5:
+  %select = select i1 %icmp, i1 %icmp, i1 false
+  br i1 %select, label %bb5, label %bb6
+
+bb6:
+  ret void
+}

From 39221718519f2ea3710cc3f5940adb13639b4f80 Mon Sep 17 00:00:00 2001
From: alessandra simmons <alessandra@adrs.pub>
Date: Tue, 4 Nov 2025 12:45:55 -0500
Subject: [PATCH 199/313] [clang][Driver][HIP] Change OffloadingActionBuilder to respect the --no-gpu-bundle-output flag (#163834)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Currently, the command `clang -c -emit-llvm --no-gpu-bundle-output
--offload-arch=gfx900,gfx1030 -O3 -x hip square.hip` will lead to a
bundled output:

```
❯ ../bin/clang -c -emit-llvm --no-gpu-bundle-output --offload-arch=gfx900,gfx1030 -O3 -x hip square.hip
❯ ls
square.hip square.bc
```

This doesn't match my expectation of the behavior of
`--no-gpu-bundle-output`, so this adds a check into
OffloadingActionBuilder for the flag when replacing the host compile
action with a bundling action.

---
 clang/lib/Driver/Driver.cpp                   | 16 +++++++++----
 clang/test/Driver/no-gpu-bundle-respected.hip | 24 +++++++++++++++++++
 2 files changed, 35 insertions(+), 5 deletions(-)
 create mode 100644 clang/test/Driver/no-gpu-bundle-respected.hip

diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp
index 6f6a35b4c8c17..a0b82cec9a372 100644
--- a/clang/lib/Driver/Driver.cpp
+++ b/clang/lib/Driver/Driver.cpp
@@ -3857,6 +3857,9 @@ class OffloadingActionBuilder final {
   /// Flag set to true if all valid builders allow file bundling/unbundling.
   bool CanUseBundler;

+  /// Flag set to false if an argument turns off bundling.
+ bool ShouldUseBundler; + public: OffloadingActionBuilder(Compilation &C, DerivedArgList &Args, const Driver::InputList &Inputs) @@ -3891,6 +3894,9 @@ class OffloadingActionBuilder final { } CanUseBundler = ValidBuilders && ValidBuilders == ValidBuildersSupportingBundling; + + ShouldUseBundler = Args.hasFlag(options::OPT_gpu_bundle_output, + options::OPT_no_gpu_bundle_output, true); } ~OffloadingActionBuilder() { @@ -4042,11 +4048,11 @@ class OffloadingActionBuilder final { SB->appendTopLevelActions(OffloadAL); } - // If we can use the bundler, replace the host action by the bundling one in - // the resulting list. Otherwise, just append the device actions. For - // device only compilation, HostAction is a null pointer, therefore only do - // this when HostAction is not a null pointer. - if (CanUseBundler && HostAction && + // If we can and should use the bundler, replace the host action by the + // bundling one in the resulting list. Otherwise, just append the device + // actions. For device only compilation, HostAction is a null pointer, + // therefore only do this when HostAction is not a null pointer. + if (CanUseBundler && ShouldUseBundler && HostAction && HostAction->getType() != types::TY_Nothing && !OffloadAL.empty()) { // Add the host action to the list in order to create the bundling action. OffloadAL.push_back(HostAction); diff --git a/clang/test/Driver/no-gpu-bundle-respected.hip b/clang/test/Driver/no-gpu-bundle-respected.hip new file mode 100644 index 0000000000000..fc93640dc4b90 --- /dev/null +++ b/clang/test/Driver/no-gpu-bundle-respected.hip @@ -0,0 +1,24 @@ +// RUN: %clang -ccc-print-phases -c -emit-llvm \ +// RUN: --offload-arch=gfx900,gfx1030 -O3 -x hip %s \ +// RUN: 2>&1 | FileCheck %s --check-prefix=BUNDLE + +// RUN: %clang -ccc-print-phases -c -emit-llvm \ +// RUN: --gpu-bundle-output --offload-arch=gfx900,gfx1030 -O3 -x hip %s \ +// RUN: 2>&1 | FileCheck %s --check-prefix=BUNDLE + +// RUN: %clang -ccc-print-phases -c -emit-llvm \ +// RUN: --no-gpu-bundle-output --offload-arch=gfx900,gfx1030 -O3 -x hip %s \ +// RUN: 2>&1 | FileCheck %s --check-prefixes=COMPILER,GFX1030,GFX900,OFFLOAD,NOBUNDLE + +// BUNDLE: clang-offload-bundler +// NOBUNDLE-NOT: clang-offload-bundler + +// COM: sanity checks +// COMPILER: compiler +// GFX1030: (device-hip, gfx1030) +// GFX900: (device-hip, gfx900) +// OFFLOAD: offload + +int square(int num) { + return num * num; +} From 1aa86ca521b8fea5ff728945d5ea5cdef97a6250 Mon Sep 17 00:00:00 2001 From: "Joel E. Denny" <jdenny.ornl@gmail.com> Date: Tue, 4 Nov 2025 12:49:33 -0500 Subject: [PATCH 200/313] [LoopUnroll] Fix division by zero (#166258) PR #159163's probability computation for epilogue loops does not handle the possibility of an original loop probability of one. Runtime loop unrolling does not make sense for such an infinite loop, and a division by zero results. This patch works around that case. Issue #165998. 
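For intuition, here is a toy model of the hazard, using plain doubles and
an assumed geometric trip-count distribution. The exact expression in
probOfNextInRemainder below differs; only the vanishing denominator
carries over, so treat this strictly as an illustration.

```
#include <cmath>

// Toy sketch: with latch probability P, conditional probabilities over
// iteration-count ranges divide by terms like 1 - P^N, which is exactly
// zero when P == 1. That is why the patch adds an early return.
double probOfNextInRemainderSketch(double P, unsigned N) {
  if (P == 1.0)  // always-infinite original loop
    return 1.0;  // arbitrary choice: every remainder iteration executes
  double ProbAtLeastOne = P;             // P(m >= 1)
  double ProbAtLeastN = std::pow(P, N);  // P(m >= N)
  // Without the guard above, P == 1 would make this denominator zero.
  return (ProbAtLeastOne - ProbAtLeastN) / (1.0 - ProbAtLeastN);
}
```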
--- .../Transforms/Utils/LoopUnrollRuntime.cpp | 21 ++++ .../LoopUnroll/loop-probability-one.ll | 116 ++++++++++++++++++ 2 files changed, 137 insertions(+) create mode 100644 llvm/test/Transforms/LoopUnroll/loop-probability-one.ll diff --git a/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp b/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp index 1e8f6cc76900c..6c9467bf4a005 100644 --- a/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp +++ b/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp @@ -202,6 +202,27 @@ static void ConnectProlog(Loop *L, Value *BECount, unsigned Count, /// probability of executing at least one more iteration? static BranchProbability probOfNextInRemainder(BranchProbability OriginalLoopProb, unsigned N) { + // OriginalLoopProb == 1 would produce a division by zero in the calculation + // below. The problem is that case indicates an always infinite loop, but a + // remainder loop cannot be calculated at run time if the original loop is + // infinite as infinity % UnrollCount is undefined. We then choose + // probabilities indicating that all remainder loop iterations will always + // execute. + // + // Currently, the remainder loop here is an epilogue, which cannot be reached + // if the original loop is infinite, so the aforementioned choice is + // arbitrary. + // + // FIXME: Branch weights still need to be fixed in the case of prologues + // (issue #135812). In that case, the aforementioned choice seems reasonable + // for the goal of maintaining the original loop's block frequencies. That + // is, an infinite loop's initial iterations are not skipped, and the prologue + // loop body might have unique blocks that execute a finite number of times + // if, for example, the original loop body contains conditionals like i < + // UnrollCount. + if (OriginalLoopProb == BranchProbability::getOne()) + return BranchProbability::getOne(); + // Each of these variables holds the original loop's probability that the // number of iterations it will execute is some m in the specified range. BranchProbability ProbOne = OriginalLoopProb; // 1 <= m diff --git a/llvm/test/Transforms/LoopUnroll/loop-probability-one.ll b/llvm/test/Transforms/LoopUnroll/loop-probability-one.ll new file mode 100644 index 0000000000000..14f6da42df6b1 --- /dev/null +++ b/llvm/test/Transforms/LoopUnroll/loop-probability-one.ll @@ -0,0 +1,116 @@ +; Check that a loop probability of one (indicating an always infinite loop) does +; not crash or otherwise break LoopUnroll behavior when it tries to compute new +; probabilities from it. +; +; That case indicates an always infinite loop. A remainder loop cannot be +; calculated at run time when the original loop is infinite as infinity % +; UnrollCount is undefined, so consistent remainder loop probabilities are +; difficult or impossible to reason about. The implementation chooses +; probabilities indicating that all remainder loop iterations will always +; execute. 
+ +; DEFINE: %{unroll} = opt < %s -unroll-count=3 -passes=loop-unroll -S +; DEFINE: %{rt} = %{unroll} -unroll-runtime + +; RUN: %{unroll} | FileCheck %s -check-prefix UNROLL +; RUN: %{rt} -unroll-runtime-epilog=true | FileCheck %s -check-prefix EPILOG +; RUN: %{rt} -unroll-runtime-epilog=false | FileCheck %s -check-prefix PROLOG + +define void @test(i32 %n) { +entry: + br label %loop + +loop: + %i = phi i32 [ 0, %entry ], [ %inc, %loop ] + %inc = add i32 %i, 1 + %c = icmp slt i32 %inc, %n + br i1 %c, label %loop, label %end, !prof !0 + +end: + ret void +} + + +!0 = !{!"branch_weights", i32 1, i32 0} + +; UNROLL: define void @test(i32 %n) { +; UNROLL: entry: +; UNROLL: br label %loop +; UNROLL: loop: +; UNROLL: br i1 %c, label %loop.1, label %end, !prof !0 +; UNROLL: loop.1: +; UNROLL: br i1 %c.1, label %loop.2, label %end, !prof !0 +; UNROLL: loop.2: +; UNROLL: br i1 %c.2, label %loop, label %end, !prof !0, !llvm.loop !1 +; UNROLL-NOT: loop.3 +; UNROLL: end: +; UNROLL: ret void +; UNROLL: } +; +; Infinite unrolled loop. +; UNROLL: !0 = !{!"branch_weights", i32 1, i32 0} + +; EPILOG: define void @test(i32 %n) { +; EPILOG: entry: +; EPILOG: br i1 %{{.*}}, label %loop.epil.preheader, label %entry.new, !prof !0 +; EPILOG: entry.new: +; EPILOG: br label %loop +; EPILOG: loop: +; EPILOG: br i1 %{{.*}}, label %loop, label %end.unr-lcssa, !prof !1 +; EPILOG: end.unr-lcssa: +; EPILOG: br i1 %{{.*}}, label %loop.epil.preheader, label %end, !prof !1 +; EPILOG: loop.epil.preheader: +; EPILOG: br label %loop.epil +; EPILOG: loop.epil: +; EPILOG: br i1 %{{.*}}, label %loop.epil, label %end.epilog-lcssa, !prof !4 +; EPILOG: end.epilog-lcssa: +; EPILOG: br label %end +; EPILOG: end: +; EPILOG: ret void +; EPILOG: } +; +; Unrolled loop guard: Unrolled loop is always entered. +; EPILOG: !0 = !{!"branch_weights", i32 0, i32 -2147483648} +; +; Unrolled loop latch: Unrolled loop is infinite. +; Epilogue loop guard: Epilogue loop is always entered if unrolled loop exits. +; EPILOG: !1 = !{!"branch_weights", i32 -2147483648, i32 0} +; +; Epilogue loop latch: Epilogue loop executes both of its 2 iterations. +; EPILOG: !4 = !{!"branch_weights", i32 1073741824, i32 1073741824} + +; PROLOG: define void @test(i32 %n) { +; PROLOG: entry: +; PROLOG: br i1 %{{.*}}, label %loop.prol.preheader, label %loop.prol.loopexit, !prof !0 +; PROLOG: loop.prol.preheader: +; PROLOG: br label %loop.prol +; PROLOG: loop.prol: +; PROLOG: br i1 %{{.*}}, label %loop.prol, label %loop.prol.loopexit.unr-lcssa, !prof !1 +; PROLOG: loop.prol.loopexit.unr-lcssa: +; PROLOG: br label %loop.prol.loopexit +; PROLOG: loop.prol.loopexit: +; PROLOG: br i1 %{{.*}}, label %end, label %entry.new, !prof !0 +; PROLOG: entry.new: +; PROLOG: br label %loop +; PROLOG: loop: +; PROLOG: br i1 %{{.*}}, label %loop, label %end.unr-lcssa, !prof !4 +; PROLOG: end.unr-lcssa: +; PROLOG: br label %end +; PROLOG: end: +; PROLOG: ret void +; PROLOG: } +; +; FIXME: Branch weights still need to be fixed in the case of prologues (issue +; #135812), so !0 and !1 do not yet match their comments below. When we do +; fix it, this test will hopefully catch any bug like issue #165998, which +; impacted the case of epilogues. +; +; Prologue loop guard: Prologue loop is always entered. +; Unrolled loop guard: Unrolled loop is always entered. +; PROLOG: !0 = !{!"branch_weights", i32 1, i32 127} +; +; Prologue loop latch: Prologue loop executes both of its 2 iterations. +; PROLOG: !1 = !{!"branch_weights", i32 0, i32 1} +; +; Unrolled loop latch: Unrolled loop is infinite. 
+; PROLOG: !4 = !{!"branch_weights", i32 1, i32 0} From 1409db663139a644871362ffb23d725078bc84cf Mon Sep 17 00:00:00 2001 From: Utkarsh Saxena <usx@google.com> Date: Tue, 4 Nov 2025 12:53:58 -0500 Subject: [PATCH 201/313] [LifetimeSafety] Fix Python path for Windows compatibility (#166291) Fix Python virtual environment paths for Windows in the Lifetime Safety Analysis benchmark ### What changed? - Added conditional path setting for the Python executable in the virtual environment based on the platform - For Windows, use `Scripts/python` path - For other platforms, use `bin/python` path - Updated the commands that use the Python virtual environment to use the platform-specific path ### How to test? `ninja benchmark_lifetime_safety_analysis` Fixes #166143 --- clang/test/Analysis/LifetimeSafety/CMakeLists.txt | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/clang/test/Analysis/LifetimeSafety/CMakeLists.txt b/clang/test/Analysis/LifetimeSafety/CMakeLists.txt index ce37a29655668..2f9c2ac247497 100644 --- a/clang/test/Analysis/LifetimeSafety/CMakeLists.txt +++ b/clang/test/Analysis/LifetimeSafety/CMakeLists.txt @@ -15,6 +15,13 @@ set(LIFETIME_BENCHMARK_REQUIREMENTS set(LIFETIME_BENCHMARK_OUTPUT_DIR "${CMAKE_CURRENT_BINARY_DIR}/benchmark_results") +if(WIN32) + set(LIFETIME_BENCHMARK_VENV_PYTHON_EXECUTABLE + "${LIFETIME_BENCHMARK_VENV_DIR}/Scripts/python") +else() + set(LIFETIME_BENCHMARK_VENV_PYTHON_EXECUTABLE + "${LIFETIME_BENCHMARK_VENV_DIR}/bin/python") +endif() if(EXISTS ${LIFETIME_BENCHMARK_SCRIPT} AND EXISTS ${LIFETIME_BENCHMARK_REQUIREMENTS}) @@ -22,7 +29,7 @@ if(EXISTS ${LIFETIME_BENCHMARK_SCRIPT} AND EXISTS ${LIFETIME_BENCHMARK_REQUIREME add_custom_command( OUTPUT ${LIFETIME_BENCHMARK_VENV_DIR}/pyvenv.cfg COMMAND ${Python3_EXECUTABLE} -m venv ${LIFETIME_BENCHMARK_VENV_DIR} - COMMAND ${LIFETIME_BENCHMARK_VENV_DIR}/bin/python -m pip install -r ${LIFETIME_BENCHMARK_REQUIREMENTS} + COMMAND ${LIFETIME_BENCHMARK_VENV_PYTHON_EXECUTABLE} -m pip install -r ${LIFETIME_BENCHMARK_REQUIREMENTS} DEPENDS ${LIFETIME_BENCHMARK_REQUIREMENTS} COMMENT "Creating Python virtual environment and installing dependencies for benchmark..." ) @@ -32,7 +39,7 @@ if(EXISTS ${LIFETIME_BENCHMARK_SCRIPT} AND EXISTS ${LIFETIME_BENCHMARK_REQUIREME # Main benchmark target add_custom_target(benchmark_lifetime_safety_analysis - COMMAND ${LIFETIME_BENCHMARK_VENV_DIR}/bin/python ${LIFETIME_BENCHMARK_SCRIPT} + COMMAND ${LIFETIME_BENCHMARK_VENV_PYTHON_EXECUTABLE} ${LIFETIME_BENCHMARK_SCRIPT} --clang-binary ${LLVM_BINARY_DIR}/bin/clang --output-dir ${LIFETIME_BENCHMARK_OUTPUT_DIR} From 8aff0d99d6081355eeba2ed4e0a5484db0f4170b Mon Sep 17 00:00:00 2001 From: Jordan Rupprecht <rupprecht@google.com> Date: Tue, 4 Nov 2025 11:54:43 -0600 Subject: [PATCH 202/313] [test] Avoid writing test output to readonly dir (#166404) Omitting `-o /dev/null` may try to write output to the current dir, which may not have write permissions on some build systems. 
This fixes the test added by #165737 --- llvm/test/CodeGen/NVPTX/cmpxchg-unsupported-syncscope.err.ll | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/test/CodeGen/NVPTX/cmpxchg-unsupported-syncscope.err.ll b/llvm/test/CodeGen/NVPTX/cmpxchg-unsupported-syncscope.err.ll index d3853e2fdaa88..4d81fdc67736d 100644 --- a/llvm/test/CodeGen/NVPTX/cmpxchg-unsupported-syncscope.err.ll +++ b/llvm/test/CodeGen/NVPTX/cmpxchg-unsupported-syncscope.err.ll @@ -1,4 +1,4 @@ -; RUN: not llc -mcpu=sm_100a -mtriple=nvptx64 -mattr=+ptx86 %s 2>&1 | FileCheck %s +; RUN: not llc -mcpu=sm_100a -mtriple=nvptx64 -mattr=+ptx86 %s -o /dev/null 2>&1 | FileCheck %s ; Test that we get a clear error message when using an unsupported syncscope. From 208b7360152bdc92a9089efaadd2167549b73908 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim <llvm-dev@redking.me.uk> Date: Tue, 4 Nov 2025 17:55:06 +0000 Subject: [PATCH 203/313] [X86] select-copy.mir - regenerate test checks. NFC (#166405) --- .../CodeGen/X86/GlobalISel/select-copy.mir | 136 +++++++++--------- 1 file changed, 70 insertions(+), 66 deletions(-) diff --git a/llvm/test/CodeGen/X86/GlobalISel/select-copy.mir b/llvm/test/CodeGen/X86/GlobalISel/select-copy.mir index 41e1b5bf22bf1..5c059a4e0539d 100644 --- a/llvm/test/CodeGen/X86/GlobalISel/select-copy.mir +++ b/llvm/test/CodeGen/X86/GlobalISel/select-copy.mir @@ -1,5 +1,6 @@ -# RUN: llc -mtriple=i386-linux-gnu -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=X32 -# RUN: llc -mtriple=x86_64-linux-gnu -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=X64 +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6 +# RUN: llc -mtriple=i386-linux-gnu -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,X86 +# RUN: llc -mtriple=x86_64-linux-gnu -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,X64 --- | @@ -30,24 +31,23 @@ ... --- name: test_copy -# ALL-LABEL: name: test_copy alignment: 16 legalized: true regBankSelected: true -# ALL: registers: -# ALL-NEXT: - { id: 0, class: gr8, preferred-register: '', flags: [ ] } -# ALL-NEXT: - { id: 1, class: gr32, preferred-register: '', flags: [ ] } registers: - { id: 0, class: gpr, preferred-register: '' } - { id: 1, class: gpr, preferred-register: '' } -# ALL: %0:gr8 = COPY $al -# ALL-NEXT: %1:gr32 = MOVZX32rr8 %0 -# ALL-NEXT: $eax = COPY %1 -# ALL-NEXT: RET 0, implicit $eax body: | bb.1 (%ir-block.0): liveins: $eax + ; CHECK-LABEL: name: test_copy + ; CHECK: liveins: $eax + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gr8 = COPY $al + ; CHECK-NEXT: [[MOVZX32rr8_:%[0-9]+]]:gr32 = MOVZX32rr8 [[COPY]] + ; CHECK-NEXT: $eax = COPY [[MOVZX32rr8_]] + ; CHECK-NEXT: RET 0, implicit $eax %0(s8) = COPY $al %1(s32) = G_ZEXT %0(s8) $eax = COPY %1(s32) @@ -56,24 +56,23 @@ body: | ... 
--- name: test_copy2 -# ALL-LABEL: name: test_copy2 alignment: 16 legalized: true regBankSelected: true -# ALL: registers: -# ALL-NEXT: - { id: 0, class: gr8, preferred-register: '', flags: [ ] } -# ALL-NEXT: - { id: 1, class: gr32, preferred-register: '', flags: [ ] } registers: - { id: 0, class: gpr, preferred-register: '' } - { id: 1, class: gpr, preferred-register: '' } -# ALL: %0:gr8 = COPY $al -# ALL-NEXT: %1:gr32 = MOVZX32rr8 %0 -# ALL-NEXT: $eax = COPY %1 -# ALL-NEXT: RET 0, implicit $eax body: | bb.1 (%ir-block.0): liveins: $eax + ; CHECK-LABEL: name: test_copy2 + ; CHECK: liveins: $eax + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gr8 = COPY $al + ; CHECK-NEXT: [[MOVZX32rr8_:%[0-9]+]]:gr32 = MOVZX32rr8 [[COPY]] + ; CHECK-NEXT: $eax = COPY [[MOVZX32rr8_]] + ; CHECK-NEXT: RET 0, implicit $eax %0(s8) = COPY $al %1(s32) = G_ZEXT %0(s8) $eax = COPY %1(s32) @@ -82,30 +81,35 @@ body: | ... --- name: test_copy3 -# ALL-LABEL: name: test_copy3 alignment: 16 legalized: true regBankSelected: true -# ALL: registers: -# ALL-NEXT: - { id: 0, class: gr16[[ABCD:(_abcd)?]], preferred-register: '', flags: [ ] } -# X32-NEXT: - { id: 1, class: gr8_abcd_l, preferred-register: '', flags: [ ] } -# X64-NEXT: - { id: 1, class: gr8, preferred-register: '', flags: [ ] } -# ALL-NEXT: - { id: 2, class: gr32, preferred-register: '', flags: [ ] } registers: - { id: 0, class: gpr, preferred-register: '' } - { id: 1, class: gpr, preferred-register: '' } - { id: 2, class: gpr, preferred-register: '' } -# ALL: %0:gr16 = COPY $ax -# X32-NEXT: %3:gr16_abcd = COPY %0 -# X32-NEXT: %1:gr8_abcd_l = COPY %3.sub_8bit -# X64-NEXT: %1:gr8 = COPY %0.sub_8bit -# ALL-NEXT: %2:gr32 = MOVZX32rr8 %1 -# ALL-NEXT: $eax = COPY %2 -# ALL-NEXT: RET 0, implicit $eax body: | bb.1 (%ir-block.0): liveins: $eax + ; X86-LABEL: name: test_copy3 + ; X86: liveins: $eax + ; X86-NEXT: {{ $}} + ; X86-NEXT: [[COPY:%[0-9]+]]:gr16 = COPY $ax + ; X86-NEXT: [[COPY1:%[0-9]+]]:gr16_abcd = COPY [[COPY]] + ; X86-NEXT: [[COPY2:%[0-9]+]]:gr8_abcd_l = COPY [[COPY1]].sub_8bit + ; X86-NEXT: [[MOVZX32rr8_:%[0-9]+]]:gr32 = MOVZX32rr8 [[COPY2]] + ; X86-NEXT: $eax = COPY [[MOVZX32rr8_]] + ; X86-NEXT: RET 0, implicit $eax + ; + ; X64-LABEL: name: test_copy3 + ; X64: liveins: $eax + ; X64-NEXT: {{ $}} + ; X64-NEXT: [[COPY:%[0-9]+]]:gr16 = COPY $ax + ; X64-NEXT: [[COPY1:%[0-9]+]]:gr8 = COPY [[COPY]].sub_8bit + ; X64-NEXT: [[MOVZX32rr8_:%[0-9]+]]:gr32 = MOVZX32rr8 [[COPY1]] + ; X64-NEXT: $eax = COPY [[MOVZX32rr8_]] + ; X64-NEXT: RET 0, implicit $eax %0(s16) = COPY $ax %1(s8) = G_TRUNC %0(s16) %2(s32) = G_ZEXT %1(s8) @@ -115,27 +119,25 @@ body: | ... 
--- name: test_copy4 -# ALL-LABEL: name: test_copy4 alignment: 16 legalized: true regBankSelected: true -# ALL: registers: -# ALL-NEXT: - { id: 0, class: gr32, preferred-register: '', flags: [ ] } -# ALL-NEXT: - { id: 1, class: gr16, preferred-register: '', flags: [ ] } -# ALL-NEXT: - { id: 2, class: gr32, preferred-register: '', flags: [ ] } registers: - { id: 0, class: gpr, preferred-register: '' } - { id: 1, class: gpr, preferred-register: '' } - { id: 2, class: gpr, preferred-register: '' } -# ALL: %0:gr32 = COPY $eax -# ALL-NEXT: %1:gr16 = COPY %0.sub_16bit -# ALL-NEXT: %2:gr32 = MOVZX32rr16 %1 -# ALL-NEXT: $eax = COPY %2 -# ALL-NEXT: RET 0, implicit $eax body: | bb.1 (%ir-block.0): liveins: $eax + ; CHECK-LABEL: name: test_copy4 + ; CHECK: liveins: $eax + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gr32 = COPY $eax + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gr16 = COPY [[COPY]].sub_16bit + ; CHECK-NEXT: [[MOVZX32rr16_:%[0-9]+]]:gr32 = MOVZX32rr16 [[COPY1]] + ; CHECK-NEXT: $eax = COPY [[MOVZX32rr16_]] + ; CHECK-NEXT: RET 0, implicit $eax %0(s32) = COPY $eax %1(s16) = G_TRUNC %0(s32) %2(s32) = G_ZEXT %1(s16) @@ -145,30 +147,35 @@ body: | ... --- name: test_copy5 -# ALL-LABEL: name: test_copy5 alignment: 16 legalized: true regBankSelected: true -# ALL: registers: -# ALL-NEXT: - { id: 0, class: gr32[[ABCD:(_abcd)?]], preferred-register: '', flags: [ ] } -# X32-NEXT: - { id: 1, class: gr8_abcd_l, preferred-register: '', flags: [ ] } -# X64-NEXT: - { id: 1, class: gr8, preferred-register: '', flags: [ ] } -# ALL-NEXT: - { id: 2, class: gr32, preferred-register: '', flags: [ ] } registers: - { id: 0, class: gpr, preferred-register: '' } - { id: 1, class: gpr, preferred-register: '' } - { id: 2, class: gpr, preferred-register: '' } -# ALL: %0:gr32 = COPY $edx -# X32-NEXT: %3:gr32_abcd = COPY %0 -# X32-NEXT: %1:gr8_abcd_l = COPY %3.sub_8bit -# X64-NEXT: %1:gr8 = COPY %0.sub_8bit -# ALL-NEXT: %2:gr32 = MOVZX32rr8 %1 -# ALL-NEXT: $eax = COPY %2 -# ALL-NEXT: RET 0, implicit $eax body: | bb.1 (%ir-block.0): liveins: $eax,$edx + ; X86-LABEL: name: test_copy5 + ; X86: liveins: $eax, $edx + ; X86-NEXT: {{ $}} + ; X86-NEXT: [[COPY:%[0-9]+]]:gr32 = COPY $edx + ; X86-NEXT: [[COPY1:%[0-9]+]]:gr32_abcd = COPY [[COPY]] + ; X86-NEXT: [[COPY2:%[0-9]+]]:gr8_abcd_l = COPY [[COPY1]].sub_8bit + ; X86-NEXT: [[MOVZX32rr8_:%[0-9]+]]:gr32 = MOVZX32rr8 [[COPY2]] + ; X86-NEXT: $eax = COPY [[MOVZX32rr8_]] + ; X86-NEXT: RET 0, implicit $eax + ; + ; X64-LABEL: name: test_copy5 + ; X64: liveins: $eax, $edx + ; X64-NEXT: {{ $}} + ; X64-NEXT: [[COPY:%[0-9]+]]:gr32 = COPY $edx + ; X64-NEXT: [[COPY1:%[0-9]+]]:gr8 = COPY [[COPY]].sub_8bit + ; X64-NEXT: [[MOVZX32rr8_:%[0-9]+]]:gr32 = MOVZX32rr8 [[COPY1]] + ; X64-NEXT: $eax = COPY [[MOVZX32rr8_]] + ; X64-NEXT: RET 0, implicit $eax %0(s32) = COPY $edx %1(s8) = G_TRUNC %0(s32) %2(s32) = G_ANYEXT %1(s8) @@ -178,29 +185,26 @@ body: | ... 
--- name: test_copy6 -# ALL-LABEL: name: test_copy6 alignment: 16 legalized: true regBankSelected: true -# ALL: registers: -# ALL-NEXT: - { id: 0, class: gr32, preferred-register: '', flags: [ ] } -# ALL-NEXT: - { id: 1, class: gr16, preferred-register: '', flags: [ ] } -# ALL-NEXT: - { id: 2, class: low32_addr_access_rbp, preferred-register: '', flags: [ ] } -# ALL-NEXT: - { id: 3, class: low32_addr_access_rbp, preferred-register: '', flags: [ ] } registers: - { id: 0, class: gpr, preferred-register: '' } - { id: 1, class: gpr, preferred-register: '' } - { id: 2, class: gpr, preferred-register: '' } -# ALL: %0:gr32 = COPY $edx -# ALL-NEXT: %1:gr16 = COPY %0.sub_16bit -# ALL-NEXT: %3:low32_addr_access_rbp = IMPLICIT_DEF -# ALL-NEXT: %2:low32_addr_access_rbp = INSERT_SUBREG %3, %1, %subreg.sub_16bit -# ALL-NEXT: $eax = COPY %2 -# ALL-NEXT: RET 0, implicit $eax body: | bb.1 (%ir-block.0): liveins: $eax,$edx + ; CHECK-LABEL: name: test_copy6 + ; CHECK: liveins: $eax, $edx + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gr32 = COPY $edx + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gr16 = COPY [[COPY]].sub_16bit + ; CHECK-NEXT: [[DEF:%[0-9]+]]:low32_addr_access_rbp = IMPLICIT_DEF + ; CHECK-NEXT: [[INSERT_SUBREG:%[0-9]+]]:low32_addr_access_rbp = INSERT_SUBREG [[DEF]], [[COPY1]], %subreg.sub_16bit + ; CHECK-NEXT: $eax = COPY [[INSERT_SUBREG]] + ; CHECK-NEXT: RET 0, implicit $eax %0(s32) = COPY $edx %1(s16) = G_TRUNC %0(s32) %2(s32) = G_ANYEXT %1(s16) From 8208591f274c831e0f72003eb466ee4076c7dbe7 Mon Sep 17 00:00:00 2001 From: Craig Topper <craig.topper@sifive.com> Date: Tue, 4 Nov 2025 09:56:59 -0800 Subject: [PATCH 204/313] [RISCV] Use TargetConstant for the immediate RISCVISD::SHL_ADD nodes. (#166312) This is consistent with some other nodes that require a constant. Particularly intrinsics with ImmArg. --- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 32 ++++++++++--------- llvm/lib/Target/RISCV/RISCVInstrInfoXTHead.td | 6 ++-- llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td | 6 ++-- 3 files changed, 25 insertions(+), 19 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index c56ce3fd2a5a4..b8605629e2dfe 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -9186,7 +9186,7 @@ static SDValue lowerSelectToBinOp(SDNode *N, SelectionDAG &DAG, unsigned ShAmount = Log2_64(TrueM1); if (Subtarget.hasShlAdd(ShAmount)) return DAG.getNode(RISCVISD::SHL_ADD, DL, VT, CondV, - DAG.getConstant(ShAmount, DL, VT), CondV); + DAG.getTargetConstant(ShAmount, DL, VT), CondV); } } // (select c, y, 0) -> -c & y @@ -15463,7 +15463,7 @@ static SDValue transformAddShlImm(SDNode *N, SelectionDAG &DAG, SDValue NS = (C0 < C1) ? N0->getOperand(0) : N1->getOperand(0); SDValue NL = (C0 > C1) ? 
N0->getOperand(0) : N1->getOperand(0); SDValue SHADD = DAG.getNode(RISCVISD::SHL_ADD, DL, VT, NL, - DAG.getConstant(Diff, DL, VT), NS); + DAG.getTargetConstant(Diff, DL, VT), NS); return DAG.getNode(ISD::SHL, DL, VT, SHADD, DAG.getConstant(Bits, DL, VT)); } @@ -15501,7 +15501,7 @@ static SDValue combineShlAddIAddImpl(SDNode *N, SDValue AddI, SDValue Other, int64_t AddConst = AddVal.getSExtValue(); SDValue SHADD = DAG.getNode(RISCVISD::SHL_ADD, DL, VT, SHLVal->getOperand(0), - DAG.getConstant(ShlConst, DL, VT), Other); + DAG.getTargetConstant(ShlConst, DL, VT), Other); return DAG.getNode(ISD::ADD, DL, VT, SHADD, DAG.getSignedConstant(AddConst, DL, VT)); } @@ -16501,9 +16501,9 @@ static SDValue getShlAddShlAdd(SDNode *N, SelectionDAG &DAG, unsigned ShX, EVT VT = N->getValueType(0); SDValue X = N->getOperand(0); SDValue Mul359 = DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X, - DAG.getConstant(ShY, DL, VT), X); + DAG.getTargetConstant(ShY, DL, VT), X); return DAG.getNode(RISCVISD::SHL_ADD, DL, VT, Mul359, - DAG.getConstant(ShX, DL, VT), Mul359); + DAG.getTargetConstant(ShX, DL, VT), Mul359); } static SDValue expandMulToShlAddShlAdd(SDNode *N, SelectionDAG &DAG, @@ -16571,12 +16571,12 @@ static SDValue expandMul(SDNode *N, SelectionDAG &DAG, SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, X, DAG.getConstant(Shift, DL, VT)); return DAG.getNode(RISCVISD::SHL_ADD, DL, VT, Shl, - DAG.getConstant(ShXAmount, DL, VT), Shl); + DAG.getTargetConstant(ShXAmount, DL, VT), Shl); } // Otherwise, put the shl second so that it can fold with following // instructions (e.g. sext or add). SDValue Mul359 = DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X, - DAG.getConstant(ShXAmount, DL, VT), X); + DAG.getTargetConstant(ShXAmount, DL, VT), X); return DAG.getNode(ISD::SHL, DL, VT, Mul359, DAG.getConstant(Shift, DL, VT)); } @@ -16596,7 +16596,7 @@ static SDValue expandMul(SDNode *N, SelectionDAG &DAG, SDValue Shift1 = DAG.getNode(ISD::SHL, DL, VT, X, DAG.getConstant(ShiftAmt, DL, VT)); return DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X, - DAG.getConstant(ScaleShift, DL, VT), Shift1); + DAG.getTargetConstant(ScaleShift, DL, VT), Shift1); } } @@ -16609,10 +16609,11 @@ static SDValue expandMul(SDNode *N, SelectionDAG &DAG, assert(Shift != 0 && "MulAmt=4,6,10 handled before"); if (Shift <= 3) { SDLoc DL(N); - SDValue Mul359 = DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X, - DAG.getConstant(ShXAmount, DL, VT), X); + SDValue Mul359 = + DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X, + DAG.getTargetConstant(ShXAmount, DL, VT), X); return DAG.getNode(RISCVISD::SHL_ADD, DL, VT, Mul359, - DAG.getConstant(Shift, DL, VT), X); + DAG.getTargetConstant(Shift, DL, VT), X); } } @@ -16624,9 +16625,10 @@ static SDValue expandMul(SDNode *N, SelectionDAG &DAG, SDLoc DL(N); SDValue Shift1 = DAG.getNode(ISD::SHL, DL, VT, X, DAG.getConstant(ShiftAmt, DL, VT)); - return DAG.getNode(ISD::ADD, DL, VT, Shift1, - DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X, - DAG.getConstant(ScaleShift, DL, VT), X)); + return DAG.getNode( + ISD::ADD, DL, VT, Shift1, + DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X, + DAG.getTargetConstant(ScaleShift, DL, VT), X)); } } @@ -16641,7 +16643,7 @@ static SDValue expandMul(SDNode *N, SelectionDAG &DAG, DAG.getNode(ISD::SHL, DL, VT, X, DAG.getConstant(ShAmt, DL, VT)); SDValue Mul359 = DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X, - DAG.getConstant(Log2_64(Offset - 1), DL, VT), X); + DAG.getTargetConstant(Log2_64(Offset - 1), DL, VT), X); return DAG.getNode(ISD::SUB, DL, VT, Shift1, Mul359); } } diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXTHead.td 
b/llvm/lib/Target/RISCV/RISCVInstrInfoXTHead.td
index b37ceaaee9cf4..c2b25c6294019 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoXTHead.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXTHead.td
@@ -60,6 +60,8 @@ def immfour : RISCVOp {
   let DecoderMethod = "decodeImmFourOperand";
 }

+def tuimm2 : TImmLeaf<XLenVT, [{return isUInt<2>(Imm);}]>;
+
 //===----------------------------------------------------------------------===//
 // Instruction class templates
 //===----------------------------------------------------------------------===//
@@ -557,8 +559,8 @@ multiclass VPatTernaryVMAQA_VV_VX<string intrinsic, string instruction,
 let Predicates = [HasVendorXTHeadBa] in {
 def : Pat<(add_like_non_imm12 (shl GPR:$rs2, uimm2:$uimm2), (XLenVT GPR:$rs1)),
           (TH_ADDSL GPR:$rs1, GPR:$rs2, uimm2:$uimm2)>;
-def : Pat<(XLenVT (riscv_shl_add GPR:$rs2, uimm2:$uimm2, GPR:$rs1)),
-          (TH_ADDSL GPR:$rs1, GPR:$rs2, uimm2:$uimm2)>;
+def : Pat<(XLenVT (riscv_shl_add GPR:$rs2, tuimm2:$uimm2, GPR:$rs1)),
+          (TH_ADDSL GPR:$rs1, GPR:$rs2, tuimm2:$uimm2)>;

 // Reuse complex patterns from StdExtZba
 def : Pat<(add_like_non_imm12 sh1add_op:$rs2, (XLenVT GPR:$rs1)),
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td
index 4537bfe8025ca..8376da52be53e 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td
@@ -53,6 +53,8 @@ def uimm5gt3 : RISCVOp<XLenVT>, ImmLeaf<XLenVT,
   let OperandType = "OPERAND_UIMM5_GT3";
 }

+def tuimm5gt3 : TImmLeaf<XLenVT, [{return (Imm > 3) && isUInt<5>(Imm);}]>;
+
 def UImm5Plus1AsmOperand : AsmOperandClass {
   let Name = "UImm5Plus1";
   let RenderMethod = "addImmOperands";
@@ -1419,8 +1421,8 @@ def : Pat<(i32 (add GPRNoX0:$rd, (mul GPRNoX0:$rs1, simm12_lo:$imm12))),
           (QC_MULIADD GPRNoX0:$rd, GPRNoX0:$rs1, simm12_lo:$imm12)>;
 def : Pat<(i32 (add_like_non_imm12 (shl GPRNoX0:$rs1, (i32 uimm5gt3:$imm)), GPRNoX0:$rs2)),
           (QC_SHLADD GPRNoX0:$rs1, GPRNoX0:$rs2, uimm5gt3:$imm)>;
-def : Pat<(i32 (riscv_shl_add GPRNoX0:$rs1, (i32 uimm5gt3:$imm), GPRNoX0:$rs2)),
-          (QC_SHLADD GPRNoX0:$rs1, GPRNoX0:$rs2, uimm5gt3:$imm)>;
+def : Pat<(i32 (riscv_shl_add GPRNoX0:$rs1, (i32 tuimm5gt3:$imm), GPRNoX0:$rs2)),
+          (QC_SHLADD GPRNoX0:$rs1, GPRNoX0:$rs2, tuimm5gt3:$imm)>;
 } // Predicates = [HasVendorXqciac, IsRV32]

 /// Simple arithmetic operations

From 6217f351bb8862a224e8b4434fa6376d803c292f Mon Sep 17 00:00:00 2001
From: actink <actink@163.com>
Date: Wed, 5 Nov 2025 02:03:02 +0800
Subject: [PATCH 205/313] [NFC][AMDGPU] use DAG.UpdateNodeOperands to update the chain (#166396)

---
 llvm/lib/Target/AMDGPU/R600ISelLowering.cpp | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp b/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp
index 2aa54c920a046..31eca049fd149 100644
--- a/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp
@@ -1129,12 +1129,9 @@ SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
   if ((AS == AMDGPUAS::PRIVATE_ADDRESS) && TruncatingStore) {
     // Add an extra level of chain to isolate this vector
     SDValue NewChain = DAG.getNode(AMDGPUISD::DUMMY_CHAIN, DL, MVT::Other, Chain);
-    // TODO: can the chain be replaced without creating a new store?
-    SDValue NewStore = DAG.getTruncStore(
-        NewChain, DL, Value, Ptr, StoreNode->getPointerInfo(), MemVT,
-        StoreNode->getAlign(), StoreNode->getMemOperand()->getFlags(),
-        StoreNode->getAAInfo());
-    StoreNode = cast<StoreSDNode>(NewStore);
+    SmallVector<SDValue, 4> NewOps(StoreNode->ops());
+    NewOps[0] = NewChain;
+    StoreNode = cast<StoreSDNode>(DAG.UpdateNodeOperands(StoreNode, NewOps));
   }

   return scalarizeVectorStore(StoreNode, DAG);

From fb21f16fe6fd1a1fa03662510bde042309ac8ae1 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Tue, 4 Nov 2025 10:06:29 -0800
Subject: [PATCH 206/313] RuntimeLibcalls: Add stub API for getting function signatures (#166290)

Eventually this should be generated by tablegen for all functions. For
now add a manual implementation for sincos_stret, which I have an
immediate use for.

This will allow pulling repeated code across targets into shared call
sequence code.

Also add sqrt just to make sure we can handle adding return attributes
on the declaration.

---
 llvm/include/llvm/IR/RuntimeLibcalls.h        |  7 ++
 llvm/lib/IR/RuntimeLibcalls.cpp               | 79 +++++++++++++++++++
 .../Utils/DeclareRuntimeLibcalls.cpp          | 48 ++++++++++-
 .../Util/DeclareRuntimeLibcalls/basic.ll      |  4 +
 .../merge_attributes.ll                       | 11 +++
 .../DeclareRuntimeLibcalls/sincos_stret.ll    | 23 ++++++
 .../wrong_declaration.ll                      | 21 +++++
 7 files changed, 189 insertions(+), 4 deletions(-)
 create mode 100644 llvm/test/Transforms/Util/DeclareRuntimeLibcalls/merge_attributes.ll
 create mode 100644 llvm/test/Transforms/Util/DeclareRuntimeLibcalls/sincos_stret.ll
 create mode 100644 llvm/test/Transforms/Util/DeclareRuntimeLibcalls/wrong_declaration.ll

diff --git a/llvm/include/llvm/IR/RuntimeLibcalls.h b/llvm/include/llvm/IR/RuntimeLibcalls.h
index 01359894b0421..ab14ed44fed52 100644
--- a/llvm/include/llvm/IR/RuntimeLibcalls.h
+++ b/llvm/include/llvm/IR/RuntimeLibcalls.h
@@ -186,6 +186,13 @@ struct RuntimeLibcallsInfo {
     return RTLIB::Unsupported;
   }

+  /// \returns the function type and attributes for the \p LibcallImpl,
+  /// depending on the target \p TT. If the function has incomplete type
+  /// information, return nullptr for the function type.
+ std::pair<FunctionType *, AttributeList> + getFunctionTy(LLVMContext &Ctx, const Triple &TT, const DataLayout &DL, + RTLIB::LibcallImpl LibcallImpl) const; + private: LLVM_ABI static iota_range<RTLIB::LibcallImpl> lookupLibcallImplNameImpl(StringRef Name); diff --git a/llvm/lib/IR/RuntimeLibcalls.cpp b/llvm/lib/IR/RuntimeLibcalls.cpp index 77af29b9d70f6..2ce5719228a0d 100644 --- a/llvm/lib/IR/RuntimeLibcalls.cpp +++ b/llvm/lib/IR/RuntimeLibcalls.cpp @@ -7,7 +7,9 @@ //===----------------------------------------------------------------------===// #include "llvm/IR/RuntimeLibcalls.h" +#include "llvm/ADT/FloatingPointMode.h" #include "llvm/ADT/StringTable.h" +#include "llvm/IR/DataLayout.h" #include "llvm/Support/Debug.h" #include "llvm/Support/xxhash.h" #include "llvm/TargetParser/ARMTargetParser.h" @@ -72,3 +74,80 @@ bool RuntimeLibcallsInfo::darwinHasExp10(const Triple &TT) { return false; } } + +std::pair<FunctionType *, AttributeList> +RuntimeLibcallsInfo::getFunctionTy(LLVMContext &Ctx, const Triple &TT, + const DataLayout &DL, + RTLIB::LibcallImpl LibcallImpl) const { + static constexpr Attribute::AttrKind CommonFnAttrs[] = { + Attribute::NoCallback, Attribute::NoFree, Attribute::NoSync, + Attribute::NoUnwind, Attribute::WillReturn}; + + switch (LibcallImpl) { + case RTLIB::impl___sincos_stret: + case RTLIB::impl___sincosf_stret: { + if (!darwinHasSinCosStret(TT)) // Non-darwin currently unexpected + return {}; + + Type *ScalarTy = LibcallImpl == RTLIB::impl___sincosf_stret + ? Type::getFloatTy(Ctx) + : Type::getDoubleTy(Ctx); + + AttrBuilder FuncAttrBuilder(Ctx); + for (Attribute::AttrKind Attr : CommonFnAttrs) + FuncAttrBuilder.addAttribute(Attr); + + const bool UseSret = + TT.isX86_32() || ((TT.isARM() || TT.isThumb()) && + ARM::computeTargetABI(TT) == ARM::ARM_ABI_APCS); + + FuncAttrBuilder.addMemoryAttr(MemoryEffects::argumentOrErrnoMemOnly( + UseSret ? ModRefInfo::Mod : ModRefInfo::NoModRef, ModRefInfo::Mod)); + + AttributeList Attrs; + Attrs = Attrs.addFnAttributes(Ctx, FuncAttrBuilder); + + if (UseSret) { + AttrBuilder AttrBuilder(Ctx); + StructType *StructTy = StructType::get(ScalarTy, ScalarTy); + AttrBuilder.addStructRetAttr(StructTy); + AttrBuilder.addAlignmentAttr(DL.getABITypeAlign(StructTy)); + FunctionType *FuncTy = FunctionType::get( + Type::getVoidTy(Ctx), {DL.getAllocaPtrType(Ctx), ScalarTy}, false); + + return {FuncTy, Attrs.addParamAttributes(Ctx, 0, AttrBuilder)}; + } + + Type *RetTy = + LibcallImpl == RTLIB::impl___sincosf_stret && TT.isX86_64() + ? static_cast<Type *>(FixedVectorType::get(ScalarTy, 2)) + : static_cast<Type *>(StructType::get(ScalarTy, ScalarTy)); + + return {FunctionType::get(RetTy, {ScalarTy}, false), Attrs}; + } + case RTLIB::impl_sqrtf: + case RTLIB::impl_sqrt: { + AttrBuilder FuncAttrBuilder(Ctx); + + for (Attribute::AttrKind Attr : CommonFnAttrs) + FuncAttrBuilder.addAttribute(Attr); + FuncAttrBuilder.addMemoryAttr(MemoryEffects::errnoMemOnly(ModRefInfo::Mod)); + + AttributeList Attrs; + Attrs = Attrs.addFnAttributes(Ctx, FuncAttrBuilder); + + Type *ScalarTy = LibcallImpl == RTLIB::impl_sqrtf ? 
Type::getFloatTy(Ctx) + : Type::getDoubleTy(Ctx); + FunctionType *FuncTy = FunctionType::get(ScalarTy, {ScalarTy}, false); + + Attrs = Attrs.addRetAttribute( + Ctx, Attribute::getWithNoFPClass(Ctx, fcNegInf | fcNegSubnormal | + fcNegNormal)); + return {FuncTy, Attrs}; + } + default: + return {}; + } + + return {}; +} diff --git a/llvm/lib/Transforms/Utils/DeclareRuntimeLibcalls.cpp b/llvm/lib/Transforms/Utils/DeclareRuntimeLibcalls.cpp index 0642d51cd2c21..6d4436b92c119 100644 --- a/llvm/lib/Transforms/Utils/DeclareRuntimeLibcalls.cpp +++ b/llvm/lib/Transforms/Utils/DeclareRuntimeLibcalls.cpp @@ -16,22 +16,62 @@ using namespace llvm; +static void mergeAttributes(LLVMContext &Ctx, const Module &M, + const DataLayout &DL, const Triple &TT, + Function *Func, FunctionType *FuncTy, + AttributeList FuncAttrs) { + AttributeList OldAttrs = Func->getAttributes(); + AttributeList NewAttrs = OldAttrs; + + { + AttrBuilder OldBuilder(Ctx, OldAttrs.getFnAttrs()); + AttrBuilder NewBuilder(Ctx, FuncAttrs.getFnAttrs()); + OldBuilder.merge(NewBuilder); + NewAttrs = NewAttrs.addFnAttributes(Ctx, OldBuilder); + } + + { + AttrBuilder OldBuilder(Ctx, OldAttrs.getRetAttrs()); + AttrBuilder NewBuilder(Ctx, FuncAttrs.getRetAttrs()); + OldBuilder.merge(NewBuilder); + NewAttrs = NewAttrs.addRetAttributes(Ctx, OldBuilder); + } + + for (unsigned I = 0, E = FuncTy->getNumParams(); I != E; ++I) { + AttrBuilder OldBuilder(Ctx, OldAttrs.getParamAttrs(I)); + AttrBuilder NewBuilder(Ctx, FuncAttrs.getParamAttrs(I)); + OldBuilder.merge(NewBuilder); + NewAttrs = NewAttrs.addParamAttributes(Ctx, I, OldBuilder); + } + + Func->setAttributes(NewAttrs); +} + PreservedAnalyses DeclareRuntimeLibcallsPass::run(Module &M, ModuleAnalysisManager &MAM) { RTLIB::RuntimeLibcallsInfo RTLCI(M.getTargetTriple()); LLVMContext &Ctx = M.getContext(); + const DataLayout &DL = M.getDataLayout(); + const Triple &TT = M.getTargetTriple(); for (RTLIB::LibcallImpl Impl : RTLCI.getLibcallImpls()) { if (Impl == RTLIB::Unsupported) continue; - // TODO: Declare with correct type, calling convention, and attributes. + auto [FuncTy, FuncAttrs] = RTLCI.getFunctionTy(Ctx, TT, DL, Impl); - FunctionType *FuncTy = - FunctionType::get(Type::getVoidTy(Ctx), {}, /*IsVarArgs=*/true); + // TODO: Declare with correct type, calling convention, and attributes. + if (!FuncTy) + FuncTy = FunctionType::get(Type::getVoidTy(Ctx), {}, /*IsVarArgs=*/true); StringRef FuncName = RTLCI.getLibcallImplName(Impl); - M.getOrInsertFunction(FuncName, FuncTy); + + Function *Func = + cast<Function>(M.getOrInsertFunction(FuncName, FuncTy).getCallee()); + if (Func->getFunctionType() == FuncTy) { + mergeAttributes(Ctx, M, DL, TT, Func, FuncTy, FuncAttrs); + Func->setCallingConv(RTLCI.getLibcallImplCallingConv(Impl)); + } } return PreservedAnalyses::none(); diff --git a/llvm/test/Transforms/Util/DeclareRuntimeLibcalls/basic.ll b/llvm/test/Transforms/Util/DeclareRuntimeLibcalls/basic.ll index ee3a0539bf300..c005316f07f06 100644 --- a/llvm/test/Transforms/Util/DeclareRuntimeLibcalls/basic.ll +++ b/llvm/test/Transforms/Util/DeclareRuntimeLibcalls/basic.ll @@ -11,5 +11,9 @@ define float @sinf(float %x) { } ; CHECK: declare void @acosf(...) + +; CHECK: declare nofpclass(ninf nsub nnorm) float @sqrtf(float) [[SQRT_ATTRS:#[0-9]+]] +; CHECK: declare nofpclass(ninf nsub nnorm) double @sqrt(double) [[SQRT_ATTRS:#[0-9]+]] + ; CHECK: declare void @__umodti3(...) 
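For context on what the next test exercises: the merge keeps whatever the
user's declaration already carries and layers the canonical libcall
attributes on top. A minimal sketch of the function-attribute half follows
(it mirrors the AttrBuilder::merge and addFnAttributes calls that
mergeAttributes() in this patch uses; the helper name is illustrative):

```
#include "llvm/IR/Attributes.h"
#include "llvm/IR/Function.h"
using namespace llvm;

// Combine a function's existing fn attributes with a canonical set.
void mergeFnAttrsSketch(LLVMContext &Ctx, Function *Func,
                        AttributeList Canonical) {
  AttrBuilder OldB(Ctx, Func->getAttributes().getFnAttrs()); // e.g. {"foo"}
  AttrBuilder NewB(Ctx, Canonical.getFnAttrs());
  OldB.merge(NewB); // union; NewB overwrites conflicting valued attributes
  Func->setAttributes(Func->getAttributes().addFnAttributes(Ctx, OldB));
}
```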
diff --git a/llvm/test/Transforms/Util/DeclareRuntimeLibcalls/merge_attributes.ll b/llvm/test/Transforms/Util/DeclareRuntimeLibcalls/merge_attributes.ll new file mode 100644 index 0000000000000..ffbf11d4106dc --- /dev/null +++ b/llvm/test/Transforms/Util/DeclareRuntimeLibcalls/merge_attributes.ll @@ -0,0 +1,11 @@ +; RUN: opt -S -passes=declare-runtime-libcalls -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s + +define noundef nofpclass(nan) float @sqrtf(float %x) "foo" { + %ret = call float asm "; $0 = sqrt($1)", "=r,r"(float %x) + ret float %ret +} + +; FIXME: Individual fields of nofpclass not merged +; CHECK: define noundef nofpclass(ninf nsub nnorm) float @sqrtf(float %x) [[SQRT_ATTR:#[0-9]+]] { + +; CHECK: attributes [[SQRT_ATTR]] = { nocallback nofree nosync nounwind willreturn memory(errnomem: write) "foo" } diff --git a/llvm/test/Transforms/Util/DeclareRuntimeLibcalls/sincos_stret.ll b/llvm/test/Transforms/Util/DeclareRuntimeLibcalls/sincos_stret.ll new file mode 100644 index 0000000000000..0d0e3da25eea7 --- /dev/null +++ b/llvm/test/Transforms/Util/DeclareRuntimeLibcalls/sincos_stret.ll @@ -0,0 +1,23 @@ +; RUN: opt -S -passes=declare-runtime-libcalls -mtriple=x86_64-apple-macos10.9 < %s | FileCheck -check-prefixes=CHECK,X64 %s +; RUN: opt -S -passes=declare-runtime-libcalls -mtriple=arm64-apple-macos10.9 < %s | FileCheck -check-prefixes=CHECK,STRUCT %s +; RUN: opt -S -passes=declare-runtime-libcalls -mtriple=thumbv7k-apple-watchos2.0 < %s | FileCheck -check-prefixes=CHECK,STRUCT %s +; RUN: opt -S -passes=declare-runtime-libcalls -mtriple=armv7-apple-ios7 < %s | FileCheck -check-prefix=SRET %s +; RUN: opt -S -passes=declare-runtime-libcalls -mtriple=thumbv7-apple-ios7 < %s | FileCheck -check-prefix=SRET %s + +; RUN: opt -S -passes=declare-runtime-libcalls -mtriple=armv7-apple-ios6 < %s | FileCheck -check-prefix=NONE %s +; RUN: opt -S -passes=declare-runtime-libcalls -mtriple=x86_64-apple-macos10.8 < %s | FileCheck -check-prefix=NONE %s + +; X64: declare <2 x float> @__sincosf_stret(float) [[SINCOS_ATTRS:#[0-9]+]] +; X64: declare { double, double } @__sincos_stret(double) [[SINCOS_ATTRS:#[0-9]+]] + +; STRUCT: declare { float, float } @__sincosf_stret(float) [[SINCOS_ATTRS:#[0-9]+]] +; STRUCT: declare { double, double } @__sincos_stret(double) [[SINCOS_ATTRS:#[0-9]+]] + +; SRET: declare void @__sincosf_stret(ptr sret({ float, float }) align 4, float) [[SINCOS_ATTRS:#[0-9]+]] +; SRET: declare void @__sincos_stret(ptr sret({ double, double }) align 4, double) [[SINCOS_ATTRS:#[0-9]+]] + +; CHECK: attributes [[SINCOS_ATTRS]] = { nocallback nofree nosync nounwind willreturn memory(errnomem: write) } +; SRET: attributes [[SINCOS_ATTRS]] = { nocallback nofree nosync nounwind willreturn memory(argmem: write, errnomem: write) } + +; NONE-NOT: __sincos_stret +; NONE-NOT: __sincosf_stret diff --git a/llvm/test/Transforms/Util/DeclareRuntimeLibcalls/wrong_declaration.ll b/llvm/test/Transforms/Util/DeclareRuntimeLibcalls/wrong_declaration.ll new file mode 100644 index 0000000000000..2451010df5b75 --- /dev/null +++ b/llvm/test/Transforms/Util/DeclareRuntimeLibcalls/wrong_declaration.ll @@ -0,0 +1,21 @@ +; RUN: opt -S -passes=declare-runtime-libcalls -mtriple=x86_64-apple-macos10.9 < %s | FileCheck %s + +; Make sure there is no crash if there are definitions or declarations +; with the wrong type signature. 
+ +; CHECK: define void @sqrtf() { +define void @sqrtf() { + ret void +} + +; CHECK: define float @sqrt(float %0) { +define float @sqrt(float) { + ret float 0.0 +} + +; CHECK: declare double @__sincos_stret(double) +declare double @__sincos_stret(double) + +; CHECK: declare { float, float } @__sincosf_stret(float) +declare { float, float } @__sincosf_stret(float) + From 8ee1803538f90e63ee81cc6f51ff294925f83192 Mon Sep 17 00:00:00 2001 From: Amr Hesham <amr96@programmer.net> Date: Tue, 4 Nov 2025 19:12:41 +0100 Subject: [PATCH 207/313] [CIR] Upstream Builtin ExpOp (#166061) Upstream the Builtin ExpOp --- clang/include/clang/CIR/Dialect/IR/CIROps.td | 10 +++++++++ clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp | 11 ++++++++++ .../CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp | 8 +++++++ .../CIR/CodeGen/builtins-floating-point.c | 21 +++++++++++++++++++ 4 files changed, 50 insertions(+) diff --git a/clang/include/clang/CIR/Dialect/IR/CIROps.td b/clang/include/clang/CIR/Dialect/IR/CIROps.td index dc56db1bbd4ea..6f9a69e697cc3 100644 --- a/clang/include/clang/CIR/Dialect/IR/CIROps.td +++ b/clang/include/clang/CIR/Dialect/IR/CIROps.td @@ -4191,6 +4191,16 @@ def CIR_CosOp : CIR_UnaryFPToFPBuiltinOp<"cos", "CosOp"> { }]; } +def CIR_ExpOp : CIR_UnaryFPToFPBuiltinOp<"exp", "ExpOp"> { + let summary = "Computes the floating-point base-e exponential value"; + let description = [{ + `cir.exp` computes the exponential of a floating-point operand and returns + a result of the same type. + + Floating-point exceptions are ignored, and it does not set `errno`. + }]; +} + def CIR_FAbsOp : CIR_UnaryFPToFPBuiltinOp<"fabs", "FAbsOp"> { let summary = "Computes the floating-point absolute value"; let description = [{ diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp index d9b9e3b877b50..0803910f2e83a 100644 --- a/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp @@ -222,6 +222,17 @@ RValue CIRGenFunction::emitBuiltinExpr(const GlobalDecl &gd, unsigned builtinID, assert(!cir::MissingFeatures::fastMathFlags()); return emitUnaryMaybeConstrainedFPBuiltin<cir::CeilOp>(*this, *e); + case Builtin::BIexp: + case Builtin::BIexpf: + case Builtin::BIexpl: + case Builtin::BI__builtin_exp: + case Builtin::BI__builtin_expf: + case Builtin::BI__builtin_expf16: + case Builtin::BI__builtin_expl: + case Builtin::BI__builtin_expf128: + assert(!cir::MissingFeatures::fastMathFlags()); + return emitUnaryMaybeConstrainedFPBuiltin<cir::ExpOp>(*this, *e); + case Builtin::BIfabs: case Builtin::BIfabsf: case Builtin::BIfabsl: diff --git a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp index d94108294a9a3..ba967a43ce59a 100644 --- a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp +++ b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp @@ -194,6 +194,14 @@ mlir::LogicalResult CIRToLLVMCosOpLowering::matchAndRewrite( return mlir::success(); } +mlir::LogicalResult CIRToLLVMExpOpLowering::matchAndRewrite( + cir::ExpOp op, OpAdaptor adaptor, + mlir::ConversionPatternRewriter &rewriter) const { + mlir::Type resTy = typeConverter->convertType(op.getType()); + rewriter.replaceOpWithNewOp<mlir::LLVM::ExpOp>(op, resTy, adaptor.getSrc()); + return mlir::success(); +} + static mlir::Value getLLVMIntCast(mlir::ConversionPatternRewriter &rewriter, mlir::Value llvmSrc, mlir::Type llvmDstIntTy, bool isUnsigned, uint64_t cirSrcWidth, diff --git a/clang/test/CIR/CodeGen/builtins-floating-point.c 
b/clang/test/CIR/CodeGen/builtins-floating-point.c index 8bdc43c59dc6f..1b7de650662c7 100644 --- a/clang/test/CIR/CodeGen/builtins-floating-point.c +++ b/clang/test/CIR/CodeGen/builtins-floating-point.c @@ -25,3 +25,24 @@ float ceil(float f) { // LLVM: %{{.*}} = call float @llvm.ceil.f32(float %{{.*}}) // OGCG: %{{.*}} = call float @llvm.ceil.f32(float %{{.*}}) } + +float expf(float f) { + return __builtin_expf(f); + // CIR: %{{.*}} = cir.exp {{.*}} : !cir.float + // LLVM: %{{.*}} = call float @llvm.exp.f32(float %{{.*}}) + // OGCG: %{{.*}} = call float @llvm.exp.f32(float %{{.*}}) +} + +double exp(double f) { + return __builtin_exp(f); + // CIR: %{{.*}} = cir.exp {{.*}} : !cir.double + // LLVM: %{{.*}} = call double @llvm.exp.f64(double %{{.*}}) + // OGCG: %{{.*}} = call double @llvm.exp.f64(double %{{.*}}) +} + +long double expl(long double f) { + return __builtin_expl(f); + // CIR: %{{.*}} = cir.exp {{.*}} : !cir.long_double<!cir.f128> + // LLVM: %{{.*}} = call fp128 @llvm.exp.f128(fp128 %{{.*}}) + // OGCG: %{{.*}} = call fp128 @llvm.exp.f128(fp128 %{{.*}}) +} From fe106b6e73086f54ca880fee393fcafc45e8209c Mon Sep 17 00:00:00 2001 From: Matt Arsenault <Matthew.Arsenault@amd.com> Date: Tue, 4 Nov 2025 10:19:32 -0800 Subject: [PATCH 208/313] BasicTTI: Cleanup multiple result intrinsic handling (#165970) Avoid weird lambda returning function pointer and sink the libcall logic to where the operation is handled. This allows chaining the libcall logic to try sincos_stret and fallback to sincos. The resulting cost seems too low. --- llvm/include/llvm/CodeGen/BasicTTIImpl.h | 49 +++++++++++-------- .../test/Analysis/CostModel/AArch64/sincos.ll | 21 ++++++-- 2 files changed, 46 insertions(+), 24 deletions(-) diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h index e8dbc964a943e..221d8f1e2f673 100644 --- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h +++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h @@ -302,7 +302,6 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> { /// (e.g. scalarization). std::optional<InstructionCost> getMultipleResultIntrinsicVectorLibCallCost( const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind, - RTLIB::Libcall LC, std::optional<unsigned> CallRetElementIndex = {}) const { Type *RetTy = ICA.getReturnType(); // Vector variants of the intrinsic can be mapped to a vector library call. @@ -311,12 +310,38 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> { !isVectorizedStructTy(cast<StructType>(RetTy))) return std::nullopt; + Type *Ty = getContainedTypes(RetTy).front(); + EVT VT = getTLI()->getValueType(DL, Ty); + + EVT ScalarVT = VT.getScalarType(); + RTLIB::Libcall LC = RTLIB::UNKNOWN_LIBCALL; + + switch (ICA.getID()) { + case Intrinsic::modf: + LC = RTLIB::getMODF(ScalarVT); + break; + case Intrinsic::sincospi: + LC = RTLIB::getSINCOSPI(ScalarVT); + break; + case Intrinsic::sincos: + LC = RTLIB::getSINCOS(ScalarVT); + break; + default: + return std::nullopt; + } + // Find associated libcall. - const char *LCName = getTLI()->getLibcallName(LC); - if (!LCName) + RTLIB::LibcallImpl LibcallImpl = getTLI()->getLibcallImpl(LC); + if (LibcallImpl == RTLIB::Unsupported) return std::nullopt; + StringRef LCName = + RTLIB::RuntimeLibcallsInfo::getLibcallImplName(LibcallImpl); + // Search for a corresponding vector variant. + // + // FIXME: Should use RuntimeLibcallsInfo, not TargetLibraryInfo to get the + // vector mapping. 
LLVMContext &Ctx = RetTy->getContext(); ElementCount VF = getVectorizedTypeVF(RetTy); VecDesc const *VD = nullptr; @@ -2137,22 +2162,6 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> { case Intrinsic::modf: case Intrinsic::sincos: case Intrinsic::sincospi: { - Type *Ty = getContainedTypes(RetTy).front(); - EVT VT = getTLI()->getValueType(DL, Ty); - - RTLIB::Libcall LC = [&] { - switch (ICA.getID()) { - case Intrinsic::modf: - return RTLIB::getMODF; - case Intrinsic::sincos: - return RTLIB::getSINCOS; - case Intrinsic::sincospi: - return RTLIB::getSINCOSPI; - default: - llvm_unreachable("unexpected intrinsic"); - } - }()(VT.getScalarType()); - std::optional<unsigned> CallRetElementIndex; // The first element of the modf result is returned by value in the // libcall. @@ -2160,7 +2169,7 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> { CallRetElementIndex = 0; if (auto Cost = getMultipleResultIntrinsicVectorLibCallCost( - ICA, CostKind, LC, CallRetElementIndex)) + ICA, CostKind, CallRetElementIndex)) return *Cost; // Otherwise, fallback to default scalarization cost. break; diff --git a/llvm/test/Analysis/CostModel/AArch64/sincos.ll b/llvm/test/Analysis/CostModel/AArch64/sincos.ll index 32408acb582d0..72c8f2bbbf8cf 100644 --- a/llvm/test/Analysis/CostModel/AArch64/sincos.ll +++ b/llvm/test/Analysis/CostModel/AArch64/sincos.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "sincos" ; RUN: opt < %s -mtriple=aarch64-gnu-linux -mattr=+neon,+sve -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck %s ; RUN: opt < %s -mtriple=aarch64-gnu-linux -mattr=+neon,+sve -vector-library=ArmPL -passes="print<cost-model>" -intrinsic-cost-strategy=intrinsic-cost -cost-kind=throughput 2>&1 -disable-output | FileCheck %s -check-prefix=CHECK-VECLIB +; RUN: opt < %s -mtriple=arm64-apple-macos10.9 -mattr=+neon -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck -check-prefix=SINCOS_STRET %s define void @sincos() { ; CHECK-LABEL: 'sincos' @@ -8,13 +9,11 @@ define void @sincos() { ; CHECK: Cost Model: Found an estimated cost of 10 for instruction: %f32 = call { float, float } @llvm.sincos.f32(float poison) ; CHECK: Cost Model: Found an estimated cost of 10 for instruction: %f64 = call { double, double } @llvm.sincos.f64(double poison) ; CHECK: Cost Model: Found an estimated cost of 10 for instruction: %f128 = call { fp128, fp128 } @llvm.sincos.f128(fp128 poison) -; ; CHECK: Cost Model: Found an estimated cost of 36 for instruction: %v8f16 = call { <8 x half>, <8 x half> } @llvm.sincos.v8f16(<8 x half> poison) ; CHECK: Cost Model: Found an estimated cost of 52 for instruction: %v4f32 = call { <4 x float>, <4 x float> } @llvm.sincos.v4f32(<4 x float> poison) ; CHECK: Cost Model: Found an estimated cost of 24 for instruction: %v2f64 = call { <2 x double>, <2 x double> } @llvm.sincos.v2f64(<2 x double> poison) ; CHECK: Cost Model: Found an estimated cost of 10 for instruction: %v1f128 = call { <1 x fp128>, <1 x fp128> } @llvm.sincos.v1f128(<1 x fp128> poison) ; CHECK: Cost Model: Found an estimated cost of 104 for instruction: %v8f32 = call { <8 x float>, <8 x float> } @llvm.sincos.v8f32(<8 x float> poison) -; ; CHECK: Cost Model: Invalid cost for instruction: %nxv8f16 = call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.sincos.nxv8f16(<vscale x 8 x half> poison) ; CHECK: Cost Model: Invalid cost for instruction: %nxv4f32 = call { <vscale x 4 x 
float>, <vscale x 4 x float> } @llvm.sincos.nxv4f32(<vscale x 4 x float> poison) ; CHECK: Cost Model: Invalid cost for instruction: %nxv2f64 = call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.sincos.nxv2f64(<vscale x 2 x double> poison) @@ -26,18 +25,32 @@ define void @sincos() { ; CHECK-VECLIB: Cost Model: Found an estimated cost of 10 for instruction: %f32 = call { float, float } @llvm.sincos.f32(float poison) ; CHECK-VECLIB: Cost Model: Found an estimated cost of 10 for instruction: %f64 = call { double, double } @llvm.sincos.f64(double poison) ; CHECK-VECLIB: Cost Model: Found an estimated cost of 10 for instruction: %f128 = call { fp128, fp128 } @llvm.sincos.f128(fp128 poison) -; ; CHECK-VECLIB: Cost Model: Found an estimated cost of 36 for instruction: %v8f16 = call { <8 x half>, <8 x half> } @llvm.sincos.v8f16(<8 x half> poison) ; CHECK-VECLIB: Cost Model: Found an estimated cost of 12 for instruction: %v4f32 = call { <4 x float>, <4 x float> } @llvm.sincos.v4f32(<4 x float> poison) ; CHECK-VECLIB: Cost Model: Found an estimated cost of 12 for instruction: %v2f64 = call { <2 x double>, <2 x double> } @llvm.sincos.v2f64(<2 x double> poison) ; CHECK-VECLIB: Cost Model: Found an estimated cost of 10 for instruction: %v1f128 = call { <1 x fp128>, <1 x fp128> } @llvm.sincos.v1f128(<1 x fp128> poison) ; CHECK-VECLIB: Cost Model: Found an estimated cost of 104 for instruction: %v8f32 = call { <8 x float>, <8 x float> } @llvm.sincos.v8f32(<8 x float> poison) -; ; CHECK-VECLIB: Cost Model: Invalid cost for instruction: %nxv8f16 = call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.sincos.nxv8f16(<vscale x 8 x half> poison) ; CHECK-VECLIB: Cost Model: Found an estimated cost of 13 for instruction: %nxv4f32 = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.sincos.nxv4f32(<vscale x 4 x float> poison) ; CHECK-VECLIB: Cost Model: Found an estimated cost of 13 for instruction: %nxv2f64 = call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.sincos.nxv2f64(<vscale x 2 x double> poison) ; CHECK-VECLIB: Cost Model: Invalid cost for instruction: %nxv1f128 = call { <vscale x 1 x fp128>, <vscale x 1 x fp128> } @llvm.sincos.nxv1f128(<vscale x 1 x fp128> poison) ; CHECK-VECLIB: Cost Model: Invalid cost for instruction: %nxv8f32 = call { <vscale x 8 x float>, <vscale x 8 x float> } @llvm.sincos.nxv8f32(<vscale x 8 x float> poison) +; +; SINCOS_STRET-LABEL: 'sincos' +; SINCOS_STRET: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call { half, half } @llvm.sincos.f16(half poison) +; SINCOS_STRET: Cost Model: Found an estimated cost of 2 for instruction: %f32 = call { float, float } @llvm.sincos.f32(float poison) +; SINCOS_STRET: Cost Model: Found an estimated cost of 2 for instruction: %f64 = call { double, double } @llvm.sincos.f64(double poison) +; SINCOS_STRET: Cost Model: Found an estimated cost of 10 for instruction: %f128 = call { fp128, fp128 } @llvm.sincos.f128(fp128 poison) +; SINCOS_STRET: Cost Model: Found an estimated cost of 36 for instruction: %v8f16 = call { <8 x half>, <8 x half> } @llvm.sincos.v8f16(<8 x half> poison) +; SINCOS_STRET: Cost Model: Found an estimated cost of 20 for instruction: %v4f32 = call { <4 x float>, <4 x float> } @llvm.sincos.v4f32(<4 x float> poison) +; SINCOS_STRET: Cost Model: Found an estimated cost of 8 for instruction: %v2f64 = call { <2 x double>, <2 x double> } @llvm.sincos.v2f64(<2 x double> poison) +; SINCOS_STRET: Cost Model: Found an estimated cost of 10 for instruction: %v1f128 = call { <1 x fp128>, <1 x 
fp128> } @llvm.sincos.v1f128(<1 x fp128> poison) +; SINCOS_STRET: Cost Model: Found an estimated cost of 40 for instruction: %v8f32 = call { <8 x float>, <8 x float> } @llvm.sincos.v8f32(<8 x float> poison) +; SINCOS_STRET: Cost Model: Invalid cost for instruction: %nxv8f16 = call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.sincos.nxv8f16(<vscale x 8 x half> poison) +; SINCOS_STRET: Cost Model: Invalid cost for instruction: %nxv4f32 = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.sincos.nxv4f32(<vscale x 4 x float> poison) +; SINCOS_STRET: Cost Model: Invalid cost for instruction: %nxv2f64 = call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.sincos.nxv2f64(<vscale x 2 x double> poison) +; SINCOS_STRET: Cost Model: Invalid cost for instruction: %nxv1f128 = call { <vscale x 1 x fp128>, <vscale x 1 x fp128> } @llvm.sincos.nxv1f128(<vscale x 1 x fp128> poison) +; SINCOS_STRET: Cost Model: Invalid cost for instruction: %nxv8f32 = call { <vscale x 8 x float>, <vscale x 8 x float> } @llvm.sincos.nxv8f32(<vscale x 8 x float> poison) ; %f16 = call { half, half } @llvm.sincos.f16(half poison) %f32 = call { float, float } @llvm.sincos.f32(float poison) From 831e79adff4506a0b22a770dcaa46bf5a37257cb Mon Sep 17 00:00:00 2001 From: Matt Arsenault <Matthew.Arsenault@amd.com> Date: Tue, 4 Nov 2025 10:20:00 -0800 Subject: [PATCH 209/313] DAG: Merge all sincos_stret emission code into legalizer (#166295) This avoids AArch64 legality rules depending on libcall availability. ARM, AArch64, and X86 all had custom lowering of fsincos which all were just to emit calls to sincos_stret / sincosf_stret. This messes with the cost heuristics around legality, because really it's an expand/libcall cost and not a favorable custom. This is a bit ugly, because we're emitting code trying to match the C ABI lowered IR type for the aggregate return type. This now also gives an easy way to lift the unhandled x86_32 darwin case, since ARM already handled the return as sret case. --- llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp | 109 ++++++++++++++++++ .../Target/AArch64/AArch64ISelLowering.cpp | 43 +------ llvm/lib/Target/AArch64/AArch64ISelLowering.h | 1 - llvm/lib/Target/ARM/ARMISelLowering.cpp | 78 +------------ llvm/lib/Target/ARM/ARMISelLowering.h | 1 - llvm/lib/Target/X86/X86ISelLowering.cpp | 60 +--------- 6 files changed, 118 insertions(+), 174 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp index 431a81002074f..316aacdf6978e 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -163,6 +163,8 @@ class SelectionDAGLegalize { RTLIB::Libcall CallI128); void ExpandDivRemLibCall(SDNode *Node, SmallVectorImpl<SDValue> &Results); + SDValue ExpandSincosStretLibCall(SDNode *Node) const; + SDValue EmitStackConvert(SDValue SrcOp, EVT SlotVT, EVT DestVT, const SDLoc &dl); SDValue EmitStackConvert(SDValue SrcOp, EVT SlotVT, EVT DestVT, @@ -2423,6 +2425,101 @@ static bool useSinCos(SDNode *Node) { return false; } +SDValue SelectionDAGLegalize::ExpandSincosStretLibCall(SDNode *Node) const { + // For iOS, we want to call an alternative entry point: __sincos_stret, + // which returns the values in two S / D registers. 
+ SDLoc dl(Node); + SDValue Arg = Node->getOperand(0); + EVT ArgVT = Arg.getValueType(); + RTLIB::Libcall LC = RTLIB::getSINCOS_STRET(ArgVT); + RTLIB::LibcallImpl SincosStret = TLI.getLibcallImpl(LC); + if (SincosStret == RTLIB::Unsupported) + return SDValue(); + + /// There are 3 different ABI cases to handle: + /// - Direct return of separate fields in registers + /// - Single return as vector elements + /// - sret struct + + const RTLIB::RuntimeLibcallsInfo &CallsInfo = TLI.getRuntimeLibcallsInfo(); + + const DataLayout &DL = DAG.getDataLayout(); + + auto [FuncTy, FuncAttrs] = CallsInfo.getFunctionTy( + *DAG.getContext(), TM.getTargetTriple(), DL, SincosStret); + + Type *SincosStretRetTy = FuncTy->getReturnType(); + CallingConv::ID CallConv = CallsInfo.getLibcallImplCallingConv(SincosStret); + StringRef LibcallImplName = CallsInfo.getLibcallImplName(SincosStret); + + SDValue Callee = DAG.getExternalSymbol(LibcallImplName.data(), + TLI.getProgramPointerTy(DL)); + + TargetLowering::ArgListTy Args; + SDValue SRet; + + int FrameIdx; + if (FuncTy->getParamType(0)->isPointerTy()) { + // Uses sret + MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); + + AttributeSet PtrAttrs = FuncAttrs.getParamAttrs(0); + Type *StructTy = PtrAttrs.getStructRetType(); + const uint64_t ByteSize = DL.getTypeAllocSize(StructTy); + const Align StackAlign = DL.getPrefTypeAlign(StructTy); + + FrameIdx = MFI.CreateStackObject(ByteSize, StackAlign, false); + SRet = DAG.getFrameIndex(FrameIdx, TLI.getFrameIndexTy(DL)); + + TargetLowering::ArgListEntry Entry(SRet, FuncTy->getParamType(0)); + Entry.IsSRet = true; + Entry.IndirectType = StructTy; + Entry.Alignment = StackAlign; + + Args.push_back(Entry); + Args.emplace_back(Arg, FuncTy->getParamType(1)); + } else { + Args.emplace_back(Arg, FuncTy->getParamType(0)); + } + + TargetLowering::CallLoweringInfo CLI(DAG); + CLI.setDebugLoc(dl) + .setChain(DAG.getEntryNode()) + .setLibCallee(CallConv, SincosStretRetTy, Callee, std::move(Args)) + .setIsPostTypeLegalization(); + + std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI); + + if (SRet) { + MachinePointerInfo PtrInfo = + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx); + SDValue LoadSin = DAG.getLoad(ArgVT, dl, CallResult.second, SRet, PtrInfo); + + TypeSize StoreSize = ArgVT.getStoreSize(); + + // Address of cos field. 
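+  // (Editorial note: the sret buffer is laid out as { sin, cos }, so the
+  // cos value sits StoreSize bytes past the base of the stack slot.)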
+ SDValue Add = DAG.getObjectPtrOffset(dl, SRet, StoreSize); + SDValue LoadCos = DAG.getLoad(ArgVT, dl, LoadSin.getValue(1), Add, + PtrInfo.getWithOffset(StoreSize)); + + SDVTList Tys = DAG.getVTList(ArgVT, ArgVT); + return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, LoadSin.getValue(0), + LoadCos.getValue(0)); + } + + if (!CallResult.first.getValueType().isVector()) + return CallResult.first; + + SDValue SinVal = + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT, CallResult.first, + DAG.getVectorIdxConstant(0, dl)); + SDValue CosVal = + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT, CallResult.first, + DAG.getVectorIdxConstant(1, dl)); + SDVTList Tys = DAG.getVTList(ArgVT, ArgVT); + return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal); +} + SDValue SelectionDAGLegalize::expandLdexp(SDNode *Node) const { SDLoc dl(Node); EVT VT = Node->getValueType(0); @@ -4730,6 +4827,18 @@ void SelectionDAGLegalize::ConvertNodeToLibcall(SDNode *Node) { case ISD::FSINCOS: case ISD::FSINCOSPI: { EVT VT = Node->getValueType(0); + + if (Node->getOpcode() == ISD::FSINCOS) { + RTLIB::Libcall SincosStret = RTLIB::getSINCOS_STRET(VT); + if (SincosStret != RTLIB::UNKNOWN_LIBCALL) { + if (SDValue Expanded = ExpandSincosStretLibCall(Node)) { + Results.push_back(Expanded); + Results.push_back(Expanded.getValue(1)); + break; + } + } + } + RTLIB::Libcall LC = Node->getOpcode() == ISD::FSINCOS ? RTLIB::getSINCOS(VT) : RTLIB::getSINCOSPI(VT); diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 60aa61e993b26..d08f9b94227a2 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -1052,15 +1052,9 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, // Lower READCYCLECOUNTER using an mrs from CNTVCT_EL0. setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal); - if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr && - getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) { - // Issue __sincos_stret if available. - setOperationAction(ISD::FSINCOS, MVT::f64, Custom); - setOperationAction(ISD::FSINCOS, MVT::f32, Custom); - } else { - setOperationAction(ISD::FSINCOS, MVT::f64, Expand); - setOperationAction(ISD::FSINCOS, MVT::f32, Expand); - } + // Issue __sincos_stret if available. + setOperationAction(ISD::FSINCOS, MVT::f64, Expand); + setOperationAction(ISD::FSINCOS, MVT::f32, Expand); // Make floating-point constants legal for the large code model, so they don't // become loads from the constant pool. @@ -5346,35 +5340,6 @@ SDValue AArch64TargetLowering::LowerINT_TO_FP(SDValue Op, return SDValue(); } -SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op, - SelectionDAG &DAG) const { - // For iOS, we want to call an alternative entry point: __sincos_stret, - // which returns the values in two S / D registers. - SDLoc DL(Op); - SDValue Arg = Op.getOperand(0); - EVT ArgVT = Arg.getValueType(); - Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); - - ArgListTy Args; - Args.emplace_back(Arg, ArgTy); - - RTLIB::Libcall LC = ArgVT == MVT::f64 ? 
RTLIB::SINCOS_STRET_F64 - : RTLIB::SINCOS_STRET_F32; - const char *LibcallName = getLibcallName(LC); - SDValue Callee = - DAG.getExternalSymbol(LibcallName, getPointerTy(DAG.getDataLayout())); - - StructType *RetTy = StructType::get(ArgTy, ArgTy); - TargetLowering::CallLoweringInfo CLI(DAG); - CallingConv::ID CC = getLibcallCallingConv(LC); - CLI.setDebugLoc(DL) - .setChain(DAG.getEntryNode()) - .setLibCallee(CC, RetTy, Callee, std::move(Args)); - - std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI); - return CallResult.first; -} - static MVT getSVEContainerType(EVT ContentTy); SDValue @@ -7723,8 +7688,6 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op, case ISD::FP_TO_SINT_SAT: case ISD::FP_TO_UINT_SAT: return LowerFP_TO_INT_SAT(Op, DAG); - case ISD::FSINCOS: - return LowerFSINCOS(Op, DAG); case ISD::GET_ROUNDING: return LowerGET_ROUNDING(Op, DAG); case ISD::SET_ROUNDING: diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h index 2cb8ed29f252a..70bfae717fb76 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -745,7 +745,6 @@ class AArch64TargetLowering : public TargetLowering { SDValue LowerVectorOR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerXOR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const; SDValue LowerLOOP_DEPENDENCE_MASK(SDValue Op, SelectionDAG &DAG) const; SDValue LowerBITCAST(SDValue Op, SelectionDAG &DAG) const; SDValue LowerVSCALE(SDValue Op, SelectionDAG &DAG) const; diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index 6b0653457cbaf..92fae71121a81 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -1312,8 +1312,8 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM_, setOperationAction(ISD::STRICT_FSETCCS, MVT::f64, Custom); } - setOperationAction(ISD::FSINCOS, MVT::f64, Custom); - setOperationAction(ISD::FSINCOS, MVT::f32, Custom); + setOperationAction(ISD::FSINCOS, MVT::f64, Expand); + setOperationAction(ISD::FSINCOS, MVT::f32, Expand); // FP-ARMv8 implements a lot of rounding-like FP operations. if (Subtarget->hasFPARMv8Base()) { @@ -9855,76 +9855,6 @@ static SDValue LowerUADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG) { return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Result, Carry); } -SDValue ARMTargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const { - // For iOS, we want to call an alternative entry point: __sincos_stret, - // return values are passed via sret. - SDLoc dl(Op); - SDValue Arg = Op.getOperand(0); - EVT ArgVT = Arg.getValueType(); - RTLIB::Libcall LC = RTLIB::getSINCOS_STRET(ArgVT); - RTLIB::LibcallImpl SincosStret = getLibcallImpl(LC); - if (SincosStret == RTLIB::Unsupported) - return SDValue(); - - assert(Subtarget->isTargetDarwin()); - - Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); - auto PtrVT = getPointerTy(DAG.getDataLayout()); - - MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); - - // Pair of floats / doubles used to pass the result. - Type *RetTy = StructType::get(ArgTy, ArgTy); - auto &DL = DAG.getDataLayout(); - - ArgListTy Args; - bool ShouldUseSRet = getTM().isAPCS_ABI(); - SDValue SRet; - if (ShouldUseSRet) { - // Create stack object for sret. 
- const uint64_t ByteSize = DL.getTypeAllocSize(RetTy); - const Align StackAlign = DL.getPrefTypeAlign(RetTy); - int FrameIdx = MFI.CreateStackObject(ByteSize, StackAlign, false); - SRet = DAG.getFrameIndex(FrameIdx, getPointerTy(DL)); - - ArgListEntry Entry(SRet, PointerType::getUnqual(RetTy->getContext())); - Entry.IsSExt = false; - Entry.IsZExt = false; - Entry.IsSRet = true; - Args.push_back(Entry); - RetTy = Type::getVoidTy(*DAG.getContext()); - } - - Args.emplace_back(Arg, ArgTy); - - StringRef LibcallName = getLibcallImplName(SincosStret); - CallingConv::ID CC = getLibcallImplCallingConv(SincosStret); - SDValue Callee = DAG.getExternalSymbol(LibcallName.data(), getPointerTy(DL)); - - TargetLowering::CallLoweringInfo CLI(DAG); - CLI.setDebugLoc(dl) - .setChain(DAG.getEntryNode()) - .setCallee(CC, RetTy, Callee, std::move(Args)) - .setDiscardResult(ShouldUseSRet); - std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI); - - if (!ShouldUseSRet) - return CallResult.first; - - SDValue LoadSin = - DAG.getLoad(ArgVT, dl, CallResult.second, SRet, MachinePointerInfo()); - - // Address of cos field. - SDValue Add = DAG.getNode(ISD::ADD, dl, PtrVT, SRet, - DAG.getIntPtrConstant(ArgVT.getStoreSize(), dl)); - SDValue LoadCos = - DAG.getLoad(ArgVT, dl, LoadSin.getValue(1), Add, MachinePointerInfo()); - - SDVTList Tys = DAG.getVTList(ArgVT, ArgVT); - return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, - LoadSin.getValue(0), LoadCos.getValue(0)); -} - SDValue ARMTargetLowering::LowerWindowsDIVLibCall(SDValue Op, SelectionDAG &DAG, bool Signed, SDValue &Chain) const { @@ -10726,8 +10656,8 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::VECREDUCE_SMAX: return LowerVecReduceMinMax(Op, DAG, Subtarget); case ISD::ATOMIC_LOAD: - case ISD::ATOMIC_STORE: return LowerAtomicLoadStore(Op, DAG); - case ISD::FSINCOS: return LowerFSINCOS(Op, DAG); + case ISD::ATOMIC_STORE: + return LowerAtomicLoadStore(Op, DAG); case ISD::SDIVREM: case ISD::UDIVREM: return LowerDivRem(Op, DAG); case ISD::DYNAMIC_STACKALLOC: diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h index bf3438b0d8803..bc2fec3c1bdb5 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.h +++ b/llvm/lib/Target/ARM/ARMISelLowering.h @@ -901,7 +901,6 @@ class VectorType; SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST) const; SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const; SDValue LowerDivRem(SDValue Op, SelectionDAG &DAG) const; SDValue LowerDIV_Windows(SDValue Op, SelectionDAG &DAG, bool Signed) const; void ExpandDIV_Windows(SDValue Op, SelectionDAG &DAG, bool Signed, diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 6edf0185df813..da2556978b39d 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -2572,8 +2572,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, } // Combine sin / cos into _sincos_stret if it is available. 
- setOperationAction(ISD::FSINCOS, MVT::f64, Custom); - setOperationAction(ISD::FSINCOS, MVT::f32, Custom); + setOperationAction(ISD::FSINCOS, MVT::f64, Expand); + setOperationAction(ISD::FSINCOS, MVT::f32, Expand); if (Subtarget.isTargetWin64()) { setOperationAction(ISD::SDIV, MVT::i128, Custom); @@ -33004,61 +33004,6 @@ static SDValue LowerADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG) { return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC); } -static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget, - SelectionDAG &DAG) { - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - SDValue Arg = Op.getOperand(0); - EVT ArgVT = Arg.getValueType(); - bool isF64 = ArgVT == MVT::f64; - - RTLIB::Libcall LC = isF64 ? RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32; - const char *LibcallName = TLI.getLibcallName(LC); - if (!LibcallName) - return SDValue(); - - assert(Subtarget.isTargetDarwin() && Subtarget.is64Bit()); - - // For MacOSX, we want to call an alternative entry point: __sincos_stret, - // which returns the values as { float, float } (in XMM0) or - // { double, double } (which is returned in XMM0, XMM1). - SDLoc dl(Op); - Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); - - TargetLowering::ArgListTy Args; - Args.emplace_back(Arg, ArgTy); - - // Only optimize x86_64 for now. i386 is a bit messy. For f32, - // the small struct {f32, f32} is returned in (eax, edx). For f64, - // the results are returned via SRet in memory. - SDValue Callee = - DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout())); - - Type *RetTy = isF64 ? (Type *)StructType::get(ArgTy, ArgTy) - : (Type *)FixedVectorType::get(ArgTy, 2); - - TargetLowering::CallLoweringInfo CLI(DAG); - CLI.setDebugLoc(dl) - .setChain(DAG.getEntryNode()) - .setLibCallee(CallingConv::C, RetTy, Callee, std::move(Args)) - .setIsPostTypeLegalization(); - - std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI); - - if (isF64) - // Returned in xmm0 and xmm1. - return CallResult.first; - - // Returned in bits 0:31 and 32:64 xmm0. - SDValue SinVal = - DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT, CallResult.first, - DAG.getVectorIdxConstant(0, dl)); - SDValue CosVal = - DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT, CallResult.first, - DAG.getVectorIdxConstant(1, dl)); - SDVTList Tys = DAG.getVTList(ArgVT, ArgVT); - return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal); -} - /// Widen a vector input to a vector of NVT. The /// input vector must have the same element type as NVT. 
static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG, @@ -33663,7 +33608,6 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::ABDS: case ISD::ABDU: return LowerABD(Op, Subtarget, DAG); case ISD::AVGCEILU: return LowerAVG(Op, Subtarget, DAG); - case ISD::FSINCOS: return LowerFSINCOS(Op, Subtarget, DAG); case ISD::MLOAD: return LowerMLOAD(Op, Subtarget, DAG); case ISD::MSTORE: return LowerMSTORE(Op, Subtarget, DAG); case ISD::MGATHER: return LowerMGATHER(Op, Subtarget, DAG); From 2e8543c73555e26bba1c2dffab18288f33916b5d Mon Sep 17 00:00:00 2001 From: Sp00ph <61327188+Sp00ph@users.noreply.github.com> Date: Tue, 4 Nov 2025 19:32:55 +0100 Subject: [PATCH 210/313] [X86] Improve variable 8-bit shifts on AVX512BW (#164136) Previously, `clang -march=znver5 -O3` would emit the following for `shl`, `lshr` and `ashr <64 x i8>`: ```asm .LCPI0_2: .byte 8 .byte 4 .byte 2 .byte 1 .byte 0 .byte 0 .byte 0 .byte 0 .LCPI0_3: .byte 32 .byte 16 .byte 8 .byte 4 .byte 2 .byte 1 .byte 0 .byte 0 shl: vpsllw zmm1, zmm1, 5 vpmovb2m k1, zmm1 vpaddb zmm1, zmm1, zmm1 vgf2p8affineqb zmm0 {k1}, zmm0, qword ptr [rip + .LCPI0_2]{1to8}, 0 vpmovb2m k1, zmm1 vpaddb zmm1, zmm1, zmm1 vgf2p8affineqb zmm0 {k1}, zmm0, qword ptr [rip + .LCPI0_3]{1to8}, 0 vpmovb2m k1, zmm1 vpaddb zmm0 {k1}, zmm0, zmm0 ret .LCPI1_3: .byte 0 .byte 0 .byte 0 .byte 0 .byte 128 .byte 64 .byte 32 .byte 16 .LCPI1_4: .byte 0 .byte 0 .byte 128 .byte 64 .byte 32 .byte 16 .byte 8 .byte 4 .LCPI1_5: .byte 0 .byte 128 .byte 64 .byte 32 .byte 16 .byte 8 .byte 4 .byte 2 lshr: vpsllw zmm1, zmm1, 5 vpmovb2m k1, zmm1 vpaddb zmm1, zmm1, zmm1 vgf2p8affineqb zmm0 {k1}, zmm0, qword ptr [rip + .LCPI1_3]{1to8}, 0 vpmovb2m k1, zmm1 vpaddb zmm1, zmm1, zmm1 vgf2p8affineqb zmm0 {k1}, zmm0, qword ptr [rip + .LCPI1_4]{1to8}, 0 vpmovb2m k1, zmm1 vgf2p8affineqb zmm0 {k1}, zmm0, qword ptr [rip + .LCPI1_5]{1to8}, 0 ret ashr: vpsllw zmm1, zmm1, 5 vpunpckhbw zmm2, zmm0, zmm0 vpunpckhbw zmm4, zmm1, zmm1 vpsraw zmm3, zmm2, 4 vpunpcklbw zmm0, zmm0, zmm0 vpmovb2m k1, zmm4 vpaddw zmm4, zmm4, zmm4 vpunpcklbw zmm1, zmm1, zmm1 vmovdqu8 zmm2 {k1}, zmm3 vpmovb2m k1, zmm4 vpsraw zmm3, zmm2, 2 vpaddw zmm4, zmm4, zmm4 vmovdqu8 zmm2 {k1}, zmm3 vpsraw zmm3, zmm2, 1 vpmovb2m k1, zmm4 vmovdqu8 zmm2 {k1}, zmm3 vpmovb2m k1, zmm1 vpsraw zmm3, zmm0, 4 vpaddw zmm1, zmm1, zmm1 vpsrlw zmm2, zmm2, 8 vmovdqu8 zmm0 {k1}, zmm3 vpmovb2m k1, zmm1 vpsraw zmm3, zmm0, 2 vpaddw zmm1, zmm1, zmm1 vmovdqu8 zmm0 {k1}, zmm3 vpsraw zmm3, zmm0, 1 vpmovb2m k1, zmm1 vmovdqu8 zmm0 {k1}, zmm3 vpsrlw zmm0, zmm0, 8 vpackuswb zmm0, zmm0, zmm2 ret ``` With this commit, the generated assembly becomes this: ```asm .LCPI0_2: .byte 0 .byte 255 .byte 0 .byte 255 .LCPI0_3: .byte 255 .byte 0 .byte 255 .byte 0 shl: vpsrlw zmm2, zmm1, 8 vpandd zmm3, zmm0, dword ptr [rip + .LCPI0_2]{1to16} vpandd zmm1, zmm1, dword ptr [rip + .LCPI0_3]{1to16} movabs rax, -6148914691236517206 kmovq k1, rax vpsllvw zmm2, zmm3, zmm2 vpsllvw zmm0, zmm0, zmm1 vmovdqu8 zmm0 {k1}, zmm2 ret .LCPI1_0: .byte 255 .byte 0 lshr: vpbroadcastw zmm2, word ptr [rip + .LCPI1_0] movabs rax, -6148914691236517206 kmovq k1, rax vpandq zmm3, zmm1, zmm2 vpandq zmm2, zmm0, zmm2 vpsrlw zmm1, zmm1, 8 vpsrlvw zmm2, zmm2, zmm3 vpsrlvw zmm0, zmm0, zmm1 vmovdqu8 zmm2 {k1}, zmm0 vmovdqa64 zmm0, zmm2 ret .LCPI2_1: .byte 255 .byte 0 .byte 255 .byte 0 ashr: vpsrlw zmm2, zmm1, 8 vpandd zmm1, zmm1, dword ptr [rip + .LCPI2_1]{1to16} movabs rax, -6148914691236517206 vpsravw zmm2, zmm0, zmm2 vpsllw zmm0, zmm0, 8 kmovq k1, rax vpsraw zmm0, 
zmm0, 8 vpsravw zmm0, zmm0, zmm1 vmovdqu8 zmm0 {k1}, zmm2 ret ``` While I don't have AVX512 hardware, llvm-mca suggests significant speedups, and I've done some simple correctness tests on random inputs using the Intel Software Development Emulator. --- llvm/lib/Target/X86/X86ISelLowering.cpp | 57 ++++++++++++++ llvm/test/CodeGen/X86/gfni-shifts.ll | 75 +++++++------------ .../test/CodeGen/X86/vector-shift-ashr-512.ll | 40 +++------- .../test/CodeGen/X86/vector-shift-lshr-512.ll | 22 ++---- llvm/test/CodeGen/X86/vector-shift-shl-512.ll | 19 ++--- 5 files changed, 109 insertions(+), 104 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index da2556978b39d..06b8f7614bffd 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -30908,6 +30908,63 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget, return DAG.getNode(X86ISD::PACKUS, dl, VT, LoR, HiR); } + if (VT == MVT::v64i8 && Subtarget.canExtendTo512BW()) { + // On AVX512BW, we can use variable 16-bit shifts to implement variable + // 8-bit shifts. For this, we split the input into two vectors, RLo and RHi. + // The i-th lane of RLo contains the (2*i)-th lane of R, and the i-th lane + // of RHi contains the (2*i+1)-th lane of R. After shifting, these vectors + // can efficiently be merged together using a masked move. + MVT ExtVT = MVT::v32i16; + + SDValue RLo, RHi; + // Isolate lower and upper lanes of Amt by masking odd lanes in AmtLo and + // right shifting AmtHi. + SDValue AmtLo = DAG.getNode(ISD::AND, dl, ExtVT, DAG.getBitcast(ExtVT, Amt), + DAG.getConstant(0x00ff, dl, ExtVT)); + SDValue AmtHi = getTargetVShiftByConstNode( + X86ISD::VSRLI, dl, ExtVT, DAG.getBitcast(ExtVT, Amt), 8, DAG); + switch (Opc) { + case ISD::SHL: + // Because we shift left, no bits from the high half can influence the low + // half, so we don't need to mask RLo. We do however need to mask RHi, to + // prevent high bits of an even lane overflowing into low bits of an odd + // lane. + RLo = DAG.getBitcast(ExtVT, R); + RHi = DAG.getNode(ISD::AND, dl, ExtVT, RLo, + DAG.getConstant(0xff00, dl, ExtVT)); + break; + case ISD::SRL: + // Same idea as above, but this time we need to make sure no low bits of + // an odd lane can overflow into high bits of an even lane. + RHi = DAG.getBitcast(ExtVT, R); + RLo = DAG.getNode(ISD::AND, dl, ExtVT, RHi, + DAG.getConstant(0x00ff, dl, ExtVT)); + break; + case ISD::SRA: + // For arithmetic right shifts, we want to sign extend each even lane of R + // such that the upper half of the corresponding lane of RLo is 0 or -1 + // depending on the sign bit of the original lane. We do this using 2 + // immediate shifts. + RHi = DAG.getBitcast(ExtVT, R); + RLo = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ExtVT, RHi, 8, DAG); + RLo = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExtVT, RLo, 8, DAG); + break; + default: + llvm_unreachable("Unexpected Shift Op"); + } + + SDValue ShiftedLo = + DAG.getBitcast(VT, DAG.getNode(Opc, dl, ExtVT, RLo, AmtLo)); + SDValue ShiftedHi = + DAG.getBitcast(VT, DAG.getNode(Opc, dl, ExtVT, RHi, AmtHi)); + + // To merge the shifted vectors back together, we select even lanes + // from ShiftedLo and odd lanes from ShiftedHi. 
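+  //
+  // (Editorial example: viewing R as bytes b0,b1,b2,..., each 16-bit lane
+  // holds the pair [b1:b0], [b3:b2], ...; ShiftedLo is correct in the even
+  // byte lanes and ShiftedHi in the odd ones. Bit i of 0x5555... is set
+  // exactly for even i, so the select takes byte i from ShiftedLo when i
+  // is even and from ShiftedHi otherwise.)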
+ SDValue SelectMask = DAG.getBitcast( + MVT::v64i1, DAG.getConstant(0x5555555555555555, dl, MVT::i64)); + return DAG.getSelect(dl, VT, SelectMask, ShiftedLo, ShiftedHi); + } + if (VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256() && !Subtarget.hasXOP()) || (VT == MVT::v64i8 && Subtarget.hasBWI())) { diff --git a/llvm/test/CodeGen/X86/gfni-shifts.ll b/llvm/test/CodeGen/X86/gfni-shifts.ll index feac3dcad243a..30f1874c51fed 100644 --- a/llvm/test/CodeGen/X86/gfni-shifts.ll +++ b/llvm/test/CodeGen/X86/gfni-shifts.ll @@ -1684,15 +1684,14 @@ define <64 x i8> @var_shl_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; ; GFNIAVX512BW-LABEL: var_shl_v64i8: ; GFNIAVX512BW: # %bb.0: -; GFNIAVX512BW-NEXT: vpsllw $5, %zmm1, %zmm1 -; GFNIAVX512BW-NEXT: vpmovb2m %zmm1, %k1 -; GFNIAVX512BW-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 {%k1} -; GFNIAVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1 -; GFNIAVX512BW-NEXT: vpmovb2m %zmm1, %k1 -; GFNIAVX512BW-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 {%k1} -; GFNIAVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1 -; GFNIAVX512BW-NEXT: vpmovb2m %zmm1, %k1 -; GFNIAVX512BW-NEXT: vpaddb %zmm0, %zmm0, %zmm0 {%k1} +; GFNIAVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm2 +; GFNIAVX512BW-NEXT: vpsllvw %zmm2, %zmm0, %zmm2 +; GFNIAVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1 +; GFNIAVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 +; GFNIAVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 +; GFNIAVX512BW-NEXT: movabsq $6148914691236517205, %rax # imm = 0x5555555555555555 +; GFNIAVX512BW-NEXT: kmovq %rax, %k1 +; GFNIAVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1} ; GFNIAVX512BW-NEXT: retq %shift = shl <64 x i8> %a, %b ret <64 x i8> %shift @@ -1876,15 +1875,15 @@ define <64 x i8> @var_lshr_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; ; GFNIAVX512BW-LABEL: var_lshr_v64i8: ; GFNIAVX512BW: # %bb.0: -; GFNIAVX512BW-NEXT: vpsllw $5, %zmm1, %zmm1 -; GFNIAVX512BW-NEXT: vpmovb2m %zmm1, %k1 -; GFNIAVX512BW-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 {%k1} -; GFNIAVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1 -; GFNIAVX512BW-NEXT: vpmovb2m %zmm1, %k1 -; GFNIAVX512BW-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 {%k1} -; GFNIAVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1 -; GFNIAVX512BW-NEXT: vpmovb2m %zmm1, %k1 -; GFNIAVX512BW-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 {%k1} +; GFNIAVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; GFNIAVX512BW-NEXT: vpandq %zmm2, %zmm1, %zmm3 +; GFNIAVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm2 +; GFNIAVX512BW-NEXT: vpsrlvw %zmm3, %zmm2, %zmm2 +; GFNIAVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1 +; GFNIAVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 +; GFNIAVX512BW-NEXT: movabsq $6148914691236517205, %rax # imm = 0x5555555555555555 +; GFNIAVX512BW-NEXT: kmovq %rax, %k1 +; GFNIAVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1} ; GFNIAVX512BW-NEXT: retq %shift = lshr <64 x i8> %a, %b ret <64 x i8> %shift @@ -2232,36 +2231,16 @@ define <64 x i8> @var_ashr_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; ; GFNIAVX512BW-LABEL: var_ashr_v64i8: ; GFNIAVX512BW: # %bb.0: -; GFNIAVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm2 = 
zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] -; GFNIAVX512BW-NEXT: vpsraw $4, %zmm2, %zmm3 -; GFNIAVX512BW-NEXT: vpsllw $5, %zmm1, %zmm1 -; GFNIAVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm4 = zmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] -; GFNIAVX512BW-NEXT: vpmovb2m %zmm4, %k1 -; GFNIAVX512BW-NEXT: vmovdqu8 %zmm3, %zmm2 {%k1} -; GFNIAVX512BW-NEXT: vpsraw $2, %zmm2, %zmm3 -; GFNIAVX512BW-NEXT: vpaddw %zmm4, %zmm4, %zmm5 -; GFNIAVX512BW-NEXT: vpmovb2m %zmm5, %k1 -; GFNIAVX512BW-NEXT: vmovdqu8 %zmm3, %zmm2 {%k1} -; GFNIAVX512BW-NEXT: vpsraw $1, %zmm2, %zmm3 -; GFNIAVX512BW-NEXT: vpsllw $2, %zmm4, %zmm4 -; GFNIAVX512BW-NEXT: vpmovb2m %zmm4, %k1 -; GFNIAVX512BW-NEXT: vmovdqu8 %zmm3, %zmm2 {%k1} -; GFNIAVX512BW-NEXT: vpsrlw $8, %zmm2, %zmm2 -; GFNIAVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] -; GFNIAVX512BW-NEXT: vpsraw $4, %zmm0, %zmm3 -; GFNIAVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] -; GFNIAVX512BW-NEXT: vpmovb2m %zmm1, %k1 -; GFNIAVX512BW-NEXT: vmovdqu8 %zmm3, %zmm0 {%k1} -; GFNIAVX512BW-NEXT: vpsraw $2, %zmm0, %zmm3 -; GFNIAVX512BW-NEXT: vpaddw %zmm1, %zmm1, %zmm4 -; GFNIAVX512BW-NEXT: vpmovb2m %zmm4, %k1 -; GFNIAVX512BW-NEXT: vmovdqu8 %zmm3, %zmm0 {%k1} -; GFNIAVX512BW-NEXT: vpsraw $1, %zmm0, %zmm3 -; GFNIAVX512BW-NEXT: vpsllw $2, %zmm1, %zmm1 -; GFNIAVX512BW-NEXT: vpmovb2m %zmm1, %k1 -; GFNIAVX512BW-NEXT: vmovdqu8 %zmm3, %zmm0 {%k1} -; GFNIAVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0 -; GFNIAVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 +; GFNIAVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm2 +; GFNIAVX512BW-NEXT: vpsravw %zmm2, %zmm0, %zmm2 +; GFNIAVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1 +; GFNIAVX512BW-NEXT: vpsllw $8, %zmm0, %zmm0 +; GFNIAVX512BW-NEXT: vpsraw $8, %zmm0, %zmm0 +; GFNIAVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0 +; GFNIAVX512BW-NEXT: movabsq $6148914691236517205, %rax # imm = 0x5555555555555555 +; GFNIAVX512BW-NEXT: kmovq %rax, %k1 +; GFNIAVX512BW-NEXT: vmovdqu8 %zmm0, %zmm2 {%k1} +; GFNIAVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 ; GFNIAVX512BW-NEXT: retq %shift = ashr <64 x i8> %a, %b ret <64 x i8> %shift diff --git a/llvm/test/CodeGen/X86/vector-shift-ashr-512.ll b/llvm/test/CodeGen/X86/vector-shift-ashr-512.ll index 0fb0420bb2609..aff2228c258b5 100644 --- a/llvm/test/CodeGen/X86/vector-shift-ashr-512.ll +++ b/llvm/test/CodeGen/X86/vector-shift-ashr-512.ll @@ -106,36 +106,16 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; ; AVX512BW-LABEL: var_shift_v64i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] -; AVX512BW-NEXT: vpsraw $4, %zmm2, %zmm3 -; AVX512BW-NEXT: vpsllw $5, %zmm1, %zmm1 -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm4 = 
zmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] -; AVX512BW-NEXT: vpmovb2m %zmm4, %k1 -; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm2 {%k1} -; AVX512BW-NEXT: vpsraw $2, %zmm2, %zmm3 -; AVX512BW-NEXT: vpaddw %zmm4, %zmm4, %zmm5 -; AVX512BW-NEXT: vpmovb2m %zmm5, %k1 -; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm2 {%k1} -; AVX512BW-NEXT: vpsraw $1, %zmm2, %zmm3 -; AVX512BW-NEXT: vpsllw $2, %zmm4, %zmm4 -; AVX512BW-NEXT: vpmovb2m %zmm4, %k1 -; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm2 {%k1} -; AVX512BW-NEXT: vpsrlw $8, %zmm2, %zmm2 -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] -; AVX512BW-NEXT: vpsraw $4, %zmm0, %zmm3 -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] -; AVX512BW-NEXT: vpmovb2m %zmm1, %k1 -; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm0 {%k1} -; AVX512BW-NEXT: vpsraw $2, %zmm0, %zmm3 -; AVX512BW-NEXT: vpaddw %zmm1, %zmm1, %zmm4 -; AVX512BW-NEXT: vpmovb2m %zmm4, %k1 -; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm0 {%k1} -; AVX512BW-NEXT: vpsraw $1, %zmm0, %zmm3 -; AVX512BW-NEXT: vpsllw $2, %zmm1, %zmm1 -; AVX512BW-NEXT: vpmovb2m %zmm1, %k1 -; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm0 {%k1} -; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0 -; AVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm2 +; AVX512BW-NEXT: vpsravw %zmm2, %zmm0, %zmm2 +; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1 +; AVX512BW-NEXT: vpsllw $8, %zmm0, %zmm0 +; AVX512BW-NEXT: vpsraw $8, %zmm0, %zmm0 +; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: movabsq $6148914691236517205, %rax # imm = 0x5555555555555555 +; AVX512BW-NEXT: kmovq %rax, %k1 +; AVX512BW-NEXT: vmovdqu8 %zmm0, %zmm2 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 ; AVX512BW-NEXT: retq %shift = ashr <64 x i8> %a, %b ret <64 x i8> %shift diff --git a/llvm/test/CodeGen/X86/vector-shift-lshr-512.ll b/llvm/test/CodeGen/X86/vector-shift-lshr-512.ll index 103d5702fb93a..4450d07e01cca 100644 --- a/llvm/test/CodeGen/X86/vector-shift-lshr-512.ll +++ b/llvm/test/CodeGen/X86/vector-shift-lshr-512.ll @@ -85,20 +85,14 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; ; AVX512BW-LABEL: var_shift_v64i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm2 -; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2 -; AVX512BW-NEXT: vpsllw $5, %zmm1, %zmm1 -; AVX512BW-NEXT: vpmovb2m %zmm1, %k1 -; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1} -; AVX512BW-NEXT: vpsrlw $2, %zmm0, %zmm2 -; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2 -; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1 -; AVX512BW-NEXT: vpmovb2m %zmm1, %k1 -; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1} -; AVX512BW-NEXT: vpsrlw $1, %zmm0, %zmm2 -; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2 -; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1 -; AVX512BW-NEXT: vpmovb2m %zmm1, %k1 +; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512BW-NEXT: vpandq %zmm2, 
%zmm1, %zmm3
+; AVX512BW-NEXT:    vpandq %zmm2, %zmm0, %zmm2
+; AVX512BW-NEXT:    vpsrlvw %zmm3, %zmm2, %zmm2
+; AVX512BW-NEXT:    vpsrlw $8, %zmm1, %zmm1
+; AVX512BW-NEXT:    vpsrlvw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    movabsq $6148914691236517205, %rax # imm = 0x5555555555555555
+; AVX512BW-NEXT:    kmovq %rax, %k1
 ; AVX512BW-NEXT:    vmovdqu8 %zmm2, %zmm0 {%k1}
 ; AVX512BW-NEXT:    retq
   %shift = lshr <64 x i8> %a, %b
diff --git a/llvm/test/CodeGen/X86/vector-shift-shl-512.ll b/llvm/test/CodeGen/X86/vector-shift-shl-512.ll
index efd742956ed09..41238acc4b74d 100644
--- a/llvm/test/CodeGen/X86/vector-shift-shl-512.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-shl-512.ll
@@ -82,19 +82,14 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
 ;
 ; AVX512BW-LABEL: var_shift_v64i8:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vpsllw $4, %zmm0, %zmm2
-; AVX512BW-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2
-; AVX512BW-NEXT:    vpsllw $5, %zmm1, %zmm1
-; AVX512BW-NEXT:    vpmovb2m %zmm1, %k1
-; AVX512BW-NEXT:    vmovdqu8 %zmm2, %zmm0 {%k1}
-; AVX512BW-NEXT:    vpsllw $2, %zmm0, %zmm2
-; AVX512BW-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2
-; AVX512BW-NEXT:    vpaddb %zmm1, %zmm1, %zmm1
-; AVX512BW-NEXT:    vpmovb2m %zmm1, %k1
+; AVX512BW-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm2
+; AVX512BW-NEXT:    vpsllvw %zmm2, %zmm0, %zmm2
+; AVX512BW-NEXT:    vpsrlw $8, %zmm1, %zmm1
+; AVX512BW-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpsllvw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    movabsq $6148914691236517205, %rax # imm = 0x5555555555555555
+; AVX512BW-NEXT:    kmovq %rax, %k1
 ; AVX512BW-NEXT:    vmovdqu8 %zmm2, %zmm0 {%k1}
-; AVX512BW-NEXT:    vpaddb %zmm1, %zmm1, %zmm1
-; AVX512BW-NEXT:    vpmovb2m %zmm1, %k1
-; AVX512BW-NEXT:    vpaddb %zmm0, %zmm0, %zmm0 {%k1}
 ; AVX512BW-NEXT:    retq
   %shift = shl <64 x i8> %a, %b
   ret <64 x i8> %shift

From 290ff955f07f44b5a9e0a03d405c60f794eb2e98 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo@fhahn.com>
Date: Tue, 4 Nov 2025 18:34:13 +0000
Subject: [PATCH 211/313] [VPlan] Verify incoming values of VPIRPhi match before checking (NFC)

Update the verifier to first check if the number of incoming values
matches the number of predecessors, before using
incoming_values_and_blocks. We unfortunately also need to check here, as
this may be called before verifyPhiRecipes runs. Also update the
verifier unit tests to actually fail for the expected recipes.
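A minimal sketch of the invariant being enforced (editorial; it reuses the
accessors that appear in the diff below rather than defining new API, and
`countsMatch` is a hypothetical helper name):

```cpp
// A phi-like VPIRPhi must receive exactly one incoming value per
// predecessor of its parent block; this must hold before any code zips
// operands with predecessors via incoming_values_and_blocks().
static bool countsMatch(const VPRecipeBase &R) {
  return R.getNumOperands() == R.getParent()->getNumPredecessors();
}
```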
--- .../Transforms/Vectorize/VPlanVerifier.cpp | 7 +++ .../Vectorize/VPlanVerifierTest.cpp | 51 ++++++++++++++++--- 2 files changed, 51 insertions(+), 7 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp index 91734a10cb2c8..34754a1ea3992 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp @@ -252,6 +252,13 @@ bool VPlanVerifier::verifyVPBasicBlock(const VPBasicBlock *VPBB) { for (const VPUser *U : V->users()) { auto *UI = cast<VPRecipeBase>(U); + if (isa<VPIRPhi>(UI) && + UI->getNumOperands() != UI->getParent()->getNumPredecessors()) { + errs() << "Phi-like recipe with different number of operands and " + "predecessors.\n"; + return false; + } + if (auto *Phi = dyn_cast<VPPhiAccessors>(UI)) { for (const auto &[IncomingVPV, IncomingVPBB] : Phi->incoming_values_and_blocks()) { diff --git a/llvm/unittests/Transforms/Vectorize/VPlanVerifierTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanVerifierTest.cpp index 46802826fe090..169114ed6c310 100644 --- a/llvm/unittests/Transforms/Vectorize/VPlanVerifierTest.cpp +++ b/llvm/unittests/Transforms/Vectorize/VPlanVerifierTest.cpp @@ -326,22 +326,18 @@ TEST_F(VPVerifierTest, NonHeaderPHIInHeader) { class VPIRVerifierTest : public VPlanTestIRBase {}; -TEST_F(VPIRVerifierTest, testVerifyIRPhi) { +TEST_F(VPIRVerifierTest, testVerifyIRPhiInScalarHeaderVPIRBB) { const char *ModuleString = "define void @f(ptr %A, i64 %N) {\n" "entry:\n" " br label %loop\n" "loop:\n" " %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]\n" - " %arr.idx = getelementptr inbounds i32, ptr %A, i64 %iv\n" - " %l1 = load i32, ptr %arr.idx, align 4\n" - " %res = add i32 %l1, 10\n" - " store i32 %res, ptr %arr.idx, align 4\n" " %iv.next = add i64 %iv, 1\n" " %exitcond = icmp ne i64 %iv.next, %N\n" " br i1 %exitcond, label %loop, label %for.end\n" "for.end:\n" - " %p = phi i32 [ %l1, %loop ]\n" + " %p = phi i64 [ %iv, %loop ]\n" " ret void\n" "}\n"; @@ -351,7 +347,48 @@ TEST_F(VPIRVerifierTest, testVerifyIRPhi) { BasicBlock *LoopHeader = F->getEntryBlock().getSingleSuccessor(); auto Plan = buildVPlan(LoopHeader); - Plan->getExitBlocks()[0]->front().addOperand(Plan->getConstantInt(32, 0)); +#if GTEST_HAS_STREAM_REDIRECTION + ::testing::internal::CaptureStderr(); +#endif + EXPECT_FALSE(verifyVPlanIsValid(*Plan)); +#if GTEST_HAS_STREAM_REDIRECTION + EXPECT_STREQ( + "Phi-like recipe with different number of operands and predecessors.\n", + ::testing::internal::GetCapturedStderr().c_str()); +#endif +} + +TEST_F(VPIRVerifierTest, testVerifyIRPhiInExitVPIRBB) { + const char *ModuleString = + "define void @f(ptr %A, i64 %N) {\n" + "entry:\n" + " br label %loop\n" + "loop:\n" + " %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]\n" + " %iv.next = add i64 %iv, 1\n" + " %exitcond = icmp ne i64 %iv.next, %N\n" + " br i1 %exitcond, label %loop, label %for.end\n" + "for.end:\n" + " %p = phi i64 [ %iv, %loop ]\n" + " ret void\n" + "}\n"; + + Module &M = parseModule(ModuleString); + + Function *F = M.getFunction("f"); + BasicBlock *LoopHeader = F->getEntryBlock().getSingleSuccessor(); + auto Plan = buildVPlan(LoopHeader); + + // Create a definition in the vector loop header that will be used by the phi. 
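+  // (Editorial note: together with the extra constant operand added to the
+  // scalar-header phi below, this leaves operand counts that no longer
+  // match predecessor counts, so the new mismatch check should fire.)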
+ auto *HeaderBlock = + cast<VPBasicBlock>(Plan->getVectorLoopRegion()->getEntry()); + VPInstruction *DefI = + new VPInstruction(VPInstruction::ExtractLastElement, + {HeaderBlock->front().getVPSingleValue()}); + DefI->insertBefore(Plan->getMiddleBlock()->getTerminator()); + Plan->getExitBlocks()[0]->front().addOperand(DefI); + VPValue *Zero = Plan->getConstantInt(32, 0); + Plan->getScalarHeader()->front().addOperand(Zero); #if GTEST_HAS_STREAM_REDIRECTION ::testing::internal::CaptureStderr(); From e29ee270615f3c8e3c9a0a657c0be43f9d76f441 Mon Sep 17 00:00:00 2001 From: Sam Clegg <sbc@chromium.org> Date: Tue, 4 Nov 2025 10:40:46 -0800 Subject: [PATCH 212/313] [lld][WebAssembly] Allow `--no-stack-first` in addition to `--stack-first` (#166384) This paves the way to make `--stack-first` the default. See: #151015 --- lld/test/wasm/stack-first.test | 25 +++++++++++++++++++++++++ lld/wasm/Driver.cpp | 2 +- lld/wasm/Options.td | 5 +++-- 3 files changed, 29 insertions(+), 3 deletions(-) diff --git a/lld/test/wasm/stack-first.test b/lld/test/wasm/stack-first.test index 72e1a006d5700..46e8c6ebc2381 100644 --- a/lld/test/wasm/stack-first.test +++ b/lld/test/wasm/stack-first.test @@ -8,6 +8,15 @@ RUN: llvm-mc -filetype=obj -triple=wasm32-unknown-unknown %p/Inputs/stack-first. RUN: wasm-ld -z stack-size=512 --stack-first --export=__data_end --export=__heap_base --export=someByte -o %t.wasm %t.o RUN: obj2yaml %t.wasm | FileCheck %s +; Check `--no-stack-first` +RUN: wasm-ld -z stack-size=512 --stack-first --no-stack-first --export=__data_end --export=__heap_base --export=someByte -o %t.wasm %t.o +RUN: obj2yaml %t.wasm | FileCheck %s --check-prefix=NOT-FIRST + +; Check that the default is no-stack-first +RUN: wasm-ld -z stack-size=512 --export=__data_end --export=__heap_base --export=someByte -o %t.wasm %t.o +RUN: obj2yaml %t.wasm | FileCheck %s --check-prefix=NOT-FIRST + + CHECK: - Type: GLOBAL CHECK-NEXT: Globals: CHECK-NEXT: - Index: 0 @@ -51,3 +60,19 @@ CHECK-NEXT: Index: 2 CHECK-NEXT: - Name: __heap_base CHECK-NEXT: Kind: GLOBAL CHECK-NEXT: Index: 3 + +NOT-FIRST: - Type: GLOBAL +NOT-FIRST-NEXT: Globals: +NOT-FIRST-NEXT: - Index: 0 +NOT-FIRST-NEXT: Type: I32 +NOT-FIRST-NEXT: Mutable: true +NOT-FIRST-NEXT: InitExpr: +NOT-FIRST-NEXT: Opcode: I32_CONST +NOT-FIRST-NEXT: Value: 1552 +NOT-FIRST-NEXT: - Index: 1 +NOT-FIRST-NEXT: Type: I32 +NOT-FIRST-NEXT: Mutable: false +NOT-FIRST-NEXT: InitExpr: +NOT-FIRST-NEXT: Opcode: I32_CONST +NOT-FIRST-NEXT: Value: 1024 + diff --git a/lld/wasm/Driver.cpp b/lld/wasm/Driver.cpp index 9c0e1b58e62f9..aad8095881bba 100644 --- a/lld/wasm/Driver.cpp +++ b/lld/wasm/Driver.cpp @@ -595,7 +595,7 @@ static void readConfigs(opt::InputArgList &args) { ctx.arg.shlibSigCheck = !args.hasArg(OPT_no_shlib_sigcheck); ctx.arg.stripAll = args.hasArg(OPT_strip_all); ctx.arg.stripDebug = args.hasArg(OPT_strip_debug); - ctx.arg.stackFirst = args.hasArg(OPT_stack_first); + ctx.arg.stackFirst = args.hasFlag(OPT_stack_first, OPT_no_stack_first, false); ctx.arg.trace = args.hasArg(OPT_trace); ctx.arg.thinLTOCacheDir = args.getLastArgValue(OPT_thinlto_cache_dir); ctx.arg.thinLTOCachePolicy = CHECK( diff --git a/lld/wasm/Options.td b/lld/wasm/Options.td index 2f699e2f68350..0ed69258d99aa 100644 --- a/lld/wasm/Options.td +++ b/lld/wasm/Options.td @@ -250,8 +250,9 @@ def no_entry: FF<"no-entry">, def no_shlib_sigcheck: FF<"no-shlib-sigcheck">, HelpText<"Do not check signatures of functions defined in shared libraries.">; -def stack_first: FF<"stack-first">, - HelpText<"Place stack at start of linear 
memory rather than after data">;
+defm stack_first: B<"stack-first",
+  "Place stack at start of linear memory",
+  "Place the stack after static data region (default)">;
 
 def table_base: JJ<"table-base=">,
   HelpText<"Table offset at which to place address taken functions (Defaults to 1)">;

From 2bc22ea02edda5926f3e53f141def9bf212ac1db Mon Sep 17 00:00:00 2001
From: Mircea Trofin <mtrofin@google.com>
Date: Tue, 4 Nov 2025 11:00:35 -0800
Subject: [PATCH 213/313] [NFC][SLU] Update SimpleLoopUnswitch/guards.ll
 (#166285)

When running UTC on SLU/guards.ll (without LLVM changes), there are a
number of changes in the UTC-generated checks. Submitting those first to
simplify the diff of PR #164271, as most of the changes in the latter
were actually these.

---
 .../Transforms/SimpleLoopUnswitch/guards.ll   | 301 +++++++++++-------
 1 file changed, 190 insertions(+), 111 deletions(-)

diff --git a/llvm/test/Transforms/SimpleLoopUnswitch/guards.ll b/llvm/test/Transforms/SimpleLoopUnswitch/guards.ll
index 533b1f691f5ad..706b49df14749 100644
--- a/llvm/test/Transforms/SimpleLoopUnswitch/guards.ll
+++ b/llvm/test/Transforms/SimpleLoopUnswitch/guards.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: -p --version 5
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
 ; RUN: opt -passes='loop(simple-loop-unswitch<nontrivial>),verify<loops>' -simple-loop-unswitch-guards -S < %s | FileCheck %s
 ; RUN: opt -passes='simple-loop-unswitch<nontrivial>' -simple-loop-unswitch-guards -S < %s | FileCheck %s
 ; RUN: opt -passes='loop-mssa(simple-loop-unswitch<nontrivial>),verify<loops>' -simple-loop-unswitch-guards -verify-memoryssa -verify-loop-info -S < %s | FileCheck %s
@@ -6,21 +6,30 @@
 declare void @llvm.experimental.guard(i1, ...)
 
 define void @test_simple_case(i1 %cond, i32 %N) {
-; CHECK-LABEL: @test_simple_case(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 [[COND:%.*]], label [[ENTRY_SPLIT_US:%.*]], label [[ENTRY_SPLIT:%.*]]
-; CHECK:       entry.split.us:
-; CHECK-NEXT:    br label [[LOOP_US:%.*]]
-; CHECK:       loop.us:
-; CHECK-NEXT:    [[IV_US:%.*]] = phi i32 [ 0, [[ENTRY_SPLIT_US]] ], [ [[IV_NEXT_US:%.*]], [[GUARDED_US:%.*]] ]
-; CHECK-NEXT:    br label [[GUARDED_US]]
-; CHECK:       guarded.us:
+; CHECK-LABEL: define void @test_simple_case(
+; CHECK-SAME: i1 [[COND:%.*]], i32 [[N:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    br i1 [[COND]], label %[[ENTRY_SPLIT_US:.*]], label %[[ENTRY_SPLIT:.*]]
+; CHECK:       [[ENTRY_SPLIT_US]]:
+; CHECK-NEXT:    br label %[[LOOP_US:.*]]
+; CHECK:       [[LOOP_US]]:
+; CHECK-NEXT:    [[IV_US:%.*]] = phi i32 [ 0, %[[ENTRY_SPLIT_US]] ], [ [[IV_NEXT_US:%.*]], %[[GUARDED_US:.*]] ]
+; CHECK-NEXT:    br label %[[GUARDED_US]]
+; CHECK:       [[GUARDED_US]]:
 ; CHECK-NEXT:    [[IV_NEXT_US]] = add i32 [[IV_US]], 1
-; CHECK-NEXT:    [[LOOP_COND_US:%.*]] = icmp slt i32 [[IV_NEXT_US]], [[N:%.*]]
-; CHECK-NEXT:    br i1 [[LOOP_COND_US]], label [[LOOP_US]], label [[EXIT_SPLIT_US:%.*]]
-; CHECK:       deopt:
+; CHECK-NEXT:    [[LOOP_COND_US:%.*]] = icmp slt i32 [[IV_NEXT_US]], [[N]]
+; CHECK-NEXT:    br i1 [[LOOP_COND_US]], label %[[LOOP_US]], label %[[EXIT_SPLIT_US:.*]]
+; CHECK:       [[EXIT_SPLIT_US]]:
+; CHECK-NEXT:    br label %[[EXIT:.*]]
+; CHECK:       [[ENTRY_SPLIT]]:
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    br label %[[DEOPT:.*]]
+; CHECK:       [[DEOPT]]:
 ; CHECK-NEXT:    call void (i1, ...) 
@llvm.experimental.guard(i1 false) [ "deopt"() ] ; CHECK-NEXT: unreachable +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void ; entry: @@ -38,29 +47,44 @@ exit: } define void @test_two_guards(i1 %cond1, i1 %cond2, i32 %N) { -; CHECK-LABEL: @test_two_guards( -; CHECK-NEXT: entry: -; CHECK-NEXT: br i1 [[COND1:%.*]], label [[ENTRY_SPLIT_US:%.*]], label [[ENTRY_SPLIT:%.*]] -; CHECK: entry.split.us: -; CHECK-NEXT: br i1 [[COND2:%.*]], label [[ENTRY_SPLIT_US_SPLIT_US:%.*]], label [[ENTRY_SPLIT_US_SPLIT:%.*]] -; CHECK: entry.split.us.split.us: -; CHECK-NEXT: br label [[LOOP_US_US:%.*]] -; CHECK: loop.us.us: -; CHECK-NEXT: [[IV_US_US:%.*]] = phi i32 [ 0, [[ENTRY_SPLIT_US_SPLIT_US]] ], [ [[IV_NEXT_US_US:%.*]], [[GUARDED_US2:%.*]] ] -; CHECK-NEXT: br label [[GUARDED_US_US:%.*]] -; CHECK: guarded.us.us: -; CHECK-NEXT: br label [[GUARDED_US2]] -; CHECK: guarded.us2: +; CHECK-LABEL: define void @test_two_guards( +; CHECK-SAME: i1 [[COND1:%.*]], i1 [[COND2:%.*]], i32 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br i1 [[COND1]], label %[[ENTRY_SPLIT_US:.*]], label %[[ENTRY_SPLIT:.*]] +; CHECK: [[ENTRY_SPLIT_US]]: +; CHECK-NEXT: br i1 [[COND2]], label %[[ENTRY_SPLIT_US_SPLIT_US:.*]], label %[[ENTRY_SPLIT_US_SPLIT:.*]] +; CHECK: [[ENTRY_SPLIT_US_SPLIT_US]]: +; CHECK-NEXT: br label %[[LOOP_US_US:.*]] +; CHECK: [[LOOP_US_US]]: +; CHECK-NEXT: [[IV_US_US:%.*]] = phi i32 [ 0, %[[ENTRY_SPLIT_US_SPLIT_US]] ], [ [[IV_NEXT_US_US:%.*]], %[[GUARDED_US2:.*]] ] +; CHECK-NEXT: br label %[[GUARDED_US_US:.*]] +; CHECK: [[GUARDED_US_US]]: +; CHECK-NEXT: br label %[[GUARDED_US2]] +; CHECK: [[GUARDED_US2]]: ; CHECK-NEXT: [[IV_NEXT_US_US]] = add i32 [[IV_US_US]], 1 -; CHECK-NEXT: [[LOOP_COND_US_US:%.*]] = icmp slt i32 [[IV_NEXT_US_US]], [[N:%.*]] -; CHECK-NEXT: br i1 [[LOOP_COND_US_US]], label [[LOOP_US_US]], label [[EXIT_SPLIT_US_SPLIT_US:%.*]] -; CHECK: deopt1: +; CHECK-NEXT: [[LOOP_COND_US_US:%.*]] = icmp slt i32 [[IV_NEXT_US_US]], [[N]] +; CHECK-NEXT: br i1 [[LOOP_COND_US_US]], label %[[LOOP_US_US]], label %[[EXIT_SPLIT_US_SPLIT_US:.*]] +; CHECK: [[EXIT_SPLIT_US_SPLIT_US]]: +; CHECK-NEXT: br label %[[EXIT_SPLIT_US:.*]] +; CHECK: [[ENTRY_SPLIT_US_SPLIT]]: +; CHECK-NEXT: br label %[[LOOP_US:.*]] +; CHECK: [[LOOP_US]]: +; CHECK-NEXT: br label %[[GUARDED_US:.*]] +; CHECK: [[GUARDED_US]]: +; CHECK-NEXT: br label %[[DEOPT1:.*]] +; CHECK: [[DEOPT1]]: ; CHECK-NEXT: call void (i1, ...) @llvm.experimental.guard(i1 false) [ "deopt"() ] ; CHECK-NEXT: unreachable -; CHECK: deopt: +; CHECK: [[EXIT_SPLIT_US]]: +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[ENTRY_SPLIT]]: +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: br label %[[DEOPT:.*]] +; CHECK: [[DEOPT]]: ; CHECK-NEXT: call void (i1, ...) 
@llvm.experimental.guard(i1 false) [ "deopt"() ] ; CHECK-NEXT: unreachable -; CHECK: exit: +; CHECK: [[EXIT]]: ; CHECK-NEXT: ret void ; @@ -80,35 +104,46 @@ exit: } define void @test_conditional_guards(i1 %cond, i32 %N) { -; CHECK-LABEL: @test_conditional_guards( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[FROZEN:%.+]] = freeze i1 [[COND:%.*]] -; CHECK-NEXT: br i1 [[FROZEN]], label [[ENTRY_SPLIT_US:%.*]], label [[ENTRY_SPLIT:%.*]] -; CHECK: entry.split.us: -; CHECK-NEXT: br label [[LOOP_US:%.*]] -; CHECK: loop.us: -; CHECK-NEXT: [[IV_US:%.*]] = phi i32 [ 0, [[ENTRY_SPLIT_US]] ], [ [[IV_NEXT_US:%.*]], [[BACKEDGE_US:%.*]] ] +; CHECK-LABEL: define void @test_conditional_guards( +; CHECK-SAME: i1 [[COND:%.*]], i32 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[COND_FR:%.*]] = freeze i1 [[COND]] +; CHECK-NEXT: br i1 [[COND_FR]], label %[[ENTRY_SPLIT_US:.*]], label %[[ENTRY_SPLIT:.*]] +; CHECK: [[ENTRY_SPLIT_US]]: +; CHECK-NEXT: br label %[[LOOP_US:.*]] +; CHECK: [[LOOP_US]]: +; CHECK-NEXT: [[IV_US:%.*]] = phi i32 [ 0, %[[ENTRY_SPLIT_US]] ], [ [[IV_NEXT_US:%.*]], %[[BACKEDGE_US:.*]] ] ; CHECK-NEXT: [[CONDITION_US:%.*]] = icmp eq i32 [[IV_US]], 123 -; CHECK-NEXT: br i1 [[CONDITION_US]], label [[GUARD_US:%.*]], label [[BACKEDGE_US]] -; CHECK: guard.us: -; CHECK-NEXT: br label [[GUARDED_US:%.*]] -; CHECK: backedge.us: +; CHECK-NEXT: br i1 [[CONDITION_US]], label %[[GUARD_US:.*]], label %[[BACKEDGE_US]] +; CHECK: [[GUARD_US]]: +; CHECK-NEXT: br label %[[GUARDED_US:.*]] +; CHECK: [[BACKEDGE_US]]: ; CHECK-NEXT: [[IV_NEXT_US]] = add i32 [[IV_US]], 1 -; CHECK-NEXT: [[LOOP_COND_US:%.*]] = icmp slt i32 [[IV_NEXT_US]], [[N:%.*]] -; CHECK-NEXT: br i1 [[LOOP_COND_US]], label [[LOOP_US]], label [[EXIT_SPLIT_US:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY_SPLIT]] ], [ [[IV_NEXT:%.*]], [[BACKEDGE:%.*]] ] +; CHECK-NEXT: [[LOOP_COND_US:%.*]] = icmp slt i32 [[IV_NEXT_US]], [[N]] +; CHECK-NEXT: br i1 [[LOOP_COND_US]], label %[[LOOP_US]], label %[[EXIT_SPLIT_US:.*]] +; CHECK: [[GUARDED_US]]: +; CHECK-NEXT: br label %[[BACKEDGE_US]] +; CHECK: [[EXIT_SPLIT_US]]: +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[ENTRY_SPLIT]]: +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, %[[ENTRY_SPLIT]] ], [ [[IV_NEXT:%.*]], %[[BACKEDGE:.*]] ] ; CHECK-NEXT: [[CONDITION:%.*]] = icmp eq i32 [[IV]], 123 -; CHECK-NEXT: br i1 [[CONDITION]], label [[GUARD:%.*]], label [[BACKEDGE]] -; CHECK: guard: -; CHECK-NEXT: br label [[DEOPT:%.*]] -; CHECK: deopt: +; CHECK-NEXT: br i1 [[CONDITION]], label %[[GUARD:.*]], label %[[BACKEDGE]] +; CHECK: [[GUARD]]: +; CHECK-NEXT: br label %[[DEOPT:.*]] +; CHECK: [[DEOPT]]: ; CHECK-NEXT: call void (i1, ...) 
@llvm.experimental.guard(i1 false) [ "deopt"() ] ; CHECK-NEXT: unreachable -; CHECK: backedge: +; CHECK: [[BACKEDGE]]: ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 ; CHECK-NEXT: [[LOOP_COND:%.*]] = icmp slt i32 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[LOOP_COND]], label %loop, label [[EXIT_SPLIT:%.*]] +; CHECK-NEXT: br i1 [[LOOP_COND]], label %[[LOOP]], label %[[EXIT_SPLIT:.*]] +; CHECK: [[EXIT_SPLIT]]: +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void ; entry: @@ -133,53 +168,54 @@ exit: } define void @test_nested_loop(i1 %cond, i32 %N, i1 %arg) { -; CHECK-LABEL: define void @test_nested_loop(i1 %cond, i32 %N, i1 %arg) { -; CHECK-NEXT: entry: -; CHECK-NEXT: br i1 %cond, label %entry.split, label %outer_loop.split -; CHECK: entry.split: -; CHECK-NEXT: br i1 %arg, label %entry.split.split.us, label %entry.split.split -; CHECK: entry.split.split.us: -; CHECK-NEXT: br label %outer_loop.us -; CHECK: outer_loop.us: -; CHECK-NEXT: br label %outer_loop.split.us.us -; CHECK: outer_backedge.us: -; CHECK-NEXT: br label %outer_loop.us -; CHECK: outer_loop.split.us.us: -; CHECK-NEXT: br label %loop.us.us -; CHECK: loop.us.us: -; CHECK-NEXT: %iv.us.us = phi i32 [ 0, %outer_loop.split.us.us ], [ %iv.next.us.us, %guarded.us.us ] -; CHECK-NEXT: br label %guarded.us.us -; CHECK: guarded.us.us: -; CHECK-NEXT: %iv.next.us.us = add i32 %iv.us.us, 1 -; CHECK-NEXT: %loop.cond.us.us = icmp slt i32 %iv.next.us.us, %N -; CHECK-NEXT: br i1 %loop.cond.us.us, label %loop.us.us, label %outer_backedge.split.us.us -; CHECK: outer_backedge.split.us.us: -; CHECK-NEXT: br label %outer_backedge.us -; CHECK: entry.split.split: -; CHECK-NEXT: br label %outer_loop -; CHECK: outer_loop: -; CHECK-NEXT: br label %outer_loop.split.us -; CHECK: outer_loop.split.us: -; CHECK-NEXT: br label %loop.us -; CHECK: loop.us: -; CHECK-NEXT: %iv.us = phi i32 [ 0, %outer_loop.split.us ], [ %iv.next.us, %guarded.us ] -; CHECK-NEXT: br label %guarded.us -; CHECK: guarded.us: -; CHECK-NEXT: %iv.next.us = add i32 %iv.us, 1 -; CHECK-NEXT: %loop.cond.us = icmp slt i32 %iv.next.us, %N -; CHECK-NEXT: br i1 %loop.cond.us, label %loop.us, label %outer_backedge.split.us -; CHECK: outer_backedge.split.us: -; CHECK-NEXT: br label %outer_backedge -; CHECK: outer_loop.split: -; CHECK-NEXT: br label %loop -; CHECK: loop: -; CHECK-NEXT: br label %deopt -; CHECK: deopt: +; CHECK-LABEL: define void @test_nested_loop( +; CHECK-SAME: i1 [[COND:%.*]], i32 [[N:%.*]], i1 [[ARG:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br i1 [[COND]], label %[[ENTRY_SPLIT:.*]], label %[[OUTER_LOOP_SPLIT:.*]] +; CHECK: [[ENTRY_SPLIT]]: +; CHECK-NEXT: br i1 [[ARG]], label %[[ENTRY_SPLIT_SPLIT_US:.*]], label %[[ENTRY_SPLIT_SPLIT:.*]] +; CHECK: [[ENTRY_SPLIT_SPLIT_US]]: +; CHECK-NEXT: br label %[[OUTER_LOOP_US:.*]] +; CHECK: [[OUTER_LOOP_US]]: +; CHECK-NEXT: br label %[[OUTER_LOOP_SPLIT_US_US:.*]] +; CHECK: [[OUTER_BACKEDGE_US:.*]]: +; CHECK-NEXT: br label %[[OUTER_LOOP_US]] +; CHECK: [[OUTER_LOOP_SPLIT_US_US]]: +; CHECK-NEXT: br label %[[LOOP_US_US:.*]] +; CHECK: [[LOOP_US_US]]: +; CHECK-NEXT: [[IV_US_US:%.*]] = phi i32 [ 0, %[[OUTER_LOOP_SPLIT_US_US]] ], [ [[IV_NEXT_US_US:%.*]], %[[GUARDED_US_US:.*]] ] +; CHECK-NEXT: br label %[[GUARDED_US_US]] +; CHECK: [[GUARDED_US_US]]: +; CHECK-NEXT: [[IV_NEXT_US_US]] = add i32 [[IV_US_US]], 1 +; CHECK-NEXT: [[LOOP_COND_US_US:%.*]] = icmp slt i32 [[IV_NEXT_US_US]], [[N]] +; CHECK-NEXT: br i1 [[LOOP_COND_US_US]], label %[[LOOP_US_US]], label %[[OUTER_BACKEDGE_SPLIT_US_US:.*]] +; CHECK: 
[[OUTER_BACKEDGE_SPLIT_US_US]]: +; CHECK-NEXT: br label %[[OUTER_BACKEDGE_US]] +; CHECK: [[ENTRY_SPLIT_SPLIT]]: +; CHECK-NEXT: br label %[[OUTER_LOOP:.*]] +; CHECK: [[OUTER_LOOP]]: +; CHECK-NEXT: br label %[[OUTER_LOOP_SPLIT_US:.*]] +; CHECK: [[OUTER_LOOP_SPLIT_US]]: +; CHECK-NEXT: br label %[[LOOP_US:.*]] +; CHECK: [[LOOP_US]]: +; CHECK-NEXT: [[IV_US:%.*]] = phi i32 [ 0, %[[OUTER_LOOP_SPLIT_US]] ], [ [[IV_NEXT_US:%.*]], %[[GUARDED_US:.*]] ] +; CHECK-NEXT: br label %[[GUARDED_US]] +; CHECK: [[GUARDED_US]]: +; CHECK-NEXT: [[IV_NEXT_US]] = add i32 [[IV_US]], 1 +; CHECK-NEXT: [[LOOP_COND_US:%.*]] = icmp slt i32 [[IV_NEXT_US]], [[N]] +; CHECK-NEXT: br i1 [[LOOP_COND_US]], label %[[LOOP_US]], label %[[OUTER_BACKEDGE_SPLIT_US:.*]] +; CHECK: [[OUTER_BACKEDGE_SPLIT_US]]: +; CHECK-NEXT: br label %[[OUTER_BACKEDGE:.*]] +; CHECK: [[OUTER_LOOP_SPLIT]]: +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: br label %[[DEOPT:.*]] +; CHECK: [[DEOPT]]: ; CHECK-NEXT: call void (i1, ...) @llvm.experimental.guard(i1 false) [ "deopt"() ] ; CHECK-NEXT: unreachable -; CHECK: outer_backedge: -; CHECK-NEXT: br label %exit -; CHECK: exit: +; CHECK: [[OUTER_BACKEDGE]]: +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[EXIT]]: ; CHECK-NEXT: ret void ; @@ -204,17 +240,50 @@ exit: } define void @test_sibling_loops(i1 %cond1, i1 %cond2, i32 %N) { -; CHECK-LABEL: @test_sibling_loops( -; CHECK-NEXT: entry: -; CHECK-NEXT: br i1 [[COND1:%.*]], label [[ENTRY_SPLIT_US:%.*]], label [[ENTRY_SPLIT:%.*]] -; CHECK: [[IV1_US:%.*]] = phi i32 [ 0, [[ENTRY_SPLIT_US]] ], [ [[IV1_NEXT_US:%.*]], [[GUARDED_US:%.*]] ] -; CHECK-NEXT: br label [[GUARDED_US]] -; CHECK: call void (i1, ...) @llvm.experimental.guard(i1 false) [ "deopt"() ] +; CHECK-LABEL: define void @test_sibling_loops( +; CHECK-SAME: i1 [[COND1:%.*]], i1 [[COND2:%.*]], i32 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br i1 [[COND1]], label %[[ENTRY_SPLIT_US:.*]], label %[[ENTRY_SPLIT:.*]] +; CHECK: [[ENTRY_SPLIT_US]]: +; CHECK-NEXT: br label %[[LOOP1_US:.*]] +; CHECK: [[LOOP1_US]]: +; CHECK-NEXT: [[IV1_US:%.*]] = phi i32 [ 0, %[[ENTRY_SPLIT_US]] ], [ [[IV1_NEXT_US:%.*]], %[[GUARDED_US:.*]] ] +; CHECK-NEXT: br label %[[GUARDED_US]] +; CHECK: [[GUARDED_US]]: +; CHECK-NEXT: [[IV1_NEXT_US]] = add i32 [[IV1_US]], 1 +; CHECK-NEXT: [[LOOP1_COND_US:%.*]] = icmp slt i32 [[IV1_NEXT_US]], [[N]] +; CHECK-NEXT: br i1 [[LOOP1_COND_US]], label %[[LOOP1_US]], label %[[BETWEEN_SPLIT_US:.*]] +; CHECK: [[BETWEEN_SPLIT_US]]: +; CHECK-NEXT: br label %[[BETWEEN:.*]] +; CHECK: [[ENTRY_SPLIT]]: +; CHECK-NEXT: br label %[[LOOP1:.*]] +; CHECK: [[LOOP1]]: +; CHECK-NEXT: br label %[[DEOPT:.*]] +; CHECK: [[DEOPT]]: +; CHECK-NEXT: call void (i1, ...) @llvm.experimental.guard(i1 false) [ "deopt"() ] ; CHECK-NEXT: unreachable -; CHECK: [[IV2_US:%.*]] = phi i32 [ 0, [[BETWEEN:%.*]] ], [ [[IV1_NEXT_US2:%.*]], [[GUARDED_US2:%.*]] ] -; CHECK-NEXT: br label [[GUARDED_US2]] -; CHECK: call void (i1, ...) 
@llvm.experimental.guard(i1 false) [ "deopt"() ] +; CHECK: [[BETWEEN]]: +; CHECK-NEXT: br i1 [[COND2]], label %[[BETWEEN_SPLIT_US2:.*]], label %[[BETWEEN_SPLIT:.*]] +; CHECK: [[BETWEEN_SPLIT_US2]]: +; CHECK-NEXT: br label %[[LOOP2_US:.*]] +; CHECK: [[LOOP2_US]]: +; CHECK-NEXT: [[IV2_US:%.*]] = phi i32 [ 0, %[[BETWEEN_SPLIT_US2]] ], [ [[IV2_NEXT_US:%.*]], %[[GUARDED_US3:.*]] ] +; CHECK-NEXT: br label %[[GUARDED_US3]] +; CHECK: [[GUARDED_US3]]: +; CHECK-NEXT: [[IV2_NEXT_US]] = add i32 [[IV2_US]], 1 +; CHECK-NEXT: [[LOOP2_COND_US:%.*]] = icmp slt i32 [[IV2_NEXT_US]], [[N]] +; CHECK-NEXT: br i1 [[LOOP2_COND_US]], label %[[LOOP2_US]], label %[[EXIT_SPLIT_US:.*]] +; CHECK: [[EXIT_SPLIT_US]]: +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[BETWEEN_SPLIT]]: +; CHECK-NEXT: br label %[[LOOP2:.*]] +; CHECK: [[LOOP2]]: +; CHECK-NEXT: br label %[[DEOPT1:.*]] +; CHECK: [[DEOPT1]]: +; CHECK-NEXT: call void (i1, ...) @llvm.experimental.guard(i1 false) [ "deopt"() ] ; CHECK-NEXT: unreachable +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void ; entry: @@ -242,11 +311,21 @@ exit: } ; Check that we don't do anything because of cleanuppad. -; CHECK-LABEL: @test_cleanuppad( -; CHECK: call void (i1, ...) @llvm.experimental.guard(i1 %cond) [ "deopt"() ] -; CHECK-NOT: call void (i1, ...) @llvm.experimental.guard( define void @test_cleanuppad(i1 %cond, i32 %N) personality ptr @__CxxFrameHandler3 { - +; CHECK-LABEL: define void @test_cleanuppad( +; CHECK-SAME: i1 [[COND:%.*]], i32 [[N:%.*]]) personality ptr @__CxxFrameHandler3 { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: call void (i1, ...) @llvm.experimental.guard(i1 [[COND]]) [ "deopt"() ] +; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 +; CHECK-NEXT: invoke void @may_throw(i32 [[IV]]) +; CHECK-NEXT: to label %[[LOOP]] unwind label %[[EXIT:.*]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[CP:%.*]] = cleanuppad within none [] +; CHECK-NEXT: cleanupret from [[CP]] unwind to caller +; entry: br label %loop From 7398591148f4351b38404304d8e2acb80651aaf3 Mon Sep 17 00:00:00 2001 From: Grigory Pastukhov <99913765+grigorypas@users.noreply.github.com> Date: Tue, 4 Nov 2025 11:01:50 -0800 Subject: [PATCH 214/313] [CodeGen] Add skipFunction() check to MachineFunctionSplitter (#166260) MachineFunctionSplitter was missing a skipFunction() check, causing it to incorrectly split functions that should be skipped (e.g., functions with optnone attribute). This patch adds an early skipFunction() check in runOnMachineFunction() to ensure these functions are never split, regardless of profile data availability or other splitting conditions. --- llvm/lib/CodeGen/MachineFunctionSplitter.cpp | 3 ++ .../machine-function-splitter-optnone.ll | 50 +++++++++++++++++++ 2 files changed, 53 insertions(+) create mode 100644 llvm/test/CodeGen/Generic/machine-function-splitter-optnone.ll diff --git a/llvm/lib/CodeGen/MachineFunctionSplitter.cpp b/llvm/lib/CodeGen/MachineFunctionSplitter.cpp index c31454a8affda..b5d3092ee84d8 100644 --- a/llvm/lib/CodeGen/MachineFunctionSplitter.cpp +++ b/llvm/lib/CodeGen/MachineFunctionSplitter.cpp @@ -129,6 +129,9 @@ static bool isColdBlock(const MachineBasicBlock &MBB, } bool MachineFunctionSplitter::runOnMachineFunction(MachineFunction &MF) { + if (skipFunction(MF.getFunction())) + return false; + // Do not split functions when -basic-block-sections=all is specified. 
if (MF.getTarget().getBBSectionsType() == llvm::BasicBlockSection::All) return false; diff --git a/llvm/test/CodeGen/Generic/machine-function-splitter-optnone.ll b/llvm/test/CodeGen/Generic/machine-function-splitter-optnone.ll new file mode 100644 index 0000000000000..67d2ad72ee2f4 --- /dev/null +++ b/llvm/test/CodeGen/Generic/machine-function-splitter-optnone.ll @@ -0,0 +1,50 @@ +; REQUIRES: x86-registered-target + +; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -split-machine-functions -O0 -mfs-psi-cutoff=0 -mfs-count-threshold=10000 | FileCheck %s + +;; Check that functions with optnone attribute are not split. +; CHECK-LABEL: foo_optnone: +; CHECK-NOT: .section .text.split.foo_optnone +; CHECK-NOT: foo_optnone.cold: +; CHECK: .LBB0_2: +; CHECK: .size foo_optnone + +define void @foo_optnone(i1 zeroext %0) nounwind optnone noinline !prof !14 !section_prefix !15 { +entry: + br i1 %0, label %hot, label %cold, !prof !17 + +hot: + %1 = call i32 @bar() + br label %exit + +cold: + %2 = call i32 @baz() + br label %exit + +exit: + %3 = tail call i32 @qux() + ret void +} + +declare i32 @bar() +declare i32 @baz() +declare i32 @qux() + +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"ProfileSummary", !1} +!1 = !{!2, !3, !4, !5, !6, !7, !8, !9} +!2 = !{!"ProfileFormat", !"InstrProf"} +!3 = !{!"TotalCount", i64 10000} +!4 = !{!"MaxCount", i64 10} +!5 = !{!"MaxInternalCount", i64 1} +!6 = !{!"MaxFunctionCount", i64 1000} +!7 = !{!"NumCounts", i64 3} +!8 = !{!"NumFunctions", i64 5} +!9 = !{!"DetailedSummary", !10} +!10 = !{!11, !12, !13} +!11 = !{i32 10000, i64 100, i32 1} +!12 = !{i32 999900, i64 100, i32 1} +!13 = !{i32 999999, i64 1, i32 2} +!14 = !{!"function_entry_count", i64 7000} +!15 = !{!"function_section_prefix", !"hot"} +!17 = !{!"branch_weights", i32 7000, i32 0} From fe5c3cbbd02f201f96e2f55aebe5e0a1032c1594 Mon Sep 17 00:00:00 2001 From: yasmincs <ysarita@nvidia.com> Date: Tue, 4 Nov 2025 11:02:49 -0800 Subject: [PATCH 215/313] Added Conditions of SM90 and ISA7.8 for Using cvt.ftz.f32.bf16 Instruction (#165774) Updated the conditions for generating the cvt.ftz.f32.bf16 instruction to include sm90 and isa7.8, so that ftz is only generated when it is supported. 
--------- Co-authored-by: Justin Fargnoli <jfargnoli@nvidia.com> --- llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 2 +- llvm/test/CodeGen/NVPTX/bf16-instructions.ll | 343 +++++++++++++++++-- 2 files changed, 311 insertions(+), 34 deletions(-) diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td index b26022184708c..f0bdf472b96ed 100644 --- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -2267,7 +2267,7 @@ def : Pat<(f32 (fpround f64:$a)), (CVT_f32_f64 $a, CvtRN)>; def : Pat<(f32 (fpextend f16:$a)), (CVT_f32_f16 $a, CvtNONE_FTZ)>, Requires<[doF32FTZ]>; def : Pat<(f32 (fpextend f16:$a)), (CVT_f32_f16 $a, CvtNONE)>; // fpextend bf16 -> f32 -def : Pat<(f32 (fpextend bf16:$a)), (CVT_f32_bf16 $a, CvtNONE_FTZ)>, Requires<[doF32FTZ]>; +def : Pat<(f32 (fpextend bf16:$a)), (CVT_f32_bf16 $a, CvtNONE_FTZ)>, Requires<[doF32FTZ, hasPTX<78>, hasSM<90>]>; def : Pat<(f32 (fpextend bf16:$a)), (CVT_f32_bf16 $a, CvtNONE)>, Requires<[hasPTX<71>, hasSM<80>]>; // fpextend f16 -> f64 diff --git a/llvm/test/CodeGen/NVPTX/bf16-instructions.ll b/llvm/test/CodeGen/NVPTX/bf16-instructions.ll index 4d930cd9e57c0..3626613cf8511 100644 --- a/llvm/test/CodeGen/NVPTX/bf16-instructions.ll +++ b/llvm/test/CodeGen/NVPTX/bf16-instructions.ll @@ -2,6 +2,7 @@ ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_70 -mattr=+ptx71 | FileCheck --check-prefixes=CHECK,SM70 %s ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx71 | FileCheck --check-prefixes=CHECK,SM80 %s ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx71 -denormal-fp-math-f32=preserve-sign | FileCheck --check-prefixes=CHECK,SM80-FTZ %s +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx78 -denormal-fp-math-f32=preserve-sign | FileCheck --check-prefixes=CHECK,SM90-FTZ %s ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx78 | FileCheck --check-prefixes=CHECK,SM90 %s ; RUN: %if ptxas-sm_80 && ptxas-isa-7.1 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx71 | %ptxas-verify -arch=sm_80 %} ; RUN: %if ptxas-sm_80 && ptxas-isa-7.1 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx71 -denormal-fp-math-f32=preserve-sign | %ptxas-verify -arch=sm_80 %} @@ -55,13 +56,24 @@ define bfloat @test_fadd(bfloat %0, bfloat %1) { ; SM80-FTZ-NEXT: // %bb.0: ; SM80-FTZ-NEXT: ld.param.b16 %rs1, [test_fadd_param_0]; ; SM80-FTZ-NEXT: ld.param.b16 %rs2, [test_fadd_param_1]; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r1, %rs2; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r2, %rs1; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r1, %rs2; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r2, %rs1; ; SM80-FTZ-NEXT: add.rn.ftz.f32 %r3, %r2, %r1; ; SM80-FTZ-NEXT: cvt.rn.bf16.f32 %rs3, %r3; ; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs3; ; SM80-FTZ-NEXT: ret; ; +; SM90-FTZ-LABEL: test_fadd( +; SM90-FTZ: { +; SM90-FTZ-NEXT: .reg .b16 %rs<4>; +; SM90-FTZ-EMPTY: +; SM90-FTZ-NEXT: // %bb.0: +; SM90-FTZ-NEXT: ld.param.b16 %rs1, [test_fadd_param_0]; +; SM90-FTZ-NEXT: ld.param.b16 %rs2, [test_fadd_param_1]; +; SM90-FTZ-NEXT: add.rn.bf16 %rs3, %rs1, %rs2; +; SM90-FTZ-NEXT: st.param.b16 [func_retval0], %rs3; +; SM90-FTZ-NEXT: ret; +; ; SM90-LABEL: test_fadd( ; SM90: { ; SM90-NEXT: .reg .b16 %rs<4>; @@ -118,13 +130,24 @@ define bfloat @test_fsub(bfloat %0, bfloat %1) { ; SM80-FTZ-NEXT: // %bb.0: ; SM80-FTZ-NEXT: ld.param.b16 %rs1, [test_fsub_param_0]; ; SM80-FTZ-NEXT: ld.param.b16 %rs2, [test_fsub_param_1]; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r1, %rs2; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r2, %rs1; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r1, %rs2; +; SM80-FTZ-NEXT: 
cvt.f32.bf16 %r2, %rs1; ; SM80-FTZ-NEXT: sub.rn.ftz.f32 %r3, %r2, %r1; ; SM80-FTZ-NEXT: cvt.rn.bf16.f32 %rs3, %r3; ; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs3; ; SM80-FTZ-NEXT: ret; ; +; SM90-FTZ-LABEL: test_fsub( +; SM90-FTZ: { +; SM90-FTZ-NEXT: .reg .b16 %rs<4>; +; SM90-FTZ-EMPTY: +; SM90-FTZ-NEXT: // %bb.0: +; SM90-FTZ-NEXT: ld.param.b16 %rs1, [test_fsub_param_0]; +; SM90-FTZ-NEXT: ld.param.b16 %rs2, [test_fsub_param_1]; +; SM90-FTZ-NEXT: sub.rn.bf16 %rs3, %rs1, %rs2; +; SM90-FTZ-NEXT: st.param.b16 [func_retval0], %rs3; +; SM90-FTZ-NEXT: ret; +; ; SM90-LABEL: test_fsub( ; SM90: { ; SM90-NEXT: .reg .b16 %rs<4>; @@ -195,16 +218,27 @@ define <2 x bfloat> @test_faddx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 { ; SM80-FTZ-NEXT: // %bb.0: ; SM80-FTZ-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_faddx2_param_0]; ; SM80-FTZ-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_faddx2_param_1]; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r1, %rs3; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r2, %rs1; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r1, %rs3; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r2, %rs1; ; SM80-FTZ-NEXT: add.rn.ftz.f32 %r3, %r2, %r1; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r4, %rs4; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r5, %rs2; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r4, %rs4; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r5, %rs2; ; SM80-FTZ-NEXT: add.rn.ftz.f32 %r6, %r5, %r4; ; SM80-FTZ-NEXT: cvt.rn.bf16x2.f32 %r7, %r6, %r3; ; SM80-FTZ-NEXT: st.param.b32 [func_retval0], %r7; ; SM80-FTZ-NEXT: ret; ; +; SM90-FTZ-LABEL: test_faddx2( +; SM90-FTZ: { +; SM90-FTZ-NEXT: .reg .b32 %r<4>; +; SM90-FTZ-EMPTY: +; SM90-FTZ-NEXT: // %bb.0: +; SM90-FTZ-NEXT: ld.param.b32 %r1, [test_faddx2_param_0]; +; SM90-FTZ-NEXT: ld.param.b32 %r2, [test_faddx2_param_1]; +; SM90-FTZ-NEXT: add.rn.bf16x2 %r3, %r1, %r2; +; SM90-FTZ-NEXT: st.param.b32 [func_retval0], %r3; +; SM90-FTZ-NEXT: ret; +; ; SM90-LABEL: test_faddx2( ; SM90: { ; SM90-NEXT: .reg .b32 %r<4>; @@ -275,16 +309,27 @@ define <2 x bfloat> @test_fsubx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 { ; SM80-FTZ-NEXT: // %bb.0: ; SM80-FTZ-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fsubx2_param_0]; ; SM80-FTZ-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_fsubx2_param_1]; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r1, %rs3; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r2, %rs1; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r1, %rs3; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r2, %rs1; ; SM80-FTZ-NEXT: sub.rn.ftz.f32 %r3, %r2, %r1; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r4, %rs4; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r5, %rs2; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r4, %rs4; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r5, %rs2; ; SM80-FTZ-NEXT: sub.rn.ftz.f32 %r6, %r5, %r4; ; SM80-FTZ-NEXT: cvt.rn.bf16x2.f32 %r7, %r6, %r3; ; SM80-FTZ-NEXT: st.param.b32 [func_retval0], %r7; ; SM80-FTZ-NEXT: ret; ; +; SM90-FTZ-LABEL: test_fsubx2( +; SM90-FTZ: { +; SM90-FTZ-NEXT: .reg .b32 %r<4>; +; SM90-FTZ-EMPTY: +; SM90-FTZ-NEXT: // %bb.0: +; SM90-FTZ-NEXT: ld.param.b32 %r1, [test_fsubx2_param_0]; +; SM90-FTZ-NEXT: ld.param.b32 %r2, [test_fsubx2_param_1]; +; SM90-FTZ-NEXT: sub.rn.bf16x2 %r3, %r1, %r2; +; SM90-FTZ-NEXT: st.param.b32 [func_retval0], %r3; +; SM90-FTZ-NEXT: ret; +; ; SM90-LABEL: test_fsubx2( ; SM90: { ; SM90-NEXT: .reg .b32 %r<4>; @@ -355,16 +400,27 @@ define <2 x bfloat> @test_fmulx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 { ; SM80-FTZ-NEXT: // %bb.0: ; SM80-FTZ-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fmulx2_param_0]; ; SM80-FTZ-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_fmulx2_param_1]; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r1, %rs3; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r2, %rs1; +; SM80-FTZ-NEXT: cvt.f32.bf16 
%r1, %rs3; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r2, %rs1; ; SM80-FTZ-NEXT: mul.rn.ftz.f32 %r3, %r2, %r1; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r4, %rs4; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r5, %rs2; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r4, %rs4; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r5, %rs2; ; SM80-FTZ-NEXT: mul.rn.ftz.f32 %r6, %r5, %r4; ; SM80-FTZ-NEXT: cvt.rn.bf16x2.f32 %r7, %r6, %r3; ; SM80-FTZ-NEXT: st.param.b32 [func_retval0], %r7; ; SM80-FTZ-NEXT: ret; ; +; SM90-FTZ-LABEL: test_fmulx2( +; SM90-FTZ: { +; SM90-FTZ-NEXT: .reg .b32 %r<4>; +; SM90-FTZ-EMPTY: +; SM90-FTZ-NEXT: // %bb.0: +; SM90-FTZ-NEXT: ld.param.b32 %r1, [test_fmulx2_param_0]; +; SM90-FTZ-NEXT: ld.param.b32 %r2, [test_fmulx2_param_1]; +; SM90-FTZ-NEXT: mul.rn.bf16x2 %r3, %r1, %r2; +; SM90-FTZ-NEXT: st.param.b32 [func_retval0], %r3; +; SM90-FTZ-NEXT: ret; +; ; SM90-LABEL: test_fmulx2( ; SM90: { ; SM90-NEXT: .reg .b32 %r<4>; @@ -441,16 +497,34 @@ define <2 x bfloat> @test_fdiv(<2 x bfloat> %a, <2 x bfloat> %b) #0 { ; SM80-FTZ-NEXT: // %bb.0: ; SM80-FTZ-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fdiv_param_0]; ; SM80-FTZ-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_fdiv_param_1]; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r1, %rs3; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r2, %rs1; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r1, %rs3; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r2, %rs1; ; SM80-FTZ-NEXT: div.rn.ftz.f32 %r3, %r2, %r1; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r4, %rs4; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r5, %rs2; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r4, %rs4; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r5, %rs2; ; SM80-FTZ-NEXT: div.rn.ftz.f32 %r6, %r5, %r4; ; SM80-FTZ-NEXT: cvt.rn.bf16x2.f32 %r7, %r6, %r3; ; SM80-FTZ-NEXT: st.param.b32 [func_retval0], %r7; ; SM80-FTZ-NEXT: ret; ; +; SM90-FTZ-LABEL: test_fdiv( +; SM90-FTZ: { +; SM90-FTZ-NEXT: .reg .b16 %rs<5>; +; SM90-FTZ-NEXT: .reg .b32 %r<8>; +; SM90-FTZ-EMPTY: +; SM90-FTZ-NEXT: // %bb.0: +; SM90-FTZ-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fdiv_param_0]; +; SM90-FTZ-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_fdiv_param_1]; +; SM90-FTZ-NEXT: cvt.ftz.f32.bf16 %r1, %rs3; +; SM90-FTZ-NEXT: cvt.ftz.f32.bf16 %r2, %rs1; +; SM90-FTZ-NEXT: div.rn.ftz.f32 %r3, %r2, %r1; +; SM90-FTZ-NEXT: cvt.ftz.f32.bf16 %r4, %rs4; +; SM90-FTZ-NEXT: cvt.ftz.f32.bf16 %r5, %rs2; +; SM90-FTZ-NEXT: div.rn.ftz.f32 %r6, %r5, %r4; +; SM90-FTZ-NEXT: cvt.rn.bf16x2.f32 %r7, %r6, %r3; +; SM90-FTZ-NEXT: st.param.b32 [func_retval0], %r7; +; SM90-FTZ-NEXT: ret; +; ; SM90-LABEL: test_fdiv( ; SM90: { ; SM90-NEXT: .reg .b16 %rs<5>; @@ -527,10 +601,21 @@ define float @test_fpext_float(bfloat %a) #0 { ; SM80-FTZ-EMPTY: ; SM80-FTZ-NEXT: // %bb.0: ; SM80-FTZ-NEXT: ld.param.b16 %rs1, [test_fpext_float_param_0]; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r1, %rs1; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r1, %rs1; ; SM80-FTZ-NEXT: st.param.b32 [func_retval0], %r1; ; SM80-FTZ-NEXT: ret; ; +; SM90-FTZ-LABEL: test_fpext_float( +; SM90-FTZ: { +; SM90-FTZ-NEXT: .reg .b16 %rs<2>; +; SM90-FTZ-NEXT: .reg .b32 %r<2>; +; SM90-FTZ-EMPTY: +; SM90-FTZ-NEXT: // %bb.0: +; SM90-FTZ-NEXT: ld.param.b16 %rs1, [test_fpext_float_param_0]; +; SM90-FTZ-NEXT: cvt.ftz.f32.bf16 %r1, %rs1; +; SM90-FTZ-NEXT: st.param.b32 [func_retval0], %r1; +; SM90-FTZ-NEXT: ret; +; ; SM90-LABEL: test_fpext_float( ; SM90: { ; SM90-NEXT: .reg .b16 %rs<2>; @@ -585,6 +670,17 @@ define bfloat @test_fptrunc_float(float %a) #0 { ; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs1; ; SM80-FTZ-NEXT: ret; ; +; SM90-FTZ-LABEL: test_fptrunc_float( +; SM90-FTZ: { +; SM90-FTZ-NEXT: .reg .b16 %rs<2>; +; SM90-FTZ-NEXT: .reg .b32 %r<2>; +; SM90-FTZ-EMPTY: +; 
SM90-FTZ-NEXT: // %bb.0: +; SM90-FTZ-NEXT: ld.param.b32 %r1, [test_fptrunc_float_param_0]; +; SM90-FTZ-NEXT: cvt.rn.bf16.f32 %rs1, %r1; +; SM90-FTZ-NEXT: st.param.b16 [func_retval0], %rs1; +; SM90-FTZ-NEXT: ret; +; ; SM90-LABEL: test_fptrunc_float( ; SM90: { ; SM90-NEXT: .reg .b16 %rs<2>; @@ -637,12 +733,23 @@ define bfloat @test_fadd_imm_1(bfloat %a) #0 { ; SM80-FTZ-EMPTY: ; SM80-FTZ-NEXT: // %bb.0: ; SM80-FTZ-NEXT: ld.param.b16 %rs1, [test_fadd_imm_1_param_0]; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r1, %rs1; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r1, %rs1; ; SM80-FTZ-NEXT: add.rn.ftz.f32 %r2, %r1, 0f3F800000; ; SM80-FTZ-NEXT: cvt.rn.bf16.f32 %rs2, %r2; ; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs2; ; SM80-FTZ-NEXT: ret; ; +; SM90-FTZ-LABEL: test_fadd_imm_1( +; SM90-FTZ: { +; SM90-FTZ-NEXT: .reg .b16 %rs<4>; +; SM90-FTZ-EMPTY: +; SM90-FTZ-NEXT: // %bb.0: +; SM90-FTZ-NEXT: ld.param.b16 %rs1, [test_fadd_imm_1_param_0]; +; SM90-FTZ-NEXT: mov.b16 %rs2, 0x3F80; +; SM90-FTZ-NEXT: add.rn.bf16 %rs3, %rs1, %rs2; +; SM90-FTZ-NEXT: st.param.b16 [func_retval0], %rs3; +; SM90-FTZ-NEXT: ret; +; ; SM90-LABEL: test_fadd_imm_1( ; SM90: { ; SM90-NEXT: .reg .b16 %rs<4>; @@ -750,18 +857,43 @@ define <8 x float> @test_extload_bf16x8(ptr addrspace(3) noundef %arg) #0 { ; SM80-FTZ-NEXT: mov.b32 {%rs3, %rs4}, %r4; ; SM80-FTZ-NEXT: mov.b32 {%rs5, %rs6}, %r1; ; SM80-FTZ-NEXT: mov.b32 {%rs7, %rs8}, %r2; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r5, %rs8; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r6, %rs7; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r7, %rs6; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r8, %rs5; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r9, %rs4; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r10, %rs3; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r11, %rs2; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r12, %rs1; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r5, %rs8; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r6, %rs7; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r7, %rs6; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r8, %rs5; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r9, %rs4; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r10, %rs3; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r11, %rs2; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r12, %rs1; ; SM80-FTZ-NEXT: st.param.v4.b32 [func_retval0+16], {%r12, %r11, %r10, %r9}; ; SM80-FTZ-NEXT: st.param.v4.b32 [func_retval0], {%r8, %r7, %r6, %r5}; ; SM80-FTZ-NEXT: ret; ; +; SM90-FTZ-LABEL: test_extload_bf16x8( +; SM90-FTZ: { +; SM90-FTZ-NEXT: .reg .b16 %rs<9>; +; SM90-FTZ-NEXT: .reg .b32 %r<13>; +; SM90-FTZ-NEXT: .reg .b64 %rd<2>; +; SM90-FTZ-EMPTY: +; SM90-FTZ-NEXT: // %bb.0: +; SM90-FTZ-NEXT: ld.param.b64 %rd1, [test_extload_bf16x8_param_0]; +; SM90-FTZ-NEXT: ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; +; SM90-FTZ-NEXT: mov.b32 {%rs1, %rs2}, %r3; +; SM90-FTZ-NEXT: mov.b32 {%rs3, %rs4}, %r4; +; SM90-FTZ-NEXT: mov.b32 {%rs5, %rs6}, %r1; +; SM90-FTZ-NEXT: mov.b32 {%rs7, %rs8}, %r2; +; SM90-FTZ-NEXT: cvt.ftz.f32.bf16 %r5, %rs8; +; SM90-FTZ-NEXT: cvt.ftz.f32.bf16 %r6, %rs7; +; SM90-FTZ-NEXT: cvt.ftz.f32.bf16 %r7, %rs6; +; SM90-FTZ-NEXT: cvt.ftz.f32.bf16 %r8, %rs5; +; SM90-FTZ-NEXT: cvt.ftz.f32.bf16 %r9, %rs4; +; SM90-FTZ-NEXT: cvt.ftz.f32.bf16 %r10, %rs3; +; SM90-FTZ-NEXT: cvt.ftz.f32.bf16 %r11, %rs2; +; SM90-FTZ-NEXT: cvt.ftz.f32.bf16 %r12, %rs1; +; SM90-FTZ-NEXT: st.param.v4.b32 [func_retval0+16], {%r12, %r11, %r10, %r9}; +; SM90-FTZ-NEXT: st.param.v4.b32 [func_retval0], {%r8, %r7, %r6, %r5}; +; SM90-FTZ-NEXT: ret; +; ; SM90-LABEL: test_extload_bf16x8( ; SM90: { ; SM90-NEXT: .reg .b16 %rs<9>; @@ -825,12 +957,24 @@ define i16 @test_fptosi_i16(bfloat %a) { ; SM80-FTZ-EMPTY: ; SM80-FTZ-NEXT: // %bb.0: ; SM80-FTZ-NEXT: 
ld.param.b16 %rs1, [test_fptosi_i16_param_0]; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r1, %rs1; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r1, %rs1; ; SM80-FTZ-NEXT: cvt.rzi.ftz.s16.f32 %rs2, %r1; ; SM80-FTZ-NEXT: cvt.u32.u16 %r2, %rs2; ; SM80-FTZ-NEXT: st.param.b32 [func_retval0], %r2; ; SM80-FTZ-NEXT: ret; ; +; SM90-FTZ-LABEL: test_fptosi_i16( +; SM90-FTZ: { +; SM90-FTZ-NEXT: .reg .b16 %rs<3>; +; SM90-FTZ-NEXT: .reg .b32 %r<2>; +; SM90-FTZ-EMPTY: +; SM90-FTZ-NEXT: // %bb.0: +; SM90-FTZ-NEXT: ld.param.b16 %rs1, [test_fptosi_i16_param_0]; +; SM90-FTZ-NEXT: cvt.rzi.s16.bf16 %rs2, %rs1; +; SM90-FTZ-NEXT: cvt.u32.u16 %r1, %rs2; +; SM90-FTZ-NEXT: st.param.b32 [func_retval0], %r1; +; SM90-FTZ-NEXT: ret; +; ; SM90-LABEL: test_fptosi_i16( ; SM90: { ; SM90-NEXT: .reg .b16 %rs<3>; @@ -880,12 +1024,24 @@ define i16 @test_fptoui_i16(bfloat %a) { ; SM80-FTZ-EMPTY: ; SM80-FTZ-NEXT: // %bb.0: ; SM80-FTZ-NEXT: ld.param.b16 %rs1, [test_fptoui_i16_param_0]; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r1, %rs1; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r1, %rs1; ; SM80-FTZ-NEXT: cvt.rzi.ftz.u16.f32 %rs2, %r1; ; SM80-FTZ-NEXT: cvt.u32.u16 %r2, %rs2; ; SM80-FTZ-NEXT: st.param.b32 [func_retval0], %r2; ; SM80-FTZ-NEXT: ret; ; +; SM90-FTZ-LABEL: test_fptoui_i16( +; SM90-FTZ: { +; SM90-FTZ-NEXT: .reg .b16 %rs<3>; +; SM90-FTZ-NEXT: .reg .b32 %r<2>; +; SM90-FTZ-EMPTY: +; SM90-FTZ-NEXT: // %bb.0: +; SM90-FTZ-NEXT: ld.param.b16 %rs1, [test_fptoui_i16_param_0]; +; SM90-FTZ-NEXT: cvt.rzi.u16.bf16 %rs2, %rs1; +; SM90-FTZ-NEXT: cvt.u32.u16 %r1, %rs2; +; SM90-FTZ-NEXT: st.param.b32 [func_retval0], %r1; +; SM90-FTZ-NEXT: ret; +; ; SM90-LABEL: test_fptoui_i16( ; SM90: { ; SM90-NEXT: .reg .b16 %rs<3>; @@ -945,6 +1101,16 @@ define bfloat @test_sitofp_i16(i16 %a) { ; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs2; ; SM80-FTZ-NEXT: ret; ; +; SM90-FTZ-LABEL: test_sitofp_i16( +; SM90-FTZ: { +; SM90-FTZ-NEXT: .reg .b16 %rs<3>; +; SM90-FTZ-EMPTY: +; SM90-FTZ-NEXT: // %bb.0: +; SM90-FTZ-NEXT: ld.param.b16 %rs1, [test_sitofp_i16_param_0]; +; SM90-FTZ-NEXT: cvt.rn.bf16.s16 %rs2, %rs1; +; SM90-FTZ-NEXT: st.param.b16 [func_retval0], %rs2; +; SM90-FTZ-NEXT: ret; +; ; SM90-LABEL: test_sitofp_i16( ; SM90: { ; SM90-NEXT: .reg .b16 %rs<3>; @@ -1002,6 +1168,16 @@ define bfloat @test_uitofp_i8(i8 %a) { ; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs2; ; SM80-FTZ-NEXT: ret; ; +; SM90-FTZ-LABEL: test_uitofp_i8( +; SM90-FTZ: { +; SM90-FTZ-NEXT: .reg .b16 %rs<3>; +; SM90-FTZ-EMPTY: +; SM90-FTZ-NEXT: // %bb.0: +; SM90-FTZ-NEXT: ld.param.b8 %rs1, [test_uitofp_i8_param_0]; +; SM90-FTZ-NEXT: cvt.rn.bf16.u16 %rs2, %rs1; +; SM90-FTZ-NEXT: st.param.b16 [func_retval0], %rs2; +; SM90-FTZ-NEXT: ret; +; ; SM90-LABEL: test_uitofp_i8( ; SM90: { ; SM90-NEXT: .reg .b16 %rs<3>; @@ -1070,6 +1246,21 @@ define bfloat @test_uitofp_i1(i1 %a) { ; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs3; ; SM80-FTZ-NEXT: ret; ; +; SM90-FTZ-LABEL: test_uitofp_i1( +; SM90-FTZ: { +; SM90-FTZ-NEXT: .reg .pred %p<2>; +; SM90-FTZ-NEXT: .reg .b16 %rs<4>; +; SM90-FTZ-NEXT: .reg .b32 %r<2>; +; SM90-FTZ-EMPTY: +; SM90-FTZ-NEXT: // %bb.0: +; SM90-FTZ-NEXT: ld.param.b8 %rs1, [test_uitofp_i1_param_0]; +; SM90-FTZ-NEXT: and.b16 %rs2, %rs1, 1; +; SM90-FTZ-NEXT: setp.ne.b16 %p1, %rs2, 0; +; SM90-FTZ-NEXT: selp.b32 %r1, 1, 0, %p1; +; SM90-FTZ-NEXT: cvt.rn.bf16.u32 %rs3, %r1; +; SM90-FTZ-NEXT: st.param.b16 [func_retval0], %rs3; +; SM90-FTZ-NEXT: ret; +; ; SM90-LABEL: test_uitofp_i1( ; SM90: { ; SM90-NEXT: .reg .pred %p<2>; @@ -1132,6 +1323,16 @@ define bfloat @test_uitofp_i16(i16 %a) { ; SM80-FTZ-NEXT: st.param.b16 
[func_retval0], %rs2; ; SM80-FTZ-NEXT: ret; ; +; SM90-FTZ-LABEL: test_uitofp_i16( +; SM90-FTZ: { +; SM90-FTZ-NEXT: .reg .b16 %rs<3>; +; SM90-FTZ-EMPTY: +; SM90-FTZ-NEXT: // %bb.0: +; SM90-FTZ-NEXT: ld.param.b16 %rs1, [test_uitofp_i16_param_0]; +; SM90-FTZ-NEXT: cvt.rn.bf16.u16 %rs2, %rs1; +; SM90-FTZ-NEXT: st.param.b16 [func_retval0], %rs2; +; SM90-FTZ-NEXT: ret; +; ; SM90-LABEL: test_uitofp_i16( ; SM90: { ; SM90-NEXT: .reg .b16 %rs<3>; @@ -1188,6 +1389,17 @@ define bfloat @test_uitofp_i32(i32 %a) { ; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs1; ; SM80-FTZ-NEXT: ret; ; +; SM90-FTZ-LABEL: test_uitofp_i32( +; SM90-FTZ: { +; SM90-FTZ-NEXT: .reg .b16 %rs<2>; +; SM90-FTZ-NEXT: .reg .b32 %r<2>; +; SM90-FTZ-EMPTY: +; SM90-FTZ-NEXT: // %bb.0: +; SM90-FTZ-NEXT: ld.param.b32 %r1, [test_uitofp_i32_param_0]; +; SM90-FTZ-NEXT: cvt.rn.bf16.u32 %rs1, %r1; +; SM90-FTZ-NEXT: st.param.b16 [func_retval0], %rs1; +; SM90-FTZ-NEXT: ret; +; ; SM90-LABEL: test_uitofp_i32( ; SM90: { ; SM90-NEXT: .reg .b16 %rs<2>; @@ -1248,6 +1460,17 @@ define bfloat @test_uitofp_i64(i64 %a) { ; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs1; ; SM80-FTZ-NEXT: ret; ; +; SM90-FTZ-LABEL: test_uitofp_i64( +; SM90-FTZ: { +; SM90-FTZ-NEXT: .reg .b16 %rs<2>; +; SM90-FTZ-NEXT: .reg .b64 %rd<2>; +; SM90-FTZ-EMPTY: +; SM90-FTZ-NEXT: // %bb.0: +; SM90-FTZ-NEXT: ld.param.b64 %rd1, [test_uitofp_i64_param_0]; +; SM90-FTZ-NEXT: cvt.rn.bf16.u64 %rs1, %rd1; +; SM90-FTZ-NEXT: st.param.b16 [func_retval0], %rs1; +; SM90-FTZ-NEXT: ret; +; ; SM90-LABEL: test_uitofp_i64( ; SM90: { ; SM90-NEXT: .reg .b16 %rs<2>; @@ -1302,12 +1525,22 @@ define bfloat @test_roundeven(bfloat %a) { ; SM80-FTZ-EMPTY: ; SM80-FTZ-NEXT: // %bb.0: ; SM80-FTZ-NEXT: ld.param.b16 %rs1, [test_roundeven_param_0]; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r1, %rs1; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r1, %rs1; ; SM80-FTZ-NEXT: cvt.rni.ftz.f32.f32 %r2, %r1; ; SM80-FTZ-NEXT: cvt.rn.bf16.f32 %rs2, %r2; ; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs2; ; SM80-FTZ-NEXT: ret; ; +; SM90-FTZ-LABEL: test_roundeven( +; SM90-FTZ: { +; SM90-FTZ-NEXT: .reg .b16 %rs<3>; +; SM90-FTZ-EMPTY: +; SM90-FTZ-NEXT: // %bb.0: +; SM90-FTZ-NEXT: ld.param.b16 %rs1, [test_roundeven_param_0]; +; SM90-FTZ-NEXT: cvt.rni.bf16.bf16 %rs2, %rs1; +; SM90-FTZ-NEXT: st.param.b16 [func_retval0], %rs2; +; SM90-FTZ-NEXT: ret; +; ; SM90-LABEL: test_roundeven( ; SM90: { ; SM90-NEXT: .reg .b16 %rs<3>; @@ -1372,6 +1605,17 @@ define bfloat @test_maximum(bfloat %a, bfloat %b) { ; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs3; ; SM80-FTZ-NEXT: ret; ; +; SM90-FTZ-LABEL: test_maximum( +; SM90-FTZ: { +; SM90-FTZ-NEXT: .reg .b16 %rs<4>; +; SM90-FTZ-EMPTY: +; SM90-FTZ-NEXT: // %bb.0: +; SM90-FTZ-NEXT: ld.param.b16 %rs1, [test_maximum_param_0]; +; SM90-FTZ-NEXT: ld.param.b16 %rs2, [test_maximum_param_1]; +; SM90-FTZ-NEXT: max.NaN.bf16 %rs3, %rs1, %rs2; +; SM90-FTZ-NEXT: st.param.b16 [func_retval0], %rs3; +; SM90-FTZ-NEXT: ret; +; ; SM90-LABEL: test_maximum( ; SM90: { ; SM90-NEXT: .reg .b16 %rs<4>; @@ -1430,6 +1674,17 @@ define bfloat @test_maxnum(bfloat %a, bfloat %b) { ; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs3; ; SM80-FTZ-NEXT: ret; ; +; SM90-FTZ-LABEL: test_maxnum( +; SM90-FTZ: { +; SM90-FTZ-NEXT: .reg .b16 %rs<4>; +; SM90-FTZ-EMPTY: +; SM90-FTZ-NEXT: // %bb.0: +; SM90-FTZ-NEXT: ld.param.b16 %rs1, [test_maxnum_param_0]; +; SM90-FTZ-NEXT: ld.param.b16 %rs2, [test_maxnum_param_1]; +; SM90-FTZ-NEXT: max.bf16 %rs3, %rs1, %rs2; +; SM90-FTZ-NEXT: st.param.b16 [func_retval0], %rs3; +; SM90-FTZ-NEXT: ret; +; ; SM90-LABEL: test_maxnum( 
; SM90: { ; SM90-NEXT: .reg .b16 %rs<4>; @@ -1511,6 +1766,17 @@ define <2 x bfloat> @test_maximum_v2(<2 x bfloat> %a, <2 x bfloat> %b) { ; SM80-FTZ-NEXT: st.param.b32 [func_retval0], %r3; ; SM80-FTZ-NEXT: ret; ; +; SM90-FTZ-LABEL: test_maximum_v2( +; SM90-FTZ: { +; SM90-FTZ-NEXT: .reg .b32 %r<4>; +; SM90-FTZ-EMPTY: +; SM90-FTZ-NEXT: // %bb.0: +; SM90-FTZ-NEXT: ld.param.b32 %r1, [test_maximum_v2_param_0]; +; SM90-FTZ-NEXT: ld.param.b32 %r2, [test_maximum_v2_param_1]; +; SM90-FTZ-NEXT: max.NaN.bf16x2 %r3, %r1, %r2; +; SM90-FTZ-NEXT: st.param.b32 [func_retval0], %r3; +; SM90-FTZ-NEXT: ret; +; ; SM90-LABEL: test_maximum_v2( ; SM90: { ; SM90-NEXT: .reg .b32 %r<4>; @@ -1583,6 +1849,17 @@ define <2 x bfloat> @test_maxnum_v2(<2 x bfloat> %a, <2 x bfloat> %b) { ; SM80-FTZ-NEXT: st.param.b32 [func_retval0], %r3; ; SM80-FTZ-NEXT: ret; ; +; SM90-FTZ-LABEL: test_maxnum_v2( +; SM90-FTZ: { +; SM90-FTZ-NEXT: .reg .b32 %r<4>; +; SM90-FTZ-EMPTY: +; SM90-FTZ-NEXT: // %bb.0: +; SM90-FTZ-NEXT: ld.param.b32 %r1, [test_maxnum_v2_param_0]; +; SM90-FTZ-NEXT: ld.param.b32 %r2, [test_maxnum_v2_param_1]; +; SM90-FTZ-NEXT: max.bf16x2 %r3, %r1, %r2; +; SM90-FTZ-NEXT: st.param.b32 [func_retval0], %r3; +; SM90-FTZ-NEXT: ret; +; ; SM90-LABEL: test_maxnum_v2( ; SM90: { ; SM90-NEXT: .reg .b32 %r<4>; From 92a1eb37122fa24e3045fbabdea2bf87127cace5 Mon Sep 17 00:00:00 2001 From: Matt Arsenault <Matthew.Arsenault@amd.com> Date: Tue, 4 Nov 2025 11:07:57 -0800 Subject: [PATCH 216/313] AArch64: Regenerate cost model tests Broken by 831e79adff4506a0b22a770dcaa46bf5a37257cb, though presubmit was somehow green. --- llvm/test/Analysis/CostModel/AArch64/sincos.ll | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/llvm/test/Analysis/CostModel/AArch64/sincos.ll b/llvm/test/Analysis/CostModel/AArch64/sincos.ll index 72c8f2bbbf8cf..48537f6012dd5 100644 --- a/llvm/test/Analysis/CostModel/AArch64/sincos.ll +++ b/llvm/test/Analysis/CostModel/AArch64/sincos.ll @@ -38,14 +38,14 @@ define void @sincos() { ; ; SINCOS_STRET-LABEL: 'sincos' ; SINCOS_STRET: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call { half, half } @llvm.sincos.f16(half poison) -; SINCOS_STRET: Cost Model: Found an estimated cost of 2 for instruction: %f32 = call { float, float } @llvm.sincos.f32(float poison) -; SINCOS_STRET: Cost Model: Found an estimated cost of 2 for instruction: %f64 = call { double, double } @llvm.sincos.f64(double poison) +; SINCOS_STRET: Cost Model: Found an estimated cost of 10 for instruction: %f32 = call { float, float } @llvm.sincos.f32(float poison) +; SINCOS_STRET: Cost Model: Found an estimated cost of 10 for instruction: %f64 = call { double, double } @llvm.sincos.f64(double poison) ; SINCOS_STRET: Cost Model: Found an estimated cost of 10 for instruction: %f128 = call { fp128, fp128 } @llvm.sincos.f128(fp128 poison) ; SINCOS_STRET: Cost Model: Found an estimated cost of 36 for instruction: %v8f16 = call { <8 x half>, <8 x half> } @llvm.sincos.v8f16(<8 x half> poison) -; SINCOS_STRET: Cost Model: Found an estimated cost of 20 for instruction: %v4f32 = call { <4 x float>, <4 x float> } @llvm.sincos.v4f32(<4 x float> poison) -; SINCOS_STRET: Cost Model: Found an estimated cost of 8 for instruction: %v2f64 = call { <2 x double>, <2 x double> } @llvm.sincos.v2f64(<2 x double> poison) +; SINCOS_STRET: Cost Model: Found an estimated cost of 52 for instruction: %v4f32 = call { <4 x float>, <4 x float> } @llvm.sincos.v4f32(<4 x float> poison) +; SINCOS_STRET: Cost Model: Found an estimated cost of 24 for 
instruction: %v2f64 = call { <2 x double>, <2 x double> } @llvm.sincos.v2f64(<2 x double> poison) ; SINCOS_STRET: Cost Model: Found an estimated cost of 10 for instruction: %v1f128 = call { <1 x fp128>, <1 x fp128> } @llvm.sincos.v1f128(<1 x fp128> poison) -; SINCOS_STRET: Cost Model: Found an estimated cost of 40 for instruction: %v8f32 = call { <8 x float>, <8 x float> } @llvm.sincos.v8f32(<8 x float> poison) +; SINCOS_STRET: Cost Model: Found an estimated cost of 104 for instruction: %v8f32 = call { <8 x float>, <8 x float> } @llvm.sincos.v8f32(<8 x float> poison) ; SINCOS_STRET: Cost Model: Invalid cost for instruction: %nxv8f16 = call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.sincos.nxv8f16(<vscale x 8 x half> poison) ; SINCOS_STRET: Cost Model: Invalid cost for instruction: %nxv4f32 = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.sincos.nxv4f32(<vscale x 4 x float> poison) ; SINCOS_STRET: Cost Model: Invalid cost for instruction: %nxv2f64 = call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.sincos.nxv2f64(<vscale x 2 x double> poison) From dc94f2cbadfd192fe3d43bd00fd5a1d0ead5ab8d Mon Sep 17 00:00:00 2001 From: Robert Imschweiler <robert.imschweiler@amd.com> Date: Tue, 4 Nov 2025 20:15:47 +0100 Subject: [PATCH 217/313] [Offload] Add device UID (#164391) Introduced in OpenMP 6.0, the device UID shall be a unique identifier of a device on a given system. (Not necessarily a UUID.) Since it is not guaranteed that the (U)UIDs defined by the device vendor libraries, such as HSA, do not overlap with those of other vendors, the device UIDs in offload are always combined with the offload plugin name. In case the vendor library does not specify any device UID for a given device, we fall back to the offload-internal device ID. The device UID can be retrieved using the `llvm-offload-device-info` tool. 
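As a usage illustration only (a minimal sketch, not part of this patch):
querying the new OL_DEVICE_INFO_UID follows the same two-step
size-then-value pattern as the other string queries, which the unit tests
below exercise. The header name and the OL_SUCCESS comparison here are
assumptions, not something this patch prescribes.

  // Minimal sketch: read a device's UID through the liboffload C API.
  #include <OffloadAPI.h>

  #include <cstdio>
  #include <vector>

  bool printDeviceUid(ol_device_handle_t Device) {
    size_t Size = 0;
    if (olGetDeviceInfoSize(Device, OL_DEVICE_INFO_UID, &Size) != OL_SUCCESS)
      return false;
    std::vector<char> Uid(Size); // Size includes the terminating NUL.
    if (olGetDeviceInfo(Device, OL_DEVICE_INFO_UID, Size, Uid.data()) !=
        OL_SUCCESS)
      return false;
    // E.g. "amdgpu-GPU-<hex>" or "cuda-<hex>"; the host device reports "HOST".
    std::printf("UID: %s\n", Uid.data());
    return true;
  }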
--- offload/liboffload/API/Device.td | 1 + offload/liboffload/src/OffloadImpl.cpp | 7 +++++-- .../amdgpu/dynamic_hsa/hsa_ext_amd.h | 1 + offload/plugins-nextgen/amdgpu/src/rtl.cpp | 14 +++++++++++++ .../common/include/PluginInterface.h | 19 ++++++++++++++++- .../common/src/PluginInterface.cpp | 21 ++++++++++++++++--- .../cuda/dynamic_cuda/cuda.cpp | 1 + .../plugins-nextgen/cuda/dynamic_cuda/cuda.h | 4 ++++ offload/plugins-nextgen/cuda/src/rtl.cpp | 7 +++++++ .../deviceinfo/llvm-offload-device-info.cpp | 1 + .../OffloadAPI/device/olGetDeviceInfo.cpp | 20 ++++++++++++++++++ .../OffloadAPI/device/olGetDeviceInfoSize.cpp | 1 + 12 files changed, 91 insertions(+), 6 deletions(-) diff --git a/offload/liboffload/API/Device.td b/offload/liboffload/API/Device.td index 5b54c79d83f9d..e9c154818c4a1 100644 --- a/offload/liboffload/API/Device.td +++ b/offload/liboffload/API/Device.td @@ -29,6 +29,7 @@ def ol_device_info_t : Enum { TaggedEtor<"PLATFORM", "ol_platform_handle_t", "the platform associated with the device">, TaggedEtor<"NAME", "char[]", "Device name">, TaggedEtor<"PRODUCT_NAME", "char[]", "Device user-facing marketing name">, + TaggedEtor<"UID", "char[]", "Device UID">, TaggedEtor<"VENDOR", "char[]", "Device vendor">, TaggedEtor<"DRIVER_VERSION", "char[]", "Driver version">, TaggedEtor<"MAX_WORK_GROUP_SIZE", "uint32_t", "Maximum total work group size in work items">, diff --git a/offload/liboffload/src/OffloadImpl.cpp b/offload/liboffload/src/OffloadImpl.cpp index 6d22faeb0e57e..84bc414396811 100644 --- a/offload/liboffload/src/OffloadImpl.cpp +++ b/offload/liboffload/src/OffloadImpl.cpp @@ -147,8 +147,8 @@ llvm::Error ol_platform_impl_t::init() { if (llvm::Error Err = Plugin->initDevice(Id)) return Err; - auto Device = &Plugin->getDevice(Id); - auto Info = Device->obtainInfoImpl(); + GenericDeviceTy *Device = &Plugin->getDevice(Id); + llvm::Expected<InfoTreeNode> Info = Device->obtainInfo(); if (llvm::Error Err = Info.takeError()) return Err; Devices.emplace_back(std::make_unique<ol_device_impl_t>(Id, Device, *this, @@ -467,6 +467,7 @@ Error olGetDeviceInfoImplDetail(ol_device_handle_t Device, switch (PropName) { case OL_DEVICE_INFO_NAME: case OL_DEVICE_INFO_PRODUCT_NAME: + case OL_DEVICE_INFO_UID: case OL_DEVICE_INFO_VENDOR: case OL_DEVICE_INFO_DRIVER_VERSION: { // String values @@ -544,6 +545,8 @@ Error olGetDeviceInfoImplDetailHost(ol_device_handle_t Device, return Info.writeString("Virtual Host Device"); case OL_DEVICE_INFO_PRODUCT_NAME: return Info.writeString("Virtual Host Device"); + case OL_DEVICE_INFO_UID: + return Info.writeString(GenericPluginTy::getHostDeviceUid()); case OL_DEVICE_INFO_VENDOR: return Info.writeString("Liboffload"); case OL_DEVICE_INFO_DRIVER_VERSION: diff --git a/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa_ext_amd.h b/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa_ext_amd.h index 29cfe78082dbb..ddfa65c76cf2d 100644 --- a/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa_ext_amd.h +++ b/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa_ext_amd.h @@ -72,6 +72,7 @@ typedef enum hsa_amd_agent_info_s { HSA_AMD_AGENT_INFO_MAX_WAVES_PER_CU = 0xA00A, HSA_AMD_AGENT_INFO_NUM_SIMDS_PER_CU = 0xA00B, HSA_AMD_AGENT_INFO_COOPERATIVE_QUEUES = 0xA010, + HSA_AMD_AGENT_INFO_UUID = 0xA011, HSA_AMD_AGENT_INFO_TIMESTAMP_FREQUENCY = 0xA016, } hsa_amd_agent_info_t; diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp index 0b03ef534d273..928c6cd7569e3 100644 --- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp +++ 
b/offload/plugins-nextgen/amdgpu/src/rtl.cpp @@ -2083,6 +2083,20 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { return Err; ComputeUnitKind = GPUName; + // From the ROCm HSA documentation: + // Query the UUID of the agent. The value is an Ascii string with a maximum + // of 21 chars including NUL. The string value consists of two parts: header + // and body. The header identifies the device type (GPU, CPU, DSP) while the + // body encodes the UUID as a 16 digit hex string. + // + // Agents that do not support UUID will return the string "GPU-XX" or + // "CPU-XX" or "DSP-XX" depending on their device type. + char UUID[24] = {0}; + if (auto Err = getDeviceAttr(HSA_AMD_AGENT_INFO_UUID, UUID)) + return Err; + if (!StringRef(UUID).ends_with("-XX")) + setDeviceUidFromVendorUid(UUID); + // Get the wavefront size. uint32_t WavefrontSize = 0; if (auto Err = getDeviceAttr(HSA_AGENT_INFO_WAVEFRONT_SIZE, WavefrontSize)) diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h index f9bff9abd903c..f9dcdea7213fd 100644 --- a/offload/plugins-nextgen/common/include/PluginInterface.h +++ b/offload/plugins-nextgen/common/include/PluginInterface.h @@ -791,6 +791,9 @@ struct GenericDeviceTy : public DeviceAllocatorTy { /// this id is not unique between different plugins; they may overlap. int32_t getDeviceId() const { return DeviceId; } + /// Get the unique identifier of the device. + const char *getDeviceUid() const { return DeviceUid.c_str(); } + /// Set the context of the device if needed, before calling device-specific /// functions. Plugins may implement this function as a no-op if not needed. virtual Error setContext() = 0; @@ -989,9 +992,12 @@ struct GenericDeviceTy : public DeviceAllocatorTy { Error syncEvent(void *EventPtr); virtual Error syncEventImpl(void *EventPtr) = 0; + /// Obtain information about the device. + Expected<InfoTreeNode> obtainInfo(); + virtual Expected<InfoTreeNode> obtainInfoImpl() = 0; + /// Print information about the device. Error printInfo(); - virtual Expected<InfoTreeNode> obtainInfoImpl() = 0; /// Return true if the device has work that is either queued or currently /// running @@ -1204,6 +1210,14 @@ struct GenericDeviceTy : public DeviceAllocatorTy { /// global device id and is not the device id visible to the OpenMP user. const int32_t DeviceId; + /// The unique identifier of the device. + /// Per default, the unique identifier of the device is set to the device id, + /// combined with the plugin name, since the offload device id may overlap + /// between different plugins. + std::string DeviceUid; + /// Construct the device UID from the vendor (U)UID. + void setDeviceUidFromVendorUid(StringRef VendorUid); + /// The default grid values used for this device. llvm::omp::GV GridValues; @@ -1290,6 +1304,9 @@ struct GenericPluginTy { return UserDeviceIds.at(DeviceId); } + /// Get the UID for the host device. + static constexpr const char *getHostDeviceUid() { return "HOST"; } + /// Get the ELF code to recognize the binary image of this plugin. 
virtual uint16_t getMagicElfBits() const = 0; diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp index 36d643b65922d..d7e5a21600abf 100644 --- a/offload/plugins-nextgen/common/src/PluginInterface.cpp +++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp @@ -715,6 +715,10 @@ GenericDeviceTy::GenericDeviceTy(GenericPluginTy &Plugin, int32_t DeviceId, DeviceId(DeviceId), GridValues(OMPGridValues), PeerAccesses(NumDevices, PeerAccessState::PENDING), PeerAccessesLock(), PinnedAllocs(*this), RPCServer(nullptr) { + // Conservative fall-back to the plugin's device uid for the case that no real + // vendor (u)uid will become available later. + setDeviceUidFromVendorUid(std::to_string(static_cast<uint64_t>(DeviceId))); + #ifdef OMPT_SUPPORT OmptInitialized.store(false); // Bind the callbacks to this device's member functions @@ -1524,15 +1528,22 @@ Error GenericDeviceTy::enqueueHostCall(void (*Callback)(void *), void *UserData, return Err; } +Expected<InfoTreeNode> GenericDeviceTy::obtainInfo() { + auto InfoOrErr = obtainInfoImpl(); + if (InfoOrErr) + InfoOrErr->add("UID", getDeviceUid(), "", DeviceInfo::UID); + return InfoOrErr; +} + Error GenericDeviceTy::printInfo() { - auto Info = obtainInfoImpl(); + auto InfoOrErr = obtainInfo(); // Get the vendor-specific info entries describing the device properties. - if (auto Err = Info.takeError()) + if (auto Err = InfoOrErr.takeError()) return Err; // Print all info entries. - Info->print(); + InfoOrErr->print(); return Plugin::success(); } @@ -1603,6 +1614,10 @@ Expected<bool> GenericDeviceTy::isAccessiblePtr(const void *Ptr, size_t Size) { return isAccessiblePtrImpl(Ptr, Size); } +void GenericDeviceTy::setDeviceUidFromVendorUid(StringRef VendorUid) { + DeviceUid = std::string(Plugin.getName()) + "-" + std::string(VendorUid); +} + Error GenericPluginTy::init() { if (Initialized) return Plugin::success(); diff --git a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp index f5b2d074a47e7..e7a1ca38b3c13 100644 --- a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp +++ b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp @@ -35,6 +35,7 @@ DLWRAP(cuFuncSetAttribute, 3) // Device info DLWRAP(cuDeviceGetName, 3) +DLWRAP(cuDeviceGetUuid, 2) DLWRAP(cuDeviceTotalMem, 2) DLWRAP(cuDriverGetVersion, 1) diff --git a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h index dec4e33508c62..a470d6df1079d 100644 --- a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h +++ b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h @@ -33,6 +33,9 @@ typedef struct CUfunc_st *CUfunction; typedef void (*CUhostFn)(void *userData); typedef struct CUstream_st *CUstream; typedef struct CUevent_st *CUevent; +typedef struct CUuuid_st { + char bytes[16]; +} CUuuid; #define CU_DEVICE_INVALID ((CUdevice)(-2)) @@ -301,6 +304,7 @@ CUresult cuFuncSetAttribute(CUfunction, CUfunction_attribute, int); // Device info CUresult cuDeviceGetName(char *, int, CUdevice); +CUresult cuDeviceGetUuid(CUuuid *, CUdevice); CUresult cuDeviceTotalMem(size_t *, CUdevice); CUresult cuDriverGetVersion(int *); diff --git a/offload/plugins-nextgen/cuda/src/rtl.cpp b/offload/plugins-nextgen/cuda/src/rtl.cpp index db94f7f2dd995..a9adcc397fb7b 100644 --- a/offload/plugins-nextgen/cuda/src/rtl.cpp +++ b/offload/plugins-nextgen/cuda/src/rtl.cpp @@ -25,6 +25,7 @@ #include "PluginInterface.h" #include "Utils/ELF.h" +#include 
"llvm/ADT/StringExtras.h" #include "llvm/BinaryFormat/ELF.h" #include "llvm/Frontend/OpenMP/OMPConstants.h" #include "llvm/Frontend/OpenMP/OMPGridValues.h" @@ -293,6 +294,12 @@ struct CUDADeviceTy : public GenericDeviceTy { if (auto Err = Plugin::check(Res, "error in cuDeviceGet: %s")) return Err; + CUuuid UUID = {0}; + Res = cuDeviceGetUuid(&UUID, Device); + if (auto Err = Plugin::check(Res, "error in cuDeviceGetUuid: %s")) + return Err; + setDeviceUidFromVendorUid(toHex(UUID.bytes, true)); + // Query the current flags of the primary context and set its flags if // it is inactive. unsigned int FormerPrimaryCtxFlags = 0; diff --git a/offload/tools/deviceinfo/llvm-offload-device-info.cpp b/offload/tools/deviceinfo/llvm-offload-device-info.cpp index 9b58d67f017ca..42ffb97d6d77c 100644 --- a/offload/tools/deviceinfo/llvm-offload-device-info.cpp +++ b/offload/tools/deviceinfo/llvm-offload-device-info.cpp @@ -176,6 +176,7 @@ ol_result_t printDevice(std::ostream &S, ol_device_handle_t D) { printDeviceValue<const char *>(S, D, OL_DEVICE_INFO_NAME, "Name")); OFFLOAD_ERR(printDeviceValue<const char *>(S, D, OL_DEVICE_INFO_PRODUCT_NAME, "Product Name")); + OFFLOAD_ERR(printDeviceValue<const char *>(S, D, OL_DEVICE_INFO_UID, "UID")); OFFLOAD_ERR( printDeviceValue<ol_device_type_t>(S, D, OL_DEVICE_INFO_TYPE, "Type")); OFFLOAD_ERR(printDeviceValue<const char *>( diff --git a/offload/unittests/OffloadAPI/device/olGetDeviceInfo.cpp b/offload/unittests/OffloadAPI/device/olGetDeviceInfo.cpp index 8cb0b8065c33e..30eafee026316 100644 --- a/offload/unittests/OffloadAPI/device/olGetDeviceInfo.cpp +++ b/offload/unittests/OffloadAPI/device/olGetDeviceInfo.cpp @@ -98,6 +98,16 @@ TEST_P(olGetDeviceInfoTest, SuccessProductName) { ASSERT_EQ(std::strlen(Name.data()), Size - 1); } +TEST_P(olGetDeviceInfoTest, SuccessUID) { + size_t Size = 0; + ASSERT_SUCCESS(olGetDeviceInfoSize(Device, OL_DEVICE_INFO_UID, &Size)); + ASSERT_GT(Size, 0ul); + std::vector<char> UID; + UID.resize(Size); + ASSERT_SUCCESS(olGetDeviceInfo(Device, OL_DEVICE_INFO_UID, Size, UID.data())); + ASSERT_EQ(std::strlen(UID.data()), Size - 1); +} + TEST_P(olGetDeviceInfoTest, HostProductName) { size_t Size = 0; ASSERT_SUCCESS(olGetDeviceInfoSize(Host, OL_DEVICE_INFO_PRODUCT_NAME, &Size)); @@ -109,6 +119,16 @@ TEST_P(olGetDeviceInfoTest, HostProductName) { ASSERT_EQ(std::strlen(Name.data()), Size - 1); } +TEST_P(olGetDeviceInfoTest, HostUID) { + size_t Size = 0; + ASSERT_SUCCESS(olGetDeviceInfoSize(Host, OL_DEVICE_INFO_UID, &Size)); + ASSERT_GT(Size, 0ul); + std::vector<char> UID; + UID.resize(Size); + ASSERT_SUCCESS(olGetDeviceInfo(Host, OL_DEVICE_INFO_UID, Size, UID.data())); + ASSERT_EQ(std::strlen(UID.data()), Size - 1); +} + TEST_P(olGetDeviceInfoTest, SuccessVendor) { size_t Size = 0; ASSERT_SUCCESS(olGetDeviceInfoSize(Device, OL_DEVICE_INFO_VENDOR, &Size)); diff --git a/offload/unittests/OffloadAPI/device/olGetDeviceInfoSize.cpp b/offload/unittests/OffloadAPI/device/olGetDeviceInfoSize.cpp index c4a3c2d5e3c75..79a18c1d133dc 100644 --- a/offload/unittests/OffloadAPI/device/olGetDeviceInfoSize.cpp +++ b/offload/unittests/OffloadAPI/device/olGetDeviceInfoSize.cpp @@ -32,6 +32,7 @@ OL_DEVICE_INFO_SIZE_TEST_EQ(Platform, ol_platform_handle_t, OL_DEVICE_INFO_PLATFORM); OL_DEVICE_INFO_SIZE_TEST_NONZERO(Name, OL_DEVICE_INFO_NAME); OL_DEVICE_INFO_SIZE_TEST_NONZERO(ProductName, OL_DEVICE_INFO_PRODUCT_NAME); +OL_DEVICE_INFO_SIZE_TEST_NONZERO(UID, OL_DEVICE_INFO_UID); OL_DEVICE_INFO_SIZE_TEST_NONZERO(Vendor, OL_DEVICE_INFO_VENDOR); 
OL_DEVICE_INFO_SIZE_TEST_NONZERO(DriverVersion, OL_DEVICE_INFO_DRIVER_VERSION);
 OL_DEVICE_INFO_SIZE_TEST_EQ(MaxWorkGroupSize, uint32_t,

From e5f191e171720b413f83bff13b61d500369f560d Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Tue, 4 Nov 2025 11:21:32 -0800
Subject: [PATCH 218/313] DeclareRuntimeLibcalls: Add registered target checks
 to test run lines

---
 .../Util/DeclareRuntimeLibcalls/sincos_stret.ll    | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/llvm/test/Transforms/Util/DeclareRuntimeLibcalls/sincos_stret.ll b/llvm/test/Transforms/Util/DeclareRuntimeLibcalls/sincos_stret.ll
index 0d0e3da25eea7..f0f09e97d9dba 100644
--- a/llvm/test/Transforms/Util/DeclareRuntimeLibcalls/sincos_stret.ll
+++ b/llvm/test/Transforms/Util/DeclareRuntimeLibcalls/sincos_stret.ll
@@ -1,11 +1,11 @@
-; RUN: opt -S -passes=declare-runtime-libcalls -mtriple=x86_64-apple-macos10.9 < %s | FileCheck -check-prefixes=CHECK,X64 %s
-; RUN: opt -S -passes=declare-runtime-libcalls -mtriple=arm64-apple-macos10.9 < %s | FileCheck -check-prefixes=CHECK,STRUCT %s
-; RUN: opt -S -passes=declare-runtime-libcalls -mtriple=thumbv7k-apple-watchos2.0 < %s | FileCheck -check-prefixes=CHECK,STRUCT %s
-; RUN: opt -S -passes=declare-runtime-libcalls -mtriple=armv7-apple-ios7 < %s | FileCheck -check-prefix=SRET %s
-; RUN: opt -S -passes=declare-runtime-libcalls -mtriple=thumbv7-apple-ios7 < %s | FileCheck -check-prefix=SRET %s
+; RUN: %if x86-registered-target %{ opt -S -passes=declare-runtime-libcalls -mtriple=x86_64-apple-macos10.9 < %s | FileCheck -check-prefixes=CHECK,X64 %s %}
+; RUN: %if aarch64-registered-target %{ opt -S -passes=declare-runtime-libcalls -mtriple=arm64-apple-macos10.9 < %s | FileCheck -check-prefixes=CHECK,STRUCT %s %}
+; RUN: %if arm-registered-target %{ opt -S -passes=declare-runtime-libcalls -mtriple=thumbv7k-apple-watchos2.0 < %s | FileCheck -check-prefixes=CHECK,STRUCT %s %}
+; RUN: %if arm-registered-target %{ opt -S -passes=declare-runtime-libcalls -mtriple=armv7-apple-ios7 < %s | FileCheck -check-prefix=SRET %s %}
+; RUN: %if arm-registered-target %{ opt -S -passes=declare-runtime-libcalls -mtriple=thumbv7-apple-ios7 < %s | FileCheck -check-prefix=SRET %s %}
 
-; RUN: opt -S -passes=declare-runtime-libcalls -mtriple=armv7-apple-ios6 < %s | FileCheck -check-prefix=NONE %s
-; RUN: opt -S -passes=declare-runtime-libcalls -mtriple=x86_64-apple-macos10.8 < %s | FileCheck -check-prefix=NONE %s
+; RUN: %if arm-registered-target %{ opt -S -passes=declare-runtime-libcalls -mtriple=armv7-apple-ios6 < %s | FileCheck -check-prefix=NONE %s %}
+; RUN: %if x86-registered-target %{ opt -S -passes=declare-runtime-libcalls -mtriple=x86_64-apple-macos10.8 < %s | FileCheck -check-prefix=NONE %s %}
 
 ; X64: declare <2 x float> @__sincosf_stret(float) [[SINCOS_ATTRS:#[0-9]+]]
 ; X64: declare { double, double } @__sincos_stret(double) [[SINCOS_ATTRS:#[0-9]+]]

From 5be12e1c95e897a9da713d49255868eea7ad60fa Mon Sep 17 00:00:00 2001
From: nerix <nerixdev@outlook.de>
Date: Tue, 4 Nov 2025 20:23:33 +0100
Subject: [PATCH 219/313] [LLDB] Run API tests with PDB too (#149305)

From https://github.com/llvm/llvm-project/pull/148554#issuecomment-3083261858 -
this adds an option for API tests to be run with both PDB readers on
Windows. As there are a lot of failures with PDB, this is an opt-in per
test.

To get PDB, `-g -gcodeview` has to be used on Clang. `-gcodeview` alone
isn't enough, because it won't cause clang to pass `-debug` to the
linker.
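For illustration, opting a test in is a single class attribute. This is a
sketch mirroring the TestPdb.py added below; the class name is made up:

  # The "pdb" debug-info variant is only generated on Windows hosts.
  from lldbsuite.test.lldbtest import TestBase

  class TestSomethingWithPdb(TestBase):
      TEST_WITH_PDB_DEBUG_INFO = True  # also replicate this test for PDB

      def test(self):
          self.build()  # the pdb variant builds with -g -gcodeview
          if self.getDebugInfo() == "pdb":
              pass  # PDB-specific expectations go here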
#149498 tracks the (currently) failing tests. --- .../Python/lldbsuite/test/builders/builder.py | 1 + .../packages/Python/lldbsuite/test/lldbtest.py | 12 ++++++++++++ .../Python/lldbsuite/test/make/Makefile.rules | 4 ++++ .../Python/lldbsuite/test/test_categories.py | 11 ++++++++++- lldb/test/API/test_utils/pdb/Makefile | 3 +++ lldb/test/API/test_utils/pdb/TestPdb.py | 18 ++++++++++++++++++ lldb/test/API/test_utils/pdb/main.cpp | 1 + 7 files changed, 49 insertions(+), 1 deletion(-) create mode 100644 lldb/test/API/test_utils/pdb/Makefile create mode 100644 lldb/test/API/test_utils/pdb/TestPdb.py create mode 100644 lldb/test/API/test_utils/pdb/main.cpp diff --git a/lldb/packages/Python/lldbsuite/test/builders/builder.py b/lldb/packages/Python/lldbsuite/test/builders/builder.py index 96c7b3987d8a1..024c9f1c7e435 100644 --- a/lldb/packages/Python/lldbsuite/test/builders/builder.py +++ b/lldb/packages/Python/lldbsuite/test/builders/builder.py @@ -258,6 +258,7 @@ def _getDebugInfoArgs(self, debug_info): "gmodules": {"MAKE_DSYM": "NO", "MAKE_GMODULES": "YES"}, "debug_names": {"MAKE_DEBUG_NAMES": "YES"}, "dwp": {"MAKE_DSYM": "NO", "MAKE_DWP": "YES"}, + "pdb": {"MAKE_PDB": "YES"}, } # Collect all flags, with later options overriding earlier ones diff --git a/lldb/packages/Python/lldbsuite/test/lldbtest.py b/lldb/packages/Python/lldbsuite/test/lldbtest.py index b92de941c4124..8c1eea97620e2 100644 --- a/lldb/packages/Python/lldbsuite/test/lldbtest.py +++ b/lldb/packages/Python/lldbsuite/test/lldbtest.py @@ -1791,6 +1791,11 @@ def no_reason(_): if can_replicate ] + # PDB is off by default, because it has a lot of failures right now. + # See llvm.org/pr149498 + if original_testcase.TEST_WITH_PDB_DEBUG_INFO: + dbginfo_categories.append("pdb") + xfail_for_debug_info_cat_fn = getattr( attrvalue, "__xfail_for_debug_info_cat_fn__", no_reason ) @@ -1878,6 +1883,13 @@ class TestBase(Base, metaclass=LLDBTestCaseFactory): # test multiple times with various debug info types. NO_DEBUG_INFO_TESTCASE = False + TEST_WITH_PDB_DEBUG_INFO = False + """ + Subclasses can set this to True to test with PDB in addition to the other debug info + types. This is off by default because many tests will fail due to missing functionality in PDB. + See llvm.org/pr149498. + """ + def generateSource(self, source): template = source + ".template" temp = os.path.join(self.getSourceDir(), template) diff --git a/lldb/packages/Python/lldbsuite/test/make/Makefile.rules b/lldb/packages/Python/lldbsuite/test/make/Makefile.rules index 28cae54776ac8..b3822db162a0b 100644 --- a/lldb/packages/Python/lldbsuite/test/make/Makefile.rules +++ b/lldb/packages/Python/lldbsuite/test/make/Makefile.rules @@ -249,6 +249,10 @@ ifeq ($(CC_TYPE), clang) MODULE_DEBUG_INFO_FLAGS += -gmodules endif +ifeq "$(MAKE_PDB)" "YES" + DEBUG_INFO_FLAG ?= -g -gcodeview +endif + # If the OS is Windows, we need to pass -gdwarf to clang, otherwise it will build # with codeview by default but all the tests rely on dwarf.
ifeq "$(OS)" "Windows_NT" diff --git a/lldb/packages/Python/lldbsuite/test/test_categories.py b/lldb/packages/Python/lldbsuite/test/test_categories.py index 1f6e8a78e0c0d..b8a764fb3349a 100644 --- a/lldb/packages/Python/lldbsuite/test/test_categories.py +++ b/lldb/packages/Python/lldbsuite/test/test_categories.py @@ -12,7 +12,13 @@ # Key: Category name # Value: should be used in lldbtest's debug-info replication -debug_info_categories = {"dwarf": True, "dwo": True, "dsym": True, "gmodules": False} +debug_info_categories = { + "dwarf": True, + "dwo": True, + "dsym": True, + "pdb": False, + "gmodules": False, +} all_categories = { "basic_process": "Basic process execution sniff tests.", @@ -34,6 +40,7 @@ "lldb-dap": "Tests for the Debug Adapter Protocol with lldb-dap", "llgs": "Tests for the gdb-server functionality of lldb-server", "msvcstl": "Test for MSVC STL data formatters", + "pdb": "Tests that can be run with PDB debug information", "pexpect": "Tests requiring the pexpect library to be available", "objc": "Tests related to the Objective-C programming language support", "pyapi": "Tests related to the Python API", @@ -65,6 +72,8 @@ def is_supported_on_platform(category, platform, compiler_path): if platform not in ["darwin", "macosx", "ios", "watchos", "tvos", "bridgeos"]: return False return gmodules.is_compiler_clang_with_gmodules(compiler_path) + elif category == "pdb": + return platform == "windows" return True diff --git a/lldb/test/API/test_utils/pdb/Makefile b/lldb/test/API/test_utils/pdb/Makefile new file mode 100644 index 0000000000000..99998b20bcb05 --- /dev/null +++ b/lldb/test/API/test_utils/pdb/Makefile @@ -0,0 +1,3 @@ +CXX_SOURCES := main.cpp + +include Makefile.rules diff --git a/lldb/test/API/test_utils/pdb/TestPdb.py b/lldb/test/API/test_utils/pdb/TestPdb.py new file mode 100644 index 0000000000000..bd3a9d0c34ab3 --- /dev/null +++ b/lldb/test/API/test_utils/pdb/TestPdb.py @@ -0,0 +1,18 @@ +""" +Test PDB enabled tests +""" + +from lldbsuite.test.decorators import * +from lldbsuite.test.lldbtest import * + + +class TestBuildMethod(TestBase): + TEST_WITH_PDB_DEBUG_INFO = True + + def test(self): + self.build() + self.assertTrue(self.dbg.CreateTarget(self.getBuildArtifact())) + if self.getDebugInfo() == "pdb": + self.expect( + "target modules dump symfile", patterns=["SymbolFile (native-)?pdb"] + ) diff --git a/lldb/test/API/test_utils/pdb/main.cpp b/lldb/test/API/test_utils/pdb/main.cpp new file mode 100644 index 0000000000000..76e8197013aab --- /dev/null +++ b/lldb/test/API/test_utils/pdb/main.cpp @@ -0,0 +1 @@ +int main() { return 0; } From 0ae0ac0f004d4028051ba01335b02ebadd2b8b4d Mon Sep 17 00:00:00 2001 From: Ross Burton <ross.burton@arm.com> Date: Tue, 4 Nov 2025 19:26:02 +0000 Subject: [PATCH 220/313] [cmake] Pass PYTHON_EXECUTABLE to native builds (#163574) Ensure that the nested native build uses the same python interpreter as the main build, in case the python that CMake detects first is not the python that the user has specified explicitly. For example, if the person building LLVM wants to use a different python interpreter to build (eg, testing the build with `python3.14` when `python3` is a link to `python3.8`, or the default python doesn't have development headers available) then they could add `-DPYTHON_EXECUTABLE=python3.14` when invoking CMake. This should be forwarded to the native CMake build to ensure that the same python is used. Original fix by Anuj Mittal <anuj.mittal@intel.com>. 
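For illustration, a configure line such as the following (the target, toolchain path, and interpreter version are hypothetical):

    cmake -G Ninja ../llvm \
      -DLLVM_TARGETS_TO_BUILD=AArch64 \
      -DCMAKE_TOOLCHAIN_FILE=/path/to/aarch64-toolchain.cmake \
      -DPYTHON_EXECUTABLE=/usr/bin/python3.14

now hands the same interpreter to the nested native (tablegen) build, instead of letting that build re-detect whichever python CMake finds first.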
--- llvm/cmake/modules/CrossCompile.cmake | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/cmake/modules/CrossCompile.cmake b/llvm/cmake/modules/CrossCompile.cmake index bfbd9cfd4063f..2a69c5133c56f 100644 --- a/llvm/cmake/modules/CrossCompile.cmake +++ b/llvm/cmake/modules/CrossCompile.cmake @@ -101,6 +101,7 @@ function(llvm_create_cross_target project_name target_name toolchain buildtype) -DLLVM_INCLUDE_BENCHMARKS=OFF -DLLVM_INCLUDE_TESTS=OFF -DLLVM_TABLEGEN_FLAGS="${LLVM_TABLEGEN_FLAGS}" + -DPYTHON_EXECUTABLE="${PYTHON_EXECUTABLE}" ${build_type_flags} ${linker_flag} ${external_clang_dir} ${libc_flags} ${ARGN} WORKING_DIRECTORY ${${project_name}_${target_name}_BUILD} From 4776451693f4a6bd18e50106edb4b3cfa766484f Mon Sep 17 00:00:00 2001 From: Aiden Grossman <aidengrossman@google.com> Date: Tue, 4 Nov 2025 11:27:38 -0800 Subject: [PATCH 221/313] [X86][NewPM] Port lower-amx-intrinsics to NewPM Reviewers: paperchalice, phoebewang, arsenm Reviewed By: arsenm Pull Request: https://github.com/llvm/llvm-project/pull/165113 --- llvm/lib/Target/X86/X86.h | 13 ++++- llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp | 48 +++++++++++++++---- llvm/lib/Target/X86/X86PassRegistry.def | 2 +- llvm/lib/Target/X86/X86TargetMachine.cpp | 2 +- .../AMX/amx-low-intrinsics-no-amx-bitcast.ll | 3 +- .../CodeGen/X86/AMX/amx-low-intrinsics.ll | 3 +- 6 files changed, 56 insertions(+), 15 deletions(-) diff --git a/llvm/lib/Target/X86/X86.h b/llvm/lib/Target/X86/X86.h index 51b540a7a51d0..bdb43cfb4adb4 100644 --- a/llvm/lib/Target/X86/X86.h +++ b/llvm/lib/Target/X86/X86.h @@ -179,7 +179,18 @@ FunctionPass *createX86LowerAMXTypeLegacyPass(); /// The pass transforms amx intrinsics to scalar operation if the function has /// optnone attribute or it is O0. -FunctionPass *createX86LowerAMXIntrinsicsPass(); +class X86LowerAMXIntrinsicsPass + : public PassInfoMixin<X86LowerAMXIntrinsicsPass> { +private: + const TargetMachine *TM; + +public: + X86LowerAMXIntrinsicsPass(const TargetMachine *TM) : TM(TM) {} + PreservedAnalyses run(Function &F, FunctionAnalysisManager &FAM); + static bool isRequired() { return true; } +}; + +FunctionPass *createX86LowerAMXIntrinsicsLegacyPass(); InstructionSelector *createX86InstructionSelector(const X86TargetMachine &TM, const X86Subtarget &, diff --git a/llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp b/llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp index 7f3393910da2c..662aec2c15241 100644 --- a/llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp +++ b/llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp @@ -23,12 +23,15 @@ #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/CodeGen/ValueTypes.h" +#include "llvm/IR/Analysis.h" #include "llvm/IR/DataLayout.h" +#include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/IntrinsicsX86.h" +#include "llvm/IR/PassManager.h" #include "llvm/IR/PatternMatch.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" @@ -40,7 +43,7 @@ using namespace llvm; using namespace PatternMatch; -#define DEBUG_TYPE "lower-amx-intrinsics" +#define DEBUG_TYPE "x86-lower-amx-intrinsics" #ifndef NDEBUG static bool isV256I32Ty(Type *Ty) { @@ -626,6 +629,37 @@ bool X86LowerAMXIntrinsics::visit() { return C; } +namespace { +bool shouldRunLowerAMXIntrinsics(const Function &F, const TargetMachine *TM) { + return X86ScalarizeAMX && (F.hasFnAttribute(Attribute::OptimizeNone) || + TM->getOptLevel() == CodeGenOptLevel::None); +} + 
+bool runLowerAMXIntrinsics(Function &F, DominatorTree *DT, LoopInfo *LI) { + DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy); + + X86LowerAMXIntrinsics LAT(F, DTU, LI); + return LAT.visit(); +} +} // namespace + +PreservedAnalyses X86LowerAMXIntrinsicsPass::run(Function &F, + FunctionAnalysisManager &FAM) { + if (!shouldRunLowerAMXIntrinsics(F, TM)) + return PreservedAnalyses::all(); + + DominatorTree &DT = FAM.getResult<DominatorTreeAnalysis>(F); + LoopInfo &LI = FAM.getResult<LoopAnalysis>(F); + bool Changed = runLowerAMXIntrinsics(F, &DT, &LI); + if (!Changed) + return PreservedAnalyses::all(); + + PreservedAnalyses PA = PreservedAnalyses::none(); + PA.preserve<DominatorTreeAnalysis>(); + PA.preserve<LoopAnalysis>(); + return PA; +} + namespace { class X86LowerAMXIntrinsicsLegacyPass : public FunctionPass { public: @@ -634,21 +668,15 @@ class X86LowerAMXIntrinsicsLegacyPass : public FunctionPass { X86LowerAMXIntrinsicsLegacyPass() : FunctionPass(ID) {} bool runOnFunction(Function &F) override { - if (!X86ScalarizeAMX) - return false; TargetMachine *TM = &getAnalysis<TargetPassConfig>().getTM<TargetMachine>(); - if (!F.hasFnAttribute(Attribute::OptimizeNone) && - TM->getOptLevel() != CodeGenOptLevel::None) + if (!shouldRunLowerAMXIntrinsics(F, TM)) return false; auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>(); auto *DT = DTWP ? &DTWP->getDomTree() : nullptr; auto *LIWP = getAnalysisIfAvailable<LoopInfoWrapperPass>(); auto *LI = LIWP ? &LIWP->getLoopInfo() : nullptr; - DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy); - - X86LowerAMXIntrinsics LAT(F, DTU, LI); - return LAT.visit(); + return runLowerAMXIntrinsics(F, DT, LI); } StringRef getPassName() const override { return "Lower AMX intrinsics"; } @@ -668,6 +696,6 @@ INITIALIZE_PASS_DEPENDENCY(TargetPassConfig) INITIALIZE_PASS_END(X86LowerAMXIntrinsicsLegacyPass, DEBUG_TYPE, PassName, false, false) -FunctionPass *llvm::createX86LowerAMXIntrinsicsPass() { +FunctionPass *llvm::createX86LowerAMXIntrinsicsLegacyPass() { return new X86LowerAMXIntrinsicsLegacyPass(); } diff --git a/llvm/lib/Target/X86/X86PassRegistry.def b/llvm/lib/Target/X86/X86PassRegistry.def index fc25d55d3059a..81c98febc4ba8 100644 --- a/llvm/lib/Target/X86/X86PassRegistry.def +++ b/llvm/lib/Target/X86/X86PassRegistry.def @@ -15,13 +15,13 @@ #ifndef FUNCTION_PASS #define FUNCTION_PASS(NAME, CREATE_PASS) #endif +FUNCTION_PASS("x86-lower-amx-intrinsics", X86LowerAMXIntrinsicsPass(this)) FUNCTION_PASS("x86-lower-amx-type", X86LowerAMXTypePass(this)) #undef FUNCTION_PASS #ifndef DUMMY_FUNCTION_PASS #define DUMMY_FUNCTION_PASS(NAME, CREATE_PASS) #endif -DUMMY_FUNCTION_PASS("lower-amx-intrinsics", X86LowerAMXIntrinsics(*this)) DUMMY_FUNCTION_PASS("x86-partial-reduction", X86PartialReduction()) DUMMY_FUNCTION_PASS("x86-winehstate", WinEHStatePass()) #undef DUMMY_FUNCTION_PASS diff --git a/llvm/lib/Target/X86/X86TargetMachine.cpp b/llvm/lib/Target/X86/X86TargetMachine.cpp index 9a76abcd351bf..bf4dab0371b88 100644 --- a/llvm/lib/Target/X86/X86TargetMachine.cpp +++ b/llvm/lib/Target/X86/X86TargetMachine.cpp @@ -422,7 +422,7 @@ void X86PassConfig::addIRPasses() { // We add both pass anyway and when these two passes run, we skip the pass // based on the option level and option attribute. 
- addPass(createX86LowerAMXIntrinsicsPass()); + addPass(createX86LowerAMXIntrinsicsLegacyPass()); addPass(createX86LowerAMXTypeLegacyPass()); TargetPassConfig::addIRPasses(); diff --git a/llvm/test/CodeGen/X86/AMX/amx-low-intrinsics-no-amx-bitcast.ll b/llvm/test/CodeGen/X86/AMX/amx-low-intrinsics-no-amx-bitcast.ll index 87059c5d474e6..6ae7b2260c15c 100644 --- a/llvm/test/CodeGen/X86/AMX/amx-low-intrinsics-no-amx-bitcast.ll +++ b/llvm/test/CodeGen/X86/AMX/amx-low-intrinsics-no-amx-bitcast.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -mtriple=x86_64 -lower-amx-intrinsics -enable-x86-scalar-amx=true %s -S | FileCheck %s +; RUN: opt -mtriple=x86_64 -x86-lower-amx-intrinsics -enable-x86-scalar-amx=true %s -S | FileCheck %s +; RUN: opt -mtriple=x86_64 -passes=x86-lower-amx-intrinsics -enable-x86-scalar-amx=true %s -S | FileCheck %s define dso_local void @test_no_bitcast(ptr %A_mem, ptr %B_mem, ptr %C_mem) local_unnamed_addr #0 { ; CHECK-LABEL: @test_no_bitcast( diff --git a/llvm/test/CodeGen/X86/AMX/amx-low-intrinsics.ll b/llvm/test/CodeGen/X86/AMX/amx-low-intrinsics.ll index 5fb2dcdc1d621..ca7c3573a3294 100644 --- a/llvm/test/CodeGen/X86/AMX/amx-low-intrinsics.ll +++ b/llvm/test/CodeGen/X86/AMX/amx-low-intrinsics.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -mtriple=x86_64 -lower-amx-intrinsics -enable-x86-scalar-amx=true %s -S | FileCheck %s +; RUN: opt -mtriple=x86_64 -x86-lower-amx-intrinsics -enable-x86-scalar-amx=true %s -S | FileCheck %s +; RUN: opt -mtriple=x86_64 -passes=x86-lower-amx-intrinsics -enable-x86-scalar-amx=true %s -S | FileCheck %s define dso_local void @test_amx_load_non_O0(i16 signext %row, i16 signext %col, ptr%ptr, i64 %stride, ptr %vptr) { ; CHECK-LABEL: @test_amx_load_non_O0( From e03fac127c8db8a988c8346295f9862dd59c42fc Mon Sep 17 00:00:00 2001 From: Aiden Grossman <aidengrossman@google.com> Date: Tue, 4 Nov 2025 11:28:27 -0800 Subject: [PATCH 222/313] [X86][NewPM] Port X86PartialReduction to NewPM There are no tests that specifically stop/start at x86-partial-reduction, so no test cases have been updated for this patch. Reviewers: phoebewang, paperchalice, RKSimon, arsenm Reviewed By: arsenm, RKSimon Pull Request: https://github.com/llvm/llvm-project/pull/166048 --- llvm/lib/Target/X86/X86.h | 13 +++- llvm/lib/Target/X86/X86PartialReduction.cpp | 72 +++++++++++++-------- llvm/lib/Target/X86/X86PassRegistry.def | 2 +- llvm/lib/Target/X86/X86TargetMachine.cpp | 4 +- 4 files changed, 60 insertions(+), 31 deletions(-) diff --git a/llvm/lib/Target/X86/X86.h b/llvm/lib/Target/X86/X86.h index bdb43cfb4adb4..fa23656e23fc3 100644 --- a/llvm/lib/Target/X86/X86.h +++ b/llvm/lib/Target/X86/X86.h @@ -158,7 +158,16 @@ FunctionPass *createX86InsertX87waitPass(); /// This pass optimizes arithmetic based on knowledge that is only used by /// a reduction sequence and is therefore safe to reassociate in interesting /// ways. -FunctionPass *createX86PartialReductionPass(); +class X86PartialReductionPass : public PassInfoMixin<X86PartialReductionPass> { +private: + const X86TargetMachine *TM; + +public: + X86PartialReductionPass(const X86TargetMachine *TM) : TM(TM) {} + PreservedAnalyses run(Function &F, FunctionAnalysisManager &FAM); +}; + +FunctionPass *createX86PartialReductionLegacyPass(); /// // Analyzes and emits pseudos to support Win x64 Unwind V2. 
FunctionPass *createX86WinEHUnwindV2Pass(); @@ -231,7 +240,7 @@ void initializeX86LowerAMXIntrinsicsLegacyPassPass(PassRegistry &); void initializeX86LowerAMXTypeLegacyPassPass(PassRegistry &); void initializeX86LowerTileCopyPass(PassRegistry &); void initializeX86OptimizeLEAPassPass(PassRegistry &); -void initializeX86PartialReductionPass(PassRegistry &); +void initializeX86PartialReductionLegacyPass(PassRegistry &); void initializeX86PreTileConfigPass(PassRegistry &); void initializeX86ReturnThunksPass(PassRegistry &); void initializeX86SpeculativeExecutionSideEffectSuppressionPass(PassRegistry &); diff --git a/llvm/lib/Target/X86/X86PartialReduction.cpp b/llvm/lib/Target/X86/X86PartialReduction.cpp index a25e4e0f464a4..898c83cf9b468 100644 --- a/llvm/lib/Target/X86/X86PartialReduction.cpp +++ b/llvm/lib/Target/X86/X86PartialReduction.cpp @@ -16,10 +16,12 @@ #include "X86TargetMachine.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/IR/Analysis.h" #include "llvm/IR/Constants.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicsX86.h" +#include "llvm/IR/PassManager.h" #include "llvm/IR/PatternMatch.h" #include "llvm/Pass.h" #include "llvm/Support/KnownBits.h" @@ -30,39 +32,44 @@ using namespace llvm; namespace { -class X86PartialReduction : public FunctionPass { +class X86PartialReduction { + const X86TargetMachine *TM; const DataLayout *DL = nullptr; const X86Subtarget *ST = nullptr; +public: + X86PartialReduction(const X86TargetMachine *TM) : TM(TM) {} + bool run(Function &F); + +private: + bool tryMAddReplacement(Instruction *Op, bool ReduceInOneBB); + bool trySADReplacement(Instruction *Op); +}; + +class X86PartialReductionLegacy : public FunctionPass { public: static char ID; // Pass identification, replacement for typeid. - X86PartialReduction() : FunctionPass(ID) { } + X86PartialReductionLegacy() : FunctionPass(ID) {} - bool runOnFunction(Function &Fn) override; + bool runOnFunction(Function &F) override; void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); } - StringRef getPassName() const override { - return "X86 Partial Reduction"; - } - -private: - bool tryMAddReplacement(Instruction *Op, bool ReduceInOneBB); - bool trySADReplacement(Instruction *Op); + StringRef getPassName() const override { return "X86 Partial Reduction"; } }; } -FunctionPass *llvm::createX86PartialReductionPass() { - return new X86PartialReduction(); +FunctionPass *llvm::createX86PartialReductionLegacyPass() { + return new X86PartialReductionLegacy(); } -char X86PartialReduction::ID = 0; +char X86PartialReductionLegacy::ID = 0; -INITIALIZE_PASS(X86PartialReduction, DEBUG_TYPE, - "X86 Partial Reduction", false, false) +INITIALIZE_PASS(X86PartialReductionLegacy, DEBUG_TYPE, "X86 Partial Reduction", + false, false) // This function should be aligned with detectExtMul() in X86ISelLowering.cpp. 
static bool matchVPDPBUSDPattern(const X86Subtarget *ST, BinaryOperator *Mul, @@ -494,17 +501,8 @@ static void collectLeaves(Value *Root, SmallVectorImpl<Instruction *> &Leaves) { } } -bool X86PartialReduction::runOnFunction(Function &F) { - if (skipFunction(F)) - return false; - - auto *TPC = getAnalysisIfAvailable<TargetPassConfig>(); - if (!TPC) - return false; - - auto &TM = TPC->getTM<X86TargetMachine>(); - ST = TM.getSubtargetImpl(F); - +bool X86PartialReduction::run(Function &F) { + ST = TM->getSubtargetImpl(F); DL = &F.getDataLayout(); bool MadeChange = false; @@ -540,3 +538,25 @@ static void collectLeaves(Value *Root, SmallVectorImpl<Instruction *> &Leaves) { return MadeChange; } + +bool X86PartialReductionLegacy::runOnFunction(Function &F) { + if (skipFunction(F)) + return false; + + auto *TPC = getAnalysisIfAvailable<TargetPassConfig>(); + if (!TPC) + return false; + + return X86PartialReduction(&TPC->getTM<X86TargetMachine>()).run(F); +} + +PreservedAnalyses X86PartialReductionPass::run(Function &F, + FunctionAnalysisManager &FAM) { + bool Changed = X86PartialReduction(TM).run(F); + if (!Changed) + return PreservedAnalyses::all(); + + PreservedAnalyses PA = PreservedAnalyses::none(); + PA.preserveSet<CFGAnalyses>(); + return PA; +} diff --git a/llvm/lib/Target/X86/X86PassRegistry.def b/llvm/lib/Target/X86/X86PassRegistry.def index 81c98febc4ba8..db255940f8829 100644 --- a/llvm/lib/Target/X86/X86PassRegistry.def +++ b/llvm/lib/Target/X86/X86PassRegistry.def @@ -17,12 +17,12 @@ #endif FUNCTION_PASS("x86-lower-amx-intrinsics", X86LowerAMXIntrinsicsPass(this)) FUNCTION_PASS("x86-lower-amx-type", X86LowerAMXTypePass(this)) +FUNCTION_PASS("x86-partial-reduction", X86PartialReductionPass(this)) #undef FUNCTION_PASS #ifndef DUMMY_FUNCTION_PASS #define DUMMY_FUNCTION_PASS(NAME, CREATE_PASS) #endif -DUMMY_FUNCTION_PASS("x86-partial-reduction", X86PartialReduction()) DUMMY_FUNCTION_PASS("x86-winehstate", WinEHStatePass()) #undef DUMMY_FUNCTION_PASS diff --git a/llvm/lib/Target/X86/X86TargetMachine.cpp b/llvm/lib/Target/X86/X86TargetMachine.cpp index bf4dab0371b88..5f0bcab251e61 100644 --- a/llvm/lib/Target/X86/X86TargetMachine.cpp +++ b/llvm/lib/Target/X86/X86TargetMachine.cpp @@ -97,7 +97,7 @@ extern "C" LLVM_C_ABI void LLVMInitializeX86Target() { initializeX86LoadValueInjectionLoadHardeningPassPass(PR); initializeX86LoadValueInjectionRetHardeningPassPass(PR); initializeX86OptimizeLEAPassPass(PR); - initializeX86PartialReductionPass(PR); + initializeX86PartialReductionLegacyPass(PR); initializePseudoProbeInserterPass(PR); initializeX86ReturnThunksPass(PR); initializeX86DAGToDAGISelLegacyPass(PR); @@ -429,7 +429,7 @@ void X86PassConfig::addIRPasses() { if (TM->getOptLevel() != CodeGenOptLevel::None) { addPass(createInterleavedAccessPass()); - addPass(createX86PartialReductionPass()); + addPass(createX86PartialReductionLegacyPass()); } // Add passes that handle indirect branch removal and insertion of a retpoline From 7272a6c8882d99fe1fc73d2c69ddf976948f0e50 Mon Sep 17 00:00:00 2001 From: yonghong-song <yhs@fb.com> Date: Tue, 4 Nov 2025 11:32:12 -0800 Subject: [PATCH 223/313] [BPF] Avoid relocation for jumptable entries (#166301) Currently, jump table entries contain labels only. For example: BPF.JT.0.0: .quad LBB0_1 .quad LBB0_2 .size BPF.JT.0.0, 16 Since the jump table entry contains a label, a relocation is necessary so the linker can resolve the label value.
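(Each 8-byte entry holds an absolute code address that is only known at link time, hence the per-entry R_BPF_64_ABS64 relocation shown below.)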
The relocation looks like below: Relocation section '.rel.jumptables' at offset 0x160 contains 2 entries: Offset Info Type Symbol's Value Symbol's Name 0000000000000000 0000000200000002 R_BPF_64_ABS64 0000000000000000 .text 0000000000000008 0000000200000002 R_BPF_64_ABS64 0000000000000000 .text You can see that the symbol value is 0 which makes .rel.jumptables not very useful. Instead of having the label itself in the jump table entry, use the difference of label and the section begin symbol. This can avoid the relocation and the eventual jumptable entries in object file remains the same as before. Hex dump of section '.jumptables': 0x00000000 68000000 00000000 78000000 00000000 h.......x....... --- llvm/lib/Target/BPF/BPFAsmPrinter.cpp | 10 +++- llvm/test/CodeGen/BPF/jump_table_blockaddr.ll | 4 +- .../test/CodeGen/BPF/jump_table_global_var.ll | 4 +- .../CodeGen/BPF/jump_table_switch_stmt.ll | 60 +++++++++---------- 4 files changed, 42 insertions(+), 36 deletions(-) diff --git a/llvm/lib/Target/BPF/BPFAsmPrinter.cpp b/llvm/lib/Target/BPF/BPFAsmPrinter.cpp index b2a82040ee823..378a72ab27dd5 100644 --- a/llvm/lib/Target/BPF/BPFAsmPrinter.cpp +++ b/llvm/lib/Target/BPF/BPFAsmPrinter.cpp @@ -219,6 +219,10 @@ void BPFAsmPrinter::emitJumpTableInfo() { const TargetLoweringObjectFile &TLOF = getObjFileLowering(); const Function &F = MF->getFunction(); + + MCSection *Sec = OutStreamer->getCurrentSectionOnly(); + MCSymbol *SecStart = Sec->getBeginSymbol(); + MCSection *JTS = TLOF.getSectionForJumpTable(F, TM); assert(MJTI->getEntryKind() == MachineJumpTableInfo::EK_BlockAddress); unsigned EntrySize = MJTI->getEntrySize(getDataLayout()); @@ -231,8 +235,10 @@ void BPFAsmPrinter::emitJumpTableInfo() { MCSymbol *JTStart = getJTPublicSymbol(JTI); OutStreamer->emitLabel(JTStart); for (const MachineBasicBlock *MBB : JTBBs) { - const MCExpr *LHS = MCSymbolRefExpr::create(MBB->getSymbol(), OutContext); - OutStreamer->emitValue(LHS, EntrySize); + const MCExpr *Diff = MCBinaryExpr::createSub( + MCSymbolRefExpr::create(MBB->getSymbol(), OutContext), + MCSymbolRefExpr::create(SecStart, OutContext), OutContext); + OutStreamer->emitValue(Diff, EntrySize); } const MCExpr *JTSize = MCConstantExpr::create(JTBBs.size() * EntrySize, OutContext); diff --git a/llvm/test/CodeGen/BPF/jump_table_blockaddr.ll b/llvm/test/CodeGen/BPF/jump_table_blockaddr.ll index d5a1d63b644a8..b7d518639d70e 100644 --- a/llvm/test/CodeGen/BPF/jump_table_blockaddr.ll +++ b/llvm/test/CodeGen/BPF/jump_table_blockaddr.ll @@ -84,8 +84,8 @@ llc -march=bpf -mcpu=v4 < test.ll \ ; CHECK: .cfi_endproc ; CHECK: .section .jumptables,"",@progbits ; CHECK: BPF.JT.0.0: -; CHECK: .quad LBB0_3 +; CHECK: .quad LBB0_3-.text ; CHECK: .size BPF.JT.0.0, 8 ; CHECK: BPF.JT.0.1: -; CHECK: .quad LBB0_4 +; CHECK: .quad LBB0_4-.text ; CHECK: .size BPF.JT.0.1, 8 diff --git a/llvm/test/CodeGen/BPF/jump_table_global_var.ll b/llvm/test/CodeGen/BPF/jump_table_global_var.ll index bbca46850843b..71c682f5530ed 100644 --- a/llvm/test/CodeGen/BPF/jump_table_global_var.ll +++ b/llvm/test/CodeGen/BPF/jump_table_global_var.ll @@ -78,6 +78,6 @@ llc -march=bpf -mcpu=v4 < test.ll \ ; CHECK: .cfi_endproc ; CHECK: .section .jumptables,"",@progbits ; CHECK: BPF.JT.0.0: -; CHECK: .quad LBB0_1 -; CHECK: .quad LBB0_2 +; CHECK: .quad LBB0_1-.text +; CHECK: .quad LBB0_2-.text ; CHECK: .size BPF.JT.0.0, 16 diff --git a/llvm/test/CodeGen/BPF/jump_table_switch_stmt.ll b/llvm/test/CodeGen/BPF/jump_table_switch_stmt.ll index 682b025d665d6..eb1e5bff11013 100644 --- 
a/llvm/test/CodeGen/BPF/jump_table_switch_stmt.ll +++ b/llvm/test/CodeGen/BPF/jump_table_switch_stmt.ll @@ -93,34 +93,34 @@ llc -march=bpf -mcpu=v4 -bpf-min-jump-table-entries=3 < test.ll \ ; CHECK: .cfi_endproc ; CHECK: .section .jumptables,"",@progbits ; CHECK: BPF.JT.0.0: -; CHECK: .quad LBB0_4 -; CHECK: .quad LBB0_5 -; CHECK: .quad LBB0_5 -; CHECK: .quad LBB0_5 -; CHECK: .quad LBB0_5 -; CHECK: .quad LBB0_5 -; CHECK: .quad LBB0_5 -; CHECK: .quad LBB0_5 -; CHECK: .quad LBB0_5 -; CHECK: .quad LBB0_5 -; CHECK: .quad LBB0_5 -; CHECK: .quad LBB0_5 -; CHECK: .quad LBB0_5 -; CHECK: .quad LBB0_5 -; CHECK: .quad LBB0_5 -; CHECK: .quad LBB0_5 -; CHECK: .quad LBB0_5 -; CHECK: .quad LBB0_5 -; CHECK: .quad LBB0_5 -; CHECK: .quad LBB0_2 -; CHECK: .quad LBB0_5 -; CHECK: .quad LBB0_5 -; CHECK: .quad LBB0_5 -; CHECK: .quad LBB0_5 -; CHECK: .quad LBB0_5 -; CHECK: .quad LBB0_5 -; CHECK: .quad LBB0_5 -; CHECK: .quad LBB0_5 -; CHECK: .quad LBB0_5 -; CHECK: .quad LBB0_3 +; CHECK: .quad LBB0_4-.text +; CHECK: .quad LBB0_5-.text +; CHECK: .quad LBB0_5-.text +; CHECK: .quad LBB0_5-.text +; CHECK: .quad LBB0_5-.text +; CHECK: .quad LBB0_5-.text +; CHECK: .quad LBB0_5-.text +; CHECK: .quad LBB0_5-.text +; CHECK: .quad LBB0_5-.text +; CHECK: .quad LBB0_5-.text +; CHECK: .quad LBB0_5-.text +; CHECK: .quad LBB0_5-.text +; CHECK: .quad LBB0_5-.text +; CHECK: .quad LBB0_5-.text +; CHECK: .quad LBB0_5-.text +; CHECK: .quad LBB0_5-.text +; CHECK: .quad LBB0_5-.text +; CHECK: .quad LBB0_5-.text +; CHECK: .quad LBB0_5-.text +; CHECK: .quad LBB0_2-.text +; CHECK: .quad LBB0_5-.text +; CHECK: .quad LBB0_5-.text +; CHECK: .quad LBB0_5-.text +; CHECK: .quad LBB0_5-.text +; CHECK: .quad LBB0_5-.text +; CHECK: .quad LBB0_5-.text +; CHECK: .quad LBB0_5-.text +; CHECK: .quad LBB0_5-.text +; CHECK: .quad LBB0_5-.text +; CHECK: .quad LBB0_3-.text ; CHECK: .size BPF.JT.0.0, 240 From a99e13271e25269f0feb7b256676594d315fa381 Mon Sep 17 00:00:00 2001 From: Baranov Victor <bar.victor.2002@gmail.com> Date: Tue, 4 Nov 2025 22:32:43 +0300 Subject: [PATCH 224/313] [Docs][clang-tools-extra] Convert maintainers list to .rst format (#165171) [Clang maintainers list](https://github.com/llvm/llvm-project/blob/main/clang/Maintainers.rst) is already in `.rst` format, which gives nice visuals. I think we should convert clang-tools-extra maintainers too to `.rst`. --- clang-tools-extra/{Maintainers.txt => Maintainers.rst} | 8 ++++++-- clang-tools-extra/docs/Maintainers.rst | 1 + clang-tools-extra/docs/index.rst | 1 + 3 files changed, 8 insertions(+), 2 deletions(-) rename clang-tools-extra/{Maintainers.txt => Maintainers.rst} (91%) create mode 100644 clang-tools-extra/docs/Maintainers.rst diff --git a/clang-tools-extra/Maintainers.txt b/clang-tools-extra/Maintainers.rst similarity index 91% rename from clang-tools-extra/Maintainers.txt rename to clang-tools-extra/Maintainers.rst index 43dfd48ad1f57..2603ebadf529c 100644 --- a/clang-tools-extra/Maintainers.txt +++ b/clang-tools-extra/Maintainers.rst @@ -2,9 +2,13 @@ Clang Tools Extra Maintainers ============================= -This file is a list of the maintainers -(https://llvm.org/docs/DeveloperPolicy.html#maintainers) for clang-tools-extra. +This file is a list of the +`maintainers <https://llvm.org/docs/DeveloperPolicy.html#maintainers>`_ +for `Extra Clang Tools <https://clang.llvm.org/extra/index.html>`_ project. +.. 
contents:: + :depth: 2 + :local: Active Maintainers ================== diff --git a/clang-tools-extra/docs/Maintainers.rst b/clang-tools-extra/docs/Maintainers.rst new file mode 100644 index 0000000000000..f78e9ecf279a6 --- /dev/null +++ b/clang-tools-extra/docs/Maintainers.rst @@ -0,0 +1 @@ +.. include:: ../Maintainers.rst \ No newline at end of file diff --git a/clang-tools-extra/docs/index.rst b/clang-tools-extra/docs/index.rst index 3f3a99d1b70c6..eba4a2cdbc558 100644 --- a/clang-tools-extra/docs/index.rst +++ b/clang-tools-extra/docs/index.rst @@ -22,6 +22,7 @@ Contents pp-trace clangd <https://clangd.llvm.org/> clang-doc + Maintainers Doxygen Documentation From 8f683c3e4b6fe939a7d0f1167934aa823a889267 Mon Sep 17 00:00:00 2001 From: choikwa <5455710+choikwa@users.noreply.github.com> Date: Tue, 4 Nov 2025 14:34:54 -0500 Subject: [PATCH 225/313] [AMDGPU] NFC, delete promote-alloca testcase (#166297) previous merge did not delete. --- .../AMDGPU/promote-alloca-array-to-vector.ll | 325 ------------------ 1 file changed, 325 deletions(-) delete mode 100644 llvm/test/CodeGen/AMDGPU/promote-alloca-array-to-vector.ll diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-array-to-vector.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-array-to-vector.ll deleted file mode 100644 index 05a0e39d4a715..0000000000000 --- a/llvm/test/CodeGen/AMDGPU/promote-alloca-array-to-vector.ll +++ /dev/null @@ -1,325 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 -; RUN: opt -S -mtriple=amdgcn-- -mcpu=gfx1100 -passes=amdgpu-promote-alloca < %s | FileCheck -check-prefix=OPT %s - -define amdgpu_kernel void @large_array_vectors_small_users(<16 x i8> %in, <16 x i8> %add, ptr addrspace(3) %out) #0 { -; OPT-LABEL: define amdgpu_kernel void @large_array_vectors_small_users( -; OPT-SAME: <16 x i8> [[IN:%.*]], <16 x i8> [[ADD:%.*]], ptr addrspace(3) [[OUT:%.*]]) #[[ATTR0:[0-9]+]] { -; OPT-NEXT: [[ENTRY:.*:]] -; OPT-NEXT: [[ALLOCA:%.*]] = freeze <128 x i8> poison -; OPT-NEXT: [[TMP0:%.*]] = extractelement <16 x i8> [[IN]], i64 0 -; OPT-NEXT: [[TMP1:%.*]] = insertelement <128 x i8> [[ALLOCA]], i8 [[TMP0]], i32 0 -; OPT-NEXT: [[TMP2:%.*]] = extractelement <16 x i8> [[IN]], i64 1 -; OPT-NEXT: [[TMP3:%.*]] = insertelement <128 x i8> [[TMP1]], i8 [[TMP2]], i32 1 -; OPT-NEXT: [[TMP4:%.*]] = extractelement <16 x i8> [[IN]], i64 2 -; OPT-NEXT: [[TMP5:%.*]] = insertelement <128 x i8> [[TMP3]], i8 [[TMP4]], i32 2 -; OPT-NEXT: [[TMP6:%.*]] = extractelement <16 x i8> [[IN]], i64 3 -; OPT-NEXT: [[TMP7:%.*]] = insertelement <128 x i8> [[TMP5]], i8 [[TMP6]], i32 3 -; OPT-NEXT: [[TMP8:%.*]] = extractelement <16 x i8> [[IN]], i64 4 -; OPT-NEXT: [[TMP9:%.*]] = insertelement <128 x i8> [[TMP7]], i8 [[TMP8]], i32 4 -; OPT-NEXT: [[TMP10:%.*]] = extractelement <16 x i8> [[IN]], i64 5 -; OPT-NEXT: [[TMP11:%.*]] = insertelement <128 x i8> [[TMP9]], i8 [[TMP10]], i32 5 -; OPT-NEXT: [[TMP12:%.*]] = extractelement <16 x i8> [[IN]], i64 6 -; OPT-NEXT: [[TMP13:%.*]] = insertelement <128 x i8> [[TMP11]], i8 [[TMP12]], i32 6 -; OPT-NEXT: [[TMP14:%.*]] = extractelement <16 x i8> [[IN]], i64 7 -; OPT-NEXT: [[TMP15:%.*]] = insertelement <128 x i8> [[TMP13]], i8 [[TMP14]], i32 7 -; OPT-NEXT: [[TMP16:%.*]] = extractelement <16 x i8> [[IN]], i64 8 -; OPT-NEXT: [[TMP17:%.*]] = insertelement <128 x i8> [[TMP15]], i8 [[TMP16]], i32 8 -; OPT-NEXT: [[TMP18:%.*]] = extractelement <16 x i8> [[IN]], i64 9 -; OPT-NEXT: [[TMP19:%.*]] = insertelement <128 x i8> [[TMP17]], i8 [[TMP18]], i32 9 -; OPT-NEXT: 
[[TMP20:%.*]] = extractelement <16 x i8> [[IN]], i64 10 -; OPT-NEXT: [[TMP21:%.*]] = insertelement <128 x i8> [[TMP19]], i8 [[TMP20]], i32 10 -; OPT-NEXT: [[TMP22:%.*]] = extractelement <16 x i8> [[IN]], i64 11 -; OPT-NEXT: [[TMP23:%.*]] = insertelement <128 x i8> [[TMP21]], i8 [[TMP22]], i32 11 -; OPT-NEXT: [[TMP24:%.*]] = extractelement <16 x i8> [[IN]], i64 12 -; OPT-NEXT: [[TMP25:%.*]] = insertelement <128 x i8> [[TMP23]], i8 [[TMP24]], i32 12 -; OPT-NEXT: [[TMP26:%.*]] = extractelement <16 x i8> [[IN]], i64 13 -; OPT-NEXT: [[TMP27:%.*]] = insertelement <128 x i8> [[TMP25]], i8 [[TMP26]], i32 13 -; OPT-NEXT: [[TMP28:%.*]] = extractelement <16 x i8> [[IN]], i64 14 -; OPT-NEXT: [[TMP29:%.*]] = insertelement <128 x i8> [[TMP27]], i8 [[TMP28]], i32 14 -; OPT-NEXT: [[TMP30:%.*]] = extractelement <16 x i8> [[IN]], i64 15 -; OPT-NEXT: [[TMP31:%.*]] = insertelement <128 x i8> [[TMP29]], i8 [[TMP30]], i32 15 -; OPT-NEXT: [[TMP32:%.*]] = extractelement <16 x i8> [[IN]], i64 0 -; OPT-NEXT: [[TMP33:%.*]] = insertelement <128 x i8> [[TMP31]], i8 [[TMP32]], i32 0 -; OPT-NEXT: [[TMP34:%.*]] = extractelement <16 x i8> [[IN]], i64 1 -; OPT-NEXT: [[TMP35:%.*]] = insertelement <128 x i8> [[TMP33]], i8 [[TMP34]], i32 1 -; OPT-NEXT: [[TMP36:%.*]] = extractelement <16 x i8> [[IN]], i64 2 -; OPT-NEXT: [[TMP37:%.*]] = insertelement <128 x i8> [[TMP35]], i8 [[TMP36]], i32 2 -; OPT-NEXT: [[TMP38:%.*]] = extractelement <16 x i8> [[IN]], i64 3 -; OPT-NEXT: [[TMP39:%.*]] = insertelement <128 x i8> [[TMP37]], i8 [[TMP38]], i32 3 -; OPT-NEXT: [[TMP40:%.*]] = extractelement <16 x i8> [[IN]], i64 4 -; OPT-NEXT: [[TMP41:%.*]] = insertelement <128 x i8> [[TMP39]], i8 [[TMP40]], i32 4 -; OPT-NEXT: [[TMP42:%.*]] = extractelement <16 x i8> [[IN]], i64 5 -; OPT-NEXT: [[TMP43:%.*]] = insertelement <128 x i8> [[TMP41]], i8 [[TMP42]], i32 5 -; OPT-NEXT: [[TMP44:%.*]] = extractelement <16 x i8> [[IN]], i64 6 -; OPT-NEXT: [[TMP45:%.*]] = insertelement <128 x i8> [[TMP43]], i8 [[TMP44]], i32 6 -; OPT-NEXT: [[TMP46:%.*]] = extractelement <16 x i8> [[IN]], i64 7 -; OPT-NEXT: [[TMP47:%.*]] = insertelement <128 x i8> [[TMP45]], i8 [[TMP46]], i32 7 -; OPT-NEXT: [[TMP48:%.*]] = extractelement <16 x i8> [[IN]], i64 8 -; OPT-NEXT: [[TMP49:%.*]] = insertelement <128 x i8> [[TMP47]], i8 [[TMP48]], i32 8 -; OPT-NEXT: [[TMP50:%.*]] = extractelement <16 x i8> [[IN]], i64 9 -; OPT-NEXT: [[TMP51:%.*]] = insertelement <128 x i8> [[TMP49]], i8 [[TMP50]], i32 9 -; OPT-NEXT: [[TMP52:%.*]] = extractelement <16 x i8> [[IN]], i64 10 -; OPT-NEXT: [[TMP53:%.*]] = insertelement <128 x i8> [[TMP51]], i8 [[TMP52]], i32 10 -; OPT-NEXT: [[TMP54:%.*]] = extractelement <16 x i8> [[IN]], i64 11 -; OPT-NEXT: [[TMP55:%.*]] = insertelement <128 x i8> [[TMP53]], i8 [[TMP54]], i32 11 -; OPT-NEXT: [[TMP56:%.*]] = extractelement <16 x i8> [[IN]], i64 12 -; OPT-NEXT: [[TMP57:%.*]] = insertelement <128 x i8> [[TMP55]], i8 [[TMP56]], i32 12 -; OPT-NEXT: [[TMP58:%.*]] = extractelement <16 x i8> [[IN]], i64 13 -; OPT-NEXT: [[TMP59:%.*]] = insertelement <128 x i8> [[TMP57]], i8 [[TMP58]], i32 13 -; OPT-NEXT: [[TMP60:%.*]] = extractelement <16 x i8> [[IN]], i64 14 -; OPT-NEXT: [[TMP61:%.*]] = insertelement <128 x i8> [[TMP59]], i8 [[TMP60]], i32 14 -; OPT-NEXT: [[TMP62:%.*]] = extractelement <16 x i8> [[IN]], i64 15 -; OPT-NEXT: [[TMP63:%.*]] = insertelement <128 x i8> [[TMP61]], i8 [[TMP62]], i32 15 -; OPT-NEXT: [[TMP64:%.*]] = extractelement <16 x i8> [[IN]], i64 0 -; OPT-NEXT: [[TMP65:%.*]] = insertelement <128 x i8> [[TMP63]], i8 [[TMP64]], i32 0 -; OPT-NEXT: 
[[TMP66:%.*]] = extractelement <16 x i8> [[IN]], i64 1 -; OPT-NEXT: [[TMP67:%.*]] = insertelement <128 x i8> [[TMP65]], i8 [[TMP66]], i32 1 -; OPT-NEXT: [[TMP68:%.*]] = extractelement <16 x i8> [[IN]], i64 2 -; OPT-NEXT: [[TMP69:%.*]] = insertelement <128 x i8> [[TMP67]], i8 [[TMP68]], i32 2 -; OPT-NEXT: [[TMP70:%.*]] = extractelement <16 x i8> [[IN]], i64 3 -; OPT-NEXT: [[TMP71:%.*]] = insertelement <128 x i8> [[TMP69]], i8 [[TMP70]], i32 3 -; OPT-NEXT: [[TMP72:%.*]] = extractelement <16 x i8> [[IN]], i64 4 -; OPT-NEXT: [[TMP73:%.*]] = insertelement <128 x i8> [[TMP71]], i8 [[TMP72]], i32 4 -; OPT-NEXT: [[TMP74:%.*]] = extractelement <16 x i8> [[IN]], i64 5 -; OPT-NEXT: [[TMP75:%.*]] = insertelement <128 x i8> [[TMP73]], i8 [[TMP74]], i32 5 -; OPT-NEXT: [[TMP76:%.*]] = extractelement <16 x i8> [[IN]], i64 6 -; OPT-NEXT: [[TMP77:%.*]] = insertelement <128 x i8> [[TMP75]], i8 [[TMP76]], i32 6 -; OPT-NEXT: [[TMP78:%.*]] = extractelement <16 x i8> [[IN]], i64 7 -; OPT-NEXT: [[TMP79:%.*]] = insertelement <128 x i8> [[TMP77]], i8 [[TMP78]], i32 7 -; OPT-NEXT: [[TMP80:%.*]] = extractelement <16 x i8> [[IN]], i64 8 -; OPT-NEXT: [[TMP81:%.*]] = insertelement <128 x i8> [[TMP79]], i8 [[TMP80]], i32 8 -; OPT-NEXT: [[TMP82:%.*]] = extractelement <16 x i8> [[IN]], i64 9 -; OPT-NEXT: [[TMP83:%.*]] = insertelement <128 x i8> [[TMP81]], i8 [[TMP82]], i32 9 -; OPT-NEXT: [[TMP84:%.*]] = extractelement <16 x i8> [[IN]], i64 10 -; OPT-NEXT: [[TMP85:%.*]] = insertelement <128 x i8> [[TMP83]], i8 [[TMP84]], i32 10 -; OPT-NEXT: [[TMP86:%.*]] = extractelement <16 x i8> [[IN]], i64 11 -; OPT-NEXT: [[TMP87:%.*]] = insertelement <128 x i8> [[TMP85]], i8 [[TMP86]], i32 11 -; OPT-NEXT: [[TMP88:%.*]] = extractelement <16 x i8> [[IN]], i64 12 -; OPT-NEXT: [[TMP89:%.*]] = insertelement <128 x i8> [[TMP87]], i8 [[TMP88]], i32 12 -; OPT-NEXT: [[TMP90:%.*]] = extractelement <16 x i8> [[IN]], i64 13 -; OPT-NEXT: [[TMP91:%.*]] = insertelement <128 x i8> [[TMP89]], i8 [[TMP90]], i32 13 -; OPT-NEXT: [[TMP92:%.*]] = extractelement <16 x i8> [[IN]], i64 14 -; OPT-NEXT: [[TMP93:%.*]] = insertelement <128 x i8> [[TMP91]], i8 [[TMP92]], i32 14 -; OPT-NEXT: [[TMP94:%.*]] = extractelement <16 x i8> [[IN]], i64 15 -; OPT-NEXT: [[TMP95:%.*]] = insertelement <128 x i8> [[TMP93]], i8 [[TMP94]], i32 15 -; OPT-NEXT: [[TMP96:%.*]] = extractelement <16 x i8> [[IN]], i64 0 -; OPT-NEXT: [[TMP97:%.*]] = insertelement <128 x i8> [[TMP95]], i8 [[TMP96]], i32 0 -; OPT-NEXT: [[TMP98:%.*]] = extractelement <16 x i8> [[IN]], i64 1 -; OPT-NEXT: [[TMP99:%.*]] = insertelement <128 x i8> [[TMP97]], i8 [[TMP98]], i32 1 -; OPT-NEXT: [[TMP100:%.*]] = extractelement <16 x i8> [[IN]], i64 2 -; OPT-NEXT: [[TMP101:%.*]] = insertelement <128 x i8> [[TMP99]], i8 [[TMP100]], i32 2 -; OPT-NEXT: [[TMP102:%.*]] = extractelement <16 x i8> [[IN]], i64 3 -; OPT-NEXT: [[TMP103:%.*]] = insertelement <128 x i8> [[TMP101]], i8 [[TMP102]], i32 3 -; OPT-NEXT: [[TMP104:%.*]] = extractelement <16 x i8> [[IN]], i64 4 -; OPT-NEXT: [[TMP105:%.*]] = insertelement <128 x i8> [[TMP103]], i8 [[TMP104]], i32 4 -; OPT-NEXT: [[TMP106:%.*]] = extractelement <16 x i8> [[IN]], i64 5 -; OPT-NEXT: [[TMP107:%.*]] = insertelement <128 x i8> [[TMP105]], i8 [[TMP106]], i32 5 -; OPT-NEXT: [[TMP108:%.*]] = extractelement <16 x i8> [[IN]], i64 6 -; OPT-NEXT: [[TMP109:%.*]] = insertelement <128 x i8> [[TMP107]], i8 [[TMP108]], i32 6 -; OPT-NEXT: [[TMP110:%.*]] = extractelement <16 x i8> [[IN]], i64 7 -; OPT-NEXT: [[TMP111:%.*]] = insertelement <128 x i8> [[TMP109]], i8 [[TMP110]], i32 7 -; OPT-NEXT: 
[[TMP112:%.*]] = extractelement <16 x i8> [[IN]], i64 8 -; OPT-NEXT: [[TMP113:%.*]] = insertelement <128 x i8> [[TMP111]], i8 [[TMP112]], i32 8 -; OPT-NEXT: [[TMP114:%.*]] = extractelement <16 x i8> [[IN]], i64 9 -; OPT-NEXT: [[TMP115:%.*]] = insertelement <128 x i8> [[TMP113]], i8 [[TMP114]], i32 9 -; OPT-NEXT: [[TMP116:%.*]] = extractelement <16 x i8> [[IN]], i64 10 -; OPT-NEXT: [[TMP117:%.*]] = insertelement <128 x i8> [[TMP115]], i8 [[TMP116]], i32 10 -; OPT-NEXT: [[TMP118:%.*]] = extractelement <16 x i8> [[IN]], i64 11 -; OPT-NEXT: [[TMP119:%.*]] = insertelement <128 x i8> [[TMP117]], i8 [[TMP118]], i32 11 -; OPT-NEXT: [[TMP120:%.*]] = extractelement <16 x i8> [[IN]], i64 12 -; OPT-NEXT: [[TMP121:%.*]] = insertelement <128 x i8> [[TMP119]], i8 [[TMP120]], i32 12 -; OPT-NEXT: [[TMP122:%.*]] = extractelement <16 x i8> [[IN]], i64 13 -; OPT-NEXT: [[TMP123:%.*]] = insertelement <128 x i8> [[TMP121]], i8 [[TMP122]], i32 13 -; OPT-NEXT: [[TMP124:%.*]] = extractelement <16 x i8> [[IN]], i64 14 -; OPT-NEXT: [[TMP125:%.*]] = insertelement <128 x i8> [[TMP123]], i8 [[TMP124]], i32 14 -; OPT-NEXT: [[TMP126:%.*]] = extractelement <16 x i8> [[IN]], i64 15 -; OPT-NEXT: [[TMP127:%.*]] = insertelement <128 x i8> [[TMP125]], i8 [[TMP126]], i32 15 -; OPT-NEXT: [[TMP128:%.*]] = extractelement <16 x i8> [[IN]], i64 0 -; OPT-NEXT: [[TMP129:%.*]] = insertelement <128 x i8> [[TMP127]], i8 [[TMP128]], i32 0 -; OPT-NEXT: [[TMP130:%.*]] = extractelement <16 x i8> [[IN]], i64 1 -; OPT-NEXT: [[TMP131:%.*]] = insertelement <128 x i8> [[TMP129]], i8 [[TMP130]], i32 1 -; OPT-NEXT: [[TMP132:%.*]] = extractelement <16 x i8> [[IN]], i64 2 -; OPT-NEXT: [[TMP133:%.*]] = insertelement <128 x i8> [[TMP131]], i8 [[TMP132]], i32 2 -; OPT-NEXT: [[TMP134:%.*]] = extractelement <16 x i8> [[IN]], i64 3 -; OPT-NEXT: [[TMP135:%.*]] = insertelement <128 x i8> [[TMP133]], i8 [[TMP134]], i32 3 -; OPT-NEXT: [[TMP136:%.*]] = extractelement <16 x i8> [[IN]], i64 4 -; OPT-NEXT: [[TMP137:%.*]] = insertelement <128 x i8> [[TMP135]], i8 [[TMP136]], i32 4 -; OPT-NEXT: [[TMP138:%.*]] = extractelement <16 x i8> [[IN]], i64 5 -; OPT-NEXT: [[TMP139:%.*]] = insertelement <128 x i8> [[TMP137]], i8 [[TMP138]], i32 5 -; OPT-NEXT: [[TMP140:%.*]] = extractelement <16 x i8> [[IN]], i64 6 -; OPT-NEXT: [[TMP141:%.*]] = insertelement <128 x i8> [[TMP139]], i8 [[TMP140]], i32 6 -; OPT-NEXT: [[TMP142:%.*]] = extractelement <16 x i8> [[IN]], i64 7 -; OPT-NEXT: [[TMP143:%.*]] = insertelement <128 x i8> [[TMP141]], i8 [[TMP142]], i32 7 -; OPT-NEXT: [[TMP144:%.*]] = extractelement <16 x i8> [[IN]], i64 8 -; OPT-NEXT: [[TMP145:%.*]] = insertelement <128 x i8> [[TMP143]], i8 [[TMP144]], i32 8 -; OPT-NEXT: [[TMP146:%.*]] = extractelement <16 x i8> [[IN]], i64 9 -; OPT-NEXT: [[TMP147:%.*]] = insertelement <128 x i8> [[TMP145]], i8 [[TMP146]], i32 9 -; OPT-NEXT: [[TMP148:%.*]] = extractelement <16 x i8> [[IN]], i64 10 -; OPT-NEXT: [[TMP149:%.*]] = insertelement <128 x i8> [[TMP147]], i8 [[TMP148]], i32 10 -; OPT-NEXT: [[TMP150:%.*]] = extractelement <16 x i8> [[IN]], i64 11 -; OPT-NEXT: [[TMP151:%.*]] = insertelement <128 x i8> [[TMP149]], i8 [[TMP150]], i32 11 -; OPT-NEXT: [[TMP152:%.*]] = extractelement <16 x i8> [[IN]], i64 12 -; OPT-NEXT: [[TMP153:%.*]] = insertelement <128 x i8> [[TMP151]], i8 [[TMP152]], i32 12 -; OPT-NEXT: [[TMP154:%.*]] = extractelement <16 x i8> [[IN]], i64 13 -; OPT-NEXT: [[TMP155:%.*]] = insertelement <128 x i8> [[TMP153]], i8 [[TMP154]], i32 13 -; OPT-NEXT: [[TMP156:%.*]] = extractelement <16 x i8> [[IN]], i64 14 -; OPT-NEXT: 
[[TMP157:%.*]] = insertelement <128 x i8> [[TMP155]], i8 [[TMP156]], i32 14 -; OPT-NEXT: [[TMP158:%.*]] = extractelement <16 x i8> [[IN]], i64 15 -; OPT-NEXT: [[TMP159:%.*]] = insertelement <128 x i8> [[TMP157]], i8 [[TMP158]], i32 15 -; OPT-NEXT: [[TMP160:%.*]] = extractelement <16 x i8> [[IN]], i64 0 -; OPT-NEXT: [[TMP161:%.*]] = insertelement <128 x i8> [[TMP159]], i8 [[TMP160]], i32 0 -; OPT-NEXT: [[TMP162:%.*]] = extractelement <16 x i8> [[IN]], i64 1 -; OPT-NEXT: [[TMP163:%.*]] = insertelement <128 x i8> [[TMP161]], i8 [[TMP162]], i32 1 -; OPT-NEXT: [[TMP164:%.*]] = extractelement <16 x i8> [[IN]], i64 2 -; OPT-NEXT: [[TMP165:%.*]] = insertelement <128 x i8> [[TMP163]], i8 [[TMP164]], i32 2 -; OPT-NEXT: [[TMP166:%.*]] = extractelement <16 x i8> [[IN]], i64 3 -; OPT-NEXT: [[TMP167:%.*]] = insertelement <128 x i8> [[TMP165]], i8 [[TMP166]], i32 3 -; OPT-NEXT: [[TMP168:%.*]] = extractelement <16 x i8> [[IN]], i64 4 -; OPT-NEXT: [[TMP169:%.*]] = insertelement <128 x i8> [[TMP167]], i8 [[TMP168]], i32 4 -; OPT-NEXT: [[TMP170:%.*]] = extractelement <16 x i8> [[IN]], i64 5 -; OPT-NEXT: [[TMP171:%.*]] = insertelement <128 x i8> [[TMP169]], i8 [[TMP170]], i32 5 -; OPT-NEXT: [[TMP172:%.*]] = extractelement <16 x i8> [[IN]], i64 6 -; OPT-NEXT: [[TMP173:%.*]] = insertelement <128 x i8> [[TMP171]], i8 [[TMP172]], i32 6 -; OPT-NEXT: [[TMP174:%.*]] = extractelement <16 x i8> [[IN]], i64 7 -; OPT-NEXT: [[TMP175:%.*]] = insertelement <128 x i8> [[TMP173]], i8 [[TMP174]], i32 7 -; OPT-NEXT: [[TMP176:%.*]] = extractelement <16 x i8> [[IN]], i64 8 -; OPT-NEXT: [[TMP177:%.*]] = insertelement <128 x i8> [[TMP175]], i8 [[TMP176]], i32 8 -; OPT-NEXT: [[TMP178:%.*]] = extractelement <16 x i8> [[IN]], i64 9 -; OPT-NEXT: [[TMP179:%.*]] = insertelement <128 x i8> [[TMP177]], i8 [[TMP178]], i32 9 -; OPT-NEXT: [[TMP180:%.*]] = extractelement <16 x i8> [[IN]], i64 10 -; OPT-NEXT: [[TMP181:%.*]] = insertelement <128 x i8> [[TMP179]], i8 [[TMP180]], i32 10 -; OPT-NEXT: [[TMP182:%.*]] = extractelement <16 x i8> [[IN]], i64 11 -; OPT-NEXT: [[TMP183:%.*]] = insertelement <128 x i8> [[TMP181]], i8 [[TMP182]], i32 11 -; OPT-NEXT: [[TMP184:%.*]] = extractelement <16 x i8> [[IN]], i64 12 -; OPT-NEXT: [[TMP185:%.*]] = insertelement <128 x i8> [[TMP183]], i8 [[TMP184]], i32 12 -; OPT-NEXT: [[TMP186:%.*]] = extractelement <16 x i8> [[IN]], i64 13 -; OPT-NEXT: [[TMP187:%.*]] = insertelement <128 x i8> [[TMP185]], i8 [[TMP186]], i32 13 -; OPT-NEXT: [[TMP188:%.*]] = extractelement <16 x i8> [[IN]], i64 14 -; OPT-NEXT: [[TMP189:%.*]] = insertelement <128 x i8> [[TMP187]], i8 [[TMP188]], i32 14 -; OPT-NEXT: [[TMP190:%.*]] = extractelement <16 x i8> [[IN]], i64 15 -; OPT-NEXT: [[TMP191:%.*]] = insertelement <128 x i8> [[TMP189]], i8 [[TMP190]], i32 15 -; OPT-NEXT: [[TMP192:%.*]] = extractelement <16 x i8> [[IN]], i64 0 -; OPT-NEXT: [[TMP193:%.*]] = insertelement <128 x i8> [[TMP191]], i8 [[TMP192]], i32 0 -; OPT-NEXT: [[TMP194:%.*]] = extractelement <16 x i8> [[IN]], i64 1 -; OPT-NEXT: [[TMP195:%.*]] = insertelement <128 x i8> [[TMP193]], i8 [[TMP194]], i32 1 -; OPT-NEXT: [[TMP196:%.*]] = extractelement <16 x i8> [[IN]], i64 2 -; OPT-NEXT: [[TMP197:%.*]] = insertelement <128 x i8> [[TMP195]], i8 [[TMP196]], i32 2 -; OPT-NEXT: [[TMP198:%.*]] = extractelement <16 x i8> [[IN]], i64 3 -; OPT-NEXT: [[TMP199:%.*]] = insertelement <128 x i8> [[TMP197]], i8 [[TMP198]], i32 3 -; OPT-NEXT: [[TMP200:%.*]] = extractelement <16 x i8> [[IN]], i64 4 -; OPT-NEXT: [[TMP201:%.*]] = insertelement <128 x i8> [[TMP199]], i8 [[TMP200]], i32 4 -; 
OPT-NEXT: [[TMP202:%.*]] = extractelement <16 x i8> [[IN]], i64 5 -; OPT-NEXT: [[TMP203:%.*]] = insertelement <128 x i8> [[TMP201]], i8 [[TMP202]], i32 5 -; OPT-NEXT: [[TMP204:%.*]] = extractelement <16 x i8> [[IN]], i64 6 -; OPT-NEXT: [[TMP205:%.*]] = insertelement <128 x i8> [[TMP203]], i8 [[TMP204]], i32 6 -; OPT-NEXT: [[TMP206:%.*]] = extractelement <16 x i8> [[IN]], i64 7 -; OPT-NEXT: [[TMP207:%.*]] = insertelement <128 x i8> [[TMP205]], i8 [[TMP206]], i32 7 -; OPT-NEXT: [[TMP208:%.*]] = extractelement <16 x i8> [[IN]], i64 8 -; OPT-NEXT: [[TMP209:%.*]] = insertelement <128 x i8> [[TMP207]], i8 [[TMP208]], i32 8 -; OPT-NEXT: [[TMP210:%.*]] = extractelement <16 x i8> [[IN]], i64 9 -; OPT-NEXT: [[TMP211:%.*]] = insertelement <128 x i8> [[TMP209]], i8 [[TMP210]], i32 9 -; OPT-NEXT: [[TMP212:%.*]] = extractelement <16 x i8> [[IN]], i64 10 -; OPT-NEXT: [[TMP213:%.*]] = insertelement <128 x i8> [[TMP211]], i8 [[TMP212]], i32 10 -; OPT-NEXT: [[TMP214:%.*]] = extractelement <16 x i8> [[IN]], i64 11 -; OPT-NEXT: [[TMP215:%.*]] = insertelement <128 x i8> [[TMP213]], i8 [[TMP214]], i32 11 -; OPT-NEXT: [[TMP216:%.*]] = extractelement <16 x i8> [[IN]], i64 12 -; OPT-NEXT: [[TMP217:%.*]] = insertelement <128 x i8> [[TMP215]], i8 [[TMP216]], i32 12 -; OPT-NEXT: [[TMP218:%.*]] = extractelement <16 x i8> [[IN]], i64 13 -; OPT-NEXT: [[TMP219:%.*]] = insertelement <128 x i8> [[TMP217]], i8 [[TMP218]], i32 13 -; OPT-NEXT: [[TMP220:%.*]] = extractelement <16 x i8> [[IN]], i64 14 -; OPT-NEXT: [[TMP221:%.*]] = insertelement <128 x i8> [[TMP219]], i8 [[TMP220]], i32 14 -; OPT-NEXT: [[TMP222:%.*]] = extractelement <16 x i8> [[IN]], i64 15 -; OPT-NEXT: [[TMP223:%.*]] = insertelement <128 x i8> [[TMP221]], i8 [[TMP222]], i32 15 -; OPT-NEXT: [[TMP224:%.*]] = extractelement <16 x i8> [[IN]], i64 0 -; OPT-NEXT: [[TMP225:%.*]] = insertelement <128 x i8> [[TMP223]], i8 [[TMP224]], i32 0 -; OPT-NEXT: [[TMP226:%.*]] = extractelement <16 x i8> [[IN]], i64 1 -; OPT-NEXT: [[TMP227:%.*]] = insertelement <128 x i8> [[TMP225]], i8 [[TMP226]], i32 1 -; OPT-NEXT: [[TMP228:%.*]] = extractelement <16 x i8> [[IN]], i64 2 -; OPT-NEXT: [[TMP229:%.*]] = insertelement <128 x i8> [[TMP227]], i8 [[TMP228]], i32 2 -; OPT-NEXT: [[TMP230:%.*]] = extractelement <16 x i8> [[IN]], i64 3 -; OPT-NEXT: [[TMP231:%.*]] = insertelement <128 x i8> [[TMP229]], i8 [[TMP230]], i32 3 -; OPT-NEXT: [[TMP232:%.*]] = extractelement <16 x i8> [[IN]], i64 4 -; OPT-NEXT: [[TMP233:%.*]] = insertelement <128 x i8> [[TMP231]], i8 [[TMP232]], i32 4 -; OPT-NEXT: [[TMP234:%.*]] = extractelement <16 x i8> [[IN]], i64 5 -; OPT-NEXT: [[TMP235:%.*]] = insertelement <128 x i8> [[TMP233]], i8 [[TMP234]], i32 5 -; OPT-NEXT: [[TMP236:%.*]] = extractelement <16 x i8> [[IN]], i64 6 -; OPT-NEXT: [[TMP237:%.*]] = insertelement <128 x i8> [[TMP235]], i8 [[TMP236]], i32 6 -; OPT-NEXT: [[TMP238:%.*]] = extractelement <16 x i8> [[IN]], i64 7 -; OPT-NEXT: [[TMP239:%.*]] = insertelement <128 x i8> [[TMP237]], i8 [[TMP238]], i32 7 -; OPT-NEXT: [[TMP240:%.*]] = extractelement <16 x i8> [[IN]], i64 8 -; OPT-NEXT: [[TMP241:%.*]] = insertelement <128 x i8> [[TMP239]], i8 [[TMP240]], i32 8 -; OPT-NEXT: [[TMP242:%.*]] = extractelement <16 x i8> [[IN]], i64 9 -; OPT-NEXT: [[TMP243:%.*]] = insertelement <128 x i8> [[TMP241]], i8 [[TMP242]], i32 9 -; OPT-NEXT: [[TMP244:%.*]] = extractelement <16 x i8> [[IN]], i64 10 -; OPT-NEXT: [[TMP245:%.*]] = insertelement <128 x i8> [[TMP243]], i8 [[TMP244]], i32 10 -; OPT-NEXT: [[TMP246:%.*]] = extractelement <16 x i8> [[IN]], i64 11 -; OPT-NEXT: 
[[TMP247:%.*]] = insertelement <128 x i8> [[TMP245]], i8 [[TMP246]], i32 11 -; OPT-NEXT: [[TMP248:%.*]] = extractelement <16 x i8> [[IN]], i64 12 -; OPT-NEXT: [[TMP249:%.*]] = insertelement <128 x i8> [[TMP247]], i8 [[TMP248]], i32 12 -; OPT-NEXT: [[TMP250:%.*]] = extractelement <16 x i8> [[IN]], i64 13 -; OPT-NEXT: [[TMP251:%.*]] = insertelement <128 x i8> [[TMP249]], i8 [[TMP250]], i32 13 -; OPT-NEXT: [[TMP252:%.*]] = extractelement <16 x i8> [[IN]], i64 14 -; OPT-NEXT: [[TMP253:%.*]] = insertelement <128 x i8> [[TMP251]], i8 [[TMP252]], i32 14 -; OPT-NEXT: [[TMP254:%.*]] = extractelement <16 x i8> [[IN]], i64 15 -; OPT-NEXT: [[TMP255:%.*]] = insertelement <128 x i8> [[TMP253]], i8 [[TMP254]], i32 15 -; OPT-NEXT: [[TMP256:%.*]] = extractelement <128 x i8> [[TMP255]], i32 80 -; OPT-NEXT: [[TMP257:%.*]] = insertelement <16 x i8> poison, i8 [[TMP256]], i64 0 -; OPT-NEXT: [[TMP258:%.*]] = extractelement <128 x i8> [[TMP255]], i32 81 -; OPT-NEXT: [[TMP259:%.*]] = insertelement <16 x i8> [[TMP257]], i8 [[TMP258]], i64 1 -; OPT-NEXT: [[TMP260:%.*]] = extractelement <128 x i8> [[TMP255]], i32 82 -; OPT-NEXT: [[TMP261:%.*]] = insertelement <16 x i8> [[TMP259]], i8 [[TMP260]], i64 2 -; OPT-NEXT: [[TMP262:%.*]] = extractelement <128 x i8> [[TMP255]], i32 83 -; OPT-NEXT: [[TMP263:%.*]] = insertelement <16 x i8> [[TMP261]], i8 [[TMP262]], i64 3 -; OPT-NEXT: [[TMP264:%.*]] = extractelement <128 x i8> [[TMP255]], i32 84 -; OPT-NEXT: [[TMP265:%.*]] = insertelement <16 x i8> [[TMP263]], i8 [[TMP264]], i64 4 -; OPT-NEXT: [[TMP266:%.*]] = extractelement <128 x i8> [[TMP255]], i32 85 -; OPT-NEXT: [[TMP267:%.*]] = insertelement <16 x i8> [[TMP265]], i8 [[TMP266]], i64 5 -; OPT-NEXT: [[TMP268:%.*]] = extractelement <128 x i8> [[TMP255]], i32 86 -; OPT-NEXT: [[TMP269:%.*]] = insertelement <16 x i8> [[TMP267]], i8 [[TMP268]], i64 6 -; OPT-NEXT: [[TMP270:%.*]] = extractelement <128 x i8> [[TMP255]], i32 87 -; OPT-NEXT: [[TMP271:%.*]] = insertelement <16 x i8> [[TMP269]], i8 [[TMP270]], i64 7 -; OPT-NEXT: [[TMP272:%.*]] = extractelement <128 x i8> [[TMP255]], i32 88 -; OPT-NEXT: [[TMP273:%.*]] = insertelement <16 x i8> [[TMP271]], i8 [[TMP272]], i64 8 -; OPT-NEXT: [[TMP274:%.*]] = extractelement <128 x i8> [[TMP255]], i32 89 -; OPT-NEXT: [[TMP275:%.*]] = insertelement <16 x i8> [[TMP273]], i8 [[TMP274]], i64 9 -; OPT-NEXT: [[TMP276:%.*]] = extractelement <128 x i8> [[TMP255]], i32 90 -; OPT-NEXT: [[TMP277:%.*]] = insertelement <16 x i8> [[TMP275]], i8 [[TMP276]], i64 10 -; OPT-NEXT: [[TMP278:%.*]] = extractelement <128 x i8> [[TMP255]], i32 91 -; OPT-NEXT: [[TMP279:%.*]] = insertelement <16 x i8> [[TMP277]], i8 [[TMP278]], i64 11 -; OPT-NEXT: [[TMP280:%.*]] = extractelement <128 x i8> [[TMP255]], i32 92 -; OPT-NEXT: [[TMP281:%.*]] = insertelement <16 x i8> [[TMP279]], i8 [[TMP280]], i64 12 -; OPT-NEXT: [[TMP282:%.*]] = extractelement <128 x i8> [[TMP255]], i32 93 -; OPT-NEXT: [[TMP283:%.*]] = insertelement <16 x i8> [[TMP281]], i8 [[TMP282]], i64 13 -; OPT-NEXT: [[TMP284:%.*]] = extractelement <128 x i8> [[TMP255]], i32 94 -; OPT-NEXT: [[TMP285:%.*]] = insertelement <16 x i8> [[TMP283]], i8 [[TMP284]], i64 14 -; OPT-NEXT: [[TMP286:%.*]] = extractelement <128 x i8> [[TMP255]], i32 95 -; OPT-NEXT: [[TMP287:%.*]] = insertelement <16 x i8> [[TMP285]], i8 [[TMP286]], i64 15 -; OPT-NEXT: [[SUM:%.*]] = add <16 x i8> [[TMP287]], [[ADD]] -; OPT-NEXT: store <16 x i8> [[SUM]], ptr addrspace(3) [[OUT]], align 16 -; OPT-NEXT: ret void -; -entry: - %alloca = alloca [8 x <16 x i8>], align 16, addrspace(5) - %gep0 = 
getelementptr [8 x <16 x i8>], ptr addrspace(5) %alloca, i64 0, i64 0 - store <16 x i8> %in, ptr addrspace(5) %gep0, align 16 - %gep1 = getelementptr [8 x <16 x i8>], ptr addrspace(5) %alloca, i64 0, i64 1 - store <16 x i8> %in, ptr addrspace(5) %gep0, align 16 - %gep2 = getelementptr [8 x <16 x i8>], ptr addrspace(5) %alloca, i64 0, i64 2 - store <16 x i8> %in, ptr addrspace(5) %gep0, align 16 - %gep3 = getelementptr [8 x <16 x i8>], ptr addrspace(5) %alloca, i64 0, i64 3 - store <16 x i8> %in, ptr addrspace(5) %gep0, align 16 - %gep4 = getelementptr [8 x <16 x i8>], ptr addrspace(5) %alloca, i64 0, i64 4 - store <16 x i8> %in, ptr addrspace(5) %gep0, align 16 - %gep5 = getelementptr [8 x <16 x i8>], ptr addrspace(5) %alloca, i64 0, i64 5 - store <16 x i8> %in, ptr addrspace(5) %gep0, align 16 - %gep6 = getelementptr [8 x <16 x i8>], ptr addrspace(5) %alloca, i64 0, i64 6 - store <16 x i8> %in, ptr addrspace(5) %gep0, align 16 - %gep7 = getelementptr [8 x <16 x i8>], ptr addrspace(5) %alloca, i64 0, i64 7 - store <16 x i8> %in, ptr addrspace(5) %gep0, align 16 - %load = load <16 x i8>, ptr addrspace(5) %gep5, align 16 - %sum = add <16 x i8> %load, %add - store <16 x i8> %sum, ptr addrspace(3) %out, align 16 - ret void -} - -attributes #0 = {"amdgpu-waves-per-eu"="2,2"} From 2a65fab965d60939ebf5d2ba9dab03ac3a69d846 Mon Sep 17 00:00:00 2001 From: Kai Nacke <kai.peter.nacke@ibm.com> Date: Tue, 4 Nov 2025 15:07:20 -0500 Subject: [PATCH 226/313] [z/OS] Improve compiler options on z/OS (#166415) `_XPLATFORM_SOURCE` needs to be defined to improve source code compatibility (e.g. for `O_CLOEXEC`). The define `_UNIX03_THREADS` can be removed, because it is automatically set by `_XOPEN_SOURCE=600`. See the documentation of feature test macros: https://www.ibm.com/docs/en/zos/3.1.0?topic=files-feature-test-macros Tested on z/OS 3.1 with the Open XL C/C++ 2.2 compiler. --- llvm/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/CMakeLists.txt b/llvm/CMakeLists.txt index c450ee5a3d72e..f192cd05b5a34 100644 --- a/llvm/CMakeLists.txt +++ b/llvm/CMakeLists.txt @@ -1264,10 +1264,10 @@ endif() # Build with _XOPEN_SOURCE on z/OS. if (CMAKE_SYSTEM_NAME MATCHES "OS390") add_compile_definitions(_XOPEN_SOURCE=600) + add_compile_definitions(_XPLATFORM_SOURCE) # Needed e.g. for O_CLOEXEC. add_compile_definitions(_OPEN_SYS) # Needed for process information. add_compile_definitions(_OPEN_SYS_FILE_EXT) # Needed for EBCDIC I/O. add_compile_definitions(_EXT) # Needed for file data. - add_compile_definitions(_UNIX03_THREADS) # Multithreading support. # Need to build LLVM as ASCII application. # This can't be a global setting because other projects may # need to be built in EBCDIC mode. From 718818a5cb4ce10aca8852e4d6675bb28ff4eacd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tuomas=20K=C3=A4rn=C3=A4?= <tuomas.karna@intel.com> Date: Tue, 4 Nov 2025 22:08:42 +0200 Subject: [PATCH 227/313] [MLIR][Linalg][Transform] Expose more args in VectorizeChildren[...] op's Python bindings (#166134) Expose missing boolean arguments in `VectorizeChildrenAndApplyPatternsOp` Python bindings. 
--- mlir/python/mlir/dialects/transform/structured.py | 4 ++++ mlir/test/python/dialects/transform_structured_ext.py | 8 ++++++++ 2 files changed, 12 insertions(+) diff --git a/mlir/python/mlir/dialects/transform/structured.py b/mlir/python/mlir/dialects/transform/structured.py index 14c7380e432f0..d9ab504f0de54 100644 --- a/mlir/python/mlir/dialects/transform/structured.py +++ b/mlir/python/mlir/dialects/transform/structured.py @@ -713,6 +713,8 @@ def __init__( disable_transfer_permutation_map_lowering_patterns: bool = False, vectorize_nd_extract: bool = False, vectorize_padding: bool = False, + flatten_1d_depthwise_conv: bool = False, + fold_type_extensions_into_contract: bool = False, loc=None, ip=None, ): @@ -722,8 +724,10 @@ def __init__( target, disable_multi_reduction_to_contract_patterns=disable_multi_reduction_to_contract_patterns, disable_transfer_permutation_map_lowering_patterns=disable_transfer_permutation_map_lowering_patterns, + flatten_1d_depthwise_conv=flatten_1d_depthwise_conv, vectorize_nd_extract=vectorize_nd_extract, vectorize_padding=vectorize_padding, + fold_type_extensions_into_contract=fold_type_extensions_into_contract, loc=loc, ip=ip, ) diff --git a/mlir/test/python/dialects/transform_structured_ext.py b/mlir/test/python/dialects/transform_structured_ext.py index d6b70dc9d1978..e58b7646316fc 100644 --- a/mlir/test/python/dialects/transform_structured_ext.py +++ b/mlir/test/python/dialects/transform_structured_ext.py @@ -627,12 +627,16 @@ def testVectorizeChildrenAndApplyPatternsAllAttrs(target): disable_transfer_permutation_map_lowering_patterns=True, vectorize_nd_extract=True, vectorize_padding=True, + flatten_1d_depthwise_conv=True, + fold_type_extensions_into_contract=True, ) # CHECK-LABEL: TEST: testVectorizeChildrenAndApplyPatternsAllAttrs # CHECK: transform.sequence # CHECK: = transform.structured.vectorize # CHECK-SAME: disable_multi_reduction_to_contract_patterns # CHECK-SAME: disable_transfer_permutation_map_lowering_patterns + # CHECK-SAME: flatten_1d_depthwise_conv + # CHECK-SAME: fold_type_extensions_into_contract # CHECK-SAME: vectorize_nd_extract # CHECK-SAME: vectorize_padding @@ -646,12 +650,16 @@ def testVectorizeChildrenAndApplyPatternsNoAttrs(target): disable_transfer_permutation_map_lowering_patterns=False, vectorize_nd_extract=False, vectorize_padding=False, + flatten_1d_depthwise_conv=False, + fold_type_extensions_into_contract=False, ) # CHECK-LABEL: TEST: testVectorizeChildrenAndApplyPatternsNoAttrs # CHECK: transform.sequence # CHECK: = transform.structured.vectorize # CHECK-NOT: disable_multi_reduction_to_contract_patterns # CHECK-NOT: disable_transfer_permutation_map_lowering_patterns + # CHECK-NOT: flatten_1d_depthwise_conv + # CHECK-NOT: fold_type_extensions_into_contract # CHECK-NOT: vectorize_nd_extract # CHECK-NOT: vectorize_padding From 6f91f588d98aab7a0deb9db76b7c44ae237ff4e0 Mon Sep 17 00:00:00 2001 From: Hans Wennborg <hans@hanshq.net> Date: Tue, 4 Nov 2025 21:19:32 +0100 Subject: [PATCH 228/313] build_llvm_release.bat: Put the 32-bit sanitizers back (#166437) Follow-up to bcb3d2f5122276ed9969fe2b2ef4428652800377. Even though 32-bit win/asan is not well supported, we shouldn't drop it without some discussion at least. Also, we probably shouldn't drop the other sanitizers that are gated by COMPILER_RT_BUILD_SANITIZERS. The tests no longer pass after switching to the runtimes build however (I believe that build mode runs more of the tests?), so disable them. 
--- llvm/utils/release/build_llvm_release.bat | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/llvm/utils/release/build_llvm_release.bat b/llvm/utils/release/build_llvm_release.bat index 0764c7af86c0a..3aeedaa3288a2 100644 --- a/llvm/utils/release/build_llvm_release.bat +++ b/llvm/utils/release/build_llvm_release.bat @@ -220,7 +220,6 @@ set "stage0_bin_dir=%build_dir%/build32_stage0/bin" set cmake_flags=^ %common_cmake_flags% ^ -DLLVM_ENABLE_RPMALLOC=OFF ^ - -DCOMPILER_RT_BUILD_SANITIZERS=OFF ^ -DPython3_ROOT_DIR=%PYTHONHOME% ^ -DLIBXML2_INCLUDE_DIR=%libxmldir%/include/libxml2 ^ -DLIBXML2_LIBRARIES=%libxmldir%/lib/libxml2s.lib @@ -230,7 +229,7 @@ ninja || ninja || ninja || exit /b 1 REM ninja check-llvm || ninja check-llvm || ninja check-llvm || exit /b 1 REM ninja check-clang || ninja check-clang || ninja check-clang || exit /b 1 ninja check-lld || ninja check-lld || ninja check-lld || exit /b 1 -ninja check-runtimes || ninja check-runtimes || ninja check-runtimes || exit /b 1 +REM ninja check-runtimes || ninja check-runtimes || ninja check-runtimes || exit /b 1 REM ninja check-clang-tools || ninja check-clang-tools || ninja check-clang-tools || exit /b 1 cd.. @@ -239,7 +238,6 @@ REM with forward slash. set all_cmake_flags=^ %cmake_flags% ^ -DLLVM_ENABLE_PROJECTS="clang;clang-tools-extra;lld;lldb;" ^ - -DCOMPILER_RT_BUILD_SANITIZERS=OFF ^ %common_lldb_flags% ^ -DPYTHON_HOME=%PYTHONHOME% ^ -DCMAKE_C_COMPILER=%stage0_bin_dir%/clang-cl.exe ^ @@ -256,7 +254,7 @@ ninja || ninja || ninja || exit /b 1 REM ninja check-llvm || ninja check-llvm || ninja check-llvm || exit /b 1 REM ninja check-clang || ninja check-clang || ninja check-clang || exit /b 1 ninja check-lld || ninja check-lld || ninja check-lld || exit /b 1 -ninja check-runtimes || ninja check-runtimes || ninja check-runtimes || exit /b 1 +REM ninja check-runtimes || ninja check-runtimes || ninja check-runtimes || exit /b 1 REM ninja check-clang-tools || ninja check-clang-tools || ninja check-clang-tools || exit /b 1 ninja package || exit /b 1 cd .. 
From 240fe7e8445eee0ee6bca9c537ede3fdcb820f24 Mon Sep 17 00:00:00 2001 From: Amr Hesham <amr96@programmer.net> Date: Tue, 4 Nov 2025 21:21:56 +0100 Subject: [PATCH 229/313] [CIR][NFC] EHScope & Cleanups Iterators and operators overloading (#165317) Upstream EHScope & Cleanup iterators, helpers and operator overloading as a prerequisite for #165158 Issue https://github.com/llvm/llvm-project/issues/154992 --- clang/lib/CIR/CodeGen/CIRGenCleanup.cpp | 8 +-- clang/lib/CIR/CodeGen/CIRGenCleanup.h | 71 ++++++++++++++++++++++--- clang/lib/CIR/CodeGen/EHScopeStack.h | 8 +++ 3 files changed, 78 insertions(+), 9 deletions(-) diff --git a/clang/lib/CIR/CodeGen/CIRGenCleanup.cpp b/clang/lib/CIR/CodeGen/CIRGenCleanup.cpp index 851328a7db680..437db306f3369 100644 --- a/clang/lib/CIR/CodeGen/CIRGenCleanup.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenCleanup.cpp @@ -147,8 +147,8 @@ void *EHScopeStack::pushCleanup(CleanupKind kind, size_t size) { assert(!cir::MissingFeatures::innermostEHScope()); - EHCleanupScope *scope = new (buffer) - EHCleanupScope(size, branchFixups.size(), innermostNormalCleanup); + EHCleanupScope *scope = new (buffer) EHCleanupScope( + size, branchFixups.size(), innermostNormalCleanup, innermostEHScope); if (isNormalCleanup) innermostNormalCleanup = stable_begin(); @@ -191,7 +191,9 @@ void EHScopeStack::popCleanup() { EHCatchScope *EHScopeStack::pushCatch(unsigned numHandlers) { char *buffer = allocate(EHCatchScope::getSizeForNumHandlers(numHandlers)); assert(!cir::MissingFeatures::innermostEHScope()); - EHCatchScope *scope = new (buffer) EHCatchScope(numHandlers); + EHCatchScope *scope = + new (buffer) EHCatchScope(numHandlers, innermostEHScope); + innermostEHScope = stable_begin(); return scope; } diff --git a/clang/lib/CIR/CodeGen/CIRGenCleanup.h b/clang/lib/CIR/CodeGen/CIRGenCleanup.h index 61a09a59b05c0..a035d792ef6d1 100644 --- a/clang/lib/CIR/CodeGen/CIRGenCleanup.h +++ b/clang/lib/CIR/CodeGen/CIRGenCleanup.h @@ -30,6 +30,8 @@ struct CatchTypeInfo { /// A protected scope for zero-cost EH handling. class EHScope { + EHScopeStack::stable_iterator enclosingEHScope; + class CommonBitFields { friend class EHScope; unsigned kind : 3; @@ -79,7 +81,10 @@ class EHScope { public: enum Kind { Cleanup, Catch, Terminate, Filter }; - EHScope(Kind kind) { commonBits.kind = kind; } + EHScope(Kind kind, EHScopeStack::stable_iterator enclosingEHScope) + : enclosingEHScope(enclosingEHScope) { + commonBits.kind = kind; + } Kind getKind() const { return static_cast<Kind>(commonBits.kind); } @@ -90,6 +95,10 @@ class EHScope { assert(!cir::MissingFeatures::ehstackBranches()); return false; } + + EHScopeStack::stable_iterator getEnclosingEHScope() const { + return enclosingEHScope; + } }; /// A scope which attempts to handle some, possibly all, types of @@ -111,6 +120,8 @@ class EHCatchScope : public EHScope { /// The catch handler for this type. 
mlir::Region *region; + + bool isCatchAll() const { return type.rtti == nullptr; } }; private: @@ -118,12 +129,18 @@ class EHCatchScope : public EHScope { Handler *getHandlers() { return reinterpret_cast<Handler *>(this + 1); } + const Handler *getHandlers() const { + return reinterpret_cast<const Handler *>(this + 1); + } + public: static size_t getSizeForNumHandlers(unsigned n) { return sizeof(EHCatchScope) + n * sizeof(Handler); } - EHCatchScope(unsigned numHandlers) : EHScope(Catch) { + EHCatchScope(unsigned numHandlers, + EHScopeStack::stable_iterator enclosingEHScope) + : EHScope(Catch, enclosingEHScope) { catchBits.numHandlers = numHandlers; assert(catchBits.numHandlers == numHandlers && "NumHandlers overflow?"); } @@ -136,6 +153,11 @@ class EHCatchScope : public EHScope { getHandlers()[i].region = region; } + const Handler &getHandler(unsigned i) const { + assert(i < getNumHandlers()); + return getHandlers()[i]; + } + // Clear all handler blocks. // FIXME: it's better to always call clearHandlerBlocks in DTOR and have a // 'takeHandler' or some such function which removes ownership from the @@ -144,6 +166,10 @@ class EHCatchScope : public EHScope { // The blocks are owned by TryOp, nothing to delete. } + using iterator = const Handler *; + iterator begin() const { return getHandlers(); } + iterator end() const { return getHandlers() + getNumHandlers(); } + static bool classof(const EHScope *scope) { return scope->getKind() == Catch; } @@ -176,9 +202,10 @@ class alignas(EHScopeStack::ScopeStackAlignment) EHCleanupScope } EHCleanupScope(unsigned cleanupSize, unsigned fixupDepth, - EHScopeStack::stable_iterator enclosingNormal) - : EHScope(EHScope::Cleanup), enclosingNormal(enclosingNormal), - fixupDepth(fixupDepth) { + EHScopeStack::stable_iterator enclosingNormal, + EHScopeStack::stable_iterator enclosingEH) + : EHScope(EHScope::Cleanup, enclosingEH), + enclosingNormal(enclosingNormal), fixupDepth(fixupDepth) { // TODO(cir): When exception handling is upstreamed, isNormalCleanup and // isEHCleanup will be arguments to the constructor. 
cleanupBits.isNormalCleanup = true; @@ -235,13 +262,45 @@ class EHScopeStack::iterator { EHScope *get() const { return reinterpret_cast<EHScope *>(ptr); } + EHScope *operator->() const { return get(); } EHScope &operator*() const { return *get(); } + + iterator &operator++() { + size_t size; + switch (get()->getKind()) { + case EHScope::Catch: + size = EHCatchScope::getSizeForNumHandlers( + static_cast<const EHCatchScope *>(get())->getNumHandlers()); + break; + + case EHScope::Filter: + llvm_unreachable("EHScopeStack::iterator Filter"); + break; + + case EHScope::Cleanup: + llvm_unreachable("EHScopeStack::iterator Cleanup"); + break; + + case EHScope::Terminate: + llvm_unreachable("EHScopeStack::iterator Terminate"); + break; + } + ptr += llvm::alignTo(size, ScopeStackAlignment); + return *this; + } + + bool operator==(iterator other) const { return ptr == other.ptr; } + bool operator!=(iterator other) const { return ptr != other.ptr; } }; inline EHScopeStack::iterator EHScopeStack::begin() const { return iterator(startOfData); } +inline EHScopeStack::iterator EHScopeStack::end() const { + return iterator(endOfBuffer); +} + inline EHScopeStack::iterator EHScopeStack::find(stable_iterator savePoint) const { assert(savePoint.isValid() && "finding invalid savepoint"); @@ -254,7 +313,7 @@ inline void EHScopeStack::popCatch() { assert(!empty() && "popping exception stack when not empty"); EHCatchScope &scope = llvm::cast<EHCatchScope>(*begin()); - assert(!cir::MissingFeatures::innermostEHScope()); + innermostEHScope = scope.getEnclosingEHScope(); deallocate(EHCatchScope::getSizeForNumHandlers(scope.getNumHandlers())); } diff --git a/clang/lib/CIR/CodeGen/EHScopeStack.h b/clang/lib/CIR/CodeGen/EHScopeStack.h index 4198c23c9cbed..9005b0106b2a4 100644 --- a/clang/lib/CIR/CodeGen/EHScopeStack.h +++ b/clang/lib/CIR/CodeGen/EHScopeStack.h @@ -155,6 +155,9 @@ class EHScopeStack { /// The innermost normal cleanup on the stack. stable_iterator innermostNormalCleanup = stable_end(); + /// The innermost EH scope on the stack. + stable_iterator innermostEHScope = stable_end(); + /// The CGF this Stack belong to CIRGenFunction *cgf = nullptr; @@ -226,6 +229,8 @@ class EHScopeStack { } stable_iterator getInnermostActiveNormalCleanup() const; + stable_iterator getInnermostEHScope() const { return innermostEHScope; } + /// An unstable reference to a scope-stack depth. Invalidated by /// pushes but not pops. class iterator; @@ -233,6 +238,9 @@ class EHScopeStack { /// Returns an iterator pointing to the innermost EH scope. iterator begin() const; + /// Returns an iterator pointing to the outermost EH scope. + iterator end() const; + /// Create a stable reference to the top of the EH stack. The /// returned reference is valid until that scope is popped off the /// stack. From 4f428d30e4d8287169fbc2acfcf37ca7b37ed539 Mon Sep 17 00:00:00 2001 From: Nick Sarnie <nick.sarnie@intel.com> Date: Wed, 5 Nov 2025 05:34:42 +0900 Subject: [PATCH 230/313] [clang][lit] Add SPIR-V to some OpenMP offload tests (#165775) Just to get a little more test coverage. 
Signed-off-by: Nick Sarnie <nick.sarnie@intel.com> --- clang/test/OpenMP/metadirective_ast_print.c | 36 ++++++++++++---- .../metadirective_device_arch_codegen.cpp | 28 ++++++++----- clang/test/OpenMP/thread_limit_amdgpu.c | 34 --------------- clang/test/OpenMP/thread_limit_gpu.c | 41 +++++++++++++++++++ 4 files changed, 86 insertions(+), 53 deletions(-) delete mode 100644 clang/test/OpenMP/thread_limit_amdgpu.c create mode 100644 clang/test/OpenMP/thread_limit_gpu.c diff --git a/clang/test/OpenMP/metadirective_ast_print.c b/clang/test/OpenMP/metadirective_ast_print.c index 638dbae1bc774..75ef5fa26827c 100644 --- a/clang/test/OpenMP/metadirective_ast_print.c +++ b/clang/test/OpenMP/metadirective_ast_print.c @@ -2,17 +2,25 @@ // RUN: %clang_cc1 -verify -fopenmp-simd -triple x86_64-unknown-linux-gnu -x c -std=c99 -ast-print %s -o - | FileCheck %s --check-prefix=DEFAULT -// RUN: %clang_cc1 -verify -fopenmp -triple amdgcn-amd-amdhsa -x c -std=c99 -ast-print %s -o - | FileCheck %s --check-prefix=DEFAULT-AMDGCN +// RUN: %clang_cc1 -verify -fopenmp -triple amdgcn-amd-amdhsa -x c -std=c99 -ast-print %s -o - | FileCheck %s --check-prefix=DEFAULT-GPU -// RUN: %clang_cc1 -verify -fopenmp-simd -triple amdgcn-amd-amdhsa -x c -std=c99 -ast-print %s -o - | FileCheck %s --check-prefix=DEFAULT-AMDGCN +// RUN: %clang_cc1 -verify -fopenmp-simd -triple amdgcn-amd-amdhsa -x c -std=c99 -ast-print %s -o - | FileCheck %s --check-prefix=DEFAULT-GPU + +// RUN: %clang_cc1 -verify -fopenmp -triple spirv64-intel -x c -std=c99 -ast-print %s -o - | FileCheck %s --check-prefix=DEFAULT-GPU + +// RUN: %clang_cc1 -verify -fopenmp-simd -triple spirv64-intel -x c -std=c99 -ast-print %s -o - | FileCheck %s --check-prefix=DEFAULT-GPU // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=52 -DOMP52 -triple x86_64-unknown-linux-gnu -x c -std=c99 -ast-print %s -o - | FileCheck %s --check-prefix=OMP52 // RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=52 -DOMP52 -triple x86_64-unknown-linux-gnu -x c -std=c99 -ast-print %s -o - | FileCheck %s --check-prefix=OMP52 -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=52 -DOMP52 -triple amdgcn-amd-amdhsa -x c -std=c99 -ast-print %s -o - | FileCheck %s --check-prefix=OMP52-AMDGCN +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=52 -DOMP52 -triple amdgcn-amd-amdhsa -x c -std=c99 -ast-print %s -o - | FileCheck %s --check-prefix=OMP52-GPU + +// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=52 -DOMP52 -triple amdgcn-amd-amdhsa -x c -std=c99 -ast-print %s -o - | FileCheck %s --check-prefix=OMP52-GPU + +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=52 -DOMP52 -triple spirv64-intel -x c -std=c99 -ast-print %s -o - | FileCheck %s --check-prefix=OMP52-GPU -// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=52 -DOMP52 -triple amdgcn-amd-amdhsa -x c -std=c99 -ast-print %s -o - | FileCheck %s --check-prefix=OMP52-AMDGCN +// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=52 -DOMP52 -triple spirv64-intel -x c -std=c99 -ast-print %s -o - | FileCheck %s --check-prefix=OMP52-GPU // expected-no-diagnostics #ifndef HEADER @@ -77,6 +85,12 @@ void foo1(void) { for (int i = 0; i < 100; i++) ; +#pragma omp metadirective when(device={arch("spirv64")}: \ + teams distribute parallel for)\ + otherwise(parallel for) + for (int i = 0; i < 100; i++) + ; + #pragma omp metadirective when(implementation = {extension(match_all)} \ : nothing) otherwise(parallel for) for (int i = 0; i < 16; i++) @@ -134,8 +148,8 @@ void foo1(void) { // OMP52-NEXT: for (int i = 0; i < 16; i++) { // 
OMP52-NEXT: #pragma omp simd // OMP52-NEXT: for (int j = 0; j < 16; j++) -// OMP52-AMDGCN: #pragma omp teams distribute parallel for -// OMP52-AMDGCN-NEXT: for (int i = 0; i < 100; i++) +// OMP52-GPU: #pragma omp teams distribute parallel for +// OMP52-GPU-NEXT: for (int i = 0; i < 100; i++) // OMP52: for (int i = 0; i < 16; i++) // OMP52: for (int i = 0; i < 16; i++) @@ -198,6 +212,12 @@ void foo2(void) { for (int i = 0; i < 100; i++) ; +#pragma omp metadirective when(device={arch("spirv64")}: \ + teams distribute parallel for)\ + default(parallel for) + for (int i = 0; i < 100; i++) + ; + #pragma omp metadirective when(implementation = {extension(match_all)} \ : nothing) default(parallel for) for (int i = 0; i < 16; i++) @@ -266,8 +286,8 @@ void foo2(void) { // DEFAULT-NEXT: for (int i = 0; i < 16; i++) { // DEFAULT-NEXT: #pragma omp simd // DEFAULT-NEXT: for (int j = 0; j < 16; j++) -// DEFAULT-AMDGCN: #pragma omp teams distribute parallel for -// DEFAULT-AMDGCN-NEXT: for (int i = 0; i < 100; i++) +// DEFAULT-GPU: #pragma omp teams distribute parallel for +// DEFAULT-GPU-NEXT: for (int i = 0; i < 100; i++) // DEFAULT: for (int i = 0; i < 16; i++) // DEFAULT: for (int i = 0; i < 16; i++) diff --git a/clang/test/OpenMP/metadirective_device_arch_codegen.cpp b/clang/test/OpenMP/metadirective_device_arch_codegen.cpp index eecae310d0a77..1d5584de67162 100644 --- a/clang/test/OpenMP/metadirective_device_arch_codegen.cpp +++ b/clang/test/OpenMP/metadirective_device_arch_codegen.cpp @@ -1,7 +1,7 @@ -// REQUIRES: amdgpu-registered-target - // RUN: %clang_cc1 -fopenmp -x c++ -w -std=c++11 -triple x86_64-unknown-unknown -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm-bc %s -o %t-ppc-host.bc // RUN: %clang_cc1 -fopenmp -x c++ -w -std=c++11 -triple amdgcn-amd-amdhsa -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -target-cpu gfx906 -o - | FileCheck %s +// RUN: %clang_cc1 -fopenmp -x c++ -w -std=c++11 -triple x86_64-unknown-unknown -fopenmp-targets=spirv64-intel -emit-llvm-bc %s -o %t-ppc-spirv-host.bc +// RUN: %clang_cc1 -fopenmp -x c++ -w -std=c++11 -triple spirv64-intel -fopenmp-targets=spirv64-intel -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-spirv-host.bc -o - | FileCheck %s // expected-no-diagnostics @@ -16,6 +16,12 @@ Inspired from SOLLVE tests: #define N 1024 +#ifdef __AMDGPU__ +#define GPU "amdgcn" +#else +#define GPU "spirv64" +#endif + int metadirective1() { int v1[N], v2[N], v3[N]; @@ -26,7 +32,7 @@ int metadirective1() { #pragma omp target map(to:v1,v2) map(from:v3, target_device_num) device(default_device) { #pragma omp metadirective \ - when(device={arch("amdgcn")}: teams distribute parallel for) \ + when(device={arch(GPU)}: teams distribute parallel for) \ default(parallel for) for (int i = 0; i < N; i++) { @@ -38,28 +44,28 @@ int metadirective1() { return errors; } -// CHECK: define weak_odr protected amdgpu_kernel void @[[METADIRECTIVE:.+metadirective1[a-z0-9_]+]] +// CHECK: define weak_odr protected {{amdgpu|spir}}_kernel void @[[METADIRECTIVE:.+metadirective1[a-z0-9_]+]] // CHECK: entry: -// CHECK: %{{[0-9]}} = call i32 @__kmpc_target_init +// CHECK: %{{[0-9]}} = call{{.*}} i32 @__kmpc_target_init // CHECK: user_code.entry: -// CHECK: call void @[[METADIRECTIVE]]_omp_outlined -// CHECK-NOT: call void @__kmpc_parallel_51 +// CHECK: call{{.*}} void @[[METADIRECTIVE]]_omp_outlined +// CHECK-NOT: call{{.*}} void @__kmpc_parallel_51 // CHECK: ret void // CHECK: define internal void 
@[[METADIRECTIVE]]_omp_outlined // CHECK: entry: -// CHECK: call void @__kmpc_distribute_static_init +// CHECK: call{{.*}} void @__kmpc_distribute_static_init // CHECK: omp.loop.exit: -// CHECK: call void @__kmpc_distribute_static_fini +// CHECK: call{{.*}} void @__kmpc_distribute_static_fini // CHECK: define internal void @[[METADIRECTIVE]]_omp_outlined_omp_outlined // CHECK: entry: -// CHECK: call void @__kmpc_for_static_init_4 +// CHECK: call{{.*}} void @__kmpc_for_static_init_4 // CHECK: omp.inner.for.body: // CHECK: store atomic {{.*}} monotonic // CHECK: omp.loop.exit: -// CHECK-NEXT: call void @__kmpc_for_static_fini +// CHECK-NEXT: call{{.*}} void @__kmpc_for_static_fini // CHECK-NEXT: ret void diff --git a/clang/test/OpenMP/thread_limit_amdgpu.c b/clang/test/OpenMP/thread_limit_amdgpu.c deleted file mode 100644 index f884eeb73c3ff..0000000000000 --- a/clang/test/OpenMP/thread_limit_amdgpu.c +++ /dev/null @@ -1,34 +0,0 @@ -// Test target codegen - host bc file has to be created first. -// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple x86_64-unknown-linux-gnu -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm-bc %s -o %t-x86-host.bc -// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple amdgcn-amd-amdhsa -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s -// expected-no-diagnostics - -#ifndef HEADER -#define HEADER - -void foo(int N) { -#pragma omp target teams distribute parallel for simd - for (int i = 0; i < N; ++i) - ; -#pragma omp target teams distribute parallel for simd thread_limit(4) - for (int i = 0; i < N; ++i) - ; -#pragma omp target teams distribute parallel for simd ompx_attribute(__attribute__((launch_bounds(42, 42)))) - for (int i = 0; i < N; ++i) - ; -#pragma omp target teams distribute parallel for simd ompx_attribute(__attribute__((launch_bounds(42, 42)))) num_threads(22) - for (int i = 0; i < N; ++i) - ; -} - -#endif - -// CHECK: define weak_odr protected amdgpu_kernel void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+__Z3fooi_}}l10({{.*}}) #[[ATTR1:.+]] { -// CHECK: define weak_odr protected amdgpu_kernel void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+__Z3fooi_}}l13({{.*}}) #[[ATTR2:.+]] { -// CHECK: define weak_odr protected amdgpu_kernel void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+__Z3fooi_}}l16({{.*}}) #[[ATTR3:.+]] { -// CHECK: define weak_odr protected amdgpu_kernel void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+__Z3fooi_}}l19({{.*}}) #[[ATTR4:.+]] { - -// CHECK: attributes #[[ATTR1]] = { {{.*}} "amdgpu-flat-work-group-size"="1,256" {{.*}} } -// CHECK: attributes #[[ATTR2]] = { {{.*}} "amdgpu-flat-work-group-size"="1,4" {{.*}} } -// CHECK: attributes #[[ATTR3]] = { {{.*}} "amdgpu-flat-work-group-size"="1,42" "amdgpu-max-num-workgroups"="42,1,1"{{.*}} } -// CHECK: attributes #[[ATTR4]] = { {{.*}} "amdgpu-flat-work-group-size"="1,22" "amdgpu-max-num-workgroups"="42,1,1"{{.*}} } diff --git a/clang/test/OpenMP/thread_limit_gpu.c b/clang/test/OpenMP/thread_limit_gpu.c new file mode 100644 index 0000000000000..4bcc14d070c22 --- /dev/null +++ b/clang/test/OpenMP/thread_limit_gpu.c @@ -0,0 +1,41 @@ +// Test target codegen - host bc file has to be created first. 
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple x86_64-unknown-linux-gnu -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm-bc %s -o %t-x86-host.bc +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple amdgcn-amd-amdhsa -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck -check-prefixes=CHECK,CHECK-AMDGPU %s +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple x86_64-unknown-linux-gnu -fopenmp-targets=spirv64-intel -emit-llvm-bc %s -o %t-x86-spirv-host.bc +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple spirv64-intel -fopenmp-targets=spirv64-intel -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-x86-spirv-host.bc -o - | FileCheck -check-prefixes=CHECK,CHECK-SPIRV %s +// expected-no-diagnostics + +#ifndef HEADER +#define HEADER + +void foo(int N) { +#pragma omp target teams distribute parallel for simd + for (int i = 0; i < N; ++i) + ; +#pragma omp target teams distribute parallel for simd thread_limit(4) + for (int i = 0; i < N; ++i) + ; +#pragma omp target teams distribute parallel for simd ompx_attribute(__attribute__((launch_bounds(42, 42)))) + for (int i = 0; i < N; ++i) + ; +#pragma omp target teams distribute parallel for simd ompx_attribute(__attribute__((launch_bounds(42, 42)))) num_threads(22) + for (int i = 0; i < N; ++i) + ; +} + +#endif + +// CHECK: define weak_odr protected {{amdgpu|spir}}_kernel void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+__Z3fooi_}}l12({{.*}}) #[[ATTR1:.+]] { +// CHECK: define weak_odr protected {{amdgpu|spir}}_kernel void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+__Z3fooi_}}l15({{.*}}) #[[ATTR2:.+]] { +// CHECK: define weak_odr protected {{amdgpu|spir}}_kernel void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+__Z3fooi_}}l18({{.*}}) #[[ATTR3:.+]] { +// CHECK: define weak_odr protected {{amdgpu|spir}}_kernel void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+__Z3fooi_}}l21({{.*}}) #[[ATTR4:.+]] { + +// CHECK-AMDGPU: attributes #[[ATTR1]] = { {{.*}} "amdgpu-flat-work-group-size"="1,256" {{.*}} } +// CHECK-AMDGPU: attributes #[[ATTR2]] = { {{.*}} "amdgpu-flat-work-group-size"="1,4" {{.*}} } +// CHECK-AMDGPU: attributes #[[ATTR3]] = { {{.*}} "amdgpu-flat-work-group-size"="1,42" "amdgpu-max-num-workgroups"="42,1,1"{{.*}} } +// CHECK-AMDGPU: attributes #[[ATTR4]] = { {{.*}} "amdgpu-flat-work-group-size"="1,22" "amdgpu-max-num-workgroups"="42,1,1"{{.*}} } + +// CHECK-SPIRV: attributes #[[ATTR1]] = { {{.*}} "omp_target_thread_limit"="256" {{.*}} } +// CHECK-SPIRV: attributes #[[ATTR2]] = { {{.*}} "omp_target_thread_limit"="4" {{.*}} } +// CHECK-SPIRV: attributes #[[ATTR3]] = { {{.*}} "omp_target_num_teams"="42" "omp_target_thread_limit"="42" {{.*}} } +// CHECK-SPIRV: attributes #[[ATTR4]] = { {{.*}} "omp_target_num_teams"="42" "omp_target_thread_limit"="22" {{.*}} } From 2141edf506baab7e526f3a305bcdb6d6f2c772bc Mon Sep 17 00:00:00 2001 From: Adrian Prantl <aprantl@apple.com> Date: Tue, 4 Nov 2025 12:53:03 -0800 Subject: [PATCH 231/313] [lldb] Skip tests on older versions of clang --- .../TestLibcxxInternalsRecognizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lldb/test/API/lang/cpp/libcxx-internals-recognizer/TestLibcxxInternalsRecognizer.py b/lldb/test/API/lang/cpp/libcxx-internals-recognizer/TestLibcxxInternalsRecognizer.py index d8a729b322fe4..eeb5d1b554b01 100644 --- a/lldb/test/API/lang/cpp/libcxx-internals-recognizer/TestLibcxxInternalsRecognizer.py +++ b/lldb/test/API/lang/cpp/libcxx-internals-recognizer/TestLibcxxInternalsRecognizer.py @@ -9,7 +9,7 
@@ class LibCxxInternalsRecognizerTestCase(TestBase):
     NO_DEBUG_INFO_TESTCASE = True
 
     @add_test_categories(["libc++"])
-    @skipIf(compiler="clang", compiler_version=["<", "19.0"])
+    @skipIf(compiler="clang", compiler_version=["<", "21.0"])
     def test_frame_recognizer(self):
         """Test that implementation details of libc++ are hidden"""
         self.build()

From 9703bda95b088bb6a455ef9faffdb41c537aff2f Mon Sep 17 00:00:00 2001
From: Charitha Saumya <136391709+charithaintc@users.noreply.github.com>
Date: Tue, 4 Nov 2025 13:15:32 -0800
Subject: [PATCH 232/313] [mlir][xegpu] Add OptimizeBlockLoads pass. (#165483)

This pass rewrites certain xegpu `CreateNd` and `LoadNd` operations that
feed into `vector.transpose` into a more optimal form to improve
performance. Specifically, low-precision (bitwidth < 32) `LoadNd` ops that
feed into transpose ops are rewritten to i32 loads with a valid transpose
layout so that later passes can use the hardware's load-with-transpose
feature to accelerate such loads.

**Update:** The pass is renamed to `OptimizeBlockLoads` because we later
plan to add the array-length optimization to this pass as well. That
optimization will break down a larger load (like `32x32xf16`) into more
DPAS-favorable array-length loads (`32x16xf16` with array length = 2). Both
optimizations require rewriting `CreateNd` and `LoadNd`, so it makes sense
to have a common pass for both.
---
 .../mlir/Dialect/XeGPU/Transforms/Passes.td   |  12 +
 .../Dialect/XeGPU/Transforms/Transforms.h     |   3 +-
 .../mlir/Dialect/XeGPU/Utils/XeGPUUtils.h     |   9 +
 .../Dialect/XeGPU/Transforms/CMakeLists.txt   |   1 +
 .../Transforms/XeGPUOptimizeBlockLoads.cpp    | 490 ++++++++++++++++++
 .../XeGPU/Transforms/XeGPUPropagateLayout.cpp |  36 +-
 mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp   |  26 +
 .../Dialect/XeGPU/optimize-transpose.mlir     | 280 ++++++++++
 8 files changed, 827 insertions(+), 30 deletions(-)
 create mode 100644 mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeBlockLoads.cpp
 create mode 100644 mlir/test/Dialect/XeGPU/optimize-transpose.mlir

diff --git a/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td b/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td
index eb05628d4772b..e42799689e490 100644
--- a/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td
+++ b/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td
@@ -85,4 +85,16 @@ def XeGPUVectorLinearize : Pass<"xegpu-vector-linearize"> {
                            "scf::SCFDialect", "ub::UBDialect", "vector::VectorDialect"];
 }
 
+def XeGPUOptimizeBlockLoads : Pass<"xegpu-optimize-block-loads"> {
+  let summary = "Optimize XeGPU block load operations";
+  let description = [{
+    This pass rewrites XeGPU loadNd operations into more optimal forms
+    to improve performance. This includes:
+    - Rewriting transpose B loads into more optimal forms to use HW block
+      transpose instructions for better performance.
+  }];
+  let dependentDialects = ["memref::MemRefDialect", "xegpu::XeGPUDialect",
+                           "vector::VectorDialect"];
+}
+
 #endif // MLIR_DIALECT_XEGPU_TRANSFORMS_PASSES_TD

diff --git a/mlir/include/mlir/Dialect/XeGPU/Transforms/Transforms.h b/mlir/include/mlir/Dialect/XeGPU/Transforms/Transforms.h
index a480195eebd00..1776a209d0bf1 100644
--- a/mlir/include/mlir/Dialect/XeGPU/Transforms/Transforms.h
+++ b/mlir/include/mlir/Dialect/XeGPU/Transforms/Transforms.h
@@ -61,7 +61,8 @@ struct UnrollOptions {
 /// Appends patterns for folding aliasing ops into XeGPU ops into `patterns`.
 void populateXeGPUFoldAliasOpsPatterns(RewritePatternSet &patterns);
-
+/// Appends patterns for optimizing block load operations into `patterns`.
+void populateXeGPUOptimizeBlockLoadsPatterns(RewritePatternSet &patterns);
 /// Appends patterns for XeGPU SIMT distribution into `patterns`.
 void populateXeGPUSubgroupDistributePatterns(RewritePatternSet &patterns);
 /// Appends patterns for moving function body into gpu.warp_execute_on_lane0 op.

diff --git a/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h b/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h
index 620a2fe43d682..58092c3bb9ed2 100644
--- a/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h
+++ b/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h
@@ -166,6 +166,15 @@ SmallVector<OpFoldResult> addElementwise(OpBuilder &builder, Location loc,
 SmallVector<OpFoldResult> addWithRightAligned(OpBuilder &builder, Location loc,
                                               ArrayRef<OpFoldResult> lhs,
                                               ArrayRef<OpFoldResult> rhs);
+
+/// Helper Function to find a proper instruction multiple for the user-supplied
+/// sg-level data shape (given by `dim`). `candidates` are uArch allowed shapes.
+/// `candidateMultiples` are uArch multiples of such shapes (i.e. block count or
+/// array length).
+template <typename T>
+int getLargestDivisor(T dim, ArrayRef<T> candidates,
+                      ArrayRef<T> candidateMultiples = {});
+
 } // namespace xegpu
 } // namespace mlir

diff --git a/mlir/lib/Dialect/XeGPU/Transforms/CMakeLists.txt b/mlir/lib/Dialect/XeGPU/Transforms/CMakeLists.txt
index e6f76067094ce..29b645feab2c6 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/CMakeLists.txt
+++ b/mlir/lib/Dialect/XeGPU/Transforms/CMakeLists.txt
@@ -6,6 +6,7 @@ add_mlir_dialect_library(MLIRXeGPUTransforms
   XeGPUWgToSgDistribute.cpp
   XeGPUPropagateLayout.cpp
   XeGPUVectorLinearize.cpp
+  XeGPUOptimizeBlockLoads.cpp
 
   ADDITIONAL_HEADER_DIRS
   ${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/XeGPU

diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeBlockLoads.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeBlockLoads.cpp
new file mode 100644
index 0000000000000..4dc5ea4f7bb24
--- /dev/null
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeBlockLoads.cpp
@@ -0,0 +1,490 @@
+//===- XeGPUOptimizeBlockLoads.cpp - XeGPU optimize block loads -*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/GPU/IR/GPUDialect.h"
+#include "mlir/Dialect/MemRef/IR/MemRef.h"
+#include "mlir/Dialect/SCF/Transforms/Patterns.h"
+#include "mlir/Dialect/Utils/IndexingUtils.h"
+#include "mlir/Dialect/Utils/StaticValueUtils.h"
+#include "mlir/Dialect/Vector/IR/VectorOps.h"
+#include "mlir/Dialect/XeGPU/IR/XeGPU.h"
+#include "mlir/Dialect/XeGPU/Transforms/Passes.h"
+#include "mlir/Dialect/XeGPU/Transforms/Transforms.h"
+#include "mlir/Dialect/XeGPU/Utils/XeGPUUtils.h"
+#include "mlir/Dialect/XeGPU/uArch/IntelGpuXe2.h"
+#include "mlir/Dialect/XeGPU/uArch/uArchBase.h"
+#include "mlir/IR/BuiltinTypes.h"
+#include "mlir/IR/OpDefinition.h"
+#include "mlir/IR/Types.h"
+#include "mlir/IR/Value.h"
+#include "mlir/Transforms/DialectConversion.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include <optional>
+
+namespace mlir {
+namespace xegpu {
+#define GEN_PASS_DEF_XEGPUOPTIMIZEBLOCKLOADS
+#include "mlir/Dialect/XeGPU/Transforms/Passes.h.inc"
+} // namespace xegpu
+} // namespace mlir
+
+#define DEBUG_TYPE "xegpu-optimize-block-loads"
+#define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ")
+
+using namespace mlir;
+
+namespace {
+
+/// Get the 2D lane data from a tensor desc type if it exists.
+static std::optional<SmallVector<int64_t>>
+getMaybeLaneData(xegpu::TensorDescType tdescType) {
+  auto layout = tdescType.getLayoutAttr();
+  if (!layout)
+    return std::nullopt;
+  auto laneData = layout.getEffectiveLaneDataAsInt();
+  if (laneData.size() != 2)
+    return std::nullopt;
+  return laneData;
+}
+
+/// Get the 2D lane layout from a tensor desc type if it exists.
+static std::optional<SmallVector<int64_t>>
+getMaybeLaneLayout(xegpu::TensorDescType tdescType) {
+  auto layout = tdescType.getLayoutAttr();
+  if (!layout)
+    return std::nullopt;
+  auto laneLayout = layout.getEffectiveLaneLayoutAsInt();
+  if (laneLayout.size() != 2)
+    return std::nullopt;
+  return laneLayout;
+}
+
+/// A layout can be optimized if its lane layout is transposed (lane[0] != 1 &&
+/// lane[1] == 1), but inner lane data is not equal to [1, 1].
+/// Example:
+/// !xegpu.tensor_desc<16x16xf16,
+///   #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>>
+/// In this case, the lane layout is transposed (from the usual [1, SG_SIZE]
+/// form), indicating that this is a load that requires a transpose effect.
+/// However, lane data is [1, 2], meaning that each lane must grab 2 f16
+/// elements from the inner dimension. We convert this to an optimized form by
+/// converting the tensor_desc to i32 type such that lane data becomes [1, 1].
+/// This lets the later lowering easily use the load with transpose
+/// instruction.
+static bool canBeOptimizedForTranspose(ArrayRef<int64_t> laneLayout,
+                                       ArrayRef<int64_t> laneData) {
+  if (laneLayout.size() != 2 || laneData.size() != 2)
+    return false;
+  if (laneLayout[0] == 1 || laneLayout[1] != 1)
+    return false;
+  if (laneData[0] != 1 || laneData[1] == 1)
+    return false;
+  return true;
+}
+
+/// A tensor desc type can be optimized if its element type is less than 32
+/// bits and its layout can be optimized.
+static bool canBeOptimizedForTranspose(xegpu::TensorDescType tdescType) {
+  // If the dtype is greater than or equal to 32 bits, the layout must already
+  // be valid.
+ int elementTyBitwidth = tdescType.getElementType().getIntOrFloatBitWidth(); + if (elementTyBitwidth >= 32) + return false; + auto maybeLaneLayout = getMaybeLaneLayout(tdescType); + auto maybeLaneData = getMaybeLaneData(tdescType); + if (!maybeLaneData || !maybeLaneLayout) + return false; + return canBeOptimizedForTranspose(*maybeLaneLayout, *maybeLaneData); +} + +/// Check if a tensor desc type can be optimized for transpose, if so return the +/// new optimized tensor desc type with a valid transpose layout. +static xegpu::TensorDescType tryOptimize(xegpu::TensorDescType tdescType, + const uArch *targetuArch) { + if (!canBeOptimizedForTranspose(tdescType)) + return tdescType; + auto laneData = getMaybeLaneData(tdescType) + .value(); // Lane data must exist if we reach here. + int64_t innerLaneData = laneData[1]; + int elementTyBitwidth = tdescType.getElementType().getIntOrFloatBitWidth(); + // Required shape is total shape of the vector result that this tensor desc + // must eventually load after adjusting for the new bitwidth and array + // length. + SmallVector<int64_t> requiredShape(tdescType.getShape()); + requiredShape.back() = + requiredShape.back() * tdescType.getArrayLength() / innerLaneData; + int newBitWidth = elementTyBitwidth * innerLaneData; + Type newElemTy = IntegerType::get(tdescType.getContext(), newBitWidth); + // Supported shape is the max transpose shape that can be supported by + // hardware that is less than or equal to required shape. + auto *blockLoadTarget = dyn_cast<Subgroup2DBlockLoadInstruction>( + targetuArch->getInstruction(InstructionKind::Subgroup2DBlockLoad)); + auto maybeHWParams = blockLoadTarget->getBlockWidthHeightCount( + newElemTy, /** has transform */ false, /** has transpose */ true); + // If no HW params found, return the original type. + if (!maybeHWParams) + return tdescType; + auto [widths, heights, counts] = maybeHWParams.value(); + // TODO: Currently we expect array length to be 1 for transpose case. + if (counts.size() != 1 || counts[0] != 1) + return tdescType; + int arrayLen = counts[0]; + int supportedHeight = + xegpu::getLargestDivisor(static_cast<int>(requiredShape[0]), heights); + int supportedWidth = + xegpu::getLargestDivisor(static_cast<int>(requiredShape[1]), widths); + // If no supported height or width found, return the original type. + if (supportedHeight == -1 || supportedWidth == -1) + return tdescType; + + SmallVector<int64_t> supportedShape = {supportedHeight, supportedWidth}; + xegpu::LayoutAttr newLayout = xegpu::LayoutAttr::get( + tdescType.getContext(), + tdescType.getLayoutAttr().getLaneLayout().asArrayRef(), {1, 1}); + // Array length can not be larger than 1 for transpose case. + return xegpu::TensorDescType::get(supportedShape, newElemTy, arrayLen, + tdescType.getBoundaryCheck(), + tdescType.getMemorySpace(), newLayout); +} + +/// Helper to convert an OpFoldResult to Value. +static Value convertToValue(ConversionPatternRewriter &rewriter, Location loc, + OpFoldResult ofr) { + std::optional<int64_t> mayBeInt = getConstantIntValue(ofr); + if (mayBeInt) + return arith::ConstantIndexOp::create(rewriter, loc, *mayBeInt).getResult(); + return llvm::cast<Value>(ofr); +} + +/// Helper to divide a Value by a constant integer. +static Value divideByConstant(ConversionPatternRewriter &rewriter, Location loc, + Value val, int64_t constant) { + // If the constant is a power of 2, use right shift for division. 
+ if (llvm::isPowerOf2_64(constant)) { + int64_t shiftAmount = llvm::Log2_64(constant); + return arith::ShRUIOp::create( + rewriter, loc, val, + arith::ConstantIndexOp::create(rewriter, loc, shiftAmount) + .getResult()) + .getResult(); + } + auto constantOp = + arith::ConstantIndexOp::create(rewriter, loc, constant).getResult(); + return arith::DivUIOp::create(rewriter, loc, val, constantOp).getResult(); +} + +/// This function takes a larger register block `data` and generates multiple +/// smaller loads (size given by `newTensorDesc`) to fill in the `data` block +/// starting from `offsets`. +static Value generateLoads(ConversionPatternRewriter &rewriter, + TypedValue<VectorType> data, + SmallVector<OpFoldResult> offsets, + TypedValue<xegpu::TensorDescType> newTensorDesc, + xegpu::LoadNdOp origLoadOp) { + Location loc = data.getLoc(); + assert(offsets.size() >= 2 && "Expecting at least 2 offsets for 2D LoadNdOp"); + Value offsetDim0 = convertToValue(rewriter, loc, offsets[offsets.size() - 2]); + Value offsetDim1 = convertToValue(rewriter, loc, offsets[offsets.size() - 1]); + SmallVector<int64_t> supportedShape(newTensorDesc.getType().getShape()); + // Compute the ratio between original shape and supported shape. We need to + // generate loads in this ratio arrangement. + auto shapeRatio = computeShapeRatio(data.getType().getShape(), + supportedShape) + .value(); // `ratio` must be defined if we reach here. + for (int64_t h = 0; h < shapeRatio[0]; ++h) { + for (int64_t w = 0; w < shapeRatio[1]; ++w) { + int64_t localOffsetDim0 = h * supportedShape[0]; + int64_t localOffsetDim1 = w * supportedShape[1]; + Value loadOffsetX = arith::AddIOp::create( + rewriter, loc, offsetDim0, + arith::ConstantIndexOp::create(rewriter, loc, localOffsetDim0) + .getResult()); + Value loadOffsetY = arith::AddIOp::create( + rewriter, loc, offsetDim1, + arith::ConstantIndexOp::create(rewriter, loc, localOffsetDim1) + .getResult()); + auto loadOp = xegpu::LoadNdOp::create( + rewriter, loc, + VectorType::get(supportedShape, data.getType().getElementType()), + newTensorDesc, ArrayRef<OpFoldResult>{loadOffsetX, loadOffsetY}, + origLoadOp.getPackedAttr(), origLoadOp.getTransposeAttr(), + origLoadOp.getL1HintAttr(), origLoadOp.getL2HintAttr(), + origLoadOp.getL3HintAttr()); + // Set the layout for the loadOp. + auto layoutAttr = newTensorDesc.getType().getLayoutAttr(); + xegpu::setDistributeLayoutAttr(loadOp->getOpResult(0), layoutAttr); + // Insert the loaded block into the right position in data. + auto insertOp = vector::InsertStridedSliceOp::create( + rewriter, loc, loadOp.getResult(), data, + ArrayRef<int64_t>{localOffsetDim0, localOffsetDim1}, + ArrayRef<int64_t>{1, 1}); + // InsertOp must have the same layout as newTensorDesc. + xegpu::setDistributeLayoutAttr(insertOp->getOpResult(0), layoutAttr); + data = insertOp.getResult(); + } + } + return data; +} + +/// Checks if a CreateNdDescOp can be optimized for transpose, if so creates a +/// new CreateNdDescOp with optimized tensor desc type. This involves extracting +/// the base pointer from the original memory source and adjusting the shape and +/// strides of the tensor desc to fit with the new optimized transpose layout. 
+class XeGPUCreateNdDescOpPattern final
+    : public OpConversionPattern<xegpu::CreateNdDescOp> {
+public:
+  using OpConversionPattern<xegpu::CreateNdDescOp>::OpConversionPattern;
+  LogicalResult
+  matchAndRewrite(xegpu::CreateNdDescOp createNdOp, OpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    auto tdescTy = createNdOp.getType();
+    // Get the target uArch info.
+    auto chipStr = xegpu::getChipStr(createNdOp);
+    // Check if the chip is supported.
+    assert(
+        chipStr && (chipStr.value() == "pvc" || chipStr.value() == "bmg") &&
+        "Expecting target chip to be pvc or bmg for transpose optimization.");
+    const uArch *targetuArch = xegpu::uArch::getUArch(chipStr.value());
+
+    auto convertType = tryOptimize(tdescTy, targetuArch);
+    if (convertType == tdescTy)
+      return failure();
+    auto strides = createNdOp.getMixedStrides();
+    auto maybeConstInnerStride = getConstantIntValue(strides.back());
+    // Only row-major memrefs are expected for now.
+    if (!maybeConstInnerStride || *maybeConstInnerStride != 1)
+      return rewriter.notifyMatchFailure(
+          createNdOp, "Expecting row-major memref for transpose optimization.");
+    Value source = createNdOp.getSource();
+    auto optionalLaneData = getMaybeLaneData(tdescTy);
+    assert(optionalLaneData && "Expected 2D lane data");
+    auto laneData = optionalLaneData.value();
+    int64_t innerLaneData = laneData[1];
+    auto memrefType = dyn_cast<MemRefType>(source.getType());
+    // Inner dimension of the shape must be adjusted based on innerLaneData.
+    SmallVector<OpFoldResult> modifiedShape(createNdOp.getMixedSizes());
+    modifiedShape.back() = divideByConstant(
+        rewriter, createNdOp.getLoc(),
+        convertToValue(rewriter, createNdOp.getLoc(), modifiedShape.back()),
+        innerLaneData);
+    // Similarly, the second-to-last stride must be adjusted.
+    assert(strides.size() >= 2 &&
+           "Expected at least 2 strides for CreateNdDescOp");
+    SmallVector<OpFoldResult> modifiedStrides(strides);
+    modifiedStrides[modifiedStrides.size() - 2] = divideByConstant(
+        rewriter, createNdOp.getLoc(),
+        convertToValue(rewriter, createNdOp.getLoc(),
+                       modifiedStrides[modifiedStrides.size() - 2]),
+        innerLaneData);
+
+    // If the source is a static memref, we need to extract the pointer to the
+    // base address.
+    if (memrefType && memrefType.hasStaticShape()) {
+      auto extractOp = memref::ExtractAlignedPointerAsIndexOp::create(
+          rewriter, createNdOp.getLoc(), source);
+      source = arith::IndexCastOp::create(rewriter, createNdOp.getLoc(),
+                                          rewriter.getI64Type(),
+                                          extractOp.getResult())
+                   .getResult();
+    }
+    // Create a new CreateNdDescOp with the modified shape and converted type.
+    auto newCreateNdDescOp = xegpu::CreateNdDescOp::create(
+        rewriter, createNdOp.getLoc(), convertType, source, modifiedShape,
+        modifiedStrides);
+    rewriter.replaceOp(createNdOp, newCreateNdDescOp.getResult());
+    return success();
+  }
+};
+
+/// Checks if a LoadNdOp consumes a tensor desc type that was rewritten for
+/// transpose optimization. If so, rewrites the LoadNdOp to align with the
+/// adjusted tensor desc type. This can result in multiple LoadNdOps being
+/// generated to fill in the original load shape.
+class XeGPULoadNdDescOpPattern final + : public OpConversionPattern<xegpu::LoadNdOp> { +public: + using OpConversionPattern<xegpu::LoadNdOp>::OpConversionPattern; + LogicalResult + matchAndRewrite(xegpu::LoadNdOp loadNdOp, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + auto origTensorDescType = loadNdOp.getTensorDescType(); + auto adaptorType = + cast<xegpu::TensorDescType>(adaptor.getTensorDesc().getType()); + if (adaptorType == origTensorDescType) + return failure(); + // Offsets must be adjusted based on innerLaneData. + auto laneData = getMaybeLaneData(loadNdOp.getTensorDescType()).value(); + int64_t innerLaneData = laneData[1]; + auto offsets = loadNdOp.getMixedOffsets(); + if (offsets.empty()) + return rewriter.notifyMatchFailure(loadNdOp, + "Expecting offsets in LoadNd"); + SmallVector<OpFoldResult> modifiedOffsets(offsets); + modifiedOffsets.back() = divideByConstant( + rewriter, loadNdOp.getLoc(), + convertToValue(rewriter, loadNdOp.getLoc(), modifiedOffsets.back()), + innerLaneData); + // Get the 2D data shape of this loadNdOp in its original type including + // array length. + SmallVector<int64_t> origDataShape(origTensorDescType.getShape()); + // Adjust the data shape based on innerLaneData. + origDataShape.back() /= innerLaneData; + // HW supported shape is the new tensor desc shape after conversion. + SmallVector<int64_t> hwSupportedShape(adaptorType.getShape()); + VectorType origVectorType = + VectorType::get(origDataShape, adaptorType.getElementType()); + Value data; + // Orig data shape is 3D for the array length case. + if (origTensorDescType.getArrayLength() > 1) { + SmallVector<Value> arraySlices; + for (int64_t i = 0; i < origTensorDescType.getArrayLength(); ++i) { + Value slice = arith::ConstantOp::create( + rewriter, loadNdOp->getLoc(), origVectorType, + rewriter.getZeroAttr(origVectorType)); + // Increase the Y offset for each array slice. + Value offsetY = convertToValue(rewriter, loadNdOp->getLoc(), + modifiedOffsets.back()); + modifiedOffsets.back() = + arith::AddIOp::create( + rewriter, loadNdOp->getLoc(), offsetY, + arith::ConstantIndexOp::create(rewriter, loadNdOp->getLoc(), + i * origDataShape[1]) + .getResult()) + .getResult(); + slice = generateLoads( + rewriter, cast<TypedValue<VectorType>>(slice), modifiedOffsets, + cast<TypedValue<xegpu::TensorDescType>>(adaptor.getTensorDesc()), + loadNdOp); + // BitCast back to original load shape without array length. + auto bitcastType = VectorType::get(origTensorDescType.getShape(), + origTensorDescType.getElementType()); + auto bitCastOp = vector::BitCastOp::create(rewriter, loadNdOp->getLoc(), + bitcastType, slice); + // BitCastOp must have the same layout as the original loadNdOp. + xegpu::setDistributeLayoutAttr(bitCastOp->getOpResult(0), + origTensorDescType.getLayoutAttr()); + arraySlices.push_back(bitCastOp.getResult()); + } + rewriter.replaceOpWithMultiple(loadNdOp, {arraySlices}); + return success(); + } + data = arith::ConstantOp::create( + rewriter, loadNdOp->getLoc(), + VectorType::get(origDataShape, adaptorType.getElementType()), + rewriter.getZeroAttr(origVectorType)); + data = generateLoads( + rewriter, cast<TypedValue<VectorType>>(data), modifiedOffsets, + cast<TypedValue<xegpu::TensorDescType>>(adaptor.getTensorDesc()), + loadNdOp); + auto bitCastOp = vector::BitCastOp::create(rewriter, loadNdOp->getLoc(), + loadNdOp.getType(), data); + // BitCastOp must have the same layout as the original loadNdOp. 
+    xegpu::setDistributeLayoutAttr(bitCastOp->getOpResult(0),
+                                   origTensorDescType.getLayoutAttr());
+    rewriter.replaceOp(loadNdOp, bitCastOp);
+    return success();
+  }
+};
+
+/// Vector ExtractOp must be processed if the original tensor desc type has
+/// array length greater than 1. In that case, the LoadNdOp is replaced with
+/// multiple LoadNdOps, one per array slice, which makes the extraction
+/// unnecessary, so we simply remove the ExtractOp.
+class VectorExtractOpPattern final
+    : public OpConversionPattern<vector::ExtractOp> {
+public:
+  using OpConversionPattern<vector::ExtractOp>::OpConversionPattern;
+  LogicalResult
+  matchAndRewrite(vector::ExtractOp extractOp, OneToNOpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    // Check if the source of the extraction is split into multiple values.
+    if (adaptor.getSource().size() == 1)
+      return failure();
+    auto mixedPos = extractOp.getMixedPosition();
+    if (mixedPos.size() != 1)
+      return failure();
+    auto mayBeInt = getConstantIntValue(mixedPos[0]);
+    if (!mayBeInt)
+      return failure();
+    rewriter.replaceOp(extractOp, adaptor.getSource()[*mayBeInt]);
+    return success();
+  }
+};
+
+} // namespace
+
+void xegpu::populateXeGPUOptimizeBlockLoadsPatterns(
+    RewritePatternSet &patterns) {
+  patterns.add<XeGPUCreateNdDescOpPattern, XeGPULoadNdDescOpPattern,
+               VectorExtractOpPattern>(patterns.getContext());
+}
+
+namespace {
+
+struct XeGPUOptimizeBlockLoadsPass final
+    : public xegpu::impl::XeGPUOptimizeBlockLoadsBase<
+          XeGPUOptimizeBlockLoadsPass> {
+  void runOnOperation() override {
+    MLIRContext &context = getContext();
+    TypeConverter converter;
+    RewritePatternSet patterns(&context);
+    ConversionTarget target(context);
+
+    // This pass is only meant for PVC and BMG targets. If an unsupported
+    // target is found, exit early.
+    bool isTargetSupported = false;
+    getOperation()->walk([&](gpu::GPUFuncOp funcOp) {
+      auto chipStr = xegpu::getChipStr(funcOp);
+      if (chipStr && (chipStr.value() == "pvc" || chipStr.value() == "bmg"))
+        isTargetSupported = true;
+    });
+
+    if (!isTargetSupported) {
+      DBGS() << "XeGPUOptimizeBlockLoadsPass only supports PVC and BMG targets."
+             << "\n";
+      return;
+    }
+
+    // CreateNdDescOp and LoadNdOp with optimizable tensor desc types must be
+    // converted.
+    target.addDynamicallyLegalOp<xegpu::CreateNdDescOp>(
+        [&](xegpu::CreateNdDescOp createNdOp) {
+          return !canBeOptimizedForTranspose(createNdOp.getType());
+        });
+    target.addDynamicallyLegalOp<xegpu::LoadNdOp>(
+        [&](xegpu::LoadNdOp loadNdOp) {
+          return !canBeOptimizedForTranspose(loadNdOp.getTensorDescType());
+        });
+    // Vector ExtractOps can have optimizable layouts if they extract from
+    // LoadNdOps with array length greater than 1. These ExtractOps must be
+    // converted.
+ target.addDynamicallyLegalOp<vector::ExtractOp>( + [&](vector::ExtractOp extractOp) { + auto layout = xegpu::getDistributeLayoutAttr(extractOp.getResult()); + if (!layout) + return true; + auto laneLayout = layout.getEffectiveLaneLayoutAsInt(); + auto laneData = layout.getEffectiveLaneDataAsInt(); + return !canBeOptimizedForTranspose(laneLayout, laneData); + }); + converter.addConversion([](Type type) { return type; }); + + target.addLegalDialect<arith::ArithDialect, memref::MemRefDialect, + vector::VectorDialect>(); + scf::populateSCFStructuralTypeConversionsAndLegality(converter, patterns, + target); + xegpu::populateXeGPUOptimizeBlockLoadsPatterns(patterns); + if (failed(applyPartialConversion(getOperation(), target, + std::move(patterns)))) { + DBGS() << "Optimize block loads pass failed.\n"; + return signalPassFailure(); + } + } +}; + +} // namespace diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp index 14c49e7f45706..4e1a539771d2f 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp @@ -204,28 +204,6 @@ struct LayoutInfoLattice : public Lattice<LayoutInfo> { using Lattice::Lattice; }; -/// Helper Function to find a proper instruction multiple for the user-supplied -/// sg-level data shape. `candidates` are uArch allowed shapes. -/// `candidateMultiples` are uArch multiples of such shapes (e.g., block count). -template <typename T> -int getLargestDivisor(T dim, ArrayRef<T> candidates, - ArrayRef<T> candidateMultiples = {}) { - static_assert(std::is_integral<T>::value, "T must be an integer type"); - int largest = -1; - SmallVector<T> multiples = {1}; - if (!candidateMultiples.empty()) - multiples = - SmallVector<T>(candidateMultiples.begin(), candidateMultiples.end()); - for (T candidate : candidates) { - for (T multiple : multiples) { - int value = static_cast<int>(candidate * multiple); - if (value != 0 && dim % value == 0 && value > largest) - largest = value; - } - } - return largest; -} - /// Helper Functions to get default layouts. A `default layout` is a layout that /// is assigned to a value when the layout is not fixed by some anchor operation /// (like DPAS). 
@@ -505,7 +483,7 @@ void LayoutInfoPropagation::visitPrefetchNdOp( prefetch.emitWarning("No known block params found for the element type."); auto [bWidth, bHeight, bCount] = blockWHC.value(); SmallVector<int> instData; - int instWidth = getLargestDivisor( + int instWidth = xegpu::getLargestDivisor( static_cast<int>(tdescTy.getDimSize(tdescTy.getRank() - 1)), bWidth, bCount); if (instWidth == -1) @@ -514,7 +492,7 @@ void LayoutInfoPropagation::visitPrefetchNdOp( if (tdescTy.getRank() == 1) instData = {instWidth}; else { - int instHeight = getLargestDivisor( + int instHeight = xegpu::getLargestDivisor( static_cast<int>(tdescTy.getDimSize(tdescTy.getRank() - 2)), bHeight); if (instHeight == -1) prefetch.emitWarning( @@ -634,7 +612,7 @@ void LayoutInfoPropagation::visitDpasOp( const unsigned dataALen = aTy.getShape().front(); auto supportedALen = uArchInstruction->getSupportedM(aTy.getElementType()); const int maxALen = - getLargestDivisor(dataALen, ArrayRef<unsigned>(supportedALen)); + xegpu::getLargestDivisor(dataALen, ArrayRef<unsigned>(supportedALen)); if (maxALen == -1) dpas.emitWarning( "No suitable instruction multiple found for the given shape."); @@ -642,7 +620,7 @@ void LayoutInfoPropagation::visitDpasOp( const unsigned dataBLen = bTy.getShape().back(); auto supportedBLen = uArchInstruction->getSupportedK(bTy.getElementType()); const int maxBLen = - getLargestDivisor(dataBLen, ArrayRef<unsigned>(supportedBLen)); + xegpu::getLargestDivisor(dataBLen, ArrayRef<unsigned>(supportedBLen)); if (maxBLen == -1) dpas.emitWarning( "No suitable instruction multiple found for the given shape."); @@ -662,7 +640,7 @@ void LayoutInfoPropagation::visitDpasOp( const unsigned dataCLen = bTy.getShape().back(); auto supportedCLen = uArchInstruction->getSupportedN(bTy.getElementType()); const int maxCLen = - getLargestDivisor(dataCLen, ArrayRef<unsigned>(supportedCLen)); + xegpu::getLargestDivisor(dataCLen, ArrayRef<unsigned>(supportedCLen)); if (maxCLen == -1) dpas.emitWarning( "No suitable instruction multiple found for the given shape."); @@ -691,7 +669,7 @@ void LayoutInfoPropagation::visitStoreNdOp( store.emitWarning("No known block params found for the element type."); auto [bWidth, bHeight, bCount] = blockWHC.value(); SmallVector<int> instData; - int instWidth = getLargestDivisor( + int instWidth = xegpu::getLargestDivisor( static_cast<int>(dataTy.getDimSize(dataTy.getRank() - 1)), bWidth, bCount); if (instWidth == -1) @@ -700,7 +678,7 @@ void LayoutInfoPropagation::visitStoreNdOp( if (dataTy.getRank() == 1) instData = {instWidth}; else { - int instHeight = getLargestDivisor( + int instHeight = xegpu::getLargestDivisor( static_cast<int>(dataTy.getDimSize(dataTy.getRank() - 2)), bHeight); if (instHeight == -1) store.emitWarning( diff --git a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp index d575a415a3035..de9e09d427665 100644 --- a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp +++ b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp @@ -555,3 +555,29 @@ xegpu::addWithRightAligned(OpBuilder &builder, Location loc, results.append(addElementwise(builder, loc, a, b)); return results; } + +template <typename T> +int xegpu::getLargestDivisor(T dim, ArrayRef<T> candidates, + ArrayRef<T> candidateMultiples) { + static_assert(std::is_integral<T>::value, "T must be an integer type"); + int largest = -1; + SmallVector<T> multiples = {1}; + if (!candidateMultiples.empty()) + multiples = + SmallVector<T>(candidateMultiples.begin(), candidateMultiples.end()); + for (T 
candidate : candidates) { + for (T multiple : multiples) { + int value = static_cast<int>(candidate * multiple); + if (value != 0 && dim % value == 0 && value > largest) + largest = value; + } + } + return largest; +} + +/// Explicit instantiations +template int xegpu::getLargestDivisor<int>(int dim, ArrayRef<int> candidates, + ArrayRef<int> candidateMultiples); +template int +xegpu::getLargestDivisor<unsigned>(unsigned dim, ArrayRef<unsigned> candidates, + ArrayRef<unsigned> candidateMultiples); diff --git a/mlir/test/Dialect/XeGPU/optimize-transpose.mlir b/mlir/test/Dialect/XeGPU/optimize-transpose.mlir new file mode 100644 index 0000000000000..24a0de6ed48a5 --- /dev/null +++ b/mlir/test/Dialect/XeGPU/optimize-transpose.mlir @@ -0,0 +1,280 @@ +// RUN: mlir-opt --xevm-attach-target='module=xevm_* chip=pvc' \ +// RUN: --xegpu-optimize-block-loads --canonicalize --cse --split-input-file %s | FileCheck %s + +// CHECK-LABEL: gpu.func @no_scf( +// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<64x64xf16>, %{{.*}}: vector<8x16xf16>) -> vector<8x16xf32> { +// CHECK: %[[C16:.*]] = arith.constant 16 : index +// CHECK: %[[C32:.*]] = arith.constant 32 : index +// CHECK: %[[PTR:.*]] = memref.extract_aligned_pointer_as_index %[[ARG0]] : memref<64x64xf16> -> index +// CHECK: %[[T0:.*]] = arith.index_cast %[[PTR]] : index to i64 +// CHECK: %[[BDESC:.*]] = xegpu.create_nd_tdesc %[[T0]], shape : [64, %[[C32]]], strides : [%[[C32]], 1] : i64 +// CHECK-SAME: -> !xegpu.tensor_desc<16x8xi32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>> +// CHECK-NEXT: %[[B:.*]] = xegpu.load_nd %[[BDESC]][%{{.*}}, %[[C16]]] +// CHECK-SAME: {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>} +// CHECK-SAME: : !xegpu.tensor_desc<16x8xi32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>> -> vector<16x8xi32> +// CHECK: %[[BITCAST:.*]] = vector.bitcast %[[B]] +// CHECK-SAME: {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>} : vector<16x8xi32> to vector<16x16xf16> +#a = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]> +#b = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]> +#bt = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]> +gpu.module @xevm_module { +gpu.func @no_scf(%arg0: memref<64x64xf16>, %arg1: vector<8x16xf16>) -> vector<8x16xf32> { + %c0 = arith.constant 0 : index + %c32 = arith.constant 32 : index + %0 = xegpu.create_nd_tdesc %arg0 : memref<64x64xf16> -> !xegpu.tensor_desc<16x16xf16, #b> + %1 = xegpu.load_nd %0[%c0, %c32] { result_layout = #b } : !xegpu.tensor_desc<16x16xf16, #b> -> vector<16x16xf16> + %2 = vector.transpose %1, [1, 0] { layout_result_0 = #bt } : vector<16x16xf16> to vector<16x16xf16> + %6 = xegpu.dpas %arg1, %2 { layout_result_0 = #a } : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32> + gpu.return %6 : vector<8x16xf32> +} +} + +// ----- +// CHECK-LABEL: gpu.func @no_scf_i8( +// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<64x64xi8>, %{{.*}}: vector<8x32xi8>) -> vector<8x16xi32> { +// CHECK: %[[C16:.*]] = arith.constant 16 : index +// CHECK: %[[PTR:.*]] = memref.extract_aligned_pointer_as_index %[[ARG0]] : memref<64x64xi8> -> index +// CHECK: %[[T0:.*]] = arith.index_cast %[[PTR]] : index to i64 +// CHECK: %[[T1:.*]] = xegpu.create_nd_tdesc %[[T0]], shape : [64, %[[C16]]], strides : [%[[C16]], 1] : i64 +// CHECK-SAME: -> !xegpu.tensor_desc<16x8xi32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>> +// CHECK: %[[T2:.*]] = xegpu.load_nd %[[T1]][%{{.*}}, %[[C16]]] +// CHECK-SAME: {layout_result_0 = 
#xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>} +// CHECK-SAME: : !xegpu.tensor_desc<16x8xi32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>> -> vector<16x8xi32> +// CHECK: %[[T3:.*]] = vector.bitcast %[[T2]] +// CHECK-SAME: {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 4]>} : vector<16x8xi32> to vector<16x32xi8> +#a = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 2]> +#b = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 4]> +#bt = #xegpu.layout<lane_layout = [1, 16], lane_data = [4, 1]> +#c = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]> +gpu.module @xevm_module { +gpu.func @no_scf_i8(%arg0: memref<64x64xi8>, %arg1: vector<8x32xi8>) -> vector<8x16xi32> { + %c0 = arith.constant 0 : index + %c64 = arith.constant 64 : index + %0 = xegpu.create_nd_tdesc %arg0 : memref<64x64xi8> -> !xegpu.tensor_desc<16x32xi8, #b> + %1 = xegpu.load_nd %0[%c0, %c64] { result_layout = #b } : !xegpu.tensor_desc<16x32xi8, #b> -> vector<16x32xi8> + %2 = vector.transpose %1, [1, 0] { layout_result_0 = #bt } : vector<16x32xi8> to vector<32x16xi8> + %6 = xegpu.dpas %arg1, %2 { layout_result_0 = #c } : vector<8x32xi8>, vector<32x16xi8> -> vector<8x16xi32> + gpu.return %6 : vector<8x16xi32> +} +} + + +// ----- +// CHECK-LABEL: gpu.func @gemm_b_transpose( +// CHECK-SAME: %{{.*}} memref<256x256xf16>, %[[ARG1:[a-zA-Z0-9]+]]: memref<256x256xf16>, %{{.*}}: memref<256x256xf32>) { +// CHECK: %[[C128:.*]] = arith.constant 128 : index +// CHECK: %[[C1:.*]] = arith.constant 1 : index +// CHECK: %[[C16:.*]] = arith.constant 16 : index +// CHECK: %[[C256:.*]] = arith.constant 256 : index +// CHECK: %[[PTR:.*]] = memref.extract_aligned_pointer_as_index %[[ARG1]] : memref<256x256xf16> -> index +// CHECK: %[[T3:.*]] = arith.index_cast %[[PTR]] : index to i64 +// CHECK: %[[T4:.*]] = xegpu.create_nd_tdesc %[[T3]], shape : [256, %[[C128]]], strides : [%c128, 1] +// CHECK-SAME: : i64 -> !xegpu.tensor_desc<16x8xi32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>> +// CHECK: %{{.*}} = scf.for %[[K:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%{{.*}}) -> (vector<8x16xf32>) { +// CHECK: %[[T7:.*]] = arith.shrui %[[K]], %[[C1]] : index +// CHECK-NEXT: %[[T8:.*]] = xegpu.load_nd %[[T4]][%{{.*}}, %[[T7]]] +// CHECK-SAME: {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>} : +// CHECK-SAME: !xegpu.tensor_desc<16x8xi32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>> -> vector<16x8xi32> +// CHECK-NEXT: %{{.*}} = vector.bitcast %[[T8]] {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>} +// CHECK-SAME: : vector<16x8xi32> to vector<16x16xf16> +#a = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]> +#b = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]> +#bt = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]> +gpu.module @xevm_module { +gpu.func @gemm_b_transpose(%arg0: memref<256x256xf16>, %arg1: memref<256x256xf16>, %arg2: memref<256x256xf32>) { + %c0 = arith.constant 0 : index + %c16 = arith.constant 16 : index + %c256 = arith.constant 256 : index + %0 = xegpu.create_nd_tdesc %arg2 : memref<256x256xf32> -> !xegpu.tensor_desc<8x16xf32, #a> + %1 = xegpu.load_nd %0[%c0, %c0] { layout_result_0 = #a } : !xegpu.tensor_desc<8x16xf32, #a> -> vector<8x16xf32> + %2 = xegpu.create_nd_tdesc %arg0 : memref<256x256xf16> -> !xegpu.tensor_desc<8x16xf16, #a> + %3 = xegpu.create_nd_tdesc %arg1 : memref<256x256xf16> -> !xegpu.tensor_desc<16x16xf16, #b> + %4 = scf.for %arg3 = %c0 to %c256 step %c16 
iter_args(%arg4 = %1) -> (vector<8x16xf32>) { + %5 = xegpu.load_nd %2[%c0, %arg3] { layout_result_0 = #a } : !xegpu.tensor_desc<8x16xf16, #a> -> vector<8x16xf16> + %6 = xegpu.load_nd %3[%c0, %arg3] { layout_result_0 = #b } : !xegpu.tensor_desc<16x16xf16, #b> -> vector<16x16xf16> + %7 = vector.transpose %6, [1, 0] { layout_result_0 = #bt } : vector<16x16xf16> to vector<16x16xf16> + %8 = xegpu.dpas %5, %7, %arg4 {layout_result_0 = #a} : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32> + scf.yield %8 : vector<8x16xf32> + } {layout_result_0 = #a} + xegpu.store_nd %4, %0[%c0, %c0] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #a> + gpu.return +} +} + +// ----- +// CHECK-LABEL: gpu.func @nested_scf( +// CHECK-SAME: %{{.*}}: memref<256x256xf16>, %[[ARG1:[a-zA-Z0-9]+]]: memref<256x256xf16>, %{{.*}}: memref<256x256xf32>) { +// CHECK: %[[C128:.*]] = arith.constant 128 : index +// CHECK: %[[C1:.*]] = arith.constant 1 : index +// CHECK: %[[C16:.*]] = arith.constant 16 : index +// CHECK: %[[C256:.*]] = arith.constant 256 : index +// CHECK: scf.for %{{.*}} to %{{.*}} step %{{.*}} { +// CHECK: %[[PTR:.*]] = memref.extract_aligned_pointer_as_index %[[ARG1]] : memref<256x256xf16> -> index +// CHECK: %[[T3:.*]] = arith.index_cast %[[PTR]] : index to i64 +// CHECK: %[[T4:.*]] = xegpu.create_nd_tdesc %[[T3]], shape : [256, %[[C128]]], strides : [%[[C128]], 1] : i64 +// CHECK-SAME: -> !xegpu.tensor_desc<16x8xi32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>> +// CHECK: %{{.*}} = scf.for %[[K:.*]] = %{{.*}} iter_args(%{{.*}}) -> (vector<8x16xf32>) { +// CHECK: %[[T7:.*]] = arith.shrui %[[K]], %[[C1]] : index +// CHECK-NEXT: %[[T8:.*]] = xegpu.load_nd %[[T4]][%{{.*}}, %[[T7]]] {layout_result_0 = #xegpu.layout< +// CHECK-SAME: lane_layout = [16, 1], lane_data = [1, 1]>} : +// CHECK-SAME: !xegpu.tensor_desc<16x8xi32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>> -> vector<16x8xi32> +// CHECK-NEXT: %{{.*}} = vector.bitcast %[[T8]] {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>} +// CHECK-SAME: : vector<16x8xi32> to vector<16x16xf16> +#a = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]> +#b = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]> +#bt = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]> +gpu.module @xevm_module { +gpu.func @nested_scf(%arg0: memref<256x256xf16>, %arg1: memref<256x256xf16>, %arg2: memref<256x256xf32>) { + %c0 = arith.constant 0 : index + %c16 = arith.constant 16 : index + %c256 = arith.constant 256 : index + scf.for %arg8 = %c0 to %c256 step %c16 { + %0 = xegpu.create_nd_tdesc %arg2 : memref<256x256xf32> -> !xegpu.tensor_desc<8x16xf32, #a> + %1 = xegpu.load_nd %0[%arg8, %c0] { layout_result_0 = #a } : !xegpu.tensor_desc<8x16xf32, #a> -> vector<8x16xf32> + %2 = xegpu.create_nd_tdesc %arg0 : memref<256x256xf16> -> !xegpu.tensor_desc<8x16xf16, #a> + %3 = xegpu.create_nd_tdesc %arg1 : memref<256x256xf16> -> !xegpu.tensor_desc<16x16xf16, #b> + %4 = scf.for %arg3 = %c0 to %c256 step %c16 iter_args(%arg4 = %1) -> (vector<8x16xf32>) { + %5 = xegpu.load_nd %2[%arg8, %arg3] { layout_result_0 = #a } : !xegpu.tensor_desc<8x16xf16, #a> -> vector<8x16xf16> + %6 = xegpu.load_nd %3[%arg8, %arg3] { layout_result_0 = #b } : !xegpu.tensor_desc<16x16xf16, #b> -> vector<16x16xf16> + %7 = vector.transpose %6, [1, 0] { layout_result_0 = #bt } : vector<16x16xf16> to vector<16x16xf16> + %8 = xegpu.dpas %5, %7, %arg4 {layout_result_0 = #a} : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> 
vector<8x16xf32> + scf.yield %8 : vector<8x16xf32> + } {layout_result_0 = #a} + xegpu.store_nd %4, %0[%c0, %c0] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #a> + } + gpu.return +} +} + +// ----- +// CHECK-LABEL: gpu.func @large_loads( +// CHECK-SAME: %{{.*}}: vector<8x16xf16>, %[[ARG1:[a-zA-Z0-9]+]]: memref<256x256xf16>, %{{.*}}: memref<256x256xf32>) { +// CHECK: %[[C128:.*]] = arith.constant 128 : index +// CHECK: %[[C8:.*]] = arith.constant 8 : index +// CHECK: %[[CST:.*]] = arith.constant dense<0> : vector<32x16xi32> +// CHECK: %[[C1:.*]] = arith.constant 1 : index +// CHECK: %[[PTR:.*]] = memref.extract_aligned_pointer_as_index %[[ARG1]] : memref<256x256xf16> -> index +// CHECK: %[[T2:.*]] = arith.index_cast %[[PTR]] : index to i64 +// CHECK: %[[T3:.*]] = xegpu.create_nd_tdesc %[[T2]], shape : [256, %[[C128]]], strides : [%[[C128]], 1] : i64 +// CHECK-SAME: -> !xegpu.tensor_desc<32x8xi32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>> +// CHECK: %{{.*}}:4 = scf.for %[[K:.*]] = %{{.*}} iter_args(%{{.*}}) -> (vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>) { +// CHECK: %[[T5:.*]] = arith.shrui %[[K]], %[[C1]] : index +// CHECK: %[[T6:.*]] = xegpu.load_nd %[[T3]][%{{.*}}, %[[T5]]] {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>} +// CHECK-SAME: : !xegpu.tensor_desc<32x8xi32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>> -> vector<32x8xi32> +// CHECK: %[[T7:.*]] = vector.insert_strided_slice %[[T6]], %[[CST]] +// CHECK-SAME: {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>, offsets = [0, 0], strides = [1, 1]} +// CHECK-SAME: : vector<32x8xi32> into vector<32x16xi32> +// CHECK: %[[T8:.*]] = arith.addi %[[T5]], %[[C8]] : index +// CHECK: %[[T9:.*]] = xegpu.load_nd %[[T3]][%{{.*}}, %[[T8]]] {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>} +// CHECK-SAME: : !xegpu.tensor_desc<32x8xi32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>> -> vector<32x8xi32> +// CHECK: %[[T10:.*]] = vector.insert_strided_slice %[[T9]], %[[T7]] +// CHECK-SAME: {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>, offsets = [0, 8], strides = [1, 1]} +// CHECK-SAME: : vector<32x8xi32> into vector<32x16xi32> +// CHECK: %{{.*}} = vector.bitcast %[[T10]] {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>} +// CHECK-SAME: : vector<32x16xi32> to vector<32x32xf16> +#a = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]> +#b = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]> +#bt = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]> +gpu.module @xevm_module { +gpu.func @large_loads(%arg0: vector<8x16xf16>, %arg1: memref<256x256xf16>, %arg2: memref<256x256xf32>) { + %c0 = arith.constant 0 : index + %c16 = arith.constant 16 : index + %c32 = arith.constant 32 : index + %c256 = arith.constant 256 : index + %0 = xegpu.create_nd_tdesc %arg2 : memref<256x256xf32> -> !xegpu.tensor_desc<8x16xf32, #a> + %1 = xegpu.load_nd %0[%c0, %c0] { layout_result_0 = #a } : !xegpu.tensor_desc<8x16xf32, #a> -> vector<8x16xf32> + %3 = xegpu.create_nd_tdesc %arg1 : memref<256x256xf16> -> !xegpu.tensor_desc<32x32xf16, #b> + %4:4 = scf.for %arg3 = %c0 to %c256 step %c32 iter_args(%arg4 = %1, %arg5 = %1, %arg6 = %1, %arg7 = %1) + -> (vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>) { + %6 = xegpu.load_nd %3[%c0, %arg3] { layout_result_0 = #b } : !xegpu.tensor_desc<32x32xf16, #b> -> vector<32x32xf16> + %7 = 
vector.extract_strided_slice %6 {offsets = [0, 0], sizes = [16, 16], strides = [1, 1], layout_result_0 = #b } + : vector<32x32xf16> to vector<16x16xf16> + %8 = vector.extract_strided_slice %6 {offsets = [0, 16], sizes = [16, 16], strides = [1, 1], layout_result_0 = #b } + : vector<32x32xf16> to vector<16x16xf16> + %9 = vector.extract_strided_slice %6 {offsets = [16, 0], sizes = [16, 16], strides = [1, 1], layout_result_0 = #b } + : vector<32x32xf16> to vector<16x16xf16> + %10 = vector.extract_strided_slice %6 {offsets = [16, 16], sizes = [16, 16], strides = [1, 1], layout_result_0 = #b } + : vector<32x32xf16> to vector<16x16xf16> + %11 = vector.transpose %7, [1, 0] { layout_result_0 = #bt } : vector<16x16xf16> to vector<16x16xf16> + %12 = vector.transpose %8, [1, 0] { layout_result_0 = #bt } : vector<16x16xf16> to vector<16x16xf16> + %13 = vector.transpose %9, [1, 0] { layout_result_0 = #bt } : vector<16x16xf16> to vector<16x16xf16> + %14 = vector.transpose %10, [1, 0] { layout_result_0 = #bt } : vector<16x16xf16> to vector<16x16xf16> + %15 = xegpu.dpas %arg0, %11, %arg4 {layout_result_0 = #a} : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32> + %16 = xegpu.dpas %arg0, %12, %arg5 {layout_result_0 = #a} : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32> + %17 = xegpu.dpas %arg0, %13, %arg6 {layout_result_0 = #a} : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32> + %18 = xegpu.dpas %arg0, %14, %arg7 {layout_result_0 = #a} : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32> + scf.yield %15, %16, %17, %18 : vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32> + } {layout_result_0 = #a, layout_result_1 = #a, layout_result_2 = #a, layout_result_3 = #a} + xegpu.store_nd %4#0, %0[%c0, %c0] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #a> + xegpu.store_nd %4#1, %0[%c0, %c16] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #a> + xegpu.store_nd %4#2, %0[%c16, %c0] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #a> + xegpu.store_nd %4#3, %0[%c16, %c16] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #a> + gpu.return +} +} + +// ----- +// CHECK-LABEL: gpu.func @array_length( +// CHECK-SAME: %{{.*}}: vector<8x16xf16>, %[[ARG1:[a-zA-Z0-9]+]]: memref<256x256xf16>, %arg2: memref<256x256xf32>) { +// CHECK: %[[C128:.*]] = arith.constant 128 : index +// CHECK: %[[C8:.*]] = arith.constant 8 : index +// CHECK: %[[C1:.*]] = arith.constant 1 : index +// CHECK: %[[PTR:.*]] = memref.extract_aligned_pointer_as_index %[[ARG1]] : memref<256x256xf16> -> index +// CHECK: %[[T2:.*]] = arith.index_cast %[[PTR]] : index to i64 +// CHECK: %[[T3:.*]] = xegpu.create_nd_tdesc %[[T2]], shape : [256, %[[C128]]], strides : [%[[C128]], 1] : i64 -> +// CHECK-SAME: !xegpu.tensor_desc<32x8xi32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>> +// CHECK: %{{.*}}:4 = scf.for %[[K:.*]] = %{{.*}} iter_args(%{{.*}}) -> (vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>) { +// CHECK: %[[T5:.*]] = arith.shrui %[[K]], %[[C1]] : index +// CHECK: %[[T6:.*]] = xegpu.load_nd %[[T3]][%{{.*}}, %[[T5]]] {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>} +// CHECK-SAME: : !xegpu.tensor_desc<32x8xi32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>> -> vector<32x8xi32> +// CHECK: %[[T7:.*]] = vector.bitcast %[[T6]] {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>} +// CHECK-SAME: : vector<32x8xi32> to vector<32x16xf16> +// CHECK: 
%[[T8:.*]] = arith.addi %[[T5]], %[[C8]] : index +// CHECK: %[[T9:.*]] = xegpu.load_nd %[[T3]][%{{.*}}, %[[T8]]] {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>} +// CHECK-SAME: : !xegpu.tensor_desc<32x8xi32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>> -> vector<32x8xi32> +// CHECK: %[[T10:.*]] = vector.bitcast %[[T9]] {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>} +// CHECK-SAME: : vector<32x8xi32> to vector<32x16xf16> +#a = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]> +#b = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]> +#bt = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]> +gpu.module @xevm_module { +gpu.func @array_length(%arg0: vector<8x16xf16>, %arg1: memref<256x256xf16>, %arg2: memref<256x256xf32>) { + %c0 = arith.constant 0 : index + %c16 = arith.constant 16 : index + %c32 = arith.constant 32 : index + %c256 = arith.constant 256 : index + %0 = xegpu.create_nd_tdesc %arg2 : memref<256x256xf32> -> !xegpu.tensor_desc<8x16xf32, #a> + %1 = xegpu.load_nd %0[%c0, %c0] { layout_result_0 = #a } : !xegpu.tensor_desc<8x16xf32, #a> -> vector<8x16xf32> + %3 = xegpu.create_nd_tdesc %arg1 : memref<256x256xf16> + -> !xegpu.tensor_desc<32x16xf16, #b, #xegpu.block_tdesc_attr<array_length = 2 : i64>> + %4:4 = scf.for %arg3 = %c0 to %c256 step %c32 iter_args(%arg4 = %1, %arg5 = %1, %arg6 = %1, %arg7 = %1) + -> (vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>) { + %6 = xegpu.load_nd %3[%c0, %arg3] { layout_result_0 = #b } + : !xegpu.tensor_desc<32x16xf16, #b, #xegpu.block_tdesc_attr<array_length = 2 : i64>> -> vector<2x32x16xf16> + %19 = vector.extract %6[0] { layout_result_0 = #b } : vector<32x16xf16> from vector<2x32x16xf16> + %20 = vector.extract %6[1] { layout_result_0 = #b } : vector<32x16xf16> from vector<2x32x16xf16> + %7 = vector.extract_strided_slice %19 {offsets = [0, 0], sizes = [16, 16], strides = [1, 1], layout_result_0 = #b } + : vector<32x16xf16> to vector<16x16xf16> + %8 = vector.extract_strided_slice %19 {offsets = [16, 0], sizes = [16, 16], strides = [1, 1], layout_result_0 = #b } + : vector<32x16xf16> to vector<16x16xf16> + %9 = vector.extract_strided_slice %20 {offsets = [0, 0], sizes = [16, 16], strides = [1, 1], layout_result_0 = #b } + : vector<32x16xf16> to vector<16x16xf16> + %10 = vector.extract_strided_slice %20 {offsets = [16, 0], sizes = [16, 16], strides = [1, 1], layout_result_0 = #b } + : vector<32x16xf16> to vector<16x16xf16> + %11 = vector.transpose %7, [1, 0] { layout_result_0 = #bt } : vector<16x16xf16> to vector<16x16xf16> + %12 = vector.transpose %8, [1, 0] { layout_result_0 = #bt } : vector<16x16xf16> to vector<16x16xf16> + %13 = vector.transpose %9, [1, 0] { layout_result_0 = #bt } : vector<16x16xf16> to vector<16x16xf16> + %14 = vector.transpose %10, [1, 0] { layout_result_0 = #bt } : vector<16x16xf16> to vector<16x16xf16> + %15 = xegpu.dpas %arg0, %11, %arg4 {layout_result_0 = #a} : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32> + %16 = xegpu.dpas %arg0, %12, %arg5 {layout_result_0 = #a} : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32> + %17 = xegpu.dpas %arg0, %13, %arg6 {layout_result_0 = #a} : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32> + %18 = xegpu.dpas %arg0, %14, %arg7 {layout_result_0 = #a} : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32> + scf.yield %15, %16, %17, %18 : vector<8x16xf32>, vector<8x16xf32>, 
vector<8x16xf32>, vector<8x16xf32> + } {layout_result_0 = #a, layout_result_1 = #a, layout_result_2 = #a, layout_result_3 = #a} + xegpu.store_nd %4#0, %0[%c0, %c0] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #a> + xegpu.store_nd %4#1, %0[%c0, %c16] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #a> + xegpu.store_nd %4#2, %0[%c16, %c0] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #a> + xegpu.store_nd %4#3, %0[%c16, %c16] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #a> + gpu.return +} +} From 6e2dcdb17c0ef4eff239a08772b194e2009193ec Mon Sep 17 00:00:00 2001 From: Kaitlin Peng <kaitlinpeng@microsoft.com> Date: Tue, 4 Nov 2025 13:20:46 -0800 Subject: [PATCH 233/313] Fix implicit truncation of `select` non-bool vector conditions (#166279) Fixes #164018. The problem is that we're unable to do an implicit conversion sequence on a template deduced argument, so the current vector templates can't reconcile `vector<int, 4>` with `vector<bool, Sz>`. This PR separates the vector templates into size-specific ones, getting rid of the `Sz` deduction and allowing for the implicit conversion to be done. --- .../lib/Headers/hlsl/hlsl_alias_intrinsics.h | 52 ++++++++++++--- clang/test/CodeGenHLSL/builtins/select.hlsl | 64 ++++++++++++++----- .../test/SemaHLSL/BuiltIns/select-errors.hlsl | 2 +- 3 files changed, 93 insertions(+), 25 deletions(-) diff --git a/clang/lib/Headers/hlsl/hlsl_alias_intrinsics.h b/clang/lib/Headers/hlsl/hlsl_alias_intrinsics.h index 4c5861c2c5f9d..208776eb7840e 100644 --- a/clang/lib/Headers/hlsl/hlsl_alias_intrinsics.h +++ b/clang/lib/Headers/hlsl/hlsl_alias_intrinsics.h @@ -2111,9 +2111,17 @@ T select(bool, T, T); /// \param FalseVals The vector values are chosen from when conditions are /// false. -template <typename T, int Sz> +template <typename T> +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_select) +vector<T, 2> select(vector<bool, 2>, vector<T, 2>, vector<T, 2>); + +template <typename T> +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_select) +vector<T, 3> select(vector<bool, 3>, vector<T, 3>, vector<T, 3>); + +template <typename T> _HLSL_BUILTIN_ALIAS(__builtin_hlsl_select) -vector<T, Sz> select(vector<bool, Sz>, vector<T, Sz>, vector<T, Sz>); +vector<T, 4> select(vector<bool, 4>, vector<T, 4>, vector<T, 4>); /// \fn vector<T,Sz> select(vector<bool,Sz> Conds, T TrueVal, /// vector<T,Sz> FalseVals) @@ -2123,9 +2131,17 @@ vector<T, Sz> select(vector<bool, Sz>, vector<T, Sz>, vector<T, Sz>); /// \param FalseVals The vector values are chosen from when conditions are /// false. -template <typename T, int Sz> +template <typename T> +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_select) +vector<T, 2> select(vector<bool, 2>, T, vector<T, 2>); + +template <typename T> +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_select) +vector<T, 3> select(vector<bool, 3>, T, vector<T, 3>); + +template <typename T> _HLSL_BUILTIN_ALIAS(__builtin_hlsl_select) -vector<T, Sz> select(vector<bool, Sz>, T, vector<T, Sz>); +vector<T, 4> select(vector<bool, 4>, T, vector<T, 4>); /// \fn vector<T,Sz> select(vector<bool,Sz> Conds, vector<T,Sz> TrueVals, /// T FalseVal) @@ -2134,9 +2150,17 @@ vector<T, Sz> select(vector<bool, Sz>, T, vector<T, Sz>); /// \param TrueVals The vector values are chosen from when conditions are true. /// \param FalseVal The scalar value to splat from when conditions are false. 
-template <typename T, int Sz> +template <typename T> +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_select) +vector<T, 2> select(vector<bool, 2>, vector<T, 2>, T); + +template <typename T> +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_select) +vector<T, 3> select(vector<bool, 3>, vector<T, 3>, T); + +template <typename T> _HLSL_BUILTIN_ALIAS(__builtin_hlsl_select) -vector<T, Sz> select(vector<bool, Sz>, vector<T, Sz>, T); +vector<T, 4> select(vector<bool, 4>, vector<T, 4>, T); /// \fn vector<T,Sz> select(vector<bool,Sz> Conds, vector<T,Sz> TrueVals, /// T FalseVal) @@ -2145,10 +2169,20 @@ vector<T, Sz> select(vector<bool, Sz>, vector<T, Sz>, T); /// \param TrueVal The scalar value to splat from when conditions are true. /// \param FalseVal The scalar value to splat from when conditions are false. -template <typename T, int Sz> +template <typename T> +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_select) +__detail::enable_if_t<__detail::is_arithmetic<T>::Value, vector<T, 2>> select( + vector<bool, 2>, T, T); + +template <typename T> +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_select) +__detail::enable_if_t<__detail::is_arithmetic<T>::Value, vector<T, 3>> select( + vector<bool, 3>, T, T); + +template <typename T> _HLSL_BUILTIN_ALIAS(__builtin_hlsl_select) -__detail::enable_if_t<__detail::is_arithmetic<T>::Value, vector<T, Sz>> select( - vector<bool, Sz>, T, T); +__detail::enable_if_t<__detail::is_arithmetic<T>::Value, vector<T, 4>> select( + vector<bool, 4>, T, T); //===----------------------------------------------------------------------===// // sin builtins diff --git a/clang/test/CodeGenHLSL/builtins/select.hlsl b/clang/test/CodeGenHLSL/builtins/select.hlsl index 7590b4a881259..e5169844cb3f2 100644 --- a/clang/test/CodeGenHLSL/builtins/select.hlsl +++ b/clang/test/CodeGenHLSL/builtins/select.hlsl @@ -20,16 +20,6 @@ struct S test_select_infer_struct(bool cond0, struct S tVal, struct S fVal) { return select(cond0, tVal, fVal); } -// CHECK-LABEL: test_select_infer_array -// CHECK: [[TRUE_VAL:%.*]] = load [3 x i32], ptr {{%.*}}, align 4 -// CHECK: [[FALSE_VAL:%.*]] = load [3 x i32], ptr {{%.*}}, align 4 -// CHECK: [[SELECT:%.*]] = select i1 {{%.*}}, [3 x i32] [[TRUE_VAL]], [3 x i32] [[FALSE_VAL]] -// CHECK: store [3 x i32] [[SELECT]], ptr {{%.*}}, align 4 -// CHECK: ret void -int test_select_infer_array(bool cond, int tVal[3], int fVal[3])[3] { - return select(cond, tVal, fVal); -} - // CHECK-LABEL: test_select_bool_vector // CHECK: [[SELECT:%.*]] = select i1 {{%.*}}, <2 x i32> {{%.*}}, <2 x i32> {{%.*}} // CHECK: ret <2 x i32> [[SELECT]] @@ -38,24 +28,24 @@ int2 test_select_bool_vector(bool cond0, int2 tVal, int2 fVal) { } // CHECK-LABEL: test_select_vector_1 -// CHECK: [[SELECT:%.*]] = select <1 x i1> {{%.*}}, <1 x i32> {{%.*}}, <1 x i32> {{%.*}} +// CHECK: [[SELECT:%.*]] = select i1 {{%.*}}, <1 x i32> {{%.*}}, <1 x i32> {{%.*}} // CHECK: ret <1 x i32> [[SELECT]] int1 test_select_vector_1(bool1 cond0, int1 tVals, int1 fVals) { - return select<int,1>(cond0, tVals, fVals); + return select(cond0, tVals, fVals); } // CHECK-LABEL: test_select_vector_2 // CHECK: [[SELECT:%.*]] = select <2 x i1> {{%.*}}, <2 x i32> {{%.*}}, <2 x i32> {{%.*}} // CHECK: ret <2 x i32> [[SELECT]] int2 test_select_vector_2(bool2 cond0, int2 tVals, int2 fVals) { - return select<int,2>(cond0, tVals, fVals); + return select(cond0, tVals, fVals); } // CHECK-LABEL: test_select_vector_3 // CHECK: [[SELECT:%.*]] = select <3 x i1> {{%.*}}, <3 x i32> {{%.*}}, <3 x i32> {{%.*}} // CHECK: ret <3 x i32> [[SELECT]] int3 test_select_vector_3(bool3 cond0, int3 tVals, int3 
fVals) { - return select<int,3>(cond0, tVals, fVals); + return select(cond0, tVals, fVals); } // CHECK-LABEL: test_select_vector_4 @@ -86,10 +76,54 @@ int4 test_select_vector_vector_scalar(bool4 cond0, int4 tVals, int fVal) { // CHECK-LABEL: test_select_vector_scalar_scalar // CHECK: [[SPLAT_SRC1:%.*]] = insertelement <4 x i32> poison, i32 {{%.*}}, i64 0 // CHECK: [[SPLAT1:%.*]] = shufflevector <4 x i32> [[SPLAT_SRC1]], <4 x i32> poison, <4 x i32> zeroinitializer -// CHECK: [[SPLAT_SRC2:%.*]] = insertelement <4 x i32> poison, i32 %3, i64 0 +// CHECK: [[SPLAT_SRC2:%.*]] = insertelement <4 x i32> poison, i32 {{%.*}}, i64 0 // CHECK: [[SPLAT2:%.*]] = shufflevector <4 x i32> [[SPLAT_SRC2]], <4 x i32> poison, <4 x i32> zeroinitializer // CHECK: [[SELECT:%.*]] = select <4 x i1> {{%.*}}, <4 x i32> [[SPLAT1]], <4 x i32> [[SPLAT2]] // CHECK: ret <4 x i32> [[SELECT]] int4 test_select_vector_scalar_scalar(bool4 cond0, int tVal, int fVal) { return select(cond0, tVal, fVal); } + +// CHECK-LABEL: test_select_nonbool_cond_vector_4 +// CHECK: [[TMP0:%.*]] = load <4 x i32>, ptr %cond0.addr, align 16 +// CHECK: [[TOBOOL:%.*]] = icmp ne <4 x i32> [[TMP0]], zeroinitializer +// CHECK: [[SELECT:%.*]] = select <4 x i1> [[TOBOOL]], <4 x i1> {{%.*}}, <4 x i1> {{%.*}} +// CHECK: ret <4 x i1> [[SELECT]] +bool4 test_select_nonbool_cond_vector_4(int4 cond0, bool4 tVal, bool4 fVal) { + return select(cond0, tVal, fVal); +} + +// CHECK-LABEL: test_select_nonbool_cond_vector_scalar_vector +// CHECK: [[TMP0:%.*]] = load <3 x i32>, ptr %cond0.addr, align 16 +// CHECK: [[TOBOOL:%.*]] = icmp ne <3 x i32> [[TMP0]], zeroinitializer +// CHECK: [[SPLAT_SRC1:%.*]] = insertelement <3 x i32> poison, i32 {{%.*}}, i64 0 +// CHECK: [[SPLAT1:%.*]] = shufflevector <3 x i32> [[SPLAT_SRC1]], <3 x i32> poison, <3 x i32> zeroinitializer +// CHECK: [[SELECT:%.*]] = select <3 x i1> [[TOBOOL]], <3 x i32> [[SPLAT1]], <3 x i32> {{%.*}} +// CHECK: ret <3 x i32> [[SELECT]] +int3 test_select_nonbool_cond_vector_scalar_vector(int3 cond0, int tVal, int3 fVal) { + return select(cond0, tVal, fVal); +} + +// CHECK-LABEL: test_select_nonbool_cond_vector_vector_scalar +// CHECK: [[TMP0:%.*]] = load <2 x i32>, ptr %cond0.addr, align 8 +// CHECK: [[TOBOOL:%.*]] = icmp ne <2 x i32> [[TMP0]], zeroinitializer +// CHECK: [[SPLAT_SRC1:%.*]] = insertelement <2 x i32> poison, i32 {{%.*}}, i64 0 +// CHECK: [[SPLAT1:%.*]] = shufflevector <2 x i32> [[SPLAT_SRC1]], <2 x i32> poison, <2 x i32> zeroinitializer +// CHECK: [[SELECT:%.*]] = select <2 x i1> [[TOBOOL]], <2 x i32> {{%.*}}, <2 x i32> [[SPLAT1]] +// CHECK: ret <2 x i32> [[SELECT]] +int2 test_select_nonbool_cond_vector_vector_scalar(int2 cond0, int2 tVal, int fVal) { + return select(cond0, tVal, fVal); +} + +// CHECK-LABEL: test_select_nonbool_cond_vector_scalar_scalar +// CHECK: [[TMP0:%.*]] = load <4 x i32>, ptr %cond0.addr, align 16 +// CHECK: [[TOBOOL:%.*]] = icmp ne <4 x i32> [[TMP0]], zeroinitializer +// CHECK: [[SPLAT_SRC1:%.*]] = insertelement <4 x i32> poison, i32 {{%.*}}, i64 0 +// CHECK: [[SPLAT1:%.*]] = shufflevector <4 x i32> [[SPLAT_SRC1]], <4 x i32> poison, <4 x i32> zeroinitializer +// CHECK: [[SPLAT_SRC2:%.*]] = insertelement <4 x i32> poison, i32 {{%.*}}, i64 0 +// CHECK: [[SPLAT2:%.*]] = shufflevector <4 x i32> [[SPLAT_SRC2]], <4 x i32> poison, <4 x i32> zeroinitializer +// CHECK: [[SELECT:%.*]] = select <4 x i1> [[TOBOOL]], <4 x i32> [[SPLAT1]], <4 x i32> [[SPLAT2]] +// CHECK: ret <4 x i32> [[SELECT]] +int4 test_select_nonbool_cond_vector_scalar_scalar(int4 cond0, int tVal, int fVal) { + 
return select(cond0, tVal, fVal); +} diff --git a/clang/test/SemaHLSL/BuiltIns/select-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/select-errors.hlsl index 12c818acec035..b2f45051a9bd8 100644 --- a/clang/test/SemaHLSL/BuiltIns/select-errors.hlsl +++ b/clang/test/SemaHLSL/BuiltIns/select-errors.hlsl @@ -15,7 +15,7 @@ int2 test_select_vector_vals_not_vecs(bool2 p0, int t0, } int1 test_select_vector_vals_wrong_size(bool2 p0, int1 t0, int1 f0) { - return select<int,1>(p0, t0, f0); // expected-warning{{implicit conversion truncates vector: 'bool2' (aka 'vector<bool, 2>') to 'vector<bool, 1>' (vector of 1 'bool' value)}} + return select<int1>(p0, t0, f0); // No diagnostic expected. } int test_select_no_args() { From 1c85981e3770bfcc8f4c11417be0cfa7de543a15 Mon Sep 17 00:00:00 2001 From: Paul Kirth <paulkirth@google.com> Date: Tue, 4 Nov 2025 13:21:04 -0800 Subject: [PATCH 234/313] [llvm][mustache] Avoid redundant saves in accessor splitting (#159197) The splitMustacheString function was saving StringRefs that were already backed by an arena-allocated string. This was unnecessary work. This change removes the redundant Ctx.Saver.save() call. This optimization provides a small but measurable performance improvement on top of the single-pass tokenizer, most notably reducing branch misses. Metric | Baseline | Optimized | Change -------------- | -------- | --------- | ------- Time (ms) | 35.77 | 35.57 | -0.56% Cycles | 35.16M | 34.91M | -0.71% Instructions | 85.77M | 85.54M | -0.27% Branch Misses | 113.9K | 111.9K | -1.76% Cache Misses | 237.7K | 242.1K | +1.85% --- llvm/lib/Support/Mustache.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Support/Mustache.cpp b/llvm/lib/Support/Mustache.cpp index 24e3105c5e8a9..012e1ffd534d2 100644 --- a/llvm/lib/Support/Mustache.cpp +++ b/llvm/lib/Support/Mustache.cpp @@ -51,7 +51,7 @@ static Accessor splitMustacheString(StringRef Str, MustacheContext &Ctx) { std::tie(Part, Str) = Str.split('.'); // Each part of the accessor needs to be saved to the arena // to ensure it has a stable address. - Tokens.push_back(Ctx.Saver.save(Part.trim())); + Tokens.push_back(Part.trim()); } } // Now, allocate memory for the array of StringRefs in the arena. From ce091da5df6c095585b9cda48843f4a0a4952b79 Mon Sep 17 00:00:00 2001 From: Syadus Sefat <42645939+mssefat@users.noreply.github.com> Date: Tue, 4 Nov 2025 15:37:27 -0600 Subject: [PATCH 235/313] [AMDGPU] Mark WMMA machine instructions as convergent (#165602) The WMMA MI(s) are missing the isConvergent flag. This causes incorrect behavior in passes like machine-sink, where WMMA instructions get sunk into divergent branches. This patch fixes the issue by setting the isConvergent flag to 1 in the VOP3PInstructions.td file. 
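
For context, `isConvergent` is what target-independent passes such as machine-sink consult before moving an instruction across control flow. A minimal sketch of the kind of guard involved (simplified for illustration; this is not the actual MachineSink code):

```cpp
#include "llvm/CodeGen/MachineInstr.h"

using namespace llvm;

// Convergent operations (like WMMA, which exchanges data across the lanes
// of a wave) must not be sunk into divergent branches, because that would
// change the set of lanes participating in the operation.
static bool mayMoveAcrossDivergentBranch(const MachineInstr &MI) {
  // isConvergent() reads the Convergent bit from the MCInstrDesc; the
  // `isConvergent = 1` TableGen change below is what sets that bit.
  return !MI.isConvergent();
}
```
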
--- llvm/lib/Target/AMDGPU/VOP3PInstructions.td | 10 +++-- .../CodeGen/AMDGPU/wmma-gfx12-convergent.mir | 42 +++++++++++++++++++ 2 files changed, 48 insertions(+), 4 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/wmma-gfx12-convergent.mir diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td index 4ae2c1ed04dae..31d8bce4d0c87 100644 --- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td @@ -1707,7 +1707,7 @@ multiclass WMMAInstGFX12<string Instr, VOP3PWMMA_Profile WMMAProfile, string Pse defvar WMMAConstraints2Addr = !if(DiffVdstSrc2, "@earlyclobber $vdst", "@earlyclobber $vdst,$vdst = $src2"); defvar WMMAConstraints3Addr = "@earlyclobber $vdst"; - let Mnemonic = Instr, mayRaiseFPException = 0, ReadsModeReg = 0 in { + let Mnemonic = Instr, mayRaiseFPException = 0, ReadsModeReg = 0, isConvergent = 1 in { let Constraints = WMMAConstraints2Addr, isConvertibleToThreeAddress = 1 in def _twoaddr : VOP3P_Pseudo<Instr, WMMAProfile>, WMMAInstInfo { let PseudoInstr = Instr#PseudoInstrSuffix; @@ -1734,7 +1734,7 @@ multiclass SWMMACInstGFX12<string Instr, VOP3PWMMA_Profile WMMAProfile, string P let mayRaiseFPException = 0; let ReadsModeReg = 0; let AsmMatchConverter = "cvtSWMMAC"; - + let isConvergent = 1; let Constraints = "@earlyclobber $vdst,$vdst = $srcTiedDef"; } } @@ -1906,8 +1906,10 @@ defm V_WMMA_SCALE_F32_32X16X128_F4_w32 : WMMAInstGFX12<"v_wmma_scale_f32_32x16 defm V_WMMA_SCALE16_F32_32X16X128_F4_w32 : WMMAInstGFX12<"v_wmma_scale16_f32_32x16x128_f4", F32_32X16X128_F4_SCALE16_w32, "_w32">; } // End is_wmma_xdl = 1. -defm V_WMMA_LD_SCALE_PAIRED_B32 : VOP3PInst<"v_wmma_ld_scale_paired_b32", VOP_WMMA_LD_SCALE<i32, VCSrc_b32_Lo256>>; -defm V_WMMA_LD_SCALE16_PAIRED_B64 : VOP3PInst<"v_wmma_ld_scale16_paired_b64", VOP_WMMA_LD_SCALE<i64, VCSrc_b64_Lo256>>; +let isConvergent = 1 in { + defm V_WMMA_LD_SCALE_PAIRED_B32 : VOP3PInst<"v_wmma_ld_scale_paired_b32", VOP_WMMA_LD_SCALE<i32, VCSrc_b32_Lo256>>; + defm V_WMMA_LD_SCALE16_PAIRED_B64 : VOP3PInst<"v_wmma_ld_scale16_paired_b64", VOP_WMMA_LD_SCALE<i64, VCSrc_b64_Lo256>>; +} } // End SubtargetPredicate = isGFX125xOnly } // End WaveSizePredicate = isWave32 diff --git a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-convergent.mir b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-convergent.mir new file mode 100644 index 0000000000000..df3e780c61f46 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-convergent.mir @@ -0,0 +1,42 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6 +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx12-generic -run-pass=machine-sink %s -o - | FileCheck %s + +--- +name: wmma_test +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: wmma_test + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[DEF:%[0-9]+]]:vreg_128 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vreg_128 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF2:%[0-9]+]]:sreg_32 = IMPLICIT_DEF + ; CHECK-NEXT: early-clobber %3:vreg_256 = V_WMMA_F32_16X16X16_F16_w32_threeaddr 8, [[DEF]], 8, [[DEF1]], 8, 0, 0, 0, implicit $exec + ; CHECK-NEXT: [[SI_IF:%[0-9]+]]:sreg_32 = SI_IF [[DEF2]], %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: S_BRANCH %bb.1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.2(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_256 = COPY %3.sub1 + ; CHECK-NEXT: {{ $}} 
+ ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: S_ENDPGM 0 + bb.0: + %0:vreg_128 = IMPLICIT_DEF + %1:vreg_128 = IMPLICIT_DEF + %2:sreg_32 = IMPLICIT_DEF + early-clobber %3:vreg_256 = V_WMMA_F32_16X16X16_F16_w32_threeaddr 8, %0:vreg_128, 8, %1:vreg_128, 8, 0, 0, 0, implicit $exec + %4:sreg_32 = SI_IF %2:sreg_32, %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + S_BRANCH %bb.1 + + bb.1: + %5:vreg_256 = COPY %3.sub1:vreg_256 + + bb.2: + SI_END_CF %4:sreg_32, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + S_ENDPGM 0 + +... From 8cd22447af239206daabb42fc63e2824a1f7fb6e Mon Sep 17 00:00:00 2001 From: Saleem Abdulrasool <compnerd@compnerd.org> Date: Tue, 4 Nov 2025 13:40:42 -0800 Subject: [PATCH 236/313] AArch64: correct `preserve_most` and `preserve_all` on Windows (#166436) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This fixes register information handling for the `preserve_most` and `preserve_all` calling conventions on Windows ARM64. The root issue was cascading `if` statements whose behavior depended on their order. This patch makes the minimal, tactical change needed for Swift’s two calling conventions, unblocking current work. A broader refactor to remove the ordering dependency is still desired and will follow in a subsequent PR. --- .../AArch64/AArch64CallingConvention.td | 6 +++ .../Target/AArch64/AArch64RegisterInfo.cpp | 14 +++++-- llvm/test/CodeGen/AArch64/preserve_mostcc.ll | 38 ++++++++++++------- 3 files changed, 40 insertions(+), 18 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64CallingConvention.td b/llvm/lib/Target/AArch64/AArch64CallingConvention.td index 1b5a713bffdc9..34c85d588f9c4 100644 --- a/llvm/lib/Target/AArch64/AArch64CallingConvention.td +++ b/llvm/lib/Target/AArch64/AArch64CallingConvention.td @@ -601,6 +601,12 @@ def CSR_Win_AArch64_AAPCS_SwiftError def CSR_Win_AArch64_AAPCS_SwiftTail : CalleeSavedRegs<(sub CSR_Win_AArch64_AAPCS, X20, X22)>; +def CSR_Win_AArch64_RT_MostRegs + : CalleeSavedRegs<(add CSR_Win_AArch64_AAPCS, (sequence "X%u", 9, 15))>; + +def CSR_Win_AArch64_RT_AllRegs + : CalleeSavedRegs<(add CSR_Win_AArch64_RT_MostRegs, (sequence "Q%u", 8, 31))>; + // The Control Flow Guard check call uses a custom calling convention that also // preserves X0-X8 and Q0-Q7. def CSR_Win_AArch64_CFGuard_Check : CalleeSavedRegs<(add CSR_Win_AArch64_AAPCS, diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp index 5bfb19d9a7e61..a5048b9c9e61d 100644 --- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp @@ -90,6 +90,16 @@ AArch64RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { if (MF->getSubtarget<AArch64Subtarget>().isTargetDarwin()) return getDarwinCalleeSavedRegs(MF); + if (MF->getFunction().getCallingConv() == CallingConv::PreserveMost) + return MF->getSubtarget<AArch64Subtarget>().isTargetWindows() + ? CSR_Win_AArch64_RT_MostRegs_SaveList + : CSR_AArch64_RT_MostRegs_SaveList; + + if (MF->getFunction().getCallingConv() == CallingConv::PreserveAll) + return MF->getSubtarget<AArch64Subtarget>().isTargetWindows() + ? 
CSR_Win_AArch64_RT_AllRegs_SaveList
+               : CSR_AArch64_RT_AllRegs_SaveList;
+
   if (MF->getFunction().getCallingConv() == CallingConv::CFGuard_Check)
     return CSR_Win_AArch64_CFGuard_Check_SaveList;
   if (MF->getSubtarget<AArch64Subtarget>().isTargetWindows()) {
@@ -138,10 +148,6 @@ AArch64RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
     return CSR_AArch64_AAPCS_SwiftError_SaveList;
   if (MF->getFunction().getCallingConv() == CallingConv::SwiftTail)
     return CSR_AArch64_AAPCS_SwiftTail_SaveList;
-  if (MF->getFunction().getCallingConv() == CallingConv::PreserveMost)
-    return CSR_AArch64_RT_MostRegs_SaveList;
-  if (MF->getFunction().getCallingConv() == CallingConv::PreserveAll)
-    return CSR_AArch64_RT_AllRegs_SaveList;
   if (MF->getFunction().getCallingConv() == CallingConv::Win64)
     // This is for OSes other than Windows; Windows is a separate case further
     // above.
diff --git a/llvm/test/CodeGen/AArch64/preserve_mostcc.ll b/llvm/test/CodeGen/AArch64/preserve_mostcc.ll
index 7f0968c8eb339..75c8567e2095e 100644
--- a/llvm/test/CodeGen/AArch64/preserve_mostcc.ll
+++ b/llvm/test/CodeGen/AArch64/preserve_mostcc.ll
@@ -1,4 +1,5 @@
-; RUN: llc < %s -mtriple=arm64-apple-ios-8.0.0 | FileCheck %s
+; RUN: llc < %s -mtriple=arm64-apple-ios-8.0.0 | FileCheck -check-prefix CHECK -check-prefix CHECK-DARWIN %s
+; RUN: llc < %s -mtriple=aarch64-unknown-windows-msvc | FileCheck -check-prefix CHECK -check-prefix CHECK-WIN %s

 declare void @standard_cc_func()
 declare preserve_mostcc void @preserve_mostcc_func()
@@ -8,18 +9,26 @@ declare preserve_mostcc void @preserve_mostcc_func()
 define preserve_mostcc void @preserve_mostcc1() nounwind {
 entry:
 ;CHECK-LABEL: preserve_mostcc1
-;CHECK-NOT: stp
-;CHECK-NOT: str
-;CHECK: str x15
-;CHECK-NEXT: stp x14, x13,
-;CHECK-NEXT: stp x12, x11,
-;CHECK-NEXT: stp x10, x9,
-;CHECK: bl _standard_cc_func
+;CHECK-DARWIN-NOT: stp
+;CHECK-DARWIN-NOT: str
+;CHECK-DARWIN: str x15
+;CHECK-DARWIN-NEXT: stp x14, x13,
+;CHECK-DARWIN-NEXT: stp x12, x11,
+;CHECK-DARWIN-NEXT: stp x10, x9,
+;CHECK-WIN: stp x15, x14
+;CHECK-WIN-NEXT: stp x13, x12,
+;CHECK-WIN-NEXT: stp x11, x10,
+;CHECK-WIN-NEXT: stp x9, x30
+;CHECK: bl {{_?}}standard_cc_func
   call void @standard_cc_func()
-;CHECK: ldp x10, x9,
-;CHECK-NEXT: ldp x12, x11,
-;CHECK-NEXT: ldp x14, x13,
-;CHECK-NEXT: ldr x15
+;CHECK-DARWIN: ldp x10, x9,
+;CHECK-DARWIN-NEXT: ldp x12, x11,
+;CHECK-DARWIN-NEXT: ldp x14, x13,
+;CHECK-DARWIN-NEXT: ldr x15
+;CHECK-WIN: ldp x9, x30
+;CHECK-WIN-NEXT: ldp x11, x10,
+;CHECK-WIN-NEXT: ldp x13, x12,
+;CHECK-WIN-NEXT: ldp x15, x14,
   ret void
 }

@@ -31,9 +40,10 @@ define preserve_mostcc void @preserve_mostcc2() nounwind {
 entry:
 ;CHECK-LABEL: preserve_mostcc2
 ;CHECK-NOT: x14
-;CHECK: stp x29, x30,
+;CHECK-DARWIN: stp x29, x30,
+;CHECK-WIN: str x30
 ;CHECK-NOT: x14
-;CHECK: bl _preserve_mostcc_func
+;CHECK: bl {{_?}}preserve_mostcc_func
   call preserve_mostcc void @preserve_mostcc_func()
   ret void
 }

From 025e431e7450cada2724b19eb59354a6c020fa4f Mon Sep 17 00:00:00 2001
From: Alireza Torabian <alireza.torabian@huawei.com>
Date: Tue, 4 Nov 2025 16:48:39 -0500
Subject: [PATCH 237/313] [LoopFusion] Forget loop and block dispositions after
 latch merge (#166233)

Merging the latches of loops may affect the dispositions, so they should
be forgotten after the merge. This patch fixes the crash in loop fusion
[#164082](https://github.com/llvm/llvm-project/issues/164082).
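
In other words, the safe ordering is (a condensed sketch of the change
below, using the names from LoopFuse.cpp):

```cpp
SE.forgetLoop(FC1.L);
SE.forgetLoop(FC0.L);
// mergeLatch() may erase FC0's latch block, so cached block/loop
// dispositions that point at it can only be invalidated afterwards.
mergeLatch(FC0, FC1);
SE.forgetBlockAndLoopDispositions();
```
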
---
 llvm/lib/Transforms/Scalar/LoopFuse.cpp     | 16 +++--
 llvm/test/Transforms/LoopFusion/pr164082.ll | 65 +++++++++++++++++++++
 2 files changed, 75 insertions(+), 6 deletions(-)
 create mode 100644 llvm/test/Transforms/LoopFusion/pr164082.ll

diff --git a/llvm/lib/Transforms/Scalar/LoopFuse.cpp b/llvm/lib/Transforms/Scalar/LoopFuse.cpp
index 19eccb9e17020..9ffa602416b05 100644
--- a/llvm/lib/Transforms/Scalar/LoopFuse.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopFuse.cpp
@@ -1796,14 +1796,16 @@ struct LoopFuser {
     // mergeLatch may remove the only block in FC1.
     SE.forgetLoop(FC1.L);
     SE.forgetLoop(FC0.L);
-    // Forget block dispositions as well, so that there are no dangling
-    // pointers to erased/free'ed blocks.
-    SE.forgetBlockAndLoopDispositions();

     // Move instructions from FC0.Latch to FC1.Latch.
     // Note: mergeLatch requires an updated DT.
     mergeLatch(FC0, FC1);

+    // Forget block dispositions as well, so that there are no dangling
+    // pointers to erased/free'ed blocks. It should be done after mergeLatch()
+    // since merging the latches may affect the dispositions.
+    SE.forgetBlockAndLoopDispositions();
+
     // Merge the loops.
     SmallVector<BasicBlock *, 8> Blocks(FC1.L->blocks());
     for (BasicBlock *BB : Blocks) {
@@ -2092,14 +2094,16 @@ struct LoopFuser {
     // mergeLatch may remove the only block in FC1.
     SE.forgetLoop(FC1.L);
     SE.forgetLoop(FC0.L);
-    // Forget block dispositions as well, so that there are no dangling
-    // pointers to erased/free'ed blocks.
-    SE.forgetBlockAndLoopDispositions();

     // Move instructions from FC0.Latch to FC1.Latch.
     // Note: mergeLatch requires an updated DT.
     mergeLatch(FC0, FC1);

+    // Forget block dispositions as well, so that there are no dangling
+    // pointers to erased/free'ed blocks. It should be done after mergeLatch()
+    // since merging the latches may affect the dispositions.
+    SE.forgetBlockAndLoopDispositions();
+
     // Merge the loops.
     SmallVector<BasicBlock *, 8> Blocks(FC1.L->blocks());
     for (BasicBlock *BB : Blocks) {
diff --git a/llvm/test/Transforms/LoopFusion/pr164082.ll b/llvm/test/Transforms/LoopFusion/pr164082.ll
new file mode 100644
index 0000000000000..652557cef48f8
--- /dev/null
+++ b/llvm/test/Transforms/LoopFusion/pr164082.ll
@@ -0,0 +1,65 @@
+; REQUIRES: asserts
+; RUN: opt -passes=loop-fusion -disable-output -stats < %s 2>&1 | FileCheck -check-prefix=STAT %s
+; STAT: 1 loop-fusion - Loops fused
+
+; C Code
+;
+;; for (int i = 0; i < 100; ++i)
+;;   Array[i][i] = -i;
+;; for (int row = 0; row < 100; ++row)
+;;   for (int col = 0; col < 100; ++col)
+;;     if (col != row)
+;;       Array[row][col] = row + col;
+;
+; Loop fusion should not crash anymore as now forgetBlockAndLoopDispositions()
+; is triggered after mergeLatch() during the fusion.
+
+define i32 @forget_dispositions() nounwind {
+entry:
+  %Array = alloca [100 x [100 x i32]], align 4
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv33 = phi i64 [ 0, %entry ], [ %indvars.iv.next34, %for.body ]
+  %0 = trunc i64 %indvars.iv33 to i32
+  %sub = sub i32 0, %0
+  %arrayidx2 = getelementptr inbounds [100 x [100 x i32]], ptr %Array, i64 0, i64 %indvars.iv33, i64 %indvars.iv33
+  store i32 %sub, ptr %arrayidx2, align 4
+  %indvars.iv.next34 = add i64 %indvars.iv33, 1
+  %lftr.wideiv35 = trunc i64 %indvars.iv.next34 to i32
+  %exitcond36 = icmp eq i32 %lftr.wideiv35, 100
+  br i1 %exitcond36, label %for.cond6.preheader, label %for.body
+
+for.cond6.preheader:                              ; preds = %for.body, %for.inc17
+  %indvars.iv29 = phi i64 [ %indvars.iv.next30, %for.inc17 ], [ 0, %for.body ]
+  br label %for.body8
+
+for.body8:                                        ; preds = %for.inc14, %for.cond6.preheader
+  %indvars.iv = phi i64 [ 0, %for.cond6.preheader ], [ %indvars.iv.next, %for.inc14 ]
+  %1 = trunc i64 %indvars.iv to i32
+  %2 = trunc i64 %indvars.iv29 to i32
+  %cmp9 = icmp eq i32 %1, %2
+  br i1 %cmp9, label %for.inc14, label %if.then
+
+if.then:                                          ; preds = %for.body8
+  %3 = add i64 %indvars.iv, %indvars.iv29
+  %arrayidx13 = getelementptr inbounds [100 x [100 x i32]], ptr %Array, i64 0, i64 %indvars.iv29, i64 %indvars.iv
+  %4 = trunc i64 %3 to i32
+  store i32 %4, ptr %arrayidx13, align 4
+  br label %for.inc14
+
+for.inc14:                                        ; preds = %for.body8, %if.then
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv27 = trunc i64 %indvars.iv.next to i32
+  %exitcond28 = icmp eq i32 %lftr.wideiv27, 100
+  br i1 %exitcond28, label %for.inc17, label %for.body8
+
+for.inc17:                                        ; preds = %for.inc14
+  %indvars.iv.next30 = add i64 %indvars.iv29, 1
+  %lftr.wideiv31 = trunc i64 %indvars.iv.next30 to i32
+  %exitcond32 = icmp eq i32 %lftr.wideiv31, 100
+  br i1 %exitcond32, label %for.exit, label %for.cond6.preheader
+
+for.exit:                                         ; preds = %for.inc17
+  ret i32 0
+}

From ce091da5df6c095585b9cda48843f4a0a4952b79 Mon Sep 17 00:00:00 2001
From: Han-Chung Wang <hanhan0912@gmail.com>
Date: Tue, 4 Nov 2025 13:49:46 -0800
Subject: [PATCH 238/313] Revert "[mlir][memref]: Collapse strided unit dim
 even if strides are dynamic" (#166448)

Reverts llvm/llvm-project#157330

The original revision introduces a bug in `isGuaranteedCollapsible`: the
`memref<3x3x1x96xf32, strided<[288, 96, 96, 1], offset: 864>>` is no
longer collapsible with that change. This revision reverts the change to
restore the correct behavior. `stride` should be computed as `96`, as in
the old behavior, in the failed iteration.

https://github.com/llvm/llvm-project/blob/92a1eb37122fa24e3045fbabdea2bf87127cace5/mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp#L2597-L2605

---
 mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp | 10 +++++-----
 mlir/test/Dialect/MemRef/ops.mlir        |  7 +------
 2 files changed, 6 insertions(+), 11 deletions(-)

diff --git a/mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp b/mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp
index e271ac58db327..1c21a2f270da6 100644
--- a/mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp
+++ b/mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp
@@ -2568,11 +2568,6 @@ computeCollapsedLayoutMap(MemRefType srcType,
   auto trailingReassocs = ArrayRef<int64_t>(reassoc).drop_front();
   auto stride = SaturatedInteger::wrap(resultStrides[resultStrideIndex--]);
   for (int64_t idx : llvm::reverse(trailingReassocs)) {
- if (srcShape[idx - 1] == 1) - continue; - stride = stride * SaturatedInteger::wrap(srcShape[idx]); // Both source and result stride must have the same static value. In that @@ -2587,6 +2582,11 @@ computeCollapsedLayoutMap(MemRefType srcType, if (strict && (stride.saturated || srcStride.saturated)) return failure(); + // Dimensions of size 1 should be skipped, because their strides are + // meaningless and could have any arbitrary value. + if (srcShape[idx - 1] == 1) + continue; + if (!stride.saturated && !srcStride.saturated && stride != srcStride) return failure(); } diff --git a/mlir/test/Dialect/MemRef/ops.mlir b/mlir/test/Dialect/MemRef/ops.mlir index b1db99bb3ad08..a90c9505a8405 100644 --- a/mlir/test/Dialect/MemRef/ops.mlir +++ b/mlir/test/Dialect/MemRef/ops.mlir @@ -440,8 +440,7 @@ func.func @expand_collapse_shape_dynamic(%arg0: memref<?x?x?xf32>, %arg4: index, %arg5: index, %arg6: index, - %arg7: memref<4x?x4xf32>, - %arg8: memref<1x1x18x?xsi8, strided<[?, ?, ?, 1], offset: ?>>) { + %arg7: memref<4x?x4xf32>) { // CHECK: memref.collapse_shape {{.*}} {{\[}}[0, 1], [2]] // CHECK-SAME: memref<?x?x?xf32> into memref<?x?xf32> %0 = memref.collapse_shape %arg0 [[0, 1], [2]] : @@ -490,10 +489,6 @@ func.func @expand_collapse_shape_dynamic(%arg0: memref<?x?x?xf32>, // CHECK: memref.expand_shape {{.*}} {{\[}}[0, 1], [2], [3, 4]] %4 = memref.expand_shape %arg7 [[0, 1], [2], [3, 4]] output_shape [2, 2, %arg4, 2, 2] : memref<4x?x4xf32> into memref<2x2x?x2x2xf32> - -// CHECK: memref.collapse_shape {{.*}} {{\[}}[0, 1], [2], [3]] -// CHECK-SAME: memref<1x1x18x?xsi8, strided<[?, ?, ?, 1], offset: ?>> into memref<1x18x?xsi8, strided<[?, ?, 1], offset: ?>> - %5 = memref.collapse_shape %arg8 [[0, 1], [2], [3]] : memref<1x1x18x?xsi8, strided<[?, ?, ?, 1], offset: ?>> into memref<1x18x?xsi8, strided<[?, ?, 1], offset: ?>> return } From cf73a0b102c012f30e5ee43638a78ed21e6b81b3 Mon Sep 17 00:00:00 2001 From: David Green <david.green@arm.com> Date: Tue, 4 Nov 2025 21:55:57 +0000 Subject: [PATCH 239/313] [AArch64] Copy implicit def operands when creating LDP. (#164253) Otherwise we might end up with undefined register uses. Copying implicit uses can cause problems where a register is both defined and used in the same LDP, so I have not tried to add them here. Fixes #164230 --- .../AArch64/AArch64LoadStoreOptimizer.cpp | 19 +++++ llvm/test/CodeGen/AArch64/ldst-implicitop.mir | 80 +++++++++++++++++++ 2 files changed, 99 insertions(+) create mode 100644 llvm/test/CodeGen/AArch64/ldst-implicitop.mir diff --git a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp index e69fa32967a79..2ab7bf19da410 100644 --- a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp +++ b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp @@ -1386,6 +1386,25 @@ AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I, if (MOP.isReg() && MOP.isKill()) DefinedInBB.addReg(MOP.getReg()); + // Copy over any implicit-def operands. This is like MI.copyImplicitOps, but + // only copies implicit defs and makes sure that each operand is only added + // once in case of duplicates. 
+ auto CopyImplicitOps = [&](MachineBasicBlock::iterator MI1, + MachineBasicBlock::iterator MI2) { + SmallSetVector<Register, 4> Ops; + for (const MachineOperand &MO : + llvm::drop_begin(MI1->operands(), MI1->getDesc().getNumOperands())) + if (MO.isReg() && MO.isImplicit() && MO.isDef()) + Ops.insert(MO.getReg()); + for (const MachineOperand &MO : + llvm::drop_begin(MI2->operands(), MI2->getDesc().getNumOperands())) + if (MO.isReg() && MO.isImplicit() && MO.isDef()) + Ops.insert(MO.getReg()); + for (auto Op : Ops) + MIB.addDef(Op, RegState::Implicit); + }; + CopyImplicitOps(I, Paired); + // Erase the old instructions. I->eraseFromParent(); Paired->eraseFromParent(); diff --git a/llvm/test/CodeGen/AArch64/ldst-implicitop.mir b/llvm/test/CodeGen/AArch64/ldst-implicitop.mir new file mode 100644 index 0000000000000..34e8cf282669c --- /dev/null +++ b/llvm/test/CodeGen/AArch64/ldst-implicitop.mir @@ -0,0 +1,80 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6 +# RUN: llc -mtriple=aarch64-- -run-pass=aarch64-ldst-opt -verify-machineinstrs -o - %s | FileCheck %s +# Check that we copy implicit operands. +--- +name: impdef_op1 +tracksRegLiveness: true +body: | + bb.0: + liveins: $lr + ; CHECK-LABEL: name: impdef_op1 + ; CHECK: liveins: $lr + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: renamable $q5, renamable $q20 = LDPQi renamable $lr, 3, implicit-def $q4_q5 :: (load (s128)) + ; CHECK-NEXT: $q0 = ORRv16i8 $q4, killed $q4 + ; CHECK-NEXT: $q1 = ORRv16i8 $q5, killed $q5 + ; CHECK-NEXT: RET_ReallyLR + renamable $q5 = LDRQui renamable $lr, 3, implicit-def $q4_q5 :: (load (s128)) + renamable $q20 = LDRQui renamable $lr, 4 :: (load (s128)) + $q0 = ORRv16i8 $q4, killed $q4 + $q1 = ORRv16i8 $q5, killed $q5 + RET_ReallyLR +... +--- +name: impdef_op2 +body: | + bb.0: + liveins: $lr + ; CHECK-LABEL: name: impdef_op2 + ; CHECK: liveins: $lr + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: renamable $q20, renamable $q5 = LDPQi renamable $lr, 3, implicit-def $q4_q5 :: (load (s128)) + ; CHECK-NEXT: $q0 = ORRv16i8 $q4, killed $q4 + ; CHECK-NEXT: $q1 = ORRv16i8 $q5, killed $q5 + ; CHECK-NEXT: RET_ReallyLR + renamable $q20 = LDRQui renamable $lr, 3 :: (load (s128)) + renamable $q5 = LDRQui renamable $lr, 4, implicit-def $q4_q5 :: (load (s128)) + $q0 = ORRv16i8 $q4, killed $q4 + $q1 = ORRv16i8 $q5, killed $q5 + RET_ReallyLR +... +--- +name: impdef_both +body: | + bb.0: + liveins: $lr + ; CHECK-LABEL: name: impdef_both + ; CHECK: liveins: $lr + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: renamable $q5, renamable $q20 = LDPQi renamable $lr, 3, implicit-def $q4_q5, implicit-def $q20_q21 :: (load (s128)) + ; CHECK-NEXT: $q0 = ORRv16i8 $q4, killed $q4 + ; CHECK-NEXT: $q1 = ORRv16i8 $q5, killed $q5 + ; CHECK-NEXT: $q2 = ORRv16i8 $q20, killed $q20 + ; CHECK-NEXT: $q3 = ORRv16i8 $q21, killed $q21 + ; CHECK-NEXT: RET_ReallyLR + renamable $q5 = LDRQui renamable $lr, 3, implicit-def $q4_q5 :: (load (s128)) + renamable $q20 = LDRQui renamable $lr, 4, implicit-def $q20_q21 :: (load (s128)) + $q0 = ORRv16i8 $q4, killed $q4 + $q1 = ORRv16i8 $q5, killed $q5 + $q2 = ORRv16i8 $q20, killed $q20 + $q3 = ORRv16i8 $q21, killed $q21 + RET_ReallyLR +... 
+---
+name: impdef_both_same
+body:             |
+  bb.0:
+    liveins: $lr
+    ; CHECK-LABEL: name: impdef_both_same
+    ; CHECK: liveins: $lr
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: renamable $q5, renamable $q20 = LDPQi renamable $lr, 3, implicit-def $q4_q5 :: (load (s128))
+    ; CHECK-NEXT: $q0 = ORRv16i8 $q4, killed $q4
+    ; CHECK-NEXT: $q1 = ORRv16i8 $q5, killed $q5
+    ; CHECK-NEXT: RET_ReallyLR
+    renamable $q5 = LDRQui renamable $lr, 3, implicit-def $q4_q5 :: (load (s128))
+    renamable $q20 = LDRQui renamable $lr, 4, implicit-def $q4_q5 :: (load (s128))
+    $q0 = ORRv16i8 $q4, killed $q4
+    $q1 = ORRv16i8 $q5, killed $q5
+    RET_ReallyLR
+...
From c93df83b0469902ad22de3e98a8325406a96b960 Mon Sep 17 00:00:00 2001
From: Min-Yih Hsu <min.hsu@sifive.com>
Date: Tue, 4 Nov 2025 14:03:45 -0800
Subject: [PATCH 240/313] [opt-viewer] Account for optimization records YAML
 files generated by LTO (#135059)

The optimization records YAML files generated by Clang's LTO pipeline
are named "*.opt.ld.yaml" rather than "*.opt.yaml". This patch adds that
pattern into the search list of `find_opt_files` as well.
---
 llvm/tools/opt-viewer/optrecord.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/llvm/tools/opt-viewer/optrecord.py b/llvm/tools/opt-viewer/optrecord.py
index b9244fd1ae739..07e21028535c4 100644
--- a/llvm/tools/opt-viewer/optrecord.py
+++ b/llvm/tools/opt-viewer/optrecord.py
@@ -344,6 +344,8 @@ def find_opt_files(*dirs_or_files):
             d for d in subdirs if not os.path.ismount(os.path.join(dir, d))
         ]
         for file in files:
-            if fnmatch.fnmatch(file, "*.opt.yaml*"):
+            if fnmatch.fnmatch(file, "*.opt.yaml*") or fnmatch.fnmatch(
+                file, "*.opt.ld.yaml*"
+            ):
                 all.append(os.path.join(dir, file))
     return all
From d4222bf9e2175dc8d0707442802a222d652d0116 Mon Sep 17 00:00:00 2001
From: Nikolas Klauser <nikolasklauser@berlin.de>
Date: Tue, 4 Nov 2025 23:29:13 +0100
Subject: [PATCH 241/313] [libc++] Use saturation builtins directly for
 {add,sub}_sat (#165228)

This doesn't improve performance (except with optimizations disabled),
since the compiler is able to fold our current implementation. However,
it does significantly reduce the amount of code the compiler has to
sift through, reducing compile times a bit.
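For reference, a minimal standalone sketch of the two formulations this change switches between, using plain `int`. The function names are illustrative, not libc++ code; `__builtin_elementwise_add_sat` and `__builtin_add_overflow` are existing Clang builtins:

```c++
#include <climits>

// New form: the compiler emits the saturating add directly.
// (Illustrative helper, not the actual libc++ implementation.)
int add_sat_builtin(int x, int y) {
  return __builtin_elementwise_add_sat(x, y);
}

// Old form: detect wraparound, then clamp toward the operands' shared sign.
int add_sat_manual(int x, int y) {
  int sum;
  if (!__builtin_add_overflow(x, y, &sum))
    return sum;
  // Signed addition overflows only when both operands have the same sign.
  return x < 0 ? INT_MIN : INT_MAX;
}
```

Under optimizations both forms fold to the same saturating sequence; the builtin simply gives the frontend far less code to process.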
--- libcxx/include/__numeric/saturation_arithmetic.h | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/libcxx/include/__numeric/saturation_arithmetic.h b/libcxx/include/__numeric/saturation_arithmetic.h index 9bd3af12c9572..7a7410b5dea08 100644 --- a/libcxx/include/__numeric/saturation_arithmetic.h +++ b/libcxx/include/__numeric/saturation_arithmetic.h @@ -30,6 +30,9 @@ _LIBCPP_BEGIN_NAMESPACE_STD template <__signed_or_unsigned_integer _Tp> _LIBCPP_HIDE_FROM_ABI constexpr _Tp __add_sat(_Tp __x, _Tp __y) noexcept { +# if defined(_LIBCPP_CLANG_VER) && _LIBCPP_CLANG_VER >= 2101 + return __builtin_elementwise_add_sat(__x, __y); +# else if (_Tp __sum; !__builtin_add_overflow(__x, __y, std::addressof(__sum))) return __sum; // Handle overflow @@ -44,10 +47,14 @@ _LIBCPP_HIDE_FROM_ABI constexpr _Tp __add_sat(_Tp __x, _Tp __y) noexcept { // Overflows if (x < 0 && y < 0) return std::numeric_limits<_Tp>::min(); } +# endif } template <__signed_or_unsigned_integer _Tp> _LIBCPP_HIDE_FROM_ABI constexpr _Tp __sub_sat(_Tp __x, _Tp __y) noexcept { +# if defined(_LIBCPP_CLANG_VER) && _LIBCPP_CLANG_VER >= 2101 + return __builtin_elementwise_sub_sat(__x, __y); +# else if (_Tp __sub; !__builtin_sub_overflow(__x, __y, std::addressof(__sub))) return __sub; // Handle overflow @@ -63,6 +70,7 @@ _LIBCPP_HIDE_FROM_ABI constexpr _Tp __sub_sat(_Tp __x, _Tp __y) noexcept { // Overflows if (x < 0 && y > 0) return std::numeric_limits<_Tp>::min(); } +# endif } template <__signed_or_unsigned_integer _Tp> From a51d219ee7ac9ca96ade7639bff5097c8b79c130 Mon Sep 17 00:00:00 2001 From: Andrew Haberlandt <ndrewh@users.noreply.github.com> Date: Tue, 4 Nov 2025 14:32:06 -0800 Subject: [PATCH 242/313] [sanitizer_common] Add arm64e module type (#166018) This will fix some symbolication failures on arm64e machines when the symbolicator passes the (wrong) architecture string to atos. 
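As a quick sketch of the new mapping, in terms of the functions this patch touches (surrounding setup omitted):

```c++
// An arm64e Mach-O image now maps to its own enum value rather than plain
// arm64, so the architecture string handed to atos becomes "arm64e".
ModuleArch Arch = ModuleArchFromCpuType(CPU_TYPE_ARM64, CPU_SUBTYPE_ARM64E);
const char *Name = ModuleArchToString(Arch); // "arm64e"
```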
--- .../lib/asan/scripts/asan_symbolize.py | 1 + .../lib/sanitizer_common/sanitizer_common.h | 3 ++ .../sanitizer_procmaps_mac.cpp | 47 ++++++++++++------- .../tests/sanitizer_procmaps_test.cpp | 2 +- 4 files changed, 34 insertions(+), 19 deletions(-) diff --git a/compiler-rt/lib/asan/scripts/asan_symbolize.py b/compiler-rt/lib/asan/scripts/asan_symbolize.py index 8ecd66c745119..091e9bcc9a796 100755 --- a/compiler-rt/lib/asan/scripts/asan_symbolize.py +++ b/compiler-rt/lib/asan/scripts/asan_symbolize.py @@ -59,6 +59,7 @@ def is_valid_arch(s): "armv7s", "armv7k", "arm64", + "arm64e", "powerpc64", "powerpc64le", "s390x", diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_common.h b/compiler-rt/lib/sanitizer_common/sanitizer_common.h index ba85a0eb5a35e..b515b15b327d8 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_common.h +++ b/compiler-rt/lib/sanitizer_common/sanitizer_common.h @@ -737,6 +737,7 @@ enum ModuleArch { kModuleArchARMV7S, kModuleArchARMV7K, kModuleArchARM64, + kModuleArchARM64E, kModuleArchLoongArch64, kModuleArchRISCV64, kModuleArchHexagon @@ -810,6 +811,8 @@ inline const char *ModuleArchToString(ModuleArch arch) { return "armv7k"; case kModuleArchARM64: return "arm64"; + case kModuleArchARM64E: + return "arm64e"; case kModuleArchLoongArch64: return "loongarch64"; case kModuleArchRISCV64: diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_procmaps_mac.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_procmaps_mac.cpp index a9533d6fc04ca..a5ec85ae16460 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_procmaps_mac.cpp +++ b/compiler-rt/lib/sanitizer_common/sanitizer_procmaps_mac.cpp @@ -20,18 +20,21 @@ #include <mach/mach.h> // These are not available in older macOS SDKs. -#ifndef CPU_SUBTYPE_X86_64_H -#define CPU_SUBTYPE_X86_64_H ((cpu_subtype_t)8) /* Haswell */ -#endif -#ifndef CPU_SUBTYPE_ARM_V7S -#define CPU_SUBTYPE_ARM_V7S ((cpu_subtype_t)11) /* Swift */ -#endif -#ifndef CPU_SUBTYPE_ARM_V7K -#define CPU_SUBTYPE_ARM_V7K ((cpu_subtype_t)12) -#endif -#ifndef CPU_TYPE_ARM64 -#define CPU_TYPE_ARM64 (CPU_TYPE_ARM | CPU_ARCH_ABI64) -#endif +# ifndef CPU_SUBTYPE_X86_64_H +# define CPU_SUBTYPE_X86_64_H ((cpu_subtype_t)8) /* Haswell */ +# endif +# ifndef CPU_SUBTYPE_ARM_V7S +# define CPU_SUBTYPE_ARM_V7S ((cpu_subtype_t)11) /* Swift */ +# endif +# ifndef CPU_SUBTYPE_ARM_V7K +# define CPU_SUBTYPE_ARM_V7K ((cpu_subtype_t)12) +# endif +# ifndef CPU_TYPE_ARM64 +# define CPU_TYPE_ARM64 (CPU_TYPE_ARM | CPU_ARCH_ABI64) +# endif +# ifndef CPU_SUBTYPE_ARM64E +# define CPU_SUBTYPE_ARM64E ((cpu_subtype_t)2) +# endif namespace __sanitizer { @@ -311,18 +314,26 @@ ModuleArch ModuleArchFromCpuType(cpu_type_t cputype, cpu_subtype_t cpusubtype) { case CPU_TYPE_I386: return kModuleArchI386; case CPU_TYPE_X86_64: - if (cpusubtype == CPU_SUBTYPE_X86_64_ALL) return kModuleArchX86_64; - if (cpusubtype == CPU_SUBTYPE_X86_64_H) return kModuleArchX86_64H; + if (cpusubtype == CPU_SUBTYPE_X86_64_ALL) + return kModuleArchX86_64; + if (cpusubtype == CPU_SUBTYPE_X86_64_H) + return kModuleArchX86_64H; CHECK(0 && "Invalid subtype of x86_64"); return kModuleArchUnknown; case CPU_TYPE_ARM: - if (cpusubtype == CPU_SUBTYPE_ARM_V6) return kModuleArchARMV6; - if (cpusubtype == CPU_SUBTYPE_ARM_V7) return kModuleArchARMV7; - if (cpusubtype == CPU_SUBTYPE_ARM_V7S) return kModuleArchARMV7S; - if (cpusubtype == CPU_SUBTYPE_ARM_V7K) return kModuleArchARMV7K; + if (cpusubtype == CPU_SUBTYPE_ARM_V6) + return kModuleArchARMV6; + if (cpusubtype == CPU_SUBTYPE_ARM_V7) + return kModuleArchARMV7; + if 
(cpusubtype == CPU_SUBTYPE_ARM_V7S) + return kModuleArchARMV7S; + if (cpusubtype == CPU_SUBTYPE_ARM_V7K) + return kModuleArchARMV7K; CHECK(0 && "Invalid subtype of ARM"); return kModuleArchUnknown; case CPU_TYPE_ARM64: + if (cpusubtype == CPU_SUBTYPE_ARM64E) + return kModuleArchARM64E; return kModuleArchARM64; default: CHECK(0 && "Invalid CPU type"); diff --git a/compiler-rt/lib/sanitizer_common/tests/sanitizer_procmaps_test.cpp b/compiler-rt/lib/sanitizer_common/tests/sanitizer_procmaps_test.cpp index 00542b944f516..c18e5bd9f3194 100644 --- a/compiler-rt/lib/sanitizer_common/tests/sanitizer_procmaps_test.cpp +++ b/compiler-rt/lib/sanitizer_common/tests/sanitizer_procmaps_test.cpp @@ -70,7 +70,7 @@ TEST(MemoryMapping, LoadedModuleArchAndUUID) { EXPECT_EQ(arch, kModuleArchI386); } else if (SANITIZER_WORDSIZE == 64) { EXPECT_TRUE(arch == kModuleArchX86_64 || arch == kModuleArchX86_64H || - arch == kModuleArchARM64); + arch == kModuleArchARM64 || arch == kModuleArchARM64E); } const u8 *uuid = modules[i].uuid(); u8 null_uuid[kModuleUUIDSize] = {0}; From ac5b6151976c70c8b676d3bc6ff82895fe0e1d01 Mon Sep 17 00:00:00 2001 From: yonghong-song <yhs@fb.com> Date: Tue, 4 Nov 2025 15:15:33 -0800 Subject: [PATCH 243/313] [BPF] Remove dead code related to __bpf_trap global var (#166440) In [1], the symbol __bpf_trap (macro BPF_TRAP) is removed if it is not used in the code. In the discussion in [1], it is found that the branch "if (Op.isSymbol())" is actually always false. Remove it to avoid confusion. [1] https://github.com/llvm/llvm-project/pull/166003 --- llvm/lib/Target/BPF/BPFAsmPrinter.cpp | 4 ---- 1 file changed, 4 deletions(-) diff --git a/llvm/lib/Target/BPF/BPFAsmPrinter.cpp b/llvm/lib/Target/BPF/BPFAsmPrinter.cpp index 378a72ab27dd5..abe081c0c76fd 100644 --- a/llvm/lib/Target/BPF/BPFAsmPrinter.cpp +++ b/llvm/lib/Target/BPF/BPFAsmPrinter.cpp @@ -176,10 +176,6 @@ void BPFAsmPrinter::emitInstruction(const MachineInstr *MI) { if (const GlobalValue *GV = Op.getGlobal()) if (GV->getName() == BPF_TRAP) SawTrapCall = true; - } else if (Op.isSymbol()) { - if (const MCSymbol *Sym = Op.getMCSymbol()) - if (Sym->getName() == BPF_TRAP) - SawTrapCall = true; } } } From 4cdeb7d292bf28d2be22eb2e3870d0389dc6c860 Mon Sep 17 00:00:00 2001 From: "A. Jiang" <de34@live.cn> Date: Wed, 5 Nov 2025 07:24:59 +0800 Subject: [PATCH 244/313] [libc++] Remove guards for builtins for reference binding traits (#166288) We're only supporting Clang 20+ and Apple Clang 17 now, where these builtins are universally implemented. 
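For illustration (not part of the diff below), here is what the now-unconditional C++23 traits report:

```c++
#include <type_traits>

// Constructing const int& from a long materializes a temporary int: true.
static_assert(std::reference_constructs_from_temporary_v<const int&, long>);
// Binding const int& to an int lvalue binds directly to it: false.
static_assert(!std::reference_constructs_from_temporary_v<const int&, int&>);
```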
--- .../reference_constructs_from_temporary.h | 2 +- .../__type_traits/reference_converts_from_temporary.h | 2 +- libcxx/include/tuple | 2 +- .../libcxx/type_traits/no_specializations.verify.cpp | 10 +++------- .../reference_constructs_from_temporary.pass.cpp | 3 --- .../reference_converts_from_temporary.pass.cpp | 3 --- .../tuple.tuple/tuple.apply/make_from_tuple.verify.cpp | 4 ---- 7 files changed, 6 insertions(+), 20 deletions(-) diff --git a/libcxx/include/__type_traits/reference_constructs_from_temporary.h b/libcxx/include/__type_traits/reference_constructs_from_temporary.h index 3d097ce90cb09..a8325620414ea 100644 --- a/libcxx/include/__type_traits/reference_constructs_from_temporary.h +++ b/libcxx/include/__type_traits/reference_constructs_from_temporary.h @@ -18,7 +18,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD -#if _LIBCPP_STD_VER >= 23 && __has_builtin(__reference_constructs_from_temporary) +#if _LIBCPP_STD_VER >= 23 template <class _Tp, class _Up> struct _LIBCPP_NO_SPECIALIZATIONS reference_constructs_from_temporary diff --git a/libcxx/include/__type_traits/reference_converts_from_temporary.h b/libcxx/include/__type_traits/reference_converts_from_temporary.h index c68f1765af9d5..9c51225e53b8e 100644 --- a/libcxx/include/__type_traits/reference_converts_from_temporary.h +++ b/libcxx/include/__type_traits/reference_converts_from_temporary.h @@ -18,7 +18,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD -#if _LIBCPP_STD_VER >= 23 && __has_builtin(__reference_converts_from_temporary) +#if _LIBCPP_STD_VER >= 23 template <class _Tp, class _Up> struct _LIBCPP_NO_SPECIALIZATIONS reference_converts_from_temporary diff --git a/libcxx/include/tuple b/libcxx/include/tuple index 466f501b5f4f8..3c5330dd6e14e 100644 --- a/libcxx/include/tuple +++ b/libcxx/include/tuple @@ -1443,7 +1443,7 @@ template <class _Tp, class _Tuple, class = enable_if_t<__can_make_from_tuple<_Tp inline _LIBCPP_HIDE_FROM_ABI constexpr _Tp make_from_tuple(_Tuple&& __t) noexcept(noexcept(std::__make_from_tuple_impl<_Tp>(std::forward<_Tuple>(__t), make_index_sequence<tuple_size_v<remove_reference_t<_Tuple>>>()))) { -#if _LIBCPP_STD_VER >= 23 && __has_builtin(__reference_constructs_from_temporary) +#if _LIBCPP_STD_VER >= 23 if constexpr (tuple_size_v<remove_reference_t<_Tuple>> == 1) { static_assert(!std::reference_constructs_from_temporary_v<_Tp, decltype(std::get<0>(std::declval<_Tuple>()))>, "Attempted construction of reference element binds to a temporary whose lifetime has ended"); diff --git a/libcxx/test/libcxx/type_traits/no_specializations.verify.cpp b/libcxx/test/libcxx/type_traits/no_specializations.verify.cpp index 897ae89365014..3fac952b9eb98 100644 --- a/libcxx/test/libcxx/type_traits/no_specializations.verify.cpp +++ b/libcxx/test/libcxx/type_traits/no_specializations.verify.cpp @@ -154,14 +154,10 @@ SPECIALIZE_UTT(is_unbounded_array); // expected-error 2 {{cannot be speciali # endif # if TEST_STD_VER >= 23 -SPECIALIZE_UTT(is_implicit_lifetime); // expected-error 2 {{cannot be specialized}} -SPECIALIZE_UTT(is_scoped_enum); // expected-error 2 {{cannot be specialized}} -# if __has_builtin(__reference_constructs_from_temporary) +SPECIALIZE_UTT(is_implicit_lifetime); // expected-error 2 {{cannot be specialized}} +SPECIALIZE_UTT(is_scoped_enum); // expected-error 2 {{cannot be specialized}} SPECIALIZE_BTT(reference_constructs_from_temporary); // expected-error 2 {{cannot be specialized}} -# endif -# if __has_builtin(__reference_converts_from_temporary) -SPECIALIZE_BTT(reference_converts_from_temporary); // expected-error 2 {{cannot be 
specialized}} -# endif +SPECIALIZE_BTT(reference_converts_from_temporary); // expected-error 2 {{cannot be specialized}} # endif # if TEST_STD_VER >= 26 diff --git a/libcxx/test/std/utilities/meta/meta.unary/meta.unary.prop/reference_constructs_from_temporary.pass.cpp b/libcxx/test/std/utilities/meta/meta.unary/meta.unary.prop/reference_constructs_from_temporary.pass.cpp index ad53c8176cc92..84fe7cfb02208 100644 --- a/libcxx/test/std/utilities/meta/meta.unary/meta.unary.prop/reference_constructs_from_temporary.pass.cpp +++ b/libcxx/test/std/utilities/meta/meta.unary/meta.unary.prop/reference_constructs_from_temporary.pass.cpp @@ -8,9 +8,6 @@ // REQUIRES: std-at-least-c++23 -// These compilers don't support std::reference_converts_from_temporary yet. -// UNSUPPORTED: apple-clang-16, clang-19.1 - // <type_traits> // template<class T, class U> struct reference_constructs_from_temporary; diff --git a/libcxx/test/std/utilities/meta/meta.unary/meta.unary.prop/reference_converts_from_temporary.pass.cpp b/libcxx/test/std/utilities/meta/meta.unary/meta.unary.prop/reference_converts_from_temporary.pass.cpp index 73cc4f3e29d5a..8319d9e1563fe 100644 --- a/libcxx/test/std/utilities/meta/meta.unary/meta.unary.prop/reference_converts_from_temporary.pass.cpp +++ b/libcxx/test/std/utilities/meta/meta.unary/meta.unary.prop/reference_converts_from_temporary.pass.cpp @@ -8,9 +8,6 @@ // REQUIRES: std-at-least-c++23 -// These compilers don't support std::reference_converts_from_temporary yet. -// UNSUPPORTED: apple-clang-16, clang-19.1 - // <type_traits> // template<class T, class U> struct reference_converts_from_temporary; diff --git a/libcxx/test/std/utilities/tuple/tuple.tuple/tuple.apply/make_from_tuple.verify.cpp b/libcxx/test/std/utilities/tuple/tuple.tuple/tuple.apply/make_from_tuple.verify.cpp index 2dfbae9138864..12d778408d5ec 100644 --- a/libcxx/test/std/utilities/tuple/tuple.tuple/tuple.apply/make_from_tuple.verify.cpp +++ b/libcxx/test/std/utilities/tuple/tuple.tuple/tuple.apply/make_from_tuple.verify.cpp @@ -19,11 +19,7 @@ #include "test_macros.h" void test() { - // FreeBSD ci use clang 19.1.1, which hasn't implement __reference_constructs_from_temporary. - // The static_assert inner std::make_from_tuple will not triggered. -#if __has_builtin(__reference_constructs_from_temporary) // expected-error@*:* {{static assertion failed}} -#endif // Turns to an error since C++26 (Disallow Binding a Returned Glvalue to a Temporary https://wg21.link/P2748R5). #if TEST_STD_VER >= 26 From dc3b5141cdf704bec28edeed78d6d72ebe0444ae Mon Sep 17 00:00:00 2001 From: Paul Kirth <paulkirth@google.com> Date: Tue, 4 Nov 2025 15:56:00 -0800 Subject: [PATCH 245/313] [llvm][mustache] Optimize accessor splitting with a single pass (#159198) The splitMustacheString function previously used a loop of StringRef::split and StringRef::trim. This was inefficient as it scanned each segment of the accessor string multiple times. This change introduces a custom splitAndTrim function that performs both operations in a single pass over the string, reducing redundant work and improving performance, most notably in the number of CPU cycles executed. 
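For illustration, a usage sketch of the new single-pass helper added below (`splitAndTrim` is file-local to Mustache.cpp; the accessor string here is made up):

```c++
llvm::SmallVector<llvm::StringRef, 4> Toks;
splitAndTrim("outer. inner .leaf", Toks); // hypothetical accessor input
// Toks == {"outer", "inner", "leaf"}; whitespace-only segments are skipped
// rather than pushed as empty tokens.
```

Benchmark results for the change: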
| Metric | Baseline | Optimized | Change |
| --- | --- | --- | --- |
| Time (ms) | 35.57 | 35.36 | -0.59% |
| Cycles | 34.91M | 34.26M | -1.86% |
| Instructions | 85.54M | 85.24M | -0.35% |
| Branch Misses | 111.9K | 112.2K | +0.27% |
| Cache Misses | 242.1K | 239.9K | -0.91% |
---
 llvm/lib/Support/Mustache.cpp | 33 ++++++++++++++++++++++++++-------
 1 file changed, 26 insertions(+), 7 deletions(-)

diff --git a/llvm/lib/Support/Mustache.cpp b/llvm/lib/Support/Mustache.cpp
index 012e1ffd534d2..9eb1ec2b8425c 100644
--- a/llvm/lib/Support/Mustache.cpp
+++ b/llvm/lib/Support/Mustache.cpp
@@ -34,6 +34,31 @@ static bool isContextFalsey(const json::Value *V) {
   return isFalsey(*V);
 }
 
+static void splitAndTrim(StringRef Str, SmallVectorImpl<StringRef> &Tokens) {
+  size_t CurrentPos = 0;
+  while (CurrentPos < Str.size()) {
+    // Find the next delimiter.
+    size_t DelimiterPos = Str.find('.', CurrentPos);
+
+    // If no delimiter is found, process the rest of the string.
+    if (DelimiterPos == StringRef::npos)
+      DelimiterPos = Str.size();
+
+    // Get the current part, which may have whitespace.
+    StringRef Part = Str.slice(CurrentPos, DelimiterPos);
+
+    // Manually trim the part without creating a new string object.
+    size_t Start = Part.find_first_not_of(" \t\r\n");
+    if (Start != StringRef::npos) {
+      size_t End = Part.find_last_not_of(" \t\r\n");
+      Tokens.push_back(Part.slice(Start, End + 1));
+    }
+
+    // Move past the delimiter for the next iteration.
+    CurrentPos = DelimiterPos + 1;
+  }
+}
+
 static Accessor splitMustacheString(StringRef Str, MustacheContext &Ctx) {
   // We split the mustache string into an accessor.
   // For example:
@@ -46,13 +71,7 @@ static Accessor splitMustacheString(StringRef Str, MustacheContext &Ctx) {
     // It's a literal, so it doesn't need to be saved.
     Tokens.push_back(".");
   } else {
-    while (!Str.empty()) {
-      StringRef Part;
-      std::tie(Part, Str) = Str.split('.');
-      // Each part of the accessor needs to be saved to the arena
-      // to ensure it has a stable address.
-      Tokens.push_back(Part.trim());
-    }
+    splitAndTrim(Str, Tokens);
   }
 
   // Now, allocate memory for the array of StringRefs in the arena.
StringRef *ArenaTokens = Ctx.Allocator.Allocate<StringRef>(Tokens.size()); From 2b4ac6629782fc527eb8ef232f6d14d48186a7f4 Mon Sep 17 00:00:00 2001 From: Matt Arsenault <Matthew.Arsenault@amd.com> Date: Tue, 4 Nov 2025 15:57:39 -0800 Subject: [PATCH 246/313] AMDGPU: Cleanup and modernize limit-coalesce.mir test (#166465) --- llvm/test/CodeGen/AMDGPU/limit-coalesce.mir | 63 ++++++--------------- 1 file changed, 18 insertions(+), 45 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/limit-coalesce.mir b/llvm/test/CodeGen/AMDGPU/limit-coalesce.mir index fa52b96e9ea95..02eda2c4822c2 100644 --- a/llvm/test/CodeGen/AMDGPU/limit-coalesce.mir +++ b/llvm/test/CodeGen/AMDGPU/limit-coalesce.mir @@ -6,40 +6,12 @@ # No more registers shall be defined --- name: main -alignment: 1 -exposesReturnsTwice: false -legalized: false -regBankSelected: false -selected: false tracksRegLiveness: true registers: - - { id: 1, class: sreg_32_xm0, preferred-register: '%1' } - - { id: 2, class: vreg_64, preferred-register: '%2' } - - { id: 3, class: vreg_64 } - - { id: 4, class: vreg_64 } - - { id: 5, class: vreg_64 } - - { id: 6, class: vreg_96 } - - { id: 7, class: vreg_96 } - - { id: 8, class: vreg_128 } - - { id: 9, class: vreg_128 } -liveins: - - { reg: '$sgpr6', virtual-reg: '%1' } -frameInfo: - isFrameAddressTaken: false - isReturnAddressTaken: false - hasStackMap: false - hasPatchPoint: false - stackSize: 0 - offsetAdjustment: 0 - maxAlignment: 0 - adjustsStack: false - hasCalls: false - maxCallFrameSize: 0 - hasOpaqueSPAdjustment: false - hasVAStart: false - hasMustTailInVarArgFunc: false + - { id: 0, class: sreg_32_xm0, preferred-register: '%0' } + - { id: 1, class: vreg_64, preferred-register: '%1' } body: | - bb.0.entry: + bb.0: liveins: $sgpr0, $vgpr0_vgpr1 ; CHECK-LABEL: name: main @@ -59,20 +31,21 @@ body: | ; CHECK-NEXT: undef [[COPY3:%[0-9]+]].sub0_sub1_sub2:vreg_128 = COPY [[DEF2]] ; CHECK-NEXT: [[COPY3:%[0-9]+]].sub3:vreg_128 = COPY [[DEF]].sub0 ; CHECK-NEXT: FLAT_STORE_DWORDX4 $vgpr0_vgpr1, [[COPY3]], 0, 0, implicit $exec, implicit $flat_scr - %3 = IMPLICIT_DEF - undef %4.sub0 = COPY $sgpr0 - %4.sub1 = COPY %3.sub0 - undef %5.sub0 = COPY %4.sub1 - %5.sub1 = COPY %4.sub0 - FLAT_STORE_DWORDX2 $vgpr0_vgpr1, killed %5, 0, 0, implicit $exec, implicit $flat_scr + %2:vreg_64 = IMPLICIT_DEF + undef %3.sub0:vreg_64 = COPY $sgpr0 + %3.sub1:vreg_64 = COPY %2.sub0 + undef %4.sub0:vreg_64 = COPY %3.sub1 + %4.sub1:vreg_64 = COPY %3.sub0 + FLAT_STORE_DWORDX2 $vgpr0_vgpr1, killed %4, 0, 0, implicit $exec, implicit $flat_scr - %6 = IMPLICIT_DEF - undef %7.sub0_sub1 = COPY %6 - %7.sub2 = COPY %3.sub0 - FLAT_STORE_DWORDX3 $vgpr0_vgpr1, killed %7, 0, 0, implicit $exec, implicit $flat_scr + %5:vreg_96 = IMPLICIT_DEF + undef %6.sub0_sub1:vreg_96 = COPY %5 + %6.sub2:vreg_96 = COPY %2.sub0 + FLAT_STORE_DWORDX3 $vgpr0_vgpr1, killed %6, 0, 0, implicit $exec, implicit $flat_scr + + %7:vreg_128 = IMPLICIT_DEF + undef %8.sub0_sub1_sub2:vreg_128 = COPY %7 + %8.sub3:vreg_128 = COPY %2.sub0 + FLAT_STORE_DWORDX4 $vgpr0_vgpr1, killed %8, 0, 0, implicit $exec, implicit $flat_scr - %8 = IMPLICIT_DEF - undef %9.sub0_sub1_sub2 = COPY %8 - %9.sub3 = COPY %3.sub0 - FLAT_STORE_DWORDX4 $vgpr0_vgpr1, killed %9, 0, 0, implicit $exec, implicit $flat_scr ... 
From 0fd029858a819dfbc5e4ec18d456d055359b0317 Mon Sep 17 00:00:00 2001 From: Shubh Pachchigar <33875085+shubhe25p@users.noreply.github.com> Date: Tue, 4 Nov 2025 16:18:53 -0800 Subject: [PATCH 247/313] [libc] Add chown and getgid implementations (#166434) Implements chown and getgid per the POSIX specification and adds corresponding unit tests. getgid is added as it is required by the chown unit tests. This PR will address #165785 Co-authored-by: shubh@DOE <shubhp@mbm3a24.local> --- libc/config/linux/x86_64/entrypoints.txt | 2 + libc/hdr/types/CMakeLists.txt | 8 ++++ libc/hdr/types/gid_t.h | 22 ++++++++++ libc/include/unistd.yaml | 15 +++++++ libc/src/unistd/CMakeLists.txt | 14 +++++++ libc/src/unistd/chown.h | 22 ++++++++++ libc/src/unistd/getgid.h | 22 ++++++++++ libc/src/unistd/linux/CMakeLists.txt | 28 +++++++++++++ libc/src/unistd/linux/chown.cpp | 29 ++++++++++++++ libc/src/unistd/linux/getgid.cpp | 23 +++++++++++ libc/test/src/unistd/CMakeLists.txt | 30 ++++++++++++++ libc/test/src/unistd/chown_test.cpp | 51 ++++++++++++++++++++++++ libc/test/src/unistd/getgid_test.cpp | 15 +++++++ 13 files changed, 281 insertions(+) create mode 100644 libc/hdr/types/gid_t.h create mode 100644 libc/src/unistd/chown.h create mode 100644 libc/src/unistd/getgid.h create mode 100644 libc/src/unistd/linux/chown.cpp create mode 100644 libc/src/unistd/linux/getgid.cpp create mode 100644 libc/test/src/unistd/chown_test.cpp create mode 100644 libc/test/src/unistd/getgid_test.cpp diff --git a/libc/config/linux/x86_64/entrypoints.txt b/libc/config/linux/x86_64/entrypoints.txt index 7a8d74a4e5da9..a44e2041e57f2 100644 --- a/libc/config/linux/x86_64/entrypoints.txt +++ b/libc/config/linux/x86_64/entrypoints.txt @@ -326,6 +326,7 @@ set(TARGET_LIBC_ENTRYPOINTS # unistd.h entrypoints libc.src.unistd.access libc.src.unistd.chdir + libc.src.unistd.chown libc.src.unistd.close libc.src.unistd.dup libc.src.unistd.dup2 @@ -344,6 +345,7 @@ set(TARGET_LIBC_ENTRYPOINTS libc.src.unistd.getppid libc.src.unistd.getsid libc.src.unistd.gettid + libc.src.unistd.getgid libc.src.unistd.getuid libc.src.unistd.isatty libc.src.unistd.link diff --git a/libc/hdr/types/CMakeLists.txt b/libc/hdr/types/CMakeLists.txt index 225843924c243..433c47b174766 100644 --- a/libc/hdr/types/CMakeLists.txt +++ b/libc/hdr/types/CMakeLists.txt @@ -479,3 +479,11 @@ add_proxy_header_library( libc.include.llvm-libc-types.struct_rlimit libc.include.sys_resource ) + +add_proxy_header_library( + gid_t + HDRS + gid_t.h + FULL_BUILD_DEPENDS + libc.include.llvm-libc-types.gid_t +) diff --git a/libc/hdr/types/gid_t.h b/libc/hdr/types/gid_t.h new file mode 100644 index 0000000000000..bc274aaa9a8a8 --- /dev/null +++ b/libc/hdr/types/gid_t.h @@ -0,0 +1,22 @@ +//===-- Proxy for gid_t ---------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_HDR_TYPES_GID_T_H +#define LLVM_LIBC_HDR_TYPES_GID_T_H + +#ifdef LIBC_FULL_BUILD + +#include "include/llvm-libc-types/gid_t.h" + +#else // Overlay mode + +#include <sys/types.h> + +#endif // LLVM_LIBC_FULL_BUILD + +#endif // LLVM_LIBC_HDR_TYPES_GID_T_H diff --git a/libc/include/unistd.yaml b/libc/include/unistd.yaml index 2ff86eafaf550..0e5b22e627b67 100644 --- a/libc/include/unistd.yaml +++ b/libc/include/unistd.yaml @@ -3,6 +3,7 @@ header_template: unistd.h.def macros: [] types: - type_name: uid_t + - type_name: gid_t - type_name: ssize_t - type_name: size_t - type_name: pid_t @@ -54,6 +55,14 @@ functions: return_type: int arguments: - type: const char * + - name: chown + standards: + - POSIX + return_type: int + arguments: + - type: const char * + - type: uid_t + - type: gid_t - name: close standards: - POSIX @@ -195,6 +204,12 @@ functions: return_type: uid_t arguments: - type: void + - name: getgid + standards: + - POSIX + return_type: gid_t + arguments: + - type: void - name: isatty standards: - POSIX diff --git a/libc/src/unistd/CMakeLists.txt b/libc/src/unistd/CMakeLists.txt index 78c3bf8442fab..337480cbbf928 100644 --- a/libc/src/unistd/CMakeLists.txt +++ b/libc/src/unistd/CMakeLists.txt @@ -27,6 +27,13 @@ add_entrypoint_object( .${LIBC_TARGET_OS}.chdir ) +add_entrypoint_object( + chown + ALIAS + DEPENDS + .${LIBC_TARGET_OS}.chown +) + add_entrypoint_object( close ALIAS @@ -160,6 +167,13 @@ add_entrypoint_object( .${LIBC_TARGET_OS}.getuid ) +add_entrypoint_object( + getgid + ALIAS + DEPENDS + .${LIBC_TARGET_OS}.getgid +) + add_entrypoint_object( isatty ALIAS diff --git a/libc/src/unistd/chown.h b/libc/src/unistd/chown.h new file mode 100644 index 0000000000000..84a8eba2cb2e6 --- /dev/null +++ b/libc/src/unistd/chown.h @@ -0,0 +1,22 @@ +//===-- Implementation header for chown -------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_UNISTD_CHOWN_H +#define LLVM_LIBC_SRC_UNISTD_CHOWN_H + +#include "hdr/types/gid_t.h" +#include "hdr/types/uid_t.h" +#include "src/__support/macros/config.h" + +namespace LIBC_NAMESPACE_DECL { + +int chown(const char *path, uid_t owner, gid_t group); + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_UNISTD_CHOWN_H diff --git a/libc/src/unistd/getgid.h b/libc/src/unistd/getgid.h new file mode 100644 index 0000000000000..eed0b20d688b1 --- /dev/null +++ b/libc/src/unistd/getgid.h @@ -0,0 +1,22 @@ +//===-- Implementation header for getgid ------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_UNISTD_GETGID_H +#define LLVM_LIBC_SRC_UNISTD_GETGID_H + +#include "hdr/types/gid_t.h" +#include "hdr/unistd_macros.h" +#include "src/__support/macros/config.h" + +namespace LIBC_NAMESPACE_DECL { + +gid_t getgid(); + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_UNISTD_GETGID_H diff --git a/libc/src/unistd/linux/CMakeLists.txt b/libc/src/unistd/linux/CMakeLists.txt index 4eb3c7d3d7fae..c2dacc6456e27 100644 --- a/libc/src/unistd/linux/CMakeLists.txt +++ b/libc/src/unistd/linux/CMakeLists.txt @@ -25,6 +25,20 @@ add_entrypoint_object( libc.src.errno.errno ) +add_entrypoint_object( + chown + SRCS + chown.cpp + HDRS + ../chown.h + DEPENDS + libc.hdr.types.uid_t + libc.hdr.types.gid_t + libc.include.sys_syscall + libc.src.__support.OSUtil.osutil + libc.src.errno.errno +) + add_entrypoint_object( close SRCS @@ -276,6 +290,20 @@ add_entrypoint_object( libc.src.errno.errno ) +add_entrypoint_object( + getgid + SRCS + getgid.cpp + HDRS + ../getgid.h + DEPENDS + libc.hdr.types.gid_t + libc.hdr.fcntl_macros + libc.include.unistd + libc.include.sys_syscall + libc.src.__support.OSUtil.osutil +) + add_entrypoint_object( getuid SRCS diff --git a/libc/src/unistd/linux/chown.cpp b/libc/src/unistd/linux/chown.cpp new file mode 100644 index 0000000000000..c7bf1703ffe57 --- /dev/null +++ b/libc/src/unistd/linux/chown.cpp @@ -0,0 +1,29 @@ +//===-- Linux implementation of chown -------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/unistd/chown.h" + +#include "src/__support/OSUtil/syscall.h" // For internal syscall function. +#include "src/__support/common.h" + +#include "src/__support/libc_errno.h" +#include "src/__support/macros/config.h" +#include <sys/syscall.h> // For syscall numbers. + +namespace LIBC_NAMESPACE_DECL { + +LLVM_LIBC_FUNCTION(int, chown, (const char *path, uid_t owner, gid_t group)) { + int ret = LIBC_NAMESPACE::syscall_impl<int>(SYS_chown, path, owner, group); + if (ret < 0) { + libc_errno = -ret; + return -1; + } + return 0; +} + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/unistd/linux/getgid.cpp b/libc/src/unistd/linux/getgid.cpp new file mode 100644 index 0000000000000..1656fd601d843 --- /dev/null +++ b/libc/src/unistd/linux/getgid.cpp @@ -0,0 +1,23 @@ +//===-- Linux implementation of getgid ------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/unistd/getgid.h" + +#include "src/__support/OSUtil/syscall.h" // For internal syscall function. +#include "src/__support/common.h" +#include "src/__support/macros/config.h" + +#include <sys/syscall.h> // For syscall numbers. 
+ +namespace LIBC_NAMESPACE_DECL { + +LLVM_LIBC_FUNCTION(gid_t, getgid, ()) { + return LIBC_NAMESPACE::syscall_impl<gid_t>(SYS_getgid); +} + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/test/src/unistd/CMakeLists.txt b/libc/test/src/unistd/CMakeLists.txt index 44f28fff9ad39..07070535459ec 100644 --- a/libc/test/src/unistd/CMakeLists.txt +++ b/libc/test/src/unistd/CMakeLists.txt @@ -36,6 +36,26 @@ add_libc_unittest( libc.test.UnitTest.ErrnoSetterMatcher ) +add_libc_unittest( + chown_test + SUITE + libc_unistd_unittests + SRCS + chown_test.cpp + DEPENDS + libc.hdr.fcntl_macros + libc.include.unistd + libc.src.errno.errno + libc.src.unistd.chown + libc.src.unistd.close + libc.src.unistd.unlink + libc.src.fcntl.open + libc.src.unistd.getuid + libc.src.unistd.getgid + libc.test.UnitTest.ErrnoCheckingTest + libc.test.UnitTest.ErrnoSetterMatcher +) + add_libc_unittest( dup_test SUITE @@ -437,6 +457,16 @@ add_libc_unittest( libc.test.UnitTest.ErrnoCheckingTest ) +add_libc_unittest( + getgid_test + SUITE + libc_unistd_unittests + SRCS + getgid_test.cpp + DEPENDS + libc.src.unistd.getgid +) + add_libc_unittest( getpid_test SUITE diff --git a/libc/test/src/unistd/chown_test.cpp b/libc/test/src/unistd/chown_test.cpp new file mode 100644 index 0000000000000..8b1f783273624 --- /dev/null +++ b/libc/test/src/unistd/chown_test.cpp @@ -0,0 +1,51 @@ +//===-- Unittests for chown -----------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/fcntl/open.h" +#include "src/unistd/chown.h" +#include "src/unistd/close.h" +#include "src/unistd/getgid.h" +#include "src/unistd/getuid.h" +#include "src/unistd/unlink.h" + +#include "test/UnitTest/ErrnoCheckingTest.h" +#include "test/UnitTest/ErrnoSetterMatcher.h" +#include "test/UnitTest/Test.h" + +#include "hdr/fcntl_macros.h" +#include <sys/stat.h> + +using LlvmLibcChownTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest; + +TEST_F(LlvmLibcChownTest, ChownSuccess) { + using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds; + uid_t my_uid = LIBC_NAMESPACE::getuid(); + gid_t my_gid = LIBC_NAMESPACE::getgid(); + constexpr const char *FILENAME = "chown.test"; + auto TEST_FILE = libc_make_test_file_path(FILENAME); + + // Create a test file. + int write_fd = LIBC_NAMESPACE::open(TEST_FILE, O_WRONLY | O_CREAT, S_IRWXU); + ASSERT_ERRNO_SUCCESS(); + ASSERT_GT(write_fd, 0); + + // Change the ownership of the file. + ASSERT_THAT(LIBC_NAMESPACE::chown(TEST_FILE, my_uid, my_gid), Succeeds(0)); + + // Close the file descriptor. + ASSERT_THAT(LIBC_NAMESPACE::close(write_fd), Succeeds(0)); + + // Clean up the test file. + ASSERT_THAT(LIBC_NAMESPACE::unlink(TEST_FILE), Succeeds(0)); +} + +TEST_F(LlvmLibcChownTest, ChownNonExistentFile) { + using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Fails; + ASSERT_THAT(LIBC_NAMESPACE::chown("non-existent-file", 1000, 1000), + Fails(ENOENT)); +} diff --git a/libc/test/src/unistd/getgid_test.cpp b/libc/test/src/unistd/getgid_test.cpp new file mode 100644 index 0000000000000..77dbad2f18e00 --- /dev/null +++ b/libc/test/src/unistd/getgid_test.cpp @@ -0,0 +1,15 @@ +//===-- Unittests for getgid ----------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/unistd/getgid.h"
+#include "test/UnitTest/Test.h"
+
+TEST(LlvmLibcGetGidTest, SmokeTest) {
+  // getgid always succeeds. So, we just call it as a smoke test.
+  LIBC_NAMESPACE::getgid();
+}
From 8648beff758db89e2eb816bd01d9b3c37e6aa3f9 Mon Sep 17 00:00:00 2001
From: Wenju He <wenju.he@intel.com>
Date: Wed, 5 Nov 2025 08:20:29 +0800
Subject: [PATCH 248/313] [NFC][libclc] Rename clc_cbrt.inc to clc_cbrt.h
 (#166330)

__clc_cbrt functions are declared in clc_cbrt.inc. Rename to .h for
consistency with other headers.
---
 libclc/clc/include/clc/math/{clc_cbrt.inc => clc_cbrt.h} | 0
 libclc/clc/lib/generic/math/clc_cbrt.cl                  | 1 +
 libclc/opencl/lib/generic/math/cbrt.cl                   | 2 +-
 3 files changed, 2 insertions(+), 1 deletion(-)
 rename libclc/clc/include/clc/math/{clc_cbrt.inc => clc_cbrt.h} (100%)

diff --git a/libclc/clc/include/clc/math/clc_cbrt.inc b/libclc/clc/include/clc/math/clc_cbrt.h
similarity index 100%
rename from libclc/clc/include/clc/math/clc_cbrt.inc
rename to libclc/clc/include/clc/math/clc_cbrt.h
diff --git a/libclc/clc/lib/generic/math/clc_cbrt.cl b/libclc/clc/lib/generic/math/clc_cbrt.cl
index 105f6329d5bad..935b7b7eae78c 100644
--- a/libclc/clc/lib/generic/math/clc_cbrt.cl
+++ b/libclc/clc/lib/generic/math/clc_cbrt.cl
@@ -8,6 +8,7 @@
 #include <clc/clc_convert.h>
 #include <clc/internal/clc.h>
+#include <clc/math/clc_cbrt.h>
 #include <clc/math/clc_copysign.h>
 #include <clc/math/clc_fabs.h>
 #include <clc/math/clc_fma.h>
diff --git a/libclc/opencl/lib/generic/math/cbrt.cl b/libclc/opencl/lib/generic/math/cbrt.cl
index 0d670150ed4c9..7de61436522b3 100644
--- a/libclc/opencl/lib/generic/math/cbrt.cl
+++ b/libclc/opencl/lib/generic/math/cbrt.cl
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include <clc/math/clc_cbrt.inc>
+#include <clc/math/clc_cbrt.h>
 #include <clc/opencl/math/cbrt.h>
 
 #define __CLC_FUNCTION cbrt
From d8e56988f87ecdfa0b9bfdb269f6d904840cc6dd Mon Sep 17 00:00:00 2001
From: Marcell Leleszi <59964679+mleleszi@users.noreply.github.com>
Date: Wed, 5 Nov 2025 01:20:51 +0100
Subject: [PATCH 249/313] [libc] Add printf error handling (with fixes)
 (#166382)

https://github.com/llvm/llvm-project/issues/159474

Resubmitting https://github.com/llvm/llvm-project/pull/162876 with fixes
as it broke some buildbots:
- Fix comparisons of integer expressions of different signedness
- Don't check for specific errnos in tests, as they might not be
  available on all platforms
---
 libc/src/stdio/CMakeLists.txt                 | 24 +++++++++
 libc/src/stdio/asprintf.cpp                   | 18 ++++++-
 libc/src/stdio/baremetal/CMakeLists.txt       |  8 +++
 libc/src/stdio/baremetal/printf.cpp           | 23 ++++++--
 libc/src/stdio/baremetal/vprintf.cpp          | 23 ++++++--
 libc/src/stdio/generic/CMakeLists.txt         |  4 ++
 libc/src/stdio/generic/fprintf.cpp            | 17 +++++-
 libc/src/stdio/generic/printf.cpp             | 17 +++++-
 libc/src/stdio/generic/vfprintf.cpp           | 17 +++++-
 libc/src/stdio/generic/vprintf.cpp            | 17 +++++-
 libc/src/stdio/printf_core/CMakeLists.txt     | 25 +++++++++
 libc/src/stdio/printf_core/core_structs.h     | 19 ++++---
 libc/src/stdio/printf_core/error_mapper.h     | 21 ++++++++
 .../stdio/printf_core/generic/CMakeLists.txt  |  8 +++
 .../stdio/printf_core/generic/error_mapper.h  | 49 +++++++++++++++++
 .../stdio/printf_core/linux/CMakeLists.txt    |  8 +++
 .../stdio/printf_core/linux/error_mapper.h    | 54
+++++++++++++++++++ libc/src/stdio/printf_core/printf_main.h | 9 ++-- .../stdio/printf_core/vasprintf_internal.h | 20 +++---- .../src/stdio/printf_core/vfprintf_internal.h | 41 +++++++++----- .../stdio/printf_core/write_int_converter.h | 4 +- libc/src/stdio/printf_core/writer.h | 8 +-- libc/src/stdio/snprintf.cpp | 19 ++++++- libc/src/stdio/sprintf.cpp | 18 ++++++- libc/src/stdio/vasprintf.cpp | 16 +++++- libc/src/stdio/vsnprintf.cpp | 19 ++++++- libc/src/stdio/vsprintf.cpp | 17 +++++- libc/src/stdlib/CMakeLists.txt | 6 +++ libc/src/stdlib/strfromd.cpp | 11 +++- libc/src/stdlib/strfromf.cpp | 11 +++- libc/src/stdlib/strfroml.cpp | 11 +++- libc/src/time/strftime_core/strftime_main.h | 3 +- libc/test/src/stdio/CMakeLists.txt | 2 + libc/test/src/stdio/fprintf_test.cpp | 24 +++++++++ .../src/stdio/printf_core/converter_test.cpp | 30 +++++------ .../src/stdio/printf_core/writer_test.cpp | 32 +++++------ libc/test/src/stdio/snprintf_test.cpp | 15 ++++++ libc/test/src/stdio/vfprintf_test.cpp | 5 ++ libc/test/src/stdlib/StrfromTest.h | 19 ++++++- 39 files changed, 588 insertions(+), 104 deletions(-) create mode 100644 libc/src/stdio/printf_core/error_mapper.h create mode 100644 libc/src/stdio/printf_core/generic/CMakeLists.txt create mode 100644 libc/src/stdio/printf_core/generic/error_mapper.h create mode 100644 libc/src/stdio/printf_core/linux/CMakeLists.txt create mode 100644 libc/src/stdio/printf_core/linux/error_mapper.h diff --git a/libc/src/stdio/CMakeLists.txt b/libc/src/stdio/CMakeLists.txt index b0a6ef1e291b5..c75c8b11be2b5 100644 --- a/libc/src/stdio/CMakeLists.txt +++ b/libc/src/stdio/CMakeLists.txt @@ -125,6 +125,10 @@ add_entrypoint_object( DEPENDS libc.src.stdio.printf_core.printf_main libc.src.stdio.printf_core.writer + libc.src.stdio.printf_core.core_structs + libc.src.stdio.printf_core.error_mapper + libc.src.__support.libc_errno + libc.src.__support.CPP.limits ) add_entrypoint_object( @@ -136,6 +140,10 @@ add_entrypoint_object( DEPENDS libc.src.stdio.printf_core.printf_main libc.src.stdio.printf_core.writer + libc.src.stdio.printf_core.core_structs + libc.src.stdio.printf_core.error_mapper + libc.src.__support.libc_errno + libc.src.__support.CPP.limits ) add_entrypoint_object( @@ -146,6 +154,10 @@ add_entrypoint_object( asprintf.h DEPENDS libc.src.stdio.printf_core.vasprintf_internal + libc.src.stdio.printf_core.core_structs + libc.src.stdio.printf_core.error_mapper + libc.src.__support.libc_errno + libc.src.__support.CPP.limits ) add_entrypoint_object( @@ -157,6 +169,10 @@ add_entrypoint_object( DEPENDS libc.src.stdio.printf_core.printf_main libc.src.stdio.printf_core.writer + libc.src.stdio.printf_core.core_structs + libc.src.stdio.printf_core.error_mapper + libc.src.__support.libc_errno + libc.src.__support.CPP.limits ) add_entrypoint_object( @@ -168,6 +184,10 @@ add_entrypoint_object( DEPENDS libc.src.stdio.printf_core.printf_main libc.src.stdio.printf_core.writer + libc.src.stdio.printf_core.core_structs + libc.src.stdio.printf_core.error_mapper + libc.src.__support.libc_errno + libc.src.__support.CPP.limits ) add_entrypoint_object( @@ -178,6 +198,10 @@ add_entrypoint_object( vasprintf.h DEPENDS libc.src.stdio.printf_core.vasprintf_internal + libc.src.stdio.printf_core.core_structs + libc.src.stdio.printf_core.error_mapper + libc.src.__support.libc_errno + libc.src.__support.CPP.limits ) add_subdirectory(printf_core) diff --git a/libc/src/stdio/asprintf.cpp b/libc/src/stdio/asprintf.cpp index f8cfb74ce48ea..0991dfca6a059 100644 --- a/libc/src/stdio/asprintf.cpp +++ 
b/libc/src/stdio/asprintf.cpp @@ -7,8 +7,12 @@ //===----------------------------------------------------------------------===// #include "src/stdio/asprintf.h" +#include "src/__support/CPP/limits.h" #include "src/__support/arg_list.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" +#include "src/stdio/printf_core/core_structs.h" +#include "src/stdio/printf_core/error_mapper.h" #include "src/stdio/printf_core/vasprintf_internal.h" namespace LIBC_NAMESPACE_DECL { @@ -22,8 +26,18 @@ LLVM_LIBC_FUNCTION(int, asprintf, // and pointer semantics, as well as handling // destruction automatically. va_end(vlist); - int ret = printf_core::vasprintf_internal(buffer, format, args); - return ret; + auto ret_val = printf_core::vasprintf_internal(buffer, format, args); + if (!ret_val.has_value()) { + libc_errno = printf_core::internal_error_to_errno(ret_val.error()); + return -1; + } + if (ret_val.value() > static_cast<size_t>(cpp::numeric_limits<int>::max())) { + libc_errno = + printf_core::internal_error_to_errno(-printf_core::OVERFLOW_ERROR); + return -1; + } + + return static_cast<int>(ret_val.value()); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdio/baremetal/CMakeLists.txt b/libc/src/stdio/baremetal/CMakeLists.txt index 548938f885c94..bfeff0e2b5880 100644 --- a/libc/src/stdio/baremetal/CMakeLists.txt +++ b/libc/src/stdio/baremetal/CMakeLists.txt @@ -29,8 +29,12 @@ add_entrypoint_object( DEPENDS libc.src.stdio.printf_core.printf_main libc.src.stdio.printf_core.writer + libc.src.stdio.printf_core.error_mapper + libc.src.stdio.printf_core.core_structs libc.src.__support.arg_list libc.src.__support.OSUtil.osutil + libc.src.__support.libc_errno + libc.src.__support.CPP.limits ) add_entrypoint_object( @@ -87,8 +91,12 @@ add_entrypoint_object( DEPENDS libc.src.stdio.printf_core.printf_main libc.src.stdio.printf_core.writer + libc.src.stdio.printf_core.error_mapper + libc.src.stdio.printf_core.core_structs libc.src.__support.arg_list libc.src.__support.OSUtil.osutil + libc.src.__support.libc_errno + libc.src.__support.CPP.limits ) add_entrypoint_object( diff --git a/libc/src/stdio/baremetal/printf.cpp b/libc/src/stdio/baremetal/printf.cpp index 7253c6549a4e4..5a9b19ff20471 100644 --- a/libc/src/stdio/baremetal/printf.cpp +++ b/libc/src/stdio/baremetal/printf.cpp @@ -7,10 +7,13 @@ //===----------------------------------------------------------------------===// #include "src/stdio/printf.h" +#include "src/__support/CPP/limits.h" #include "src/__support/OSUtil/io.h" #include "src/__support/arg_list.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" #include "src/stdio/printf_core/core_structs.h" +#include "src/stdio/printf_core/error_mapper.h" #include "src/stdio/printf_core/printf_main.h" #include "src/stdio/printf_core/writer.h" @@ -42,13 +45,25 @@ LLVM_LIBC_FUNCTION(int, printf, (const char *__restrict format, ...)) { buffer, BUFF_SIZE, &stdout_write_hook, nullptr); printf_core::Writer<printf_core::WriteMode::FLUSH_TO_STREAM> writer(wb); - int retval = printf_core::printf_main(&writer, format, args); + auto retval = printf_core::printf_main(&writer, format, args); + if (!retval.has_value()) { + libc_errno = printf_core::internal_error_to_errno(retval.error()); + return -1; + } int flushval = wb.overflow_write(""); - if (flushval != printf_core::WRITE_OK) - retval = flushval; + if (flushval != printf_core::WRITE_OK) { + libc_errno = printf_core::internal_error_to_errno(-flushval); + return -1; + } - return retval; + if 
(retval.value() > static_cast<size_t>(cpp::numeric_limits<int>::max())) { + libc_errno = + printf_core::internal_error_to_errno(-printf_core::OVERFLOW_ERROR); + return -1; + } + + return static_cast<int>(retval.value()); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdio/baremetal/vprintf.cpp b/libc/src/stdio/baremetal/vprintf.cpp index ab02533f14911..c172b368d15f3 100644 --- a/libc/src/stdio/baremetal/vprintf.cpp +++ b/libc/src/stdio/baremetal/vprintf.cpp @@ -7,10 +7,13 @@ //===----------------------------------------------------------------------===// #include "src/stdio/vprintf.h" +#include "src/__support/CPP/limits.h" #include "src/__support/OSUtil/io.h" #include "src/__support/arg_list.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" #include "src/stdio/printf_core/core_structs.h" +#include "src/stdio/printf_core/error_mapper.h" #include "src/stdio/printf_core/printf_main.h" #include "src/stdio/printf_core/writer.h" @@ -40,13 +43,25 @@ LLVM_LIBC_FUNCTION(int, vprintf, buffer, BUFF_SIZE, &stdout_write_hook, nullptr); printf_core::Writer<printf_core::WriteMode::FLUSH_TO_STREAM> writer(wb); - int retval = printf_core::printf_main(&writer, format, args); + auto retval = printf_core::printf_main(&writer, format, args); + if (!retval.has_value()) { + libc_errno = printf_core::internal_error_to_errno(retval.error()); + return -1; + } int flushval = wb.overflow_write(""); - if (flushval != printf_core::WRITE_OK) - retval = flushval; + if (flushval != printf_core::WRITE_OK) { + libc_errno = printf_core::internal_error_to_errno(-flushval); + return -1; + } - return retval; + if (retval.value() > static_cast<size_t>(cpp::numeric_limits<int>::max())) { + libc_errno = + printf_core::internal_error_to_errno(-printf_core::OVERFLOW_ERROR); + return -1; + } + + return static_cast<int>(retval.value()); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdio/generic/CMakeLists.txt b/libc/src/stdio/generic/CMakeLists.txt index 6361822b61999..71055edea3d9e 100644 --- a/libc/src/stdio/generic/CMakeLists.txt +++ b/libc/src/stdio/generic/CMakeLists.txt @@ -393,7 +393,11 @@ add_generic_entrypoint_object( list(APPEND fprintf_deps libc.hdr.types.FILE libc.src.__support.arg_list + libc.src.__support.CPP.limits + libc.src.__support.libc_errno libc.src.stdio.printf_core.vfprintf_internal + libc.src.stdio.printf_core.core_structs + libc.src.stdio.printf_core.error_mapper ) if(LLVM_LIBC_FULL_BUILD) diff --git a/libc/src/stdio/generic/fprintf.cpp b/libc/src/stdio/generic/fprintf.cpp index 087aeadfc52c5..b2033901557a0 100644 --- a/libc/src/stdio/generic/fprintf.cpp +++ b/libc/src/stdio/generic/fprintf.cpp @@ -8,9 +8,12 @@ #include "src/stdio/fprintf.h" +#include "src/__support/CPP/limits.h" #include "src/__support/File/file.h" #include "src/__support/arg_list.h" #include "src/__support/macros/config.h" +#include "src/stdio/printf_core/core_structs.h" +#include "src/stdio/printf_core/error_mapper.h" #include "src/stdio/printf_core/vfprintf_internal.h" #include "hdr/types/FILE.h" @@ -27,8 +30,18 @@ LLVM_LIBC_FUNCTION(int, fprintf, // and pointer semantics, as well as handling // destruction automatically. 
va_end(vlist); - int ret_val = printf_core::vfprintf_internal(stream, format, args); - return ret_val; + auto ret_val = printf_core::vfprintf_internal(stream, format, args); + if (!ret_val.has_value()) { + libc_errno = printf_core::internal_error_to_errno(ret_val.error()); + return -1; + } + if (ret_val.value() > static_cast<size_t>(cpp::numeric_limits<int>::max())) { + libc_errno = + printf_core::internal_error_to_errno(-printf_core::OVERFLOW_ERROR); + return -1; + } + + return static_cast<int>(ret_val.value()); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdio/generic/printf.cpp b/libc/src/stdio/generic/printf.cpp index bb7c7c86f843f..8d159d5c70870 100644 --- a/libc/src/stdio/generic/printf.cpp +++ b/libc/src/stdio/generic/printf.cpp @@ -8,9 +8,12 @@ #include "src/stdio/printf.h" +#include "src/__support/CPP/limits.h" #include "src/__support/File/file.h" #include "src/__support/arg_list.h" #include "src/__support/macros/config.h" +#include "src/stdio/printf_core/core_structs.h" +#include "src/stdio/printf_core/error_mapper.h" #include "src/stdio/printf_core/vfprintf_internal.h" #include "hdr/types/FILE.h" @@ -31,9 +34,19 @@ LLVM_LIBC_FUNCTION(int, printf, (const char *__restrict format, ...)) { // and pointer semantics, as well as handling // destruction automatically. va_end(vlist); - int ret_val = printf_core::vfprintf_internal( + auto ret_val = printf_core::vfprintf_internal( reinterpret_cast<::FILE *>(PRINTF_STDOUT), format, args); - return ret_val; + if (!ret_val.has_value()) { + libc_errno = printf_core::internal_error_to_errno(ret_val.error()); + return -1; + } + if (ret_val.value() > static_cast<size_t>(cpp::numeric_limits<int>::max())) { + libc_errno = + printf_core::internal_error_to_errno(-printf_core::OVERFLOW_ERROR); + return -1; + } + + return static_cast<int>(ret_val.value()); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdio/generic/vfprintf.cpp b/libc/src/stdio/generic/vfprintf.cpp index 01f4265f118a6..a26f082ed9347 100644 --- a/libc/src/stdio/generic/vfprintf.cpp +++ b/libc/src/stdio/generic/vfprintf.cpp @@ -8,9 +8,12 @@ #include "src/stdio/vfprintf.h" +#include "src/__support/CPP/limits.h" #include "src/__support/File/file.h" #include "src/__support/arg_list.h" #include "src/__support/macros/config.h" +#include "src/stdio/printf_core/core_structs.h" +#include "src/stdio/printf_core/error_mapper.h" #include "src/stdio/printf_core/vfprintf_internal.h" #include "hdr/types/FILE.h" @@ -24,8 +27,18 @@ LLVM_LIBC_FUNCTION(int, vfprintf, internal::ArgList args(vlist); // This holder class allows for easier copying // and pointer semantics, as well as handling // destruction automatically. 
- int ret_val = printf_core::vfprintf_internal(stream, format, args); - return ret_val; + auto ret_val = printf_core::vfprintf_internal(stream, format, args); + if (!ret_val.has_value()) { + libc_errno = printf_core::internal_error_to_errno(ret_val.error()); + return -1; + } + if (ret_val.value() > static_cast<size_t>(cpp::numeric_limits<int>::max())) { + libc_errno = + printf_core::internal_error_to_errno(-printf_core::OVERFLOW_ERROR); + return -1; + } + + return static_cast<int>(ret_val.value()); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdio/generic/vprintf.cpp b/libc/src/stdio/generic/vprintf.cpp index 08d71515646ed..ae2160219f2bb 100644 --- a/libc/src/stdio/generic/vprintf.cpp +++ b/libc/src/stdio/generic/vprintf.cpp @@ -8,9 +8,12 @@ #include "src/stdio/vprintf.h" +#include "src/__support/CPP/limits.h" #include "src/__support/File/file.h" #include "src/__support/arg_list.h" #include "src/__support/macros/config.h" +#include "src/stdio/printf_core/core_structs.h" +#include "src/stdio/printf_core/error_mapper.h" #include "src/stdio/printf_core/vfprintf_internal.h" #include "hdr/types/FILE.h" @@ -29,9 +32,19 @@ LLVM_LIBC_FUNCTION(int, vprintf, internal::ArgList args(vlist); // This holder class allows for easier copying // and pointer semantics, as well as handling // destruction automatically. - int ret_val = printf_core::vfprintf_internal( + auto ret_val = printf_core::vfprintf_internal( reinterpret_cast<::FILE *>(PRINTF_STDOUT), format, args); - return ret_val; + if (!ret_val.has_value()) { + libc_errno = printf_core::internal_error_to_errno(ret_val.error()); + return -1; + } + if (ret_val.value() > static_cast<size_t>(cpp::numeric_limits<int>::max())) { + libc_errno = + printf_core::internal_error_to_errno(-printf_core::OVERFLOW_ERROR); + return -1; + } + + return static_cast<int>(ret_val.value()); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdio/printf_core/CMakeLists.txt b/libc/src/stdio/printf_core/CMakeLists.txt index ee66145e60156..624129b2b36e7 100644 --- a/libc/src/stdio/printf_core/CMakeLists.txt +++ b/libc/src/stdio/printf_core/CMakeLists.txt @@ -32,6 +32,17 @@ if(printf_config_copts) list(PREPEND printf_config_copts "COMPILE_OPTIONS") endif() +if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${LIBC_TARGET_OS}) + add_subdirectory(${LIBC_TARGET_OS}) +else() + add_subdirectory(generic) +endif() + +set(target_error_mapper libc.src.stdio.printf_core.${LIBC_TARGET_OS}.error_mapper) +if(NOT TARGET ${target_error_mapper}) + set(target_error_mapper libc.src.stdio.printf_core.generic.error_mapper) +endif() + add_header_library( printf_config HDRS @@ -47,6 +58,7 @@ add_header_library( libc.include.inttypes libc.src.__support.CPP.string_view libc.src.__support.FPUtil.fp_bits + libc.hdr.errno_macros ) add_header_library( @@ -125,6 +137,7 @@ add_header_library( .writer .core_structs libc.src.__support.arg_list + libc.src.__support.error_or ) add_header_library( @@ -136,10 +149,20 @@ add_header_library( libc.hdr.func.free libc.hdr.func.realloc libc.src.__support.arg_list + libc.src.__support.error_or libc.src.stdio.printf_core.printf_main libc.src.stdio.printf_core.writer ) +add_header_library( + error_mapper + HDRS + error_mapper.h + DEPENDS + ${target_error_mapper} + libc.src.__support.macros.properties.architectures +) + if(NOT (TARGET libc.src.__support.File.file) AND LLVM_LIBC_FULL_BUILD) # Not all platforms have a file implementation. If file is unvailable, and a # full build is requested, then we must skip all file based printf sections. 
@@ -152,8 +175,10 @@ add_header_library( vfprintf_internal.h DEPENDS libc.src.__support.File.file + libc.src.__support.error_or libc.src.__support.arg_list libc.src.stdio.printf_core.printf_main libc.src.stdio.printf_core.writer ${use_system_file} ) + diff --git a/libc/src/stdio/printf_core/core_structs.h b/libc/src/stdio/printf_core/core_structs.h index e27f77b6b594a..0d41f2244d8da 100644 --- a/libc/src/stdio/printf_core/core_structs.h +++ b/libc/src/stdio/printf_core/core_structs.h @@ -132,14 +132,17 @@ template <typename T> LIBC_INLINE constexpr TypeDesc type_desc_from_type() { // This is the value to be returned by conversions when no error has occurred. constexpr int WRITE_OK = 0; -// These are the printf return values for when an error has occurred. They are -// all negative, and should be distinct. -constexpr int FILE_WRITE_ERROR = -1; -constexpr int FILE_STATUS_ERROR = -2; -constexpr int NULLPTR_WRITE_ERROR = -3; -constexpr int INT_CONVERSION_ERROR = -4; -constexpr int FIXED_POINT_CONVERSION_ERROR = -5; -constexpr int ALLOCATION_ERROR = -6; +// These are the error return values used by the printf engine when an +// error has occurred. They are all large negative, distinct values starting +// from -1000 to not overlap with system errors. +constexpr int FILE_WRITE_ERROR = -1001; +constexpr int FILE_STATUS_ERROR = -1002; +constexpr int NULLPTR_WRITE_ERROR = -1003; +constexpr int INT_CONVERSION_ERROR = -1004; +constexpr int FIXED_POINT_CONVERSION_ERROR = -1005; +constexpr int ALLOCATION_ERROR = -1006; +constexpr int OVERFLOW_ERROR = -1007; + } // namespace printf_core } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdio/printf_core/error_mapper.h b/libc/src/stdio/printf_core/error_mapper.h new file mode 100644 index 0000000000000..23030930133a1 --- /dev/null +++ b/libc/src/stdio/printf_core/error_mapper.h @@ -0,0 +1,21 @@ +//===-- Error mapper for printf ---------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_STDIO_PRINTF_CORE_ERROR_MAPPER_H +#define LLVM_LIBC_SRC_STDIO_PRINTF_CORE_ERROR_MAPPER_H + +#include "src/__support/macros/properties/architectures.h" + +// Maps internal errors to the available errnos on the platform. +#if defined(__linux__) +#include "linux/error_mapper.h" +#else +#include "generic/error_mapper.h" +#endif + +#endif // LLVM_LIBC_SRC_STDIO_PRINTF_CORE_ERROR_MAPPER_H diff --git a/libc/src/stdio/printf_core/generic/CMakeLists.txt b/libc/src/stdio/printf_core/generic/CMakeLists.txt new file mode 100644 index 0000000000000..2f0143d992e31 --- /dev/null +++ b/libc/src/stdio/printf_core/generic/CMakeLists.txt @@ -0,0 +1,8 @@ +add_header_library( + error_mapper + HDRS + error_mapper.h + DEPENDS + libc.src.stdio.printf_core.core_structs + libc.hdr.errno_macros +) diff --git a/libc/src/stdio/printf_core/generic/error_mapper.h b/libc/src/stdio/printf_core/generic/error_mapper.h new file mode 100644 index 0000000000000..d8cdd2cc2dbaa --- /dev/null +++ b/libc/src/stdio/printf_core/generic/error_mapper.h @@ -0,0 +1,49 @@ +//===-- Generic implementation of error mapper ------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_STDIO_PRINTF_CORE_GENERIC_ERROR_MAPPER_H
+#define LLVM_LIBC_SRC_STDIO_PRINTF_CORE_GENERIC_ERROR_MAPPER_H
+
+#include "hdr/errno_macros.h"
+#include "src/stdio/printf_core/core_structs.h"
+#include "src/stdio/printf_core/error_mapper.h"
+
+namespace LIBC_NAMESPACE_DECL {
+namespace printf_core {
+
+LIBC_INLINE static int internal_error_to_errno(int internal_error) {
+  // A system error occurred; return it as is.
+  if (internal_error < 1001 && internal_error > 0) {
+    return internal_error;
+  }
+
+  // Map internal error to the available C standard errnos.
+  switch (-internal_error) {
+  case WRITE_OK:
+    return 0;
+  case FILE_WRITE_ERROR:
+  case FILE_STATUS_ERROR:
+  case NULLPTR_WRITE_ERROR:
+  case ALLOCATION_ERROR:
+    return EDOM;
+  case INT_CONVERSION_ERROR:
+  case FIXED_POINT_CONVERSION_ERROR:
+  case OVERFLOW_ERROR:
+    return ERANGE;
+  default:
+    LIBC_ASSERT(
+        false &&
+        "Invalid internal printf error code passed to internal_error_to_errno");
+    return EDOM;
+  }
+}
+
+} // namespace printf_core
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC_STDIO_PRINTF_CORE_GENERIC_ERROR_MAPPER_H
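[Editorial note] Every printf entry point later in this series consumes the mapper through the same epilogue; a minimal caller-side sketch, condensed from the snprintf/sprintf/vprintf hunks below (the helper name `finish_printf` is hypothetical):

    // Convert the internal ErrorOr<size_t> result into the C-style int
    // result, setting libc_errno on failure.
    LIBC_INLINE int finish_printf(ErrorOr<size_t> ret_val) {
      if (!ret_val.has_value()) {
        // Internal printf error: translate it to a platform errno.
        libc_errno = printf_core::internal_error_to_errno(ret_val.error());
        return -1;
      }
      if (ret_val.value() >
          static_cast<size_t>(cpp::numeric_limits<int>::max())) {
        // More than INT_MAX bytes were written; report this as an overflow.
        libc_errno =
            printf_core::internal_error_to_errno(-printf_core::OVERFLOW_ERROR);
        return -1;
      }
      return static_cast<int>(ret_val.value());
    }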
diff --git a/libc/src/stdio/printf_core/linux/CMakeLists.txt b/libc/src/stdio/printf_core/linux/CMakeLists.txt
new file mode 100644
index 0000000000000..2f0143d992e31
--- /dev/null
+++ b/libc/src/stdio/printf_core/linux/CMakeLists.txt
@@ -0,0 +1,8 @@
+add_header_library(
+  error_mapper
+  HDRS
+    error_mapper.h
+  DEPENDS
+    libc.src.stdio.printf_core.core_structs
+    libc.hdr.errno_macros
+)
diff --git a/libc/src/stdio/printf_core/linux/error_mapper.h b/libc/src/stdio/printf_core/linux/error_mapper.h
new file mode 100644
index 0000000000000..3c2fe663072d0
--- /dev/null
+++ b/libc/src/stdio/printf_core/linux/error_mapper.h
@@ -0,0 +1,54 @@
+//===-- Linux implementation of error mapper --------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_STDIO_PRINTF_CORE_LINUX_ERROR_MAPPER_H
+#define LLVM_LIBC_SRC_STDIO_PRINTF_CORE_LINUX_ERROR_MAPPER_H
+
+#include "hdr/errno_macros.h"
+#include "src/stdio/printf_core/core_structs.h"
+#include "src/stdio/printf_core/error_mapper.h"
+
+namespace LIBC_NAMESPACE_DECL {
+namespace printf_core {
+
+LIBC_INLINE static int internal_error_to_errno(int internal_error) {
+  // A system error occurred; return it as is.
+  if (internal_error < 1001 && internal_error > 0) {
+    return internal_error;
+  }
+
+  // Map internal error to POSIX errnos.
+  switch (-internal_error) {
+  case WRITE_OK:
+    return 0;
+  case FILE_WRITE_ERROR:
+    return EIO;
+  case FILE_STATUS_ERROR:
+    return EIO;
+  case NULLPTR_WRITE_ERROR:
+    return EINVAL;
+  case INT_CONVERSION_ERROR:
+    return ERANGE;
+  case FIXED_POINT_CONVERSION_ERROR:
+    return EINVAL;
+  case ALLOCATION_ERROR:
+    return ENOMEM;
+  case OVERFLOW_ERROR:
+    return EOVERFLOW;
+  default:
+    LIBC_ASSERT(
+        false &&
+        "Invalid internal printf error code passed to internal_error_to_errno");
+    return EINVAL;
+  }
+}
+
+} // namespace printf_core
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC_STDIO_PRINTF_CORE_LINUX_ERROR_MAPPER_H
diff --git a/libc/src/stdio/printf_core/printf_main.h b/libc/src/stdio/printf_core/printf_main.h
index 57f29858d5298..1c7a7237c097d 100644
--- a/libc/src/stdio/printf_core/printf_main.h
+++ b/libc/src/stdio/printf_core/printf_main.h
@@ -10,6 +10,7 @@
 #define LLVM_LIBC_SRC_STDIO_PRINTF_CORE_PRINTF_MAIN_H
 
 #include "src/__support/arg_list.h"
+#include "src/__support/error_or.h"
 #include "src/__support/macros/config.h"
 #include "src/stdio/printf_core/converter.h"
 #include "src/stdio/printf_core/core_structs.h"
@@ -22,8 +23,9 @@ namespace LIBC_NAMESPACE_DECL {
 namespace printf_core {
 
 template <WriteMode write_mode>
-int printf_main(Writer<write_mode> *writer, const char *__restrict str,
-                internal::ArgList &args) {
+ErrorOr<size_t> printf_main(Writer<write_mode> *writer,
+                            const char *__restrict str,
+                            internal::ArgList &args) {
   Parser<internal::ArgList> parser(str, args);
   int result = 0;
   for (FormatSection cur_section = parser.get_next_section();
@@ -33,9 +35,9 @@
       result = convert(writer, cur_section);
     else
       result = writer->write(cur_section.raw_string);
-    if (result < 0)
-      return result;
+    if (result < 0)
+      return Error(-result);
   }
 
   return writer->get_chars_written();
diff --git a/libc/src/stdio/printf_core/vasprintf_internal.h b/libc/src/stdio/printf_core/vasprintf_internal.h
index 283d8df2810fb..41df17b67f35b 100644
--- a/libc/src/stdio/printf_core/vasprintf_internal.h
+++ b/libc/src/stdio/printf_core/vasprintf_internal.h
@@ -10,6 +10,7 @@
 #include "hdr/func/malloc.h"
 #include "hdr/func/realloc.h"
 #include "src/__support/arg_list.h"
+#include "src/__support/error_or.h"
 #include "src/stdio/printf_core/core_structs.h"
 #include "src/stdio/printf_core/printf_main.h"
 #include "src/stdio/printf_core/writer.h"
@@ -29,7 +30,7 @@ LIBC_INLINE int resize_overflow_hook(cpp::string_view new_str, void *target) {
   if (new_buff == nullptr) {
     if (wb->buff != wb->init_buff)
       free(wb->buff);
-    return printf_core::ALLOCATION_ERROR;
+    return ALLOCATION_ERROR;
   }
   if (isBuffOnStack)
     inline_memcpy(new_buff, wb->buff, wb->buff_cur);
@@ -42,27 +43,28 @@
 
 constexpr size_t DEFAULT_BUFFER_SIZE = 200;
 
-LIBC_INLINE int vasprintf_internal(char **ret, const char *__restrict format,
-                                   internal::ArgList args) {
+LIBC_INLINE ErrorOr<size_t> vasprintf_internal(char **ret,
+                                               const char *__restrict format,
+                                               internal::ArgList args) {
   char init_buff_on_stack[DEFAULT_BUFFER_SIZE];
   printf_core::WriteBuffer<Mode<WriteMode::RESIZE_AND_FILL_BUFF>::value> wb(
       init_buff_on_stack, DEFAULT_BUFFER_SIZE, resize_overflow_hook);
   printf_core::Writer writer(wb);
 
   auto ret_val = printf_core::printf_main(&writer, format, args);
-  if (ret_val < 0) {
+  if (!ret_val.has_value()) {
     *ret = nullptr;
-    return -1;
+    return ret_val;
   }
   if (wb.buff == init_buff_on_stack) {
-    *ret = static_cast<char *>(malloc(ret_val + 1));
+    *ret = static_cast<char *>(malloc(ret_val.value() + 1));
     if (ret == nullptr)
-      return printf_core::ALLOCATION_ERROR;
-    inline_memcpy(*ret, wb.buff, ret_val);
+      return Error(ALLOCATION_ERROR);
+    inline_memcpy(*ret, wb.buff, ret_val.value());
   } else {
     *ret = wb.buff;
   }
-  (*ret)[ret_val] = '\0';
+  (*ret)[ret_val.value()] = '\0';
   return ret_val;
 }
 } // namespace printf_core
diff --git a/libc/src/stdio/printf_core/vfprintf_internal.h b/libc/src/stdio/printf_core/vfprintf_internal.h
index 630de9d9d43dd..564441d3bf51a 100644
--- a/libc/src/stdio/printf_core/vfprintf_internal.h
+++ b/libc/src/stdio/printf_core/vfprintf_internal.h
@@ -11,6 +11,7 @@
 
 #include "src/__support/File/file.h"
 #include "src/__support/arg_list.h"
+#include "src/__support/error_or.h"
 #include "src/__support/macros/attributes.h" // For LIBC_INLINE
 #include "src/__support/macros/config.h"
 #include "src/stdio/printf_core/core_structs.h"
@@ -35,8 +36,8 @@ LIBC_INLINE void funlockfile(FILE *f) {
   reinterpret_cast<LIBC_NAMESPACE::File *>(f)->unlock();
 }
 
-LIBC_INLINE size_t fwrite_unlocked(const void *ptr, size_t size, size_t nmemb,
-                                   FILE *f) {
+LIBC_INLINE FileIOResult fwrite_unlocked(const void *ptr, size_t size,
                                          size_t nmemb, FILE *f) {
   return reinterpret_cast<LIBC_NAMESPACE::File *>(f)->write_unlocked(
       ptr, size * nmemb);
 }
@@ -47,9 +48,11 @@ LIBC_INLINE void flockfile(::FILE *f) { ::flockfile(f); }
 
 LIBC_INLINE void funlockfile(::FILE *f) { ::funlockfile(f); }
 
-LIBC_INLINE size_t fwrite_unlocked(const void *ptr, size_t size, size_t nmemb,
-                                   ::FILE *f) {
-  return ::fwrite_unlocked(ptr, size, nmemb, f);
+LIBC_INLINE FileIOResult fwrite_unlocked(const void *ptr, size_t size,
+                                         size_t nmemb, ::FILE *f) {
+  // We need to use the system errno in this case, as the system write will set
+  // it and we must propagate it back into our code.
+  return {::fwrite_unlocked(ptr, size, nmemb, f), errno};
 }
 #endif // LIBC_COPT_STDIO_USE_SYSTEM_FILE
 } // namespace internal
@@ -60,26 +63,38 @@ LIBC_INLINE int file_write_hook(cpp::string_view new_str, void *fp) {
   ::FILE *target_file = reinterpret_cast<::FILE *>(fp);
   // Write new_str to the target file. The logic preventing a zero-length write
   // is in the writer, so we don't check here.
-  size_t written = internal::fwrite_unlocked(new_str.data(), sizeof(char),
-                                             new_str.size(), target_file);
-  if (written != new_str.size() || internal::ferror_unlocked(target_file))
+  auto write_result = internal::fwrite_unlocked(new_str.data(), sizeof(char),
+                                                new_str.size(), target_file);
+  // Propagate the actual system error carried in the FileIOResult.
+  if (write_result.has_error())
+    return -write_result.error;
+
+  // In case a short write occurred, or the error was not set on FileIOResult
+  // for some reason.
+ if (write_result.value != new_str.size() || + internal::ferror_unlocked(target_file)) return FILE_WRITE_ERROR; + return WRITE_OK; } -LIBC_INLINE int vfprintf_internal(::FILE *__restrict stream, - const char *__restrict format, - internal::ArgList &args) { +LIBC_INLINE ErrorOr<size_t> vfprintf_internal(::FILE *__restrict stream, + const char *__restrict format, + internal::ArgList &args) { constexpr size_t BUFF_SIZE = 1024; char buffer[BUFF_SIZE]; printf_core::WriteBuffer<Mode<WriteMode::FLUSH_TO_STREAM>::value> wb( buffer, BUFF_SIZE, &file_write_hook, reinterpret_cast<void *>(stream)); Writer writer(wb); internal::flockfile(stream); - int retval = printf_main(&writer, format, args); + auto retval = printf_main(&writer, format, args); + if (!retval.has_value()) { + internal::funlockfile(stream); + return retval; + } int flushval = wb.overflow_write(""); if (flushval != WRITE_OK) - retval = flushval; + retval = Error(-flushval); internal::funlockfile(stream); return retval; } diff --git a/libc/src/stdio/printf_core/write_int_converter.h b/libc/src/stdio/printf_core/write_int_converter.h index efcff278bd284..04b2bef05bc7b 100644 --- a/libc/src/stdio/printf_core/write_int_converter.h +++ b/libc/src/stdio/printf_core/write_int_converter.h @@ -29,11 +29,11 @@ LIBC_INLINE int convert_write_int(Writer<write_mode> *writer, return NULLPTR_WRITE_ERROR; #endif // LIBC_COPT_PRINTF_NO_NULLPTR_CHECKS - int written = writer->get_chars_written(); + size_t written = writer->get_chars_written(); switch (to_conv.length_modifier) { case LengthModifier::none: - *reinterpret_cast<int *>(to_conv.conv_val_ptr) = written; + *reinterpret_cast<int *>(to_conv.conv_val_ptr) = static_cast<int>(written); break; case LengthModifier::l: *reinterpret_cast<long *>(to_conv.conv_val_ptr) = written; diff --git a/libc/src/stdio/printf_core/writer.h b/libc/src/stdio/printf_core/writer.h index 1d4734a51b9b8..9de108ece510f 100644 --- a/libc/src/stdio/printf_core/writer.h +++ b/libc/src/stdio/printf_core/writer.h @@ -127,7 +127,7 @@ template <WriteMode write_mode> struct WriteBuffer { template <WriteMode write_mode> class Writer final { WriteBuffer<write_mode> &wb; - int chars_written = 0; + size_t chars_written = 0; LIBC_INLINE int pad(char new_char, size_t length) { // First, fill as much of the buffer as possible with the padding char. @@ -161,7 +161,7 @@ template <WriteMode write_mode> class Writer final { // Takes a string, copies it into the buffer if there is space, else passes it // to the overflow mechanism to be handled separately. LIBC_INLINE int write(cpp::string_view new_string) { - chars_written += static_cast<int>(new_string.size()); + chars_written += new_string.size(); if (LIBC_LIKELY(wb.buff_cur + new_string.size() <= wb.buff_len)) { inline_memcpy(wb.buff + wb.buff_cur, new_string.data(), new_string.size()); @@ -175,7 +175,7 @@ template <WriteMode write_mode> class Writer final { // if there is space, else calls pad which will loop and call the overflow // mechanism on a secondary buffer. 
LIBC_INLINE int write(char new_char, size_t length) { - chars_written += static_cast<int>(length); + chars_written += length; if (LIBC_LIKELY(wb.buff_cur + length <= wb.buff_len)) { inline_memset(wb.buff + wb.buff_cur, static_cast<unsigned char>(new_char), @@ -199,7 +199,7 @@ template <WriteMode write_mode> class Writer final { return wb.overflow_write(char_string_view); } - LIBC_INLINE int get_chars_written() { return chars_written; } + LIBC_INLINE size_t get_chars_written() { return chars_written; } }; // Class-template auto deduction helpers. diff --git a/libc/src/stdio/snprintf.cpp b/libc/src/stdio/snprintf.cpp index c8940862f711f..d95195f6f485f 100644 --- a/libc/src/stdio/snprintf.cpp +++ b/libc/src/stdio/snprintf.cpp @@ -8,8 +8,12 @@ #include "src/stdio/snprintf.h" +#include "src/__support/CPP/limits.h" #include "src/__support/arg_list.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" +#include "src/stdio/printf_core/core_structs.h" +#include "src/stdio/printf_core/error_mapper.h" #include "src/stdio/printf_core/printf_main.h" #include "src/stdio/printf_core/writer.h" @@ -32,10 +36,21 @@ LLVM_LIBC_FUNCTION(int, snprintf, wb(buffer, (buffsz > 0 ? buffsz - 1 : 0)); printf_core::Writer writer(wb); - int ret_val = printf_core::printf_main(&writer, format, args); + auto ret_val = printf_core::printf_main(&writer, format, args); + if (!ret_val.has_value()) { + libc_errno = printf_core::internal_error_to_errno(ret_val.error()); + return -1; + } if (buffsz > 0) // if the buffsz is 0 the buffer may be a null pointer. wb.buff[wb.buff_cur] = '\0'; - return ret_val; + + if (ret_val.value() > static_cast<size_t>(cpp::numeric_limits<int>::max())) { + libc_errno = + printf_core::internal_error_to_errno(-printf_core::OVERFLOW_ERROR); + return -1; + } + + return static_cast<int>(ret_val.value()); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdio/sprintf.cpp b/libc/src/stdio/sprintf.cpp index 7be97d3591aaf..2a9b6ea7c5e50 100644 --- a/libc/src/stdio/sprintf.cpp +++ b/libc/src/stdio/sprintf.cpp @@ -10,7 +10,10 @@ #include "src/__support/CPP/limits.h" #include "src/__support/arg_list.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" +#include "src/stdio/printf_core/core_structs.h" +#include "src/stdio/printf_core/error_mapper.h" #include "src/stdio/printf_core/printf_main.h" #include "src/stdio/printf_core/writer.h" @@ -33,9 +36,20 @@ LLVM_LIBC_FUNCTION(int, sprintf, wb(buffer, cpp::numeric_limits<size_t>::max()); printf_core::Writer writer(wb); - int ret_val = printf_core::printf_main(&writer, format, args); + auto ret_val = printf_core::printf_main(&writer, format, args); + if (!ret_val.has_value()) { + libc_errno = printf_core::internal_error_to_errno(ret_val.error()); + return -1; + } wb.buff[wb.buff_cur] = '\0'; - return ret_val; + + if (ret_val.value() > static_cast<size_t>(cpp::numeric_limits<int>::max())) { + libc_errno = + printf_core::internal_error_to_errno(-printf_core::OVERFLOW_ERROR); + return -1; + } + + return static_cast<int>(ret_val.value()); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdio/vasprintf.cpp b/libc/src/stdio/vasprintf.cpp index 4a44d4a0f8842..bd77cd8864312 100644 --- a/libc/src/stdio/vasprintf.cpp +++ b/libc/src/stdio/vasprintf.cpp @@ -7,7 +7,11 @@ //===----------------------------------------------------------------------===// #include "src/stdio/vasprintf.h" +#include "src/__support/CPP/limits.h" #include "src/__support/arg_list.h" +#include "src/__support/libc_errno.h" 
+#include "src/stdio/printf_core/core_structs.h" +#include "src/stdio/printf_core/error_mapper.h" #include "src/stdio/printf_core/vasprintf_internal.h" namespace LIBC_NAMESPACE_DECL { @@ -18,7 +22,17 @@ LLVM_LIBC_FUNCTION(int, vasprintf, internal::ArgList args(vlist); // This holder class allows for easier copying // and pointer semantics, as well as handling // destruction automatically. - return printf_core::vasprintf_internal(ret, format, args); + auto ret_val = printf_core::vasprintf_internal(ret, format, args); + if (!ret_val.has_value()) { + libc_errno = printf_core::internal_error_to_errno(ret_val.error()); + return -1; + } + if (ret_val.value() > static_cast<size_t>(cpp::numeric_limits<int>::max())) { + libc_errno = + printf_core::internal_error_to_errno(-printf_core::OVERFLOW_ERROR); + return -1; + } + return static_cast<int>(ret_val.value()); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdio/vsnprintf.cpp b/libc/src/stdio/vsnprintf.cpp index b07a2499a0dd3..5d936360c0857 100644 --- a/libc/src/stdio/vsnprintf.cpp +++ b/libc/src/stdio/vsnprintf.cpp @@ -8,8 +8,12 @@ #include "src/stdio/vsnprintf.h" +#include "src/__support/CPP/limits.h" #include "src/__support/arg_list.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" +#include "src/stdio/printf_core/core_structs.h" +#include "src/stdio/printf_core/error_mapper.h" #include "src/stdio/printf_core/printf_main.h" #include "src/stdio/printf_core/writer.h" @@ -29,10 +33,21 @@ LLVM_LIBC_FUNCTION(int, vsnprintf, wb(buffer, (buffsz > 0 ? buffsz - 1 : 0)); printf_core::Writer writer(wb); - int ret_val = printf_core::printf_main(&writer, format, args); + auto ret_val = printf_core::printf_main(&writer, format, args); + if (!ret_val.has_value()) { + libc_errno = printf_core::internal_error_to_errno(ret_val.error()); + return -1; + } if (buffsz > 0) // if the buffsz is 0 the buffer may be a null pointer. 
wb.buff[wb.buff_cur] = '\0'; - return ret_val; + + if (ret_val.value() > static_cast<size_t>(cpp::numeric_limits<int>::max())) { + libc_errno = + printf_core::internal_error_to_errno(-printf_core::OVERFLOW_ERROR); + return -1; + } + + return static_cast<int>(ret_val.value()); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdio/vsprintf.cpp b/libc/src/stdio/vsprintf.cpp index 26d497be42125..f9cf8118534f6 100644 --- a/libc/src/stdio/vsprintf.cpp +++ b/libc/src/stdio/vsprintf.cpp @@ -10,7 +10,10 @@ #include "src/__support/CPP/limits.h" #include "src/__support/arg_list.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" +#include "src/stdio/printf_core/core_structs.h" +#include "src/stdio/printf_core/error_mapper.h" #include "src/stdio/printf_core/printf_main.h" #include "src/stdio/printf_core/writer.h" @@ -30,9 +33,19 @@ LLVM_LIBC_FUNCTION(int, vsprintf, wb(buffer, cpp::numeric_limits<size_t>::max()); printf_core::Writer writer(wb); - int ret_val = printf_core::printf_main(&writer, format, args); + auto ret_val = printf_core::printf_main(&writer, format, args); + if (!ret_val.has_value()) { + libc_errno = printf_core::internal_error_to_errno(ret_val.error()); + return -1; + } wb.buff[wb.buff_cur] = '\0'; - return ret_val; + + if (ret_val.value() > static_cast<size_t>(cpp::numeric_limits<int>::max())) { + libc_errno = + printf_core::internal_error_to_errno(-printf_core::OVERFLOW_ERROR); + return -1; + } + return static_cast<int>(ret_val.value()); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdlib/CMakeLists.txt b/libc/src/stdlib/CMakeLists.txt index c464f82dcbda7..1ccdcc8bec148 100644 --- a/libc/src/stdlib/CMakeLists.txt +++ b/libc/src/stdlib/CMakeLists.txt @@ -73,6 +73,8 @@ add_entrypoint_object( strfromf.h DEPENDS .str_from_util + libc.src.__support.CPP.limits + libc.src.stdio.printf_core.error_mapper ) add_entrypoint_object( @@ -83,6 +85,8 @@ add_entrypoint_object( strfromd.h DEPENDS .str_from_util + libc.src.__support.CPP.limits + libc.src.stdio.printf_core.error_mapper ) add_entrypoint_object( @@ -93,6 +97,8 @@ add_entrypoint_object( strfroml.h DEPENDS .str_from_util + libc.src.__support.CPP.limits + libc.src.stdio.printf_core.error_mapper ) add_header_library( diff --git a/libc/src/stdlib/strfromd.cpp b/libc/src/stdlib/strfromd.cpp index f51e6d4c7f1df..71e257f08645b 100644 --- a/libc/src/stdlib/strfromd.cpp +++ b/libc/src/stdlib/strfromd.cpp @@ -7,7 +7,10 @@ //===----------------------------------------------------------------------===// #include "src/stdlib/strfromd.h" +#include "src/__support/CPP/limits.h" #include "src/__support/macros/config.h" +#include "src/stdio/printf_core/core_structs.h" +#include "src/stdio/printf_core/error_mapper.h" #include "src/stdlib/str_from_util.h" namespace LIBC_NAMESPACE_DECL { @@ -36,7 +39,13 @@ LLVM_LIBC_FUNCTION(int, strfromd, if (n > 0) wb.buff[wb.buff_cur] = '\0'; - return writer.get_chars_written(); + if (writer.get_chars_written() > + static_cast<size_t>(cpp::numeric_limits<int>::max())) { + libc_errno = + printf_core::internal_error_to_errno(-printf_core::OVERFLOW_ERROR); + return -1; + } + return static_cast<int>(writer.get_chars_written()); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdlib/strfromf.cpp b/libc/src/stdlib/strfromf.cpp index 14dbfdb25bab6..65f242b200f18 100644 --- a/libc/src/stdlib/strfromf.cpp +++ b/libc/src/stdlib/strfromf.cpp @@ -7,7 +7,10 @@ //===----------------------------------------------------------------------===// #include 
"src/stdlib/strfromf.h" +#include "src/__support/CPP/limits.h" #include "src/__support/macros/config.h" +#include "src/stdio/printf_core/core_structs.h" +#include "src/stdio/printf_core/error_mapper.h" #include "src/stdlib/str_from_util.h" namespace LIBC_NAMESPACE_DECL { @@ -36,7 +39,13 @@ LLVM_LIBC_FUNCTION(int, strfromf, if (n > 0) wb.buff[wb.buff_cur] = '\0'; - return writer.get_chars_written(); + if (writer.get_chars_written() > + static_cast<size_t>(cpp::numeric_limits<int>::max())) { + libc_errno = + printf_core::internal_error_to_errno(-printf_core::OVERFLOW_ERROR); + return -1; + } + return static_cast<int>(writer.get_chars_written()); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdlib/strfroml.cpp b/libc/src/stdlib/strfroml.cpp index 12f22a8a2fb65..31668a0323c93 100644 --- a/libc/src/stdlib/strfroml.cpp +++ b/libc/src/stdlib/strfroml.cpp @@ -7,7 +7,10 @@ //===----------------------------------------------------------------------===// #include "src/stdlib/strfroml.h" +#include "src/__support/CPP/limits.h" #include "src/__support/macros/config.h" +#include "src/stdio/printf_core/core_structs.h" +#include "src/stdio/printf_core/error_mapper.h" #include "src/stdlib/str_from_util.h" namespace LIBC_NAMESPACE_DECL { @@ -41,7 +44,13 @@ LLVM_LIBC_FUNCTION(int, strfroml, if (n > 0) wb.buff[wb.buff_cur] = '\0'; - return writer.get_chars_written(); + if (writer.get_chars_written() > + static_cast<size_t>(cpp::numeric_limits<int>::max())) { + libc_errno = + printf_core::internal_error_to_errno(-printf_core::OVERFLOW_ERROR); + return -1; + } + return static_cast<int>(writer.get_chars_written()); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/time/strftime_core/strftime_main.h b/libc/src/time/strftime_core/strftime_main.h index c7e590627094a..2b136d83234cd 100644 --- a/libc/src/time/strftime_core/strftime_main.h +++ b/libc/src/time/strftime_core/strftime_main.h @@ -36,7 +36,8 @@ int strftime_main(printf_core::Writer<write_mode> *writer, return result; } - return writer->get_chars_written(); + // TODO: Use ErrorOr<size_t> + return static_cast<int>(writer->get_chars_written()); } } // namespace strftime_core diff --git a/libc/test/src/stdio/CMakeLists.txt b/libc/test/src/stdio/CMakeLists.txt index eec108bc12ca5..d71f1dff11943 100644 --- a/libc/test/src/stdio/CMakeLists.txt +++ b/libc/test/src/stdio/CMakeLists.txt @@ -186,6 +186,8 @@ add_libc_test( fprintf_test.cpp DEPENDS libc.src.stdio.fprintf + libc.test.UnitTest.ErrnoCheckingTest + libc.test.UnitTest.ErrnoSetterMatcher ${fprintf_test_deps} COMPILE_OPTIONS ${use_system_file} diff --git a/libc/test/src/stdio/fprintf_test.cpp b/libc/test/src/stdio/fprintf_test.cpp index 6799323cc6ad9..1b35a09645939 100644 --- a/libc/test/src/stdio/fprintf_test.cpp +++ b/libc/test/src/stdio/fprintf_test.cpp @@ -15,6 +15,9 @@ #include "src/stdio/fprintf.h" +#include "src/__support/CPP/limits.h" +#include "test/UnitTest/ErrnoCheckingTest.h" +#include "test/UnitTest/ErrnoSetterMatcher.h" #include "test/UnitTest/Test.h" namespace printf_test { @@ -31,6 +34,8 @@ using ::fread; #endif // LIBC_COPT_STDIO_USE_SYSTEM_FILE } // namespace printf_test +using LlvmLibcFPrintfTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest; + TEST(LlvmLibcFPrintfTest, WriteToFile) { const char *FILENAME = APPEND_LIBC_TEST("fprintf_output.test"); auto FILE_PATH = libc_make_test_file_path(FILENAME); @@ -78,6 +83,25 @@ TEST(LlvmLibcFPrintfTest, WriteToFile) { written = LIBC_NAMESPACE::fprintf(file, "Writing to a read only file should fail."); EXPECT_LT(written, 0); + 
ASSERT_ERRNO_EQ(EBADF); + + ASSERT_EQ(printf_test::fclose(file), 0); +} + +#if !defined(LIBC_COPT_PRINTF_NO_NULLPTR_CHECKS) && \ + !defined(LIBC_COPT_PRINTF_DISABLE_WRITE_INT) +TEST(LlvmLibcFPrintfTest, NullPtrCheck) { + const char *FILENAME = APPEND_LIBC_TEST("fprintf_nullptr.test"); + auto FILE_PATH = libc_make_test_file_path(FILENAME); + + ::FILE *file = printf_test::fopen(FILE_PATH, "w"); + ASSERT_FALSE(file == nullptr); + + int ret = + LIBC_NAMESPACE::fprintf(file, "hello %n", static_cast<int *>(nullptr)); + EXPECT_LT(ret, 0); + ASSERT_ERRNO_EQ(EINVAL); ASSERT_EQ(printf_test::fclose(file), 0); } +#endif // LIBC_COPT_PRINTF_NO_NULLPTR_CHECKS diff --git a/libc/test/src/stdio/printf_core/converter_test.cpp b/libc/test/src/stdio/printf_core/converter_test.cpp index bf088937e4104..2dae2a22c864c 100644 --- a/libc/test/src/stdio/printf_core/converter_test.cpp +++ b/libc/test/src/stdio/printf_core/converter_test.cpp @@ -38,7 +38,7 @@ TEST_F(LlvmLibcPrintfConverterTest, SimpleRawConversion) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ(str, "abc"); - ASSERT_EQ(writer.get_chars_written(), 3); + ASSERT_EQ(writer.get_chars_written(), size_t{3}); } TEST_F(LlvmLibcPrintfConverterTest, PercentConversion) { @@ -52,7 +52,7 @@ TEST_F(LlvmLibcPrintfConverterTest, PercentConversion) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ(str, "%"); - ASSERT_EQ(writer.get_chars_written(), 1); + ASSERT_EQ(writer.get_chars_written(), size_t{1}); } TEST_F(LlvmLibcPrintfConverterTest, CharConversionSimple) { @@ -70,7 +70,7 @@ TEST_F(LlvmLibcPrintfConverterTest, CharConversionSimple) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ(str, "D"); - ASSERT_EQ(writer.get_chars_written(), 1); + ASSERT_EQ(writer.get_chars_written(), size_t{1}); } TEST_F(LlvmLibcPrintfConverterTest, CharConversionRightJustified) { @@ -85,7 +85,7 @@ TEST_F(LlvmLibcPrintfConverterTest, CharConversionRightJustified) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ(str, " E"); - ASSERT_EQ(writer.get_chars_written(), 4); + ASSERT_EQ(writer.get_chars_written(), size_t{4}); } TEST_F(LlvmLibcPrintfConverterTest, CharConversionLeftJustified) { @@ -102,7 +102,7 @@ TEST_F(LlvmLibcPrintfConverterTest, CharConversionLeftJustified) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ(str, "F "); - ASSERT_EQ(writer.get_chars_written(), 4); + ASSERT_EQ(writer.get_chars_written(), size_t{4}); } TEST_F(LlvmLibcPrintfConverterTest, StringConversionSimple) { @@ -118,7 +118,7 @@ TEST_F(LlvmLibcPrintfConverterTest, StringConversionSimple) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ(str, "DEF"); - ASSERT_EQ(writer.get_chars_written(), 3); + ASSERT_EQ(writer.get_chars_written(), size_t{3}); } TEST_F(LlvmLibcPrintfConverterTest, StringConversionPrecisionHigh) { @@ -133,7 +133,7 @@ TEST_F(LlvmLibcPrintfConverterTest, StringConversionPrecisionHigh) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ(str, "456"); - ASSERT_EQ(writer.get_chars_written(), 3); + ASSERT_EQ(writer.get_chars_written(), size_t{3}); } TEST_F(LlvmLibcPrintfConverterTest, StringConversionPrecisionLow) { @@ -148,7 +148,7 @@ TEST_F(LlvmLibcPrintfConverterTest, StringConversionPrecisionLow) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ(str, "xy"); - ASSERT_EQ(writer.get_chars_written(), 2); + ASSERT_EQ(writer.get_chars_written(), size_t{2}); } TEST_F(LlvmLibcPrintfConverterTest, StringConversionRightJustified) { @@ -163,7 +163,7 @@ TEST_F(LlvmLibcPrintfConverterTest, StringConversionRightJustified) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ(str, " 789"); - ASSERT_EQ(writer.get_chars_written(), 4); + 
ASSERT_EQ(writer.get_chars_written(), size_t{4}); } TEST_F(LlvmLibcPrintfConverterTest, StringConversionLeftJustified) { @@ -180,7 +180,7 @@ TEST_F(LlvmLibcPrintfConverterTest, StringConversionLeftJustified) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ(str, "ghi "); - ASSERT_EQ(writer.get_chars_written(), 4); + ASSERT_EQ(writer.get_chars_written(), size_t{4}); } TEST_F(LlvmLibcPrintfConverterTest, IntConversionSimple) { @@ -194,7 +194,7 @@ TEST_F(LlvmLibcPrintfConverterTest, IntConversionSimple) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ(str, "12345"); - ASSERT_EQ(writer.get_chars_written(), 5); + ASSERT_EQ(writer.get_chars_written(), size_t{5}); } TEST_F(LlvmLibcPrintfConverterTest, HexConversion) { @@ -211,7 +211,7 @@ TEST_F(LlvmLibcPrintfConverterTest, HexConversion) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ(str, "0x00000000123456ab"); - ASSERT_EQ(writer.get_chars_written(), 18); + ASSERT_EQ(writer.get_chars_written(), size_t{18}); } TEST_F(LlvmLibcPrintfConverterTest, BinaryConversion) { @@ -225,7 +225,7 @@ TEST_F(LlvmLibcPrintfConverterTest, BinaryConversion) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ(str, "101010"); - ASSERT_EQ(writer.get_chars_written(), 6); + ASSERT_EQ(writer.get_chars_written(), size_t{6}); } TEST_F(LlvmLibcPrintfConverterTest, PointerConversion) { @@ -239,7 +239,7 @@ TEST_F(LlvmLibcPrintfConverterTest, PointerConversion) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ(str, "0x123456ab"); - ASSERT_EQ(writer.get_chars_written(), 10); + ASSERT_EQ(writer.get_chars_written(), size_t{10}); } TEST_F(LlvmLibcPrintfConverterTest, OctConversion) { @@ -253,5 +253,5 @@ TEST_F(LlvmLibcPrintfConverterTest, OctConversion) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ(str, "1234"); - ASSERT_EQ(writer.get_chars_written(), 4); + ASSERT_EQ(writer.get_chars_written(), size_t{4}); } diff --git a/libc/test/src/stdio/printf_core/writer_test.cpp b/libc/test/src/stdio/printf_core/writer_test.cpp index d036341be7981..d263cf55aa474 100644 --- a/libc/test/src/stdio/printf_core/writer_test.cpp +++ b/libc/test/src/stdio/printf_core/writer_test.cpp @@ -39,7 +39,7 @@ TEST(LlvmLibcPrintfWriterTest, Write) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ("abc", str); - ASSERT_EQ(writer.get_chars_written(), 3); + ASSERT_EQ(writer.get_chars_written(), size_t{3}); } TEST(LlvmLibcPrintfWriterTest, WriteMultipleTimes) { @@ -53,7 +53,7 @@ TEST(LlvmLibcPrintfWriterTest, WriteMultipleTimes) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ("abcDEF123", str); - ASSERT_EQ(writer.get_chars_written(), 9); + ASSERT_EQ(writer.get_chars_written(), size_t{9}); } TEST(LlvmLibcPrintfWriterTest, WriteChars) { @@ -66,7 +66,7 @@ TEST(LlvmLibcPrintfWriterTest, WriteChars) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ("aaa", str); - ASSERT_EQ(writer.get_chars_written(), 3); + ASSERT_EQ(writer.get_chars_written(), size_t{3}); } TEST(LlvmLibcPrintfWriterTest, WriteCharsMultipleTimes) { @@ -80,7 +80,7 @@ TEST(LlvmLibcPrintfWriterTest, WriteCharsMultipleTimes) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ("aaaDDD111", str); - ASSERT_EQ(writer.get_chars_written(), 9); + ASSERT_EQ(writer.get_chars_written(), size_t{9}); } TEST(LlvmLibcPrintfWriterTest, WriteManyChars) { @@ -102,7 +102,7 @@ TEST(LlvmLibcPrintfWriterTest, WriteManyChars) { "ZZZZZZZZZZ" "ZZZZZZZZZ", str); - ASSERT_EQ(writer.get_chars_written(), 99); + ASSERT_EQ(writer.get_chars_written(), size_t{99}); } TEST(LlvmLibcPrintfWriterTest, MixedWrites) { @@ -117,7 +117,7 @@ TEST(LlvmLibcPrintfWriterTest, MixedWrites) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ("aaaDEF111456", str); - 
ASSERT_EQ(writer.get_chars_written(), 12); + ASSERT_EQ(writer.get_chars_written(), size_t{12}); } TEST(LlvmLibcPrintfWriterTest, WriteWithMaxLength) { @@ -129,7 +129,7 @@ TEST(LlvmLibcPrintfWriterTest, WriteWithMaxLength) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ("abcDEF1234", str); - ASSERT_EQ(writer.get_chars_written(), 12); + ASSERT_EQ(writer.get_chars_written(), size_t{12}); } TEST(LlvmLibcPrintfWriterTest, WriteCharsWithMaxLength) { @@ -141,7 +141,7 @@ TEST(LlvmLibcPrintfWriterTest, WriteCharsWithMaxLength) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ("1111111111", str); - ASSERT_EQ(writer.get_chars_written(), 15); + ASSERT_EQ(writer.get_chars_written(), size_t{15}); } TEST(LlvmLibcPrintfWriterTest, MixedWriteWithMaxLength) { @@ -157,7 +157,7 @@ TEST(LlvmLibcPrintfWriterTest, MixedWriteWithMaxLength) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ("aaaDEF1114", str); - ASSERT_EQ(writer.get_chars_written(), 12); + ASSERT_EQ(writer.get_chars_written(), size_t{12}); } TEST(LlvmLibcPrintfWriterTest, StringWithMaxLengthOne) { @@ -175,7 +175,7 @@ TEST(LlvmLibcPrintfWriterTest, StringWithMaxLengthOne) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ("", str); - ASSERT_EQ(writer.get_chars_written(), 12); + ASSERT_EQ(writer.get_chars_written(), size_t{12}); } TEST(LlvmLibcPrintfWriterTest, NullStringWithZeroMaxLength) { @@ -187,7 +187,7 @@ TEST(LlvmLibcPrintfWriterTest, NullStringWithZeroMaxLength) { writer.write('1', 3); writer.write({"456", 3}); - ASSERT_EQ(writer.get_chars_written(), 12); + ASSERT_EQ(writer.get_chars_written(), size_t{12}); } struct OutBuff { @@ -226,7 +226,7 @@ TEST(LlvmLibcPrintfWriterTest, WriteWithMaxLengthWithCallback) { str[out_buff.cur_pos] = '\0'; ASSERT_STREQ("abcDEF123456", str); - ASSERT_EQ(writer.get_chars_written(), 12); + ASSERT_EQ(writer.get_chars_written(), size_t{12}); } TEST(LlvmLibcPrintfWriterTest, WriteCharsWithMaxLengthWithCallback) { @@ -246,7 +246,7 @@ TEST(LlvmLibcPrintfWriterTest, WriteCharsWithMaxLengthWithCallback) { str[out_buff.cur_pos] = '\0'; ASSERT_STREQ("111111111111111", str); - ASSERT_EQ(writer.get_chars_written(), 15); + ASSERT_EQ(writer.get_chars_written(), size_t{15}); } TEST(LlvmLibcPrintfWriterTest, MixedWriteWithMaxLengthWithCallback) { @@ -269,7 +269,7 @@ TEST(LlvmLibcPrintfWriterTest, MixedWriteWithMaxLengthWithCallback) { str[out_buff.cur_pos] = '\0'; ASSERT_STREQ("aaaDEF111456", str); - ASSERT_EQ(writer.get_chars_written(), 12); + ASSERT_EQ(writer.get_chars_written(), size_t{12}); } TEST(LlvmLibcPrintfWriterTest, ZeroLengthBufferWithCallback) { @@ -292,7 +292,7 @@ TEST(LlvmLibcPrintfWriterTest, ZeroLengthBufferWithCallback) { str[out_buff.cur_pos] = '\0'; ASSERT_STREQ("aaaDEF111456", str); - ASSERT_EQ(writer.get_chars_written(), 12); + ASSERT_EQ(writer.get_chars_written(), size_t{12}); } TEST(LlvmLibcPrintfWriterTest, NullStringWithZeroMaxLengthWithCallback) { @@ -312,7 +312,7 @@ TEST(LlvmLibcPrintfWriterTest, NullStringWithZeroMaxLengthWithCallback) { wb.overflow_write(""); str[out_buff.cur_pos] = '\0'; - ASSERT_EQ(writer.get_chars_written(), 12); + ASSERT_EQ(writer.get_chars_written(), size_t{12}); ASSERT_STREQ("aaaDEF111456", str); } diff --git a/libc/test/src/stdio/snprintf_test.cpp b/libc/test/src/stdio/snprintf_test.cpp index baaa664cdc9ee..95507e0885dbf 100644 --- a/libc/test/src/stdio/snprintf_test.cpp +++ b/libc/test/src/stdio/snprintf_test.cpp @@ -8,8 +8,12 @@ #include "src/stdio/snprintf.h" +#include "test/UnitTest/ErrnoCheckingTest.h" +#include "test/UnitTest/ErrnoSetterMatcher.h" #include "test/UnitTest/Test.h" +using 
LlvmLibcSNPrintfTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest; + // The sprintf test cases cover testing the shared printf functionality, so // these tests will focus on snprintf exclusive features. @@ -59,3 +63,14 @@ TEST(LlvmLibcSNPrintfTest, NoCutOff) { EXPECT_EQ(written, 10); ASSERT_STREQ(buff, "1234567890"); } + +TEST(LlvmLibcSNPrintfTest, CharsWrittenOverflow) { + char buff[0]; + + // Trigger an overflow in the return value of snprintf by writing more than + // INT_MAX bytes. + int int_max = LIBC_NAMESPACE::cpp::numeric_limits<int>::max(); + int written = LIBC_NAMESPACE::snprintf(buff, 0, "%*stest", int_max, ""); + EXPECT_LT(written, 0); + ASSERT_ERRNO_FAILURE(); +} diff --git a/libc/test/src/stdio/vfprintf_test.cpp b/libc/test/src/stdio/vfprintf_test.cpp index f50565a0f68ca..9b5f09db8fd41 100644 --- a/libc/test/src/stdio/vfprintf_test.cpp +++ b/libc/test/src/stdio/vfprintf_test.cpp @@ -19,6 +19,8 @@ #include "src/stdio/vfprintf.h" +#include "test/UnitTest/ErrnoCheckingTest.h" +#include "test/UnitTest/ErrnoSetterMatcher.h" #include "test/UnitTest/Test.h" namespace printf_test { @@ -44,6 +46,8 @@ int call_vfprintf(::FILE *__restrict stream, const char *__restrict format, return ret; } +using LlvmLibcVFPrintfTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest; + TEST(LlvmLibcVFPrintfTest, WriteToFile) { const char *FILENAME = APPEND_LIBC_TEST("vfprintf_output.test"); auto FILE_PATH = libc_make_test_file_path(FILENAME); @@ -90,6 +94,7 @@ TEST(LlvmLibcVFPrintfTest, WriteToFile) { written = call_vfprintf(file, "Writing to a read only file should fail."); EXPECT_LT(written, 0); + ASSERT_ERRNO_EQ(EBADF); ASSERT_EQ(printf_test::fclose(file), 0); } diff --git a/libc/test/src/stdlib/StrfromTest.h b/libc/test/src/stdlib/StrfromTest.h index e82c94499aa11..fd2e0f120e90e 100644 --- a/libc/test/src/stdlib/StrfromTest.h +++ b/libc/test/src/stdlib/StrfromTest.h @@ -8,6 +8,8 @@ #include "src/__support/CPP/type_traits.h" #include "src/__support/FPUtil/FPBits.h" +#include "test/UnitTest/ErrnoCheckingTest.h" +#include "test/UnitTest/ErrnoSetterMatcher.h" #include "test/UnitTest/Test.h" #define ASSERT_STREQ_LEN(actual_written, actual_str, expected_str) \ @@ -15,7 +17,7 @@ EXPECT_STREQ(actual_str, expected_str); template <typename InputT> -class StrfromTest : public LIBC_NAMESPACE::testing::Test { +class StrfromTest : public LIBC_NAMESPACE::testing::ErrnoCheckingTest { static constexpr bool is_single_prec = LIBC_NAMESPACE::cpp::is_same<InputT, float>::value; @@ -481,6 +483,16 @@ class StrfromTest : public LIBC_NAMESPACE::testing::Test { written = func(buff, 10, "%A", -ld_nan); ASSERT_STREQ_LEN(written, buff, "-NAN"); } + + void charsWrittenOverflow(FunctionT func) { + char buff[100]; + // Trigger an overflow in the return value of strfrom by writing more than + // INT_MAX bytes. 
+    int result = func(buff, sizeof(buff), "%.2147483647f", 1.0f);
+
+    EXPECT_LT(result, 0);
+    ASSERT_ERRNO_FAILURE();
+  }
 };
 
 #define STRFROM_TEST(InputType, name, func)                                    \
@@ -501,4 +513,7 @@ class StrfromTest : public LIBC_NAMESPACE::testing::Test {
   TEST_F(LlvmLibc##name##Test, InsufficientBufferSize) {                       \
     insufficentBufsize(func);                                                  \
   }                                                                            \
-  TEST_F(LlvmLibc##name##Test, InfAndNanValues) { infNanValues(func); }
+  TEST_F(LlvmLibc##name##Test, InfAndNanValues) { infNanValues(func); }        \
+  TEST_F(LlvmLibc##name##Test, CharsWrittenOverflow) {                         \
+    charsWrittenOverflow(func);                                                \
+  }

From 6d4e75cc931a82108526dafb645f68494eb45973 Mon Sep 17 00:00:00 2001
From: Min-Yih Hsu <min.hsu@sifive.com>
Date: Tue, 4 Nov 2025 16:21:37 -0800
Subject: [PATCH 250/313] [MISched][NFC] Rename isUnbufferedGroup to isReservedGroup (#166439)

In both ScheduleDAGInstrs and MachineScheduler, we refer to
`BufferSize = 0` as _reserved_ and to `BufferSize = 1` as _unbuffered_.
This convention stems from the fact that we set
`SUnit::hasReservedResource` to true when any of the SUnit's consumed
resources has a BufferSize equal to zero, and set `SUnit::isUnbuffered`
to true when any of its consumed resources has a BufferSize equal to
one.

However, `SchedBoundary::isUnbufferedGroup` doesn't really follow this
convention: it returns true when the resource in question is a
`ProcResGroup` and its BufferSize equals **zero** rather than one. This
can be really confusing for the reader. This patch renames the function
to `isReservedGroup`, in line with the convention mentioned above. NFC.
---
 llvm/include/llvm/CodeGen/MachineScheduler.h | 2 +-
 llvm/lib/CodeGen/MachineScheduler.cpp        | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/MachineScheduler.h b/llvm/include/llvm/CodeGen/MachineScheduler.h
index 6c5c27c9662e4..7b965d400ed08 100644
--- a/llvm/include/llvm/CodeGen/MachineScheduler.h
+++ b/llvm/include/llvm/CodeGen/MachineScheduler.h
@@ -1038,7 +1038,7 @@ class SchedBoundary {
   getNextResourceCycle(const MCSchedClassDesc *SC, unsigned PIdx,
                        unsigned ReleaseAtCycle, unsigned AcquireAtCycle);
 
-  bool isUnbufferedGroup(unsigned PIdx) const {
+  bool isReservedGroup(unsigned PIdx) const {
     return SchedModel->getProcResource(PIdx)->SubUnitsIdxBegin &&
            !SchedModel->getProcResource(PIdx)->BufferSize;
   }
diff --git a/llvm/lib/CodeGen/MachineScheduler.cpp b/llvm/lib/CodeGen/MachineScheduler.cpp
index f18c051142960..73993705c4a7b 100644
--- a/llvm/lib/CodeGen/MachineScheduler.cpp
+++ b/llvm/lib/CodeGen/MachineScheduler.cpp
@@ -2559,7 +2559,7 @@ init(ScheduleDAGMI *dag, const TargetSchedModel *smodel, SchedRemainder *rem) {
   for (unsigned i = 0; i < ResourceCount; ++i) {
     ReservedCyclesIndex[i] = NumUnits;
     NumUnits += SchedModel->getProcResource(i)->NumUnits;
-    if (isUnbufferedGroup(i)) {
+    if (isReservedGroup(i)) {
       auto SubUnits = SchedModel->getProcResource(i)->SubUnitsIdxBegin;
       for (unsigned U = 0, UE = SchedModel->getProcResource(i)->NumUnits;
            U != UE; ++U)
@@ -2631,7 +2631,7 @@ SchedBoundary::getNextResourceCycle(const MCSchedClassDesc *SC, unsigned PIdx,
   assert(NumberOfInstances > 0 &&
          "Cannot have zero instances of a ProcResource");
 
-  if (isUnbufferedGroup(PIdx)) {
+  if (isReservedGroup(PIdx)) {
     // If any subunits are used by the instruction, report that the
     // subunits of the resource group are available at the first cycle
     // in which the unit is available, effectively removing the group
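[Editorial note] The convention described above boils down to the BufferSize field of the machine model's MCProcResourceDesc; a minimal sketch of the two predicates (helper names hypothetical, semantics as stated in the commit message):

    #include "llvm/MC/MCSchedule.h"

    // BufferSize == 0: "reserved" - the resource is held in-order for the
    // exact cycles the instruction occupies it.
    static bool isReserved(const llvm::MCProcResourceDesc &PRD) {
      return PRD.BufferSize == 0;
    }

    // BufferSize == 1: "unbuffered" - at most one in-flight use; a second
    // user stalls until the resource is released.
    static bool isUnbuffered(const llvm::MCProcResourceDesc &PRD) {
      return PRD.BufferSize == 1;
    }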
From fa5cd27ef0fc0b2c17a58407851e0aa0522d788b Mon Sep 17 00:00:00 2001
From: Jin Huang <jinhuang1102@gmail.com>
Date: Tue, 4 Nov 2025 16:23:34 -0800
Subject: [PATCH 251/313] [profcheck] Add unknown branch weights to expanded LL/SC loop. (#166273)

As a follow-up to PR #165841, this change addresses `prof_md` metadata
loss in AtomicExpandPass when lowering `atomicrmw xchg` to a
Load-Linked/Store-Conditional (LL/SC) loop.

This path is distinct from the LSE path addressed previously:

PR #165841 (and its tests) used `-mtriple=aarch64-linux-gnu`, which
targets a modern **ARMv8.1+** architecture. This architecture supports
**Large System Extensions (LSE)**, allowing `atomicrmw` to be lowered
directly to a more efficient hardware instruction.

This PR (and its tests) uses `-mtriple=aarch64--` or
`-mtriple=armv8-linux-gnueabihf`, which indicates an ARMv8.0-or-lower
architecture that does not support LSE. On these targets, the pass must
fall back to synthesizing a manual LL/SC loop using the `ldaxr`/`stxr`
instruction pair.

Similar to the previous issue, the new conditional branch was failing to
inherit the `prof_md` metadata. This PR attaches branch weights to the
newly created branch within the LL/SC loop, ensuring profile information
is preserved.

Co-authored-by: Jin Huang <jingold@google.com>
---
 llvm/lib/CodeGen/AtomicExpandPass.cpp         | 12 ++++++--
 .../AArch64/expand-atomicrmw-xchg-fp.ll       | 28 ++++++++++++++----
 2 files changed, 30 insertions(+), 10 deletions(-)

diff --git a/llvm/lib/CodeGen/AtomicExpandPass.cpp b/llvm/lib/CodeGen/AtomicExpandPass.cpp
index 53f1cfe24a68d..6412949948c07 100644
--- a/llvm/lib/CodeGen/AtomicExpandPass.cpp
+++ b/llvm/lib/CodeGen/AtomicExpandPass.cpp
@@ -38,6 +38,7 @@
 #include "llvm/IR/MDBuilder.h"
 #include "llvm/IR/MemoryModelRelaxationAnnotations.h"
 #include "llvm/IR/Module.h"
+#include "llvm/IR/ProfDataUtils.h"
 #include "llvm/IR/Type.h"
 #include "llvm/IR/User.h"
 #include "llvm/IR/Value.h"
@@ -1259,8 +1260,7 @@ Value *AtomicExpandImpl::insertRMWLLSCLoop(
   BasicBlock *BB = Builder.GetInsertBlock();
   Function *F = BB->getParent();
 
-  assert(AddrAlign >=
-             F->getDataLayout().getTypeStoreSize(ResultTy) &&
+  assert(AddrAlign >= F->getDataLayout().getTypeStoreSize(ResultTy) &&
          "Expected at least natural alignment at this point.");
 
   // Given: atomicrmw some_op iN* %addr, iN %incr ordering
@@ -1295,7 +1295,13 @@
       TLI->emitStoreConditional(Builder, NewVal, Addr, MemOpOrder);
   Value *TryAgain = Builder.CreateICmpNE(
       StoreSuccess, ConstantInt::get(IntegerType::get(Ctx, 32), 0), "tryagain");
-  Builder.CreateCondBr(TryAgain, LoopBB, ExitBB);
+
+  Instruction *CondBr = Builder.CreateCondBr(TryAgain, LoopBB, ExitBB);
+
+  // Atomic RMW expands to a Load-Linked / Store-Conditional loop; because it
+  // is hard to predict precise branch weights, we mark the branch as "unknown"
+  // (50/50) to prevent misleading optimizations.
+ setExplicitlyUnknownBranchWeightsIfProfiled(*CondBr, *F, DEBUG_TYPE); Builder.SetInsertPoint(ExitBB, ExitBB->begin()); return Loaded; diff --git a/llvm/test/Transforms/AtomicExpand/AArch64/expand-atomicrmw-xchg-fp.ll b/llvm/test/Transforms/AtomicExpand/AArch64/expand-atomicrmw-xchg-fp.ll index 95a52aa0f7f52..b509b2469cfdc 100644 --- a/llvm/test/Transforms/AtomicExpand/AArch64/expand-atomicrmw-xchg-fp.ll +++ b/llvm/test/Transforms/AtomicExpand/AArch64/expand-atomicrmw-xchg-fp.ll @@ -1,8 +1,8 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals ; RUN: opt -codegen-opt-level=1 -S -mtriple=aarch64-- -passes=atomic-expand %s | FileCheck %s ; RUN: opt -codegen-opt-level=1 -S -mtriple=aarch64-- -mattr=+outline-atomics -passes=atomic-expand %s | FileCheck %s --check-prefix=OUTLINE-ATOMICS -define void @atomic_swap_f16(ptr %ptr, half %val) nounwind { +define void @atomic_swap_f16(ptr %ptr, half %val) !prof !0 { ; CHECK-LABEL: @atomic_swap_f16( ; CHECK-NEXT: [[TMP1:%.*]] = bitcast half [[VAL:%.*]] to i16 ; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]] @@ -12,7 +12,7 @@ define void @atomic_swap_f16(ptr %ptr, half %val) nounwind { ; CHECK-NEXT: [[TMP4:%.*]] = zext i16 [[TMP1]] to i64 ; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.aarch64.stxr.p0(i64 [[TMP4]], ptr elementtype(i16) [[PTR]]) ; CHECK-NEXT: [[TRYAGAIN:%.*]] = icmp ne i32 [[TMP5]], 0 -; CHECK-NEXT: br i1 [[TRYAGAIN]], label [[ATOMICRMW_START]], label [[ATOMICRMW_END:%.*]] +; CHECK-NEXT: br i1 [[TRYAGAIN]], label [[ATOMICRMW_START]], label [[ATOMICRMW_END:%.*]], !prof [[PROF1:![0-9]+]] ; CHECK: atomicrmw.end: ; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[TMP3]] to half ; CHECK-NEXT: ret void @@ -27,7 +27,7 @@ define void @atomic_swap_f16(ptr %ptr, half %val) nounwind { ret void } -define void @atomic_swap_f32(ptr %ptr, float %val) nounwind { +define void @atomic_swap_f32(ptr %ptr, float %val) nounwind !prof !0 { ; CHECK-LABEL: @atomic_swap_f32( ; CHECK-NEXT: [[TMP1:%.*]] = bitcast float [[VAL:%.*]] to i32 ; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]] @@ -37,7 +37,7 @@ define void @atomic_swap_f32(ptr %ptr, float %val) nounwind { ; CHECK-NEXT: [[TMP4:%.*]] = zext i32 [[TMP1]] to i64 ; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.aarch64.stxr.p0(i64 [[TMP4]], ptr elementtype(i32) [[PTR]]) ; CHECK-NEXT: [[TRYAGAIN:%.*]] = icmp ne i32 [[TMP5]], 0 -; CHECK-NEXT: br i1 [[TRYAGAIN]], label [[ATOMICRMW_START]], label [[ATOMICRMW_END:%.*]] +; CHECK-NEXT: br i1 [[TRYAGAIN]], label [[ATOMICRMW_START]], label [[ATOMICRMW_END:%.*]], !prof [[PROF1]] ; CHECK: atomicrmw.end: ; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32 [[TMP3]] to float ; CHECK-NEXT: ret void @@ -52,7 +52,7 @@ define void @atomic_swap_f32(ptr %ptr, float %val) nounwind { ret void } -define void @atomic_swap_f64(ptr %ptr, double %val) nounwind { +define void @atomic_swap_f64(ptr %ptr, double %val) nounwind !prof !0 { ; CHECK-LABEL: @atomic_swap_f64( ; CHECK-NEXT: [[TMP1:%.*]] = bitcast double [[VAL:%.*]] to i64 ; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]] @@ -60,7 +60,7 @@ define void @atomic_swap_f64(ptr %ptr, double %val) nounwind { ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.aarch64.ldaxr.p0(ptr elementtype(i64) [[PTR:%.*]]) ; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.aarch64.stxr.p0(i64 [[TMP1]], ptr elementtype(i64) [[PTR]]) ; CHECK-NEXT: [[TRYAGAIN:%.*]] = icmp ne i32 [[TMP3]], 0 -; CHECK-NEXT: br i1 [[TRYAGAIN]], label [[ATOMICRMW_START]], label [[ATOMICRMW_END:%.*]] +; 
CHECK-NEXT: br i1 [[TRYAGAIN]], label [[ATOMICRMW_START]], label [[ATOMICRMW_END:%.*]], !prof [[PROF1]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i64 [[TMP2]] to double
 ; CHECK-NEXT:    ret void
@@ -74,3 +74,17 @@
   %t1 = atomicrmw xchg ptr %ptr, double %val acquire
   ret void
 }
+
+!0 = !{!"function_entry_count", i64 1000}
+;.
+; CHECK: attributes #[[ATTR0:[0-9]+]] = { nounwind }
+; CHECK: attributes #[[ATTR1:[0-9]+]] = { nofree nounwind willreturn }
+;.
+; OUTLINE-ATOMICS: attributes #[[ATTR0:[0-9]+]] = { "target-features"="+outline-atomics" }
+; OUTLINE-ATOMICS: attributes #[[ATTR1:[0-9]+]] = { nounwind "target-features"="+outline-atomics" }
+;.
+; CHECK: [[META0:![0-9]+]] = !{!"function_entry_count", i64 1000}
+; CHECK: [[PROF1]] = !{!"unknown", !"atomic-expand"}
+;.
+; OUTLINE-ATOMICS: [[META0:![0-9]+]] = !{!"function_entry_count", i64 1000}
+;.

From 3ebed51e997b2a517eecb53a78cc0b6ffcdc00a3 Mon Sep 17 00:00:00 2001
From: Dan Liew <dan@su-root.co.uk>
Date: Tue, 4 Nov 2025 16:25:33 -0800
Subject: [PATCH 252/313] [Clang][LLDB] Refactor trap reason demangling out of LLDB and into Clang (#165996)

This patch refactors the trap reason demangling logic in
`lldb_private::VerboseTrapFrameRecognizer::RecognizeFrame` into a new
public function `clang::CodeGen::DemangleTrapReasonInDebugInfo`.

There are two reasons for doing this:

1. In a future patch the logic for demangling needs to be used somewhere
   else in LLDB, and thus the logic needs refactoring to avoid
   duplicating code.
2. The logic for demangling shouldn't really be in LLDB anyway because
   it's a Clang implementation detail, and thus the logic really belongs
   inside Clang, not LLDB.

Unit tests have been added for the new function that demonstrate how to
use the new API.

The function names recognized by VerboseTrapFrameRecognizer are identical
to before. However, this patch isn't NFC because:

* The `lldbTarget` library now links against `clangCodeGen`, which it
  didn't previously.
* The LLDB logging output is a little different now. The previous code
  tried to log failures for an invalid regex pattern and for the
  `Regex::match` API not returning the correct number of matches. These
  failure conditions are unreachable via unit testing, so they have been
  made assertion failures inside the `DemangleTrapReasonInDebugInfo`
  implementation instead of trying to log them in LLDB.

rdar://163230807
---
 clang/include/clang/CodeGen/ModuleBuilder.h   | 17 +++++
 clang/lib/CodeGen/ModuleBuilder.cpp           | 29 ++++++++
 clang/unittests/CodeGen/CMakeLists.txt        |  1 +
 .../CodeGen/DemangleTrapReasonInDebugInfo.cpp | 67 +++++++++++++++++++
 .../LanguageRuntime/CPlusPlus/CMakeLists.txt  |  2 +
 .../CPlusPlus/VerboseTrapFrameRecognizer.cpp  | 31 ++-------
 6 files changed, 122 insertions(+), 25 deletions(-)
 create mode 100644 clang/unittests/CodeGen/DemangleTrapReasonInDebugInfo.cpp

diff --git a/clang/include/clang/CodeGen/ModuleBuilder.h b/clang/include/clang/CodeGen/ModuleBuilder.h
index f1b8229edd362..4298ba06c472e 100644
--- a/clang/include/clang/CodeGen/ModuleBuilder.h
+++ b/clang/include/clang/CodeGen/ModuleBuilder.h
@@ -120,6 +120,23 @@
 CodeGenerator *CreateLLVMCodeGen(DiagnosticsEngine &Diags, llvm::LLVMContext &C,
                                  CoverageSourceInfo *CoverageInfo = nullptr);
 
+namespace CodeGen {
+/// Demangle the artificial function name (\p FuncName) used to encode trap
+/// reasons in debug info (e.g. for __builtin_verbose_trap). See
+/// `CGDebugInfo::CreateTrapFailureMessageFor`.
+///
+/// \param FuncName - The function name to demangle.
+///
+/// \return A std::optional. If demangling succeeds the optional will contain
+/// a pair of StringRefs where the first field is the trap category and the
+/// second is the trap message. These can both be empty. If demangling fails
+/// the optional will not contain a value. Note the returned StringRefs, if
+/// non-empty, point into the underlying storage of \p FuncName and thus have
+/// the same lifetime.
+std::optional<std::pair<StringRef, StringRef>>
+DemangleTrapReasonInDebugInfo(StringRef FuncName);
+} // namespace CodeGen
+
 } // end namespace clang
 
 #endif
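[Editorial note] A minimal caller-side sketch of the new API, mirroring the unit tests and the LLDB use later in this patch (the mangled name shown is hypothetical but follows the `<prefix>$<category>$<message>` scheme):

    #include "clang/CodeGen/ModuleBuilder.h"
    #include "llvm/Support/raw_ostream.h"

    static void printTrapReason(llvm::StringRef FuncName) {
      // e.g. FuncName == "__clang_trap_msg$Undefined Behavior$null deref"
      if (auto TrapReason =
              clang::CodeGen::DemangleTrapReasonInDebugInfo(FuncName)) {
        auto [Category, Message] = *TrapReason;
        // Category and Message alias FuncName's storage; keep it alive.
        llvm::errs() << Category << ": " << Message << '\n';
      }
    }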
+/// +/// \param FuncName - The function name to demangle. +/// +/// \return A std::optional. If demangling succeeds the optional will contain +/// a pair of StringRefs where the first field is the trap category and the +/// second is the trap message. These can both be empty. If demangling fails the +/// optional will not contain a value. Note the returned StringRefs if non-empty +/// point into the underlying storage for \param FuncName and thus have the same +/// lifetime. +std::optional<std::pair<StringRef, StringRef>> +DemangleTrapReasonInDebugInfo(StringRef FuncName); +} // namespace CodeGen + } // end namespace clang #endif diff --git a/clang/lib/CodeGen/ModuleBuilder.cpp b/clang/lib/CodeGen/ModuleBuilder.cpp index 96f3f6221e20f..8ec8aef311656 100644 --- a/clang/lib/CodeGen/ModuleBuilder.cpp +++ b/clang/lib/CodeGen/ModuleBuilder.cpp @@ -23,6 +23,7 @@ #include "llvm/IR/DataLayout.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" +#include "llvm/Support/FormatVariadic.h" #include "llvm/Support/VirtualFileSystem.h" #include <memory> @@ -378,3 +379,31 @@ clang::CreateLLVMCodeGen(DiagnosticsEngine &Diags, llvm::StringRef ModuleName, HeaderSearchOpts, PreprocessorOpts, CGO, C, CoverageInfo); } + +namespace clang { +namespace CodeGen { +std::optional<std::pair<StringRef, StringRef>> +DemangleTrapReasonInDebugInfo(StringRef FuncName) { + static auto TrapRegex = + llvm::Regex(llvm::formatv("^{0}\\$(.*)\\$(.*)$", ClangTrapPrefix).str()); + llvm::SmallVector<llvm::StringRef, 3> Matches; + std::string *ErrorPtr = nullptr; +#ifndef NDEBUG + std::string Error; + ErrorPtr = &Error; +#endif + if (!TrapRegex.match(FuncName, &Matches, ErrorPtr)) { + assert(ErrorPtr && ErrorPtr->empty() && "Invalid regex pattern"); + return {}; + } + + if (Matches.size() != 3) { + assert(0 && "Expected 3 matches from Regex::match"); + return {}; + } + + // Returns { Trap Category, Trap Message } + return std::make_pair(Matches[1], Matches[2]); +} +} // namespace CodeGen +} // namespace clang diff --git a/clang/unittests/CodeGen/CMakeLists.txt b/clang/unittests/CodeGen/CMakeLists.txt index f5bcecb0b08a3..d4efb2230a054 100644 --- a/clang/unittests/CodeGen/CMakeLists.txt +++ b/clang/unittests/CodeGen/CMakeLists.txt @@ -1,6 +1,7 @@ add_clang_unittest(ClangCodeGenTests BufferSourceTest.cpp CodeGenExternalTest.cpp + DemangleTrapReasonInDebugInfo.cpp TBAAMetadataTest.cpp CheckTargetFeaturesTest.cpp CLANG_LIBS diff --git a/clang/unittests/CodeGen/DemangleTrapReasonInDebugInfo.cpp b/clang/unittests/CodeGen/DemangleTrapReasonInDebugInfo.cpp new file mode 100644 index 0000000000000..17bfe17c31d65 --- /dev/null +++ b/clang/unittests/CodeGen/DemangleTrapReasonInDebugInfo.cpp @@ -0,0 +1,67 @@ +//=== unittests/CodeGen/DemangleTrapReasonInDebugInfo.cpp -----------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "clang/CodeGen/ModuleBuilder.h" +#include "llvm/ADT/StringRef.h" +#include "gtest/gtest.h" + +using namespace clang::CodeGen; + +void CheckValidCommon(llvm::StringRef FuncName, const char *ExpectedCategory, + const char *ExpectedMessage) { + auto MaybeTrapReason = DemangleTrapReasonInDebugInfo(FuncName); + ASSERT_TRUE(MaybeTrapReason.has_value()); + auto [Category, Message] = MaybeTrapReason.value(); + ASSERT_STREQ(Category.str().c_str(), ExpectedCategory); + ASSERT_STREQ(Message.str().c_str(), ExpectedMessage); +} + +void CheckInvalidCommon(llvm::StringRef FuncName) { + auto MaybeTrapReason = DemangleTrapReasonInDebugInfo(FuncName); + ASSERT_TRUE(!MaybeTrapReason.has_value()); +} + +TEST(DemangleTrapReasonInDebugInfo, Valid) { + std::string FuncName(ClangTrapPrefix); + FuncName += "$trap category$trap message"; + CheckValidCommon(FuncName, "trap category", "trap message"); +} + +TEST(DemangleTrapReasonInDebugInfo, ValidEmptyCategory) { + std::string FuncName(ClangTrapPrefix); + FuncName += "$$trap message"; + CheckValidCommon(FuncName, "", "trap message"); +} + +TEST(DemangleTrapReasonInDebugInfo, ValidEmptyMessage) { + std::string FuncName(ClangTrapPrefix); + FuncName += "$trap category$"; + CheckValidCommon(FuncName, "trap category", ""); +} + +TEST(DemangleTrapReasonInDebugInfo, ValidAllEmpty) { + // `__builtin_verbose_trap` actually allows this + // currently. However, we should probably disallow this in Sema because having + // an empty category and message completely defeats the point of using the + // builtin (#165981). + std::string FuncName(ClangTrapPrefix); + FuncName += "$$"; + CheckValidCommon(FuncName, "", ""); +} + +TEST(DemangleTrapReasonInDebugInfo, InvalidOnlyPrefix) { + std::string FuncName(ClangTrapPrefix); + CheckInvalidCommon(FuncName); +} + +TEST(DemangleTrapReasonInDebugInfo, Invalid) { + std::string FuncName("foo"); + CheckInvalidCommon(FuncName); +} + +TEST(DemangleTrapReasonInDebugInfo, InvalidEmpty) { CheckInvalidCommon(""); } diff --git a/lldb/source/Plugins/LanguageRuntime/CPlusPlus/CMakeLists.txt b/lldb/source/Plugins/LanguageRuntime/CPlusPlus/CMakeLists.txt index a27bceffe2e3a..727c8290bceb4 100644 --- a/lldb/source/Plugins/LanguageRuntime/CPlusPlus/CMakeLists.txt +++ b/lldb/source/Plugins/LanguageRuntime/CPlusPlus/CMakeLists.txt @@ -6,6 +6,8 @@ add_lldb_library(lldbPluginCPPRuntime lldbCore lldbSymbol lldbTarget + CLANG_LIBS + clangCodeGen ) add_subdirectory(ItaniumABI) diff --git a/lldb/source/Plugins/LanguageRuntime/CPlusPlus/VerboseTrapFrameRecognizer.cpp b/lldb/source/Plugins/LanguageRuntime/CPlusPlus/VerboseTrapFrameRecognizer.cpp index 730aba5b42a3e..2b6bf2cd470e6 100644 --- a/lldb/source/Plugins/LanguageRuntime/CPlusPlus/VerboseTrapFrameRecognizer.cpp +++ b/lldb/source/Plugins/LanguageRuntime/CPlusPlus/VerboseTrapFrameRecognizer.cpp @@ -95,33 +95,14 @@ VerboseTrapFrameRecognizer::RecognizeFrame(lldb::StackFrameSP frame_sp) { if (func_name.empty()) return {}; - static auto trap_regex = - llvm::Regex(llvm::formatv("^{0}\\$(.*)\\$(.*)$", ClangTrapPrefix).str()); - SmallVector<llvm::StringRef, 3> matches; - std::string regex_err_msg; - if (!trap_regex.match(func_name, &matches, ®ex_err_msg)) { - LLDB_LOGF(GetLog(LLDBLog::Unwind), - "Failed to parse match trap regex for '%s': %s", func_name.data(), - regex_err_msg.c_str()); - - return {}; - } - - // For `__clang_trap_msg$category$message$` we expect 
3 matches: - // 1. entire string - // 2. category - // 3. message - if (matches.size() != 3) { - LLDB_LOGF(GetLog(LLDBLog::Unwind), - "Unexpected function name format. Expected '<trap prefix>$<trap " - "category>$<trap message>'$ but got: '%s'.", - func_name.data()); - + auto maybe_trap_reason = + clang::CodeGen::DemangleTrapReasonInDebugInfo(func_name); + if (!maybe_trap_reason.has_value()) { + LLDB_LOGF(GetLog(LLDBLog::Unwind), "Failed to demangle '%s' as trap reason", + func_name.str().c_str()); return {}; } - - auto category = matches[1]; - auto message = matches[2]; + auto [category, message] = maybe_trap_reason.value(); std::string stop_reason = category.empty() ? "<empty category>" : category.str(); From d54793113798f74424f21b94fe74bd675a11e801 Mon Sep 17 00:00:00 2001 From: Mircea Trofin <mtrofin@google.com> Date: Tue, 4 Nov 2025 16:39:12 -0800 Subject: [PATCH 253/313] [SLU][profcheck] create likely branch weights for guard->branch (#164271) The `llvm.experimental.guard` intrinsic is a `call`, so its metadata - if present - would be one value (as per `Verifier::visitProfMetadata`). That wouldn't be a correct `branch_weights` metadata. Likely, `GI->getMetadata(LLVMContext::MD_prof)` was always `nullptr`. We can bias away from deopt instead. Issue #147390 --- .../Transforms/Scalar/SimpleLoopUnswitch.cpp | 12 ++++++--- .../Transforms/SimpleLoopUnswitch/guards.ll | 26 ++++++++++++------- 2 files changed, 25 insertions(+), 13 deletions(-) diff --git a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp index 239526e85e1fd..86b2090081ed0 100644 --- a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp +++ b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp @@ -40,6 +40,7 @@ #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/MDBuilder.h" #include "llvm/IR/Module.h" #include "llvm/IR/PatternMatch.h" #include "llvm/IR/ProfDataUtils.h" @@ -2831,9 +2832,14 @@ static BranchInst *turnGuardIntoBranch(IntrinsicInst *GI, Loop &L, MSSAU->getMemorySSA()->verifyMemorySSA(); DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager); - Instruction *DeoptBlockTerm = - SplitBlockAndInsertIfThen(GI->getArgOperand(0), GI, true, - GI->getMetadata(LLVMContext::MD_prof), &DTU, &LI); + // llvm.experimental.guard doesn't have branch weights. We can assume, + // however, that the deopt path is unlikely. + Instruction *DeoptBlockTerm = SplitBlockAndInsertIfThen( + GI->getArgOperand(0), GI, true, + !ProfcheckDisableMetadataFixes && EstimateProfile + ? MDBuilder(GI->getContext()).createUnlikelyBranchWeights() + : nullptr, + &DTU, &LI); BranchInst *CheckBI = cast<BranchInst>(CheckBB->getTerminator()); // SplitBlockAndInsertIfThen inserts control flow that branches to // DeoptBlockTerm if the condition is true. We want the opposite. 
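A minimal sketch of the MDBuilder helper the hunk above leans on; this is editorial, not part of the diff, and emitDeoptBiasedBranch, Cond, ThenBB, and ElseBB are hypothetical names:

#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/MDBuilder.h"

// Emit a conditional branch whose first (true) successor is biased as
// almost never taken. createUnlikelyBranchWeights() yields
// !{!"branch_weights", i32 1, i32 1048575}.
llvm::BranchInst *emitDeoptBiasedBranch(llvm::IRBuilder<> &Builder,
                                        llvm::Value *Cond,
                                        llvm::BasicBlock *ThenBB,
                                        llvm::BasicBlock *ElseBB) {
  llvm::MDBuilder MDB(Builder.getContext());
  llvm::MDNode *Weights = MDB.createUnlikelyBranchWeights();
  return Builder.CreateCondBr(Cond, ThenBB, ElseBB, Weights);
}

The patch hands the same metadata to SplitBlockAndInsertIfThen and then flips the successors (per the comment above), which is why the test below checks for !{!"branch_weights", i32 1048575, i32 1}.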
diff --git a/llvm/test/Transforms/SimpleLoopUnswitch/guards.ll b/llvm/test/Transforms/SimpleLoopUnswitch/guards.ll index 706b49df14749..42b32e769d8d7 100644 --- a/llvm/test/Transforms/SimpleLoopUnswitch/guards.ll +++ b/llvm/test/Transforms/SimpleLoopUnswitch/guards.ll @@ -1,15 +1,15 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 5 ; RUN: opt -passes='loop(simple-loop-unswitch<nontrivial>),verify<loops>' -simple-loop-unswitch-guards -S < %s | FileCheck %s ; RUN: opt -passes='simple-loop-unswitch<nontrivial>' -simple-loop-unswitch-guards -S < %s | FileCheck %s ; RUN: opt -passes='loop-mssa(simple-loop-unswitch<nontrivial>),verify<loops>' -simple-loop-unswitch-guards -verify-memoryssa -verify-loop-info -S < %s | FileCheck %s declare void @llvm.experimental.guard(i1, ...) -define void @test_simple_case(i1 %cond, i32 %N) { +define void @test_simple_case(i1 %cond, i32 %N) !prof !0 { ; CHECK-LABEL: define void @test_simple_case( -; CHECK-SAME: i1 [[COND:%.*]], i32 [[N:%.*]]) { +; CHECK-SAME: i1 [[COND:%.*]], i32 [[N:%.*]]) !prof [[PROF0:![0-9]+]] { ; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: br i1 [[COND]], label %[[ENTRY_SPLIT_US:.*]], label %[[ENTRY_SPLIT:.*]] +; CHECK-NEXT: br i1 [[COND]], label %[[ENTRY_SPLIT_US:.*]], label %[[ENTRY_SPLIT:.*]], !prof [[PROF1:![0-9]+]] ; CHECK: [[ENTRY_SPLIT_US]]: ; CHECK-NEXT: br label %[[LOOP_US:.*]] ; CHECK: [[LOOP_US]]: @@ -50,9 +50,9 @@ define void @test_two_guards(i1 %cond1, i1 %cond2, i32 %N) { ; CHECK-LABEL: define void @test_two_guards( ; CHECK-SAME: i1 [[COND1:%.*]], i1 [[COND2:%.*]], i32 [[N:%.*]]) { ; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: br i1 [[COND1]], label %[[ENTRY_SPLIT_US:.*]], label %[[ENTRY_SPLIT:.*]] +; CHECK-NEXT: br i1 [[COND1]], label %[[ENTRY_SPLIT_US:.*]], label %[[ENTRY_SPLIT:.*]], !prof [[PROF1]] ; CHECK: [[ENTRY_SPLIT_US]]: -; CHECK-NEXT: br i1 [[COND2]], label %[[ENTRY_SPLIT_US_SPLIT_US:.*]], label %[[ENTRY_SPLIT_US_SPLIT:.*]] +; CHECK-NEXT: br i1 [[COND2]], label %[[ENTRY_SPLIT_US_SPLIT_US:.*]], label %[[ENTRY_SPLIT_US_SPLIT:.*]], !prof [[PROF1]] ; CHECK: [[ENTRY_SPLIT_US_SPLIT_US]]: ; CHECK-NEXT: br label %[[LOOP_US_US:.*]] ; CHECK: [[LOOP_US_US]]: @@ -108,7 +108,7 @@ define void @test_conditional_guards(i1 %cond, i32 %N) { ; CHECK-SAME: i1 [[COND:%.*]], i32 [[N:%.*]]) { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[COND_FR:%.*]] = freeze i1 [[COND]] -; CHECK-NEXT: br i1 [[COND_FR]], label %[[ENTRY_SPLIT_US:.*]], label %[[ENTRY_SPLIT:.*]] +; CHECK-NEXT: br i1 [[COND_FR]], label %[[ENTRY_SPLIT_US:.*]], label %[[ENTRY_SPLIT:.*]], !prof [[PROF1]] ; CHECK: [[ENTRY_SPLIT_US]]: ; CHECK-NEXT: br label %[[LOOP_US:.*]] ; CHECK: [[LOOP_US]]: @@ -171,7 +171,7 @@ define void @test_nested_loop(i1 %cond, i32 %N, i1 %arg) { ; CHECK-LABEL: define void @test_nested_loop( ; CHECK-SAME: i1 [[COND:%.*]], i32 [[N:%.*]], i1 [[ARG:%.*]]) { ; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: br i1 [[COND]], label %[[ENTRY_SPLIT:.*]], label %[[OUTER_LOOP_SPLIT:.*]] +; CHECK-NEXT: br i1 [[COND]], label %[[ENTRY_SPLIT:.*]], label %[[OUTER_LOOP_SPLIT:.*]], !prof [[PROF1]] ; CHECK: [[ENTRY_SPLIT]]: ; CHECK-NEXT: br i1 [[ARG]], label %[[ENTRY_SPLIT_SPLIT_US:.*]], label %[[ENTRY_SPLIT_SPLIT:.*]] ; CHECK: [[ENTRY_SPLIT_SPLIT_US]]: @@ -243,7 +243,7 @@ define void @test_sibling_loops(i1 %cond1, i1 %cond2, i32 %N) { ; CHECK-LABEL: define void @test_sibling_loops( ; CHECK-SAME: i1 [[COND1:%.*]], i1 
[[COND2:%.*]], i32 [[N:%.*]]) { ; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: br i1 [[COND1]], label %[[ENTRY_SPLIT_US:.*]], label %[[ENTRY_SPLIT:.*]] +; CHECK-NEXT: br i1 [[COND1]], label %[[ENTRY_SPLIT_US:.*]], label %[[ENTRY_SPLIT:.*]], !prof [[PROF1]] ; CHECK: [[ENTRY_SPLIT_US]]: ; CHECK-NEXT: br label %[[LOOP1_US:.*]] ; CHECK: [[LOOP1_US]]: @@ -263,7 +263,7 @@ define void @test_sibling_loops(i1 %cond1, i1 %cond2, i32 %N) { ; CHECK-NEXT: call void (i1, ...) @llvm.experimental.guard(i1 false) [ "deopt"() ] ; CHECK-NEXT: unreachable ; CHECK: [[BETWEEN]]: -; CHECK-NEXT: br i1 [[COND2]], label %[[BETWEEN_SPLIT_US2:.*]], label %[[BETWEEN_SPLIT:.*]] +; CHECK-NEXT: br i1 [[COND2]], label %[[BETWEEN_SPLIT_US2:.*]], label %[[BETWEEN_SPLIT:.*]], !prof [[PROF1]] ; CHECK: [[BETWEEN_SPLIT_US2]]: ; CHECK-NEXT: br label %[[LOOP2_US:.*]] ; CHECK: [[LOOP2_US]]: @@ -343,3 +343,9 @@ exit: declare void @may_throw(i32 %i) declare i32 @__CxxFrameHandler3(...) + +!0 = !{!"function_entry_count", i32 10} +;. +; CHECK: [[PROF0]] = !{!"function_entry_count", i32 10} +; CHECK: [[PROF1]] = !{!"branch_weights", i32 1048575, i32 1} +;. From 02f5a1aa07f1654a28d075dec6fc0ac5da0801b6 Mon Sep 17 00:00:00 2001 From: Mircea Trofin <mtrofin@google.com> Date: Tue, 4 Nov 2025 16:39:26 -0800 Subject: [PATCH 254/313] [ADT] Introduce Casting function objects (#165803) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adding casting function objects as a convenience for expressing e.g. `auto AsDoubles = map_range(RangeOfInts, StaticCastTo<double>)` --- llvm/include/llvm/Support/Casting.h | 50 +++++++++++++++++++++++++++++ llvm/unittests/Support/Casting.cpp | 41 +++++++++++++++++++++++ 2 files changed, 91 insertions(+) diff --git a/llvm/include/llvm/Support/Casting.h b/llvm/include/llvm/Support/Casting.h index 6f6df2e9703ea..a6435a2562a2b 100644 --- a/llvm/include/llvm/Support/Casting.h +++ b/llvm/include/llvm/Support/Casting.h @@ -816,6 +816,42 @@ template <typename... Types> struct IsaAndPresentCheckPredicate { return isa_and_present<Types...>(Val); } }; + +//===----------------------------------------------------------------------===// +// Casting Function Objects +//===----------------------------------------------------------------------===// + +/// Usable in generic algorithms like map_range +template <typename U> struct StaticCastFunc { + template <typename T> decltype(auto) operator()(T &&Val) const { + return static_cast<U>(Val); + } +}; + +template <typename U> struct DynCastFunc { + template <typename T> decltype(auto) operator()(T &&Val) const { + return dyn_cast<U>(Val); + } +}; + +template <typename U> struct CastFunc { + template <typename T> decltype(auto) operator()(T &&Val) const { + return cast<U>(Val); + } +}; + +template <typename U> struct CastIfPresentFunc { + template <typename T> decltype(auto) operator()(T &&Val) const { + return cast_if_present<U>(Val); + } +}; + +template <typename U> struct DynCastIfPresentFunc { + template <typename T> decltype(auto) operator()(T &&Val) const { + return dyn_cast_if_present<U>(Val); + } +}; + } // namespace detail /// Function object wrapper for the `llvm::isa` type check. The function call @@ -841,6 +877,20 @@ template <typename... Types> inline constexpr detail::IsaAndPresentCheckPredicate<Types...> IsaAndPresentPred{}; +/// Function objects corresponding to the Cast types defined above.
+template <typename From> +inline constexpr detail::StaticCastFunc<From> StaticCastTo{}; + +template <typename From> inline constexpr detail::CastFunc<From> CastTo{}; + +template <typename From> +inline constexpr detail::CastIfPresentFunc<From> CastIfPresentTo{}; + +template <typename From> +inline constexpr detail::DynCastIfPresentFunc<From> DynCastIfPresentTo{}; + +template <typename From> inline constexpr detail::DynCastFunc<From> DynCastTo{}; + } // end namespace llvm #endif // LLVM_SUPPORT_CASTING_H diff --git a/llvm/unittests/Support/Casting.cpp b/llvm/unittests/Support/Casting.cpp index 790675083614b..0df8b9fcab452 100644 --- a/llvm/unittests/Support/Casting.cpp +++ b/llvm/unittests/Support/Casting.cpp @@ -561,6 +561,47 @@ TEST(CastingTest, assertion_check_unique_ptr) { << "Invalid cast of const ref did not cause an abort()"; } +TEST(Casting, StaticCastPredicate) { + uint32_t Value = 1; + + static_assert( + std::is_same_v<decltype(StaticCastTo<uint64_t>(Value)), uint64_t>); +} + +TEST(Casting, LLVMRTTIPredicates) { + struct Base { + enum Kind { BK_Base, BK_Derived }; + const Kind K; + Base(Kind K = BK_Base) : K(K) {} + Kind getKind() const { return K; } + virtual ~Base() = default; + }; + + struct Derived : Base { + Derived() : Base(BK_Derived) {} + static bool classof(const Base *B) { return B->getKind() == BK_Derived; } + bool Field = false; + }; + + Base B; + Derived D; + Base *BD = &D; + Base *Null = nullptr; + + // Pointers. + EXPECT_EQ(DynCastTo<Derived>(BD), &D); + EXPECT_EQ(CastTo<Derived>(BD), &D); + EXPECT_EQ(DynCastTo<Derived>(&B), nullptr); + EXPECT_EQ(CastIfPresentTo<Derived>(BD), &D); + EXPECT_EQ(CastIfPresentTo<Derived>(Null), nullptr); + EXPECT_EQ(DynCastIfPresentTo<Derived>(BD), &D); + EXPECT_EQ(DynCastIfPresentTo<Derived>(Null), nullptr); + + Base &R = D; + CastTo<Derived>(R).Field = true; + EXPECT_TRUE(D.Field); +} + } // end namespace assertion_checks #endif } // end namespace From 4209e41c4c1ba7c9595360c4c0ab3e85d13000f2 Mon Sep 17 00:00:00 2001 From: Michael Jones <michaelrj@google.com> Date: Tue, 4 Nov 2025 16:42:11 -0800 Subject: [PATCH 255/313] [libc] Fix printf long double bugs (#166474) Found in testing against abseil. Two bugs were found: 1) SHIFT_AMOUNT in float_converter<long double> would sometimes be negative, causing an underflow when it was passed as the left-shift amount to BigInt. 2) is_lowest_block had an off-by-one because it was adding 1 to the block index. Both are fixed and there are new tests to catch regressions. --- libc/src/__support/float_to_string.h | 8 ++++++-- libc/test/src/stdio/sprintf_test.cpp | 12 ++++++++++++ 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/libc/src/__support/float_to_string.h b/libc/src/__support/float_to_string.h index cab146a5b8698..9115ed2856881 100644 --- a/libc/src/__support/float_to_string.h +++ b/libc/src/__support/float_to_string.h @@ -700,7 +700,11 @@ template <> class FloatToString<long double> { const int SHIFT_AMOUNT = FLOAT_AS_INT_WIDTH + exponent; static_assert(EXTRA_INT_WIDTH >= sizeof(long double) * 8); - float_as_fixed <<= SHIFT_AMOUNT; + if (SHIFT_AMOUNT > 0) { + float_as_fixed <<= SHIFT_AMOUNT; + } else { + float_as_fixed >>= -SHIFT_AMOUNT; + } // If there are still digits above the decimal point, handle those. if (cpp::countl_zero(float_as_fixed) < @@ -769,7 +773,7 @@ template <> class FloatToString<long double> { // The decimal representation of 2**(-i) will have exactly i digits after // the decimal point.
const int num_requested_digits = - static_cast<int>((negative_block_index + 1) * BLOCK_SIZE); + static_cast<int>(negative_block_index * BLOCK_SIZE); return num_requested_digits > -exponent; } diff --git a/libc/test/src/stdio/sprintf_test.cpp b/libc/test/src/stdio/sprintf_test.cpp index f1b545ba546f9..42fdd59cf4d9c 100644 --- a/libc/test/src/stdio/sprintf_test.cpp +++ b/libc/test/src/stdio/sprintf_test.cpp @@ -1537,6 +1537,14 @@ TEST(LlvmLibcSPrintfTest, FloatDecimalLongDoubleConv) { #if defined(LIBC_TYPES_LONG_DOUBLE_IS_X86_FLOAT80) #ifndef LIBC_COPT_FLOAT_TO_STR_REDUCED_PRECISION + written = LIBC_NAMESPACE::sprintf( + buff, "%.75Lf", + 0.0833333333333333333355920878593448009041821933351457118988037109375L); + ASSERT_STREQ_LEN(written, buff, + "0." + "08333333333333333333559208785934480090418219333514571189880" + "3710937500000000"); + written = LIBC_NAMESPACE::sprintf(buff, "%Lf", 1e100L); ASSERT_STREQ_LEN(written, buff, "99999999999999999996693535322073426194986990198284960792713" @@ -2976,6 +2984,10 @@ TEST(LlvmLibcSPrintfTest, FloatAutoLongDoubleConv) { written = LIBC_NAMESPACE::sprintf(buff, "%Lg", 0xf.fffffffffffffffp+16380L); ASSERT_STREQ_LEN(written, buff, "1.18973e+4932"); + // Minimum normal + written = LIBC_NAMESPACE::sprintf(buff, "%Lg", 3.36210314311209350626E-4932L); + ASSERT_STREQ_LEN(written, buff, "3.3621e-4932"); + written = LIBC_NAMESPACE::sprintf(buff, "%Lg", 0xa.aaaaaaaaaaaaaabp-7L); ASSERT_STREQ_LEN(written, buff, "0.0833333"); From 2b60b6d9639c240ebafc8b517425453e50e14a7b Mon Sep 17 00:00:00 2001 From: Paul Kirth <paulkirth@google.com> Date: Tue, 4 Nov 2025 16:46:09 -0800 Subject: [PATCH 256/313] [llvm][mustache] Avoid extra allocations in parseSection (#159199) We don't need to have extra allocations when concatenating raw bodies. --- llvm/lib/Support/Mustache.cpp | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Support/Mustache.cpp b/llvm/lib/Support/Mustache.cpp index 9eb1ec2b8425c..6c140be59fc4b 100644 --- a/llvm/lib/Support/Mustache.cpp +++ b/llvm/lib/Support/Mustache.cpp @@ -599,9 +599,16 @@ void Parser::parseSection(ASTNode *Parent, ASTNode::Type Ty, size_t Start = CurrentPtr; parseMustache(CurrentNode); const size_t End = CurrentPtr - 1; + + size_t RawBodySize = 0; + for (size_t I = Start; I < End; ++I) + RawBodySize += Tokens[I].RawBody.size(); + SmallString<128> RawBody; - for (std::size_t I = Start; I < End; I++) + RawBody.reserve(RawBodySize); + for (std::size_t I = Start; I < End; ++I) RawBody += Tokens[I].RawBody; + CurrentNode->setRawBody(Ctx.Saver.save(StringRef(RawBody))); Parent->addChild(CurrentNode); } From b296e8fdf847f0041455e495b13ffb7aa6592e56 Mon Sep 17 00:00:00 2001 From: Aiden Grossman <aidengrossman@google.com> Date: Wed, 5 Nov 2025 00:50:14 +0000 Subject: [PATCH 257/313] [ProfCheck] Disable X86 AMX Test Case 4776451693f4a6bd18e50106edb4b3cfa766484f broke this because it started running an existing pass using the NewPM, which caused ProfCheck to catch existing issues. Disable it for now because we have not started looking at anything in the Codegen pipeline. This pass is also only enabled at O0 or if a function has optnone, so not super critical. 
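An aside on the Mustache change above (PATCH 256): the pattern it adopts, one pass to size the output followed by a reserve() before the appends, is a general way to avoid repeated reallocation when concatenating many fragments. A minimal generic sketch of the same idiom; concatParts and Parts are hypothetical names:

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/StringRef.h"
#include <cstddef>

llvm::SmallString<128> concatParts(llvm::ArrayRef<llvm::StringRef> Parts) {
  // First pass: compute the total size so the buffer grows at most once.
  std::size_t TotalSize = 0;
  for (llvm::StringRef Part : Parts)
    TotalSize += Part.size();

  llvm::SmallString<128> Result;
  Result.reserve(TotalSize);
  // Second pass: append without intermediate reallocations.
  for (llvm::StringRef Part : Parts)
    Result += Part;
  return Result;
}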
--- llvm/utils/profcheck-xfail.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/utils/profcheck-xfail.txt b/llvm/utils/profcheck-xfail.txt index d7af3a7ecbdee..53757c73fb8a6 100644 --- a/llvm/utils/profcheck-xfail.txt +++ b/llvm/utils/profcheck-xfail.txt @@ -75,6 +75,7 @@ CodeGen/Hexagon/masked_gather.ll CodeGen/NVPTX/lower-ctor-dtor.ll CodeGen/RISCV/zmmul.ll CodeGen/WebAssembly/memory-interleave.ll +CodeGen/X86/AMX/amx-low-intrinsics.ll CodeGen/X86/masked_gather_scatter.ll CodeGen/X86/nocfivalue.ll DebugInfo/AArch64/ir-outliner.ll From 15b19c732172d2d3cfbc108a268cbba03eed734f Mon Sep 17 00:00:00 2001 From: Michael Jones <michaelrj@google.com> Date: Tue, 4 Nov 2025 16:56:00 -0800 Subject: [PATCH 258/313] [libc] Fix fprintf_test assuming specific errnos. (#166479) The patch #166382 fixed most of these, but missed the fprintf_test ones. --- libc/test/src/stdio/fprintf_test.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libc/test/src/stdio/fprintf_test.cpp b/libc/test/src/stdio/fprintf_test.cpp index 1b35a09645939..2cea7f554ce38 100644 --- a/libc/test/src/stdio/fprintf_test.cpp +++ b/libc/test/src/stdio/fprintf_test.cpp @@ -83,7 +83,7 @@ TEST(LlvmLibcFPrintfTest, WriteToFile) { written = LIBC_NAMESPACE::fprintf(file, "Writing to a read only file should fail."); EXPECT_LT(written, 0); - ASSERT_ERRNO_EQ(EBADF); + ASSERT_ERRNO_FAILURE(); ASSERT_EQ(printf_test::fclose(file), 0); } @@ -100,7 +100,7 @@ TEST(LlvmLibcFPrintfTest, NullPtrCheck) { int ret = LIBC_NAMESPACE::fprintf(file, "hello %n", static_cast<int *>(nullptr)); EXPECT_LT(ret, 0); - ASSERT_ERRNO_EQ(EINVAL); + ASSERT_ERRNO_FAILURE(); ASSERT_EQ(printf_test::fclose(file), 0); } From 1d0aa6c2ad62f0bb6cd2ddbde07251c5702d29f4 Mon Sep 17 00:00:00 2001 From: Amir Ayupov <aaupov@fb.com> Date: Tue, 4 Nov 2025 17:01:25 -0800 Subject: [PATCH 259/313] [BOLT] Fix impute-fall-throughs (#166305) BOLT expects pre-aggregated profile entries to be unique, which holds for externally aggregated traces (or branches+fall-through ranges). Therefore, BOLT doesn't merge duplicate entries for faster processing. However, such traces are not expressly prohibited and could come from concatenated pre-aggregated profiles or otherwise. Relax the assumption about no duplicate (branch-only) traces in fall-through imputing. Test Plan: updated callcont-fallthru.s --- bolt/lib/Profile/DataAggregator.cpp | 14 +++++++------- bolt/test/X86/callcont-fallthru.s | 9 +++++++++ 2 files changed, 16 insertions(+), 7 deletions(-) diff --git a/bolt/lib/Profile/DataAggregator.cpp b/bolt/lib/Profile/DataAggregator.cpp index 4e062038a3e4c..8554683bc3cf8 100644 --- a/bolt/lib/Profile/DataAggregator.cpp +++ b/bolt/lib/Profile/DataAggregator.cpp @@ -564,13 +564,18 @@ void DataAggregator::imputeFallThroughs() { // Skip fall-throughs in external code. if (Trace.From == Trace::EXTERNAL) continue; - std::pair CurrentBranch(Trace.Branch, Trace.From); + if (std::pair CurrentBranch(Trace.Branch, Trace.From); + CurrentBranch != PrevBranch) { + // New group: reset aggregates. + AggregateCount = AggregateFallthroughSize = 0; + PrevBranch = CurrentBranch; + } // BR_ONLY must be the last trace in the group if (Trace.To == Trace::BR_ONLY) { // If the group is not empty, use aggregate values, otherwise 0-length // for unconditional jumps (call/ret/uncond branch) or 1-length for others uint64_t InferredBytes = - PrevBranch == CurrentBranch + AggregateFallthroughSize ?
AggregateFallthroughSize / AggregateCount : !checkUnconditionalControlTransfer(Trace.From); Trace.To = Trace.From + InferredBytes; @@ -578,16 +583,11 @@ void DataAggregator::imputeFallThroughs() { << " bytes)\n"); ++InferredTraces; } else { - // Trace with a valid fall-through - // New group: reset aggregates. - if (CurrentBranch != PrevBranch) - AggregateCount = AggregateFallthroughSize = 0; // Only use valid fall-through lengths if (Trace.To != Trace::EXTERNAL) AggregateFallthroughSize += (Trace.To - Trace.From) * Info.TakenCount; AggregateCount += Info.TakenCount; } - PrevBranch = CurrentBranch; } if (opts::Verbosity >= 1) outs() << "BOLT-INFO: imputed " << InferredTraces << " traces\n"; diff --git a/bolt/test/X86/callcont-fallthru.s b/bolt/test/X86/callcont-fallthru.s index 8c05491e7bca0..ef0bb55df1faf 100644 --- a/bolt/test/X86/callcont-fallthru.s +++ b/bolt/test/X86/callcont-fallthru.s @@ -15,6 +15,8 @@ # External return to a landing pad/entry point call continuation # RUN: link_fdata %s %t %t.pa-eret PREAGG-ERET # RUN-DISABLED: link_fdata %s %t %t.pa-plt PREAGG-PLT +## Fall-through imputing test cases +# RUN: link_fdata %s %t %t.pa-imp PREAGG-IMP # RUN: llvm-strip --strip-unneeded %t -o %t.strip # RUN: llvm-objcopy --remove-section=.eh_frame %t.strip %t.noeh @@ -63,6 +65,11 @@ # RUN-DISABLED: --check-prefix=CHECK-PLT # CHECK-PLT: traces mismatching disassembled function contents: 0 +## Check --impute-trace-fall-throughs accepting duplicate branch-only traces +# RUN: perf2bolt %t --pa -p %t.pa-imp -o %t.pa-imp.fdata --impute-trace-fall-through +# RUN: FileCheck %s --check-prefix=CHECK-IMP --input-file %t.pa-imp.fdata +# CHECK-IMP: 0 [unknown] 0 1 main {{.*}} 0 3 + .globl foo .type foo, %function foo: @@ -102,6 +109,8 @@ Ltmp1: Ltmp4: cmpl $0x0, -0x14(%rbp) +# PREAGG-IMP: B X:0 #Ltmp4_br# 1 0 +# PREAGG-IMP: B X:0 #Ltmp4_br# 2 0 Ltmp4_br: je Ltmp0 From 397415792929e09e07a65ffadc265966cc04db32 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicolai=20H=C3=A4hnle?= <nicolai.haehnle@amd.com> Date: Tue, 4 Nov 2025 17:21:37 -0800 Subject: [PATCH 260/313] AMDGPU: Pre-commit a test (#166414) --- llvm/test/CodeGen/AMDGPU/finalizebundle.mir | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/llvm/test/CodeGen/AMDGPU/finalizebundle.mir b/llvm/test/CodeGen/AMDGPU/finalizebundle.mir index 0548bcf304c32..d2ec1fcbac84f 100644 --- a/llvm/test/CodeGen/AMDGPU/finalizebundle.mir +++ b/llvm/test/CodeGen/AMDGPU/finalizebundle.mir @@ -34,3 +34,16 @@ body: | $vgpr0 = V_MOV_B32_e32 $vgpr1, implicit $exec dead $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec ... + +--- +name: test_tied +body: | + bb.0: + ; CHECK-LABEL: name: test_tied + ; CHECK: BUNDLE implicit-def %0, implicit-def %2, implicit %1:vgpr_32, implicit $mode, implicit $exec { + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY %1:vgpr_32 + ; CHECK-NEXT: [[V_FMAC_F16_e32_:%[0-9]+]]:vgpr_32 = V_FMAC_F16_e32 internal [[COPY]], internal [[COPY]], %1:vgpr_32, implicit $mode, implicit $exec + ; CHECK-NEXT: } + %1:vgpr_32 = COPY %0:vgpr_32 + %2:vgpr_32 = V_FMAC_F16_e32 %1, %1, %0, implicit $mode, implicit $exec +... From 1458d313a1cac6d2b255378e2b816092c6cbc4c7 Mon Sep 17 00:00:00 2001 From: Mircea Trofin <mtrofin@google.com> Date: Tue, 4 Nov 2025 17:23:55 -0800 Subject: [PATCH 261/313] [SLU][profcheck] Propagate profile for branches on injected conditions. 
(#164476) This patch addresses the profile of 2 branches: - one that compares the 2 limits, for which we have no information (the C1, C2, see https://reviews.llvm.org/D136233) - one that is conditioned on a condition for which we have a profile, so we reuse it Issue #147390 --- .../Transforms/Scalar/SimpleLoopUnswitch.cpp | 9 +- .../inject-invariant-conditions.ll | 142 +++++++++--------- 2 files changed, 79 insertions(+), 72 deletions(-) diff --git a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp index 86b2090081ed0..0577ddbd2353c 100644 --- a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp +++ b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp @@ -3203,10 +3203,15 @@ injectPendingInvariantConditions(NonTrivialUnswitchCandidate Candidate, Loop &L, Builder.SetInsertPoint(TI); auto *InvariantBr = Builder.CreateCondBr(InjectedCond, InLoopSucc, CheckBlock); + // We don't know anything about the relation between the limits. + setExplicitlyUnknownBranchWeightsIfProfiled( + *InvariantBr, *InvariantBr->getParent()->getParent(), DEBUG_TYPE); Builder.SetInsertPoint(CheckBlock); - Builder.CreateCondBr(TI->getCondition(), TI->getSuccessor(0), - TI->getSuccessor(1)); + Builder.CreateCondBr( + TI->getCondition(), TI->getSuccessor(0), TI->getSuccessor(1), + !ProfcheckDisableMetadataFixes ? TI->getMetadata(LLVMContext::MD_prof) + : nullptr); TI->eraseFromParent(); // Fixup phis. diff --git a/llvm/test/Transforms/SimpleLoopUnswitch/inject-invariant-conditions.ll b/llvm/test/Transforms/SimpleLoopUnswitch/inject-invariant-conditions.ll index 536e0c6a0e74a..3c84dea2a0672 100644 --- a/llvm/test/Transforms/SimpleLoopUnswitch/inject-invariant-conditions.ll +++ b/llvm/test/Transforms/SimpleLoopUnswitch/inject-invariant-conditions.ll @@ -2,40 +2,40 @@ ; RUN: opt < %s -S -simple-loop-unswitch-inject-invariant-conditions=true -passes="loop(simple-loop-unswitch<nontrivial>),simplifycfg" | FileCheck %s ; RUN: opt < %s -S -simple-loop-unswitch-inject-invariant-conditions=true -passes="loop-mssa(simple-loop-unswitch<nontrivial>),simplifycfg" -verify-memoryssa | FileCheck %s -define i32 @test_01(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ptr noundef %arr, ptr noundef %x_p) { +define i32 @test_01(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ptr noundef %arr, ptr noundef %x_p) !prof !{!"function_entry_count", i32 10} { ; CHECK-LABEL: @test_01( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef !0 +; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef [[META1:![0-9]+]] ; CHECK-NEXT: [[INJECTED_COND:%.*]] = icmp ule i32 [[LIMIT:%.*]], [[X]] -; CHECK-NEXT: br i1 [[INJECTED_COND]], label [[LOOP_US:%.*]], label [[LOOP:%.*]] +; CHECK-NEXT: br i1 [[INJECTED_COND]], label [[LOOP_US:%.*]], label [[LOOP:%.*]], !prof [[PROF2:![0-9]+]] ; CHECK: loop.us: -; CHECK-NEXT: [[IV_US:%.*]] = phi i32 [ [[IV_NEXT_US:%.*]], [[GUARDED_US:%.*]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[EL_PTR_US:%.*]] = getelementptr i32, ptr [[P:%.*]], i32 [[IV_US]] -; CHECK-NEXT: [[EL_US:%.*]] = load i32, ptr [[EL_PTR_US]], align 4 -; CHECK-NEXT: [[BOUND_CHECK_US:%.*]] = icmp ult i32 [[EL_US]], [[LIMIT]] -; CHECK-NEXT: br i1 [[BOUND_CHECK_US]], label [[GUARDED_US]], label [[COMMON_RET:%.*]], !prof [[PROF1:![0-9]+]] +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[IV_NEXT_US:%.*]], [[GUARDED_US:%.*]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[EL_PTR:%.*]] = getelementptr i32, ptr [[P:%.*]], i32 [[IV]] +; CHECK-NEXT: [[EL:%.*]] = load i32, ptr 
[[EL_PTR]], align 4 +; CHECK-NEXT: [[BOUND_CHECK:%.*]] = icmp ult i32 [[EL]], [[LIMIT]] +; CHECK-NEXT: br i1 [[BOUND_CHECK]], label [[GUARDED_US]], label [[COMMON_RET:%.*]], !prof [[PROF3:![0-9]+]] ; CHECK: guarded.us: -; CHECK-NEXT: [[RANGE_CHECK_US:%.*]] = icmp ult i32 [[EL_US]], [[X]] -; CHECK-NEXT: [[ARR_PTR_US:%.*]] = getelementptr i32, ptr [[ARR:%.*]], i32 [[EL_US]] -; CHECK-NEXT: store i32 [[IV_US]], ptr [[ARR_PTR_US]], align 4 -; CHECK-NEXT: [[IV_NEXT_US]] = add i32 [[IV_US]], 1 +; CHECK-NEXT: [[RANGE_CHECK_US:%.*]] = icmp ult i32 [[EL]], [[X]] +; CHECK-NEXT: [[ARR_PTR_US:%.*]] = getelementptr i32, ptr [[ARR:%.*]], i32 [[EL]] +; CHECK-NEXT: store i32 [[IV]], ptr [[ARR_PTR_US]], align 4 +; CHECK-NEXT: [[IV_NEXT_US]] = add i32 [[IV]], 1 ; CHECK-NEXT: [[LOOP_COND_US:%.*]] = icmp slt i32 [[IV_NEXT_US]], [[N:%.*]] ; CHECK-NEXT: br i1 [[LOOP_COND_US]], label [[LOOP_US]], label [[COMMON_RET]] ; CHECK: loop: -; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], [[BACKEDGE:%.*]] ], [ 0, [[ENTRY]] ] -; CHECK-NEXT: [[EL_PTR:%.*]] = getelementptr i32, ptr [[P]], i32 [[IV]] -; CHECK-NEXT: [[EL:%.*]] = load i32, ptr [[EL_PTR]], align 4 -; CHECK-NEXT: [[BOUND_CHECK:%.*]] = icmp ult i32 [[EL]], [[LIMIT]] -; CHECK-NEXT: br i1 [[BOUND_CHECK]], label [[GUARDED:%.*]], label [[COMMON_RET]], !prof [[PROF1]] +; CHECK-NEXT: [[IV1:%.*]] = phi i32 [ [[IV_NEXT:%.*]], [[BACKEDGE:%.*]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: [[EL_PTR1:%.*]] = getelementptr i32, ptr [[P]], i32 [[IV1]] +; CHECK-NEXT: [[EL1:%.*]] = load i32, ptr [[EL_PTR1]], align 4 +; CHECK-NEXT: [[BOUND_CHECK1:%.*]] = icmp ult i32 [[EL1]], [[LIMIT]] +; CHECK-NEXT: br i1 [[BOUND_CHECK1]], label [[GUARDED:%.*]], label [[COMMON_RET]], !prof [[PROF3]] ; CHECK: guarded: -; CHECK-NEXT: [[RANGE_CHECK:%.*]] = icmp ult i32 [[EL]], [[X]] -; CHECK-NEXT: br i1 [[RANGE_CHECK]], label [[BACKEDGE]], label [[COMMON_RET]] +; CHECK-NEXT: [[RANGE_CHECK:%.*]] = icmp ult i32 [[EL1]], [[X]] +; CHECK-NEXT: br i1 [[RANGE_CHECK]], label [[BACKEDGE]], label [[COMMON_RET]], !prof [[PROF3]] ; CHECK: backedge: -; CHECK-NEXT: [[ARR_PTR:%.*]] = getelementptr i32, ptr [[ARR]], i32 [[EL]] -; CHECK-NEXT: store i32 [[IV]], ptr [[ARR_PTR]], align 4 -; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 +; CHECK-NEXT: [[ARR_PTR:%.*]] = getelementptr i32, ptr [[ARR]], i32 [[EL1]] +; CHECK-NEXT: store i32 [[IV1]], ptr [[ARR_PTR]], align 4 +; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV1]], 1 ; CHECK-NEXT: [[LOOP_COND:%.*]] = icmp slt i32 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[LOOP_COND]], label [[LOOP]], label [[COMMON_RET]], !llvm.loop [[LOOP2:![0-9]+]] +; CHECK-NEXT: br i1 [[LOOP_COND]], label [[LOOP]], label [[COMMON_RET]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: common.ret: ; CHECK-NEXT: [[COMMON_RET_OP:%.*]] = phi i32 [ 0, [[BACKEDGE]] ], [ 0, [[GUARDED_US]] ], [ -1, [[LOOP]] ], [ -1, [[LOOP_US]] ], [ -2, [[GUARDED]] ] ; CHECK-NEXT: ret i32 [[COMMON_RET_OP]] @@ -76,7 +76,7 @@ range_check_failed: ; preds = %guarded define i32 @test_01_neg_void_profile(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ptr noundef %arr, ptr noundef %x_p) { ; CHECK-LABEL: @test_01_neg_void_profile( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef !0 +; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef [[META1]] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[BACKEDGE:%.*]] ] @@ -133,7 +133,7 @@ range_check_failed: ; preds = %guarded define i32 @test_01_constants(ptr 
noundef %p, ptr noundef %arr, ptr noundef %x_p) { ; CHECK-LABEL: @test_01_constants( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef !0 +; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef [[META1]] ; CHECK-NEXT: [[INJECTED_COND:%.*]] = icmp ule i32 200, 300 ; CHECK-NEXT: br i1 [[INJECTED_COND]], label [[LOOP_US:%.*]], label [[LOOP:%.*]] ; CHECK: loop.us: @@ -141,7 +141,7 @@ define i32 @test_01_constants(ptr noundef %p, ptr noundef %arr, ptr noundef %x_p ; CHECK-NEXT: [[EL_PTR_US:%.*]] = getelementptr i32, ptr [[P:%.*]], i32 [[IV_US]] ; CHECK-NEXT: [[EL_US:%.*]] = load i32, ptr [[EL_PTR_US]], align 4 ; CHECK-NEXT: [[BOUND_CHECK_US:%.*]] = icmp ult i32 [[EL_US]], 200 -; CHECK-NEXT: br i1 [[BOUND_CHECK_US]], label [[GUARDED_US]], label [[COMMON_RET:%.*]], !prof [[PROF1]] +; CHECK-NEXT: br i1 [[BOUND_CHECK_US]], label [[GUARDED_US]], label [[COMMON_RET:%.*]], !prof [[PROF3]] ; CHECK: guarded.us: ; CHECK-NEXT: [[RANGE_CHECK_US:%.*]] = icmp ult i32 [[EL_US]], 300 ; CHECK-NEXT: [[ARR_PTR_US:%.*]] = getelementptr i32, ptr [[ARR:%.*]], i32 [[EL_US]] @@ -154,13 +154,13 @@ define i32 @test_01_constants(ptr noundef %p, ptr noundef %arr, ptr noundef %x_p ; CHECK-NEXT: [[EL_PTR:%.*]] = getelementptr i32, ptr [[P]], i32 [[IV]] ; CHECK-NEXT: [[EL:%.*]] = load i32, ptr [[EL_PTR]], align 4 ; CHECK-NEXT: [[BOUND_CHECK:%.*]] = icmp ult i32 [[EL]], 200 -; CHECK-NEXT: br i1 [[BOUND_CHECK]], label [[BACKEDGE]], label [[COMMON_RET]], !prof [[PROF1]] +; CHECK-NEXT: br i1 [[BOUND_CHECK]], label [[BACKEDGE]], label [[COMMON_RET]], !prof [[PROF3]] ; CHECK: backedge: ; CHECK-NEXT: [[ARR_PTR:%.*]] = getelementptr i32, ptr [[ARR]], i32 [[EL]] ; CHECK-NEXT: store i32 [[IV]], ptr [[ARR_PTR]], align 4 ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 ; CHECK-NEXT: [[LOOP_COND:%.*]] = icmp slt i32 [[IV_NEXT]], 1000 -; CHECK-NEXT: br i1 [[LOOP_COND]], label [[LOOP]], label [[COMMON_RET]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: br i1 [[LOOP_COND]], label [[LOOP]], label [[COMMON_RET]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: common.ret: ; CHECK-NEXT: [[COMMON_RET_OP:%.*]] = phi i32 [ 0, [[BACKEDGE]] ], [ 0, [[GUARDED_US]] ], [ -1, [[LOOP]] ], [ -1, [[LOOP_US]] ] ; CHECK-NEXT: ret i32 [[COMMON_RET_OP]] @@ -200,17 +200,17 @@ range_check_failed: ; preds = %guarded define i32 @test_01_neg_degenerate_profile(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ptr noundef %arr, ptr noundef %x_p) { ; CHECK-LABEL: @test_01_neg_degenerate_profile( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef !0 +; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef [[META1]] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[BACKEDGE:%.*]] ] ; CHECK-NEXT: [[EL_PTR:%.*]] = getelementptr i32, ptr [[P:%.*]], i32 [[IV]] ; CHECK-NEXT: [[EL:%.*]] = load i32, ptr [[EL_PTR]], align 4 ; CHECK-NEXT: [[BOUND_CHECK:%.*]] = icmp ult i32 [[EL]], [[LIMIT:%.*]] -; CHECK-NEXT: br i1 [[BOUND_CHECK]], label [[GUARDED:%.*]], label [[COMMON_RET:%.*]], !prof [[PROF1]] +; CHECK-NEXT: br i1 [[BOUND_CHECK]], label [[GUARDED:%.*]], label [[COMMON_RET:%.*]], !prof [[PROF3]] ; CHECK: guarded: ; CHECK-NEXT: [[RANGE_CHECK:%.*]] = icmp ult i32 [[EL]], [[X]] -; CHECK-NEXT: br i1 [[RANGE_CHECK]], label [[BACKEDGE]], label [[COMMON_RET]], !prof [[PROF5:![0-9]+]] +; CHECK-NEXT: br i1 [[RANGE_CHECK]], label [[BACKEDGE]], label [[COMMON_RET]], !prof [[PROF7:![0-9]+]] ; CHECK: 
backedge: ; CHECK-NEXT: [[ARR_PTR:%.*]] = getelementptr i32, ptr [[ARR:%.*]], i32 [[EL]] ; CHECK-NEXT: store i32 [[IV]], ptr [[ARR_PTR]], align 4 @@ -257,17 +257,17 @@ range_check_failed: ; preds = %guarded define i32 @test_01_neg_cold(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ptr noundef %arr, ptr noundef %x_p) { ; CHECK-LABEL: @test_01_neg_cold( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef !0 +; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef [[META1]] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[BACKEDGE:%.*]] ] ; CHECK-NEXT: [[EL_PTR:%.*]] = getelementptr i32, ptr [[P:%.*]], i32 [[IV]] ; CHECK-NEXT: [[EL:%.*]] = load i32, ptr [[EL_PTR]], align 4 ; CHECK-NEXT: [[BOUND_CHECK:%.*]] = icmp ult i32 [[EL]], [[LIMIT:%.*]] -; CHECK-NEXT: br i1 [[BOUND_CHECK]], label [[GUARDED:%.*]], label [[COMMON_RET:%.*]], !prof [[PROF1]] +; CHECK-NEXT: br i1 [[BOUND_CHECK]], label [[GUARDED:%.*]], label [[COMMON_RET:%.*]], !prof [[PROF3]] ; CHECK: guarded: ; CHECK-NEXT: [[RANGE_CHECK:%.*]] = icmp ult i32 [[EL]], [[X]] -; CHECK-NEXT: br i1 [[RANGE_CHECK]], label [[BACKEDGE]], label [[COMMON_RET]], !prof [[PROF6:![0-9]+]] +; CHECK-NEXT: br i1 [[RANGE_CHECK]], label [[BACKEDGE]], label [[COMMON_RET]], !prof [[PROF8:![0-9]+]] ; CHECK: backedge: ; CHECK-NEXT: [[ARR_PTR:%.*]] = getelementptr i32, ptr [[ARR:%.*]], i32 [[EL]] ; CHECK-NEXT: store i32 [[IV]], ptr [[ARR_PTR]], align 4 @@ -314,17 +314,17 @@ range_check_failed: ; preds = %guarded define i32 @test_01_neg_overflowing_metadata(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ptr noundef %arr, ptr noundef %x_p) { ; CHECK-LABEL: @test_01_neg_overflowing_metadata( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef !0 +; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef [[META1]] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[BACKEDGE:%.*]] ] ; CHECK-NEXT: [[EL_PTR:%.*]] = getelementptr i32, ptr [[P:%.*]], i32 [[IV]] ; CHECK-NEXT: [[EL:%.*]] = load i32, ptr [[EL_PTR]], align 4 ; CHECK-NEXT: [[BOUND_CHECK:%.*]] = icmp ult i32 [[EL]], [[LIMIT:%.*]] -; CHECK-NEXT: br i1 [[BOUND_CHECK]], label [[GUARDED:%.*]], label [[COMMON_RET:%.*]], !prof [[PROF7:![0-9]+]] +; CHECK-NEXT: br i1 [[BOUND_CHECK]], label [[GUARDED:%.*]], label [[COMMON_RET:%.*]], !prof [[PROF9:![0-9]+]] ; CHECK: guarded: ; CHECK-NEXT: [[RANGE_CHECK:%.*]] = icmp ult i32 [[EL]], [[X]] -; CHECK-NEXT: br i1 [[RANGE_CHECK]], label [[BACKEDGE]], label [[COMMON_RET]], !prof [[PROF7]] +; CHECK-NEXT: br i1 [[RANGE_CHECK]], label [[BACKEDGE]], label [[COMMON_RET]], !prof [[PROF9]] ; CHECK: backedge: ; CHECK-NEXT: [[ARR_PTR:%.*]] = getelementptr i32, ptr [[ARR:%.*]], i32 [[EL]] ; CHECK-NEXT: store i32 [[IV]], ptr [[ARR_PTR]], align 4 @@ -371,7 +371,7 @@ range_check_failed: ; preds = %guarded define i32 @test_02(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ptr noundef %arr, ptr noundef %x_p) { ; CHECK-LABEL: @test_02( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef !0 +; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef [[META1]] ; CHECK-NEXT: [[INJECTED_COND:%.*]] = icmp ule i32 -2147483648, [[X]] ; CHECK-NEXT: br i1 [[INJECTED_COND]], label [[LOOP_US:%.*]], label [[LOOP:%.*]] ; CHECK: loop.us: @@ -379,7 +379,7 @@ define i32 @test_02(ptr 
noundef %p, i32 noundef %n, i32 noundef %limit, ptr noun ; CHECK-NEXT: [[EL_PTR_US:%.*]] = getelementptr i32, ptr [[P:%.*]], i32 [[IV_US]] ; CHECK-NEXT: [[EL_US:%.*]] = load i32, ptr [[EL_PTR_US]], align 4 ; CHECK-NEXT: [[BOUND_CHECK_US:%.*]] = icmp sge i32 [[EL_US]], 0 -; CHECK-NEXT: br i1 [[BOUND_CHECK_US]], label [[GUARDED_US]], label [[COMMON_RET:%.*]], !prof [[PROF1]] +; CHECK-NEXT: br i1 [[BOUND_CHECK_US]], label [[GUARDED_US]], label [[COMMON_RET:%.*]], !prof [[PROF3]] ; CHECK: guarded.us: ; CHECK-NEXT: [[RANGE_CHECK_US:%.*]] = icmp ult i32 [[EL_US]], [[X]] ; CHECK-NEXT: [[ARR_PTR_US:%.*]] = getelementptr i32, ptr [[ARR:%.*]], i32 [[EL_US]] @@ -392,16 +392,16 @@ define i32 @test_02(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ptr noun ; CHECK-NEXT: [[EL_PTR:%.*]] = getelementptr i32, ptr [[P]], i32 [[IV]] ; CHECK-NEXT: [[EL:%.*]] = load i32, ptr [[EL_PTR]], align 4 ; CHECK-NEXT: [[BOUND_CHECK:%.*]] = icmp sge i32 [[EL]], 0 -; CHECK-NEXT: br i1 [[BOUND_CHECK]], label [[GUARDED:%.*]], label [[COMMON_RET]], !prof [[PROF1]] +; CHECK-NEXT: br i1 [[BOUND_CHECK]], label [[GUARDED:%.*]], label [[COMMON_RET]], !prof [[PROF3]] ; CHECK: guarded: ; CHECK-NEXT: [[RANGE_CHECK:%.*]] = icmp ult i32 [[EL]], [[X]] -; CHECK-NEXT: br i1 [[RANGE_CHECK]], label [[BACKEDGE]], label [[COMMON_RET]] +; CHECK-NEXT: br i1 [[RANGE_CHECK]], label [[BACKEDGE]], label [[COMMON_RET]], !prof [[PROF3]] ; CHECK: backedge: ; CHECK-NEXT: [[ARR_PTR:%.*]] = getelementptr i32, ptr [[ARR]], i32 [[EL]] ; CHECK-NEXT: store i32 [[IV]], ptr [[ARR_PTR]], align 4 ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 ; CHECK-NEXT: [[LOOP_COND:%.*]] = icmp slt i32 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[LOOP_COND]], label [[LOOP]], label [[COMMON_RET]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-NEXT: br i1 [[LOOP_COND]], label [[LOOP]], label [[COMMON_RET]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK: common.ret: ; CHECK-NEXT: [[COMMON_RET_OP:%.*]] = phi i32 [ 0, [[BACKEDGE]] ], [ 0, [[GUARDED_US]] ], [ -1, [[LOOP]] ], [ -1, [[LOOP_US]] ], [ -2, [[GUARDED]] ] ; CHECK-NEXT: ret i32 [[COMMON_RET_OP]] @@ -441,7 +441,7 @@ range_check_failed: ; preds = %guarded define i32 @test_02_inverse(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ptr noundef %arr, ptr noundef %x_p) { ; CHECK-LABEL: @test_02_inverse( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef !0 +; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef [[META1]] ; CHECK-NEXT: [[INJECTED_COND:%.*]] = icmp ule i32 -2147483648, [[X]] ; CHECK-NEXT: br i1 [[INJECTED_COND]], label [[LOOP_US:%.*]], label [[LOOP:%.*]] ; CHECK: loop.us: @@ -449,7 +449,7 @@ define i32 @test_02_inverse(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ; CHECK-NEXT: [[EL_PTR_US:%.*]] = getelementptr i32, ptr [[P:%.*]], i32 [[IV_US]] ; CHECK-NEXT: [[EL_US:%.*]] = load i32, ptr [[EL_PTR_US]], align 4 ; CHECK-NEXT: [[BOUND_CHECK_US:%.*]] = icmp sge i32 [[EL_US]], 0 -; CHECK-NEXT: br i1 [[BOUND_CHECK_US]], label [[GUARDED_US]], label [[COMMON_RET:%.*]], !prof [[PROF1]] +; CHECK-NEXT: br i1 [[BOUND_CHECK_US]], label [[GUARDED_US]], label [[COMMON_RET:%.*]], !prof [[PROF3]] ; CHECK: guarded.us: ; CHECK-NEXT: [[RANGE_CHECK_US:%.*]] = icmp uge i32 [[EL_US]], [[X]] ; CHECK-NEXT: [[ARR_PTR_US:%.*]] = getelementptr i32, ptr [[ARR:%.*]], i32 [[EL_US]] @@ -462,16 +462,16 @@ define i32 @test_02_inverse(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ; CHECK-NEXT: [[EL_PTR:%.*]] = getelementptr i32, ptr [[P]], i32 [[IV]] ; CHECK-NEXT: [[EL:%.*]] = load i32, 
ptr [[EL_PTR]], align 4 ; CHECK-NEXT: [[BOUND_CHECK:%.*]] = icmp sge i32 [[EL]], 0 -; CHECK-NEXT: br i1 [[BOUND_CHECK]], label [[GUARDED:%.*]], label [[COMMON_RET]], !prof [[PROF1]] +; CHECK-NEXT: br i1 [[BOUND_CHECK]], label [[GUARDED:%.*]], label [[COMMON_RET]], !prof [[PROF3]] ; CHECK: guarded: ; CHECK-NEXT: [[RANGE_CHECK:%.*]] = icmp uge i32 [[EL]], [[X]] -; CHECK-NEXT: br i1 [[RANGE_CHECK]], label [[COMMON_RET]], label [[BACKEDGE]] +; CHECK-NEXT: br i1 [[RANGE_CHECK]], label [[COMMON_RET]], label [[BACKEDGE]], !prof [[PROF11:![0-9]+]] ; CHECK: backedge: ; CHECK-NEXT: [[ARR_PTR:%.*]] = getelementptr i32, ptr [[ARR]], i32 [[EL]] ; CHECK-NEXT: store i32 [[IV]], ptr [[ARR_PTR]], align 4 ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 ; CHECK-NEXT: [[LOOP_COND:%.*]] = icmp slt i32 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[LOOP_COND]], label [[LOOP]], label [[COMMON_RET]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK-NEXT: br i1 [[LOOP_COND]], label [[LOOP]], label [[COMMON_RET]], !llvm.loop [[LOOP12:![0-9]+]] ; CHECK: common.ret: ; CHECK-NEXT: [[COMMON_RET_OP:%.*]] = phi i32 [ 0, [[BACKEDGE]] ], [ 0, [[GUARDED_US]] ], [ -1, [[LOOP]] ], [ -1, [[LOOP_US]] ], [ -2, [[GUARDED]] ] ; CHECK-NEXT: ret i32 [[COMMON_RET_OP]] @@ -511,7 +511,7 @@ range_check_failed: ; preds = %guarded define i32 @test_03(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ptr noundef %arr, ptr noundef %x_p) { ; CHECK-LABEL: @test_03( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef !0 +; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef [[META1]] ; CHECK-NEXT: [[INJECTED_COND:%.*]] = icmp ule i32 -2147483648, [[X]] ; CHECK-NEXT: br i1 [[INJECTED_COND]], label [[LOOP_US:%.*]], label [[LOOP:%.*]] ; CHECK: loop.us: @@ -519,7 +519,7 @@ define i32 @test_03(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ptr noun ; CHECK-NEXT: [[EL_PTR_US:%.*]] = getelementptr i32, ptr [[P:%.*]], i32 [[IV_US]] ; CHECK-NEXT: [[EL_US:%.*]] = load i32, ptr [[EL_PTR_US]], align 4 ; CHECK-NEXT: [[BOUND_CHECK_US:%.*]] = icmp slt i32 [[EL_US]], 0 -; CHECK-NEXT: br i1 [[BOUND_CHECK_US]], label [[COMMON_RET:%.*]], label [[GUARDED_US]], !prof [[PROF10:![0-9]+]] +; CHECK-NEXT: br i1 [[BOUND_CHECK_US]], label [[COMMON_RET:%.*]], label [[GUARDED_US]], !prof [[PROF11]] ; CHECK: guarded.us: ; CHECK-NEXT: [[RANGE_CHECK_US:%.*]] = icmp ult i32 [[EL_US]], [[X]] ; CHECK-NEXT: [[ARR_PTR_US:%.*]] = getelementptr i32, ptr [[ARR:%.*]], i32 [[EL_US]] @@ -532,16 +532,16 @@ define i32 @test_03(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ptr noun ; CHECK-NEXT: [[EL_PTR:%.*]] = getelementptr i32, ptr [[P]], i32 [[IV]] ; CHECK-NEXT: [[EL:%.*]] = load i32, ptr [[EL_PTR]], align 4 ; CHECK-NEXT: [[BOUND_CHECK:%.*]] = icmp slt i32 [[EL]], 0 -; CHECK-NEXT: br i1 [[BOUND_CHECK]], label [[COMMON_RET]], label [[GUARDED:%.*]], !prof [[PROF10]] +; CHECK-NEXT: br i1 [[BOUND_CHECK]], label [[COMMON_RET]], label [[GUARDED:%.*]], !prof [[PROF11]] ; CHECK: guarded: ; CHECK-NEXT: [[RANGE_CHECK:%.*]] = icmp ult i32 [[EL]], [[X]] -; CHECK-NEXT: br i1 [[RANGE_CHECK]], label [[BACKEDGE]], label [[COMMON_RET]] +; CHECK-NEXT: br i1 [[RANGE_CHECK]], label [[BACKEDGE]], label [[COMMON_RET]], !prof [[PROF3]] ; CHECK: backedge: ; CHECK-NEXT: [[ARR_PTR:%.*]] = getelementptr i32, ptr [[ARR]], i32 [[EL]] ; CHECK-NEXT: store i32 [[IV]], ptr [[ARR_PTR]], align 4 ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 ; CHECK-NEXT: [[LOOP_COND:%.*]] = icmp slt i32 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[LOOP_COND]], label [[LOOP]], label 
[[COMMON_RET]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK-NEXT: br i1 [[LOOP_COND]], label [[LOOP]], label [[COMMON_RET]], !llvm.loop [[LOOP13:![0-9]+]] ; CHECK: common.ret: ; CHECK-NEXT: [[COMMON_RET_OP:%.*]] = phi i32 [ 0, [[BACKEDGE]] ], [ 0, [[GUARDED_US]] ], [ -1, [[LOOP]] ], [ -1, [[LOOP_US]] ], [ -2, [[GUARDED]] ] ; CHECK-NEXT: ret i32 [[COMMON_RET_OP]] @@ -581,7 +581,7 @@ range_check_failed: ; preds = %guarded define i32 @test_04(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ptr noundef %arr, ptr noundef %x_p) { ; CHECK-LABEL: @test_04( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef !0 +; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef [[META1]] ; CHECK-NEXT: [[INJECTED_COND:%.*]] = icmp ule i32 128, [[X]] ; CHECK-NEXT: br i1 [[INJECTED_COND]], label [[LOOP_US:%.*]], label [[LOOP:%.*]] ; CHECK: loop.us: @@ -589,7 +589,7 @@ define i32 @test_04(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ptr noun ; CHECK-NEXT: [[EL_PTR_US:%.*]] = getelementptr i8, ptr [[P:%.*]], i32 [[IV_US]] ; CHECK-NEXT: [[EL_US:%.*]] = load i8, ptr [[EL_PTR_US]], align 4 ; CHECK-NEXT: [[BOUND_CHECK_US:%.*]] = icmp slt i8 [[EL_US]], 0 -; CHECK-NEXT: br i1 [[BOUND_CHECK_US]], label [[COMMON_RET:%.*]], label [[GUARDED_US]], !prof [[PROF10]] +; CHECK-NEXT: br i1 [[BOUND_CHECK_US]], label [[COMMON_RET:%.*]], label [[GUARDED_US]], !prof [[PROF11]] ; CHECK: guarded.us: ; CHECK-NEXT: [[EL_WIDE_US:%.*]] = zext i8 [[EL_US]] to i32 ; CHECK-NEXT: [[RANGE_CHECK_US:%.*]] = icmp ult i32 [[EL_WIDE_US]], [[X]] @@ -603,17 +603,17 @@ define i32 @test_04(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ptr noun ; CHECK-NEXT: [[EL_PTR:%.*]] = getelementptr i8, ptr [[P]], i32 [[IV]] ; CHECK-NEXT: [[EL:%.*]] = load i8, ptr [[EL_PTR]], align 4 ; CHECK-NEXT: [[BOUND_CHECK:%.*]] = icmp slt i8 [[EL]], 0 -; CHECK-NEXT: br i1 [[BOUND_CHECK]], label [[COMMON_RET]], label [[GUARDED:%.*]], !prof [[PROF10]] +; CHECK-NEXT: br i1 [[BOUND_CHECK]], label [[COMMON_RET]], label [[GUARDED:%.*]], !prof [[PROF11]] ; CHECK: guarded: ; CHECK-NEXT: [[EL_WIDE:%.*]] = zext i8 [[EL]] to i32 ; CHECK-NEXT: [[RANGE_CHECK:%.*]] = icmp ult i32 [[EL_WIDE]], [[X]] -; CHECK-NEXT: br i1 [[RANGE_CHECK]], label [[BACKEDGE]], label [[COMMON_RET]] +; CHECK-NEXT: br i1 [[RANGE_CHECK]], label [[BACKEDGE]], label [[COMMON_RET]], !prof [[PROF3]] ; CHECK: backedge: ; CHECK-NEXT: [[ARR_PTR:%.*]] = getelementptr i32, ptr [[ARR]], i32 [[EL_WIDE]] ; CHECK-NEXT: store i32 [[IV]], ptr [[ARR_PTR]], align 4 ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 ; CHECK-NEXT: [[LOOP_COND:%.*]] = icmp slt i32 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[LOOP_COND]], label [[LOOP]], label [[COMMON_RET]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-NEXT: br i1 [[LOOP_COND]], label [[LOOP]], label [[COMMON_RET]], !llvm.loop [[LOOP14:![0-9]+]] ; CHECK: common.ret: ; CHECK-NEXT: [[COMMON_RET_OP:%.*]] = phi i32 [ 0, [[BACKEDGE]] ], [ 0, [[GUARDED_US]] ], [ -1, [[LOOP]] ], [ -1, [[LOOP_US]] ], [ -2, [[GUARDED]] ] ; CHECK-NEXT: ret i32 [[COMMON_RET_OP]] @@ -651,17 +651,19 @@ range_check_failed: ; preds = %guarded ret i32 -2 } ;. 
-; CHECK: [[META0:![0-9]+]] = !{} -; CHECK: [[PROF1]] = !{!"branch_weights", i32 100, i32 1} -; CHECK: [[LOOP2]] = distinct !{!2, !3} -; CHECK: [[META3:![0-9]+]] = !{!"llvm.loop.unswitch.injection.disable"} -; CHECK: [[LOOP4]] = distinct !{!4, !3} -; CHECK: [[PROF5]] = !{!"branch_weights", i32 0, i32 0} -; CHECK: [[PROF6]] = !{!"branch_weights", i32 2, i32 3} -; CHECK: [[PROF7]] = !{!"branch_weights", i32 -1, i32 -1000} -; CHECK: [[LOOP8]] = distinct !{!8, !3} -; CHECK: [[LOOP9]] = distinct !{!9, !3} -; CHECK: [[PROF10]] = !{!"branch_weights", i32 1, i32 100} -; CHECK: [[LOOP11]] = distinct !{!11, !3} -; CHECK: [[LOOP12]] = distinct !{!12, !3} +; CHECK: [[META0:![0-9]+]] = !{!"function_entry_count", i32 10} +; CHECK: [[META1]] = !{} +; CHECK: [[PROF2]] = !{!"unknown", !"simple-loop-unswitch"} +; CHECK: [[PROF3]] = !{!"branch_weights", i32 100, i32 1} +; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META5:![0-9]+]]} +; CHECK: [[META5]] = !{!"llvm.loop.unswitch.injection.disable"} +; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META5]]} +; CHECK: [[PROF7]] = !{!"branch_weights", i32 0, i32 0} +; CHECK: [[PROF8]] = !{!"branch_weights", i32 2, i32 3} +; CHECK: [[PROF9]] = !{!"branch_weights", i32 -1, i32 -1000} +; CHECK: [[LOOP10]] = distinct !{[[LOOP10]], [[META5]]} +; CHECK: [[PROF11]] = !{!"branch_weights", i32 1, i32 100} +; CHECK: [[LOOP12]] = distinct !{[[LOOP12]], [[META5]]} +; CHECK: [[LOOP13]] = distinct !{[[LOOP13]], [[META5]]} +; CHECK: [[LOOP14]] = distinct !{[[LOOP14]], [[META5]]} ;. From 4b367e0b85aa2d56f58c3e85f9a330019254f6a9 Mon Sep 17 00:00:00 2001 From: Kleis Auke Wolthuizen <github@kleisauke.nl> Date: Wed, 5 Nov 2025 02:35:15 +0100 Subject: [PATCH 262/313] [WebAssembly] Use IRBuilder in FixFunctionBitcasts (NFC) (#164268) Simplifies the code a bit. --- .../WebAssemblyFixFunctionBitcasts.cpp | 27 +++++++------------ .../unsupported-function-bitcasts.ll | 12 ++++----- 2 files changed, 16 insertions(+), 23 deletions(-) diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyFixFunctionBitcasts.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyFixFunctionBitcasts.cpp index 37a34573bb339..9fef3e6d8b089 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyFixFunctionBitcasts.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyFixFunctionBitcasts.cpp @@ -24,6 +24,7 @@ #include "WebAssembly.h" #include "llvm/IR/Constants.h" +#include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Module.h" #include "llvm/IR/Operator.h" @@ -114,6 +115,7 @@ static Function *createWrapper(Function *F, FunctionType *Ty) { Wrapper->setAttributes(F->getAttributes()); BasicBlock *BB = BasicBlock::Create(M->getContext(), "body", Wrapper); const DataLayout &DL = BB->getDataLayout(); + IRBuilder<> Builder(BB); // Determine what arguments to pass. 
SmallVector<Value *, 4> Args; @@ -140,10 +142,7 @@ static Function *createWrapper(Function *F, FunctionType *Ty) { Args.push_back(&*AI); } else { if (CastInst::isBitOrNoopPointerCastable(ArgType, ParamType, DL)) { - Instruction *PtrCast = - CastInst::CreateBitOrPointerCast(AI, ParamType, "cast"); - PtrCast->insertInto(BB, BB->end()); - Args.push_back(PtrCast); + Args.push_back(Builder.CreateBitOrPointerCast(AI, ParamType, "cast")); } else if (ArgType->isStructTy() || ParamType->isStructTy()) { LLVM_DEBUG(dbgs() << "createWrapper: struct param type in bitcast: " << F->getName() << "\n"); @@ -166,24 +165,19 @@ static Function *createWrapper(Function *F, FunctionType *Ty) { for (; AI != AE; ++AI) Args.push_back(&*AI); - CallInst *Call = CallInst::Create(F, Args, "", BB); + CallInst *Call = Builder.CreateCall(F, Args); - Type *ExpectedRtnType = F->getFunctionType()->getReturnType(); - Type *RtnType = Ty->getReturnType(); // Determine what value to return. if (RtnType->isVoidTy()) { - ReturnInst::Create(M->getContext(), BB); + Builder.CreateRetVoid(); } else if (ExpectedRtnType->isVoidTy()) { LLVM_DEBUG(dbgs() << "Creating dummy return: " << *RtnType << "\n"); - ReturnInst::Create(M->getContext(), PoisonValue::get(RtnType), BB); + Builder.CreateRet(PoisonValue::get(RtnType)); } else if (RtnType == ExpectedRtnType) { - ReturnInst::Create(M->getContext(), Call, BB); + Builder.CreateRet(Call); } else if (CastInst::isBitOrNoopPointerCastable(ExpectedRtnType, RtnType, DL)) { - Instruction *Cast = - CastInst::CreateBitOrPointerCast(Call, RtnType, "cast"); - Cast->insertInto(BB, BB->end()); - ReturnInst::Create(M->getContext(), Cast, BB); + Builder.CreateRet(Builder.CreateBitOrPointerCast(Call, RtnType, "cast")); } else if (RtnType->isStructTy() || ExpectedRtnType->isStructTy()) { LLVM_DEBUG(dbgs() << "createWrapper: struct return type in bitcast: " << F->getName() << "\n"); @@ -203,9 +197,8 @@ static Function *createWrapper(Function *F, FunctionType *Ty) { Wrapper = Function::Create(Ty, Function::PrivateLinkage, F->getName() + "_bitcast_invalid", M); Wrapper->setAttributes(F->getAttributes()); - BasicBlock *BB = BasicBlock::Create(M->getContext(), "body", Wrapper); - new UnreachableInst(M->getContext(), BB); - Wrapper->setName(F->getName() + "_bitcast_invalid"); + IRBuilder<> Builder(BasicBlock::Create(M->getContext(), "body", Wrapper)); + Builder.CreateUnreachable(); } else if (!WrapperNeeded) { LLVM_DEBUG(dbgs() << "createWrapper: no wrapper needed: " << F->getName() << "\n"); diff --git a/llvm/test/CodeGen/WebAssembly/unsupported-function-bitcasts.ll b/llvm/test/CodeGen/WebAssembly/unsupported-function-bitcasts.ll index 9c638199bb6e6..1cfda8a821bd6 100644 --- a/llvm/test/CodeGen/WebAssembly/unsupported-function-bitcasts.ll +++ b/llvm/test/CodeGen/WebAssembly/unsupported-function-bitcasts.ll @@ -10,7 +10,7 @@ declare i32 @has_ptr_arg(ptr) ; CHECK-LABEL: test_invalid_rtn: ; CHECK: i32.const $push[[L0:[0-9]+]]=, 0{{$}} -; CHECK-NEXT: call $push[[L1:[0-9]+]]=, .Lhas_i64_arg_bitcast_invalid.2, $pop[[L0]]{{$}} +; CHECK-NEXT: call $push[[L1:[0-9]+]]=, .Lhas_i64_arg_bitcast_invalid.1, $pop[[L0]]{{$}} ; CHECK-NEXT: drop $pop[[L1]]{{$}} ; CHECK-NEXT: i64.const $push[[L0:[0-9]+]]=, 0{{$}} ; CHECK-NEXT: call $push[[L1:[0-9]+]]=, .Lhas_i64_arg_bitcast_invalid, $pop[[L0]]{{$}} @@ -32,7 +32,7 @@ define void @test_struct_rtn() { ; CHECK-LABEL: test_invalid_arg: ; CHECK: i32.const $push[[L0:[0-9]+]]=, 2{{$}} -; CHECK-NEXT: call $push[[L1:[0-9]+]]=, .Lhas_ptr_arg_bitcast_invalid.4, $pop[[L0]]{{$}} +; CHECK-NEXT: 
call $push[[L1:[0-9]+]]=, .Lhas_ptr_arg_bitcast_invalid.2, $pop[[L0]]{{$}} ; CHECK-NEXT: drop $pop[[L1]]{{$}} ; CHECK-NEXT: i32.const $push[[L0:[0-9]+]]=, 2{{$}} ; CHECK-NEXT: call $push[[L1:[0-9]+]]=, has_ptr_arg, $pop[[L0]]{{$}} @@ -54,8 +54,8 @@ entry: ; CHECK-NEXT: unreachable ; CHECK-NEXT: end_function -; CHECK-LABEL: .Lhas_i64_arg_bitcast_invalid.2: -; CHECK-NEXT: .functype .Lhas_i64_arg_bitcast_invalid.2 (i32) -> (i32) +; CHECK-LABEL: .Lhas_i64_arg_bitcast_invalid.1: +; CHECK-NEXT: .functype .Lhas_i64_arg_bitcast_invalid.1 (i32) -> (i32) ; CHECK-NEXT: unreachable ; CHECK-NEXT: end_function @@ -64,7 +64,7 @@ entry: ; CHECK-NEXT: unreachable ; CHECK-NEXT: end_function -; CHECK-LABEL: .Lhas_ptr_arg_bitcast_invalid.4: -; CHECK-NEXT: .functype .Lhas_ptr_arg_bitcast_invalid.4 (i32) -> (i32) +; CHECK-LABEL: .Lhas_ptr_arg_bitcast_invalid.2: +; CHECK-NEXT: .functype .Lhas_ptr_arg_bitcast_invalid.2 (i32) -> (i32) ; CHECK-NEXT: unreachable ; CHECK-NEXT: end_function From cfca2297822de682daf53124d3ca5a55fb3bfee4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicolai=20H=C3=A4hnle?= <nicolai.haehnle@amd.com> Date: Tue, 4 Nov 2025 17:57:54 -0800 Subject: [PATCH 263/313] AMDGPU: Add and clarify reserved address spaces (#166486) Address spaces 10 and 11 are reserved for future use in the sense that we plan to upstream their use. Address space 12 is used by LLPC. It is used in a workaround for an issue with SMEM accesses to PRT buffers that is specific to the LLPC ecosystem and makes no sense to upstream. --- llvm/docs/AMDGPUUsage.rst | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst index 7780c0a6dca0a..30b22a4a6d607 100644 --- a/llvm/docs/AMDGPUUsage.rst +++ b/llvm/docs/AMDGPUUsage.rst @@ -883,8 +883,9 @@ supported for the ``amdgcn`` target. Buffer Fat Pointer 7 N/A N/A 160 0 Buffer Resource 8 N/A V# 128 0x00000000000000000000000000000000 Buffer Strided Pointer (experimental) 9 *TODO* - *reserved for downstream use* 10 - *reserved for downstream use* 11 + *reserved for future use* 10 + *reserved for future use* 11 + *reserved for downstream use (LLPC)* 12 Streamout Registers 128 N/A GS_REGS ===================================== =============== =========== ================ ======= ============================ From d6fdfe0a27d3d76b8f2adff359cb099573a30f9c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicolai=20H=C3=A4hnle?= <nicolai.haehnle@amd.com> Date: Tue, 4 Nov 2025 18:18:39 -0800 Subject: [PATCH 264/313] CodeGen: Record tied virtual register operands in finalizeBundle (#166209) This is in preparation for a future AMDGPU change where we are going to create bundles before register allocation and want to rely on the TwoAddressInstructionPass handling those bundles correctly.
v2:
- simplify the virtual register check and the test
---
 llvm/lib/CodeGen/MachineInstrBundle.cpp     | 22 ++++++++++++++++++++-
 llvm/test/CodeGen/AMDGPU/finalizebundle.mir |  2 +-
 2 files changed, 22 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/CodeGen/MachineInstrBundle.cpp b/llvm/lib/CodeGen/MachineInstrBundle.cpp
index da29ffc9d2fed..88d81993fbe55 100644
--- a/llvm/lib/CodeGen/MachineInstrBundle.cpp
+++ b/llvm/lib/CodeGen/MachineInstrBundle.cpp
@@ -136,6 +136,7 @@ void llvm::finalizeBundle(MachineBasicBlock &MBB,
   SmallSetVector<Register, 8> ExternUses;
   SmallSet<Register, 8> KilledUseSet;
   SmallSet<Register, 8> UndefUseSet;
+  SmallVector<std::pair<Register, Register>> TiedOperands;
   for (auto MII = FirstMI; MII != LastMI; ++MII) {
     // Debug instructions have no effects to track.
     if (MII->isDebugInstr())
@@ -161,6 +162,15 @@ void llvm::finalizeBundle(MachineBasicBlock &MBB,
         // External def is now killed.
         KilledUseSet.insert(Reg);
       }
+      if (MO.isTied() && Reg.isVirtual()) {
+        // Record tied operand constraints that involve virtual registers so
+        // that bundles that are formed pre-register allocation reflect the
+        // relevant constraints.
+        unsigned TiedIdx = MII->findTiedOperandIdx(MO.getOperandNo());
+        MachineOperand &TiedMO = MII->getOperand(TiedIdx);
+        Register DefReg = TiedMO.getReg();
+        TiedOperands.emplace_back(DefReg, Reg);
+      }
     }
   }
@@ -203,7 +213,17 @@ void llvm::finalizeBundle(MachineBasicBlock &MBB,
     bool isKill = KilledUseSet.contains(Reg);
     bool isUndef = UndefUseSet.contains(Reg);
     MIB.addReg(Reg, getKillRegState(isKill) | getUndefRegState(isUndef) |
-                   getImplRegState(true));
+                       getImplRegState(true));
+  }
+
+  for (auto [DefReg, UseReg] : TiedOperands) {
+    unsigned DefIdx =
+        std::distance(LocalDefs.begin(), llvm::find(LocalDefs, DefReg));
+    unsigned UseIdx =
+        std::distance(ExternUses.begin(), llvm::find(ExternUses, UseReg));
+    assert(DefIdx < LocalDefs.size());
+    assert(UseIdx < ExternUses.size());
+    MIB->tieOperands(DefIdx, LocalDefs.size() + UseIdx);
+  }
 }

diff --git a/llvm/test/CodeGen/AMDGPU/finalizebundle.mir b/llvm/test/CodeGen/AMDGPU/finalizebundle.mir
index d2ec1fcbac84f..279f4298e6418 100644
--- a/llvm/test/CodeGen/AMDGPU/finalizebundle.mir
+++ b/llvm/test/CodeGen/AMDGPU/finalizebundle.mir
@@ -40,7 +40,7 @@ name: test_tied
 body: |
   bb.0:
     ; CHECK-LABEL: name: test_tied
-    ; CHECK: BUNDLE implicit-def %0, implicit-def %2, implicit %1:vgpr_32, implicit $mode, implicit $exec {
+    ; CHECK: BUNDLE implicit-def %0, implicit-def %2, implicit %1:vgpr_32(tied-def 1), implicit $mode, implicit $exec {
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY %1:vgpr_32
     ; CHECK-NEXT: [[V_FMAC_F16_e32_:%[0-9]+]]:vgpr_32 = V_FMAC_F16_e32 internal [[COPY]], internal [[COPY]], %1:vgpr_32, implicit $mode, implicit $exec
     ; CHECK-NEXT: }

From d998f92a002bbdd78156716bad60523d7ddf1233 Mon Sep 17 00:00:00 2001
From: Abhay Kanhere <abhay@kanhere.net>
Date: Tue, 4 Nov 2025 18:39:31 -0800
Subject: [PATCH 265/313] [CodeGen] MachineVerifier to check early-clobber
 constraint (#151421)

Currently, MachineVerifier does not verify the early-clobber operand
constraint. The only other machine operand constraint, TiedTo, is
already verified.
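For background, an early-clobber def is written before the instruction has
finished reading its uses, so the register allocator must not assign it the
same register as any use. The same constraint appears at the source level as
the '&' modifier in GNU inline asm; the sketch below is purely illustrative
("madd" is a placeholder mnemonic here, not a claim about any particular
ISA):

  // '&' in "=&r" marks an early-clobber output: 'out' must not share a
  // register with 'a', 'b', or 'c', because the asm template may write
  // 'out' before it has read all of its inputs.
  static inline unsigned long madd(unsigned long a, unsigned long b,
                                   unsigned long c) {
    unsigned long out;
    asm("madd %0, %1, %2, %3"
        : "=&r"(out)
        : "r"(a), "r"(b), "r"(c));
    return out;
  }

The verifier change below enforces the MI-level equivalent: any operand
whose MCInstrDesc carries the EARLY_CLOBBER constraint must be a register
operand with the earlyClobber flag set.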
--- llvm/lib/CodeGen/MachineVerifier.cpp | 8 + .../AMDGPU/AMDGPUInstructionSelector.cpp | 15 +- .../GlobalISel/llvm.amdgcn.set.inactive.ll | 10 +- .../AMDGPU/GlobalISel/mul-known-bits.i64.ll | 16 +- llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll | 1048 +++--- .../CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll | 2564 +++++++------- .../test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll | 1050 +++--- .../CodeGen/AMDGPU/GlobalISel/srem.i64.ll | 3064 ++++++++--------- .../test/CodeGen/AMDGPU/GlobalISel/udivrem.ll | 842 ++--- llvm/test/CodeGen/AMDGPU/div_v2i128.ll | 123 +- llvm/test/CodeGen/AMDGPU/fptoi.i128.ll | 266 +- .../CodeGen/AMDGPU/integer-mad-patterns.ll | 518 +-- .../AMDGPU/llvm.amdgcn.init.whole.wave-w32.ll | 74 +- llvm/test/CodeGen/AMDGPU/vector-reduce-mul.ll | 466 ++- 14 files changed, 5068 insertions(+), 4996 deletions(-) diff --git a/llvm/lib/CodeGen/MachineVerifier.cpp b/llvm/lib/CodeGen/MachineVerifier.cpp index c0710c467a2e6..fdf10480b6e05 100644 --- a/llvm/lib/CodeGen/MachineVerifier.cpp +++ b/llvm/lib/CodeGen/MachineVerifier.cpp @@ -2584,6 +2584,14 @@ MachineVerifier::visitMachineOperand(const MachineOperand *MO, unsigned MONum) { report("Extra explicit operand on non-variadic instruction", MO, MONum); } + // Verify earlyClobber def operand + if (MCID.getOperandConstraint(MONum, MCOI::EARLY_CLOBBER) != -1) { + if (!MO->isReg()) + report("Early clobber must be a register", MI); + if (!MO->isEarlyClobber()) + report("Missing earlyClobber flag", MI); + } + switch (MO->getType()) { case MachineOperand::MO_Register: { // Verify debug flag on debug instructions. Check this first because reg0 diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index 0c977416f1793..957d7164b686e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -116,8 +116,14 @@ bool AMDGPUInstructionSelector::constrainCopyLikeIntrin(MachineInstr &MI, if (!DstRC || DstRC != SrcRC) return false; - return RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI) && - RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI); + if (!RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI) || + !RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI)) + return false; + const MCInstrDesc &MCID = MI.getDesc(); + if (MCID.getOperandConstraint(0, MCOI::EARLY_CLOBBER) != -1) { + MI.getOperand(0).setIsEarlyClobber(true); + } + return true; } bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const { @@ -602,6 +608,7 @@ bool AMDGPUInstructionSelector::selectG_AMDGPU_MAD_64_32( I.setDesc(TII.get(Opc)); I.addOperand(*MF, MachineOperand::CreateImm(0)); I.addImplicitDefUseOperands(*MF); + I.getOperand(0).setIsEarlyClobber(true); return constrainSelectedInstRegOperands(I, TII, TRI, RBI); } @@ -3787,6 +3794,10 @@ bool AMDGPUInstructionSelector::selectSMFMACIntrin(MachineInstr &MI) const { MI.removeOperand(1); // Intrinsic ID MI.addOperand(VDst_In); // Readd VDst_In to the end MI.addImplicitDefUseOperands(*MI.getParent()->getParent()); + const MCInstrDesc &MCID = MI.getDesc(); + if (MCID.getOperandConstraint(0, MCOI::EARLY_CLOBBER) != -1) { + MI.getOperand(0).setIsEarlyClobber(true); + } return true; } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll index e411c23c77bbe..7b5621ff3b5a9 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll +++ 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll @@ -27,11 +27,11 @@ define amdgpu_kernel void @set_inactive_imm_poison(ptr addrspace(1) %out) { ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GCN-NEXT: v_mov_b32_e32 v0, 1 -; GCN-NEXT: v_mov_b32_e32 v0, v0 +; GCN-NEXT: v_mov_b32_e32 v1, v0 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0 ; GCN-NEXT: s_endpgm %tmp.0 = call i32 @llvm.amdgcn.set.inactive.i32(i32 1, i32 poison) #0 %tmp = call i32 @llvm.amdgcn.strict.wwm.i32(i32 %tmp.0) @@ -68,12 +68,12 @@ define amdgpu_kernel void @set_inactive_imm_poison_64(ptr addrspace(1) %out) { ; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GCN-NEXT: v_mov_b32_e32 v0, 1 ; GCN-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NEXT: v_mov_b32_e32 v0, v0 -; GCN-NEXT: v_mov_b32_e32 v1, v1 +; GCN-NEXT: v_mov_b32_e32 v2, v0 +; GCN-NEXT: v_mov_b32_e32 v3, v1 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[0:3], 0 ; GCN-NEXT: s_endpgm %tmp.0 = call i64 @llvm.amdgcn.set.inactive.i64(i64 1, i64 poison) #0 %tmp = call i64 @llvm.amdgcn.strict.wwm.i64(i64 %tmp.0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll index 1cd9c0bfeb7e6..2351c969d5e49 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll @@ -165,10 +165,10 @@ define amdgpu_kernel void @v_mul_i64_zext_src0_src1(ptr addrspace(1) %out, ptr a ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_dword v1, v0, s[2:3] -; GFX10-NEXT: global_load_dword v2, v0, s[6:7] +; GFX10-NEXT: global_load_dword v2, v0, s[2:3] +; GFX10-NEXT: global_load_dword v3, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s2, v1, v2, 0 +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s2, v2, v3, 0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm @@ -179,15 +179,15 @@ define amdgpu_kernel void @v_mul_i64_zext_src0_src1(ptr addrspace(1) %out, ptr a ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX11-NEXT: global_load_b32 v0, v0, s[4:5] +; GFX11-NEXT: global_load_b32 v2, v0, s[2:3] +; GFX11-NEXT: global_load_b32 v3, v0, s[4:5] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v1, v0, 0 +; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v2, v3, 0 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll index 637aaf7529364..7f10ee4c17450 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll @@ -546,10 +546,11 @@ define i64 @v_mul_i64(i64 
%num, i64 %den) { ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v4, v0 -; GCN-NEXT: v_mov_b32_e32 v5, v1 -; GCN-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v2, 0 -; GCN-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v4, v3, v[1:2] -; GCN-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v2, v[3:4] +; GCN-NEXT: v_mov_b32_e32 v5, v2 +; GCN-NEXT: v_mov_b32_e32 v6, v1 +; GCN-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v5, 0 +; GCN-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v4, v3, v[1:2] +; GCN-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v6, v5, v[7:8] ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_mul_i64: @@ -742,10 +743,10 @@ define i96 @v_mul_i96(i96 %num, i96 %den) { ; GCN-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v5, 0 ; GCN-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v7, v4, v[0:1] ; GCN-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v3, 0 -; GCN-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v2, v3, v[8:9] -; GCN-NEXT: v_mov_b32_e32 v2, v8 -; GCN-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v6, v4, v[1:2] -; GCN-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v7, v3, v[1:2] +; GCN-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v2, v3, v[8:9] +; GCN-NEXT: v_mov_b32_e32 v2, v10 +; GCN-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v6, v4, v[1:2] +; GCN-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v7, v3, v[8:9] ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_mul_i96: @@ -758,8 +759,8 @@ define i96 @v_mul_i96(i96 %num, i96 %den) { ; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v6, v3, 0 ; GFX10-NEXT: v_mad_u64_u32 v[8:9], s4, v2, v3, v[8:9] ; GFX10-NEXT: v_mov_b32_e32 v2, v8 -; GFX10-NEXT: v_mad_u64_u32 v[1:2], s4, v6, v4, v[1:2] -; GFX10-NEXT: v_mad_u64_u32 v[1:2], s4, v7, v3, v[1:2] +; GFX10-NEXT: v_mad_u64_u32 v[8:9], s4, v6, v4, v[1:2] +; GFX10-NEXT: v_mad_u64_u32 v[1:2], s4, v7, v3, v[8:9] ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_mul_i96: @@ -771,8 +772,8 @@ define i96 @v_mul_i96(i96 %num, i96 %den) { ; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v6, v3, 0 ; GFX11-NEXT: v_mad_u64_u32 v[9:10], null, v2, v3, v[8:9] ; GFX11-NEXT: v_mov_b32_e32 v2, v9 -; GFX11-NEXT: v_mad_u64_u32 v[1:2], null, v6, v4, v[1:2] -; GFX11-NEXT: v_mad_u64_u32 v[1:2], null, v7, v3, v[1:2] +; GFX11-NEXT: v_mad_u64_u32 v[8:9], null, v6, v4, v[1:2] +; GFX11-NEXT: v_mad_u64_u32 v[1:2], null, v7, v3, v[8:9] ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: v_mul_i96: @@ -791,8 +792,8 @@ define i96 @v_mul_i96(i96 %num, i96 %den) { ; GFX12-NEXT: v_mad_co_u64_u32 v[8:9], null, v2, v3, v[8:9] ; GFX12-NEXT: v_mov_b32_e32 v2, v8 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], null, v6, v4, v[1:2] -; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], null, v7, v3, v[1:2] +; GFX12-NEXT: v_mad_co_u64_u32 v[8:9], null, v6, v4, v[1:2] +; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], null, v7, v3, v[8:9] ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX1250-LABEL: v_mul_i96: @@ -808,10 +809,10 @@ define i96 @v_mul_i96(i96 %num, i96 %den) { ; GFX1250-NEXT: v_mad_u32 v9, v2, v3, v5 ; GFX1250-NEXT: v_mov_b32_e32 v8, v1 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_mad_nc_u64_u32 v[4:5], v6, v4, v[8:9] -; GFX1250-NEXT: v_mad_nc_u64_u32 v[2:3], v7, v3, v[4:5] +; GFX1250-NEXT: v_mad_nc_u64_u32 v[10:11], v6, v4, v[8:9] +; GFX1250-NEXT: v_mad_nc_u64_u32 v[4:5], v7, v3, v[10:11] ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v3 +; GFX1250-NEXT: v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] 
%result = mul i96 %num, %den ret i96 %result @@ -1071,18 +1072,20 @@ define i128 @v_mul_i128(i128 %num, i128 %den) { ; GFX7-NEXT: v_mov_b32_e32 v9, v1 ; GFX7-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v6, 0 ; GFX7-NEXT: v_mov_b32_e32 v10, v2 +; GFX7-NEXT: v_mov_b32_e32 v11, v3 +; GFX7-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v5, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v12, v4 +; GFX7-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v12, 0 +; GFX7-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v10, v12, v[2:3] ; GFX7-NEXT: v_mul_lo_u32 v7, v8, v7 -; GFX7-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v9, v5, v[0:1] -; GFX7-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v4, 0 -; GFX7-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v10, v4, v[11:12] ; GFX7-NEXT: v_mul_lo_u32 v6, v9, v6 -; GFX7-NEXT: v_mov_b32_e32 v2, v11 -; GFX7-NEXT: v_mad_u64_u32 v[1:2], vcc, v8, v5, v[1:2] -; GFX7-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v4, v[1:2] -; GFX7-NEXT: v_addc_u32_e64 v7, s[4:5], v12, v7, s[4:5] -; GFX7-NEXT: v_addc_u32_e32 v6, vcc, v7, v6, vcc -; GFX7-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v10, v5, v[6:7] -; GFX7-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v3, v4, v[5:6] +; GFX7-NEXT: v_mov_b32_e32 v2, v13 +; GFX7-NEXT: v_mad_u64_u32 v[3:4], vcc, v8, v5, v[1:2] +; GFX7-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v12, v[3:4] +; GFX7-NEXT: v_addc_u32_e64 v3, s[4:5], v14, v7, s[4:5] +; GFX7-NEXT: v_addc_u32_e32 v3, vcc, v3, v6, vcc +; GFX7-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v5, v[3:4] +; GFX7-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v11, v12, v[6:7] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_mul_i128: @@ -1092,18 +1095,20 @@ define i128 @v_mul_i128(i128 %num, i128 %den) { ; GFX8-NEXT: v_mov_b32_e32 v9, v1 ; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v6, 0 ; GFX8-NEXT: v_mov_b32_e32 v10, v2 +; GFX8-NEXT: v_mov_b32_e32 v11, v3 +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v5, v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v12, v4 +; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v12, 0 +; GFX8-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v10, v12, v[2:3] ; GFX8-NEXT: v_mul_lo_u32 v7, v8, v7 -; GFX8-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v9, v5, v[0:1] -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v4, 0 -; GFX8-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v10, v4, v[11:12] ; GFX8-NEXT: v_mul_lo_u32 v6, v9, v6 -; GFX8-NEXT: v_mov_b32_e32 v2, v11 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], vcc, v8, v5, v[1:2] -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v4, v[1:2] -; GFX8-NEXT: v_addc_u32_e64 v7, s[4:5], v12, v7, s[4:5] -; GFX8-NEXT: v_addc_u32_e32 v6, vcc, v7, v6, vcc -; GFX8-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v10, v5, v[6:7] -; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v3, v4, v[5:6] +; GFX8-NEXT: v_mov_b32_e32 v2, v13 +; GFX8-NEXT: v_mad_u64_u32 v[3:4], vcc, v8, v5, v[1:2] +; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v12, v[3:4] +; GFX8-NEXT: v_addc_u32_e64 v3, s[4:5], v14, v7, s[4:5] +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v3, v6, vcc +; GFX8-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v5, v[3:4] +; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v11, v12, v[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_mul_i128: @@ -1113,18 +1118,20 @@ define i128 @v_mul_i128(i128 %num, i128 %den) { ; GFX9-NEXT: v_mov_b32_e32 v9, v1 ; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v6, 0 ; GFX9-NEXT: v_mov_b32_e32 v10, v2 +; GFX9-NEXT: v_mov_b32_e32 v11, v3 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v5, v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v12, v4 +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v12, 0 +; GFX9-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v10, v12, v[2:3] ; GFX9-NEXT: v_mul_lo_u32 v7, v8, v7 
-; GFX9-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v9, v5, v[0:1] -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v4, 0 -; GFX9-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v10, v4, v[11:12] ; GFX9-NEXT: v_mul_lo_u32 v6, v9, v6 -; GFX9-NEXT: v_mov_b32_e32 v2, v11 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], vcc, v8, v5, v[1:2] -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v4, v[1:2] -; GFX9-NEXT: v_addc_co_u32_e64 v7, s[4:5], v12, v7, s[4:5] -; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, v7, v6, vcc -; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v10, v5, v[6:7] -; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v3, v4, v[5:6] +; GFX9-NEXT: v_mov_b32_e32 v2, v13 +; GFX9-NEXT: v_mad_u64_u32 v[3:4], vcc, v8, v5, v[1:2] +; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v12, v[3:4] +; GFX9-NEXT: v_addc_co_u32_e64 v3, s[4:5], v14, v7, s[4:5] +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v6, vcc +; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v5, v[3:4] +; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v11, v12, v[6:7] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_mul_i128: @@ -1138,11 +1145,11 @@ define i128 @v_mul_i128(i128 %num, i128 %den) { ; GFX10-NEXT: v_mul_lo_u32 v6, v9, v6 ; GFX10-NEXT: v_mad_u64_u32 v[11:12], s4, v9, v5, v[0:1] ; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v8, v4, 0 -; GFX10-NEXT: v_mad_u64_u32 v[11:12], s4, v10, v4, v[11:12] -; GFX10-NEXT: v_mov_b32_e32 v2, v11 -; GFX10-NEXT: v_mad_u64_u32 v[1:2], vcc_lo, v8, v5, v[1:2] -; GFX10-NEXT: v_mad_u64_u32 v[1:2], s4, v9, v4, v[1:2] -; GFX10-NEXT: v_add_co_ci_u32_e64 v7, s4, v12, v7, s4 +; GFX10-NEXT: v_mad_u64_u32 v[13:14], s4, v10, v4, v[11:12] +; GFX10-NEXT: v_mov_b32_e32 v2, v13 +; GFX10-NEXT: v_mad_u64_u32 v[11:12], vcc_lo, v8, v5, v[1:2] +; GFX10-NEXT: v_mad_u64_u32 v[1:2], s4, v9, v4, v[11:12] +; GFX10-NEXT: v_add_co_ci_u32_e64 v7, s4, v14, v7, s4 ; GFX10-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, v7, v6, vcc_lo ; GFX10-NEXT: v_mad_u64_u32 v[5:6], s4, v10, v5, v[6:7] ; GFX10-NEXT: v_mad_u64_u32 v[3:4], s4, v3, v4, v[5:6] @@ -1155,15 +1162,16 @@ define i128 @v_mul_i128(i128 %num, i128 %den) { ; GFX11-NEXT: v_dual_mov_b32 v10, v2 :: v_dual_mov_b32 v11, v4 ; GFX11-NEXT: v_mov_b32_e32 v12, v3 ; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v8, v6, 0 -; GFX11-NEXT: v_mul_lo_u32 v4, v9, v6 -; GFX11-NEXT: v_mul_lo_u32 v6, v8, v7 +; GFX11-NEXT: v_mul_lo_u32 v7, v8, v7 +; GFX11-NEXT: v_mul_lo_u32 v6, v9, v6 ; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v9, v5, v[0:1] ; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v8, v11, 0 -; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v10, v11, v[2:3] -; GFX11-NEXT: v_mad_u64_u32 v[1:2], vcc_lo, v8, v5, v[1:2] -; GFX11-NEXT: v_mad_u64_u32 v[1:2], s0, v9, v11, v[1:2] -; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v3, v6, s0 -; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v3, v4, vcc_lo +; GFX11-NEXT: v_mad_u64_u32 v[13:14], null, v10, v11, v[2:3] +; GFX11-NEXT: v_mov_b32_e32 v2, v13 +; GFX11-NEXT: v_mad_u64_u32 v[3:4], vcc_lo, v8, v5, v[1:2] +; GFX11-NEXT: v_mad_u64_u32 v[1:2], s0, v9, v11, v[3:4] +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v14, v7, s0 +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v3, v6, vcc_lo ; GFX11-NEXT: v_mad_u64_u32 v[6:7], null, v10, v5, v[3:4] ; GFX11-NEXT: v_mad_u64_u32 v[3:4], null, v12, v11, v[6:7] ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -1184,14 +1192,14 @@ define i128 @v_mul_i128(i128 %num, i128 %den) { ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_mad_co_u64_u32 v[11:12], null, v9, v5, v[0:1] ; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v8, v4, 0 -; GFX12-NEXT: v_mad_co_u64_u32 
v[11:12], null, v10, v4, v[11:12] +; GFX12-NEXT: v_mad_co_u64_u32 v[13:14], null, v10, v4, v[11:12] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_mov_b32_e32 v2, v11 -; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], vcc_lo, v8, v5, v[1:2] +; GFX12-NEXT: v_mov_b32_e32 v2, v13 +; GFX12-NEXT: v_mad_co_u64_u32 v[11:12], vcc_lo, v8, v5, v[1:2] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], s0, v9, v4, v[1:2] +; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], s0, v9, v4, v[11:12] ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_add_co_ci_u32_e64 v7, null, v12, v7, s0 +; GFX12-NEXT: v_add_co_ci_u32_e64 v7, null, v14, v7, s0 ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add_co_ci_u32_e64 v6, null, v7, v6, vcc_lo @@ -1210,16 +1218,16 @@ define i128 @v_mul_i128(i128 %num, i128 %den) { ; GFX1250-NEXT: v_mad_nc_u64_u32 v[10:11], v9, v5, v[0:1] ; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v8, v4, 0 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1250-NEXT: v_mad_nc_u64_u32 v[10:11], v2, v4, v[10:11] -; GFX1250-NEXT: v_mov_b32_e32 v12, v1 +; GFX1250-NEXT: v_mad_nc_u64_u32 v[12:13], v2, v4, v[10:11] +; GFX1250-NEXT: v_mov_b32_e32 v10, v1 ; GFX1250-NEXT: v_mul_lo_u32 v1, v9, v6 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_mov_b32_e32 v13, v10 -; GFX1250-NEXT: v_mad_co_u64_u32 v[12:13], vcc_lo, v8, v5, v[12:13] +; GFX1250-NEXT: v_mov_b32_e32 v11, v12 +; GFX1250-NEXT: v_mad_co_u64_u32 v[14:15], vcc_lo, v8, v5, v[10:11] ; GFX1250-NEXT: v_mul_lo_u32 v8, v8, v7 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_mad_co_u64_u32 v[6:7], s0, v9, v4, v[12:13] -; GFX1250-NEXT: v_add_co_ci_u32_e64 v8, null, v11, v8, s0 +; GFX1250-NEXT: v_mad_co_u64_u32 v[6:7], s0, v9, v4, v[14:15] +; GFX1250-NEXT: v_add_co_ci_u32_e64 v8, null, v13, v8, s0 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v8, v1, vcc_lo ; GFX1250-NEXT: v_mad_u32 v1, v2, v5, v1 @@ -2401,207 +2409,216 @@ define i256 @v_mul_i256(i256 %num, i256 %den) { ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v0, v14, 0 -; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v0, v12, 0 -; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v1, v13, v[16:17] +; GFX7-NEXT: v_mul_lo_u32 v28, v3, v12 +; GFX7-NEXT: v_mul_lo_u32 v27, v5, v10 +; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v13, v[16:17] +; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v0, v12, 0 +; GFX7-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v2, v12, v[18:19] +; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v11, v[16:17] +; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v3, v11, v[20:21] ; GFX7-NEXT: v_mad_u64_u32 v[20:21], s[6:7], v0, v10, 0 -; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v2, v12, v[16:17] -; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v11, v[18:19] -; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v3, v11, v[16:17] -; GFX7-NEXT: v_mad_u64_u32 v[18:19], vcc, v2, v10, v[18:19] -; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v4, v10, v[16:17] -; GFX7-NEXT: v_cndmask_b32_e64 v22, 0, 1, s[4:5] -; GFX7-NEXT: v_addc_u32_e32 v22, vcc, 0, v22, vcc -; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[8:9], v5, v9, v[16:17] -; GFX7-NEXT: 
v_mad_u64_u32 v[18:19], vcc, v3, v9, v[18:19] -; GFX7-NEXT: v_addc_u32_e32 v22, vcc, 0, v22, vcc -; GFX7-NEXT: v_mad_u64_u32 v[18:19], vcc, v4, v8, v[18:19] -; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v6, v8, v[16:17] -; GFX7-NEXT: v_mad_u64_u32 v[20:21], s[6:7], v1, v9, v[20:21] -; GFX7-NEXT: v_addc_u32_e32 v23, vcc, 0, v22, vcc -; GFX7-NEXT: v_mov_b32_e32 v22, v18 -; GFX7-NEXT: v_mov_b32_e32 v18, v19 -; GFX7-NEXT: v_mov_b32_e32 v19, v16 -; GFX7-NEXT: v_mad_u64_u32 v[18:19], vcc, v0, v13, v[18:19] -; GFX7-NEXT: v_mul_lo_u32 v16, v6, v9 +; GFX7-NEXT: v_mad_u64_u32 v[22:23], s[6:7], v4, v10, v[16:17] +; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v1, v9, v[20:21] +; GFX7-NEXT: v_mad_u64_u32 v[20:21], s[8:9], v5, v9, v[22:23] +; GFX7-NEXT: v_mad_u64_u32 v[22:23], vcc, v2, v10, v[18:19] +; GFX7-NEXT: v_cndmask_b32_e64 v24, 0, 1, s[4:5] +; GFX7-NEXT: v_addc_u32_e32 v24, vcc, 0, v24, vcc +; GFX7-NEXT: v_mad_u64_u32 v[18:19], vcc, v3, v9, v[22:23] +; GFX7-NEXT: v_addc_u32_e32 v24, vcc, 0, v24, vcc +; GFX7-NEXT: v_mad_u64_u32 v[22:23], vcc, v4, v8, v[18:19] +; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v6, v8, v[20:21] +; GFX7-NEXT: v_addc_u32_e32 v26, vcc, 0, v24, vcc +; GFX7-NEXT: v_mov_b32_e32 v21, v22 +; GFX7-NEXT: v_mov_b32_e32 v22, v23 +; GFX7-NEXT: v_mov_b32_e32 v23, v18 +; GFX7-NEXT: v_mad_u64_u32 v[24:25], vcc, v0, v13, v[22:23] +; GFX7-NEXT: v_mul_lo_u32 v18, v6, v9 ; GFX7-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[6:7] -; GFX7-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v2, v8, v[20:21] -; GFX7-NEXT: v_addc_u32_e64 v24, s[4:5], 0, v6, s[4:5] -; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v12, v[18:19] -; GFX7-NEXT: v_mad_u64_u32 v[21:22], s[10:11], v0, v11, v[21:22] -; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[6:7], v2, v11, v[18:19] -; GFX7-NEXT: v_mul_lo_u32 v26, v4, v11 -; GFX7-NEXT: v_mul_lo_u32 v27, v3, v12 -; GFX7-NEXT: v_mad_u64_u32 v[11:12], s[8:9], v3, v10, v[18:19] -; GFX7-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[10:11] -; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[10:11], v1, v10, v[21:22] -; GFX7-NEXT: v_mul_lo_u32 v25, v5, v10 -; GFX7-NEXT: v_mul_lo_u32 v28, v2, v13 -; GFX7-NEXT: v_mad_u64_u32 v[12:13], s[12:13], v4, v9, v[11:12] +; GFX7-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v2, v8, v[16:17] +; GFX7-NEXT: v_addc_u32_e64 v6, s[4:5], 0, v6, s[4:5] +; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v1, v12, v[24:25] +; GFX7-NEXT: v_mov_b32_e32 v20, v23 +; GFX7-NEXT: v_mul_lo_u32 v25, v4, v11 +; GFX7-NEXT: v_mad_u64_u32 v[23:24], s[6:7], v2, v11, v[16:17] +; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[10:11], v0, v11, v[20:21] +; GFX7-NEXT: v_mad_u64_u32 v[11:12], s[8:9], v3, v10, v[23:24] +; GFX7-NEXT: v_cndmask_b32_e64 v23, 0, 1, s[10:11] +; GFX7-NEXT: v_mad_u64_u32 v[20:21], s[10:11], v1, v10, v[16:17] +; GFX7-NEXT: v_addc_u32_e64 v24, s[10:11], 0, v23, s[10:11] +; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[10:11], v2, v9, v[20:21] +; GFX7-NEXT: v_mad_u64_u32 v[20:21], s[12:13], v4, v9, v[11:12] ; GFX7-NEXT: v_mad_u64_u32 v[10:11], s[14:15], v0, v8, 0 -; GFX7-NEXT: v_addc_u32_e64 v22, s[10:11], 0, v6, s[10:11] -; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[10:11], v2, v9, v[18:19] -; GFX7-NEXT: v_mov_b32_e32 v21, v20 -; GFX7-NEXT: v_mov_b32_e32 v20, v11 -; GFX7-NEXT: v_mad_u64_u32 v[20:21], s[16:17], v0, v9, v[20:21] -; GFX7-NEXT: v_addc_u32_e64 v2, s[10:11], 0, v22, s[10:11] -; GFX7-NEXT: v_mad_u64_u32 v[3:4], s[10:11], v3, v8, v[18:19] -; GFX7-NEXT: v_mad_u64_u32 v[5:6], s[14:15], v5, v8, v[12:13] -; GFX7-NEXT: v_addc_u32_e64 v11, s[10:11], 0, v2, s[10:11] +; GFX7-NEXT: v_mov_b32_e32 v12, v22 +; GFX7-NEXT: v_mul_lo_u32 
v2, v2, v13 +; GFX7-NEXT: v_mad_u64_u32 v[22:23], s[16:17], v0, v9, v[11:12] +; GFX7-NEXT: v_addc_u32_e64 v13, s[10:11], 0, v24, s[10:11] +; GFX7-NEXT: v_mad_u64_u32 v[11:12], s[14:15], v5, v8, v[20:21] +; GFX7-NEXT: v_mad_u64_u32 v[4:5], s[10:11], v3, v8, v[16:17] +; GFX7-NEXT: v_addc_u32_e64 v16, s[10:11], 0, v13, s[10:11] ; GFX7-NEXT: v_mul_lo_u32 v9, v1, v14 -; GFX7-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[16:17] -; GFX7-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v1, v8, v[20:21] -; GFX7-NEXT: v_addc_u32_e64 v3, s[10:11], v12, v3, s[10:11] +; GFX7-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[16:17] +; GFX7-NEXT: v_mad_u64_u32 v[13:14], s[10:11], v1, v8, v[22:23] +; GFX7-NEXT: v_addc_u32_e64 v3, s[10:11], v3, v4, s[10:11] ; GFX7-NEXT: v_mul_lo_u32 v0, v0, v15 -; GFX7-NEXT: v_addc_u32_e64 v4, s[10:11], v24, v4, s[10:11] -; GFX7-NEXT: v_addc_u32_e64 v5, s[10:11], v11, v5, s[10:11] -; GFX7-NEXT: v_addc_u32_e64 v6, s[10:11], v23, v6, s[10:11] -; GFX7-NEXT: v_addc_u32_e64 v0, s[10:11], v17, v0, s[10:11] +; GFX7-NEXT: v_addc_u32_e64 v4, s[10:11], v6, v5, s[10:11] +; GFX7-NEXT: v_addc_u32_e64 v5, s[10:11], v16, v11, s[10:11] +; GFX7-NEXT: v_addc_u32_e64 v6, s[10:11], v26, v12, s[10:11] +; GFX7-NEXT: v_addc_u32_e64 v0, s[10:11], v19, v0, s[10:11] ; GFX7-NEXT: v_addc_u32_e64 v0, s[10:11], v0, v9, s[14:15] -; GFX7-NEXT: v_addc_u32_e64 v0, s[10:11], v0, v28, s[12:13] -; GFX7-NEXT: v_addc_u32_e64 v0, s[8:9], v0, v27, s[8:9] -; GFX7-NEXT: v_addc_u32_e64 v0, s[6:7], v0, v26, s[6:7] -; GFX7-NEXT: v_addc_u32_e64 v0, s[4:5], v0, v25, s[4:5] -; GFX7-NEXT: v_addc_u32_e32 v0, vcc, v0, v16, vcc -; GFX7-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v7, v8, v[0:1] +; GFX7-NEXT: v_addc_u32_e64 v0, s[10:11], v0, v2, s[12:13] +; GFX7-NEXT: v_addc_u32_e64 v0, s[8:9], v0, v28, s[8:9] +; GFX7-NEXT: v_addc_u32_e64 v0, s[6:7], v0, v25, s[6:7] +; GFX7-NEXT: v_addc_u32_e64 v0, s[4:5], v0, v27, s[4:5] +; GFX7-NEXT: v_addc_u32_e32 v0, vcc, v0, v18, vcc +; GFX7-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v7, v8, v[0:1] ; GFX7-NEXT: v_mov_b32_e32 v0, v10 +; GFX7-NEXT: v_mov_b32_e32 v1, v13 +; GFX7-NEXT: v_mov_b32_e32 v2, v14 +; GFX7-NEXT: v_mov_b32_e32 v7, v11 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_mul_i256: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v0, v14, 0 -; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v0, v12, 0 -; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v1, v13, v[16:17] +; GFX8-NEXT: v_mul_lo_u32 v28, v3, v12 +; GFX8-NEXT: v_mul_lo_u32 v27, v5, v10 +; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v13, v[16:17] +; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v0, v12, 0 +; GFX8-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v2, v12, v[18:19] +; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v11, v[16:17] +; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v3, v11, v[20:21] ; GFX8-NEXT: v_mad_u64_u32 v[20:21], s[6:7], v0, v10, 0 -; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v2, v12, v[16:17] -; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v11, v[18:19] -; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v3, v11, v[16:17] -; GFX8-NEXT: v_mad_u64_u32 v[18:19], vcc, v2, v10, v[18:19] -; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v4, v10, v[16:17] -; GFX8-NEXT: v_cndmask_b32_e64 v22, 0, 1, s[4:5] -; GFX8-NEXT: v_addc_u32_e32 v22, vcc, 0, v22, vcc -; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[8:9], v5, v9, v[16:17] -; GFX8-NEXT: v_mad_u64_u32 v[18:19], vcc, v3, v9, v[18:19] -; GFX8-NEXT: v_addc_u32_e32 v22, vcc, 0, v22, vcc -; GFX8-NEXT: v_mad_u64_u32 v[18:19], vcc, v4, v8, v[18:19] -; GFX8-NEXT: 
v_mad_u64_u32 v[16:17], s[4:5], v6, v8, v[16:17] -; GFX8-NEXT: v_mad_u64_u32 v[20:21], s[6:7], v1, v9, v[20:21] -; GFX8-NEXT: v_addc_u32_e32 v23, vcc, 0, v22, vcc -; GFX8-NEXT: v_mov_b32_e32 v22, v18 -; GFX8-NEXT: v_mov_b32_e32 v18, v19 -; GFX8-NEXT: v_mov_b32_e32 v19, v16 -; GFX8-NEXT: v_mad_u64_u32 v[18:19], vcc, v0, v13, v[18:19] -; GFX8-NEXT: v_mul_lo_u32 v16, v6, v9 +; GFX8-NEXT: v_mad_u64_u32 v[22:23], s[6:7], v4, v10, v[16:17] +; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v1, v9, v[20:21] +; GFX8-NEXT: v_mad_u64_u32 v[20:21], s[8:9], v5, v9, v[22:23] +; GFX8-NEXT: v_mad_u64_u32 v[22:23], vcc, v2, v10, v[18:19] +; GFX8-NEXT: v_cndmask_b32_e64 v24, 0, 1, s[4:5] +; GFX8-NEXT: v_addc_u32_e32 v24, vcc, 0, v24, vcc +; GFX8-NEXT: v_mad_u64_u32 v[18:19], vcc, v3, v9, v[22:23] +; GFX8-NEXT: v_addc_u32_e32 v24, vcc, 0, v24, vcc +; GFX8-NEXT: v_mad_u64_u32 v[22:23], vcc, v4, v8, v[18:19] +; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v6, v8, v[20:21] +; GFX8-NEXT: v_addc_u32_e32 v26, vcc, 0, v24, vcc +; GFX8-NEXT: v_mov_b32_e32 v21, v22 +; GFX8-NEXT: v_mov_b32_e32 v22, v23 +; GFX8-NEXT: v_mov_b32_e32 v23, v18 +; GFX8-NEXT: v_mad_u64_u32 v[24:25], vcc, v0, v13, v[22:23] +; GFX8-NEXT: v_mul_lo_u32 v18, v6, v9 ; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[6:7] -; GFX8-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v2, v8, v[20:21] -; GFX8-NEXT: v_addc_u32_e64 v24, s[4:5], 0, v6, s[4:5] -; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v12, v[18:19] -; GFX8-NEXT: v_mad_u64_u32 v[21:22], s[10:11], v0, v11, v[21:22] -; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[6:7], v2, v11, v[18:19] -; GFX8-NEXT: v_mul_lo_u32 v26, v4, v11 -; GFX8-NEXT: v_mul_lo_u32 v27, v3, v12 -; GFX8-NEXT: v_mad_u64_u32 v[11:12], s[8:9], v3, v10, v[18:19] -; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[10:11] -; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[10:11], v1, v10, v[21:22] -; GFX8-NEXT: v_mul_lo_u32 v25, v5, v10 -; GFX8-NEXT: v_mul_lo_u32 v28, v2, v13 -; GFX8-NEXT: v_mad_u64_u32 v[12:13], s[12:13], v4, v9, v[11:12] +; GFX8-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v2, v8, v[16:17] +; GFX8-NEXT: v_addc_u32_e64 v6, s[4:5], 0, v6, s[4:5] +; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v1, v12, v[24:25] +; GFX8-NEXT: v_mov_b32_e32 v20, v23 +; GFX8-NEXT: v_mul_lo_u32 v25, v4, v11 +; GFX8-NEXT: v_mad_u64_u32 v[23:24], s[6:7], v2, v11, v[16:17] +; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[10:11], v0, v11, v[20:21] +; GFX8-NEXT: v_mad_u64_u32 v[11:12], s[8:9], v3, v10, v[23:24] +; GFX8-NEXT: v_cndmask_b32_e64 v23, 0, 1, s[10:11] +; GFX8-NEXT: v_mad_u64_u32 v[20:21], s[10:11], v1, v10, v[16:17] +; GFX8-NEXT: v_addc_u32_e64 v24, s[10:11], 0, v23, s[10:11] +; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[10:11], v2, v9, v[20:21] +; GFX8-NEXT: v_mad_u64_u32 v[20:21], s[12:13], v4, v9, v[11:12] ; GFX8-NEXT: v_mad_u64_u32 v[10:11], s[14:15], v0, v8, 0 -; GFX8-NEXT: v_addc_u32_e64 v22, s[10:11], 0, v6, s[10:11] -; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[10:11], v2, v9, v[18:19] -; GFX8-NEXT: v_mov_b32_e32 v21, v20 -; GFX8-NEXT: v_mov_b32_e32 v20, v11 -; GFX8-NEXT: v_mad_u64_u32 v[20:21], s[16:17], v0, v9, v[20:21] -; GFX8-NEXT: v_addc_u32_e64 v2, s[10:11], 0, v22, s[10:11] -; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[10:11], v3, v8, v[18:19] -; GFX8-NEXT: v_mad_u64_u32 v[5:6], s[14:15], v5, v8, v[12:13] -; GFX8-NEXT: v_addc_u32_e64 v11, s[10:11], 0, v2, s[10:11] +; GFX8-NEXT: v_mov_b32_e32 v12, v22 +; GFX8-NEXT: v_mul_lo_u32 v2, v2, v13 +; GFX8-NEXT: v_mad_u64_u32 v[22:23], s[16:17], v0, v9, v[11:12] +; GFX8-NEXT: v_addc_u32_e64 v13, s[10:11], 0, v24, s[10:11] +; GFX8-NEXT: v_mad_u64_u32 
v[11:12], s[14:15], v5, v8, v[20:21] +; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[10:11], v3, v8, v[16:17] +; GFX8-NEXT: v_addc_u32_e64 v16, s[10:11], 0, v13, s[10:11] ; GFX8-NEXT: v_mul_lo_u32 v9, v1, v14 -; GFX8-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[16:17] -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v1, v8, v[20:21] -; GFX8-NEXT: v_addc_u32_e64 v3, s[10:11], v12, v3, s[10:11] +; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[16:17] +; GFX8-NEXT: v_mad_u64_u32 v[13:14], s[10:11], v1, v8, v[22:23] +; GFX8-NEXT: v_addc_u32_e64 v3, s[10:11], v3, v4, s[10:11] ; GFX8-NEXT: v_mul_lo_u32 v0, v0, v15 -; GFX8-NEXT: v_addc_u32_e64 v4, s[10:11], v24, v4, s[10:11] -; GFX8-NEXT: v_addc_u32_e64 v5, s[10:11], v11, v5, s[10:11] -; GFX8-NEXT: v_addc_u32_e64 v6, s[10:11], v23, v6, s[10:11] -; GFX8-NEXT: v_addc_u32_e64 v0, s[10:11], v17, v0, s[10:11] +; GFX8-NEXT: v_addc_u32_e64 v4, s[10:11], v6, v5, s[10:11] +; GFX8-NEXT: v_addc_u32_e64 v5, s[10:11], v16, v11, s[10:11] +; GFX8-NEXT: v_addc_u32_e64 v6, s[10:11], v26, v12, s[10:11] +; GFX8-NEXT: v_addc_u32_e64 v0, s[10:11], v19, v0, s[10:11] ; GFX8-NEXT: v_addc_u32_e64 v0, s[10:11], v0, v9, s[14:15] -; GFX8-NEXT: v_addc_u32_e64 v0, s[10:11], v0, v28, s[12:13] -; GFX8-NEXT: v_addc_u32_e64 v0, s[8:9], v0, v27, s[8:9] -; GFX8-NEXT: v_addc_u32_e64 v0, s[6:7], v0, v26, s[6:7] -; GFX8-NEXT: v_addc_u32_e64 v0, s[4:5], v0, v25, s[4:5] -; GFX8-NEXT: v_addc_u32_e32 v0, vcc, v0, v16, vcc -; GFX8-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v7, v8, v[0:1] +; GFX8-NEXT: v_addc_u32_e64 v0, s[10:11], v0, v2, s[12:13] +; GFX8-NEXT: v_addc_u32_e64 v0, s[8:9], v0, v28, s[8:9] +; GFX8-NEXT: v_addc_u32_e64 v0, s[6:7], v0, v25, s[6:7] +; GFX8-NEXT: v_addc_u32_e64 v0, s[4:5], v0, v27, s[4:5] +; GFX8-NEXT: v_addc_u32_e32 v0, vcc, v0, v18, vcc +; GFX8-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v7, v8, v[0:1] ; GFX8-NEXT: v_mov_b32_e32 v0, v10 +; GFX8-NEXT: v_mov_b32_e32 v1, v13 +; GFX8-NEXT: v_mov_b32_e32 v2, v14 +; GFX8-NEXT: v_mov_b32_e32 v7, v11 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_mul_i256: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v0, v14, 0 -; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v0, v12, 0 -; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v1, v13, v[16:17] +; GFX9-NEXT: v_mul_lo_u32 v28, v3, v12 +; GFX9-NEXT: v_mul_lo_u32 v27, v5, v10 +; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v13, v[16:17] +; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v0, v12, 0 +; GFX9-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v2, v12, v[18:19] +; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v11, v[16:17] +; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v3, v11, v[20:21] ; GFX9-NEXT: v_mad_u64_u32 v[20:21], s[6:7], v0, v10, 0 -; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v2, v12, v[16:17] -; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v11, v[18:19] -; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v3, v11, v[16:17] -; GFX9-NEXT: v_mad_u64_u32 v[18:19], vcc, v2, v10, v[18:19] -; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v4, v10, v[16:17] -; GFX9-NEXT: v_cndmask_b32_e64 v22, 0, 1, s[4:5] -; GFX9-NEXT: v_addc_co_u32_e32 v22, vcc, 0, v22, vcc -; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[8:9], v5, v9, v[16:17] -; GFX9-NEXT: v_mad_u64_u32 v[18:19], vcc, v3, v9, v[18:19] -; GFX9-NEXT: v_addc_co_u32_e32 v22, vcc, 0, v22, vcc -; GFX9-NEXT: v_mad_u64_u32 v[18:19], vcc, v4, v8, v[18:19] -; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v6, v8, v[16:17] -; GFX9-NEXT: v_mad_u64_u32 v[20:21], s[6:7], v1, v9, v[20:21] -; GFX9-NEXT: v_addc_co_u32_e32 v23, vcc, 0, 
v22, vcc -; GFX9-NEXT: v_mov_b32_e32 v22, v18 -; GFX9-NEXT: v_mov_b32_e32 v18, v19 -; GFX9-NEXT: v_mov_b32_e32 v19, v16 -; GFX9-NEXT: v_mad_u64_u32 v[18:19], vcc, v0, v13, v[18:19] -; GFX9-NEXT: v_mul_lo_u32 v16, v6, v9 +; GFX9-NEXT: v_mad_u64_u32 v[22:23], s[6:7], v4, v10, v[16:17] +; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v1, v9, v[20:21] +; GFX9-NEXT: v_mad_u64_u32 v[20:21], s[8:9], v5, v9, v[22:23] +; GFX9-NEXT: v_mad_u64_u32 v[22:23], vcc, v2, v10, v[18:19] +; GFX9-NEXT: v_cndmask_b32_e64 v24, 0, 1, s[4:5] +; GFX9-NEXT: v_addc_co_u32_e32 v24, vcc, 0, v24, vcc +; GFX9-NEXT: v_mad_u64_u32 v[18:19], vcc, v3, v9, v[22:23] +; GFX9-NEXT: v_addc_co_u32_e32 v24, vcc, 0, v24, vcc +; GFX9-NEXT: v_mad_u64_u32 v[22:23], vcc, v4, v8, v[18:19] +; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v6, v8, v[20:21] +; GFX9-NEXT: v_addc_co_u32_e32 v26, vcc, 0, v24, vcc +; GFX9-NEXT: v_mov_b32_e32 v21, v22 +; GFX9-NEXT: v_mov_b32_e32 v22, v23 +; GFX9-NEXT: v_mov_b32_e32 v23, v18 +; GFX9-NEXT: v_mad_u64_u32 v[24:25], vcc, v0, v13, v[22:23] +; GFX9-NEXT: v_mul_lo_u32 v18, v6, v9 ; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[6:7] -; GFX9-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v2, v8, v[20:21] -; GFX9-NEXT: v_addc_co_u32_e64 v24, s[4:5], 0, v6, s[4:5] -; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v12, v[18:19] -; GFX9-NEXT: v_mad_u64_u32 v[21:22], s[10:11], v0, v11, v[21:22] -; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[6:7], v2, v11, v[18:19] -; GFX9-NEXT: v_mul_lo_u32 v26, v4, v11 -; GFX9-NEXT: v_mul_lo_u32 v27, v3, v12 -; GFX9-NEXT: v_mad_u64_u32 v[11:12], s[8:9], v3, v10, v[18:19] -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[10:11] -; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[10:11], v1, v10, v[21:22] -; GFX9-NEXT: v_mul_lo_u32 v25, v5, v10 -; GFX9-NEXT: v_mul_lo_u32 v28, v2, v13 -; GFX9-NEXT: v_mad_u64_u32 v[12:13], s[12:13], v4, v9, v[11:12] +; GFX9-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v2, v8, v[16:17] +; GFX9-NEXT: v_addc_co_u32_e64 v6, s[4:5], 0, v6, s[4:5] +; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v1, v12, v[24:25] +; GFX9-NEXT: v_mov_b32_e32 v20, v23 +; GFX9-NEXT: v_mul_lo_u32 v25, v4, v11 +; GFX9-NEXT: v_mad_u64_u32 v[23:24], s[6:7], v2, v11, v[16:17] +; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[10:11], v0, v11, v[20:21] +; GFX9-NEXT: v_mad_u64_u32 v[11:12], s[8:9], v3, v10, v[23:24] +; GFX9-NEXT: v_cndmask_b32_e64 v23, 0, 1, s[10:11] +; GFX9-NEXT: v_mad_u64_u32 v[20:21], s[10:11], v1, v10, v[16:17] +; GFX9-NEXT: v_addc_co_u32_e64 v24, s[10:11], 0, v23, s[10:11] +; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[10:11], v2, v9, v[20:21] +; GFX9-NEXT: v_mad_u64_u32 v[20:21], s[12:13], v4, v9, v[11:12] ; GFX9-NEXT: v_mad_u64_u32 v[10:11], s[14:15], v0, v8, 0 -; GFX9-NEXT: v_addc_co_u32_e64 v22, s[10:11], 0, v6, s[10:11] -; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[10:11], v2, v9, v[18:19] -; GFX9-NEXT: v_mov_b32_e32 v21, v20 -; GFX9-NEXT: v_mov_b32_e32 v20, v11 -; GFX9-NEXT: v_mad_u64_u32 v[20:21], s[16:17], v0, v9, v[20:21] -; GFX9-NEXT: v_addc_co_u32_e64 v2, s[10:11], 0, v22, s[10:11] -; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[10:11], v3, v8, v[18:19] -; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[14:15], v5, v8, v[12:13] -; GFX9-NEXT: v_addc_co_u32_e64 v11, s[10:11], 0, v2, s[10:11] +; GFX9-NEXT: v_mov_b32_e32 v12, v22 +; GFX9-NEXT: v_mul_lo_u32 v2, v2, v13 +; GFX9-NEXT: v_mad_u64_u32 v[22:23], s[16:17], v0, v9, v[11:12] +; GFX9-NEXT: v_addc_co_u32_e64 v13, s[10:11], 0, v24, s[10:11] +; GFX9-NEXT: v_mad_u64_u32 v[11:12], s[14:15], v5, v8, v[20:21] +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[10:11], v3, v8, v[16:17] +; GFX9-NEXT: 
v_addc_co_u32_e64 v16, s[10:11], 0, v13, s[10:11] ; GFX9-NEXT: v_mul_lo_u32 v9, v1, v14 -; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[16:17] -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v1, v8, v[20:21] -; GFX9-NEXT: v_addc_co_u32_e64 v3, s[10:11], v12, v3, s[10:11] +; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[16:17] +; GFX9-NEXT: v_mad_u64_u32 v[13:14], s[10:11], v1, v8, v[22:23] +; GFX9-NEXT: v_addc_co_u32_e64 v3, s[10:11], v3, v4, s[10:11] ; GFX9-NEXT: v_mul_lo_u32 v0, v0, v15 -; GFX9-NEXT: v_addc_co_u32_e64 v4, s[10:11], v24, v4, s[10:11] -; GFX9-NEXT: v_addc_co_u32_e64 v5, s[10:11], v11, v5, s[10:11] -; GFX9-NEXT: v_addc_co_u32_e64 v6, s[10:11], v23, v6, s[10:11] -; GFX9-NEXT: v_addc_co_u32_e64 v0, s[10:11], v17, v0, s[10:11] +; GFX9-NEXT: v_addc_co_u32_e64 v4, s[10:11], v6, v5, s[10:11] +; GFX9-NEXT: v_addc_co_u32_e64 v5, s[10:11], v16, v11, s[10:11] +; GFX9-NEXT: v_addc_co_u32_e64 v6, s[10:11], v26, v12, s[10:11] +; GFX9-NEXT: v_addc_co_u32_e64 v0, s[10:11], v19, v0, s[10:11] ; GFX9-NEXT: v_addc_co_u32_e64 v0, s[10:11], v0, v9, s[14:15] -; GFX9-NEXT: v_addc_co_u32_e64 v0, s[10:11], v0, v28, s[12:13] -; GFX9-NEXT: v_addc_co_u32_e64 v0, s[8:9], v0, v27, s[8:9] -; GFX9-NEXT: v_addc_co_u32_e64 v0, s[6:7], v0, v26, s[6:7] -; GFX9-NEXT: v_addc_co_u32_e64 v0, s[4:5], v0, v25, s[4:5] -; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v0, v16, vcc -; GFX9-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v7, v8, v[0:1] +; GFX9-NEXT: v_addc_co_u32_e64 v0, s[10:11], v0, v2, s[12:13] +; GFX9-NEXT: v_addc_co_u32_e64 v0, s[8:9], v0, v28, s[8:9] +; GFX9-NEXT: v_addc_co_u32_e64 v0, s[6:7], v0, v25, s[6:7] +; GFX9-NEXT: v_addc_co_u32_e64 v0, s[4:5], v0, v27, s[4:5] +; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v0, v18, vcc +; GFX9-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v7, v8, v[0:1] ; GFX9-NEXT: v_mov_b32_e32 v0, v10 +; GFX9-NEXT: v_mov_b32_e32 v1, v13 +; GFX9-NEXT: v_mov_b32_e32 v2, v14 +; GFX9-NEXT: v_mov_b32_e32 v7, v11 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_mul_i256: @@ -2609,68 +2626,69 @@ define i256 @v_mul_i256(i256 %num, i256 %den) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v16, v0 ; GFX10-NEXT: v_mov_b32_e32 v17, v1 -; GFX10-NEXT: v_mul_lo_u32 v27, v6, v9 -; GFX10-NEXT: v_mul_lo_u32 v28, v5, v10 +; GFX10-NEXT: v_mul_lo_u32 v29, v4, v11 +; GFX10-NEXT: v_mul_lo_u32 v31, v3, v12 +; GFX10-NEXT: v_mul_lo_u32 v30, v2, v13 ; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v16, v14, 0 -; GFX10-NEXT: v_mad_u64_u32 v[18:19], s4, v16, v12, 0 -; GFX10-NEXT: v_mul_lo_u32 v30, v17, v14 -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v17, v13, v[0:1] -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v2, v12, v[0:1] -; GFX10-NEXT: v_mad_u64_u32 v[18:19], s4, v17, v11, v[18:19] -; GFX10-NEXT: v_cndmask_b32_e64 v20, 0, 1, s4 -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s5, v3, v11, v[0:1] -; GFX10-NEXT: v_mad_u64_u32 v[18:19], vcc_lo, v2, v10, v[18:19] -; GFX10-NEXT: v_add_co_ci_u32_e32 v22, vcc_lo, 0, v20, vcc_lo -; GFX10-NEXT: v_mad_u64_u32 v[20:21], s4, v16, v10, 0 -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v4, v10, v[0:1] -; GFX10-NEXT: v_mad_u64_u32 v[18:19], vcc_lo, v3, v9, v[18:19] -; GFX10-NEXT: v_add_co_ci_u32_e32 v24, vcc_lo, 0, v22, vcc_lo -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v5, v9, v[0:1] -; GFX10-NEXT: v_mad_u64_u32 v[18:19], vcc_lo, v4, v8, v[18:19] -; GFX10-NEXT: v_add_co_ci_u32_e32 v26, vcc_lo, 0, v24, vcc_lo -; GFX10-NEXT: v_mad_u64_u32 v[22:23], s4, v6, v8, v[0:1] -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v17, v9, v[20:21] -; GFX10-NEXT: v_cndmask_b32_e64 v25, 0, 1, s4 -; GFX10-NEXT: v_mov_b32_e32 
v20, v22 -; GFX10-NEXT: v_mad_u64_u32 v[21:22], vcc_lo, v2, v8, v[0:1] -; GFX10-NEXT: v_add_co_ci_u32_e32 v29, vcc_lo, 0, v25, vcc_lo -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v16, v13, v[19:20] -; GFX10-NEXT: v_mov_b32_e32 v20, v18 +; GFX10-NEXT: v_mul_lo_u32 v28, v17, v14 +; GFX10-NEXT: v_mad_u64_u32 v[18:19], s4, v17, v13, v[0:1] +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v16, v12, 0 +; GFX10-NEXT: v_mad_u64_u32 v[20:21], s4, v2, v12, v[18:19] +; GFX10-NEXT: v_mad_u64_u32 v[18:19], s4, v17, v11, v[0:1] +; GFX10-NEXT: v_cndmask_b32_e64 v22, 0, 1, s4 +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s5, v3, v11, v[20:21] +; GFX10-NEXT: v_mad_u64_u32 v[20:21], vcc_lo, v2, v10, v[18:19] +; GFX10-NEXT: v_add_co_ci_u32_e32 v22, vcc_lo, 0, v22, vcc_lo +; GFX10-NEXT: v_mad_u64_u32 v[18:19], s4, v4, v10, v[0:1] +; GFX10-NEXT: v_mad_u64_u32 v[0:1], vcc_lo, v3, v9, v[20:21] +; GFX10-NEXT: v_add_co_ci_u32_e32 v26, vcc_lo, 0, v22, vcc_lo +; GFX10-NEXT: v_mad_u64_u32 v[20:21], s4, v5, v9, v[18:19] +; GFX10-NEXT: v_mad_u64_u32 v[18:19], s4, v16, v10, 0 +; GFX10-NEXT: v_mad_u64_u32 v[22:23], vcc_lo, v4, v8, v[0:1] +; GFX10-NEXT: v_add_co_ci_u32_e32 v26, vcc_lo, 0, v26, vcc_lo +; GFX10-NEXT: v_mad_u64_u32 v[24:25], s4, v6, v8, v[20:21] +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v17, v9, v[18:19] +; GFX10-NEXT: v_mov_b32_e32 v18, v23 +; GFX10-NEXT: v_cndmask_b32_e64 v27, 0, 1, s4 +; GFX10-NEXT: v_mul_lo_u32 v23, v6, v9 +; GFX10-NEXT: v_mov_b32_e32 v19, v24 +; GFX10-NEXT: v_mul_lo_u32 v24, v5, v10 +; GFX10-NEXT: v_mad_u64_u32 v[20:21], vcc_lo, v2, v8, v[0:1] +; GFX10-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, 0, v27, vcc_lo +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v16, v13, v[18:19] ; GFX10-NEXT: v_mov_b32_e32 v19, v22 -; GFX10-NEXT: v_mul_lo_u32 v22, v16, v15 -; GFX10-NEXT: v_mad_u64_u32 v[24:25], vcc_lo, v17, v12, v[0:1] -; GFX10-NEXT: v_mad_u64_u32 v[14:15], s6, v16, v11, v[19:20] +; GFX10-NEXT: v_mul_lo_u32 v27, v16, v15 +; GFX10-NEXT: v_mov_b32_e32 v18, v21 +; GFX10-NEXT: v_mad_u64_u32 v[21:22], vcc_lo, v17, v12, v[0:1] +; GFX10-NEXT: v_mad_u64_u32 v[14:15], s6, v16, v11, v[18:19] ; GFX10-NEXT: v_mad_u64_u32 v[0:1], s5, v16, v8, 0 -; GFX10-NEXT: v_mul_lo_u32 v20, v4, v11 -; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s6 -; GFX10-NEXT: v_mad_u64_u32 v[18:19], s5, v2, v11, v[24:25] -; GFX10-NEXT: v_mul_lo_u32 v25, v3, v12 +; GFX10-NEXT: v_cndmask_b32_e64 v32, 0, 1, s6 +; GFX10-NEXT: v_mad_u64_u32 v[18:19], s5, v2, v11, v[21:22] ; GFX10-NEXT: v_mad_u64_u32 v[11:12], s6, v17, v10, v[14:15] -; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s6, 0, v6, s6 -; GFX10-NEXT: v_mul_lo_u32 v24, v2, v13 -; GFX10-NEXT: v_mad_u64_u32 v[18:19], s7, v3, v10, v[18:19] ; GFX10-NEXT: v_mov_b32_e32 v13, v1 -; GFX10-NEXT: v_mad_u64_u32 v[1:2], s6, v2, v9, v[11:12] -; GFX10-NEXT: v_mov_b32_e32 v14, v21 -; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s6, 0, v6, s6 -; GFX10-NEXT: v_mad_u64_u32 v[10:11], s6, v4, v9, v[18:19] -; GFX10-NEXT: v_mad_u64_u32 v[12:13], s8, v16, v9, v[13:14] -; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, 1, s8 -; GFX10-NEXT: v_mad_u64_u32 v[3:4], s8, v3, v8, v[1:2] -; GFX10-NEXT: v_add_co_ci_u32_e64 v14, s8, 0, v6, s8 -; GFX10-NEXT: v_mad_u64_u32 v[5:6], s8, v5, v8, v[10:11] -; GFX10-NEXT: v_mad_u64_u32 v[1:2], s9, v17, v8, v[12:13] -; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s9, v9, v3, s9 -; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s9, v29, v4, s9 -; GFX10-NEXT: v_add_co_ci_u32_e64 v5, s9, v14, v5, s9 -; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s9, v26, v6, s9 -; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s9, v23, v22, s9 -; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s8, v9, v30, 
s8 -; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s6, v9, v24, s6 -; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s6, v9, v25, s7 -; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s5, v9, v20, s5 -; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v9, v28, vcc_lo -; GFX10-NEXT: v_add_co_ci_u32_e64 v9, vcc_lo, v9, v27, s4 +; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s6, 0, v32, s6 +; GFX10-NEXT: v_mov_b32_e32 v14, v20 +; GFX10-NEXT: v_mad_u64_u32 v[21:22], s7, v3, v10, v[18:19] +; GFX10-NEXT: v_mad_u64_u32 v[18:19], s6, v2, v9, v[11:12] +; GFX10-NEXT: v_add_co_ci_u32_e64 v15, s6, 0, v1, s6 +; GFX10-NEXT: v_mad_u64_u32 v[10:11], s8, v16, v9, v[13:14] +; GFX10-NEXT: v_mad_u64_u32 v[1:2], s6, v4, v9, v[21:22] +; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s8 +; GFX10-NEXT: v_mad_u64_u32 v[12:13], s8, v3, v8, v[18:19] +; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s8, 0, v15, s8 +; GFX10-NEXT: v_mad_u64_u32 v[14:15], s8, v5, v8, v[1:2] +; GFX10-NEXT: v_mad_u64_u32 v[1:2], s9, v17, v8, v[10:11] +; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s9, v4, v12, s9 +; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s9, v6, v13, s9 +; GFX10-NEXT: v_add_co_ci_u32_e64 v5, s9, v9, v14, s9 +; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s9, v26, v15, s9 +; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s9, v25, v27, s9 +; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s8, v9, v28, s8 +; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s6, v9, v30, s6 +; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s6, v9, v31, s7 +; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s5, v9, v29, s5 +; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v9, v24, vcc_lo +; GFX10-NEXT: v_add_co_ci_u32_e64 v9, vcc_lo, v9, v23, s4 ; GFX10-NEXT: v_mad_u64_u32 v[7:8], s4, v7, v8, v[9:10] ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -2681,66 +2699,65 @@ define i256 @v_mul_i256(i256 %num, i256 %den) { ; GFX11-NEXT: v_dual_mov_b32 v18, v8 :: v_dual_mov_b32 v19, v7 ; GFX11-NEXT: v_mul_lo_u32 v30, v4, v11 ; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v16, v14, 0 -; GFX11-NEXT: v_mad_u64_u32 v[7:8], null, v16, v12, 0 +; GFX11-NEXT: v_mul_lo_u32 v28, v16, v15 ; GFX11-NEXT: v_mul_lo_u32 v29, v17, v14 -; GFX11-NEXT: v_mul_lo_u32 v28, v5, v10 -; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v17, v13, v[0:1] -; GFX11-NEXT: v_mad_u64_u32 v[7:8], s0, v17, v11, v[7:8] -; GFX11-NEXT: v_cndmask_b32_e64 v20, 0, 1, s0 -; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v2, v12, v[0:1] -; GFX11-NEXT: v_mad_u64_u32 v[7:8], vcc_lo, v2, v10, v[7:8] -; GFX11-NEXT: v_add_co_ci_u32_e64 v22, null, 0, v20, vcc_lo -; GFX11-NEXT: v_mad_u64_u32 v[20:21], null, v16, v10, 0 -; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v3, v11, v[0:1] -; GFX11-NEXT: v_mad_u64_u32 v[7:8], vcc_lo, v3, v9, v[7:8] -; GFX11-NEXT: v_add_co_ci_u32_e64 v24, null, 0, v22, vcc_lo -; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v4, v10, v[0:1] -; GFX11-NEXT: v_mad_u64_u32 v[7:8], vcc_lo, v4, v18, v[7:8] -; GFX11-NEXT: v_add_co_ci_u32_e64 v27, null, 0, v24, vcc_lo -; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v5, v9, v[0:1] -; GFX11-NEXT: v_mad_u64_u32 v[22:23], null, v6, v18, v[0:1] -; GFX11-NEXT: v_mad_u64_u32 v[0:1], s0, v17, v9, v[20:21] -; GFX11-NEXT: v_mov_b32_e32 v20, v8 -; GFX11-NEXT: v_cndmask_b32_e64 v26, 0, 1, s0 -; GFX11-NEXT: v_mov_b32_e32 v21, v22 -; GFX11-NEXT: v_mul_lo_u32 v22, v6, v9 -; GFX11-NEXT: v_mad_u64_u32 v[24:25], vcc_lo, v2, v18, v[0:1] +; GFX11-NEXT: v_mul_lo_u32 v32, v3, v12 +; GFX11-NEXT: v_mul_lo_u32 v31, v2, v13 +; GFX11-NEXT: v_mad_u64_u32 v[7:8], null, v17, v13, v[0:1] +; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v16, v12, 0 +; GFX11-NEXT: v_mad_u64_u32 v[20:21], null, v2, v12, v[7:8] +; GFX11-NEXT: v_mad_u64_u32 v[7:8], s0, v17, v11, 
v[0:1] +; GFX11-NEXT: v_cndmask_b32_e64 v22, 0, 1, s0 +; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v3, v11, v[20:21] +; GFX11-NEXT: v_mad_u64_u32 v[20:21], vcc_lo, v2, v10, v[7:8] +; GFX11-NEXT: v_add_co_ci_u32_e64 v22, null, 0, v22, vcc_lo +; GFX11-NEXT: v_mad_u64_u32 v[7:8], null, v4, v10, v[0:1] +; GFX11-NEXT: v_mad_u64_u32 v[0:1], vcc_lo, v3, v9, v[20:21] +; GFX11-NEXT: v_add_co_ci_u32_e64 v26, null, 0, v22, vcc_lo +; GFX11-NEXT: v_mad_u64_u32 v[20:21], null, v5, v9, v[7:8] +; GFX11-NEXT: v_mad_u64_u32 v[7:8], null, v16, v10, 0 +; GFX11-NEXT: v_mad_u64_u32 v[22:23], vcc_lo, v4, v18, v[0:1] ; GFX11-NEXT: v_add_co_ci_u32_e64 v26, null, 0, v26, vcc_lo -; GFX11-NEXT: v_mad_u64_u32 v[0:1], s0, v16, v13, v[20:21] -; GFX11-NEXT: v_mov_b32_e32 v6, v25 -; GFX11-NEXT: v_mul_lo_u32 v25, v16, v15 -; GFX11-NEXT: v_mad_u64_u32 v[20:21], vcc_lo, v17, v12, v[0:1] -; GFX11-NEXT: v_mad_u64_u32 v[6:7], s2, v16, v11, v[6:7] +; GFX11-NEXT: v_mad_u64_u32 v[24:25], null, v6, v18, v[20:21] +; GFX11-NEXT: v_mad_u64_u32 v[0:1], s0, v17, v9, v[7:8] +; GFX11-NEXT: v_mov_b32_e32 v7, v23 +; GFX11-NEXT: v_cndmask_b32_e64 v27, 0, 1, s0 +; GFX11-NEXT: v_mul_lo_u32 v23, v6, v9 +; GFX11-NEXT: v_mov_b32_e32 v8, v24 +; GFX11-NEXT: v_mul_lo_u32 v24, v5, v10 +; GFX11-NEXT: v_mad_u64_u32 v[20:21], vcc_lo, v2, v18, v[0:1] +; GFX11-NEXT: v_add_co_ci_u32_e64 v27, null, 0, v27, vcc_lo +; GFX11-NEXT: v_mad_u64_u32 v[0:1], s0, v16, v13, v[7:8] +; GFX11-NEXT: v_dual_mov_b32 v7, v22 :: v_dual_mov_b32 v6, v21 +; GFX11-NEXT: v_mad_u64_u32 v[21:22], vcc_lo, v17, v12, v[0:1] +; GFX11-NEXT: v_mad_u64_u32 v[14:15], s2, v16, v11, v[6:7] ; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v16, v18, 0 ; GFX11-NEXT: v_cndmask_b32_e64 v8, 0, 1, s2 -; GFX11-NEXT: v_mad_u64_u32 v[14:15], s1, v2, v11, v[20:21] -; GFX11-NEXT: v_mad_u64_u32 v[6:7], s2, v17, v10, v[6:7] -; GFX11-NEXT: v_mul_lo_u32 v20, v2, v13 -; GFX11-NEXT: v_add_co_ci_u32_e64 v8, null, 0, v8, s2 -; GFX11-NEXT: v_mov_b32_e32 v11, v1 -; GFX11-NEXT: v_mad_u64_u32 v[13:14], s3, v3, v10, v[14:15] -; GFX11-NEXT: v_mad_u64_u32 v[1:2], s2, v2, v9, v[6:7] -; GFX11-NEXT: v_mul_lo_u32 v21, v3, v12 -; GFX11-NEXT: v_mov_b32_e32 v12, v24 -; GFX11-NEXT: v_add_co_ci_u32_e64 v10, null, 0, v8, s2 -; GFX11-NEXT: v_mad_u64_u32 v[6:7], s2, v4, v9, v[13:14] -; GFX11-NEXT: v_mad_u64_u32 v[8:9], s4, v16, v9, v[11:12] -; GFX11-NEXT: v_cndmask_b32_e64 v11, 0, 1, s4 -; GFX11-NEXT: v_mad_u64_u32 v[3:4], s4, v3, v18, v[1:2] -; GFX11-NEXT: v_add_co_ci_u32_e64 v10, null, 0, v10, s4 -; GFX11-NEXT: v_mad_u64_u32 v[5:6], s4, v5, v18, v[6:7] -; GFX11-NEXT: v_mad_u64_u32 v[1:2], s5, v17, v18, v[8:9] -; GFX11-NEXT: v_add_co_ci_u32_e64 v3, s5, v11, v3, s5 -; GFX11-NEXT: v_add_co_ci_u32_e64 v4, s5, v26, v4, s5 -; GFX11-NEXT: v_add_co_ci_u32_e64 v5, s5, v10, v5, s5 -; GFX11-NEXT: v_add_co_ci_u32_e64 v6, s5, v27, v6, s5 -; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v23, v25, s5 +; GFX11-NEXT: v_mad_u64_u32 v[6:7], s1, v2, v11, v[21:22] +; GFX11-NEXT: v_mad_u64_u32 v[11:12], s2, v17, v10, v[14:15] +; GFX11-NEXT: v_dual_mov_b32 v13, v1 :: v_dual_mov_b32 v14, v20 +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v8, s2 +; GFX11-NEXT: v_mad_u64_u32 v[21:22], s3, v3, v10, v[6:7] +; GFX11-NEXT: v_mad_u64_u32 v[6:7], s2, v2, v9, v[11:12] +; GFX11-NEXT: v_add_co_ci_u32_e64 v12, null, 0, v1, s2 +; GFX11-NEXT: v_mad_u64_u32 v[10:11], s4, v16, v9, v[13:14] +; GFX11-NEXT: v_mad_u64_u32 v[1:2], s2, v4, v9, v[21:22] +; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, 1, s4 +; GFX11-NEXT: v_mad_u64_u32 v[8:9], s4, v3, v18, v[6:7] +; GFX11-NEXT: 
v_add_co_ci_u32_e64 v12, null, 0, v12, s4 +; GFX11-NEXT: v_mad_u64_u32 v[6:7], s4, v5, v18, v[1:2] +; GFX11-NEXT: v_mad_u64_u32 v[1:2], s5, v17, v18, v[10:11] +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, s5, v4, v8, s5 +; GFX11-NEXT: v_add_co_ci_u32_e64 v4, s5, v27, v9, s5 +; GFX11-NEXT: v_add_co_ci_u32_e64 v5, s5, v12, v6, s5 +; GFX11-NEXT: v_add_co_ci_u32_e64 v6, s5, v26, v7, s5 +; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v25, v28, s5 ; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v29, s4 -; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v20, s2 -; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v21, s3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v31, s2 +; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v32, s3 ; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v30, s1 -; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v28, vcc_lo -; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, v7, v22, s0 +; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v24, vcc_lo +; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, v7, v23, s0 ; GFX11-NEXT: v_mad_u64_u32 v[7:8], null, v19, v18, v[9:10] ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -2752,101 +2769,103 @@ define i256 @v_mul_i256(i256 %num, i256 %den) { ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v16, v0 :: v_dual_mov_b32 v17, v1 -; GFX12-NEXT: v_mul_lo_u32 v27, v6, v9 -; GFX12-NEXT: v_mul_lo_u32 v28, v5, v10 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_mul_lo_u32 v29, v4, v11 +; GFX12-NEXT: v_mul_lo_u32 v31, v3, v12 +; GFX12-NEXT: v_mul_lo_u32 v30, v2, v13 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v16, v14, 0 -; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], null, v16, v12, 0 -; GFX12-NEXT: v_mul_lo_u32 v30, v17, v14 -; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v17, v13, v[0:1] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], s0, v17, v11, v[18:19] +; GFX12-NEXT: v_mul_lo_u32 v28, v17, v14 +; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], null, v17, v13, v[0:1] +; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v16, v12, 0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_mad_co_u64_u32 v[20:21], null, v2, v12, v[18:19] +; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], s0, v17, v11, v[0:1] ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_cndmask_b32_e64 v20, 0, 1, s0 -; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v2, v12, v[0:1] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v2, v10, v[18:19] -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_add_co_ci_u32_e64 v22, null, 0, v20, vcc_lo -; GFX12-NEXT: v_mad_co_u64_u32 v[20:21], null, v16, v10, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v3, v11, v[0:1] -; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v3, v9, v[18:19] -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_add_co_ci_u32_e64 v24, null, 0, v22, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e64 v22, 0, 1, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v4, v10, v[0:1] -; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v4, v8, v[18:19] +; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v3, v11, v[20:21] +; GFX12-NEXT: 
v_mad_co_u64_u32 v[20:21], vcc_lo, v2, v10, v[18:19] ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_add_co_ci_u32_e64 v26, null, 0, v24, vcc_lo -; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v5, v9, v[0:1] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_mad_co_u64_u32 v[22:23], null, v6, v8, v[0:1] -; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], s0, v17, v9, v[20:21] -; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_cndmask_b32_e64 v25, 0, 1, s0 -; GFX12-NEXT: v_mov_b32_e32 v20, v22 +; GFX12-NEXT: v_add_co_ci_u32_e64 v22, null, 0, v22, vcc_lo +; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], null, v4, v10, v[0:1] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_mad_co_u64_u32 v[21:22], vcc_lo, v2, v8, v[0:1] +; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], vcc_lo, v3, v9, v[20:21] ; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_add_co_ci_u32_e64 v29, null, 0, v25, vcc_lo -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], s0, v16, v13, v[19:20] +; GFX12-NEXT: v_add_co_ci_u32_e64 v26, null, 0, v22, vcc_lo +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_mad_co_u64_u32 v[20:21], null, v5, v9, v[18:19] +; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], null, v16, v10, 0 +; GFX12-NEXT: v_mad_co_u64_u32 v[22:23], vcc_lo, v4, v8, v[0:1] +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: v_add_co_ci_u32_e64 v26, null, 0, v26, vcc_lo +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_mad_co_u64_u32 v[24:25], null, v6, v8, v[20:21] +; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], s0, v17, v9, v[18:19] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX12-NEXT: v_mov_b32_e32 v18, v23 +; GFX12-NEXT: s_wait_alu 0xf1ff +; GFX12-NEXT: v_cndmask_b32_e64 v27, 0, 1, s0 +; GFX12-NEXT: v_mul_lo_u32 v23, v6, v9 +; GFX12-NEXT: v_mov_b32_e32 v19, v24 +; GFX12-NEXT: v_mul_lo_u32 v24, v5, v10 +; GFX12-NEXT: v_mad_co_u64_u32 v[20:21], vcc_lo, v2, v8, v[0:1] +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: v_add_co_ci_u32_e64 v6, null, 0, v27, vcc_lo +; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], s0, v16, v13, v[18:19] ; GFX12-NEXT: v_mov_b32_e32 v19, v22 -; GFX12-NEXT: v_mul_lo_u32 v22, v16, v15 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_mad_co_u64_u32 v[24:25], vcc_lo, v17, v12, v[0:1] +; GFX12-NEXT: v_mul_lo_u32 v27, v16, v15 +; GFX12-NEXT: v_mov_b32_e32 v18, v21 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_mad_co_u64_u32 v[21:22], vcc_lo, v17, v12, v[0:1] +; GFX12-NEXT: v_mad_co_u64_u32 v[14:15], s2, v16, v11, v[18:19] ; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v16, v8, 0 -; GFX12-NEXT: v_mov_b32_e32 v20, v18 -; GFX12-NEXT: v_mad_co_u64_u32 v[14:15], s2, v16, v11, v[19:20] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], s1, v2, v11, v[24:25] -; GFX12-NEXT: v_mul_lo_u32 v20, v4, v11 -; GFX12-NEXT: v_mul_lo_u32 v25, v3, v12 ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_cndmask_b32_e64 v6, 0, 1, s2 -; GFX12-NEXT: v_mul_lo_u32 v24, v2, v13 +; GFX12-NEXT: v_cndmask_b32_e64 v32, 0, 1, s2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], s1, v2, v11, 
v[21:22] ; GFX12-NEXT: v_mad_co_u64_u32 v[11:12], s2, v17, v10, v[14:15] -; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], s3, v3, v10, v[18:19] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_dual_mov_b32 v13, v1 :: v_dual_mov_b32 v14, v20 ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_add_co_ci_u32_e64 v6, null, 0, v6, s2 -; GFX12-NEXT: v_dual_mov_b32 v13, v1 :: v_dual_mov_b32 v14, v21 +; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v32, s2 +; GFX12-NEXT: v_mad_co_u64_u32 v[21:22], s3, v3, v10, v[18:19] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], s2, v2, v9, v[11:12] +; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], s2, v2, v9, v[11:12] ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_add_co_ci_u32_e64 v6, null, 0, v6, s2 -; GFX12-NEXT: v_mad_co_u64_u32 v[10:11], s2, v4, v9, v[18:19] -; GFX12-NEXT: v_mad_co_u64_u32 v[12:13], s4, v16, v9, v[13:14] +; GFX12-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v1, s2 +; GFX12-NEXT: v_mad_co_u64_u32 v[10:11], s4, v16, v9, v[13:14] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], s2, v4, v9, v[21:22] ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_cndmask_b32_e64 v9, 0, 1, s4 -; GFX12-NEXT: v_mad_co_u64_u32 v[3:4], s4, v3, v8, v[1:2] +; GFX12-NEXT: v_cndmask_b32_e64 v4, 0, 1, s4 +; GFX12-NEXT: v_mad_co_u64_u32 v[12:13], s4, v3, v8, v[18:19] ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_add_co_ci_u32_e64 v14, null, 0, v6, s4 -; GFX12-NEXT: v_mad_co_u64_u32 v[5:6], s4, v5, v8, v[10:11] -; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], s5, v17, v8, v[12:13] +; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v15, s4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_mad_co_u64_u32 v[14:15], s4, v5, v8, v[1:2] +; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], s5, v17, v8, v[10:11] ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_add_co_ci_u32_e64 v3, s5, v9, v3, s5 +; GFX12-NEXT: v_add_co_ci_u32_e64 v3, s5, v4, v12, s5 ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_add_co_ci_u32_e64 v4, s5, v29, v4, s5 +; GFX12-NEXT: v_add_co_ci_u32_e64 v4, s5, v6, v13, s5 ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_add_co_ci_u32_e64 v5, s5, v14, v5, s5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_add_co_ci_u32_e64 v5, s5, v9, v14, s5 ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_add_co_ci_u32_e64 v6, s5, v26, v6, s5 +; GFX12-NEXT: v_add_co_ci_u32_e64 v6, s5, v26, v15, s5 ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v23, v22, s5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v30, s4 -; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v24, s2 +; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v25, v27, s5 +; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v28, s4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v25, s3 -; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v20, s1 +; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v30, s2 +; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v31, s3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v29, s1 ; 
GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v24, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v28, vcc_lo -; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v27, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v23, s0 ; GFX12-NEXT: v_mad_co_u64_u32 v[7:8], null, v7, v8, v[9:10] ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -2855,87 +2874,89 @@ define i256 @v_mul_i256(i256 %num, i256 %den) { ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_dual_mov_b32 v16, v0 :: v_dual_mov_b32 v17, v1 -; GFX1250-NEXT: v_mul_lo_u32 v27, v5, v10 -; GFX1250-NEXT: v_mul_lo_u32 v29, v3, v12 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_mul_lo_u32 v30, v4, v11 +; GFX1250-NEXT: v_mul_lo_u32 v29, v5, v10 +; GFX1250-NEXT: v_mul_lo_u32 v31, v3, v12 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v16, v14, 0 -; GFX1250-NEXT: v_mad_nc_u64_u32 v[18:19], v16, v12, 0 -; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v17, v13, v[0:1] -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s0, v17, v11, v[18:19] -; GFX1250-NEXT: v_cndmask_b32_e64 v20, 0, 1, s0 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v2, v12, v[0:1] -; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v2, v10, v[18:19] -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX1250-NEXT: v_add_co_ci_u32_e64 v22, null, 0, v20, vcc_lo -; GFX1250-NEXT: v_mad_nc_u64_u32 v[20:21], v16, v10, 0 -; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v3, v11, v[0:1] -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v3, v9, v[18:19] -; GFX1250-NEXT: v_add_co_ci_u32_e64 v24, null, 0, v22, vcc_lo +; GFX1250-NEXT: v_mul_lo_u32 v32, v2, v13 +; GFX1250-NEXT: v_mad_nc_u64_u32 v[18:19], v17, v13, v[0:1] +; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v16, v12, 0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_mad_nc_u64_u32 v[20:21], v2, v12, v[18:19] +; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s0, v17, v11, v[0:1] +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1250-NEXT: v_cndmask_b32_e64 v22, 0, 1, s0 +; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v3, v11, v[20:21] +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_mad_co_u64_u32 v[20:21], vcc_lo, v2, v10, v[18:19] +; GFX1250-NEXT: v_add_co_ci_u32_e64 v22, null, 0, v22, vcc_lo ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v4, v10, v[0:1] -; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v4, v8, v[18:19] +; GFX1250-NEXT: v_mad_nc_u64_u32 v[18:19], v4, v10, v[0:1] +; GFX1250-NEXT: v_mad_co_u64_u32 v[0:1], vcc_lo, v3, v9, v[20:21] ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1250-NEXT: v_add_co_ci_u32_e64 v26, null, 0, v24, vcc_lo -; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v5, v9, v[0:1] -; GFX1250-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1250-NEXT: v_mad_nc_u64_u32 v[22:23], v6, v8, v[0:1] -; GFX1250-NEXT: v_mad_co_u64_u32 v[0:1], s0, v17, v9, v[20:21] -; GFX1250-NEXT: v_dual_mov_b32 v20, v19 :: v_dual_mov_b32 v21, v22 -; GFX1250-NEXT: v_mul_lo_u32 v22, v6, v9 -; GFX1250-NEXT: v_cndmask_b32_e64 v6, 0, 1, s0 +; GFX1250-NEXT: v_add_co_ci_u32_e64 v26, null, 0, v22, vcc_lo +; GFX1250-NEXT: v_mad_nc_u64_u32 v[20:21], v5, v9, v[18:19] +; GFX1250-NEXT: v_mad_nc_u64_u32 v[18:19], v16, v10, 0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_mad_co_u64_u32 v[22:23], vcc_lo, v4, v8, v[0:1] +; GFX1250-NEXT: v_add_co_ci_u32_e64 v28, null, 0, v26, vcc_lo ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1250-NEXT: v_mad_co_u64_u32 v[24:25], s0, v2, v8, v[0:1] -; GFX1250-NEXT: v_mad_co_u64_u32 v[20:21], vcc_lo, v16, v13, v[20:21] -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_mad_nc_u64_u32 v[24:25], v6, v8, v[20:21] +; GFX1250-NEXT: v_mad_co_u64_u32 v[0:1], s0, v17, v9, v[18:19] +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX1250-NEXT: v_dual_mov_b32 v18, v23 :: v_dual_mov_b32 v19, v24 +; GFX1250-NEXT: v_mul_lo_u32 v24, v6, v9 +; GFX1250-NEXT: v_cndmask_b32_e64 v6, 0, 1, s0 +; GFX1250-NEXT: v_mad_co_u64_u32 v[20:21], vcc_lo, v16, v13, v[18:19] +; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s0, v2, v8, v[0:1] +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1250-NEXT: v_add_co_ci_u32_e64 v6, null, 0, v6, s0 ; GFX1250-NEXT: v_mad_co_u64_u32 v[0:1], s0, v17, v12, v[20:21] -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1250-NEXT: v_dual_mov_b32 v20, v25 :: v_dual_mov_b32 v21, v18 -; GFX1250-NEXT: v_mul_lo_u32 v25, v4, v11 -; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s2, v16, v11, v[20:21] -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX1250-NEXT: v_cndmask_b32_e64 v28, 0, 1, s2 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250-NEXT: v_dual_mov_b32 v20, v19 :: v_dual_mov_b32 v21, v22 +; GFX1250-NEXT: v_mov_b32_e32 v13, v18 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250-NEXT: v_mad_co_u64_u32 v[22:23], s2, v16, v11, v[20:21] ; GFX1250-NEXT: v_mad_co_u64_u32 v[20:21], s1, v2, v11, v[0:1] +; GFX1250-NEXT: v_cndmask_b32_e64 v11, 0, 1, s2 ; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v16, v8, 0 -; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s2, v17, v10, v[18:19] -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX1250-NEXT: v_mad_co_u64_u32 v[10:11], s3, v3, v10, v[20:21] -; GFX1250-NEXT: v_mul_lo_u32 v20, v2, v13 -; GFX1250-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v28, s2 -; GFX1250-NEXT: v_mad_co_u64_u32 v[12:13], s2, v2, v9, v[18:19] -; GFX1250-NEXT: v_dual_mov_b32 v18, v1 :: v_dual_mov_b32 v19, v24 -; GFX1250-NEXT: v_mad_co_u64_u32 v[10:11], s4, v4, v9, v[10:11] -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v21, s2 -; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s6, v16, v9, v[18:19] +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_mad_co_u64_u32 v[26:27], 
s2, v17, v10, v[22:23] +; GFX1250-NEXT: v_add_co_ci_u32_e64 v33, null, 0, v11, s2 +; GFX1250-NEXT: v_mad_co_u64_u32 v[22:23], s3, v3, v10, v[20:21] +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250-NEXT: v_mov_b32_e32 v12, v1 +; GFX1250-NEXT: v_mad_co_u64_u32 v[10:11], s2, v2, v9, v[26:27] +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250-NEXT: v_mad_co_u64_u32 v[20:21], s6, v16, v9, v[12:13] +; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s4, v4, v9, v[22:23] +; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v33, s2 ; GFX1250-NEXT: v_mul_lo_u32 v2, v16, v15 -; GFX1250-NEXT: v_mad_co_u64_u32 v[12:13], s2, v3, v8, v[12:13] -; GFX1250-NEXT: v_cndmask_b32_e64 v3, 0, 1, s6 ; GFX1250-NEXT: v_mul_lo_u32 v9, v17, v14 -; GFX1250-NEXT: v_mad_co_u64_u32 v[10:11], s5, v5, v8, v[10:11] +; GFX1250-NEXT: v_mad_co_u64_u32 v[12:13], s2, v3, v8, v[10:11] +; GFX1250-NEXT: v_cndmask_b32_e64 v3, 0, 1, s6 ; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, s2 -; GFX1250-NEXT: v_mad_co_u64_u32 v[14:15], s2, v17, v8, v[18:19] +; GFX1250-NEXT: v_mad_co_u64_u32 v[10:11], s5, v5, v8, v[18:19] +; GFX1250-NEXT: v_mad_co_u64_u32 v[14:15], s2, v17, v8, v[20:21] ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-NEXT: v_add_co_ci_u32_e64 v3, s2, v3, v12, s2 ; GFX1250-NEXT: v_add_co_ci_u32_e64 v4, s2, v6, v13, s2 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-NEXT: v_add_co_ci_u32_e64 v5, s2, v1, v10, s2 -; GFX1250-NEXT: v_add_co_ci_u32_e64 v6, s2, v26, v11, s2 +; GFX1250-NEXT: v_add_co_ci_u32_e64 v6, s2, v28, v11, s2 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v23, v2, s2 +; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v25, v2, s2 ; GFX1250-NEXT: v_mov_b32_e32 v2, v15 ; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v9, s5 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v20, s4 -; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v29, s3 +; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v32, s4 +; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v31, s3 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v25, s1 -; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v27, s0 +; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v30, s1 +; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v29, s0 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v22, vcc_lo +; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v24, vcc_lo ; GFX1250-NEXT: v_mad_u32 v7, v7, v8, v1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v14 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] @@ -2949,60 +2970,60 @@ define amdgpu_ps void @s_mul_u64_zext_with_vregs(ptr addrspace(1) %out, ptr addr ; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b64 s[0:1], 0 -; GFX7-NEXT: buffer_load_dword v2, v[2:3], s[0:3], 0 addr64 -; GFX7-NEXT: v_mov_b32_e32 v3, 0x50 +; GFX7-NEXT: buffer_load_dword v4, v[2:3], s[0:3], 0 addr64 +; GFX7-NEXT: v_mov_b32_e32 v5, 0x50 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v2, v3, 0 +; GFX7-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v5, 0 ; GFX7-NEXT: buffer_store_dwordx2 
v[2:3], v[0:1], s[0:3], 0 addr64 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: s_mul_u64_zext_with_vregs: ; GFX8: ; %bb.0: -; GFX8-NEXT: flat_load_dword v2, v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v3, 0x50 +; GFX8-NEXT: flat_load_dword v4, v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v5, 0x50 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v2, v3, 0 +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v5, 0 ; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: s_mul_u64_zext_with_vregs: ; GFX9: ; %bb.0: -; GFX9-NEXT: global_load_dword v2, v[2:3], off -; GFX9-NEXT: v_mov_b32_e32 v3, 0x50 +; GFX9-NEXT: global_load_dword v4, v[2:3], off +; GFX9-NEXT: v_mov_b32_e32 v5, 0x50 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v2, v3, 0 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v5, 0 ; GFX9-NEXT: global_store_dwordx2 v[0:1], v[2:3], off ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: s_mul_u64_zext_with_vregs: ; GFX10: ; %bb.0: -; GFX10-NEXT: global_load_dword v2, v[2:3], off +; GFX10-NEXT: global_load_dword v4, v[2:3], off ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mad_u64_u32 v[2:3], s0, 0x50, v2, 0 +; GFX10-NEXT: v_mad_u64_u32 v[2:3], s0, 0x50, v4, 0 ; GFX10-NEXT: global_store_dwordx2 v[0:1], v[2:3], off ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_mul_u64_zext_with_vregs: ; GFX11: ; %bb.0: -; GFX11-NEXT: global_load_b32 v2, v[2:3], off +; GFX11-NEXT: global_load_b32 v4, v[2:3], off ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, 0x50, v2, 0 +; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, 0x50, v4, 0 ; GFX11-NEXT: global_store_b64 v[0:1], v[2:3], off ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: s_mul_u64_zext_with_vregs: ; GFX12: ; %bb.0: -; GFX12-NEXT: global_load_b32 v2, v[2:3], off +; GFX12-NEXT: global_load_b32 v4, v[2:3], off ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mad_co_u64_u32 v[2:3], null, 0x50, v2, 0 +; GFX12-NEXT: v_mad_co_u64_u32 v[2:3], null, 0x50, v4, 0 ; GFX12-NEXT: global_store_b64 v[0:1], v[2:3], off ; GFX12-NEXT: s_endpgm ; ; GFX1250-LABEL: s_mul_u64_zext_with_vregs: ; GFX1250: ; %bb.0: -; GFX1250-NEXT: global_load_b32 v2, v[2:3], off +; GFX1250-NEXT: global_load_b32 v4, v[2:3], off ; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: v_mad_nc_u64_u32 v[2:3], 0x50, v2, 0 +; GFX1250-NEXT: v_mad_nc_u64_u32 v[2:3], 0x50, v4, 0 ; GFX1250-NEXT: global_store_b64 v[0:1], v[2:3], off ; GFX1250-NEXT: s_endpgm %val = load i32, ptr addrspace(1) %in, align 4 @@ -3130,33 +3151,36 @@ define amdgpu_ps void @s_mul_u64_sext_with_vregs(ptr addrspace(1) %out, ptr addr ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: buffer_load_dword v4, v[2:3], s[0:3], 0 addr64 -; GFX7-NEXT: v_mov_b32_e32 v5, 0x50 +; GFX7-NEXT: v_mov_b32_e32 v6, 0x50 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v5, 0 -; GFX7-NEXT: v_ashrrev_i32_e32 v4, 31, v4 -; GFX7-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v4, v5, v[3:4] +; GFX7-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v6, 0 +; GFX7-NEXT: v_ashrrev_i32_e32 v7, 31, v4 +; GFX7-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v7, v6, v[3:4] +; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: s_mul_u64_sext_with_vregs: ; GFX8: ; %bb.0: ; GFX8-NEXT: flat_load_dword v4, v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v5, 0x50 +; GFX8-NEXT: v_mov_b32_e32 v6, 0x50 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v5, 0 -; GFX8-NEXT: v_ashrrev_i32_e32 
v4, 31, v4 -; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[0:1], v4, v5, v[3:4] +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v6, 0 +; GFX8-NEXT: v_ashrrev_i32_e32 v7, 31, v4 +; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v7, v6, v[3:4] +; GFX8-NEXT: v_mov_b32_e32 v3, v4 ; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: s_mul_u64_sext_with_vregs: ; GFX9: ; %bb.0: ; GFX9-NEXT: global_load_dword v4, v[2:3], off -; GFX9-NEXT: v_mov_b32_e32 v5, 0x50 +; GFX9-NEXT: v_mov_b32_e32 v6, 0x50 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v5, 0 -; GFX9-NEXT: v_ashrrev_i32_e32 v4, 31, v4 -; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], v4, v5, v[3:4] +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v6, 0 +; GFX9-NEXT: v_ashrrev_i32_e32 v7, 31, v4 +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v7, v6, v[3:4] +; GFX9-NEXT: v_mov_b32_e32 v3, v4 ; GFX9-NEXT: global_store_dwordx2 v[0:1], v[2:3], off ; GFX9-NEXT: s_endpgm ; @@ -3183,17 +3207,17 @@ define amdgpu_ps void @s_mul_u64_sext_with_vregs(ptr addrspace(1) %out, ptr addr ; ; GFX12-LABEL: s_mul_u64_sext_with_vregs: ; GFX12: ; %bb.0: -; GFX12-NEXT: global_load_b32 v2, v[2:3], off +; GFX12-NEXT: global_load_b32 v4, v[2:3], off ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mad_co_i64_i32 v[2:3], null, 0x50, v2, 0 +; GFX12-NEXT: v_mad_co_i64_i32 v[2:3], null, 0x50, v4, 0 ; GFX12-NEXT: global_store_b64 v[0:1], v[2:3], off ; GFX12-NEXT: s_endpgm ; ; GFX1250-LABEL: s_mul_u64_sext_with_vregs: ; GFX1250: ; %bb.0: -; GFX1250-NEXT: global_load_b32 v2, v[2:3], off +; GFX1250-NEXT: global_load_b32 v4, v[2:3], off ; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: v_mad_nc_i64_i32 v[2:3], 0x50, v2, 0 +; GFX1250-NEXT: v_mad_nc_i64_i32 v[2:3], 0x50, v4, 0 ; GFX1250-NEXT: global_store_b64 v[0:1], v[2:3], off ; GFX1250-NEXT: s_endpgm %val = load i32, ptr addrspace(1) %in, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll index 4f2c454e13356..01c601f0646b5 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll @@ -31,128 +31,128 @@ define i64 @v_sdiv_i64(i64 %num, i64 %den) { ; CHECK-NEXT: v_xor_b32_e32 v1, v3, v0 ; CHECK-NEXT: v_cvt_f32_u32_e32 v3, v2 ; CHECK-NEXT: v_cvt_f32_u32_e32 v6, v1 -; CHECK-NEXT: v_sub_i32_e32 v10, vcc, 0, v2 -; CHECK-NEXT: v_subb_u32_e32 v11, vcc, 0, v1, vcc +; CHECK-NEXT: v_sub_i32_e32 v12, vcc, 0, v2 +; CHECK-NEXT: v_subb_u32_e32 v13, vcc, 0, v1, vcc ; CHECK-NEXT: v_mac_f32_e32 v3, 0x4f800000, v6 ; CHECK-NEXT: v_rcp_iflag_f32_e32 v3, v3 ; CHECK-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v3 ; CHECK-NEXT: v_mul_f32_e32 v6, 0x2f800000, v3 ; CHECK-NEXT: v_trunc_f32_e32 v8, v6 ; CHECK-NEXT: v_mac_f32_e32 v3, 0xcf800000, v8 -; CHECK-NEXT: v_cvt_u32_f32_e32 v9, v3 -; CHECK-NEXT: v_cvt_u32_f32_e32 v12, v8 -; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v9, 0 +; CHECK-NEXT: v_cvt_u32_f32_e32 v11, v3 +; CHECK-NEXT: v_cvt_u32_f32_e32 v14, v8 +; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v12, v11, 0 ; CHECK-NEXT: v_mov_b32_e32 v3, v7 -; CHECK-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v10, v12, v[3:4] -; CHECK-NEXT: v_mul_lo_u32 v3, v12, v6 -; CHECK-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v9, v[7:8] -; CHECK-NEXT: v_mul_hi_u32 v8, v9, v6 -; CHECK-NEXT: v_mul_hi_u32 v6, v12, v6 -; CHECK-NEXT: v_mul_lo_u32 v13, v9, v7 -; CHECK-NEXT: v_mul_lo_u32 v14, v12, v7 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v13 -; CHECK-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; CHECK-NEXT: v_mad_u64_u32 v[7:8], 
s[4:5], v12, v14, v[3:4] +; CHECK-NEXT: v_mul_lo_u32 v3, v14, v6 +; CHECK-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v11, v[7:8] +; CHECK-NEXT: v_mul_hi_u32 v7, v11, v6 +; CHECK-NEXT: v_mul_hi_u32 v6, v14, v6 +; CHECK-NEXT: v_mul_lo_u32 v8, v11, v9 +; CHECK-NEXT: v_mul_lo_u32 v10, v14, v9 ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v8 -; CHECK-NEXT: v_mul_hi_u32 v8, v9, v7 +; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v7 +; CHECK-NEXT: v_mul_hi_u32 v7, v11, v9 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v13, v3 -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v14, v6 -; CHECK-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v8 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v8, v3 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v10, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v8, vcc, v13, v8 -; CHECK-NEXT: v_mul_hi_u32 v7, v12, v7 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v7 +; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v8, v7 +; CHECK-NEXT: v_mul_hi_u32 v8, v14, v9 ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v6, v3 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v8, v6 ; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; CHECK-NEXT: v_add_i32_e32 v9, vcc, v9, v3 -; CHECK-NEXT: v_addc_u32_e32 v12, vcc, v12, v6, vcc -; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v9, 0 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v8, v6 +; CHECK-NEXT: v_add_i32_e32 v11, vcc, v11, v3 +; CHECK-NEXT: v_addc_u32_e32 v14, vcc, v14, v6, vcc +; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v12, v11, 0 ; CHECK-NEXT: v_mov_b32_e32 v3, v7 -; CHECK-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v10, v12, v[3:4] -; CHECK-NEXT: v_ashrrev_i32_e32 v10, 31, v5 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v10 -; CHECK-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v9, v[7:8] -; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v5, v10, vcc -; CHECK-NEXT: v_xor_b32_e32 v8, v3, v10 -; CHECK-NEXT: v_mul_lo_u32 v3, v12, v6 -; CHECK-NEXT: v_mul_lo_u32 v5, v9, v7 -; CHECK-NEXT: v_xor_b32_e32 v11, v4, v10 -; CHECK-NEXT: v_mul_hi_u32 v4, v9, v6 -; CHECK-NEXT: v_mul_hi_u32 v6, v12, v6 +; CHECK-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v12, v14, v[3:4] +; CHECK-NEXT: v_ashrrev_i32_e32 v12, 31, v5 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v12 +; CHECK-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v11, v[7:8] +; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v5, v12, vcc +; CHECK-NEXT: v_xor_b32_e32 v10, v3, v12 +; CHECK-NEXT: v_mul_lo_u32 v3, v14, v6 +; CHECK-NEXT: v_mul_lo_u32 v5, v11, v9 +; CHECK-NEXT: v_xor_b32_e32 v13, v4, v12 +; CHECK-NEXT: v_mul_hi_u32 v4, v11, v6 +; CHECK-NEXT: v_mul_hi_u32 v6, v14, v6 ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v4, v12, v7 +; CHECK-NEXT: v_mul_lo_u32 v4, v14, v9 ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v5, v3 -; CHECK-NEXT: v_mul_hi_u32 v5, v9, v7 +; CHECK-NEXT: v_mul_hi_u32 v5, v11, v9 ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CHECK-NEXT: v_mul_hi_u32 v6, v12, v7 +; CHECK-NEXT: v_mul_hi_u32 v6, v14, v9 ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v3 ; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4 ; 
CHECK-NEXT: v_add_i32_e32 v4, vcc, v6, v4 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v9, v3 -; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v12, v4, vcc -; CHECK-NEXT: v_mul_lo_u32 v5, v11, v3 -; CHECK-NEXT: v_mul_lo_u32 v6, v8, v4 -; CHECK-NEXT: v_mul_hi_u32 v7, v8, v3 -; CHECK-NEXT: v_mul_hi_u32 v3, v11, v3 -; CHECK-NEXT: v_mul_hi_u32 v9, v11, v4 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v11, v3 +; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v14, v4, vcc +; CHECK-NEXT: v_mul_lo_u32 v5, v13, v3 +; CHECK-NEXT: v_mul_lo_u32 v6, v10, v4 +; CHECK-NEXT: v_mul_hi_u32 v7, v10, v3 +; CHECK-NEXT: v_mul_hi_u32 v3, v13, v3 ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v7 ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v7, v11, v4 +; CHECK-NEXT: v_mul_lo_u32 v7, v13, v4 ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CHECK-NEXT: v_mul_hi_u32 v6, v8, v4 +; CHECK-NEXT: v_mul_hi_u32 v6, v10, v4 ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v7, v3 ; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; CHECK-NEXT: v_add_i32_e32 v7, vcc, v3, v5 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v2, v7, 0 +; CHECK-NEXT: v_add_i32_e32 v9, vcc, v3, v5 +; CHECK-NEXT: v_mul_hi_u32 v7, v13, v4 +; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v2, v9, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v9, v5 -; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v6, v[4:5] -; CHECK-NEXT: v_sub_i32_e32 v3, vcc, v8, v3 -; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, v7, v[4:5] -; CHECK-NEXT: v_subb_u32_e64 v5, s[4:5], v11, v4, vcc -; CHECK-NEXT: v_sub_i32_e64 v4, s[4:5], v11, v4 -; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v1 -; CHECK-NEXT: v_subb_u32_e32 v4, vcc, v4, v1, vcc -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] +; CHECK-NEXT: v_add_i32_e32 v11, vcc, v7, v5 +; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v2, v11, v[4:5] +; CHECK-NEXT: v_sub_i32_e32 v3, vcc, v10, v3 +; CHECK-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v1, v9, v[5:6] +; CHECK-NEXT: v_subb_u32_e64 v4, s[4:5], v13, v7, vcc +; CHECK-NEXT: v_sub_i32_e64 v5, s[4:5], v13, v7 +; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v1 +; CHECK-NEXT: v_subb_u32_e32 v5, vcc, v5, v1, vcc +; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] ; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v2 ; CHECK-NEXT: v_sub_i32_e32 v3, vcc, v3, v2 -; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] -; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v5, v1 -; CHECK-NEXT: v_subbrev_u32_e32 v4, vcc, 0, v4, vcc -; CHECK-NEXT: v_cndmask_b32_e64 v5, v8, v9, s[4:5] -; CHECK-NEXT: v_add_i32_e32 v8, vcc, 1, v7 -; CHECK-NEXT: v_addc_u32_e32 v9, vcc, 0, v6, vcc -; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v4, v1 -; CHECK-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc +; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] +; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v1 +; CHECK-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc +; CHECK-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] +; CHECK-NEXT: v_add_i32_e32 v6, vcc, 1, v9 +; CHECK-NEXT: v_addc_u32_e32 v7, vcc, 0, v11, vcc +; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v5, v1 +; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc ; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v3, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc -; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; CHECK-NEXT: v_cndmask_b32_e32 v1, v11, v2, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, 
1, v8 -; CHECK-NEXT: v_addc_u32_e32 v3, vcc, 0, v9, vcc -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 ; CHECK-NEXT: v_cndmask_b32_e32 v1, v8, v2, vcc -; CHECK-NEXT: v_cndmask_b32_e32 v2, v9, v3, vcc -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; CHECK-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc -; CHECK-NEXT: v_xor_b32_e32 v3, v10, v0 -; CHECK-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, 1, v6 +; CHECK-NEXT: v_addc_u32_e32 v3, vcc, 0, v7, vcc +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; CHECK-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; CHECK-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc +; CHECK-NEXT: v_xor_b32_e32 v3, v12, v0 +; CHECK-NEXT: v_cndmask_b32_e32 v2, v11, v2, vcc ; CHECK-NEXT: v_xor_b32_e32 v0, v1, v3 ; CHECK-NEXT: v_xor_b32_e32 v1, v2, v3 ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v3 @@ -220,65 +220,65 @@ define amdgpu_ps i64 @s_sdiv_i64(i64 inreg %num, i64 inreg %den) { ; CHECK-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; CHECK-NEXT: v_trunc_f32_e32 v2, v1 ; CHECK-NEXT: v_mac_f32_e32 v0, 0xcf800000, v2 -; CHECK-NEXT: v_cvt_u32_f32_e32 v3, v0 -; CHECK-NEXT: v_cvt_u32_f32_e32 v4, v2 -; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s3, v3, 0 -; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v4, v[1:2] -; CHECK-NEXT: v_mul_hi_u32 v5, v3, v0 -; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s5, v3, v[1:2] -; CHECK-NEXT: v_mul_lo_u32 v2, v4, v0 -; CHECK-NEXT: v_mul_hi_u32 v0, v4, v0 -; CHECK-NEXT: v_mul_lo_u32 v6, v3, v1 -; CHECK-NEXT: v_mul_lo_u32 v7, v4, v1 -; CHECK-NEXT: v_mul_hi_u32 v8, v3, v1 -; CHECK-NEXT: v_mul_hi_u32 v1, v4, v1 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v6 -; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v5 +; CHECK-NEXT: v_cvt_u32_f32_e32 v6, v0 +; CHECK-NEXT: v_cvt_u32_f32_e32 v7, v2 +; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s3, v6, 0 +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s3, v7, v[1:2] +; CHECK-NEXT: v_mul_lo_u32 v1, v7, v0 +; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s5, v6, v[2:3] +; CHECK-NEXT: v_mul_hi_u32 v2, v6, v0 +; CHECK-NEXT: v_mul_hi_u32 v0, v7, v0 +; CHECK-NEXT: v_mul_lo_u32 v3, v6, v4 +; CHECK-NEXT: v_mul_lo_u32 v5, v7, v4 +; CHECK-NEXT: v_mul_hi_u32 v8, v6, v4 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2 +; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v3, v1 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v5, v0 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v6, v2 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v7, v0 -; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v8 -; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v5, v2 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; CHECK-NEXT: v_mul_hi_u32 v3, v7, v4 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v2, v1 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v3, v1 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v0 +; CHECK-NEXT: v_addc_u32_e32 v7, vcc, v7, v1, vcc +; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s3, v6, 0 +; CHECK-NEXT: 
v_mad_u64_u32 v[2:3], s[0:1], s3, v7, v[1:2] +; CHECK-NEXT: v_mul_lo_u32 v1, v7, v0 +; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s5, v6, v[2:3] +; CHECK-NEXT: v_mul_hi_u32 v3, v6, v0 +; CHECK-NEXT: v_mul_hi_u32 v0, v7, v0 +; CHECK-NEXT: v_mul_lo_u32 v2, v6, v4 ; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v0 -; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v4, v1, vcc -; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s3, v3, 0 -; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v4, v[1:2] -; CHECK-NEXT: v_mul_hi_u32 v6, v3, v0 -; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s5, v3, v[1:2] -; CHECK-NEXT: v_mul_lo_u32 v2, v4, v0 -; CHECK-NEXT: v_mul_hi_u32 v0, v4, v0 -; CHECK-NEXT: v_mul_lo_u32 v5, v3, v1 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v5 -; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v6, v4, v1 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; CHECK-NEXT: v_mul_hi_u32 v5, v3, v1 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v6, v0 -; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v5 -; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CHECK-NEXT: v_mul_hi_u32 v1, v4, v1 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; CHECK-NEXT: v_mul_lo_u32 v3, v7, v4 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v2, v1 +; CHECK-NEXT: v_mul_hi_u32 v2, v6, v4 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v3, v0 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v3, v0 -; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; CHECK-NEXT: v_mul_hi_u32 v3, v7, v4 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v2, v1 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v3, v1 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v6, v0 +; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v7, v1, vcc ; CHECK-NEXT: v_mul_lo_u32 v2, s13, v0 ; CHECK-NEXT: v_mul_lo_u32 v3, s12, v1 ; CHECK-NEXT: v_mul_hi_u32 v4, s12, v0 ; CHECK-NEXT: v_mul_hi_u32 v0, s13, v0 -; CHECK-NEXT: v_mul_hi_u32 v5, s13, v1 +; CHECK-NEXT: v_mov_b32_e32 v7, s13 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v4 @@ -291,39 +291,39 @@ define amdgpu_ps i64 @s_sdiv_i64(i64 inreg %num, i64 inreg %den) { ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v3 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v0, v2 -; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s10, v4, 0 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v0, v2 +; CHECK-NEXT: v_mul_hi_u32 v4, s13, v1 +; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s10, v6, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s10, v2, v[1:2] -; CHECK-NEXT: v_mov_b32_e32 v5, s13 +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v2 +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s10, v4, v[1:2] ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, s12, v0 -; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s11, v4, v[1:2] -; CHECK-NEXT: 
v_mov_b32_e32 v3, s11 -; CHECK-NEXT: v_subb_u32_e64 v2, s[0:1], v5, v1, vcc -; CHECK-NEXT: v_sub_i32_e64 v1, s[0:1], s13, v1 +; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s11, v6, v[2:3] +; CHECK-NEXT: v_mov_b32_e32 v1, s11 +; CHECK-NEXT: v_subb_u32_e64 v2, s[0:1], v7, v4, vcc +; CHECK-NEXT: v_sub_i32_e64 v3, s[0:1], s13, v4 ; CHECK-NEXT: v_cmp_le_u32_e64 s[0:1], s11, v2 -; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc -; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] +; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc +; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1] ; CHECK-NEXT: v_cmp_le_u32_e64 s[0:1], s10, v0 ; CHECK-NEXT: v_subrev_i32_e32 v0, vcc, s10, v0 ; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; CHECK-NEXT: v_add_i32_e32 v3, vcc, 1, v4 -; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1] +; CHECK-NEXT: v_add_i32_e32 v3, vcc, 1, v6 +; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] ; CHECK-NEXT: v_cmp_eq_u32_e64 s[0:1], s11, v2 ; CHECK-NEXT: v_cmp_le_u32_e32 vcc, s11, v1 -; CHECK-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[0:1] -; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc +; CHECK-NEXT: v_cndmask_b32_e64 v2, v4, v5, s[0:1] +; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc ; CHECK-NEXT: v_cmp_le_u32_e32 vcc, s10, v0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc ; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, s11, v1 -; CHECK-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; CHECK-NEXT: v_add_i32_e32 v1, vcc, 1, v3 ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; CHECK-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; CHECK-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc ; CHECK-NEXT: s_xor_b64 s[0:1], s[6:7], s[8:9] ; CHECK-NEXT: v_xor_b32_e32 v0, s0, v0 ; CHECK-NEXT: v_subrev_i32_e32 v0, vcc, s0, v0 @@ -382,263 +382,263 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: v_xor_b32_e32 v4, v5, v8 ; GISEL-NEXT: v_cvt_f32_u32_e32 v5, v10 ; GISEL-NEXT: v_cvt_f32_u32_e32 v9, v4 -; GISEL-NEXT: v_sub_i32_e32 v15, vcc, 0, v10 -; GISEL-NEXT: v_subb_u32_e32 v16, vcc, 0, v4, vcc +; GISEL-NEXT: v_sub_i32_e32 v17, vcc, 0, v10 +; GISEL-NEXT: v_subb_u32_e32 v18, vcc, 0, v4, vcc ; GISEL-NEXT: v_mac_f32_e32 v5, 0x4f800000, v9 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v5, v5 ; GISEL-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5 ; GISEL-NEXT: v_mul_f32_e32 v9, 0x2f800000, v5 ; GISEL-NEXT: v_trunc_f32_e32 v9, v9 ; GISEL-NEXT: v_mac_f32_e32 v5, 0xcf800000, v9 -; GISEL-NEXT: v_cvt_u32_f32_e32 v14, v5 +; GISEL-NEXT: v_cvt_u32_f32_e32 v16, v5 ; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v9 -; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v14, 0 +; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v17, v16, 0 ; GISEL-NEXT: v_mov_b32_e32 v5, v12 -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v15, v9, v[5:6] +; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v17, v9, v[5:6] ; GISEL-NEXT: v_mul_lo_u32 v5, v9, v11 -; GISEL-NEXT: v_mul_hi_u32 v17, v14, v11 -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v14, v[12:13] +; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v18, v16, v[12:13] +; GISEL-NEXT: v_mul_hi_u32 v13, v16, v11 ; GISEL-NEXT: v_mul_hi_u32 v11, v9, v11 -; GISEL-NEXT: v_mul_lo_u32 v13, v14, v12 +; GISEL-NEXT: v_mul_lo_u32 v12, v16, v14 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v17 ; GISEL-NEXT: 
v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v17, v9, v12 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v13, v5 -; GISEL-NEXT: v_mul_hi_u32 v13, v14, v12 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v17, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 +; GISEL-NEXT: v_mul_lo_u32 v13, v9, v14 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v12, v5 +; GISEL-NEXT: v_mul_hi_u32 v12, v16, v14 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v17, v13 -; GISEL-NEXT: v_mul_hi_u32 v12, v9, v12 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12 +; GISEL-NEXT: v_mul_hi_u32 v13, v9, v14 ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v11, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11 ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v5 -; GISEL-NEXT: v_addc_u32_e32 v17, vcc, v9, v11, vcc -; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v14, 0 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11 +; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v5 +; GISEL-NEXT: v_addc_u32_e32 v19, vcc, v9, v11, vcc +; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v17, v16, 0 ; GISEL-NEXT: v_ashrrev_i32_e32 v9, 31, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9 ; GISEL-NEXT: v_mov_b32_e32 v5, v12 -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v15, v17, v[5:6] +; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v17, v19, v[5:6] ; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v14, v[12:13] -; GISEL-NEXT: v_xor_b32_e32 v15, v0, v9 -; GISEL-NEXT: v_mul_lo_u32 v0, v17, v11 -; GISEL-NEXT: v_mul_lo_u32 v5, v14, v12 -; GISEL-NEXT: v_xor_b32_e32 v16, v1, v9 -; GISEL-NEXT: v_mul_hi_u32 v1, v14, v11 -; GISEL-NEXT: v_mul_hi_u32 v11, v17, v11 +; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v18, v16, v[12:13] +; GISEL-NEXT: v_xor_b32_e32 v17, v0, v9 +; GISEL-NEXT: v_mul_lo_u32 v0, v19, v11 +; GISEL-NEXT: v_mul_lo_u32 v5, v16, v14 +; GISEL-NEXT: v_xor_b32_e32 v18, v1, v9 +; GISEL-NEXT: v_mul_hi_u32 v1, v16, v11 +; GISEL-NEXT: v_mul_hi_u32 v11, v19, v11 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v1, v17, v12 +; GISEL-NEXT: v_mul_lo_u32 v1, v19, v14 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0 -; GISEL-NEXT: v_mul_hi_u32 v5, v14, v12 +; GISEL-NEXT: v_mul_hi_u32 v5, v16, v14 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v11, v5 -; GISEL-NEXT: v_mul_hi_u32 v11, v17, v12 +; GISEL-NEXT: v_mul_hi_u32 v11, v19, v14 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v5, v1 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v11, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v14, v0 -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v17, v1, vcc -; GISEL-NEXT: v_mul_lo_u32 v5, v16, v0 -; GISEL-NEXT: v_mul_lo_u32 v11, v15, v1 -; GISEL-NEXT: v_mul_hi_u32 v12, v15, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v16, v0 -; GISEL-NEXT: v_xor_b32_e32 v8, v9, v8 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v16, v0 +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v19, v1, 
vcc +; GISEL-NEXT: v_mul_lo_u32 v5, v18, v0 +; GISEL-NEXT: v_mul_lo_u32 v11, v17, v1 +; GISEL-NEXT: v_mul_hi_u32 v12, v17, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v18, v0 ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v12, v16, v1 +; GISEL-NEXT: v_mul_lo_u32 v12, v18, v1 ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v11, v5 -; GISEL-NEXT: v_mul_hi_u32 v11, v15, v1 +; GISEL-NEXT: v_mul_hi_u32 v11, v17, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v12, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v12, v11 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v5 -; GISEL-NEXT: v_mul_hi_u32 v1, v16, v1 +; GISEL-NEXT: v_mul_hi_u32 v1, v18, v1 ; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v10, v0, 0 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v13, v5 -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v1, v5 +; GISEL-NEXT: v_add_i32_e32 v16, vcc, v1, v5 ; GISEL-NEXT: v_mov_b32_e32 v1, v12 -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v10, v14, v[1:2] +; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v10, v16, v[1:2] ; GISEL-NEXT: v_ashrrev_i32_e32 v5, 31, v7 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v6, v5 ; GISEL-NEXT: v_addc_u32_e32 v6, vcc, v7, v5, vcc -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v4, v0, v[12:13] ; GISEL-NEXT: v_xor_b32_e32 v7, v1, v5 ; GISEL-NEXT: v_xor_b32_e32 v6, v6, v5 +; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v4, v0, v[12:13] ; GISEL-NEXT: v_cvt_f32_u32_e32 v1, v7 -; GISEL-NEXT: v_cvt_f32_u32_e32 v13, v6 -; GISEL-NEXT: v_sub_i32_e32 v15, vcc, v15, v11 -; GISEL-NEXT: v_sub_i32_e64 v11, s[4:5], v16, v12 -; GISEL-NEXT: v_mac_f32_e32 v1, 0x4f800000, v13 +; GISEL-NEXT: v_cvt_f32_u32_e32 v12, v6 +; GISEL-NEXT: v_sub_i32_e32 v13, vcc, v17, v11 +; GISEL-NEXT: v_sub_i32_e64 v11, s[4:5], v18, v14 +; GISEL-NEXT: v_mac_f32_e32 v1, 0x4f800000, v12 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GISEL-NEXT: v_subb_u32_e64 v17, s[4:5], v16, v12, vcc -; GISEL-NEXT: v_subb_u32_e32 v13, vcc, v11, v4, vcc +; GISEL-NEXT: v_subb_u32_e64 v17, s[4:5], v18, v14, vcc +; GISEL-NEXT: v_subb_u32_e32 v14, vcc, v11, v4, vcc ; GISEL-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1 ; GISEL-NEXT: v_mul_f32_e32 v11, 0x2f800000, v1 -; GISEL-NEXT: v_trunc_f32_e32 v16, v11 -; GISEL-NEXT: v_mac_f32_e32 v1, 0xcf800000, v16 +; GISEL-NEXT: v_trunc_f32_e32 v15, v11 +; GISEL-NEXT: v_mac_f32_e32 v1, 0xcf800000, v15 ; GISEL-NEXT: v_cvt_u32_f32_e32 v18, v1 ; GISEL-NEXT: v_sub_i32_e32 v19, vcc, 0, v7 ; GISEL-NEXT: v_subb_u32_e32 v20, vcc, 0, v6, vcc ; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[6:7], v19, v18, 0 -; GISEL-NEXT: v_cvt_u32_f32_e32 v16, v16 -; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v15, v10 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v15, v10 -; GISEL-NEXT: v_subbrev_u32_e32 v15, vcc, 0, v13, vcc +; GISEL-NEXT: v_cvt_u32_f32_e32 v22, v15 +; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v13, v10 +; GISEL-NEXT: v_subbrev_u32_e32 v21, vcc, 0, v14, vcc ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v10 ; GISEL-NEXT: v_mov_b32_e32 v1, v12 -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[6:7], v19, v16, v[1:2] -; GISEL-NEXT: v_mul_lo_u32 v1, v16, v11 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v15, v4 -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[6:7], v20, v18, v[12:13] -; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v13, v10 +; GISEL-NEXT: 
v_mad_u64_u32 v[12:13], s[6:7], v19, v22, v[1:2] +; GISEL-NEXT: v_mul_lo_u32 v1, v22, v11 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v21, v4 +; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[6:7], v20, v18, v[12:13] +; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v17, v4 -; GISEL-NEXT: v_mul_lo_u32 v10, v18, v12 +; GISEL-NEXT: v_mul_lo_u32 v10, v18, v14 ; GISEL-NEXT: v_add_i32_e64 v1, s[6:7], v1, v10 ; GISEL-NEXT: v_mul_hi_u32 v10, v18, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[6:7] -; GISEL-NEXT: v_mul_hi_u32 v11, v16, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[6:7] +; GISEL-NEXT: v_mul_hi_u32 v11, v22, v11 ; GISEL-NEXT: v_add_i32_e64 v1, s[6:7], v1, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[8:9] ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v15, v4 +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v21, v4 ; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v17, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[8:9] +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[8:9] ; GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v10, vcc ; GISEL-NEXT: v_add_i32_e32 v10, vcc, 1, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v4, v15, v21, s[4:5] -; GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v14, vcc -; GISEL-NEXT: v_add_i32_e32 v17, vcc, 1, v10 -; GISEL-NEXT: v_addc_u32_e32 v21, vcc, 0, v15, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v4, v13, v15, s[4:5] +; GISEL-NEXT: v_addc_u32_e32 v13, vcc, 0, v16, vcc +; GISEL-NEXT: v_add_i32_e32 v15, vcc, 1, v10 +; GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v13, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; GISEL-NEXT: v_cndmask_b32_e32 v1, v10, v17, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v1, v10, v15, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[6:7] -; GISEL-NEXT: v_cndmask_b32_e32 v15, v15, v21, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v13, v10 -; GISEL-NEXT: v_mul_lo_u32 v13, v16, v12 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11 -; GISEL-NEXT: v_mul_hi_u32 v13, v18, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; GISEL-NEXT: v_mul_hi_u32 v12, v16, v12 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v17, v13 +; GISEL-NEXT: v_cndmask_b32_e32 v13, v13, v17, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v12, v10 +; GISEL-NEXT: v_mul_lo_u32 v12, v22, v14 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 +; GISEL-NEXT: v_mul_hi_u32 v12, v18, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; GISEL-NEXT: v_mul_hi_u32 v14, v22, v14 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v15, v12 ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11 ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v14, v11 ; GISEL-NEXT: v_add_i32_e32 v12, vcc, v18, v10 -; GISEL-NEXT: v_addc_u32_e32 v13, vcc, v16, v11, vcc +; GISEL-NEXT: v_addc_u32_e32 v14, vcc, v22, v11, vcc ; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v19, v12, 0 ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GISEL-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc ; GISEL-NEXT: v_mov_b32_e32 v0, v11 -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v19, v13, v[0:1] -; GISEL-NEXT: v_ashrrev_i32_e32 v11, 31, v3 -; GISEL-NEXT: v_cndmask_b32_e32 v14, v14, v15, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v20, v12, v[0:1] -; GISEL-NEXT: v_add_i32_e32 
v1, vcc, v2, v11 -; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v3, v11, vcc -; GISEL-NEXT: v_xor_b32_e32 v15, v1, v11 -; GISEL-NEXT: v_mul_lo_u32 v1, v13, v10 +; GISEL-NEXT: v_cndmask_b32_e32 v4, v16, v13, vcc +; GISEL-NEXT: v_xor_b32_e32 v13, v9, v8 +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v19, v14, v[0:1] +; GISEL-NEXT: v_xor_b32_e32 v11, v1, v13 +; GISEL-NEXT: v_ashrrev_i32_e32 v15, 31, v3 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v20, v12, v[8:9] +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v2, v15 +; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v3, v15, vcc +; GISEL-NEXT: v_xor_b32_e32 v16, v1, v15 +; GISEL-NEXT: v_mul_lo_u32 v1, v14, v10 ; GISEL-NEXT: v_mul_lo_u32 v3, v12, v0 -; GISEL-NEXT: v_xor_b32_e32 v16, v2, v11 +; GISEL-NEXT: v_xor_b32_e32 v17, v2, v15 ; GISEL-NEXT: v_mul_hi_u32 v2, v12, v10 -; GISEL-NEXT: v_xor_b32_e32 v9, v4, v8 +; GISEL-NEXT: v_mul_hi_u32 v8, v14, v10 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v2 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v2, v13, v0 -; GISEL-NEXT: v_mul_hi_u32 v4, v13, v10 +; GISEL-NEXT: v_mul_lo_u32 v2, v14, v0 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v3, v1 ; GISEL-NEXT: v_mul_hi_u32 v3, v12, v0 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GISEL-NEXT: v_mul_hi_u32 v0, v13, v0 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v8, v3 +; GISEL-NEXT: v_mul_hi_u32 v0, v14, v0 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v2, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v12, v1 -; GISEL-NEXT: v_addc_u32_e32 v0, vcc, v13, v0, vcc -; GISEL-NEXT: v_mul_lo_u32 v2, v16, v1 -; GISEL-NEXT: v_mul_lo_u32 v3, v15, v0 -; GISEL-NEXT: v_mul_hi_u32 v4, v15, v1 -; GISEL-NEXT: v_mul_hi_u32 v1, v16, v1 -; GISEL-NEXT: v_xor_b32_e32 v10, v14, v8 +; GISEL-NEXT: v_addc_u32_e32 v0, vcc, v14, v0, vcc +; GISEL-NEXT: v_mul_lo_u32 v2, v17, v1 +; GISEL-NEXT: v_mul_lo_u32 v3, v16, v0 +; GISEL-NEXT: v_xor_b32_e32 v8, v4, v13 +; GISEL-NEXT: v_mul_hi_u32 v4, v16, v1 +; GISEL-NEXT: v_mul_hi_u32 v1, v17, v1 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v4, v16, v0 +; GISEL-NEXT: v_mul_lo_u32 v4, v17, v0 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GISEL-NEXT: v_mul_hi_u32 v3, v15, v0 +; GISEL-NEXT: v_mul_hi_u32 v3, v16, v0 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v4, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v3 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v1, v2 -; GISEL-NEXT: v_mul_hi_u32 v0, v16, v0 -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v7, v12, 0 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v1, v2 +; GISEL-NEXT: v_mul_hi_u32 v0, v17, v0 +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v7, v10, 0 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v4, v1 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v0, v1 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v0, v1 ; GISEL-NEXT: v_mov_b32_e32 v0, v3 -; 
GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v7, v13, v[0:1] -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v9, v8 -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v12, v[3:4] -; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v10, v8, vcc -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v15, v2 -; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v16, v3, vcc -; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v16, v3 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v6 -; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v6, vcc +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v7, v12, v[0:1] +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v11, v13 +; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v8, v13, vcc +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v6, v10, v[3:4] +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v16, v2 +; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], v17, v8, vcc +; GISEL-NEXT: v_sub_i32_e64 v4, s[4:5], v17, v8 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v6 +; GISEL-NEXT: v_subb_u32_e32 v4, vcc, v4, v6, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v7 ; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v6 -; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v4, v8, v9, s[4:5] -; GISEL-NEXT: v_add_i32_e32 v8, vcc, 1, v12 -; GISEL-NEXT: v_addc_u32_e32 v9, vcc, 0, v13, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v3, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v6 +; GISEL-NEXT: v_subbrev_u32_e32 v4, vcc, 0, v4, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v3, v8, v9, s[4:5] +; GISEL-NEXT: v_add_i32_e32 v8, vcc, 1, v10 +; GISEL-NEXT: v_addc_u32_e32 v9, vcc, 0, v12, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v4, v6 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v2, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, 1, v8 +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 +; GISEL-NEXT: v_cndmask_b32_e32 v2, v11, v2, vcc +; GISEL-NEXT: v_add_i32_e32 v4, vcc, 1, v8 ; GISEL-NEXT: v_addc_u32_e32 v6, vcc, 0, v9, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v8, v3, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v3, v9, v6, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v12, v2, vcc -; GISEL-NEXT: v_xor_b32_e32 v4, v11, v5 -; GISEL-NEXT: v_cndmask_b32_e32 v3, v13, v3, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v2, v8, v4, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v4, v9, v6, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; GISEL-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v3, v12, v4, vcc +; GISEL-NEXT: v_xor_b32_e32 v4, v15, v5 ; GISEL-NEXT: v_xor_b32_e32 v2, v2, v4 ; GISEL-NEXT: v_xor_b32_e32 v3, v3, v4 ; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v4 @@ -667,100 +667,100 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_xor_b32_e32 v1, v3, v0 ; CGP-NEXT: v_cvt_f32_u32_e32 v3, v2 ; CGP-NEXT: v_cvt_f32_u32_e32 v4, v1 -; CGP-NEXT: v_sub_i32_e32 v13, vcc, 0, v2 -; CGP-NEXT: v_subb_u32_e32 v14, vcc, 0, v1, vcc +; CGP-NEXT: v_sub_i32_e32 v15, vcc, 0, v2 +; CGP-NEXT: v_subb_u32_e32 v16, vcc, 0, v1, vcc ; CGP-NEXT: v_mac_f32_e32 v3, 0x4f800000, v4 ; CGP-NEXT: v_rcp_iflag_f32_e32 v3, v3 ; CGP-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v3 ; CGP-NEXT: v_mul_f32_e32 v4, 0x2f800000, v3 ; CGP-NEXT: v_trunc_f32_e32 v5, v4 ; CGP-NEXT: 
v_mac_f32_e32 v3, 0xcf800000, v5 -; CGP-NEXT: v_cvt_u32_f32_e32 v12, v3 -; CGP-NEXT: v_cvt_u32_f32_e32 v15, v5 -; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v13, v12, 0 -; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v13, v15, v[4:5] -; CGP-NEXT: v_mul_hi_u32 v16, v12, v3 -; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v14, v12, v[4:5] -; CGP-NEXT: v_mul_lo_u32 v5, v15, v3 -; CGP-NEXT: v_mul_hi_u32 v3, v15, v3 -; CGP-NEXT: v_mul_lo_u32 v17, v12, v4 -; CGP-NEXT: v_mul_lo_u32 v18, v15, v4 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v17 -; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v16 -; CGP-NEXT: v_mul_hi_u32 v16, v12, v4 +; CGP-NEXT: v_cvt_u32_f32_e32 v14, v3 +; CGP-NEXT: v_cvt_u32_f32_e32 v17, v5 +; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v15, v14, 0 +; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v15, v17, v[4:5] +; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v16, v14, v[12:13] +; CGP-NEXT: v_mul_lo_u32 v5, v17, v3 +; CGP-NEXT: v_mul_hi_u32 v12, v14, v3 +; CGP-NEXT: v_mul_lo_u32 v13, v14, v4 +; CGP-NEXT: v_mul_hi_u32 v3, v17, v3 +; CGP-NEXT: v_mul_lo_u32 v18, v17, v4 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v13 +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v12 +; CGP-NEXT: v_mul_hi_u32 v12, v14, v4 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v17, v5 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v13, v5 ; CGP-NEXT: v_add_i32_e32 v3, vcc, v18, v3 -; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v16 -; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v16, vcc, v17, v16 -; CGP-NEXT: v_mul_hi_u32 v4, v15, v4 +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v12 +; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12 +; CGP-NEXT: v_mul_hi_u32 v4, v17, v4 ; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v16, v5 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v12, v5 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v3 -; CGP-NEXT: v_addc_u32_e32 v15, vcc, v15, v4, vcc -; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v13, v12, 0 -; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v13, v15, v[4:5] -; CGP-NEXT: v_ashrrev_i32_e32 v13, 31, v11 -; CGP-NEXT: v_mul_hi_u32 v16, v12, v3 -; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v14, v12, v[4:5] -; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v13 -; CGP-NEXT: v_addc_u32_e32 v10, vcc, v11, v13, vcc -; CGP-NEXT: v_xor_b32_e32 v11, v5, v13 -; CGP-NEXT: v_mul_lo_u32 v5, v15, v3 -; CGP-NEXT: v_mul_lo_u32 v14, v12, v4 -; CGP-NEXT: v_mul_hi_u32 v3, v15, v3 -; CGP-NEXT: v_xor_b32_e32 v10, v10, v13 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v14 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v16 +; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v3 +; CGP-NEXT: v_addc_u32_e32 v17, vcc, v17, v4, vcc +; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v15, v14, 0 +; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v15, v17, v[4:5] +; CGP-NEXT: v_ashrrev_i32_e32 v15, 31, v11 +; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v16, v14, v[12:13] +; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v15 +; CGP-NEXT: v_addc_u32_e32 v10, vcc, v11, v15, vcc +; CGP-NEXT: v_xor_b32_e32 v12, v5, v15 +; CGP-NEXT: v_mul_lo_u32 v5, v17, v3 +; CGP-NEXT: v_mul_lo_u32 v11, v14, v4 +; CGP-NEXT: v_xor_b32_e32 v13, v10, v15 +; CGP-NEXT: v_mul_hi_u32 v10, v14, v3 +; CGP-NEXT: v_mul_hi_u32 v3, v17, v3 +; CGP-NEXT: 
v_add_i32_e32 v5, vcc, v5, v11 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v10 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v16, v15, v4 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v14, v5 -; CGP-NEXT: v_mul_hi_u32 v14, v12, v4 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v16, v3 -; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v14 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v14, vcc, v16, v14 -; CGP-NEXT: v_mul_hi_u32 v4, v15, v4 +; CGP-NEXT: v_mul_lo_u32 v10, v17, v4 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v11, v5 +; CGP-NEXT: v_mul_hi_u32 v11, v14, v4 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v10, v3 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v11 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v11 +; CGP-NEXT: v_mul_hi_u32 v4, v17, v4 ; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v14, v5 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v12, v3 -; CGP-NEXT: v_addc_u32_e32 v4, vcc, v15, v4, vcc -; CGP-NEXT: v_mul_lo_u32 v5, v10, v3 -; CGP-NEXT: v_mul_lo_u32 v12, v11, v4 -; CGP-NEXT: v_mul_hi_u32 v14, v11, v3 -; CGP-NEXT: v_mul_hi_u32 v3, v10, v3 -; CGP-NEXT: v_mul_hi_u32 v15, v10, v4 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v12 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v14 -; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v14, v10, v4 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v12, v5 -; CGP-NEXT: v_mul_hi_u32 v12, v11, v4 ; CGP-NEXT: v_add_i32_e32 v3, vcc, v14, v3 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v12 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v12, vcc, v14, v12 +; CGP-NEXT: v_addc_u32_e32 v4, vcc, v17, v4, vcc +; CGP-NEXT: v_mul_lo_u32 v5, v13, v3 +; CGP-NEXT: v_mul_lo_u32 v10, v12, v4 +; CGP-NEXT: v_mul_hi_u32 v11, v12, v3 +; CGP-NEXT: v_mul_hi_u32 v3, v13, v3 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v10 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v11 +; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CGP-NEXT: v_mul_lo_u32 v11, v13, v4 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5 +; CGP-NEXT: v_mul_hi_u32 v10, v12, v4 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v11, v3 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v10 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 ; CGP-NEXT: v_add_i32_e32 v14, vcc, v3, v5 +; CGP-NEXT: v_mul_hi_u32 v11, v13, v4 ; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v2, v14, 0 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v12, v5 -; CGP-NEXT: v_add_i32_e32 v12, vcc, v15, v5 -; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v12, v[4:5] -; CGP-NEXT: v_sub_i32_e32 v3, vcc, v11, v3 -; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, v14, v[4:5] -; CGP-NEXT: v_subb_u32_e64 v5, s[4:5], v10, v4, vcc -; CGP-NEXT: v_sub_i32_e64 v4, s[4:5], v10, v4 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5 +; CGP-NEXT: v_add_i32_e32 v16, vcc, v11, v5 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v2, v16, v[4:5] +; CGP-NEXT: v_sub_i32_e32 v3, vcc, v12, v3 +; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, v14, v[10:11] +; CGP-NEXT: v_subb_u32_e64 v5, s[4:5], v13, v4, vcc +; CGP-NEXT: 
v_sub_i32_e64 v4, s[4:5], v13, v4 ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v1 ; CGP-NEXT: v_subb_u32_e32 v4, vcc, v4, v1, vcc ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] @@ -771,13 +771,13 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_subbrev_u32_e32 v4, vcc, 0, v4, vcc ; CGP-NEXT: v_cndmask_b32_e64 v5, v10, v11, s[4:5] ; CGP-NEXT: v_add_i32_e32 v10, vcc, 1, v14 -; CGP-NEXT: v_addc_u32_e32 v11, vcc, 0, v12, vcc +; CGP-NEXT: v_addc_u32_e32 v11, vcc, 0, v16, vcc ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v4, v1 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, -1, vcc +; CGP-NEXT: v_cndmask_b32_e64 v12, 0, -1, vcc ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v3, v2 ; CGP-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc ; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; CGP-NEXT: v_cndmask_b32_e32 v1, v15, v2, vcc +; CGP-NEXT: v_cndmask_b32_e32 v1, v12, v2, vcc ; CGP-NEXT: v_add_i32_e32 v2, vcc, 1, v10 ; CGP-NEXT: v_addc_u32_e32 v3, vcc, 0, v11, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 @@ -785,8 +785,8 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_cndmask_b32_e32 v2, v11, v3, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 ; CGP-NEXT: v_cndmask_b32_e32 v1, v14, v1, vcc -; CGP-NEXT: v_xor_b32_e32 v3, v13, v0 -; CGP-NEXT: v_cndmask_b32_e32 v2, v12, v2, vcc +; CGP-NEXT: v_xor_b32_e32 v3, v15, v0 +; CGP-NEXT: v_cndmask_b32_e32 v2, v16, v2, vcc ; CGP-NEXT: v_xor_b32_e32 v0, v1, v3 ; CGP-NEXT: v_xor_b32_e32 v1, v2, v3 ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v3 @@ -840,126 +840,126 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_xor_b32_e32 v3, v5, v2 ; CGP-NEXT: v_cvt_f32_u32_e32 v5, v4 ; CGP-NEXT: v_cvt_f32_u32_e32 v6, v3 -; CGP-NEXT: v_sub_i32_e32 v11, vcc, 0, v4 -; CGP-NEXT: v_subb_u32_e32 v12, vcc, 0, v3, vcc +; CGP-NEXT: v_sub_i32_e32 v13, vcc, 0, v4 +; CGP-NEXT: v_subb_u32_e32 v14, vcc, 0, v3, vcc ; CGP-NEXT: v_mac_f32_e32 v5, 0x4f800000, v6 ; CGP-NEXT: v_rcp_iflag_f32_e32 v5, v5 ; CGP-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5 ; CGP-NEXT: v_mul_f32_e32 v6, 0x2f800000, v5 ; CGP-NEXT: v_trunc_f32_e32 v7, v6 ; CGP-NEXT: v_mac_f32_e32 v5, 0xcf800000, v7 -; CGP-NEXT: v_cvt_u32_f32_e32 v10, v5 -; CGP-NEXT: v_cvt_u32_f32_e32 v13, v7 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v11, v10, 0 -; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v11, v13, v[6:7] -; CGP-NEXT: v_mul_hi_u32 v14, v10, v5 -; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v12, v10, v[6:7] -; CGP-NEXT: v_mul_lo_u32 v7, v13, v5 -; CGP-NEXT: v_mul_hi_u32 v5, v13, v5 -; CGP-NEXT: v_mul_lo_u32 v15, v10, v6 -; CGP-NEXT: v_mul_lo_u32 v16, v13, v6 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v15 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v14 -; CGP-NEXT: v_mul_hi_u32 v14, v10, v6 +; CGP-NEXT: v_cvt_u32_f32_e32 v12, v5 +; CGP-NEXT: v_cvt_u32_f32_e32 v15, v7 +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v12, 0 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v13, v15, v[6:7] +; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v14, v12, v[10:11] +; CGP-NEXT: v_mul_lo_u32 v7, v15, v5 +; CGP-NEXT: v_mul_hi_u32 v10, v12, v5 +; CGP-NEXT: v_mul_lo_u32 v11, v12, v6 +; CGP-NEXT: v_mul_hi_u32 v5, v15, v5 +; CGP-NEXT: v_mul_lo_u32 v16, v15, v6 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v11 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v10 +; CGP-NEXT: v_mul_hi_u32 v10, v12, v6 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v7, vcc, v15, v7 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v11, v7 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v16, 
v5 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v14 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; CGP-NEXT: v_mul_hi_u32 v6, v13, v6 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v10 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; CGP-NEXT: v_mul_hi_u32 v6, v15, v6 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v7, vcc, v14, v7 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v10, v7 ; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v7 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v5 -; CGP-NEXT: v_addc_u32_e32 v13, vcc, v13, v6, vcc -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v11, v10, 0 -; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v11, v13, v[6:7] -; CGP-NEXT: v_ashrrev_i32_e32 v11, 31, v9 -; CGP-NEXT: v_mul_hi_u32 v14, v10, v5 -; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v12, v10, v[6:7] -; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v11 -; CGP-NEXT: v_addc_u32_e32 v8, vcc, v9, v11, vcc -; CGP-NEXT: v_xor_b32_e32 v9, v7, v11 -; CGP-NEXT: v_mul_lo_u32 v7, v13, v5 -; CGP-NEXT: v_mul_lo_u32 v12, v10, v6 -; CGP-NEXT: v_mul_hi_u32 v5, v13, v5 -; CGP-NEXT: v_xor_b32_e32 v8, v8, v11 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v12 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v14 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v5 +; CGP-NEXT: v_addc_u32_e32 v15, vcc, v15, v6, vcc +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v12, 0 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v13, v15, v[6:7] +; CGP-NEXT: v_ashrrev_i32_e32 v13, 31, v9 +; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v14, v12, v[10:11] +; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v13 +; CGP-NEXT: v_addc_u32_e32 v8, vcc, v9, v13, vcc +; CGP-NEXT: v_xor_b32_e32 v11, v7, v13 +; CGP-NEXT: v_mul_lo_u32 v7, v15, v5 +; CGP-NEXT: v_mul_lo_u32 v9, v12, v6 +; CGP-NEXT: v_xor_b32_e32 v14, v8, v13 +; CGP-NEXT: v_mul_hi_u32 v8, v12, v5 +; CGP-NEXT: v_mul_hi_u32 v5, v15, v5 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v9 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v8 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v14, v13, v6 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v12, v7 -; CGP-NEXT: v_mul_hi_u32 v12, v10, v6 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v14, v5 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v12 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v12, vcc, v14, v12 -; CGP-NEXT: v_mul_hi_u32 v6, v13, v6 +; CGP-NEXT: v_mul_lo_u32 v8, v15, v6 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v9, v7 +; CGP-NEXT: v_mul_hi_u32 v9, v12, v6 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v8, v5 +; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v9 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9 +; CGP-NEXT: v_mul_hi_u32 v6, v15, v6 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v7, vcc, v12, v7 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7 ; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v7 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5 -; CGP-NEXT: v_addc_u32_e32 v6, vcc, v13, v6, vcc -; CGP-NEXT: v_mul_lo_u32 v7, v8, v5 -; CGP-NEXT: v_mul_lo_u32 v10, v9, v6 -; CGP-NEXT: v_mul_hi_u32 v12, v9, v5 -; CGP-NEXT: v_mul_hi_u32 v5, v8, v5 -; CGP-NEXT: v_mul_hi_u32 v13, v8, v6 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v10 
-; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v12 -; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v12, v8, v6 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v10, v7 -; CGP-NEXT: v_mul_hi_u32 v10, v9, v6 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v12, v5 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v10 -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v10, vcc, v12, v10 +; CGP-NEXT: v_addc_u32_e32 v6, vcc, v15, v6, vcc +; CGP-NEXT: v_mul_lo_u32 v7, v14, v5 +; CGP-NEXT: v_mul_lo_u32 v8, v11, v6 +; CGP-NEXT: v_mul_hi_u32 v9, v11, v5 +; CGP-NEXT: v_mul_hi_u32 v5, v14, v5 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v8 +; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v9 +; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CGP-NEXT: v_mul_lo_u32 v9, v14, v6 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7 +; CGP-NEXT: v_mul_hi_u32 v8, v11, v6 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v9, v5 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v8 +; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8 ; CGP-NEXT: v_add_i32_e32 v12, vcc, v5, v7 +; CGP-NEXT: v_mul_hi_u32 v9, v14, v6 ; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v12, 0 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v7, vcc, v10, v7 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v13, v7 -; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v4, v10, v[6:7] -; CGP-NEXT: v_sub_i32_e32 v5, vcc, v9, v5 -; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v3, v12, v[6:7] -; CGP-NEXT: v_subb_u32_e64 v7, s[4:5], v8, v6, vcc -; CGP-NEXT: v_sub_i32_e64 v6, s[4:5], v8, v6 -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v3 -; CGP-NEXT: v_subb_u32_e32 v6, vcc, v6, v3, vcc +; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7 +; CGP-NEXT: v_add_i32_e32 v15, vcc, v9, v7 +; CGP-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v4, v15, v[6:7] +; CGP-NEXT: v_sub_i32_e32 v5, vcc, v11, v5 +; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v3, v12, v[7:8] +; CGP-NEXT: v_subb_u32_e64 v6, s[4:5], v14, v9, vcc +; CGP-NEXT: v_sub_i32_e64 v7, s[4:5], v14, v9 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v3 +; CGP-NEXT: v_subb_u32_e32 v7, vcc, v7, v3, vcc ; CGP-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v4 ; CGP-NEXT: v_sub_i32_e32 v5, vcc, v5, v4 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v3 -; CGP-NEXT: v_subbrev_u32_e32 v6, vcc, 0, v6, vcc -; CGP-NEXT: v_cndmask_b32_e64 v7, v8, v9, s[4:5] +; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v6, v3 +; CGP-NEXT: v_subbrev_u32_e32 v7, vcc, 0, v7, vcc +; CGP-NEXT: v_cndmask_b32_e64 v6, v8, v9, s[4:5] ; CGP-NEXT: v_add_i32_e32 v8, vcc, 1, v12 -; CGP-NEXT: v_addc_u32_e32 v9, vcc, 0, v10, vcc -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v6, v3 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, -1, vcc +; CGP-NEXT: v_addc_u32_e32 v9, vcc, 0, v15, vcc +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v7, v3 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v5, v4 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc -; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 -; CGP-NEXT: v_cndmask_b32_e32 v3, v13, v4, vcc +; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 +; CGP-NEXT: v_cndmask_b32_e32 v3, v10, v4, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v8 ; CGP-NEXT: v_addc_u32_e32 v5, vcc, 0, v9, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 ; CGP-NEXT: v_cndmask_b32_e32 v3, v8, v4, vcc ; CGP-NEXT: v_cndmask_b32_e32 v4, v9, v5, vcc -; CGP-NEXT: 
v_cmp_ne_u32_e32 vcc, 0, v7 +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 ; CGP-NEXT: v_cndmask_b32_e32 v3, v12, v3, vcc -; CGP-NEXT: v_xor_b32_e32 v5, v11, v2 -; CGP-NEXT: v_cndmask_b32_e32 v4, v10, v4, vcc +; CGP-NEXT: v_xor_b32_e32 v5, v13, v2 +; CGP-NEXT: v_cndmask_b32_e32 v4, v15, v4, vcc ; CGP-NEXT: v_xor_b32_e32 v2, v3, v5 ; CGP-NEXT: v_xor_b32_e32 v3, v4, v5 ; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v5 @@ -1049,82 +1049,82 @@ define i64 @v_sdiv_i64_oddk_denom(i64 %num) { ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_cvt_f32_u32_e32 v2, 0x12d8fb ; CHECK-NEXT: v_cvt_f32_ubyte0_e32 v3, 0 -; CHECK-NEXT: v_mov_b32_e32 v6, 0xffed2705 +; CHECK-NEXT: v_mov_b32_e32 v9, 0xffed2705 ; CHECK-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3 ; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; CHECK-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 ; CHECK-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 ; CHECK-NEXT: v_trunc_f32_e32 v4, v3 ; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v4 -; CHECK-NEXT: v_cvt_u32_f32_e32 v5, v2 -; CHECK-NEXT: v_cvt_u32_f32_e32 v7, v4 -; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v5, 0 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v7, v[3:4] -; CHECK-NEXT: v_mul_hi_u32 v8, v5, v2 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], -1, v5, v[3:4] -; CHECK-NEXT: v_mul_lo_u32 v4, v7, v2 -; CHECK-NEXT: v_mul_hi_u32 v2, v7, v2 -; CHECK-NEXT: v_mul_lo_u32 v9, v5, v3 -; CHECK-NEXT: v_mul_lo_u32 v10, v7, v3 -; CHECK-NEXT: v_mul_hi_u32 v11, v5, v3 -; CHECK-NEXT: v_mul_hi_u32 v3, v7, v3 -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v9 -; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v10, v2 -; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v8 -; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v9, v4 +; CHECK-NEXT: v_cvt_u32_f32_e32 v8, v2 +; CHECK-NEXT: v_cvt_u32_f32_e32 v10, v4 +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v8, 0 +; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v10, v[3:4] +; CHECK-NEXT: v_mul_lo_u32 v3, v10, v2 +; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], -1, v8, v[4:5] +; CHECK-NEXT: v_mul_hi_u32 v4, v8, v2 +; CHECK-NEXT: v_mul_hi_u32 v2, v10, v2 +; CHECK-NEXT: v_mul_lo_u32 v5, v8, v6 +; CHECK-NEXT: v_mul_lo_u32 v7, v10, v6 +; CHECK-NEXT: v_mul_hi_u32 v11, v8, v6 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v5 +; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v7, v2 +; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v5, v3 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v11 -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v8, vcc, v10, v8 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v8, v4 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4 -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v2 -; CHECK-NEXT: v_addc_u32_e32 v7, vcc, v7, v3, vcc -; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v5, 0 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v7, v[3:4] -; CHECK-NEXT: v_ashrrev_i32_e32 v6, 31, v1 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v6 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], -1, v5, v[3:4] -; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc -; CHECK-NEXT: v_xor_b32_e32 v4, v0, v6 -; CHECK-NEXT: v_mul_lo_u32 v0, v7, v2 -; CHECK-NEXT: v_mul_lo_u32 v8, v5, v3 -; CHECK-NEXT: v_xor_b32_e32 v9, v1, v6 -; CHECK-NEXT: v_mul_hi_u32 
v1, v5, v2 -; CHECK-NEXT: v_mul_hi_u32 v2, v7, v2 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v8 -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v7, v4 +; CHECK-NEXT: v_mul_hi_u32 v5, v10, v6 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v3 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v5, v3 +; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v2 +; CHECK-NEXT: v_addc_u32_e32 v10, vcc, v10, v3, vcc +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v8, 0 +; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v10, v[3:4] +; CHECK-NEXT: v_ashrrev_i32_e32 v9, 31, v1 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v9 +; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], -1, v8, v[4:5] +; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc +; CHECK-NEXT: v_xor_b32_e32 v4, v0, v9 +; CHECK-NEXT: v_mul_lo_u32 v0, v10, v2 +; CHECK-NEXT: v_mul_lo_u32 v3, v8, v6 +; CHECK-NEXT: v_xor_b32_e32 v5, v1, v9 +; CHECK-NEXT: v_mul_hi_u32 v1, v8, v2 +; CHECK-NEXT: v_mul_hi_u32 v2, v10, v2 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v1, v7, v3 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v8, v0 -; CHECK-NEXT: v_mul_hi_u32 v8, v5, v3 +; CHECK-NEXT: v_mul_lo_u32 v1, v10, v6 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v3, v0 +; CHECK-NEXT: v_mul_hi_u32 v3, v8, v6 ; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v8 -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v8 -; CHECK-NEXT: v_mul_hi_u32 v3, v7, v3 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; CHECK-NEXT: v_mul_hi_u32 v3, v10, v6 ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v1, vcc, v2, v1 ; CHECK-NEXT: v_add_i32_e32 v1, vcc, v3, v1 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v5, v0 -; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v7, v1, vcc -; CHECK-NEXT: v_mul_lo_u32 v2, v9, v0 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v8, v0 +; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v10, v1, vcc +; CHECK-NEXT: v_mul_lo_u32 v2, v5, v0 ; CHECK-NEXT: v_mul_lo_u32 v3, v4, v1 ; CHECK-NEXT: v_mul_hi_u32 v7, v4, v0 -; CHECK-NEXT: v_mul_hi_u32 v0, v9, v0 -; CHECK-NEXT: v_mov_b32_e32 v5, 0x12d8fb +; CHECK-NEXT: v_mul_hi_u32 v0, v5, v0 +; CHECK-NEXT: v_mov_b32_e32 v6, 0x12d8fb ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v7 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v7, v9, v1 +; CHECK-NEXT: v_mul_lo_u32 v7, v5, v1 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; CHECK-NEXT: v_mul_hi_u32 v3, v4, v1 ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v7, v0 @@ -1133,40 +1133,40 @@ define i64 @v_sdiv_i64_oddk_denom(i64 %num) { ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v7, v3 ; CHECK-NEXT: v_add_i32_e32 v7, vcc, v0, v2 -; CHECK-NEXT: v_mul_hi_u32 v8, v9, v1 -; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v7, 0 +; CHECK-NEXT: v_mul_hi_u32 v8, v5, v1 +; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v7, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v8, v2 -; CHECK-NEXT: 
v_mad_u64_u32 v[1:2], s[4:5], v5, v3, v[1:2] +; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v2 +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v8, v[1:2] ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v4, v0 -; CHECK-NEXT: v_subb_u32_e64 v2, s[4:5], v9, v1, vcc -; CHECK-NEXT: v_sub_i32_e64 v1, s[4:5], v9, v1 -; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v5 -; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v5 -; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[4:5] -; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v2 -; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; CHECK-NEXT: v_cndmask_b32_e64 v2, -1, v4, s[4:5] -; CHECK-NEXT: v_add_i32_e32 v4, vcc, 1, v7 -; CHECK-NEXT: v_addc_u32_e32 v8, vcc, 0, v3, vcc -; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v5 +; CHECK-NEXT: v_subb_u32_e64 v1, s[4:5], v5, v2, vcc +; CHECK-NEXT: v_sub_i32_e64 v2, s[4:5], v5, v2 +; CHECK-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v2, vcc +; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v6 +; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v6 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[4:5] +; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1 +; CHECK-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v2, vcc +; CHECK-NEXT: v_cndmask_b32_e64 v1, -1, v3, s[4:5] +; CHECK-NEXT: v_add_i32_e32 v3, vcc, 1, v7 +; CHECK-NEXT: v_addc_u32_e32 v4, vcc, 0, v8, vcc +; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; CHECK-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc -; CHECK-NEXT: v_add_i32_e32 v1, vcc, 1, v4 -; CHECK-NEXT: v_addc_u32_e32 v5, vcc, 0, v8, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, 1, v3 +; CHECK-NEXT: v_addc_u32_e32 v5, vcc, 0, v4, vcc ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; CHECK-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc -; CHECK-NEXT: v_cndmask_b32_e32 v1, v8, v5, vcc -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; CHECK-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 ; CHECK-NEXT: v_cndmask_b32_e32 v0, v7, v0, vcc -; CHECK-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; CHECK-NEXT: v_xor_b32_e32 v0, v0, v6 -; CHECK-NEXT: v_xor_b32_e32 v1, v1, v6 -; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v6 -; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v6, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v1, v8, v2, vcc +; CHECK-NEXT: v_xor_b32_e32 v0, v0, v9 +; CHECK-NEXT: v_xor_b32_e32 v1, v1, v9 +; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v9 +; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc ; CHECK-NEXT: s_setpc_b64 s[30:31] %result = sdiv i64 %num, 1235195 ret i64 %result @@ -1215,46 +1215,46 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) { ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v14, v7 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v13, v7 -; GISEL-NEXT: v_add_i32_e32 v16, vcc, v8, v4 -; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v16, 0 +; GISEL-NEXT: v_add_i32_e32 v18, vcc, v8, v4 +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v18, 0 ; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v9, v7, vcc ; GISEL-NEXT: v_mov_b32_e32 v4, v14 ; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v7, v[4:5] ; GISEL-NEXT: v_mul_lo_u32 v4, v7, v13 -; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], s6, v16, v[14:15] +; GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], s6, v18, v[14:15] ; GISEL-NEXT: s_mov_b32 s6, 1 ; GISEL-NEXT: s_cmp_lg_u32 s6, 0 -; GISEL-NEXT: v_mul_lo_u32 v15, v16, v14 +; GISEL-NEXT: v_mul_lo_u32 v14, v18, v16 ; GISEL-NEXT: 
s_subb_u32 s6, 0, 0 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v15 -; GISEL-NEXT: v_mul_hi_u32 v15, v16, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v14 +; GISEL-NEXT: v_mul_hi_u32 v14, v18, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; GISEL-NEXT: v_mul_hi_u32 v13, v7, v13 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v15 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v15, v7, v14 -; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v4 -; GISEL-NEXT: v_mul_hi_u32 v4, v16, v14 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v15, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v14, v7, v16 +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v4 +; GISEL-NEXT: v_mul_hi_u32 v4, v18, v16 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v4 +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v4 ; GISEL-NEXT: v_ashrrev_i32_e32 v4, 31, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4 ; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc -; GISEL-NEXT: v_xor_b32_e32 v18, v0, v4 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v13, v17 -; GISEL-NEXT: v_mul_hi_u32 v13, v7, v14 +; GISEL-NEXT: v_xor_b32_e32 v17, v0, v4 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v13, v15 +; GISEL-NEXT: v_mul_hi_u32 v13, v7, v16 ; GISEL-NEXT: v_xor_b32_e32 v19, v1, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v15, v1 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v14, v1 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v13, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v16, v0 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v18, v0 ; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v7, v1, vcc ; GISEL-NEXT: v_mul_lo_u32 v13, v19, v0 -; GISEL-NEXT: v_mul_lo_u32 v14, v18, v1 -; GISEL-NEXT: v_mul_hi_u32 v15, v18, v0 +; GISEL-NEXT: v_mul_lo_u32 v14, v17, v1 +; GISEL-NEXT: v_mul_hi_u32 v15, v17, v0 ; GISEL-NEXT: v_mul_hi_u32 v0, v19, v0 ; GISEL-NEXT: v_mov_b32_e32 v7, 0x12d8fb ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 @@ -1263,46 +1263,46 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) { ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; GISEL-NEXT: v_mul_lo_u32 v15, v19, v1 ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; GISEL-NEXT: v_mul_hi_u32 v14, v18, v1 +; GISEL-NEXT: v_mul_hi_u32 v14, v17, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v0, v13 -; GISEL-NEXT: v_mul_hi_u32 v16, v19, v1 -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v7, v15, 0 +; GISEL-NEXT: v_add_i32_e32 v18, vcc, v0, v13 +; GISEL-NEXT: v_mul_hi_u32 v15, v19, v1 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v7, v18, 0 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v13 -; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v7, v16, v[1:2] -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v18, v0 -; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], 0, v15, v[13:14] -; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], v19, v13, vcc -; GISEL-NEXT: v_sub_i32_e64 v13, s[4:5], v19, v13 +; GISEL-NEXT: v_add_i32_e32 v20, vcc, v15, v13 +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v7, v20, v[1:2] +; 
GISEL-NEXT: v_sub_i32_e32 v0, vcc, v17, v0 +; GISEL-NEXT: v_mad_u64_u32 v[15:16], s[4:5], 0, v18, v[13:14] +; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], v19, v15, vcc +; GISEL-NEXT: v_sub_i32_e64 v13, s[4:5], v19, v15 ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1 ; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v13, vcc ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v7 ; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, 1, v15 -; GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v16, vcc +; GISEL-NEXT: v_add_i32_e32 v16, vcc, 1, v18 +; GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v20, vcc ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GISEL-NEXT: v_cndmask_b32_e32 v18, -1, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v19, -1, v0, vcc ; GISEL-NEXT: v_mov_b32_e32 v0, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v14, -1, v14, s[4:5] -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v9, v[0:1] -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v8, v[0:1] -; GISEL-NEXT: v_add_i32_e32 v1, vcc, 1, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v15, -1, v14, s[4:5] +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v9, v[0:1] +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v8, v[13:14] +; GISEL-NEXT: v_add_i32_e32 v1, vcc, 1, v16 +; GISEL-NEXT: v_mul_lo_u32 v13, v8, v0 ; GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v17, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; GISEL-NEXT: v_mul_lo_u32 v18, v8, v0 -; GISEL-NEXT: v_cndmask_b32_e32 v13, v13, v1, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 +; GISEL-NEXT: v_cndmask_b32_e32 v14, v16, v1, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v5, v17, v5, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v18 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v13 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc @@ -1319,74 +1319,74 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) { ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v1 -; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v0, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, 0 -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; GISEL-NEXT: v_cndmask_b32_e32 v11, v16, v5, vcc -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v6, v9, v[1:2] -; GISEL-NEXT: v_cndmask_b32_e32 v10, v15, v13, vcc -; GISEL-NEXT: v_xor_b32_e32 v1, v10, v4 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s6, v8, v[5:6] -; GISEL-NEXT: v_ashrrev_i32_e32 v10, 31, v3 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v10 -; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v10, vcc -; GISEL-NEXT: v_xor_b32_e32 v12, v2, v10 -; GISEL-NEXT: v_mul_lo_u32 v2, v9, v0 -; GISEL-NEXT: v_mul_lo_u32 v6, v8, v5 -; GISEL-NEXT: v_xor_b32_e32 v13, v3, v10 -; GISEL-NEXT: v_mul_hi_u32 v3, v8, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v9, v0 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v8, v1 +; GISEL-NEXT: v_addc_u32_e32 v11, vcc, v9, v0, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v10, 0 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v15 +; GISEL-NEXT: v_cndmask_b32_e32 v13, v20, v5, vcc +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v6, v11, v[1:2] +; GISEL-NEXT: v_cndmask_b32_e32 v12, v18, v14, vcc +; GISEL-NEXT: v_xor_b32_e32 v1, v12, v4 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s6, v10, v[8:9] +; GISEL-NEXT: 
v_ashrrev_i32_e32 v12, 31, v3 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v12 +; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v12, vcc +; GISEL-NEXT: v_xor_b32_e32 v8, v2, v12 +; GISEL-NEXT: v_mul_lo_u32 v2, v11, v0 +; GISEL-NEXT: v_mul_lo_u32 v6, v10, v5 +; GISEL-NEXT: v_xor_b32_e32 v9, v3, v12 +; GISEL-NEXT: v_mul_hi_u32 v3, v10, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v11, v0 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v9, v5 +; GISEL-NEXT: v_mul_lo_u32 v3, v11, v5 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v6, v2 -; GISEL-NEXT: v_mul_hi_u32 v6, v8, v5 +; GISEL-NEXT: v_mul_hi_u32 v6, v10, v5 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; GISEL-NEXT: v_mul_hi_u32 v5, v9, v5 +; GISEL-NEXT: v_mul_hi_u32 v5, v11, v5 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0 -; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v9, v2, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v13, v0 -; GISEL-NEXT: v_mul_lo_u32 v5, v12, v2 -; GISEL-NEXT: v_mul_hi_u32 v6, v12, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v13, v0 -; GISEL-NEXT: v_xor_b32_e32 v8, v11, v4 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0 +; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v11, v2, vcc +; GISEL-NEXT: v_mul_lo_u32 v3, v9, v0 +; GISEL-NEXT: v_mul_lo_u32 v5, v8, v2 +; GISEL-NEXT: v_mul_hi_u32 v6, v8, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v9, v0 +; GISEL-NEXT: v_xor_b32_e32 v10, v13, v4 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v6, v13, v2 +; GISEL-NEXT: v_mul_lo_u32 v6, v9, v2 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v5, v3 -; GISEL-NEXT: v_mul_hi_u32 v5, v12, v2 +; GISEL-NEXT: v_mul_hi_u32 v5, v8, v2 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v6, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v0, v3 -; GISEL-NEXT: v_mul_hi_u32 v6, v13, v2 -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v7, v9, 0 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v0, v3 +; GISEL-NEXT: v_mul_hi_u32 v6, v9, v2 +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v7, v11, 0 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v6, v0 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v6, v0 ; GISEL-NEXT: v_mov_b32_e32 v0, v3 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v7, v11, v[0:1] +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v7, v13, v[0:1] ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v1, v4 -; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v8, v4, vcc -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], 0, v9, v[5:6] -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v12, v2 -; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v13, v3, vcc -; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v13, v3 +; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v10, v4, vcc +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], 0, v11, v[5:6] +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v8, v2 +; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], 
v9, v3, vcc +; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v9, v3 ; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v7 ; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v7 @@ -1394,8 +1394,8 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) { ; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v4 ; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v4, -1, v5, s[4:5] -; GISEL-NEXT: v_add_i32_e32 v5, vcc, 1, v9 -; GISEL-NEXT: v_addc_u32_e32 v6, vcc, 0, v11, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, 1, v11 +; GISEL-NEXT: v_addc_u32_e32 v6, vcc, 0, v13, vcc ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v2, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 @@ -1406,12 +1406,12 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) { ; GISEL-NEXT: v_cndmask_b32_e32 v2, v5, v3, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v9, v2, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc -; GISEL-NEXT: v_xor_b32_e32 v2, v2, v10 -; GISEL-NEXT: v_xor_b32_e32 v3, v3, v10 -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v10 -; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v10, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v2, v11, v2, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v3, v13, v3, vcc +; GISEL-NEXT: v_xor_b32_e32 v2, v2, v12 +; GISEL-NEXT: v_xor_b32_e32 v3, v3, v12 +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v12 +; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v12, vcc ; GISEL-NEXT: s_setpc_b64 s[30:31] ; ; CGP-LABEL: v_sdiv_v2i64_oddk_denom: @@ -1430,112 +1430,112 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) { ; CGP-NEXT: v_cvt_u32_f32_e32 v8, v8 ; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v7, 0 ; CGP-NEXT: v_mov_b32_e32 v9, v5 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v8, v[9:10] -; CGP-NEXT: v_mul_hi_u32 v11, v7, v4 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v8, v[9:10] ; CGP-NEXT: v_mul_hi_u32 v12, v8, v4 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], -1, v7, v[9:10] +; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], -1, v7, v[10:11] ; CGP-NEXT: v_mul_lo_u32 v10, v8, v4 -; CGP-NEXT: v_mul_lo_u32 v4, v7, v9 -; CGP-NEXT: v_mul_lo_u32 v13, v8, v9 -; CGP-NEXT: v_mul_hi_u32 v14, v7, v9 -; CGP-NEXT: v_mul_hi_u32 v9, v8, v9 +; CGP-NEXT: v_mul_hi_u32 v11, v7, v4 +; CGP-NEXT: v_mul_lo_u32 v4, v7, v13 +; CGP-NEXT: v_mul_lo_u32 v9, v8, v13 +; CGP-NEXT: v_mul_hi_u32 v14, v7, v13 +; CGP-NEXT: v_mul_hi_u32 v13, v8, v13 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v4 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v11 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v15, v4 -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v12 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v12 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v13 -; CGP-NEXT: v_add_i32_e32 v16, vcc, v7, v4 -; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v16, 0 -; CGP-NEXT: v_addc_u32_e32 v17, vcc, v8, v9, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, v9, v4 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v9, vcc, v14, v9 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v13, v9 
+; CGP-NEXT: v_add_i32_e32 v18, vcc, v7, v4 +; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v18, 0 +; CGP-NEXT: v_addc_u32_e32 v19, vcc, v8, v9, vcc ; CGP-NEXT: v_mov_b32_e32 v4, v14 -; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v17, v[4:5] -; CGP-NEXT: v_mul_lo_u32 v4, v17, v13 -; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], -1, v16, v[14:15] -; CGP-NEXT: v_mul_lo_u32 v9, v16, v14 +; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v19, v[4:5] +; CGP-NEXT: v_mul_lo_u32 v4, v19, v13 +; CGP-NEXT: v_mad_u64_u32 v[16:17], s[4:5], -1, v18, v[14:15] +; CGP-NEXT: v_mul_lo_u32 v9, v18, v16 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 -; CGP-NEXT: v_mul_hi_u32 v9, v16, v13 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; CGP-NEXT: v_mul_hi_u32 v9, v18, v13 +; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_mul_hi_u32 v9, v17, v13 -; CGP-NEXT: v_mul_lo_u32 v13, v17, v14 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v15, v4 -; CGP-NEXT: v_mul_hi_u32 v15, v16, v14 +; CGP-NEXT: v_mul_hi_u32 v9, v19, v13 +; CGP-NEXT: v_mul_lo_u32 v13, v19, v16 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v14, v4 +; CGP-NEXT: v_mul_hi_u32 v14, v18, v16 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v13, v9 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v15, vcc, v9, v15 +; CGP-NEXT: v_add_i32_e32 v14, vcc, v9, v14 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v9 ; CGP-NEXT: v_ashrrev_i32_e32 v9, 31, v1 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v9 ; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc -; CGP-NEXT: v_xor_b32_e32 v18, v0, v9 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v15, v4 -; CGP-NEXT: v_mul_hi_u32 v4, v17, v14 -; CGP-NEXT: v_xor_b32_e32 v19, v1, v9 +; CGP-NEXT: v_xor_b32_e32 v15, v0, v9 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v14, v4 +; CGP-NEXT: v_mul_hi_u32 v4, v19, v16 +; CGP-NEXT: v_xor_b32_e32 v17, v1, v9 ; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v1, vcc, v13, v1 ; CGP-NEXT: v_add_i32_e32 v1, vcc, v4, v1 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v16, v0 -; CGP-NEXT: v_addc_u32_e32 v1, vcc, v17, v1, vcc -; CGP-NEXT: v_mul_lo_u32 v13, v19, v0 -; CGP-NEXT: v_mul_lo_u32 v14, v18, v1 -; CGP-NEXT: v_mul_hi_u32 v15, v18, v0 -; CGP-NEXT: v_mul_hi_u32 v0, v19, v0 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v18, v0 +; CGP-NEXT: v_addc_u32_e32 v1, vcc, v19, v1, vcc +; CGP-NEXT: v_mul_lo_u32 v13, v17, v0 +; CGP-NEXT: v_mul_lo_u32 v14, v15, v1 +; CGP-NEXT: v_mul_hi_u32 v16, v15, v0 +; CGP-NEXT: v_mul_hi_u32 v0, v17, v0 ; CGP-NEXT: v_mov_b32_e32 v4, 0x12d8fb ; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v15 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v16 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v15, v19, v1 +; CGP-NEXT: v_mul_lo_u32 v16, v17, v1 ; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; CGP-NEXT: v_mul_hi_u32 v14, v18, v1 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v15, v0 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; CGP-NEXT: v_mul_hi_u32 v14, v15, v1 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v16, v0 +; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; CGP-NEXT: v_add_i32_e32 v15, vcc, v0, v13 -; CGP-NEXT: v_mul_hi_u32 v16, v19, v1 -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v15, 0 +; CGP-NEXT: v_add_i32_e32 v14, vcc, v16, v14 +; CGP-NEXT: v_add_i32_e32 v16, 
vcc, v0, v13 +; CGP-NEXT: v_mul_hi_u32 v18, v17, v1 +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v16, 0 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; CGP-NEXT: v_add_i32_e32 v16, vcc, v16, v13 -; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v16, v[1:2] -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v18, v0 -; CGP-NEXT: v_subb_u32_e64 v1, s[4:5], v19, v13, vcc -; CGP-NEXT: v_sub_i32_e64 v13, s[4:5], v19, v13 +; CGP-NEXT: v_add_i32_e32 v18, vcc, v18, v13 +; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v18, v[1:2] +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v15, v0 +; CGP-NEXT: v_subb_u32_e64 v1, s[4:5], v17, v13, vcc +; CGP-NEXT: v_sub_i32_e64 v13, s[4:5], v17, v13 ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v4 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1 ; CGP-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v13, vcc ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 -; CGP-NEXT: v_subbrev_u32_e32 v13, vcc, 0, v1, vcc -; CGP-NEXT: v_add_i32_e32 v17, vcc, 1, v15 -; CGP-NEXT: v_addc_u32_e32 v18, vcc, 0, v16, vcc +; CGP-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc +; CGP-NEXT: v_add_i32_e32 v17, vcc, 1, v16 +; CGP-NEXT: v_addc_u32_e32 v19, vcc, 0, v18, vcc ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4 ; CGP-NEXT: v_mov_b32_e32 v0, v5 -; CGP-NEXT: v_cndmask_b32_e64 v14, -1, v14, s[4:5] -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, v[0:1] -; CGP-NEXT: v_cndmask_b32_e64 v19, 0, -1, vcc -; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v13 -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], -1, v7, v[0:1] -; CGP-NEXT: v_cndmask_b32_e32 v5, -1, v19, vcc +; CGP-NEXT: v_cndmask_b32_e64 v15, -1, v14, s[4:5] +; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v8, v[0:1] +; CGP-NEXT: v_cndmask_b32_e64 v20, 0, -1, vcc +; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], -1, v7, v[13:14] +; CGP-NEXT: v_cndmask_b32_e32 v5, -1, v20, vcc ; CGP-NEXT: v_add_i32_e32 v1, vcc, 1, v17 -; CGP-NEXT: v_addc_u32_e32 v13, vcc, 0, v18, vcc +; CGP-NEXT: v_addc_u32_e32 v13, vcc, 0, v19, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 ; CGP-NEXT: v_mul_lo_u32 v5, v7, v0 -; CGP-NEXT: v_cndmask_b32_e32 v17, v17, v1, vcc -; CGP-NEXT: v_cndmask_b32_e32 v13, v18, v13, vcc +; CGP-NEXT: v_cndmask_b32_e32 v14, v17, v1, vcc +; CGP-NEXT: v_cndmask_b32_e32 v13, v19, v13, vcc ; CGP-NEXT: v_add_i32_e32 v1, vcc, v10, v5 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v11 @@ -1553,72 +1553,72 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) { ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v5 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v1 -; CGP-NEXT: v_addc_u32_e32 v8, vcc, v8, v0, vcc -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v7, 0 -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; CGP-NEXT: v_cndmask_b32_e32 v5, v15, v17, vcc -; CGP-NEXT: v_xor_b32_e32 v11, v5, v9 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v6, v8, v[1:2] -; CGP-NEXT: v_cndmask_b32_e32 v10, v16, v13, vcc -; CGP-NEXT: v_xor_b32_e32 v1, v10, v9 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], -1, v7, v[5:6] -; CGP-NEXT: v_ashrrev_i32_e32 v10, 31, v3 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v10 -; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v10, vcc -; CGP-NEXT: v_xor_b32_e32 v12, v2, v10 -; CGP-NEXT: v_mul_lo_u32 v2, v8, v0 -; CGP-NEXT: v_mul_lo_u32 v6, v7, v5 -; CGP-NEXT: v_xor_b32_e32 v13, v3, v10 -; CGP-NEXT: v_mul_hi_u32 v3, v7, v0 -; CGP-NEXT: v_mul_hi_u32 v0, v8, v0 +; CGP-NEXT: v_add_i32_e32 v10, vcc, 
v7, v1 +; CGP-NEXT: v_addc_u32_e32 v11, vcc, v8, v0, vcc +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v10, 0 +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v15 +; CGP-NEXT: v_cndmask_b32_e32 v5, v16, v14, vcc +; CGP-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v6, v11, v[1:2] +; CGP-NEXT: v_cndmask_b32_e32 v12, v18, v13, vcc +; CGP-NEXT: v_xor_b32_e32 v13, v5, v9 +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], -1, v10, v[7:8] +; CGP-NEXT: v_xor_b32_e32 v1, v12, v9 +; CGP-NEXT: v_ashrrev_i32_e32 v12, 31, v3 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v12 +; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v12, vcc +; CGP-NEXT: v_xor_b32_e32 v7, v2, v12 +; CGP-NEXT: v_mul_lo_u32 v2, v11, v0 +; CGP-NEXT: v_mul_lo_u32 v6, v10, v5 +; CGP-NEXT: v_xor_b32_e32 v8, v3, v12 +; CGP-NEXT: v_mul_hi_u32 v3, v10, v0 +; CGP-NEXT: v_mul_hi_u32 v0, v11, v0 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v6 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v3, v8, v5 +; CGP-NEXT: v_mul_lo_u32 v3, v11, v5 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v6, v2 -; CGP-NEXT: v_mul_hi_u32 v6, v7, v5 +; CGP-NEXT: v_mul_hi_u32 v6, v10, v5 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v6 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; CGP-NEXT: v_mul_hi_u32 v5, v8, v5 +; CGP-NEXT: v_mul_hi_u32 v5, v11, v5 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v7, v0 -; CGP-NEXT: v_addc_u32_e32 v2, vcc, v8, v2, vcc -; CGP-NEXT: v_mul_lo_u32 v5, v13, v3 -; CGP-NEXT: v_mul_lo_u32 v6, v12, v2 -; CGP-NEXT: v_mul_hi_u32 v7, v12, v3 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v11, v9 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v10, v0 +; CGP-NEXT: v_addc_u32_e32 v2, vcc, v11, v2, vcc +; CGP-NEXT: v_mul_lo_u32 v5, v8, v3 +; CGP-NEXT: v_mul_lo_u32 v6, v7, v2 +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v13, v9 ; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc +; CGP-NEXT: v_mul_hi_u32 v9, v7, v3 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v9 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v7, v13, v2 -; CGP-NEXT: v_mul_hi_u32 v3, v13, v3 +; CGP-NEXT: v_mul_lo_u32 v9, v8, v2 +; CGP-NEXT: v_mul_hi_u32 v3, v8, v3 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CGP-NEXT: v_mul_hi_u32 v6, v12, v2 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v7, v3 -; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CGP-NEXT: v_mul_hi_u32 v6, v7, v2 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v9, v3 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v6 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v3, v5 -; CGP-NEXT: v_mul_hi_u32 v8, v13, v2 -; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v7, 0 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v9, v6 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v3, v5 +; CGP-NEXT: v_mul_hi_u32 v10, v8, v2 +; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v9, 0 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v5 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v8, v[3:4] -; CGP-NEXT: v_sub_i32_e32 v2, vcc, v12, v2 -; CGP-NEXT: v_subb_u32_e64 v3, s[4:5], 
v13, v5, vcc -; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v13, v5 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v5 +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v10, v[3:4] +; CGP-NEXT: v_sub_i32_e32 v2, vcc, v7, v2 +; CGP-NEXT: v_subb_u32_e64 v3, s[4:5], v8, v5, vcc +; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v8, v5 ; CGP-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v4 ; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v4 @@ -1626,24 +1626,24 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) { ; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v3 ; CGP-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc ; CGP-NEXT: v_cndmask_b32_e64 v3, -1, v6, s[4:5] -; CGP-NEXT: v_add_i32_e32 v6, vcc, 1, v7 -; CGP-NEXT: v_addc_u32_e32 v9, vcc, 0, v8, vcc +; CGP-NEXT: v_add_i32_e32 v6, vcc, 1, v9 +; CGP-NEXT: v_addc_u32_e32 v7, vcc, 0, v10, vcc ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v4 ; CGP-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc ; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 ; CGP-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v6 -; CGP-NEXT: v_addc_u32_e32 v5, vcc, 0, v9, vcc +; CGP-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; CGP-NEXT: v_cndmask_b32_e32 v2, v6, v4, vcc -; CGP-NEXT: v_cndmask_b32_e32 v4, v9, v5, vcc +; CGP-NEXT: v_cndmask_b32_e32 v4, v7, v5, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; CGP-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc -; CGP-NEXT: v_cndmask_b32_e32 v3, v8, v4, vcc -; CGP-NEXT: v_xor_b32_e32 v2, v2, v10 -; CGP-NEXT: v_xor_b32_e32 v3, v3, v10 -; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v10 -; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v10, vcc +; CGP-NEXT: v_cndmask_b32_e32 v2, v9, v2, vcc +; CGP-NEXT: v_cndmask_b32_e32 v3, v10, v4, vcc +; CGP-NEXT: v_xor_b32_e32 v2, v2, v12 +; CGP-NEXT: v_xor_b32_e32 v3, v3, v12 +; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v12 +; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v12, vcc ; CGP-NEXT: s_setpc_b64 s[30:31] %result = sdiv <2 x i64> %num, <i64 1235195, i64 1235195> ret <2 x i64> %result @@ -1679,126 +1679,126 @@ define i64 @v_sdiv_i64_pow2_shl_denom(i64 %x, i64 %y) { ; CHECK-NEXT: v_xor_b32_e32 v1, v5, v0 ; CHECK-NEXT: v_cvt_f32_u32_e32 v5, v2 ; CHECK-NEXT: v_cvt_f32_u32_e32 v6, v1 -; CHECK-NEXT: v_sub_i32_e32 v9, vcc, 0, v2 -; CHECK-NEXT: v_subb_u32_e32 v10, vcc, 0, v1, vcc +; CHECK-NEXT: v_sub_i32_e32 v12, vcc, 0, v2 +; CHECK-NEXT: v_subb_u32_e32 v13, vcc, 0, v1, vcc ; CHECK-NEXT: v_mac_f32_e32 v5, 0x4f800000, v6 ; CHECK-NEXT: v_rcp_iflag_f32_e32 v5, v5 ; CHECK-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5 ; CHECK-NEXT: v_mul_f32_e32 v6, 0x2f800000, v5 ; CHECK-NEXT: v_trunc_f32_e32 v7, v6 ; CHECK-NEXT: v_mac_f32_e32 v5, 0xcf800000, v7 -; CHECK-NEXT: v_cvt_u32_f32_e32 v8, v5 -; CHECK-NEXT: v_cvt_u32_f32_e32 v11, v7 -; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v9, v8, 0 -; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v11, v[6:7] -; CHECK-NEXT: v_mul_hi_u32 v12, v8, v5 -; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v8, v[6:7] -; CHECK-NEXT: v_mul_lo_u32 v7, v11, v5 -; CHECK-NEXT: v_mul_hi_u32 v5, v11, v5 -; CHECK-NEXT: v_mul_lo_u32 v13, v8, v6 -; CHECK-NEXT: v_mul_lo_u32 v14, v11, v6 -; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v13 -; CHECK-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v12 -; CHECK-NEXT: v_mul_hi_u32 v12, v8, v6 -; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v7, vcc, v13, v7 -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v14, v5 -; CHECK-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v12 -; 
CHECK-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v12, vcc, v13, v12 -; CHECK-NEXT: v_mul_hi_u32 v6, v11, v6 +; CHECK-NEXT: v_cvt_u32_f32_e32 v11, v5 +; CHECK-NEXT: v_cvt_u32_f32_e32 v14, v7 +; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v12, v11, 0 +; CHECK-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v12, v14, v[6:7] +; CHECK-NEXT: v_mul_lo_u32 v6, v14, v5 +; CHECK-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v11, v[7:8] +; CHECK-NEXT: v_mul_hi_u32 v7, v11, v5 +; CHECK-NEXT: v_mul_hi_u32 v5, v14, v5 +; CHECK-NEXT: v_mul_lo_u32 v8, v11, v9 +; CHECK-NEXT: v_mul_lo_u32 v10, v14, v9 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v8 +; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v7 +; CHECK-NEXT: v_mul_hi_u32 v7, v11, v9 +; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v8, v6 +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v10, v5 +; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v7 ; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v7, vcc, v12, v7 -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v7 -; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v5 -; CHECK-NEXT: v_addc_u32_e32 v11, vcc, v11, v6, vcc -; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v9, v8, 0 -; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v11, v[6:7] -; CHECK-NEXT: v_ashrrev_i32_e32 v9, 31, v4 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v9 -; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v8, v[6:7] -; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v4, v9, vcc -; CHECK-NEXT: v_xor_b32_e32 v7, v3, v9 -; CHECK-NEXT: v_mul_lo_u32 v3, v11, v5 -; CHECK-NEXT: v_mul_lo_u32 v10, v8, v6 -; CHECK-NEXT: v_xor_b32_e32 v12, v4, v9 -; CHECK-NEXT: v_mul_hi_u32 v4, v8, v5 -; CHECK-NEXT: v_mul_hi_u32 v5, v11, v5 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v10 -; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v8, v7 +; CHECK-NEXT: v_mul_hi_u32 v8, v14, v9 +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6 +; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v8, v6 +; CHECK-NEXT: v_add_i32_e32 v11, vcc, v11, v5 +; CHECK-NEXT: v_addc_u32_e32 v14, vcc, v14, v6, vcc +; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v12, v11, 0 +; CHECK-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v12, v14, v[6:7] +; CHECK-NEXT: v_ashrrev_i32_e32 v12, 31, v4 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v12 +; CHECK-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v11, v[7:8] +; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v4, v12, vcc +; CHECK-NEXT: v_xor_b32_e32 v10, v3, v12 +; CHECK-NEXT: v_mul_lo_u32 v3, v14, v5 +; CHECK-NEXT: v_mul_lo_u32 v6, v11, v9 +; CHECK-NEXT: v_xor_b32_e32 v13, v4, v12 +; CHECK-NEXT: v_mul_hi_u32 v4, v11, v5 +; CHECK-NEXT: v_mul_hi_u32 v5, v14, v5 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v6 +; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v4, v11, v6 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v10, v3 -; CHECK-NEXT: v_mul_hi_u32 v10, v8, v6 +; CHECK-NEXT: v_mul_lo_u32 v4, v14, v9 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v6, v3 +; CHECK-NEXT: v_mul_hi_u32 v6, v11, v9 ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v10 -; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v10 -; CHECK-NEXT: v_mul_hi_u32 v6, v11, v6 +; CHECK-NEXT: 
v_add_i32_e32 v4, vcc, v4, v6 +; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6 +; CHECK-NEXT: v_mul_hi_u32 v6, v14, v9 ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v3 ; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4 ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v6, v4 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v8, v3 -; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v11, v4, vcc -; CHECK-NEXT: v_mul_lo_u32 v5, v12, v3 -; CHECK-NEXT: v_mul_lo_u32 v6, v7, v4 -; CHECK-NEXT: v_mul_hi_u32 v8, v7, v3 -; CHECK-NEXT: v_mul_hi_u32 v3, v12, v3 -; CHECK-NEXT: v_mul_hi_u32 v10, v12, v4 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v11, v3 +; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v14, v4, vcc +; CHECK-NEXT: v_mul_lo_u32 v5, v13, v3 +; CHECK-NEXT: v_mul_lo_u32 v6, v10, v4 +; CHECK-NEXT: v_mul_hi_u32 v7, v10, v3 +; CHECK-NEXT: v_mul_hi_u32 v3, v13, v3 ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v8 +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v7 ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v8, v12, v4 +; CHECK-NEXT: v_mul_lo_u32 v7, v13, v4 ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CHECK-NEXT: v_mul_hi_u32 v6, v7, v4 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v8, v3 -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CHECK-NEXT: v_mul_hi_u32 v6, v10, v4 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v7, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v8, v6 -; CHECK-NEXT: v_add_i32_e32 v8, vcc, v3, v5 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v2, v8, 0 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; CHECK-NEXT: v_add_i32_e32 v9, vcc, v3, v5 +; CHECK-NEXT: v_mul_hi_u32 v7, v13, v4 +; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v2, v9, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v10, v5 -; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v6, v[4:5] -; CHECK-NEXT: v_sub_i32_e32 v3, vcc, v7, v3 -; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, v8, v[4:5] -; CHECK-NEXT: v_subb_u32_e64 v5, s[4:5], v12, v4, vcc -; CHECK-NEXT: v_sub_i32_e64 v4, s[4:5], v12, v4 -; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v1 -; CHECK-NEXT: v_subb_u32_e32 v4, vcc, v4, v1, vcc -; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] +; CHECK-NEXT: v_add_i32_e32 v11, vcc, v7, v5 +; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v2, v11, v[4:5] +; CHECK-NEXT: v_sub_i32_e32 v3, vcc, v10, v3 +; CHECK-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v1, v9, v[5:6] +; CHECK-NEXT: v_subb_u32_e64 v4, s[4:5], v13, v7, vcc +; CHECK-NEXT: v_sub_i32_e64 v5, s[4:5], v13, v7 +; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v1 +; CHECK-NEXT: v_subb_u32_e32 v5, vcc, v5, v1, vcc +; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] ; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v2 ; CHECK-NEXT: v_sub_i32_e32 v3, vcc, v3, v2 -; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] -; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v5, v1 -; CHECK-NEXT: v_subbrev_u32_e32 v4, vcc, 0, v4, vcc -; CHECK-NEXT: v_cndmask_b32_e64 v5, v7, v10, s[4:5] -; CHECK-NEXT: v_add_i32_e32 v7, vcc, 1, v8 -; CHECK-NEXT: v_addc_u32_e32 v10, vcc, 0, v6, vcc -; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v4, v1 -; CHECK-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc +; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] +; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v1 +; CHECK-NEXT: 
v_subbrev_u32_e32 v5, vcc, 0, v5, vcc +; CHECK-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] +; CHECK-NEXT: v_add_i32_e32 v6, vcc, 1, v9 +; CHECK-NEXT: v_addc_u32_e32 v7, vcc, 0, v11, vcc +; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v5, v1 +; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc ; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v3, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc -; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; CHECK-NEXT: v_cndmask_b32_e32 v1, v11, v2, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, 1, v7 -; CHECK-NEXT: v_addc_u32_e32 v3, vcc, 0, v10, vcc +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 +; CHECK-NEXT: v_cndmask_b32_e32 v1, v8, v2, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, 1, v6 +; CHECK-NEXT: v_addc_u32_e32 v3, vcc, 0, v7, vcc ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; CHECK-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc -; CHECK-NEXT: v_cndmask_b32_e32 v2, v10, v3, vcc -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; CHECK-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc -; CHECK-NEXT: v_xor_b32_e32 v3, v9, v0 -; CHECK-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; CHECK-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc +; CHECK-NEXT: v_xor_b32_e32 v3, v12, v0 +; CHECK-NEXT: v_cndmask_b32_e32 v2, v11, v2, vcc ; CHECK-NEXT: v_xor_b32_e32 v0, v1, v3 ; CHECK-NEXT: v_xor_b32_e32 v1, v2, v3 ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v3 @@ -1850,8 +1850,8 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; GISEL-NEXT: v_xor_b32_e32 v5, v7, v4 ; GISEL-NEXT: v_cvt_f32_u32_e32 v7, v8 ; GISEL-NEXT: v_cvt_f32_u32_e32 v11, v5 -; GISEL-NEXT: v_sub_i32_e32 v15, vcc, 0, v8 -; GISEL-NEXT: v_subb_u32_e32 v16, vcc, 0, v5, vcc +; GISEL-NEXT: v_sub_i32_e32 v17, vcc, 0, v8 +; GISEL-NEXT: v_subb_u32_e32 v18, vcc, 0, v5, vcc ; GISEL-NEXT: v_mac_f32_e32 v7, 0x4f800000, v11 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v7, v7 ; GISEL-NEXT: v_ashrrev_i32_e32 v6, 31, v10 @@ -1859,182 +1859,183 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; GISEL-NEXT: v_mul_f32_e32 v11, 0x2f800000, v7 ; GISEL-NEXT: v_trunc_f32_e32 v13, v11 ; GISEL-NEXT: v_mac_f32_e32 v7, 0xcf800000, v13 -; GISEL-NEXT: v_cvt_u32_f32_e32 v14, v7 -; GISEL-NEXT: v_cvt_u32_f32_e32 v17, v13 -; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v14, 0 +; GISEL-NEXT: v_cvt_u32_f32_e32 v16, v7 +; GISEL-NEXT: v_cvt_u32_f32_e32 v19, v13 +; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v17, v16, 0 ; GISEL-NEXT: v_mov_b32_e32 v7, v12 -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v15, v17, v[7:8] -; GISEL-NEXT: v_mul_lo_u32 v7, v17, v11 -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v14, v[12:13] -; GISEL-NEXT: v_mul_lo_u32 v13, v14, v12 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v13 -; GISEL-NEXT: v_mul_hi_u32 v13, v14, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; GISEL-NEXT: v_mul_hi_u32 v11, v17, v11 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v13 +; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v17, v19, v[7:8] +; GISEL-NEXT: v_mul_lo_u32 v7, v19, v11 +; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v18, v16, v[12:13] +; GISEL-NEXT: v_mul_lo_u32 v12, v16, v14 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v12 +; GISEL-NEXT: v_mul_hi_u32 v12, v16, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_mul_hi_u32 v11, v19, v11 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v13, v17, v12 -; GISEL-NEXT: 
v_add_i32_e32 v7, vcc, v18, v7 -; GISEL-NEXT: v_mul_hi_u32 v18, v14, v12 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11 +; GISEL-NEXT: v_mul_lo_u32 v12, v19, v14 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v13, v7 +; GISEL-NEXT: v_mul_hi_u32 v13, v16, v14 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v18 -; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v18 -; GISEL-NEXT: v_mul_hi_u32 v12, v17, v12 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v13 +; GISEL-NEXT: v_mul_hi_u32 v13, v19, v14 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v11, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11 ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v7 -; GISEL-NEXT: v_addc_u32_e32 v17, vcc, v17, v11, vcc -; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v14, 0 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11 +; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v7 +; GISEL-NEXT: v_addc_u32_e32 v19, vcc, v19, v11, vcc +; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v17, v16, 0 ; GISEL-NEXT: v_mov_b32_e32 v7, v12 -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v15, v17, v[7:8] +; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v17, v19, v[7:8] ; GISEL-NEXT: v_ashrrev_i32_e32 v7, 31, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v7 -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v14, v[12:13] +; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v18, v16, v[12:13] ; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v7, vcc -; GISEL-NEXT: v_xor_b32_e32 v15, v0, v7 -; GISEL-NEXT: v_mul_lo_u32 v0, v17, v11 -; GISEL-NEXT: v_mul_lo_u32 v13, v14, v12 -; GISEL-NEXT: v_xor_b32_e32 v16, v1, v7 -; GISEL-NEXT: v_mul_hi_u32 v1, v14, v11 -; GISEL-NEXT: v_mul_hi_u32 v11, v17, v11 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_xor_b32_e32 v17, v0, v7 +; GISEL-NEXT: v_mul_lo_u32 v0, v19, v11 +; GISEL-NEXT: v_mul_lo_u32 v12, v16, v14 +; GISEL-NEXT: v_xor_b32_e32 v18, v1, v7 +; GISEL-NEXT: v_mul_hi_u32 v1, v16, v11 +; GISEL-NEXT: v_mul_hi_u32 v11, v19, v11 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v1, v17, v12 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v13, v0 -; GISEL-NEXT: v_mul_hi_u32 v13, v14, v12 +; GISEL-NEXT: v_mul_lo_u32 v1, v19, v14 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v12, v0 +; GISEL-NEXT: v_mul_hi_u32 v12, v16, v14 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 -; GISEL-NEXT: v_mul_hi_u32 v12, v17, v12 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 +; GISEL-NEXT: v_mul_hi_u32 v12, v19, v14 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v11, v1 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v12, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v14, v0 -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v17, v1, vcc -; GISEL-NEXT: v_mul_lo_u32 v11, v16, v0 -; GISEL-NEXT: v_mul_lo_u32 v12, v15, 
v1 -; GISEL-NEXT: v_mul_hi_u32 v13, v15, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v16, v0 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v16, v0 +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v19, v1, vcc +; GISEL-NEXT: v_mul_lo_u32 v11, v18, v0 +; GISEL-NEXT: v_mul_lo_u32 v12, v17, v1 +; GISEL-NEXT: v_mul_hi_u32 v13, v17, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v18, v0 ; GISEL-NEXT: v_xor_b32_e32 v7, v7, v4 ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v13, v16, v1 +; GISEL-NEXT: v_mul_lo_u32 v13, v18, v1 ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 -; GISEL-NEXT: v_mul_hi_u32 v12, v15, v1 +; GISEL-NEXT: v_mul_hi_u32 v12, v17, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v13, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v12 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v11 -; GISEL-NEXT: v_mul_hi_u32 v1, v16, v1 +; GISEL-NEXT: v_mul_hi_u32 v1, v18, v1 ; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v8, v0, 0 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v1, v13 +; GISEL-NEXT: v_add_i32_e32 v16, vcc, v1, v13 ; GISEL-NEXT: v_mov_b32_e32 v1, v12 -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v8, v14, v[1:2] +; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v8, v16, v[1:2] ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v9, v6 ; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v10, v6, vcc -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v5, v0, v[12:13] ; GISEL-NEXT: v_xor_b32_e32 v10, v1, v6 ; GISEL-NEXT: v_xor_b32_e32 v9, v9, v6 +; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v5, v0, v[12:13] ; GISEL-NEXT: v_cvt_f32_u32_e32 v1, v10 -; GISEL-NEXT: v_cvt_f32_u32_e32 v13, v9 -; GISEL-NEXT: v_sub_i32_e32 v15, vcc, v15, v11 -; GISEL-NEXT: v_sub_i32_e64 v11, s[4:5], v16, v12 -; GISEL-NEXT: v_mac_f32_e32 v1, 0x4f800000, v13 +; GISEL-NEXT: v_cvt_f32_u32_e32 v12, v9 +; GISEL-NEXT: v_sub_i32_e32 v13, vcc, v17, v11 +; GISEL-NEXT: v_sub_i32_e64 v11, s[4:5], v18, v14 +; GISEL-NEXT: v_mac_f32_e32 v1, 0x4f800000, v12 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GISEL-NEXT: v_subb_u32_e64 v17, s[4:5], v16, v12, vcc -; GISEL-NEXT: v_subb_u32_e32 v13, vcc, v11, v5, vcc +; GISEL-NEXT: v_subb_u32_e64 v17, s[4:5], v18, v14, vcc +; GISEL-NEXT: v_subb_u32_e32 v14, vcc, v11, v5, vcc ; GISEL-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1 ; GISEL-NEXT: v_mul_f32_e32 v11, 0x2f800000, v1 -; GISEL-NEXT: v_trunc_f32_e32 v16, v11 -; GISEL-NEXT: v_mac_f32_e32 v1, 0xcf800000, v16 +; GISEL-NEXT: v_trunc_f32_e32 v15, v11 +; GISEL-NEXT: v_mac_f32_e32 v1, 0xcf800000, v15 ; GISEL-NEXT: v_cvt_u32_f32_e32 v18, v1 ; GISEL-NEXT: v_sub_i32_e32 v19, vcc, 0, v10 ; GISEL-NEXT: v_subb_u32_e32 v20, vcc, 0, v9, vcc ; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[6:7], v19, v18, 0 -; GISEL-NEXT: v_cvt_u32_f32_e32 v16, v16 -; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v15, v8 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v15, v8 -; GISEL-NEXT: v_subbrev_u32_e32 v15, vcc, 0, v13, vcc +; GISEL-NEXT: v_cvt_u32_f32_e32 v22, v15 +; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v13, v8 +; GISEL-NEXT: v_subbrev_u32_e32 v21, vcc, 0, v14, vcc ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v8 ; GISEL-NEXT: v_mov_b32_e32 v1, v12 -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[6:7], v19, v16, v[1:2] -; GISEL-NEXT: v_mul_lo_u32 v1, v16, v11 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v15, 
v5 -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[6:7], v20, v18, v[12:13] -; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v13, v8 +; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[6:7], v19, v22, v[1:2] +; GISEL-NEXT: v_mul_lo_u32 v1, v22, v11 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v21, v5 +; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[6:7], v20, v18, v[12:13] +; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v17, v5 -; GISEL-NEXT: v_mul_lo_u32 v8, v18, v12 +; GISEL-NEXT: v_mul_lo_u32 v8, v18, v14 ; GISEL-NEXT: v_add_i32_e64 v1, s[6:7], v1, v8 ; GISEL-NEXT: v_mul_hi_u32 v8, v18, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[6:7] -; GISEL-NEXT: v_mul_hi_u32 v11, v16, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[6:7] +; GISEL-NEXT: v_mul_hi_u32 v11, v22, v11 ; GISEL-NEXT: v_add_i32_e64 v1, s[6:7], v1, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[8:9] ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v15, v5 +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v21, v5 ; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v17, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[8:9] +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[8:9] ; GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc ; GISEL-NEXT: v_add_i32_e32 v8, vcc, 1, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v5, v15, v21, s[4:5] -; GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v14, vcc -; GISEL-NEXT: v_add_i32_e32 v17, vcc, 1, v8 -; GISEL-NEXT: v_addc_u32_e32 v21, vcc, 0, v15, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v5, v13, v15, s[4:5] +; GISEL-NEXT: v_addc_u32_e32 v13, vcc, 0, v16, vcc +; GISEL-NEXT: v_add_i32_e32 v15, vcc, 1, v8 +; GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v13, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; GISEL-NEXT: v_cndmask_b32_e32 v1, v8, v17, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v8, v15, v21, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[6:7] -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v15 -; GISEL-NEXT: v_mul_lo_u32 v15, v16, v12 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v15, v11 -; GISEL-NEXT: v_mul_hi_u32 v15, v18, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; GISEL-NEXT: v_mul_hi_u32 v12, v16, v12 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v15 +; GISEL-NEXT: v_cndmask_b32_e32 v1, v8, v15, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v8, v13, v17, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[6:7] +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v13 +; GISEL-NEXT: v_mul_lo_u32 v13, v22, v14 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11 +; GISEL-NEXT: v_mul_hi_u32 v13, v18, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v17, v15 +; GISEL-NEXT: v_mul_hi_u32 v14, v22, v14 ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v15, v13 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v13 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v14, v12 ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v18, v11 -; GISEL-NEXT: v_addc_u32_e32 v15, vcc, v16, v12, vcc +; GISEL-NEXT: v_addc_u32_e32 v14, vcc, v22, v12, vcc ; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v19, v13, 0 ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; GISEL-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc ; GISEL-NEXT: v_mov_b32_e32 v0, v12 -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v19, v15, v[0:1] -; GISEL-NEXT: 
v_ashrrev_i32_e32 v12, 31, v3 -; GISEL-NEXT: v_cndmask_b32_e32 v8, v14, v8, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v20, v13, v[0:1] -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v2, v12 -; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v3, v12, vcc -; GISEL-NEXT: v_xor_b32_e32 v14, v1, v12 -; GISEL-NEXT: v_mul_lo_u32 v1, v15, v11 +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v19, v14, v[0:1] +; GISEL-NEXT: v_xor_b32_e32 v12, v1, v7 +; GISEL-NEXT: v_ashrrev_i32_e32 v15, 31, v3 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v20, v13, v[4:5] +; GISEL-NEXT: v_cndmask_b32_e32 v8, v16, v8, vcc +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v2, v15 +; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v3, v15, vcc +; GISEL-NEXT: v_xor_b32_e32 v5, v1, v15 +; GISEL-NEXT: v_mul_lo_u32 v1, v14, v11 ; GISEL-NEXT: v_mul_lo_u32 v3, v13, v0 -; GISEL-NEXT: v_xor_b32_e32 v16, v2, v12 +; GISEL-NEXT: v_xor_b32_e32 v16, v2, v15 ; GISEL-NEXT: v_mul_hi_u32 v2, v13, v11 -; GISEL-NEXT: v_mul_hi_u32 v4, v15, v11 +; GISEL-NEXT: v_mul_hi_u32 v4, v14, v11 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v2 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v2, v15, v0 +; GISEL-NEXT: v_mul_lo_u32 v2, v14, v0 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v3, v1 ; GISEL-NEXT: v_mul_hi_u32 v3, v13, v0 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v4 @@ -2042,25 +2043,25 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GISEL-NEXT: v_mul_hi_u32 v0, v15, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v14, v0 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v2, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v13, v1 -; GISEL-NEXT: v_addc_u32_e32 v0, vcc, v15, v0, vcc +; GISEL-NEXT: v_addc_u32_e32 v0, vcc, v14, v0, vcc ; GISEL-NEXT: v_mul_lo_u32 v2, v16, v1 -; GISEL-NEXT: v_mul_lo_u32 v3, v14, v0 -; GISEL-NEXT: v_mul_hi_u32 v4, v14, v1 +; GISEL-NEXT: v_mul_lo_u32 v3, v5, v0 +; GISEL-NEXT: v_mul_hi_u32 v4, v5, v1 ; GISEL-NEXT: v_mul_hi_u32 v1, v16, v1 -; GISEL-NEXT: v_xor_b32_e32 v5, v5, v7 +; GISEL-NEXT: v_xor_b32_e32 v8, v8, v7 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GISEL-NEXT: v_mul_lo_u32 v4, v16, v0 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GISEL-NEXT: v_mul_hi_u32 v3, v14, v0 +; GISEL-NEXT: v_mul_hi_u32 v3, v5, v0 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v4, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v3 @@ -2074,39 +2075,38 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v0, v1 ; GISEL-NEXT: v_mov_b32_e32 v0, v3 ; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v10, v13, v[0:1] -; GISEL-NEXT: v_xor_b32_e32 v8, v8, v7 -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v5, v7 -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v11, v[3:4] +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v12, v7 ; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v8, v7, vcc -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v14, v2 -; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v16, v3, vcc -; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v16, v3 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v9 -; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v9, 
vcc +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v9, v11, v[3:4] +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v5, v2 +; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], v16, v7, vcc +; GISEL-NEXT: v_sub_i32_e64 v4, s[4:5], v16, v7 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v9 +; GISEL-NEXT: v_subb_u32_e32 v4, vcc, v4, v9, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v10 ; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v9 -; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v4, v5, v7, s[4:5] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v9 +; GISEL-NEXT: v_subbrev_u32_e32 v4, vcc, 0, v4, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v3, v5, v7, s[4:5] ; GISEL-NEXT: v_add_i32_e32 v5, vcc, 1, v11 ; GISEL-NEXT: v_addc_u32_e32 v7, vcc, 0, v13, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v3, v9 +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v4, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v2, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v3, v9 +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v4, v9 ; GISEL-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, 1, v5 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, 1, v5 ; GISEL-NEXT: v_addc_u32_e32 v8, vcc, 0, v7, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v5, v3, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GISEL-NEXT: v_cndmask_b32_e32 v2, v5, v4, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 ; GISEL-NEXT: v_cndmask_b32_e32 v2, v11, v2, vcc -; GISEL-NEXT: v_xor_b32_e32 v4, v12, v6 -; GISEL-NEXT: v_cndmask_b32_e32 v3, v13, v3, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v3, v13, v4, vcc +; GISEL-NEXT: v_xor_b32_e32 v4, v15, v6 ; GISEL-NEXT: v_xor_b32_e32 v2, v2, v4 ; GISEL-NEXT: v_xor_b32_e32 v3, v3, v4 ; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v4 @@ -2138,126 +2138,126 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP-NEXT: v_xor_b32_e32 v1, v10, v0 ; CGP-NEXT: v_cvt_f32_u32_e32 v10, v4 ; CGP-NEXT: v_cvt_f32_u32_e32 v11, v1 -; CGP-NEXT: v_sub_i32_e32 v14, vcc, 0, v4 -; CGP-NEXT: v_subb_u32_e32 v15, vcc, 0, v1, vcc +; CGP-NEXT: v_sub_i32_e32 v17, vcc, 0, v4 +; CGP-NEXT: v_subb_u32_e32 v18, vcc, 0, v1, vcc ; CGP-NEXT: v_mac_f32_e32 v10, 0x4f800000, v11 ; CGP-NEXT: v_rcp_iflag_f32_e32 v10, v10 ; CGP-NEXT: v_mul_f32_e32 v10, 0x5f7ffffc, v10 ; CGP-NEXT: v_mul_f32_e32 v11, 0x2f800000, v10 ; CGP-NEXT: v_trunc_f32_e32 v12, v11 ; CGP-NEXT: v_mac_f32_e32 v10, 0xcf800000, v12 -; CGP-NEXT: v_cvt_u32_f32_e32 v13, v10 -; CGP-NEXT: v_cvt_u32_f32_e32 v16, v12 -; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v13, 0 -; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v14, v16, v[11:12] -; CGP-NEXT: v_mul_hi_u32 v17, v13, v10 -; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v13, v[11:12] -; CGP-NEXT: v_mul_lo_u32 v12, v16, v10 -; CGP-NEXT: v_mul_hi_u32 v10, v16, v10 -; CGP-NEXT: v_mul_lo_u32 v18, v13, v11 -; CGP-NEXT: v_mul_lo_u32 v19, v16, v11 -; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v18 -; CGP-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v17 -; CGP-NEXT: v_mul_hi_u32 v17, v13, v11 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v12, vcc, v18, v12 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v19, v10 -; CGP-NEXT: 
v_cndmask_b32_e64 v18, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v17 -; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v17, vcc, v18, v17 -; CGP-NEXT: v_mul_hi_u32 v11, v16, v11 +; CGP-NEXT: v_cvt_u32_f32_e32 v16, v10 +; CGP-NEXT: v_cvt_u32_f32_e32 v19, v12 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v17, v16, 0 +; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v17, v19, v[11:12] +; CGP-NEXT: v_mul_lo_u32 v11, v19, v10 +; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v18, v16, v[12:13] +; CGP-NEXT: v_mul_hi_u32 v12, v16, v10 +; CGP-NEXT: v_mul_hi_u32 v10, v19, v10 +; CGP-NEXT: v_mul_lo_u32 v13, v16, v14 +; CGP-NEXT: v_mul_lo_u32 v15, v19, v14 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v13 +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v12 +; CGP-NEXT: v_mul_hi_u32 v12, v16, v14 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v11, vcc, v13, v11 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v15, v10 +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v12 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v12, vcc, v17, v12 -; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v12 -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v10 -; CGP-NEXT: v_addc_u32_e32 v16, vcc, v16, v11, vcc -; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v13, 0 -; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v14, v16, v[11:12] -; CGP-NEXT: v_ashrrev_i32_e32 v14, 31, v9 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v14 -; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v13, v[11:12] -; CGP-NEXT: v_addc_u32_e32 v9, vcc, v9, v14, vcc -; CGP-NEXT: v_xor_b32_e32 v12, v8, v14 -; CGP-NEXT: v_mul_lo_u32 v8, v16, v10 -; CGP-NEXT: v_mul_lo_u32 v15, v13, v11 -; CGP-NEXT: v_xor_b32_e32 v17, v9, v14 -; CGP-NEXT: v_mul_hi_u32 v9, v13, v10 -; CGP-NEXT: v_mul_hi_u32 v10, v16, v10 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v15 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12 +; CGP-NEXT: v_mul_hi_u32 v13, v19, v14 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v11 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v13, v11 +; CGP-NEXT: v_add_i32_e32 v16, vcc, v16, v10 +; CGP-NEXT: v_addc_u32_e32 v19, vcc, v19, v11, vcc +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v17, v16, 0 +; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v17, v19, v[11:12] +; CGP-NEXT: v_ashrrev_i32_e32 v17, 31, v9 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v17 +; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v18, v16, v[12:13] +; CGP-NEXT: v_addc_u32_e32 v9, vcc, v9, v17, vcc +; CGP-NEXT: v_xor_b32_e32 v15, v8, v17 +; CGP-NEXT: v_mul_lo_u32 v8, v19, v10 +; CGP-NEXT: v_mul_lo_u32 v11, v16, v14 +; CGP-NEXT: v_xor_b32_e32 v18, v9, v17 +; CGP-NEXT: v_mul_hi_u32 v9, v16, v10 +; CGP-NEXT: v_mul_hi_u32 v10, v19, v10 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v11 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9 ; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v9, v16, v11 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v15, v8 -; CGP-NEXT: v_mul_hi_u32 v15, v13, v11 +; CGP-NEXT: v_mul_lo_u32 v9, v19, v14 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v11, v8 +; CGP-NEXT: v_mul_hi_u32 v11, v16, v14 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v10 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v15 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v15 -; CGP-NEXT: 
v_mul_hi_u32 v11, v16, v11 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v11 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v11 +; CGP-NEXT: v_mul_hi_u32 v11, v19, v14 ; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v11, v9 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v13, v8 -; CGP-NEXT: v_addc_u32_e32 v9, vcc, v16, v9, vcc -; CGP-NEXT: v_mul_lo_u32 v10, v17, v8 -; CGP-NEXT: v_mul_lo_u32 v11, v12, v9 -; CGP-NEXT: v_mul_hi_u32 v13, v12, v8 -; CGP-NEXT: v_mul_hi_u32 v8, v17, v8 -; CGP-NEXT: v_mul_hi_u32 v15, v17, v9 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v16, v8 +; CGP-NEXT: v_addc_u32_e32 v9, vcc, v19, v9, vcc +; CGP-NEXT: v_mul_lo_u32 v10, v18, v8 +; CGP-NEXT: v_mul_lo_u32 v11, v15, v9 +; CGP-NEXT: v_mul_hi_u32 v12, v15, v8 +; CGP-NEXT: v_mul_hi_u32 v8, v18, v8 ; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v11 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v13 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v12 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v13, v17, v9 +; CGP-NEXT: v_mul_lo_u32 v12, v18, v9 ; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; CGP-NEXT: v_mul_hi_u32 v11, v12, v9 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v13, v8 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; CGP-NEXT: v_mul_hi_u32 v11, v15, v9 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v12, v8 +; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v11 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v11, vcc, v13, v11 -; CGP-NEXT: v_add_i32_e32 v13, vcc, v8, v10 -; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v4, v13, 0 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11 +; CGP-NEXT: v_add_i32_e32 v14, vcc, v8, v10 +; CGP-NEXT: v_mul_hi_u32 v12, v18, v9 +; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v4, v14, 0 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; CGP-NEXT: v_add_i32_e32 v11, vcc, v15, v10 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v4, v11, v[9:10] -; CGP-NEXT: v_sub_i32_e32 v8, vcc, v12, v8 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v1, v13, v[9:10] -; CGP-NEXT: v_subb_u32_e64 v10, s[4:5], v17, v9, vcc -; CGP-NEXT: v_sub_i32_e64 v9, s[4:5], v17, v9 -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v1 -; CGP-NEXT: v_subb_u32_e32 v9, vcc, v9, v1, vcc -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5] +; CGP-NEXT: v_add_i32_e32 v16, vcc, v12, v10 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v4, v16, v[9:10] +; CGP-NEXT: v_sub_i32_e32 v8, vcc, v15, v8 +; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v1, v14, v[10:11] +; CGP-NEXT: v_subb_u32_e64 v9, s[4:5], v18, v12, vcc +; CGP-NEXT: v_sub_i32_e64 v10, s[4:5], v18, v12 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v1 +; CGP-NEXT: v_subb_u32_e32 v10, vcc, v10, v1, vcc +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v4 ; CGP-NEXT: v_sub_i32_e32 v8, vcc, v8, v4 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v10, v1 -; CGP-NEXT: v_subbrev_u32_e32 v9, vcc, 0, v9, vcc -; CGP-NEXT: v_cndmask_b32_e64 v10, v12, v15, s[4:5] -; CGP-NEXT: v_add_i32_e32 v12, vcc, 1, v13 -; CGP-NEXT: v_addc_u32_e32 v15, vcc, 0, v11, vcc -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v9, v1 -; CGP-NEXT: v_cndmask_b32_e64 v16, 0, -1, vcc +; CGP-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5] +; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v9, v1 +; CGP-NEXT: v_subbrev_u32_e32 v10, vcc, 
0, v10, vcc +; CGP-NEXT: v_cndmask_b32_e64 v9, v11, v12, s[4:5] +; CGP-NEXT: v_add_i32_e32 v11, vcc, 1, v14 +; CGP-NEXT: v_addc_u32_e32 v12, vcc, 0, v16, vcc +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v10, v1 +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, -1, vcc ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v8, v4 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc -; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v9, v1 -; CGP-NEXT: v_cndmask_b32_e32 v1, v16, v4, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v12 -; CGP-NEXT: v_addc_u32_e32 v8, vcc, 0, v15, vcc +; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v10, v1 +; CGP-NEXT: v_cndmask_b32_e32 v1, v13, v4, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v11 +; CGP-NEXT: v_addc_u32_e32 v8, vcc, 0, v12, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; CGP-NEXT: v_cndmask_b32_e32 v1, v12, v4, vcc -; CGP-NEXT: v_cndmask_b32_e32 v4, v15, v8, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; CGP-NEXT: v_cndmask_b32_e32 v1, v13, v1, vcc -; CGP-NEXT: v_xor_b32_e32 v8, v14, v0 -; CGP-NEXT: v_cndmask_b32_e32 v4, v11, v4, vcc +; CGP-NEXT: v_cndmask_b32_e32 v1, v11, v4, vcc +; CGP-NEXT: v_cndmask_b32_e32 v4, v12, v8, vcc +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 +; CGP-NEXT: v_cndmask_b32_e32 v1, v14, v1, vcc +; CGP-NEXT: v_xor_b32_e32 v8, v17, v0 +; CGP-NEXT: v_cndmask_b32_e32 v4, v16, v4, vcc ; CGP-NEXT: v_xor_b32_e32 v0, v1, v8 ; CGP-NEXT: v_xor_b32_e32 v1, v4, v8 ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v8 @@ -2313,128 +2313,128 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP-NEXT: v_xor_b32_e32 v3, v6, v2 ; CGP-NEXT: v_cvt_f32_u32_e32 v6, v4 ; CGP-NEXT: v_cvt_f32_u32_e32 v8, v3 -; CGP-NEXT: v_sub_i32_e32 v12, vcc, 0, v4 -; CGP-NEXT: v_subb_u32_e32 v13, vcc, 0, v3, vcc +; CGP-NEXT: v_sub_i32_e32 v14, vcc, 0, v4 +; CGP-NEXT: v_subb_u32_e32 v15, vcc, 0, v3, vcc ; CGP-NEXT: v_mac_f32_e32 v6, 0x4f800000, v8 ; CGP-NEXT: v_rcp_iflag_f32_e32 v6, v6 ; CGP-NEXT: v_mul_f32_e32 v6, 0x5f7ffffc, v6 ; CGP-NEXT: v_mul_f32_e32 v8, 0x2f800000, v6 ; CGP-NEXT: v_trunc_f32_e32 v10, v8 ; CGP-NEXT: v_mac_f32_e32 v6, 0xcf800000, v10 -; CGP-NEXT: v_cvt_u32_f32_e32 v11, v6 -; CGP-NEXT: v_cvt_u32_f32_e32 v14, v10 -; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v11, 0 +; CGP-NEXT: v_cvt_u32_f32_e32 v13, v6 +; CGP-NEXT: v_cvt_u32_f32_e32 v16, v10 +; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v14, v13, 0 ; CGP-NEXT: v_mov_b32_e32 v6, v9 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v12, v14, v[6:7] -; CGP-NEXT: v_mul_lo_u32 v6, v14, v8 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v11, v[9:10] -; CGP-NEXT: v_mul_hi_u32 v10, v11, v8 -; CGP-NEXT: v_mul_hi_u32 v8, v14, v8 -; CGP-NEXT: v_mul_lo_u32 v15, v11, v9 -; CGP-NEXT: v_mul_lo_u32 v16, v14, v9 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v15 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v14, v16, v[6:7] +; CGP-NEXT: v_mul_lo_u32 v6, v16, v8 +; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v13, v[9:10] +; CGP-NEXT: v_mul_hi_u32 v9, v13, v8 +; CGP-NEXT: v_mul_hi_u32 v8, v16, v8 +; CGP-NEXT: v_mul_lo_u32 v10, v13, v11 +; CGP-NEXT: v_mul_lo_u32 v12, v16, v11 ; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v10 -; CGP-NEXT: v_mul_hi_u32 v10, v11, v9 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v9 +; CGP-NEXT: v_mul_hi_u32 v9, v13, v11 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v15, v6 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v16, v8 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v10 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v10, v6 
+; CGP-NEXT: v_add_i32_e32 v8, vcc, v12, v8 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v10, vcc, v15, v10 -; CGP-NEXT: v_mul_hi_u32 v9, v14, v9 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9 +; CGP-NEXT: v_mul_hi_u32 v10, v16, v11 ; CGP-NEXT: v_add_i32_e32 v6, vcc, v8, v6 ; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v8, vcc, v10, v8 ; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v6 -; CGP-NEXT: v_addc_u32_e32 v14, vcc, v14, v8, vcc -; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v11, 0 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v10, v8 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v6 +; CGP-NEXT: v_addc_u32_e32 v16, vcc, v16, v8, vcc +; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v14, v13, 0 ; CGP-NEXT: v_mov_b32_e32 v6, v9 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v12, v14, v[6:7] -; CGP-NEXT: v_ashrrev_i32_e32 v12, 31, v7 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v12 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v11, v[9:10] -; CGP-NEXT: v_addc_u32_e32 v6, vcc, v7, v12, vcc -; CGP-NEXT: v_xor_b32_e32 v10, v5, v12 -; CGP-NEXT: v_mul_lo_u32 v5, v14, v8 -; CGP-NEXT: v_mul_lo_u32 v7, v11, v9 -; CGP-NEXT: v_xor_b32_e32 v13, v6, v12 -; CGP-NEXT: v_mul_hi_u32 v6, v11, v8 -; CGP-NEXT: v_mul_hi_u32 v8, v14, v8 +; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v14, v16, v[6:7] +; CGP-NEXT: v_ashrrev_i32_e32 v14, 31, v7 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v14 +; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v13, v[9:10] +; CGP-NEXT: v_addc_u32_e32 v6, vcc, v7, v14, vcc +; CGP-NEXT: v_xor_b32_e32 v12, v5, v14 +; CGP-NEXT: v_mul_lo_u32 v5, v16, v8 +; CGP-NEXT: v_mul_lo_u32 v7, v13, v11 +; CGP-NEXT: v_xor_b32_e32 v15, v6, v14 +; CGP-NEXT: v_mul_hi_u32 v6, v13, v8 +; CGP-NEXT: v_mul_hi_u32 v8, v16, v8 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v6, v14, v9 +; CGP-NEXT: v_mul_lo_u32 v6, v16, v11 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5 -; CGP-NEXT: v_mul_hi_u32 v7, v11, v9 +; CGP-NEXT: v_mul_hi_u32 v7, v13, v11 ; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v8 ; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v7 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; CGP-NEXT: v_mul_hi_u32 v8, v14, v9 +; CGP-NEXT: v_mul_hi_u32 v8, v16, v11 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6 ; CGP-NEXT: v_add_i32_e32 v6, vcc, v8, v6 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v11, v5 -; CGP-NEXT: v_addc_u32_e32 v6, vcc, v14, v6, vcc -; CGP-NEXT: v_mul_lo_u32 v7, v13, v5 -; CGP-NEXT: v_mul_lo_u32 v8, v10, v6 -; CGP-NEXT: v_mul_hi_u32 v9, v10, v5 -; CGP-NEXT: v_mul_hi_u32 v5, v13, v5 -; CGP-NEXT: v_mul_hi_u32 v11, v13, v6 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v13, v5 +; CGP-NEXT: v_addc_u32_e32 v6, vcc, v16, v6, vcc +; CGP-NEXT: v_mul_lo_u32 v7, v15, v5 +; CGP-NEXT: v_mul_lo_u32 v8, v12, v6 +; CGP-NEXT: v_mul_hi_u32 v9, v12, v5 +; CGP-NEXT: v_mul_hi_u32 v5, v15, v5 ; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v8 ; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v9 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v9, v13, v6 +; CGP-NEXT: v_mul_lo_u32 v9, v15, v6 ; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; CGP-NEXT: 
v_mul_hi_u32 v8, v10, v6 +; CGP-NEXT: v_mul_hi_u32 v8, v12, v6 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v9, v5 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v8 ; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v5, v7 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v9, 0 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v5, v7 +; CGP-NEXT: v_mul_hi_u32 v9, v15, v6 +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v11, 0 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v11, v7 -; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v4, v8, v[6:7] -; CGP-NEXT: v_sub_i32_e32 v5, vcc, v10, v5 -; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v3, v9, v[6:7] -; CGP-NEXT: v_subb_u32_e64 v7, s[4:5], v13, v6, vcc -; CGP-NEXT: v_sub_i32_e64 v6, s[4:5], v13, v6 -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v3 -; CGP-NEXT: v_subb_u32_e32 v6, vcc, v6, v3, vcc -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] +; CGP-NEXT: v_add_i32_e32 v13, vcc, v9, v7 +; CGP-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v4, v13, v[6:7] +; CGP-NEXT: v_sub_i32_e32 v5, vcc, v12, v5 +; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v3, v11, v[7:8] +; CGP-NEXT: v_subb_u32_e64 v6, s[4:5], v15, v9, vcc +; CGP-NEXT: v_sub_i32_e64 v7, s[4:5], v15, v9 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v3 +; CGP-NEXT: v_subb_u32_e32 v7, vcc, v7, v3, vcc +; CGP-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v4 ; CGP-NEXT: v_sub_i32_e32 v5, vcc, v5, v4 -; CGP-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v3 -; CGP-NEXT: v_subbrev_u32_e32 v6, vcc, 0, v6, vcc -; CGP-NEXT: v_cndmask_b32_e64 v7, v10, v11, s[4:5] -; CGP-NEXT: v_add_i32_e32 v10, vcc, 1, v9 -; CGP-NEXT: v_addc_u32_e32 v11, vcc, 0, v8, vcc -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v6, v3 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, -1, vcc +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] +; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v6, v3 +; CGP-NEXT: v_subbrev_u32_e32 v7, vcc, 0, v7, vcc +; CGP-NEXT: v_cndmask_b32_e64 v6, v8, v9, s[4:5] +; CGP-NEXT: v_add_i32_e32 v8, vcc, 1, v11 +; CGP-NEXT: v_addc_u32_e32 v9, vcc, 0, v13, vcc +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v7, v3 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v5, v4 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc -; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 -; CGP-NEXT: v_cndmask_b32_e32 v3, v13, v4, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v10 -; CGP-NEXT: v_addc_u32_e32 v5, vcc, 0, v11, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 ; CGP-NEXT: v_cndmask_b32_e32 v3, v10, v4, vcc -; CGP-NEXT: v_cndmask_b32_e32 v4, v11, v5, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 -; CGP-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc -; CGP-NEXT: v_xor_b32_e32 v5, v12, v2 -; CGP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v8 +; CGP-NEXT: v_addc_u32_e32 v5, vcc, 0, v9, vcc +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; CGP-NEXT: v_cndmask_b32_e32 v3, v8, v4, vcc +; CGP-NEXT: v_cndmask_b32_e32 v4, v9, v5, vcc +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; CGP-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc +; CGP-NEXT: v_xor_b32_e32 v5, v14, v2 +; CGP-NEXT: v_cndmask_b32_e32 v4, v13, v4, vcc ; CGP-NEXT: v_xor_b32_e32 v2, v3, v5 ; CGP-NEXT: v_xor_b32_e32 v3, v4, v5 ; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v5 @@ -2504,15 +2504,15 @@ define i64 @v_sdiv_i64_24bit(i64 %num, i64 %den) { ; 
CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CGP-NEXT: v_and_b32_e32 v3, 0xffffff, v2 ; CGP-NEXT: v_cvt_f32_u32_e32 v1, v3 -; CGP-NEXT: v_and_b32_e32 v5, 0xffffff, v0 ; CGP-NEXT: v_rcp_f32_e32 v1, v1 ; CGP-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; CGP-NEXT: v_cvt_u32_f32_e32 v4, v1 ; CGP-NEXT: v_sub_i32_e32 v1, vcc, 0, v3 -; CGP-NEXT: v_mul_lo_u32 v1, v1, v4 -; CGP-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v4, v1, 0 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v4, v2 -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v0, 0 +; CGP-NEXT: v_mul_lo_u32 v5, v1, v4 +; CGP-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v4, v5, 0 +; CGP-NEXT: v_and_b32_e32 v5, 0xffffff, v0 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v4, v2 +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v2, 0 ; CGP-NEXT: v_mul_lo_u32 v0, v1, v3 ; CGP-NEXT: v_add_i32_e32 v2, vcc, 1, v1 ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v5, v0 @@ -2537,198 +2537,198 @@ define <2 x i64> @v_sdiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: v_and_b32_e32 v1, 0xffffff, v4 ; GISEL-NEXT: v_cvt_f32_u32_e32 v3, v1 -; GISEL-NEXT: v_cvt_f32_ubyte0_e32 v9, 0 -; GISEL-NEXT: v_sub_i32_e32 v8, vcc, 0, v1 -; GISEL-NEXT: v_mac_f32_e32 v3, 0x4f800000, v9 +; GISEL-NEXT: v_cvt_f32_ubyte0_e32 v10, 0 +; GISEL-NEXT: v_sub_i32_e32 v11, vcc, 0, v1 +; GISEL-NEXT: v_mac_f32_e32 v3, 0x4f800000, v10 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v3, v3 -; GISEL-NEXT: v_subb_u32_e64 v10, s[4:5], 0, 0, vcc +; GISEL-NEXT: v_subb_u32_e64 v12, s[4:5], 0, 0, vcc ; GISEL-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v3 ; GISEL-NEXT: v_mul_f32_e32 v4, 0x2f800000, v3 ; GISEL-NEXT: v_trunc_f32_e32 v5, v4 ; GISEL-NEXT: v_mac_f32_e32 v3, 0xcf800000, v5 -; GISEL-NEXT: v_cvt_u32_f32_e32 v7, v3 -; GISEL-NEXT: v_cvt_u32_f32_e32 v11, v5 -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v8, v7, 0 -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v8, v11, v[4:5] -; GISEL-NEXT: v_mul_hi_u32 v12, v7, v3 -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v7, v[4:5] -; GISEL-NEXT: v_mul_lo_u32 v5, v11, v3 -; GISEL-NEXT: v_mul_hi_u32 v3, v11, v3 -; GISEL-NEXT: v_mul_lo_u32 v13, v7, v4 -; GISEL-NEXT: v_mul_lo_u32 v14, v11, v4 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v12 -; GISEL-NEXT: v_mul_hi_u32 v12, v7, v4 +; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v3 +; GISEL-NEXT: v_cvt_u32_f32_e32 v13, v5 +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v11, v9, 0 +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v13, v[4:5] +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v12, v9, v[7:8] +; GISEL-NEXT: v_mul_lo_u32 v5, v13, v3 +; GISEL-NEXT: v_mul_hi_u32 v7, v9, v3 +; GISEL-NEXT: v_mul_lo_u32 v8, v9, v4 +; GISEL-NEXT: v_mul_hi_u32 v3, v13, v3 +; GISEL-NEXT: v_mul_lo_u32 v14, v13, v4 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; GISEL-NEXT: v_mul_hi_u32 v7, v9, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v13, v5 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v8, v5 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v14, v3 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12 -; GISEL-NEXT: v_mul_hi_u32 v4, v11, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v7, 
vcc, v8, v7 +; GISEL-NEXT: v_mul_hi_u32 v4, v13, v4 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v12, v5 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v3 -; GISEL-NEXT: v_addc_u32_e32 v11, vcc, v11, v4, vcc -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v8, v7, 0 -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v8, v11, v[4:5] -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v7, v[4:5] -; GISEL-NEXT: v_mul_lo_u32 v5, v11, v3 -; GISEL-NEXT: v_and_b32_e32 v10, 0xffffff, v0 -; GISEL-NEXT: v_mul_lo_u32 v8, v7, v4 -; GISEL-NEXT: v_mul_hi_u32 v0, v7, v3 -; GISEL-NEXT: v_mul_hi_u32 v3, v11, v3 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v3 +; GISEL-NEXT: v_addc_u32_e32 v13, vcc, v13, v4, vcc +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v11, v9, 0 +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v13, v[4:5] +; GISEL-NEXT: v_and_b32_e32 v11, 0xffffff, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v9, v3 +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v12, v9, v[7:8] +; GISEL-NEXT: v_mul_lo_u32 v5, v13, v3 +; GISEL-NEXT: v_mul_hi_u32 v3, v13, v3 +; GISEL-NEXT: v_mul_lo_u32 v7, v9, v4 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v5, v11, v4 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0 -; GISEL-NEXT: v_mul_hi_u32 v8, v7, v4 +; GISEL-NEXT: v_mul_lo_u32 v5, v13, v4 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v7, v0 +; GISEL-NEXT: v_mul_hi_u32 v7, v9, v4 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v5, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8 -; GISEL-NEXT: v_mul_hi_u32 v4, v11, v4 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; GISEL-NEXT: v_mul_hi_u32 v4, v13, v4 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v5, v3 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v7, v0 -; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v11, v3, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0 +; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v13, v3, vcc ; GISEL-NEXT: v_mul_lo_u32 v4, 0, v0 -; GISEL-NEXT: v_mul_lo_u32 v5, v10, v3 -; GISEL-NEXT: v_mul_hi_u32 v7, v10, v0 +; GISEL-NEXT: v_mul_lo_u32 v5, v11, v3 +; GISEL-NEXT: v_mul_hi_u32 v7, v11, v0 ; GISEL-NEXT: v_mul_hi_u32 v0, 0, v0 -; GISEL-NEXT: v_and_b32_e32 v11, 0xffffff, v2 +; GISEL-NEXT: v_and_b32_e32 v13, 0xffffff, v2 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; GISEL-NEXT: v_mul_lo_u32 v5, 0, v3 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7 -; GISEL-NEXT: v_mul_hi_u32 v7, v10, v3 +; GISEL-NEXT: v_mul_hi_u32 v7, v11, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v7 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4 -; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v1, v0, 0 ; GISEL-NEXT: v_mul_hi_u32 v4, 0, v3 ; GISEL-NEXT: v_and_b32_e32 v3, 0xffffff, v6 +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v1, v0, 0 +; GISEL-NEXT: v_cvt_f32_u32_e32 v12, v3 ; GISEL-NEXT: v_mov_b32_e32 v5, 
v8 -; GISEL-NEXT: v_cvt_f32_u32_e32 v8, v3 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v1, v4, v[5:6] -; GISEL-NEXT: v_mac_f32_e32 v8, 0x4f800000, v9 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v2, v8 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], 0, v0, v[5:6] -; GISEL-NEXT: v_sub_i32_e32 v9, vcc, v10, v7 +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v1, v4, v[5:6] +; GISEL-NEXT: v_mac_f32_e32 v12, 0x4f800000, v10 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v2, v12 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], 0, v0, v[8:9] +; GISEL-NEXT: v_sub_i32_e32 v11, vcc, v11, v7 ; GISEL-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 ; GISEL-NEXT: v_mul_f32_e32 v6, 0x2f800000, v2 ; GISEL-NEXT: v_trunc_f32_e32 v8, v6 ; GISEL-NEXT: v_mac_f32_e32 v2, 0xcf800000, v8 -; GISEL-NEXT: v_cvt_u32_f32_e32 v12, v2 -; GISEL-NEXT: v_subb_u32_e64 v10, s[4:5], 0, v5, vcc -; GISEL-NEXT: v_sub_i32_e64 v13, s[4:5], 0, v3 -; GISEL-NEXT: v_subb_u32_e64 v14, s[4:5], 0, 0, s[4:5] -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v13, v12, 0 -; GISEL-NEXT: v_cvt_u32_f32_e32 v15, v8 +; GISEL-NEXT: v_cvt_u32_f32_e32 v14, v2 +; GISEL-NEXT: v_subb_u32_e64 v12, s[4:5], 0, v5, vcc +; GISEL-NEXT: v_sub_i32_e64 v15, s[4:5], 0, v3 +; GISEL-NEXT: v_subb_u32_e64 v16, s[4:5], 0, 0, s[4:5] +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v15, v14, 0 +; GISEL-NEXT: v_cvt_u32_f32_e32 v17, v8 ; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], 0, v5 ; GISEL-NEXT: v_mov_b32_e32 v2, v7 -; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v13, v15, v[2:3] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v1 +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v15, v17, v[2:3] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v11, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[4:5] -; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v14, v12, v[7:8] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v8, -1, v2, s[4:5] -; GISEL-NEXT: v_mul_lo_u32 v2, v15, v6 -; GISEL-NEXT: v_mul_lo_u32 v10, v12, v7 -; GISEL-NEXT: v_subbrev_u32_e32 v16, vcc, 0, v5, vcc -; GISEL-NEXT: v_mul_hi_u32 v5, v12, v6 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v16, v14, v[7:8] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v10, -1, v2, s[4:5] +; GISEL-NEXT: v_mul_lo_u32 v2, v17, v6 +; GISEL-NEXT: v_mul_lo_u32 v7, v14, v9 +; GISEL-NEXT: v_subbrev_u32_e32 v8, vcc, 0, v5, vcc +; GISEL-NEXT: v_mul_hi_u32 v5, v14, v6 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v5, v15, v7 -; GISEL-NEXT: v_mul_hi_u32 v6, v15, v6 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v10, v2 -; GISEL-NEXT: v_mul_hi_u32 v10, v12, v7 +; GISEL-NEXT: v_mul_lo_u32 v5, v17, v9 +; GISEL-NEXT: v_mul_hi_u32 v6, v17, v6 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v7, v2 +; GISEL-NEXT: v_mul_hi_u32 v7, v14, v9 ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v10 -; GISEL-NEXT: v_mul_hi_u32 v7, v15, v7 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v7 +; GISEL-NEXT: v_mul_hi_u32 v7, v17, v9 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v5, v2 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5 ; GISEL-NEXT: 
v_add_i32_e32 v5, vcc, v7, v5 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v12, v2 -; GISEL-NEXT: v_addc_u32_e32 v12, vcc, v15, v5, vcc -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v10, 0 -; GISEL-NEXT: v_sub_i32_e32 v9, vcc, v9, v1 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v14, v2 +; GISEL-NEXT: v_addc_u32_e32 v14, vcc, v17, v5, vcc +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v15, v12, 0 +; GISEL-NEXT: v_sub_i32_e32 v11, vcc, v11, v1 ; GISEL-NEXT: v_mov_b32_e32 v2, v6 -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v13, v12, v[2:3] -; GISEL-NEXT: v_subbrev_u32_e32 v15, vcc, 0, v16, vcc -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v14, v10, v[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v15, v14, v[2:3] +; GISEL-NEXT: v_subbrev_u32_e32 v17, vcc, 0, v8, vcc +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v16, v12, v[6:7] ; GISEL-NEXT: v_add_i32_e32 v2, vcc, 1, v0 -; GISEL-NEXT: v_addc_u32_e32 v13, vcc, 0, v4, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v9, v1 -; GISEL-NEXT: v_mul_lo_u32 v7, v12, v5 -; GISEL-NEXT: v_mul_lo_u32 v9, v10, v6 -; GISEL-NEXT: v_mul_hi_u32 v14, v10, v5 +; GISEL-NEXT: v_mul_lo_u32 v6, v14, v5 +; GISEL-NEXT: v_mul_lo_u32 v7, v12, v8 +; GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v4, vcc +; GISEL-NEXT: v_mul_hi_u32 v9, v12, v5 +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v11, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v15 +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v17 ; GISEL-NEXT: v_cndmask_b32_e32 v1, -1, v1, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v14 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v14, v12, v6 -; GISEL-NEXT: v_mul_hi_u32 v5, v12, v5 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v9, v7 -; GISEL-NEXT: v_mul_hi_u32 v9, v10, v6 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v14, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v9 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v9, v14, v8 +; GISEL-NEXT: v_mul_hi_u32 v5, v14, v5 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; GISEL-NEXT: v_mul_hi_u32 v7, v12, v8 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v9, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v14, v9 -; GISEL-NEXT: v_mul_hi_u32 v6, v12, v6 ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v9, v7 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v7 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v10, v5 -; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v12, v6, vcc +; GISEL-NEXT: v_mul_hi_u32 v8, v14, v8 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v6 +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v8, v6 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v12, v5 +; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v14, v6, vcc ; GISEL-NEXT: v_mul_lo_u32 v6, 0, v5 -; GISEL-NEXT: v_mul_lo_u32 v9, v11, v7 -; GISEL-NEXT: v_mul_hi_u32 v14, v11, v5 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, 1, v2 -; GISEL-NEXT: v_addc_u32_e32 v12, vcc, 0, v13, vcc -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v9 -; GISEL-NEXT: v_mul_lo_u32 v9, 0, v7 +; GISEL-NEXT: v_mul_lo_u32 v8, v13, v7 +; GISEL-NEXT: v_mul_hi_u32 v12, v13, v5 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, 1, v2 +; GISEL-NEXT: v_addc_u32_e32 v11, vcc, 0, v15, vcc +; GISEL-NEXT: v_add_i32_e32 
v6, vcc, v6, v8 +; GISEL-NEXT: v_mul_lo_u32 v8, 0, v7 ; GISEL-NEXT: v_mul_hi_u32 v5, 0, v5 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v14 -; GISEL-NEXT: v_mul_hi_u32 v14, v11, v7 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v12 +; GISEL-NEXT: v_mul_hi_u32 v12, v13, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v9, v5 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v14 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v5, v6 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v3, v9, 0 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v8, v5 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v12 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v5, v6 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v3, v12, 0 +; GISEL-NEXT: v_mul_hi_u32 v14, 0, v7 ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc -; GISEL-NEXT: v_mul_hi_u32 v10, 0, v7 +; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc ; GISEL-NEXT: v_mov_b32_e32 v1, v6 -; GISEL-NEXT: v_cndmask_b32_e32 v12, v13, v12, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v3, v10, v[1:2] +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v3, v14, v[1:2] +; GISEL-NEXT: v_cndmask_b32_e32 v8, v15, v11, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GISEL-NEXT: v_cndmask_b32_e32 v1, v4, v8, vcc +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], 0, v12, v[6:7] ; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v1, v4, v12, vcc -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], 0, v9, v[6:7] -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v11, v5 -; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], 0, v6 -; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], 0, v6, vcc +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v13, v5 +; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], 0, v8 +; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], 0, v8, vcc ; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v3 ; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v3 @@ -2736,8 +2736,8 @@ define <2 x i64> @v_sdiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v4 ; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v4, -1, v6, s[4:5] -; GISEL-NEXT: v_add_i32_e32 v6, vcc, 1, v9 -; GISEL-NEXT: v_addc_u32_e32 v7, vcc, 0, v10, vcc +; GISEL-NEXT: v_add_i32_e32 v6, vcc, 1, v12 +; GISEL-NEXT: v_addc_u32_e32 v7, vcc, 0, v14, vcc ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v2, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 @@ -2748,8 +2748,8 @@ define <2 x i64> @v_sdiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: v_cndmask_b32_e32 v2, v6, v3, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v3, v7, v5, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v9, v2, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v3, v10, v3, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v2, v12, v2, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v3, v14, v3, vcc ; GISEL-NEXT: s_setpc_b64 s[30:31] ; ; CGP-LABEL: v_sdiv_v2i64_24bit: @@ -2769,27 +2769,27 @@ define <2 x i64> @v_sdiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v6, 0 ; CGP-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v7 ; CGP-NEXT: v_cvt_u32_f32_e32 v6, v0 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v5, v1 -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v0, 0 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v1 +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v5, 0 ; CGP-NEXT: v_sub_i32_e32 v0, vcc, 0, v4 -; CGP-NEXT: v_mul_lo_u32 v5, v1, v3 -; CGP-NEXT: v_mul_lo_u32 v0, 
v0, v6 +; CGP-NEXT: v_mul_lo_u32 v5, v0, v6 +; CGP-NEXT: v_mul_lo_u32 v0, v1, v3 ; CGP-NEXT: v_add_i32_e32 v7, vcc, 1, v1 -; CGP-NEXT: v_sub_i32_e32 v5, vcc, v8, v5 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v5, v3 +; CGP-NEXT: v_sub_i32_e32 v8, vcc, v8, v0 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v8, v3 ; CGP-NEXT: v_cndmask_b32_e32 v7, v1, v7, vcc -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v0, 0 -; CGP-NEXT: v_and_b32_e32 v8, 0xffffff, v2 -; CGP-NEXT: v_sub_i32_e64 v0, s[4:5], v5, v3 -; CGP-NEXT: v_add_i32_e64 v1, s[4:5], v6, v1 -; CGP-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v8, v1, 0 -; CGP-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v5, 0 +; CGP-NEXT: v_and_b32_e32 v5, 0xffffff, v2 +; CGP-NEXT: v_sub_i32_e64 v0, s[4:5], v8, v3 +; CGP-NEXT: v_add_i32_e64 v6, s[4:5], v6, v1 +; CGP-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v6, 0 +; CGP-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc ; CGP-NEXT: v_add_i32_e32 v1, vcc, 1, v7 -; CGP-NEXT: v_mul_lo_u32 v5, v2, v4 +; CGP-NEXT: v_mul_lo_u32 v6, v2, v4 ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v3 ; CGP-NEXT: v_cndmask_b32_e32 v0, v7, v1, vcc ; CGP-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; CGP-NEXT: v_sub_i32_e32 v3, vcc, v8, v5 +; CGP-NEXT: v_sub_i32_e32 v3, vcc, v5, v6 ; CGP-NEXT: v_add_i32_e32 v5, vcc, 1, v2 ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v3, v4 ; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll index 1441591a5fcce..f4489c2239fda 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll @@ -175,65 +175,65 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX8-NEXT: v_trunc_f32_e32 v2, v1 ; GFX8-NEXT: v_mul_f32_e32 v1, 0xcf800000, v2 ; GFX8-NEXT: v_add_f32_e32 v0, v1, v0 -; GFX8-NEXT: v_cvt_u32_f32_e32 v3, v0 -; GFX8-NEXT: v_cvt_u32_f32_e32 v4, v2 -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s14, v3, 0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s14, v4, v[1:2] -; GFX8-NEXT: v_mul_hi_u32 v5, v3, v0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s15, v3, v[1:2] -; GFX8-NEXT: v_mul_lo_u32 v2, v4, v0 -; GFX8-NEXT: v_mul_hi_u32 v0, v4, v0 -; GFX8-NEXT: v_mul_lo_u32 v6, v3, v1 -; GFX8-NEXT: v_mul_lo_u32 v7, v4, v1 -; GFX8-NEXT: v_mul_hi_u32 v8, v3, v1 -; GFX8-NEXT: v_mul_hi_u32 v1, v4, v1 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v6 -; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v7, v0 -; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5 -; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v6, v2 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v8 +; GFX8-NEXT: v_cvt_u32_f32_e32 v6, v0 +; GFX8-NEXT: v_cvt_u32_f32_e32 v7, v2 +; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s14, v6, 0 +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s14, v7, v[1:2] +; GFX8-NEXT: v_mul_lo_u32 v1, v7, v0 +; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s15, v6, v[2:3] +; GFX8-NEXT: v_mul_hi_u32 v2, v6, v0 +; GFX8-NEXT: v_mul_hi_u32 v0, v7, v0 +; GFX8-NEXT: v_mul_lo_u32 v3, v6, v4 +; GFX8-NEXT: v_mul_lo_u32 v5, v7, v4 +; GFX8-NEXT: v_mul_hi_u32 v8, v6, v4 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3 +; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v5, v0 ; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v5, vcc, v7, v5 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2 +; GFX8-NEXT: v_cndmask_b32_e64 
v1, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v8 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2 +; GFX8-NEXT: v_mul_hi_u32 v3, v7, v4 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v0 +; GFX8-NEXT: v_addc_u32_e32 v7, vcc, v7, v1, vcc +; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s14, v6, 0 +; GFX8-NEXT: v_mov_b32_e32 v8, s11 +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s14, v7, v[1:2] +; GFX8-NEXT: v_mul_lo_u32 v1, v7, v0 +; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s15, v6, v[2:3] +; GFX8-NEXT: v_mul_hi_u32 v3, v6, v0 +; GFX8-NEXT: v_mul_hi_u32 v0, v7, v0 +; GFX8-NEXT: v_mul_lo_u32 v2, v6, v4 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v0 -; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v4, v1, vcc -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s14, v3, 0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s14, v4, v[1:2] -; GFX8-NEXT: v_mul_hi_u32 v6, v3, v0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s15, v3, v[1:2] -; GFX8-NEXT: v_mul_lo_u32 v2, v4, v0 -; GFX8-NEXT: v_mul_hi_u32 v0, v4, v0 -; GFX8-NEXT: v_mul_lo_u32 v5, v3, v1 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5 -; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v6 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_mul_lo_u32 v6, v4, v1 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2 -; GFX8-NEXT: v_mul_hi_u32 v5, v3, v1 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v6, v0 -; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v5 -; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v5, vcc, v6, v5 -; GFX8-NEXT: v_mul_hi_u32 v1, v4, v1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8-NEXT: v_mul_lo_u32 v3, v7, v4 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1 +; GFX8-NEXT: v_mul_hi_u32 v2, v6, v4 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v3, v0 +; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2 -; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v3, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 +; GFX8-NEXT: v_mul_hi_u32 v3, v7, v4 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v6, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v7, v1, vcc ; GFX8-NEXT: v_mul_lo_u32 v2, s11, v0 ; GFX8-NEXT: v_mul_lo_u32 v3, s10, v1 ; GFX8-NEXT: v_mul_hi_u32 v4, s10, v0 ; GFX8-NEXT: v_mul_hi_u32 v0, s11, v0 -; GFX8-NEXT: v_mul_hi_u32 v5, s11, v1 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v3 ; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v4 @@ -246,36 +246,36 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v3 ; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v4, v3 -; GFX8-NEXT: v_add_u32_e32 v4, vcc, v0, v2 -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v4, 0 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v0, v2 +; GFX8-NEXT: v_mul_hi_u32 v4, s11, v1 +; 
GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v6, 0 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v5, v2 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s8, v3, v[1:2] -; GFX8-NEXT: v_mov_b32_e32 v6, s11 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v4, v2 +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s8, v7, v[1:2] ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s10, v0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s9, v4, v[1:2] -; GFX8-NEXT: v_mov_b32_e32 v5, s9 -; GFX8-NEXT: v_subb_u32_e64 v2, s[0:1], v6, v1, vcc -; GFX8-NEXT: v_sub_u32_e64 v1, s[0:1], s11, v1 +; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s9, v6, v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v1, s9 +; GFX8-NEXT: v_subb_u32_e64 v2, s[0:1], v8, v4, vcc +; GFX8-NEXT: v_sub_u32_e64 v3, s[0:1], s11, v4 ; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v2 -; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1] ; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v0 -; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v2 -; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[0:1] -; GFX8-NEXT: v_subrev_u32_e32 v7, vcc, s8, v0 -; GFX8-NEXT: v_subbrev_u32_e64 v8, s[0:1], 0, v1, vcc -; GFX8-NEXT: v_add_u32_e64 v9, s[0:1], 1, v4 -; GFX8-NEXT: v_addc_u32_e64 v10, s[0:1], 0, v3, s[0:1] +; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v3, v1, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[0:1] +; GFX8-NEXT: v_subrev_u32_e32 v5, vcc, s8, v0 +; GFX8-NEXT: v_subbrev_u32_e64 v8, s[0:1], 0, v3, vcc +; GFX8-NEXT: v_add_u32_e64 v9, s[0:1], 1, v6 +; GFX8-NEXT: v_addc_u32_e64 v10, s[0:1], 0, v7, s[0:1] ; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v8 ; GFX8-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[0:1] -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v7 -; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v5 +; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[0:1] ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v8 -; GFX8-NEXT: v_subrev_u32_e32 v5, vcc, s8, v7 +; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s8, v5 ; GFX8-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[0:1] ; GFX8-NEXT: v_add_u32_e64 v12, s[0:1], 1, v9 ; GFX8-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc @@ -283,20 +283,20 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 ; GFX8-NEXT: v_cndmask_b32_e32 v9, v9, v12, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v10, v10, v13, vcc -; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v6 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc +; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v4, v4, v9, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v10, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e64 v5, v0, v5, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v4, v6, v9, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v6, v7, v10, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v3, v0, v3, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v1, s[0:1] ; GFX8-NEXT: s_xor_b64 s[0:1], s[2:3], s[12:13] ; GFX8-NEXT: v_xor_b32_e32 v0, s0, v4 -; GFX8-NEXT: v_xor_b32_e32 v1, s1, v3 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_xor_b32_e32 v1, s1, v6 +; GFX8-NEXT: v_mov_b32_e32 v4, s1 ; GFX8-NEXT: v_subrev_u32_e32 v0, vcc, s0, v0 -; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc -; 
GFX8-NEXT: v_xor_b32_e32 v3, s2, v5 +; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v1, v4, vcc +; GFX8-NEXT: v_xor_b32_e32 v3, s2, v3 ; GFX8-NEXT: v_xor_b32_e32 v4, s2, v2 ; GFX8-NEXT: v_mov_b32_e32 v5, s2 ; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s2, v3 @@ -312,6 +312,7 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX9-LABEL: sdivrem_i64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx8 s[12:19], s[8:9], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v9, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_ashr_i32 s2, s17, 31 ; GFX9-NEXT: s_ashr_i32 s4, s19, 31 @@ -335,64 +336,63 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX9-NEXT: v_trunc_f32_e32 v2, v1 ; GFX9-NEXT: v_mul_f32_e32 v1, 0xcf800000, v2 ; GFX9-NEXT: v_add_f32_e32 v0, v1, v0 -; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v0 -; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v2 -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s10, v3, 0 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s10, v4, v[1:2] -; GFX9-NEXT: v_mul_hi_u32 v5, v3, v0 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s11, v3, v[1:2] -; GFX9-NEXT: v_mul_lo_u32 v2, v4, v0 -; GFX9-NEXT: v_mul_hi_u32 v0, v4, v0 -; GFX9-NEXT: v_mul_lo_u32 v6, v3, v1 -; GFX9-NEXT: v_mul_lo_u32 v7, v4, v1 -; GFX9-NEXT: v_mul_hi_u32 v8, v3, v1 -; GFX9-NEXT: v_mul_hi_u32 v1, v4, v1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6 -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v7, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_add_u32_e32 v2, v6, v2 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v8 -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 -; GFX9-NEXT: v_add_u32_e32 v5, v7, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_add3_u32 v1, v5, v2, v1 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v1, vcc -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s10, v3, 0 -; GFX9-NEXT: v_mov_b32_e32 v7, s7 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s10, v4, v[1:2] -; GFX9-NEXT: v_mul_hi_u32 v6, v3, v0 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s11, v3, v[1:2] -; GFX9-NEXT: v_mul_lo_u32 v2, v4, v0 -; GFX9-NEXT: v_mul_hi_u32 v0, v4, v0 -; GFX9-NEXT: v_mul_lo_u32 v5, v3, v1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5 +; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v0 +; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v2 +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s10, v6, 0 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s10, v7, v[1:2] +; GFX9-NEXT: v_mul_lo_u32 v1, v7, v0 +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s11, v6, v[2:3] +; GFX9-NEXT: v_mul_hi_u32 v2, v6, v0 +; GFX9-NEXT: v_mul_hi_u32 v0, v7, v0 +; GFX9-NEXT: v_mul_lo_u32 v3, v6, v4 +; GFX9-NEXT: v_mul_lo_u32 v5, v7, v4 +; GFX9-NEXT: v_mul_hi_u32 v8, v6, v4 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v5, v0 ; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-NEXT: v_add_u32_e32 v1, v3, v1 +; GFX9-NEXT: v_mul_hi_u32 v3, v7, v4 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v8 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_mul_lo_u32 v6, v4, v1 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 ; GFX9-NEXT: v_add_u32_e32 v2, v5, v2 -; GFX9-NEXT: v_mul_hi_u32 v5, v3, v1 -; 
GFX9-NEXT: v_mul_hi_u32 v1, v4, v1 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v6, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 -; GFX9-NEXT: v_add_u32_e32 v5, v6, v5 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-NEXT: v_add3_u32 v1, v2, v1, v3 +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v6, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v7, v1, vcc +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s10, v6, 0 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s10, v7, v[1:2] +; GFX9-NEXT: v_mul_lo_u32 v1, v7, v0 +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s11, v6, v[2:3] +; GFX9-NEXT: v_mul_hi_u32 v3, v6, v0 +; GFX9-NEXT: v_mul_hi_u32 v0, v7, v0 +; GFX9-NEXT: v_mul_lo_u32 v2, v6, v4 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v2 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_add3_u32 v1, v5, v2, v1 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-NEXT: v_mul_lo_u32 v3, v7, v4 +; GFX9-NEXT: v_add_u32_e32 v1, v2, v1 +; GFX9-NEXT: v_mul_hi_u32 v2, v6, v4 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v3, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v4, v1, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 +; GFX9-NEXT: v_mul_hi_u32 v3, v7, v4 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-NEXT: v_add3_u32 v1, v2, v1, v3 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v6, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v7, v1, vcc ; GFX9-NEXT: v_mul_lo_u32 v2, s9, v0 ; GFX9-NEXT: v_mul_lo_u32 v3, s8, v1 ; GFX9-NEXT: v_mul_hi_u32 v4, s8, v0 ; GFX9-NEXT: v_mul_hi_u32 v0, s9, v0 -; GFX9-NEXT: v_mul_hi_u32 v6, s9, v1 +; GFX9-NEXT: v_mul_hi_u32 v5, s9, v1 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 @@ -400,67 +400,67 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX9-NEXT: v_mul_lo_u32 v4, s9, v1 ; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 ; GFX9-NEXT: v_mul_hi_u32 v3, s8, v1 +; GFX9-NEXT: v_mov_b32_e32 v7, s7 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v4, v0 ; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v0, v2 -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s6, v5, 0 +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v0, v2 +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s6, v6, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX9-NEXT: v_add_u32_e32 v3, v4, v3 -; GFX9-NEXT: v_add3_u32 v3, v3, v2, v6 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s6, v3, v[1:2] -; GFX9-NEXT: v_mov_b32_e32 v6, s9 +; GFX9-NEXT: v_add3_u32 v8, v3, v2, v5 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s6, v8, v[1:2] +; GFX9-NEXT: v_mov_b32_e32 v1, s9 ; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s8, v0 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s7, v5, v[1:2] -; GFX9-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-NEXT: v_subb_co_u32_e64 v2, s[0:1], v6, v1, vcc -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s7, v2 -; GFX9-NEXT: v_sub_u32_e32 v1, s9, v1 -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1] +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s7, v6, v[2:3] +; GFX9-NEXT: v_subb_co_u32_e64 v1, s[0:1], v1, v4, vcc +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s7, v1 +; 
GFX9-NEXT: v_sub_u32_e32 v2, s9, v4 +; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1] ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s6, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s7, v2 -; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v7, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v8, s[0:1] -; GFX9-NEXT: v_subrev_co_u32_e32 v8, vcc, s6, v0 -; GFX9-NEXT: v_subbrev_co_u32_e64 v9, s[0:1], 0, v1, vcc -; GFX9-NEXT: v_add_co_u32_e64 v10, s[0:1], 1, v5 -; GFX9-NEXT: v_addc_co_u32_e64 v11, s[0:1], 0, v3, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s7, v9 +; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s7, v1 +; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v7, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[0:1] +; GFX9-NEXT: v_subrev_co_u32_e32 v4, vcc, s6, v0 +; GFX9-NEXT: v_subbrev_co_u32_e64 v5, s[0:1], 0, v2, vcc +; GFX9-NEXT: v_add_co_u32_e64 v10, s[0:1], 1, v6 +; GFX9-NEXT: v_addc_co_u32_e64 v11, s[0:1], 0, v8, s[0:1] +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s7, v5 ; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s6, v8 -; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v7, vcc +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s6, v4 +; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v7, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s7, v9 -; GFX9-NEXT: v_subrev_co_u32_e32 v7, vcc, s6, v8 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s7, v5 +; GFX9-NEXT: v_subrev_co_u32_e32 v7, vcc, s6, v4 ; GFX9-NEXT: v_cndmask_b32_e64 v12, v12, v13, s[0:1] ; GFX9-NEXT: v_add_co_u32_e64 v13, s[0:1], 1, v10 -; GFX9-NEXT: v_subbrev_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: v_subbrev_co_u32_e32 v2, vcc, 0, v2, vcc ; GFX9-NEXT: v_addc_co_u32_e64 v14, s[0:1], 0, v11, s[0:1] ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 ; GFX9-NEXT: v_cndmask_b32_e32 v10, v10, v13, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v11, v11, v14, vcc -; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v8, v7, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v10, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v11, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v6, v0, v6, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v1, s[0:1] +; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v3, v6, v10, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v6, v8, v11, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v4, v0, v4, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v2, v1, v2, s[0:1] ; GFX9-NEXT: s_xor_b64 s[0:1], s[2:3], s[4:5] -; GFX9-NEXT: v_xor_b32_e32 v0, s0, v5 -; GFX9-NEXT: v_xor_b32_e32 v1, s1, v3 +; GFX9-NEXT: v_xor_b32_e32 v0, s0, v3 +; GFX9-NEXT: v_xor_b32_e32 v1, s1, v6 ; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s0, v0 ; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc -; GFX9-NEXT: v_xor_b32_e32 v3, s2, v6 -; GFX9-NEXT: v_xor_b32_e32 v5, s2, v2 -; GFX9-NEXT: v_mov_b32_e32 v6, s2 +; GFX9-NEXT: v_xor_b32_e32 v3, s2, v4 +; GFX9-NEXT: v_xor_b32_e32 v4, s2, v2 +; GFX9-NEXT: v_mov_b32_e32 v5, s2 ; GFX9-NEXT: v_subrev_co_u32_e32 v2, vcc, s2, v3 -; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v5, v6, vcc -; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[12:13] -; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[14:15] +; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v4, v5, vcc +; GFX9-NEXT: global_store_dwordx2 v9, v[0:1], s[12:13] +; GFX9-NEXT: 
global_store_dwordx2 v9, v[2:3], s[14:15] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: sdivrem_i64: @@ -1311,68 +1311,68 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8-NEXT: v_trunc_f32_e32 v2, v1 ; GFX8-NEXT: v_mul_f32_e32 v1, 0xcf800000, v2 ; GFX8-NEXT: v_add_f32_e32 v0, v1, v0 -; GFX8-NEXT: v_cvt_u32_f32_e32 v3, v0 -; GFX8-NEXT: v_cvt_u32_f32_e32 v4, v2 -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s16, v3, 0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s16, v4, v[1:2] -; GFX8-NEXT: v_mul_hi_u32 v5, v3, v0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s17, v3, v[1:2] -; GFX8-NEXT: v_mul_lo_u32 v2, v4, v0 -; GFX8-NEXT: v_mul_hi_u32 v0, v4, v0 -; GFX8-NEXT: v_mul_lo_u32 v6, v3, v1 -; GFX8-NEXT: v_mul_lo_u32 v7, v4, v1 -; GFX8-NEXT: v_mul_hi_u32 v8, v3, v1 -; GFX8-NEXT: v_mul_hi_u32 v1, v4, v1 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v6 -; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v7, v0 -; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5 -; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v6, v2 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v8 +; GFX8-NEXT: v_cvt_u32_f32_e32 v6, v0 +; GFX8-NEXT: v_cvt_u32_f32_e32 v7, v2 +; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s16, v6, 0 +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s16, v7, v[1:2] +; GFX8-NEXT: v_mul_lo_u32 v1, v7, v0 +; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s17, v6, v[2:3] +; GFX8-NEXT: v_mul_hi_u32 v2, v6, v0 +; GFX8-NEXT: v_mul_hi_u32 v0, v7, v0 +; GFX8-NEXT: v_mul_lo_u32 v3, v6, v4 +; GFX8-NEXT: v_mul_lo_u32 v5, v7, v4 +; GFX8-NEXT: v_mul_hi_u32 v8, v6, v4 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3 +; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v5, v0 ; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v5, vcc, v7, v5 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v8 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2 -; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v0 -; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v4, v1, vcc -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s16, v3, 0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s16, v4, v[1:2] -; GFX8-NEXT: v_mul_hi_u32 v6, v3, v0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s17, v3, v[1:2] -; GFX8-NEXT: v_mul_lo_u32 v2, v4, v0 -; GFX8-NEXT: v_mul_hi_u32 v0, v4, v0 -; GFX8-NEXT: v_mul_lo_u32 v5, v3, v1 +; GFX8-NEXT: v_mul_hi_u32 v3, v7, v4 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v0 +; GFX8-NEXT: v_addc_u32_e32 v7, vcc, v7, v1, vcc +; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s16, v6, 0 +; GFX8-NEXT: v_mov_b32_e32 v8, s11 +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s16, v7, v[1:2] +; GFX8-NEXT: v_mul_lo_u32 v1, v7, v0 +; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s17, v6, v[2:3] +; GFX8-NEXT: v_mul_hi_u32 v3, v6, v0 +; GFX8-NEXT: v_mul_hi_u32 v0, v7, v0 +; GFX8-NEXT: v_mul_lo_u32 v2, v6, v4 ; GFX8-NEXT: s_xor_b64 s[16:17], s[4:5], s[6:7] ; GFX8-NEXT: s_ashr_i32 s6, s19, 31 ; GFX8-NEXT: s_mov_b32 s7, s6 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5 -; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; 
GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v6 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_mul_lo_u32 v6, v4, v1 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2 -; GFX8-NEXT: v_mul_hi_u32 v5, v3, v1 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v6, v0 -; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v5 -; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v5, vcc, v6, v5 -; GFX8-NEXT: v_mul_hi_u32 v1, v4, v1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8-NEXT: v_mul_lo_u32 v3, v7, v4 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1 +; GFX8-NEXT: v_mul_hi_u32 v2, v6, v4 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v3, v0 +; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2 -; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v3, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 +; GFX8-NEXT: v_mul_hi_u32 v3, v7, v4 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v6, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v7, v1, vcc ; GFX8-NEXT: v_mul_lo_u32 v2, s11, v0 ; GFX8-NEXT: v_mul_lo_u32 v3, s10, v1 ; GFX8-NEXT: v_mul_hi_u32 v4, s10, v0 ; GFX8-NEXT: v_mul_hi_u32 v0, s11, v0 -; GFX8-NEXT: v_mul_hi_u32 v5, s11, v1 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v3 ; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v4 @@ -1385,38 +1385,38 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v3 ; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v4, v3 -; GFX8-NEXT: v_add_u32_e32 v4, vcc, v0, v2 -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v4, 0 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v0, v2 +; GFX8-NEXT: v_mul_hi_u32 v4, s11, v1 +; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v6, 0 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v5, v2 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s8, v3, v[1:2] -; GFX8-NEXT: v_mov_b32_e32 v6, s11 -; GFX8-NEXT: v_sub_u32_e32 v7, vcc, s10, v0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s9, v4, v[1:2] -; GFX8-NEXT: v_mov_b32_e32 v5, s9 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v4, v2 +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s8, v7, v[1:2] +; GFX8-NEXT: v_sub_u32_e32 v9, vcc, s10, v0 +; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s9, v6, v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v1, s9 ; GFX8-NEXT: s_ashr_i32 s10, s3, 31 -; GFX8-NEXT: v_subb_u32_e64 v6, s[0:1], v6, v1, vcc -; GFX8-NEXT: v_sub_u32_e64 v0, s[0:1], s11, v1 -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v6 -; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[0:1] -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v7 -; GFX8-NEXT: v_subb_u32_e32 v0, vcc, v0, v5, vcc +; GFX8-NEXT: v_subb_u32_e64 v8, s[0:1], v8, v4, vcc +; GFX8-NEXT: v_sub_u32_e64 v0, s[0:1], s11, v4 +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v8 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[0:1] -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v6 -; GFX8-NEXT: v_subrev_u32_e32 v8, vcc, s8, v7 -; GFX8-NEXT: v_cndmask_b32_e64 v2, v1, v2, s[0:1] -; GFX8-NEXT: v_subbrev_u32_e64 v9, s[0:1], 0, 
v0, vcc -; GFX8-NEXT: v_add_u32_e64 v1, s[0:1], 1, v4 -; GFX8-NEXT: v_addc_u32_e64 v10, s[0:1], 0, v3, s[0:1] -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v9 +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v9 +; GFX8-NEXT: v_subb_u32_e32 v0, vcc, v0, v1, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1] +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v8 +; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, s8, v9 +; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] +; GFX8-NEXT: v_subbrev_u32_e64 v10, s[0:1], 0, v0, vcc +; GFX8-NEXT: v_add_u32_e64 v3, s[0:1], 1, v6 +; GFX8-NEXT: v_addc_u32_e64 v5, s[0:1], 0, v7, s[0:1] +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v10 ; GFX8-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[0:1] -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v8 +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v4 ; GFX8-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[0:1] -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v9 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v10 ; GFX8-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[0:1] -; GFX8-NEXT: v_add_u32_e64 v12, s[0:1], 1, v1 -; GFX8-NEXT: v_addc_u32_e64 v13, s[0:1], 0, v10, s[0:1] +; GFX8-NEXT: v_add_u32_e64 v12, s[0:1], 1, v3 +; GFX8-NEXT: v_addc_u32_e64 v13, s[0:1], 0, v5, s[0:1] ; GFX8-NEXT: s_add_u32 s0, s18, s6 ; GFX8-NEXT: s_addc_u32 s1, s19, s6 ; GFX8-NEXT: s_add_u32 s2, s2, s10 @@ -1424,15 +1424,15 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8-NEXT: s_addc_u32 s3, s3, s10 ; GFX8-NEXT: s_xor_b64 s[2:3], s[2:3], s[10:11] ; GFX8-NEXT: v_cvt_f32_u32_e32 v14, s3 -; GFX8-NEXT: v_subb_u32_e32 v0, vcc, v0, v5, vcc -; GFX8-NEXT: v_cvt_f32_u32_e32 v5, s2 -; GFX8-NEXT: v_subrev_u32_e32 v15, vcc, s8, v8 +; GFX8-NEXT: v_subb_u32_e32 v0, vcc, v0, v1, vcc +; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s2 +; GFX8-NEXT: v_subrev_u32_e32 v15, vcc, s8, v4 ; GFX8-NEXT: v_subbrev_u32_e32 v16, vcc, 0, v0, vcc ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f800000, v14 -; GFX8-NEXT: v_add_f32_e32 v0, v0, v5 +; GFX8-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v1, v12, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v12, vcc ; GFX8-NEXT: s_xor_b64 s[8:9], s[0:1], s[6:7] ; GFX8-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX8-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 @@ -1441,151 +1441,151 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8-NEXT: v_add_f32_e32 v0, v1, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v12, v0 ; GFX8-NEXT: s_sub_u32 s5, 0, s2 -; GFX8-NEXT: s_subb_u32 s20, 0, s3 -; GFX8-NEXT: v_cndmask_b32_e32 v10, v10, v13, vcc +; GFX8-NEXT: v_cvt_u32_f32_e32 v11, v11 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v5, v13, vcc ; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s5, v12, 0 ; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v2 -; GFX8-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[0:1] -; GFX8-NEXT: v_cvt_u32_f32_e32 v5, v11 -; GFX8-NEXT: v_cndmask_b32_e64 v10, v3, v10, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e32 v3, v8, v15, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v7, v7, v3, s[0:1] -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[18:19], s5, v5, v[1:2] -; GFX8-NEXT: v_mul_lo_u32 v3, v5, v0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[18:19], s20, v12, v[1:2] -; GFX8-NEXT: v_cndmask_b32_e32 v2, v9, v16, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v2, s[0:1] -; GFX8-NEXT: v_mul_lo_u32 v8, v12, v1 +; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v3, s[0:1] +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[18:19], s5, v11, v[1:2] +; GFX8-NEXT: s_subb_u32 s20, 0, s3 +; GFX8-NEXT: v_cndmask_b32_e64 v7, v7, v5, s[0:1] +; GFX8-NEXT: 
v_cndmask_b32_e32 v1, v4, v15, vcc +; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[18:19], s20, v12, v[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v2, v10, v16, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v9, v9, v1, s[0:1] +; GFX8-NEXT: v_mul_lo_u32 v1, v11, v0 +; GFX8-NEXT: v_mul_lo_u32 v3, v12, v4 +; GFX8-NEXT: v_cndmask_b32_e64 v8, v8, v2, s[0:1] ; GFX8-NEXT: v_mul_hi_u32 v2, v12, v0 -; GFX8-NEXT: v_mul_hi_u32 v0, v5, v0 -; GFX8-NEXT: v_xor_b32_e32 v9, s17, v10 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v8 -; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 -; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_mul_lo_u32 v3, v5, v1 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v8, v2 -; GFX8-NEXT: v_mul_hi_u32 v8, v12, v1 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v3, v0 +; GFX8-NEXT: v_mul_hi_u32 v0, v11, v0 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3 ; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v8 -; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v8 -; GFX8-NEXT: v_mul_hi_u32 v1, v5, v1 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 -; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v12, v0 -; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s5, v8, 0 -; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v5, v1, vcc -; GFX8-NEXT: v_xor_b32_e32 v1, s16, v4 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8-NEXT: v_mul_lo_u32 v2, v11, v4 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 +; GFX8-NEXT: v_mul_hi_u32 v3, v12, v4 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v2, v0 +; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v3 +; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v3 +; GFX8-NEXT: v_mul_hi_u32 v3, v11, v4 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 +; GFX8-NEXT: v_add_u32_e32 v10, vcc, v12, v0 +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s5, v10, 0 +; GFX8-NEXT: v_addc_u32_e32 v11, vcc, v11, v1, vcc +; GFX8-NEXT: v_xor_b32_e32 v1, s16, v6 ; GFX8-NEXT: v_mov_b32_e32 v0, v3 -; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s5, v5, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v10, s17 +; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s5, v11, v[0:1] +; GFX8-NEXT: v_xor_b32_e32 v5, s17, v7 +; GFX8-NEXT: v_mov_b32_e32 v6, s17 ; GFX8-NEXT: v_subrev_u32_e32 v0, vcc, s16, v1 -; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s20, v8, v[3:4] -; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v9, v10, vcc -; GFX8-NEXT: v_xor_b32_e32 v4, s4, v7 -; GFX8-NEXT: v_mul_lo_u32 v7, v5, v2 -; GFX8-NEXT: v_mul_lo_u32 v9, v8, v3 -; GFX8-NEXT: v_mul_hi_u32 v11, v8, v2 -; GFX8-NEXT: v_mul_hi_u32 v2, v5, v2 -; GFX8-NEXT: v_xor_b32_e32 v6, s4, v6 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v9 -; GFX8-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v11 +; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v5, v6, vcc +; GFX8-NEXT: v_mad_u64_u32 v[5:6], s[0:1], s20, v10, v[3:4] +; GFX8-NEXT: v_mul_lo_u32 v4, v11, v2 +; GFX8-NEXT: v_xor_b32_e32 v3, s4, v9 +; GFX8-NEXT: v_mul_lo_u32 v7, v10, v5 +; GFX8-NEXT: v_mul_hi_u32 v9, v10, v2 +; GFX8-NEXT: v_mul_hi_u32 v2, v11, v2 +; GFX8-NEXT: v_xor_b32_e32 v6, s4, v8 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v7 ; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX8-NEXT: v_mul_lo_u32 v11, v5, v3 -; GFX8-NEXT: v_add_u32_e32 v7, 
vcc, v9, v7 -; GFX8-NEXT: v_mul_hi_u32 v9, v8, v3 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v11, v2 -; GFX8-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v9 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v9 +; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GFX8-NEXT: v_mul_lo_u32 v9, v11, v5 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v7, v4 +; GFX8-NEXT: v_mul_hi_u32 v7, v10, v5 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v9, v2 ; GFX8-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v11, v9 -; GFX8-NEXT: v_mul_hi_u32 v3, v5, v3 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v7 ; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v9, v7 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v7 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v8, v2 -; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc -; GFX8-NEXT: v_mov_b32_e32 v10, s4 -; GFX8-NEXT: v_mul_lo_u32 v7, s9, v2 -; GFX8-NEXT: v_mul_lo_u32 v8, s8, v3 -; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, s4, v4 -; GFX8-NEXT: v_subb_u32_e32 v5, vcc, v6, v10, vcc -; GFX8-NEXT: v_mul_hi_u32 v6, s8, v2 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v8 +; GFX8-NEXT: v_mul_hi_u32 v5, v11, v5 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v4 +; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v7, v4 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v5, v4 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v10, v2 +; GFX8-NEXT: v_addc_u32_e32 v7, vcc, v11, v4, vcc +; GFX8-NEXT: v_mul_lo_u32 v9, s9, v2 +; GFX8-NEXT: v_mul_lo_u32 v10, s8, v7 +; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, s4, v3 +; GFX8-NEXT: v_mul_hi_u32 v3, s8, v2 +; GFX8-NEXT: v_mov_b32_e32 v8, s4 +; GFX8-NEXT: v_subb_u32_e32 v5, vcc, v6, v8, vcc +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v9, v10 ; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v7, v6 -; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX8-NEXT: v_mul_lo_u32 v7, s9, v3 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v6, v3 +; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX8-NEXT: v_mul_lo_u32 v6, s9, v7 ; GFX8-NEXT: v_mul_hi_u32 v2, s9, v2 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v8, v6 -; GFX8-NEXT: v_mul_hi_u32 v8, s8, v3 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v7, v2 -; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v8, v3 +; GFX8-NEXT: v_mul_hi_u32 v8, s8, v7 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v6, v2 +; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v8 ; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v8 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v2, v6 -; GFX8-NEXT: v_mul_hi_u32 v9, s9, v3 -; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v8, 0 -; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v7, v6 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 -; GFX8-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s2, v9, v[3:4] -; GFX8-NEXT: v_mov_b32_e32 v10, s9 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v8 +; GFX8-NEXT: v_add_u32_e32 v10, vcc, v2, v3 +; GFX8-NEXT: v_mul_hi_u32 v7, s9, v7 +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v10, 0 +; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v8 +; GFX8-NEXT: v_add_u32_e32 v11, vcc, v7, v6 +; GFX8-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s2, v11, v[3:4] +; GFX8-NEXT: v_mov_b32_e32 v12, s9 ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s8, v2 -; GFX8-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s3, v8, v[6:7] +; GFX8-NEXT: v_mad_u64_u32 v[8:9], s[0:1], s3, v10, v[6:7] ; GFX8-NEXT: v_mov_b32_e32 v3, s3 -; GFX8-NEXT: v_subb_u32_e64 
v7, s[0:1], v10, v6, vcc -; GFX8-NEXT: v_sub_u32_e64 v6, s[0:1], s9, v6 -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v7 -; GFX8-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[0:1] +; GFX8-NEXT: v_subb_u32_e64 v6, s[0:1], v12, v8, vcc +; GFX8-NEXT: v_sub_u32_e64 v7, s[0:1], s9, v8 +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v6 +; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[0:1] ; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v2 -; GFX8-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[0:1] -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s3, v7 -; GFX8-NEXT: v_subb_u32_e32 v6, vcc, v6, v3, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v10, v10, v11, s[0:1] -; GFX8-NEXT: v_subrev_u32_e32 v11, vcc, s2, v2 -; GFX8-NEXT: v_subbrev_u32_e64 v12, s[0:1], 0, v6, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[0:1] +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s3, v6 +; GFX8-NEXT: v_subb_u32_e32 v7, vcc, v7, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[0:1] +; GFX8-NEXT: v_subrev_u32_e32 v9, vcc, s2, v2 +; GFX8-NEXT: v_subbrev_u32_e64 v12, s[0:1], 0, v7, vcc ; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v12 ; GFX8-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[0:1] -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v11 +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v9 ; GFX8-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[0:1] ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s3, v12 ; GFX8-NEXT: v_cndmask_b32_e64 v13, v13, v14, s[0:1] -; GFX8-NEXT: v_add_u32_e64 v14, s[0:1], 1, v8 -; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v6, v3, vcc -; GFX8-NEXT: v_addc_u32_e64 v15, s[0:1], 0, v9, s[0:1] -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 1, v14 +; GFX8-NEXT: v_add_u32_e64 v14, s[0:1], 1, v10 +; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v7, v3, vcc +; GFX8-NEXT: v_addc_u32_e64 v15, s[0:1], 0, v11, s[0:1] +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 1, v14 ; GFX8-NEXT: v_addc_u32_e32 v16, vcc, 0, v15, vcc ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 -; GFX8-NEXT: v_subrev_u32_e64 v13, s[0:1], s2, v11 +; GFX8-NEXT: v_subrev_u32_e64 v13, s[0:1], s2, v9 ; GFX8-NEXT: v_subbrev_u32_e64 v3, s[0:1], 0, v3, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e32 v6, v14, v6, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v7, v14, v7, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v14, v15, v16, vcc -; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v10 -; GFX8-NEXT: v_cndmask_b32_e64 v6, v8, v6, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e64 v8, v9, v14, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e32 v9, v11, v13, vcc +; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v8 +; GFX8-NEXT: v_cndmask_b32_e32 v9, v9, v13, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v3, v12, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v7, v10, v7, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v8, v11, v14, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e64 v9, v2, v9, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e64 v7, v7, v3, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v3, s[0:1] ; GFX8-NEXT: s_xor_b64 s[0:1], s[6:7], s[10:11] -; GFX8-NEXT: v_xor_b32_e32 v2, s0, v6 +; GFX8-NEXT: v_xor_b32_e32 v2, s0, v7 ; GFX8-NEXT: v_xor_b32_e32 v3, s1, v8 -; GFX8-NEXT: v_mov_b32_e32 v6, s1 +; GFX8-NEXT: v_mov_b32_e32 v7, s1 ; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s0, v2 -; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v3, v6, vcc -; GFX8-NEXT: v_xor_b32_e32 v6, s6, v9 -; GFX8-NEXT: v_xor_b32_e32 v7, s6, v7 -; GFX8-NEXT: v_mov_b32_e32 v8, s6 -; GFX8-NEXT: v_subrev_u32_e32 v6, vcc, s6, v6 -; GFX8-NEXT: v_subb_u32_e32 v7, vcc, v7, v8, vcc +; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v3, v7, vcc +; GFX8-NEXT: v_xor_b32_e32 v7, s6, v9 +; GFX8-NEXT: v_xor_b32_e32 v8, s6, v6 +; GFX8-NEXT: v_mov_b32_e32 v9, s6 +; GFX8-NEXT: v_subrev_u32_e32 v6, vcc, s6, v7 +; GFX8-NEXT: v_subb_u32_e32 v7, vcc, v8, v9, vcc ; 
GFX8-NEXT: v_mov_b32_e32 v8, s12 ; GFX8-NEXT: v_mov_b32_e32 v9, s13 ; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[0:3] @@ -1622,66 +1622,67 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_trunc_f32_e32 v2, v1 ; GFX9-NEXT: v_mul_f32_e32 v1, 0xcf800000, v2 ; GFX9-NEXT: v_add_f32_e32 v0, v1, v0 -; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v0 -; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v2 -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s16, v3, 0 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s16, v4, v[1:2] -; GFX9-NEXT: v_mul_hi_u32 v5, v3, v0 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s17, v3, v[1:2] -; GFX9-NEXT: v_mul_lo_u32 v2, v4, v0 -; GFX9-NEXT: v_mul_hi_u32 v0, v4, v0 -; GFX9-NEXT: v_mul_lo_u32 v6, v3, v1 -; GFX9-NEXT: v_mul_lo_u32 v7, v4, v1 -; GFX9-NEXT: v_mul_hi_u32 v8, v3, v1 -; GFX9-NEXT: v_mul_hi_u32 v1, v4, v1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6 -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v7, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_add_u32_e32 v2, v6, v2 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v8 +; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v0 +; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v2 +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s16, v6, 0 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s16, v7, v[1:2] +; GFX9-NEXT: v_mul_lo_u32 v1, v7, v0 +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s17, v6, v[2:3] +; GFX9-NEXT: v_mul_hi_u32 v2, v6, v0 +; GFX9-NEXT: v_mul_hi_u32 v0, v7, v0 +; GFX9-NEXT: v_mul_lo_u32 v3, v6, v4 +; GFX9-NEXT: v_mul_lo_u32 v5, v7, v4 +; GFX9-NEXT: v_mul_hi_u32 v8, v6, v4 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v5, v0 ; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 -; GFX9-NEXT: v_add_u32_e32 v5, v7, v5 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-NEXT: v_add_u32_e32 v1, v3, v1 +; GFX9-NEXT: v_mul_hi_u32 v3, v7, v4 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v8 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_add3_u32 v1, v5, v2, v1 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v1, vcc -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s16, v3, 0 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s16, v4, v[1:2] -; GFX9-NEXT: v_mul_hi_u32 v6, v3, v0 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s17, v3, v[1:2] -; GFX9-NEXT: v_mul_lo_u32 v2, v4, v0 -; GFX9-NEXT: v_mul_hi_u32 v0, v4, v0 -; GFX9-NEXT: v_mul_lo_u32 v5, v3, v1 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 +; GFX9-NEXT: v_add_u32_e32 v2, v5, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-NEXT: v_add3_u32 v1, v2, v1, v3 +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v6, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v7, v1, vcc +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s16, v6, 0 +; GFX9-NEXT: v_mov_b32_e32 v8, s11 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s16, v7, v[1:2] +; GFX9-NEXT: v_mul_lo_u32 v1, v7, v0 +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s17, v6, v[2:3] +; GFX9-NEXT: v_mul_hi_u32 v3, v6, v0 +; GFX9-NEXT: v_mul_hi_u32 v0, v7, v0 +; GFX9-NEXT: v_mul_lo_u32 v2, v6, v4 ; GFX9-NEXT: s_xor_b64 s[16:17], s[4:5], s[6:7] ; GFX9-NEXT: s_ashr_i32 s6, s19, 31 ; GFX9-NEXT: s_mov_b32 s7, s6 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; 
GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v2 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_mul_lo_u32 v6, v4, v1 -; GFX9-NEXT: v_add_u32_e32 v2, v5, v2 -; GFX9-NEXT: v_mul_hi_u32 v5, v3, v1 -; GFX9-NEXT: v_mul_hi_u32 v1, v4, v1 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v6, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-NEXT: v_mul_lo_u32 v3, v7, v4 +; GFX9-NEXT: v_add_u32_e32 v1, v2, v1 +; GFX9-NEXT: v_mul_hi_u32 v2, v6, v4 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v3, v0 +; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 -; GFX9-NEXT: v_add_u32_e32 v5, v6, v5 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_add3_u32 v1, v5, v2, v1 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v3, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v4, v1, vcc +; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 +; GFX9-NEXT: v_mul_hi_u32 v3, v7, v4 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-NEXT: v_add3_u32 v1, v2, v1, v3 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v6, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v7, v1, vcc ; GFX9-NEXT: v_mul_lo_u32 v2, s11, v0 ; GFX9-NEXT: v_mul_lo_u32 v3, s10, v1 ; GFX9-NEXT: v_mul_hi_u32 v4, s10, v0 ; GFX9-NEXT: v_mul_hi_u32 v0, s11, v0 -; GFX9-NEXT: v_mul_hi_u32 v6, s11, v1 +; GFX9-NEXT: v_mul_hi_u32 v5, s11, v1 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 @@ -1693,51 +1694,50 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v0, v2 -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v5, 0 +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v0, v2 +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v6, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX9-NEXT: v_add_u32_e32 v3, v4, v3 -; GFX9-NEXT: v_add3_u32 v3, v3, v2, v6 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s8, v3, v[1:2] -; GFX9-NEXT: v_mov_b32_e32 v6, s11 -; GFX9-NEXT: v_sub_co_u32_e32 v7, vcc, s10, v0 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s9, v5, v[1:2] -; GFX9-NEXT: v_mov_b32_e32 v4, s9 +; GFX9-NEXT: v_add3_u32 v7, v3, v2, v5 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s8, v7, v[1:2] +; GFX9-NEXT: v_sub_co_u32_e32 v9, vcc, s10, v0 +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s9, v6, v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v1, s9 ; GFX9-NEXT: s_ashr_i32 s10, s3, 31 -; GFX9-NEXT: v_subb_co_u32_e64 v6, s[0:1], v6, v1, vcc -; GFX9-NEXT: v_sub_u32_e32 v0, s11, v1 -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v6 -; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v7 -; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v4, vcc +; GFX9-NEXT: v_subb_co_u32_e64 v8, s[0:1], v8, v4, vcc +; GFX9-NEXT: v_sub_u32_e32 v0, s11, v4 +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v8 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v6 -; GFX9-NEXT: v_subrev_co_u32_e32 v9, vcc, s8, v7 -; GFX9-NEXT: v_cndmask_b32_e64 v8, v1, v2, s[0:1] -; GFX9-NEXT: v_subbrev_co_u32_e64 v10, s[0:1], 0, v0, vcc -; GFX9-NEXT: v_add_co_u32_e64 v2, s[0:1], 1, v5 -; GFX9-NEXT: 
v_addc_co_u32_e64 v11, s[0:1], 0, v3, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v10 -; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[0:1] ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v9 +; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v1, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v8 +; GFX9-NEXT: v_subrev_co_u32_e32 v10, vcc, s8, v9 +; GFX9-NEXT: v_cndmask_b32_e64 v4, v2, v3, s[0:1] +; GFX9-NEXT: v_subbrev_co_u32_e64 v11, s[0:1], 0, v0, vcc +; GFX9-NEXT: v_add_co_u32_e64 v2, s[0:1], 1, v6 +; GFX9-NEXT: v_addc_co_u32_e64 v3, s[0:1], 0, v7, s[0:1] +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v11 +; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v10 ; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v10 -; GFX9-NEXT: v_cndmask_b32_e64 v12, v1, v12, s[0:1] -; GFX9-NEXT: v_add_co_u32_e64 v13, s[0:1], 1, v2 -; GFX9-NEXT: v_addc_co_u32_e64 v14, s[0:1], 0, v11, s[0:1] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v11 +; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v12, s[0:1] +; GFX9-NEXT: v_add_co_u32_e64 v12, s[0:1], 1, v2 +; GFX9-NEXT: v_addc_co_u32_e64 v13, s[0:1], 0, v3, s[0:1] ; GFX9-NEXT: s_add_u32 s0, s18, s6 ; GFX9-NEXT: s_addc_u32 s1, s19, s6 ; GFX9-NEXT: s_add_u32 s2, s2, s10 ; GFX9-NEXT: s_mov_b32 s11, s10 ; GFX9-NEXT: s_addc_u32 s3, s3, s10 ; GFX9-NEXT: s_xor_b64 s[2:3], s[2:3], s[10:11] -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s3 +; GFX9-NEXT: v_cvt_f32_u32_e32 v14, s3 ; GFX9-NEXT: v_cvt_f32_u32_e32 v15, s2 -; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v4, vcc -; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f800000, v1 +; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v1, vcc +; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f800000, v14 ; GFX9-NEXT: v_add_f32_e32 v1, v1, v15 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GFX9-NEXT: v_subrev_co_u32_e32 v4, vcc, s8, v9 +; GFX9-NEXT: v_subrev_co_u32_e32 v14, vcc, s8, v10 ; GFX9-NEXT: v_subbrev_co_u32_e32 v15, vcc, 0, v0, vcc ; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v1 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 @@ -1747,31 +1747,31 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v17, v0 ; GFX9-NEXT: s_xor_b64 s[8:9], s[0:1], s[6:7] ; GFX9-NEXT: s_sub_u32 s5, 0, s2 -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 ; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s5, v17, 0 -; GFX9-NEXT: v_cndmask_b32_e32 v12, v2, v13, vcc -; GFX9-NEXT: v_cvt_u32_f32_e32 v13, v16 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v2, v12, vcc +; GFX9-NEXT: v_cvt_u32_f32_e32 v12, v16 +; GFX9-NEXT: v_cndmask_b32_e32 v13, v3, v13, vcc ; GFX9-NEXT: s_subb_u32 s20, 0, s3 -; GFX9-NEXT: v_cndmask_b32_e32 v11, v11, v14, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v4, v9, v4, vcc -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s5, v13, v[1:2] -; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v8 -; GFX9-NEXT: v_cndmask_b32_e64 v8, v3, v11, s[0:1] -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[18:19], s20, v17, v[1:2] -; GFX9-NEXT: v_mul_lo_u32 v2, v13, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v9, v10, v15, vcc -; GFX9-NEXT: v_mul_lo_u32 v3, v17, v1 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s5, v12, v[1:2] +; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4 +; GFX9-NEXT: v_cndmask_b32_e64 v1, v6, v5, s[0:1] +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[18:19], s20, v17, v[2:3] +; GFX9-NEXT: v_mul_lo_u32 v2, v12, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v10, v14, vcc +; GFX9-NEXT: v_mul_lo_u32 v3, v17, v4 ; GFX9-NEXT: v_mul_hi_u32 v10, v17, v0 -; 
GFX9-NEXT: v_mul_hi_u32 v0, v13, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v12, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v5, v7, v13, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e32 v7, v11, v15, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v10 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_mul_lo_u32 v10, v13, v1 +; GFX9-NEXT: v_mul_lo_u32 v10, v12, v4 +; GFX9-NEXT: v_mul_hi_u32 v0, v12, v0 ; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 -; GFX9-NEXT: v_mul_hi_u32 v3, v17, v1 -; GFX9-NEXT: v_mul_hi_u32 v1, v13, v1 +; GFX9-NEXT: v_mul_hi_u32 v3, v17, v4 +; GFX9-NEXT: v_mul_hi_u32 v4, v12, v4 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v10, v0 ; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 @@ -1779,119 +1779,119 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 ; GFX9-NEXT: v_add_u32_e32 v3, v10, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX9-NEXT: v_add3_u32 v2, v3, v2, v4 ; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v17, v0 -; GFX9-NEXT: v_add3_u32 v1, v3, v2, v1 +; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, v12, v2, vcc ; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[18:19], s5, v10, 0 -; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, v13, v1, vcc +; GFX9-NEXT: v_xor_b32_e32 v1, s16, v1 +; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v6, s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v0, v3 -; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v4, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[0:1] -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s5, v11, v[0:1] -; GFX9-NEXT: v_xor_b32_e32 v5, s16, v5 -; GFX9-NEXT: v_xor_b32_e32 v8, s17, v8 -; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s20, v10, v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v9, s17 -; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s16, v5 -; GFX9-NEXT: v_xor_b32_e32 v4, s4, v7 -; GFX9-NEXT: v_mul_lo_u32 v5, v11, v2 -; GFX9-NEXT: v_mul_lo_u32 v7, v10, v3 -; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v8, v9, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v7, v8, v7, s[0:1] +; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s5, v11, v[0:1] +; GFX9-NEXT: v_xor_b32_e32 v8, s17, v5 +; GFX9-NEXT: v_mov_b32_e32 v12, s17 +; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[0:1], s20, v10, v[3:4] +; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s16, v1 +; GFX9-NEXT: v_mul_lo_u32 v3, v11, v2 +; GFX9-NEXT: v_mul_lo_u32 v6, v10, v5 +; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v8, v12, vcc ; GFX9-NEXT: v_mul_hi_u32 v8, v10, v2 -; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v5, v7 -; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v5, v8 -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX9-NEXT: v_mul_lo_u32 v8, v11, v3 +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v6 +; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v8 +; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX9-NEXT: v_mul_lo_u32 v8, v11, v5 ; GFX9-NEXT: v_mul_hi_u32 v2, v11, v2 -; GFX9-NEXT: v_add_u32_e32 v5, v7, v5 -; GFX9-NEXT: v_mul_hi_u32 v7, v10, v3 -; GFX9-NEXT: v_mul_hi_u32 v3, v11, v3 +; GFX9-NEXT: v_add_u32_e32 v3, v6, v3 +; GFX9-NEXT: v_mul_hi_u32 v6, v10, v5 +; GFX9-NEXT: v_mul_hi_u32 v5, v11, v5 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v8, v2 ; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v7 -; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5 -; GFX9-NEXT: v_add_u32_e32 v7, v8, v7 -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; 
GFX9-NEXT: v_add3_u32 v3, v7, v5, v3 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6 +; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3 +; GFX9-NEXT: v_add_u32_e32 v6, v8, v6 +; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX9-NEXT: v_add3_u32 v3, v6, v3, v5 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v10, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v11, v3, vcc ; GFX9-NEXT: v_mul_lo_u32 v5, s9, v2 -; GFX9-NEXT: v_mul_lo_u32 v7, s8, v3 +; GFX9-NEXT: v_mul_lo_u32 v6, s8, v3 +; GFX9-NEXT: v_xor_b32_e32 v4, s4, v9 ; GFX9-NEXT: v_mul_hi_u32 v9, s8, v2 ; GFX9-NEXT: v_mul_hi_u32 v2, s9, v2 -; GFX9-NEXT: v_mul_hi_u32 v12, s9, v3 -; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v5, v7 -; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v5, v6 +; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v5, v9 ; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GFX9-NEXT: v_mul_lo_u32 v9, s9, v3 -; GFX9-NEXT: v_add_u32_e32 v5, v7, v5 -; GFX9-NEXT: v_mul_hi_u32 v7, s8, v3 -; GFX9-NEXT: v_xor_b32_e32 v6, s4, v6 +; GFX9-NEXT: v_add_u32_e32 v5, v6, v5 +; GFX9-NEXT: v_mul_hi_u32 v6, s8, v3 +; GFX9-NEXT: v_mul_hi_u32 v12, s9, v3 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v9, v2 ; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v7 -; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6 +; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v2, v5 ; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v10, 0 -; GFX9-NEXT: v_mov_b32_e32 v8, s4 ; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; GFX9-NEXT: v_add_u32_e32 v6, v9, v6 +; GFX9-NEXT: v_xor_b32_e32 v7, s4, v7 +; GFX9-NEXT: v_mov_b32_e32 v8, s4 ; GFX9-NEXT: v_subrev_co_u32_e32 v4, vcc, s4, v4 -; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v6, v8, vcc -; GFX9-NEXT: v_add_u32_e32 v6, v9, v7 -; GFX9-NEXT: v_add3_u32 v8, v6, v11, v12 -; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s2, v8, v[3:4] -; GFX9-NEXT: v_mov_b32_e32 v9, s9 +; GFX9-NEXT: v_add3_u32 v11, v6, v11, v12 +; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v7, v8, vcc +; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s2, v11, v[3:4] +; GFX9-NEXT: v_mov_b32_e32 v12, s9 ; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, s8, v2 -; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s3, v10, v[6:7] +; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[0:1], s3, v10, v[6:7] ; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: v_subb_co_u32_e64 v7, s[0:1], v9, v6, vcc -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v7 -; GFX9-NEXT: v_sub_u32_e32 v6, s9, v6 -; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[0:1] +; GFX9-NEXT: v_subb_co_u32_e64 v6, s[0:1], v12, v8, vcc +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v6 +; GFX9-NEXT: v_sub_u32_e32 v7, s9, v8 +; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[0:1] ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v2 -; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s3, v7 -; GFX9-NEXT: v_subb_co_u32_e32 v6, vcc, v6, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v11, s[0:1] -; GFX9-NEXT: v_subrev_co_u32_e32 v11, vcc, s2, v2 -; GFX9-NEXT: v_subbrev_co_u32_e64 v12, s[0:1], 0, v6, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[0:1] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s3, v6 +; GFX9-NEXT: v_subb_co_u32_e32 v7, vcc, v7, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[0:1] +; GFX9-NEXT: v_subrev_co_u32_e32 v9, vcc, s2, v2 +; GFX9-NEXT: v_subbrev_co_u32_e64 v12, s[0:1], 0, v7, vcc ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v12 ; 
GFX9-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v11 +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v9 ; GFX9-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[0:1] ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s3, v12 ; GFX9-NEXT: v_cndmask_b32_e64 v13, v13, v14, s[0:1] ; GFX9-NEXT: v_add_co_u32_e64 v14, s[0:1], 1, v10 -; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v6, v3, vcc -; GFX9-NEXT: v_addc_co_u32_e64 v15, s[0:1], 0, v8, s[0:1] -; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 1, v14 +; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v7, v3, vcc +; GFX9-NEXT: v_addc_co_u32_e64 v15, s[0:1], 0, v11, s[0:1] +; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, 1, v14 ; GFX9-NEXT: v_addc_co_u32_e32 v16, vcc, 0, v15, vcc ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v14, v6, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v7, v14, v7, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v14, v15, v16, vcc -; GFX9-NEXT: v_subrev_co_u32_e64 v15, s[0:1], s2, v11 +; GFX9-NEXT: v_subrev_co_u32_e64 v15, s[0:1], s2, v9 ; GFX9-NEXT: v_subbrev_co_u32_e64 v3, s[0:1], 0, v3, s[0:1] -; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v9 -; GFX9-NEXT: v_cndmask_b32_e32 v9, v11, v15, vcc +; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v8 +; GFX9-NEXT: v_cndmask_b32_e32 v9, v9, v15, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v3, v12, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v6, v10, v6, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v8, v8, v14, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v7, v10, v7, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v8, v11, v14, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v9, v2, v9, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v3, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v3, s[0:1] ; GFX9-NEXT: s_xor_b64 s[0:1], s[6:7], s[10:11] -; GFX9-NEXT: v_xor_b32_e32 v2, s0, v6 +; GFX9-NEXT: v_xor_b32_e32 v2, s0, v7 ; GFX9-NEXT: v_xor_b32_e32 v3, s1, v8 -; GFX9-NEXT: v_mov_b32_e32 v6, s1 +; GFX9-NEXT: v_mov_b32_e32 v7, s1 ; GFX9-NEXT: v_subrev_co_u32_e32 v2, vcc, s0, v2 -; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v6, vcc -; GFX9-NEXT: v_xor_b32_e32 v6, s6, v9 +; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v7, vcc +; GFX9-NEXT: v_xor_b32_e32 v7, s6, v9 ; GFX9-NEXT: v_mov_b32_e32 v13, 0 -; GFX9-NEXT: v_xor_b32_e32 v7, s6, v7 -; GFX9-NEXT: v_mov_b32_e32 v8, s6 -; GFX9-NEXT: v_subrev_co_u32_e32 v6, vcc, s6, v6 -; GFX9-NEXT: v_subb_co_u32_e32 v7, vcc, v7, v8, vcc +; GFX9-NEXT: v_xor_b32_e32 v8, s6, v6 +; GFX9-NEXT: v_mov_b32_e32 v9, s6 +; GFX9-NEXT: v_subrev_co_u32_e32 v6, vcc, s6, v7 +; GFX9-NEXT: v_subb_co_u32_e32 v7, vcc, v8, v9, vcc ; GFX9-NEXT: global_store_dwordx4 v13, v[0:3], s[12:13] ; GFX9-NEXT: global_store_dwordx4 v13, v[4:7], s[14:15] ; GFX9-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll index 40b5db0a15447..6f42239cd191d 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll @@ -31,128 +31,128 @@ define i64 @v_srem_i64(i64 %num, i64 %den) { ; CHECK-NEXT: v_xor_b32_e32 v1, v2, v1 ; CHECK-NEXT: v_cvt_f32_u32_e32 v2, v0 ; CHECK-NEXT: v_cvt_f32_u32_e32 v3, v1 -; CHECK-NEXT: v_sub_i32_e32 v9, vcc, 0, v0 -; CHECK-NEXT: v_subb_u32_e32 v10, vcc, 0, v1, vcc +; CHECK-NEXT: v_sub_i32_e32 v11, vcc, 0, v0 +; CHECK-NEXT: v_subb_u32_e32 v12, vcc, 0, v1, vcc ; CHECK-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3 ; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; CHECK-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 ; CHECK-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 ; CHECK-NEXT: v_trunc_f32_e32 v6, v3 ; CHECK-NEXT: v_mac_f32_e32 v2, 
0xcf800000, v6 -; CHECK-NEXT: v_cvt_u32_f32_e32 v8, v2 -; CHECK-NEXT: v_cvt_u32_f32_e32 v11, v6 -; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v8, 0 -; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v11, v[3:4] -; CHECK-NEXT: v_mul_lo_u32 v3, v11, v2 -; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v8, v[6:7] -; CHECK-NEXT: v_mul_hi_u32 v7, v8, v2 -; CHECK-NEXT: v_mul_hi_u32 v2, v11, v2 -; CHECK-NEXT: v_mul_lo_u32 v12, v8, v6 -; CHECK-NEXT: v_mul_lo_u32 v13, v11, v6 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v12 -; CHECK-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; CHECK-NEXT: v_cvt_u32_f32_e32 v10, v2 +; CHECK-NEXT: v_cvt_u32_f32_e32 v13, v6 +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v11, v10, 0 +; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v11, v13, v[3:4] +; CHECK-NEXT: v_mul_lo_u32 v3, v13, v2 +; CHECK-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v10, v[6:7] +; CHECK-NEXT: v_mul_hi_u32 v6, v10, v2 +; CHECK-NEXT: v_mul_hi_u32 v2, v13, v2 +; CHECK-NEXT: v_mul_lo_u32 v7, v10, v8 +; CHECK-NEXT: v_mul_lo_u32 v9, v13, v8 ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v7 -; CHECK-NEXT: v_mul_hi_u32 v7, v8, v6 +; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v6 +; CHECK-NEXT: v_mul_hi_u32 v6, v10, v8 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v12, v3 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v13, v2 -; CHECK-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v7 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v7, v3 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v9, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v7, vcc, v12, v7 -; CHECK-NEXT: v_mul_hi_u32 v6, v11, v6 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v6 +; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; CHECK-NEXT: v_mul_hi_u32 v7, v13, v8 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v7, v3 ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v6, v3 -; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v2 -; CHECK-NEXT: v_addc_u32_e32 v11, vcc, v11, v3, vcc -; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v8, 0 -; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v11, v[3:4] -; CHECK-NEXT: v_ashrrev_i32_e32 v9, 31, v5 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v9 -; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v8, v[6:7] -; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v5, v9, vcc -; CHECK-NEXT: v_xor_b32_e32 v5, v3, v9 -; CHECK-NEXT: v_mul_lo_u32 v3, v11, v2 -; CHECK-NEXT: v_mul_lo_u32 v7, v8, v6 -; CHECK-NEXT: v_xor_b32_e32 v10, v4, v9 -; CHECK-NEXT: v_mul_hi_u32 v4, v8, v2 -; CHECK-NEXT: v_mul_hi_u32 v2, v11, v2 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v7 -; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v7, v3 +; CHECK-NEXT: v_add_i32_e32 v10, vcc, v10, v2 +; CHECK-NEXT: v_addc_u32_e32 v13, vcc, v13, v3, vcc +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v11, v10, 0 +; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v11, v13, v[3:4] +; CHECK-NEXT: v_ashrrev_i32_e32 v11, 31, v5 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v11 +; CHECK-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v10, v[6:7] +; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v5, v11, vcc +; CHECK-NEXT: v_xor_b32_e32 v9, v3, v11 +; CHECK-NEXT: v_mul_lo_u32 v3, v13, v2 +; CHECK-NEXT: v_mul_lo_u32 v5, v10, v8 +; CHECK-NEXT: v_xor_b32_e32 v12, v4, v11 +; CHECK-NEXT: v_mul_hi_u32 v4, v10, v2 +; CHECK-NEXT: v_mul_hi_u32 v2, v13, v2 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, 
v5 +; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v4, v11, v6 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v7, v3 -; CHECK-NEXT: v_mul_hi_u32 v7, v8, v6 +; CHECK-NEXT: v_mul_lo_u32 v4, v13, v8 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v5, v3 +; CHECK-NEXT: v_mul_hi_u32 v5, v10, v8 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v4, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v7 -; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v7 -; CHECK-NEXT: v_mul_hi_u32 v6, v11, v6 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v5 +; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v5 +; CHECK-NEXT: v_mul_hi_u32 v5, v13, v8 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v6, v3 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v8, v2 -; CHECK-NEXT: v_addc_u32_e32 v3, vcc, v11, v3, vcc -; CHECK-NEXT: v_mul_lo_u32 v4, v10, v2 -; CHECK-NEXT: v_mul_lo_u32 v6, v5, v3 -; CHECK-NEXT: v_mul_hi_u32 v7, v5, v2 -; CHECK-NEXT: v_mul_hi_u32 v2, v10, v2 -; CHECK-NEXT: v_mul_hi_u32 v8, v10, v3 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v5, v3 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v10, v2 +; CHECK-NEXT: v_addc_u32_e32 v3, vcc, v13, v3, vcc +; CHECK-NEXT: v_mul_lo_u32 v4, v12, v2 +; CHECK-NEXT: v_mul_lo_u32 v5, v9, v3 +; CHECK-NEXT: v_mul_hi_u32 v6, v9, v2 +; CHECK-NEXT: v_mul_hi_u32 v2, v12, v2 +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v5 +; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v6 -; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v7 ; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v7, v10, v3 -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v6, v4 -; CHECK-NEXT: v_mul_hi_u32 v6, v5, v3 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v7, v2 -; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v6 +; CHECK-NEXT: v_mul_lo_u32 v6, v12, v3 +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4 +; CHECK-NEXT: v_mul_hi_u32 v5, v9, v3 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v6, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; CHECK-NEXT: v_add_i32_e32 v7, vcc, v2, v4 -; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v0, v7, 0 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v5 +; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 +; CHECK-NEXT: v_add_i32_e32 v8, vcc, v2, v4 +; CHECK-NEXT: v_mul_hi_u32 v6, v12, v3 +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v0, v8, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v6, v4 -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v8, v4 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v0, v4, v[3:4] -; CHECK-NEXT: v_sub_i32_e32 v2, vcc, v5, v2 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v1, v7, v[3:4] -; CHECK-NEXT: v_subb_u32_e64 v4, s[4:5], v10, v3, vcc -; CHECK-NEXT: v_sub_i32_e64 v3, s[4:5], v10, v3 -; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v1 +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v4 +; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v0, v6, v[3:4] +; CHECK-NEXT: v_sub_i32_e32 v2, vcc, v9, v2 +; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v1, v8, v[4:5] +; CHECK-NEXT: v_subb_u32_e64 v3, s[4:5], v12, v6, vcc +; 
CHECK-NEXT: v_sub_i32_e64 v4, s[4:5], v12, v6 +; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v1 ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5] ; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v0 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] -; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v1 -; CHECK-NEXT: v_subb_u32_e32 v3, vcc, v3, v1, vcc +; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v1 +; CHECK-NEXT: v_subb_u32_e32 v4, vcc, v4, v1, vcc ; CHECK-NEXT: v_cndmask_b32_e64 v5, v5, v6, s[4:5] ; CHECK-NEXT: v_sub_i32_e32 v6, vcc, v2, v0 -; CHECK-NEXT: v_subbrev_u32_e64 v7, s[4:5], 0, v3, vcc +; CHECK-NEXT: v_subbrev_u32_e64 v7, s[4:5], 0, v4, vcc ; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v1 ; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] ; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v0 -; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] +; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] ; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v1 -; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc +; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v4, v1, vcc ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v6, v0 -; CHECK-NEXT: v_cndmask_b32_e64 v8, v8, v10, s[4:5] +; CHECK-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[4:5] ; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 ; CHECK-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc ; CHECK-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 ; CHECK-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; CHECK-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc -; CHECK-NEXT: v_xor_b32_e32 v0, v0, v9 -; CHECK-NEXT: v_xor_b32_e32 v1, v1, v9 -; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v9 -; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; CHECK-NEXT: v_xor_b32_e32 v0, v0, v11 +; CHECK-NEXT: v_xor_b32_e32 v1, v1, v11 +; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v11 +; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v11, vcc ; CHECK-NEXT: ; implicit-def: $vgpr2 ; CHECK-NEXT: ; implicit-def: $vgpr4 ; CHECK-NEXT: s_andn2_saveexec_b64 s[4:5], s[6:7] @@ -214,65 +214,65 @@ define amdgpu_ps i64 @s_srem_i64(i64 inreg %num, i64 inreg %den) { ; CHECK-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; CHECK-NEXT: v_trunc_f32_e32 v2, v1 ; CHECK-NEXT: v_mac_f32_e32 v0, 0xcf800000, v2 -; CHECK-NEXT: v_cvt_u32_f32_e32 v3, v0 -; CHECK-NEXT: v_cvt_u32_f32_e32 v4, v2 -; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s3, v3, 0 -; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v4, v[1:2] -; CHECK-NEXT: v_mul_hi_u32 v5, v3, v0 -; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s5, v3, v[1:2] -; CHECK-NEXT: v_mul_lo_u32 v2, v4, v0 -; CHECK-NEXT: v_mul_hi_u32 v0, v4, v0 -; CHECK-NEXT: v_mul_lo_u32 v6, v3, v1 -; CHECK-NEXT: v_mul_lo_u32 v7, v4, v1 -; CHECK-NEXT: v_mul_hi_u32 v8, v3, v1 -; CHECK-NEXT: v_mul_hi_u32 v1, v4, v1 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v6 -; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v5 +; CHECK-NEXT: v_cvt_u32_f32_e32 v6, v0 +; CHECK-NEXT: v_cvt_u32_f32_e32 v7, v2 +; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s3, v6, 0 +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s3, v7, v[1:2] +; CHECK-NEXT: v_mul_lo_u32 v1, v7, v0 +; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s5, v6, v[2:3] +; CHECK-NEXT: v_mul_hi_u32 v2, v6, v0 +; CHECK-NEXT: v_mul_hi_u32 v0, v7, v0 +; CHECK-NEXT: v_mul_lo_u32 v3, v6, v4 +; CHECK-NEXT: v_mul_lo_u32 v5, v7, v4 +; CHECK-NEXT: v_mul_hi_u32 v8, v6, v4 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2 
+; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v3, v1 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v5, v0 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v6, v2 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v7, v0 -; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v8 -; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v5, v2 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; CHECK-NEXT: v_mul_hi_u32 v3, v7, v4 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v2, v1 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v3, v1 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v0 +; CHECK-NEXT: v_addc_u32_e32 v7, vcc, v7, v1, vcc +; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s3, v6, 0 +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s3, v7, v[1:2] +; CHECK-NEXT: v_mul_lo_u32 v1, v7, v0 +; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s5, v6, v[2:3] +; CHECK-NEXT: v_mul_hi_u32 v3, v6, v0 +; CHECK-NEXT: v_mul_hi_u32 v0, v7, v0 +; CHECK-NEXT: v_mul_lo_u32 v2, v6, v4 ; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v0 -; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v4, v1, vcc -; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s3, v3, 0 -; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v4, v[1:2] -; CHECK-NEXT: v_mul_hi_u32 v6, v3, v0 -; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s5, v3, v[1:2] -; CHECK-NEXT: v_mul_lo_u32 v2, v4, v0 -; CHECK-NEXT: v_mul_hi_u32 v0, v4, v0 -; CHECK-NEXT: v_mul_lo_u32 v5, v3, v1 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v5 -; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v6, v4, v1 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; CHECK-NEXT: v_mul_hi_u32 v5, v3, v1 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v6, v0 -; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v5 -; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CHECK-NEXT: v_mul_hi_u32 v1, v4, v1 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; CHECK-NEXT: v_mul_lo_u32 v3, v7, v4 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v2, v1 +; CHECK-NEXT: v_mul_hi_u32 v2, v6, v4 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v3, v0 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v3, v0 -; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; CHECK-NEXT: v_mul_hi_u32 v3, v7, v4 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v2, v1 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v3, v1 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v6, v0 +; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v7, v1, vcc ; CHECK-NEXT: v_mul_lo_u32 v2, s11, v0 ; CHECK-NEXT: v_mul_lo_u32 v3, s10, v1 ; CHECK-NEXT: v_mul_hi_u32 v4, s10, v0 ; CHECK-NEXT: v_mul_hi_u32 v0, s11, v0 -; CHECK-NEXT: v_mul_hi_u32 v5, s11, v1 +; CHECK-NEXT: v_mov_b32_e32 v7, 
s11 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v4 @@ -285,19 +285,19 @@ define amdgpu_ps i64 @s_srem_i64(i64 inreg %num, i64 inreg %den) { ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v3 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v0, v2 -; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v4, 0 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v0, v2 +; CHECK-NEXT: v_mul_hi_u32 v4, s11, v1 +; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v6, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s8, v2, v[1:2] -; CHECK-NEXT: v_mov_b32_e32 v5, s11 +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v2 +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s8, v4, v[1:2] +; CHECK-NEXT: v_mov_b32_e32 v1, s9 ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, s10, v0 -; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s9, v4, v[1:2] -; CHECK-NEXT: v_mov_b32_e32 v3, s9 -; CHECK-NEXT: v_subb_u32_e64 v2, s[0:1], v5, v1, vcc -; CHECK-NEXT: v_sub_i32_e64 v1, s[0:1], s11, v1 -; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc +; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s9, v6, v[2:3] +; CHECK-NEXT: v_sub_i32_e64 v3, s[0:1], s11, v4 +; CHECK-NEXT: v_subb_u32_e64 v2, s[0:1], v7, v4, vcc +; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc ; CHECK-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v2 ; CHECK-NEXT: v_subrev_i32_e32 v3, vcc, s8, v0 ; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1] @@ -372,84 +372,84 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: v_xor_b32_e32 v8, v9, v8 ; GISEL-NEXT: v_cvt_f32_u32_e32 v4, v5 ; GISEL-NEXT: v_cvt_f32_u32_e32 v9, v8 -; GISEL-NEXT: v_sub_i32_e32 v13, vcc, 0, v5 -; GISEL-NEXT: v_subb_u32_e32 v14, vcc, 0, v8, vcc +; GISEL-NEXT: v_sub_i32_e32 v15, vcc, 0, v5 +; GISEL-NEXT: v_subb_u32_e32 v16, vcc, 0, v8, vcc ; GISEL-NEXT: v_mac_f32_e32 v4, 0x4f800000, v9 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4 ; GISEL-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 ; GISEL-NEXT: v_mul_f32_e32 v9, 0x2f800000, v4 ; GISEL-NEXT: v_trunc_f32_e32 v11, v9 ; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v11 -; GISEL-NEXT: v_cvt_u32_f32_e32 v12, v4 -; GISEL-NEXT: v_cvt_u32_f32_e32 v15, v11 -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v12, 0 +; GISEL-NEXT: v_cvt_u32_f32_e32 v14, v4 +; GISEL-NEXT: v_cvt_u32_f32_e32 v17, v11 +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v15, v14, 0 ; GISEL-NEXT: v_mov_b32_e32 v4, v10 -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v13, v15, v[4:5] -; GISEL-NEXT: v_mul_lo_u32 v4, v15, v9 -; GISEL-NEXT: v_mul_hi_u32 v16, v12, v9 -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v12, v[10:11] -; GISEL-NEXT: v_mul_hi_u32 v9, v15, v9 -; GISEL-NEXT: v_mul_lo_u32 v11, v12, v10 +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v15, v17, v[4:5] +; GISEL-NEXT: v_mul_lo_u32 v4, v17, v9 +; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v14, v[10:11] +; GISEL-NEXT: v_mul_hi_u32 v11, v14, v9 +; GISEL-NEXT: v_mul_hi_u32 v9, v17, v9 +; GISEL-NEXT: v_mul_lo_u32 v10, v14, v12 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v16 ; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v16, v15, v10 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, 
v11, v4 -; GISEL-NEXT: v_mul_hi_u32 v11, v12, v10 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v16, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11 +; GISEL-NEXT: v_mul_lo_u32 v11, v17, v12 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v10, v4 +; GISEL-NEXT: v_mul_hi_u32 v10, v14, v12 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v11, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v16, v11 -; GISEL-NEXT: v_mul_hi_u32 v10, v15, v10 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; GISEL-NEXT: v_mul_hi_u32 v11, v17, v12 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v9, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v11, v9 ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v4 -; GISEL-NEXT: v_addc_u32_e32 v15, vcc, v15, v9, vcc -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v12, 0 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v11, v9 +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v4 +; GISEL-NEXT: v_addc_u32_e32 v17, vcc, v17, v9, vcc +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v15, v14, 0 ; GISEL-NEXT: v_mov_b32_e32 v4, v10 -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v13, v15, v[4:5] +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v15, v17, v[4:5] ; GISEL-NEXT: v_ashrrev_i32_e32 v4, 31, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4 -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v12, v[10:11] +; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v14, v[10:11] ; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc ; GISEL-NEXT: v_xor_b32_e32 v13, v0, v4 -; GISEL-NEXT: v_mul_lo_u32 v0, v15, v9 -; GISEL-NEXT: v_mul_lo_u32 v11, v12, v10 -; GISEL-NEXT: v_xor_b32_e32 v14, v1, v4 -; GISEL-NEXT: v_mul_hi_u32 v1, v12, v9 -; GISEL-NEXT: v_mul_hi_u32 v9, v15, v9 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v0, v17, v9 +; GISEL-NEXT: v_mul_lo_u32 v10, v14, v12 +; GISEL-NEXT: v_xor_b32_e32 v15, v1, v4 +; GISEL-NEXT: v_mul_hi_u32 v1, v14, v9 +; GISEL-NEXT: v_mul_hi_u32 v9, v17, v9 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v1, v15, v10 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v11, v0 -; GISEL-NEXT: v_mul_hi_u32 v11, v12, v10 +; GISEL-NEXT: v_mul_lo_u32 v1, v17, v12 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0 +; GISEL-NEXT: v_mul_hi_u32 v10, v14, v12 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11 -; GISEL-NEXT: v_mul_hi_u32 v10, v15, v10 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10 +; GISEL-NEXT: v_mul_hi_u32 v10, v17, v12 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v9, v1 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v12, v0 -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v15, v1, vcc -; GISEL-NEXT: v_mul_lo_u32 v9, v14, v0 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v14, v0 +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v17, v1, vcc +; GISEL-NEXT: v_mul_lo_u32 v9, v15, v0 
; GISEL-NEXT: v_mul_lo_u32 v10, v13, v1 ; GISEL-NEXT: v_mul_hi_u32 v11, v13, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v14, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v15, v0 ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v11, v14, v1 +; GISEL-NEXT: v_mul_lo_u32 v11, v15, v1 ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 ; GISEL-NEXT: v_mul_hi_u32 v10, v13, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v11, v0 @@ -457,148 +457,148 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; GISEL-NEXT: v_mul_hi_u32 v1, v14, v1 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v0, v9 +; GISEL-NEXT: v_mul_hi_u32 v1, v15, v1 +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v0, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v1, v0 -; GISEL-NEXT: v_ashrrev_i32_e32 v10, 31, v7 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v10 -; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v7, v10, vcc -; GISEL-NEXT: v_xor_b32_e32 v6, v6, v10 -; GISEL-NEXT: v_xor_b32_e32 v7, v7, v10 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v1, v0 +; GISEL-NEXT: v_ashrrev_i32_e32 v9, 31, v7 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v9 +; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v7, v9, vcc +; GISEL-NEXT: v_xor_b32_e32 v6, v6, v9 +; GISEL-NEXT: v_xor_b32_e32 v7, v7, v9 ; GISEL-NEXT: v_cvt_f32_u32_e32 v12, v6 -; GISEL-NEXT: v_cvt_f32_u32_e32 v15, v7 -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v11, 0 -; GISEL-NEXT: v_sub_i32_e32 v16, vcc, 0, v6 -; GISEL-NEXT: v_mac_f32_e32 v12, 0x4f800000, v15 -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v5, v9, v[1:2] +; GISEL-NEXT: v_cvt_f32_u32_e32 v16, v7 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v14, 0 +; GISEL-NEXT: v_mac_f32_e32 v12, 0x4f800000, v16 +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v5, v11, v[1:2] ; GISEL-NEXT: v_rcp_iflag_f32_e32 v1, v12 -; GISEL-NEXT: v_subb_u32_e32 v17, vcc, 0, v7, vcc -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v8, v11, v[9:10] +; GISEL-NEXT: v_sub_i32_e32 v16, vcc, 0, v6 +; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v8, v14, v[9:10] ; GISEL-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1 -; GISEL-NEXT: v_mul_f32_e32 v10, 0x2f800000, v1 -; GISEL-NEXT: v_trunc_f32_e32 v12, v10 +; GISEL-NEXT: v_mul_f32_e32 v9, 0x2f800000, v1 +; GISEL-NEXT: v_trunc_f32_e32 v12, v9 ; GISEL-NEXT: v_mac_f32_e32 v1, 0xcf800000, v12 -; GISEL-NEXT: v_cvt_u32_f32_e32 v15, v1 -; GISEL-NEXT: v_cvt_u32_f32_e32 v12, v12 -; GISEL-NEXT: v_sub_i32_e32 v13, vcc, v13, v0 -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v16, v15, 0 -; GISEL-NEXT: v_mov_b32_e32 v0, v11 -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v16, v12, v[0:1] -; GISEL-NEXT: v_subb_u32_e64 v11, s[4:5], v14, v9, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v17, v15, v[0:1] -; GISEL-NEXT: v_sub_i32_e64 v1, s[4:5], v14, v9 -; GISEL-NEXT: v_mul_lo_u32 v9, v12, v10 -; GISEL-NEXT: v_mul_lo_u32 v14, v15, v0 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v11, v8 +; GISEL-NEXT: v_cvt_u32_f32_e32 v14, v1 +; GISEL-NEXT: v_cvt_u32_f32_e32 v18, v12 +; GISEL-NEXT: v_subb_u32_e32 v17, vcc, 0, v7, vcc +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v16, v14, 0 +; GISEL-NEXT: v_sub_i32_e32 v19, vcc, v13, v0 +; GISEL-NEXT: v_mov_b32_e32 v0, v10 +; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v18, v[0:1] +; 
GISEL-NEXT: v_subb_u32_e64 v10, s[4:5], v15, v11, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v17, v14, v[12:13] +; GISEL-NEXT: v_sub_i32_e64 v1, s[4:5], v15, v11 +; GISEL-NEXT: v_mul_lo_u32 v11, v18, v9 +; GISEL-NEXT: v_mul_lo_u32 v12, v14, v0 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v10, v8 ; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v8, vcc -; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v14 -; GISEL-NEXT: v_mul_hi_u32 v14, v15, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[4:5] -; GISEL-NEXT: v_mul_hi_u32 v10, v12, v10 -; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[6:7] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v13, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[6:7] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], v11, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v9, v9, v14, s[6:7] -; GISEL-NEXT: v_sub_i32_e32 v14, vcc, v13, v5 -; GISEL-NEXT: v_subbrev_u32_e64 v19, s[6:7], 0, v1, vcc -; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v14, v5 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v19, v8 +; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v12 +; GISEL-NEXT: v_mul_hi_u32 v12, v14, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] +; GISEL-NEXT: v_mul_hi_u32 v9, v18, v9 +; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[6:7] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v19, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[6:7] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], v10, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[6:7] +; GISEL-NEXT: v_sub_i32_e32 v12, vcc, v19, v5 +; GISEL-NEXT: v_subbrev_u32_e64 v15, s[6:7], 0, v1, vcc +; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v12, v5 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v15, v8 ; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v8, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, -1, s[8:9] ; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, -1, s[6:7] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], v19, v8 -; GISEL-NEXT: v_sub_i32_e32 v5, vcc, v14, v5 +; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], v15, v8 +; GISEL-NEXT: v_sub_i32_e32 v5, vcc, v12, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v20, v20, v21, s[6:7] ; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 -; GISEL-NEXT: v_cndmask_b32_e32 v5, v14, v5, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v8, v19, v1, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v5, v12, v5, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v8, v15, v1, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] -; GISEL-NEXT: v_mul_lo_u32 v14, v12, v0 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v18, v1 -; GISEL-NEXT: v_mul_hi_u32 v18, v15, v0 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v14, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v18 -; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v18 -; GISEL-NEXT: v_mul_hi_u32 v0, v12, v0 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v14, v10 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v15, v1 -; GISEL-NEXT: v_addc_u32_e32 v12, vcc, v12, v0, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v16, v10, 0 -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 -; GISEL-NEXT: v_cndmask_b32_e32 v11, v11, v8, vcc -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v16, v12, v[1:2] -; GISEL-NEXT: v_cndmask_b32_e32 v5, v13, v5, vcc +; GISEL-NEXT: v_mul_lo_u32 v12, v18, v0 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v13, v1 +; GISEL-NEXT: v_mul_hi_u32 v13, v14, v0 +; 
GISEL-NEXT: v_add_i32_e32 v9, vcc, v12, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v13 +; GISEL-NEXT: v_mul_hi_u32 v0, v18, v0 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v9, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v12, v9 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v14, v1 +; GISEL-NEXT: v_addc_u32_e32 v13, vcc, v18, v0, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v16, v12, 0 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; GISEL-NEXT: v_cndmask_b32_e32 v14, v10, v8, vcc +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v16, v13, v[1:2] +; GISEL-NEXT: v_cndmask_b32_e32 v5, v19, v5, vcc ; GISEL-NEXT: v_xor_b32_e32 v1, v5, v4 -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v17, v10, v[8:9] +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v17, v12, v[8:9] ; GISEL-NEXT: v_ashrrev_i32_e32 v5, 31, v3 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v5 ; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v5, vcc -; GISEL-NEXT: v_xor_b32_e32 v13, v2, v5 -; GISEL-NEXT: v_mul_lo_u32 v2, v12, v0 -; GISEL-NEXT: v_mul_lo_u32 v9, v10, v8 -; GISEL-NEXT: v_xor_b32_e32 v14, v3, v5 -; GISEL-NEXT: v_mul_hi_u32 v3, v10, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v12, v0 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_xor_b32_e32 v11, v2, v5 +; GISEL-NEXT: v_mul_lo_u32 v2, v13, v0 +; GISEL-NEXT: v_mul_lo_u32 v8, v12, v10 +; GISEL-NEXT: v_xor_b32_e32 v15, v3, v5 +; GISEL-NEXT: v_mul_hi_u32 v3, v12, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v13, v0 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v12, v8 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v9, v2 -; GISEL-NEXT: v_mul_hi_u32 v9, v10, v8 +; GISEL-NEXT: v_mul_lo_u32 v3, v13, v10 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v8, v2 +; GISEL-NEXT: v_mul_hi_u32 v8, v12, v10 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v9 -; GISEL-NEXT: v_mul_hi_u32 v8, v12, v8 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v8 +; GISEL-NEXT: v_mul_hi_u32 v8, v13, v10 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v8, v2 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0 -; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v12, v2, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v14, v0 -; GISEL-NEXT: v_mul_lo_u32 v8, v13, v2 -; GISEL-NEXT: v_mul_hi_u32 v9, v13, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v14, v0 -; GISEL-NEXT: v_xor_b32_e32 v10, v11, v4 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v12, v0 +; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v13, v2, vcc +; GISEL-NEXT: v_mul_lo_u32 v3, v15, v0 +; GISEL-NEXT: v_mul_lo_u32 v8, v11, v2 +; GISEL-NEXT: v_mul_hi_u32 v9, v11, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v15, v0 +; GISEL-NEXT: v_xor_b32_e32 v10, v14, v4 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: 
v_mul_lo_u32 v9, v14, v2 +; GISEL-NEXT: v_mul_lo_u32 v9, v15, v2 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v8, v3 -; GISEL-NEXT: v_mul_hi_u32 v8, v13, v2 +; GISEL-NEXT: v_mul_hi_u32 v8, v11, v2 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v0, v3 -; GISEL-NEXT: v_mul_hi_u32 v9, v14, v2 -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v11, 0 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v0, v3 +; GISEL-NEXT: v_mul_hi_u32 v9, v15, v2 +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v12, 0 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v0 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v9, v0 ; GISEL-NEXT: v_mov_b32_e32 v0, v3 -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v6, v8, v[0:1] +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v6, v13, v[0:1] ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v1, v4 ; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v10, v4, vcc -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v7, v11, v[8:9] -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v13, v2 -; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v14, v3, vcc -; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v14, v3 +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v7, v12, v[8:9] +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v11, v2 +; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v15, v3, vcc +; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v15, v3 ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v6 @@ -651,128 +651,128 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_xor_b32_e32 v1, v2, v1 ; CGP-NEXT: v_cvt_f32_u32_e32 v2, v0 ; CGP-NEXT: v_cvt_f32_u32_e32 v3, v1 -; CGP-NEXT: v_sub_i32_e32 v12, vcc, 0, v0 -; CGP-NEXT: v_subb_u32_e32 v13, vcc, 0, v1, vcc +; CGP-NEXT: v_sub_i32_e32 v15, vcc, 0, v0 +; CGP-NEXT: v_subb_u32_e32 v16, vcc, 0, v1, vcc ; CGP-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3 ; CGP-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; CGP-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 ; CGP-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 ; CGP-NEXT: v_trunc_f32_e32 v4, v3 ; CGP-NEXT: v_mac_f32_e32 v2, 0xcf800000, v4 -; CGP-NEXT: v_cvt_u32_f32_e32 v5, v2 -; CGP-NEXT: v_cvt_u32_f32_e32 v14, v4 -; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v12, v5, 0 -; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v12, v14, v[3:4] -; CGP-NEXT: v_mul_hi_u32 v15, v5, v2 -; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v13, v5, v[3:4] -; CGP-NEXT: v_mul_lo_u32 v4, v14, v2 -; CGP-NEXT: v_mul_hi_u32 v2, v14, v2 -; CGP-NEXT: v_mul_lo_u32 v16, v5, v3 -; CGP-NEXT: v_mul_lo_u32 v17, v14, v3 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v16 -; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v15 -; CGP-NEXT: v_mul_hi_u32 v15, v5, v3 -; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v16, v4 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v17, v2 -; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v15 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v15, vcc, v16, v15 -; CGP-NEXT: v_mul_hi_u32 v3, v14, v3 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v4 -; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v15, v4 +; CGP-NEXT: v_cvt_u32_f32_e32 v14, v2 +; CGP-NEXT: v_cvt_u32_f32_e32 v17, v4 +; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v15, 
v14, 0 +; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v15, v17, v[3:4] +; CGP-NEXT: v_mul_lo_u32 v3, v17, v2 +; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v14, v[4:5] +; CGP-NEXT: v_mul_hi_u32 v4, v14, v2 +; CGP-NEXT: v_mul_hi_u32 v2, v17, v2 +; CGP-NEXT: v_mul_lo_u32 v5, v14, v12 +; CGP-NEXT: v_mul_lo_u32 v13, v17, v12 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v5 +; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v4 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v2 -; CGP-NEXT: v_addc_u32_e32 v14, vcc, v14, v3, vcc -; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v12, v5, 0 -; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v12, v14, v[3:4] -; CGP-NEXT: v_ashrrev_i32_e32 v12, 31, v11 -; CGP-NEXT: v_mul_hi_u32 v15, v5, v2 -; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v13, v5, v[3:4] -; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v12 -; CGP-NEXT: v_addc_u32_e32 v10, vcc, v11, v12, vcc -; CGP-NEXT: v_xor_b32_e32 v11, v4, v12 -; CGP-NEXT: v_mul_lo_u32 v4, v14, v2 -; CGP-NEXT: v_mul_lo_u32 v13, v5, v3 -; CGP-NEXT: v_mul_hi_u32 v2, v14, v2 -; CGP-NEXT: v_xor_b32_e32 v10, v10, v12 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v13 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v15 -; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v15, v14, v3 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4 -; CGP-NEXT: v_mul_hi_u32 v13, v5, v3 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v15, v2 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v13 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v15, v13 -; CGP-NEXT: v_mul_hi_u32 v3, v14, v3 +; CGP-NEXT: v_mul_hi_u32 v4, v14, v12 +; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v3, vcc, v5, v3 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v13, v2 +; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v5, v4 +; CGP-NEXT: v_mul_hi_u32 v5, v17, v12 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v3, vcc, v4, v3 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v5, v3 +; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v2 +; CGP-NEXT: v_addc_u32_e32 v17, vcc, v17, v3, vcc +; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v15, v14, 0 +; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v15, v17, v[3:4] +; CGP-NEXT: v_ashrrev_i32_e32 v15, 31, v11 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v10, v15 +; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v14, v[4:5] +; CGP-NEXT: v_addc_u32_e32 v4, vcc, v11, v15, vcc +; CGP-NEXT: v_xor_b32_e32 v13, v3, v15 +; CGP-NEXT: v_mul_lo_u32 v3, v17, v2 +; CGP-NEXT: v_mul_lo_u32 v5, v14, v12 +; CGP-NEXT: v_xor_b32_e32 v16, v4, v15 +; CGP-NEXT: v_mul_hi_u32 v4, v14, v2 +; CGP-NEXT: v_mul_hi_u32 v2, v17, v2 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v5 +; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v4 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; CGP-NEXT: v_addc_u32_e32 v3, vcc, v14, v3, vcc -; CGP-NEXT: v_mul_lo_u32 v4, v10, v2 -; CGP-NEXT: v_mul_lo_u32 v5, v11, v3 -; CGP-NEXT: v_mul_hi_u32 v13, v11, v2 -; CGP-NEXT: v_mul_hi_u32 v2, v10, v2 -; CGP-NEXT: v_mul_hi_u32 v14, v10, v3 +; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CGP-NEXT: v_mul_lo_u32 v4, v17, v12 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v5, v3 +; CGP-NEXT: v_mul_hi_u32 v5, v14, v12 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v4, v2 +; CGP-NEXT: 
v_cndmask_b32_e64 v4, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v5 +; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5 +; CGP-NEXT: v_mul_hi_u32 v5, v17, v12 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v3, vcc, v4, v3 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v5, v3 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v14, v2 +; CGP-NEXT: v_addc_u32_e32 v3, vcc, v17, v3, vcc +; CGP-NEXT: v_mul_lo_u32 v4, v16, v2 +; CGP-NEXT: v_mul_lo_u32 v5, v13, v3 +; CGP-NEXT: v_mul_hi_u32 v10, v13, v2 +; CGP-NEXT: v_mul_hi_u32 v2, v16, v2 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v13 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v10 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v13, v10, v3 +; CGP-NEXT: v_mul_lo_u32 v10, v16, v3 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; CGP-NEXT: v_mul_hi_u32 v5, v11, v3 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v13, v2 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; CGP-NEXT: v_mul_hi_u32 v5, v13, v3 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v10, v2 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v5 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v13, v5 -; CGP-NEXT: v_add_i32_e32 v13, vcc, v2, v4 -; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v0, v13, 0 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v2, v4 +; CGP-NEXT: v_mul_hi_u32 v10, v16, v3 +; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v0, v12, 0 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v14, v4 -; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v0, v4, v[3:4] -; CGP-NEXT: v_sub_i32_e32 v2, vcc, v11, v2 -; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v1, v13, v[3:4] -; CGP-NEXT: v_subb_u32_e64 v4, s[4:5], v10, v3, vcc -; CGP-NEXT: v_sub_i32_e64 v3, s[4:5], v10, v3 -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v1 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v4 +; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v0, v10, v[3:4] +; CGP-NEXT: v_sub_i32_e32 v2, vcc, v13, v2 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v1, v12, v[4:5] +; CGP-NEXT: v_subb_u32_e64 v3, s[4:5], v16, v10, vcc +; CGP-NEXT: v_sub_i32_e64 v4, s[4:5], v16, v10 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v1 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v0 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v1 -; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v1, vcc +; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v1 +; CGP-NEXT: v_subb_u32_e32 v4, vcc, v4, v1, vcc ; CGP-NEXT: v_cndmask_b32_e64 v5, v5, v10, s[4:5] ; CGP-NEXT: v_sub_i32_e32 v10, vcc, v2, v0 -; CGP-NEXT: v_subbrev_u32_e64 v11, s[4:5], 0, v3, vcc +; CGP-NEXT: v_subbrev_u32_e64 v11, s[4:5], 0, v4, vcc ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v11, v1 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[4:5] +; CGP-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v0 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[4:5] +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v11, v1 -; CGP-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc +; CGP-NEXT: v_subb_u32_e32 v1, vcc, v4, v1, vcc ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v10, v0 -; CGP-NEXT: v_cndmask_b32_e64 v13, v13, v14, s[4:5] +; CGP-NEXT: v_cndmask_b32_e64 v12, v12, v13, s[4:5] ; CGP-NEXT: 
v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 ; CGP-NEXT: v_cndmask_b32_e32 v0, v10, v0, vcc ; CGP-NEXT: v_cndmask_b32_e32 v1, v11, v1, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 ; CGP-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; CGP-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc -; CGP-NEXT: v_xor_b32_e32 v0, v0, v12 -; CGP-NEXT: v_xor_b32_e32 v1, v1, v12 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v12 -; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v12, vcc +; CGP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; CGP-NEXT: v_xor_b32_e32 v0, v0, v15 +; CGP-NEXT: v_xor_b32_e32 v1, v1, v15 +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v15 +; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v15, vcc ; CGP-NEXT: ; implicit-def: $vgpr4 ; CGP-NEXT: ; implicit-def: $vgpr10 ; CGP-NEXT: .LBB2_2: ; %Flow1 @@ -820,128 +820,128 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_xor_b32_e32 v3, v4, v3 ; CGP-NEXT: v_cvt_f32_u32_e32 v4, v2 ; CGP-NEXT: v_cvt_f32_u32_e32 v5, v3 -; CGP-NEXT: v_sub_i32_e32 v10, vcc, 0, v2 -; CGP-NEXT: v_subb_u32_e32 v11, vcc, 0, v3, vcc +; CGP-NEXT: v_sub_i32_e32 v13, vcc, 0, v2 +; CGP-NEXT: v_subb_u32_e32 v14, vcc, 0, v3, vcc ; CGP-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 ; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v4 ; CGP-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 ; CGP-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 ; CGP-NEXT: v_trunc_f32_e32 v6, v5 ; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v6 -; CGP-NEXT: v_cvt_u32_f32_e32 v7, v4 -; CGP-NEXT: v_cvt_u32_f32_e32 v12, v6 -; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v7, 0 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v10, v12, v[5:6] -; CGP-NEXT: v_mul_hi_u32 v13, v7, v4 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v11, v7, v[5:6] -; CGP-NEXT: v_mul_lo_u32 v6, v12, v4 -; CGP-NEXT: v_mul_hi_u32 v4, v12, v4 -; CGP-NEXT: v_mul_lo_u32 v14, v7, v5 -; CGP-NEXT: v_mul_lo_u32 v15, v12, v5 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v14 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v13 -; CGP-NEXT: v_mul_hi_u32 v13, v7, v5 -; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v14, v6 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v15, v4 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v13 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; CGP-NEXT: v_mul_hi_u32 v5, v12, v5 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v6 -; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v13, v6 +; CGP-NEXT: v_cvt_u32_f32_e32 v12, v4 +; CGP-NEXT: v_cvt_u32_f32_e32 v15, v6 +; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v13, v12, 0 +; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v13, v15, v[5:6] +; CGP-NEXT: v_mul_lo_u32 v5, v15, v4 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v12, v[6:7] +; CGP-NEXT: v_mul_hi_u32 v6, v12, v4 +; CGP-NEXT: v_mul_hi_u32 v4, v15, v4 +; CGP-NEXT: v_mul_lo_u32 v7, v12, v10 +; CGP-NEXT: v_mul_lo_u32 v11, v15, v10 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v6 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v4 -; CGP-NEXT: v_addc_u32_e32 v12, vcc, v12, v5, vcc -; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v7, 0 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v10, v12, v[5:6] -; CGP-NEXT: v_ashrrev_i32_e32 v10, 31, v9 -; CGP-NEXT: v_mul_hi_u32 v13, v7, v4 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v11, v7, v[5:6] -; CGP-NEXT: v_add_i32_e32 v6, vcc, v8, v10 -; 
CGP-NEXT: v_addc_u32_e32 v8, vcc, v9, v10, vcc -; CGP-NEXT: v_xor_b32_e32 v9, v6, v10 -; CGP-NEXT: v_mul_lo_u32 v6, v12, v4 -; CGP-NEXT: v_mul_lo_u32 v11, v7, v5 -; CGP-NEXT: v_mul_hi_u32 v4, v12, v4 -; CGP-NEXT: v_xor_b32_e32 v8, v8, v10 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v11 -; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v13 -; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v13, v12, v5 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v11, v6 -; CGP-NEXT: v_mul_hi_u32 v11, v7, v5 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v11 -; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v11, vcc, v13, v11 -; CGP-NEXT: v_mul_hi_u32 v5, v12, v5 +; CGP-NEXT: v_mul_hi_u32 v6, v12, v10 +; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v11, v4 +; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v6 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v11, v6 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; CGP-NEXT: v_mul_hi_u32 v7, v15, v10 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5 +; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v4 +; CGP-NEXT: v_addc_u32_e32 v15, vcc, v15, v5, vcc +; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v13, v12, 0 +; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v13, v15, v[5:6] +; CGP-NEXT: v_ashrrev_i32_e32 v13, 31, v9 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v8, v13 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v12, v[6:7] +; CGP-NEXT: v_addc_u32_e32 v6, vcc, v9, v13, vcc +; CGP-NEXT: v_xor_b32_e32 v11, v5, v13 +; CGP-NEXT: v_mul_lo_u32 v5, v15, v4 +; CGP-NEXT: v_mul_lo_u32 v7, v12, v10 +; CGP-NEXT: v_xor_b32_e32 v14, v6, v13 +; CGP-NEXT: v_mul_hi_u32 v6, v12, v4 +; CGP-NEXT: v_mul_hi_u32 v4, v15, v4 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v6 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v7, v4 -; CGP-NEXT: v_addc_u32_e32 v5, vcc, v12, v5, vcc -; CGP-NEXT: v_mul_lo_u32 v6, v8, v4 -; CGP-NEXT: v_mul_lo_u32 v7, v9, v5 -; CGP-NEXT: v_mul_hi_u32 v11, v9, v4 -; CGP-NEXT: v_mul_hi_u32 v4, v8, v4 -; CGP-NEXT: v_mul_hi_u32 v12, v8, v5 +; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CGP-NEXT: v_mul_lo_u32 v6, v15, v10 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5 +; CGP-NEXT: v_mul_hi_u32 v7, v12, v10 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v6, v4 +; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v7 +; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v7 +; CGP-NEXT: v_mul_hi_u32 v7, v15, v10 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5 +; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v12, v4 +; CGP-NEXT: v_addc_u32_e32 v5, vcc, v15, v5, vcc +; CGP-NEXT: v_mul_lo_u32 v6, v14, v4 +; CGP-NEXT: v_mul_lo_u32 v7, v11, v5 +; CGP-NEXT: v_mul_hi_u32 v8, v11, v4 +; CGP-NEXT: v_mul_hi_u32 v4, v14, v4 ; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v7 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v11 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v8 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v11, v8, 
v5 +; CGP-NEXT: v_mul_lo_u32 v8, v14, v5 ; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; CGP-NEXT: v_mul_hi_u32 v7, v9, v5 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v11, v4 -; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_mul_hi_u32 v7, v11, v5 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v4 +; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v7 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v7, vcc, v11, v7 -; CGP-NEXT: v_add_i32_e32 v11, vcc, v4, v6 -; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v11, 0 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v4, v6 +; CGP-NEXT: v_mul_hi_u32 v8, v14, v5 +; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v10, 0 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v12, v6 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v2, v6, v[5:6] -; CGP-NEXT: v_sub_i32_e32 v4, vcc, v9, v4 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v3, v11, v[5:6] -; CGP-NEXT: v_subb_u32_e64 v6, s[4:5], v8, v5, vcc -; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v8, v5 -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v3 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v6 +; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v2, v8, v[5:6] +; CGP-NEXT: v_sub_i32_e32 v4, vcc, v11, v4 +; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v3, v10, v[6:7] +; CGP-NEXT: v_subb_u32_e64 v5, s[4:5], v14, v8, vcc +; CGP-NEXT: v_sub_i32_e64 v6, s[4:5], v14, v8 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v3 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v2 ; CGP-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v6, v3 -; CGP-NEXT: v_subb_u32_e32 v5, vcc, v5, v3, vcc +; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v5, v3 +; CGP-NEXT: v_subb_u32_e32 v6, vcc, v6, v3, vcc ; CGP-NEXT: v_cndmask_b32_e64 v7, v7, v8, s[4:5] ; CGP-NEXT: v_sub_i32_e32 v8, vcc, v4, v2 -; CGP-NEXT: v_subbrev_u32_e64 v9, s[4:5], 0, v5, vcc +; CGP-NEXT: v_subbrev_u32_e64 v9, s[4:5], 0, v6, vcc ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v3 -; CGP-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v2 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5] +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v9, v3 -; CGP-NEXT: v_subb_u32_e32 v3, vcc, v5, v3, vcc +; CGP-NEXT: v_subb_u32_e32 v3, vcc, v6, v3, vcc ; CGP-NEXT: v_sub_i32_e32 v2, vcc, v8, v2 -; CGP-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[4:5] +; CGP-NEXT: v_cndmask_b32_e64 v10, v10, v11, s[4:5] ; CGP-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 ; CGP-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc ; CGP-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 ; CGP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; CGP-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc -; CGP-NEXT: v_xor_b32_e32 v2, v2, v10 -; CGP-NEXT: v_xor_b32_e32 v3, v3, v10 -; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v10 -; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v10, vcc +; CGP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; CGP-NEXT: v_xor_b32_e32 v2, v2, v13 +; CGP-NEXT: v_xor_b32_e32 v3, v3, v13 +; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v13 +; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v13, vcc ; CGP-NEXT: ; implicit-def: $vgpr6 ; CGP-NEXT: ; implicit-def: $vgpr8 ; CGP-NEXT: s_andn2_saveexec_b64 s[4:5], s[6:7] @@ -977,82 +977,82 @@ define i64 
@v_srem_i64_pow2k_denom(i64 %num) { ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_cvt_f32_u32_e32 v2, 0x1000 ; CHECK-NEXT: v_cvt_f32_ubyte0_e32 v3, 0 -; CHECK-NEXT: v_mov_b32_e32 v6, 0xfffff000 +; CHECK-NEXT: v_mov_b32_e32 v9, 0xfffff000 ; CHECK-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3 ; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; CHECK-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 ; CHECK-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 ; CHECK-NEXT: v_trunc_f32_e32 v4, v3 ; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v4 -; CHECK-NEXT: v_cvt_u32_f32_e32 v5, v2 -; CHECK-NEXT: v_cvt_u32_f32_e32 v7, v4 -; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v5, 0 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v7, v[3:4] -; CHECK-NEXT: v_mul_hi_u32 v8, v5, v2 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], -1, v5, v[3:4] -; CHECK-NEXT: v_mul_lo_u32 v4, v7, v2 -; CHECK-NEXT: v_mul_hi_u32 v2, v7, v2 -; CHECK-NEXT: v_mul_lo_u32 v9, v5, v3 -; CHECK-NEXT: v_mul_lo_u32 v10, v7, v3 -; CHECK-NEXT: v_mul_hi_u32 v11, v5, v3 -; CHECK-NEXT: v_mul_hi_u32 v3, v7, v3 -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v9 -; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v10, v2 -; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v8 -; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v9, v4 +; CHECK-NEXT: v_cvt_u32_f32_e32 v8, v2 +; CHECK-NEXT: v_cvt_u32_f32_e32 v10, v4 +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v8, 0 +; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v10, v[3:4] +; CHECK-NEXT: v_mul_lo_u32 v3, v10, v2 +; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], -1, v8, v[4:5] +; CHECK-NEXT: v_mul_hi_u32 v4, v8, v2 +; CHECK-NEXT: v_mul_hi_u32 v2, v10, v2 +; CHECK-NEXT: v_mul_lo_u32 v5, v8, v6 +; CHECK-NEXT: v_mul_lo_u32 v7, v10, v6 +; CHECK-NEXT: v_mul_hi_u32 v11, v8, v6 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v5 +; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v7, v2 +; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v5, v3 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v11 -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v8, vcc, v10, v8 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v8, v4 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4 -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v2 -; CHECK-NEXT: v_addc_u32_e32 v7, vcc, v7, v3, vcc -; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v5, 0 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v7, v[3:4] -; CHECK-NEXT: v_ashrrev_i32_e32 v6, 31, v1 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v6 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], -1, v5, v[3:4] -; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc -; CHECK-NEXT: v_xor_b32_e32 v4, v0, v6 -; CHECK-NEXT: v_mul_lo_u32 v0, v7, v2 -; CHECK-NEXT: v_mul_lo_u32 v8, v5, v3 -; CHECK-NEXT: v_xor_b32_e32 v9, v1, v6 -; CHECK-NEXT: v_mul_hi_u32 v1, v5, v2 -; CHECK-NEXT: v_mul_hi_u32 v2, v7, v2 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v8 -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v7, v4 +; CHECK-NEXT: v_mul_hi_u32 v5, v10, v6 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v3 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v5, v3 +; CHECK-NEXT: v_add_i32_e32 v8, 
vcc, v8, v2 +; CHECK-NEXT: v_addc_u32_e32 v10, vcc, v10, v3, vcc +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v8, 0 +; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v10, v[3:4] +; CHECK-NEXT: v_ashrrev_i32_e32 v9, 31, v1 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v9 +; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], -1, v8, v[4:5] +; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc +; CHECK-NEXT: v_xor_b32_e32 v4, v0, v9 +; CHECK-NEXT: v_mul_lo_u32 v0, v10, v2 +; CHECK-NEXT: v_mul_lo_u32 v3, v8, v6 +; CHECK-NEXT: v_xor_b32_e32 v5, v1, v9 +; CHECK-NEXT: v_mul_hi_u32 v1, v8, v2 +; CHECK-NEXT: v_mul_hi_u32 v2, v10, v2 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v1, v7, v3 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v8, v0 -; CHECK-NEXT: v_mul_hi_u32 v8, v5, v3 +; CHECK-NEXT: v_mul_lo_u32 v1, v10, v6 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v3, v0 +; CHECK-NEXT: v_mul_hi_u32 v3, v8, v6 ; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v8 -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v8 -; CHECK-NEXT: v_mul_hi_u32 v3, v7, v3 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; CHECK-NEXT: v_mul_hi_u32 v3, v10, v6 ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v1, vcc, v2, v1 ; CHECK-NEXT: v_add_i32_e32 v1, vcc, v3, v1 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v5, v0 -; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v7, v1, vcc -; CHECK-NEXT: v_mul_lo_u32 v2, v9, v0 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v8, v0 +; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v10, v1, vcc +; CHECK-NEXT: v_mul_lo_u32 v2, v5, v0 ; CHECK-NEXT: v_mul_lo_u32 v3, v4, v1 ; CHECK-NEXT: v_mul_hi_u32 v7, v4, v0 -; CHECK-NEXT: v_mul_hi_u32 v0, v9, v0 -; CHECK-NEXT: v_mov_b32_e32 v5, 0x1000 +; CHECK-NEXT: v_mul_hi_u32 v0, v5, v0 +; CHECK-NEXT: v_mov_b32_e32 v6, 0x1000 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v7 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v7, v9, v1 +; CHECK-NEXT: v_mul_lo_u32 v7, v5, v1 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; CHECK-NEXT: v_mul_hi_u32 v3, v4, v1 ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v7, v0 @@ -1060,39 +1060,39 @@ define i64 @v_srem_i64_pow2k_denom(i64 %num) { ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v3 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v7, v3 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; CHECK-NEXT: v_mul_hi_u32 v7, v9, v1 -; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v0, 0 -; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v7, v2 -; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v2, v[1:2] +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v0, v2 +; CHECK-NEXT: v_mul_hi_u32 v8, v5, v1 +; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v2, 0 +; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v7 +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v8, v2 +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v7, v[1:2] ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v4, v0 -; CHECK-NEXT: v_subb_u32_e64 v2, s[4:5], v9, v1, vcc -; CHECK-NEXT: 
v_sub_i32_e64 v1, s[4:5], v9, v1 -; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; CHECK-NEXT: v_sub_i32_e32 v4, vcc, v0, v5 -; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v4, v5 -; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc -; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v5 -; CHECK-NEXT: v_cndmask_b32_e32 v7, -1, v7, vcc -; CHECK-NEXT: v_sub_i32_e32 v5, vcc, v4, v5 +; CHECK-NEXT: v_subb_u32_e64 v1, s[4:5], v5, v2, vcc +; CHECK-NEXT: v_sub_i32_e64 v2, s[4:5], v5, v2 +; CHECK-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v2, vcc +; CHECK-NEXT: v_sub_i32_e32 v4, vcc, v0, v6 +; CHECK-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v2, vcc +; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v4, v6 +; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v6 +; CHECK-NEXT: v_cndmask_b32_e32 v5, -1, v5, vcc +; CHECK-NEXT: v_sub_i32_e32 v6, vcc, v4, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[4:5] -; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v2 -; CHECK-NEXT: v_subbrev_u32_e32 v8, vcc, 0, v1, vcc +; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1 +; CHECK-NEXT: v_subbrev_u32_e32 v7, vcc, 0, v2, vcc ; CHECK-NEXT: v_cndmask_b32_e64 v3, -1, v3, s[4:5] -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 -; CHECK-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc -; CHECK-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 +; CHECK-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 ; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; CHECK-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; CHECK-NEXT: v_xor_b32_e32 v0, v0, v6 -; CHECK-NEXT: v_xor_b32_e32 v1, v1, v6 -; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v6 -; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v6, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; CHECK-NEXT: v_xor_b32_e32 v0, v0, v9 +; CHECK-NEXT: v_xor_b32_e32 v1, v1, v9 +; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v9 +; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc ; CHECK-NEXT: s_setpc_b64 s[30:31] %result = srem i64 %num, 4096 ret i64 %result @@ -1141,92 +1141,92 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) { ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v14, v7 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v13, v7 -; GISEL-NEXT: v_add_i32_e32 v16, vcc, v8, v4 -; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v16, 0 -; GISEL-NEXT: v_addc_u32_e32 v17, vcc, v9, v7, vcc +; GISEL-NEXT: v_add_i32_e32 v18, vcc, v8, v4 +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v18, 0 +; GISEL-NEXT: v_addc_u32_e32 v19, vcc, v9, v7, vcc ; GISEL-NEXT: v_mov_b32_e32 v4, v14 -; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v17, v[4:5] -; GISEL-NEXT: v_mul_lo_u32 v4, v17, v13 -; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], s6, v16, v[14:15] +; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v19, v[4:5] +; GISEL-NEXT: v_mul_lo_u32 v4, v19, v13 +; GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], s6, v18, v[14:15] ; GISEL-NEXT: s_mov_b32 s6, 1 ; GISEL-NEXT: s_cmp_lg_u32 s6, 0 -; GISEL-NEXT: v_mul_lo_u32 v7, v16, v14 +; GISEL-NEXT: v_mul_lo_u32 v7, v18, v16 ; GISEL-NEXT: s_subb_u32 s6, 0, 0 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7 -; GISEL-NEXT: v_mul_hi_u32 v7, v16, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; GISEL-NEXT: v_mul_hi_u32 v7, v18, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7 ; 
GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: v_mul_hi_u32 v7, v17, v13 -; GISEL-NEXT: v_mul_lo_u32 v13, v17, v14 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v15, v4 -; GISEL-NEXT: v_mul_hi_u32 v15, v16, v14 +; GISEL-NEXT: v_mul_hi_u32 v7, v19, v13 +; GISEL-NEXT: v_mul_lo_u32 v13, v19, v16 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v14, v4 +; GISEL-NEXT: v_mul_hi_u32 v14, v18, v16 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v13, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v7, v15 +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v7, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v7 ; GISEL-NEXT: v_ashrrev_i32_e32 v7, 31, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v7 ; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v7, vcc -; GISEL-NEXT: v_xor_b32_e32 v18, v0, v7 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v4 -; GISEL-NEXT: v_mul_hi_u32 v4, v17, v14 -; GISEL-NEXT: v_xor_b32_e32 v19, v1, v7 +; GISEL-NEXT: v_xor_b32_e32 v17, v0, v7 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v14, v4 +; GISEL-NEXT: v_mul_hi_u32 v4, v19, v16 +; GISEL-NEXT: v_xor_b32_e32 v20, v1, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v13, v1 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v4, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v16, v0 -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v17, v1, vcc -; GISEL-NEXT: v_mul_lo_u32 v13, v19, v0 -; GISEL-NEXT: v_mul_lo_u32 v14, v18, v1 -; GISEL-NEXT: v_mul_hi_u32 v15, v18, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v19, v0 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v18, v0 +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v19, v1, vcc +; GISEL-NEXT: v_mul_lo_u32 v13, v20, v0 +; GISEL-NEXT: v_mul_lo_u32 v14, v17, v1 +; GISEL-NEXT: v_mul_hi_u32 v15, v17, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v20, v0 ; GISEL-NEXT: v_mov_b32_e32 v4, 0x1000 ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v15 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v15, v19, v1 +; GISEL-NEXT: v_mul_lo_u32 v15, v20, v1 ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; GISEL-NEXT: v_mul_hi_u32 v14, v18, v1 +; GISEL-NEXT: v_mul_hi_u32 v14, v17, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v0, v13 -; GISEL-NEXT: v_mul_hi_u32 v16, v19, v1 -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v15, 0 +; GISEL-NEXT: v_add_i32_e32 v18, vcc, v0, v13 +; GISEL-NEXT: v_mul_hi_u32 v15, v20, v1 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v18, 0 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v16, v13 -; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v13, v[1:2] -; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], 0, v15, v[13:14] -; GISEL-NEXT: v_sub_i32_e32 v14, vcc, v18, v0 -; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v19, v13 -; GISEL-NEXT: v_subb_u32_e64 v15, s[4:5], v19, v13, vcc +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v13 +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v15, v[1:2] +; GISEL-NEXT: v_mad_u64_u32 v[15:16], s[4:5], 0, v18, v[13:14] +; GISEL-NEXT: v_sub_i32_e32 v16, vcc, v17, v0 +; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v20, v15 +; GISEL-NEXT: v_subb_u32_e64 v17, s[4:5], v20, v15, vcc ; GISEL-NEXT: v_subbrev_u32_e32 v0, vcc, 0, v0, 
vcc -; GISEL-NEXT: v_sub_i32_e32 v16, vcc, v14, v4 -; GISEL-NEXT: v_subbrev_u32_e32 v17, vcc, 0, v0, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v16, v4 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v14, v4 +; GISEL-NEXT: v_sub_i32_e32 v18, vcc, v16, v4 +; GISEL-NEXT: v_subbrev_u32_e32 v19, vcc, 0, v0, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v18, v4 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v16, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v17 +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v15 -; GISEL-NEXT: v_cndmask_b32_e32 v18, -1, v0, vcc +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v17 +; GISEL-NEXT: v_cndmask_b32_e32 v20, -1, v0, vcc ; GISEL-NEXT: v_mov_b32_e32 v0, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v13, -1, v1, s[4:5] -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v9, v[0:1] -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v8, v[0:1] -; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v16, v4 -; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v17, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; GISEL-NEXT: v_mul_lo_u32 v18, v8, v0 -; GISEL-NEXT: v_cndmask_b32_e32 v16, v16, v1, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v5, v17, v5, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v18 +; GISEL-NEXT: v_cndmask_b32_e64 v15, -1, v1, s[4:5] +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v9, v[0:1] +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v8, v[13:14] +; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v18, v4 +; GISEL-NEXT: v_mul_lo_u32 v13, v8, v0 +; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v19, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 +; GISEL-NEXT: v_cndmask_b32_e32 v14, v18, v1, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v5, v19, v5, vcc +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v13 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc @@ -1243,74 +1243,74 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) { ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v1 -; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v0, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, 0 -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 -; GISEL-NEXT: v_cndmask_b32_e32 v11, v15, v5, vcc -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v6, v9, v[1:2] -; GISEL-NEXT: v_cndmask_b32_e32 v10, v14, v16, vcc -; GISEL-NEXT: v_xor_b32_e32 v1, v10, v7 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s6, v8, v[5:6] -; GISEL-NEXT: v_ashrrev_i32_e32 v10, 31, v3 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v10 -; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v10, vcc -; GISEL-NEXT: v_xor_b32_e32 v12, v2, v10 -; GISEL-NEXT: v_mul_lo_u32 v2, v9, v0 -; GISEL-NEXT: v_mul_lo_u32 v6, v8, v5 -; GISEL-NEXT: v_xor_b32_e32 v13, v3, v10 -; GISEL-NEXT: v_mul_hi_u32 v3, v8, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v9, v0 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v8, v1 +; GISEL-NEXT: v_addc_u32_e32 v11, vcc, v9, v0, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v10, 0 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v15 +; GISEL-NEXT: v_cndmask_b32_e32 v13, v17, v5, vcc +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v6, v11, v[1:2] +; GISEL-NEXT: v_cndmask_b32_e32 v12, v16, v14, vcc +; GISEL-NEXT: v_xor_b32_e32 v1, v12, v7 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s6, v10, v[8:9] +; GISEL-NEXT: v_ashrrev_i32_e32 v12, 31, v3 +; GISEL-NEXT: 
v_add_i32_e32 v2, vcc, v2, v12 +; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v12, vcc +; GISEL-NEXT: v_xor_b32_e32 v9, v2, v12 +; GISEL-NEXT: v_mul_lo_u32 v2, v11, v0 +; GISEL-NEXT: v_mul_lo_u32 v6, v10, v5 +; GISEL-NEXT: v_xor_b32_e32 v14, v3, v12 +; GISEL-NEXT: v_mul_hi_u32 v3, v10, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v11, v0 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v9, v5 +; GISEL-NEXT: v_mul_lo_u32 v3, v11, v5 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v6, v2 -; GISEL-NEXT: v_mul_hi_u32 v6, v8, v5 +; GISEL-NEXT: v_mul_hi_u32 v6, v10, v5 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; GISEL-NEXT: v_mul_hi_u32 v5, v9, v5 +; GISEL-NEXT: v_mul_hi_u32 v5, v11, v5 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0 -; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v9, v2, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v13, v0 -; GISEL-NEXT: v_mul_lo_u32 v5, v12, v2 -; GISEL-NEXT: v_mul_hi_u32 v6, v12, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v13, v0 -; GISEL-NEXT: v_xor_b32_e32 v8, v11, v7 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0 +; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v11, v2, vcc +; GISEL-NEXT: v_mul_lo_u32 v3, v14, v0 +; GISEL-NEXT: v_mul_lo_u32 v5, v9, v2 +; GISEL-NEXT: v_mul_hi_u32 v6, v9, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v14, v0 +; GISEL-NEXT: v_xor_b32_e32 v8, v13, v7 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v6, v13, v2 +; GISEL-NEXT: v_mul_lo_u32 v6, v14, v2 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v5, v3 -; GISEL-NEXT: v_mul_hi_u32 v5, v12, v2 +; GISEL-NEXT: v_mul_hi_u32 v5, v9, v2 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v6, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v0, v3 -; GISEL-NEXT: v_mul_hi_u32 v6, v13, v2 -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v9, 0 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v0, v3 +; GISEL-NEXT: v_mul_hi_u32 v6, v14, v2 +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v10, 0 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v0 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v6, v0 ; GISEL-NEXT: v_mov_b32_e32 v0, v3 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v5, v[0:1] +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v11, v[0:1] ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v1, v7 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], 0, v9, v[5:6] ; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v8, v7, vcc -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v12, v2 -; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], v13, v5, vcc -; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], v13, v5 +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], 0, v10, v[5:6] +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v9, v2 +; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], v14, v7 +; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], v14, v7, vcc ; GISEL-NEXT: v_subbrev_u32_e32 
v5, vcc, 0, v5, vcc ; GISEL-NEXT: v_sub_i32_e32 v7, vcc, v2, v4 ; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc @@ -1330,10 +1330,10 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) { ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 ; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc -; GISEL-NEXT: v_xor_b32_e32 v2, v2, v10 -; GISEL-NEXT: v_xor_b32_e32 v3, v3, v10 -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v10 -; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v10, vcc +; GISEL-NEXT: v_xor_b32_e32 v2, v2, v12 +; GISEL-NEXT: v_xor_b32_e32 v3, v3, v12 +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v12 +; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v12, vcc ; GISEL-NEXT: s_setpc_b64 s[30:31] ; ; CGP-LABEL: v_srem_v2i64_pow2k_denom: @@ -1352,110 +1352,110 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) { ; CGP-NEXT: v_cvt_u32_f32_e32 v8, v8 ; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v7, 0 ; CGP-NEXT: v_mov_b32_e32 v9, v5 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v8, v[9:10] -; CGP-NEXT: v_mul_hi_u32 v11, v7, v4 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v8, v[9:10] ; CGP-NEXT: v_mul_hi_u32 v12, v8, v4 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], -1, v7, v[9:10] +; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], -1, v7, v[10:11] ; CGP-NEXT: v_mul_lo_u32 v10, v8, v4 -; CGP-NEXT: v_mul_lo_u32 v4, v7, v9 -; CGP-NEXT: v_mul_lo_u32 v13, v8, v9 -; CGP-NEXT: v_mul_hi_u32 v14, v7, v9 -; CGP-NEXT: v_mul_hi_u32 v9, v8, v9 +; CGP-NEXT: v_mul_hi_u32 v11, v7, v4 +; CGP-NEXT: v_mul_lo_u32 v4, v7, v13 +; CGP-NEXT: v_mul_lo_u32 v9, v8, v13 +; CGP-NEXT: v_mul_hi_u32 v14, v7, v13 +; CGP-NEXT: v_mul_hi_u32 v13, v8, v13 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v4 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v11 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v15, v4 -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v12 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v12 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v13 -; CGP-NEXT: v_add_i32_e32 v16, vcc, v7, v4 -; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v16, 0 -; CGP-NEXT: v_addc_u32_e32 v17, vcc, v8, v9, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, v9, v4 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v9, vcc, v14, v9 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v13, v9 +; CGP-NEXT: v_add_i32_e32 v18, vcc, v7, v4 +; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v18, 0 +; CGP-NEXT: v_addc_u32_e32 v19, vcc, v8, v9, vcc ; CGP-NEXT: v_mov_b32_e32 v4, v14 -; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v17, v[4:5] -; CGP-NEXT: v_mul_lo_u32 v4, v17, v13 -; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], -1, v16, v[14:15] -; CGP-NEXT: v_mul_lo_u32 v9, v16, v14 +; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v19, v[4:5] +; CGP-NEXT: v_mul_lo_u32 v4, v19, v13 +; CGP-NEXT: v_mad_u64_u32 v[16:17], s[4:5], -1, v18, v[14:15] +; CGP-NEXT: v_mul_lo_u32 v9, v18, v16 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 -; CGP-NEXT: v_mul_hi_u32 v9, v16, v13 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; CGP-NEXT: v_mul_hi_u32 v9, v18, v13 +; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, 
v4, v9 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_mul_hi_u32 v9, v17, v13 -; CGP-NEXT: v_mul_lo_u32 v13, v17, v14 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v15, v4 -; CGP-NEXT: v_mul_hi_u32 v15, v16, v14 +; CGP-NEXT: v_mul_hi_u32 v9, v19, v13 +; CGP-NEXT: v_mul_lo_u32 v13, v19, v16 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v14, v4 +; CGP-NEXT: v_mul_hi_u32 v14, v18, v16 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v13, v9 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v15, vcc, v9, v15 +; CGP-NEXT: v_add_i32_e32 v14, vcc, v9, v14 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v9 ; CGP-NEXT: v_ashrrev_i32_e32 v9, 31, v1 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v9 ; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc -; CGP-NEXT: v_xor_b32_e32 v18, v0, v9 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v15, v4 -; CGP-NEXT: v_mul_hi_u32 v4, v17, v14 -; CGP-NEXT: v_xor_b32_e32 v19, v1, v9 +; CGP-NEXT: v_xor_b32_e32 v15, v0, v9 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v14, v4 +; CGP-NEXT: v_mul_hi_u32 v4, v19, v16 +; CGP-NEXT: v_xor_b32_e32 v17, v1, v9 ; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v1, vcc, v13, v1 ; CGP-NEXT: v_add_i32_e32 v1, vcc, v4, v1 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v16, v0 -; CGP-NEXT: v_addc_u32_e32 v1, vcc, v17, v1, vcc -; CGP-NEXT: v_mul_lo_u32 v13, v19, v0 -; CGP-NEXT: v_mul_lo_u32 v14, v18, v1 -; CGP-NEXT: v_mul_hi_u32 v15, v18, v0 -; CGP-NEXT: v_mul_hi_u32 v0, v19, v0 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v18, v0 +; CGP-NEXT: v_addc_u32_e32 v1, vcc, v19, v1, vcc +; CGP-NEXT: v_mul_lo_u32 v13, v17, v0 +; CGP-NEXT: v_mul_lo_u32 v14, v15, v1 +; CGP-NEXT: v_mul_hi_u32 v16, v15, v0 +; CGP-NEXT: v_mul_hi_u32 v0, v17, v0 ; CGP-NEXT: v_mov_b32_e32 v4, 0x1000 ; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v15 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v16 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v15, v19, v1 +; CGP-NEXT: v_mul_lo_u32 v16, v17, v1 ; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; CGP-NEXT: v_mul_hi_u32 v14, v18, v1 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v15, v0 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; CGP-NEXT: v_mul_hi_u32 v14, v15, v1 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v16, v0 +; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v13 -; CGP-NEXT: v_mul_hi_u32 v15, v19, v1 -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v0, 0 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; CGP-NEXT: v_add_i32_e32 v13, vcc, v15, v13 -; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v13, v[1:2] -; CGP-NEXT: v_sub_i32_e32 v14, vcc, v18, v0 -; CGP-NEXT: v_sub_i32_e64 v0, s[4:5], v19, v13 -; CGP-NEXT: v_subb_u32_e64 v15, s[4:5], v19, v13, vcc +; CGP-NEXT: v_add_i32_e32 v14, vcc, v16, v14 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v0, v13 +; CGP-NEXT: v_mul_hi_u32 v18, v17, v1 +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v13, 0 +; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v16 +; CGP-NEXT: v_add_i32_e32 v16, vcc, v18, v13 +; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v16, v[1:2] +; CGP-NEXT: v_sub_i32_e32 v15, vcc, v15, v0 +; CGP-NEXT: v_sub_i32_e64 v0, s[4:5], v17, v13 +; CGP-NEXT: v_subb_u32_e64 v16, s[4:5], v17, v13, vcc ; CGP-NEXT: v_subbrev_u32_e32 v0, vcc, 
0, v0, vcc -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v14, v4 -; CGP-NEXT: v_sub_i32_e32 v16, vcc, v14, v4 +; CGP-NEXT: v_sub_i32_e32 v18, vcc, v15, v4 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v15, v4 +; CGP-NEXT: v_subbrev_u32_e32 v19, vcc, 0, v0, vcc ; CGP-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v15 -; CGP-NEXT: v_subbrev_u32_e32 v17, vcc, 0, v0, vcc +; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v16 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v18, v4 +; CGP-NEXT: v_cndmask_b32_e64 v17, -1, v1, s[4:5] +; CGP-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc ; CGP-NEXT: v_mov_b32_e32 v0, v5 -; CGP-NEXT: v_cndmask_b32_e64 v13, -1, v1, s[4:5] -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, v[0:1] -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v16, v4 -; CGP-NEXT: v_cndmask_b32_e64 v18, 0, -1, vcc -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], -1, v7, v[0:1] -; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v17 -; CGP-NEXT: v_cndmask_b32_e32 v5, -1, v18, vcc -; CGP-NEXT: v_sub_i32_e32 v1, vcc, v16, v4 -; CGP-NEXT: v_subbrev_u32_e32 v18, vcc, 0, v17, vcc +; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v8, v[0:1] +; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19 +; CGP-NEXT: v_cndmask_b32_e32 v5, -1, v1, vcc +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], -1, v7, v[13:14] +; CGP-NEXT: v_sub_i32_e32 v1, vcc, v18, v4 +; CGP-NEXT: v_subbrev_u32_e32 v13, vcc, 0, v19, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 ; CGP-NEXT: v_mul_lo_u32 v5, v7, v0 -; CGP-NEXT: v_cndmask_b32_e32 v16, v16, v1, vcc -; CGP-NEXT: v_cndmask_b32_e32 v17, v17, v18, vcc +; CGP-NEXT: v_cndmask_b32_e32 v14, v18, v1, vcc +; CGP-NEXT: v_cndmask_b32_e32 v13, v19, v13, vcc ; CGP-NEXT: v_add_i32_e32 v1, vcc, v10, v5 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v11 @@ -1473,72 +1473,72 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) { ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v5 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v1 -; CGP-NEXT: v_addc_u32_e32 v8, vcc, v8, v0, vcc -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v7, 0 -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 -; CGP-NEXT: v_cndmask_b32_e32 v5, v14, v16, vcc -; CGP-NEXT: v_xor_b32_e32 v11, v5, v9 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v6, v8, v[1:2] -; CGP-NEXT: v_cndmask_b32_e32 v10, v15, v17, vcc -; CGP-NEXT: v_xor_b32_e32 v1, v10, v9 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], -1, v7, v[5:6] -; CGP-NEXT: v_ashrrev_i32_e32 v10, 31, v3 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v10 -; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v10, vcc -; CGP-NEXT: v_xor_b32_e32 v12, v2, v10 -; CGP-NEXT: v_mul_lo_u32 v2, v8, v0 -; CGP-NEXT: v_mul_lo_u32 v6, v7, v5 -; CGP-NEXT: v_xor_b32_e32 v13, v3, v10 -; CGP-NEXT: v_mul_hi_u32 v3, v7, v0 -; CGP-NEXT: v_mul_hi_u32 v0, v8, v0 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v7, v1 +; CGP-NEXT: v_addc_u32_e32 v11, vcc, v8, v0, vcc +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v10, 0 +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17 +; CGP-NEXT: v_cndmask_b32_e32 v5, v15, v14, vcc +; CGP-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v6, v11, v[1:2] +; CGP-NEXT: v_cndmask_b32_e32 v12, v16, v13, vcc +; CGP-NEXT: v_xor_b32_e32 v13, v5, v9 +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], -1, v10, v[7:8] +; CGP-NEXT: v_xor_b32_e32 v1, v12, v9 +; CGP-NEXT: v_ashrrev_i32_e32 v12, 31, v3 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v12 +; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v12, vcc +; CGP-NEXT: v_xor_b32_e32 v7, v2, v12 +; CGP-NEXT: v_mul_lo_u32 v2, v11, v0 +; CGP-NEXT: v_mul_lo_u32 v6, v10, 
v5 +; CGP-NEXT: v_xor_b32_e32 v8, v3, v12 +; CGP-NEXT: v_mul_hi_u32 v3, v10, v0 +; CGP-NEXT: v_mul_hi_u32 v0, v11, v0 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v6 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v3, v8, v5 +; CGP-NEXT: v_mul_lo_u32 v3, v11, v5 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v6, v2 -; CGP-NEXT: v_mul_hi_u32 v6, v7, v5 +; CGP-NEXT: v_mul_hi_u32 v6, v10, v5 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v6 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; CGP-NEXT: v_mul_hi_u32 v5, v8, v5 +; CGP-NEXT: v_mul_hi_u32 v5, v11, v5 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v7, v0 -; CGP-NEXT: v_addc_u32_e32 v2, vcc, v8, v2, vcc -; CGP-NEXT: v_mul_lo_u32 v5, v13, v3 -; CGP-NEXT: v_mul_lo_u32 v6, v12, v2 -; CGP-NEXT: v_mul_hi_u32 v7, v12, v3 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v11, v9 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v10, v0 +; CGP-NEXT: v_addc_u32_e32 v2, vcc, v11, v2, vcc +; CGP-NEXT: v_mul_lo_u32 v5, v8, v3 +; CGP-NEXT: v_mul_lo_u32 v6, v7, v2 +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v13, v9 ; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc +; CGP-NEXT: v_mul_hi_u32 v9, v7, v3 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v9 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v7, v13, v2 -; CGP-NEXT: v_mul_hi_u32 v3, v13, v3 +; CGP-NEXT: v_mul_lo_u32 v9, v8, v2 +; CGP-NEXT: v_mul_hi_u32 v3, v8, v3 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CGP-NEXT: v_mul_hi_u32 v6, v12, v2 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v7, v3 -; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CGP-NEXT: v_mul_hi_u32 v6, v7, v2 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v9, v3 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v6 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v5 -; CGP-NEXT: v_mul_hi_u32 v7, v13, v2 -; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v3, 0 -; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v5, v[3:4] -; CGP-NEXT: v_sub_i32_e32 v2, vcc, v12, v2 -; CGP-NEXT: v_subb_u32_e64 v3, s[4:5], v13, v5, vcc -; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v13, v5 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v9, v6 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v3, v5 +; CGP-NEXT: v_mul_hi_u32 v10, v8, v2 +; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v5, 0 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v9 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v5 +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v9, v[3:4] +; CGP-NEXT: v_sub_i32_e32 v2, vcc, v7, v2 +; CGP-NEXT: v_subb_u32_e64 v3, s[4:5], v8, v5, vcc +; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v8, v5 ; CGP-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc ; CGP-NEXT: v_sub_i32_e32 v7, vcc, v2, v4 ; CGP-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc @@ -1558,10 +1558,10 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) { ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 ; CGP-NEXT: v_cndmask_b32_e32 v2, 
v2, v4, vcc ; CGP-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc -; CGP-NEXT: v_xor_b32_e32 v2, v2, v10 -; CGP-NEXT: v_xor_b32_e32 v3, v3, v10 -; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v10 -; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v10, vcc +; CGP-NEXT: v_xor_b32_e32 v2, v2, v12 +; CGP-NEXT: v_xor_b32_e32 v3, v3, v12 +; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v12 +; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v12, vcc ; CGP-NEXT: s_setpc_b64 s[30:31] %result = srem <2 x i64> %num, <i64 4096, i64 4096> ret <2 x i64> %result @@ -1573,82 +1573,82 @@ define i64 @v_srem_i64_oddk_denom(i64 %num) { ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_cvt_f32_u32_e32 v2, 0x12d8fb ; CHECK-NEXT: v_cvt_f32_ubyte0_e32 v3, 0 -; CHECK-NEXT: v_mov_b32_e32 v6, 0xffed2705 +; CHECK-NEXT: v_mov_b32_e32 v9, 0xffed2705 ; CHECK-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3 ; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; CHECK-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 ; CHECK-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 ; CHECK-NEXT: v_trunc_f32_e32 v4, v3 ; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v4 -; CHECK-NEXT: v_cvt_u32_f32_e32 v5, v2 -; CHECK-NEXT: v_cvt_u32_f32_e32 v7, v4 -; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v5, 0 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v7, v[3:4] -; CHECK-NEXT: v_mul_hi_u32 v8, v5, v2 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], -1, v5, v[3:4] -; CHECK-NEXT: v_mul_lo_u32 v4, v7, v2 -; CHECK-NEXT: v_mul_hi_u32 v2, v7, v2 -; CHECK-NEXT: v_mul_lo_u32 v9, v5, v3 -; CHECK-NEXT: v_mul_lo_u32 v10, v7, v3 -; CHECK-NEXT: v_mul_hi_u32 v11, v5, v3 -; CHECK-NEXT: v_mul_hi_u32 v3, v7, v3 -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v9 -; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v10, v2 -; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v8 -; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v9, v4 +; CHECK-NEXT: v_cvt_u32_f32_e32 v8, v2 +; CHECK-NEXT: v_cvt_u32_f32_e32 v10, v4 +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v8, 0 +; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v10, v[3:4] +; CHECK-NEXT: v_mul_lo_u32 v3, v10, v2 +; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], -1, v8, v[4:5] +; CHECK-NEXT: v_mul_hi_u32 v4, v8, v2 +; CHECK-NEXT: v_mul_hi_u32 v2, v10, v2 +; CHECK-NEXT: v_mul_lo_u32 v5, v8, v6 +; CHECK-NEXT: v_mul_lo_u32 v7, v10, v6 +; CHECK-NEXT: v_mul_hi_u32 v11, v8, v6 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v5 +; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v7, v2 +; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v5, v3 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v11 -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v8, vcc, v10, v8 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v8, v4 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4 -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v2 -; CHECK-NEXT: v_addc_u32_e32 v7, vcc, v7, v3, vcc -; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v5, 0 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v7, v[3:4] -; CHECK-NEXT: v_ashrrev_i32_e32 v6, 31, v1 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v6 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], -1, v5, v[3:4] -; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc -; CHECK-NEXT: v_xor_b32_e32 v4, v0, v6 -; CHECK-NEXT: v_mul_lo_u32 v0, v7, v2 -; 
CHECK-NEXT: v_mul_lo_u32 v8, v5, v3 -; CHECK-NEXT: v_xor_b32_e32 v9, v1, v6 -; CHECK-NEXT: v_mul_hi_u32 v1, v5, v2 -; CHECK-NEXT: v_mul_hi_u32 v2, v7, v2 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v8 -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v7, v4 +; CHECK-NEXT: v_mul_hi_u32 v5, v10, v6 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v3 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v5, v3 +; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v2 +; CHECK-NEXT: v_addc_u32_e32 v10, vcc, v10, v3, vcc +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v8, 0 +; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v10, v[3:4] +; CHECK-NEXT: v_ashrrev_i32_e32 v9, 31, v1 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v9 +; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], -1, v8, v[4:5] +; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc +; CHECK-NEXT: v_xor_b32_e32 v4, v0, v9 +; CHECK-NEXT: v_mul_lo_u32 v0, v10, v2 +; CHECK-NEXT: v_mul_lo_u32 v3, v8, v6 +; CHECK-NEXT: v_xor_b32_e32 v5, v1, v9 +; CHECK-NEXT: v_mul_hi_u32 v1, v8, v2 +; CHECK-NEXT: v_mul_hi_u32 v2, v10, v2 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v1, v7, v3 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v8, v0 -; CHECK-NEXT: v_mul_hi_u32 v8, v5, v3 +; CHECK-NEXT: v_mul_lo_u32 v1, v10, v6 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v3, v0 +; CHECK-NEXT: v_mul_hi_u32 v3, v8, v6 ; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v8 -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v8 -; CHECK-NEXT: v_mul_hi_u32 v3, v7, v3 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; CHECK-NEXT: v_mul_hi_u32 v3, v10, v6 ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v1, vcc, v2, v1 ; CHECK-NEXT: v_add_i32_e32 v1, vcc, v3, v1 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v5, v0 -; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v7, v1, vcc -; CHECK-NEXT: v_mul_lo_u32 v2, v9, v0 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v8, v0 +; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v10, v1, vcc +; CHECK-NEXT: v_mul_lo_u32 v2, v5, v0 ; CHECK-NEXT: v_mul_lo_u32 v3, v4, v1 ; CHECK-NEXT: v_mul_hi_u32 v7, v4, v0 -; CHECK-NEXT: v_mul_hi_u32 v0, v9, v0 -; CHECK-NEXT: v_mov_b32_e32 v5, 0x12d8fb +; CHECK-NEXT: v_mul_hi_u32 v0, v5, v0 +; CHECK-NEXT: v_mov_b32_e32 v6, 0x12d8fb ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v7 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v7, v9, v1 +; CHECK-NEXT: v_mul_lo_u32 v7, v5, v1 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; CHECK-NEXT: v_mul_hi_u32 v3, v4, v1 ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v7, v0 @@ -1656,39 +1656,39 @@ define i64 @v_srem_i64_oddk_denom(i64 %num) { ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v3 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v7, v3 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; CHECK-NEXT: v_mul_hi_u32 v7, v9, v1 -; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v0, 0 -; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; 
CHECK-NEXT: v_add_i32_e32 v2, vcc, v7, v2 -; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v2, v[1:2] +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v0, v2 +; CHECK-NEXT: v_mul_hi_u32 v8, v5, v1 +; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v2, 0 +; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v7 +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v8, v2 +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v7, v[1:2] ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v4, v0 -; CHECK-NEXT: v_subb_u32_e64 v2, s[4:5], v9, v1, vcc -; CHECK-NEXT: v_sub_i32_e64 v1, s[4:5], v9, v1 -; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; CHECK-NEXT: v_sub_i32_e32 v4, vcc, v0, v5 -; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v4, v5 -; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc -; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v5 -; CHECK-NEXT: v_cndmask_b32_e32 v7, -1, v7, vcc -; CHECK-NEXT: v_sub_i32_e32 v5, vcc, v4, v5 +; CHECK-NEXT: v_subb_u32_e64 v1, s[4:5], v5, v2, vcc +; CHECK-NEXT: v_sub_i32_e64 v2, s[4:5], v5, v2 +; CHECK-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v2, vcc +; CHECK-NEXT: v_sub_i32_e32 v4, vcc, v0, v6 +; CHECK-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v2, vcc +; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v4, v6 +; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v6 +; CHECK-NEXT: v_cndmask_b32_e32 v5, -1, v5, vcc +; CHECK-NEXT: v_sub_i32_e32 v6, vcc, v4, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[4:5] -; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v2 -; CHECK-NEXT: v_subbrev_u32_e32 v8, vcc, 0, v1, vcc +; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1 +; CHECK-NEXT: v_subbrev_u32_e32 v7, vcc, 0, v2, vcc ; CHECK-NEXT: v_cndmask_b32_e64 v3, -1, v3, s[4:5] -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 -; CHECK-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc -; CHECK-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 +; CHECK-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 ; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; CHECK-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; CHECK-NEXT: v_xor_b32_e32 v0, v0, v6 -; CHECK-NEXT: v_xor_b32_e32 v1, v1, v6 -; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v6 -; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v6, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; CHECK-NEXT: v_xor_b32_e32 v0, v0, v9 +; CHECK-NEXT: v_xor_b32_e32 v1, v1, v9 +; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v9 +; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc ; CHECK-NEXT: s_setpc_b64 s[30:31] %result = srem i64 %num, 1235195 ret i64 %result @@ -1737,92 +1737,92 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) { ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v14, v7 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v13, v7 -; GISEL-NEXT: v_add_i32_e32 v16, vcc, v8, v4 -; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v16, 0 -; GISEL-NEXT: v_addc_u32_e32 v17, vcc, v9, v7, vcc +; GISEL-NEXT: v_add_i32_e32 v18, vcc, v8, v4 +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v18, 0 +; GISEL-NEXT: v_addc_u32_e32 v19, vcc, v9, v7, vcc ; GISEL-NEXT: v_mov_b32_e32 v4, v14 -; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v17, v[4:5] -; GISEL-NEXT: v_mul_lo_u32 v4, v17, v13 -; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], s6, v16, v[14:15] +; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v19, v[4:5] +; 
GISEL-NEXT: v_mul_lo_u32 v4, v19, v13 +; GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], s6, v18, v[14:15] ; GISEL-NEXT: s_mov_b32 s6, 1 ; GISEL-NEXT: s_cmp_lg_u32 s6, 0 -; GISEL-NEXT: v_mul_lo_u32 v7, v16, v14 +; GISEL-NEXT: v_mul_lo_u32 v7, v18, v16 ; GISEL-NEXT: s_subb_u32 s6, 0, 0 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7 -; GISEL-NEXT: v_mul_hi_u32 v7, v16, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; GISEL-NEXT: v_mul_hi_u32 v7, v18, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: v_mul_hi_u32 v7, v17, v13 -; GISEL-NEXT: v_mul_lo_u32 v13, v17, v14 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v15, v4 -; GISEL-NEXT: v_mul_hi_u32 v15, v16, v14 +; GISEL-NEXT: v_mul_hi_u32 v7, v19, v13 +; GISEL-NEXT: v_mul_lo_u32 v13, v19, v16 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v14, v4 +; GISEL-NEXT: v_mul_hi_u32 v14, v18, v16 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v13, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v7, v15 +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v7, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v7 ; GISEL-NEXT: v_ashrrev_i32_e32 v7, 31, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v7 ; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v7, vcc -; GISEL-NEXT: v_xor_b32_e32 v18, v0, v7 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v4 -; GISEL-NEXT: v_mul_hi_u32 v4, v17, v14 -; GISEL-NEXT: v_xor_b32_e32 v19, v1, v7 +; GISEL-NEXT: v_xor_b32_e32 v17, v0, v7 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v14, v4 +; GISEL-NEXT: v_mul_hi_u32 v4, v19, v16 +; GISEL-NEXT: v_xor_b32_e32 v20, v1, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v13, v1 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v4, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v16, v0 -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v17, v1, vcc -; GISEL-NEXT: v_mul_lo_u32 v13, v19, v0 -; GISEL-NEXT: v_mul_lo_u32 v14, v18, v1 -; GISEL-NEXT: v_mul_hi_u32 v15, v18, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v19, v0 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v18, v0 +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v19, v1, vcc +; GISEL-NEXT: v_mul_lo_u32 v13, v20, v0 +; GISEL-NEXT: v_mul_lo_u32 v14, v17, v1 +; GISEL-NEXT: v_mul_hi_u32 v15, v17, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v20, v0 ; GISEL-NEXT: v_mov_b32_e32 v4, 0x12d8fb ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v15 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v15, v19, v1 +; GISEL-NEXT: v_mul_lo_u32 v15, v20, v1 ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; GISEL-NEXT: v_mul_hi_u32 v14, v18, v1 +; GISEL-NEXT: v_mul_hi_u32 v14, v17, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v0, v13 -; GISEL-NEXT: v_mul_hi_u32 v16, v19, v1 -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v15, 0 +; GISEL-NEXT: v_add_i32_e32 v18, vcc, v0, v13 +; GISEL-NEXT: v_mul_hi_u32 v15, v20, v1 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v18, 0 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v16, v13 -; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v13, v[1:2] -; GISEL-NEXT: v_mad_u64_u32 v[13:14], 
s[4:5], 0, v15, v[13:14] -; GISEL-NEXT: v_sub_i32_e32 v14, vcc, v18, v0 -; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v19, v13 -; GISEL-NEXT: v_subb_u32_e64 v15, s[4:5], v19, v13, vcc +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v13 +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v15, v[1:2] +; GISEL-NEXT: v_mad_u64_u32 v[15:16], s[4:5], 0, v18, v[13:14] +; GISEL-NEXT: v_sub_i32_e32 v16, vcc, v17, v0 +; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v20, v15 +; GISEL-NEXT: v_subb_u32_e64 v17, s[4:5], v20, v15, vcc ; GISEL-NEXT: v_subbrev_u32_e32 v0, vcc, 0, v0, vcc -; GISEL-NEXT: v_sub_i32_e32 v16, vcc, v14, v4 -; GISEL-NEXT: v_subbrev_u32_e32 v17, vcc, 0, v0, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v16, v4 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v14, v4 +; GISEL-NEXT: v_sub_i32_e32 v18, vcc, v16, v4 +; GISEL-NEXT: v_subbrev_u32_e32 v19, vcc, 0, v0, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v18, v4 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v16, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v17 +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v15 -; GISEL-NEXT: v_cndmask_b32_e32 v18, -1, v0, vcc +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v17 +; GISEL-NEXT: v_cndmask_b32_e32 v20, -1, v0, vcc ; GISEL-NEXT: v_mov_b32_e32 v0, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v13, -1, v1, s[4:5] -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v9, v[0:1] -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v8, v[0:1] -; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v16, v4 -; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v17, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; GISEL-NEXT: v_mul_lo_u32 v18, v8, v0 -; GISEL-NEXT: v_cndmask_b32_e32 v16, v16, v1, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v5, v17, v5, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v18 +; GISEL-NEXT: v_cndmask_b32_e64 v15, -1, v1, s[4:5] +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v9, v[0:1] +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v8, v[13:14] +; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v18, v4 +; GISEL-NEXT: v_mul_lo_u32 v13, v8, v0 +; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v19, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 +; GISEL-NEXT: v_cndmask_b32_e32 v14, v18, v1, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v5, v19, v5, vcc +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v13 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc @@ -1839,74 +1839,74 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) { ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v1 -; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v0, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, 0 -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 -; GISEL-NEXT: v_cndmask_b32_e32 v11, v15, v5, vcc -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v6, v9, v[1:2] -; GISEL-NEXT: v_cndmask_b32_e32 v10, v14, v16, vcc -; GISEL-NEXT: v_xor_b32_e32 v1, v10, v7 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s6, v8, v[5:6] -; GISEL-NEXT: v_ashrrev_i32_e32 v10, 31, v3 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v10 -; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v10, vcc -; GISEL-NEXT: v_xor_b32_e32 v12, v2, v10 -; GISEL-NEXT: v_mul_lo_u32 v2, v9, v0 -; GISEL-NEXT: v_mul_lo_u32 v6, v8, v5 -; GISEL-NEXT: v_xor_b32_e32 v13, v3, v10 -; GISEL-NEXT: v_mul_hi_u32 v3, v8, v0 -; GISEL-NEXT: 
v_mul_hi_u32 v0, v9, v0 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v8, v1 +; GISEL-NEXT: v_addc_u32_e32 v11, vcc, v9, v0, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v10, 0 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v15 +; GISEL-NEXT: v_cndmask_b32_e32 v13, v17, v5, vcc +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v6, v11, v[1:2] +; GISEL-NEXT: v_cndmask_b32_e32 v12, v16, v14, vcc +; GISEL-NEXT: v_xor_b32_e32 v1, v12, v7 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s6, v10, v[8:9] +; GISEL-NEXT: v_ashrrev_i32_e32 v12, 31, v3 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v12 +; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v12, vcc +; GISEL-NEXT: v_xor_b32_e32 v9, v2, v12 +; GISEL-NEXT: v_mul_lo_u32 v2, v11, v0 +; GISEL-NEXT: v_mul_lo_u32 v6, v10, v5 +; GISEL-NEXT: v_xor_b32_e32 v14, v3, v12 +; GISEL-NEXT: v_mul_hi_u32 v3, v10, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v11, v0 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v9, v5 +; GISEL-NEXT: v_mul_lo_u32 v3, v11, v5 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v6, v2 -; GISEL-NEXT: v_mul_hi_u32 v6, v8, v5 +; GISEL-NEXT: v_mul_hi_u32 v6, v10, v5 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; GISEL-NEXT: v_mul_hi_u32 v5, v9, v5 +; GISEL-NEXT: v_mul_hi_u32 v5, v11, v5 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0 -; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v9, v2, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v13, v0 -; GISEL-NEXT: v_mul_lo_u32 v5, v12, v2 -; GISEL-NEXT: v_mul_hi_u32 v6, v12, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v13, v0 -; GISEL-NEXT: v_xor_b32_e32 v8, v11, v7 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0 +; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v11, v2, vcc +; GISEL-NEXT: v_mul_lo_u32 v3, v14, v0 +; GISEL-NEXT: v_mul_lo_u32 v5, v9, v2 +; GISEL-NEXT: v_mul_hi_u32 v6, v9, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v14, v0 +; GISEL-NEXT: v_xor_b32_e32 v8, v13, v7 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v6, v13, v2 +; GISEL-NEXT: v_mul_lo_u32 v6, v14, v2 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v5, v3 -; GISEL-NEXT: v_mul_hi_u32 v5, v12, v2 +; GISEL-NEXT: v_mul_hi_u32 v5, v9, v2 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v6, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v0, v3 -; GISEL-NEXT: v_mul_hi_u32 v6, v13, v2 -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v9, 0 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v0, v3 +; GISEL-NEXT: v_mul_hi_u32 v6, v14, v2 +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v10, 0 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v0 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v6, v0 ; GISEL-NEXT: v_mov_b32_e32 v0, v3 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v5, v[0:1] +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v11, 
v[0:1] ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v1, v7 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], 0, v9, v[5:6] ; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v8, v7, vcc -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v12, v2 -; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], v13, v5, vcc -; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], v13, v5 +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], 0, v10, v[5:6] +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v9, v2 +; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], v14, v7 +; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], v14, v7, vcc ; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc ; GISEL-NEXT: v_sub_i32_e32 v7, vcc, v2, v4 ; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc @@ -1926,10 +1926,10 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) { ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 ; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc -; GISEL-NEXT: v_xor_b32_e32 v2, v2, v10 -; GISEL-NEXT: v_xor_b32_e32 v3, v3, v10 -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v10 -; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v10, vcc +; GISEL-NEXT: v_xor_b32_e32 v2, v2, v12 +; GISEL-NEXT: v_xor_b32_e32 v3, v3, v12 +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v12 +; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v12, vcc ; GISEL-NEXT: s_setpc_b64 s[30:31] ; ; CGP-LABEL: v_srem_v2i64_oddk_denom: @@ -1948,110 +1948,110 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) { ; CGP-NEXT: v_cvt_u32_f32_e32 v8, v8 ; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v7, 0 ; CGP-NEXT: v_mov_b32_e32 v9, v5 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v8, v[9:10] -; CGP-NEXT: v_mul_hi_u32 v11, v7, v4 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v8, v[9:10] ; CGP-NEXT: v_mul_hi_u32 v12, v8, v4 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], -1, v7, v[9:10] +; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], -1, v7, v[10:11] ; CGP-NEXT: v_mul_lo_u32 v10, v8, v4 -; CGP-NEXT: v_mul_lo_u32 v4, v7, v9 -; CGP-NEXT: v_mul_lo_u32 v13, v8, v9 -; CGP-NEXT: v_mul_hi_u32 v14, v7, v9 -; CGP-NEXT: v_mul_hi_u32 v9, v8, v9 +; CGP-NEXT: v_mul_hi_u32 v11, v7, v4 +; CGP-NEXT: v_mul_lo_u32 v4, v7, v13 +; CGP-NEXT: v_mul_lo_u32 v9, v8, v13 +; CGP-NEXT: v_mul_hi_u32 v14, v7, v13 +; CGP-NEXT: v_mul_hi_u32 v13, v8, v13 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v4 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v11 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v15, v4 -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v12 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v12 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v13 -; CGP-NEXT: v_add_i32_e32 v16, vcc, v7, v4 -; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v16, 0 -; CGP-NEXT: v_addc_u32_e32 v17, vcc, v8, v9, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, v9, v4 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v9, vcc, v14, v9 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v13, v9 +; CGP-NEXT: v_add_i32_e32 v18, vcc, v7, v4 +; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v18, 0 +; CGP-NEXT: v_addc_u32_e32 v19, vcc, v8, v9, vcc ; CGP-NEXT: v_mov_b32_e32 v4, v14 -; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v17, v[4:5] -; CGP-NEXT: v_mul_lo_u32 v4, v17, v13 -; 
CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], -1, v16, v[14:15] -; CGP-NEXT: v_mul_lo_u32 v9, v16, v14 +; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v19, v[4:5] +; CGP-NEXT: v_mul_lo_u32 v4, v19, v13 +; CGP-NEXT: v_mad_u64_u32 v[16:17], s[4:5], -1, v18, v[14:15] +; CGP-NEXT: v_mul_lo_u32 v9, v18, v16 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 -; CGP-NEXT: v_mul_hi_u32 v9, v16, v13 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; CGP-NEXT: v_mul_hi_u32 v9, v18, v13 +; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_mul_hi_u32 v9, v17, v13 -; CGP-NEXT: v_mul_lo_u32 v13, v17, v14 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v15, v4 -; CGP-NEXT: v_mul_hi_u32 v15, v16, v14 +; CGP-NEXT: v_mul_hi_u32 v9, v19, v13 +; CGP-NEXT: v_mul_lo_u32 v13, v19, v16 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v14, v4 +; CGP-NEXT: v_mul_hi_u32 v14, v18, v16 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v13, v9 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v15, vcc, v9, v15 +; CGP-NEXT: v_add_i32_e32 v14, vcc, v9, v14 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v9 ; CGP-NEXT: v_ashrrev_i32_e32 v9, 31, v1 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v9 ; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc -; CGP-NEXT: v_xor_b32_e32 v18, v0, v9 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v15, v4 -; CGP-NEXT: v_mul_hi_u32 v4, v17, v14 -; CGP-NEXT: v_xor_b32_e32 v19, v1, v9 +; CGP-NEXT: v_xor_b32_e32 v15, v0, v9 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v14, v4 +; CGP-NEXT: v_mul_hi_u32 v4, v19, v16 +; CGP-NEXT: v_xor_b32_e32 v17, v1, v9 ; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v1, vcc, v13, v1 ; CGP-NEXT: v_add_i32_e32 v1, vcc, v4, v1 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v16, v0 -; CGP-NEXT: v_addc_u32_e32 v1, vcc, v17, v1, vcc -; CGP-NEXT: v_mul_lo_u32 v13, v19, v0 -; CGP-NEXT: v_mul_lo_u32 v14, v18, v1 -; CGP-NEXT: v_mul_hi_u32 v15, v18, v0 -; CGP-NEXT: v_mul_hi_u32 v0, v19, v0 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v18, v0 +; CGP-NEXT: v_addc_u32_e32 v1, vcc, v19, v1, vcc +; CGP-NEXT: v_mul_lo_u32 v13, v17, v0 +; CGP-NEXT: v_mul_lo_u32 v14, v15, v1 +; CGP-NEXT: v_mul_hi_u32 v16, v15, v0 +; CGP-NEXT: v_mul_hi_u32 v0, v17, v0 ; CGP-NEXT: v_mov_b32_e32 v4, 0x12d8fb ; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v15 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v16 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v15, v19, v1 +; CGP-NEXT: v_mul_lo_u32 v16, v17, v1 ; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; CGP-NEXT: v_mul_hi_u32 v14, v18, v1 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v15, v0 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; CGP-NEXT: v_mul_hi_u32 v14, v15, v1 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v16, v0 +; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v13 -; CGP-NEXT: v_mul_hi_u32 v15, v19, v1 -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v0, 0 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; CGP-NEXT: v_add_i32_e32 v13, vcc, v15, v13 -; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v13, v[1:2] -; CGP-NEXT: v_sub_i32_e32 v14, vcc, v18, v0 -; CGP-NEXT: v_sub_i32_e64 v0, s[4:5], v19, v13 -; CGP-NEXT: v_subb_u32_e64 v15, s[4:5], v19, v13, vcc +; CGP-NEXT: 
v_add_i32_e32 v14, vcc, v16, v14 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v0, v13 +; CGP-NEXT: v_mul_hi_u32 v18, v17, v1 +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v13, 0 +; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v16 +; CGP-NEXT: v_add_i32_e32 v16, vcc, v18, v13 +; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v16, v[1:2] +; CGP-NEXT: v_sub_i32_e32 v15, vcc, v15, v0 +; CGP-NEXT: v_sub_i32_e64 v0, s[4:5], v17, v13 +; CGP-NEXT: v_subb_u32_e64 v16, s[4:5], v17, v13, vcc ; CGP-NEXT: v_subbrev_u32_e32 v0, vcc, 0, v0, vcc -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v14, v4 -; CGP-NEXT: v_sub_i32_e32 v16, vcc, v14, v4 +; CGP-NEXT: v_sub_i32_e32 v18, vcc, v15, v4 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v15, v4 +; CGP-NEXT: v_subbrev_u32_e32 v19, vcc, 0, v0, vcc ; CGP-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v15 -; CGP-NEXT: v_subbrev_u32_e32 v17, vcc, 0, v0, vcc +; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v16 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v18, v4 +; CGP-NEXT: v_cndmask_b32_e64 v17, -1, v1, s[4:5] +; CGP-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc ; CGP-NEXT: v_mov_b32_e32 v0, v5 -; CGP-NEXT: v_cndmask_b32_e64 v13, -1, v1, s[4:5] -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, v[0:1] -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v16, v4 -; CGP-NEXT: v_cndmask_b32_e64 v18, 0, -1, vcc -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], -1, v7, v[0:1] -; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v17 -; CGP-NEXT: v_cndmask_b32_e32 v5, -1, v18, vcc -; CGP-NEXT: v_sub_i32_e32 v1, vcc, v16, v4 -; CGP-NEXT: v_subbrev_u32_e32 v18, vcc, 0, v17, vcc +; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v8, v[0:1] +; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19 +; CGP-NEXT: v_cndmask_b32_e32 v5, -1, v1, vcc +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], -1, v7, v[13:14] +; CGP-NEXT: v_sub_i32_e32 v1, vcc, v18, v4 +; CGP-NEXT: v_subbrev_u32_e32 v13, vcc, 0, v19, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 ; CGP-NEXT: v_mul_lo_u32 v5, v7, v0 -; CGP-NEXT: v_cndmask_b32_e32 v16, v16, v1, vcc -; CGP-NEXT: v_cndmask_b32_e32 v17, v17, v18, vcc +; CGP-NEXT: v_cndmask_b32_e32 v14, v18, v1, vcc +; CGP-NEXT: v_cndmask_b32_e32 v13, v19, v13, vcc ; CGP-NEXT: v_add_i32_e32 v1, vcc, v10, v5 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v11 @@ -2069,72 +2069,72 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) { ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v5 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v1 -; CGP-NEXT: v_addc_u32_e32 v8, vcc, v8, v0, vcc -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v7, 0 -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 -; CGP-NEXT: v_cndmask_b32_e32 v5, v14, v16, vcc -; CGP-NEXT: v_xor_b32_e32 v11, v5, v9 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v6, v8, v[1:2] -; CGP-NEXT: v_cndmask_b32_e32 v10, v15, v17, vcc -; CGP-NEXT: v_xor_b32_e32 v1, v10, v9 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], -1, v7, v[5:6] -; CGP-NEXT: v_ashrrev_i32_e32 v10, 31, v3 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v10 -; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v10, vcc -; CGP-NEXT: v_xor_b32_e32 v12, v2, v10 -; CGP-NEXT: v_mul_lo_u32 v2, v8, v0 -; CGP-NEXT: v_mul_lo_u32 v6, v7, v5 -; CGP-NEXT: v_xor_b32_e32 v13, v3, v10 -; CGP-NEXT: v_mul_hi_u32 v3, v7, v0 -; CGP-NEXT: v_mul_hi_u32 v0, v8, v0 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v7, v1 +; CGP-NEXT: v_addc_u32_e32 v11, vcc, v8, v0, vcc +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v10, 0 +; CGP-NEXT: v_cmp_ne_u32_e32 
vcc, 0, v17 +; CGP-NEXT: v_cndmask_b32_e32 v5, v15, v14, vcc +; CGP-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v6, v11, v[1:2] +; CGP-NEXT: v_cndmask_b32_e32 v12, v16, v13, vcc +; CGP-NEXT: v_xor_b32_e32 v13, v5, v9 +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], -1, v10, v[7:8] +; CGP-NEXT: v_xor_b32_e32 v1, v12, v9 +; CGP-NEXT: v_ashrrev_i32_e32 v12, 31, v3 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v12 +; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v12, vcc +; CGP-NEXT: v_xor_b32_e32 v7, v2, v12 +; CGP-NEXT: v_mul_lo_u32 v2, v11, v0 +; CGP-NEXT: v_mul_lo_u32 v6, v10, v5 +; CGP-NEXT: v_xor_b32_e32 v8, v3, v12 +; CGP-NEXT: v_mul_hi_u32 v3, v10, v0 +; CGP-NEXT: v_mul_hi_u32 v0, v11, v0 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v6 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v3, v8, v5 +; CGP-NEXT: v_mul_lo_u32 v3, v11, v5 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v6, v2 -; CGP-NEXT: v_mul_hi_u32 v6, v7, v5 +; CGP-NEXT: v_mul_hi_u32 v6, v10, v5 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v6 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; CGP-NEXT: v_mul_hi_u32 v5, v8, v5 +; CGP-NEXT: v_mul_hi_u32 v5, v11, v5 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v7, v0 -; CGP-NEXT: v_addc_u32_e32 v2, vcc, v8, v2, vcc -; CGP-NEXT: v_mul_lo_u32 v5, v13, v3 -; CGP-NEXT: v_mul_lo_u32 v6, v12, v2 -; CGP-NEXT: v_mul_hi_u32 v7, v12, v3 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v11, v9 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v10, v0 +; CGP-NEXT: v_addc_u32_e32 v2, vcc, v11, v2, vcc +; CGP-NEXT: v_mul_lo_u32 v5, v8, v3 +; CGP-NEXT: v_mul_lo_u32 v6, v7, v2 +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v13, v9 ; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc +; CGP-NEXT: v_mul_hi_u32 v9, v7, v3 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v9 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v7, v13, v2 -; CGP-NEXT: v_mul_hi_u32 v3, v13, v3 +; CGP-NEXT: v_mul_lo_u32 v9, v8, v2 +; CGP-NEXT: v_mul_hi_u32 v3, v8, v3 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CGP-NEXT: v_mul_hi_u32 v6, v12, v2 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v7, v3 -; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CGP-NEXT: v_mul_hi_u32 v6, v7, v2 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v9, v3 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v6 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v5 -; CGP-NEXT: v_mul_hi_u32 v7, v13, v2 -; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v3, 0 -; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v5, v[3:4] -; CGP-NEXT: v_sub_i32_e32 v2, vcc, v12, v2 -; CGP-NEXT: v_subb_u32_e64 v3, s[4:5], v13, v5, vcc -; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v13, v5 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v9, v6 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v3, v5 +; CGP-NEXT: v_mul_hi_u32 v10, v8, v2 +; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v5, 0 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v5, 
vcc, v6, v9 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v5 +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v9, v[3:4] +; CGP-NEXT: v_sub_i32_e32 v2, vcc, v7, v2 +; CGP-NEXT: v_subb_u32_e64 v3, s[4:5], v8, v5, vcc +; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v8, v5 ; CGP-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc ; CGP-NEXT: v_sub_i32_e32 v7, vcc, v2, v4 ; CGP-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc @@ -2154,10 +2154,10 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) { ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 ; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; CGP-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc -; CGP-NEXT: v_xor_b32_e32 v2, v2, v10 -; CGP-NEXT: v_xor_b32_e32 v3, v3, v10 -; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v10 -; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v10, vcc +; CGP-NEXT: v_xor_b32_e32 v2, v2, v12 +; CGP-NEXT: v_xor_b32_e32 v3, v3, v12 +; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v12 +; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v12, vcc ; CGP-NEXT: s_setpc_b64 s[30:31] %result = srem <2 x i64> %num, <i64 1235195, i64 1235195> ret <2 x i64> %result @@ -2193,130 +2193,130 @@ define i64 @v_srem_i64_pow2_shl_denom(i64 %x, i64 %y) { ; CHECK-NEXT: v_xor_b32_e32 v1, v2, v1 ; CHECK-NEXT: v_cvt_f32_u32_e32 v2, v0 ; CHECK-NEXT: v_cvt_f32_u32_e32 v5, v1 -; CHECK-NEXT: v_sub_i32_e32 v9, vcc, 0, v0 -; CHECK-NEXT: v_subb_u32_e32 v10, vcc, 0, v1, vcc +; CHECK-NEXT: v_sub_i32_e32 v11, vcc, 0, v0 +; CHECK-NEXT: v_subb_u32_e32 v12, vcc, 0, v1, vcc ; CHECK-NEXT: v_mac_f32_e32 v2, 0x4f800000, v5 ; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; CHECK-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 ; CHECK-NEXT: v_mul_f32_e32 v5, 0x2f800000, v2 ; CHECK-NEXT: v_trunc_f32_e32 v7, v5 ; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v7 -; CHECK-NEXT: v_cvt_u32_f32_e32 v8, v2 -; CHECK-NEXT: v_cvt_u32_f32_e32 v11, v7 -; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v9, v8, 0 +; CHECK-NEXT: v_cvt_u32_f32_e32 v10, v2 +; CHECK-NEXT: v_cvt_u32_f32_e32 v13, v7 +; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v11, v10, 0 ; CHECK-NEXT: v_mov_b32_e32 v2, v6 -; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v11, v[2:3] -; CHECK-NEXT: v_mul_lo_u32 v2, v11, v5 -; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v8, v[6:7] -; CHECK-NEXT: v_mul_hi_u32 v7, v8, v5 -; CHECK-NEXT: v_mul_hi_u32 v5, v11, v5 -; CHECK-NEXT: v_mul_lo_u32 v12, v8, v6 -; CHECK-NEXT: v_mul_lo_u32 v13, v11, v6 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v12 -; CHECK-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v11, v13, v[2:3] +; CHECK-NEXT: v_mul_lo_u32 v2, v13, v5 +; CHECK-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v10, v[6:7] +; CHECK-NEXT: v_mul_hi_u32 v6, v10, v5 +; CHECK-NEXT: v_mul_hi_u32 v5, v13, v5 +; CHECK-NEXT: v_mul_lo_u32 v7, v10, v8 +; CHECK-NEXT: v_mul_lo_u32 v9, v13, v8 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v7 -; CHECK-NEXT: v_mul_hi_u32 v7, v8, v6 +; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v6 +; CHECK-NEXT: v_mul_hi_u32 v6, v10, v8 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v12, v2 -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v13, v5 -; CHECK-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v7, v2 +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v9, v5 ; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v7, vcc, v12, v7 -; CHECK-NEXT: v_mul_hi_u32 v6, v11, v6 +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6 +; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; CHECK-NEXT: 
v_add_i32_e32 v6, vcc, v7, v6 +; CHECK-NEXT: v_mul_hi_u32 v7, v13, v8 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v5, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v7, v5 ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v2 -; CHECK-NEXT: v_addc_u32_e32 v11, vcc, v11, v5, vcc -; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v9, v8, 0 +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v7, v5 +; CHECK-NEXT: v_add_i32_e32 v10, vcc, v10, v2 +; CHECK-NEXT: v_addc_u32_e32 v13, vcc, v13, v5, vcc +; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v11, v10, 0 ; CHECK-NEXT: v_mov_b32_e32 v2, v6 -; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v11, v[2:3] -; CHECK-NEXT: v_ashrrev_i32_e32 v9, 31, v4 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v9 -; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v8, v[6:7] -; CHECK-NEXT: v_addc_u32_e32 v3, vcc, v4, v9, vcc -; CHECK-NEXT: v_xor_b32_e32 v7, v2, v9 -; CHECK-NEXT: v_mul_lo_u32 v2, v11, v5 -; CHECK-NEXT: v_mul_lo_u32 v4, v8, v6 -; CHECK-NEXT: v_xor_b32_e32 v10, v3, v9 -; CHECK-NEXT: v_mul_hi_u32 v3, v8, v5 -; CHECK-NEXT: v_mul_hi_u32 v5, v11, v5 +; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v11, v13, v[2:3] +; CHECK-NEXT: v_ashrrev_i32_e32 v11, 31, v4 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v11 +; CHECK-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v10, v[6:7] +; CHECK-NEXT: v_addc_u32_e32 v3, vcc, v4, v11, vcc +; CHECK-NEXT: v_xor_b32_e32 v9, v2, v11 +; CHECK-NEXT: v_mul_lo_u32 v2, v13, v5 +; CHECK-NEXT: v_mul_lo_u32 v4, v10, v8 +; CHECK-NEXT: v_xor_b32_e32 v12, v3, v11 +; CHECK-NEXT: v_mul_hi_u32 v3, v10, v5 +; CHECK-NEXT: v_mul_hi_u32 v5, v13, v5 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v3, v11, v6 +; CHECK-NEXT: v_mul_lo_u32 v3, v13, v8 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v4, v2 -; CHECK-NEXT: v_mul_hi_u32 v4, v8, v6 +; CHECK-NEXT: v_mul_hi_u32 v4, v10, v8 ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4 ; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; CHECK-NEXT: v_mul_hi_u32 v5, v11, v6 +; CHECK-NEXT: v_mul_hi_u32 v5, v13, v8 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v3 ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v5, v3 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v8, v2 -; CHECK-NEXT: v_addc_u32_e32 v3, vcc, v11, v3, vcc -; CHECK-NEXT: v_mul_lo_u32 v4, v10, v2 -; CHECK-NEXT: v_mul_lo_u32 v5, v7, v3 -; CHECK-NEXT: v_mul_hi_u32 v6, v7, v2 -; CHECK-NEXT: v_mul_hi_u32 v2, v10, v2 -; CHECK-NEXT: v_mul_hi_u32 v8, v10, v3 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v10, v2 +; CHECK-NEXT: v_addc_u32_e32 v3, vcc, v13, v3, vcc +; CHECK-NEXT: v_mul_lo_u32 v4, v12, v2 +; CHECK-NEXT: v_mul_lo_u32 v5, v9, v3 +; CHECK-NEXT: v_mul_hi_u32 v6, v9, v2 +; CHECK-NEXT: v_mul_hi_u32 v2, v12, v2 ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v6, v10, v3 +; CHECK-NEXT: v_mul_lo_u32 v6, v12, v3 ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; CHECK-NEXT: v_mul_hi_u32 v5, v7, v3 +; CHECK-NEXT: v_mul_hi_u32 v5, v9, v3 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v6, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CHECK-NEXT: 
v_add_i32_e32 v2, vcc, v2, v5 ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v2, v4 -; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v0, v6, 0 +; CHECK-NEXT: v_add_i32_e32 v8, vcc, v2, v4 +; CHECK-NEXT: v_mul_hi_u32 v6, v12, v3 +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v0, v8, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v8, v4 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v0, v4, v[3:4] -; CHECK-NEXT: v_sub_i32_e32 v2, vcc, v7, v2 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v1, v6, v[3:4] -; CHECK-NEXT: v_subb_u32_e64 v4, s[4:5], v10, v3, vcc -; CHECK-NEXT: v_sub_i32_e64 v3, s[4:5], v10, v3 -; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v1 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v4 +; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v0, v6, v[3:4] +; CHECK-NEXT: v_sub_i32_e32 v2, vcc, v9, v2 +; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v1, v8, v[4:5] +; CHECK-NEXT: v_subb_u32_e64 v3, s[4:5], v12, v6, vcc +; CHECK-NEXT: v_sub_i32_e64 v4, s[4:5], v12, v6 +; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v1 ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5] ; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v0 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] -; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v1 -; CHECK-NEXT: v_subb_u32_e32 v3, vcc, v3, v1, vcc +; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v1 +; CHECK-NEXT: v_subb_u32_e32 v4, vcc, v4, v1, vcc ; CHECK-NEXT: v_cndmask_b32_e64 v5, v5, v6, s[4:5] ; CHECK-NEXT: v_sub_i32_e32 v6, vcc, v2, v0 -; CHECK-NEXT: v_subbrev_u32_e64 v7, s[4:5], 0, v3, vcc +; CHECK-NEXT: v_subbrev_u32_e64 v7, s[4:5], 0, v4, vcc ; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v1 ; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] ; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v0 -; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] +; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] ; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v1 -; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc +; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v4, v1, vcc ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v6, v0 -; CHECK-NEXT: v_cndmask_b32_e64 v8, v8, v10, s[4:5] +; CHECK-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[4:5] ; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 ; CHECK-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc ; CHECK-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 ; CHECK-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; CHECK-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc -; CHECK-NEXT: v_xor_b32_e32 v0, v0, v9 -; CHECK-NEXT: v_xor_b32_e32 v1, v1, v9 -; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v9 -; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; CHECK-NEXT: v_xor_b32_e32 v0, v0, v11 +; CHECK-NEXT: v_xor_b32_e32 v1, v1, v11 +; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v11 +; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v11, vcc ; CHECK-NEXT: ; implicit-def: $vgpr5_vgpr6 ; CHECK-NEXT: ; implicit-def: $vgpr3 ; CHECK-NEXT: s_andn2_saveexec_b64 s[4:5], s[6:7] @@ -2361,85 +2361,85 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; GISEL-NEXT: v_xor_b32_e32 v7, v10, v7 ; GISEL-NEXT: v_cvt_f32_u32_e32 v4, v5 ; GISEL-NEXT: v_cvt_f32_u32_e32 v10, v7 -; GISEL-NEXT: v_sub_i32_e32 v14, vcc, 0, v5 -; GISEL-NEXT: v_subb_u32_e32 v15, vcc, 0, v7, vcc +; GISEL-NEXT: v_sub_i32_e32 v16, vcc, 0, v5 +; GISEL-NEXT: v_subb_u32_e32 v17, vcc, 0, v7, vcc ; GISEL-NEXT: 
v_mac_f32_e32 v4, 0x4f800000, v10 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4 ; GISEL-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 ; GISEL-NEXT: v_mul_f32_e32 v10, 0x2f800000, v4 ; GISEL-NEXT: v_trunc_f32_e32 v12, v10 ; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v12 -; GISEL-NEXT: v_cvt_u32_f32_e32 v13, v4 -; GISEL-NEXT: v_cvt_u32_f32_e32 v16, v12 -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v13, 0 +; GISEL-NEXT: v_cvt_u32_f32_e32 v15, v4 +; GISEL-NEXT: v_cvt_u32_f32_e32 v18, v12 +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v16, v15, 0 ; GISEL-NEXT: v_mov_b32_e32 v4, v11 -; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v14, v16, v[4:5] -; GISEL-NEXT: v_mul_lo_u32 v4, v16, v10 -; GISEL-NEXT: v_mul_hi_u32 v17, v13, v10 -; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v13, v[11:12] -; GISEL-NEXT: v_mul_hi_u32 v10, v16, v10 -; GISEL-NEXT: v_mul_lo_u32 v12, v13, v11 +; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v16, v18, v[4:5] +; GISEL-NEXT: v_mul_lo_u32 v4, v18, v10 +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v17, v15, v[11:12] +; GISEL-NEXT: v_mul_hi_u32 v12, v15, v10 +; GISEL-NEXT: v_mul_hi_u32 v10, v18, v10 +; GISEL-NEXT: v_mul_lo_u32 v11, v15, v13 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v17 ; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v17, v16, v11 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v12, v4 -; GISEL-NEXT: v_mul_hi_u32 v12, v13, v11 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v17, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12 +; GISEL-NEXT: v_mul_lo_u32 v12, v18, v13 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v11, v4 +; GISEL-NEXT: v_mul_hi_u32 v11, v15, v13 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v12, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v17, v12 -; GISEL-NEXT: v_mul_hi_u32 v11, v16, v11 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 +; GISEL-NEXT: v_mul_hi_u32 v12, v18, v13 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v10, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v12, v10 ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v4 -; GISEL-NEXT: v_addc_u32_e32 v16, vcc, v16, v10, vcc -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v13, 0 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v12, v10 +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v4 +; GISEL-NEXT: v_addc_u32_e32 v18, vcc, v18, v10, vcc +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v16, v15, 0 ; GISEL-NEXT: v_mov_b32_e32 v4, v11 -; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v14, v16, v[4:5] +; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v16, v18, v[4:5] ; GISEL-NEXT: v_ashrrev_i32_e32 v4, 31, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4 -; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v13, v[11:12] +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v17, v15, v[11:12] ; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc ; GISEL-NEXT: v_xor_b32_e32 v12, v0, v4 -; GISEL-NEXT: v_mul_lo_u32 v0, v16, v10 -; GISEL-NEXT: v_mul_lo_u32 v14, v13, v11 -; GISEL-NEXT: v_xor_b32_e32 v15, v1, v4 -; GISEL-NEXT: v_mul_hi_u32 v1, v13, v10 -; GISEL-NEXT: v_mul_hi_u32 v10, v16, v10 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc 
+; GISEL-NEXT: v_mul_lo_u32 v0, v18, v10 +; GISEL-NEXT: v_mul_lo_u32 v11, v15, v13 +; GISEL-NEXT: v_xor_b32_e32 v16, v1, v4 +; GISEL-NEXT: v_mul_hi_u32 v1, v15, v10 +; GISEL-NEXT: v_mul_hi_u32 v10, v18, v10 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v1, v16, v11 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v14, v0 -; GISEL-NEXT: v_mul_hi_u32 v14, v13, v11 +; GISEL-NEXT: v_mul_lo_u32 v1, v18, v13 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v11, v0 +; GISEL-NEXT: v_mul_hi_u32 v11, v15, v13 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v14 -; GISEL-NEXT: v_mul_hi_u32 v11, v16, v11 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11 +; GISEL-NEXT: v_mul_hi_u32 v11, v18, v13 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v11, v1 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v13, v0 -; GISEL-NEXT: v_addc_u32_e32 v11, vcc, v16, v1, vcc -; GISEL-NEXT: v_mul_lo_u32 v13, v15, v10 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v15, v0 +; GISEL-NEXT: v_addc_u32_e32 v11, vcc, v18, v1, vcc +; GISEL-NEXT: v_mul_lo_u32 v13, v16, v10 ; GISEL-NEXT: v_mul_lo_u32 v14, v12, v11 ; GISEL-NEXT: v_lshl_b64 v[0:1], v[8:9], v6 ; GISEL-NEXT: v_mul_hi_u32 v6, v12, v10 -; GISEL-NEXT: v_mul_hi_u32 v10, v15, v10 +; GISEL-NEXT: v_mul_hi_u32 v10, v16, v10 ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v13, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v6, vcc, v8, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v8, v15, v11 +; GISEL-NEXT: v_mul_lo_u32 v8, v16, v11 ; GISEL-NEXT: v_add_i32_e32 v6, vcc, v9, v6 ; GISEL-NEXT: v_mul_hi_u32 v9, v12, v11 ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v10 @@ -2448,127 +2448,127 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v8, v6 -; GISEL-NEXT: v_mul_hi_u32 v8, v15, v11 +; GISEL-NEXT: v_mul_hi_u32 v8, v16, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v6, vcc, v9, v6 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v8, v6 +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v8, v6 ; GISEL-NEXT: v_ashrrev_i32_e32 v8, 31, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v8 ; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v8, vcc ; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v5, v13, 0 ; GISEL-NEXT: v_xor_b32_e32 v6, v0, v8 ; GISEL-NEXT: v_xor_b32_e32 v8, v1, v8 -; GISEL-NEXT: v_cvt_f32_u32_e32 v14, v6 -; GISEL-NEXT: v_cvt_f32_u32_e32 v16, v8 +; GISEL-NEXT: v_cvt_f32_u32_e32 v1, v6 +; GISEL-NEXT: v_cvt_f32_u32_e32 v15, v8 ; GISEL-NEXT: v_mov_b32_e32 v0, v10 -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v11, v[0:1] -; GISEL-NEXT: v_mac_f32_e32 v14, 0x4f800000, v16 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v10, v14 -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v7, v13, v[0:1] -; GISEL-NEXT: v_sub_i32_e32 v16, vcc, 0, v6 -; GISEL-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v10 +; GISEL-NEXT: v_sub_i32_e32 v17, vcc, 0, v6 +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v5, 
v14, v[0:1] +; GISEL-NEXT: v_mac_f32_e32 v1, 0x4f800000, v15 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v14, v1 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v7, v13, v[10:11] +; GISEL-NEXT: v_subb_u32_e32 v18, vcc, 0, v8, vcc +; GISEL-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v14 ; GISEL-NEXT: v_mul_f32_e32 v10, 0x2f800000, v1 ; GISEL-NEXT: v_trunc_f32_e32 v13, v10 ; GISEL-NEXT: v_mac_f32_e32 v1, 0xcf800000, v13 -; GISEL-NEXT: v_cvt_u32_f32_e32 v14, v1 -; GISEL-NEXT: v_cvt_u32_f32_e32 v13, v13 -; GISEL-NEXT: v_subb_u32_e32 v17, vcc, 0, v8, vcc -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v16, v14, 0 +; GISEL-NEXT: v_cvt_u32_f32_e32 v15, v1 +; GISEL-NEXT: v_cvt_u32_f32_e32 v19, v13 ; GISEL-NEXT: v_sub_i32_e32 v9, vcc, v12, v9 +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v17, v15, 0 +; GISEL-NEXT: v_subb_u32_e64 v20, s[4:5], v16, v0, vcc ; GISEL-NEXT: v_mov_b32_e32 v1, v11 -; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v16, v13, v[1:2] -; GISEL-NEXT: v_mul_lo_u32 v1, v13, v10 -; GISEL-NEXT: v_subb_u32_e64 v18, s[4:5], v15, v0, vcc -; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v17, v14, v[11:12] -; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v15, v0 -; GISEL-NEXT: v_mul_lo_u32 v12, v14, v11 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v18, v7 +; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v17, v19, v[1:2] +; GISEL-NEXT: v_mul_lo_u32 v1, v19, v10 +; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v16, v0 +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v18, v15, v[11:12] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v20, v7 ; GISEL-NEXT: v_subb_u32_e32 v0, vcc, v0, v7, vcc -; GISEL-NEXT: v_add_i32_e64 v1, s[4:5], v1, v12 -; GISEL-NEXT: v_mul_hi_u32 v12, v14, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v1, s[4:5], v1, v12 +; GISEL-NEXT: v_mul_lo_u32 v11, v15, v13 +; GISEL-NEXT: v_add_i32_e64 v1, s[4:5], v1, v11 +; GISEL-NEXT: v_mul_hi_u32 v11, v15, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v1, s[4:5], v1, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[6:7] ; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v9, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[6:7] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], v18, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v12, v1, v12, s[6:7] +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[6:7] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], v20, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v11, v1, v11, s[6:7] ; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v9, v5 -; GISEL-NEXT: v_subbrev_u32_e64 v19, s[6:7], 0, v0, vcc +; GISEL-NEXT: v_subbrev_u32_e64 v14, s[6:7], 0, v0, vcc ; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v1, v5 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v19, v7 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v14, v7 ; GISEL-NEXT: v_subb_u32_e32 v0, vcc, v0, v7, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, -1, s[8:9] +; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, -1, s[8:9] ; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, -1, s[6:7] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], v19, v7 +; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], v14, v7 ; GISEL-NEXT: v_sub_i32_e32 v5, vcc, v1, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v20, v20, v21, s[6:7] +; GISEL-NEXT: v_cndmask_b32_e64 v16, v16, v21, s[6:7] ; GISEL-NEXT: v_subbrev_u32_e32 v0, vcc, 0, v0, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 ; GISEL-NEXT: v_cndmask_b32_e32 v5, v1, v5, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v7, v19, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v7, v14, v0, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] -; GISEL-NEXT: v_mul_hi_u32 v1, v13, v10 -; GISEL-NEXT: v_mul_lo_u32 v10, 
v13, v11 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v0 -; GISEL-NEXT: v_mul_hi_u32 v15, v14, v11 +; GISEL-NEXT: v_mul_hi_u32 v1, v19, v10 +; GISEL-NEXT: v_mul_lo_u32 v10, v19, v13 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v12, v0 +; GISEL-NEXT: v_mul_hi_u32 v12, v15, v13 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v15 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v15 -; GISEL-NEXT: v_mul_hi_u32 v11, v13, v11 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12 +; GISEL-NEXT: v_mul_hi_u32 v12, v19, v13 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v11, v1 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v14, v0 -; GISEL-NEXT: v_addc_u32_e32 v13, vcc, v13, v1, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v16, v11, 0 -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v12, v1 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v15, v0 +; GISEL-NEXT: v_addc_u32_e32 v14, vcc, v19, v1, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v17, v13, 0 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 ; GISEL-NEXT: v_cndmask_b32_e32 v5, v9, v5, vcc -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v16, v13, v[1:2] +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v17, v14, v[1:2] ; GISEL-NEXT: v_xor_b32_e32 v1, v5, v4 ; GISEL-NEXT: v_ashrrev_i32_e32 v5, 31, v3 -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v17, v11, v[9:10] -; GISEL-NEXT: v_cndmask_b32_e32 v7, v18, v7, vcc +; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v18, v13, v[9:10] +; GISEL-NEXT: v_cndmask_b32_e32 v7, v20, v7, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v5 ; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v5, vcc ; GISEL-NEXT: v_xor_b32_e32 v12, v2, v5 -; GISEL-NEXT: v_mul_lo_u32 v2, v13, v0 -; GISEL-NEXT: v_mul_lo_u32 v10, v11, v9 -; GISEL-NEXT: v_xor_b32_e32 v14, v3, v5 -; GISEL-NEXT: v_mul_hi_u32 v3, v11, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v13, v0 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v2, v14, v0 +; GISEL-NEXT: v_mul_lo_u32 v9, v13, v11 +; GISEL-NEXT: v_xor_b32_e32 v15, v3, v5 +; GISEL-NEXT: v_mul_hi_u32 v3, v13, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v14, v0 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v13, v9 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v10, v2 -; GISEL-NEXT: v_mul_hi_u32 v10, v11, v9 +; GISEL-NEXT: v_mul_lo_u32 v3, v14, v11 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v9, v2 +; GISEL-NEXT: v_mul_hi_u32 v9, v13, v11 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v10 -; GISEL-NEXT: v_mul_hi_u32 v9, v13, v9 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v9 +; GISEL-NEXT: v_mul_hi_u32 v9, v14, v11 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v9, v2 -; GISEL-NEXT: 
v_add_i32_e32 v0, vcc, v11, v0 -; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v13, v2, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v14, v0 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v13, v0 +; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v14, v2, vcc +; GISEL-NEXT: v_mul_lo_u32 v3, v15, v0 ; GISEL-NEXT: v_mul_lo_u32 v9, v12, v2 ; GISEL-NEXT: v_mul_hi_u32 v10, v12, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v14, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v15, v0 ; GISEL-NEXT: v_xor_b32_e32 v7, v7, v4 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v10, v14, v2 +; GISEL-NEXT: v_mul_lo_u32 v10, v15, v2 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v9, v3 ; GISEL-NEXT: v_mul_hi_u32 v9, v12, v2 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0 @@ -2577,19 +2577,19 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v0, v3 -; GISEL-NEXT: v_mul_hi_u32 v10, v14, v2 +; GISEL-NEXT: v_mul_hi_u32 v10, v15, v2 ; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v11, 0 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v0 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v10, v0 ; GISEL-NEXT: v_mov_b32_e32 v0, v3 -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v9, v[0:1] +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v13, v[0:1] ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v1, v4 ; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v7, v4, vcc ; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v8, v11, v[9:10] ; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v12, v2 -; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v14, v3, vcc -; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v14, v3 +; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v15, v3, vcc +; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v15, v3 ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v6 @@ -2645,103 +2645,103 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP-NEXT: v_xor_b32_e32 v1, v4, v1 ; CGP-NEXT: v_cvt_f32_u32_e32 v4, v0 ; CGP-NEXT: v_cvt_f32_u32_e32 v10, v1 -; CGP-NEXT: v_sub_i32_e32 v14, vcc, 0, v0 -; CGP-NEXT: v_subb_u32_e32 v15, vcc, 0, v1, vcc +; CGP-NEXT: v_sub_i32_e32 v16, vcc, 0, v0 +; CGP-NEXT: v_subb_u32_e32 v17, vcc, 0, v1, vcc ; CGP-NEXT: v_mac_f32_e32 v4, 0x4f800000, v10 ; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v4 ; CGP-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 ; CGP-NEXT: v_mul_f32_e32 v10, 0x2f800000, v4 ; CGP-NEXT: v_trunc_f32_e32 v12, v10 ; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v12 -; CGP-NEXT: v_cvt_u32_f32_e32 v13, v4 -; CGP-NEXT: v_cvt_u32_f32_e32 v16, v12 -; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v13, 0 +; CGP-NEXT: v_cvt_u32_f32_e32 v15, v4 +; CGP-NEXT: v_cvt_u32_f32_e32 v18, v12 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v16, v15, 0 ; CGP-NEXT: v_mov_b32_e32 v4, v11 -; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v14, v16, v[4:5] -; CGP-NEXT: v_mul_lo_u32 v4, v16, v10 -; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v13, v[11:12] -; CGP-NEXT: v_mul_hi_u32 v12, v13, v10 -; CGP-NEXT: v_mul_hi_u32 v10, v16, v10 -; CGP-NEXT: v_mul_lo_u32 v17, v13, v11 -; CGP-NEXT: v_mul_lo_u32 v18, v16, v11 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v17 -; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc +; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v16, v18, v[4:5] 
+; CGP-NEXT: v_mul_lo_u32 v4, v18, v10 +; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v17, v15, v[11:12] +; CGP-NEXT: v_mul_hi_u32 v11, v15, v10 +; CGP-NEXT: v_mul_hi_u32 v10, v18, v10 +; CGP-NEXT: v_mul_lo_u32 v12, v15, v13 +; CGP-NEXT: v_mul_lo_u32 v14, v18, v13 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v12 -; CGP-NEXT: v_mul_hi_u32 v12, v13, v11 +; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v11 +; CGP-NEXT: v_mul_hi_u32 v11, v15, v13 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v17, v4 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v18, v10 -; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v12 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v12, v4 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v14, v10 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v12, vcc, v17, v12 -; CGP-NEXT: v_mul_hi_u32 v11, v16, v11 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v11 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11 +; CGP-NEXT: v_mul_hi_u32 v12, v18, v13 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v4 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v10, vcc, v12, v10 ; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v4 -; CGP-NEXT: v_addc_u32_e32 v16, vcc, v16, v10, vcc -; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v13, 0 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v12, v10 +; CGP-NEXT: v_add_i32_e32 v15, vcc, v15, v4 +; CGP-NEXT: v_addc_u32_e32 v18, vcc, v18, v10, vcc +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v16, v15, 0 ; CGP-NEXT: v_mov_b32_e32 v4, v11 -; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v14, v16, v[4:5] -; CGP-NEXT: v_ashrrev_i32_e32 v14, 31, v9 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v14 -; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v13, v[11:12] -; CGP-NEXT: v_addc_u32_e32 v8, vcc, v9, v14, vcc -; CGP-NEXT: v_xor_b32_e32 v12, v4, v14 -; CGP-NEXT: v_mul_lo_u32 v4, v16, v10 -; CGP-NEXT: v_mul_lo_u32 v9, v13, v11 -; CGP-NEXT: v_xor_b32_e32 v15, v8, v14 -; CGP-NEXT: v_mul_hi_u32 v8, v13, v10 -; CGP-NEXT: v_mul_hi_u32 v10, v16, v10 +; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v16, v18, v[4:5] +; CGP-NEXT: v_ashrrev_i32_e32 v16, 31, v9 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v16 +; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v17, v15, v[11:12] +; CGP-NEXT: v_addc_u32_e32 v8, vcc, v9, v16, vcc +; CGP-NEXT: v_xor_b32_e32 v14, v4, v16 +; CGP-NEXT: v_mul_lo_u32 v4, v18, v10 +; CGP-NEXT: v_mul_lo_u32 v9, v15, v13 +; CGP-NEXT: v_xor_b32_e32 v17, v8, v16 +; CGP-NEXT: v_mul_hi_u32 v8, v15, v10 +; CGP-NEXT: v_mul_hi_u32 v10, v18, v10 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v8 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v8, v16, v11 +; CGP-NEXT: v_mul_lo_u32 v8, v18, v13 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v9, v4 -; CGP-NEXT: v_mul_hi_u32 v9, v13, v11 +; CGP-NEXT: v_mul_hi_u32 v9, v15, v13 ; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v10 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9 -; CGP-NEXT: v_mul_hi_u32 v10, v16, v11 +; CGP-NEXT: v_mul_hi_u32 v10, v18, v13 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v4 ; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8 ; CGP-NEXT: v_add_i32_e32 v8, vcc, v10, v8 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4 -; 
CGP-NEXT: v_addc_u32_e32 v8, vcc, v16, v8, vcc -; CGP-NEXT: v_mul_lo_u32 v9, v15, v4 -; CGP-NEXT: v_mul_lo_u32 v10, v12, v8 -; CGP-NEXT: v_mul_hi_u32 v11, v12, v4 -; CGP-NEXT: v_mul_hi_u32 v4, v15, v4 -; CGP-NEXT: v_mul_hi_u32 v13, v15, v8 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v15, v4 +; CGP-NEXT: v_addc_u32_e32 v8, vcc, v18, v8, vcc +; CGP-NEXT: v_mul_lo_u32 v9, v17, v4 +; CGP-NEXT: v_mul_lo_u32 v10, v14, v8 +; CGP-NEXT: v_mul_hi_u32 v11, v14, v4 +; CGP-NEXT: v_mul_hi_u32 v4, v17, v4 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v10 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v11 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v11, v15, v8 +; CGP-NEXT: v_mul_lo_u32 v11, v17, v8 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9 -; CGP-NEXT: v_mul_hi_u32 v10, v12, v8 +; CGP-NEXT: v_mul_hi_u32 v10, v14, v8 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v11, v4 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v10 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; CGP-NEXT: v_add_i32_e32 v11, vcc, v4, v9 -; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v0, v11, 0 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v4, v9 +; CGP-NEXT: v_mul_hi_u32 v11, v17, v8 +; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v0, v13, 0 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v4 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v13, v4 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v4 ; CGP-NEXT: v_mov_b32_e32 v4, v9 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v0, v10, v[4:5] -; CGP-NEXT: v_sub_i32_e32 v4, vcc, v12, v8 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v1, v11, v[9:10] -; CGP-NEXT: v_subb_u32_e64 v8, s[4:5], v15, v9, vcc -; CGP-NEXT: v_sub_i32_e64 v9, s[4:5], v15, v9 +; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v0, v11, v[4:5] +; CGP-NEXT: v_sub_i32_e32 v4, vcc, v14, v8 +; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v1, v13, v[9:10] +; CGP-NEXT: v_subb_u32_e64 v8, s[4:5], v17, v11, vcc +; CGP-NEXT: v_sub_i32_e64 v9, s[4:5], v17, v11 ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v1 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v0 @@ -2754,11 +2754,11 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v12, v1 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v11, v0 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[4:5] +; CGP-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v12, v1 ; CGP-NEXT: v_subb_u32_e32 v1, vcc, v9, v1, vcc ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v11, v0 -; CGP-NEXT: v_cndmask_b32_e64 v13, v13, v15, s[4:5] +; CGP-NEXT: v_cndmask_b32_e64 v13, v13, v14, s[4:5] ; CGP-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 ; CGP-NEXT: v_cndmask_b32_e32 v0, v11, v0, vcc @@ -2766,10 +2766,10 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 ; CGP-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; CGP-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc -; CGP-NEXT: v_xor_b32_e32 v0, v0, v14 -; CGP-NEXT: v_xor_b32_e32 v1, v1, v14 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v14 -; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v14, vcc +; CGP-NEXT: v_xor_b32_e32 v0, v0, v16 +; CGP-NEXT: v_xor_b32_e32 v1, v1, v16 +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v16 +; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v16, vcc ; CGP-NEXT: ; implicit-def: $vgpr11_vgpr12 ; CGP-NEXT: ; 
implicit-def: $vgpr8 ; CGP-NEXT: .LBB8_2: ; %Flow1 @@ -2819,117 +2819,117 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP-NEXT: v_xor_b32_e32 v3, v4, v3 ; CGP-NEXT: v_cvt_f32_u32_e32 v4, v2 ; CGP-NEXT: v_cvt_f32_u32_e32 v6, v3 -; CGP-NEXT: v_sub_i32_e32 v12, vcc, 0, v2 -; CGP-NEXT: v_subb_u32_e32 v13, vcc, 0, v3, vcc +; CGP-NEXT: v_sub_i32_e32 v14, vcc, 0, v2 +; CGP-NEXT: v_subb_u32_e32 v15, vcc, 0, v3, vcc ; CGP-NEXT: v_mac_f32_e32 v4, 0x4f800000, v6 ; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v4 ; CGP-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 ; CGP-NEXT: v_mul_f32_e32 v6, 0x2f800000, v4 ; CGP-NEXT: v_trunc_f32_e32 v6, v6 ; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v6 -; CGP-NEXT: v_cvt_u32_f32_e32 v11, v4 +; CGP-NEXT: v_cvt_u32_f32_e32 v13, v4 ; CGP-NEXT: v_cvt_u32_f32_e32 v6, v6 -; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v11, 0 +; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v14, v13, 0 ; CGP-NEXT: v_mov_b32_e32 v4, v9 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v12, v6, v[4:5] +; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v14, v6, v[4:5] ; CGP-NEXT: v_mul_lo_u32 v4, v6, v8 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v11, v[9:10] -; CGP-NEXT: v_mul_hi_u32 v10, v11, v8 +; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v13, v[9:10] +; CGP-NEXT: v_mul_hi_u32 v9, v13, v8 ; CGP-NEXT: v_mul_hi_u32 v8, v6, v8 -; CGP-NEXT: v_mul_lo_u32 v14, v11, v9 -; CGP-NEXT: v_mul_lo_u32 v15, v6, v9 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v14 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; CGP-NEXT: v_mul_lo_u32 v10, v13, v11 +; CGP-NEXT: v_mul_lo_u32 v12, v6, v11 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v10 -; CGP-NEXT: v_mul_hi_u32 v10, v11, v9 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 +; CGP-NEXT: v_mul_hi_u32 v9, v13, v11 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v14, v4 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v15, v8 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v10 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v4 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v12, v8 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v10, vcc, v14, v10 -; CGP-NEXT: v_mul_hi_u32 v9, v6, v9 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9 +; CGP-NEXT: v_mul_hi_u32 v10, v6, v11 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v4 ; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v8, vcc, v10, v8 ; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v4 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v10, v8 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v4 ; CGP-NEXT: v_addc_u32_e32 v6, vcc, v6, v8, vcc -; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v11, 0 +; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v14, v13, 0 ; CGP-NEXT: v_mov_b32_e32 v4, v9 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v12, v6, v[4:5] -; CGP-NEXT: v_ashrrev_i32_e32 v12, 31, v7 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v5, v12 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v11, v[9:10] -; CGP-NEXT: v_addc_u32_e32 v5, vcc, v7, v12, vcc -; CGP-NEXT: v_xor_b32_e32 v7, v4, v12 +; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v14, v6, v[4:5] +; CGP-NEXT: v_ashrrev_i32_e32 v14, 31, v7 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v5, v14 +; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v13, v[9:10] +; CGP-NEXT: v_addc_u32_e32 v5, vcc, v7, v14, vcc +; CGP-NEXT: v_xor_b32_e32 v10, v4, v14 ; CGP-NEXT: v_mul_lo_u32 v4, v6, v8 -; 
CGP-NEXT: v_mul_lo_u32 v10, v11, v9 -; CGP-NEXT: v_xor_b32_e32 v13, v5, v12 -; CGP-NEXT: v_mul_hi_u32 v5, v11, v8 +; CGP-NEXT: v_mul_lo_u32 v7, v13, v11 +; CGP-NEXT: v_xor_b32_e32 v12, v5, v14 +; CGP-NEXT: v_mul_hi_u32 v5, v13, v8 ; CGP-NEXT: v_mul_hi_u32 v8, v6, v8 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v10 -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v7 +; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v5, v6, v9 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v4 -; CGP-NEXT: v_mul_hi_u32 v10, v11, v9 +; CGP-NEXT: v_mul_lo_u32 v5, v6, v11 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v7, v4 +; CGP-NEXT: v_mul_hi_u32 v7, v13, v11 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v8 ; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v10 -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v10 -; CGP-NEXT: v_mul_hi_u32 v9, v6, v9 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7 +; CGP-NEXT: v_mul_hi_u32 v8, v6, v11 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v5, v4 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v8, v5 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v9, v5 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v11, v4 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4 ; CGP-NEXT: v_addc_u32_e32 v5, vcc, v6, v5, vcc -; CGP-NEXT: v_mul_lo_u32 v6, v13, v4 -; CGP-NEXT: v_mul_lo_u32 v8, v7, v5 -; CGP-NEXT: v_mul_hi_u32 v9, v7, v4 -; CGP-NEXT: v_mul_hi_u32 v4, v13, v4 -; CGP-NEXT: v_mul_hi_u32 v10, v13, v5 +; CGP-NEXT: v_mul_lo_u32 v6, v12, v4 +; CGP-NEXT: v_mul_lo_u32 v7, v10, v5 +; CGP-NEXT: v_mul_hi_u32 v8, v10, v4 +; CGP-NEXT: v_mul_hi_u32 v4, v12, v4 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v7 +; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v8 -; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v9 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v9, v13, v5 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v8, v6 -; CGP-NEXT: v_mul_hi_u32 v8, v7, v5 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v9, v4 -; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v8 +; CGP-NEXT: v_mul_lo_u32 v8, v12, v5 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; CGP-NEXT: v_mul_hi_u32 v7, v10, v5 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v4 ; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v4, v6 -; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v9, 0 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v7 +; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v4, v6 +; CGP-NEXT: v_mul_hi_u32 v8, v12, v5 +; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v11, 0 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v8, v6 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v10, v6 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v2, v6, v[5:6] -; CGP-NEXT: v_sub_i32_e32 v4, vcc, v7, v4 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v3, v9, v[5:6] -; CGP-NEXT: v_subb_u32_e64 v6, s[4:5], v13, v5, vcc -; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v13, v5 -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v3 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v6 +; CGP-NEXT: 
v_mad_u64_u32 v[6:7], s[4:5], v2, v8, v[5:6] +; CGP-NEXT: v_sub_i32_e32 v4, vcc, v10, v4 +; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v3, v11, v[6:7] +; CGP-NEXT: v_subb_u32_e64 v5, s[4:5], v12, v8, vcc +; CGP-NEXT: v_sub_i32_e64 v6, s[4:5], v12, v8 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v3 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v2 ; CGP-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v6, v3 -; CGP-NEXT: v_subb_u32_e32 v5, vcc, v5, v3, vcc +; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v5, v3 +; CGP-NEXT: v_subb_u32_e32 v6, vcc, v6, v3, vcc ; CGP-NEXT: v_cndmask_b32_e64 v7, v7, v8, s[4:5] ; CGP-NEXT: v_sub_i32_e32 v8, vcc, v4, v2 -; CGP-NEXT: v_subbrev_u32_e64 v9, s[4:5], 0, v5, vcc +; CGP-NEXT: v_subbrev_u32_e64 v9, s[4:5], 0, v6, vcc ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v3 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v2 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v9, v3 -; CGP-NEXT: v_subb_u32_e32 v3, vcc, v5, v3, vcc +; CGP-NEXT: v_subb_u32_e32 v3, vcc, v6, v3, vcc ; CGP-NEXT: v_sub_i32_e32 v2, vcc, v8, v2 ; CGP-NEXT: v_cndmask_b32_e64 v10, v10, v11, s[4:5] ; CGP-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc @@ -2938,11 +2938,11 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 ; CGP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; CGP-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc -; CGP-NEXT: v_xor_b32_e32 v2, v2, v12 -; CGP-NEXT: v_xor_b32_e32 v3, v3, v12 -; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v12 -; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v12, vcc +; CGP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; CGP-NEXT: v_xor_b32_e32 v2, v2, v14 +; CGP-NEXT: v_xor_b32_e32 v3, v3, v14 +; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v14 +; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v14, vcc ; CGP-NEXT: ; implicit-def: $vgpr9_vgpr10 ; CGP-NEXT: ; implicit-def: $vgpr5 ; CGP-NEXT: s_andn2_saveexec_b64 s[4:5], s[6:7] @@ -3004,15 +3004,15 @@ define i64 @v_srem_i64_24bit(i64 %num, i64 %den) { ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CGP-NEXT: v_and_b32_e32 v3, 0xffffff, v2 ; CGP-NEXT: v_cvt_f32_u32_e32 v1, v3 -; CGP-NEXT: v_and_b32_e32 v5, 0xffffff, v0 ; CGP-NEXT: v_rcp_f32_e32 v1, v1 ; CGP-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; CGP-NEXT: v_cvt_u32_f32_e32 v4, v1 ; CGP-NEXT: v_sub_i32_e32 v1, vcc, 0, v3 -; CGP-NEXT: v_mul_lo_u32 v1, v1, v4 -; CGP-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v4, v1, 0 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v4, v2 -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v0, 0 +; CGP-NEXT: v_mul_lo_u32 v5, v1, v4 +; CGP-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v4, v5, 0 +; CGP-NEXT: v_and_b32_e32 v5, 0xffffff, v0 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v4, v2 +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v2, 0 ; CGP-NEXT: v_mul_lo_u32 v0, v1, v3 ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v5, v0 ; CGP-NEXT: v_sub_i32_e32 v1, vcc, v0, v3 @@ -3035,196 +3035,196 @@ define <2 x i64> @v_srem_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: v_and_b32_e32 v1, 0xffffff, v4 ; GISEL-NEXT: v_cvt_f32_u32_e32 v3, v1 -; GISEL-NEXT: v_cvt_f32_ubyte0_e32 v7, 0 -; GISEL-NEXT: v_sub_i32_e32 v9, vcc, 0, v1 -; GISEL-NEXT: v_mac_f32_e32 v3, 0x4f800000, v7 +; GISEL-NEXT: v_cvt_f32_ubyte0_e32 v9, 0 +; GISEL-NEXT: v_sub_i32_e32 v11, vcc, 0, v1 +; GISEL-NEXT: v_mac_f32_e32 v3, 0x4f800000, v9 ; GISEL-NEXT: 
v_rcp_iflag_f32_e32 v3, v3 -; GISEL-NEXT: v_subb_u32_e64 v10, s[4:5], 0, 0, vcc +; GISEL-NEXT: v_subb_u32_e64 v12, s[4:5], 0, 0, vcc ; GISEL-NEXT: v_and_b32_e32 v2, 0xffffff, v2 ; GISEL-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v3 ; GISEL-NEXT: v_mul_f32_e32 v4, 0x2f800000, v3 ; GISEL-NEXT: v_trunc_f32_e32 v5, v4 ; GISEL-NEXT: v_mac_f32_e32 v3, 0xcf800000, v5 -; GISEL-NEXT: v_cvt_u32_f32_e32 v8, v3 -; GISEL-NEXT: v_cvt_u32_f32_e32 v11, v5 -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v8, 0 -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v11, v[4:5] -; GISEL-NEXT: v_mul_hi_u32 v12, v8, v3 -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v8, v[4:5] -; GISEL-NEXT: v_mul_lo_u32 v5, v11, v3 -; GISEL-NEXT: v_mul_hi_u32 v3, v11, v3 -; GISEL-NEXT: v_mul_lo_u32 v13, v8, v4 -; GISEL-NEXT: v_mul_lo_u32 v14, v11, v4 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v12 -; GISEL-NEXT: v_mul_hi_u32 v12, v8, v4 +; GISEL-NEXT: v_cvt_u32_f32_e32 v10, v3 +; GISEL-NEXT: v_cvt_u32_f32_e32 v13, v5 +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v11, v10, 0 +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v13, v[4:5] +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v12, v10, v[7:8] +; GISEL-NEXT: v_mul_lo_u32 v5, v13, v3 +; GISEL-NEXT: v_mul_hi_u32 v7, v10, v3 +; GISEL-NEXT: v_mul_lo_u32 v8, v10, v4 +; GISEL-NEXT: v_mul_hi_u32 v3, v13, v3 +; GISEL-NEXT: v_mul_lo_u32 v14, v13, v4 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; GISEL-NEXT: v_mul_hi_u32 v7, v10, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v13, v5 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v8, v5 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v14, v3 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12 -; GISEL-NEXT: v_mul_hi_u32 v4, v11, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7 +; GISEL-NEXT: v_mul_hi_u32 v4, v13, v4 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v12, v5 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v3 -; GISEL-NEXT: v_addc_u32_e32 v11, vcc, v11, v4, vcc -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v8, 0 -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v11, v[4:5] -; GISEL-NEXT: v_mul_hi_u32 v9, v8, v3 -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v8, v[4:5] -; GISEL-NEXT: v_and_b32_e32 v10, 0xffffff, v0 -; GISEL-NEXT: v_mul_lo_u32 v0, v11, v3 -; GISEL-NEXT: v_mul_lo_u32 v5, v8, v4 -; GISEL-NEXT: v_mul_hi_u32 v3, v11, v3 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v3 +; GISEL-NEXT: v_addc_u32_e32 v13, vcc, v13, v4, vcc +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v11, v10, 0 +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v13, v[4:5] +; GISEL-NEXT: v_and_b32_e32 v11, 0xffffff, v0 +; GISEL-NEXT: v_mul_lo_u32 v0, v13, v3 +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v12, v10, v[7:8] +; GISEL-NEXT: v_mul_hi_u32 v7, v10, v3 +; GISEL-NEXT: v_mul_hi_u32 v3, v13, v3 +; GISEL-NEXT: v_mul_lo_u32 v5, v10, v4 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc 
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v9, v11, v4 +; GISEL-NEXT: v_mul_lo_u32 v7, v13, v4 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0 -; GISEL-NEXT: v_mul_hi_u32 v5, v8, v4 -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v9, v3 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_mul_hi_u32 v5, v10, v4 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v7, v3 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v9, v5 -; GISEL-NEXT: v_mul_hi_u32 v4, v11, v4 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5 +; GISEL-NEXT: v_mul_hi_u32 v4, v13, v4 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v5, v3 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0 -; GISEL-NEXT: v_addc_u32_e32 v8, vcc, v11, v3, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0 +; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v13, v3, vcc ; GISEL-NEXT: v_mul_lo_u32 v4, 0, v0 -; GISEL-NEXT: v_mul_lo_u32 v5, v10, v8 +; GISEL-NEXT: v_mul_lo_u32 v5, v11, v7 ; GISEL-NEXT: v_and_b32_e32 v3, 0xffffff, v6 -; GISEL-NEXT: v_mul_hi_u32 v6, v10, v0 +; GISEL-NEXT: v_mul_hi_u32 v6, v11, v0 ; GISEL-NEXT: v_mul_hi_u32 v0, 0, v0 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; GISEL-NEXT: v_mul_lo_u32 v5, 0, v8 +; GISEL-NEXT: v_mul_lo_u32 v5, 0, v7 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v6 -; GISEL-NEXT: v_mul_hi_u32 v6, v10, v8 +; GISEL-NEXT: v_mul_hi_u32 v6, v11, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v6 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v0, v4 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v0, v4 ; GISEL-NEXT: v_cvt_f32_u32_e32 v0, v3 -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, v9, 0 -; GISEL-NEXT: v_mul_hi_u32 v6, 0, v8 -; GISEL-NEXT: v_mac_f32_e32 v0, 0x4f800000, v7 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v7, v0 +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, v12, 0 +; GISEL-NEXT: v_mul_hi_u32 v7, 0, v7 +; GISEL-NEXT: v_mac_f32_e32 v0, 0x4f800000, v9 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v8, v0 ; GISEL-NEXT: v_mov_b32_e32 v0, v5 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v1, v6, v[0:1] -; GISEL-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v7 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v1, v7, v[0:1] +; GISEL-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v8 ; GISEL-NEXT: v_mul_f32_e32 v7, 0x2f800000, v0 -; GISEL-NEXT: v_trunc_f32_e32 v11, v7 -; GISEL-NEXT: v_mac_f32_e32 v0, 0xcf800000, v11 -; GISEL-NEXT: v_cvt_u32_f32_e32 v12, v0 -; GISEL-NEXT: v_sub_i32_e32 v13, vcc, 0, v3 -; GISEL-NEXT: v_cvt_u32_f32_e32 v11, v11 -; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v13, v12, 0 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], 0, v9, v[5:6] +; GISEL-NEXT: v_trunc_f32_e32 v9, v7 +; GISEL-NEXT: v_mac_f32_e32 v0, 0xcf800000, v9 +; GISEL-NEXT: v_cvt_u32_f32_e32 v13, v0 +; GISEL-NEXT: v_sub_i32_e32 v14, vcc, 0, v3 +; GISEL-NEXT: v_cvt_u32_f32_e32 v16, v9 +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v14, v13, 0 +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], 0, v12, v[5:6] ; GISEL-NEXT: v_mov_b32_e32 v0, v8 -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v11, v[0:1] -; GISEL-NEXT: v_subb_u32_e64 v14, s[4:5], 0, 0, vcc -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v14, v12, v[8:9] -; GISEL-NEXT: v_sub_i32_e32 v10, vcc, v10, v4 -; GISEL-NEXT: 
v_subb_u32_e64 v15, s[4:5], 0, v5, vcc -; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], 0, v5 -; GISEL-NEXT: v_mul_lo_u32 v4, v11, v7 -; GISEL-NEXT: v_mul_lo_u32 v5, v12, v8 -; GISEL-NEXT: v_mul_hi_u32 v9, v12, v7 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v1 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v14, v16, v[0:1] +; GISEL-NEXT: v_subb_u32_e64 v15, s[4:5], 0, 0, vcc +; GISEL-NEXT: v_sub_i32_e32 v12, vcc, v11, v4 +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v15, v13, v[5:6] +; GISEL-NEXT: v_mul_lo_u32 v4, v16, v7 +; GISEL-NEXT: v_mul_hi_u32 v8, v13, v7 +; GISEL-NEXT: v_mul_lo_u32 v5, v13, v10 +; GISEL-NEXT: v_subb_u32_e64 v17, s[4:5], 0, v9, vcc +; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], 0, v9 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v12, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v4, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v4, v9 +; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v4, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[4:5] -; GISEL-NEXT: v_mul_lo_u32 v9, v11, v8 -; GISEL-NEXT: v_mul_hi_u32 v7, v11, v7 +; GISEL-NEXT: v_mul_lo_u32 v8, v16, v10 +; GISEL-NEXT: v_mul_hi_u32 v7, v16, v7 ; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v5, v4 -; GISEL-NEXT: v_mul_hi_u32 v5, v12, v8 -; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v9, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] +; GISEL-NEXT: v_mul_hi_u32 v5, v13, v10 +; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v8, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v7, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v9, v7 -; GISEL-NEXT: v_mul_hi_u32 v8, v11, v8 +; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v8, v7 +; GISEL-NEXT: v_mul_hi_u32 v8, v16, v10 ; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v5, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v7, v5 ; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v8, v5 -; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v12, v4 -; GISEL-NEXT: v_addc_u32_e64 v8, s[4:5], v11, v5, s[4:5] -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v13, v7, 0 -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v15 -; GISEL-NEXT: v_subbrev_u32_e32 v11, vcc, 0, v0, vcc +; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v13, v4 +; GISEL-NEXT: v_addc_u32_e64 v10, s[4:5], v16, v5, s[4:5] +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v14, v9, 0 +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v17 +; GISEL-NEXT: v_subbrev_u32_e32 v7, vcc, 0, v0, vcc ; GISEL-NEXT: v_mov_b32_e32 v0, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v9, -1, v6, s[4:5] -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v8, v[0:1] -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v10, v1 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v14, v7, v[5:6] -; GISEL-NEXT: v_mul_lo_u32 v12, v8, v4 -; GISEL-NEXT: v_subbrev_u32_e32 v11, vcc, 0, v11, vcc -; GISEL-NEXT: v_mul_lo_u32 v13, v7, v5 -; GISEL-NEXT: v_mul_hi_u32 v14, v7, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v11, -1, v6, s[4:5] +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v14, v10, v[0:1] +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v12, v1 +; GISEL-NEXT: v_subbrev_u32_e32 v13, vcc, 0, v7, vcc +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v15, v9, v[5:6] ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v11 -; GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v14 -; 
GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v14, v8, v5 -; GISEL-NEXT: v_mul_hi_u32 v4, v8, v4 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12 -; GISEL-NEXT: v_mul_hi_u32 v13, v7, v5 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v14, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; GISEL-NEXT: v_mul_hi_u32 v5, v8, v5 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v12 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v7, v4 -; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v8, v5, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc +; GISEL-NEXT: v_mul_lo_u32 v6, v10, v4 +; GISEL-NEXT: v_mul_lo_u32 v8, v9, v7 +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v13 +; GISEL-NEXT: v_cndmask_b32_e32 v14, -1, v5, vcc +; GISEL-NEXT: v_mul_hi_u32 v5, v9, v4 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v6, v10, v7 +; GISEL-NEXT: v_mul_hi_u32 v4, v10, v4 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v8, v5 +; GISEL-NEXT: v_mul_hi_u32 v8, v9, v7 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v6, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v8 +; GISEL-NEXT: v_mul_hi_u32 v7, v10, v7 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v9, v4 +; GISEL-NEXT: v_addc_u32_e32 v6, vcc, v10, v5, vcc ; GISEL-NEXT: v_mul_lo_u32 v5, 0, v4 -; GISEL-NEXT: v_mul_lo_u32 v8, v2, v7 -; GISEL-NEXT: v_mul_hi_u32 v13, v2, v4 +; GISEL-NEXT: v_mul_lo_u32 v7, v2, v6 +; GISEL-NEXT: v_mul_hi_u32 v9, v2, v4 ; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v0, v1 -; GISEL-NEXT: v_subbrev_u32_e32 v12, vcc, 0, v11, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8 -; GISEL-NEXT: v_mul_lo_u32 v8, 0, v7 +; GISEL-NEXT: v_subbrev_u32_e32 v8, vcc, 0, v13, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; GISEL-NEXT: v_mul_lo_u32 v7, 0, v6 ; GISEL-NEXT: v_mul_hi_u32 v4, 0, v4 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v13 -; GISEL-NEXT: v_mul_hi_u32 v13, v2, v7 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v9 +; GISEL-NEXT: v_mul_hi_u32 v9, v2, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v8, v4 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v13 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v4, v5 -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v3, v8, 0 -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; GISEL-NEXT: v_mul_hi_u32 v6, 0, v7 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v7, v4 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v9 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v4, v5 +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v3, v9, 0 +; GISEL-NEXT: v_mul_hi_u32 v7, 0, v6 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 ; GISEL-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc ; GISEL-NEXT: v_mov_b32_e32 v0, v5 -; GISEL-NEXT: v_cndmask_b32_e32 v7, v11, v12, vcc -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v3, v6, v[0:1] -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 -; GISEL-NEXT: v_cndmask_b32_e32 v0, v10, v1, vcc -; GISEL-NEXT: v_mad_u64_u32 
v[5:6], s[4:5], 0, v8, v[5:6] -; GISEL-NEXT: v_cndmask_b32_e32 v1, v15, v7, vcc +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v3, v7, v[0:1] +; GISEL-NEXT: v_cndmask_b32_e32 v8, v13, v8, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; GISEL-NEXT: v_cndmask_b32_e32 v0, v12, v1, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v1, v17, v8, vcc +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], 0, v9, v[5:6] ; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v4 -; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], 0, v5, vcc -; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], 0, v5 +; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], 0, v7 +; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], 0, v7, vcc ; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc ; GISEL-NEXT: v_sub_i32_e32 v7, vcc, v2, v3 ; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc @@ -3264,15 +3264,15 @@ define <2 x i64> @v_srem_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v6, 0 ; CGP-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v7 ; CGP-NEXT: v_cvt_u32_f32_e32 v6, v0 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v5, v1 -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v0, 0 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v1 +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v5, 0 ; CGP-NEXT: v_sub_i32_e32 v0, vcc, 0, v4 -; CGP-NEXT: v_mul_lo_u32 v0, v0, v6 -; CGP-NEXT: v_mul_lo_u32 v5, v1, v3 -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v0, 0 -; CGP-NEXT: v_sub_i32_e32 v5, vcc, v8, v5 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v6, v1 -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v0, 0 +; CGP-NEXT: v_mul_lo_u32 v5, v0, v6 +; CGP-NEXT: v_mul_lo_u32 v7, v1, v3 +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v5, 0 +; CGP-NEXT: v_sub_i32_e32 v5, vcc, v8, v7 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v1 +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v6, 0 ; CGP-NEXT: v_sub_i32_e32 v7, vcc, v5, v3 ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v5, v3 ; CGP-NEXT: v_mul_lo_u32 v6, v1, v4 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll index 9e412b6c7cd0a..23ef596c021c2 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll @@ -132,65 +132,64 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX8-NEXT: v_trunc_f32_e32 v2, v1 ; GFX8-NEXT: v_mul_f32_e32 v1, 0xcf800000, v2 ; GFX8-NEXT: v_add_f32_e32 v0, v1, v0 -; GFX8-NEXT: v_cvt_u32_f32_e32 v3, v0 -; GFX8-NEXT: v_cvt_u32_f32_e32 v4, v2 -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v3, 0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v4, v[1:2] -; GFX8-NEXT: v_mul_hi_u32 v5, v3, v0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v3, v[1:2] -; GFX8-NEXT: v_mul_lo_u32 v2, v4, v0 -; GFX8-NEXT: v_mul_hi_u32 v0, v4, v0 -; GFX8-NEXT: v_mul_lo_u32 v6, v3, v1 -; GFX8-NEXT: v_mul_lo_u32 v7, v4, v1 -; GFX8-NEXT: v_mul_hi_u32 v8, v3, v1 -; GFX8-NEXT: v_mul_hi_u32 v1, v4, v1 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v6 -; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v7, v0 -; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5 -; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v8 +; GFX8-NEXT: v_cvt_u32_f32_e32 v6, v0 +; GFX8-NEXT: v_cvt_u32_f32_e32 v7, v2 +; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v6, 0 +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v7, v[1:2] +; GFX8-NEXT: v_mul_lo_u32 v1, v7, v0 +; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s3, v6, v[2:3] +; GFX8-NEXT: v_mul_hi_u32 v2, v6, v0 +; GFX8-NEXT: 
v_mul_hi_u32 v0, v7, v0 +; GFX8-NEXT: v_mul_lo_u32 v3, v6, v4 +; GFX8-NEXT: v_mul_lo_u32 v5, v7, v4 +; GFX8-NEXT: v_mul_hi_u32 v8, v6, v4 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3 +; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v5, v0 ; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v6, v2 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, v7, v5 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v8 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2 +; GFX8-NEXT: v_mul_hi_u32 v3, v7, v4 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v0 +; GFX8-NEXT: v_addc_u32_e32 v7, vcc, v7, v1, vcc +; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v6, 0 +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v7, v[1:2] +; GFX8-NEXT: v_mul_lo_u32 v1, v7, v0 +; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s3, v6, v[2:3] +; GFX8-NEXT: v_mul_hi_u32 v3, v6, v0 +; GFX8-NEXT: v_mul_hi_u32 v0, v7, v0 +; GFX8-NEXT: v_mul_lo_u32 v2, v6, v4 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v0 -; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v4, v1, vcc -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v3, 0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v4, v[1:2] -; GFX8-NEXT: v_mul_hi_u32 v6, v3, v0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v3, v[1:2] -; GFX8-NEXT: v_mul_lo_u32 v2, v4, v0 -; GFX8-NEXT: v_mul_hi_u32 v0, v4, v0 -; GFX8-NEXT: v_mul_lo_u32 v5, v3, v1 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5 -; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v6 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_mul_lo_u32 v6, v4, v1 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2 -; GFX8-NEXT: v_mul_hi_u32 v5, v3, v1 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v6, v0 -; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v5 -; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v5, vcc, v6, v5 -; GFX8-NEXT: v_mul_hi_u32 v1, v4, v1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8-NEXT: v_mul_lo_u32 v3, v7, v4 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1 +; GFX8-NEXT: v_mul_hi_u32 v2, v6, v4 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v3, v0 +; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2 -; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v3, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 +; GFX8-NEXT: v_mul_hi_u32 v3, v7, v4 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v6, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v7, v1, vcc ; GFX8-NEXT: v_mul_lo_u32 v2, s9, v0 ; GFX8-NEXT: v_mul_lo_u32 v3, s8, v1 ; GFX8-NEXT: v_mul_hi_u32 v4, s8, v0 ; GFX8-NEXT: v_mul_hi_u32 v0, s9, v0 -; GFX8-NEXT: v_mul_hi_u32 v5, s9, v1 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v3 ; GFX8-NEXT: v_cndmask_b32_e64 
v3, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v4 @@ -203,54 +202,55 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v3 ; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v4, v3 -; GFX8-NEXT: v_add_u32_e32 v4, vcc, v0, v2 -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s10, v4, 0 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v0, v2 +; GFX8-NEXT: v_mul_hi_u32 v4, s9, v1 +; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s10, v6, 0 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v5, v2 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s10, v3, v[1:2] -; GFX8-NEXT: v_sub_u32_e32 v6, vcc, s8, v0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s11, v4, v[1:2] +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v4, v2 +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s10, v7, v[1:2] +; GFX8-NEXT: v_mov_b32_e32 v1, s11 +; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s11, v6, v[2:3] ; GFX8-NEXT: v_mov_b32_e32 v2, s9 -; GFX8-NEXT: v_mov_b32_e32 v5, s11 -; GFX8-NEXT: v_subb_u32_e64 v7, s[0:1], v2, v1, vcc -; GFX8-NEXT: v_sub_u32_e64 v0, s[0:1], s9, v1 -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s11, v7 -; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[0:1] -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s10, v6 +; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s8, v0 +; GFX8-NEXT: v_subb_u32_e64 v5, s[0:1], v2, v4, vcc +; GFX8-NEXT: v_sub_u32_e64 v0, s[0:1], s9, v4 +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s11, v5 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[0:1] -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s11, v7 -; GFX8-NEXT: v_subb_u32_e32 v0, vcc, v0, v5, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[0:1] -; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s10, v6 +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s10, v3 +; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1] +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s11, v5 +; GFX8-NEXT: v_subb_u32_e32 v0, vcc, v0, v1, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[0:1] +; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, s10, v3 ; GFX8-NEXT: v_subbrev_u32_e64 v8, s[0:1], 0, v0, vcc -; GFX8-NEXT: v_add_u32_e64 v9, s[0:1], 1, v4 -; GFX8-NEXT: v_addc_u32_e64 v10, s[0:1], 0, v3, s[0:1] +; GFX8-NEXT: v_add_u32_e64 v9, s[0:1], 1, v6 +; GFX8-NEXT: v_addc_u32_e64 v10, s[0:1], 0, v7, s[0:1] ; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s11, v8 ; GFX8-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[0:1] -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s10, v2 -; GFX8-NEXT: v_subb_u32_e32 v0, vcc, v0, v5, vcc +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s10, v4 +; GFX8-NEXT: v_subb_u32_e32 v0, vcc, v0, v1, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[0:1] ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s11, v8 -; GFX8-NEXT: v_subrev_u32_e32 v5, vcc, s10, v2 +; GFX8-NEXT: v_subrev_u32_e32 v14, vcc, s10, v4 ; GFX8-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[0:1] ; GFX8-NEXT: v_add_u32_e64 v12, s[0:1], 1, v9 -; GFX8-NEXT: v_subbrev_u32_e32 v14, vcc, 0, v0, vcc +; GFX8-NEXT: v_subbrev_u32_e32 v15, vcc, 0, v0, vcc ; GFX8-NEXT: v_addc_u32_e64 v13, s[0:1], 0, v10, s[0:1] ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v14, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v4, v8, v15, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v0, v9, v12, vcc -; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v1 -; GFX8-NEXT: v_cndmask_b32_e32 v9, v10, v13, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v4, v0, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v10, v13, vcc +; 
GFX8-NEXT: v_cndmask_b32_e64 v2, v3, v2, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v3, v5, v4, s[0:1] ; GFX8-NEXT: v_mov_b32_e32 v4, s4 -; GFX8-NEXT: v_cndmask_b32_e64 v1, v3, v9, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v0, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v1, v7, v1, s[0:1] ; GFX8-NEXT: v_mov_b32_e32 v5, s5 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v8, v14, vcc ; GFX8-NEXT: flat_store_dwordx2 v[4:5], v[0:1] ; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v3, s[0:1] ; GFX8-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX8-NEXT: s_endpgm @@ -271,63 +271,64 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX9-NEXT: v_trunc_f32_e32 v2, v1 ; GFX9-NEXT: v_mul_f32_e32 v1, 0xcf800000, v2 ; GFX9-NEXT: v_add_f32_e32 v0, v1, v0 -; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v0 -; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v2 -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v3, 0 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v4, v[1:2] -; GFX9-NEXT: v_mul_hi_u32 v5, v3, v0 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v3, v[1:2] -; GFX9-NEXT: v_mul_lo_u32 v2, v4, v0 -; GFX9-NEXT: v_mul_hi_u32 v0, v4, v0 -; GFX9-NEXT: v_mul_lo_u32 v6, v3, v1 -; GFX9-NEXT: v_mul_lo_u32 v7, v4, v1 -; GFX9-NEXT: v_mul_hi_u32 v8, v3, v1 -; GFX9-NEXT: v_mul_hi_u32 v1, v4, v1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6 -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v7, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v8 -; GFX9-NEXT: v_add_u32_e32 v2, v6, v2 -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 -; GFX9-NEXT: v_add_u32_e32 v5, v7, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_add3_u32 v1, v5, v2, v1 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v1, vcc -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v3, 0 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v4, v[1:2] -; GFX9-NEXT: v_mul_hi_u32 v6, v3, v0 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v3, v[1:2] -; GFX9-NEXT: v_mul_lo_u32 v2, v4, v0 -; GFX9-NEXT: v_mul_hi_u32 v0, v4, v0 -; GFX9-NEXT: v_mul_lo_u32 v5, v3, v1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5 +; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v0 +; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v2 +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v6, 0 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v7, v[1:2] +; GFX9-NEXT: v_mul_lo_u32 v1, v7, v0 +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s3, v6, v[2:3] +; GFX9-NEXT: v_mul_hi_u32 v2, v6, v0 +; GFX9-NEXT: v_mul_hi_u32 v0, v7, v0 +; GFX9-NEXT: v_mul_lo_u32 v3, v6, v4 +; GFX9-NEXT: v_mul_lo_u32 v5, v7, v4 +; GFX9-NEXT: v_mul_hi_u32 v8, v6, v4 +; GFX9-NEXT: v_mul_hi_u32 v4, v7, v4 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v5, v0 ; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v8 +; GFX9-NEXT: v_add_u32_e32 v1, v3, v1 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_mul_lo_u32 v6, v4, v1 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 ; GFX9-NEXT: v_add_u32_e32 v2, v5, v2 -; 
GFX9-NEXT: v_mul_hi_u32 v5, v3, v1 -; GFX9-NEXT: v_mul_hi_u32 v1, v4, v1 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v6, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 -; GFX9-NEXT: v_add_u32_e32 v5, v6, v5 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-NEXT: v_add3_u32 v1, v2, v1, v4 +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v6, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v7, v1, vcc +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v6, 0 +; GFX9-NEXT: v_mov_b32_e32 v8, 0 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v7, v[1:2] +; GFX9-NEXT: v_mul_lo_u32 v1, v7, v0 +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s3, v6, v[2:3] +; GFX9-NEXT: v_mul_hi_u32 v3, v6, v0 +; GFX9-NEXT: v_mul_hi_u32 v0, v7, v0 +; GFX9-NEXT: v_mul_lo_u32 v2, v6, v4 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v2 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_add3_u32 v1, v5, v2, v1 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-NEXT: v_mul_lo_u32 v3, v7, v4 +; GFX9-NEXT: v_add_u32_e32 v1, v2, v1 +; GFX9-NEXT: v_mul_hi_u32 v2, v6, v4 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v3, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v4, v1, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 +; GFX9-NEXT: v_mul_hi_u32 v3, v7, v4 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-NEXT: v_add3_u32 v1, v2, v1, v3 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v6, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v7, v1, vcc ; GFX9-NEXT: v_mul_lo_u32 v2, s17, v0 ; GFX9-NEXT: v_mul_lo_u32 v3, s16, v1 ; GFX9-NEXT: v_mul_hi_u32 v4, s16, v0 ; GFX9-NEXT: v_mul_hi_u32 v0, s17, v0 -; GFX9-NEXT: v_mul_hi_u32 v6, s17, v1 +; GFX9-NEXT: v_mul_hi_u32 v5, s17, v1 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 @@ -339,53 +340,52 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v0, v2 -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s18, v5, 0 +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v0, v2 +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s18, v6, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX9-NEXT: v_add_u32_e32 v3, v4, v3 -; GFX9-NEXT: v_add3_u32 v3, v3, v2, v6 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s18, v3, v[1:2] -; GFX9-NEXT: v_sub_co_u32_e32 v7, vcc, s16, v0 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s19, v5, v[1:2] +; GFX9-NEXT: v_add3_u32 v7, v3, v2, v5 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s18, v7, v[1:2] +; GFX9-NEXT: v_mov_b32_e32 v1, s19 +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s19, v6, v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v2, s17 -; GFX9-NEXT: v_mov_b32_e32 v4, s19 -; GFX9-NEXT: v_subb_co_u32_e64 v8, s[0:1], v2, v1, vcc -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s19, v8 -; GFX9-NEXT: v_sub_u32_e32 v0, s17, v1 -; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s18, v7 +; GFX9-NEXT: v_sub_co_u32_e32 v3, vcc, s16, v0 +; GFX9-NEXT: v_subb_co_u32_e64 v5, s[0:1], v2, v4, vcc +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s19, v5 +; GFX9-NEXT: 
v_sub_u32_e32 v0, s17, v4 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s19, v8 -; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v4, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[0:1] -; GFX9-NEXT: v_subrev_co_u32_e32 v2, vcc, s18, v7 +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s18, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s19, v5 +; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v1, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[0:1] +; GFX9-NEXT: v_subrev_co_u32_e32 v4, vcc, s18, v3 ; GFX9-NEXT: v_subbrev_co_u32_e64 v9, s[0:1], 0, v0, vcc -; GFX9-NEXT: v_add_co_u32_e64 v10, s[0:1], 1, v5 -; GFX9-NEXT: v_addc_co_u32_e64 v11, s[0:1], 0, v3, s[0:1] +; GFX9-NEXT: v_add_co_u32_e64 v10, s[0:1], 1, v6 +; GFX9-NEXT: v_addc_co_u32_e64 v11, s[0:1], 0, v7, s[0:1] ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s19, v9 ; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s18, v2 -; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v4, vcc +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s18, v4 +; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v1, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[0:1] ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s19, v9 -; GFX9-NEXT: v_subrev_co_u32_e32 v4, vcc, s18, v2 +; GFX9-NEXT: v_subrev_co_u32_e32 v15, vcc, s18, v4 ; GFX9-NEXT: v_cndmask_b32_e64 v12, v12, v13, s[0:1] ; GFX9-NEXT: v_add_co_u32_e64 v13, s[0:1], 1, v10 -; GFX9-NEXT: v_subbrev_co_u32_e32 v15, vcc, 0, v0, vcc +; GFX9-NEXT: v_subbrev_co_u32_e32 v16, vcc, 0, v0, vcc ; GFX9-NEXT: v_addc_co_u32_e64 v14, s[0:1], 0, v11, s[0:1] ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v10, v13, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v10, v11, v14, vcc -; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v1 -; GFX9-NEXT: v_mov_b32_e32 v6, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v5, v0, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v1, v3, v10, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v3, v9, v15, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v2, v7, v2, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v8, v3, s[0:1] -; GFX9-NEXT: global_store_dwordx2 v6, v[0:1], s[12:13] -; GFX9-NEXT: global_store_dwordx2 v6, v[2:3], s[14:15] +; GFX9-NEXT: v_cndmask_b32_e32 v1, v11, v14, vcc +; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, v0, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v1, v7, v1, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v15, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v9, v16, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v2, v3, v2, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v5, v4, s[0:1] +; GFX9-NEXT: global_store_dwordx2 v8, v[0:1], s[12:13] +; GFX9-NEXT: global_store_dwordx2 v8, v[2:3], s[14:15] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: udivrem_i64: @@ -1005,72 +1005,72 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0 ; GFX8-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX8-NEXT: v_mov_b32_e32 v10, s13 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX8-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GFX8-NEXT: v_trunc_f32_e32 v2, v1 ; GFX8-NEXT: v_mul_f32_e32 v1, 0xcf800000, v2 ; GFX8-NEXT: v_add_f32_e32 v0, v1, v0 -; GFX8-NEXT: v_cvt_u32_f32_e32 v3, v0 -; GFX8-NEXT: v_cvt_u32_f32_e32 v4, v2 -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v3, 0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v4, v[1:2] -; GFX8-NEXT: v_mul_hi_u32 v5, v3, v0 -; GFX8-NEXT: v_mad_u64_u32 
v[1:2], s[0:1], s3, v3, v[1:2] -; GFX8-NEXT: v_mul_lo_u32 v2, v4, v0 -; GFX8-NEXT: v_mul_hi_u32 v0, v4, v0 -; GFX8-NEXT: v_mul_lo_u32 v6, v3, v1 -; GFX8-NEXT: v_mul_lo_u32 v7, v4, v1 -; GFX8-NEXT: v_mul_hi_u32 v8, v3, v1 -; GFX8-NEXT: v_mul_hi_u32 v1, v4, v1 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v6 -; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v7, v0 -; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5 -; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v8 +; GFX8-NEXT: v_cvt_u32_f32_e32 v6, v0 +; GFX8-NEXT: v_cvt_u32_f32_e32 v7, v2 +; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v6, 0 +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v7, v[1:2] +; GFX8-NEXT: v_mul_lo_u32 v1, v7, v0 +; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s3, v6, v[2:3] +; GFX8-NEXT: v_mul_hi_u32 v2, v6, v0 +; GFX8-NEXT: v_mul_hi_u32 v0, v7, v0 +; GFX8-NEXT: v_mul_lo_u32 v3, v6, v4 +; GFX8-NEXT: v_mul_lo_u32 v5, v7, v4 +; GFX8-NEXT: v_mul_hi_u32 v8, v6, v4 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3 +; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v5, v0 ; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v6, v2 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, v7, v5 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v8 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2 -; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v0 -; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v4, v1, vcc -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v3, 0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v4, v[1:2] -; GFX8-NEXT: v_mul_hi_u32 v6, v3, v0 +; GFX8-NEXT: v_mul_hi_u32 v3, v7, v4 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v0 +; GFX8-NEXT: v_addc_u32_e32 v7, vcc, v7, v1, vcc +; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v6, 0 +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v7, v[1:2] +; GFX8-NEXT: v_mul_lo_u32 v1, v7, v0 ; GFX8-NEXT: s_sub_u32 s2, 0, s14 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v3, v[1:2] -; GFX8-NEXT: v_mul_lo_u32 v2, v4, v0 -; GFX8-NEXT: v_mul_hi_u32 v0, v4, v0 -; GFX8-NEXT: v_mul_lo_u32 v5, v3, v1 +; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s3, v6, v[2:3] +; GFX8-NEXT: v_mul_hi_u32 v3, v6, v0 +; GFX8-NEXT: v_mul_hi_u32 v0, v7, v0 +; GFX8-NEXT: v_mul_lo_u32 v2, v6, v4 ; GFX8-NEXT: s_subb_u32 s3, 0, s15 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5 -; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v6 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_mul_lo_u32 v6, v4, v1 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2 -; GFX8-NEXT: v_mul_hi_u32 v5, v3, v1 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v6, v0 -; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v5 -; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v5, vcc, v6, v5 -; GFX8-NEXT: v_mul_hi_u32 v1, v4, v1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8-NEXT: v_mul_lo_u32 v3, v7, v4 +; GFX8-NEXT: v_add_u32_e32 
v1, vcc, v2, v1 +; GFX8-NEXT: v_mul_hi_u32 v2, v6, v4 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v3, v0 +; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2 -; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v3, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 +; GFX8-NEXT: v_mul_hi_u32 v3, v7, v4 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v6, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v7, v1, vcc ; GFX8-NEXT: v_mul_lo_u32 v2, s9, v0 ; GFX8-NEXT: v_mul_lo_u32 v3, s8, v1 ; GFX8-NEXT: v_mul_hi_u32 v4, s8, v0 ; GFX8-NEXT: v_mul_hi_u32 v0, s9, v0 -; GFX8-NEXT: v_mov_b32_e32 v5, s13 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v3 ; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v4 @@ -1083,136 +1083,136 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v3 ; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v4, v3 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v0, v2 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v0, v2 ; GFX8-NEXT: v_mul_hi_u32 v4, s9, v1 -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s12, v7, 0 +; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s12, v8, 0 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v4, v2 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s12, v8, v[1:2] -; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s13, v7, v[1:2] -; GFX8-NEXT: v_mov_b32_e32 v3, s9 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v4, v2 +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s12, v9, v[1:2] ; GFX8-NEXT: v_sub_u32_e32 v1, vcc, s8, v0 -; GFX8-NEXT: v_subb_u32_e64 v0, s[0:1], v3, v2, vcc -; GFX8-NEXT: v_sub_u32_e64 v2, s[0:1], s9, v2 +; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s13, v8, v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v2, s9 +; GFX8-NEXT: v_subb_u32_e64 v0, s[0:1], v2, v4, vcc +; GFX8-NEXT: v_sub_u32_e64 v2, s[0:1], s9, v4 ; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v0 ; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1] ; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v1 ; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1] ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v0 -; GFX8-NEXT: v_cndmask_b32_e64 v9, v3, v4, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v11, v3, v4, s[0:1] ; GFX8-NEXT: v_cvt_f32_u32_e32 v3, s15 ; GFX8-NEXT: v_cvt_f32_u32_e32 v4, s14 -; GFX8-NEXT: v_subb_u32_e32 v6, vcc, v2, v5, vcc +; GFX8-NEXT: v_subb_u32_e32 v12, vcc, v2, v10, vcc ; GFX8-NEXT: v_mul_f32_e32 v2, 0x4f800000, v3 ; GFX8-NEXT: v_add_f32_e32 v2, v2, v4 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v2, v2 -; GFX8-NEXT: v_subrev_u32_e32 v10, vcc, s12, v1 -; GFX8-NEXT: v_subbrev_u32_e64 v11, s[0:1], 0, v6, vcc +; GFX8-NEXT: v_subrev_u32_e32 v13, vcc, s12, v1 +; GFX8-NEXT: v_subbrev_u32_e64 v14, s[0:1], 0, v12, vcc ; GFX8-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 ; GFX8-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 ; GFX8-NEXT: v_trunc_f32_e32 v4, v3 ; GFX8-NEXT: v_mul_f32_e32 v3, 0xcf800000, v4 ; GFX8-NEXT: v_add_f32_e32 v2, v3, v2 -; GFX8-NEXT: v_cvt_u32_f32_e32 v12, v2 -; GFX8-NEXT: v_add_u32_e64 v13, s[0:1], 1, v7 -; GFX8-NEXT: v_addc_u32_e64 v14, s[0:1], 0, v8, s[0:1] -; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v12, 0 -; 
GFX8-NEXT: v_cvt_u32_f32_e32 v15, v4 -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v11 -; GFX8-NEXT: v_cndmask_b32_e64 v16, 0, -1, s[0:1] -; GFX8-NEXT: v_subb_u32_e32 v5, vcc, v6, v5, vcc -; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s2, v15, v[3:4] -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v10 -; GFX8-NEXT: v_cndmask_b32_e64 v17, 0, -1, s[0:1] -; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s3, v12, v[3:4] -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v11 -; GFX8-NEXT: v_cndmask_b32_e64 v16, v16, v17, s[0:1] -; GFX8-NEXT: v_mul_lo_u32 v4, v15, v2 -; GFX8-NEXT: v_mul_lo_u32 v17, v12, v3 -; GFX8-NEXT: v_mul_hi_u32 v6, v12, v2 -; GFX8-NEXT: v_mul_hi_u32 v2, v15, v2 -; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v17 -; GFX8-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v6 +; GFX8-NEXT: v_cvt_u32_f32_e32 v15, v2 +; GFX8-NEXT: v_add_u32_e64 v16, s[0:1], 1, v8 +; GFX8-NEXT: v_addc_u32_e64 v17, s[0:1], 0, v9, s[0:1] +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v15, 0 +; GFX8-NEXT: v_cvt_u32_f32_e32 v18, v4 +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v14 +; GFX8-NEXT: v_cndmask_b32_e64 v19, 0, -1, s[0:1] +; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s2, v18, v[3:4] +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v13 +; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1] +; GFX8-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s3, v15, v[4:5] +; GFX8-NEXT: v_mul_lo_u32 v4, v18, v2 +; GFX8-NEXT: v_subb_u32_e32 v7, vcc, v12, v10, vcc +; GFX8-NEXT: v_mul_lo_u32 v5, v15, v6 +; GFX8-NEXT: v_mul_hi_u32 v10, v15, v2 +; GFX8-NEXT: v_mul_hi_u32 v2, v18, v2 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v14 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v5 +; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v10 ; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GFX8-NEXT: v_mul_lo_u32 v6, v15, v3 -; GFX8-NEXT: v_add_u32_e32 v4, vcc, v17, v4 -; GFX8-NEXT: v_mul_hi_u32 v17, v12, v3 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v6, v2 -; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v17 -; GFX8-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v17 -; GFX8-NEXT: v_add_u32_e32 v17, vcc, 1, v13 -; GFX8-NEXT: v_addc_u32_e32 v18, vcc, 0, v14, vcc -; GFX8-NEXT: v_subrev_u32_e32 v19, vcc, s12, v10 -; GFX8-NEXT: v_mul_hi_u32 v3, v15, v3 -; GFX8-NEXT: v_subbrev_u32_e32 v20, vcc, 0, v5, vcc +; GFX8-NEXT: v_mul_lo_u32 v10, v18, v6 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v5, v4 +; GFX8-NEXT: v_mul_hi_u32 v5, v15, v6 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v10, v2 +; GFX8-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5 +; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v10, v5 +; GFX8-NEXT: v_add_u32_e32 v10, vcc, 1, v16 +; GFX8-NEXT: v_addc_u32_e32 v12, vcc, 0, v17, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v19, v3, s[0:1] +; GFX8-NEXT: v_subrev_u32_e32 v19, vcc, s12, v13 +; GFX8-NEXT: v_mul_hi_u32 v6, v18, v6 +; GFX8-NEXT: v_subbrev_u32_e32 v20, vcc, 0, v7, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v4 ; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v5, v4 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v6, v4 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v4 -; GFX8-NEXT: v_add_u32_e32 v12, vcc, v12, v2 -; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s2, v12, 0 -; GFX8-NEXT: v_addc_u32_e32 v15, vcc, v15, v3, vcc -; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v13, v17, vcc +; GFX8-NEXT: v_add_u32_e32 v15, vcc, v15, v2 +; GFX8-NEXT: 
v_addc_u32_e32 v18, vcc, v18, v4, vcc +; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s2, v15, 0 +; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v16, v10, vcc ; GFX8-NEXT: v_mov_b32_e32 v2, v5 -; GFX8-NEXT: v_mad_u64_u32 v[5:6], s[0:1], s2, v15, v[2:3] -; GFX8-NEXT: v_cndmask_b32_e32 v13, v14, v18, vcc -; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v9 -; GFX8-NEXT: v_mad_u64_u32 v[5:6], s[2:3], s3, v12, v[5:6] -; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v3, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e64 v3, v8, v13, s[0:1] -; GFX8-NEXT: v_mul_lo_u32 v7, v15, v4 -; GFX8-NEXT: v_mul_lo_u32 v8, v12, v5 -; GFX8-NEXT: v_mul_hi_u32 v9, v12, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v10, v19, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v10, v11, v20, vcc -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v8 +; GFX8-NEXT: v_mad_u64_u32 v[5:6], s[0:1], s2, v18, v[2:3] +; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v11 +; GFX8-NEXT: v_cndmask_b32_e64 v2, v8, v3, s[0:1] +; GFX8-NEXT: v_mad_u64_u32 v[7:8], s[2:3], s3, v15, v[5:6] +; GFX8-NEXT: v_cndmask_b32_e32 v10, v17, v12, vcc +; GFX8-NEXT: v_mul_lo_u32 v6, v18, v4 +; GFX8-NEXT: v_mul_lo_u32 v8, v15, v7 +; GFX8-NEXT: v_cndmask_b32_e64 v3, v9, v10, s[0:1] +; GFX8-NEXT: v_mul_hi_u32 v9, v15, v4 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v13, v19, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v10, v14, v20, vcc +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v8 ; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v9 -; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX8-NEXT: v_mul_lo_u32 v9, v15, v5 -; GFX8-NEXT: v_mul_hi_u32 v4, v15, v4 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v8, v7 -; GFX8-NEXT: v_mul_hi_u32 v8, v12, v5 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v9 +; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GFX8-NEXT: v_mul_lo_u32 v9, v18, v7 +; GFX8-NEXT: v_mul_hi_u32 v4, v18, v4 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v8, v6 +; GFX8-NEXT: v_mul_hi_u32 v8, v15, v7 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v9, v4 ; GFX8-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v8 ; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v8, vcc, v9, v8 -; GFX8-NEXT: v_mul_hi_u32 v5, v15, v5 -; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v7 -; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v8, v7 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v7 -; GFX8-NEXT: v_add_u32_e32 v4, vcc, v12, v4 -; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v15, v5, vcc -; GFX8-NEXT: v_mul_lo_u32 v7, s11, v4 -; GFX8-NEXT: v_mul_lo_u32 v8, s10, v5 -; GFX8-NEXT: v_cndmask_b32_e64 v6, v1, v6, s[0:1] +; GFX8-NEXT: v_mul_hi_u32 v7, v18, v7 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v6 +; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v8, v6 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v7, v6 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v15, v4 +; GFX8-NEXT: v_addc_u32_e32 v7, vcc, v18, v6, vcc +; GFX8-NEXT: v_mul_lo_u32 v8, s11, v4 +; GFX8-NEXT: v_mul_lo_u32 v9, s10, v7 +; GFX8-NEXT: v_cndmask_b32_e64 v6, v1, v5, s[0:1] ; GFX8-NEXT: v_mul_hi_u32 v1, s10, v4 ; GFX8-NEXT: v_mul_hi_u32 v4, s11, v4 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v8 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v8, v9 ; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v1, vcc, v7, v1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v5, v1 ; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GFX8-NEXT: v_mul_lo_u32 v7, s11, v5 +; GFX8-NEXT: v_mul_lo_u32 v5, s11, v7 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v8, v1 -; GFX8-NEXT: v_mul_hi_u32 v8, s10, v5 -; GFX8-NEXT: 
v_add_u32_e32 v4, vcc, v7, v4 -; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GFX8-NEXT: v_mul_hi_u32 v8, s10, v7 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v5, v4 +; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v8 ; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v8 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v5, v8 ; GFX8-NEXT: v_add_u32_e32 v11, vcc, v4, v1 -; GFX8-NEXT: v_mul_hi_u32 v8, s11, v5 +; GFX8-NEXT: v_mul_hi_u32 v7, s11, v7 ; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[2:3], s14, v11, 0 ; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v1, vcc, v7, v1 -; GFX8-NEXT: v_add_u32_e32 v12, vcc, v8, v1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v8, v1 +; GFX8-NEXT: v_add_u32_e32 v12, vcc, v7, v1 ; GFX8-NEXT: v_mov_b32_e32 v1, v5 ; GFX8-NEXT: v_mad_u64_u32 v[8:9], s[2:3], s14, v12, v[1:2] ; GFX8-NEXT: v_cndmask_b32_e64 v7, v0, v10, s[0:1] @@ -1279,60 +1279,61 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_trunc_f32_e32 v2, v1 ; GFX9-NEXT: v_mul_f32_e32 v1, 0xcf800000, v2 ; GFX9-NEXT: v_add_f32_e32 v0, v1, v0 -; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v0 -; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v2 -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v3, 0 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v4, v[1:2] -; GFX9-NEXT: v_mul_hi_u32 v5, v3, v0 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v3, v[1:2] -; GFX9-NEXT: v_mul_lo_u32 v2, v4, v0 -; GFX9-NEXT: v_mul_hi_u32 v0, v4, v0 -; GFX9-NEXT: v_mul_lo_u32 v6, v3, v1 -; GFX9-NEXT: v_mul_lo_u32 v7, v4, v1 -; GFX9-NEXT: v_mul_hi_u32 v8, v3, v1 -; GFX9-NEXT: v_mul_hi_u32 v1, v4, v1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6 -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v7, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v8 -; GFX9-NEXT: v_add_u32_e32 v2, v6, v2 +; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v0 +; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v2 +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v6, 0 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v7, v[1:2] +; GFX9-NEXT: v_mul_lo_u32 v1, v7, v0 +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s3, v6, v[2:3] +; GFX9-NEXT: v_mul_hi_u32 v2, v6, v0 +; GFX9-NEXT: v_mul_hi_u32 v0, v7, v0 +; GFX9-NEXT: v_mul_lo_u32 v3, v6, v4 +; GFX9-NEXT: v_mul_lo_u32 v5, v7, v4 +; GFX9-NEXT: v_mul_hi_u32 v8, v6, v4 +; GFX9-NEXT: v_mul_hi_u32 v4, v7, v4 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v5, v0 ; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 -; GFX9-NEXT: v_add_u32_e32 v5, v7, v5 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v8 +; GFX9-NEXT: v_add_u32_e32 v1, v3, v1 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_add3_u32 v1, v5, v2, v1 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v1, vcc -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v3, 0 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v4, v[1:2] -; GFX9-NEXT: v_mul_hi_u32 v6, v3, v0 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 +; GFX9-NEXT: v_add_u32_e32 v2, v5, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-NEXT: v_add3_u32 v1, v2, v1, v4 +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v6, v0 
+; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v7, v1, vcc +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v6, 0 +; GFX9-NEXT: v_mov_b32_e32 v8, s5 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v7, v[1:2] +; GFX9-NEXT: v_mul_lo_u32 v1, v7, v0 ; GFX9-NEXT: s_sub_u32 s2, 0, s6 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v3, v[1:2] -; GFX9-NEXT: v_mul_lo_u32 v2, v4, v0 -; GFX9-NEXT: v_mul_hi_u32 v0, v4, v0 -; GFX9-NEXT: v_mul_lo_u32 v5, v3, v1 +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s3, v6, v[2:3] +; GFX9-NEXT: v_mul_hi_u32 v3, v6, v0 +; GFX9-NEXT: v_mul_hi_u32 v0, v7, v0 +; GFX9-NEXT: v_mul_lo_u32 v2, v6, v4 ; GFX9-NEXT: s_subb_u32 s3, 0, s7 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v2 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_mul_lo_u32 v6, v4, v1 -; GFX9-NEXT: v_add_u32_e32 v2, v5, v2 -; GFX9-NEXT: v_mul_hi_u32 v5, v3, v1 -; GFX9-NEXT: v_mul_hi_u32 v1, v4, v1 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v6, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-NEXT: v_mul_lo_u32 v3, v7, v4 +; GFX9-NEXT: v_add_u32_e32 v1, v2, v1 +; GFX9-NEXT: v_mul_hi_u32 v2, v6, v4 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v3, v0 +; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 -; GFX9-NEXT: v_add_u32_e32 v5, v6, v5 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_add3_u32 v1, v5, v2, v1 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v3, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v4, v1, vcc +; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 +; GFX9-NEXT: v_mul_hi_u32 v3, v7, v4 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-NEXT: v_add3_u32 v1, v2, v1, v3 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v6, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v7, v1, vcc ; GFX9-NEXT: v_mul_lo_u32 v2, s17, v0 ; GFX9-NEXT: v_mul_lo_u32 v3, s16, v1 ; GFX9-NEXT: v_mul_hi_u32 v4, s16, v0 @@ -1349,114 +1350,113 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v0, v2 -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s4, v7, 0 +; GFX9-NEXT: v_add_co_u32_e32 v9, vcc, v0, v2 +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s4, v9, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX9-NEXT: v_add_u32_e32 v3, v4, v3 -; GFX9-NEXT: v_add3_u32 v8, v3, v2, v5 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s4, v8, v[1:2] -; GFX9-NEXT: v_mov_b32_e32 v5, s5 -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s5, v7, v[1:2] -; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-NEXT: v_add3_u32 v10, v3, v2, v5 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s4, v10, v[1:2] ; GFX9-NEXT: v_sub_co_u32_e32 v1, vcc, s16, v0 -; GFX9-NEXT: v_subb_co_u32_e64 v0, s[0:1], v3, v2, vcc +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s5, v9, v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v2, s17 +; GFX9-NEXT: v_subb_co_u32_e64 v0, s[0:1], v2, v4, vcc ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s5, v0 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1] ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s4, v1 +; GFX9-NEXT: v_sub_u32_e32 v2, s17, v4 ; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, -1, 
s[0:1] ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s5, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v9, v3, v4, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v11, v3, v4, s[0:1] ; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s7 ; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s6 -; GFX9-NEXT: v_sub_u32_e32 v2, s17, v2 -; GFX9-NEXT: v_subb_co_u32_e32 v6, vcc, v2, v5, vcc +; GFX9-NEXT: v_subb_co_u32_e32 v12, vcc, v2, v8, vcc ; GFX9-NEXT: v_mul_f32_e32 v2, 0x4f800000, v3 ; GFX9-NEXT: v_add_f32_e32 v2, v2, v4 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v2 -; GFX9-NEXT: v_subrev_co_u32_e32 v10, vcc, s4, v1 -; GFX9-NEXT: v_subbrev_co_u32_e64 v11, s[0:1], 0, v6, vcc +; GFX9-NEXT: v_subrev_co_u32_e32 v13, vcc, s4, v1 +; GFX9-NEXT: v_subbrev_co_u32_e64 v14, s[0:1], 0, v12, vcc ; GFX9-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 ; GFX9-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 ; GFX9-NEXT: v_trunc_f32_e32 v4, v3 ; GFX9-NEXT: v_mul_f32_e32 v3, 0xcf800000, v4 ; GFX9-NEXT: v_add_f32_e32 v2, v3, v2 -; GFX9-NEXT: v_cvt_u32_f32_e32 v12, v2 -; GFX9-NEXT: v_add_co_u32_e64 v13, s[0:1], 1, v7 -; GFX9-NEXT: v_addc_co_u32_e64 v14, s[0:1], 0, v8, s[0:1] -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v12, 0 -; GFX9-NEXT: v_cvt_u32_f32_e32 v15, v4 -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s5, v11 -; GFX9-NEXT: v_cndmask_b32_e64 v16, 0, -1, s[0:1] -; GFX9-NEXT: v_subb_co_u32_e32 v6, vcc, v6, v5, vcc -; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s2, v15, v[3:4] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s4, v10 -; GFX9-NEXT: v_cndmask_b32_e64 v17, 0, -1, s[0:1] -; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s3, v12, v[3:4] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s5, v11 -; GFX9-NEXT: v_cndmask_b32_e64 v16, v16, v17, s[0:1] -; GFX9-NEXT: v_mul_lo_u32 v4, v15, v2 -; GFX9-NEXT: v_mul_lo_u32 v17, v12, v3 -; GFX9-NEXT: v_mul_hi_u32 v5, v12, v2 -; GFX9-NEXT: v_mul_hi_u32 v2, v15, v2 -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v17 -; GFX9-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc +; GFX9-NEXT: v_cvt_u32_f32_e32 v15, v2 +; GFX9-NEXT: v_add_co_u32_e64 v16, s[0:1], 1, v9 +; GFX9-NEXT: v_addc_co_u32_e64 v17, s[0:1], 0, v10, s[0:1] +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v15, 0 +; GFX9-NEXT: v_cvt_u32_f32_e32 v18, v4 +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s5, v14 +; GFX9-NEXT: v_cndmask_b32_e64 v19, 0, -1, s[0:1] +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s2, v18, v[3:4] +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s4, v13 +; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1] +; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s3, v15, v[4:5] +; GFX9-NEXT: v_mul_lo_u32 v4, v18, v2 +; GFX9-NEXT: v_subb_co_u32_e32 v7, vcc, v12, v8, vcc +; GFX9-NEXT: v_mul_lo_u32 v5, v15, v6 +; GFX9-NEXT: v_mul_hi_u32 v8, v15, v2 +; GFX9-NEXT: v_mul_hi_u32 v2, v18, v2 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s5, v14 ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5 +; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v8 ; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GFX9-NEXT: v_mul_lo_u32 v5, v15, v3 -; GFX9-NEXT: v_add_u32_e32 v4, v17, v4 -; GFX9-NEXT: v_mul_hi_u32 v17, v12, v3 -; GFX9-NEXT: v_mul_hi_u32 v3, v15, v3 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v5, v2 +; GFX9-NEXT: v_mul_lo_u32 v8, v18, v6 +; GFX9-NEXT: v_add_u32_e32 v4, v5, v4 +; GFX9-NEXT: v_mul_hi_u32 v5, v15, v6 +; GFX9-NEXT: v_mul_hi_u32 v6, v18, v6 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v8, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5 ; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v17 -; GFX9-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; 
GFX9-NEXT: v_add_u32_e32 v5, v5, v17 -; GFX9-NEXT: v_add_co_u32_e32 v17, vcc, 1, v13 -; GFX9-NEXT: v_addc_co_u32_e32 v18, vcc, 0, v14, vcc +; GFX9-NEXT: v_add_u32_e32 v5, v8, v5 +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 1, v16 +; GFX9-NEXT: v_addc_co_u32_e32 v12, vcc, 0, v17, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 ; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, v12, v2 -; GFX9-NEXT: v_add3_u32 v3, v5, v4, v3 -; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s2, v12, 0 -; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, v15, v3, vcc +; GFX9-NEXT: v_add3_u32 v4, v5, v4, v6 +; GFX9-NEXT: v_add_co_u32_e32 v15, vcc, v15, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v3, v19, v3, s[0:1] +; GFX9-NEXT: v_addc_co_u32_e32 v18, vcc, v18, v4, vcc +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s2, v15, 0 +; GFX9-NEXT: v_subrev_co_u32_e32 v19, vcc, s4, v13 ; GFX9-NEXT: v_mov_b32_e32 v2, v5 -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v15, v[2:3] -; GFX9-NEXT: v_subrev_co_u32_e32 v19, vcc, s4, v10 -; GFX9-NEXT: v_subbrev_co_u32_e32 v20, vcc, 0, v6, vcc -; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[0:1], s3, v12, v[2:3] -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GFX9-NEXT: v_cndmask_b32_e32 v13, v13, v17, vcc -; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v9 -; GFX9-NEXT: v_cndmask_b32_e64 v2, v7, v13, s[0:1] -; GFX9-NEXT: v_mul_lo_u32 v6, v15, v4 -; GFX9-NEXT: v_mul_lo_u32 v7, v12, v5 -; GFX9-NEXT: v_mul_hi_u32 v9, v12, v4 -; GFX9-NEXT: v_mul_hi_u32 v4, v15, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v14, v14, v18, vcc -; GFX9-NEXT: v_add_co_u32_e64 v6, s[2:3], v6, v7 -; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[2:3] -; GFX9-NEXT: v_add_co_u32_e64 v6, s[2:3], v6, v9 +; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[0:1], s2, v18, v[2:3] +; GFX9-NEXT: v_subbrev_co_u32_e32 v20, vcc, 0, v7, vcc +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v16, v8, vcc +; GFX9-NEXT: v_mad_u64_u32 v[7:8], s[0:1], s3, v15, v[5:6] +; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v11 +; GFX9-NEXT: v_mul_lo_u32 v5, v18, v4 +; GFX9-NEXT: v_mul_lo_u32 v6, v15, v7 +; GFX9-NEXT: v_cndmask_b32_e64 v2, v9, v2, s[0:1] +; GFX9-NEXT: v_mul_hi_u32 v9, v15, v4 +; GFX9-NEXT: v_mul_hi_u32 v4, v18, v4 +; GFX9-NEXT: v_add_co_u32_e64 v5, s[2:3], v5, v6 ; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[2:3] -; GFX9-NEXT: v_mul_lo_u32 v9, v15, v5 -; GFX9-NEXT: v_add_u32_e32 v6, v7, v6 -; GFX9-NEXT: v_mul_hi_u32 v7, v12, v5 -; GFX9-NEXT: v_mul_hi_u32 v5, v15, v5 +; GFX9-NEXT: v_add_co_u32_e64 v5, s[2:3], v5, v9 +; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[2:3] +; GFX9-NEXT: v_mul_lo_u32 v9, v18, v7 +; GFX9-NEXT: v_add_u32_e32 v5, v6, v5 +; GFX9-NEXT: v_mul_hi_u32 v6, v15, v7 +; GFX9-NEXT: v_mul_hi_u32 v7, v18, v7 ; GFX9-NEXT: v_add_co_u32_e64 v4, s[2:3], v9, v4 ; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[2:3] -; GFX9-NEXT: v_add_co_u32_e64 v4, s[2:3], v4, v7 -; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[2:3] ; GFX9-NEXT: v_add_co_u32_e64 v4, s[2:3], v4, v6 -; GFX9-NEXT: v_add_u32_e32 v7, v9, v7 ; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[2:3] -; GFX9-NEXT: v_add3_u32 v5, v7, v6, v5 -; GFX9-NEXT: v_add_co_u32_e64 v4, s[2:3], v12, v4 -; GFX9-NEXT: v_addc_co_u32_e64 v5, s[2:3], v15, v5, s[2:3] +; GFX9-NEXT: v_add_co_u32_e64 v4, s[2:3], v4, v5 +; GFX9-NEXT: v_add_u32_e32 v6, v9, v6 +; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[2:3] +; GFX9-NEXT: v_add3_u32 v5, v6, v5, v7 +; GFX9-NEXT: v_add_co_u32_e64 v4, s[2:3], v15, v4 +; GFX9-NEXT: v_addc_co_u32_e64 v5, s[2:3], v18, v5, s[2:3] ; GFX9-NEXT: v_mul_lo_u32 v6, s19, v4 ; GFX9-NEXT: v_mul_lo_u32 v7, s18, 
v5 ; GFX9-NEXT: v_mul_hi_u32 v9, s18, v4 -; GFX9-NEXT: v_cndmask_b32_e64 v3, v8, v14, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e32 v8, v10, v19, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v10, v11, v20, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v17, v12, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v3, v10, v3, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e32 v8, v13, v19, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v10, v14, v20, vcc ; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v6, v7 ; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v6, v9 diff --git a/llvm/test/CodeGen/AMDGPU/div_v2i128.ll b/llvm/test/CodeGen/AMDGPU/div_v2i128.ll index 0fc54aeaef77b..26f77898faf60 100644 --- a/llvm/test/CodeGen/AMDGPU/div_v2i128.ll +++ b/llvm/test/CodeGen/AMDGPU/div_v2i128.ll @@ -2407,51 +2407,52 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: s_or_b64 exec, exec, s[12:13] ; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v30, v31, 0 ; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v30, v18, 0 -; GISEL-NEXT: v_mul_lo_u32 v24, v30, v19 -; GISEL-NEXT: v_mul_lo_u32 v25, v29, v18 +; GISEL-NEXT: v_mul_lo_u32 v27, v30, v19 +; GISEL-NEXT: v_mul_lo_u32 v36, v29, v18 ; GISEL-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v35, v20, 0 ; GISEL-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v35, v2, 0 -; GISEL-NEXT: v_mul_lo_u32 v26, v35, v3 -; GISEL-NEXT: v_mul_lo_u32 v27, v34, v2 +; GISEL-NEXT: v_mul_lo_u32 v37, v35, v3 +; GISEL-NEXT: v_mul_lo_u32 v38, v34, v2 ; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v29, v32, v[14:15] ; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v34, v21, v[22:23] ; GISEL-NEXT: v_mov_b32_e32 v22, v19 -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v10, v31, v[2:3] -; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v4, v20, v[14:15] -; GISEL-NEXT: v_mad_u64_u32 v[1:2], vcc, v30, v32, v[1:2] -; GISEL-NEXT: v_mov_b32_e32 v23, v14 -; GISEL-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v35, v21, v[22:23] -; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[6:7], v29, v31, v[1:2] -; GISEL-NEXT: v_addc_u32_e64 v3, s[6:7], v3, v24, s[6:7] -; GISEL-NEXT: v_mad_u64_u32 v[22:23], s[6:7], v34, v20, v[22:23] -; GISEL-NEXT: v_addc_u32_e64 v14, s[6:7], v15, v26, s[6:7] -; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v25, vcc +; GISEL-NEXT: v_mad_u64_u32 v[23:24], s[4:5], v10, v31, v[2:3] +; GISEL-NEXT: v_mad_u64_u32 v[25:26], s[4:5], v4, v20, v[14:15] +; GISEL-NEXT: v_mov_b32_e32 v2, v23 +; GISEL-NEXT: v_mad_u64_u32 v[14:15], vcc, v30, v32, v[1:2] +; GISEL-NEXT: v_mov_b32_e32 v23, v25 +; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v35, v21, v[22:23] +; GISEL-NEXT: v_mad_u64_u32 v[22:23], s[6:7], v29, v31, v[14:15] +; GISEL-NEXT: v_addc_u32_e64 v3, s[6:7], v24, v27, s[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[6:7], v34, v20, v[1:2] +; GISEL-NEXT: v_addc_u32_e64 v2, s[6:7], v26, v37, s[6:7] +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v3, v36, vcc ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v16, v0 -; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v17, v1, vcc -; GISEL-NEXT: v_xor_b32_e32 v15, v0, v28 -; GISEL-NEXT: v_addc_u32_e64 v0, s[4:5], v14, v27, s[4:5] -; GISEL-NEXT: v_sub_i32_e64 v12, s[4:5], v12, v18 -; GISEL-NEXT: v_subb_u32_e64 v14, s[4:5], v13, v22, s[4:5] -; GISEL-NEXT: v_xor_b32_e32 v16, v12, v33 -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[6:7], v10, v32, v[3:4] -; GISEL-NEXT: v_xor_b32_e32 v1, v1, v28 -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v4, v21, v[0:1] -; GISEL-NEXT: v_xor_b32_e32 v14, v14, v33 -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[6:7], v11, v31, v[12:13] -; GISEL-NEXT: v_sub_i32_e64 v0, s[6:7], v15, v28 +; GISEL-NEXT: v_subb_u32_e32 v16, 
vcc, v17, v22, vcc +; GISEL-NEXT: v_xor_b32_e32 v19, v0, v28 +; GISEL-NEXT: v_addc_u32_e64 v0, s[4:5], v2, v38, s[4:5] +; GISEL-NEXT: v_sub_i32_e64 v2, s[4:5], v12, v18 +; GISEL-NEXT: v_subb_u32_e64 v14, s[4:5], v13, v14, s[4:5] +; GISEL-NEXT: v_xor_b32_e32 v18, v2, v33 +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[6:7], v10, v32, v[1:2] +; GISEL-NEXT: v_xor_b32_e32 v1, v16, v28 +; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[6:7], v4, v21, v[0:1] +; GISEL-NEXT: v_xor_b32_e32 v10, v14, v33 +; GISEL-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v11, v31, v[2:3] +; GISEL-NEXT: v_sub_i32_e64 v0, s[6:7], v19, v28 ; GISEL-NEXT: v_subb_u32_e64 v1, s[6:7], v1, v28, s[6:7] -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[8:9], v5, v20, v[3:4] -; GISEL-NEXT: v_sub_i32_e64 v4, s[8:9], v16, v33 -; GISEL-NEXT: v_subb_u32_e64 v5, s[8:9], v14, v33, s[8:9] -; GISEL-NEXT: v_subb_u32_e32 v2, vcc, v8, v2, vcc -; GISEL-NEXT: v_subb_u32_e32 v8, vcc, v9, v10, vcc -; GISEL-NEXT: v_xor_b32_e32 v2, v2, v28 -; GISEL-NEXT: v_subb_u32_e64 v6, vcc, v6, v23, s[4:5] -; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v7, v3, vcc +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[8:9], v5, v20, v[12:13] +; GISEL-NEXT: v_sub_i32_e64 v4, s[8:9], v18, v33 +; GISEL-NEXT: v_subb_u32_e64 v5, s[8:9], v10, v33, s[8:9] +; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v8, v23, vcc +; GISEL-NEXT: v_subb_u32_e32 v8, vcc, v9, v16, vcc +; GISEL-NEXT: v_xor_b32_e32 v3, v3, v28 +; GISEL-NEXT: v_subb_u32_e64 v6, vcc, v6, v15, s[4:5] +; GISEL-NEXT: v_subb_u32_e32 v2, vcc, v7, v2, vcc ; GISEL-NEXT: v_xor_b32_e32 v6, v6, v33 ; GISEL-NEXT: v_xor_b32_e32 v7, v8, v28 -; GISEL-NEXT: v_xor_b32_e32 v8, v3, v33 -; GISEL-NEXT: v_subb_u32_e64 v2, vcc, v2, v28, s[6:7] +; GISEL-NEXT: v_xor_b32_e32 v8, v2, v33 +; GISEL-NEXT: v_subb_u32_e64 v2, vcc, v3, v28, s[6:7] ; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v7, v28, vcc ; GISEL-NEXT: v_subb_u32_e64 v6, vcc, v6, v33, s[8:9] ; GISEL-NEXT: v_subb_u32_e32 v7, vcc, v8, v33, vcc @@ -3216,36 +3217,38 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: s_or_b64 exec, exec, s[12:13] ; GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v8, v32, 0 ; GISEL-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v8, v20, 0 -; GISEL-NEXT: v_mul_lo_u32 v28, v8, v21 -; GISEL-NEXT: v_mul_lo_u32 v29, v9, v20 +; GISEL-NEXT: v_mul_lo_u32 v34, v8, v21 +; GISEL-NEXT: v_mul_lo_u32 v35, v9, v20 ; GISEL-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v12, v24, 0 ; GISEL-NEXT: v_mad_u64_u32 v[26:27], s[4:5], v12, v18, 0 -; GISEL-NEXT: v_mul_lo_u32 v30, v12, v19 -; GISEL-NEXT: v_mul_lo_u32 v31, v13, v18 +; GISEL-NEXT: v_mul_lo_u32 v36, v12, v19 +; GISEL-NEXT: v_mul_lo_u32 v37, v13, v18 ; GISEL-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v9, v33, v[22:23] ; GISEL-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v13, v25, v[26:27] -; GISEL-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v10, v32, v[18:19] -; GISEL-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v14, v24, v[22:23] -; GISEL-NEXT: v_mad_u64_u32 v[17:18], vcc, v8, v33, v[17:18] -; GISEL-NEXT: v_mad_u64_u32 v[21:22], s[4:5], v12, v25, v[21:22] -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[6:7], v9, v32, v[17:18] -; GISEL-NEXT: v_addc_u32_e64 v17, s[6:7], v19, v28, s[6:7] -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[6:7], v13, v24, v[21:22] -; GISEL-NEXT: v_addc_u32_e64 v18, s[6:7], v23, v30, s[6:7] -; GISEL-NEXT: v_addc_u32_e32 v17, vcc, v17, v29, vcc +; GISEL-NEXT: v_mad_u64_u32 v[26:27], s[4:5], v10, v32, v[18:19] +; GISEL-NEXT: v_mad_u64_u32 v[28:29], s[4:5], v14, v24, v[22:23] +; GISEL-NEXT: v_mov_b32_e32 v18, v26 +; GISEL-NEXT: v_mad_u64_u32 v[30:31], vcc, v8, v33, v[17:18] +; 
GISEL-NEXT: v_mov_b32_e32 v22, v28 +; GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v12, v25, v[21:22] +; GISEL-NEXT: v_mad_u64_u32 v[21:22], s[6:7], v9, v32, v[30:31] +; GISEL-NEXT: v_addc_u32_e64 v12, s[6:7], v27, v34, s[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[6:7], v13, v24, v[17:18] +; GISEL-NEXT: v_addc_u32_e64 v13, s[6:7], v29, v36, s[6:7] +; GISEL-NEXT: v_addc_u32_e32 v12, vcc, v12, v35, vcc ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v16 -; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v8, vcc -; GISEL-NEXT: v_addc_u32_e64 v8, s[4:5], v18, v31, s[4:5] +; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v21, vcc +; GISEL-NEXT: v_addc_u32_e64 v13, s[4:5], v13, v37, s[4:5] ; GISEL-NEXT: v_sub_i32_e64 v4, s[4:5], v4, v20 -; GISEL-NEXT: v_subb_u32_e64 v5, s[4:5], v5, v12, s[4:5] -; GISEL-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v10, v33, v[17:18] -; GISEL-NEXT: v_mad_u64_u32 v[18:19], s[6:7], v14, v25, v[8:9] -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[6:7], v11, v32, v[16:17] -; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[6:7], v15, v24, v[18:19] -; GISEL-NEXT: v_subb_u32_e32 v2, vcc, v2, v9, vcc -; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v10, vcc -; GISEL-NEXT: v_subb_u32_e64 v6, vcc, v6, v13, s[4:5] -; GISEL-NEXT: v_subb_u32_e32 v7, vcc, v7, v11, vcc +; GISEL-NEXT: v_subb_u32_e64 v5, s[4:5], v5, v8, s[4:5] +; GISEL-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v10, v33, v[12:13] +; GISEL-NEXT: v_mad_u64_u32 v[18:19], s[6:7], v14, v25, v[13:14] +; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[6:7], v11, v32, v[16:17] +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[6:7], v15, v24, v[18:19] +; GISEL-NEXT: v_subb_u32_e32 v2, vcc, v2, v22, vcc +; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v12, vcc +; GISEL-NEXT: v_subb_u32_e64 v6, vcc, v6, v9, s[4:5] +; GISEL-NEXT: v_subb_u32_e32 v7, vcc, v7, v10, vcc ; GISEL-NEXT: s_setpc_b64 s[30:31] %shl = urem <2 x i128> %lhs, %rhs ret <2 x i128> %shl diff --git a/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll b/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll index e0421575c3174..460f1211d1386 100644 --- a/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll +++ b/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll @@ -237,31 +237,31 @@ define i128 @fptosi_f64_to_i128(double %x) { ; GISEL-NEXT: v_add_u32_e32 v7, 0xfffffbcd, v6 ; GISEL-NEXT: v_lshlrev_b64 v[0:1], v7, v[4:5] ; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7 -; GISEL-NEXT: v_cndmask_b32_e32 v10, 0, v0, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v11, 0, v1, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v10, v9, 0 +; GISEL-NEXT: v_cndmask_b32_e32 v12, 0, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v13, 0, v1, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v12, v9, 0 ; GISEL-NEXT: v_add_u32_e32 v6, 0xfffffb8d, v6 ; GISEL-NEXT: v_sub_u32_e32 v2, 64, v7 ; GISEL-NEXT: v_lshrrev_b64 v[2:3], v2, v[4:5] ; GISEL-NEXT: v_lshlrev_b64 v[4:5], v6, v[4:5] ; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v7 -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v11, v9, v[0:1] +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v13, v9, v[0:1] ; GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v12, v2, 0, s[6:7] -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v8, v[6:7] -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v10, v8, 0 -; GISEL-NEXT: v_mov_b32_e32 v2, v6 -; GISEL-NEXT: v_mul_lo_u32 v6, v10, v9 -; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[8:9], v10, v9, v[1:2] -; GISEL-NEXT: v_mul_lo_u32 v4, v11, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v14, v2, 0, s[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[8:9], v14, v8, v[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v12, v8, 0 +; GISEL-NEXT: v_mov_b32_e32 v2, v10 
+; GISEL-NEXT: v_mul_lo_u32 v10, v12, v9 +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v9, v[1:2] +; GISEL-NEXT: v_mul_lo_u32 v4, v13, v9 ; GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v11, v8, v[1:2] -; GISEL-NEXT: v_addc_co_u32_e64 v6, s[10:11], v7, v6, s[10:11] +; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v13, v8, v[6:7] +; GISEL-NEXT: v_addc_co_u32_e64 v6, s[10:11], v11, v10, s[10:11] ; GISEL-NEXT: v_addc_co_u32_e64 v4, s[8:9], v6, v4, s[8:9] -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v9, v[4:5] -; GISEL-NEXT: v_cndmask_b32_e64 v3, v3, 0, s[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v14, v9, v[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v5, v3, 0, s[6:7] ; GISEL-NEXT: ; implicit-def: $vgpr9 -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v3, v8, v[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v5, v8, v[6:7] ; GISEL-NEXT: ; implicit-def: $vgpr6 ; GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GISEL-NEXT: ; implicit-def: $vgpr8 @@ -275,17 +275,18 @@ define i128 @fptosi_f64_to_i128(double %x) { ; GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GISEL-NEXT: v_cndmask_b32_e32 v4, v0, v4, vcc -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[6:7], v4, v9, 0 -; GISEL-NEXT: v_cndmask_b32_e32 v5, v1, v5, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v4, v8, 0 -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[6:7], v5, v9, v[2:3] -; GISEL-NEXT: v_mul_lo_u32 v6, v5, v9 -; GISEL-NEXT: v_mad_u64_u32 v[1:2], vcc, v4, v9, v[1:2] -; GISEL-NEXT: v_mul_lo_u32 v4, v4, v9 -; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[6:7], v5, v8, v[1:2] -; GISEL-NEXT: v_addc_co_u32_e64 v3, s[6:7], v3, v4, s[6:7] -; GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v6, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v6, v0, v4, vcc +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[6:7], v6, v9, 0 +; GISEL-NEXT: v_cndmask_b32_e32 v7, v1, v5, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v6, v8, 0 +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[6:7], v7, v9, v[2:3] +; GISEL-NEXT: v_mul_lo_u32 v10, v7, v9 +; GISEL-NEXT: v_mov_b32_e32 v2, v4 +; GISEL-NEXT: v_mad_u64_u32 v[3:4], vcc, v6, v9, v[1:2] +; GISEL-NEXT: v_mul_lo_u32 v6, v6, v9 +; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[6:7], v7, v8, v[3:4] +; GISEL-NEXT: v_addc_co_u32_e64 v3, s[6:7], v5, v6, s[6:7] +; GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v10, vcc ; GISEL-NEXT: .LBB0_6: ; %Flow1 ; GISEL-NEXT: s_or_b64 exec, exec, s[8:9] ; GISEL-NEXT: .LBB0_7: ; %Flow2 @@ -604,31 +605,31 @@ define i128 @fptoui_f64_to_i128(double %x) { ; GISEL-NEXT: v_add_u32_e32 v7, 0xfffffbcd, v6 ; GISEL-NEXT: v_lshlrev_b64 v[0:1], v7, v[4:5] ; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7 -; GISEL-NEXT: v_cndmask_b32_e32 v10, 0, v0, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v11, 0, v1, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v10, v9, 0 +; GISEL-NEXT: v_cndmask_b32_e32 v12, 0, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v13, 0, v1, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v12, v9, 0 ; GISEL-NEXT: v_add_u32_e32 v6, 0xfffffb8d, v6 ; GISEL-NEXT: v_sub_u32_e32 v2, 64, v7 ; GISEL-NEXT: v_lshrrev_b64 v[2:3], v2, v[4:5] ; GISEL-NEXT: v_lshlrev_b64 v[4:5], v6, v[4:5] ; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v7 -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v11, v9, v[0:1] +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v13, v9, v[0:1] ; GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v12, v2, 0, s[6:7] -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v8, v[6:7] -; GISEL-NEXT: v_mad_u64_u32 
v[0:1], s[8:9], v10, v8, 0 -; GISEL-NEXT: v_mov_b32_e32 v2, v6 -; GISEL-NEXT: v_mul_lo_u32 v6, v10, v9 -; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[8:9], v10, v9, v[1:2] -; GISEL-NEXT: v_mul_lo_u32 v4, v11, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v14, v2, 0, s[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[8:9], v14, v8, v[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v12, v8, 0 +; GISEL-NEXT: v_mov_b32_e32 v2, v10 +; GISEL-NEXT: v_mul_lo_u32 v10, v12, v9 +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v9, v[1:2] +; GISEL-NEXT: v_mul_lo_u32 v4, v13, v9 ; GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v11, v8, v[1:2] -; GISEL-NEXT: v_addc_co_u32_e64 v6, s[10:11], v7, v6, s[10:11] +; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v13, v8, v[6:7] +; GISEL-NEXT: v_addc_co_u32_e64 v6, s[10:11], v11, v10, s[10:11] ; GISEL-NEXT: v_addc_co_u32_e64 v4, s[8:9], v6, v4, s[8:9] -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v9, v[4:5] -; GISEL-NEXT: v_cndmask_b32_e64 v3, v3, 0, s[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v14, v9, v[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v5, v3, 0, s[6:7] ; GISEL-NEXT: ; implicit-def: $vgpr9 -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v3, v8, v[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v5, v8, v[6:7] ; GISEL-NEXT: ; implicit-def: $vgpr6 ; GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GISEL-NEXT: ; implicit-def: $vgpr8 @@ -642,17 +643,18 @@ define i128 @fptoui_f64_to_i128(double %x) { ; GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GISEL-NEXT: v_cndmask_b32_e32 v4, v0, v4, vcc -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[6:7], v4, v9, 0 -; GISEL-NEXT: v_cndmask_b32_e32 v5, v1, v5, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v4, v8, 0 -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[6:7], v5, v9, v[2:3] -; GISEL-NEXT: v_mul_lo_u32 v6, v5, v9 -; GISEL-NEXT: v_mad_u64_u32 v[1:2], vcc, v4, v9, v[1:2] -; GISEL-NEXT: v_mul_lo_u32 v4, v4, v9 -; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[6:7], v5, v8, v[1:2] -; GISEL-NEXT: v_addc_co_u32_e64 v3, s[6:7], v3, v4, s[6:7] -; GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v6, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v6, v0, v4, vcc +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[6:7], v6, v9, 0 +; GISEL-NEXT: v_cndmask_b32_e32 v7, v1, v5, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v6, v8, 0 +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[6:7], v7, v9, v[2:3] +; GISEL-NEXT: v_mul_lo_u32 v10, v7, v9 +; GISEL-NEXT: v_mov_b32_e32 v2, v4 +; GISEL-NEXT: v_mad_u64_u32 v[3:4], vcc, v6, v9, v[1:2] +; GISEL-NEXT: v_mul_lo_u32 v6, v6, v9 +; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[6:7], v7, v8, v[3:4] +; GISEL-NEXT: v_addc_co_u32_e64 v3, s[6:7], v5, v6, s[6:7] +; GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v10, vcc ; GISEL-NEXT: .LBB1_6: ; %Flow1 ; GISEL-NEXT: s_or_b64 exec, exec, s[8:9] ; GISEL-NEXT: .LBB1_7: ; %Flow2 @@ -962,31 +964,31 @@ define i128 @fptosi_f32_to_i128(float %x) { ; GISEL-NEXT: v_add_u32_e32 v7, 0xffffff6a, v6 ; GISEL-NEXT: v_lshlrev_b64 v[0:1], v7, v[4:5] ; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7 -; GISEL-NEXT: v_cndmask_b32_e32 v10, 0, v0, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v11, 0, v1, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v10, v8, 0 +; GISEL-NEXT: v_cndmask_b32_e32 v12, 0, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v13, 0, v1, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v12, v8, 0 ; GISEL-NEXT: v_add_u32_e32 v6, 0xffffff2a, v6 ; GISEL-NEXT: v_sub_u32_e32 v2, 64, v7 ; GISEL-NEXT: v_lshrrev_b64 v[2:3], v2, 
v[4:5] ; GISEL-NEXT: v_lshlrev_b64 v[4:5], v6, v[4:5] ; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v7 -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v11, v8, v[0:1] +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v13, v8, v[0:1] ; GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v12, v2, 0, s[6:7] -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v9, v[6:7] -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v10, v9, 0 -; GISEL-NEXT: v_mov_b32_e32 v2, v6 -; GISEL-NEXT: v_mul_lo_u32 v6, v10, v8 -; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[8:9], v10, v8, v[1:2] -; GISEL-NEXT: v_mul_lo_u32 v4, v11, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v14, v2, 0, s[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[8:9], v14, v9, v[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v12, v9, 0 +; GISEL-NEXT: v_mov_b32_e32 v2, v10 +; GISEL-NEXT: v_mul_lo_u32 v10, v12, v8 +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v8, v[1:2] +; GISEL-NEXT: v_mul_lo_u32 v4, v13, v8 ; GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v11, v9, v[1:2] -; GISEL-NEXT: v_addc_co_u32_e64 v6, s[10:11], v7, v6, s[10:11] +; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v13, v9, v[6:7] +; GISEL-NEXT: v_addc_co_u32_e64 v6, s[10:11], v11, v10, s[10:11] ; GISEL-NEXT: v_addc_co_u32_e64 v4, s[8:9], v6, v4, s[8:9] -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v8, v[4:5] -; GISEL-NEXT: v_cndmask_b32_e64 v3, v3, 0, s[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v14, v8, v[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v5, v3, 0, s[6:7] ; GISEL-NEXT: ; implicit-def: $vgpr8 -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v3, v9, v[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v5, v9, v[6:7] ; GISEL-NEXT: ; implicit-def: $vgpr6 ; GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GISEL-NEXT: ; implicit-def: $vgpr9 @@ -999,12 +1001,14 @@ define i128 @fptosi_f32_to_i128(float %x) { ; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v2 ; GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GISEL-NEXT: v_cndmask_b32_e32 v4, v0, v4, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v4, v9, 0 -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[8:9], v4, v8, 0 -; GISEL-NEXT: v_mul_lo_u32 v5, v4, v8 -; GISEL-NEXT: v_mad_u64_u32 v[1:2], vcc, v4, v8, v[1:2] -; GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v5, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v6, v0, v4, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v6, v9, 0 +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[8:9], v6, v8, 0 +; GISEL-NEXT: v_mov_b32_e32 v4, v1 +; GISEL-NEXT: v_mul_lo_u32 v7, v6, v8 +; GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GISEL-NEXT: v_mad_u64_u32 v[1:2], vcc, v6, v8, v[4:5] +; GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v7, vcc ; GISEL-NEXT: .LBB2_6: ; %Flow1 ; GISEL-NEXT: s_or_b64 exec, exec, s[6:7] ; GISEL-NEXT: .LBB2_7: ; %Flow2 @@ -1314,31 +1318,31 @@ define i128 @fptoui_f32_to_i128(float %x) { ; GISEL-NEXT: v_add_u32_e32 v7, 0xffffff6a, v6 ; GISEL-NEXT: v_lshlrev_b64 v[0:1], v7, v[4:5] ; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7 -; GISEL-NEXT: v_cndmask_b32_e32 v10, 0, v0, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v11, 0, v1, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v10, v8, 0 +; GISEL-NEXT: v_cndmask_b32_e32 v12, 0, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v13, 0, v1, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v12, v8, 0 ; GISEL-NEXT: v_add_u32_e32 v6, 0xffffff2a, v6 ; GISEL-NEXT: v_sub_u32_e32 v2, 64, v7 ; GISEL-NEXT: v_lshrrev_b64 v[2:3], v2, v[4:5] ; GISEL-NEXT: v_lshlrev_b64 v[4:5], v6, v[4:5] ; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 
0, v7 -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v11, v8, v[0:1] +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v13, v8, v[0:1] ; GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v12, v2, 0, s[6:7] -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v9, v[6:7] -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v10, v9, 0 -; GISEL-NEXT: v_mov_b32_e32 v2, v6 -; GISEL-NEXT: v_mul_lo_u32 v6, v10, v8 -; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[8:9], v10, v8, v[1:2] -; GISEL-NEXT: v_mul_lo_u32 v4, v11, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v14, v2, 0, s[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[8:9], v14, v9, v[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v12, v9, 0 +; GISEL-NEXT: v_mov_b32_e32 v2, v10 +; GISEL-NEXT: v_mul_lo_u32 v10, v12, v8 +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v8, v[1:2] +; GISEL-NEXT: v_mul_lo_u32 v4, v13, v8 ; GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v11, v9, v[1:2] -; GISEL-NEXT: v_addc_co_u32_e64 v6, s[10:11], v7, v6, s[10:11] +; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v13, v9, v[6:7] +; GISEL-NEXT: v_addc_co_u32_e64 v6, s[10:11], v11, v10, s[10:11] ; GISEL-NEXT: v_addc_co_u32_e64 v4, s[8:9], v6, v4, s[8:9] -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v8, v[4:5] -; GISEL-NEXT: v_cndmask_b32_e64 v3, v3, 0, s[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v14, v8, v[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v5, v3, 0, s[6:7] ; GISEL-NEXT: ; implicit-def: $vgpr8 -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v3, v9, v[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v5, v9, v[6:7] ; GISEL-NEXT: ; implicit-def: $vgpr6 ; GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GISEL-NEXT: ; implicit-def: $vgpr9 @@ -1351,12 +1355,14 @@ define i128 @fptoui_f32_to_i128(float %x) { ; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v2 ; GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GISEL-NEXT: v_cndmask_b32_e32 v4, v0, v4, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v4, v9, 0 -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[8:9], v4, v8, 0 -; GISEL-NEXT: v_mul_lo_u32 v5, v4, v8 -; GISEL-NEXT: v_mad_u64_u32 v[1:2], vcc, v4, v8, v[1:2] -; GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v5, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v6, v0, v4, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v6, v9, 0 +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[8:9], v6, v8, 0 +; GISEL-NEXT: v_mov_b32_e32 v4, v1 +; GISEL-NEXT: v_mul_lo_u32 v7, v6, v8 +; GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GISEL-NEXT: v_mad_u64_u32 v[1:2], vcc, v6, v8, v[4:5] +; GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v7, vcc ; GISEL-NEXT: .LBB3_6: ; %Flow1 ; GISEL-NEXT: s_or_b64 exec, exec, s[6:7] ; GISEL-NEXT: .LBB3_7: ; %Flow2 @@ -1702,31 +1708,31 @@ define i128 @fptosi_bf16_to_i128(bfloat %x) { ; GISEL-NEXT: v_add_u32_e32 v10, 0xffffff7a, v5 ; GISEL-NEXT: v_lshlrev_b64 v[0:1], v10, v[6:7] ; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v10 -; GISEL-NEXT: v_cndmask_b32_e32 v11, 0, v0, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v12, 0, v1, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v11, v9, 0 +; GISEL-NEXT: v_cndmask_b32_e32 v12, 0, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v13, 0, v1, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v12, v9, 0 ; GISEL-NEXT: v_add_u32_e32 v4, 0xffffff3a, v5 ; GISEL-NEXT: v_sub_u32_e32 v2, 64, v10 ; GISEL-NEXT: v_lshrrev_b64 v[2:3], v2, v[6:7] ; GISEL-NEXT: v_lshlrev_b64 v[4:5], v4, v[6:7] -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v9, v[0:1] +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v13, v9, v[0:1] ; 
GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v10, v2, 0, s[6:7] -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v10, v8, v[6:7] -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v11, v8, 0 -; GISEL-NEXT: v_mov_b32_e32 v2, v6 -; GISEL-NEXT: v_mul_lo_u32 v6, v11, v9 -; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[8:9], v11, v9, v[1:2] -; GISEL-NEXT: v_mul_lo_u32 v4, v12, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v14, v2, 0, s[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[8:9], v14, v8, v[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v12, v8, 0 +; GISEL-NEXT: v_mov_b32_e32 v2, v10 +; GISEL-NEXT: v_mul_lo_u32 v10, v12, v9 +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v9, v[1:2] +; GISEL-NEXT: v_mul_lo_u32 v4, v13, v9 ; GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v12, v8, v[1:2] -; GISEL-NEXT: v_addc_co_u32_e64 v6, s[10:11], v7, v6, s[10:11] +; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v13, v8, v[6:7] +; GISEL-NEXT: v_addc_co_u32_e64 v6, s[10:11], v11, v10, s[10:11] ; GISEL-NEXT: v_addc_co_u32_e64 v4, s[8:9], v6, v4, s[8:9] -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v10, v9, v[4:5] -; GISEL-NEXT: v_cndmask_b32_e64 v3, v3, 0, s[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v14, v9, v[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v5, v3, 0, s[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v5, v8, v[6:7] ; GISEL-NEXT: ; implicit-def: $vgpr5 -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v3, v8, v[6:7] ; GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7 ; GISEL-NEXT: ; implicit-def: $vgpr8 ; GISEL-NEXT: .LBB6_4: ; %Flow @@ -2050,31 +2056,31 @@ define i128 @fptoui_bf16_to_i128(bfloat %x) { ; GISEL-NEXT: v_add_u32_e32 v10, 0xffffff7a, v5 ; GISEL-NEXT: v_lshlrev_b64 v[0:1], v10, v[6:7] ; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v10 -; GISEL-NEXT: v_cndmask_b32_e32 v11, 0, v0, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v12, 0, v1, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v11, v9, 0 +; GISEL-NEXT: v_cndmask_b32_e32 v12, 0, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v13, 0, v1, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v12, v9, 0 ; GISEL-NEXT: v_add_u32_e32 v4, 0xffffff3a, v5 ; GISEL-NEXT: v_sub_u32_e32 v2, 64, v10 ; GISEL-NEXT: v_lshrrev_b64 v[2:3], v2, v[6:7] ; GISEL-NEXT: v_lshlrev_b64 v[4:5], v4, v[6:7] -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v9, v[0:1] +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v13, v9, v[0:1] ; GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v10, v2, 0, s[6:7] -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v10, v8, v[6:7] -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v11, v8, 0 -; GISEL-NEXT: v_mov_b32_e32 v2, v6 -; GISEL-NEXT: v_mul_lo_u32 v6, v11, v9 -; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[8:9], v11, v9, v[1:2] -; GISEL-NEXT: v_mul_lo_u32 v4, v12, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v14, v2, 0, s[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[8:9], v14, v8, v[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v12, v8, 0 +; GISEL-NEXT: v_mov_b32_e32 v2, v10 +; GISEL-NEXT: v_mul_lo_u32 v10, v12, v9 +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v9, v[1:2] +; GISEL-NEXT: v_mul_lo_u32 v4, v13, v9 ; GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v12, v8, v[1:2] -; GISEL-NEXT: v_addc_co_u32_e64 v6, s[10:11], v7, v6, s[10:11] +; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v13, v8, v[6:7] +; GISEL-NEXT: v_addc_co_u32_e64 v6, s[10:11], v11, v10, 
s[10:11] ; GISEL-NEXT: v_addc_co_u32_e64 v4, s[8:9], v6, v4, s[8:9] -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v10, v9, v[4:5] -; GISEL-NEXT: v_cndmask_b32_e64 v3, v3, 0, s[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v14, v9, v[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v5, v3, 0, s[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v5, v8, v[6:7] ; GISEL-NEXT: ; implicit-def: $vgpr5 -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v3, v8, v[6:7] ; GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7 ; GISEL-NEXT: ; implicit-def: $vgpr8 ; GISEL-NEXT: .LBB7_4: ; %Flow diff --git a/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll b/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll index 31b6b533866d4..f705a2ffc4f1d 100644 --- a/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll +++ b/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll @@ -5775,28 +5775,28 @@ define i64 @clpeak_imad_pat_i64(i64 %x, i64 %y) { ; GFX7-GISEL-LABEL: clpeak_imad_pat_i64: ; GFX7-GISEL: ; %bb.0: ; %entry ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-GISEL-NEXT: v_add_i32_e32 v6, vcc, 1, v0 -; GFX7-GISEL-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v2, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v3, v[1:2] -; GFX7-GISEL-NEXT: v_add_i32_e32 v8, vcc, v0, v6 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v7, v2, v[4:5] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v8, v2, 0 -; GFX7-GISEL-NEXT: v_addc_u32_e32 v9, vcc, v4, v7, vcc -; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v6 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v8, v3, v[1:2] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v2, v[6:7] -; GFX7-GISEL-NEXT: v_add_i32_e32 v6, vcc, 1, v0 -; GFX7-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v4, vcc -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v5, v6, 0 -; GFX7-GISEL-NEXT: v_add_i32_e32 v7, vcc, 1, v5 -; GFX7-GISEL-NEXT: v_mov_b32_e32 v0, v4 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v5, v1, v[0:1] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v3, v7, 0 -; GFX7-GISEL-NEXT: v_addc_u32_e32 v8, vcc, 0, v2, vcc -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v6, v[4:5] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v8, v[1:2] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v4, v7, v[1:2] +; GFX7-GISEL-NEXT: v_add_i32_e32 v8, vcc, 1, v0 +; GFX7-GISEL-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v2, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v8, v3, v[1:2] +; GFX7-GISEL-NEXT: v_add_i32_e32 v10, vcc, v0, v8 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v2, v[4:5] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v2, 0 +; GFX7-GISEL-NEXT: v_addc_u32_e32 v11, vcc, v6, v9, vcc +; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v5 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v10, v3, v[1:2] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v11, v2, v[7:8] +; GFX7-GISEL-NEXT: v_add_i32_e32 v7, vcc, 1, v0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v7, 0 +; GFX7-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v6, vcc +; GFX7-GISEL-NEXT: v_mov_b32_e32 v0, v3 +; GFX7-GISEL-NEXT: v_add_i32_e32 v8, vcc, 1, v4 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v1, v[0:1] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v8, 0 +; GFX7-GISEL-NEXT: v_addc_u32_e32 v10, vcc, 0, v9, vcc +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v7, v[5:6] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v10, v[1:2] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v8, 
v[4:5] ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: clpeak_imad_pat_i64: @@ -5831,28 +5831,28 @@ define i64 @clpeak_imad_pat_i64(i64 %x, i64 %y) { ; GFX8-GISEL-LABEL: clpeak_imad_pat_i64: ; GFX8-GISEL: ; %bb.0: ; %entry ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-GISEL-NEXT: v_add_u32_e32 v6, vcc, 1, v0 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v2, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v3, v[1:2] -; GFX8-GISEL-NEXT: v_add_u32_e32 v8, vcc, v0, v6 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v7, v2, v[4:5] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v8, v2, 0 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v9, vcc, v4, v7, vcc -; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v6 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v8, v3, v[1:2] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v2, v[6:7] -; GFX8-GISEL-NEXT: v_add_u32_e32 v6, vcc, 1, v0 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v4, vcc -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v5, v6, 0 -; GFX8-GISEL-NEXT: v_add_u32_e32 v7, vcc, 1, v5 -; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, v4 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v5, v1, v[0:1] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v3, v7, 0 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v8, vcc, 0, v2, vcc -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v6, v[4:5] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v8, v[1:2] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v4, v7, v[1:2] +; GFX8-GISEL-NEXT: v_add_u32_e32 v8, vcc, 1, v0 +; GFX8-GISEL-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v2, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v8, v3, v[1:2] +; GFX8-GISEL-NEXT: v_add_u32_e32 v10, vcc, v0, v8 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v2, v[4:5] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v2, 0 +; GFX8-GISEL-NEXT: v_addc_u32_e32 v11, vcc, v6, v9, vcc +; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v5 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v10, v3, v[1:2] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v11, v2, v[7:8] +; GFX8-GISEL-NEXT: v_add_u32_e32 v7, vcc, 1, v0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v7, 0 +; GFX8-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v6, vcc +; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, v3 +; GFX8-GISEL-NEXT: v_add_u32_e32 v8, vcc, 1, v4 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v1, v[0:1] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v8, 0 +; GFX8-GISEL-NEXT: v_addc_u32_e32 v10, vcc, 0, v9, vcc +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v7, v[5:6] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v10, v[1:2] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v8, v[4:5] ; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX900-SDAG-LABEL: clpeak_imad_pat_i64: @@ -5883,28 +5883,28 @@ define i64 @clpeak_imad_pat_i64(i64 %x, i64 %y) { ; GFX900-GISEL-LABEL: clpeak_imad_pat_i64: ; GFX900-GISEL: ; %bb.0: ; %entry ; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-GISEL-NEXT: v_add_co_u32_e32 v6, vcc, 1, v0 -; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v2, 0 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v3, v[1:2] -; GFX900-GISEL-NEXT: v_add_co_u32_e32 v8, vcc, v0, v6 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v7, v2, v[4:5] -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v8, v2, 0 -; 
GFX900-GISEL-NEXT: v_addc_co_u32_e32 v9, vcc, v4, v7, vcc -; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, v6 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v8, v3, v[1:2] -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v2, v[6:7] -; GFX900-GISEL-NEXT: v_add_co_u32_e32 v6, vcc, 1, v0 -; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v4, vcc -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v5, v6, 0 -; GFX900-GISEL-NEXT: v_add_co_u32_e32 v7, vcc, 1, v5 -; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v5, v1, v[0:1] -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v3, v7, 0 -; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v2, vcc -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v6, v[4:5] -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v8, v[1:2] -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v4, v7, v[1:2] +; GFX900-GISEL-NEXT: v_add_co_u32_e32 v8, vcc, 1, v0 +; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v1, vcc +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v2, 0 +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v8, v3, v[1:2] +; GFX900-GISEL-NEXT: v_add_co_u32_e32 v10, vcc, v0, v8 +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v2, v[4:5] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v2, 0 +; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v11, vcc, v6, v9, vcc +; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v10, v3, v[1:2] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v11, v2, v[7:8] +; GFX900-GISEL-NEXT: v_add_co_u32_e32 v7, vcc, 1, v0 +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v7, 0 +; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v6, vcc +; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-GISEL-NEXT: v_add_co_u32_e32 v8, vcc, 1, v4 +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v1, v[0:1] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v8, 0 +; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v10, vcc, 0, v9, vcc +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v7, v[5:6] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v10, v[1:2] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v8, v[4:5] ; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-SDAG-LABEL: clpeak_imad_pat_i64: @@ -5935,29 +5935,29 @@ define i64 @clpeak_imad_pat_i64(i64 %x, i64 %y) { ; GFX90A-GISEL-LABEL: clpeak_imad_pat_i64: ; GFX90A-GISEL: ; %bb.0: ; %entry ; GFX90A-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v6, vcc, 1, v0 -; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v3, 0 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v2, 0 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v7, v2, v[4:5] -; GFX90A-GISEL-NEXT: v_add_u32_e32 v1, v1, v4 -; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v6, vcc, v0, v6 -; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v8, vcc, v1, v7, vcc -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v2, 0 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v6, v3, 0 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v8, v2, v[6:7] -; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v6, vcc, 1, v0 -; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v1, vcc -; GFX90A-GISEL-NEXT: v_add_u32_e32 v5, v5, v2 -; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v7, vcc, 1, v4 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v0, 0 -; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v5, vcc -; GFX90A-GISEL-NEXT: 
v_mad_u64_u32 v[2:3], s[4:5], v4, v6, 0 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v6, v[0:1] -; GFX90A-GISEL-NEXT: v_add_u32_e32 v4, v3, v0 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v7, 0 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v2, v8, 0 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v7, v[2:3] +; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v8, vcc, 1, v0 +; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v1, vcc +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v8, v3, 0 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v2, 0 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v2, v[4:5] +; GFX90A-GISEL-NEXT: v_add_u32_e32 v1, v1, v6 +; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v8, vcc, v0, v8 +; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v10, vcc, v1, v9, vcc +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v8, v3, 0 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v8, v2, 0 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v10, v2, v[6:7] +; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v7, vcc, 1, v0 +; GFX90A-GISEL-NEXT: v_add_u32_e32 v6, v5, v8 +; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc +; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v8, vcc, 1, v4 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v5, 0 +; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v6, vcc +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v7, 0 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v7, v[0:1] +; GFX90A-GISEL-NEXT: v_add_u32_e32 v6, v3, v4 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v9, 0 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v8, 0 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v8, v[4:5] ; GFX90A-GISEL-NEXT: v_add_u32_e32 v1, v1, v2 ; GFX90A-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -6408,52 +6408,52 @@ define <2 x i64> @clpeak_imad_pat_v2i64(<2 x i64> %x, <2 x i64> %y) { ; GFX7-GISEL-LABEL: clpeak_imad_pat_v2i64: ; GFX7-GISEL: ; %bb.0: ; %entry ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-GISEL-NEXT: v_add_i32_e32 v12, vcc, 1, v0 -; GFX7-GISEL-NEXT: v_addc_u32_e32 v13, vcc, 0, v1, vcc -; GFX7-GISEL-NEXT: v_add_i32_e32 v14, vcc, 1, v2 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v4, 0 -; GFX7-GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v3, vcc -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v14, v6, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v5, v[1:2] +; GFX7-GISEL-NEXT: v_add_i32_e32 v14, vcc, 1, v0 +; GFX7-GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v1, vcc +; GFX7-GISEL-NEXT: v_add_i32_e32 v16, vcc, 1, v2 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v14, v4, 0 +; GFX7-GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v3, vcc +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v16, v6, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v14, v5, v[1:2] ; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v3 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v7, v[1:2] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v4, v[8:9] -; GFX7-GISEL-NEXT: v_add_i32_e32 v3, vcc, v0, v12 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v15, v6, v[10:11] -; GFX7-GISEL-NEXT: v_addc_u32_e32 v16, vcc, v8, v13, vcc -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v3, v4, 0 -; GFX7-GISEL-NEXT: v_add_i32_e32 v17, vcc, v2, v14 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v17, v6, 0 -; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v11 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v3, v5, v[1:2] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v16, v7, v[1:2] +; GFX7-GISEL-NEXT: 
v_mad_u64_u32 v[12:13], s[4:5], v15, v4, v[8:9] +; GFX7-GISEL-NEXT: v_add_i32_e32 v3, vcc, v0, v14 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v17, v6, v[10:11] +; GFX7-GISEL-NEXT: v_addc_u32_e32 v18, vcc, v12, v15, vcc +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v3, v4, 0 +; GFX7-GISEL-NEXT: v_add_i32_e32 v16, vcc, v2, v16 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v16, v6, 0 +; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v10 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v3, v5, v[1:2] ; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v14 -; GFX7-GISEL-NEXT: v_addc_u32_e32 v18, vcc, v9, v15, vcc -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v17, v7, v[1:2] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v16, v4, v[11:12] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v18, v6, v[14:15] -; GFX7-GISEL-NEXT: v_add_i32_e32 v11, vcc, 1, v0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v10, v11, 0 -; GFX7-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v8, vcc +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v16, v7, v[1:2] +; GFX7-GISEL-NEXT: v_addc_u32_e32 v19, vcc, v8, v17, vcc +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v18, v4, v[10:11] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v19, v6, v[14:15] +; GFX7-GISEL-NEXT: v_add_i32_e32 v10, vcc, 1, v0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v10, 0 +; GFX7-GISEL-NEXT: v_addc_u32_e32 v6, vcc, 0, v12, vcc ; GFX7-GISEL-NEXT: v_add_i32_e32 v12, vcc, 1, v2 +; GFX7-GISEL-NEXT: v_mov_b32_e32 v0, v5 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v6, v[0:1] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v12, 0 +; GFX7-GISEL-NEXT: v_addc_u32_e32 v11, vcc, 0, v8, vcc +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v16, v10, v[1:2] ; GFX7-GISEL-NEXT: v_mov_b32_e32 v0, v6 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v10, v1, v[0:1] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v13, v12, 0 -; GFX7-GISEL-NEXT: v_addc_u32_e32 v2, vcc, 0, v9, vcc -; GFX7-GISEL-NEXT: v_add_i32_e32 v14, vcc, 1, v10 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v3, v11, v[0:1] -; GFX7-GISEL-NEXT: v_mov_b32_e32 v0, v7 -; GFX7-GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v3, vcc -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v13, v2, v[0:1] -; GFX7-GISEL-NEXT: v_add_i32_e32 v16, vcc, 1, v13 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v14, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v4, v12, v[2:3] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v16, 0 -; GFX7-GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v4, vcc -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v5, v15, v[1:2] +; GFX7-GISEL-NEXT: v_add_i32_e32 v14, vcc, 1, v9 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v11, v[0:1] +; GFX7-GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v16, vcc +; GFX7-GISEL-NEXT: v_add_i32_e32 v17, vcc, 1, v13 +; GFX7-GISEL-NEXT: v_addc_u32_e32 v18, vcc, 0, v3, vcc +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v14, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v3, v12, v[8:9] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v5, v17, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v4, v15, v[1:2] ; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v3 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v17, v[1:2] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v8, v14, v[4:5] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v16, v[10:11] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v5, v18, v[1:2] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v7, v14, v[8:9] +; GFX7-GISEL-NEXT: 
v_mad_u64_u32 v[3:4], s[4:5], v10, v17, v[11:12] ; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v5 ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -6513,52 +6513,52 @@ define <2 x i64> @clpeak_imad_pat_v2i64(<2 x i64> %x, <2 x i64> %y) { ; GFX8-GISEL-LABEL: clpeak_imad_pat_v2i64: ; GFX8-GISEL: ; %bb.0: ; %entry ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-GISEL-NEXT: v_add_u32_e32 v12, vcc, 1, v0 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v13, vcc, 0, v1, vcc -; GFX8-GISEL-NEXT: v_add_u32_e32 v14, vcc, 1, v2 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v4, 0 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v3, vcc -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v14, v6, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v5, v[1:2] +; GFX8-GISEL-NEXT: v_add_u32_e32 v14, vcc, 1, v0 +; GFX8-GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v1, vcc +; GFX8-GISEL-NEXT: v_add_u32_e32 v16, vcc, 1, v2 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v14, v4, 0 +; GFX8-GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v3, vcc +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v16, v6, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v14, v5, v[1:2] ; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v3 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v7, v[1:2] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v4, v[8:9] -; GFX8-GISEL-NEXT: v_add_u32_e32 v3, vcc, v0, v12 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v15, v6, v[10:11] -; GFX8-GISEL-NEXT: v_addc_u32_e32 v16, vcc, v8, v13, vcc -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v3, v4, 0 -; GFX8-GISEL-NEXT: v_add_u32_e32 v17, vcc, v2, v14 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v17, v6, 0 -; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v11 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v3, v5, v[1:2] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v16, v7, v[1:2] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v15, v4, v[8:9] +; GFX8-GISEL-NEXT: v_add_u32_e32 v3, vcc, v0, v14 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v17, v6, v[10:11] +; GFX8-GISEL-NEXT: v_addc_u32_e32 v18, vcc, v12, v15, vcc +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v3, v4, 0 +; GFX8-GISEL-NEXT: v_add_u32_e32 v16, vcc, v2, v16 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v16, v6, 0 +; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v10 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v3, v5, v[1:2] ; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v14 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v18, vcc, v9, v15, vcc -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v17, v7, v[1:2] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v16, v4, v[11:12] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v18, v6, v[14:15] -; GFX8-GISEL-NEXT: v_add_u32_e32 v11, vcc, 1, v0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v10, v11, 0 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v8, vcc +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v16, v7, v[1:2] +; GFX8-GISEL-NEXT: v_addc_u32_e32 v19, vcc, v8, v17, vcc +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v18, v4, v[10:11] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v19, v6, v[14:15] +; GFX8-GISEL-NEXT: v_add_u32_e32 v10, vcc, 1, v0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v10, 0 +; GFX8-GISEL-NEXT: v_addc_u32_e32 v6, vcc, 0, v12, vcc ; GFX8-GISEL-NEXT: v_add_u32_e32 v12, vcc, 1, v2 +; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, v5 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v6, v[0:1] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v12, 0 +; GFX8-GISEL-NEXT: v_addc_u32_e32 v11, 
vcc, 0, v8, vcc +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v16, v10, v[1:2] ; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, v6 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v10, v1, v[0:1] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v13, v12, 0 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v2, vcc, 0, v9, vcc -; GFX8-GISEL-NEXT: v_add_u32_e32 v14, vcc, 1, v10 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v3, v11, v[0:1] -; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, v7 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v3, vcc -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v13, v2, v[0:1] -; GFX8-GISEL-NEXT: v_add_u32_e32 v16, vcc, 1, v13 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v14, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v4, v12, v[2:3] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v16, 0 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v4, vcc -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v5, v15, v[1:2] +; GFX8-GISEL-NEXT: v_add_u32_e32 v14, vcc, 1, v9 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v11, v[0:1] +; GFX8-GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v16, vcc +; GFX8-GISEL-NEXT: v_add_u32_e32 v17, vcc, 1, v13 +; GFX8-GISEL-NEXT: v_addc_u32_e32 v18, vcc, 0, v3, vcc +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v14, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v3, v12, v[8:9] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v5, v17, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v4, v15, v[1:2] ; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v3 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v17, v[1:2] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v8, v14, v[4:5] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v16, v[10:11] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v5, v18, v[1:2] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v7, v14, v[8:9] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v10, v17, v[11:12] ; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v5 ; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -6610,52 +6610,52 @@ define <2 x i64> @clpeak_imad_pat_v2i64(<2 x i64> %x, <2 x i64> %y) { ; GFX900-GISEL-LABEL: clpeak_imad_pat_v2i64: ; GFX900-GISEL: ; %bb.0: ; %entry ; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-GISEL-NEXT: v_add_co_u32_e32 v12, vcc, 1, v0 -; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v1, vcc -; GFX900-GISEL-NEXT: v_add_co_u32_e32 v14, vcc, 1, v2 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v4, 0 -; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v3, vcc -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v14, v6, 0 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v5, v[1:2] +; GFX900-GISEL-NEXT: v_add_co_u32_e32 v14, vcc, 1, v0 +; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v1, vcc +; GFX900-GISEL-NEXT: v_add_co_u32_e32 v16, vcc, 1, v2 +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v14, v4, 0 +; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v3, vcc +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v16, v6, 0 +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v14, v5, v[1:2] ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v7, v[1:2] -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v4, v[8:9] -; GFX900-GISEL-NEXT: v_add_co_u32_e32 v3, vcc, v0, v12 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v15, v6, v[10:11] -; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v16, vcc, v8, v13, vcc -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v3, v4, 0 -; GFX900-GISEL-NEXT: 
v_add_co_u32_e32 v17, vcc, v2, v14 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v17, v6, 0 -; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, v11 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v3, v5, v[1:2] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v16, v7, v[1:2] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v15, v4, v[8:9] +; GFX900-GISEL-NEXT: v_add_co_u32_e32 v3, vcc, v0, v14 +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v17, v6, v[10:11] +; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v18, vcc, v12, v15, vcc +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v3, v4, 0 +; GFX900-GISEL-NEXT: v_add_co_u32_e32 v16, vcc, v2, v16 +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v16, v6, 0 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, v10 +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v3, v5, v[1:2] ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, v14 -; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v18, vcc, v9, v15, vcc -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v17, v7, v[1:2] -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v16, v4, v[11:12] -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v18, v6, v[14:15] -; GFX900-GISEL-NEXT: v_add_co_u32_e32 v11, vcc, 1, v0 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v10, v11, 0 -; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v8, vcc +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v16, v7, v[1:2] +; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v19, vcc, v8, v17, vcc +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v18, v4, v[10:11] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v19, v6, v[14:15] +; GFX900-GISEL-NEXT: v_add_co_u32_e32 v10, vcc, 1, v0 +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v10, 0 +; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v12, vcc ; GFX900-GISEL-NEXT: v_add_co_u32_e32 v12, vcc, 1, v2 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v6, v[0:1] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v12, 0 +; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v8, vcc +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v16, v10, v[1:2] ; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v10, v1, v[0:1] -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v13, v12, 0 -; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v9, vcc -; GFX900-GISEL-NEXT: v_add_co_u32_e32 v14, vcc, 1, v10 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v3, v11, v[0:1] -; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, v7 -; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v3, vcc -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v13, v2, v[0:1] -; GFX900-GISEL-NEXT: v_add_co_u32_e32 v16, vcc, 1, v13 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v14, 0 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v4, v12, v[2:3] -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v16, 0 -; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v4, vcc -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v5, v15, v[1:2] +; GFX900-GISEL-NEXT: v_add_co_u32_e32 v14, vcc, 1, v9 +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v11, v[0:1] +; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v16, vcc +; GFX900-GISEL-NEXT: v_add_co_u32_e32 v17, vcc, 1, v13 +; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v18, vcc, 0, v3, vcc +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v14, 0 +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v3, v12, v[8:9] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v5, v17, 
0 +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v4, v15, v[1:2] ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v17, v[1:2] -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v8, v14, v[4:5] -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v16, v[10:11] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v5, v18, v[1:2] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v7, v14, v[8:9] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v10, v17, v[11:12] ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, v5 ; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -6707,54 +6707,54 @@ define <2 x i64> @clpeak_imad_pat_v2i64(<2 x i64> %x, <2 x i64> %y) { ; GFX90A-GISEL-LABEL: clpeak_imad_pat_v2i64: ; GFX90A-GISEL: ; %bb.0: ; %entry ; GFX90A-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v10, vcc, 1, v0 -; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v1, vcc -; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v12, vcc, 1, v2 -; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v3, vcc -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v10, v5, 0 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v10, v4, 0 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v11, v4, v[2:3] -; GFX90A-GISEL-NEXT: v_add_u32_e32 v1, v1, v2 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v7, 0 -; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v10, vcc, v0, v10 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v12, v6, 0 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v6, v[8:9] -; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v14, vcc, v1, v11, vcc -; GFX90A-GISEL-NEXT: v_add_u32_e32 v3, v3, v8 -; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v12, vcc, v2, v12 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v10, v4, 0 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v10, v5, 0 -; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v13, vcc, v3, v13, vcc -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v14, v4, v[10:11] -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v12, v7, 0 -; GFX90A-GISEL-NEXT: v_add_u32_e32 v9, v9, v4 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v12, v6, 0 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v13, v6, v[10:11] -; GFX90A-GISEL-NEXT: v_add_u32_e32 v5, v5, v6 -; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v6, vcc, 1, v0 -; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v1, vcc -; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v10, vcc, 1, v2 -; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v3, vcc -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v0, 0 -; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v12, vcc, 1, v8 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v8, v6, 0 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v9, v6, v[0:1] -; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v9, vcc -; GFX90A-GISEL-NEXT: v_add_u32_e32 v8, v3, v0 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v11, 0 -; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v14, vcc, 1, v4 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v4, v10, 0 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v10, v[0:1] -; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v5, vcc -; GFX90A-GISEL-NEXT: v_add_u32_e32 v7, v7, v0 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v12, 0 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v2, v13, 0 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v8, v12, v[2:3] -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v15, 0 +; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v12, vcc, 1, v0 +; 
GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v1, vcc +; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v14, vcc, 1, v2 +; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v3, vcc +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v12, v5, 0 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v4, 0 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v4, v[2:3] +; GFX90A-GISEL-NEXT: v_add_u32_e32 v1, v1, v8 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v14, v7, 0 +; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v12, vcc, v0, v12 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v14, v6, 0 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v15, v6, v[8:9] +; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v16, vcc, v1, v13, vcc +; GFX90A-GISEL-NEXT: v_add_u32_e32 v3, v3, v10 +; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v14, vcc, v2, v14 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v12, v5, 0 +; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v15, vcc, v3, v15, vcc +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v4, 0 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v4, v[10:11] +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v7, 0 +; GFX90A-GISEL-NEXT: v_add_u32_e32 v9, v9, v12 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v14, v6, 0 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v15, v6, v[10:11] +; GFX90A-GISEL-NEXT: v_add_u32_e32 v10, v5, v12 +; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v5, vcc, 1, v0 +; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v1, vcc +; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v11, vcc, 1, v2 +; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v12, vcc, 0, v3, vcc +; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v13, vcc, 1, v8 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v6, 0 +; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v14, vcc, 0, v9, vcc +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v8, v5, 0 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v5, v[0:1] +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v12, 0 +; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v15, vcc, 1, v4 +; GFX90A-GISEL-NEXT: v_add_u32_e32 v8, v3, v6 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v4, v11, 0 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v11, v[0:1] +; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v16, vcc, 0, v10, vcc +; GFX90A-GISEL-NEXT: v_add_u32_e32 v9, v7, v4 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v14, 0 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v13, 0 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v8, v13, v[4:5] +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v16, 0 ; GFX90A-GISEL-NEXT: v_add_u32_e32 v1, v1, v2 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v14, 0 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v7, v14, v[4:5] -; GFX90A-GISEL-NEXT: v_add_u32_e32 v3, v3, v4 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v15, 0 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v15, v[4:5] +; GFX90A-GISEL-NEXT: v_add_u32_e32 v3, v3, v6 ; GFX90A-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-SDAG-LABEL: clpeak_imad_pat_v2i64: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w32.ll index 1ab4cb0f00192..d82d6bcb437cc 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w32.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w32.ll @@ -781,16 +781,23 @@ define amdgpu_cs_chain void @wwm_write_to_arg_reg(<3 x i32> inreg %sgpr, ptr inr ; GISEL12-NEXT: v_dual_mov_b32 v14, v38 :: v_dual_mov_b32 
v15, v39 ; GISEL12-NEXT: s_wait_kmcnt 0x0 ; GISEL12-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GISEL12-NEXT: v_dual_mov_b32 v24, v0 :: v_dual_mov_b32 v25, v1 -; GISEL12-NEXT: v_dual_mov_b32 v26, v2 :: v_dual_mov_b32 v27, v3 -; GISEL12-NEXT: v_dual_mov_b32 v28, v4 :: v_dual_mov_b32 v29, v5 -; GISEL12-NEXT: v_dual_mov_b32 v30, v6 :: v_dual_mov_b32 v31, v7 -; GISEL12-NEXT: v_dual_mov_b32 v32, v8 :: v_dual_mov_b32 v33, v9 -; GISEL12-NEXT: v_dual_mov_b32 v34, v10 :: v_dual_mov_b32 v35, v11 -; GISEL12-NEXT: v_dual_mov_b32 v36, v12 :: v_dual_mov_b32 v37, v13 -; GISEL12-NEXT: v_dual_mov_b32 v38, v14 :: v_dual_mov_b32 v39, v15 +; GISEL12-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v41, v1 +; GISEL12-NEXT: v_dual_mov_b32 v42, v2 :: v_dual_mov_b32 v43, v3 +; GISEL12-NEXT: v_dual_mov_b32 v44, v4 :: v_dual_mov_b32 v45, v5 +; GISEL12-NEXT: v_dual_mov_b32 v46, v6 :: v_dual_mov_b32 v47, v7 +; GISEL12-NEXT: v_dual_mov_b32 v48, v8 :: v_dual_mov_b32 v49, v9 +; GISEL12-NEXT: v_dual_mov_b32 v50, v10 :: v_dual_mov_b32 v51, v11 +; GISEL12-NEXT: v_dual_mov_b32 v52, v12 :: v_dual_mov_b32 v53, v13 +; GISEL12-NEXT: v_dual_mov_b32 v54, v14 :: v_dual_mov_b32 v55, v15 ; GISEL12-NEXT: s_mov_b32 exec_lo, s9 -; GISEL12-NEXT: ; kill: def $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 killed $exec +; GISEL12-NEXT: v_dual_mov_b32 v24, v40 :: v_dual_mov_b32 v25, v41 +; GISEL12-NEXT: v_dual_mov_b32 v26, v42 :: v_dual_mov_b32 v27, v43 +; GISEL12-NEXT: v_dual_mov_b32 v28, v44 :: v_dual_mov_b32 v29, v45 +; GISEL12-NEXT: v_dual_mov_b32 v30, v46 :: v_dual_mov_b32 v31, v47 +; GISEL12-NEXT: v_dual_mov_b32 v32, v48 :: v_dual_mov_b32 v33, v49 +; GISEL12-NEXT: v_dual_mov_b32 v34, v50 :: v_dual_mov_b32 v35, v51 +; GISEL12-NEXT: v_dual_mov_b32 v36, v52 :: v_dual_mov_b32 v37, v53 +; GISEL12-NEXT: v_dual_mov_b32 v38, v54 :: v_dual_mov_b32 v39, v55 ; GISEL12-NEXT: .LBB5_2: ; %tail ; GISEL12-NEXT: s_wait_alu 0xfffe ; GISEL12-NEXT: s_or_b32 exec_lo, exec_lo, s4 @@ -946,24 +953,39 @@ define amdgpu_cs_chain void @wwm_write_to_arg_reg(<3 x i32> inreg %sgpr, ptr inr ; GISEL10-NEXT: s_mov_b64 s[2:3], s[50:51] ; GISEL10-NEXT: s_waitcnt lgkmcnt(0) ; GISEL10-NEXT: s_swappc_b64 s[30:31], s[12:13] -; GISEL10-NEXT: v_mov_b32_e32 v24, v0 -; GISEL10-NEXT: v_mov_b32_e32 v25, v1 -; GISEL10-NEXT: v_mov_b32_e32 v26, v2 -; GISEL10-NEXT: v_mov_b32_e32 v27, v3 -; GISEL10-NEXT: v_mov_b32_e32 v28, v4 -; GISEL10-NEXT: v_mov_b32_e32 v29, v5 -; GISEL10-NEXT: v_mov_b32_e32 v30, v6 -; GISEL10-NEXT: v_mov_b32_e32 v31, v7 -; GISEL10-NEXT: v_mov_b32_e32 v32, v8 -; GISEL10-NEXT: v_mov_b32_e32 v33, v9 -; GISEL10-NEXT: v_mov_b32_e32 v34, v10 -; GISEL10-NEXT: v_mov_b32_e32 v35, v11 -; GISEL10-NEXT: v_mov_b32_e32 v36, v12 -; GISEL10-NEXT: v_mov_b32_e32 v37, v13 -; GISEL10-NEXT: v_mov_b32_e32 v38, v14 -; GISEL10-NEXT: v_mov_b32_e32 v39, v15 +; GISEL10-NEXT: v_mov_b32_e32 v40, v0 +; GISEL10-NEXT: v_mov_b32_e32 v41, v1 +; GISEL10-NEXT: v_mov_b32_e32 v42, v2 +; GISEL10-NEXT: v_mov_b32_e32 v43, v3 +; GISEL10-NEXT: v_mov_b32_e32 v44, v4 +; GISEL10-NEXT: v_mov_b32_e32 v45, v5 +; GISEL10-NEXT: v_mov_b32_e32 v46, v6 +; GISEL10-NEXT: v_mov_b32_e32 v47, v7 +; GISEL10-NEXT: v_mov_b32_e32 v48, v8 +; GISEL10-NEXT: v_mov_b32_e32 v49, v9 +; GISEL10-NEXT: v_mov_b32_e32 v50, v10 +; GISEL10-NEXT: v_mov_b32_e32 v51, v11 +; GISEL10-NEXT: v_mov_b32_e32 v52, v12 +; GISEL10-NEXT: v_mov_b32_e32 v53, v13 +; GISEL10-NEXT: 
v_mov_b32_e32 v54, v14 +; GISEL10-NEXT: v_mov_b32_e32 v55, v15 ; GISEL10-NEXT: s_mov_b32 exec_lo, s9 -; GISEL10-NEXT: ; kill: def $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 killed $exec +; GISEL10-NEXT: v_mov_b32_e32 v24, v40 +; GISEL10-NEXT: v_mov_b32_e32 v25, v41 +; GISEL10-NEXT: v_mov_b32_e32 v26, v42 +; GISEL10-NEXT: v_mov_b32_e32 v27, v43 +; GISEL10-NEXT: v_mov_b32_e32 v28, v44 +; GISEL10-NEXT: v_mov_b32_e32 v29, v45 +; GISEL10-NEXT: v_mov_b32_e32 v30, v46 +; GISEL10-NEXT: v_mov_b32_e32 v31, v47 +; GISEL10-NEXT: v_mov_b32_e32 v32, v48 +; GISEL10-NEXT: v_mov_b32_e32 v33, v49 +; GISEL10-NEXT: v_mov_b32_e32 v34, v50 +; GISEL10-NEXT: v_mov_b32_e32 v35, v51 +; GISEL10-NEXT: v_mov_b32_e32 v36, v52 +; GISEL10-NEXT: v_mov_b32_e32 v37, v53 +; GISEL10-NEXT: v_mov_b32_e32 v38, v54 +; GISEL10-NEXT: v_mov_b32_e32 v39, v55 ; GISEL10-NEXT: .LBB5_2: ; %tail ; GISEL10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GISEL10-NEXT: v_mov_b32_e32 v8, v24 diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-mul.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-mul.ll index 4d5ade4abcef7..1b4ed67eb6eea 100644 --- a/llvm/test/CodeGen/AMDGPU/vector-reduce-mul.ll +++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-mul.ll @@ -2481,10 +2481,11 @@ define i64 @test_vector_reduce_mul_v2i64(<2 x i64> %v) { ; GFX7-GISEL: ; %bb.0: ; %entry ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-GISEL-NEXT: v_mov_b32_e32 v4, v0 -; GFX7-GISEL-NEXT: v_mov_b32_e32 v5, v1 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v2, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v4, v3, v[1:2] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v2, v[3:4] +; GFX7-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX7-GISEL-NEXT: v_mov_b32_e32 v6, v1 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v5, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v4, v3, v[1:2] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v6, v5, v[7:8] ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: test_vector_reduce_mul_v2i64: @@ -2502,10 +2503,11 @@ define i64 @test_vector_reduce_mul_v2i64(<2 x i64> %v) { ; GFX8-GISEL: ; %bb.0: ; %entry ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-GISEL-NEXT: v_mov_b32_e32 v4, v0 -; GFX8-GISEL-NEXT: v_mov_b32_e32 v5, v1 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v2, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v4, v3, v[1:2] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v2, v[3:4] +; GFX8-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX8-GISEL-NEXT: v_mov_b32_e32 v6, v1 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v5, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v4, v3, v[1:2] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v6, v5, v[7:8] ; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-SDAG-LABEL: test_vector_reduce_mul_v2i64: @@ -2524,8 +2526,8 @@ define i64 @test_vector_reduce_mul_v2i64(<2 x i64> %v) { ; GFX9-GISEL-NEXT: v_mov_b32_e32 v5, v1 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[0:1], v4, v3, 0 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v4, v2, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v5, v2, v[6:7] -; GFX9-GISEL-NEXT: v_add_u32_e32 v1, v1, v2 +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v5, v2, v[6:7] +; GFX9-GISEL-NEXT: v_add_u32_e32 v1, v1, v8 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-SDAG-LABEL: test_vector_reduce_mul_v2i64: @@ 
-2626,9 +2628,9 @@ define i64 @test_vector_reduce_mul_v3i64(<3 x i64> %v) { ; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v8 ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v0, v3, v[1:2] ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v7, v4, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v2, v[8:9] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v7, v5, v[1:2] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v2, v4, v[5:6] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v2, v[8:9] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v7, v5, v[1:2] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v10, v4, v[8:9] ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: test_vector_reduce_mul_v3i64: @@ -2654,9 +2656,9 @@ define i64 @test_vector_reduce_mul_v3i64(<3 x i64> %v) { ; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v8 ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v0, v3, v[1:2] ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v7, v4, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v2, v[8:9] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v7, v5, v[1:2] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v2, v4, v[5:6] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v2, v[8:9] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v7, v5, v[1:2] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v10, v4, v[8:9] ; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-SDAG-LABEL: test_vector_reduce_mul_v3i64: @@ -2677,12 +2679,12 @@ define i64 @test_vector_reduce_mul_v3i64(<3 x i64> %v) { ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v0, v3, 0 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[0:1], v0, v2, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v1, v2, v[8:9] -; GFX9-GISEL-NEXT: v_add_u32_e32 v7, v7, v0 +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[0:1], v1, v2, v[8:9] +; GFX9-GISEL-NEXT: v_add_u32_e32 v8, v7, v10 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v6, v5, 0 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v6, v4, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v7, v4, v[2:3] -; GFX9-GISEL-NEXT: v_add_u32_e32 v1, v1, v2 +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[0:1], v8, v4, v[2:3] +; GFX9-GISEL-NEXT: v_add_u32_e32 v1, v1, v6 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-SDAG-LABEL: test_vector_reduce_mul_v3i64: @@ -2816,10 +2818,10 @@ define i64 @test_vector_reduce_mul_v4i64(<4 x i64> %v) { ; GFX7-GISEL-NEXT: v_mov_b32_e32 v0, v12 ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[15:16], s[4:5], v2, v7, v[0:1] ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v9, v11, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v3, v6, v[15:16] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v8, v4, v[13:14] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v2, v[1:2] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v11, v[1:2] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v3, v6, v[15:16] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v8, v4, v[13:14] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v17, v[1:2] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v11, v[3:4] ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: test_vector_reduce_mul_v4i64: @@ -2853,10 +2855,10 @@ define i64 @test_vector_reduce_mul_v4i64(<4 x i64> %v) { ; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, v12 ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[15:16], s[4:5], v2, v7, v[0:1] ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v9, v11, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v3, v6, v[15:16] -; 
GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v8, v4, v[13:14] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v2, v[1:2] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v11, v[1:2] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v3, v6, v[15:16] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v8, v4, v[13:14] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v17, v[1:2] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v11, v[3:4] ; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-SDAG-LABEL: test_vector_reduce_mul_v4i64: @@ -2881,16 +2883,16 @@ define i64 @test_vector_reduce_mul_v4i64(<4 x i64> %v) { ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[0:1], v0, v5, 0 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v0, v4, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v1, v4, v[10:11] -; GFX9-GISEL-NEXT: v_add_u32_e32 v9, v9, v0 +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[0:1], v1, v4, v[10:11] ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, v7, 0 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v2, v6, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v3, v6, v[0:1] -; GFX9-GISEL-NEXT: v_add_u32_e32 v2, v5, v0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v8, v2, 0 +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[0:1], v3, v6, v[0:1] +; GFX9-GISEL-NEXT: v_add_u32_e32 v5, v5, v10 +; GFX9-GISEL-NEXT: v_add_u32_e32 v9, v9, v12 +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v8, v5, 0 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v8, v4, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v9, v4, v[2:3] -; GFX9-GISEL-NEXT: v_add_u32_e32 v1, v1, v2 +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[0:1], v9, v4, v[2:3] +; GFX9-GISEL-NEXT: v_add_u32_e32 v1, v1, v6 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-SDAG-LABEL: test_vector_reduce_mul_v4i64: @@ -3068,31 +3070,29 @@ define i64 @test_vector_reduce_mul_v8i64(<8 x i64> %v) { ; GFX7-GISEL: ; %bb.0: ; %entry ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v4, v12, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v0, v8, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v4, v13, v[17:18] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v5, v12, v[17:18] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v19, v16, 0 -; GFX7-GISEL-NEXT: v_mov_b32_e32 v5, v13 -; GFX7-GISEL-NEXT: v_mov_b32_e32 v13, v20 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v0, v9, v[13:14] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v19, v4, v[5:6] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v0, v8, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v4, v13, v[17:18] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v5, v12, v[18:19] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v20, v16, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v20, v22, v[5:6] +; GFX7-GISEL-NEXT: v_mov_b32_e32 v5, v21 ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v6, v14, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v1, v8, v[17:18] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v2, v10, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v0, v9, v[5:6] ; GFX7-GISEL-NEXT: v_mov_b32_e32 v0, v20 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v15, v[0:1] -; GFX7-GISEL-NEXT: v_mov_b32_e32 v6, v18 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v2, v11, v[6:7] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v7, v14, v[0:1] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[6:7], 
s[4:5], v17, v19, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v3, v10, v[20:21] -; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v7 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v17, v0, v[1:2] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v6, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v2, v19, v[9:10] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v8, v16, v[4:5] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v12, v2, v[1:2] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v6, v[1:2] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[21:22], s[4:5], v1, v8, v[17:18] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v6, v15, v[0:1] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v10, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v2, v11, v[1:2] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v7, v14, v[8:9] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v0, v19, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v3, v10, v[5:6] +; GFX7-GISEL-NEXT: v_mov_b32_e32 v2, v8 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v0, v1, v[2:3] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v7, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v14, v19, v[5:6] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v21, v16, v[12:13] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v4, v2, v[1:2] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v7, v[8:9] ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: test_vector_reduce_mul_v8i64: @@ -3139,31 +3139,29 @@ define i64 @test_vector_reduce_mul_v8i64(<8 x i64> %v) { ; GFX8-GISEL: ; %bb.0: ; %entry ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v4, v12, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v0, v8, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v4, v13, v[17:18] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v5, v12, v[17:18] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v19, v16, 0 -; GFX8-GISEL-NEXT: v_mov_b32_e32 v5, v13 -; GFX8-GISEL-NEXT: v_mov_b32_e32 v13, v20 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v0, v9, v[13:14] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v19, v4, v[5:6] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v0, v8, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v4, v13, v[17:18] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v5, v12, v[18:19] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v20, v16, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v20, v22, v[5:6] +; GFX8-GISEL-NEXT: v_mov_b32_e32 v5, v21 ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v6, v14, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v1, v8, v[17:18] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v2, v10, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v0, v9, v[5:6] ; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, v20 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v15, v[0:1] -; GFX8-GISEL-NEXT: v_mov_b32_e32 v6, v18 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v2, v11, v[6:7] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v7, v14, v[0:1] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v17, v19, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v3, v10, v[20:21] -; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v7 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v17, v0, v[1:2] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v6, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v2, v19, v[9:10] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], 
v8, v16, v[4:5] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v12, v2, v[1:2] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v6, v[1:2] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[21:22], s[4:5], v1, v8, v[17:18] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v6, v15, v[0:1] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v10, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v2, v11, v[1:2] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v7, v14, v[8:9] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v0, v19, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v3, v10, v[5:6] +; GFX8-GISEL-NEXT: v_mov_b32_e32 v2, v8 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v0, v1, v[2:3] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v7, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v14, v19, v[5:6] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v21, v16, v[12:13] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v4, v2, v[1:2] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v7, v[8:9] ; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-SDAG-LABEL: test_vector_reduce_mul_v8i64: @@ -3204,32 +3202,32 @@ define i64 @test_vector_reduce_mul_v8i64(<8 x i64> %v) { ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v0, v9, 0 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[0:1], v0, v8, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v1, v8, v[18:19] +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[20:21], s[0:1], v1, v8, v[18:19] ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v2, v11, 0 -; GFX9-GISEL-NEXT: v_add_u32_e32 v17, v17, v0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, v10, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v3, v10, v[8:9] +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v3, v10, v[8:9] ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v4, v13, 0 -; GFX9-GISEL-NEXT: v_add_u32_e32 v10, v1, v2 +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, v10, 0 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v12, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v5, v12, v[8:9] -; GFX9-GISEL-NEXT: v_add_u32_e32 v1, v3, v4 +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[0:1], v5, v12, v[8:9] ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v6, v15, 0 +; GFX9-GISEL-NEXT: v_add_u32_e32 v18, v1, v18 +; GFX9-GISEL-NEXT: v_add_u32_e32 v1, v3, v10 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v6, v14, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[0:1], v7, v14, v[8:9] +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[0:1], v7, v14, v[8:9] +; GFX9-GISEL-NEXT: v_add_u32_e32 v17, v17, v20 +; GFX9-GISEL-NEXT: v_add_u32_e32 v5, v5, v10 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v16, v1, 0 -; GFX9-GISEL-NEXT: v_add_u32_e32 v5, v5, v6 +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[0:1], v17, v2, v[8:9] +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v0, v5, 0 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[0:1], v16, v2, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v17, v2, v[8:9] -; GFX9-GISEL-NEXT: v_add_u32_e32 v7, v7, v2 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v0, v4, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, v5, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v10, v4, v[0:1] +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v18, v4, v[8:9] ; GFX9-GISEL-NEXT: v_add_u32_e32 v3, v3, v0 +; GFX9-GISEL-NEXT: v_add_u32_e32 v10, v7, v10 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v6, v3, 0 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v6, v2, 0 -; 
GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v7, v2, v[4:5] -; GFX9-GISEL-NEXT: v_add_u32_e32 v1, v1, v2 +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[0:1], v10, v2, v[4:5] +; GFX9-GISEL-NEXT: v_add_u32_e32 v1, v1, v6 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-SDAG-LABEL: test_vector_reduce_mul_v8i64: @@ -3550,63 +3548,63 @@ define i64 @test_vector_reduce_mul_v16i64(<16 x i64> %v) { ; GFX7-GISEL: ; %bb.0: ; %entry ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[31:32], s[4:5], v8, v24, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[32:33], s[4:5], v8, v25, v[32:33] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v9, v24, v[32:33] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[24:25], s[4:5], v0, v16, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[32:33], s[4:5], v24, v31, 0 -; GFX7-GISEL-NEXT: v_mov_b32_e32 v9, v33 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v24, v8, v[9:10] -; GFX7-GISEL-NEXT: v_mov_b32_e32 v24, v25 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[24:25], s[4:5], v0, v17, v[24:25] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v1, v16, v[24:25] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v2, v18, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v0, v31, v[8:9] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[33:34], s[4:5], v8, v25, v[32:33] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[35:36], s[4:5], v9, v24, v[33:34] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v0, v16, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[24:25], s[4:5], v8, v31, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[32:33], s[4:5], v8, v35, v[25:26] +; GFX7-GISEL-NEXT: v_mov_b32_e32 v8, v9 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[34:35], s[4:5], v0, v17, v[8:9] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v1, v16, v[34:35] ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v10, v26, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v10, v27, v[1:2] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v11, v26, v[9:10] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v16, v0, 0 -; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v11 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[24:25], s[4:5], v16, v9, v[1:2] -; GFX7-GISEL-NEXT: buffer_load_dword v9, off, s[0:3], s32 -; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v17 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v2, v19, v[1:2] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v18, v[1:2] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v8, v31, v[32:33] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v10, v27, v[1:2] +; GFX7-GISEL-NEXT: buffer_load_dword v27, off, s[0:3], s32 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[31:32], s[4:5], v2, v18, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[33:34], s[4:5], v11, v26, v[8:9] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v31, v0, 0 +; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v9 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v31, v33, v[1:2] +; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v32 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[25:26], s[4:5], v2, v19, v[1:2] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v18, v[25:26] ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v12, v28, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v1, v0, v[24:25] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v4, v20, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v1, v0, v[9:10] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v4, v20, 0 ; GFX7-GISEL-NEXT: v_mov_b32_e32 v0, v3 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v29, v[0:1] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v17, v2, 0 -; GFX7-GISEL-NEXT: v_mov_b32_e32 v3, 
v18 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v13, v28, v[0:1] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v4, v21, v[3:4] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v12, v29, v[0:1] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v9, v2, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v13, v28, v[18:19] +; GFX7-GISEL-NEXT: v_mov_b32_e32 v3, v10 ; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v12 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v14, v30, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v5, v20, v[3:4] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v17, v0, v[1:2] -; GFX7-GISEL-NEXT: v_mov_b32_e32 v4, v13 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v6, v22, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v3, v2, v[0:1] -; GFX7-GISEL-NEXT: v_mov_b32_e32 v3, v18 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v4, v21, v[3:4] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v14, v30, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v9, v0, v[1:2] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v20, v[12:13] +; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v4 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v22, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v0, v2, v[18:19] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v4, v3, 0 ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v14, v9, v[4:5] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v17, v12, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v15, v30, v[4:5] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v23, v[3:4] -; GFX7-GISEL-NEXT: v_mov_b32_e32 v2, v14 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v17, v1, v[2:3] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v7, v22, v[3:4] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v32, v11, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v12, v[1:2] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v10, v13, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v32, v0, v[5:6] -; GFX7-GISEL-NEXT: v_mov_b32_e32 v0, v3 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v10, v1, v[0:1] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v2, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v16, v13, v[9:10] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v8, v11, v[5:6] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v4, v9, v[1:2] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v2, v[3:4] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v14, v27, v[1:2] +; GFX7-GISEL-NEXT: v_mov_b32_e32 v2, v5 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v15, v30, v[9:10] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v23, v[2:3] +; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v13 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v0, v[1:2] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v7, v22, v[9:10] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v24, v11, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v0, v3, v[5:6] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v8, v12, 0 +; GFX7-GISEL-NEXT: v_mov_b32_e32 v0, v10 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v24, v20, v[0:1] +; GFX7-GISEL-NEXT: v_mov_b32_e32 v0, v4 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v8, v1, v[0:1] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v9, v3, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v17, v12, v[13:14] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v11, v[5:6] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], 
s[4:5], v9, v7, v[1:2] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v12, v3, v[4:5] ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: test_vector_reduce_mul_v16i64: @@ -3695,63 +3693,63 @@ define i64 @test_vector_reduce_mul_v16i64(<16 x i64> %v) { ; GFX8-GISEL: ; %bb.0: ; %entry ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[31:32], s[4:5], v8, v24, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[32:33], s[4:5], v8, v25, v[32:33] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v9, v24, v[32:33] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[24:25], s[4:5], v0, v16, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[32:33], s[4:5], v24, v31, 0 -; GFX8-GISEL-NEXT: v_mov_b32_e32 v9, v33 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v24, v8, v[9:10] -; GFX8-GISEL-NEXT: v_mov_b32_e32 v24, v25 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[24:25], s[4:5], v0, v17, v[24:25] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v1, v16, v[24:25] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v2, v18, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v0, v31, v[8:9] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[33:34], s[4:5], v8, v25, v[32:33] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[35:36], s[4:5], v9, v24, v[33:34] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v0, v16, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[24:25], s[4:5], v8, v31, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[32:33], s[4:5], v8, v35, v[25:26] +; GFX8-GISEL-NEXT: v_mov_b32_e32 v8, v9 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[34:35], s[4:5], v0, v17, v[8:9] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v1, v16, v[34:35] ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v10, v26, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v10, v27, v[1:2] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v11, v26, v[9:10] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v16, v0, 0 -; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v11 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[24:25], s[4:5], v16, v9, v[1:2] -; GFX8-GISEL-NEXT: buffer_load_dword v9, off, s[0:3], s32 -; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v17 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v2, v19, v[1:2] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v18, v[1:2] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v8, v31, v[32:33] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v10, v27, v[1:2] +; GFX8-GISEL-NEXT: buffer_load_dword v27, off, s[0:3], s32 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[31:32], s[4:5], v2, v18, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[33:34], s[4:5], v11, v26, v[8:9] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v31, v0, 0 +; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v9 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v31, v33, v[1:2] +; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v32 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[25:26], s[4:5], v2, v19, v[1:2] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v18, v[25:26] ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v12, v28, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v1, v0, v[24:25] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v4, v20, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v1, v0, v[9:10] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v4, v20, 0 ; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, v3 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v29, v[0:1] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v17, v2, 0 -; GFX8-GISEL-NEXT: v_mov_b32_e32 v3, v18 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v13, v28, v[0:1] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v4, v21, 
v[3:4] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v12, v29, v[0:1] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v9, v2, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v13, v28, v[18:19] +; GFX8-GISEL-NEXT: v_mov_b32_e32 v3, v10 ; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v12 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v14, v30, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v5, v20, v[3:4] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v17, v0, v[1:2] -; GFX8-GISEL-NEXT: v_mov_b32_e32 v4, v13 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v6, v22, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v3, v2, v[0:1] -; GFX8-GISEL-NEXT: v_mov_b32_e32 v3, v18 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v4, v21, v[3:4] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v14, v30, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v9, v0, v[1:2] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v20, v[12:13] +; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v4 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v22, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v0, v2, v[18:19] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v4, v3, 0 ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v14, v9, v[4:5] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v17, v12, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v15, v30, v[4:5] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v23, v[3:4] -; GFX8-GISEL-NEXT: v_mov_b32_e32 v2, v14 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v17, v1, v[2:3] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v7, v22, v[3:4] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v32, v11, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v12, v[1:2] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v10, v13, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v32, v0, v[5:6] -; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, v3 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v10, v1, v[0:1] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v2, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v16, v13, v[9:10] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v8, v11, v[5:6] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v4, v9, v[1:2] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v2, v[3:4] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v14, v27, v[1:2] +; GFX8-GISEL-NEXT: v_mov_b32_e32 v2, v5 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v15, v30, v[9:10] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v23, v[2:3] +; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v13 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v0, v[1:2] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v7, v22, v[9:10] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v24, v11, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v0, v3, v[5:6] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v8, v12, 0 +; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, v10 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v24, v20, v[0:1] +; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, v4 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v8, v1, v[0:1] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v9, v3, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v17, v12, v[13:14] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v11, v[5:6] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v7, v[1:2] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v12, v3, v[4:5] ; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] ; 
; GFX9-SDAG-LABEL: test_vector_reduce_mul_v16i64: @@ -3827,65 +3825,65 @@ define i64 @test_vector_reduce_mul_v16i64(<16 x i64> %v) { ; GFX9-GISEL-NEXT: scratch_load_dword v31, off, s32 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[34:35], s[0:1], v0, v17, 0 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[32:33], s[0:1], v0, v16, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v1, v16, v[34:35] +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[36:37], s[0:1], v1, v16, v[34:35] ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[0:1], v2, v19, 0 -; GFX9-GISEL-NEXT: v_add_u32_e32 v33, v33, v0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, v18, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v3, v18, v[16:17] +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[34:35], s[0:1], v3, v18, v[16:17] ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[0:1], v4, v21, 0 -; GFX9-GISEL-NEXT: v_add_u32_e32 v18, v1, v2 +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, v18, 0 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v20, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v5, v20, v[16:17] +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v5, v20, v[16:17] ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[0:1], v6, v23, 0 -; GFX9-GISEL-NEXT: v_add_u32_e32 v19, v3, v4 +; GFX9-GISEL-NEXT: v_add_u32_e32 v20, v3, v18 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v6, v22, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[0:1], v7, v22, v[16:17] +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v7, v22, v[16:17] ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[0:1], v8, v25, 0 -; GFX9-GISEL-NEXT: v_add_u32_e32 v20, v5, v6 +; GFX9-GISEL-NEXT: v_add_u32_e32 v21, v5, v18 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[0:1], v8, v24, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v9, v24, v[16:17] +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v9, v24, v[16:17] ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[0:1], v10, v27, 0 -; GFX9-GISEL-NEXT: v_add_u32_e32 v1, v7, v8 +; GFX9-GISEL-NEXT: v_add_u32_e32 v34, v1, v34 +; GFX9-GISEL-NEXT: v_add_u32_e32 v1, v7, v18 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v10, v26, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[0:1], v11, v26, v[16:17] +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v11, v26, v[16:17] ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[0:1], v12, v29, 0 -; GFX9-GISEL-NEXT: v_add_u32_e32 v3, v9, v10 +; GFX9-GISEL-NEXT: v_add_u32_e32 v3, v9, v18 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[0:1], v12, v28, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[0:1], v13, v28, v[16:17] -; GFX9-GISEL-NEXT: v_add_u32_e32 v5, v11, v12 +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v13, v28, v[16:17] +; GFX9-GISEL-NEXT: v_add_u32_e32 v33, v33, v36 +; GFX9-GISEL-NEXT: v_add_u32_e32 v5, v11, v18 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[0:1], v14, v30, 0 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[0:1], v14, v31, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[0:1], v15, v30, v[16:17] +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v15, v30, v[16:17] ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[0:1], v32, v1, 0 -; GFX9-GISEL-NEXT: v_add_u32_e32 v9, v13, v14 +; GFX9-GISEL-NEXT: v_add_u32_e32 v11, v13, v18 +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v33, v6, v[16:17] +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[0:1], v0, v3, 0 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[0:1], v32, v6, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[0:1], v33, v6, v[16:17] -; GFX9-GISEL-NEXT: v_add_u32_e32 v11, v15, v6 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[0:1], 
v0, v8, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, v3, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v18, v8, v[0:1] -; GFX9-GISEL-NEXT: v_add_u32_e32 v7, v7, v0 +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v34, v8, v[16:17] +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v2, v5, 0 +; GFX9-GISEL-NEXT: v_add_u32_e32 v13, v15, v18 +; GFX9-GISEL-NEXT: v_add_u32_e32 v15, v7, v0 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, v10, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v2, v5, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v19, v10, v[2:3] +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v20, v10, v[8:9] +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v4, v11, 0 ; GFX9-GISEL-NEXT: v_add_u32_e32 v1, v1, v2 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v12, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v4, v9, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v20, v12, v[4:5] -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v14, v1, 0 +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v21, v12, v[8:9] ; GFX9-GISEL-NEXT: v_add_u32_e32 v3, v3, v4 +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v14, v1, 0 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v14, v0, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v11, v0, v[8:9] -; GFX9-GISEL-NEXT: v_add_u32_e32 v5, v5, v0 +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[0:1], v13, v0, v[8:9] ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v6, v3, 0 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v6, v2, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v7, v2, v[0:1] -; GFX9-GISEL-NEXT: v_add_u32_e32 v2, v9, v0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v2, 0 +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[0:1], v15, v2, v[0:1] +; GFX9-GISEL-NEXT: v_add_u32_e32 v10, v5, v10 +; GFX9-GISEL-NEXT: v_add_u32_e32 v5, v9, v6 +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v5, 0 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v4, v8, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v5, v8, v[2:3] -; GFX9-GISEL-NEXT: v_add_u32_e32 v1, v1, v2 +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v10, v8, v[2:3] +; GFX9-GISEL-NEXT: v_add_u32_e32 v1, v1, v4 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-SDAG-LABEL: test_vector_reduce_mul_v16i64: From 5c8bcf7dc7461097a0d2b1a43fc46ab37363adc5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?= =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?= =?UTF-8?q?=E3=83=B3=29?= <clementval@gmail.com> Date: Tue, 4 Nov 2025 16:44:58 -1000 Subject: [PATCH 266/313] [flang][cuda][NFC] Move CUDA intrinsics lowering to a separate file (#166461) Just move all CUDA related intrinsics lowering to a separate file to avoid clobbering the main Fortran intrinsic file. 
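To illustrate the intended split, here is a minimal, hypothetical sketch of
how a caller could consult the new CUDA table before falling back to the
generic Fortran handlers. Only findCUDAIntrinsicHandler and the
CUDAIntrinsicLibrary constructor come from the interfaces added in
CUDAIntrinsicCall.h; the function name and the lookup-then-fallback ordering
below are illustrative assumptions, not the wiring done in this patch.

  #include "flang/Optimizer/Builder/CUDAIntrinsicCall.h"

  // Sketch only: returns true when `name` is handled by the CUDA table.
  static bool tryLowerCUDAIntrinsic(fir::FirOpBuilder &builder,
                                    mlir::Location loc,
                                    llvm::StringRef name) {
    // findCUDAIntrinsicHandler scans the cudaHandlers table in
    // CUDAIntrinsicCall.cpp and returns nullptr for non-CUDA names.
    if (const fir::IntrinsicHandler *handler =
            fir::findCUDAIntrinsicHandler(name)) {
      // CUDAIntrinsicLibrary derives from IntrinsicLibrary, so the generator
      // stored in the handler entry can be invoked against it.
      fir::CUDAIntrinsicLibrary cudaLib(builder, loc);
      (void)cudaLib; // generator invocation elided in this sketch
      return true;
    }
    return false; // defer to the generic IntrinsicLibrary tables
  }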
--- .../Optimizer/Builder/CUDAIntrinsicCall.h | 95 + .../flang/Optimizer/Builder/IntrinsicCall.h | 64 - flang/lib/Optimizer/Builder/CMakeLists.txt | 1 + .../Optimizer/Builder/CUDAIntrinsicCall.cpp | 1588 +++++++++++++++++ flang/lib/Optimizer/Builder/IntrinsicCall.cpp | 1346 +------------- 5 files changed, 1691 insertions(+), 1403 deletions(-) create mode 100644 flang/include/flang/Optimizer/Builder/CUDAIntrinsicCall.h create mode 100644 flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp diff --git a/flang/include/flang/Optimizer/Builder/CUDAIntrinsicCall.h b/flang/include/flang/Optimizer/Builder/CUDAIntrinsicCall.h new file mode 100644 index 0000000000000..d735ce95a83dc --- /dev/null +++ b/flang/include/flang/Optimizer/Builder/CUDAIntrinsicCall.h @@ -0,0 +1,95 @@ +//==-- Builder/CUDAIntrinsicCall.h - lowering of CUDA intrinsics ---*-C++-*-==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef FORTRAN_LOWER_CUDAINTRINSICCALL_H +#define FORTRAN_LOWER_CUDAINTRINSICCALL_H + +#include "flang/Optimizer/Builder/IntrinsicCall.h" +#include "mlir/Dialect/LLVMIR/NVVMDialect.h" + +namespace fir { + +struct CUDAIntrinsicLibrary : IntrinsicLibrary { + + // Constructors. + explicit CUDAIntrinsicLibrary(fir::FirOpBuilder &builder, mlir::Location loc) + : IntrinsicLibrary(builder, loc) {} + CUDAIntrinsicLibrary() = delete; + CUDAIntrinsicLibrary(const CUDAIntrinsicLibrary &) = delete; + + // CUDA intrinsic handlers. + mlir::Value genAtomicAdd(mlir::Type, llvm::ArrayRef<mlir::Value>); + fir::ExtendedValue genAtomicAddR2(mlir::Type, + llvm::ArrayRef<fir::ExtendedValue>); + template <int extent> + fir::ExtendedValue genAtomicAddVector(mlir::Type, + llvm::ArrayRef<fir::ExtendedValue>); + mlir::Value genAtomicAnd(mlir::Type, llvm::ArrayRef<mlir::Value>); + fir::ExtendedValue genAtomicCas(mlir::Type, + llvm::ArrayRef<fir::ExtendedValue>); + mlir::Value genAtomicDec(mlir::Type, llvm::ArrayRef<mlir::Value>); + fir::ExtendedValue genAtomicExch(mlir::Type, + llvm::ArrayRef<fir::ExtendedValue>); + mlir::Value genAtomicInc(mlir::Type, llvm::ArrayRef<mlir::Value>); + mlir::Value genAtomicMax(mlir::Type, llvm::ArrayRef<mlir::Value>); + mlir::Value genAtomicMin(mlir::Type, llvm::ArrayRef<mlir::Value>); + mlir::Value genAtomicOr(mlir::Type, llvm::ArrayRef<mlir::Value>); + mlir::Value genAtomicSub(mlir::Type, llvm::ArrayRef<mlir::Value>); + fir::ExtendedValue genAtomicXor(mlir::Type, + llvm::ArrayRef<fir::ExtendedValue>); + mlir::Value genBarrierArrive(mlir::Type, llvm::ArrayRef<mlir::Value>); + mlir::Value genBarrierArriveCnt(mlir::Type, llvm::ArrayRef<mlir::Value>); + void genBarrierInit(llvm::ArrayRef<fir::ExtendedValue>); + mlir::Value genBarrierTryWait(mlir::Type, llvm::ArrayRef<mlir::Value>); + mlir::Value genBarrierTryWaitSleep(mlir::Type, llvm::ArrayRef<mlir::Value>); + void genFenceProxyAsync(llvm::ArrayRef<fir::ExtendedValue>); + template <const char *fctName, int extent> + fir::ExtendedValue genLDXXFunc(mlir::Type, + llvm::ArrayRef<fir::ExtendedValue>); + mlir::Value genMatchAllSync(mlir::Type, llvm::ArrayRef<mlir::Value>); + mlir::Value genMatchAnySync(mlir::Type, llvm::ArrayRef<mlir::Value>); + template <typename OpTy> + mlir::Value genNVVMTime(mlir::Type, llvm::ArrayRef<mlir::Value>); + void genSyncThreads(llvm::ArrayRef<fir::ExtendedValue>); + mlir::Value 
genSyncThreadsAnd(mlir::Type, llvm::ArrayRef<mlir::Value>); + mlir::Value genSyncThreadsCount(mlir::Type, llvm::ArrayRef<mlir::Value>); + mlir::Value genSyncThreadsOr(mlir::Type, llvm::ArrayRef<mlir::Value>); + void genSyncWarp(llvm::ArrayRef<fir::ExtendedValue>); + mlir::Value genThisGrid(mlir::Type, llvm::ArrayRef<mlir::Value>); + mlir::Value genThisThreadBlock(mlir::Type, llvm::ArrayRef<mlir::Value>); + mlir::Value genThisWarp(mlir::Type, llvm::ArrayRef<mlir::Value>); + void genThreadFence(llvm::ArrayRef<fir::ExtendedValue>); + void genThreadFenceBlock(llvm::ArrayRef<fir::ExtendedValue>); + void genThreadFenceSystem(llvm::ArrayRef<fir::ExtendedValue>); + void genTMABulkCommitGroup(llvm::ArrayRef<fir::ExtendedValue>); + void genTMABulkG2S(llvm::ArrayRef<fir::ExtendedValue>); + void genTMABulkLoadC4(llvm::ArrayRef<fir::ExtendedValue>); + void genTMABulkLoadC8(llvm::ArrayRef<fir::ExtendedValue>); + void genTMABulkLoadI4(llvm::ArrayRef<fir::ExtendedValue>); + void genTMABulkLoadI8(llvm::ArrayRef<fir::ExtendedValue>); + void genTMABulkLoadR2(llvm::ArrayRef<fir::ExtendedValue>); + void genTMABulkLoadR4(llvm::ArrayRef<fir::ExtendedValue>); + void genTMABulkLoadR8(llvm::ArrayRef<fir::ExtendedValue>); + void genTMABulkS2G(llvm::ArrayRef<fir::ExtendedValue>); + void genTMABulkStoreC4(llvm::ArrayRef<fir::ExtendedValue>); + void genTMABulkStoreC8(llvm::ArrayRef<fir::ExtendedValue>); + void genTMABulkStoreI4(llvm::ArrayRef<fir::ExtendedValue>); + void genTMABulkStoreI8(llvm::ArrayRef<fir::ExtendedValue>); + void genTMABulkStoreR2(llvm::ArrayRef<fir::ExtendedValue>); + void genTMABulkStoreR4(llvm::ArrayRef<fir::ExtendedValue>); + void genTMABulkStoreR8(llvm::ArrayRef<fir::ExtendedValue>); + void genTMABulkWaitGroup(llvm::ArrayRef<fir::ExtendedValue>); + template <mlir::NVVM::VoteSyncKind kind> + mlir::Value genVoteSync(mlir::Type, llvm::ArrayRef<mlir::Value>); +}; + +const IntrinsicHandler *findCUDAIntrinsicHandler(llvm::StringRef name); + +} // namespace fir + +#endif // FORTRAN_LOWER_CUDAINTRINSICCALL_H diff --git a/flang/include/flang/Optimizer/Builder/IntrinsicCall.h b/flang/include/flang/Optimizer/Builder/IntrinsicCall.h index b64419f5ae6da..01d27fd5fc399 100644 --- a/flang/include/flang/Optimizer/Builder/IntrinsicCall.h +++ b/flang/include/flang/Optimizer/Builder/IntrinsicCall.h @@ -19,7 +19,6 @@ #include "flang/Runtime/iostat-consts.h" #include "mlir/Dialect/Complex/IR/Complex.h" #include "mlir/Dialect/LLVMIR/LLVMDialect.h" -#include "mlir/Dialect/LLVMIR/NVVMDialect.h" #include "mlir/Dialect/Math/IR/Math.h" #include <optional> @@ -187,25 +186,6 @@ struct IntrinsicLibrary { mlir::Value genAnint(mlir::Type, llvm::ArrayRef<mlir::Value>); fir::ExtendedValue genAny(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>); mlir::Value genAtanpi(mlir::Type, llvm::ArrayRef<mlir::Value>); - mlir::Value genAtomicAdd(mlir::Type, llvm::ArrayRef<mlir::Value>); - fir::ExtendedValue genAtomicAddR2(mlir::Type, - llvm::ArrayRef<fir::ExtendedValue>); - template <int extent> - fir::ExtendedValue genAtomicAddVector(mlir::Type, - llvm::ArrayRef<fir::ExtendedValue>); - mlir::Value genAtomicAnd(mlir::Type, llvm::ArrayRef<mlir::Value>); - fir::ExtendedValue genAtomicCas(mlir::Type, - llvm::ArrayRef<fir::ExtendedValue>); - mlir::Value genAtomicDec(mlir::Type, llvm::ArrayRef<mlir::Value>); - fir::ExtendedValue genAtomicExch(mlir::Type, - llvm::ArrayRef<fir::ExtendedValue>); - mlir::Value genAtomicInc(mlir::Type, llvm::ArrayRef<mlir::Value>); - mlir::Value genAtomicMax(mlir::Type, llvm::ArrayRef<mlir::Value>); - mlir::Value 
genAtomicMin(mlir::Type, llvm::ArrayRef<mlir::Value>); - mlir::Value genAtomicOr(mlir::Type, llvm::ArrayRef<mlir::Value>); - mlir::Value genAtomicSub(mlir::Type, llvm::ArrayRef<mlir::Value>); - fir::ExtendedValue genAtomicXor(mlir::Type, - llvm::ArrayRef<fir::ExtendedValue>); fir::ExtendedValue genCommandArgumentCount(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>); mlir::Value genAsind(mlir::Type, llvm::ArrayRef<mlir::Value>); @@ -213,11 +193,6 @@ struct IntrinsicLibrary { fir::ExtendedValue genAssociated(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>); mlir::Value genAtand(mlir::Type, llvm::ArrayRef<mlir::Value>); - mlir::Value genBarrierArrive(mlir::Type, llvm::ArrayRef<mlir::Value>); - mlir::Value genBarrierArriveCnt(mlir::Type, llvm::ArrayRef<mlir::Value>); - void genBarrierInit(llvm::ArrayRef<fir::ExtendedValue>); - mlir::Value genBarrierTryWait(mlir::Type, llvm::ArrayRef<mlir::Value>); - mlir::Value genBarrierTryWaitSleep(mlir::Type, llvm::ArrayRef<mlir::Value>); fir::ExtendedValue genBesselJn(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>); fir::ExtendedValue genBesselYn(mlir::Type, @@ -239,9 +214,6 @@ struct IntrinsicLibrary { fir::ExtendedValue genCount(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>); void genCpuTime(llvm::ArrayRef<fir::ExtendedValue>); fir::ExtendedValue genCshift(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>); - template <const char *fctName, int extent> - fir::ExtendedValue genCUDALDXXFunc(mlir::Type, - llvm::ArrayRef<fir::ExtendedValue>); fir::ExtendedValue genCAssociatedCFunPtr(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>); fir::ExtendedValue genCAssociatedCPtr(mlir::Type, @@ -281,7 +253,6 @@ struct IntrinsicLibrary { llvm::ArrayRef<fir::ExtendedValue>); template <Extremum, ExtremumBehavior> mlir::Value genExtremum(mlir::Type, llvm::ArrayRef<mlir::Value>); - void genFenceProxyAsync(llvm::ArrayRef<fir::ExtendedValue>); mlir::Value genFloor(mlir::Type, llvm::ArrayRef<mlir::Value>); mlir::Value genFraction(mlir::Type resultType, mlir::ArrayRef<mlir::Value> args); @@ -373,8 +344,6 @@ struct IntrinsicLibrary { mlir::Value genMalloc(mlir::Type, llvm::ArrayRef<mlir::Value>); template <typename Shift> mlir::Value genMask(mlir::Type, llvm::ArrayRef<mlir::Value>); - mlir::Value genMatchAllSync(mlir::Type, llvm::ArrayRef<mlir::Value>); - mlir::Value genMatchAnySync(mlir::Type, llvm::ArrayRef<mlir::Value>); fir::ExtendedValue genMatmul(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>); fir::ExtendedValue genMatmulTranspose(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>); @@ -397,8 +366,6 @@ struct IntrinsicLibrary { fir::ExtendedValue genNull(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>); fir::ExtendedValue genNumImages(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>); - template <typename OpTy> - mlir::Value genNVVMTime(mlir::Type, llvm::ArrayRef<mlir::Value>); fir::ExtendedValue genPack(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>); fir::ExtendedValue genParity(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>); void genPerror(llvm::ArrayRef<fir::ExtendedValue>); @@ -453,56 +420,25 @@ struct IntrinsicLibrary { fir::ExtendedValue genSum(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>); void genSignalSubroutine(llvm::ArrayRef<fir::ExtendedValue>); void genSleep(llvm::ArrayRef<fir::ExtendedValue>); - void genSyncThreads(llvm::ArrayRef<fir::ExtendedValue>); - mlir::Value genSyncThreadsAnd(mlir::Type, llvm::ArrayRef<mlir::Value>); - mlir::Value genSyncThreadsCount(mlir::Type, llvm::ArrayRef<mlir::Value>); - mlir::Value genSyncThreadsOr(mlir::Type, 
llvm::ArrayRef<mlir::Value>); - void genSyncWarp(llvm::ArrayRef<fir::ExtendedValue>); fir::ExtendedValue genSystem(std::optional<mlir::Type>, mlir::ArrayRef<fir::ExtendedValue> args); void genSystemClock(llvm::ArrayRef<fir::ExtendedValue>); mlir::Value genTand(mlir::Type, llvm::ArrayRef<mlir::Value>); mlir::Value genTanpi(mlir::Type, llvm::ArrayRef<mlir::Value>); mlir::Value genTime(mlir::Type, llvm::ArrayRef<mlir::Value>); - void genTMABulkCommitGroup(llvm::ArrayRef<fir::ExtendedValue>); - void genTMABulkG2S(llvm::ArrayRef<fir::ExtendedValue>); - void genTMABulkLoadC4(llvm::ArrayRef<fir::ExtendedValue>); - void genTMABulkLoadC8(llvm::ArrayRef<fir::ExtendedValue>); - void genTMABulkLoadI4(llvm::ArrayRef<fir::ExtendedValue>); - void genTMABulkLoadI8(llvm::ArrayRef<fir::ExtendedValue>); - void genTMABulkLoadR2(llvm::ArrayRef<fir::ExtendedValue>); - void genTMABulkLoadR4(llvm::ArrayRef<fir::ExtendedValue>); - void genTMABulkLoadR8(llvm::ArrayRef<fir::ExtendedValue>); - void genTMABulkS2G(llvm::ArrayRef<fir::ExtendedValue>); - void genTMABulkStoreI4(llvm::ArrayRef<fir::ExtendedValue>); - void genTMABulkStoreI8(llvm::ArrayRef<fir::ExtendedValue>); - void genTMABulkStoreR2(llvm::ArrayRef<fir::ExtendedValue>); - void genTMABulkStoreR4(llvm::ArrayRef<fir::ExtendedValue>); - void genTMABulkStoreR8(llvm::ArrayRef<fir::ExtendedValue>); - void genTMABulkStoreC4(llvm::ArrayRef<fir::ExtendedValue>); - void genTMABulkStoreC8(llvm::ArrayRef<fir::ExtendedValue>); - void genTMABulkWaitGroup(llvm::ArrayRef<fir::ExtendedValue>); mlir::Value genTrailz(mlir::Type, llvm::ArrayRef<mlir::Value>); fir::ExtendedValue genTransfer(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>); fir::ExtendedValue genTranspose(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>); - mlir::Value genThisGrid(mlir::Type, llvm::ArrayRef<mlir::Value>); fir::ExtendedValue genThisImage(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>); - mlir::Value genThisThreadBlock(mlir::Type, llvm::ArrayRef<mlir::Value>); - mlir::Value genThisWarp(mlir::Type, llvm::ArrayRef<mlir::Value>); - void genThreadFence(llvm::ArrayRef<fir::ExtendedValue>); - void genThreadFenceBlock(llvm::ArrayRef<fir::ExtendedValue>); - void genThreadFenceSystem(llvm::ArrayRef<fir::ExtendedValue>); fir::ExtendedValue genTrim(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>); fir::ExtendedValue genUbound(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>); fir::ExtendedValue genUnlink(std::optional<mlir::Type> resultType, llvm::ArrayRef<fir::ExtendedValue> args); fir::ExtendedValue genUnpack(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>); fir::ExtendedValue genVerify(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>); - template <mlir::NVVM::VoteSyncKind kind> - mlir::Value genVoteSync(mlir::Type, llvm::ArrayRef<mlir::Value>); /// Implement all conversion functions like DBLE, the first argument is /// the value to convert. 
There may be an additional KIND arguments that
diff --git a/flang/lib/Optimizer/Builder/CMakeLists.txt b/flang/lib/Optimizer/Builder/CMakeLists.txt
index 1f95259a857da..37c9c2d703c76 100644
--- a/flang/lib/Optimizer/Builder/CMakeLists.txt
+++ b/flang/lib/Optimizer/Builder/CMakeLists.txt
@@ -5,6 +5,7 @@ add_flang_library(FIRBuilder
   BoxValue.cpp
   Character.cpp
   Complex.cpp
+  CUDAIntrinsicCall.cpp
   CUFCommon.cpp
   DoLoopHelper.cpp
   FIRBuilder.cpp
diff --git a/flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp b/flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp
new file mode 100644
index 0000000000000..4e276a72897fe
--- /dev/null
+++ b/flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp
@@ -0,0 +1,1588 @@
+//===-- CUDAIntrinsicCall.cpp ---------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Helper routines for constructing the FIR dialect of MLIR for CUDA
+// intrinsics. Extensive use of MLIR interfaces and MLIR's coding style
+// (https://mlir.llvm.org/getting_started/DeveloperGuide/) is used in this
+// module.
+//
+//===----------------------------------------------------------------------===//
+
+#include "flang/Optimizer/Builder/CUDAIntrinsicCall.h"
+#include "flang/Evaluate/common.h"
+#include "flang/Optimizer/Builder/FIRBuilder.h"
+#include "flang/Optimizer/Builder/MutableBox.h"
+#include "mlir/Dialect/Index/IR/IndexOps.h"
+#include "mlir/Dialect/SCF/IR/SCF.h"
+#include "mlir/Dialect/Vector/IR/VectorOps.h"
+
+namespace fir {
+
+using CI = CUDAIntrinsicLibrary;
+
+static const char __ldca_i4x4[] = "__ldca_i4x4_";
+static const char __ldca_i8x2[] = "__ldca_i8x2_";
+static const char __ldca_r2x2[] = "__ldca_r2x2_";
+static const char __ldca_r4x4[] = "__ldca_r4x4_";
+static const char __ldca_r8x2[] = "__ldca_r8x2_";
+static const char __ldcg_i4x4[] = "__ldcg_i4x4_";
+static const char __ldcg_i8x2[] = "__ldcg_i8x2_";
+static const char __ldcg_r2x2[] = "__ldcg_r2x2_";
+static const char __ldcg_r4x4[] = "__ldcg_r4x4_";
+static const char __ldcg_r8x2[] = "__ldcg_r8x2_";
+static const char __ldcs_i4x4[] = "__ldcs_i4x4_";
+static const char __ldcs_i8x2[] = "__ldcs_i8x2_";
+static const char __ldcs_r2x2[] = "__ldcs_r2x2_";
+static const char __ldcs_r4x4[] = "__ldcs_r4x4_";
+static const char __ldcs_r8x2[] = "__ldcs_r8x2_";
+static const char __ldcv_i4x4[] = "__ldcv_i4x4_";
+static const char __ldcv_i8x2[] = "__ldcv_i8x2_";
+static const char __ldcv_r2x2[] = "__ldcv_r2x2_";
+static const char __ldcv_r4x4[] = "__ldcv_r4x4_";
+static const char __ldcv_r8x2[] = "__ldcv_r8x2_";
+static const char __ldlu_i4x4[] = "__ldlu_i4x4_";
+static const char __ldlu_i8x2[] = "__ldlu_i8x2_";
+static const char __ldlu_r2x2[] = "__ldlu_r2x2_";
+static const char __ldlu_r4x4[] = "__ldlu_r4x4_";
+static const char __ldlu_r8x2[] = "__ldlu_r8x2_";
+
+// CUDA-specific intrinsic handlers.
+static constexpr IntrinsicHandler cudaHandlers[]{ + {"__ldca_i4x4", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genLDXXFunc<__ldca_i4x4, 4>), + {{{"a", asAddr}}}, + /*isElemental=*/false}, + {"__ldca_i8x2", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genLDXXFunc<__ldca_i8x2, 2>), + {{{"a", asAddr}}}, + /*isElemental=*/false}, + {"__ldca_r2x2", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genLDXXFunc<__ldca_r2x2, 2>), + {{{"a", asAddr}}}, + /*isElemental=*/false}, + {"__ldca_r4x4", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genLDXXFunc<__ldca_r4x4, 4>), + {{{"a", asAddr}}}, + /*isElemental=*/false}, + {"__ldca_r8x2", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genLDXXFunc<__ldca_r8x2, 2>), + {{{"a", asAddr}}}, + /*isElemental=*/false}, + {"__ldcg_i4x4", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genLDXXFunc<__ldcg_i4x4, 4>), + {{{"a", asAddr}}}, + /*isElemental=*/false}, + {"__ldcg_i8x2", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genLDXXFunc<__ldcg_i8x2, 2>), + {{{"a", asAddr}}}, + /*isElemental=*/false}, + {"__ldcg_r2x2", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genLDXXFunc<__ldcg_r2x2, 2>), + {{{"a", asAddr}}}, + /*isElemental=*/false}, + {"__ldcg_r4x4", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genLDXXFunc<__ldcg_r4x4, 4>), + {{{"a", asAddr}}}, + /*isElemental=*/false}, + {"__ldcg_r8x2", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genLDXXFunc<__ldcg_r8x2, 2>), + {{{"a", asAddr}}}, + /*isElemental=*/false}, + {"__ldcs_i4x4", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genLDXXFunc<__ldcs_i4x4, 4>), + {{{"a", asAddr}}}, + /*isElemental=*/false}, + {"__ldcs_i8x2", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genLDXXFunc<__ldcs_i8x2, 2>), + {{{"a", asAddr}}}, + /*isElemental=*/false}, + {"__ldcs_r2x2", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genLDXXFunc<__ldcs_r2x2, 2>), + {{{"a", asAddr}}}, + /*isElemental=*/false}, + {"__ldcs_r4x4", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genLDXXFunc<__ldcs_r4x4, 4>), + {{{"a", asAddr}}}, + /*isElemental=*/false}, + {"__ldcs_r8x2", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genLDXXFunc<__ldcs_r8x2, 2>), + {{{"a", asAddr}}}, + /*isElemental=*/false}, + {"__ldcv_i4x4", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genLDXXFunc<__ldcv_i4x4, 4>), + {{{"a", asAddr}}}, + /*isElemental=*/false}, + {"__ldcv_i8x2", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genLDXXFunc<__ldcv_i8x2, 2>), + {{{"a", asAddr}}}, + /*isElemental=*/false}, + {"__ldcv_r2x2", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genLDXXFunc<__ldcv_r2x2, 2>), + {{{"a", asAddr}}}, + /*isElemental=*/false}, + {"__ldcv_r4x4", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genLDXXFunc<__ldcv_r4x4, 4>), + {{{"a", asAddr}}}, + /*isElemental=*/false}, + {"__ldcv_r8x2", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genLDXXFunc<__ldcv_r8x2, 2>), + {{{"a", asAddr}}}, + /*isElemental=*/false}, + {"__ldlu_i4x4", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genLDXXFunc<__ldlu_i4x4, 4>), + {{{"a", asAddr}}}, + /*isElemental=*/false}, + {"__ldlu_i8x2", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genLDXXFunc<__ldlu_i8x2, 2>), + {{{"a", asAddr}}}, + /*isElemental=*/false}, + {"__ldlu_r2x2", + 
static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genLDXXFunc<__ldlu_r2x2, 2>), + {{{"a", asAddr}}}, + /*isElemental=*/false}, + {"__ldlu_r4x4", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genLDXXFunc<__ldlu_r4x4, 4>), + {{{"a", asAddr}}}, + /*isElemental=*/false}, + {"__ldlu_r8x2", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genLDXXFunc<__ldlu_r8x2, 2>), + {{{"a", asAddr}}}, + /*isElemental=*/false}, + {"all_sync", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( + &CI::genVoteSync<mlir::NVVM::VoteSyncKind::all>), + {{{"mask", asValue}, {"pred", asValue}}}, + /*isElemental=*/false}, + {"any_sync", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( + &CI::genVoteSync<mlir::NVVM::VoteSyncKind::any>), + {{{"mask", asValue}, {"pred", asValue}}}, + /*isElemental=*/false}, + {"atomicadd_r4x2", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genAtomicAddVector<2>), + {{{"a", asAddr}, {"v", asAddr}}}, + false}, + {"atomicadd_r4x4", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genAtomicAddVector<4>), + {{{"a", asAddr}, {"v", asAddr}}}, + false}, + {"atomicaddd", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicAdd), + {{{"a", asAddr}, {"v", asValue}}}, + false}, + {"atomicaddf", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicAdd), + {{{"a", asAddr}, {"v", asValue}}}, + false}, + {"atomicaddi", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicAdd), + {{{"a", asAddr}, {"v", asValue}}}, + false}, + {"atomicaddl", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicAdd), + {{{"a", asAddr}, {"v", asValue}}}, + false}, + {"atomicaddr2", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(&CI::genAtomicAddR2), + {{{"a", asAddr}, {"v", asAddr}}}, + false}, + {"atomicaddvector_r2x2", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genAtomicAddVector<2>), + {{{"a", asAddr}, {"v", asAddr}}}, + false}, + {"atomicaddvector_r4x2", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genAtomicAddVector<2>), + {{{"a", asAddr}, {"v", asAddr}}}, + false}, + {"atomicandi", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicAnd), + {{{"a", asAddr}, {"v", asValue}}}, + false}, + {"atomiccasd", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(&CI::genAtomicCas), + {{{"a", asAddr}, {"v1", asValue}, {"v2", asValue}}}, + false}, + {"atomiccasf", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(&CI::genAtomicCas), + {{{"a", asAddr}, {"v1", asValue}, {"v2", asValue}}}, + false}, + {"atomiccasi", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(&CI::genAtomicCas), + {{{"a", asAddr}, {"v1", asValue}, {"v2", asValue}}}, + false}, + {"atomiccasul", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(&CI::genAtomicCas), + {{{"a", asAddr}, {"v1", asValue}, {"v2", asValue}}}, + false}, + {"atomicdeci", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicDec), + {{{"a", asAddr}, {"v", asValue}}}, + false}, + {"atomicexchd", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(&CI::genAtomicExch), + {{{"a", asAddr}, {"v", asValue}}}, + false}, + {"atomicexchf", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(&CI::genAtomicExch), + {{{"a", asAddr}, {"v", asValue}}}, + false}, + {"atomicexchi", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(&CI::genAtomicExch), + {{{"a", asAddr}, {"v", asValue}}}, + false}, + {"atomicexchul", + 
static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(&CI::genAtomicExch), + {{{"a", asAddr}, {"v", asValue}}}, + false}, + {"atomicinci", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicInc), + {{{"a", asAddr}, {"v", asValue}}}, + false}, + {"atomicmaxd", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicMax), + {{{"a", asAddr}, {"v", asValue}}}, + false}, + {"atomicmaxf", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicMax), + {{{"a", asAddr}, {"v", asValue}}}, + false}, + {"atomicmaxi", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicMax), + {{{"a", asAddr}, {"v", asValue}}}, + false}, + {"atomicmaxl", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicMax), + {{{"a", asAddr}, {"v", asValue}}}, + false}, + {"atomicmind", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicMin), + {{{"a", asAddr}, {"v", asValue}}}, + false}, + {"atomicminf", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicMin), + {{{"a", asAddr}, {"v", asValue}}}, + false}, + {"atomicmini", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicMin), + {{{"a", asAddr}, {"v", asValue}}}, + false}, + {"atomicminl", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicMin), + {{{"a", asAddr}, {"v", asValue}}}, + false}, + {"atomicori", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicOr), + {{{"a", asAddr}, {"v", asValue}}}, + false}, + {"atomicsubd", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicSub), + {{{"a", asAddr}, {"v", asValue}}}, + false}, + {"atomicsubf", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicSub), + {{{"a", asAddr}, {"v", asValue}}}, + false}, + {"atomicsubi", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicSub), + {{{"a", asAddr}, {"v", asValue}}}, + false}, + {"atomicsubl", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicSub), + {{{"a", asAddr}, {"v", asValue}}}, + false}, + {"atomicxori", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(&CI::genAtomicXor), + {{{"a", asAddr}, {"v", asValue}}}, + false}, + {"ballot_sync", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( + &CI::genVoteSync<mlir::NVVM::VoteSyncKind::ballot>), + {{{"mask", asValue}, {"pred", asValue}}}, + /*isElemental=*/false}, + {"barrier_arrive", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( + &CI::genBarrierArrive), + {{{"barrier", asAddr}}}, + /*isElemental=*/false}, + {"barrier_arrive_cnt", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( + &CI::genBarrierArriveCnt), + {{{"barrier", asAddr}, {"count", asValue}}}, + /*isElemental=*/false}, + {"barrier_init", + static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>( + &CI::genBarrierInit), + {{{"barrier", asAddr}, {"count", asValue}}}, + /*isElemental=*/false}, + {"barrier_try_wait", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( + &CI::genBarrierTryWait), + {{{"barrier", asAddr}, {"token", asValue}}}, + /*isElemental=*/false}, + {"barrier_try_wait_sleep", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( + &CI::genBarrierTryWaitSleep), + {{{"barrier", asAddr}, {"token", asValue}, {"ns", asValue}}}, + /*isElemental=*/false}, + {"clock", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( + &CI::genNVVMTime<mlir::NVVM::ClockOp>), + {}, + /*isElemental=*/false}, + {"clock64", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( + 
&CI::genNVVMTime<mlir::NVVM::Clock64Op>), + {}, + /*isElemental=*/false}, + {"fence_proxy_async", + static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>( + &CI::genFenceProxyAsync), + {}, + /*isElemental=*/false}, + {"globaltimer", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( + &CI::genNVVMTime<mlir::NVVM::GlobalTimerOp>), + {}, + /*isElemental=*/false}, + {"match_all_syncjd", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( + &CI::genMatchAllSync), + {{{"mask", asValue}, {"value", asValue}, {"pred", asAddr}}}, + /*isElemental=*/false}, + {"match_all_syncjf", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( + &CI::genMatchAllSync), + {{{"mask", asValue}, {"value", asValue}, {"pred", asAddr}}}, + /*isElemental=*/false}, + {"match_all_syncjj", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( + &CI::genMatchAllSync), + {{{"mask", asValue}, {"value", asValue}, {"pred", asAddr}}}, + /*isElemental=*/false}, + {"match_all_syncjx", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( + &CI::genMatchAllSync), + {{{"mask", asValue}, {"value", asValue}, {"pred", asAddr}}}, + /*isElemental=*/false}, + {"match_any_syncjd", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( + &CI::genMatchAnySync), + {{{"mask", asValue}, {"value", asValue}}}, + /*isElemental=*/false}, + {"match_any_syncjf", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( + &CI::genMatchAnySync), + {{{"mask", asValue}, {"value", asValue}}}, + /*isElemental=*/false}, + {"match_any_syncjj", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( + &CI::genMatchAnySync), + {{{"mask", asValue}, {"value", asValue}}}, + /*isElemental=*/false}, + {"match_any_syncjx", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( + &CI::genMatchAnySync), + {{{"mask", asValue}, {"value", asValue}}}, + /*isElemental=*/false}, + {"syncthreads", + static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>( + &CI::genSyncThreads), + {}, + /*isElemental=*/false}, + {"syncthreads_and_i4", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( + &CI::genSyncThreadsAnd), + {}, + /*isElemental=*/false}, + {"syncthreads_and_l4", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( + &CI::genSyncThreadsAnd), + {}, + /*isElemental=*/false}, + {"syncthreads_count_i4", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( + &CI::genSyncThreadsCount), + {}, + /*isElemental=*/false}, + {"syncthreads_count_l4", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( + &CI::genSyncThreadsCount), + {}, + /*isElemental=*/false}, + {"syncthreads_or_i4", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( + &CI::genSyncThreadsOr), + {}, + /*isElemental=*/false}, + {"syncthreads_or_l4", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( + &CI::genSyncThreadsOr), + {}, + /*isElemental=*/false}, + {"syncwarp", + static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>(&CI::genSyncWarp), + {}, + /*isElemental=*/false}, + {"this_grid", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genThisGrid), + {}, + /*isElemental=*/false}, + {"this_thread_block", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( + &CI::genThisThreadBlock), + {}, + /*isElemental=*/false}, + {"this_warp", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genThisWarp), + {}, + /*isElemental=*/false}, + {"threadfence", + static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>( + &CI::genThreadFence), + {}, + /*isElemental=*/false}, + {"threadfence_block", + 
static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>( + &CI::genThreadFenceBlock), + {}, + /*isElemental=*/false}, + {"threadfence_system", + static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>( + &CI::genThreadFenceSystem), + {}, + /*isElemental=*/false}, + {"tma_bulk_commit_group", + static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>( + &CI::genTMABulkCommitGroup), + {{}}, + /*isElemental=*/false}, + {"tma_bulk_g2s", + static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>(&CI::genTMABulkG2S), + {{{"barrier", asAddr}, + {"src", asAddr}, + {"dst", asAddr}, + {"nbytes", asValue}}}, + /*isElemental=*/false}, + {"tma_bulk_ldc4", + static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>( + &CI::genTMABulkLoadC4), + {{{"barrier", asAddr}, + {"src", asAddr}, + {"dst", asAddr}, + {"nelems", asValue}}}, + /*isElemental=*/false}, + {"tma_bulk_ldc8", + static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>( + &CI::genTMABulkLoadC8), + {{{"barrier", asAddr}, + {"src", asAddr}, + {"dst", asAddr}, + {"nelems", asValue}}}, + /*isElemental=*/false}, + {"tma_bulk_ldi4", + static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>( + &CI::genTMABulkLoadI4), + {{{"barrier", asAddr}, + {"src", asAddr}, + {"dst", asAddr}, + {"nelems", asValue}}}, + /*isElemental=*/false}, + {"tma_bulk_ldi8", + static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>( + &CI::genTMABulkLoadI8), + {{{"barrier", asAddr}, + {"src", asAddr}, + {"dst", asAddr}, + {"nelems", asValue}}}, + /*isElemental=*/false}, + {"tma_bulk_ldr2", + static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>( + &CI::genTMABulkLoadR2), + {{{"barrier", asAddr}, + {"src", asAddr}, + {"dst", asAddr}, + {"nelems", asValue}}}, + /*isElemental=*/false}, + {"tma_bulk_ldr4", + static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>( + &CI::genTMABulkLoadR4), + {{{"barrier", asAddr}, + {"src", asAddr}, + {"dst", asAddr}, + {"nelems", asValue}}}, + /*isElemental=*/false}, + {"tma_bulk_ldr8", + static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>( + &CI::genTMABulkLoadR8), + {{{"barrier", asAddr}, + {"src", asAddr}, + {"dst", asAddr}, + {"nelems", asValue}}}, + /*isElemental=*/false}, + {"tma_bulk_s2g", + static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>(&CI::genTMABulkS2G), + {{{"src", asAddr}, {"dst", asAddr}, {"nbytes", asValue}}}, + /*isElemental=*/false}, + {"tma_bulk_store_c4", + static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>( + &CI::genTMABulkStoreC4), + {{{"src", asAddr}, {"dst", asAddr}, {"count", asValue}}}, + /*isElemental=*/false}, + {"tma_bulk_store_c8", + static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>( + &CI::genTMABulkStoreC8), + {{{"src", asAddr}, {"dst", asAddr}, {"count", asValue}}}, + /*isElemental=*/false}, + {"tma_bulk_store_i4", + static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>( + &CI::genTMABulkStoreI4), + {{{"src", asAddr}, {"dst", asAddr}, {"count", asValue}}}, + /*isElemental=*/false}, + {"tma_bulk_store_i8", + static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>( + &CI::genTMABulkStoreI8), + {{{"src", asAddr}, {"dst", asAddr}, {"count", asValue}}}, + /*isElemental=*/false}, + {"tma_bulk_store_r2", + static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>( + &CI::genTMABulkStoreR2), + {{{"src", asAddr}, {"dst", asAddr}, {"count", asValue}}}, + /*isElemental=*/false}, + {"tma_bulk_store_r4", + static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>( + &CI::genTMABulkStoreR4), + {{{"src", asAddr}, {"dst", asAddr}, {"count", asValue}}}, + /*isElemental=*/false}, + {"tma_bulk_store_r8", + 
static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>( + &CI::genTMABulkStoreR8), + {{{"src", asAddr}, {"dst", asAddr}, {"count", asValue}}}, + /*isElemental=*/false}, + {"tma_bulk_wait_group", + static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>( + &CI::genTMABulkWaitGroup), + {{}}, + /*isElemental=*/false}, +}; + +template <std::size_t N> +static constexpr bool isSorted(const IntrinsicHandler (&array)[N]) { + // Replace by std::is_sorted when C++20 is the default (will be constexpr). + const IntrinsicHandler *lastSeen{nullptr}; + bool isSorted{true}; + for (const auto &x : array) { + if (lastSeen) + isSorted &= std::string_view{lastSeen->name} < std::string_view{x.name}; + lastSeen = &x; + } + return isSorted; +} +static_assert(isSorted(cudaHandlers) && "map must be sorted"); + +const IntrinsicHandler *findCUDAIntrinsicHandler(llvm::StringRef name) { + auto compare = [](const IntrinsicHandler &cudaHandler, llvm::StringRef name) { + return name.compare(cudaHandler.name) > 0; + }; + auto result = llvm::lower_bound(cudaHandlers, name, compare); + return result != std::end(cudaHandlers) && result->name == name ? result + : nullptr; +} + +static mlir::Value convertPtrToNVVMSpace(fir::FirOpBuilder &builder, + mlir::Location loc, + mlir::Value barrier, + mlir::NVVM::NVVMMemorySpace space) { + mlir::Value llvmPtr = fir::ConvertOp::create( + builder, loc, mlir::LLVM::LLVMPointerType::get(builder.getContext()), + barrier); + mlir::Value addrCast = mlir::LLVM::AddrSpaceCastOp::create( + builder, loc, + mlir::LLVM::LLVMPointerType::get(builder.getContext(), + static_cast<unsigned>(space)), + llvmPtr); + return addrCast; +} + +static mlir::Value genAtomBinOp(fir::FirOpBuilder &builder, mlir::Location &loc, + mlir::LLVM::AtomicBinOp binOp, mlir::Value arg0, + mlir::Value arg1) { + auto llvmPointerType = mlir::LLVM::LLVMPointerType::get(builder.getContext()); + arg0 = builder.createConvert(loc, llvmPointerType, arg0); + return mlir::LLVM::AtomicRMWOp::create(builder, loc, binOp, arg0, arg1, + mlir::LLVM::AtomicOrdering::seq_cst); +} + +// ATOMICADD +mlir::Value +CUDAIntrinsicLibrary::genAtomicAdd(mlir::Type resultType, + llvm::ArrayRef<mlir::Value> args) { + assert(args.size() == 2); + mlir::LLVM::AtomicBinOp binOp = + mlir::isa<mlir::IntegerType>(args[1].getType()) + ?
mlir::LLVM::AtomicBinOp::add + : mlir::LLVM::AtomicBinOp::fadd; + return genAtomBinOp(builder, loc, binOp, args[0], args[1]); +} + +fir::ExtendedValue +CUDAIntrinsicLibrary::genAtomicAddR2(mlir::Type resultType, + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 2); + + mlir::Value a = fir::getBase(args[0]); + + if (mlir::isa<fir::BaseBoxType>(a.getType())) { + a = fir::BoxAddrOp::create(builder, loc, a); + } + + auto loc = builder.getUnknownLoc(); + auto f16Ty = builder.getF16Type(); + auto i32Ty = builder.getI32Type(); + auto vecF16Ty = mlir::VectorType::get({2}, f16Ty); + mlir::Type idxTy = builder.getIndexType(); + auto f16RefTy = fir::ReferenceType::get(f16Ty); + auto zero = builder.createIntegerConstant(loc, idxTy, 0); + auto one = builder.createIntegerConstant(loc, idxTy, 1); + auto v1Coord = fir::CoordinateOp::create(builder, loc, f16RefTy, + fir::getBase(args[1]), zero); + auto v2Coord = fir::CoordinateOp::create(builder, loc, f16RefTy, + fir::getBase(args[1]), one); + auto v1 = fir::LoadOp::create(builder, loc, v1Coord); + auto v2 = fir::LoadOp::create(builder, loc, v2Coord); + mlir::Value undef = mlir::LLVM::UndefOp::create(builder, loc, vecF16Ty); + mlir::Value vec1 = mlir::LLVM::InsertElementOp::create( + builder, loc, undef, v1, builder.createIntegerConstant(loc, i32Ty, 0)); + mlir::Value vec2 = mlir::LLVM::InsertElementOp::create( + builder, loc, vec1, v2, builder.createIntegerConstant(loc, i32Ty, 1)); + auto res = genAtomBinOp(builder, loc, mlir::LLVM::AtomicBinOp::fadd, a, vec2); + auto i32VecTy = mlir::VectorType::get({1}, i32Ty); + mlir::Value vecI32 = + mlir::vector::BitCastOp::create(builder, loc, i32VecTy, res); + return mlir::vector::ExtractOp::create(builder, loc, vecI32, + mlir::ArrayRef<int64_t>{0}); +} + +// ATOMICADDVECTOR +template <int extent> +fir::ExtendedValue CUDAIntrinsicLibrary::genAtomicAddVector( + mlir::Type resultType, llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 2); + mlir::Value res = fir::AllocaOp::create( + builder, loc, fir::SequenceType::get({extent}, resultType)); + mlir::Value a = fir::getBase(args[0]); + if (mlir::isa<fir::BaseBoxType>(a.getType())) { + a = fir::BoxAddrOp::create(builder, loc, a); + } + auto vecTy = mlir::VectorType::get({extent}, resultType); + auto refTy = fir::ReferenceType::get(resultType); + mlir::Type i32Ty = builder.getI32Type(); + mlir::Type idxTy = builder.getIndexType(); + + // Extract the values from the array. + llvm::SmallVector<mlir::Value> values; + for (unsigned i = 0; i < extent; ++i) { + mlir::Value pos = builder.createIntegerConstant(loc, idxTy, i); + mlir::Value coord = fir::CoordinateOp::create(builder, loc, refTy, + fir::getBase(args[1]), pos); + mlir::Value value = fir::LoadOp::create(builder, loc, coord); + values.push_back(value); + } + // Pack extracted values into a vector to call the atomic add. + mlir::Value undef = mlir::LLVM::UndefOp::create(builder, loc, vecTy); + for (unsigned i = 0; i < extent; ++i) { + mlir::Value insert = mlir::LLVM::InsertElementOp::create( + builder, loc, undef, values[i], + builder.createIntegerConstant(loc, i32Ty, i)); + undef = insert; + } + // Atomic operation with a vector of values. + mlir::Value add = + genAtomBinOp(builder, loc, mlir::LLVM::AtomicBinOp::fadd, a, undef); + // Store results in the result array. 
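+  // Extract each lane of the vector produced by the atomic operation and
+  // store it into the corresponding element of the result array.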
+ for (unsigned i = 0; i < extent; ++i) { + mlir::Value r = mlir::LLVM::ExtractElementOp::create( + builder, loc, add, builder.createIntegerConstant(loc, i32Ty, i)); + mlir::Value c = fir::CoordinateOp::create( + builder, loc, refTy, res, builder.createIntegerConstant(loc, idxTy, i)); + fir::StoreOp::create(builder, loc, r, c); + } + mlir::Value ext = builder.createIntegerConstant(loc, idxTy, extent); + return fir::ArrayBoxValue(res, {ext}); +} + +mlir::Value +CUDAIntrinsicLibrary::genAtomicAnd(mlir::Type resultType, + llvm::ArrayRef<mlir::Value> args) { + assert(args.size() == 2); + assert(mlir::isa<mlir::IntegerType>(args[1].getType())); + + mlir::LLVM::AtomicBinOp binOp = mlir::LLVM::AtomicBinOp::_and; + return genAtomBinOp(builder, loc, binOp, args[0], args[1]); +} + +mlir::Value +CUDAIntrinsicLibrary::genAtomicOr(mlir::Type resultType, + llvm::ArrayRef<mlir::Value> args) { + assert(args.size() == 2); + assert(mlir::isa<mlir::IntegerType>(args[1].getType())); + + mlir::LLVM::AtomicBinOp binOp = mlir::LLVM::AtomicBinOp::_or; + return genAtomBinOp(builder, loc, binOp, args[0], args[1]); +} + +// ATOMICCAS +fir::ExtendedValue +CUDAIntrinsicLibrary::genAtomicCas(mlir::Type resultType, + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 3); + auto successOrdering = mlir::LLVM::AtomicOrdering::acq_rel; + auto failureOrdering = mlir::LLVM::AtomicOrdering::monotonic; + auto llvmPtrTy = mlir::LLVM::LLVMPointerType::get(resultType.getContext()); + + mlir::Value arg0 = fir::getBase(args[0]); + mlir::Value arg1 = fir::getBase(args[1]); + mlir::Value arg2 = fir::getBase(args[2]); + + auto bitCastFloat = [&](mlir::Value arg) -> mlir::Value { + if (mlir::isa<mlir::Float32Type>(arg.getType())) + return mlir::LLVM::BitcastOp::create(builder, loc, builder.getI32Type(), + arg); + if (mlir::isa<mlir::Float64Type>(arg.getType())) + return mlir::LLVM::BitcastOp::create(builder, loc, builder.getI64Type(), + arg); + return arg; + }; + + arg1 = bitCastFloat(arg1); + arg2 = bitCastFloat(arg2); + + if (arg1.getType() != arg2.getType()) { + // arg1 and arg2 need to have the same type in AtomicCmpXchgOp. 
+ arg2 = builder.createConvert(loc, arg1.getType(), arg2); + } + + auto address = + mlir::UnrealizedConversionCastOp::create(builder, loc, llvmPtrTy, arg0) + .getResult(0); + auto cmpxchg = mlir::LLVM::AtomicCmpXchgOp::create( + builder, loc, address, arg1, arg2, successOrdering, failureOrdering); + mlir::Value boolResult = + mlir::LLVM::ExtractValueOp::create(builder, loc, cmpxchg, 1); + return builder.createConvert(loc, resultType, boolResult); +} + +mlir::Value +CUDAIntrinsicLibrary::genAtomicDec(mlir::Type resultType, + llvm::ArrayRef<mlir::Value> args) { + assert(args.size() == 2); + assert(mlir::isa<mlir::IntegerType>(args[1].getType())); + + mlir::LLVM::AtomicBinOp binOp = mlir::LLVM::AtomicBinOp::udec_wrap; + return genAtomBinOp(builder, loc, binOp, args[0], args[1]); +} + +// ATOMICEXCH +fir::ExtendedValue +CUDAIntrinsicLibrary::genAtomicExch(mlir::Type resultType, + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 2); + mlir::Value arg0 = fir::getBase(args[0]); + mlir::Value arg1 = fir::getBase(args[1]); + assert(arg1.getType().isIntOrFloat()); + + mlir::LLVM::AtomicBinOp binOp = mlir::LLVM::AtomicBinOp::xchg; + return genAtomBinOp(builder, loc, binOp, arg0, arg1); +} + +mlir::Value +CUDAIntrinsicLibrary::genAtomicInc(mlir::Type resultType, + llvm::ArrayRef<mlir::Value> args) { + assert(args.size() == 2); + assert(mlir::isa<mlir::IntegerType>(args[1].getType())); + + mlir::LLVM::AtomicBinOp binOp = mlir::LLVM::AtomicBinOp::uinc_wrap; + return genAtomBinOp(builder, loc, binOp, args[0], args[1]); +} + +mlir::Value +CUDAIntrinsicLibrary::genAtomicMax(mlir::Type resultType, + llvm::ArrayRef<mlir::Value> args) { + assert(args.size() == 2); + + mlir::LLVM::AtomicBinOp binOp = + mlir::isa<mlir::IntegerType>(args[1].getType()) + ? mlir::LLVM::AtomicBinOp::max + : mlir::LLVM::AtomicBinOp::fmax; + return genAtomBinOp(builder, loc, binOp, args[0], args[1]); +} + +mlir::Value +CUDAIntrinsicLibrary::genAtomicMin(mlir::Type resultType, + llvm::ArrayRef<mlir::Value> args) { + assert(args.size() == 2); + + mlir::LLVM::AtomicBinOp binOp = + mlir::isa<mlir::IntegerType>(args[1].getType()) + ? mlir::LLVM::AtomicBinOp::min + : mlir::LLVM::AtomicBinOp::fmin; + return genAtomBinOp(builder, loc, binOp, args[0], args[1]); +} + +// ATOMICSUB +mlir::Value +CUDAIntrinsicLibrary::genAtomicSub(mlir::Type resultType, + llvm::ArrayRef<mlir::Value> args) { + assert(args.size() == 2); + mlir::LLVM::AtomicBinOp binOp = + mlir::isa<mlir::IntegerType>(args[1].getType()) + ? 
mlir::LLVM::AtomicBinOp::sub + : mlir::LLVM::AtomicBinOp::fsub; + return genAtomBinOp(builder, loc, binOp, args[0], args[1]); +} + +// ATOMICXOR +fir::ExtendedValue +CUDAIntrinsicLibrary::genAtomicXor(mlir::Type resultType, + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 2); + mlir::Value arg0 = fir::getBase(args[0]); + mlir::Value arg1 = fir::getBase(args[1]); + return genAtomBinOp(builder, loc, mlir::LLVM::AtomicBinOp::_xor, arg0, arg1); +} + +// BARRIER_ARRIVE +mlir::Value +CUDAIntrinsicLibrary::genBarrierArrive(mlir::Type resultType, + llvm::ArrayRef<mlir::Value> args) { + assert(args.size() == 1); + mlir::Value barrier = convertPtrToNVVMSpace( + builder, loc, args[0], mlir::NVVM::NVVMMemorySpace::Shared); + return mlir::NVVM::MBarrierArriveSharedOp::create(builder, loc, resultType, + barrier) + .getResult(); +} + +// BARRIER_ARRIVE_CNT +mlir::Value +CUDAIntrinsicLibrary::genBarrierArriveCnt(mlir::Type resultType, + llvm::ArrayRef<mlir::Value> args) { + assert(args.size() == 2); + mlir::Value barrier = convertPtrToNVVMSpace( + builder, loc, args[0], mlir::NVVM::NVVMMemorySpace::Shared); + return mlir::NVVM::InlinePtxOp::create(builder, loc, {resultType}, + {barrier, args[1]}, {}, + "mbarrier.arrive.expect_tx.release." + "cta.shared::cta.b64 %0, [%1], %2;", + {}) + .getResult(0); +} + +// BARRIER_INIT +void CUDAIntrinsicLibrary::genBarrierInit( + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 2); + mlir::Value barrier = convertPtrToNVVMSpace( + builder, loc, fir::getBase(args[0]), mlir::NVVM::NVVMMemorySpace::Shared); + mlir::NVVM::MBarrierInitOp::create(builder, loc, barrier, + fir::getBase(args[1]), {}); + auto kind = mlir::NVVM::ProxyKindAttr::get( + builder.getContext(), mlir::NVVM::ProxyKind::async_shared); + auto space = mlir::NVVM::SharedSpaceAttr::get( + builder.getContext(), mlir::NVVM::SharedSpace::shared_cta); + mlir::NVVM::FenceProxyOp::create(builder, loc, kind, space); +} + +// BARRIER_TRY_WAIT +mlir::Value +CUDAIntrinsicLibrary::genBarrierTryWait(mlir::Type resultType, + llvm::ArrayRef<mlir::Value> args) { + assert(args.size() == 2); + mlir::Value res = fir::AllocaOp::create(builder, loc, resultType); + mlir::Value zero = builder.createIntegerConstant(loc, resultType, 0); + fir::StoreOp::create(builder, loc, zero, res); + mlir::Value ns = + builder.createIntegerConstant(loc, builder.getI32Type(), 1000000); + mlir::Value load = fir::LoadOp::create(builder, loc, res); + auto whileOp = mlir::scf::WhileOp::create( + builder, loc, mlir::TypeRange{resultType}, mlir::ValueRange{load}); + mlir::Block *beforeBlock = builder.createBlock(&whileOp.getBefore()); + mlir::Value beforeArg = beforeBlock->addArgument(resultType, loc); + builder.setInsertionPointToStart(beforeBlock); + mlir::Value condition = mlir::arith::CmpIOp::create( + builder, loc, mlir::arith::CmpIPredicate::ne, beforeArg, zero); + mlir::scf::ConditionOp::create(builder, loc, condition, beforeArg); + mlir::Block *afterBlock = builder.createBlock(&whileOp.getAfter()); + afterBlock->addArgument(resultType, loc); + builder.setInsertionPointToStart(afterBlock); + auto llvmPtrTy = mlir::LLVM::LLVMPointerType::get(builder.getContext()); + auto barrier = builder.createConvert(loc, llvmPtrTy, args[0]); + mlir::Value ret = mlir::NVVM::InlinePtxOp::create( + builder, loc, {resultType}, {barrier, args[1], ns}, {}, + "{\n" + " .reg .pred p;\n" + " mbarrier.try_wait.shared.b64 p, [%1], %2, %3;\n" + " selp.b32 %0, 1, 0, p;\n" + "}", + {}) + .getResult(0); +
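// Feed the PTX predicate back through scf.yield; the before-region compares
+  // it against zero to decide whether the wait loop runs again.
+ 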
mlir::scf::YieldOp::create(builder, loc, ret); + builder.setInsertionPointAfter(whileOp); + return whileOp.getResult(0); +} + +// BARRIER_TRY_WAIT_SLEEP +mlir::Value +CUDAIntrinsicLibrary::genBarrierTryWaitSleep(mlir::Type resultType, + llvm::ArrayRef<mlir::Value> args) { + assert(args.size() == 3); + auto llvmPtrTy = mlir::LLVM::LLVMPointerType::get(builder.getContext()); + auto barrier = builder.createConvert(loc, llvmPtrTy, args[0]); + return mlir::NVVM::InlinePtxOp::create( + builder, loc, {resultType}, {barrier, args[1], args[2]}, {}, + "{\n" + " .reg .pred p;\n" + " mbarrier.try_wait.shared.b64 p, [%1], %2, %3;\n" + " selp.b32 %0, 1, 0, p;\n" + "}", + {}) + .getResult(0); +} + +// FENCE_PROXY_ASYNC +void CUDAIntrinsicLibrary::genFenceProxyAsync( + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 0); + auto kind = mlir::NVVM::ProxyKindAttr::get( + builder.getContext(), mlir::NVVM::ProxyKind::async_shared); + auto space = mlir::NVVM::SharedSpaceAttr::get( + builder.getContext(), mlir::NVVM::SharedSpace::shared_cta); + mlir::NVVM::FenceProxyOp::create(builder, loc, kind, space); +} + +// __LDCA, __LDCS, __LDLU, __LDCV +template <const char *fctName, int extent> +fir::ExtendedValue +CUDAIntrinsicLibrary::genLDXXFunc(mlir::Type resultType, + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 1); + mlir::Type resTy = fir::SequenceType::get(extent, resultType); + mlir::Value arg = fir::getBase(args[0]); + mlir::Value res = fir::AllocaOp::create(builder, loc, resTy); + if (mlir::isa<fir::BaseBoxType>(arg.getType())) + arg = fir::BoxAddrOp::create(builder, loc, arg); + mlir::Type refResTy = fir::ReferenceType::get(resTy); + mlir::FunctionType ftype = + mlir::FunctionType::get(arg.getContext(), {refResTy, refResTy}, {}); + auto funcOp = builder.createFunction(loc, fctName, ftype); + llvm::SmallVector<mlir::Value> funcArgs; + funcArgs.push_back(res); + funcArgs.push_back(arg); + fir::CallOp::create(builder, loc, funcOp, funcArgs); + mlir::Value ext = + builder.createIntegerConstant(loc, builder.getIndexType(), extent); + return fir::ArrayBoxValue(res, {ext}); +} + +// CLOCK, CLOCK64, GLOBALTIMER +template <typename OpTy> +mlir::Value +CUDAIntrinsicLibrary::genNVVMTime(mlir::Type resultType, + llvm::ArrayRef<mlir::Value> args) { + assert(args.size() == 0 && "expect no arguments"); + return OpTy::create(builder, loc, resultType).getResult(); +} + +// MATCH_ALL_SYNC +mlir::Value +CUDAIntrinsicLibrary::genMatchAllSync(mlir::Type resultType, + llvm::ArrayRef<mlir::Value> args) { + assert(args.size() == 3); + bool is32 = args[1].getType().isInteger(32) || args[1].getType().isF32(); + + mlir::Type i1Ty = builder.getI1Type(); + mlir::MLIRContext *context = builder.getContext(); + + mlir::Value arg1 = args[1]; + if (arg1.getType().isF32() || arg1.getType().isF64()) + arg1 = fir::ConvertOp::create( + builder, loc, is32 ? 
builder.getI32Type() : builder.getI64Type(), arg1); + + mlir::Type retTy = + mlir::LLVM::LLVMStructType::getLiteral(context, {resultType, i1Ty}); + auto match = + mlir::NVVM::MatchSyncOp::create(builder, loc, retTy, args[0], arg1, + mlir::NVVM::MatchSyncKind::all) + .getResult(); + auto value = mlir::LLVM::ExtractValueOp::create(builder, loc, match, 0); + auto pred = mlir::LLVM::ExtractValueOp::create(builder, loc, match, 1); + auto conv = mlir::LLVM::ZExtOp::create(builder, loc, resultType, pred); + fir::StoreOp::create(builder, loc, conv, args[2]); + return value; +} + +// MATCH_ANY_SYNC +mlir::Value +CUDAIntrinsicLibrary::genMatchAnySync(mlir::Type resultType, + llvm::ArrayRef<mlir::Value> args) { + assert(args.size() == 2); + bool is32 = args[1].getType().isInteger(32) || args[1].getType().isF32(); + + mlir::Value arg1 = args[1]; + if (arg1.getType().isF32() || arg1.getType().isF64()) + arg1 = fir::ConvertOp::create( + builder, loc, is32 ? builder.getI32Type() : builder.getI64Type(), arg1); + + return mlir::NVVM::MatchSyncOp::create(builder, loc, resultType, args[0], + arg1, mlir::NVVM::MatchSyncKind::any) + .getResult(); +} + +// SYNCTHREADS +void CUDAIntrinsicLibrary::genSyncThreads( + llvm::ArrayRef<fir::ExtendedValue> args) { + mlir::NVVM::Barrier0Op::create(builder, loc); +} + +// SYNCTHREADS_AND +mlir::Value +CUDAIntrinsicLibrary::genSyncThreadsAnd(mlir::Type resultType, + llvm::ArrayRef<mlir::Value> args) { + constexpr llvm::StringLiteral funcName = "llvm.nvvm.barrier0.and"; + mlir::MLIRContext *context = builder.getContext(); + mlir::Type i32 = builder.getI32Type(); + mlir::FunctionType ftype = + mlir::FunctionType::get(context, {resultType}, {i32}); + auto funcOp = builder.createFunction(loc, funcName, ftype); + mlir::Value arg = builder.createConvert(loc, i32, args[0]); + return fir::CallOp::create(builder, loc, funcOp, {arg}).getResult(0); +} + +// SYNCTHREADS_COUNT +mlir::Value +CUDAIntrinsicLibrary::genSyncThreadsCount(mlir::Type resultType, + llvm::ArrayRef<mlir::Value> args) { + constexpr llvm::StringLiteral funcName = "llvm.nvvm.barrier0.popc"; + mlir::MLIRContext *context = builder.getContext(); + mlir::Type i32 = builder.getI32Type(); + mlir::FunctionType ftype = + mlir::FunctionType::get(context, {resultType}, {i32}); + auto funcOp = builder.createFunction(loc, funcName, ftype); + mlir::Value arg = builder.createConvert(loc, i32, args[0]); + return fir::CallOp::create(builder, loc, funcOp, {arg}).getResult(0); +} + +// SYNCTHREADS_OR +mlir::Value +CUDAIntrinsicLibrary::genSyncThreadsOr(mlir::Type resultType, + llvm::ArrayRef<mlir::Value> args) { + constexpr llvm::StringLiteral funcName = "llvm.nvvm.barrier0.or"; + mlir::MLIRContext *context = builder.getContext(); + mlir::Type i32 = builder.getI32Type(); + mlir::FunctionType ftype = + mlir::FunctionType::get(context, {resultType}, {i32}); + auto funcOp = builder.createFunction(loc, funcName, ftype); + mlir::Value arg = builder.createConvert(loc, i32, args[0]); + return fir::CallOp::create(builder, loc, funcOp, {arg}).getResult(0); +} + +// SYNCWARP +void CUDAIntrinsicLibrary::genSyncWarp( + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 1); + constexpr llvm::StringLiteral funcName = "llvm.nvvm.bar.warp.sync"; + mlir::Value mask = fir::getBase(args[0]); + mlir::FunctionType funcType = + mlir::FunctionType::get(builder.getContext(), {mask.getType()}, {}); + auto funcOp = builder.createFunction(loc, funcName, funcType); + llvm::SmallVector<mlir::Value> argsList{mask}; + fir::CallOp::create(builder, 
loc, funcOp, argsList); +} + +// THIS_GRID +mlir::Value +CUDAIntrinsicLibrary::genThisGrid(mlir::Type resultType, + llvm::ArrayRef<mlir::Value> args) { + assert(args.size() == 0); + auto recTy = mlir::cast<fir::RecordType>(resultType); + assert(recTy && "RecordType expected"); + mlir::Value res = fir::AllocaOp::create(builder, loc, resultType); + mlir::Type i32Ty = builder.getI32Type(); + + mlir::Value threadIdX = mlir::NVVM::ThreadIdXOp::create(builder, loc, i32Ty); + mlir::Value threadIdY = mlir::NVVM::ThreadIdYOp::create(builder, loc, i32Ty); + mlir::Value threadIdZ = mlir::NVVM::ThreadIdZOp::create(builder, loc, i32Ty); + + mlir::Value blockIdX = mlir::NVVM::BlockIdXOp::create(builder, loc, i32Ty); + mlir::Value blockIdY = mlir::NVVM::BlockIdYOp::create(builder, loc, i32Ty); + mlir::Value blockIdZ = mlir::NVVM::BlockIdZOp::create(builder, loc, i32Ty); + + mlir::Value blockDimX = mlir::NVVM::BlockDimXOp::create(builder, loc, i32Ty); + mlir::Value blockDimY = mlir::NVVM::BlockDimYOp::create(builder, loc, i32Ty); + mlir::Value blockDimZ = mlir::NVVM::BlockDimZOp::create(builder, loc, i32Ty); + mlir::Value gridDimX = mlir::NVVM::GridDimXOp::create(builder, loc, i32Ty); + mlir::Value gridDimY = mlir::NVVM::GridDimYOp::create(builder, loc, i32Ty); + mlir::Value gridDimZ = mlir::NVVM::GridDimZOp::create(builder, loc, i32Ty); + + // this_grid.size = ((blockDim.z * gridDim.z) * (blockDim.y * gridDim.y)) * + // (blockDim.x * gridDim.x); + mlir::Value resZ = + mlir::arith::MulIOp::create(builder, loc, blockDimZ, gridDimZ); + mlir::Value resY = + mlir::arith::MulIOp::create(builder, loc, blockDimY, gridDimY); + mlir::Value resX = + mlir::arith::MulIOp::create(builder, loc, blockDimX, gridDimX); + mlir::Value resZY = mlir::arith::MulIOp::create(builder, loc, resZ, resY); + mlir::Value size = mlir::arith::MulIOp::create(builder, loc, resZY, resX); + + // tmp = ((blockIdx.z * gridDim.y * gridDim.x) + (blockIdx.y * gridDim.x)) + + // blockIdx.x; + // this_grid.rank = tmp * ((blockDim.x * blockDim.y) * blockDim.z) + + // ((threadIdx.z * blockDim.y) * blockDim.x) + + // (threadIdx.y * blockDim.x) + threadIdx.x + 1; + mlir::Value r1 = + mlir::arith::MulIOp::create(builder, loc, blockIdZ, gridDimY); + mlir::Value r2 = mlir::arith::MulIOp::create(builder, loc, r1, gridDimX); + mlir::Value r3 = + mlir::arith::MulIOp::create(builder, loc, blockIdY, gridDimX); + mlir::Value r2r3 = mlir::arith::AddIOp::create(builder, loc, r2, r3); + mlir::Value tmp = mlir::arith::AddIOp::create(builder, loc, r2r3, blockIdX); + + mlir::Value bXbY = + mlir::arith::MulIOp::create(builder, loc, blockDimX, blockDimY); + mlir::Value bXbYbZ = + mlir::arith::MulIOp::create(builder, loc, bXbY, blockDimZ); + mlir::Value tZbY = + mlir::arith::MulIOp::create(builder, loc, threadIdZ, blockDimY); + mlir::Value tZbYbX = + mlir::arith::MulIOp::create(builder, loc, tZbY, blockDimX); + mlir::Value tYbX = + mlir::arith::MulIOp::create(builder, loc, threadIdY, blockDimX); + mlir::Value rank = mlir::arith::MulIOp::create(builder, loc, tmp, bXbYbZ); + rank = mlir::arith::AddIOp::create(builder, loc, rank, tZbYbX); + rank = mlir::arith::AddIOp::create(builder, loc, rank, tYbX); + rank = mlir::arith::AddIOp::create(builder, loc, rank, threadIdX); + mlir::Value one = builder.createIntegerConstant(loc, i32Ty, 1); + rank = mlir::arith::AddIOp::create(builder, loc, rank, one); + + auto sizeFieldName = recTy.getTypeList()[1].first; + mlir::Type sizeFieldTy = recTy.getTypeList()[1].second; + mlir::Type fieldIndexType =
fir::FieldType::get(resultType.getContext()); + mlir::Value sizeFieldIndex = fir::FieldIndexOp::create( + builder, loc, fieldIndexType, sizeFieldName, recTy, + /*typeParams=*/mlir::ValueRange{}); + mlir::Value sizeCoord = fir::CoordinateOp::create( + builder, loc, builder.getRefType(sizeFieldTy), res, sizeFieldIndex); + fir::StoreOp::create(builder, loc, size, sizeCoord); + + auto rankFieldName = recTy.getTypeList()[2].first; + mlir::Type rankFieldTy = recTy.getTypeList()[2].second; + mlir::Value rankFieldIndex = fir::FieldIndexOp::create( + builder, loc, fieldIndexType, rankFieldName, recTy, + /*typeParams=*/mlir::ValueRange{}); + mlir::Value rankCoord = fir::CoordinateOp::create( + builder, loc, builder.getRefType(rankFieldTy), res, rankFieldIndex); + fir::StoreOp::create(builder, loc, rank, rankCoord); + return res; +} + +// THIS_THREAD_BLOCK +mlir::Value +CUDAIntrinsicLibrary::genThisThreadBlock(mlir::Type resultType, + llvm::ArrayRef<mlir::Value> args) { + assert(args.size() == 0); + auto recTy = mlir::cast<fir::RecordType>(resultType); + assert(recTy && "RecordType expected"); + mlir::Value res = fir::AllocaOp::create(builder, loc, resultType); + mlir::Type i32Ty = builder.getI32Type(); + + // this_thread_block%size = blockDim.z * blockDim.y * blockDim.x; + mlir::Value blockDimX = mlir::NVVM::BlockDimXOp::create(builder, loc, i32Ty); + mlir::Value blockDimY = mlir::NVVM::BlockDimYOp::create(builder, loc, i32Ty); + mlir::Value blockDimZ = mlir::NVVM::BlockDimZOp::create(builder, loc, i32Ty); + mlir::Value size = + mlir::arith::MulIOp::create(builder, loc, blockDimZ, blockDimY); + size = mlir::arith::MulIOp::create(builder, loc, size, blockDimX); + + // this_thread_block%rank = ((threadIdx.z * blockDim.y) * blockDim.x) + + // (threadIdx.y * blockDim.x) + threadIdx.x + 1; + mlir::Value threadIdX = mlir::NVVM::ThreadIdXOp::create(builder, loc, i32Ty); + mlir::Value threadIdY = mlir::NVVM::ThreadIdYOp::create(builder, loc, i32Ty); + mlir::Value threadIdZ = mlir::NVVM::ThreadIdZOp::create(builder, loc, i32Ty); + mlir::Value r1 = + mlir::arith::MulIOp::create(builder, loc, threadIdZ, blockDimY); + mlir::Value r2 = mlir::arith::MulIOp::create(builder, loc, r1, blockDimX); + mlir::Value r3 = + mlir::arith::MulIOp::create(builder, loc, threadIdY, blockDimX); + mlir::Value r2r3 = mlir::arith::AddIOp::create(builder, loc, r2, r3); + mlir::Value rank = mlir::arith::AddIOp::create(builder, loc, r2r3, threadIdX); + mlir::Value one = builder.createIntegerConstant(loc, i32Ty, 1); + rank = mlir::arith::AddIOp::create(builder, loc, rank, one); + + auto sizeFieldName = recTy.getTypeList()[1].first; + mlir::Type sizeFieldTy = recTy.getTypeList()[1].second; + mlir::Type fieldIndexType = fir::FieldType::get(resultType.getContext()); + mlir::Value sizeFieldIndex = fir::FieldIndexOp::create( + builder, loc, fieldIndexType, sizeFieldName, recTy, + /*typeParams=*/mlir::ValueRange{}); + mlir::Value sizeCoord = fir::CoordinateOp::create( + builder, loc, builder.getRefType(sizeFieldTy), res, sizeFieldIndex); + fir::StoreOp::create(builder, loc, size, sizeCoord); + + auto rankFieldName = recTy.getTypeList()[2].first; + mlir::Type rankFieldTy = recTy.getTypeList()[2].second; + mlir::Value rankFieldIndex = fir::FieldIndexOp::create( + builder, loc, fieldIndexType, rankFieldName, recTy, + /*typeParams=*/mlir::ValueRange{}); + mlir::Value rankCoord = fir::CoordinateOp::create( + builder, loc, builder.getRefType(rankFieldTy), res, rankFieldIndex); + fir::StoreOp::create(builder, loc, rank, rankCoord); + return res;
+} + +// THIS_WARP +mlir::Value +CUDAIntrinsicLibrary::genThisWarp(mlir::Type resultType, + llvm::ArrayRef<mlir::Value> args) { + assert(args.size() == 0); + auto recTy = mlir::cast<fir::RecordType>(resultType); + assert(recTy && "RecordType expected"); + mlir::Value res = fir::AllocaOp::create(builder, loc, resultType); + mlir::Type i32Ty = builder.getI32Type(); + + // coalesced_group%size = 32 + mlir::Value size = builder.createIntegerConstant(loc, i32Ty, 32); + auto sizeFieldName = recTy.getTypeList()[1].first; + mlir::Type sizeFieldTy = recTy.getTypeList()[1].second; + mlir::Type fieldIndexType = fir::FieldType::get(resultType.getContext()); + mlir::Value sizeFieldIndex = fir::FieldIndexOp::create( + builder, loc, fieldIndexType, sizeFieldName, recTy, + /*typeParams=*/mlir::ValueRange{}); + mlir::Value sizeCoord = fir::CoordinateOp::create( + builder, loc, builder.getRefType(sizeFieldTy), res, sizeFieldIndex); + fir::StoreOp::create(builder, loc, size, sizeCoord); + + // coalesced_group%rank = (threadIdx.x & 31) + 1 + mlir::Value threadIdX = mlir::NVVM::ThreadIdXOp::create(builder, loc, i32Ty); + mlir::Value mask = builder.createIntegerConstant(loc, i32Ty, 31); + mlir::Value one = builder.createIntegerConstant(loc, i32Ty, 1); + mlir::Value masked = + mlir::arith::AndIOp::create(builder, loc, threadIdX, mask); + mlir::Value rank = mlir::arith::AddIOp::create(builder, loc, masked, one); + auto rankFieldName = recTy.getTypeList()[2].first; + mlir::Type rankFieldTy = recTy.getTypeList()[2].second; + mlir::Value rankFieldIndex = fir::FieldIndexOp::create( + builder, loc, fieldIndexType, rankFieldName, recTy, + /*typeParams=*/mlir::ValueRange{}); + mlir::Value rankCoord = fir::CoordinateOp::create( + builder, loc, builder.getRefType(rankFieldTy), res, rankFieldIndex); + fir::StoreOp::create(builder, loc, rank, rankCoord); + return res; +} + +// THREADFENCE +void CUDAIntrinsicLibrary::genThreadFence( + llvm::ArrayRef<fir::ExtendedValue> args) { + constexpr llvm::StringLiteral funcName = "llvm.nvvm.membar.gl"; + mlir::FunctionType funcType = + mlir::FunctionType::get(builder.getContext(), {}, {}); + auto funcOp = builder.createFunction(loc, funcName, funcType); + llvm::SmallVector<mlir::Value> noArgs; + fir::CallOp::create(builder, loc, funcOp, noArgs); +} + +// THREADFENCE_BLOCK +void CUDAIntrinsicLibrary::genThreadFenceBlock( + llvm::ArrayRef<fir::ExtendedValue> args) { + constexpr llvm::StringLiteral funcName = "llvm.nvvm.membar.cta"; + mlir::FunctionType funcType = + mlir::FunctionType::get(builder.getContext(), {}, {}); + auto funcOp = builder.createFunction(loc, funcName, funcType); + llvm::SmallVector<mlir::Value> noArgs; + fir::CallOp::create(builder, loc, funcOp, noArgs); +} + +// THREADFENCE_SYSTEM +void CUDAIntrinsicLibrary::genThreadFenceSystem( + llvm::ArrayRef<fir::ExtendedValue> args) { + constexpr llvm::StringLiteral funcName = "llvm.nvvm.membar.sys"; + mlir::FunctionType funcType = + mlir::FunctionType::get(builder.getContext(), {}, {}); + auto funcOp = builder.createFunction(loc, funcName, funcType); + llvm::SmallVector<mlir::Value> noArgs; + fir::CallOp::create(builder, loc, funcOp, noArgs); +} + +// TMA_BULK_COMMIT_GROUP +void CUDAIntrinsicLibrary::genTMABulkCommitGroup( + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 0); + mlir::NVVM::CpAsyncBulkCommitGroupOp::create(builder, loc); +} + +// TMA_BULK_G2S +void CUDAIntrinsicLibrary::genTMABulkG2S( + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 4); + mlir::Value barrier =
convertPtrToNVVMSpace( + builder, loc, fir::getBase(args[0]), mlir::NVVM::NVVMMemorySpace::Shared); + mlir::Value dst = + convertPtrToNVVMSpace(builder, loc, fir::getBase(args[2]), + mlir::NVVM::NVVMMemorySpace::SharedCluster); + mlir::Value src = convertPtrToNVVMSpace(builder, loc, fir::getBase(args[1]), + mlir::NVVM::NVVMMemorySpace::Global); + mlir::NVVM::CpAsyncBulkGlobalToSharedClusterOp::create( + builder, loc, dst, src, barrier, fir::getBase(args[3]), {}, {}); +} + +static void genTMABulkLoad(fir::FirOpBuilder &builder, mlir::Location loc, + mlir::Value barrier, mlir::Value src, + mlir::Value dst, mlir::Value nelem, + mlir::Value eleSize) { + mlir::Value size = mlir::arith::MulIOp::create(builder, loc, nelem, eleSize); + auto llvmPtrTy = mlir::LLVM::LLVMPointerType::get(builder.getContext()); + barrier = builder.createConvert(loc, llvmPtrTy, barrier); + dst = builder.createConvert(loc, llvmPtrTy, dst); + src = builder.createConvert(loc, llvmPtrTy, src); + mlir::NVVM::InlinePtxOp::create( + builder, loc, mlir::TypeRange{}, {dst, src, size, barrier}, {}, + "cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%0], " + "[%1], %2, [%3];", + {}); + mlir::NVVM::InlinePtxOp::create( + builder, loc, mlir::TypeRange{}, {barrier, size}, {}, + "mbarrier.expect_tx.relaxed.cta.shared::cta.b64 [%0], %1;", {}); +} + +// TMA_BULK_LOADC4 +void CUDAIntrinsicLibrary::genTMABulkLoadC4( + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 4); + mlir::Value eleSize = + builder.createIntegerConstant(loc, builder.getI32Type(), 8); + genTMABulkLoad(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), + fir::getBase(args[2]), fir::getBase(args[3]), eleSize); +} + +// TMA_BULK_LOADC8 +void CUDAIntrinsicLibrary::genTMABulkLoadC8( + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 4); + mlir::Value eleSize = + builder.createIntegerConstant(loc, builder.getI32Type(), 16); + genTMABulkLoad(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), + fir::getBase(args[2]), fir::getBase(args[3]), eleSize); +} + +// TMA_BULK_LOADI4 +void CUDAIntrinsicLibrary::genTMABulkLoadI4( + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 4); + mlir::Value eleSize = + builder.createIntegerConstant(loc, builder.getI32Type(), 4); + genTMABulkLoad(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), + fir::getBase(args[2]), fir::getBase(args[3]), eleSize); +} + +// TMA_BULK_LOADI8 +void CUDAIntrinsicLibrary::genTMABulkLoadI8( + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 4); + mlir::Value eleSize = + builder.createIntegerConstant(loc, builder.getI32Type(), 8); + genTMABulkLoad(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), + fir::getBase(args[2]), fir::getBase(args[3]), eleSize); +} + +// TMA_BULK_LOADR2 +void CUDAIntrinsicLibrary::genTMABulkLoadR2( + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 4); + mlir::Value eleSize = + builder.createIntegerConstant(loc, builder.getI32Type(), 2); + genTMABulkLoad(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), + fir::getBase(args[2]), fir::getBase(args[3]), eleSize); +} + +// TMA_BULK_LOADR4 +void CUDAIntrinsicLibrary::genTMABulkLoadR4( + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 4); + mlir::Value eleSize = + builder.createIntegerConstant(loc, builder.getI32Type(), 4); + genTMABulkLoad(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), + fir::getBase(args[2]), fir::getBase(args[3]), eleSize); +} + +// 
TMA_BULK_LOADR8 +void CUDAIntrinsicLibrary::genTMABulkLoadR8( + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 4); + mlir::Value eleSize = + builder.createIntegerConstant(loc, builder.getI32Type(), 8); + genTMABulkLoad(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), + fir::getBase(args[2]), fir::getBase(args[3]), eleSize); +} + +// TMA_BULK_S2G +void CUDAIntrinsicLibrary::genTMABulkS2G( + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 3); + mlir::Value src = convertPtrToNVVMSpace(builder, loc, fir::getBase(args[0]), + mlir::NVVM::NVVMMemorySpace::Shared); + mlir::Value dst = convertPtrToNVVMSpace(builder, loc, fir::getBase(args[1]), + mlir::NVVM::NVVMMemorySpace::Global); + mlir::NVVM::CpAsyncBulkSharedCTAToGlobalOp::create( + builder, loc, dst, src, fir::getBase(args[2]), {}, {}); + + mlir::NVVM::InlinePtxOp::create(builder, loc, mlir::TypeRange{}, {}, {}, + "cp.async.bulk.commit_group;", {}); + mlir::NVVM::CpAsyncBulkWaitGroupOp::create(builder, loc, + builder.getI32IntegerAttr(0), {}); +} + +static void genTMABulkStore(fir::FirOpBuilder &builder, mlir::Location loc, + mlir::Value src, mlir::Value dst, mlir::Value count, + mlir::Value eleSize) { + mlir::Value size = mlir::arith::MulIOp::create(builder, loc, eleSize, count); + src = convertPtrToNVVMSpace(builder, loc, src, + mlir::NVVM::NVVMMemorySpace::Shared); + dst = convertPtrToNVVMSpace(builder, loc, dst, + mlir::NVVM::NVVMMemorySpace::Global); + mlir::NVVM::CpAsyncBulkSharedCTAToGlobalOp::create(builder, loc, dst, src, + size, {}, {}); + mlir::NVVM::InlinePtxOp::create(builder, loc, mlir::TypeRange{}, {}, {}, + "cp.async.bulk.commit_group;", {}); + mlir::NVVM::CpAsyncBulkWaitGroupOp::create(builder, loc, + builder.getI32IntegerAttr(0), {}); +} + +// TMA_BULK_STORE_C4 +void CUDAIntrinsicLibrary::genTMABulkStoreC4( + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 3); + mlir::Value eleSize = + builder.createIntegerConstant(loc, builder.getI32Type(), 8); + genTMABulkStore(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), + fir::getBase(args[2]), eleSize); +} + +// TMA_BULK_STORE_C8 +void CUDAIntrinsicLibrary::genTMABulkStoreC8( + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 3); + mlir::Value eleSize = + builder.createIntegerConstant(loc, builder.getI32Type(), 16); + genTMABulkStore(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), + fir::getBase(args[2]), eleSize); +} + +// TMA_BULK_STORE_I4 +void CUDAIntrinsicLibrary::genTMABulkStoreI4( + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 3); + mlir::Value eleSize = + builder.createIntegerConstant(loc, builder.getI32Type(), 4); + genTMABulkStore(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), + fir::getBase(args[2]), eleSize); +} + +// TMA_BULK_STORE_I8 +void CUDAIntrinsicLibrary::genTMABulkStoreI8( + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 3); + mlir::Value eleSize = + builder.createIntegerConstant(loc, builder.getI32Type(), 8); + genTMABulkStore(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), + fir::getBase(args[2]), eleSize); +} + +// TMA_BULK_STORE_R2 +void CUDAIntrinsicLibrary::genTMABulkStoreR2( + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 3); + mlir::Value eleSize = + builder.createIntegerConstant(loc, builder.getI32Type(), 2); + genTMABulkStore(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), + fir::getBase(args[2]), eleSize); +} + +// TMA_BULK_STORE_R4 +void 
CUDAIntrinsicLibrary::genTMABulkStoreR4( + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 3); + mlir::Value eleSize = + builder.createIntegerConstant(loc, builder.getI32Type(), 4); + genTMABulkStore(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), + fir::getBase(args[2]), eleSize); +} + +// TMA_BULK_STORE_R8 +void CUDAIntrinsicLibrary::genTMABulkStoreR8( + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 3); + mlir::Value eleSize = + builder.createIntegerConstant(loc, builder.getI32Type(), 8); + genTMABulkStore(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), + fir::getBase(args[2]), eleSize); +} + +// TMA_BULK_WAIT_GROUP +void CUDAIntrinsicLibrary::genTMABulkWaitGroup( + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 0); + auto group = builder.getIntegerAttr(builder.getI32Type(), 0); + mlir::NVVM::CpAsyncBulkWaitGroupOp::create(builder, loc, group, {}); +} + +// ALL_SYNC, ANY_SYNC, BALLOT_SYNC +template <mlir::NVVM::VoteSyncKind kind> +mlir::Value +CUDAIntrinsicLibrary::genVoteSync(mlir::Type resultType, + llvm::ArrayRef<mlir::Value> args) { + assert(args.size() == 2); + mlir::Value arg1 = + fir::ConvertOp::create(builder, loc, builder.getI1Type(), args[1]); + mlir::Type resTy = kind == mlir::NVVM::VoteSyncKind::ballot + ? builder.getI32Type() + : builder.getI1Type(); + auto voteRes = + mlir::NVVM::VoteSyncOp::create(builder, loc, resTy, args[0], arg1, kind) + .getResult(); + return fir::ConvertOp::create(builder, loc, resultType, voteRes); +} + +} // namespace fir diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp index 3156c8cb4332c..3eb60448fae38 100644 --- a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp +++ b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp @@ -16,6 +16,7 @@ #include "flang/Optimizer/Builder/IntrinsicCall.h" #include "flang/Common/static-multimap-view.h" #include "flang/Optimizer/Builder/BoxValue.h" +#include "flang/Optimizer/Builder/CUDAIntrinsicCall.h" #include "flang/Optimizer/Builder/CUFCommon.h" #include "flang/Optimizer/Builder/Character.h" #include "flang/Optimizer/Builder/Complex.h" @@ -50,7 +51,6 @@ #include "mlir/Dialect/LLVMIR/LLVMDialect.h" #include "mlir/Dialect/LLVMIR/LLVMTypes.h" #include "mlir/Dialect/Math/IR/Math.h" -#include "mlir/Dialect/SCF/IR/SCF.h" #include "mlir/Dialect/Vector/IR/VectorOps.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" @@ -108,34 +108,6 @@ using I = IntrinsicLibrary; /// argument is an optional variable in the current scope). static constexpr bool handleDynamicOptional = true; -/// TODO: Move all CUDA Fortran intrinsic handlers into its own file similar to -/// PPC. 
-static const char __ldca_i4x4[] = "__ldca_i4x4_"; -static const char __ldca_i8x2[] = "__ldca_i8x2_"; -static const char __ldca_r2x2[] = "__ldca_r2x2_"; -static const char __ldca_r4x4[] = "__ldca_r4x4_"; -static const char __ldca_r8x2[] = "__ldca_r8x2_"; -static const char __ldcg_i4x4[] = "__ldcg_i4x4_"; -static const char __ldcg_i8x2[] = "__ldcg_i8x2_"; -static const char __ldcg_r2x2[] = "__ldcg_r2x2_"; -static const char __ldcg_r4x4[] = "__ldcg_r4x4_"; -static const char __ldcg_r8x2[] = "__ldcg_r8x2_"; -static const char __ldcs_i4x4[] = "__ldcs_i4x4_"; -static const char __ldcs_i8x2[] = "__ldcs_i8x2_"; -static const char __ldcs_r2x2[] = "__ldcs_r2x2_"; -static const char __ldcs_r4x4[] = "__ldcs_r4x4_"; -static const char __ldcs_r8x2[] = "__ldcs_r8x2_"; -static const char __ldcv_i4x4[] = "__ldcv_i4x4_"; -static const char __ldcv_i8x2[] = "__ldcv_i8x2_"; -static const char __ldcv_r2x2[] = "__ldcv_r2x2_"; -static const char __ldcv_r4x4[] = "__ldcv_r4x4_"; -static const char __ldcv_r8x2[] = "__ldcv_r8x2_"; -static const char __ldlu_i4x4[] = "__ldlu_i4x4_"; -static const char __ldlu_i8x2[] = "__ldlu_i8x2_"; -static const char __ldlu_r2x2[] = "__ldlu_r2x2_"; -static const char __ldlu_r4x4[] = "__ldlu_r4x4_"; -static const char __ldlu_r8x2[] = "__ldlu_r8x2_"; - /// Table that drives the fir generation depending on the intrinsic or intrinsic /// module procedure one to one mapping with Fortran arguments. If no mapping is /// defined here for a generic intrinsic, genRuntimeCall will be called @@ -144,106 +116,6 @@ static const char __ldlu_r8x2[] = "__ldlu_r8x2_"; /// argument must not be lowered by value. In which case, the lowering rules /// should be provided for all the intrinsic arguments for completeness. static constexpr IntrinsicHandler handlers[]{ - {"__ldca_i4x4", - &I::genCUDALDXXFunc<__ldca_i4x4, 4>, - {{{"a", asAddr}}}, - /*isElemental=*/false}, - {"__ldca_i8x2", - &I::genCUDALDXXFunc<__ldca_i8x2, 2>, - {{{"a", asAddr}}}, - /*isElemental=*/false}, - {"__ldca_r2x2", - &I::genCUDALDXXFunc<__ldca_r2x2, 2>, - {{{"a", asAddr}}}, - /*isElemental=*/false}, - {"__ldca_r4x4", - &I::genCUDALDXXFunc<__ldca_r4x4, 4>, - {{{"a", asAddr}}}, - /*isElemental=*/false}, - {"__ldca_r8x2", - &I::genCUDALDXXFunc<__ldca_r8x2, 2>, - {{{"a", asAddr}}}, - /*isElemental=*/false}, - {"__ldcg_i4x4", - &I::genCUDALDXXFunc<__ldcg_i4x4, 4>, - {{{"a", asAddr}}}, - /*isElemental=*/false}, - {"__ldcg_i8x2", - &I::genCUDALDXXFunc<__ldcg_i8x2, 2>, - {{{"a", asAddr}}}, - /*isElemental=*/false}, - {"__ldcg_r2x2", - &I::genCUDALDXXFunc<__ldcg_r2x2, 2>, - {{{"a", asAddr}}}, - /*isElemental=*/false}, - {"__ldcg_r4x4", - &I::genCUDALDXXFunc<__ldcg_r4x4, 4>, - {{{"a", asAddr}}}, - /*isElemental=*/false}, - {"__ldcg_r8x2", - &I::genCUDALDXXFunc<__ldcg_r8x2, 2>, - {{{"a", asAddr}}}, - /*isElemental=*/false}, - {"__ldcs_i4x4", - &I::genCUDALDXXFunc<__ldcs_i4x4, 4>, - {{{"a", asAddr}}}, - /*isElemental=*/false}, - {"__ldcs_i8x2", - &I::genCUDALDXXFunc<__ldcs_i8x2, 2>, - {{{"a", asAddr}}}, - /*isElemental=*/false}, - {"__ldcs_r2x2", - &I::genCUDALDXXFunc<__ldcs_r2x2, 2>, - {{{"a", asAddr}}}, - /*isElemental=*/false}, - {"__ldcs_r4x4", - &I::genCUDALDXXFunc<__ldcs_r4x4, 4>, - {{{"a", asAddr}}}, - /*isElemental=*/false}, - {"__ldcs_r8x2", - &I::genCUDALDXXFunc<__ldcs_r8x2, 2>, - {{{"a", asAddr}}}, - /*isElemental=*/false}, - {"__ldcv_i4x4", - &I::genCUDALDXXFunc<__ldcv_i4x4, 4>, - {{{"a", asAddr}}}, - /*isElemental=*/false}, - {"__ldcv_i8x2", - &I::genCUDALDXXFunc<__ldcv_i8x2, 2>, - {{{"a", asAddr}}}, - /*isElemental=*/false}, - 
{"__ldcv_r2x2", - &I::genCUDALDXXFunc<__ldcv_r2x2, 2>, - {{{"a", asAddr}}}, - /*isElemental=*/false}, - {"__ldcv_r4x4", - &I::genCUDALDXXFunc<__ldcv_r4x4, 4>, - {{{"a", asAddr}}}, - /*isElemental=*/false}, - {"__ldcv_r8x2", - &I::genCUDALDXXFunc<__ldcv_r8x2, 2>, - {{{"a", asAddr}}}, - /*isElemental=*/false}, - {"__ldlu_i4x4", - &I::genCUDALDXXFunc<__ldlu_i4x4, 4>, - {{{"a", asAddr}}}, - /*isElemental=*/false}, - {"__ldlu_i8x2", - &I::genCUDALDXXFunc<__ldlu_i8x2, 2>, - {{{"a", asAddr}}}, - /*isElemental=*/false}, - {"__ldlu_r2x2", - &I::genCUDALDXXFunc<__ldlu_r2x2, 2>, - {{{"a", asAddr}}}, - /*isElemental=*/false}, - {"__ldlu_r4x4", - &I::genCUDALDXXFunc<__ldlu_r4x4, 4>, - {{{"a", asAddr}}}, - /*isElemental=*/false}, - {"__ldlu_r8x2", - &I::genCUDALDXXFunc<__ldlu_r8x2, 2>, - {{{"a", asAddr}}}, - /*isElemental=*/false}, {"abort", &I::genAbort}, {"abs", &I::genAbs}, {"achar", &I::genChar}, @@ -263,10 +135,6 @@ static constexpr IntrinsicHandler handlers[]{ &I::genAll, {{{"mask", asAddr}, {"dim", asValue}}}, /*isElemental=*/false}, - {"all_sync", - &I::genVoteSync<mlir::NVVM::VoteSyncKind::all>, - {{{"mask", asValue}, {"pred", asValue}}}, - /*isElemental=*/false}, {"allocated", &I::genAllocated, {{{"array", asInquired}, {"scalar", asInquired}}}, @@ -276,10 +144,6 @@ static constexpr IntrinsicHandler handlers[]{ &I::genAny, {{{"mask", asAddr}, {"dim", asValue}}}, /*isElemental=*/false}, - {"any_sync", - &I::genVoteSync<mlir::NVVM::VoteSyncKind::any>, - {{{"mask", asValue}, {"pred", asValue}}}, - /*isElemental=*/false}, {"asind", &I::genAsind}, {"asinpi", &I::genAsinpi}, {"associated", @@ -290,103 +154,6 @@ static constexpr IntrinsicHandler handlers[]{ {"atan2pi", &I::genAtanpi}, {"atand", &I::genAtand}, {"atanpi", &I::genAtanpi}, - {"atomicadd_r4x2", - &I::genAtomicAddVector<2>, - {{{"a", asAddr}, {"v", asAddr}}}, - false}, - {"atomicadd_r4x4", - &I::genAtomicAddVector<4>, - {{{"a", asAddr}, {"v", asAddr}}}, - false}, - {"atomicaddd", &I::genAtomicAdd, {{{"a", asAddr}, {"v", asValue}}}, false}, - {"atomicaddf", &I::genAtomicAdd, {{{"a", asAddr}, {"v", asValue}}}, false}, - {"atomicaddi", &I::genAtomicAdd, {{{"a", asAddr}, {"v", asValue}}}, false}, - {"atomicaddl", &I::genAtomicAdd, {{{"a", asAddr}, {"v", asValue}}}, false}, - {"atomicaddr2", - &I::genAtomicAddR2, - {{{"a", asAddr}, {"v", asAddr}}}, - false}, - {"atomicaddvector_r2x2", - &I::genAtomicAddVector<2>, - {{{"a", asAddr}, {"v", asAddr}}}, - false}, - {"atomicaddvector_r4x2", - &I::genAtomicAddVector<2>, - {{{"a", asAddr}, {"v", asAddr}}}, - false}, - {"atomicandi", &I::genAtomicAnd, {{{"a", asAddr}, {"v", asValue}}}, false}, - {"atomiccasd", - &I::genAtomicCas, - {{{"a", asAddr}, {"v1", asValue}, {"v2", asValue}}}, - false}, - {"atomiccasf", - &I::genAtomicCas, - {{{"a", asAddr}, {"v1", asValue}, {"v2", asValue}}}, - false}, - {"atomiccasi", - &I::genAtomicCas, - {{{"a", asAddr}, {"v1", asValue}, {"v2", asValue}}}, - false}, - {"atomiccasul", - &I::genAtomicCas, - {{{"a", asAddr}, {"v1", asValue}, {"v2", asValue}}}, - false}, - {"atomicdeci", &I::genAtomicDec, {{{"a", asAddr}, {"v", asValue}}}, false}, - {"atomicexchd", - &I::genAtomicExch, - {{{"a", asAddr}, {"v", asValue}}}, - false}, - {"atomicexchf", - &I::genAtomicExch, - {{{"a", asAddr}, {"v", asValue}}}, - false}, - {"atomicexchi", - &I::genAtomicExch, - {{{"a", asAddr}, {"v", asValue}}}, - false}, - {"atomicexchul", - &I::genAtomicExch, - {{{"a", asAddr}, {"v", asValue}}}, - false}, - {"atomicinci", &I::genAtomicInc, {{{"a", asAddr}, {"v", asValue}}}, false}, - {"atomicmaxd", 
&I::genAtomicMax, {{{"a", asAddr}, {"v", asValue}}}, false}, - {"atomicmaxf", &I::genAtomicMax, {{{"a", asAddr}, {"v", asValue}}}, false}, - {"atomicmaxi", &I::genAtomicMax, {{{"a", asAddr}, {"v", asValue}}}, false}, - {"atomicmaxl", &I::genAtomicMax, {{{"a", asAddr}, {"v", asValue}}}, false}, - {"atomicmind", &I::genAtomicMin, {{{"a", asAddr}, {"v", asValue}}}, false}, - {"atomicminf", &I::genAtomicMin, {{{"a", asAddr}, {"v", asValue}}}, false}, - {"atomicmini", &I::genAtomicMin, {{{"a", asAddr}, {"v", asValue}}}, false}, - {"atomicminl", &I::genAtomicMin, {{{"a", asAddr}, {"v", asValue}}}, false}, - {"atomicori", &I::genAtomicOr, {{{"a", asAddr}, {"v", asValue}}}, false}, - {"atomicsubd", &I::genAtomicSub, {{{"a", asAddr}, {"v", asValue}}}, false}, - {"atomicsubf", &I::genAtomicSub, {{{"a", asAddr}, {"v", asValue}}}, false}, - {"atomicsubi", &I::genAtomicSub, {{{"a", asAddr}, {"v", asValue}}}, false}, - {"atomicsubl", &I::genAtomicSub, {{{"a", asAddr}, {"v", asValue}}}, false}, - {"atomicxori", &I::genAtomicXor, {{{"a", asAddr}, {"v", asValue}}}, false}, - {"ballot_sync", - &I::genVoteSync<mlir::NVVM::VoteSyncKind::ballot>, - {{{"mask", asValue}, {"pred", asValue}}}, - /*isElemental=*/false}, - {"barrier_arrive", - &I::genBarrierArrive, - {{{"barrier", asAddr}}}, - /*isElemental=*/false}, - {"barrier_arrive_cnt", - &I::genBarrierArriveCnt, - {{{"barrier", asAddr}, {"count", asValue}}}, - /*isElemental=*/false}, - {"barrier_init", - &I::genBarrierInit, - {{{"barrier", asAddr}, {"count", asValue}}}, - /*isElemental=*/false}, - {"barrier_try_wait", - &I::genBarrierTryWait, - {{{"barrier", asAddr}, {"token", asValue}}}, - /*isElemental=*/false}, - {"barrier_try_wait_sleep", - &I::genBarrierTryWaitSleep, - {{{"barrier", asAddr}, {"token", asValue}, {"ns", asValue}}}, - /*isElemental=*/false}, {"bessel_jn", &I::genBesselJn, {{{"n1", asValue}, {"n2", asValue}, {"x", asValue}}}, @@ -430,11 +197,6 @@ static constexpr IntrinsicHandler handlers[]{ &I::genChdir, {{{"name", asAddr}, {"status", asAddr, handleDynamicOptional}}}, /*isElemental=*/false}, - {"clock", &I::genNVVMTime<mlir::NVVM::ClockOp>, {}, /*isElemental=*/false}, - {"clock64", - &I::genNVVMTime<mlir::NVVM::Clock64Op>, - {}, - /*isElemental=*/false}, {"cmplx", &I::genCmplx, {{{"x", asValue}, {"y", asValue, handleDynamicOptional}}}}, @@ -531,10 +293,6 @@ static constexpr IntrinsicHandler handlers[]{ &I::genExtendsTypeOf, {{{"a", asBox}, {"mold", asBox}}}, /*isElemental=*/false}, - {"fence_proxy_async", - &I::genFenceProxyAsync, - {}, - /*isElemental=*/false}, {"findloc", &I::genFindloc, {{{"array", asBox}, @@ -589,10 +347,6 @@ static constexpr IntrinsicHandler handlers[]{ {"getgid", &I::genGetGID}, {"getpid", &I::genGetPID}, {"getuid", &I::genGetUID}, - {"globaltimer", - &I::genNVVMTime<mlir::NVVM::GlobalTimerOp>, - {}, - /*isElemental=*/false}, {"hostnm", &I::genHostnm, {{{"c", asBox}, {"status", asAddr, handleDynamicOptional}}}, @@ -760,38 +514,6 @@ static constexpr IntrinsicHandler handlers[]{ {"malloc", &I::genMalloc}, {"maskl", &I::genMask<mlir::arith::ShLIOp>}, {"maskr", &I::genMask<mlir::arith::ShRUIOp>}, - {"match_all_syncjd", - &I::genMatchAllSync, - {{{"mask", asValue}, {"value", asValue}, {"pred", asAddr}}}, - /*isElemental=*/false}, - {"match_all_syncjf", - &I::genMatchAllSync, - {{{"mask", asValue}, {"value", asValue}, {"pred", asAddr}}}, - /*isElemental=*/false}, - {"match_all_syncjj", - &I::genMatchAllSync, - {{{"mask", asValue}, {"value", asValue}, {"pred", asAddr}}}, - /*isElemental=*/false}, - {"match_all_syncjx", - 
&I::genMatchAllSync, - {{{"mask", asValue}, {"value", asValue}, {"pred", asAddr}}}, - /*isElemental=*/false}, - {"match_any_syncjd", - &I::genMatchAnySync, - {{{"mask", asValue}, {"value", asValue}}}, - /*isElemental=*/false}, - {"match_any_syncjf", - &I::genMatchAnySync, - {{{"mask", asValue}, {"value", asValue}}}, - /*isElemental=*/false}, - {"match_any_syncjj", - &I::genMatchAnySync, - {{{"mask", asValue}, {"value", asValue}}}, - /*isElemental=*/false}, - {"match_any_syncjx", - &I::genMatchAnySync, - {{{"mask", asValue}, {"value", asValue}}}, - /*isElemental=*/false}, {"matmul", &I::genMatmul, {{{"matrix_a", asAddr}, {"matrix_b", asAddr}}}, @@ -1017,20 +739,6 @@ static constexpr IntrinsicHandler handlers[]{ {"dim", asValue}, {"mask", asBox, handleDynamicOptional}}}, /*isElemental=*/false}, - {"syncthreads", &I::genSyncThreads, {}, /*isElemental=*/false}, - {"syncthreads_and_i4", &I::genSyncThreadsAnd, {}, /*isElemental=*/false}, - {"syncthreads_and_l4", &I::genSyncThreadsAnd, {}, /*isElemental=*/false}, - {"syncthreads_count_i4", - &I::genSyncThreadsCount, - {}, - /*isElemental=*/false}, - {"syncthreads_count_l4", - &I::genSyncThreadsCount, - {}, - /*isElemental=*/false}, - {"syncthreads_or_i4", &I::genSyncThreadsOr, {}, /*isElemental=*/false}, - {"syncthreads_or_l4", &I::genSyncThreadsOr, {}, /*isElemental=*/false}, - {"syncwarp", &I::genSyncWarp, {}, /*isElemental=*/false}, {"system", &I::genSystem, {{{"command", asBox}, {"exitstat", asBox, handleDynamicOptional}}}, @@ -1041,115 +749,13 @@ static constexpr IntrinsicHandler handlers[]{ /*isElemental=*/false}, {"tand", &I::genTand}, {"tanpi", &I::genTanpi}, - {"this_grid", &I::genThisGrid, {}, /*isElemental=*/false}, {"this_image", &I::genThisImage, {{{"coarray", asBox}, {"dim", asAddr}, {"team", asBox, handleDynamicOptional}}}, /*isElemental=*/false}, - {"this_thread_block", &I::genThisThreadBlock, {}, /*isElemental=*/false}, - {"this_warp", &I::genThisWarp, {}, /*isElemental=*/false}, - {"threadfence", &I::genThreadFence, {}, /*isElemental=*/false}, - {"threadfence_block", &I::genThreadFenceBlock, {}, /*isElemental=*/false}, - {"threadfence_system", &I::genThreadFenceSystem, {}, /*isElemental=*/false}, {"time", &I::genTime, {}, /*isElemental=*/false}, - {"tma_bulk_commit_group", - &I::genTMABulkCommitGroup, - {{}}, - /*isElemental=*/false}, - {"tma_bulk_g2s", - &I::genTMABulkG2S, - {{{"barrier", asAddr}, - {"src", asAddr}, - {"dst", asAddr}, - {"nbytes", asValue}}}, - /*isElemental=*/false}, - {"tma_bulk_ldc4", - &I::genTMABulkLoadC4, - {{{"barrier", asAddr}, - {"src", asAddr}, - {"dst", asAddr}, - {"nelems", asValue}}}, - /*isElemental=*/false}, - {"tma_bulk_ldc8", - &I::genTMABulkLoadC8, - {{{"barrier", asAddr}, - {"src", asAddr}, - {"dst", asAddr}, - {"nelems", asValue}}}, - /*isElemental=*/false}, - {"tma_bulk_ldi4", - &I::genTMABulkLoadI4, - {{{"barrier", asAddr}, - {"src", asAddr}, - {"dst", asAddr}, - {"nelems", asValue}}}, - /*isElemental=*/false}, - {"tma_bulk_ldi8", - &I::genTMABulkLoadI8, - {{{"barrier", asAddr}, - {"src", asAddr}, - {"dst", asAddr}, - {"nelems", asValue}}}, - /*isElemental=*/false}, - {"tma_bulk_ldr2", - &I::genTMABulkLoadR2, - {{{"barrier", asAddr}, - {"src", asAddr}, - {"dst", asAddr}, - {"nelems", asValue}}}, - /*isElemental=*/false}, - {"tma_bulk_ldr4", - &I::genTMABulkLoadR4, - {{{"barrier", asAddr}, - {"src", asAddr}, - {"dst", asAddr}, - {"nelems", asValue}}}, - /*isElemental=*/false}, - {"tma_bulk_ldr8", - &I::genTMABulkLoadR8, - {{{"barrier", asAddr}, - {"src", asAddr}, - {"dst", asAddr}, - 
{"nelems", asValue}}}, - /*isElemental=*/false}, - {"tma_bulk_s2g", - &I::genTMABulkS2G, - {{{"src", asAddr}, {"dst", asAddr}, {"nbytes", asValue}}}, - /*isElemental=*/false}, - {"tma_bulk_store_c4", - &I::genTMABulkStoreC4, - {{{"src", asAddr}, {"dst", asAddr}, {"count", asValue}}}, - /*isElemental=*/false}, - {"tma_bulk_store_c8", - &I::genTMABulkStoreC8, - {{{"src", asAddr}, {"dst", asAddr}, {"count", asValue}}}, - /*isElemental=*/false}, - {"tma_bulk_store_i4", - &I::genTMABulkStoreI4, - {{{"src", asAddr}, {"dst", asAddr}, {"count", asValue}}}, - /*isElemental=*/false}, - {"tma_bulk_store_i8", - &I::genTMABulkStoreI8, - {{{"src", asAddr}, {"dst", asAddr}, {"count", asValue}}}, - /*isElemental=*/false}, - {"tma_bulk_store_r2", - &I::genTMABulkStoreR2, - {{{"src", asAddr}, {"dst", asAddr}, {"count", asValue}}}, - /*isElemental=*/false}, - {"tma_bulk_store_r4", - &I::genTMABulkStoreR4, - {{{"src", asAddr}, {"dst", asAddr}, {"count", asValue}}}, - /*isElemental=*/false}, - {"tma_bulk_store_r8", - &I::genTMABulkStoreR8, - {{{"src", asAddr}, {"dst", asAddr}, {"count", asValue}}}, - /*isElemental=*/false}, - {"tma_bulk_wait_group", - &I::genTMABulkWaitGroup, - {{}}, - /*isElemental=*/false}, {"trailz", &I::genTrailz}, {"transfer", &I::genTransfer, @@ -2241,6 +1847,9 @@ lookupIntrinsicHandler(fir::FirOpBuilder &builder, if (isPPCTarget) if (const IntrinsicHandler *ppcHandler = findPPCIntrinsicHandler(name)) return std::make_optional<IntrinsicHandlerEntry>(ppcHandler); + // TODO: Look for CUDA intrinsic handlers only if CUDA is enabled. + if (const IntrinsicHandler *cudaHandler = findCUDAIntrinsicHandler(name)) + return std::make_optional<IntrinsicHandlerEntry>(cudaHandler); // Subroutines should have a handler. if (!resultType) return std::nullopt; @@ -3127,244 +2736,6 @@ mlir::Value IntrinsicLibrary::genAtanpi(mlir::Type resultType, return mlir::arith::MulFOp::create(builder, loc, atan, factor); } -static mlir::Value genAtomBinOp(fir::FirOpBuilder &builder, mlir::Location &loc, - mlir::LLVM::AtomicBinOp binOp, mlir::Value arg0, - mlir::Value arg1) { - auto llvmPointerType = mlir::LLVM::LLVMPointerType::get(builder.getContext()); - arg0 = builder.createConvert(loc, llvmPointerType, arg0); - return mlir::LLVM::AtomicRMWOp::create(builder, loc, binOp, arg0, arg1, - mlir::LLVM::AtomicOrdering::seq_cst); -} - -mlir::Value IntrinsicLibrary::genAtomicAdd(mlir::Type resultType, - llvm::ArrayRef<mlir::Value> args) { - assert(args.size() == 2); - mlir::LLVM::AtomicBinOp binOp = - mlir::isa<mlir::IntegerType>(args[1].getType()) - ? 
mlir::LLVM::AtomicBinOp::add - : mlir::LLVM::AtomicBinOp::fadd; - return genAtomBinOp(builder, loc, binOp, args[0], args[1]); -} - -fir::ExtendedValue -IntrinsicLibrary::genAtomicAddR2(mlir::Type resultType, - llvm::ArrayRef<fir::ExtendedValue> args) { - assert(args.size() == 2); - - mlir::Value a = fir::getBase(args[0]); - - if (mlir::isa<fir::BaseBoxType>(a.getType())) { - a = fir::BoxAddrOp::create(builder, loc, a); - } - - auto loc = builder.getUnknownLoc(); - auto f16Ty = builder.getF16Type(); - auto i32Ty = builder.getI32Type(); - auto vecF16Ty = mlir::VectorType::get({2}, f16Ty); - mlir::Type idxTy = builder.getIndexType(); - auto f16RefTy = fir::ReferenceType::get(f16Ty); - auto zero = builder.createIntegerConstant(loc, idxTy, 0); - auto one = builder.createIntegerConstant(loc, idxTy, 1); - auto v1Coord = fir::CoordinateOp::create(builder, loc, f16RefTy, - fir::getBase(args[1]), zero); - auto v2Coord = fir::CoordinateOp::create(builder, loc, f16RefTy, - fir::getBase(args[1]), one); - auto v1 = fir::LoadOp::create(builder, loc, v1Coord); - auto v2 = fir::LoadOp::create(builder, loc, v2Coord); - mlir::Value undef = mlir::LLVM::UndefOp::create(builder, loc, vecF16Ty); - mlir::Value vec1 = mlir::LLVM::InsertElementOp::create( - builder, loc, undef, v1, builder.createIntegerConstant(loc, i32Ty, 0)); - mlir::Value vec2 = mlir::LLVM::InsertElementOp::create( - builder, loc, vec1, v2, builder.createIntegerConstant(loc, i32Ty, 1)); - auto res = genAtomBinOp(builder, loc, mlir::LLVM::AtomicBinOp::fadd, a, vec2); - auto i32VecTy = mlir::VectorType::get({1}, i32Ty); - mlir::Value vecI32 = - mlir::vector::BitCastOp::create(builder, loc, i32VecTy, res); - return mlir::vector::ExtractOp::create(builder, loc, vecI32, - mlir::ArrayRef<int64_t>{0}); -} - -template <int extent> -fir::ExtendedValue -IntrinsicLibrary::genAtomicAddVector(mlir::Type resultType, - llvm::ArrayRef<fir::ExtendedValue> args) { - assert(args.size() == 2); - mlir::Value res = fir::AllocaOp::create( - builder, loc, fir::SequenceType::get({extent}, resultType)); - mlir::Value a = fir::getBase(args[0]); - if (mlir::isa<fir::BaseBoxType>(a.getType())) { - a = fir::BoxAddrOp::create(builder, loc, a); - } - auto vecTy = mlir::VectorType::get({extent}, resultType); - auto refTy = fir::ReferenceType::get(resultType); - mlir::Type i32Ty = builder.getI32Type(); - mlir::Type idxTy = builder.getIndexType(); - - // Extract the values from the array. - llvm::SmallVector<mlir::Value> values; - for (unsigned i = 0; i < extent; ++i) { - mlir::Value pos = builder.createIntegerConstant(loc, idxTy, i); - mlir::Value coord = fir::CoordinateOp::create(builder, loc, refTy, - fir::getBase(args[1]), pos); - mlir::Value value = fir::LoadOp::create(builder, loc, coord); - values.push_back(value); - } - // Pack extracted values into a vector to call the atomic add. - mlir::Value undef = mlir::LLVM::UndefOp::create(builder, loc, vecTy); - for (unsigned i = 0; i < extent; ++i) { - mlir::Value insert = mlir::LLVM::InsertElementOp::create( - builder, loc, undef, values[i], - builder.createIntegerConstant(loc, i32Ty, i)); - undef = insert; - } - // Atomic operation with a vector of values. - mlir::Value add = - genAtomBinOp(builder, loc, mlir::LLVM::AtomicBinOp::fadd, a, undef); - // Store results in the result array. 
- for (unsigned i = 0; i < extent; ++i) { - mlir::Value r = mlir::LLVM::ExtractElementOp::create( - builder, loc, add, builder.createIntegerConstant(loc, i32Ty, i)); - mlir::Value c = fir::CoordinateOp::create( - builder, loc, refTy, res, builder.createIntegerConstant(loc, idxTy, i)); - fir::StoreOp::create(builder, loc, r, c); - } - mlir::Value ext = builder.createIntegerConstant(loc, idxTy, extent); - return fir::ArrayBoxValue(res, {ext}); -} - -mlir::Value IntrinsicLibrary::genAtomicSub(mlir::Type resultType, - llvm::ArrayRef<mlir::Value> args) { - assert(args.size() == 2); - - mlir::LLVM::AtomicBinOp binOp = - mlir::isa<mlir::IntegerType>(args[1].getType()) - ? mlir::LLVM::AtomicBinOp::sub - : mlir::LLVM::AtomicBinOp::fsub; - return genAtomBinOp(builder, loc, binOp, args[0], args[1]); -} - -mlir::Value IntrinsicLibrary::genAtomicAnd(mlir::Type resultType, - llvm::ArrayRef<mlir::Value> args) { - assert(args.size() == 2); - assert(mlir::isa<mlir::IntegerType>(args[1].getType())); - - mlir::LLVM::AtomicBinOp binOp = mlir::LLVM::AtomicBinOp::_and; - return genAtomBinOp(builder, loc, binOp, args[0], args[1]); -} - -mlir::Value IntrinsicLibrary::genAtomicOr(mlir::Type resultType, - llvm::ArrayRef<mlir::Value> args) { - assert(args.size() == 2); - assert(mlir::isa<mlir::IntegerType>(args[1].getType())); - - mlir::LLVM::AtomicBinOp binOp = mlir::LLVM::AtomicBinOp::_or; - return genAtomBinOp(builder, loc, binOp, args[0], args[1]); -} - -// ATOMICCAS -fir::ExtendedValue -IntrinsicLibrary::genAtomicCas(mlir::Type resultType, - llvm::ArrayRef<fir::ExtendedValue> args) { - assert(args.size() == 3); - auto successOrdering = mlir::LLVM::AtomicOrdering::acq_rel; - auto failureOrdering = mlir::LLVM::AtomicOrdering::monotonic; - auto llvmPtrTy = mlir::LLVM::LLVMPointerType::get(resultType.getContext()); - - mlir::Value arg0 = fir::getBase(args[0]); - mlir::Value arg1 = fir::getBase(args[1]); - mlir::Value arg2 = fir::getBase(args[2]); - - auto bitCastFloat = [&](mlir::Value arg) -> mlir::Value { - if (mlir::isa<mlir::Float32Type>(arg.getType())) - return mlir::LLVM::BitcastOp::create(builder, loc, builder.getI32Type(), - arg); - if (mlir::isa<mlir::Float64Type>(arg.getType())) - return mlir::LLVM::BitcastOp::create(builder, loc, builder.getI64Type(), - arg); - return arg; - }; - - arg1 = bitCastFloat(arg1); - arg2 = bitCastFloat(arg2); - - if (arg1.getType() != arg2.getType()) { - // arg1 and arg2 need to have the same type in AtomicCmpXchgOp. 
- arg2 = builder.createConvert(loc, arg1.getType(), arg2); - } - - auto address = - mlir::UnrealizedConversionCastOp::create(builder, loc, llvmPtrTy, arg0) - .getResult(0); - auto cmpxchg = mlir::LLVM::AtomicCmpXchgOp::create( - builder, loc, address, arg1, arg2, successOrdering, failureOrdering); - mlir::Value boolResult = - mlir::LLVM::ExtractValueOp::create(builder, loc, cmpxchg, 1); - return builder.createConvert(loc, resultType, boolResult); -} - -mlir::Value IntrinsicLibrary::genAtomicDec(mlir::Type resultType, - llvm::ArrayRef<mlir::Value> args) { - assert(args.size() == 2); - assert(mlir::isa<mlir::IntegerType>(args[1].getType())); - - mlir::LLVM::AtomicBinOp binOp = mlir::LLVM::AtomicBinOp::udec_wrap; - return genAtomBinOp(builder, loc, binOp, args[0], args[1]); -} - -// ATOMICEXCH -fir::ExtendedValue -IntrinsicLibrary::genAtomicExch(mlir::Type resultType, - llvm::ArrayRef<fir::ExtendedValue> args) { - assert(args.size() == 2); - mlir::Value arg0 = fir::getBase(args[0]); - mlir::Value arg1 = fir::getBase(args[1]); - assert(arg1.getType().isIntOrFloat()); - - mlir::LLVM::AtomicBinOp binOp = mlir::LLVM::AtomicBinOp::xchg; - return genAtomBinOp(builder, loc, binOp, arg0, arg1); -} - -mlir::Value IntrinsicLibrary::genAtomicInc(mlir::Type resultType, - llvm::ArrayRef<mlir::Value> args) { - assert(args.size() == 2); - assert(mlir::isa<mlir::IntegerType>(args[1].getType())); - - mlir::LLVM::AtomicBinOp binOp = mlir::LLVM::AtomicBinOp::uinc_wrap; - return genAtomBinOp(builder, loc, binOp, args[0], args[1]); -} - -mlir::Value IntrinsicLibrary::genAtomicMax(mlir::Type resultType, - llvm::ArrayRef<mlir::Value> args) { - assert(args.size() == 2); - - mlir::LLVM::AtomicBinOp binOp = - mlir::isa<mlir::IntegerType>(args[1].getType()) - ? mlir::LLVM::AtomicBinOp::max - : mlir::LLVM::AtomicBinOp::fmax; - return genAtomBinOp(builder, loc, binOp, args[0], args[1]); -} - -mlir::Value IntrinsicLibrary::genAtomicMin(mlir::Type resultType, - llvm::ArrayRef<mlir::Value> args) { - assert(args.size() == 2); - - mlir::LLVM::AtomicBinOp binOp = - mlir::isa<mlir::IntegerType>(args[1].getType()) - ? 
mlir::LLVM::AtomicBinOp::min - : mlir::LLVM::AtomicBinOp::fmin; - return genAtomBinOp(builder, loc, binOp, args[0], args[1]); -} - -// ATOMICXOR -fir::ExtendedValue -IntrinsicLibrary::genAtomicXor(mlir::Type resultType, - llvm::ArrayRef<fir::ExtendedValue> args) { - assert(args.size() == 2); - mlir::Value arg0 = fir::getBase(args[0]); - mlir::Value arg1 = fir::getBase(args[1]); - return genAtomBinOp(builder, loc, mlir::LLVM::AtomicBinOp::_xor, arg0, arg1); -} - // ASSOCIATED fir::ExtendedValue IntrinsicLibrary::genAssociated(mlir::Type resultType, @@ -3416,118 +2787,6 @@ IntrinsicLibrary::genAssociated(mlir::Type resultType, return fir::runtime::genAssociated(builder, loc, pointerBox, targetBox); } -static mlir::Value convertPtrToNVVMSpace(fir::FirOpBuilder &builder, - mlir::Location loc, - mlir::Value barrier, - mlir::NVVM::NVVMMemorySpace space) { - mlir::Value llvmPtr = fir::ConvertOp::create( - builder, loc, mlir::LLVM::LLVMPointerType::get(builder.getContext()), - barrier); - mlir::Value addrCast = mlir::LLVM::AddrSpaceCastOp::create( - builder, loc, - mlir::LLVM::LLVMPointerType::get(builder.getContext(), - static_cast<unsigned>(space)), - llvmPtr); - return addrCast; -} - -// BARRIER_ARRIVE (CUDA) -mlir::Value -IntrinsicLibrary::genBarrierArrive(mlir::Type resultType, - llvm::ArrayRef<mlir::Value> args) { - assert(args.size() == 1); - mlir::Value barrier = convertPtrToNVVMSpace( - builder, loc, args[0], mlir::NVVM::NVVMMemorySpace::Shared); - return mlir::NVVM::MBarrierArriveSharedOp::create(builder, loc, resultType, - barrier) - .getResult(); -} - -// BARRIER_ARRIBVE_CNT (CUDA) -mlir::Value -IntrinsicLibrary::genBarrierArriveCnt(mlir::Type resultType, - llvm::ArrayRef<mlir::Value> args) { - assert(args.size() == 2); - mlir::Value barrier = convertPtrToNVVMSpace( - builder, loc, args[0], mlir::NVVM::NVVMMemorySpace::Shared); - return mlir::NVVM::InlinePtxOp::create(builder, loc, {resultType}, - {barrier, args[1]}, {}, - "mbarrier.arrive.expect_tx.release." 
- "cta.shared::cta.b64 %0, [%1], %2;", - {}) - .getResult(0); -} - -// BARRIER_INIT (CUDA) -void IntrinsicLibrary::genBarrierInit(llvm::ArrayRef<fir::ExtendedValue> args) { - assert(args.size() == 2); - mlir::Value barrier = convertPtrToNVVMSpace( - builder, loc, fir::getBase(args[0]), mlir::NVVM::NVVMMemorySpace::Shared); - mlir::NVVM::MBarrierInitOp::create(builder, loc, barrier, - fir::getBase(args[1]), {}); - auto kind = mlir::NVVM::ProxyKindAttr::get( - builder.getContext(), mlir::NVVM::ProxyKind::async_shared); - auto space = mlir::NVVM::SharedSpaceAttr::get( - builder.getContext(), mlir::NVVM::SharedSpace::shared_cta); - mlir::NVVM::FenceProxyOp::create(builder, loc, kind, space); -} - -// BARRIER_TRY_WAIT (CUDA) -mlir::Value -IntrinsicLibrary::genBarrierTryWait(mlir::Type resultType, - llvm::ArrayRef<mlir::Value> args) { - assert(args.size() == 2); - mlir::Value res = fir::AllocaOp::create(builder, loc, resultType); - mlir::Value zero = builder.createIntegerConstant(loc, resultType, 0); - fir::StoreOp::create(builder, loc, zero, res); - mlir::Value ns = - builder.createIntegerConstant(loc, builder.getI32Type(), 1000000); - mlir::Value load = fir::LoadOp::create(builder, loc, res); - auto whileOp = mlir::scf::WhileOp::create( - builder, loc, mlir::TypeRange{resultType}, mlir::ValueRange{load}); - mlir::Block *beforeBlock = builder.createBlock(&whileOp.getBefore()); - mlir::Value beforeArg = beforeBlock->addArgument(resultType, loc); - builder.setInsertionPointToStart(beforeBlock); - mlir::Value condition = mlir::arith::CmpIOp::create( - builder, loc, mlir::arith::CmpIPredicate::ne, beforeArg, zero); - mlir::scf::ConditionOp::create(builder, loc, condition, beforeArg); - mlir::Block *afterBlock = builder.createBlock(&whileOp.getAfter()); - afterBlock->addArgument(resultType, loc); - builder.setInsertionPointToStart(afterBlock); - auto llvmPtrTy = mlir::LLVM::LLVMPointerType::get(builder.getContext()); - auto barrier = builder.createConvert(loc, llvmPtrTy, args[0]); - mlir::Value ret = mlir::NVVM::InlinePtxOp::create( - builder, loc, {resultType}, {barrier, args[1], ns}, {}, - "{\n" - " .reg .pred p;\n" - " mbarrier.try_wait.shared.b64 p, [%1], %2, %3;\n" - " selp.b32 %0, 1, 0, p;\n" - "}", - {}) - .getResult(0); - mlir::scf::YieldOp::create(builder, loc, ret); - builder.setInsertionPointAfter(whileOp); - return whileOp.getResult(0); -} - -// BARRIER_TRY_WAIT_SLEEP (CUDA) -mlir::Value -IntrinsicLibrary::genBarrierTryWaitSleep(mlir::Type resultType, - llvm::ArrayRef<mlir::Value> args) { - assert(args.size() == 3); - auto llvmPtrTy = mlir::LLVM::LLVMPointerType::get(builder.getContext()); - auto barrier = builder.createConvert(loc, llvmPtrTy, args[0]); - return mlir::NVVM::InlinePtxOp::create( - builder, loc, {resultType}, {barrier, args[1], args[2]}, {}, - "{\n" - " .reg .pred p;\n" - " mbarrier.try_wait.shared.b64 p, [%1], %2, %3;\n" - " selp.b32 %0, 1, 0, p;\n" - "}", - {}) - .getResult(0); -} - // BESSEL_JN fir::ExtendedValue IntrinsicLibrary::genBesselJn(mlir::Type resultType, @@ -4261,30 +3520,6 @@ IntrinsicLibrary::genCshift(mlir::Type resultType, return readAndAddCleanUp(resultMutableBox, resultType, "CSHIFT"); } -// __LDCA, __LDCS, __LDLU, __LDCV -template <const char *fctName, int extent> -fir::ExtendedValue -IntrinsicLibrary::genCUDALDXXFunc(mlir::Type resultType, - llvm::ArrayRef<fir::ExtendedValue> args) { - assert(args.size() == 1); - mlir::Type resTy = fir::SequenceType::get(extent, resultType); - mlir::Value arg = fir::getBase(args[0]); - mlir::Value res = 
fir::AllocaOp::create(builder, loc, resTy); - if (mlir::isa<fir::BaseBoxType>(arg.getType())) - arg = fir::BoxAddrOp::create(builder, loc, arg); - mlir::Type refResTy = fir::ReferenceType::get(resTy); - mlir::FunctionType ftype = - mlir::FunctionType::get(arg.getContext(), {refResTy, refResTy}, {}); - auto funcOp = builder.createFunction(loc, fctName, ftype); - llvm::SmallVector<mlir::Value> funcArgs; - funcArgs.push_back(res); - funcArgs.push_back(arg); - fir::CallOp::create(builder, loc, funcOp, funcArgs); - mlir::Value ext = - builder.createIntegerConstant(loc, builder.getIndexType(), extent); - return fir::ArrayBoxValue(res, {ext}); -} - // DATE_AND_TIME void IntrinsicLibrary::genDateAndTime(llvm::ArrayRef<fir::ExtendedValue> args) { assert(args.size() == 4 && "date_and_time has 4 args"); @@ -4617,17 +3852,6 @@ IntrinsicLibrary::genExtendsTypeOf(mlir::Type resultType, fir::getBase(args[1]))); } -// FENCE_PROXY_ASYNC (CUDA) -void IntrinsicLibrary::genFenceProxyAsync( - llvm::ArrayRef<fir::ExtendedValue> args) { - assert(args.size() == 0); - auto kind = mlir::NVVM::ProxyKindAttr::get( - builder.getContext(), mlir::NVVM::ProxyKind::async_shared); - auto space = mlir::NVVM::SharedSpaceAttr::get( - builder.getContext(), mlir::NVVM::SharedSpace::shared_cta); - mlir::NVVM::FenceProxyOp::create(builder, loc, kind, space); -} - // FINDLOC fir::ExtendedValue IntrinsicLibrary::genFindloc(mlir::Type resultType, @@ -7138,67 +6362,6 @@ mlir::Value IntrinsicLibrary::genMask(mlir::Type resultType, return result; } -// MATCH_ALL_SYNC -mlir::Value -IntrinsicLibrary::genMatchAllSync(mlir::Type resultType, - llvm::ArrayRef<mlir::Value> args) { - assert(args.size() == 3); - bool is32 = args[1].getType().isInteger(32) || args[1].getType().isF32(); - - mlir::Type i1Ty = builder.getI1Type(); - mlir::MLIRContext *context = builder.getContext(); - - mlir::Value arg1 = args[1]; - if (arg1.getType().isF32() || arg1.getType().isF64()) - arg1 = fir::ConvertOp::create( - builder, loc, is32 ? builder.getI32Type() : builder.getI64Type(), arg1); - - mlir::Type retTy = - mlir::LLVM::LLVMStructType::getLiteral(context, {resultType, i1Ty}); - auto match = - mlir::NVVM::MatchSyncOp::create(builder, loc, retTy, args[0], arg1, - mlir::NVVM::MatchSyncKind::all) - .getResult(); - auto value = mlir::LLVM::ExtractValueOp::create(builder, loc, match, 0); - auto pred = mlir::LLVM::ExtractValueOp::create(builder, loc, match, 1); - auto conv = mlir::LLVM::ZExtOp::create(builder, loc, resultType, pred); - fir::StoreOp::create(builder, loc, conv, args[2]); - return value; -} - -// ALL_SYNC, ANY_SYNC, BALLOT_SYNC -template <mlir::NVVM::VoteSyncKind kind> -mlir::Value IntrinsicLibrary::genVoteSync(mlir::Type resultType, - llvm::ArrayRef<mlir::Value> args) { - assert(args.size() == 2); - mlir::Value arg1 = - fir::ConvertOp::create(builder, loc, builder.getI1Type(), args[1]); - mlir::Type resTy = kind == mlir::NVVM::VoteSyncKind::ballot - ? builder.getI32Type() - : builder.getI1Type(); - auto voteRes = - mlir::NVVM::VoteSyncOp::create(builder, loc, resTy, args[0], arg1, kind) - .getResult(); - return fir::ConvertOp::create(builder, loc, resultType, voteRes); -} - -// MATCH_ANY_SYNC -mlir::Value -IntrinsicLibrary::genMatchAnySync(mlir::Type resultType, - llvm::ArrayRef<mlir::Value> args) { - assert(args.size() == 2); - bool is32 = args[1].getType().isInteger(32) || args[1].getType().isF32(); - - mlir::Value arg1 = args[1]; - if (arg1.getType().isF32() || arg1.getType().isF64()) - arg1 = fir::ConvertOp::create( - builder, loc, is32 ? 
builder.getI32Type() : builder.getI64Type(), arg1); - - return mlir::NVVM::MatchSyncOp::create(builder, loc, resultType, args[0], - arg1, mlir::NVVM::MatchSyncKind::any) - .getResult(); -} - // MATMUL fir::ExtendedValue IntrinsicLibrary::genMatmul(mlir::Type resultType, @@ -7816,14 +6979,6 @@ IntrinsicLibrary::genNumImages(mlir::Type resultType, return mif::NumImagesOp::create(builder, loc).getResult(); } -// CLOCK, CLOCK64, GLOBALTIMER -template <typename OpTy> -mlir::Value IntrinsicLibrary::genNVVMTime(mlir::Type resultType, - llvm::ArrayRef<mlir::Value> args) { - assert(args.size() == 0 && "expect no arguments"); - return OpTy::create(builder, loc, resultType).getResult(); -} - // PACK fir::ExtendedValue IntrinsicLibrary::genPack(mlir::Type resultType, @@ -8798,92 +7953,6 @@ mlir::Value IntrinsicLibrary::genTanpi(mlir::Type resultType, return getRuntimeCallGenerator("tan", ftype)(builder, loc, {arg}); } -// THIS_GRID -mlir::Value IntrinsicLibrary::genThisGrid(mlir::Type resultType, - llvm::ArrayRef<mlir::Value> args) { - assert(args.size() == 0); - auto recTy = mlir::cast<fir::RecordType>(resultType); - assert(recTy && "RecordType expepected"); - mlir::Value res = fir::AllocaOp::create(builder, loc, resultType); - mlir::Type i32Ty = builder.getI32Type(); - - mlir::Value threadIdX = mlir::NVVM::ThreadIdXOp::create(builder, loc, i32Ty); - mlir::Value threadIdY = mlir::NVVM::ThreadIdYOp::create(builder, loc, i32Ty); - mlir::Value threadIdZ = mlir::NVVM::ThreadIdZOp::create(builder, loc, i32Ty); - - mlir::Value blockIdX = mlir::NVVM::BlockIdXOp::create(builder, loc, i32Ty); - mlir::Value blockIdY = mlir::NVVM::BlockIdYOp::create(builder, loc, i32Ty); - mlir::Value blockIdZ = mlir::NVVM::BlockIdZOp::create(builder, loc, i32Ty); - - mlir::Value blockDimX = mlir::NVVM::BlockDimXOp::create(builder, loc, i32Ty); - mlir::Value blockDimY = mlir::NVVM::BlockDimYOp::create(builder, loc, i32Ty); - mlir::Value blockDimZ = mlir::NVVM::BlockDimZOp::create(builder, loc, i32Ty); - mlir::Value gridDimX = mlir::NVVM::GridDimXOp::create(builder, loc, i32Ty); - mlir::Value gridDimY = mlir::NVVM::GridDimYOp::create(builder, loc, i32Ty); - mlir::Value gridDimZ = mlir::NVVM::GridDimZOp::create(builder, loc, i32Ty); - - // this_grid.size = ((blockDim.z * gridDim.z) * (blockDim.y * gridDim.y)) * - // (blockDim.x * gridDim.x); - mlir::Value resZ = - mlir::arith::MulIOp::create(builder, loc, blockDimZ, gridDimZ); - mlir::Value resY = - mlir::arith::MulIOp::create(builder, loc, blockDimY, gridDimY); - mlir::Value resX = - mlir::arith::MulIOp::create(builder, loc, blockDimX, gridDimX); - mlir::Value resZY = mlir::arith::MulIOp::create(builder, loc, resZ, resY); - mlir::Value size = mlir::arith::MulIOp::create(builder, loc, resZY, resX); - - // tmp = ((blockIdx.z * gridDim.y * gridDim.x) + (blockIdx.y * gridDim.x)) + - // blockIdx.x; - // this_group.rank = tmp * ((blockDim.x * blockDim.y) * blockDim.z) + - // ((threadIdx.z * blockDim.y) * blockDim.x) + - // (threadIdx.y * blockDim.x) + threadIdx.x + 1; - mlir::Value r1 = - mlir::arith::MulIOp::create(builder, loc, blockIdZ, gridDimY); - mlir::Value r2 = mlir::arith::MulIOp::create(builder, loc, r1, gridDimX); - mlir::Value r3 = - mlir::arith::MulIOp::create(builder, loc, blockIdY, gridDimX); - mlir::Value r2r3 = mlir::arith::AddIOp::create(builder, loc, r2, r3); - mlir::Value tmp = mlir::arith::AddIOp::create(builder, loc, r2r3, blockIdX); - - mlir::Value bXbY = - mlir::arith::MulIOp::create(builder, loc, blockDimX, blockDimY); - mlir::Value bXbYbZ = - 
mlir::arith::MulIOp::create(builder, loc, bXbY, blockDimZ); - mlir::Value tZbY = - mlir::arith::MulIOp::create(builder, loc, threadIdZ, blockDimY); - mlir::Value tZbYbX = - mlir::arith::MulIOp::create(builder, loc, tZbY, blockDimX); - mlir::Value tYbX = - mlir::arith::MulIOp::create(builder, loc, threadIdY, blockDimX); - mlir::Value rank = mlir::arith::MulIOp::create(builder, loc, tmp, bXbYbZ); - rank = mlir::arith::AddIOp::create(builder, loc, rank, tZbYbX); - rank = mlir::arith::AddIOp::create(builder, loc, rank, tYbX); - rank = mlir::arith::AddIOp::create(builder, loc, rank, threadIdX); - mlir::Value one = builder.createIntegerConstant(loc, i32Ty, 1); - rank = mlir::arith::AddIOp::create(builder, loc, rank, one); - - auto sizeFieldName = recTy.getTypeList()[1].first; - mlir::Type sizeFieldTy = recTy.getTypeList()[1].second; - mlir::Type fieldIndexType = fir::FieldType::get(resultType.getContext()); - mlir::Value sizeFieldIndex = fir::FieldIndexOp::create( - builder, loc, fieldIndexType, sizeFieldName, recTy, - /*typeParams=*/mlir::ValueRange{}); - mlir::Value sizeCoord = fir::CoordinateOp::create( - builder, loc, builder.getRefType(sizeFieldTy), res, sizeFieldIndex); - fir::StoreOp::create(builder, loc, size, sizeCoord); - - auto rankFieldName = recTy.getTypeList()[2].first; - mlir::Type rankFieldTy = recTy.getTypeList()[2].second; - mlir::Value rankFieldIndex = fir::FieldIndexOp::create( - builder, loc, fieldIndexType, rankFieldName, recTy, - /*typeParams=*/mlir::ValueRange{}); - mlir::Value rankCoord = fir::CoordinateOp::create( - builder, loc, builder.getRefType(rankFieldTy), res, rankFieldIndex); - fir::StoreOp::create(builder, loc, rank, rankCoord); - return res; -} - // THIS_IMAGE fir::ExtendedValue IntrinsicLibrary::genThisImage(mlir::Type resultType, @@ -8899,99 +7968,6 @@ IntrinsicLibrary::genThisImage(mlir::Type resultType, return builder.createConvert(loc, resultType, res); } -// THIS_THREAD_BLOCK -mlir::Value -IntrinsicLibrary::genThisThreadBlock(mlir::Type resultType, - llvm::ArrayRef<mlir::Value> args) { - assert(args.size() == 0); - auto recTy = mlir::cast<fir::RecordType>(resultType); - assert(recTy && "RecordType expepected"); - mlir::Value res = fir::AllocaOp::create(builder, loc, resultType); - mlir::Type i32Ty = builder.getI32Type(); - - // this_thread_block%size = blockDim.z * blockDim.y * blockDim.x; - mlir::Value blockDimX = mlir::NVVM::BlockDimXOp::create(builder, loc, i32Ty); - mlir::Value blockDimY = mlir::NVVM::BlockDimYOp::create(builder, loc, i32Ty); - mlir::Value blockDimZ = mlir::NVVM::BlockDimZOp::create(builder, loc, i32Ty); - mlir::Value size = - mlir::arith::MulIOp::create(builder, loc, blockDimZ, blockDimY); - size = mlir::arith::MulIOp::create(builder, loc, size, blockDimX); - - // this_thread_block%rank = ((threadIdx.z * blockDim.y) * blockDim.x) + - // (threadIdx.y * blockDim.x) + threadIdx.x + 1; - mlir::Value threadIdX = mlir::NVVM::ThreadIdXOp::create(builder, loc, i32Ty); - mlir::Value threadIdY = mlir::NVVM::ThreadIdYOp::create(builder, loc, i32Ty); - mlir::Value threadIdZ = mlir::NVVM::ThreadIdZOp::create(builder, loc, i32Ty); - mlir::Value r1 = - mlir::arith::MulIOp::create(builder, loc, threadIdZ, blockDimY); - mlir::Value r2 = mlir::arith::MulIOp::create(builder, loc, r1, blockDimX); - mlir::Value r3 = - mlir::arith::MulIOp::create(builder, loc, threadIdY, blockDimX); - mlir::Value r2r3 = mlir::arith::AddIOp::create(builder, loc, r2, r3); - mlir::Value rank = mlir::arith::AddIOp::create(builder, loc, r2r3, threadIdX); - mlir::Value one = 
builder.createIntegerConstant(loc, i32Ty, 1); - rank = mlir::arith::AddIOp::create(builder, loc, rank, one); - - auto sizeFieldName = recTy.getTypeList()[1].first; - mlir::Type sizeFieldTy = recTy.getTypeList()[1].second; - mlir::Type fieldIndexType = fir::FieldType::get(resultType.getContext()); - mlir::Value sizeFieldIndex = fir::FieldIndexOp::create( - builder, loc, fieldIndexType, sizeFieldName, recTy, - /*typeParams=*/mlir::ValueRange{}); - mlir::Value sizeCoord = fir::CoordinateOp::create( - builder, loc, builder.getRefType(sizeFieldTy), res, sizeFieldIndex); - fir::StoreOp::create(builder, loc, size, sizeCoord); - - auto rankFieldName = recTy.getTypeList()[2].first; - mlir::Type rankFieldTy = recTy.getTypeList()[2].second; - mlir::Value rankFieldIndex = fir::FieldIndexOp::create( - builder, loc, fieldIndexType, rankFieldName, recTy, - /*typeParams=*/mlir::ValueRange{}); - mlir::Value rankCoord = fir::CoordinateOp::create( - builder, loc, builder.getRefType(rankFieldTy), res, rankFieldIndex); - fir::StoreOp::create(builder, loc, rank, rankCoord); - return res; -} - -// THIS_WARP -mlir::Value IntrinsicLibrary::genThisWarp(mlir::Type resultType, - llvm::ArrayRef<mlir::Value> args) { - assert(args.size() == 0); - auto recTy = mlir::cast<fir::RecordType>(resultType); - assert(recTy && "RecordType expepected"); - mlir::Value res = fir::AllocaOp::create(builder, loc, resultType); - mlir::Type i32Ty = builder.getI32Type(); - - // coalesced_group%size = 32 - mlir::Value size = builder.createIntegerConstant(loc, i32Ty, 32); - auto sizeFieldName = recTy.getTypeList()[1].first; - mlir::Type sizeFieldTy = recTy.getTypeList()[1].second; - mlir::Type fieldIndexType = fir::FieldType::get(resultType.getContext()); - mlir::Value sizeFieldIndex = fir::FieldIndexOp::create( - builder, loc, fieldIndexType, sizeFieldName, recTy, - /*typeParams=*/mlir::ValueRange{}); - mlir::Value sizeCoord = fir::CoordinateOp::create( - builder, loc, builder.getRefType(sizeFieldTy), res, sizeFieldIndex); - fir::StoreOp::create(builder, loc, size, sizeCoord); - - // coalesced_group%rank = threadIdx.x & 31 + 1 - mlir::Value threadIdX = mlir::NVVM::ThreadIdXOp::create(builder, loc, i32Ty); - mlir::Value mask = builder.createIntegerConstant(loc, i32Ty, 31); - mlir::Value one = builder.createIntegerConstant(loc, i32Ty, 1); - mlir::Value masked = - mlir::arith::AndIOp::create(builder, loc, threadIdX, mask); - mlir::Value rank = mlir::arith::AddIOp::create(builder, loc, masked, one); - auto rankFieldName = recTy.getTypeList()[2].first; - mlir::Type rankFieldTy = recTy.getTypeList()[2].second; - mlir::Value rankFieldIndex = fir::FieldIndexOp::create( - builder, loc, fieldIndexType, rankFieldName, recTy, - /*typeParams=*/mlir::ValueRange{}); - mlir::Value rankCoord = fir::CoordinateOp::create( - builder, loc, builder.getRefType(rankFieldTy), res, rankFieldIndex); - fir::StoreOp::create(builder, loc, rank, rankCoord); - return res; -} - // TRAILZ mlir::Value IntrinsicLibrary::genTrailz(mlir::Type resultType, llvm::ArrayRef<mlir::Value> args) { @@ -9213,65 +8189,6 @@ IntrinsicLibrary::genSum(mlir::Type resultType, resultType, args); } -// SYNCTHREADS -void IntrinsicLibrary::genSyncThreads(llvm::ArrayRef<fir::ExtendedValue> args) { - mlir::NVVM::Barrier0Op::create(builder, loc); -} - -// SYNCTHREADS_AND -mlir::Value -IntrinsicLibrary::genSyncThreadsAnd(mlir::Type resultType, - llvm::ArrayRef<mlir::Value> args) { - constexpr llvm::StringLiteral funcName = "llvm.nvvm.barrier0.and"; - mlir::MLIRContext *context = builder.getContext(); 
- mlir::Type i32 = builder.getI32Type(); - mlir::FunctionType ftype = - mlir::FunctionType::get(context, {resultType}, {i32}); - auto funcOp = builder.createFunction(loc, funcName, ftype); - mlir::Value arg = builder.createConvert(loc, i32, args[0]); - return fir::CallOp::create(builder, loc, funcOp, {arg}).getResult(0); -} - -// SYNCTHREADS_COUNT -mlir::Value -IntrinsicLibrary::genSyncThreadsCount(mlir::Type resultType, - llvm::ArrayRef<mlir::Value> args) { - constexpr llvm::StringLiteral funcName = "llvm.nvvm.barrier0.popc"; - mlir::MLIRContext *context = builder.getContext(); - mlir::Type i32 = builder.getI32Type(); - mlir::FunctionType ftype = - mlir::FunctionType::get(context, {resultType}, {i32}); - auto funcOp = builder.createFunction(loc, funcName, ftype); - mlir::Value arg = builder.createConvert(loc, i32, args[0]); - return fir::CallOp::create(builder, loc, funcOp, {arg}).getResult(0); -} - -// SYNCTHREADS_OR -mlir::Value -IntrinsicLibrary::genSyncThreadsOr(mlir::Type resultType, - llvm::ArrayRef<mlir::Value> args) { - constexpr llvm::StringLiteral funcName = "llvm.nvvm.barrier0.or"; - mlir::MLIRContext *context = builder.getContext(); - mlir::Type i32 = builder.getI32Type(); - mlir::FunctionType ftype = - mlir::FunctionType::get(context, {resultType}, {i32}); - auto funcOp = builder.createFunction(loc, funcName, ftype); - mlir::Value arg = builder.createConvert(loc, i32, args[0]); - return fir::CallOp::create(builder, loc, funcOp, {arg}).getResult(0); -} - -// SYNCWARP -void IntrinsicLibrary::genSyncWarp(llvm::ArrayRef<fir::ExtendedValue> args) { - assert(args.size() == 1); - constexpr llvm::StringLiteral funcName = "llvm.nvvm.bar.warp.sync"; - mlir::Value mask = fir::getBase(args[0]); - mlir::FunctionType funcType = - mlir::FunctionType::get(builder.getContext(), {mask.getType()}, {}); - auto funcOp = builder.createFunction(loc, funcName, funcType); - llvm::SmallVector<mlir::Value> argsList{mask}; - fir::CallOp::create(builder, loc, funcOp, argsList); -} - // SYSTEM fir::ExtendedValue IntrinsicLibrary::genSystem(std::optional<mlir::Type> resultType, @@ -9403,38 +8320,6 @@ IntrinsicLibrary::genTranspose(mlir::Type resultType, return readAndAddCleanUp(resultMutableBox, resultType, "TRANSPOSE"); } -// THREADFENCE -void IntrinsicLibrary::genThreadFence(llvm::ArrayRef<fir::ExtendedValue> args) { - constexpr llvm::StringLiteral funcName = "llvm.nvvm.membar.gl"; - mlir::FunctionType funcType = - mlir::FunctionType::get(builder.getContext(), {}, {}); - auto funcOp = builder.createFunction(loc, funcName, funcType); - llvm::SmallVector<mlir::Value> noArgs; - fir::CallOp::create(builder, loc, funcOp, noArgs); -} - -// THREADFENCE_BLOCK -void IntrinsicLibrary::genThreadFenceBlock( - llvm::ArrayRef<fir::ExtendedValue> args) { - constexpr llvm::StringLiteral funcName = "llvm.nvvm.membar.cta"; - mlir::FunctionType funcType = - mlir::FunctionType::get(builder.getContext(), {}, {}); - auto funcOp = builder.createFunction(loc, funcName, funcType); - llvm::SmallVector<mlir::Value> noArgs; - fir::CallOp::create(builder, loc, funcOp, noArgs); -} - -// THREADFENCE_SYSTEM -void IntrinsicLibrary::genThreadFenceSystem( - llvm::ArrayRef<fir::ExtendedValue> args) { - constexpr llvm::StringLiteral funcName = "llvm.nvvm.membar.sys"; - mlir::FunctionType funcType = - mlir::FunctionType::get(builder.getContext(), {}, {}); - auto funcOp = builder.createFunction(loc, funcName, funcType); - llvm::SmallVector<mlir::Value> noArgs; - fir::CallOp::create(builder, loc, funcOp, noArgs); -} - // TIME mlir::Value 
IntrinsicLibrary::genTime(mlir::Type resultType, llvm::ArrayRef<mlir::Value> args) { @@ -9443,226 +8328,6 @@ mlir::Value IntrinsicLibrary::genTime(mlir::Type resultType, fir::runtime::genTime(builder, loc)); } -// TMA_BULK_COMMIT_GROUP (CUDA) -void IntrinsicLibrary::genTMABulkCommitGroup( - llvm::ArrayRef<fir::ExtendedValue> args) { - assert(args.size() == 0); - mlir::NVVM::CpAsyncBulkCommitGroupOp::create(builder, loc); -} - -// TMA_BULK_G2S (CUDA) -void IntrinsicLibrary::genTMABulkG2S(llvm::ArrayRef<fir::ExtendedValue> args) { - assert(args.size() == 4); - mlir::Value barrier = convertPtrToNVVMSpace( - builder, loc, fir::getBase(args[0]), mlir::NVVM::NVVMMemorySpace::Shared); - mlir::Value dst = - convertPtrToNVVMSpace(builder, loc, fir::getBase(args[2]), - mlir::NVVM::NVVMMemorySpace::SharedCluster); - mlir::Value src = convertPtrToNVVMSpace(builder, loc, fir::getBase(args[1]), - mlir::NVVM::NVVMMemorySpace::Global); - mlir::NVVM::CpAsyncBulkGlobalToSharedClusterOp::create( - builder, loc, dst, src, barrier, fir::getBase(args[3]), {}, {}); -} - -static void genTMABulkLoad(fir::FirOpBuilder &builder, mlir::Location loc, - mlir::Value barrier, mlir::Value src, - mlir::Value dst, mlir::Value nelem, - mlir::Value eleSize) { - mlir::Value size = mlir::arith::MulIOp::create(builder, loc, nelem, eleSize); - auto llvmPtrTy = mlir::LLVM::LLVMPointerType::get(builder.getContext()); - barrier = builder.createConvert(loc, llvmPtrTy, barrier); - dst = builder.createConvert(loc, llvmPtrTy, dst); - src = builder.createConvert(loc, llvmPtrTy, src); - mlir::NVVM::InlinePtxOp::create( - builder, loc, mlir::TypeRange{}, {dst, src, size, barrier}, {}, - "cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%0], " - "[%1], %2, [%3];", - {}); - mlir::NVVM::InlinePtxOp::create( - builder, loc, mlir::TypeRange{}, {barrier, size}, {}, - "mbarrier.expect_tx.relaxed.cta.shared::cta.b64 [%0], %1;", {}); -} - -// TMA_BULK_LOADC4 -void IntrinsicLibrary::genTMABulkLoadC4( - llvm::ArrayRef<fir::ExtendedValue> args) { - assert(args.size() == 4); - mlir::Value eleSize = - builder.createIntegerConstant(loc, builder.getI32Type(), 8); - genTMABulkLoad(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), - fir::getBase(args[2]), fir::getBase(args[3]), eleSize); -} - -// TMA_BULK_LOADC8 -void IntrinsicLibrary::genTMABulkLoadC8( - llvm::ArrayRef<fir::ExtendedValue> args) { - assert(args.size() == 4); - mlir::Value eleSize = - builder.createIntegerConstant(loc, builder.getI32Type(), 16); - genTMABulkLoad(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), - fir::getBase(args[2]), fir::getBase(args[3]), eleSize); -} - -// TMA_BULK_LOADI4 -void IntrinsicLibrary::genTMABulkLoadI4( - llvm::ArrayRef<fir::ExtendedValue> args) { - assert(args.size() == 4); - mlir::Value eleSize = - builder.createIntegerConstant(loc, builder.getI32Type(), 4); - genTMABulkLoad(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), - fir::getBase(args[2]), fir::getBase(args[3]), eleSize); -} - -// TMA_BULK_LOADI8 -void IntrinsicLibrary::genTMABulkLoadI8( - llvm::ArrayRef<fir::ExtendedValue> args) { - assert(args.size() == 4); - mlir::Value eleSize = - builder.createIntegerConstant(loc, builder.getI32Type(), 8); - genTMABulkLoad(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), - fir::getBase(args[2]), fir::getBase(args[3]), eleSize); -} - -// TMA_BULK_LOADR2 -void IntrinsicLibrary::genTMABulkLoadR2( - llvm::ArrayRef<fir::ExtendedValue> args) { - assert(args.size() == 4); - mlir::Value eleSize = - 
builder.createIntegerConstant(loc, builder.getI32Type(), 2); - genTMABulkLoad(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), - fir::getBase(args[2]), fir::getBase(args[3]), eleSize); -} - -// TMA_BULK_LOADR4 -void IntrinsicLibrary::genTMABulkLoadR4( - llvm::ArrayRef<fir::ExtendedValue> args) { - assert(args.size() == 4); - mlir::Value eleSize = - builder.createIntegerConstant(loc, builder.getI32Type(), 4); - genTMABulkLoad(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), - fir::getBase(args[2]), fir::getBase(args[3]), eleSize); -} - -// TMA_BULK_LOADR8 -void IntrinsicLibrary::genTMABulkLoadR8( - llvm::ArrayRef<fir::ExtendedValue> args) { - assert(args.size() == 4); - mlir::Value eleSize = - builder.createIntegerConstant(loc, builder.getI32Type(), 8); - genTMABulkLoad(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), - fir::getBase(args[2]), fir::getBase(args[3]), eleSize); -} - -// TMA_BULK_S2G (CUDA) -void IntrinsicLibrary::genTMABulkS2G(llvm::ArrayRef<fir::ExtendedValue> args) { - assert(args.size() == 3); - mlir::Value src = convertPtrToNVVMSpace(builder, loc, fir::getBase(args[0]), - mlir::NVVM::NVVMMemorySpace::Shared); - mlir::Value dst = convertPtrToNVVMSpace(builder, loc, fir::getBase(args[1]), - mlir::NVVM::NVVMMemorySpace::Global); - mlir::NVVM::CpAsyncBulkSharedCTAToGlobalOp::create( - builder, loc, dst, src, fir::getBase(args[2]), {}, {}); - - mlir::NVVM::InlinePtxOp::create(builder, loc, mlir::TypeRange{}, {}, {}, - "cp.async.bulk.commit_group;", {}); - mlir::NVVM::CpAsyncBulkWaitGroupOp::create(builder, loc, - builder.getI32IntegerAttr(0), {}); -} - -static void genTMABulkStore(fir::FirOpBuilder &builder, mlir::Location loc, - mlir::Value src, mlir::Value dst, mlir::Value count, - mlir::Value eleSize) { - mlir::Value size = mlir::arith::MulIOp::create(builder, loc, eleSize, count); - src = convertPtrToNVVMSpace(builder, loc, src, - mlir::NVVM::NVVMMemorySpace::Shared); - dst = convertPtrToNVVMSpace(builder, loc, dst, - mlir::NVVM::NVVMMemorySpace::Global); - mlir::NVVM::CpAsyncBulkSharedCTAToGlobalOp::create(builder, loc, dst, src, - size, {}, {}); - mlir::NVVM::InlinePtxOp::create(builder, loc, mlir::TypeRange{}, {}, {}, - "cp.async.bulk.commit_group;", {}); - mlir::NVVM::CpAsyncBulkWaitGroupOp::create(builder, loc, - builder.getI32IntegerAttr(0), {}); -} - -// TMA_BULK_STORE_C4 (CUDA) -void IntrinsicLibrary::genTMABulkStoreC4( - llvm::ArrayRef<fir::ExtendedValue> args) { - assert(args.size() == 3); - mlir::Value eleSize = - builder.createIntegerConstant(loc, builder.getI32Type(), 8); - genTMABulkStore(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), - fir::getBase(args[2]), eleSize); -} - -// TMA_BULK_STORE_C8 (CUDA) -void IntrinsicLibrary::genTMABulkStoreC8( - llvm::ArrayRef<fir::ExtendedValue> args) { - assert(args.size() == 3); - mlir::Value eleSize = - builder.createIntegerConstant(loc, builder.getI32Type(), 16); - genTMABulkStore(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), - fir::getBase(args[2]), eleSize); -} - -// TMA_BULK_STORE_I4 (CUDA) -void IntrinsicLibrary::genTMABulkStoreI4( - llvm::ArrayRef<fir::ExtendedValue> args) { - assert(args.size() == 3); - mlir::Value eleSize = - builder.createIntegerConstant(loc, builder.getI32Type(), 4); - genTMABulkStore(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), - fir::getBase(args[2]), eleSize); -} - -// TMA_BULK_STORE_I8 (CUDA) -void IntrinsicLibrary::genTMABulkStoreI8( - llvm::ArrayRef<fir::ExtendedValue> args) { - assert(args.size() == 3); - 
mlir::Value eleSize =
-      builder.createIntegerConstant(loc, builder.getI32Type(), 8);
-  genTMABulkStore(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]),
-                  fir::getBase(args[2]), eleSize);
-}
-
-// TMA_BULK_STORE_R2 (CUDA)
-void IntrinsicLibrary::genTMABulkStoreR2(
-    llvm::ArrayRef<fir::ExtendedValue> args) {
-  assert(args.size() == 3);
-  mlir::Value eleSize =
-      builder.createIntegerConstant(loc, builder.getI32Type(), 2);
-  genTMABulkStore(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]),
-                  fir::getBase(args[2]), eleSize);
-}
-
-// TMA_BULK_STORE_R4 (CUDA)
-void IntrinsicLibrary::genTMABulkStoreR4(
-    llvm::ArrayRef<fir::ExtendedValue> args) {
-  assert(args.size() == 3);
-  mlir::Value eleSize =
-      builder.createIntegerConstant(loc, builder.getI32Type(), 4);
-  genTMABulkStore(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]),
-                  fir::getBase(args[2]), eleSize);
-}
-
-// TMA_BULK_STORE_R8 (CUDA)
-void IntrinsicLibrary::genTMABulkStoreR8(
-    llvm::ArrayRef<fir::ExtendedValue> args) {
-  assert(args.size() == 3);
-  mlir::Value eleSize =
-      builder.createIntegerConstant(loc, builder.getI32Type(), 8);
-  genTMABulkStore(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]),
-                  fir::getBase(args[2]), eleSize);
-}
-
-// TMA_BULK_WAIT_GROUP (CUDA)
-void IntrinsicLibrary::genTMABulkWaitGroup(
-    llvm::ArrayRef<fir::ExtendedValue> args) {
-  assert(args.size() == 0);
-  auto group = builder.getIntegerAttr(builder.getI32Type(), 0);
-  mlir::NVVM::CpAsyncBulkWaitGroupOp::create(builder, loc, group, {});
-}
-
 // TRIM
 fir::ExtendedValue
 IntrinsicLibrary::genTrim(mlir::Type resultType,
@@ -10077,6 +8742,9 @@ getIntrinsicArgumentLowering(llvm::StringRef specificName) {
   if (const IntrinsicHandler *ppcHandler = findPPCIntrinsicHandler(name))
     if (!ppcHandler->argLoweringRules.hasDefaultRules())
      return &ppcHandler->argLoweringRules;
+  if (const IntrinsicHandler *cudaHandler = findCUDAIntrinsicHandler(name))
+    if (!cudaHandler->argLoweringRules.hasDefaultRules())
+      return &cudaHandler->argLoweringRules;
   return nullptr;
 }

From 6c9b5943f846839f8a188dd134a5a5a140d0c348 Mon Sep 17 00:00:00 2001
From: Mehdi Amini <joker.eph@gmail.com>
Date: Tue, 4 Nov 2025 18:47:55 -0800
Subject: [PATCH 267/313] [MLIR] Fix generate-test-checks.py to not remove
 every blank line (#166493)

The stripping of the notes was done on a line-by-line basis, which was
fragile and led to the removal of empty lines everywhere in the file.
Instead, we can strip the note as a single block before splitting the
input into multiple lines.
---
 mlir/utils/generate-test-checks.py | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/mlir/utils/generate-test-checks.py b/mlir/utils/generate-test-checks.py
index 3712a6b9c963d..22774468bb403 100755
--- a/mlir/utils/generate-test-checks.py
+++ b/mlir/utils/generate-test-checks.py
@@ -230,14 +230,11 @@ def process_line(line_chunks, variable_namer, use_ssa_name=False, strict_name_re

 # Process the source file lines. The source file doesn't have to be .mlir.
-def process_source_lines(source_lines, note, args):
+def process_source_lines(source_lines, args):
     source_split_re = re.compile(args.source_delim_regex)

     source_segments = [[]]
     for line in source_lines:
-        # Remove previous note.
-        if line in note:
-            continue
         # Remove previous CHECK lines.
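        # (CHECK lines left over from a previous run are dropped so that
        # regeneration does not duplicate them. The autogenerated note, by
        # contrast, is now stripped from the raw source as a single block in
        # main() rather than line-by-line here.)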
if line.find(args.check_prefix) != -1: continue @@ -359,9 +356,10 @@ def main(): source_segments = None if args.source: - source_segments = process_source_lines( - [l.rstrip() for l in open(args.source, "r")], autogenerated_note, args - ) + with open(args.source, "r") as f: + raw_source = f.read().replace(autogenerated_note, "") + raw_source_lines = [l.rstrip() for l in raw_source.splitlines()] + source_segments = process_source_lines(raw_source_lines, args) if args.inplace: assert args.output is None From a2977dea61d7eec82a8e7623b015bf695b7ef773 Mon Sep 17 00:00:00 2001 From: Hristo Hristov <hghristov.rmm@gmail.com> Date: Wed, 5 Nov 2025 05:12:17 +0200 Subject: [PATCH 268/313] [libc++][NFC] Removed unsupported compilers from tests (#166403) --- .../atomics/atomics.types.generic/cas_non_power_of_2.pass.cpp | 2 +- .../new.delete/new.delete.array/sized_delete_array.pass.cpp | 1 - .../new.delete/new.delete.single/sized_delete.pass.cpp | 1 - libcxx/test/std/numerics/c.math/signbit.pass.cpp | 2 +- .../std/numerics/numeric.ops/numeric.ops.sat/add_sat.pass.cpp | 3 --- .../std/numerics/numeric.ops/numeric.ops.sat/div_sat.pass.cpp | 3 --- .../std/numerics/numeric.ops/numeric.ops.sat/mul_sat.pass.cpp | 3 --- .../numeric.ops/numeric.ops.sat/saturate_cast.pass.cpp | 3 --- .../std/numerics/numeric.ops/numeric.ops.sat/sub_sat.pass.cpp | 3 --- .../util.smartptr.shared.create/shared_ptr_array.pass.cpp | 1 - .../std/utilities/meta/meta.rel/is_virtual_base_of.pass.cpp | 2 +- .../meta.unary/meta.unary.prop/is_implicit_lifetime.verify.cpp | 2 +- 12 files changed, 4 insertions(+), 22 deletions(-) diff --git a/libcxx/test/std/atomics/atomics.types.generic/cas_non_power_of_2.pass.cpp b/libcxx/test/std/atomics/atomics.types.generic/cas_non_power_of_2.pass.cpp index 13bd761ae9808..602bd1612015d 100644 --- a/libcxx/test/std/atomics/atomics.types.generic/cas_non_power_of_2.pass.cpp +++ b/libcxx/test/std/atomics/atomics.types.generic/cas_non_power_of_2.pass.cpp @@ -9,7 +9,7 @@ // https://github.com/llvm/llvm-project/issues/30023 // compare exchange does not work with types of which the size is not a power of 2 -// XFAIL: clang-19, clang-20, clang-21, apple-clang-15, apple-clang-16, apple-clang-17 +// XFAIL: clang-20, clang-21, apple-clang-17 // UNSUPPORTED: c++03 // TODO: remove the UNSUPPORTED clang-22 once libc++ CI's clang is updated to include diff --git a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/sized_delete_array.pass.cpp b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/sized_delete_array.pass.cpp index 731d751df08d9..dc4d8ae2851f4 100644 --- a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/sized_delete_array.pass.cpp +++ b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/sized_delete_array.pass.cpp @@ -11,7 +11,6 @@ // UNSUPPORTED: c++03, c++11 // These compiler versions and platforms don't enable sized deallocation by default. 
-// ADDITIONAL_COMPILE_FLAGS(apple-clang-16): -fsized-deallocation // ADDITIONAL_COMPILE_FLAGS(apple-clang-17): -fsized-deallocation // ADDITIONAL_COMPILE_FLAGS(target=x86_64-w64-windows-gnu): -fsized-deallocation // ADDITIONAL_COMPILE_FLAGS(target=i686-w64-windows-gnu): -fsized-deallocation diff --git a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/sized_delete.pass.cpp b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/sized_delete.pass.cpp index 64a26ed63e8ce..834c01b2272e2 100644 --- a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/sized_delete.pass.cpp +++ b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/sized_delete.pass.cpp @@ -11,7 +11,6 @@ // UNSUPPORTED: c++03, c++11 // These compiler versions and platforms don't enable sized deallocation by default. -// ADDITIONAL_COMPILE_FLAGS(apple-clang-16): -fsized-deallocation // ADDITIONAL_COMPILE_FLAGS(apple-clang-17): -fsized-deallocation // ADDITIONAL_COMPILE_FLAGS(target=x86_64-w64-windows-gnu): -fsized-deallocation // ADDITIONAL_COMPILE_FLAGS(target=i686-w64-windows-gnu): -fsized-deallocation diff --git a/libcxx/test/std/numerics/c.math/signbit.pass.cpp b/libcxx/test/std/numerics/c.math/signbit.pass.cpp index 7571ced2e4431..233e8ed2338b6 100644 --- a/libcxx/test/std/numerics/c.math/signbit.pass.cpp +++ b/libcxx/test/std/numerics/c.math/signbit.pass.cpp @@ -12,7 +12,7 @@ // UNSUPPORTED: windows // These compilers don't support constexpr `__builtin_signbit` yet. -// UNSUPPORTED: clang-19, apple-clang-16, apple-clang-17 +// UNSUPPORTED: apple-clang-17 // GCC warns about signbit comparing `bool_v < 0`, which we're testing // ADDITIONAL_COMPILE_FLAGS(gcc): -Wno-bool-compare diff --git a/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/add_sat.pass.cpp b/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/add_sat.pass.cpp index 6bd112c7d1280..f49e19acf0234 100644 --- a/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/add_sat.pass.cpp +++ b/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/add_sat.pass.cpp @@ -8,9 +8,6 @@ // REQUIRES: std-at-least-c++26 -// The test uses "Placeholder variables with no name" -// UNSUPPORTED: apple-clang-16 - // <numeric> // template<class T> diff --git a/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/div_sat.pass.cpp b/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/div_sat.pass.cpp index bdfc57694dd53..0789213163847 100644 --- a/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/div_sat.pass.cpp +++ b/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/div_sat.pass.cpp @@ -8,9 +8,6 @@ // REQUIRES: std-at-least-c++26 -// The test uses "Placeholder variables with no name" -// UNSUPPORTED: apple-clang-16 - // <numeric> // template<class T> diff --git a/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/mul_sat.pass.cpp b/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/mul_sat.pass.cpp index 1fe7916c67823..f09bf30771102 100644 --- a/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/mul_sat.pass.cpp +++ b/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/mul_sat.pass.cpp @@ -8,9 +8,6 @@ // REQUIRES: std-at-least-c++26 -// The test uses "Placeholder variables with no name" -// UNSUPPORTED: apple-clang-16 - // <numeric> // template<class T> diff --git a/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/saturate_cast.pass.cpp b/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/saturate_cast.pass.cpp index b797ae7533add..86e2e61647be8 100644 --- 
a/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/saturate_cast.pass.cpp +++ b/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/saturate_cast.pass.cpp @@ -8,9 +8,6 @@ // REQUIRES: std-at-least-c++26 -// The test uses "Placeholder variables with no name" -// UNSUPPORTED: apple-clang-16 - // <numeric> // template<class R, class T> diff --git a/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/sub_sat.pass.cpp b/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/sub_sat.pass.cpp index 8b6188f1fad0e..c2be8c5a47bdf 100644 --- a/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/sub_sat.pass.cpp +++ b/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/sub_sat.pass.cpp @@ -8,9 +8,6 @@ // REQUIRES: std-at-least-c++26 -// The test uses "Placeholder variables with no name" -// UNSUPPORTED: apple-clang-16 - // <numeric> // template<class T> diff --git a/libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.shared/util.smartptr.shared.create/shared_ptr_array.pass.cpp b/libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.shared/util.smartptr.shared.create/shared_ptr_array.pass.cpp index f71f688afc52e..bb0b2d322218d 100644 --- a/libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.shared/util.smartptr.shared.create/shared_ptr_array.pass.cpp +++ b/libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.shared/util.smartptr.shared.create/shared_ptr_array.pass.cpp @@ -10,7 +10,6 @@ // UNSUPPORTED: c++03, c++11, c++14, c++17 // These compiler versions and platforms don't enable sized deallocation by default. -// ADDITIONAL_COMPILE_FLAGS(apple-clang-16): -fsized-deallocation // ADDITIONAL_COMPILE_FLAGS(target=x86_64-w64-windows-gnu): -fsized-deallocation // ADDITIONAL_COMPILE_FLAGS(target=i686-w64-windows-gnu): -fsized-deallocation diff --git a/libcxx/test/std/utilities/meta/meta.rel/is_virtual_base_of.pass.cpp b/libcxx/test/std/utilities/meta/meta.rel/is_virtual_base_of.pass.cpp index 2e0b1e9025a61..b6b226c7f79d8 100644 --- a/libcxx/test/std/utilities/meta/meta.rel/is_virtual_base_of.pass.cpp +++ b/libcxx/test/std/utilities/meta/meta.rel/is_virtual_base_of.pass.cpp @@ -9,7 +9,7 @@ // UNSUPPORTED: c++03, c++11, c++14, c++17, c++20, c++23 // These compilers don't support __builtin_is_virtual_base_of yet. -// UNSUPPORTED: clang-19, gcc-14, apple-clang-16, apple-clang-17 +// UNSUPPORTED: apple-clang-17 // <type_traits> diff --git a/libcxx/test/std/utilities/meta/meta.unary/meta.unary.prop/is_implicit_lifetime.verify.cpp b/libcxx/test/std/utilities/meta/meta.unary/meta.unary.prop/is_implicit_lifetime.verify.cpp index 1ca9d44b82afe..f43693c08bc39 100644 --- a/libcxx/test/std/utilities/meta/meta.unary/meta.unary.prop/is_implicit_lifetime.verify.cpp +++ b/libcxx/test/std/utilities/meta/meta.unary/meta.unary.prop/is_implicit_lifetime.verify.cpp @@ -9,7 +9,7 @@ // UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 // These compilers don't support __builtin_is_implicit_lifetime yet. -// UNSUPPORTED: clang-19, gcc-14, apple-clang-16, apple-clang-17 +// UNSUPPORTED: apple-clang-17 // <type_traits> From 9cd1e4067873ac9d4348bae9de32a34575d1d64f Mon Sep 17 00:00:00 2001 From: Farzon Lotfi <farzonlotfi@microsoft.com> Date: Tue, 4 Nov 2025 22:22:24 -0500 Subject: [PATCH 269/313] [HLSL] Layout Initializer list in Column order via index conversion (#166277) Fixes #165663 The bug was that we were using the initializer list's index to populate the matrix. This meant that [0..n] would correlate to the [0..n] indices of the flattened matrix.
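To make the conversion concrete, here is a minimal standalone sketch of the remapping the fix performs (this sketch is not part of the patch; the 2x3 shape and the printing harness are assumptions added purely for illustration):

  #include <cstdio>

  int main() {
    // For an R x C matrix whose initializer list is written in row-major
    // source order, flattened column-major slot i is filled from
    // initializer element (i % R) * C + (i / R).
    const unsigned Rows = 2, Cols = 3; // a float2x3
    for (unsigned Index = 0; Index < Rows * Cols; ++Index) {
      unsigned ColMajorIndex = (Index % Rows) * Cols + (Index / Rows);
      std::printf("slot %u <- init[%u]\n", Index, ColMajorIndex);
    }
    return 0;
  }

Running it prints the mapping 0, 3, 1, 4, 2, 5, i.e. the column-major order discussed below.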
This is why we were seeing the row-major order: [ 0 1 2 3 4 5 ]. To fix this, we can simply convert these indices to the column-major order: [ 0 3 1 4 2 5 ]. The net effect is that the layout of the matrix is now correct, and we don't need to change the MatrixSubscriptExpr indexing scheme. --------- Co-authored-by: Deric C. <cheung.deric@gmail.com> Co-authored-by: Helena Kotas <hekotas@microsoft.com> --- clang/lib/Sema/SemaInit.cpp | 27 +++--- clang/test/AST/HLSL/matrix-constructors.hlsl | 91 +++++++++--------- .../AST/HLSL/matrix-general-initializer.hlsl | 86 ++++++++--------- .../BasicFeatures/MatrixConstructor.hlsl | 95 +++++++++++++++++++ 4 files changed, 198 insertions(+), 101 deletions(-) create mode 100644 clang/test/CodeGenHLSL/BasicFeatures/MatrixConstructor.hlsl diff --git a/clang/lib/Sema/SemaInit.cpp b/clang/lib/Sema/SemaInit.cpp index 073010d16b428..cc6ddf568d346 100644 --- a/clang/lib/Sema/SemaInit.cpp +++ b/clang/lib/Sema/SemaInit.cpp @@ -1897,26 +1897,29 @@ void InitListChecker::CheckMatrixType(const InitializedEntity &Entity, return; const ConstantMatrixType *MT = DeclType->castAs<ConstantMatrixType>(); + + // For HLSL, the error reporting for this case is handled in SemaHLSL's + // initializer list diagnostics. That means the execution should require + // getNumElementsFlattened to equal getNumInits. In other words, the execution + // should never reach this point if this condition is not true. + assert(IList->getNumInits() == MT->getNumElementsFlattened() && + "Inits must equal Matrix element count"); + QualType ElemTy = MT->getElementType(); - const unsigned MaxElts = MT->getNumElementsFlattened(); - unsigned NumEltsInit = 0; + Index = 0; InitializedEntity ElemEnt = InitializedEntity::InitializeElement(SemaRef.Context, 0, Entity); - while (NumEltsInit < MaxElts && Index < IList->getNumInits()) { + while (Index < IList->getNumInits()) { // Not a sublist: just consume directly. - ElemEnt.setElementIndex(Index); - CheckSubElementType(ElemEnt, IList, ElemTy, Index, StructuredList, + unsigned ColMajorIndex = (Index % MT->getNumRows()) * MT->getNumColumns() + + (Index / MT->getNumRows()); + ElemEnt.setElementIndex(ColMajorIndex); + CheckSubElementType(ElemEnt, IList, ElemTy, ColMajorIndex, StructuredList, StructuredIndex); - ++NumEltsInit; + ++Index; } - - // For HLSL The error for this case is handled in SemaHLSL's initializer - // list diagnostics, That means the execution should require NumEltsInit - // to equal Max initializers. In other words execution should never - // reach this point if this condition is not true".
- assert(NumEltsInit == MaxElts && "NumEltsInit must equal MaxElts"); } void InitListChecker::CheckVectorType(const InitializedEntity &Entity, diff --git a/clang/test/AST/HLSL/matrix-constructors.hlsl b/clang/test/AST/HLSL/matrix-constructors.hlsl index 0a2f03c7c0fac..e1a9c53e2c602 100644 --- a/clang/test/AST/HLSL/matrix-constructors.hlsl +++ b/clang/test/AST/HLSL/matrix-constructors.hlsl @@ -9,21 +9,20 @@ typedef float float4 __attribute__((ext_vector_type(4))); [numthreads(1,1,1)] void ok() { - // CHECK: VarDecl 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}, col:{{[0-9]+}}> col:{{[0-9]+}} A 'float2x3':'matrix<float, 2, 3>' cinit // CHECK-NEXT: CXXFunctionalCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}, col:{{[0-9]+}}> 'float2x3':'matrix<float, 2, 3>' functional cast to float2x3 <NoOp> // CHECK-NEXT: InitListExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}, col:{{[0-9]+}}> 'float2x3':'matrix<float, 2, 3>' // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' <IntegralToFloating> // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 1 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' <IntegralToFloating> -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 2 -// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' <IntegralToFloating> -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 3 -// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' <IntegralToFloating> // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 4 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' <IntegralToFloating> +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 2 +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' <IntegralToFloating> // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 5 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' <IntegralToFloating> +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 3 +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' <IntegralToFloating> // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 6 float2x3 A = float2x3(1,2,3,4,5,6); @@ -57,6 +56,8 @@ void ok() { // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' <IntegralToFloating> // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 2 // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <<invalid sloc>> '__size_t':'unsigned long' 0 +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' <IntegralToFloating> +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 4 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}, col:{{[0-9]+}}> 'float' <LValueToRValue> // CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}, col:{{[0-9]+}}> 'float' xvalue vectorcomponent // CHECK-NEXT: MaterializeTemporaryExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}, col:{{[0-9]+}}> 'float2':'vector<float, 2>' xvalue @@ -68,12 +69,10 @@ void ok() { // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 2 // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <<invalid sloc>> '__size_t':'unsigned long' 1 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' <IntegralToFloating> -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 3 -// CHECK-NEXT: ImplicitCastExpr 
0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' <IntegralToFloating> -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 4 -// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' <IntegralToFloating> // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 5 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' <IntegralToFloating> +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 3 +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' <IntegralToFloating> // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 6 float2x3 D = float2x3(float2(1,2), 3, 4, 5, 6); @@ -97,9 +96,9 @@ void ok() { // CHECK-NEXT: CXXFunctionalCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}, col:{{[0-9]+}}> 'float2':'vector<float, 2>' functional cast to float2 <NoOp> // CHECK-NEXT: InitListExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}, col:{{[0-9]+}}> 'float2':'vector<float, 2>' // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' <IntegralToFloating> -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 1 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 3 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' <IntegralToFloating> -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 2 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 4 // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <<invalid sloc>> '__size_t':'unsigned long' 1 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}, col:{{[0-9]+}}> 'float' <LValueToRValue> // CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}, col:{{[0-9]+}}> 'float' xvalue vectorcomponent @@ -107,10 +106,12 @@ void ok() { // CHECK-NEXT: CXXFunctionalCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}, col:{{[0-9]+}}> 'float2':'vector<float, 2>' functional cast to float2 <NoOp> // CHECK-NEXT: InitListExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}, col:{{[0-9]+}}> 'float2':'vector<float, 2>' // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' <IntegralToFloating> -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 3 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 1 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' <IntegralToFloating> -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 4 -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <<invalid sloc>> '__size_t':'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 2 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <<invalid sloc>> '__size_t':'unsigned long' 1 +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' <IntegralToFloating> +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 5 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}, col:{{[0-9]+}}> 'float' <LValueToRValue> // CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}, col:{{[0-9]+}}> 'float' xvalue vectorcomponent // CHECK-NEXT: MaterializeTemporaryExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}, col:{{[0-9]+}}> 'float2':'vector<float, 2>' xvalue @@ -120,9 +121,7 @@ void ok() { // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 3 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' <IntegralToFloating> // CHECK-NEXT: IntegerLiteral 
0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 4 -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <<invalid sloc>> '__size_t':'unsigned long' 1 -// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' <IntegralToFloating> -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 5 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <<invalid sloc>> '__size_t':'unsigned long' 0 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' <IntegralToFloating> // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 6 float2x3 E = float2x3(float2(1,2), float2(3,4), 5, 6); @@ -158,7 +157,7 @@ void ok() { // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 3 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' <IntegralToFloating> // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 4 -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <<invalid sloc>> '__size_t':'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <<invalid sloc>> '__size_t':'unsigned long' 3 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}, col:{{[0-9]+}}> 'float' <LValueToRValue> // CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}, col:{{[0-9]+}}> 'float' xvalue vectorcomponent // CHECK-NEXT: MaterializeTemporaryExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}, col:{{[0-9]+}}> 'float4':'vector<float, 4>' xvalue @@ -172,7 +171,9 @@ void ok() { // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 3 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' <IntegralToFloating> // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 4 -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <<invalid sloc>> '__size_t':'unsigned long' 2 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <<invalid sloc>> '__size_t':'unsigned long' 1 +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' <IntegralToFloating> +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 5 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}, col:{{[0-9]+}}> 'float' <LValueToRValue> // CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}, col:{{[0-9]+}}> 'float' xvalue vectorcomponent // CHECK-NEXT: MaterializeTemporaryExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}, col:{{[0-9]+}}> 'float4':'vector<float, 4>' xvalue @@ -186,9 +187,7 @@ void ok() { // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 3 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' <IntegralToFloating> // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 4 -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <<invalid sloc>> '__size_t':'unsigned long' 3 -// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' <IntegralToFloating> -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 5 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <<invalid sloc>> '__size_t':'unsigned long' 2 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' <IntegralToFloating> // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 6 float2x3 F = float2x3(float4(1,2,3,4), 5, 6); @@ -202,10 +201,10 @@ void ok() { // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' <IntegralToFloating> // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 1 // CHECK-NEXT: ImplicitCastExpr 
0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' <IntegralToFloating> -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 2 -// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' <IntegralToFloating> // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 3 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' <IntegralToFloating> +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 2 +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' <IntegralToFloating> // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 4 // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 0 // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 0 @@ -215,41 +214,41 @@ void ok() { // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' <IntegralToFloating> // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 1 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' <IntegralToFloating> -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 2 -// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' <IntegralToFloating> // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 3 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' <IntegralToFloating> +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 2 +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' <IntegralToFloating> // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 4 // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 1 -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 0 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 1 // CHECK-NEXT: MatrixSubscriptExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}, col:{{[0-9]+}}> 'float' matrixcomponent // CHECK-NEXT: CXXFunctionalCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}, col:{{[0-9]+}}> 'float2x2':'matrix<float, 2, 2>' functional cast to float2x2 <NoOp> // CHECK-NEXT: InitListExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}, col:{{[0-9]+}}> 'float2x2':'matrix<float, 2, 2>' // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' <IntegralToFloating> // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 1 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' <IntegralToFloating> -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 2 -// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' <IntegralToFloating> // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 3 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' <IntegralToFloating> +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 2 +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' <IntegralToFloating> // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 4 -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 0 // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 1 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 0 +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' <IntegralToFloating> +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 
'int' 5 // CHECK-NEXT: MatrixSubscriptExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}, col:{{[0-9]+}}> 'float' matrixcomponent // CHECK-NEXT: CXXFunctionalCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}, col:{{[0-9]+}}> 'float2x2':'matrix<float, 2, 2>' functional cast to float2x2 <NoOp> // CHECK-NEXT: InitListExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}, col:{{[0-9]+}}> 'float2x2':'matrix<float, 2, 2>' // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' <IntegralToFloating> // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 1 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' <IntegralToFloating> -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 2 -// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' <IntegralToFloating> // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 3 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' <IntegralToFloating> +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 2 +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' <IntegralToFloating> // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 4 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 0 // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 1 -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 1 -// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' <IntegralToFloating> -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 5 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' <IntegralToFloating> // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 6 float2x3 G = float2x3(float2x2(1,2,3,4), 5, 6); @@ -262,13 +261,13 @@ float2x3 G = float2x3(float2x2(1,2,3,4), 5, 6); // CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' lvalue vectorcomponent // CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float2':'vector<float, 2>' lvalue Var 0x{{[0-9a-fA-F]+}} 'Vec2' 'float2':'vector<float, 2>' // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <<invalid sloc>> '__size_t':'unsigned long' 0 +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' <IntegralToFloating> +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 3 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' <LValueToRValue> // CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' lvalue vectorcomponent // CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float2':'vector<float, 2>' lvalue Var 0x{{[0-9a-fA-F]+}} 'Vec2' 'float2':'vector<float, 2>' // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <<invalid sloc>> '__size_t':'unsigned long' 1 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' <IntegralToFloating> -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 3 -// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' <IntegralToFloating> // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 4 float2 Vec2 = float2(1.0, 2.0); float2x2 H = float2x2(Vec2,3,4); @@ -281,10 +280,10 @@ float2x3 G = float2x3(float2x2(1,2,3,4), 5, 6); // CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' lvalue Var 0x{{[0-9a-fA-F]+}} 'i' 'int' // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 
<col:{{[0-9]+}}> 'float' <IntegralToFloating> // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' <LValueToRValue> -// CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' lvalue Var 0x{{[0-9a-fA-F]+}} 'j' 'int' +// CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' lvalue Var 0x{{[0-9a-fA-F]+}} 'k' 'int' // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' <IntegralToFloating> // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' <LValueToRValue> -// CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' lvalue Var 0x{{[0-9a-fA-F]+}} 'k' 'int' +// CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' lvalue Var 0x{{[0-9a-fA-F]+}} 'j' 'int' // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' <IntegralToFloating> // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' <LValueToRValue> // CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' lvalue Var 0x{{[0-9a-fA-F]+}} 'l' 'int' @@ -300,15 +299,15 @@ float2x3 G = float2x3(float2x2(1,2,3,4), 5, 6); // CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'struct S' lvalue Var 0x{{[0-9a-fA-F]+}} 's' 'struct S' // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <<invalid sloc>> '__size_t':'unsigned long' 0 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}, col:{{[0-9]+}}> 'float' <LValueToRValue> +// CHECK-NEXT: MemberExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}, col:{{[0-9]+}}> 'float' lvalue .a 0x{{[0-9a-fA-F]+}} +// CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'struct S' lvalue Var 0x{{[0-9a-fA-F]+}} 's' 'struct S' +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}, col:{{[0-9]+}}> 'float' <LValueToRValue> // CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}, col:{{[0-9]+}}> 'float' lvalue vectorcomponent // CHECK-NEXT: MemberExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}, col:{{[0-9]+}}> 'float2':'vector<float, 2>' lvalue .f 0x{{[0-9a-fA-F]+}} // CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'struct S' lvalue Var 0x{{[0-9a-fA-F]+}} 's' 'struct S' // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <<invalid sloc>> '__size_t':'unsigned long' 1 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}, col:{{[0-9]+}}> 'float' <LValueToRValue> // CHECK-NEXT: MemberExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}, col:{{[0-9]+}}> 'float' lvalue .a 0x{{[0-9a-fA-F]+}} -// CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'struct S' lvalue Var 0x{{[0-9a-fA-F]+}} 's' 'struct S' -// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}, col:{{[0-9]+}}> 'float' <LValueToRValue> -// CHECK-NEXT: MemberExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}, col:{{[0-9]+}}> 'float' lvalue .a 0x{{[0-9a-fA-F]+}} // CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'struct S' lvalue Var 0x{{[0-9a-fA-F]+}} 's' 'struct S' struct S { float2 f; float a;} s; float2x2 J = float2x2(s.f, s.a, s.a); @@ -317,8 +316,8 @@ float2x3 G = float2x3(float2x2(1,2,3,4), 5, 6); // CHECK-NEXT: CXXFunctionalCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}, col:{{[0-9]+}}> 'float2x2':'matrix<float, 2, 2>' functional cast to float2x2 <NoOp> // CHECK-NEXT: InitListExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}, col:{{[0-9]+}}> 'float2x2':'matrix<float, 2, 2>' // CHECK-NEXT: FloatingLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' 1.000000e+00 -// CHECK-NEXT: FloatingLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' 2.000000e+00 // 
CHECK-NEXT: FloatingLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' 3.000000e+00 +// CHECK-NEXT: FloatingLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' 2.000000e+00 // CHECK-NEXT: FloatingLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' 4.000000e+00 typedef float2x2 second_level_of_typedefs; second_level_of_typedefs L = float2x2(1.0f, 2.0f, 3.0f, 4.0f); @@ -327,8 +326,8 @@ float2x3 G = float2x3(float2x2(1,2,3,4), 5, 6); // CHECK-NEXT: CXXFunctionalCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}, col:{{[0-9]+}}> 'second_level_of_typedefs':'matrix<float, 2, 2>' functional cast to second_level_of_typedefs <NoOp> // CHECK-NEXT: InitListExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}, col:{{[0-9]+}}> 'second_level_of_typedefs':'matrix<float, 2, 2>' // CHECK-NEXT: FloatingLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' 1.000000e+00 -// CHECK-NEXT: FloatingLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' 2.000000e+00 // CHECK-NEXT: FloatingLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' 3.000000e+00 +// CHECK-NEXT: FloatingLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' 2.000000e+00 // CHECK-NEXT: FloatingLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' 4.000000e+00 float2x2 M = second_level_of_typedefs(1.0f, 2.0f, 3.0f, 4.0f); @@ -367,12 +366,12 @@ float2x1 GettingStrange = float2x1(s2, s2); // CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' lvalue vectorcomponent // CHECK-NEXT: MemberExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float2':'vector<float, 2>' lvalue .f 0x{{[0-9a-fA-F]+}} // CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'S3' lvalue Var 0x{{[0-9a-fA-F]+}} 's3' 'S3' -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <<invalid sloc>> '__size_t':'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <<invalid sloc>> '__size_t':'unsigned long' 0 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' <LValueToRValue> // CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' lvalue vectorcomponent // CHECK-NEXT: MemberExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float2':'vector<float, 2>' lvalue .f 0x{{[0-9a-fA-F]+}} // CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'S3' lvalue Var 0x{{[0-9a-fA-F]+}} 's3' 'S3' -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <<invalid sloc>> '__size_t':'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <<invalid sloc>> '__size_t':'unsigned long' 1 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' <LValueToRValue> // CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' lvalue vectorcomponent // CHECK-NEXT: MemberExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float2':'vector<float, 2>' lvalue .f 0x{{[0-9a-fA-F]+}} diff --git a/clang/test/AST/HLSL/matrix-general-initializer.hlsl b/clang/test/AST/HLSL/matrix-general-initializer.hlsl index 14c950acb7baf..1a631113eb0f0 100644 --- a/clang/test/AST/HLSL/matrix-general-initializer.hlsl +++ b/clang/test/AST/HLSL/matrix-general-initializer.hlsl @@ -26,14 +26,6 @@ void ok() { // CHECK-NEXT: ExtVectorElementExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}, col:{{[0-9]+}}> 'vector<int, 3>' xxx // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'vector<int, 1>' <VectorSplat> // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 1 -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <<invalid sloc>> '__size_t':'unsigned long' 1 -// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}, col:{{[0-9]+}}> 'float' 
<IntegralToFloating> -// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}, col:{{[0-9]+}}> 'int' <LValueToRValue> -// CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}, col:{{[0-9]+}}> 'int' xvalue vectorcomponent -// CHECK-NEXT: MaterializeTemporaryExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}, col:{{[0-9]+}}> 'vector<int, 3>' xvalue -// CHECK-NEXT: ExtVectorElementExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}, col:{{[0-9]+}}> 'vector<int, 3>' xxx -// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'vector<int, 1>' <VectorSplat> -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 1 // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <<invalid sloc>> '__size_t':'unsigned long' 2 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}, col:{{[0-9]+}}> 'float' <IntegralToFloating> // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}, col:{{[0-9]+}}> 'int' <LValueToRValue> @@ -42,20 +34,8 @@ void ok() { // CHECK-NEXT: ExtVectorElementExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}, col:{{[0-9]+}}> 'vector<int, 2>' xx // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'vector<int, 1>' <VectorSplat> // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 2 -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <<invalid sloc>> '__size_t':'unsigned long' 0 -// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}, col:{{[0-9]+}}> 'float' <IntegralToFloating> -// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}, col:{{[0-9]+}}> 'int' <LValueToRValue> -// CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}, col:{{[0-9]+}}> 'int' xvalue vectorcomponent -// CHECK-NEXT: MaterializeTemporaryExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}, col:{{[0-9]+}}> 'vector<int, 2>' xvalue -// CHECK-NEXT: ExtVectorElementExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}, col:{{[0-9]+}}> 'vector<int, 2>' xx -// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'vector<int, 1>' <VectorSplat> -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 2 // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <<invalid sloc>> '__size_t':'unsigned long' 1 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}, col:{{[0-9]+}}> 'float' <IntegralToFloating> -// CHECK-NEXT: ExtVectorElementExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}, col:{{[0-9]+}}> 'int' x -// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'vector<int, 1>' <VectorSplat> -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 3 -// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}, col:{{[0-9]+}}> 'float' <IntegralToFloating> // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}, col:{{[0-9]+}}> 'int' <LValueToRValue> // CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}, col:{{[0-9]+}}> 'int' xvalue vectorcomponent // CHECK-NEXT: MaterializeTemporaryExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}, col:{{[0-9]+}}> 'vector<int, 2>' xvalue @@ -66,10 +46,30 @@ void ok() { // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}, col:{{[0-9]+}}> 'float' <IntegralToFloating> // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}, col:{{[0-9]+}}> 'int' <LValueToRValue> // CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}, col:{{[0-9]+}}> 'int' xvalue vectorcomponent -// CHECK-NEXT: MaterializeTemporaryExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}, col:{{[0-9]+}}> 'vector<int, 2>' xvalue -// CHECK-NEXT: 
ExtVectorElementExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}, col:{{[0-9]+}}> 'vector<int, 2>' xx +// CHECK-NEXT: MaterializeTemporaryExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}, col:{{[0-9]+}}> 'vector<int, 3>' xvalue +// CHECK-NEXT: ExtVectorElementExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}, col:{{[0-9]+}}> 'vector<int, 3>' xxx // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'vector<int, 1>' <VectorSplat> -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 4 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 1 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <<invalid sloc>> '__size_t':'unsigned long' 1 +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}, col:{{[0-9]+}}> 'float' <IntegralToFloating> +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9a-fA-F]+}}, col:{{[0-9a-fA-F]+}}> 'int' <LValueToRValue> +// CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9a-fA-F]+}}, col:{{[0-9a-fA-F]+}}> 'int' xvalue vectorcomponent +// CHECK-NEXT: MaterializeTemporaryExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9a-fA-F]+}}, col:{{[0-9a-fA-F]+}}> 'vector<int, 2>' xvalue +// CHECK-NEXT: ExtVectorElementExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9a-fA-F]+}}, col:{{[0-9a-fA-F]+}}> 'vector<int, 2>' xx +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9a-fA-F]+}}> 'vector<int, 1>' <VectorSplat> +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9a-fA-F]+}}> 'int' 2 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <<invalid sloc>> '__size_t':'unsigned long' 0 +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9a-fA-F]+}}, col:{{[0-9a-fA-F]+}}> 'float' <IntegralToFloating> +// CHECK-NEXT: ExtVectorElementExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9a-fA-F]+}}, col:{{[0-9a-fA-F]+}}> 'int' x +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9a-fA-F]+}}> 'vector<int, 1>' <VectorSplat> +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9a-fA-F]+}}> 'int' 3 +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9a-fA-F]+}}, col:{{[0-9a-fA-F]+}}> 'float' <IntegralToFloating> +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9a-fA-F]+}}, col:{{[0-9a-fA-F]+}}> 'int' <LValueToRValue> +// CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9a-fA-F]+}}, col:{{[0-9a-fA-F]+}}> 'int' xvalue vectorcomponent +// CHECK-NEXT: MaterializeTemporaryExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9a-fA-F]+}}, col:{{[0-9a-fA-F]+}}> 'vector<int, 2>' xvalue +// CHECK-NEXT: ExtVectorElementExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9a-fA-F]+}}, col:{{[0-9a-fA-F]+}}> 'vector<int, 2>' xx +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9a-fA-F]+}}> 'vector<int, 1>' <VectorSplat> +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9a-fA-F]+}}> 'int' 4 // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <<invalid sloc>> '__size_t':'unsigned long' 1 float4x2 m = {1.xxx, 2.xx, 3.x, 4.xx}; @@ -84,12 +84,12 @@ float4x2 m = {1.xxx, 2.xx, 3.x, 4.xx}; // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' <LValueToRValue> // CHECK-NEXT: MatrixSubscriptExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' lvalue matrixcomponent // CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float4x2':'matrix<float, 4, 2>' lvalue Var 0x{{[0-9a-fA-F]+}} 'm' 'float4x2':'matrix<float, 4, 2>' -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 1 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 2 // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 
<col:{{[0-9]+}}> 'int' 0 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' <LValueToRValue> // CHECK-NEXT: MatrixSubscriptExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' lvalue matrixcomponent // CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float4x2':'matrix<float, 4, 2>' lvalue Var 0x{{[0-9a-fA-F]+}} 'm' 'float4x2':'matrix<float, 4, 2>' -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 2 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 1 // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 0 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' <LValueToRValue> // CHECK-NEXT: MatrixSubscriptExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' lvalue matrixcomponent @@ -105,12 +105,12 @@ float4x2 m = {1.xxx, 2.xx, 3.x, 4.xx}; // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' <LValueToRValue> // CHECK-NEXT: MatrixSubscriptExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' lvalue matrixcomponent // CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float4x2':'matrix<float, 4, 2>' lvalue Var 0x{{[0-9a-fA-F]+}} 'm' 'float4x2':'matrix<float, 4, 2>' -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 1 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 2 // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 1 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' <LValueToRValue> // CHECK-NEXT: MatrixSubscriptExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' lvalue matrixcomponent // CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float4x2':'matrix<float, 4, 2>' lvalue Var 0x{{[0-9a-fA-F]+}} 'm' 'float4x2':'matrix<float, 4, 2>' -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 2 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 1 // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 1 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' <LValueToRValue> // CHECK-NEXT: MatrixSubscriptExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' lvalue matrixcomponent @@ -138,7 +138,7 @@ S s = {m}; // CHECK-NEXT: ExtVectorElementExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}, col:{{[0-9]+}}> 'vector<int, 4>' xxxx // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'vector<int, 1>' <VectorSplat> // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 0 -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <<invalid sloc>> '__size_t':'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <<invalid sloc>> '__size_t':'unsigned long' 2 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}, col:{{[0-9]+}}> 'float' <IntegralToFloating> // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}, col:{{[0-9]+}}> 'int' <LValueToRValue> // CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}, col:{{[0-9]+}}> 'int' xvalue vectorcomponent @@ -146,7 +146,7 @@ S s = {m}; // CHECK-NEXT: ExtVectorElementExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}, col:{{[0-9]+}}> 'vector<int, 4>' xxxx // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'vector<int, 1>' <VectorSplat> // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 0 -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <<invalid sloc>> '__size_t':'unsigned long' 2 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <<invalid sloc>> 
'__size_t':'unsigned long' 1 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}, col:{{[0-9]+}}> 'float' <IntegralToFloating> // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}, col:{{[0-9]+}}> 'int' <LValueToRValue> // CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}, col:{{[0-9]+}}> 'int' xvalue vectorcomponent @@ -169,25 +169,25 @@ float2x2 m2 = {0.xxxx}; // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' <LValueToRValue> // CHECK-NEXT: MatrixSubscriptExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' lvalue matrixcomponent // CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float2x2':'matrix<float, 2, 2>' lvalue Var 0x{{[0-9a-fA-F]+}} 'm2' 'float2x2':'matrix<float, 2, 2>' -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 1 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 0 // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 0 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' <FloatingToIntegral> // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' <LValueToRValue> // CHECK-NEXT: MatrixSubscriptExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' lvalue matrixcomponent // CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float2x2':'matrix<float, 2, 2>' lvalue Var 0x{{[0-9a-fA-F]+}} 'm2' 'float2x2':'matrix<float, 2, 2>' // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 0 -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 1 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 0 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' <FloatingToIntegral> // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' <LValueToRValue> // CHECK-NEXT: MatrixSubscriptExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' lvalue matrixcomponent // CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float2x2':'matrix<float, 2, 2>' lvalue Var 0x{{[0-9a-fA-F]+}} 'm2' 'float2x2':'matrix<float, 2, 2>' -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 1 -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 1 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 0 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 0 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' <FloatingToIntegral> // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' <LValueToRValue> // CHECK-NEXT: MatrixSubscriptExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' lvalue matrixcomponent // CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float2x2':'matrix<float, 2, 2>' lvalue Var 0x{{[0-9a-fA-F]+}} 'm2' 'float2x2':'matrix<float, 2, 2>' -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 0 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 1 // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 0 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' <FloatingToIntegral> // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' <LValueToRValue> @@ -199,26 +199,26 @@ float2x2 m2 = {0.xxxx}; // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' <LValueToRValue> // CHECK-NEXT: MatrixSubscriptExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' lvalue matrixcomponent // CHECK-NEXT: 
DeclRefExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float2x2':'matrix<float, 2, 2>' lvalue Var 0x{{[0-9a-fA-F]+}} 'm2' 'float2x2':'matrix<float, 2, 2>' -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 0 // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 1 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 0 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' <FloatingToIntegral> // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' <LValueToRValue> // CHECK-NEXT: MatrixSubscriptExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' lvalue matrixcomponent // CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float2x2':'matrix<float, 2, 2>' lvalue Var 0x{{[0-9a-fA-F]+}} 'm2' 'float2x2':'matrix<float, 2, 2>' // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 1 -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 1 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 0 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' <FloatingToIntegral> // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' <LValueToRValue> // CHECK-NEXT: MatrixSubscriptExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' lvalue matrixcomponent // CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float2x2':'matrix<float, 2, 2>' lvalue Var 0x{{[0-9a-fA-F]+}} 'm2' 'float2x2':'matrix<float, 2, 2>' // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 0 -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 0 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 1 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' <FloatingToIntegral> // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' <LValueToRValue> // CHECK-NEXT: MatrixSubscriptExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' lvalue matrixcomponent // CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float2x2':'matrix<float, 2, 2>' lvalue Var 0x{{[0-9a-fA-F]+}} 'm2' 'float2x2':'matrix<float, 2, 2>' -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 1 // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 0 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 1 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' <FloatingToIntegral> // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' <LValueToRValue> // CHECK-NEXT: MatrixSubscriptExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' lvalue matrixcomponent @@ -229,25 +229,25 @@ float2x2 m2 = {0.xxxx}; // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' <LValueToRValue> // CHECK-NEXT: MatrixSubscriptExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' lvalue matrixcomponent // CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float2x2':'matrix<float, 2, 2>' lvalue Var 0x{{[0-9a-fA-F]+}} 'm2' 'float2x2':'matrix<float, 2, 2>' -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 1 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 0 // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 1 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' <FloatingToIntegral> // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' <LValueToRValue> // CHECK-NEXT: MatrixSubscriptExpr 
0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' lvalue matrixcomponent // CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float2x2':'matrix<float, 2, 2>' lvalue Var 0x{{[0-9a-fA-F]+}} 'm2' 'float2x2':'matrix<float, 2, 2>' -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 0 -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 0 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 1 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 1 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' <FloatingToIntegral> // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' <LValueToRValue> // CHECK-NEXT: MatrixSubscriptExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' lvalue matrixcomponent // CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float2x2':'matrix<float, 2, 2>' lvalue Var 0x{{[0-9a-fA-F]+}} 'm2' 'float2x2':'matrix<float, 2, 2>' // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 1 -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 0 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 1 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' <FloatingToIntegral> // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' <LValueToRValue> // CHECK-NEXT: MatrixSubscriptExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' lvalue matrixcomponent // CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float2x2':'matrix<float, 2, 2>' lvalue Var 0x{{[0-9a-fA-F]+}} 'm2' 'float2x2':'matrix<float, 2, 2>' -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 0 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 1 // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' 1 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'int' <FloatingToIntegral> // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} <col:{{[0-9]+}}> 'float' <LValueToRValue> diff --git a/clang/test/CodeGenHLSL/BasicFeatures/MatrixConstructor.hlsl b/clang/test/CodeGenHLSL/BasicFeatures/MatrixConstructor.hlsl new file mode 100644 index 0000000000000..a7c01015b2015 --- /dev/null +++ b/clang/test/CodeGenHLSL/BasicFeatures/MatrixConstructor.hlsl @@ -0,0 +1,95 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6 +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-library -disable-llvm-passes -emit-llvm -finclude-default-header -o - %s | FileCheck %s + +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <6 x float> @_Z5case1v( +// CHECK-SAME: ) #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: ret <6 x float> <float 0.000000e+00, float 2.000000e+00, float 4.000000e+00, float 1.000000e+00, float 3.000000e+00, float 5.000000e+00> +// +float3x2 case1() { + // vec[0] = 0 + // vec[1] = 2 + // vec[2] = 4 + // vec[3] = 1 + // vec[4] = 3 + // vec[5] = 5 + return float3x2(0, 1, + 2, 3, + 4, 5); +} + + +RWStructuredBuffer<float> In; + +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <6 x float> @_Z5case2v( +// CHECK-SAME: ) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CALL:%.*]] = call noundef nonnull align 4 dereferenceable(4) ptr @_ZN4hlsl18RWStructuredBufferIfEixEj(ptr noundef nonnull align 4 dereferenceable(8) @_ZL2In, i32 noundef 0) #[[ATTR3:[0-9]+]] +// CHECK-NEXT: [[CALL1:%.*]] = call noundef nonnull align 4 dereferenceable(4) ptr 
@_ZN4hlsl18RWStructuredBufferIfEixEj(ptr noundef nonnull align 4 dereferenceable(8) @_ZL2In, i32 noundef 1) #[[ATTR3]] +// CHECK-NEXT: [[CALL2:%.*]] = call noundef nonnull align 4 dereferenceable(4) ptr @_ZN4hlsl18RWStructuredBufferIfEixEj(ptr noundef nonnull align 4 dereferenceable(8) @_ZL2In, i32 noundef 2) #[[ATTR3]] +// CHECK-NEXT: [[CALL3:%.*]] = call noundef nonnull align 4 dereferenceable(4) ptr @_ZN4hlsl18RWStructuredBufferIfEixEj(ptr noundef nonnull align 4 dereferenceable(8) @_ZL2In, i32 noundef 3) #[[ATTR3]] +// CHECK-NEXT: [[CALL4:%.*]] = call noundef nonnull align 4 dereferenceable(4) ptr @_ZN4hlsl18RWStructuredBufferIfEixEj(ptr noundef nonnull align 4 dereferenceable(8) @_ZL2In, i32 noundef 4) #[[ATTR3]] +// CHECK-NEXT: [[CALL5:%.*]] = call noundef nonnull align 4 dereferenceable(4) ptr @_ZN4hlsl18RWStructuredBufferIfEixEj(ptr noundef nonnull align 4 dereferenceable(8) @_ZL2In, i32 noundef 5) #[[ATTR3]] +// CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[CALL]], align 4 +// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <6 x float> poison, float [[TMP0]], i32 0 +// CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[CALL2]], align 4 +// CHECK-NEXT: [[VECINIT6:%.*]] = insertelement <6 x float> [[VECINIT]], float [[TMP1]], i32 1 +// CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[CALL4]], align 4 +// CHECK-NEXT: [[VECINIT7:%.*]] = insertelement <6 x float> [[VECINIT6]], float [[TMP2]], i32 2 +// CHECK-NEXT: [[TMP3:%.*]] = load float, ptr [[CALL1]], align 4 +// CHECK-NEXT: [[VECINIT8:%.*]] = insertelement <6 x float> [[VECINIT7]], float [[TMP3]], i32 3 +// CHECK-NEXT: [[TMP4:%.*]] = load float, ptr [[CALL3]], align 4 +// CHECK-NEXT: [[VECINIT9:%.*]] = insertelement <6 x float> [[VECINIT8]], float [[TMP4]], i32 4 +// CHECK-NEXT: [[TMP5:%.*]] = load float, ptr [[CALL5]], align 4 +// CHECK-NEXT: [[VECINIT10:%.*]] = insertelement <6 x float> [[VECINIT9]], float [[TMP5]], i32 5 +// CHECK-NEXT: ret <6 x float> [[VECINIT10]] +// +float3x2 case2() { + // vec[0] = Call + // vec[1] = Call2 + // vec[2] = Call4 + // vec[3] = Call1 + // vec[4] = Call3 + // vec[5] = Call5 + return float3x2(In[0], In[1], + In[2], In[3], + In[4], In[5]); +} + + +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <6 x float> @_Z5case3Dv3_fS_( +// CHECK-SAME: <3 x float> noundef nofpclass(nan inf) [[A:%.*]], <3 x float> noundef nofpclass(nan inf) [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <3 x float>, align 16 +// CHECK-NEXT: [[B_ADDR:%.*]] = alloca <3 x float>, align 16 +// CHECK-NEXT: store <3 x float> [[A]], ptr [[A_ADDR]], align 16 +// CHECK-NEXT: store <3 x float> [[B]], ptr [[B_ADDR]], align 16 +// CHECK-NEXT: [[TMP0:%.*]] = load <3 x float>, ptr [[A_ADDR]], align 16 +// CHECK-NEXT: [[VECEXT:%.*]] = extractelement <3 x float> [[TMP0]], i64 0 +// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <6 x float> poison, float [[VECEXT]], i32 0 +// CHECK-NEXT: [[TMP1:%.*]] = load <3 x float>, ptr [[A_ADDR]], align 16 +// CHECK-NEXT: [[VECEXT1:%.*]] = extractelement <3 x float> [[TMP1]], i64 2 +// CHECK-NEXT: [[VECINIT2:%.*]] = insertelement <6 x float> [[VECINIT]], float [[VECEXT1]], i32 1 +// CHECK-NEXT: [[TMP2:%.*]] = load <3 x float>, ptr [[B_ADDR]], align 16 +// CHECK-NEXT: [[VECEXT3:%.*]] = extractelement <3 x float> [[TMP2]], i64 1 +// CHECK-NEXT: [[VECINIT4:%.*]] = insertelement <6 x float> [[VECINIT2]], float [[VECEXT3]], i32 2 +// CHECK-NEXT: [[TMP3:%.*]] = load <3 x float>, ptr [[A_ADDR]], align 16 +// CHECK-NEXT: [[VECEXT5:%.*]] = extractelement <3 x float> [[TMP3]], 
i64 1 +// CHECK-NEXT: [[VECINIT6:%.*]] = insertelement <6 x float> [[VECINIT4]], float [[VECEXT5]], i32 3 +// CHECK-NEXT: [[TMP4:%.*]] = load <3 x float>, ptr [[B_ADDR]], align 16 +// CHECK-NEXT: [[VECEXT7:%.*]] = extractelement <3 x float> [[TMP4]], i64 0 +// CHECK-NEXT: [[VECINIT8:%.*]] = insertelement <6 x float> [[VECINIT6]], float [[VECEXT7]], i32 4 +// CHECK-NEXT: [[TMP5:%.*]] = load <3 x float>, ptr [[B_ADDR]], align 16 +// CHECK-NEXT: [[VECEXT9:%.*]] = extractelement <3 x float> [[TMP5]], i64 2 +// CHECK-NEXT: [[VECINIT10:%.*]] = insertelement <6 x float> [[VECINIT8]], float [[VECEXT9]], i32 5 +// CHECK-NEXT: ret <6 x float> [[VECINIT10]] +// +float3x2 case3(float3 a, float3 b) { + // vec[0] = A[0] + // vec[1] = A[2] + // vec[2] = B[1] + // vec[3] = A[1] + // vec[4] = B[0] + // vec[5] = B[2] + return float3x2(a,b); +} From 0c73009236384033fc65be4fefb3c3b9b0d19c77 Mon Sep 17 00:00:00 2001 From: Sergei Barannikov <barannikov88@gmail.com> Date: Wed, 5 Nov 2025 06:24:53 +0300 Subject: [PATCH 270/313] [WebAssembly] TableGen-erate SDNode descriptions (#166259) This allows SDNodes to be validated against their expected type profiles and reduces the number of changes required to add a new node. CALL and RET_CALL do not have a description in td files, and it is not currently possible to add one as these nodes have both variable operands and variable results. This also fixes a subtle bug detected by the enabled verification functionality. `LOCAL_GET` is declared with `SDNPHasChain` property, and thus should have both a chain operand and a chain result. The original code created a node without a chain result, which caused a check in `SDNodeInfo::verifyNode()` to fail. Part of #119709. Pull Request: https://github.com/llvm/llvm-project/pull/166259 --- llvm/lib/Target/WebAssembly/CMakeLists.txt | 1 + .../lib/Target/WebAssembly/WebAssemblyISD.def | 64 ------------------- .../WebAssembly/WebAssemblyISelLowering.cpp | 21 +----- .../WebAssembly/WebAssemblyISelLowering.h | 12 ---- .../WebAssemblySelectionDAGInfo.cpp | 24 ++++--- .../WebAssembly/WebAssemblySelectionDAGInfo.h | 17 ++++- 6 files changed, 34 insertions(+), 105 deletions(-) delete mode 100644 llvm/lib/Target/WebAssembly/WebAssemblyISD.def diff --git a/llvm/lib/Target/WebAssembly/CMakeLists.txt b/llvm/lib/Target/WebAssembly/CMakeLists.txt index 1e83cbeac50d6..17df119d62709 100644 --- a/llvm/lib/Target/WebAssembly/CMakeLists.txt +++ b/llvm/lib/Target/WebAssembly/CMakeLists.txt @@ -10,6 +10,7 @@ tablegen(LLVM WebAssemblyGenFastISel.inc -gen-fast-isel) tablegen(LLVM WebAssemblyGenInstrInfo.inc -gen-instr-info) tablegen(LLVM WebAssemblyGenMCCodeEmitter.inc -gen-emitter) tablegen(LLVM WebAssemblyGenRegisterInfo.inc -gen-register-info) +tablegen(LLVM WebAssemblyGenSDNodeInfo.inc -gen-sd-node-info) tablegen(LLVM WebAssemblyGenSubtargetInfo.inc -gen-subtarget) add_public_tablegen_target(WebAssemblyCommonTableGen) diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISD.def b/llvm/lib/Target/WebAssembly/WebAssemblyISD.def deleted file mode 100644 index 23108e429eda8..0000000000000 --- a/llvm/lib/Target/WebAssembly/WebAssemblyISD.def +++ /dev/null @@ -1,64 +0,0 @@ -//- WebAssemblyISD.def - WebAssembly ISD ---------------------------*- C++ -*-// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -/// -/// \file -/// This file describes the various WebAssembly ISD node types. -/// -//===----------------------------------------------------------------------===// - -// NOTE: NO INCLUDE GUARD DESIRED! - -HANDLE_NODETYPE(CALL) -HANDLE_NODETYPE(RET_CALL) -HANDLE_NODETYPE(RETURN) -HANDLE_NODETYPE(ARGUMENT) -HANDLE_NODETYPE(LOCAL_GET) -HANDLE_NODETYPE(LOCAL_SET) -// A wrapper node for TargetExternalSymbol, TargetGlobalAddress, and MCSymbol -HANDLE_NODETYPE(Wrapper) -// A special node for TargetGlobalAddress used in PIC code for -// __memory_base/__table_base relative access. -HANDLE_NODETYPE(WrapperREL) -HANDLE_NODETYPE(BR_IF) -HANDLE_NODETYPE(BR_TABLE) -HANDLE_NODETYPE(DOT) -HANDLE_NODETYPE(EXT_ADD_PAIRWISE_U) -HANDLE_NODETYPE(EXT_ADD_PAIRWISE_S) -HANDLE_NODETYPE(SHUFFLE) -HANDLE_NODETYPE(SWIZZLE) -HANDLE_NODETYPE(VEC_SHL) -HANDLE_NODETYPE(VEC_SHR_S) -HANDLE_NODETYPE(VEC_SHR_U) -HANDLE_NODETYPE(NARROW_U) -HANDLE_NODETYPE(EXTEND_LOW_S) -HANDLE_NODETYPE(EXTEND_LOW_U) -HANDLE_NODETYPE(EXTEND_HIGH_S) -HANDLE_NODETYPE(EXTEND_HIGH_U) -HANDLE_NODETYPE(CONVERT_LOW_S) -HANDLE_NODETYPE(CONVERT_LOW_U) -HANDLE_NODETYPE(PROMOTE_LOW) -HANDLE_NODETYPE(TRUNC_SAT_ZERO_S) -HANDLE_NODETYPE(TRUNC_SAT_ZERO_U) -HANDLE_NODETYPE(DEMOTE_ZERO) -HANDLE_NODETYPE(I64_ADD128) -HANDLE_NODETYPE(I64_SUB128) -HANDLE_NODETYPE(I64_MUL_WIDE_S) -HANDLE_NODETYPE(I64_MUL_WIDE_U) - -// Memory intrinsics -HANDLE_NODETYPE(GLOBAL_GET) -HANDLE_NODETYPE(GLOBAL_SET) -HANDLE_NODETYPE(TABLE_GET) -HANDLE_NODETYPE(TABLE_SET) - -// Bulk memory instructions. These follow LLVM's expected semantics of -// supporting out-of-bounds pointers if the length is zero, by inserting -// a branch around Wasm's `memory.copy` and `memory.fill`, which would -// otherwise trap. 
-HANDLE_NODETYPE(MEMCPY) -HANDLE_NODETYPE(MEMSET) diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp index 7ec463bdc3b84..af322982d5355 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp @@ -942,20 +942,6 @@ MachineBasicBlock *WebAssemblyTargetLowering::EmitInstrWithCustomInserter( } } -const char * -WebAssemblyTargetLowering::getTargetNodeName(unsigned Opcode) const { - switch (static_cast<WebAssemblyISD::NodeType>(Opcode)) { - case WebAssemblyISD::FIRST_NUMBER: - break; -#define HANDLE_NODETYPE(NODE) \ - case WebAssemblyISD::NODE: \ - return "WebAssemblyISD::" #NODE; -#include "WebAssemblyISD.def" -#undef HANDLE_NODETYPE - } - return nullptr; -} - std::pair<unsigned, const TargetRegisterClass *> WebAssemblyTargetLowering::getRegForInlineAsmConstraint( const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const { @@ -1830,11 +1816,8 @@ SDValue WebAssemblyTargetLowering::LowerLoad(SDValue Op, SDValue Idx = DAG.getTargetConstant(*Local, Base, MVT::i32); EVT LocalVT = LN->getValueType(0); - SDValue LocalGet = DAG.getNode(WebAssemblyISD::LOCAL_GET, DL, LocalVT, - {LN->getChain(), Idx}); - SDValue Result = DAG.getMergeValues({LocalGet, LN->getChain()}, DL); - assert(Result->getNumValues() == 2 && "Loads must carry a chain!"); - return Result; + return DAG.getNode(WebAssemblyISD::LOCAL_GET, DL, {LocalVT, MVT::Other}, + {LN->getChain(), Idx}); } if (WebAssembly::isWasmVarAddressSpace(LN->getAddressSpace())) diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h index 472ec678534a4..f7052989b3c75 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h +++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h @@ -19,17 +19,6 @@ namespace llvm { -namespace WebAssemblyISD { - -enum NodeType : unsigned { - FIRST_NUMBER = ISD::BUILTIN_OP_END, -#define HANDLE_NODETYPE(NODE) NODE, -#include "WebAssemblyISD.def" -#undef HANDLE_NODETYPE -}; - -} // end namespace WebAssemblyISD - class WebAssemblySubtarget; class WebAssemblyTargetLowering final : public TargetLowering { @@ -53,7 +42,6 @@ class WebAssemblyTargetLowering final : public TargetLowering { MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const override; - const char *getTargetNodeName(unsigned Opcode) const override; std::pair<unsigned, const TargetRegisterClass *> getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override; diff --git a/llvm/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.cpp b/llvm/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.cpp index 2673c81eae40b..cf5cc41ea565b 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.cpp @@ -11,23 +11,31 @@ /// //===----------------------------------------------------------------------===// +#include "WebAssemblySelectionDAGInfo.h" #include "WebAssemblyTargetMachine.h" + +#define GET_SDNODE_DESC +#include "WebAssemblyGenSDNodeInfo.inc" + using namespace llvm; #define DEBUG_TYPE "wasm-selectiondag-info" +WebAssemblySelectionDAGInfo::WebAssemblySelectionDAGInfo() + : SelectionDAGGenTargetInfo(WebAssemblyGenSDNodeInfo) {} + WebAssemblySelectionDAGInfo::~WebAssemblySelectionDAGInfo() = default; // anchor -bool WebAssemblySelectionDAGInfo::isTargetMemoryOpcode(unsigned Opcode) const { 
+const char * +WebAssemblySelectionDAGInfo::getTargetNodeName(unsigned Opcode) const { switch (static_cast<WebAssemblyISD::NodeType>(Opcode)) { - default: - return false; - case WebAssemblyISD::GLOBAL_GET: - case WebAssemblyISD::GLOBAL_SET: - case WebAssemblyISD::TABLE_GET: - case WebAssemblyISD::TABLE_SET: - return true; + case WebAssemblyISD::CALL: + return "WebAssemblyISD::CALL"; + case WebAssemblyISD::RET_CALL: + return "WebAssemblyISD::RET_CALL"; } + + return SelectionDAGGenTargetInfo::getTargetNodeName(Opcode); } SDValue WebAssemblySelectionDAGInfo::EmitTargetCodeForMemcpy( diff --git a/llvm/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.h b/llvm/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.h index 69c9af0966308..8775f4946d88d 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.h +++ b/llvm/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.h @@ -17,13 +17,26 @@ #include "llvm/CodeGen/SelectionDAGTargetInfo.h" +#define GET_SDNODE_ENUM +#include "WebAssemblyGenSDNodeInfo.inc" + namespace llvm { +namespace WebAssemblyISD { + +enum NodeType : unsigned { + CALL = GENERATED_OPCODE_END, + RET_CALL, +}; -class WebAssemblySelectionDAGInfo final : public SelectionDAGTargetInfo { +} // namespace WebAssemblyISD + +class WebAssemblySelectionDAGInfo final : public SelectionDAGGenTargetInfo { public: + WebAssemblySelectionDAGInfo(); + ~WebAssemblySelectionDAGInfo() override; - bool isTargetMemoryOpcode(unsigned Opcode) const override; + const char *getTargetNodeName(unsigned Opcode) const override; SDValue EmitTargetCodeForMemcpy(SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Op1, SDValue Op2, From 952d4b4c0bc959afe6bf18a7550fab024ab5a9b8 Mon Sep 17 00:00:00 2001 From: Morris Hafner <mmha@users.noreply.github.com> Date: Wed, 5 Nov 2025 11:35:11 +0800 Subject: [PATCH 271/313] [CIR] Fix assignment ignore in ScalarExprEmitter (#166118) We are missing a couple of cases where we are not supposed to ignore assignment results but did so, which results in compiler crashes. Fix that. Also start ignoring IgnoredExprs unless there are side effects (assignments) inside.
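For orientation, a minimal C sketch of the cases in question follows (it mirrors the new binassign.c test added below); the function name is illustrative only and the snippet is not part of the patch itself:

    /* Sketch only: each comment marks whether an assignment's result is
       consumed (must not be ignored) or discarded (may be ignored). */
    int sketch_ignore_result_cases(void) {
      int arr[10] = {0}; /* zero-initialized here, unlike the test, to keep
                            the sketch well-defined when arr[5] is read */
      int i, j;
      j = i = 123, 0;  /* `i = 123` feeds `j`; the comma expression's overall
                          result, and the trailing `0`, are discarded */
      j = arr[i = 5];  /* the result of `i = 5` feeds the array subscript */
      int *p, *q = 0;
      if ((p = q))     /* the result of `p = q` is the branch condition */
        return 1;
      return 0;
    }
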
--- clang/lib/CIR/CodeGen/CIRGenExpr.cpp | 2 +- clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp | 23 ++++- clang/lib/CIR/CodeGen/CIRGenFunction.h | 3 +- clang/test/CIR/CodeGen/binassign.c | 104 +++++++++++++++++++++ 4 files changed, 126 insertions(+), 6 deletions(-) diff --git a/clang/lib/CIR/CodeGen/CIRGenExpr.cpp b/clang/lib/CIR/CodeGen/CIRGenExpr.cpp index 5ccb431e626ae..4fb178df0e508 100644 --- a/clang/lib/CIR/CodeGen/CIRGenExpr.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenExpr.cpp @@ -1630,7 +1630,7 @@ RValue CIRGenFunction::emitAnyExpr(const Expr *e, AggValueSlot aggSlot, bool ignoreResult) { switch (CIRGenFunction::getEvaluationKind(e->getType())) { case cir::TEK_Scalar: - return RValue::get(emitScalarExpr(e)); + return RValue::get(emitScalarExpr(e, ignoreResult)); case cir::TEK_Complex: return RValue::getComplex(emitComplexExpr(e)); case cir::TEK_Aggregate: { diff --git a/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp b/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp index 119314fe27dce..5eba5ba6c3df1 100644 --- a/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp @@ -78,11 +78,15 @@ struct BinOpInfo { class ScalarExprEmitter : public StmtVisitor<ScalarExprEmitter, mlir::Value> { CIRGenFunction &cgf; CIRGenBuilderTy &builder; + // Unlike classic codegen we set this to false or use std::exchange to read + // the value instead of calling TestAndClearIgnoreResultAssign to make it + // explicit when the value is used bool ignoreResultAssign; public: - ScalarExprEmitter(CIRGenFunction &cgf, CIRGenBuilderTy &builder) - : cgf(cgf), builder(builder) {} + ScalarExprEmitter(CIRGenFunction &cgf, CIRGenBuilderTy &builder, + bool ignoreResultAssign = false) + : cgf(cgf), builder(builder), ignoreResultAssign(ignoreResultAssign) {} //===--------------------------------------------------------------------===// // Utilities @@ -221,6 +225,8 @@ class ScalarExprEmitter : public StmtVisitor<ScalarExprEmitter, mlir::Value> { } mlir::Value VisitArraySubscriptExpr(ArraySubscriptExpr *e) { + ignoreResultAssign = false; + if (e->getBase()->getType()->isVectorType()) { assert(!cir::MissingFeatures::scalableVectors()); @@ -839,6 +845,7 @@ class ScalarExprEmitter : public StmtVisitor<ScalarExprEmitter, mlir::Value> { BinOpInfo emitBinOps(const BinaryOperator *e, QualType promotionType = QualType()) { + ignoreResultAssign = false; BinOpInfo result; result.lhs = cgf.emitPromotedScalarExpr(e->getLHS(), promotionType); result.rhs = cgf.emitPromotedScalarExpr(e->getRHS(), promotionType); @@ -924,6 +931,7 @@ class ScalarExprEmitter : public StmtVisitor<ScalarExprEmitter, mlir::Value> { #undef HANDLEBINOP mlir::Value emitCmp(const BinaryOperator *e) { + ignoreResultAssign = false; const mlir::Location loc = cgf.getLoc(e->getExprLoc()); mlir::Value result; QualType lhsTy = e->getLHS()->getType(); @@ -1406,11 +1414,13 @@ CIRGenFunction::emitCompoundAssignmentLValue(const CompoundAssignOperator *e) { } /// Emit the computation of the specified expression of scalar type. 
-mlir::Value CIRGenFunction::emitScalarExpr(const Expr *e) { +mlir::Value CIRGenFunction::emitScalarExpr(const Expr *e, + bool ignoreResultAssign) { assert(e && hasScalarEvaluationKind(e->getType()) && "Invalid scalar expression to emit"); - return ScalarExprEmitter(*this, builder).Visit(const_cast<Expr *>(e)); + return ScalarExprEmitter(*this, builder, ignoreResultAssign) + .Visit(const_cast<Expr *>(e)); } mlir::Value CIRGenFunction::emitPromotedScalarExpr(const Expr *e, @@ -2054,6 +2064,11 @@ mlir::Value ScalarExprEmitter::VisitMemberExpr(MemberExpr *e) { mlir::Value ScalarExprEmitter::VisitInitListExpr(InitListExpr *e) { const unsigned numInitElements = e->getNumInits(); + [[maybe_unused]] const bool ignore = std::exchange(ignoreResultAssign, false); + assert((ignore == false || + (numInitElements == 0 && e->getType()->isVoidType())) && + "init list ignored"); + if (e->hadArrayRangeDesignator()) { cgf.cgm.errorNYI(e->getSourceRange(), "ArrayRangeDesignator"); return {}; diff --git a/clang/lib/CIR/CodeGen/CIRGenFunction.h b/clang/lib/CIR/CodeGen/CIRGenFunction.h index e5cecaa573a6e..dece642eb13b6 100644 --- a/clang/lib/CIR/CodeGen/CIRGenFunction.h +++ b/clang/lib/CIR/CodeGen/CIRGenFunction.h @@ -1501,7 +1501,8 @@ class CIRGenFunction : public CIRGenTypeCache { llvm::ArrayRef<mlir::Value> args = {}); /// Emit the computation of the specified expression of scalar type. - mlir::Value emitScalarExpr(const clang::Expr *e); + mlir::Value emitScalarExpr(const clang::Expr *e, + bool ignoreResultAssign = false); mlir::Value emitScalarPrePostIncDec(const UnaryOperator *e, LValue lv, cir::UnaryOpKind kind, bool isPre); diff --git a/clang/test/CIR/CodeGen/binassign.c b/clang/test/CIR/CodeGen/binassign.c index 44c54b4a2969a..4520063c56ee6 100644 --- a/clang/test/CIR/CodeGen/binassign.c +++ b/clang/test/CIR/CodeGen/binassign.c @@ -100,3 +100,107 @@ void binary_assign_struct() { // OGCG: call void @llvm.memcpy.p0.p0.i64(ptr align 4 %[[LS_PTR]], ptr align 4 @gs, i64 8, i1 false) // OGCG: call void @llvm.memcpy.p0.p0.i64(ptr align 4 %[[LSV_PTR]], ptr align 4 @gsv, i64 8, i1 true) // OGCG: ret void + +int ignore_result_assign() { + int arr[10]; + int i, j; + j = i = 123, 0; + j = arr[i = 5]; + int *p, *q = 0; + if(p = q) + return 1; + return 0; +} + +// CIR-LABEL: cir.func{{.*}} @ignore_result_assign() -> !s32i +// CIR: %[[RETVAL:.*]] = cir.alloca !s32i, !cir.ptr<!s32i>, ["__retval"] +// CIR: %[[ARR:.*]] = cir.alloca !cir.array<!s32i x 10>, !cir.ptr<!cir.array<!s32i x 10>>, ["arr"] +// CIR: %[[I:.*]] = cir.alloca !s32i, !cir.ptr<!s32i>, ["i"] +// CIR: %[[J:.*]] = cir.alloca !s32i, !cir.ptr<!s32i>, ["j"] +// CIR: %[[P:.*]] = cir.alloca !cir.ptr<!s32i>, !cir.ptr<!cir.ptr<!s32i>>, ["p"] +// CIR: %[[Q:.*]] = cir.alloca !cir.ptr<!s32i>, !cir.ptr<!cir.ptr<!s32i>>, ["q", init] +// CIR: %[[VAL_123:.*]] = cir.const #cir.int<123> : !s32i +// CIR: cir.store{{.*}} %[[VAL_123]], %[[I]] : !s32i, !cir.ptr<!s32i> +// CIR: cir.store{{.*}} %[[VAL_123]], %[[J]] : !s32i, !cir.ptr<!s32i> +// CIR: %[[VAL_0:.*]] = cir.const #cir.int<0> : !s32i +// CIR: %[[VAL_5:.*]] = cir.const #cir.int<5> : !s32i +// CIR: cir.store{{.*}} %[[VAL_5]], %[[I]] : !s32i, !cir.ptr<!s32i> +// CIR: %[[ARR_DECAY:.*]] = cir.cast array_to_ptrdecay %[[ARR]] : !cir.ptr<!cir.array<!s32i x 10>> -> !cir.ptr<!s32i> +// CIR: %[[ARR_ELEM:.*]] = cir.ptr_stride %[[ARR_DECAY]], %[[VAL_5]] : (!cir.ptr<!s32i>, !s32i) -> !cir.ptr<!s32i> +// CIR: %[[ARR_LOAD:.*]] = cir.load{{.*}} %[[ARR_ELEM]] : !cir.ptr<!s32i>, !s32i +// CIR: cir.store{{.*}} %[[ARR_LOAD]], %[[J]] : 
!s32i, !cir.ptr<!s32i> +// CIR: %[[NULL:.*]] = cir.const #cir.ptr<null> : !cir.ptr<!s32i> +// CIR: cir.store{{.*}} %[[NULL]], %[[Q]] : !cir.ptr<!s32i>, !cir.ptr<!cir.ptr<!s32i>> +// CIR: cir.scope { +// CIR: %[[Q_VAL:.*]] = cir.load{{.*}} %[[Q]] : !cir.ptr<!cir.ptr<!s32i>>, !cir.ptr<!s32i> +// CIR: cir.store{{.*}} %[[Q_VAL]], %[[P]] : !cir.ptr<!s32i>, !cir.ptr<!cir.ptr<!s32i>> +// CIR: %[[COND:.*]] = cir.cast ptr_to_bool %[[Q_VAL]] : !cir.ptr<!s32i> -> !cir.bool +// CIR: cir.if %[[COND]] { +// CIR: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i +// CIR: cir.store %[[ONE]], %[[RETVAL]] : !s32i, !cir.ptr<!s32i> +// CIR: %{{.*}} = cir.load %[[RETVAL]] : !cir.ptr<!s32i>, !s32i +// CIR: cir.return +// CIR: } +// CIR: } +// CIR: %[[ZERO:.*]] = cir.const #cir.int<0> : !s32i +// CIR: cir.store %[[ZERO]], %[[RETVAL]] : !s32i, !cir.ptr<!s32i> +// CIR: %{{.*}} = cir.load %[[RETVAL]] : !cir.ptr<!s32i>, !s32i +// CIR: cir.return + +// LLVM-LABEL: define {{.*}}i32 @ignore_result_assign() +// LLVM: %[[RETVAL_PTR:.*]] = alloca i32 +// LLVM: %[[ARR_PTR:.*]] = alloca [10 x i32] +// LLVM: %[[I_PTR:.*]] = alloca i32 +// LLVM: %[[J_PTR:.*]] = alloca i32 +// LLVM: %[[P_PTR:.*]] = alloca ptr +// LLVM: %[[Q_PTR:.*]] = alloca ptr +// LLVM: store i32 123, ptr %[[I_PTR]] +// LLVM: store i32 123, ptr %[[J_PTR]] +// LLVM: store i32 5, ptr %[[I_PTR]] +// LLVM: %[[GEP1:.*]] = getelementptr i32, ptr %[[ARR_PTR]], i32 0 +// LLVM: %[[GEP2:.*]] = getelementptr i32, ptr %[[GEP1]], i64 5 +// LLVM: %[[ARR_VAL:.*]] = load i32, ptr %[[GEP2]] +// LLVM: store i32 %[[ARR_VAL]], ptr %[[J_PTR]] +// LLVM: store ptr null, ptr %[[Q_PTR]] +// LLVM: br label +// LLVM: %[[Q_VAL:.*]] = load ptr, ptr %[[Q_PTR]] +// LLVM: store ptr %[[Q_VAL]], ptr %[[P_PTR]] +// LLVM: %[[CMP:.*]] = icmp ne ptr %[[Q_VAL]], null +// LLVM: br i1 %[[CMP]], label %[[THEN:.*]], label %[[ELSE:.*]] +// LLVM: [[THEN]]: +// LLVM: store i32 1, ptr %[[RETVAL_PTR]] +// LLVM: %{{.*}} = load i32, ptr %[[RETVAL_PTR]] +// LLVM: ret i32 +// LLVM: [[ELSE]]: +// LLVM: br label +// LLVM: store i32 0, ptr %[[RETVAL_PTR]] +// LLVM: %{{.*}} = load i32, ptr %[[RETVAL_PTR]] +// LLVM: ret i32 + +// OGCG-LABEL: define {{.*}}i32 @ignore_result_assign() +// OGCG: %[[RETVAL:.*]] = alloca i32 +// OGCG: %[[ARR:.*]] = alloca [10 x i32] +// OGCG: %[[I:.*]] = alloca i32 +// OGCG: %[[J:.*]] = alloca i32 +// OGCG: %[[P:.*]] = alloca ptr +// OGCG: %[[Q:.*]] = alloca ptr +// OGCG: store i32 123, ptr %[[I]] +// OGCG: store i32 123, ptr %[[J]] +// OGCG: store i32 5, ptr %[[I]] +// OGCG: %[[ARRAYIDX:.*]] = getelementptr inbounds [10 x i32], ptr %[[ARR]], i64 0, i64 5 +// OGCG: %[[ARR_VAL:.*]] = load i32, ptr %[[ARRAYIDX]] +// OGCG: store i32 %[[ARR_VAL]], ptr %[[J]] +// OGCG: store ptr null, ptr %[[Q]] +// OGCG: %[[Q_VAL:.*]] = load ptr, ptr %[[Q]] +// OGCG: store ptr %[[Q_VAL]], ptr %[[P]] +// OGCG: %[[TOBOOL:.*]] = icmp ne ptr %[[Q_VAL]], null +// OGCG: br i1 %[[TOBOOL]], label %[[IF_THEN:.*]], label %[[IF_END:.*]] +// OGCG: [[IF_THEN]]: +// OGCG: store i32 1, ptr %[[RETVAL]] +// OGCG: br label %[[RETURN:.*]] +// OGCG: [[IF_END]]: +// OGCG: store i32 0, ptr %[[RETVAL]] +// OGCG: br label %[[RETURN]] +// OGCG: [[RETURN]]: +// OGCG: %{{.*}} = load i32, ptr %[[RETVAL]] +// OGCG: ret i32 From f291f335c9628ea8d855fcc7c246171d70ceff58 Mon Sep 17 00:00:00 2001 From: Nishant Patel <nishant.b.patel@intel.com> Date: Tue, 4 Nov 2025 19:37:08 -0800 Subject: [PATCH 272/313] [MLIR][XeGPU] Support order attribute and add pattern for vector.transpose in WgToSg Pass (#165307) This PR does the following: 1. 
Handles the order attribute during delinearization from a linear subgroup id to a multi-dim id. 2. Adds a transformation pattern for vector.transpose in the wg-to-sg pass. 3. Updates CHECKs in the wg-to-sg tests. --- mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp | 85 ++++++++-- .../Transforms/XeGPUWgToSgDistribute.cpp | 81 ++++++++-- .../Dialect/XeGPU/subgroup-distribute.mlir | 42 ++--- .../Dialect/XeGPU/xegpu-attr-interface.mlir | 39 +++-- .../XeGPU/xegpu-wg-to-sg-elemwise.mlir | 6 +- .../test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir | 88 +++++------ .../XeGPU/xegpu-wg-to-sg-unify-ops-rr.mlir | 64 ++++---- .../XeGPU/xegpu-wg-to-sg-unify-ops.mlir | 145 ++++++++++-------- mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir | 123 +++++++-------- 9 files changed, 387 insertions(+), 286 deletions(-) diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp index 397107b786c9e..fb5d1e758dbd1 100644 --- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp +++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp @@ -280,27 +280,82 @@ LayoutAttr::verify(llvm::function_ref<mlir::InFlightDiagnostic()> emitError, FailureOr<SmallVector<Value>> LayoutAttr::delinearizeId(OpBuilder &builder, Location loc, Value linearId) { - // TODO: handle order attribute - auto hasDefaultOrder = [&]() { - DenseI32ArrayAttr order = getOrder(); - return !order || isIdentityPermutation(llvm::to_vector_of<int64_t>( - llvm::reverse(order.asArrayRef()))); - }; - if (!hasDefaultOrder()) - return mlir::emitError(loc, "order attribute is currently not supported."); - SmallVector<int64_t> layout; + SmallVector<int64_t> sgLayoutInt; if (isForWorkgroup()) { - layout = getEffectiveSgLayoutAsInt(); + sgLayoutInt = getEffectiveSgLayoutAsInt(); } else if (isForSubgroup()) { - layout = getEffectiveLaneLayoutAsInt(); + sgLayoutInt = getEffectiveLaneLayoutAsInt(); } else { return failure(); } - auto dims = llvm::map_to_vector(layout, [&](int64_t d) -> Value { - return builder.createOrFold<arith::ConstantIndexOp>(loc, d); - }); - return affine::delinearizeIndex(builder, loc, linearId, dims); + DenseI32ArrayAttr orderAttr = getOrder(); + + // Handle order attribute + SmallVector<int64_t> order; + if (orderAttr && !orderAttr.empty()) { + order = llvm::to_vector( + llvm::map_range(orderAttr.asArrayRef(), + [](int32_t idx) { return static_cast<int64_t>(idx); })); + } else { + // Default order: [1, 0] for 2D (row-major), [2, 1, 0] for 3D, etc. + order = llvm::to_vector( + llvm::reverse(llvm::seq<int64_t>(0, sgLayoutInt.size()))); + } + + if (order.size() != sgLayoutInt.size()) { + return failure(); + } + + SmallVector<Value> result(sgLayoutInt.size()); + Value remaining = linearId; + + /// Process dimensions in the order they appear in the order array + /// The first dimension in order is the fastest-changing + /// + /// Example walkthrough for linearId=22, sgLayout=[2,4,4], order=[2,1,0]: + /// + /// Initial: remaining=22, dimIdx = order[i], dimSize = sgLayout[dimIdx], + /// result=[?,?,?]
+ /// + /// i=0 (process columns, dimIdx=2, dimSize=4): + /// result[2] = 22 % 4 = 2 (column coordinate) + /// remaining = 22 / 4 = 5 (5 complete groups of 4 columns processed) + /// + /// i=1 (process rows, dimIdx=1, dimSize=4): + /// result[1] = 5 % 4 = 1 (row coordinate) + /// remaining = 5 / 4 = 1 (1 complete group of 4 rows processed) + /// + /// i=2 (process layers, dimIdx=0, dimSize=2): + /// result[0] = 1 % 2 = 1 (layer coordinate) + /// (no remaining update - last iteration) + /// + /// Final result: [1,1,2] = Layer 1, Row 1, Column 2 + for (size_t i = 0; i < order.size(); ++i) { + int64_t dimIdx = order[i]; + int64_t dimSize = sgLayoutInt[dimIdx]; + + Value dimSizeVal = + builder.createOrFold<arith::ConstantIndexOp>(loc, dimSize); + + /// Extract the coordinate for this dimension using modulo operation + /// This gives us "how far within this dimension" we are + /// e.g., linearId=22, dimSize=4: 22 % 4 = 2 (we're at position 2 within + /// this dimension) + result[dimIdx] = + builder.createOrFold<index::RemUOp>(loc, remaining, dimSizeVal); + + /// Update remaining for the next dimension by removing what we've already + /// processed. Division tells us "how many complete groups of this dimension + /// we've gone through" e.g., linearId=22, dimSize=4: 22 / 4 = 5 (we've + /// completed 5 groups of 4) Skip this for the last iteration since there's + /// no next dimension to process + if (i < order.size() - 1) { + remaining = + builder.createOrFold<index::DivUOp>(loc, remaining, dimSizeVal); + } + } + return result; } /// Implements DistributeLayoutAttr::computeDistributedCoords to generate diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp index d12a04df5c46c..0a9ef0aa6df96 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp @@ -1219,6 +1219,70 @@ struct WgToSgMultiDimReductionOp } }; +// This pattern transforms vector.transpose ops to work at subgroup level. 
+struct WgToSgVectorTransposeOp + : public OpConversionPattern<vector::TransposeOp> { + using OpConversionPattern<vector::TransposeOp>::OpConversionPattern; + + LogicalResult + matchAndRewrite(vector::TransposeOp op, OneToNOpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + VectorType resultType = op.getResultVectorType(); + + ArrayRef<int64_t> wgShape = resultType.getShape(); + xegpu::DistributeLayoutAttr layout = + xegpu::getDistributeLayoutAttr(op.getResult()); + if (!layout || !layout.isForWorkgroup()) + return failure(); + + xegpu::DistributeLayoutAttr sourceLayout = + xegpu::getDistributeLayoutAttr(op.getVector()); + if (!sourceLayout || !sourceLayout.isForWorkgroup()) + return failure(); + + SmallVector<int64_t> sourceSgLayout = + sourceLayout.getEffectiveSgLayoutAsInt(); + SmallVector<int64_t> resultSgLayout = layout.getEffectiveSgLayoutAsInt(); + DenseI32ArrayAttr sourceOrder = sourceLayout.getOrder(); + DenseI32ArrayAttr resultOrder = layout.getOrder(); + + if (!sourceOrder || !resultOrder) { + return rewriter.notifyMatchFailure( + op, "Both source and result must have order attributes"); + } + + ArrayRef<int64_t> permutation = op.getPermutation(); + size_t permutationSize = permutation.size(); + if (sourceSgLayout.size() != permutationSize || + resultSgLayout.size() != permutationSize) { + return rewriter.notifyMatchFailure( + op, "Layouts and permutation must have the same rank"); + } + + // Check that sgLayout, sgData & order are properly transposed for source + // and result + if (!layout.isTransposeOf(sourceLayout, permutation)) + return rewriter.notifyMatchFailure( + op, "Result layout is not a valid transpose of source layout " + "according to permutation"); + + SmallVector<int64_t> sgShape = getSgShapeAndCount(wgShape, layout).first; + VectorType newResultType = + VectorType::get(sgShape, resultType.getElementType()); + SmallVector<Value> newTransposeOps; + for (auto src : adaptor.getVector()) { + auto newTranspose = vector::TransposeOp::create( + rewriter, op.getLoc(), newResultType, src, permutation); + xegpu::setDistributeLayoutAttr(newTranspose->getResult(0), + layout.dropSgLayoutAndData()); + newTransposeOps.push_back(newTranspose.getResult()); + } + + rewriter.replaceOpWithMultiple(op, {newTransposeOps}); + return success(); + } +}; + } // namespace namespace mlir { @@ -1233,7 +1297,8 @@ void populateXeGPUWgToSgDistributePatterns(RewritePatternSet &patterns) { WgToSgArithConstantOp, WgToSgLoadGatherOpWithOffset, WgToSgStoreScatterOpWithOffset, WgToSgLoadMatrixOp, WgToSgStoreMatrixOp, WgToSgVectorStepOp, WgToSgVectorShapeCastOp, - WgToSgMultiDimReductionOp>(patterns.getContext()); + WgToSgMultiDimReductionOp, WgToSgVectorTransposeOp>( + patterns.getContext()); } } // namespace xegpu } // namespace mlir @@ -1360,7 +1425,9 @@ void XeGPUWgToSgDistributePass::runOnOperation() { return isLegal(layout); }); - target.addDynamicallyLegalOp<vector::ShapeCastOp, vector::StepOp>( + target.addDynamicallyLegalOp<vector::ShapeCastOp, vector::StepOp, + vector::TransposeOp, vector::BroadcastOp, + vector::MultiDimReductionOp>( [=](Operation *op) -> bool { // Check for either a SliceAttr or LayoutAttr on the result. 
auto layout = xegpu::getDistributeLayoutAttr(op->getResult(0)); @@ -1379,16 +1446,6 @@ void XeGPUWgToSgDistributePass::runOnOperation() { return isLegal(layout); }); - target.addDynamicallyLegalOp<vector::BroadcastOp>( - [=](vector::BroadcastOp op) -> bool { - return isLegal(xegpu::getDistributeLayoutAttr(op.getResult())); - }); - - target.addDynamicallyLegalOp<vector::MultiDimReductionOp>( - [=](vector::MultiDimReductionOp op) -> bool { - return isLegal(xegpu::getDistributeLayoutAttr(op.getResult())); - }); - target.addDynamicallyLegalOp<xegpu::ConvertLayoutOp>( [=](xegpu::ConvertLayoutOp op) -> bool { return isLegal(op.getInputLayout()) && isLegal(op.getTargetLayout()); diff --git a/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir b/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir index 8946d14e80b72..8fd3cca5594cb 100644 --- a/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir +++ b/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir @@ -268,15 +268,16 @@ gpu.module @xevm_module{ // ----- // CHECK-LABEL: gpu.func @load_store_matrix_1({{.*}}) { -// CHECK: %[[LAYOUT_X:.*]] = arith.constant 8 : index -// CHECK: %[[LAYOUT_Y:.*]] = arith.constant 2 : index +// CHECK: %[[C2:.*]] = arith.constant 2 : index +// CHECK: %[[C8:.*]] = arith.constant 8 : index // CHECK: %[[LANE_ID:.*]] = gpu.lane_id -// CHECK: %[[DELINEARIZED_LANE_Y:.*]] = affine.apply #{{.*}}()[%[[LANE_ID]]] -// CHECK: %[[DELINEARIZED_LANE_X:.*]] = affine.apply #{{.*}}()[%[[LANE_ID]]] -// CHECK: %[[LANE_Y_OFFSET:.*]] = index.remu %[[DELINEARIZED_LANE_Y]], %[[LAYOUT_Y]] -// CHECK: %[[LANE_X_OFFSET:.*]] = index.remu %[[DELINEARIZED_LANE_X]], %[[LAYOUT_X]] -// CHECK: %[[MAT:.*]] = xegpu.load_matrix %arg0[%[[LANE_Y_OFFSET]], %[[LANE_X_OFFSET]]] : !xegpu.mem_desc<32x32xf32>, index, index -> vector<1x1xf32> -// CHECK: xegpu.store_matrix %[[MAT]], %arg0[%[[LANE_Y_OFFSET]], %[[LANE_X_OFFSET]]] : vector<1x1xf32>, !xegpu.mem_desc<32x32xf32>, index, index +// CHECK: %[[REMU1:.*]] = index.remu %[[LANE_ID]], %[[C8]] +// CHECK: %[[DIVU:.*]] = index.divu %[[LANE_ID]], %[[C8]] +// CHECK: %[[REMU2:.*]] = index.remu %[[DIVU]], %[[C2]] +// CHECK: %[[REMU3:.*]] = index.remu %[[REMU2]], %[[C2]] +// CHECK: %[[REMU4:.*]] = index.remu %[[REMU1]], %[[C8]] +// CHECK: %[[MAT:.*]] = xegpu.load_matrix %arg0[%[[REMU3]], %[[REMU4]]] : !xegpu.mem_desc<32x32xf32>, index, index -> vector<1x1xf32> +// CHECK: xegpu.store_matrix %[[MAT]], %arg0[%[[REMU3]], %[[REMU4]]] : vector<1x1xf32>, !xegpu.mem_desc<32x32xf32>, index, index gpu.module @xevm_module{ gpu.func @load_store_matrix_1(%arg0: !xegpu.mem_desc<32x32xf32>) { %c0 = arith.constant 0 : index @@ -288,19 +289,20 @@ gpu.module @xevm_module{ // ----- // CHECK-LABEL: gpu.func @load_store_matrix_2({{.*}}) { -// CHECK: %[[DIST_UNIT_HEIGHT_X:.*]] = arith.constant 4 : index -// CHECK: %[[DIST_UNIT_HEIGHT_Y:.*]] = arith.constant 8 : index -// CHECK: %[[LANE_DATA_Y:.*]] = arith.constant 2 : index -// CHECK: %[[USER_OFFSET_X:.*]] = arith.constant 1 : index +// CHECK: %[[C8:.*]] = arith.constant 8 : index +// CHECK: %[[C2:.*]] = arith.constant 2 : index +// CHECK: %[[C4:.*]] = arith.constant 4 : index +// CHECK: %[[C1:.*]] = arith.constant 1 : index // CHECK: %[[LANE_ID:.*]] = gpu.lane_id -// CHECK: %[[DELINEARIZED_LANE_Y:.*]] = affine.apply #{{.*}}()[%[[LANE_ID]]] -// CHECK: %[[DELINEARIZED_LANE_X:.*]] = affine.apply #{{.*}}()[%[[LANE_ID]]] -// CHECK: %[[LANE_Y_OFFSET_1:.*]] = index.mul %[[DELINEARIZED_LANE_Y]], %[[LANE_DATA_Y]] -// CHECK: %[[LANE_Y_OFFSET:.*]] = index.remu %[[LANE_Y_OFFSET_1]], %[[DIST_UNIT_HEIGHT_Y]] -// 
CHECK: %[[LANE_X_OFFSET_1:.*]] = index.remu %[[DELINEARIZED_LANE_X]], %[[DIST_UNIT_HEIGHT_X]] -// CHECK: %[[LANE_X_OFFSET:.*]] = index.add %[[LANE_X_OFFSET_1]], %[[USER_OFFSET_X]] -// CHECK: %[[MAT:.*]] = xegpu.load_matrix %arg0[%[[LANE_Y_OFFSET]], %[[LANE_X_OFFSET]]] : !xegpu.mem_desc<32x32xf32>, index, index -> vector<2x1xf32> -// CHECK: xegpu.store_matrix %[[MAT]], %arg0[%[[LANE_Y_OFFSET]], %[[LANE_X_OFFSET]]] : vector<2x1xf32>, !xegpu.mem_desc<32x32xf32>, index, index +// CHECK: %[[REMU1:.*]] = index.remu %[[LANE_ID]], %[[C4]] +// CHECK: %[[DIVU:.*]] = index.divu %[[LANE_ID]], %[[C4]] +// CHECK: %[[REMU2:.*]] = index.remu %[[DIVU]], %[[C4]] +// CHECK: %[[MUL:.*]] = index.mul %[[REMU2]], %[[C2]] +// CHECK: %[[REMU3:.*]] = index.remu %[[MUL]], %[[C8]] +// CHECK: %[[REMU4:.*]] = index.remu %[[REMU1]], %[[C4]] +// CHECK: %[[ADD:.*]] = index.add %[[REMU4]], %[[C1]] +// CHECK: %[[MAT:.*]] = xegpu.load_matrix %arg0[%[[REMU3]], %[[ADD]]] : !xegpu.mem_desc<32x32xf32>, index, index -> vector<2x1xf32> +// CHECK: xegpu.store_matrix %[[MAT]], %arg0[%[[REMU3]], %[[ADD]]] : vector<2x1xf32>, !xegpu.mem_desc<32x32xf32>, index, index gpu.module @xevm_module{ gpu.func @load_store_matrix_2(%arg0: !xegpu.mem_desc<32x32xf32>) { %c0 = arith.constant 0 : index diff --git a/mlir/test/Dialect/XeGPU/xegpu-attr-interface.mlir b/mlir/test/Dialect/XeGPU/xegpu-attr-interface.mlir index b73bc69393dab..02c5f71d5c83d 100644 --- a/mlir/test/Dialect/XeGPU/xegpu-attr-interface.mlir +++ b/mlir/test/Dialect/XeGPU/xegpu-attr-interface.mlir @@ -1,33 +1,32 @@ // RUN: mlir-opt --test-xegpu-layout-interface --cse -split-input-file %s | FileCheck %s -//CHECk: #map = affine_map<()[s0] -> (s0 floordiv 8)> gpu.module @test { gpu.func @slice_attr() -> vector<128xindex> { - //CHECK: [[sgId:%.+]] = gpu.subgroup_id : index - //CHECK: [[IDY:%.+]] = affine.apply #map()[[[sgId]]] - //CHECK: [[c32:%.+]] = arith.constant 32 : index - //CHECK: [[LOCALY:%.+]] = index.mul [[IDY]], [[c32]] - //CHECK: [[c128:%.+]] = arith.constant 128 : index - //CHECK: [[MODY:%.+]] = index.remu [[LOCALY]], [[c128]] - //CHECK: [[BASE:%.+]] = vector.step : vector<32xindex> - //CHECK: [[CAST:%.+]] = vector.broadcast [[MODY]] : index to vector<32xindex> - //CHECK: [[ADD:%.+]] = arith.addi [[BASE]], [[CAST]] : vector<32xindex> + // CHECK-DAG: %[[SGID:.*]] = gpu.subgroup_id : index + // CHECK-DAG: %[[DIVU:.*]] = index.divu %[[SGID]], %[[C8:.*]] + // CHECK-DAG: %[[REMU:.*]] = index.remu %[[DIVU]], %[[C4:.*]] + // CHECK-DAG: %[[MUL:.*]] = index.mul %[[REMU]], %[[C32:.*]] + // CHECK-DAG: %[[MOD:.*]] = index.remu %[[MUL]], %[[C128:.*]] + // CHECK-DAG: %[[BASE:.*]] = vector.step : vector<32xindex> + // CHECK-DAG: %[[CAST:.*]] = vector.broadcast %[[MOD]] : index to vector<32xindex> + // CHECK-DAG: %[[ADD:.*]] = arith.addi %[[BASE]], %[[CAST]] : vector<32xindex> %step = vector.step {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [4, 8], sg_data = [32, 32]>, dims = [1]>}: vector<128xindex> gpu.return %step : vector<128xindex> } gpu.func @nested_slice_attr() -> vector<128xindex> { - //CHECK: [[sgId:%.+]] = gpu.subgroup_id : index - //CHECK: [[IDY:%.+]] = affine.apply #map()[[[sgId]]] - //CHECK: [[c32:%.+]] = arith.constant 32 : index - //CHECK: [[LOCALY:%.+]] = index.mul [[IDY]], [[c32]] - //CHECK: [[c128:%.+]] = arith.constant 128 : index - //CHECK: [[MODY:%.+]] = index.remu [[LOCALY]], [[c128]] - //CHECK: [[BASE:%.+]] = vector.step : vector<32xindex> - //CHECK: [[CAST:%.+]] = vector.broadcast [[MODY]] : index to vector<32xindex> - //CHECK: [[ADD:%.+]] = arith.addi 
[[BASE]], [[CAST]] : vector<32xindex> + // CHECK-DAG: %[[SGID:.*]] = gpu.subgroup_id : index + // CHECK-DAG: %[[DIVU1:.*]] = index.divu %[[SGID]], %[[C1:.*]] + // CHECK-DAG: %[[DIVU2:.*]] = index.divu %[[DIVU1]], %[[C8:.*]] + // CHECK-DAG: %[[REMU:.*]] = index.remu %[[DIVU2]], %[[C4:.*]] + // CHECK-DAG: %[[MUL:.*]] = index.mul %[[REMU]], %[[C32:.*]] + // CHECK-DAG: %[[MOD:.*]] = index.remu %[[MUL]], %[[C128:.*]] + // CHECK-DAG: %[[BASE:.*]] = vector.step : vector<32xindex> + // CHECK-DAG: %[[CAST:.*]] = vector.broadcast %[[MOD]] : index to vector<32xindex> + // CHECK-DAG: %[[ADD:.*]] = arith.addi %[[BASE]], %[[CAST]] : vector<32xindex> %0 = vector.step {layout_result_0 = #xegpu.slice<#xegpu.slice<#xegpu.layout<sg_layout = [4, 8, 1], sg_data = [32, 32, 1]>, dims = [2]>, dims = [1]>} : vector<128xindex> gpu.return %0 : vector<128xindex> } -} \ No newline at end of file +} + diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-elemwise.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-elemwise.mlir index 09df1e4da43e2..9580769d37313 100644 --- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-elemwise.mlir +++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-elemwise.mlir @@ -166,14 +166,12 @@ gpu.module @test_elementwise_ops { %load_b = xegpu.load_nd %tdesc_b : !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [4, 4], sg_data = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>> -> vector<24x32xf32> - // CHECK-COUNT-12: arith.negf {{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [2, 2], lane_data = [1, 1]>} - // CHECK-SAME-COUNT-12: : vector<2x2xf32> + // CHECK-COUNT-12: arith.negf {{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [2, 2], lane_data = [1, 1]>} : vector<2x2xf32> // CHECK-NOT: arith.negf %negf = arith.negf %load_a {layout_result_0 = #xegpu.layout<sg_layout = [4, 4], sg_data = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>} : vector<24x32xf32> - // CHECK-COUNT-12: math.powf {{.*}}, {{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [2, 2], lane_data = [1, 1]>} - // CHECK-SAME-COUNT-12: : vector<2x2xf32> + // CHECK-COUNT-12: math.powf {{.*}}, {{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [2, 2], lane_data = [1, 1]>} : vector<2x2xf32> // CHECK-NOT: math.powf %powf = math.powf %load_a, %load_b {layout_result_0 = #xegpu.layout<sg_layout = [4, 4], sg_data = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>} diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir index d2d250cbe0f66..01134d8eaabec 100644 --- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir +++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir @@ -1,14 +1,10 @@ // RUN: mlir-opt --xegpu-wg-to-sg-distribute -split-input-file %s | FileCheck %s -#map = affine_map<()[s0] -> (s0 floordiv 4)> -#map1 = affine_map<()[s0] -> (s0 mod 4)> - gpu.module @test_round_robin_assignment { // CHECK-LABEL: create_nd_tdesc // CHECK-SAME: %[[ARG_0:.*]]: memref<256x128xf32> gpu.func @create_nd_tdesc(%src: memref<256x128xf32>) { - // CHECK-COUNT-4: xegpu.create_nd_tdesc %[[ARG_0]][%{{.*}}, %{{.*}}] : memref<256x128xf32> - // CHECK-SAME: -> !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> + // CHECK-COUNT-4: xegpu.create_nd_tdesc %[[ARG_0]][%{{.*}}, %{{.*}}] : memref<256x128xf32> -> !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> // CHECK-NOT: xegpu.create_nd_tdesc %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<256x128xf32> -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 
16], lane_layout = [1, 16], lane_data = [1, 1]>> @@ -16,22 +12,23 @@ gpu.module @test_round_robin_assignment { } // CHECK-LABEL: create_nd_tdesc_with_shared_data - // CHECK-SAME: [[ARG_0:%.*]]: memref<256x128xf32> + // CHECK-SAME: %[[ARG_0:.*]]: memref<256x128xf32> gpu.func @create_nd_tdesc_with_shared_data(%src: memref<256x128xf32>) { - //CHECK: [[sgId:%.+]] = gpu.subgroup_id : index - //CHECK: [[IdY:%.+]] = affine.apply #map()[[[sgId]]] - //CHECK: [[IdX:%.+]] = affine.apply #map1()[[[sgId]]] - //CHECK: [[C16:%.+]] = arith.constant 16 : index - //CHECK: [[LY:%.+]] = index.mul [[IdY]], [[C16]] - //CHECK: [[C64:%.+]] = arith.constant 64 : index - //CHECK: [[LX:%.+]] = index.mul [[IdX]], [[C64]] - //CHECK: [[C0:%.+]] = arith.constant 0 : index - //CHECK: [[C0_1:%.+]] = arith.constant 0 : index - //CHECK: [[C128:%.+]] = arith.constant 128 : index - //CHECK: [[offY:%.+]] = index.remu [[LY]], [[C128]] - //CHECK: [[C64_2:%.+]] = arith.constant 64 : index - //CHECK: [[offX:%.+]] = index.remu [[LX]], [[C64_2]] - //CHECK: xegpu.create_nd_tdesc [[ARG_0]][[[offY]], [[offX]]] : memref<256x128xf32> -> !xegpu.tensor_desc<16x64xf32> + // CHECK: %[[SGID:.*]] = gpu.subgroup_id : index + // CHECK: %[[C4:.*]] = arith.constant 4 : index + // CHECK: %[[IDX:.*]] = index.remu %[[SGID]], %[[C4]] + // CHECK: %[[IDY_DIV:.*]] = index.divu %[[SGID]], %[[C4]] + // CHECK: %[[C8:.*]] = arith.constant 8 : index + // CHECK: %[[IDY:.*]] = index.remu %[[IDY_DIV]], %[[C8]] + // CHECK: %[[C16:.*]] = arith.constant 16 : index + // CHECK: %[[LY:.*]] = index.mul %[[IDY]], %[[C16]] + // CHECK: %[[C64:.*]] = arith.constant 64 : index + // CHECK: %[[LX:.*]] = index.mul %[[IDX]], %[[C64]] + // CHECK: %[[C128:.*]] = arith.constant 128 : index + // CHECK: %[[OFFY:.*]] = index.remu %[[LY]], %[[C128]] + // CHECK: %[[C64_1:.*]] = arith.constant 64 : index + // CHECK: %[[OFFX:.*]] = index.remu %[[LX]], %[[C64_1]] + // CHECK: xegpu.create_nd_tdesc %[[ARG_0]][%[[OFFY]], %[[OFFX]]] : memref<256x128xf32> -> !xegpu.tensor_desc<16x64xf32> %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<256x128xf32> -> !xegpu.tensor_desc<128x64xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 64]>> gpu.return @@ -42,9 +39,7 @@ gpu.module @test_round_robin_assignment { gpu.func @load_nd_tdesc(%src: memref<256x128xf32>) { %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<256x128xf32> -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>> - // CHECK-COUNT-4: xegpu.load_nd %{{.*}} - // CHECK-SAME-COUNT-4: : !xegpu.tensor_desc<2x2xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> - // CHECK-SAME-COUNT-4: -> vector<16x16xf32> + // CHECK-COUNT-4: xegpu.load_nd %{{.*}} : !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<16x16xf32> // CHECK-NOT: xegpu.load_nd %load = xegpu.load_nd %tdesc : !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>> @@ -57,9 +52,8 @@ gpu.module @test_round_robin_assignment { gpu.func @store_nd(%src: memref<256x128xf32>) { %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<256x128xf32> -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>> - // CHECK-COUNT-4: xegpu.store_nd %{{.*}}, %{{.*}} - // CHECK-SAME-COUNT-4: : vector<16x16xf32>, !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> - // CHECK-NOT : 
xegpu.store_nd + // CHECK-COUNT-4: xegpu.store_nd %{{.*}}, %{{.*}} : vector<16x16xf32>, !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> + // CHECK-NOT: xegpu.store_nd %load = xegpu.load_nd %tdesc : !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<256x128xf32> @@ -73,8 +67,7 @@ gpu.module @test_round_robin_assignment { gpu.func @update_nd(%src: memref<256x128xf32>){ %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<256x128xf32> -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>> - // CHECK-COUNT-4: xegpu.update_nd_offset %{{.*}}, [0, 16] - // CHECK-SAME-COUNT-4: : !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>> + // CHECK-COUNT-4: xegpu.update_nd_offset %{{.*}}, [0, 16] : !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> // CHECK-NOT: xegpu.update_nd_offset %update = xegpu.update_nd_offset %tdesc, [0, 16] : !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>> @@ -84,15 +77,9 @@ gpu.module @test_round_robin_assignment { // CHECK-LABEL: dpas // CHECK-SAME: (%[[ARG_0:.*]]: memref<256x128xf16>, %[[ARG_1:.*]]: memref<128x256xf16>) gpu.func @dpas(%a: memref<256x128xf16>, %b: memref<128x256xf16>) { - // CHECK-COUNT-4: xegpu.create_nd_tdesc %[[ARG_0]][%{{.*}}, %{{.*}}] : memref<256x128xf16> - // CHECK-SAME-COUNT-4: -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> - // CHECK-NOT: xegpu.create_nd_tdesc - // CHECK-COUNT-4: xegpu.create_nd_tdesc %[[ARG_1]][%{{.*}}, %{{.*}}] : memref<128x256xf16> - // CHECK-SAME-COUNT-4: -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [4, 8], lane_data = [1, 1]>> - // CHECK-NOT: xegpu.create_nd_tdesc - // CHECK-COUNT-16: xegpu.dpas %{{.*}}, %{{.*}} - // CHECK-SAME-COUNT-16: {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} - // CHECK-SAME-COUNT-16: : vector<16x16xf16>, vector<16x16xf16> -> vector<16x16xf32> + // CHECK-COUNT-4: xegpu.create_nd_tdesc %[[ARG_0]][%{{.*}}, %{{.*}}] : memref<256x128xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> + // CHECK-COUNT-4: xegpu.create_nd_tdesc %[[ARG_1]][%{{.*}}, %{{.*}}] : memref<128x256xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>> + // CHECK-COUNT-16: xegpu.dpas %{{.*}}, %{{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<16x16xf16>, vector<16x16xf16> -> vector<16x16xf32> // CHECK-NOT: xegpu.dpas %tdesc_a = xegpu.create_nd_tdesc %a[0, 0] : memref<256x128xf16> -> !xegpu.tensor_desc<256x128xf16, #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>> @@ -113,8 +100,7 @@ gpu.module @test_round_robin_assignment { // CHECK-LABEL: prefetch_nd_tdesc // CHECK-SAME: %[[ARG_0:.*]]: memref<256x128xf32> gpu.func @prefetch_nd_tdesc(%src: memref<256x128xf32>) { - // CHECK-COUNT-4: xegpu.prefetch_nd %{{.*}} - // CHECK-SAME-COUNT-4: !xegpu.tensor_desc<256x128xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> + // CHECK-COUNT-4: xegpu.prefetch_nd %{{.*}} : !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> // CHECK-NOT: xegpu.prefetch_nd %tdesc = xegpu.create_nd_tdesc %src[0, 0] : 
memref<256x128xf32> -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>> @@ -131,9 +117,7 @@ gpu.module @test_round_robin_assignment { %load = xegpu.load_nd %tdesc : !xegpu.tensor_desc<128x1xf32, #xegpu.layout<sg_layout = [4, 1], sg_data = [16, 1], lane_layout = [8, 1], lane_data = [1, 1]>> -> vector<128x1xf32> - // CHECK-COUNT-2: vector.broadcast {{.*}} - // CHECK-SAME-COUNT-2: {layout_result_0 = #xegpu.layout<lane_layout = [8, 1], lane_data = [1, 1]>} - // CHECK-SAME-COUNT-2: : vector<16x1xf32> to vector<16x32xf32> + // CHECK-COUNT-2: vector.broadcast {{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [8, 1], lane_data = [1, 1]>} : vector<16x1xf32> to vector<16x32xf32> // CHECK-NOT: vector.broadcast %broadcast = vector.broadcast %load {layout_result_0 = #xegpu.layout<sg_layout = [4, 1], sg_data = [16, 32], lane_layout = [8, 1], lane_data = [1, 1]>} @@ -171,10 +155,10 @@ gpu.module @test_round_robin_assignment { %0 = xegpu.create_nd_tdesc %arg0[0] : memref<1024xf32> -> !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>> %1 = xegpu.load_nd %0 : !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>> -> vector<256xf32> %2 = xegpu.create_nd_tdesc %arg1[0] : memref<1024xf32> -> !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>> - //CHECK: scf.while ({{.*}}) : (vector<16xf32>, vector<16xf32>, i32) -> (vector<16xf32>, vector<16xf32>, i32) + // CHECK: scf.while ({{.*}}) : (vector<16xf32>, vector<16xf32>, i32) -> (vector<16xf32>, vector<16xf32>, i32) %3:2 = scf.while (%arg2 = %1, %arg3 = %c0_i32) : (vector<256xf32>, i32) -> (vector<256xf32>, i32) { %4 = arith.cmpi slt, %arg3, %c10_i32 : i32 - //CHECK: scf.condition{{.*}} : vector<16xf32>, vector<16xf32>, i32 + // CHECK: scf.condition{{.*}} : vector<16xf32>, vector<16xf32>, i32 scf.condition(%4) %arg2, %arg3 : vector<256xf32>, i32 } do { // CHECK: ([[arg2:%.+]]: vector<16xf32>, [[arg3:%.+]]: vector<16xf32>, [[arg4:%.+]]: i32) @@ -195,16 +179,16 @@ gpu.module @test_round_robin_assignment { %2 = xegpu.create_nd_tdesc %arg1[0] : memref<1024xf32> -> !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>> %3 = arith.cmpi eq, %0, %c10 : index // CHECK-LABEL: scf.if - // CHECK-SAME: (vector<16xf32>, vector<16xf32>) + // CHECK-SAME: (vector<16xf32>, vector<16xf32>) %4 = scf.if %3 -> (vector<256xf32>) { %5 = xegpu.load_nd %1 : !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>> -> vector<256xf32> // CHECK-LABEL: scf.yield - // CHECK-SAME: vector<16xf32>, vector<16xf32> + // CHECK-SAME: vector<16xf32>, vector<16xf32> scf.yield %5 : vector<256xf32> } else { %5 = xegpu.load_nd %2 : !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>> -> vector<256xf32> // CHECK-LABEL: scf.yield - // CHECK-SAME: vector<16xf32>, vector<16xf32> + // CHECK-SAME: vector<16xf32>, vector<16xf32> scf.yield %5 : vector<256xf32> } {layout_result_0 = #xegpu.layout<sg_layout = [8], sg_data = [16]>} xegpu.store_nd %4, %1 : vector<256xf32>, !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>> @@ -220,16 +204,16 @@ gpu.module @test_round_robin_assignment { %0 = arith.cmpi eq, %id, %c10 : index // CHECK-LABEL: scf.if - // CHECK-SAME: (!xegpu.tensor_desc<16xf32>, !xegpu.tensor_desc<16xf32>) + // CHECK-SAME: (!xegpu.tensor_desc<16xf32>, !xegpu.tensor_desc<16xf32>) %1 = scf.if %0 -> (!xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = 
[16]>>) { %2 = xegpu.create_nd_tdesc %arg0[0] : memref<1024xf32> -> !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>> // CHECK-LABEL: scf.yield - // CHECK-SAME: !xegpu.tensor_desc<16xf32>, !xegpu.tensor_desc<16xf32> + // CHECK-SAME: !xegpu.tensor_desc<16xf32>, !xegpu.tensor_desc<16xf32> scf.yield %2 : !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>> } else { %3 = xegpu.create_nd_tdesc %arg1[0] : memref<1024xf32> -> !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>> // CHECK-LABEL: scf.yield - // CHECK-SAME: !xegpu.tensor_desc<16xf32>, !xegpu.tensor_desc<16xf32> + // CHECK-SAME: !xegpu.tensor_desc<16xf32>, !xegpu.tensor_desc<16xf32> scf.yield %3 : !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>> } xegpu.store_nd %d, %1 : vector<256xf32>, !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>> @@ -238,8 +222,8 @@ gpu.module @test_round_robin_assignment { gpu.func @convert_layout_optimal(%arg0: memref<32x64xf32>) { %0 = xegpu.create_nd_tdesc %arg0[0, 0] : memref<32x64xf32> -> !xegpu.tensor_desc<32x64xf32, #xegpu.layout<sg_layout = [2, 2], sg_data = [16, 16], inst_data = [16, 16]>> - //CHECK-2: xegpu.load_nd {{.*}} : !xegpu.tensor_desc<16x16xf32, #xegpu.layout<inst_data = [16, 16]>> -> vector<16x16xf32> - //CHECK-2: xegpu.convert_layout {{.*}} <{input_layout = #xegpu.layout<inst_data = [16, 16]>, target_layout = #xegpu.layout<inst_data = [8, 16]>}> : vector<16x16xf32> + // CHECK-COUNT-2: xegpu.load_nd {{.*}} : !xegpu.tensor_desc<16x16xf32, #xegpu.layout<inst_data = [16, 16]>> -> vector<16x16xf32> + // CHECK-COUNT-2: xegpu.convert_layout {{.*}} <{input_layout = #xegpu.layout<inst_data = [16, 16]>, target_layout = #xegpu.layout<inst_data = [8, 16]>}> : vector<16x16xf32> %1 = xegpu.load_nd %0 : !xegpu.tensor_desc<32x64xf32, #xegpu.layout<sg_layout = [2, 2], sg_data = [16, 16], inst_data = [16, 16]>> -> vector<32x64xf32> %2 = xegpu.convert_layout %1 <{input_layout = #xegpu.layout<sg_layout = [2, 2], sg_data = [16, 16], inst_data = [16, 16]>, target_layout = #xegpu.layout<sg_layout = [2, 2], sg_data = [16, 16], inst_data = [8, 16]>}> : vector<32x64xf32> diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops-rr.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops-rr.mlir index 86a021b66949c..84ce80f477a55 100644 --- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops-rr.mlir +++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops-rr.mlir @@ -14,13 +14,11 @@ gpu.module @test_distribution { // CHECK-LABEL: load_nd_tdesc_with_offset gpu.func @load_nd_tdesc_with_offset(%src: memref<256x128xf32>) { - // CHECK-COUNT-4: xegpu.load_nd {{%.*}}[{{%.*}}, {{%.*}}] - // CHECK-SAME-COUNT-4: : !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> - // CHECK-SAME-COUNT-4: -> vector<16x16xf32> + // CHECK-COUNT-4: xegpu.load_nd {{%.*}}[{{%.*}}, {{%.*}}] : !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<16x16xf32> // CHECK-NOT: xegpu.load_nd %tdesc = xegpu.create_nd_tdesc %src: memref<256x128xf32> -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>> - %load = xegpu.load_nd %tdesc[0, 0] + %load = xegpu.load_nd %tdesc[0, 0] : !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<256x128xf32> gpu.return @@ -28,8 +26,7 @@ gpu.module 
@test_distribution { // CHECK-LABEL: store_nd_with_offset gpu.func @store_nd_with_offset(%src: memref<256x128xf32>) { - // CHECK-COUNT-4: xegpu.store_nd %{{.*}}, {{%.*}}[{{%.*}}, {{%.*}}] - // CHECK-SAME-COUNT-4: : !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> + // CHECK-COUNT-4: xegpu.store_nd %{{.*}}, {{%.*}}[{{%.*}}, {{%.*}}] : vector<16x16xf32>, !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> // CHECK-NOT: xegpu.store_nd %tdesc = xegpu.create_nd_tdesc %src: memref<256x128xf32> -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>> @@ -42,10 +39,8 @@ gpu.module @test_distribution { } // CHECK-LABEL: prefetch_nd_tdesc_with_offset - // CHECK-SAME: %[[ARG_0:.*]]: memref<256x128xf32> gpu.func @prefetch_nd_tdesc_with_offset(%src: memref<256x128xf32>) { - // CHECK-COUNT-4: xegpu.prefetch_nd {{%.*}}[{{%.*}}, {{%.*}}] - // CHECK-SAME-COUNT-4: !xegpu.tensor_desc<256x128xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> + // CHECK-COUNT-4: xegpu.prefetch_nd {{%.*}}[{{%.*}}, {{%.*}}] : !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> // CHECK-NOT: xegpu.prefetch_nd %tdesc = xegpu.create_nd_tdesc %src : memref<256x128xf32> -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>> @@ -57,15 +52,11 @@ gpu.module @test_distribution { // CHECK-LABEL: dpas // CHECK-SAME: (%[[ARG_0:.*]]: memref<256x128xf16>, %[[ARG_1:.*]]: memref<128x256xf16>) gpu.func @dpas(%a: memref<256x128xf16>, %b: memref<128x256xf16>) { - // CHECK-COUNT-4: xegpu.create_nd_tdesc %[[ARG_0]] : memref<256x128xf16> - // CHECK-SAME-COUNT-4: -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> - // CHECK-NOT: xegpu.create_nd_tdesc - // CHECK-COUNT-4: xegpu.create_nd_tdesc %[[ARG_1]] : memref<128x256xf16> - // CHECK-SAME-COUNT-4: -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [4, 8], lane_data = [1, 1]>> - // CHECK-NOT: xegpu.create_nd_tdesc - // CHECK-COUNT-16: xegpu.dpas %{{.*}}, %{{.*}} - // CHECK-SAME-COUNT-16: {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} - // CHECK-SAME-COUNT-16: : vector<16x16xf16>, vector<16x16xf16> -> vector<16x16xf32> + // CHECK-COUNT-4: xegpu.create_nd_tdesc %[[ARG_0]] : memref<256x128xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> + // CHECK-COUNT-4: xegpu.load_nd {{%.*}}[{{%.*}}, {{%.*}}] : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<16x16xf16> + // CHECK-COUNT-4: xegpu.create_nd_tdesc %[[ARG_1]] : memref<128x256xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>> + // CHECK-COUNT-4: xegpu.load_nd {{%.*}}[{{%.*}}, {{%.*}}] : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<16x16xf16> + // CHECK-COUNT-16: xegpu.dpas %{{.*}}, %{{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<16x16xf16>, vector<16x16xf16> -> vector<16x16xf32> // CHECK-NOT: xegpu.dpas %tdesc_a = xegpu.create_nd_tdesc %a : memref<256x128xf16> -> !xegpu.tensor_desc<256x128xf16, #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>> @@ -102,27 +93,42 @@ gpu.module @test_distribution { gpu.func @non_splat_constant() { 
// CHECK-DAG: %[[BASECST:.*]] = arith.constant dense<{{.*}}> : vector<2x1xindex> // CHECK-DAG: %[[SGID:.*]] = gpu.subgroup_id : index - // CHECK-DAG: %[[MAP4:.*]] = affine.apply #map4()[%[[SGID]]] - // CHECK-DAG: %[[MAP5:.*]] = affine.apply #map5()[%[[SGID]]] - // CHECK-DAG: %[[MUL:.*]] = index.mul %[[MAP4]], %[[C2:.*]] - // CHECK-DAG: %[[REMU1:.*]] = index.remu %[[MUL]], %[[C32:.*]] - // CHECK-DAG: %[[REMU2:.*]] = index.remu %[[MAP5]], %[[C1:.*]] + // CHECK-DAG: %[[REMU1:.*]] = index.remu %[[SGID]], %[[C1:.*]] + // CHECK-DAG: %[[DIVU:.*]] = index.divu %[[SGID]], %[[C1:.*]] + // CHECK-DAG: %[[REMU2:.*]] = index.remu %[[DIVU]], %[[C8:.*]] + // CHECK-DAG: %[[MUL:.*]] = index.mul %[[REMU2]], %[[C2:.*]] + // CHECK-DAG: %[[REMU3:.*]] = index.remu %[[MUL]], %[[C32:.*]] + // CHECK-DAG: %[[REMU4:.*]] = index.remu %[[REMU1]], %[[C1:.*]] // CHECK-DAG: %[[ADD16:.*]] = arith.addi %[[MUL]], %[[C16:.*]] : index - // CHECK-DAG: %[[REMU3:.*]] = index.remu %[[ADD16]], %[[C32:.*]] - // CHECK-DAG: %[[REMU4:.*]] = index.remu %[[MAP5]], %[[C1:.*]] - // CHECK-DAG: %[[STRIDE1:.*]] = arith.muli %[[REMU1]], %[[C16:.*]] : index + // CHECK-DAG: %[[REMU5:.*]] = index.remu %[[ADD16]], %[[C32:.*]] + // CHECK-DAG: %[[REMU6:.*]] = index.remu %[[REMU1]], %[[C1:.*]] + // CHECK-DAG: %[[STRIDE1:.*]] = arith.muli %[[REMU3]], %[[C16:.*]] : index // CHECK-DAG: %[[ADDSTRIDES:.*]] = arith.addi %[[C0:.*]], %[[STRIDE1]] : index - // CHECK-DAG: %[[STRIDE2:.*]] = arith.muli %[[REMU2]], %[[C0:.*]] : index + // CHECK-DAG: %[[STRIDE2:.*]] = arith.muli %[[REMU4]], %[[C0:.*]] : index // CHECK-DAG: %[[ADDSTRIDES1:.*]] = arith.addi %[[ADDSTRIDES]], %[[STRIDE2]] : index // CHECK-DAG: %[[BCAST1:.*]] = vector.broadcast %[[ADDSTRIDES1]] : index to vector<2x1xindex> // CHECK-DAG: %[[RESULT1:.*]] = arith.addi %[[BASECST]], %[[BCAST1]] : vector<2x1xindex> - // CHECK-DAG: %[[STRIDE3:.*]] = arith.muli %[[REMU3]], %[[C16:.*]] : index + // CHECK-DAG: %[[STRIDE3:.*]] = arith.muli %[[REMU5]], %[[C16:.*]] : index // CHECK-DAG: %[[ADDSTRIDES2:.*]] = arith.addi %[[C0:.*]], %[[STRIDE3]] : index - // CHECK-DAG: %[[STRIDE4:.*]] = arith.muli %[[REMU4]], %[[C0:.*]] : index + // CHECK-DAG: %[[STRIDE4:.*]] = arith.muli %[[REMU6]], %[[C0:.*]] : index // CHECK-DAG: %[[ADDSTRIDES3:.*]] = arith.addi %[[ADDSTRIDES2]], %[[STRIDE4]] : index // CHECK-DAG: %[[BCAST2:.*]] = vector.broadcast %[[ADDSTRIDES3]] : index to vector<2x1xindex> // CHECK-DAG: %[[RESULT2:.*]] = arith.addi %[[BASECST]], %[[BCAST2]] : vector<2x1xindex> %cst_2 = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [8, 1], sg_data = [2, 1]>} dense<[[0], [16], [32], [48], [64], [80], [96], [112], [128], [144], [160], [176], [192], [208], [224], [240], [256], [272], [288], [304], [320], [336], [352], [368], [384], [400], [416], [432], [448], [464], [480], [496]]> : vector<32x1xindex> gpu.return } + + // CHECK-LABEL: vector_transpose + gpu.func @vector_transpose(%src: memref<256x128xf32>) { + %tdesc = xegpu.create_nd_tdesc %src : memref<256x128xf32> + -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 16], lane_layout = [16, 1], lane_data = [1, 1], order =[0, 1]>> + %load = xegpu.load_nd %tdesc[0, 0] + : !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 16], lane_layout = [16, 1], lane_data = [1, 1], order =[0, 1]>> + -> vector<256x128xf32> + // CHECK-COUNT-2: vector.transpose {{.*}}, [1, 0] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1], order = [1, 0]>} : vector<32x16xf32> to vector<16x32xf32> + // 
CHECK-NOT: vector.transpose + %trans = vector.transpose %load, [1, 0] {layout_result_0 = #xegpu.layout<sg_layout = [4, 8], sg_data = [16, 32], lane_layout = [1, 16], lane_data = [1, 1], order =[1, 0]>} : vector<256x128xf32> to vector<128x256xf32> + gpu.return + } } + diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir index 8d98fcfd0d2c2..4fbb566cfbe73 100644 --- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir +++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir @@ -1,8 +1,5 @@ // RUN: mlir-opt --xegpu-wg-to-sg-distribute -split-input-file %s | FileCheck %s -//CHECK: #map = affine_map<()[s0] -> (s0 floordiv 4)> -//CHECK: #map1 = affine_map<()[s0] -> (s0 mod 4)> -//CHECK: #map2 = affine_map<()[s0] -> (s0 floordiv 8)> gpu.module @test_distribution { // CHECK-LABEL: create_nd_tdesc_no_offset // CHECK-SAME: %[[ARG_0:.*]]: memref<256x128xf32> @@ -26,13 +23,23 @@ gpu.module @test_distribution { } // CHECK-LABEL: load_nd_tdesc_with_offset - // CHECK-SAME: %[[ARG_0:.*]]: memref<256x128xf32> gpu.func @load_nd_tdesc_with_offset(%src: memref<256x128xf32>) { - //CHECK: [[SGID:%.+]] = gpu.subgroup_id : index - //CHECK: [[SGIDY:%.+]] = affine.apply #map()[[[SGID]]] - //CHECK: [[SGIDX:%.+]] = affine.apply #map1()[[[SGID]]] - //CHECK: %[[LOAD:.*]] = xegpu.load_nd {{%.*}}[{{%.*}}, {{%.*}}] : !xegpu.tensor_desc<32x32xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<32x32xf32> - %tdesc = xegpu.create_nd_tdesc %src: memref<256x128xf32> + //CHECK: %[[TDESC:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<256x128xf32> -> !xegpu.tensor_desc<32x32xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> + //CHECK-DAG: %[[SGID:.*]] = gpu.subgroup_id : index + //CHECK-DAG: %[[C4:.*]] = arith.constant 4 : index + //CHECK-DAG: %[[SGIDX:.*]] = index.remu %[[SGID]], %[[C4]] + //CHECK-DAG: %[[SGIDY_TMP:.*]] = index.divu %[[SGID]], %[[C4]] + //CHECK-DAG: %[[C8:.*]] = arith.constant 8 : index + //CHECK-DAG: %[[SGIDY:.*]] = index.remu %[[SGIDY_TMP]], %[[C8]] + //CHECK-DAG: %[[C32:.*]] = arith.constant 32 : index + //CHECK-DAG: %[[L_OFF_Y:.*]] = index.mul %[[SGIDY]], %[[C32]] + //CHECK-DAG: %[[L_OFF_X:.*]] = index.mul %[[SGIDX]], %[[C32]] + //CHECK-DAG: %[[C256:.*]] = arith.constant 256 : index + //CHECK-DAG: %[[OFF_Y:.*]] = index.remu %[[L_OFF_Y]], %[[C256]] + //CHECK-DAG: %[[C128:.*]] = arith.constant 128 : index + //CHECK-DAG: %[[OFF_X:.*]] = index.remu %[[L_OFF_X]], %[[C128]] + //CHECK-DAG: %[[LOAD:.*]] = xegpu.load_nd %[[TDESC]][{{%.*}}, {{%.*}}] : !xegpu.tensor_desc<32x32xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<32x32xf32> + %tdesc = xegpu.create_nd_tdesc %src : memref<256x128xf32> -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [1, 16], lane_data = [1, 1]>> %load = xegpu.load_nd %tdesc[0, 0] : !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [1, 16], lane_data = [1, 1]>> @@ -43,9 +50,6 @@ gpu.module @test_distribution { // CHECK-LABEL: store_nd_with_offsets // CHECK-SAME: %[[ARG_0:.*]]: memref<256x128xf32> gpu.func @store_nd_with_offsets(%src: memref<256x128xf32>) { - //CHECK: [[SGID:%.+]] = gpu.subgroup_id : index - //CHECK: [[SGIDY:%.+]] = affine.apply #map()[[[SGID]]] - //CHECK: [[SGIDX:%.+]] = affine.apply #map1()[[[SGID]]] //CHECK: xegpu.store_nd %{{.*}}, {{%.*}}[{{%.*}}, {{%.*}}] : vector<32x32xf32>, !xegpu.tensor_desc<32x32xf32, #xegpu.layout<lane_layout = [1, 
16], lane_data = [1, 1]>> %tdesc = xegpu.create_nd_tdesc %src: memref<256x128xf32> -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [1, 16], lane_data = [1, 1]>> @@ -60,9 +64,6 @@ gpu.module @test_distribution { // CHECK-LABEL: prefetch_nd_tdesc_with_offset // CHECK-SAME: %[[ARG_0:.*]]: memref<256x128xf32> gpu.func @prefetch_nd_tdesc_with_offset(%src: memref<256x128xf32>) { - //CHECK: [[SGID:%.+]] = gpu.subgroup_id : index - //CHECK: [[SGIDY:%.+]] = affine.apply #map()[[[SGID]]] - //CHECK: [[SGIDX:%.+]] = affine.apply #map1()[[[SGID]]] //CHECK: xegpu.prefetch_nd %{{.*}}[{{%.*}}, {{%.*}}] : !xegpu.tensor_desc<32x32xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> %cst0 = arith.constant 0 : index %tdesc = xegpu.create_nd_tdesc %src : memref<256x128xf32> @@ -319,17 +320,15 @@ gpu.module @test_distribution { gpu.func @distribute_load_matrix(%arg0: memref<32768xi8, 3>) { //CHECK: [[mdesc:%.+]] = xegpu.create_mem_desc [[arg0]] : memref<32768xi8, 3> -> !xegpu.mem_desc<64x128xf32> //CHECK: [[sgid:%.+]] = gpu.subgroup_id : index - //CHECK: [[c2:%.+]] = arith.constant 2 : index //CHECK: [[c4:%.+]] = arith.constant 4 : index - //CHECK: [[c4_0:%.+]] = arith.constant 4 : index - //CHECK: [[id_y:%.+]] = affine.apply #map()[[[sgid]]] - //CHECK: [[id_x:%.+]] = affine.apply #map1()[[[sgid]]] + //CHECK: [[sgidx:%.+]] = index.remu [[sgid]], [[c4]] + //CHECK: [[sgidy_tmp:%.+]] = index.divu [[sgid]], [[c4]] + //CHECK: [[c2:%.+]] = arith.constant 2 : index + //CHECK: [[sgidy:%.+]] = index.remu [[sgidy_tmp]], [[c2]] //CHECK: [[c32:%.+]] = arith.constant 32 : index - //CHECK: [[l_off_y:%.+]] = index.mul [[id_y]], [[c32]] - //CHECK: [[c32_1:%.+]] = arith.constant 32 : index - //CHECK: [[l_off_x:%.+]] = index.mul [[id_x]], [[c32_1]] - //CHECK: [[c0:%.+]] = arith.constant 0 : index - //CHECK: [[c0_1:%.+]] = arith.constant 0 : index + //CHECK: [[l_off_y:%.+]] = index.mul [[sgidy]], [[c32]] + //CHECK: [[c32_0:%.+]] = arith.constant 32 : index + //CHECK: [[l_off_x:%.+]] = index.mul [[sgidx]], [[c32_0]] //CHECK: [[c64:%.+]] = arith.constant 64 : index //CHECK: [[off_y:%.+]] = index.remu [[l_off_y]], [[c64]] //CHECK: [[c128:%.+]] = arith.constant 128 : index @@ -346,17 +345,15 @@ gpu.module @test_distribution { //CHECK: [[cst:%.+]] = arith.constant dense<1.000000e+00> : vector<32x32xf32> //CHECK: [[mdesc:%.+]] = xegpu.create_mem_desc [[arg0]] : memref<32768xi8, 3> -> !xegpu.mem_desc<64x128xf32> //CHECK: [[sgid:%.+]] = gpu.subgroup_id : index - //CHECK: [[c2:%.+]] = arith.constant 2 : index //CHECK: [[c4:%.+]] = arith.constant 4 : index - //CHECK: [[c4_0:%.+]] = arith.constant 4 : index - //CHECK: [[id_y:%.+]] = affine.apply #map()[[[sgid]]] - //CHECK: [[id_x:%.+]] = affine.apply #map1()[[[sgid]]] + //CHECK: [[sgidx:%.+]] = index.remu [[sgid]], [[c4]] + //CHECK: [[sgidy_tmp:%.+]] = index.divu [[sgid]], [[c4]] + //CHECK: [[c2:%.+]] = arith.constant 2 : index + //CHECK: [[sgidy:%.+]] = index.remu [[sgidy_tmp]], [[c2]] //CHECK: [[c32:%.+]] = arith.constant 32 : index - //CHECK: [[l_off_y:%.+]] = index.mul [[id_y]], [[c32]] - //CHECK: [[c32_1:%.+]] = arith.constant 32 : index - //CHECK: [[l_off_x:%.+]] = index.mul [[id_x]], [[c32_1]] - //CHECK: [[c0:%.+]] = arith.constant 0 : index - //CHECK: [[c0_2:%.+]] = arith.constant 0 : index + //CHECK: [[l_off_y:%.+]] = index.mul [[sgidy]], [[c32]] + //CHECK: [[c32_0:%.+]] = arith.constant 32 : index + //CHECK: [[l_off_x:%.+]] = index.mul [[sgidx]], [[c32_0]] //CHECK: [[c64:%.+]] = arith.constant 64 : index //CHECK: 
[[off_y:%.+]] = index.remu [[l_off_y]], [[c64]] //CHECK: [[c128:%.+]] = arith.constant 128 : index @@ -411,14 +408,17 @@ gpu.module @test_distribution { // CHECK-LABEL: vector_step_op gpu.func @vector_step_op_slice_attr() { //CHECK: [[sgId:%.+]] = gpu.subgroup_id : index - //CHECK-DAG: [[IDY:%.+]] = affine.apply #map2()[[[sgId]]] - //CHECK-DAG: [[c32:%.+]] = arith.constant 32 : index - //CHECK-DAG: [[LY:%.+]] = index.mul [[IDY]], [[c32]] - //CHECK-DAG: [[c0:%.+]] = arith.constant 0 : index - //CHECK-DAG: [[c128:%.+]] = arith.constant 128 : index - //CHECK-DAG: [[MODY:%.+]] = index.remu [[LY]], [[c128]] - //CHECK-DAG: [[BASE:%.+]] = vector.step : vector<32xindex> - //CHECK-DAG: [[CAST:%.+]] = vector.broadcast [[MODY]] : index to vector<32xindex> + //CHECK: [[c8:%.+]] = arith.constant 8 : index + //CHECK: [[sgidx:%.+]] = index.remu [[sgId]], [[c8]] + //CHECK: [[sgidy_tmp:%.+]] = index.divu [[sgId]], [[c8]] + //CHECK: [[c4:%.+]] = arith.constant 4 : index + //CHECK: [[sgidy:%.+]] = index.remu [[sgidy_tmp]], [[c4]] + //CHECK: [[c32:%.+]] = arith.constant 32 : index + //CHECK: [[LY:%.+]] = index.mul [[sgidy]], [[c32]] + //CHECK: [[c128:%.+]] = arith.constant 128 : index + //CHECK: [[MODY:%.+]] = index.remu [[LY]], [[c128]] + //CHECK: [[BASE:%.+]] = vector.step : vector<32xindex> + //CHECK: [[CAST:%.+]] = vector.broadcast [[MODY]] : index to vector<32xindex> //CHECK: [[ADD:%.+]] = arith.addi [[BASE]], [[CAST]] : vector<32xindex> %step = vector.step {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [4, 8], sg_data = [32, 32]>, dims = [1]>}: vector<128xindex> gpu.return @@ -426,14 +426,14 @@ gpu.module @test_distribution { gpu.func @vector_step_op_layout_attr() { //CHECK: [[sgId:%.+]] = gpu.subgroup_id : index - //CHECK-DAG: [[c16:%.+]] = arith.constant 16 : index - //CHECK-DAG: [[c8:%.+]] = arith.constant 8 : index - //CHECK-DAG: [[LOCALY:%.+]] = index.mul [[sgId]], [[c8]] - //CHECK-DAG: [[c0:%.+]] = arith.constant 0 : index - //CHECK-DAG: [[c128:%.+]] = arith.constant 128 : index - //CHECK-DAG: [[MODY:%.+]] = index.remu [[LOCALY]], [[c128]] - //CHECK-DAG: [[BASE:%.+]] = vector.step : vector<8xindex> - //CHECK-DAG: [[CAST:%.+]] = vector.broadcast [[MODY]] : index to vector<8xindex> + //CHECK: [[c16:%.+]] = arith.constant 16 : index + //CHECK: [[sgidx:%.+]] = index.remu [[sgId]], [[c16]] + //CHECK: [[c8:%.+]] = arith.constant 8 : index + //CHECK: [[LOCALY:%.+]] = index.mul [[sgidx]], [[c8]] + //CHECK: [[c128:%.+]] = arith.constant 128 : index + //CHECK: [[MODY:%.+]] = index.remu [[LOCALY]], [[c128]] + //CHECK: [[BASE:%.+]] = vector.step : vector<8xindex> + //CHECK: [[CAST:%.+]] = vector.broadcast [[MODY]] : index to vector<8xindex> //CHECK: [[ADD:%.+]] = arith.addi [[BASE]], [[CAST]] : vector<8xindex> %step = vector.step {layout_result_0 = #xegpu.layout<sg_layout = [16], sg_data = [8]>}: vector<128xindex> gpu.return @@ -464,14 +464,27 @@ gpu.module @test_distribution { gpu.return } + // CHECK-LABEL: vector_transpose + gpu.func @vector_transpose(%src: memref<256x32xf32>) { + %tdesc = xegpu.create_nd_tdesc %src : memref<256x32xf32> + -> !xegpu.tensor_desc<256x32xf32, #xegpu.layout<sg_layout = [4, 8], sg_data = [64, 32], lane_layout = [16, 1], lane_data = [1, 1], order =[0, 1]>> + %load = xegpu.load_nd %tdesc[0, 0] + : !xegpu.tensor_desc<256x32xf32, #xegpu.layout<sg_layout = [4, 8], sg_data = [64, 32], lane_layout = [16, 1], lane_data = [1, 1], order =[0, 1]>> + -> vector<256x32xf32> + //CHECK: vector.transpose {{.*}}, [1, 0] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = 
[1, 1], order = [1, 0]>} : vector<64x32xf32> to vector<32x64xf32> + %trans = vector.transpose %load, [1, 0] {layout_result_0 = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 64], lane_layout = [1, 16], lane_data = [1, 1], order =[1, 0]>} : vector<256x32xf32> to vector<32x256xf32> + gpu.return + } + // CHECK-LABEL: non_splat_constant_2D gpu.func @non_splat_constant_2D() { // CHECK-DAG: %[[CST:.*]] = arith.constant dense<0> : vector<1x1xindex> // CHECK-DAG: %[[SGID:.*]] = gpu.subgroup_id : index - // CHECK-DAG: affine.apply #map4()[%[[SGID]]] - // CHECK-DAG: affine.apply #map5()[%[[SGID]]] - // CHECK-DAG: %[[IDY:.*]] = index.remu %{{.*}}, %[[C32:.*]] - // CHECK-DAG: %[[IDX:.*]] = index.remu %{{.*}}, %[[C1:.*]] + // CHECK-DAG: %[[SGIDX:.*]] = index.remu %[[SGID]], %{{.*}} + // CHECK-DAG: %[[SGIDY_TMP:.*]] = index.divu %[[SGID]], %{{.*}} + // CHECK-DAG: %[[SGIDY:.*]] = index.remu %[[SGIDY_TMP]], %{{.*}} + // CHECK-DAG: %[[IDY:.*]] = index.remu %[[SGIDY]], %{{.*}} + // CHECK-DAG: %[[IDX:.*]] = index.remu %[[SGIDX]], %{{.*}} // CHECK-DAG: %[[STRIDECOL:.*]] = arith.muli %[[IDY]], %[[C16:.*]] : index // CHECK-DAG: %[[ADD:.*]] = arith.addi %[[C0:.*]], %[[STRIDECOL]] : index // CHECK-DAG: %[[STRIDEROW:.*]] = arith.muli %[[IDX]], %[[C0:.*]] : index @@ -484,20 +497,19 @@ gpu.module @test_distribution { // CHECK-LABEL: non_splat_constant_2D_non_unit_dim gpu.func @non_splat_constant_2D_non_unit_dim() { - // CHECK-DAG: %[[BASECST:.*]] = arith.constant dense<{{.*}} : vector<2x2xindex> + // CHECK-DAG: %[[BASECST:.*]] = arith.constant dense<{{\[}}{{\[}}0, 16{{\]}}, {{\[}}8, 24{{\]}}{{\]}}> : vector<2x2xindex> // CHECK-DAG: %[[SGID:.*]] = gpu.subgroup_id : index - // CHECK-DAG: %[[IDY:.*]] = affine.apply #map()[%[[SGID]]] - // CHECK-DAG: %[[IDX:.*]] = affine.apply #map1()[%[[SGID]]] - // CHECK-DAG: %[[MULY:.*]] = index.mul %[[IDY]], %[[C2:.*]] - // CHECK-DAG: %[[C2_2:.*]] = arith.constant 2 : index - // CHECK-DAG: %[[MULX:.*]] = index.mul %[[IDX]], %[[C2:.*]] + // CHECK-DAG: %[[SGIDX:.*]] = index.remu %[[SGID]], %{{.*}} + // CHECK-DAG: %[[SGIDY_TMP:.*]] = index.divu %[[SGID]], %{{.*}} + // CHECK-DAG: %[[SGIDY:.*]] = index.remu %[[SGIDY_TMP]], %{{.*}} + // CHECK-DAG: %[[MULY:.*]] = index.mul %[[SGIDY]], %[[C2:.*]] + // CHECK-DAG: %[[MULX:.*]] = index.mul %[[SGIDX]], %{{.*}} // CHECK-DAG: %[[REMU_Y:.*]] = index.remu %[[MULY]], %[[C8:.*]] - // CHECK-DAG: %[[C8_2:.*]] = arith.constant 8 : index - // CHECK-DAG: %[[REMU_X:.*]] = index.remu %[[MULX]], %[[C8:.*]] - // CHECK-DAG: %[[MUL5:.*]] = arith.muli %[[REMU_Y]], %[[C8:.*]] : index + // CHECK-DAG: %[[REMU_X:.*]] = index.remu %[[MULX]], %{{.*}} + // CHECK-DAG: %[[MUL5:.*]] = arith.muli %[[REMU_Y]], %{{.*}} : index // CHECK-DAG: %[[ADD:.*]] = arith.addi %[[C0:.*]], %[[MUL5]] : index // CHECK-DAG: %[[MUL6:.*]] = arith.muli %[[REMU_X]], %[[C16:.*]] : index - // CHECK-DAG: %[[ADDSTRIDES:.*]] = arith.addi %[[ADD]], %[[MUL6]] : index + // CHECK-DAG: %[[ADDSTRIDES:.*]] = arith.addi %[[ADD]], %[[MUL6]] : index // CHECK-DAG: %[[BCAST:.*]] = vector.broadcast %[[ADDSTRIDES]] : index to vector<2x2xindex> // CHECK-DAG: %[[ADDCST:.*]] = arith.addi %[[BASECST]], %[[BCAST]] : vector<2x2xindex> %cst_8x8 = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [4, 4], sg_data = [2, 2]>} dense<[ @@ -517,13 +529,14 @@ gpu.module @test_distribution { gpu.func @non_splat_constant() { // CHECK-DAG: %[[CST:.*]] = arith.constant dense<0> : vector<1xindex> // CHECK-DAG: %[[SGID:.*]] = gpu.subgroup_id : index - // CHECK-DAG: %[[REMU:.*]] = index.remu %[[SGID]], %[[C32:.*]] - // 
CHECK-DAG: %[[MUL:.*]] = arith.muli %[[REMU]], %[[C16:.*]] : index + // CHECK-DAG: %[[REMU:.*]] = index.remu %[[SGID]], %{{.*}} + // CHECK-DAG: %[[REMU2:.*]] = index.remu %[[REMU]], %{{.*}} + // CHECK-DAG: %[[MUL:.*]] = arith.muli %[[REMU2]], %[[C16:.*]] : index // CHECK-DAG: %[[ADDSTRIDES:.*]] = arith.addi %[[C0:.*]], %[[MUL]] : index // CHECK-DAG: %[[BCAST:.*]] = vector.broadcast %[[ADDSTRIDES]] : index to vector<1xindex> // CHECK-DAG: %[[ADD:.*]] = arith.addi %[[CST]], %[[BCAST]] : vector<1xindex> %cst = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [32], sg_data = [1]>} dense<[0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496]> : vector<32xindex> - // CHECK: arith.constant dense<{{\[}}[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]{{\]}}> : vector<1x16xindex> + // CHECK: arith.constant dense<{{\[}}{{\[}}0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15{{\]}}{{\]}}> : vector<1x16xindex> %cst_1 = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [32, 1], sg_data = [1, 16]>} dense<[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]]> : vector<1x16xindex> gpu.return } diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir index e83229e3a3995..5ce3d1d0fb5d6 100644 --- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir +++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir @@ -1,47 +1,35 @@ // RUN: mlir-opt --xegpu-wg-to-sg-distribute -split-input-file %s | FileCheck %s -//CHECK: #map = affine_map<()[s0] -> (s0 floordiv 4)> -//CHECK: #map1 = affine_map<()[s0] -> (s0 mod 4)> gpu.module @test_1_1_assignment { // CHECK-LABEL: create_nd_tdesc - // CHECK-SAME: [[ARG_0:%.*]]: memref<256x128xf32> + // CHECK-SAME: %[[ARG_0:.*]]: memref<256x128xf32> gpu.func @create_nd_tdesc(%src: memref<256x128xf32>) { - //CHECK: [[SGID:%.+]] = gpu.subgroup_id : index - //CHECK: [[SGIDY:%.+]] = affine.apply #map()[[[SGID]]] - //CHECK: [[SGIDX:%.+]] = affine.apply #map1()[[[SGID]]] - //CHECK: [[C32:%.+]] = arith.constant 32 : index - //CHECK: [[LY:%.+]] = index.mul [[SGIDY]], [[C32]] - //CHECK: [[LX:%.+]] = index.mul [[SGIDX]], [[C32]] - //CHECK: [[C0:%.+]] = arith.constant 0 : index - //CHECK: [[C0_1:%.+]] = arith.constant 0 : index - //CHECK: [[C256:%.+]] = arith.constant 256 : index - //CHECK: [[Y:%.+]] = index.remu [[LY]], [[C256]] - //CHECK: [[C128:%.+]] = arith.constant 128 : index - //CHECK: [[X:%.+]] = index.remu [[LX]], [[C128]] - //CHECK: [[TDESC:%.+]] = xegpu.create_nd_tdesc [[ARG_0]][[[Y]], [[X]]] : memref<256x128xf32> -> !xegpu.tensor_desc<32x32xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> + // CHECK-DAG: %[[SGID:.*]] = gpu.subgroup_id : index + // CHECK-DAG: %[[REMUX:.*]] = index.remu %[[SGID]], %[[C4:.*]] + // CHECK-DAG: %[[DIVU:.*]] = index.divu %[[SGID]], %[[C4:.*]] + // CHECK-DAG: %[[REMUY:.*]] = index.remu %[[DIVU]], %[[C8:.*]] + // CHECK-DAG: %[[MULY:.*]] = index.mul %[[REMUY]], %[[C32:.*]] + // CHECK-DAG: %[[MULX:.*]] = index.mul %[[REMUX]], %[[C32:.*]] + // CHECK-DAG: %[[MODY:.*]] = index.remu %[[MULY]], %[[C256:.*]] + // CHECK-DAG: %[[MODX:.*]] = index.remu %[[MULX]], %[[C128:.*]] + // CHECK-DAG: %[[TDESC:.*]] = xegpu.create_nd_tdesc %[[ARG_0]][%[[MODY]], %[[MODX]]] : memref<256x128xf32> -> !xegpu.tensor_desc<32x32xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<256x128xf32> -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 
4], sg_data = [32, 32], lane_layout = [1, 16], lane_data = [1, 1]>> gpu.return } // CHECK-LABEL: create_nd_tdesc_from_higher_rank_memref - // CHECK-SAME: [[ARG_0:%.*]]: memref<3x256x128xf32> + // CHECK-SAME: %[[ARG_0:.*]]: memref<3x256x128xf32> gpu.func @create_nd_tdesc_from_higher_rank_memref(%src: memref<3x256x128xf32>) { - //CHECK: [[SGID:%.+]] = gpu.subgroup_id : index - //CHECK: [[SGIDY:%.+]] = affine.apply #map()[[[SGID]]] - //CHECK: [[SGIDX:%.+]] = affine.apply #map1()[[[SGID]]] - //CHECK: [[C32:%.+]] = arith.constant 32 : index - //CHECK: [[LY:%.+]] = index.mul [[SGIDY]], [[C32]] - //CHECK: [[LX:%.+]] = index.mul [[SGIDX]], [[C32]] - //CHECK: [[C0:%.+]] = arith.constant 0 : index - //CHECK: [[C0_2:%.+]] = arith.constant 0 : index - //CHECK: [[C256:%.+]] = arith.constant 256 : index - //CHECK: [[MODY:%.+]] = index.remu [[LY]], [[C256]] - //CHECK: [[C128:%.+]] = arith.constant 128 : index - //CHECK: [[MODX:%.+]] = index.remu [[LX]], [[C128]] - //CHECK: [[C0_3:%.+]] = arith.constant 0 : index - //CHECK: [[C0_4:%.+]] = arith.constant 0 : index - //CHECK: [[TDESC:%.+]] = xegpu.create_nd_tdesc [[ARG_0]][1, [[MODY]], [[MODX]]] : memref<3x256x128xf32> -> !xegpu.tensor_desc<32x32xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> + // CHECK-DAG: %[[SGID:.*]] = gpu.subgroup_id : index + // CHECK-DAG: %[[REMUX:.*]] = index.remu %[[SGID]], %[[C4:.*]] + // CHECK-DAG: %[[DIVU:.*]] = index.divu %[[SGID]], %[[C4:.*]] + // CHECK-DAG: %[[REMUY:.*]] = index.remu %[[DIVU]], %[[C8:.*]] + // CHECK-DAG: %[[MULY:.*]] = index.mul %[[REMUY]], %[[C32:.*]] + // CHECK-DAG: %[[MULX:.*]] = index.mul %[[REMUX]], %[[C32:.*]] + // CHECK-DAG: %[[MODY:.*]] = index.remu %[[MULY]], %[[C256:.*]] + // CHECK-DAG: %[[MODX:.*]] = index.remu %[[MULX]], %[[C128:.*]] + // CHECK-DAG: %[[TDESC:.*]] = xegpu.create_nd_tdesc %[[ARG_0]][1, %[[MODY]], %[[MODX]]] : memref<3x256x128xf32> -> !xegpu.tensor_desc<32x32xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> %tdesc = xegpu.create_nd_tdesc %src[1, 0, 0] : memref<3x256x128xf32> -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [1, 16], lane_data = [1, 1]>> gpu.return @@ -81,25 +69,24 @@ gpu.module @test_1_1_assignment { xegpu.store_nd %load, %tdesc : vector<256x128xf32>, !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [1, 16], lane_data = [1, 1]>> gpu.return -} + } -// CHECK-LABEL: update_nd -// CHECK-SAME: %[[ARG_0:.*]]: memref<256x128xf32> -gpu.func @update_nd(%src: memref<256x128xf32>){ - // CHECK: %[[TDESC:.*]] = xegpu.create_nd_tdesc %[[ARG_0]][{{%.*}}, {{%.*}}] : memref<256x128xf32> - // CHECK-SAME: -> !xegpu.tensor_desc<32x32xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> - // CHECK: %[[UPDATE:.*]] = xegpu.update_nd_offset %[[TDESC]], [0, 16] - // CHECK-SAME: : !xegpu.tensor_desc<32x32xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> - %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<256x128xf32> - -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [1, 16], lane_data = [1, 1]>> - %update = xegpu.update_nd_offset %tdesc, [0, 16] - : !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [1, 16], lane_data = [1, 1]>> - gpu.return -} + // CHECK-LABEL: update_nd + // CHECK-SAME: %[[ARG_0:.*]]: memref<256x128xf32> + gpu.func @update_nd(%src: memref<256x128xf32>){ + // CHECK: %[[TDESC:.*]] = xegpu.create_nd_tdesc 
%[[ARG_0]][{{%.*}}, {{%.*}}] : memref<256x128xf32> + // CHECK-SAME: -> !xegpu.tensor_desc<32x32xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> + // CHECK: %[[UPDATE:.*]] = xegpu.update_nd_offset %[[TDESC]], [0, 16] + // CHECK-SAME: : !xegpu.tensor_desc<32x32xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> + %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<256x128xf32> + -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [1, 16], lane_data = [1, 1]>> + %update = xegpu.update_nd_offset %tdesc, [0, 16] + : !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [1, 16], lane_data = [1, 1]>> + gpu.return + } -// CHECK-LABEL: dpas -gpu.func @dpas(%a: memref<128x128xf16>, %b: memref<128x128xf16>) { - // CHECK: %[[DPAS:.*]] = xegpu.dpas %{{.*}}, %{{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<16x128xf16>, vector<128x16xf16> -> vector<16x16xf32> + // CHECK-LABEL: dpas + gpu.func @dpas(%a: memref<128x128xf16>, %b: memref<128x128xf16>) { %tdesc_a = xegpu.create_nd_tdesc %a[0, 0] : memref<128x128xf16> -> !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128], lane_layout = [1, 16], lane_data = [1, 1]>> %load_a = xegpu.load_nd %tdesc_a @@ -110,16 +97,15 @@ gpu.func @dpas(%a: memref<128x128xf16>, %b: memref<128x128xf16>) { %load_b = xegpu.load_nd %tdesc_b : !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [128, 16], lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<128x128xf16> + // CHECK: %[[DPAS:.*]] = xegpu.dpas %{{.*}}, %{{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<16x128xf16>, vector<128x16xf16> -> vector<16x16xf32> %dpas = xegpu.dpas %load_a, %load_b {layout_result_0 = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>} : vector<128x128xf16>, vector<128x128xf16> -> vector<128x128xf32> gpu.return } - -// CHECK-LABEL: dpas_no_sg_data -gpu.func @dpas_no_sg_data(%a: memref<128x128xf16>, %b: memref<128x128xf16>) { - // CHECK: %[[DPAS:.*]] = xegpu.dpas %{{.*}}, %{{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1], order = [1, 0]>} : vector<16x16xf16>, vector<16x16xf16> -> vector<16x16xf32> + // CHECK-LABEL: dpas_no_sg_data + gpu.func @dpas_no_sg_data(%a: memref<128x128xf16>, %b: memref<128x128xf16>) { %tdesc_a = xegpu.create_nd_tdesc %a[0, 0] : memref<128x128xf16> -> !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], lane_layout = [1, 16], lane_data = [1, 1], order = [1, 0]>> @@ -134,6 +120,7 @@ gpu.func @dpas_no_sg_data(%a: memref<128x128xf16>, %b: memref<128x128xf16>) { : !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], lane_layout = [1, 16], lane_data = [2, 1], order = [1, 0]>> -> vector<128x128xf16> + // CHECK: %[[DPAS:.*]] = xegpu.dpas %{{.*}}, %{{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1], order = [1, 0]>} : vector<16x16xf16>, vector<16x16xf16> -> vector<16x16xf32> %dpas = xegpu.dpas %load_a, %load_b {layout_result_0 = #xegpu.layout<sg_layout = [8, 8], lane_layout = [1, 16], lane_data = [1, 1], order = [1, 0]>} : vector<128x128xf16>, vector<128x128xf16> -> vector<128x128xf32> @@ -196,9 +183,9 @@ gpu.func @dpas_no_sg_data(%a: memref<128x128xf16>, %b: memref<128x128xf16>) { } gpu.func @scf_for(%arg0: memref<1024x1024xf16>, %arg1: memref<1024x1024xf16>, %arg2: 
memref<1024x1024xf32>) { - //CHECK: [[c0:%.+]] = arith.constant 0 : index - //CHECK: [[c128:%.+]] = arith.constant 128 : index - //CHECK: [[c1024:%.+]] = arith.constant 1024 : index + // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index + // CHECK-DAG: %[[C128:.*]] = arith.constant 128 : index + // CHECK-DAG: %[[C1024:.*]] = arith.constant 1024 : index %c0 = arith.constant 0 : index %c128 = arith.constant 128 : index %c1024 = arith.constant 1024 : index @@ -211,15 +198,15 @@ gpu.func @dpas_no_sg_data(%a: memref<128x128xf16>, %b: memref<128x128xf16>) { %4 = xegpu.create_nd_tdesc %arg0[%0, %c0] : memref<1024x1024xf16> -> !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128]>> %5 = xegpu.create_nd_tdesc %arg1[%c0, %1] : memref<1024x1024xf16> -> !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [128, 16]>> - // CHECK: [[scf:%.+]]:3 = scf.for [[arg3:%.+]] = [[c0]] to [[c1024]] step [[c128]] - // CHECK-SAME: iter_args([[arg4:%.+]] = {{.*}}, [[arg5:%.+]] = {{.*}}, [[arg6:%.+]] = {{.*}}) -> + // CHECK: %[[SCF:.*]]:3 = scf.for %[[ARG3:.*]] = %[[C0]] to %[[C1024]] step %[[C128]] + // CHECK-SAME: iter_args(%[[ARG4:.*]] = {{.*}}, %[[ARG5:.*]] = {{.*}}, %[[ARG6:.*]] = {{.*}}) -> // CHECK-SAME: (!xegpu.tensor_desc<16x128xf16>, !xegpu.tensor_desc<128x16xf16>, vector<16x16xf32>) - // CHECK: [[a:%.+]] = xegpu.load_nd [[arg4]] : !xegpu.tensor_desc<16x128xf16> -> vector<16x128xf16> - // CHECK: [[b:%.+]] = xegpu.load_nd [[arg5]] : !xegpu.tensor_desc<128x16xf16> -> vector<128x16xf16> - // CHECK: [[c:%.+]] = xegpu.dpas [[a]], [[b]], [[arg6]] : vector<16x128xf16>, vector<128x16xf16>, vector<16x16xf32> -> vector<16x16xf32> - // CHECK: [[at:%.+]] = xegpu.update_nd_offset [[arg4]], [[[c0]], [[c128]]] : !xegpu.tensor_desc<16x128xf16> - // CHECK: [[bt:%.+]] = xegpu.update_nd_offset [[arg5]], [[[c128]], [[c0]]] : !xegpu.tensor_desc<128x16xf16> - // CHECK: scf.yield [[at]], [[bt]], [[c]] : !xegpu.tensor_desc<16x128xf16>, !xegpu.tensor_desc<128x16xf16>, vector<16x16xf32> + // CHECK: %[[A:.*]] = xegpu.load_nd %[[ARG4]] : !xegpu.tensor_desc<16x128xf16> -> vector<16x128xf16> + // CHECK: %[[B:.*]] = xegpu.load_nd %[[ARG5]] : !xegpu.tensor_desc<128x16xf16> -> vector<128x16xf16> + // CHECK: %[[C:.*]] = xegpu.dpas %[[A]], %[[B]], %[[ARG6]] : vector<16x128xf16>, vector<128x16xf16>, vector<16x16xf32> -> vector<16x16xf32> + // CHECK: %[[AT:.*]] = xegpu.update_nd_offset %[[ARG4]], [%[[C0]], %[[C128]]] : !xegpu.tensor_desc<16x128xf16> + // CHECK: %[[BT:.*]] = xegpu.update_nd_offset %[[ARG5]], [%[[C128]], %[[C0]]] : !xegpu.tensor_desc<128x16xf16> + // CHECK: scf.yield %[[AT]], %[[BT]], %[[C]] : !xegpu.tensor_desc<16x128xf16>, !xegpu.tensor_desc<128x16xf16>, vector<16x16xf32> %6:3 = scf.for %arg3 = %c0 to %c1024 step %c128 iter_args(%arg4 = %4, %arg5 = %5, %arg6 = %3) -> (!xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128]>>, !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [128, 16]>>, vector<128x128xf32>) { @@ -252,7 +239,7 @@ gpu.func @dpas_no_sg_data(%a: memref<128x128xf16>, %b: memref<128x128xf16>) { // CHECK: scf.condition{{.*}} : vector<16xf32>, i32 scf.condition(%4) %arg2, %arg3 : vector<256xf32>, i32 } do { - // CHECK: ([[arg2:%.+]]: vector<16xf32>, [[arg3:%.+]]: i32) + // CHECK: (%[[ARG2:.*]]: vector<16xf32>, %[[ARG3:.*]]: i32) ^bb0(%arg2: vector<256xf32>, %arg3: i32): xegpu.store_nd %arg2, %2 : vector<256xf32>, !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [16], sg_data = [16]>> %4 = arith.addi 
%arg3, %c1_i32 : i32 @@ -344,9 +331,9 @@ gpu.func @dpas_no_sg_data(%a: memref<128x128xf16>, %b: memref<128x128xf16>) { %cond4 = arith.cmpi slt, %sg_id, %c31 : index %cond5 = arith.andi %cond3, %cond4 : i1 scf.if %cond5 { - // CHECK: %[[SGID:.*]] = gpu.subgroup_id : index - // CHECK: %[[C2:.*]] = arith.constant 2 : index - // CHECK: %[[SUB:.*]] = index.sub %{{.*}}, %[[C2]] + // CHECK: %[[SGID:.*]] = gpu.subgroup_id : index + // CHECK: %[[C2:.*]] = arith.constant 2 : index + // CHECK: %[[SUB:.*]] = index.sub %{{.*}}, %[[C2]] %tdesc = xegpu.create_nd_tdesc %src2[0, 0] : memref<128x64xf32> -> !xegpu.tensor_desc<128x64xf32, #xegpu.layout<sg_layout = [4, 4], sg_data = [32, 16], lane_layout = [8, 4], lane_data = [1, 1]>> %load = xegpu.load_nd %tdesc From c5aaee0bb07b221e5d3314bbdcf1abc4a604d6bd Mon Sep 17 00:00:00 2001 From: Kewen Meng <Kewen.Meng@amd.com> Date: Tue, 4 Nov 2025 19:43:16 -0800 Subject: [PATCH 273/313] Revert commit d8e5698 and 15b19c7 (#166498) --- libc/src/stdio/CMakeLists.txt | 24 --------- libc/src/stdio/asprintf.cpp | 18 +------ libc/src/stdio/baremetal/CMakeLists.txt | 8 --- libc/src/stdio/baremetal/printf.cpp | 23 ++------ libc/src/stdio/baremetal/vprintf.cpp | 23 ++------ libc/src/stdio/generic/CMakeLists.txt | 4 -- libc/src/stdio/generic/fprintf.cpp | 17 +----- libc/src/stdio/generic/printf.cpp | 17 +----- libc/src/stdio/generic/vfprintf.cpp | 17 +----- libc/src/stdio/generic/vprintf.cpp | 17 +----- libc/src/stdio/printf_core/CMakeLists.txt | 25 --------- libc/src/stdio/printf_core/core_structs.h | 19 +++---- libc/src/stdio/printf_core/error_mapper.h | 21 -------- .../stdio/printf_core/generic/CMakeLists.txt | 8 --- .../stdio/printf_core/generic/error_mapper.h | 49 ----------------- .../stdio/printf_core/linux/CMakeLists.txt | 8 --- .../stdio/printf_core/linux/error_mapper.h | 54 ------------------- libc/src/stdio/printf_core/printf_main.h | 9 ++-- .../stdio/printf_core/vasprintf_internal.h | 20 ++++--- .../src/stdio/printf_core/vfprintf_internal.h | 41 +++++--------- .../stdio/printf_core/write_int_converter.h | 4 +- libc/src/stdio/printf_core/writer.h | 8 +-- libc/src/stdio/snprintf.cpp | 19 +------ libc/src/stdio/sprintf.cpp | 18 +------ libc/src/stdio/vasprintf.cpp | 16 +----- libc/src/stdio/vsnprintf.cpp | 19 +------ libc/src/stdio/vsprintf.cpp | 17 +----- libc/src/stdlib/CMakeLists.txt | 6 --- libc/src/stdlib/strfromd.cpp | 11 +--- libc/src/stdlib/strfromf.cpp | 11 +--- libc/src/stdlib/strfroml.cpp | 11 +--- libc/src/time/strftime_core/strftime_main.h | 3 +- libc/test/src/stdio/CMakeLists.txt | 2 - libc/test/src/stdio/fprintf_test.cpp | 24 --------- .../src/stdio/printf_core/converter_test.cpp | 30 +++++------ .../src/stdio/printf_core/writer_test.cpp | 32 +++++------ libc/test/src/stdio/snprintf_test.cpp | 15 ------ libc/test/src/stdio/vfprintf_test.cpp | 5 -- libc/test/src/stdlib/StrfromTest.h | 19 +------ 39 files changed, 104 insertions(+), 588 deletions(-) delete mode 100644 libc/src/stdio/printf_core/error_mapper.h delete mode 100644 libc/src/stdio/printf_core/generic/CMakeLists.txt delete mode 100644 libc/src/stdio/printf_core/generic/error_mapper.h delete mode 100644 libc/src/stdio/printf_core/linux/CMakeLists.txt delete mode 100644 libc/src/stdio/printf_core/linux/error_mapper.h diff --git a/libc/src/stdio/CMakeLists.txt b/libc/src/stdio/CMakeLists.txt index c75c8b11be2b5..b0a6ef1e291b5 100644 --- a/libc/src/stdio/CMakeLists.txt +++ b/libc/src/stdio/CMakeLists.txt @@ -125,10 +125,6 @@ add_entrypoint_object( DEPENDS libc.src.stdio.printf_core.printf_main 
libc.src.stdio.printf_core.writer - libc.src.stdio.printf_core.core_structs - libc.src.stdio.printf_core.error_mapper - libc.src.__support.libc_errno - libc.src.__support.CPP.limits ) add_entrypoint_object( @@ -140,10 +136,6 @@ add_entrypoint_object( DEPENDS libc.src.stdio.printf_core.printf_main libc.src.stdio.printf_core.writer - libc.src.stdio.printf_core.core_structs - libc.src.stdio.printf_core.error_mapper - libc.src.__support.libc_errno - libc.src.__support.CPP.limits ) add_entrypoint_object( @@ -154,10 +146,6 @@ add_entrypoint_object( asprintf.h DEPENDS libc.src.stdio.printf_core.vasprintf_internal - libc.src.stdio.printf_core.core_structs - libc.src.stdio.printf_core.error_mapper - libc.src.__support.libc_errno - libc.src.__support.CPP.limits ) add_entrypoint_object( @@ -169,10 +157,6 @@ add_entrypoint_object( DEPENDS libc.src.stdio.printf_core.printf_main libc.src.stdio.printf_core.writer - libc.src.stdio.printf_core.core_structs - libc.src.stdio.printf_core.error_mapper - libc.src.__support.libc_errno - libc.src.__support.CPP.limits ) add_entrypoint_object( @@ -184,10 +168,6 @@ add_entrypoint_object( DEPENDS libc.src.stdio.printf_core.printf_main libc.src.stdio.printf_core.writer - libc.src.stdio.printf_core.core_structs - libc.src.stdio.printf_core.error_mapper - libc.src.__support.libc_errno - libc.src.__support.CPP.limits ) add_entrypoint_object( @@ -198,10 +178,6 @@ add_entrypoint_object( vasprintf.h DEPENDS libc.src.stdio.printf_core.vasprintf_internal - libc.src.stdio.printf_core.core_structs - libc.src.stdio.printf_core.error_mapper - libc.src.__support.libc_errno - libc.src.__support.CPP.limits ) add_subdirectory(printf_core) diff --git a/libc/src/stdio/asprintf.cpp b/libc/src/stdio/asprintf.cpp index 0991dfca6a059..f8cfb74ce48ea 100644 --- a/libc/src/stdio/asprintf.cpp +++ b/libc/src/stdio/asprintf.cpp @@ -7,12 +7,8 @@ //===----------------------------------------------------------------------===// #include "src/stdio/asprintf.h" -#include "src/__support/CPP/limits.h" #include "src/__support/arg_list.h" -#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/stdio/printf_core/core_structs.h" -#include "src/stdio/printf_core/error_mapper.h" #include "src/stdio/printf_core/vasprintf_internal.h" namespace LIBC_NAMESPACE_DECL { @@ -26,18 +22,8 @@ LLVM_LIBC_FUNCTION(int, asprintf, // and pointer semantics, as well as handling // destruction automatically. 
va_end(vlist); - auto ret_val = printf_core::vasprintf_internal(buffer, format, args); - if (!ret_val.has_value()) { - libc_errno = printf_core::internal_error_to_errno(ret_val.error()); - return -1; - } - if (ret_val.value() > static_cast<size_t>(cpp::numeric_limits<int>::max())) { - libc_errno = - printf_core::internal_error_to_errno(-printf_core::OVERFLOW_ERROR); - return -1; - } - - return static_cast<int>(ret_val.value()); + int ret = printf_core::vasprintf_internal(buffer, format, args); + return ret; } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdio/baremetal/CMakeLists.txt b/libc/src/stdio/baremetal/CMakeLists.txt index bfeff0e2b5880..548938f885c94 100644 --- a/libc/src/stdio/baremetal/CMakeLists.txt +++ b/libc/src/stdio/baremetal/CMakeLists.txt @@ -29,12 +29,8 @@ add_entrypoint_object( DEPENDS libc.src.stdio.printf_core.printf_main libc.src.stdio.printf_core.writer - libc.src.stdio.printf_core.error_mapper - libc.src.stdio.printf_core.core_structs libc.src.__support.arg_list libc.src.__support.OSUtil.osutil - libc.src.__support.libc_errno - libc.src.__support.CPP.limits ) add_entrypoint_object( @@ -91,12 +87,8 @@ add_entrypoint_object( DEPENDS libc.src.stdio.printf_core.printf_main libc.src.stdio.printf_core.writer - libc.src.stdio.printf_core.error_mapper - libc.src.stdio.printf_core.core_structs libc.src.__support.arg_list libc.src.__support.OSUtil.osutil - libc.src.__support.libc_errno - libc.src.__support.CPP.limits ) add_entrypoint_object( diff --git a/libc/src/stdio/baremetal/printf.cpp b/libc/src/stdio/baremetal/printf.cpp index 5a9b19ff20471..7253c6549a4e4 100644 --- a/libc/src/stdio/baremetal/printf.cpp +++ b/libc/src/stdio/baremetal/printf.cpp @@ -7,13 +7,10 @@ //===----------------------------------------------------------------------===// #include "src/stdio/printf.h" -#include "src/__support/CPP/limits.h" #include "src/__support/OSUtil/io.h" #include "src/__support/arg_list.h" -#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" #include "src/stdio/printf_core/core_structs.h" -#include "src/stdio/printf_core/error_mapper.h" #include "src/stdio/printf_core/printf_main.h" #include "src/stdio/printf_core/writer.h" @@ -45,25 +42,13 @@ LLVM_LIBC_FUNCTION(int, printf, (const char *__restrict format, ...)) { buffer, BUFF_SIZE, &stdout_write_hook, nullptr); printf_core::Writer<printf_core::WriteMode::FLUSH_TO_STREAM> writer(wb); - auto retval = printf_core::printf_main(&writer, format, args); - if (!retval.has_value()) { - libc_errno = printf_core::internal_error_to_errno(retval.error()); - return -1; - } + int retval = printf_core::printf_main(&writer, format, args); int flushval = wb.overflow_write(""); - if (flushval != printf_core::WRITE_OK) { - libc_errno = printf_core::internal_error_to_errno(-flushval); - return -1; - } + if (flushval != printf_core::WRITE_OK) + retval = flushval; - if (retval.value() > static_cast<size_t>(cpp::numeric_limits<int>::max())) { - libc_errno = - printf_core::internal_error_to_errno(-printf_core::OVERFLOW_ERROR); - return -1; - } - - return static_cast<int>(retval.value()); + return retval; } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdio/baremetal/vprintf.cpp b/libc/src/stdio/baremetal/vprintf.cpp index c172b368d15f3..ab02533f14911 100644 --- a/libc/src/stdio/baremetal/vprintf.cpp +++ b/libc/src/stdio/baremetal/vprintf.cpp @@ -7,13 +7,10 @@ //===----------------------------------------------------------------------===// #include "src/stdio/vprintf.h" -#include 
"src/__support/CPP/limits.h" #include "src/__support/OSUtil/io.h" #include "src/__support/arg_list.h" -#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" #include "src/stdio/printf_core/core_structs.h" -#include "src/stdio/printf_core/error_mapper.h" #include "src/stdio/printf_core/printf_main.h" #include "src/stdio/printf_core/writer.h" @@ -43,25 +40,13 @@ LLVM_LIBC_FUNCTION(int, vprintf, buffer, BUFF_SIZE, &stdout_write_hook, nullptr); printf_core::Writer<printf_core::WriteMode::FLUSH_TO_STREAM> writer(wb); - auto retval = printf_core::printf_main(&writer, format, args); - if (!retval.has_value()) { - libc_errno = printf_core::internal_error_to_errno(retval.error()); - return -1; - } + int retval = printf_core::printf_main(&writer, format, args); int flushval = wb.overflow_write(""); - if (flushval != printf_core::WRITE_OK) { - libc_errno = printf_core::internal_error_to_errno(-flushval); - return -1; - } + if (flushval != printf_core::WRITE_OK) + retval = flushval; - if (retval.value() > static_cast<size_t>(cpp::numeric_limits<int>::max())) { - libc_errno = - printf_core::internal_error_to_errno(-printf_core::OVERFLOW_ERROR); - return -1; - } - - return static_cast<int>(retval.value()); + return retval; } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdio/generic/CMakeLists.txt b/libc/src/stdio/generic/CMakeLists.txt index 71055edea3d9e..6361822b61999 100644 --- a/libc/src/stdio/generic/CMakeLists.txt +++ b/libc/src/stdio/generic/CMakeLists.txt @@ -393,11 +393,7 @@ add_generic_entrypoint_object( list(APPEND fprintf_deps libc.hdr.types.FILE libc.src.__support.arg_list - libc.src.__support.CPP.limits - libc.src.__support.libc_errno libc.src.stdio.printf_core.vfprintf_internal - libc.src.stdio.printf_core.core_structs - libc.src.stdio.printf_core.error_mapper ) if(LLVM_LIBC_FULL_BUILD) diff --git a/libc/src/stdio/generic/fprintf.cpp b/libc/src/stdio/generic/fprintf.cpp index b2033901557a0..087aeadfc52c5 100644 --- a/libc/src/stdio/generic/fprintf.cpp +++ b/libc/src/stdio/generic/fprintf.cpp @@ -8,12 +8,9 @@ #include "src/stdio/fprintf.h" -#include "src/__support/CPP/limits.h" #include "src/__support/File/file.h" #include "src/__support/arg_list.h" #include "src/__support/macros/config.h" -#include "src/stdio/printf_core/core_structs.h" -#include "src/stdio/printf_core/error_mapper.h" #include "src/stdio/printf_core/vfprintf_internal.h" #include "hdr/types/FILE.h" @@ -30,18 +27,8 @@ LLVM_LIBC_FUNCTION(int, fprintf, // and pointer semantics, as well as handling // destruction automatically. 
va_end(vlist); - auto ret_val = printf_core::vfprintf_internal(stream, format, args); - if (!ret_val.has_value()) { - libc_errno = printf_core::internal_error_to_errno(ret_val.error()); - return -1; - } - if (ret_val.value() > static_cast<size_t>(cpp::numeric_limits<int>::max())) { - libc_errno = - printf_core::internal_error_to_errno(-printf_core::OVERFLOW_ERROR); - return -1; - } - - return static_cast<int>(ret_val.value()); + int ret_val = printf_core::vfprintf_internal(stream, format, args); + return ret_val; } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdio/generic/printf.cpp b/libc/src/stdio/generic/printf.cpp index 8d159d5c70870..bb7c7c86f843f 100644 --- a/libc/src/stdio/generic/printf.cpp +++ b/libc/src/stdio/generic/printf.cpp @@ -8,12 +8,9 @@ #include "src/stdio/printf.h" -#include "src/__support/CPP/limits.h" #include "src/__support/File/file.h" #include "src/__support/arg_list.h" #include "src/__support/macros/config.h" -#include "src/stdio/printf_core/core_structs.h" -#include "src/stdio/printf_core/error_mapper.h" #include "src/stdio/printf_core/vfprintf_internal.h" #include "hdr/types/FILE.h" @@ -34,19 +31,9 @@ LLVM_LIBC_FUNCTION(int, printf, (const char *__restrict format, ...)) { // and pointer semantics, as well as handling // destruction automatically. va_end(vlist); - auto ret_val = printf_core::vfprintf_internal( + int ret_val = printf_core::vfprintf_internal( reinterpret_cast<::FILE *>(PRINTF_STDOUT), format, args); - if (!ret_val.has_value()) { - libc_errno = printf_core::internal_error_to_errno(ret_val.error()); - return -1; - } - if (ret_val.value() > static_cast<size_t>(cpp::numeric_limits<int>::max())) { - libc_errno = - printf_core::internal_error_to_errno(-printf_core::OVERFLOW_ERROR); - return -1; - } - - return static_cast<int>(ret_val.value()); + return ret_val; } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdio/generic/vfprintf.cpp b/libc/src/stdio/generic/vfprintf.cpp index a26f082ed9347..01f4265f118a6 100644 --- a/libc/src/stdio/generic/vfprintf.cpp +++ b/libc/src/stdio/generic/vfprintf.cpp @@ -8,12 +8,9 @@ #include "src/stdio/vfprintf.h" -#include "src/__support/CPP/limits.h" #include "src/__support/File/file.h" #include "src/__support/arg_list.h" #include "src/__support/macros/config.h" -#include "src/stdio/printf_core/core_structs.h" -#include "src/stdio/printf_core/error_mapper.h" #include "src/stdio/printf_core/vfprintf_internal.h" #include "hdr/types/FILE.h" @@ -27,18 +24,8 @@ LLVM_LIBC_FUNCTION(int, vfprintf, internal::ArgList args(vlist); // This holder class allows for easier copying // and pointer semantics, as well as handling // destruction automatically. 
- auto ret_val = printf_core::vfprintf_internal(stream, format, args); - if (!ret_val.has_value()) { - libc_errno = printf_core::internal_error_to_errno(ret_val.error()); - return -1; - } - if (ret_val.value() > static_cast<size_t>(cpp::numeric_limits<int>::max())) { - libc_errno = - printf_core::internal_error_to_errno(-printf_core::OVERFLOW_ERROR); - return -1; - } - - return static_cast<int>(ret_val.value()); + int ret_val = printf_core::vfprintf_internal(stream, format, args); + return ret_val; } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdio/generic/vprintf.cpp b/libc/src/stdio/generic/vprintf.cpp index ae2160219f2bb..08d71515646ed 100644 --- a/libc/src/stdio/generic/vprintf.cpp +++ b/libc/src/stdio/generic/vprintf.cpp @@ -8,12 +8,9 @@ #include "src/stdio/vprintf.h" -#include "src/__support/CPP/limits.h" #include "src/__support/File/file.h" #include "src/__support/arg_list.h" #include "src/__support/macros/config.h" -#include "src/stdio/printf_core/core_structs.h" -#include "src/stdio/printf_core/error_mapper.h" #include "src/stdio/printf_core/vfprintf_internal.h" #include "hdr/types/FILE.h" @@ -32,19 +29,9 @@ LLVM_LIBC_FUNCTION(int, vprintf, internal::ArgList args(vlist); // This holder class allows for easier copying // and pointer semantics, as well as handling // destruction automatically. - auto ret_val = printf_core::vfprintf_internal( + int ret_val = printf_core::vfprintf_internal( reinterpret_cast<::FILE *>(PRINTF_STDOUT), format, args); - if (!ret_val.has_value()) { - libc_errno = printf_core::internal_error_to_errno(ret_val.error()); - return -1; - } - if (ret_val.value() > static_cast<size_t>(cpp::numeric_limits<int>::max())) { - libc_errno = - printf_core::internal_error_to_errno(-printf_core::OVERFLOW_ERROR); - return -1; - } - - return static_cast<int>(ret_val.value()); + return ret_val; } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdio/printf_core/CMakeLists.txt b/libc/src/stdio/printf_core/CMakeLists.txt index 624129b2b36e7..ee66145e60156 100644 --- a/libc/src/stdio/printf_core/CMakeLists.txt +++ b/libc/src/stdio/printf_core/CMakeLists.txt @@ -32,17 +32,6 @@ if(printf_config_copts) list(PREPEND printf_config_copts "COMPILE_OPTIONS") endif() -if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${LIBC_TARGET_OS}) - add_subdirectory(${LIBC_TARGET_OS}) -else() - add_subdirectory(generic) -endif() - -set(target_error_mapper libc.src.stdio.printf_core.${LIBC_TARGET_OS}.error_mapper) -if(NOT TARGET ${target_error_mapper}) - set(target_error_mapper libc.src.stdio.printf_core.generic.error_mapper) -endif() - add_header_library( printf_config HDRS @@ -58,7 +47,6 @@ add_header_library( libc.include.inttypes libc.src.__support.CPP.string_view libc.src.__support.FPUtil.fp_bits - libc.hdr.errno_macros ) add_header_library( @@ -137,7 +125,6 @@ add_header_library( .writer .core_structs libc.src.__support.arg_list - libc.src.__support.error_or ) add_header_library( @@ -149,20 +136,10 @@ add_header_library( libc.hdr.func.free libc.hdr.func.realloc libc.src.__support.arg_list - libc.src.__support.error_or libc.src.stdio.printf_core.printf_main libc.src.stdio.printf_core.writer ) -add_header_library( - error_mapper - HDRS - error_mapper.h - DEPENDS - ${target_error_mapper} - libc.src.__support.macros.properties.architectures -) - if(NOT (TARGET libc.src.__support.File.file) AND LLVM_LIBC_FULL_BUILD) # Not all platforms have a file implementation. If file is unvailable, and a # full build is requested, then we must skip all file based printf sections. 
@@ -175,10 +152,8 @@ add_header_library( vfprintf_internal.h DEPENDS libc.src.__support.File.file - libc.src.__support.error_or libc.src.__support.arg_list libc.src.stdio.printf_core.printf_main libc.src.stdio.printf_core.writer ${use_system_file} ) - diff --git a/libc/src/stdio/printf_core/core_structs.h b/libc/src/stdio/printf_core/core_structs.h index 0d41f2244d8da..e27f77b6b594a 100644 --- a/libc/src/stdio/printf_core/core_structs.h +++ b/libc/src/stdio/printf_core/core_structs.h @@ -132,17 +132,14 @@ template <typename T> LIBC_INLINE constexpr TypeDesc type_desc_from_type() { // This is the value to be returned by conversions when no error has occurred. constexpr int WRITE_OK = 0; -// These are the error return values used by the printf engine when an -// error has occurred. They are all large negative, distinct values starting -// from -1000 to not overlap with system errors. -constexpr int FILE_WRITE_ERROR = -1001; -constexpr int FILE_STATUS_ERROR = -1002; -constexpr int NULLPTR_WRITE_ERROR = -1003; -constexpr int INT_CONVERSION_ERROR = -1004; -constexpr int FIXED_POINT_CONVERSION_ERROR = -1005; -constexpr int ALLOCATION_ERROR = -1006; -constexpr int OVERFLOW_ERROR = -1007; - +// These are the printf return values for when an error has occurred. They are +// all negative, and should be distinct. +constexpr int FILE_WRITE_ERROR = -1; +constexpr int FILE_STATUS_ERROR = -2; +constexpr int NULLPTR_WRITE_ERROR = -3; +constexpr int INT_CONVERSION_ERROR = -4; +constexpr int FIXED_POINT_CONVERSION_ERROR = -5; +constexpr int ALLOCATION_ERROR = -6; } // namespace printf_core } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdio/printf_core/error_mapper.h b/libc/src/stdio/printf_core/error_mapper.h deleted file mode 100644 index 23030930133a1..0000000000000 --- a/libc/src/stdio/printf_core/error_mapper.h +++ /dev/null @@ -1,21 +0,0 @@ -//===-- Error mapper for printf ---------------------------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIBC_SRC_STDIO_PRINTF_CORE_ERROR_MAPPER_H -#define LLVM_LIBC_SRC_STDIO_PRINTF_CORE_ERROR_MAPPER_H - -#include "src/__support/macros/properties/architectures.h" - -// Maps internal errors to the available errnos on the platform. -#if defined(__linux__) -#include "linux/error_mapper.h" -#else -#include "generic/error_mapper.h" -#endif - -#endif // LLVM_LIBC_SRC_STDIO_PRINTF_CORE_ERROR_MAPPER_H diff --git a/libc/src/stdio/printf_core/generic/CMakeLists.txt b/libc/src/stdio/printf_core/generic/CMakeLists.txt deleted file mode 100644 index 2f0143d992e31..0000000000000 --- a/libc/src/stdio/printf_core/generic/CMakeLists.txt +++ /dev/null @@ -1,8 +0,0 @@ -add_header_library( - error_mapper - HDRS - error_mapper.h - DEPENDS - libc.src.stdio.printf_core.core_structs - libc.hdr.errno_macros -) diff --git a/libc/src/stdio/printf_core/generic/error_mapper.h b/libc/src/stdio/printf_core/generic/error_mapper.h deleted file mode 100644 index d8cdd2cc2dbaa..0000000000000 --- a/libc/src/stdio/printf_core/generic/error_mapper.h +++ /dev/null @@ -1,49 +0,0 @@ -//===-- Generic implementation of error mapper ------------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIBC_SRC_STDIO_PRINTF_CORE_GENERIC_ERROR_MAPPER_H -#define LLVM_LIBC_SRC_STDIO_PRINTF_CORE_GENERIC_ERROR_MAPPER_H - -#include "hdr/errno_macros.h" -#include "src/stdio/printf_core/core_structs.h" -#include "src/stdio/printf_core/error_mapper.h" - -namespace LIBC_NAMESPACE_DECL { -namespace printf_core { - -LIBC_INLINE static int internal_error_to_errno(int internal_error) { - // System error occured, return error as is. - if (internal_error < 1001 && internal_error > 0) { - return internal_error; - } - - // Map internal error to the available C standard errnos. - switch (-internal_error) { - case WRITE_OK: - return 0; - case FILE_WRITE_ERROR: - case FILE_STATUS_ERROR: - case NULLPTR_WRITE_ERROR: - case ALLOCATION_ERROR: - return EDOM; - case INT_CONVERSION_ERROR: - case FIXED_POINT_CONVERSION_ERROR: - case OVERFLOW_ERROR: - return ERANGE; - default: - LIBC_ASSERT( - false && - "Invalid internal printf error code passed to internal_error_to_errno"); - return EDOM; - } -} - -} // namespace printf_core -} // namespace LIBC_NAMESPACE_DECL - -#endif // LLVM_LIBC_SRC_STDIO_PRINTF_CORE_GENERIC_ERROR_MAPPER_H diff --git a/libc/src/stdio/printf_core/linux/CMakeLists.txt b/libc/src/stdio/printf_core/linux/CMakeLists.txt deleted file mode 100644 index 2f0143d992e31..0000000000000 --- a/libc/src/stdio/printf_core/linux/CMakeLists.txt +++ /dev/null @@ -1,8 +0,0 @@ -add_header_library( - error_mapper - HDRS - error_mapper.h - DEPENDS - libc.src.stdio.printf_core.core_structs - libc.hdr.errno_macros -) diff --git a/libc/src/stdio/printf_core/linux/error_mapper.h b/libc/src/stdio/printf_core/linux/error_mapper.h deleted file mode 100644 index 3c2fe663072d0..0000000000000 --- a/libc/src/stdio/printf_core/linux/error_mapper.h +++ /dev/null @@ -1,54 +0,0 @@ -//===-- Linux implementation of error mapper --------------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIBC_SRC_STDIO_PRINTF_CORE_LINUX_ERROR_MAPPER_H -#define LLVM_LIBC_SRC_STDIO_PRINTF_CORE_LINUX_ERROR_MAPPER_H - -#include "hdr/errno_macros.h" -#include "src/stdio/printf_core/core_structs.h" -#include "src/stdio/printf_core/error_mapper.h" - -namespace LIBC_NAMESPACE_DECL { -namespace printf_core { - -LIBC_INLINE static int internal_error_to_errno(int internal_error) { - // System error occured, return error as is. - if (internal_error < 1001 && internal_error > 0) { - return internal_error; - } - - // Map internal error to POSIX errnos. 
- switch (-internal_error) { - case WRITE_OK: - return 0; - case FILE_WRITE_ERROR: - return EIO; - case FILE_STATUS_ERROR: - return EIO; - case NULLPTR_WRITE_ERROR: - return EINVAL; - case INT_CONVERSION_ERROR: - return ERANGE; - case FIXED_POINT_CONVERSION_ERROR: - return EINVAL; - case ALLOCATION_ERROR: - return ENOMEM; - case OVERFLOW_ERROR: - return EOVERFLOW; - default: - LIBC_ASSERT( - false && - "Invalid internal printf error code passed to internal_error_to_errno"); - return EINVAL; - } -} - -} // namespace printf_core -} // namespace LIBC_NAMESPACE_DECL - -#endif // LLVM_LIBC_SRC_STDIO_PRINTF_CORE_LINUX_ERROR_MAPPER_H diff --git a/libc/src/stdio/printf_core/printf_main.h b/libc/src/stdio/printf_core/printf_main.h index 1c7a7237c097d..57f29858d5298 100644 --- a/libc/src/stdio/printf_core/printf_main.h +++ b/libc/src/stdio/printf_core/printf_main.h @@ -10,7 +10,6 @@ #define LLVM_LIBC_SRC_STDIO_PRINTF_CORE_PRINTF_MAIN_H #include "src/__support/arg_list.h" -#include "src/__support/error_or.h" #include "src/__support/macros/config.h" #include "src/stdio/printf_core/converter.h" #include "src/stdio/printf_core/core_structs.h" @@ -23,9 +22,8 @@ namespace LIBC_NAMESPACE_DECL { namespace printf_core { template <WriteMode write_mode> -ErrorOr<size_t> printf_main(Writer<write_mode> *writer, - const char *__restrict str, - internal::ArgList &args) { +int printf_main(Writer<write_mode> *writer, const char *__restrict str, + internal::ArgList &args) { Parser<internal::ArgList> parser(str, args); int result = 0; for (FormatSection cur_section = parser.get_next_section(); @@ -35,8 +33,9 @@ ErrorOr<size_t> printf_main(Writer<write_mode> *writer, result = convert(writer, cur_section); else result = writer->write(cur_section.raw_string); + if (result < 0) - return Error(-result); + return result; } return writer->get_chars_written(); diff --git a/libc/src/stdio/printf_core/vasprintf_internal.h b/libc/src/stdio/printf_core/vasprintf_internal.h index 41df17b67f35b..283d8df2810fb 100644 --- a/libc/src/stdio/printf_core/vasprintf_internal.h +++ b/libc/src/stdio/printf_core/vasprintf_internal.h @@ -10,7 +10,6 @@ #include "hdr/func/malloc.h" #include "hdr/func/realloc.h" #include "src/__support/arg_list.h" -#include "src/__support/error_or.h" #include "src/stdio/printf_core/core_structs.h" #include "src/stdio/printf_core/printf_main.h" #include "src/stdio/printf_core/writer.h" @@ -30,7 +29,7 @@ LIBC_INLINE int resize_overflow_hook(cpp::string_view new_str, void *target) { if (new_buff == nullptr) { if (wb->buff != wb->init_buff) free(wb->buff); - return ALLOCATION_ERROR; + return printf_core::ALLOCATION_ERROR; } if (isBuffOnStack) inline_memcpy(new_buff, wb->buff, wb->buff_cur); @@ -43,28 +42,27 @@ LIBC_INLINE int resize_overflow_hook(cpp::string_view new_str, void *target) { constexpr size_t DEFAULT_BUFFER_SIZE = 200; -LIBC_INLINE ErrorOr<size_t> vasprintf_internal(char **ret, - const char *__restrict format, - internal::ArgList args) { +LIBC_INLINE int vasprintf_internal(char **ret, const char *__restrict format, + internal::ArgList args) { char init_buff_on_stack[DEFAULT_BUFFER_SIZE]; printf_core::WriteBuffer<Mode<WriteMode::RESIZE_AND_FILL_BUFF>::value> wb( init_buff_on_stack, DEFAULT_BUFFER_SIZE, resize_overflow_hook); printf_core::Writer writer(wb); auto ret_val = printf_core::printf_main(&writer, format, args); - if (!ret_val.has_value()) { + if (ret_val < 0) { *ret = nullptr; - return ret_val; + return -1; } if (wb.buff == init_buff_on_stack) { - *ret = static_cast<char *>(malloc(ret_val.value() + 
1)); + *ret = static_cast<char *>(malloc(ret_val + 1)); if (ret == nullptr) - return Error(ALLOCATION_ERROR); - inline_memcpy(*ret, wb.buff, ret_val.value()); + return printf_core::ALLOCATION_ERROR; + inline_memcpy(*ret, wb.buff, ret_val); } else { *ret = wb.buff; } - (*ret)[ret_val.value()] = '\0'; + (*ret)[ret_val] = '\0'; return ret_val; } } // namespace printf_core diff --git a/libc/src/stdio/printf_core/vfprintf_internal.h b/libc/src/stdio/printf_core/vfprintf_internal.h index 564441d3bf51a..630de9d9d43dd 100644 --- a/libc/src/stdio/printf_core/vfprintf_internal.h +++ b/libc/src/stdio/printf_core/vfprintf_internal.h @@ -11,7 +11,6 @@ #include "src/__support/File/file.h" #include "src/__support/arg_list.h" -#include "src/__support/error_or.h" #include "src/__support/macros/attributes.h" // For LIBC_INLINE #include "src/__support/macros/config.h" #include "src/stdio/printf_core/core_structs.h" @@ -36,8 +35,8 @@ LIBC_INLINE void funlockfile(FILE *f) { reinterpret_cast<LIBC_NAMESPACE::File *>(f)->unlock(); } -LIBC_INLINE FileIOResult fwrite_unlocked(const void *ptr, size_t size, - size_t nmemb, FILE *f) { +LIBC_INLINE size_t fwrite_unlocked(const void *ptr, size_t size, size_t nmemb, + FILE *f) { return reinterpret_cast<LIBC_NAMESPACE::File *>(f)->write_unlocked( ptr, size * nmemb); } @@ -48,11 +47,9 @@ LIBC_INLINE void flockfile(::FILE *f) { ::flockfile(f); } LIBC_INLINE void funlockfile(::FILE *f) { ::funlockfile(f); } -LIBC_INLINE FileIOResult fwrite_unlocked(const void *ptr, size_t size, - size_t nmemb, ::FILE *f) { - // Need to use system errno in this case, as system write will set this errno - // which we need to propagate back into our code. - return {::fwrite_unlocked(ptr, size, nmemb, f), errno}; +LIBC_INLINE size_t fwrite_unlocked(const void *ptr, size_t size, size_t nmemb, + ::FILE *f) { + return ::fwrite_unlocked(ptr, size, nmemb, f); } #endif // LIBC_COPT_STDIO_USE_SYSTEM_FILE } // namespace internal @@ -63,38 +60,26 @@ LIBC_INLINE int file_write_hook(cpp::string_view new_str, void *fp) { ::FILE *target_file = reinterpret_cast<::FILE *>(fp); // Write new_str to the target file. The logic preventing a zero-length write // is in the writer, so we don't check here. - auto write_result = internal::fwrite_unlocked(new_str.data(), sizeof(char), - new_str.size(), target_file); - // Propagate actual system error in FileIOResult. - if (write_result.has_error()) - return -write_result.error; - - // In case short write occured or error was not set on FileIOResult for some - // reason. 
- if (write_result.value != new_str.size() || - internal::ferror_unlocked(target_file)) + size_t written = internal::fwrite_unlocked(new_str.data(), sizeof(char), + new_str.size(), target_file); + if (written != new_str.size() || internal::ferror_unlocked(target_file)) return FILE_WRITE_ERROR; - return WRITE_OK; } -LIBC_INLINE ErrorOr<size_t> vfprintf_internal(::FILE *__restrict stream, - const char *__restrict format, - internal::ArgList &args) { +LIBC_INLINE int vfprintf_internal(::FILE *__restrict stream, + const char *__restrict format, + internal::ArgList &args) { constexpr size_t BUFF_SIZE = 1024; char buffer[BUFF_SIZE]; printf_core::WriteBuffer<Mode<WriteMode::FLUSH_TO_STREAM>::value> wb( buffer, BUFF_SIZE, &file_write_hook, reinterpret_cast<void *>(stream)); Writer writer(wb); internal::flockfile(stream); - auto retval = printf_main(&writer, format, args); - if (!retval.has_value()) { - internal::funlockfile(stream); - return retval; - } + int retval = printf_main(&writer, format, args); int flushval = wb.overflow_write(""); if (flushval != WRITE_OK) - retval = Error(-flushval); + retval = flushval; internal::funlockfile(stream); return retval; } diff --git a/libc/src/stdio/printf_core/write_int_converter.h b/libc/src/stdio/printf_core/write_int_converter.h index 04b2bef05bc7b..efcff278bd284 100644 --- a/libc/src/stdio/printf_core/write_int_converter.h +++ b/libc/src/stdio/printf_core/write_int_converter.h @@ -29,11 +29,11 @@ LIBC_INLINE int convert_write_int(Writer<write_mode> *writer, return NULLPTR_WRITE_ERROR; #endif // LIBC_COPT_PRINTF_NO_NULLPTR_CHECKS - size_t written = writer->get_chars_written(); + int written = writer->get_chars_written(); switch (to_conv.length_modifier) { case LengthModifier::none: - *reinterpret_cast<int *>(to_conv.conv_val_ptr) = static_cast<int>(written); + *reinterpret_cast<int *>(to_conv.conv_val_ptr) = written; break; case LengthModifier::l: *reinterpret_cast<long *>(to_conv.conv_val_ptr) = written; diff --git a/libc/src/stdio/printf_core/writer.h b/libc/src/stdio/printf_core/writer.h index 9de108ece510f..1d4734a51b9b8 100644 --- a/libc/src/stdio/printf_core/writer.h +++ b/libc/src/stdio/printf_core/writer.h @@ -127,7 +127,7 @@ template <WriteMode write_mode> struct WriteBuffer { template <WriteMode write_mode> class Writer final { WriteBuffer<write_mode> &wb; - size_t chars_written = 0; + int chars_written = 0; LIBC_INLINE int pad(char new_char, size_t length) { // First, fill as much of the buffer as possible with the padding char. @@ -161,7 +161,7 @@ template <WriteMode write_mode> class Writer final { // Takes a string, copies it into the buffer if there is space, else passes it // to the overflow mechanism to be handled separately. LIBC_INLINE int write(cpp::string_view new_string) { - chars_written += new_string.size(); + chars_written += static_cast<int>(new_string.size()); if (LIBC_LIKELY(wb.buff_cur + new_string.size() <= wb.buff_len)) { inline_memcpy(wb.buff + wb.buff_cur, new_string.data(), new_string.size()); @@ -175,7 +175,7 @@ template <WriteMode write_mode> class Writer final { // if there is space, else calls pad which will loop and call the overflow // mechanism on a secondary buffer. 
LIBC_INLINE int write(char new_char, size_t length) { - chars_written += length; + chars_written += static_cast<int>(length); if (LIBC_LIKELY(wb.buff_cur + length <= wb.buff_len)) { inline_memset(wb.buff + wb.buff_cur, static_cast<unsigned char>(new_char), @@ -199,7 +199,7 @@ template <WriteMode write_mode> class Writer final { return wb.overflow_write(char_string_view); } - LIBC_INLINE size_t get_chars_written() { return chars_written; } + LIBC_INLINE int get_chars_written() { return chars_written; } }; // Class-template auto deduction helpers. diff --git a/libc/src/stdio/snprintf.cpp b/libc/src/stdio/snprintf.cpp index d95195f6f485f..c8940862f711f 100644 --- a/libc/src/stdio/snprintf.cpp +++ b/libc/src/stdio/snprintf.cpp @@ -8,12 +8,8 @@ #include "src/stdio/snprintf.h" -#include "src/__support/CPP/limits.h" #include "src/__support/arg_list.h" -#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/stdio/printf_core/core_structs.h" -#include "src/stdio/printf_core/error_mapper.h" #include "src/stdio/printf_core/printf_main.h" #include "src/stdio/printf_core/writer.h" @@ -36,21 +32,10 @@ LLVM_LIBC_FUNCTION(int, snprintf, wb(buffer, (buffsz > 0 ? buffsz - 1 : 0)); printf_core::Writer writer(wb); - auto ret_val = printf_core::printf_main(&writer, format, args); - if (!ret_val.has_value()) { - libc_errno = printf_core::internal_error_to_errno(ret_val.error()); - return -1; - } + int ret_val = printf_core::printf_main(&writer, format, args); if (buffsz > 0) // if the buffsz is 0 the buffer may be a null pointer. wb.buff[wb.buff_cur] = '\0'; - - if (ret_val.value() > static_cast<size_t>(cpp::numeric_limits<int>::max())) { - libc_errno = - printf_core::internal_error_to_errno(-printf_core::OVERFLOW_ERROR); - return -1; - } - - return static_cast<int>(ret_val.value()); + return ret_val; } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdio/sprintf.cpp b/libc/src/stdio/sprintf.cpp index 2a9b6ea7c5e50..7be97d3591aaf 100644 --- a/libc/src/stdio/sprintf.cpp +++ b/libc/src/stdio/sprintf.cpp @@ -10,10 +10,7 @@ #include "src/__support/CPP/limits.h" #include "src/__support/arg_list.h" -#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/stdio/printf_core/core_structs.h" -#include "src/stdio/printf_core/error_mapper.h" #include "src/stdio/printf_core/printf_main.h" #include "src/stdio/printf_core/writer.h" @@ -36,20 +33,9 @@ LLVM_LIBC_FUNCTION(int, sprintf, wb(buffer, cpp::numeric_limits<size_t>::max()); printf_core::Writer writer(wb); - auto ret_val = printf_core::printf_main(&writer, format, args); - if (!ret_val.has_value()) { - libc_errno = printf_core::internal_error_to_errno(ret_val.error()); - return -1; - } + int ret_val = printf_core::printf_main(&writer, format, args); wb.buff[wb.buff_cur] = '\0'; - - if (ret_val.value() > static_cast<size_t>(cpp::numeric_limits<int>::max())) { - libc_errno = - printf_core::internal_error_to_errno(-printf_core::OVERFLOW_ERROR); - return -1; - } - - return static_cast<int>(ret_val.value()); + return ret_val; } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdio/vasprintf.cpp b/libc/src/stdio/vasprintf.cpp index bd77cd8864312..4a44d4a0f8842 100644 --- a/libc/src/stdio/vasprintf.cpp +++ b/libc/src/stdio/vasprintf.cpp @@ -7,11 +7,7 @@ //===----------------------------------------------------------------------===// #include "src/stdio/vasprintf.h" -#include "src/__support/CPP/limits.h" #include "src/__support/arg_list.h" -#include "src/__support/libc_errno.h" 
-#include "src/stdio/printf_core/core_structs.h" -#include "src/stdio/printf_core/error_mapper.h" #include "src/stdio/printf_core/vasprintf_internal.h" namespace LIBC_NAMESPACE_DECL { @@ -22,17 +18,7 @@ LLVM_LIBC_FUNCTION(int, vasprintf, internal::ArgList args(vlist); // This holder class allows for easier copying // and pointer semantics, as well as handling // destruction automatically. - auto ret_val = printf_core::vasprintf_internal(ret, format, args); - if (!ret_val.has_value()) { - libc_errno = printf_core::internal_error_to_errno(ret_val.error()); - return -1; - } - if (ret_val.value() > static_cast<size_t>(cpp::numeric_limits<int>::max())) { - libc_errno = - printf_core::internal_error_to_errno(-printf_core::OVERFLOW_ERROR); - return -1; - } - return static_cast<int>(ret_val.value()); + return printf_core::vasprintf_internal(ret, format, args); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdio/vsnprintf.cpp b/libc/src/stdio/vsnprintf.cpp index 5d936360c0857..b07a2499a0dd3 100644 --- a/libc/src/stdio/vsnprintf.cpp +++ b/libc/src/stdio/vsnprintf.cpp @@ -8,12 +8,8 @@ #include "src/stdio/vsnprintf.h" -#include "src/__support/CPP/limits.h" #include "src/__support/arg_list.h" -#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/stdio/printf_core/core_structs.h" -#include "src/stdio/printf_core/error_mapper.h" #include "src/stdio/printf_core/printf_main.h" #include "src/stdio/printf_core/writer.h" @@ -33,21 +29,10 @@ LLVM_LIBC_FUNCTION(int, vsnprintf, wb(buffer, (buffsz > 0 ? buffsz - 1 : 0)); printf_core::Writer writer(wb); - auto ret_val = printf_core::printf_main(&writer, format, args); - if (!ret_val.has_value()) { - libc_errno = printf_core::internal_error_to_errno(ret_val.error()); - return -1; - } + int ret_val = printf_core::printf_main(&writer, format, args); if (buffsz > 0) // if the buffsz is 0 the buffer may be a null pointer. 
wb.buff[wb.buff_cur] = '\0'; - - if (ret_val.value() > static_cast<size_t>(cpp::numeric_limits<int>::max())) { - libc_errno = - printf_core::internal_error_to_errno(-printf_core::OVERFLOW_ERROR); - return -1; - } - - return static_cast<int>(ret_val.value()); + return ret_val; } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdio/vsprintf.cpp b/libc/src/stdio/vsprintf.cpp index f9cf8118534f6..26d497be42125 100644 --- a/libc/src/stdio/vsprintf.cpp +++ b/libc/src/stdio/vsprintf.cpp @@ -10,10 +10,7 @@ #include "src/__support/CPP/limits.h" #include "src/__support/arg_list.h" -#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/stdio/printf_core/core_structs.h" -#include "src/stdio/printf_core/error_mapper.h" #include "src/stdio/printf_core/printf_main.h" #include "src/stdio/printf_core/writer.h" @@ -33,19 +30,9 @@ LLVM_LIBC_FUNCTION(int, vsprintf, wb(buffer, cpp::numeric_limits<size_t>::max()); printf_core::Writer writer(wb); - auto ret_val = printf_core::printf_main(&writer, format, args); - if (!ret_val.has_value()) { - libc_errno = printf_core::internal_error_to_errno(ret_val.error()); - return -1; - } + int ret_val = printf_core::printf_main(&writer, format, args); wb.buff[wb.buff_cur] = '\0'; - - if (ret_val.value() > static_cast<size_t>(cpp::numeric_limits<int>::max())) { - libc_errno = - printf_core::internal_error_to_errno(-printf_core::OVERFLOW_ERROR); - return -1; - } - return static_cast<int>(ret_val.value()); + return ret_val; } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdlib/CMakeLists.txt b/libc/src/stdlib/CMakeLists.txt index 1ccdcc8bec148..c464f82dcbda7 100644 --- a/libc/src/stdlib/CMakeLists.txt +++ b/libc/src/stdlib/CMakeLists.txt @@ -73,8 +73,6 @@ add_entrypoint_object( strfromf.h DEPENDS .str_from_util - libc.src.__support.CPP.limits - libc.src.stdio.printf_core.error_mapper ) add_entrypoint_object( @@ -85,8 +83,6 @@ add_entrypoint_object( strfromd.h DEPENDS .str_from_util - libc.src.__support.CPP.limits - libc.src.stdio.printf_core.error_mapper ) add_entrypoint_object( @@ -97,8 +93,6 @@ add_entrypoint_object( strfroml.h DEPENDS .str_from_util - libc.src.__support.CPP.limits - libc.src.stdio.printf_core.error_mapper ) add_header_library( diff --git a/libc/src/stdlib/strfromd.cpp b/libc/src/stdlib/strfromd.cpp index 71e257f08645b..f51e6d4c7f1df 100644 --- a/libc/src/stdlib/strfromd.cpp +++ b/libc/src/stdlib/strfromd.cpp @@ -7,10 +7,7 @@ //===----------------------------------------------------------------------===// #include "src/stdlib/strfromd.h" -#include "src/__support/CPP/limits.h" #include "src/__support/macros/config.h" -#include "src/stdio/printf_core/core_structs.h" -#include "src/stdio/printf_core/error_mapper.h" #include "src/stdlib/str_from_util.h" namespace LIBC_NAMESPACE_DECL { @@ -39,13 +36,7 @@ LLVM_LIBC_FUNCTION(int, strfromd, if (n > 0) wb.buff[wb.buff_cur] = '\0'; - if (writer.get_chars_written() > - static_cast<size_t>(cpp::numeric_limits<int>::max())) { - libc_errno = - printf_core::internal_error_to_errno(-printf_core::OVERFLOW_ERROR); - return -1; - } - return static_cast<int>(writer.get_chars_written()); + return writer.get_chars_written(); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdlib/strfromf.cpp b/libc/src/stdlib/strfromf.cpp index 65f242b200f18..14dbfdb25bab6 100644 --- a/libc/src/stdlib/strfromf.cpp +++ b/libc/src/stdlib/strfromf.cpp @@ -7,10 +7,7 @@ //===----------------------------------------------------------------------===// #include 
"src/stdlib/strfromf.h" -#include "src/__support/CPP/limits.h" #include "src/__support/macros/config.h" -#include "src/stdio/printf_core/core_structs.h" -#include "src/stdio/printf_core/error_mapper.h" #include "src/stdlib/str_from_util.h" namespace LIBC_NAMESPACE_DECL { @@ -39,13 +36,7 @@ LLVM_LIBC_FUNCTION(int, strfromf, if (n > 0) wb.buff[wb.buff_cur] = '\0'; - if (writer.get_chars_written() > - static_cast<size_t>(cpp::numeric_limits<int>::max())) { - libc_errno = - printf_core::internal_error_to_errno(-printf_core::OVERFLOW_ERROR); - return -1; - } - return static_cast<int>(writer.get_chars_written()); + return writer.get_chars_written(); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdlib/strfroml.cpp b/libc/src/stdlib/strfroml.cpp index 31668a0323c93..12f22a8a2fb65 100644 --- a/libc/src/stdlib/strfroml.cpp +++ b/libc/src/stdlib/strfroml.cpp @@ -7,10 +7,7 @@ //===----------------------------------------------------------------------===// #include "src/stdlib/strfroml.h" -#include "src/__support/CPP/limits.h" #include "src/__support/macros/config.h" -#include "src/stdio/printf_core/core_structs.h" -#include "src/stdio/printf_core/error_mapper.h" #include "src/stdlib/str_from_util.h" namespace LIBC_NAMESPACE_DECL { @@ -44,13 +41,7 @@ LLVM_LIBC_FUNCTION(int, strfroml, if (n > 0) wb.buff[wb.buff_cur] = '\0'; - if (writer.get_chars_written() > - static_cast<size_t>(cpp::numeric_limits<int>::max())) { - libc_errno = - printf_core::internal_error_to_errno(-printf_core::OVERFLOW_ERROR); - return -1; - } - return static_cast<int>(writer.get_chars_written()); + return writer.get_chars_written(); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/time/strftime_core/strftime_main.h b/libc/src/time/strftime_core/strftime_main.h index 2b136d83234cd..c7e590627094a 100644 --- a/libc/src/time/strftime_core/strftime_main.h +++ b/libc/src/time/strftime_core/strftime_main.h @@ -36,8 +36,7 @@ int strftime_main(printf_core::Writer<write_mode> *writer, return result; } - // TODO: Use ErrorOr<size_t> - return static_cast<int>(writer->get_chars_written()); + return writer->get_chars_written(); } } // namespace strftime_core diff --git a/libc/test/src/stdio/CMakeLists.txt b/libc/test/src/stdio/CMakeLists.txt index d71f1dff11943..eec108bc12ca5 100644 --- a/libc/test/src/stdio/CMakeLists.txt +++ b/libc/test/src/stdio/CMakeLists.txt @@ -186,8 +186,6 @@ add_libc_test( fprintf_test.cpp DEPENDS libc.src.stdio.fprintf - libc.test.UnitTest.ErrnoCheckingTest - libc.test.UnitTest.ErrnoSetterMatcher ${fprintf_test_deps} COMPILE_OPTIONS ${use_system_file} diff --git a/libc/test/src/stdio/fprintf_test.cpp b/libc/test/src/stdio/fprintf_test.cpp index 2cea7f554ce38..6799323cc6ad9 100644 --- a/libc/test/src/stdio/fprintf_test.cpp +++ b/libc/test/src/stdio/fprintf_test.cpp @@ -15,9 +15,6 @@ #include "src/stdio/fprintf.h" -#include "src/__support/CPP/limits.h" -#include "test/UnitTest/ErrnoCheckingTest.h" -#include "test/UnitTest/ErrnoSetterMatcher.h" #include "test/UnitTest/Test.h" namespace printf_test { @@ -34,8 +31,6 @@ using ::fread; #endif // LIBC_COPT_STDIO_USE_SYSTEM_FILE } // namespace printf_test -using LlvmLibcFPrintfTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest; - TEST(LlvmLibcFPrintfTest, WriteToFile) { const char *FILENAME = APPEND_LIBC_TEST("fprintf_output.test"); auto FILE_PATH = libc_make_test_file_path(FILENAME); @@ -83,25 +78,6 @@ TEST(LlvmLibcFPrintfTest, WriteToFile) { written = LIBC_NAMESPACE::fprintf(file, "Writing to a read only file should fail."); EXPECT_LT(written, 0); - 
ASSERT_ERRNO_FAILURE(); - - ASSERT_EQ(printf_test::fclose(file), 0); -} - -#if !defined(LIBC_COPT_PRINTF_NO_NULLPTR_CHECKS) && \ - !defined(LIBC_COPT_PRINTF_DISABLE_WRITE_INT) -TEST(LlvmLibcFPrintfTest, NullPtrCheck) { - const char *FILENAME = APPEND_LIBC_TEST("fprintf_nullptr.test"); - auto FILE_PATH = libc_make_test_file_path(FILENAME); - - ::FILE *file = printf_test::fopen(FILE_PATH, "w"); - ASSERT_FALSE(file == nullptr); - - int ret = - LIBC_NAMESPACE::fprintf(file, "hello %n", static_cast<int *>(nullptr)); - EXPECT_LT(ret, 0); - ASSERT_ERRNO_FAILURE(); ASSERT_EQ(printf_test::fclose(file), 0); } -#endif // LIBC_COPT_PRINTF_NO_NULLPTR_CHECKS diff --git a/libc/test/src/stdio/printf_core/converter_test.cpp b/libc/test/src/stdio/printf_core/converter_test.cpp index 2dae2a22c864c..bf088937e4104 100644 --- a/libc/test/src/stdio/printf_core/converter_test.cpp +++ b/libc/test/src/stdio/printf_core/converter_test.cpp @@ -38,7 +38,7 @@ TEST_F(LlvmLibcPrintfConverterTest, SimpleRawConversion) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ(str, "abc"); - ASSERT_EQ(writer.get_chars_written(), size_t{3}); + ASSERT_EQ(writer.get_chars_written(), 3); } TEST_F(LlvmLibcPrintfConverterTest, PercentConversion) { @@ -52,7 +52,7 @@ TEST_F(LlvmLibcPrintfConverterTest, PercentConversion) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ(str, "%"); - ASSERT_EQ(writer.get_chars_written(), size_t{1}); + ASSERT_EQ(writer.get_chars_written(), 1); } TEST_F(LlvmLibcPrintfConverterTest, CharConversionSimple) { @@ -70,7 +70,7 @@ TEST_F(LlvmLibcPrintfConverterTest, CharConversionSimple) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ(str, "D"); - ASSERT_EQ(writer.get_chars_written(), size_t{1}); + ASSERT_EQ(writer.get_chars_written(), 1); } TEST_F(LlvmLibcPrintfConverterTest, CharConversionRightJustified) { @@ -85,7 +85,7 @@ TEST_F(LlvmLibcPrintfConverterTest, CharConversionRightJustified) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ(str, " E"); - ASSERT_EQ(writer.get_chars_written(), size_t{4}); + ASSERT_EQ(writer.get_chars_written(), 4); } TEST_F(LlvmLibcPrintfConverterTest, CharConversionLeftJustified) { @@ -102,7 +102,7 @@ TEST_F(LlvmLibcPrintfConverterTest, CharConversionLeftJustified) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ(str, "F "); - ASSERT_EQ(writer.get_chars_written(), size_t{4}); + ASSERT_EQ(writer.get_chars_written(), 4); } TEST_F(LlvmLibcPrintfConverterTest, StringConversionSimple) { @@ -118,7 +118,7 @@ TEST_F(LlvmLibcPrintfConverterTest, StringConversionSimple) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ(str, "DEF"); - ASSERT_EQ(writer.get_chars_written(), size_t{3}); + ASSERT_EQ(writer.get_chars_written(), 3); } TEST_F(LlvmLibcPrintfConverterTest, StringConversionPrecisionHigh) { @@ -133,7 +133,7 @@ TEST_F(LlvmLibcPrintfConverterTest, StringConversionPrecisionHigh) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ(str, "456"); - ASSERT_EQ(writer.get_chars_written(), size_t{3}); + ASSERT_EQ(writer.get_chars_written(), 3); } TEST_F(LlvmLibcPrintfConverterTest, StringConversionPrecisionLow) { @@ -148,7 +148,7 @@ TEST_F(LlvmLibcPrintfConverterTest, StringConversionPrecisionLow) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ(str, "xy"); - ASSERT_EQ(writer.get_chars_written(), size_t{2}); + ASSERT_EQ(writer.get_chars_written(), 2); } TEST_F(LlvmLibcPrintfConverterTest, StringConversionRightJustified) { @@ -163,7 +163,7 @@ TEST_F(LlvmLibcPrintfConverterTest, StringConversionRightJustified) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ(str, " 789"); - ASSERT_EQ(writer.get_chars_written(), size_t{4}); + 
ASSERT_EQ(writer.get_chars_written(), 4); } TEST_F(LlvmLibcPrintfConverterTest, StringConversionLeftJustified) { @@ -180,7 +180,7 @@ TEST_F(LlvmLibcPrintfConverterTest, StringConversionLeftJustified) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ(str, "ghi "); - ASSERT_EQ(writer.get_chars_written(), size_t{4}); + ASSERT_EQ(writer.get_chars_written(), 4); } TEST_F(LlvmLibcPrintfConverterTest, IntConversionSimple) { @@ -194,7 +194,7 @@ TEST_F(LlvmLibcPrintfConverterTest, IntConversionSimple) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ(str, "12345"); - ASSERT_EQ(writer.get_chars_written(), size_t{5}); + ASSERT_EQ(writer.get_chars_written(), 5); } TEST_F(LlvmLibcPrintfConverterTest, HexConversion) { @@ -211,7 +211,7 @@ TEST_F(LlvmLibcPrintfConverterTest, HexConversion) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ(str, "0x00000000123456ab"); - ASSERT_EQ(writer.get_chars_written(), size_t{18}); + ASSERT_EQ(writer.get_chars_written(), 18); } TEST_F(LlvmLibcPrintfConverterTest, BinaryConversion) { @@ -225,7 +225,7 @@ TEST_F(LlvmLibcPrintfConverterTest, BinaryConversion) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ(str, "101010"); - ASSERT_EQ(writer.get_chars_written(), size_t{6}); + ASSERT_EQ(writer.get_chars_written(), 6); } TEST_F(LlvmLibcPrintfConverterTest, PointerConversion) { @@ -239,7 +239,7 @@ TEST_F(LlvmLibcPrintfConverterTest, PointerConversion) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ(str, "0x123456ab"); - ASSERT_EQ(writer.get_chars_written(), size_t{10}); + ASSERT_EQ(writer.get_chars_written(), 10); } TEST_F(LlvmLibcPrintfConverterTest, OctConversion) { @@ -253,5 +253,5 @@ TEST_F(LlvmLibcPrintfConverterTest, OctConversion) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ(str, "1234"); - ASSERT_EQ(writer.get_chars_written(), size_t{4}); + ASSERT_EQ(writer.get_chars_written(), 4); } diff --git a/libc/test/src/stdio/printf_core/writer_test.cpp b/libc/test/src/stdio/printf_core/writer_test.cpp index d263cf55aa474..d036341be7981 100644 --- a/libc/test/src/stdio/printf_core/writer_test.cpp +++ b/libc/test/src/stdio/printf_core/writer_test.cpp @@ -39,7 +39,7 @@ TEST(LlvmLibcPrintfWriterTest, Write) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ("abc", str); - ASSERT_EQ(writer.get_chars_written(), size_t{3}); + ASSERT_EQ(writer.get_chars_written(), 3); } TEST(LlvmLibcPrintfWriterTest, WriteMultipleTimes) { @@ -53,7 +53,7 @@ TEST(LlvmLibcPrintfWriterTest, WriteMultipleTimes) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ("abcDEF123", str); - ASSERT_EQ(writer.get_chars_written(), size_t{9}); + ASSERT_EQ(writer.get_chars_written(), 9); } TEST(LlvmLibcPrintfWriterTest, WriteChars) { @@ -66,7 +66,7 @@ TEST(LlvmLibcPrintfWriterTest, WriteChars) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ("aaa", str); - ASSERT_EQ(writer.get_chars_written(), size_t{3}); + ASSERT_EQ(writer.get_chars_written(), 3); } TEST(LlvmLibcPrintfWriterTest, WriteCharsMultipleTimes) { @@ -80,7 +80,7 @@ TEST(LlvmLibcPrintfWriterTest, WriteCharsMultipleTimes) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ("aaaDDD111", str); - ASSERT_EQ(writer.get_chars_written(), size_t{9}); + ASSERT_EQ(writer.get_chars_written(), 9); } TEST(LlvmLibcPrintfWriterTest, WriteManyChars) { @@ -102,7 +102,7 @@ TEST(LlvmLibcPrintfWriterTest, WriteManyChars) { "ZZZZZZZZZZ" "ZZZZZZZZZ", str); - ASSERT_EQ(writer.get_chars_written(), size_t{99}); + ASSERT_EQ(writer.get_chars_written(), 99); } TEST(LlvmLibcPrintfWriterTest, MixedWrites) { @@ -117,7 +117,7 @@ TEST(LlvmLibcPrintfWriterTest, MixedWrites) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ("aaaDEF111456", str); - 
ASSERT_EQ(writer.get_chars_written(), size_t{12}); + ASSERT_EQ(writer.get_chars_written(), 12); } TEST(LlvmLibcPrintfWriterTest, WriteWithMaxLength) { @@ -129,7 +129,7 @@ TEST(LlvmLibcPrintfWriterTest, WriteWithMaxLength) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ("abcDEF1234", str); - ASSERT_EQ(writer.get_chars_written(), size_t{12}); + ASSERT_EQ(writer.get_chars_written(), 12); } TEST(LlvmLibcPrintfWriterTest, WriteCharsWithMaxLength) { @@ -141,7 +141,7 @@ TEST(LlvmLibcPrintfWriterTest, WriteCharsWithMaxLength) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ("1111111111", str); - ASSERT_EQ(writer.get_chars_written(), size_t{15}); + ASSERT_EQ(writer.get_chars_written(), 15); } TEST(LlvmLibcPrintfWriterTest, MixedWriteWithMaxLength) { @@ -157,7 +157,7 @@ TEST(LlvmLibcPrintfWriterTest, MixedWriteWithMaxLength) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ("aaaDEF1114", str); - ASSERT_EQ(writer.get_chars_written(), size_t{12}); + ASSERT_EQ(writer.get_chars_written(), 12); } TEST(LlvmLibcPrintfWriterTest, StringWithMaxLengthOne) { @@ -175,7 +175,7 @@ TEST(LlvmLibcPrintfWriterTest, StringWithMaxLengthOne) { wb.buff[wb.buff_cur] = '\0'; ASSERT_STREQ("", str); - ASSERT_EQ(writer.get_chars_written(), size_t{12}); + ASSERT_EQ(writer.get_chars_written(), 12); } TEST(LlvmLibcPrintfWriterTest, NullStringWithZeroMaxLength) { @@ -187,7 +187,7 @@ TEST(LlvmLibcPrintfWriterTest, NullStringWithZeroMaxLength) { writer.write('1', 3); writer.write({"456", 3}); - ASSERT_EQ(writer.get_chars_written(), size_t{12}); + ASSERT_EQ(writer.get_chars_written(), 12); } struct OutBuff { @@ -226,7 +226,7 @@ TEST(LlvmLibcPrintfWriterTest, WriteWithMaxLengthWithCallback) { str[out_buff.cur_pos] = '\0'; ASSERT_STREQ("abcDEF123456", str); - ASSERT_EQ(writer.get_chars_written(), size_t{12}); + ASSERT_EQ(writer.get_chars_written(), 12); } TEST(LlvmLibcPrintfWriterTest, WriteCharsWithMaxLengthWithCallback) { @@ -246,7 +246,7 @@ TEST(LlvmLibcPrintfWriterTest, WriteCharsWithMaxLengthWithCallback) { str[out_buff.cur_pos] = '\0'; ASSERT_STREQ("111111111111111", str); - ASSERT_EQ(writer.get_chars_written(), size_t{15}); + ASSERT_EQ(writer.get_chars_written(), 15); } TEST(LlvmLibcPrintfWriterTest, MixedWriteWithMaxLengthWithCallback) { @@ -269,7 +269,7 @@ TEST(LlvmLibcPrintfWriterTest, MixedWriteWithMaxLengthWithCallback) { str[out_buff.cur_pos] = '\0'; ASSERT_STREQ("aaaDEF111456", str); - ASSERT_EQ(writer.get_chars_written(), size_t{12}); + ASSERT_EQ(writer.get_chars_written(), 12); } TEST(LlvmLibcPrintfWriterTest, ZeroLengthBufferWithCallback) { @@ -292,7 +292,7 @@ TEST(LlvmLibcPrintfWriterTest, ZeroLengthBufferWithCallback) { str[out_buff.cur_pos] = '\0'; ASSERT_STREQ("aaaDEF111456", str); - ASSERT_EQ(writer.get_chars_written(), size_t{12}); + ASSERT_EQ(writer.get_chars_written(), 12); } TEST(LlvmLibcPrintfWriterTest, NullStringWithZeroMaxLengthWithCallback) { @@ -312,7 +312,7 @@ TEST(LlvmLibcPrintfWriterTest, NullStringWithZeroMaxLengthWithCallback) { wb.overflow_write(""); str[out_buff.cur_pos] = '\0'; - ASSERT_EQ(writer.get_chars_written(), size_t{12}); + ASSERT_EQ(writer.get_chars_written(), 12); ASSERT_STREQ("aaaDEF111456", str); } diff --git a/libc/test/src/stdio/snprintf_test.cpp b/libc/test/src/stdio/snprintf_test.cpp index 95507e0885dbf..baaa664cdc9ee 100644 --- a/libc/test/src/stdio/snprintf_test.cpp +++ b/libc/test/src/stdio/snprintf_test.cpp @@ -8,12 +8,8 @@ #include "src/stdio/snprintf.h" -#include "test/UnitTest/ErrnoCheckingTest.h" -#include "test/UnitTest/ErrnoSetterMatcher.h" #include "test/UnitTest/Test.h" -using 
LlvmLibcSNPrintfTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest; - // The sprintf test cases cover testing the shared printf functionality, so // these tests will focus on snprintf exclusive features. @@ -63,14 +59,3 @@ TEST(LlvmLibcSNPrintfTest, NoCutOff) { EXPECT_EQ(written, 10); ASSERT_STREQ(buff, "1234567890"); } - -TEST(LlvmLibcSNPrintfTest, CharsWrittenOverflow) { - char buff[0]; - - // Trigger an overflow in the return value of snprintf by writing more than - // INT_MAX bytes. - int int_max = LIBC_NAMESPACE::cpp::numeric_limits<int>::max(); - int written = LIBC_NAMESPACE::snprintf(buff, 0, "%*stest", int_max, ""); - EXPECT_LT(written, 0); - ASSERT_ERRNO_FAILURE(); -} diff --git a/libc/test/src/stdio/vfprintf_test.cpp b/libc/test/src/stdio/vfprintf_test.cpp index 9b5f09db8fd41..f50565a0f68ca 100644 --- a/libc/test/src/stdio/vfprintf_test.cpp +++ b/libc/test/src/stdio/vfprintf_test.cpp @@ -19,8 +19,6 @@ #include "src/stdio/vfprintf.h" -#include "test/UnitTest/ErrnoCheckingTest.h" -#include "test/UnitTest/ErrnoSetterMatcher.h" #include "test/UnitTest/Test.h" namespace printf_test { @@ -46,8 +44,6 @@ int call_vfprintf(::FILE *__restrict stream, const char *__restrict format, return ret; } -using LlvmLibcVFPrintfTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest; - TEST(LlvmLibcVFPrintfTest, WriteToFile) { const char *FILENAME = APPEND_LIBC_TEST("vfprintf_output.test"); auto FILE_PATH = libc_make_test_file_path(FILENAME); @@ -94,7 +90,6 @@ TEST(LlvmLibcVFPrintfTest, WriteToFile) { written = call_vfprintf(file, "Writing to a read only file should fail."); EXPECT_LT(written, 0); - ASSERT_ERRNO_EQ(EBADF); ASSERT_EQ(printf_test::fclose(file), 0); } diff --git a/libc/test/src/stdlib/StrfromTest.h b/libc/test/src/stdlib/StrfromTest.h index fd2e0f120e90e..e82c94499aa11 100644 --- a/libc/test/src/stdlib/StrfromTest.h +++ b/libc/test/src/stdlib/StrfromTest.h @@ -8,8 +8,6 @@ #include "src/__support/CPP/type_traits.h" #include "src/__support/FPUtil/FPBits.h" -#include "test/UnitTest/ErrnoCheckingTest.h" -#include "test/UnitTest/ErrnoSetterMatcher.h" #include "test/UnitTest/Test.h" #define ASSERT_STREQ_LEN(actual_written, actual_str, expected_str) \ @@ -17,7 +15,7 @@ EXPECT_STREQ(actual_str, expected_str); template <typename InputT> -class StrfromTest : public LIBC_NAMESPACE::testing::ErrnoCheckingTest { +class StrfromTest : public LIBC_NAMESPACE::testing::Test { static constexpr bool is_single_prec = LIBC_NAMESPACE::cpp::is_same<InputT, float>::value; @@ -483,16 +481,6 @@ class StrfromTest : public LIBC_NAMESPACE::testing::ErrnoCheckingTest { written = func(buff, 10, "%A", -ld_nan); ASSERT_STREQ_LEN(written, buff, "-NAN"); } - - void charsWrittenOverflow(FunctionT func) { - char buff[100]; - // Trigger an overflow in the return value of strfrom by writing more than - // INT_MAX bytes. 
- int result = func(buff, sizeof(buff), "%.2147483647f", 1.0f); - - EXPECT_LT(result, 0); - ASSERT_ERRNO_FAILURE(); - } }; #define STRFROM_TEST(InputType, name, func) \ @@ -513,7 +501,4 @@ class StrfromTest : public LIBC_NAMESPACE::testing::ErrnoCheckingTest { TEST_F(LlvmLibc##name##Test, InsufficientBufferSize) { \ insufficentBufsize(func); \ } \ - TEST_F(LlvmLibc##name##Test, InfAndNanValues) { infNanValues(func); } \ - TEST_F(LlvmLibc##name##Test, CharsWrittenOverflow) { \ - charsWrittenOverflow(func); \ - } + TEST_F(LlvmLibc##name##Test, InfAndNanValues) { infNanValues(func); } From cdf52a1325e6329cef267ca93886481ddc5d554d Mon Sep 17 00:00:00 2001 From: Thurston Dang <thurston@google.com> Date: Wed, 5 Nov 2025 03:52:25 +0000 Subject: [PATCH 274/313] [msan][NFCI] Generalize handleVectorPmaddIntrinsic() (#166282) This generalizes `handleVectorPmaddIntrinsic()`: - potentially handle floating-point type intrinsics (e.g., `llvm.x86.avx512bf16.dpbf16ps.512`). This usage is not enabled yet. - "multiplication with an initialized zero guarantees that the corresponding output becomes initialized" is now gated by a parameter --- .../Instrumentation/MemorySanitizer.cpp | 77 ++++++++++++------- 1 file changed, 49 insertions(+), 28 deletions(-) diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp index 471c6ec633a57..ceeece41782f4 100644 --- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp @@ -3903,7 +3903,12 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { // adding/"accumulating" %s. "Accumulation" stores the result in one // of the source registers, but this accumulate vs. add distinction // is lost when dealing with LLVM intrinsics.) + // + // ZeroPurifies means that multiplying a known-zero with an uninitialized + // value results in an initialized value. This is applicable for integer + // multiplication, but not floating-point (counter-example: NaN). void handleVectorPmaddIntrinsic(IntrinsicInst &I, unsigned ReductionFactor, + bool ZeroPurifies, unsigned EltSizeInBits = 0) { IRBuilder<> IRB(&I); @@ -3945,7 +3950,8 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { assert(AccumulatorType == ReturnType); } - FixedVectorType *ImplicitReturnType = ReturnType; + FixedVectorType *ImplicitReturnType = + cast<FixedVectorType>(getShadowTy(ReturnType)); // Step 1: instrument multiplication of corresponding vector elements if (EltSizeInBits) { ImplicitReturnType = cast<FixedVectorType>( @@ -3964,30 +3970,40 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { ReturnType->getNumElements() * ReductionFactor); } - // Multiplying an *initialized* zero by an uninitialized element results in - // an initialized zero element. - // - // This is analogous to bitwise AND, where "AND" of 0 and a poisoned value - // results in an unpoisoned value. We can therefore adapt the visitAnd() - // instrumentation: - // OutShadow = (SaNonZero & SbNonZero) - // | (VaNonZero & SbNonZero) - // | (SaNonZero & VbNonZero) - // where non-zero is checked on a per-element basis (not per bit). 
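// A minimal worked example of the per-element rule above (illustrative
// lane values only, not part of the patch), for one pair of lanes with
// ZeroPurifies:
//   Va = <?, 2>   Vb = <0, 3>     (? = uninitialized)
//   Sa = <~0, 0>  Sb = <0, 0>     (all-ones shadow marks the ? lane)
//   Lane 0: SaNonZero=1, SbNonZero=0, VbNonZero=0
//           -> (1&0) | (VaNonZero&0) | (1&0) = 0, i.e. "? * 0" is treated
//              as initialized, matching the integer identity x * 0 == 0.
//   Lane 1: both shadows are zero, so the output lane stays initialized.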
- Value *SZero = Constant::getNullValue(Va->getType()); - Value *VZero = Constant::getNullValue(Sa->getType()); - Value *SaNonZero = IRB.CreateICmpNE(Sa, SZero); - Value *SbNonZero = IRB.CreateICmpNE(Sb, SZero); - Value *VaNonZero = IRB.CreateICmpNE(Va, VZero); - Value *VbNonZero = IRB.CreateICmpNE(Vb, VZero); - - Value *SaAndSbNonZero = IRB.CreateAnd(SaNonZero, SbNonZero); - Value *VaAndSbNonZero = IRB.CreateAnd(VaNonZero, SbNonZero); - Value *SaAndVbNonZero = IRB.CreateAnd(SaNonZero, VbNonZero); - // Each element of the vector is represented by a single bit (poisoned or // not) e.g., <8 x i1>. - Value *And = IRB.CreateOr({SaAndSbNonZero, VaAndSbNonZero, SaAndVbNonZero}); + Value *SaNonZero = IRB.CreateIsNotNull(Sa); + Value *SbNonZero = IRB.CreateIsNotNull(Sb); + Value *And; + if (ZeroPurifies) { + // Multiplying an *initialized* zero by an uninitialized element results + // in an initialized zero element. + // + // This is analogous to bitwise AND, where "AND" of 0 and a poisoned value + // results in an unpoisoned value. We can therefore adapt the visitAnd() + // instrumentation: + // OutShadow = (SaNonZero & SbNonZero) + // | (VaNonZero & SbNonZero) + // | (SaNonZero & VbNonZero) + // where non-zero is checked on a per-element basis (not per bit). + Value *VaInt = Va; + Value *VbInt = Vb; + if (!Va->getType()->isIntegerTy()) { + VaInt = CreateAppToShadowCast(IRB, Va); + VbInt = CreateAppToShadowCast(IRB, Vb); + } + + Value *VaNonZero = IRB.CreateIsNotNull(VaInt); + Value *VbNonZero = IRB.CreateIsNotNull(VbInt); + + Value *SaAndSbNonZero = IRB.CreateAnd(SaNonZero, SbNonZero); + Value *VaAndSbNonZero = IRB.CreateAnd(VaNonZero, SbNonZero); + Value *SaAndVbNonZero = IRB.CreateAnd(SaNonZero, VbNonZero); + + And = IRB.CreateOr({SaAndSbNonZero, VaAndSbNonZero, SaAndVbNonZero}); + } else { + And = IRB.CreateOr({SaNonZero, SbNonZero}); + } // Extend <8 x i1> to <8 x i16>. 
  // (The real pmadd intrinsic would have computed intermediate values of
@@ -5752,17 +5768,20 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
   case Intrinsic::x86_ssse3_pmadd_ub_sw_128:
   case Intrinsic::x86_avx2_pmadd_ub_sw:
   case Intrinsic::x86_avx512_pmaddubs_w_512:
-    handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/2);
+    handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/2,
+                               /*ZeroPurifies=*/true);
     break;

   // <1 x i64> @llvm.x86.ssse3.pmadd.ub.sw(<1 x i64>, <1 x i64>)
   case Intrinsic::x86_ssse3_pmadd_ub_sw:
-    handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/2, /*EltSize=*/8);
+    handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/2,
+                               /*ZeroPurifies=*/true, /*EltSizeInBits=*/8);
     break;

   // <1 x i64> @llvm.x86.mmx.pmadd.wd(<1 x i64>, <1 x i64>)
   case Intrinsic::x86_mmx_pmadd_wd:
-    handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/2, /*EltSize=*/16);
+    handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/2,
+                               /*ZeroPurifies=*/true, /*EltSizeInBits=*/16);
     break;

   // AVX Vector Neural Network Instructions: bytes
@@ -5848,7 +5867,8 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
   case Intrinsic::x86_avx2_vpdpbuuds_128:
   case Intrinsic::x86_avx2_vpdpbuuds_256:
   case Intrinsic::x86_avx10_vpdpbuuds_512:
-    handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/4, /*EltSize=*/8);
+    handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/4,
+                               /*ZeroPurifies=*/true, /*EltSizeInBits=*/8);
     break;

   // AVX Vector Neural Network Instructions: words
@@ -5901,7 +5921,8 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
   case Intrinsic::x86_avx512_vpdpwssds_128:
   case Intrinsic::x86_avx512_vpdpwssds_256:
   case Intrinsic::x86_avx512_vpdpwssds_512:
-    handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/2, /*EltSize=*/16);
+    handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/2,
+                               /*ZeroPurifies=*/true, /*EltSizeInBits=*/16);
     break;

   // TODO: Dot Product of BF16 Pairs Accumulated Into Packed Single

From b5f200129ad96f87bce11e5c8f0eafeb00b70b9c Mon Sep 17 00:00:00 2001
From: Vigneshwar Jayakumar <vigneshwar.jayakumar@amd.com>
Date: Tue, 4 Nov 2025 22:40:40 -0600
Subject: [PATCH 275/313] [CodeGen] Register-coalescer remat fix subreg liveness (#165662)

This fixes a bug in rematerialization where the liveness of the subreg
lane mask was incorrectly updated, causing a crash in the scheduler.
---
 llvm/lib/CodeGen/RegisterCoalescer.cpp        |  24 ++--
 .../AMDGPU/reg-coalescer-subreg-liveness.mir  | 131 ++++++++++++++++++
 .../SystemZ/regcoal_remat_empty_subrange.ll   |   4 +-
 3 files changed, 149 insertions(+), 10 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/reg-coalescer-subreg-liveness.mir

diff --git a/llvm/lib/CodeGen/RegisterCoalescer.cpp b/llvm/lib/CodeGen/RegisterCoalescer.cpp
index 38f6deb39ddf3..99f76936a180f 100644
--- a/llvm/lib/CodeGen/RegisterCoalescer.cpp
+++ b/llvm/lib/CodeGen/RegisterCoalescer.cpp
@@ -1600,6 +1600,22 @@ bool RegisterCoalescer::reMaterializeDef(const CoalescerPair &CP,
   SlotIndex DefIndex =
       CurrIdx.getRegSlot(NewMI.getOperand(0).isEarlyClobber());
   VNInfo::Allocator &Alloc = LIS->getVNInfoAllocator();
+
+  // Refine the subranges that are now defined by the remat.
+  // This will split existing subranges if necessary.
+  DstInt.refineSubRanges(
+      Alloc, DstMask,
+      [&DefIndex, &Alloc](LiveInterval::SubRange &SR) {
+        // We know that this lane is defined by this instruction,
+        // but at this point it might not be live because it was not defined
+        // by the original instruction. This happens when the
+        // rematerialization widens the defined register. Assign that lane a
+        // dead def so that the interferences are properly modeled.
+        if (!SR.liveAt(DefIndex))
+          SR.createDeadDef(DefIndex, Alloc);
+      },
+      *LIS->getSlotIndexes(), *TRI);
+
   for (LiveInterval::SubRange &SR : DstInt.subranges()) {
     if ((SR.LaneMask & DstMask).none()) {
       LLVM_DEBUG(dbgs()
@@ -1617,14 +1633,6 @@ bool RegisterCoalescer::reMaterializeDef(const CoalescerPair &CP,
       // updateRegDefUses. The original subrange def may have only undefed
       // some lanes.
       UpdatedSubRanges = true;
-    } else {
-      // We know that this lane is defined by this instruction,
-      // but at this point it might not be live because it was not defined
-      // by the original instruction. This happens when the
-      // rematerialization widens the defined register. Assign that lane a
-      // dead def so that the interferences are properly modeled.
-      if (!SR.liveAt(DefIndex))
-        SR.createDeadDef(DefIndex, Alloc);
     }
   }
   if (UpdatedSubRanges)
diff --git a/llvm/test/CodeGen/AMDGPU/reg-coalescer-subreg-liveness.mir b/llvm/test/CodeGen/AMDGPU/reg-coalescer-subreg-liveness.mir
new file mode 100644
index 0000000000000..381cb8c9d1047
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/reg-coalescer-subreg-liveness.mir
@@ -0,0 +1,131 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -run-pass=register-coalescer -verify-coalescing -o - %s | FileCheck %s
+
+# This test checks the fix for the "Bad machine code: Defining instruction does not modify register" failure caused by a corrupt lane mask.
+
+---
+name: reg_coalescer_subreg_liveness
+tracksRegLiveness: true
+liveins:
+body: |
+  ; CHECK-LABEL: name: reg_coalescer_subreg_liveness
+  ; CHECK: bb.0:
+  ; CHECK-NEXT: successors: %bb.1(0x80000000)
+  ; CHECK-NEXT: liveins: $sgpr4_sgpr5
+  ; CHECK-NEXT: {{ $}}
+  ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5
+  ; CHECK-NEXT: undef [[S_LOAD_DWORD_IMM:%[0-9]+]].sub1:sgpr_128 = S_LOAD_DWORD_IMM [[COPY]], 0, 0 :: (dereferenceable invariant load (s32), align 16, addrspace 4)
+  ; CHECK-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub0:sgpr_128 = S_MOV_B32 1
+  ; CHECK-NEXT: undef [[S_MOV_B32_1:%[0-9]+]].sub0:sgpr_256 = S_MOV_B32 0
+  ; CHECK-NEXT: TENSOR_LOAD_TO_LDS_D2 [[S_MOV_B32_]], [[S_MOV_B32_1]], 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt
+  ; CHECK-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]].sub0:sgpr_128 = S_MOV_B32 1
+  ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]].sub1:sgpr_128 = COPY [[S_MOV_B32_]].sub0
+  ; CHECK-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]].sub2:sgpr_128 = S_MOV_B32 0
+  ; CHECK-NEXT: {{ $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; CHECK-NEXT: {{ $}}
+  ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]].sub2:sgpr_128 = COPY [[S_MOV_B32_]].sub0
+  ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]].sub3:sgpr_128 = COPY [[S_MOV_B32_]].sub0
+  ; CHECK-NEXT: TENSOR_LOAD_TO_LDS_D2 [[S_MOV_B32_]], [[S_MOV_B32_1]], 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt
+  ; CHECK-NEXT: TENSOR_LOAD_TO_LDS_D2 [[S_LOAD_DWORD_IMM]], [[S_MOV_B32_1]], 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt
+  ; CHECK-NEXT: $vcc_lo = COPY $exec_lo
+  ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]].sub1:sgpr_128 = S_MOV_B32 0
+  ; CHECK-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]].sub2:sgpr_128 = S_MOV_B32 1
+  ; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit killed $vcc_lo, implicit $vcc_lo, implicit $vcc_lo
+  ; CHECK-NEXT: S_BRANCH %bb.2
+  ; CHECK-NEXT: {{ $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT: S_ENDPGM 0
+  bb.0:
successors: %bb.1(0x80000000) + liveins: $sgpr4_sgpr5 + + %0:sgpr_64 = COPY killed $sgpr4_sgpr5 + %1:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM killed %0, 0, 0 :: (dereferenceable invariant load (s32), align 16, addrspace 4) + %2:sreg_32 = S_MOV_B32 1 + undef %3.sub0:sgpr_128 = COPY %2 + %4:sreg_32 = S_MOV_B32 0 + undef %5.sub0:sgpr_256 = COPY %4 + TENSOR_LOAD_TO_LDS_D2 %3, %5, 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt + %6:sgpr_128 = COPY killed %3 + %6.sub1:sgpr_128 = COPY killed %1 + %7:sreg_32 = COPY $exec_lo + %8:sreg_32 = COPY %2 + %9:sreg_32 = COPY %4 + + bb.1: + successors: %bb.1(0x40000000), %bb.2(0x40000000) + + %10:sreg_32 = COPY killed %8 + undef %11.sub0:sgpr_128 = COPY %2 + %11.sub1:sgpr_128 = COPY killed %10 + %11.sub2:sgpr_128 = COPY %2 + %11.sub3:sgpr_128 = COPY %2 + TENSOR_LOAD_TO_LDS_D2 killed %11, %5, 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt + %12:sreg_32 = COPY killed %9 + %13:sgpr_128 = COPY %6 + %13.sub2:sgpr_128 = COPY killed %12 + TENSOR_LOAD_TO_LDS_D2 killed %13, %5, 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt + $vcc_lo = COPY %7 + %8:sreg_32 = COPY %4 + %9:sreg_32 = COPY %2 + S_CBRANCH_VCCNZ %bb.1, implicit killed $vcc_lo, implicit $vcc_lo, implicit $vcc + S_BRANCH %bb.2 + + bb.2: + S_ENDPGM 0 +... +--- +name: reg_coalescer_subreg_liveness_2 +tracksRegLiveness: true +liveins: +body: | + ; CHECK-LABEL: name: reg_coalescer_subreg_liveness_2 + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $sgpr4_sgpr5 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: undef [[S_LOAD_DWORD_IMM:%[0-9]+]].sub2:sgpr_128 = S_LOAD_DWORD_IMM [[COPY]], 0, 0 :: (dereferenceable invariant load (s32), align 16, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]].sub1:sgpr_128 = S_LOAD_DWORD_IMM [[COPY]], 24, 0 :: (dereferenceable invariant load (s32), align 8, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]].sub0:sgpr_128 = S_MOV_B32 1 + ; CHECK-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub0:sgpr_256 = S_MOV_B32 0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.2(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: S_NOP 0, implicit [[S_LOAD_DWORD_IMM]], implicit [[S_MOV_B32_]] + ; CHECK-NEXT: S_BRANCH %bb.2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: S_ENDPGM 0 + bb.0: + successors: %bb.1(0x80000000) + liveins: $sgpr4_sgpr5 + + %0:sgpr_64 = COPY killed $sgpr4_sgpr5 + %1:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %0, 0, 0 :: (dereferenceable invariant load (s32), align 16, addrspace 4) + %2:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM killed %0, 24, 0 :: (dereferenceable invariant load (s32), align 8, addrspace 4) + %3:sreg_32 = S_MOV_B32 1 + undef %4.sub0:sgpr_128 = COPY %3 + %5:sgpr_128 = COPY %4 + %5.sub1:sgpr_128 = COPY killed %2 + %6:sgpr_128 = COPY %5 + %6.sub2:sgpr_128 = COPY killed %1 + %7:sreg_32 = S_MOV_B32 0 + undef %8.sub0:sgpr_256 = COPY %7 + %9:sreg_32 = COPY %3 + + bb.1: + successors: %bb.2(0x80000000) + + %10:sreg_32 = COPY killed %9 + undef %11.sub0:sgpr_128 = COPY %3 + %11.sub1:sgpr_128 = COPY killed %10 + S_NOP 0, implicit %5, implicit %8 + S_BRANCH %bb.2 + + bb.2: + S_ENDPGM 0 +... 
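A standalone toy sketch of the invariant the fix above restores (the type and function here are illustrative simplifications, not LLVM's API): every subrange whose lanes are covered by the widened rematerialized def, but which had no def of its own at that point, must be given a (possibly dead) def.

  #include <cstdint>
  #include <vector>

  // Toy model: a subrange tracks liveness for a lane mask. A remat may
  // write more lanes than the instruction it replaces, so any subrange
  // it newly covers needs a def there; otherwise the verifier sees a
  // lane whose defining instruction does not modify the register.
  struct ToySubRange {
    uint32_t LaneMask;   // which lanes this subrange tracks
    bool HasDefAtRemat;  // already defined/live at the remat point?
  };

  void refine_for_remat(std::vector<ToySubRange> &SubRanges,
                        uint32_t DefMask) {
    for (ToySubRange &SR : SubRanges)
      if ((SR.LaneMask & DefMask) != 0 && !SR.HasDefAtRemat)
        SR.HasDefAtRemat = true; // analogous to SR.createDeadDef(...)
  }

The actual fix additionally routes this through refineSubRanges() so that existing subranges are split along the new def's lane mask before the dead defs are added.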
diff --git a/llvm/test/CodeGen/SystemZ/regcoal_remat_empty_subrange.ll b/llvm/test/CodeGen/SystemZ/regcoal_remat_empty_subrange.ll
index 678d9a9073155..ff9b6a34c1d53 100644
--- a/llvm/test/CodeGen/SystemZ/regcoal_remat_empty_subrange.ll
+++ b/llvm/test/CodeGen/SystemZ/regcoal_remat_empty_subrange.ll
@@ -22,10 +22,10 @@ define void @main(i16 %in) {
 ; CHECK-NEXT:    locghile %r3, 1
 ; CHECK-NEXT:    o %r0, 0(%r1)
 ; CHECK-NEXT:    larl %r1, g_222
-; CHECK-NEXT:    lghi %r5, 0
 ; CHECK-NEXT:    dsgfr %r2, %r0
+; CHECK-NEXT:    lghi %r3, 0
 ; CHECK-NEXT:    stgrl %r2, g_39
-; CHECK-NEXT:    stc %r5, 19(%r1)
+; CHECK-NEXT:    stc %r3, 19(%r1)
 ; CHECK-NEXT:    br %r14
   %tmp = load i32, ptr @g_151, align 4
   %tmp3 = or i32 %tmp, 1

From 6111ff16df372fd78cf9dc8fa953573ea2446f82 Mon Sep 17 00:00:00 2001
From: Sudharsan Veeravalli <quic_svs@quicinc.com>
Date: Wed, 5 Nov 2025 10:22:11 +0530
Subject: [PATCH 276/313] [RISCV] Implement shouldFoldMaskToVariableShiftPair (#166159)

Folding a mask into a variable shift pair results in better code size as
long as the values are scalars that are <= XLen.

Similar to https://github.com/llvm/llvm-project/pull/158069
---
 llvm/lib/Target/RISCV/RISCVISelLowering.cpp   |   9 ++
 llvm/lib/Target/RISCV/RISCVISelLowering.h     |   2 +
 .../test/CodeGen/RISCV/mask-variable-shift.ll | 132 ++++++++++++++++++
 3 files changed, 143 insertions(+)
 create mode 100644 llvm/test/CodeGen/RISCV/mask-variable-shift.ll

diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index b8605629e2dfe..c3f100e3197b1 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -25309,3 +25309,12 @@ ArrayRef<MCPhysReg> RISCVTargetLowering::getRoundingControlRegisters() const {
   }
   return {};
 }
+
+bool RISCVTargetLowering::shouldFoldMaskToVariableShiftPair(SDValue Y) const {
+  EVT VT = Y.getValueType();
+
+  if (VT.isVector())
+    return false;
+
+  return VT.getSizeInBits() <= Subtarget.getXLen();
+}
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h
index 9e3e2a9443625..dd62a9cf6c9e2 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h
@@ -465,6 +465,8 @@ class RISCVTargetLowering : public TargetLowering {
 
   ArrayRef<MCPhysReg> getRoundingControlRegisters() const override;
 
+  bool shouldFoldMaskToVariableShiftPair(SDValue Y) const override;
+
   /// Match a mask which "spreads" the leading elements of a vector evenly
   /// across the result. Factor is the spread amount, and Index is the
   /// offset applied.
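For context, the equivalence this hook enables: an AND with a mask built from a variable shift can be rewritten as a shift pair, so no mask has to be materialized. A minimal sketch in plain C++ (the function name and the uint32_t scalar width are assumptions for illustration, not part of the patch):

  #include <cassert>
  #include <cstdint>

  // For 0 <= y < 32, clearing the low y bits with a shifted mask is the
  // same as a right shift followed by a left shift; the shift pair needs
  // no mask register, which is why the hook reports the fold as
  // profitable for scalars that fit in XLen.
  uint32_t clear_low_bits(uint32_t x, unsigned y) {
    uint32_t mask_form = x & (~0u << y);      // and with a variable mask
    uint32_t shift_pair_form = (x >> y) << y; // srl then sll
    assert(mask_form == shift_pair_form);
    return shift_pair_form;
  }

This is exactly the shape in the mask_pair test below, where `shl nsw i32 -1, %y` followed by `and` now lowers to srl/sll instead of materializing the mask.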
diff --git a/llvm/test/CodeGen/RISCV/mask-variable-shift.ll b/llvm/test/CodeGen/RISCV/mask-variable-shift.ll new file mode 100644 index 0000000000000..4e73cee30ef08 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/mask-variable-shift.ll @@ -0,0 +1,132 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv32 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=RV32 +; RUN: llc -mtriple=riscv64-none-elf -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=RV64 + +define i32 @mask_pair(i32 %x, i32 %y) { +; RV32-LABEL: mask_pair: +; RV32: # %bb.0: +; RV32-NEXT: srl a0, a0, a1 +; RV32-NEXT: sll a0, a0, a1 +; RV32-NEXT: ret +; +; RV64-LABEL: mask_pair: +; RV64: # %bb.0: +; RV64-NEXT: srlw a0, a0, a1 +; RV64-NEXT: sllw a0, a0, a1 +; RV64-NEXT: ret + %shl = shl nsw i32 -1, %y + %and = and i32 %shl, %x + ret i32 %and +} + +define i64 @mask_pair_64(i64 %x, i64 %y) { +; RV32-LABEL: mask_pair_64: +; RV32: # %bb.0: +; RV32-NEXT: li a3, -1 +; RV32-NEXT: addi a4, a2, -32 +; RV32-NEXT: sll a3, a3, a2 +; RV32-NEXT: bltz a4, .LBB1_2 +; RV32-NEXT: # %bb.1: +; RV32-NEXT: mv a2, a3 +; RV32-NEXT: j .LBB1_3 +; RV32-NEXT: .LBB1_2: +; RV32-NEXT: not a2, a2 +; RV32-NEXT: lui a5, 524288 +; RV32-NEXT: addi a5, a5, -1 +; RV32-NEXT: srl a2, a5, a2 +; RV32-NEXT: or a2, a3, a2 +; RV32-NEXT: .LBB1_3: +; RV32-NEXT: srai a4, a4, 31 +; RV32-NEXT: and a3, a4, a3 +; RV32-NEXT: and a1, a2, a1 +; RV32-NEXT: and a0, a3, a0 +; RV32-NEXT: ret +; +; RV64-LABEL: mask_pair_64: +; RV64: # %bb.0: +; RV64-NEXT: srl a0, a0, a1 +; RV64-NEXT: sll a0, a0, a1 +; RV64-NEXT: ret + %shl = shl nsw i64 -1, %y + %and = and i64 %shl, %x + ret i64 %and +} + +define i128 @mask_pair_128(i128 %x, i128 %y) { +; RV32-LABEL: mask_pair_128: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -32 +; RV32-NEXT: .cfi_def_cfa_offset 32 +; RV32-NEXT: lw a5, 0(a1) +; RV32-NEXT: lw a4, 4(a1) +; RV32-NEXT: lw a3, 8(a1) +; RV32-NEXT: lw a1, 12(a1) +; RV32-NEXT: lw a2, 0(a2) +; RV32-NEXT: li a6, -1 +; RV32-NEXT: sw zero, 0(sp) +; RV32-NEXT: sw zero, 4(sp) +; RV32-NEXT: sw zero, 8(sp) +; RV32-NEXT: sw zero, 12(sp) +; RV32-NEXT: addi a7, sp, 16 +; RV32-NEXT: sw a6, 16(sp) +; RV32-NEXT: sw a6, 20(sp) +; RV32-NEXT: sw a6, 24(sp) +; RV32-NEXT: sw a6, 28(sp) +; RV32-NEXT: srli a6, a2, 3 +; RV32-NEXT: andi a6, a6, 12 +; RV32-NEXT: sub a6, a7, a6 +; RV32-NEXT: lw a7, 4(a6) +; RV32-NEXT: lw t0, 8(a6) +; RV32-NEXT: lw t1, 12(a6) +; RV32-NEXT: lw a6, 0(a6) +; RV32-NEXT: andi t2, a2, 31 +; RV32-NEXT: xori t2, t2, 31 +; RV32-NEXT: sll t1, t1, a2 +; RV32-NEXT: srli t3, t0, 1 +; RV32-NEXT: sll t0, t0, a2 +; RV32-NEXT: srli t4, a7, 1 +; RV32-NEXT: sll a7, a7, a2 +; RV32-NEXT: sll a2, a6, a2 +; RV32-NEXT: srli a6, a6, 1 +; RV32-NEXT: srl t3, t3, t2 +; RV32-NEXT: srl t4, t4, t2 +; RV32-NEXT: srl a6, a6, t2 +; RV32-NEXT: and a2, a2, a5 +; RV32-NEXT: or a5, t1, t3 +; RV32-NEXT: or t0, t0, t4 +; RV32-NEXT: or a6, a7, a6 +; RV32-NEXT: and a4, a6, a4 +; RV32-NEXT: and a3, t0, a3 +; RV32-NEXT: and a1, a5, a1 +; RV32-NEXT: sw a2, 0(a0) +; RV32-NEXT: sw a4, 4(a0) +; RV32-NEXT: sw a3, 8(a0) +; RV32-NEXT: sw a1, 12(a0) +; RV32-NEXT: addi sp, sp, 32 +; RV32-NEXT: .cfi_def_cfa_offset 0 +; RV32-NEXT: ret +; +; RV64-LABEL: mask_pair_128: +; RV64: # %bb.0: +; RV64-NEXT: li a5, -1 +; RV64-NEXT: addi a4, a2, -64 +; RV64-NEXT: sll a3, a5, a2 +; RV64-NEXT: bltz a4, .LBB2_2 +; RV64-NEXT: # %bb.1: +; RV64-NEXT: mv a2, a3 +; RV64-NEXT: j .LBB2_3 +; RV64-NEXT: .LBB2_2: +; RV64-NEXT: not a2, a2 +; RV64-NEXT: srli a5, a5, 1 +; RV64-NEXT: srl a2, a5, 
a2
+; RV64-NEXT:    or a2, a3, a2
+; RV64-NEXT:  .LBB2_3:
+; RV64-NEXT:    srai a4, a4, 63
+; RV64-NEXT:    and a3, a4, a3
+; RV64-NEXT:    and a1, a2, a1
+; RV64-NEXT:    and a0, a3, a0
+; RV64-NEXT:    ret
+  %shl = shl nsw i128 -1, %y
+  %and = and i128 %shl, %x
+  ret i128 %and
+}

From 849038cad16f18d77b5cd277980c93e8efbf1bbc Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Tue, 4 Nov 2025 20:53:42 -0800
Subject: [PATCH 277/313] AMDGPU: Do not infer implicit inputs for !nocallback intrinsics (#131759)

This isn't really the right check; we want to know that the intrinsic
does not perform a true function call to any code (in the module or
not). nocallback appears to be the closest thing to this property we
have now, though.

Fixes theoretical miscompiles with intrinsics like statepoint, which
hide a call to a real function.

Also do the same for inferring no-agpr usage.
---
 llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp   | 27 ++++++-
 ...attributor-intrinsic-missing-nocallback.ll | 31 ++++++++
 ...amdgpu-attributor-nocallback-intrinsics.ll | 74 +++++++++++++++++++
 3 files changed, 128 insertions(+), 4 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-attributor-intrinsic-missing-nocallback.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-attributor-nocallback-intrinsics.ll

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
index 8669978637f40..56ab040706a13 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
@@ -38,9 +38,10 @@ enum ImplicitArgumentPositions {
 #define AMDGPU_ATTRIBUTE(Name, Str) Name = 1 << Name##_POS,
 enum ImplicitArgumentMask {
-  NOT_IMPLICIT_INPUT = 0,
+  UNKNOWN_INTRINSIC = 0,
 #include "AMDGPUAttributes.def"
-  ALL_ARGUMENT_MASK = (1 << LAST_ARG_POS) - 1
+  ALL_ARGUMENT_MASK = (1 << LAST_ARG_POS) - 1,
+  NOT_IMPLICIT_INPUT
 };
 #define AMDGPU_ATTRIBUTE(Name, Str) {Name, Str},
@@ -115,7 +116,7 @@ intrinsicToAttrMask(Intrinsic::ID ID, bool &NonKernelOnly, bool &NeedsImplicit,
     NeedsImplicit = (CodeObjectVersion >= AMDGPU::AMDHSA_COV5);
     return QUEUE_PTR;
   default:
-    return NOT_IMPLICIT_INPUT;
+    return UNKNOWN_INTRINSIC;
   }
 }
@@ -534,6 +535,21 @@ struct AAAMDAttributesFunction : public AAAMDAttributes {
       ImplicitArgumentMask AttrMask =
           intrinsicToAttrMask(IID, NonKernelOnly, NeedsImplicit,
                               HasApertureRegs, SupportsGetDoorbellID, COV);
+
+      if (AttrMask == UNKNOWN_INTRINSIC) {
+        // Assume not-nocallback intrinsics may invoke a function which accesses
+        // implicit arguments.
+        //
+        // FIXME: This isn't really the correct check. We want to ensure it
+        // isn't calling any function that may use implicit arguments regardless
+        // of whether it's internal to the module or not.
+        //
+        // TODO: Ignoring callsite attributes.
+        if (!Callee->hasFnAttribute(Attribute::NoCallback))
+          return indicatePessimisticFixpoint();
+        continue;
+      }
+
       if (AttrMask != NOT_IMPLICIT_INPUT) {
         if ((IsNonEntryFunc || !NonKernelOnly))
           removeAssumedBits(AttrMask);
@@ -1357,7 +1373,10 @@ struct AAAMDGPUMinAGPRAlloc
     default:
       // Some intrinsics may use AGPRs, but if we have a choice, we are not
       // required to use AGPRs.
-      return true;
+
+      // Assume !nocallback intrinsics may call a function which requires
+      // AGPRs.
+ return CB.hasFnAttr(Attribute::NoCallback); } // TODO: Handle callsite attributes diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-intrinsic-missing-nocallback.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-intrinsic-missing-nocallback.ll new file mode 100644 index 0000000000000..d7d623ac89146 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-intrinsic-missing-nocallback.ll @@ -0,0 +1,31 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 5 +; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a -passes=amdgpu-attributor %s | FileCheck %s + +; Make sure we do not infer anything about implicit inputs through an +; intrinsic call which is not nocallback. + +declare zeroext i32 @return_i32() + +define i32 @test_i32_return() gc "statepoint-example" { +; CHECK-LABEL: define i32 @test_i32_return( +; CHECK-SAME: ) #[[ATTR0:[0-9]+]] gc "statepoint-example" { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[SAFEPOINT_TOKEN:%.*]] = tail call token (i64, i32, ptr, i32, i32, ...) @llvm.experimental.gc.statepoint.p0(i64 0, i32 0, ptr elementtype(i32 ()) @return_i32, i32 0, i32 0, i32 0, i32 0) +; CHECK-NEXT: [[CALL1:%.*]] = call zeroext i32 @llvm.experimental.gc.result.i32(token [[SAFEPOINT_TOKEN]]) +; CHECK-NEXT: ret i32 [[CALL1]] +; +entry: + %safepoint_token = tail call token (i64, i32, ptr, i32, i32, ...) @llvm.experimental.gc.statepoint.p0(i64 0, i32 0, ptr elementtype(i32 ()) @return_i32, i32 0, i32 0, i32 0, i32 0) + %call1 = call zeroext i32 @llvm.experimental.gc.result.i32(token %safepoint_token) + ret i32 %call1 +} + +declare token @llvm.experimental.gc.statepoint.p0(i64 immarg, i32 immarg, ptr, i32 immarg, i32 immarg, ...) +declare i32 @llvm.experimental.gc.result.i32(token) #0 + +attributes #0 = { nocallback nofree nosync nounwind willreturn memory(none) } +;. +; CHECK: attributes #[[ATTR0]] = { "target-cpu"="gfx90a" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR1:[0-9]+]] = { "target-cpu"="gfx90a" } +; CHECK: attributes #[[ATTR2:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) "target-cpu"="gfx90a" } +;. diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-nocallback-intrinsics.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-nocallback-intrinsics.ll new file mode 100644 index 0000000000000..71c509afa8e64 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-nocallback-intrinsics.ll @@ -0,0 +1,74 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-attributes --check-globals all --version 5 +; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -passes=amdgpu-attributor -mcpu=gfx90a %s | FileCheck %s + +; Make sure we infer no inputs are used through some intrinsics + +define void @use_fake_use(i32 %arg) { +; CHECK-LABEL: define void @use_fake_use( +; CHECK-SAME: i32 [[ARG:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: call void (...) @llvm.fake.use(i32 [[ARG]]) +; CHECK-NEXT: ret void +; + call void (...) 
@llvm.fake.use(i32 %arg) + ret void +} + +define void @use_donothing() { +; CHECK-LABEL: define void @use_donothing( +; CHECK-SAME: ) #[[ATTR0]] { +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: ret void +; + call void @llvm.donothing() + ret void +} + +define void @use_assume(i1 %arg) { +; CHECK-LABEL: define void @use_assume( +; CHECK-SAME: i1 [[ARG:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: call void @llvm.assume(i1 [[ARG]]) +; CHECK-NEXT: ret void +; + call void @llvm.assume(i1 %arg) + ret void +} + +define void @use_trap() { +; CHECK-LABEL: define void @use_trap( +; CHECK-SAME: ) #[[ATTR1:[0-9]+]] { +; CHECK-NEXT: call void @llvm.trap() +; CHECK-NEXT: ret void +; + call void @llvm.trap() + ret void +} + +define void @use_debugtrap() { +; CHECK-LABEL: define void @use_debugtrap( +; CHECK-SAME: ) #[[ATTR1]] { +; CHECK-NEXT: call void @llvm.debugtrap() +; CHECK-NEXT: ret void +; + call void @llvm.debugtrap() + ret void +} + +define void @use_ubsantrap() { +; CHECK-LABEL: define void @use_ubsantrap( +; CHECK-SAME: ) #[[ATTR1]] { +; CHECK-NEXT: call void @llvm.ubsantrap(i8 0) +; CHECK-NEXT: ret void +; + call void @llvm.ubsantrap(i8 0) + ret void +} + +;. +; CHECK: attributes #[[ATTR0]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx90a" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx90a" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR2:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write) "target-cpu"="gfx90a" } +; CHECK: attributes #[[ATTR3:[0-9]+]] = { nounwind "target-cpu"="gfx90a" } +; CHECK: attributes #[[ATTR4:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) "target-cpu"="gfx90a" } +; CHECK: attributes #[[ATTR5:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: readwrite) "target-cpu"="gfx90a" } +; CHECK: attributes #[[ATTR6:[0-9]+]] = { cold noreturn nounwind memory(inaccessiblemem: write) "target-cpu"="gfx90a" } +;. From 044e0f041d5c02ae3b4e44bab4647008ff497d9f Mon Sep 17 00:00:00 2001 From: Matt Arsenault <Matthew.Arsenault@amd.com> Date: Tue, 4 Nov 2025 21:11:03 -0800 Subject: [PATCH 278/313] Revert "IR: Remove null UseList checks in hasNUses methods (#165929)" (#166500) This reverts commit 93e860e694770f52a9eeecda88ba11173c291ef8. hasOneUse still has the null check, and it seems bad to be logically inconsistent across multiple of these predicate functions. 
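
As a rough illustration of the restored semantics, here is a minimal
standalone sketch -- SketchValue and its vector-backed use list are
hypothetical stand-ins for the real Value/Use machinery, not LLVM API.
A value kind that tracks no use list at all, as ConstantData does after
use-list removal, should answer these predicates as having exactly zero
uses rather than walking a null list:

  #include <cassert>
  #include <vector>

  struct SketchValue {
    // nullptr models a value kind that does not track uses at all.
    const std::vector<int> *UseList = nullptr;

    bool hasNUses(unsigned N) const {
      if (!UseList)
        return N == 0; // no use list: exactly zero uses
      return UseList->size() == N;
    }

    bool hasNUsesOrMore(unsigned N) const {
      if (!UseList)
        return N == 0; // zero uses: "at least N" holds only for N == 0
      return UseList->size() >= N;
    }
  };

  int main() {
    SketchValue ConstantLike; // use-list-free, like ConstantData
    assert(ConstantLike.hasNUses(0));
    assert(!ConstantLike.hasNUses(1));
    assert(!ConstantLike.hasNUsesOrMore(1));
  }

With the guard back, hasNUses and hasNUsesOrMore agree with hasOneUse on
use-list-free values, which is the consistency this revert restores.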
--- llvm/lib/IR/Value.cpp | 8 +++++++ llvm/unittests/IR/ConstantsTest.cpp | 36 ++++++++--------------------- 2 files changed, 18 insertions(+), 26 deletions(-) diff --git a/llvm/lib/IR/Value.cpp b/llvm/lib/IR/Value.cpp index 95d61a987f6c1..b775cbb0c7920 100644 --- a/llvm/lib/IR/Value.cpp +++ b/llvm/lib/IR/Value.cpp @@ -148,10 +148,18 @@ void Value::destroyValueName() { } bool Value::hasNUses(unsigned N) const { + if (!UseList) + return N == 0; + + // TODO: Disallow for ConstantData and remove !UseList check? return hasNItems(use_begin(), use_end(), N); } bool Value::hasNUsesOrMore(unsigned N) const { + // TODO: Disallow for ConstantData and remove !UseList check? + if (!UseList) + return N == 0; + return hasNItemsOrMore(use_begin(), use_end(), N); } diff --git a/llvm/unittests/IR/ConstantsTest.cpp b/llvm/unittests/IR/ConstantsTest.cpp index 9cb9e1236b2d1..6376165cbe766 100644 --- a/llvm/unittests/IR/ConstantsTest.cpp +++ b/llvm/unittests/IR/ConstantsTest.cpp @@ -29,8 +29,13 @@ TEST(ConstantsTest, UseCounts) { EXPECT_TRUE(Zero->use_empty()); EXPECT_EQ(Zero->getNumUses(), 0u); + EXPECT_TRUE(Zero->hasNUses(0)); EXPECT_FALSE(Zero->hasOneUse()); EXPECT_FALSE(Zero->hasOneUser()); + EXPECT_FALSE(Zero->hasNUses(1)); + EXPECT_FALSE(Zero->hasNUsesOrMore(1)); + EXPECT_FALSE(Zero->hasNUses(2)); + EXPECT_FALSE(Zero->hasNUsesOrMore(2)); std::unique_ptr<Module> M(new Module("MyModule", Context)); @@ -45,36 +50,15 @@ TEST(ConstantsTest, UseCounts) { // Still looks like use_empty with uses. EXPECT_TRUE(Zero->use_empty()); EXPECT_EQ(Zero->getNumUses(), 0u); + EXPECT_TRUE(Zero->hasNUses(0)); EXPECT_FALSE(Zero->hasOneUse()); EXPECT_FALSE(Zero->hasOneUser()); + EXPECT_FALSE(Zero->hasNUses(1)); + EXPECT_FALSE(Zero->hasNUsesOrMore(1)); + EXPECT_FALSE(Zero->hasNUses(2)); + EXPECT_FALSE(Zero->hasNUsesOrMore(2)); } -#ifdef GTEST_HAS_DEATH_TEST -#ifndef NDEBUG - -TEST(ConstantsTest, hasNUsesInvalid) { - LLVMContext Context; - Type *Int32Ty = Type::getInt32Ty(Context); - Constant *Zero = ConstantInt::get(Int32Ty, 0); - std::unique_ptr<Module> M(new Module("MyModule", Context)); - - // Introduce some uses - new GlobalVariable(*M, Int32Ty, /*isConstant=*/false, - GlobalValue::ExternalLinkage, /*Initializer=*/Zero, - "gv_user0"); - new GlobalVariable(*M, Int32Ty, /*isConstant=*/false, - GlobalValue::ExternalLinkage, /*Initializer=*/Zero, - "gv_user1"); - - for (int I = 0; I != 3; ++I) { - EXPECT_DEATH(Zero->hasNUses(I), "hasUseList\\(\\)"); - EXPECT_DEATH(Zero->hasNUsesOrMore(I), "hasUseList\\(\\)"); - } -} - -#endif -#endif - TEST(ConstantsTest, Integer_i1) { LLVMContext Context; IntegerType *Int1 = IntegerType::get(Context, 1); From 87b1d3537ae6adcb0a16cc0fa2749862d01009d3 Mon Sep 17 00:00:00 2001 From: LU-JOHN <John.Lu@amd.com> Date: Tue, 4 Nov 2025 23:18:40 -0600 Subject: [PATCH 279/313] [AMDGPU][NFC] Avoid copying MachineOperands (#166293) Avoid copying machine operands. 
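
For illustration, the pattern the patch applies, sketched outside LLVM
(Operand and Inst here are hypothetical stand-ins, not the real
MachineOperand/MachineInstr): when an accessor already returns a
reference, binding the result to a const reference avoids materializing
a copy of the operand.

  #include <vector>

  struct Operand {
    std::vector<int> Payload; // copying an operand is not free
  };

  struct Inst {
    std::vector<Operand> Ops;
    const Operand &getOperand(unsigned I) const { return Ops[I]; }
  };

  void inspect(const Inst &I) {
    Operand ByValue = I.getOperand(0);      // copies Payload
    const Operand &ByRef = I.getOperand(0); // binds to the stored operand
    (void)ByValue;
    (void)ByRef;
  }

None of the changed call sites mutate the operand afterwards, so taking
a const reference is behavior-preserving and the change stays NFC.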
Signed-off-by: John Lu <John.Lu@amd.com> --- llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp | 6 +++--- llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp | 2 +- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 8 ++++---- llvm/lib/Target/AMDGPU/SIInstrInfo.h | 2 +- llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp | 2 +- 5 files changed, 10 insertions(+), 10 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index 957d7164b686e..15ed60b46a9c0 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -6764,7 +6764,7 @@ bool AMDGPUInstructionSelector::selectSGetBarrierState( MachineInstr &I, Intrinsic::ID IntrID) const { MachineBasicBlock *MBB = I.getParent(); const DebugLoc &DL = I.getDebugLoc(); - MachineOperand BarOp = I.getOperand(2); + const MachineOperand &BarOp = I.getOperand(2); std::optional<int64_t> BarValImm = getIConstantVRegSExtVal(BarOp.getReg(), *MRI); @@ -6817,8 +6817,8 @@ bool AMDGPUInstructionSelector::selectNamedBarrierInit( MachineInstr &I, Intrinsic::ID IntrID) const { MachineBasicBlock *MBB = I.getParent(); const DebugLoc &DL = I.getDebugLoc(); - MachineOperand BarOp = I.getOperand(1); - MachineOperand CntOp = I.getOperand(2); + const MachineOperand &BarOp = I.getOperand(1); + const MachineOperand &CntOp = I.getOperand(2); // BarID = (BarOp >> 4) & 0x3F Register TmpReg0 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); diff --git a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp index 4deb2a9485e4d..62172a0bb89db 100644 --- a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp +++ b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp @@ -136,7 +136,7 @@ bool GCNPreRAOptimizationsImpl::processReg(Register Reg) { continue; if (Def.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32_e64) { - MachineOperand DefSrcMO = Def.getOperand(1); + const MachineOperand &DefSrcMO = Def.getOperand(1); // Immediates are not an issue and can be propagated in // postrapseudos pass. 
Only handle cases where defining diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 45f591927b86e..9460145d47111 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -7945,7 +7945,7 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist, } legalizeOperands(*NewInstr, MDT); int SCCIdx = Inst.findRegisterDefOperandIdx(AMDGPU::SCC, /*TRI=*/nullptr); - MachineOperand SCCOp = Inst.getOperand(SCCIdx); + const MachineOperand &SCCOp = Inst.getOperand(SCCIdx); addSCCDefUsersToVALUWorklist(SCCOp, Inst, Worklist, CondReg); Inst.eraseFromParent(); return; @@ -7985,7 +7985,7 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist, legalizeOperandsVALUt16(*NewInstr, MRI); legalizeOperands(*NewInstr, MDT); int SCCIdx = Inst.findRegisterDefOperandIdx(AMDGPU::SCC, /*TRI=*/nullptr); - MachineOperand SCCOp = Inst.getOperand(SCCIdx); + const MachineOperand &SCCOp = Inst.getOperand(SCCIdx); addSCCDefUsersToVALUWorklist(SCCOp, Inst, Worklist, CondReg); Inst.eraseFromParent(); return; @@ -8183,7 +8183,7 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist, AMDGPU::OpName::src0_modifiers) >= 0) NewInstr.addImm(0); if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::src0)) { - MachineOperand Src = Inst.getOperand(1); + const MachineOperand &Src = Inst.getOperand(1); NewInstr->addOperand(Src); } @@ -9199,7 +9199,7 @@ void SIInstrInfo::movePackToVALU(SIInstrWorklist &Worklist, addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); } -void SIInstrInfo::addSCCDefUsersToVALUWorklist(MachineOperand &Op, +void SIInstrInfo::addSCCDefUsersToVALUWorklist(const MachineOperand &Op, MachineInstr &SCCDefInst, SIInstrWorklist &Worklist, Register NewCond) const { diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h index dc23a21f959ce..0643b532ea04c 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -172,7 +172,7 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo { void addUsersToMoveToVALUWorklist(Register Reg, MachineRegisterInfo &MRI, SIInstrWorklist &Worklist) const; - void addSCCDefUsersToVALUWorklist(MachineOperand &Op, + void addSCCDefUsersToVALUWorklist(const MachineOperand &Op, MachineInstr &SCCDefInst, SIInstrWorklist &Worklist, Register NewCond = Register()) const; diff --git a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp index abefa32b8f802..8785968569d92 100644 --- a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp +++ b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp @@ -640,7 +640,7 @@ void SIPreEmitPeephole::collectUnpackingCandidates( } void SIPreEmitPeephole::performF32Unpacking(MachineInstr &I) { - MachineOperand DstOp = I.getOperand(0); + const MachineOperand &DstOp = I.getOperand(0); uint16_t UnpackedOpcode = mapToUnpackedOpcode(I); assert(UnpackedOpcode != std::numeric_limits<uint16_t>::max() && From 4c2a9c4ba3796799032c12673510279d51c65370 Mon Sep 17 00:00:00 2001 From: Thurston Dang <thurston@google.com> Date: Wed, 5 Nov 2025 05:47:35 +0000 Subject: [PATCH 280/313] [msan][test] Add some avx512bf16 tests (#166219) Forked from llvm/test/CodeGen/X86 --- .../X86/avx512bf16-intrinsics.ll | 355 ++++++++ .../MemorySanitizer/X86/avx512bf16-mov.ll | 123 +++ .../X86/avx512bf16-vl-intrinsics.ll | 774 ++++++++++++++++++ 3 files changed, 1252 insertions(+) create mode 100644 llvm/test/Instrumentation/MemorySanitizer/X86/avx512bf16-intrinsics.ll create mode 100644 
llvm/test/Instrumentation/MemorySanitizer/X86/avx512bf16-mov.ll create mode 100644 llvm/test/Instrumentation/MemorySanitizer/X86/avx512bf16-vl-intrinsics.ll diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512bf16-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512bf16-intrinsics.ll new file mode 100644 index 0000000000000..877fe5fe4b393 --- /dev/null +++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512bf16-intrinsics.ll @@ -0,0 +1,355 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; RUN: opt -S -passes=msan -mattr=+avx512bf16 < %s | FileCheck %s +; +; Forked from llvm/test/CodeGen/X86/avx512bf16-intrinsics.ll +; +; Strictly handled: +; - llvm.x86.avx512bf16.cvtne2ps2bf16.512(<16 x float> %A, <16 x float> %B) +; - llvm.x86.avx512bf16.cvtneps2bf16.512(<16 x float> %A) +; - llvm.x86.avx512bf16.dpbf16ps.512(<16 x float> %E, <32 x bfloat> %A, <32 x bfloat> %B) +; +; Heuristically handled: (none) + +target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +declare <32 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.512(<16 x float>, <16 x float>) #3 + +define <8 x i64> @test_mm512_cvtne2ps2bf16_512(<16 x float> %A, <16 x float> %B) local_unnamed_addr #2 sanitize_memory { +; CHECK-LABEL: define <8 x i64> @test_mm512_cvtne2ps2bf16_512( +; CHECK-SAME: <16 x float> [[A:%.*]], <16 x float> [[B:%.*]]) local_unnamed_addr #[[ATTR1:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i32> [[TMP0]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP2]], 0 +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1:![0-9]+]] +; CHECK: [[BB4]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4:[0-9]+]] +; CHECK-NEXT: unreachable +; CHECK: [[BB5]]: +; CHECK-NEXT: [[TMP6:%.*]] = tail call <32 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.512(<16 x float> [[A]], <16 x float> [[B]]) +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <32 x bfloat> [[TMP6]] to <8 x i64> +; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP7]] +; +entry: + %0 = tail call <32 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.512(<16 x float> %A, <16 x float> %B) #4 + %1 = bitcast <32 x bfloat> %0 to <8 x i64> + ret <8 x i64> %1 +} + +define <8 x i64> @test_mm512_maskz_cvtne2ps2bf16_512(<16 x float> %A, <16 x float> %B, i32 %U) local_unnamed_addr #2 sanitize_memory { +; CHECK-LABEL: define <8 x i64> @test_mm512_maskz_cvtne2ps2bf16_512( +; CHECK-SAME: <16 x float> [[A:%.*]], <16 x float> [[B:%.*]], i32 [[U:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x 
i32> [[TMP0]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]] +; CHECK: [[BB5]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: unreachable +; CHECK: [[BB6]]: +; CHECK-NEXT: [[TMP7:%.*]] = tail call <32 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.512(<16 x float> [[A]], <16 x float> [[B]]) +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i32 [[TMP2]] to <32 x i1> +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i32 [[U]] to <32 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = select <32 x i1> [[TMP9]], <32 x i16> zeroinitializer, <32 x i16> zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <32 x bfloat> [[TMP7]] to <32 x i16> +; CHECK-NEXT: [[TMP12:%.*]] = xor <32 x i16> [[TMP11]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = or <32 x i16> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = or <32 x i16> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <32 x i1> [[TMP8]], <32 x i16> [[TMP14]], <32 x i16> [[TMP10]] +; CHECK-NEXT: [[TMP15:%.*]] = select <32 x i1> [[TMP9]], <32 x bfloat> [[TMP7]], <32 x bfloat> zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <32 x i16> [[_MSPROP_SELECT]] to <8 x i64> +; CHECK-NEXT: [[TMP17:%.*]] = bitcast <32 x bfloat> [[TMP15]] to <8 x i64> +; CHECK-NEXT: store <8 x i64> [[TMP16]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP17]] +; +entry: + %0 = tail call <32 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.512(<16 x float> %A, <16 x float> %B) #4 + %1 = bitcast i32 %U to <32 x i1> + %2 = select <32 x i1> %1, <32 x bfloat> %0, <32 x bfloat> zeroinitializer + %3 = bitcast <32 x bfloat> %2 to <8 x i64> + ret <8 x i64> %3 +} + +define <8 x i64> @test_mm512_mask_cvtne2ps2bf16_512(<8 x i64> %C, i32 %U, <16 x float> %A, <16 x float> %B) local_unnamed_addr #2 sanitize_memory { +; CHECK-LABEL: define <8 x i64> @test_mm512_mask_cvtne2ps2bf16_512( +; CHECK-SAME: <8 x i64> [[C:%.*]], i32 [[U:%.*]], <16 x float> [[A:%.*]], <16 x float> [[B:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP0]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]] +; CHECK: [[BB6]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: unreachable +; CHECK: [[BB7]]: +; CHECK-NEXT: [[TMP8:%.*]] = tail call <32 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.512(<16 x float> [[A]], <16 x float> [[B]]) +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i64> [[TMP2]] to <32 x i16> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i64> [[C]] to <32 x bfloat> 
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast i32 [[TMP3]] to <32 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast i32 [[U]] to <32 x i1> +; CHECK-NEXT: [[TMP13:%.*]] = select <32 x i1> [[TMP12]], <32 x i16> zeroinitializer, <32 x i16> [[TMP9]] +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <32 x bfloat> [[TMP8]] to <32 x i16> +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <32 x bfloat> [[TMP10]] to <32 x i16> +; CHECK-NEXT: [[TMP16:%.*]] = xor <32 x i16> [[TMP14]], [[TMP15]] +; CHECK-NEXT: [[TMP17:%.*]] = or <32 x i16> [[TMP16]], zeroinitializer +; CHECK-NEXT: [[TMP18:%.*]] = or <32 x i16> [[TMP17]], [[TMP9]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <32 x i1> [[TMP11]], <32 x i16> [[TMP18]], <32 x i16> [[TMP13]] +; CHECK-NEXT: [[TMP19:%.*]] = select <32 x i1> [[TMP12]], <32 x bfloat> [[TMP8]], <32 x bfloat> [[TMP10]] +; CHECK-NEXT: [[TMP20:%.*]] = bitcast <32 x i16> [[_MSPROP_SELECT]] to <8 x i64> +; CHECK-NEXT: [[TMP21:%.*]] = bitcast <32 x bfloat> [[TMP19]] to <8 x i64> +; CHECK-NEXT: store <8 x i64> [[TMP20]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP21]] +; +entry: + %0 = tail call <32 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.512(<16 x float> %A, <16 x float> %B) #4 + %1 = bitcast <8 x i64> %C to <32 x bfloat> + %2 = bitcast i32 %U to <32 x i1> + %3 = select <32 x i1> %2, <32 x bfloat> %0, <32 x bfloat> %1 + %4 = bitcast <32 x bfloat> %3 to <8 x i64> + ret <8 x i64> %4 +} + +declare <16 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.512(<16 x float>) #3 + +define <4 x i64> @test_mm512_cvtneps2bf16_512(<16 x float> %A) local_unnamed_addr #2 sanitize_memory { +; CHECK-LABEL: define <4 x i64> @test_mm512_cvtneps2bf16_512( +; CHECK-SAME: <16 x float> [[A:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i32> [[TMP0]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB2:.*]], label %[[BB3:.*]], !prof [[PROF1]] +; CHECK: [[BB2]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: unreachable +; CHECK: [[BB3]]: +; CHECK-NEXT: [[TMP4:%.*]] = tail call <16 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.512(<16 x float> [[A]]) +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x bfloat> [[TMP4]] to <4 x i64> +; CHECK-NEXT: store <4 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x i64> [[TMP5]] +; +entry: + %0 = tail call <16 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.512(<16 x float> %A) #4 + %1 = bitcast <16 x bfloat> %0 to <4 x i64> + ret <4 x i64> %1 +} + +define <4 x i64> @test_mm512_maskz_cvtneps2bf16_512(<16 x float> %A, i16 %U) local_unnamed_addr #2 sanitize_memory { +; CHECK-LABEL: define <4 x i64> @test_mm512_maskz_cvtneps2bf16_512( +; CHECK-SAME: <16 x float> [[A:%.*]], i16 [[U:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i32> [[TMP0]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP2]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]] +; CHECK: [[BB3]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: unreachable +; CHECK: [[BB4]]: 
+; CHECK-NEXT: [[TMP5:%.*]] = tail call <16 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.512(<16 x float> [[A]]) +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[TMP1]] to <16 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[U]] to <16 x i1> +; CHECK-NEXT: [[TMP8:%.*]] = select <16 x i1> [[TMP7]], <16 x i16> zeroinitializer, <16 x i16> zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x bfloat> [[TMP5]] to <16 x i16> +; CHECK-NEXT: [[TMP10:%.*]] = xor <16 x i16> [[TMP9]], zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i16> [[TMP10]], zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i16> [[TMP11]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP6]], <16 x i16> [[TMP12]], <16 x i16> [[TMP8]] +; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP7]], <16 x bfloat> [[TMP5]], <16 x bfloat> zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x i16> [[_MSPROP_SELECT]] to <4 x i64> +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <16 x bfloat> [[TMP13]] to <4 x i64> +; CHECK-NEXT: store <4 x i64> [[TMP14]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x i64> [[TMP15]] +; +entry: + %0 = tail call <16 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.512(<16 x float> %A) #4 + %1 = bitcast i16 %U to <16 x i1> + %2 = select <16 x i1> %1, <16 x bfloat> %0, <16 x bfloat> zeroinitializer + %3 = bitcast <16 x bfloat> %2 to <4 x i64> + ret <4 x i64> %3 +} + +define <4 x i64> @test_mm512_mask_cvtneps2bf16_512(<4 x i64> %C, i16 %U, <16 x float> %A) local_unnamed_addr #2 sanitize_memory { +; CHECK-LABEL: define <4 x i64> @test_mm512_mask_cvtneps2bf16_512( +; CHECK-SAME: <4 x i64> [[C:%.*]], i16 [[U:%.*]], <16 x float> [[A:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP0]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]] +; CHECK: [[BB4]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: unreachable +; CHECK: [[BB5]]: +; CHECK-NEXT: [[TMP6:%.*]] = tail call <16 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.512(<16 x float> [[A]]) +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i64> [[TMP1]] to <16 x i16> +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x i64> [[C]] to <16 x bfloat> +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[U]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i16> zeroinitializer, <16 x i16> [[TMP7]] +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x bfloat> [[TMP6]] to <16 x i16> +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x bfloat> [[TMP8]] to <16 x i16> +; CHECK-NEXT: [[TMP14:%.*]] = xor <16 x i16> [[TMP12]], [[TMP13]] +; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i16> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i16> [[TMP15]], [[TMP7]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i16> [[TMP16]], <16 x i16> [[TMP11]] +; CHECK-NEXT: [[TMP17:%.*]] = select <16 x i1> [[TMP10]], <16 x bfloat> [[TMP6]], <16 x bfloat> [[TMP8]] +; CHECK-NEXT: [[TMP18:%.*]] = bitcast <16 x i16> [[_MSPROP_SELECT]] to <4 x 
i64> +; CHECK-NEXT: [[TMP19:%.*]] = bitcast <16 x bfloat> [[TMP17]] to <4 x i64> +; CHECK-NEXT: store <4 x i64> [[TMP18]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x i64> [[TMP19]] +; +entry: + %0 = tail call <16 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.512(<16 x float> %A) #4 + %1 = bitcast <4 x i64> %C to <16 x bfloat> + %2 = bitcast i16 %U to <16 x i1> + %3 = select <16 x i1> %2, <16 x bfloat> %0, <16 x bfloat> %1 + %4 = bitcast <16 x bfloat> %3 to <4 x i64> + ret <4 x i64> %4 +} + +declare <16 x float> @llvm.x86.avx512bf16.dpbf16ps.512(<16 x float>, <32 x bfloat>, <32 x bfloat>) #3 + +define <16 x float> @test_mm512_dpbf16ps_512(<16 x float> %E, <32 x bfloat> %A, <32 x bfloat> %B) local_unnamed_addr #2 sanitize_memory { +; CHECK-LABEL: define <16 x float> @test_mm512_dpbf16ps_512( +; CHECK-SAME: <16 x float> [[E:%.*]], <32 x bfloat> [[A:%.*]], <32 x bfloat> [[B:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP0]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <32 x i16> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <32 x i16> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]] +; CHECK: [[BB6]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: unreachable +; CHECK: [[BB7]]: +; CHECK-NEXT: [[TMP8:%.*]] = tail call <16 x float> @llvm.x86.avx512bf16.dpbf16ps.512(<16 x float> [[E]], <32 x bfloat> [[A]], <32 x bfloat> [[B]]) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP8]] +; +entry: + %0 = tail call <16 x float> @llvm.x86.avx512bf16.dpbf16ps.512(<16 x float> %E, <32 x bfloat> %A, <32 x bfloat> %B) #4 + ret <16 x float> %0 +} + +define <16 x float> @test_mm512_maskz_dpbf16ps_512(<16 x float> %E, <32 x bfloat> %A, <32 x bfloat> %B, i16 zeroext %U) local_unnamed_addr #2 sanitize_memory { +; CHECK-LABEL: define <16 x float> @test_mm512_maskz_dpbf16ps_512( +; CHECK-SAME: <16 x float> [[E:%.*]], <32 x bfloat> [[A:%.*]], <32 x bfloat> [[B:%.*]], i16 zeroext [[U:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP0]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <32 x i16> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: 
[[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <32 x i16> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]] +; CHECK: [[BB7]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: unreachable +; CHECK: [[BB8]]: +; CHECK-NEXT: [[TMP9:%.*]] = tail call <16 x float> @llvm.x86.avx512bf16.dpbf16ps.512(<16 x float> [[E]], <32 x bfloat> [[A]], <32 x bfloat> [[B]]) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[U]] to <16 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = xor <16 x i32> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP16]], <16 x i32> [[TMP12]] +; CHECK-NEXT: [[TMP17:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> zeroinitializer +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP17]] +; +entry: + %0 = tail call <16 x float> @llvm.x86.avx512bf16.dpbf16ps.512(<16 x float> %E, <32 x bfloat> %A, <32 x bfloat> %B) #4 + %1 = bitcast i16 %U to <16 x i1> + %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer + ret <16 x float> %2 +} +define <16 x float> @test_mm512_mask_dpbf16ps_512(i16 zeroext %U, <16 x float> %E, <32 x bfloat> %A, <32 x bfloat> %B) local_unnamed_addr #2 sanitize_memory { +; CHECK-LABEL: define <16 x float> @test_mm512_mask_dpbf16ps_512( +; CHECK-SAME: i16 zeroext [[U:%.*]], <16 x float> [[E:%.*]], <32 x bfloat> [[A:%.*]], <32 x bfloat> [[B:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP0]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <32 x i16> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <32 x i16> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]] +; CHECK: [[BB7]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: unreachable +; CHECK: [[BB8]]: +; CHECK-NEXT: [[TMP9:%.*]] = tail call <16 x float> @llvm.x86.avx512bf16.dpbf16ps.512(<16 x float> [[E]], <32 x bfloat> [[A]], <32 x bfloat> [[B]]) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 
[[U]] to <16 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP0]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[E]] to <16 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP0]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]] +; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[E]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP18]] +; +entry: + %0 = tail call <16 x float> @llvm.x86.avx512bf16.dpbf16ps.512(<16 x float> %E, <32 x bfloat> %A, <32 x bfloat> %B) #4 + %1 = bitcast i16 %U to <16 x i1> + %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %E + ret <16 x float> %2 +} +;. +; CHECK: [[PROF1]] = !{!"branch_weights", i32 1, i32 1048575} +;. diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512bf16-mov.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512bf16-mov.ll new file mode 100644 index 0000000000000..ac65645a9ec2c --- /dev/null +++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512bf16-mov.ll @@ -0,0 +1,123 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; RUN: opt -S -passes=msan -mattr=+avx512bf16 < %s | FileCheck %s +; +; Forked from llvm/test/CodeGen/X86/avx512bf16-mov.ll +; +; Strictly handled: (none) +; +; Heuristically handled: (none) + +target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +define dso_local void @funbf16(ptr readonly %src, ptr writeonly %dst) sanitize_memory { +; CHECK-LABEL: define dso_local void @funbf16( +; CHECK-SAME: ptr readonly [[SRC:%.*]], ptr writeonly [[DST:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP0]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB2:.*]], label %[[BB3:.*]], !prof [[PROF1:![0-9]+]] +; CHECK: [[BB2]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3:[0-9]+]] +; CHECK-NEXT: unreachable +; CHECK: [[BB3]]: +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x bfloat>, ptr [[SRC]], align 1 +; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[SRC]] to i64 +; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080 +; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i16>, ptr [[TMP7]], align 1 +; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP4]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]] +; CHECK: [[BB8]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]] +; CHECK-NEXT: unreachable +; CHECK: [[BB9]]: +; CHECK-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[DST]] to i64 +; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 87960930222080 +; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr +; CHECK-NEXT: store <8 x i16> [[_MSLD]], ptr [[TMP12]], align 1 +; CHECK-NEXT: store <8 x bfloat> [[TMP4]], ptr [[DST]], align 1 +; CHECK-NEXT: 
[[_MSCMP5:%.*]] = icmp ne i64 [[TMP0]], 0 +; CHECK-NEXT: br i1 [[_MSCMP5]], label %[[BB13:.*]], label %[[BB14:.*]], !prof [[PROF1]] +; CHECK: [[BB13]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]] +; CHECK-NEXT: unreachable +; CHECK: [[BB14]]: +; CHECK-NEXT: [[TMP15:%.*]] = load <8 x bfloat>, ptr [[SRC]], align 32 +; CHECK-NEXT: [[TMP16:%.*]] = ptrtoint ptr [[SRC]] to i64 +; CHECK-NEXT: [[TMP17:%.*]] = xor i64 [[TMP16]], 87960930222080 +; CHECK-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr +; CHECK-NEXT: [[_MSLD1:%.*]] = load <8 x i16>, ptr [[TMP18]], align 32 +; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP6]], label %[[BB19:.*]], label %[[BB20:.*]], !prof [[PROF1]] +; CHECK: [[BB19]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]] +; CHECK-NEXT: unreachable +; CHECK: [[BB20]]: +; CHECK-NEXT: [[TMP21:%.*]] = ptrtoint ptr [[DST]] to i64 +; CHECK-NEXT: [[TMP22:%.*]] = xor i64 [[TMP21]], 87960930222080 +; CHECK-NEXT: [[TMP23:%.*]] = inttoptr i64 [[TMP22]] to ptr +; CHECK-NEXT: store <8 x i16> [[_MSLD1]], ptr [[TMP23]], align 32 +; CHECK-NEXT: store <8 x bfloat> [[TMP15]], ptr [[DST]], align 32 +; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i64 [[TMP0]], 0 +; CHECK-NEXT: br i1 [[_MSCMP7]], label %[[BB24:.*]], label %[[BB25:.*]], !prof [[PROF1]] +; CHECK: [[BB24]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]] +; CHECK-NEXT: unreachable +; CHECK: [[BB25]]: +; CHECK-NEXT: [[TMP26:%.*]] = load <16 x bfloat>, ptr [[SRC]], align 1 +; CHECK-NEXT: [[TMP27:%.*]] = ptrtoint ptr [[SRC]] to i64 +; CHECK-NEXT: [[TMP28:%.*]] = xor i64 [[TMP27]], 87960930222080 +; CHECK-NEXT: [[TMP29:%.*]] = inttoptr i64 [[TMP28]] to ptr +; CHECK-NEXT: [[_MSLD2:%.*]] = load <16 x i16>, ptr [[TMP29]], align 1 +; CHECK-NEXT: [[_MSCMP8:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP8]], label %[[BB30:.*]], label %[[BB31:.*]], !prof [[PROF1]] +; CHECK: [[BB30]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]] +; CHECK-NEXT: unreachable +; CHECK: [[BB31]]: +; CHECK-NEXT: [[TMP32:%.*]] = ptrtoint ptr [[DST]] to i64 +; CHECK-NEXT: [[TMP33:%.*]] = xor i64 [[TMP32]], 87960930222080 +; CHECK-NEXT: [[TMP34:%.*]] = inttoptr i64 [[TMP33]] to ptr +; CHECK-NEXT: store <16 x i16> [[_MSLD2]], ptr [[TMP34]], align 1 +; CHECK-NEXT: store <16 x bfloat> [[TMP26]], ptr [[DST]], align 1 +; CHECK-NEXT: [[_MSCMP9:%.*]] = icmp ne i64 [[TMP0]], 0 +; CHECK-NEXT: br i1 [[_MSCMP9]], label %[[BB35:.*]], label %[[BB36:.*]], !prof [[PROF1]] +; CHECK: [[BB35]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]] +; CHECK-NEXT: unreachable +; CHECK: [[BB36]]: +; CHECK-NEXT: [[TMP37:%.*]] = load <16 x bfloat>, ptr [[SRC]], align 32 +; CHECK-NEXT: [[TMP38:%.*]] = ptrtoint ptr [[SRC]] to i64 +; CHECK-NEXT: [[TMP39:%.*]] = xor i64 [[TMP38]], 87960930222080 +; CHECK-NEXT: [[TMP40:%.*]] = inttoptr i64 [[TMP39]] to ptr +; CHECK-NEXT: [[_MSLD3:%.*]] = load <16 x i16>, ptr [[TMP40]], align 32 +; CHECK-NEXT: [[_MSCMP10:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP10]], label %[[BB41:.*]], label %[[BB42:.*]], !prof [[PROF1]] +; CHECK: [[BB41]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]] +; CHECK-NEXT: unreachable +; CHECK: [[BB42]]: +; CHECK-NEXT: [[TMP43:%.*]] = ptrtoint ptr [[DST]] to i64 +; CHECK-NEXT: [[TMP44:%.*]] = xor i64 [[TMP43]], 87960930222080 +; CHECK-NEXT: [[TMP45:%.*]] = inttoptr i64 [[TMP44]] to ptr +; CHECK-NEXT: store <16 x i16> [[_MSLD3]], ptr [[TMP45]], align 32 +; CHECK-NEXT: store <16 
x bfloat> [[TMP37]], ptr [[DST]], align 32 +; CHECK-NEXT: ret void +; +entry: + %0 = load <8 x bfloat>, ptr %src, align 1 + store <8 x bfloat> %0, ptr %dst, align 1 + %1 = load <8 x bfloat>, ptr %src, align 32 + store <8 x bfloat> %1, ptr %dst, align 32 + %2 = load <16 x bfloat>, ptr %src, align 1 + store <16 x bfloat> %2, ptr %dst, align 1 + %3 = load <16 x bfloat>, ptr %src, align 32 + store <16 x bfloat> %3, ptr %dst, align 32 + ret void +} +;. +; CHECK: [[PROF1]] = !{!"branch_weights", i32 1, i32 1048575} +;. diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512bf16-vl-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512bf16-vl-intrinsics.ll new file mode 100644 index 0000000000000..904614e961d6c --- /dev/null +++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512bf16-vl-intrinsics.ll @@ -0,0 +1,774 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; RUN: opt -S -passes=msan -mattr=+avx512bf16 -mattr=+avx512vl < %s | FileCheck %s +; +; Forked from llvm/test/CodeGen/X86/avx512bf16-vl-intrinsics.ll +; +; Strictly handled: +; - llvm.x86.avx512bf16.cvtne2ps2bf16.128(<4 x float> %A, <4 x float> %B) +; - llvm.x86.avx512bf16.cvtne2ps2bf16.256(<8 x float> %A, <8 x float> %B) +; - llvm.x86.avx512bf16.cvtneps2bf16.256(<8 x float> %A) +; - llvm.x86.avx512bf16.dpbf16ps.128(<4 x float> %E, <8 x bfloat> %A, <8 x bfloat> %B) +; - llvm.x86.avx512bf16.dpbf16ps.256(<8 x float> %E, <16 x bfloat> %A, <16 x bfloat> %B) +; - llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> %A, <8 x bfloat> %6, <4 x i1> %4) +; +; Heuristically handled: (none) + +target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +declare <8 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.128(<4 x float>, <4 x float>) #1 + +define <2 x i64> @test_mm_cvtne2ps2bf16_128(<4 x float> %A, <4 x float> %B) local_unnamed_addr #0 sanitize_memory { +; CHECK-LABEL: define <2 x i64> @test_mm_cvtne2ps2bf16_128( +; CHECK-SAME: <4 x float> [[A:%.*]], <4 x float> [[B:%.*]]) local_unnamed_addr #[[ATTR1:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP0]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP2]], 0 +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1:![0-9]+]] +; CHECK: [[BB4]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5:[0-9]+]] +; CHECK-NEXT: unreachable +; CHECK: [[BB5]]: +; CHECK-NEXT: [[TMP6:%.*]] = tail call <8 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.128(<4 x float> [[A]], <4 x float> [[B]]) +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x bfloat> [[TMP6]] to <2 x i64> +; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <2 x i64> [[TMP7]] +; +entry: + %0 = tail call <8 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.128(<4 x float> %A, <4 x float> %B) #2 + %1 = bitcast <8 x bfloat> %0 to <2 x i64> + ret <2 x i64> %1 +} + +define <2 x i64> @test_mm_maskz_cvtne2ps2bf16_128(<4 x float> %A, <4 x float> %B, i8 zeroext %U) 
local_unnamed_addr #0 sanitize_memory { +; CHECK-LABEL: define <2 x i64> @test_mm_maskz_cvtne2ps2bf16_128( +; CHECK-SAME: <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], i8 zeroext [[U:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP0]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]] +; CHECK: [[BB5]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] +; CHECK-NEXT: unreachable +; CHECK: [[BB6]]: +; CHECK-NEXT: [[TMP7:%.*]] = tail call <8 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.128(<4 x float> [[A]], <4 x float> [[B]]) +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[U]] to <8 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = select <8 x i1> [[TMP9]], <8 x i16> zeroinitializer, <8 x i16> zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x bfloat> [[TMP7]] to <8 x i16> +; CHECK-NEXT: [[TMP12:%.*]] = xor <8 x i16> [[TMP11]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = or <8 x i16> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i16> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP8]], <8 x i16> [[TMP14]], <8 x i16> [[TMP10]] +; CHECK-NEXT: [[TMP15:%.*]] = select <8 x i1> [[TMP9]], <8 x bfloat> [[TMP7]], <8 x bfloat> zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <8 x i16> [[_MSPROP_SELECT]] to <2 x i64> +; CHECK-NEXT: [[TMP17:%.*]] = bitcast <8 x bfloat> [[TMP15]] to <2 x i64> +; CHECK-NEXT: store <2 x i64> [[TMP16]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <2 x i64> [[TMP17]] +; +entry: + %0 = tail call <8 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.128(<4 x float> %A, <4 x float> %B) #2 + %1 = bitcast i8 %U to <8 x i1> + %2 = select <8 x i1> %1, <8 x bfloat> %0, <8 x bfloat> zeroinitializer + %3 = bitcast <8 x bfloat> %2 to <2 x i64> + ret <2 x i64> %3 +} + +define <2 x i64> @test_mm_mask_cvtne2ps2bf16_128(<2 x i64> %C, i8 zeroext %U, <4 x float> %A, <4 x float> %B) local_unnamed_addr #0 sanitize_memory { +; CHECK-LABEL: define <2 x i64> @test_mm_mask_cvtne2ps2bf16_128( +; CHECK-SAME: <2 x i64> [[C:%.*]], i8 zeroext [[U:%.*]], <4 x float> [[A:%.*]], <4 x float> [[B:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP0]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +; CHECK-NEXT: 
[[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]] +; CHECK: [[BB6]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] +; CHECK-NEXT: unreachable +; CHECK: [[BB7]]: +; CHECK-NEXT: [[TMP8:%.*]] = tail call <8 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.128(<4 x float> [[A]], <4 x float> [[B]]) +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <2 x i64> [[TMP2]] to <8 x i16> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <2 x i64> [[C]] to <8 x bfloat> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[U]] to <8 x i1> +; CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[TMP12]], <8 x i16> zeroinitializer, <8 x i16> [[TMP9]] +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <8 x bfloat> [[TMP8]] to <8 x i16> +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <8 x bfloat> [[TMP10]] to <8 x i16> +; CHECK-NEXT: [[TMP16:%.*]] = xor <8 x i16> [[TMP14]], [[TMP15]] +; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i16> [[TMP16]], zeroinitializer +; CHECK-NEXT: [[TMP18:%.*]] = or <8 x i16> [[TMP17]], [[TMP9]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP11]], <8 x i16> [[TMP18]], <8 x i16> [[TMP13]] +; CHECK-NEXT: [[TMP19:%.*]] = select <8 x i1> [[TMP12]], <8 x bfloat> [[TMP8]], <8 x bfloat> [[TMP10]] +; CHECK-NEXT: [[TMP20:%.*]] = bitcast <8 x i16> [[_MSPROP_SELECT]] to <2 x i64> +; CHECK-NEXT: [[TMP21:%.*]] = bitcast <8 x bfloat> [[TMP19]] to <2 x i64> +; CHECK-NEXT: store <2 x i64> [[TMP20]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <2 x i64> [[TMP21]] +; +entry: + %0 = tail call <8 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.128(<4 x float> %A, <4 x float> %B) #2 + %1 = bitcast <2 x i64> %C to <8 x bfloat> + %2 = bitcast i8 %U to <8 x i1> + %3 = select <8 x i1> %2, <8 x bfloat> %0, <8 x bfloat> %1 + %4 = bitcast <8 x bfloat> %3 to <2 x i64> + ret <2 x i64> %4 +} + +declare <16 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.256(<8 x float>, <8 x float>) #3 + +define <4 x i64> @test_mm256_cvtne2ps2bf16_256(<8 x float> %A, <8 x float> %B) local_unnamed_addr #1 sanitize_memory { +; CHECK-LABEL: define <4 x i64> @test_mm256_cvtne2ps2bf16_256( +; CHECK-SAME: <8 x float> [[A:%.*]], <8 x float> [[B:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i32> [[TMP0]] to i256 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP2]], 0 +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i32> [[TMP1]] to i256 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]] +; CHECK: [[BB4]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] +; CHECK-NEXT: unreachable +; CHECK: [[BB5]]: +; CHECK-NEXT: [[TMP6:%.*]] = tail call <16 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.256(<8 x float> [[A]], <8 x float> [[B]]) +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x bfloat> [[TMP6]] to <4 x i64> +; CHECK-NEXT: store <4 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x i64> [[TMP7]] +; +entry: + %0 = tail call <16 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.256(<8 x float> %A, <8 x float> %B) #4 
+ %1 = bitcast <16 x bfloat> %0 to <4 x i64> + ret <4 x i64> %1 +} + +define <4 x i64> @test_mm256_maskz_cvtne2ps2bf16_256(<8 x float> %A, <8 x float> %B, i16 zeroext %U) local_unnamed_addr #1 sanitize_memory { +; CHECK-LABEL: define <4 x i64> @test_mm256_maskz_cvtne2ps2bf16_256( +; CHECK-SAME: <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], i16 zeroext [[U:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i32> [[TMP0]] to i256 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i32> [[TMP1]] to i256 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]] +; CHECK: [[BB5]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] +; CHECK-NEXT: unreachable +; CHECK: [[BB6]]: +; CHECK-NEXT: [[TMP7:%.*]] = tail call <16 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.256(<8 x float> [[A]], <8 x float> [[B]]) +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[U]] to <16 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = select <16 x i1> [[TMP9]], <16 x i16> zeroinitializer, <16 x i16> zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x bfloat> [[TMP7]] to <16 x i16> +; CHECK-NEXT: [[TMP12:%.*]] = xor <16 x i16> [[TMP11]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = or <16 x i16> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i16> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP8]], <16 x i16> [[TMP14]], <16 x i16> [[TMP10]] +; CHECK-NEXT: [[TMP15:%.*]] = select <16 x i1> [[TMP9]], <16 x bfloat> [[TMP7]], <16 x bfloat> zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <16 x i16> [[_MSPROP_SELECT]] to <4 x i64> +; CHECK-NEXT: [[TMP17:%.*]] = bitcast <16 x bfloat> [[TMP15]] to <4 x i64> +; CHECK-NEXT: store <4 x i64> [[TMP16]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x i64> [[TMP17]] +; +entry: + %0 = tail call <16 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.256(<8 x float> %A, <8 x float> %B) #4 + %1 = bitcast i16 %U to <16 x i1> + %2 = select <16 x i1> %1, <16 x bfloat> %0, <16 x bfloat> zeroinitializer + %3 = bitcast <16 x bfloat> %2 to <4 x i64> + ret <4 x i64> %3 +} + +define <4 x i64> @test_mm256_mask_cvtne2ps2bf16_256(<4 x i64> %C, i16 zeroext %U, <8 x float> %A, <8 x float> %B) local_unnamed_addr #1 sanitize_memory { +; CHECK-LABEL: define <4 x i64> @test_mm256_mask_cvtne2ps2bf16_256( +; CHECK-SAME: <4 x i64> [[C:%.*]], i16 zeroext [[U:%.*]], <8 x float> [[A:%.*]], <8 x float> [[B:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: call void 
@llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i32> [[TMP0]] to i256 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[TMP1]] to i256 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]] +; CHECK: [[BB6]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] +; CHECK-NEXT: unreachable +; CHECK: [[BB7]]: +; CHECK-NEXT: [[TMP8:%.*]] = tail call <16 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.256(<8 x float> [[A]], <8 x float> [[B]]) +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <4 x i64> [[TMP2]] to <16 x i16> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <4 x i64> [[C]] to <16 x bfloat> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[U]] to <16 x i1> +; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP12]], <16 x i16> zeroinitializer, <16 x i16> [[TMP9]] +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x bfloat> [[TMP8]] to <16 x i16> +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <16 x bfloat> [[TMP10]] to <16 x i16> +; CHECK-NEXT: [[TMP16:%.*]] = xor <16 x i16> [[TMP14]], [[TMP15]] +; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i16> [[TMP16]], zeroinitializer +; CHECK-NEXT: [[TMP18:%.*]] = or <16 x i16> [[TMP17]], [[TMP9]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP11]], <16 x i16> [[TMP18]], <16 x i16> [[TMP13]] +; CHECK-NEXT: [[TMP19:%.*]] = select <16 x i1> [[TMP12]], <16 x bfloat> [[TMP8]], <16 x bfloat> [[TMP10]] +; CHECK-NEXT: [[TMP20:%.*]] = bitcast <16 x i16> [[_MSPROP_SELECT]] to <4 x i64> +; CHECK-NEXT: [[TMP21:%.*]] = bitcast <16 x bfloat> [[TMP19]] to <4 x i64> +; CHECK-NEXT: store <4 x i64> [[TMP20]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x i64> [[TMP21]] +; +entry: + %0 = tail call <16 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.256(<8 x float> %A, <8 x float> %B) #4 + %1 = bitcast <4 x i64> %C to <16 x bfloat> + %2 = bitcast i16 %U to <16 x i1> + %3 = select <16 x i1> %2, <16 x bfloat> %0, <16 x bfloat> %1 + %4 = bitcast <16 x bfloat> %3 to <4 x i64> + ret <4 x i64> %4 +} + +declare <8 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.256(<8 x float>) #3 + +define <2 x i64> @test_mm256_cvtneps2bf16_256(<8 x float> %A) local_unnamed_addr #2 sanitize_memory { +; CHECK-LABEL: define <2 x i64> @test_mm256_cvtneps2bf16_256( +; CHECK-SAME: <8 x float> [[A:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i32> [[TMP0]] to i256 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB2:.*]], label %[[BB3:.*]], !prof [[PROF1]] +; CHECK: [[BB2]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] +; CHECK-NEXT: unreachable +; CHECK: [[BB3]]: +; CHECK-NEXT: [[TMP4:%.*]] = tail call <8 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.256(<8 x float> [[A]]) +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x bfloat> [[TMP4]] to <2 x i64> +; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <2 x i64> [[TMP5]] +; +entry: + %0 = tail call <8 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.256(<8 x float> %A) #4 + %1 = bitcast <8 x bfloat> %0 to <2 x i64> + ret <2 x i64> %1 +} + +define <2 x i64> @test_mm256_maskz_cvtneps2bf16_256(<8 x float> 
%A, i8 zeroext %U) local_unnamed_addr #2 sanitize_memory { +; CHECK-LABEL: define <2 x i64> @test_mm256_maskz_cvtneps2bf16_256( +; CHECK-SAME: <8 x float> [[A:%.*]], i8 zeroext [[U:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i32> [[TMP0]] to i256 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP2]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]] +; CHECK: [[BB3]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] +; CHECK-NEXT: unreachable +; CHECK: [[BB4]]: +; CHECK-NEXT: [[TMP5:%.*]] = tail call <8 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.256(<8 x float> [[A]]) +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP1]] to <8 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[U]] to <8 x i1> +; CHECK-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i16> zeroinitializer, <8 x i16> zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x bfloat> [[TMP5]] to <8 x i16> +; CHECK-NEXT: [[TMP10:%.*]] = xor <8 x i16> [[TMP9]], zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i16> [[TMP10]], zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i16> [[TMP11]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP6]], <8 x i16> [[TMP12]], <8 x i16> [[TMP8]] +; CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[TMP7]], <8 x bfloat> [[TMP5]], <8 x bfloat> zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <8 x i16> [[_MSPROP_SELECT]] to <2 x i64> +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <8 x bfloat> [[TMP13]] to <2 x i64> +; CHECK-NEXT: store <2 x i64> [[TMP14]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <2 x i64> [[TMP15]] +; +entry: + %0 = tail call <8 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.256(<8 x float> %A) #4 + %1 = bitcast i8 %U to <8 x i1> + %2 = select <8 x i1> %1, <8 x bfloat> %0, <8 x bfloat> zeroinitializer + %3 = bitcast <8 x bfloat> %2 to <2 x i64> + ret <2 x i64> %3 +} + +define <2 x i64> @test_mm256_mask_cvtneps2bf16_256(<2 x i64> %C, i8 zeroext %U, <8 x float> %A) local_unnamed_addr #2 sanitize_memory { +; CHECK-LABEL: define <2 x i64> @test_mm256_mask_cvtneps2bf16_256( +; CHECK-SAME: <2 x i64> [[C:%.*]], i8 zeroext [[U:%.*]], <8 x float> [[A:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i32> [[TMP0]] to i256 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP3]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]] +; CHECK: [[BB4]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] +; CHECK-NEXT: unreachable +; CHECK: [[BB5]]: +; CHECK-NEXT: [[TMP6:%.*]] = tail call <8 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.256(<8 x float> [[A]]) +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP1]] to <8 x i16> +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <2 x i64> [[C]] to <8 x bfloat> +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = 
bitcast i8 [[U]] to <8 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP10]], <8 x i16> zeroinitializer, <8 x i16> [[TMP7]] +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x bfloat> [[TMP6]] to <8 x i16> +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <8 x bfloat> [[TMP8]] to <8 x i16> +; CHECK-NEXT: [[TMP14:%.*]] = xor <8 x i16> [[TMP12]], [[TMP13]] +; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i16> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i16> [[TMP15]], [[TMP7]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP9]], <8 x i16> [[TMP16]], <8 x i16> [[TMP11]] +; CHECK-NEXT: [[TMP17:%.*]] = select <8 x i1> [[TMP10]], <8 x bfloat> [[TMP6]], <8 x bfloat> [[TMP8]] +; CHECK-NEXT: [[TMP18:%.*]] = bitcast <8 x i16> [[_MSPROP_SELECT]] to <2 x i64> +; CHECK-NEXT: [[TMP19:%.*]] = bitcast <8 x bfloat> [[TMP17]] to <2 x i64> +; CHECK-NEXT: store <2 x i64> [[TMP18]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <2 x i64> [[TMP19]] +; +entry: + %0 = tail call <8 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.256(<8 x float> %A) #4 + %1 = bitcast <2 x i64> %C to <8 x bfloat> + %2 = bitcast i8 %U to <8 x i1> + %3 = select <8 x i1> %2, <8 x bfloat> %0, <8 x bfloat> %1 + %4 = bitcast <8 x bfloat> %3 to <2 x i64> + ret <2 x i64> %4 +} + +declare <8 x bfloat> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float>, <8 x bfloat>, <4 x i1>) #3 + +define <2 x i64> @test_mm128_cvtneps2bf16_128(<4 x float> %A) local_unnamed_addr #2 sanitize_memory { +; CHECK-LABEL: define <2 x i64> @test_mm128_cvtneps2bf16_128( +; CHECK-SAME: <4 x float> [[A:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] +; CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x bfloat> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> [[A]], <8 x bfloat> poison, <4 x i1> splat (i1 true)) +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x bfloat> [[TMP1]] to <2 x i64> +; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <2 x i64> [[TMP2]] +; +entry: + %0 = tail call <8 x bfloat> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> %A, <8 x bfloat> poison, <4 x i1> <i1 true, i1 true, i1 true, i1 true>) #4 + %1 = bitcast <8 x bfloat> %0 to <2 x i64> + ret <2 x i64> %1 +} + +define <2 x i64> @test_mm128_maskz_cvtneps2bf16_128(<4 x float> %A, i8 zeroext %U) local_unnamed_addr #2 sanitize_memory { +; CHECK-LABEL: define <2 x i64> @test_mm128_maskz_cvtneps2bf16_128( +; CHECK-SAME: <4 x float> [[A:%.*]], i8 zeroext [[U:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[TMP0]] to <8 x i1> +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8 [[U]] to <8 x i1> +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> splat (i1 true), <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i1> [[TMP3]], <8 x i1> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i1> [[_MSPROP]] to i4 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i4 
[[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]] +; CHECK: [[BB7]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] +; CHECK-NEXT: unreachable +; CHECK: [[BB8]]: +; CHECK-NEXT: [[TMP9:%.*]] = tail call <8 x bfloat> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> [[A]], <8 x bfloat> zeroinitializer, <4 x i1> [[TMP4]]) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x bfloat> [[TMP9]] to <2 x i64> +; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <2 x i64> [[TMP10]] +; +entry: + %0 = bitcast i8 %U to <8 x i1> + %1 = shufflevector <8 x i1> %0, <8 x i1> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %2 = tail call <8 x bfloat> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> %A, <8 x bfloat> zeroinitializer, <4 x i1> %1) #4 + %3 = bitcast <8 x bfloat> %2 to <2 x i64> + ret <2 x i64> %3 +} + +define <2 x i64> @test_mm128_mask_cvtneps2bf16_128(<2 x i64> %C, i8 zeroext %U, <4 x float> %A) local_unnamed_addr #2 sanitize_memory { +; CHECK-LABEL: define <2 x i64> @test_mm128_mask_cvtneps2bf16_128( +; CHECK-SAME: <2 x i64> [[C:%.*]], i8 zeroext [[U:%.*]], <4 x float> [[A:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8 [[TMP0]] to <8 x i1> +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[U]] to <8 x i1> +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP3]], <8 x i1> splat (i1 true), <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP1]] to <8 x i16> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i64> [[C]] to <8 x bfloat> +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP8]], 0 +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP6]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP9]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <4 x i1> [[_MSPROP]] to i4 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i4 [[TMP10]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB11:.*]], label %[[BB12:.*]], !prof [[PROF1]] +; CHECK: [[BB11]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] +; CHECK-NEXT: unreachable +; CHECK: [[BB12]]: +; CHECK-NEXT: [[TMP13:%.*]] = tail call <8 x bfloat> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> [[A]], <8 x bfloat> [[TMP7]], <4 x i1> [[TMP5]]) +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <8 x bfloat> [[TMP13]] to <2 x i64> +; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <2 x i64> [[TMP14]] +; +entry: + %0 = bitcast i8 %U to <8 x i1> + %1 = shufflevector <8 x i1> %0, <8 x i1> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %2 = bitcast <2 x i64> %C to <8 x bfloat> + %3 = tail call <8 x bfloat> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> %A, <8 x bfloat> %2, <4 x i1> %1) #4 + %4 = bitcast <8 x 
bfloat> %3 to <2 x i64> + ret <2 x i64> %4 +} + +define <2 x i64> @test_mm128_cvtneps2bf16_128_select(<2 x i64> %C, i8 zeroext %U, <4 x float> %A) local_unnamed_addr #2 sanitize_memory { +; CHECK-LABEL: define <2 x i64> @test_mm128_cvtneps2bf16_128_select( +; CHECK-SAME: <2 x i64> [[C:%.*]], i8 zeroext [[U:%.*]], <4 x float> [[A:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8 [[TMP0]] to <8 x i1> +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[U]] to <8 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP1]] to <8 x i16> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i64> [[C]] to <8 x bfloat> +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] +; CHECK-NEXT: [[TMP7:%.*]] = tail call <8 x bfloat> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> [[A]], <8 x bfloat> poison, <4 x i1> splat (i1 true)) +; CHECK-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP4]], <8 x i16> zeroinitializer, <8 x i16> [[TMP5]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x bfloat> [[TMP7]] to <8 x i16> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x bfloat> [[TMP6]] to <8 x i16> +; CHECK-NEXT: [[TMP11:%.*]] = xor <8 x i16> [[TMP9]], [[TMP10]] +; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i16> [[TMP11]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = or <8 x i16> [[TMP12]], [[TMP5]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP3]], <8 x i16> [[TMP13]], <8 x i16> [[TMP8]] +; CHECK-NEXT: [[TMP14:%.*]] = select <8 x i1> [[TMP4]], <8 x bfloat> [[TMP7]], <8 x bfloat> [[TMP6]] +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <8 x i16> [[_MSPROP_SELECT]] to <2 x i64> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <8 x bfloat> [[TMP14]] to <2 x i64> +; CHECK-NEXT: store <2 x i64> [[TMP15]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <2 x i64> [[TMP16]] +; +entry: + %0 = bitcast i8 %U to <8 x i1> + %1 = bitcast <2 x i64> %C to <8 x bfloat> + %2 = tail call <8 x bfloat> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> %A, <8 x bfloat> poison, <4 x i1> <i1 true, i1 true, i1 true, i1 true>) #4 + %3 = select <8 x i1> %0, <8 x bfloat> %2, <8 x bfloat> %1 + %4 = bitcast <8 x bfloat> %3 to <2 x i64> + ret <2 x i64> %4 +} + +declare <8 x float> @llvm.x86.avx512bf16.dpbf16ps.256(<8 x float>, <16 x bfloat>, <16 x bfloat>) #3 + +define <8 x float> @test_mm256_dpbf16ps_256(<8 x float> %E, <16 x bfloat> %A, <16 x bfloat> %B) local_unnamed_addr #2 sanitize_memory { +; CHECK-LABEL: define <8 x float> @test_mm256_dpbf16ps_256( +; CHECK-SAME: <8 x float> [[E:%.*]], <16 x bfloat> [[A:%.*]], <16 x bfloat> [[B:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i32> [[TMP0]] to i256 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i16> [[TMP1]] to i256 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP4]], 0 
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i16> [[TMP2]] to i256 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i256 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]] +; CHECK: [[BB6]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] +; CHECK-NEXT: unreachable +; CHECK: [[BB7]]: +; CHECK-NEXT: [[TMP8:%.*]] = tail call <8 x float> @llvm.x86.avx512bf16.dpbf16ps.256(<8 x float> [[E]], <16 x bfloat> [[A]], <16 x bfloat> [[B]]) +; CHECK-NEXT: store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x float> [[TMP8]] +; +entry: + %0 = tail call <8 x float> @llvm.x86.avx512bf16.dpbf16ps.256(<8 x float> %E, <16 x bfloat> %A, <16 x bfloat> %B) #4 + ret <8 x float> %0 +} + +define <8 x float> @test_mm256_maskz_dpbf16ps_256(<8 x float> %E, <16 x bfloat> %A, <16 x bfloat> %B, i8 zeroext %U) local_unnamed_addr #2 sanitize_memory { +; CHECK-LABEL: define <8 x float> @test_mm256_maskz_dpbf16ps_256( +; CHECK-SAME: <8 x float> [[E:%.*]], <16 x bfloat> [[A:%.*]], <16 x bfloat> [[B:%.*]], i8 zeroext [[U:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 96), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i32> [[TMP0]] to i256 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i16> [[TMP1]] to i256 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i16> [[TMP2]] to i256 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i256 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]] +; CHECK: [[BB7]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] +; CHECK-NEXT: unreachable +; CHECK: [[BB8]]: +; CHECK-NEXT: [[TMP9:%.*]] = tail call <8 x float> @llvm.x86.avx512bf16.dpbf16ps.256(<8 x float> [[E]], <16 x bfloat> [[A]], <16 x bfloat> [[B]]) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[U]] to <8 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP11]], <8 x i32> zeroinitializer, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <8 x float> [[TMP9]] to <8 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = xor <8 x i32> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i32> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP10]], <8 x i32> [[TMP16]], <8 x i32> [[TMP12]] +; CHECK-NEXT: [[TMP17:%.*]] = select <8 x i1> [[TMP11]], <8 x float> [[TMP9]], <8 x float> zeroinitializer +; CHECK-NEXT: store <8 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x float> [[TMP17]] +; +entry: + %0 = tail call <8 x float> @llvm.x86.avx512bf16.dpbf16ps.256(<8 x float> %E, <16 x bfloat> %A, <16 x 
bfloat> %B) #4 + %1 = bitcast i8 %U to <8 x i1> + %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> zeroinitializer + ret <8 x float> %2 +} +define <8 x float> @test_mm256_mask_dpbf16ps_256(i8 zeroext %U, <8 x float> %E, <16 x bfloat> %A, <16 x bfloat> %B) local_unnamed_addr #2 sanitize_memory { +; CHECK-LABEL: define <8 x float> @test_mm256_mask_dpbf16ps_256( +; CHECK-SAME: i8 zeroext [[U:%.*]], <8 x float> [[E:%.*]], <16 x bfloat> [[A:%.*]], <16 x bfloat> [[B:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i32> [[TMP0]] to i256 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i16> [[TMP1]] to i256 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i16> [[TMP2]] to i256 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i256 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]] +; CHECK: [[BB7]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] +; CHECK-NEXT: unreachable +; CHECK: [[BB8]]: +; CHECK-NEXT: [[TMP9:%.*]] = tail call <8 x float> @llvm.x86.avx512bf16.dpbf16ps.256(<8 x float> [[E]], <16 x bfloat> [[A]], <16 x bfloat> [[B]]) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[U]] to <8 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP11]], <8 x i32> zeroinitializer, <8 x i32> [[TMP0]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <8 x float> [[TMP9]] to <8 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <8 x float> [[E]] to <8 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = xor <8 x i32> [[TMP13]], [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i32> [[TMP16]], [[TMP0]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP10]], <8 x i32> [[TMP17]], <8 x i32> [[TMP12]] +; CHECK-NEXT: [[TMP18:%.*]] = select <8 x i1> [[TMP11]], <8 x float> [[TMP9]], <8 x float> [[E]] +; CHECK-NEXT: store <8 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x float> [[TMP18]] +; +entry: + %0 = tail call <8 x float> @llvm.x86.avx512bf16.dpbf16ps.256(<8 x float> %E, <16 x bfloat> %A, <16 x bfloat> %B) #4 + %1 = bitcast i8 %U to <8 x i1> + %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> %E + ret <8 x float> %2 +} + +declare <4 x float> @llvm.x86.avx512bf16.dpbf16ps.128(<4 x float>, <8 x bfloat>, <8 x bfloat>) #3 + +define <4 x float> @test_mm128_dpbf16ps_128(<4 x float> %E, <8 x bfloat> %A, <8 x bfloat> %B) local_unnamed_addr #2 sanitize_memory { +; CHECK-LABEL: define <4 x float> @test_mm128_dpbf16ps_128( +; CHECK-SAME: <4 x float> [[E:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[B:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr 
getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP0]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]] +; CHECK: [[BB6]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] +; CHECK-NEXT: unreachable +; CHECK: [[BB7]]: +; CHECK-NEXT: [[TMP8:%.*]] = tail call <4 x float> @llvm.x86.avx512bf16.dpbf16ps.128(<4 x float> [[E]], <8 x bfloat> [[A]], <8 x bfloat> [[B]]) +; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x float> [[TMP8]] +; +entry: + %0 = tail call <4 x float> @llvm.x86.avx512bf16.dpbf16ps.128(<4 x float> %E, <8 x bfloat> %A, <8 x bfloat> %B) #4 + ret <4 x float> %0 +} + +define <4 x float> @test_mm128_maskz_dpbf16ps_128(<4 x float> %E, <8 x bfloat> %A, <8 x bfloat> %B, i4 zeroext %U) local_unnamed_addr #2 sanitize_memory { +; CHECK-LABEL: define <4 x float> @test_mm128_maskz_dpbf16ps_128( +; CHECK-SAME: <4 x float> [[E:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[B:%.*]], i4 zeroext [[U:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i4, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP0]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]] +; CHECK: [[BB7]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] +; CHECK-NEXT: unreachable +; CHECK: [[BB8]]: +; CHECK-NEXT: [[TMP9:%.*]] = tail call <4 x float> @llvm.x86.avx512bf16.dpbf16ps.128(<4 x float> [[E]], <8 x bfloat> [[A]], <8 x bfloat> [[B]]) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i4 [[TMP3]] to <4 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i4 [[U]] to <4 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <4 x i1> [[TMP11]], <4 x i32> zeroinitializer, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <4 x float> [[TMP9]] to <4 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = xor <4 x i32> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = or <4 x i32> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = or <4 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: 
[[_MSPROP_SELECT:%.*]] = select <4 x i1> [[TMP10]], <4 x i32> [[TMP16]], <4 x i32> [[TMP12]] +; CHECK-NEXT: [[TMP17:%.*]] = select <4 x i1> [[TMP11]], <4 x float> [[TMP9]], <4 x float> zeroinitializer +; CHECK-NEXT: store <4 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x float> [[TMP17]] +; +entry: + %0 = tail call <4 x float> @llvm.x86.avx512bf16.dpbf16ps.128(<4 x float> %E, <8 x bfloat> %A, <8 x bfloat> %B) #4 + %1 = bitcast i4 %U to <4 x i1> + %2 = select <4 x i1> %1, <4 x float> %0, <4 x float> zeroinitializer + ret <4 x float> %2 +} +define <4 x float> @test_mm128_mask_dpbf16ps_128(i4 zeroext %U, <4 x float> %E, <8 x bfloat> %A, <8 x bfloat> %B) local_unnamed_addr #2 sanitize_memory { +; CHECK-LABEL: define <4 x float> @test_mm128_mask_dpbf16ps_128( +; CHECK-SAME: i4 zeroext [[U:%.*]], <4 x float> [[E:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[B:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i4, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP0]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]] +; CHECK: [[BB7]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] +; CHECK-NEXT: unreachable +; CHECK: [[BB8]]: +; CHECK-NEXT: [[TMP9:%.*]] = tail call <4 x float> @llvm.x86.avx512bf16.dpbf16ps.128(<4 x float> [[E]], <8 x bfloat> [[A]], <8 x bfloat> [[B]]) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i4 [[TMP3]] to <4 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i4 [[U]] to <4 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <4 x i1> [[TMP11]], <4 x i32> zeroinitializer, <4 x i32> [[TMP0]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <4 x float> [[TMP9]] to <4 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <4 x float> [[E]] to <4 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = xor <4 x i32> [[TMP13]], [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = or <4 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = or <4 x i32> [[TMP16]], [[TMP0]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[TMP10]], <4 x i32> [[TMP17]], <4 x i32> [[TMP12]] +; CHECK-NEXT: [[TMP18:%.*]] = select <4 x i1> [[TMP11]], <4 x float> [[TMP9]], <4 x float> [[E]] +; CHECK-NEXT: store <4 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x float> [[TMP18]] +; +entry: + %0 = tail call <4 x float> @llvm.x86.avx512bf16.dpbf16ps.128(<4 x float> %E, <8 x bfloat> %A, <8 x bfloat> %B) #4 + %1 = bitcast i4 %U to <4 x i1> + %2 = select <4 x i1> %1, <4 x float> %0, <4 x float> %E + ret <4 x float> %2 +} + +define <16 x i16> @test_no_vbroadcast1() sanitize_memory { +; CHECK-LABEL: define <16 x i16> @test_no_vbroadcast1( +; CHECK-SAME: ) #[[ATTR1]] { +; 
CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] +; CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x bfloat> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> poison, <8 x bfloat> zeroinitializer, <4 x i1> splat (i1 true)) +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[TMP0]] to <8 x i16> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <16 x i32> zeroinitializer +; CHECK-NEXT: store <16 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i16> [[TMP2]] +; +entry: + %0 = tail call <8 x bfloat> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> poison, <8 x bfloat> zeroinitializer, <4 x i1> <i1 true, i1 true, i1 true, i1 true>) + %1 = bitcast <8 x bfloat> %0 to <8 x i16> + %2 = shufflevector <8 x i16> %1, <8 x i16> poison, <16 x i32> zeroinitializer + ret <16 x i16> %2 +} + +define <16 x bfloat> @test_no_vbroadcast2() nounwind sanitize_memory { +; CHECK-LABEL: define <16 x bfloat> @test_no_vbroadcast2( +; CHECK-SAME: ) #[[ATTR2:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] +; CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x bfloat> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> poison, <8 x bfloat> zeroinitializer, <4 x i1> splat (i1 true)) +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x bfloat> [[TMP0]], <8 x bfloat> poison, <16 x i32> zeroinitializer +; CHECK-NEXT: store <16 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x bfloat> [[TMP1]] +; +entry: + %0 = tail call <8 x bfloat> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> poison, <8 x bfloat> zeroinitializer, <4 x i1> <i1 true, i1 true, i1 true, i1 true>) + %1 = shufflevector <8 x bfloat> %0, <8 x bfloat> poison, <16 x i32> zeroinitializer + ret <16 x bfloat> %1 +} + +define <16 x i32> @pr83358() sanitize_memory { +; CHECK-LABEL: define <16 x i32> @pr83358( +; CHECK-SAME: ) #[[ATTR1]] { +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP1:%.*]] = call <8 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.256(<8 x float> <float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00, float 8.000000e+00>) +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x bfloat> [[TMP1]] to <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3> +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP3]] +; + %1 = call <8 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.256(<8 x float> <float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00, float 8.000000e+00>) + %2 = bitcast <8 x bfloat> %1 to <4 x i32> + %3 = shufflevector <4 x i32> %2, <4 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3> + ret <16 x i32> %3 +} +;. +; CHECK: [[PROF1]] = !{!"branch_weights", i32 1, i32 1048575} +;. 
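The FileCheck patterns above all exercise the same MemorySanitizer recipe: parameter shadow is loaded from `__msan_param_tls`, fully-checked operands (here the float inputs) feed an `_MSCMP`/`_MSOR` chain that branches to `__msan_warning_noreturn`, shadow is propagated through the mask selects as `_MSPROP_SELECT`, and the result shadow is stored to `__msan_retval_tls`. As a minimal sketch (not part of the patch), this is the C source shape the masked-convert tests correspond to, assuming the standard AVX512-BF16 intrinsics from `<immintrin.h>`; the build line is illustrative, not taken from the test's RUN line:

    /* Sketch only: a masked convert that lowers to
     * llvm.x86.avx512bf16.cvtneps2bf16.256 followed by a mask select.
     * Build (illustrative): clang -O2 -mavx512bf16 -mavx512vl -fsanitize=memory */
    #include <immintrin.h>

    __m128bh mask_cvtneps_pbh(__m128bh C, __mmask8 U, __m256 A) {
      /* MSan checks A's shadow before the convert, then merges the shadows
       * of the convert result and of C through the mask select on U. */
      return _mm256_mask_cvtneps_pbh(C, U, A);
    }
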
From 19a9de06cd557da2289c07db0228ec8987d26dc4 Mon Sep 17 00:00:00 2001
From: yicuixi <63290259+yicuixi@users.noreply.github.com>
Date: Wed, 5 Nov 2025 14:43:40 +0800
Subject: [PATCH 281/313] [clang] Accept empty enum in MSVC compatible C
 (#159981)

Fixes https://github.com/llvm/llvm-project/issues/114402. This patch accepts
empty enums in C as a Microsoft extension and introduces a new warning
`-Wmicrosoft-empty-enum`.

---------

Signed-off-by: yicuixi <qin_17914@126.com>
Co-authored-by: Erich Keane <ekeane@nvidia.com>
Co-authored-by: Aaron Ballman <aaron@aaronballman.com>
---
 clang/docs/LanguageExtensions.rst | 10 ++++++++++
 clang/docs/ReleaseNotes.rst | 1 +
 clang/include/clang/Basic/DiagnosticGroups.td | 4 +++-
 clang/include/clang/Basic/DiagnosticParseKinds.td | 3 +++
 clang/lib/Parse/ParseDecl.cpp | 9 +++++++--
 clang/test/CodeGen/ms-empty-enum.c | 7 +++++++
 clang/test/Parser/ms-empty-enum.c | 6 ++++++
 7 files changed, 37 insertions(+), 3 deletions(-)
 create mode 100644 clang/test/CodeGen/ms-empty-enum.c
 create mode 100644 clang/test/Parser/ms-empty-enum.c

diff --git a/clang/docs/LanguageExtensions.rst b/clang/docs/LanguageExtensions.rst
index baa0bbb5ea631..b7c18c2b5c6cf 100644
--- a/clang/docs/LanguageExtensions.rst
+++ b/clang/docs/LanguageExtensions.rst
@@ -2409,6 +2409,16 @@ those modes.
 Use ``__has_feature(c_fixed_enum)`` to determine whether support for fixed
 underlying types is available in C23 and later.

+Enumerations with no enumerators
+--------------------------------
+
+Clang supports the Microsoft extension that allows enumerations with no enumerators.
+
+.. code-block:: c++
+
+  typedef enum empty { } A;
+
+
 Interoperability with C++11 lambdas
 -----------------------------------

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 3f57ddc92d5e8..32f669f8d70d8 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -444,6 +444,7 @@ Bug Fixes in This Version
 - Fixed a failed assertion with empty filename arguments in ``__has_embed``. (#GH159898)
 - Fixed a failed assertion with empty filename in ``#embed`` directive. (#GH162951)
 - Fixed a crash triggered by unterminated ``__has_embed``. (#GH162953)
+- Accept empty enumerations in MSVC-compatible C mode.
(#GH114402) Bug Fixes to Compiler Builtins ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/clang/include/clang/Basic/DiagnosticGroups.td b/clang/include/clang/Basic/DiagnosticGroups.td index 8aa3489a2a62b..1e0321de3f4b6 100644 --- a/clang/include/clang/Basic/DiagnosticGroups.td +++ b/clang/include/clang/Basic/DiagnosticGroups.td @@ -1436,6 +1436,7 @@ def MicrosoftDrectveSection : DiagGroup<"microsoft-drectve-section">; def MicrosoftInclude : DiagGroup<"microsoft-include">; def MicrosoftCppMacro : DiagGroup<"microsoft-cpp-macro">; def MicrosoftFixedEnum : DiagGroup<"microsoft-fixed-enum">; +def MicrosoftEmptyEnum : DiagGroup<"microsoft-empty-enum">; def MicrosoftSealed : DiagGroup<"microsoft-sealed">; def MicrosoftAbstract : DiagGroup<"microsoft-abstract">; def MicrosoftUnqualifiedFriend : DiagGroup<"microsoft-unqualified-friend">; @@ -1489,7 +1490,8 @@ def Microsoft : DiagGroup<"microsoft", MicrosoftConstInit, MicrosoftVoidPseudoDtor, MicrosoftAnonTag, MicrosoftCommentPaste, MicrosoftEndOfFile, MicrosoftInitFromPredefined, MicrosoftStringLiteralFromPredefined, - MicrosoftInconsistentDllImport, MicrosoftInlineOnNonFunction]>; + MicrosoftInconsistentDllImport, MicrosoftInlineOnNonFunction, + MicrosoftEmptyEnum]>; def ClangClPch : DiagGroup<"clang-cl-pch">; diff --git a/clang/include/clang/Basic/DiagnosticParseKinds.td b/clang/include/clang/Basic/DiagnosticParseKinds.td index e5e071f43fa75..aa0ccb0c05101 100644 --- a/clang/include/clang/Basic/DiagnosticParseKinds.td +++ b/clang/include/clang/Basic/DiagnosticParseKinds.td @@ -116,6 +116,9 @@ def err_enumerator_unnamed_no_def : Error< def ext_ms_c_enum_fixed_underlying_type : Extension< "enumeration types with a fixed underlying type are a Microsoft extension">, InGroup<MicrosoftFixedEnum>; +def ext_ms_c_empty_enum_type : Extension< + "empty enumeration types are a Microsoft extension">, + InGroup<MicrosoftEmptyEnum>; def ext_c23_enum_fixed_underlying_type : Extension< "enumeration types with a fixed underlying type are a C23 extension">, InGroup<C23>; diff --git a/clang/lib/Parse/ParseDecl.cpp b/clang/lib/Parse/ParseDecl.cpp index 7e4a164e34eda..5fcb659768655 100644 --- a/clang/lib/Parse/ParseDecl.cpp +++ b/clang/lib/Parse/ParseDecl.cpp @@ -5370,8 +5370,13 @@ void Parser::ParseEnumBody(SourceLocation StartLoc, Decl *EnumDecl, T.consumeOpen(); // C does not allow an empty enumerator-list, C++ does [dcl.enum]. 
- if (Tok.is(tok::r_brace) && !getLangOpts().CPlusPlus) - Diag(Tok, diag::err_empty_enum); + if (Tok.is(tok::r_brace) && !getLangOpts().CPlusPlus) { + if (getLangOpts().MicrosoftExt) + Diag(T.getOpenLocation(), diag::ext_ms_c_empty_enum_type) + << SourceRange(T.getOpenLocation(), Tok.getLocation()); + else + Diag(Tok, diag::err_empty_enum); + } SmallVector<Decl *, 32> EnumConstantDecls; SmallVector<SuppressAccessChecks, 32> EnumAvailabilityDiags; diff --git a/clang/test/CodeGen/ms-empty-enum.c b/clang/test/CodeGen/ms-empty-enum.c new file mode 100644 index 0000000000000..6c1c87b756f9a --- /dev/null +++ b/clang/test/CodeGen/ms-empty-enum.c @@ -0,0 +1,7 @@ +// RUN: %clang_cc1 -fms-extensions -triple x86_64-windows-msvc -Wno-implicit-function-declaration -emit-llvm %s -o - | FileCheck %s +// RUN: %clang_cc1 -fms-extensions -triple i386-windows-msvc -Wno-implicit-function-declaration -emit-llvm %s -o - | FileCheck %s + +typedef enum tag1 {} A; + +// CHECK: void @empty_enum(i32 noundef %a) +void empty_enum(A a) {} diff --git a/clang/test/Parser/ms-empty-enum.c b/clang/test/Parser/ms-empty-enum.c new file mode 100644 index 0000000000000..790547af88bab --- /dev/null +++ b/clang/test/Parser/ms-empty-enum.c @@ -0,0 +1,6 @@ +// RUN: %clang_cc1 %s -fsyntax-only -Wmicrosoft -verify -fms-extensions + +typedef enum tag1 { } A; // expected-warning {{empty enumeration types are a Microsoft extension}} +typedef enum tag2 { } B; // expected-warning {{empty enumeration types are a Microsoft extension}} +typedef enum : unsigned { } C; // expected-warning {{enumeration types with a fixed underlying type are a Microsoft extension}}\ + // expected-warning {{empty enumeration types are a Microsoft extension}} From 304d2ff4d998abac779b11afeaede2146c2c60a7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicolai=20H=C3=A4hnle?= <nicolai.haehnle@amd.com> Date: Tue, 4 Nov 2025 22:56:19 -0800 Subject: [PATCH 282/313] CodeGen: Record MMOs in finalizeBundle (#166210) This allows more accurate alias analysis to apply at the bundle level. This has a bunch of minor effects in post-RA scheduling that look mostly beneficial to me, all of them in AMDGPU (the Thumb2 change is cosmetic). The pre-existing (and unchanged) test in CodeGen/MIR/AMDGPU/custom-pseudo-source-values.ll tests that MIR with a bundle with MMOs can be parsed successfully. 
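To sketch the merge semantics first (with stand-in types, not LLVM's real MachineInstr/MachineMemOperand API): the BUNDLE header instruction ends up carrying the union of the memory operands of the instructions it bundles, so an alias-analysis query against the bundle sees every location the bundle may touch. The actual change is the `MemMIs`/`cloneMergedMemRefs` hunk in MachineInstrBundle.cpp below; this toy model is hypothetical and only illustrates the idea:

    /* Toy model of the finalizeBundle MMO merge; the struct is a stand-in. */
    #include <stdio.h>

    struct Instr {
      int MayLoadOrStore;  /* stands in for MachineInstr::mayLoadOrStore() */
      const char *MemRef;  /* at most one memory operand, for simplicity */
    };

    int main(void) {
      /* A bundle of three instructions; the middle one touches no memory. */
      struct Instr Bundle[] = {
          {1, "load (s32) from %stack.0"},
          {0, 0},
          {1, "store (s32) into @gv"},
      };
      /* Collect the memory-touching members and attach the union of their
       * memory references to the bundle header, as finalizeBundle now does. */
      printf("BUNDLE ::");
      for (unsigned I = 0; I < sizeof(Bundle) / sizeof(Bundle[0]); ++I)
        if (Bundle[I].MayLoadOrStore)
          printf(" (%s)", Bundle[I].MemRef);
      printf("\n");
      return 0;
    }
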
v2: - use cloneMergedMemRefs - add another test to explicitly check the MMO bundling behavior v3: - use poison instead of undef to initialize the global variable in the test --- llvm/lib/CodeGen/MIRParser/MIParser.cpp | 2 + llvm/lib/CodeGen/MachineInstrBundle.cpp | 6 + .../GlobalISel/insertelement-stack-lower.ll | 2 +- .../AMDGPU/GlobalISel/store-local.128.ll | 20 +- .../AMDGPU/GlobalISel/vni8-across-blocks.ll | 7 +- .../CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll | 9954 ++++++++--------- .../CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll | 363 +- .../CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll | 1503 ++- .../CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll | 295 +- .../CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll | 357 +- .../CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll | 370 +- .../CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll | 534 +- .../CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll | 839 +- .../CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll | 756 +- .../CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll | 1361 ++- llvm/test/CodeGen/AMDGPU/bf16.ll | 28 +- .../AMDGPU/buffer-fat-pointers-memcpy.ll | 49 +- .../CodeGen/AMDGPU/call-argument-types.ll | 12 +- llvm/test/CodeGen/AMDGPU/ds_write2.ll | 4 +- llvm/test/CodeGen/AMDGPU/ds_write2_a_v.ll | 4 +- llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll | 63 +- llvm/test/CodeGen/AMDGPU/finalizebundle.mir | 52 + .../AMDGPU/gfx-callable-return-types.ll | 153 +- llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll | 4 +- .../CodeGen/AMDGPU/hard-clauses-img-gfx11.mir | 4 +- .../CodeGen/AMDGPU/hard-clauses-img-gfx12.mir | 4 +- ...llvm.amdgcn.ds.gws.barrier-fastregalloc.ll | 21 +- .../AMDGPU/llvm.amdgcn.ds.gws.barrier.ll | 2 +- .../AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll | 4 +- .../AMDGPU/llvm.amdgcn.fdot2.f16.f16.ll | 8 +- .../AMDGPU/llvm.amdgcn.smfmac.gfx950.ll | 156 +- llvm/test/CodeGen/AMDGPU/load-constant-i1.ll | 18 +- llvm/test/CodeGen/AMDGPU/load-global-i16.ll | 6 +- llvm/test/CodeGen/AMDGPU/load-global-i32.ll | 10 +- llvm/test/CodeGen/AMDGPU/load-global-i8.ll | 15 +- llvm/test/CodeGen/AMDGPU/load-local-i16.ll | 214 +- ...er-buffer-fat-pointers-lastuse-metadata.ll | 31 +- ...uffer-fat-pointers-nontemporal-metadata.ll | 58 +- .../AMDGPU/lower-lds-struct-aa-memcpy.ll | 4 +- .../CodeGen/AMDGPU/lower-lds-struct-aa.ll | 8 +- llvm/test/CodeGen/AMDGPU/max.ll | 2 +- .../CodeGen/AMDGPU/memintrinsic-unroll.ll | 2085 ++-- llvm/test/CodeGen/AMDGPU/mixed-vmem-types.ll | 6 +- llvm/test/CodeGen/AMDGPU/packed-fp32.ll | 203 +- .../CodeGen/AMDGPU/postra-bundle-memops.mir | 5 +- .../postra-bundle-vimage-vsample-gfx12.mir | 4 +- .../AMDGPU/promote-constOffset-to-imm.ll | 41 +- llvm/test/CodeGen/AMDGPU/scratch-simple.ll | 2370 ++-- .../soft-clause-exceeds-register-budget.ll | 15 +- llvm/test/CodeGen/AMDGPU/spill-agpr.ll | 6 +- .../CodeGen/AMDGPU/spill-scavenge-offset.ll | 22 +- llvm/test/CodeGen/AMDGPU/stack-realign.ll | 2 +- .../Thumb2/mve-vpt-block-fold-vcmp.mir | 45 +- 53 files changed, 10762 insertions(+), 11345 deletions(-) diff --git a/llvm/lib/CodeGen/MIRParser/MIParser.cpp b/llvm/lib/CodeGen/MIRParser/MIParser.cpp index 4795d81e3f348..434a579c3be3f 100644 --- a/llvm/lib/CodeGen/MIRParser/MIParser.cpp +++ b/llvm/lib/CodeGen/MIRParser/MIParser.cpp @@ -1161,6 +1161,8 @@ bool MIParser::parse(MachineInstr *&MI) { MemOperands.push_back(MemOp); if (Token.isNewlineOrEOF()) break; + if (OpCode == TargetOpcode::BUNDLE && Token.is(MIToken::lbrace)) + break; if (Token.isNot(MIToken::comma)) return error("expected ',' before the next machine memory operand"); lex(); diff --git a/llvm/lib/CodeGen/MachineInstrBundle.cpp 
b/llvm/lib/CodeGen/MachineInstrBundle.cpp index 88d81993fbe55..a8dc614288f20 100644 --- a/llvm/lib/CodeGen/MachineInstrBundle.cpp +++ b/llvm/lib/CodeGen/MachineInstrBundle.cpp @@ -137,6 +137,7 @@ void llvm::finalizeBundle(MachineBasicBlock &MBB, SmallSet<Register, 8> KilledUseSet; SmallSet<Register, 8> UndefUseSet; SmallVector<std::pair<Register, Register>> TiedOperands; + SmallVector<MachineInstr *> MemMIs; for (auto MII = FirstMI; MII != LastMI; ++MII) { // Debug instructions have no effects to track. if (MII->isDebugInstr()) @@ -200,6 +201,9 @@ void llvm::finalizeBundle(MachineBasicBlock &MBB, MIB.setMIFlag(MachineInstr::FrameSetup); if (MII->getFlag(MachineInstr::FrameDestroy)) MIB.setMIFlag(MachineInstr::FrameDestroy); + + if (MII->mayLoadOrStore()) + MemMIs.push_back(&*MII); } for (Register Reg : LocalDefs) { @@ -225,6 +229,8 @@ void llvm::finalizeBundle(MachineBasicBlock &MBB, assert(UseIdx < ExternUses.size()); MIB->tieOperands(DefIdx, LocalDefs.size() + UseIdx); } + + MIB->cloneMergedMemRefs(MF, MemMIs); } /// finalizeBundle - Same functionality as the previous finalizeBundle except diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll index c2129c20e4543..6076a2eec44bc 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll @@ -33,7 +33,6 @@ define amdgpu_kernel void @v_insert_v64i32_varidx(ptr addrspace(1) %out.ptr, ptr ; GCN-NEXT: v_mov_b32_e32 v13, s49 ; GCN-NEXT: v_mov_b32_e32 v14, s50 ; GCN-NEXT: v_mov_b32_e32 v15, s51 -; GCN-NEXT: s_load_dwordx16 s[36:51], s[22:23], 0xc0 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:4 ; GCN-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:8 @@ -51,6 +50,7 @@ define amdgpu_kernel void @v_insert_v64i32_varidx(ptr addrspace(1) %out.ptr, ptr ; GCN-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:56 ; GCN-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:60 ; GCN-NEXT: v_mov_b32_e32 v0, s52 +; GCN-NEXT: s_load_dwordx16 s[36:51], s[22:23], 0xc0 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:64 ; GCN-NEXT: v_mov_b32_e32 v0, s53 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:68 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll index 1812e17800e71..10e83b70a57d4 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll @@ -189,15 +189,11 @@ define amdgpu_kernel void @store_lds_v4i32_align1(ptr addrspace(3) %out, <4 x i3 ; GFX10-NEXT: v_mov_b32_e32 v2, s1 ; GFX10-NEXT: s_lshr_b32 s6, s1, 16 ; GFX10-NEXT: v_mov_b32_e32 v4, s4 -; GFX10-NEXT: s_lshr_b32 s1, s1, 24 ; GFX10-NEXT: s_lshr_b32 s8, s2, 16 -; GFX10-NEXT: s_and_b32 s9, 0xffff, s2 ; GFX10-NEXT: s_lshr_b32 s5, s5, 8 ; GFX10-NEXT: v_mov_b32_e32 v5, s0 ; GFX10-NEXT: s_lshr_b32 s0, s7, 8 ; GFX10-NEXT: v_mov_b32_e32 v6, s6 -; GFX10-NEXT: v_mov_b32_e32 v7, s1 -; GFX10-NEXT: s_lshr_b32 s1, s9, 8 ; GFX10-NEXT: v_mov_b32_e32 v8, s5 ; GFX10-NEXT: v_mov_b32_e32 v9, s0 ; GFX10-NEXT: ds_write_b8 v1, v0 @@ -208,18 +204,22 @@ define amdgpu_kernel void @store_lds_v4i32_align1(ptr addrspace(3) %out, <4 x i3 ; GFX10-NEXT: ds_write_b8 v1, v8 offset:1 ; GFX10-NEXT: ds_write_b8 v1, v9 offset:5 ; GFX10-NEXT: v_mov_b32_e32 v0, s8 -; GFX10-NEXT: v_mov_b32_e32 v3, s2 -; GFX10-NEXT: v_mov_b32_e32 v10, s1 +; 
GFX10-NEXT: s_lshr_b32 s1, s1, 24 +; GFX10-NEXT: s_and_b32 s9, 0xffff, s2 ; GFX10-NEXT: s_lshr_b32 s0, s2, 24 -; GFX10-NEXT: ds_write_b8 v1, v7 offset:7 -; GFX10-NEXT: ds_write_b8 v1, v3 offset:8 -; GFX10-NEXT: ds_write_b8 v1, v10 offset:9 +; GFX10-NEXT: v_mov_b32_e32 v7, s1 +; GFX10-NEXT: s_lshr_b32 s1, s9, 8 +; GFX10-NEXT: v_mov_b32_e32 v3, s2 ; GFX10-NEXT: ds_write_b8 v1, v0 offset:10 ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: s_and_b32 s0, 0xffff, s3 -; GFX10-NEXT: s_lshr_b32 s1, s3, 16 +; GFX10-NEXT: v_mov_b32_e32 v10, s1 ; GFX10-NEXT: s_lshr_b32 s0, s0, 8 +; GFX10-NEXT: s_lshr_b32 s1, s3, 16 ; GFX10-NEXT: v_mov_b32_e32 v2, s3 +; GFX10-NEXT: ds_write_b8 v1, v7 offset:7 +; GFX10-NEXT: ds_write_b8 v1, v3 offset:8 +; GFX10-NEXT: ds_write_b8 v1, v10 offset:9 ; GFX10-NEXT: v_mov_b32_e32 v3, s0 ; GFX10-NEXT: s_lshr_b32 s0, s3, 24 ; GFX10-NEXT: v_mov_b32_e32 v4, s1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll index b33b8a7d8cd72..4a22a911c60b7 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll @@ -272,10 +272,6 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace( ; GFX906-NEXT: buffer_store_dword v6, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill ; GFX906-NEXT: buffer_store_dword v7, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill ; GFX906-NEXT: buffer_store_dword v8, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill -; GFX906-NEXT: global_load_dwordx4 v[5:8], v4, s[0:1] offset:16 -; GFX906-NEXT: s_nop 0 -; GFX906-NEXT: global_load_dwordx4 v[9:12], v4, s[0:1] offset:32 -; GFX906-NEXT: global_load_dwordx4 v[13:16], v4, s[0:1] offset:48 ; GFX906-NEXT: global_load_dwordx4 v[17:20], v4, s[0:1] offset:64 ; GFX906-NEXT: global_load_dwordx4 v[21:24], v4, s[0:1] offset:80 ; GFX906-NEXT: global_load_dwordx4 v[25:28], v4, s[0:1] offset:96 @@ -288,6 +284,9 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace( ; GFX906-NEXT: global_load_dwordx4 v[53:56], v4, s[0:1] offset:208 ; GFX906-NEXT: global_load_dwordx4 v[57:60], v4, s[0:1] offset:224 ; GFX906-NEXT: global_load_dwordx4 v[0:3], v4, s[0:1] offset:240 +; GFX906-NEXT: global_load_dwordx4 v[5:8], v4, s[0:1] offset:16 +; GFX906-NEXT: global_load_dwordx4 v[9:12], v4, s[0:1] offset:32 +; GFX906-NEXT: global_load_dwordx4 v[13:16], v4, s[0:1] offset:48 ; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX906-NEXT: s_cbranch_execz .LBB6_2 ; GFX906-NEXT: ; %bb.1: ; %bb.1 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll index 74552a500ac51..746ffcff5667a 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll @@ -3105,22 +3105,6 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; SI-LABEL: bitcast_v32i32_to_v128i8: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded 
Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 @@ -3253,6 +3237,22 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; kill: killed $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr41 @@ -3284,14 +3284,13 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; kill: killed $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB12_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_alignbit_b32 v33, v31, v32, 24 ; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -3523,7 +3522,6 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; SI-NEXT: s_cbranch_execz .LBB12_4 ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: v_add_i32_e32 v31, 
vcc, 3, v31 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v32 ; SI-NEXT: v_alignbit_b32 v33, v31, v32, 24 ; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill @@ -3946,8 +3944,24 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v13 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload @@ -4295,44 +4309,12 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: 
buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v32i32_to_v128i8: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 @@ -4437,6 +4419,22 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr39 ; VI-NEXT: ; kill: killed $vgpr39 ; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; VI-NEXT: ; implicit-def: $vgpr59 ; VI-NEXT: ; kill: killed $vgpr39 ; VI-NEXT: ; implicit-def: $vgpr39 @@ -4542,129 +4540,129 @@ 
define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v26 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[31:32] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v26 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v25 +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v25 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[29:30] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v24 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v24 +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v24 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[27:28] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v23 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v23 +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v22 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[25:26] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v22 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v22 +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v21 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[23:24] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v21 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v20 +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v20 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[21:22] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v20 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v19 +; VI-NEXT: 
buffer_store_dword v54, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v19 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[19:20] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v18 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v18 +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v18 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[17:18] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v17 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v17 +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v16 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[15:16] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v16 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v16 +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v15 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[13:14] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v15 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v14 +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v14 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[11:12] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v14 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v13 +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v13 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[9:10] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v12 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill ; 
VI-NEXT: v_lshrrev_b32_e32 v33, 16, v12 +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v39, 24, v32 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v12 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[7:8] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v11 +; VI-NEXT: v_mov_b32_e32 v55, v39 +; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[5:6] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v11 -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[31:32] +; VI-NEXT: v_lshrrev_b64 v[40:41], 24, v[3:4] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v10 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[29:30] -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[27:28] -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[25:26] -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[23:24] -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[21:22] -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[19:20] -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[17:18] -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[15:16] -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[13:14] -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[11:12] -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[9:10] -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 
offset:164 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v39, 24, v32 -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[7:8] -; VI-NEXT: v_mov_b32_e32 v55, v39 -; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[5:6] -; VI-NEXT: v_lshrrev_b64 v[40:41], 24, v[3:4] ; VI-NEXT: v_lshrrev_b64 v[41:42], 24, v[1:2] ; VI-NEXT: v_lshrrev_b32_e32 v58, 8, v27 ; VI-NEXT: v_lshrrev_b32_e32 v59, 24, v10 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v35, 8, v10 ; VI-NEXT: v_lshrrev_b32_e32 v60, 16, v9 ; VI-NEXT: v_lshrrev_b32_e32 v49, 8, v9 @@ -5113,9 +5111,10 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: v_or_b32_sdwa v1, v21, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -5255,15 +5254,8 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x78, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v55 ; VI-NEXT: v_or_b32_sdwa v2, v42, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7c, v0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; VI-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload @@ -5280,12 +5272,23 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7c, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: bitcast_v32i32_to_v128i8: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill @@ -5302,9 +5305,6 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: ; kill: killed $vgpr40 ; GFX9-NEXT: ; implicit-def: $vgpr40 @@ -5437,7 +5437,6 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX9-NEXT: ; kill: killed $vgpr40 ; GFX9-NEXT: ; implicit-def: $vgpr41 ; GFX9-NEXT: ; implicit-def: $vgpr40 -; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill @@ -5493,7 +5492,7 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(29) +; GFX9-NEXT: s_waitcnt vmcnt(45) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 ; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: ; kill: killed $vgpr33 @@ -5508,7 +5507,7 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v32 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(31) +; GFX9-NEXT: s_waitcnt vmcnt(47) ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v31 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v31 @@ -5520,149 +5519,147 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v30 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v29 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[31:32] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v29 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v28 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v28 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[29:30] ; GFX9-NEXT: buffer_store_dword v33, 
off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v28 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v27 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v27 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[27:28] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v26 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v26 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[25:26] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v25 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v25 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v24 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[23:24] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v24 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v24 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v23 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[21:22] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v23 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v22 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[19:20] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v22 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v21 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: 
buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v21 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[17:18] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v20 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v20 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v20 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[15:16] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v19 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v19 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v18 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[13:14] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v18 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v18 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v17 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[11:12] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v17 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v16 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v16 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[9:10] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v15 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v14 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v14 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[7:8] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill ; 
GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v13 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v12 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v12 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[5:6] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v11 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v10 -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[31:32] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v9 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[29:30] -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[27:28] -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[25:26] -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[23:24] -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[21:22] -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[19:20] -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[17:18] -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[15:16] -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[13:14] -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GFX9-NEXT: 
s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[11:12] -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[9:10] -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[7:8] -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[5:6] ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v9 ; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[3:4] ; GFX9-NEXT: v_lshrrev_b32_e32 v57, 8, v16 ; GFX9-NEXT: v_lshrrev_b32_e32 v59, 8, v15 @@ -5670,7 +5667,9 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v50, 8, v13 ; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v12 ; GFX9-NEXT: v_lshrrev_b32_e32 v52, 8, v11 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v10 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v37, 8, v9 ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 24, v8 ; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v8 @@ -5698,7 +5697,7 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX9-NEXT: s_cbranch_execz .LBB12_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_u32_e32 v32, 3, v32 -; GFX9-NEXT: s_waitcnt vmcnt(28) +; GFX9-NEXT: s_waitcnt vmcnt(44) ; GFX9-NEXT: v_add_u32_e32 v31, 3, v31 ; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32] ; GFX9-NEXT: v_add_u32_e32 v30, 3, v30 @@ -6006,9 +6005,25 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; 
GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(18) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(16) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:32 @@ -6280,22 +6295,6 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:124 -; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -6755,7 +6754,11 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX11-FAKE16-LABEL: bitcast_v32i32_to_v128i8: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_clause 0x13 +; GFX11-FAKE16-NEXT: s_clause 0x2 +; GFX11-FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-FAKE16-NEXT: s_clause 0x13 ; 80-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:88 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:84 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:80 @@ -6776,10 +6779,6 @@ define <128 x i8> 
@bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:20 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:16 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:12 -; GFX11-FAKE16-NEXT: s_clause 0x2 -; GFX11-FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:8 -; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 -; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr75 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr74 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 @@ -7416,7 +7415,7 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:80 ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:96 ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:112 -; GFX11-FAKE16-NEXT: s_clause 0x13 +; GFX11-FAKE16-NEXT: s_clause 0x13 ; 80-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:12 ; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:16 ; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:20 @@ -10666,7 +10665,7 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_xor_saveexec_b32 s4, -1 -; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: s_clause 0x3 ; 16-byte Folded Spill ; GFX11-NEXT: scratch_store_b32 off, v16, s32 ; GFX11-NEXT: scratch_store_b32 off, v17, s32 offset:4 ; GFX11-NEXT: scratch_store_b32 off, v18, s32 offset:8 @@ -11599,7 +11598,7 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; GFX11-NEXT: v_readlane_b32 s35, v16, 3 ; GFX11-NEXT: v_readlane_b32 s34, v16, 2 ; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 -; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: s_clause 0x3 ; 16-byte Folded Reload ; GFX11-NEXT: scratch_load_b32 v16, off, s32 ; GFX11-NEXT: scratch_load_b32 v17, off, s32 offset:4 ; GFX11-NEXT: scratch_load_b32 v18, off, s32 offset:8 @@ -11812,13 +11811,26 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:188 -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:52 +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 ; SI-NEXT: buffer_store_dword 
v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:196 @@ -11979,44 +11991,30 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:356 ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v2 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:384 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:380 ; SI-NEXT: v_lshlrev_b32_e32 v43, 8, v3 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:364 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:372 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:384 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:380 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:108 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:84 -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:52 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB14_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload @@ 
-12025,11 +12023,11 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_and_b32_e32 v9, 0xff, v49 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload @@ -12632,7 +12630,6 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB14_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload @@ -12646,8 +12643,8 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v49 ; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 @@ -13327,13 +13324,25 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:208 ; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:216 ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188 -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:108 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92 +; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84 +; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76 +; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68 +; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52 +; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt 
vmcnt(2) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196 ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 @@ -13470,34 +13479,20 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b16_e32 v63, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:372 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 -; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b16_e32 v63, 8, v0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44 -; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36 -; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28 -; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:108 -; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100 -; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92 -; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84 -; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76 -; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68 -; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:60 -; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -13983,7 +13978,6 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(13) ; VI-NEXT: v_add_u16_e32 v9, 3, v61 ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload @@ -14561,13 +14555,27 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: 
buffer_load_ushort v2, off, s[0:3], s32 offset:208 ; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:216 ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188 -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:108 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100 +; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92 +; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84 +; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76 +; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68 +; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52 +; GFX9-NEXT: s_waitcnt vmcnt(18) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: s_waitcnt vmcnt(18) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(16) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 @@ -14709,34 +14717,20 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v63, 8, v0 ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:372 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 -; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v63, 8, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44 -; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36 -; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28 -; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:20 -; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 
offset:12 -; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:108 -; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100 -; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92 -; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84 -; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76 -; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68 -; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:60 -; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -15223,7 +15217,6 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(13) ; GFX9-NEXT: v_add_u16_e32 v9, 3, v61 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload @@ -16362,7 +16355,7 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v32i32: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:592 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:588 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:584 @@ -16395,7 +16388,7 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:476 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:472 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:468 -; GFX11-FAKE16-NEXT: s_clause 0x12 +; GFX11-FAKE16-NEXT: s_clause 0x12 ; 76-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:464 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:460 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:456 @@ -17336,7 +17329,7 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v35, v36 ; GFX11-FAKE16-NEXT: .LBB14_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v138, off, s32 offset:392 ; GFX11-FAKE16-NEXT: scratch_load_b32 v137, off, s32 offset:396 ; GFX11-FAKE16-NEXT: scratch_load_b32 v136, off, s32 offset:400 @@ -17369,7 +17362,7 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:508 ; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:512 ; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:516 -; GFX11-FAKE16-NEXT: s_clause 0x12 +; GFX11-FAKE16-NEXT: s_clause 0x12 ; 76-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v74, 
off, s32 offset:520 ; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:524 ; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:528 @@ -18086,24 +18079,13 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_mov_b64 s[4:5], 0 ; SI-NEXT: s_branch .LBB15_3 ; SI-NEXT: .LBB15_2: -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload ; SI-NEXT: v_mov_b32_e32 v55, v56 ; SI-NEXT: v_mov_b32_e32 v42, v46 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(4) ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload @@ -18114,10 +18096,22 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 s[4:5], -1 ; SI-NEXT: ; implicit-def: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: .LBB15_3: ; %Flow -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_mov_b32_e32 v35, v57 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload @@ -18127,7 +18121,6 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: ; %bb.4: ; %cmp.true ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v44 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 @@ -18722,13 +18715,13 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v19 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v21 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v3 ; VI-NEXT: v_lshlrev_b32_e32 v16, 8, v5 ; VI-NEXT: v_lshlrev_b32_e32 v47, 8, v7 ; VI-NEXT: v_lshlrev_b32_e32 v46, 8, v9 ; VI-NEXT: v_lshlrev_b32_e32 v10, 8, v11 ; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v13 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v17 ; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 @@ -18956,11 +18949,11 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_or_b32_sdwa v0, v2, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v4, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v2, v6, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload ; VI-NEXT: s_and_b32 s4, s28, 0xff ; VI-NEXT: s_lshl_b32 s5, s29, 8 ; VI-NEXT: s_or_b32 s4, s4, s5 @@ -18970,11 +18963,8 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: s_lshl_b32 s7, s23, 8 ; VI-NEXT: s_lshl_b32 s8, s27, 8 ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v3, v3, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v0, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v1, v1, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 
src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload @@ -18982,6 +18972,8 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v3, v3, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -19190,12 +19182,6 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: s_mov_b64 s[4:5], 0 ; VI-NEXT: s_branch .LBB15_3 ; VI-NEXT: .LBB15_2: -; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v44, v56 ; VI-NEXT: v_mov_b32_e32 v41, v33 ; VI-NEXT: v_mov_b32_e32 v50, v40 @@ -19213,6 +19199,12 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v54, v53 ; VI-NEXT: v_mov_b32_e32 v52, v36 ; VI-NEXT: v_mov_b32_e32 v49, v51 @@ -19222,7 +19214,7 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: v_mov_b32_e32 v51, v41 ; VI-NEXT: v_mov_b32_e32 v36, v44 ; VI-NEXT: v_mov_b32_e32 v53, v54 -; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_mov_b32_e32 v54, v60 ; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload @@ -19235,7 +19227,6 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: ; %bb.4: ; %cmp.true ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v37 ; VI-NEXT: s_add_i32 s28, s28, 3 ; VI-NEXT: 
s_and_b32 s4, s28, 0xff @@ -19820,8 +19811,8 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v5 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b32_e32 v24, 8, v11 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v25, 8, v9 ; GFX9-NEXT: s_waitcnt vmcnt(5) @@ -20000,16 +19991,18 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: s_lshl_b32 s6, s19, 8 ; GFX9-NEXT: s_lshl_b32 s7, s23, 8 ; GFX9-NEXT: s_lshl_b32 s8, s27, 8 -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload @@ -20036,9 +20029,8 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -20054,14 +20046,16 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; GFX9-NEXT: v_or_b32_sdwa v1, v13, v41 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; 
GFX9-NEXT: v_or_b32_sdwa v0, v55, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v0, v21, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload @@ -20073,10 +20067,11 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_or_b32_sdwa v0, v16, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_mov_b32_e32 v61, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -20089,10 +20084,12 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: v_or_b32_sdwa v17, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_mov_b32_e32 v37, v0 ; GFX9-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v18, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload @@ -20106,17 +20103,22 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: v_or_b32_sdwa v19, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v53, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v53, off, s[0:3], 
s32 offset:536 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v1, v50, v53 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v20, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload ; GFX9-NEXT: v_or_b32_sdwa v0, v32, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v1, v52, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v21, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload ; GFX9-NEXT: v_or_b32_sdwa v1, v51, v59 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v22, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v58, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -20132,45 +20134,24 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: v_mov_b32_e32 v35, v62 ; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload ; GFX9-NEXT: v_or_b32_sdwa v24, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(11) -; GFX9-NEXT: v_or_b32_sdwa v0, v62, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(10) ; GFX9-NEXT: v_or_b32_sdwa v1, v63, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v62, 
v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v25, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(9) ; GFX9-NEXT: v_or_b32_sdwa v0, v54, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(7) ; GFX9-NEXT: v_or_b32_sdwa v1, v52, v43 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v26, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v33, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_or_b32_sdwa v1, v32, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_or_b32_sdwa v0, v44, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_or_b32_sdwa v1, v50, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_or_b32_sdwa v0, v48, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v1, v55, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v49, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v39, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_mov_b32_e32 v40, v30 ; GFX9-NEXT: v_or_b32_sdwa v30, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -20221,18 +20202,6 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: s_branch .LBB15_3 ; GFX9-NEXT: .LBB15_2: -; GFX9-NEXT: v_mov_b32_e32 v38, v51 -; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; GFX9-NEXT: v_mov_b32_e32 v33, v43 ; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload @@ -20246,6 +20215,18 @@ define 
inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v38, v51 +; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; GFX9-NEXT: v_mov_b32_e32 v35, v62 ; GFX9-NEXT: v_mov_b32_e32 v36, v31 ; GFX9-NEXT: v_mov_b32_e32 v40, v30 @@ -20683,7 +20664,7 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX11-TRUE16-LABEL: bitcast_v128i8_to_v32i32_scalar: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:476 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:472 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:468 @@ -20716,7 +20697,7 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:360 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:356 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:352 -; GFX11-TRUE16-NEXT: s_clause 0x7 +; GFX11-TRUE16-NEXT: s_clause 0x7 ; 32-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:348 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:344 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:340 @@ -21573,7 +21554,7 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-TRUE16-NEXT: .LBB15_3: ; %end -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:320 ; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:324 ; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:328 @@ -21606,7 +21587,7 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:436 ; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:440 ; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:444 -; GFX11-TRUE16-NEXT: s_clause 0x7 +; GFX11-TRUE16-NEXT: s_clause 0x7 ; 32-byte Folded Reload ; GFX11-TRUE16-NEXT: 
scratch_load_b32 v47, off, s32 offset:448 ; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:452 ; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:456 @@ -21624,7 +21605,7 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v32i32_scalar: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:476 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:472 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:468 @@ -21657,7 +21638,7 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:360 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:356 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:352 -; GFX11-FAKE16-NEXT: s_clause 0x7 +; GFX11-FAKE16-NEXT: s_clause 0x7 ; 32-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:348 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:344 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:340 @@ -22514,7 +22495,7 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-FAKE16-NEXT: .LBB15_3: ; %end -; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v111, off, s32 offset:320 ; GFX11-FAKE16-NEXT: scratch_load_b32 v110, off, s32 offset:324 ; GFX11-FAKE16-NEXT: scratch_load_b32 v109, off, s32 offset:328 @@ -22547,7 +22528,7 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:436 ; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:440 ; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:444 -; GFX11-FAKE16-NEXT: s_clause 0x7 +; GFX11-FAKE16-NEXT: s_clause 0x7 ; 32-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:448 ; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:452 ; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:456 @@ -23110,10 +23091,25 @@ define <64 x bfloat> @bitcast_v32i32_to_v64bf16(<32 x i32> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: 
buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 @@ -23292,22 +23288,6 @@ define <64 x bfloat> @bitcast_v32i32_to_v64bf16(<32 x i32> %a, i32 %b) { ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -26129,7 +26109,10 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-LABEL: bitcast_v64bf16_to_v32i32: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: s_clause 0xf +; GFX11-TRUE16-NEXT: s_clause 0x1 +; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:68 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:64 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:60 @@ -26146,9 +26129,6 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:16 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:12 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:8 -; GFX11-TRUE16-NEXT: s_clause 0x1 -; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 
-; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v32 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 @@ -26714,7 +26694,7 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, v60 :: v_dual_mov_b32 v29, v61 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v30, v62 :: v_dual_mov_b32 v31, v63 -; GFX11-TRUE16-NEXT: s_clause 0xf +; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:8 ; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:12 ; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:16 @@ -29181,7 +29161,7 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:280 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:276 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:272 @@ -29214,7 +29194,7 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:164 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:160 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:156 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:152 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:148 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:144 @@ -29247,7 +29227,7 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:36 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:32 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:28 -; GFX11-TRUE16-NEXT: s_clause 0x6 +; GFX11-TRUE16-NEXT: s_clause 0x6 ; 28-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:24 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:20 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:16 @@ -30049,7 +30029,7 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v172 :: v_dual_mov_b32 v21, v169 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, v170 :: v_dual_mov_b32 v23, v183 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, v168 :: v_dual_mov_b32 v25, v181 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v174, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_b32 v173, off, s32 offset:4 ; GFX11-TRUE16-NEXT: scratch_load_b32 v172, off, s32 offset:8 @@ -30082,7 +30062,7 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:116 ; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:120 ; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, 
s32 offset:124 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:128 ; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:132 ; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:136 @@ -30115,7 +30095,7 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:244 ; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:248 ; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:252 -; GFX11-TRUE16-NEXT: s_clause 0x6 +; GFX11-TRUE16-NEXT: s_clause 0x6 ; 28-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:256 ; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:260 ; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:264 @@ -30155,7 +30135,7 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 -; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:288 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:284 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:280 @@ -30188,7 +30168,7 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:172 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:168 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:164 -; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:160 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:156 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:152 @@ -30221,7 +30201,7 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v157, s32 offset:44 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v158, s32 offset:40 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v159, s32 offset:36 -; GFX11-FAKE16-NEXT: s_clause 0x8 +; GFX11-FAKE16-NEXT: s_clause 0x8 ; 36-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v168, s32 offset:32 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v169, s32 offset:28 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v170, s32 offset:24 @@ -30913,7 +30893,7 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, v184 :: v_dual_mov_b32 v23, v174 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, v171 :: v_dual_mov_b32 v25, v169 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, v170 :: v_dual_mov_b32 v29, v180 -; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v184, off, s32 ; GFX11-FAKE16-NEXT: scratch_load_b32 v175, off, s32 offset:4 ; GFX11-FAKE16-NEXT: scratch_load_b32 v174, off, s32 offset:8 @@ -30946,7 +30926,7 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; GFX11-FAKE16-NEXT: scratch_load_b32 v123, off, s32 offset:116 ; GFX11-FAKE16-NEXT: scratch_load_b32 v122, off, s32 offset:120 ; GFX11-FAKE16-NEXT: scratch_load_b32 v121, off, s32 offset:124 -; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: s_clause 0x1f ; 
128-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v120, off, s32 offset:128 ; GFX11-FAKE16-NEXT: scratch_load_b32 v111, off, s32 offset:132 ; GFX11-FAKE16-NEXT: scratch_load_b32 v110, off, s32 offset:136 @@ -30979,7 +30959,7 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:244 ; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:248 ; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:252 -; GFX11-FAKE16-NEXT: s_clause 0x8 +; GFX11-FAKE16-NEXT: s_clause 0x8 ; 36-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:256 ; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:260 ; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:264 @@ -31788,6 +31768,22 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v42 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f16_f32_e32 v2, v38 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x74, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -31807,22 +31803,6 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte 
Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -32493,22 +32473,6 @@ define inreg <64 x half> @bitcast_v32i32_to_v64f16_scalar(<32 x i32> inreg %a, i ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v2, v7 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x78, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v2, v5 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -32524,6 +32488,22 @@ define inreg <64 x half> @bitcast_v32i32_to_v64f16_scalar(<32 x i32> inreg %a, i ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v2, v7 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x78, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v2, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[4:5] @@ -34732,7 +34712,7 @@ define inreg <32 x i32> @bitcast_v64f16_to_v32i32_scalar(<64 x half> inreg %a, i ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 -; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:292 ; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:288 ; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:284 @@ -34765,7 +34745,7 @@ define inreg <32 x i32> @bitcast_v64f16_to_v32i32_scalar(<64 x half> 
inreg %a, i ; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:176 ; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:172 ; GFX11-NEXT: scratch_store_b32 off, v95, s32 offset:168 -; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:164 ; GFX11-NEXT: scratch_store_b32 off, v105, s32 offset:160 ; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:156 @@ -34798,7 +34778,7 @@ define inreg <32 x i32> @bitcast_v64f16_to_v32i32_scalar(<64 x half> inreg %a, i ; GFX11-NEXT: scratch_store_b32 off, v157, s32 offset:48 ; GFX11-NEXT: scratch_store_b32 off, v158, s32 offset:44 ; GFX11-NEXT: scratch_store_b32 off, v159, s32 offset:40 -; GFX11-NEXT: s_clause 0x9 +; GFX11-NEXT: s_clause 0x9 ; 40-byte Folded Spill ; GFX11-NEXT: scratch_store_b32 off, v168, s32 offset:36 ; GFX11-NEXT: scratch_store_b32 off, v169, s32 offset:32 ; GFX11-NEXT: scratch_store_b32 off, v170, s32 offset:28 @@ -34876,7 +34856,7 @@ define inreg <32 x i32> @bitcast_v64f16_to_v32i32_scalar(<64 x half> inreg %a, i ; GFX11-NEXT: v_dual_mov_b32 v19, v174 :: v_dual_mov_b32 v20, v173 ; GFX11-NEXT: v_dual_mov_b32 v21, v172 :: v_dual_mov_b32 v22, v171 ; GFX11-NEXT: v_dual_mov_b32 v23, v170 :: v_dual_mov_b32 v24, v183 -; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-NEXT: scratch_load_b32 v185, off, s32 ; GFX11-NEXT: scratch_load_b32 v184, off, s32 offset:4 ; GFX11-NEXT: scratch_load_b32 v175, off, s32 offset:8 @@ -34909,7 +34889,7 @@ define inreg <32 x i32> @bitcast_v64f16_to_v32i32_scalar(<64 x half> inreg %a, i ; GFX11-NEXT: scratch_load_b32 v124, off, s32 offset:116 ; GFX11-NEXT: scratch_load_b32 v123, off, s32 offset:120 ; GFX11-NEXT: scratch_load_b32 v122, off, s32 offset:124 -; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-NEXT: scratch_load_b32 v121, off, s32 offset:128 ; GFX11-NEXT: scratch_load_b32 v120, off, s32 offset:132 ; GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:136 @@ -34942,7 +34922,7 @@ define inreg <32 x i32> @bitcast_v64f16_to_v32i32_scalar(<64 x half> inreg %a, i ; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:244 ; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:248 ; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:252 -; GFX11-NEXT: s_clause 0x9 +; GFX11-NEXT: s_clause 0x9 ; 40-byte Folded Reload ; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:256 ; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:260 ; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:264 @@ -35000,6 +34980,10 @@ define <64 x i16> @bitcast_v32i32_to_v64i16(<32 x i32> %a, i32 %b) { ; SI-LABEL: bitcast_v32i32_to_v64i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 +; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill @@ -35016,10 +35000,6 @@ define <64 x i16> @bitcast_v32i32_to_v64i16(<32 x i32> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte 
Folded Spill -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 -; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr63 @@ -35051,14 +35031,13 @@ define <64 x i16> @bitcast_v32i32_to_v64i16(<32 x i32> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; kill: killed $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB24_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_alignbit_b32 v33, v31, v32, 16 ; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: v_alignbit_b32 v34, v30, v29, 16 @@ -35103,7 +35082,6 @@ define <64 x i16> @bitcast_v32i32_to_v64i16(<32 x i32> %a, i32 %b) { ; SI-NEXT: s_cbranch_execz .LBB24_4 ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v31 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v32 ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 @@ -35330,6 +35308,22 @@ define <64 x i16> @bitcast_v32i32_to_v64i16(<32 x i32> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v53 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v27 @@ -35356,7 +35350,7 @@ define <64 x i16> @bitcast_v32i32_to_v64i16(<32 x i32> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v32 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 @@ -35369,22 
+35363,6 @@ define <64 x i16> @bitcast_v32i32_to_v64i16(<32 x i32> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -36338,7 +36316,13 @@ define <32 x i32> @bitcast_v64i16_to_v32i32(<64 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44 ; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v33 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:36 @@ -36370,12 +36354,6 @@ define <32 x i32> @bitcast_v64i16_to_v32i32(<64 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:100 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:84 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -36391,7 +36369,6 @@ define <32 x i32> @bitcast_v64i16_to_v32i32(<64 x i16> %a, i32 %b) { ; SI-NEXT: 
buffer_load_dword v28, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_and_b32_e32 v22, 0xffff, v41 ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) @@ -36608,7 +36585,6 @@ define <32 x i32> @bitcast_v64i16_to_v32i32(<64 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v41 ; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload @@ -37782,7 +37758,7 @@ define inreg <32 x i32> @bitcast_v64i16_to_v32i32_scalar(<64 x i16> inreg %a, i3 ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 -; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:292 ; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:288 ; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:284 @@ -37815,7 +37791,7 @@ define inreg <32 x i32> @bitcast_v64i16_to_v32i32_scalar(<64 x i16> inreg %a, i3 ; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:176 ; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:172 ; GFX11-NEXT: scratch_store_b32 off, v95, s32 offset:168 -; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:164 ; GFX11-NEXT: scratch_store_b32 off, v105, s32 offset:160 ; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:156 @@ -37848,7 +37824,7 @@ define inreg <32 x i32> @bitcast_v64i16_to_v32i32_scalar(<64 x i16> inreg %a, i3 ; GFX11-NEXT: scratch_store_b32 off, v157, s32 offset:48 ; GFX11-NEXT: scratch_store_b32 off, v158, s32 offset:44 ; GFX11-NEXT: scratch_store_b32 off, v159, s32 offset:40 -; GFX11-NEXT: s_clause 0x9 +; GFX11-NEXT: s_clause 0x9 ; 40-byte Folded Spill ; GFX11-NEXT: scratch_store_b32 off, v168, s32 offset:36 ; GFX11-NEXT: scratch_store_b32 off, v169, s32 offset:32 ; GFX11-NEXT: scratch_store_b32 off, v170, s32 offset:28 @@ -37926,7 +37902,7 @@ define inreg <32 x i32> @bitcast_v64i16_to_v32i32_scalar(<64 x i16> inreg %a, i3 ; GFX11-NEXT: v_dual_mov_b32 v19, v174 :: v_dual_mov_b32 v20, v173 ; GFX11-NEXT: v_dual_mov_b32 v21, v172 :: v_dual_mov_b32 v22, v171 ; GFX11-NEXT: v_dual_mov_b32 v23, v170 :: v_dual_mov_b32 v24, v183 -; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-NEXT: scratch_load_b32 v185, off, s32 ; GFX11-NEXT: scratch_load_b32 v184, off, s32 offset:4 ; GFX11-NEXT: scratch_load_b32 v175, off, s32 offset:8 @@ -37959,7 +37935,7 @@ define inreg <32 x i32> @bitcast_v64i16_to_v32i32_scalar(<64 x i16> inreg %a, i3 ; GFX11-NEXT: scratch_load_b32 v124, off, s32 offset:116 ; GFX11-NEXT: scratch_load_b32 v123, off, s32 offset:120 ; GFX11-NEXT: scratch_load_b32 v122, off, s32 offset:124 -; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-NEXT: scratch_load_b32 v121, off, s32 offset:128 ; GFX11-NEXT: scratch_load_b32 v120, off, s32 offset:132 ; GFX11-NEXT: 
scratch_load_b32 v111, off, s32 offset:136 @@ -37992,7 +37968,7 @@ define inreg <32 x i32> @bitcast_v64i16_to_v32i32_scalar(<64 x i16> inreg %a, i3 ; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:244 ; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:248 ; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:252 -; GFX11-NEXT: s_clause 0x9 +; GFX11-NEXT: s_clause 0x9 ; 40-byte Folded Reload ; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:256 ; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:260 ; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:264 @@ -40033,22 +40009,6 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; SI-LABEL: bitcast_v32f32_to_v128i8: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 @@ -40181,6 +40141,22 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; kill: killed $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, 
off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr41 @@ -40212,14 +40188,13 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; kill: killed $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB36_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_alignbit_b32 v33, v31, v32, 24 ; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -40451,7 +40426,6 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; SI-NEXT: s_cbranch_execz .LBB36_4 ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: v_add_f32_e32 v31, 1.0, v31 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_f32_e32 v32, 1.0, v32 ; SI-NEXT: v_alignbit_b32 v33, v31, v32, 24 ; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill @@ -40874,8 +40848,24 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v13 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload @@ -41223,44 +41213,12 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: 
v_or_b32_e32 v1, v1, v2 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v32f32_to_v128i8: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 @@ -41365,6 +41323,22 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr39 ; VI-NEXT: ; kill: killed $vgpr39 ; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, 
off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; VI-NEXT: ; implicit-def: $vgpr59 ; VI-NEXT: ; kill: killed $vgpr39 ; VI-NEXT: ; implicit-def: $vgpr39 @@ -41470,129 +41444,129 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v26 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[31:32] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v26 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v25 +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v25 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[29:30] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v24 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v24 +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v24 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[27:28] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v23 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v23 +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v22 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[25:26] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v22 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 
offset:264 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v22 +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v21 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[23:24] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v21 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v20 +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v20 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[21:22] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v20 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v19 +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v19 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[19:20] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v18 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v18 +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v18 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[17:18] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v17 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v17 +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v16 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[15:16] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v16 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v16 +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v15 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[13:14] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v15 ; 
VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v14 +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v14 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[11:12] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v14 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v13 +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v13 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[9:10] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v12 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v12 +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v39, 24, v32 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v12 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[7:8] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v11 +; VI-NEXT: v_mov_b32_e32 v55, v39 +; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[5:6] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v11 -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[31:32] +; VI-NEXT: v_lshrrev_b64 v[40:41], 24, v[3:4] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v10 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[29:30] -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[27:28] -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[25:26] -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[23:24] -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[21:22] -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; VI-NEXT: 
buffer_store_dword v55, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[19:20] -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[17:18] -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[15:16] -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[13:14] -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[11:12] -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[9:10] -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v39, 24, v32 -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[7:8] -; VI-NEXT: v_mov_b32_e32 v55, v39 -; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[5:6] -; VI-NEXT: v_lshrrev_b64 v[40:41], 24, v[3:4] ; VI-NEXT: v_lshrrev_b64 v[41:42], 24, v[1:2] ; VI-NEXT: v_lshrrev_b32_e32 v58, 8, v27 ; VI-NEXT: v_lshrrev_b32_e32 v59, 24, v10 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v35, 8, v10 ; VI-NEXT: v_lshrrev_b32_e32 v60, 16, v9 ; VI-NEXT: v_lshrrev_b32_e32 v49, 8, v9 @@ -42041,9 +42015,10 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: v_or_b32_sdwa v1, v21, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -42183,15 +42158,8 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x78, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v55 ; VI-NEXT: v_or_b32_sdwa v2, v42, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7c, v0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; VI-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload @@ -42208,12 +42176,23 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7c, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: bitcast_v32f32_to_v128i8: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill @@ -42230,9 +42209,6 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: ; kill: killed $vgpr40 ; GFX9-NEXT: ; implicit-def: $vgpr40 @@ -42365,7 +42341,6 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; GFX9-NEXT: ; kill: killed $vgpr40 ; GFX9-NEXT: ; implicit-def: $vgpr41 ; GFX9-NEXT: ; implicit-def: $vgpr40 -; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill @@ -42421,7 +42396,7 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(29) +; GFX9-NEXT: s_waitcnt vmcnt(45) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 ; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: ; kill: killed $vgpr33 @@ -42436,7 +42411,7 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v32 ; 
GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(31) +; GFX9-NEXT: s_waitcnt vmcnt(47) ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v31 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v31 @@ -42448,149 +42423,147 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v30 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v29 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[31:32] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v29 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v28 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v28 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[29:30] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v28 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v27 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v27 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[27:28] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v26 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v26 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[25:26] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v25 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v25 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v24 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[23:24] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v24 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v24 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:108 ; 4-byte Folded 
Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v23 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[21:22] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v23 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v22 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[19:20] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v22 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v21 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v21 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[17:18] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v20 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v20 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v20 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[15:16] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v19 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v19 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v18 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[13:14] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v18 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v18 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v17 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[11:12] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], 
s32 offset:336 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v17 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v16 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v16 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[9:10] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v15 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v14 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v14 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[7:8] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v13 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v12 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v12 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[5:6] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v11 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v10 -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[31:32] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v9 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[29:30] -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[27:28] -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[25:26] -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[23:24] -; GFX9-NEXT: 
buffer_store_dword v40, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[21:22] -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[19:20] -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[17:18] -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[15:16] -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[13:14] -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[11:12] -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[9:10] -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[7:8] -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[5:6] ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v9 ; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[3:4] ; GFX9-NEXT: v_lshrrev_b32_e32 v57, 8, v16 ; GFX9-NEXT: v_lshrrev_b32_e32 v59, 8, v15 @@ -42598,7 +42571,9 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v50, 8, v13 ; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v12 ; GFX9-NEXT: v_lshrrev_b32_e32 v52, 8, v11 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v10 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v37, 8, v9 ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 24, v8 ; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v8 @@ -42626,7 +42601,7 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; GFX9-NEXT: s_cbranch_execz .LBB36_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_f32_e32 v32, 1.0, v32 -; GFX9-NEXT: s_waitcnt vmcnt(28) +; GFX9-NEXT: 
s_waitcnt vmcnt(44) ; GFX9-NEXT: v_add_f32_e32 v31, 1.0, v31 ; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32] ; GFX9-NEXT: v_add_f32_e32 v30, 1.0, v30 @@ -42934,9 +42909,25 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(18) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(16) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:32 @@ -43208,22 +43199,6 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:124 -; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX9-NEXT: 
buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -43666,7 +43641,11 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; GFX11-FAKE16-LABEL: bitcast_v32f32_to_v128i8: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_clause 0x13 +; GFX11-FAKE16-NEXT: s_clause 0x2 +; GFX11-FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-FAKE16-NEXT: s_clause 0x13 ; 80-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:88 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:84 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:80 @@ -43687,10 +43666,6 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:20 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:16 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:12 -; GFX11-FAKE16-NEXT: s_clause 0x2 -; GFX11-FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:8 -; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 -; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr75 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr74 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 @@ -44310,7 +44285,7 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:80 ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:96 ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:112 -; GFX11-FAKE16-NEXT: s_clause 0x13 +; GFX11-FAKE16-NEXT: s_clause 0x13 ; 80-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:12 ; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:16 ; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:20 @@ -44770,27 +44745,11 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshr_b64 v[13:14], v[48:49], 24 -; SI-NEXT: v_lshr_b64 v[17:18], v[48:49], 8 -; SI-NEXT: v_add_f32_e64 v53, s23, 1.0 -; SI-NEXT: v_add_f32_e64 v52, s22, 1.0 ; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[17:18], v[52:53], 24 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 
v[17:18], v[52:53], 16 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill ; SI-NEXT: v_lshr_b64 v[13:14], v[48:49], 16 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshr_b64 v[17:18], v[52:53], 8 ; SI-NEXT: v_lshrrev_b32_e32 v14, 24, v2 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v2 @@ -44842,24 +44801,33 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v14, 8, v12 +; SI-NEXT: v_lshr_b64 v[17:18], v[48:49], 8 ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v14, 24, v16 ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v16 +; SI-NEXT: v_add_f32_e64 v53, s23, 1.0 +; SI-NEXT: v_add_f32_e64 v52, s22, 1.0 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v14, 8, v16 +; SI-NEXT: v_lshr_b64 v[17:18], v[52:53], 24 ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v14, 24, v21 ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v21 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v14, 8, v21 +; SI-NEXT: v_lshr_b64 v[17:18], v[52:53], 16 ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v14, 24, v26 @@ -44868,6 +44836,8 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v26 ; SI-NEXT: v_add_f32_e64 v41, s21, 1.0 ; SI-NEXT: v_add_f32_e64 v40, s20, 1.0 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v14, 8, v26 @@ -44875,6 +44845,7 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; SI-NEXT: v_add_f32_e64 v57, s16, 1.0 ; SI-NEXT: v_add_f32_e64 v46, s19, 1.0 ; SI-NEXT: v_add_f32_e64 v45, s18, 1.0 +; SI-NEXT: v_lshr_b64 v[17:18], v[52:53], 8 ; SI-NEXT: v_lshr_b64 v[31:32], v[40:41], 16 ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:436 ; 4-byte 
Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -44885,6 +44856,8 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v30 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill ; SI-NEXT: v_lshr_b64 v[27:28], v[40:41], 24 ; SI-NEXT: v_lshr_b64 v[33:34], v[45:46], 24 ; SI-NEXT: v_lshr_b64 v[38:39], v[45:46], 8 @@ -45408,33 +45381,33 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v13, s98 +; SI-NEXT: v_mov_b32_e32 v27, s62 ; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v13, s46 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v13, s56 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v13, s58 -; SI-NEXT: v_mov_b32_e32 v27, s62 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_mov_b32_e32 v13, s46 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v27, s72 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_mov_b32_e32 v13, s56 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v27, s74 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_mov_b32_e32 v13, s58 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v27, s76 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill ; SI-NEXT: v_mov_b32_e32 v57, s16 @@ -45468,6 +45441,7 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, 
; SI-NEXT: v_mov_b32_e32 v3, s6 ; SI-NEXT: v_mov_b32_e32 v4, s7 ; SI-NEXT: v_readlane_b32 s5, v61, 1 +; SI-NEXT: s_waitcnt expcnt(3) ; SI-NEXT: v_mov_b32_e32 v13, s60 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v27, s78 @@ -45694,9 +45668,22 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_and_b32_e32 v14, 0xff, v29 +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v13, 8, v13 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_and_b32_e32 v14, 0xff, v17 ; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload @@ -45809,17 +45796,16 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v14, 0xff, v15 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v13, 8, v13 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v15, 24, v17 ; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v15, 24, v17 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 ; SI-NEXT: v_or_b32_e32 v13, v13, v14 @@ -46070,19 +46056,6 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded 
Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload @@ -46687,6 +46660,10 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; VI-NEXT: ; implicit-def: $sgpr46 ; VI-NEXT: s_branch .LBB37_2 ; VI-NEXT: .LBB37_4: +; VI-NEXT: v_mov_b32_e32 v53, s46 +; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v53, s56 ; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 0 ; VI-NEXT: v_mov_b32_e32 v48, s4 @@ -46764,6 +46741,9 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v39, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 26 +; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v53, s58 ; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v39, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 27 @@ -46841,6 +46821,9 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; VI-NEXT: v_readlane_b32 s4, v62, 51 ; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v39, s4 +; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v53, s60 ; VI-NEXT: v_readlane_b32 s4, v62, 52 ; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v39, s4 @@ -46859,40 +46842,6 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; VI-NEXT: v_readlane_b32 s4, v62, 57 ; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v39, s4 -; VI-NEXT: v_mov_b32_e32 v53, s46 -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v53, s56 -; VI-NEXT: buffer_store_dword v53, off, 
s[0:3], s32 offset:64 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v53, s58 -; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v53, s60 -; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v53, s62 -; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v53, s72 -; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v53, s74 -; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v53, s76 -; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v53, s78 -; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v53, s88 -; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v53, s90 -; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v31, s16 ; VI-NEXT: v_mov_b32_e32 v32, s17 ; VI-NEXT: v_mov_b32_e32 v29, s18 @@ -46946,11 +46895,35 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; VI-NEXT: v_mov_b32_e32 v42, s82 ; VI-NEXT: v_mov_b32_e32 v37, s81 ; VI-NEXT: v_mov_b32_e32 v50, s80 -; VI-NEXT: v_mov_b32_e32 v53, s30 -; VI-NEXT: v_mov_b32_e32 v54, s34 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v39, s36 ; VI-NEXT: v_mov_b32_e32 v40, s38 ; VI-NEXT: v_mov_b32_e32 v41, s48 +; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v53, s62 +; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v53, s72 +; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v53, s74 +; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v53, s76 +; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; 
VI-NEXT: v_mov_b32_e32 v53, s78 +; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v53, s88 +; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v53, s90 +; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v53, s30 +; VI-NEXT: v_mov_b32_e32 v54, s34 ; VI-NEXT: .LBB37_5: ; %end ; VI-NEXT: v_lshlrev_b32_e32 v34, 8, v34 ; VI-NEXT: v_lshlrev_b32_e32 v35, 8, v35 @@ -47018,6 +46991,20 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; VI-NEXT: v_lshlrev_b32_e32 v23, 8, v50 ; VI-NEXT: v_or_b32_sdwa v23, v24, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v24, 8, v36 +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; VI-NEXT: v_readlane_b32 s87, v63, 31 ; VI-NEXT: v_readlane_b32 s86, v63, 30 ; VI-NEXT: v_readlane_b32 s85, v63, 29 @@ -47050,7 +47037,7 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; VI-NEXT: v_readlane_b32 s34, v63, 2 ; VI-NEXT: v_readlane_b32 s31, v63, 1 ; VI-NEXT: v_readlane_b32 s30, v63, 0 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_or_b32_sdwa v24, v25, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v23, v23, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v24, vcc, 36, v0 @@ -47341,20 +47328,6 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; VI-NEXT: 
buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; VI-NEXT: s_or_saveexec_b64 s[4:5], -1 ; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload @@ -48123,10 +48096,8 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; GFX9-NEXT: v_readlane_b32 s4, v62, 49 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v40, s4 -; GFX9-NEXT: v_mov_b32_e32 v49, s52 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v40, s46 -; GFX9-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill @@ -48175,6 +48146,7 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v40, s94 +; GFX9-NEXT: v_mov_b32_e32 v49, s52 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill @@ -48222,6 +48194,7 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; GFX9-NEXT: v_mov_b32_e32 v54, s55 ; GFX9-NEXT: v_mov_b32_e32 v50, s53 ; GFX9-NEXT: v_mov_b32_e32 v60, s54 +; GFX9-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v49, s51 ; GFX9-NEXT: v_mov_b32_e32 v59, s50 ; GFX9-NEXT: v_mov_b32_e32 v58, s49 @@ -48291,6 +48264,20 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:16 ; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload ; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v49 +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: 
buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; GFX9-NEXT: v_readlane_b32 s99, v63, 35 ; GFX9-NEXT: v_readlane_b32 s98, v63, 34 ; GFX9-NEXT: v_readlane_b32 s97, v63, 33 @@ -48327,7 +48314,7 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; GFX9-NEXT: v_readlane_b32 s34, v63, 2 ; GFX9-NEXT: v_readlane_b32 s31, v63, 1 ; GFX9-NEXT: v_readlane_b32 s30, v63, 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(14) ; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v15, v35, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:20 @@ -48621,20 +48608,6 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:124 -; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload @@ -48646,7 +48619,7 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_or_saveexec_b32 s4, -1 -; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: s_clause 0x3 ; 16-byte Folded Spill ; GFX11-NEXT: scratch_store_b32 off, v75, s32 offset:76 ; GFX11-NEXT: 
scratch_store_b32 off, v76, s32 offset:80 ; GFX11-NEXT: scratch_store_b32 off, v77, s32 offset:84 @@ -48681,7 +48654,7 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; GFX11-NEXT: v_writelane_b32 v76, s101, 5 ; GFX11-NEXT: s_mov_b32 vcc_hi, 0 ; GFX11-NEXT: s_and_b32 s42, vcc_lo, exec_lo -; GFX11-NEXT: s_clause 0x12 +; GFX11-NEXT: s_clause 0x12 ; 76-byte Folded Spill ; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:72 ; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:68 ; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:64 @@ -49601,7 +49574,7 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; GFX11-NEXT: scratch_store_b128 v0, v[11:14], off offset:80 ; GFX11-NEXT: scratch_store_b128 v0, v[7:10], off offset:96 ; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:112 -; GFX11-NEXT: s_clause 0x12 +; GFX11-NEXT: s_clause 0x12 ; 76-byte Folded Reload ; GFX11-NEXT: scratch_load_b32 v74, off, s32 ; GFX11-NEXT: scratch_load_b32 v73, off, s32 offset:4 ; GFX11-NEXT: scratch_load_b32 v72, off, s32 offset:8 @@ -49663,7 +49636,7 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; GFX11-NEXT: v_readlane_b32 s31, v75, 1 ; GFX11-NEXT: v_readlane_b32 s30, v75, 0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: s_clause 0x3 ; 16-byte Folded Reload ; GFX11-NEXT: scratch_load_b32 v75, off, s32 offset:76 ; GFX11-NEXT: scratch_load_b32 v76, off, s32 offset:80 ; GFX11-NEXT: scratch_load_b32 v77, off, s32 offset:84 @@ -49876,13 +49849,26 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:188 -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:52 +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:196 @@ -50043,44 +50029,30 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:356 ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v2 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 
offset:576 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:384 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:380 ; SI-NEXT: v_lshlrev_b32_e32 v43, 8, v3 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:364 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:372 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:384 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:380 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:108 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:84 -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:52 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB38_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload @@ -50089,11 +50061,11 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_and_b32_e32 v9, 0xff, v49 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 
offset:780 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload @@ -50696,7 +50668,6 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB38_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload @@ -50710,8 +50681,8 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v49 ; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 @@ -51391,13 +51362,25 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:208 ; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:216 ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188 -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:108 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92 +; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84 +; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76 +; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68 +; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52 +; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196 ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 @@ -51534,34 +51517,20 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; VI-NEXT: 
s_waitcnt vmcnt(2) +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b16_e32 v63, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:372 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 -; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b16_e32 v63, 8, v0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44 -; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36 -; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28 -; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:108 -; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100 -; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92 -; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84 -; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76 -; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68 -; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:60 -; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -52047,7 +52016,6 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(13) ; VI-NEXT: v_add_u16_e32 v9, 3, v61 ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload @@ -52625,13 +52593,27 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:208 ; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:216 ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188 -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ushort 
v61, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:108 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100 +; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92 +; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84 +; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76 +; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68 +; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52 +; GFX9-NEXT: s_waitcnt vmcnt(18) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: s_waitcnt vmcnt(18) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(16) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 @@ -52773,34 +52755,20 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v63, 8, v0 ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:372 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 -; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v63, 8, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44 -; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36 -; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28 -; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:20 -; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:108 -; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100 -; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92 -; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84 -; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76 -; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68 -; GFX9-NEXT: buffer_load_ushort v57, 
off, s[0:3], s32 offset:60 -; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -53287,7 +53255,6 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(13) ; GFX9-NEXT: v_add_u16_e32 v9, 3, v61 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload @@ -54426,7 +54393,7 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v32f32: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:592 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:588 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:584 @@ -54459,7 +54426,7 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:476 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:472 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:468 -; GFX11-FAKE16-NEXT: s_clause 0x12 +; GFX11-FAKE16-NEXT: s_clause 0x12 ; 76-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:464 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:460 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:456 @@ -55400,7 +55367,7 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v35, v36 ; GFX11-FAKE16-NEXT: .LBB38_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v138, off, s32 offset:392 ; GFX11-FAKE16-NEXT: scratch_load_b32 v137, off, s32 offset:396 ; GFX11-FAKE16-NEXT: scratch_load_b32 v136, off, s32 offset:400 @@ -55433,7 +55400,7 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:508 ; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:512 ; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:516 -; GFX11-FAKE16-NEXT: s_clause 0x12 +; GFX11-FAKE16-NEXT: s_clause 0x12 ; 76-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:520 ; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:524 ; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:528 @@ -56150,24 +56117,13 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; SI-NEXT: s_mov_b64 s[4:5], 0 ; SI-NEXT: s_branch .LBB39_3 ; SI-NEXT: .LBB39_2: -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v32, off, 
s[0:3], s32 offset:620 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload ; SI-NEXT: v_mov_b32_e32 v55, v56 ; SI-NEXT: v_mov_b32_e32 v42, v46 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(4) ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload @@ -56178,10 +56134,22 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 s[4:5], -1 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: .LBB39_3: ; %Flow -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_mov_b32_e32 v35, v57 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload @@ -56191,7 +56159,6 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; SI-NEXT: ; %bb.4: ; 
%cmp.true ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v44 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 @@ -56786,13 +56753,13 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v19 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v21 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v3 ; VI-NEXT: v_lshlrev_b32_e32 v16, 8, v5 ; VI-NEXT: v_lshlrev_b32_e32 v47, 8, v7 ; VI-NEXT: v_lshlrev_b32_e32 v46, 8, v9 ; VI-NEXT: v_lshlrev_b32_e32 v10, 8, v11 ; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v13 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v17 ; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 @@ -57020,11 +56987,11 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_or_b32_sdwa v0, v2, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v4, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v2, v6, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload ; VI-NEXT: s_and_b32 s4, s28, 0xff ; VI-NEXT: s_lshl_b32 s5, s29, 8 ; VI-NEXT: s_or_b32 s4, s4, s5 @@ -57034,11 +57001,8 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; VI-NEXT: s_lshl_b32 s7, s23, 8 ; VI-NEXT: s_lshl_b32 s8, s27, 8 ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v3, v3, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v0, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v1, v1, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload @@ -57046,6 +57010,8 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v3, v3, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: 
s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -57254,12 +57220,6 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; VI-NEXT: s_mov_b64 s[4:5], 0 ; VI-NEXT: s_branch .LBB39_3 ; VI-NEXT: .LBB39_2: -; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v44, v56 ; VI-NEXT: v_mov_b32_e32 v41, v33 ; VI-NEXT: v_mov_b32_e32 v50, v40 @@ -57277,6 +57237,12 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v54, v53 ; VI-NEXT: v_mov_b32_e32 v52, v36 ; VI-NEXT: v_mov_b32_e32 v49, v51 @@ -57286,7 +57252,7 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; VI-NEXT: v_mov_b32_e32 v51, v41 ; VI-NEXT: v_mov_b32_e32 v36, v44 ; VI-NEXT: v_mov_b32_e32 v53, v54 -; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_mov_b32_e32 v54, v60 ; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload @@ -57299,7 +57265,6 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; VI-NEXT: ; %bb.4: ; %cmp.true ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v37 ; VI-NEXT: s_add_i32 s28, s28, 3 ; VI-NEXT: s_and_b32 s4, s28, 0xff @@ -57884,8 +57849,8 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v5 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b32_e32 v24, 8, v11 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v25, 8, v9 ; GFX9-NEXT: s_waitcnt vmcnt(5) @@ -58064,16 +58029,18 @@ define inreg <32 
x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: s_lshl_b32 s6, s19, 8 ; GFX9-NEXT: s_lshl_b32 s7, s23, 8 ; GFX9-NEXT: s_lshl_b32 s8, s27, 8 -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload @@ -58100,9 +58067,8 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -58118,14 +58084,16 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; GFX9-NEXT: v_or_b32_sdwa v1, v13, v41 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v0, v55, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v0, v21, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: 
s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload @@ -58137,10 +58105,11 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_or_b32_sdwa v0, v16, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_mov_b32_e32 v61, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -58153,10 +58122,12 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: v_or_b32_sdwa v17, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_mov_b32_e32 v37, v0 ; GFX9-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v18, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload @@ -58170,17 +58141,22 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: v_or_b32_sdwa v19, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v53, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v1, v50, v53 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v20, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload ; GFX9-NEXT: v_or_b32_sdwa v0, v32, v57 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v1, v52, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v21, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload ; GFX9-NEXT: v_or_b32_sdwa v1, v51, v59 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v22, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v58, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -58196,45 +58172,24 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: v_mov_b32_e32 v35, v62 ; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload ; GFX9-NEXT: v_or_b32_sdwa v24, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(11) -; GFX9-NEXT: v_or_b32_sdwa v0, v62, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(10) ; GFX9-NEXT: v_or_b32_sdwa v1, v63, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v62, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v25, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(9) ; GFX9-NEXT: v_or_b32_sdwa v0, v54, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(7) ; GFX9-NEXT: v_or_b32_sdwa v1, v52, v43 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v26, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v33, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 
src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_or_b32_sdwa v1, v32, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_or_b32_sdwa v0, v44, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_or_b32_sdwa v1, v50, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_or_b32_sdwa v0, v48, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v1, v55, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v49, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v39, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_mov_b32_e32 v40, v30 ; GFX9-NEXT: v_or_b32_sdwa v30, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -58285,18 +58240,6 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: s_branch .LBB39_3 ; GFX9-NEXT: .LBB39_2: -; GFX9-NEXT: v_mov_b32_e32 v38, v51 -; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; GFX9-NEXT: v_mov_b32_e32 v33, v43 ; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload @@ -58310,6 +58253,18 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v38, v51 +; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: 
buffer_load_dword v51, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; GFX9-NEXT: v_mov_b32_e32 v35, v62 ; GFX9-NEXT: v_mov_b32_e32 v36, v31 ; GFX9-NEXT: v_mov_b32_e32 v40, v30 @@ -58747,7 +58702,7 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX11-TRUE16-LABEL: bitcast_v128i8_to_v32f32_scalar: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:476 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:472 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:468 @@ -58780,7 +58735,7 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:360 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:356 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:352 -; GFX11-TRUE16-NEXT: s_clause 0x7 +; GFX11-TRUE16-NEXT: s_clause 0x7 ; 32-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:348 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:344 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:340 @@ -59637,7 +59592,7 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-TRUE16-NEXT: .LBB39_3: ; %end -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:320 ; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:324 ; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:328 @@ -59670,7 +59625,7 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:436 ; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:440 ; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:444 -; GFX11-TRUE16-NEXT: s_clause 0x7 +; GFX11-TRUE16-NEXT: s_clause 0x7 ; 32-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:448 ; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:452 ; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:456 @@ -59688,7 +59643,7 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v32f32_scalar: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:476 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:472 ; GFX11-FAKE16-NEXT: 
scratch_store_b32 off, v42, s32 offset:468 @@ -59721,7 +59676,7 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:360 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:356 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:352 -; GFX11-FAKE16-NEXT: s_clause 0x7 +; GFX11-FAKE16-NEXT: s_clause 0x7 ; 32-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:348 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:344 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:340 @@ -60578,7 +60533,7 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-FAKE16-NEXT: .LBB39_3: ; %end -; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v111, off, s32 offset:320 ; GFX11-FAKE16-NEXT: scratch_load_b32 v110, off, s32 offset:324 ; GFX11-FAKE16-NEXT: scratch_load_b32 v109, off, s32 offset:328 @@ -60611,7 +60566,7 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:436 ; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:440 ; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:444 -; GFX11-FAKE16-NEXT: s_clause 0x7 +; GFX11-FAKE16-NEXT: s_clause 0x7 ; 32-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:448 ; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:452 ; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:456 @@ -61174,10 +61129,25 @@ define <64 x bfloat> @bitcast_v32f32_to_v64bf16(<32 x float> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; SI-NEXT: 
v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 @@ -61356,22 +61326,6 @@ define <64 x bfloat> @bitcast_v32f32_to_v64bf16(<32 x float> %a, i32 %b) { ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -62140,6 +62094,20 @@ define inreg <64 x bfloat> @bitcast_v32f32_to_v64bf16_scalar(<32 x float> inreg ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: v_readlane_b32 s99, v63, 35 ; SI-NEXT: v_readlane_b32 s98, v63, 34 ; SI-NEXT: v_readlane_b32 s97, v63, 33 @@ -62176,10 +62144,9 @@ define inreg <64 x bfloat> @bitcast_v32f32_to_v64bf16_scalar(<32 x float> inreg ; 
SI-NEXT: v_readlane_b32 s34, v63, 2 ; SI-NEXT: v_readlane_b32 s31, v63, 1 ; SI-NEXT: v_readlane_b32 s30, v63, 0 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 @@ -62193,20 +62160,6 @@ define inreg <64 x bfloat> @bitcast_v32f32_to_v64bf16_scalar(<32 x float> inreg ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload @@ -64239,7 +64192,10 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-LABEL: bitcast_v64bf16_to_v32f32: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: s_clause 0xf +; GFX11-TRUE16-NEXT: s_clause 0x1 +; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:68 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:64 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:60 @@ -64256,9 +64212,6 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:16 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:12 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:8 -; GFX11-TRUE16-NEXT: s_clause 0x1 -; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 -; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v32 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 @@ -64824,7 +64777,7 @@ define <32 x float> 
@bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, v60 :: v_dual_mov_b32 v29, v61 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v30, v62 :: v_dual_mov_b32 v31, v63 -; GFX11-TRUE16-NEXT: s_clause 0xf +; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:8 ; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:12 ; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:16 @@ -67291,7 +67244,7 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:280 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:276 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:272 @@ -67324,7 +67277,7 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:164 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:160 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:156 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:152 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:148 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:144 @@ -67357,7 +67310,7 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:36 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:32 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:28 -; GFX11-TRUE16-NEXT: s_clause 0x6 +; GFX11-TRUE16-NEXT: s_clause 0x6 ; 28-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:24 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:20 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:16 @@ -68159,7 +68112,7 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v172 :: v_dual_mov_b32 v21, v169 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, v170 :: v_dual_mov_b32 v23, v183 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, v168 :: v_dual_mov_b32 v25, v181 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v174, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_b32 v173, off, s32 offset:4 ; GFX11-TRUE16-NEXT: scratch_load_b32 v172, off, s32 offset:8 @@ -68192,7 +68145,7 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:116 ; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:120 ; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:124 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:128 ; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:132 ; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:136 @@ -68225,7 +68178,7 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, 
s32 offset:244 ; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:248 ; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:252 -; GFX11-TRUE16-NEXT: s_clause 0x6 +; GFX11-TRUE16-NEXT: s_clause 0x6 ; 28-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:256 ; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:260 ; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:264 @@ -68265,7 +68218,7 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 -; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:288 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:284 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:280 @@ -68298,7 +68251,7 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:172 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:168 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:164 -; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:160 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:156 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:152 @@ -68331,7 +68284,7 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v157, s32 offset:44 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v158, s32 offset:40 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v159, s32 offset:36 -; GFX11-FAKE16-NEXT: s_clause 0x8 +; GFX11-FAKE16-NEXT: s_clause 0x8 ; 36-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v168, s32 offset:32 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v169, s32 offset:28 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v170, s32 offset:24 @@ -69023,7 +68976,7 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, v184 :: v_dual_mov_b32 v23, v174 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, v171 :: v_dual_mov_b32 v25, v169 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, v170 :: v_dual_mov_b32 v29, v180 -; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v184, off, s32 ; GFX11-FAKE16-NEXT: scratch_load_b32 v175, off, s32 offset:4 ; GFX11-FAKE16-NEXT: scratch_load_b32 v174, off, s32 offset:8 @@ -69056,7 +69009,7 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; GFX11-FAKE16-NEXT: scratch_load_b32 v123, off, s32 offset:116 ; GFX11-FAKE16-NEXT: scratch_load_b32 v122, off, s32 offset:120 ; GFX11-FAKE16-NEXT: scratch_load_b32 v121, off, s32 offset:124 -; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v120, off, s32 offset:128 ; GFX11-FAKE16-NEXT: scratch_load_b32 v111, off, s32 offset:132 ; GFX11-FAKE16-NEXT: scratch_load_b32 v110, off, s32 offset:136 @@ -69089,7 +69042,7 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:244 ; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:248 ; 
GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:252 -; GFX11-FAKE16-NEXT: s_clause 0x8 +; GFX11-FAKE16-NEXT: s_clause 0x8 ; 36-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:256 ; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:260 ; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:264 @@ -69898,6 +69851,22 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v42 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f16_f32_e32 v2, v38 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x74, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -69917,22 +69886,6 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; 
SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -70603,11 +70556,6 @@ define inreg <64 x half> @bitcast_v32f32_to_v64f16_scalar(<32 x float> inreg %a, ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v58 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v5 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -70624,6 +70572,11 @@ define inreg <64 x half> @bitcast_v32f32_to_v64f16_scalar(<32 x float> inreg %a, ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v2, v5 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB45_4: @@ -72813,7 +72766,7 @@ define inreg <32 x float> @bitcast_v64f16_to_v32f32_scalar(<64 x half> inreg %a, ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 -; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:292 ; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:288 ; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:284 @@ -72846,7 +72799,7 @@ define inreg <32 x float> @bitcast_v64f16_to_v32f32_scalar(<64 x half> inreg %a, ; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:176 ; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:172 ; GFX11-NEXT: scratch_store_b32 off, v95, s32 offset:168 -; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:164 ; GFX11-NEXT: scratch_store_b32 off, v105, s32 offset:160 ; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:156 @@ -72879,7 +72832,7 @@ define inreg <32 x float> @bitcast_v64f16_to_v32f32_scalar(<64 x half> inreg %a, ; GFX11-NEXT: scratch_store_b32 off, v157, s32 offset:48 ; GFX11-NEXT: scratch_store_b32 off, v158, s32 offset:44 ; GFX11-NEXT: scratch_store_b32 off, v159, s32 offset:40 -; GFX11-NEXT: s_clause 0x9 +; GFX11-NEXT: s_clause 0x9 ; 40-byte Folded Spill ; GFX11-NEXT: scratch_store_b32 off, v168, s32 offset:36 ; GFX11-NEXT: scratch_store_b32 off, v169, s32 offset:32 ; GFX11-NEXT: scratch_store_b32 off, v170, s32 offset:28 @@ -72957,7 +72910,7 @@ define inreg <32 x float> @bitcast_v64f16_to_v32f32_scalar(<64 x half> inreg %a, ; GFX11-NEXT: v_dual_mov_b32 v19, v174 :: v_dual_mov_b32 v20, v173 ; GFX11-NEXT: v_dual_mov_b32 v21, v172 :: v_dual_mov_b32 v22, v171 ; GFX11-NEXT: v_dual_mov_b32 v23, v170 :: v_dual_mov_b32 v24, v183 -; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-NEXT: scratch_load_b32 v185, off, s32 ; GFX11-NEXT: scratch_load_b32 v184, off, s32 offset:4 ; GFX11-NEXT: 
scratch_load_b32 v175, off, s32 offset:8 @@ -72990,7 +72943,7 @@ define inreg <32 x float> @bitcast_v64f16_to_v32f32_scalar(<64 x half> inreg %a, ; GFX11-NEXT: scratch_load_b32 v124, off, s32 offset:116 ; GFX11-NEXT: scratch_load_b32 v123, off, s32 offset:120 ; GFX11-NEXT: scratch_load_b32 v122, off, s32 offset:124 -; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-NEXT: scratch_load_b32 v121, off, s32 offset:128 ; GFX11-NEXT: scratch_load_b32 v120, off, s32 offset:132 ; GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:136 @@ -73023,7 +72976,7 @@ define inreg <32 x float> @bitcast_v64f16_to_v32f32_scalar(<64 x half> inreg %a, ; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:244 ; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:248 ; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:252 -; GFX11-NEXT: s_clause 0x9 +; GFX11-NEXT: s_clause 0x9 ; 40-byte Folded Reload ; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:256 ; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:260 ; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:264 @@ -73081,6 +73034,10 @@ define <64 x i16> @bitcast_v32f32_to_v64i16(<32 x float> %a, i32 %b) { ; SI-LABEL: bitcast_v32f32_to_v64i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 +; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill @@ -73097,10 +73054,6 @@ define <64 x i16> @bitcast_v32f32_to_v64i16(<32 x float> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 -; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr63 @@ -73132,14 +73085,13 @@ define <64 x i16> @bitcast_v32f32_to_v64i16(<32 x float> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; kill: killed $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB48_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_alignbit_b32 v33, v31, v32, 16 ; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: v_alignbit_b32 v34, v30, v29, 16 @@ -73184,7 +73136,6 @@ define <64 x i16> @bitcast_v32f32_to_v64i16(<32 x float> %a, i32 %b) { ; SI-NEXT: s_cbranch_execz .LBB48_4 ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: v_add_f32_e32 v31, 1.0, v31 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_f32_e32 v32, 1.0, v32 ; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 @@ -73411,6 +73362,22 @@ define <64 x i16> @bitcast_v32f32_to_v64i16(<32 x float> %a, i32 %b) { 
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v53 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v27 @@ -73437,7 +73404,7 @@ define <64 x i16> @bitcast_v32f32_to_v64i16(<32 x float> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v32 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 @@ -73450,22 +73417,6 @@ define <64 x i16> @bitcast_v32f32_to_v64i16(<32 x float> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte 
Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -73951,9 +73902,25 @@ define inreg <64 x i16> @bitcast_v32f32_to_v64i16_scalar(<32 x float> inreg %a, ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 @@ -73979,22 +73946,6 @@ define inreg <64 x i16> @bitcast_v32f32_to_v64i16_scalar(<32 x float> inreg %a, ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, 
s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB49_4: @@ -74373,7 +74324,13 @@ define <32 x float> @bitcast_v64i16_to_v32f32(<64 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44 ; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v33 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:36 @@ -74405,12 +74362,6 @@ define <32 x float> @bitcast_v64i16_to_v32f32(<64 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:100 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:84 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -74426,7 +74377,6 @@ define <32 x float> @bitcast_v64i16_to_v32f32(<64 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_and_b32_e32 v22, 0xffff, v41 ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) @@ -74643,7 +74593,6 @@ define <32 x float> @bitcast_v64i16_to_v32f32(<64 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v41 ; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload @@ -75817,7 +75766,7 @@ define inreg <32 x float> @bitcast_v64i16_to_v32f32_scalar(<64 x i16> inreg %a, ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 -; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:292 ; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:288 ; GFX11-NEXT: 
scratch_store_b32 off, v42, s32 offset:284 @@ -75850,7 +75799,7 @@ define inreg <32 x float> @bitcast_v64i16_to_v32f32_scalar(<64 x i16> inreg %a, ; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:176 ; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:172 ; GFX11-NEXT: scratch_store_b32 off, v95, s32 offset:168 -; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:164 ; GFX11-NEXT: scratch_store_b32 off, v105, s32 offset:160 ; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:156 @@ -75883,7 +75832,7 @@ define inreg <32 x float> @bitcast_v64i16_to_v32f32_scalar(<64 x i16> inreg %a, ; GFX11-NEXT: scratch_store_b32 off, v157, s32 offset:48 ; GFX11-NEXT: scratch_store_b32 off, v158, s32 offset:44 ; GFX11-NEXT: scratch_store_b32 off, v159, s32 offset:40 -; GFX11-NEXT: s_clause 0x9 +; GFX11-NEXT: s_clause 0x9 ; 40-byte Folded Spill ; GFX11-NEXT: scratch_store_b32 off, v168, s32 offset:36 ; GFX11-NEXT: scratch_store_b32 off, v169, s32 offset:32 ; GFX11-NEXT: scratch_store_b32 off, v170, s32 offset:28 @@ -75961,7 +75910,7 @@ define inreg <32 x float> @bitcast_v64i16_to_v32f32_scalar(<64 x i16> inreg %a, ; GFX11-NEXT: v_dual_mov_b32 v19, v174 :: v_dual_mov_b32 v20, v173 ; GFX11-NEXT: v_dual_mov_b32 v21, v172 :: v_dual_mov_b32 v22, v171 ; GFX11-NEXT: v_dual_mov_b32 v23, v170 :: v_dual_mov_b32 v24, v183 -; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-NEXT: scratch_load_b32 v185, off, s32 ; GFX11-NEXT: scratch_load_b32 v184, off, s32 offset:4 ; GFX11-NEXT: scratch_load_b32 v175, off, s32 offset:8 @@ -75994,7 +75943,7 @@ define inreg <32 x float> @bitcast_v64i16_to_v32f32_scalar(<64 x i16> inreg %a, ; GFX11-NEXT: scratch_load_b32 v124, off, s32 offset:116 ; GFX11-NEXT: scratch_load_b32 v123, off, s32 offset:120 ; GFX11-NEXT: scratch_load_b32 v122, off, s32 offset:124 -; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-NEXT: scratch_load_b32 v121, off, s32 offset:128 ; GFX11-NEXT: scratch_load_b32 v120, off, s32 offset:132 ; GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:136 @@ -76027,7 +75976,7 @@ define inreg <32 x float> @bitcast_v64i16_to_v32f32_scalar(<64 x i16> inreg %a, ; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:244 ; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:248 ; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:252 -; GFX11-NEXT: s_clause 0x9 +; GFX11-NEXT: s_clause 0x9 ; 40-byte Folded Reload ; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:256 ; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:260 ; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:264 @@ -77054,22 +77003,6 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; SI-LABEL: bitcast_v16i64_to_v128i8: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 
4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 @@ -77202,6 +77135,22 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; kill: killed $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr41 @@ -77233,14 +77182,13 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; kill: killed $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB56_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_alignbit_b32 v33, v31, v32, 24 ; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -77501,7 +77449,6 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; SI-NEXT: v_addc_u32_e32 v28, vcc, 0, v28, vcc ; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 ; SI-NEXT: v_addc_u32_e32 v30, vcc, 0, v30, vcc -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v32 ; SI-NEXT: v_addc_u32_e32 v31, vcc, 
0, v31, vcc ; SI-NEXT: v_alignbit_b32 v33, v31, v32, 24 @@ -77895,8 +77842,24 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v13 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload @@ -78244,44 +78207,12 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; 
SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v16i64_to_v128i8: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 @@ -78386,6 +78317,22 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr39 ; VI-NEXT: ; kill: killed $vgpr39 ; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; VI-NEXT: ; implicit-def: $vgpr59 ; VI-NEXT: ; kill: killed $vgpr39 ; VI-NEXT: ; implicit-def: $vgpr39 @@ -78491,129 +78438,129 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v26 ; VI-NEXT: buffer_store_dword v33, off, 
s[0:3], s32 offset:220 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[31:32] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v26 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v25 +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v25 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[29:30] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v24 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v24 +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v24 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[27:28] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v23 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v23 +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v22 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[25:26] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v22 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v22 +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v21 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[23:24] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v21 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v20 +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v20 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[21:22] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v20 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v19 +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:120 ; 4-byte Folded 
Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v19 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[19:20] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v18 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v18 +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v18 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[17:18] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v17 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v17 +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v16 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[15:16] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v16 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v16 +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v15 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[13:14] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v15 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v14 +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v14 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[11:12] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v14 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v13 +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v13 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[9:10] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v12 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v12 +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword 
v55, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v39, 24, v32 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v12 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[7:8] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v11 +; VI-NEXT: v_mov_b32_e32 v55, v39 +; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[5:6] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v11 -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[31:32] +; VI-NEXT: v_lshrrev_b64 v[40:41], 24, v[3:4] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v10 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[29:30] -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[27:28] -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[25:26] -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[23:24] -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[21:22] -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[19:20] -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[17:18] -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[15:16] -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[13:14] -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[11:12] -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[9:10] -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v39, 24, v32 
-; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[7:8] -; VI-NEXT: v_mov_b32_e32 v55, v39 -; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[5:6] -; VI-NEXT: v_lshrrev_b64 v[40:41], 24, v[3:4] ; VI-NEXT: v_lshrrev_b64 v[41:42], 24, v[1:2] ; VI-NEXT: v_lshrrev_b32_e32 v58, 8, v27 ; VI-NEXT: v_lshrrev_b32_e32 v59, 24, v10 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v35, 8, v10 ; VI-NEXT: v_lshrrev_b32_e32 v60, 16, v9 ; VI-NEXT: v_lshrrev_b32_e32 v49, 8, v9 @@ -79062,9 +79009,10 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: v_or_b32_sdwa v1, v21, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -79204,15 +79152,8 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x78, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v55 ; VI-NEXT: v_or_b32_sdwa v2, v42, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7c, v0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; VI-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload @@ -79229,12 +79170,23 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7c, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: bitcast_v16i64_to_v128i8: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; 
GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill @@ -79251,9 +79203,6 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: ; kill: killed $vgpr40 ; GFX9-NEXT: ; implicit-def: $vgpr40 @@ -79386,7 +79335,6 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX9-NEXT: ; kill: killed $vgpr40 ; GFX9-NEXT: ; implicit-def: $vgpr41 ; GFX9-NEXT: ; implicit-def: $vgpr40 -; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill @@ -79442,7 +79390,7 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(29) +; GFX9-NEXT: s_waitcnt vmcnt(45) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 ; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: ; kill: killed $vgpr33 @@ -79457,7 +79405,7 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v32 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(31) +; GFX9-NEXT: s_waitcnt vmcnt(47) ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v31 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v31 @@ -79469,149 +79417,147 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v30 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v29 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[31:32] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v29 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v28 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v28 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[29:30] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v28 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], 
s32 offset:236 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v27 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v27 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[27:28] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v26 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v26 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[25:26] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v25 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v25 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v24 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[23:24] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v24 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v24 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v23 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[21:22] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v23 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v22 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[19:20] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v22 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v21 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte 
Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v21 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[17:18] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v20 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v20 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v20 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[15:16] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v19 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v19 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v18 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[13:14] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v18 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v18 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v17 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[11:12] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v17 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v16 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v16 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[9:10] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v15 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v14 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v14 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[7:8] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v13 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill ; GFX9-NEXT: 
v_lshrrev_b32_e32 v33, 24, v12 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v12 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[5:6] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v11 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v10 -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[31:32] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v9 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[29:30] -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[27:28] -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[25:26] -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[23:24] -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[21:22] -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[19:20] -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[17:18] -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[15:16] -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[13:14] -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, 
v[11:12] -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[9:10] -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[7:8] -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[5:6] ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v9 ; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[3:4] ; GFX9-NEXT: v_lshrrev_b32_e32 v57, 8, v16 ; GFX9-NEXT: v_lshrrev_b32_e32 v59, 8, v15 @@ -79619,7 +79565,9 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v50, 8, v13 ; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v12 ; GFX9-NEXT: v_lshrrev_b32_e32 v52, 8, v11 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v10 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v37, 8, v9 ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 24, v8 ; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v8 @@ -79676,7 +79624,7 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX9-NEXT: v_addc_co_u32_e32 v28, vcc, 0, v28, vcc ; GFX9-NEXT: v_add_co_u32_e32 v29, vcc, 3, v29 ; GFX9-NEXT: v_addc_co_u32_e32 v30, vcc, 0, v30, vcc -; GFX9-NEXT: s_waitcnt vmcnt(28) +; GFX9-NEXT: s_waitcnt vmcnt(44) ; GFX9-NEXT: v_add_co_u32_e32 v31, vcc, 3, v31 ; GFX9-NEXT: v_addc_co_u32_e32 v32, vcc, 0, v32, vcc ; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32] @@ -79955,9 +79903,25 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte 
Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(18) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(16) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:32 @@ -80229,22 +80193,6 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:124 -; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -80712,7 +80660,11 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX11-FAKE16-LABEL: bitcast_v16i64_to_v128i8: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_clause 0x13 +; GFX11-FAKE16-NEXT: s_clause 0x2 +; GFX11-FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-FAKE16-NEXT: s_clause 0x13 ; 80-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:88 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:84 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:80 @@ -80733,10 +80685,6 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX11-FAKE16-NEXT: 
scratch_store_b32 off, v73, s32 offset:20 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:16 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:12 -; GFX11-FAKE16-NEXT: s_clause 0x2 -; GFX11-FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:8 -; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 -; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr75 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr74 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 @@ -81381,7 +81329,7 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:80 ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:96 ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:112 -; GFX11-FAKE16-NEXT: s_clause 0x13 +; GFX11-FAKE16-NEXT: s_clause 0x13 ; 80-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:12 ; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:16 ; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:20 @@ -84631,7 +84579,7 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_xor_saveexec_b32 s4, -1 -; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: s_clause 0x3 ; 16-byte Folded Spill ; GFX11-NEXT: scratch_store_b32 off, v16, s32 ; GFX11-NEXT: scratch_store_b32 off, v17, s32 offset:4 ; GFX11-NEXT: scratch_store_b32 off, v18, s32 offset:8 @@ -85566,7 +85514,7 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; GFX11-NEXT: v_readlane_b32 s35, v16, 3 ; GFX11-NEXT: v_readlane_b32 s34, v16, 2 ; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 -; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: s_clause 0x3 ; 16-byte Folded Reload ; GFX11-NEXT: scratch_load_b32 v16, off, s32 ; GFX11-NEXT: scratch_load_b32 v17, off, s32 offset:4 ; GFX11-NEXT: scratch_load_b32 v18, off, s32 offset:8 @@ -85779,13 +85727,26 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:188 -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:52 +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill -; SI-NEXT: 
s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:196 @@ -85946,44 +85907,30 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:356 ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v2 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:384 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:380 ; SI-NEXT: v_lshlrev_b32_e32 v43, 8, v3 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:364 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:372 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:384 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:380 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:108 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:84 -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:52 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB58_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload @@ -85992,11 +85939,11 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 
x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_and_b32_e32 v9, 0xff, v49 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload @@ -86599,7 +86546,6 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB58_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload @@ -86613,8 +86559,8 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v49 ; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 @@ -87294,13 +87240,25 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:208 ; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:216 ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188 -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:108 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92 +; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84 +; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76 +; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68 +; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52 +; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte 
Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196 ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 @@ -87437,34 +87395,20 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b16_e32 v63, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:372 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 -; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b16_e32 v63, 8, v0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44 -; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36 -; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28 -; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:108 -; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100 -; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92 -; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84 -; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76 -; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68 -; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:60 -; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -87950,7 +87894,6 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(13) ; VI-NEXT: v_add_u16_e32 v9, 3, v61 ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload @@ -88528,13 +88471,27 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:208 ; GFX9-NEXT: 
buffer_load_ushort v3, off, s[0:3], s32 offset:216 ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188 -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:108 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100 +; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92 +; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84 +; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76 +; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68 +; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52 +; GFX9-NEXT: s_waitcnt vmcnt(18) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: s_waitcnt vmcnt(18) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(16) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 @@ -88676,34 +88633,20 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v63, 8, v0 ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:372 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 -; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v63, 8, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44 -; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36 -; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28 -; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:20 -; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 
offset:4 -; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:108 -; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100 -; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92 -; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84 -; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76 -; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68 -; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:60 -; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -89190,7 +89133,6 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(13) ; GFX9-NEXT: v_add_u16_e32 v9, 3, v61 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload @@ -90329,7 +90271,7 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v16i64: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:592 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:588 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:584 @@ -90362,7 +90304,7 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:476 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:472 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:468 -; GFX11-FAKE16-NEXT: s_clause 0x12 +; GFX11-FAKE16-NEXT: s_clause 0x12 ; 76-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:464 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:460 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:456 @@ -91303,7 +91245,7 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v35, v36 ; GFX11-FAKE16-NEXT: .LBB58_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v138, off, s32 offset:392 ; GFX11-FAKE16-NEXT: scratch_load_b32 v137, off, s32 offset:396 ; GFX11-FAKE16-NEXT: scratch_load_b32 v136, off, s32 offset:400 @@ -91336,7 +91278,7 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:508 ; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:512 ; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:516 -; GFX11-FAKE16-NEXT: s_clause 0x12 +; GFX11-FAKE16-NEXT: s_clause 0x12 ; 76-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:520 ; GFX11-FAKE16-NEXT: scratch_load_b32 v73, 
off, s32 offset:524 ; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:528 @@ -92053,24 +91995,13 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_mov_b64 s[4:5], 0 ; SI-NEXT: s_branch .LBB59_3 ; SI-NEXT: .LBB59_2: -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload ; SI-NEXT: v_mov_b32_e32 v55, v56 ; SI-NEXT: v_mov_b32_e32 v42, v46 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(4) ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload @@ -92081,10 +92012,22 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 s[4:5], -1 ; SI-NEXT: ; implicit-def: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: .LBB59_3: ; %Flow -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_mov_b32_e32 v35, v57 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload @@ -92094,7 +92037,6 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: ; %bb.4: ; %cmp.true ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v44 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 @@ -92689,13 +92631,13 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v19 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v21 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v3 ; VI-NEXT: v_lshlrev_b32_e32 v16, 8, v5 ; VI-NEXT: v_lshlrev_b32_e32 v47, 8, v7 ; VI-NEXT: v_lshlrev_b32_e32 v46, 8, v9 ; VI-NEXT: v_lshlrev_b32_e32 v10, 8, v11 ; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v13 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v17 ; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 @@ -92923,11 +92865,11 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_or_b32_sdwa v0, v2, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v4, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v2, v6, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload ; VI-NEXT: s_and_b32 s4, s28, 0xff ; VI-NEXT: s_lshl_b32 s5, s29, 8 ; VI-NEXT: s_or_b32 s4, s4, s5 @@ -92937,11 +92879,8 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: s_lshl_b32 s7, s23, 8 ; VI-NEXT: s_lshl_b32 s8, s27, 8 ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v3, v3, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v0, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v1, v1, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 
src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload @@ -92949,6 +92888,8 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v3, v3, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -93157,12 +93098,6 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: s_mov_b64 s[4:5], 0 ; VI-NEXT: s_branch .LBB59_3 ; VI-NEXT: .LBB59_2: -; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v44, v56 ; VI-NEXT: v_mov_b32_e32 v41, v33 ; VI-NEXT: v_mov_b32_e32 v50, v40 @@ -93180,6 +93115,12 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v54, v53 ; VI-NEXT: v_mov_b32_e32 v52, v36 ; VI-NEXT: v_mov_b32_e32 v49, v51 @@ -93189,7 +93130,7 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: v_mov_b32_e32 v51, v41 ; VI-NEXT: v_mov_b32_e32 v36, v44 ; VI-NEXT: v_mov_b32_e32 v53, v54 -; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_mov_b32_e32 v54, v60 ; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload @@ -93202,7 +93143,6 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: ; %bb.4: ; %cmp.true ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v37 ; VI-NEXT: s_add_i32 s28, s28, 3 ; VI-NEXT: 
s_and_b32 s4, s28, 0xff @@ -93787,8 +93727,8 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v5 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b32_e32 v24, 8, v11 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v25, 8, v9 ; GFX9-NEXT: s_waitcnt vmcnt(5) @@ -93967,16 +93907,18 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: s_lshl_b32 s6, s19, 8 ; GFX9-NEXT: s_lshl_b32 s7, s23, 8 ; GFX9-NEXT: s_lshl_b32 s8, s27, 8 -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload @@ -94003,9 +93945,8 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -94021,14 +93962,16 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; GFX9-NEXT: v_or_b32_sdwa v1, v13, v41 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; 
GFX9-NEXT: v_or_b32_sdwa v0, v55, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v0, v21, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload @@ -94040,10 +93983,11 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_or_b32_sdwa v0, v16, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_mov_b32_e32 v61, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -94056,10 +94000,12 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: v_or_b32_sdwa v17, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_mov_b32_e32 v37, v0 ; GFX9-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v18, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload @@ -94073,17 +94019,22 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: v_or_b32_sdwa v19, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v53, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v53, off, s[0:3], 
s32 offset:536 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v1, v50, v53 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v20, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload ; GFX9-NEXT: v_or_b32_sdwa v0, v32, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v1, v52, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v21, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload ; GFX9-NEXT: v_or_b32_sdwa v1, v51, v59 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v22, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v58, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -94099,45 +94050,24 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: v_mov_b32_e32 v35, v62 ; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload ; GFX9-NEXT: v_or_b32_sdwa v24, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(11) -; GFX9-NEXT: v_or_b32_sdwa v0, v62, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(10) ; GFX9-NEXT: v_or_b32_sdwa v1, v63, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v62, 
v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v25, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(9) ; GFX9-NEXT: v_or_b32_sdwa v0, v54, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(7) ; GFX9-NEXT: v_or_b32_sdwa v1, v52, v43 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v26, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v33, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_or_b32_sdwa v1, v32, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_or_b32_sdwa v0, v44, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_or_b32_sdwa v1, v50, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_or_b32_sdwa v0, v48, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v1, v55, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v49, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v39, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_mov_b32_e32 v40, v30 ; GFX9-NEXT: v_or_b32_sdwa v30, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -94188,18 +94118,6 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: s_branch .LBB59_3 ; GFX9-NEXT: .LBB59_2: -; GFX9-NEXT: v_mov_b32_e32 v38, v51 -; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; GFX9-NEXT: v_mov_b32_e32 v33, v43 ; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload @@ -94213,6 +94131,18 @@ define 
inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v38, v51 +; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; GFX9-NEXT: v_mov_b32_e32 v35, v62 ; GFX9-NEXT: v_mov_b32_e32 v36, v31 ; GFX9-NEXT: v_mov_b32_e32 v40, v30 @@ -94650,7 +94580,7 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX11-TRUE16-LABEL: bitcast_v128i8_to_v16i64_scalar: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:476 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:472 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:468 @@ -94683,7 +94613,7 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:360 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:356 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:352 -; GFX11-TRUE16-NEXT: s_clause 0x7 +; GFX11-TRUE16-NEXT: s_clause 0x7 ; 32-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:348 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:344 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:340 @@ -95540,7 +95470,7 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-TRUE16-NEXT: .LBB59_3: ; %end -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:320 ; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:324 ; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:328 @@ -95573,7 +95503,7 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:436 ; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:440 ; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:444 -; GFX11-TRUE16-NEXT: s_clause 0x7 +; GFX11-TRUE16-NEXT: s_clause 0x7 ; 32-byte Folded Reload ; GFX11-TRUE16-NEXT: 
scratch_load_b32 v47, off, s32 offset:448 ; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:452 ; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:456 @@ -95591,7 +95521,7 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v16i64_scalar: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:476 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:472 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:468 @@ -95624,7 +95554,7 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:360 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:356 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:352 -; GFX11-FAKE16-NEXT: s_clause 0x7 +; GFX11-FAKE16-NEXT: s_clause 0x7 ; 32-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:348 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:344 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:340 @@ -96481,7 +96411,7 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-FAKE16-NEXT: .LBB59_3: ; %end -; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v111, off, s32 offset:320 ; GFX11-FAKE16-NEXT: scratch_load_b32 v110, off, s32 offset:324 ; GFX11-FAKE16-NEXT: scratch_load_b32 v109, off, s32 offset:328 @@ -96514,7 +96444,7 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:436 ; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:440 ; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:444 -; GFX11-FAKE16-NEXT: s_clause 0x7 +; GFX11-FAKE16-NEXT: s_clause 0x7 ; 32-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:448 ; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:452 ; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:456 @@ -97078,10 +97008,25 @@ define <64 x bfloat> @bitcast_v16i64_to_v64bf16(<16 x i64> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: 
buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 @@ -97260,22 +97205,6 @@ define <64 x bfloat> @bitcast_v16i64_to_v64bf16(<16 x i64> %a, i32 %b) { ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -100084,7 +100013,10 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-LABEL: bitcast_v64bf16_to_v16i64: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: s_clause 0xf +; GFX11-TRUE16-NEXT: s_clause 0x1 +; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:68 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:64 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:60 @@ -100101,9 +100033,6 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:16 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:12 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:8 -; GFX11-TRUE16-NEXT: s_clause 0x1 -; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 
offset:4 -; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v32 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 @@ -100669,7 +100598,7 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, v60 :: v_dual_mov_b32 v29, v61 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v30, v62 :: v_dual_mov_b32 v31, v63 -; GFX11-TRUE16-NEXT: s_clause 0xf +; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:8 ; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:12 ; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:16 @@ -103136,7 +103065,7 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:280 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:276 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:272 @@ -103169,7 +103098,7 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:164 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:160 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:156 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:152 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:148 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:144 @@ -103202,7 +103131,7 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:36 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:32 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:28 -; GFX11-TRUE16-NEXT: s_clause 0x6 +; GFX11-TRUE16-NEXT: s_clause 0x6 ; 28-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:24 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:20 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:16 @@ -104004,7 +103933,7 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v172 :: v_dual_mov_b32 v21, v169 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, v170 :: v_dual_mov_b32 v23, v183 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, v168 :: v_dual_mov_b32 v25, v181 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v174, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_b32 v173, off, s32 offset:4 ; GFX11-TRUE16-NEXT: scratch_load_b32 v172, off, s32 offset:8 @@ -104037,7 +103966,7 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:116 ; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:120 ; GFX11-TRUE16-NEXT: 
scratch_load_b32 v111, off, s32 offset:124 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:128 ; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:132 ; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:136 @@ -104070,7 +103999,7 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:244 ; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:248 ; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:252 -; GFX11-TRUE16-NEXT: s_clause 0x6 +; GFX11-TRUE16-NEXT: s_clause 0x6 ; 28-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:256 ; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:260 ; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:264 @@ -104110,7 +104039,7 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 -; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:288 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:284 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:280 @@ -104143,7 +104072,7 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:172 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:168 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:164 -; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:160 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:156 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:152 @@ -104176,7 +104105,7 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v157, s32 offset:44 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v158, s32 offset:40 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v159, s32 offset:36 -; GFX11-FAKE16-NEXT: s_clause 0x8 +; GFX11-FAKE16-NEXT: s_clause 0x8 ; 36-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v168, s32 offset:32 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v169, s32 offset:28 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v170, s32 offset:24 @@ -104868,7 +104797,7 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, v184 :: v_dual_mov_b32 v23, v174 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, v171 :: v_dual_mov_b32 v25, v169 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, v170 :: v_dual_mov_b32 v29, v180 -; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v184, off, s32 ; GFX11-FAKE16-NEXT: scratch_load_b32 v175, off, s32 offset:4 ; GFX11-FAKE16-NEXT: scratch_load_b32 v174, off, s32 offset:8 @@ -104901,7 +104830,7 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; GFX11-FAKE16-NEXT: scratch_load_b32 v123, off, s32 offset:116 ; GFX11-FAKE16-NEXT: scratch_load_b32 v122, off, s32 offset:120 ; GFX11-FAKE16-NEXT: scratch_load_b32 v121, off, s32 offset:124 -; GFX11-FAKE16-NEXT: s_clause 
0x1f +; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v120, off, s32 offset:128 ; GFX11-FAKE16-NEXT: scratch_load_b32 v111, off, s32 offset:132 ; GFX11-FAKE16-NEXT: scratch_load_b32 v110, off, s32 offset:136 @@ -104934,7 +104863,7 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:244 ; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:248 ; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:252 -; GFX11-FAKE16-NEXT: s_clause 0x8 +; GFX11-FAKE16-NEXT: s_clause 0x8 ; 36-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:256 ; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:260 ; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:264 @@ -105740,6 +105669,22 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v42 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f16_f32_e32 v2, v48 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x74, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -105759,22 +105704,6 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: 
buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -106453,22 +106382,6 @@ define inreg <64 x half> @bitcast_v16i64_to_v64f16_scalar(<16 x i64> inreg %a, i ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v2, v7 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x78, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v2, v5 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -106484,6 +106397,22 @@ define inreg <64 x half> @bitcast_v16i64_to_v64f16_scalar(<16 x i64> inreg %a, i ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v2, v7 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x78, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v2, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[4:5] @@ -108700,7 +108629,7 @@ define inreg <16 x i64> @bitcast_v64f16_to_v16i64_scalar(<64 x half> inreg %a, i ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 -; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:292 ; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:288 ; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:284 @@ -108733,7 +108662,7 @@ 
define inreg <16 x i64> @bitcast_v64f16_to_v16i64_scalar(<64 x half> inreg %a, i ; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:176 ; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:172 ; GFX11-NEXT: scratch_store_b32 off, v95, s32 offset:168 -; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:164 ; GFX11-NEXT: scratch_store_b32 off, v105, s32 offset:160 ; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:156 @@ -108766,7 +108695,7 @@ define inreg <16 x i64> @bitcast_v64f16_to_v16i64_scalar(<64 x half> inreg %a, i ; GFX11-NEXT: scratch_store_b32 off, v157, s32 offset:48 ; GFX11-NEXT: scratch_store_b32 off, v158, s32 offset:44 ; GFX11-NEXT: scratch_store_b32 off, v159, s32 offset:40 -; GFX11-NEXT: s_clause 0x9 +; GFX11-NEXT: s_clause 0x9 ; 40-byte Folded Spill ; GFX11-NEXT: scratch_store_b32 off, v168, s32 offset:36 ; GFX11-NEXT: scratch_store_b32 off, v169, s32 offset:32 ; GFX11-NEXT: scratch_store_b32 off, v170, s32 offset:28 @@ -108844,7 +108773,7 @@ define inreg <16 x i64> @bitcast_v64f16_to_v16i64_scalar(<64 x half> inreg %a, i ; GFX11-NEXT: v_dual_mov_b32 v19, v174 :: v_dual_mov_b32 v20, v173 ; GFX11-NEXT: v_dual_mov_b32 v21, v172 :: v_dual_mov_b32 v22, v171 ; GFX11-NEXT: v_dual_mov_b32 v23, v170 :: v_dual_mov_b32 v24, v183 -; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-NEXT: scratch_load_b32 v185, off, s32 ; GFX11-NEXT: scratch_load_b32 v184, off, s32 offset:4 ; GFX11-NEXT: scratch_load_b32 v175, off, s32 offset:8 @@ -108877,7 +108806,7 @@ define inreg <16 x i64> @bitcast_v64f16_to_v16i64_scalar(<64 x half> inreg %a, i ; GFX11-NEXT: scratch_load_b32 v124, off, s32 offset:116 ; GFX11-NEXT: scratch_load_b32 v123, off, s32 offset:120 ; GFX11-NEXT: scratch_load_b32 v122, off, s32 offset:124 -; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-NEXT: scratch_load_b32 v121, off, s32 offset:128 ; GFX11-NEXT: scratch_load_b32 v120, off, s32 offset:132 ; GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:136 @@ -108910,7 +108839,7 @@ define inreg <16 x i64> @bitcast_v64f16_to_v16i64_scalar(<64 x half> inreg %a, i ; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:244 ; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:248 ; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:252 -; GFX11-NEXT: s_clause 0x9 +; GFX11-NEXT: s_clause 0x9 ; 40-byte Folded Reload ; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:256 ; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:260 ; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:264 @@ -108968,6 +108897,10 @@ define <64 x i16> @bitcast_v16i64_to_v64i16(<16 x i64> %a, i32 %b) { ; SI-LABEL: bitcast_v16i64_to_v64i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 +; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill @@ -108984,10 +108917,6 @@ define <64 x i16> @bitcast_v16i64_to_v64i16(<16 x i64> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte 
Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 -; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: $vgpr63 @@ -109019,14 +108948,13 @@ define <64 x i16> @bitcast_v16i64_to_v64i16(<16 x i64> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; kill: killed $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB68_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_alignbit_b32 v33, v31, v32, 16 ; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: v_alignbit_b32 v34, v30, v29, 16 @@ -109099,7 +109027,6 @@ define <64 x i16> @bitcast_v16i64_to_v64i16(<16 x i64> %a, i32 %b) { ; SI-NEXT: v_addc_u32_e32 v28, vcc, 0, v28, vcc ; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 ; SI-NEXT: v_addc_u32_e32 v30, vcc, 0, v30, vcc -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v32 ; SI-NEXT: v_addc_u32_e32 v31, vcc, 0, v31, vcc ; SI-NEXT: v_alignbit_b32 v33, v31, v32, 16 @@ -109296,6 +109223,22 @@ define <64 x i16> @bitcast_v16i64_to_v64i16(<16 x i64> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v53 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v27 @@ -109322,7 +109265,7 @@ define <64 x i16> @bitcast_v16i64_to_v64i16(<16 x i64> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: s_waitcnt expcnt(0) 
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v32 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 @@ -109335,22 +109278,6 @@ define <64 x i16> @bitcast_v16i64_to_v64i16(<16 x i64> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -110320,7 +110247,13 @@ define <16 x i64> @bitcast_v64i16_to_v16i64(<64 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44 ; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v33 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:36 @@ -110352,12 +110285,6 @@ define <16 x i64> @bitcast_v64i16_to_v16i64(<64 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:100 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:84 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: 
s_xor_b64 s[4:5], exec, s[4:5] @@ -110373,7 +110300,6 @@ define <16 x i64> @bitcast_v64i16_to_v16i64(<64 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_and_b32_e32 v22, 0xffff, v41 ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) @@ -110590,7 +110516,6 @@ define <16 x i64> @bitcast_v64i16_to_v16i64(<64 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v41 ; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload @@ -111764,7 +111689,7 @@ define inreg <16 x i64> @bitcast_v64i16_to_v16i64_scalar(<64 x i16> inreg %a, i3 ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 -; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:292 ; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:288 ; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:284 @@ -111797,7 +111722,7 @@ define inreg <16 x i64> @bitcast_v64i16_to_v16i64_scalar(<64 x i16> inreg %a, i3 ; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:176 ; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:172 ; GFX11-NEXT: scratch_store_b32 off, v95, s32 offset:168 -; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:164 ; GFX11-NEXT: scratch_store_b32 off, v105, s32 offset:160 ; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:156 @@ -111830,7 +111755,7 @@ define inreg <16 x i64> @bitcast_v64i16_to_v16i64_scalar(<64 x i16> inreg %a, i3 ; GFX11-NEXT: scratch_store_b32 off, v157, s32 offset:48 ; GFX11-NEXT: scratch_store_b32 off, v158, s32 offset:44 ; GFX11-NEXT: scratch_store_b32 off, v159, s32 offset:40 -; GFX11-NEXT: s_clause 0x9 +; GFX11-NEXT: s_clause 0x9 ; 40-byte Folded Spill ; GFX11-NEXT: scratch_store_b32 off, v168, s32 offset:36 ; GFX11-NEXT: scratch_store_b32 off, v169, s32 offset:32 ; GFX11-NEXT: scratch_store_b32 off, v170, s32 offset:28 @@ -111908,7 +111833,7 @@ define inreg <16 x i64> @bitcast_v64i16_to_v16i64_scalar(<64 x i16> inreg %a, i3 ; GFX11-NEXT: v_dual_mov_b32 v19, v174 :: v_dual_mov_b32 v20, v173 ; GFX11-NEXT: v_dual_mov_b32 v21, v172 :: v_dual_mov_b32 v22, v171 ; GFX11-NEXT: v_dual_mov_b32 v23, v170 :: v_dual_mov_b32 v24, v183 -; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-NEXT: scratch_load_b32 v185, off, s32 ; GFX11-NEXT: scratch_load_b32 v184, off, s32 offset:4 ; GFX11-NEXT: scratch_load_b32 v175, off, s32 offset:8 @@ -111941,7 +111866,7 @@ define inreg <16 x i64> @bitcast_v64i16_to_v16i64_scalar(<64 x i16> inreg %a, i3 ; GFX11-NEXT: scratch_load_b32 v124, off, s32 offset:116 ; GFX11-NEXT: scratch_load_b32 v123, off, s32 offset:120 ; GFX11-NEXT: scratch_load_b32 v122, off, s32 offset:124 -; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: s_clause 0x1f ; 
128-byte Folded Reload ; GFX11-NEXT: scratch_load_b32 v121, off, s32 offset:128 ; GFX11-NEXT: scratch_load_b32 v120, off, s32 offset:132 ; GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:136 @@ -111974,7 +111899,7 @@ define inreg <16 x i64> @bitcast_v64i16_to_v16i64_scalar(<64 x i16> inreg %a, i3 ; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:244 ; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:248 ; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:252 -; GFX11-NEXT: s_clause 0x9 +; GFX11-NEXT: s_clause 0x9 ; 40-byte Folded Reload ; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:256 ; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:260 ; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:264 @@ -112032,22 +111957,6 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; SI-LABEL: bitcast_v16f64_to_v128i8: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 @@ -112180,6 +112089,22 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; kill: killed $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], 
s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr42 @@ -112211,14 +112136,13 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; kill: killed $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB72_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_alignbit_b32 v33, v32, v31, 24 ; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -112449,7 +112373,6 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB72_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_f64 v[31:32], v[31:32], 1.0 ; SI-NEXT: v_add_f64 v[29:30], v[29:30], 1.0 ; SI-NEXT: v_alignbit_b32 v33, v32, v31, 24 @@ -112827,8 +112750,24 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v2, 0xff, v11 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload @@ 
-113206,44 +113145,12 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v16f64_to_v128i8: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 @@ -113346,6 +113253,22 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr39 ; VI-NEXT: ; kill: killed $vgpr39 ; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 
offset:72 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; VI-NEXT: ; implicit-def: $vgpr57 ; VI-NEXT: ; kill: killed $vgpr39 ; VI-NEXT: ; implicit-def: $vgpr39 @@ -113448,132 +113371,132 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v28 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v28 +; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[31:32] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v27 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v27 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v26 +; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[29:30] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v26 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v25 +; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[27:28] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v25 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v24 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v24 +; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[25:26] ; VI-NEXT: 
buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v24 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v23 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v23 +; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[23:24] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v22 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v22 +; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[21:22] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v21 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v21 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v20 +; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[19:20] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v20 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v20 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v19 +; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[17:18] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v19 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v18 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v18 +; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[15:16] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v18 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v17 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v17 +; 
VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[13:14] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v16 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v16 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v16 +; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[11:12] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v15 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v15 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v14 +; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[9:10] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v14 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v14 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v13 +; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[7:8] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v13 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v12 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v12 +; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[5:6] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v12 -; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[31:32] +; VI-NEXT: v_lshrrev_b64 v[40:41], 24, v[3:4] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v11 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[29:30] -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[27:28] -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[25:26] -; VI-NEXT: 
buffer_store_dword v39, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[23:24] -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[21:22] -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[19:20] -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[17:18] -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[15:16] -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[13:14] -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[11:12] -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[9:10] -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[7:8] -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[5:6] -; VI-NEXT: v_lshrrev_b64 v[40:41], 24, v[3:4] ; VI-NEXT: v_lshrrev_b64 v[41:42], 24, v[1:2] ; VI-NEXT: v_lshrrev_b32_e32 v56, 24, v28 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v57, 8, v11 ; VI-NEXT: v_lshrrev_b32_e32 v49, 24, v10 ; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v10 @@ -114009,9 +113932,10 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: v_or_b32_sdwa v1, v21, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -114151,17 +114075,7 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x78, v0 ; 
VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v42 -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7c, v0 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; VI-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload @@ -114178,12 +114092,26 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7c, v0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: bitcast_v16f64_to_v128i8: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill @@ -114200,9 +114128,6 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; GFX9-NEXT: ; implicit-def: $vgpr41 ; GFX9-NEXT: ; kill: killed $vgpr41 ; GFX9-NEXT: ; implicit-def: $vgpr41 @@ -114335,7 +114260,6 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr48 ; GFX9-NEXT: ; kill: killed $vgpr41 ; GFX9-NEXT: ; implicit-def: $vgpr41 
-; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill @@ -114395,7 +114319,7 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(31) +; GFX9-NEXT: s_waitcnt vmcnt(47) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 ; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -114408,7 +114332,7 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v32 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(33) +; GFX9-NEXT: s_waitcnt vmcnt(49) ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v31 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v31 @@ -114416,152 +114340,151 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v30 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v30 +; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[31:32] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v30 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v29 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v29 +; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[29:30] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v28 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v28 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v28 +; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[27:28] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v27 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v27 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v26 +; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[25:26] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 
offset:256 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v26 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v25 +; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[23:24] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v25 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v24 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v24 +; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[21:22] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v24 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v23 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v23 +; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[19:20] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v22 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v22 +; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[17:18] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v21 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v21 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v20 +; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[15:16] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v20 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v20 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword 
v42, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v19 +; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[13:14] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v19 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v18 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v18 +; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[11:12] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v18 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v17 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v17 +; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[9:10] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v16 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v16 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v15 +; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[7:8] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v14 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v14 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v13 +; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[5:6] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v12 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v12 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v11 +; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[3:4] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 
v33, 24, v10 -; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[31:32] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v10 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[29:30] -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[27:28] -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[25:26] -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[23:24] -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[21:22] -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[19:20] -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[17:18] -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[15:16] -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[13:14] -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[11:12] -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[9:10] -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[7:8] -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[5:6] -; GFX9-NEXT: buffer_store_dword v41, off, 
s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[3:4] ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill @@ -114571,6 +114494,7 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v13 ; GFX9-NEXT: v_lshrrev_b32_e32 v61, 8, v12 ; GFX9-NEXT: v_lshrrev_b32_e32 v53, 8, v11 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v55, 8, v10 ; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v9 ; GFX9-NEXT: v_lshrrev_b32_e32 v63, 8, v9 @@ -114599,7 +114523,7 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB72_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true -; GFX9-NEXT: s_waitcnt vmcnt(30) +; GFX9-NEXT: s_waitcnt vmcnt(46) ; GFX9-NEXT: v_add_f64 v[31:32], v[31:32], 1.0 ; GFX9-NEXT: v_add_f64 v[29:30], v[29:30], 1.0 ; GFX9-NEXT: v_add_f64 v[27:28], v[27:28], 1.0 @@ -114904,8 +114828,24 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:32 ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: s_waitcnt vmcnt(17) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -115170,22 +115110,6 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:124 -; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -115628,7 +115552,11 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; GFX11-FAKE16-LABEL: bitcast_v16f64_to_v128i8: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_clause 0x13 +; GFX11-FAKE16-NEXT: s_clause 0x2 +; GFX11-FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-FAKE16-NEXT: s_clause 0x13 ; 80-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:88 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:84 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:80 @@ -115649,10 +115577,6 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:20 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:16 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:12 -; GFX11-FAKE16-NEXT: s_clause 0x2 -; GFX11-FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:8 -; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 -; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr75 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr74 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 @@ -116272,7 +116196,7 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:80 ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:96 ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:112 -; GFX11-FAKE16-NEXT: s_clause 0x13 +; GFX11-FAKE16-NEXT: s_clause 0x13 ; 80-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:12 ; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:16 ; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:20 @@ -117056,6 +116980,11 @@ define inreg <128 x 
i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; SI-NEXT: v_mov_b32_e32 v33, s4 ; SI-NEXT: v_readlane_b32 s4, v61, 39 ; SI-NEXT: v_mov_b32_e32 v30, s4 +; SI-NEXT: v_mov_b32_e32 v29, s46 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v29, s98 ; SI-NEXT: v_readlane_b32 s4, v61, 40 ; SI-NEXT: v_mov_b32_e32 v34, s4 ; SI-NEXT: v_readlane_b32 s4, v61, 41 @@ -117148,6 +117077,10 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v25, s4 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v29, s96 ; SI-NEXT: v_readlane_b32 s4, v62, 0 ; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -117204,20 +117137,69 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v25, s4 -; SI-NEXT: v_mov_b32_e32 v29, s46 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v29, s98 -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v29, s96 +; SI-NEXT: v_readlane_b32 s4, v62, 14 +; SI-NEXT: v_mov_b32_e32 v60, s4 +; SI-NEXT: v_readlane_b32 s4, v62, 15 +; SI-NEXT: v_mov_b32_e32 v31, s4 +; SI-NEXT: v_readlane_b32 s4, v62, 16 +; SI-NEXT: v_mov_b32_e32 v32, s4 +; SI-NEXT: v_readlane_b32 s4, v62, 17 +; SI-NEXT: v_mov_b32_e32 v18, s5 +; SI-NEXT: v_mov_b32_e32 v46, s4 +; SI-NEXT: v_readlane_b32 s4, v61, 0 +; SI-NEXT: v_readlane_b32 s5, v61, 1 +; SI-NEXT: v_mov_b32_e32 v59, s17 +; SI-NEXT: v_mov_b32_e32 v58, s16 +; SI-NEXT: v_mov_b32_e32 v45, s19 +; SI-NEXT: v_mov_b32_e32 v44, s18 +; SI-NEXT: v_mov_b32_e32 v53, s21 +; SI-NEXT: v_mov_b32_e32 v52, s20 +; SI-NEXT: v_mov_b32_e32 v39, s23 +; SI-NEXT: v_mov_b32_e32 v38, s22 +; SI-NEXT: v_mov_b32_e32 v24, s25 +; SI-NEXT: v_mov_b32_e32 v23, s24 +; SI-NEXT: v_mov_b32_e32 v22, s27 +; SI-NEXT: v_mov_b32_e32 v21, s26 +; SI-NEXT: v_mov_b32_e32 v20, s29 +; SI-NEXT: v_mov_b32_e32 v19, s28 +; SI-NEXT: v_mov_b32_e32 v16, s7 +; SI-NEXT: v_mov_b32_e32 v15, s6 +; SI-NEXT: v_mov_b32_e32 v14, s9 ; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v29, s86 +; SI-NEXT: v_mov_b32_e32 v13, s8 +; SI-NEXT: v_mov_b32_e32 v12, s11 +; SI-NEXT: v_mov_b32_e32 v11, s10 +; SI-NEXT: v_mov_b32_e32 v10, s13 +; SI-NEXT: v_mov_b32_e32 v9, s12 +; SI-NEXT: v_mov_b32_e32 v8, s15 +; SI-NEXT: v_mov_b32_e32 v7, s14 +; SI-NEXT: v_mov_b32_e32 v6, 
s41 +; SI-NEXT: v_mov_b32_e32 v5, s40 +; SI-NEXT: v_mov_b32_e32 v4, s43 +; SI-NEXT: v_mov_b32_e32 v3, s42 +; SI-NEXT: v_mov_b32_e32 v2, s45 +; SI-NEXT: v_mov_b32_e32 v1, s44 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v28, s38 +; SI-NEXT: v_mov_b32_e32 v27, s36 +; SI-NEXT: v_mov_b32_e32 v26, s34 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v25, s30 +; SI-NEXT: v_mov_b32_e32 v56, s94 +; SI-NEXT: v_mov_b32_e32 v55, s92 +; SI-NEXT: v_mov_b32_e32 v54, s90 +; SI-NEXT: v_mov_b32_e32 v42, s88 +; SI-NEXT: v_mov_b32_e32 v41, s78 +; SI-NEXT: v_mov_b32_e32 v40, s76 +; SI-NEXT: v_mov_b32_e32 v50, s74 +; SI-NEXT: v_mov_b32_e32 v49, s72 +; SI-NEXT: v_mov_b32_e32 v48, s62 +; SI-NEXT: v_mov_b32_e32 v47, s60 +; SI-NEXT: v_mov_b32_e32 v36, s58 +; SI-NEXT: v_mov_b32_e32 v35, s56 ; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) @@ -117260,165 +117242,108 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; SI-NEXT: v_mov_b32_e32 v29, s50 ; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; SI-NEXT: v_readlane_b32 s4, v62, 14 -; SI-NEXT: v_mov_b32_e32 v60, s4 -; SI-NEXT: v_readlane_b32 s4, v62, 15 -; SI-NEXT: v_mov_b32_e32 v31, s4 -; SI-NEXT: v_readlane_b32 s4, v62, 16 -; SI-NEXT: v_mov_b32_e32 v32, s4 -; SI-NEXT: v_readlane_b32 s4, v62, 17 -; SI-NEXT: v_mov_b32_e32 v18, s5 -; SI-NEXT: v_mov_b32_e32 v46, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 0 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v29, s4 +; SI-NEXT: v_readlane_b32 s4, v61, 2 +; SI-NEXT: v_readlane_b32 s5, v61, 3 ; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; SI-NEXT: v_readlane_b32 s5, v61, 1 -; SI-NEXT: v_readlane_b32 s4, v61, 2 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v29, s4 +; SI-NEXT: v_readlane_b32 s4, v61, 4 +; SI-NEXT: v_readlane_b32 s5, v61, 5 ; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: v_readlane_b32 s5, v61, 3 -; SI-NEXT: v_readlane_b32 s4, v61, 4 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v29, s4 +; SI-NEXT: v_readlane_b32 s4, v61, 6 +; SI-NEXT: v_readlane_b32 s5, v61, 7 ; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; SI-NEXT: v_readlane_b32 s5, v61, 5 -; SI-NEXT: v_readlane_b32 s4, v61, 6 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v29, s4 +; SI-NEXT: v_readlane_b32 s4, v61, 8 +; SI-NEXT: v_readlane_b32 s5, v61, 9 ; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: v_readlane_b32 s5, v61, 7 -; SI-NEXT: v_readlane_b32 s4, v61, 8 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v29, s4 +; SI-NEXT: v_readlane_b32 s4, v61, 10 +; SI-NEXT: v_readlane_b32 s5, v61, 11 ; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 
offset:268 ; 4-byte Folded Spill -; SI-NEXT: v_readlane_b32 s5, v61, 9 -; SI-NEXT: v_readlane_b32 s4, v61, 10 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v29, s4 +; SI-NEXT: v_readlane_b32 s4, v61, 12 +; SI-NEXT: v_readlane_b32 s5, v61, 13 ; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: v_readlane_b32 s5, v61, 11 -; SI-NEXT: v_readlane_b32 s4, v61, 12 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v29, s4 +; SI-NEXT: v_readlane_b32 s4, v61, 14 +; SI-NEXT: v_readlane_b32 s5, v61, 15 ; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: v_readlane_b32 s5, v61, 13 -; SI-NEXT: v_readlane_b32 s4, v61, 14 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v29, s4 +; SI-NEXT: v_readlane_b32 s4, v61, 16 +; SI-NEXT: v_readlane_b32 s5, v61, 17 ; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: v_readlane_b32 s5, v61, 15 -; SI-NEXT: v_readlane_b32 s4, v61, 16 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v29, s4 +; SI-NEXT: v_readlane_b32 s4, v61, 18 +; SI-NEXT: v_readlane_b32 s5, v61, 19 ; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: v_readlane_b32 s5, v61, 17 -; SI-NEXT: v_readlane_b32 s4, v61, 18 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v29, s4 +; SI-NEXT: v_readlane_b32 s4, v61, 20 +; SI-NEXT: v_readlane_b32 s5, v61, 21 ; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: v_readlane_b32 s5, v61, 19 -; SI-NEXT: v_readlane_b32 s4, v61, 20 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v29, s4 +; SI-NEXT: v_readlane_b32 s4, v61, 22 +; SI-NEXT: v_readlane_b32 s5, v61, 23 ; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: v_readlane_b32 s5, v61, 21 -; SI-NEXT: v_readlane_b32 s4, v61, 22 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v29, s4 +; SI-NEXT: v_readlane_b32 s4, v61, 24 +; SI-NEXT: v_readlane_b32 s5, v61, 25 ; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: v_readlane_b32 s5, v61, 23 -; SI-NEXT: v_readlane_b32 s4, v61, 24 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v29, s4 +; SI-NEXT: v_readlane_b32 s4, v61, 26 +; SI-NEXT: v_readlane_b32 s5, v61, 27 ; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: v_readlane_b32 s5, v61, 25 -; SI-NEXT: v_readlane_b32 s4, v61, 26 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v29, s4 +; SI-NEXT: v_readlane_b32 s4, v61, 28 +; SI-NEXT: v_readlane_b32 s5, v61, 29 ; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: 
v_readlane_b32 s5, v61, 27 -; SI-NEXT: v_readlane_b32 s4, v61, 28 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v29, s4 +; SI-NEXT: v_readlane_b32 s4, v61, 30 +; SI-NEXT: v_readlane_b32 s5, v61, 31 ; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: v_readlane_b32 s5, v61, 29 -; SI-NEXT: v_readlane_b32 s4, v61, 30 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v29, s4 +; SI-NEXT: v_readlane_b32 s4, v61, 32 +; SI-NEXT: v_readlane_b32 s5, v61, 33 ; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v29, s48 ; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: v_readlane_b32 s5, v61, 31 -; SI-NEXT: v_readlane_b32 s4, v61, 32 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v29, s4 -; SI-NEXT: v_mov_b32_e32 v59, s17 -; SI-NEXT: v_mov_b32_e32 v58, s16 -; SI-NEXT: v_mov_b32_e32 v45, s19 -; SI-NEXT: v_mov_b32_e32 v44, s18 -; SI-NEXT: v_mov_b32_e32 v53, s21 -; SI-NEXT: v_mov_b32_e32 v52, s20 -; SI-NEXT: v_mov_b32_e32 v39, s23 -; SI-NEXT: v_mov_b32_e32 v38, s22 -; SI-NEXT: v_mov_b32_e32 v24, s25 -; SI-NEXT: v_mov_b32_e32 v23, s24 -; SI-NEXT: v_mov_b32_e32 v22, s27 -; SI-NEXT: v_mov_b32_e32 v21, s26 -; SI-NEXT: v_mov_b32_e32 v20, s29 -; SI-NEXT: v_mov_b32_e32 v19, s28 -; SI-NEXT: v_mov_b32_e32 v16, s7 -; SI-NEXT: v_mov_b32_e32 v15, s6 -; SI-NEXT: v_mov_b32_e32 v14, s9 -; SI-NEXT: v_mov_b32_e32 v13, s8 -; SI-NEXT: v_mov_b32_e32 v12, s11 -; SI-NEXT: v_mov_b32_e32 v11, s10 -; SI-NEXT: v_mov_b32_e32 v10, s13 -; SI-NEXT: v_mov_b32_e32 v9, s12 -; SI-NEXT: v_mov_b32_e32 v8, s15 -; SI-NEXT: v_mov_b32_e32 v7, s14 -; SI-NEXT: v_mov_b32_e32 v6, s41 -; SI-NEXT: v_mov_b32_e32 v5, s40 -; SI-NEXT: v_mov_b32_e32 v4, s43 -; SI-NEXT: v_mov_b32_e32 v3, s42 -; SI-NEXT: v_mov_b32_e32 v2, s45 -; SI-NEXT: v_mov_b32_e32 v1, s44 -; SI-NEXT: v_mov_b32_e32 v28, s38 -; SI-NEXT: v_mov_b32_e32 v27, s36 -; SI-NEXT: v_mov_b32_e32 v26, s34 -; SI-NEXT: v_mov_b32_e32 v25, s30 -; SI-NEXT: v_mov_b32_e32 v56, s94 -; SI-NEXT: v_mov_b32_e32 v55, s92 -; SI-NEXT: v_mov_b32_e32 v54, s90 -; SI-NEXT: v_mov_b32_e32 v42, s88 -; SI-NEXT: v_mov_b32_e32 v41, s78 -; SI-NEXT: v_mov_b32_e32 v40, s76 -; SI-NEXT: v_mov_b32_e32 v50, s74 -; SI-NEXT: v_mov_b32_e32 v49, s72 -; SI-NEXT: v_mov_b32_e32 v48, s62 -; SI-NEXT: v_mov_b32_e32 v47, s60 -; SI-NEXT: v_mov_b32_e32 v36, s58 -; SI-NEXT: v_mov_b32_e32 v35, s56 -; SI-NEXT: v_readlane_b32 s5, v61, 33 ; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: .LBB73_5: ; %end @@ -117711,9 +117636,9 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; SI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 ; SI-NEXT: v_or_b32_e32 v17, v17, v18 ; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v19 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v18, 0xff, v18 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 @@ -118002,15 +117927,6 @@ define 
inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v2 ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v43 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v51 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v37 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -118024,6 +117940,15 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v51 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v37 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload @@ -118690,6 +118615,10 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; VI-NEXT: v_mov_b32_e32 v35, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 11 ; VI-NEXT: v_mov_b32_e32 v41, s4 +; VI-NEXT: v_mov_b32_e32 v40, s48 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v40, s38 ; VI-NEXT: v_readlane_b32 s4, v62, 12 ; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v35, s4 @@ -118727,6 +118656,9 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; VI-NEXT: v_mov_b32_e32 v35, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 25 ; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v40, s36 ; VI-NEXT: v_mov_b32_e32 v35, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 26 ; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill @@ -118764,6 +118696,9 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; VI-NEXT: v_readlane_b32 s4, v62, 37 ; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v35, s4 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v40, s34 ; VI-NEXT: v_readlane_b32 s4, v62, 38 ; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:360 ; 4-byte 
Folded Spill ; VI-NEXT: v_mov_b32_e32 v35, s4 @@ -118779,52 +118714,6 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; VI-NEXT: v_readlane_b32 s4, v62, 42 ; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v35, s4 -; VI-NEXT: v_mov_b32_e32 v40, s48 -; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v40, s38 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v40, s36 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v40, s34 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v40, s30 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v40, s90 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v40, s88 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v40, s78 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v40, s76 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v40, s74 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v40, s72 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v40, s62 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v40, s60 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v40, s58 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v40, s56 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; VI-NEXT: v_readlane_b32 s4, v62, 43 ; VI-NEXT: 
v_mov_b32_e32 v53, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 44 @@ -118834,6 +118723,7 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; VI-NEXT: v_readlane_b32 s4, v62, 46 ; VI-NEXT: v_mov_b32_e32 v58, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 47 +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v35, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 48 ; VI-NEXT: v_mov_b32_e32 v54, s4 @@ -118846,17 +118736,17 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; VI-NEXT: v_readlane_b32 s4, v62, 52 ; VI-NEXT: v_mov_b32_e32 v39, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 53 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v40, s30 ; VI-NEXT: v_mov_b32_e32 v49, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 54 ; VI-NEXT: v_mov_b32_e32 v61, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 55 ; VI-NEXT: v_mov_b32_e32 v36, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 56 -; VI-NEXT: v_mov_b32_e32 v40, s46 ; VI-NEXT: v_mov_b32_e32 v55, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 57 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v12, s5 ; VI-NEXT: v_mov_b32_e32 v1, s44 ; VI-NEXT: v_mov_b32_e32 v2, s45 @@ -118886,13 +118776,48 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; VI-NEXT: v_mov_b32_e32 v28, s21 ; VI-NEXT: v_mov_b32_e32 v29, s18 ; VI-NEXT: v_mov_b32_e32 v30, s19 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v40, s90 ; VI-NEXT: v_mov_b32_e32 v31, s16 ; VI-NEXT: v_mov_b32_e32 v32, s17 ; VI-NEXT: v_mov_b32_e32 v42, s70 ; VI-NEXT: v_mov_b32_e32 v50, s4 -; VI-NEXT: v_mov_b32_e32 v40, v43 ; VI-NEXT: v_mov_b32_e32 v46, v38 ; VI-NEXT: v_mov_b32_e32 v38, v34 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v40, s88 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v40, s78 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v40, s76 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v40, s74 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v40, s72 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v40, s62 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; 
VI-NEXT: v_mov_b32_e32 v40, s60 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v40, s58 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v40, s56 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v40, s46 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v40, v43 ; VI-NEXT: .LBB73_5: ; %end ; VI-NEXT: v_lshlrev_b32_e32 v34, 8, v42 ; VI-NEXT: v_or_b32_sdwa v31, v31, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -119291,10 +119216,7 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v2, v41, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -119309,6 +119231,9 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; VI-NEXT: s_or_saveexec_b64 s[4:5], -1 ; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload @@ -119906,6 +119831,12 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; GFX9-NEXT: ; implicit-def: $sgpr46 ; GFX9-NEXT: s_branch .LBB73_2 ; GFX9-NEXT: .LBB73_4: +; GFX9-NEXT: v_mov_b32_e32 v41, s66 +; GFX9-NEXT: v_mov_b32_e32 v40, s36 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s34 ; GFX9-NEXT: v_mov_b32_e32 v15, s81 ; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v15, s71 @@ -119982,6 +119913,10 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; 
GFX9-NEXT: v_mov_b32_e32 v15, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 9 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s30 ; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v15, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 10 @@ -120040,71 +119975,10 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; GFX9-NEXT: v_readlane_b32 s4, v62, 28 ; GFX9-NEXT: v_mov_b32_e32 v29, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 29 -; GFX9-NEXT: v_mov_b32_e32 v41, s66 ; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v15, s4 -; GFX9-NEXT: v_mov_b32_e32 v40, s36 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v40, s34 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v40, s30 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v40, s94 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v40, s92 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v40, s90 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v40, s88 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v40, s78 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v40, s76 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v40, s74 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v40, s72 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 
v40, s62 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v40, s60 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v40, s58 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v40, s56 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; GFX9-NEXT: v_readlane_b32 s4, v62, 30 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v15, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 31 ; GFX9-NEXT: v_mov_b32_e32 v44, s4 @@ -120119,6 +119993,10 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; GFX9-NEXT: v_readlane_b32 s4, v62, 36 ; GFX9-NEXT: v_mov_b32_e32 v55, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 37 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s94 ; GFX9-NEXT: v_mov_b32_e32 v61, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 38 ; GFX9-NEXT: v_mov_b32_e32 v42, s4 @@ -120143,7 +120021,6 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; GFX9-NEXT: v_readlane_b32 s4, v62, 48 ; GFX9-NEXT: v_mov_b32_e32 v60, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 49 -; GFX9-NEXT: v_mov_b32_e32 v40, s46 ; GFX9-NEXT: v_mov_b32_e32 v12, s5 ; GFX9-NEXT: v_mov_b32_e32 v1, s44 ; GFX9-NEXT: v_mov_b32_e32 v2, s45 @@ -120181,6 +120058,54 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; GFX9-NEXT: v_mov_b32_e32 v54, s64 ; GFX9-NEXT: v_mov_b32_e32 v52, s54 ; GFX9-NEXT: v_mov_b32_e32 v25, s4 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s92 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s90 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s88 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s78 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s76 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:260 ; 4-byte Folded 
Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s74 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s72 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s62 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s60 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s58 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s56 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s46 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill @@ -120202,6 +120127,8 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; GFX9-NEXT: v_or_b32_sdwa v25, v51, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b32_e32 v51, 8, v45 ; GFX9-NEXT: v_or_b32_sdwa v48, v48, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload ; GFX9-NEXT: v_lshlrev_b32_e32 v40, 8, v56 ; GFX9-NEXT: v_or_b32_sdwa v50, v50, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b32_e32 v22, 8, v22 @@ -120252,46 +120179,45 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; GFX9-NEXT: v_readlane_b32 s31, v63, 1 ; GFX9-NEXT: v_readlane_b32 s30, v63, 0 ; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_lshlrev_b32_e32 v36, 8, v36 ; GFX9-NEXT: v_or_b32_sdwa v27, v27, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b32_e32 v36, 8, v44 ; GFX9-NEXT: v_or_b32_sdwa v28, v28, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b32_e32 v29, 8, v29 ; GFX9-NEXT: v_or_b32_sdwa v29, v19, v29 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v30 ; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload ; GFX9-NEXT: v_or_b32_sdwa v20, v20, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v51 +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b32_e32 v36, 8, v36 ; GFX9-NEXT: v_or_b32_sdwa v23, v23, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v19, v30, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v19, v50, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen ; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v59 ; GFX9-NEXT: v_or_b32_sdwa v19, v60, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v19, v25, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:4 -; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v49 ; GFX9-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:12 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v19, v25, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v19, v48, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:8 ; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v48 +; GFX9-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:8 ; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v17, v19, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -120305,9 +120231,11 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_lshlrev_b32_e32 v16, 8, v16 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v16, v35, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 
src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:24 @@ -120319,9 +120247,23 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(16) ; GFX9-NEXT: v_lshlrev_b32_e32 v16, 8, v16 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(14) ; GFX9-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v16, v33, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:32 @@ -120343,10 +120285,7 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; GFX9-NEXT: v_or_b32_sdwa v16, v53, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v16, v32, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:44 -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b32_e32 v16, 8, v18 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -120574,20 +120513,6 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:124 -; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded 
Reload -; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload @@ -120599,7 +120524,7 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_or_saveexec_b32 s4, -1 -; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: s_clause 0x3 ; 16-byte Folded Spill ; GFX11-NEXT: scratch_store_b32 off, v76, s32 offset:80 ; GFX11-NEXT: scratch_store_b32 off, v77, s32 offset:84 ; GFX11-NEXT: scratch_store_b32 off, v78, s32 offset:88 @@ -120634,7 +120559,7 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; GFX11-NEXT: v_writelane_b32 v77, s101, 5 ; GFX11-NEXT: s_mov_b32 vcc_hi, 0 ; GFX11-NEXT: s_and_b32 s42, vcc_lo, exec_lo -; GFX11-NEXT: s_clause 0x13 +; GFX11-NEXT: s_clause 0x13 ; 80-byte Folded Spill ; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:76 ; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:72 ; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:68 @@ -121542,7 +121467,7 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; GFX11-NEXT: scratch_store_b128 v0, v[11:14], off offset:80 ; GFX11-NEXT: scratch_store_b128 v0, v[7:10], off offset:96 ; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:112 -; GFX11-NEXT: s_clause 0x13 +; GFX11-NEXT: s_clause 0x13 ; 80-byte Folded Reload ; GFX11-NEXT: scratch_load_b32 v75, off, s32 ; GFX11-NEXT: scratch_load_b32 v74, off, s32 offset:4 ; GFX11-NEXT: scratch_load_b32 v73, off, s32 offset:8 @@ -121605,7 +121530,7 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; GFX11-NEXT: v_readlane_b32 s31, v76, 1 ; GFX11-NEXT: v_readlane_b32 s30, v76, 0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: s_clause 0x3 ; 16-byte Folded Reload ; GFX11-NEXT: scratch_load_b32 v76, off, s32 offset:80 ; GFX11-NEXT: scratch_load_b32 v77, off, s32 offset:84 ; GFX11-NEXT: scratch_load_b32 v78, off, s32 offset:88 @@ -121818,13 +121743,26 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:188 -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 
offset:36 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:52 +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:196 @@ -121985,44 +121923,30 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:356 ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v2 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:384 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:380 ; SI-NEXT: v_lshlrev_b32_e32 v43, 8, v3 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:364 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:372 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:384 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:380 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:108 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100 -; 
SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:84 -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:52 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB74_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload @@ -122031,11 +121955,11 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_and_b32_e32 v9, 0xff, v49 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload @@ -122638,7 +122562,6 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB74_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload @@ -122652,8 +122575,8 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v49 ; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 @@ -123333,13 +123256,25 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:208 ; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:216 ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188 -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36 +; VI-NEXT: 
buffer_load_ushort v60, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:108 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92 +; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84 +; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76 +; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68 +; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52 +; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196 ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 @@ -123476,34 +123411,20 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b16_e32 v63, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:372 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 -; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b16_e32 v63, 8, v0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44 -; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36 -; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28 -; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:108 -; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100 -; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92 -; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84 -; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76 -; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68 -; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:60 
-; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -123989,7 +123910,6 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(13) ; VI-NEXT: v_add_u16_e32 v9, 3, v61 ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload @@ -124567,13 +124487,27 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:208 ; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:216 ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188 -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:108 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100 +; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92 +; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84 +; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76 +; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68 +; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52 +; GFX9-NEXT: s_waitcnt vmcnt(18) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: s_waitcnt vmcnt(18) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(16) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 @@ -124715,34 +124649,20 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v63, 8, v0 ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364 -; 
GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:372 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 -; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v63, 8, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44 -; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36 -; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28 -; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:20 -; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:108 -; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100 -; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92 -; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84 -; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76 -; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68 -; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:60 -; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -125229,7 +125149,6 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(13) ; GFX9-NEXT: v_add_u16_e32 v9, 3, v61 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload @@ -126368,7 +126287,7 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v16f64: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:592 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:588 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:584 @@ -126401,7 +126320,7 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:476 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:472 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:468 -; GFX11-FAKE16-NEXT: s_clause 0x12 +; GFX11-FAKE16-NEXT: s_clause 0x12 ; 76-byte Folded Spill ; GFX11-FAKE16-NEXT: 
scratch_store_b32 off, v104, s32 offset:464 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:460 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:456 @@ -127342,7 +127261,7 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v35, v36 ; GFX11-FAKE16-NEXT: .LBB74_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v138, off, s32 offset:392 ; GFX11-FAKE16-NEXT: scratch_load_b32 v137, off, s32 offset:396 ; GFX11-FAKE16-NEXT: scratch_load_b32 v136, off, s32 offset:400 @@ -127375,7 +127294,7 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:508 ; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:512 ; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:516 -; GFX11-FAKE16-NEXT: s_clause 0x12 +; GFX11-FAKE16-NEXT: s_clause 0x12 ; 76-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:520 ; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:524 ; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:528 @@ -128092,24 +128011,13 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; SI-NEXT: s_mov_b64 s[4:5], 0 ; SI-NEXT: s_branch .LBB75_3 ; SI-NEXT: .LBB75_2: -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload ; SI-NEXT: v_mov_b32_e32 v55, v56 ; SI-NEXT: v_mov_b32_e32 v42, v46 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(4) ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload @@ -128120,10 +128028,22 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: 
buffer_load_dword v50, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 s[4:5], -1 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: .LBB75_3: ; %Flow -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_mov_b32_e32 v35, v57 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload @@ -128133,7 +128053,6 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; SI-NEXT: ; %bb.4: ; %cmp.true ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v44 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 @@ -128728,13 +128647,13 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v19 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v21 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v3 ; VI-NEXT: v_lshlrev_b32_e32 v16, 8, v5 ; VI-NEXT: v_lshlrev_b32_e32 v47, 8, v7 ; VI-NEXT: v_lshlrev_b32_e32 v46, 8, v9 ; VI-NEXT: v_lshlrev_b32_e32 v10, 8, v11 ; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v13 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v17 ; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 @@ -128962,11 +128881,11 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_or_b32_sdwa v0, v2, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v4, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v2, v6, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 
src1_sel:DWORD +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload ; VI-NEXT: s_and_b32 s4, s28, 0xff ; VI-NEXT: s_lshl_b32 s5, s29, 8 ; VI-NEXT: s_or_b32 s4, s4, s5 @@ -128976,11 +128895,8 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; VI-NEXT: s_lshl_b32 s7, s23, 8 ; VI-NEXT: s_lshl_b32 s8, s27, 8 ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v3, v3, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v0, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v1, v1, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload @@ -128988,6 +128904,8 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v3, v3, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -129196,12 +129114,6 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; VI-NEXT: s_mov_b64 s[4:5], 0 ; VI-NEXT: s_branch .LBB75_3 ; VI-NEXT: .LBB75_2: -; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v44, v56 ; VI-NEXT: v_mov_b32_e32 v41, v33 ; VI-NEXT: v_mov_b32_e32 v50, v40 @@ -129219,6 +129131,12 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:620 ; 4-byte Folded 
Reload ; VI-NEXT: v_mov_b32_e32 v54, v53 ; VI-NEXT: v_mov_b32_e32 v52, v36 ; VI-NEXT: v_mov_b32_e32 v49, v51 @@ -129228,7 +129146,7 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; VI-NEXT: v_mov_b32_e32 v51, v41 ; VI-NEXT: v_mov_b32_e32 v36, v44 ; VI-NEXT: v_mov_b32_e32 v53, v54 -; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_mov_b32_e32 v54, v60 ; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload @@ -129241,7 +129159,6 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; VI-NEXT: ; %bb.4: ; %cmp.true ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v37 ; VI-NEXT: s_add_i32 s28, s28, 3 ; VI-NEXT: s_and_b32 s4, s28, 0xff @@ -129826,8 +129743,8 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v5 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b32_e32 v24, 8, v11 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v25, 8, v9 ; GFX9-NEXT: s_waitcnt vmcnt(5) @@ -130006,16 +129923,18 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: s_lshl_b32 s6, s19, 8 ; GFX9-NEXT: s_lshl_b32 s7, s23, 8 ; GFX9-NEXT: s_lshl_b32 s8, s27, 8 -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload @@ -130042,9 +129961,8 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload 
-; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -130060,14 +129978,16 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; GFX9-NEXT: v_or_b32_sdwa v1, v13, v41 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v0, v55, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v0, v21, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload @@ -130079,10 +129999,11 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_or_b32_sdwa v0, v16, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_mov_b32_e32 v61, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -130095,10 +130016,12 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: v_or_b32_sdwa v17, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:576 ; 
4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_mov_b32_e32 v37, v0 ; GFX9-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v18, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload @@ -130112,17 +130035,22 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: v_or_b32_sdwa v19, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v53, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v1, v50, v53 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v20, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload ; GFX9-NEXT: v_or_b32_sdwa v0, v32, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v1, v52, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v21, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload ; GFX9-NEXT: v_or_b32_sdwa v1, v51, v59 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v22, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v58, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -130138,45 +130066,24 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: v_mov_b32_e32 v35, v62 ; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 
offset:584 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload ; GFX9-NEXT: v_or_b32_sdwa v24, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(11) -; GFX9-NEXT: v_or_b32_sdwa v0, v62, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(10) ; GFX9-NEXT: v_or_b32_sdwa v1, v63, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v62, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v25, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(9) ; GFX9-NEXT: v_or_b32_sdwa v0, v54, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(7) ; GFX9-NEXT: v_or_b32_sdwa v1, v52, v43 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v26, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v33, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_or_b32_sdwa v1, v32, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_or_b32_sdwa v0, v44, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_or_b32_sdwa v1, v50, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_or_b32_sdwa v0, v48, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v1, v55, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v49, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v39, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_mov_b32_e32 v40, v30 ; GFX9-NEXT: v_or_b32_sdwa v30, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -130227,18 +130134,6 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: s_branch .LBB75_3 ; GFX9-NEXT: .LBB75_2: -; GFX9-NEXT: v_mov_b32_e32 v38, v51 -; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; 
GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; GFX9-NEXT: v_mov_b32_e32 v33, v43 ; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload @@ -130252,6 +130147,18 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v38, v51 +; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; GFX9-NEXT: v_mov_b32_e32 v35, v62 ; GFX9-NEXT: v_mov_b32_e32 v36, v31 ; GFX9-NEXT: v_mov_b32_e32 v40, v30 @@ -130689,7 +130596,7 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX11-TRUE16-LABEL: bitcast_v128i8_to_v16f64_scalar: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:476 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:472 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:468 @@ -130722,7 +130629,7 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:360 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:356 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:352 -; GFX11-TRUE16-NEXT: s_clause 0x7 +; GFX11-TRUE16-NEXT: s_clause 0x7 ; 32-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:348 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:344 ; 
GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:340 @@ -131579,7 +131486,7 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-TRUE16-NEXT: .LBB75_3: ; %end -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:320 ; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:324 ; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:328 @@ -131612,7 +131519,7 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:436 ; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:440 ; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:444 -; GFX11-TRUE16-NEXT: s_clause 0x7 +; GFX11-TRUE16-NEXT: s_clause 0x7 ; 32-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:448 ; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:452 ; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:456 @@ -131630,7 +131537,7 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v16f64_scalar: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:476 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:472 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:468 @@ -131663,7 +131570,7 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:360 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:356 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:352 -; GFX11-FAKE16-NEXT: s_clause 0x7 +; GFX11-FAKE16-NEXT: s_clause 0x7 ; 32-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:348 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:344 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:340 @@ -132520,7 +132427,7 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-FAKE16-NEXT: .LBB75_3: ; %end -; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v111, off, s32 offset:320 ; GFX11-FAKE16-NEXT: scratch_load_b32 v110, off, s32 offset:324 ; GFX11-FAKE16-NEXT: scratch_load_b32 v109, off, s32 offset:328 @@ -132553,7 +132460,7 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:436 ; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:440 ; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:444 -; GFX11-FAKE16-NEXT: s_clause 0x7 +; GFX11-FAKE16-NEXT: s_clause 0x7 ; 32-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:448 ; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:452 ; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:456 @@ -132588,22 +132495,6 @@ define <64 x bfloat> 
@bitcast_v16f64_to_v64bf16(<16 x double> %a, i32 %b) { ; SI-LABEL: bitcast_v16f64_to_v64bf16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 @@ -132672,6 +132563,22 @@ define <64 x bfloat> @bitcast_v16f64_to_v64bf16(<16 x double> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; kill: killed $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr62 ; SI-NEXT: ; implicit-def: $vgpr63 @@ -132703,7 +132610,7 @@ define <64 x bfloat> @bitcast_v16f64_to_v64bf16(<16 x double> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; kill: killed $vgpr35 ; SI-NEXT: ; 
implicit-def: $vgpr35 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -132713,7 +132620,7 @@ define <64 x bfloat> @bitcast_v16f64_to_v64bf16(<16 x double> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v35, 0xffff0000, v32 ; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 ; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0) +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v31 ; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 ; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill @@ -132843,7 +132750,6 @@ define <64 x bfloat> @bitcast_v16f64_to_v64bf16(<16 x double> %a, i32 %b) { ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB76_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_f64 v[31:32], v[31:32], 1.0 ; SI-NEXT: v_add_f64 v[29:30], v[29:30], 1.0 ; SI-NEXT: v_and_b32_e32 v35, 0xffff0000, v32 @@ -133081,10 +132987,25 @@ define <64 x bfloat> @bitcast_v16f64_to_v64bf16(<16 x double> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 @@ -133263,22 +133184,6 @@ define <64 x bfloat> @bitcast_v16f64_to_v64bf16(<16 x double> %a, i32 %b) { ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; 
SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -133966,8 +133871,22 @@ define inreg <64 x bfloat> @bitcast_v16f64_to_v64bf16_scalar(<16 x double> inreg ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v43 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v40 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -134055,20 +133974,6 @@ define inreg <64 x bfloat> @bitcast_v16f64_to_v64bf16_scalar(<16 x double> inreg ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 
offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload @@ -136071,7 +135976,10 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-LABEL: bitcast_v64bf16_to_v16f64: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: s_clause 0xf +; GFX11-TRUE16-NEXT: s_clause 0x1 +; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:68 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:64 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:60 @@ -136088,9 +135996,6 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:16 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:12 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:8 -; GFX11-TRUE16-NEXT: s_clause 0x1 -; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 -; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v32 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 @@ -136656,7 +136561,7 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, v60 :: v_dual_mov_b32 v29, v61 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v30, v62 :: v_dual_mov_b32 v31, v63 -; GFX11-TRUE16-NEXT: s_clause 0xf +; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:8 ; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:12 ; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:16 @@ -139123,7 +139028,7 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:280 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:276 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:272 @@ -139156,7 +139061,7 @@ define inreg <16 x double> 
@bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:164 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:160 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:156 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:152 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:148 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:144 @@ -139189,7 +139094,7 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:36 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:32 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:28 -; GFX11-TRUE16-NEXT: s_clause 0x6 +; GFX11-TRUE16-NEXT: s_clause 0x6 ; 28-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:24 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:20 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:16 @@ -139991,7 +139896,7 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v172 :: v_dual_mov_b32 v21, v169 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, v170 :: v_dual_mov_b32 v23, v183 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, v168 :: v_dual_mov_b32 v25, v181 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v174, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_b32 v173, off, s32 offset:4 ; GFX11-TRUE16-NEXT: scratch_load_b32 v172, off, s32 offset:8 @@ -140024,7 +139929,7 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:116 ; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:120 ; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:124 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:128 ; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:132 ; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:136 @@ -140057,7 +139962,7 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:244 ; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:248 ; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:252 -; GFX11-TRUE16-NEXT: s_clause 0x6 +; GFX11-TRUE16-NEXT: s_clause 0x6 ; 28-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:256 ; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:260 ; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:264 @@ -140097,7 +140002,7 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 -; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:288 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:284 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:280 @@ -140130,7 +140035,7 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; GFX11-FAKE16-NEXT: 
scratch_store_b32 off, v93, s32 offset:172 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:168 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:164 -; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:160 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:156 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:152 @@ -140163,7 +140068,7 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v157, s32 offset:44 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v158, s32 offset:40 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v159, s32 offset:36 -; GFX11-FAKE16-NEXT: s_clause 0x8 +; GFX11-FAKE16-NEXT: s_clause 0x8 ; 36-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v168, s32 offset:32 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v169, s32 offset:28 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v170, s32 offset:24 @@ -140855,7 +140760,7 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, v184 :: v_dual_mov_b32 v23, v174 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, v171 :: v_dual_mov_b32 v25, v169 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, v170 :: v_dual_mov_b32 v29, v180 -; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v184, off, s32 ; GFX11-FAKE16-NEXT: scratch_load_b32 v175, off, s32 offset:4 ; GFX11-FAKE16-NEXT: scratch_load_b32 v174, off, s32 offset:8 @@ -140888,7 +140793,7 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; GFX11-FAKE16-NEXT: scratch_load_b32 v123, off, s32 offset:116 ; GFX11-FAKE16-NEXT: scratch_load_b32 v122, off, s32 offset:120 ; GFX11-FAKE16-NEXT: scratch_load_b32 v121, off, s32 offset:124 -; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v120, off, s32 offset:128 ; GFX11-FAKE16-NEXT: scratch_load_b32 v111, off, s32 offset:132 ; GFX11-FAKE16-NEXT: scratch_load_b32 v110, off, s32 offset:136 @@ -140921,7 +140826,7 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:244 ; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:248 ; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:252 -; GFX11-FAKE16-NEXT: s_clause 0x8 +; GFX11-FAKE16-NEXT: s_clause 0x8 ; 36-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:256 ; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:260 ; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:264 @@ -140978,22 +140883,6 @@ define <64 x half> @bitcast_v16f64_to_v64f16(<16 x double> %a, i32 %b) { ; SI-LABEL: bitcast_v16f64_to_v64f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: 
buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 @@ -141062,6 +140951,22 @@ define <64 x half> @bitcast_v16f64_to_v64f16(<16 x double> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; kill: killed $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr47 @@ -141093,7 +140998,7 @@ define <64 x half> @bitcast_v16f64_to_v64f16(<16 x double> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; kill: killed $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -141144,7 +141049,6 @@ define <64 x half> @bitcast_v16f64_to_v64f16(<16 x double> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v8, v52 ; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v32 -; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v31 ; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v28 ; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v22 @@ -141314,7 +141218,6 @@ define <64 x half> @bitcast_v16f64_to_v64f16(<16 x double> %a, i32 %b) { ; SI-NEXT: v_add_f64 
v[21:22], v[21:22], 1.0 ; SI-NEXT: v_add_f64 v[27:28], v[27:28], 1.0 ; SI-NEXT: v_add_f64 v[29:30], v[29:30], 1.0 -; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_add_f64 v[31:32], v[31:32], 1.0 ; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v8 ; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v9 @@ -141662,8 +141565,24 @@ define <64 x half> @bitcast_v16f64_to_v64f16(<16 x double> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen @@ -141712,22 +141631,6 @@ define <64 x half> @bitcast_v16f64_to_v64f16(<16 x double> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 
4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -142372,6 +142275,22 @@ define inreg <64 x half> @bitcast_v16f64_to_v64f16_scalar(<16 x double> inreg %a ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f16_f32_e32 v2, v14 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x74, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -142391,22 +142310,6 @@ define inreg <64 x half> @bitcast_v16f64_to_v64f16_scalar(<16 x double> inreg %a ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: 
.LBB81_4: @@ -144567,7 +144470,7 @@ define inreg <16 x double> @bitcast_v64f16_to_v16f64_scalar(<64 x half> inreg %a ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 -; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:292 ; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:288 ; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:284 @@ -144600,7 +144503,7 @@ define inreg <16 x double> @bitcast_v64f16_to_v16f64_scalar(<64 x half> inreg %a ; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:176 ; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:172 ; GFX11-NEXT: scratch_store_b32 off, v95, s32 offset:168 -; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:164 ; GFX11-NEXT: scratch_store_b32 off, v105, s32 offset:160 ; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:156 @@ -144633,7 +144536,7 @@ define inreg <16 x double> @bitcast_v64f16_to_v16f64_scalar(<64 x half> inreg %a ; GFX11-NEXT: scratch_store_b32 off, v157, s32 offset:48 ; GFX11-NEXT: scratch_store_b32 off, v158, s32 offset:44 ; GFX11-NEXT: scratch_store_b32 off, v159, s32 offset:40 -; GFX11-NEXT: s_clause 0x9 +; GFX11-NEXT: s_clause 0x9 ; 40-byte Folded Spill ; GFX11-NEXT: scratch_store_b32 off, v168, s32 offset:36 ; GFX11-NEXT: scratch_store_b32 off, v169, s32 offset:32 ; GFX11-NEXT: scratch_store_b32 off, v170, s32 offset:28 @@ -144711,7 +144614,7 @@ define inreg <16 x double> @bitcast_v64f16_to_v16f64_scalar(<64 x half> inreg %a ; GFX11-NEXT: v_dual_mov_b32 v19, v174 :: v_dual_mov_b32 v20, v173 ; GFX11-NEXT: v_dual_mov_b32 v21, v172 :: v_dual_mov_b32 v22, v171 ; GFX11-NEXT: v_dual_mov_b32 v23, v170 :: v_dual_mov_b32 v24, v183 -; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-NEXT: scratch_load_b32 v185, off, s32 ; GFX11-NEXT: scratch_load_b32 v184, off, s32 offset:4 ; GFX11-NEXT: scratch_load_b32 v175, off, s32 offset:8 @@ -144744,7 +144647,7 @@ define inreg <16 x double> @bitcast_v64f16_to_v16f64_scalar(<64 x half> inreg %a ; GFX11-NEXT: scratch_load_b32 v124, off, s32 offset:116 ; GFX11-NEXT: scratch_load_b32 v123, off, s32 offset:120 ; GFX11-NEXT: scratch_load_b32 v122, off, s32 offset:124 -; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-NEXT: scratch_load_b32 v121, off, s32 offset:128 ; GFX11-NEXT: scratch_load_b32 v120, off, s32 offset:132 ; GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:136 @@ -144777,7 +144680,7 @@ define inreg <16 x double> @bitcast_v64f16_to_v16f64_scalar(<64 x half> inreg %a ; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:244 ; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:248 ; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:252 -; GFX11-NEXT: s_clause 0x9 +; GFX11-NEXT: s_clause 0x9 ; 40-byte Folded Reload ; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:256 ; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:260 ; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:264 @@ -144835,6 +144738,10 @@ define <64 x i16> @bitcast_v16f64_to_v64i16(<16 x double> %a, i32 %b) { ; SI-LABEL: bitcast_v16f64_to_v64i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: ; 
implicit-def: $vgpr48 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill @@ -144851,10 +144758,6 @@ define <64 x i16> @bitcast_v16f64_to_v64i16(<16 x double> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr63 @@ -144886,14 +144789,13 @@ define <64 x i16> @bitcast_v16f64_to_v64i16(<16 x double> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; kill: killed $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB84_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_alignbit_b32 v33, v32, v31, 16 ; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: v_alignbit_b32 v34, v30, v29, 16 @@ -144937,7 +144839,6 @@ define <64 x i16> @bitcast_v16f64_to_v64i16(<16 x double> %a, i32 %b) { ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB84_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_f64 v[31:32], v[31:32], 1.0 ; SI-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 ; SI-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 @@ -145149,6 +145050,22 @@ define <64 x i16> @bitcast_v16f64_to_v64i16(<16 x double> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v53 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 
offset:72 ; 4-byte Folded Reload ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v27 @@ -145175,7 +145092,7 @@ define <64 x i16> @bitcast_v16f64_to_v64i16(<16 x double> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v31 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 @@ -145188,22 +145105,6 @@ define <64 x i16> @bitcast_v16f64_to_v64i16(<16 x double> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -145607,7 +145508,23 @@ define inreg <64 x i16> @bitcast_v16f64_to_v64i16_scalar(<16 x double> inreg %a, ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], 
s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 @@ -145668,22 +145585,6 @@ define inreg <64 x i16> @bitcast_v16f64_to_v64i16_scalar(<16 x double> inreg %a, ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB85_4: @@ -146031,7 +145932,13 @@ define <16 x double> @bitcast_v64i16_to_v16f64(<64 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44 ; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v33 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:36 @@ -146063,12 +145970,6 @@ define <16 x double> @bitcast_v64i16_to_v16f64(<64 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:100 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:84 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], 
s32 offset:68 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -146084,7 +145985,6 @@ define <16 x double> @bitcast_v64i16_to_v16f64(<64 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_and_b32_e32 v22, 0xffff, v41 ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) @@ -146301,7 +146201,6 @@ define <16 x double> @bitcast_v64i16_to_v16f64(<64 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v41 ; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload @@ -147475,7 +147374,7 @@ define inreg <16 x double> @bitcast_v64i16_to_v16f64_scalar(<64 x i16> inreg %a, ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 -; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:292 ; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:288 ; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:284 @@ -147508,7 +147407,7 @@ define inreg <16 x double> @bitcast_v64i16_to_v16f64_scalar(<64 x i16> inreg %a, ; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:176 ; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:172 ; GFX11-NEXT: scratch_store_b32 off, v95, s32 offset:168 -; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:164 ; GFX11-NEXT: scratch_store_b32 off, v105, s32 offset:160 ; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:156 @@ -147541,7 +147440,7 @@ define inreg <16 x double> @bitcast_v64i16_to_v16f64_scalar(<64 x i16> inreg %a, ; GFX11-NEXT: scratch_store_b32 off, v157, s32 offset:48 ; GFX11-NEXT: scratch_store_b32 off, v158, s32 offset:44 ; GFX11-NEXT: scratch_store_b32 off, v159, s32 offset:40 -; GFX11-NEXT: s_clause 0x9 +; GFX11-NEXT: s_clause 0x9 ; 40-byte Folded Spill ; GFX11-NEXT: scratch_store_b32 off, v168, s32 offset:36 ; GFX11-NEXT: scratch_store_b32 off, v169, s32 offset:32 ; GFX11-NEXT: scratch_store_b32 off, v170, s32 offset:28 @@ -147619,7 +147518,7 @@ define inreg <16 x double> @bitcast_v64i16_to_v16f64_scalar(<64 x i16> inreg %a, ; GFX11-NEXT: v_dual_mov_b32 v19, v174 :: v_dual_mov_b32 v20, v173 ; GFX11-NEXT: v_dual_mov_b32 v21, v172 :: v_dual_mov_b32 v22, v171 ; GFX11-NEXT: v_dual_mov_b32 v23, v170 :: v_dual_mov_b32 v24, v183 -; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-NEXT: scratch_load_b32 v185, off, s32 ; GFX11-NEXT: 
scratch_load_b32 v184, off, s32 offset:4 ; GFX11-NEXT: scratch_load_b32 v175, off, s32 offset:8 @@ -147652,7 +147551,7 @@ define inreg <16 x double> @bitcast_v64i16_to_v16f64_scalar(<64 x i16> inreg %a, ; GFX11-NEXT: scratch_load_b32 v124, off, s32 offset:116 ; GFX11-NEXT: scratch_load_b32 v123, off, s32 offset:120 ; GFX11-NEXT: scratch_load_b32 v122, off, s32 offset:124 -; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-NEXT: scratch_load_b32 v121, off, s32 offset:128 ; GFX11-NEXT: scratch_load_b32 v120, off, s32 offset:132 ; GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:136 @@ -147685,7 +147584,7 @@ define inreg <16 x double> @bitcast_v64i16_to_v16f64_scalar(<64 x i16> inreg %a, ; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:244 ; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:248 ; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:252 -; GFX11-NEXT: s_clause 0x9 +; GFX11-NEXT: s_clause 0x9 ; 40-byte Folded Reload ; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:256 ; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:260 ; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:264 @@ -147895,6 +147794,8 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v25 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:268 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:264 ; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr9 @@ -147904,7 +147805,7 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:160 -; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:140 @@ -147944,38 +147845,39 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v29 ; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:88 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:80 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:196 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:212 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:192 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:88 +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill -; SI-NEXT: 
s_waitcnt vmcnt(3) expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(5) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v2 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(5) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v3 +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:208 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v3 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:80 +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:204 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:200 @@ -147991,11 +147893,12 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:972 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v2 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v26, 8, v3 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:240 -; SI-NEXT: v_lshlrev_b32_e32 v26, 8, v3 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:848 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -148017,14 +147920,6 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v33, 24, v2 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v38, 8, v3 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:272 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:268 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:264 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:308 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:316 @@ -148032,11 +147927,15 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:304 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:300 ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:296 -; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:932 ; 4-byte Folded 
Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:272 +; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_lshlrev_b32_e32 v51, 24, v1 -; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_lshlrev_b32_e32 v41, 24, v2 -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_lshlrev_b32_e32 v44, 8, v3 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:324 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:340 @@ -148045,9 +147944,11 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:336 ; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:332 ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:328 -; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_lshlrev_b32_e32 v45, 24, v1 -; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_lshlrev_b32_e32 v57, 24, v2 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:356 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:372 @@ -148057,7 +147958,7 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:368 ; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:364 ; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:360 -; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_lshlrev_b32_e32 v61, 8, v3 ; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_lshlrev_b32_e32 v62, 24, v1 @@ -149557,10 +149458,25 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: 
v_mul_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen @@ -149857,22 +149773,6 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -149940,8 +149840,8 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v25 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v29 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v55, 8, v3 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v40, 8, v5 ; VI-NEXT: v_lshlrev_b16_e32 v41, 8, v7 ; VI-NEXT: v_lshlrev_b16_e32 v50, 8, v9 @@ -150037,13 +149937,25 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:208 ; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:216 ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188 -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:108 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100 +; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:92 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:84 +; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:76 +; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:68 +; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:60 +; VI-NEXT: 
buffer_load_ushort v58, off, s[0:3], s32 offset:52 +; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196 ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 @@ -150171,14 +150083,19 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_lshlrev_b16_e32 v38, 8, v0 ; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_lshlrev_b16_e32 v39, 8, v1 -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v49, 8, v2 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b16_e32 v51, 8, v3 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:356 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b16_e32 v51, 8, v3 +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b16_e32 v53, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -150186,26 +150103,6 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:372 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 -; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b16_e32 v53, 8, v0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:44 -; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:36 -; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:28 -; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:108 -; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100 -; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:92 -; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:84 -; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:76 -; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:68 -; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:60 -; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:52 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -150214,35 +150111,57 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; 
VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(13) ; VI-NEXT: v_or_b32_sdwa v0, v0, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: s_waitcnt vmcnt(12) ; VI-NEXT: v_or_b32_sdwa v1, v1, v55 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: s_waitcnt vmcnt(11) ; VI-NEXT: v_or_b32_sdwa v3, v3, v41 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_or_b32_sdwa v2, v2, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(9) +; VI-NEXT: v_or_b32_sdwa v2, v2, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: v_or_b32_sdwa v4, v4, v48 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_or_b32_sdwa v10, v61, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_or_b32_sdwa v5, v5, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v57, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_or_b32_sdwa v6, v6, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v12, v59, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v47, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v14, v45, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v15, v43, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr61 +; VI-NEXT: ; implicit-def: 
$vgpr57 +; VI-NEXT: ; implicit-def: $vgpr59 +; VI-NEXT: ; implicit-def: $vgpr47 +; VI-NEXT: ; implicit-def: $vgpr45 +; VI-NEXT: ; implicit-def: $vgpr43 ; VI-NEXT: ; implicit-def: $vgpr54 ; VI-NEXT: ; implicit-def: $vgpr55 ; VI-NEXT: ; implicit-def: $vgpr40 ; VI-NEXT: ; implicit-def: $vgpr41 ; VI-NEXT: ; implicit-def: $vgpr48 ; VI-NEXT: ; implicit-def: $vgpr36 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v6, v6, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: v_or_b32_sdwa v31, v31, v49 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr49 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v2, v2, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) @@ -150275,39 +150194,19 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v6, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v32, v32, v53 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr53 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(7) -; VI-NEXT: v_or_b32_sdwa v10, v61, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(6) -; VI-NEXT: v_or_b32_sdwa v11, v57, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(5) -; VI-NEXT: v_or_b32_sdwa v12, v59, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_or_b32_sdwa v13, v47, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_or_b32_sdwa v14, v45, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: ; implicit-def: $vgpr61 -; VI-NEXT: ; implicit-def: $vgpr57 -; VI-NEXT: ; 
implicit-def: $vgpr59 -; VI-NEXT: ; implicit-def: $vgpr47 -; VI-NEXT: ; implicit-def: $vgpr45 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v15, v43, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: ; implicit-def: $vgpr43 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v8, v62, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) @@ -150473,17 +150372,9 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v28, v28, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v31, v31, v49 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: ; implicit-def: $vgpr49 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v32, v32, v53 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: ; implicit-def: $vgpr53 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v30, v30, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -151168,8 +151059,8 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v27 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v29 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v54, 8, v3 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v41, 8, v5 ; GFX9-NEXT: v_lshlrev_b16_e32 v40, 8, v7 ; GFX9-NEXT: v_lshlrev_b16_e32 v51, 8, v9 @@ -151280,13 +151171,27 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:208 ; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:216 ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188 -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:108 +; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:100 +; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92 +; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84 +; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76 +; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68 +; 
GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52 +; GFX9-NEXT: s_waitcnt vmcnt(18) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: s_waitcnt vmcnt(18) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(16) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 @@ -151419,14 +151324,19 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_lshlrev_b16_e32 v37, 8, v0 ; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_lshlrev_b16_e32 v49, 8, v1 -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 +; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_lshlrev_b16_e32 v48, 8, v2 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v52, 8, v3 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:356 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b16_e32 v52, 8, v3 +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_lshlrev_b16_e32 v53, 8, v0 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -151434,26 +151344,6 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:372 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 -; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v53, 8, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44 -; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:36 -; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28 -; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:20 -; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:108 -; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:100 -; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92 -; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84 -; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76 -; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68 -; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:60 -; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52 ; GFX9-NEXT: ; implicit-def: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -151462,36 +151352,62 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b32 s6, 0x5040100 +; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: s_waitcnt vmcnt(14) ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: s_waitcnt vmcnt(13) ; GFX9-NEXT: v_or_b32_sdwa v2, v2, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_or_b32_sdwa v3, v3, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v0, v1, v0, s6 +; GFX9-NEXT: s_waitcnt vmcnt(11) +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v1, v3, v2, s6 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: s_waitcnt vmcnt(11) +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(10) +; GFX9-NEXT: v_or_b32_sdwa v10, v60, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v11, v56, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_or_b32_sdwa v4, v4, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_or_b32_sdwa v12, v58, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_or_b32_sdwa v5, v5, v35 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v13, v46, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v6, v6, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v14, v44, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v31, v31, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: ; implicit-def: $vgpr60 +; GFX9-NEXT: ; implicit-def: $vgpr56 +; GFX9-NEXT: ; implicit-def: $vgpr58 +; GFX9-NEXT: ; implicit-def: $vgpr46 +; GFX9-NEXT: ; implicit-def: $vgpr44 ; GFX9-NEXT: ; implicit-def: $vgpr55 ; GFX9-NEXT: ; implicit-def: $vgpr54 ; GFX9-NEXT: ; implicit-def: $vgpr41 ; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: ; implicit-def: $vgpr38 ; GFX9-NEXT: ; implicit-def: $vgpr35 -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v6, v6, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr48 +; GFX9-NEXT: v_or_b32_sdwa v15, v42, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: ; implicit-def: $vgpr42 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v2, v2, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -151514,49 +151430,25 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v5, v5, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v5, v6, v5, s6 ; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload ; GFX9-NEXT: ; implicit-def: $vgpr34 -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v6, v6, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v6, v6, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v6, v7, v6, s6 ; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:724 ; 
4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v32, v32, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: ; implicit-def: $vgpr53 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v7, v8, v7, s6 ; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(7) -; GFX9-NEXT: v_or_b32_sdwa v10, v60, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_or_b32_sdwa v11, v56, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(5) -; GFX9-NEXT: v_or_b32_sdwa v12, v58, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_or_b32_sdwa v13, v46, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_or_b32_sdwa v14, v44, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: ; implicit-def: $vgpr60 -; GFX9-NEXT: ; implicit-def: $vgpr56 -; GFX9-NEXT: ; implicit-def: $vgpr58 -; GFX9-NEXT: ; implicit-def: $vgpr46 -; GFX9-NEXT: ; implicit-def: $vgpr44 -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v15, v42, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: ; implicit-def: $vgpr42 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v8, v63, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -151722,17 +151614,9 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v28, v29, v28, s6 ; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v31, v31, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v32, v32, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: ; implicit-def: $vgpr48 -; GFX9-NEXT: ; implicit-def: $vgpr53 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v30, v30, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v29, v30, v29, s6 @@ -153078,7 +152962,7 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v64bf16: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:580 ; GFX11-FAKE16-NEXT: 
scratch_store_b32 off, v41, s32 offset:576 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:572 @@ -153111,7 +152995,7 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:464 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:460 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:456 -; GFX11-FAKE16-NEXT: s_clause 0xf +; GFX11-FAKE16-NEXT: s_clause 0xf ; 64-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:452 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:448 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:444 @@ -153940,7 +153824,7 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_perm_b32 v31, v116, v31, 0x5040100 ; GFX11-FAKE16-NEXT: .LBB88_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v127, off, s32 offset:392 ; GFX11-FAKE16-NEXT: scratch_load_b32 v126, off, s32 offset:396 ; GFX11-FAKE16-NEXT: scratch_load_b32 v125, off, s32 offset:400 @@ -153973,7 +153857,7 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:508 ; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:512 ; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:516 -; GFX11-FAKE16-NEXT: s_clause 0xf +; GFX11-FAKE16-NEXT: s_clause 0xf ; 64-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:520 ; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:524 ; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:528 @@ -154018,7 +153902,6 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[4:5] -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:332 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:328 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:324 @@ -154029,7 +153912,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:304 ; SI-NEXT: ; implicit-def: $vgpr43 : SGPR spill to VGPR lane ; SI-NEXT: s_mov_b32 s72, s21 -; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_writelane_b32 v43, s19, 0 ; SI-NEXT: v_writelane_b32 v43, s18, 1 ; SI-NEXT: v_writelane_b32 v43, s17, 2 @@ -154070,10 +153953,16 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: v_writelane_b32 v41, s86, 30 ; SI-NEXT: v_writelane_b32 v41, s87, 31 ; SI-NEXT: v_writelane_b32 v41, s96, 32 +; SI-NEXT: s_mov_b32 s79, s26 ; SI-NEXT: v_writelane_b32 v41, s97, 33 ; SI-NEXT: v_writelane_b32 v41, s98, 34 ; SI-NEXT: v_writelane_b32 v41, s99, 35 -; SI-NEXT: s_mov_b32 s79, s26 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:164 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:160 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:156 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: 
buffer_load_dword v40, off, s[0:3], s32 offset:152 ; SI-NEXT: v_readfirstlane_b32 s38, v20 ; SI-NEXT: ; implicit-def: $vgpr42 : SGPR spill to VGPR lane ; SI-NEXT: v_readfirstlane_b32 s39, v19 @@ -154100,9 +153989,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: v_readfirstlane_b32 s18, v5 ; SI-NEXT: v_readfirstlane_b32 s19, v6 ; SI-NEXT: v_readfirstlane_b32 s88, v4 -; SI-NEXT: v_readfirstlane_b32 s89, v3 -; SI-NEXT: v_readfirstlane_b32 s90, v9 -; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_readfirstlane_b32 s6, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:300 ; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:296 @@ -154110,33 +153997,31 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:288 ; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:284 ; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:280 -; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_readfirstlane_b32 s4, v32 ; SI-NEXT: v_writelane_b32 v43, s4, 4 -; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_readfirstlane_b32 s4, v33 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:276 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:272 ; SI-NEXT: v_writelane_b32 v43, s4, 5 -; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_readfirstlane_b32 s4, v34 ; SI-NEXT: v_writelane_b32 v43, s4, 6 -; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_readfirstlane_b32 s4, v35 ; SI-NEXT: v_writelane_b32 v43, s4, 7 -; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_readfirstlane_b32 s4, v36 ; SI-NEXT: v_writelane_b32 v43, s4, 8 -; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_readfirstlane_b32 s4, v37 ; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:268 ; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:264 ; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:260 ; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:256 ; SI-NEXT: v_writelane_b32 v43, s4, 9 -; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_readfirstlane_b32 s4, v38 ; SI-NEXT: v_writelane_b32 v43, s4, 10 +; SI-NEXT: v_readfirstlane_b32 s89, v3 +; SI-NEXT: v_readfirstlane_b32 s90, v9 ; SI-NEXT: v_readfirstlane_b32 s91, v10 ; SI-NEXT: v_readfirstlane_b32 s92, v8 ; SI-NEXT: v_readfirstlane_b32 s93, v7 @@ -154219,44 +154104,41 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: v_readfirstlane_b32 s24, v33 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:172 ; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:168 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:164 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:160 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:156 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:152 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_readfirstlane_b32 s78, v34 +; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_readfirstlane_b32 s4, v35 ; SI-NEXT: v_writelane_b32 v43, s4, 18 +; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_readfirstlane_b32 s4, v36 ; SI-NEXT: v_writelane_b32 v43, s4, 19 -; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_readfirstlane_b32 s4, v37 ; SI-NEXT: v_writelane_b32 v43, s4, 20 -; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: 
v_readfirstlane_b32 s4, v31 ; SI-NEXT: v_writelane_b32 v43, s4, 21 -; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_readfirstlane_b32 s4, v38 ; SI-NEXT: v_writelane_b32 v43, s4, 22 -; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_readfirstlane_b32 s4, v39 ; SI-NEXT: v_writelane_b32 v43, s4, 23 -; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_readfirstlane_b32 s4, v48 ; SI-NEXT: v_writelane_b32 v43, s4, 24 -; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_readfirstlane_b32 s4, v49 ; SI-NEXT: v_writelane_b32 v43, s4, 25 -; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_readfirstlane_b32 s4, v50 ; SI-NEXT: v_writelane_b32 v43, s4, 26 -; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_readfirstlane_b32 s4, v51 ; SI-NEXT: v_writelane_b32 v43, s4, 27 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:148 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:144 -; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_readfirstlane_b32 s4, v33 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:140 ; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:136 @@ -154270,7 +154152,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:108 ; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:104 ; SI-NEXT: v_writelane_b32 v43, s4, 28 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_readfirstlane_b32 s4, v52 ; SI-NEXT: v_writelane_b32 v43, s4, 29 ; SI-NEXT: v_readfirstlane_b32 s4, v53 @@ -154279,7 +154161,6 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: v_writelane_b32 v43, s4, 31 ; SI-NEXT: v_readfirstlane_b32 s4, v55 ; SI-NEXT: v_writelane_b32 v43, s4, 32 -; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_readfirstlane_b32 s4, v40 ; SI-NEXT: v_writelane_b32 v43, s4, 33 ; SI-NEXT: v_writelane_b32 v43, s22, 34 @@ -155894,33 +155775,53 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; VI-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:240 ; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v24 ; VI-NEXT: v_lshlrev_b32_e32 v10, 8, v26 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:124 +; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:132 +; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:140 +; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:148 +; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:156 +; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:164 +; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:172 +; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:180 +; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:188 +; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:196 +; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:204 +; VI-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:212 +; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:220 +; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:228 +; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:236 +; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:244 +; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:252 +; VI-NEXT: buffer_load_ushort v27, off, s[0:3], s32 offset:260 +; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:268 +; 
VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:276 +; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:284 +; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:292 +; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:300 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:308 +; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:316 +; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:324 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v14 ; VI-NEXT: v_lshlrev_b32_e32 v16, 8, v16 ; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 ; VI-NEXT: v_lshlrev_b32_e32 v20, 8, v20 ; VI-NEXT: v_lshlrev_b32_e32 v22, 8, v22 -; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_lshlrev_b32_e32 v32, 8, v0 -; VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v4 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v5 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v6 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v7 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v36, 8, v2 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:248 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:256 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:264 @@ -155965,52 +155866,6 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:68 ; VI-NEXT: s_waitcnt vmcnt(10) ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v2 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:76 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:84 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:92 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:100 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:108 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:116 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:124 -; VI-NEXT: buffer_load_ushort v41, 
off, s[0:3], s32 offset:132 -; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:140 -; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:148 -; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:156 -; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:164 -; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:172 -; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:180 -; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:188 -; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:196 -; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:204 -; VI-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:212 -; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:220 -; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:228 -; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:236 -; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:244 -; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:252 -; VI-NEXT: buffer_load_ushort v27, off, s[0:3], s32 offset:260 -; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:268 -; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:276 -; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:284 -; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:292 -; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:300 -; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:308 -; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:316 -; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:324 ; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill @@ -156030,6 +155885,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill @@ -156038,7 +155894,6 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill @@ -156070,6 +155925,25 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte 
Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:76 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:84 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:92 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:100 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:108 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:116 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill ; VI-NEXT: s_cbranch_scc0 .LBB89_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload @@ -156094,15 +155968,18 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: v_or_b32_sdwa v2, v2, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_or_b32_sdwa v3, v3, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v2, v8 -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_or_b32_sdwa v0, v0, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_or_b32_sdwa v1, v1, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload @@ -156152,10 +156029,11 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; VI-NEXT: v_or_b32_sdwa v1, v33, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; 
VI-NEXT: v_or_b32_sdwa v0, v35, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -156163,50 +156041,37 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; VI-NEXT: v_or_b32_sdwa v1, v48, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v0, v49, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_or_b32_sdwa v1, v1, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v1, v60, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v2, v35, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v0, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(9) -; VI-NEXT: v_or_b32_sdwa v3, v33, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_or_b32_sdwa v0, v55, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: s_waitcnt vmcnt(0) 
; VI-NEXT: v_or_b32_sdwa v1, v49, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v18, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_or_b32_sdwa v0, v42, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_or_b32_sdwa v1, v41, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v19, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v0, v39, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v42, v43 ; VI-NEXT: v_mov_b32_e32 v43, v37 -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_or_b32_sdwa v1, v46, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v20, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload @@ -156221,13 +156086,12 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v1, v24, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_or_b32_sdwa v0, v26, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v23, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_or_b32_sdwa v0, v34, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) @@ -156249,21 +156113,28 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; VI-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v0, v31, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v54, v33 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v56, v1 ; VI-NEXT: v_or_b32_sdwa v1, v59, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload 
-; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v2, v35, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_or_b32_sdwa v0, v63, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_or_b32_sdwa v1, v30, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v3, v33, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v63, v39 +; VI-NEXT: v_mov_b32_e32 v54, v33 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_mov_b32_e32 v57, v0 ; VI-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -156281,11 +156152,10 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; VI-NEXT: v_or_b32_sdwa v31, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v53, v35 -; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v3, s4, v0 ; VI-NEXT: s_and_b32 s4, s16, 0xff @@ -156318,7 +156188,6 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; VI-NEXT: s_branch .LBB89_3 ; VI-NEXT: .LBB89_2: ; VI-NEXT: v_mov_b32_e32 v47, v54 -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload @@ -156339,6 +156208,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v58, v7 ; VI-NEXT: v_mov_b32_e32 v57, v5 ; VI-NEXT: v_mov_b32_e32 v56, v3 @@ -156930,29 +156800,51 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; GFX9-NEXT: 
buffer_load_ushort v5, off, s[0:3], s32 offset:224 ; GFX9-NEXT: buffer_load_ushort v9, off, s[0:3], s32 offset:232 ; GFX9-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:240 +; GFX9-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:156 +; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:164 +; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:172 +; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:180 +; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:188 +; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:196 +; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:204 +; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:212 +; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:220 +; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:228 +; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:236 +; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:244 +; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:252 +; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:260 +; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:268 +; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:276 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:284 +; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:292 +; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:300 +; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:308 +; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:316 +; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:324 ; GFX9-NEXT: v_lshlrev_b32_e32 v46, 8, v46 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec -; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: s_waitcnt vmcnt(29) ; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v11 -; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: s_waitcnt vmcnt(28) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: s_waitcnt vmcnt(28) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v13 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: s_waitcnt vmcnt(28) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v3 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: s_waitcnt vmcnt(28) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v4 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: s_waitcnt vmcnt(28) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v5 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: s_waitcnt vmcnt(28) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v9 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: s_waitcnt vmcnt(28) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v7 ; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill @@ -157016,82 +156908,42 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:124 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:132 -; GFX9-NEXT: s_waitcnt 
vmcnt(0) -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:140 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:148 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:156 -; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:164 -; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:172 -; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:180 -; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:188 -; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:196 -; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:204 -; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:212 -; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:220 -; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:228 -; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:236 -; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:244 -; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:252 -; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:260 -; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:268 -; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:276 -; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:284 -; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:292 -; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:300 -; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:308 -; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:316 -; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:324 -; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) ; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(22) ; GFX9-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) ; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(24) ; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(28) ; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword 
v7, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(31) ; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) ; GFX9-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) ; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) ; GFX9-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(35) ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(35) ; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill @@ -157112,6 +156964,13 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(55) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:140 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:148 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill ; GFX9-NEXT: s_cbranch_scc0 .LBB89_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: s_and_b32 s4, s28, 0xff @@ -157365,14 +157224,13 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; GFX9-NEXT: v_lshl_or_b32 v30, v1, 16, v0 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v62, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload ; GFX9-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 
src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_or_b32_sdwa v1, v62, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v31, v1, 16, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 @@ -157382,7 +157240,6 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; GFX9-NEXT: .LBB89_2: ; GFX9-NEXT: v_mov_b32_e32 v58, v50 ; GFX9-NEXT: v_mov_b32_e32 v45, v59 -; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload @@ -157394,6 +157251,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload ; GFX9-NEXT: v_mov_b32_e32 v34, v35 +; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload ; GFX9-NEXT: v_mov_b32_e32 v49, v39 @@ -157859,7 +157717,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; GFX11-TRUE16-LABEL: bitcast_v128i8_to_v64bf16_scalar: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: s_clause 0x1e +; GFX11-TRUE16-NEXT: s_clause 0x1e ; 124-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:440 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:436 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:432 @@ -158589,7 +158447,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; GFX11-TRUE16-NEXT: v_mov_b16_e64 v30.h, v182.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.h, v43.l ; GFX11-TRUE16-NEXT: .LBB89_3: ; %end -; GFX11-TRUE16-NEXT: s_clause 0x1e +; GFX11-TRUE16-NEXT: s_clause 0x1e ; 124-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:320 ; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:324 ; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:328 @@ -158631,7 +158489,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v64bf16_scalar: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_clause 0x1e +; GFX11-FAKE16-NEXT: s_clause 0x1e ; 124-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:440 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:436 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:432 @@ -159415,7 +159273,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v30, v30, 16, v35 ; 
GFX11-FAKE16-NEXT: v_lshl_or_b32 v31, v31, 16, v36 ; GFX11-FAKE16-NEXT: .LBB89_3: ; %end -; GFX11-FAKE16-NEXT: s_clause 0x1e +; GFX11-FAKE16-NEXT: s_clause 0x1e ; 124-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v94, off, s32 offset:320 ; GFX11-FAKE16-NEXT: scratch_load_b32 v93, off, s32 offset:324 ; GFX11-FAKE16-NEXT: scratch_load_b32 v92, off, s32 offset:328 @@ -161437,7 +161295,23 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v2, 0xff, v10 ; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v58 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload @@ -161484,28 +161358,15 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: 
buffer_load_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v64bf16_to_v128i8: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill @@ -161522,9 +161383,6 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 -; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; VI-NEXT: ; implicit-def: $vgpr35 ; VI-NEXT: ; implicit-def: $vgpr45 ; VI-NEXT: ; implicit-def: $vgpr34 @@ -161713,166 +161571,165 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v29 ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 24, v28 +; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[15:16] ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v28 ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v28 +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v27 +; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[13:14] ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v27 ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 24, v26 +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v26 +; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[11:12] ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v26 -; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v16 ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v25 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v16 +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, 
s[0:3], s32 offset:136 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v25 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v16 +; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[9:10] ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 24, v24 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v15 ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v24 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v15 +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v24 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v14 +; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[7:8] ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v23 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v14 ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v23 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v14 +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 24, v22 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v13 +; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[5:6] ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v22 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v13 ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v22 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v5 +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v21 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v5 +; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[3:4] ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v21 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v4 ; 
VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 24, v20 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v4 +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v20 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v4 +; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[1:2] ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v20 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v16 ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v19 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v3 +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v16 ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v19 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v2 +; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[31:32] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v16 ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 24, v18 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v46, 24, v12 ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v18 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v2 -; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v18 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v1 -; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v17 -; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[15:16] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v1 -; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v17 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; 
VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[13:14] -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[11:12] -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[9:10] -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[7:8] -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[5:6] -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[3:4] -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[1:2] -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[31:32] -; VI-NEXT: v_lshrrev_b32_e32 v46, 24, v12 ; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v15 +; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v18 ; VI-NEXT: v_mov_b32_e32 v45, v46 ; VI-NEXT: v_lshrrev_b64 v[46:47], 24, v[29:30] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v14 +; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v17 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v14 ; VI-NEXT: v_lshrrev_b32_e32 v63, 8, v11 ; VI-NEXT: v_lshrrev_b32_e32 v50, 8, v31 +; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v17 ; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v14 +; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v46, v63 ; VI-NEXT: v_mov_b32_e32 v63, v50 ; VI-NEXT: v_lshrrev_b64 v[50:51], 24, v[27:28] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v13 +; VI-NEXT: 
buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v13 ; VI-NEXT: v_lshrrev_b32_e32 v56, 8, v10 ; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v9 ; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v43, 24, v8 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v5 ; VI-NEXT: v_mov_b32_e32 v51, v57 ; VI-NEXT: v_mov_b32_e32 v50, v56 ; VI-NEXT: v_lshrrev_b64 v[56:57], 24, v[25:26] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v5 ; VI-NEXT: v_mov_b32_e32 v57, v43 ; VI-NEXT: v_lshrrev_b64 v[43:44], 24, v[23:24] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v4 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v4 ; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v4 ; VI-NEXT: v_lshrrev_b64 v[43:44], 24, v[21:22] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v3 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v3 ; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v2 ; VI-NEXT: v_lshrrev_b64 v[43:44], 24, v[19:20] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v2 ; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v1 ; VI-NEXT: v_lshrrev_b64 v[43:44], 24, v[17:18] ; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v10 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v1 ; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v12 ; VI-NEXT: v_lshrrev_b32_e32 v49, 8, v12 ; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v11 @@ -161885,6 +161742,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v54, 24, v6 ; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v6 ; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v6 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v37, 24, v32 ; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v32 ; VI-NEXT: v_lshrrev_b32_e32 v48, 8, v32 @@ -162518,27 +162376,27 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:100 ; 
4-byte Folded Spill ; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[21:22] +; VI-NEXT: v_lshrrev_b32_e32 v43, 24, v28 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[19:20] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[17:18] -; VI-NEXT: v_lshrrev_b32_e32 v43, 24, v28 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v28 +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[19:20] ; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v43, 8, v28 ; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v27 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v43, 8, v27 +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[17:18] ; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v43, 24, v26 ; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v26 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v43, 8, v26 ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v16 @@ -162923,9 +162781,10 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: v_or_b32_sdwa v1, v23, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -163020,16 +162879,6 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v2, v58, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x78, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v48 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v37 -; 
VI-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v52, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7c, v0 -; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload @@ -163046,6 +162895,16 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x78, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v48 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v37 +; VI-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v52, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7c, v0 +; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -163282,49 +163141,11 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 24, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 24, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v54, 8, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v40, 8, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v42, 8, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v52, 8, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 8, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v53, 24, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v51, 8, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v41, 8, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v49, 8, v24 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v23 -; GFX9-NEXT: v_lshrrev_b32_e32 v57, 8, v23 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 24, v22 -; GFX9-NEXT: v_lshrrev_b32_e32 v50, 8, v22 -; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v21 -; GFX9-NEXT: v_lshrrev_b32_e32 v55, 8, v21 -; GFX9-NEXT: v_lshrrev_b32_e32 v47, 8, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v45, 8, v19 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 24, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v60, 8, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v61, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v43, 8, v17 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v16 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v16 -; GFX9-NEXT: 
buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v16 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v15 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v15 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v18 +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v4 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v4 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill @@ -163338,6 +163159,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v63 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(44) ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v62 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v62 @@ -163355,130 +163177,168 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v28 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v28 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v14 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v28 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v14 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v27 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v13 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v27 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v13 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v26 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v12 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v12 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v26 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte 
Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v12 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v25 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v11 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v25 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 24, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 24, v6 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v24 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v10 ; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v8 ; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v6 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v24 -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[15:16] -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v9 ; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v7 ; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v5 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 8, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 8, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v8 ; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 8, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 8, v6 ; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 8, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 8, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 8, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 24, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 8, v2 +; GFX9-NEXT: s_waitcnt vmcnt(35) +; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[15:16] +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v16 ; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(35) ; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[13:14] +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 
offset:152 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[11:12] +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v16 ; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[9:10] +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[7:8] +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v16 ; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[5:6] +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[3:4] +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v15 ; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[1:2] +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[62:63] +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v15 ; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[29:30] +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[27:28] +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v14 ; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[25:26] +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v14 ; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[23:24] +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v14 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v13 ; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:124 ; 4-byte 
Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v13 ; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[21:22] +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v12 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v12 ; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v12 ; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[19:20] +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v11 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v10 ; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v10 ; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[17:18] +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v9 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 8, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 8, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 8, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 8, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 8, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 24, v22 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 8, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 8, v21 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v20 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 8, v20 ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 8, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 24, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v60, 8, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v61, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 8, v17 ; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill @@ -163571,16 +163431,11 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; GFX9-NEXT: v_cndmask_b32_e32 v13, v18, v19, vcc ; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 -; GFX9-NEXT: v_mov_b32_e32 v59, v32 ; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6 ; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 ; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v21 -; GFX9-NEXT: 
v_mov_b32_e32 v58, v31 ; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; GFX9-NEXT: v_perm_b32 v14, v13, v0, s7 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v18, v19, vcc @@ -163735,7 +163590,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6 ; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 -; GFX9-NEXT: s_waitcnt vmcnt(52) +; GFX9-NEXT: s_waitcnt vmcnt(50) ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v62 ; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 ; GFX9-NEXT: v_cndmask_b32_e32 v44, v18, v19, vcc @@ -163750,7 +163605,6 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6 ; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 @@ -163891,8 +163745,10 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v9 ; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GFX9-NEXT: v_mov_b32_e32 v59, v32 ; GFX9-NEXT: v_cndmask_b32_e32 v10, v2, v10, vcc ; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_mov_b32_e32 v58, v31 ; GFX9-NEXT: v_add3_u32 v2, v2, v1, s6 ; GFX9-NEXT: v_or_b32_e32 v31, 0x400000, v1 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 @@ -163958,6 +163814,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_e32 v41, 0x400000, v31 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 ; GFX9-NEXT: v_bfe_u32 v31, v13, 16, 1 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; GFX9-NEXT: v_perm_b32 v61, v28, v0, s7 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v32, v41, vcc ; GFX9-NEXT: v_add3_u32 v31, v31, v13, s6 @@ -163965,7 +163822,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 ; GFX9-NEXT: v_cndmask_b32_e32 v13, v31, v32, vcc ; GFX9-NEXT: v_perm_b32 v41, v13, v0, s7 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v16 ; GFX9-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 ; GFX9-NEXT: v_bfe_u32 v31, v13, 16, 1 @@ -163994,24 +163851,14 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_e32 v45, 0x400000, v15 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 ; GFX9-NEXT: v_cndmask_b32_e32 v15, v31, v45, vcc +; GFX9-NEXT: v_perm_b32 v32, v16, v13, s7 ; GFX9-NEXT: v_perm_b32 v31, v15, v26, s7 ; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v16 -; GFX9-NEXT: v_perm_b32 v32, v16, v13, s7 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v26 ; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v14 -; GFX9-NEXT: v_perm_b32 v42, v14, v11, s7 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; 
GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v11 ; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v12 -; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v10 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill @@ -164031,12 +163878,19 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; GFX9-NEXT: v_perm_b32 v34, v30, v27, s7 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v26 ; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v27 ; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v25 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; GFX9-NEXT: v_perm_b32 v36, v44, v29, s7 ; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v29 ; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v24 ; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v23 +; GFX9-NEXT: v_perm_b32 v42, v14, v11, s7 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v11 ; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v22 ; GFX9-NEXT: v_perm_b32 v38, v21, v43, s7 @@ -164045,6 +163899,24 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v5 ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v20 +; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; GFX9-NEXT: v_perm_b32 v55, v12, v9, s7 +; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v43 +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill ; GFX9-NEXT: v_perm_b32 v51, v6, v17, s7 ; GFX9-NEXT: v_perm_b32 v40, v10, v7, s7 ; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v7 @@ -164052,12 +163924,8 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v17 ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v19 ; GFX9-NEXT: 
v_lshrrev_b32_e32 v17, 16, v57 -; GFX9-NEXT: v_perm_b32 v55, v12, v9, s7 -; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v43 ; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v47 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(13) ; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v0 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -164101,7 +163969,6 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v56 ; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[31:32] -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill @@ -164134,74 +164001,51 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[35:36] +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v32 ; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[33:34] -; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[60:61] -; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v32 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[13:14] -; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[62:63] -; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[43:44] -; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[58:59] -; 
GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[58:59] -; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v32 +; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[33:34] ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v31 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v31 +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v42 +; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[60:61] ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v42 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v41 +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v41 +; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[13:14] ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v55 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v55 +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v54 +; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[62:63] ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v40 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v39 +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v39 ; GFX9-NEXT: v_lshrrev_b32_e32 v31, 24, v53 @@ -164214,15 +164058,26 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v41, 8, v37 ; GFX9-NEXT: v_lshrrev_b32_e32 v37, 24, v36 ; 
GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v36 +; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[43:44] ; GFX9-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v35 ; GFX9-NEXT: v_lshrrev_b32_e32 v35, 8, v35 ; GFX9-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v35, 24, v34 ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 8, v34 +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v33 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v33 +; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[58:59] +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v61 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill @@ -164231,6 +164086,9 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v60 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v60 +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 8, v54 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v14 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill @@ -164255,31 +164113,33 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v52, 8, v49 ; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v48 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v63, v16 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 8, v40 ; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 8, v48 ; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; GFX9-NEXT: v_mov_b32_e32 v63, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v54, 8, v54 -; GFX9-NEXT: v_lshrrev_b32_e32 v40, 8, v40 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 8, v48 ; GFX9-NEXT: v_mov_b32_e32 v62, v15 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill ; GFX9-NEXT: 
v_lshrrev_b32_e32 v49, 8, v35 ; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v34 -; GFX9-NEXT: v_lshrrev_b32_e32 v57, 8, v34 ; GFX9-NEXT: v_lshrrev_b32_e32 v37, 24, v44 ; GFX9-NEXT: v_lshrrev_b32_e32 v50, 8, v44 -; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v43 ; GFX9-NEXT: v_lshrrev_b32_e32 v55, 8, v43 +; GFX9-NEXT: s_waitcnt vmcnt(24) +; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[58:59] +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 8, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v43 ; GFX9-NEXT: v_lshrrev_b32_e32 v35, 24, v59 ; GFX9-NEXT: v_lshrrev_b32_e32 v43, 8, v58 -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: s_waitcnt vmcnt(19) ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v60 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(18) ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v61 ; GFX9-NEXT: v_lshrrev_b32_e32 v47, 8, v61 ; GFX9-NEXT: v_lshrrev_b32_e32 v45, 8, v60 @@ -164294,6 +164154,10 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_lshlrev_b16_e32 v10, 8, v54 ; GFX9-NEXT: v_or_b32_sdwa v10, v11, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; GFX9-NEXT: v_lshlrev_b16_e32 v32, 8, v32 ; GFX9-NEXT: v_lshlrev_b16_e32 v31, 8, v31 ; GFX9-NEXT: v_or_b32_sdwa v5, v5, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -164302,6 +164166,8 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_lshlrev_b16_e32 v31, 8, v39 ; GFX9-NEXT: v_or_b32_sdwa v6, v6, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v8, v8, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; GFX9-NEXT: v_lshlrev_b16_e32 v41, 8, v41 ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b16_e32 v51, 8, v51 @@ -164310,38 +164176,29 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v3, v3, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b16_e32 v48, 8, v52 ; GFX9-NEXT: v_or_b32_sdwa v4, v4, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b16_e32 v11, 8, v11 ; GFX9-NEXT: v_or_b32_sdwa v11, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt 
vmcnt(0) ; GFX9-NEXT: v_lshlrev_b16_e32 v12, 8, v12 -; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v12, v13, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshlrev_b16_e32 v13, 8, v13 ; GFX9-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshlrev_b16_e32 v14, 8, v14 -; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v14, v15, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshlrev_b16_e32 v15, 8, v15 ; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b16_e32 v16, 8, v43 ; GFX9-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b16_e32 v17, 8, v60 ; GFX9-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v18, 8, v31 ; GFX9-NEXT: v_or_b32_sdwa v18, v38, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -164516,13 +164373,13 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v0, v46, s[0:3], 0 offen offset:76 ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v55 -; GFX9-NEXT: v_or_b32_sdwa v0, v21, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: buffer_store_dword v0, v46, s[0:3], 0 offen offset:76 +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v55 +; GFX9-NEXT: v_or_b32_sdwa v0, v21, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v56, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v0, v46, s[0:3], 0 offen offset:80 @@ -164533,13 +164390,13 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v0, v46, s[0:3], 0 offen offset:84 ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v57 -; GFX9-NEXT: v_or_b32_sdwa v0, v23, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: buffer_store_dword v0, v46, s[0:3], 0 offen offset:84 +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v57 +; GFX9-NEXT: v_or_b32_sdwa v0, v23, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v36, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v0, v46, s[0:3], 0 offen offset:88 @@ -164679,7 +164536,11 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-LABEL: bitcast_v64bf16_to_v128i8: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x2 +; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 offset:8 +; GFX11-TRUE16-NEXT: scratch_load_b32 v81, off, s32 offset:4 +; GFX11-TRUE16-NEXT: scratch_load_b32 v80, off, s32 +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:248 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:244 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:240 @@ -164712,7 +164573,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:132 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:128 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:124 -; GFX11-TRUE16-NEXT: s_clause 0x1b +; GFX11-TRUE16-NEXT: s_clause 0x1b ; 112-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:120 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:116 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:112 @@ -164741,10 +164602,6 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v153, s32 offset:20 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v154, s32 offset:16 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v155, s32 offset:12 -; GFX11-TRUE16-NEXT: s_clause 0x2 -; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 offset:8 -; GFX11-TRUE16-NEXT: scratch_load_b32 v81, off, s32 offset:4 -; GFX11-TRUE16-NEXT: scratch_load_b32 v80, off, s32 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr181_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr152_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_hi16 @@ -165778,7 +165635,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:80 ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:96 ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:112 -; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v155, off, s32 offset:12 
; GFX11-TRUE16-NEXT: scratch_load_b32 v154, off, s32 offset:16 ; GFX11-TRUE16-NEXT: scratch_load_b32 v153, off, s32 offset:20 @@ -165811,7 +165668,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:128 ; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:132 ; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:136 -; GFX11-TRUE16-NEXT: s_clause 0x1b +; GFX11-TRUE16-NEXT: s_clause 0x1b ; 112-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:140 ; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:144 ; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:148 @@ -165846,7 +165703,11 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-LABEL: bitcast_v64bf16_to_v128i8: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_clause 0x15 +; GFX11-FAKE16-NEXT: s_clause 0x2 +; GFX11-FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-FAKE16-NEXT: s_clause 0x15 ; 88-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:96 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:92 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:88 @@ -165869,10 +165730,6 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:20 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v76, s32 offset:16 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v77, s32 offset:12 -; GFX11-FAKE16-NEXT: s_clause 0x2 -; GFX11-FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:8 -; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 -; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr76 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr75 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 @@ -166991,7 +166848,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:80 ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:96 ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:112 -; GFX11-FAKE16-NEXT: s_clause 0x15 +; GFX11-FAKE16-NEXT: s_clause 0x15 ; 88-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:12 ; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:16 ; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:20 @@ -168663,13 +168520,26 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: v_and_b32_e32 v1, 0xff, v46 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload ; SI-NEXT: s_lshl_b32 s5, s86, 24 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, 
off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: v_readlane_b32 s86, v63, 30 ; SI-NEXT: v_readlane_b32 s84, v63, 28 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -168708,20 +168578,6 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload @@ -169924,6 +169780,15 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_mov_b32_e32 v43, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 13 ; VI-NEXT: v_mov_b32_e32 v46, s4 +; VI-NEXT: v_mov_b32_e32 v45, s72 +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v45, s74 +; VI-NEXT: v_mov_b32_e32 v42, s54 +; VI-NEXT: v_mov_b32_e32 v41, s46 +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v41, s56 ; VI-NEXT: v_readlane_b32 s4, v62, 14 ; VI-NEXT: v_mov_b32_e32 v50, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 15 @@ -169949,6 +169814,11 @@ define inreg <128 x i8> 
@bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_readlane_b32 s4, v62, 22 ; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v55, s4 +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v45, s76 ; VI-NEXT: v_readlane_b32 s4, v62, 23 ; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v55, s4 @@ -169994,6 +169864,8 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_readlane_b32 s4, v62, 37 ; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v55, s4 +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; VI-NEXT: v_readlane_b32 s4, v62, 38 ; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v55, s4 @@ -170052,45 +169924,47 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v55, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 57 -; VI-NEXT: v_mov_b32_e32 v42, s54 ; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_mov_b32_e32 v41, s46 +; VI-NEXT: v_mov_b32_e32 v36, s66 ; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v41, s56 -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v41, s58 -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v41, s60 -; VI-NEXT: v_mov_b32_e32 v45, s72 -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v45, s74 -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v45, s76 -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v45, s78 ; VI-NEXT: v_mov_b32_e32 v55, s88 +; VI-NEXT: v_mov_b32_e32 v35, s30 +; VI-NEXT: v_mov_b32_e32 v41, s58 ; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; 
VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v36, s66 +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v35, s85 +; VI-NEXT: v_mov_b32_e32 v34, s38 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v52, s64 -; VI-NEXT: v_mov_b32_e32 v55, v50 -; VI-NEXT: v_mov_b32_e32 v35, s30 ; VI-NEXT: v_mov_b32_e32 v59, s87 +; VI-NEXT: v_mov_b32_e32 v41, s60 +; VI-NEXT: v_mov_b32_e32 v55, v50 ; VI-NEXT: v_mov_b32_e32 v58, s34 ; VI-NEXT: v_mov_b32_e32 v45, s36 -; VI-NEXT: v_mov_b32_e32 v34, s38 +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v50, v46 +; VI-NEXT: v_mov_b32_e32 v46, v48 +; VI-NEXT: v_mov_b32_e32 v48, v47 +; VI-NEXT: v_mov_b32_e32 v47, v56 +; VI-NEXT: v_mov_b32_e32 v56, v51 +; VI-NEXT: v_mov_b32_e32 v51, s90 +; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v34, s48 ; VI-NEXT: v_mov_b32_e32 v1, s44 ; VI-NEXT: v_mov_b32_e32 v2, s45 ; VI-NEXT: v_mov_b32_e32 v3, s42 @@ -170123,37 +169997,19 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_mov_b32_e32 v30, s29 ; VI-NEXT: v_mov_b32_e32 v32, s5 ; VI-NEXT: v_mov_b32_e32 v41, s62 +; VI-NEXT: v_mov_b32_e32 v51, v53 +; VI-NEXT: v_mov_b32_e32 v53, v54 +; VI-NEXT: v_mov_b32_e32 v54, v40 +; VI-NEXT: v_mov_b32_e32 v40, s80 ; VI-NEXT: v_mov_b32_e32 v57, s81 ; VI-NEXT: v_mov_b32_e32 v37, s84 +; VI-NEXT: v_mov_b32_e32 v58, s50 ; VI-NEXT: v_mov_b32_e32 v60, s52 ; VI-NEXT: v_mov_b32_e32 v38, s51 ; VI-NEXT: v_mov_b32_e32 v61, s65 ; VI-NEXT: v_mov_b32_e32 v49, s66 -; VI-NEXT: v_mov_b32_e32 v39, s55 -; VI-NEXT: v_mov_b32_e32 v50, v46 -; VI-NEXT: v_mov_b32_e32 v46, v48 -; VI-NEXT: v_mov_b32_e32 v48, v47 -; VI-NEXT: v_mov_b32_e32 v47, v56 -; VI-NEXT: v_mov_b32_e32 v56, v51 -; VI-NEXT: v_mov_b32_e32 v51, s90 -; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v35, s85 -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v59, off, 
s[0:3], s32 offset:136 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v34, s48 -; VI-NEXT: v_mov_b32_e32 v51, v53 -; VI-NEXT: v_mov_b32_e32 v53, v54 -; VI-NEXT: v_mov_b32_e32 v54, v40 -; VI-NEXT: v_mov_b32_e32 v40, s80 -; VI-NEXT: v_mov_b32_e32 v58, s50 ; VI-NEXT: v_mov_b32_e32 v45, s53 +; VI-NEXT: v_mov_b32_e32 v39, s55 ; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; VI-NEXT: .LBB91_5: ; %end @@ -170462,9 +170318,10 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; VI-NEXT: v_or_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v2, v37, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -170542,9 +170399,23 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v50 ; VI-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v2, v55, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD 
src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -170562,20 +170433,6 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; VI-NEXT: s_or_saveexec_b64 s[4:5], -1 ; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload @@ -172164,11 +172021,7 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:120 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v12 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v6 ; GFX9-NEXT: v_or_b32_sdwa v1, v41, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v55, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:124 ; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -172183,6 +172036,10 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v6 +; GFX9-NEXT: v_or_b32_sdwa v2, v55, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, 
s[0:3], 0 offen offset:124 ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload @@ -172194,7 +172051,7 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_or_saveexec_b32 s4, -1 -; GFX11-TRUE16-NEXT: s_clause 0x3 +; GFX11-TRUE16-NEXT: s_clause 0x3 ; 16-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:4 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:8 @@ -173744,7 +173601,7 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v40, 0 ; GFX11-TRUE16-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-TRUE16-NEXT: s_clause 0x3 +; GFX11-TRUE16-NEXT: s_clause 0x3 ; 16-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:4 ; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:8 @@ -173757,7 +173614,7 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: s_or_saveexec_b32 s4, -1 -; GFX11-FAKE16-NEXT: s_clause 0x3 +; GFX11-FAKE16-NEXT: s_clause 0x3 ; 16-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:4 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:8 @@ -175314,7 +175171,7 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-FAKE16-NEXT: v_readlane_b32 s30, v40, 0 ; GFX11-FAKE16-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-FAKE16-NEXT: s_clause 0x3 +; GFX11-FAKE16-NEXT: s_clause 0x3 ; 16-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 ; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:4 ; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:8 @@ -175488,9 +175345,6 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v37 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:160 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:168 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:176 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v38 @@ -175508,6 +175362,9 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:220 ; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:192 ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:200 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:160 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:168 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:176 ; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 ; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr37 @@ -175525,15 +175382,15 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, 
i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v6 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:208 @@ -175669,34 +175526,37 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:964 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(3) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v6 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:360 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:968 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(3) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v4 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:972 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:368 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:388 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:384 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:360 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:976 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:48 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v18, 
off, s[0:3], s32 offset:56 +; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:40 @@ -175716,7 +175576,10 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:368 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:112 ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:120 @@ -175726,7 +175589,9 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:856 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:876 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:216 @@ -175752,14 +175617,6 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:376 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:940 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:104 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:96 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:88 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:80 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:56 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; kill: killed $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr2 @@ -175882,7 +175739,6 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v2, 0xff, v47 ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:972 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v6, 0xff, v42 -; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_and_b32_e32 v18, 0xff, v18 ; SI-NEXT: v_and_b32_e32 v22, 0xff, v22 ; SI-NEXT: v_and_b32_e32 v24, 0xff, v24 @@ -176540,25 +176396,18 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v20, 0xff, v20 ; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 ; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 -; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_add_i32_e32 v34, vcc, 3, v34 ; SI-NEXT: v_and_b32_e32 v34, 0xff, v34 -; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 ; SI-NEXT: v_and_b32_e32 v30, 0xff, v30 -; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 ; SI-NEXT: v_and_b32_e32 v28, 0xff, v28 -; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 ; SI-NEXT: v_and_b32_e32 v26, 0xff, v26 -; SI-NEXT: s_waitcnt 
vmcnt(4) ; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 ; SI-NEXT: v_and_b32_e32 v24, 0xff, v24 -; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 ; SI-NEXT: v_and_b32_e32 v22, 0xff, v22 -; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 ; SI-NEXT: v_and_b32_e32 v18, 0xff, v18 ; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 @@ -177265,9 +177114,24 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 @@ -177574,22 +177438,6 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:444 ; 
4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -177657,8 +177505,8 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v25 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v29 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v55, 8, v3 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v40, 8, v5 ; VI-NEXT: v_lshlrev_b16_e32 v41, 8, v7 ; VI-NEXT: v_lshlrev_b16_e32 v50, 8, v9 @@ -177754,13 +177602,25 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:208 ; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:216 ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188 -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:108 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100 +; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:92 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:84 +; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:76 +; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:68 +; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:52 +; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196 ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 @@ -177888,14 +177748,19 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_lshlrev_b16_e32 v38, 8, v0 ; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_lshlrev_b16_e32 v39, 8, v1 -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v49, 8, v2 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b16_e32 v51, 8, v3 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:356 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b16_e32 v51, 8, v3 +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b16_e32 v53, 8, v0 +; VI-NEXT: s_waitcnt 
vmcnt(2) +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -177903,26 +177768,6 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:372 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 -; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b16_e32 v53, 8, v0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:44 -; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:36 -; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:28 -; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:108 -; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100 -; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:92 -; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:84 -; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:76 -; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:68 -; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:60 -; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:52 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -177931,35 +177776,57 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(13) ; VI-NEXT: v_or_b32_sdwa v0, v0, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: s_waitcnt vmcnt(12) ; VI-NEXT: v_or_b32_sdwa v1, v1, v55 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: s_waitcnt vmcnt(11) ; VI-NEXT: v_or_b32_sdwa v3, v3, v41 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_or_b32_sdwa v2, v2, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(9) +; VI-NEXT: v_or_b32_sdwa v2, v2, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: v_or_b32_sdwa v4, v4, v48 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_or_b32_sdwa v10, v61, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_or_b32_sdwa v5, v5, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v57, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_or_b32_sdwa v6, v6, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v12, v59, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v47, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v14, v45, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v15, v43, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr61 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr59 +; VI-NEXT: ; implicit-def: $vgpr47 +; VI-NEXT: ; implicit-def: $vgpr45 +; VI-NEXT: ; implicit-def: $vgpr43 ; VI-NEXT: ; implicit-def: $vgpr54 ; VI-NEXT: ; implicit-def: $vgpr55 ; VI-NEXT: ; implicit-def: $vgpr40 ; VI-NEXT: ; implicit-def: $vgpr41 ; VI-NEXT: ; implicit-def: $vgpr48 ; VI-NEXT: ; implicit-def: $vgpr36 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v6, v6, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: v_or_b32_sdwa v31, v31, v49 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr49 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v2, v2, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) @@ -177992,39 +177859,19 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v6, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: 
buffer_load_dword v8, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v32, v32, v53 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr53 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(7) -; VI-NEXT: v_or_b32_sdwa v10, v61, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(6) -; VI-NEXT: v_or_b32_sdwa v11, v57, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(5) -; VI-NEXT: v_or_b32_sdwa v12, v59, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_or_b32_sdwa v13, v47, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_or_b32_sdwa v14, v45, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: ; implicit-def: $vgpr61 -; VI-NEXT: ; implicit-def: $vgpr57 -; VI-NEXT: ; implicit-def: $vgpr59 -; VI-NEXT: ; implicit-def: $vgpr47 -; VI-NEXT: ; implicit-def: $vgpr45 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v15, v43, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: ; implicit-def: $vgpr43 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v8, v62, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) @@ -178190,17 +178037,9 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v28, v28, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v31, v31, v49 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: ; implicit-def: $vgpr49 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v32, v32, v53 
dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: ; implicit-def: $vgpr53 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v30, v30, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -178885,8 +178724,8 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v27 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v29 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v54, 8, v3 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v41, 8, v5 ; GFX9-NEXT: v_lshlrev_b16_e32 v40, 8, v7 ; GFX9-NEXT: v_lshlrev_b16_e32 v51, 8, v9 @@ -178997,13 +178836,27 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:208 ; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:216 ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188 -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:108 +; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:100 +; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92 +; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84 +; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76 +; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68 +; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52 +; GFX9-NEXT: s_waitcnt vmcnt(18) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: s_waitcnt vmcnt(18) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(16) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 @@ -179136,14 +178989,19 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_lshlrev_b16_e32 v37, 8, v0 ; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_lshlrev_b16_e32 v49, 8, v1 -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 +; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_lshlrev_b16_e32 v48, 8, v2 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v52, 8, v3 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:356 -; 
GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b16_e32 v52, 8, v3 +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_lshlrev_b16_e32 v53, 8, v0 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -179151,26 +179009,6 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:372 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 -; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v53, 8, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44 -; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:36 -; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28 -; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:20 -; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:108 -; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:100 -; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92 -; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84 -; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76 -; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68 -; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:60 -; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -179179,36 +179017,62 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b32 s6, 0x5040100 +; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:480 ; 
4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: s_waitcnt vmcnt(14) ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: s_waitcnt vmcnt(13) ; GFX9-NEXT: v_or_b32_sdwa v2, v2, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_or_b32_sdwa v3, v3, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v0, v1, v0, s6 +; GFX9-NEXT: s_waitcnt vmcnt(11) +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v1, v3, v2, s6 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: s_waitcnt vmcnt(11) +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(10) +; GFX9-NEXT: v_or_b32_sdwa v10, v60, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v11, v56, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_or_b32_sdwa v4, v4, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_or_b32_sdwa v12, v58, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_or_b32_sdwa v5, v5, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v13, v46, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v6, v6, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v14, v44, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v31, v31, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: ; implicit-def: $vgpr60 +; GFX9-NEXT: ; implicit-def: $vgpr56 +; GFX9-NEXT: ; implicit-def: $vgpr58 +; GFX9-NEXT: ; implicit-def: $vgpr46 +; GFX9-NEXT: ; implicit-def: $vgpr44 ; GFX9-NEXT: ; implicit-def: $vgpr55 ; GFX9-NEXT: ; implicit-def: $vgpr54 ; GFX9-NEXT: ; implicit-def: $vgpr41 ; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: ; implicit-def: $vgpr38 ; GFX9-NEXT: ; implicit-def: $vgpr35 -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v6, v6, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr48 +; GFX9-NEXT: v_or_b32_sdwa v15, v42, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: ; implicit-def: $vgpr42 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v2, v2, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -179231,49 +179095,25 @@ define <64 x half> 
@bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v5, v5, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v5, v6, v5, s6 ; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload ; GFX9-NEXT: ; implicit-def: $vgpr34 -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v6, v6, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v6, v6, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v6, v7, v6, s6 ; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v32, v32, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: ; implicit-def: $vgpr53 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v7, v8, v7, s6 ; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(7) -; GFX9-NEXT: v_or_b32_sdwa v10, v60, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_or_b32_sdwa v11, v56, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(5) -; GFX9-NEXT: v_or_b32_sdwa v12, v58, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_or_b32_sdwa v13, v46, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_or_b32_sdwa v14, v44, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: ; implicit-def: $vgpr60 -; GFX9-NEXT: ; implicit-def: $vgpr56 -; GFX9-NEXT: ; implicit-def: $vgpr58 -; GFX9-NEXT: ; implicit-def: $vgpr46 -; GFX9-NEXT: ; implicit-def: $vgpr44 -; GFX9-NEXT: s_waitcnt vmcnt(2) -; 
GFX9-NEXT: v_or_b32_sdwa v15, v42, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: ; implicit-def: $vgpr42 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v8, v63, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -179439,17 +179279,9 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v28, v29, v28, s6 ; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v31, v31, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v32, v32, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: ; implicit-def: $vgpr48 -; GFX9-NEXT: ; implicit-def: $vgpr53 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v30, v30, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v29, v30, v29, s6 @@ -180795,7 +180627,7 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v64f16: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:580 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:576 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:572 @@ -180828,7 +180660,7 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:464 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:460 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:456 -; GFX11-FAKE16-NEXT: s_clause 0xf +; GFX11-FAKE16-NEXT: s_clause 0xf ; 64-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:452 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:448 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:444 @@ -181657,7 +181489,7 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_perm_b32 v31, v116, v31, 0x5040100 ; GFX11-FAKE16-NEXT: .LBB92_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v127, off, s32 offset:392 ; GFX11-FAKE16-NEXT: scratch_load_b32 v126, off, s32 offset:396 ; GFX11-FAKE16-NEXT: scratch_load_b32 v125, off, s32 offset:400 @@ -181690,7 +181522,7 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:508 ; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:512 ; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:516 -; GFX11-FAKE16-NEXT: s_clause 0xf +; 
GFX11-FAKE16-NEXT: s_clause 0xf ; 64-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:520 ; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:524 ; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:528 @@ -183232,17 +183064,6 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v60 -; SI-NEXT: v_add_i32_e32 v5, vcc, 0x78, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v3 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload @@ -183256,6 +183077,17 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x78, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v3 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload @@ -183515,33 +183347,53 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; VI-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:240 ; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v24 ; VI-NEXT: v_lshlrev_b32_e32 v10, 8, v26 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:124 +; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:132 +; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:140 +; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:148 +; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:156 +; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:164 +; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:172 +; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:180 +; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:188 +; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:196 +; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:204 +; VI-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:212 +; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:220 +; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:228 +; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:236 +; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:244 +; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:252 +; VI-NEXT: 
buffer_load_ushort v27, off, s[0:3], s32 offset:260 +; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:268 +; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:276 +; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:284 +; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:292 +; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:300 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:308 +; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:316 +; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:324 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v14 ; VI-NEXT: v_lshlrev_b32_e32 v16, 8, v16 ; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 ; VI-NEXT: v_lshlrev_b32_e32 v20, 8, v20 ; VI-NEXT: v_lshlrev_b32_e32 v22, 8, v22 -; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_lshlrev_b32_e32 v32, 8, v0 -; VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v4 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v5 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v6 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v7 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v36, 8, v2 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:248 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:256 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:264 @@ -183586,52 +183438,6 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:68 ; VI-NEXT: s_waitcnt vmcnt(10) ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v2 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:76 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:84 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:92 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:100 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:108 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:116 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 
4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:124 -; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:132 -; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:140 -; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:148 -; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:156 -; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:164 -; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:172 -; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:180 -; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:188 -; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:196 -; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:204 -; VI-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:212 -; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:220 -; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:228 -; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:236 -; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:244 -; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:252 -; VI-NEXT: buffer_load_ushort v27, off, s[0:3], s32 offset:260 -; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:268 -; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:276 -; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:284 -; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:292 -; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:300 -; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:308 -; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:316 -; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:324 ; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill @@ -183651,6 +183457,7 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill @@ -183659,7 +183466,6 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill @@ -183691,6 +183497,25 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v53, off, 
s[0:3], s32 offset:836 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:76 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:84 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:92 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:100 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:108 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:116 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill ; VI-NEXT: s_cbranch_scc0 .LBB93_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload @@ -183715,15 +183540,18 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: v_or_b32_sdwa v2, v2, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_or_b32_sdwa v3, v3, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v2, v8 -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_or_b32_sdwa v0, v0, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_or_b32_sdwa v1, v1, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload @@ -183773,10 +183601,11 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; VI-NEXT: v_or_b32_sdwa v1, v33, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; 
VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_or_b32_sdwa v0, v35, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -183784,50 +183613,37 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; VI-NEXT: v_or_b32_sdwa v1, v48, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v0, v49, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_or_b32_sdwa v1, v1, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v1, v60, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v2, v35, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v0, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(9) -; VI-NEXT: v_or_b32_sdwa v3, v33, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_or_b32_sdwa v0, v55, v0 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v49, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v18, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_or_b32_sdwa v0, v42, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_or_b32_sdwa v1, v41, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v19, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v0, v39, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v42, v43 ; VI-NEXT: v_mov_b32_e32 v43, v37 -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_or_b32_sdwa v1, v46, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v20, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload @@ -183842,13 +183658,12 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v1, v24, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_or_b32_sdwa v0, v26, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v23, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_or_b32_sdwa v0, v34, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) @@ -183870,21 +183685,28 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; VI-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v0, v31, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v54, v33 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v56, v1 ; VI-NEXT: v_or_b32_sdwa v1, v59, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 
offset:528 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v2, v35, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_or_b32_sdwa v0, v63, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_or_b32_sdwa v1, v30, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v3, v33, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v63, v39 +; VI-NEXT: v_mov_b32_e32 v54, v33 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_mov_b32_e32 v57, v0 ; VI-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -183902,11 +183724,10 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; VI-NEXT: v_or_b32_sdwa v31, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v53, v35 -; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v3, s4, v0 ; VI-NEXT: s_and_b32 s4, s16, 0xff @@ -183939,7 +183760,6 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; VI-NEXT: s_branch .LBB93_3 ; VI-NEXT: .LBB93_2: ; VI-NEXT: v_mov_b32_e32 v47, v54 -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload @@ -183960,6 +183780,7 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v58, v7 ; VI-NEXT: v_mov_b32_e32 v57, v5 ; VI-NEXT: v_mov_b32_e32 v56, v3 @@ 
-184551,29 +184372,51 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; GFX9-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:224 ; GFX9-NEXT: buffer_load_ushort v9, off, s[0:3], s32 offset:232 ; GFX9-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:240 +; GFX9-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:156 +; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:164 +; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:172 +; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:180 +; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:188 +; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:196 +; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:204 +; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:212 +; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:220 +; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:228 +; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:236 +; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:244 +; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:252 +; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:260 +; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:268 +; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:276 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:284 +; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:292 +; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:300 +; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:308 +; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:316 +; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:324 ; GFX9-NEXT: v_lshlrev_b32_e32 v46, 8, v46 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec -; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: s_waitcnt vmcnt(29) ; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v11 -; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: s_waitcnt vmcnt(28) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: s_waitcnt vmcnt(28) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v13 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: s_waitcnt vmcnt(28) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v3 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: s_waitcnt vmcnt(28) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v4 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: s_waitcnt vmcnt(28) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v5 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: s_waitcnt vmcnt(28) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v9 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: s_waitcnt vmcnt(28) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v7 ; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill @@ -184637,82 +184480,42 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 
offset:124 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:132 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:140 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:148 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:156 -; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:164 -; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:172 -; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:180 -; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:188 -; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:196 -; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:204 -; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:212 -; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:220 -; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:228 -; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:236 -; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:244 -; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:252 -; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:260 -; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:268 -; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:276 -; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:284 -; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:292 -; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:300 -; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:308 -; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:316 -; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:324 -; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) ; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(22) ; GFX9-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) ; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(24) ; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(28) 
; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(31) ; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) ; GFX9-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) ; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) ; GFX9-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(35) ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(35) ; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill @@ -184733,6 +184536,13 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(55) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:140 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:148 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill ; GFX9-NEXT: s_cbranch_scc0 .LBB93_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: s_and_b32 s4, s28, 0xff @@ -184986,14 +184796,13 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; GFX9-NEXT: v_lshl_or_b32 v30, v1, 16, v0 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v62, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:408 ; 
4-byte Folded Reload ; GFX9-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_or_b32_sdwa v1, v62, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v31, v1, 16, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 @@ -185003,7 +184812,6 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; GFX9-NEXT: .LBB93_2: ; GFX9-NEXT: v_mov_b32_e32 v58, v50 ; GFX9-NEXT: v_mov_b32_e32 v45, v59 -; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload @@ -185015,6 +184823,7 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload ; GFX9-NEXT: v_mov_b32_e32 v34, v35 +; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload ; GFX9-NEXT: v_mov_b32_e32 v49, v39 @@ -185480,7 +185289,7 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; GFX11-TRUE16-LABEL: bitcast_v128i8_to_v64f16_scalar: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: s_clause 0x1e +; GFX11-TRUE16-NEXT: s_clause 0x1e ; 124-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:440 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:436 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:432 @@ -186210,7 +186019,7 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; GFX11-TRUE16-NEXT: v_mov_b16_e64 v30.h, v182.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.h, v43.l ; GFX11-TRUE16-NEXT: .LBB93_3: ; %end -; GFX11-TRUE16-NEXT: s_clause 0x1e +; GFX11-TRUE16-NEXT: s_clause 0x1e ; 124-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:320 ; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:324 ; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:328 @@ -186252,7 +186061,7 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v64f16_scalar: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_clause 0x1e +; GFX11-FAKE16-NEXT: s_clause 0x1e ; 124-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:440 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:436 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:432 @@ -187036,7 +186845,7 @@ define inreg <64 x half> 
@bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v30, v30, 16, v35 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v31, v31, 16, v36 ; GFX11-FAKE16-NEXT: .LBB93_3: ; %end -; GFX11-FAKE16-NEXT: s_clause 0x1e +; GFX11-FAKE16-NEXT: s_clause 0x1e ; 124-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v94, off, s32 offset:320 ; GFX11-FAKE16-NEXT: scratch_load_b32 v93, off, s32 offset:324 ; GFX11-FAKE16-NEXT: scratch_load_b32 v92, off, s32 offset:328 @@ -189053,13 +188862,6 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v45 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload @@ -189076,6 +188878,13 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -189098,27 +188907,42 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v62, 16, v4 +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v44, v12 ; VI-NEXT: v_mov_b32_e32 v12, v0 ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:8 ; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:4 ; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v8 +; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v32, v20 ; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v16 ; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v43, v11 ; VI-NEXT: 
v_lshrrev_b32_e32 v38, 16, v14 +; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v10 -; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v8 -; VI-NEXT: v_lshrrev_b32_e32 v62, 16, v4 -; VI-NEXT: v_mov_b32_e32 v32, v20 +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v55, v22 ; VI-NEXT: v_mov_b32_e32 v54, v21 ; VI-NEXT: v_mov_b32_e32 v31, v19 +; VI-NEXT: v_mov_b32_e32 v43, v11 ; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v44 +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v43 ; VI-NEXT: ; implicit-def: $vgpr20 ; VI-NEXT: ; implicit-def: $vgpr57 ; VI-NEXT: ; implicit-def: $vgpr51 +; VI-NEXT: ; implicit-def: $vgpr8 +; VI-NEXT: ; implicit-def: $vgpr4 ; VI-NEXT: ; implicit-def: $vgpr41 ; VI-NEXT: ; implicit-def: $vgpr56 ; VI-NEXT: ; implicit-def: $vgpr63 @@ -189130,47 +188954,38 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr42 ; VI-NEXT: ; implicit-def: $vgpr45 ; VI-NEXT: ; implicit-def: $vgpr52 -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v30 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v32 -; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v15 -; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v43 -; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v31 -; VI-NEXT: ; implicit-def: $vgpr8 -; VI-NEXT: ; implicit-def: $vgpr15 -; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v13 -; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v9 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v18 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v34 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v7 ; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 ; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v11, 
16, v15 +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v31 +; VI-NEXT: ; implicit-def: $vgpr15 ; VI-NEXT: ; implicit-def: $vgpr13 ; VI-NEXT: ; implicit-def: $vgpr9 -; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr5 -; VI-NEXT: ; implicit-def: $vgpr4 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v30 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; VI-NEXT: ; implicit-def: $vgpr30 +; VI-NEXT: ; implicit-def: $vgpr34 ; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v29 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v28 @@ -189179,38 +188994,6 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; VI-NEXT: ; implicit-def: $vgpr29 ; VI-NEXT: ; implicit-def: $vgpr28 -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v27 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v26 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v17 -; VI-NEXT: ; implicit-def: $vgpr27 -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v25 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v24 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v18 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(14) -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v34 -; VI-NEXT: ; implicit-def: $vgpr25 -; VI-NEXT: ; implicit-def: $vgpr24 -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v23 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v55 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v54 -; VI-NEXT: 
buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr23 -; VI-NEXT: ; implicit-def: $vgpr34 ; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v33 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; VI-NEXT: ; implicit-def: $vgpr0 @@ -189254,8 +189037,34 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0 ; VI-NEXT: ; kill: killed $vgpr0 ; VI-NEXT: ; implicit-def: $vgpr0 +; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; VI-NEXT: ; kill: killed $vgpr0 ; VI-NEXT: ; implicit-def: $vgpr0 +; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr5 +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v27 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v26 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v17 +; VI-NEXT: ; implicit-def: $vgpr27 +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v25 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v24 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr25 +; VI-NEXT: ; implicit-def: $vgpr24 +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v23 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v55 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v54 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr23 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; VI-NEXT: ; implicit-def: $vgpr10 @@ -189293,28 +189102,49 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v56, v38 +; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v45, v7 -; VI-NEXT: v_mov_b32_e32 v63, v53 +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v15, v3 +; VI-NEXT: buffer_load_dword 
v58, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v28, v48 ; VI-NEXT: v_mov_b32_e32 v48, v16 ; VI-NEXT: v_mov_b32_e32 v16, v40 ; VI-NEXT: v_mov_b32_e32 v47, v39 +; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v63, v53 +; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; VI-NEXT: v_lshrrev_b32_e32 v61, 8, v32 +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v61, 8, v31 +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v61, 8, v18 +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v61, 8, v17 +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v29, 24, v44 ; VI-NEXT: v_lshrrev_b32_e32 v5, 24, v32 ; VI-NEXT: v_lshrrev_b32_e32 v13, 24, v18 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_lshrrev_b32_e32 v2, 24, v1 ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v2, 8, v1 ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v2, 8, v0 ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; VI-NEXT: v_lshrrev_b64 v[0:1], 24, v[0:1] -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshrrev_b32_e32 v2, 8, v38 ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v2, 8, v37 @@ -189326,83 +189156,20 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v62, v36 -; VI-NEXT: v_lshrrev_b32_e32 v41, 24, v38 -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshrrev_b32_e32 v2, 8, v11 ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v2, 8, v10 ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; VI-NEXT: 
buffer_load_dword v6, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; VI-NEXT: v_lshrrev_b32_e32 v8, 24, v11 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshrrev_b32_e32 v23, 8, v6 -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshrrev_b32_e32 v2, 24, v7 ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v2, 8, v7 ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshrrev_b32_e32 v24, 8, v52 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshrrev_b32_e32 v57, 24, v53 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshrrev_b32_e32 v4, 24, v3 -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; VI-NEXT: v_lshrrev_b32_e32 v20, 8, v53 -; VI-NEXT: v_lshrrev_b32_e32 v19, 8, v2 -; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v3 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshrrev_b32_e32 v4, 24, v59 -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v59 -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v58 -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v26 -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshrrev_b32_e32 v14, 24, v27 -; VI-NEXT: v_lshrrev_b32_e32 v60, 8, v27 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v33 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v34 -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v62, v36 ; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; VI-NEXT: v_lshrrev_b32_e32 v42, 24, v34 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshrrev_b32_e32 v22, 8, v35 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshrrev_b32_e32 v9, 24, v36 -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:132 ; 4-byte 
Folded Reload -; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v61, 8, v31 -; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v61, 8, v18 -; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v61, 8, v17 -; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[0:1], 24, v[0:1] ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b64 v[0:1], 24, v[37:38] @@ -189417,61 +189184,94 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b64 v[0:1], 24, v[6:7] ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[0:1], 24, v[2:3] -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[0:1], 24, v[26:27] -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[0:1], 24, v[33:34] -; VI-NEXT: v_lshrrev_b32_e32 v46, 8, v36 -; VI-NEXT: v_lshrrev_b64 v[37:38], 24, v[35:36] +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v33 +; VI-NEXT: v_lshrrev_b32_e32 v41, 24, v38 +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v30, 8, v50 +; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v49 +; VI-NEXT: v_lshrrev_b32_e32 v51, 24, v40 +; VI-NEXT: v_lshrrev_b32_e32 v8, 24, v11 +; VI-NEXT: v_lshrrev_b32_e32 v57, 24, v53 +; VI-NEXT: v_lshrrev_b32_e32 v20, 8, v53 +; VI-NEXT: v_lshrrev_b32_e32 v24, 8, v52 +; VI-NEXT: v_lshrrev_b32_e32 v14, 24, v27 +; VI-NEXT: v_lshrrev_b32_e32 v42, 24, v34 ; VI-NEXT: v_lshrrev_b64 v[10:11], 24, v[52:53] ; VI-NEXT: v_lshrrev_b64 v[52:53], 24, v[58:59] -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32] +; VI-NEXT: v_lshrrev_b32_e32 v60, 8, v27 ; VI-NEXT: v_mov_b32_e32 v53, v63 -; VI-NEXT: v_mov_b32_e32 v27, v19 -; VI-NEXT: v_mov_b32_e32 v34, v14 -; VI-NEXT: v_lshrrev_b32_e32 v9, 24, v55 +; VI-NEXT: v_lshrrev_b32_e32 v63, 8, v40 +; VI-NEXT: v_lshrrev_b32_e32 v23, 8, v6 ; VI-NEXT: v_mov_b32_e32 v7, v45 ; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v31 -; VI-NEXT: v_mov_b32_e32 v3, v15 -; VI-NEXT: v_mov_b32_e32 v15, v29 -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v17 -; VI-NEXT: v_mov_b32_e32 v38, v56 -; VI-NEXT: v_mov_b32_e32 v29, v41 ; VI-NEXT: v_mov_b32_e32 v45, v60 -; VI-NEXT: v_lshrrev_b32_e32 v41, 8, v55 ; VI-NEXT: s_waitcnt vmcnt(14) -; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v49 +; VI-NEXT: v_lshrrev_b32_e32 v19, 8, v2 +; VI-NEXT: s_waitcnt vmcnt(13) +; VI-NEXT: v_lshrrev_b32_e32 v4, 24, v3 +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v4, 24, v59 +; VI-NEXT: v_lshrrev_b64 
v[0:1], 24, v[2:3] +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v59 +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v58 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v26 +; VI-NEXT: v_lshrrev_b64 v[0:1], 24, v[26:27] +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v34 +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_lshrrev_b32_e32 v9, 24, v36 +; VI-NEXT: v_lshrrev_b32_e32 v46, 8, v36 +; VI-NEXT: v_lshrrev_b32_e32 v22, 8, v35 ; VI-NEXT: v_lshrrev_b32_e32 v4, 24, v50 -; VI-NEXT: v_lshrrev_b32_e32 v30, 8, v50 -; VI-NEXT: v_lshrrev_b32_e32 v51, 24, v40 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[37:38], 24, v[35:36] ; VI-NEXT: v_lshrrev_b64 v[35:36], 24, v[49:50] ; VI-NEXT: v_lshrrev_b64 v[49:50], 24, v[39:40] ; VI-NEXT: v_mov_b32_e32 v58, v51 +; VI-NEXT: v_lshrrev_b64 v[0:1], 24, v[33:34] ; VI-NEXT: v_mov_b32_e32 v36, v62 ; VI-NEXT: v_lshrrev_b64 v[61:62], 24, v[54:55] +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32] ; VI-NEXT: v_lshrrev_b64 v[50:51], 24, v[17:18] -; VI-NEXT: v_lshrrev_b32_e32 v63, 8, v40 -; VI-NEXT: v_mov_b32_e32 v40, v16 -; VI-NEXT: v_mov_b32_e32 v16, v48 -; VI-NEXT: v_mov_b32_e32 v48, v28 -; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v27, v19 +; VI-NEXT: v_mov_b32_e32 v34, v14 ; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v40, v16 +; VI-NEXT: v_mov_b32_e32 v16, v48 +; VI-NEXT: v_mov_b32_e32 v48, v28 +; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v3 +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v9, 24, v55 +; VI-NEXT: v_mov_b32_e32 v3, v15 +; VI-NEXT: v_mov_b32_e32 v15, v29 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v17 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v38, v56 ; VI-NEXT: v_lshrrev_b32_e32 v56, 8, v39 +; VI-NEXT: v_mov_b32_e32 v29, v41 ; VI-NEXT: v_mov_b32_e32 v39, v47 ; VI-NEXT: v_mov_b32_e32 v47, v4 ; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v54 +; VI-NEXT: v_lshrrev_b32_e32 v41, 8, v55 ; VI-NEXT: .LBB94_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; VI-NEXT: s_cbranch_execz .LBB94_4 ; VI-NEXT: ; %bb.3: ; %cmp.true +; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; VI-NEXT: 
v_mov_b32_e32 v63, 0x200 ; VI-NEXT: v_add_f16_sdwa v21, v18, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v21 @@ -189490,36 +189290,47 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; VI-NEXT: v_add_f16_e32 v31, 0x200, v31 ; VI-NEXT: v_add_f16_sdwa v23, v55, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: s_waitcnt vmcnt(9) ; VI-NEXT: v_or_b32_e32 v14, v31, v0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v23 ; VI-NEXT: v_add_f16_e32 v55, 0x200, v55 -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: v_or_b32_e32 v62, v55, v0 ; VI-NEXT: v_add_f16_sdwa v0, v54, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; VI-NEXT: v_add_f16_e32 v54, 0x200, v54 ; VI-NEXT: v_or_b32_e32 v61, v54, v0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v26, v54 ; VI-NEXT: v_mov_b32_e32 v27, v55 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(10) ; VI-NEXT: v_add_f16_sdwa v60, v25, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v60 ; VI-NEXT: v_add_f16_e32 v25, 0x200, v25 ; VI-NEXT: v_or_b32_e32 v34, v25, v0 ; VI-NEXT: v_add_f16_sdwa v0, v24, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v24, 0x200, v24 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_add_f16_sdwa v11, v7, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v7, 0x200, v7 +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_add_f16_sdwa v13, v54, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v54, 0x200, v54 +; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; VI-NEXT: v_or_b32_e32 v33, v24, v0 -; VI-NEXT: s_waitcnt vmcnt(0) ; 
VI-NEXT: v_add_f16_sdwa v0, v2, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 @@ -189527,13 +189338,21 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v36, v2, v0 ; VI-NEXT: v_add_f16_sdwa v0, v1, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v1, 0x200, v1 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_e32 v35, v1, v0 ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_add_f16_sdwa v19, v24, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_f16_sdwa v42, v25, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v25, 0x200, v25 +; VI-NEXT: v_add_f16_e32 v24, 0x200, v24 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_f16_sdwa v0, v2, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill @@ -189542,38 +189361,34 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v38, v2, v0 ; VI-NEXT: v_add_f16_sdwa v0, v1, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v1, 0x200, v1 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_e32 v37, v1, v0 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_f16_sdwa v1, v8, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_f16_sdwa v0, v9, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v9, 0x200, v9 +; VI-NEXT: v_add_f16_sdwa v1, v8, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v8, 0x200, v8 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 
offset:76 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; VI-NEXT: v_or_b32_e32 v49, v9, v0 -; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_add_f16_sdwa v47, v3, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v3, 0x200, v3 +; VI-NEXT: v_or_b32_e32 v49, v9, v0 +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v1 ; VI-NEXT: v_add_f16_sdwa v1, v2, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v2, 0x200, v2 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; VI-NEXT: v_or_b32_e32 v48, v8, v0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v47 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v9, v31 ; VI-NEXT: v_add_f16_sdwa v8, v43, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v10, v32 @@ -189591,11 +189406,11 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: v_add_f16_e32 v2, 0x200, v2 ; VI-NEXT: v_add_f16_e32 v1, 0x200, v1 ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; VI-NEXT: v_or_b32_e32 v53, v2, v0 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; VI-NEXT: v_add_f16_sdwa v3, v44, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v44, 0x200, v44 ; VI-NEXT: v_or_b32_e32 v52, v1, v0 @@ -189612,28 +189427,32 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v46, v2, v0 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v59 ; VI-NEXT: v_or_b32_e32 v45, v1, v0 -; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_f16_sdwa v1, v6, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_f16_sdwa v11, v7, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v7, 0x200, v7 ; VI-NEXT: v_add_f16_e32 v6, 0x200, v6 ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v11 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; VI-NEXT: v_or_b32_e32 v5, v7, v0 ; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; 
VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_e32 v4, v6, v0 ; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_add_f16_sdwa v16, v1, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_f16_sdwa v28, v2, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, 0x200, v2 +; VI-NEXT: v_add_f16_e32 v1, 0x200, v1 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_add_f16_sdwa v39, v6, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_f16_sdwa v56, v7, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v7, 0x200, v7 ; VI-NEXT: v_add_f16_e32 v6, 0x200, v6 @@ -189641,36 +189460,13 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v41, v7, v0 ; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v39 -; VI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_e32 v40, v6, v0 -; VI-NEXT: s_waitcnt vmcnt(5) -; VI-NEXT: v_add_f16_sdwa v19, v24, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_add_f16_sdwa v42, v25, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v25, 0x200, v25 -; VI-NEXT: v_add_f16_e32 v24, 0x200, v24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v42 ; VI-NEXT: v_or_b32_e32 v7, v25, v0 ; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_add_f16_sdwa v28, v2, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v2, 0x200, v2 -; VI-NEXT: v_add_f16_sdwa v16, v1, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v1, 0x200, v1 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v19 -; 
VI-NEXT: s_waitcnt vmcnt(5) -; VI-NEXT: v_add_f16_sdwa v13, v54, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v54, 0x200, v54 -; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v46 ; VI-NEXT: v_or_b32_e32 v6, v24, v0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 @@ -189679,7 +189475,6 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v31, v43, v0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v28 ; VI-NEXT: v_or_b32_e32 v30, v2, v0 -; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_add_f16_sdwa v2, v55, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v55, 0x200, v55 ; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill @@ -189695,8 +189490,6 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v1 ; VI-NEXT: v_lshrrev_b32_e32 v54, 8, v0 ; VI-NEXT: v_lshrrev_b64 v[0:1], 24, v[0:1] -; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v30 @@ -189714,21 +189507,21 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v7 +; VI-NEXT: v_mov_b32_e32 v32, v10 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v6 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[0:1], 24, v[6:7] -; VI-NEXT: v_mov_b32_e32 v32, v10 ; VI-NEXT: v_mov_b32_e32 v31, v9 ; VI-NEXT: v_lshrrev_b32_e32 v10, 8, v41 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[0:1], 24, v[6:7] ; VI-NEXT: v_mov_b32_e32 v7, v11 ; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b64 v[10:11], 24, v[40:41] +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v55, v27 ; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v54, v26 ; VI-NEXT: v_mov_b32_e32 v26, v20 ; VI-NEXT: v_lshrrev_b32_e32 v20, 8, v5 @@ -189736,23 +189529,14 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: v_mov_b32_e32 v5, v22 ; VI-NEXT: v_mov_b32_e32 v13, v21 ; VI-NEXT: v_lshrrev_b64 v[21:22], 24, v[45:46] -; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload ; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:248 ; 
4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v53 -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v52 ; VI-NEXT: v_lshrrev_b64 v[21:22], 24, v[50:51] -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v50 -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v48 ; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b64 v[21:22], 24, v[48:49] -; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v49 -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v30, 8, v36 @@ -189760,27 +189544,39 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b64 v[35:36], 24, v[35:36] ; VI-NEXT: v_mov_b32_e32 v36, v2 ; VI-NEXT: v_lshrrev_b32_e32 v2, 8, v15 -; VI-NEXT: v_lshrrev_b32_e32 v41, 8, v62 -; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v61 -; VI-NEXT: v_lshrrev_b64 v[61:62], 24, v[61:62] +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v2, 8, v14 ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v53 +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v52 +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v50 +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v49 ; VI-NEXT: v_mov_b32_e32 v48, v56 ; VI-NEXT: v_lshrrev_b32_e32 v56, 8, v33 ; VI-NEXT: v_lshrrev_b64 v[49:50], 24, v[33:34] ; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[14:15] ; VI-NEXT: v_lshrrev_b32_e32 v14, 8, v58 +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v41, 8, v62 +; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v61 +; VI-NEXT: v_lshrrev_b64 v[61:62], 24, v[61:62] ; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v14, 8, v57 -; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v9, v23 ; VI-NEXT: v_lshrrev_b32_e32 v23, 8, v40 +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v14, v8 +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v40, v42 ; VI-NEXT: v_bfe_u32 v8, v42, 8, 8 +; VI-NEXT: buffer_load_dword v11, 
off, s[0:3], s32 offset:396 ; 4-byte Folded Reload ; VI-NEXT: v_lshrrev_b32_e32 v46, 8, v38 ; VI-NEXT: v_lshrrev_b32_e32 v22, 8, v37 ; VI-NEXT: v_lshrrev_b64 v[37:38], 24, v[37:38] @@ -189797,26 +189593,24 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: v_bfe_u32 v51, v48, 8, 8 ; VI-NEXT: v_bfe_u32 v57, v7, 8, 8 ; VI-NEXT: v_bfe_u32 v58, v60, 8, 8 -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_bfe_u32 v34, v62, 8, 8 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_bfe_u32 v2, v2, 8, 8 -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; VI-NEXT: v_bfe_u32 v34, v47, 8, 8 ; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; VI-NEXT: v_bfe_u32 v9, v9, 8, 8 ; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; VI-NEXT: v_bfe_u32 v5, v5, 8, 8 ; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; VI-NEXT: v_bfe_u32 v13, v13, 8, 8 -; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: s_waitcnt vmcnt(12) +; VI-NEXT: v_bfe_u32 v2, v2, 8, 8 +; VI-NEXT: s_waitcnt vmcnt(11) ; VI-NEXT: v_bfe_u32 v42, v0, 8, 8 ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_bfe_u32 v34, v62, 8, 8 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; VI-NEXT: v_bfe_u32 v34, v47, 8, 8 +; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_bfe_u32 v0, v0, 8, 8 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload @@ -189866,27 +189660,25 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v25 ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: v_or_b32_sdwa v1, v62, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v1, vcc, 12, v12 ; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v10 -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b16_e32 
v1, 8, v24 -; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v1, vcc, 16, v12 ; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v20 ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v57 -; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v7, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -189898,9 +189690,13 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v23 +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; VI-NEXT: v_or_b32_sdwa v0, v39, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v1, vcc, 24, v12 @@ -189953,7 +189749,9 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v15 ; VI-NEXT: v_or_b32_sdwa v1, v53, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; VI-NEXT: v_or_b32_sdwa v0, v44, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -189962,14 +189760,11 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; VI-NEXT: v_or_b32_sdwa v0, v16, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v1, vcc, 48, v12 @@ -189991,11 +189786,8 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v1, vcc, 56, v12 @@ -190003,7 +189795,9 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v28 ; VI-NEXT: v_or_b32_sdwa v1, v36, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; VI-NEXT: v_or_b32_sdwa v0, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -190053,7 +189847,9 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v4 ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v61 ; VI-NEXT: v_or_b32_sdwa v0, v54, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v1, vcc, 0x50, v12 @@ -190067,12 +189863,9 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v1, vcc, 0x54, v12 ; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v56 ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v49 -; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_or_b32_sdwa v0, v24, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -190088,35 +189881,35 @@ define <128 x i8> 
@bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v1, vcc, 0x5c, v12 ; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v21 ; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v35 ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v0, v20, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v20, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v1, vcc, 0x60, v12 ; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v30 ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v47 +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_or_b32_sdwa v0, v21, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v1, vcc, 0x64, v12 ; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v22 ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v37 -; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_or_b32_sdwa v0, v28, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v1, vcc, 0x68, v12 @@ -190135,13 +189928,10 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_or_b32_sdwa v0, v1, 
v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v3 ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) @@ -190161,12 +189951,10 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v1, vcc, 0x74, v12 ; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 @@ -190176,15 +189964,7 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v1, vcc, 0x78, v12 ; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v45 -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v34 -; VI-NEXT: v_or_b32_sdwa v0, v23, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7c, v12 -; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload @@ -190201,28 +189981,20 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v34 +; VI-NEXT: v_or_b32_sdwa v0, v23, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7c, v12 +; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: bitcast_v64f16_to_v128i8: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 
4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 @@ -190285,6 +190057,23 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr50 ; GFX9-NEXT: ; kill: killed $vgpr50 ; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr44 ; GFX9-NEXT: ; kill: killed $vgpr50 ; GFX9-NEXT: ; implicit-def: $vgpr50 @@ -190315,7 +190104,6 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr52 ; GFX9-NEXT: ; implicit-def: $vgpr51 ; GFX9-NEXT: ; implicit-def: $vgpr50 -; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; 
GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill @@ -190349,7 +190137,7 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(17) +; GFX9-NEXT: s_waitcnt vmcnt(33) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 ; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: ; kill: killed $vgpr33 @@ -190472,101 +190260,100 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v32 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(46) +; GFX9-NEXT: s_waitcnt vmcnt(62) ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v31 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v31 +; GFX9-NEXT: v_lshrrev_b64 v[50:51], 24, v[15:16] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v30 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v30 +; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v30 +; GFX9-NEXT: v_lshrrev_b64 v[50:51], 24, v[13:14] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v29 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v29 +; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v28 +; GFX9-NEXT: v_lshrrev_b64 v[50:51], 24, v[11:12] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v28 +; GFX9-NEXT: v_lshrrev_b64 v[51:52], 24, v[9:10] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v28 +; GFX9-NEXT: v_lshrrev_b64 v[52:53], 24, v[7:8] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v27 +; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[31:32] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v27 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v26 +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded 
Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[29:30] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v26 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v25 +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v25 +; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[27:28] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v24 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v24 +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v24 +; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[25:26] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v23 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v23 +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v22 +; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[23:24] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v22 +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v21 +; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[21:22] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v21 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v20 -; GFX9-NEXT: v_lshrrev_b64 v[50:51], 24, v[15:16] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v20 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[50:51], 24, v[13:14] -; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; 
GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[50:51], 24, v[11:12] -; GFX9-NEXT: v_lshrrev_b64 v[51:52], 24, v[9:10] -; GFX9-NEXT: v_lshrrev_b64 v[52:53], 24, v[7:8] -; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[31:32] -; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[29:30] -; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[27:28] -; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[25:26] -; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[23:24] -; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[21:22] ; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[5:6] ; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v20 ; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[3:4] ; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[19:20] ; GFX9-NEXT: v_lshrrev_b32_e32 v59, 8, v10 @@ -190582,6 +190369,7 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v44, 8, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v56, 24, v32 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v62, 8, v20 ; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v19 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v19 @@ -190607,7 +190395,7 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[13:14] ; GFX9-NEXT: v_pk_add_f16 v32, v32, s6 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(18) +; GFX9-NEXT: s_waitcnt vmcnt(34) ; GFX9-NEXT: v_pk_add_f16 v31, v31, s6 op_sel_hi:[1,0] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 @@ -191148,17 +190936,7 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:120 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v56 -; GFX9-NEXT: 
buffer_load_dword v3, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:124 ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload @@ -191175,6 +190953,18 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:120 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:124 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -191633,7 +191423,11 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX11-FAKE16-LABEL: bitcast_v64f16_to_v128i8: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_clause 0x13 +; GFX11-FAKE16-NEXT: s_clause 0x2 +; GFX11-FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-FAKE16-NEXT: s_clause 0x13 ; 80-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:88 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:84 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:80 @@ -191654,10 +191448,6 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:20 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:16 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:12 -; GFX11-FAKE16-NEXT: s_clause 0x2 -; GFX11-FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:8 -; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 -; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr74 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr72 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64 @@ -192293,7 +192083,7 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[13:16], off 
offset:80 ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:96 ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:112 -; GFX11-FAKE16-NEXT: s_clause 0x13 +; GFX11-FAKE16-NEXT: s_clause 0x13 ; 80-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:12 ; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:16 ; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:20 @@ -193940,13 +193730,27 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 ; SI-NEXT: v_mov_b32_e32 v4, s6 +; SI-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v47 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: s_and_b32 s6, s7, 0xff ; SI-NEXT: s_lshl_b32 s7, s51, 8 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen ; SI-NEXT: s_or_b32 s6, s6, s7 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v47 ; SI-NEXT: s_and_b32 s6, s6, 0xffff ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v1, s6, v1 @@ -193979,21 +193783,6 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v1, s4, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: 
buffer_load_dword v42, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: v_readlane_b32 s45, v62, 17 ; SI-NEXT: v_readlane_b32 s43, v62, 23 ; SI-NEXT: v_readlane_b32 s41, v62, 29 @@ -194001,6 +193790,7 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_readlane_b32 s27, v62, 41 ; SI-NEXT: v_readlane_b32 s25, v62, 45 ; SI-NEXT: v_readlane_b32 s9, v62, 49 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: v_readlane_b32 s99, v63, 35 ; SI-NEXT: v_readlane_b32 s98, v63, 34 ; SI-NEXT: v_readlane_b32 s97, v63, 33 @@ -194483,8 +194273,6 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v14, 8, v7 ; VI-NEXT: v_lshrrev_b64 v[7:8], 24, v[7:8] -; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v7, 8, v10 @@ -194492,6 +194280,7 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; VI-NEXT: v_lshrrev_b32_e32 v7, 8, v9 ; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v7, 8, v13 +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v7, 8, v12 ; VI-NEXT: v_lshrrev_b64 v[13:14], 24, v[12:13] @@ -194499,12 +194288,6 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v12, 8, v1 ; VI-NEXT: v_lshrrev_b64 v[1:2], 24, v[1:2] -; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v7, 8, v16 -; VI-NEXT: v_lshrrev_b32_e32 v8, 8, v19 -; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v4 @@ -194512,14 +194295,20 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v3 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b64 v[1:2], 24, v[3:4] +; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b64 v[9:10], 24, v[9:10] +; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v7, 8, v16 ; VI-NEXT: v_lshrrev_b64 v[16:17], 24, v[15:16] ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v2, off, 
s[0:3], s32 offset:288 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v8, 8, v19 ; VI-NEXT: v_lshrrev_b32_e32 v10, 8, v18 ; VI-NEXT: v_lshrrev_b64 v[17:18], 24, v[18:19] ; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v31 +; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v7, 8, v15 +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v8, 8, v35 ; VI-NEXT: v_lshrrev_b64 v[18:19], 24, v[34:35] ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill @@ -194554,6 +194343,7 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; VI-NEXT: v_bfe_u32 v11, v52, 8, 8 ; VI-NEXT: v_lshrrev_b32_e32 v46, 8, v33 ; VI-NEXT: v_lshrrev_b32_e32 v58, 8, v32 +; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v32, 8, v29 ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v28 @@ -195163,9 +194953,11 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v27 ; VI-NEXT: v_or_b32_sdwa v2, v59, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -195174,13 +194966,10 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v6 ; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) @@ -195312,9 +195101,22 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: 
buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) @@ -195382,20 +195184,6 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; VI-NEXT: s_or_saveexec_b64 s[4:5], -1 ; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload @@ -195713,42 +195501,42 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[9:10] -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[11:12] -; GFX9-NEXT: buffer_store_dword v15, 
off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[13:14] -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[21:22] ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 24, v4 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v3 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v3 +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[11:12] ; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 24, v6 ; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v6 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v6 +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[13:14] ; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v5 ; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v5 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 24, v8 +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[21:22] ; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v8 ; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v8 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v7 ; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v26 @@ -196400,9 +196188,11 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-NEXT: 
buffer_load_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v15, v35, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:16 @@ -196434,10 +196224,7 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v15, v33, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:28 -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v18 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -196688,8 +196475,6 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v24 ; GFX9-NEXT: v_or_b32_sdwa v2, v41, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:124 ; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -196704,6 +196489,8 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:124 ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload @@ -196715,7 +196502,7 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_or_saveexec_b32 s4, -1 -; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: s_clause 0x3 ; 16-byte Folded Spill ; GFX11-NEXT: scratch_store_b32 off, v75, s32 offset:76 ; GFX11-NEXT: scratch_store_b32 off, v76, s32 offset:80 ; GFX11-NEXT: scratch_store_b32 off, v77, s32 offset:84 @@ -196750,7 +196537,7 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; GFX11-NEXT: v_writelane_b32 v76, s101, 5 ; GFX11-NEXT: s_mov_b32 s99, 0 ; 
GFX11-NEXT: s_and_b32 s42, vcc_lo, exec_lo -; GFX11-NEXT: s_clause 0x12 +; GFX11-NEXT: s_clause 0x12 ; 76-byte Folded Spill ; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:72 ; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:68 ; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:64 @@ -197669,7 +197456,7 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; GFX11-NEXT: scratch_store_b128 v0, v[11:14], off offset:80 ; GFX11-NEXT: scratch_store_b128 v0, v[7:10], off offset:96 ; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:112 -; GFX11-NEXT: s_clause 0x12 +; GFX11-NEXT: s_clause 0x12 ; 76-byte Folded Reload ; GFX11-NEXT: scratch_load_b32 v74, off, s32 ; GFX11-NEXT: scratch_load_b32 v73, off, s32 offset:4 ; GFX11-NEXT: scratch_load_b32 v72, off, s32 offset:8 @@ -197731,7 +197518,7 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; GFX11-NEXT: v_readlane_b32 s31, v75, 1 ; GFX11-NEXT: v_readlane_b32 s30, v75, 0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: s_clause 0x3 ; 16-byte Folded Reload ; GFX11-NEXT: scratch_load_b32 v75, off, s32 offset:76 ; GFX11-NEXT: scratch_load_b32 v76, off, s32 offset:80 ; GFX11-NEXT: scratch_load_b32 v77, off, s32 offset:84 @@ -197782,11 +197569,11 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v54, v15 ; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:920 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v54, v15 ; SI-NEXT: v_mov_b32_e32 v57, v5 ; SI-NEXT: v_mov_b32_e32 v41, v3 ; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:392 @@ -197876,7 +197663,30 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v15 ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v27 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v29 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v31 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:160 +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v32 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v33 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v34 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:672 ; 4-byte 
Folded Spill +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:196 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:220 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:192 ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v11 ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -197884,28 +197694,21 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v21 -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v27 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:96 ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v17 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v29 ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v23 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v31 ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill -; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v32 ; SI-NEXT: v_lshlrev_b32_e32 v31, 8, v2 ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v18 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v26 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v34, 8, v10 +; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr27 @@ -197913,240 +197716,211 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:160 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:128 -; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:88 -; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:184 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:152 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:88 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: 
v_lshlrev_b32_e32 v7, 24, v7 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:216 ; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:144 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:112 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:140 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:108 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:180 ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:176 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v33 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v34 -; SI-NEXT: v_lshlrev_b32_e32 v34, 8, v10 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:172 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:196 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:220 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:192 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v7 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:216 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:212 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:208 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v8 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: 
buffer_load_dword v6, off, s[0:3], s32 offset:204 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:228 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:252 ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:224 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:172 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:204 +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:248 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:244 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:240 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:152 ; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v9 ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v8 ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:236 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:260 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:284 ; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:256 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:280 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:80 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:244 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:276 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:272 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:240 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: 
buffer_load_dword v9, off, s[0:3], s32 offset:280 +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v11 ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v8 ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:268 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:292 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:316 ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:288 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:236 +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:312 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:308 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:304 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:276 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:272 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v9 ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:916 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v8 ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:940 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v51, 24, v11 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:880 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:300 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:324 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:348 ; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:320 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:308 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:304 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:268 +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_lshlrev_b32_e32 v37, 24, v4 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_lshlrev_b32_e32 v63, 8, v8 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:888 
; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:344 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:924 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:340 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:336 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v60, 24, v9 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:332 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v51, 24, v11 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:356 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:380 ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:352 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:880 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:300 +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v56, 24, v4 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v8 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:936 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:376 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:924 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:340 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:336 +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:952 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v60, 24, v9 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:388 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:384 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:372 ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:368 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:332 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v32, 24, v8 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:8 +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:956 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:364 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:388 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:384 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v26 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v32, 24, v8 -; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:960 ; 4-byte 
Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v11 ; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:8 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:120 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:144 +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:44 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:40 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:140 +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:112 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 @@ -198158,15 +197932,19 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:108 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:56 +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v35 ; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:56 ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v3 ; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill @@ -198202,7 +197980,7 @@ define 
<64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:976 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 -; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:48 @@ -198682,15 +198460,15 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v25, v6, v13 ; SI-NEXT: v_and_b32_e32 v6, 0xffff, v7 ; SI-NEXT: v_or_b32_e32 v6, v6, v5 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:980 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill ; SI-NEXT: v_alignbit_b32 v7, v25, v5, 16 ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:980 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_and_b32_e32 v6, 0xffff, v26 ; SI-NEXT: v_or_b32_e32 v6, v6, v11 ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; SI-NEXT: v_or_b32_e32 v5, v5, v8 ; SI-NEXT: s_waitcnt expcnt(0) @@ -199918,14 +199696,8 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:904 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v44 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload @@ -199942,6 +199714,12 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:904 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -200009,8 +199787,8 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v25 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v29 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v55, 8, v3 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill ; 
VI-NEXT: v_lshlrev_b16_e32 v40, 8, v5 ; VI-NEXT: v_lshlrev_b16_e32 v41, 8, v7 ; VI-NEXT: v_lshlrev_b16_e32 v50, 8, v9 @@ -200106,13 +199884,25 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:208 ; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:216 ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188 -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:108 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100 +; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:92 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:84 +; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:76 +; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:68 +; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:52 +; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196 ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 @@ -200240,14 +200030,19 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_lshlrev_b16_e32 v38, 8, v0 ; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_lshlrev_b16_e32 v39, 8, v1 -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v49, 8, v2 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b16_e32 v51, 8, v3 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:356 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b16_e32 v51, 8, v3 +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b16_e32 v53, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -200255,26 +200050,6 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:372 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 -; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b16_e32 v53, 8, v0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword 
v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:44 -; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:36 -; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:28 -; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:108 -; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100 -; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:92 -; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:84 -; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:76 -; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:68 -; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:60 -; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:52 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -200283,35 +200058,57 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(13) ; VI-NEXT: v_or_b32_sdwa v0, v0, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: s_waitcnt vmcnt(12) ; VI-NEXT: v_or_b32_sdwa v1, v1, v55 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: s_waitcnt vmcnt(11) ; VI-NEXT: v_or_b32_sdwa v3, v3, v41 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_or_b32_sdwa v2, v2, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(9) +; VI-NEXT: v_or_b32_sdwa v2, v2, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 
offset:500 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: v_or_b32_sdwa v4, v4, v48 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_or_b32_sdwa v10, v61, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_or_b32_sdwa v5, v5, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v57, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_or_b32_sdwa v6, v6, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v12, v59, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v47, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v14, v45, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v15, v43, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr61 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr59 +; VI-NEXT: ; implicit-def: $vgpr47 +; VI-NEXT: ; implicit-def: $vgpr45 +; VI-NEXT: ; implicit-def: $vgpr43 ; VI-NEXT: ; implicit-def: $vgpr54 ; VI-NEXT: ; implicit-def: $vgpr55 ; VI-NEXT: ; implicit-def: $vgpr40 ; VI-NEXT: ; implicit-def: $vgpr41 ; VI-NEXT: ; implicit-def: $vgpr48 ; VI-NEXT: ; implicit-def: $vgpr36 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v6, v6, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: v_or_b32_sdwa v31, v31, v49 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr49 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v2, v2, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) @@ -200344,39 +200141,19 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v6, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v32, v32, v53 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 
src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr53 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(7) -; VI-NEXT: v_or_b32_sdwa v10, v61, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(6) -; VI-NEXT: v_or_b32_sdwa v11, v57, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(5) -; VI-NEXT: v_or_b32_sdwa v12, v59, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_or_b32_sdwa v13, v47, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_or_b32_sdwa v14, v45, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: ; implicit-def: $vgpr61 -; VI-NEXT: ; implicit-def: $vgpr57 -; VI-NEXT: ; implicit-def: $vgpr59 -; VI-NEXT: ; implicit-def: $vgpr47 -; VI-NEXT: ; implicit-def: $vgpr45 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v15, v43, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: ; implicit-def: $vgpr43 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v8, v62, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) @@ -200542,17 +200319,9 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v28, v28, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v31, v31, v49 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: ; implicit-def: $vgpr49 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v32, v32, v53 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: ; implicit-def: $vgpr53 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v30, v30, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -201237,8 +201006,8 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v27 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v29 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v54, 8, v3 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill ; GFX9-NEXT: 
v_lshlrev_b16_e32 v41, 8, v5 ; GFX9-NEXT: v_lshlrev_b16_e32 v40, 8, v7 ; GFX9-NEXT: v_lshlrev_b16_e32 v51, 8, v9 @@ -201349,13 +201118,27 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:208 ; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:216 ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188 -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:108 +; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:100 +; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92 +; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84 +; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76 +; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68 +; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52 +; GFX9-NEXT: s_waitcnt vmcnt(18) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: s_waitcnt vmcnt(18) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(16) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 @@ -201488,14 +201271,19 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_lshlrev_b16_e32 v37, 8, v0 ; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_lshlrev_b16_e32 v49, 8, v1 -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 +; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_lshlrev_b16_e32 v48, 8, v2 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v52, 8, v3 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:356 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b16_e32 v52, 8, v3 +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_lshlrev_b16_e32 v53, 8, v0 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -201503,26 +201291,6 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:372 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 -; GFX9-NEXT: 
buffer_load_ushort v1, off, s[0:3], s32 offset:380 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v53, 8, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44 -; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:36 -; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28 -; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:20 -; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:108 -; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:100 -; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92 -; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84 -; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76 -; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68 -; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:60 -; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -201531,36 +201299,62 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b32 s6, 0x5040100 +; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: s_waitcnt vmcnt(14) ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: s_waitcnt vmcnt(13) ; GFX9-NEXT: v_or_b32_sdwa v2, v2, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_or_b32_sdwa v3, v3, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 
src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v0, v1, v0, s6 +; GFX9-NEXT: s_waitcnt vmcnt(11) +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v1, v3, v2, s6 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: s_waitcnt vmcnt(11) +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(10) +; GFX9-NEXT: v_or_b32_sdwa v10, v60, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v11, v56, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_or_b32_sdwa v4, v4, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_or_b32_sdwa v12, v58, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_or_b32_sdwa v5, v5, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v13, v46, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v6, v6, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v14, v44, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v31, v31, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: ; implicit-def: $vgpr60 +; GFX9-NEXT: ; implicit-def: $vgpr56 +; GFX9-NEXT: ; implicit-def: $vgpr58 +; GFX9-NEXT: ; implicit-def: $vgpr46 +; GFX9-NEXT: ; implicit-def: $vgpr44 ; GFX9-NEXT: ; implicit-def: $vgpr55 ; GFX9-NEXT: ; implicit-def: $vgpr54 ; GFX9-NEXT: ; implicit-def: $vgpr41 ; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: ; implicit-def: $vgpr38 ; GFX9-NEXT: ; implicit-def: $vgpr35 -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v6, v6, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr48 +; GFX9-NEXT: v_or_b32_sdwa v15, v42, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: ; implicit-def: $vgpr42 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v2, v2, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -201583,49 +201377,25 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v5, v5, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v5, v6, v5, s6 ; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload ; GFX9-NEXT: ; implicit-def: $vgpr34 -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v6, v6, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v6, v6, v32 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v6, v7, v6, s6 ; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v32, v32, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: ; implicit-def: $vgpr53 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v7, v8, v7, s6 ; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(7) -; GFX9-NEXT: v_or_b32_sdwa v10, v60, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_or_b32_sdwa v11, v56, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(5) -; GFX9-NEXT: v_or_b32_sdwa v12, v58, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_or_b32_sdwa v13, v46, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_or_b32_sdwa v14, v44, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: ; implicit-def: $vgpr60 -; GFX9-NEXT: ; implicit-def: $vgpr56 -; GFX9-NEXT: ; implicit-def: $vgpr58 -; GFX9-NEXT: ; implicit-def: $vgpr46 -; GFX9-NEXT: ; implicit-def: $vgpr44 -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v15, v42, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: ; implicit-def: $vgpr42 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v8, v63, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -201791,17 +201561,9 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v28, v29, v28, s6 ; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: 
s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v31, v31, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v32, v32, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: ; implicit-def: $vgpr48 -; GFX9-NEXT: ; implicit-def: $vgpr53 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v30, v30, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v29, v30, v29, s6 @@ -203147,7 +202909,7 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v64i16: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:580 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:576 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:572 @@ -203180,7 +202942,7 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:464 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:460 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:456 -; GFX11-FAKE16-NEXT: s_clause 0xf +; GFX11-FAKE16-NEXT: s_clause 0xf ; 64-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:452 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:448 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:444 @@ -204009,7 +203771,7 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_perm_b32 v31, v116, v31, 0x5040100 ; GFX11-FAKE16-NEXT: .LBB96_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v127, off, s32 offset:392 ; GFX11-FAKE16-NEXT: scratch_load_b32 v126, off, s32 offset:396 ; GFX11-FAKE16-NEXT: scratch_load_b32 v125, off, s32 offset:400 @@ -204042,7 +203804,7 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:508 ; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:512 ; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:516 -; GFX11-FAKE16-NEXT: s_clause 0xf +; GFX11-FAKE16-NEXT: s_clause 0xf ; 64-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:520 ; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:524 ; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:528 @@ -204087,7 +203849,6 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[4:5] -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:332 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:328 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:324 @@ -204097,9 +203858,9 @@ 
define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:308 ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:304 ; SI-NEXT: ; implicit-def: $vgpr43 : SGPR spill to VGPR lane -; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: s_waitcnt expcnt(2) ; SI-NEXT: v_writelane_b32 v41, s30, 0 -; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_writelane_b32 v43, s29, 0 ; SI-NEXT: v_writelane_b32 v43, s28, 1 ; SI-NEXT: v_writelane_b32 v43, s27, 2 @@ -204148,6 +203909,12 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: v_writelane_b32 v41, s96, 32 ; SI-NEXT: v_writelane_b32 v41, s97, 33 ; SI-NEXT: v_writelane_b32 v41, s98, 34 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:164 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:160 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:156 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:152 ; SI-NEXT: v_readfirstlane_b32 s39, v26 ; SI-NEXT: ; implicit-def: $vgpr42 : SGPR spill to VGPR lane ; SI-NEXT: v_readfirstlane_b32 s47, v12 @@ -204170,9 +203937,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: v_readfirstlane_b32 s59, v28 ; SI-NEXT: v_readfirstlane_b32 s60, v27 ; SI-NEXT: v_readfirstlane_b32 s11, v1 -; SI-NEXT: v_readfirstlane_b32 s12, v2 -; SI-NEXT: v_readfirstlane_b32 s13, v9 -; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: v_writelane_b32 v43, s4, 14 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:300 @@ -204181,30 +203946,28 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:288 ; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:284 ; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:280 -; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_readfirstlane_b32 s4, v32 ; SI-NEXT: v_writelane_b32 v43, s4, 15 -; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_readfirstlane_b32 s4, v33 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:276 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:272 ; SI-NEXT: v_writelane_b32 v43, s4, 16 -; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_readfirstlane_b32 s4, v34 ; SI-NEXT: v_writelane_b32 v43, s4, 17 -; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_readfirstlane_b32 s4, v35 ; SI-NEXT: v_writelane_b32 v43, s4, 18 -; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_readfirstlane_b32 s44, v36 -; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_readfirstlane_b32 s90, v37 ; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:268 ; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:264 ; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:260 ; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:256 -; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_readfirstlane_b32 s6, v38 +; SI-NEXT: v_readfirstlane_b32 s12, v2 +; SI-NEXT: v_readfirstlane_b32 s13, v9 ; SI-NEXT: v_readfirstlane_b32 s14, v10 ; SI-NEXT: v_readfirstlane_b32 s15, v8 ; SI-NEXT: v_readfirstlane_b32 s18, v7 @@ -204218,6 +203981,10 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: v_readfirstlane_b32 
s77, v15 ; SI-NEXT: v_readfirstlane_b32 s38, v25 ; SI-NEXT: v_writelane_b32 v41, s99, 35 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_readfirstlane_b32 s93, v55 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_readfirstlane_b32 s95, v40 ; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: v_writelane_b32 v43, s4, 19 @@ -204294,39 +204061,35 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: v_writelane_b32 v43, s4, 30 ; SI-NEXT: v_readfirstlane_b32 s4, v32 ; SI-NEXT: v_writelane_b32 v43, s4, 31 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:164 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:160 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:156 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:152 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_readfirstlane_b32 s4, v34 ; SI-NEXT: v_writelane_b32 v43, s4, 32 +; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_readfirstlane_b32 s9, v35 -; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_readfirstlane_b32 s4, v37 ; SI-NEXT: v_writelane_b32 v43, s4, 33 ; SI-NEXT: v_readfirstlane_b32 s10, v36 -; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: v_writelane_b32 v43, s4, 34 -; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_readfirstlane_b32 s4, v38 ; SI-NEXT: v_writelane_b32 v43, s4, 35 -; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_readfirstlane_b32 s4, v39 ; SI-NEXT: v_writelane_b32 v43, s4, 36 -; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_readfirstlane_b32 s69, v48 -; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_readfirstlane_b32 s30, v49 -; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_readfirstlane_b32 s16, v50 -; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_readfirstlane_b32 s36, v51 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:148 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:144 -; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_readfirstlane_b32 s4, v33 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:140 ; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:136 @@ -204340,7 +204103,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:108 ; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:104 ; SI-NEXT: v_writelane_b32 v43, s4, 37 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_readfirstlane_b32 s4, v52 ; SI-NEXT: v_writelane_b32 v43, s4, 38 ; SI-NEXT: v_readfirstlane_b32 s4, v53 @@ -204367,9 +204130,6 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: v_writelane_b32 v43, s43, 58 ; SI-NEXT: v_writelane_b32 v43, s76, 59 ; SI-NEXT: v_writelane_b32 v43, s77, 60 -; SI-NEXT: v_readfirstlane_b32 s93, v55 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_readfirstlane_b32 s95, v40 ; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_readfirstlane_b32 s17, v33 ; SI-NEXT: s_waitcnt vmcnt(9) @@ -205938,33 +205698,53 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:240 ; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v24 ; VI-NEXT: 
v_lshlrev_b32_e32 v10, 8, v26 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:124 +; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:132 +; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:140 +; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:148 +; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:156 +; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:164 +; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:172 +; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:180 +; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:188 +; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:196 +; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:204 +; VI-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:212 +; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:220 +; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:228 +; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:236 +; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:244 +; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:252 +; VI-NEXT: buffer_load_ushort v27, off, s[0:3], s32 offset:260 +; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:268 +; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:276 +; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:284 +; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:292 +; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:300 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:308 +; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:316 +; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:324 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v14 ; VI-NEXT: v_lshlrev_b32_e32 v16, 8, v16 ; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 ; VI-NEXT: v_lshlrev_b32_e32 v20, 8, v20 ; VI-NEXT: v_lshlrev_b32_e32 v22, 8, v22 -; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_lshlrev_b32_e32 v32, 8, v0 -; VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v4 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v5 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v6 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v7 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v36, 8, v2 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:248 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:256 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:264 @@ -206009,52 +205789,6 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:68 ; VI-NEXT: s_waitcnt vmcnt(10) ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v2 -; VI-NEXT: s_waitcnt vmcnt(0) -; 
VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:76 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:84 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:92 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:100 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:108 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:116 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:124 -; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:132 -; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:140 -; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:148 -; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:156 -; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:164 -; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:172 -; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:180 -; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:188 -; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:196 -; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:204 -; VI-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:212 -; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:220 -; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:228 -; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:236 -; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:244 -; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:252 -; VI-NEXT: buffer_load_ushort v27, off, s[0:3], s32 offset:260 -; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:268 -; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:276 -; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:284 -; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:292 -; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:300 -; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:308 -; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:316 -; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:324 ; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill @@ -206074,6 +205808,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill ; 
VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill @@ -206082,7 +205817,6 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill @@ -206114,6 +205848,25 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:76 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:84 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:92 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:100 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:108 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:116 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill ; VI-NEXT: s_cbranch_scc0 .LBB97_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload @@ -206138,15 +205891,18 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: v_or_b32_sdwa v2, v2, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_or_b32_sdwa v3, v3, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v2, v8 -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; VI-NEXT: 
buffer_load_dword v45, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_or_b32_sdwa v0, v0, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_or_b32_sdwa v1, v1, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload @@ -206196,10 +205952,11 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: v_or_b32_sdwa v1, v33, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_or_b32_sdwa v0, v35, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -206207,50 +205964,37 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: v_or_b32_sdwa v1, v48, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v0, v49, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_or_b32_sdwa v1, v1, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v1, v60, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v2, v35, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v0, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 
src1_sel:DWORD ; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(9) -; VI-NEXT: v_or_b32_sdwa v3, v33, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_or_b32_sdwa v0, v55, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v49, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v18, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_or_b32_sdwa v0, v42, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_or_b32_sdwa v1, v41, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v19, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v0, v39, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v42, v43 ; VI-NEXT: v_mov_b32_e32 v43, v37 -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_or_b32_sdwa v1, v46, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v20, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload @@ -206265,13 +206009,12 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v1, v24, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_or_b32_sdwa v0, v26, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; 
VI-NEXT: v_or_b32_sdwa v23, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_or_b32_sdwa v0, v34, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) @@ -206293,21 +206036,28 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v0, v31, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v54, v33 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v56, v1 ; VI-NEXT: v_or_b32_sdwa v1, v59, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v2, v35, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_or_b32_sdwa v0, v63, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_or_b32_sdwa v1, v30, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v3, v33, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v63, v39 +; VI-NEXT: v_mov_b32_e32 v54, v33 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_mov_b32_e32 v57, v0 ; VI-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -206325,11 +206075,10 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: v_or_b32_sdwa v31, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v53, v35 -; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: 
v_or_b32_e32 v3, s4, v0 ; VI-NEXT: s_and_b32 s4, s16, 0xff @@ -206362,7 +206111,6 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: s_branch .LBB97_3 ; VI-NEXT: .LBB97_2: ; VI-NEXT: v_mov_b32_e32 v47, v54 -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload @@ -206383,6 +206131,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v58, v7 ; VI-NEXT: v_mov_b32_e32 v57, v5 ; VI-NEXT: v_mov_b32_e32 v56, v3 @@ -206974,29 +206723,51 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:224 ; GFX9-NEXT: buffer_load_ushort v9, off, s[0:3], s32 offset:232 ; GFX9-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:240 +; GFX9-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:156 +; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:164 +; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:172 +; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:180 +; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:188 +; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:196 +; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:204 +; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:212 +; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:220 +; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:228 +; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:236 +; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:244 +; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:252 +; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:260 +; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:268 +; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:276 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:284 +; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:292 +; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:300 +; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:308 +; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:316 +; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:324 ; GFX9-NEXT: v_lshlrev_b32_e32 v46, 8, v46 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec -; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: s_waitcnt vmcnt(29) ; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v11 -; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: s_waitcnt vmcnt(28) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: s_waitcnt vmcnt(28) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v13 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: s_waitcnt vmcnt(28) ; GFX9-NEXT: 
v_lshlrev_b32_e32 v1, 8, v3 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: s_waitcnt vmcnt(28) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v4 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: s_waitcnt vmcnt(28) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v5 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: s_waitcnt vmcnt(28) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v9 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: s_waitcnt vmcnt(28) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v7 ; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill @@ -207060,82 +206831,42 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:124 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:132 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:140 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:148 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:156 -; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:164 -; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:172 -; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:180 -; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:188 -; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:196 -; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:204 -; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:212 -; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:220 -; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:228 -; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:236 -; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:244 -; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:252 -; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:260 -; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:268 -; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:276 -; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:284 -; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:292 -; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:300 -; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:308 -; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:316 -; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:324 -; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) ; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(22) ; 
GFX9-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) ; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(24) ; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(28) ; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(31) ; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) ; GFX9-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) ; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) ; GFX9-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(35) ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(35) ; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill @@ -207156,6 +206887,13 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill ; GFX9-NEXT: 
buffer_store_dword v46, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(55) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:140 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:148 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill ; GFX9-NEXT: s_cbranch_scc0 .LBB97_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: s_and_b32 s4, s28, 0xff @@ -207409,14 +207147,13 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: v_lshl_or_b32 v30, v1, 16, v0 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v62, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload ; GFX9-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_or_b32_sdwa v1, v62, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v31, v1, 16, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 @@ -207426,7 +207163,6 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: .LBB97_2: ; GFX9-NEXT: v_mov_b32_e32 v58, v50 ; GFX9-NEXT: v_mov_b32_e32 v45, v59 -; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload @@ -207438,6 +207174,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload ; GFX9-NEXT: v_mov_b32_e32 v34, v35 +; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload ; GFX9-NEXT: v_mov_b32_e32 v49, v39 @@ -207903,7 +207640,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; GFX11-TRUE16-LABEL: bitcast_v128i8_to_v64i16_scalar: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: s_clause 0x1e +; GFX11-TRUE16-NEXT: s_clause 0x1e ; 124-byte Folded Spill ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:440 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:436 ; GFX11-TRUE16-NEXT: scratch_store_b32 
off, v42, s32 offset:432 @@ -208633,7 +208370,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; GFX11-TRUE16-NEXT: v_mov_b16_e64 v30.h, v182.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.h, v43.l ; GFX11-TRUE16-NEXT: .LBB97_3: ; %end -; GFX11-TRUE16-NEXT: s_clause 0x1e +; GFX11-TRUE16-NEXT: s_clause 0x1e ; 124-byte Folded Reload ; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:320 ; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:324 ; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:328 @@ -208675,7 +208412,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v64i16_scalar: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_clause 0x1e +; GFX11-FAKE16-NEXT: s_clause 0x1e ; 124-byte Folded Spill ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:440 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:436 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:432 @@ -209459,7 +209196,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v30, v30, 16, v35 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v31, v31, 16, v36 ; GFX11-FAKE16-NEXT: .LBB97_3: ; %end -; GFX11-FAKE16-NEXT: s_clause 0x1e +; GFX11-FAKE16-NEXT: s_clause 0x1e ; 124-byte Folded Reload ; GFX11-FAKE16-NEXT: scratch_load_b32 v94, off, s32 offset:320 ; GFX11-FAKE16-NEXT: scratch_load_b32 v93, off, s32 offset:324 ; GFX11-FAKE16-NEXT: scratch_load_b32 v92, off, s32 offset:328 @@ -209562,100 +209299,6 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:132 ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:128 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v28 -; SI-NEXT: ; kill: killed $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v13 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v19 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:120 -; 
SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:116 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:112 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:104 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:88 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:84 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:80 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v56 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:72 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:56 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:52 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:48 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:68 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v7 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v9 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:24 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v6 @@ -209785,14 +209428,29 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; kill: killed $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v7 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt 
expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v9 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill ; SI-NEXT: ; kill: killed $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; kill: killed $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:120 ; SI-NEXT: ; kill: killed $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill ; SI-NEXT: ; kill: killed $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; kill: killed $vgpr2 @@ -209809,13 +209467,6 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; kill: killed $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:36 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:16 ; SI-NEXT: ; kill: killed $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; kill: killed $vgpr2 @@ -209870,12 +209521,39 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; kill: killed $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v57 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v58 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v62 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v28 ; SI-NEXT: ; kill: killed $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr18 @@ -209885,36 +209563,81 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; kill: killed $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:24 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_store_dword v15, off, s[0:3], 
s32 offset:628 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:116 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:112 ; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:16 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v62 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:104 +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v60 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v59 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v63 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:88 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:100 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:96 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:84 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:80 +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v61 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v56 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:72 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:32 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:56 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: 
buffer_store_dword v15, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:52 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:48 +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 @@ -209936,6 +209659,18 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v47 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:68 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:40 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v58 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v57 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB98_2 @@ -211507,9 +211242,25 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x74, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v47 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v47 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v20 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 @@ -211533,44 +211284,12 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; 
SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v64i16_to_v128i8: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8 ; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:4 ; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 @@ -211588,6 +211307,22 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v9 ; VI-NEXT: ; kill: killed $vgpr35 ; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; VI-NEXT: 
buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v7 ; VI-NEXT: ; kill: killed $vgpr35 @@ -211884,14 +211619,12 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v9, v8 ; VI-NEXT: v_lshrrev_b64 v[7:8], 24, v[7:8] -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v7, v5 ; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v7, v6 ; VI-NEXT: v_lshrrev_b64 v[5:6], 24, v[5:6] -; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v5, 24, v4 @@ -211923,10 +211656,6 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; VI-NEXT: v_mov_b32_e32 v3, v2 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b64 v[1:2], 24, v[36:37] -; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v5, v4 -; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v1, 24, v30 @@ -211997,10 +211726,16 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[19:20] ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, v18 +; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v5, v4 ; VI-NEXT: v_lshrrev_b64 v[41:42], 24, v[21:22] ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, v46 ; VI-NEXT: v_lshrrev_b64 v[45:46], 24, v[17:18] +; VI-NEXT: buffer_store_dword v9, off, 
s[0:3], s32 offset:388 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v35, 24, v26 ; VI-NEXT: v_lshrrev_b32_e32 v39, 24, v24 ; VI-NEXT: v_lshrrev_b32_e32 v58, 24, v22 @@ -212201,9 +211936,6 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v32, 8, v15 ; VI-NEXT: v_lshrrev_b64 v[15:16], 24, v[15:16] -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; VI-NEXT: v_or_b32_e32 v13, v41, v13 @@ -212211,38 +211943,35 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v15, 8, v13 ; VI-NEXT: v_lshrrev_b64 v[13:14], 24, v[13:14] -; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v12 ; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v11 ; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[11:12] -; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v11, 8, v10 ; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v11, 8, v9 ; VI-NEXT: v_lshrrev_b64 v[9:10], 24, v[9:10] -; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v8 ; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v7 ; VI-NEXT: v_lshrrev_b64 v[7:8], 24, v[7:8] -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v7, 8, v6 ; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v7, 8, v5 ; VI-NEXT: v_lshrrev_b64 v[5:6], 24, v[5:6] -; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; VI-NEXT: 
v_lshrrev_b32_e32 v5, 8, v4 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v3 ; VI-NEXT: v_lshrrev_b64 v[40:41], 24, v[3:4] @@ -212255,8 +211984,6 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v36 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b64 v[1:2], 24, v[36:37] -; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v30 @@ -212325,6 +212052,7 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; VI-NEXT: v_mov_b32_e32 v49, v53 ; VI-NEXT: v_mov_b32_e32 v53, v38 ; VI-NEXT: v_mov_b32_e32 v38, v55 +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v32, 8, v18 ; VI-NEXT: v_lshrrev_b32_e32 v42, 8, v17 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill @@ -212336,6 +212064,13 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; VI-NEXT: v_mov_b32_e32 v55, v31 ; VI-NEXT: v_bfe_u32 v61, v53, 8, 8 ; VI-NEXT: v_bfe_u32 v31, v38, 8, 8 +; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; VI-NEXT: .LBB98_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] @@ -212743,9 +212478,24 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; VI-NEXT: 
buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload @@ -212768,44 +212518,12 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v2, v52, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: bitcast_v64i16_to_v128i8: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; 
GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 @@ -212868,6 +212586,23 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr50 ; GFX9-NEXT: ; kill: killed $vgpr50 ; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr44 ; GFX9-NEXT: ; kill: killed $vgpr50 ; GFX9-NEXT: ; implicit-def: $vgpr50 @@ -212898,7 +212633,6 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr52 ; GFX9-NEXT: ; implicit-def: $vgpr51 ; GFX9-NEXT: ; implicit-def: $vgpr50 -; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill @@ -212932,7 +212666,7 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(17) +; GFX9-NEXT: s_waitcnt vmcnt(33) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 ; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: ; kill: killed $vgpr33 @@ -213055,101 +212789,100 @@ define <128 x i8> 
@bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v32
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(46)
+; GFX9-NEXT: s_waitcnt vmcnt(62)
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v31
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v31
+; GFX9-NEXT: v_lshrrev_b64 v[50:51], 24, v[15:16]
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v30
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v30
+; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v30
+; GFX9-NEXT: v_lshrrev_b64 v[50:51], 24, v[13:14]
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v29
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v29
+; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v28
+; GFX9-NEXT: v_lshrrev_b64 v[50:51], 24, v[11:12]
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v28
+; GFX9-NEXT: v_lshrrev_b64 v[51:52], 24, v[9:10]
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v28
+; GFX9-NEXT: v_lshrrev_b64 v[52:53], 24, v[7:8]
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v27
+; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[31:32]
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v27
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v26
+; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26
+; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[29:30]
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v26
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v25
+; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v25
+; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[27:28]
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v24
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v24
+; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v24
+; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[25:26]
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v23
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v23
+; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v22
+; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[23:24]
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v22
+; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v21
+; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[21:22]
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v21
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v20
-; GFX9-NEXT: v_lshrrev_b64 v[50:51], 24, v[15:16]
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v20
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[50:51], 24, v[13:14]
-; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[50:51], 24, v[11:12]
-; GFX9-NEXT: v_lshrrev_b64 v[51:52], 24, v[9:10]
-; GFX9-NEXT: v_lshrrev_b64 v[52:53], 24, v[7:8]
-; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[31:32]
-; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[29:30]
-; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[27:28]
-; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[25:26]
-; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[23:24]
-; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[21:22]
 ; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[5:6]
 ; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
 ; GFX9-NEXT: s_nop 0
 ; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v20
 ; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[3:4]
 ; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[19:20]
 ; GFX9-NEXT: v_lshrrev_b32_e32 v59, 8, v10
@@ -213165,6 +212898,7 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
 ; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v1
 ; GFX9-NEXT: v_lshrrev_b32_e32 v44, 8, v1
 ; GFX9-NEXT: v_lshrrev_b32_e32 v56, 24, v32
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v62, 8, v20
 ; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v19
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v19
@@ -213189,7 +212923,7 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
 ; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[13:14]
 ; GFX9-NEXT: v_pk_add_u16 v32, v32, 3 op_sel_hi:[1,0]
-; GFX9-NEXT: s_waitcnt vmcnt(18)
+; GFX9-NEXT: s_waitcnt vmcnt(34)
 ; GFX9-NEXT: v_pk_add_u16 v31, v31, 3 op_sel_hi:[1,0]
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
 ; GFX9-NEXT: s_nop 0
@@ -213730,17 +213464,7 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:120
-; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
 ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v56
-; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; GFX9-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:124
 ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
 ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
 ; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
@@ -213757,6 +213481,18 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
 ; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
 ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:120
+; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GFX9-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:124
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -214215,7 +213951,11 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
 ; GFX11-FAKE16-LABEL: bitcast_v64i16_to_v128i8:
 ; GFX11-FAKE16: ; %bb.0:
 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT: s_clause 0x13
+; GFX11-FAKE16-NEXT: s_clause 0x2
+; GFX11-FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:8
+; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4
+; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32
+; GFX11-FAKE16-NEXT: s_clause 0x13 ; 80-byte Folded Spill
 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:88
 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:84
 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:80
@@ -214236,10 +213976,6 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:20
 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:16
 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:12
-; GFX11-FAKE16-NEXT: s_clause 0x2
-; GFX11-FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:8
-; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4
-; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32
 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr74
 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr72
 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64
@@ -214875,7 +214611,7 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
 ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:80
 ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:96
 ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:112
-; GFX11-FAKE16-NEXT: s_clause 0x13
+; GFX11-FAKE16-NEXT: s_clause 0x13 ; 80-byte Folded Reload
 ; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:12
 ; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:16
 ; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:20
@@ -215014,26 +214750,26 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
 ; SI-NEXT: v_readfirstlane_b32 s91, v32
 ; SI-NEXT: s_waitcnt vmcnt(5)
 ; SI-NEXT: v_readfirstlane_b32 s93, v33
-; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:44
-; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:40
-; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:36
-; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:32
-; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:28
-; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:24
-; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:20
-; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:16
-; SI-NEXT: s_waitcnt vmcnt(12)
+; SI-NEXT: s_waitcnt vmcnt(4)
 ; SI-NEXT: v_readfirstlane_b32 s55, v34
-; SI-NEXT: s_waitcnt vmcnt(11)
+; SI-NEXT: s_waitcnt vmcnt(3)
 ; SI-NEXT: v_readfirstlane_b32 s17, v35
-; SI-NEXT: s_waitcnt vmcnt(10)
+; SI-NEXT: s_waitcnt vmcnt(2)
 ; SI-NEXT: v_readfirstlane_b32 s95, v36
-; SI-NEXT: s_waitcnt vmcnt(9)
+; SI-NEXT: s_waitcnt vmcnt(1)
 ; SI-NEXT: v_readfirstlane_b32 s35, v37
 ; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12
 ; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:8
 ; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:4
 ; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:44
+; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:40
+; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:36
+; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:32
+; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:28
+; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:24
+; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:20
+; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:16
 ; SI-NEXT: s_waitcnt vmcnt(12)
 ; SI-NEXT: v_readfirstlane_b32 s83, v38
 ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:80
@@ -215046,39 +214782,34 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
 ; SI-NEXT: v_readfirstlane_b32 s39, v1
 ; SI-NEXT: ; implicit-def: $vgpr43 : SGPR spill to VGPR lane
 ; SI-NEXT: ; implicit-def: $vgpr42 : SGPR spill to VGPR lane
-; SI-NEXT: s_waitcnt vmcnt(12)
-; SI-NEXT: v_readfirstlane_b32 s77, v31
-; SI-NEXT: s_waitcnt vmcnt(11)
-; SI-NEXT: v_readfirstlane_b32 s38, v32
-; SI-NEXT: s_waitcnt vmcnt(10)
-; SI-NEXT: v_readfirstlane_b32 s48, v33
-; SI-NEXT: s_waitcnt vmcnt(9)
-; SI-NEXT: v_readfirstlane_b32 s50, v39
 ; SI-NEXT: s_waitcnt vmcnt(8)
-; SI-NEXT: v_readfirstlane_b32 s76, v48
+; SI-NEXT: v_readfirstlane_b32 s77, v31
 ; SI-NEXT: s_waitcnt vmcnt(7)
-; SI-NEXT: v_readfirstlane_b32 s30, v49
+; SI-NEXT: v_readfirstlane_b32 s38, v32
 ; SI-NEXT: s_waitcnt vmcnt(6)
-; SI-NEXT: v_readfirstlane_b32 s34, v50
+; SI-NEXT: v_readfirstlane_b32 s48, v33
 ; SI-NEXT: s_waitcnt vmcnt(5)
-; SI-NEXT: v_readfirstlane_b32 s36, v51
-; SI-NEXT: s_waitcnt vmcnt(4)
-; SI-NEXT: v_readfirstlane_b32 s99, v34
-; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: v_readfirstlane_b32 s50, v39
 ; SI-NEXT: v_readfirstlane_b32 s90, v35
-; SI-NEXT: s_waitcnt vmcnt(2)
 ; SI-NEXT: v_readfirstlane_b32 s92, v36
 ; SI-NEXT: v_writelane_b32 v41, s90, 11
-; SI-NEXT: s_waitcnt vmcnt(1)
 ; SI-NEXT: v_readfirstlane_b32 s94, v37
 ; SI-NEXT: v_writelane_b32 v41, s92, 12
+; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: v_readfirstlane_b32 s30, v49
 ; SI-NEXT: v_writelane_b32 v41, s94, 13
+; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: v_readfirstlane_b32 s34, v50
 ; SI-NEXT: v_writelane_b32 v41, s30, 14
+; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_readfirstlane_b32 s36, v51
 ; SI-NEXT: v_writelane_b32 v41, s34, 15
 ; SI-NEXT: v_writelane_b32 v41, s36, 16
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v38
 ; SI-NEXT: v_writelane_b32 v41, s38, 17
+; SI-NEXT: v_readfirstlane_b32 s76, v48
+; SI-NEXT: v_readfirstlane_b32 s99, v34
 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec
 ; SI-NEXT: v_writelane_b32 v41, s48, 18
 ; SI-NEXT: v_writelane_b32 v41, s50, 19
@@ -218060,48 +217791,48 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
 ; GFX9-NEXT: s_nop 0
 ; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[9:10]
+; GFX9-NEXT: v_lshrrev_b32_e32 v19, 24, v4
+; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v3
 ; GFX9-NEXT: v_pk_add_u16 v12, s41, 3 op_sel_hi:[1,0]
 ; GFX9-NEXT: v_pk_add_u16 v11, s40, 3 op_sel_hi:[1,0]
 ; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
 ; GFX9-NEXT: s_nop 0
 ; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[11:12]
-; GFX9-NEXT: v_pk_add_u16 v14, s43, 3 op_sel_hi:[1,0]
-; GFX9-NEXT: v_pk_add_u16 v13, s42, 3 op_sel_hi:[1,0]
-; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[13:14]
-; GFX9-NEXT: v_pk_add_u16 v22, s45, 3 op_sel_hi:[1,0]
-; GFX9-NEXT: v_pk_add_u16 v21, s44, 3 op_sel_hi:[1,0]
-; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[21:22]
-; GFX9-NEXT: v_lshrrev_b32_e32 v19, 24, v4
-; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v3
 ; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v3
+; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[11:12]
 ; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 24, v6
 ; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v6
+; GFX9-NEXT: v_pk_add_u16 v14, s43, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_u16 v13, s42, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
 ; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v6
+; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[13:14]
 ; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v5
 ; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v5
+; GFX9-NEXT: v_pk_add_u16 v22, s45, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_u16 v21, s44, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
 ; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 24, v8
+; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[21:22]
 ; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v8
 ; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v8
+; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
 ; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v7
 ; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v26
@@ -218753,9 +218484,11 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
 ; GFX9-NEXT: s_nop 0
 ; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
 ; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(4)
 ; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15
-; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_waitcnt vmcnt(2)
 ; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT: v_or_b32_sdwa v15, v35, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:16
@@ -218787,10 +218520,7 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
 ; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT: v_or_b32_sdwa v15, v33, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:28
-; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
 ; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(2)
 ; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v18
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -219041,8 +218771,6 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v24
 ; GFX9-NEXT: v_or_b32_sdwa v2, v41, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:124
 ; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
 ; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
@@ -219057,6 +218785,8 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
 ; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
 ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:124
 ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload
 ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload
@@ -219068,7 +218798,7 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
 ; GFX11: ; %bb.0:
 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT: s_or_saveexec_b32 s4, -1
-; GFX11-NEXT: s_clause 0x3
+; GFX11-NEXT: s_clause 0x3 ; 16-byte Folded Spill
 ; GFX11-NEXT: scratch_store_b32 off, v75, s32 offset:76
 ; GFX11-NEXT: scratch_store_b32 off, v76, s32 offset:80
 ; GFX11-NEXT: scratch_store_b32 off, v77, s32 offset:84
@@ -219103,7 +218833,7 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
 ; GFX11-NEXT: v_writelane_b32 v76, s101, 5
 ; GFX11-NEXT: s_mov_b32 s99, 0
 ; GFX11-NEXT: s_and_b32 s42, vcc_lo, exec_lo
-; GFX11-NEXT: s_clause 0x12
+; GFX11-NEXT: s_clause 0x12 ; 76-byte Folded Spill
 ; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:72
 ; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:68
 ; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:64
@@ -220022,7 +219752,7 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
 ; GFX11-NEXT: scratch_store_b128 v0, v[11:14], off offset:80
 ; GFX11-NEXT: scratch_store_b128 v0, v[7:10], off offset:96
 ; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:112
-; GFX11-NEXT: s_clause 0x12
+; GFX11-NEXT: s_clause 0x12 ; 76-byte Folded Reload
 ; GFX11-NEXT: scratch_load_b32 v74, off, s32
 ; GFX11-NEXT: scratch_load_b32 v73, off, s32 offset:4
 ; GFX11-NEXT: scratch_load_b32 v72, off, s32 offset:8
@@ -220084,7 +219814,7 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
 ; GFX11-NEXT: v_readlane_b32 s31, v75, 1
 ; GFX11-NEXT: v_readlane_b32 s30, v75, 0
 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1
-; GFX11-NEXT: s_clause 0x3
+; GFX11-NEXT: s_clause 0x3 ; 16-byte Folded Reload
 ; GFX11-NEXT: scratch_load_b32 v75, off, s32 offset:76
 ; GFX11-NEXT: scratch_load_b32 v76, off, s32 offset:80
 ; GFX11-NEXT: scratch_load_b32 v77, off, s32 offset:84
@@ -221102,9 +220832,24 @@ define <64 x half> @bitcast_v64bf16_to_v64f16(<64 x bfloat> %a, i32 %b) {
 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload
 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
 ; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(14)
 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT: v_or_b32_e32 v1, v2, v1
@@ -221449,28 +221194,14 @@ define <64 x half> @bitcast_v64bf16_to_v64f16(<64 x bfloat> %a, i32 %b) {
 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT: v_or_b32_e32 v1, v2, v1
 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT: s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: bitcast_v64bf16_to_v64f16:
 ; VI: ; %bb.0:
 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
+; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32
 ; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
 ; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
 ; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
@@ -221487,9 +221218,7 @@ define <64 x half> @bitcast_v64bf16_to_v64f16(<64 x bfloat> %a, i32 %b) {
 ; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
 ; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
 ; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
-; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32
-; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: s_waitcnt vmcnt(14)
 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32
 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
 ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
@@ -221738,7 +221467,6 @@ define <64 x half> @bitcast_v64bf16_to_v64f16(<64 x bfloat> %a, i32 %b) {
 ; VI-NEXT: v_or_b32_e32 v40, 0x400000, v30
 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v30, v30
 ; VI-NEXT: v_cndmask_b32_e32 v30, v55, v40, vcc
-; VI-NEXT: s_waitcnt vmcnt(1)
 ; VI-NEXT: v_lshlrev_b32_e32 v55, 16, v31
 ; VI-NEXT: v_add_f32_e32 v55, 0x40c00000, v55
 ; VI-NEXT: v_bfe_u32 v40, v55, 16, 1
@@ -222104,6 +221832,9 @@ define <64 x half> @bitcast_v64bf16_to_v64f16(<64 x bfloat> %a, i32 %b) {
 ; GFX9-LABEL: bitcast_v64bf16_to_v64f16:
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
+; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; GFX9-NEXT: s_nop 0
 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
 ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
 ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
@@ -222120,9 +221851,7 @@ define <64 x half> @bitcast_v64bf16_to_v64f16(<64 x bfloat> %a, i32 %b) {
 ; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
 ; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
 ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
-; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32
-; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: s_waitcnt vmcnt(17)
 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32
 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
 ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
@@ -222341,7 +222070,7 @@ define <64 x half> @bitcast_v64bf16_to_v64f16(<64 x bfloat> %a, i32 %b) {
 ; GFX9-NEXT: v_or_b32_e32 v40, 0x400000, v30
 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v30, v30
 ; GFX9-NEXT: v_cndmask_b32_e32 v30, v55, v40, vcc
-; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: s_waitcnt vmcnt(17)
 ; GFX9-NEXT: v_lshlrev_b32_e32 v55, 16, v31
 ; GFX9-NEXT: v_add_f32_e32 v55, 0x40c00000, v55
 ; GFX9-NEXT: v_bfe_u32 v40, v55, 16, 1
@@ -222641,7 +222370,7 @@ define <64 x half> @bitcast_v64bf16_to_v64f16(<64 x bfloat> %a, i32 %b) {
 ; GFX11-TRUE16-LABEL: bitcast_v64bf16_to_v64f16:
 ; GFX11-TRUE16: ; %bb.0:
 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Spill
 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:68
 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:64
 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:60
@@ -223201,7 +222930,7 @@ define <64 x half> @bitcast_v64bf16_to_v64f16(<64 x bfloat> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, v42 :: v_dual_mov_b32 v11, v43
 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, v44 :: v_dual_mov_b32 v13, v45
 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, v46 :: v_dual_mov_b32 v15, v47
-; GFX11-TRUE16-NEXT: s_clause 0xf
+; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Reload
 ; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:8
 ; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:12
 ; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:16
@@ -224930,10 +224659,26 @@ define inreg <64 x half> @bitcast_v64bf16_to_v64f16_scalar(<64 x bfloat> inreg %
 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0
 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT: v_or_b32_e32 v1, v2, v1
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v42
+; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v50
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v42
 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x6c, v0
 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT: v_or_b32_e32 v1, v2, v1
@@ -224966,22 +224711,6 @@ define inreg <64 x half> @bitcast_v64bf16_to_v64f16_scalar(<64 x bfloat> inreg %
 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT: v_or_b32_e32 v1, v2, v1
 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -228962,7 +228691,6 @@ define <64 x bfloat> @bitcast_v64f16_to_v64bf16(<64 x half> %a, i32 %b) {
 ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
 ; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16
 ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x7c, v47
-; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
 ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
 ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
 ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
@@ -228979,6 +228707,7 @@ define <64 x bfloat> @bitcast_v64f16_to_v64bf16(<64 x half> %a, i32 %b) {
 ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
 ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
 ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -230321,20 +230050,7 @@ define inreg <64 x bfloat> @bitcast_v64f16_to_v64bf16_scalar(<64 x half> inreg %
 ; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16
 ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x74, v37
 ; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v12
-; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
 ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v43
-; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16
-; SI-NEXT: v_add_i32_e32 v1, vcc, 0x78, v37
-; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v11
-; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v26
-; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16
-; SI-NEXT: v_add_i32_e32 v1, vcc, 0x7c, v37
-; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
 ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
 ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
 ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
@@ -230351,6 +230067,19 @@ define inreg <64 x bfloat> @bitcast_v64f16_to_v64bf16_scalar(<64 x half> inreg %
 ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
 ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
 ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mul_f32_e32 v0, 1.0, v12
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16
+; SI-NEXT: v_add_i32_e32 v1, vcc, 0x78, v37
+; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mul_f32_e32 v0, 1.0, v11
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; SI-NEXT: v_mul_f32_e32 v1, 1.0, v26
+; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16
+; SI-NEXT: v_add_i32_e32 v1, vcc, 0x7c, v37
+; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -231398,17 +231127,32 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) {
 ; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v13
 ; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
 ; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; SI-NEXT: v_alignbit_b32 v5, v23, v5, 16
 ; SI-NEXT: v_alignbit_b32 v2, v21, v2, 16
+; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: v_alignbit_b32 v2, v20, v6, 16
 ; SI-NEXT: v_alignbit_b32 v1, v61, v1, 16
+; SI-NEXT: v_alignbit_b32 v5, v23, v5, 16
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: v_alignbit_b32 v2, v19, v3, 16
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: v_alignbit_b32 v1, v18, v4, 16
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
 ; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload
 ; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(5)
+; SI-NEXT: s_waitcnt vmcnt(14)
 ; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
 ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
 ; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10
@@ -231418,57 +231162,63 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) {
 ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload
 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
 ; SI-NEXT: v_alignbit_b32 v7, v24, v7, 16
-; SI-NEXT: s_waitcnt vmcnt(5)
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt vmcnt(7)
 ; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
 ; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v13
 ; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(5)
+; SI-NEXT: s_waitcnt vmcnt(7)
 ; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
 ; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
 ; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v16
 ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(5)
+; SI-NEXT: s_waitcnt vmcnt(7)
 ; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
 ; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
 ; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v17
-; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: s_waitcnt vmcnt(5)
 ; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
 ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
 ; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v10
 ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: s_waitcnt vmcnt(5)
 ; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
 ; SI-NEXT: v_alignbit_b32 v25, v45, v8, 16
 ; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v9
 ; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload
 ; SI-NEXT: v_alignbit_b32 v8, v25, v8, 16
-; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt vmcnt(5)
 ; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
 ; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v13
 ; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: s_waitcnt vmcnt(5)
 ; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
 ; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
 ; SI-NEXT: v_alignbit_b32 v62, v63, v16, 16
 ; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v33
 ; SI-NEXT: v_alignbit_b32 v16, v62, v16, 16
 ; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: s_waitcnt vmcnt(5)
 ; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
 ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
 ; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v10
 ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: s_waitcnt vmcnt(5)
 ; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
 ; SI-NEXT: v_alignbit_b32 v22, v34, v9, 16
 ; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v11
 ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
 ; SI-NEXT: v_alignbit_b32 v9, v22, v9, 16
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt vmcnt(2)
 ; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
 ; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
 ; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v11
@@ -231526,31 +231276,8 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) {
 ; SI-NEXT: v_alignbit_b32 v15, v41, v15, 16
 ; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
 ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(1)
-; SI-NEXT: v_alignbit_b32 v2, v20, v6, 16
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(1)
-; SI-NEXT: v_alignbit_b32 v2, v19, v3, 16
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(1)
-; SI-NEXT: v_alignbit_b32 v1, v18, v4, 16
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
 ; SI-NEXT: .LBB104_4: ; %end
 ; SI-NEXT: s_or_b64 exec, exec, s[4:5]
-; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload
 ; SI-NEXT: s_waitcnt vmcnt(1)
@@ -231808,7 +231535,23 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) {
 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v61
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(14)
 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
 ; SI-NEXT: v_or_b32_e32 v1, v1, v2
 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0
@@ -231833,28 +231576,14 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) {
 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
 ; SI-NEXT: v_or_b32_e32 v1, v1, v2
 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT: s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: bitcast_v64bf16_to_v64i16:
 ; VI: ; %bb.0:
 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
+; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32
 ; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
 ; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
 ; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
@@ -231871,9 +231600,7 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) {
 ; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
 ; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
 ; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
-; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32
-; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: s_waitcnt vmcnt(14)
 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32
 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
 ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
@@ -232122,7 +231849,6 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) {
 ; VI-NEXT: v_or_b32_e32 v40, 0x400000, v30
 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v30, v30
 ; VI-NEXT: v_cndmask_b32_e32 v30, v55, v40, vcc
-; VI-NEXT: s_waitcnt vmcnt(1)
 ; VI-NEXT: v_lshlrev_b32_e32 v55, 16, v31
 ; VI-NEXT: v_add_f32_e32 v55, 0x40c00000, v55
 ; VI-NEXT: v_bfe_u32 v40, v55, 16, 1
@@ -232488,6 +232214,9 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) {
 ; GFX9-LABEL: bitcast_v64bf16_to_v64i16:
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
+; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; GFX9-NEXT: s_nop 0
 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
 ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
 ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
@@ -232504,9 +232233,7 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) {
 ; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
 ; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
 ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
-; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32
-; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: s_waitcnt vmcnt(17)
 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32
 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
 ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
@@ -232725,7 +232452,7 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) {
 ; GFX9-NEXT: v_or_b32_e32 v40, 0x400000, v30
 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v30, v30
 ; GFX9-NEXT: v_cndmask_b32_e32 v30, v55, v40, vcc
-; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: s_waitcnt vmcnt(17)
 ; GFX9-NEXT: v_lshlrev_b32_e32 v55, 16, v31
 ; GFX9-NEXT: v_add_f32_e32 v55, 0x40c00000, v55
 ; GFX9-NEXT: v_bfe_u32 v40, v55, 16, 1
@@ -234330,15 +234057,21 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a
 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; SI-NEXT: v_mov_b32_e32 v57, v13
+; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
 ; SI-NEXT: s_waitcnt expcnt(1)
 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v34
-; SI-NEXT: v_mov_b32_e32 v57, v13
 ; SI-NEXT: v_mov_b32_e32 v40, v3
 ; SI-NEXT: v_mov_b32_e32 v54, v50
 ; SI-NEXT: v_mov_b32_e32 v46, v19
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
 ; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v9
 ; SI-NEXT: v_mov_b32_e32 v44, v15
 ; SI-NEXT: v_mov_b32_e32 v9, v11
+; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
 ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v6
 ; SI-NEXT: s_mov_b64 s[4:5], 0
 ; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v59
@@ -234372,32 +234105,24 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a
 ; SI-NEXT: v_mov_b32_e32 v42, v43
 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
-; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(0) expcnt(1)
+; SI-NEXT: s_waitcnt vmcnt(7) expcnt(1)
 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v13
 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(4)
+; SI-NEXT: s_waitcnt vmcnt(6) expcnt(1)
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v19
 ; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v3
-; SI-NEXT: s_waitcnt vmcnt(3)
+; SI-NEXT: s_waitcnt vmcnt(5)
+; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v11
+; SI-NEXT: s_waitcnt vmcnt(4)
+; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v15
 ; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v50
-; SI-NEXT: s_waitcnt vmcnt(2) expcnt(1)
-; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v19
+; SI-NEXT: v_mov_b32_e32 v5, v19
+; SI-NEXT: v_mov_b32_e32 v7, v15
 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
 ; SI-NEXT: s_waitcnt expcnt(1)
 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v17
-; SI-NEXT: s_waitcnt vmcnt(3)
-; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v11
-; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v15
-; SI-NEXT: v_mov_b32_e32 v5, v19
-; SI-NEXT: v_mov_b32_e32 v7, v15
 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
 ; SI-NEXT: s_waitcnt expcnt(1)
@@ -234533,9 +234258,7 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a
 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
 ; SI-NEXT: v_mov_b32_e32 v54, v50
-; SI-NEXT: v_mov_b32_e32 v56, v47
 ; SI-NEXT: v_mov_b32_e32 v9, v11
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
 ; SI-NEXT: v_mov_b32_e32 v53, v5
 ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
 ; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
@@ -234543,6 +234266,8 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a
 ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
 ; SI-NEXT: s_waitcnt expcnt(1)
 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
+; SI-NEXT: v_mov_b32_e32 v56, v47
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
 ; SI-NEXT: v_mov_b32_e32 v40, v3
 ; SI-NEXT: v_mov_b32_e32 v44, v15
 ; SI-NEXT: v_mov_b32_e32 v57, v13
@@ -234850,16 +234575,18 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a
 ; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload
 ; SI-NEXT: v_lshr_b64 v[51:52], v[25:26], 16
 ; SI-NEXT: v_lshr_b64 v[52:53], v[1:2], 16
-; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt vmcnt(4)
 ; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
 ; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20
 ; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v20
 ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: s_waitcnt vmcnt(4)
 ; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
 ; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
 ; SI-NEXT: v_alignbit_b32 v16, v45, v16, 16
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: s_waitcnt vmcnt(3)
 ; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
 ; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27
 ; SI-NEXT: v_alignbit_b32 v28, v58, v27, 16
@@ -234917,19 +234644,16 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a
 ; SI-NEXT: v_lshr_b64 v[31:32], v[9:10], 16
 ; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
 ; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt expcnt(2)
+; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_lshr_b64 v[31:32], v[3:4], 16
 ; SI-NEXT: .LBB105_5: ; %end
-; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
 ; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(3)
 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v52
-; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3
 ; SI-NEXT: v_or_b32_e32 v1, v3, v1
 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
@@ -234955,10 +234679,8 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a
 ; SI-NEXT: v_or_b32_e32 v1, v1, v3
 ; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0
 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(2) expcnt(0)
+; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v29
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3
@@ -234985,12 +234707,11 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a
 ; SI-NEXT: v_or_b32_e32 v1, v1, v3
 ; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0
 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
 ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
 ; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(2) expcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0)
 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v25
-; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3
 ; SI-NEXT: v_or_b32_e32 v1, v3, v1
 ; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0
@@ -235034,7 +234755,25 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v48
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32
offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 @@ -235049,10 +234788,10 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v15 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 @@ -235067,10 +234806,8 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v15 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 @@ -235078,17 +234815,18 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v13 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 @@ -235096,17 +234834,18 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: 
buffer_load_dword v11, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v11 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 @@ -235185,22 +234924,6 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -238415,7 +238138,23 @@ define <64 x bfloat> @bitcast_v64i16_to_v64bf16(<64 x i16> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v28 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, 
s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 @@ -238580,22 +238319,6 @@ define <64 x bfloat> @bitcast_v64i16_to_v64bf16(<64 x i16> %a, i32 %b) { ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -240180,38 +239903,39 @@ define <64 x i16> @bitcast_v64f16_to_v64i16(<64 x half> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:92 ; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:88 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v43, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v8 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v5 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v2 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v6 ; SI-NEXT: 
v_cvt_f16_f32_e32 v6, v22 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v40, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v14 ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v6, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v14 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: v_cvt_f16_f32_e32 v12, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v23 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: v_cvt_f16_f32_e32 v23, v25 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v44, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v52, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v48, v16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v24 @@ -240222,7 +239946,6 @@ define <64 x i16> @bitcast_v64f16_to_v64i16(<64 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v31, v27 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v50 ; SI-NEXT: v_cvt_f16_f32_e32 v27, v29 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v6, v42 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v47 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v38 @@ -240804,16 +240527,6 @@ define <64 x i16> @bitcast_v64f16_to_v64i16(<64 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v46 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v10 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload @@ -240830,6 +240543,16 @@ define <64 x i16> @bitcast_v64f16_to_v64i16(<64 x half> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v10 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: 
s_setpc_b64 s[30:31] ; @@ -241300,10 +241023,12 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 @@ -241315,7 +241040,24 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v3, v26 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v49 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v54 +; SI-NEXT: v_mov_b32_e32 v54, v15 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v12 +; SI-NEXT: v_mov_b32_e32 v12, v42 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 @@ -241325,8 +241067,13 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v26, v3, v5 ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v3, v22 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v49 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 @@ -241335,39 +241082,22 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v22, v3, v5 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v3, v18 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v49 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v54 ; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:176 
; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v18, v3, v5 ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v3, v16 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_mov_b32_e32 v54, v15 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v12 -; SI-NEXT: v_mov_b32_e32 v12, v42 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 @@ -241385,8 +241115,6 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v14, v3, v5 @@ -241430,11 +241158,6 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v49 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 ; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload @@ -241571,27 +241294,27 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_or_b32_e32 v12, v50, v1 ; SI-NEXT: v_lshr_b64 v[49:50], v[35:36], 16 -; SI-NEXT: v_mov_b32_e32 v35, v44 -; SI-NEXT: v_lshr_b64 v[44:45], v[25:26], 16 ; SI-NEXT: v_lshr_b64 v[50:51], v[21:22], 16 -; SI-NEXT: v_lshr_b64 v[24:25], v[17:18], 16 ; SI-NEXT: v_lshr_b64 v[20:21], v[42:43], 16 -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshr_b64 v[20:21], v[9:10], 16 ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v35, v44 +; SI-NEXT: v_lshr_b64 v[44:45], v[25:26], 16 +; SI-NEXT: v_lshr_b64 v[24:25], v[17:18], 16 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshr_b64 v[20:21], v[40:41], 16 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 
offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshr_b64 v[12:13], v[13:14], 16 ; SI-NEXT: v_lshr_b64 v[24:25], v[3:4], 16 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshr_b64 v[20:21], v[1:2], 16 ; SI-NEXT: v_mov_b32_e32 v42, v61 ; SI-NEXT: v_mov_b32_e32 v61, v37 @@ -241685,17 +241408,18 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v22 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v21 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 @@ -241727,7 +241451,9 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v12 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 @@ -241742,10 +241468,10 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v12 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 @@ -241758,25 +241484,24 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt 
vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v12 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v31 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v33 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v9 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 @@ -241805,9 +241530,25 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v63 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v63 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x74, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen @@ -241825,22 +241566,6 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:96 ; 
4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -242878,9 +242603,24 @@ define <64 x half> @bitcast_v64i16_to_v64f16(<64 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 @@ -243225,22 +242965,6 @@ define <64 x half> @bitcast_v64i16_to_v64f16(<64 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: 
buffer_load_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -244323,15 +244047,8 @@ define inreg <64 x half> @bitcast_v64i16_to_v64f16_scalar(<64 x i16> inreg %a, i ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload @@ -244348,6 +244065,13 @@ define inreg <64 x half> @bitcast_v64i16_to_v64f16_scalar(<64 x i16> inreg %a, i ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll index 9041f64cb17fb..e688681c5ad09 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll @@ -15670,8 +15670,25 @@ define <40 x i8> @bitcast_v20i16_to_v40i8(<20 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v3, 0xff, v29 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v28 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v40 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], 
s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen @@ -15681,7 +15698,6 @@ define <40 x i8> @bitcast_v20i16_to_v40i8(<20 x i16> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v55 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v40 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 @@ -15711,22 +15727,6 @@ define <40 x i8> @bitcast_v20i16_to_v40i8(<20 x i16> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 36, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -15947,16 +15947,16 @@ define <40 x i8> @bitcast_v20i16_to_v40i8(<20 x i16> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 32, v0 ; VI-NEXT: 
buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v27 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v29 ; VI-NEXT: v_or_b32_sdwa v1, v42, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v17, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v0, vcc, 36, v0 -; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v29 +; VI-NEXT: v_or_b32_sdwa v2, v17, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v0, vcc, 36, v0 +; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -17964,14 +17964,6 @@ define <20 x i16> @bitcast_v40i8_to_v20i16(<40 x i8> %a, i32 %b) { ; VI-LABEL: bitcast_v40i8_to_v20i16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v34, v10 ; VI-NEXT: v_mov_b32_e32 v33, v8 ; VI-NEXT: v_mov_b32_e32 v35, v6 @@ -17988,6 +17980,14 @@ define <20 x i16> @bitcast_v40i8_to_v20i16(<40 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:20 ; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:12 ; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v31, v14 ; VI-NEXT: v_mov_b32_e32 v37, v12 ; VI-NEXT: v_lshlrev_b16_e32 v39, 8, v1 @@ -18005,17 +18005,15 @@ define <20 x i16> @bitcast_v40i8_to_v20i16(<40 x i8> %a, i32 %b) { ; VI-NEXT: v_lshlrev_b16_e32 v25, 8, v25 ; VI-NEXT: v_lshlrev_b16_e32 v27, 8, v27 ; VI-NEXT: v_lshlrev_b16_e32 v29, 8, v29 -; VI-NEXT: s_waitcnt 
vmcnt(9) +; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_lshlrev_b16_e32 v43, 8, v0 -; VI-NEXT: s_waitcnt vmcnt(8) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: v_lshlrev_b16_e32 v47, 8, v4 -; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: s_waitcnt vmcnt(13) ; VI-NEXT: v_lshlrev_b16_e32 v46, 8, v6 -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: s_waitcnt vmcnt(12) ; VI-NEXT: v_lshlrev_b16_e32 v44, 8, v8 -; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: s_waitcnt vmcnt(11) ; VI-NEXT: v_lshlrev_b16_e32 v45, 8, v10 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -18046,7 +18044,7 @@ define <20 x i16> @bitcast_v40i8_to_v20i16(<40 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v7, v28, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v8, v30, v43 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(8) ; VI-NEXT: v_or_b32_sdwa v8, v51, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v9, v53, v45 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -18101,14 +18099,14 @@ define <20 x i16> @bitcast_v40i8_to_v20i16(<40 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v0, v47, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_mov_b32_e32 v1, 0x300 ; VI-NEXT: v_add_u16_sdwa v9, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(10) ; VI-NEXT: v_add_u16_e32 v0, 3, v54 ; VI-NEXT: v_or_b32_sdwa v10, v46, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(9) ; VI-NEXT: v_add_u16_e32 v0, 3, v53 ; VI-NEXT: v_or_b32_sdwa v0, v45, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v8, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(8) ; VI-NEXT: v_add_u16_e32 v0, 3, v51 ; VI-NEXT: v_or_b32_sdwa v11, v44, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v0, 3, v30 @@ -21934,6 +21932,14 @@ define <40 x i8> @bitcast_v20f16_to_v40i8(<20 x half> %a, i32 %b) { ; SI-NEXT: v_bfe_u32 v29, v1, 8, 8 ; SI-NEXT: .LBB60_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v24, 0xff, v24 ; SI-NEXT: v_lshlrev_b32_e32 v32, 8, v32 ; SI-NEXT: v_and_b32_e32 v30, 0xff, v30 
@@ -22052,14 +22058,6 @@ define <40 x i8> @bitcast_v20f16_to_v40i8(<20 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 36, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -23918,18 +23916,6 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) { ; SI-LABEL: bitcast_v40i8_to_v20f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; SI-NEXT: v_mov_b32_e32 v36, v4 ; SI-NEXT: v_mov_b32_e32 v31, v2 ; SI-NEXT: v_mov_b32_e32 v35, v0 @@ -23943,6 +23929,18 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:20 ; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:40 ; 
4-byte Folded Spill ; SI-NEXT: v_lshlrev_b32_e32 v37, 8, v1 ; SI-NEXT: v_lshlrev_b32_e32 v38, 8, v3 ; SI-NEXT: v_lshlrev_b32_e32 v39, 8, v5 @@ -23974,20 +23972,16 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: s_waitcnt vmcnt(9) expcnt(4) +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v47, 8, v0 -; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v59, 8, v4 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_lshlrev_b32_e32 v58, 8, v32 -; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v56, 8, v33 -; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v57, 8, v34 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr32 @@ -24027,7 +24021,7 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v6, 0xff, v30 ; SI-NEXT: v_or_b32_e32 v6, v6, v47 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v6 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_and_b32_e32 v6, 0xff, v50 ; SI-NEXT: v_or_b32_e32 v6, v6, v56 ; SI-NEXT: v_cvt_f32_f16_e32 v32, v6 @@ -24105,18 +24099,17 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: v_or_b32_e32 v0, v59, v0 ; SI-NEXT: v_add_i32_e32 v19, vcc, 0x300, v0 -; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_movk_i32 s6, 0x300 ; SI-NEXT: v_or_b32_e32 v0, v58, v0 ; SI-NEXT: v_add_i32_e32 v34, vcc, s6, v0 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: v_or_b32_e32 v0, v57, v0 ; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v0 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: v_or_b32_e32 v0, v56, v0 @@ -24232,14 +24225,6 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) { ; VI-LABEL: bitcast_v40i8_to_v20f16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v34, v10 ; VI-NEXT: v_mov_b32_e32 v33, v8 ; VI-NEXT: v_mov_b32_e32 v35, v6 @@ -24256,6 +24241,14 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:20 ; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:12 ; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 
offset:68 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v31, v14 ; VI-NEXT: v_mov_b32_e32 v37, v12 ; VI-NEXT: v_lshlrev_b16_e32 v39, 8, v1 @@ -24273,17 +24266,15 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) { ; VI-NEXT: v_lshlrev_b16_e32 v25, 8, v25 ; VI-NEXT: v_lshlrev_b16_e32 v27, 8, v27 ; VI-NEXT: v_lshlrev_b16_e32 v29, 8, v29 -; VI-NEXT: s_waitcnt vmcnt(9) +; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_lshlrev_b16_e32 v43, 8, v0 -; VI-NEXT: s_waitcnt vmcnt(8) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: v_lshlrev_b16_e32 v47, 8, v4 -; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: s_waitcnt vmcnt(13) ; VI-NEXT: v_lshlrev_b16_e32 v46, 8, v6 -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: s_waitcnt vmcnt(12) ; VI-NEXT: v_lshlrev_b16_e32 v44, 8, v8 -; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: s_waitcnt vmcnt(11) ; VI-NEXT: v_lshlrev_b16_e32 v45, 8, v10 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -24314,7 +24305,7 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v7, v28, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v8, v30, v43 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(8) ; VI-NEXT: v_or_b32_sdwa v8, v51, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v9, v53, v45 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -24369,14 +24360,14 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v0, v47, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_mov_b32_e32 v1, 0x300 ; VI-NEXT: v_add_u16_sdwa v9, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(10) ; VI-NEXT: v_add_u16_e32 v0, 3, v54 ; VI-NEXT: v_or_b32_sdwa v10, v46, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(9) ; VI-NEXT: v_add_u16_e32 v0, 3, v53 ; VI-NEXT: v_or_b32_sdwa v0, v45, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v8, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(8) ; VI-NEXT: v_add_u16_e32 v0, 3, v51 ; VI-NEXT: v_or_b32_sdwa v11, v44, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v0, 3, v30 @@ 
-28252,15 +28243,6 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) { ; SI-LABEL: bitcast_v40i8_to_v5f64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; SI-NEXT: v_mov_b32_e32 v36, v10 ; SI-NEXT: v_mov_b32_e32 v35, v8 ; SI-NEXT: v_mov_b32_e32 v34, v6 @@ -28277,6 +28259,15 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:20 ; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; SI-NEXT: v_mov_b32_e32 v38, v14 ; SI-NEXT: v_mov_b32_e32 v37, v12 ; SI-NEXT: s_waitcnt expcnt(0) @@ -28295,17 +28286,14 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v52, 8, v25 ; SI-NEXT: v_lshlrev_b32_e32 v51, 24, v27 ; SI-NEXT: v_lshlrev_b32_e32 v27, 8, v29 -; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_lshlrev_b32_e32 v25, 24, v0 -; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_lshlrev_b32_e32 v17, 24, v4 -; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_lshlrev_b32_e32 v19, 8, v6 -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_lshlrev_b32_e32 v23, 8, v8 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v10 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -28368,7 +28356,7 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; SI-NEXT: v_or_b32_e32 v8, v25, v8 ; SI-NEXT: v_or_b32_e32 v7, v7, v8 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_and_b32_e32 v8, 0xff, v50 ; SI-NEXT: v_and_b32_e32 v9, 0xff, v49 ; SI-NEXT: v_or_b32_e32 v8, v8, v23 @@ -28508,7 +28496,7 @@ define <5 x double> 
@bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; SI-NEXT: v_or_b32_e32 v8, v25, v8 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v50 ; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 ; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v49 @@ -28557,15 +28545,6 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) { ; VI-LABEL: bitcast_v40i8_to_v5f64: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v36, v10 ; VI-NEXT: v_mov_b32_e32 v35, v8 ; VI-NEXT: v_mov_b32_e32 v34, v6 @@ -28582,6 +28561,15 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:20 ; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:12 ; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v38, v14 ; VI-NEXT: v_mov_b32_e32 v37, v12 ; VI-NEXT: v_lshlrev_b16_e32 v56, 8, v1 @@ -28599,17 +28587,14 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) { ; VI-NEXT: v_lshlrev_b16_e32 v52, 8, v25 ; VI-NEXT: v_lshlrev_b16_e32 v51, 8, v27 ; VI-NEXT: v_lshlrev_b16_e32 v27, 8, v29 -; VI-NEXT: s_waitcnt vmcnt(9) +; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_lshlrev_b16_e32 v25, 8, v0 -; VI-NEXT: s_waitcnt vmcnt(8) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: v_lshlrev_b16_e32 v17, 8, v4 -; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v19, 8, v6 -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: s_waitcnt vmcnt(13) ; VI-NEXT: v_lshlrev_b16_e32 v23, 8, v8 -; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: s_waitcnt vmcnt(12) ; VI-NEXT: v_lshlrev_b16_e32 v21, 8, v10 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -28640,7 +28625,7 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) { 
; VI-NEXT: v_or_b32_sdwa v7, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v8, v30, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(9) ; VI-NEXT: v_or_b32_sdwa v8, v50, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v9, v49, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -28748,7 +28733,7 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v7, 0x300, v7 ; VI-NEXT: v_add_u16_sdwa v8, v8, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v7, v7, v8 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(9) ; VI-NEXT: v_add_u16_e32 v8, 3, v50 ; VI-NEXT: v_add_u16_e32 v10, 3, v49 ; VI-NEXT: v_or_b32_sdwa v8, v23, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -28780,15 +28765,6 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) { ; GFX9-LABEL: bitcast_v40i8_to_v5f64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v36, v10 ; GFX9-NEXT: v_mov_b32_e32 v35, v8 ; GFX9-NEXT: v_mov_b32_e32 v34, v6 @@ -28805,6 +28781,16 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:20 ; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:12 ; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:4 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v38, v14 ; GFX9-NEXT: v_mov_b32_e32 v37, v12 ; GFX9-NEXT: v_lshlrev_b16_e32 v56, 8, v1 @@ -28822,17 +28808,17 @@ define <5 x double> 
@bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) { ; GFX9-NEXT: v_lshlrev_b16_e32 v52, 8, v25 ; GFX9-NEXT: v_lshlrev_b16_e32 v51, 8, v27 ; GFX9-NEXT: v_lshlrev_b16_e32 v27, 8, v29 -; GFX9-NEXT: s_waitcnt vmcnt(9) +; GFX9-NEXT: s_waitcnt vmcnt(18) ; GFX9-NEXT: v_lshlrev_b16_e32 v25, 8, v0 -; GFX9-NEXT: s_waitcnt vmcnt(8) +; GFX9-NEXT: s_waitcnt vmcnt(17) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: s_waitcnt vmcnt(16) ; GFX9-NEXT: v_lshlrev_b16_e32 v17, 8, v4 -; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: s_waitcnt vmcnt(14) ; GFX9-NEXT: v_lshlrev_b16_e32 v19, 8, v6 -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: s_waitcnt vmcnt(13) ; GFX9-NEXT: v_lshlrev_b16_e32 v23, 8, v8 -; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: s_waitcnt vmcnt(12) ; GFX9-NEXT: v_lshlrev_b16_e32 v21, 8, v10 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -28863,7 +28849,7 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v7, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v8, v30, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(9) ; GFX9-NEXT: v_or_b32_sdwa v8, v50, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v9, v49, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -28971,7 +28957,7 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) { ; GFX9-NEXT: v_add_u16_e32 v7, 0x300, v7 ; GFX9-NEXT: v_add_u16_sdwa v8, v8, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v7, v7, v8 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(9) ; GFX9-NEXT: v_add_u16_e32 v8, 3, v50 ; GFX9-NEXT: v_add_u16_e32 v9, 3, v49 ; GFX9-NEXT: v_or_b32_sdwa v8, v23, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -32301,15 +32287,6 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) { ; SI-LABEL: bitcast_v40i8_to_v5i64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; SI-NEXT: v_mov_b32_e32 v36, v10 ; SI-NEXT: v_mov_b32_e32 v35, v8 ; SI-NEXT: v_mov_b32_e32 v34, v6 @@ -32326,6 +32303,15 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:20 ; SI-NEXT: 
buffer_load_dword v49, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; SI-NEXT: v_mov_b32_e32 v38, v14 ; SI-NEXT: v_mov_b32_e32 v37, v12 ; SI-NEXT: s_waitcnt expcnt(0) @@ -32344,17 +32330,14 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v52, 8, v25 ; SI-NEXT: v_lshlrev_b32_e32 v51, 24, v27 ; SI-NEXT: v_lshlrev_b32_e32 v27, 8, v29 -; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_lshlrev_b32_e32 v25, 24, v0 -; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_lshlrev_b32_e32 v17, 24, v4 -; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_lshlrev_b32_e32 v19, 8, v6 -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_lshlrev_b32_e32 v23, 8, v8 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v10 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -32417,7 +32400,7 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; SI-NEXT: v_or_b32_e32 v8, v25, v8 ; SI-NEXT: v_or_b32_e32 v7, v7, v8 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_and_b32_e32 v8, 0xff, v50 ; SI-NEXT: v_and_b32_e32 v9, 0xff, v49 ; SI-NEXT: v_or_b32_e32 v8, v8, v23 @@ -32557,7 +32540,7 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; SI-NEXT: v_or_b32_e32 v8, v25, v8 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v50 ; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 ; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v49 @@ -32606,15 +32589,6 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) { ; VI-LABEL: bitcast_v40i8_to_v5i64: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v56, 
off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v36, v10 ; VI-NEXT: v_mov_b32_e32 v35, v8 ; VI-NEXT: v_mov_b32_e32 v34, v6 @@ -32631,6 +32605,15 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:20 ; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:12 ; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v38, v14 ; VI-NEXT: v_mov_b32_e32 v37, v12 ; VI-NEXT: v_lshlrev_b16_e32 v56, 8, v1 @@ -32648,17 +32631,14 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) { ; VI-NEXT: v_lshlrev_b16_e32 v52, 8, v25 ; VI-NEXT: v_lshlrev_b16_e32 v51, 8, v27 ; VI-NEXT: v_lshlrev_b16_e32 v27, 8, v29 -; VI-NEXT: s_waitcnt vmcnt(9) +; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_lshlrev_b16_e32 v25, 8, v0 -; VI-NEXT: s_waitcnt vmcnt(8) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: v_lshlrev_b16_e32 v17, 8, v4 -; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v19, 8, v6 -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: s_waitcnt vmcnt(13) ; VI-NEXT: v_lshlrev_b16_e32 v23, 8, v8 -; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: s_waitcnt vmcnt(12) ; VI-NEXT: v_lshlrev_b16_e32 v21, 8, v10 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -32689,7 +32669,7 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v7, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v8, v30, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(9) ; VI-NEXT: v_or_b32_sdwa v8, v50, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v9, v49, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -32797,7 +32777,7 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v7, 0x300, v7 ; VI-NEXT: v_add_u16_sdwa v8, v8, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v7, v7, v8 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(9) ; VI-NEXT: v_add_u16_e32 v8, 3, v50 ; VI-NEXT: v_add_u16_e32 v10, 3, v49 ; VI-NEXT: v_or_b32_sdwa v8, v23, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -32829,15 +32809,6 @@ define <5 x i64> 
@bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) { ; GFX9-LABEL: bitcast_v40i8_to_v5i64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v36, v10 ; GFX9-NEXT: v_mov_b32_e32 v35, v8 ; GFX9-NEXT: v_mov_b32_e32 v34, v6 @@ -32854,6 +32825,16 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:20 ; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:12 ; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:4 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v38, v14 ; GFX9-NEXT: v_mov_b32_e32 v37, v12 ; GFX9-NEXT: v_lshlrev_b16_e32 v56, 8, v1 @@ -32871,17 +32852,17 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) { ; GFX9-NEXT: v_lshlrev_b16_e32 v52, 8, v25 ; GFX9-NEXT: v_lshlrev_b16_e32 v51, 8, v27 ; GFX9-NEXT: v_lshlrev_b16_e32 v27, 8, v29 -; GFX9-NEXT: s_waitcnt vmcnt(9) +; GFX9-NEXT: s_waitcnt vmcnt(18) ; GFX9-NEXT: v_lshlrev_b16_e32 v25, 8, v0 -; GFX9-NEXT: s_waitcnt vmcnt(8) +; GFX9-NEXT: s_waitcnt vmcnt(17) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: s_waitcnt vmcnt(16) ; GFX9-NEXT: v_lshlrev_b16_e32 v17, 8, v4 -; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: s_waitcnt vmcnt(14) ; GFX9-NEXT: v_lshlrev_b16_e32 v19, 8, v6 -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: s_waitcnt vmcnt(13) ; GFX9-NEXT: v_lshlrev_b16_e32 v23, 8, v8 -; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: s_waitcnt vmcnt(12) ; GFX9-NEXT: v_lshlrev_b16_e32 v21, 8, v10 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -32912,7 +32893,7 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v7, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v8, v30, v25 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(9) ; GFX9-NEXT: v_or_b32_sdwa v8, v50, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v9, v49, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -33020,7 +33001,7 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) { ; GFX9-NEXT: v_add_u16_e32 v7, 0x300, v7 ; GFX9-NEXT: v_add_u16_sdwa v8, v8, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v7, v7, v8 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(9) ; GFX9-NEXT: v_add_u16_e32 v8, 3, v50 ; GFX9-NEXT: v_add_u16_e32 v9, 3, v49 ; GFX9-NEXT: v_or_b32_sdwa v8, v23, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll index ee23420c2a662..39da45b3e5063 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll @@ -2406,13 +2406,13 @@ define <16 x i32> @bitcast_v32i16_to_v16i32(<32 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v32i16_to_v16i32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: v_mov_b32_e32 v32, v2 ; SI-NEXT: v_mov_b32_e32 v31, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: v_mov_b32_e32 v38, v14 ; SI-NEXT: v_mov_b32_e32 v37, v12 ; SI-NEXT: v_mov_b32_e32 v36, v10 @@ -2435,9 +2435,9 @@ define <16 x i32> @bitcast_v32i16_to_v16i32(<32 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v25 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v29 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v2 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -8424,6 +8424,22 @@ define <64 x i8> @bitcast_v16i32_to_v64i8(<16 x i32> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v14 ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v34 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword 
v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v33 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 @@ -8459,22 +8475,6 @@ define <64 x i8> @bitcast_v16i32_to_v64i8(<16 x i32> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 60, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -8757,6 +8757,22 @@ define <64 x i8> @bitcast_v16i32_to_v64i8(<16 x i32> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 44, v0 +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 
offset:24 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v36 ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v18 @@ -8790,22 +8806,6 @@ define <64 x i8> @bitcast_v16i32_to_v64i8(<16 x i32> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -9077,9 +9077,25 @@ define <64 x i8> @bitcast_v16i32_to_v64i8(<16 x i32> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: 
buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v18 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:44 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v36 -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v18 ; GFX9-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v2, v35, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -9106,22 +9122,6 @@ define <64 x i8> @bitcast_v16i32_to_v64i8(<16 x i32> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60 -; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -11440,11 +11440,6 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v47 -; SI-NEXT: buffer_store_dword v0, 
off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:108 ; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:100 ; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:92 @@ -11453,6 +11448,11 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:68 ; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:60 ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -11484,7 +11484,6 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v11, 0xff, v52 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v43, v11 -; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_and_b32_e32 v12, 0xff, v58 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v54, v12 @@ -11723,7 +11722,6 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v43, v11 -; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v58 ; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 @@ -11972,11 +11970,11 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:112 ; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:120 ; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:128 +; VI-NEXT: v_lshlrev_b16_e32 v28, 8, v25 +; VI-NEXT: v_lshlrev_b16_e32 v30, 8, v27 ; VI-NEXT: v_lshlrev_b16_e32 v20, 8, v19 ; VI-NEXT: v_lshlrev_b16_e32 v22, 8, v21 ; VI-NEXT: v_lshlrev_b16_e32 v24, 8, v23 -; VI-NEXT: v_lshlrev_b16_e32 v28, 8, v25 -; VI-NEXT: v_lshlrev_b16_e32 v30, 8, v27 ; VI-NEXT: v_lshlrev_b16_e32 v63, 8, v29 ; VI-NEXT: v_lshlrev_b16_e32 v18, 8, v17 ; VI-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:124 @@ -12016,16 +12014,9 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; VI-NEXT: v_lshlrev_b16_e32 v25, 8, v53 ; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_lshlrev_b16_e32 v21, 8, v40 -; VI-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:44 -; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:36 -; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:28 -; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:116 -; VI-NEXT: s_waitcnt vmcnt(9) +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v27, 8, v41 -; VI-NEXT: s_waitcnt vmcnt(8) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v60, 8, v45 ; VI-NEXT: 
buffer_load_ushort v53, off, s[0:3], s32 offset:108 ; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:100 @@ -12035,6 +12026,13 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:68 ; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:60 ; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:52 +; VI-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:116 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -12044,11 +12042,10 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: v_or_b32_sdwa v9, v40, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v10, v49, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v11, v23, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_or_b32_sdwa v12, v38, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v13, v58, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v14, v45, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -12211,7 +12208,7 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v15, 0x300 -; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: v_add_u16_e32 v9, 3, v40 ; VI-NEXT: v_or_b32_sdwa v9, v57, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v9, v9, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -12221,7 +12218,6 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v11, 3, v23 ; VI-NEXT: v_or_b32_sdwa v11, v42, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v11, v11, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_add_u16_e32 v12, 3, v38 ; VI-NEXT: v_or_b32_sdwa v12, v54, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v12, v12, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -12428,11 +12424,11 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:112 ; GFX9-NEXT: buffer_load_ushort v41, off, 
s[0:3], s32 offset:120 ; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:128 +; GFX9-NEXT: v_lshlrev_b16_e32 v28, 8, v25 +; GFX9-NEXT: v_lshlrev_b16_e32 v30, 8, v27 ; GFX9-NEXT: v_lshlrev_b16_e32 v20, 8, v19 ; GFX9-NEXT: v_lshlrev_b16_e32 v22, 8, v21 ; GFX9-NEXT: v_lshlrev_b16_e32 v24, 8, v23 -; GFX9-NEXT: v_lshlrev_b16_e32 v28, 8, v25 -; GFX9-NEXT: v_lshlrev_b16_e32 v30, 8, v27 ; GFX9-NEXT: v_lshlrev_b16_e32 v63, 8, v29 ; GFX9-NEXT: v_lshlrev_b16_e32 v18, 8, v17 ; GFX9-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:124 @@ -12476,16 +12472,9 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: v_lshlrev_b16_e32 v25, 8, v53 ; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_lshlrev_b16_e32 v21, 8, v40 -; GFX9-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:44 -; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:36 -; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:28 -; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:20 -; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:116 -; GFX9-NEXT: s_waitcnt vmcnt(9) +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v27, 8, v41 -; GFX9-NEXT: s_waitcnt vmcnt(8) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v60, 8, v45 ; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:108 ; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:100 @@ -12495,6 +12484,13 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:68 ; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:60 ; GFX9-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:52 +; GFX9-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:116 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -12504,11 +12500,10 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_or_b32_sdwa v9, v40, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v10, v49, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v11, v23, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_or_b32_sdwa v12, v38, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v13, v58, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v14, v45, v39 
dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -12671,7 +12666,7 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; GFX9-NEXT: s_movk_i32 s6, 0x300 -; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_add_u16_e32 v9, 3, v40 ; GFX9-NEXT: v_or_b32_sdwa v9, v57, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v9, v9, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -12681,7 +12676,6 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: v_add_u16_e32 v11, 3, v23 ; GFX9-NEXT: v_or_b32_sdwa v11, v42, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v11, v11, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_add_u16_e32 v12, 3, v38 ; GFX9-NEXT: v_or_b32_sdwa v12, v54, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v12, v12, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -17323,13 +17317,13 @@ define <16 x float> @bitcast_v32i16_to_v16f32(<32 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v32i16_to_v16f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: v_mov_b32_e32 v32, v2 ; SI-NEXT: v_mov_b32_e32 v31, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: v_mov_b32_e32 v38, v14 ; SI-NEXT: v_mov_b32_e32 v37, v12 ; SI-NEXT: v_mov_b32_e32 v36, v10 @@ -17352,9 +17346,9 @@ define <16 x float> @bitcast_v32i16_to_v16f32(<32 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v25 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v29 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v2 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -23315,6 +23309,22 @@ define <64 x i8> @bitcast_v16f32_to_v64i8(<16 x float> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v14 ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v34 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, 
off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v33 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 @@ -23350,22 +23360,6 @@ define <64 x i8> @bitcast_v16f32_to_v64i8(<16 x float> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 60, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -23648,6 +23642,22 @@ define <64 x i8> @bitcast_v16f32_to_v64i8(<16 x float> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 44, v0 +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 
offset:24 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v36 ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v18 @@ -23681,22 +23691,6 @@ define <64 x i8> @bitcast_v16f32_to_v64i8(<16 x float> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -23968,9 +23962,25 @@ define <64 x i8> @bitcast_v16f32_to_v64i8(<16 x float> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: 
buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v18 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:44 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v36 -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v18 ; GFX9-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v2, v35, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -23997,22 +24007,6 @@ define <64 x i8> @bitcast_v16f32_to_v64i8(<16 x float> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60 -; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -25440,6 +25434,21 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3 ; VI-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:68 ; 4-byte 
Folded Reload ; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; VI-NEXT: v_readlane_b32 s67, v63, 19 ; VI-NEXT: v_readlane_b32 s66, v63, 18 ; VI-NEXT: v_readlane_b32 s65, v63, 17 @@ -25460,7 +25469,7 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3 ; VI-NEXT: v_readlane_b32 s34, v63, 2 ; VI-NEXT: v_readlane_b32 s31, v63, 1 ; VI-NEXT: v_readlane_b32 s30, v63, 0 -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v5 ; VI-NEXT: v_or_b32_sdwa v5, v33, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -25490,21 +25499,6 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3 ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v0, vcc, 60, v0 ; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 
4-byte Folded Reload ; VI-NEXT: s_or_saveexec_b64 s[4:5], -1 ; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; VI-NEXT: s_mov_b64 exec, s[4:5] @@ -25873,6 +25867,21 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3 ; GFX9-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; GFX9-NEXT: v_readlane_b32 s55, v63, 15 ; GFX9-NEXT: v_readlane_b32 s54, v63, 14 ; GFX9-NEXT: v_readlane_b32 s53, v63, 13 @@ -25889,7 +25898,7 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3 ; GFX9-NEXT: v_readlane_b32 s34, v63, 2 ; GFX9-NEXT: v_readlane_b32 s31, v63, 1 ; GFX9-NEXT: v_readlane_b32 s30, v63, 0 -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: s_waitcnt vmcnt(16) ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v5 ; GFX9-NEXT: v_or_b32_sdwa v5, v33, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -25915,21 +25924,6 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3 ; GFX9-NEXT: v_or_b32_sdwa v2, v14, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60 -; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], 
s32 offset:28 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[4:5] @@ -26452,11 +26446,6 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v47 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:108 ; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:100 ; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:92 @@ -26465,6 +26454,11 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:68 ; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:60 ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -26496,7 +26490,6 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v11, 0xff, v52 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v43, v11 -; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_and_b32_e32 v12, 0xff, v58 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v54, v12 @@ -26735,7 +26728,6 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v43, v11 -; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v58 ; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 @@ -26984,11 +26976,11 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:112 ; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:120 ; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:128 +; VI-NEXT: v_lshlrev_b16_e32 v28, 8, v25 +; VI-NEXT: v_lshlrev_b16_e32 v30, 8, v27 ; VI-NEXT: v_lshlrev_b16_e32 v20, 8, v19 ; VI-NEXT: v_lshlrev_b16_e32 v22, 8, v21 ; VI-NEXT: v_lshlrev_b16_e32 v24, 8, v23 -; 
VI-NEXT: v_lshlrev_b16_e32 v28, 8, v25 -; VI-NEXT: v_lshlrev_b16_e32 v30, 8, v27 ; VI-NEXT: v_lshlrev_b16_e32 v63, 8, v29 ; VI-NEXT: v_lshlrev_b16_e32 v18, 8, v17 ; VI-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:124 @@ -27028,16 +27020,9 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; VI-NEXT: v_lshlrev_b16_e32 v25, 8, v53 ; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_lshlrev_b16_e32 v21, 8, v40 -; VI-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:44 -; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:36 -; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:28 -; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:116 -; VI-NEXT: s_waitcnt vmcnt(9) +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v27, 8, v41 -; VI-NEXT: s_waitcnt vmcnt(8) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v60, 8, v45 ; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:108 ; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:100 @@ -27047,6 +27032,13 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:68 ; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:60 ; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:52 +; VI-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:116 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -27056,11 +27048,10 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: v_or_b32_sdwa v9, v40, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v10, v49, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v11, v23, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_or_b32_sdwa v12, v38, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v13, v58, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v14, v45, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -27223,7 +27214,7 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v15, 0x300 -; VI-NEXT: s_waitcnt vmcnt(14) +; 
VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: v_add_u16_e32 v9, 3, v40 ; VI-NEXT: v_or_b32_sdwa v9, v57, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v9, v9, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -27233,7 +27224,6 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v11, 3, v23 ; VI-NEXT: v_or_b32_sdwa v11, v42, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v11, v11, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_add_u16_e32 v12, 3, v38 ; VI-NEXT: v_or_b32_sdwa v12, v54, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v12, v12, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -27440,11 +27430,11 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:112 ; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:120 ; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:128 +; GFX9-NEXT: v_lshlrev_b16_e32 v28, 8, v25 +; GFX9-NEXT: v_lshlrev_b16_e32 v30, 8, v27 ; GFX9-NEXT: v_lshlrev_b16_e32 v20, 8, v19 ; GFX9-NEXT: v_lshlrev_b16_e32 v22, 8, v21 ; GFX9-NEXT: v_lshlrev_b16_e32 v24, 8, v23 -; GFX9-NEXT: v_lshlrev_b16_e32 v28, 8, v25 -; GFX9-NEXT: v_lshlrev_b16_e32 v30, 8, v27 ; GFX9-NEXT: v_lshlrev_b16_e32 v63, 8, v29 ; GFX9-NEXT: v_lshlrev_b16_e32 v18, 8, v17 ; GFX9-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:124 @@ -27488,16 +27478,9 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: v_lshlrev_b16_e32 v25, 8, v53 ; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_lshlrev_b16_e32 v21, 8, v40 -; GFX9-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:44 -; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:36 -; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:28 -; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:20 -; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:116 -; GFX9-NEXT: s_waitcnt vmcnt(9) +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v27, 8, v41 -; GFX9-NEXT: s_waitcnt vmcnt(8) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v60, 8, v45 ; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:108 ; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:100 @@ -27507,6 +27490,13 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:68 ; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:60 ; GFX9-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:52 +; GFX9-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:116 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX9-NEXT: 
s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -27516,11 +27506,10 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_or_b32_sdwa v9, v40, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v10, v49, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v11, v23, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_or_b32_sdwa v12, v38, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v13, v58, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v14, v45, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -27683,7 +27672,7 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; GFX9-NEXT: s_movk_i32 s6, 0x300 -; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_add_u16_e32 v9, 3, v40 ; GFX9-NEXT: v_or_b32_sdwa v9, v57, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v9, v9, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -27693,7 +27682,6 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: v_add_u16_e32 v11, 3, v23 ; GFX9-NEXT: v_or_b32_sdwa v11, v42, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v11, v11, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_add_u16_e32 v12, 3, v38 ; GFX9-NEXT: v_or_b32_sdwa v12, v54, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v12, v12, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -31688,13 +31676,13 @@ define <8 x i64> @bitcast_v32i16_to_v8i64(<32 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v32i16_to_v8i64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: v_mov_b32_e32 v32, v2 ; SI-NEXT: v_mov_b32_e32 v31, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: v_mov_b32_e32 v38, v14 ; SI-NEXT: v_mov_b32_e32 v37, v12 ; SI-NEXT: v_mov_b32_e32 v36, v10 @@ -31717,9 +31705,9 @@ define <8 x i64> @bitcast_v32i16_to_v8i64(<32 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v25 ; 
SI-NEXT: v_lshlrev_b32_e32 v21, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v29 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v2 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -37714,6 +37702,22 @@ define <64 x i8> @bitcast_v8i64_to_v64i8(<8 x i64> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v14 ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v34 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v33 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 @@ -37749,22 +37753,6 @@ define <64 x i8> @bitcast_v8i64_to_v64i8(<8 x i64> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 60, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, 
off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -38047,6 +38035,22 @@ define <64 x i8> @bitcast_v8i64_to_v64i8(<8 x i64> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 44, v0 +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v36 ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v18 @@ -38080,22 +38084,6 @@ define <64 x i8> @bitcast_v8i64_to_v64i8(<8 x i64> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded 
Reload -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -38367,9 +38355,25 @@ define <64 x i8> @bitcast_v8i64_to_v64i8(<8 x i64> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v18 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:44 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v36 -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v18 ; GFX9-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v2, v35, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -38396,22 +38400,6 @@ define <64 x i8> @bitcast_v8i64_to_v64i8(<8 x i64> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60 -; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], 
s32 offset:28 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -40740,11 +40728,6 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v47 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:108 ; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:100 ; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:92 @@ -40753,6 +40736,11 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:68 ; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:60 ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -40784,7 +40772,6 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v11, 0xff, v52 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v43, v11 -; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_and_b32_e32 v12, 0xff, v58 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v54, v12 @@ -41023,7 +41010,6 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v43, v11 -; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v58 ; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 @@ -41272,11 +41258,11 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:112 ; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:120 ; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:128 +; VI-NEXT: v_lshlrev_b16_e32 v28, 8, v25 +; VI-NEXT: v_lshlrev_b16_e32 v30, 8, v27 ; VI-NEXT: v_lshlrev_b16_e32 v20, 8, v19 ; VI-NEXT: v_lshlrev_b16_e32 v22, 8, v21 ; VI-NEXT: v_lshlrev_b16_e32 v24, 8, v23 -; VI-NEXT: v_lshlrev_b16_e32 v28, 
8, v25 -; VI-NEXT: v_lshlrev_b16_e32 v30, 8, v27 ; VI-NEXT: v_lshlrev_b16_e32 v63, 8, v29 ; VI-NEXT: v_lshlrev_b16_e32 v18, 8, v17 ; VI-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:124 @@ -41316,16 +41302,9 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; VI-NEXT: v_lshlrev_b16_e32 v25, 8, v53 ; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_lshlrev_b16_e32 v21, 8, v40 -; VI-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:44 -; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:36 -; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:28 -; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:116 -; VI-NEXT: s_waitcnt vmcnt(9) +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v27, 8, v41 -; VI-NEXT: s_waitcnt vmcnt(8) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v60, 8, v45 ; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:108 ; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:100 @@ -41335,6 +41314,13 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:68 ; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:60 ; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:52 +; VI-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:116 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -41344,11 +41330,10 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: v_or_b32_sdwa v9, v40, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v10, v49, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v11, v23, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_or_b32_sdwa v12, v38, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v13, v58, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v14, v45, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -41511,7 +41496,7 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v15, 0x300 -; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: 
v_add_u16_e32 v9, 3, v40 ; VI-NEXT: v_or_b32_sdwa v9, v57, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v9, v9, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -41521,7 +41506,6 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v11, 3, v23 ; VI-NEXT: v_or_b32_sdwa v11, v42, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v11, v11, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_add_u16_e32 v12, 3, v38 ; VI-NEXT: v_or_b32_sdwa v12, v54, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v12, v12, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -41728,11 +41712,11 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:112 ; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:120 ; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:128 +; GFX9-NEXT: v_lshlrev_b16_e32 v28, 8, v25 +; GFX9-NEXT: v_lshlrev_b16_e32 v30, 8, v27 ; GFX9-NEXT: v_lshlrev_b16_e32 v20, 8, v19 ; GFX9-NEXT: v_lshlrev_b16_e32 v22, 8, v21 ; GFX9-NEXT: v_lshlrev_b16_e32 v24, 8, v23 -; GFX9-NEXT: v_lshlrev_b16_e32 v28, 8, v25 -; GFX9-NEXT: v_lshlrev_b16_e32 v30, 8, v27 ; GFX9-NEXT: v_lshlrev_b16_e32 v63, 8, v29 ; GFX9-NEXT: v_lshlrev_b16_e32 v18, 8, v17 ; GFX9-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:124 @@ -41776,16 +41760,9 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: v_lshlrev_b16_e32 v25, 8, v53 ; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_lshlrev_b16_e32 v21, 8, v40 -; GFX9-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:44 -; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:36 -; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:28 -; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:20 -; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:116 -; GFX9-NEXT: s_waitcnt vmcnt(9) +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v27, 8, v41 -; GFX9-NEXT: s_waitcnt vmcnt(8) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v60, 8, v45 ; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:108 ; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:100 @@ -41795,6 +41772,13 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:68 ; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:60 ; GFX9-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:52 +; GFX9-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:116 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], 
exec, s[4:5] @@ -41804,11 +41788,10 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_or_b32_sdwa v9, v40, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v10, v49, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v11, v23, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_or_b32_sdwa v12, v38, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v13, v58, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v14, v45, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -41971,7 +41954,7 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; GFX9-NEXT: s_movk_i32 s6, 0x300 -; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_add_u16_e32 v9, 3, v40 ; GFX9-NEXT: v_or_b32_sdwa v9, v57, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v9, v9, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -41981,7 +41964,6 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: v_add_u16_e32 v11, 3, v23 ; GFX9-NEXT: v_or_b32_sdwa v11, v42, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v11, v11, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_add_u16_e32 v12, 3, v38 ; GFX9-NEXT: v_or_b32_sdwa v12, v54, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v12, v12, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -45317,13 +45299,13 @@ define <8 x double> @bitcast_v32i16_to_v8f64(<32 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v32i16_to_v8f64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: v_mov_b32_e32 v32, v2 ; SI-NEXT: v_mov_b32_e32 v31, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: v_mov_b32_e32 v38, v14 ; SI-NEXT: v_mov_b32_e32 v37, v12 ; SI-NEXT: v_mov_b32_e32 v36, v10 @@ -45346,9 +45328,9 @@ define <8 x double> @bitcast_v32i16_to_v8f64(<32 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v25 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v19, 
16, v29 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v2 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -51165,6 +51147,22 @@ define <64 x i8> @bitcast_v8f64_to_v64i8(<8 x double> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v14 ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v34 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v33 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 @@ -51200,22 +51198,6 @@ define <64 x i8> @bitcast_v8f64_to_v64i8(<8 x double> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 60, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: 
buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -51490,6 +51472,22 @@ define <64 x i8> @bitcast_v8f64_to_v64i8(<8 x double> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 44, v0 +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v36 ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v18 @@ -51523,22 +51521,6 @@ define <64 x i8> @bitcast_v8f64_to_v64i8(<8 x double> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 
offset:56 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -51802,9 +51784,25 @@ define <64 x i8> @bitcast_v8f64_to_v64i8(<8 x double> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v18 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:44 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v36 -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v18 ; GFX9-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v2, v35, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -51831,22 +51829,6 @@ define <64 x i8> @bitcast_v8f64_to_v64i8(<8 x double> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60 -; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX9-NEXT: 
buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -54188,11 +54170,6 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v47 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:108 ; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:100 ; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:92 @@ -54201,6 +54178,11 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:68 ; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:60 ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -54232,7 +54214,6 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v11, 0xff, v52 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v43, v11 -; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_and_b32_e32 v12, 0xff, v58 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v54, v12 @@ -54471,7 +54452,6 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v43, v11 -; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v58 ; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 @@ -54720,11 +54700,11 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:112 ; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:120 ; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:128 +; VI-NEXT: v_lshlrev_b16_e32 v28, 8, v25 +; VI-NEXT: v_lshlrev_b16_e32 v30, 8, v27 ; VI-NEXT: v_lshlrev_b16_e32 v20, 8, v19 ; VI-NEXT: v_lshlrev_b16_e32 v22, 8, v21 ; VI-NEXT: v_lshlrev_b16_e32 v24, 8, v23 -; VI-NEXT: v_lshlrev_b16_e32 v28, 8, v25 -; VI-NEXT: v_lshlrev_b16_e32 
v30, 8, v27 ; VI-NEXT: v_lshlrev_b16_e32 v63, 8, v29 ; VI-NEXT: v_lshlrev_b16_e32 v18, 8, v17 ; VI-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:124 @@ -54764,16 +54744,9 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; VI-NEXT: v_lshlrev_b16_e32 v25, 8, v53 ; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_lshlrev_b16_e32 v21, 8, v40 -; VI-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:44 -; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:36 -; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:28 -; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:116 -; VI-NEXT: s_waitcnt vmcnt(9) +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v27, 8, v41 -; VI-NEXT: s_waitcnt vmcnt(8) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v60, 8, v45 ; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:108 ; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:100 @@ -54783,6 +54756,13 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:68 ; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:60 ; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:52 +; VI-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:116 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -54792,11 +54772,10 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: v_or_b32_sdwa v9, v40, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v10, v49, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v11, v23, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_or_b32_sdwa v12, v38, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v13, v58, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v14, v45, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -54959,7 +54938,7 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v15, 0x300 -; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: v_add_u16_e32 v9, 3, v40 ; 
VI-NEXT: v_or_b32_sdwa v9, v57, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v9, v9, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -54969,7 +54948,6 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v11, 3, v23 ; VI-NEXT: v_or_b32_sdwa v11, v42, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v11, v11, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_add_u16_e32 v12, 3, v38 ; VI-NEXT: v_or_b32_sdwa v12, v54, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v12, v12, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -55176,11 +55154,11 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:112 ; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:120 ; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:128 +; GFX9-NEXT: v_lshlrev_b16_e32 v28, 8, v25 +; GFX9-NEXT: v_lshlrev_b16_e32 v30, 8, v27 ; GFX9-NEXT: v_lshlrev_b16_e32 v20, 8, v19 ; GFX9-NEXT: v_lshlrev_b16_e32 v22, 8, v21 ; GFX9-NEXT: v_lshlrev_b16_e32 v24, 8, v23 -; GFX9-NEXT: v_lshlrev_b16_e32 v28, 8, v25 -; GFX9-NEXT: v_lshlrev_b16_e32 v30, 8, v27 ; GFX9-NEXT: v_lshlrev_b16_e32 v63, 8, v29 ; GFX9-NEXT: v_lshlrev_b16_e32 v18, 8, v17 ; GFX9-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:124 @@ -55224,16 +55202,9 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: v_lshlrev_b16_e32 v25, 8, v53 ; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_lshlrev_b16_e32 v21, 8, v40 -; GFX9-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:44 -; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:36 -; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:28 -; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:20 -; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:116 -; GFX9-NEXT: s_waitcnt vmcnt(9) +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v27, 8, v41 -; GFX9-NEXT: s_waitcnt vmcnt(8) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v60, 8, v45 ; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:108 ; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:100 @@ -55243,6 +55214,13 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:68 ; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:60 ; GFX9-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:52 +; GFX9-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:116 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ 
-55252,11 +55230,10 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_or_b32_sdwa v9, v40, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v10, v49, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v11, v23, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_or_b32_sdwa v12, v38, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v13, v58, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v14, v45, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -55419,7 +55396,7 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; GFX9-NEXT: s_movk_i32 s6, 0x300 -; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_add_u16_e32 v9, 3, v40 ; GFX9-NEXT: v_or_b32_sdwa v9, v57, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v9, v9, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -55429,7 +55406,6 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: v_add_u16_e32 v11, 3, v23 ; GFX9-NEXT: v_or_b32_sdwa v11, v42, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v11, v11, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_add_u16_e32 v12, 3, v38 ; GFX9-NEXT: v_or_b32_sdwa v12, v54, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v12, v12, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -60580,6 +60556,8 @@ define <32 x i16> @bitcast_v32bf16_to_v32i16(<32 x bfloat> %a, i32 %b) { ; SI-LABEL: bitcast_v32bf16_to_v32i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill @@ -60596,8 +60574,6 @@ define <32 x i16> @bitcast_v32bf16_to_v32i16(<32 x bfloat> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v63, 1.0, v0 ; SI-NEXT: v_mul_f32_e32 v62, 1.0, v1 @@ -60661,9 +60637,8 @@ define <32 x i16> 
@bitcast_v32bf16_to_v32i16(<32 x bfloat> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v55, 1.0, v55 ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -64326,18 +64301,6 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v1, 0xff, v2 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v62 -; SI-NEXT: v_add_i32_e32 v0, vcc, 60, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload @@ -64354,6 +64317,18 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 60, v0 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -64471,44 +64446,44 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v17, 24, v14 ; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v14 +; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[15:16] ; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v13 ; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v17, 24, v12 +; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v12 +; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[13:14] ; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v11 ; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v17, 24, v10 +; VI-NEXT: buffer_store_dword 
v19, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v10 +; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[11:12] ; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v17, 24, v8 +; VI-NEXT: v_lshrrev_b64 v[20:21], 24, v[9:10] ; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v8 +; VI-NEXT: v_lshrrev_b64 v[21:22], 24, v[7:8] ; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v7 +; VI-NEXT: v_lshrrev_b64 v[22:23], 24, v[5:6] ; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v6 -; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[15:16] -; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v5 -; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[13:14] -; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[11:12] -; VI-NEXT: v_lshrrev_b64 v[20:21], 24, v[9:10] -; VI-NEXT: v_lshrrev_b64 v[21:22], 24, v[7:8] -; VI-NEXT: v_lshrrev_b64 v[22:23], 24, v[5:6] ; VI-NEXT: v_lshrrev_b64 v[23:24], 24, v[3:4] ; VI-NEXT: v_lshrrev_b32_e32 v50, 24, v16 +; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v5 ; VI-NEXT: v_lshrrev_b64 v[24:25], 24, v[1:2] ; VI-NEXT: v_lshrrev_b32_e32 v42, 8, v9 ; VI-NEXT: v_lshrrev_b32_e32 v43, 24, v6 +; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v46, 24, v4 ; VI-NEXT: v_lshrrev_b32_e32 v55, 8, v4 ; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v3 @@ -64805,8 +64780,6 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) { ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: v_or_b32_sdwa v1, v62, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -64823,6 +64796,8 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: 
s_setpc_b64 s[30:31] ; @@ -65094,9 +65069,25 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v18 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:44 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v36 -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v18 ; GFX9-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v2, v35, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -65123,22 +65114,6 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60 -; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; 
GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -67243,6 +67218,21 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32 ; GFX9-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; GFX9-NEXT: v_readlane_b32 s55, v63, 15 ; GFX9-NEXT: v_readlane_b32 s54, v63, 14 ; GFX9-NEXT: v_readlane_b32 s53, v63, 13 @@ -67259,7 +67249,7 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32 ; GFX9-NEXT: v_readlane_b32 s34, v63, 2 ; GFX9-NEXT: v_readlane_b32 s31, v63, 1 ; GFX9-NEXT: v_readlane_b32 s30, v63, 0 -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: s_waitcnt vmcnt(16) ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v5 ; GFX9-NEXT: v_or_b32_sdwa v5, v33, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -67285,21 +67275,6 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32 ; GFX9-NEXT: v_or_b32_sdwa v2, v14, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60 -; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded 
Reload -; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[4:5] @@ -67768,17 +67743,61 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:84 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:76 ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:116 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:100 ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v13 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v15 ; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v11 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v9 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v7 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v23 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v17 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; SI-NEXT: v_lshlrev_b32_e32 v40, 8, v21 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill ; SI-NEXT: v_lshlrev_b32_e32 v63, 24, v19 ; SI-NEXT: v_lshlrev_b32_e32 v24, 24, v27 +; SI-NEXT: buffer_store_dword v1, off, 
s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr55 @@ -67793,25 +67812,24 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v31 ; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v32 ; SI-NEXT: v_lshlrev_b32_e32 v44, 8, v33 -; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v34 -; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_lshlrev_b32_e32 v56, 24, v35 -; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_lshlrev_b32_e32 v60, 8, v36 -; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_lshlrev_b32_e32 v59, 24, v37 -; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_lshlrev_b32_e32 v61, 24, v38 ; SI-NEXT: v_lshlrev_b32_e32 v22, 8, v25 -; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v39 -; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_lshlrev_b32_e32 v45, 8, v48 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v49 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v50 ; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr36 @@ -67819,7 +67837,6 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 @@ -67833,57 +67850,8 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:4 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:116 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:108 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:100 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v13 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v15 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v11 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v9 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill 
-; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v7 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v23 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v17 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:12 ; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v29 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v49 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v50 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -67892,7 +67860,6 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_and_b32_e32 v21, 0xff, v58 ; SI-NEXT: v_or_b32_e32 v21, v21, v26 ; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 @@ -68173,7 +68140,6 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB98_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v18 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 @@ -68198,7 +68164,6 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v3, v59, v3 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: v_add_i32_e32 v55, vcc, s7, v1 -; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v42 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v8 @@ -68222,7 +68187,6 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v54, vcc, s7, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v57 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -68430,8 +68394,6 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; SI-NEXT: .LBB98_4: ; 
%end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload @@ -68448,6 +68410,8 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, v37 ; SI-NEXT: v_mov_b32_e32 v2, v48 @@ -68458,7 +68422,6 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_mov_b32_e32 v12, v32 ; SI-NEXT: v_mov_b32_e32 v14, v51 ; SI-NEXT: v_mov_b32_e32 v16, v34 -; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_mov_b32_e32 v18, v52 ; SI-NEXT: v_mov_b32_e32 v20, v36 ; SI-NEXT: v_mov_b32_e32 v22, v53 @@ -70196,13 +70159,12 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: v_mov_b32_e32 v46, v30 ; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:76 ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16 ; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:24 -; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:20 ; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:48 ; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:44 @@ -70219,6 +70181,7 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:60 ; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:72 ; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: v_readfirstlane_b32 s43, v1 ; SI-NEXT: v_readfirstlane_b32 s42, v0 ; SI-NEXT: v_lshlrev_b32_e32 v42, 8, v3 @@ -70242,19 +70205,19 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v36 ; SI-NEXT: v_lshlrev_b32_e32 v19, 8, v48 -; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v39 -; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_lshlrev_b32_e32 v61, 8, v37 -; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v49 -; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_lshlrev_b32_e32 v25, 8, v30 -; SI-NEXT: s_waitcnt vmcnt(5) +; 
SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_lshlrev_b32_e32 v30, 24, v31 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v38, 8, v33 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v29, 24, v34 ; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill @@ -70280,7 +70243,7 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(10) expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(11) expcnt(0) ; SI-NEXT: v_mov_b32_e32 v60, v44 ; SI-NEXT: v_or_b32_e32 v44, v53, v9 ; SI-NEXT: v_or_b32_e32 v33, v1, v44 @@ -70725,12 +70688,6 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: .LBB99_3: ; %end -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload @@ -70747,6 +70704,12 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v0, s6 ; SI-NEXT: s_waitcnt expcnt(0) @@ -70758,11 +70721,13 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: v_mov_b32_e32 v6, s5 ; SI-NEXT: v_mov_b32_e32 v7, s11 ; SI-NEXT: v_mov_b32_e32 v8, v37 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_mov_b32_e32 v10, v38 ; SI-NEXT: v_mov_b32_e32 v12, v33 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_mov_b32_e32 v14, v34 ; SI-NEXT: v_mov_b32_e32 v16, v48 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mov_b32_e32 v18, v49 ; SI-NEXT: v_mov_b32_e32 v20, v35 ; SI-NEXT: v_mov_b32_e32 v22, 
v36 @@ -70770,7 +70735,6 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: v_mov_b32_e32 v26, v51 ; SI-NEXT: v_mov_b32_e32 v28, v54 ; SI-NEXT: v_mov_b32_e32 v30, v55 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB99_4: ; SI-NEXT: v_mov_b32_e32 v39, v32 @@ -72188,6 +72152,8 @@ define <32 x bfloat> @bitcast_v32f16_to_v32bf16(<32 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v32f16_to_v32bf16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill @@ -72204,8 +72170,6 @@ define <32 x bfloat> @bitcast_v32f16_to_v32bf16(<32 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v33, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v34, v2 @@ -72273,9 +72237,8 @@ define <32 x bfloat> @bitcast_v32f16_to_v32bf16(<32 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v63, v31 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr31 @@ -76994,8 +76957,24 @@ define <64 x i8> @bitcast_v32f16_to_v64i8(<32 x half> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v2, 0xff, v9 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: 
buffer_load_dword v4, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload @@ -77023,22 +77002,6 @@ define <64 x i8> @bitcast_v32f16_to_v64i8(<32 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -77351,7 +77314,23 @@ define <64 x i8> @bitcast_v32f16_to_v64i8(<32 x half> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v29 ; VI-NEXT: v_or_b32_sdwa v2, v31, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded 
Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -77375,22 +77354,6 @@ define <64 x i8> @bitcast_v32f16_to_v64i8(<32 x half> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -77663,9 +77626,25 @@ define <64 x i8> @bitcast_v32f16_to_v64i8(<32 x half> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 
offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v18 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:44 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v36 -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v18 ; GFX9-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v2, v35, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -77692,22 +77671,6 @@ define <64 x i8> @bitcast_v32f16_to_v64i8(<32 x half> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60 -; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -79163,13 +79126,12 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32 ; VI-NEXT: ; implicit-def: $sgpr75 ; VI-NEXT: s_branch .LBB105_2 ; VI-NEXT: .LBB105_4: -; VI-NEXT: v_mov_b32_e32 v1, s58 ; VI-NEXT: v_mov_b32_e32 v53, s56 ; VI-NEXT: v_mov_b32_e32 v52, s42 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v52, s44 +; VI-NEXT: v_mov_b32_e32 v1, s58 ; VI-NEXT: v_mov_b32_e32 v19, s67 ; VI-NEXT: v_mov_b32_e32 v12, s66 ; VI-NEXT: v_mov_b32_e32 v20, s65 @@ -79215,6 +79177,7 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32 ; VI-NEXT: 
v_mov_b32_e32 v45, s78 ; VI-NEXT: v_mov_b32_e32 v42, s76 ; VI-NEXT: v_mov_b32_e32 v55, s74 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v54, s57 ; VI-NEXT: v_mov_b32_e32 v41, s59 ; VI-NEXT: v_mov_b32_e32 v44, s60 @@ -79320,6 +79283,21 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32 ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v54 ; VI-NEXT: v_or_b32_sdwa v1, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; VI-NEXT: v_readlane_b32 s67, v63, 19 ; VI-NEXT: v_readlane_b32 s66, v63, 18 ; VI-NEXT: v_readlane_b32 s65, v63, 17 @@ -79340,7 +79318,7 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32 ; VI-NEXT: v_readlane_b32 s34, v63, 2 ; VI-NEXT: v_readlane_b32 s31, v63, 1 ; VI-NEXT: v_readlane_b32 s30, v63, 0 -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v2, v25, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -79372,21 +79350,6 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32 ; VI-NEXT: v_or_b32_sdwa v1, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload 
-; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; VI-NEXT: s_or_saveexec_b64 s[4:5], -1 ; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; VI-NEXT: s_mov_b64 exec, s[4:5] @@ -79756,6 +79719,21 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32 ; GFX9-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; GFX9-NEXT: v_readlane_b32 s55, v63, 15 ; GFX9-NEXT: v_readlane_b32 s54, v63, 14 ; GFX9-NEXT: v_readlane_b32 s53, v63, 13 @@ -79772,7 +79750,7 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32 ; GFX9-NEXT: v_readlane_b32 s34, v63, 2 ; GFX9-NEXT: v_readlane_b32 s31, v63, 1 ; GFX9-NEXT: v_readlane_b32 s30, v63, 0 -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: s_waitcnt vmcnt(16) ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v5 ; GFX9-NEXT: v_or_b32_sdwa v5, v33, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -79798,21 +79776,6 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32 ; GFX9-NEXT: v_or_b32_sdwa v2, v14, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60 -; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 ; 4-byte Folded 
Reload -; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[4:5] @@ -80286,6 +80249,14 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v7 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v9 @@ -80360,19 +80331,10 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:116 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:108 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:100 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:84 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; SI-NEXT: v_lshlrev_b32_e32 v40, 8, v31 ; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v32 ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v33 ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v34 -; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_lshlrev_b32_e32 v35, 8, v35 ; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v36 ; SI-NEXT: v_lshlrev_b32_e32 v46, 8, v37 @@ -80390,7 +80352,7 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) { ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 
offset:248 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_and_b32_e32 v19, 0xff, v55 ; SI-NEXT: v_or_b32_e32 v16, v19, v16 ; SI-NEXT: v_cvt_f32_f16_e32 v34, v16 @@ -80403,7 +80365,6 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v12, 0xff, v18 ; SI-NEXT: v_or_b32_e32 v10, v12, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v21, v10 -; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_and_b32_e32 v10, 0xff, v41 ; SI-NEXT: v_or_b32_e32 v8, v10, v8 ; SI-NEXT: v_cvt_f32_f16_e32 v38, v8 @@ -80428,6 +80389,7 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v0, 0xff, v56 ; SI-NEXT: v_or_b32_e32 v0, v0, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v29, v0 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_and_b32_e32 v0, 0xff, v6 ; SI-NEXT: v_or_b32_e32 v0, v0, v46 ; SI-NEXT: v_cvt_f32_f16_e32 v54, v0 @@ -80634,13 +80596,12 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) { ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB106_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v56 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 ; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 ; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 ; SI-NEXT: v_or_b32_e32 v7, v3, v7 -; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v47 ; SI-NEXT: v_or_b32_e32 v6, v46, v6 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 @@ -80648,12 +80609,10 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v9, v35, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v6 ; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v7 -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v42 ; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 ; SI-NEXT: v_or_b32_e32 v7, v39, v7 ; SI-NEXT: v_add_i32_e32 v23, vcc, s6, v7 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v41 ; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 @@ -80852,13 +80811,6 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v31, v1 ; SI-NEXT: .LBB106_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload @@ -80875,14 +80827,21 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; 
SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload ; SI-NEXT: v_mov_b32_e32 v8, v33 ; SI-NEXT: v_mov_b32_e32 v10, v37 ; SI-NEXT: v_mov_b32_e32 v12, v49 ; SI-NEXT: v_mov_b32_e32 v14, v53 ; SI-NEXT: v_mov_b32_e32 v16, v32 +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_mov_b32_e32 v18, v34 ; SI-NEXT: v_mov_b32_e32 v20, v36 -; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_mov_b32_e32 v22, v38 ; SI-NEXT: v_mov_b32_e32 v24, v48 ; SI-NEXT: v_mov_b32_e32 v26, v50 @@ -84461,22 +84420,6 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; SI-LABEL: bitcast_v32bf16_to_v64i8: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 @@ -84542,6 +84485,22 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, 
off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; SI-NEXT: v_mul_f32_e32 v36, 1.0, v2 ; SI-NEXT: v_mul_f32_e32 v31, 1.0, v4 ; SI-NEXT: v_mul_f32_e32 v35, 1.0, v3 @@ -84605,11 +84564,9 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v37 -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v30, 1.0, v48 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v29, 1.0, v50 ; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr50 @@ -85220,8 +85177,24 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v4 ; SI-NEXT: v_or_b32_e32 v2, v2, v4 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload @@ -85249,22 +85222,6 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 
offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -85820,6 +85777,22 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 44, v0 +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v36 ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v18 @@ -85853,22 +85826,6 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_store_dword v1, v0, 
s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -86400,21 +86357,10 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v17 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:52 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v41 -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v17 -; GFX9-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v2, v40, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:56 -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v55 -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v54 -; GFX9-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60 ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -86431,6 +86377,17 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-NEXT: 
v_or_b32_sdwa v1, v15, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:56 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v55 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v54 +; GFX9-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -88045,10 +88002,26 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v54 ; SI-NEXT: v_or_b32_e32 v4, v4, v5 ; SI-NEXT: v_and_b32_e32 v5, 0xff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v42 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: s_lshl_b32 s4, s76, 8 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v42 ; SI-NEXT: v_or_b32_e32 v3, s4, v3 ; SI-NEXT: s_and_b32 s4, s74, 0xff ; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 @@ -88076,22 +88049,6 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 60, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: 
buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB109_4: @@ -88805,6 +88762,21 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v34 ; VI-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_readlane_b32 s67, v63, 19 @@ -88827,7 +88799,7 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; VI-NEXT: v_readlane_b32 s34, v63, 2 ; VI-NEXT: v_readlane_b32 s31, v63, 1 ; VI-NEXT: v_readlane_b32 s30, v63, 0 -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v2, v33, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -88857,21 +88829,6 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v0, vcc, 60, v0 ; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; VI-NEXT: 
buffer_load_dword v62, off, s[0:3], s32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; VI-NEXT: s_or_saveexec_b64 s[4:5], -1 ; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; VI-NEXT: s_mov_b64 exec, s[4:5] @@ -90429,6 +90386,8 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:92 ; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:116 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:124 ; SI-NEXT: v_lshlrev_b32_e32 v63, 8, v13 ; SI-NEXT: v_lshlrev_b32_e32 v10, 8, v21 ; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v27 @@ -90458,28 +90417,30 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v12 ; SI-NEXT: v_lshlrev_b32_e32 v18, 24, v17 -; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_lshlrev_b32_e32 v17, 8, v20 -; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_lshlrev_b32_e32 v8, 24, v24 -; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v28 -; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:40 +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_lshlrev_b32_e32 v57, 8, v31 -; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_lshlrev_b32_e32 v46, 24, v32 -; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_lshlrev_b32_e32 v58, 24, v33 -; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_lshlrev_b32_e32 v35, 8, v34 -; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_lshlrev_b32_e32 v61, 24, v36 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: 
buffer_store_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:20 ; SI-NEXT: s_waitcnt expcnt(0) @@ -90496,8 +90457,6 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v3 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:116 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:124 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v5 @@ -90513,16 +90472,8 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v19 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:84 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:40 ; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v23 ; SI-NEXT: ; kill: killed $vgpr3 @@ -90803,7 +90754,6 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB110_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v45 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v16 @@ -90829,7 +90779,6 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v5, v58, v5 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 ; SI-NEXT: v_add_i32_e32 v9, vcc, s7, v3 -; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v60 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v26 @@ -90841,7 +90790,6 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v5, v46, v5 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 ; SI-NEXT: v_add_i32_e32 v25, vcc, s7, v3 -; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v49 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v59 @@ -90854,7 +90802,6 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v5, v12, v5 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 ; SI-NEXT: v_add_i32_e32 v12, vcc, s7, v3 -; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v47 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v20 @@ -90868,7 +90815,6 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v8, vcc, s7, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v44 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v28 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 ; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v24 @@ 
-91086,11 +91032,8 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v6 ; SI-NEXT: .LBB110_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload ; SI-NEXT: v_mov_b32_e32 v2, v43 ; SI-NEXT: v_mov_b32_e32 v10, v41 -; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_mov_b32_e32 v28, v40 ; SI-NEXT: v_mov_b32_e32 v30, v42 ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload @@ -91109,6 +91052,8 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload ; SI-NEXT: v_mov_b32_e32 v4, v33 ; SI-NEXT: v_mov_b32_e32 v6, v39 ; SI-NEXT: v_mov_b32_e32 v8, v51 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll index 5d4df4bde1af8..46911e7934429 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll @@ -4938,6 +4938,13 @@ define <36 x half> @bitcast_v18i32_to_v36f16(<18 x i32> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 @@ -5037,13 +5044,6 @@ define <36 x half> @bitcast_v18i32_to_v36f16(<18 x i32> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -6164,6 +6164,14 @@ define <18 x i32> @bitcast_v36f16_to_v18i32(<36 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v36f16_to_v18i32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_load_dword v31, off, s[0:3], 
s32 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:20 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill @@ -6180,36 +6188,28 @@ define <18 x i32> @bitcast_v36f16_to_v18i32(<36 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:20 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v35, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v33, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v2 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v63, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v62, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v7 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v60, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v59, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v8 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v57, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v56, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v13 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v47, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v46, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v45, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v44, v14 @@ -6224,14 +6224,12 @@ define <18 x i32> @bitcast_v36f16_to_v18i32(<36 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v51, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v50, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v49, v27 -; SI-NEXT: s_waitcnt vmcnt(9) expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(9) expcnt(0) +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v36 -; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_cvt_f16_f32_e32 v36, v39 -; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v48 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -12071,6 +12069,13 @@ define <36 x half> @bitcast_v18f32_to_v36f16(<18 x float> %a, i32 %b) { ; 
SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 @@ -12170,13 +12175,6 @@ define <36 x half> @bitcast_v18f32_to_v36f16(<18 x float> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -13435,6 +13433,14 @@ define <18 x float> @bitcast_v36f16_to_v18f32(<36 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v36f16_to_v18f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:20 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill @@ -13451,36 +13457,28 @@ define <18 x float> @bitcast_v36f16_to_v18f32(<36 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:20 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v35, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v33, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v2 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 
4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v63, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v62, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v7 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v60, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v59, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v8 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v57, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v56, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v13 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v47, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v46, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v45, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v44, v14 @@ -13495,14 +13493,12 @@ define <18 x float> @bitcast_v36f16_to_v18f32(<36 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v51, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v50, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v49, v27 -; SI-NEXT: s_waitcnt vmcnt(9) expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(9) expcnt(0) +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v36 -; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_cvt_f16_f32_e32 v36, v39 -; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v48 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -18420,6 +18416,13 @@ define <36 x half> @bitcast_v9i64_to_v36f16(<9 x i64> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 @@ -18519,13 +18522,6 @@ define <36 x half> @bitcast_v9i64_to_v36f16(<9 x i64> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 
offset:24 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -19656,6 +19652,14 @@ define <9 x i64> @bitcast_v36f16_to_v9i64(<36 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v36f16_to_v9i64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:20 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill @@ -19672,36 +19676,28 @@ define <9 x i64> @bitcast_v36f16_to_v9i64(<36 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:20 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v35, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v33, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v2 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v63, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v62, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v7 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v60, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v59, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v8 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v57, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v56, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v13 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v47, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v46, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v45, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v44, v14 @@ -19716,14 +19712,12 @@ define <9 x i64> @bitcast_v36f16_to_v9i64(<36 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v51, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v50, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v49, v27 -; SI-NEXT: s_waitcnt vmcnt(9) expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(9) expcnt(0) +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v36 -; 
SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_cvt_f16_f32_e32 v36, v39 -; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v48 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -23978,6 +23972,13 @@ define <36 x half> @bitcast_v9f64_to_v36f16(<9 x double> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 @@ -24077,13 +24078,6 @@ define <36 x half> @bitcast_v9f64_to_v36f16(<9 x double> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -25282,6 +25276,14 @@ define <9 x double> @bitcast_v36f16_to_v9f64(<36 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v36f16_to_v9f64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:20 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill @@ -25298,36 +25300,28 @@ define <9 x double> @bitcast_v36f16_to_v9f64(<36 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword 
v39, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:20 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v35, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v33, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v2 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v63, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v62, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v7 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v60, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v59, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v8 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v57, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v56, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v13 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v47, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v46, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v45, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v44, v14 @@ -25342,14 +25336,12 @@ define <9 x double> @bitcast_v36f16_to_v9f64(<36 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v51, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v50, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v49, v27 -; SI-NEXT: s_waitcnt vmcnt(9) expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(9) expcnt(0) +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v36 -; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_cvt_f16_f32_e32 v36, v39 -; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v48 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -26798,22 +26790,6 @@ define <36 x half> @bitcast_v36i16_to_v36f16(<36 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v36i16_to_v36f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, 
s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:20 ; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:16 ; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:12 @@ -26838,6 +26814,22 @@ define <36 x half> @bitcast_v36i16_to_v36f16(<36 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; kill: killed $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr62 ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr63 @@ -26865,7 +26857,7 @@ define <36 x half> @bitcast_v36i16_to_v36f16(<36 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; kill: killed $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -26892,7 +26884,7 @@ define <36 x half> @bitcast_v36i16_to_v36f16(<36 x i16> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v47, v9 ; SI-NEXT: v_cvt_f32_f16_e32 v60, v10 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(3) expcnt(0) +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v39 ; SI-NEXT: v_cvt_f32_f16_e32 v45, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v58, v12 @@ -26977,7 +26969,6 @@ define <36 x half> @bitcast_v36i16_to_v36f16(<36 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v27 -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v39, vcc, 3, v39 ; SI-NEXT: v_add_i32_e32 v34, vcc, 3, v34 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill @@ -27147,8 +27138,24 @@ define <36 x half> @bitcast_v36i16_to_v36f16(<36 x i16> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v52 ; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 +; SI-NEXT: 
buffer_load_dword v63, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen @@ -27203,22 +27210,6 @@ define <36 x half> @bitcast_v36i16_to_v36f16(<36 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -27760,6 +27751,17 @@ define inreg <36 x half> @bitcast_v36i16_to_v36f16_scalar(<36 x i16> inreg %a, i ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: 
buffer_load_dword v57, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f16_f32_e32 v2, v51 ; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -27842,17 +27844,6 @@ define inreg <36 x half> @bitcast_v36i16_to_v36f16_scalar(<36 x i16> inreg %a, i ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB57_4: @@ -28709,6 +28700,12 @@ define <36 x i16> @bitcast_v36f16_to_v36i16(<36 x half> %a, i32 %b) { ; SI-NEXT: v_alignbit_b32 v9, v1, v9, 16 ; SI-NEXT: .LBB58_2: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v20, 0xffff, v50 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v55 ; SI-NEXT: v_or_b32_e32 v20, v20, v21 @@ -28810,12 +28807,6 @@ define <36 x i16> @bitcast_v36f16_to_v36i16(<36 x half> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x44, v0 ; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: 
buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -29421,9 +29412,15 @@ define inreg <36 x i16> @bitcast_v36f16_to_v36i16_scalar(<36 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v40 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v28 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v40 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen @@ -29493,12 +29490,6 @@ define inreg <36 x i16> @bitcast_v36f16_to_v36i16_scalar(<36 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x44, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB59_4: diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll index 44cfd6c28ca6a..6749daba296c5 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll @@ -3541,6 +3541,17 @@ define <20 x i32> @bitcast_v40i16_to_v20i32(<40 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v40i16_to_v20i32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 +; SI-NEXT: v_mov_b32_e32 v49, v12 +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v51, v8 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill @@ -3562,17 +3573,6 @@ define <20 x i32> @bitcast_v40i16_to_v20i32(<40 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: 
buffer_store_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v52, v6 -; SI-NEXT: v_mov_b32_e32 v53, v4 -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 -; SI-NEXT: v_mov_b32_e32 v49, v12 -; SI-NEXT: v_mov_b32_e32 v50, v10 -; SI-NEXT: v_mov_b32_e32 v51, v8 ; SI-NEXT: v_mov_b32_e32 v37, v20 ; SI-NEXT: v_mov_b32_e32 v38, v18 ; SI-NEXT: v_mov_b32_e32 v39, v16 @@ -3594,13 +3594,10 @@ define <20 x i32> @bitcast_v40i16_to_v20i32(<40 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v29 ; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v0 -; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v4 -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 @@ -5557,10 +5554,23 @@ define <40 x half> @bitcast_v20i32_to_v40f16(<20 x i32> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v40 +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v40 ; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 @@ -5656,19 +5666,6 @@ define <40 x half> @bitcast_v20i32_to_v40f16(<20 x i32> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], 
s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -11740,6 +11737,17 @@ define <20 x float> @bitcast_v40i16_to_v20f32(<40 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v40i16_to_v20f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 +; SI-NEXT: v_mov_b32_e32 v49, v12 +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v51, v8 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill @@ -11761,17 +11769,6 @@ define <20 x float> @bitcast_v40i16_to_v20f32(<40 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v52, v6 -; SI-NEXT: v_mov_b32_e32 v53, v4 -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 -; SI-NEXT: v_mov_b32_e32 v49, v12 -; SI-NEXT: v_mov_b32_e32 v50, v10 -; SI-NEXT: v_mov_b32_e32 v51, v8 ; SI-NEXT: v_mov_b32_e32 v37, v20 ; SI-NEXT: v_mov_b32_e32 v38, v18 ; SI-NEXT: v_mov_b32_e32 v39, v16 @@ -11793,13 +11790,10 @@ define <20 x float> @bitcast_v40i16_to_v20f32(<40 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v29 ; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v0 -; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v4 -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 @@ -13756,10 +13750,23 @@ define <40 x half> @bitcast_v20f32_to_v40f16(<20 x float> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v3, vcc, 
20, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v40 +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v40 ; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 @@ -13855,19 +13862,6 @@ define <40 x half> @bitcast_v20f32_to_v40f16(<20 x float> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -14435,6 +14429,10 @@ define inreg <40 x half> @bitcast_v20f32_to_v40f16_scalar(<20 x float> inreg %a, ; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 ; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 ; SI-NEXT: .LBB33_3: ; %end +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 ; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 ; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 @@ -14573,9 +14571,6 @@ define inreg <40 x half> 
@bitcast_v20f32_to_v40f16_scalar(<20 x float> inreg %a, ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB33_4: @@ -19249,6 +19244,17 @@ define <10 x i64> @bitcast_v40i16_to_v10i64(<40 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v40i16_to_v10i64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 +; SI-NEXT: v_mov_b32_e32 v49, v12 +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v51, v8 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill @@ -19270,17 +19276,6 @@ define <10 x i64> @bitcast_v40i16_to_v10i64(<40 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v52, v6 -; SI-NEXT: v_mov_b32_e32 v53, v4 -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 -; SI-NEXT: v_mov_b32_e32 v49, v12 -; SI-NEXT: v_mov_b32_e32 v50, v10 -; SI-NEXT: v_mov_b32_e32 v51, v8 ; SI-NEXT: v_mov_b32_e32 v37, v20 ; SI-NEXT: v_mov_b32_e32 v38, v18 ; SI-NEXT: v_mov_b32_e32 v39, v16 @@ -19302,13 +19297,10 @@ define <10 x i64> @bitcast_v40i16_to_v10i64(<40 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v29 ; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v0 -; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v4 -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 @@ -21265,10 +21257,23 @@ define <40 x half> @bitcast_v10i64_to_v40f16(<10 x i64> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v40 +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload 
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v40 ; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 @@ -21364,19 +21369,6 @@ define <40 x half> @bitcast_v10i64_to_v40f16(<10 x i64> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -25988,6 +25980,17 @@ define <10 x double> @bitcast_v40i16_to_v10f64(<40 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v40i16_to_v10f64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 +; SI-NEXT: v_mov_b32_e32 v49, v12 +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v51, v8 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill @@ -26009,17 +26012,6 @@ define <10 x double> @bitcast_v40i16_to_v10f64(<40 x i16> %a, i32 %b) { ; SI-NEXT: 
buffer_store_dword v26, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v52, v6 -; SI-NEXT: v_mov_b32_e32 v53, v4 -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 -; SI-NEXT: v_mov_b32_e32 v49, v12 -; SI-NEXT: v_mov_b32_e32 v50, v10 -; SI-NEXT: v_mov_b32_e32 v51, v8 ; SI-NEXT: v_mov_b32_e32 v37, v20 ; SI-NEXT: v_mov_b32_e32 v38, v18 ; SI-NEXT: v_mov_b32_e32 v39, v16 @@ -26041,13 +26033,10 @@ define <10 x double> @bitcast_v40i16_to_v10f64(<40 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v29 ; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v0 -; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v4 -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 @@ -27984,10 +27973,23 @@ define <40 x half> @bitcast_v10f64_to_v40f16(<10 x double> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v40 +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v40 ; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 @@ -28083,19 +28085,6 @@ define <40 x half> @bitcast_v10f64_to_v40f16(<10 x double> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword 
v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -28635,6 +28624,11 @@ define inreg <40 x half> @bitcast_v10f64_to_v40f16_scalar(<10 x double> inreg %a ; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 ; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 ; SI-NEXT: .LBB53_3: ; %end +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 ; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 ; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 @@ -28773,10 +28767,6 @@ define inreg <40 x half> @bitcast_v10f64_to_v40f16_scalar(<10 x double> inreg %a ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB53_4: @@ -31389,6 +31379,17 @@ define <40 x half> @bitcast_v40i16_to_v40f16(<40 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v40i16_to_v40f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill @@ -31405,17 +31406,6 @@ define <40 x half> @bitcast_v40i16_to_v40f16(<40 x i16> %a, i32 %b) { ; SI-NEXT: 
buffer_store_dword v61, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:8 ; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; kill: killed $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr40 @@ -31472,7 +31462,7 @@ define <40 x half> @bitcast_v40i16_to_v40f16(<40 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; kill: killed $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -31523,7 +31513,6 @@ define <40 x half> @bitcast_v40i16_to_v40f16(<40 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v30 -; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_cvt_f32_f16_e32 v40, v48 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr3 @@ -31623,7 +31612,6 @@ define <40 x half> @bitcast_v40i16_to_v40f16(<40 x i16> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v27 ; SI-NEXT: v_add_i32_e32 v39, vcc, 3, v39 -; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_add_i32_e32 v49, vcc, 3, v49 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -31643,7 +31631,6 @@ define <40 x half> @bitcast_v40i16_to_v40f16(<40 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v38 -; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_add_i32_e32 v48, vcc, 3, v48 ; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill @@ -31850,7 +31837,23 @@ define <40 x half> @bitcast_v40i16_to_v40f16(<40 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f16_f32_e32 v2, v40 ; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 
offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 @@ -31888,22 +31891,6 @@ define <40 x half> @bitcast_v40i16_to_v40f16(<40 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -32599,11 +32586,6 @@ define inreg <40 x half> @bitcast_v40i16_to_v40f16_scalar(<40 x i16> inreg %a, i ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v48 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x4c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -32620,6 +32602,11 @@ define inreg <40 x half> @bitcast_v40i16_to_v40f16_scalar(<40 x i16> inreg %a, i ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded 
Reload +; SI-NEXT: v_cvt_f16_f32_e32 v2, v48 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x4c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB57_4: @@ -33574,8 +33561,20 @@ define <40 x i16> @bitcast_v40f16_to_v40i16(<40 x half> %a, i32 %b) { ; SI-NEXT: v_alignbit_b32 v19, v5, v29, 16 ; SI-NEXT: .LBB58_2: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_and_b32_e32 v23, 0xffff, v50 ; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v40 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v50 ; SI-NEXT: v_or_b32_e32 v23, v23, v24 ; SI-NEXT: buffer_store_dword v23, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) @@ -33690,18 +33689,6 @@ define <40 x i16> @bitcast_v40f16_to_v40i16(<40 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x4c, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -34417,6 +34404,18 @@ define inreg <40 x i16> @bitcast_v40f16_to_v40i16_scalar(<40 x half> inreg %a, i ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v40 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 
offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v26 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 @@ -34451,18 +34450,6 @@ define inreg <40 x i16> @bitcast_v40f16_to_v40i16_scalar(<40 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x4c, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB59_4: diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll index 87d5157b3c340..6b13e96d73999 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll @@ -3792,6 +3792,17 @@ define <22 x i32> @bitcast_v44i16_to_v22i32(<44 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v44i16_to_v22i32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v51, v8 +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_mov_b32_e32 v39, v16 +; SI-NEXT: v_mov_b32_e32 v48, v14 +; SI-NEXT: v_mov_b32_e32 v49, v12 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill @@ -3814,17 +3825,6 @@ define <22 x i32> @bitcast_v44i16_to_v22i32(<44 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 
offset:136 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 -; SI-NEXT: v_mov_b32_e32 v53, v4 -; SI-NEXT: v_mov_b32_e32 v50, v10 -; SI-NEXT: v_mov_b32_e32 v51, v8 -; SI-NEXT: v_mov_b32_e32 v52, v6 -; SI-NEXT: v_mov_b32_e32 v39, v16 -; SI-NEXT: v_mov_b32_e32 v48, v14 -; SI-NEXT: v_mov_b32_e32 v49, v12 ; SI-NEXT: v_mov_b32_e32 v38, v18 ; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v3 @@ -3842,9 +3842,8 @@ define <22 x i32> @bitcast_v44i16_to_v22i32(<44 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v25 ; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v29 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v0 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 @@ -6118,8 +6117,24 @@ define <44 x half> @bitcast_v22i32_to_v44f16(<22 x i32> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v26 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen @@ -6141,22 +6156,6 @@ define <44 x half> @bitcast_v22i32_to_v44f16(<22 x i32> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, 
off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -12755,6 +12754,17 @@ define <22 x float> @bitcast_v44i16_to_v22f32(<44 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v44i16_to_v22f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v51, v8 +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_mov_b32_e32 v39, v16 +; SI-NEXT: v_mov_b32_e32 v48, v14 +; SI-NEXT: v_mov_b32_e32 v49, v12 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill @@ -12777,17 +12787,6 @@ define <22 x float> @bitcast_v44i16_to_v22f32(<44 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 -; SI-NEXT: v_mov_b32_e32 v53, v4 -; SI-NEXT: v_mov_b32_e32 v50, v10 -; SI-NEXT: v_mov_b32_e32 v51, v8 -; SI-NEXT: v_mov_b32_e32 v52, v6 -; SI-NEXT: v_mov_b32_e32 v39, v16 -; SI-NEXT: v_mov_b32_e32 v48, v14 -; SI-NEXT: v_mov_b32_e32 v49, v12 ; SI-NEXT: v_mov_b32_e32 v38, v18 ; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v3 @@ -12805,9 +12804,8 @@ define <22 x float> @bitcast_v44i16_to_v22f32(<44 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v25 ; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v29 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v0 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 @@ -15081,8 +15079,24 @@ define <44 x half> @bitcast_v22f32_to_v44f16(<22 x float> %a, i32 %b) 
{ ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v26 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen @@ -15104,22 +15118,6 @@ define <44 x half> @bitcast_v22f32_to_v44f16(<22 x float> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -15744,6 +15742,15 @@ define inreg <44 x half> @bitcast_v22f32_to_v44f16_scalar(<22 x float> inreg %a, ; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 ; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 ; SI-NEXT: .LBB33_3: ; %end +; SI-NEXT: 
s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 ; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 ; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 @@ -15896,14 +15903,6 @@ define inreg <44 x half> @bitcast_v22f32_to_v44f16_scalar(<22 x float> inreg %a, ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB33_4: @@ -21004,6 +21003,17 @@ define <11 x i64> @bitcast_v44i16_to_v11i64(<44 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v44i16_to_v11i64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v51, v8 +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_mov_b32_e32 v39, v16 +; SI-NEXT: v_mov_b32_e32 v48, v14 +; SI-NEXT: v_mov_b32_e32 v49, v12 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill @@ -21026,17 +21036,6 @@ define <11 x i64> @bitcast_v44i16_to_v11i64(<44 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 -; SI-NEXT: v_mov_b32_e32 v53, v4 -; SI-NEXT: v_mov_b32_e32 v50, v10 -; SI-NEXT: v_mov_b32_e32 v51, v8 -; SI-NEXT: v_mov_b32_e32 v52, v6 -; SI-NEXT: v_mov_b32_e32 v39, v16 -; SI-NEXT: v_mov_b32_e32 v48, v14 -; SI-NEXT: v_mov_b32_e32 v49, v12 ; SI-NEXT: v_mov_b32_e32 v38, v18 ; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v1 ; SI-NEXT: 
v_lshlrev_b32_e32 v45, 16, v3 @@ -21054,9 +21053,8 @@ define <11 x i64> @bitcast_v44i16_to_v11i64(<44 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v25 ; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v29 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v0 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 @@ -23330,8 +23328,24 @@ define <44 x half> @bitcast_v11i64_to_v44f16(<11 x i64> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v26 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen @@ -23353,22 +23367,6 @@ define <44 x half> @bitcast_v11i64_to_v44f16(<11 x i64> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte 
Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -28420,6 +28418,17 @@ define <11 x double> @bitcast_v44i16_to_v11f64(<44 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v44i16_to_v11f64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v51, v8 +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_mov_b32_e32 v39, v16 +; SI-NEXT: v_mov_b32_e32 v48, v14 +; SI-NEXT: v_mov_b32_e32 v49, v12 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill @@ -28442,17 +28451,6 @@ define <11 x double> @bitcast_v44i16_to_v11f64(<44 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 -; SI-NEXT: v_mov_b32_e32 v53, v4 -; SI-NEXT: v_mov_b32_e32 v50, v10 -; SI-NEXT: v_mov_b32_e32 v51, v8 -; SI-NEXT: v_mov_b32_e32 v52, v6 -; SI-NEXT: v_mov_b32_e32 v39, v16 -; SI-NEXT: v_mov_b32_e32 v48, v14 -; SI-NEXT: v_mov_b32_e32 v49, v12 ; SI-NEXT: v_mov_b32_e32 v38, v18 ; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v3 @@ -28470,9 +28468,8 @@ define <11 x double> @bitcast_v44i16_to_v11f64(<44 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v25 ; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v29 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v0 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 @@ -30724,8 +30721,24 @@ define <44 x half> @bitcast_v11f64_to_v44f16(<11 x double> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v26 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: 
buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen @@ -30747,22 +30760,6 @@ define <44 x half> @bitcast_v11f64_to_v44f16(<11 x double> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -31355,6 +31352,16 @@ define inreg <44 x half> @bitcast_v11f64_to_v44f16_scalar(<11 x double> inreg %a ; SI-NEXT: v_cvt_f32_f16_e32 v50, v56 ; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 ; SI-NEXT: .LBB53_3: ; %end +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: 
buffer_load_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 ; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 ; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 @@ -31507,15 +31514,6 @@ define inreg <44 x half> @bitcast_v11f64_to_v44f16_scalar(<11 x double> inreg %a ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB53_4: @@ -34944,7 +34942,23 @@ define <44 x half> @bitcast_v44i16_to_v44f16(<44 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f16_f32_e32 v2, v56 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 @@ -34982,22 +34996,6 @@ define <44 x half> @bitcast_v44i16_to_v44f16(<44 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; 
SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -35799,11 +35797,6 @@ define inreg <44 x half> @bitcast_v44i16_to_v44f16_scalar(<44 x i16> inreg %a, i ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v52 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x54, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload @@ -35820,6 +35813,11 @@ define inreg <44 x half> @bitcast_v44i16_to_v44f16_scalar(<44 x i16> inreg %a, i ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v2, v52 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x54, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB57_4: @@ -36879,9 +36877,19 @@ define <44 x i16> @bitcast_v44f16_to_v44i16(<44 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v26, v26, v27 ; SI-NEXT: v_add_i32_e32 v27, vcc, 4, v0 ; SI-NEXT: buffer_store_dword v26, v27, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v40 +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:92 
; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v26, 0xffff, v50 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v40 ; SI-NEXT: v_or_b32_e32 v26, v26, v27 ; SI-NEXT: v_add_i32_e32 v27, vcc, 8, v0 ; SI-NEXT: buffer_store_dword v26, v27, s[0:3], 0 offen @@ -36994,16 +37002,6 @@ define <44 x i16> @bitcast_v44f16_to_v44i16(<44 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x54, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -37793,6 +37791,22 @@ define inreg <44 x i16> @bitcast_v44f16_to_v44i16_scalar(<44 x half> inreg %a, i ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v43 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v27 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 @@ -37827,22 +37841,6 @@ define inreg <44 x i16> @bitcast_v44f16_to_v44i16_scalar(<44 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x54, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: 
buffer_load_dword v62, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB59_4: diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll index fb2e94fc3b87a..034b8027851f4 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll @@ -4045,6 +4045,22 @@ define <24 x i32> @bitcast_v48i16_to_v24i32(<48 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v48i16_to_v24i32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v48, v14 +; SI-NEXT: v_mov_b32_e32 v49, v12 +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v51, v8 +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:60 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill @@ -4069,22 +4085,6 @@ define <24 x i32> @bitcast_v48i16_to_v24i32(<48 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v48, v14 -; SI-NEXT: v_mov_b32_e32 v49, v12 -; SI-NEXT: v_mov_b32_e32 v50, v10 -; SI-NEXT: v_mov_b32_e32 v51, v8 -; SI-NEXT: v_mov_b32_e32 v52, v6 -; SI-NEXT: v_mov_b32_e32 v53, v4 -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v2, off, 
s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:60 ; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v3 ; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v5 @@ -4100,21 +4100,14 @@ define <24 x i32> @bitcast_v48i16_to_v24i32(<48 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v25 ; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v29 -; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v0 -; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v2 -; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v4 -; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v6 -; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v8 -; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v12 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 @@ -6622,8 +6615,24 @@ define <48 x half> @bitcast_v24i32_to_v48f16(<24 x i32> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 ; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen @@ -6699,22 +6708,6 @@ define <48 x half> @bitcast_v24i32_to_v48f16(<24 x i32> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: 
buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -8179,6 +8172,8 @@ define <24 x i32> @bitcast_v48f16_to_v24i32(<48 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v48f16_to_v24i32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v54, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill @@ -8195,8 +8190,6 @@ define <24 x i32> @bitcast_v48f16_to_v24i32(<48 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v54, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 @@ -8223,34 +8216,34 @@ define <24 x i32> @bitcast_v48f16_to_v24i32(<48 x half> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:60 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68 ; SI-NEXT: v_cvt_f16_f32_e32 v55, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v3 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v52, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v51, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v49, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v48, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v9 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v9 ; SI-NEXT: 
v_cvt_f16_f32_e32 v38, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v37, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v10 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v35, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v34, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v15 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v15 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 @@ -13043,6 +13036,9 @@ define inreg <48 x i16> @bitcast_v24f32_to_v48i16_scalar(<24 x float> inreg %a, ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v19, 0xffff, v20 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v40 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v19, v19, v20 ; SI-NEXT: v_add_i32_e32 v20, vcc, 20, v0 ; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen @@ -13154,9 +13150,6 @@ define inreg <48 x i16> @bitcast_v24f32_to_v48i16_scalar(<24 x float> inreg %a, ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x5c, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB29_4: @@ -13882,6 +13875,22 @@ define <24 x float> @bitcast_v48i16_to_v24f32(<48 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v48i16_to_v24f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v48, v14 +; SI-NEXT: v_mov_b32_e32 v49, v12 +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v51, v8 +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:60 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill @@ -13906,22 +13915,6 @@ define <24 x float> @bitcast_v48i16_to_v24f32(<48 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v48, v14 -; SI-NEXT: v_mov_b32_e32 v49, v12 
-; SI-NEXT: v_mov_b32_e32 v50, v10 -; SI-NEXT: v_mov_b32_e32 v51, v8 -; SI-NEXT: v_mov_b32_e32 v52, v6 -; SI-NEXT: v_mov_b32_e32 v53, v4 -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:60 ; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v3 ; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v5 @@ -13937,21 +13930,14 @@ define <24 x float> @bitcast_v48i16_to_v24f32(<48 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v25 ; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v29 -; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v0 -; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v2 -; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v4 -; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v6 -; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v8 -; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v12 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 @@ -16459,8 +16445,24 @@ define <48 x half> @bitcast_v24f32_to_v48f16(<24 x float> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 ; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 
offen @@ -16536,22 +16538,6 @@ define <48 x half> @bitcast_v24f32_to_v48f16(<24 x float> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -17233,13 +17219,27 @@ define inreg <48 x half> @bitcast_v24f32_to_v48f16_scalar(<24 x float> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v55, v55 ; SI-NEXT: v_cvt_f16_f32_e32 v54, v54 ; SI-NEXT: v_cvt_f16_f32_e32 v53, v53 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 +; SI-NEXT: v_or_b32_e32 v55, v55, v40 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 ; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 ; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 ; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 ; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 -; SI-NEXT: v_or_b32_e32 v55, v55, v40 ; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v54 ; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 ; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 @@ -17395,19 +17395,6 @@ define inreg <48 x half> 
@bitcast_v24f32_to_v48f16_scalar(<24 x float> inreg %a, ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB33_4: @@ -18157,6 +18144,8 @@ define <24 x float> @bitcast_v48f16_to_v24f32(<48 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v48f16_to_v24f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v54, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill @@ -18173,8 +18162,6 @@ define <24 x float> @bitcast_v48f16_to_v24f32(<48 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v54, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 @@ -18201,34 +18188,34 @@ define <24 x float> @bitcast_v48f16_to_v24f32(<48 x half> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:60 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68 ; SI-NEXT: v_cvt_f16_f32_e32 v55, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v3 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v52, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v51, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v49, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v48, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v9 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 +; 
SI-NEXT: v_cvt_f16_f32_e32 v39, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v38, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v37, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v10 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v35, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v34, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v15 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v15 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 @@ -22982,6 +22969,22 @@ define <12 x i64> @bitcast_v48i16_to_v12i64(<48 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v48i16_to_v12i64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v48, v14 +; SI-NEXT: v_mov_b32_e32 v49, v12 +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v51, v8 +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:60 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill @@ -23006,22 +23009,6 @@ define <12 x i64> @bitcast_v48i16_to_v12i64(<48 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v48, v14 -; SI-NEXT: v_mov_b32_e32 v49, v12 -; SI-NEXT: v_mov_b32_e32 v50, v10 -; SI-NEXT: v_mov_b32_e32 v51, v8 -; SI-NEXT: v_mov_b32_e32 v52, v6 -; SI-NEXT: v_mov_b32_e32 v53, v4 -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:60 ; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v3 ; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v5 @@ -23037,21 +23024,14 @@ define <12 x i64> @bitcast_v48i16_to_v12i64(<48 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v25 ; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v29 -; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_lshlrev_b32_e32 v60, 16, 
v0 -; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v2 -; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v4 -; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v6 -; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v8 -; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v12 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 @@ -25559,8 +25539,24 @@ define <48 x half> @bitcast_v12i64_to_v48f16(<12 x i64> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 ; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen @@ -25636,22 +25632,6 @@ define <48 x half> @bitcast_v12i64_to_v48f16(<12 x i64> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 
4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -27128,6 +27108,8 @@ define <12 x i64> @bitcast_v48f16_to_v12i64(<48 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v48f16_to_v12i64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v54, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill @@ -27144,8 +27126,6 @@ define <12 x i64> @bitcast_v48f16_to_v12i64(<48 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v54, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 @@ -27172,34 +27152,34 @@ define <12 x i64> @bitcast_v48f16_to_v12i64(<48 x half> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:60 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68 ; SI-NEXT: v_cvt_f16_f32_e32 v55, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v3 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v52, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v51, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v49, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v48, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v9 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v38, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v37, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v10 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v35, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v34, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v15 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v15 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 @@ -30384,6 +30364,9 @@ define inreg <48 x i16> 
@bitcast_v12f64_to_v48i16_scalar(<12 x double> inreg %a, ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v19, 0xffff, v20 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v40 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v19, v19, v20 ; SI-NEXT: v_add_i32_e32 v20, vcc, 20, v0 ; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen @@ -30495,9 +30478,6 @@ define inreg <48 x i16> @bitcast_v12f64_to_v48i16_scalar(<12 x double> inreg %a, ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x5c, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB49_4: @@ -31199,6 +31179,22 @@ define <12 x double> @bitcast_v48i16_to_v12f64(<48 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v48i16_to_v12f64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v48, v14 +; SI-NEXT: v_mov_b32_e32 v49, v12 +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v51, v8 +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:60 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill @@ -31223,22 +31219,6 @@ define <12 x double> @bitcast_v48i16_to_v12f64(<48 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v48, v14 -; SI-NEXT: v_mov_b32_e32 v49, v12 -; SI-NEXT: v_mov_b32_e32 v50, v10 -; SI-NEXT: v_mov_b32_e32 v51, v8 -; SI-NEXT: v_mov_b32_e32 v52, v6 -; SI-NEXT: v_mov_b32_e32 v53, v4 -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:60 ; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v3 ; SI-NEXT: v_lshlrev_b32_e32 
v46, 16, v5 @@ -31254,21 +31234,14 @@ define <12 x double> @bitcast_v48i16_to_v12f64(<48 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v25 ; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v29 -; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v0 -; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v2 -; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v4 -; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v6 -; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v8 -; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v12 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 @@ -33752,8 +33725,24 @@ define <48 x half> @bitcast_v12f64_to_v48f16(<12 x double> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 ; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen @@ -33829,22 +33818,6 @@ define <48 x half> @bitcast_v12f64_to_v48f16(<12 x double> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 
4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -34491,13 +34464,28 @@ define inreg <48 x half> @bitcast_v12f64_to_v48f16_scalar(<12 x double> inreg %a ; SI-NEXT: v_cvt_f16_f32_e32 v55, v55 ; SI-NEXT: v_cvt_f16_f32_e32 v54, v54 ; SI-NEXT: v_cvt_f16_f32_e32 v53, v53 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 +; SI-NEXT: v_or_b32_e32 v55, v55, v40 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 ; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 ; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 ; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 ; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 -; SI-NEXT: v_or_b32_e32 v55, v55, v40 ; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v54 ; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 ; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 @@ -34653,20 +34641,6 @@ define inreg <48 x half> @bitcast_v12f64_to_v48f16_scalar(<12 x double> inreg %a ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 
offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB53_4: @@ -35392,6 +35366,8 @@ define <12 x double> @bitcast_v48f16_to_v12f64(<48 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v48f16_to_v12f64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v54, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill @@ -35408,8 +35384,6 @@ define <12 x double> @bitcast_v48f16_to_v12f64(<48 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v54, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 @@ -35436,34 +35410,34 @@ define <12 x double> @bitcast_v48f16_to_v12f64(<48 x half> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:60 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68 ; SI-NEXT: v_cvt_f16_f32_e32 v55, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v3 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v52, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v51, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v49, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v48, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v9 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v38, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v37, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v10 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v35, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v34, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v15 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v15 ; 
SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 @@ -38336,8 +38310,24 @@ define <48 x half> @bitcast_v48i16_to_v48f16(<48 x i16> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v35 ; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen @@ -38524,22 +38514,6 @@ define <48 x half> @bitcast_v48i16_to_v48f16(<48 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; 
SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -39451,14 +39425,8 @@ define inreg <48 x half> @bitcast_v48i16_to_v48f16_scalar(<48 x i16> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v44 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload @@ -39475,6 +39443,12 @@ define inreg <48 x half> @bitcast_v48i16_to_v48f16_scalar(<48 x i16> inreg %a, i ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB57_4: @@ -40754,6 +40728,23 @@ define <48 x i16> @bitcast_v48f16_to_v48i16(<48 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v56 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v59 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v23 ; SI-NEXT: v_or_b32_e32 v3, v3, v6 ; SI-NEXT: v_add_i32_e32 v6, vcc, 0x50, v0 @@ -40767,7 +40758,6 @@ define <48 x i16> @bitcast_v48f16_to_v48i16(<48 x half> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v3, 
0xffff, v34 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v22 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v59 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v53 ; SI-NEXT: v_or_b32_e32 v3, v3, v4 ; SI-NEXT: v_add_i32_e32 v4, vcc, 0x58, v0 @@ -40775,22 +40765,6 @@ define <48 x i16> @bitcast_v48f16_to_v48i16(<48 x half> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x5c, v0 ; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -41255,6 +41229,11 @@ define inreg <48 x i16> @bitcast_v48f16_to_v48i16_scalar(<48 x half> inreg %a, i ; SI-LABEL: bitcast_v48f16_to_v48i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill @@ -41271,11 +41250,6 @@ define inreg <48 x i16> @bitcast_v48f16_to_v48i16_scalar(<48 x half> inreg %a, i ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:8 ; SI-NEXT: s_waitcnt expcnt(2) ; SI-NEXT: v_cvt_f16_f32_e32 v61, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v55, v3 @@ -41320,16 +41294,12 @@ define inreg <48 x i16> @bitcast_v48f16_to_v48i16_scalar(<48 x half> inreg %a, i ; SI-NEXT: 
v_cvt_f16_f32_e32 v50, s25 ; SI-NEXT: v_cvt_f16_f32_e32 v16, s26 ; SI-NEXT: v_cvt_f16_f32_e32 v29, s29 -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cvt_f16_f32_e32 v31, v32 -; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_cvt_f16_f32_e32 v43, v33 ; SI-NEXT: v_cvt_f16_f32_e32 v32, v20 -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v25, v35 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v37 ; SI-NEXT: v_cvt_f16_f32_e32 v20, s22 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec @@ -41694,9 +41664,6 @@ define inreg <48 x i16> @bitcast_v48f16_to_v48i16_scalar(<48 x half> inreg %a, i ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v43 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x5c, v0 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload @@ -41713,6 +41680,9 @@ define inreg <48 x i16> @bitcast_v48f16_to_v48i16_scalar(<48 x half> inreg %a, i ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x5c, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB59_4: diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll index 07cdbef82d892..8b6210d6a817a 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll @@ -2741,9 +2741,14 @@ define <52 x i16> @bitcast_v26i32_to_v52i16(<26 x i32> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -2855,11 +2860,6 @@ define <52 x i16> @bitcast_v26i32_to_v52i16(<26 x i32> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: 
buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -4341,6 +4341,19 @@ define <26 x i32> @bitcast_v52i16_to_v26i32(<52 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v52i16_to_v26i32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v51, v8 +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:12 +; SI-NEXT: v_mov_b32_e32 v49, v12 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill @@ -4366,19 +4379,6 @@ define <26 x i32> @bitcast_v52i16_to_v26i32(<52 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v50, v10 -; SI-NEXT: v_mov_b32_e32 v51, v8 -; SI-NEXT: v_mov_b32_e32 v52, v6 -; SI-NEXT: v_mov_b32_e32 v53, v4 -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:12 -; SI-NEXT: v_mov_b32_e32 v49, v12 ; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v3 ; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v5 @@ -4394,17 +4394,12 @@ define <26 x i32> @bitcast_v52i16_to_v26i32(<52 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v25 ; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v29 -; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v0 -; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v2 -; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v4 -; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v6 -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v8 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:8 @@ -4429,9 +4424,10 @@ define <26 x i32> @bitcast_v52i16_to_v26i32(<52 x i16> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:68 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v18 -; SI-NEXT: 
s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 @@ -4443,10 +4439,9 @@ define <26 x i32> @bitcast_v52i16_to_v26i32(<52 x i16> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:56 ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v22 ; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v22 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:44 @@ -5032,7 +5027,6 @@ define <26 x i32> @bitcast_v52i16_to_v26i32(<52 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v57, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v25 -; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v24 @@ -5099,6 +5093,7 @@ define <26 x i32> @bitcast_v52i16_to_v26i32(<52 x i16> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v56 ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v57 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -5231,6 +5226,9 @@ define <26 x i32> @bitcast_v52i16_to_v26i32(<52 x i16> %a, i32 %b) { ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB14_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload @@ -5245,9 +5243,6 @@ define <26 x i32> @bitcast_v52i16_to_v26i32(<52 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v34, v57, s6 ; GFX9-NEXT: v_perm_b32 v1, v62, v56, s6 ; GFX9-NEXT: v_perm_b32 v2, v33, v47, s6 @@ -5266,6 +5261,10 @@ define <26 x i32> 
@bitcast_v52i16_to_v26i32(<52 x i16> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_perm_b32 v9, v9, v40, s6 ; GFX9-NEXT: s_waitcnt vmcnt(14) @@ -5294,10 +5293,6 @@ define <26 x i32> @bitcast_v52i16_to_v26i32(<52 x i16> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v21, v21, v36, s6 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_perm_b32 v22, v22, v35, s6 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] @@ -7263,11 +7258,6 @@ define <52 x half> @bitcast_v26i32_to_v52f16(<26 x i32> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v51 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -7284,6 +7274,11 @@ define <52 x half> @bitcast_v26i32_to_v52f16(<26 x i32> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v2, v51 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -8051,29 +8046,34 @@ define inreg <52 x half> @bitcast_v26i32_to_v52f16_scalar(<26 x i32> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v41, v41 ; SI-NEXT: v_cvt_f16_f32_e32 v40, v40 ; SI-NEXT: v_cvt_f16_f32_e32 v55, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v54 ; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 ; SI-NEXT: v_or_b32_e32 v43, v43, v44 ; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 ; SI-NEXT: buffer_store_dword v43, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v43, vcc, 4, v0 ; SI-NEXT: v_or_b32_e32 v41, v41, v42 ; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 ; SI-NEXT: buffer_store_dword v41, v43, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v41, vcc, 8, v0 ; SI-NEXT: v_or_b32_e32 v55, v55, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v54 +; SI-NEXT: buffer_store_dword v55, v41, 
s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v53, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 ; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v54 ; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 ; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 -; SI-NEXT: buffer_store_dword v55, v41, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v55, vcc, 12, v0 ; SI-NEXT: v_or_b32_e32 v53, v53, v54 @@ -8225,11 +8225,6 @@ define inreg <52 x half> @bitcast_v26i32_to_v52f16_scalar(<26 x i32> inreg %a, i ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB17_4: @@ -9760,7 +9755,6 @@ define <26 x i32> @bitcast_v52f16_to_v26i32(<52 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v57, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v25 -; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v24 @@ -9827,6 +9821,7 @@ define <26 x i32> @bitcast_v52f16_to_v26i32(<52 x half> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v56 ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v57 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -9959,6 +9954,9 @@ define <26 x i32> @bitcast_v52f16_to_v26i32(<52 x half> %a, i32 %b) { ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB18_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v11, 
off, s[0:3], s32 offset:132 ; 4-byte Folded Reload @@ -9973,9 +9971,6 @@ define <26 x i32> @bitcast_v52f16_to_v26i32(<52 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v34, v57, s6 ; GFX9-NEXT: s_movk_i32 s7, 0x200 ; GFX9-NEXT: v_perm_b32 v1, v62, v56, s6 @@ -9995,6 +9990,10 @@ define <26 x i32> @bitcast_v52f16_to_v26i32(<52 x half> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_f16 v6, v6, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v7, v7, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v8, v8, s7 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_perm_b32 v9, v9, v40, s6 ; GFX9-NEXT: s_waitcnt vmcnt(14) @@ -10023,10 +10022,6 @@ define <26 x i32> @bitcast_v52f16_to_v26i32(<52 x half> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v21, v21, v36, s6 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_perm_b32 v22, v22, v35, s6 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; GFX9-NEXT: v_pk_add_f16 v9, v9, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v10, v10, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v11, v11, s7 op_sel_hi:[1,0] @@ -10295,14 +10290,28 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v8, s26 ; SI-NEXT: v_cvt_f16_f32_e32 v6, s29 ; SI-NEXT: v_cvt_f16_f32_e32 v7, s28 -; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 
offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: s_waitcnt vmcnt(8) expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v38 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(8) expcnt(0) +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v39 -; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v44 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill @@ -10318,22 +10327,6 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v51 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB19_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload @@ -10342,8 +10335,8 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v53 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v12 @@ -10363,10 +10356,8 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v42 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v56 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v43 -; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v57 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v62 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v36 @@ -10407,11 +10398,11 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i ; SI-NEXT: 
v_or_b32_e32 v25, v38, v25 ; SI-NEXT: s_cbranch_execnz .LBB19_3 ; SI-NEXT: .LBB19_2: ; %cmp.true -; SI-NEXT: s_waitcnt expcnt(2) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v53 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v40 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v55 @@ -10425,7 +10416,6 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v12, v47 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v60 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v52 @@ -10463,7 +10453,6 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload @@ -13356,9 +13345,14 @@ define <52 x i16> @bitcast_v26f32_to_v52i16(<26 x float> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -13470,11 +13464,6 @@ define <52 x i16> @bitcast_v26f32_to_v52i16(<26 x float> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -14209,6 +14198,14 @@ define inreg <52 x i16> @bitcast_v26f32_to_v52i16_scalar(<26 x float> inreg %a, ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded 
Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -14272,14 +14269,6 @@ define inreg <52 x i16> @bitcast_v26f32_to_v52i16_scalar(<26 x float> inreg %a, ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB29_4: @@ -15076,6 +15065,19 @@ define <26 x float> @bitcast_v52i16_to_v26f32(<52 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v52i16_to_v26f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v51, v8 +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:12 +; SI-NEXT: v_mov_b32_e32 v49, v12 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill @@ -15101,19 +15103,6 @@ define <26 x float> @bitcast_v52i16_to_v26f32(<52 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v50, v10 -; SI-NEXT: v_mov_b32_e32 v51, v8 -; SI-NEXT: v_mov_b32_e32 v52, v6 -; SI-NEXT: v_mov_b32_e32 v53, v4 -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 -; SI-NEXT: 
buffer_load_dword v6, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:12 -; SI-NEXT: v_mov_b32_e32 v49, v12 ; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v3 ; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v5 @@ -15129,17 +15118,12 @@ define <26 x float> @bitcast_v52i16_to_v26f32(<52 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v25 ; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v29 -; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v0 -; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v2 -; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v4 -; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v6 -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v8 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:8 @@ -15164,9 +15148,10 @@ define <26 x float> @bitcast_v52i16_to_v26f32(<52 x i16> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:68 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v18 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 @@ -15178,10 +15163,9 @@ define <26 x float> @bitcast_v52i16_to_v26f32(<52 x i16> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:56 ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v22 ; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v22 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:44 @@ -15767,7 +15751,6 @@ define <26 x float> @bitcast_v52i16_to_v26f32(<52 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v57, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v25 -; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v24 @@ -15834,6 +15817,7 @@ define <26 x float> @bitcast_v52i16_to_v26f32(<52 x i16> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v56 ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v57 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -15966,6 +15950,9 @@ define <26 x float> @bitcast_v52i16_to_v26f32(<52 x i16> %a, i32 %b) { ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB30_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload @@ -15980,9 +15967,6 @@ define <26 x float> @bitcast_v52i16_to_v26f32(<52 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v34, v57, s6 ; GFX9-NEXT: v_perm_b32 v1, v62, v56, s6 ; GFX9-NEXT: v_perm_b32 v2, v33, v47, s6 @@ -16001,6 +15985,10 @@ define <26 x float> @bitcast_v52i16_to_v26f32(<52 x i16> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_perm_b32 v9, v9, v40, s6 ; GFX9-NEXT: s_waitcnt vmcnt(14) @@ -16029,10 +16017,6 @@ define <26 x float> @bitcast_v52i16_to_v26f32(<52 x i16> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v21, v21, v36, s6 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_perm_b32 v22, v22, v35, s6 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] @@ -17998,11 +17982,6 @@ define <52 x half> @bitcast_v26f32_to_v52f16(<26 x float> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v51 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -18019,6 
+17998,11 @@ define <52 x half> @bitcast_v26f32_to_v52f16(<26 x float> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v2, v51 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -18938,14 +18922,9 @@ define inreg <52 x half> @bitcast_v26f32_to_v52f16_scalar(<26 x float> inreg %a, ; SI-NEXT: v_add_i32_e32 v4, vcc, 0x60, v0 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v45 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -18962,6 +18941,11 @@ define inreg <52 x half> @bitcast_v26f32_to_v52f16_scalar(<26 x float> inreg %a, ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB33_4: @@ -20653,7 +20637,6 @@ define <26 x float> @bitcast_v52f16_to_v26f32(<52 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v57, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v25 -; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v24 @@ -20720,6 +20703,7 @@ define <26 x float> @bitcast_v52f16_to_v26f32(<52 x half> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v56 ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v57 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -20852,6 +20836,9 @@ define <26 x float> @bitcast_v52f16_to_v26f32(<52 x half> %a, i32 %b) { ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB34_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true +; 
GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload @@ -20866,9 +20853,6 @@ define <26 x float> @bitcast_v52f16_to_v26f32(<52 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v34, v57, s6 ; GFX9-NEXT: s_movk_i32 s7, 0x200 ; GFX9-NEXT: v_perm_b32 v1, v62, v56, s6 @@ -20888,6 +20872,10 @@ define <26 x float> @bitcast_v52f16_to_v26f32(<52 x half> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_f16 v6, v6, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v7, v7, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v8, v8, s7 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_perm_b32 v9, v9, v40, s6 ; GFX9-NEXT: s_waitcnt vmcnt(14) @@ -20916,10 +20904,6 @@ define <26 x float> @bitcast_v52f16_to_v26f32(<52 x half> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v21, v21, v36, s6 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_perm_b32 v22, v22, v35, s6 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; GFX9-NEXT: v_pk_add_f16 v9, v9, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v10, v10, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v11, v11, s7 op_sel_hi:[1,0] @@ -21188,14 +21172,28 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v8, s26 ; SI-NEXT: v_cvt_f16_f32_e32 v6, s29 ; SI-NEXT: v_cvt_f16_f32_e32 v7, s28 -; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 
offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: s_waitcnt vmcnt(8) expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v38 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(8) expcnt(0) +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v39 -; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v44 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill @@ -21211,22 +21209,6 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a, ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v51 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB35_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload @@ -21235,8 +21217,8 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a, ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v53 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v12 @@ -21256,10 +21238,8 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a, ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v42 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v56 ; SI-NEXT: 
v_lshlrev_b32_e32 v11, 16, v43 -; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v57 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v62 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v36 @@ -21300,11 +21280,11 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a, ; SI-NEXT: v_or_b32_e32 v25, v38, v25 ; SI-NEXT: s_cbranch_execnz .LBB35_3 ; SI-NEXT: .LBB35_2: ; %cmp.true -; SI-NEXT: s_waitcnt expcnt(2) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v53 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v40 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v55 @@ -21318,7 +21298,6 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v12, v47 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v60 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v52 @@ -21356,7 +21335,6 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a, ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload @@ -23409,9 +23387,14 @@ define <52 x i16> @bitcast_v13i64_to_v52i16(<13 x i64> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -23523,11 +23506,6 @@ define <52 x i16> @bitcast_v13i64_to_v52i16(<13 x i64> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword 
v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -25023,6 +25001,19 @@ define <13 x i64> @bitcast_v52i16_to_v13i64(<52 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v52i16_to_v13i64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v51, v8 +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:12 +; SI-NEXT: v_mov_b32_e32 v49, v12 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill @@ -25048,19 +25039,6 @@ define <13 x i64> @bitcast_v52i16_to_v13i64(<52 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v50, v10 -; SI-NEXT: v_mov_b32_e32 v51, v8 -; SI-NEXT: v_mov_b32_e32 v52, v6 -; SI-NEXT: v_mov_b32_e32 v53, v4 -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:12 -; SI-NEXT: v_mov_b32_e32 v49, v12 ; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v3 ; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v5 @@ -25076,17 +25054,12 @@ define <13 x i64> @bitcast_v52i16_to_v13i64(<52 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v25 ; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v29 -; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v0 -; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v2 -; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v4 -; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v6 -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v8 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:8 @@ -25111,9 +25084,10 @@ define <13 x i64> @bitcast_v52i16_to_v13i64(<52 x i16> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:68 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v18 -; SI-NEXT: s_waitcnt vmcnt(0) 
+; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 @@ -25125,10 +25099,9 @@ define <13 x i64> @bitcast_v52i16_to_v13i64(<52 x i16> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:56 ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v22 ; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v22 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:44 @@ -25714,7 +25687,6 @@ define <13 x i64> @bitcast_v52i16_to_v13i64(<52 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v57, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v25 -; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v24 @@ -25781,6 +25753,7 @@ define <13 x i64> @bitcast_v52i16_to_v13i64(<52 x i16> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v56 ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v57 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -25913,6 +25886,9 @@ define <13 x i64> @bitcast_v52i16_to_v13i64(<52 x i16> %a, i32 %b) { ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB42_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload @@ -25927,9 +25903,6 @@ define <13 x i64> @bitcast_v52i16_to_v13i64(<52 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v34, v57, s6 ; GFX9-NEXT: v_perm_b32 v1, v62, v56, s6 ; GFX9-NEXT: v_perm_b32 v2, v33, v47, s6 @@ -25948,6 +25921,10 @@ define <13 x i64> 
@bitcast_v52i16_to_v13i64(<52 x i16> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_perm_b32 v9, v9, v40, s6 ; GFX9-NEXT: s_waitcnt vmcnt(14) @@ -25976,10 +25953,6 @@ define <13 x i64> @bitcast_v52i16_to_v13i64(<52 x i16> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v21, v21, v36, s6 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_perm_b32 v22, v22, v35, s6 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] @@ -27946,11 +27919,6 @@ define <52 x half> @bitcast_v13i64_to_v52f16(<13 x i64> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v51 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -27967,6 +27935,11 @@ define <52 x half> @bitcast_v13i64_to_v52f16(<13 x i64> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v2, v51 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -28748,29 +28721,34 @@ define inreg <52 x half> @bitcast_v13i64_to_v52f16_scalar(<13 x i64> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v41, v41 ; SI-NEXT: v_cvt_f16_f32_e32 v40, v40 ; SI-NEXT: v_cvt_f16_f32_e32 v55, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v54 ; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 ; SI-NEXT: v_or_b32_e32 v43, v43, v44 ; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 ; SI-NEXT: buffer_store_dword v43, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v43, vcc, 4, v0 ; SI-NEXT: v_or_b32_e32 v41, v41, v42 ; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 ; SI-NEXT: buffer_store_dword v41, v43, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v41, vcc, 8, v0 ; SI-NEXT: v_or_b32_e32 v55, v55, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v54 +; SI-NEXT: buffer_store_dword v55, 
v41, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v53, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 ; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v54 ; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 ; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 -; SI-NEXT: buffer_store_dword v55, v41, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v55, vcc, 12, v0 ; SI-NEXT: v_or_b32_e32 v53, v53, v54 @@ -28922,11 +28900,6 @@ define inreg <52 x half> @bitcast_v13i64_to_v52f16_scalar(<13 x i64> inreg %a, i ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB45_4: @@ -30457,7 +30430,6 @@ define <13 x i64> @bitcast_v52f16_to_v13i64(<52 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v57, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v25 -; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v24 @@ -30524,6 +30496,7 @@ define <13 x i64> @bitcast_v52f16_to_v13i64(<52 x half> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v56 ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v57 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -30656,6 +30629,9 @@ define <13 x i64> @bitcast_v52f16_to_v13i64(<52 x half> %a, i32 %b) { ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB46_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; GFX9-NEXT: 
buffer_load_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload @@ -30670,9 +30646,6 @@ define <13 x i64> @bitcast_v52f16_to_v13i64(<52 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v34, v57, s6 ; GFX9-NEXT: s_movk_i32 s7, 0x200 ; GFX9-NEXT: v_perm_b32 v1, v62, v56, s6 @@ -30692,6 +30665,10 @@ define <13 x i64> @bitcast_v52f16_to_v13i64(<52 x half> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_f16 v6, v6, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v7, v7, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v8, v8, s7 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_perm_b32 v9, v9, v40, s6 ; GFX9-NEXT: s_waitcnt vmcnt(14) @@ -30720,10 +30697,6 @@ define <13 x i64> @bitcast_v52f16_to_v13i64(<52 x half> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v21, v21, v36, s6 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_perm_b32 v22, v22, v35, s6 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; GFX9-NEXT: v_pk_add_f16 v9, v9, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v10, v10, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v11, v11, s7 op_sel_hi:[1,0] @@ -30992,14 +30965,28 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v8, s26 ; SI-NEXT: v_cvt_f16_f32_e32 v6, s29 ; SI-NEXT: v_cvt_f16_f32_e32 v7, s28 -; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, 
off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: s_waitcnt vmcnt(8) expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v38 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(8) expcnt(0) +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v39 -; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v44 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill @@ -31015,22 +31002,6 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v51 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB47_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload @@ -31039,8 +31010,8 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v53 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v12 @@ -31060,10 +31031,8 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v42 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v56 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v43 -; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v57 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v62 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v36 @@ -31104,11 +31073,11 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i 
; SI-NEXT: v_or_b32_e32 v25, v38, v25 ; SI-NEXT: s_cbranch_execnz .LBB47_3 ; SI-NEXT: .LBB47_2: ; %cmp.true -; SI-NEXT: s_waitcnt expcnt(2) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v53 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v40 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v55 @@ -31122,7 +31091,6 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v12, v47 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v60 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v52 @@ -31160,7 +31128,6 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload @@ -32398,9 +32365,14 @@ define <52 x i16> @bitcast_v13f64_to_v52i16(<13 x double> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -32512,11 +32484,6 @@ define <52 x i16> @bitcast_v13f64_to_v52i16(<13 x double> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -33212,6 +33179,14 @@ define inreg <52 x i16> @bitcast_v13f64_to_v52i16_scalar(<13 x double> inreg %a, ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 
4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
 ; SI-NEXT: v_or_b32_e32 v1, v1, v2
 ; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0
 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
@@ -33275,14 +33250,6 @@ define inreg <52 x i16> @bitcast_v13f64_to_v52i16_scalar(<13 x double> inreg %a,
 ; SI-NEXT: v_or_b32_e32 v1, v1, v2
 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0
 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT: s_setpc_b64 s[30:31]
 ; SI-NEXT: .LBB49_4:
@@ -34053,6 +34020,19 @@ define <13 x double> @bitcast_v52i16_to_v13f64(<52 x i16> %a, i32 %b) {
 ; SI-LABEL: bitcast_v52i16_to_v13f64:
 ; SI: ; %bb.0:
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v50, v10
+; SI-NEXT: v_mov_b32_e32 v51, v8
+; SI-NEXT: v_mov_b32_e32 v52, v6
+; SI-NEXT: v_mov_b32_e32 v53, v4
+; SI-NEXT: v_mov_b32_e32 v54, v2
+; SI-NEXT: v_mov_b32_e32 v55, v0
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:48
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:40
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32
+; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24
+; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:16
+; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:12
+; SI-NEXT: v_mov_b32_e32 v49, v12
 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
 ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
 ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
@@ -34078,19 +34058,6 @@ define <13 x double> @bitcast_v52i16_to_v13f64(<52 x i16> %a, i32 %b) {
 ; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
 ; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
 ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
-; SI-NEXT: v_mov_b32_e32 v50, v10
-; SI-NEXT: v_mov_b32_e32 v51, v8
-; SI-NEXT: v_mov_b32_e32 v52, v6
-; SI-NEXT: v_mov_b32_e32 v53, v4
-; SI-NEXT: v_mov_b32_e32 v54, v2
-; SI-NEXT: v_mov_b32_e32 v55, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:48
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:40
-; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32
-; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24
-; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:16
-; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:12
-; SI-NEXT: v_mov_b32_e32 v49, v12
 ; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v1
 ; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v3
 ; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v5
@@ -34106,17 +34073,12 @@ define <13 x double> @bitcast_v52i16_to_v13f64(<52 x i16> %a, i32 %b) {
 ; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v25
 ; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v27
 ; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v29
-; SI-NEXT: s_waitcnt vmcnt(5)
+; SI-NEXT: s_waitcnt vmcnt(14)
 ; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v0
-; SI-NEXT: s_waitcnt vmcnt(4)
 ; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v2
-; SI-NEXT: s_waitcnt vmcnt(3)
 ; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v4
-; SI-NEXT: s_waitcnt vmcnt(2)
 ; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v6
-; SI-NEXT: s_waitcnt vmcnt(1)
 ; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v8
-; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:8
@@ -34141,9 +34103,10 @@ define <13 x double> @bitcast_v52i16_to_v13f64(<52 x i16> %a, i32 %b) {
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72
 ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:68
-; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt vmcnt(2)
 ; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v18
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(1)
 ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64
@@ -34155,10 +34118,9 @@ define <13 x double> @bitcast_v52i16_to_v13f64(<52 x i16> %a, i32 %b) {
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:56
 ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:52
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(2)
-; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v22
 ; SI-NEXT: s_waitcnt vmcnt(1)
+; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v22
+; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:44
@@ -34744,7 +34706,6 @@ define <13 x double> @bitcast_v52i16_to_v13f64(<52 x i16> %a, i32 %b) {
 ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_mov_b32_e32 v57, v0
 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v25
-; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
 ; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v24
@@ -34811,6 +34772,7 @@ define <13 x double> @bitcast_v52i16_to_v13f64(<52 x i16> %a, i32 %b) {
 ; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v56
 ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v57
 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26
+; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
@@ -34943,6 +34905,9 @@ define <13 x double> @bitcast_v52i16_to_v13f64(<52 x i16> %a, i32 %b) {
 ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
 ; GFX9-NEXT: s_cbranch_execz .LBB50_4
 ; GFX9-NEXT: ; %bb.3: ; %cmp.true
+; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; GFX9-NEXT: s_mov_b32 s6, 0x5040100
 ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
 ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
 ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
@@ -34957,9 +34922,6 @@ define <13 x double> @bitcast_v52i16_to_v13f64(<52 x i16> %a, i32 %b) {
 ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
 ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
 ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
-; GFX9-NEXT: s_mov_b32 s6, 0x5040100
 ; GFX9-NEXT: v_perm_b32 v0, v34, v57, s6
 ; GFX9-NEXT: v_perm_b32 v1, v62, v56, s6
 ; GFX9-NEXT: v_perm_b32 v2, v33, v47, s6
@@ -34978,6 +34940,10 @@ define <13 x double> @bitcast_v52i16_to_v13f64(<52 x i16> %a, i32 %b) {
 ; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
 ; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
 ; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: s_waitcnt vmcnt(14)
+; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6
+; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_waitcnt vmcnt(15)
 ; GFX9-NEXT: v_perm_b32 v9, v9, v40, s6
 ; GFX9-NEXT: s_waitcnt vmcnt(14)
@@ -35006,10 +34972,6 @@ define <13 x double> @bitcast_v52i16_to_v13f64(<52 x i16> %a, i32 %b) {
 ; GFX9-NEXT: v_perm_b32 v21, v21, v36, s6
 ; GFX9-NEXT: s_waitcnt vmcnt(2)
 ; GFX9-NEXT: v_perm_b32 v22, v22, v35, s6
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6
-; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
 ; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
 ; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
 ; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
@@ -36949,11 +36911,6 @@ define <52 x half> @bitcast_v13f64_to_v52f16(<13 x double> %a, i32 %b) {
 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v56
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v51
-; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_or_b32_e32 v1, v2, v1
-; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
 ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
 ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
@@ -36970,6 +36927,11 @@ define <52 x half> @bitcast_v13f64_to_v52f16(<13 x double> %a, i32 %b) {
 ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
 ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
 ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v51
+; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_or_b32_e32 v1, v2, v1
+; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -37850,14 +37812,9 @@ define inreg <52 x half> @bitcast_v13f64_to_v52f16_scalar(<13 x double> inreg %a
 ; SI-NEXT: v_add_i32_e32 v4, vcc, 0x60, v0
 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
 ; SI-NEXT: v_or_b32_e32 v2, v3, v2
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
 ; SI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v45
-; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_or_b32_e32 v1, v2, v1
-; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
 ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
 ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
@@ -37874,6 +37831,11 @@ define inreg <52 x half> @bitcast_v13f64_to_v52f16_scalar(<13 x double> inreg %a
 ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
 ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
 ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_or_b32_e32 v1, v2, v1
+; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT: s_setpc_b64 s[30:31]
 ; SI-NEXT: .LBB53_4:
@@ -39539,7 +39501,6 @@ define <13 x double> @bitcast_v52f16_to_v13f64(<52 x half> %a, i32 %b) {
 ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_mov_b32_e32 v57, v0
 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v25
-; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
 ; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v24
@@ -39606,6 +39567,7 @@ define <13 x double> @bitcast_v52f16_to_v13f64(<52 x half> %a, i32 %b) {
 ; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v56
 ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v57
 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26
+; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
@@ -39738,6 +39700,9 @@ define <13 x double> @bitcast_v52f16_to_v13f64(<52 x half> %a, i32 %b) {
 ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
 ; GFX9-NEXT: s_cbranch_execz .LBB54_4
 ; GFX9-NEXT: ; %bb.3: ; %cmp.true
+; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; GFX9-NEXT: s_mov_b32 s6, 0x5040100
 ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
 ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
 ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
@@ -39752,9 +39717,6 @@ define <13 x double> @bitcast_v52f16_to_v13f64(<52 x half> %a, i32 %b) {
 ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
 ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
 ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
-; GFX9-NEXT: s_mov_b32 s6, 0x5040100
 ; GFX9-NEXT: v_perm_b32 v0, v34, v57, s6
 ; GFX9-NEXT: s_movk_i32 s7, 0x200
 ; GFX9-NEXT: v_perm_b32 v1, v62, v56, s6
@@ -39774,6 +39736,10 @@ define <13 x double> @bitcast_v52f16_to_v13f64(<52 x half> %a, i32 %b) {
 ; GFX9-NEXT: v_pk_add_f16 v6, v6, s7 op_sel_hi:[1,0]
 ; GFX9-NEXT: v_pk_add_f16 v7, v7, s7 op_sel_hi:[1,0]
 ; GFX9-NEXT: v_pk_add_f16 v8, v8, s7 op_sel_hi:[1,0]
+; GFX9-NEXT: s_waitcnt vmcnt(14)
+; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6
+; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_waitcnt vmcnt(15)
 ; GFX9-NEXT: v_perm_b32 v9, v9, v40, s6
 ; GFX9-NEXT: s_waitcnt vmcnt(14)
@@ -39802,10 +39768,6 @@ define <13 x double> @bitcast_v52f16_to_v13f64(<52 x half> %a, i32 %b) {
 ; GFX9-NEXT: v_perm_b32 v21, v21, v36, s6
 ; GFX9-NEXT: s_waitcnt vmcnt(2)
 ; GFX9-NEXT: v_perm_b32 v22, v22, v35, s6
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6
-; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
 ; GFX9-NEXT: v_pk_add_f16 v9, v9, s7 op_sel_hi:[1,0]
 ; GFX9-NEXT: v_pk_add_f16 v10, v10, s7 op_sel_hi:[1,0]
 ; GFX9-NEXT: v_pk_add_f16 v11, v11, s7 op_sel_hi:[1,0]
@@ -40074,14 +40036,28 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a
 ; SI-NEXT: v_cvt_f16_f32_e32 v8, s26
 ; SI-NEXT: v_cvt_f16_f32_e32 v6, s29
 ; SI-NEXT: v_cvt_f16_f32_e32 v7, s28
-; SI-NEXT: s_waitcnt vmcnt(9)
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt vmcnt(14)
 ; SI-NEXT: v_cvt_f16_f32_e32 v31, v31
-; SI-NEXT: s_waitcnt vmcnt(8) expcnt(0)
 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v38
 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
-; SI-NEXT: s_waitcnt vmcnt(8) expcnt(0)
+; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v39
-; SI-NEXT: s_waitcnt vmcnt(3)
 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v44
 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec
 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
@@ -40097,22 +40073,6 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v51
 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
 ; SI-NEXT: s_cbranch_scc0 .LBB55_4
 ; SI-NEXT: ; %bb.1: ; %cmp.false
 ; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
@@ -40121,8 +40081,8 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a
 ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
 ; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
 ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt expcnt(2)
 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v53
 ; SI-NEXT: v_or_b32_e32 v1, v2, v1
 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v12
@@ -40142,10 +40102,8 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a
 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v42
 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v56
 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v43
-; SI-NEXT: s_waitcnt expcnt(1)
 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v57
 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13
-; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15
 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v62
 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v36
@@ -40186,11 +40144,11 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a
 ; SI-NEXT: v_or_b32_e32 v25, v38, v25
 ; SI-NEXT: s_cbranch_execnz .LBB55_3
 ; SI-NEXT: .LBB55_2: ; %cmp.true
-; SI-NEXT: s_waitcnt expcnt(2)
 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v53
 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v40
 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v55
@@ -40204,7 +40162,6 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a
 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10
 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11
 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11
-; SI-NEXT: s_waitcnt expcnt(1)
 ; SI-NEXT: v_cvt_f32_f16_e32 v12, v47
 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v60
 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v52
@@ -40242,7 +40199,6 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a
 ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
 ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
 ; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
 ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
@@ -41870,8 +41826,24 @@ define <52 x half> @bitcast_v52i16_to_v52f16(<52 x i16> %a, i32 %b) {
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v51
 ; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0
+; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(14)
 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
 ; SI-NEXT: v_or_b32_e32 v1, v2, v1
 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
@@ -42102,22 +42074,6 @@ define <52 x half> @bitcast_v52i16_to_v52f16(<52 x i16> %a, i32 %b) {
 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT: v_or_b32_e32 v1, v2, v1
 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -43338,23 +43294,7 @@ define inreg <52 x half> @bitcast_v52i16_to_v52f16_scalar(<52 x i16> inreg %a, i
 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0
 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT: v_or_b32_e32 v1, v2, v1
-; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v36
 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v58
-; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_or_b32_e32 v1, v2, v1
-; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v31
-; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
-; SI-NEXT: v_or_b32_e32 v1, v2, v1
-; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
 ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
 ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
 ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
@@ -43371,6 +43311,22 @@ define inreg <52 x half> @bitcast_v52i16_to_v52f16_scalar(<52 x i16> inreg %a, i
 ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
 ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
 ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
+; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v36
+; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0
+; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_or_b32_e32 v1, v2, v1
+; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v31
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
+; SI-NEXT: v_or_b32_e32 v1, v2, v1
+; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -44684,9 +44640,25 @@ define <52 x i16> @bitcast_v52f16_to_v52i16(<52 x half> %a, i32 %b) {
 ; SI-NEXT: v_or_b32_e32 v1, v1, v4
 ; SI-NEXT: v_add_i32_e32 v4, vcc, 0x50, v0
 ; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen
+; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v62
+; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9
-; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v62
 ; SI-NEXT: v_or_b32_e32 v1, v1, v4
 ; SI-NEXT: v_add_i32_e32 v4, vcc, 0x54, v0
 ; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen
@@ -44714,22 +44686,6 @@ define <52 x i16> @bitcast_v52f16_to_v52i16(<52 x half> %a, i32 %b) {
 ; SI-NEXT: v_or_b32_e32 v1, v1, v2
 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0
 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -45248,6 +45204,15 @@ define inreg <52 x i16> @bitcast_v52f16_to_v52i16_scalar(<52 x half> inreg %a, i
 ; SI-LABEL: bitcast_v52f16_to_v52i16_scalar:
 ; SI: ; %bb.0:
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:32
+; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32
+; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:12
+; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:16
+; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:28
+; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:4
+; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:8
+; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:20
+; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:24
 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
 ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
 ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
@@ -45264,15 +45229,6 @@ define inreg <52 x i16> @bitcast_v52f16_to_v52i16_scalar(<52 x half> inreg %a, i
 ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
 ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
 ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:32
-; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32
-; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:12
-; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:16
-; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:28
-; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:4
-; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:8
-; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:20
-; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:24
 ; SI-NEXT: s_waitcnt expcnt(5)
 ; SI-NEXT: v_cvt_f16_f32_e32 v58, v2
 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v3
@@ -45317,26 +45273,19 @@ define inreg <52 x i16> @bitcast_v52f16_to_v52i16_scalar(<52 x half> inreg %a, i
 ; SI-NEXT: v_cvt_f16_f32_e32 v41, s21
 ; SI-NEXT: v_cvt_f16_f32_e32 v16, s26
 ; SI-NEXT: v_cvt_f16_f32_e32 v54, s29
-; SI-NEXT: s_waitcnt vmcnt(10)
+; SI-NEXT: s_waitcnt vmcnt(14)
 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31
-; SI-NEXT: s_waitcnt vmcnt(9)
 ; SI-NEXT: v_cvt_f16_f32_e32 v53, v32
-; SI-NEXT: s_waitcnt vmcnt(8)
 ; SI-NEXT: v_cvt_f16_f32_e32 v32, v33
-; SI-NEXT: s_waitcnt vmcnt(7)
 ; SI-NEXT: v_cvt_f16_f32_e32 v34, v34
-; SI-NEXT: s_waitcnt vmcnt(6)
 ; SI-NEXT: v_cvt_f16_f32_e32 v30, v35
 ; SI-NEXT: v_cvt_f16_f32_e32 v35, v20
 ; SI-NEXT: v_cvt_f16_f32_e32 v33, v24
 ; SI-NEXT: v_cvt_f16_f32_e32 v31, v28
-; SI-NEXT: s_waitcnt vmcnt(5)
 ; SI-NEXT: v_cvt_f16_f32_e32 v55, v36
-; SI-NEXT: s_waitcnt vmcnt(4)
 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v38
-; SI-NEXT: s_waitcnt vmcnt(3)
 ; SI-NEXT: v_cvt_f16_f32_e32 v27, v39
-; SI-NEXT: s_waitcnt vmcnt(2) expcnt(0)
+; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v49
 ; SI-NEXT: v_cvt_f16_f32_e32 v24, s18
 ; SI-NEXT: v_cvt_f16_f32_e32 v20, s22
@@ -45729,9 +45678,25 @@ define inreg <52 x i16> @bitcast_v52f16_to_v52i16_scalar(<52 x half> inreg %a, i
 ; SI-NEXT: v_or_b32_e32 v1, v1, v3
 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0
 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v57
+; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v57
 ; SI-NEXT: v_or_b32_e32 v1, v1, v3
 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0
 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
@@ -45759,22 +45724,6 @@ define inreg <52 x i16> @bitcast_v52f16_to_v52i16_scalar(<52 x half> inreg %a, i
 ; SI-NEXT: v_or_b32_e32 v1, v1, v2
 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0
 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT: s_setpc_b64 s[30:31]
 ; SI-NEXT: .LBB59_4:
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll
index 8eb71e90f8504..09cf27810a5c9 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll
@@ -2928,9 +2928,18 @@ define <56 x i16> @bitcast_v28i32_to_v56i16(<28 x i32> %a, i32 %b) {
 ; SI-NEXT: v_or_b32_e32 v1, v1, v2
 ; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0
 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41
 ; SI-NEXT: v_or_b32_e32 v1, v1, v2
 ; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0
 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
@@ -3030,15 +3039,6 @@ define <56 x i16> @bitcast_v28i32_to_v56i16(<28 x i32> %a, i32 %b) {
 ; SI-NEXT: v_or_b32_e32 v1, v1, v2
 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0
 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -4665,6 +4665,11 @@ define <28 x i32> @bitcast_v56i16_to_v28i32(<56 x i16> %a, i32 %b) {
 ; SI-LABEL: bitcast_v56i16_to_v28i32:
 ; SI: ; %bb.0:
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v54, v2
+; SI-NEXT: v_mov_b32_e32 v55, v0
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92
+; SI-NEXT: v_mov_b32_e32 v53, v4
 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
 ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
 ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
@@ -4694,11 +4699,6 @@ define <28 x i32> @bitcast_v56i16_to_v28i32(<56 x i16> %a, i32 %b) {
 ; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
 ; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
 ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
-; SI-NEXT: v_mov_b32_e32 v54, v2
-; SI-NEXT: v_mov_b32_e32 v55, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92
-; SI-NEXT: v_mov_b32_e32 v53, v4
 ; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v1
 ; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v3
 ; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v5
@@ -4715,9 +4715,8 @@ define <28 x i32> @bitcast_v56i16_to_v28i32(<56 x i16> %a, i32 %b) {
 ; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v27
 ; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v29
 ; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4
-; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: s_waitcnt vmcnt(14)
 ; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v0
-; SI-NEXT: s_waitcnt vmcnt(1)
 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88
@@ -5413,7 +5412,6 @@ define <28 x i32> @bitcast_v56i16_to_v28i32(<56 x i16> %a, i32 %b) {
 ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_mov_b32_e32 v59, v0
 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v27
-; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
 ; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v26
@@ -5486,6 +5484,7 @@ define <28 x i32> @bitcast_v56i16_to_v28i32(<56 x i16> %a, i32 %b) {
 ; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v58
 ; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v59
 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28
+; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
@@ -5634,6 +5633,9 @@ define <28 x i32> @bitcast_v56i16_to_v28i32(<56 x i16> %a, i32 %b) {
 ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
 ; GFX9-NEXT: s_cbranch_execz .LBB14_4
 ; GFX9-NEXT: ; %bb.3: ; %cmp.true
+; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
+; GFX9-NEXT: s_mov_b32 s6, 0x5040100
 ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
 ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
 ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
@@ -5648,9 +5650,6 @@ define <28 x i32> @bitcast_v56i16_to_v28i32(<56 x i16> %a, i32 %b) {
 ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
 ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
 ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
-; GFX9-NEXT: s_mov_b32 s6, 0x5040100
 ; GFX9-NEXT: v_perm_b32 v0, v36, v59, s6
 ; GFX9-NEXT: v_perm_b32 v1, v35, v58, s6
 ; GFX9-NEXT: v_perm_b32 v2, v62, v57, s6
@@ -5669,6 +5668,10 @@ define <28 x i32> @bitcast_v56i16_to_v28i32(<56 x i16> %a, i32 %b) {
 ; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
 ; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
 ; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: s_waitcnt vmcnt(14)
+; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6
+; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_waitcnt vmcnt(15)
 ; GFX9-NEXT: v_perm_b32 v9, v9, v42, s6
 ; GFX9-NEXT: s_waitcnt vmcnt(14)
@@ -5697,10 +5700,6 @@ define <28 x i32> @bitcast_v56i16_to_v28i32(<56 x i16> %a, i32 %b) {
 ; GFX9-NEXT: v_perm_b32 v21, v21, v38, s6
 ; GFX9-NEXT: s_waitcnt vmcnt(2)
 ; GFX9-NEXT: v_perm_b32 v22, v22, v37, s6
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6
-; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
 ; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
 ; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
 ; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
@@ -7830,21 +7829,7 @@ define <56 x half> @bitcast_v28i32_to_v56f16(<28 x i32> %a, i32 %b) {
 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0
 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT: v_or_b32_e32 v1, v2, v1
-; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v33
 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v45
-; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_or_b32_e32 v1, v2, v1
-; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v35
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v29
-; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_or_b32_e32 v1, v2, v1
-; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
 ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
 ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
@@ -7861,6 +7846,20 @@ define <56 x half> @bitcast_v28i32_to_v56f16(<28 x i32> %a, i32 %b) {
 ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
 ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
 ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v33
+; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0
+; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_or_b32_e32 v1, v2, v1
+; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v35
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v29
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_or_b32_e32 v1, v2, v1
+; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -8743,6 +8742,15 @@ define inreg <56 x half> @bitcast_v28i32_to_v56f16_scalar(<28 x i32> inreg %a, i
 ; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v51
 ; SI-NEXT: v_cvt_f16_f32_e32 v38, v38
 ; SI-NEXT: v_cvt_f16_f32_e32 v35, v35
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
 ; SI-NEXT: buffer_store_dword v53, v55, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_add_i32_e32 v53, vcc, 24, v0
@@ -8888,15 +8896,6 @@ define inreg <56 x half> @bitcast_v28i32_to_v56f16_scalar(<28 x i32> inreg %a, i
 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT: v_or_b32_e32 v1, v2, v1
 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT: s_setpc_b64 s[30:31]
 ; SI-NEXT: .LBB17_4:
@@ -10560,7 +10559,6 @@ define <28 x i32> @bitcast_v56f16_to_v28i32(<56 x half> %a, i32 %b) {
 ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_mov_b32_e32 v59, v0
 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v27
-; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
 ; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v26
@@ -10633,6 +10631,7 @@ define <28 x i32> @bitcast_v56f16_to_v28i32(<56 x half> %a, i32 %b) {
 ; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v58
 ; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v59
 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28
+; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
@@ -10781,6 +10780,9 @@ define <28 x i32> @bitcast_v56f16_to_v28i32(<56 x half> %a, i32 %b) {
 ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
 ; GFX9-NEXT: s_cbranch_execz .LBB18_4
 ; GFX9-NEXT: ; %bb.3: ; %cmp.true
+; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
+; GFX9-NEXT: s_mov_b32 s6, 0x5040100
 ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
 ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
 ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
@@ -10795,9 +10797,6 @@ define <28 x i32> @bitcast_v56f16_to_v28i32(<56 x half> %a, i32 %b) {
 ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
 ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
 ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
-; GFX9-NEXT: s_mov_b32 s6, 0x5040100
 ; GFX9-NEXT: v_perm_b32 v0, v36, v59, s6
 ; GFX9-NEXT: s_movk_i32 s7, 0x200
 ; GFX9-NEXT: v_perm_b32 v1, v35, v58, s6
@@ -10817,6 +10816,10 @@ define <28 x i32> @bitcast_v56f16_to_v28i32(<56 x half> %a, i32 %b) {
 ; GFX9-NEXT: v_pk_add_f16 v6, v6, s7 op_sel_hi:[1,0]
 ; GFX9-NEXT: v_pk_add_f16 v7, v7, s7 op_sel_hi:[1,0]
 ; GFX9-NEXT: v_pk_add_f16 v8, v8, s7 op_sel_hi:[1,0]
+; GFX9-NEXT: s_waitcnt vmcnt(14)
+; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6
+; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_waitcnt vmcnt(15)
 ; GFX9-NEXT: v_perm_b32 v9, v9, v42, s6
 ; GFX9-NEXT: s_waitcnt vmcnt(14)
@@ -10845,10 +10848,6 @@ define <28 x i32> @bitcast_v56f16_to_v28i32(<56 x half> %a, i32 %b) {
 ; GFX9-NEXT: v_perm_b32 v21, v21, v38, s6
 ; GFX9-NEXT: s_waitcnt vmcnt(2)
 ; GFX9-NEXT: v_perm_b32 v22, v22, v37, s6
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6
-; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
 ; GFX9-NEXT: v_pk_add_f16 v9, v9, s7 op_sel_hi:[1,0]
 ; GFX9-NEXT: v_pk_add_f16 v10, v10, s7 op_sel_hi:[1,0]
 ; GFX9-NEXT: v_pk_add_f16 v11, v11, s7 op_sel_hi:[1,0]
@@ -11148,7 +11147,20 @@ define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i
 ; SI-NEXT: v_cvt_f16_f32_e32 v8, s26
 ; SI-NEXT: v_cvt_f16_f32_e32 v6, s29
 ; SI-NEXT: v_cvt_f16_f32_e32 v7, s28
-; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0)
+; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; SI-NEXT: s_waitcnt vmcnt(14)
 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v31
 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
 ; SI-NEXT: s_waitcnt expcnt(0)
@@ -11156,7 +11168,6 @@ define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i
 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v51
-; SI-NEXT: s_waitcnt vmcnt(10)
 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v61
 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec
 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
@@ -11188,19 +11199,6 @@ define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i
 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v43
 ; SI-NEXT: v_cvt_f16_f32_e32 v43, s17
 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
 ; SI-NEXT: s_cbranch_scc0 .LBB19_4
 ; SI-NEXT: ; %bb.1: ; %cmp.false
 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16
@@ -11217,11 +11215,11 @@ define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i
 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
 ; SI-NEXT: v_mov_b32_e32 v48, v3
 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6
 ; SI-NEXT: v_mov_b32_e32 v61, v44
+; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v43
 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v54
 ; SI-NEXT: v_mov_b32_e32 v39, v11
@@ -11299,6 +11297,7 @@ define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i
 ; SI-NEXT: v_or_b32_e32 v27, v50, v27
 ; SI-NEXT: s_cbranch_execnz .LBB19_3
 ; SI-NEXT: .LBB19_2: ; %cmp.true
+; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v43
 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v54
 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v55
@@ -11317,7 +11316,6 @@ define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i
 ; SI-NEXT: v_or_b32_e32 v1, v3, v2
 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v49
 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v39
-; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v33
 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v47
 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2
@@ -11585,7 +11583,6 @@ define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i
 ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
 ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
 ; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt expcnt(6)
 ; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
 ; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
 ; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
@@ -14434,9 +14431,18 @@ define <56 x i16> @bitcast_v28f32_to_v56i16(<28 x float> %a, i32 %b) {
 ; SI-NEXT: v_or_b32_e32 v1, v1, v2
 ; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0
 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41
 ; SI-NEXT: v_or_b32_e32 v1, v1, v2
 ; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0
 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
@@ -14536,15 +14542,6 @@ define <56 x i16> @bitcast_v28f32_to_v56i16(<28 x float> %a, i32 %b) {
 ; SI-NEXT: v_or_b32_e32 v1, v1, v2
 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0
 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -15382,9 +15379,21 @@ define inreg <56 x i16> @bitcast_v28f32_to_v56i16_scalar(<28 x float> inreg %a,
 ; SI-NEXT: v_or_b32_e32 v1, v1, v2
 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0
 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40
+; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40
 ; SI-NEXT: v_or_b32_e32 v1, v1, v2
 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0
 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
@@ -15412,18 +15421,6 @@ define inreg <56 x i16> @bitcast_v28f32_to_v56i16_scalar(<28 x float> inreg %a,
 ; SI-NEXT: v_or_b32_e32 v1, v1, v2
 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0
 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT: s_setpc_b64 s[30:31]
 ; SI-NEXT: .LBB29_4:
@@ -16290,6 +16287,11 @@ define <28 x float> @bitcast_v56i16_to_v28f32(<56 x i16> %a, i32 %b) {
 ; SI-LABEL: bitcast_v56i16_to_v28f32:
 ; SI: ; %bb.0:
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v54, v2
+; SI-NEXT: v_mov_b32_e32 v55, v0
+; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92
+; SI-NEXT: v_mov_b32_e32 v53, v4
 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
 ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
 ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
@@ -16319,11 +16321,6 @@ define <28 x float> @bitcast_v56i16_to_v28f32(<56 x i16> %a, i32 %b) {
 ; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
 ; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
 ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
-; SI-NEXT: v_mov_b32_e32 v54, v2
-; SI-NEXT: v_mov_b32_e32 v55, v0
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96
-; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92
-; SI-NEXT: v_mov_b32_e32 v53, v4
 ; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v1
 ; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v3
 ; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v5
@@ -16340,9 +16337,8 @@ define <28 x float> @bitcast_v56i16_to_v28f32(<56 x i16> %a, i32 %b) {
 ; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v27
 ; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v29
 ; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4
-; SI-NEXT: s_waitcnt vmcnt(2)
+; SI-NEXT: s_waitcnt vmcnt(14)
 ; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v0
-; SI-NEXT: s_waitcnt vmcnt(1)
 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88
@@ -17038,7 +17034,6 @@ define <28 x float> @bitcast_v56i16_to_v28f32(<56 x i16> %a, i32 %b) {
 ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_mov_b32_e32 v59, v0
 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v27
-; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
 ; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v26
@@ -17111,6 +17106,7 @@ define <28 x float> @bitcast_v56i16_to_v28f32(<56 x i16> %a, i32 %b) {
 ; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v58
 ; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v59
 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28
+; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
@@ -17259,6 +17255,9 @@ define <28 x float> @bitcast_v56i16_to_v28f32(<56 x i16> %a, i32 %b) {
 ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
 ; GFX9-NEXT: s_cbranch_execz .LBB30_4
 ; GFX9-NEXT: ; %bb.3: ; %cmp.true
+; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
+; GFX9-NEXT: s_mov_b32 s6, 0x5040100
 ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
 ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
 ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
@@ -17273,9 +17272,6 @@ define <28 x float> @bitcast_v56i16_to_v28f32(<56 x i16> %a, i32 %b) {
 ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
 ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
 ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
-; GFX9-NEXT: s_mov_b32 s6, 0x5040100
 ; GFX9-NEXT: v_perm_b32 v0, v36, v59, s6
 ; GFX9-NEXT: v_perm_b32 v1, v35, v58, s6
 ; GFX9-NEXT: v_perm_b32 v2, v62, v57, s6
@@ -17294,6 +17290,10 @@ define <28 x float> @bitcast_v56i16_to_v28f32(<56 x i16> %a, i32 %b) {
 ; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
 ; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
 ; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: s_waitcnt vmcnt(14)
+; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6
+; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_waitcnt vmcnt(15)
 ; GFX9-NEXT: v_perm_b32 v9, v9, v42, s6
 ; GFX9-NEXT: s_waitcnt vmcnt(14)
@@ -17322,10 +17322,6 @@ define <28 x float> @bitcast_v56i16_to_v28f32(<56 x i16> %a, i32 %b) {
 ; GFX9-NEXT: v_perm_b32 v21, v21, v38, s6
 ; GFX9-NEXT: s_waitcnt vmcnt(2)
 ; GFX9-NEXT: v_perm_b32 v22, v22, v37, s6
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6
-; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
 ; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
 ; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
 ; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
@@ -19455,21 +19451,7 @@ define <56 x half> @bitcast_v28f32_to_v56f16(<28 x float> %a, i32 %b) {
 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0
 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT: v_or_b32_e32 v1, v2, v1
-; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v33
 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v45
-; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_or_b32_e32 v1, v2, v1
-; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v35
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v29
-; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_or_b32_e32 v1, v2, v1
-; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
 ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
 ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
@@ -19486,6 +19468,20 @@ define <56 x half> @bitcast_v28f32_to_v56f16(<28 x float> %a, i32 %b) {
 ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
 ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
 ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v33
+; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0
+; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_or_b32_e32 v1, v2, v1
+; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v35
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v29
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_or_b32_e32 v1, v2, v1
+; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -20473,28 +20469,12 @@ define inreg <56 x half> @bitcast_v28f32_to_v56f16_scalar(<28 x float> inreg %a,
 ; SI-NEXT: v_add_i32_e32 v6, vcc, 0x60, v0
 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
 ; SI-NEXT: v_or_b32_e32 v4, v5, v4
-; SI-NEXT: v_cvt_f16_f32_e32 v3, v43
; SI-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v3, v43 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v4, v57 -; SI-NEXT: v_add_i32_e32 v5, vcc, 0x64, v0 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v41 -; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v52 -; SI-NEXT: v_add_i32_e32 v4, vcc, 0x68, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 -; SI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v38 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -20511,6 +20491,22 @@ define inreg <56 x half> @bitcast_v28f32_to_v56f16_scalar(<28 x float> inreg %a, ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x64, v0 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v3, v52 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x68, v0 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v38 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB33_4: @@ -22343,7 +22339,6 @@ define <28 x float> @bitcast_v56f16_to_v28f32(<56 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v59, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v27 -; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v26 @@ -22416,6 +22411,7 @@ define <28 x float> @bitcast_v56f16_to_v28f32(<56 x half> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v58 ; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v59 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 +; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -22564,6 +22560,9 @@ define <28 x float> 
@bitcast_v56f16_to_v28f32(<56 x half> %a, i32 %b) { ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB34_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload @@ -22578,9 +22577,6 @@ define <28 x float> @bitcast_v56f16_to_v28f32(<56 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v36, v59, s6 ; GFX9-NEXT: s_movk_i32 s7, 0x200 ; GFX9-NEXT: v_perm_b32 v1, v35, v58, s6 @@ -22600,6 +22596,10 @@ define <28 x float> @bitcast_v56f16_to_v28f32(<56 x half> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_f16 v6, v6, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v7, v7, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v8, v8, s7 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_perm_b32 v9, v9, v42, s6 ; GFX9-NEXT: s_waitcnt vmcnt(14) @@ -22628,10 +22628,6 @@ define <28 x float> @bitcast_v56f16_to_v28f32(<56 x half> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v21, v21, v38, s6 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_perm_b32 v22, v22, v37, s6 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; GFX9-NEXT: v_pk_add_f16 v9, v9, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v10, v10, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v11, v11, s7 op_sel_hi:[1,0] @@ -22931,7 +22927,20 @@ define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v8, s26 ; SI-NEXT: v_cvt_f16_f32_e32 v6, s29 ; SI-NEXT: v_cvt_f16_f32_e32 v7, s28 -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, 
off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -22939,7 +22948,6 @@ define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a, ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v51 -; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v61 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill @@ -22971,19 +22979,6 @@ define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v0, v43 ; SI-NEXT: v_cvt_f16_f32_e32 v43, s17 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB35_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 @@ -23000,11 +22995,11 @@ define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a, ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_mov_b32_e32 v48, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_mov_b32_e32 v61, v44 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v43 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v54 ; SI-NEXT: v_mov_b32_e32 v39, v11 @@ -23082,6 +23077,7 @@ define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a, ; SI-NEXT: v_or_b32_e32 v27, v50, v27 ; SI-NEXT: s_cbranch_execnz .LBB35_3 ; SI-NEXT: .LBB35_2: ; %cmp.true +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v43 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v54 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v55 @@ -23100,7 +23096,6 @@ define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a, ; SI-NEXT: v_or_b32_e32 
v1, v3, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v49 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v39 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v4, v33 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v47 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 @@ -23368,7 +23363,6 @@ define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a, ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(6) ; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload @@ -25329,9 +25323,18 @@ define <56 x i16> @bitcast_v14i64_to_v56i16(<14 x i64> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -25431,15 +25434,6 @@ define <56 x i16> @bitcast_v14i64_to_v56i16(<14 x i64> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -27080,6 +27074,11 @@ define <14 x i64> @bitcast_v56i16_to_v14i64(<56 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v56i16_to_v14i64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v2, 
off, s[0:3], s32 offset:92 +; SI-NEXT: v_mov_b32_e32 v53, v4 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill @@ -27109,11 +27108,6 @@ define <14 x i64> @bitcast_v56i16_to_v14i64(<56 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 -; SI-NEXT: v_mov_b32_e32 v53, v4 ; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v3 ; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v5 @@ -27130,9 +27124,8 @@ define <14 x i64> @bitcast_v56i16_to_v14i64(<56 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v29 ; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v0 -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 @@ -27828,7 +27821,6 @@ define <14 x i64> @bitcast_v56i16_to_v14i64(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v59, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v27 -; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v26 @@ -27901,6 +27893,7 @@ define <14 x i64> @bitcast_v56i16_to_v14i64(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v58 ; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v59 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 +; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -28049,6 +28042,9 @@ define <14 x i64> @bitcast_v56i16_to_v14i64(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB42_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload @@ -28063,9 +28059,6 @@ define <14 x i64> @bitcast_v56i16_to_v14i64(<56 x i16> 
%a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v36, v59, s6 ; GFX9-NEXT: v_perm_b32 v1, v35, v58, s6 ; GFX9-NEXT: v_perm_b32 v2, v62, v57, s6 @@ -28084,6 +28077,10 @@ define <14 x i64> @bitcast_v56i16_to_v14i64(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_perm_b32 v9, v9, v42, s6 ; GFX9-NEXT: s_waitcnt vmcnt(14) @@ -28112,10 +28109,6 @@ define <14 x i64> @bitcast_v56i16_to_v14i64(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v21, v21, v38, s6 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_perm_b32 v22, v22, v37, s6 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] @@ -30245,21 +30238,7 @@ define <56 x half> @bitcast_v14i64_to_v56f16(<14 x i64> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v33 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v45 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v29 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -30276,6 +30255,20 @@ define <56 x half> @bitcast_v14i64_to_v56f16(<14 x i64> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v33 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; 
SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v29 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -31172,6 +31165,15 @@ define inreg <56 x half> @bitcast_v14i64_to_v56f16_scalar(<14 x i64> inreg %a, i ; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 ; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 ; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload ; SI-NEXT: buffer_store_dword v53, v55, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v53, vcc, 24, v0 @@ -31317,15 +31319,6 @@ define inreg <56 x half> @bitcast_v14i64_to_v56f16_scalar(<14 x i64> inreg %a, i ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB45_4: @@ -32989,7 +32982,6 @@ define <14 x i64> @bitcast_v56f16_to_v14i64(<56 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v59, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v27 -; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v26 @@ -33062,6 +33054,7 @@ define <14 x i64> @bitcast_v56f16_to_v14i64(<56 x half> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v58 ; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v59 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 +; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -33210,6 +33203,9 @@ define <14 x i64> @bitcast_v56f16_to_v14i64(<56 x half> %a, i32 %b) { ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB46_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload @@ -33224,9 +33220,6 @@ define <14 x i64> @bitcast_v56f16_to_v14i64(<56 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v36, v59, s6 ; GFX9-NEXT: s_movk_i32 s7, 0x200 ; GFX9-NEXT: v_perm_b32 v1, v35, v58, s6 @@ -33246,6 +33239,10 @@ define <14 x i64> @bitcast_v56f16_to_v14i64(<56 x half> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_f16 v6, v6, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v7, v7, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v8, v8, s7 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_perm_b32 v9, v9, v42, s6 ; GFX9-NEXT: s_waitcnt vmcnt(14) @@ -33274,10 +33271,6 @@ define <14 x i64> @bitcast_v56f16_to_v14i64(<56 x half> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v21, v21, v38, s6 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_perm_b32 v22, v22, v37, s6 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; GFX9-NEXT: v_pk_add_f16 v9, v9, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v10, v10, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v11, v11, s7 op_sel_hi:[1,0] @@ -33577,7 +33570,20 @@ define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v8, s26 ; SI-NEXT: v_cvt_f16_f32_e32 v6, s29 ; SI-NEXT: v_cvt_f16_f32_e32 v7, s28 -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill 
+; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -33585,7 +33591,6 @@ define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v51 -; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v61 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill @@ -33617,19 +33622,6 @@ define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v0, v43 ; SI-NEXT: v_cvt_f16_f32_e32 v43, s17 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB47_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 @@ -33646,11 +33638,11 @@ define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_mov_b32_e32 v48, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_mov_b32_e32 v61, v44 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v43 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v54 ; SI-NEXT: v_mov_b32_e32 v39, v11 @@ -33728,6 +33720,7 @@ define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v27, v50, v27 ; SI-NEXT: s_cbranch_execnz .LBB47_3 ; SI-NEXT: 
.LBB47_2: ; %cmp.true +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v43 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v54 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v55 @@ -33746,7 +33739,6 @@ define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v1, v3, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v49 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v39 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v4, v33 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v47 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 @@ -34014,7 +34006,6 @@ define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(6) ; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload @@ -35112,9 +35103,18 @@ define <56 x i16> @bitcast_v14f64_to_v56i16(<14 x double> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -35214,15 +35214,6 @@ define <56 x i16> @bitcast_v14f64_to_v56i16(<14 x double> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -36018,9 +36009,21 @@ define inreg <56 x i16> 
@bitcast_v14f64_to_v56i16_scalar(<14 x double> inreg %a, ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -36048,18 +36051,6 @@ define inreg <56 x i16> @bitcast_v14f64_to_v56i16_scalar(<14 x double> inreg %a, ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB49_4: @@ -36898,6 +36889,11 @@ define <14 x double> @bitcast_v56i16_to_v14f64(<56 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v56i16_to_v14f64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 +; SI-NEXT: v_mov_b32_e32 v53, v4 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill @@ -36927,11 +36923,6 @@ define <14 x double> 
@bitcast_v56i16_to_v14f64(<56 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 -; SI-NEXT: v_mov_b32_e32 v53, v4 ; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v3 ; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v5 @@ -36948,9 +36939,8 @@ define <14 x double> @bitcast_v56i16_to_v14f64(<56 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v29 ; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v0 -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 @@ -37646,7 +37636,6 @@ define <14 x double> @bitcast_v56i16_to_v14f64(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v59, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v27 -; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v26 @@ -37719,6 +37708,7 @@ define <14 x double> @bitcast_v56i16_to_v14f64(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v58 ; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v59 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 +; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -37867,6 +37857,9 @@ define <14 x double> @bitcast_v56i16_to_v14f64(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB50_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload @@ -37881,9 +37874,6 @@ define <14 x double> @bitcast_v56i16_to_v14f64(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 
4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v36, v59, s6 ; GFX9-NEXT: v_perm_b32 v1, v35, v58, s6 ; GFX9-NEXT: v_perm_b32 v2, v62, v57, s6 @@ -37902,6 +37892,10 @@ define <14 x double> @bitcast_v56i16_to_v14f64(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_perm_b32 v9, v9, v42, s6 ; GFX9-NEXT: s_waitcnt vmcnt(14) @@ -37930,10 +37924,6 @@ define <14 x double> @bitcast_v56i16_to_v14f64(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v21, v21, v38, s6 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_perm_b32 v22, v22, v37, s6 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] @@ -40041,14 +40031,7 @@ define <56 x half> @bitcast_v14f64_to_v56f16(<14 x double> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v42 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -40065,6 +40048,13 @@ define <56 x half> @bitcast_v14f64_to_v56f16(<14 x double> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -41035,11 +41025,6 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v21 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: 
buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -41056,6 +41041,11 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v2, v21 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB53_4: @@ -42860,7 +42850,6 @@ define <14 x double> @bitcast_v56f16_to_v14f64(<56 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v59, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v27 -; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v26 @@ -42933,6 +42922,7 @@ define <14 x double> @bitcast_v56f16_to_v14f64(<56 x half> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v58 ; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v59 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 +; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -43081,6 +43071,9 @@ define <14 x double> @bitcast_v56f16_to_v14f64(<56 x half> %a, i32 %b) { ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB54_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload @@ -43095,9 +43088,6 @@ define <14 x double> @bitcast_v56f16_to_v14f64(<56 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v36, v59, s6 ; GFX9-NEXT: s_movk_i32 s7, 0x200 ; GFX9-NEXT: v_perm_b32 v1, v35, v58, s6 @@ -43117,6 +43107,10 @@ define <14 x double> @bitcast_v56f16_to_v14f64(<56 x half> %a, i32 %b) { ; GFX9-NEXT: 
v_pk_add_f16 v6, v6, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v7, v7, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v8, v8, s7 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_perm_b32 v9, v9, v42, s6 ; GFX9-NEXT: s_waitcnt vmcnt(14) @@ -43145,10 +43139,6 @@ define <14 x double> @bitcast_v56f16_to_v14f64(<56 x half> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v21, v21, v38, s6 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_perm_b32 v22, v22, v37, s6 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; GFX9-NEXT: v_pk_add_f16 v9, v9, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v10, v10, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v11, v11, s7 op_sel_hi:[1,0] @@ -43448,7 +43438,20 @@ define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a ; SI-NEXT: v_cvt_f16_f32_e32 v8, s26 ; SI-NEXT: v_cvt_f16_f32_e32 v6, s29 ; SI-NEXT: v_cvt_f16_f32_e32 v7, s28 -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -43456,7 +43459,6 @@ define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v51 -; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v61 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill @@ -43488,19 +43490,6 @@ define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a ; SI-NEXT: v_cvt_f16_f32_e32 v0, v43 ; SI-NEXT: v_cvt_f16_f32_e32 v43, s17 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:192 ; 
4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB55_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 @@ -43517,11 +43506,11 @@ define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_mov_b32_e32 v48, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_mov_b32_e32 v61, v44 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v43 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v54 ; SI-NEXT: v_mov_b32_e32 v39, v11 @@ -43599,6 +43588,7 @@ define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a ; SI-NEXT: v_or_b32_e32 v27, v50, v27 ; SI-NEXT: s_cbranch_execnz .LBB55_3 ; SI-NEXT: .LBB55_2: ; %cmp.true +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v43 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v54 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v55 @@ -43617,7 +43607,6 @@ define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a ; SI-NEXT: v_or_b32_e32 v1, v3, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v49 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v39 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v4, v33 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v47 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 @@ -43885,7 +43874,6 @@ define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(6) ; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload @@ -45395,8 +45383,24 @@ define <56 x half> @bitcast_v56i16_to_v56f16(<56 x i16> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 ; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: 
buffer_load_dword v59, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen @@ -45662,22 +45666,6 @@ define <56 x half> @bitcast_v56i16_to_v56f16(<56 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -47041,6 +47029,22 @@ define inreg <56 x half> @bitcast_v56i16_to_v56f16_scalar(<56 x i16> inreg %a, i ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v34 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:68 ; 4-byte Folded 
Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 @@ -47061,22 +47065,6 @@ define inreg <56 x half> @bitcast_v56i16_to_v56f16_scalar(<56 x i16> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -48513,6 +48501,22 @@ define <56 x i16> @bitcast_v56f16_to_v56i16(<56 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v60 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:132 ; 4-byte Folded 
Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v27 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 @@ -48535,22 +48539,6 @@ define <56 x i16> @bitcast_v56f16_to_v56i16(<56 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -49691,6 +49679,22 @@ define inreg <56 x i16> @bitcast_v56f16_to_v56i16_scalar(<56 x half> inreg %a, i ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v44 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v62 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:84 ; 4-byte Folded 
Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen @@ -49712,22 +49716,6 @@ define inreg <56 x i16> @bitcast_v56f16_to_v56i16_scalar(<56 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB59_4: diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll index 93c11f13ce3ce..4175d5f2de73d 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll @@ -3108,9 +3108,22 @@ define <60 x i16> @bitcast_v30i32_to_v60i16(<30 x i32> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40 +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded 
Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -3186,19 +3199,6 @@ define <60 x i16> @bitcast_v30i32_to_v60i16(<30 x i32> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -5032,40 +5032,53 @@ define <30 x i32> @bitcast_v60i16_to_v30i32(<60 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:4 ; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v2 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v4 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:52 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:112 ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:108 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v6 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v8 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:192 ; 4-byte Folded 
Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:104 ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:100 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v10 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v8 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:96 ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:92 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v12 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v10 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:88 ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:84 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v12 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v14 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:48 @@ -5096,27 +5109,10 @@ define <30 x i32> @bitcast_v60i16_to_v30i32(<60 x i16> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:56 ; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:52 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v28 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v30 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:36 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v30 ; SI-NEXT: ; implicit-def: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -5201,7 +5197,6 @@ define <30 x i32> @bitcast_v60i16_to_v30i32(<60 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; kill: killed $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 ; SI-NEXT: v_and_b32_e32 v18, 0xffff, v61 ; SI-NEXT: ; kill: killed $vgpr30 @@ -5346,7 +5341,6 @@ define <30 x i32> @bitcast_v60i16_to_v30i32(<60 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 ; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v61 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 @@ -5494,7 +5488,7 @@ define <30 x i32> @bitcast_v60i16_to_v30i32(<60 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v60i16_to_v30i32: @@ -5776,7 +5770,6 @@ define <30 x i32> @bitcast_v60i16_to_v30i32(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v61, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v29 -; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v28 @@ -5855,6 +5848,7 @@ define <30 x i32> @bitcast_v60i16_to_v30i32(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v60 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 ; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v61 +; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -6019,6 +6013,9 @@ define <30 x i32> @bitcast_v60i16_to_v30i32(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB14_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload @@ 
-6033,9 +6030,6 @@ define <30 x i32> @bitcast_v60i16_to_v30i32(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v38, v61, s6 ; GFX9-NEXT: v_perm_b32 v1, v37, v60, s6 ; GFX9-NEXT: v_perm_b32 v2, v62, v59, s6 @@ -6054,6 +6048,10 @@ define <30 x i32> @bitcast_v60i16_to_v30i32(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_perm_b32 v9, v9, v44, s6 ; GFX9-NEXT: s_waitcnt vmcnt(14) @@ -6082,10 +6080,6 @@ define <30 x i32> @bitcast_v60i16_to_v30i32(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v21, v21, v48, s6 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_perm_b32 v22, v22, v39, s6 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] @@ -8387,11 +8381,6 @@ define <60 x half> @bitcast_v30i32_to_v60f16(<30 x i32> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v32 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload @@ -8408,6 +8397,11 @@ define <60 x half> @bitcast_v30i32_to_v60f16(<30 x i32> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v2, v32 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -9393,11 +9387,24 @@ define inreg <60 x half> @bitcast_v30i32_to_v60f16_scalar(<30 x i32> inreg %a, i ; SI-NEXT: v_add_i32_e32 v39, vcc, 44, v0 ; SI-NEXT: v_or_b32_e32 v36, v38, v36 ; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, 
off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; SI-NEXT: buffer_store_dword v36, v39, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v36, v37 -; SI-NEXT: v_add_i32_e32 v37, vcc, 48, v0 ; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; SI-NEXT: v_add_i32_e32 v37, vcc, 48, v0 ; SI-NEXT: v_or_b32_e32 v34, v36, v34 ; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 ; SI-NEXT: buffer_store_dword v34, v37, s[0:3], 0 offen @@ -9519,19 +9526,6 @@ define inreg <60 x half> @bitcast_v30i32_to_v60f16_scalar(<30 x i32> inreg %a, i ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB17_4: @@ -10345,6 +10339,9 @@ define <30 x i32> @bitcast_v60f16_to_v30i32(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_cvt_f16_f32_e32 v58, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:116 ; SI-NEXT: s_waitcnt expcnt(3) ; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 @@ -10373,23 +10370,12 @@ define <30 x i32> @bitcast_v60f16_to_v30i32(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:76 ; SI-NEXT: 
buffer_load_dword v41, off, s[0:3], s32 offset:88 ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:84 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v59, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v57, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v2 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:96 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:104 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:112 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v2 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v7 @@ -10399,8 +10385,6 @@ define <30 x i32> @bitcast_v60f16_to_v30i32(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v9 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v8 @@ -10422,9 +10406,18 @@ define <30 x i32> @bitcast_v60f16_to_v30i32(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v60 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 @@ -10434,6 +10427,7 @@ define <30 x i32> @bitcast_v60f16_to_v30i32(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 @@ -10471,7 +10465,6 @@ define <30 x i32> @bitcast_v60f16_to_v30i32(<60 x half> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v61 ; SI-NEXT: v_cvt_f16_f32_e32 v61, v49 -; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f16_f32_e32 v49, v55 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -10486,6 +10479,7 @@ define <30 x i32> @bitcast_v60f16_to_v30i32(<60 x half> %a, i32 %b) { ; 
SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v32 +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f16_f32_e32 v32, v47 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -11357,7 +11351,6 @@ define <30 x i32> @bitcast_v60f16_to_v30i32(<60 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v61, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v29 -; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v28 @@ -11436,6 +11429,7 @@ define <30 x i32> @bitcast_v60f16_to_v30i32(<60 x half> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v60 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 ; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v61 +; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -11600,6 +11594,9 @@ define <30 x i32> @bitcast_v60f16_to_v30i32(<60 x half> %a, i32 %b) { ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB18_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload @@ -11614,9 +11611,6 @@ define <30 x i32> @bitcast_v60f16_to_v30i32(<60 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v38, v61, s6 ; GFX9-NEXT: s_movk_i32 s7, 0x200 ; GFX9-NEXT: v_perm_b32 v1, v37, v60, s6 @@ -11636,6 +11630,10 @@ define <30 x i32> @bitcast_v60f16_to_v30i32(<60 x half> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_f16 v6, v6, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v7, v7, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v8, v8, s7 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_perm_b32 v9, v9, v44, s6 ; GFX9-NEXT: s_waitcnt vmcnt(14) 
@@ -11664,10 +11662,6 @@ define <30 x i32> @bitcast_v60f16_to_v30i32(<60 x half> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v21, v21, v48, s6 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_perm_b32 v22, v22, v39, s6 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; GFX9-NEXT: v_pk_add_f16 v9, v9, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v10, v10, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v11, v11, s7 op_sel_hi:[1,0] @@ -11988,12 +11982,35 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v8, s26 ; SI-NEXT: v_cvt_f16_f32_e32 v6, s29 ; SI-NEXT: v_cvt_f16_f32_e32 v7, s28 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 ; SI-NEXT: v_cvt_f16_f32_e32 v50, v54 ; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 ; SI-NEXT: v_cvt_f16_f32_e32 v31, v40 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v33 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill @@ -12003,7 +12020,7 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v38 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt 
vmcnt(14) expcnt(0) +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v44 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -12012,7 +12029,7 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v46 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v47 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -12021,7 +12038,7 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v57 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v58 ; SI-NEXT: v_cvt_f16_f32_e32 v58, s16 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill @@ -12032,38 +12049,12 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v60 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB19_2 ; SI-NEXT: ; %bb.1: ; 
%cmp.false ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v10, v3 -; SI-NEXT: s_waitcnt expcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v34 ; SI-NEXT: v_mov_b32_e32 v33, v32 ; SI-NEXT: v_or_b32_e32 v10, v32, v10 @@ -12088,12 +12079,12 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v59 ; SI-NEXT: v_or_b32_e32 v1, v12, v1 ; SI-NEXT: v_or_b32_e32 v2, v11, v2 @@ -12202,12 +12193,10 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i ; SI-NEXT: v_mov_b32_e32 v40, v44 ; SI-NEXT: s_cbranch_vccnz .LBB19_5 ; SI-NEXT: ; %bb.4: ; %cmp.true -; SI-NEXT: s_waitcnt expcnt(5) ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(4) ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v0, v59 ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v59 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v58 ; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cvt_f32_f16_e32 v8, v33 @@ -15531,9 +15520,22 @@ define <60 x i16> @bitcast_v30f32_to_v60i16(<30 x float> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40 +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -15609,19 +15611,6 @@ define <60 x i16> @bitcast_v30f32_to_v60i16(<30 x float> %a, i32 %b) { ; SI-NEXT: 
v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -16552,12 +16541,7 @@ define inreg <60 x i16> @bitcast_v30f32_to_v60i16_scalar(<30 x float> inreg %a, ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -16573,6 +16557,11 @@ define inreg <60 x i16> @bitcast_v30f32_to_v60i16_scalar(<30 x float> inreg %a, ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB29_4: @@ -17570,40 +17559,53 @@ define <30 x float> @bitcast_v60i16_to_v30f32(<60 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:4 ; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v2 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v4 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:52 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:112 ; SI-NEXT: buffer_load_dword v10, off, 
s[0:3], s32 offset:108 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v6 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v8 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:104 ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:100 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v10 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v8 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:96 ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:92 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v12 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v10 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:88 ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:84 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v12 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v14 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:48 @@ -17634,27 +17636,10 @@ define <30 x float> @bitcast_v60i16_to_v30f32(<60 x i16> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:56 ; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:52 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v28 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v30 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:36 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: 
buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v30 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -17739,7 +17724,6 @@ define <30 x float> @bitcast_v60i16_to_v30f32(<60 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; kill: killed $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 ; SI-NEXT: v_and_b32_e32 v18, 0xffff, v61 ; SI-NEXT: ; kill: killed $vgpr30 @@ -17884,7 +17868,6 @@ define <30 x float> @bitcast_v60i16_to_v30f32(<60 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 ; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v61 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 @@ -18032,7 +18015,7 @@ define <30 x float> @bitcast_v60i16_to_v30f32(<60 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v60i16_to_v30f32: @@ -18314,7 +18297,6 @@ define <30 x float> @bitcast_v60i16_to_v30f32(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v61, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v29 -; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v28 @@ -18393,6 +18375,7 @@ define <30 x float> @bitcast_v60i16_to_v30f32(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v60 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 ; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v61 +; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -18557,6 +18540,9 @@ define <30 x float> @bitcast_v60i16_to_v30f32(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB30_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-NEXT: 
buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload @@ -18571,9 +18557,6 @@ define <30 x float> @bitcast_v60i16_to_v30f32(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v38, v61, s6 ; GFX9-NEXT: v_perm_b32 v1, v37, v60, s6 ; GFX9-NEXT: v_perm_b32 v2, v62, v59, s6 @@ -18592,6 +18575,10 @@ define <30 x float> @bitcast_v60i16_to_v30f32(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_perm_b32 v9, v9, v44, s6 ; GFX9-NEXT: s_waitcnt vmcnt(14) @@ -18620,10 +18607,6 @@ define <30 x float> @bitcast_v60i16_to_v30f32(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v21, v21, v48, s6 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_perm_b32 v22, v22, v39, s6 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] @@ -20925,11 +20908,6 @@ define <60 x half> @bitcast_v30f32_to_v60f16(<30 x float> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v32 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload @@ -20946,6 +20924,11 @@ define <60 x half> @bitcast_v30f32_to_v60f16(<30 x float> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v2, v32 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -21861,7 +21844,7 @@ define inreg <60 x half> @bitcast_v30f32_to_v60f16_scalar(<30 x float> inreg %a, ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v4, v46 ; SI-NEXT: v_add_i32_e32 v6, vcc, 12, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v42 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 ; SI-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen @@ -22028,24 +22011,7 @@ define inreg <60 x half> @bitcast_v30f32_to_v60f16_scalar(<30 x float> inreg %a, ; SI-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v4, v45 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_add_i32_e32 v5, vcc, 0x6c, v0 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v42 -; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v38 -; SI-NEXT: v_add_i32_e32 v4, vcc, 0x70, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v59 -; SI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v14 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -22062,6 +22028,23 @@ define inreg <60 x half> @bitcast_v30f32_to_v60f16_scalar(<30 x float> inreg %a, ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v3, v54 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x6c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v3, v38 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x70, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v14 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB33_4: @@ -23044,6 +23027,9 @@ define <30 x float> @bitcast_v60f16_to_v30f32(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_cvt_f16_f32_e32 v58, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:116 ; SI-NEXT: s_waitcnt expcnt(3) ; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 @@ -23072,23 +23058,12 @@ define <30 x float> @bitcast_v60f16_to_v30f32(<60 x half> %a, i32 %b) { ; 
SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:76 ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:88 ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:84 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v59, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v57, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v2 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:96 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:104 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:112 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v2 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v7 @@ -23098,8 +23073,6 @@ define <30 x float> @bitcast_v60f16_to_v30f32(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v9 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v8 @@ -23121,9 +23094,18 @@ define <30 x float> @bitcast_v60f16_to_v30f32(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v60 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 @@ -23133,6 +23115,7 @@ define <30 x float> @bitcast_v60f16_to_v30f32(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 @@ -23170,7 +23153,6 @@ define <30 x float> @bitcast_v60f16_to_v30f32(<60 x half> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v61 ; SI-NEXT: v_cvt_f16_f32_e32 v61, v49 -; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f16_f32_e32 v49, v55 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -23185,6 +23167,7 
@@ define <30 x float> @bitcast_v60f16_to_v30f32(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v32 +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f16_f32_e32 v32, v47 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -24056,7 +24039,6 @@ define <30 x float> @bitcast_v60f16_to_v30f32(<60 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v61, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v29 -; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v28 @@ -24135,6 +24117,7 @@ define <30 x float> @bitcast_v60f16_to_v30f32(<60 x half> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v60 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 ; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v61 +; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -24299,6 +24282,9 @@ define <30 x float> @bitcast_v60f16_to_v30f32(<60 x half> %a, i32 %b) { ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB34_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload @@ -24313,9 +24299,6 @@ define <30 x float> @bitcast_v60f16_to_v30f32(<60 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v38, v61, s6 ; GFX9-NEXT: s_movk_i32 s7, 0x200 ; GFX9-NEXT: v_perm_b32 v1, v37, v60, s6 @@ -24335,6 +24318,10 @@ define <30 x float> @bitcast_v60f16_to_v30f32(<60 x half> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_f16 v6, v6, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v7, v7, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v8, v8, s7 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; GFX9-NEXT: 
s_waitcnt vmcnt(15) ; GFX9-NEXT: v_perm_b32 v9, v9, v44, s6 ; GFX9-NEXT: s_waitcnt vmcnt(14) @@ -24363,10 +24350,6 @@ define <30 x float> @bitcast_v60f16_to_v30f32(<60 x half> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v21, v21, v48, s6 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_perm_b32 v22, v22, v39, s6 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; GFX9-NEXT: v_pk_add_f16 v9, v9, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v10, v10, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v11, v11, s7 op_sel_hi:[1,0] @@ -24687,50 +24670,6 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v8, s26 ; SI-NEXT: v_cvt_f16_f32_e32 v6, s29 ; SI-NEXT: v_cvt_f16_f32_e32 v7, s28 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v40 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v33 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v42 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v38 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v44 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v45 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v46 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v47 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v56 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v57 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v58 -; SI-NEXT: v_cvt_f16_f32_e32 v58, s16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v59, s17 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v60 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill @@ -24755,14 +24694,55 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a, ; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 
offset:280 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v33 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v42 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v38 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v44 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v45 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v46 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v47 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v56 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v57 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v58, s16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v59, s17 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v60 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB35_2 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v10, v3 -; SI-NEXT: s_waitcnt expcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v34 ; SI-NEXT: v_mov_b32_e32 v33, v32 ; SI-NEXT: v_or_b32_e32 v10, v32, v10 @@ -24787,12 +24767,12 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a, ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v59 ; SI-NEXT: v_or_b32_e32 v1, v12, v1 ; SI-NEXT: 
v_or_b32_e32 v2, v11, v2 @@ -24901,12 +24881,10 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a, ; SI-NEXT: v_mov_b32_e32 v40, v44 ; SI-NEXT: s_cbranch_vccnz .LBB35_5 ; SI-NEXT: ; %bb.4: ; %cmp.true -; SI-NEXT: s_waitcnt expcnt(5) ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(4) ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v0, v59 ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v59 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v58 ; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cvt_f32_f16_e32 v8, v33 @@ -27300,9 +27278,22 @@ define <60 x i16> @bitcast_v15i64_to_v60i16(<15 x i64> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40 +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -27378,19 +27369,6 @@ define <60 x i16> @bitcast_v15i64_to_v60i16(<15 x i64> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: 
buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -29240,40 +29218,53 @@ define <15 x i64> @bitcast_v60i16_to_v15i64(<60 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:4 ; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v2 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v4 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:52 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:112 ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:108 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v6 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v8 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:104 ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:100 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v10 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v8 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:96 ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:92 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v12 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v10 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:88 ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:84 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v12 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v14 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:48 @@ -29304,27 +29295,10 @@ define <15 x i64> 
@bitcast_v60i16_to_v15i64(<60 x i16> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:56 ; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:52 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v28 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v30 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:36 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v30 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -29409,7 +29383,6 @@ define <15 x i64> @bitcast_v60i16_to_v15i64(<60 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; kill: killed $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 ; SI-NEXT: v_and_b32_e32 v18, 0xffff, v61 ; SI-NEXT: ; kill: killed $vgpr30 @@ -29554,7 +29527,6 @@ define <15 x i64> @bitcast_v60i16_to_v15i64(<60 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 ; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v61 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 @@ -29702,7 +29674,7 @@ define <15 x i64> @bitcast_v60i16_to_v15i64(<60 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v60i16_to_v15i64: @@ -29984,7 +29956,6 @@ define <15 x i64> @bitcast_v60i16_to_v15i64(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v61, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v29 -; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 
16, v28 @@ -30063,6 +30034,7 @@ define <15 x i64> @bitcast_v60i16_to_v15i64(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v60 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 ; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v61 +; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -30227,6 +30199,9 @@ define <15 x i64> @bitcast_v60i16_to_v15i64(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB42_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload @@ -30241,9 +30216,6 @@ define <15 x i64> @bitcast_v60i16_to_v15i64(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v38, v61, s6 ; GFX9-NEXT: v_perm_b32 v1, v37, v60, s6 ; GFX9-NEXT: v_perm_b32 v2, v62, v59, s6 @@ -30262,6 +30234,10 @@ define <15 x i64> @bitcast_v60i16_to_v15i64(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_perm_b32 v9, v9, v44, s6 ; GFX9-NEXT: s_waitcnt vmcnt(14) @@ -30290,10 +30266,6 @@ define <15 x i64> @bitcast_v60i16_to_v15i64(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v21, v21, v48, s6 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_perm_b32 v22, v22, v39, s6 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] @@ -32596,11 +32568,6 @@ define <60 x half> @bitcast_v15i64_to_v60f16(<15 x i64> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v32 
-; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload @@ -32617,6 +32584,11 @@ define <60 x half> @bitcast_v15i64_to_v60f16(<15 x i64> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v2, v32 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -33618,11 +33590,24 @@ define inreg <60 x half> @bitcast_v15i64_to_v60f16_scalar(<15 x i64> inreg %a, i ; SI-NEXT: v_add_i32_e32 v39, vcc, 44, v0 ; SI-NEXT: v_or_b32_e32 v36, v38, v36 ; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; SI-NEXT: buffer_store_dword v36, v39, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v36, v37 -; SI-NEXT: v_add_i32_e32 v37, vcc, 48, v0 ; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; SI-NEXT: v_add_i32_e32 v37, vcc, 48, v0 ; SI-NEXT: v_or_b32_e32 v34, v36, v34 ; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 ; SI-NEXT: buffer_store_dword v34, v37, s[0:3], 0 offen @@ -33744,19 +33729,6 @@ define inreg <60 x half> @bitcast_v15i64_to_v60f16_scalar(<15 x i64> inreg %a, i ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 
offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB45_4: @@ -34570,6 +34542,9 @@ define <15 x i64> @bitcast_v60f16_to_v15i64(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_cvt_f16_f32_e32 v58, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:116 ; SI-NEXT: s_waitcnt expcnt(3) ; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 @@ -34598,23 +34573,12 @@ define <15 x i64> @bitcast_v60f16_to_v15i64(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:76 ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:88 ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:84 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v59, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v57, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v2 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:96 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:104 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:112 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v2 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v7 @@ -34624,8 +34588,6 @@ define <15 x i64> @bitcast_v60f16_to_v15i64(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v9 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v8 @@ -34647,9 +34609,18 @@ define <15 x i64> @bitcast_v60f16_to_v15i64(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:96 +; SI-NEXT: 
buffer_load_dword v43, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v60 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 @@ -34659,6 +34630,7 @@ define <15 x i64> @bitcast_v60f16_to_v15i64(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 @@ -34696,7 +34668,6 @@ define <15 x i64> @bitcast_v60f16_to_v15i64(<60 x half> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v61 ; SI-NEXT: v_cvt_f16_f32_e32 v61, v49 -; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f16_f32_e32 v49, v55 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -34711,6 +34682,7 @@ define <15 x i64> @bitcast_v60f16_to_v15i64(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v32 +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f16_f32_e32 v32, v47 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -35582,7 +35554,6 @@ define <15 x i64> @bitcast_v60f16_to_v15i64(<60 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v61, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v29 -; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v28 @@ -35661,6 +35632,7 @@ define <15 x i64> @bitcast_v60f16_to_v15i64(<60 x half> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v60 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 ; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v61 +; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -35825,6 +35797,9 @@ define <15 x i64> @bitcast_v60f16_to_v15i64(<60 x half> %a, i32 %b) { ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB46_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 
offset:168 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload @@ -35839,9 +35814,6 @@ define <15 x i64> @bitcast_v60f16_to_v15i64(<60 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v38, v61, s6 ; GFX9-NEXT: s_movk_i32 s7, 0x200 ; GFX9-NEXT: v_perm_b32 v1, v37, v60, s6 @@ -35861,6 +35833,10 @@ define <15 x i64> @bitcast_v60f16_to_v15i64(<60 x half> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_f16 v6, v6, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v7, v7, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v8, v8, s7 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_perm_b32 v9, v9, v44, s6 ; GFX9-NEXT: s_waitcnt vmcnt(14) @@ -35889,10 +35865,6 @@ define <15 x i64> @bitcast_v60f16_to_v15i64(<60 x half> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v21, v21, v48, s6 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_perm_b32 v22, v22, v39, s6 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; GFX9-NEXT: v_pk_add_f16 v9, v9, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v10, v10, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v11, v11, s7 op_sel_hi:[1,0] @@ -36213,12 +36185,35 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v8, s26 ; SI-NEXT: v_cvt_f16_f32_e32 v6, s29 ; SI-NEXT: v_cvt_f16_f32_e32 v7, s28 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; 
SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 ; SI-NEXT: v_cvt_f16_f32_e32 v50, v54 ; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 ; SI-NEXT: v_cvt_f16_f32_e32 v31, v40 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v33 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill @@ -36228,7 +36223,7 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v38 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v44 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -36237,7 +36232,7 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v46 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v47 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -36246,7 +36241,7 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v57 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v58 ; SI-NEXT: v_cvt_f16_f32_e32 v58, s16 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill @@ -36257,38 +36252,12 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v60 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; 
SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB47_2 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v10, v3 -; SI-NEXT: s_waitcnt expcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v34 ; SI-NEXT: v_mov_b32_e32 v33, v32 ; SI-NEXT: v_or_b32_e32 v10, v32, v10 @@ -36313,12 +36282,12 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v59 ; SI-NEXT: v_or_b32_e32 v1, v12, v1 ; SI-NEXT: v_or_b32_e32 v2, v11, v2 @@ -36427,12 +36396,10 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i ; SI-NEXT: v_mov_b32_e32 v40, v44 ; SI-NEXT: s_cbranch_vccnz .LBB47_5 ; SI-NEXT: ; %bb.4: ; %cmp.true -; SI-NEXT: s_waitcnt expcnt(5) ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(4) ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v0, v59 ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v59 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v58 ; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cvt_f32_f16_e32 v8, v33 @@ -37922,9 +37889,22 @@ define <60 x i16> @bitcast_v15f64_to_v60i16(<15 x double> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40 +; SI-NEXT: buffer_load_dword v60, 
off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -38000,19 +37980,6 @@ define <60 x i16> @bitcast_v15f64_to_v60i16(<15 x double> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -38899,12 +38866,7 @@ define inreg <60 x i16> @bitcast_v15f64_to_v60i16_scalar(<15 x double> inreg %a, ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -38921,6 +38883,11 @@ define inreg <60 x i16> @bitcast_v15f64_to_v60i16_scalar(<15 x double> inreg %a, 
; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB49_4: @@ -39888,40 +39855,53 @@ define <15 x double> @bitcast_v60i16_to_v15f64(<60 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:4 ; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v2 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v4 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:52 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:112 ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:108 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v6 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v8 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:104 ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:100 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v10 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v8 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:96 ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:92 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v12 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v10 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:88 ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:84 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v12 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v0, off, 
s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v14 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:48 @@ -39952,27 +39932,10 @@ define <15 x double> @bitcast_v60i16_to_v15f64(<60 x i16> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:56 ; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:52 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v28 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v30 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:36 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v30 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -40057,7 +40020,6 @@ define <15 x double> @bitcast_v60i16_to_v15f64(<60 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; kill: killed $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 ; SI-NEXT: v_and_b32_e32 v18, 0xffff, v61 ; SI-NEXT: ; kill: killed $vgpr30 @@ -40202,7 +40164,6 @@ define <15 x double> @bitcast_v60i16_to_v15f64(<60 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 ; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v61 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 @@ -40350,7 +40311,7 @@ define <15 x double> @bitcast_v60i16_to_v15f64(<60 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v60i16_to_v15f64: @@ -40632,7 +40593,6 @@ define <15 x double> @bitcast_v60i16_to_v15f64(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword 
v63, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v61, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v29 -; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v28 @@ -40711,6 +40671,7 @@ define <15 x double> @bitcast_v60i16_to_v15f64(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v60 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 ; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v61 +; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -40875,6 +40836,9 @@ define <15 x double> @bitcast_v60i16_to_v15f64(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB50_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload @@ -40889,9 +40853,6 @@ define <15 x double> @bitcast_v60i16_to_v15f64(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v38, v61, s6 ; GFX9-NEXT: v_perm_b32 v1, v37, v60, s6 ; GFX9-NEXT: v_perm_b32 v2, v62, v59, s6 @@ -40910,6 +40871,10 @@ define <15 x double> @bitcast_v60i16_to_v15f64(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_perm_b32 v9, v9, v44, s6 ; GFX9-NEXT: s_waitcnt vmcnt(14) @@ -40938,10 +40903,6 @@ define <15 x double> @bitcast_v60i16_to_v15f64(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v21, v21, v48, s6 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_perm_b32 v22, v22, v39, s6 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte 
Folded Reload ; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] @@ -43173,8 +43134,24 @@ define <60 x half> @bitcast_v15f64_to_v60f16(<15 x double> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v62 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen @@ -43218,22 +43195,6 @@ define <60 x half> @bitcast_v15f64_to_v60f16(<15 x double> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload 
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -44289,11 +44250,6 @@ define inreg <60 x half> @bitcast_v15f64_to_v60f16_scalar(<15 x double> inreg %a ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v10 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -44310,6 +44266,11 @@ define inreg <60 x half> @bitcast_v15f64_to_v60f16_scalar(<15 x double> inreg %a ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v2, v10 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB53_4: @@ -45262,6 +45223,9 @@ define <15 x double> @bitcast_v60f16_to_v15f64(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_cvt_f16_f32_e32 v58, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:116 ; SI-NEXT: s_waitcnt expcnt(3) ; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 @@ -45290,23 +45254,12 @@ define <15 x double> @bitcast_v60f16_to_v15f64(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:76 ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:88 ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:84 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v59, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v57, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v2 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:96 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:104 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:112 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v2 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v7 @@ -45316,8 +45269,6 @@ define <15 x double> @bitcast_v60f16_to_v15f64(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: 
v_cvt_f16_f32_e32 v0, v9 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v8 @@ -45339,9 +45290,18 @@ define <15 x double> @bitcast_v60f16_to_v15f64(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v60 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 @@ -45351,6 +45311,7 @@ define <15 x double> @bitcast_v60f16_to_v15f64(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 @@ -45388,7 +45349,6 @@ define <15 x double> @bitcast_v60f16_to_v15f64(<60 x half> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v61 ; SI-NEXT: v_cvt_f16_f32_e32 v61, v49 -; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f16_f32_e32 v49, v55 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -45403,6 +45363,7 @@ define <15 x double> @bitcast_v60f16_to_v15f64(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v32 +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f16_f32_e32 v32, v47 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -46274,7 +46235,6 @@ define <15 x double> @bitcast_v60f16_to_v15f64(<60 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v61, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v29 -; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v28 @@ -46353,6 +46313,7 @@ define <15 x double> @bitcast_v60f16_to_v15f64(<60 x half> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v60 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 ; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v61 +; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -46517,6 +46478,9 @@ define <15 x double> @bitcast_v60f16_to_v15f64(<60 x half> %a, i32 %b) { ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB54_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload @@ -46531,9 +46495,6 @@ define <15 x double> @bitcast_v60f16_to_v15f64(<60 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v38, v61, s6 ; GFX9-NEXT: s_movk_i32 s7, 0x200 ; GFX9-NEXT: v_perm_b32 v1, v37, v60, s6 @@ -46553,6 +46514,10 @@ define <15 x double> @bitcast_v60f16_to_v15f64(<60 x half> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_f16 v6, v6, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v7, v7, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v8, v8, s7 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_perm_b32 v9, v9, v44, s6 ; GFX9-NEXT: s_waitcnt vmcnt(14) @@ -46581,10 +46546,6 @@ define <15 x double> @bitcast_v60f16_to_v15f64(<60 x half> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v21, v21, v48, s6 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_perm_b32 v22, v22, v39, s6 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; GFX9-NEXT: v_pk_add_f16 v9, v9, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v10, v10, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v11, v11, s7 op_sel_hi:[1,0] @@ -46905,50 +46866,6 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a ; SI-NEXT: v_cvt_f16_f32_e32 v8, s26 ; SI-NEXT: v_cvt_f16_f32_e32 v6, s29 ; SI-NEXT: v_cvt_f16_f32_e32 v7, s28 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v40 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v33 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v42 -; 
SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v38 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v44 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v45 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v46 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v47 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v56 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v57 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v58 -; SI-NEXT: v_cvt_f16_f32_e32 v58, s16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v59, s17 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v60 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill @@ -46973,14 +46890,55 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a ; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v33 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v42 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v38 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v44 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v45 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v46 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v47 
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v56 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v57 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v58, s16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v59, s17 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v60 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB55_2 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v10, v3 -; SI-NEXT: s_waitcnt expcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v34 ; SI-NEXT: v_mov_b32_e32 v33, v32 ; SI-NEXT: v_or_b32_e32 v10, v32, v10 @@ -47005,12 +46963,12 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v59 ; SI-NEXT: v_or_b32_e32 v1, v12, v1 ; SI-NEXT: v_or_b32_e32 v2, v11, v2 @@ -47119,12 +47077,10 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a ; SI-NEXT: v_mov_b32_e32 v40, v44 ; SI-NEXT: s_cbranch_vccnz .LBB55_5 ; SI-NEXT: ; %bb.4: ; %cmp.true -; SI-NEXT: s_waitcnt expcnt(5) ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(4) ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v0, v59 ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v59 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v58 ; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cvt_f32_f16_e32 v8, v33 @@ -49301,7 +49257,23 @@ define <60 x half> @bitcast_v60i16_to_v60f16(<60 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f16_f32_e32 v2, v47 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; 
SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 @@ -49339,22 +49311,6 @@ define <60 x half> @bitcast_v60i16_to_v60f16(<60 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -50856,6 +50812,22 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v55 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v50 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:92 ; 4-byte 
Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x6c, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 @@ -50876,22 +50848,6 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -51893,27 +51849,27 @@ define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v55, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v4 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v18 ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v40, v4 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v53, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v10 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 
4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v49, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v37, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v52, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v48, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v38, v16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 @@ -52448,18 +52404,7 @@ define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v39 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload @@ -52476,6 +52421,17 @@ define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v39 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -53259,6 +53215,8 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v44 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -53285,10 +53243,13 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v51, v11 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: 
v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 @@ -53300,8 +53261,26 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v3, v26 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v8 +; SI-NEXT: v_mov_b32_e32 v8, v48 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v44 +; SI-NEXT: v_lshr_b64 v[44:45], v[29:30], 16 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v43 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill @@ -53329,17 +53308,11 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v18, v3, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v37 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v16 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_mov_b32_e32 v51, v11 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v55, v5 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 @@ -53382,52 +53355,32 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v3, v6 ; SI-NEXT: v_lshr_b64 v[58:59], v[34:35], 16 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v8 -; SI-NEXT: v_mov_b32_e32 v8, v48 ; SI-NEXT: v_cvt_f16_f32_e32 v48, v5 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v48 -; SI-NEXT: v_or_b32_e32 v6, v3, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v4 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v31 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v4 +; 
SI-NEXT: v_lshlrev_b32_e32 v5, 16, v48 ; SI-NEXT: v_mov_b32_e32 v59, v48 -; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v38 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v60 -; SI-NEXT: v_or_b32_e32 v4, v3, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v43 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshr_b64 v[47:48], v[17:18], 16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshr_b64 v[44:45], v[29:30], 16 +; SI-NEXT: v_or_b32_e32 v6, v3, v5 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v24 ; SI-NEXT: v_cvt_f32_f16_e32 v24, v8 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v3, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v31 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v60 +; SI-NEXT: v_or_b32_e32 v4, v3, v4 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v20 ; SI-NEXT: v_cvt_f32_f16_e32 v20, v39 +; SI-NEXT: v_lshr_b64 v[47:48], v[17:18], 16 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v31, v20 @@ -53524,14 +53477,15 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i ; SI-NEXT: v_mov_b32_e32 v32, v41 ; SI-NEXT: v_lshr_b64 v[40:41], v[21:22], 16 ; SI-NEXT: v_lshr_b64 v[20:21], v[11:12], 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshr_b64 v[20:21], v[56:57], 16 ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: v_mov_b32_e32 v11, v24 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshr_b64 v[8:9], v[9:10], 16 ; SI-NEXT: v_mov_b32_e32 v39, v31 ; SI-NEXT: v_mov_b32_e32 v31, v60 @@ -53541,7 +53495,6 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i ; SI-NEXT: v_mov_b32_e32 v37, v55 ; SI-NEXT: v_lshr_b64 v[55:56], v[5:6], 16 ; SI-NEXT: v_lshr_b64 v[24:25], v[3:4], 16 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshr_b64 v[20:21], v[1:2], 16 ; SI-NEXT: .LBB59_3: ; %end ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v58 @@ -53662,15 +53615,15 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v63 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], 
s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v36 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v13 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 @@ -53681,9 +53634,11 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v27 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v8 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen @@ -53693,11 +53648,9 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v8 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 @@ -53722,8 +53675,24 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v24 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 @@ -53748,22 +53717,6 @@ define inreg <60 x 
i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB59_4: diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll index 30ad46d959b7e..b6b59d809306a 100644 --- a/llvm/test/CodeGen/AMDGPU/bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/bf16.ll @@ -968,14 +968,14 @@ define <64 x bfloat> @v_load_global_v64bf16(ptr addrspace(1) %ptr) { ; GFX8-NEXT: v_addc_u32_e32 v25, vcc, 0, v29, vcc ; GFX8-NEXT: s_movk_i32 s4, 0x70 ; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[28:29] -; GFX8-NEXT: flat_load_dwordx4 v[12:15], v[12:13] ; GFX8-NEXT: v_add_u32_e32 v28, vcc, s4, v28 ; GFX8-NEXT: v_addc_u32_e32 v29, vcc, 0, v29, vcc ; GFX8-NEXT: flat_load_dwordx4 v[4:7], v[4:5] ; GFX8-NEXT: flat_load_dwordx4 v[8:11], v[8:9] +; GFX8-NEXT: flat_load_dwordx4 v[12:15], v[12:13] +; GFX8-NEXT: flat_load_dwordx4 v[24:27], v[24:25] ; GFX8-NEXT: flat_load_dwordx4 v[16:19], v[16:17] ; GFX8-NEXT: flat_load_dwordx4 v[20:23], v[20:21] -; GFX8-NEXT: flat_load_dwordx4 v[24:27], v[24:25] ; GFX8-NEXT: flat_load_dwordx4 v[28:31], v[28:29] ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -9552,6 +9552,7 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) { ; GFX8-NEXT: v_addc_u32_e32 v34, vcc, 0, v2, vcc ; GFX8-NEXT: v_add_u32_e32 v35, vcc, 36, v1 ; GFX8-NEXT: v_addc_u32_e32 v36, vcc, 0, v2, vcc +; GFX8-NEXT: v_add_u32_e32 v37, vcc, 38, v1 ; GFX8-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; GFX8-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill ; GFX8-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill @@ -9563,7 +9564,6 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) { ; GFX8-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; GFX8-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX8-NEXT: buffer_store_dword v58, off, s[0:3], 
s32 ; 4-byte Folded Spill -; GFX8-NEXT: v_add_u32_e32 v37, vcc, 38, v1 ; GFX8-NEXT: flat_load_ushort v44, v[1:2] ; GFX8-NEXT: v_addc_u32_e32 v38, vcc, 0, v2, vcc ; GFX8-NEXT: v_add_u32_e32 v48, vcc, 40, v1 @@ -9686,6 +9686,17 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) { ; GFX8-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen ; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v51 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[15:16], v15 +; GFX8-NEXT: buffer_load_dword v58, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX8-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX8-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX8-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX8-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX8-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX8-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX8-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX8-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX8-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX8-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload ; GFX8-NEXT: v_add_u32_e32 v17, vcc, 0xb8, v0 ; GFX8-NEXT: buffer_store_dword v14, v17, s[0:3], 0 offen ; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0xb4, v0 @@ -9816,17 +9827,6 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) { ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 8, v0 ; GFX8-NEXT: buffer_store_dword v15, v1, s[0:3], 0 offen ; GFX8-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen -; GFX8-NEXT: buffer_load_dword v58, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX8-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX8-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX8-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX8-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX8-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX8-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX8-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX8-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX8-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX8-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll index 68313807c427f..04f8ad8a02303 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll @@ -450,23 +450,38 @@ define amdgpu_kernel void @memcpy_known(ptr addrspace(7) %src, ptr addrspace(7) ; GISEL-GFX942-NEXT: v_add_u32_e32 v63, s12, v1 ; GISEL-GFX942-NEXT: v_add_u32_e32 v1, 0x100, v1 ; GISEL-GFX942-NEXT: v_cmp_lt_u32_e32 vcc, v1, v0 -; GISEL-GFX942-NEXT: s_waitcnt vmcnt(0) -; GISEL-GFX942-NEXT: scratch_store_dwordx4 off, a[0:3], off ; 16-byte Folded Spill +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: 
buffer_store_dwordx4 v[2:5], v63, s[4:7], 0 offen +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[6:9], v63, s[4:7], 0 offen offset:16 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[10:13], v63, s[4:7], 0 offen offset:32 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[14:17], v63, s[4:7], 0 offen offset:48 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[18:21], v63, s[4:7], 0 offen offset:64 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[22:25], v63, s[4:7], 0 offen offset:80 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[26:29], v63, s[4:7], 0 offen offset:96 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[30:33], v63, s[4:7], 0 offen offset:112 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[34:37], v63, s[4:7], 0 offen offset:128 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[38:41], v63, s[4:7], 0 offen offset:144 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[42:45], v63, s[4:7], 0 offen offset:160 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[46:49], v63, s[4:7], 0 offen offset:176 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[50:53], v63, s[4:7], 0 offen offset:192 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[54:57], v63, s[4:7], 0 offen offset:208 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[58:61], v63, s[4:7], 0 offen offset:224 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) +; GISEL-GFX942-NEXT: scratch_store_dwordx4 off, a[0:3], off ; 16-byte Folded Spill ; GISEL-GFX942-NEXT: scratch_load_dwordx4 v[2:5], off, off ; 16-byte Folded Reload ; GISEL-GFX942-NEXT: s_waitcnt vmcnt(0) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v63, s[4:7], 0 offen offset:240 @@ -976,23 +991,38 @@ define amdgpu_kernel void @memcpy_known_medium(ptr addrspace(7) %src, ptr addrsp ; GISEL-GFX942-NEXT: v_add_u32_e32 v63, s12, v1 ; GISEL-GFX942-NEXT: v_add_u32_e32 v1, 0x100, v1 ; GISEL-GFX942-NEXT: v_cmp_lt_u32_e32 vcc, v1, v0 -; GISEL-GFX942-NEXT: s_waitcnt vmcnt(0) -; GISEL-GFX942-NEXT: scratch_store_dwordx4 off, a[0:3], off ; 16-byte Folded Spill +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v63, s[4:7], 0 offen +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[6:9], v63, s[4:7], 0 offen offset:16 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[10:13], v63, s[4:7], 0 offen offset:32 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[14:17], v63, s[4:7], 0 offen offset:48 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[18:21], v63, s[4:7], 0 offen offset:64 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[22:25], v63, s[4:7], 0 offen offset:80 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[26:29], v63, s[4:7], 0 offen offset:96 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[30:33], v63, s[4:7], 0 offen offset:112 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; 
GISEL-GFX942-NEXT: buffer_store_dwordx4 v[34:37], v63, s[4:7], 0 offen offset:128 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[38:41], v63, s[4:7], 0 offen offset:144 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[42:45], v63, s[4:7], 0 offen offset:160 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[46:49], v63, s[4:7], 0 offen offset:176 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[50:53], v63, s[4:7], 0 offen offset:192 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[54:57], v63, s[4:7], 0 offen offset:208 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[58:61], v63, s[4:7], 0 offen offset:224 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) +; GISEL-GFX942-NEXT: scratch_store_dwordx4 off, a[0:3], off ; 16-byte Folded Spill ; GISEL-GFX942-NEXT: scratch_load_dwordx4 v[2:5], off, off ; 16-byte Folded Reload ; GISEL-GFX942-NEXT: s_waitcnt vmcnt(0) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v63, s[4:7], 0 offen offset:240 @@ -1159,24 +1189,23 @@ define amdgpu_kernel void @memcpy_known_small(ptr addrspace(7) %src, ptr addrspa ; SDAG-GFX1100-NEXT: s_mov_b32 s9, s12 ; SDAG-GFX1100-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX1100-NEXT: s_mov_b32 s6, s3 -; SDAG-GFX1100-NEXT: v_mov_b32_e32 v4, s0 ; SDAG-GFX1100-NEXT: s_mov_b32 s8, s1 ; SDAG-GFX1100-NEXT: s_or_b64 s[10:11], s[6:7], s[12:13] ; SDAG-GFX1100-NEXT: s_mov_b32 s13, s2 -; SDAG-GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; SDAG-GFX1100-NEXT: v_mov_b32_e32 v4, s0 ; SDAG-GFX1100-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13] -; SDAG-GFX1100-NEXT: buffer_load_b128 v[0:3], v4, s[8:11], 0 offen ; SDAG-GFX1100-NEXT: s_clause 0x1 ; SDAG-GFX1100-NEXT: s_load_b32 s13, s[4:5], 0x54 ; SDAG-GFX1100-NEXT: s_load_b128 s[0:3], s[4:5], 0x44 ; SDAG-GFX1100-NEXT: s_mov_b32 s5, s12 ; SDAG-GFX1100-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX1100-NEXT: s_mov_b32 s4, s3 ; SDAG-GFX1100-NEXT: v_mov_b32_e32 v5, s0 +; SDAG-GFX1100-NEXT: buffer_load_b128 v[0:3], v4, s[8:11], 0 offen +; SDAG-GFX1100-NEXT: s_mov_b32 s4, s3 +; SDAG-GFX1100-NEXT: s_mov_b32 s3, s12 ; SDAG-GFX1100-NEXT: s_or_b64 s[6:7], s[4:5], s[12:13] ; SDAG-GFX1100-NEXT: s_mov_b32 s13, s2 ; SDAG-GFX1100-NEXT: s_mov_b32 s2, s1 -; SDAG-GFX1100-NEXT: s_mov_b32 s3, s12 ; SDAG-GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; SDAG-GFX1100-NEXT: s_or_b64 s[4:5], s[2:3], s[12:13] ; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(0) @@ -1220,12 +1249,12 @@ define amdgpu_kernel void @memcpy_known_small(ptr addrspace(7) %src, ptr addrspa ; GISEL-GFX1100-NEXT: s_mov_b32 s8, s1 ; GISEL-GFX1100-NEXT: s_mov_b32 s9, s2 ; GISEL-GFX1100-NEXT: s_mov_b32 s10, s3 -; GISEL-GFX1100-NEXT: buffer_load_b128 v[0:3], v4, s[8:11], 0 offen ; GISEL-GFX1100-NEXT: s_clause 0x1 ; GISEL-GFX1100-NEXT: s_load_b128 s[0:3], s[4:5], 0x44 ; GISEL-GFX1100-NEXT: s_load_b32 s7, s[4:5], 0x54 ; GISEL-GFX1100-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX1100-NEXT: v_mov_b32_e32 v5, s0 +; GISEL-GFX1100-NEXT: buffer_load_b128 v[0:3], v4, s[8:11], 0 offen ; GISEL-GFX1100-NEXT: s_mov_b32 s4, s1 ; GISEL-GFX1100-NEXT: s_mov_b32 s5, s2 ; GISEL-GFX1100-NEXT: s_mov_b32 s6, s3 diff --git a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll index 8e12e7e03947b..832e43f1e1973 100644 --- a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll +++ b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll @@ -4253,6 +4253,7 @@ define 
amdgpu_kernel void @test_call_external_void_func_v32i32() #0 { ; VI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 ; VI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:96 ; VI-NEXT: buffer_load_dwordx4 v[28:31], off, s[4:7], 0 offset:112 ; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 ; VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16 @@ -4260,7 +4261,6 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 { ; VI-NEXT: buffer_load_dwordx4 v[12:15], off, s[4:7], 0 offset:48 ; VI-NEXT: buffer_load_dwordx4 v[16:19], off, s[4:7], 0 offset:64 ; VI-NEXT: buffer_load_dwordx4 v[20:23], off, s[4:7], 0 offset:80 -; VI-NEXT: buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:96 ; VI-NEXT: s_mov_b32 s38, -1 ; VI-NEXT: s_mov_b32 s39, 0xe80000 ; VI-NEXT: s_add_u32 s36, s36, s3 @@ -4272,7 +4272,7 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 { ; VI-NEXT: s_add_u32 s8, s8, external_void_func_v32i32@rel32@lo+4 ; VI-NEXT: s_addc_u32 s9, s9, external_void_func_v32i32@rel32@hi+12 ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] -; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: buffer_store_dword v31, off, s[36:39], s32 ; VI-NEXT: s_swappc_b64 s[30:31], s[8:9] ; VI-NEXT: s_endpgm @@ -4285,6 +4285,7 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 { ; CI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 ; CI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:96 ; CI-NEXT: buffer_load_dwordx4 v[28:31], off, s[4:7], 0 offset:112 ; CI-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 ; CI-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16 @@ -4292,7 +4293,6 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 { ; CI-NEXT: buffer_load_dwordx4 v[12:15], off, s[4:7], 0 offset:48 ; CI-NEXT: buffer_load_dwordx4 v[16:19], off, s[4:7], 0 offset:64 ; CI-NEXT: buffer_load_dwordx4 v[20:23], off, s[4:7], 0 offset:80 -; CI-NEXT: buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:96 ; CI-NEXT: s_mov_b32 s38, -1 ; CI-NEXT: s_mov_b32 s39, 0xe8f000 ; CI-NEXT: s_add_u32 s36, s36, s3 @@ -4304,7 +4304,7 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 { ; CI-NEXT: s_add_u32 s8, s8, external_void_func_v32i32@rel32@lo+4 ; CI-NEXT: s_addc_u32 s9, s9, external_void_func_v32i32@rel32@hi+12 ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] -; CI-NEXT: s_waitcnt vmcnt(7) +; CI-NEXT: s_waitcnt vmcnt(6) ; CI-NEXT: buffer_store_dword v31, off, s[36:39], s32 ; CI-NEXT: s_swappc_b64 s[30:31], s[8:9] ; CI-NEXT: s_endpgm @@ -4317,6 +4317,7 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 { ; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 ; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:96 ; GFX9-NEXT: buffer_load_dwordx4 v[28:31], off, s[4:7], 0 offset:112 ; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 ; GFX9-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16 @@ -4324,7 +4325,6 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 { ; GFX9-NEXT: buffer_load_dwordx4 v[12:15], off, s[4:7], 0 offset:48 ; GFX9-NEXT: buffer_load_dwordx4 v[16:19], off, s[4:7], 0 offset:64 ; GFX9-NEXT: buffer_load_dwordx4 v[20:23], off, s[4:7], 0 offset:80 -; GFX9-NEXT: buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:96 ; 
GFX9-NEXT: s_mov_b32 s38, -1 ; GFX9-NEXT: s_mov_b32 s39, 0xe00000 ; GFX9-NEXT: s_add_u32 s36, s36, s3 @@ -4336,7 +4336,7 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 { ; GFX9-NEXT: s_add_u32 s8, s8, external_void_func_v32i32@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s9, s9, external_void_func_v32i32@rel32@hi+12 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: buffer_store_dword v31, off, s[36:39], s32 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[8:9] ; GFX9-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/ds_write2.ll b/llvm/test/CodeGen/AMDGPU/ds_write2.ll index 0cae0e51107df..5cc68451d5ab7 100644 --- a/llvm/test/CodeGen/AMDGPU/ds_write2.ll +++ b/llvm/test/CodeGen/AMDGPU/ds_write2.ll @@ -851,12 +851,12 @@ define amdgpu_kernel void @unaligned_offset_simple_write2_one_val_f64(ptr addrsp ; CI-NEXT: v_add_i32_e32 v0, vcc, s4, v0 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: ds_write_b8 v0, v1 offset:9 +; CI-NEXT: ds_write_b8 v0, v2 offset:13 ; CI-NEXT: v_lshrrev_b32_e32 v3, 24, v1 ; CI-NEXT: ds_write_b8 v0, v1 offset:5 ; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 ; CI-NEXT: v_lshrrev_b32_e32 v5, 8, v1 -; CI-NEXT: ds_write_b8 v0, v1 offset:9 -; CI-NEXT: ds_write_b8 v0, v2 offset:13 ; CI-NEXT: v_lshrrev_b32_e32 v1, 24, v2 ; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 ; CI-NEXT: v_lshrrev_b32_e32 v2, 8, v2 diff --git a/llvm/test/CodeGen/AMDGPU/ds_write2_a_v.ll b/llvm/test/CodeGen/AMDGPU/ds_write2_a_v.ll index 683887b0a55f3..a4b3a8544dede 100644 --- a/llvm/test/CodeGen/AMDGPU/ds_write2_a_v.ll +++ b/llvm/test/CodeGen/AMDGPU/ds_write2_a_v.ll @@ -476,7 +476,6 @@ define void @ds_write2_b32_av_av_no_vgprs(ptr addrspace(3) %lds) #0 { ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: ds_write2_b32 v0, v1, v2 offset0:10 offset1:24 ; GCN-NEXT: v_accvgpr_write_b32 a31, v19 ; Reload Reuse ; GCN-NEXT: v_accvgpr_write_b32 a30, v20 ; Reload Reuse ; GCN-NEXT: v_accvgpr_write_b32 a29, v21 ; Reload Reuse @@ -489,6 +488,7 @@ define void @ds_write2_b32_av_av_no_vgprs(ptr addrspace(3) %lds) #0 { ; GCN-NEXT: v_accvgpr_write_b32 a22, v28 ; Reload Reuse ; GCN-NEXT: v_accvgpr_write_b32 a21, v29 ; Reload Reuse ; GCN-NEXT: v_accvgpr_write_b32 a20, v30 ; Reload Reuse +; GCN-NEXT: ds_write2_b32 v0, v1, v2 offset0:10 offset1:24 ; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -1029,7 +1029,6 @@ define void @ds_write2_b64_av_av_no_vgprs(ptr addrspace(3) %lds) #0 { ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v3, a3 ; GCN-NEXT: v_accvgpr_read_b32 v5, a5 -; GCN-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset0:10 offset1:24 ; GCN-NEXT: v_accvgpr_write_b32 a31, v21 ; Reload Reuse ; GCN-NEXT: v_accvgpr_write_b32 a30, v22 ; Reload Reuse ; GCN-NEXT: v_accvgpr_write_b32 a29, v23 ; Reload Reuse @@ -1040,6 +1039,7 @@ define void @ds_write2_b64_av_av_no_vgprs(ptr addrspace(3) %lds) #0 { ; GCN-NEXT: v_accvgpr_write_b32 a24, v28 ; Reload Reuse ; GCN-NEXT: v_accvgpr_write_b32 a23, v29 ; Reload Reuse ; GCN-NEXT: v_accvgpr_write_b32 a22, v30 ; Reload Reuse +; GCN-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset0:10 offset1:24 ; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 
4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll index 5fb50d0d89530..da08f4fcf8f3d 100644 --- a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll @@ -3755,42 +3755,44 @@ define <64 x half> @v_test_canonicalize_var_v64f16(<64 x half> %val) #1 { ; CI-NEXT: v_lshlrev_b32_e32 v10, 16, v13 ; CI-NEXT: v_cvt_f16_f32_e32 v13, v22 ; CI-NEXT: v_or_b32_e32 v10, v14, v10 +; CI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:4 +; CI-NEXT: buffer_load_dword v15, off, s[0:3], s32 ; CI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; CI-NEXT: v_or_b32_e32 v17, v18, v17 ; CI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; CI-NEXT: v_or_b32_e32 v17, v18, v17 ; CI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:16 ; CI-NEXT: v_cvt_f16_f32_e32 v22, v27 -; CI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; CI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; CI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; CI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; CI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; CI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; CI-NEXT: v_or_b32_e32 v13, v16, v13 ; CI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:12 ; CI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; CI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; CI-NEXT: v_or_b32_e32 v19, v20, v19 ; CI-NEXT: v_lshlrev_b32_e32 v20, 16, v21 ; CI-NEXT: v_cvt_f16_f32_e32 v21, v30 ; CI-NEXT: v_or_b32_e32 v20, v22, v20 ; CI-NEXT: v_cvt_f16_f32_e32 v22, v29 -; CI-NEXT: s_waitcnt vmcnt(6) +; CI-NEXT: s_waitcnt vmcnt(8) ; CI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; CI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; CI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; CI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; CI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; CI-NEXT: s_waitcnt vmcnt(5) +; CI-NEXT: s_waitcnt vmcnt(7) ; CI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; CI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; CI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; CI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; CI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; CI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; CI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; CI-NEXT: v_or_b32_e32 v21, v22, v21 ; CI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; CI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; CI-NEXT: s_waitcnt vmcnt(3) +; CI-NEXT: s_waitcnt vmcnt(5) ; CI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; CI-NEXT: s_waitcnt vmcnt(2) +; CI-NEXT: s_waitcnt vmcnt(4) ; CI-NEXT: v_cvt_f16_f32_e32 v32, v32 ; CI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; CI-NEXT: v_cvt_f32_f16_e32 v32, v32 @@ -3802,6 +3804,27 @@ define <64 x half> @v_test_canonicalize_var_v64f16(<64 x half> %val) #1 { ; CI-NEXT: buffer_store_dword v31, v32, s[0:3], 0 offen ; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:116 ; CI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:112 +; CI-NEXT: s_waitcnt vmcnt(6) +; CI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; CI-NEXT: s_waitcnt vmcnt(5) +; CI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; CI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; CI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; CI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; CI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; CI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; CI-NEXT: v_or_b32_e32 v14, v15, v14 +; CI-NEXT: s_waitcnt vmcnt(3) +; CI-NEXT: v_cvt_f16_f32_e32 v15, v16 +; CI-NEXT: v_cvt_f16_f32_e32 v16, v18 +; CI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; CI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; CI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; CI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; CI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; CI-NEXT: v_or_b32_e32 v12, v12, v15 +; 
CI-NEXT: v_add_i32_e32 v15, vcc, 0x44, v0 +; CI-NEXT: v_or_b32_e32 v11, v16, v11 ; CI-NEXT: s_waitcnt vmcnt(1) ; CI-NEXT: v_cvt_f16_f32_e32 v31, v31 ; CI-NEXT: s_waitcnt vmcnt(0) @@ -3968,28 +3991,6 @@ define <64 x half> @v_test_canonicalize_var_v64f16(<64 x half> %val) #1 { ; CI-NEXT: v_or_b32_e32 v31, v32, v31 ; CI-NEXT: v_add_i32_e32 v32, vcc, 0x48, v0 ; CI-NEXT: buffer_store_dword v31, v32, s[0:3], 0 offen -; CI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:4 -; CI-NEXT: buffer_load_dword v15, off, s[0:3], s32 -; CI-NEXT: s_waitcnt vmcnt(1) -; CI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; CI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; CI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; CI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; CI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; CI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; CI-NEXT: v_or_b32_e32 v14, v15, v14 -; CI-NEXT: v_cvt_f16_f32_e32 v15, v16 -; CI-NEXT: v_cvt_f16_f32_e32 v16, v18 -; CI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; CI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; CI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; CI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; CI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; CI-NEXT: v_or_b32_e32 v12, v12, v15 -; CI-NEXT: v_or_b32_e32 v11, v16, v11 -; CI-NEXT: v_add_i32_e32 v15, vcc, 0x44, v0 ; CI-NEXT: buffer_store_dword v11, v15, s[0:3], 0 offen ; CI-NEXT: v_add_i32_e32 v11, vcc, 64, v0 ; CI-NEXT: buffer_store_dword v12, v11, s[0:3], 0 offen diff --git a/llvm/test/CodeGen/AMDGPU/finalizebundle.mir b/llvm/test/CodeGen/AMDGPU/finalizebundle.mir index 279f4298e6418..590d69b8eb869 100644 --- a/llvm/test/CodeGen/AMDGPU/finalizebundle.mir +++ b/llvm/test/CodeGen/AMDGPU/finalizebundle.mir @@ -1,6 +1,19 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 # RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -passes=finalizebundle-test %s -o - | FileCheck %s +--- | + + @foo = addrspace(3) global i32 poison + + define void @test_overlap() { unreachable } + define void @test_dead_redef() { unreachable } + define void @test_tied() { unreachable } + define void @test_mmo_merge1() { unreachable } + define void @test_mmo_merge2() { unreachable } + define void @test_mmo_drop() { unreachable } + +... + --- name: test_overlap body: | @@ -47,3 +60,42 @@ body: | %1:vgpr_32 = COPY %0:vgpr_32 %2:vgpr_32 = V_FMAC_F16_e32 %1, %1, %0, implicit $mode, implicit $exec ... + +--- +name: test_mmo_merge1 +body: | + bb.0: + ; CHECK-LABEL: name: test_mmo_merge1 + ; CHECK: BUNDLE implicit-def %0, implicit %1:vgpr_32, implicit $exec :: (store (s32) into @foo, addrspace 3) { + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY %1:vgpr_32 + ; CHECK-NEXT: DS_WRITE_B32_gfx9 %1:vgpr_32, internal [[COPY]], 0, 0, implicit $exec :: (store (s32) into @foo, addrspace 3) + ; CHECK-NEXT: } + %1:vgpr_32 = COPY %0:vgpr_32 + DS_WRITE_B32_gfx9 %0, %1, 0, 0, implicit $exec :: (store (s32) into @foo, addrspace 3) +... 
+ +--- +name: test_mmo_merge2 +body: | + bb.0: + ; CHECK-LABEL: name: test_mmo_merge2 + ; CHECK: BUNDLE implicit %0:vgpr_32, implicit %1:vgpr_32, implicit $exec :: (store (s32) into @foo, addrspace 3), (store (s32) into @foo + 4, addrspace 3) { + ; CHECK-NEXT: DS_WRITE_B32_gfx9 %0:vgpr_32, %1:vgpr_32, 0, 0, implicit $exec :: (store (s32) into @foo, addrspace 3) + ; CHECK-NEXT: DS_WRITE_B32_gfx9 %0:vgpr_32, %1:vgpr_32, 4, 0, implicit $exec :: (store (s32) into @foo + 4, addrspace 3) + ; CHECK-NEXT: } + DS_WRITE_B32_gfx9 %0:vgpr_32, %1:vgpr_32, 0, 0, implicit $exec :: (store (s32) into @foo, addrspace 3) + DS_WRITE_B32_gfx9 %0:vgpr_32, %1:vgpr_32, 4, 0, implicit $exec :: (store (s32) into @foo + 4, addrspace 3) +... + +--- +name: test_mmo_drop +body: | + bb.0: + ; CHECK-LABEL: name: test_mmo_drop + ; CHECK: BUNDLE implicit %0:vgpr_32, implicit %1:vgpr_32, implicit $exec { + ; CHECK-NEXT: DS_WRITE_B32_gfx9 %0:vgpr_32, %1:vgpr_32, 0, 0, implicit $exec :: (store (s32) into @foo, addrspace 3) + ; CHECK-NEXT: DS_WRITE_B32_gfx9 %0:vgpr_32, %1:vgpr_32, 4, 0, implicit $exec + ; CHECK-NEXT: } + DS_WRITE_B32_gfx9 %0:vgpr_32, %1:vgpr_32, 0, 0, implicit $exec :: (store (s32) into @foo, addrspace 3) + DS_WRITE_B32_gfx9 %0:vgpr_32, %1:vgpr_32, 4, 0, implicit $exec +... diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll index b750d28ffa7d3..d43c6ba322619 100644 --- a/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll +++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll @@ -807,7 +807,7 @@ define amdgpu_gfx void @call_100xi32() #0 { ; GFX10-NEXT: buffer_store_dword v95, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: v_writelane_b32 v100, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: s_clause 0x1f +; GFX10-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX10-NEXT: buffer_load_dword v95, off, s[0:3], s33 ; GFX10-NEXT: buffer_load_dword v94, off, s[0:3], s33 offset:4 ; GFX10-NEXT: buffer_load_dword v93, off, s[0:3], s33 offset:8 @@ -863,7 +863,7 @@ define amdgpu_gfx void @call_100xi32() #0 { ; GFX11-NEXT: s_mov_b32 s1, return_100xi32@abs32@hi ; GFX11-NEXT: s_mov_b32 s0, return_100xi32@abs32@lo ; GFX11-NEXT: s_addk_i32 s32, 0x90 -; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Spill ; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:124 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:120 ; GFX11-NEXT: scratch_store_b32 off, v42, s33 offset:116 @@ -898,7 +898,7 @@ define amdgpu_gfx void @call_100xi32() #0 { ; GFX11-NEXT: scratch_store_b32 off, v95, s33 ; GFX11-NEXT: v_writelane_b32 v100, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Reload ; GFX11-NEXT: scratch_load_b32 v95, off, s33 ; GFX11-NEXT: scratch_load_b32 v94, off, s33 offset:4 ; GFX11-NEXT: scratch_load_b32 v93, off, s33 offset:8 @@ -2416,7 +2416,6 @@ define amdgpu_gfx <72 x i32> @return_72xi32(<72 x i32> %val) #1 { ; GFX10-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:148 ; GFX10-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:152 ; GFX10-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:156 -; GFX10-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen offset:104 ; GFX10-NEXT: buffer_store_dword v26, v0, s[0:3], 0 offen offset:100 ; GFX10-NEXT: buffer_store_dword v25, v0, s[0:3], 0 offen offset:96 ; GFX10-NEXT: buffer_store_dword v24, v0, s[0:3], 0 offen offset:92 @@ -2459,7 +2458,7 @@ define amdgpu_gfx <72 x i32> 
@return_72xi32(<72 x i32> %val) #1 { ; GFX10-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:16 ; GFX10-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 ; GFX10-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 -; GFX10-NEXT: s_clause 0x8 +; GFX10-NEXT: s_clause 0x7 ; GFX10-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:28 ; GFX10-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:24 ; GFX10-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:20 @@ -2468,6 +2467,7 @@ define amdgpu_gfx <72 x i32> @return_72xi32(<72 x i32> %val) #1 { ; GFX10-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:8 ; GFX10-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:4 ; GFX10-NEXT: buffer_load_dword v10, off, s[0:3], s32 +; GFX10-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen offset:104 ; GFX10-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:160 ; GFX10-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -2518,7 +2518,7 @@ define amdgpu_gfx <72 x i32> @return_72xi32(<72 x i32> %val) #1 { ; GFX11-LABEL: return_72xi32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_clause 0xc +; GFX11-NEXT: s_clause 0xc ; 52-byte Folded Spill ; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:212 ; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:208 ; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:204 @@ -2551,23 +2551,23 @@ define amdgpu_gfx <72 x i32> @return_72xi32(<72 x i32> %val) #1 { ; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:96 ; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:92 ; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:88 +; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: scratch_store_b128 v0, v[21:24], off offset:80 -; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: scratch_store_b128 v0, v[17:20], off offset:64 +; GFX11-NEXT: s_clause 0x5 ; GFX11-NEXT: scratch_load_b32 v23, off, s32 offset:112 ; GFX11-NEXT: scratch_load_b32 v22, off, s32 offset:108 ; GFX11-NEXT: scratch_load_b32 v21, off, s32 offset:104 -; GFX11-NEXT: scratch_store_b128 v0, v[17:20], off offset:64 -; GFX11-NEXT: s_clause 0x2 ; GFX11-NEXT: scratch_load_b32 v19, off, s32 offset:128 ; GFX11-NEXT: scratch_load_b32 v18, off, s32 offset:124 ; GFX11-NEXT: scratch_load_b32 v17, off, s32 offset:120 +; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: scratch_store_b128 v0, v[13:16], off offset:48 -; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: scratch_store_b128 v0, v[9:12], off offset:32 +; GFX11-NEXT: s_clause 0x10 ; GFX11-NEXT: scratch_load_b32 v15, off, s32 offset:144 ; GFX11-NEXT: scratch_load_b32 v14, off, s32 offset:140 ; GFX11-NEXT: scratch_load_b32 v13, off, s32 offset:136 -; GFX11-NEXT: scratch_store_b128 v0, v[9:12], off offset:32 -; GFX11-NEXT: s_clause 0xd ; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:160 ; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:156 ; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:152 @@ -2608,7 +2608,7 @@ define amdgpu_gfx <72 x i32> @return_72xi32(<72 x i32> %val) #1 { ; GFX11-NEXT: scratch_store_b128 v0, v[25:28], off offset:96 ; GFX11-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 ; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off -; GFX11-NEXT: s_clause 0xc +; GFX11-NEXT: s_clause 0xc ; 52-byte Folded Reload ; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:164 ; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:168 ; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:172 @@ -2641,21 +2641,6 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX9-NEXT: s_mov_b32 s34, s32 ; 
GFX9-NEXT: s_add_i32 s32, s32, 0x28000 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8 @@ -2733,6 +2718,21 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX9-NEXT: v_mov_b32_e32 v29, 0 ; GFX9-NEXT: v_mov_b32_e32 v30, 0 ; GFX9-NEXT: v_mov_b32_e32 v31, 0 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: v_writelane_b32 v63, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[36:37] ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:636 @@ -2914,21 +2914,7 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX10-NEXT: s_mov_b32 s38, s34 ; GFX10-NEXT: s_mov_b32 s34, s32 ; GFX10-NEXT: s_add_i32 s32, s32, 0x14000 -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], 
s33 offset:48 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v62, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: v_writelane_b32 v63, s30, 0 ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8 @@ -2971,12 +2957,11 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; GFX10-NEXT: v_lshrrev_b32_e64 v0, 5, s33 -; GFX10-NEXT: v_writelane_b32 v63, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_mov_b32_e32 v3, 0 -; GFX10-NEXT: v_add_nc_u32_e32 v0, 0x200, v0 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 +; GFX10-NEXT: v_add_nc_u32_e32 v0, 0x200, v0 ; GFX10-NEXT: v_mov_b32_e32 v5, 0 ; GFX10-NEXT: v_mov_b32_e32 v6, 0 ; GFX10-NEXT: v_mov_b32_e32 v7, 0 @@ -3006,9 +2991,24 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX10-NEXT: v_mov_b32_e32 v31, 0 ; GFX10-NEXT: s_mov_b32 s37, return_72xi32@abs32@hi ; GFX10-NEXT: s_mov_b32 s36, return_72xi32@abs32@lo +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v62, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: v_writelane_b32 v63, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[36:37] -; GFX10-NEXT: s_clause 0x28 +; 
GFX10-NEXT: s_clause 0x3e ; GFX10-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:636 ; GFX10-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:640 ; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s33 offset:644 @@ -3050,30 +3050,6 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX10-NEXT: buffer_load_dword v7, off, s[0:3], s33 offset:792 ; GFX10-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:796 ; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:516 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:1536 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:520 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:1540 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:524 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:1544 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:528 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:1548 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:532 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:1552 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:536 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:1556 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:540 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:1560 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:544 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:1564 ; 4-byte Folded Spill -; GFX10-NEXT: s_clause 0x15 ; GFX10-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:548 ; GFX10-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:552 ; GFX10-NEXT: buffer_load_dword v12, off, s[0:3], s33 offset:556 @@ -3096,6 +3072,29 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX10-NEXT: buffer_load_dword v29, off, s[0:3], s33 offset:624 ; GFX10-NEXT: buffer_load_dword v30, off, s[0:3], s33 offset:628 ; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s33 offset:632 +; GFX10-NEXT: s_waitcnt vmcnt(22) +; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:1536 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:520 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:1540 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:524 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:1544 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:528 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:1548 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:532 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:1552 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:536 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:1556 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:540 +; GFX10-NEXT: s_waitcnt 
vmcnt(0) +; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:1560 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:544 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:1564 ; 4-byte Folded Spill ; GFX10-NEXT: v_mov_b32_e32 v0, 24 ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; GFX10-NEXT: buffer_store_dword v9, off, s[0:3], s32 @@ -3138,7 +3137,7 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX10-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:152 ; GFX10-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:156 ; GFX10-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:160 -; GFX10-NEXT: s_clause 0x7 +; GFX10-NEXT: s_clause 0x7 ; 32-byte Folded Reload ; GFX10-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:1536 ; GFX10-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:1540 ; GFX10-NEXT: buffer_load_dword v4, off, s[0:3], s33 offset:1544 @@ -3151,7 +3150,7 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX10-NEXT: v_mov_b32_e32 v1, 42 ; GFX10-NEXT: v_add_nc_u32_e32 v0, 0x400, v0 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[36:37] -; GFX10-NEXT: s_clause 0xe +; GFX10-NEXT: s_clause 0xe ; 60-byte Folded Reload ; GFX10-NEXT: buffer_load_dword v62, off, s[0:3], s33 ; GFX10-NEXT: buffer_load_dword v61, off, s[0:3], s33 offset:4 ; GFX10-NEXT: buffer_load_dword v60, off, s[0:3], s33 offset:8 @@ -3199,7 +3198,7 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX11-NEXT: s_mov_b32 s36, s34 ; GFX11-NEXT: s_mov_b32 s34, s32 ; GFX11-NEXT: s_addk_i32 s32, 0xa00 -; GFX11-NEXT: s_clause 0xb +; GFX11-NEXT: s_clause 0xb ; 48-byte Folded Spill ; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:44 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:40 ; GFX11-NEXT: scratch_store_b32 off, v42, s33 offset:36 @@ -3341,18 +3340,18 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX11-NEXT: s_add_i32 s2, s32, 16 ; GFX11-NEXT: v_mov_b32_e32 v30, v46 ; GFX11-NEXT: scratch_store_b128 off, v[32:35], s2 -; GFX11-NEXT: s_clause 0x3 -; GFX11-NEXT: scratch_load_b128 v[1:4], off, s33 offset:1584 +; GFX11-NEXT: s_clause 0x3 ; 64-byte Folded Reload ; GFX11-NEXT: scratch_load_b128 v[17:20], off, s33 offset:1568 ; GFX11-NEXT: scratch_load_b128 v[21:24], off, s33 offset:1552 ; GFX11-NEXT: scratch_load_b128 v[25:28], off, s33 offset:1536 +; GFX11-NEXT: scratch_load_b128 v[1:4], off, s33 offset:1584 ; GFX11-NEXT: s_add_i32 s2, s33, 0x400 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v31, v47 :: v_dual_mov_b32 v0, s2 -; GFX11-NEXT: s_waitcnt vmcnt(3) +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v1, 42 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX11-NEXT: s_clause 0xb +; GFX11-NEXT: s_clause 0xb ; 48-byte Folded Reload ; GFX11-NEXT: scratch_load_b32 v59, off, s33 ; GFX11-NEXT: scratch_load_b32 v58, off, s33 offset:4 ; GFX11-NEXT: scratch_load_b32 v57, off, s33 offset:8 diff --git a/llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll b/llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll index f80716939f618..93d7eeb085107 100644 --- a/llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll +++ b/llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll @@ -255,11 +255,11 @@ define i64 @test_v16i64_load_store(ptr addrspace(1) %ptr_a, ptr addrspace(1) %pt ; GCN-SDAG-NEXT: global_load_b128 v[26:29], v[0:1], off offset:16 ; GCN-SDAG-NEXT: global_load_b128 v[30:33], v[0:1], off ; GCN-SDAG-NEXT: global_load_b128 v[34:37], v[0:1], off offset:64 -; GCN-SDAG-NEXT: v_mov_b64_e32 v[2:3], 0x70 ; 
GCN-SDAG-NEXT: v_mov_b64_e32 v[48:49], 48 -; GCN-SDAG-NEXT: v_mov_b64_e32 v[38:39], 0x60 ; GCN-SDAG-NEXT: v_mov_b64_e32 v[50:51], 32 +; GCN-SDAG-NEXT: v_mov_b64_e32 v[2:3], 0x70 ; GCN-SDAG-NEXT: v_mov_b64_e32 v[64:65], 16 +; GCN-SDAG-NEXT: v_mov_b64_e32 v[38:39], 0x60 ; GCN-SDAG-NEXT: v_mov_b64_e32 v[66:67], 0 ; GCN-SDAG-NEXT: v_mov_b64_e32 v[52:53], 0x50 ; GCN-SDAG-NEXT: v_mov_b64_e32 v[54:55], 64 diff --git a/llvm/test/CodeGen/AMDGPU/hard-clauses-img-gfx11.mir b/llvm/test/CodeGen/AMDGPU/hard-clauses-img-gfx11.mir index 7e1055b2a28a4..03b56cad85dac 100644 --- a/llvm/test/CodeGen/AMDGPU/hard-clauses-img-gfx11.mir +++ b/llvm/test/CodeGen/AMDGPU/hard-clauses-img-gfx11.mir @@ -11,7 +11,7 @@ body: | ; CHECK-LABEL: name: mimg_nsa ; CHECK: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: BUNDLE implicit-def $vgpr10_vgpr11_vgpr12, implicit-def $vgpr20_vgpr21_vgpr22, implicit $vgpr3, implicit $vgpr8, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec { + ; CHECK-NEXT: BUNDLE implicit-def $vgpr10_vgpr11_vgpr12, implicit-def $vgpr20_vgpr21_vgpr22, implicit $vgpr3, implicit $vgpr8, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec :: (load (s128)) { ; CHECK-NEXT: S_CLAUSE 1 ; CHECK-NEXT: $vgpr10_vgpr11_vgpr12 = IMAGE_SAMPLE_LZ_V3_V2_nsa_gfx11 $vgpr3, $vgpr8, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128)) ; CHECK-NEXT: $vgpr20_vgpr21_vgpr22 = IMAGE_SAMPLE_LZ_V3_V2_nsa_gfx11 $vgpr3, $vgpr8, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 14, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128)) @@ -29,7 +29,7 @@ body: | ; CHECK-LABEL: name: mimg_nsa_mixed ; CHECK: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: BUNDLE implicit-def $vgpr10, implicit-def $vgpr14, implicit-def $vgpr20_vgpr21_vgpr22, implicit $vgpr3, implicit $vgpr8, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec, implicit $vgpr5_vgpr6 { + ; CHECK-NEXT: BUNDLE implicit-def $vgpr10, implicit-def $vgpr14, implicit-def $vgpr20_vgpr21_vgpr22, implicit $vgpr3, implicit $vgpr8, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec, implicit $vgpr5_vgpr6 :: (load (s128)), (dereferenceable load (s128), addrspace 7) { ; CHECK-NEXT: S_CLAUSE 2 ; CHECK-NEXT: $vgpr10 = IMAGE_SAMPLE_LZ_V1_V2_nsa_gfx11 $vgpr3, $vgpr8, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128)) ; CHECK-NEXT: $vgpr14 = IMAGE_SAMPLE_LZ_V1_V2_gfx11 $vgpr5_vgpr6, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), addrspace 7) diff --git a/llvm/test/CodeGen/AMDGPU/hard-clauses-img-gfx12.mir b/llvm/test/CodeGen/AMDGPU/hard-clauses-img-gfx12.mir index 9689dda9932ed..68f9e839012c3 100644 --- a/llvm/test/CodeGen/AMDGPU/hard-clauses-img-gfx12.mir +++ b/llvm/test/CodeGen/AMDGPU/hard-clauses-img-gfx12.mir @@ -10,7 +10,7 @@ body: | ; CHECK-LABEL: name: mimg ; 
CHECK: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: BUNDLE implicit-def $vgpr10_vgpr11_vgpr12, implicit-def $vgpr20_vgpr21_vgpr22, implicit $vgpr3, implicit $vgpr4, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec { + ; CHECK-NEXT: BUNDLE implicit-def $vgpr10_vgpr11_vgpr12, implicit-def $vgpr20_vgpr21_vgpr22, implicit $vgpr3, implicit $vgpr4, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec :: (load (s128)) { ; CHECK-NEXT: S_CLAUSE 1 ; CHECK-NEXT: $vgpr10_vgpr11_vgpr12 = IMAGE_SAMPLE_LZ_V3_V2_gfx12 $vgpr3, $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128)) ; CHECK-NEXT: $vgpr20_vgpr21_vgpr22 = IMAGE_SAMPLE_LZ_V3_V2_gfx12 $vgpr3, $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 14, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128)) @@ -28,7 +28,7 @@ body: | ; CHECK-LABEL: name: mimg_mixed ; CHECK: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: BUNDLE implicit-def $vgpr10, implicit-def $vgpr14, implicit-def $vgpr20_vgpr21_vgpr22, implicit $vgpr3, implicit $vgpr4, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec, implicit $vgpr5, implicit $vgpr6 { + ; CHECK-NEXT: BUNDLE implicit-def $vgpr10, implicit-def $vgpr14, implicit-def $vgpr20_vgpr21_vgpr22, implicit $vgpr3, implicit $vgpr4, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec, implicit $vgpr5, implicit $vgpr6 :: (load (s128)), (dereferenceable load (s128), addrspace 7) { ; CHECK-NEXT: S_CLAUSE 2 ; CHECK-NEXT: $vgpr10 = IMAGE_SAMPLE_LZ_V1_V2_gfx12 $vgpr3, $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128)) ; CHECK-NEXT: $vgpr14 = IMAGE_SAMPLE_LZ_V1_V2_gfx12 $vgpr5, $vgpr6, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), addrspace 7) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier-fastregalloc.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier-fastregalloc.ll index 4719ab9090fa5..cbf697fafe683 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier-fastregalloc.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier-fastregalloc.ll @@ -1,13 +1,20 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -O0 -stop-after=postrapseudos -o - < %s | FileCheck -enable-var-scope -check-prefix=MIR %s -; MIR-LABEL: name: gws_barrier_offset0{{$}} -; MIR: BUNDLE implicit{{( killed)?( renamable)?}} $vgpr0, implicit $m0, implicit $exec { -; MIR-NEXT: DS_GWS_BARRIER renamable $vgpr0, 0, implicit $m0, implicit $exec :: (load (s32) from custom "GWSResource") -; MIR-NEXT: S_WAITCNT 0 -; MIR-NEXT: } define amdgpu_kernel void @gws_barrier_offset0(i32 %val) #0 { + ; MIR-LABEL: name: gws_barrier_offset0 + ; 
MIR: bb.0 (%ir-block.0): + ; MIR-NEXT: liveins: $sgpr8_sgpr9 + ; MIR-NEXT: {{ $}} + ; MIR-NEXT: renamable $sgpr4 = S_LOAD_DWORD_IMM killed renamable $sgpr8_sgpr9, 0, 0 :: (dereferenceable invariant load (s32) from %ir.val.kernarg.offset, align 16, addrspace 4) + ; MIR-NEXT: $m0 = S_MOV_B32 0 + ; MIR-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr4, implicit $exec, implicit $exec + ; MIR-NEXT: BUNDLE implicit killed renamable $vgpr0, implicit $m0, implicit $exec :: (load (s32) from custom "GWSResource") { + ; MIR-NEXT: DS_GWS_BARRIER renamable $vgpr0, 0, implicit $m0, implicit $exec :: (load (s32) from custom "GWSResource") + ; MIR-NEXT: S_WAITCNT 0 + ; MIR-NEXT: } + ; MIR-NEXT: S_ENDPGM 0 call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 0) ret void } @@ -17,5 +24,3 @@ declare void @llvm.amdgcn.ds.gws.barrier(i32, i32) #1 attributes #0 = { nounwind } attributes #1 = { convergent inaccessiblememonly nounwind } -;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; MIR: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier.ll index c5f6e2b0098ae..417b8e08cf669 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier.ll @@ -35,7 +35,7 @@ ; LOOP-NEXT: s_cbranch_scc1 [[LOOP]] ; MIR-LABEL: name: gws_barrier_offset0{{$}} -; MIR: BUNDLE implicit{{( killed)?( renamable)?}} $vgpr0, implicit $m0, implicit $exec { +; MIR: BUNDLE implicit{{( killed)?( renamable)?}} $vgpr0, implicit $m0, implicit $exec ; MIR-NEXT: DS_GWS_BARRIER renamable $vgpr0, 0, implicit $m0, implicit $exec :: (load (s32) from custom "GWSResource") ; MIR-NEXT: S_WAITCNT 0 ; MIR-NEXT: } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll index 4419b8c6f9862..af270e5adf75c 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll @@ -13,9 +13,9 @@ define amdgpu_kernel void @test_llvm_amdgcn_fdot2_bf16_bf16( ; SDAG-GFX11-TRUE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; SDAG-GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0 ; SDAG-GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[6:7] ; SDAG-GFX11-TRUE16-NEXT: s_load_b32 s2, s[2:3], 0x0 ; SDAG-GFX11-TRUE16-NEXT: s_load_b32 s3, s[4:5], 0x0 +; SDAG-GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[6:7] ; SDAG-GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SDAG-GFX11-TRUE16-NEXT: v_dot2_bf16_bf16 v0.l, s2, s3, v0.l ; SDAG-GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1] @@ -26,9 +26,9 @@ define amdgpu_kernel void @test_llvm_amdgcn_fdot2_bf16_bf16( ; SDAG-GFX11-FAKE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; SDAG-GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-FAKE16-NEXT: global_load_u16 v1, v0, s[6:7] ; SDAG-GFX11-FAKE16-NEXT: s_load_b32 s2, s[2:3], 0x0 ; SDAG-GFX11-FAKE16-NEXT: s_load_b32 s3, s[4:5], 0x0 +; SDAG-GFX11-FAKE16-NEXT: global_load_u16 v1, v0, s[6:7] ; SDAG-GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SDAG-GFX11-FAKE16-NEXT: v_dot2_bf16_bf16 v1, s2, s3, v1 ; SDAG-GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f16.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f16.f16.ll index 0194d25a99cdc..72b47693c69f8 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f16.f16.ll +++ 
b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f16.f16.ll
@@ -12,9 +12,9 @@ define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f16_f16(
; SDAG-GFX11-TRUE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
; SDAG-GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
; SDAG-GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[6:7]
; SDAG-GFX11-TRUE16-NEXT: s_load_b32 s2, s[2:3], 0x0
; SDAG-GFX11-TRUE16-NEXT: s_load_b32 s3, s[4:5], 0x0
+; SDAG-GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[6:7]
; SDAG-GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; SDAG-GFX11-TRUE16-NEXT: v_dot2_f16_f16 v0.l, s2, s3, v0.l
; SDAG-GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
@@ -25,9 +25,9 @@ define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f16_f16(
; SDAG-GFX11-FAKE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
; SDAG-GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0
; SDAG-GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-FAKE16-NEXT: global_load_u16 v1, v0, s[6:7]
; SDAG-GFX11-FAKE16-NEXT: s_load_b32 s2, s[2:3], 0x0
; SDAG-GFX11-FAKE16-NEXT: s_load_b32 s3, s[4:5], 0x0
+; SDAG-GFX11-FAKE16-NEXT: global_load_u16 v1, v0, s[6:7]
; SDAG-GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; SDAG-GFX11-FAKE16-NEXT: v_dot2_f16_f16 v1, s2, s3, v1
; SDAG-GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
@@ -38,9 +38,9 @@ define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f16_f16(
; GISEL-GFX11-TRUE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
; GISEL-GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[6:7]
; GISEL-GFX11-TRUE16-NEXT: s_load_b32 s2, s[2:3], 0x0
; GISEL-GFX11-TRUE16-NEXT: s_load_b32 s3, s[4:5], 0x0
+; GISEL-GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[6:7]
; GISEL-GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GISEL-GFX11-TRUE16-NEXT: v_dot2_f16_f16 v0.l, s2, s3, v0.l
; GISEL-GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
@@ -51,9 +51,9 @@ define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f16_f16(
; GISEL-GFX11-FAKE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
; GISEL-GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0
; GISEL-GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-FAKE16-NEXT: global_load_u16 v1, v0, s[6:7]
; GISEL-GFX11-FAKE16-NEXT: s_load_b32 s2, s[2:3], 0x0
; GISEL-GFX11-FAKE16-NEXT: s_load_b32 s3, s[4:5], 0x0
+; GISEL-GFX11-FAKE16-NEXT: global_load_u16 v1, v0, s[6:7]
; GISEL-GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GISEL-GFX11-FAKE16-NEXT: v_dot2_f16_f16 v1, s2, s3, v1
; GISEL-GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll
index 0c1448a0b8fb6..1d08097452ce6 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll
@@ -17,21 +17,19 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x64_f16__vgpr(ptr addrspace(1) %
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0
-; SDAG-NEXT: v_mov_b32_e32 v4, 0
-; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: global_load_dwordx4 v[0:3], v0, s[6:7]
; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44
; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
+; SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; SDAG-NEXT: global_load_dwordx4 v[0:3], v0, s[6:7]
; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[2:3]
; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[0:1]
-; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[8:9]
; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[10:11]
; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[12:13]
; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[14:15]
; SDAG-NEXT: v_mov_b32_e32 v5, s16
+; SDAG-NEXT: v_mov_b32_e32 v4, 0
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: s_nop 0
; SDAG-NEXT: v_smfmac_f32_16x16x64_f16 v[0:3], v[14:17], v[6:13], v5 cbsz:1 abid:2
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
@@ -43,13 +41,12 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x64_f16__vgpr(ptr addrspace(1) %
; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0
-; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7]
; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44
; GISEL-NEXT: s_load_dword s16, s[4:5], 0x64
+; GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7]
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[2:3]
; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[0:1]
-; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
@@ -175,16 +172,15 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x32_f16__vgpr(ptr addrspace(1) %
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; SDAG-NEXT: v_lshlrev_b32_e32 v16, 6, v0
+; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44
+; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48
; SDAG-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32
; SDAG-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
; SDAG-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7]
-; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44
-; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
; SDAG-NEXT: v_mov_b64_e32 v[28:29], s[2:3]
; SDAG-NEXT: v_mov_b64_e32 v[26:27], s[0:1]
-; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: v_mov_b64_e32 v[24:25], s[14:15]
; SDAG-NEXT: v_mov_b64_e32 v[22:23], s[12:13]
; SDAG-NEXT: v_mov_b64_e32 v[20:21], s[10:11]
@@ -207,16 +203,15 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x32_f16__vgpr(ptr addrspace(1) %
; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GISEL-NEXT: v_lshlrev_b32_e32 v16, 6, v0
+; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44
+; GISEL-NEXT: s_load_dword s16, s[4:5], 0x64
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7]
; GISEL-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
; GISEL-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32
; GISEL-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48
-; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44
-; GISEL-NEXT: s_load_dword s16, s[4:5], 0x64
; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[2:3]
; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[0:1]
-; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
@@ -520,21 +515,19 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x64_bf16__vgpr(ptr addrspace(1)
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GCN-NEXT: v_lshlrev_b32_e32 v0, 4, v0
-; GCN-NEXT: v_mov_b32_e32 v4, 0
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: global_load_dwordx4 v[0:3], v0, s[6:7]
; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44
; GCN-NEXT: s_load_dword s16, s[4:5], 0x64
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: global_load_dwordx4 v[0:3], v0, s[6:7]
; GCN-NEXT: v_mov_b64_e32 v[16:17], s[2:3]
; GCN-NEXT: v_mov_b64_e32 v[14:15], s[0:1]
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b64_e32 v[6:7], s[8:9]
; GCN-NEXT: v_mov_b64_e32 v[8:9], s[10:11]
; GCN-NEXT: v_mov_b64_e32 v[10:11], s[12:13]
; GCN-NEXT: v_mov_b64_e32 v[12:13], s[14:15]
; GCN-NEXT: v_mov_b32_e32 v5, s16
+; GCN-NEXT: v_mov_b32_e32 v4, 0
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: s_nop 0
; GCN-NEXT: v_smfmac_f32_16x16x64_bf16 v[0:3], v[14:17], v[6:13], v5 cbsz:1 abid:2
; GCN-NEXT: s_nop 7
; GCN-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
@@ -634,16 +627,15 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x32_bf16__vgpr(ptr addrspace(1)
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GCN-NEXT: v_lshlrev_b32_e32 v16, 6, v0
+; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44
+; GCN-NEXT: s_load_dword s16, s[4:5], 0x64
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48
; GCN-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32
; GCN-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
; GCN-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7]
-; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44
-; GCN-NEXT: s_load_dword s16, s[4:5], 0x64
; GCN-NEXT: v_mov_b64_e32 v[28:29], s[2:3]
; GCN-NEXT: v_mov_b64_e32 v[26:27], s[0:1]
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b64_e32 v[24:25], s[14:15]
; GCN-NEXT: v_mov_b64_e32 v[22:23], s[12:13]
; GCN-NEXT: v_mov_b64_e32 v[20:21], s[10:11]
@@ -802,11 +794,11 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x128_i8__vgpr(ptr addrspace(1) %
; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
-; SDAG-NEXT: v_mov_b32_e32 v16, 0
-; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7]
; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
+; SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7]
+; SDAG-NEXT: v_mov_b32_e32 v16, 0
; SDAG-NEXT: v_mov_b32_e32 v12, s8
; SDAG-NEXT: v_mov_b32_e32 v13, s9
; SDAG-NEXT: v_mov_b32_e32 v14, s10
@@ -815,7 +807,6 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x128_i8__vgpr(ptr addrspace(1) %
; SDAG-NEXT: v_mov_b32_e32 v1, s13
; SDAG-NEXT: v_mov_b32_e32 v2, s14
; SDAG-NEXT: v_mov_b32_e32 v3, s15
-; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: v_mov_b32_e32 v4, s0
; SDAG-NEXT: v_mov_b32_e32 v5, s1
; SDAG-NEXT: v_mov_b32_e32 v6, s2
@@ -833,12 +824,11 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x128_i8__vgpr(ptr addrspace(1) %
; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0
-; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1]
; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1]
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[10:11]
; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[8:9]
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
@@ -965,15 +955,14 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x64_i8__vgpr(ptr addrspace(1) %a
; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; SDAG-NEXT: v_lshlrev_b32_e32 v16, 6, v0
+; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
+; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
+; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48
; SDAG-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32
; SDAG-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
; SDAG-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7]
-; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
-; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
-; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
-; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: v_mov_b32_e32 v24, s8
; SDAG-NEXT: v_mov_b32_e32 v25, s9
; SDAG-NEXT: v_mov_b32_e32 v26, s10
@@ -1003,15 +992,14 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x64_i8__vgpr(ptr addrspace(1) %a
; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GISEL-NEXT: v_lshlrev_b32_e32 v16, 6, v0
+; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
+; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
+; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1]
; GISEL-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:16
; GISEL-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:32
; GISEL-NEXT: global_load_dwordx4 v[12:15], v16, s[0:1] offset:48
-; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
-; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
-; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
-; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[10:11]
; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[8:9]
; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[18:19]
@@ -1317,11 +1305,11 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_bf8_bf8__vgpr(ptr addrspace
; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
-; SDAG-NEXT: v_mov_b32_e32 v16, 0
-; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7]
; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
+; SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7]
+; SDAG-NEXT: v_mov_b32_e32 v16, 0
; SDAG-NEXT: v_mov_b32_e32 v12, s8
; SDAG-NEXT: v_mov_b32_e32 v13, s9
; SDAG-NEXT: v_mov_b32_e32 v14, s10
@@ -1330,7 +1318,6 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_bf8_bf8__vgpr(ptr addrspace
; SDAG-NEXT: v_mov_b32_e32 v1, s13
; SDAG-NEXT: v_mov_b32_e32 v2, s14
; SDAG-NEXT: v_mov_b32_e32 v3, s15
-; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: v_mov_b32_e32 v4, s0
; SDAG-NEXT: v_mov_b32_e32 v5, s1
; SDAG-NEXT: v_mov_b32_e32 v6, s2
@@ -1348,12 +1335,11 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_bf8_bf8__vgpr(ptr addrspace
; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0
-; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1]
; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1]
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[10:11]
; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[8:9]
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
@@ -1481,11 +1467,11 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_bf8_fp8__vgpr(ptr addrspace
; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
-; SDAG-NEXT: v_mov_b32_e32 v16, 0
-; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7]
; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
+; SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7]
+; SDAG-NEXT: v_mov_b32_e32 v16, 0
; SDAG-NEXT: v_mov_b32_e32 v12, s8
; SDAG-NEXT: v_mov_b32_e32 v13, s9
; SDAG-NEXT: v_mov_b32_e32 v14, s10
@@ -1494,7 +1480,6 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_bf8_fp8__vgpr(ptr addrspace
; SDAG-NEXT: v_mov_b32_e32 v1, s13
; SDAG-NEXT: v_mov_b32_e32 v2, s14
; SDAG-NEXT: v_mov_b32_e32 v3, s15
-; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: v_mov_b32_e32 v4, s0
; SDAG-NEXT: v_mov_b32_e32 v5, s1
; SDAG-NEXT: v_mov_b32_e32 v6, s2
@@ -1512,12 +1497,11 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_bf8_fp8__vgpr(ptr addrspace
; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0
-; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1]
; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1]
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[10:11]
; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[8:9]
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
@@ -1645,11 +1629,11 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_fp8_bf8__vgpr(ptr addrspace
; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
-; SDAG-NEXT: v_mov_b32_e32 v16, 0
-; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7]
; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
+; SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7]
+; SDAG-NEXT: v_mov_b32_e32 v16, 0
; SDAG-NEXT: v_mov_b32_e32 v12, s8
; SDAG-NEXT: v_mov_b32_e32 v13, s9
; SDAG-NEXT: v_mov_b32_e32 v14, s10
@@ -1658,7 +1642,6 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_fp8_bf8__vgpr(ptr addrspace
; SDAG-NEXT: v_mov_b32_e32 v1, s13
; SDAG-NEXT: v_mov_b32_e32 v2, s14
; SDAG-NEXT: v_mov_b32_e32 v3, s15
-; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: v_mov_b32_e32 v4, s0
; SDAG-NEXT: v_mov_b32_e32 v5, s1
; SDAG-NEXT: v_mov_b32_e32 v6, s2
@@ -1676,12 +1659,11 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_fp8_bf8__vgpr(ptr addrspace
; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0
-; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1]
; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1]
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[10:11]
; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[8:9]
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
@@ -1809,11 +1791,11 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_fp8_fp8__vgpr(ptr addrspace
; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
-; SDAG-NEXT: v_mov_b32_e32 v16, 0
-; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7]
; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
+; SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7]
+; SDAG-NEXT: v_mov_b32_e32 v16, 0
; SDAG-NEXT: v_mov_b32_e32 v12, s8
; SDAG-NEXT: v_mov_b32_e32 v13, s9
; SDAG-NEXT: v_mov_b32_e32 v14, s10
@@ -1822,7 +1804,6 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_fp8_fp8__vgpr(ptr addrspace
; SDAG-NEXT: v_mov_b32_e32 v1, s13
; SDAG-NEXT: v_mov_b32_e32 v2, s14
; SDAG-NEXT: v_mov_b32_e32 v3, s15
-; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: v_mov_b32_e32 v4, s0
; SDAG-NEXT: v_mov_b32_e32 v5, s1
; SDAG-NEXT: v_mov_b32_e32 v6, s2
@@ -1840,12 +1821,11 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_fp8_fp8__vgpr(ptr addrspace
; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0
-; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1]
; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1]
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[10:11]
; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[8:9]
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
@@ -1972,15 +1952,14 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_bf8_bf8__vgpr(ptr addrspace(
; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; SDAG-NEXT: v_lshlrev_b32_e32 v16, 6, v0
+; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
+; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
+; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48
; SDAG-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32
; SDAG-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
; SDAG-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7]
-; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
-; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
-; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
-; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: v_mov_b32_e32 v24, s8
; SDAG-NEXT: v_mov_b32_e32 v25, s9
; SDAG-NEXT: v_mov_b32_e32 v26, s10
@@ -2010,15 +1989,14 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_bf8_bf8__vgpr(ptr addrspace(
; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GISEL-NEXT: v_lshlrev_b32_e32 v16, 6, v0
+; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
+; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
+; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1]
; GISEL-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:16
; GISEL-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:32
; GISEL-NEXT: global_load_dwordx4 v[12:15], v16, s[0:1] offset:48
-; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
-; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
-; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
-; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[10:11]
; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[8:9]
; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[18:19]
@@ -2323,15 +2301,14 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_bf8_fp8__vgpr(ptr addrspace(
; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; SDAG-NEXT: v_lshlrev_b32_e32 v16, 6, v0
+; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
+; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
+; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48
; SDAG-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32
; SDAG-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
; SDAG-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7]
-; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
-; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
-; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
-; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: v_mov_b32_e32 v24, s8
; SDAG-NEXT: v_mov_b32_e32 v25, s9
; SDAG-NEXT: v_mov_b32_e32 v26, s10
@@ -2361,15 +2338,14 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_bf8_fp8__vgpr(ptr addrspace(
; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GISEL-NEXT: v_lshlrev_b32_e32 v16, 6, v0
+; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
+; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
+; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1]
; GISEL-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:16
; GISEL-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:32
; GISEL-NEXT: global_load_dwordx4 v[12:15], v16, s[0:1] offset:48
-; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
-; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
-; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
-; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[10:11]
; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[8:9]
; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[18:19]
@@ -2674,15 +2650,14 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_fp8_bf8__vgpr(ptr addrspace(
; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; SDAG-NEXT: v_lshlrev_b32_e32 v16, 6, v0
+; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
+; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
+; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48
; SDAG-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32
; SDAG-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
; SDAG-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7]
-; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
-; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
-; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
-; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: v_mov_b32_e32 v24, s8
; SDAG-NEXT: v_mov_b32_e32 v25, s9
; SDAG-NEXT: v_mov_b32_e32 v26, s10
@@ -2712,15 +2687,14 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_fp8_bf8__vgpr(ptr addrspace(
; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GISEL-NEXT: v_lshlrev_b32_e32 v16, 6, v0
+; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
+; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
+; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1]
; GISEL-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:16
; GISEL-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:32
; GISEL-NEXT: global_load_dwordx4 v[12:15], v16, s[0:1] offset:48
-; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
-; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
-; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
-; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[10:11]
; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[8:9]
; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[18:19]
@@ -3025,15 +2999,14 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_fp8_fp8__vgpr(ptr addrspace(
; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; SDAG-NEXT: v_lshlrev_b32_e32 v16, 6, v0
+; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
+; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
+; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48
; SDAG-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32
; SDAG-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
; SDAG-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7]
-; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
-; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
-; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
-; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: v_mov_b32_e32 v24, s8
; SDAG-NEXT: v_mov_b32_e32 v25, s9
; SDAG-NEXT: v_mov_b32_e32 v26, s10
@@ -3063,15 +3036,14 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_fp8_fp8__vgpr(ptr addrspace(
; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GISEL-NEXT: v_lshlrev_b32_e32 v16, 6, v0
+; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
+; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
+; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1]
; GISEL-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:16
; GISEL-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:32
; GISEL-NEXT: global_load_dwordx4 v[12:15], v16, s[0:1] offset:48
-; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
-; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
-; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
-; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[10:11]
; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[8:9]
; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[18:19]
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll
index f93e5f06beff9..83c240c17ff1c 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll
@@ -10386,7 +10386,8 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX8-NEXT: s_add_u32 s2, s0, 0x150
; GFX8-NEXT: s_addc_u32 s3, s1, 0
; GFX8-NEXT: flat_store_dwordx4 v[44:45], v[12:15]
-; GFX8-NEXT: flat_store_dwordx4 v[42:43], v[4:7]
+; GFX8-NEXT: flat_store_dwordx4 v[48:49], v[8:11]
+; GFX8-NEXT: flat_store_dwordx4 v[50:51], v[16:19]
; GFX8-NEXT: v_mov_b32_e32 v13, s3
; GFX8-NEXT: v_mov_b32_e32 v12, s2
; GFX8-NEXT: s_add_u32 s2, s0, 0x140
@@ -10395,10 +10396,6 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX8-NEXT: v_mov_b32_e32 v14, s2
; GFX8-NEXT: s_add_u32 s2, s0, 0x130
; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: flat_store_dwordx4 v[46:47], v[0:3]
-; GFX8-NEXT: flat_store_dwordx4 v[48:49], v[8:11]
-; GFX8-NEXT: flat_store_dwordx4 v[50:51], v[16:19]
-; GFX8-NEXT: v_mov_b32_e32 v4, s6
; GFX8-NEXT: v_mov_b32_e32 v17, s3
; GFX8-NEXT: v_mov_b32_e32 v16, s2
; GFX8-NEXT: s_add_u32 s2, s0, 0x120
@@ -10406,20 +10403,21 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX8-NEXT: v_mov_b32_e32 v19, s3
; GFX8-NEXT: v_mov_b32_e32 v18, s2
; GFX8-NEXT: s_add_u32 s2, s0, 0x110
-; GFX8-NEXT: v_mov_b32_e32 v5, s7
+; GFX8-NEXT: flat_store_dwordx4 v[42:43], v[4:7]
; GFX8-NEXT: s_addc_u32 s3, s1, 0
+; GFX8-NEXT: v_mov_b32_e32 v4, s6
+; GFX8-NEXT: v_mov_b32_e32 v5, s7
; GFX8-NEXT: v_mov_b32_e32 v42, vcc_lo
; GFX8-NEXT: v_mov_b32_e32 v43, vcc_hi
; GFX8-NEXT: v_mov_b32_e32 v6, s4
; GFX8-NEXT: v_mov_b32_e32 v7, s5
+; GFX8-NEXT: flat_store_dwordx4 v[46:47], v[0:3]
+; GFX8-NEXT: v_mov_b32_e32 v8, s12
; GFX8-NEXT: v_mov_b32_e32 v0, s8
; GFX8-NEXT: v_mov_b32_e32 v1, s9
-; GFX8-NEXT: v_mov_b32_e32 v8, s12
-; GFX8-NEXT: flat_store_dwordx4 v[52:53], v[20:23]
; GFX8-NEXT: v_mov_b32_e32 v2, s10
; GFX8-NEXT: v_mov_b32_e32 v3, s11
; GFX8-NEXT: v_mov_b32_e32 v9, s13
-; GFX8-NEXT: flat_store_dwordx4 v[54:55], v[24:27]
; GFX8-NEXT: v_mov_b32_e32 v10, s14
; GFX8-NEXT: v_mov_b32_e32 v11, s15
; GFX8-NEXT: flat_store_dwordx4 v[56:57], v[28:31]
@@ -10588,6 +10586,8 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX8-NEXT: v_mov_b32_e32 v2, s2
; GFX8-NEXT: v_mov_b32_e32 v3, s3
; GFX8-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NEXT: flat_store_dwordx4 v[52:53], v[20:23]
+; GFX8-NEXT: flat_store_dwordx4 v[54:55], v[24:27]
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
index bca39d06e941c..59f4a9d44bbdd 100644
--- a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
@@ -4582,18 +4582,18 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(ptr addrspace(1) %ou
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x90
-; GCN-HSA-NEXT: flat_store_dwordx4 v[32:33], v[12:15]
-; GCN-HSA-NEXT: flat_store_dwordx4 v[38:39], v[20:23]
; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60
+; GCN-HSA-NEXT: flat_store_dwordx4 v[32:33], v[12:15]
+; GCN-HSA-NEXT: flat_store_dwordx4 v[38:39], v[20:23]
+; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v23, 16, v3
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v21, 16, v2
; GCN-HSA-NEXT: v_bfe_i32 v22, v3, 0, 16
; GCN-HSA-NEXT: v_bfe_i32 v20, v2, 0, 16
-; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[20:23]
; GCN-HSA-NEXT: s_waitcnt vmcnt(11)
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 16, v9
diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i32.ll b/llvm/test/CodeGen/AMDGPU/load-global-i32.ll
index e55fb2cac0985..7203545ebf9a8 100644
--- a/llvm/test/CodeGen/AMDGPU/load-global-i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-global-i32.ll
@@ -3313,12 +3313,12 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou
; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s2
; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x90
; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
-; GCNX3-HSA-NEXT: flat_store_dwordx4 v[32:33], v[24:27]
-; GCNX3-HSA-NEXT: flat_store_dwordx4 v[38:39], v[20:23]
; GCNX3-HSA-NEXT: flat_store_dwordx4 v[4:5], v[12:15]
; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s3
; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s2
; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x60
+; GCNX3-HSA-NEXT: flat_store_dwordx4 v[32:33], v[24:27]
+; GCNX3-HSA-NEXT: flat_store_dwordx4 v[38:39], v[20:23]
; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v26, 31, v7
; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v24, 31, v6
; GCNX3-HSA-NEXT: v_mov_b32_e32 v23, v6
@@ -3726,7 +3726,6 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou
; GCN-GFX900-HSA-NEXT: s_nop 0
; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v12, v[37:40], s[0:1] offset:224
; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v12, v[33:36], s[0:1] offset:240
-; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:192
; GCN-GFX900-HSA-NEXT: buffer_load_dword v33, off, s[20:23], 0 ; 4-byte Folded Reload
; GCN-GFX900-HSA-NEXT: s_nop 0
; GCN-GFX900-HSA-NEXT: buffer_load_dword v34, off, s[20:23], 0 offset:4 ; 4-byte Folded Reload
@@ -3740,7 +3739,7 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou
; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v43, v26
; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v29, v27
; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v31, v28
-; GCN-GFX900-HSA-NEXT: s_waitcnt vmcnt(12)
+; GCN-GFX900-HSA-NEXT: s_waitcnt vmcnt(11)
; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v60, 31, v3
; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v58, 31, v2
; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v28, 31, v1
@@ -3749,6 +3748,7 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou
; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v27, v1
; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v57, v2
; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v59, v3
+; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:192
; GCN-GFX900-HSA-NEXT: s_waitcnt vmcnt(7)
; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v3, 31, v24
; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v23
@@ -3758,7 +3758,7 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou
; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v21
; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v8, v21
; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v10, v22
-; GCN-GFX900-HSA-NEXT: s_waitcnt vmcnt(0)
+; GCN-GFX900-HSA-NEXT: s_waitcnt vmcnt(1)
; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v12, v[33:36], s[0:1] offset:208
; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v12, v[41:44], s[0:1] offset:160
; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v12, v[29:32], s[0:1] offset:176
diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i8.ll b/llvm/test/CodeGen/AMDGPU/load-global-i8.ll
index f879dc660203f..cb17f01853221 100644
--- a/llvm/test/CodeGen/AMDGPU/load-global-i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-global-i8.ll
@@ -7788,19 +7788,18 @@ define amdgpu_kernel void @global_zextload_v32i8_to_v32i64(ptr addrspace(1) %out
; GCN-NOHSA-VI-NEXT: v_bfe_u32 v29, v13, 16, 8
; GCN-NOHSA-VI-NEXT: v_bfe_u32 v35, v14, 8, 8
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v33, 0xff, v14
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v11, v53
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v13, v53
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[36:39], off, s[0:3], 0 offset:144
; GCN-NOHSA-VI-NEXT: v_bfe_u32 v16, v17, 8, 8
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v36, v53
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v14, 0xff, v17
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v54, 24, v17
; GCN-NOHSA-VI-NEXT: v_bfe_u32 v52, v17, 16, 8
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v11, v53
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v13, v53
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v15, v53
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v17, v53
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[36:39], off, s[0:3], 0 offset:144
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[29:32], off, s[0:3], 0 offset:112
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v36, v53
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:48
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:224
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v15, v53
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v17, v53
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v29, v53
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v10, v53
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[33:36], off, s[0:3], 0 offset:128
@@ -7810,7 +7809,7 @@ define amdgpu_kernel void @global_zextload_v32i8_to_v32i64(ptr addrspace(1) %out
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[18:21], off, s[0:3], 0 offset:64
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[7:10], off, s[0:3], 0 offset:32
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[3:6], off, s[0:3], 0 offset:16
-; GCN-NOHSA-VI-NEXT: s_nop 0
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:224
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, v53
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GCN-NOHSA-VI-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/load-local-i16.ll b/llvm/test/CodeGen/AMDGPU/load-local-i16.ll
index bd191a37582c0..062a985dd7180 100644
--- a/llvm/test/CodeGen/AMDGPU/load-local-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-local-i16.ll
@@ -3172,27 +3172,25 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out
; VI-NO-DS128-LABEL: local_zextload_v64i16_to_v64i32:
; VI-NO-DS128: ; %bb.0:
; VI-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; VI-NO-DS128-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
; VI-NO-DS128-NEXT: s_mov_b32 m0, -1
+; VI-NO-DS128-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
; VI-NO-DS128-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
; VI-NO-DS128-NEXT: s_mov_b32 s90, -1
; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
; VI-NO-DS128-NEXT: v_mov_b32_e32 v16, s1
; VI-NO-DS128-NEXT: ds_read2_b64 v[10:13], v16 offset1:1
; VI-NO-DS128-NEXT: ds_read2_b64 v[17:20], v16 offset0:2 offset1:3
+; VI-NO-DS128-NEXT: ds_read2_b64 v[21:24], v16 offset0:4 offset1:5
; VI-NO-DS128-NEXT: s_mov_b32 s91, 0xe80000
; VI-NO-DS128-NEXT: s_add_u32 s88, s88, s11
-; VI-NO-DS128-NEXT: s_addc_u32 s89, s89, 0
-; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(1)
+; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(2)
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v11
-; VI-NO-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v11
-; VI-NO-DS128-NEXT: buffer_store_dword v0, off, s[88:91], 0 ; 4-byte Folded Spill
-; VI-NO-DS128-NEXT: buffer_store_dword v1, off, s[88:91], 0 offset:4 ; 4-byte Folded Spill
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v10
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v13
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v12
-; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(1)
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v18
+; VI-NO-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v11
; VI-NO-DS128-NEXT: v_and_b32_e32 v2, 0xffff, v10
; VI-NO-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v13
; VI-NO-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v12
@@ -3200,7 +3198,6 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v20
; VI-NO-DS128-NEXT: v_and_b32_e32 v8, 0xffff, v18
; VI-NO-DS128-NEXT: v_and_b32_e32 v10, 0xffff, v17
-; VI-NO-DS128-NEXT: ds_read2_b64 v[21:24], v16 offset0:4 offset1:5
; VI-NO-DS128-NEXT: v_and_b32_e32 v12, 0xffff, v20
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v15, 16, v19
; VI-NO-DS128-NEXT: v_and_b32_e32 v14, 0xffff, v19
@@ -3243,17 +3240,19 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v56, 16, v19
; VI-NO-DS128-NEXT: v_and_b32_e32 v55, 0xffff, v19
; VI-NO-DS128-NEXT: ds_read2_b64 v[16:19], v16 offset0:14 offset1:15
+; VI-NO-DS128-NEXT: s_addc_u32 s89, s89, 0
+; VI-NO-DS128-NEXT: buffer_store_dword v0, off, s[88:91], 0 ; 4-byte Folded Spill
+; VI-NO-DS128-NEXT: buffer_store_dword v1, off, s[88:91], 0 offset:4 ; 4-byte Folded Spill
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v54, 16, v20
; VI-NO-DS128-NEXT: v_and_b32_e32 v53, 0xffff, v20
-; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(1)
-; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v58, 16, v22
-; VI-NO-DS128-NEXT: v_and_b32_e32 v57, 0xffff, v22
; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v20, 16, v19
; VI-NO-DS128-NEXT: v_and_b32_e32 v19, 0xffff, v19
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v18
; VI-NO-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v18
; VI-NO-DS128-NEXT: v_mov_b32_e32 v18, s0
+; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v58, 16, v22
+; VI-NO-DS128-NEXT: v_and_b32_e32 v57, 0xffff, v22
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v22, 16, v21
; VI-NO-DS128-NEXT: v_and_b32_e32 v21, 0xffff, v21
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v60, 16, v24
@@ -3296,21 +3295,17 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out
; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v56, s1
; GFX9-NO-DS128-NEXT: ds_read2_b64 v[10:13], v56 offset1:1
; GFX9-NO-DS128-NEXT: ds_read2_b64 v[14:17], v56 offset0:2 offset1:3
-; GFX9-NO-DS128-NEXT: s_add_u32 s12, s12, s11
-; GFX9-NO-DS128-NEXT: s_addc_u32 s13, s13, 0
-; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(1)
-; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v11
-; GFX9-NO-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v11
-; GFX9-NO-DS128-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill
-; GFX9-NO-DS128-NEXT: s_nop 0
-; GFX9-NO-DS128-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
; GFX9-NO-DS128-NEXT: ds_read2_b64 v[18:21], v56 offset0:4 offset1:5
; GFX9-NO-DS128-NEXT: ds_read2_b64 v[22:25], v56 offset0:6 offset1:7
+; GFX9-NO-DS128-NEXT: s_add_u32 s12, s12, s11
+; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(3)
+; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v11
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v10
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v13
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v12
; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(2)
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v15
+; GFX9-NO-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v11
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v2, 0xffff, v10
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v13
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v12
@@ -3337,9 +3332,11 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v36, 0xffff, v22
; GFX9-NO-DS128-NEXT: ds_read2_b64 v[16:19], v56 offset0:8 offset1:9
; GFX9-NO-DS128-NEXT: ds_read2_b64 v[20:23], v56 offset0:10 offset1:11
+; GFX9-NO-DS128-NEXT: s_addc_u32 s13, s13, 0
+; GFX9-NO-DS128-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill
+; GFX9-NO-DS128-NEXT: s_nop 0
+; GFX9-NO-DS128-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v39, 16, v25
-; GFX9-NO-DS128-NEXT: v_and_b32_e32 v38, 0xffff, v25
; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(1)
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v41, 16, v17
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v40, 0xffff, v17
@@ -3360,16 +3357,17 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v55, 16, v22
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v54, 0xffff, v22
; GFX9-NO-DS128-NEXT: ds_read2_b64 v[20:23], v56 offset0:14 offset1:15
+; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v39, 16, v25
+; GFX9-NO-DS128-NEXT: v_and_b32_e32 v38, 0xffff, v25
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v25, 16, v24
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v24, 0xffff, v24
-; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(1)
-; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v57, 16, v17
-; GFX9-NO-DS128-NEXT: v_and_b32_e32 v56, 0xffff, v17
; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v63, 16, v23
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v62, 0xffff, v23
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v23, 16, v22
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v22, 0xffff, v22
+; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v57, 16, v17
+; GFX9-NO-DS128-NEXT: v_and_b32_e32 v56, 0xffff, v17
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v17, 16, v16
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v16, 0xffff, v16
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v59, 16, v19
@@ -3806,9 +3804,11 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out
; VI-DS128-NEXT: ds_read_b128 v[16:19], v0 offset:16
; VI-DS128-NEXT: s_mov_b32 s91, 0xe80000
; VI-DS128-NEXT: s_add_u32 s88, s88, s11
-; VI-DS128-NEXT: s_addc_u32 s89, s89, 0
-; VI-DS128-NEXT: s_waitcnt lgkmcnt(1)
+; VI-DS128-NEXT: ds_read_b128 v[20:23], v0 offset:32
+; VI-DS128-NEXT: ds_read_b128 v[24:27], v0 offset:48
+; VI-DS128-NEXT: s_waitcnt lgkmcnt(3)
; VI-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v11
+; VI-DS128-NEXT: s_addc_u32 s89, s89, 0
; VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v10
; VI-DS128-NEXT: v_mov_b32_e32 v4, v3
; VI-DS128-NEXT: v_and_b32_e32 v3, 0xffff, v11
@@ -3825,23 +3825,16 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out
; VI-DS128-NEXT: buffer_store_dword v5, off, s[88:91], 0 offset:20 ; 4-byte Folded Spill
; VI-DS128-NEXT: buffer_store_dword v6, off, s[88:91], 0 offset:24 ; 4-byte Folded Spill
; VI-DS128-NEXT: buffer_store_dword v7, off, s[88:91], 0 offset:28 ; 4-byte Folded Spill
-; VI-DS128-NEXT: s_waitcnt lgkmcnt(0)
+; VI-DS128-NEXT: s_waitcnt lgkmcnt(2)
; VI-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v19
; VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v18
-; VI-DS128-NEXT: v_and_b32_e32 v3, 0xffff, v19
-; VI-DS128-NEXT: v_and_b32_e32 v1, 0xffff, v18
-; VI-DS128-NEXT: ds_read_b128 v[20:23], v0 offset:32
-; VI-DS128-NEXT: buffer_store_dword v1, off, s[88:91], 0 offset:32 ; 4-byte Folded Spill
-; VI-DS128-NEXT: buffer_store_dword v2, off, s[88:91], 0 offset:36 ; 4-byte Folded Spill
-; VI-DS128-NEXT: buffer_store_dword v3, off, s[88:91], 0 offset:40 ; 4-byte Folded Spill
-; VI-DS128-NEXT: buffer_store_dword v4, off, s[88:91], 0 offset:44 ; 4-byte Folded Spill
-; VI-DS128-NEXT: ds_read_b128 v[24:27], v0 offset:48
-; VI-DS128-NEXT: ds_read_b128 v[36:39], v0 offset:64
; VI-DS128-NEXT: v_lshrrev_b32_e32 v15, 16, v17
; VI-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v16
+; VI-DS128-NEXT: v_and_b32_e32 v3, 0xffff, v19
+; VI-DS128-NEXT: v_and_b32_e32 v1, 0xffff, v18
; VI-DS128-NEXT: v_and_b32_e32 v14, 0xffff, v17
; VI-DS128-NEXT: v_and_b32_e32 v12, 0xffff, v16
-; VI-DS128-NEXT: s_waitcnt lgkmcnt(2)
+; VI-DS128-NEXT: s_waitcnt lgkmcnt(1)
; VI-DS128-NEXT: v_lshrrev_b32_e32 v19, 16, v23
; VI-DS128-NEXT: v_lshrrev_b32_e32 v17, 16, v22
; VI-DS128-NEXT: v_lshrrev_b32_e32 v31, 16, v21
@@ -3850,21 +3843,25 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out
; VI-DS128-NEXT: v_and_b32_e32 v16, 0xffff, v22
; VI-DS128-NEXT: v_and_b32_e32 v30, 0xffff, v21
; VI-DS128-NEXT: v_and_b32_e32 v28, 0xffff, v20
-; VI-DS128-NEXT: s_waitcnt lgkmcnt(1)
+; VI-DS128-NEXT: s_waitcnt lgkmcnt(0)
; VI-DS128-NEXT: v_lshrrev_b32_e32 v23, 16, v27
; VI-DS128-NEXT: v_lshrrev_b32_e32 v21, 16, v26
; VI-DS128-NEXT: v_lshrrev_b32_e32 v35, 16, v25
; VI-DS128-NEXT: v_lshrrev_b32_e32 v33, 16, v24
; VI-DS128-NEXT: v_and_b32_e32 v22, 0xffff, v27
+; VI-DS128-NEXT: ds_read_b128 v[36:39], v0 offset:64
; VI-DS128-NEXT: v_and_b32_e32 v20, 0xffff, v26
; VI-DS128-NEXT: v_and_b32_e32 v34, 0xffff, v25
; VI-DS128-NEXT: v_and_b32_e32 v32, 0xffff, v24
; VI-DS128-NEXT: ds_read_b128 v[24:27], v0 offset:80
; VI-DS128-NEXT: ds_read_b128 v[55:58], v0 offset:96
+; VI-DS128-NEXT: buffer_store_dword v1, off, s[88:91], 0 offset:32 ; 4-byte Folded Spill
+; VI-DS128-NEXT: buffer_store_dword v2, off, s[88:91], 0 offset:36 ; 4-byte Folded Spill
+; VI-DS128-NEXT: buffer_store_dword v3, off, s[88:91], 0 offset:40 ; 4-byte Folded Spill
+; VI-DS128-NEXT: buffer_store_dword v4, off, s[88:91], 0 offset:44 ; 4-byte Folded Spill
; VI-DS128-NEXT: s_waitcnt lgkmcnt(2)
; VI-DS128-NEXT: v_lshrrev_b32_e32 v42, 16, v39
; VI-DS128-NEXT: v_lshrrev_b32_e32 v40, 16, v38
-; VI-DS128-NEXT: v_lshrrev_b32_e32 v46, 16, v37
; VI-DS128-NEXT: s_waitcnt lgkmcnt(1)
; VI-DS128-NEXT: v_lshrrev_b32_e32 v50, 16, v27
; VI-DS128-NEXT: v_lshrrev_b32_e32 v48, 16, v26
@@ -3875,16 +3872,17 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out
; VI-DS128-NEXT: v_and_b32_e32 v53, 0xffff, v25
; VI-DS128-NEXT: v_and_b32_e32 v51, 0xffff, v24
; VI-DS128-NEXT: ds_read_b128 v[24:27], v0 offset:112
+; VI-DS128-NEXT: v_lshrrev_b32_e32 v46, 16, v37
; VI-DS128-NEXT: v_lshrrev_b32_e32 v44, 16, v36
; VI-DS128-NEXT: v_and_b32_e32 v41, 0xffff, v39
; VI-DS128-NEXT: v_and_b32_e32 v39, 0xffff, v38
-; VI-DS128-NEXT: v_and_b32_e32 v45, 0xffff, v37
; VI-DS128-NEXT: s_waitcnt lgkmcnt(0)
; VI-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v25
; VI-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v24
; VI-DS128-NEXT: v_and_b32_e32 v2, 0xffff, v25
; VI-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v24
; VI-DS128-NEXT: v_mov_b32_e32 v24, s0
+; VI-DS128-NEXT: v_and_b32_e32 v45, 0xffff, v37
; VI-DS128-NEXT: v_and_b32_e32 v43, 0xffff, v36
; VI-DS128-NEXT: v_lshrrev_b32_e32 v61, 16, v58
; VI-DS128-NEXT: v_lshrrev_b32_e32 v59, 16, v57
@@ -3943,9 +3941,11 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out
; GFX9-DS128-NEXT: ds_read_b128 v[8:11], v0
; GFX9-DS128-NEXT: ds_read_b128 v[16:19], v0 offset:16
; GFX9-DS128-NEXT: s_add_u32 s12, s12, s11
-; GFX9-DS128-NEXT: s_addc_u32 s13, s13, 0
-; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(1)
+; GFX9-DS128-NEXT: ds_read_b128 v[20:23], v0 offset:32
+; GFX9-DS128-NEXT: ds_read_b128 v[24:27], v0 offset:48
+; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(3)
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v11
+; GFX9-DS128-NEXT: s_addc_u32 s13, s13, 0
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v10
; GFX9-DS128-NEXT: v_mov_b32_e32 v4, v3
; GFX9-DS128-NEXT: v_and_b32_e32 v3, 0xffff, v11
@@ -3964,24 +3964,16 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out
; GFX9-DS128-NEXT: buffer_store_dword v5, off, s[12:15], 0 offset:20 ; 4-byte Folded Spill
; GFX9-DS128-NEXT: buffer_store_dword v6, off, s[12:15], 0 offset:24 ; 4-byte Folded Spill
; GFX9-DS128-NEXT: buffer_store_dword v7, off, s[12:15], 0 offset:28 ; 4-byte Folded Spill
-; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(2)
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v19
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v18
-; GFX9-DS128-NEXT: v_and_b32_e32 v3, 0xffff, v19
-; GFX9-DS128-NEXT: v_and_b32_e32 v1, 0xffff, v18
-; GFX9-DS128-NEXT: ds_read_b128 v[20:23], v0 offset:32
-; GFX9-DS128-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:32 ; 4-byte Folded Spill
-; GFX9-DS128-NEXT: s_nop 0
-; GFX9-DS128-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:36 ; 4-byte Folded Spill
-; GFX9-DS128-NEXT: buffer_store_dword v3, off, s[12:15], 0 offset:40 ; 4-byte Folded Spill
-; GFX9-DS128-NEXT: buffer_store_dword v4, off, s[12:15], 0 offset:44 ; 4-byte Folded Spill
-; GFX9-DS128-NEXT: ds_read_b128 v[24:27], v0 offset:48
-; GFX9-DS128-NEXT: ds_read_b128 v[36:39], v0 offset:64
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v15, 16, v17
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v16
+; GFX9-DS128-NEXT: v_and_b32_e32 v3, 0xffff, v19
+; GFX9-DS128-NEXT: v_and_b32_e32 v1, 0xffff, v18
; GFX9-DS128-NEXT: v_and_b32_e32 v14, 0xffff, v17
; GFX9-DS128-NEXT: v_and_b32_e32 v12, 0xffff, v16
-; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(2)
+; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(1)
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v19, 16, v23
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v17, 16, v22
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v31, 16, v21
@@ -3990,21 +3982,26 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out
; GFX9-DS128-NEXT: v_and_b32_e32 v16, 0xffff, v22
; GFX9-DS128-NEXT: v_and_b32_e32 v30, 0xffff, v21
; GFX9-DS128-NEXT: v_and_b32_e32 v28, 0xffff, v20
-; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(1)
+; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v23, 16, v27
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v21, 16, v26
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v35, 16, v25
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v33, 16, v24
; GFX9-DS128-NEXT: v_and_b32_e32 v22, 0xffff, v27
+; GFX9-DS128-NEXT: ds_read_b128 v[36:39], v0 offset:64
; GFX9-DS128-NEXT: v_and_b32_e32 v20, 0xffff, v26
; GFX9-DS128-NEXT: v_and_b32_e32 v34, 0xffff, v25
; GFX9-DS128-NEXT: v_and_b32_e32 v32, 0xffff, v24
; GFX9-DS128-NEXT: ds_read_b128 v[24:27], v0 offset:80
; GFX9-DS128-NEXT: ds_read_b128 v[55:58], v0 offset:96
+; GFX9-DS128-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:32 ; 4-byte Folded Spill
+; GFX9-DS128-NEXT: s_nop 0
+; GFX9-DS128-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:36 ; 4-byte Folded Spill
+; GFX9-DS128-NEXT: buffer_store_dword v3, off, s[12:15], 0 offset:40 ; 4-byte Folded Spill
+; GFX9-DS128-NEXT: buffer_store_dword v4, off, s[12:15], 0 offset:44 ; 4-byte Folded Spill
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(2)
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v42, 16, v39
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v40, 16, v38
-; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v46, 16, v37
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(1)
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v50, 16, v27
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v48, 16, v26
@@ -4015,16 +4012,17 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out
; GFX9-DS128-NEXT: v_and_b32_e32 v53, 0xffff, v25
; GFX9-DS128-NEXT: v_and_b32_e32 v51, 0xffff, v24
; GFX9-DS128-NEXT: ds_read_b128 v[24:27], v0 offset:112
+; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v46, 16, v37
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v44, 16, v36
; GFX9-DS128-NEXT: v_and_b32_e32 v41, 0xffff, v39
; GFX9-DS128-NEXT: v_and_b32_e32 v39, 0xffff, v38
-; GFX9-DS128-NEXT: v_and_b32_e32 v45, 0xffff, v37
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v25
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v24
; GFX9-DS128-NEXT: v_and_b32_e32 v2, 0xffff, v25
; GFX9-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v24
; GFX9-DS128-NEXT: v_mov_b32_e32 v24, s0
+; GFX9-DS128-NEXT: v_and_b32_e32 v45, 0xffff, v37
; GFX9-DS128-NEXT: v_and_b32_e32 v43, 0xffff, v36
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v61, 16, v58
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v59, 16, v57
@@ -4197,29 +4195,20 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out
; VI-NO-DS128-LABEL: local_sextload_v64i16_to_v64i32:
; VI-NO-DS128: ; %bb.0:
; VI-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; VI-NO-DS128-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
; VI-NO-DS128-NEXT: s_mov_b32 m0, -1
+; VI-NO-DS128-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
; VI-NO-DS128-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
; VI-NO-DS128-NEXT: s_mov_b32 s90, -1
; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
; VI-NO-DS128-NEXT: v_mov_b32_e32 v28, s1
+; VI-NO-DS128-NEXT: ds_read2_b64 v[20:23], v28 offset0:4 offset1:5
+; VI-NO-DS128-NEXT: ds_read2_b64 v[29:32], v28 offset0:6 offset1:7
+; VI-NO-DS128-NEXT: ds_read2_b64 v[33:36], v28 offset0:8 offset1:9
; VI-NO-DS128-NEXT: ds_read2_b64 v[10:13], v28 offset1:1
; VI-NO-DS128-NEXT: ds_read2_b64 v[14:17], v28 offset0:2 offset1:3
; VI-NO-DS128-NEXT: s_mov_b32 s91, 0xe80000
; VI-NO-DS128-NEXT: s_add_u32 s88, s88, s11
-; VI-NO-DS128-NEXT: s_addc_u32 s89, s89, 0
-; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(1)
-; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v11
-; VI-NO-DS128-NEXT: v_bfe_i32 v0, v11, 0, 16
-; VI-NO-DS128-NEXT: buffer_store_dword v0, off, s[88:91], 0 ; 4-byte Folded Spill
-; VI-NO-DS128-NEXT: buffer_store_dword v1, off, s[88:91], 0 offset:4 ; 4-byte Folded Spill
-; VI-NO-DS128-NEXT: ds_read2_b64 v[20:23], v28 offset0:4 offset1:5
-; VI-NO-DS128-NEXT: ds_read2_b64 v[29:32], v28 offset0:6 offset1:7
-; VI-NO-DS128-NEXT: ds_read2_b64 v[33:36], v28 offset0:8 offset1:9
-; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v10
-; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v5, 16, v13
-; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v7, 16, v12
-; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(1)
+; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(3)
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v25, 16, v30
; VI-NO-DS128-NEXT: v_bfe_i32 v24, v30, 0, 16
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v27, 16, v29
@@ -4229,7 +4218,7 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v40, 16, v31
; VI-NO-DS128-NEXT: v_bfe_i32 v39, v31, 0, 16
; VI-NO-DS128-NEXT: ds_read2_b64 v[29:32], v28 offset0:10 offset1:11
-; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(1)
+; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(3)
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v42, 16, v34
; VI-NO-DS128-NEXT: v_bfe_i32 v41, v34, 0, 16
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v44, 16, v33
@@ -4247,16 +4236,24 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v56, 16, v31
; VI-NO-DS128-NEXT: v_bfe_i32 v55, v31, 0, 16
; VI-NO-DS128-NEXT: ds_read2_b64 v[28:31], v28 offset0:14 offset1:15
+; VI-NO-DS128-NEXT: s_addc_u32 s89, s89, 0
+; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v11
+; VI-NO-DS128-NEXT: v_bfe_i32 v0, v11, 0, 16
+; VI-NO-DS128-NEXT: buffer_store_dword v0, off, s[88:91], 0 ; 4-byte Folded Spill
+; VI-NO-DS128-NEXT: buffer_store_dword v1, off, s[88:91], 0 offset:4 ; 4-byte Folded Spill
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v54, 16, v32
; VI-NO-DS128-NEXT: v_bfe_i32 v53, v32, 0, 16
-; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v15
-; VI-NO-DS128-NEXT: v_bfe_i32 v2, v10, 0, 16
; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v32, 16, v31
; VI-NO-DS128-NEXT: v_bfe_i32 v31, v31, 0, 16
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v30
; VI-NO-DS128-NEXT: v_bfe_i32 v0, v30, 0, 16
; VI-NO-DS128-NEXT: v_mov_b32_e32 v30, s0
+; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v10
+; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v5, 16, v13
+; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v7, 16, v12
+; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v15
+; VI-NO-DS128-NEXT: v_bfe_i32 v2, v10, 0, 16
; VI-NO-DS128-NEXT: v_bfe_i32 v4, v13, 0, 16
; VI-NO-DS128-NEXT: v_bfe_i32 v6, v12, 0, 16
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v11, 16, v14
@@ -4316,23 +4313,14 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out
; GFX9-NO-DS128-NEXT: s_mov_b32 s15, 0xe00000
; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v28, s1
+; GFX9-NO-DS128-NEXT: ds_read2_b64 v[20:23], v28 offset0:4 offset1:5
+; GFX9-NO-DS128-NEXT: ds_read2_b64 v[29:32], v28 offset0:6 offset1:7
+; GFX9-NO-DS128-NEXT: ds_read2_b64 v[33:36], v28 offset0:8 offset1:9
; GFX9-NO-DS128-NEXT: ds_read2_b64 v[10:13], v28 offset1:1
; GFX9-NO-DS128-NEXT: ds_read2_b64 v[14:17], v28 offset0:2 offset1:3
; GFX9-NO-DS128-NEXT: s_add_u32 s12, s12, s11
; GFX9-NO-DS128-NEXT: s_addc_u32 s13, s13, 0
-; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(1)
-; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v11
-; GFX9-NO-DS128-NEXT: v_bfe_i32 v0, v11, 0, 16
-; GFX9-NO-DS128-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill
-; GFX9-NO-DS128-NEXT: s_nop 0
-; GFX9-NO-DS128-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
-; GFX9-NO-DS128-NEXT: ds_read2_b64 v[20:23], v28 offset0:4 offset1:5
-; GFX9-NO-DS128-NEXT: ds_read2_b64 v[29:32], v28 offset0:6 offset1:7
-; GFX9-NO-DS128-NEXT: ds_read2_b64 v[33:36], v28 offset0:8 offset1:9
-; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v10
-; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v5, 16, v13
-; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v7, 16, v12
-; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(1)
+; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(3)
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v25, 16, v30
; GFX9-NO-DS128-NEXT: v_bfe_i32 v24, v30, 0, 16
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v27, 16, v29
@@ -4342,7 +4330,7 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v40, 16, v31
; GFX9-NO-DS128-NEXT: v_bfe_i32 v39, v31, 0, 16
; GFX9-NO-DS128-NEXT: ds_read2_b64 v[29:32], v28 offset0:10 offset1:11
-; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(1)
+; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(3)
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v42, 16, v34
; GFX9-NO-DS128-NEXT: v_bfe_i32 v41, v34, 0, 16
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v44, 16, v33
@@ -4360,16 +4348,24 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v56, 16, v31
; GFX9-NO-DS128-NEXT: v_bfe_i32 v55, v31, 0, 16
; GFX9-NO-DS128-NEXT: ds_read2_b64 v[28:31], v28 offset0:14 offset1:15
+; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v11
+; GFX9-NO-DS128-NEXT: v_bfe_i32 v0, v11, 0, 16
+; GFX9-NO-DS128-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill
+; GFX9-NO-DS128-NEXT: s_nop 0
+; GFX9-NO-DS128-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v54, 16, v32
; GFX9-NO-DS128-NEXT: v_bfe_i32 v53, v32, 0, 16
-; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v15
-; GFX9-NO-DS128-NEXT: v_bfe_i32 v2, v10, 0, 16
; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v32, 16, v31
; GFX9-NO-DS128-NEXT: v_bfe_i32 v31, v31, 0, 16
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v30
; GFX9-NO-DS128-NEXT: v_bfe_i32 v0, v30, 0, 16
; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v30, s0
+; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v10
+; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v5, 16, v13
+; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v7, 16, v12
+; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v15
+; GFX9-NO-DS128-NEXT: v_bfe_i32 v2, v10, 0, 16
; GFX9-NO-DS128-NEXT: v_bfe_i32 v4, v13, 0, 16
; GFX9-NO-DS128-NEXT: v_bfe_i32 v6, v12, 0, 16
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v11, 16, v14
@@ -4857,10 +4853,12 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out
; VI-DS128-NEXT: v_mov_b32_e32 v32, s1
; VI-DS128-NEXT: ds_read_b128 v[8:11], v32
; VI-DS128-NEXT: ds_read_b128 v[16:19], v32 offset:16
+; VI-DS128-NEXT: ds_read_b128 v[24:27], v32 offset:32
+; VI-DS128-NEXT: ds_read_b128 v[33:36], v32 offset:48
; VI-DS128-NEXT: s_mov_b32 s91, 0xe80000
; VI-DS128-NEXT: s_add_u32 s88, s88, s11
; VI-DS128-NEXT: s_addc_u32 s89, s89, 0
-; VI-DS128-NEXT: s_waitcnt lgkmcnt(1)
+; VI-DS128-NEXT: s_waitcnt lgkmcnt(3)
; VI-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v11
; VI-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v10
; VI-DS128-NEXT: v_bfe_i32 v2, v11, 0, 16
@@ -4873,12 +4871,6 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out
; VI-DS128-NEXT: v_ashrrev_i32_e32 v4, 16, v8
; VI-DS128-NEXT: v_bfe_i32 v5, v9, 0, 16
; VI-DS128-NEXT: v_bfe_i32 v3, v8, 0, 16
-; VI-DS128-NEXT: buffer_store_dword v3, off, s[88:91], 0 offset:16 ; 4-byte Folded Spill
-; VI-DS128-NEXT: buffer_store_dword v4, off, s[88:91], 0 offset:20 ; 4-byte Folded Spill
-; VI-DS128-NEXT: buffer_store_dword v5, off, s[88:91], 0 offset:24 ; 4-byte Folded Spill
-; VI-DS128-NEXT: buffer_store_dword v6, off, s[88:91], 0 offset:28 ; 4-byte Folded Spill
-; VI-DS128-NEXT: ds_read_b128 v[24:27], v32 offset:32
-; VI-DS128-NEXT: ds_read_b128 v[33:36], v32 offset:48
; VI-DS128-NEXT: s_waitcnt lgkmcnt(2)
; VI-DS128-NEXT: v_ashrrev_i32_e32 v11, 16, v19
; VI-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v18
@@ -4899,8 +4891,11 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out
; VI-DS128-NEXT: ds_read_b128 v[36:39], v32 offset:64
; VI-DS128-NEXT: ds_read_b128 v[40:43], v32 offset:80
; VI-DS128-NEXT: ds_read_b128 v[56:59], v32 offset:96
+; VI-DS128-NEXT: buffer_store_dword v3, off, s[88:91], 0 offset:16 ; 4-byte Folded Spill
+; VI-DS128-NEXT: buffer_store_dword v4, off, s[88:91], 0 offset:20 ; 4-byte Folded Spill
+; VI-DS128-NEXT: buffer_store_dword v5, off, s[88:91], 0 offset:24 ; 4-byte Folded Spill
+; VI-DS128-NEXT: buffer_store_dword v6, off, s[88:91], 0 offset:28 ; 4-byte Folded Spill
; VI-DS128-NEXT: v_ashrrev_i32_e32 v23, 16, v25
-; VI-DS128-NEXT: v_ashrrev_i32_e32 v21, 16, v24
; VI-DS128-NEXT: s_waitcnt lgkmcnt(2)
; VI-DS128-NEXT: v_ashrrev_i32_e32 v47, 16, v39
; VI-DS128-NEXT: v_ashrrev_i32_e32 v45, 16, v38
@@ -4913,14 +4908,15 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out
; VI-DS128-NEXT: v_bfe_i32 v52, v40, 0, 16
; VI-DS128-NEXT: ds_read_b128 v[37:40], v32 offset:112
; VI-DS128-NEXT: v_mov_b32_e32 v32, s0
+; VI-DS128-NEXT: v_ashrrev_i32_e32 v21, 16, v24
; VI-DS128-NEXT: v_bfe_i32 v22, v25, 0, 16
; VI-DS128-NEXT: v_bfe_i32 v20, v24, 0, 16
-; VI-DS128-NEXT: v_ashrrev_i32_e32 v25, 16, v35
; VI-DS128-NEXT: s_waitcnt lgkmcnt(0)
; VI-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v38
; VI-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v37
; VI-DS128-NEXT: v_bfe_i32 v2, v38, 0, 16
; VI-DS128-NEXT: v_bfe_i32 v0, v37, 0, 16
+; VI-DS128-NEXT: v_ashrrev_i32_e32 v25, 16, v35
; VI-DS128-NEXT: v_ashrrev_i32_e32 v31, 16, v34
; VI-DS128-NEXT: v_ashrrev_i32_e32 v29, 16, v33
; VI-DS128-NEXT: v_bfe_i32 v24, v35, 0, 16
@@ -4985,9 +4981,11 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out
; GFX9-DS128-NEXT: v_mov_b32_e32 v32, s1
; GFX9-DS128-NEXT: ds_read_b128 v[8:11], v32
; GFX9-DS128-NEXT: ds_read_b128 v[16:19], v32 offset:16
+; GFX9-DS128-NEXT: ds_read_b128 v[24:27], v32 offset:32
+; GFX9-DS128-NEXT: ds_read_b128 v[33:36], v32 offset:48
; GFX9-DS128-NEXT: s_add_u32 s12, s12, s11
; GFX9-DS128-NEXT: s_addc_u32 s13, s13, 0
-; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(1)
+; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(3)
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v11
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v10
; GFX9-DS128-NEXT: v_bfe_i32 v2, v11, 0, 16
@@ -5001,13 +4999,6 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v4, 16, v8
; GFX9-DS128-NEXT: v_bfe_i32 v5, v9, 0, 16
; GFX9-DS128-NEXT: v_bfe_i32 v3, v8, 0, 16
-; GFX9-DS128-NEXT: buffer_store_dword v3, off, s[12:15], 0 offset:16 ; 4-byte Folded Spill
-; GFX9-DS128-NEXT: s_nop 0
-; GFX9-DS128-NEXT: buffer_store_dword v4, off, s[12:15], 0 offset:20 ; 4-byte Folded Spill
-; GFX9-DS128-NEXT: buffer_store_dword v5, off, s[12:15], 0 offset:24 ; 4-byte Folded Spill
-; GFX9-DS128-NEXT: buffer_store_dword v6, off, s[12:15], 0 offset:28 ; 4-byte Folded Spill
-; GFX9-DS128-NEXT: ds_read_b128 v[24:27], v32 offset:32
-; GFX9-DS128-NEXT: ds_read_b128 v[33:36], v32 offset:48
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(2)
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v11, 16, v19
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v18
@@ -5028,8 +5019,12 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out
; GFX9-DS128-NEXT: ds_read_b128 v[36:39], v32 offset:64
; GFX9-DS128-NEXT: ds_read_b128 v[40:43], v32 offset:80
; GFX9-DS128-NEXT: ds_read_b128 v[56:59], v32 offset:96
+; GFX9-DS128-NEXT: buffer_store_dword v3, off, s[12:15], 0 offset:16 ; 4-byte Folded Spill
+; GFX9-DS128-NEXT: s_nop 0
+; GFX9-DS128-NEXT: buffer_store_dword v4, off, s[12:15], 0 offset:20 ; 4-byte Folded Spill
+; GFX9-DS128-NEXT: buffer_store_dword v5, off, s[12:15], 0 offset:24 ; 4-byte Folded Spill
+; GFX9-DS128-NEXT: buffer_store_dword v6, off, s[12:15], 0 offset:28 ; 4-byte Folded Spill
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v23, 16, v25
-; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v21, 16, v24
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(2)
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v47, 16, v39
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v45, 16, v38
@@ -5042,14 +5037,15 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out
; GFX9-DS128-NEXT: v_bfe_i32 v52, v40, 0, 16
; GFX9-DS128-NEXT: ds_read_b128 v[37:40], v32 offset:112
; GFX9-DS128-NEXT: v_mov_b32_e32 v32, s0
+; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v21, 16, v24
; GFX9-DS128-NEXT: v_bfe_i32 v22, v25, 0, 16
; GFX9-DS128-NEXT: v_bfe_i32 v20, v24, 0, 16
-; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v25, 16, v35
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v38
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v37
; GFX9-DS128-NEXT: v_bfe_i32 v2, v38, 0, 16
; GFX9-DS128-NEXT: v_bfe_i32 v0, v37, 0, 16
+; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v25, 16, v35
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v31, 16, v34
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v29, 16, v33
; GFX9-DS128-NEXT: v_bfe_i32 v24, v35, 0, 16
diff --git a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-lastuse-metadata.ll b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-lastuse-metadata.ll
index 1d1d3e4a68fee..9da7a79ba2fdf 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-lastuse-metadata.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-lastuse-metadata.ll
@@ -15,24 +15,23 @@ define amdgpu_kernel void @buffer_last_use_load_0(ptr addrspace(7) %in, ptr addr
; GFX12-NEXT: s_mov_b32 s9, s12
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_mov_b32 s6, s3
-; GFX12-NEXT: v_mov_b32_e32 v0, s0
; GFX12-NEXT: s_mov_b32 s8, s1
; GFX12-NEXT: s_or_b64 s[10:11], s[6:7], s[12:13]
; GFX12-NEXT: s_mov_b32 s13, s2
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: v_mov_b32_e32 v0, s0
; GFX12-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13]
-; GFX12-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_LU
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b32 s13, s[4:5], 0x30
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x20
; GFX12-NEXT: s_mov_b32 s5, s12
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_mov_b32 s4, s3
; GFX12-NEXT: v_mov_b32_e32 v1, s0
+; GFX12-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_LU
+; GFX12-NEXT: s_mov_b32 s4, s3
+; GFX12-NEXT: s_mov_b32 s3, s12
; GFX12-NEXT: s_or_b64 s[6:7], s[4:5], s[12:13]
; GFX12-NEXT: s_mov_b32 s13, s2
; GFX12-NEXT: s_mov_b32 s2, s1
-; GFX12-NEXT: s_mov_b32 s3, s12
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_or_b64 s[4:5], s[2:3], s[12:13]
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -63,10 +62,10 @@ define amdgpu_kernel void @buffer_last_use_load_1(ptr addrspace(7) %in, ptr addr
; GFX12-NEXT: s_mov_b32 s13, s2
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13]
-; GFX12-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_LU
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b32 s13, s[4:5], 0x30
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x20
+; GFX12-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_LU
; GFX12-NEXT: s_mov_b32 s5, s12
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_mov_b32 s4, s3
@@ -100,25 +99,24 @@ define amdgpu_kernel void @buffer_last_use_and_volatile_load(ptr addrspace(7) %i
; GFX12-NEXT: s_mov_b32 s9, s12
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_mov_b32 s6, s3
-; GFX12-NEXT: v_mov_b32_e32 v0, s0
; GFX12-NEXT: s_mov_b32 s8, s1
; GFX12-NEXT: s_or_b64 s[10:11], s[6:7], s[12:13]
; GFX12-NEXT: s_mov_b32 s13, s2
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: v_mov_b32_e32 v0, s0
; GFX12-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13]
-; GFX12-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_BYPASS scope:SCOPE_SYS
-; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b32 s13, s[4:5], 0x30
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x20
; GFX12-NEXT: s_mov_b32 s5, s12
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_mov_b32 s4, s3
; GFX12-NEXT: v_mov_b32_e32 v1, s0
+; GFX12-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_BYPASS scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_mov_b32 s4, s3
+; GFX12-NEXT: s_mov_b32 s3, s12
; GFX12-NEXT: s_or_b64 s[6:7], s[4:5], s[12:13]
; GFX12-NEXT: s_mov_b32 s13, s2
;
GFX12-NEXT: s_mov_b32 s2, s1 -; GFX12-NEXT: s_mov_b32 s3, s12 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_or_b64 s[4:5], s[2:3], s[12:13] ; GFX12-NEXT: buffer_store_b32 v0, v1, s[4:7], null offen @@ -141,24 +139,23 @@ define amdgpu_kernel void @buffer_last_use_and_nontemporal_load(ptr addrspace(7) ; GFX12-NEXT: s_mov_b32 s9, s12 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_mov_b32 s6, s3 -; GFX12-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-NEXT: s_mov_b32 s8, s1 ; GFX12-NEXT: s_or_b64 s[10:11], s[6:7], s[12:13] ; GFX12-NEXT: s_mov_b32 s13, s2 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13] -; GFX12-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_LU ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b32 s13, s[4:5], 0x30 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x20 ; GFX12-NEXT: s_mov_b32 s5, s12 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_mov_b32 s4, s3 ; GFX12-NEXT: v_mov_b32_e32 v1, s0 +; GFX12-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_LU +; GFX12-NEXT: s_mov_b32 s4, s3 +; GFX12-NEXT: s_mov_b32 s3, s12 ; GFX12-NEXT: s_or_b64 s[6:7], s[4:5], s[12:13] ; GFX12-NEXT: s_mov_b32 s13, s2 ; GFX12-NEXT: s_mov_b32 s2, s1 -; GFX12-NEXT: s_mov_b32 s3, s12 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_or_b64 s[4:5], s[2:3], s[12:13] ; GFX12-NEXT: s_wait_loadcnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-nontemporal-metadata.ll b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-nontemporal-metadata.ll index fc36ed939d91d..84db54c2d537f 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-nontemporal-metadata.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-nontemporal-metadata.ll @@ -128,10 +128,10 @@ define amdgpu_kernel void @buffer_nontemporal_load_store(ptr addrspace(7) %in, p ; GFX10-SDAG-NEXT: s_or_b64 s[6:7], s[4:5], s[10:11] ; GFX10-SDAG-NEXT: s_mov_b32 s11, s2 ; GFX10-SDAG-NEXT: s_or_b64 s[4:5], s[12:13], s[10:11] -; GFX10-SDAG-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen slc ; GFX10-SDAG-NEXT: s_clause 0x1 ; GFX10-SDAG-NEXT: s_load_dword s11, s[8:9], 0x30 ; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x20 +; GFX10-SDAG-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen slc ; GFX10-SDAG-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SDAG-NEXT: s_mov_b32 s5, s10 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -181,24 +181,23 @@ define amdgpu_kernel void @buffer_nontemporal_load_store(ptr addrspace(7) %in, p ; GFX11-SDAG-NEXT: s_mov_b32 s9, s12 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: s_mov_b32 s6, s3 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-SDAG-NEXT: s_mov_b32 s8, s1 ; GFX11-SDAG-NEXT: s_or_b64 s[10:11], s[6:7], s[12:13] ; GFX11-SDAG-NEXT: s_mov_b32 s13, s2 -; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-SDAG-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13] -; GFX11-SDAG-NEXT: buffer_load_b32 v0, v0, s[8:11], 0 offen slc dlc ; GFX11-SDAG-NEXT: s_clause 0x1 ; GFX11-SDAG-NEXT: s_load_b32 s13, s[4:5], 0x30 ; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x20 ; GFX11-SDAG-NEXT: s_mov_b32 s5, s12 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: s_mov_b32 s4, s3 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-SDAG-NEXT: buffer_load_b32 v0, v0, s[8:11], 0 offen slc dlc +; GFX11-SDAG-NEXT: s_mov_b32 s4, s3 +; GFX11-SDAG-NEXT: s_mov_b32 s3, s12 ; GFX11-SDAG-NEXT: s_or_b64 s[6:7], s[4:5], s[12:13] ; GFX11-SDAG-NEXT: s_mov_b32 s13, s2 
; GFX11-SDAG-NEXT: s_mov_b32 s2, s1 -; GFX11-SDAG-NEXT: s_mov_b32 s3, s12 ; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-SDAG-NEXT: s_or_b64 s[4:5], s[2:3], s[12:13] ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) @@ -215,12 +214,12 @@ define amdgpu_kernel void @buffer_nontemporal_load_store(ptr addrspace(7) %in, p ; GFX11-GISEL-NEXT: s_mov_b32 s8, s1 ; GFX11-GISEL-NEXT: s_mov_b32 s9, s2 ; GFX11-GISEL-NEXT: s_mov_b32 s10, s3 -; GFX11-GISEL-NEXT: buffer_load_b32 v0, v0, s[8:11], 0 offen slc dlc ; GFX11-GISEL-NEXT: s_clause 0x1 ; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x20 ; GFX11-GISEL-NEXT: s_load_b32 s7, s[4:5], 0x30 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-GISEL-NEXT: buffer_load_b32 v0, v0, s[8:11], 0 offen slc dlc ; GFX11-GISEL-NEXT: s_mov_b32 s4, s1 ; GFX11-GISEL-NEXT: s_mov_b32 s5, s2 ; GFX11-GISEL-NEXT: s_mov_b32 s6, s3 @@ -239,24 +238,23 @@ define amdgpu_kernel void @buffer_nontemporal_load_store(ptr addrspace(7) %in, p ; GFX12-SDAG-NEXT: s_mov_b32 s9, s12 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: s_mov_b32 s6, s3 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-SDAG-NEXT: s_mov_b32 s8, s1 ; GFX12-SDAG-NEXT: s_or_b64 s[10:11], s[6:7], s[12:13] ; GFX12-SDAG-NEXT: s_mov_b32 s13, s2 -; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-SDAG-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13] -; GFX12-SDAG-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_NT ; GFX12-SDAG-NEXT: s_clause 0x1 ; GFX12-SDAG-NEXT: s_load_b32 s13, s[4:5], 0x30 ; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x20 ; GFX12-SDAG-NEXT: s_mov_b32 s5, s12 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: s_mov_b32 s4, s3 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s0 +; GFX12-SDAG-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_NT +; GFX12-SDAG-NEXT: s_mov_b32 s4, s3 +; GFX12-SDAG-NEXT: s_mov_b32 s3, s12 ; GFX12-SDAG-NEXT: s_or_b64 s[6:7], s[4:5], s[12:13] ; GFX12-SDAG-NEXT: s_mov_b32 s13, s2 ; GFX12-SDAG-NEXT: s_mov_b32 s2, s1 -; GFX12-SDAG-NEXT: s_mov_b32 s3, s12 ; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-SDAG-NEXT: s_or_b64 s[4:5], s[2:3], s[12:13] ; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 @@ -273,12 +271,12 @@ define amdgpu_kernel void @buffer_nontemporal_load_store(ptr addrspace(7) %in, p ; GFX12-GISEL-NEXT: s_mov_b32 s8, s1 ; GFX12-GISEL-NEXT: s_mov_b32 s9, s2 ; GFX12-GISEL-NEXT: s_mov_b32 s10, s3 -; GFX12-GISEL-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_NT ; GFX12-GISEL-NEXT: s_clause 0x1 ; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x20 ; GFX12-GISEL-NEXT: s_load_b32 s7, s[4:5], 0x30 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s0 +; GFX12-GISEL-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_NT ; GFX12-GISEL-NEXT: s_mov_b32 s4, s1 ; GFX12-GISEL-NEXT: s_mov_b32 s5, s2 ; GFX12-GISEL-NEXT: s_mov_b32 s6, s3 @@ -413,11 +411,11 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp ; GFX10-SDAG-NEXT: s_or_b64 s[6:7], s[4:5], s[10:11] ; GFX10-SDAG-NEXT: s_mov_b32 s11, s2 ; GFX10-SDAG-NEXT: s_or_b64 s[4:5], s[12:13], s[10:11] -; GFX10-SDAG-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen glc dlc -; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX10-SDAG-NEXT: s_clause 0x1 ; GFX10-SDAG-NEXT: s_load_dword s11, s[8:9], 0x30 ; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x20 +; GFX10-SDAG-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen glc dlc +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) ; 
GFX10-SDAG-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SDAG-NEXT: s_mov_b32 s5, s10 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -468,25 +466,24 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp ; GFX11-SDAG-NEXT: s_mov_b32 s9, s12 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: s_mov_b32 s6, s3 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-SDAG-NEXT: s_mov_b32 s8, s1 ; GFX11-SDAG-NEXT: s_or_b64 s[10:11], s[6:7], s[12:13] ; GFX11-SDAG-NEXT: s_mov_b32 s13, s2 -; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-SDAG-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13] -; GFX11-SDAG-NEXT: buffer_load_b32 v0, v0, s[8:11], 0 offen glc dlc -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX11-SDAG-NEXT: s_clause 0x1 ; GFX11-SDAG-NEXT: s_load_b32 s13, s[4:5], 0x30 ; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x20 ; GFX11-SDAG-NEXT: s_mov_b32 s5, s12 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: s_mov_b32 s4, s3 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-SDAG-NEXT: buffer_load_b32 v0, v0, s[8:11], 0 offen glc dlc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: s_mov_b32 s4, s3 +; GFX11-SDAG-NEXT: s_mov_b32 s3, s12 ; GFX11-SDAG-NEXT: s_or_b64 s[6:7], s[4:5], s[12:13] ; GFX11-SDAG-NEXT: s_mov_b32 s13, s2 ; GFX11-SDAG-NEXT: s_mov_b32 s2, s1 -; GFX11-SDAG-NEXT: s_mov_b32 s3, s12 ; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-SDAG-NEXT: s_or_b64 s[4:5], s[2:3], s[12:13] ; GFX11-SDAG-NEXT: buffer_store_b32 v0, v1, s[4:7], 0 offen dlc @@ -503,13 +500,13 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp ; GFX11-GISEL-NEXT: s_mov_b32 s8, s1 ; GFX11-GISEL-NEXT: s_mov_b32 s9, s2 ; GFX11-GISEL-NEXT: s_mov_b32 s10, s3 -; GFX11-GISEL-NEXT: buffer_load_b32 v0, v0, s[8:11], 0 offen glc dlc -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX11-GISEL-NEXT: s_clause 0x1 ; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x20 ; GFX11-GISEL-NEXT: s_load_b32 s7, s[4:5], 0x30 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-GISEL-NEXT: buffer_load_b32 v0, v0, s[8:11], 0 offen glc dlc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX11-GISEL-NEXT: s_mov_b32 s4, s1 ; GFX11-GISEL-NEXT: s_mov_b32 s5, s2 ; GFX11-GISEL-NEXT: s_mov_b32 s6, s3 @@ -528,25 +525,24 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp ; GFX12-SDAG-NEXT: s_mov_b32 s9, s12 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: s_mov_b32 s6, s3 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-SDAG-NEXT: s_mov_b32 s8, s1 ; GFX12-SDAG-NEXT: s_or_b64 s[10:11], s[6:7], s[12:13] ; GFX12-SDAG-NEXT: s_mov_b32 s13, s2 -; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-SDAG-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13] -; GFX12-SDAG-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_NT scope:SCOPE_SYS -; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: s_clause 0x1 ; GFX12-SDAG-NEXT: s_load_b32 s13, s[4:5], 0x30 ; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x20 ; GFX12-SDAG-NEXT: s_mov_b32 s5, s12 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: s_mov_b32 s4, s3 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s0 +; GFX12-SDAG-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_NT scope:SCOPE_SYS +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX12-SDAG-NEXT: s_mov_b32 s4, s3 +; GFX12-SDAG-NEXT: s_mov_b32 s3, s12 ; GFX12-SDAG-NEXT: s_or_b64 s[6:7], s[4:5], s[12:13] ; GFX12-SDAG-NEXT: 
s_mov_b32 s13, s2 ; GFX12-SDAG-NEXT: s_mov_b32 s2, s1 -; GFX12-SDAG-NEXT: s_mov_b32 s3, s12 ; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-SDAG-NEXT: s_or_b64 s[4:5], s[2:3], s[12:13] ; GFX12-SDAG-NEXT: buffer_store_b32 v0, v1, s[4:7], null offen th:TH_STORE_NT scope:SCOPE_SYS @@ -563,13 +559,13 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp ; GFX12-GISEL-NEXT: s_mov_b32 s8, s1 ; GFX12-GISEL-NEXT: s_mov_b32 s9, s2 ; GFX12-GISEL-NEXT: s_mov_b32 s10, s3 -; GFX12-GISEL-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_NT scope:SCOPE_SYS -; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: s_clause 0x1 ; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x20 ; GFX12-GISEL-NEXT: s_load_b32 s7, s[4:5], 0x30 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s0 +; GFX12-GISEL-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_NT scope:SCOPE_SYS +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: s_mov_b32 s4, s1 ; GFX12-GISEL-NEXT: s_mov_b32 s5, s2 ; GFX12-GISEL-NEXT: s_mov_b32 s6, s3 diff --git a/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-memcpy.ll b/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-memcpy.ll index 4ab05c2923fdb..c1f4d7bbf650e 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-memcpy.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-memcpy.ll @@ -19,12 +19,12 @@ $_f2 = comdat any define protected amdgpu_kernel void @test(ptr addrspace(1) nocapture %ptr.coerce) local_unnamed_addr #0 { ; GCN-LABEL: test: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: v_mov_b32_e32 v0, 2 ; GCN-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NEXT: ds_write_b8 v1, v0 ; GCN-NEXT: ds_read_u8 v2, v1 offset:2 ; GCN-NEXT: ds_read_u16 v3, v1 ; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GCN-NEXT: v_mov_b32_e32 v0, 2 +; GCN-NEXT: ds_write_b8 v1, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: ds_write_b8 v1, v2 offset:6 ; GCN-NEXT: ds_write_b16 v1, v3 offset:4 diff --git a/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa.ll b/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa.ll index 24c1bfb8d50f0..ccfd45bc87e71 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa.ll @@ -75,15 +75,15 @@ define amdgpu_kernel void @no_clobber_ds_load_stores_x3(ptr addrspace(1) %arg, i ; GCN-LABEL: no_clobber_ds_load_stores_x3: ; GCN: ; %bb.0: ; %bb ; GCN-NEXT: s_load_dword s0, s[4:5], 0x2c +; GCN-NEXT: v_mov_b32_e32 v0, 1 ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: v_mov_b32_e32 v2, 2 -; GCN-NEXT: v_mov_b32_e32 v0, 1 -; GCN-NEXT: ds_write_b32 v1, v2 offset:256 +; GCN-NEXT: ds_write_b32 v1, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_lshl_b32 s0, s0, 2 -; GCN-NEXT: v_mov_b32_e32 v2, 3 -; GCN-NEXT: ds_write_b32 v1, v0 ; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: ds_write_b32 v1, v2 offset:256 +; GCN-NEXT: v_mov_b32_e32 v2, 3 ; GCN-NEXT: ds_write_b32 v1, v2 offset:512 ; GCN-NEXT: ds_read_b32 v2, v0 ; GCN-NEXT: ds_read_b32 v3, v0 offset:256 diff --git a/llvm/test/CodeGen/AMDGPU/max.ll b/llvm/test/CodeGen/AMDGPU/max.ll index ae0805448d693..ba532949a687d 100644 --- a/llvm/test/CodeGen/AMDGPU/max.ll +++ b/llvm/test/CodeGen/AMDGPU/max.ll @@ -774,9 +774,9 @@ define amdgpu_kernel void @v_test_umax_ugt_i32(ptr addrspace(1) %out, ptr addrsp ; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1250-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: global_load_b32 v0, v0, s[0:1] scale_offset ; GFX1250-NEXT: s_load_b32 s6, s[0:1], 0x0 ; GFX1250-NEXT: s_load_b64 s[2:3], 
s[4:5], 0x24 +; GFX1250-NEXT: global_load_b32 v0, v0, s[0:1] scale_offset ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_max_u32_e32 v0, s6, v0 diff --git a/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll b/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll index 02f39e25cb447..af7ca0fb59682 100644 --- a/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll +++ b/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll @@ -714,7 +714,7 @@ define void @memcpy_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0) ; ALIGNED-NEXT: flat_store_byte v[16:17], v4 offset:1 ; ALIGNED-NEXT: s_cbranch_vccnz .LBB0_1 ; ALIGNED-NEXT: ; %bb.2: ; %memcpy-split -; ALIGNED-NEXT: s_clause 0x7 +; ALIGNED-NEXT: s_clause 0x7 ; 32-byte Folded Reload ; ALIGNED-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; ALIGNED-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; ALIGNED-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 @@ -1468,7 +1468,7 @@ define void @memcpy_p1_p1_sz2048(ptr addrspace(1) align 1 %dst, ptr addrspace(1) ; ALIGNED-NEXT: global_store_byte v[16:17], v4, off offset:1 ; ALIGNED-NEXT: s_cbranch_vccnz .LBB1_1 ; ALIGNED-NEXT: ; %bb.2: ; %memcpy-split -; ALIGNED-NEXT: s_clause 0x7 +; ALIGNED-NEXT: s_clause 0x7 ; 32-byte Folded Reload ; ALIGNED-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; ALIGNED-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; ALIGNED-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 @@ -1854,6 +1854,10 @@ define void @memcpy_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4) ; ALIGNED-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:220 ; ALIGNED-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:212 ; ALIGNED-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:208 +; ALIGNED-NEXT: flat_store_byte v[96:97], v82 offset:143 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v18 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 8, v26 ; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v38 offset:138 ; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v39 offset:142 ; ALIGNED-NEXT: flat_store_byte v[96:97], v39 offset:140 @@ -1862,10 +1866,6 @@ define void @memcpy_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4) ; ALIGNED-NEXT: flat_store_byte v[96:97], v37 offset:132 ; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v36 offset:130 ; ALIGNED-NEXT: flat_store_byte v[96:97], v36 offset:128 -; ALIGNED-NEXT: flat_store_byte v[96:97], v82 offset:143 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v18 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 8, v26 ; ALIGNED-NEXT: flat_store_byte v[96:97], v66 offset:139 ; ALIGNED-NEXT: flat_store_byte v[96:97], v67 offset:137 ; ALIGNED-NEXT: flat_store_byte v[96:97], v83 offset:141 @@ -1901,14 +1901,6 @@ define void @memcpy_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4) ; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:60 ; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:52 ; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:48 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v30 offset:106 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v31 offset:110 -; ALIGNED-NEXT: flat_store_byte v[96:97], v31 offset:108 -; ALIGNED-NEXT: flat_store_byte v[96:97], v30 offset:104 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v29 offset:102 -; ALIGNED-NEXT: flat_store_byte v[96:97], v29 offset:100 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v28 offset:98 -; ALIGNED-NEXT: 
flat_store_byte v[96:97], v28 offset:96 ; ALIGNED-NEXT: flat_store_byte v[96:97], v69 offset:111 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 24, v6 @@ -1923,6 +1915,14 @@ define void @memcpy_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4) ; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v25 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 24, v19 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v15 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v30 offset:106 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v31 offset:110 +; ALIGNED-NEXT: flat_store_byte v[96:97], v31 offset:108 +; ALIGNED-NEXT: flat_store_byte v[96:97], v30 offset:104 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v29 offset:102 +; ALIGNED-NEXT: flat_store_byte v[96:97], v29 offset:100 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v28 offset:98 +; ALIGNED-NEXT: flat_store_byte v[96:97], v28 offset:96 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v28, 24, v11 ; ALIGNED-NEXT: flat_store_byte v[96:97], v71 offset:103 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 24, v7 @@ -3438,7 +3438,7 @@ define void @memcpy_p5_p5_sz2048(ptr addrspace(5) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: v_add_nc_u32_e32 v0, 0x100, v0 ; ALIGNED-NEXT: s_cbranch_vccnz .LBB3_1 ; ALIGNED-NEXT: ; %bb.2: ; %memcpy-split -; ALIGNED-NEXT: s_clause 0x2f +; ALIGNED-NEXT: s_clause 0x2f ; 192-byte Folded Reload ; ALIGNED-NEXT: buffer_load_dword v127, off, s[0:3], s32 ; ALIGNED-NEXT: buffer_load_dword v126, off, s[0:3], s32 offset:4 ; ALIGNED-NEXT: buffer_load_dword v125, off, s[0:3], s32 offset:8 @@ -3741,23 +3741,23 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1228 ; 4-byte Folded Spill ; ALIGNED-NEXT: .LBB4_1: ; %load-store-loop ; ALIGNED-NEXT: ; =>This Inner Loop Header: Depth=1 -; ALIGNED-NEXT: s_clause 0x39 +; ALIGNED-NEXT: s_clause 0x3e ; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:20 ; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:21 ; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:22 ; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:23 -; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:24 -; ALIGNED-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:25 -; ALIGNED-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:26 ; ALIGNED-NEXT: buffer_load_ubyte v126, v2, s[0:3], 0 offen offset:19 ; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:28 ; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:29 ; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:30 ; ALIGNED-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:31 +; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:24 +; ALIGNED-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:25 +; ALIGNED-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:26 +; ALIGNED-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:27 ; ALIGNED-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:32 ; ALIGNED-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:33 ; ALIGNED-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:34 -; ALIGNED-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:27 ; ALIGNED-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:35 ; ALIGNED-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:36 ; ALIGNED-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:37 @@ -3779,17 +3779,17 @@ define void 
@memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: buffer_load_ubyte v37, v2, s[0:3], 0 offen offset:53 ; ALIGNED-NEXT: buffer_load_ubyte v35, v2, s[0:3], 0 offen offset:54 ; ALIGNED-NEXT: buffer_load_ubyte v36, v2, s[0:3], 0 offen offset:55 -; ALIGNED-NEXT: buffer_load_ubyte v48, v2, s[0:3], 0 offen offset:56 -; ALIGNED-NEXT: buffer_load_ubyte v51, v2, s[0:3], 0 offen offset:57 -; ALIGNED-NEXT: buffer_load_ubyte v52, v2, s[0:3], 0 offen offset:58 ; ALIGNED-NEXT: buffer_load_ubyte v38, v2, s[0:3], 0 offen offset:60 ; ALIGNED-NEXT: buffer_load_ubyte v50, v2, s[0:3], 0 offen offset:61 ; ALIGNED-NEXT: buffer_load_ubyte v39, v2, s[0:3], 0 offen offset:62 ; ALIGNED-NEXT: buffer_load_ubyte v49, v2, s[0:3], 0 offen offset:63 +; ALIGNED-NEXT: buffer_load_ubyte v48, v2, s[0:3], 0 offen offset:56 +; ALIGNED-NEXT: buffer_load_ubyte v51, v2, s[0:3], 0 offen offset:57 +; ALIGNED-NEXT: buffer_load_ubyte v52, v2, s[0:3], 0 offen offset:58 +; ALIGNED-NEXT: buffer_load_ubyte v53, v2, s[0:3], 0 offen offset:59 ; ALIGNED-NEXT: buffer_load_ubyte v29, v2, s[0:3], 0 offen offset:64 ; ALIGNED-NEXT: buffer_load_ubyte v55, v2, s[0:3], 0 offen offset:65 ; ALIGNED-NEXT: buffer_load_ubyte v66, v2, s[0:3], 0 offen offset:66 -; ALIGNED-NEXT: buffer_load_ubyte v53, v2, s[0:3], 0 offen offset:59 ; ALIGNED-NEXT: buffer_load_ubyte v67, v2, s[0:3], 0 offen offset:67 ; ALIGNED-NEXT: buffer_load_ubyte v54, v2, s[0:3], 0 offen offset:68 ; ALIGNED-NEXT: buffer_load_ubyte v64, v2, s[0:3], 0 offen offset:69 @@ -3797,57 +3797,96 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: buffer_load_ubyte v68, v2, s[0:3], 0 offen offset:71 ; ALIGNED-NEXT: buffer_load_ubyte v69, v2, s[0:3], 0 offen offset:76 ; ALIGNED-NEXT: buffer_load_ubyte v70, v2, s[0:3], 0 offen offset:77 -; ALIGNED-NEXT: buffer_load_ubyte v81, v2, s[0:3], 0 offen offset:75 ; ALIGNED-NEXT: buffer_load_ubyte v71, v2, s[0:3], 0 offen offset:78 ; ALIGNED-NEXT: buffer_load_ubyte v80, v2, s[0:3], 0 offen offset:79 -; ALIGNED-NEXT: s_waitcnt vmcnt(57) +; ALIGNED-NEXT: buffer_load_ubyte v81, v2, s[0:3], 0 offen offset:75 +; ALIGNED-NEXT: buffer_load_ubyte v127, v2, s[0:3], 0 offen offset:159 +; ALIGNED-NEXT: buffer_load_ubyte v124, v2, s[0:3], 0 offen offset:155 +; ALIGNED-NEXT: buffer_load_ubyte v125, v2, s[0:3], 0 offen offset:152 +; ALIGNED-NEXT: buffer_load_ubyte v123, v2, s[0:3], 0 offen offset:153 +; ALIGNED-NEXT: buffer_load_ubyte v111, v2, s[0:3], 0 offen offset:154 +; ALIGNED-NEXT: s_clause 0x33 +; ALIGNED-NEXT: buffer_load_ubyte v110, v2, s[0:3], 0 offen offset:160 +; ALIGNED-NEXT: buffer_load_ubyte v105, v2, s[0:3], 0 offen offset:161 +; ALIGNED-NEXT: buffer_load_ubyte v94, v2, s[0:3], 0 offen offset:162 +; ALIGNED-NEXT: buffer_load_ubyte v93, v2, s[0:3], 0 offen offset:163 +; ALIGNED-NEXT: buffer_load_ubyte v108, v2, s[0:3], 0 offen offset:164 +; ALIGNED-NEXT: buffer_load_ubyte v95, v2, s[0:3], 0 offen offset:165 +; ALIGNED-NEXT: buffer_load_ubyte v104, v2, s[0:3], 0 offen offset:166 +; ALIGNED-NEXT: buffer_load_ubyte v92, v2, s[0:3], 0 offen offset:167 +; ALIGNED-NEXT: buffer_load_ubyte v116, v2, s[0:3], 0 offen offset:192 +; ALIGNED-NEXT: buffer_load_ubyte v113, v2, s[0:3], 0 offen offset:193 +; ALIGNED-NEXT: buffer_load_ubyte v101, v2, s[0:3], 0 offen offset:194 +; ALIGNED-NEXT: buffer_load_ubyte v102, v2, s[0:3], 0 offen offset:195 +; ALIGNED-NEXT: buffer_load_ubyte v114, v2, s[0:3], 0 offen offset:196 +; ALIGNED-NEXT: buffer_load_ubyte v103, v2, s[0:3], 0 offen offset:197 +; 
ALIGNED-NEXT: buffer_load_ubyte v112, v2, s[0:3], 0 offen offset:198 +; ALIGNED-NEXT: buffer_load_ubyte v100, v2, s[0:3], 0 offen offset:199 +; ALIGNED-NEXT: buffer_load_ubyte v98, v2, s[0:3], 0 offen offset:204 +; ALIGNED-NEXT: buffer_load_ubyte v97, v2, s[0:3], 0 offen offset:205 +; ALIGNED-NEXT: buffer_load_ubyte v96, v2, s[0:3], 0 offen offset:206 +; ALIGNED-NEXT: buffer_load_ubyte v87, v2, s[0:3], 0 offen offset:207 +; ALIGNED-NEXT: buffer_load_ubyte v85, v2, s[0:3], 0 offen offset:203 +; ALIGNED-NEXT: buffer_load_ubyte v86, v2, s[0:3], 0 offen offset:200 +; ALIGNED-NEXT: buffer_load_ubyte v84, v2, s[0:3], 0 offen offset:201 +; ALIGNED-NEXT: buffer_load_ubyte v83, v2, s[0:3], 0 offen offset:202 +; ALIGNED-NEXT: buffer_load_ubyte v90, v2, s[0:3], 0 offen offset:172 +; ALIGNED-NEXT: buffer_load_ubyte v79, v2, s[0:3], 0 offen offset:173 +; ALIGNED-NEXT: buffer_load_ubyte v88, v2, s[0:3], 0 offen offset:174 +; ALIGNED-NEXT: buffer_load_ubyte v78, v2, s[0:3], 0 offen offset:175 +; ALIGNED-NEXT: buffer_load_ubyte v75, v2, s[0:3], 0 offen offset:171 +; ALIGNED-NEXT: buffer_load_ubyte v76, v2, s[0:3], 0 offen offset:168 +; ALIGNED-NEXT: buffer_load_ubyte v72, v2, s[0:3], 0 offen offset:169 +; ALIGNED-NEXT: buffer_load_ubyte v63, v2, s[0:3], 0 offen offset:170 +; ALIGNED-NEXT: buffer_load_ubyte v62, v2, s[0:3], 0 offen offset:176 +; ALIGNED-NEXT: buffer_load_ubyte v60, v2, s[0:3], 0 offen offset:177 +; ALIGNED-NEXT: buffer_load_ubyte v57, v2, s[0:3], 0 offen offset:178 +; ALIGNED-NEXT: buffer_load_ubyte v56, v2, s[0:3], 0 offen offset:179 +; ALIGNED-NEXT: buffer_load_ubyte v61, v2, s[0:3], 0 offen offset:180 +; ALIGNED-NEXT: buffer_load_ubyte v59, v2, s[0:3], 0 offen offset:181 +; ALIGNED-NEXT: buffer_load_ubyte v58, v2, s[0:3], 0 offen offset:182 +; ALIGNED-NEXT: buffer_load_ubyte v47, v2, s[0:3], 0 offen offset:183 +; ALIGNED-NEXT: buffer_load_ubyte v45, v2, s[0:3], 0 offen offset:188 +; ALIGNED-NEXT: buffer_load_ubyte v43, v2, s[0:3], 0 offen offset:189 +; ALIGNED-NEXT: buffer_load_ubyte v44, v2, s[0:3], 0 offen offset:190 +; ALIGNED-NEXT: buffer_load_ubyte v42, v2, s[0:3], 0 offen offset:191 +; ALIGNED-NEXT: buffer_load_ubyte v40, v2, s[0:3], 0 offen offset:187 +; ALIGNED-NEXT: buffer_load_ubyte v41, v2, s[0:3], 0 offen offset:184 +; ALIGNED-NEXT: buffer_load_ubyte v119, v2, s[0:3], 0 offen offset:185 +; ALIGNED-NEXT: buffer_load_ubyte v118, v2, s[0:3], 0 offen offset:186 +; ALIGNED-NEXT: buffer_load_ubyte v121, v2, s[0:3], 0 offen offset:2 +; ALIGNED-NEXT: buffer_load_ubyte v107, v2, s[0:3], 0 offen offset:4 +; ALIGNED-NEXT: buffer_load_ubyte v109, v2, s[0:3], 0 offen offset:5 +; ALIGNED-NEXT: buffer_load_ubyte v120, v2, s[0:3], 0 offen offset:6 +; ALIGNED-NEXT: s_waitcnt vmcnt(62) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(56) ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(55) ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(54) ; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(53) -; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(52) -; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(51) -; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 
offset:492 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 -; ALIGNED-NEXT: s_waitcnt vmcnt(49) ; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(48) ; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(47) ; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(46) ; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(45) +; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v3 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v7, 8, v5 -; ALIGNED-NEXT: s_waitcnt vmcnt(42) -; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v4, v9, 8, v8 -; ALIGNED-NEXT: s_waitcnt vmcnt(40) ; ALIGNED-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v5, v10, 8, v6 ; ALIGNED-NEXT: v_lshl_or_b32 v6, v11, 8, v12 ; ALIGNED-NEXT: v_lshl_or_b32 v7, v15, 8, v14 ; ALIGNED-NEXT: v_lshl_or_b32 v8, v19, 8, v17 -; ALIGNED-NEXT: s_waitcnt vmcnt(39) ; ALIGNED-NEXT: v_lshl_or_b32 v9, v16, 8, v13 -; ALIGNED-NEXT: s_waitcnt vmcnt(37) ; ALIGNED-NEXT: v_lshl_or_b32 v10, v20, 8, v18 -; ALIGNED-NEXT: s_waitcnt vmcnt(35) ; ALIGNED-NEXT: v_lshl_or_b32 v11, v23, 8, v22 -; ALIGNED-NEXT: s_waitcnt vmcnt(33) ; ALIGNED-NEXT: v_lshl_or_b32 v12, v28, 8, v25 -; ALIGNED-NEXT: s_waitcnt vmcnt(31) ; ALIGNED-NEXT: v_lshl_or_b32 v13, v24, 8, v21 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: s_waitcnt vmcnt(29) ; ALIGNED-NEXT: v_lshl_or_b32 v14, v27, 8, v26 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 16, v3 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v6, 16, v5 @@ -3856,76 +3895,83 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: v_lshl_or_b32 v6, v12, 16, v11 ; ALIGNED-NEXT: v_lshl_or_b32 v7, v14, 16, v13 ; ALIGNED-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(27) ; ALIGNED-NEXT: v_lshl_or_b32 v15, v31, 8, v30 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(25) ; ALIGNED-NEXT: v_lshl_or_b32 v0, v34, 8, v33 ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(23) ; ALIGNED-NEXT: v_lshl_or_b32 v1, v37, 8, v32 ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(21) ; ALIGNED-NEXT: v_lshl_or_b32 v3, v36, 8, v35 ; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(16) ; ALIGNED-NEXT: v_lshl_or_b32 v4, v50, 8, v38 ; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(14) ; ALIGNED-NEXT: v_lshl_or_b32 v5, v49, 8, v39 ; ALIGNED-NEXT: buffer_store_dword v6, 
off, s[0:3], s32 offset:604 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v6, v51, 8, v48 ; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(10) ; ALIGNED-NEXT: v_lshl_or_b32 v7, v53, 8, v52 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v0, 16, v15 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 16, v1 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v5, 16, v4 +; ALIGNED-NEXT: s_clause 0x1 ; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:85 +; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:87 ; ALIGNED-NEXT: v_lshl_or_b32 v4, v7, 16, v6 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v55, 8, v29 ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(10) ; ALIGNED-NEXT: v_lshl_or_b32 v1, v67, 8, v66 ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(8) ; ALIGNED-NEXT: v_lshl_or_b32 v3, v64, 8, v54 ; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(6) ; ALIGNED-NEXT: v_lshl_or_b32 v4, v68, 8, v65 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:86 ; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:82 -; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:86 +; ALIGNED-NEXT: buffer_store_dword v65, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 16, v3 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(6) +; ALIGNED-NEXT: s_waitcnt vmcnt(62) ; ALIGNED-NEXT: v_lshl_or_b32 v0, v70, 8, v69 ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:83 ; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:74 +; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:83 ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(5) ; ALIGNED-NEXT: v_lshl_or_b32 v1, v80, 8, v71 -; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v71, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(62) +; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:73 -; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; ALIGNED-NEXT: 
buffer_store_dword v52, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:72 -; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v64, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v70, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v80, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x7 +; ALIGNED-NEXT: buffer_load_ubyte v80, v2, s[0:3], 0 offen offset:216 +; ALIGNED-NEXT: buffer_load_ubyte v67, v2, s[0:3], 0 offen offset:217 +; ALIGNED-NEXT: buffer_load_ubyte v53, v2, s[0:3], 0 offen offset:218 +; ALIGNED-NEXT: buffer_load_ubyte v54, v2, s[0:3], 0 offen offset:219 +; ALIGNED-NEXT: buffer_load_ubyte v70, v2, s[0:3], 0 offen offset:220 +; ALIGNED-NEXT: buffer_load_ubyte v55, v2, s[0:3], 0 offen offset:221 +; ALIGNED-NEXT: buffer_load_ubyte v64, v2, s[0:3], 0 offen offset:222 +; ALIGNED-NEXT: buffer_load_ubyte v52, v2, s[0:3], 0 offen offset:223 +; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x2 +; ALIGNED-NEXT: buffer_load_ubyte v50, v2, s[0:3], 0 offen offset:208 +; ALIGNED-NEXT: buffer_load_ubyte v39, v2, s[0:3], 0 offen offset:209 +; ALIGNED-NEXT: buffer_load_ubyte v48, v2, s[0:3], 0 offen offset:210 ; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill @@ -3934,52 +3980,82 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; ALIGNED-NEXT: 
buffer_store_dword v51, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x7 +; ALIGNED-NEXT: buffer_load_ubyte v38, v2, s[0:3], 0 offen offset:224 +; ALIGNED-NEXT: buffer_load_ubyte v36, v2, s[0:3], 0 offen offset:225 +; ALIGNED-NEXT: buffer_load_ubyte v33, v2, s[0:3], 0 offen offset:226 +; ALIGNED-NEXT: buffer_load_ubyte v32, v2, s[0:3], 0 offen offset:227 +; ALIGNED-NEXT: buffer_load_ubyte v37, v2, s[0:3], 0 offen offset:228 +; ALIGNED-NEXT: buffer_load_ubyte v35, v2, s[0:3], 0 offen offset:229 +; ALIGNED-NEXT: buffer_load_ubyte v34, v2, s[0:3], 0 offen offset:230 +; ALIGNED-NEXT: buffer_load_ubyte v31, v2, s[0:3], 0 offen offset:231 +; ALIGNED-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x4 +; ALIGNED-NEXT: buffer_load_ubyte v30, v2, s[0:3], 0 offen offset:236 +; ALIGNED-NEXT: buffer_load_ubyte v29, v2, s[0:3], 0 offen offset:237 +; ALIGNED-NEXT: buffer_load_ubyte v28, v2, s[0:3], 0 offen offset:238 +; ALIGNED-NEXT: buffer_load_ubyte v26, v2, s[0:3], 0 offen offset:239 +; ALIGNED-NEXT: buffer_load_ubyte v25, v2, s[0:3], 0 offen offset:235 +; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x2 +; ALIGNED-NEXT: buffer_load_ubyte v24, v2, s[0:3], 0 offen offset:232 +; ALIGNED-NEXT: buffer_load_ubyte v23, v2, s[0:3], 0 offen offset:233 +; ALIGNED-NEXT: buffer_load_ubyte v21, v2, s[0:3], 0 offen offset:234 +; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x7 +; ALIGNED-NEXT: buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:240 +; ALIGNED-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:241 +; ALIGNED-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:242 +; ALIGNED-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:243 +; ALIGNED-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:244 +; ALIGNED-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:245 +; ALIGNED-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:246 +; ALIGNED-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:247 ; ALIGNED-NEXT: buffer_store_dword v66, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill -; ALIGNED-NEXT: 
buffer_store_dword v54, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v64, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v65, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v70, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v71, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v80, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v126, off, s[0:3], s32 offset:1188 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:87 -; ALIGNED-NEXT: s_waitcnt vmcnt(7) +; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(42) ; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(6) +; ALIGNED-NEXT: s_waitcnt vmcnt(41) +; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(39) ; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(5) -; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(4) -; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: s_waitcnt vmcnt(38) ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: s_waitcnt vmcnt(36) ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: s_waitcnt vmcnt(35) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v81, 8, v3 ; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:84 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x4 +; ALIGNED-NEXT: buffer_load_ubyte v81, v2, s[0:3], 0 offen offset:212 +; ALIGNED-NEXT: buffer_load_ubyte v68, v2, s[0:3], 0 offen offset:213 +; ALIGNED-NEXT: buffer_load_ubyte v71, v2, s[0:3], 0 offen offset:214 +; ALIGNED-NEXT: buffer_load_ubyte v65, v2, s[0:3], 0 offen offset:215 +; ALIGNED-NEXT: buffer_load_ubyte v69, v2, s[0:3], 0 offen offset:211 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:81 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:80 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; 
ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: s_waitcnt vmcnt(8) +; ALIGNED-NEXT: v_lshl_or_b32 v73, v13, 8, v16 +; ALIGNED-NEXT: s_waitcnt vmcnt(7) ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill @@ -4251,260 +4327,133 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1072 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v7 +; ALIGNED-NEXT: s_clause 0x4 +; ALIGNED-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:252 +; ALIGNED-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:253 +; ALIGNED-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:254 +; ALIGNED-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:255 +; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:251 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v102, 8, v101 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v8, 8, v6 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1136 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v5, 8, v3 +; ALIGNED-NEXT: s_clause 0x3 ; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:158 +; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:248 +; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:249 +; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:250 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:157 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1148 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:156 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: s_waitcnt vmcnt(7) +; ALIGNED-NEXT: v_lshl_or_b32 v77, v9, 8, v10 +; ALIGNED-NEXT: s_waitcnt vmcnt(5) ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1160 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1152 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1144 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v127, v2, s[0:3], 0 offen offset:159 -; ALIGNED-NEXT: buffer_load_ubyte v124, v2, s[0:3], 0 offen offset:155 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) ; ALIGNED-NEXT: v_lshl_or_b32 v1, v127, 8, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v113, 8, v116 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1172 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x2 -; ALIGNED-NEXT: buffer_load_ubyte v125, v2, s[0:3], 0 offen offset:152 -; ALIGNED-NEXT: buffer_load_ubyte v123, v2, s[0:3], 0 offen offset:153 -; ALIGNED-NEXT: buffer_load_ubyte v111, v2, s[0:3], 0 offen offset:154 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v123, 8, v125 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: v_lshl_or_b32 v1, v124, 8, v111 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1180 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x7 -; 
ALIGNED-NEXT: buffer_load_ubyte v110, v2, s[0:3], 0 offen offset:160 -; ALIGNED-NEXT: buffer_load_ubyte v105, v2, s[0:3], 0 offen offset:161 -; ALIGNED-NEXT: buffer_load_ubyte v94, v2, s[0:3], 0 offen offset:162 -; ALIGNED-NEXT: buffer_load_ubyte v93, v2, s[0:3], 0 offen offset:163 -; ALIGNED-NEXT: buffer_load_ubyte v108, v2, s[0:3], 0 offen offset:164 -; ALIGNED-NEXT: buffer_load_ubyte v95, v2, s[0:3], 0 offen offset:165 -; ALIGNED-NEXT: buffer_load_ubyte v104, v2, s[0:3], 0 offen offset:166 -; ALIGNED-NEXT: buffer_load_ubyte v92, v2, s[0:3], 0 offen offset:167 -; ALIGNED-NEXT: s_waitcnt vmcnt(6) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v105, 8, v110 -; ALIGNED-NEXT: s_waitcnt vmcnt(4) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v93, 8, v94 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v92, 8, v104 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1192 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v95, 8, v108 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1196 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x4 -; ALIGNED-NEXT: buffer_load_ubyte v90, v2, s[0:3], 0 offen offset:172 -; ALIGNED-NEXT: buffer_load_ubyte v79, v2, s[0:3], 0 offen offset:173 -; ALIGNED-NEXT: buffer_load_ubyte v88, v2, s[0:3], 0 offen offset:174 -; ALIGNED-NEXT: buffer_load_ubyte v78, v2, s[0:3], 0 offen offset:175 -; ALIGNED-NEXT: buffer_load_ubyte v75, v2, s[0:3], 0 offen offset:171 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v79, 8, v90 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v78, 8, v88 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1200 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x2 -; ALIGNED-NEXT: buffer_load_ubyte v76, v2, s[0:3], 0 offen offset:168 -; ALIGNED-NEXT: buffer_load_ubyte v72, v2, s[0:3], 0 offen offset:169 -; ALIGNED-NEXT: buffer_load_ubyte v63, v2, s[0:3], 0 offen offset:170 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v72, 8, v76 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v75, 8, v63 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1204 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x7 -; ALIGNED-NEXT: buffer_load_ubyte v62, v2, s[0:3], 0 offen offset:176 -; ALIGNED-NEXT: buffer_load_ubyte v60, v2, s[0:3], 0 offen offset:177 -; ALIGNED-NEXT: buffer_load_ubyte v57, v2, s[0:3], 0 offen offset:178 -; ALIGNED-NEXT: buffer_load_ubyte v56, v2, s[0:3], 0 offen offset:179 -; ALIGNED-NEXT: buffer_load_ubyte v61, v2, s[0:3], 0 offen offset:180 -; ALIGNED-NEXT: buffer_load_ubyte v59, v2, s[0:3], 0 offen offset:181 -; ALIGNED-NEXT: buffer_load_ubyte v58, v2, s[0:3], 0 offen offset:182 -; ALIGNED-NEXT: buffer_load_ubyte v47, v2, s[0:3], 0 offen offset:183 -; ALIGNED-NEXT: s_waitcnt vmcnt(6) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v60, 8, v62 -; ALIGNED-NEXT: s_waitcnt vmcnt(4) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v56, 8, v57 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v47, 8, v58 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1208 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v59, 8, v61 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1212 ; 4-byte Folded Spill -; 
ALIGNED-NEXT: s_clause 0x4 -; ALIGNED-NEXT: buffer_load_ubyte v45, v2, s[0:3], 0 offen offset:188 -; ALIGNED-NEXT: buffer_load_ubyte v43, v2, s[0:3], 0 offen offset:189 -; ALIGNED-NEXT: buffer_load_ubyte v44, v2, s[0:3], 0 offen offset:190 -; ALIGNED-NEXT: buffer_load_ubyte v42, v2, s[0:3], 0 offen offset:191 -; ALIGNED-NEXT: buffer_load_ubyte v40, v2, s[0:3], 0 offen offset:187 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v43, 8, v45 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v42, 8, v44 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1216 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x2 -; ALIGNED-NEXT: buffer_load_ubyte v41, v2, s[0:3], 0 offen offset:184 -; ALIGNED-NEXT: buffer_load_ubyte v119, v2, s[0:3], 0 offen offset:185 -; ALIGNED-NEXT: buffer_load_ubyte v118, v2, s[0:3], 0 offen offset:186 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v119, 8, v41 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v40, 8, v118 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1220 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x3e -; ALIGNED-NEXT: buffer_load_ubyte v116, v2, s[0:3], 0 offen offset:192 -; ALIGNED-NEXT: buffer_load_ubyte v113, v2, s[0:3], 0 offen offset:193 -; ALIGNED-NEXT: buffer_load_ubyte v101, v2, s[0:3], 0 offen offset:194 -; ALIGNED-NEXT: buffer_load_ubyte v102, v2, s[0:3], 0 offen offset:195 -; ALIGNED-NEXT: buffer_load_ubyte v114, v2, s[0:3], 0 offen offset:196 -; ALIGNED-NEXT: buffer_load_ubyte v103, v2, s[0:3], 0 offen offset:197 -; ALIGNED-NEXT: buffer_load_ubyte v112, v2, s[0:3], 0 offen offset:198 -; ALIGNED-NEXT: buffer_load_ubyte v100, v2, s[0:3], 0 offen offset:199 -; ALIGNED-NEXT: buffer_load_ubyte v98, v2, s[0:3], 0 offen offset:204 -; ALIGNED-NEXT: buffer_load_ubyte v97, v2, s[0:3], 0 offen offset:205 -; ALIGNED-NEXT: buffer_load_ubyte v96, v2, s[0:3], 0 offen offset:206 -; ALIGNED-NEXT: buffer_load_ubyte v87, v2, s[0:3], 0 offen offset:207 -; ALIGNED-NEXT: buffer_load_ubyte v85, v2, s[0:3], 0 offen offset:203 -; ALIGNED-NEXT: buffer_load_ubyte v86, v2, s[0:3], 0 offen offset:200 -; ALIGNED-NEXT: buffer_load_ubyte v84, v2, s[0:3], 0 offen offset:201 -; ALIGNED-NEXT: buffer_load_ubyte v83, v2, s[0:3], 0 offen offset:202 -; ALIGNED-NEXT: buffer_load_ubyte v81, v2, s[0:3], 0 offen offset:212 -; ALIGNED-NEXT: buffer_load_ubyte v68, v2, s[0:3], 0 offen offset:213 -; ALIGNED-NEXT: buffer_load_ubyte v71, v2, s[0:3], 0 offen offset:214 -; ALIGNED-NEXT: buffer_load_ubyte v65, v2, s[0:3], 0 offen offset:215 -; ALIGNED-NEXT: buffer_load_ubyte v69, v2, s[0:3], 0 offen offset:211 -; ALIGNED-NEXT: buffer_load_ubyte v80, v2, s[0:3], 0 offen offset:216 -; ALIGNED-NEXT: buffer_load_ubyte v67, v2, s[0:3], 0 offen offset:217 -; ALIGNED-NEXT: buffer_load_ubyte v53, v2, s[0:3], 0 offen offset:218 -; ALIGNED-NEXT: buffer_load_ubyte v54, v2, s[0:3], 0 offen offset:219 -; ALIGNED-NEXT: buffer_load_ubyte v70, v2, s[0:3], 0 offen offset:220 -; ALIGNED-NEXT: buffer_load_ubyte v55, v2, s[0:3], 0 offen offset:221 -; ALIGNED-NEXT: buffer_load_ubyte v64, v2, s[0:3], 0 offen offset:222 -; ALIGNED-NEXT: buffer_load_ubyte v52, v2, s[0:3], 0 offen offset:223 -; ALIGNED-NEXT: buffer_load_ubyte v50, v2, s[0:3], 0 offen offset:208 -; ALIGNED-NEXT: buffer_load_ubyte v39, v2, s[0:3], 0 offen offset:209 -; ALIGNED-NEXT: buffer_load_ubyte v48, v2, s[0:3], 0 offen offset:210 -; 
ALIGNED-NEXT: buffer_load_ubyte v38, v2, s[0:3], 0 offen offset:224 -; ALIGNED-NEXT: buffer_load_ubyte v36, v2, s[0:3], 0 offen offset:225 -; ALIGNED-NEXT: buffer_load_ubyte v33, v2, s[0:3], 0 offen offset:226 -; ALIGNED-NEXT: buffer_load_ubyte v32, v2, s[0:3], 0 offen offset:227 -; ALIGNED-NEXT: buffer_load_ubyte v37, v2, s[0:3], 0 offen offset:228 -; ALIGNED-NEXT: buffer_load_ubyte v35, v2, s[0:3], 0 offen offset:229 -; ALIGNED-NEXT: buffer_load_ubyte v34, v2, s[0:3], 0 offen offset:230 -; ALIGNED-NEXT: buffer_load_ubyte v31, v2, s[0:3], 0 offen offset:231 -; ALIGNED-NEXT: buffer_load_ubyte v30, v2, s[0:3], 0 offen offset:236 -; ALIGNED-NEXT: buffer_load_ubyte v29, v2, s[0:3], 0 offen offset:237 -; ALIGNED-NEXT: buffer_load_ubyte v28, v2, s[0:3], 0 offen offset:238 -; ALIGNED-NEXT: buffer_load_ubyte v26, v2, s[0:3], 0 offen offset:239 -; ALIGNED-NEXT: buffer_load_ubyte v25, v2, s[0:3], 0 offen offset:235 -; ALIGNED-NEXT: buffer_load_ubyte v24, v2, s[0:3], 0 offen offset:232 -; ALIGNED-NEXT: buffer_load_ubyte v23, v2, s[0:3], 0 offen offset:233 -; ALIGNED-NEXT: buffer_load_ubyte v21, v2, s[0:3], 0 offen offset:234 -; ALIGNED-NEXT: buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:240 -; ALIGNED-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:241 -; ALIGNED-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:242 -; ALIGNED-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:243 -; ALIGNED-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:244 -; ALIGNED-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:245 -; ALIGNED-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:246 -; ALIGNED-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:247 -; ALIGNED-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:252 -; ALIGNED-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:253 -; ALIGNED-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:254 -; ALIGNED-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:255 -; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:251 -; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:248 -; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:249 -; ALIGNED-NEXT: s_clause 0x6 -; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:250 -; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen -; ALIGNED-NEXT: buffer_load_ubyte v121, v2, s[0:3], 0 offen offset:2 -; ALIGNED-NEXT: buffer_load_ubyte v107, v2, s[0:3], 0 offen offset:4 -; ALIGNED-NEXT: buffer_load_ubyte v109, v2, s[0:3], 0 offen offset:5 -; ALIGNED-NEXT: buffer_load_ubyte v120, v2, s[0:3], 0 offen offset:6 -; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:7 -; ALIGNED-NEXT: s_waitcnt vmcnt(62) -; ALIGNED-NEXT: v_lshl_or_b32 v3, v113, 8, v116 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v102, 8, v101 ; ALIGNED-NEXT: v_lshl_or_b32 v106, v4, 16, v3 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v103, 8, v114 ; ALIGNED-NEXT: v_lshl_or_b32 v4, v100, 8, v112 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1172 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v123, 8, v125 ; ALIGNED-NEXT: v_lshl_or_b32 v91, v4, 16, v3 -; ALIGNED-NEXT: s_waitcnt vmcnt(60) ; ALIGNED-NEXT: v_lshl_or_b32 v3, v97, 8, v98 -; ALIGNED-NEXT: s_waitcnt vmcnt(58) +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v93, 8, v94 ; ALIGNED-NEXT: v_lshl_or_b32 v4, v87, 8, v96 -; ALIGNED-NEXT: s_waitcnt vmcnt(14) -; ALIGNED-NEXT: v_lshl_or_b32 v73, v13, 8, v16 -; ALIGNED-NEXT: s_waitcnt vmcnt(10) -; 
ALIGNED-NEXT: v_lshl_or_b32 v77, v9, 8, v10 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: buffer_store_dword v107, off, s[0:3], s32 offset:1088 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1180 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v105, 8, v110 ; ALIGNED-NEXT: v_lshl_or_b32 v89, v4, 16, v3 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v84, 8, v86 ; ALIGNED-NEXT: v_lshl_or_b32 v4, v85, 8, v83 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1112 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v109, off, s[0:3], s32 offset:1096 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v120, off, s[0:3], s32 offset:1132 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v92, 8, v104 ; ALIGNED-NEXT: v_lshl_or_b32 v74, v4, 16, v3 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v68, 8, v81 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1192 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v95, 8, v108 ; ALIGNED-NEXT: v_lshl_or_b32 v4, v65, 8, v71 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v78, 8, v88 ; ALIGNED-NEXT: v_lshl_or_b32 v46, v4, 16, v3 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v67, 8, v80 ; ALIGNED-NEXT: v_lshl_or_b32 v4, v54, 8, v53 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1196 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v79, 8, v90 ; ALIGNED-NEXT: v_lshl_or_b32 v117, v4, 16, v3 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v55, 8, v70 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v75, 8, v63 ; ALIGNED-NEXT: v_lshl_or_b32 v4, v52, 8, v64 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1200 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v72, 8, v76 ; ALIGNED-NEXT: v_lshl_or_b32 v115, v4, 16, v3 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v39, 8, v50 ; ALIGNED-NEXT: v_lshl_or_b32 v4, v69, 8, v48 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v56, 8, v57 ; ALIGNED-NEXT: v_lshl_or_b32 v99, v4, 16, v3 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v36, 8, v38 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1204 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v60, 8, v62 ; ALIGNED-NEXT: v_lshl_or_b32 v4, v32, 8, v33 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: v_lshl_or_b32 v82, v4, 16, v3 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v35, 8, v37 ; ALIGNED-NEXT: v_lshl_or_b32 v4, v31, 8, v34 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v47, 8, v58 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1208 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v59, 8, v61 ; ALIGNED-NEXT: v_lshl_or_b32 v66, v4, 16, v3 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v29, 8, v30 ; ALIGNED-NEXT: v_lshl_or_b32 v4, v26, 8, v28 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v42, 8, v44 ; ALIGNED-NEXT: v_lshl_or_b32 v51, v4, 16, v3 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v23, 8, v24 ; ALIGNED-NEXT: v_lshl_or_b32 v4, v25, 8, v21 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1212 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v43, 8, v45 ; ALIGNED-NEXT: v_lshl_or_b32 v49, v4, 16, v3 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v18, 8, v20 ; ALIGNED-NEXT: v_lshl_or_b32 v4, v14, 8, v15 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v40, 8, v118 ; ALIGNED-NEXT: v_lshl_or_b32 v27, v4, 16, v3 ; ALIGNED-NEXT: 
v_lshl_or_b32 v4, v17, 8, v19 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1216 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v119, 8, v41 ; ALIGNED-NEXT: v_lshl_or_b32 v22, v73, 16, v4 ; ALIGNED-NEXT: v_lshl_or_b32 v73, v11, 8, v12 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:7 +; ALIGNED-NEXT: buffer_store_dword v107, off, s[0:3], s32 offset:1088 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v109, off, s[0:3], s32 offset:1096 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v4, v77, 16, v73 ; ALIGNED-NEXT: v_lshl_or_b32 v73, v6, 8, v8 ; ALIGNED-NEXT: v_lshl_or_b32 v77, v7, 8, v5 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1220 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen +; ALIGNED-NEXT: buffer_store_dword v120, off, s[0:3], s32 offset:1132 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v3, v77, 16, v73 ; ALIGNED-NEXT: s_clause 0x1 ; ALIGNED-NEXT: buffer_load_ubyte v73, v2, s[0:3], 0 offen offset:1 ; ALIGNED-NEXT: buffer_load_ubyte v77, v2, s[0:3], 0 offen offset:3 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1068 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v121, off, s[0:3], s32 offset:1092 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1112 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1068 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(1) ; ALIGNED-NEXT: buffer_store_dword v73, off, s[0:3], s32 offset:1076 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v77, off, s[0:3], s32 offset:1080 ; 4-byte Folded Spill @@ -4513,37 +4462,34 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: v_lshl_or_b32 v0, v77, 16, v73 ; ALIGNED-NEXT: v_lshl_or_b32 v73, v109, 8, v107 ; ALIGNED-NEXT: v_lshl_or_b32 v77, v1, 8, v120 +; ALIGNED-NEXT: s_clause 0x2 ; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:12 +; ALIGNED-NEXT: buffer_load_ubyte v107, v2, s[0:3], 0 offen offset:15 +; ALIGNED-NEXT: buffer_load_ubyte v121, v2, s[0:3], 0 offen offset:11 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1128 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x2 +; ALIGNED-NEXT: buffer_load_ubyte v122, v2, s[0:3], 0 offen offset:8 +; ALIGNED-NEXT: buffer_load_ubyte v120, v2, s[0:3], 0 offen offset:9 +; ALIGNED-NEXT: buffer_load_ubyte v109, v2, s[0:3], 0 offen offset:10 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v77, 16, v73 ; ALIGNED-NEXT: buffer_load_ubyte v73, v2, s[0:3], 0 offen offset:13 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1140 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:14 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: buffer_store_dword v126, off, s[0:3], s32 offset:1188 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(7) ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1156 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) ; ALIGNED-NEXT: buffer_store_dword v73, off, s[0:3], s32 offset:1168 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v73, v73, 8, v1 +; ALIGNED-NEXT: v_mov_b32_e32 v1, v107 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1164 ; 4-byte 
Folded Spill -; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v107, v2, s[0:3], 0 offen offset:15 -; ALIGNED-NEXT: buffer_load_ubyte v121, v2, s[0:3], 0 offen offset:11 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) ; ALIGNED-NEXT: v_lshl_or_b32 v77, v107, 8, v0 -; ALIGNED-NEXT: v_mov_b32_e32 v1, v107 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1164 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v77, 16, v73 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1176 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x2 -; ALIGNED-NEXT: buffer_load_ubyte v122, v2, s[0:3], 0 offen offset:8 -; ALIGNED-NEXT: buffer_load_ubyte v120, v2, s[0:3], 0 offen offset:9 -; ALIGNED-NEXT: buffer_load_ubyte v109, v2, s[0:3], 0 offen offset:10 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) ; ALIGNED-NEXT: v_lshl_or_b32 v73, v120, 8, v122 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: v_lshl_or_b32 v77, v121, 8, v109 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1176 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v77, 16, v73 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1184 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x2 ; ALIGNED-NEXT: buffer_load_ubyte v77, v2, s[0:3], 0 offen offset:18 ; ALIGNED-NEXT: buffer_load_ubyte v107, v2, s[0:3], 0 offen offset:16 @@ -4553,6 +4499,7 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:228 ; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:224 ; ALIGNED-NEXT: v_add_nc_u32_e32 v2, 0x100, v2 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1184 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(2) ; ALIGNED-NEXT: v_lshl_or_b32 v0, v126, 8, v77 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) @@ -4625,6 +4572,7 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: buffer_store_dword v89, off, s[0:3], s32 offset:220 ; ALIGNED-NEXT: buffer_store_dword v91, off, s[0:3], s32 offset:212 ; ALIGNED-NEXT: buffer_store_dword v106, off, s[0:3], s32 offset:208 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1220 ; 4-byte Folded Reload ; ALIGNED-NEXT: flat_store_byte v[3:4], v83 offset:202 ; ALIGNED-NEXT: flat_store_byte v[3:4], v85 offset:203 ; ALIGNED-NEXT: flat_store_byte v[3:4], v84 offset:201 @@ -4641,21 +4589,6 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: flat_store_byte v[3:4], v112 offset:198 ; ALIGNED-NEXT: flat_store_byte v[3:4], v114 offset:196 ; ALIGNED-NEXT: flat_store_byte v[3:4], v116 offset:192 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1220 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_addc_u32 s5, s5, 0 -; ALIGNED-NEXT: v_cmp_gt_u64_e64 s6, 0x800, s[4:5] -; ALIGNED-NEXT: s_and_b32 vcc_lo, exec_lo, s6 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1216 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1212 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1208 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: 
buffer_store_dword v0, off, s[0:3], s32 offset:288 ; ALIGNED-NEXT: flat_store_byte v[3:4], v118 offset:186 ; ALIGNED-NEXT: flat_store_byte v[3:4], v40 offset:187 ; ALIGNED-NEXT: flat_store_byte v[3:4], v119 offset:185 @@ -4672,18 +4605,6 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: flat_store_byte v[3:4], v58 offset:182 ; ALIGNED-NEXT: flat_store_byte v[3:4], v61 offset:180 ; ALIGNED-NEXT: flat_store_byte v[3:4], v62 offset:176 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1204 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1200 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1196 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1192 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; ALIGNED-NEXT: flat_store_byte v[3:4], v63 offset:170 ; ALIGNED-NEXT: flat_store_byte v[3:4], v75 offset:171 ; ALIGNED-NEXT: flat_store_byte v[3:4], v72 offset:169 @@ -4700,6 +4621,36 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: flat_store_byte v[3:4], v104 offset:166 ; ALIGNED-NEXT: flat_store_byte v[3:4], v108 offset:164 ; ALIGNED-NEXT: flat_store_byte v[3:4], v110 offset:160 +; ALIGNED-NEXT: flat_store_byte v[3:4], v111 offset:154 +; ALIGNED-NEXT: flat_store_byte v[3:4], v124 offset:155 +; ALIGNED-NEXT: flat_store_byte v[3:4], v123 offset:153 +; ALIGNED-NEXT: flat_store_byte v[3:4], v127 offset:159 +; ALIGNED-NEXT: s_addc_u32 s5, s5, 0 +; ALIGNED-NEXT: v_cmp_gt_u64_e64 s6, 0x800, s[4:5] +; ALIGNED-NEXT: s_and_b32 vcc_lo, exec_lo, s6 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1216 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1212 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1208 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1204 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1200 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1196 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1192 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; ALIGNED-NEXT: 
buffer_load_dword v0, off, s[0:3], s32 offset:1180 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 @@ -4712,10 +4663,6 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1136 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 -; ALIGNED-NEXT: flat_store_byte v[3:4], v111 offset:154 -; ALIGNED-NEXT: flat_store_byte v[3:4], v124 offset:155 -; ALIGNED-NEXT: flat_store_byte v[3:4], v123 offset:153 -; ALIGNED-NEXT: flat_store_byte v[3:4], v127 offset:159 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1152 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:157 @@ -5181,6 +5128,8 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:388 ; ALIGNED-NEXT: buffer_store_dword v126, off, s[0:3], s32 offset:384 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[3:4], v109 offset:10 +; ALIGNED-NEXT: flat_store_byte v[3:4], v121 offset:11 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:26 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload @@ -5234,8 +5183,6 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1128 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:400 -; ALIGNED-NEXT: flat_store_byte v[3:4], v109 offset:10 -; ALIGNED-NEXT: flat_store_byte v[3:4], v121 offset:11 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1168 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:13 @@ -5274,7 +5221,7 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: flat_store_byte v[3:4], v0 ; ALIGNED-NEXT: s_cbranch_vccnz .LBB4_1 ; ALIGNED-NEXT: ; %bb.2: ; %memcpy-split -; ALIGNED-NEXT: s_clause 0x2f +; ALIGNED-NEXT: s_clause 0x2f ; 192-byte Folded Reload ; ALIGNED-NEXT: buffer_load_dword v127, off, s[0:3], s32 ; ALIGNED-NEXT: buffer_load_dword v126, off, s[0:3], s32 offset:4 ; ALIGNED-NEXT: buffer_load_dword v125, off, s[0:3], s32 offset:8 @@ -6797,7 +6744,7 @@ define void @memmove_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0 ; ALIGNED-NEXT: s_cbranch_scc0 .LBB5_5 ; ALIGNED-NEXT: .LBB5_6: ; %Flow6 ; ALIGNED-NEXT: s_or_b32 exec_lo, exec_lo, s8 -; ALIGNED-NEXT: s_clause 0x7 +; ALIGNED-NEXT: s_clause 0x7 ; 32-byte Folded Reload ; ALIGNED-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; ALIGNED-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; ALIGNED-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 @@ -8296,7 +8243,7 @@ define void @memmove_p1_p1_sz2048(ptr addrspace(1) align 1 %dst, ptr addrspace(1 ; ALIGNED-NEXT: s_cbranch_scc0 .LBB6_5 ; ALIGNED-NEXT: .LBB6_6: ; %Flow8 ; ALIGNED-NEXT: s_or_b32 exec_lo, exec_lo, s8 -; ALIGNED-NEXT: s_clause 0x7 +; ALIGNED-NEXT: s_clause 0x7 ; 32-byte Folded Reload ; ALIGNED-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; ALIGNED-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; ALIGNED-NEXT: buffer_load_dword v45, off, s[0:3], 
s32 offset:8 @@ -8848,14 +8795,6 @@ define void @memmove_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4 ; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:60 ; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:52 ; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:48 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v30 offset:106 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v31 offset:110 -; ALIGNED-NEXT: flat_store_byte v[96:97], v31 offset:108 -; ALIGNED-NEXT: flat_store_byte v[96:97], v30 offset:104 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v29 offset:102 -; ALIGNED-NEXT: flat_store_byte v[96:97], v29 offset:100 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v28 offset:98 -; ALIGNED-NEXT: flat_store_byte v[96:97], v28 offset:96 ; ALIGNED-NEXT: flat_store_byte v[96:97], v70 offset:111 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v6 @@ -8871,6 +8810,14 @@ define void @memmove_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v36, 24, v23 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v19 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v15 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v30 offset:106 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v31 offset:110 +; ALIGNED-NEXT: flat_store_byte v[96:97], v31 offset:108 +; ALIGNED-NEXT: flat_store_byte v[96:97], v30 offset:104 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v29 offset:102 +; ALIGNED-NEXT: flat_store_byte v[96:97], v29 offset:100 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v28 offset:98 +; ALIGNED-NEXT: flat_store_byte v[96:97], v28 offset:96 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v28, 24, v11 ; ALIGNED-NEXT: flat_store_byte v[96:97], v98 offset:103 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v7 @@ -9297,6 +9244,10 @@ define void @memmove_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4 ; ALIGNED-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:476 ; ALIGNED-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:468 ; ALIGNED-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:464 +; ALIGNED-NEXT: flat_store_byte v[96:97], v82 offset:143 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v18 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 8, v26 ; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v38 offset:138 ; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v39 offset:142 ; ALIGNED-NEXT: flat_store_byte v[96:97], v39 offset:140 @@ -9305,10 +9256,6 @@ define void @memmove_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4 ; ALIGNED-NEXT: flat_store_byte v[96:97], v37 offset:132 ; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v36 offset:130 ; ALIGNED-NEXT: flat_store_byte v[96:97], v36 offset:128 -; ALIGNED-NEXT: flat_store_byte v[96:97], v82 offset:143 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v18 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 8, v26 ; ALIGNED-NEXT: flat_store_byte v[96:97], v66 offset:139 ; ALIGNED-NEXT: flat_store_byte v[96:97], v67 offset:137 ; ALIGNED-NEXT: flat_store_byte v[96:97], v83 offset:141 @@ -9344,14 +9291,6 @@ define void @memmove_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4 ; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:316 ; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:308 ; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:304 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v30 offset:106 -; ALIGNED-NEXT: 
flat_store_byte_d16_hi v[96:97], v31 offset:110 -; ALIGNED-NEXT: flat_store_byte v[96:97], v31 offset:108 -; ALIGNED-NEXT: flat_store_byte v[96:97], v30 offset:104 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v29 offset:102 -; ALIGNED-NEXT: flat_store_byte v[96:97], v29 offset:100 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v28 offset:98 -; ALIGNED-NEXT: flat_store_byte v[96:97], v28 offset:96 ; ALIGNED-NEXT: flat_store_byte v[96:97], v69 offset:111 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 24, v6 @@ -9366,6 +9305,14 @@ define void @memmove_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v25 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 24, v19 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v15 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v30 offset:106 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v31 offset:110 +; ALIGNED-NEXT: flat_store_byte v[96:97], v31 offset:108 +; ALIGNED-NEXT: flat_store_byte v[96:97], v30 offset:104 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v29 offset:102 +; ALIGNED-NEXT: flat_store_byte v[96:97], v29 offset:100 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v28 offset:98 +; ALIGNED-NEXT: flat_store_byte v[96:97], v28 offset:96 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v28, 24, v11 ; ALIGNED-NEXT: flat_store_byte v[96:97], v71 offset:103 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 24, v7 @@ -12198,7 +12145,7 @@ define void @memmove_p5_p5_sz2048(ptr addrspace(5) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: s_cbranch_scc0 .LBB8_5 ; ALIGNED-NEXT: .LBB8_6: ; %Flow19 ; ALIGNED-NEXT: s_or_b32 exec_lo, exec_lo, s6 -; ALIGNED-NEXT: s_clause 0x2f +; ALIGNED-NEXT: s_clause 0x2f ; 192-byte Folded Reload ; ALIGNED-NEXT: buffer_load_dword v127, off, s[0:3], s32 ; ALIGNED-NEXT: buffer_load_dword v126, off, s[0:3], s32 offset:4 ; ALIGNED-NEXT: buffer_load_dword v125, off, s[0:3], s32 offset:8 @@ -12645,6 +12592,11 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-LABEL: memmove_p0_p5_sz2048: ; ALIGNED: ; %bb.0: ; %entry ; ALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; ALIGNED-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_mov_b64 s[4:5], 0 +; ALIGNED-NEXT: s_mov_b32 s6, exec_lo ; ALIGNED-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill @@ -12693,34 +12645,29 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_store_dword v125, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v126, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v127, off, s[0:3], s32 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_mov_b64 s[4:5], 0 -; ALIGNED-NEXT: s_mov_b32 s6, exec_lo ; ALIGNED-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc_lo ; ALIGNED-NEXT: v_cmpx_ge_u32_e64 v2, v0 ; ALIGNED-NEXT: s_xor_b32 s6, exec_lo, s6 ; 
ALIGNED-NEXT: s_cbranch_execz .LBB9_2 ; ALIGNED-NEXT: .LBB9_1: ; %memmove_fwd_loop ; ALIGNED-NEXT: ; =>This Inner Loop Header: Depth=1 -; ALIGNED-NEXT: s_clause 0x39 +; ALIGNED-NEXT: s_clause 0x3e ; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:20 +; ALIGNED-NEXT: buffer_load_ubyte v127, v2, s[0:3], 0 offen offset:19 ; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:21 ; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:22 ; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:23 +; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:28 ; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:24 ; ALIGNED-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:25 ; ALIGNED-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:26 -; ALIGNED-NEXT: buffer_load_ubyte v127, v2, s[0:3], 0 offen offset:19 -; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:28 +; ALIGNED-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:27 ; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:29 ; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:30 ; ALIGNED-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:31 ; ALIGNED-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:32 ; ALIGNED-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:33 ; ALIGNED-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:34 -; ALIGNED-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:27 ; ALIGNED-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:35 ; ALIGNED-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:36 ; ALIGNED-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:37 @@ -12742,17 +12689,17 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_load_ubyte v36, v2, s[0:3], 0 offen offset:53 ; ALIGNED-NEXT: buffer_load_ubyte v33, v2, s[0:3], 0 offen offset:54 ; ALIGNED-NEXT: buffer_load_ubyte v35, v2, s[0:3], 0 offen offset:55 +; ALIGNED-NEXT: buffer_load_ubyte v37, v2, s[0:3], 0 offen offset:60 ; ALIGNED-NEXT: buffer_load_ubyte v39, v2, s[0:3], 0 offen offset:56 ; ALIGNED-NEXT: buffer_load_ubyte v50, v2, s[0:3], 0 offen offset:57 ; ALIGNED-NEXT: buffer_load_ubyte v52, v2, s[0:3], 0 offen offset:58 -; ALIGNED-NEXT: buffer_load_ubyte v37, v2, s[0:3], 0 offen offset:60 +; ALIGNED-NEXT: buffer_load_ubyte v51, v2, s[0:3], 0 offen offset:59 ; ALIGNED-NEXT: buffer_load_ubyte v48, v2, s[0:3], 0 offen offset:61 ; ALIGNED-NEXT: buffer_load_ubyte v38, v2, s[0:3], 0 offen offset:62 ; ALIGNED-NEXT: buffer_load_ubyte v49, v2, s[0:3], 0 offen offset:63 ; ALIGNED-NEXT: buffer_load_ubyte v53, v2, s[0:3], 0 offen offset:64 ; ALIGNED-NEXT: buffer_load_ubyte v54, v2, s[0:3], 0 offen offset:65 ; ALIGNED-NEXT: buffer_load_ubyte v65, v2, s[0:3], 0 offen offset:66 -; ALIGNED-NEXT: buffer_load_ubyte v51, v2, s[0:3], 0 offen offset:59 ; ALIGNED-NEXT: buffer_load_ubyte v55, v2, s[0:3], 0 offen offset:67 ; ALIGNED-NEXT: buffer_load_ubyte v64, v2, s[0:3], 0 offen offset:68 ; ALIGNED-NEXT: buffer_load_ubyte v66, v2, s[0:3], 0 offen offset:69 @@ -12760,58 +12707,94 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_load_ubyte v68, v2, s[0:3], 0 offen offset:71 ; ALIGNED-NEXT: buffer_load_ubyte v69, v2, s[0:3], 0 offen offset:76 ; ALIGNED-NEXT: buffer_load_ubyte v70, v2, s[0:3], 0 offen offset:77 -; ALIGNED-NEXT: buffer_load_ubyte v81, v2, s[0:3], 0 offen offset:75 ; ALIGNED-NEXT: buffer_load_ubyte 
v71, v2, s[0:3], 0 offen offset:78 ; ALIGNED-NEXT: buffer_load_ubyte v80, v2, s[0:3], 0 offen offset:79 -; ALIGNED-NEXT: s_waitcnt vmcnt(57) +; ALIGNED-NEXT: buffer_load_ubyte v81, v2, s[0:3], 0 offen offset:75 +; ALIGNED-NEXT: buffer_load_ubyte v126, v2, s[0:3], 0 offen offset:159 +; ALIGNED-NEXT: buffer_load_ubyte v124, v2, s[0:3], 0 offen offset:155 +; ALIGNED-NEXT: buffer_load_ubyte v123, v2, s[0:3], 0 offen offset:152 +; ALIGNED-NEXT: buffer_load_ubyte v122, v2, s[0:3], 0 offen offset:153 +; ALIGNED-NEXT: buffer_load_ubyte v121, v2, s[0:3], 0 offen offset:154 +; ALIGNED-NEXT: s_clause 0x30 +; ALIGNED-NEXT: buffer_load_ubyte v120, v2, s[0:3], 0 offen offset:160 +; ALIGNED-NEXT: buffer_load_ubyte v108, v2, s[0:3], 0 offen offset:161 +; ALIGNED-NEXT: buffer_load_ubyte v104, v2, s[0:3], 0 offen offset:162 +; ALIGNED-NEXT: buffer_load_ubyte v105, v2, s[0:3], 0 offen offset:163 +; ALIGNED-NEXT: buffer_load_ubyte v111, v2, s[0:3], 0 offen offset:164 +; ALIGNED-NEXT: buffer_load_ubyte v106, v2, s[0:3], 0 offen offset:165 +; ALIGNED-NEXT: buffer_load_ubyte v107, v2, s[0:3], 0 offen offset:166 +; ALIGNED-NEXT: buffer_load_ubyte v94, v2, s[0:3], 0 offen offset:167 +; ALIGNED-NEXT: buffer_load_ubyte v40, v2, s[0:3], 0 offen offset:192 +; ALIGNED-NEXT: buffer_load_ubyte v117, v2, s[0:3], 0 offen offset:193 +; ALIGNED-NEXT: buffer_load_ubyte v113, v2, s[0:3], 0 offen offset:194 +; ALIGNED-NEXT: buffer_load_ubyte v114, v2, s[0:3], 0 offen offset:195 +; ALIGNED-NEXT: buffer_load_ubyte v118, v2, s[0:3], 0 offen offset:196 +; ALIGNED-NEXT: buffer_load_ubyte v115, v2, s[0:3], 0 offen offset:197 +; ALIGNED-NEXT: buffer_load_ubyte v116, v2, s[0:3], 0 offen offset:198 +; ALIGNED-NEXT: buffer_load_ubyte v112, v2, s[0:3], 0 offen offset:199 +; ALIGNED-NEXT: buffer_load_ubyte v102, v2, s[0:3], 0 offen offset:204 +; ALIGNED-NEXT: buffer_load_ubyte v101, v2, s[0:3], 0 offen offset:205 +; ALIGNED-NEXT: buffer_load_ubyte v100, v2, s[0:3], 0 offen offset:206 +; ALIGNED-NEXT: buffer_load_ubyte v99, v2, s[0:3], 0 offen offset:207 +; ALIGNED-NEXT: buffer_load_ubyte v97, v2, s[0:3], 0 offen offset:203 +; ALIGNED-NEXT: buffer_load_ubyte v98, v2, s[0:3], 0 offen offset:200 +; ALIGNED-NEXT: buffer_load_ubyte v96, v2, s[0:3], 0 offen offset:201 +; ALIGNED-NEXT: buffer_load_ubyte v87, v2, s[0:3], 0 offen offset:202 +; ALIGNED-NEXT: buffer_load_ubyte v92, v2, s[0:3], 0 offen offset:172 +; ALIGNED-NEXT: buffer_load_ubyte v89, v2, s[0:3], 0 offen offset:173 +; ALIGNED-NEXT: buffer_load_ubyte v90, v2, s[0:3], 0 offen offset:174 +; ALIGNED-NEXT: buffer_load_ubyte v88, v2, s[0:3], 0 offen offset:175 +; ALIGNED-NEXT: buffer_load_ubyte v78, v2, s[0:3], 0 offen offset:171 +; ALIGNED-NEXT: buffer_load_ubyte v79, v2, s[0:3], 0 offen offset:168 +; ALIGNED-NEXT: buffer_load_ubyte v76, v2, s[0:3], 0 offen offset:169 +; ALIGNED-NEXT: buffer_load_ubyte v75, v2, s[0:3], 0 offen offset:170 +; ALIGNED-NEXT: buffer_load_ubyte v74, v2, s[0:3], 0 offen offset:176 +; ALIGNED-NEXT: buffer_load_ubyte v72, v2, s[0:3], 0 offen offset:177 +; ALIGNED-NEXT: buffer_load_ubyte v61, v2, s[0:3], 0 offen offset:178 +; ALIGNED-NEXT: buffer_load_ubyte v60, v2, s[0:3], 0 offen offset:179 +; ALIGNED-NEXT: buffer_load_ubyte v73, v2, s[0:3], 0 offen offset:180 +; ALIGNED-NEXT: buffer_load_ubyte v63, v2, s[0:3], 0 offen offset:181 +; ALIGNED-NEXT: buffer_load_ubyte v62, v2, s[0:3], 0 offen offset:182 +; ALIGNED-NEXT: buffer_load_ubyte v59, v2, s[0:3], 0 offen offset:183 +; ALIGNED-NEXT: buffer_load_ubyte v57, v2, s[0:3], 0 offen offset:188 +; ALIGNED-NEXT: 
buffer_load_ubyte v47, v2, s[0:3], 0 offen offset:189 +; ALIGNED-NEXT: buffer_load_ubyte v56, v2, s[0:3], 0 offen offset:190 +; ALIGNED-NEXT: buffer_load_ubyte v46, v2, s[0:3], 0 offen offset:191 +; ALIGNED-NEXT: buffer_load_ubyte v44, v2, s[0:3], 0 offen offset:187 +; ALIGNED-NEXT: buffer_load_ubyte v45, v2, s[0:3], 0 offen offset:184 +; ALIGNED-NEXT: buffer_load_ubyte v43, v2, s[0:3], 0 offen offset:185 +; ALIGNED-NEXT: buffer_load_ubyte v42, v2, s[0:3], 0 offen offset:186 +; ALIGNED-NEXT: buffer_load_ubyte v125, v2, s[0:3], 0 offen offset:4 +; ALIGNED-NEXT: s_waitcnt vmcnt(62) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(56) ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(55) ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(54) ; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(53) +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(52) ; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(51) ; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 -; ALIGNED-NEXT: s_waitcnt vmcnt(49) -; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(48) +; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(47) ; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(46) ; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(45) ; ALIGNED-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v3 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v7, 8, v5 -; ALIGNED-NEXT: s_waitcnt vmcnt(42) -; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v4, v9, 8, v8 -; ALIGNED-NEXT: s_waitcnt vmcnt(40) ; ALIGNED-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v4, v9, 8, v8 ; ALIGNED-NEXT: v_lshl_or_b32 v5, v10, 8, v6 ; ALIGNED-NEXT: v_lshl_or_b32 v6, v11, 8, v12 ; ALIGNED-NEXT: v_lshl_or_b32 v7, v15, 8, v14 ; ALIGNED-NEXT: v_lshl_or_b32 v8, v19, 8, v17 -; ALIGNED-NEXT: s_waitcnt vmcnt(39) ; ALIGNED-NEXT: v_lshl_or_b32 v9, v16, 8, v13 -; ALIGNED-NEXT: s_waitcnt vmcnt(37) ; ALIGNED-NEXT: v_lshl_or_b32 v10, v20, 8, v18 -; ALIGNED-NEXT: s_waitcnt vmcnt(35) ; ALIGNED-NEXT: v_lshl_or_b32 v11, v23, 8, v22 -; ALIGNED-NEXT: s_waitcnt vmcnt(33) ; ALIGNED-NEXT: v_lshl_or_b32 v12, v27, 8, v25 -; ALIGNED-NEXT: s_waitcnt vmcnt(31) ; ALIGNED-NEXT: v_lshl_or_b32 v13, v24, 8, v21 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: s_waitcnt vmcnt(29) ; ALIGNED-NEXT: v_lshl_or_b32 v14, v28, 8, v26 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; 
ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 16, v3 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v6, 16, v5 ; ALIGNED-NEXT: v_lshl_or_b32 v4, v8, 16, v7 @@ -12819,82 +12802,81 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: v_lshl_or_b32 v6, v12, 16, v11 ; ALIGNED-NEXT: v_lshl_or_b32 v7, v14, 16, v13 ; ALIGNED-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(27) ; ALIGNED-NEXT: v_lshl_or_b32 v15, v30, 8, v29 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(25) ; ALIGNED-NEXT: v_lshl_or_b32 v0, v32, 8, v34 ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(23) ; ALIGNED-NEXT: v_lshl_or_b32 v1, v36, 8, v31 ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(21) ; ALIGNED-NEXT: v_lshl_or_b32 v3, v35, 8, v33 ; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(16) ; ALIGNED-NEXT: v_lshl_or_b32 v4, v48, 8, v37 ; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:868 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(14) ; ALIGNED-NEXT: v_lshl_or_b32 v5, v49, 8, v38 ; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:876 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v6, v50, 8, v39 ; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:880 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(10) ; ALIGNED-NEXT: v_lshl_or_b32 v7, v51, 8, v52 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v0, 16, v15 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 16, v1 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v5, 16, v4 +; ALIGNED-NEXT: s_clause 0x1 ; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:85 +; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:87 ; ALIGNED-NEXT: v_lshl_or_b32 v4, v7, 16, v6 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:916 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v54, 8, v53 ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:924 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(10) ; ALIGNED-NEXT: v_lshl_or_b32 v1, v55, 8, v65 ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(8) +; ALIGNED-NEXT: s_waitcnt vmcnt(62) ; ALIGNED-NEXT: v_lshl_or_b32 v3, v66, 8, v64 ; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:948 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(6) +; ALIGNED-NEXT: s_waitcnt vmcnt(61) ; ALIGNED-NEXT: v_lshl_or_b32 v4, v68, 8, v67 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:86 ; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:82 -; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:86 +; ALIGNED-NEXT: s_waitcnt vmcnt(62) +; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:964 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 16, v3 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:972 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(6) +; ALIGNED-NEXT: s_waitcnt vmcnt(61) ; ALIGNED-NEXT: v_lshl_or_b32 v0, v70, 8, v69 ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: 
buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:83 ; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:74 +; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:83 ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:984 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(5) +; ALIGNED-NEXT: s_waitcnt vmcnt(61) ; ALIGNED-NEXT: v_lshl_or_b32 v1, v80, 8, v71 -; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v71, off, s[0:3], s32 offset:976 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v80, off, s[0:3], s32 offset:980 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x4 +; ALIGNED-NEXT: buffer_load_ubyte v85, v2, s[0:3], 0 offen offset:212 +; ALIGNED-NEXT: buffer_load_ubyte v71, v2, s[0:3], 0 offen offset:213 +; ALIGNED-NEXT: buffer_load_ubyte v83, v2, s[0:3], 0 offen offset:214 +; ALIGNED-NEXT: buffer_load_ubyte v69, v2, s[0:3], 0 offen offset:215 +; ALIGNED-NEXT: buffer_load_ubyte v80, v2, s[0:3], 0 offen offset:211 +; ALIGNED-NEXT: buffer_store_dword v65, off, s[0:3], s32 offset:944 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:73 -; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v64, off, s[0:3], s32 offset:940 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v66, off, s[0:3], s32 offset:952 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:956 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:996 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:72 -; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:856 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:848 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(62) +; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:1000 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:912 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill +; 
ALIGNED-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:928 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x2 +; ALIGNED-NEXT: buffer_load_ubyte v54, v2, s[0:3], 0 offen offset:208 +; ALIGNED-NEXT: buffer_load_ubyte v51, v2, s[0:3], 0 offen offset:209 +; ALIGNED-NEXT: buffer_load_ubyte v52, v2, s[0:3], 0 offen offset:210 ; ALIGNED-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:872 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:860 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:864 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:884 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill @@ -12902,47 +12884,97 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:892 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:904 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:912 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:920 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:928 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v65, off, s[0:3], s32 offset:944 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:936 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v64, off, s[0:3], s32 offset:940 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v66, off, s[0:3], s32 offset:952 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:956 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:964 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x7 +; ALIGNED-NEXT: buffer_load_ubyte v50, v2, s[0:3], 0 offen offset:224 +; ALIGNED-NEXT: buffer_load_ubyte v48, v2, s[0:3], 0 offen offset:225 +; ALIGNED-NEXT: buffer_load_ubyte v37, v2, s[0:3], 0 offen offset:226 +; ALIGNED-NEXT: buffer_load_ubyte v36, v2, s[0:3], 0 offen offset:227 +; ALIGNED-NEXT: buffer_load_ubyte v49, v2, s[0:3], 0 offen offset:228 +; ALIGNED-NEXT: buffer_load_ubyte v39, v2, s[0:3], 0 offen offset:229 +; ALIGNED-NEXT: buffer_load_ubyte v38, v2, s[0:3], 0 offen offset:230 +; ALIGNED-NEXT: buffer_load_ubyte v35, v2, s[0:3], 0 offen offset:231 +; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:856 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:848 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:860 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x4 +; ALIGNED-NEXT: buffer_load_ubyte v34, v2, s[0:3], 0 offen offset:236 +; ALIGNED-NEXT: buffer_load_ubyte v33, v2, s[0:3], 0 offen offset:237 +; ALIGNED-NEXT: buffer_load_ubyte v32, v2, s[0:3], 0 offen offset:238 +; ALIGNED-NEXT: buffer_load_ubyte v30, v2, s[0:3], 0 offen offset:239 +; ALIGNED-NEXT: 
buffer_load_ubyte v29, v2, s[0:3], 0 offen offset:235 +; ALIGNED-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x2 +; ALIGNED-NEXT: buffer_load_ubyte v28, v2, s[0:3], 0 offen offset:232 +; ALIGNED-NEXT: buffer_load_ubyte v27, v2, s[0:3], 0 offen offset:233 +; ALIGNED-NEXT: buffer_load_ubyte v25, v2, s[0:3], 0 offen offset:234 +; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0xc +; ALIGNED-NEXT: buffer_load_ubyte v24, v2, s[0:3], 0 offen offset:240 +; ALIGNED-NEXT: buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:241 +; ALIGNED-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:242 +; ALIGNED-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:243 +; ALIGNED-NEXT: buffer_load_ubyte v23, v2, s[0:3], 0 offen offset:244 +; ALIGNED-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:245 +; ALIGNED-NEXT: buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:246 +; ALIGNED-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:247 +; ALIGNED-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:252 +; ALIGNED-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:253 +; ALIGNED-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:254 +; ALIGNED-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:255 +; ALIGNED-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:251 ; ALIGNED-NEXT: buffer_store_dword v70, off, s[0:3], s32 offset:968 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v71, off, s[0:3], s32 offset:976 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v80, off, s[0:3], s32 offset:980 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:1000 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v127, off, s[0:3], s32 offset:1412 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:87 -; ALIGNED-NEXT: s_waitcnt vmcnt(7) +; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:936 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:920 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(44) ; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1032 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(6) -; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1036 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt 
vmcnt(5) +; ALIGNED-NEXT: s_waitcnt vmcnt(43) +; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1040 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(42) ; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1024 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(4) -; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1020 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: s_waitcnt vmcnt(41) +; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1036 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(40) ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1004 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: s_waitcnt vmcnt(33) ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:992 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: s_waitcnt vmcnt(32) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:988 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v81, 8, v3 ; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:84 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1020 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x7 +; ALIGNED-NEXT: buffer_load_ubyte v84, v2, s[0:3], 0 offen offset:216 +; ALIGNED-NEXT: buffer_load_ubyte v81, v2, s[0:3], 0 offen offset:217 +; ALIGNED-NEXT: buffer_load_ubyte v65, v2, s[0:3], 0 offen offset:218 +; ALIGNED-NEXT: buffer_load_ubyte v66, v2, s[0:3], 0 offen offset:219 +; ALIGNED-NEXT: buffer_load_ubyte v82, v2, s[0:3], 0 offen offset:220 +; ALIGNED-NEXT: buffer_load_ubyte v67, v2, s[0:3], 0 offen offset:221 +; ALIGNED-NEXT: buffer_load_ubyte v68, v2, s[0:3], 0 offen offset:222 +; ALIGNED-NEXT: buffer_load_ubyte v64, v2, s[0:3], 0 offen offset:223 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:81 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1008 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:80 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1040 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: s_waitcnt vmcnt(16) +; ALIGNED-NEXT: v_lshl_or_b32 v95, v16, 8, v20 +; ALIGNED-NEXT: s_waitcnt vmcnt(12) +; ALIGNED-NEXT: v_lshl_or_b32 v109, v11, 8, v12 +; ALIGNED-NEXT: s_waitcnt vmcnt(10) ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1028 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1016 ; 4-byte Folded Spill @@ -13214,289 +13246,158 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1400 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v7 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v114, 8, v113 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v8, 8, v6 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:248 +; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:249 +; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:250 +; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:6 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1436 ; 4-byte Folded Spill ; ALIGNED-NEXT: 
v_lshl_or_b32 v0, v5, 8, v3 +; ALIGNED-NEXT: s_clause 0x1 ; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:158 +; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:5 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:157 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1444 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:156 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: s_waitcnt vmcnt(3) ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1452 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1448 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1440 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v126, v2, s[0:3], 0 offen offset:159 -; ALIGNED-NEXT: buffer_load_ubyte v124, v2, s[0:3], 0 offen offset:155 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) ; ALIGNED-NEXT: v_lshl_or_b32 v1, v126, 8, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v117, 8, v40 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v124, 8, v121 +; ALIGNED-NEXT: v_lshl_or_b32 v110, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v115, 8, v118 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v112, 8, v116 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1456 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x2 -; ALIGNED-NEXT: buffer_load_ubyte v123, v2, s[0:3], 0 offen offset:152 -; ALIGNED-NEXT: buffer_load_ubyte v122, v2, s[0:3], 0 offen offset:153 -; ALIGNED-NEXT: buffer_load_ubyte v121, v2, s[0:3], 0 offen offset:154 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) ; ALIGNED-NEXT: v_lshl_or_b32 v0, v122, 8, v123 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v124, 8, v121 +; ALIGNED-NEXT: v_lshl_or_b32 v93, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v101, 8, v102 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v105, 8, v104 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v99, 8, v100 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1460 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x7 -; ALIGNED-NEXT: buffer_load_ubyte v120, v2, s[0:3], 0 offen offset:160 -; ALIGNED-NEXT: buffer_load_ubyte v108, v2, s[0:3], 0 offen offset:161 -; ALIGNED-NEXT: buffer_load_ubyte v104, v2, s[0:3], 0 offen offset:162 -; ALIGNED-NEXT: buffer_load_ubyte v105, v2, s[0:3], 0 offen offset:163 -; ALIGNED-NEXT: buffer_load_ubyte v111, v2, s[0:3], 0 offen offset:164 -; ALIGNED-NEXT: buffer_load_ubyte v106, v2, s[0:3], 0 offen offset:165 -; ALIGNED-NEXT: buffer_load_ubyte v107, v2, s[0:3], 0 offen offset:166 -; ALIGNED-NEXT: buffer_load_ubyte v94, v2, s[0:3], 0 offen offset:167 -; ALIGNED-NEXT: s_waitcnt vmcnt(6) ; ALIGNED-NEXT: v_lshl_or_b32 v0, v108, 8, v120 -; ALIGNED-NEXT: s_waitcnt vmcnt(4) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v105, 8, v104 +; ALIGNED-NEXT: v_lshl_or_b32 v91, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v96, 8, v98 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v97, 8, v87 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: v_lshl_or_b32 v1, v94, 8, v107 +; ALIGNED-NEXT: v_lshl_or_b32 v77, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v71, 8, v85 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1464 ; 4-byte Folded Spill ; ALIGNED-NEXT: 
v_lshl_or_b32 v0, v106, 8, v111 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v69, 8, v83 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v88, 8, v90 +; ALIGNED-NEXT: v_lshl_or_b32 v58, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v81, 8, v84 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v66, 8, v65 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1468 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x4 -; ALIGNED-NEXT: buffer_load_ubyte v92, v2, s[0:3], 0 offen offset:172 -; ALIGNED-NEXT: buffer_load_ubyte v89, v2, s[0:3], 0 offen offset:173 -; ALIGNED-NEXT: buffer_load_ubyte v90, v2, s[0:3], 0 offen offset:174 -; ALIGNED-NEXT: buffer_load_ubyte v88, v2, s[0:3], 0 offen offset:175 -; ALIGNED-NEXT: buffer_load_ubyte v78, v2, s[0:3], 0 offen offset:171 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) ; ALIGNED-NEXT: v_lshl_or_b32 v0, v89, 8, v92 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v88, 8, v90 +; ALIGNED-NEXT: v_lshl_or_b32 v41, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v67, 8, v82 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v78, 8, v75 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v64, 8, v68 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1472 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x2 -; ALIGNED-NEXT: buffer_load_ubyte v79, v2, s[0:3], 0 offen offset:168 -; ALIGNED-NEXT: buffer_load_ubyte v76, v2, s[0:3], 0 offen offset:169 -; ALIGNED-NEXT: buffer_load_ubyte v75, v2, s[0:3], 0 offen offset:170 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) ; ALIGNED-NEXT: v_lshl_or_b32 v0, v76, 8, v79 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v78, 8, v75 +; ALIGNED-NEXT: v_lshl_or_b32 v119, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v51, 8, v54 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v80, 8, v52 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v60, 8, v61 +; ALIGNED-NEXT: v_lshl_or_b32 v103, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v48, 8, v50 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1476 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x7 -; ALIGNED-NEXT: buffer_load_ubyte v74, v2, s[0:3], 0 offen offset:176 -; ALIGNED-NEXT: buffer_load_ubyte v72, v2, s[0:3], 0 offen offset:177 -; ALIGNED-NEXT: buffer_load_ubyte v61, v2, s[0:3], 0 offen offset:178 -; ALIGNED-NEXT: buffer_load_ubyte v60, v2, s[0:3], 0 offen offset:179 -; ALIGNED-NEXT: buffer_load_ubyte v73, v2, s[0:3], 0 offen offset:180 -; ALIGNED-NEXT: buffer_load_ubyte v63, v2, s[0:3], 0 offen offset:181 -; ALIGNED-NEXT: buffer_load_ubyte v62, v2, s[0:3], 0 offen offset:182 -; ALIGNED-NEXT: buffer_load_ubyte v59, v2, s[0:3], 0 offen offset:183 -; ALIGNED-NEXT: s_waitcnt vmcnt(6) ; ALIGNED-NEXT: v_lshl_or_b32 v0, v72, 8, v74 -; ALIGNED-NEXT: s_waitcnt vmcnt(4) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v60, 8, v61 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v36, 8, v37 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: v_lshl_or_b32 v86, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v39, 8, v49 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v35, 8, v38 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v59, 8, v62 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1480 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v63, 8, v73 +; ALIGNED-NEXT: v_lshl_or_b32 v70, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v33, 8, v34 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v30, 8, v32 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v46, 8, v56 +; 
ALIGNED-NEXT: v_lshl_or_b32 v55, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v27, 8, v28 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v29, 8, v25 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1484 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x4 -; ALIGNED-NEXT: buffer_load_ubyte v57, v2, s[0:3], 0 offen offset:188 -; ALIGNED-NEXT: buffer_load_ubyte v47, v2, s[0:3], 0 offen offset:189 -; ALIGNED-NEXT: buffer_load_ubyte v56, v2, s[0:3], 0 offen offset:190 -; ALIGNED-NEXT: buffer_load_ubyte v46, v2, s[0:3], 0 offen offset:191 -; ALIGNED-NEXT: buffer_load_ubyte v44, v2, s[0:3], 0 offen offset:187 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) ; ALIGNED-NEXT: v_lshl_or_b32 v0, v47, 8, v57 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v46, 8, v56 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1488 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x2 -; ALIGNED-NEXT: buffer_load_ubyte v45, v2, s[0:3], 0 offen offset:184 -; ALIGNED-NEXT: buffer_load_ubyte v43, v2, s[0:3], 0 offen offset:185 -; ALIGNED-NEXT: buffer_load_ubyte v42, v2, s[0:3], 0 offen offset:186 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v43, 8, v45 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v44, 8, v42 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1492 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x3e -; ALIGNED-NEXT: buffer_load_ubyte v40, v2, s[0:3], 0 offen offset:192 -; ALIGNED-NEXT: buffer_load_ubyte v117, v2, s[0:3], 0 offen offset:193 -; ALIGNED-NEXT: buffer_load_ubyte v113, v2, s[0:3], 0 offen offset:194 -; ALIGNED-NEXT: buffer_load_ubyte v114, v2, s[0:3], 0 offen offset:195 -; ALIGNED-NEXT: buffer_load_ubyte v118, v2, s[0:3], 0 offen offset:196 -; ALIGNED-NEXT: buffer_load_ubyte v115, v2, s[0:3], 0 offen offset:197 -; ALIGNED-NEXT: buffer_load_ubyte v116, v2, s[0:3], 0 offen offset:198 -; ALIGNED-NEXT: buffer_load_ubyte v112, v2, s[0:3], 0 offen offset:199 -; ALIGNED-NEXT: buffer_load_ubyte v102, v2, s[0:3], 0 offen offset:204 -; ALIGNED-NEXT: buffer_load_ubyte v101, v2, s[0:3], 0 offen offset:205 -; ALIGNED-NEXT: buffer_load_ubyte v100, v2, s[0:3], 0 offen offset:206 -; ALIGNED-NEXT: buffer_load_ubyte v99, v2, s[0:3], 0 offen offset:207 -; ALIGNED-NEXT: buffer_load_ubyte v97, v2, s[0:3], 0 offen offset:203 -; ALIGNED-NEXT: buffer_load_ubyte v98, v2, s[0:3], 0 offen offset:200 -; ALIGNED-NEXT: buffer_load_ubyte v96, v2, s[0:3], 0 offen offset:201 -; ALIGNED-NEXT: buffer_load_ubyte v87, v2, s[0:3], 0 offen offset:202 -; ALIGNED-NEXT: buffer_load_ubyte v85, v2, s[0:3], 0 offen offset:212 -; ALIGNED-NEXT: buffer_load_ubyte v71, v2, s[0:3], 0 offen offset:213 -; ALIGNED-NEXT: buffer_load_ubyte v83, v2, s[0:3], 0 offen offset:214 -; ALIGNED-NEXT: buffer_load_ubyte v69, v2, s[0:3], 0 offen offset:215 -; ALIGNED-NEXT: buffer_load_ubyte v80, v2, s[0:3], 0 offen offset:211 -; ALIGNED-NEXT: buffer_load_ubyte v84, v2, s[0:3], 0 offen offset:216 -; ALIGNED-NEXT: buffer_load_ubyte v81, v2, s[0:3], 0 offen offset:217 -; ALIGNED-NEXT: buffer_load_ubyte v65, v2, s[0:3], 0 offen offset:218 -; ALIGNED-NEXT: buffer_load_ubyte v66, v2, s[0:3], 0 offen offset:219 -; ALIGNED-NEXT: buffer_load_ubyte v82, v2, s[0:3], 0 offen offset:220 -; ALIGNED-NEXT: buffer_load_ubyte v67, v2, s[0:3], 0 offen offset:221 -; ALIGNED-NEXT: buffer_load_ubyte v68, v2, s[0:3], 0 offen offset:222 -; ALIGNED-NEXT: buffer_load_ubyte v64, v2, s[0:3], 0 offen 
offset:223 -; ALIGNED-NEXT: buffer_load_ubyte v54, v2, s[0:3], 0 offen offset:208 -; ALIGNED-NEXT: buffer_load_ubyte v51, v2, s[0:3], 0 offen offset:209 -; ALIGNED-NEXT: buffer_load_ubyte v52, v2, s[0:3], 0 offen offset:210 -; ALIGNED-NEXT: buffer_load_ubyte v50, v2, s[0:3], 0 offen offset:224 -; ALIGNED-NEXT: buffer_load_ubyte v48, v2, s[0:3], 0 offen offset:225 -; ALIGNED-NEXT: buffer_load_ubyte v37, v2, s[0:3], 0 offen offset:226 -; ALIGNED-NEXT: buffer_load_ubyte v36, v2, s[0:3], 0 offen offset:227 -; ALIGNED-NEXT: buffer_load_ubyte v49, v2, s[0:3], 0 offen offset:228 -; ALIGNED-NEXT: buffer_load_ubyte v39, v2, s[0:3], 0 offen offset:229 -; ALIGNED-NEXT: buffer_load_ubyte v38, v2, s[0:3], 0 offen offset:230 -; ALIGNED-NEXT: buffer_load_ubyte v35, v2, s[0:3], 0 offen offset:231 -; ALIGNED-NEXT: buffer_load_ubyte v34, v2, s[0:3], 0 offen offset:236 -; ALIGNED-NEXT: buffer_load_ubyte v33, v2, s[0:3], 0 offen offset:237 -; ALIGNED-NEXT: buffer_load_ubyte v32, v2, s[0:3], 0 offen offset:238 -; ALIGNED-NEXT: buffer_load_ubyte v30, v2, s[0:3], 0 offen offset:239 -; ALIGNED-NEXT: buffer_load_ubyte v29, v2, s[0:3], 0 offen offset:235 -; ALIGNED-NEXT: buffer_load_ubyte v28, v2, s[0:3], 0 offen offset:232 -; ALIGNED-NEXT: buffer_load_ubyte v27, v2, s[0:3], 0 offen offset:233 -; ALIGNED-NEXT: buffer_load_ubyte v25, v2, s[0:3], 0 offen offset:234 -; ALIGNED-NEXT: buffer_load_ubyte v24, v2, s[0:3], 0 offen offset:240 -; ALIGNED-NEXT: buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:241 -; ALIGNED-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:242 -; ALIGNED-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:243 -; ALIGNED-NEXT: buffer_load_ubyte v23, v2, s[0:3], 0 offen offset:244 -; ALIGNED-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:245 -; ALIGNED-NEXT: buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:246 -; ALIGNED-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:247 -; ALIGNED-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:252 -; ALIGNED-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:253 -; ALIGNED-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:254 -; ALIGNED-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:255 -; ALIGNED-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:251 -; ALIGNED-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:248 -; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:249 -; ALIGNED-NEXT: s_clause 0x5 -; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:250 -; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:2 -; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:3 -; ALIGNED-NEXT: buffer_load_ubyte v125, v2, s[0:3], 0 offen offset:4 -; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:5 -; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:6 -; ALIGNED-NEXT: s_waitcnt vmcnt(62) -; ALIGNED-NEXT: v_lshl_or_b32 v3, v117, 8, v40 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v114, 8, v113 -; ALIGNED-NEXT: v_lshl_or_b32 v110, v4, 16, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v115, 8, v118 -; ALIGNED-NEXT: s_waitcnt vmcnt(61) -; ALIGNED-NEXT: v_lshl_or_b32 v4, v112, 8, v116 -; ALIGNED-NEXT: v_lshl_or_b32 v93, v4, 16, v3 -; ALIGNED-NEXT: s_waitcnt vmcnt(59) -; ALIGNED-NEXT: v_lshl_or_b32 v3, v101, 8, v102 -; ALIGNED-NEXT: s_waitcnt vmcnt(57) -; ALIGNED-NEXT: v_lshl_or_b32 v4, v99, 8, v100 -; ALIGNED-NEXT: s_waitcnt vmcnt(13) -; ALIGNED-NEXT: v_lshl_or_b32 v95, v16, 8, v20 -; ALIGNED-NEXT: s_waitcnt vmcnt(9) -; ALIGNED-NEXT: v_lshl_or_b32 
v109, v11, 8, v12 -; ALIGNED-NEXT: v_lshl_or_b32 v91, v4, 16, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v96, 8, v98 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v97, 8, v87 -; ALIGNED-NEXT: v_lshl_or_b32 v77, v4, 16, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v71, 8, v85 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v69, 8, v83 -; ALIGNED-NEXT: v_lshl_or_b32 v58, v4, 16, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v81, 8, v84 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v66, 8, v65 -; ALIGNED-NEXT: v_lshl_or_b32 v41, v4, 16, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v67, 8, v82 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v64, 8, v68 -; ALIGNED-NEXT: v_lshl_or_b32 v119, v4, 16, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v51, 8, v54 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v80, 8, v52 -; ALIGNED-NEXT: v_lshl_or_b32 v103, v4, 16, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v48, 8, v50 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v36, 8, v37 -; ALIGNED-NEXT: v_lshl_or_b32 v86, v4, 16, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v39, 8, v49 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v35, 8, v38 -; ALIGNED-NEXT: v_lshl_or_b32 v70, v4, 16, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v33, 8, v34 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v30, 8, v32 -; ALIGNED-NEXT: v_lshl_or_b32 v55, v4, 16, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v27, 8, v28 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v29, 8, v25 ; ALIGNED-NEXT: v_lshl_or_b32 v53, v4, 16, v3 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v22, 8, v24 ; ALIGNED-NEXT: v_lshl_or_b32 v4, v18, 8, v17 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v44, 8, v42 ; ALIGNED-NEXT: v_lshl_or_b32 v31, v4, 16, v3 ; ALIGNED-NEXT: v_lshl_or_b32 v4, v19, 8, v23 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1488 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v43, 8, v45 ; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen ; ALIGNED-NEXT: v_lshl_or_b32 v26, v95, 16, v4 ; ALIGNED-NEXT: v_lshl_or_b32 v95, v13, 8, v14 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: s_clause 0x1 +; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:2 ; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:7 -; ALIGNED-NEXT: s_waitcnt vmcnt(5) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1292 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1300 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(4) -; ALIGNED-NEXT: buffer_store_dword v125, off, s[0:3], s32 offset:1296 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1332 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v21, v109, 16, v95 ; ALIGNED-NEXT: v_lshl_or_b32 v95, v8, 8, v10 ; ALIGNED-NEXT: v_lshl_or_b32 v109, v9, 8, v7 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1492 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:3 ; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1304 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1332 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v125, off, s[0:3], s32 offset:1296 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v15, v109, 16, v95 ; ALIGNED-NEXT: buffer_load_ubyte v95, v2, s[0:3], 0 offen offset:1 -; ALIGNED-NEXT: v_lshl_or_b32 v109, v0, 8, v1 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: s_waitcnt vmcnt(4) ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1260 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt 
vmcnt(1) +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1300 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(2) ; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1340 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: v_lshl_or_b32 v109, v0, 8, v1 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1292 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:14 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) ; ALIGNED-NEXT: buffer_store_dword v95, off, s[0:3], s32 offset:1284 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v95, v95, 8, v3 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v109, 16, v95 ; ALIGNED-NEXT: v_lshl_or_b32 v95, v5, 8, v125 ; ALIGNED-NEXT: v_lshl_or_b32 v109, v4, 8, v6 +; ALIGNED-NEXT: s_clause 0x1 +; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:12 +; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:13 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1312 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v109, 16, v95 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1348 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x4 -; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:12 -; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:13 -; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:14 +; ALIGNED-NEXT: s_clause 0x1 ; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:15 ; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:11 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: buffer_store_dword v127, off, s[0:3], s32 offset:1412 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(4) +; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1364 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(2) ; ALIGNED-NEXT: v_lshl_or_b32 v95, v4, 8, v6 ; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1376 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:9 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) ; ALIGNED-NEXT: v_lshl_or_b32 v109, v0, 8, v1 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1368 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1364 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:9 ; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:10 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v109, 16, v95 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1392 ; 4-byte Folded Spill @@ -13509,7 +13410,6 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: v_lshl_or_b32 v95, v4, 8, v0 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1380 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v109, 16, v95 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1396 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x2 ; ALIGNED-NEXT: buffer_load_ubyte v109, v2, s[0:3], 0 offen offset:18 ; ALIGNED-NEXT: buffer_load_ubyte v125, v2, s[0:3], 0 offen offset:16 @@ -13518,10 +13418,11 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:236 ; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:228 ; 
ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:224 -; ALIGNED-NEXT: s_clause 0x1 +; ALIGNED-NEXT: s_clause 0x1 ; 8-byte Folded Reload ; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:704 ; ALIGNED-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:708 ; ALIGNED-NEXT: v_add_nc_u32_e32 v2, 0x100, v2 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1396 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(4) ; ALIGNED-NEXT: v_lshl_or_b32 v0, v127, 8, v109 ; ALIGNED-NEXT: s_waitcnt vmcnt(2) @@ -13590,6 +13491,8 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_store_dword v91, off, s[0:3], s32 offset:220 ; ALIGNED-NEXT: buffer_store_dword v93, off, s[0:3], s32 offset:212 ; ALIGNED-NEXT: buffer_store_dword v110, off, s[0:3], s32 offset:208 +; ALIGNED-NEXT: v_lshl_or_b32 v127, v0, 16, v127 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1492 ; 4-byte Folded Reload ; ALIGNED-NEXT: flat_store_byte v[3:4], v87 offset:202 ; ALIGNED-NEXT: flat_store_byte v[3:4], v97 offset:203 ; ALIGNED-NEXT: flat_store_byte v[3:4], v96 offset:201 @@ -13606,22 +13509,6 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: flat_store_byte v[3:4], v116 offset:198 ; ALIGNED-NEXT: flat_store_byte v[3:4], v118 offset:196 ; ALIGNED-NEXT: flat_store_byte v[3:4], v40 offset:192 -; ALIGNED-NEXT: v_lshl_or_b32 v127, v0, 16, v127 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1492 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_add_u32 s4, s4, 0x100 -; ALIGNED-NEXT: s_addc_u32 s5, s5, 0 -; ALIGNED-NEXT: s_cmp_lg_u64 s[4:5], 0x800 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1488 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1484 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1480 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; ALIGNED-NEXT: flat_store_byte v[3:4], v42 offset:186 ; ALIGNED-NEXT: flat_store_byte v[3:4], v44 offset:187 ; ALIGNED-NEXT: flat_store_byte v[3:4], v43 offset:185 @@ -13638,18 +13525,6 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: flat_store_byte v[3:4], v62 offset:182 ; ALIGNED-NEXT: flat_store_byte v[3:4], v73 offset:180 ; ALIGNED-NEXT: flat_store_byte v[3:4], v74 offset:176 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1476 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1472 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1468 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1464 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: 
buffer_store_dword v0, off, s[0:3], s32 offset:304 ; ALIGNED-NEXT: flat_store_byte v[3:4], v75 offset:170 ; ALIGNED-NEXT: flat_store_byte v[3:4], v78 offset:171 ; ALIGNED-NEXT: flat_store_byte v[3:4], v76 offset:169 @@ -13666,6 +13541,36 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: flat_store_byte v[3:4], v107 offset:166 ; ALIGNED-NEXT: flat_store_byte v[3:4], v111 offset:164 ; ALIGNED-NEXT: flat_store_byte v[3:4], v120 offset:160 +; ALIGNED-NEXT: flat_store_byte v[3:4], v121 offset:154 +; ALIGNED-NEXT: flat_store_byte v[3:4], v124 offset:155 +; ALIGNED-NEXT: flat_store_byte v[3:4], v122 offset:153 +; ALIGNED-NEXT: flat_store_byte v[3:4], v126 offset:159 +; ALIGNED-NEXT: s_add_u32 s4, s4, 0x100 +; ALIGNED-NEXT: s_addc_u32 s5, s5, 0 +; ALIGNED-NEXT: s_cmp_lg_u64 s[4:5], 0x800 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1488 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1484 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1480 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1476 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1472 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1468 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1464 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1460 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 @@ -13678,10 +13583,6 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1436 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 -; ALIGNED-NEXT: flat_store_byte v[3:4], v121 offset:154 -; ALIGNED-NEXT: flat_store_byte v[3:4], v124 offset:155 -; ALIGNED-NEXT: flat_store_byte v[3:4], v122 offset:153 -; ALIGNED-NEXT: flat_store_byte v[3:4], v126 offset:159 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1448 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:157 @@ -14147,6 +14048,8 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:388 ; ALIGNED-NEXT: buffer_store_dword v127, off, s[0:3], s32 offset:384 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; ALIGNED-NEXT: 
flat_store_byte v[3:4], v1 offset:10 +; ALIGNED-NEXT: flat_store_byte v[3:4], v5 offset:11 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:26 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload @@ -14200,8 +14103,6 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1312 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:400 -; ALIGNED-NEXT: flat_store_byte v[3:4], v1 offset:10 -; ALIGNED-NEXT: flat_store_byte v[3:4], v5 offset:11 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1376 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:13 @@ -14253,23 +14154,23 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: s_mov_b32 s7, -1 ; ALIGNED-NEXT: .LBB9_4: ; %memmove_bwd_loop ; ALIGNED-NEXT: ; =>This Inner Loop Header: Depth=1 -; ALIGNED-NEXT: s_clause 0x39 +; ALIGNED-NEXT: s_clause 0x3e ; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:20 ; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:21 ; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:22 ; ALIGNED-NEXT: buffer_load_ubyte v3, v4, s[0:3], 0 offen offset:23 -; ALIGNED-NEXT: buffer_load_ubyte v6, v4, s[0:3], 0 offen offset:24 -; ALIGNED-NEXT: buffer_load_ubyte v10, v4, s[0:3], 0 offen offset:25 -; ALIGNED-NEXT: buffer_load_ubyte v12, v4, s[0:3], 0 offen offset:26 ; ALIGNED-NEXT: buffer_load_ubyte v125, v4, s[0:3], 0 offen offset:19 ; ALIGNED-NEXT: buffer_load_ubyte v5, v4, s[0:3], 0 offen offset:28 ; ALIGNED-NEXT: buffer_load_ubyte v7, v4, s[0:3], 0 offen offset:29 ; ALIGNED-NEXT: buffer_load_ubyte v8, v4, s[0:3], 0 offen offset:30 ; ALIGNED-NEXT: buffer_load_ubyte v9, v4, s[0:3], 0 offen offset:31 +; ALIGNED-NEXT: buffer_load_ubyte v6, v4, s[0:3], 0 offen offset:24 +; ALIGNED-NEXT: buffer_load_ubyte v10, v4, s[0:3], 0 offen offset:25 +; ALIGNED-NEXT: buffer_load_ubyte v12, v4, s[0:3], 0 offen offset:26 +; ALIGNED-NEXT: buffer_load_ubyte v11, v4, s[0:3], 0 offen offset:27 ; ALIGNED-NEXT: buffer_load_ubyte v14, v4, s[0:3], 0 offen offset:32 ; ALIGNED-NEXT: buffer_load_ubyte v15, v4, s[0:3], 0 offen offset:33 ; ALIGNED-NEXT: buffer_load_ubyte v17, v4, s[0:3], 0 offen offset:34 -; ALIGNED-NEXT: buffer_load_ubyte v11, v4, s[0:3], 0 offen offset:27 ; ALIGNED-NEXT: buffer_load_ubyte v19, v4, s[0:3], 0 offen offset:35 ; ALIGNED-NEXT: buffer_load_ubyte v13, v4, s[0:3], 0 offen offset:36 ; ALIGNED-NEXT: buffer_load_ubyte v16, v4, s[0:3], 0 offen offset:37 @@ -14291,17 +14192,17 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_load_ubyte v37, v4, s[0:3], 0 offen offset:53 ; ALIGNED-NEXT: buffer_load_ubyte v35, v4, s[0:3], 0 offen offset:54 ; ALIGNED-NEXT: buffer_load_ubyte v36, v4, s[0:3], 0 offen offset:55 -; ALIGNED-NEXT: buffer_load_ubyte v48, v4, s[0:3], 0 offen offset:56 -; ALIGNED-NEXT: buffer_load_ubyte v51, v4, s[0:3], 0 offen offset:57 -; ALIGNED-NEXT: buffer_load_ubyte v52, v4, s[0:3], 0 offen offset:58 ; ALIGNED-NEXT: buffer_load_ubyte v38, v4, s[0:3], 0 offen offset:60 ; ALIGNED-NEXT: buffer_load_ubyte v50, v4, s[0:3], 0 offen offset:61 ; ALIGNED-NEXT: buffer_load_ubyte v39, v4, s[0:3], 0 offen offset:62 ; ALIGNED-NEXT: buffer_load_ubyte v49, v4, s[0:3], 0 offen offset:63 +; ALIGNED-NEXT: 
buffer_load_ubyte v48, v4, s[0:3], 0 offen offset:56 +; ALIGNED-NEXT: buffer_load_ubyte v51, v4, s[0:3], 0 offen offset:57 +; ALIGNED-NEXT: buffer_load_ubyte v52, v4, s[0:3], 0 offen offset:58 +; ALIGNED-NEXT: buffer_load_ubyte v53, v4, s[0:3], 0 offen offset:59 ; ALIGNED-NEXT: buffer_load_ubyte v29, v4, s[0:3], 0 offen offset:64 ; ALIGNED-NEXT: buffer_load_ubyte v55, v4, s[0:3], 0 offen offset:65 ; ALIGNED-NEXT: buffer_load_ubyte v66, v4, s[0:3], 0 offen offset:66 -; ALIGNED-NEXT: buffer_load_ubyte v53, v4, s[0:3], 0 offen offset:59 ; ALIGNED-NEXT: buffer_load_ubyte v67, v4, s[0:3], 0 offen offset:67 ; ALIGNED-NEXT: buffer_load_ubyte v54, v4, s[0:3], 0 offen offset:68 ; ALIGNED-NEXT: buffer_load_ubyte v64, v4, s[0:3], 0 offen offset:69 @@ -14309,57 +14210,97 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_load_ubyte v68, v4, s[0:3], 0 offen offset:71 ; ALIGNED-NEXT: buffer_load_ubyte v69, v4, s[0:3], 0 offen offset:76 ; ALIGNED-NEXT: buffer_load_ubyte v70, v4, s[0:3], 0 offen offset:77 -; ALIGNED-NEXT: buffer_load_ubyte v81, v4, s[0:3], 0 offen offset:75 ; ALIGNED-NEXT: buffer_load_ubyte v71, v4, s[0:3], 0 offen offset:78 ; ALIGNED-NEXT: buffer_load_ubyte v80, v4, s[0:3], 0 offen offset:79 -; ALIGNED-NEXT: s_waitcnt vmcnt(57) +; ALIGNED-NEXT: buffer_load_ubyte v81, v4, s[0:3], 0 offen offset:75 +; ALIGNED-NEXT: buffer_load_ubyte v126, v4, s[0:3], 0 offen offset:159 +; ALIGNED-NEXT: buffer_load_ubyte v124, v4, s[0:3], 0 offen offset:155 +; ALIGNED-NEXT: buffer_load_ubyte v123, v4, s[0:3], 0 offen offset:152 +; ALIGNED-NEXT: buffer_load_ubyte v121, v4, s[0:3], 0 offen offset:153 +; ALIGNED-NEXT: buffer_load_ubyte v111, v4, s[0:3], 0 offen offset:154 +; ALIGNED-NEXT: s_clause 0x34 +; ALIGNED-NEXT: buffer_load_ubyte v108, v4, s[0:3], 0 offen offset:160 +; ALIGNED-NEXT: buffer_load_ubyte v105, v4, s[0:3], 0 offen offset:161 +; ALIGNED-NEXT: buffer_load_ubyte v93, v4, s[0:3], 0 offen offset:162 +; ALIGNED-NEXT: buffer_load_ubyte v92, v4, s[0:3], 0 offen offset:163 +; ALIGNED-NEXT: buffer_load_ubyte v107, v4, s[0:3], 0 offen offset:164 +; ALIGNED-NEXT: buffer_load_ubyte v95, v4, s[0:3], 0 offen offset:165 +; ALIGNED-NEXT: buffer_load_ubyte v94, v4, s[0:3], 0 offen offset:166 +; ALIGNED-NEXT: buffer_load_ubyte v91, v4, s[0:3], 0 offen offset:167 +; ALIGNED-NEXT: buffer_load_ubyte v89, v4, s[0:3], 0 offen offset:172 +; ALIGNED-NEXT: buffer_load_ubyte v79, v4, s[0:3], 0 offen offset:173 +; ALIGNED-NEXT: buffer_load_ubyte v78, v4, s[0:3], 0 offen offset:174 +; ALIGNED-NEXT: buffer_load_ubyte v77, v4, s[0:3], 0 offen offset:175 +; ALIGNED-NEXT: buffer_load_ubyte v75, v4, s[0:3], 0 offen offset:171 +; ALIGNED-NEXT: buffer_load_ubyte v74, v4, s[0:3], 0 offen offset:168 +; ALIGNED-NEXT: buffer_load_ubyte v72, v4, s[0:3], 0 offen offset:169 +; ALIGNED-NEXT: buffer_load_ubyte v63, v4, s[0:3], 0 offen offset:170 +; ALIGNED-NEXT: buffer_load_ubyte v61, v4, s[0:3], 0 offen offset:176 +; ALIGNED-NEXT: buffer_load_ubyte v59, v4, s[0:3], 0 offen offset:177 +; ALIGNED-NEXT: buffer_load_ubyte v47, v4, s[0:3], 0 offen offset:178 +; ALIGNED-NEXT: buffer_load_ubyte v56, v4, s[0:3], 0 offen offset:179 +; ALIGNED-NEXT: buffer_load_ubyte v60, v4, s[0:3], 0 offen offset:180 +; ALIGNED-NEXT: buffer_load_ubyte v57, v4, s[0:3], 0 offen offset:181 +; ALIGNED-NEXT: buffer_load_ubyte v58, v4, s[0:3], 0 offen offset:182 +; ALIGNED-NEXT: buffer_load_ubyte v46, v4, s[0:3], 0 offen offset:183 +; ALIGNED-NEXT: buffer_load_ubyte v44, v4, s[0:3], 0 offen offset:188 +; 
ALIGNED-NEXT: buffer_load_ubyte v43, v4, s[0:3], 0 offen offset:189 +; ALIGNED-NEXT: buffer_load_ubyte v42, v4, s[0:3], 0 offen offset:190 +; ALIGNED-NEXT: buffer_load_ubyte v41, v4, s[0:3], 0 offen offset:191 +; ALIGNED-NEXT: buffer_load_ubyte v40, v4, s[0:3], 0 offen offset:187 +; ALIGNED-NEXT: buffer_load_ubyte v119, v4, s[0:3], 0 offen offset:184 +; ALIGNED-NEXT: buffer_load_ubyte v118, v4, s[0:3], 0 offen offset:185 +; ALIGNED-NEXT: buffer_load_ubyte v117, v4, s[0:3], 0 offen offset:186 +; ALIGNED-NEXT: buffer_load_ubyte v115, v4, s[0:3], 0 offen offset:192 +; ALIGNED-NEXT: buffer_load_ubyte v112, v4, s[0:3], 0 offen offset:193 +; ALIGNED-NEXT: buffer_load_ubyte v101, v4, s[0:3], 0 offen offset:194 +; ALIGNED-NEXT: buffer_load_ubyte v100, v4, s[0:3], 0 offen offset:195 +; ALIGNED-NEXT: buffer_load_ubyte v113, v4, s[0:3], 0 offen offset:196 +; ALIGNED-NEXT: buffer_load_ubyte v103, v4, s[0:3], 0 offen offset:197 +; ALIGNED-NEXT: buffer_load_ubyte v102, v4, s[0:3], 0 offen offset:198 +; ALIGNED-NEXT: buffer_load_ubyte v99, v4, s[0:3], 0 offen offset:199 +; ALIGNED-NEXT: buffer_load_ubyte v97, v4, s[0:3], 0 offen offset:204 +; ALIGNED-NEXT: buffer_load_ubyte v87, v4, s[0:3], 0 offen offset:205 +; ALIGNED-NEXT: buffer_load_ubyte v96, v4, s[0:3], 0 offen offset:206 +; ALIGNED-NEXT: buffer_load_ubyte v86, v4, s[0:3], 0 offen offset:207 +; ALIGNED-NEXT: buffer_load_ubyte v85, v4, s[0:3], 0 offen offset:203 +; ALIGNED-NEXT: buffer_load_ubyte v84, v4, s[0:3], 0 offen offset:200 +; ALIGNED-NEXT: buffer_load_ubyte v83, v4, s[0:3], 0 offen offset:201 +; ALIGNED-NEXT: buffer_load_ubyte v82, v4, s[0:3], 0 offen offset:202 +; ALIGNED-NEXT: buffer_load_ubyte v120, v4, s[0:3], 0 offen offset:2 +; ALIGNED-NEXT: buffer_load_ubyte v104, v4, s[0:3], 0 offen offset:4 +; ALIGNED-NEXT: buffer_load_ubyte v109, v4, s[0:3], 0 offen offset:5 +; ALIGNED-NEXT: buffer_load_ubyte v110, v4, s[0:3], 0 offen offset:6 +; ALIGNED-NEXT: buffer_load_ubyte v122, v4, s[0:3], 0 offen offset:7 +; ALIGNED-NEXT: s_waitcnt vmcnt(62) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(56) ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(55) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(54) ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(53) -; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(52) -; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(51) -; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 -; ALIGNED-NEXT: s_waitcnt vmcnt(49) ; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(48) ; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(47) ; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(46) ; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(45) +; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; 
ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v2 ; ALIGNED-NEXT: v_lshl_or_b32 v2, v7, 8, v5 -; ALIGNED-NEXT: s_waitcnt vmcnt(42) -; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v3, v9, 8, v8 -; ALIGNED-NEXT: s_waitcnt vmcnt(40) ; ALIGNED-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v5, v10, 8, v6 ; ALIGNED-NEXT: v_lshl_or_b32 v6, v11, 8, v12 ; ALIGNED-NEXT: v_lshl_or_b32 v7, v15, 8, v14 ; ALIGNED-NEXT: v_lshl_or_b32 v8, v19, 8, v17 -; ALIGNED-NEXT: s_waitcnt vmcnt(39) ; ALIGNED-NEXT: v_lshl_or_b32 v9, v16, 8, v13 -; ALIGNED-NEXT: s_waitcnt vmcnt(37) ; ALIGNED-NEXT: v_lshl_or_b32 v10, v20, 8, v18 -; ALIGNED-NEXT: s_waitcnt vmcnt(35) ; ALIGNED-NEXT: v_lshl_or_b32 v11, v23, 8, v22 -; ALIGNED-NEXT: s_waitcnt vmcnt(33) ; ALIGNED-NEXT: v_lshl_or_b32 v12, v28, 8, v25 -; ALIGNED-NEXT: s_waitcnt vmcnt(31) ; ALIGNED-NEXT: v_lshl_or_b32 v13, v24, 8, v21 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: s_waitcnt vmcnt(29) ; ALIGNED-NEXT: v_lshl_or_b32 v14, v27, 8, v26 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 16, v2 ; ALIGNED-NEXT: v_lshl_or_b32 v2, v6, 16, v5 @@ -14368,75 +14309,88 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: v_lshl_or_b32 v6, v12, 16, v11 ; ALIGNED-NEXT: v_lshl_or_b32 v7, v14, 16, v13 ; ALIGNED-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(27) ; ALIGNED-NEXT: v_lshl_or_b32 v15, v31, 8, v30 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(25) ; ALIGNED-NEXT: v_lshl_or_b32 v0, v34, 8, v33 ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(23) ; ALIGNED-NEXT: v_lshl_or_b32 v1, v37, 8, v32 ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(21) ; ALIGNED-NEXT: v_lshl_or_b32 v2, v36, 8, v35 ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(16) ; ALIGNED-NEXT: v_lshl_or_b32 v3, v50, 8, v38 ; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(14) ; ALIGNED-NEXT: v_lshl_or_b32 v5, v49, 8, v39 ; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:868 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v6, v51, 8, v48 ; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:880 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(10) ; ALIGNED-NEXT: v_lshl_or_b32 v7, v53, 8, v52 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v0, 16, v15 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; ALIGNED-NEXT: v_lshl_or_b32 v2, v5, 16, v3 +; ALIGNED-NEXT: s_clause 0x1 ; ALIGNED-NEXT: buffer_load_ubyte v5, v4, s[0:3], 0 offen offset:85 +; ALIGNED-NEXT: buffer_load_ubyte v8, v4, s[0:3], 0 offen offset:87 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v7, 16, v6 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:916 ; 4-byte Folded Spill ; 
ALIGNED-NEXT: v_lshl_or_b32 v0, v55, 8, v29 ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:920 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(10) ; ALIGNED-NEXT: v_lshl_or_b32 v1, v67, 8, v66 ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:928 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(8) ; ALIGNED-NEXT: v_lshl_or_b32 v2, v64, 8, v54 ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(6) ; ALIGNED-NEXT: v_lshl_or_b32 v3, v68, 8, v65 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v6, v4, s[0:3], 0 offen offset:86 ; ALIGNED-NEXT: buffer_load_ubyte v7, v4, s[0:3], 0 offen offset:82 -; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v6, v4, s[0:3], 0 offen offset:86 +; ALIGNED-NEXT: buffer_store_dword v66, off, s[0:3], s32 offset:956 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 16, v2 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:976 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(6) +; ALIGNED-NEXT: s_waitcnt vmcnt(62) ; ALIGNED-NEXT: v_lshl_or_b32 v0, v70, 8, v69 ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v3, v4, s[0:3], 0 offen offset:83 ; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:74 +; ALIGNED-NEXT: buffer_load_ubyte v3, v4, s[0:3], 0 offen offset:83 ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:988 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(5) ; ALIGNED-NEXT: v_lshl_or_b32 v1, v80, 8, v71 -; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v65, off, s[0:3], s32 offset:952 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v70, off, s[0:3], s32 offset:968 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v80, off, s[0:3], s32 offset:980 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: s_clause 0x5 ; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:73 -; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v80, v4, s[0:3], 0 offen offset:212 +; ALIGNED-NEXT: buffer_load_ubyte v68, v4, s[0:3], 0 offen offset:213 +; ALIGNED-NEXT: buffer_load_ubyte v70, v4, s[0:3], 0 offen offset:214 +; ALIGNED-NEXT: buffer_load_ubyte v65, v4, s[0:3], 0 offen offset:215 +; ALIGNED-NEXT: buffer_load_ubyte v66, v4, s[0:3], 0 offen offset:211 +; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:904 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:912 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:992 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:72 -; ALIGNED-NEXT: buffer_store_dword 
v28, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:936 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:948 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:940 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:964 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v71, off, s[0:3], s32 offset:972 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x7 +; ALIGNED-NEXT: buffer_load_ubyte v71, v4, s[0:3], 0 offen offset:216 +; ALIGNED-NEXT: buffer_load_ubyte v67, v4, s[0:3], 0 offen offset:217 +; ALIGNED-NEXT: buffer_load_ubyte v53, v4, s[0:3], 0 offen offset:218 +; ALIGNED-NEXT: buffer_load_ubyte v52, v4, s[0:3], 0 offen offset:219 +; ALIGNED-NEXT: buffer_load_ubyte v69, v4, s[0:3], 0 offen offset:220 +; ALIGNED-NEXT: buffer_load_ubyte v55, v4, s[0:3], 0 offen offset:221 +; ALIGNED-NEXT: buffer_load_ubyte v54, v4, s[0:3], 0 offen offset:222 +; ALIGNED-NEXT: buffer_load_ubyte v51, v4, s[0:3], 0 offen offset:223 +; ALIGNED-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:884 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:888 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x2 +; ALIGNED-NEXT: buffer_load_ubyte v50, v4, s[0:3], 0 offen offset:208 +; ALIGNED-NEXT: buffer_load_ubyte v38, v4, s[0:3], 0 offen offset:209 +; ALIGNED-NEXT: buffer_load_ubyte v39, v4, s[0:3], 0 offen offset:210 ; ALIGNED-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:860 ; 4-byte Folded Spill @@ -14445,52 +14399,83 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:876 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:864 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:872 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:884 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:888 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:892 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:904 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:912 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x7 +; ALIGNED-NEXT: buffer_load_ubyte v37, v4, s[0:3], 0 offen offset:224 
+; ALIGNED-NEXT: buffer_load_ubyte v35, v4, s[0:3], 0 offen offset:225 +; ALIGNED-NEXT: buffer_load_ubyte v31, v4, s[0:3], 0 offen offset:226 +; ALIGNED-NEXT: buffer_load_ubyte v32, v4, s[0:3], 0 offen offset:227 +; ALIGNED-NEXT: buffer_load_ubyte v36, v4, s[0:3], 0 offen offset:228 +; ALIGNED-NEXT: buffer_load_ubyte v33, v4, s[0:3], 0 offen offset:229 +; ALIGNED-NEXT: buffer_load_ubyte v34, v4, s[0:3], 0 offen offset:230 +; ALIGNED-NEXT: buffer_load_ubyte v30, v4, s[0:3], 0 offen offset:231 +; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:924 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:936 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v66, off, s[0:3], s32 offset:956 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:948 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:940 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v64, off, s[0:3], s32 offset:944 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v65, off, s[0:3], s32 offset:952 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:964 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v70, off, s[0:3], s32 offset:968 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v71, off, s[0:3], s32 offset:972 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v80, off, s[0:3], s32 offset:980 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v125, off, s[0:3], s32 offset:1452 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x4 +; ALIGNED-NEXT: buffer_load_ubyte v29, v4, s[0:3], 0 offen offset:236 +; ALIGNED-NEXT: buffer_load_ubyte v27, v4, s[0:3], 0 offen offset:237 +; ALIGNED-NEXT: buffer_load_ubyte v28, v4, s[0:3], 0 offen offset:238 +; ALIGNED-NEXT: buffer_load_ubyte v26, v4, s[0:3], 0 offen offset:239 +; ALIGNED-NEXT: buffer_load_ubyte v23, v4, s[0:3], 0 offen offset:235 +; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x2 +; ALIGNED-NEXT: buffer_load_ubyte v24, v4, s[0:3], 0 offen offset:232 +; ALIGNED-NEXT: buffer_load_ubyte v22, v4, s[0:3], 0 offen offset:233 +; ALIGNED-NEXT: buffer_load_ubyte v21, v4, s[0:3], 0 offen offset:234 +; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x7 +; ALIGNED-NEXT: buffer_load_ubyte v19, v4, s[0:3], 0 offen offset:240 +; ALIGNED-NEXT: buffer_load_ubyte v17, v4, s[0:3], 0 offen offset:241 +; ALIGNED-NEXT: buffer_load_ubyte v13, v4, s[0:3], 0 offen offset:242 
+; ALIGNED-NEXT: buffer_load_ubyte v14, v4, s[0:3], 0 offen offset:243 +; ALIGNED-NEXT: buffer_load_ubyte v18, v4, s[0:3], 0 offen offset:244 +; ALIGNED-NEXT: buffer_load_ubyte v15, v4, s[0:3], 0 offen offset:245 +; ALIGNED-NEXT: buffer_load_ubyte v16, v4, s[0:3], 0 offen offset:246 +; ALIGNED-NEXT: buffer_load_ubyte v12, v4, s[0:3], 0 offen offset:247 ; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:1000 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v8, v4, s[0:3], 0 offen offset:87 -; ALIGNED-NEXT: s_waitcnt vmcnt(7) +; ALIGNED-NEXT: buffer_store_dword v64, off, s[0:3], s32 offset:944 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:892 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(51) +; ALIGNED-NEXT: buffer_store_dword v104, off, s[0:3], s32 offset:1364 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(50) +; ALIGNED-NEXT: buffer_store_dword v109, off, s[0:3], s32 offset:1380 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(49) +; ALIGNED-NEXT: buffer_store_dword v110, off, s[0:3], s32 offset:1384 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(48) +; ALIGNED-NEXT: buffer_store_dword v122, off, s[0:3], s32 offset:1392 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(47) ; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1032 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(6) +; ALIGNED-NEXT: s_waitcnt vmcnt(46) +; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1040 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(44) ; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1036 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(5) -; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1028 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(4) -; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1020 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: s_waitcnt vmcnt(43) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1004 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: s_waitcnt vmcnt(41) ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:996 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: s_waitcnt vmcnt(35) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:984 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v81, 8, v2 ; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:84 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1020 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1028 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:81 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1008 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:80 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1040 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v62, v12, 8, v16 ; ALIGNED-NEXT: s_waitcnt vmcnt(2) ; ALIGNED-NEXT: buffer_store_dword v2, off, 
s[0:3], s32 offset:1024 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) @@ -14709,313 +14694,180 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:142 ; ALIGNED-NEXT: buffer_load_ubyte v5, v4, s[0:3], 0 offen offset:143 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:141 -; ALIGNED-NEXT: buffer_load_ubyte v3, v4, s[0:3], 0 offen offset:139 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1292 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:140 -; ALIGNED-NEXT: s_waitcnt vmcnt(6) -; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1376 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(5) -; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1388 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(4) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1300 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1304 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1296 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1288 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v5, 8, v2 -; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:138 -; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1320 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v5, v4, s[0:3], 0 offen offset:149 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:137 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1312 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:136 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1324 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1372 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1316 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1308 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v2 -; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v3, v4, s[0:3], 0 offen offset:147 -; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:148 -; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1360 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:145 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1328 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:144 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1348 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1356 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 
offset:1340 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1332 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v7 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v8, 8, v6 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1396 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v5, 8, v2 -; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:158 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:157 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1408 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:156 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1420 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1416 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1404 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v126, v4, s[0:3], 0 offen offset:159 -; ALIGNED-NEXT: buffer_load_ubyte v124, v4, s[0:3], 0 offen offset:155 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v126, 8, v2 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1436 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x2 -; ALIGNED-NEXT: buffer_load_ubyte v123, v4, s[0:3], 0 offen offset:152 -; ALIGNED-NEXT: buffer_load_ubyte v121, v4, s[0:3], 0 offen offset:153 -; ALIGNED-NEXT: buffer_load_ubyte v111, v4, s[0:3], 0 offen offset:154 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v121, 8, v123 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v124, 8, v111 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1444 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x7 -; ALIGNED-NEXT: buffer_load_ubyte v108, v4, s[0:3], 0 offen offset:160 -; ALIGNED-NEXT: buffer_load_ubyte v105, v4, s[0:3], 0 offen offset:161 -; ALIGNED-NEXT: buffer_load_ubyte v93, v4, s[0:3], 0 offen offset:162 -; ALIGNED-NEXT: buffer_load_ubyte v92, v4, s[0:3], 0 offen offset:163 -; ALIGNED-NEXT: buffer_load_ubyte v107, v4, s[0:3], 0 offen offset:164 -; ALIGNED-NEXT: buffer_load_ubyte v95, v4, s[0:3], 0 offen offset:165 -; ALIGNED-NEXT: buffer_load_ubyte v94, v4, s[0:3], 0 offen offset:166 -; ALIGNED-NEXT: buffer_load_ubyte v91, v4, s[0:3], 0 offen offset:167 -; ALIGNED-NEXT: s_waitcnt vmcnt(6) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v105, 8, v108 -; ALIGNED-NEXT: s_waitcnt vmcnt(4) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v92, 8, v93 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v91, 8, v94 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1456 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v95, 8, v107 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1460 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x4 -; ALIGNED-NEXT: buffer_load_ubyte v89, v4, s[0:3], 0 offen offset:172 -; ALIGNED-NEXT: buffer_load_ubyte v79, v4, s[0:3], 0 offen offset:173 -; 
ALIGNED-NEXT: buffer_load_ubyte v78, v4, s[0:3], 0 offen offset:174 -; ALIGNED-NEXT: buffer_load_ubyte v77, v4, s[0:3], 0 offen offset:175 -; ALIGNED-NEXT: buffer_load_ubyte v75, v4, s[0:3], 0 offen offset:171 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v79, 8, v89 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v77, 8, v78 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1464 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x2 -; ALIGNED-NEXT: buffer_load_ubyte v74, v4, s[0:3], 0 offen offset:168 -; ALIGNED-NEXT: buffer_load_ubyte v72, v4, s[0:3], 0 offen offset:169 -; ALIGNED-NEXT: buffer_load_ubyte v63, v4, s[0:3], 0 offen offset:170 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v72, 8, v74 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v75, 8, v63 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1468 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x7 -; ALIGNED-NEXT: buffer_load_ubyte v61, v4, s[0:3], 0 offen offset:176 -; ALIGNED-NEXT: buffer_load_ubyte v59, v4, s[0:3], 0 offen offset:177 -; ALIGNED-NEXT: buffer_load_ubyte v47, v4, s[0:3], 0 offen offset:178 -; ALIGNED-NEXT: buffer_load_ubyte v56, v4, s[0:3], 0 offen offset:179 -; ALIGNED-NEXT: buffer_load_ubyte v60, v4, s[0:3], 0 offen offset:180 -; ALIGNED-NEXT: buffer_load_ubyte v57, v4, s[0:3], 0 offen offset:181 -; ALIGNED-NEXT: buffer_load_ubyte v58, v4, s[0:3], 0 offen offset:182 -; ALIGNED-NEXT: buffer_load_ubyte v46, v4, s[0:3], 0 offen offset:183 -; ALIGNED-NEXT: s_waitcnt vmcnt(6) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v59, 8, v61 -; ALIGNED-NEXT: s_waitcnt vmcnt(4) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v56, 8, v47 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v46, 8, v58 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1472 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v57, 8, v60 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1476 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x4 -; ALIGNED-NEXT: buffer_load_ubyte v44, v4, s[0:3], 0 offen offset:188 -; ALIGNED-NEXT: buffer_load_ubyte v43, v4, s[0:3], 0 offen offset:189 -; ALIGNED-NEXT: buffer_load_ubyte v42, v4, s[0:3], 0 offen offset:190 -; ALIGNED-NEXT: buffer_load_ubyte v41, v4, s[0:3], 0 offen offset:191 -; ALIGNED-NEXT: buffer_load_ubyte v40, v4, s[0:3], 0 offen offset:187 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v43, 8, v44 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v41, 8, v42 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1480 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x2 -; ALIGNED-NEXT: buffer_load_ubyte v119, v4, s[0:3], 0 offen offset:184 -; ALIGNED-NEXT: buffer_load_ubyte v118, v4, s[0:3], 0 offen offset:185 -; ALIGNED-NEXT: buffer_load_ubyte v117, v4, s[0:3], 0 offen offset:186 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v118, 8, v119 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v40, 8, v117 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1484 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x3e -; ALIGNED-NEXT: buffer_load_ubyte v115, v4, s[0:3], 0 
offen offset:192 -; ALIGNED-NEXT: buffer_load_ubyte v112, v4, s[0:3], 0 offen offset:193 -; ALIGNED-NEXT: buffer_load_ubyte v101, v4, s[0:3], 0 offen offset:194 -; ALIGNED-NEXT: buffer_load_ubyte v100, v4, s[0:3], 0 offen offset:195 -; ALIGNED-NEXT: buffer_load_ubyte v113, v4, s[0:3], 0 offen offset:196 -; ALIGNED-NEXT: buffer_load_ubyte v103, v4, s[0:3], 0 offen offset:197 -; ALIGNED-NEXT: buffer_load_ubyte v102, v4, s[0:3], 0 offen offset:198 -; ALIGNED-NEXT: buffer_load_ubyte v99, v4, s[0:3], 0 offen offset:199 -; ALIGNED-NEXT: buffer_load_ubyte v97, v4, s[0:3], 0 offen offset:204 -; ALIGNED-NEXT: buffer_load_ubyte v87, v4, s[0:3], 0 offen offset:205 -; ALIGNED-NEXT: buffer_load_ubyte v96, v4, s[0:3], 0 offen offset:206 -; ALIGNED-NEXT: buffer_load_ubyte v86, v4, s[0:3], 0 offen offset:207 -; ALIGNED-NEXT: buffer_load_ubyte v85, v4, s[0:3], 0 offen offset:203 -; ALIGNED-NEXT: buffer_load_ubyte v84, v4, s[0:3], 0 offen offset:200 -; ALIGNED-NEXT: buffer_load_ubyte v83, v4, s[0:3], 0 offen offset:201 -; ALIGNED-NEXT: buffer_load_ubyte v82, v4, s[0:3], 0 offen offset:202 -; ALIGNED-NEXT: buffer_load_ubyte v80, v4, s[0:3], 0 offen offset:212 -; ALIGNED-NEXT: buffer_load_ubyte v68, v4, s[0:3], 0 offen offset:213 -; ALIGNED-NEXT: buffer_load_ubyte v70, v4, s[0:3], 0 offen offset:214 -; ALIGNED-NEXT: buffer_load_ubyte v65, v4, s[0:3], 0 offen offset:215 -; ALIGNED-NEXT: buffer_load_ubyte v66, v4, s[0:3], 0 offen offset:211 -; ALIGNED-NEXT: buffer_load_ubyte v71, v4, s[0:3], 0 offen offset:216 -; ALIGNED-NEXT: buffer_load_ubyte v67, v4, s[0:3], 0 offen offset:217 -; ALIGNED-NEXT: buffer_load_ubyte v53, v4, s[0:3], 0 offen offset:218 -; ALIGNED-NEXT: buffer_load_ubyte v52, v4, s[0:3], 0 offen offset:219 -; ALIGNED-NEXT: buffer_load_ubyte v69, v4, s[0:3], 0 offen offset:220 -; ALIGNED-NEXT: buffer_load_ubyte v55, v4, s[0:3], 0 offen offset:221 -; ALIGNED-NEXT: buffer_load_ubyte v54, v4, s[0:3], 0 offen offset:222 -; ALIGNED-NEXT: buffer_load_ubyte v51, v4, s[0:3], 0 offen offset:223 -; ALIGNED-NEXT: buffer_load_ubyte v50, v4, s[0:3], 0 offen offset:208 -; ALIGNED-NEXT: buffer_load_ubyte v38, v4, s[0:3], 0 offen offset:209 -; ALIGNED-NEXT: buffer_load_ubyte v39, v4, s[0:3], 0 offen offset:210 -; ALIGNED-NEXT: buffer_load_ubyte v37, v4, s[0:3], 0 offen offset:224 -; ALIGNED-NEXT: buffer_load_ubyte v35, v4, s[0:3], 0 offen offset:225 -; ALIGNED-NEXT: buffer_load_ubyte v31, v4, s[0:3], 0 offen offset:226 -; ALIGNED-NEXT: buffer_load_ubyte v32, v4, s[0:3], 0 offen offset:227 -; ALIGNED-NEXT: buffer_load_ubyte v36, v4, s[0:3], 0 offen offset:228 -; ALIGNED-NEXT: buffer_load_ubyte v33, v4, s[0:3], 0 offen offset:229 -; ALIGNED-NEXT: buffer_load_ubyte v34, v4, s[0:3], 0 offen offset:230 -; ALIGNED-NEXT: buffer_load_ubyte v30, v4, s[0:3], 0 offen offset:231 -; ALIGNED-NEXT: buffer_load_ubyte v29, v4, s[0:3], 0 offen offset:236 -; ALIGNED-NEXT: buffer_load_ubyte v27, v4, s[0:3], 0 offen offset:237 -; ALIGNED-NEXT: buffer_load_ubyte v28, v4, s[0:3], 0 offen offset:238 -; ALIGNED-NEXT: buffer_load_ubyte v26, v4, s[0:3], 0 offen offset:239 -; ALIGNED-NEXT: buffer_load_ubyte v23, v4, s[0:3], 0 offen offset:235 -; ALIGNED-NEXT: buffer_load_ubyte v24, v4, s[0:3], 0 offen offset:232 -; ALIGNED-NEXT: buffer_load_ubyte v22, v4, s[0:3], 0 offen offset:233 -; ALIGNED-NEXT: buffer_load_ubyte v21, v4, s[0:3], 0 offen offset:234 -; ALIGNED-NEXT: buffer_load_ubyte v19, v4, s[0:3], 0 offen offset:240 -; ALIGNED-NEXT: buffer_load_ubyte v17, v4, s[0:3], 0 offen offset:241 -; ALIGNED-NEXT: buffer_load_ubyte v13, v4, s[0:3], 
0 offen offset:242 -; ALIGNED-NEXT: buffer_load_ubyte v14, v4, s[0:3], 0 offen offset:243 -; ALIGNED-NEXT: buffer_load_ubyte v18, v4, s[0:3], 0 offen offset:244 -; ALIGNED-NEXT: buffer_load_ubyte v15, v4, s[0:3], 0 offen offset:245 -; ALIGNED-NEXT: buffer_load_ubyte v16, v4, s[0:3], 0 offen offset:246 -; ALIGNED-NEXT: buffer_load_ubyte v12, v4, s[0:3], 0 offen offset:247 +; ALIGNED-NEXT: s_clause 0x1 +; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:141 +; ALIGNED-NEXT: buffer_load_ubyte v3, v4, s[0:3], 0 offen offset:139 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1292 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:140 +; ALIGNED-NEXT: s_waitcnt vmcnt(6) +; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1376 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(5) +; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1388 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(4) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1300 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1304 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1296 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1288 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v5, 8, v2 +; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:138 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1320 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v5, v4, s[0:3], 0 offen offset:149 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:137 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1312 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:136 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1324 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1372 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1316 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1308 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v2 +; ALIGNED-NEXT: s_clause 0x1 +; ALIGNED-NEXT: buffer_load_ubyte v3, v4, s[0:3], 0 offen offset:147 +; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:148 +; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1360 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:145 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1328 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:144 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1348 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1356 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 
offset:1340 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1332 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v7 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v100, 8, v101 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v8, 8, v6 +; ALIGNED-NEXT: s_clause 0x4 ; ALIGNED-NEXT: buffer_load_ubyte v11, v4, s[0:3], 0 offen offset:252 ; ALIGNED-NEXT: buffer_load_ubyte v9, v4, s[0:3], 0 offen offset:253 ; ALIGNED-NEXT: buffer_load_ubyte v10, v4, s[0:3], 0 offen offset:254 ; ALIGNED-NEXT: buffer_load_ubyte v8, v4, s[0:3], 0 offen offset:255 ; ALIGNED-NEXT: buffer_load_ubyte v7, v4, s[0:3], 0 offen offset:251 -; ALIGNED-NEXT: buffer_load_ubyte v6, v4, s[0:3], 0 offen offset:248 -; ALIGNED-NEXT: buffer_load_ubyte v5, v4, s[0:3], 0 offen offset:249 -; ALIGNED-NEXT: s_clause 0x6 -; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:250 -; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen -; ALIGNED-NEXT: buffer_load_ubyte v120, v4, s[0:3], 0 offen offset:2 -; ALIGNED-NEXT: buffer_load_ubyte v104, v4, s[0:3], 0 offen offset:4 -; ALIGNED-NEXT: buffer_load_ubyte v109, v4, s[0:3], 0 offen offset:5 -; ALIGNED-NEXT: buffer_load_ubyte v110, v4, s[0:3], 0 offen offset:6 -; ALIGNED-NEXT: buffer_load_ubyte v122, v4, s[0:3], 0 offen offset:7 -; ALIGNED-NEXT: s_waitcnt vmcnt(62) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1396 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v5, 8, v2 +; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:158 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:157 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1408 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:156 +; ALIGNED-NEXT: s_waitcnt vmcnt(4) +; ALIGNED-NEXT: v_lshl_or_b32 v76, v8, 8, v10 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1420 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1416 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1404 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v126, 8, v2 ; ALIGNED-NEXT: v_lshl_or_b32 v2, v112, 8, v115 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v100, 8, v101 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v124, 8, v111 ; ALIGNED-NEXT: v_lshl_or_b32 v106, v3, 16, v2 ; ALIGNED-NEXT: v_lshl_or_b32 v2, v103, 8, v113 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v99, 8, v102 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1436 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v121, 8, v123 ; ALIGNED-NEXT: v_lshl_or_b32 v90, v3, 16, v2 -; ALIGNED-NEXT: s_waitcnt vmcnt(60) ; ALIGNED-NEXT: v_lshl_or_b32 v2, v87, 8, v97 -; ALIGNED-NEXT: s_waitcnt vmcnt(58) +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v92, 8, v93 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v86, 8, v96 -; ALIGNED-NEXT: s_waitcnt vmcnt(14) -; ALIGNED-NEXT: v_lshl_or_b32 v62, v12, 8, v16 -; ALIGNED-NEXT: s_waitcnt vmcnt(10) -; ALIGNED-NEXT: v_lshl_or_b32 v76, v8, 8, v10 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: buffer_store_dword v104, off, s[0:3], s32 offset:1364 ; 
4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1444 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v105, 8, v108 ; ALIGNED-NEXT: v_lshl_or_b32 v88, v3, 16, v2 ; ALIGNED-NEXT: v_lshl_or_b32 v2, v83, 8, v84 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v85, 8, v82 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v109, off, s[0:3], s32 offset:1380 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: buffer_store_dword v110, off, s[0:3], s32 offset:1384 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v122, off, s[0:3], s32 offset:1392 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v91, 8, v94 ; ALIGNED-NEXT: v_lshl_or_b32 v73, v3, 16, v2 ; ALIGNED-NEXT: v_lshl_or_b32 v2, v68, 8, v80 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1456 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v95, 8, v107 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v65, 8, v70 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v77, 8, v78 ; ALIGNED-NEXT: v_lshl_or_b32 v45, v3, 16, v2 ; ALIGNED-NEXT: v_lshl_or_b32 v2, v67, 8, v71 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v52, 8, v53 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1460 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v79, 8, v89 ; ALIGNED-NEXT: v_lshl_or_b32 v116, v3, 16, v2 ; ALIGNED-NEXT: v_lshl_or_b32 v2, v55, 8, v69 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v75, 8, v63 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v51, 8, v54 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1464 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v72, 8, v74 ; ALIGNED-NEXT: v_lshl_or_b32 v114, v3, 16, v2 ; ALIGNED-NEXT: v_lshl_or_b32 v2, v38, 8, v50 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v66, 8, v39 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v56, 8, v47 ; ALIGNED-NEXT: v_lshl_or_b32 v98, v3, 16, v2 ; ALIGNED-NEXT: v_lshl_or_b32 v2, v35, 8, v37 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1468 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v59, 8, v61 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v32, 8, v31 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v46, 8, v58 ; ALIGNED-NEXT: v_lshl_or_b32 v81, v3, 16, v2 ; ALIGNED-NEXT: v_lshl_or_b32 v2, v33, 8, v36 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v30, 8, v34 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1472 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v57, 8, v60 ; ALIGNED-NEXT: v_lshl_or_b32 v64, v3, 16, v2 ; ALIGNED-NEXT: v_lshl_or_b32 v2, v27, 8, v29 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v41, 8, v42 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v26, 8, v28 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1476 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v43, 8, v44 ; ALIGNED-NEXT: v_lshl_or_b32 v49, v3, 16, v2 ; ALIGNED-NEXT: v_lshl_or_b32 v2, v22, 8, v24 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v23, 8, v21 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v40, 8, v117 ; ALIGNED-NEXT: v_lshl_or_b32 v48, v3, 16, v2 ; ALIGNED-NEXT: v_lshl_or_b32 v2, v17, 8, v19 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1480 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v118, 8, v119 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v14, 8, v13 +; 
ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: s_clause 0x2 +; ALIGNED-NEXT: buffer_load_ubyte v6, v4, s[0:3], 0 offen offset:248 +; ALIGNED-NEXT: buffer_load_ubyte v5, v4, s[0:3], 0 offen offset:249 +; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:250 ; ALIGNED-NEXT: v_lshl_or_b32 v25, v3, 16, v2 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v15, 8, v18 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1484 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen ; ALIGNED-NEXT: v_lshl_or_b32 v20, v62, 16, v3 ; ALIGNED-NEXT: v_lshl_or_b32 v62, v9, 8, v11 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v76, 16, v62 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) ; ALIGNED-NEXT: v_lshl_or_b32 v62, v5, 8, v6 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) ; ALIGNED-NEXT: v_lshl_or_b32 v76, v7, 8, v1 ; ALIGNED-NEXT: v_lshl_or_b32 v2, v76, 16, v62 ; ALIGNED-NEXT: s_clause 0x1 ; ALIGNED-NEXT: buffer_load_ubyte v62, v4, s[0:3], 0 offen offset:1 ; ALIGNED-NEXT: buffer_load_ubyte v76, v4, s[0:3], 0 offen offset:3 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1336 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v120, off, s[0:3], s32 offset:1368 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) @@ -15027,36 +14879,34 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v76, 16, v62 ; ALIGNED-NEXT: v_lshl_or_b32 v62, v109, 8, v104 ; ALIGNED-NEXT: v_lshl_or_b32 v76, v122, 8, v110 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1400 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v76, 16, v62 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1412 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:12 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1424 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_ubyte v127, v4, s[0:3], 0 offen offset:13 -; ALIGNED-NEXT: buffer_load_ubyte v76, v4, s[0:3], 0 offen offset:14 +; ALIGNED-NEXT: s_clause 0x1 ; ALIGNED-NEXT: buffer_load_ubyte v104, v4, s[0:3], 0 offen offset:15 ; ALIGNED-NEXT: buffer_load_ubyte v120, v4, s[0:3], 0 offen offset:11 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: v_lshl_or_b32 v62, v127, 8, v0 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v76, off, s[0:3], s32 offset:1428 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v76, v104, 8, v76 -; ALIGNED-NEXT: buffer_store_dword v104, off, s[0:3], s32 offset:1432 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v76, 16, v62 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1440 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1400 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x2 ; ALIGNED-NEXT: buffer_load_ubyte v122, v4, s[0:3], 0 offen offset:8 ; ALIGNED-NEXT: buffer_load_ubyte v110, v4, s[0:3], 0 offen offset:9 ; ALIGNED-NEXT: buffer_load_ubyte v109, v4, s[0:3], 0 offen offset:10 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v76, 16, v62 +; ALIGNED-NEXT: s_clause 0x1 +; ALIGNED-NEXT: buffer_load_ubyte v127, v4, s[0:3], 0 offen offset:13 +; ALIGNED-NEXT: buffer_load_ubyte v76, v4, s[0:3], 0 offen offset:14 +; ALIGNED-NEXT: buffer_store_dword v125, off, s[0:3], s32 offset:1452 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1412 ; 
4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:12 +; ALIGNED-NEXT: s_waitcnt vmcnt(7) +; ALIGNED-NEXT: buffer_store_dword v104, off, s[0:3], s32 offset:1432 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v62, v110, 8, v122 +; ALIGNED-NEXT: buffer_store_dword v76, off, s[0:3], s32 offset:1428 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v76, v104, 8, v76 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: v_lshl_or_b32 v62, v127, 8, v0 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1424 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v76, 16, v62 +; ALIGNED-NEXT: v_lshl_or_b32 v62, v110, 8, v122 ; ALIGNED-NEXT: v_lshl_or_b32 v76, v120, 8, v109 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1440 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v76, 16, v62 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1448 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x2 ; ALIGNED-NEXT: buffer_load_ubyte v62, v4, s[0:3], 0 offen offset:18 ; ALIGNED-NEXT: buffer_load_ubyte v104, v4, s[0:3], 0 offen offset:16 @@ -15065,10 +14915,11 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:492 ; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:484 ; ALIGNED-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:480 -; ALIGNED-NEXT: s_clause 0x1 +; ALIGNED-NEXT: s_clause 0x1 ; 8-byte Folded Reload ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:704 ; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:708 ; ALIGNED-NEXT: v_add_nc_u32_e32 v4, 0xffffff00, v4 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1448 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(4) ; ALIGNED-NEXT: v_lshl_or_b32 v0, v125, 8, v62 ; ALIGNED-NEXT: s_waitcnt vmcnt(2) @@ -15137,6 +14988,8 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_store_dword v88, off, s[0:3], s32 offset:476 ; ALIGNED-NEXT: buffer_store_dword v90, off, s[0:3], s32 offset:468 ; ALIGNED-NEXT: buffer_store_dword v106, off, s[0:3], s32 offset:464 +; ALIGNED-NEXT: v_lshl_or_b32 v125, v0, 16, v125 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1484 ; 4-byte Folded Reload ; ALIGNED-NEXT: flat_store_byte v[2:3], v82 offset:202 ; ALIGNED-NEXT: flat_store_byte v[2:3], v85 offset:203 ; ALIGNED-NEXT: flat_store_byte v[2:3], v83 offset:201 @@ -15153,22 +15006,6 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: flat_store_byte v[2:3], v102 offset:198 ; ALIGNED-NEXT: flat_store_byte v[2:3], v113 offset:196 ; ALIGNED-NEXT: flat_store_byte v[2:3], v115 offset:192 -; ALIGNED-NEXT: v_lshl_or_b32 v125, v0, 16, v125 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1484 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_add_u32 s4, s4, 0xffffff00 -; ALIGNED-NEXT: s_addc_u32 s5, s5, -1 -; ALIGNED-NEXT: s_cmp_eq_u64 s[4:5], s[6:7] -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1480 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1476 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: 
buffer_store_dword v0, off, s[0:3], s32 offset:548 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1472 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:544 ; ALIGNED-NEXT: flat_store_byte v[2:3], v117 offset:186 ; ALIGNED-NEXT: flat_store_byte v[2:3], v40 offset:187 ; ALIGNED-NEXT: flat_store_byte v[2:3], v118 offset:185 @@ -15185,18 +15022,6 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: flat_store_byte v[2:3], v58 offset:182 ; ALIGNED-NEXT: flat_store_byte v[2:3], v60 offset:180 ; ALIGNED-NEXT: flat_store_byte v[2:3], v61 offset:176 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1468 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1464 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1460 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1456 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; ALIGNED-NEXT: flat_store_byte v[2:3], v63 offset:170 ; ALIGNED-NEXT: flat_store_byte v[2:3], v75 offset:171 ; ALIGNED-NEXT: flat_store_byte v[2:3], v72 offset:169 @@ -15213,6 +15038,36 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: flat_store_byte v[2:3], v94 offset:166 ; ALIGNED-NEXT: flat_store_byte v[2:3], v107 offset:164 ; ALIGNED-NEXT: flat_store_byte v[2:3], v108 offset:160 +; ALIGNED-NEXT: flat_store_byte v[2:3], v111 offset:154 +; ALIGNED-NEXT: flat_store_byte v[2:3], v124 offset:155 +; ALIGNED-NEXT: flat_store_byte v[2:3], v121 offset:153 +; ALIGNED-NEXT: flat_store_byte v[2:3], v126 offset:159 +; ALIGNED-NEXT: s_add_u32 s4, s4, 0xffffff00 +; ALIGNED-NEXT: s_addc_u32 s5, s5, -1 +; ALIGNED-NEXT: s_cmp_eq_u64 s[4:5], s[6:7] +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1480 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1476 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:548 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1472 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:544 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1468 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1464 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1460 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 +; ALIGNED-NEXT: 
buffer_load_dword v0, off, s[0:3], s32 offset:1456 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1444 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:520 @@ -15225,10 +15080,6 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1396 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512 -; ALIGNED-NEXT: flat_store_byte v[2:3], v111 offset:154 -; ALIGNED-NEXT: flat_store_byte v[2:3], v124 offset:155 -; ALIGNED-NEXT: flat_store_byte v[2:3], v121 offset:153 -; ALIGNED-NEXT: flat_store_byte v[2:3], v126 offset:159 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1416 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:157 @@ -15694,6 +15545,10 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:644 ; ALIGNED-NEXT: buffer_store_dword v125, off, s[0:3], s32 offset:640 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[2:3], v109 offset:10 +; ALIGNED-NEXT: flat_store_byte v[2:3], v120 offset:11 +; ALIGNED-NEXT: flat_store_byte v[2:3], v127 offset:13 +; ALIGNED-NEXT: flat_store_byte v[2:3], v110 offset:9 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:26 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload @@ -15747,10 +15602,6 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1400 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:656 -; ALIGNED-NEXT: flat_store_byte v[2:3], v109 offset:10 -; ALIGNED-NEXT: flat_store_byte v[2:3], v120 offset:11 -; ALIGNED-NEXT: flat_store_byte v[2:3], v127 offset:13 -; ALIGNED-NEXT: flat_store_byte v[2:3], v110 offset:9 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1432 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:15 @@ -15788,7 +15639,7 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: s_cbranch_scc0 .LBB9_4 ; ALIGNED-NEXT: .LBB9_5: ; %Flow11 ; ALIGNED-NEXT: s_or_b32 exec_lo, exec_lo, s8 -; ALIGNED-NEXT: s_clause 0x2f +; ALIGNED-NEXT: s_clause 0x2f ; 192-byte Folded Reload ; ALIGNED-NEXT: buffer_load_dword v127, off, s[0:3], s32 ; ALIGNED-NEXT: buffer_load_dword v126, off, s[0:3], s32 offset:4 ; ALIGNED-NEXT: buffer_load_dword v125, off, s[0:3], s32 offset:8 diff --git a/llvm/test/CodeGen/AMDGPU/mixed-vmem-types.ll b/llvm/test/CodeGen/AMDGPU/mixed-vmem-types.ll index 71900a4d1c1e4..32800488f0633 100644 --- a/llvm/test/CodeGen/AMDGPU/mixed-vmem-types.ll +++ b/llvm/test/CodeGen/AMDGPU/mixed-vmem-types.ll @@ -90,19 +90,19 @@ define amdgpu_cs void @mixed_vmem_types(i32 inreg %globalTable, i32 inreg %perSh ; GFX12-GISEL-NEXT: s_load_b256 s[20:27], s[2:3], 0x40 ; GFX12-GISEL-NEXT: s_load_b512 s[36:51], s[2:3], 0x0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: image_sample_lz v1, v0, 
s[8:15], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 ; GFX12-GISEL-NEXT: buffer_load_b32 v2, off, s[16:19], null ; GFX12-GISEL-NEXT: buffer_load_b32 v3, off, s[20:23], null ; GFX12-GISEL-NEXT: buffer_load_b32 v4, off, s[40:43], null +; GFX12-GISEL-NEXT: image_sample_lz v1, v0, s[8:15], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 ; GFX12-GISEL-NEXT: image_sample_lz v0, v0, s[44:51], s[36:39] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 ; GFX12-GISEL-NEXT: s_wait_loadcnt 0x2 ; GFX12-GISEL-NEXT: v_cmp_eq_u32_e64 s0, 0xac0, v2 -; GFX12-GISEL-NEXT: s_wait_samplecnt 0x1 -; GFX12-GISEL-NEXT: v_cmp_eq_f32_e32 vcc_lo, 1.0, v1 ; GFX12-GISEL-NEXT: s_wait_loadcnt 0x1 ; GFX12-GISEL-NEXT: v_cmp_eq_u32_e64 s1, 0xac0, v3 ; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: v_cmp_eq_u32_e64 s2, 0xac0, v4 +; GFX12-GISEL-NEXT: s_wait_samplecnt 0x1 +; GFX12-GISEL-NEXT: v_cmp_eq_f32_e32 vcc_lo, 1.0, v1 ; GFX12-GISEL-NEXT: s_and_b32 s0, s0, vcc_lo ; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0 ; GFX12-GISEL-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v0 diff --git a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll index 78207c2cf605e..1177474f5b4f5 100644 --- a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll +++ b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll @@ -185,44 +185,47 @@ define amdgpu_kernel void @fadd_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX900-NEXT: v_lshlrev_b32_e32 v0, 7, v0 +; GFX900-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4 +; GFX900-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: global_load_dwordx4 v[25:28], v0, s[0:1] offset:112 +; GFX900-NEXT: global_load_dwordx4 v[29:32], v0, s[0:1] offset:96 ; GFX900-NEXT: global_load_dwordx4 v[1:4], v0, s[0:1] offset:16 ; GFX900-NEXT: global_load_dwordx4 v[5:8], v0, s[0:1] ; GFX900-NEXT: global_load_dwordx4 v[9:12], v0, s[0:1] offset:48 ; GFX900-NEXT: global_load_dwordx4 v[13:16], v0, s[0:1] offset:32 ; GFX900-NEXT: global_load_dwordx4 v[17:20], v0, s[0:1] offset:80 ; GFX900-NEXT: global_load_dwordx4 v[21:24], v0, s[0:1] offset:64 -; GFX900-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4 -; GFX900-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4 -; GFX900-NEXT: global_load_dwordx4 v[25:28], v0, s[0:1] offset:112 -; GFX900-NEXT: global_load_dwordx4 v[29:32], v0, s[0:1] offset:96 -; GFX900-NEXT: s_waitcnt vmcnt(7) lgkmcnt(0) +; GFX900-NEXT: s_waitcnt vmcnt(5) ; GFX900-NEXT: v_add_f32_e32 v4, s43, v4 ; GFX900-NEXT: v_add_f32_e32 v3, s42, v3 ; GFX900-NEXT: v_add_f32_e32 v2, s41, v2 ; GFX900-NEXT: v_add_f32_e32 v1, s40, v1 -; GFX900-NEXT: s_waitcnt vmcnt(6) -; GFX900-NEXT: v_add_f32_e32 v8, s39, v8 -; GFX900-NEXT: v_add_f32_e32 v7, s38, v7 -; GFX900-NEXT: v_add_f32_e32 v6, s37, v6 -; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: v_add_f32_e32 v32, s19, v32 ; GFX900-NEXT: v_add_f32_e32 v31, s18, v31 ; GFX900-NEXT: v_add_f32_e32 v30, s17, v30 ; GFX900-NEXT: v_add_f32_e32 v29, s16, v29 +; GFX900-NEXT: s_waitcnt vmcnt(4) +; GFX900-NEXT: v_add_f32_e32 v8, s39, v8 +; GFX900-NEXT: v_add_f32_e32 v7, s38, v7 +; GFX900-NEXT: v_add_f32_e32 v6, s37, v6 ; GFX900-NEXT: v_add_f32_e32 v5, s36, v5 +; GFX900-NEXT: s_waitcnt vmcnt(3) ; GFX900-NEXT: v_add_f32_e32 v12, s51, v12 ; GFX900-NEXT: v_add_f32_e32 v11, s50, v11 ; GFX900-NEXT: v_add_f32_e32 v10, s49, v10 ; GFX900-NEXT: v_add_f32_e32 v9, s48, v9 +; GFX900-NEXT: s_waitcnt vmcnt(2) ; GFX900-NEXT: v_add_f32_e32 v16, s47, v16 ; GFX900-NEXT: v_add_f32_e32 v15, s46, v15 ; GFX900-NEXT: v_add_f32_e32 v14, s45, v14 ; 
GFX900-NEXT: v_add_f32_e32 v13, s44, v13 +; GFX900-NEXT: s_waitcnt vmcnt(1) ; GFX900-NEXT: v_add_f32_e32 v20, s15, v20 ; GFX900-NEXT: v_add_f32_e32 v19, s14, v19 ; GFX900-NEXT: v_add_f32_e32 v18, s13, v18 ; GFX900-NEXT: v_add_f32_e32 v17, s12, v17 +; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: v_add_f32_e32 v24, s11, v24 ; GFX900-NEXT: v_add_f32_e32 v23, s10, v23 ; GFX900-NEXT: v_add_f32_e32 v22, s9, v22 @@ -246,6 +249,8 @@ define amdgpu_kernel void @fadd_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; PACKED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; PACKED-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; PACKED-SDAG-NEXT: v_lshlrev_b32_e32 v32, 7, v0 +; PACKED-SDAG-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4 +; PACKED-SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4 ; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; PACKED-SDAG-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1] offset:16 ; PACKED-SDAG-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] @@ -255,9 +260,7 @@ define amdgpu_kernel void @fadd_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; PACKED-SDAG-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:64 ; PACKED-SDAG-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:112 ; PACKED-SDAG-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:96 -; PACKED-SDAG-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4 -; PACKED-SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4 -; PACKED-SDAG-NEXT: s_waitcnt vmcnt(7) lgkmcnt(0) +; PACKED-SDAG-NEXT: s_waitcnt vmcnt(7) ; PACKED-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[40:41] ; PACKED-SDAG-NEXT: v_pk_add_f32 v[2:3], v[2:3], s[42:43] ; PACKED-SDAG-NEXT: s_waitcnt vmcnt(6) @@ -293,6 +296,8 @@ define amdgpu_kernel void @fadd_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; PACKED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; PACKED-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; PACKED-GISEL-NEXT: v_lshlrev_b32_e32 v32, 7, v0 +; PACKED-GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4 +; PACKED-GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4 ; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; PACKED-GISEL-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1] ; PACKED-GISEL-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] offset:16 @@ -302,9 +307,7 @@ define amdgpu_kernel void @fadd_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; PACKED-GISEL-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:80 ; PACKED-GISEL-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:96 ; PACKED-GISEL-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:112 -; PACKED-GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4 -; PACKED-GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4 -; PACKED-GISEL-NEXT: s_waitcnt vmcnt(7) lgkmcnt(0) +; PACKED-GISEL-NEXT: s_waitcnt vmcnt(7) ; PACKED-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[36:37] ; PACKED-GISEL-NEXT: v_pk_add_f32 v[2:3], v[2:3], s[38:39] ; PACKED-GISEL-NEXT: s_waitcnt vmcnt(6) @@ -340,11 +343,14 @@ define amdgpu_kernel void @fadd_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; ; GFX1250-SDAG-LABEL: fadd_v32_vs: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_clause 0x2 ; GFX1250-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-SDAG-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4 +; GFX1250-SDAG-NEXT: s_load_b512 s[8:23], s[4:5], 0xe4 ; GFX1250-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-SDAG-NEXT: v_lshlrev_b32_e32 v56, 7, v0 ; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_dual_lshlrev_b32 v56, 7, v0 :: v_dual_mov_b32 
v32, s40 ; GFX1250-SDAG-NEXT: s_clause 0x7 ; GFX1250-SDAG-NEXT: global_load_b128 v[0:3], v56, s[0:1] offset:16 ; GFX1250-SDAG-NEXT: global_load_b128 v[4:7], v56, s[0:1] offset:48 @@ -354,22 +360,18 @@ define amdgpu_kernel void @fadd_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; GFX1250-SDAG-NEXT: global_load_b128 v[20:23], v56, s[0:1] offset:96 ; GFX1250-SDAG-NEXT: global_load_b128 v[24:27], v56, s[0:1] offset:64 ; GFX1250-SDAG-NEXT: global_load_b128 v[28:31], v56, s[0:1] offset:112 -; GFX1250-SDAG-NEXT: s_clause 0x1 -; GFX1250-SDAG-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4 -; GFX1250-SDAG-NEXT: s_load_b512 s[8:23], s[4:5], 0xe4 -; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v32, s40 :: v_dual_mov_b32 v33, s41 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v34, s42 :: v_dual_mov_b32 v35, s43 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v36, s38 :: v_dual_mov_b32 v39, s49 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v40, s50 :: v_dual_mov_b32 v41, s51 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v42, s44 :: v_dual_mov_b32 v37, s39 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v38, s48 :: v_dual_mov_b32 v55, s23 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v51, s11 :: v_dual_mov_b32 v52, s20 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v53, s21 :: v_dual_mov_b32 v54, s22 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v49, s15 :: v_dual_mov_b32 v50, s10 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v45, s47 :: v_dual_mov_b32 v46, s12 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v47, s13 :: v_dual_mov_b32 v48, s14 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v43, s45 :: v_dual_mov_b32 v44, s46 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v33, s41 :: v_dual_mov_b32 v34, s42 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v35, s43 :: v_dual_mov_b32 v36, s38 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v39, s49 :: v_dual_mov_b32 v40, s50 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v41, s51 :: v_dual_mov_b32 v42, s44 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v37, s39 :: v_dual_mov_b32 v38, s48 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v55, s23 :: v_dual_mov_b32 v51, s11 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v52, s20 :: v_dual_mov_b32 v53, s21 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v54, s22 :: v_dual_mov_b32 v49, s15 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v50, s10 :: v_dual_mov_b32 v45, s47 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v46, s12 :: v_dual_mov_b32 v47, s13 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v48, s14 :: v_dual_mov_b32 v43, s45 +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v44, s46 ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x7 ; GFX1250-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[32:33] ; GFX1250-SDAG-NEXT: v_pk_add_f32 v[2:3], v[2:3], v[34:35] @@ -409,6 +411,9 @@ define amdgpu_kernel void @fadd_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1250-GISEL-NEXT: s_clause 0x1 +; GFX1250-GISEL-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4 +; GFX1250-GISEL-NEXT: s_load_b512 s[8:23], s[4:5], 0xe4 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_lshlrev_b32_e32 v56, 7, v0 ; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 @@ -421,10 +426,6 @@ define amdgpu_kernel void @fadd_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; GFX1250-GISEL-NEXT: global_load_b128 v[20:23], v56, s[0:1] offset:80 ; GFX1250-GISEL-NEXT: global_load_b128 v[24:27], v56, s[0:1] offset:96 ; GFX1250-GISEL-NEXT: global_load_b128 v[28:31], v56, s[0:1] offset:112 -; GFX1250-GISEL-NEXT: s_clause 0x1 -; GFX1250-GISEL-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4 -; GFX1250-GISEL-NEXT: s_load_b512 s[8:23], s[4:5], 0xe4 -; 
GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[32:33], s[36:37] ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[34:35], s[38:39] ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[36:37], s[40:41] @@ -1442,44 +1443,47 @@ define amdgpu_kernel void @fmul_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX900-NEXT: v_lshlrev_b32_e32 v0, 7, v0 +; GFX900-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4 +; GFX900-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: global_load_dwordx4 v[25:28], v0, s[0:1] offset:112 +; GFX900-NEXT: global_load_dwordx4 v[29:32], v0, s[0:1] offset:96 ; GFX900-NEXT: global_load_dwordx4 v[1:4], v0, s[0:1] offset:16 ; GFX900-NEXT: global_load_dwordx4 v[5:8], v0, s[0:1] ; GFX900-NEXT: global_load_dwordx4 v[9:12], v0, s[0:1] offset:48 ; GFX900-NEXT: global_load_dwordx4 v[13:16], v0, s[0:1] offset:32 ; GFX900-NEXT: global_load_dwordx4 v[17:20], v0, s[0:1] offset:80 ; GFX900-NEXT: global_load_dwordx4 v[21:24], v0, s[0:1] offset:64 -; GFX900-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4 -; GFX900-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4 -; GFX900-NEXT: global_load_dwordx4 v[25:28], v0, s[0:1] offset:112 -; GFX900-NEXT: global_load_dwordx4 v[29:32], v0, s[0:1] offset:96 -; GFX900-NEXT: s_waitcnt vmcnt(7) lgkmcnt(0) +; GFX900-NEXT: s_waitcnt vmcnt(5) ; GFX900-NEXT: v_mul_f32_e32 v4, s43, v4 ; GFX900-NEXT: v_mul_f32_e32 v3, s42, v3 ; GFX900-NEXT: v_mul_f32_e32 v2, s41, v2 ; GFX900-NEXT: v_mul_f32_e32 v1, s40, v1 -; GFX900-NEXT: s_waitcnt vmcnt(6) -; GFX900-NEXT: v_mul_f32_e32 v8, s39, v8 -; GFX900-NEXT: v_mul_f32_e32 v7, s38, v7 -; GFX900-NEXT: v_mul_f32_e32 v6, s37, v6 -; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: v_mul_f32_e32 v32, s19, v32 ; GFX900-NEXT: v_mul_f32_e32 v31, s18, v31 ; GFX900-NEXT: v_mul_f32_e32 v30, s17, v30 ; GFX900-NEXT: v_mul_f32_e32 v29, s16, v29 +; GFX900-NEXT: s_waitcnt vmcnt(4) +; GFX900-NEXT: v_mul_f32_e32 v8, s39, v8 +; GFX900-NEXT: v_mul_f32_e32 v7, s38, v7 +; GFX900-NEXT: v_mul_f32_e32 v6, s37, v6 ; GFX900-NEXT: v_mul_f32_e32 v5, s36, v5 +; GFX900-NEXT: s_waitcnt vmcnt(3) ; GFX900-NEXT: v_mul_f32_e32 v12, s51, v12 ; GFX900-NEXT: v_mul_f32_e32 v11, s50, v11 ; GFX900-NEXT: v_mul_f32_e32 v10, s49, v10 ; GFX900-NEXT: v_mul_f32_e32 v9, s48, v9 +; GFX900-NEXT: s_waitcnt vmcnt(2) ; GFX900-NEXT: v_mul_f32_e32 v16, s47, v16 ; GFX900-NEXT: v_mul_f32_e32 v15, s46, v15 ; GFX900-NEXT: v_mul_f32_e32 v14, s45, v14 ; GFX900-NEXT: v_mul_f32_e32 v13, s44, v13 +; GFX900-NEXT: s_waitcnt vmcnt(1) ; GFX900-NEXT: v_mul_f32_e32 v20, s15, v20 ; GFX900-NEXT: v_mul_f32_e32 v19, s14, v19 ; GFX900-NEXT: v_mul_f32_e32 v18, s13, v18 ; GFX900-NEXT: v_mul_f32_e32 v17, s12, v17 +; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: v_mul_f32_e32 v24, s11, v24 ; GFX900-NEXT: v_mul_f32_e32 v23, s10, v23 ; GFX900-NEXT: v_mul_f32_e32 v22, s9, v22 @@ -1503,6 +1507,8 @@ define amdgpu_kernel void @fmul_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; PACKED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; PACKED-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; PACKED-SDAG-NEXT: v_lshlrev_b32_e32 v32, 7, v0 +; PACKED-SDAG-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4 +; PACKED-SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4 ; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; PACKED-SDAG-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1] offset:16 ; PACKED-SDAG-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] @@ -1512,9 +1518,7 @@ define amdgpu_kernel void @fmul_v32_vs(ptr addrspace(1) %a, <32 x 
float> %x) { ; PACKED-SDAG-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:64 ; PACKED-SDAG-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:112 ; PACKED-SDAG-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:96 -; PACKED-SDAG-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4 -; PACKED-SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4 -; PACKED-SDAG-NEXT: s_waitcnt vmcnt(7) lgkmcnt(0) +; PACKED-SDAG-NEXT: s_waitcnt vmcnt(7) ; PACKED-SDAG-NEXT: v_pk_mul_f32 v[0:1], v[0:1], s[40:41] ; PACKED-SDAG-NEXT: v_pk_mul_f32 v[2:3], v[2:3], s[42:43] ; PACKED-SDAG-NEXT: s_waitcnt vmcnt(6) @@ -1550,6 +1554,8 @@ define amdgpu_kernel void @fmul_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; PACKED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; PACKED-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; PACKED-GISEL-NEXT: v_lshlrev_b32_e32 v32, 7, v0 +; PACKED-GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4 +; PACKED-GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4 ; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; PACKED-GISEL-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1] ; PACKED-GISEL-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] offset:16 @@ -1559,9 +1565,7 @@ define amdgpu_kernel void @fmul_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; PACKED-GISEL-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:80 ; PACKED-GISEL-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:96 ; PACKED-GISEL-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:112 -; PACKED-GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4 -; PACKED-GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4 -; PACKED-GISEL-NEXT: s_waitcnt vmcnt(7) lgkmcnt(0) +; PACKED-GISEL-NEXT: s_waitcnt vmcnt(7) ; PACKED-GISEL-NEXT: v_pk_mul_f32 v[0:1], v[0:1], s[36:37] ; PACKED-GISEL-NEXT: v_pk_mul_f32 v[2:3], v[2:3], s[38:39] ; PACKED-GISEL-NEXT: s_waitcnt vmcnt(6) @@ -1597,11 +1601,14 @@ define amdgpu_kernel void @fmul_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; ; GFX1250-SDAG-LABEL: fmul_v32_vs: ; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_clause 0x2 ; GFX1250-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-SDAG-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4 +; GFX1250-SDAG-NEXT: s_load_b512 s[8:23], s[4:5], 0xe4 ; GFX1250-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-SDAG-NEXT: v_lshlrev_b32_e32 v56, 7, v0 ; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_dual_lshlrev_b32 v56, 7, v0 :: v_dual_mov_b32 v32, s40 ; GFX1250-SDAG-NEXT: s_clause 0x7 ; GFX1250-SDAG-NEXT: global_load_b128 v[0:3], v56, s[0:1] offset:16 ; GFX1250-SDAG-NEXT: global_load_b128 v[4:7], v56, s[0:1] offset:48 @@ -1611,22 +1618,18 @@ define amdgpu_kernel void @fmul_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; GFX1250-SDAG-NEXT: global_load_b128 v[20:23], v56, s[0:1] offset:96 ; GFX1250-SDAG-NEXT: global_load_b128 v[24:27], v56, s[0:1] offset:64 ; GFX1250-SDAG-NEXT: global_load_b128 v[28:31], v56, s[0:1] offset:112 -; GFX1250-SDAG-NEXT: s_clause 0x1 -; GFX1250-SDAG-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4 -; GFX1250-SDAG-NEXT: s_load_b512 s[8:23], s[4:5], 0xe4 -; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v32, s40 :: v_dual_mov_b32 v33, s41 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v34, s42 :: v_dual_mov_b32 v35, s43 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v36, s38 :: v_dual_mov_b32 v39, s49 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v40, s50 :: v_dual_mov_b32 v41, s51 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v42, s44 :: 
v_dual_mov_b32 v37, s39 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v38, s48 :: v_dual_mov_b32 v55, s23 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v51, s11 :: v_dual_mov_b32 v52, s20 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v53, s21 :: v_dual_mov_b32 v54, s22 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v49, s15 :: v_dual_mov_b32 v50, s10 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v45, s47 :: v_dual_mov_b32 v46, s12 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v47, s13 :: v_dual_mov_b32 v48, s14 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v43, s45 :: v_dual_mov_b32 v44, s46 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v33, s41 :: v_dual_mov_b32 v34, s42 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v35, s43 :: v_dual_mov_b32 v36, s38 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v39, s49 :: v_dual_mov_b32 v40, s50 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v41, s51 :: v_dual_mov_b32 v42, s44 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v37, s39 :: v_dual_mov_b32 v38, s48 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v55, s23 :: v_dual_mov_b32 v51, s11 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v52, s20 :: v_dual_mov_b32 v53, s21 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v54, s22 :: v_dual_mov_b32 v49, s15 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v50, s10 :: v_dual_mov_b32 v45, s47 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v46, s12 :: v_dual_mov_b32 v47, s13 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v48, s14 :: v_dual_mov_b32 v43, s45 +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v44, s46 ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x7 ; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[0:1], v[0:1], v[32:33] ; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[2:3], v[2:3], v[34:35] @@ -1666,6 +1669,9 @@ define amdgpu_kernel void @fmul_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1250-GISEL-NEXT: s_clause 0x1 +; GFX1250-GISEL-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4 +; GFX1250-GISEL-NEXT: s_load_b512 s[8:23], s[4:5], 0xe4 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_lshlrev_b32_e32 v56, 7, v0 ; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 @@ -1678,10 +1684,6 @@ define amdgpu_kernel void @fmul_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; GFX1250-GISEL-NEXT: global_load_b128 v[20:23], v56, s[0:1] offset:80 ; GFX1250-GISEL-NEXT: global_load_b128 v[24:27], v56, s[0:1] offset:96 ; GFX1250-GISEL-NEXT: global_load_b128 v[28:31], v56, s[0:1] offset:112 -; GFX1250-GISEL-NEXT: s_clause 0x1 -; GFX1250-GISEL-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4 -; GFX1250-GISEL-NEXT: s_load_b512 s[8:23], s[4:5], 0xe4 -; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[32:33], s[36:37] ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[34:35], s[38:39] ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[36:37], s[40:41] @@ -2273,44 +2275,47 @@ define amdgpu_kernel void @fma_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX900-NEXT: v_lshlrev_b32_e32 v0, 7, v0 +; GFX900-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4 +; GFX900-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: global_load_dwordx4 v[25:28], v0, s[0:1] offset:112 +; GFX900-NEXT: global_load_dwordx4 v[29:32], v0, s[0:1] offset:96 ; GFX900-NEXT: global_load_dwordx4 v[1:4], v0, s[0:1] offset:16 ; GFX900-NEXT: global_load_dwordx4 v[5:8], v0, s[0:1] ; GFX900-NEXT: global_load_dwordx4 v[9:12], v0, s[0:1] offset:48 ; GFX900-NEXT: global_load_dwordx4 v[13:16], v0, s[0:1] offset:32 ; GFX900-NEXT: global_load_dwordx4 v[17:20], v0, s[0:1] offset:80 ; 
GFX900-NEXT: global_load_dwordx4 v[21:24], v0, s[0:1] offset:64 -; GFX900-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4 -; GFX900-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4 -; GFX900-NEXT: global_load_dwordx4 v[25:28], v0, s[0:1] offset:112 -; GFX900-NEXT: global_load_dwordx4 v[29:32], v0, s[0:1] offset:96 -; GFX900-NEXT: s_waitcnt vmcnt(7) lgkmcnt(0) +; GFX900-NEXT: s_waitcnt vmcnt(5) ; GFX900-NEXT: v_fma_f32 v4, v4, s43, s43 ; GFX900-NEXT: v_fma_f32 v3, v3, s42, s42 ; GFX900-NEXT: v_fma_f32 v2, v2, s41, s41 ; GFX900-NEXT: v_fma_f32 v1, v1, s40, s40 -; GFX900-NEXT: s_waitcnt vmcnt(6) -; GFX900-NEXT: v_fma_f32 v8, v8, s39, s39 -; GFX900-NEXT: v_fma_f32 v7, v7, s38, s38 -; GFX900-NEXT: v_fma_f32 v6, v6, s37, s37 -; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: v_fma_f32 v32, v32, s19, s19 ; GFX900-NEXT: v_fma_f32 v31, v31, s18, s18 ; GFX900-NEXT: v_fma_f32 v30, v30, s17, s17 ; GFX900-NEXT: v_fma_f32 v29, v29, s16, s16 +; GFX900-NEXT: s_waitcnt vmcnt(4) +; GFX900-NEXT: v_fma_f32 v8, v8, s39, s39 +; GFX900-NEXT: v_fma_f32 v7, v7, s38, s38 +; GFX900-NEXT: v_fma_f32 v6, v6, s37, s37 ; GFX900-NEXT: v_fma_f32 v5, v5, s36, s36 +; GFX900-NEXT: s_waitcnt vmcnt(3) ; GFX900-NEXT: v_fma_f32 v12, v12, s51, s51 ; GFX900-NEXT: v_fma_f32 v11, v11, s50, s50 ; GFX900-NEXT: v_fma_f32 v10, v10, s49, s49 ; GFX900-NEXT: v_fma_f32 v9, v9, s48, s48 +; GFX900-NEXT: s_waitcnt vmcnt(2) ; GFX900-NEXT: v_fma_f32 v16, v16, s47, s47 ; GFX900-NEXT: v_fma_f32 v15, v15, s46, s46 ; GFX900-NEXT: v_fma_f32 v14, v14, s45, s45 ; GFX900-NEXT: v_fma_f32 v13, v13, s44, s44 +; GFX900-NEXT: s_waitcnt vmcnt(1) ; GFX900-NEXT: v_fma_f32 v20, v20, s15, s15 ; GFX900-NEXT: v_fma_f32 v19, v19, s14, s14 ; GFX900-NEXT: v_fma_f32 v18, v18, s13, s13 ; GFX900-NEXT: v_fma_f32 v17, v17, s12, s12 +; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: v_fma_f32 v24, v24, s11, s11 ; GFX900-NEXT: v_fma_f32 v23, v23, s10, s10 ; GFX900-NEXT: v_fma_f32 v22, v22, s9, s9 @@ -2334,6 +2339,8 @@ define amdgpu_kernel void @fma_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; PACKED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; PACKED-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; PACKED-SDAG-NEXT: v_lshlrev_b32_e32 v32, 7, v0 +; PACKED-SDAG-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4 +; PACKED-SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4 ; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; PACKED-SDAG-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1] offset:16 ; PACKED-SDAG-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] @@ -2343,9 +2350,7 @@ define amdgpu_kernel void @fma_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; PACKED-SDAG-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:64 ; PACKED-SDAG-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:112 ; PACKED-SDAG-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:96 -; PACKED-SDAG-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4 -; PACKED-SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4 -; PACKED-SDAG-NEXT: s_waitcnt vmcnt(7) lgkmcnt(0) +; PACKED-SDAG-NEXT: s_waitcnt vmcnt(7) ; PACKED-SDAG-NEXT: v_pk_fma_f32 v[0:1], v[0:1], s[40:41], s[40:41] ; PACKED-SDAG-NEXT: v_pk_fma_f32 v[2:3], v[2:3], s[42:43], s[42:43] ; PACKED-SDAG-NEXT: s_waitcnt vmcnt(6) @@ -2381,6 +2386,8 @@ define amdgpu_kernel void @fma_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; PACKED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; PACKED-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; PACKED-GISEL-NEXT: v_lshlrev_b32_e32 v32, 7, v0 +; PACKED-GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4 +; PACKED-GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 
0xe4 ; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; PACKED-GISEL-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1] ; PACKED-GISEL-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] offset:16 @@ -2390,9 +2397,7 @@ define amdgpu_kernel void @fma_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; PACKED-GISEL-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:80 ; PACKED-GISEL-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:96 ; PACKED-GISEL-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:112 -; PACKED-GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4 -; PACKED-GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4 -; PACKED-GISEL-NEXT: s_waitcnt vmcnt(7) lgkmcnt(0) +; PACKED-GISEL-NEXT: s_waitcnt vmcnt(7) ; PACKED-GISEL-NEXT: v_pk_fma_f32 v[0:1], v[0:1], s[36:37], s[36:37] ; PACKED-GISEL-NEXT: v_pk_fma_f32 v[2:3], v[2:3], s[38:39], s[38:39] ; PACKED-GISEL-NEXT: s_waitcnt vmcnt(6) @@ -2430,6 +2435,9 @@ define amdgpu_kernel void @fma_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1250-SDAG-NEXT: s_clause 0x1 +; GFX1250-SDAG-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4 +; GFX1250-SDAG-NEXT: s_load_b512 s[8:23], s[4:5], 0xe4 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_lshlrev_b32_e32 v56, 7, v0 ; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 @@ -2442,10 +2450,6 @@ define amdgpu_kernel void @fma_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; GFX1250-SDAG-NEXT: global_load_b128 v[20:23], v56, s[0:1] offset:96 ; GFX1250-SDAG-NEXT: global_load_b128 v[24:27], v56, s[0:1] offset:64 ; GFX1250-SDAG-NEXT: global_load_b128 v[28:31], v56, s[0:1] offset:112 -; GFX1250-SDAG-NEXT: s_clause 0x1 -; GFX1250-SDAG-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4 -; GFX1250-SDAG-NEXT: s_load_b512 s[8:23], s[4:5], 0xe4 -; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[32:33], s[40:41] ; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[34:35], s[42:43] ; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[40:41], s[50:51] @@ -2496,6 +2500,9 @@ define amdgpu_kernel void @fma_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1250-GISEL-NEXT: s_clause 0x1 +; GFX1250-GISEL-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4 +; GFX1250-GISEL-NEXT: s_load_b512 s[8:23], s[4:5], 0xe4 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_lshlrev_b32_e32 v56, 7, v0 ; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 @@ -2508,10 +2515,6 @@ define amdgpu_kernel void @fma_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; GFX1250-GISEL-NEXT: global_load_b128 v[20:23], v56, s[0:1] offset:80 ; GFX1250-GISEL-NEXT: global_load_b128 v[24:27], v56, s[0:1] offset:96 ; GFX1250-GISEL-NEXT: global_load_b128 v[28:31], v56, s[0:1] offset:112 -; GFX1250-GISEL-NEXT: s_clause 0x1 -; GFX1250-GISEL-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4 -; GFX1250-GISEL-NEXT: s_load_b512 s[8:23], s[4:5], 0xe4 -; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[32:33], s[36:37] ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[34:35], s[38:39] ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[36:37], s[40:41] diff --git a/llvm/test/CodeGen/AMDGPU/postra-bundle-memops.mir b/llvm/test/CodeGen/AMDGPU/postra-bundle-memops.mir index d0d5cc11994af..025d9e63436d7 100644 --- a/llvm/test/CodeGen/AMDGPU/postra-bundle-memops.mir +++ b/llvm/test/CodeGen/AMDGPU/postra-bundle-memops.mir @@ -56,11 +56,11 @@ 
body: | ; GCN-NEXT: BUFFER_STORE_DWORD_ADDR64 $vgpr0, $vgpr2_vgpr3, undef $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec ; GCN-NEXT: BUFFER_STORE_DWORD_ADDR64 $vgpr0, $vgpr2_vgpr3, undef $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec ; GCN-NEXT: } - ; GCN-NEXT: BUNDLE implicit-def $vgpr2, implicit-def $vgpr3, implicit undef $vgpr4_vgpr5_vgpr6_vgpr7, implicit undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $exec { + ; GCN-NEXT: BUNDLE implicit-def $vgpr2, implicit-def $vgpr3, implicit undef $vgpr4_vgpr5_vgpr6_vgpr7, implicit undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $exec :: (load (s32)) { ; GCN-NEXT: $vgpr2 = IMAGE_LOAD_V1_V4 undef $vgpr4_vgpr5_vgpr6_vgpr7, undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 2, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s32)) ; GCN-NEXT: $vgpr3 = IMAGE_LOAD_V1_V4 undef $vgpr4_vgpr5_vgpr6_vgpr7, undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 2, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s32)) ; GCN-NEXT: } - ; GCN-NEXT: BUNDLE implicit undef $vgpr0_vgpr1_vgpr2_vgpr3, implicit $vgpr0_vgpr1, implicit undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $exec { + ; GCN-NEXT: BUNDLE implicit undef $vgpr0_vgpr1_vgpr2_vgpr3, implicit $vgpr0_vgpr1, implicit undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $exec :: (store (s128)) { ; GCN-NEXT: IMAGE_STORE_V4_V2 undef $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr0_vgpr1, undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 15, -1, 1, 0, 0, 0, 0, 0, implicit $exec :: (store (s128)) ; GCN-NEXT: IMAGE_STORE_V4_V2 undef $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr0_vgpr1, undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 15, -1, 1, 0, 0, 0, 0, 0, implicit $exec :: (store (s128)) ; GCN-NEXT: } @@ -359,6 +359,7 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABLE: name: no_sched_barrier_within_bundle + ; GCN-LABEL: name: no_sched_barrier_within_bundle ; GCN: renamable $sgpr0_sgpr1 = IMPLICIT_DEF ; GCN-NEXT: renamable $vgpr0 = IMPLICIT_DEF ; GCN-NEXT: BUNDLE implicit-def $vgpr1, implicit-def $vgpr1_lo16, implicit-def $vgpr1_hi16, implicit-def $vgpr2, implicit-def $vgpr2_lo16, implicit-def $vgpr2_hi16, implicit $sgpr0_sgpr1, implicit $vgpr0, implicit $exec { diff --git a/llvm/test/CodeGen/AMDGPU/postra-bundle-vimage-vsample-gfx12.mir b/llvm/test/CodeGen/AMDGPU/postra-bundle-vimage-vsample-gfx12.mir index 5fea0aee72ec7..e0266b9f1a5b0 100644 --- a/llvm/test/CodeGen/AMDGPU/postra-bundle-vimage-vsample-gfx12.mir +++ b/llvm/test/CodeGen/AMDGPU/postra-bundle-vimage-vsample-gfx12.mir @@ -9,7 +9,7 @@ body: | ; GFX12-LABEL: name: post_bundle_vimage ; GFX12: liveins: $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: BUNDLE implicit-def $vgpr5, implicit-def $vgpr4, implicit killed $vgpr1, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $exec { + ; GFX12-NEXT: BUNDLE implicit-def $vgpr5, implicit-def $vgpr4, implicit killed $vgpr1, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $exec :: (dereferenceable invariant load (s32), addrspace 8) { ; GFX12-NEXT: $vgpr5 = IMAGE_LOAD_V1_V1_gfx12 $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 2, 1, 0, 0, -1, 0, 0, implicit $exec :: (dereferenceable invariant load (s32), addrspace 8) ; GFX12-NEXT: $vgpr4 = IMAGE_LOAD_V1_V1_gfx12 killed $vgpr1, killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 1, 1, 0, 0, -1, 0, 0, implicit $exec :: (dereferenceable invariant load (s32), addrspace 8) ; GFX12-NEXT: } @@ 
-25,7 +25,7 @@ body: | ; GFX12-LABEL: name: post_bundle_vsample ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: BUNDLE implicit-def $vgpr6_vgpr7_vgpr8_vgpr9, implicit-def $vgpr10_vgpr11_vgpr12_vgpr13, implicit killed $vgpr0, implicit killed $vgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec, implicit killed $vgpr2, implicit killed $vgpr3 { + ; GFX12-NEXT: BUNDLE implicit-def $vgpr6_vgpr7_vgpr8_vgpr9, implicit-def $vgpr10_vgpr11_vgpr12_vgpr13, implicit killed $vgpr0, implicit killed $vgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec, implicit killed $vgpr2, implicit killed $vgpr3 :: (dereferenceable load (s128), addrspace 8) { ; GFX12-NEXT: $vgpr6_vgpr7_vgpr8_vgpr9 = IMAGE_SAMPLE_V4_V2_gfx12 killed $vgpr0, killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 15, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), addrspace 8) ; GFX12-NEXT: $vgpr10_vgpr11_vgpr12_vgpr13 = IMAGE_SAMPLE_V4_V2_gfx12 killed $vgpr2, killed $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 15, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), addrspace 8) ; GFX12-NEXT: } diff --git a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll index 85a9aba1a0e51..b91bdd2b2fa71 100644 --- a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll @@ -398,11 +398,11 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) { ; GFX8-NEXT: flat_load_dwordx2 v[18:19], v[4:5] ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0xffffc800, v2 ; GFX8-NEXT: v_addc_u32_e32 v7, vcc, -1, v3, vcc -; GFX8-NEXT: flat_load_dwordx2 v[6:7], v[6:7] ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xffffd000, v2 ; GFX8-NEXT: v_addc_u32_e32 v5, vcc, -1, v3, vcc ; GFX8-NEXT: v_add_u32_e32 v20, vcc, 0xffffd800, v2 ; GFX8-NEXT: v_addc_u32_e32 v21, vcc, -1, v3, vcc +; GFX8-NEXT: flat_load_dwordx2 v[6:7], v[6:7] ; GFX8-NEXT: v_add_u32_e32 v22, vcc, 0xffffe000, v2 ; GFX8-NEXT: v_addc_u32_e32 v23, vcc, -1, v3, vcc ; GFX8-NEXT: flat_load_dwordx2 v[8:9], v[4:5] @@ -514,10 +514,8 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) { ; GFX900-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX900-NEXT: v_add_co_u32_e32 v8, vcc, 0xffffb000, v2 ; GFX900-NEXT: v_addc_co_u32_e32 v9, vcc, -1, v3, vcc -; GFX900-NEXT: global_load_dwordx2 v[10:11], v[2:3], off offset:-4096 -; GFX900-NEXT: global_load_dwordx2 v[12:13], v[2:3], off offset:-2048 -; GFX900-NEXT: v_add_co_u32_e32 v14, vcc, 0xffffc000, v2 ; GFX900-NEXT: global_load_dwordx2 v[8:9], v[8:9], off +; GFX900-NEXT: v_add_co_u32_e32 v14, vcc, 0xffffc000, v2 ; GFX900-NEXT: v_addc_co_u32_e32 v15, vcc, -1, v3, vcc ; GFX900-NEXT: global_load_dwordx2 v[18:19], v[14:15], off offset:-2048 ; GFX900-NEXT: global_load_dwordx2 v[20:21], v[14:15], off @@ -526,13 +524,15 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) { ; GFX900-NEXT: v_add_co_u32_e32 v14, vcc, s3, v2 ; GFX900-NEXT: global_load_dwordx2 v[16:17], v[16:17], off offset:-2048 ; GFX900-NEXT: v_addc_co_u32_e32 v15, vcc, -1, v3, vcc +; GFX900-NEXT: global_load_dwordx2 v[10:11], v[2:3], off offset:-4096 +; GFX900-NEXT: global_load_dwordx2 v[12:13], 
v[2:3], off offset:-2048 ; GFX900-NEXT: s_addk_i32 s5, 0x2000 ; GFX900-NEXT: s_cmp_gt_u32 s5, 0x3fffff -; GFX900-NEXT: s_waitcnt vmcnt(3) +; GFX900-NEXT: s_waitcnt vmcnt(5) ; GFX900-NEXT: v_add_co_u32_e32 v22, vcc, v8, v4 ; GFX900-NEXT: v_addc_co_u32_e32 v5, vcc, v9, v5, vcc ; GFX900-NEXT: global_load_dwordx2 v[8:9], v[14:15], off offset:-4096 -; GFX900-NEXT: s_waitcnt vmcnt(3) +; GFX900-NEXT: s_waitcnt vmcnt(5) ; GFX900-NEXT: v_add_co_u32_e64 v24, s[0:1], v18, v22 ; GFX900-NEXT: v_addc_co_u32_e64 v25, s[0:1], v19, v5, s[0:1] ; GFX900-NEXT: global_load_dwordx2 v[18:19], v[14:15], off offset:-2048 @@ -540,13 +540,13 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) { ; GFX900-NEXT: v_add_co_u32_e32 v4, vcc, s4, v2 ; GFX900-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v3, vcc ; GFX900-NEXT: global_load_dwordx2 v[4:5], v[4:5], off offset:-2048 -; GFX900-NEXT: s_waitcnt vmcnt(5) +; GFX900-NEXT: s_waitcnt vmcnt(7) ; GFX900-NEXT: v_add_co_u32_e32 v20, vcc, v20, v24 ; GFX900-NEXT: global_load_dwordx2 v[14:15], v[2:3], off ; GFX900-NEXT: v_addc_co_u32_e32 v21, vcc, v21, v25, vcc ; GFX900-NEXT: v_add_co_u32_e32 v2, vcc, 0x10000, v2 ; GFX900-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX900-NEXT: s_waitcnt vmcnt(5) +; GFX900-NEXT: s_waitcnt vmcnt(7) ; GFX900-NEXT: v_add_co_u32_e32 v16, vcc, v16, v20 ; GFX900-NEXT: v_addc_co_u32_e32 v17, vcc, v17, v21, vcc ; GFX900-NEXT: s_waitcnt vmcnt(4) @@ -734,10 +734,8 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) { ; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX90A-NEXT: v_add_co_u32_e32 v12, vcc, 0xffffb000, v6 ; GFX90A-NEXT: v_addc_co_u32_e32 v13, vcc, -1, v7, vcc -; GFX90A-NEXT: global_load_dwordx2 v[8:9], v[6:7], off offset:-4096 -; GFX90A-NEXT: global_load_dwordx2 v[10:11], v[6:7], off offset:-2048 -; GFX90A-NEXT: v_add_co_u32_e32 v14, vcc, 0xffffc000, v6 ; GFX90A-NEXT: global_load_dwordx2 v[12:13], v[12:13], off +; GFX90A-NEXT: v_add_co_u32_e32 v14, vcc, 0xffffc000, v6 ; GFX90A-NEXT: v_addc_co_u32_e32 v15, vcc, -1, v7, vcc ; GFX90A-NEXT: global_load_dwordx2 v[18:19], v[14:15], off offset:-2048 ; GFX90A-NEXT: global_load_dwordx2 v[20:21], v[14:15], off @@ -753,39 +751,42 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) { ; GFX90A-NEXT: v_addc_co_u32_e32 v23, vcc, -1, v7, vcc ; GFX90A-NEXT: global_load_dwordx2 v[14:15], v[22:23], off offset:-2048 ; GFX90A-NEXT: global_load_dwordx2 v[30:31], v[6:7], off +; GFX90A-NEXT: global_load_dwordx2 v[8:9], v[6:7], off offset:-4096 +; GFX90A-NEXT: global_load_dwordx2 v[10:11], v[6:7], off offset:-2048 ; GFX90A-NEXT: v_add_co_u32_e32 v6, vcc, 0x10000, v6 ; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc ; GFX90A-NEXT: s_addk_i32 s3, 0x2000 ; GFX90A-NEXT: s_cmp_gt_u32 s3, 0x3fffff -; GFX90A-NEXT: s_waitcnt vmcnt(8) +; GFX90A-NEXT: s_waitcnt vmcnt(10) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v12, v4 ; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v13, v5, vcc -; GFX90A-NEXT: s_waitcnt vmcnt(7) +; GFX90A-NEXT: s_waitcnt vmcnt(9) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v18, v4 ; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v19, v5, vcc -; GFX90A-NEXT: s_waitcnt vmcnt(6) +; GFX90A-NEXT: s_waitcnt vmcnt(8) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v20, v4 ; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v21, v5, vcc -; GFX90A-NEXT: s_waitcnt vmcnt(5) +; GFX90A-NEXT: s_waitcnt vmcnt(7) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v16, v4 ; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v17, v5, vcc -; GFX90A-NEXT: s_waitcnt vmcnt(4) +; GFX90A-NEXT: s_waitcnt vmcnt(6) ; 
GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v24, v4 ; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v25, v5, vcc -; GFX90A-NEXT: s_waitcnt vmcnt(3) +; GFX90A-NEXT: s_waitcnt vmcnt(5) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v26, v4 ; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v27, v5, vcc -; GFX90A-NEXT: s_waitcnt vmcnt(2) +; GFX90A-NEXT: s_waitcnt vmcnt(4) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v28, v4 ; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v29, v5, vcc -; GFX90A-NEXT: s_waitcnt vmcnt(1) +; GFX90A-NEXT: s_waitcnt vmcnt(3) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v14, v4 ; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v15, v5, vcc +; GFX90A-NEXT: s_waitcnt vmcnt(1) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v8, v4 ; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v9, v5, vcc +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v10, v4 ; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v11, v5, vcc -; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v30, v4 ; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v31, v5, vcc ; GFX90A-NEXT: s_cbranch_scc0 .LBB1_2 diff --git a/llvm/test/CodeGen/AMDGPU/scratch-simple.ll b/llvm/test/CodeGen/AMDGPU/scratch-simple.ll index 7a3bff8aed56e..fb9c47731eb42 100644 --- a/llvm/test/CodeGen/AMDGPU/scratch-simple.ll +++ b/llvm/test/CodeGen/AMDGPU/scratch-simple.ll @@ -28,43 +28,38 @@ define amdgpu_ps float @ps_main(i32 %idx) { ; SI-NEXT: s_mov_b32 s7, 0xe8f000 ; SI-NEXT: s_add_u32 s4, s4, s0 ; SI-NEXT: s_addc_u32 s5, s5, 0 -; SI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4 -; SI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e -; SI-NEXT: v_mov_b32_e32 v4, 0x3f523be1 -; SI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2 -; SI-NEXT: v_mov_b32_e32 v6, 0x3f638e37 -; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320 -; SI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316 -; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312 -; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308 -; SI-NEXT: buffer_store_dword v6, off, s[4:7], 0 offset:304 -; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:300 -; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:296 -; SI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:292 -; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:288 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3 -; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f -; SI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc ; SI-NEXT: v_mov_b32_e32 v9, 0xbe31934f -; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280 -; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:276 -; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:272 ; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v9, 0xb7043519 ; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v9, 0xb702e758 +; SI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4 +; SI-NEXT: v_mov_b32_e32 v6, 0x3f638e37 +; SI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc ; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:256 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v9, 0x3e31934f ; SI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8 ; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c +; SI-NEXT: v_mov_b32_e32 v16, 0xbf638e39 +; SI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 +; SI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; SI-NEXT: v_mov_b32_e32 v8, 0xbe319356 +; SI-NEXT: buffer_store_dword v2, off, 
s[4:7], 0 offset:320 +; SI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e +; SI-NEXT: v_mov_b32_e32 v4, 0x3f523be1 +; SI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2 +; SI-NEXT: buffer_store_dword v6, off, s[4:7], 0 offset:304 +; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:300 +; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:296 +; SI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:292 +; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:288 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3 +; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:276 +; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:272 ; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:252 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v9, 0x3e319356 @@ -74,49 +69,40 @@ define amdgpu_ps float @ps_main(i32 %idx) { ; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:236 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f -; SI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 ; SI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e ; SI-NEXT: v_mov_b32_e32 v14, 0xbf523be3 ; SI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 -; SI-NEXT: v_mov_b32_e32 v16, 0xbf638e39 -; SI-NEXT: v_and_b32_e32 v0, 0x1fc, v0 -; SI-NEXT: s_mov_b32 s0, 0 -; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268 -; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248 -; SI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240 -; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232 -; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228 -; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224 -; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:220 -; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216 ; SI-NEXT: buffer_store_dword v16, off, s[4:7], 0 offset:212 ; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:208 ; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:204 ; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:200 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x200, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0 -; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:196 -; SI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 -; SI-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen ; SI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:832 ; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:828 ; SI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:824 ; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:820 ; SI-NEXT: s_waitcnt expcnt(3) ; SI-NEXT: v_mov_b32_e32 v17, 0x3703c499 -; SI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c -; SI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:816 -; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:812 -; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:808 -; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:804 -; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:800 ; SI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:796 ; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:792 ; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:788 ; SI-NEXT: s_waitcnt expcnt(2) ; SI-NEXT: v_mov_b32_e32 v18, 0xbf523be1 +; SI-NEXT: v_and_b32_e32 v0, 0x1fc, v0 +; SI-NEXT: s_mov_b32 s0, 0 +; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308 +; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f +; SI-NEXT: v_mov_b32_e32 v8, 0xbe319356 +; SI-NEXT: v_mov_b32_e32 
v12, 0xbf20e7f5 +; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:220 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 +; SI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:816 +; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:812 +; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:808 +; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:804 +; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:800 ; SI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:784 ; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:780 ; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:776 @@ -125,6 +111,19 @@ define amdgpu_ps float @ps_main(i32 %idx) { ; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:764 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x200, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0 +; SI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316 +; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312 +; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280 +; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268 +; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248 +; SI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240 +; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232 +; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228 +; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224 +; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216 +; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:196 ; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:760 ; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:756 ; SI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:752 @@ -139,6 +138,7 @@ define amdgpu_ps float @ps_main(i32 %idx) { ; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:716 ; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:712 ; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:708 +; SI-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen ; SI-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_f32_e32 v0, v0, v1 @@ -153,84 +153,70 @@ define amdgpu_ps float @ps_main(i32 %idx) { ; VI-NEXT: s_mov_b32 s7, 0xe80000 ; VI-NEXT: s_add_u32 s4, s4, s0 ; VI-NEXT: s_addc_u32 s5, s5, 0 +; VI-NEXT: v_mov_b32_e32 v9, 0xbe31934f +; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264 +; VI-NEXT: v_mov_b32_e32 v9, 0xb7043519 +; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260 +; VI-NEXT: v_mov_b32_e32 v9, 0xb702e758 ; VI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4 +; VI-NEXT: v_mov_b32_e32 v6, 0x3f638e37 +; VI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc +; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:256 +; VI-NEXT: v_mov_b32_e32 v9, 0x3e31934f +; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8 +; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c +; VI-NEXT: v_mov_b32_e32 v16, 0xbf638e39 +; VI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 +; VI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c +; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320 ; VI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e ; VI-NEXT: v_mov_b32_e32 v4, 0x3f523be1 ; VI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2 -; VI-NEXT: v_mov_b32_e32 v6, 0x3f638e37 -; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320 -; VI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316 -; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312 -; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308 ; 
VI-NEXT: buffer_store_dword v6, off, s[4:7], 0 offset:304 ; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:300 ; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:296 ; VI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:292 ; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:288 ; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3 -; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284 -; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f -; VI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc -; VI-NEXT: v_mov_b32_e32 v9, 0xbe31934f -; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280 ; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:276 ; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:272 -; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264 -; VI-NEXT: v_mov_b32_e32 v9, 0xb7043519 -; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260 -; VI-NEXT: v_mov_b32_e32 v9, 0xb702e758 -; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:256 -; VI-NEXT: v_mov_b32_e32 v9, 0x3e31934f -; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8 -; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c -; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; VI-NEXT: v_mov_b32_e32 v8, 0xbe319356 ; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:252 ; VI-NEXT: v_mov_b32_e32 v9, 0x3e319356 ; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:244 ; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29dc ; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:236 ; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f -; VI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 ; VI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e ; VI-NEXT: v_mov_b32_e32 v14, 0xbf523be3 ; VI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 -; VI-NEXT: v_mov_b32_e32 v16, 0xbf638e39 -; VI-NEXT: v_and_b32_e32 v0, 0x1fc, v0 -; VI-NEXT: s_mov_b32 s0, 0 -; VI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268 -; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248 -; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240 -; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232 -; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228 -; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224 -; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:220 -; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216 ; VI-NEXT: buffer_store_dword v16, off, s[4:7], 0 offset:212 ; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:208 ; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:204 ; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:200 -; VI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 -; VI-NEXT: v_add_u32_e32 v1, vcc, 0x200, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 -; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:196 -; VI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 -; VI-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen ; VI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:832 ; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:828 ; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:824 ; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:820 ; VI-NEXT: v_mov_b32_e32 v17, 0x3703c499 -; VI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c +; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:796 +; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:792 +; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:788 +; VI-NEXT: v_mov_b32_e32 v18, 0xbf523be1 +; VI-NEXT: v_and_b32_e32 v0, 0x1fc, v0 +; VI-NEXT: s_mov_b32 s0, 0 +; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308 +; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284 +; VI-NEXT: v_mov_b32_e32 v2, 
0xbefcd89f +; VI-NEXT: v_mov_b32_e32 v8, 0xbe319356 +; VI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 +; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:220 +; VI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 ; VI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:816 ; VI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:812 ; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:808 ; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:804 ; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:800 -; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:796 -; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:792 -; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:788 -; VI-NEXT: v_mov_b32_e32 v18, 0xbf523be1 ; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:784 ; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:780 ; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:776 @@ -238,6 +224,19 @@ define amdgpu_ps float @ps_main(i32 %idx) { ; VI-NEXT: buffer_store_dword v16, off, s[4:7], 0 offset:768 ; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:764 ; VI-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x200, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316 +; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312 +; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280 +; VI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268 +; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248 +; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240 +; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232 +; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228 +; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224 +; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216 +; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:196 ; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:760 ; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:756 ; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:752 @@ -252,6 +251,7 @@ define amdgpu_ps float @ps_main(i32 %idx) { ; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:716 ; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:712 ; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:708 +; VI-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen ; VI-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_f32_e32 v0, v0, v1 @@ -266,83 +266,69 @@ define amdgpu_ps float @ps_main(i32 %idx) { ; GFX9-MUBUF-NEXT: s_mov_b32 s3, 0xe00000 ; GFX9-MUBUF-NEXT: s_add_u32 s0, s0, s4 ; GFX9-MUBUF-NEXT: s_addc_u32 s1, s1, 0 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbe31934f +; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:264 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb7043519 +; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:260 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb702e758 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v6, 0x3f638e37 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbeae29dc +; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:256 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e31934f +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89c +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3f3d349c +; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-MUBUF-NEXT: 
buffer_store_dword v2, off, s[0:3], 0 offset:320 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f3d349e ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f523be1 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v6, 0x3f638e37 -; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:320 -; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:316 -; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:312 -; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:308 ; GFX9-MUBUF-NEXT: buffer_store_dword v6, off, s[0:3], 0 offset:304 ; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:300 ; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:296 ; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:292 ; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:288 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3 -; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:284 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd89f -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbeae29dc -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbe31934f -; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:280 ; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:276 ; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:272 -; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:264 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb7043519 -; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:260 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb702e758 -; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:256 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e31934f -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89c -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbe319356 ; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:252 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e319356 ; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:244 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29dc ; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:236 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89f -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v13, 0xbf3d349e ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39 -; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:268 -; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:248 -; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:240 -; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:232 -; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:228 -; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:224 -; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220 -; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216 ; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212 ; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208 ; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204 ; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:200 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 -; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 -; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:196 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 -; GFX9-MUBUF-NEXT: 
v_add_u32_e32 v1, 0x200, v0 -; GFX9-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen -; GFX9-MUBUF-NEXT: s_nop 0 ; GFX9-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:832 ; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:828 ; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:824 ; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:820 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3703c499 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3f3d349c +; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:796 +; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:792 +; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:788 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0xbf523be1 +; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 +; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:308 +; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:284 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd89f +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbe319356 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 +; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 ; GFX9-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:816 ; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:812 ; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:808 ; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:804 ; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:800 -; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:796 -; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:792 -; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:788 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0xbf523be1 ; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:784 ; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:780 ; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776 @@ -350,6 +336,18 @@ define amdgpu_ps float @ps_main(i32 %idx) { ; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768 ; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:764 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2 +; GFX9-MUBUF-NEXT: v_add_u32_e32 v1, 0x200, v0 +; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:316 +; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:312 +; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:280 +; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:268 +; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:248 +; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:240 +; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:232 +; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:228 +; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:224 +; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216 +; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:196 ; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:760 ; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:756 ; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:752 @@ -364,6 +362,8 @@ define amdgpu_ps float @ps_main(i32 %idx) { ; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:716 ; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:712 ; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:708 +; 
GFX9-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX9-MUBUF-NEXT: s_nop 0 ; GFX9-MUBUF-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen ; GFX9-MUBUF-NEXT: s_waitcnt vmcnt(0) ; GFX9-MUBUF-NEXT: v_add_f32_e32 v0, v0, v1 @@ -392,7 +392,6 @@ define amdgpu_ps float @ps_main(i32 %idx) { ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:308 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:304 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:300 -; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:292 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:288 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:284 @@ -414,7 +413,6 @@ define amdgpu_ps float @ps_main(i32 %idx) { ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3eae29dc ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3efcd89c ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v13, 0x3efcd89f -; GFX10_W32-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:248 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:244 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:240 @@ -422,24 +420,23 @@ define amdgpu_ps float @ps_main(i32 %idx) { ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:232 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v10, 0xbf20e7f5 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 +; GFX10_W32-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf3d349e ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39 -; GFX10_W32-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:228 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:224 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212 -; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4 +; GFX10_W32-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 +; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5 +; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:200 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:196 -; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499 -; GFX10_W32-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0 -; GFX10_W32-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:832 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:828 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:824 @@ -447,9 +444,8 @@ define amdgpu_ps float @ps_main(i32 %idx) { ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:816 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:812 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c +; GFX10_W32-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1 -; 
GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808 -; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:800 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:796 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:792 @@ -457,6 +453,9 @@ define amdgpu_ps float @ps_main(i32 %idx) { ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v19, off, s[0:3], 0 offset:784 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:780 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2 +; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296 +; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808 +; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:772 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768 @@ -475,6 +474,8 @@ define amdgpu_ps float @ps_main(i32 %idx) { ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:716 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:712 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:708 +; GFX10_W32-MUBUF-NEXT: s_clause 0x1 +; GFX10_W32-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen ; GFX10_W32-MUBUF-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen ; GFX10_W32-MUBUF-NEXT: s_waitcnt vmcnt(0) ; GFX10_W32-MUBUF-NEXT: v_add_f32_e32 v0, v0, v1 @@ -503,7 +504,6 @@ define amdgpu_ps float @ps_main(i32 %idx) { ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:308 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:304 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:300 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:292 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:288 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:284 @@ -525,7 +525,6 @@ define amdgpu_ps float @ps_main(i32 %idx) { ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3eae29dc ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3efcd89c ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v13, 0x3efcd89f -; GFX10_W64-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:248 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:244 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:240 @@ -533,24 +532,23 @@ define amdgpu_ps float @ps_main(i32 %idx) { ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:232 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v10, 0xbf20e7f5 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 +; GFX10_W64-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf3d349e ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39 -; GFX10_W64-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:228 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:224 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword 
v16, off, s[0:3], 0 offset:212 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4 +; GFX10_W64-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:200 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:196 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499 -; GFX10_W64-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0 -; GFX10_W64-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:832 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:828 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:824 @@ -558,9 +556,8 @@ define amdgpu_ps float @ps_main(i32 %idx) { ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:816 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:812 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c +; GFX10_W64-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:800 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:796 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:792 @@ -568,6 +565,9 @@ define amdgpu_ps float @ps_main(i32 %idx) { ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v19, off, s[0:3], 0 offset:784 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:780 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:772 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768 @@ -586,6 +586,8 @@ define amdgpu_ps float @ps_main(i32 %idx) { ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:716 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:712 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:708 +; GFX10_W64-MUBUF-NEXT: s_clause 0x1 +; GFX10_W64-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen ; GFX10_W64-MUBUF-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen ; GFX10_W64-MUBUF-NEXT: s_waitcnt vmcnt(0) ; GFX10_W64-MUBUF-NEXT: v_add_f32_e32 v0, v0, v1 @@ -974,42 +976,43 @@ define amdgpu_ps float @ps_main(i32 %idx) { ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v23, v21 :: v_dual_mov_b32 v8, 0x3f3d349e ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3f523be1 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v24, 0xbf523be3 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v26, v17 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v12, 0xbe319356 :: v_dual_mov_b32 v31, v19 -; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v2, v8 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v6, 
0x3f5f2ee2 :: v_dual_mov_b32 v3, v7 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f638e37 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v15, 0x3e319356 ; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v11, 0xbe31934f :: v_dual_mov_b32 v4, v6 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v24 :: v_dual_mov_b32 v4, v6 ; GFX11-FLATSCR-NEXT: s_clause 0x1 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[5:8], off offset:304 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:288 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, 0xbefcd89f :: v_dual_mov_b32 v27, v24 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, v0 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, 0xbefcd89f :: v_dual_mov_b32 v1, v0 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v13, 0x3eae29dc :: v_dual_mov_b32 v34, v5 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v9, 0xb702e758 :: v_dual_mov_b32 v36, v6 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v25, 0x3f20e7f5 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, 0xb702e758 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v11, 0xbe31934f :: v_dual_mov_b32 v36, v6 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, 0xb7043519 :: v_dual_mov_b32 v29, v15 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf20e7f5 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3eae29d8 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v16, 0x3e31934f -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v25, 0x3f20e7f5 :: v_dual_mov_b32 v26, v17 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, 0x3efcd89c ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v30, v13 ; GFX11-FLATSCR-NEXT: s_clause 0x1 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[0:3], off offset:272 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[9:12], off offset:256 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4 +; GFX11-FLATSCR-NEXT: s_clause 0x1 +; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[21:24], off offset:208 +; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:192 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v20, 0x3efcd89c +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v33, v22 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, v18 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, v2 :: v_dual_mov_b32 v11, v0 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v33, v22 -; GFX11-FLATSCR-NEXT: s_clause 0x3 +; GFX11-FLATSCR-NEXT: s_clause 0x1 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[13:16], off offset:240 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[17:20], off offset:224 -; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[21:24], off offset:208 -; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:192 ; GFX11-FLATSCR-NEXT: scratch_load_b32 v14, v37, off ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v32, 0x3f3d349c :: v_dual_mov_b32 v5, v15 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, v0 @@ -1024,8 +1027,7 @@ define amdgpu_ps float @ps_main(i32 %idx) { ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v24, v19 :: v_dual_mov_b32 v35, v21 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:784 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v31, 0xbf5f2ee2 :: v_dual_mov_b32 v32, v6 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v6, v13 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v19, v2 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v6, v13 :: v_dual_mov_b32 v19, v2 ; GFX11-FLATSCR-NEXT: s_clause 0x4 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[33:36], off offset:768 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:752 @@ -1051,43 +1053,38 @@ define 
amdgpu_vs float @vs_main(i32 %idx) {
 ; SI-NEXT: s_mov_b32 s7, 0xe8f000
 ; SI-NEXT: s_add_u32 s4, s4, s0
 ; SI-NEXT: s_addc_u32 s5, s5, 0
-; SI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4
-; SI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
-; SI-NEXT: v_mov_b32_e32 v4, 0x3f523be1
-; SI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
-; SI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
-; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320
-; SI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316
-; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312
-; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308
-; SI-NEXT: buffer_store_dword v6, off, s[4:7], 0 offset:304
-; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:300
-; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:296
-; SI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:292
-; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:288
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
-; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
-; SI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
 ; SI-NEXT: v_mov_b32_e32 v9, 0xbe31934f
-; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280
-; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:276
-; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:272
 ; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mov_b32_e32 v9, 0xb7043519
 ; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mov_b32_e32 v9, 0xb702e758
+; SI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4
+; SI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
+; SI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
 ; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:256
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mov_b32_e32 v9, 0x3e31934f
 ; SI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
 ; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
+; SI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; SI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
+; SI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; SI-NEXT: v_mov_b32_e32 v8, 0xbe319356
+; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320
+; SI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
+; SI-NEXT: v_mov_b32_e32 v4, 0x3f523be1
+; SI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
+; SI-NEXT: buffer_store_dword v6, off, s[4:7], 0 offset:304
+; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:300
+; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:296
+; SI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:292
+; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:288
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
+; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:276
+; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:272
 ; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:252
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mov_b32_e32 v9, 0x3e319356
@@ -1097,49 +1094,40 @@ define amdgpu_vs float @vs_main(i32 %idx) {
 ; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:236
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f
-; SI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
 ; SI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e
 ; SI-NEXT: v_mov_b32_e32 v14, 0xbf523be3
 ; SI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
-; SI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
-; SI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
-; SI-NEXT: s_mov_b32 s0, 0
-; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268
-; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248
-; SI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240
-; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232
-; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228
-; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224
-; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:220
-; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216
 ; SI-NEXT: buffer_store_dword v16, off, s[4:7], 0 offset:212
 ; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:208
 ; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:204
 ; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:200
-; SI-NEXT: s_waitcnt expcnt(1)
-; SI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
-; SI-NEXT: v_add_i32_e32 v1, vcc, 0x200, v0
-; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0
-; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:196
-; SI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
-; SI-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen
 ; SI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:832
 ; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:828
 ; SI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:824
 ; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:820
 ; SI-NEXT: s_waitcnt expcnt(3)
 ; SI-NEXT: v_mov_b32_e32 v17, 0x3703c499
-; SI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
-; SI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:816
-; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:812
-; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:808
-; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:804
-; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:800
 ; SI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:796
 ; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:792
 ; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:788
 ; SI-NEXT: s_waitcnt expcnt(2)
 ; SI-NEXT: v_mov_b32_e32 v18, 0xbf523be1
+; SI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; SI-NEXT: s_mov_b32 s0, 0
+; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308
+; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; SI-NEXT: v_mov_b32_e32 v8, 0xbe319356
+; SI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
+; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:220
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
+; SI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:816
+; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:812
+; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:808
+; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:804
+; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:800
 ; SI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:784
 ; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:780
 ; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:776
@@ -1148,6 +1136,19 @@ define amdgpu_vs float @vs_main(i32 %idx) {
 ; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:764
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2
+; SI-NEXT: v_add_i32_e32 v1, vcc, 0x200, v0
+; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0
+; SI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316
+; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312
+; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280
+; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268
+; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248
+; SI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240
+; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232
+; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228
+; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224
+; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216
+; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:196
 ; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:760
 ; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:756
 ; SI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:752
@@ -1162,6 +1163,7 @@ define amdgpu_vs float @vs_main(i32 %idx) {
 ; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:716
 ; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:712
 ; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:708
+; SI-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen
 ; SI-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: v_add_f32_e32 v0, v0, v1
@@ -1176,84 +1178,70 @@ define amdgpu_vs float @vs_main(i32 %idx) {
 ; VI-NEXT: s_mov_b32 s7, 0xe80000
 ; VI-NEXT: s_add_u32 s4, s4, s0
 ; VI-NEXT: s_addc_u32 s5, s5, 0
+; VI-NEXT: v_mov_b32_e32 v9, 0xbe31934f
+; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264
+; VI-NEXT: v_mov_b32_e32 v9, 0xb7043519
+; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260
+; VI-NEXT: v_mov_b32_e32 v9, 0xb702e758
 ; VI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4
+; VI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
+; VI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
+; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:256
+; VI-NEXT: v_mov_b32_e32 v9, 0x3e31934f
+; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
+; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
+; VI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; VI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
+; VI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
+; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320
 ; VI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
 ; VI-NEXT: v_mov_b32_e32 v4, 0x3f523be1
 ; VI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
-; VI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
-; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320
-; VI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316
-; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312
-; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308
 ; VI-NEXT: buffer_store_dword v6, off, s[4:7], 0 offset:304
 ; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:300
 ; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:296
 ; VI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:292
 ; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:288
 ; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
-; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284
-; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
-; VI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
-; VI-NEXT: v_mov_b32_e32 v9, 0xbe31934f
-; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280
 ; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:276
 ; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:272
-; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264
-; VI-NEXT: v_mov_b32_e32 v9, 0xb7043519
-; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260
-; VI-NEXT: v_mov_b32_e32 v9, 0xb702e758
-; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:256
-; VI-NEXT: v_mov_b32_e32 v9, 0x3e31934f
-; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
-; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
-; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; VI-NEXT: v_mov_b32_e32 v8, 0xbe319356
 ; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:252
 ; VI-NEXT: v_mov_b32_e32 v9, 0x3e319356
 ; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:244
 ; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29dc
 ; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:236
 ; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f
-; VI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
 ; VI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e
 ; VI-NEXT: v_mov_b32_e32 v14, 0xbf523be3
 ; VI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
-; VI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
-; VI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
-; VI-NEXT: s_mov_b32 s0, 0
-; VI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268
-; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248
-; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240
-; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232
-; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228
-; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224
-; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:220
-; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216
 ; VI-NEXT: buffer_store_dword v16, off, s[4:7], 0 offset:212
-; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:208
-; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:204
-; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:200
-; VI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
-; VI-NEXT: v_add_u32_e32 v1, vcc, 0x200, v0
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
-; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:196
-; VI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
-; VI-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen
+; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:208
+; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:204
+; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:200
 ; VI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:832
 ; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:828
 ; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:824
 ; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:820
 ; VI-NEXT: v_mov_b32_e32 v17, 0x3703c499
-; VI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
+; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:796
+; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:792
+; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:788
+; VI-NEXT: v_mov_b32_e32 v18, 0xbf523be1
+; VI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; VI-NEXT: s_mov_b32 s0, 0
+; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308
+; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284
+; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; VI-NEXT: v_mov_b32_e32 v8, 0xbe319356
+; VI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
+; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:220
+; VI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
 ; VI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:816
 ; VI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:812
 ; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:808
 ; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:804
 ; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:800
-; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:796
-; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:792
-; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:788
-; VI-NEXT: v_mov_b32_e32 v18, 0xbf523be1
 ; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:784
 ; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:780
 ; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:776
@@ -1261,6 +1249,19 @@ define amdgpu_vs float @vs_main(i32 %idx) {
 ; VI-NEXT: buffer_store_dword v16, off, s[4:7], 0 offset:768
 ; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:764
 ; VI-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2
+; VI-NEXT: v_add_u32_e32 v1, vcc, 0x200, v0
+; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316
+; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312
+; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280
+; VI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268
+; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248
+; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240
+; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232
+; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228
+; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224
+; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216
+; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:196
 ; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:760
 ; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:756
 ; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:752
@@ -1275,6 +1276,7 @@ define amdgpu_vs float @vs_main(i32 %idx) {
 ; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:716
 ; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:712
 ; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:708
+; VI-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen
 ; VI-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen
 ; VI-NEXT: s_waitcnt vmcnt(0)
 ; VI-NEXT: v_add_f32_e32 v0, v0, v1
@@ -1289,83 +1291,69 @@ define amdgpu_vs float @vs_main(i32 %idx) {
 ; GFX9-MUBUF-NEXT: s_mov_b32 s3, 0xe00000
 ; GFX9-MUBUF-NEXT: s_add_u32 s0, s0, s4
 ; GFX9-MUBUF-NEXT: s_addc_u32 s1, s1, 0
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbe31934f
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:264
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb7043519
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:260
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb702e758
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v6, 0x3f638e37
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:256
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e31934f
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
+; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:320
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f523be1
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v6, 0x3f638e37
-; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:320
-; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:316
-; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:312
-; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:308
 ; GFX9-MUBUF-NEXT: buffer_store_dword v6, off, s[0:3], 0 offset:304
 ; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:300
 ; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:296
 ; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:292
 ; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:288
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
-; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:284
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbe31934f
-; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:280
 ; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:276
 ; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:272
-; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:264
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb7043519
-; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:260
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb702e758
-; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:256
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e31934f
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbe319356
 ; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:252
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e319356
 ; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:244
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29dc
 ; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:236
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89f
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v13, 0xbf3d349e
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
-; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:268
-; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:248
-; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:240
-; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:232
-; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:228
-; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:224
-; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220
-; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216
 ; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212
 ; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208
 ; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204
 ; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:200
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
-; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
-; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:196
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
-; GFX9-MUBUF-NEXT: v_add_u32_e32 v1, 0x200, v0
-; GFX9-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
-; GFX9-MUBUF-NEXT: s_nop 0
 ; GFX9-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:832
 ; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:828
 ; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:824
 ; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:820
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3703c499
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
+; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:796
+; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:792
+; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:788
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0xbf523be1
+; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:308
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:284
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbe319356
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
+; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
 ; GFX9-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:816
 ; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:812
 ; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:808
 ; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:804
 ; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:800
-; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:796
-; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:792
-; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:788
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0xbf523be1
 ; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:784
 ; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:780
 ; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776
@@ -1373,6 +1361,18 @@ define amdgpu_vs float @vs_main(i32 %idx) {
 ; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768
 ; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:764
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2
+; GFX9-MUBUF-NEXT: v_add_u32_e32 v1, 0x200, v0
+; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:316
+; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:312
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:280
+; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:268
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:248
+; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:240
+; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:232
+; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:228
+; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:224
+; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216
+; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:196
 ; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:760
 ; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:756
 ; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:752
@@ -1387,6 +1387,8 @@ define amdgpu_vs float @vs_main(i32 %idx) {
 ; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:716
 ; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:712
 ; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:708
+; GFX9-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX9-MUBUF-NEXT: s_nop 0
 ; GFX9-MUBUF-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen
 ; GFX9-MUBUF-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-MUBUF-NEXT: v_add_f32_e32 v0, v0, v1
@@ -1415,7 +1417,6 @@ define amdgpu_vs float @vs_main(i32 %idx) {
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:308
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:304
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:300
-; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:292
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:288
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:284
@@ -1437,7 +1438,6 @@ define amdgpu_vs float @vs_main(i32 %idx) {
 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3eae29dc
 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3efcd89c
 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v13, 0x3efcd89f
-; GFX10_W32-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:248
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:244
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:240
@@ -1445,24 +1445,23 @@ define amdgpu_vs float @vs_main(i32 %idx) {
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:232
 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v10, 0xbf20e7f5
 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
+; GFX10_W32-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf3d349e
 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3
 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
-; GFX10_W32-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:228
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:224
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212
-; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204
 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4
+; GFX10_W32-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:200
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:196
-; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499
-; GFX10_W32-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0
-; GFX10_W32-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:832
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:828
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:824
@@ -1470,9 +1469,8 @@ define amdgpu_vs float @vs_main(i32 %idx) {
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:816
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:812
 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c
+; GFX10_W32-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0
 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1
-; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808
-; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:800
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:796
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:792
@@ -1480,6 +1478,9 @@ define amdgpu_vs float @vs_main(i32 %idx) {
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v19, off, s[0:3], 0 offset:784
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:780
 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:772
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768
@@ -1498,6 +1499,8 @@ define amdgpu_vs float @vs_main(i32 %idx) {
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:716
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:712
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:708
+; GFX10_W32-MUBUF-NEXT: s_clause 0x1
+; GFX10_W32-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
 ; GFX10_W32-MUBUF-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen
 ; GFX10_W32-MUBUF-NEXT: s_waitcnt vmcnt(0)
 ; GFX10_W32-MUBUF-NEXT: v_add_f32_e32 v0, v0, v1
@@ -1526,7 +1529,6 @@ define amdgpu_vs float @vs_main(i32 %idx) {
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:308
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:304
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:300
-; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:292
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:288
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:284
@@ -1548,7 +1550,6 @@ define amdgpu_vs float @vs_main(i32 %idx) {
 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3eae29dc
 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3efcd89c
 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v13, 0x3efcd89f
-; GFX10_W64-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:248
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:244
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:240
@@ -1556,24 +1557,23 @@ define amdgpu_vs float @vs_main(i32 %idx) {
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:232
 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v10, 0xbf20e7f5
 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
+; GFX10_W64-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf3d349e
 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3
 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
-; GFX10_W64-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:228
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:224
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212
-; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204
 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4
+; GFX10_W64-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:200
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:196
-; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499
-; GFX10_W64-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0
-; GFX10_W64-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:832
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:828
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:824
@@ -1581,9 +1581,8 @@ define amdgpu_vs float @vs_main(i32 %idx) {
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:816
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:812
 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c
+; GFX10_W64-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0
 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1
-; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808
-; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:800
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:796
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:792
@@ -1591,6 +1590,9 @@ define amdgpu_vs float @vs_main(i32 %idx) {
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v19, off, s[0:3], 0 offset:784
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:780
 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:772
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768
@@ -1609,6 +1611,8 @@ define amdgpu_vs float @vs_main(i32 %idx) {
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:716
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:712
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:708
+; GFX10_W64-MUBUF-NEXT: s_clause 0x1
+; GFX10_W64-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
 ; GFX10_W64-MUBUF-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen
 ; GFX10_W64-MUBUF-NEXT: s_waitcnt vmcnt(0)
 ; GFX10_W64-MUBUF-NEXT: v_add_f32_e32 v0, v0, v1
@@ -1997,42 +2001,43 @@ define amdgpu_vs float @vs_main(i32 %idx) {
 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v23, v21 :: v_dual_mov_b32 v8, 0x3f3d349e
 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3f523be1
 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v24, 0xbf523be3
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v26, v17
 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v12, 0xbe319356 :: v_dual_mov_b32 v31, v19
-; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v2, v8
 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v6, 0x3f5f2ee2 :: v_dual_mov_b32 v3, v7
 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f638e37
 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v15, 0x3e319356
 ; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v11, 0xbe31934f :: v_dual_mov_b32 v4, v6
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v24 :: v_dual_mov_b32 v4, v6
 ; GFX11-FLATSCR-NEXT: s_clause 0x1
 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[5:8], off offset:304
 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:288
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, 0xbefcd89f :: v_dual_mov_b32 v27, v24
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, v0
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, 0xbefcd89f :: v_dual_mov_b32 v1, v0
 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v13, 0x3eae29dc :: v_dual_mov_b32 v34, v5
 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v9, 0xb702e758 :: v_dual_mov_b32 v36, v6
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v25, 0x3f20e7f5
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, 0xb702e758
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v11, 0xbe31934f :: v_dual_mov_b32 v36, v6
 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, 0xb7043519 :: v_dual_mov_b32 v29, v15
 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf20e7f5
 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3eae29d8
 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v16, 0x3e31934f
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v25, 0x3f20e7f5 :: v_dual_mov_b32 v26, v17
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, 0x3efcd89c
 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v30, v13
 ; GFX11-FLATSCR-NEXT: s_clause 0x1
 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[0:3], off offset:272
 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[9:12], off offset:256
 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4
+; GFX11-FLATSCR-NEXT: s_clause 0x1
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[21:24], off offset:208
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:192
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v20, 0x3efcd89c
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v33, v22
 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, v18
 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, v2 :: v_dual_mov_b32 v11, v0
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v33, v22
-; GFX11-FLATSCR-NEXT: s_clause 0x3
+; GFX11-FLATSCR-NEXT: s_clause 0x1
 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[13:16], off offset:240
 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[17:20], off offset:224
-; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[21:24], off offset:208
-; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:192
 ; GFX11-FLATSCR-NEXT: scratch_load_b32 v14, v37, off
 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v32, 0x3f3d349c :: v_dual_mov_b32 v5, v15
 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, v0
@@ -2047,8 +2052,7 @@ define amdgpu_vs float @vs_main(i32 %idx) {
 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v24, v19 :: v_dual_mov_b32 v35, v21
 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:784
 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v31, 0xbf5f2ee2 :: v_dual_mov_b32 v32, v6
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v6, v13
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v19, v2
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v6, v13 :: v_dual_mov_b32 v19, v2
 ; GFX11-FLATSCR-NEXT: s_clause 0x4
 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[33:36], off offset:768
 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:752
@@ -2074,43 +2078,38 @@ define amdgpu_cs float @cs_main(i32 %idx) {
 ; SI-NEXT: s_mov_b32 s7, 0xe8f000
 ; SI-NEXT: s_add_u32 s4, s4, s0
 ; SI-NEXT: s_addc_u32 s5, s5, 0
-; SI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4
-; SI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
-; SI-NEXT: v_mov_b32_e32 v4, 0x3f523be1
-; SI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
-; SI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
-; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320
-; SI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316
-; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312
-; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308
-; SI-NEXT: buffer_store_dword v6, off, s[4:7], 0 offset:304
-; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:300
-; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:296
-; SI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:292
-; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:288
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
-; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
-; SI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
 ; SI-NEXT: v_mov_b32_e32 v9, 0xbe31934f
-; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280
-; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:276
-; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:272
 ; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mov_b32_e32 v9, 0xb7043519
 ; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mov_b32_e32 v9, 0xb702e758
+; SI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4
+; SI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
+; SI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
 ; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:256
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mov_b32_e32 v9, 0x3e31934f
 ; SI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
 ; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
+; SI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; SI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
+; SI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; SI-NEXT: v_mov_b32_e32 v8, 0xbe319356
+; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320
+; SI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
+; SI-NEXT: v_mov_b32_e32 v4, 0x3f523be1
+; SI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
+; SI-NEXT: buffer_store_dword v6, off, s[4:7], 0 offset:304
+; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:300
+; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:296
+; SI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:292
+; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:288
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
+; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:276
+; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:272
 ; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:252
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mov_b32_e32 v9, 0x3e319356
@@ -2120,49 +2119,40 @@ define amdgpu_cs float @cs_main(i32 %idx) {
 ; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:236
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f
-; SI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
 ; SI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e
 ; SI-NEXT: v_mov_b32_e32 v14, 0xbf523be3
 ; SI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
-; SI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
-; SI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
-; SI-NEXT: s_mov_b32 s0, 0
-; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268
-; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248
-; SI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240
-; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232
-; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228
-; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224
-; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:220
-; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216
 ; SI-NEXT: buffer_store_dword v16, off, s[4:7], 0 offset:212
 ; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:208
 ; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:204
 ; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:200
-; SI-NEXT: s_waitcnt expcnt(1)
-; SI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
-; SI-NEXT: v_add_i32_e32 v1, vcc, 0x200, v0
-; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0
-; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:196
-; SI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
-; SI-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen
 ; SI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:832
 ; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:828
 ; SI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:824
 ; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:820
 ; SI-NEXT: s_waitcnt expcnt(3)
 ; SI-NEXT: v_mov_b32_e32 v17, 0x3703c499
-; SI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
-; SI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:816
-; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:812
-; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:808
-; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:804
-; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:800
 ; SI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:796
 ; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:792
 ; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:788
 ; SI-NEXT: s_waitcnt expcnt(2)
 ; SI-NEXT: v_mov_b32_e32 v18, 0xbf523be1
+; SI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; SI-NEXT: s_mov_b32 s0, 0
+; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308
+; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; SI-NEXT: v_mov_b32_e32 v8, 0xbe319356
+; SI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
+; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:220
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
+; SI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:816
+; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:812
+; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:808
+; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:804
+; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:800
 ; SI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:784
 ; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:780
 ; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:776
@@ -2171,6 +2161,19 @@ define amdgpu_cs float @cs_main(i32 %idx) {
 ; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:764
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2
+; SI-NEXT: v_add_i32_e32 v1, vcc, 0x200, v0
+; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0
+; SI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316
+; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312
+; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280
+; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268
+; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248
+; SI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240
+; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232
+; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228
+; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224
+; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216
+; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:196
 ; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:760
 ; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:756
 ; SI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:752
@@ -2185,6 +2188,7 @@ define amdgpu_cs float @cs_main(i32 %idx) {
 ; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:716
 ; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:712
 ; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:708
+; SI-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen
 ; SI-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: v_add_f32_e32 v0, v0, v1
@@ -2199,84 +2203,70 @@ define amdgpu_cs float @cs_main(i32 %idx) {
 ; VI-NEXT: s_mov_b32 s7, 0xe80000
 ; VI-NEXT: s_add_u32 s4, s4, s0
 ; VI-NEXT: s_addc_u32 s5, s5, 0
+; VI-NEXT: v_mov_b32_e32 v9, 0xbe31934f
+; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264
+; VI-NEXT: v_mov_b32_e32 v9, 0xb7043519
+; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260
+; VI-NEXT: v_mov_b32_e32 v9, 0xb702e758
 ; VI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4
+; VI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
+; VI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
+; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:256
+; VI-NEXT: v_mov_b32_e32 v9, 0x3e31934f
+; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
+; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
+; VI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; VI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
+; VI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
+; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320
 ; VI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
 ; VI-NEXT: v_mov_b32_e32 v4, 0x3f523be1
 ; VI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
-; VI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
-; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320
-; VI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316
-; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312
-; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308
 ; VI-NEXT: buffer_store_dword v6, off, s[4:7], 0 offset:304
 ; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:300
 ; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:296
 ; VI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:292
 ; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:288
 ; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
-; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284
-; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
-; VI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
-; VI-NEXT: v_mov_b32_e32 v9, 0xbe31934f
-; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280
 ; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:276
 ; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:272
-; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264
-; VI-NEXT: v_mov_b32_e32 v9, 0xb7043519
-; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260
-; VI-NEXT: v_mov_b32_e32 v9, 0xb702e758
-; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:256
-; VI-NEXT: v_mov_b32_e32 v9, 0x3e31934f
-; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
-; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
-; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; VI-NEXT: v_mov_b32_e32 v8, 0xbe319356
 ; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:252
 ; VI-NEXT: v_mov_b32_e32 v9, 0x3e319356
 ; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:244
 ; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29dc
 ; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:236
 ; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f
-; VI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
 ; VI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e
 ; VI-NEXT: v_mov_b32_e32 v14, 0xbf523be3
 ; VI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
-; VI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
-; VI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
-; VI-NEXT: s_mov_b32 s0, 0
-; VI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268
-; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248
-; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240
-; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232
-; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228
-; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224
-; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:220
-; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216
 ; VI-NEXT: buffer_store_dword v16, off, s[4:7], 0 offset:212
 ; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:208
 ; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:204
 ; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:200
-; VI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
-; VI-NEXT: v_add_u32_e32 v1, vcc, 0x200, v0
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
-; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:196
-; VI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
-; VI-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen
 ; VI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:832
 ; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:828
 ; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:824
 ; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:820
 ; VI-NEXT: v_mov_b32_e32 v17, 0x3703c499
-; VI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
+; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:796
+; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:792
+; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:788
+; VI-NEXT: v_mov_b32_e32 v18, 0xbf523be1
+; VI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; VI-NEXT: s_mov_b32 s0, 0
+; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308
+; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284
+; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; VI-NEXT: v_mov_b32_e32 v8, 0xbe319356
+; VI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
+; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:220
+; VI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
 ; VI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:816
 ; VI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:812
 ; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:808
 ; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:804
 ; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:800
-; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:796
-; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:792
-; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:788
-; VI-NEXT: v_mov_b32_e32 v18, 0xbf523be1
 ; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:784
 ; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:780
 ; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:776
@@ -2284,6 +2274,19 @@ define amdgpu_cs float @cs_main(i32 %idx) {
 ; VI-NEXT: buffer_store_dword v16, off, s[4:7], 0 offset:768
 ; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:764
 ; VI-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2
+; VI-NEXT: v_add_u32_e32 v1, vcc, 0x200, v0
+; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316
+; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312
+; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280
+; VI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268
+; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248
+; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240
+; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232
+; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228
+; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224
+; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216
+; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:196
 ; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:760
 ; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:756
 ; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:752
@@ -2298,6 +2301,7 @@ define amdgpu_cs float @cs_main(i32 %idx) {
 ; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:716
 ; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:712
 ; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:708
+; VI-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen
 ; VI-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen
 ; VI-NEXT: s_waitcnt vmcnt(0)
 ; VI-NEXT: v_add_f32_e32 v0, v0, v1
@@ -2312,83 +2316,69 @@ define amdgpu_cs float @cs_main(i32 %idx) {
 ; GFX9-MUBUF-NEXT: s_mov_b32 s3, 0xe00000
 ; GFX9-MUBUF-NEXT: s_add_u32 s0, s0, s4
 ; GFX9-MUBUF-NEXT: s_addc_u32 s1, s1, 0
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbe31934f
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:264
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb7043519
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:260
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb702e758
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v6, 0x3f638e37
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:256
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e31934f
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
+; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:320
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f523be1
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v6, 0x3f638e37
-; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:320
-; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:316
-; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:312
-; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:308
 ; GFX9-MUBUF-NEXT: buffer_store_dword v6, off, s[0:3], 0 offset:304
 ; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:300
 ; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:296
 ; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:292
 ; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:288
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
-; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:284
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbe31934f
-; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:280
 ; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:276
 ; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:272
-; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:264
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb7043519
-; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:260
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb702e758
-; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:256
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e31934f
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbe319356
 ; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:252
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e319356
 ; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:244
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29dc
 ; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:236
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89f
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v13, 0xbf3d349e
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
-; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:268
-; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:248
-; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:240
-; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:232
-; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:228
-; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:224
-; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220
-; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216
 ; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212
 ; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208
 ; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204
 ; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:200
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
-; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
-; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:196
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
-; GFX9-MUBUF-NEXT: v_add_u32_e32 v1, 0x200, v0
-; GFX9-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
-; GFX9-MUBUF-NEXT: s_nop 0
 ; GFX9-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:832
 ; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:828
 ; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:824
 ; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:820
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3703c499
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
+; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:796
+; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:792
+; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:788
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0xbf523be1
+; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:308
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:284
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbe319356
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
+; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
 ; GFX9-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:816
 ; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:812
 ; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:808
 ; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:804
 ; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:800
-; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:796
-; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:792
-; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:788
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0xbf523be1
 ; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:784
 ; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:780
 ; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776
@@ -2396,6 +2386,18 @@ define amdgpu_cs float @cs_main(i32 %idx) {
 ; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768
 ; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:764
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2
+; GFX9-MUBUF-NEXT: v_add_u32_e32 v1, 0x200, v0
+; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:316
+; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:312
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:280
+; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:268
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:248
+; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:240
+; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:232
+; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:228
+; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:224
+; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216
+; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:196
 ; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:760
 ; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:756
 ; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:752
@@ -2410,6 +2412,8 @@ define amdgpu_cs float @cs_main(i32 %idx) {
 ; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:716
 ; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:712
 ; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:708
+; GFX9-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX9-MUBUF-NEXT: s_nop 0
 ; GFX9-MUBUF-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen
 ; GFX9-MUBUF-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-MUBUF-NEXT: v_add_f32_e32 v0, v0, v1
@@ -2438,7 +2442,6 @@ define amdgpu_cs float @cs_main(i32 %idx) {
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:308
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:304
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:300
-; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:292
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:288
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:284
@@ -2460,7 +2463,6 @@ define amdgpu_cs float @cs_main(i32 %idx) {
 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3eae29dc
 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3efcd89c
 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v13, 0x3efcd89f
-; GFX10_W32-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:248
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:244
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:240
@@ -2468,24 +2470,23 @@ define amdgpu_cs float @cs_main(i32 %idx) {
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:232
 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v10, 0xbf20e7f5
 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
+; GFX10_W32-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf3d349e
 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3
 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
-; GFX10_W32-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:228
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:224
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212
-; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204
 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4
+; GFX10_W32-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:200
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:196
-; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499
-; GFX10_W32-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0
-; GFX10_W32-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:832
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:828
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:824
@@ -2493,9 +2494,8 @@ define amdgpu_cs float @cs_main(i32 %idx) {
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:816
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:812
 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c
+; GFX10_W32-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0
 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1
-; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808
-; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:800
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:796
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:792
@@ -2503,6 +2503,9 @@ define amdgpu_cs float @cs_main(i32 %idx) {
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v19, off, s[0:3], 0 offset:784
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:780
 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:772
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768
@@ -2521,6 +2524,8 @@ define amdgpu_cs float @cs_main(i32 %idx) {
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:716
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:712
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:708
+; GFX10_W32-MUBUF-NEXT: s_clause 0x1
+; GFX10_W32-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
 ; GFX10_W32-MUBUF-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen
 ; GFX10_W32-MUBUF-NEXT: s_waitcnt vmcnt(0)
 ; GFX10_W32-MUBUF-NEXT: v_add_f32_e32 v0, v0, v1
@@ -2549,7 +2554,6 @@ define amdgpu_cs float @cs_main(i32 %idx) {
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:308
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:304
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:300
-; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:292
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:288
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:284
@@ -2571,7 +2575,6 @@ define amdgpu_cs float @cs_main(i32 %idx) {
 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3eae29dc
 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3efcd89c
 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v13, 0x3efcd89f
-; GFX10_W64-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:248
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:244
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:240
@@ -2579,24 +2582,23 @@ define amdgpu_cs float @cs_main(i32 %idx) {
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:232
 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v10, 0xbf20e7f5
 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
+; GFX10_W64-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf3d349e
 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3
 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
-; GFX10_W64-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:228
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:224
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212
-; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204
 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4
+; GFX10_W64-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:200
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:196
-; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499
-; GFX10_W64-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0
-; GFX10_W64-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:832
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:828
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:824
@@ -2604,9 +2606,8 @@ define amdgpu_cs float @cs_main(i32 %idx) {
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:816
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:812
 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c
+; GFX10_W64-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0
 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1
-; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808
-; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:800
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:796
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:792
@@ -2614,6 +2615,9 @@ define amdgpu_cs float @cs_main(i32 %idx) {
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v19, off, s[0:3], 0 offset:784
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:780
 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:772
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768
@@ -2632,6 +2636,8 @@ define amdgpu_cs float @cs_main(i32 %idx) {
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:716
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:712
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:708
+; GFX10_W64-MUBUF-NEXT: s_clause 0x1
+; GFX10_W64-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
 ; GFX10_W64-MUBUF-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen
 ; GFX10_W64-MUBUF-NEXT: s_waitcnt vmcnt(0)
 ; GFX10_W64-MUBUF-NEXT: v_add_f32_e32 v0, v0, v1
@@ -3020,42 +3026,43 @@ define amdgpu_cs float @cs_main(i32 %idx) {
 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v23, v21 :: v_dual_mov_b32 v8, 0x3f3d349e
 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3f523be1
 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v24, 0xbf523be3
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v26, v17
 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v12, 0xbe319356 :: v_dual_mov_b32 v31, v19
-; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v2, v8
 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v6, 0x3f5f2ee2 :: v_dual_mov_b32 v3, v7
 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f638e37
 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v15, 0x3e319356
 ; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v11, 0xbe31934f :: v_dual_mov_b32 v4, v6
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v24 :: v_dual_mov_b32 v4, v6
 ; GFX11-FLATSCR-NEXT: s_clause 0x1
 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[5:8], off offset:304
 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:288
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, 0xbefcd89f :: v_dual_mov_b32 v27, v24
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, v0
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, 0xbefcd89f :: v_dual_mov_b32 v1, v0
 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v13, 0x3eae29dc :: v_dual_mov_b32 v34, v5
 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v9, 0xb702e758 :: v_dual_mov_b32 v36, v6
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v25, 0x3f20e7f5
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, 0xb702e758
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v11, 0xbe31934f :: v_dual_mov_b32 v36, v6
 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, 0xb7043519 :: v_dual_mov_b32 v29, v15
 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf20e7f5
 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3eae29d8
 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v16, 0x3e31934f
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v25, 0x3f20e7f5 :: v_dual_mov_b32 v26, v17
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, 0x3efcd89c
 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v30, v13
 ; GFX11-FLATSCR-NEXT: s_clause 0x1
 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[0:3], off offset:272
 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[9:12], off offset:256
 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4
+; GFX11-FLATSCR-NEXT: s_clause 0x1
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[21:24], off offset:208
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:192
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v20, 0x3efcd89c
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v33, v22
 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, v18
 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, v2 :: v_dual_mov_b32 v11, v0
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v33, v22
-; GFX11-FLATSCR-NEXT: s_clause 0x3
+; GFX11-FLATSCR-NEXT: s_clause 0x1
 ;
GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[13:16], off offset:240 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[17:20], off offset:224 -; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[21:24], off offset:208 -; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:192 ; GFX11-FLATSCR-NEXT: scratch_load_b32 v14, v37, off ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v32, 0x3f3d349c :: v_dual_mov_b32 v5, v15 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, v0 @@ -3070,8 +3077,7 @@ define amdgpu_cs float @cs_main(i32 %idx) { ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v24, v19 :: v_dual_mov_b32 v35, v21 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:784 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v31, 0xbf5f2ee2 :: v_dual_mov_b32 v32, v6 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v6, v13 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v19, v2 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v6, v13 :: v_dual_mov_b32 v19, v2 ; GFX11-FLATSCR-NEXT: s_clause 0x4 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[33:36], off offset:768 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:752 @@ -3091,49 +3097,44 @@ define amdgpu_cs float @cs_main(i32 %idx) { define amdgpu_hs float @hs_main(i32 %idx) { ; SI-LABEL: hs_main: ; SI: ; %bb.0: -; SI-NEXT: s_mov_b32 s4, SCRATCH_RSRC_DWORD0 -; SI-NEXT: s_mov_b32 s5, SCRATCH_RSRC_DWORD1 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_mov_b32 s7, 0xe8f000 -; SI-NEXT: s_add_u32 s4, s4, s0 -; SI-NEXT: s_addc_u32 s5, s5, 0 -; SI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4 -; SI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e -; SI-NEXT: v_mov_b32_e32 v4, 0x3f523be1 -; SI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2 -; SI-NEXT: v_mov_b32_e32 v6, 0x3f638e37 -; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320 -; SI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316 -; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312 -; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308 -; SI-NEXT: buffer_store_dword v6, off, s[4:7], 0 offset:304 -; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:300 -; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:296 -; SI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:292 -; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:288 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3 -; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f -; SI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc +; SI-NEXT: s_mov_b32 s4, SCRATCH_RSRC_DWORD0 +; SI-NEXT: s_mov_b32 s5, SCRATCH_RSRC_DWORD1 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s7, 0xe8f000 +; SI-NEXT: s_add_u32 s4, s4, s0 +; SI-NEXT: s_addc_u32 s5, s5, 0 ; SI-NEXT: v_mov_b32_e32 v9, 0xbe31934f -; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280 -; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:276 -; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:272 ; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v9, 0xb7043519 ; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v9, 0xb702e758 +; SI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4 +; SI-NEXT: v_mov_b32_e32 v6, 0x3f638e37 +; SI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc ; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:256 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v9, 0x3e31934f ; SI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8 ; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c +; SI-NEXT: 
v_mov_b32_e32 v16, 0xbf638e39 +; SI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 +; SI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; SI-NEXT: v_mov_b32_e32 v8, 0xbe319356 +; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320 +; SI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e +; SI-NEXT: v_mov_b32_e32 v4, 0x3f523be1 +; SI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2 +; SI-NEXT: buffer_store_dword v6, off, s[4:7], 0 offset:304 +; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:300 +; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:296 +; SI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:292 +; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:288 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3 +; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:276 +; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:272 ; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:252 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v9, 0x3e319356 @@ -3143,49 +3144,40 @@ define amdgpu_hs float @hs_main(i32 %idx) { ; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:236 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f -; SI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 ; SI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e ; SI-NEXT: v_mov_b32_e32 v14, 0xbf523be3 ; SI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 -; SI-NEXT: v_mov_b32_e32 v16, 0xbf638e39 -; SI-NEXT: v_and_b32_e32 v0, 0x1fc, v0 -; SI-NEXT: s_mov_b32 s0, 0 -; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268 -; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248 -; SI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240 -; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232 -; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228 -; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224 -; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:220 -; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216 ; SI-NEXT: buffer_store_dword v16, off, s[4:7], 0 offset:212 ; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:208 ; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:204 ; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:200 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x200, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0 -; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:196 -; SI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 -; SI-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen ; SI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:832 ; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:828 ; SI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:824 ; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:820 ; SI-NEXT: s_waitcnt expcnt(3) ; SI-NEXT: v_mov_b32_e32 v17, 0x3703c499 -; SI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c -; SI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:816 -; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:812 -; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:808 -; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:804 -; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:800 ; SI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:796 ; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:792 ; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:788 ; SI-NEXT: s_waitcnt expcnt(2) ; SI-NEXT: v_mov_b32_e32 v18, 0xbf523be1 +; SI-NEXT: v_and_b32_e32 v0, 0x1fc, v0 +; SI-NEXT: s_mov_b32 s0, 0 +; SI-NEXT: 
buffer_store_dword v5, off, s[4:7], 0 offset:308 +; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f +; SI-NEXT: v_mov_b32_e32 v8, 0xbe319356 +; SI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 +; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:220 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 +; SI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:816 +; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:812 +; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:808 +; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:804 +; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:800 ; SI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:784 ; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:780 ; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:776 @@ -3194,6 +3186,19 @@ define amdgpu_hs float @hs_main(i32 %idx) { ; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:764 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x200, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0 +; SI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316 +; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312 +; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280 +; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268 +; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248 +; SI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240 +; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232 +; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228 +; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224 +; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216 +; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:196 ; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:760 ; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:756 ; SI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:752 @@ -3208,6 +3213,7 @@ define amdgpu_hs float @hs_main(i32 %idx) { ; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:716 ; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:712 ; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:708 +; SI-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen ; SI-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_f32_e32 v0, v0, v1 @@ -3222,84 +3228,70 @@ define amdgpu_hs float @hs_main(i32 %idx) { ; VI-NEXT: s_mov_b32 s7, 0xe80000 ; VI-NEXT: s_add_u32 s4, s4, s0 ; VI-NEXT: s_addc_u32 s5, s5, 0 +; VI-NEXT: v_mov_b32_e32 v9, 0xbe31934f +; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264 +; VI-NEXT: v_mov_b32_e32 v9, 0xb7043519 +; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260 +; VI-NEXT: v_mov_b32_e32 v9, 0xb702e758 ; VI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4 +; VI-NEXT: v_mov_b32_e32 v6, 0x3f638e37 +; VI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc +; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:256 +; VI-NEXT: v_mov_b32_e32 v9, 0x3e31934f +; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8 +; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c +; VI-NEXT: v_mov_b32_e32 v16, 0xbf638e39 +; VI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 +; VI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c +; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320 ; VI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e ; VI-NEXT: v_mov_b32_e32 v4, 0x3f523be1 ; VI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2 -; VI-NEXT: v_mov_b32_e32 v6, 
0x3f638e37 -; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320 -; VI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316 -; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312 -; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308 ; VI-NEXT: buffer_store_dword v6, off, s[4:7], 0 offset:304 ; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:300 ; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:296 ; VI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:292 ; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:288 ; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3 -; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284 -; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f -; VI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc -; VI-NEXT: v_mov_b32_e32 v9, 0xbe31934f -; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280 ; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:276 ; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:272 -; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264 -; VI-NEXT: v_mov_b32_e32 v9, 0xb7043519 -; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260 -; VI-NEXT: v_mov_b32_e32 v9, 0xb702e758 -; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:256 -; VI-NEXT: v_mov_b32_e32 v9, 0x3e31934f -; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8 -; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c -; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; VI-NEXT: v_mov_b32_e32 v8, 0xbe319356 ; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:252 ; VI-NEXT: v_mov_b32_e32 v9, 0x3e319356 ; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:244 ; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29dc ; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:236 ; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f -; VI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 ; VI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e ; VI-NEXT: v_mov_b32_e32 v14, 0xbf523be3 ; VI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 -; VI-NEXT: v_mov_b32_e32 v16, 0xbf638e39 -; VI-NEXT: v_and_b32_e32 v0, 0x1fc, v0 -; VI-NEXT: s_mov_b32 s0, 0 -; VI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268 -; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248 -; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240 -; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232 -; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228 -; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224 -; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:220 -; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216 ; VI-NEXT: buffer_store_dword v16, off, s[4:7], 0 offset:212 ; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:208 ; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:204 ; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:200 -; VI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 -; VI-NEXT: v_add_u32_e32 v1, vcc, 0x200, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 -; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:196 -; VI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 -; VI-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen ; VI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:832 ; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:828 ; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:824 ; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:820 ; VI-NEXT: v_mov_b32_e32 v17, 0x3703c499 -; VI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c +; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:796 +; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:792 +; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:788 +; 
VI-NEXT: v_mov_b32_e32 v18, 0xbf523be1 +; VI-NEXT: v_and_b32_e32 v0, 0x1fc, v0 +; VI-NEXT: s_mov_b32 s0, 0 +; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308 +; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284 +; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f +; VI-NEXT: v_mov_b32_e32 v8, 0xbe319356 +; VI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 +; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:220 +; VI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 ; VI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:816 ; VI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:812 ; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:808 ; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:804 ; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:800 -; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:796 -; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:792 -; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:788 -; VI-NEXT: v_mov_b32_e32 v18, 0xbf523be1 ; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:784 ; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:780 ; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:776 @@ -3307,6 +3299,19 @@ define amdgpu_hs float @hs_main(i32 %idx) { ; VI-NEXT: buffer_store_dword v16, off, s[4:7], 0 offset:768 ; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:764 ; VI-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x200, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316 +; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312 +; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280 +; VI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268 +; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248 +; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240 +; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232 +; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228 +; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224 +; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216 +; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:196 ; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:760 ; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:756 ; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:752 @@ -3321,6 +3326,7 @@ define amdgpu_hs float @hs_main(i32 %idx) { ; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:716 ; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:712 ; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:708 +; VI-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen ; VI-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_f32_e32 v0, v0, v1 @@ -3334,83 +3340,69 @@ define amdgpu_hs float @hs_main(i32 %idx) { ; GFX9-MUBUF-NEXT: s_mov_b32 s3, 0xe00000 ; GFX9-MUBUF-NEXT: s_add_u32 s0, s0, s5 ; GFX9-MUBUF-NEXT: s_addc_u32 s1, s1, 0 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbe31934f +; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:264 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb7043519 +; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:260 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb702e758 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v6, 0x3f638e37 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbeae29dc +; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:256 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e31934f +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8 +; 
GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89c +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3f3d349c +; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:320 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f3d349e ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f523be1 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v6, 0x3f638e37 -; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:320 -; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:316 -; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:312 -; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:308 ; GFX9-MUBUF-NEXT: buffer_store_dword v6, off, s[0:3], 0 offset:304 ; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:300 ; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:296 ; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:292 ; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:288 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3 -; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:284 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd89f -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbeae29dc -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbe31934f -; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:280 ; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:276 ; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:272 -; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:264 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb7043519 -; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:260 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb702e758 -; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:256 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e31934f -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89c -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbe319356 ; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:252 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e319356 ; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:244 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29dc ; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:236 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89f -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v13, 0xbf3d349e ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39 -; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:268 -; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:248 -; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:240 -; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:232 -; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:228 -; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:224 -; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220 -; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216 ; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212 ; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208 ; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204 ; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, 
s[0:3], 0 offset:200 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 -; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 -; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:196 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 -; GFX9-MUBUF-NEXT: v_add_u32_e32 v1, 0x200, v0 -; GFX9-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen -; GFX9-MUBUF-NEXT: s_nop 0 ; GFX9-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:832 ; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:828 ; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:824 ; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:820 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3703c499 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3f3d349c +; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:796 +; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:792 +; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:788 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0xbf523be1 +; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 +; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:308 +; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:284 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd89f +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbe319356 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 +; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 ; GFX9-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:816 ; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:812 ; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:808 ; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:804 ; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:800 -; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:796 -; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:792 -; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:788 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0xbf523be1 ; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:784 ; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:780 ; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776 @@ -3418,6 +3410,18 @@ define amdgpu_hs float @hs_main(i32 %idx) { ; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768 ; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:764 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2 +; GFX9-MUBUF-NEXT: v_add_u32_e32 v1, 0x200, v0 +; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:316 +; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:312 +; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:280 +; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:268 +; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:248 +; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:240 +; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:232 +; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:228 +; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:224 +; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216 +; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:196 ; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:760 ; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:756 ; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:752 @@ -3432,6 
+3436,8 @@ define amdgpu_hs float @hs_main(i32 %idx) { ; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:716 ; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:712 ; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:708 +; GFX9-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX9-MUBUF-NEXT: s_nop 0 ; GFX9-MUBUF-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen ; GFX9-MUBUF-NEXT: s_waitcnt vmcnt(0) ; GFX9-MUBUF-NEXT: v_add_f32_e32 v0, v0, v1 @@ -3459,7 +3465,6 @@ define amdgpu_hs float @hs_main(i32 %idx) { ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:308 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:304 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:300 -; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:292 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:288 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:284 @@ -3481,7 +3486,6 @@ define amdgpu_hs float @hs_main(i32 %idx) { ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3eae29dc ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3efcd89c ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v13, 0x3efcd89f -; GFX10_W32-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:248 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:244 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:240 @@ -3489,24 +3493,23 @@ define amdgpu_hs float @hs_main(i32 %idx) { ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:232 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v10, 0xbf20e7f5 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 +; GFX10_W32-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf3d349e ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39 -; GFX10_W32-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:228 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:224 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212 -; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4 +; GFX10_W32-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 +; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5 +; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:200 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:196 -; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499 -; GFX10_W32-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0 -; GFX10_W32-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:832 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:828 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:824 @@ -3514,9 +3517,8 @@ define amdgpu_hs float @hs_main(i32 %idx) { ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v18, off, 
s[0:3], 0 offset:816 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:812 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c +; GFX10_W32-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1 -; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808 -; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:800 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:796 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:792 @@ -3524,6 +3526,9 @@ define amdgpu_hs float @hs_main(i32 %idx) { ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v19, off, s[0:3], 0 offset:784 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:780 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2 +; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296 +; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808 +; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:772 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768 @@ -3542,6 +3547,8 @@ define amdgpu_hs float @hs_main(i32 %idx) { ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:716 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:712 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:708 +; GFX10_W32-MUBUF-NEXT: s_clause 0x1 +; GFX10_W32-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen ; GFX10_W32-MUBUF-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen ; GFX10_W32-MUBUF-NEXT: s_waitcnt vmcnt(0) ; GFX10_W32-MUBUF-NEXT: v_add_f32_e32 v0, v0, v1 @@ -3569,7 +3576,6 @@ define amdgpu_hs float @hs_main(i32 %idx) { ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:308 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:304 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:300 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:292 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:288 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:284 @@ -3591,7 +3597,6 @@ define amdgpu_hs float @hs_main(i32 %idx) { ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3eae29dc ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3efcd89c ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v13, 0x3efcd89f -; GFX10_W64-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:248 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:244 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:240 @@ -3599,24 +3604,23 @@ define amdgpu_hs float @hs_main(i32 %idx) { ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:232 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v10, 0xbf20e7f5 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 +; GFX10_W64-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf3d349e ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39 -; GFX10_W64-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 
offset:228 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:224 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4 +; GFX10_W64-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:200 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:196 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499 -; GFX10_W64-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0 -; GFX10_W64-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:832 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:828 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:824 @@ -3624,9 +3628,8 @@ define amdgpu_hs float @hs_main(i32 %idx) { ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:816 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:812 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c +; GFX10_W64-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:800 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:796 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:792 @@ -3634,6 +3637,9 @@ define amdgpu_hs float @hs_main(i32 %idx) { ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v19, off, s[0:3], 0 offset:784 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:780 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:772 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768 @@ -3652,6 +3658,8 @@ define amdgpu_hs float @hs_main(i32 %idx) { ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:716 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:712 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:708 +; GFX10_W64-MUBUF-NEXT: s_clause 0x1 +; GFX10_W64-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen ; GFX10_W64-MUBUF-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen ; GFX10_W64-MUBUF-NEXT: s_waitcnt vmcnt(0) ; GFX10_W64-MUBUF-NEXT: v_add_f32_e32 v0, v0, v1 @@ -4040,42 +4048,43 @@ define amdgpu_hs float @hs_main(i32 %idx) { ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v23, v21 :: v_dual_mov_b32 v8, 0x3f3d349e ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3f523be1 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v24, 0xbf523be3 +; GFX11-FLATSCR-NEXT: 
v_mov_b32_e32 v26, v17 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v12, 0xbe319356 :: v_dual_mov_b32 v31, v19 -; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v2, v8 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v6, 0x3f5f2ee2 :: v_dual_mov_b32 v3, v7 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f638e37 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v15, 0x3e319356 ; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v11, 0xbe31934f :: v_dual_mov_b32 v4, v6 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v24 :: v_dual_mov_b32 v4, v6 ; GFX11-FLATSCR-NEXT: s_clause 0x1 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[5:8], off offset:304 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:288 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, 0xbefcd89f :: v_dual_mov_b32 v27, v24 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, v0 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, 0xbefcd89f :: v_dual_mov_b32 v1, v0 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v13, 0x3eae29dc :: v_dual_mov_b32 v34, v5 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v9, 0xb702e758 :: v_dual_mov_b32 v36, v6 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v25, 0x3f20e7f5 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, 0xb702e758 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v11, 0xbe31934f :: v_dual_mov_b32 v36, v6 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, 0xb7043519 :: v_dual_mov_b32 v29, v15 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf20e7f5 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3eae29d8 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v16, 0x3e31934f -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v25, 0x3f20e7f5 :: v_dual_mov_b32 v26, v17 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, 0x3efcd89c ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v30, v13 ; GFX11-FLATSCR-NEXT: s_clause 0x1 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[0:3], off offset:272 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[9:12], off offset:256 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4 +; GFX11-FLATSCR-NEXT: s_clause 0x1 +; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[21:24], off offset:208 +; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:192 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v20, 0x3efcd89c +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v33, v22 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, v18 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, v2 :: v_dual_mov_b32 v11, v0 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v33, v22 -; GFX11-FLATSCR-NEXT: s_clause 0x3 +; GFX11-FLATSCR-NEXT: s_clause 0x1 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[13:16], off offset:240 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[17:20], off offset:224 -; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[21:24], off offset:208 -; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:192 ; GFX11-FLATSCR-NEXT: scratch_load_b32 v14, v37, off ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v32, 0x3f3d349c :: v_dual_mov_b32 v5, v15 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, v0 @@ -4090,8 +4099,7 @@ define amdgpu_hs float @hs_main(i32 %idx) { ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v24, v19 :: v_dual_mov_b32 v35, v21 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:784 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v31, 0xbf5f2ee2 :: v_dual_mov_b32 v32, v6 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v6, v13 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v19, v2 +; 
GFX11-FLATSCR-NEXT: v_dual_mov_b32 v6, v13 :: v_dual_mov_b32 v19, v2 ; GFX11-FLATSCR-NEXT: s_clause 0x4 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[33:36], off offset:768 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:752 @@ -4117,43 +4125,38 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; SI-NEXT: s_mov_b32 s7, 0xe8f000 ; SI-NEXT: s_add_u32 s4, s4, s0 ; SI-NEXT: s_addc_u32 s5, s5, 0 -; SI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4 -; SI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e -; SI-NEXT: v_mov_b32_e32 v4, 0x3f523be1 -; SI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2 -; SI-NEXT: v_mov_b32_e32 v6, 0x3f638e37 -; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320 -; SI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316 -; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312 -; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308 -; SI-NEXT: buffer_store_dword v6, off, s[4:7], 0 offset:304 -; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:300 -; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:296 -; SI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:292 -; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:288 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3 -; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f -; SI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc ; SI-NEXT: v_mov_b32_e32 v9, 0xbe31934f -; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280 -; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:276 -; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:272 ; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v9, 0xb7043519 ; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v9, 0xb702e758 +; SI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4 +; SI-NEXT: v_mov_b32_e32 v6, 0x3f638e37 +; SI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc ; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:256 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v9, 0x3e31934f ; SI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8 ; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c +; SI-NEXT: v_mov_b32_e32 v16, 0xbf638e39 +; SI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 +; SI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; SI-NEXT: v_mov_b32_e32 v8, 0xbe319356 +; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320 +; SI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e +; SI-NEXT: v_mov_b32_e32 v4, 0x3f523be1 +; SI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2 +; SI-NEXT: buffer_store_dword v6, off, s[4:7], 0 offset:304 +; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:300 +; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:296 +; SI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:292 +; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:288 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3 +; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:276 +; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:272 ; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:252 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v9, 0x3e319356 @@ -4163,49 +4166,40 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:236 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f -; SI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 ; SI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e ; SI-NEXT: 
v_mov_b32_e32 v14, 0xbf523be3 ; SI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 -; SI-NEXT: v_mov_b32_e32 v16, 0xbf638e39 -; SI-NEXT: v_and_b32_e32 v0, 0x1fc, v0 -; SI-NEXT: s_mov_b32 s0, 0 -; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268 -; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248 -; SI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240 -; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232 -; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228 -; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224 -; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:220 -; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216 ; SI-NEXT: buffer_store_dword v16, off, s[4:7], 0 offset:212 ; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:208 ; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:204 ; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:200 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x200, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0 -; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:196 -; SI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 -; SI-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen ; SI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:832 ; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:828 ; SI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:824 ; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:820 ; SI-NEXT: s_waitcnt expcnt(3) ; SI-NEXT: v_mov_b32_e32 v17, 0x3703c499 -; SI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c -; SI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:816 -; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:812 -; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:808 -; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:804 -; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:800 ; SI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:796 ; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:792 ; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:788 ; SI-NEXT: s_waitcnt expcnt(2) ; SI-NEXT: v_mov_b32_e32 v18, 0xbf523be1 +; SI-NEXT: v_and_b32_e32 v0, 0x1fc, v0 +; SI-NEXT: s_mov_b32 s0, 0 +; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308 +; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f +; SI-NEXT: v_mov_b32_e32 v8, 0xbe319356 +; SI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 +; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:220 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 +; SI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:816 +; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:812 +; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:808 +; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:804 +; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:800 ; SI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:784 ; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:780 ; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:776 @@ -4214,6 +4208,19 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:764 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x200, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0 +; SI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316 +; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312 +; SI-NEXT: 
buffer_store_dword v2, off, s[4:7], 0 offset:280 +; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268 +; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248 +; SI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240 +; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232 +; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228 +; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224 +; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216 +; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:196 ; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:760 ; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:756 ; SI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:752 @@ -4228,6 +4235,7 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:716 ; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:712 ; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:708 +; SI-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen ; SI-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_f32_e32 v0, v0, v1 @@ -4242,84 +4250,70 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; VI-NEXT: s_mov_b32 s7, 0xe80000 ; VI-NEXT: s_add_u32 s4, s4, s0 ; VI-NEXT: s_addc_u32 s5, s5, 0 +; VI-NEXT: v_mov_b32_e32 v9, 0xbe31934f +; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264 +; VI-NEXT: v_mov_b32_e32 v9, 0xb7043519 +; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260 +; VI-NEXT: v_mov_b32_e32 v9, 0xb702e758 ; VI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4 +; VI-NEXT: v_mov_b32_e32 v6, 0x3f638e37 +; VI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc +; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:256 +; VI-NEXT: v_mov_b32_e32 v9, 0x3e31934f +; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8 +; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c +; VI-NEXT: v_mov_b32_e32 v16, 0xbf638e39 +; VI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 +; VI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c +; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320 ; VI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e ; VI-NEXT: v_mov_b32_e32 v4, 0x3f523be1 ; VI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2 -; VI-NEXT: v_mov_b32_e32 v6, 0x3f638e37 -; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320 -; VI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316 -; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312 -; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308 ; VI-NEXT: buffer_store_dword v6, off, s[4:7], 0 offset:304 ; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:300 ; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:296 ; VI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:292 ; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:288 ; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3 -; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284 -; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f -; VI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc -; VI-NEXT: v_mov_b32_e32 v9, 0xbe31934f -; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280 ; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:276 ; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:272 -; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264 -; VI-NEXT: v_mov_b32_e32 v9, 0xb7043519 -; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260 -; VI-NEXT: v_mov_b32_e32 v9, 0xb702e758 -; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:256 -; VI-NEXT: v_mov_b32_e32 v9, 0x3e31934f -; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8 -; 
VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c -; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; VI-NEXT: v_mov_b32_e32 v8, 0xbe319356 ; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:252 ; VI-NEXT: v_mov_b32_e32 v9, 0x3e319356 ; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:244 ; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29dc ; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:236 ; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f -; VI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 ; VI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e ; VI-NEXT: v_mov_b32_e32 v14, 0xbf523be3 ; VI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 -; VI-NEXT: v_mov_b32_e32 v16, 0xbf638e39 -; VI-NEXT: v_and_b32_e32 v0, 0x1fc, v0 -; VI-NEXT: s_mov_b32 s0, 0 -; VI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268 -; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248 -; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240 -; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232 -; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228 -; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224 -; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:220 -; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216 ; VI-NEXT: buffer_store_dword v16, off, s[4:7], 0 offset:212 ; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:208 ; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:204 ; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:200 -; VI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 -; VI-NEXT: v_add_u32_e32 v1, vcc, 0x200, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 -; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:196 -; VI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 -; VI-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen ; VI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:832 ; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:828 ; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:824 ; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:820 ; VI-NEXT: v_mov_b32_e32 v17, 0x3703c499 -; VI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c +; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:796 +; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:792 +; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:788 +; VI-NEXT: v_mov_b32_e32 v18, 0xbf523be1 +; VI-NEXT: v_and_b32_e32 v0, 0x1fc, v0 +; VI-NEXT: s_mov_b32 s0, 0 +; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308 +; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284 +; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f +; VI-NEXT: v_mov_b32_e32 v8, 0xbe319356 +; VI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 +; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:220 +; VI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 ; VI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:816 ; VI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:812 ; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:808 ; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:804 ; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:800 -; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:796 -; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:792 -; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:788 -; VI-NEXT: v_mov_b32_e32 v18, 0xbf523be1 ; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:784 ; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:780 ; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:776 @@ -4327,6 +4321,19 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; VI-NEXT: buffer_store_dword v16, off, s[4:7], 0 offset:768 ; VI-NEXT: 
buffer_store_dword v5, off, s[4:7], 0 offset:764 ; VI-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x200, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316 +; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312 +; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280 +; VI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268 +; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248 +; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240 +; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232 +; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228 +; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224 +; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216 +; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:196 ; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:760 ; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:756 ; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:752 @@ -4341,6 +4348,7 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:716 ; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:712 ; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:708 +; VI-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen ; VI-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_f32_e32 v0, v0, v1 @@ -4354,83 +4362,69 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; GFX9-MUBUF-NEXT: s_mov_b32 s3, 0xe00000 ; GFX9-MUBUF-NEXT: s_add_u32 s0, s0, s5 ; GFX9-MUBUF-NEXT: s_addc_u32 s1, s1, 0 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbe31934f +; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:264 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb7043519 +; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:260 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb702e758 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v6, 0x3f638e37 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbeae29dc +; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:256 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e31934f +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89c +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3f3d349c +; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:320 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f3d349e ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f523be1 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v6, 0x3f638e37 -; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:320 -; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:316 -; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:312 -; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:308 ; GFX9-MUBUF-NEXT: buffer_store_dword v6, off, s[0:3], 0 offset:304 ; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:300 ; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:296 ; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:292 ; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:288 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3 -; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:284 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd89f -; GFX9-MUBUF-NEXT: 
v_mov_b32_e32 v7, 0xbeae29dc -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbe31934f -; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:280 ; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:276 ; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:272 -; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:264 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb7043519 -; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:260 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb702e758 -; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:256 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e31934f -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89c -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbe319356 ; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:252 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e319356 ; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:244 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29dc ; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:236 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89f -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v13, 0xbf3d349e ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39 -; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:268 -; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:248 -; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:240 -; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:232 -; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:228 -; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:224 -; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220 -; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216 ; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212 ; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208 ; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204 ; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:200 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 -; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 -; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:196 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 -; GFX9-MUBUF-NEXT: v_add_u32_e32 v1, 0x200, v0 -; GFX9-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen -; GFX9-MUBUF-NEXT: s_nop 0 ; GFX9-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:832 ; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:828 ; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:824 ; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:820 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3703c499 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3f3d349c +; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:796 +; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:792 +; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:788 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0xbf523be1 +; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 +; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:308 +; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:284 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd89f +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbe319356 +; 
GFX9-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 +; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 ; GFX9-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:816 ; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:812 ; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:808 ; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:804 ; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:800 -; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:796 -; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:792 -; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:788 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0xbf523be1 ; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:784 ; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:780 ; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776 @@ -4438,6 +4432,18 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768 ; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:764 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2 +; GFX9-MUBUF-NEXT: v_add_u32_e32 v1, 0x200, v0 +; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:316 +; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:312 +; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:280 +; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:268 +; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:248 +; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:240 +; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:232 +; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:228 +; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:224 +; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216 +; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:196 ; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:760 ; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:756 ; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:752 @@ -4452,6 +4458,8 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:716 ; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:712 ; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:708 +; GFX9-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX9-MUBUF-NEXT: s_nop 0 ; GFX9-MUBUF-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen ; GFX9-MUBUF-NEXT: s_waitcnt vmcnt(0) ; GFX9-MUBUF-NEXT: v_add_f32_e32 v0, v0, v1 @@ -4479,7 +4487,6 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:308 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:304 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:300 -; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:292 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:288 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:284 @@ -4501,7 +4508,6 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3eae29dc ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3efcd89c ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v13, 0x3efcd89f 
-; GFX10_W32-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:248 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:244 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:240 @@ -4509,24 +4515,23 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:232 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v10, 0xbf20e7f5 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 +; GFX10_W32-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf3d349e ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39 -; GFX10_W32-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:228 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:224 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212 -; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4 +; GFX10_W32-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 +; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5 +; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:200 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:196 -; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499 -; GFX10_W32-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0 -; GFX10_W32-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:832 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:828 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:824 @@ -4534,9 +4539,8 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:816 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:812 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c +; GFX10_W32-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1 -; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808 -; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:800 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:796 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:792 @@ -4544,6 +4548,9 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v19, off, s[0:3], 0 offset:784 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:780 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2 +; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296 +; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808 +; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:772 ; GFX10_W32-MUBUF-NEXT: 
buffer_store_dword v16, off, s[0:3], 0 offset:768 @@ -4562,6 +4569,8 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:716 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:712 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:708 +; GFX10_W32-MUBUF-NEXT: s_clause 0x1 +; GFX10_W32-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen ; GFX10_W32-MUBUF-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen ; GFX10_W32-MUBUF-NEXT: s_waitcnt vmcnt(0) ; GFX10_W32-MUBUF-NEXT: v_add_f32_e32 v0, v0, v1 @@ -4589,7 +4598,6 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:308 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:304 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:300 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:292 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:288 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:284 @@ -4611,7 +4619,6 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3eae29dc ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3efcd89c ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v13, 0x3efcd89f -; GFX10_W64-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:248 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:244 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:240 @@ -4619,24 +4626,23 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:232 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v10, 0xbf20e7f5 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 +; GFX10_W64-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf3d349e ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39 -; GFX10_W64-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:228 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:224 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4 +; GFX10_W64-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:200 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:196 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499 -; GFX10_W64-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0 -; GFX10_W64-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:832 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:828 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:824 @@ -4644,9 +4650,8 
@@ define amdgpu_gs float @gs_main(i32 %idx) { ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:816 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:812 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c +; GFX10_W64-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:800 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:796 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:792 @@ -4654,6 +4659,9 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v19, off, s[0:3], 0 offset:784 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:780 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:772 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768 @@ -4672,6 +4680,8 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:716 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:712 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:708 +; GFX10_W64-MUBUF-NEXT: s_clause 0x1 +; GFX10_W64-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen ; GFX10_W64-MUBUF-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen ; GFX10_W64-MUBUF-NEXT: s_waitcnt vmcnt(0) ; GFX10_W64-MUBUF-NEXT: v_add_f32_e32 v0, v0, v1 @@ -5060,42 +5070,43 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v23, v21 :: v_dual_mov_b32 v8, 0x3f3d349e ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3f523be1 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v24, 0xbf523be3 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v26, v17 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v12, 0xbe319356 :: v_dual_mov_b32 v31, v19 -; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v2, v8 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v6, 0x3f5f2ee2 :: v_dual_mov_b32 v3, v7 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f638e37 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v15, 0x3e319356 ; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v11, 0xbe31934f :: v_dual_mov_b32 v4, v6 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v24 :: v_dual_mov_b32 v4, v6 ; GFX11-FLATSCR-NEXT: s_clause 0x1 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[5:8], off offset:304 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:288 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, 0xbefcd89f :: v_dual_mov_b32 v27, v24 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, v0 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, 0xbefcd89f :: v_dual_mov_b32 v1, v0 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v13, 0x3eae29dc :: v_dual_mov_b32 v34, v5 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v9, 0xb702e758 :: v_dual_mov_b32 v36, v6 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39 +; GFX11-FLATSCR-NEXT: 
v_mov_b32_e32 v25, 0x3f20e7f5 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, 0xb702e758 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v11, 0xbe31934f :: v_dual_mov_b32 v36, v6 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, 0xb7043519 :: v_dual_mov_b32 v29, v15 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf20e7f5 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3eae29d8 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v16, 0x3e31934f -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v25, 0x3f20e7f5 :: v_dual_mov_b32 v26, v17 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, 0x3efcd89c ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v30, v13 ; GFX11-FLATSCR-NEXT: s_clause 0x1 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[0:3], off offset:272 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[9:12], off offset:256 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4 +; GFX11-FLATSCR-NEXT: s_clause 0x1 +; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[21:24], off offset:208 +; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:192 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v20, 0x3efcd89c +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v33, v22 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, v18 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, v2 :: v_dual_mov_b32 v11, v0 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v33, v22 -; GFX11-FLATSCR-NEXT: s_clause 0x3 +; GFX11-FLATSCR-NEXT: s_clause 0x1 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[13:16], off offset:240 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[17:20], off offset:224 -; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[21:24], off offset:208 -; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:192 ; GFX11-FLATSCR-NEXT: scratch_load_b32 v14, v37, off ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v32, 0x3f3d349c :: v_dual_mov_b32 v5, v15 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, v0 @@ -5110,8 +5121,7 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v24, v19 :: v_dual_mov_b32 v35, v21 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:784 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v31, 0xbf5f2ee2 :: v_dual_mov_b32 v32, v6 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v6, v13 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v19, v2 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v6, v13 :: v_dual_mov_b32 v19, v2 ; GFX11-FLATSCR-NEXT: s_clause 0x4 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[33:36], off offset:768 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:752 @@ -5141,43 +5151,38 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; SI-NEXT: s_mov_b32 s11, 0xe8f000 ; SI-NEXT: s_add_u32 s8, s8, s6 ; SI-NEXT: s_addc_u32 s9, s9, 0 -; SI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4 -; SI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e -; SI-NEXT: v_mov_b32_e32 v4, 0x3f523be1 -; SI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2 -; SI-NEXT: v_mov_b32_e32 v6, 0x3f638e37 -; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:320 -; SI-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:316 -; SI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:312 -; SI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:308 -; SI-NEXT: buffer_store_dword v6, off, s[8:11], 0 offset:304 -; SI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:300 -; SI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:296 -; SI-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:292 -; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:288 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3 -; 
SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:284 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f -; SI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc ; SI-NEXT: v_mov_b32_e32 v9, 0xbe31934f -; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:280 -; SI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:276 -; SI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:272 ; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:264 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v9, 0xb7043519 ; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:260 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v9, 0xb702e758 +; SI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4 +; SI-NEXT: v_mov_b32_e32 v6, 0x3f638e37 +; SI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc ; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:256 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v9, 0x3e31934f ; SI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8 ; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c +; SI-NEXT: v_mov_b32_e32 v16, 0xbf638e39 +; SI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 +; SI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; SI-NEXT: v_mov_b32_e32 v8, 0xbe319356 +; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:320 +; SI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e +; SI-NEXT: v_mov_b32_e32 v4, 0x3f523be1 +; SI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2 +; SI-NEXT: buffer_store_dword v6, off, s[8:11], 0 offset:304 +; SI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:300 +; SI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:296 +; SI-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:292 +; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:288 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3 +; SI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:276 +; SI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:272 ; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:252 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v9, 0x3e319356 @@ -5187,49 +5192,40 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; SI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:236 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f -; SI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 ; SI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e ; SI-NEXT: v_mov_b32_e32 v14, 0xbf523be3 ; SI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 -; SI-NEXT: v_mov_b32_e32 v16, 0xbf638e39 -; SI-NEXT: v_and_b32_e32 v0, 0x1fc, v0 -; SI-NEXT: s_mov_b32 s0, 0 -; SI-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:268 -; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:248 -; SI-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:240 -; SI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:232 -; SI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:228 -; SI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:224 -; SI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:220 -; SI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:216 ; SI-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:212 ; SI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:208 ; SI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:204 ; SI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:200 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x200, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0 -; SI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:196 -; SI-NEXT: v_mov_b32_e32 v17, 
0x3f20e7f4 -; SI-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen ; SI-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:832 ; SI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:828 ; SI-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:824 ; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:820 ; SI-NEXT: s_waitcnt expcnt(3) ; SI-NEXT: v_mov_b32_e32 v17, 0x3703c499 -; SI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c -; SI-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:816 -; SI-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:812 -; SI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:808 -; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:804 -; SI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:800 ; SI-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:796 ; SI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:792 ; SI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:788 ; SI-NEXT: s_waitcnt expcnt(2) ; SI-NEXT: v_mov_b32_e32 v18, 0xbf523be1 +; SI-NEXT: v_and_b32_e32 v0, 0x1fc, v0 +; SI-NEXT: s_mov_b32 s0, 0 +; SI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:308 +; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:284 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f +; SI-NEXT: v_mov_b32_e32 v8, 0xbe319356 +; SI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 +; SI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:220 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 +; SI-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:816 +; SI-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:812 +; SI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:808 +; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:804 +; SI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:800 ; SI-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:784 ; SI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:780 ; SI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:776 @@ -5238,6 +5234,19 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; SI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:764 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x200, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0 +; SI-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:316 +; SI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:312 +; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:280 +; SI-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:268 +; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:248 +; SI-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:240 +; SI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:232 +; SI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:228 +; SI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:224 +; SI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:216 +; SI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:196 ; SI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:760 ; SI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:756 ; SI-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:752 @@ -5252,6 +5261,7 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; SI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:716 ; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:712 ; SI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:708 +; SI-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen ; SI-NEXT: buffer_load_dword v1, v1, 
s[8:11], 0 offen ; SI-NEXT: s_mov_b32 s2, s5 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -5267,84 +5277,70 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; VI-NEXT: s_mov_b32 s11, 0xe80000 ; VI-NEXT: s_add_u32 s8, s8, s6 ; VI-NEXT: s_addc_u32 s9, s9, 0 +; VI-NEXT: v_mov_b32_e32 v9, 0xbe31934f +; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:264 +; VI-NEXT: v_mov_b32_e32 v9, 0xb7043519 +; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:260 +; VI-NEXT: v_mov_b32_e32 v9, 0xb702e758 ; VI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4 +; VI-NEXT: v_mov_b32_e32 v6, 0x3f638e37 +; VI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc +; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:256 +; VI-NEXT: v_mov_b32_e32 v9, 0x3e31934f +; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8 +; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c +; VI-NEXT: v_mov_b32_e32 v16, 0xbf638e39 +; VI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 +; VI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c +; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:320 ; VI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e ; VI-NEXT: v_mov_b32_e32 v4, 0x3f523be1 ; VI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2 -; VI-NEXT: v_mov_b32_e32 v6, 0x3f638e37 -; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:320 -; VI-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:316 -; VI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:312 -; VI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:308 ; VI-NEXT: buffer_store_dword v6, off, s[8:11], 0 offset:304 ; VI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:300 ; VI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:296 ; VI-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:292 ; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:288 ; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3 -; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:284 -; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f -; VI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc -; VI-NEXT: v_mov_b32_e32 v9, 0xbe31934f -; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:280 ; VI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:276 ; VI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:272 -; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:264 -; VI-NEXT: v_mov_b32_e32 v9, 0xb7043519 -; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:260 -; VI-NEXT: v_mov_b32_e32 v9, 0xb702e758 -; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:256 -; VI-NEXT: v_mov_b32_e32 v9, 0x3e31934f -; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8 -; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c -; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; VI-NEXT: v_mov_b32_e32 v8, 0xbe319356 ; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:252 ; VI-NEXT: v_mov_b32_e32 v9, 0x3e319356 ; VI-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:244 ; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29dc ; VI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:236 ; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f -; VI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 -; VI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e -; VI-NEXT: v_mov_b32_e32 v14, 0xbf523be3 -; VI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 -; VI-NEXT: v_mov_b32_e32 v16, 0xbf638e39 -; VI-NEXT: v_and_b32_e32 v0, 0x1fc, v0 -; VI-NEXT: s_mov_b32 s0, 0 -; VI-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:268 -; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:248 -; VI-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:240 -; VI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:232 -; VI-NEXT: buffer_store_dword v12, off, s[8:11], 0 
offset:228 -; VI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:224 -; VI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:220 -; VI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:216 +; VI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e +; VI-NEXT: v_mov_b32_e32 v14, 0xbf523be3 +; VI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 ; VI-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:212 ; VI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:208 ; VI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:204 ; VI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:200 -; VI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 -; VI-NEXT: v_add_u32_e32 v1, vcc, 0x200, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 -; VI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:196 -; VI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 -; VI-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen ; VI-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:832 ; VI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:828 ; VI-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:824 ; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:820 ; VI-NEXT: v_mov_b32_e32 v17, 0x3703c499 -; VI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c +; VI-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:796 +; VI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:792 +; VI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:788 +; VI-NEXT: v_mov_b32_e32 v18, 0xbf523be1 +; VI-NEXT: v_and_b32_e32 v0, 0x1fc, v0 +; VI-NEXT: s_mov_b32 s0, 0 +; VI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:308 +; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:284 +; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f +; VI-NEXT: v_mov_b32_e32 v8, 0xbe319356 +; VI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 +; VI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:220 +; VI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 ; VI-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:816 ; VI-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:812 ; VI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:808 ; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:804 ; VI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:800 -; VI-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:796 -; VI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:792 -; VI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:788 -; VI-NEXT: v_mov_b32_e32 v18, 0xbf523be1 ; VI-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:784 ; VI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:780 ; VI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:776 @@ -5352,6 +5348,19 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; VI-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:768 ; VI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:764 ; VI-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x200, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:316 +; VI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:312 +; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:280 +; VI-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:268 +; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:248 +; VI-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:240 +; VI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:232 +; VI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:228 +; VI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:224 +; VI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:216 +; 
VI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:196 ; VI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:760 ; VI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:756 ; VI-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:752 @@ -5366,6 +5375,7 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; VI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:716 ; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:712 ; VI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:708 +; VI-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen ; VI-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen ; VI-NEXT: s_mov_b32 s2, s5 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -5380,83 +5390,69 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; GFX9-MUBUF-NEXT: s_mov_b32 s11, 0xe00000 ; GFX9-MUBUF-NEXT: s_add_u32 s8, s8, s5 ; GFX9-MUBUF-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbe31934f +; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:264 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb7043519 +; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:260 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb702e758 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v6, 0x3f638e37 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbeae29dc +; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:256 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e31934f +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89c +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3f3d349c +; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:320 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f3d349e ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f523be1 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v6, 0x3f638e37 -; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:320 -; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:316 -; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:312 -; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:308 ; GFX9-MUBUF-NEXT: buffer_store_dword v6, off, s[8:11], 0 offset:304 ; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:300 ; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:296 ; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:292 ; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:288 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3 -; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:284 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd89f -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbeae29dc -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbe31934f -; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:280 ; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:276 ; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:272 -; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:264 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb7043519 -; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:260 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb702e758 -; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:256 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e31934f -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8 -; 
GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89c -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbe319356 ; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:252 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e319356 ; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:244 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29dc ; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:236 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89f -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v13, 0xbf3d349e ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39 -; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:268 -; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:248 -; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:240 -; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:232 -; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:228 -; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:224 -; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:220 -; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:216 ; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:212 ; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:208 ; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:204 ; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:200 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 -; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 -; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:196 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 -; GFX9-MUBUF-NEXT: v_add_u32_e32 v1, 0x200, v0 -; GFX9-MUBUF-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen -; GFX9-MUBUF-NEXT: s_nop 0 ; GFX9-MUBUF-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:832 ; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:828 ; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:824 ; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:820 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3703c499 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3f3d349c +; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:796 +; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:792 +; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:788 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0xbf523be1 +; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 +; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:308 +; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:284 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd89f +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbe319356 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 +; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:220 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 ; GFX9-MUBUF-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:816 ; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:812 ; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:808 ; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:804 ; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:800 -; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:796 -; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:792 -; 
GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:788 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0xbf523be1 ; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:784 ; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:780 ; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:776 @@ -5464,6 +5460,18 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:768 ; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:764 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2 +; GFX9-MUBUF-NEXT: v_add_u32_e32 v1, 0x200, v0 +; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:316 +; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:312 +; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:280 +; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:268 +; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:248 +; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:240 +; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:232 +; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:228 +; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:224 +; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:216 +; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:196 ; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:760 ; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:756 ; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:752 @@ -5478,8 +5486,9 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:716 ; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:712 ; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:708 -; GFX9-MUBUF-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen +; GFX9-MUBUF-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen ; GFX9-MUBUF-NEXT: s_mov_b32 s2, s5 +; GFX9-MUBUF-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen ; GFX9-MUBUF-NEXT: s_waitcnt vmcnt(0) ; GFX9-MUBUF-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX9-MUBUF-NEXT: ; return to shader part epilog @@ -5491,10 +5500,10 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; GFX10_W32-MUBUF-NEXT: s_mov_b32 s10, -1 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v2, 0x3f3d349e -; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f5f2ee2 ; GFX10_W32-MUBUF-NEXT: s_mov_b32 s11, 0x31c16000 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f523be1 ; GFX10_W32-MUBUF-NEXT: s_add_u32 s8, s8, s5 +; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f5f2ee2 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbefcd8a3 ; GFX10_W32-MUBUF-NEXT: s_addc_u32 s9, s9, 0 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f638e37 @@ -5505,8 +5514,6 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:312 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:308 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:304 -; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:300 -; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:296 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:292 ; GFX10_W32-MUBUF-NEXT: 
buffer_store_dword v1, off, s[8:11], 0 offset:288 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:284 @@ -5528,7 +5535,6 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3eae29dc ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3efcd89c ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v13, 0x3efcd89f -; GFX10_W32-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:248 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:244 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:240 @@ -5536,24 +5542,25 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:232 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v10, 0xbf20e7f5 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 +; GFX10_W32-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf3d349e ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39 -; GFX10_W32-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:228 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:224 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:220 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:216 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:212 -; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:208 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:204 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4 +; GFX10_W32-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 +; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5 +; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499 +; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:300 +; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:296 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:200 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:196 -; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499 -; GFX10_W32-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0 -; GFX10_W32-MUBUF-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:832 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:828 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:824 @@ -5561,9 +5568,8 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:816 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:812 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c +; GFX10_W32-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1 -; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:808 -; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:804 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:800 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:796 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:792 @@ -5571,6 +5577,8 @@ define amdgpu_hs 
<{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v19, off, s[8:11], 0 offset:784 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:780 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2 +; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:808 +; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:804 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:776 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:772 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:768 @@ -5589,6 +5597,8 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:716 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:712 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:708 +; GFX10_W32-MUBUF-NEXT: s_clause 0x1 +; GFX10_W32-MUBUF-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen ; GFX10_W32-MUBUF-NEXT: buffer_load_dword v1, v6, s[8:11], 0 offen ; GFX10_W32-MUBUF-NEXT: s_mov_b32 s2, s5 ; GFX10_W32-MUBUF-NEXT: s_waitcnt vmcnt(0) @@ -5602,10 +5612,10 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; GFX10_W64-MUBUF-NEXT: s_mov_b32 s10, -1 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v2, 0x3f3d349e -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f5f2ee2 ; GFX10_W64-MUBUF-NEXT: s_mov_b32 s11, 0x31e16000 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f523be1 ; GFX10_W64-MUBUF-NEXT: s_add_u32 s8, s8, s5 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f5f2ee2 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbefcd8a3 ; GFX10_W64-MUBUF-NEXT: s_addc_u32 s9, s9, 0 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f638e37 @@ -5616,8 +5626,6 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:312 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:308 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:304 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:300 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:296 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:292 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:288 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:284 @@ -5639,7 +5647,6 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3eae29dc ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3efcd89c ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v13, 0x3efcd89f -; GFX10_W64-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:248 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:244 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:240 @@ -5647,24 +5654,25 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:232 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v10, 0xbf20e7f5 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 +; GFX10_W64-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf3d349e ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 
0xbf523be3 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39 -; GFX10_W64-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:228 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:224 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:220 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:216 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:212 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:208 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:204 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4 +; GFX10_W64-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:300 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:296 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:200 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:196 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499 -; GFX10_W64-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0 -; GFX10_W64-MUBUF-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:832 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:828 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:824 @@ -5672,9 +5680,8 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:816 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:812 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c +; GFX10_W64-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:808 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:804 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:800 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:796 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:792 @@ -5682,6 +5689,8 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v19, off, s[8:11], 0 offset:784 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:780 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:808 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:804 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:776 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:772 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:768 @@ -5700,6 +5709,8 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:716 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:712 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:708 +; GFX10_W64-MUBUF-NEXT: s_clause 0x1 +; GFX10_W64-MUBUF-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen ; GFX10_W64-MUBUF-NEXT: 
buffer_load_dword v1, v6, s[8:11], 0 offen ; GFX10_W64-MUBUF-NEXT: s_mov_b32 s2, s5 ; GFX10_W64-MUBUF-NEXT: s_waitcnt vmcnt(0) @@ -6093,10 +6104,10 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v0, 0xbeae29dc :: v_dual_mov_b32 v23, v21 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v6, 0x3f5f2ee2 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f638e37 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v24, 0xbf523be3 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:320 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, v8 :: v_dual_mov_b32 v3, v7 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v4, v6 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v24, 0xbf523be3 ; GFX11-FLATSCR-NEXT: s_clause 0x1 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[5:8], off offset:304 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:288 @@ -6105,29 +6116,31 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v13, 0x3eae29dc :: v_dual_mov_b32 v34, v5 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v15, 0x3e319356 :: v_dual_mov_b32 v36, v6 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v25, 0x3f20e7f5 :: v_dual_mov_b32 v26, v17 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, 0xb702e758 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v10, 0xb7043519 +; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, 0xb7043519 :: v_dual_mov_b32 v29, v15 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v11, 0xbe31934f ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v12, 0xbe319356 :: v_dual_mov_b32 v31, v19 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf20e7f5 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3eae29d8 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v16, 0x3e31934f -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v25, 0x3f20e7f5 :: v_dual_mov_b32 v26, v17 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v20, 0x3efcd89c :: v_dual_mov_b32 v29, v15 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v30, v13 ; GFX11-FLATSCR-NEXT: s_clause 0x1 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[0:3], off offset:272 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[9:12], off offset:256 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4 +; GFX11-FLATSCR-NEXT: s_clause 0x1 +; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[21:24], off offset:208 +; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:192 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v20, 0x3efcd89c +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v33, v22 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, v18 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, v2 :: v_dual_mov_b32 v11, v0 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v33, v22 -; GFX11-FLATSCR-NEXT: s_clause 0x3 +; GFX11-FLATSCR-NEXT: s_clause 0x1 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[13:16], off offset:240 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[17:20], off offset:224 -; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[21:24], off offset:208 -; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:192 ; GFX11-FLATSCR-NEXT: scratch_load_b32 v14, v37, off ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v32, 0x3f3d349c :: v_dual_mov_b32 v5, v15 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, v0 @@ -6142,8 +6155,7 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v24, v19 :: v_dual_mov_b32 v35, v21 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, 
v[29:32], off offset:784 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v31, 0xbf5f2ee2 :: v_dual_mov_b32 v32, v6 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v6, v13 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v19, v2 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v6, v13 :: v_dual_mov_b32 v19, v2 ; GFX11-FLATSCR-NEXT: s_clause 0x4 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[33:36], off offset:768 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:752 @@ -6172,43 +6184,38 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg, ; SI-NEXT: s_mov_b32 s11, 0xe8f000 ; SI-NEXT: s_add_u32 s8, s8, s6 ; SI-NEXT: s_addc_u32 s9, s9, 0 -; SI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4 -; SI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e -; SI-NEXT: v_mov_b32_e32 v4, 0x3f523be1 -; SI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2 -; SI-NEXT: v_mov_b32_e32 v6, 0x3f638e37 -; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:320 -; SI-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:316 -; SI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:312 -; SI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:308 -; SI-NEXT: buffer_store_dword v6, off, s[8:11], 0 offset:304 -; SI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:300 -; SI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:296 -; SI-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:292 -; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:288 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3 -; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:284 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f -; SI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc ; SI-NEXT: v_mov_b32_e32 v9, 0xbe31934f -; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:280 -; SI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:276 -; SI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:272 ; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:264 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v9, 0xb7043519 ; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:260 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v9, 0xb702e758 +; SI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4 +; SI-NEXT: v_mov_b32_e32 v6, 0x3f638e37 +; SI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc ; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:256 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v9, 0x3e31934f ; SI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8 ; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c +; SI-NEXT: v_mov_b32_e32 v16, 0xbf638e39 +; SI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 +; SI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; SI-NEXT: v_mov_b32_e32 v8, 0xbe319356 +; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:320 +; SI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e +; SI-NEXT: v_mov_b32_e32 v4, 0x3f523be1 +; SI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2 +; SI-NEXT: buffer_store_dword v6, off, s[8:11], 0 offset:304 +; SI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:300 +; SI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:296 +; SI-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:292 +; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:288 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3 +; SI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:276 +; SI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:272 ; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:252 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v9, 0x3e319356 @@ -6218,49 
+6225,40 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg, ; SI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:236 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f -; SI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 ; SI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e ; SI-NEXT: v_mov_b32_e32 v14, 0xbf523be3 ; SI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 -; SI-NEXT: v_mov_b32_e32 v16, 0xbf638e39 -; SI-NEXT: v_and_b32_e32 v0, 0x1fc, v0 -; SI-NEXT: s_mov_b32 s0, 0 -; SI-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:268 -; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:248 -; SI-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:240 -; SI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:232 -; SI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:228 -; SI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:224 -; SI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:220 -; SI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:216 ; SI-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:212 ; SI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:208 ; SI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:204 ; SI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:200 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x200, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0 -; SI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:196 -; SI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 -; SI-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen ; SI-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:832 ; SI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:828 ; SI-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:824 ; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:820 ; SI-NEXT: s_waitcnt expcnt(3) ; SI-NEXT: v_mov_b32_e32 v17, 0x3703c499 -; SI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c +; SI-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:796 +; SI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:792 +; SI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:788 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mov_b32_e32 v18, 0xbf523be1 +; SI-NEXT: v_and_b32_e32 v0, 0x1fc, v0 +; SI-NEXT: s_mov_b32 s0, 0 +; SI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:308 +; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:284 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f +; SI-NEXT: v_mov_b32_e32 v8, 0xbe319356 +; SI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 +; SI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:220 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 ; SI-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:816 ; SI-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:812 ; SI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:808 ; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:804 ; SI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:800 -; SI-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:796 -; SI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:792 -; SI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:788 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_mov_b32_e32 v18, 0xbf523be1 ; SI-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:784 ; SI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:780 ; SI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:776 @@ -6269,6 +6267,19 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg, ; SI-NEXT: 
buffer_store_dword v5, off, s[8:11], 0 offset:764 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x200, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0 +; SI-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:316 +; SI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:312 +; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:280 +; SI-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:268 +; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:248 +; SI-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:240 +; SI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:232 +; SI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:228 +; SI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:224 +; SI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:216 +; SI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:196 ; SI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:760 ; SI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:756 ; SI-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:752 @@ -6283,6 +6294,7 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg, ; SI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:716 ; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:712 ; SI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:708 +; SI-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen ; SI-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen ; SI-NEXT: s_mov_b32 s2, s5 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -6298,84 +6310,70 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg, ; VI-NEXT: s_mov_b32 s11, 0xe80000 ; VI-NEXT: s_add_u32 s8, s8, s6 ; VI-NEXT: s_addc_u32 s9, s9, 0 +; VI-NEXT: v_mov_b32_e32 v9, 0xbe31934f +; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:264 +; VI-NEXT: v_mov_b32_e32 v9, 0xb7043519 +; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:260 +; VI-NEXT: v_mov_b32_e32 v9, 0xb702e758 ; VI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4 +; VI-NEXT: v_mov_b32_e32 v6, 0x3f638e37 +; VI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc +; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:256 +; VI-NEXT: v_mov_b32_e32 v9, 0x3e31934f +; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8 +; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c +; VI-NEXT: v_mov_b32_e32 v16, 0xbf638e39 +; VI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 +; VI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c +; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:320 ; VI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e ; VI-NEXT: v_mov_b32_e32 v4, 0x3f523be1 ; VI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2 -; VI-NEXT: v_mov_b32_e32 v6, 0x3f638e37 -; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:320 -; VI-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:316 -; VI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:312 -; VI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:308 ; VI-NEXT: buffer_store_dword v6, off, s[8:11], 0 offset:304 ; VI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:300 ; VI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:296 ; VI-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:292 ; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:288 ; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3 -; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:284 -; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f -; VI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc -; VI-NEXT: v_mov_b32_e32 v9, 0xbe31934f -; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:280 ; VI-NEXT: buffer_store_dword v7, 
off, s[8:11], 0 offset:276 ; VI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:272 -; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:264 -; VI-NEXT: v_mov_b32_e32 v9, 0xb7043519 -; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:260 -; VI-NEXT: v_mov_b32_e32 v9, 0xb702e758 -; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:256 -; VI-NEXT: v_mov_b32_e32 v9, 0x3e31934f -; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8 -; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c -; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; VI-NEXT: v_mov_b32_e32 v8, 0xbe319356 ; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:252 ; VI-NEXT: v_mov_b32_e32 v9, 0x3e319356 ; VI-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:244 ; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29dc ; VI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:236 ; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f -; VI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 ; VI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e ; VI-NEXT: v_mov_b32_e32 v14, 0xbf523be3 ; VI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 -; VI-NEXT: v_mov_b32_e32 v16, 0xbf638e39 -; VI-NEXT: v_and_b32_e32 v0, 0x1fc, v0 -; VI-NEXT: s_mov_b32 s0, 0 -; VI-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:268 -; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:248 -; VI-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:240 -; VI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:232 -; VI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:228 -; VI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:224 -; VI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:220 -; VI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:216 ; VI-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:212 ; VI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:208 ; VI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:204 ; VI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:200 -; VI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 -; VI-NEXT: v_add_u32_e32 v1, vcc, 0x200, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 -; VI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:196 -; VI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 -; VI-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen ; VI-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:832 ; VI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:828 ; VI-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:824 ; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:820 ; VI-NEXT: v_mov_b32_e32 v17, 0x3703c499 -; VI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c +; VI-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:796 +; VI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:792 +; VI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:788 +; VI-NEXT: v_mov_b32_e32 v18, 0xbf523be1 +; VI-NEXT: v_and_b32_e32 v0, 0x1fc, v0 +; VI-NEXT: s_mov_b32 s0, 0 +; VI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:308 +; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:284 +; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f +; VI-NEXT: v_mov_b32_e32 v8, 0xbe319356 +; VI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 +; VI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:220 +; VI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 ; VI-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:816 ; VI-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:812 ; VI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:808 ; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:804 ; VI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:800 -; VI-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:796 -; VI-NEXT: 
buffer_store_dword v13, off, s[8:11], 0 offset:792 -; VI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:788 -; VI-NEXT: v_mov_b32_e32 v18, 0xbf523be1 ; VI-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:784 ; VI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:780 ; VI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:776 @@ -6383,6 +6381,19 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg, ; VI-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:768 ; VI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:764 ; VI-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x200, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:316 +; VI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:312 +; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:280 +; VI-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:268 +; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:248 +; VI-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:240 +; VI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:232 +; VI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:228 +; VI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:224 +; VI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:216 +; VI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:196 ; VI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:760 ; VI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:756 ; VI-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:752 @@ -6397,6 +6408,7 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg, ; VI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:716 ; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:712 ; VI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:708 +; VI-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen ; VI-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen ; VI-NEXT: s_mov_b32 s2, s5 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -6411,83 +6423,69 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg, ; GFX9-MUBUF-NEXT: s_mov_b32 s11, 0xe00000 ; GFX9-MUBUF-NEXT: s_add_u32 s8, s8, s5 ; GFX9-MUBUF-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbe31934f +; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:264 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb7043519 +; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:260 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb702e758 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v6, 0x3f638e37 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbeae29dc +; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:256 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e31934f +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89c +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3f3d349c +; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:320 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f3d349e ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f523be1 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v6, 0x3f638e37 -; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:320 -; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:316 -; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 
0 offset:312 -; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:308 ; GFX9-MUBUF-NEXT: buffer_store_dword v6, off, s[8:11], 0 offset:304 ; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:300 ; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:296 ; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:292 ; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:288 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3 -; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:284 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd89f -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbeae29dc -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbe31934f -; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:280 ; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:276 ; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:272 -; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:264 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb7043519 -; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:260 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb702e758 -; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:256 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e31934f -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89c -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbe319356 ; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:252 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e319356 ; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:244 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29dc ; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:236 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89f -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v13, 0xbf3d349e ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39 -; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:268 -; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:248 -; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:240 -; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:232 -; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:228 -; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:224 -; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:220 -; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:216 ; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:212 ; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:208 ; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:204 ; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:200 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 -; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 -; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:196 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 -; GFX9-MUBUF-NEXT: v_add_u32_e32 v1, 0x200, v0 -; GFX9-MUBUF-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen -; GFX9-MUBUF-NEXT: s_nop 0 ; GFX9-MUBUF-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:832 ; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:828 ; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:824 ; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:820 ; 
GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3703c499 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3f3d349c +; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:796 +; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:792 +; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:788 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0xbf523be1 +; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 +; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:308 +; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:284 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd89f +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbe319356 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 +; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:220 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 ; GFX9-MUBUF-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:816 ; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:812 ; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:808 ; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:804 ; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:800 -; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:796 -; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:792 -; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:788 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0xbf523be1 ; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:784 ; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:780 ; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:776 @@ -6495,6 +6493,18 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg, ; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:768 ; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:764 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2 +; GFX9-MUBUF-NEXT: v_add_u32_e32 v1, 0x200, v0 +; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:316 +; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:312 +; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:280 +; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:268 +; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:248 +; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:240 +; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:232 +; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:228 +; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:224 +; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:216 +; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:196 ; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:760 ; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:756 ; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:752 @@ -6509,8 +6519,9 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg, ; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:716 ; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:712 ; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:708 -; GFX9-MUBUF-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen +; GFX9-MUBUF-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen ; GFX9-MUBUF-NEXT: s_mov_b32 s2, s5 +; GFX9-MUBUF-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen ; GFX9-MUBUF-NEXT: s_waitcnt vmcnt(0) ; GFX9-MUBUF-NEXT: 
v_add_f32_e32 v0, v0, v1 ; GFX9-MUBUF-NEXT: ; return to shader part epilog @@ -6522,10 +6533,10 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg, ; GFX10_W32-MUBUF-NEXT: s_mov_b32 s10, -1 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v2, 0x3f3d349e -; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f5f2ee2 ; GFX10_W32-MUBUF-NEXT: s_mov_b32 s11, 0x31c16000 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f523be1 ; GFX10_W32-MUBUF-NEXT: s_add_u32 s8, s8, s5 +; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f5f2ee2 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbefcd8a3 ; GFX10_W32-MUBUF-NEXT: s_addc_u32 s9, s9, 0 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f638e37 @@ -6536,8 +6547,6 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg, ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:312 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:308 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:304 -; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:300 -; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:296 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:292 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:288 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:284 @@ -6559,7 +6568,6 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg, ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3eae29dc ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3efcd89c ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v13, 0x3efcd89f -; GFX10_W32-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:248 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:244 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:240 @@ -6567,24 +6575,25 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg, ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:232 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v10, 0xbf20e7f5 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 +; GFX10_W32-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf3d349e ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39 -; GFX10_W32-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:228 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:224 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:220 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:216 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:212 -; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:208 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:204 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4 +; GFX10_W32-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 +; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5 +; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499 +; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:300 +; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:296 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:200 ; 
GFX10_W32-MUBUF-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:196 -; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499 -; GFX10_W32-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0 -; GFX10_W32-MUBUF-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:832 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:828 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:824 @@ -6592,9 +6601,8 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg, ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:816 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:812 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c +; GFX10_W32-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1 -; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:808 -; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:804 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:800 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:796 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:792 @@ -6602,6 +6610,8 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg, ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v19, off, s[8:11], 0 offset:784 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:780 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2 +; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:808 +; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:804 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:776 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:772 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:768 @@ -6620,6 +6630,8 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg, ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:716 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:712 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:708 +; GFX10_W32-MUBUF-NEXT: s_clause 0x1 +; GFX10_W32-MUBUF-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen ; GFX10_W32-MUBUF-NEXT: buffer_load_dword v1, v6, s[8:11], 0 offen ; GFX10_W32-MUBUF-NEXT: s_mov_b32 s2, s5 ; GFX10_W32-MUBUF-NEXT: s_waitcnt vmcnt(0) @@ -6633,10 +6645,10 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg, ; GFX10_W64-MUBUF-NEXT: s_mov_b32 s10, -1 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v2, 0x3f3d349e -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f5f2ee2 ; GFX10_W64-MUBUF-NEXT: s_mov_b32 s11, 0x31e16000 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f523be1 ; GFX10_W64-MUBUF-NEXT: s_add_u32 s8, s8, s5 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f5f2ee2 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbefcd8a3 ; GFX10_W64-MUBUF-NEXT: s_addc_u32 s9, s9, 0 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f638e37 @@ -6647,8 +6659,6 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg, ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:312 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:308 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:304 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword 
v4, off, s[8:11], 0 offset:300 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:296 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:292 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:288 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:284 @@ -6670,7 +6680,6 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg, ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3eae29dc ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3efcd89c ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v13, 0x3efcd89f -; GFX10_W64-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:248 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:244 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:240 @@ -6678,24 +6687,25 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg, ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:232 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v10, 0xbf20e7f5 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 +; GFX10_W64-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf3d349e ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39 -; GFX10_W64-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:228 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:224 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:220 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:216 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:212 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:208 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:204 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4 +; GFX10_W64-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:300 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:296 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:200 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:196 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499 -; GFX10_W64-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0 -; GFX10_W64-MUBUF-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:832 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:828 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:824 @@ -6703,9 +6713,8 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg, ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:816 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:812 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c +; GFX10_W64-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:808 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:804 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 
offset:800 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:796 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:792 @@ -6713,6 +6722,8 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg, ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v19, off, s[8:11], 0 offset:784 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:780 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:808 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:804 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:776 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:772 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:768 @@ -6731,6 +6742,8 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg, ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:716 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:712 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:708 +; GFX10_W64-MUBUF-NEXT: s_clause 0x1 +; GFX10_W64-MUBUF-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen ; GFX10_W64-MUBUF-NEXT: buffer_load_dword v1, v6, s[8:11], 0 offen ; GFX10_W64-MUBUF-NEXT: s_mov_b32 s2, s5 ; GFX10_W64-MUBUF-NEXT: s_waitcnt vmcnt(0) @@ -7124,10 +7137,10 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg, ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v0, 0xbeae29dc :: v_dual_mov_b32 v23, v21 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v6, 0x3f5f2ee2 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f638e37 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v24, 0xbf523be3 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:320 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, v8 :: v_dual_mov_b32 v3, v7 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v4, v6 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v24, 0xbf523be3 ; GFX11-FLATSCR-NEXT: s_clause 0x1 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[5:8], off offset:304 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:288 @@ -7136,29 +7149,31 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg, ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v13, 0x3eae29dc :: v_dual_mov_b32 v34, v5 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v15, 0x3e319356 :: v_dual_mov_b32 v36, v6 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v25, 0x3f20e7f5 :: v_dual_mov_b32 v26, v17 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, 0xb702e758 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v10, 0xb7043519 +; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, 0xb7043519 :: v_dual_mov_b32 v29, v15 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v11, 0xbe31934f ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v12, 0xbe319356 :: v_dual_mov_b32 v31, v19 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf20e7f5 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3eae29d8 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v16, 0x3e31934f -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v25, 0x3f20e7f5 :: v_dual_mov_b32 v26, v17 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v20, 0x3efcd89c :: v_dual_mov_b32 v29, v15 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v30, v13 ; GFX11-FLATSCR-NEXT: s_clause 0x1 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[0:3], off offset:272 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, 
v[9:12], off offset:256 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4 +; GFX11-FLATSCR-NEXT: s_clause 0x1 +; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[21:24], off offset:208 +; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:192 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v20, 0x3efcd89c +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v33, v22 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, v18 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, v2 :: v_dual_mov_b32 v11, v0 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v33, v22 -; GFX11-FLATSCR-NEXT: s_clause 0x3 +; GFX11-FLATSCR-NEXT: s_clause 0x1 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[13:16], off offset:240 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[17:20], off offset:224 -; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[21:24], off offset:208 -; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:192 ; GFX11-FLATSCR-NEXT: scratch_load_b32 v14, v37, off ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v32, 0x3f3d349c :: v_dual_mov_b32 v5, v15 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, v0 @@ -7173,8 +7188,7 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg, ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v24, v19 :: v_dual_mov_b32 v35, v21 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:784 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v31, 0xbf5f2ee2 :: v_dual_mov_b32 v32, v6 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v6, v13 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v19, v2 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v6, v13 :: v_dual_mov_b32 v19, v2 ; GFX11-FLATSCR-NEXT: s_clause 0x4 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[33:36], off offset:768 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:752 diff --git a/llvm/test/CodeGen/AMDGPU/soft-clause-exceeds-register-budget.ll b/llvm/test/CodeGen/AMDGPU/soft-clause-exceeds-register-budget.ll index 71e4755b58bf2..c90d7887f2ff6 100644 --- a/llvm/test/CodeGen/AMDGPU/soft-clause-exceeds-register-budget.ll +++ b/llvm/test/CodeGen/AMDGPU/soft-clause-exceeds-register-budget.ll @@ -3,9 +3,6 @@ define protected amdgpu_kernel void @excess_soft_clause_reg_pressure(ptr addrspace(4) %wei_ptr, ptr addrspace(1) %out_ptr, ptr addrspace(1) %in) { ; CHECK-LABEL: excess_soft_clause_reg_pressure: ; CHECK: BB0_1: ; %for.cond28.preheader -; CHECK: s_load_dwordx16 -; CHECK-NEXT: s_load_dwordx16 - ; CHECK: global_load_dword ; CHECK-NEXT: global_load_dword ; CHECK-NEXT: global_load_dword @@ -18,11 +15,23 @@ define protected amdgpu_kernel void @excess_soft_clause_reg_pressure(ptr addrspa ; CHECK-NOT: v_readlane_b32 ; CHECK: s_load_dwordx16 +; CHECK-NEXT: s_load_dwordx16 + +; CHECK-NOT: v_writelane_b32 +; CHECK-NOT: v_readlane_b32 + ; CHECK: s_load_dwordx16 +; CHECK-NEXT: s_load_dwordx16 + +; CHECK-NOT: v_writelane_b32 +; CHECK-NOT: v_readlane_b32 + ; CHECK: s_load_dwordx16 +; CHECK-NEXT: s_load_dwordx16 ; CHECK-NOT: v_writelane_b32 ; CHECK-NOT: v_readlane_b32 + entry: %i = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() %i2 = load i64, ptr addrspace(4) %i, align 8 diff --git a/llvm/test/CodeGen/AMDGPU/spill-agpr.ll b/llvm/test/CodeGen/AMDGPU/spill-agpr.ll index da48af100d27b..1a0f75e048cb9 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-agpr.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-agpr.ll @@ -448,13 +448,13 @@ define amdgpu_kernel void @max_6regs_used_8a(ptr addrspace(1) %arg) #4 { ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_nop 0 ; GFX90A-NEXT: v_mfma_f32_4x4x1f32 a[0:3], v2, v2, a[0:3] -; GFX90A-NEXT: s_nop 4 -; 
GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[2:3] ; GFX90A-NEXT: buffer_load_dword v2, off, s[8:11], 0 ; 4-byte Folded Reload ; GFX90A-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:4 ; 4-byte Folded Reload ; GFX90A-NEXT: buffer_load_dword v4, off, s[8:11], 0 offset:8 ; 4-byte Folded Reload ; GFX90A-NEXT: buffer_load_dword v5, off, s[8:11], 0 offset:12 ; 4-byte Folded Reload -; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[2:3] +; GFX90A-NEXT: s_waitcnt vmcnt(1) ; GFX90A-NEXT: global_store_dwordx4 v[0:1], v[2:5], off ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART diff --git a/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll b/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll index 50056b62b3397..b5474b8974b29 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll @@ -10314,7 +10314,8 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX9-FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s9, 0 ; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2050 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, 16 -; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v5, s[38:39] offset:144 +; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(1) ; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[38:39] offset:224 ; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2040 @@ -10327,12 +10328,10 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[19:22], v5, s[38:39] offset:192 ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[15:18], v5, s[38:39] offset:176 ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[38:39] offset:160 -; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v5, s[38:39] offset:144 ; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2020 -; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(1) +; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) ; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2070 -; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(1) ; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v5, s[38:39] offset:128 ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[38:39] offset:112 @@ -10344,7 +10343,9 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[38:39] offset:96 ; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x20b0 -; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[11:14], v5, s[38:39] offset:32 +; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v5, s[38:39] offset:16 +; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(2) ; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[38:39] offset:80 ; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x20a0 @@ -10358,10 +10359,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2080 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) ; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill -; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[11:14], v5, s[38:39] offset:32 -; GFX9-FLATSCR-NEXT: 
global_load_dwordx4 v[6:9], v5, s[38:39] offset:16 ; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2060 -; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) ; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[7:10], v5, s[38:39] ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v6, 1 @@ -10468,13 +10466,13 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac ; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[6:9], s[36:37] offset:224 ; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[6:9], off, s0 ; 16-byte Folded Reload ; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2020 -; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload +; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2070 +; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(1) ; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[6:9], s[36:37] offset:208 ; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[19:22], s[36:37] offset:192 ; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[15:18], s[36:37] offset:176 -; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload -; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2070 -; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(3) ; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[36:37] offset:160 ; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload ; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2010 diff --git a/llvm/test/CodeGen/AMDGPU/stack-realign.ll b/llvm/test/CodeGen/AMDGPU/stack-realign.ll index 9cb22dad86b88..802de8037cf6b 100644 --- a/llvm/test/CodeGen/AMDGPU/stack-realign.ll +++ b/llvm/test/CodeGen/AMDGPU/stack-realign.ll @@ -295,9 +295,9 @@ define void @func_call_align1024_bp_gets_vgpr_spill(<32 x i32> %a, i32 %b) #0 { ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:1028 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[18:19] ; GCN-NEXT: v_writelane_b32 v40, s16, 2 -; GCN-NEXT: v_mov_b32_e32 v32, 0 ; GCN-NEXT: v_writelane_b32 v40, s34, 3 ; GCN-NEXT: s_mov_b32 s34, s32 +; GCN-NEXT: v_mov_b32_e32 v32, 0 ; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s33 offset:1024 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s34 diff --git a/llvm/test/CodeGen/Thumb2/mve-vpt-block-fold-vcmp.mir b/llvm/test/CodeGen/Thumb2/mve-vpt-block-fold-vcmp.mir index ee2e58f2a6cc1..a1771f9356014 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vpt-block-fold-vcmp.mir +++ b/llvm/test/CodeGen/Thumb2/mve-vpt-block-fold-vcmp.mir @@ -98,28 +98,29 @@ body: | ; CHECK-LABEL: name: foo ; CHECK: liveins: $q0, $r0, $r1, $r2, $lr - ; CHECK: $sp = frame-setup t2STMDB_UPD $sp, 14 /* CC::al */, $noreg, killed $r7, killed $lr - ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8 - ; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4 - ; CHECK: frame-setup CFI_INSTRUCTION offset $r7, -8 - ; CHECK: $r7 = frame-setup tMOVr killed $sp, 14 /* CC::al */, $noreg - ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_register $r7 - ; CHECK: renamable $r12 = t2LDRi12 $r7, 16, 14 /* CC::al */, $noreg :: (load (s32) from %fixed-stack.2) - ; CHECK: renamable $lr = t2LDRi12 $r7, 12, 14 /* CC::al */, $noreg :: (load (s32) from %fixed-stack.1) - ; CHECK: renamable $r3 = t2LDRi12 $r7, 8, 14 /* CC::al */, $noreg :: (load (s32) from %fixed-stack.0) - ; CHECK: BUNDLE implicit-def $vpr, implicit-def dead $q0, implicit $q0, implicit $zr, implicit killed $r0, implicit killed $r3, implicit killed $r1, implicit killed $lr { - ; CHECK: MVE_VPTv4f32r 1, renamable $q0, 
$zr, 10, implicit-def $vpr - ; CHECK: renamable $q0 = MVE_VLDRWU32 killed renamable $r0, 0, 1, internal renamable $vpr, $noreg :: (load (s128) from %ir.src, align 4) - ; CHECK: MVE_VSTRWU32 internal killed renamable $q0, killed renamable $r3, 0, 1, internal renamable $vpr, $noreg :: (store (s128) into %ir.dest, align 4) - ; CHECK: renamable $q0 = MVE_VLDRWU32 killed renamable $r1, 0, 1, internal renamable $vpr, $noreg :: (load (s128) from %ir.src2, align 4) - ; CHECK: MVE_VSTRWU32 internal killed renamable $q0, killed renamable $lr, 0, 1, internal renamable $vpr, $noreg :: (store (s128) into %ir.dest2, align 4) - ; CHECK: } - ; CHECK: BUNDLE implicit-def $q0, implicit killed $vpr, implicit killed $r2, implicit killed $r12 { - ; CHECK: MVE_VPST 4, implicit $vpr - ; CHECK: renamable $q0 = MVE_VLDRWU32 killed renamable $r2, 0, 1, renamable $vpr, $noreg :: (load (s128) from %ir.src3, align 4) - ; CHECK: MVE_VSTRWU32 internal renamable $q0, killed renamable $r12, 0, 1, killed renamable $vpr, $noreg :: (store (s128) into %ir.dest3, align 4) - ; CHECK: } - ; CHECK: $sp = t2LDMIA_RET $sp, 14 /* CC::al */, $noreg, def $r7, def $pc, implicit $q0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $sp = frame-setup t2STMDB_UPD $sp, 14 /* CC::al */, $noreg, killed $r7, killed $lr + ; CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_offset 8 + ; CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $lr, -4 + ; CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $r7, -8 + ; CHECK-NEXT: $r7 = frame-setup tMOVr killed $sp, 14 /* CC::al */, $noreg + ; CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_register $r7 + ; CHECK-NEXT: renamable $r12 = t2LDRi12 $r7, 16, 14 /* CC::al */, $noreg :: (load (s32) from %fixed-stack.2) + ; CHECK-NEXT: renamable $lr = t2LDRi12 $r7, 12, 14 /* CC::al */, $noreg :: (load (s32) from %fixed-stack.1) + ; CHECK-NEXT: renamable $r3 = t2LDRi12 $r7, 8, 14 /* CC::al */, $noreg :: (load (s32) from %fixed-stack.0) + ; CHECK-NEXT: BUNDLE implicit-def $vpr, implicit-def dead $q0, implicit $q0, implicit $zr, implicit killed $r0, implicit killed $r3, implicit killed $r1, implicit killed $lr :: (load (s128) from %ir.src, align 4), (store (s128) into %ir.dest, align 4), (load (s128) from %ir.src2, align 4), (store (s128) into %ir.dest2, align 4) { + ; CHECK-NEXT: MVE_VPTv4f32r 1, renamable $q0, $zr, 10, implicit-def $vpr + ; CHECK-NEXT: renamable $q0 = MVE_VLDRWU32 killed renamable $r0, 0, 1, internal renamable $vpr, $noreg :: (load (s128) from %ir.src, align 4) + ; CHECK-NEXT: MVE_VSTRWU32 internal killed renamable $q0, killed renamable $r3, 0, 1, internal renamable $vpr, $noreg :: (store (s128) into %ir.dest, align 4) + ; CHECK-NEXT: renamable $q0 = MVE_VLDRWU32 killed renamable $r1, 0, 1, internal renamable $vpr, $noreg :: (load (s128) from %ir.src2, align 4) + ; CHECK-NEXT: MVE_VSTRWU32 internal killed renamable $q0, killed renamable $lr, 0, 1, internal renamable $vpr, $noreg :: (store (s128) into %ir.dest2, align 4) + ; CHECK-NEXT: } + ; CHECK-NEXT: BUNDLE implicit-def $q0, implicit killed $vpr, implicit killed $r2, implicit killed $r12 :: (load (s128) from %ir.src3, align 4), (store (s128) into %ir.dest3, align 4) { + ; CHECK-NEXT: MVE_VPST 4, implicit $vpr + ; CHECK-NEXT: renamable $q0 = MVE_VLDRWU32 killed renamable $r2, 0, 1, renamable $vpr, $noreg :: (load (s128) from %ir.src3, align 4) + ; CHECK-NEXT: MVE_VSTRWU32 internal renamable $q0, killed renamable $r12, 0, 1, killed renamable $vpr, $noreg :: (store (s128) into %ir.dest3, align 4) + ; CHECK-NEXT: } + ; CHECK-NEXT: $sp = t2LDMIA_RET $sp, 14 /* CC::al 
*/, $noreg, def $r7, def $pc, implicit $q0
    $sp = frame-setup t2STMDB_UPD $sp, 14, $noreg, killed $r7, killed $lr
    frame-setup CFI_INSTRUCTION def_cfa_offset 8
    frame-setup CFI_INSTRUCTION offset $lr, -4

From 66f52ca9a09c6e265385046f36ac51a863078948 Mon Sep 17 00:00:00 2001
From: Saleem Abdulrasool <compnerd@compnerd.org>
Date: Tue, 4 Nov 2025 23:22:47 -0800
Subject: [PATCH 283/313] test: correct typo in RUN line (#166511)

Correct a typo in the triple that is used for the test. Because the OS
was not recognised, it would fall back to the non-Windows code
generation.
---
 llvm/test/CodeGen/AArch64/preserve_mostcc.ll | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/test/CodeGen/AArch64/preserve_mostcc.ll b/llvm/test/CodeGen/AArch64/preserve_mostcc.ll
index 75c8567e2095e..f77ada4eae022 100644
--- a/llvm/test/CodeGen/AArch64/preserve_mostcc.ll
+++ b/llvm/test/CodeGen/AArch64/preserve_mostcc.ll
@@ -1,5 +1,5 @@
 ; RUN: llc < %s -mtriple=arm64-apple-ios-8.0.0 | FileCheck -check-prefix CHECK -check-prefix CHECK-DARWIN %s
-; RUN: llc < %s -mtriple=aarch64-unknown-windiws-msvc | FileCheck -check-prefix CHECK -check-prefix CHECK-WIN %s
+; RUN: llc < %s -mtriple=aarch64-unknown-windows-msvc | FileCheck -check-prefix CHECK -check-prefix CHECK-WIN %s

 declare void @standard_cc_func()
 declare preserve_mostcc void @preserve_mostcc_func()

From 9016c60c685e8f651fa9f9250703afe5f2c02565 Mon Sep 17 00:00:00 2001
From: Timm Baeder <tbaeder@redhat.com>
Date: Wed, 5 Nov 2025 08:39:34 +0100
Subject: [PATCH 284/313] [clang] Call ActOnCaseExpr even if the 'case' is missing (#166326)

This otherwise happens in ParseCaseExpression. If we don't call this,
we don't perform the usual arithmetic conversions, etc.
---
 clang/lib/Parse/ParseStmt.cpp                    | 2 +-
 clang/test/SemaCXX/constant-expression-cxx14.cpp | 6 ++++++
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/clang/lib/Parse/ParseStmt.cpp b/clang/lib/Parse/ParseStmt.cpp
index 92038985f9163..fb45db1139349 100644
--- a/clang/lib/Parse/ParseStmt.cpp
+++ b/clang/lib/Parse/ParseStmt.cpp
@@ -813,7 +813,7 @@ StmtResult Parser::ParseCaseStatement(ParsedStmtContext StmtCtx,
       return StmtError();
     }
   } else {
-    LHS = Expr;
+    LHS = Actions.ActOnCaseExpr(CaseLoc, Expr);
     MissingCase = false;
   }

diff --git a/clang/test/SemaCXX/constant-expression-cxx14.cpp b/clang/test/SemaCXX/constant-expression-cxx14.cpp
index bea90ff7eaf8a..1fc6e5ec4cc55 100644
--- a/clang/test/SemaCXX/constant-expression-cxx14.cpp
+++ b/clang/test/SemaCXX/constant-expression-cxx14.cpp
@@ -1450,3 +1450,9 @@ namespace GH149500 {
   unsigned int * p = &(*(unsigned int *)0x400);
   static const void *q = &(*(const struct sysrq_key_op *)0);
 }
+
+constexpr bool missingCase() {
+  switch (1) {
+    1u: return false; // expected-error {{expected 'case' keyword before expression}}
+  }
+}

From 98f0139f5bd9f8caafea5b7dc3ed27ec7ee4408f Mon Sep 17 00:00:00 2001
From: Karlo Basioli <basioli@google.com>
Date: Wed, 5 Nov 2025 09:41:04 +0100
Subject: [PATCH 285/313] Fix bazel build issue caused by #166259 (#166519)

---
 utils/bazel/llvm-project-overlay/llvm/BUILD.bazel | 1 +
 1 file changed, 1 insertion(+)

diff --git a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel
index bcee6c4e243a4..ad013035251f5 100644
--- a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel
@@ -2551,6 +2551,7 @@ llvm_target_lib_list = [lib for lib in [
             "lib/Target/WebAssembly/WebAssemblyGenRegisterInfo.inc": ["-gen-register-info"],
"lib/Target/WebAssembly/WebAssemblyGenSubtargetInfo.inc": ["-gen-subtarget"], "lib/Target/WebAssembly/WebAssemblyGenAsmMatcher.inc": ["-gen-asm-matcher"], + "lib/Target/WebAssembly/WebAssemblyGenSDNodeInfo.inc": ["-gen-sd-node-info"], }, }, { From 988c1b1c8dc87b5f373046c999d47707e358bdd0 Mon Sep 17 00:00:00 2001 From: Nikolas Klauser <nikolasklauser@berlin.de> Date: Wed, 5 Nov 2025 09:42:35 +0100 Subject: [PATCH 286/313] [libc++] Remove <cstdlib> include from <exception> (#166340) --- libcxx/include/__exception/exception_ptr.h | 7 ++++--- libcxx/include/exception | 5 ++++- libcxx/test/libcxx/transitive_includes/cxx26.csv | 1 - 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/libcxx/include/__exception/exception_ptr.h b/libcxx/include/__exception/exception_ptr.h index e78126ea23852..aef036a2c9586 100644 --- a/libcxx/include/__exception/exception_ptr.h +++ b/libcxx/include/__exception/exception_ptr.h @@ -11,6 +11,7 @@ #include <__config> #include <__cstddef/nullptr_t.h> +#include <__cstddef/size_t.h> #include <__exception/operations.h> #include <__memory/addressof.h> #include <__memory/construct_at.h> @@ -18,7 +19,7 @@ #include <__type_traits/is_pointer.h> #include <__utility/move.h> #include <__utility/swap.h> -#include <cstdlib> +#include <__verbose_abort> #include <typeinfo> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) @@ -35,7 +36,7 @@ _LIBCPP_PUSH_MACROS namespace __cxxabiv1 { extern "C" { -_LIBCPP_OVERRIDABLE_FUNC_VIS void* __cxa_allocate_exception(size_t) throw(); +_LIBCPP_OVERRIDABLE_FUNC_VIS void* __cxa_allocate_exception(std::size_t) throw(); _LIBCPP_OVERRIDABLE_FUNC_VIS void __cxa_free_exception(void*) throw(); struct __cxa_exception; @@ -174,7 +175,7 @@ _LIBCPP_HIDE_FROM_ABI exception_ptr make_exception_ptr(_Ep __e) _NOEXCEPT { # else // !_LIBCPP_HAS_EXCEPTIONS template <class _Ep> _LIBCPP_HIDE_FROM_ABI exception_ptr make_exception_ptr(_Ep) _NOEXCEPT { - std::abort(); + _LIBCPP_VERBOSE_ABORT("make_exception_ptr was called in -fno-exceptions mode"); } # endif // _LIBCPP_HAS_EXCEPTIONS diff --git a/libcxx/include/exception b/libcxx/include/exception index 74229cd16c006..0b2372e571e99 100644 --- a/libcxx/include/exception +++ b/libcxx/include/exception @@ -93,10 +93,13 @@ template <class E> void rethrow_if_nested(const E& e); # if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20 # include <cstddef> -# include <cstdlib> # include <new> # include <type_traits> # endif + +# if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 23 +# include <cstdlib> +# endif #endif // __cplusplus < 201103L && defined(_LIBCPP_USE_FROZEN_CXX03_HEADERS) #endif // _LIBCPP_EXCEPTION diff --git a/libcxx/test/libcxx/transitive_includes/cxx26.csv b/libcxx/test/libcxx/transitive_includes/cxx26.csv index d047b29b63cc6..8c3e1f0a97dfe 100644 --- a/libcxx/test/libcxx/transitive_includes/cxx26.csv +++ b/libcxx/test/libcxx/transitive_includes/cxx26.csv @@ -245,7 +245,6 @@ deque stdexcept deque tuple deque version exception cstdint -exception cstdlib exception typeinfo exception version execution version From 51d0f6d6e172c133e1fbd87447bba49a2ad7e6fe Mon Sep 17 00:00:00 2001 From: mitchell <mitchell.xu2@gmail.com> Date: Wed, 5 Nov 2025 16:43:27 +0800 Subject: [PATCH 287/313] [clang-tidy] Rename `cert-dcl58-cpp` to `bugprone-std-namespace-modification` (#165659) Closes [#157290](https://github.com/llvm/llvm-project/issues/157290) --- .../bugprone/BugproneTidyModule.cpp | 3 + .../clang-tidy/bugprone/CMakeLists.txt | 1 + .../StdNamespaceModificationCheck.cpp} | 10 
+-- .../StdNamespaceModificationCheck.h} | 16 ++--- .../clang-tidy/cert/CERTTidyModule.cpp | 5 +- .../clang-tidy/cert/CMakeLists.txt | 1 - clang-tools-extra/docs/ReleaseNotes.rst | 5 ++ .../bugprone/std-namespace-modification.rst | 63 +++++++++++++++++++ .../docs/clang-tidy/checks/cert/dcl58-cpp.rst | 54 +--------------- .../docs/clang-tidy/checks/list.rst | 3 +- .../Inputs/Headers/system-header-simulation.h | 2 +- .../std-namespace-modification.cpp} | 4 +- 12 files changed, 96 insertions(+), 71 deletions(-) rename clang-tools-extra/clang-tidy/{cert/DontModifyStdNamespaceCheck.cpp => bugprone/StdNamespaceModificationCheck.cpp} (95%) rename clang-tools-extra/clang-tidy/{cert/DontModifyStdNamespaceCheck.h => bugprone/StdNamespaceModificationCheck.h} (61%) create mode 100644 clang-tools-extra/docs/clang-tidy/checks/bugprone/std-namespace-modification.rst rename clang-tools-extra/test/clang-tidy/checkers/{cert/dcl58-cpp.cpp => bugprone/std-namespace-modification.cpp} (97%) diff --git a/clang-tools-extra/clang-tidy/bugprone/BugproneTidyModule.cpp b/clang-tools-extra/clang-tidy/bugprone/BugproneTidyModule.cpp index 2e21a4c4fd1f9..611717a64b768 100644 --- a/clang-tools-extra/clang-tidy/bugprone/BugproneTidyModule.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/BugproneTidyModule.cpp @@ -73,6 +73,7 @@ #include "SizeofExpressionCheck.h" #include "SpuriouslyWakeUpFunctionsCheck.h" #include "StandaloneEmptyCheck.h" +#include "StdNamespaceModificationCheck.h" #include "StringConstructorCheck.h" #include "StringIntegerAssignmentCheck.h" #include "StringLiteralWithEmbeddedNulCheck.h" @@ -237,6 +238,8 @@ class BugproneModule : public ClangTidyModule { "bugprone-spuriously-wake-up-functions"); CheckFactories.registerCheck<StandaloneEmptyCheck>( "bugprone-standalone-empty"); + CheckFactories.registerCheck<StdNamespaceModificationCheck>( + "bugprone-std-namespace-modification"); CheckFactories.registerCheck<StringConstructorCheck>( "bugprone-string-constructor"); CheckFactories.registerCheck<StringIntegerAssignmentCheck>( diff --git a/clang-tools-extra/clang-tidy/bugprone/CMakeLists.txt b/clang-tools-extra/clang-tidy/bugprone/CMakeLists.txt index 31a0e6906866a..6c96996458040 100644 --- a/clang-tools-extra/clang-tidy/bugprone/CMakeLists.txt +++ b/clang-tools-extra/clang-tidy/bugprone/CMakeLists.txt @@ -75,6 +75,7 @@ add_clang_library(clangTidyBugproneModule STATIC SmartPtrArrayMismatchCheck.cpp SpuriouslyWakeUpFunctionsCheck.cpp StandaloneEmptyCheck.cpp + StdNamespaceModificationCheck.cpp StringConstructorCheck.cpp StringIntegerAssignmentCheck.cpp StringLiteralWithEmbeddedNulCheck.cpp diff --git a/clang-tools-extra/clang-tidy/cert/DontModifyStdNamespaceCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/StdNamespaceModificationCheck.cpp similarity index 95% rename from clang-tools-extra/clang-tidy/cert/DontModifyStdNamespaceCheck.cpp rename to clang-tools-extra/clang-tidy/bugprone/StdNamespaceModificationCheck.cpp index 79fbc66b5f8a3..13e5c03d7c4d3 100644 --- a/clang-tools-extra/clang-tidy/cert/DontModifyStdNamespaceCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/StdNamespaceModificationCheck.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -#include "DontModifyStdNamespaceCheck.h" +#include "StdNamespaceModificationCheck.h" #include "clang/ASTMatchers/ASTMatchFinder.h" #include "clang/ASTMatchers/ASTMatchersInternal.h" @@ -36,9 +36,9 @@ AST_POLYMORPHIC_MATCHER_P( } // namespace -namespace clang::tidy::cert { +namespace clang::tidy::bugprone { -void 
DontModifyStdNamespaceCheck::registerMatchers(MatchFinder *Finder) { +void StdNamespaceModificationCheck::registerMatchers(MatchFinder *Finder) { auto HasStdParent = hasDeclContext(namespaceDecl(hasAnyName("std", "posix"), unless(hasParent(namespaceDecl()))) @@ -96,7 +96,7 @@ void DontModifyStdNamespaceCheck::registerMatchers(MatchFinder *Finder) { .bind("decl"), this); } -} // namespace clang::tidy::cert +} // namespace clang::tidy::bugprone static const NamespaceDecl *getTopLevelLexicalNamespaceDecl(const Decl *D) { const NamespaceDecl *LastNS = nullptr; @@ -108,7 +108,7 @@ static const NamespaceDecl *getTopLevelLexicalNamespaceDecl(const Decl *D) { return LastNS; } -void clang::tidy::cert::DontModifyStdNamespaceCheck::check( +void clang::tidy::bugprone::StdNamespaceModificationCheck::check( const MatchFinder::MatchResult &Result) { const auto *D = Result.Nodes.getNodeAs<Decl>("decl"); const auto *NS = Result.Nodes.getNodeAs<NamespaceDecl>("nmspc"); diff --git a/clang-tools-extra/clang-tidy/cert/DontModifyStdNamespaceCheck.h b/clang-tools-extra/clang-tidy/bugprone/StdNamespaceModificationCheck.h similarity index 61% rename from clang-tools-extra/clang-tidy/cert/DontModifyStdNamespaceCheck.h rename to clang-tools-extra/clang-tidy/bugprone/StdNamespaceModificationCheck.h index cfcd878644ddb..0f62dc3d9ab70 100644 --- a/clang-tools-extra/clang-tidy/cert/DontModifyStdNamespaceCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/StdNamespaceModificationCheck.h @@ -6,21 +6,21 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CERT_DONT_MODIFY_STD_NAMESPACE_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CERT_DONT_MODIFY_STD_NAMESPACE_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_STDNAMESPACEMODIFICATIONCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_STDNAMESPACEMODIFICATIONCHECK_H #include "../ClangTidyCheck.h" -namespace clang::tidy::cert { +namespace clang::tidy::bugprone { /// Modification of the std or posix namespace can result in undefined behavior. /// This check warns for such modifications. 
/// /// For the user-facing documentation see: -/// https://clang.llvm.org/extra/clang-tidy/checks/cert/dcl58-cpp.html -class DontModifyStdNamespaceCheck : public ClangTidyCheck { +/// https://clang.llvm.org/extra/clang-tidy/checks/bugprone/std-namespace-modification.html +class StdNamespaceModificationCheck : public ClangTidyCheck { public: - DontModifyStdNamespaceCheck(StringRef Name, ClangTidyContext *Context) + StdNamespaceModificationCheck(StringRef Name, ClangTidyContext *Context) : ClangTidyCheck(Name, Context) {} bool isLanguageVersionSupported(const LangOptions &LangOpts) const override { return LangOpts.CPlusPlus; @@ -29,6 +29,6 @@ class DontModifyStdNamespaceCheck : public ClangTidyCheck { void check(const ast_matchers::MatchFinder::MatchResult &Result) override; }; -} // namespace clang::tidy::cert +} // namespace clang::tidy::bugprone -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CERT_DONT_MODIFY_STD_NAMESPACE_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_STDNAMESPACEMODIFICATIONCHECK_H diff --git a/clang-tools-extra/clang-tidy/cert/CERTTidyModule.cpp b/clang-tools-extra/clang-tidy/cert/CERTTidyModule.cpp index d5179770008e5..15592d5c03c06 100644 --- a/clang-tools-extra/clang-tidy/cert/CERTTidyModule.cpp +++ b/clang-tools-extra/clang-tidy/cert/CERTTidyModule.cpp @@ -19,6 +19,7 @@ #include "../bugprone/SignedCharMisuseCheck.h" #include "../bugprone/SizeofExpressionCheck.h" #include "../bugprone/SpuriouslyWakeUpFunctionsCheck.h" +#include "../bugprone/StdNamespaceModificationCheck.h" #include "../bugprone/SuspiciousMemoryComparisonCheck.h" #include "../bugprone/ThrowingStaticInitializationCheck.h" #include "../bugprone/UncheckedStringToNumberConversionCheck.h" @@ -36,7 +37,6 @@ #include "../performance/MoveConstructorInitCheck.h" #include "../readability/EnumInitialValueCheck.h" #include "../readability/UppercaseLiteralSuffixCheck.h" -#include "DontModifyStdNamespaceCheck.h" #include "FloatLoopCounter.h" #include "LimitedRandomnessCheck.h" #include "MutatingCopyCheck.h" @@ -251,7 +251,8 @@ class CERTModule : public ClangTidyModule { "cert-dcl51-cpp"); CheckFactories.registerCheck<misc::NewDeleteOverloadsCheck>( "cert-dcl54-cpp"); - CheckFactories.registerCheck<DontModifyStdNamespaceCheck>("cert-dcl58-cpp"); + CheckFactories.registerCheck<bugprone::StdNamespaceModificationCheck>( + "cert-dcl58-cpp"); CheckFactories.registerCheck<google::build::UnnamedNamespaceInHeaderCheck>( "cert-dcl59-cpp"); // ERR diff --git a/clang-tools-extra/clang-tidy/cert/CMakeLists.txt b/clang-tools-extra/clang-tidy/cert/CMakeLists.txt index db3b2f5a08286..27cf392ce0bb1 100644 --- a/clang-tools-extra/clang-tidy/cert/CMakeLists.txt +++ b/clang-tools-extra/clang-tidy/cert/CMakeLists.txt @@ -5,7 +5,6 @@ set(LLVM_LINK_COMPONENTS add_clang_library(clangTidyCERTModule STATIC CERTTidyModule.cpp - DontModifyStdNamespaceCheck.cpp FloatLoopCounter.cpp LimitedRandomnessCheck.cpp MutatingCopyCheck.cpp diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst index 26f00e9a8a294..88eb0ebff4501 100644 --- a/clang-tools-extra/docs/ReleaseNotes.rst +++ b/clang-tools-extra/docs/ReleaseNotes.rst @@ -244,6 +244,11 @@ New check aliases <clang-tidy/checks/modernize/avoid-variadic-functions>` keeping initial check as an alias to the new one. +- Renamed :doc:`cert-dcl58-cpp <clang-tidy/checks/cert/dcl58-cpp>` to + :doc:`bugprone-std-namespace-modification + <clang-tidy/checks/bugprone/std-namespace-modification>` + keeping initial check as an alias to the new one. 
+ - Renamed :doc:`cert-env33-c <clang-tidy/checks/cert/env33-c>` to :doc:`bugprone-command-processor <clang-tidy/checks/bugprone/command-processor>` diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/std-namespace-modification.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/std-namespace-modification.rst new file mode 100644 index 0000000000000..c6e5608280264 --- /dev/null +++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/std-namespace-modification.rst @@ -0,0 +1,63 @@ +.. title:: clang-tidy - bugprone-std-namespace-modification + +bugprone-std-namespace-modification +=================================== + +Warns on modifications of the ``std`` or ``posix`` namespaces which can +result in undefined behavior. + +The ``std`` (or ``posix``) namespace is allowed to be extended with (class or +function) template specializations that depend on a user-defined type (a type +that is not defined in the standard system headers). + +The check detects the following (user-provided) declarations in namespace ``std`` or ``posix``: + +- Anything that is not a template specialization. +- Explicit specializations of any standard library function template or class template, if it does not have any user-defined type as template argument. +- Explicit specializations of any member function of a standard library class template. +- Explicit specializations of any member function template of a standard library class or class template. +- Explicit or partial specialization of any member class template of a standard library class or class template. + +Examples: + +.. code-block:: c++ + + namespace std { + int x; // warning: modification of 'std' namespace can result in undefined behavior [bugprone-std-namespace-modification] + } + + namespace posix::a { // warning: modification of 'posix' namespace can result in undefined behavior + } + + template <> + struct ::std::hash<long> { // warning: modification of 'std' namespace can result in undefined behavior + unsigned long operator()(const long &K) const { + return K; + } + }; + + struct MyData { long data; }; + + template <> + struct ::std::hash<MyData> { // no warning: specialization with user-defined type + unsigned long operator()(const MyData &K) const { + return K.data; + } + }; + + namespace std { + template <> + void swap<bool>(bool &a, bool &b); // warning: modification of 'std' namespace can result in undefined behavior + + template <> + bool less<void>::operator()<MyData &&, MyData &&>(MyData &&, MyData &&) const { // warning: modification of 'std' namespace can result in undefined behavior + return true; + } + } + +References +---------- + +This check corresponds to the CERT C++ Coding Standard rule +`DCL58-CPP. Do not modify the standard namespaces +<https://www.securecoding.cert.org/confluence/display/cplusplus/DCL58-CPP.+Do+not+modify+the+standard+namespaces>`_. diff --git a/clang-tools-extra/docs/clang-tidy/checks/cert/dcl58-cpp.rst b/clang-tools-extra/docs/clang-tidy/checks/cert/dcl58-cpp.rst index fbcc6281a8898..1b8c2c4f97dde 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/cert/dcl58-cpp.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/cert/dcl58-cpp.rst @@ -3,57 +3,9 @@ cert-dcl58-cpp ============== -Modification of the ``std`` or ``posix`` namespace can result in undefined -behavior. -This check warns for such modifications.
-The ``std`` (or ``posix``) namespace is allowed to be extended with (class or -function) template specializations that depend on an user-defined type (a type -that is not defined in the standard system headers). - -The check detects the following (user provided) declarations in namespace ``std`` or ``posix``: - -- Anything that is not a template specialization. -- Explicit specializations of any standard library function template or class template, if it does not have any user-defined type as template argument. -- Explicit specializations of any member function of a standard library class template. -- Explicit specializations of any member function template of a standard library class or class template. -- Explicit or partial specialization of any member class template of a standard library class or class template. - -Examples: - -.. code-block:: c++ - - namespace std { - int x; // warning: modification of 'std' namespace can result in undefined behavior [cert-dcl58-cpp] - } - - namespace posix::a { // warning: modification of 'posix' namespace can result in undefined behavior - } - - template <> - struct ::std::hash<long> { // warning: modification of 'std' namespace can result in undefined behavior - unsigned long operator()(const long &K) const { - return K; - } - }; - - struct MyData { long data; }; - - template <> - struct ::std::hash<MyData> { // no warning: specialization with user-defined type - unsigned long operator()(const MyData &K) const { - return K.data; - } - }; - - namespace std { - template <> - void swap<bool>(bool &a, bool &b); // warning: modification of 'std' namespace can result in undefined behavior - - template <> - bool less<void>::operator()<MyData &&, MyData &&>(MyData &&, MyData &&) const { // warning: modification of 'std' namespace can result in undefined behavior - return true; - } - } +The `cert-dcl58-cpp` check is an alias, please see +`bugprone-std-namespace-modification <../bugprone/std-namespace-modification.html>`_ +for more information. This check corresponds to the CERT C++ Coding Standard rule `DCL58-CPP.
Do not modify the standard namespaces diff --git a/clang-tools-extra/docs/clang-tidy/checks/list.rst b/clang-tools-extra/docs/clang-tidy/checks/list.rst index e14ac715cfeeb..5094bcc90caf3 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/list.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/list.rst @@ -141,6 +141,7 @@ Clang-Tidy Checks :doc:`bugprone-sizeof-expression <bugprone/sizeof-expression>`, :doc:`bugprone-spuriously-wake-up-functions <bugprone/spuriously-wake-up-functions>`, :doc:`bugprone-standalone-empty <bugprone/standalone-empty>`, "Yes" + :doc:`bugprone-std-namespace-modification <bugprone/std-namespace-modification>`, :doc:`bugprone-string-constructor <bugprone/string-constructor>`, "Yes" :doc:`bugprone-string-integer-assignment <bugprone/string-integer-assignment>`, "Yes" :doc:`bugprone-string-literal-with-embedded-nul <bugprone/string-literal-with-embedded-nul>`, @@ -175,7 +176,6 @@ Clang-Tidy Checks :doc:`bugprone-unused-return-value <bugprone/unused-return-value>`, :doc:`bugprone-use-after-move <bugprone/use-after-move>`, :doc:`bugprone-virtual-near-miss <bugprone/virtual-near-miss>`, "Yes" - :doc:`cert-dcl58-cpp <cert/dcl58-cpp>`, :doc:`cert-err33-c <cert/err33-c>`, :doc:`cert-err60-cpp <cert/err60-cpp>`, :doc:`cert-flp30-c <cert/flp30-c>`, @@ -441,6 +441,7 @@ Check aliases :doc:`cert-dcl50-cpp <cert/dcl50-cpp>`, :doc:`modernize-avoid-variadic-functions <modernize/avoid-variadic-functions>`, :doc:`cert-dcl51-cpp <cert/dcl51-cpp>`, :doc:`bugprone-reserved-identifier <bugprone/reserved-identifier>`, "Yes" :doc:`cert-dcl54-cpp <cert/dcl54-cpp>`, :doc:`misc-new-delete-overloads <misc/new-delete-overloads>`, + :doc:`cert-dcl58-cpp <cert/dcl58-cpp>`, :doc:`bugprone-std-namespace-modification <bugprone/std-namespace-modification>`, :doc:`cert-dcl59-cpp <cert/dcl59-cpp>`, :doc:`google-build-namespaces <google/build-namespaces>`, :doc:`cert-env33-c <cert/env33-c>`, :doc:`bugprone-command-processor <bugprone/command-processor>`, :doc:`cert-err09-cpp <cert/err09-cpp>`, :doc:`misc-throw-by-value-catch-by-reference <misc/throw-by-value-catch-by-reference>`, diff --git a/clang-tools-extra/test/clang-tidy/checkers/Inputs/Headers/system-header-simulation.h b/clang-tools-extra/test/clang-tidy/checkers/Inputs/Headers/system-header-simulation.h index b6977cd9ce6c6..0870f60eaa39b 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/Inputs/Headers/system-header-simulation.h +++ b/clang-tools-extra/test/clang-tidy/checkers/Inputs/Headers/system-header-simulation.h @@ -59,7 +59,7 @@ struct X {}; } // namespace std // Template specializations that are in a system-header file. -// The purpose is to test cert-dcl58-cpp (no warnings here). +// The purpose is to test bugprone-std-namespace-modification (no warnings here). 
namespace std { template <> void swap<short>(short &, short &){}; diff --git a/clang-tools-extra/test/clang-tidy/checkers/cert/dcl58-cpp.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone/std-namespace-modification.cpp similarity index 97% rename from clang-tools-extra/test/clang-tidy/checkers/cert/dcl58-cpp.cpp rename to clang-tools-extra/test/clang-tidy/checkers/bugprone/std-namespace-modification.cpp index 01964e7dc6c76..32bcbcaa21c0d 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/cert/dcl58-cpp.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/std-namespace-modification.cpp @@ -1,4 +1,4 @@ -// RUN: %check_clang_tidy -std=c++17-or-later %s cert-dcl58-cpp %t -- -- -I %clang_tidy_headers +// RUN: %check_clang_tidy -std=c++17-or-later %s bugprone-std-namespace-modification %t -- -- -I %clang_tidy_headers #include "system-header-simulation.h" @@ -15,7 +15,7 @@ namespace A { } namespace posix { -// CHECK-MESSAGES: :[[@LINE+2]]:11: warning: modification of 'posix' namespace can result in undefined behavior [cert-dcl58-cpp] +// CHECK-MESSAGES: :[[@LINE+2]]:11: warning: modification of 'posix' namespace can result in undefined behavior [bugprone-std-namespace-modification] // CHECK-MESSAGES: :[[@LINE-2]]:11: note: 'posix' namespace opened here namespace foo { int foobar; From 5b5d0a831967ea49a3b4642923308d08fb69b06c Mon Sep 17 00:00:00 2001 From: Nikolas Klauser <nikolasklauser@berlin.de> Date: Wed, 5 Nov 2025 10:00:47 +0100 Subject: [PATCH 288/313] [libc++][NFC] Make __type_info_implementations a namespace (#166339) There doesn't seem to be much of a reason why this should be a struct. Make it a namespace instead. --- libcxx/include/typeinfo | 160 ++++++++++++++++++++-------------------- 1 file changed, 80 insertions(+), 80 deletions(-) diff --git a/libcxx/include/typeinfo b/libcxx/include/typeinfo index 24aaabf0a87df..f608c94d3031e 100644 --- a/libcxx/include/typeinfo +++ b/libcxx/include/typeinfo @@ -186,99 +186,99 @@ public: # endif # endif -struct __type_info_implementations { - struct __string_impl_base { - typedef const char* __type_name_t; - _LIBCPP_HIDE_FROM_ABI _LIBCPP_ALWAYS_INLINE _LIBCPP_CONSTEXPR static const char* - __type_name_to_string(__type_name_t __v) _NOEXCEPT { - return __v; - } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_ALWAYS_INLINE _LIBCPP_CONSTEXPR static __type_name_t - __string_to_type_name(const char* __v) _NOEXCEPT { - return __v; - } - }; +namespace __type_info_implementations { +struct __string_impl_base { + typedef const char* __type_name_t; + _LIBCPP_HIDE_FROM_ABI _LIBCPP_ALWAYS_INLINE _LIBCPP_CONSTEXPR static const char* + __type_name_to_string(__type_name_t __v) _NOEXCEPT { + return __v; + } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_ALWAYS_INLINE _LIBCPP_CONSTEXPR static __type_name_t + __string_to_type_name(const char* __v) _NOEXCEPT { + return __v; + } +}; - struct __unique_impl : __string_impl_base { - _LIBCPP_HIDE_FROM_ABI _LIBCPP_ALWAYS_INLINE static size_t __hash(__type_name_t __v) _NOEXCEPT { - return reinterpret_cast<size_t>(__v); - } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_ALWAYS_INLINE static bool __eq(__type_name_t __lhs, __type_name_t __rhs) _NOEXCEPT { - return __lhs == __rhs; - } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_ALWAYS_INLINE static bool __lt(__type_name_t __lhs, __type_name_t __rhs) _NOEXCEPT { - return __lhs < __rhs; - } - }; - - struct __non_unique_impl : __string_impl_base { - _LIBCPP_HIDE_FROM_ABI _LIBCPP_ALWAYS_INLINE static size_t __hash(__type_name_t __ptr) _NOEXCEPT { - size_t __hash = 5381; - while (unsigned char __c = static_cast<unsigned
char>(*__ptr++)) - __hash = (__hash * 33) ^ __c; - return __hash; - } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_ALWAYS_INLINE static bool __eq(__type_name_t __lhs, __type_name_t __rhs) _NOEXCEPT { - return __lhs == __rhs || __builtin_strcmp(__lhs, __rhs) == 0; - } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_ALWAYS_INLINE static bool __lt(__type_name_t __lhs, __type_name_t __rhs) _NOEXCEPT { - return __builtin_strcmp(__lhs, __rhs) < 0; - } - }; +struct __unique_impl : __string_impl_base { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_ALWAYS_INLINE static size_t __hash(__type_name_t __v) _NOEXCEPT { + return reinterpret_cast<size_t>(__v); + } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_ALWAYS_INLINE static bool __eq(__type_name_t __lhs, __type_name_t __rhs) _NOEXCEPT { + return __lhs == __rhs; + } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_ALWAYS_INLINE static bool __lt(__type_name_t __lhs, __type_name_t __rhs) _NOEXCEPT { + return __lhs < __rhs; + } +}; + +struct __non_unique_impl : __string_impl_base { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_ALWAYS_INLINE static size_t __hash(__type_name_t __ptr) _NOEXCEPT { + size_t __hash = 5381; + while (unsigned char __c = static_cast<unsigned char>(*__ptr++)) + __hash = (__hash * 33) ^ __c; + return __hash; + } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_ALWAYS_INLINE static bool __eq(__type_name_t __lhs, __type_name_t __rhs) _NOEXCEPT { + return __lhs == __rhs || __builtin_strcmp(__lhs, __rhs) == 0; + } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_ALWAYS_INLINE static bool __lt(__type_name_t __lhs, __type_name_t __rhs) _NOEXCEPT { + return __builtin_strcmp(__lhs, __rhs) < 0; + } +}; - struct __non_unique_arm_rtti_bit_impl { - typedef uintptr_t __type_name_t; +struct __non_unique_arm_rtti_bit_impl { + typedef uintptr_t __type_name_t; - _LIBCPP_HIDE_FROM_ABI _LIBCPP_ALWAYS_INLINE static const char* __type_name_to_string(__type_name_t __v) _NOEXCEPT { - return reinterpret_cast<const char*>(__v & ~__non_unique_rtti_bit::value); - } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_ALWAYS_INLINE static __type_name_t __string_to_type_name(const char* __v) _NOEXCEPT { - return reinterpret_cast<__type_name_t>(__v); - } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_ALWAYS_INLINE static const char* __type_name_to_string(__type_name_t __v) _NOEXCEPT { + return reinterpret_cast<const char*>(__v & ~__non_unique_rtti_bit::value); + } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_ALWAYS_INLINE static __type_name_t __string_to_type_name(const char* __v) _NOEXCEPT { + return reinterpret_cast<__type_name_t>(__v); + } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_ALWAYS_INLINE static size_t __hash(__type_name_t __v) _NOEXCEPT { - if (__is_type_name_unique(__v)) - return __v; - return __non_unique_impl::__hash(__type_name_to_string(__v)); - } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_ALWAYS_INLINE static bool __eq(__type_name_t __lhs, __type_name_t __rhs) _NOEXCEPT { - if (__lhs == __rhs) - return true; - if (__is_type_name_unique(__lhs) || __is_type_name_unique(__rhs)) - // Either both are unique and have a different address, or one of them - // is unique and the other one isn't. In both cases they are unequal. 
- return false; - return __builtin_strcmp(__type_name_to_string(__lhs), __type_name_to_string(__rhs)) == 0; - } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_ALWAYS_INLINE static bool __lt(__type_name_t __lhs, __type_name_t __rhs) _NOEXCEPT { - if (__is_type_name_unique(__lhs) || __is_type_name_unique(__rhs)) - return __lhs < __rhs; - return __builtin_strcmp(__type_name_to_string(__lhs), __type_name_to_string(__rhs)) < 0; - } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_ALWAYS_INLINE static size_t __hash(__type_name_t __v) _NOEXCEPT { + if (__is_type_name_unique(__v)) + return __v; + return __non_unique_impl::__hash(__type_name_to_string(__v)); + } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_ALWAYS_INLINE static bool __eq(__type_name_t __lhs, __type_name_t __rhs) _NOEXCEPT { + if (__lhs == __rhs) + return true; + if (__is_type_name_unique(__lhs) || __is_type_name_unique(__rhs)) + // Either both are unique and have a different address, or one of them + // is unique and the other one isn't. In both cases they are unequal. + return false; + return __builtin_strcmp(__type_name_to_string(__lhs), __type_name_to_string(__rhs)) == 0; + } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_ALWAYS_INLINE static bool __lt(__type_name_t __lhs, __type_name_t __rhs) _NOEXCEPT { + if (__is_type_name_unique(__lhs) || __is_type_name_unique(__rhs)) + return __lhs < __rhs; + return __builtin_strcmp(__type_name_to_string(__lhs), __type_name_to_string(__rhs)) < 0; + } - private: - // The unique bit is the top bit. It is expected that __type_name_t is 64 bits when - // this implementation is actually used. - typedef integral_constant<__type_name_t, (1ULL << ((__CHAR_BIT__ * sizeof(__type_name_t)) - 1))> - __non_unique_rtti_bit; +private: + // The unique bit is the top bit. It is expected that __type_name_t is 64 bits when + // this implementation is actually used. + typedef integral_constant<__type_name_t, (1ULL << ((__CHAR_BIT__ * sizeof(__type_name_t)) - 1))> + __non_unique_rtti_bit; - _LIBCPP_HIDE_FROM_ABI static bool __is_type_name_unique(__type_name_t __lhs) _NOEXCEPT { - return !(__lhs & __non_unique_rtti_bit::value); - } - }; + _LIBCPP_HIDE_FROM_ABI static bool __is_type_name_unique(__type_name_t __lhs) _NOEXCEPT { + return !(__lhs & __non_unique_rtti_bit::value); + } +}; - typedef +typedef # if _LIBCPP_TYPEINFO_COMPARISON_IMPLEMENTATION == 1 - __unique_impl + __unique_impl # elif _LIBCPP_TYPEINFO_COMPARISON_IMPLEMENTATION == 2 - __non_unique_impl + __non_unique_impl # elif _LIBCPP_TYPEINFO_COMPARISON_IMPLEMENTATION == 3 - __non_unique_arm_rtti_bit_impl + __non_unique_arm_rtti_bit_impl # else # error invalid configuration for _LIBCPP_TYPEINFO_COMPARISON_IMPLEMENTATION # endif - __impl; -}; + __impl; +} // namespace __type_info_implementations # if __has_cpp_attribute(_Clang::__ptrauth_vtable_pointer__) # if __has_feature(ptrauth_type_info_vtable_pointer_discrimination) From 628d53aba53204b8b7eac69b200b04bc4433deac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=A1bor=20Spaits?= <gaborspaits1@gmail.com> Date: Wed, 5 Nov 2025 10:04:32 +0100 Subject: [PATCH 289/313] [InstCombine] Enable FoldOpIntoSelect and foldOpIntoPhi when the Op's other parameter is non-const (#166102) This patch enables `FoldOpIntoSelect` and `foldOpIntoPhi` for cases where Op's second operand is a non-constant. It doesn't seem to bring significant improvements, but the compile-time impact is negligible.
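For illustration, a minimal before/after sketch based on the @basic function from the newly added or-select-zero-icmp.ll test (the exact result for other patterns depends on the surrounding combines):

  define i32 @basic(i32 %a, i32 %b) {
    %cmp = icmp eq i32 %a, 0
    %sel = select i1 %cmp, i32 %b, i32 0  ; select feeding a binop whose other operand %a is non-constant
    %or = or i32 %sel, %a
    ret i32 %or
  }

With this change, instcombine folds the `or` into the select arms (in the true arm %a is known to be 0, so `or i32 %b, %a` simplifies to `%b`; in the false arm `or i32 0, %a` simplifies to `%a`), yielding:

  define i32 @basic(i32 %a, i32 %b) {
    %cmp = icmp eq i32 %a, 0
    %res = select i1 %cmp, i32 %b, i32 %a
    ret i32 %res
  }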
--- .../InstCombine/InstructionCombining.cpp | 6 +- .../InstCombine/binop-phi-operands.ll | 5 +- .../Transforms/InstCombine/binop-select.ll | 187 +++++++++++++++++- .../InstCombine/dont-distribute-phi.ll | 10 +- llvm/test/Transforms/InstCombine/fmul.ll | 4 +- .../Transforms/InstCombine/free-inversion.ll | 4 +- .../InstCombine/or-select-zero-icmp.ll | 169 ++++++++++++++++ .../test/Transforms/InstCombine/recurrence.ll | 4 +- llvm/test/Transforms/InstCombine/sub-gep.ll | 3 +- .../AArch64/predicated-reduction.ll | 105 +++++----- 10 files changed, 424 insertions(+), 73 deletions(-) create mode 100644 llvm/test/Transforms/InstCombine/or-select-zero-icmp.ll diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp index 67f837c7ed968..b158e0f626850 100644 --- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -2261,11 +2261,11 @@ Instruction *InstCombinerImpl::foldBinopWithPhiOperands(BinaryOperator &BO) { } Instruction *InstCombinerImpl::foldBinOpIntoSelectOrPhi(BinaryOperator &I) { - if (!isa<Constant>(I.getOperand(1))) - return nullptr; + bool IsOtherParamConst = isa<Constant>(I.getOperand(1)); if (auto *Sel = dyn_cast<SelectInst>(I.getOperand(0))) { - if (Instruction *NewSel = FoldOpIntoSelect(I, Sel)) + if (Instruction *NewSel = + FoldOpIntoSelect(I, Sel, false, !IsOtherParamConst)) return NewSel; } else if (auto *PN = dyn_cast<PHINode>(I.getOperand(0))) { if (Instruction *NewPhi = foldOpIntoPhi(I, PN)) diff --git a/llvm/test/Transforms/InstCombine/binop-phi-operands.ll b/llvm/test/Transforms/InstCombine/binop-phi-operands.ll index 9e049837b0352..f0d4ad74fbe05 100644 --- a/llvm/test/Transforms/InstCombine/binop-phi-operands.ll +++ b/llvm/test/Transforms/InstCombine/binop-phi-operands.ll @@ -653,12 +653,11 @@ define i8 @mul_const_incoming0_speculatable(i1 %b, i8 %x, i8 %y) { ; CHECK-NEXT: entry: ; CHECK-NEXT: br i1 [[B:%.*]], label [[IF:%.*]], label [[THEN:%.*]] ; CHECK: if: +; CHECK-NEXT: [[TMP0:%.*]] = mul i8 [[X:%.*]], [[Y:%.*]] ; CHECK-NEXT: br label [[THEN]] ; CHECK: then: -; CHECK-NEXT: [[P0:%.*]] = phi i8 [ 42, [[ENTRY:%.*]] ], [ [[X:%.*]], [[IF]] ] -; CHECK-NEXT: [[P1:%.*]] = phi i8 [ 17, [[ENTRY]] ], [ [[Y:%.*]], [[IF]] ] +; CHECK-NEXT: [[R:%.*]] = phi i8 [ -54, [[ENTRY:%.*]] ], [ [[TMP0]], [[IF]] ] ; CHECK-NEXT: call void @sideeffect() -; CHECK-NEXT: [[R:%.*]] = mul i8 [[P0]], [[P1]] ; CHECK-NEXT: ret i8 [[R]] ; entry: diff --git a/llvm/test/Transforms/InstCombine/binop-select.ll b/llvm/test/Transforms/InstCombine/binop-select.ll index 25f624ee13412..fe1ec9014f188 100644 --- a/llvm/test/Transforms/InstCombine/binop-select.ll +++ b/llvm/test/Transforms/InstCombine/binop-select.ll @@ -335,7 +335,7 @@ define i32 @sub_sel_op1_use(i1 %b) { define float @fadd_sel_op0(i1 %b, float %x) { ; CHECK-LABEL: @fadd_sel_op0( -; CHECK-NEXT: [[R:%.*]] = select nnan i1 [[B:%.*]], float 0xFFF0000000000000, float 0x7FF0000000000000 +; CHECK-NEXT: [[R:%.*]] = select i1 [[B:%.*]], float 0xFFF0000000000000, float 0x7FF0000000000000 ; CHECK-NEXT: ret float [[R]] ; %s = select i1 %b, float 0xFFF0000000000000, float 0x7FF0000000000000 @@ -403,3 +403,188 @@ define i32 @ashr_sel_op1_use(i1 %b) { %r = ashr i32 -2, %s ret i32 %r } + +define i8 @commonArgWithOr0(i1 %arg0) { +; CHECK-LABEL: @commonArgWithOr0( +; CHECK-NEXT: [[V0:%.*]] = zext i1 [[ARG0:%.*]] to i8 +; CHECK-NEXT: [[V1:%.*]] = select i1 [[ARG0]], i8 0, i8 8 +; CHECK-NEXT: [[V2:%.*]] = or disjoint i8 [[V1]], 
[[V0]] +; CHECK-NEXT: [[V3:%.*]] = or disjoint i8 [[V2]], 16 +; CHECK-NEXT: ret i8 [[V3]] +; + %v0 = zext i1 %arg0 to i8 + %v1 = select i1 %arg0, i8 0, i8 8 + %v2 = or i8 %v1, %v0 + %v3 = or i8 %v2, 16 + ret i8 %v3 +} + +define i8 @commonArgWithOr1(i1 %arg0) { +; CHECK-LABEL: @commonArgWithOr1( +; CHECK-NEXT: [[V3:%.*]] = select i1 [[ARG0:%.*]], i8 17, i8 23 +; CHECK-NEXT: ret i8 [[V3]] +; + %v0 = zext i1 %arg0 to i8 + %v1 = select i1 %arg0, i8 1, i8 7 + %v2 = or i8 %v1, %v0 + %v3 = or i8 %v2, 16 + ret i8 %v3 +} + +define i8 @commonArgWithOr2(i1 %arg0) { +; CHECK-LABEL: @commonArgWithOr2( +; CHECK-NEXT: [[V0:%.*]] = zext i1 [[ARG0:%.*]] to i8 +; CHECK-NEXT: [[V1:%.*]] = select i1 [[ARG0]], i8 5, i8 42 +; CHECK-NEXT: [[V2:%.*]] = or i8 [[V1]], [[V0]] +; CHECK-NEXT: [[V3:%.*]] = or disjoint i8 [[V2]], 16 +; CHECK-NEXT: ret i8 [[V3]] +; + %v0 = zext i1 %arg0 to i8 + %v1 = select i1 %arg0, i8 21, i8 42 + %v2 = or i8 %v1, %v0 + %v3 = or i8 %v2, 16 + ret i8 %v3 +} + +define i8 @commonArgWithAnd0(i1 %arg0) { +; CHECK-LABEL: @commonArgWithAnd0( +; CHECK-NEXT: ret i8 16 +; + %v0 = zext i1 %arg0 to i8 + %v1 = select i1 %arg0, i8 0, i8 8 + %v2 = and i8 %v1, %v0 + %v3 = or i8 %v2, 16 + ret i8 %v3 +} + +define i8 @commonArgWithAnd1(i1 %arg0) { +; CHECK-LABEL: @commonArgWithAnd1( +; CHECK-NEXT: ret i8 16 +; + %v0 = zext i1 %arg0 to i8 + %v1 = select i1 %arg0, i8 8, i8 1 + %v2 = and i8 %v1, %v0 + %v3 = or i8 %v2, 16 + ret i8 %v3 +} + +define i8 @commonArgWithAnd2(i1 %arg0) { +; CHECK-LABEL: @commonArgWithAnd2( +; CHECK-NEXT: [[V2:%.*]] = zext i1 [[ARG0:%.*]] to i8 +; CHECK-NEXT: [[V3:%.*]] = or disjoint i8 [[V2]], 16 +; CHECK-NEXT: ret i8 [[V3]] +; + %v0 = zext i1 %arg0 to i8 + %v1 = select i1 %arg0, i8 1, i8 7 + %v2 = and i8 %v1, %v0 + %v3 = or i8 %v2, 16 + ret i8 %v3 +} + +define i8 @commonArgWithAnd3(i1 %arg0) { +; CHECK-LABEL: @commonArgWithAnd3( +; CHECK-NEXT: [[V2:%.*]] = zext i1 [[ARG0:%.*]] to i8 +; CHECK-NEXT: [[V3:%.*]] = or disjoint i8 [[V2]], 16 +; CHECK-NEXT: ret i8 [[V3]] +; + %v0 = zext i1 %arg0 to i8 + %v1 = select i1 %arg0, i8 21, i8 42 + %v2 = and i8 %v1, %v0 + %v3 = or i8 %v2, 16 + ret i8 %v3 +} + +define i8 @commonArgWithXor0(i1 %arg0) { +; CHECK-LABEL: @commonArgWithXor0( +; CHECK-NEXT: [[V0:%.*]] = zext i1 [[ARG0:%.*]] to i8 +; CHECK-NEXT: [[V1:%.*]] = select i1 [[ARG0]], i8 0, i8 8 +; CHECK-NEXT: [[V2:%.*]] = or disjoint i8 [[V1]], [[V0]] +; CHECK-NEXT: [[V3:%.*]] = or disjoint i8 [[V2]], 16 +; CHECK-NEXT: ret i8 [[V3]] +; + %v0 = zext i1 %arg0 to i8 + %v1 = select i1 %arg0, i8 0, i8 8 + %v2 = xor i8 %v1, %v0 + %v3 = or i8 %v2, 16 + ret i8 %v3 +} + +define i8 @commonArgWithXor1(i1 %arg0) { +; CHECK-LABEL: @commonArgWithXor1( +; CHECK-NEXT: [[V0:%.*]] = zext i1 [[ARG0:%.*]] to i8 +; CHECK-NEXT: [[V1:%.*]] = select i1 [[ARG0]], i8 9, i8 1 +; CHECK-NEXT: [[V2:%.*]] = xor i8 [[V1]], [[V0]] +; CHECK-NEXT: ret i8 [[V2]] +; + %v0 = zext i1 %arg0 to i8 + %v1 = select i1 %arg0, i8 9, i8 1 + %v2 = xor i8 %v1, %v0 + ret i8 %v2 +} + +define i8 @commonArgWithXor2(i1 %arg0) { +; CHECK-LABEL: @commonArgWithXor2( +; CHECK-NEXT: [[V0:%.*]] = zext i1 [[ARG0:%.*]] to i8 +; CHECK-NEXT: [[V1:%.*]] = select i1 [[ARG0]], i8 1, i8 7 +; CHECK-NEXT: [[V2:%.*]] = xor i8 [[V1]], [[V0]] +; CHECK-NEXT: [[V3:%.*]] = or disjoint i8 [[V2]], 16 +; CHECK-NEXT: ret i8 [[V3]] +; + %v0 = zext i1 %arg0 to i8 + %v1 = select i1 %arg0, i8 1, i8 7 + %v2 = xor i8 %v1, %v0 + %v3 = or i8 %v2, 16 + ret i8 %v3 +} + +define i8 @commonArgWithXor3(i1 %arg0) { +; CHECK-LABEL: @commonArgWithXor3( +; CHECK-NEXT: [[V0:%.*]] = zext i1 
[[ARG0:%.*]] to i8 +; CHECK-NEXT: [[V1:%.*]] = select i1 [[ARG0]], i8 5, i8 45 +; CHECK-NEXT: [[V2:%.*]] = xor i8 [[V1]], [[V0]] +; CHECK-NEXT: [[V3:%.*]] = or disjoint i8 [[V2]], 16 +; CHECK-NEXT: ret i8 [[V3]] +; + %v0 = zext i1 %arg0 to i8 + %v1 = select i1 %arg0, i8 21, i8 45 + %v2 = xor i8 %v1, %v0 + %v3 = or i8 %v2, 16 + ret i8 %v3 +} + +define i8 @commonArgWithAdd0(i1 %arg0) { +; CHECK-LABEL: @commonArgWithAdd0( +; CHECK-NEXT: [[V3:%.*]] = select i1 [[ARG0:%.*]], i8 22, i8 61 +; CHECK-NEXT: ret i8 [[V3]] +; + %v0 = zext i1 %arg0 to i8 + %v1 = select i1 %arg0, i8 21, i8 45 + %v2 = add i8 %v1, %v0 + %v3 = or i8 %v2, 16 + ret i8 %v3 +} + +define i32 @OrSelectIcmpZero(i32 %a, i32 %b) { +; CHECK-LABEL: @OrSelectIcmpZero( +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[A:%.*]], 0 +; CHECK-NEXT: [[OR:%.*]] = select i1 [[CMP]], i32 [[B:%.*]], i32 [[A]] +; CHECK-NEXT: ret i32 [[OR]] +; + %cmp = icmp eq i32 %a, 0 + %sel = select i1 %cmp, i32 %b, i32 0 + %or = or i32 %sel, %a + ret i32 %or +} + +define i32 @OrSelectIcmpNonZero(i32 %a, i32 %b) { +; CHECK-LABEL: @OrSelectIcmpNonZero( +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[A:%.*]], 0 +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[CMP]], i32 [[B:%.*]], i32 42 +; CHECK-NEXT: [[OR:%.*]] = or i32 [[SEL]], [[A]] +; CHECK-NEXT: ret i32 [[OR]] +; + %cmp = icmp eq i32 %a, 0 + %sel = select i1 %cmp, i32 %b, i32 42 + %or = or i32 %sel, %a + ret i32 %or +} diff --git a/llvm/test/Transforms/InstCombine/dont-distribute-phi.ll b/llvm/test/Transforms/InstCombine/dont-distribute-phi.ll index 45e47d8e781be..5e90d4b8d4419 100644 --- a/llvm/test/Transforms/InstCombine/dont-distribute-phi.ll +++ b/llvm/test/Transforms/InstCombine/dont-distribute-phi.ll @@ -7,7 +7,7 @@ define zeroext i1 @foo(i32 %arg) { ; CHECK-LABEL: @foo( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[CMP1:%.*]] = icmp ne i32 [[ARG:%.*]], 37 +; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i32 [[ARG:%.*]], 37 ; CHECK-NEXT: br i1 [[CMP1]], label [[BB_ELSE:%.*]], label [[BB_THEN:%.*]] ; CHECK: bb_then: ; CHECK-NEXT: call void @bar() @@ -16,8 +16,7 @@ define zeroext i1 @foo(i32 %arg) { ; CHECK-NEXT: [[CMP2:%.*]] = icmp slt i32 [[ARG]], 17 ; CHECK-NEXT: br label [[BB_EXIT]] ; CHECK: bb_exit: -; CHECK-NEXT: [[PHI1:%.*]] = phi i1 [ [[CMP2]], [[BB_ELSE]] ], [ undef, [[BB_THEN]] ] -; CHECK-NEXT: [[AND1:%.*]] = and i1 [[PHI1]], [[CMP1]] +; CHECK-NEXT: [[AND1:%.*]] = phi i1 [ [[CMP2]], [[BB_THEN]] ], [ false, [[BB_ELSE]] ] ; CHECK-NEXT: ret i1 [[AND1]] ; @@ -43,7 +42,7 @@ bb_exit: define zeroext i1 @foo_logical(i32 %arg) { ; CHECK-LABEL: @foo_logical( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[CMP1:%.*]] = icmp ne i32 [[ARG:%.*]], 37 +; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i32 [[ARG:%.*]], 37 ; CHECK-NEXT: br i1 [[CMP1]], label [[BB_ELSE:%.*]], label [[BB_THEN:%.*]] ; CHECK: bb_then: ; CHECK-NEXT: call void @bar() @@ -52,8 +51,7 @@ define zeroext i1 @foo_logical(i32 %arg) { ; CHECK-NEXT: [[CMP2:%.*]] = icmp slt i32 [[ARG]], 17 ; CHECK-NEXT: br label [[BB_EXIT]] ; CHECK: bb_exit: -; CHECK-NEXT: [[PHI1:%.*]] = phi i1 [ [[CMP2]], [[BB_ELSE]] ], [ undef, [[BB_THEN]] ] -; CHECK-NEXT: [[AND1:%.*]] = and i1 [[PHI1]], [[CMP1]] +; CHECK-NEXT: [[AND1:%.*]] = phi i1 [ [[CMP2]], [[BB_THEN]] ], [ false, [[BB_ELSE]] ] ; CHECK-NEXT: ret i1 [[AND1]] ; diff --git a/llvm/test/Transforms/InstCombine/fmul.ll b/llvm/test/Transforms/InstCombine/fmul.ll index cd4a8e36c6e23..3cbf7090a13b8 100644 --- a/llvm/test/Transforms/InstCombine/fmul.ll +++ b/llvm/test/Transforms/InstCombine/fmul.ll @@ -1222,7 +1222,7 @@ define <2 x double> @negate_if_true_wrong_constant(<2 x 
double> %px, i1 %cond) { ; X *fast (C ? 1.0 : 0.0) -> C ? X : 0.0 define float @fmul_select(float %x, i1 %c) { ; CHECK-LABEL: @fmul_select( -; CHECK-NEXT: [[MUL:%.*]] = select fast i1 [[C:%.*]], float [[X:%.*]], float 0.000000e+00 +; CHECK-NEXT: [[MUL:%.*]] = select i1 [[C:%.*]], float [[X:%.*]], float 0.000000e+00 ; CHECK-NEXT: ret float [[MUL]] ; %sel = select i1 %c, float 1.0, float 0.0 @@ -1233,7 +1233,7 @@ define float @fmul_select(float %x, i1 %c) { ; X *fast (C ? 1.0 : 0.0) -> C ? X : 0.0 define <2 x float> @fmul_select_vec(<2 x float> %x, i1 %c) { ; CHECK-LABEL: @fmul_select_vec( -; CHECK-NEXT: [[MUL:%.*]] = select fast i1 [[C:%.*]], <2 x float> [[X:%.*]], <2 x float> zeroinitializer +; CHECK-NEXT: [[MUL:%.*]] = select i1 [[C:%.*]], <2 x float> [[X:%.*]], <2 x float> zeroinitializer ; CHECK-NEXT: ret <2 x float> [[MUL]] ; %sel = select i1 %c, <2 x float> <float 1.0, float 1.0>, <2 x float> zeroinitializer diff --git a/llvm/test/Transforms/InstCombine/free-inversion.ll b/llvm/test/Transforms/InstCombine/free-inversion.ll index 4b69a5e77b4ce..2e8e75c3ab3ef 100644 --- a/llvm/test/Transforms/InstCombine/free-inversion.ll +++ b/llvm/test/Transforms/InstCombine/free-inversion.ll @@ -563,10 +563,10 @@ define i1 @test_inv_free(i1 %c1, i1 %c2, i1 %c3, i1 %c4) { ; CHECK: b2: ; CHECK-NEXT: br label [[EXIT]] ; CHECK: b3: +; CHECK-NEXT: [[TMP0:%.*]] = and i1 [[C3:%.*]], [[C4:%.*]] ; CHECK-NEXT: br label [[EXIT]] ; CHECK: exit: -; CHECK-NEXT: [[VAL_NOT:%.*]] = phi i1 [ false, [[B1]] ], [ true, [[B2]] ], [ [[C3:%.*]], [[B3]] ] -; CHECK-NEXT: [[COND_NOT:%.*]] = and i1 [[VAL_NOT]], [[C4:%.*]] +; CHECK-NEXT: [[COND_NOT:%.*]] = phi i1 [ false, [[B1]] ], [ [[C4]], [[B2]] ], [ [[TMP0]], [[B3]] ] ; CHECK-NEXT: br i1 [[COND_NOT]], label [[B5:%.*]], label [[B4:%.*]] ; CHECK: b4: ; CHECK-NEXT: ret i1 true diff --git a/llvm/test/Transforms/InstCombine/or-select-zero-icmp.ll b/llvm/test/Transforms/InstCombine/or-select-zero-icmp.ll new file mode 100644 index 0000000000000..a3b21ccc63e94 --- /dev/null +++ b/llvm/test/Transforms/InstCombine/or-select-zero-icmp.ll @@ -0,0 +1,169 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -passes=instcombine -S | FileCheck %s + +; Basic functional test +define i32 @basic(i32 %a, i32 %b) { +; CHECK-LABEL: @basic( +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[A:%.*]], 0 +; CHECK-NEXT: [[RES:%.*]] = select i1 [[CMP]], i32 [[B:%.*]], i32 [[A]] +; CHECK-NEXT: ret i32 [[RES]] +; + %cmp = icmp eq i32 %a, 0 + %sel = select i1 %cmp, i32 %b, i32 0 + %or = or i32 %sel, %a + ret i32 %or +} + +; Operand order swap test +define i32 @swap_operand_order(i32 %x, i32 %y) { +; CHECK-LABEL: @swap_operand_order( +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[X:%.*]], 0 +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[CMP]], i32 [[Y:%.*]], i32 0 +; CHECK-NEXT: [[RES:%.*]] = or i32 [[X]], [[SEL]] +; CHECK-NEXT: ret i32 [[RES]] +; + %cmp = icmp eq i32 %x, 0 + %sel = select i1 %cmp, i32 %y, i32 0 + %or = or i32 %x, %sel + ret i32 %or +} + +; Negative test: Non-zero false value in select +define i32 @negative_non_zero_false_val(i32 %a, i32 %b) { +; CHECK-LABEL: @negative_non_zero_false_val( +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[A:%.*]], 0 +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[CMP]], i32 [[B:%.*]], i32 1 +; CHECK-NEXT: [[OR:%.*]] = or i32 [[SEL]], [[A]] +; CHECK-NEXT: ret i32 [[OR]] +; + %cmp = icmp eq i32 %a, 0 + %sel = select i1 %cmp, i32 %b, i32 1 + %or = or i32 %sel, %a + ret i32 %or +} + +; Negative test: Incorrect comparison predicate (NE) +define i32 
@negative_wrong_predicate(i32 %a, i32 %b) { +; CHECK-LABEL: @negative_wrong_predicate( +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[A:%.*]], 0 +; CHECK-NEXT: [[OR:%.*]] = select i1 [[CMP]], i32 0, i32 [[TMP1:%.*]] +; CHECK-NEXT: [[OR1:%.*]] = or i32 [[OR]], [[A]] +; CHECK-NEXT: ret i32 [[OR1]] +; + %cmp = icmp ne i32 %a, 0 + %sel = select i1 %cmp, i32 %b, i32 0 + %or = or i32 %sel, %a + ret i32 %or +} + +; Comparison direction swap test (0 == X) +define i32 @cmp_swapped(i32 %x, i32 %y) { +; CHECK-LABEL: @cmp_swapped( +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[X:%.*]], 0 +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[CMP]], i32 [[Y:%.*]], i32 0 +; CHECK-NEXT: [[RES:%.*]] = or i32 [[X]], [[SEL]] +; CHECK-NEXT: ret i32 [[RES]] +; + %cmp = icmp eq i32 0, %x + %sel = select i1 %cmp, i32 %y, i32 0 + %or = or i32 %x, %sel + ret i32 %or +} + +; Complex expression test +define i32 @complex_expression(i32 %a, i32 %b) { +; CHECK-LABEL: @complex_expression( +; CHECK-NEXT: [[X:%.*]] = add i32 [[A:%.*]], 1 +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[X]], 0 +; CHECK-NEXT: [[RES:%.*]] = select i1 [[CMP]], i32 [[B:%.*]], i32 [[X]] +; CHECK-NEXT: ret i32 [[RES]] +; + %x = add i32 %a, 1 + %cmp = icmp eq i32 %x, 0 + %sel = select i1 %cmp, i32 %b, i32 0 + %or = or i32 %sel, %x + ret i32 %or +} + +; zext test +define i32 @zext_cond(i8 %a, i32 %b) { +; CHECK-LABEL: @zext_cond( +; CHECK-NEXT: [[Z:%.*]] = zext i8 [[A:%.*]] to i32 +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i8 [[A]], 0 +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[CMP]], i32 [[B:%.*]], i32 0 +; CHECK-NEXT: [[OR:%.*]] = or i32 [[SEL]], [[Z]] +; CHECK-NEXT: ret i32 [[OR]] +; + %z = zext i8 %a to i32 + %cmp = icmp eq i8 %a, 0 + %sel = select i1 %cmp, i32 %b, i32 0 + %or = or i32 %sel, %z + ret i32 %or +} + +; sext test +define i32 @sext_cond(i8 %a, i32 %b) { +; CHECK-LABEL: @sext_cond( +; CHECK-NEXT: [[S:%.*]] = sext i8 [[A:%.*]] to i32 +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i8 [[A]], 0 +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[CMP]], i32 [[B:%.*]], i32 0 +; CHECK-NEXT: [[OR:%.*]] = or i32 [[SEL]], [[S]] +; CHECK-NEXT: ret i32 [[OR]] +; + %s = sext i8 %a to i32 + %cmp = icmp eq i8 %a, 0 + %sel = select i1 %cmp, i32 %b, i32 0 + %or = or i32 %sel, %s + ret i32 %or +} + +; Vector type test +define <2 x i32> @vector_type(<2 x i32> %a, <2 x i32> %b) { +; CHECK-LABEL: @vector_type( +; CHECK-NEXT: [[CMP:%.*]] = icmp eq <2 x i32> [[A:%.*]], zeroinitializer +; CHECK-NEXT: [[RES:%.*]] = select <2 x i1> [[CMP]], <2 x i32> [[B:%.*]], <2 x i32> [[A]] +; CHECK-NEXT: ret <2 x i32> [[RES]] +; + %cmp = icmp eq <2 x i32> %a, zeroinitializer + %sel = select <2 x i1> %cmp, <2 x i32> %b, <2 x i32> zeroinitializer + %or = or <2 x i32> %sel, %a + ret <2 x i32> %or +} + +; Pointer type test (should not trigger optimization) +define ptr @pointer_type(ptr %p, ptr %q) { +; CHECK-LABEL: @pointer_type( +; CHECK-NEXT: [[A:%.*]] = ptrtoint ptr [[P:%.*]] to i64 +; CHECK-NEXT: [[CMP:%.*]] = icmp eq ptr [[P]], null +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[CMP]], ptr [[Q:%.*]], ptr null +; CHECK-NEXT: [[SEL_INT:%.*]] = ptrtoint ptr [[SEL]] to i64 +; CHECK-NEXT: [[OR:%.*]] = or i64 [[A]], [[SEL_INT]] +; CHECK-NEXT: [[RET:%.*]] = inttoptr i64 [[OR]] to ptr +; CHECK-NEXT: ret ptr [[RET]] +; + %a = ptrtoint ptr %p to i64 + %cmp = icmp eq i64 %a, 0 + %sel = select i1 %cmp, ptr %q, ptr null + %sel_int = ptrtoint ptr %sel to i64 + %or_val = or i64 %a, %sel_int + %ret = inttoptr i64 %or_val to ptr + ret ptr %ret +} + +; Multi-use test (should not trigger optimization) +define i32 @multi_use_test(i32 %x, i32 %m) { +; 
CHECK-LABEL: @multi_use_test( +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[X:%.*]], 0 +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[CMP]], i32 [[M:%.*]], i32 0 +; CHECK-NEXT: [[OR:%.*]] = or i32 [[SEL]], [[X]] +; CHECK-NEXT: [[ADD:%.*]] = add i32 [[SEL]], [[X]] +; CHECK-NEXT: [[O2:%.*]] = sub i32 [[OR]], [[ADD]] +; CHECK-NEXT: ret i32 [[O2]] +; + %cmp = icmp eq i32 %x, 0 + %sel = select i1 %cmp, i32 %m, i32 0 + %or = or i32 %sel, %x + %add = add i32 %sel, %x + %res = sub i32 %or, %add + ret i32 %res +} diff --git a/llvm/test/Transforms/InstCombine/recurrence.ll b/llvm/test/Transforms/InstCombine/recurrence.ll index f75e0d439c572..643e7efc243a3 100644 --- a/llvm/test/Transforms/InstCombine/recurrence.ll +++ b/llvm/test/Transforms/InstCombine/recurrence.ll @@ -24,9 +24,9 @@ loop: ; preds = %loop, %entry define i64 @test_or2(i64 %a, i64 %b) { ; CHECK-LABEL: @test_or2( ; CHECK-NEXT: entry: +; CHECK-NEXT: [[IV_NEXT:%.*]] = or i64 [[A:%.*]], [[B:%.*]] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[IV_NEXT:%.*]] = or i64 [[A:%.*]], [[B:%.*]] ; CHECK-NEXT: tail call void @use(i64 [[IV_NEXT]]) ; CHECK-NEXT: br label [[LOOP]] ; @@ -104,9 +104,9 @@ loop: ; preds = %loop, %entry define i64 @test_and2(i64 %a, i64 %b) { ; CHECK-LABEL: @test_and2( ; CHECK-NEXT: entry: +; CHECK-NEXT: [[IV_NEXT:%.*]] = and i64 [[A:%.*]], [[B:%.*]] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[IV_NEXT:%.*]] = and i64 [[A:%.*]], [[B:%.*]] ; CHECK-NEXT: tail call void @use(i64 [[IV_NEXT]]) ; CHECK-NEXT: br label [[LOOP]] ; diff --git a/llvm/test/Transforms/InstCombine/sub-gep.ll b/llvm/test/Transforms/InstCombine/sub-gep.ll index ee70137e8fbd7..01da63fa5b0af 100644 --- a/llvm/test/Transforms/InstCombine/sub-gep.ll +++ b/llvm/test/Transforms/InstCombine/sub-gep.ll @@ -858,8 +858,7 @@ define i1 @_gep_phi2(ptr %str1, i64 %val2) { ; CHECK: while.end.i: ; CHECK-NEXT: br label [[_Z3FOOPKC_EXIT]] ; CHECK: _Z3fooPKc.exit: -; CHECK-NEXT: [[RETVAL_0_I:%.*]] = phi i64 [ 1, [[WHILE_END_I]] ], [ 0, [[LOR_LHS_FALSE_I]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[TMP2:%.*]] = or i64 [[RETVAL_0_I]], [[VAL2:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = phi i64 [ 1, [[WHILE_END_I]] ], [ [[VAL2:%.*]], [[LOR_LHS_FALSE_I]] ], [ [[VAL2]], [[ENTRY:%.*]] ] ; CHECK-NEXT: [[TOBOOL:%.*]] = icmp eq i64 [[TMP2]], 0 ; CHECK-NEXT: ret i1 [[TOBOOL]] ; diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/predicated-reduction.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/predicated-reduction.ll index 55adda7d5b0f3..08191c636bd3f 100644 --- a/llvm/test/Transforms/PhaseOrdering/AArch64/predicated-reduction.ll +++ b/llvm/test/Transforms/PhaseOrdering/AArch64/predicated-reduction.ll @@ -18,45 +18,45 @@ define nofpclass(nan inf) double @monte_simple(i32 noundef %nblocks, i32 noundef ; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[WIDE_TRIP_COUNT]], 2147483640 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x double> poison, double [[Y]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x double> [[BROADCAST_SPLATINSERT]], <4 x double> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[BROADCAST_SPLATINSERT19:%.*]] = insertelement <4 x double> poison, double [[Z]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT20:%.*]] = shufflevector <4 x double> [[BROADCAST_SPLATINSERT19]], <4 x double> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT14:%.*]] = insertelement <4 x double> poison, double [[Z]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT15:%.*]] = shufflevector <4 x double> [[BROADCAST_SPLATINSERT14]], <4 
x double> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: -; CHECK-NEXT: [[INDVARS_IV1:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x double> [ <double 0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %[[VECTOR_PH]] ], [ [[TMP18:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI15:%.*]] = phi <4 x double> [ splat (double -0.000000e+00), %[[VECTOR_PH]] ], [ [[TMP19:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI16:%.*]] = phi <4 x double> [ <double 0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %[[VECTOR_PH]] ], [ [[TMP14:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI17:%.*]] = phi <4 x double> [ splat (double -0.000000e+00), %[[VECTOR_PH]] ], [ [[TMP15:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds nuw float, ptr [[SAMPLES]], i64 [[INDVARS_IV1]] -; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX1]], i64 16 -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[ARRAYIDX1]], align 4 -; CHECK-NEXT: [[WIDE_LOAD18:%.*]] = load <4 x float>, ptr [[TMP23]], align 4 +; CHECK-NEXT: [[VEC_PHI16:%.*]] = phi <4 x double> [ splat (double -0.000000e+00), %[[VECTOR_PH]] ], [ [[TMP19:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI17:%.*]] = phi <4 x double> [ <double 0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %[[VECTOR_PH]] ], [ [[TMP14:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI18:%.*]] = phi <4 x double> [ splat (double -0.000000e+00), %[[VECTOR_PH]] ], [ [[TMP15:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[SAMPLES]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX]], i64 16 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[WIDE_LOAD19:%.*]] = load <4 x float>, ptr [[TMP1]], align 4 ; CHECK-NEXT: [[TMP2:%.*]] = fpext <4 x float> [[WIDE_LOAD]] to <4 x double> -; CHECK-NEXT: [[TMP3:%.*]] = fpext <4 x float> [[WIDE_LOAD18]] to <4 x double> +; CHECK-NEXT: [[TMP3:%.*]] = fpext <4 x float> [[WIDE_LOAD19]] to <4 x double> ; CHECK-NEXT: [[TMP4:%.*]] = fmul fast <4 x double> [[BROADCAST_SPLAT]], [[TMP2]] ; CHECK-NEXT: [[TMP5:%.*]] = fmul fast <4 x double> [[BROADCAST_SPLAT]], [[TMP3]] -; CHECK-NEXT: [[TMP6:%.*]] = fsub fast <4 x double> [[TMP4]], [[BROADCAST_SPLAT20]] -; CHECK-NEXT: [[TMP7:%.*]] = fsub fast <4 x double> [[TMP5]], [[BROADCAST_SPLAT20]] +; CHECK-NEXT: [[TMP6:%.*]] = fsub fast <4 x double> [[TMP4]], [[BROADCAST_SPLAT15]] +; CHECK-NEXT: [[TMP7:%.*]] = fsub fast <4 x double> [[TMP5]], [[BROADCAST_SPLAT15]] ; CHECK-NEXT: [[TMP8:%.*]] = fcmp fast ogt <4 x double> [[TMP6]], zeroinitializer ; CHECK-NEXT: [[TMP9:%.*]] = fcmp fast ogt <4 x double> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP10:%.*]] = fmul fast <4 x double> [[TMP6]], [[TMP6]] ; CHECK-NEXT: [[TMP11:%.*]] = fmul fast <4 x double> [[TMP7]], [[TMP7]] ; CHECK-NEXT: [[TMP12:%.*]] = select ninf <4 x i1> [[TMP8]], <4 x double> [[TMP6]], <4 x double> splat (double -0.000000e+00) ; CHECK-NEXT: [[TMP13:%.*]] = select ninf <4 x i1> [[TMP9]], <4 x double> [[TMP7]], <4 x double> splat (double -0.000000e+00) -; CHECK-NEXT: [[TMP14]] = fadd reassoc arcp contract afn <4 x double> [[VEC_PHI16]], 
[[TMP12]] -; CHECK-NEXT: [[TMP15]] = fadd reassoc arcp contract afn <4 x double> [[VEC_PHI17]], [[TMP13]] +; CHECK-NEXT: [[TMP14]] = fadd reassoc arcp contract afn <4 x double> [[VEC_PHI17]], [[TMP12]] +; CHECK-NEXT: [[TMP15]] = fadd reassoc arcp contract afn <4 x double> [[VEC_PHI18]], [[TMP13]] ; CHECK-NEXT: [[TMP16:%.*]] = select ninf <4 x i1> [[TMP8]], <4 x double> [[TMP10]], <4 x double> splat (double -0.000000e+00) ; CHECK-NEXT: [[TMP17:%.*]] = select ninf <4 x i1> [[TMP9]], <4 x double> [[TMP11]], <4 x double> splat (double -0.000000e+00) ; CHECK-NEXT: [[TMP18]] = fadd reassoc arcp contract afn <4 x double> [[VEC_PHI]], [[TMP16]] -; CHECK-NEXT: [[TMP19]] = fadd reassoc arcp contract afn <4 x double> [[VEC_PHI15]], [[TMP17]] -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDVARS_IV1]], 8 +; CHECK-NEXT: [[TMP19]] = fadd reassoc arcp contract afn <4 x double> [[VEC_PHI16]], [[TMP17]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDVARS_IV]], 8 ; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd reassoc arcp contract afn <4 x double> [[TMP19]], [[TMP18]] ; CHECK-NEXT: [[TMP21:%.*]] = tail call reassoc arcp contract afn double @llvm.vector.reduce.fadd.v4f64(double -0.000000e+00, <4 x double> [[BIN_RDX]]) -; CHECK-NEXT: [[BIN_RDX21:%.*]] = fadd reassoc arcp contract afn <4 x double> [[TMP15]], [[TMP14]] -; CHECK-NEXT: [[TMP22:%.*]] = tail call reassoc arcp contract afn double @llvm.vector.reduce.fadd.v4f64(double -0.000000e+00, <4 x double> [[BIN_RDX21]]) +; CHECK-NEXT: [[BIN_RDX20:%.*]] = fadd reassoc arcp contract afn <4 x double> [[TMP15]], [[TMP14]] +; CHECK-NEXT: [[TMP22:%.*]] = tail call reassoc arcp contract afn double @llvm.vector.reduce.fadd.v4f64(double -0.000000e+00, <4 x double> [[BIN_RDX20]]) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[WIDE_TRIP_COUNT]] ; CHECK-NEXT: br i1 [[CMP_N]], label %[[FOR_END_LOOPEXIT:.*]], label %[[FOR_BODY_PREHEADER22]] ; CHECK: [[FOR_BODY_PREHEADER22]]: @@ -65,11 +65,11 @@ define nofpclass(nan inf) double @monte_simple(i32 noundef %nblocks, i32 noundef ; CHECK-NEXT: [[V0_010_PH:%.*]] = phi double [ 0.000000e+00, %[[FOR_BODY_PREHEADER]] ], [ [[TMP22]], %[[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label %[[FOR_BODY:.*]] ; CHECK: [[FOR_BODY]]: -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ], [ [[INDVARS_IV_PH]], %[[FOR_BODY_PREHEADER22]] ] +; CHECK-NEXT: [[INDVARS_IV1:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ], [ [[INDVARS_IV_PH]], %[[FOR_BODY_PREHEADER22]] ] ; CHECK-NEXT: [[V1_012:%.*]] = phi double [ [[V1_2:%.*]], %[[FOR_BODY]] ], [ [[V1_011_PH]], %[[FOR_BODY_PREHEADER22]] ] ; CHECK-NEXT: [[V0_011:%.*]] = phi double [ [[V0_2:%.*]], %[[FOR_BODY]] ], [ [[V0_010_PH]], %[[FOR_BODY_PREHEADER22]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[SAMPLES]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds nuw float, ptr [[SAMPLES]], i64 [[INDVARS_IV1]] +; CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX1]], align 4 ; CHECK-NEXT: [[CONV:%.*]] = fpext float [[TMP0]] to double ; CHECK-NEXT: [[MUL:%.*]] = fmul fast double [[Y]], [[CONV]] ; CHECK-NEXT: [[SUB:%.*]] = fsub fast double [[MUL]], [[Z]] @@ -79,16 +79,16 @@ define nofpclass(nan inf) double @monte_simple(i32 noundef %nblocks, i32 
noundef ; CHECK-NEXT: [[V0_2]] = fadd reassoc arcp contract afn double [[V0_011]], [[ADD8]] ; CHECK-NEXT: [[ADD4:%.*]] = select ninf i1 [[CMP1]], double [[MUL3]], double -0.000000e+00 ; CHECK-NEXT: [[V1_2]] = fadd reassoc arcp contract afn double [[V1_012]], [[ADD4]] -; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV1]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] ; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: [[FOR_END_LOOPEXIT]]: -; CHECK-NEXT: [[V0_1:%.*]] = phi double [ [[TMP22]], %[[MIDDLE_BLOCK]] ], [ [[V0_2]], %[[FOR_BODY]] ] -; CHECK-NEXT: [[V1_1:%.*]] = phi double [ [[TMP21]], %[[MIDDLE_BLOCK]] ], [ [[V1_2]], %[[FOR_BODY]] ] -; CHECK-NEXT: [[TMP1:%.*]] = fadd fast double [[V1_1]], [[V0_1]] +; CHECK-NEXT: [[V0_1_LCSSA:%.*]] = phi double [ [[TMP22]], %[[MIDDLE_BLOCK]] ], [ [[V0_2]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[V1_1_LCSSA:%.*]] = phi double [ [[TMP21]], %[[MIDDLE_BLOCK]] ], [ [[V1_2]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[TMP24:%.*]] = fadd fast double [[V1_1_LCSSA]], [[V0_1_LCSSA]] ; CHECK-NEXT: br label %[[FOR_END]] ; CHECK: [[FOR_END]]: -; CHECK-NEXT: [[ADD5:%.*]] = phi double [ 0.000000e+00, %[[ENTRY]] ], [ [[TMP1]], %[[FOR_END_LOOPEXIT]] ] +; CHECK-NEXT: [[ADD5:%.*]] = phi double [ 0.000000e+00, %[[ENTRY]] ], [ [[TMP24]], %[[FOR_END_LOOPEXIT]] ] ; CHECK-NEXT: ret double [[ADD5]] ; entry: @@ -193,29 +193,29 @@ define nofpclass(nan inf) double @monte_exp(i32 noundef %nblocks, i32 noundef %R ; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[WIDE_TRIP_COUNT]], 2147483640 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x double> poison, double [[Y]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x double> [[BROADCAST_SPLATINSERT]], <4 x double> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[BROADCAST_SPLATINSERT35:%.*]] = insertelement <4 x double> poison, double [[Z]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT36:%.*]] = shufflevector <4 x double> [[BROADCAST_SPLATINSERT35]], <4 x double> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT29:%.*]] = insertelement <4 x double> poison, double [[Z]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT30:%.*]] = shufflevector <4 x double> [[BROADCAST_SPLATINSERT29]], <4 x double> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[WIDE_TRIP_COUNT]] ; CHECK-NEXT: br label %[[FOR_BODY_US:.*]] ; CHECK: [[FOR_BODY_US]]: -; CHECK-NEXT: [[V1_021_US:%.*]] = phi double [ [[V1_2_US_LCSSA:%.*]], %[[FOR_COND1_FOR_INC8_CRIT_EDGE_US:.*]] ], [ 0.000000e+00, %[[FOR_BODY_US_PREHEADER]] ] -; CHECK-NEXT: [[V0_020_US:%.*]] = phi double [ [[V0_2_US_LCSSA:%.*]], %[[FOR_COND1_FOR_INC8_CRIT_EDGE_US]] ], [ 0.000000e+00, %[[FOR_BODY_US_PREHEADER]] ] +; CHECK-NEXT: [[V1_019_US:%.*]] = phi double [ [[V1_2_US_LCSSA:%.*]], %[[FOR_COND1_FOR_INC8_CRIT_EDGE_US:.*]] ], [ 0.000000e+00, %[[FOR_BODY_US_PREHEADER]] ] +; CHECK-NEXT: [[V0_018_US:%.*]] = phi double [ [[V0_2_US_LCSSA:%.*]], %[[FOR_COND1_FOR_INC8_CRIT_EDGE_US]] ], [ 0.000000e+00, %[[FOR_BODY_US_PREHEADER]] ] ; CHECK-NEXT: [[BLOCK_017_US:%.*]] = phi i32 [ [[INC9_US:%.*]], %[[FOR_COND1_FOR_INC8_CRIT_EDGE_US]] ], [ 0, %[[FOR_BODY_US_PREHEADER]] ] ; CHECK-NEXT: tail call void @resample(i32 noundef [[RAND_BLOCK_LENGTH]], ptr noundef [[SAMPLES]]) ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[FOR_BODY3_US_PREHEADER:.*]], label 
%[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x double> <double poison, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, double [[V1_021_US]], i64 0 -; CHECK-NEXT: [[TMP27:%.*]] = insertelement <4 x double> <double poison, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, double [[V0_020_US]], i64 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x double> <double poison, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, double [[V1_019_US]], i64 0 +; CHECK-NEXT: [[TMP27:%.*]] = insertelement <4 x double> <double poison, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, double [[V0_018_US]], i64 0 ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: -; CHECK-NEXT: [[INDVARS_IV1:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x double> [ [[TMP2]], %[[VECTOR_PH]] ], [ [[TMP22:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_PHI31:%.*]] = phi <4 x double> [ splat (double -0.000000e+00), %[[VECTOR_PH]] ], [ [[TMP23:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_PHI32:%.*]] = phi <4 x double> [ [[TMP27]], %[[VECTOR_PH]] ], [ [[TMP18:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_PHI33:%.*]] = phi <4 x double> [ splat (double -0.000000e+00), %[[VECTOR_PH]] ], [ [[TMP19:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[ARRAYIDX_US1:%.*]] = getelementptr inbounds nuw float, ptr [[SAMPLES]], i64 [[INDVARS_IV1]] -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX_US1]], i64 16 -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[ARRAYIDX_US1]], align 4 +; CHECK-NEXT: [[ARRAYIDX_US:%.*]] = getelementptr inbounds nuw float, ptr [[SAMPLES]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX_US]], i64 16 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[ARRAYIDX_US]], align 4 ; CHECK-NEXT: [[WIDE_LOAD34:%.*]] = load <4 x float>, ptr [[TMP3]], align 4 ; CHECK-NEXT: [[TMP4:%.*]] = fpext <4 x float> [[WIDE_LOAD]] to <4 x double> ; CHECK-NEXT: [[TMP5:%.*]] = fpext <4 x float> [[WIDE_LOAD34]] to <4 x double> @@ -223,8 +223,8 @@ define nofpclass(nan inf) double @monte_exp(i32 noundef %nblocks, i32 noundef %R ; CHECK-NEXT: [[TMP7:%.*]] = tail call fast <4 x double> @llvm.exp2.v4f64(<4 x double> [[TMP5]]) ; CHECK-NEXT: [[TMP8:%.*]] = fmul fast <4 x double> [[TMP6]], [[BROADCAST_SPLAT]] ; CHECK-NEXT: [[TMP9:%.*]] = fmul fast <4 x double> [[TMP7]], [[BROADCAST_SPLAT]] -; CHECK-NEXT: [[TMP10:%.*]] = fsub fast <4 x double> [[TMP8]], [[BROADCAST_SPLAT36]] -; CHECK-NEXT: [[TMP11:%.*]] = fsub fast <4 x double> [[TMP9]], [[BROADCAST_SPLAT36]] +; CHECK-NEXT: [[TMP10:%.*]] = fsub fast <4 x double> [[TMP8]], [[BROADCAST_SPLAT30]] +; CHECK-NEXT: [[TMP11:%.*]] = fsub fast <4 x double> [[TMP9]], [[BROADCAST_SPLAT30]] ; CHECK-NEXT: [[TMP12:%.*]] = fcmp fast ogt <4 x double> [[TMP10]], zeroinitializer ; CHECK-NEXT: [[TMP13:%.*]] = fcmp fast ogt <4 x double> [[TMP11]], zeroinitializer ; CHECK-NEXT: [[TMP14:%.*]] = fmul fast <4 x double> [[TMP10]], [[TMP10]] @@ -237,26 +237,26 @@ define nofpclass(nan inf) double @monte_exp(i32 noundef %nblocks, i32 noundef %R ; CHECK-NEXT: [[TMP21:%.*]] = select ninf <4 x i1> [[TMP13]], <4 x double> [[TMP15]], <4 x double> splat (double -0.000000e+00) ; CHECK-NEXT: [[TMP22]] = fadd reassoc arcp contract afn <4 x double> 
[[VEC_PHI]], [[TMP20]] ; CHECK-NEXT: [[TMP23]] = fadd reassoc arcp contract afn <4 x double> [[VEC_PHI31]], [[TMP21]] -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDVARS_IV1]], 8 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDVARS_IV]], 8 ; CHECK-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP24]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd reassoc arcp contract afn <4 x double> [[TMP23]], [[TMP22]] ; CHECK-NEXT: [[TMP25:%.*]] = tail call reassoc arcp contract afn double @llvm.vector.reduce.fadd.v4f64(double -0.000000e+00, <4 x double> [[BIN_RDX]]) -; CHECK-NEXT: [[BIN_RDX37:%.*]] = fadd reassoc arcp contract afn <4 x double> [[TMP19]], [[TMP18]] -; CHECK-NEXT: [[TMP26:%.*]] = tail call reassoc arcp contract afn double @llvm.vector.reduce.fadd.v4f64(double -0.000000e+00, <4 x double> [[BIN_RDX37]]) +; CHECK-NEXT: [[BIN_RDX35:%.*]] = fadd reassoc arcp contract afn <4 x double> [[TMP19]], [[TMP18]] +; CHECK-NEXT: [[TMP26:%.*]] = tail call reassoc arcp contract afn double @llvm.vector.reduce.fadd.v4f64(double -0.000000e+00, <4 x double> [[BIN_RDX35]]) ; CHECK-NEXT: br i1 [[CMP_N]], label %[[FOR_COND1_FOR_INC8_CRIT_EDGE_US]], label %[[FOR_BODY3_US_PREHEADER]] ; CHECK: [[FOR_BODY3_US_PREHEADER]]: ; CHECK-NEXT: [[INDVARS_IV_PH:%.*]] = phi i64 [ 0, %[[FOR_BODY_US]] ], [ [[N_VEC]], %[[MIDDLE_BLOCK]] ] -; CHECK-NEXT: [[V1_114_US_PH:%.*]] = phi double [ [[V1_021_US]], %[[FOR_BODY_US]] ], [ [[TMP25]], %[[MIDDLE_BLOCK]] ] -; CHECK-NEXT: [[V0_113_US_PH:%.*]] = phi double [ [[V0_020_US]], %[[FOR_BODY_US]] ], [ [[TMP26]], %[[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[V1_114_US_PH:%.*]] = phi double [ [[V1_019_US]], %[[FOR_BODY_US]] ], [ [[TMP25]], %[[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[V0_113_US_PH:%.*]] = phi double [ [[V0_018_US]], %[[FOR_BODY_US]] ], [ [[TMP26]], %[[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label %[[FOR_BODY3_US:.*]] ; CHECK: [[FOR_BODY3_US]]: -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY3_US]] ], [ [[INDVARS_IV_PH]], %[[FOR_BODY3_US_PREHEADER]] ] +; CHECK-NEXT: [[INDVARS_IV1:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY3_US]] ], [ [[INDVARS_IV_PH]], %[[FOR_BODY3_US_PREHEADER]] ] ; CHECK-NEXT: [[V1_116_US:%.*]] = phi double [ [[V1_2_US:%.*]], %[[FOR_BODY3_US]] ], [ [[V1_114_US_PH]], %[[FOR_BODY3_US_PREHEADER]] ] ; CHECK-NEXT: [[V0_115_US:%.*]] = phi double [ [[V0_2_US:%.*]], %[[FOR_BODY3_US]] ], [ [[V0_113_US_PH]], %[[FOR_BODY3_US_PREHEADER]] ] -; CHECK-NEXT: [[ARRAYIDX_US:%.*]] = getelementptr inbounds nuw float, ptr [[SAMPLES]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX_US]], align 4 +; CHECK-NEXT: [[ARRAYIDX_US1:%.*]] = getelementptr inbounds nuw float, ptr [[SAMPLES]], i64 [[INDVARS_IV1]] +; CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX_US1]], align 4 ; CHECK-NEXT: [[CONV_US:%.*]] = fpext float [[TMP0]] to double ; CHECK-NEXT: [[TMP1:%.*]] = tail call fast double @llvm.exp2.f64(double [[CONV_US]]) ; CHECK-NEXT: [[MUL_US:%.*]] = fmul fast double [[TMP1]], [[Y]] @@ -267,7 +267,7 @@ define nofpclass(nan inf) double @monte_exp(i32 noundef %nblocks, i32 noundef %R ; CHECK-NEXT: [[V0_2_US]] = fadd reassoc arcp contract afn double [[V0_115_US]], [[ADD12_US]] ; CHECK-NEXT: [[ADD7_US1:%.*]] = select ninf i1 [[CMP4_US]], double [[ADD7_US]], double -0.000000e+00 ; CHECK-NEXT: [[V1_2_US]] = fadd reassoc arcp contract afn double [[V1_116_US]], [[ADD7_US1]] -; CHECK-NEXT: 
[[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV1]], 1 ; CHECK-NEXT: [[EXITCOND25_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] ; CHECK-NEXT: br i1 [[EXITCOND25_NOT]], label %[[FOR_COND1_FOR_INC8_CRIT_EDGE_US]], label %[[FOR_BODY3_US]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: [[FOR_COND1_FOR_INC8_CRIT_EDGE_US]]: @@ -275,17 +275,18 @@ define nofpclass(nan inf) double @monte_exp(i32 noundef %nblocks, i32 noundef %R ; CHECK-NEXT: [[V1_2_US_LCSSA]] = phi double [ [[TMP25]], %[[MIDDLE_BLOCK]] ], [ [[V1_2_US]], %[[FOR_BODY3_US]] ] ; CHECK-NEXT: [[INC9_US]] = add nuw nsw i32 [[BLOCK_017_US]], 1 ; CHECK-NEXT: [[EXITCOND26_NOT:%.*]] = icmp eq i32 [[INC9_US]], [[NBLOCKS]] -; CHECK-NEXT: br i1 [[EXITCOND26_NOT]], label %[[FOR_END10]], label %[[FOR_BODY_US]] +; CHECK-NEXT: br i1 [[EXITCOND26_NOT]], label %[[FOR_END10_LOOPEXIT:.*]], label %[[FOR_BODY_US]] ; CHECK: [[FOR_BODY]]: ; CHECK-NEXT: [[BLOCK_017:%.*]] = phi i32 [ [[INC9:%.*]], %[[FOR_BODY]] ], [ 0, %[[FOR_BODY_LR_PH]] ] ; CHECK-NEXT: tail call void @resample(i32 noundef [[RAND_BLOCK_LENGTH]], ptr noundef [[SAMPLES]]) ; CHECK-NEXT: [[INC9]] = add nuw nsw i32 [[BLOCK_017]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC9]], [[NBLOCKS]] ; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END10]], label %[[FOR_BODY]] +; CHECK: [[FOR_END10_LOOPEXIT]]: +; CHECK-NEXT: [[TMP29:%.*]] = fadd fast double [[V1_2_US_LCSSA]], [[V0_2_US_LCSSA]] +; CHECK-NEXT: br label %[[FOR_END10]] ; CHECK: [[FOR_END10]]: -; CHECK-NEXT: [[V0_0_LCSSA:%.*]] = phi double [ 0.000000e+00, %[[ENTRY]] ], [ [[V0_2_US_LCSSA]], %[[FOR_COND1_FOR_INC8_CRIT_EDGE_US]] ], [ 0.000000e+00, %[[FOR_BODY]] ] -; CHECK-NEXT: [[V1_0_LCSSA:%.*]] = phi double [ 0.000000e+00, %[[ENTRY]] ], [ [[V1_2_US_LCSSA]], %[[FOR_COND1_FOR_INC8_CRIT_EDGE_US]] ], [ 0.000000e+00, %[[FOR_BODY]] ] -; CHECK-NEXT: [[ADD11:%.*]] = fadd fast double [[V1_0_LCSSA]], [[V0_0_LCSSA]] +; CHECK-NEXT: [[ADD11:%.*]] = phi double [ 0.000000e+00, %[[ENTRY]] ], [ [[TMP29]], %[[FOR_END10_LOOPEXIT]] ], [ 0.000000e+00, %[[FOR_BODY]] ] ; CHECK-NEXT: ret double [[ADD11]] ; entry: From 8eacea993408c27287f985f27376501442b4dfc6 Mon Sep 17 00:00:00 2001 From: Andi Drebes <47449897+andidr@users.noreply.github.com> Date: Wed, 5 Nov 2025 10:05:06 +0100 Subject: [PATCH 290/313] [MLIR][ODS] Re-enable direct implementation of type interfaces with method bodies (#166335) Since commit 842622bf8bea782e9d9865ed78b0d8643f098122, which added support for overloading interface methods, a `using` directive is emitted for any interface method that does not require emission of a trait method, including for methods that define a method body. However, methods directly specifying a body (e.g., via the `methodBody` parameter of `InterfaceMethod`) are implemented directly in the interface class and are therefore not present in the associated trait. The generated `using` directive then refers to a non-existent method of the trait, resulting in an error upon compilation of the generated code. This patch changes `DefGen::emitTraitMethods()` so that `genTraitMethodUsingDecl()` is no longer invoked for interface methods with a body.
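For context, a minimal hypothetical sketch of the failure mode (the struct and method names below are invented for illustration and do not match the generated code exactly): when an ODS `InterfaceMethod` supplies a `methodBody`, the method is emitted only on the interface class, so a trait-level `using` declaration has nothing to refer to.

```c++
// Hypothetical reduction of the tblgen output for a type interface whose
// method `printTypeA` is declared with a methodBody.
struct TypeInterfaceTrait {
  // Deliberately empty: a method with a methodBody lives on the interface
  // class itself, so the trait never declares it.
};

struct TestBaseBodyType : TypeInterfaceTrait {
  // Before this fix, DefGen still emitted the equivalent of:
  //   using TypeInterfaceTrait::printTypeA; // error: 'printTypeA' is not a
  //                                         // member of the trait
  // With the fix, no using declaration is generated for such methods.
};
```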
--- mlir/test/lib/Dialect/Test/TestTypeDefs.td | 7 +++++++ mlir/tools/mlir-tblgen/AttrOrTypeDefGen.cpp | 6 ++++-- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/mlir/test/lib/Dialect/Test/TestTypeDefs.td b/mlir/test/lib/Dialect/Test/TestTypeDefs.td index ea20597231d58..9859bd06cb526 100644 --- a/mlir/test/lib/Dialect/Test/TestTypeDefs.td +++ b/mlir/test/lib/Dialect/Test/TestTypeDefs.td @@ -470,4 +470,11 @@ def TestMemrefType : Test_Type<"TestMemref", }]; } +// Test implementation of an interface with methods specifying a +// method body +def TestBaseBody : Test_Type<"TestBaseBody", + [DeclareTypeInterfaceMethods<TestBaseTypeInterfacePrintTypeA>]> { + let mnemonic = "test_base_body"; +} + #endif // TEST_TYPEDEFS diff --git a/mlir/tools/mlir-tblgen/AttrOrTypeDefGen.cpp b/mlir/tools/mlir-tblgen/AttrOrTypeDefGen.cpp index 8ec2e03095423..2a513c3b8cc9b 100644 --- a/mlir/tools/mlir-tblgen/AttrOrTypeDefGen.cpp +++ b/mlir/tools/mlir-tblgen/AttrOrTypeDefGen.cpp @@ -637,8 +637,10 @@ void DefGen::emitTraitMethods(const InterfaceTrait &trait) { for (auto &method : iface.getMethods()) { // Don't declare if the method has a body. Or if the method has a default // implementation and the def didn't request that it always be declared. - if (method.getBody() || (method.getDefaultImplementation() && - !alwaysDeclared.count(method.getName()))) { + if (method.getBody()) + continue; + if (method.getDefaultImplementation() && + !alwaysDeclared.count(method.getName())) { genTraitMethodUsingDecl(trait, method); continue; } From 7b3fe5fd42f1178f98879642e4ed4bd23a74cfe6 Mon Sep 17 00:00:00 2001 From: David Sherwood <david.sherwood@arm.com> Date: Wed, 5 Nov 2025 09:18:02 +0000 Subject: [PATCH 291/313] [LV][NFC] Remove undef values in some test cases (#164401) Split off from PR #163525, this standalone patch replaces simple cases where undef is used as a value for arithmetic or getelementptr instructions. This will reduce the likelihood of contributors hitting the `undef deprecator` warning in github. --- .../X86/consecutive-ptr-cg-bug.ll | 12 +-- .../LoopVectorize/X86/cost-model-assert.ll | 20 ++--- .../X86/interleaved-accesses-large-gap.ll | 12 +-- .../demanded-bits-of-pointer-instruction.ll | 4 +- .../LoopVectorize/if-pred-stores.ll | 18 ++-- .../Transforms/LoopVectorize/nsw-crash.ll | 4 +- llvm/test/Transforms/LoopVectorize/optsize.ll | 88 +++++++++++-------- llvm/test/Transforms/LoopVectorize/pr32859.ll | 4 +- llvm/test/Transforms/LoopVectorize/pr36311.ll | 13 ++- .../Transforms/LoopVectorize/reduction-ptr.ll | 4 +- .../LoopVectorize/reduction-small-size.ll | 26 ++++-- .../LoopVectorize/runtime-drop-crash.ll | 4 +- .../LoopVectorize/undef-inst-bug.ll | 10 +-- 13 files changed, 121 insertions(+), 98 deletions(-) diff --git a/llvm/test/Transforms/LoopVectorize/X86/consecutive-ptr-cg-bug.ll b/llvm/test/Transforms/LoopVectorize/X86/consecutive-ptr-cg-bug.ll index e6b74062ad765..a33f8eb920039 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/consecutive-ptr-cg-bug.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/consecutive-ptr-cg-bug.ll @@ -35,7 +35,7 @@ target triple = "x86_64-unknown-linux-gnu" ; This test was originally vectorized, but now SCEV is smart enough to prove ; that its trip count is 1, so it gets ignored by vectorizer. 
; Function Attrs: uwtable -define void @test_01(i1 %arg) { +define void @test_01(ptr addrspace(1) %p, i1 %arg) { br label %.outer ; <label>:1: ; preds = %2 @@ -57,8 +57,8 @@ define void @test_01(i1 %arg) { %8 = phi i32 [ %.ph2, %.outer ], [ %7, %6 ] %9 = add i32 %8, 2 %10 = zext i32 %9 to i64 - %11 = getelementptr inbounds i32, ptr addrspace(1) undef, i64 %10 - %12 = ashr i32 undef, %4 + %11 = getelementptr inbounds i32, ptr addrspace(1) %p, i64 %10 + %12 = ashr i32 12, %4 store i32 %12, ptr addrspace(1) %11, align 4 %13 = add i32 %7, 1 %14 = icmp sgt i32 %13, 61 @@ -74,7 +74,7 @@ define void @test_01(i1 %arg) { ; CHECK: store <4 x i32> ; Function Attrs: uwtable -define void @test_02(i1 %arg) { +define void @test_02(ptr addrspace(1) %p, i1 %arg) { br label %.outer ; <label>:1: ; preds = %2 @@ -96,8 +96,8 @@ define void @test_02(i1 %arg) { %8 = phi i32 [ %.ph2, %.outer ], [ %7, %6 ] %9 = add i32 %8, 2 %10 = zext i32 %9 to i64 - %11 = getelementptr inbounds i32, ptr addrspace(1) undef, i64 %10 - %12 = ashr i32 undef, %4 + %11 = getelementptr inbounds i32, ptr addrspace(1) %p, i64 %10 + %12 = ashr i32 12, %4 store i32 %12, ptr addrspace(1) %11, align 4 %13 = add i32 %7, 1 %14 = icmp sgt i32 %13, 610 diff --git a/llvm/test/Transforms/LoopVectorize/X86/cost-model-assert.ll b/llvm/test/Transforms/LoopVectorize/X86/cost-model-assert.ll index 4cff8753ba9b1..239366c59470e 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/cost-model-assert.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/cost-model-assert.ll @@ -11,9 +11,9 @@ target datalayout = "e-m:w-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-w64-windows-gnu" -define void @cff_index_load_offsets(i1 %cond, i8 %x, ptr %p) #0 { +define void @cff_index_load_offsets(i1 %cond, i8 %x, ptr %p, ptr %pend) #0 { ; CHECK-LABEL: define void @cff_index_load_offsets( -; CHECK-SAME: i1 [[COND:%.*]], i8 [[X:%.*]], ptr [[P:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-SAME: i1 [[COND:%.*]], i8 [[X:%.*]], ptr [[P:%.*]], ptr [[PEND:%.*]]) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: br i1 [[COND]], label %[[IF_THEN:.*]], label %[[EXIT:.*]] ; CHECK: [[IF_THEN]]: @@ -26,14 +26,14 @@ define void @cff_index_load_offsets(i1 %cond, i8 %x, ptr %p) #0 { ; CHECK-NEXT: [[CONV73:%.*]] = zext i8 [[TMP0]] to i32 ; CHECK-NEXT: [[SHL74:%.*]] = shl nuw nsw i32 [[CONV73]], 16 ; CHECK-NEXT: [[OR75:%.*]] = or i32 [[SHL74]], [[SHL71]] -; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr undef, align 1, !tbaa [[CHAR_TBAA1]] -; CHECK-NEXT: [[SHL78:%.*]] = shl nuw nsw i32 undef, 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr [[P]], align 1, !tbaa [[CHAR_TBAA1]] +; CHECK-NEXT: [[SHL78:%.*]] = shl nuw nsw i32 12, 8 ; CHECK-NEXT: [[OR79:%.*]] = or i32 [[OR75]], [[SHL78]] ; CHECK-NEXT: [[CONV81:%.*]] = zext i8 [[TMP1]] to i32 ; CHECK-NEXT: [[OR83:%.*]] = or i32 [[OR79]], [[CONV81]] -; CHECK-NEXT: store i32 [[OR83]], ptr undef, align 4, !tbaa [[LONG_TBAA4:![0-9]+]] +; CHECK-NEXT: store i32 [[OR83]], ptr [[P]], align 4, !tbaa [[LONG_TBAA4:![0-9]+]] ; CHECK-NEXT: [[ADD_PTR86]] = getelementptr inbounds i8, ptr [[P_359]], i64 4 -; CHECK-NEXT: [[CMP66:%.*]] = icmp ult ptr [[ADD_PTR86]], undef +; CHECK-NEXT: [[CMP66:%.*]] = icmp ult ptr [[ADD_PTR86]], [[PEND]] ; CHECK-NEXT: br i1 [[CMP66]], label %[[FOR_BODY68]], label %[[SW_EPILOG:.*]] ; CHECK: [[SW_EPILOG]]: ; CHECK-NEXT: unreachable @@ -54,14 +54,14 @@ for.body68: ; preds = %for.body68, %if.the %conv73 = zext i8 %0 to i32 %shl74 = shl nuw nsw i32 %conv73, 16 %or75 = or i32 %shl74, %shl71 - %1 = load i8, 
ptr undef, align 1, !tbaa !1 - %shl78 = shl nuw nsw i32 undef, 8 + %1 = load i8, ptr %p, align 1, !tbaa !1 + %shl78 = shl nuw nsw i32 12, 8 %or79 = or i32 %or75, %shl78 %conv81 = zext i8 %1 to i32 %or83 = or i32 %or79, %conv81 - store i32 %or83, ptr undef, align 4, !tbaa !4 + store i32 %or83, ptr %p, align 4, !tbaa !4 %add.ptr86 = getelementptr inbounds i8, ptr %p.359, i64 4 - %cmp66 = icmp ult ptr %add.ptr86, undef + %cmp66 = icmp ult ptr %add.ptr86, %pend br i1 %cmp66, label %for.body68, label %sw.epilog sw.epilog: ; preds = %for.body68 diff --git a/llvm/test/Transforms/LoopVectorize/X86/interleaved-accesses-large-gap.ll b/llvm/test/Transforms/LoopVectorize/X86/interleaved-accesses-large-gap.ll index e75d469506376..acec9e47a94ee 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/interleaved-accesses-large-gap.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/interleaved-accesses-large-gap.ll @@ -41,8 +41,8 @@ for.cond.cleanup: ; preds = %for.body ; Make sure interleave groups with a key being the special 'empty' value for ; the map do not cause a crash. -define void @test_gap_empty_key() { -; CHECK-LABEL: @test_gap_empty_key() +define void @test_gap_empty_key(ptr %p) { +; CHECK-LABEL: @test_gap_empty_key(ptr %p) ; CHECK-NEXT: entry: ; CHECK-NEXT: br label %for.body @@ -57,7 +57,7 @@ entry: for.body: %iv = phi i64 [ 1, %entry ], [ %iv.next, %for.body ] %iv.next = add nsw i64 %iv, 1 - %arrayidx = getelementptr inbounds [3 x i32], ptr undef, i64 0, i64 %iv.next + %arrayidx = getelementptr inbounds [3 x i32], ptr %p, i64 0, i64 %iv.next %G2 = getelementptr i32, ptr %arrayidx, i64 %iv.next %G9 = getelementptr i32, ptr %G2, i32 -2147483647 store i32 0, ptr %G2 @@ -71,8 +71,8 @@ exit: ; Make sure interleave groups with a key being the special 'tombstone' value for ; the map do not cause a crash. -define void @test_tombstone_key() { -; CHECK-LABEL: @test_tombstone_key() +define void @test_tombstone_key(ptr %p) { +; CHECK-LABEL: @test_tombstone_key(ptr %p) ; CHECK-NEXT: entry: ; CHECK-NEXT: br label %for.body @@ -87,7 +87,7 @@ entry: for.body: %iv = phi i64 [ 1, %entry ], [ %iv.next, %for.body ] %iv.next = add nsw i64 %iv, 1 - %arrayidx = getelementptr inbounds [3 x i32], ptr undef, i64 0, i64 %iv.next + %arrayidx = getelementptr inbounds [3 x i32], ptr %p, i64 0, i64 %iv.next %G2 = getelementptr i32, ptr %arrayidx, i64 %iv.next %G9 = getelementptr i32, ptr %G2, i32 -2147483648 store i32 0, ptr %G2 diff --git a/llvm/test/Transforms/LoopVectorize/demanded-bits-of-pointer-instruction.ll b/llvm/test/Transforms/LoopVectorize/demanded-bits-of-pointer-instruction.ll index 41756ffb64e6c..8744e45344242 100644 --- a/llvm/test/Transforms/LoopVectorize/demanded-bits-of-pointer-instruction.ll +++ b/llvm/test/Transforms/LoopVectorize/demanded-bits-of-pointer-instruction.ll @@ -4,13 +4,13 @@ ; Only make sure we do not crash. 
; CHECK: @test -define void @test(ptr %ptr, ptr %ptr_end) { +define void @test(i8 %v, ptr %ptr, ptr %ptr_end) { start: br label %loop loop: %ptr2 = phi ptr [ %ptr3, %loop ], [ %ptr, %start ] - %x = sext i8 undef to i64 + %x = sext i8 %v to i64 %ptr3 = getelementptr inbounds i8, ptr %ptr2, i64 1 %cmp = icmp ult ptr %ptr3, %ptr_end br i1 %cmp, label %loop, label %end diff --git a/llvm/test/Transforms/LoopVectorize/if-pred-stores.ll b/llvm/test/Transforms/LoopVectorize/if-pred-stores.ll index c164c4a46bd94..e7913c583b938 100644 --- a/llvm/test/Transforms/LoopVectorize/if-pred-stores.ll +++ b/llvm/test/Transforms/LoopVectorize/if-pred-stores.ll @@ -384,15 +384,15 @@ for.inc26: ; conditional store to remain scalar. Since we can only type-shrink vector ; types, we shouldn't try to represent the expression in a smaller type. ; -define void @minimal_bit_widths(i1 %c) { +define void @minimal_bit_widths(ptr %p, i1 %c) { ; UNROLL-LABEL: @minimal_bit_widths( ; UNROLL-NEXT: entry: ; UNROLL-NEXT: br label [[VECTOR_BODY:%.*]] ; UNROLL: vector.body: ; UNROLL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE2:%.*]] ] ; UNROLL-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 -; UNROLL-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr undef, i64 [[INDEX]] -; UNROLL-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr undef, i64 [[TMP1]] +; UNROLL-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[P:%.*]], i64 [[INDEX]] +; UNROLL-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP1]] ; UNROLL-NEXT: [[TMP4:%.*]] = load i8, ptr [[TMP2]], align 1 ; UNROLL-NEXT: [[TMP5:%.*]] = load i8, ptr [[TMP3]], align 1 ; UNROLL-NEXT: br i1 [[C:%.*]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE2]] @@ -415,8 +415,8 @@ define void @minimal_bit_widths(i1 %c) { ; UNROLL-NOSIMPLIFY: vector.body: ; UNROLL-NOSIMPLIFY-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE2:%.*]] ] ; UNROLL-NOSIMPLIFY-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 -; UNROLL-NOSIMPLIFY-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr undef, i64 [[INDEX]] -; UNROLL-NOSIMPLIFY-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr undef, i64 [[TMP1]] +; UNROLL-NOSIMPLIFY-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[P:%.*]], i64 [[INDEX]] +; UNROLL-NOSIMPLIFY-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP1]] ; UNROLL-NOSIMPLIFY-NEXT: [[TMP4:%.*]] = load i8, ptr [[TMP2]], align 1 ; UNROLL-NOSIMPLIFY-NEXT: [[TMP5:%.*]] = load i8, ptr [[TMP3]], align 1 ; UNROLL-NOSIMPLIFY-NEXT: br i1 [[C:%.*]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] @@ -442,16 +442,16 @@ define void @minimal_bit_widths(i1 %c) { ; VEC-NEXT: br label [[VECTOR_BODY:%.*]] ; VEC: vector.body: ; VEC-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE2:%.*]] ] -; VEC-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr undef, i64 [[INDEX]] +; VEC-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[P:%.*]], i64 [[INDEX]] ; VEC-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i8>, ptr [[TMP1]], align 1 ; VEC-NEXT: br i1 [[C:%.*]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE2]] ; VEC: pred.store.if: ; VEC-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 0 -; VEC-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr undef, i64 [[TMP8]] +; VEC-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP8]] ; VEC-NEXT: [[TMP4:%.*]] = extractelement <2 x i8> [[WIDE_LOAD]], i32 0 ; VEC-NEXT: store i8 [[TMP4]], ptr [[TMP3]], align 1 ; VEC-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 1 -; VEC-NEXT: [[TMP6:%.*]] = 
getelementptr i8, ptr undef, i64 [[TMP5]] +; VEC-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP5]] ; VEC-NEXT: [[TMP7:%.*]] = extractelement <2 x i8> [[WIDE_LOAD]], i32 1 ; VEC-NEXT: store i8 [[TMP7]], ptr [[TMP6]], align 1 ; VEC-NEXT: br label [[PRED_STORE_CONTINUE2]] @@ -468,7 +468,7 @@ entry: for.body: %tmp0 = phi i64 [ %tmp6, %for.inc ], [ 0, %entry ] %tmp1 = phi i64 [ %tmp7, %for.inc ], [ 1000, %entry ] - %tmp2 = getelementptr i8, ptr undef, i64 %tmp0 + %tmp2 = getelementptr i8, ptr %p, i64 %tmp0 %tmp3 = load i8, ptr %tmp2, align 1 br i1 %c, label %if.then, label %for.inc diff --git a/llvm/test/Transforms/LoopVectorize/nsw-crash.ll b/llvm/test/Transforms/LoopVectorize/nsw-crash.ll index 106054d989776..d87d9b155db1c 100644 --- a/llvm/test/Transforms/LoopVectorize/nsw-crash.ll +++ b/llvm/test/Transforms/LoopVectorize/nsw-crash.ll @@ -3,7 +3,7 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" -define void @test(i1 %arg) { +define void @test(ptr %p, i1 %arg) { entry: br i1 %arg, label %while.end, label %while.body.lr.ph @@ -11,7 +11,7 @@ while.body.lr.ph: br label %while.body while.body: - %it.sroa.0.091 = phi ptr [ undef, %while.body.lr.ph ], [ %incdec.ptr.i, %while.body ] + %it.sroa.0.091 = phi ptr [ %p, %while.body.lr.ph ], [ %incdec.ptr.i, %while.body ] %incdec.ptr.i = getelementptr inbounds i32, ptr %it.sroa.0.091, i64 1 %inc32 = add i32 undef, 1 ; <------------- Make sure we don't set NSW flags to the undef. %cmp.i11 = icmp eq ptr %incdec.ptr.i, undef diff --git a/llvm/test/Transforms/LoopVectorize/optsize.ll b/llvm/test/Transforms/LoopVectorize/optsize.ll index 763072ab16f73..f9f7feb7bdfbc 100644 --- a/llvm/test/Transforms/LoopVectorize/optsize.ll +++ b/llvm/test/Transforms/LoopVectorize/optsize.ll @@ -248,25 +248,27 @@ for.end: ; preds = %for.body ; @cm_array = external global [2592 x i16], align 1 -define void @pr43371() optsize { +define void @pr43371(i16 %val) optsize { ; ; CHECK-LABEL: define void @pr43371( -; CHECK-SAME: ) #[[ATTR0]] { +; CHECK-SAME: i16 [[VAL:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: br label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i16> poison, i16 [[VAL]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i16> [[BROADCAST_SPLATINSERT]], <2 x i16> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i16> [ <i16 0, i16 1>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = add <2 x i16> undef, [[VEC_IND]] +; CHECK-NEXT: [[TMP0:%.*]] = add <2 x i16> [[BROADCAST_SPLAT]], [[VEC_IND]] ; CHECK-NEXT: [[TMP1:%.*]] = zext <2 x i16> [[TMP0]] to <2 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP4]] -; CHECK-NEXT: store i16 0, ptr [[TMP3]], align 1 +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr 
[2592 x i16], ptr @cm_array, i32 0, i32 [[TMP3]] ; CHECK-NEXT: store i16 0, ptr [[TMP5]], align 1 +; CHECK-NEXT: store i16 0, ptr [[TMP7]], align 1 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i16> [[VEC_IND]], splat (i16 2) ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 756 @@ -277,22 +279,24 @@ define void @pr43371() optsize { ; CHECK-NEXT: unreachable ; ; PGSO-LABEL: define void @pr43371( -; PGSO-SAME: ) #[[ATTR0]] { +; PGSO-SAME: i16 [[VAL:%.*]]) #[[ATTR0]] { ; PGSO-NEXT: [[ENTRY:.*:]] ; PGSO-NEXT: br label %[[VECTOR_PH:.*]] ; PGSO: [[VECTOR_PH]]: +; PGSO-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i16> poison, i16 [[VAL]], i64 0 +; PGSO-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i16> [[BROADCAST_SPLATINSERT]], <2 x i16> poison, <2 x i32> zeroinitializer ; PGSO-NEXT: br label %[[VECTOR_BODY:.*]] ; PGSO: [[VECTOR_BODY]]: ; PGSO-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; PGSO-NEXT: [[VEC_IND:%.*]] = phi <2 x i16> [ <i16 0, i16 1>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] -; PGSO-NEXT: [[TMP0:%.*]] = add <2 x i16> undef, [[VEC_IND]] +; PGSO-NEXT: [[TMP0:%.*]] = add <2 x i16> [[BROADCAST_SPLAT]], [[VEC_IND]] ; PGSO-NEXT: [[TMP1:%.*]] = zext <2 x i16> [[TMP0]] to <2 x i32> -; PGSO-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 -; PGSO-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 -; PGSO-NEXT: [[TMP3:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP2]] +; PGSO-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 +; PGSO-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 ; PGSO-NEXT: [[TMP5:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP4]] -; PGSO-NEXT: store i16 0, ptr [[TMP3]], align 1 +; PGSO-NEXT: [[TMP7:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP3]] ; PGSO-NEXT: store i16 0, ptr [[TMP5]], align 1 +; PGSO-NEXT: store i16 0, ptr [[TMP7]], align 1 ; PGSO-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 ; PGSO-NEXT: [[VEC_IND_NEXT]] = add <2 x i16> [[VEC_IND]], splat (i16 2) ; PGSO-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 756 @@ -303,22 +307,24 @@ define void @pr43371() optsize { ; PGSO-NEXT: unreachable ; ; NPGSO-LABEL: define void @pr43371( -; NPGSO-SAME: ) #[[ATTR0]] { +; NPGSO-SAME: i16 [[VAL:%.*]]) #[[ATTR0]] { ; NPGSO-NEXT: [[ENTRY:.*:]] ; NPGSO-NEXT: br label %[[VECTOR_PH:.*]] ; NPGSO: [[VECTOR_PH]]: +; NPGSO-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i16> poison, i16 [[VAL]], i64 0 +; NPGSO-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i16> [[BROADCAST_SPLATINSERT]], <2 x i16> poison, <2 x i32> zeroinitializer ; NPGSO-NEXT: br label %[[VECTOR_BODY:.*]] ; NPGSO: [[VECTOR_BODY]]: ; NPGSO-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; NPGSO-NEXT: [[VEC_IND:%.*]] = phi <2 x i16> [ <i16 0, i16 1>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] -; NPGSO-NEXT: [[TMP0:%.*]] = add <2 x i16> undef, [[VEC_IND]] +; NPGSO-NEXT: [[TMP0:%.*]] = add <2 x i16> [[BROADCAST_SPLAT]], [[VEC_IND]] ; NPGSO-NEXT: [[TMP1:%.*]] = zext <2 x i16> [[TMP0]] to <2 x i32> -; NPGSO-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 -; NPGSO-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 -; NPGSO-NEXT: [[TMP3:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP2]] +; NPGSO-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> 
[[TMP1]], i32 0 +; NPGSO-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 ; NPGSO-NEXT: [[TMP5:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP4]] -; NPGSO-NEXT: store i16 0, ptr [[TMP3]], align 1 +; NPGSO-NEXT: [[TMP7:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP3]] ; NPGSO-NEXT: store i16 0, ptr [[TMP5]], align 1 +; NPGSO-NEXT: store i16 0, ptr [[TMP7]], align 1 ; NPGSO-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 ; NPGSO-NEXT: [[VEC_IND_NEXT]] = add <2 x i16> [[VEC_IND]], splat (i16 2) ; NPGSO-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 756 @@ -340,7 +346,7 @@ for.cond.cleanup28: for.body29: %i24.0170 = phi i16 [ 0, %entry], [ %inc37, %for.body29] - %add33 = add i16 undef, %i24.0170 + %add33 = add i16 %val, %i24.0170 %idxprom34 = zext i16 %add33 to i32 %arrayidx35 = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 %idxprom34 store i16 0, ptr %arrayidx35, align 1 @@ -349,25 +355,27 @@ for.body29: br i1 %cmp26, label %for.body29, label %for.cond.cleanup28 } -define void @pr43371_pgso() !prof !14 { +define void @pr43371_pgso(i16 %val) !prof !14 { ; ; CHECK-LABEL: define void @pr43371_pgso( -; CHECK-SAME: ) !prof [[PROF14]] { +; CHECK-SAME: i16 [[VAL:%.*]]) !prof [[PROF14]] { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: br label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i16> poison, i16 [[VAL]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i16> [[BROADCAST_SPLATINSERT]], <2 x i16> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i16> [ <i16 0, i16 1>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = add <2 x i16> undef, [[VEC_IND]] +; CHECK-NEXT: [[TMP0:%.*]] = add <2 x i16> [[BROADCAST_SPLAT]], [[VEC_IND]] ; CHECK-NEXT: [[TMP1:%.*]] = zext <2 x i16> [[TMP0]] to <2 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP4]] -; CHECK-NEXT: store i16 0, ptr [[TMP3]], align 1 +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP3]] ; CHECK-NEXT: store i16 0, ptr [[TMP5]], align 1 +; CHECK-NEXT: store i16 0, ptr [[TMP7]], align 1 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i16> [[VEC_IND]], splat (i16 2) ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 756 @@ -378,22 +386,24 @@ define void @pr43371_pgso() !prof !14 { ; CHECK-NEXT: unreachable ; ; PGSO-LABEL: define void @pr43371_pgso( -; PGSO-SAME: ) !prof [[PROF14]] { +; PGSO-SAME: i16 [[VAL:%.*]]) !prof [[PROF14]] { ; PGSO-NEXT: [[ENTRY:.*:]] ; PGSO-NEXT: br label %[[VECTOR_PH:.*]] ; PGSO: [[VECTOR_PH]]: +; PGSO-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i16> poison, i16 [[VAL]], i64 0 +; PGSO-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i16> [[BROADCAST_SPLATINSERT]], <2 x i16> poison, <2 x i32> zeroinitializer ; PGSO-NEXT: br label 
%[[VECTOR_BODY:.*]] ; PGSO: [[VECTOR_BODY]]: ; PGSO-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; PGSO-NEXT: [[VEC_IND:%.*]] = phi <2 x i16> [ <i16 0, i16 1>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] -; PGSO-NEXT: [[TMP0:%.*]] = add <2 x i16> undef, [[VEC_IND]] +; PGSO-NEXT: [[TMP0:%.*]] = add <2 x i16> [[BROADCAST_SPLAT]], [[VEC_IND]] ; PGSO-NEXT: [[TMP1:%.*]] = zext <2 x i16> [[TMP0]] to <2 x i32> -; PGSO-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 -; PGSO-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 -; PGSO-NEXT: [[TMP3:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP2]] +; PGSO-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 +; PGSO-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 ; PGSO-NEXT: [[TMP5:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP4]] -; PGSO-NEXT: store i16 0, ptr [[TMP3]], align 1 +; PGSO-NEXT: [[TMP7:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP3]] ; PGSO-NEXT: store i16 0, ptr [[TMP5]], align 1 +; PGSO-NEXT: store i16 0, ptr [[TMP7]], align 1 ; PGSO-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 ; PGSO-NEXT: [[VEC_IND_NEXT]] = add <2 x i16> [[VEC_IND]], splat (i16 2) ; PGSO-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 756 @@ -404,17 +414,19 @@ define void @pr43371_pgso() !prof !14 { ; PGSO-NEXT: unreachable ; ; NPGSO-LABEL: define void @pr43371_pgso( -; NPGSO-SAME: ) !prof [[PROF14]] { +; NPGSO-SAME: i16 [[VAL:%.*]]) !prof [[PROF14]] { ; NPGSO-NEXT: [[ENTRY:.*:]] ; NPGSO-NEXT: br label %[[VECTOR_SCEVCHECK:.*]] ; NPGSO: [[VECTOR_SCEVCHECK]]: -; NPGSO-NEXT: br i1 undef, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; NPGSO-NEXT: [[TMP0:%.*]] = add i16 [[VAL]], 755 +; NPGSO-NEXT: [[TMP4:%.*]] = icmp ult i16 [[TMP0]], [[VAL]] +; NPGSO-NEXT: br i1 [[TMP4]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; NPGSO: [[VECTOR_PH]]: ; NPGSO-NEXT: br label %[[VECTOR_BODY:.*]] ; NPGSO: [[VECTOR_BODY]]: ; NPGSO-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; NPGSO-NEXT: [[OFFSET_IDX:%.*]] = trunc i32 [[INDEX]] to i16 -; NPGSO-NEXT: [[TMP1:%.*]] = add i16 undef, [[OFFSET_IDX]] +; NPGSO-NEXT: [[TMP1:%.*]] = add i16 [[VAL]], [[OFFSET_IDX]] ; NPGSO-NEXT: [[TMP2:%.*]] = zext i16 [[TMP1]] to i32 ; NPGSO-NEXT: [[TMP3:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP2]] ; NPGSO-NEXT: store <2 x i16> zeroinitializer, ptr [[TMP3]], align 1 @@ -429,7 +441,7 @@ define void @pr43371_pgso() !prof !14 { ; NPGSO-NEXT: unreachable ; NPGSO: [[FOR_BODY29]]: ; NPGSO-NEXT: [[I24_0170:%.*]] = phi i16 [ 0, %[[SCALAR_PH]] ], [ [[INC37:%.*]], %[[FOR_BODY29]] ] -; NPGSO-NEXT: [[ADD33:%.*]] = add i16 undef, [[I24_0170]] +; NPGSO-NEXT: [[ADD33:%.*]] = add i16 [[VAL]], [[I24_0170]] ; NPGSO-NEXT: [[IDXPROM34:%.*]] = zext i16 [[ADD33]] to i32 ; NPGSO-NEXT: [[ARRAYIDX35:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[IDXPROM34]] ; NPGSO-NEXT: store i16 0, ptr [[ARRAYIDX35]], align 1 @@ -449,7 +461,7 @@ for.cond.cleanup28: for.body29: %i24.0170 = phi i16 [ 0, %entry], [ %inc37, %for.body29] - %add33 = add i16 undef, %i24.0170 + %add33 = add i16 %val, %i24.0170 %idxprom34 = zext i16 %add33 to i32 %arrayidx35 = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 %idxprom34 store i16 0, ptr %arrayidx35, align 1 diff --git a/llvm/test/Transforms/LoopVectorize/pr32859.ll b/llvm/test/Transforms/LoopVectorize/pr32859.ll index 
2d30e0c9ad10f..f65e9cab1700b 100644 --- a/llvm/test/Transforms/LoopVectorize/pr32859.ll +++ b/llvm/test/Transforms/LoopVectorize/pr32859.ll @@ -10,13 +10,13 @@ ; CHECK: %e.0.ph = phi i32 [ 0, %if.end.2.i ], [ 0, %middle.block ] ; Function Attrs: nounwind uwtable -define void @main(i32 %n) #0 { +define void @main(i32 %n, i32 %v) #0 { entry: br label %for.cond1.preheader.i for.cond1.preheader.i: ; preds = %if.end.2.i, %entry %c.06.i = phi i32 [ 0, %entry ], [ %inc5.i, %if.end.2.i ] - %tobool.i = icmp ne i32 undef, 0 + %tobool.i = icmp ne i32 %v, 0 br label %if.end.2.i if.end.2.i: ; preds = %for.cond1.preheader.i diff --git a/llvm/test/Transforms/LoopVectorize/pr36311.ll b/llvm/test/Transforms/LoopVectorize/pr36311.ll index f2dfecc341e6f..bc27e4e9b09cd 100644 --- a/llvm/test/Transforms/LoopVectorize/pr36311.ll +++ b/llvm/test/Transforms/LoopVectorize/pr36311.ll @@ -10,10 +10,7 @@ $test = comdat any -declare i32 @__gxx_personality_v0(...) - -; Function Attrs: uwtable -define dso_local void @test(i1 %arg) local_unnamed_addr #0 comdat align 2 personality ptr @__gxx_personality_v0 { +define void @test(ptr %p, i1 %arg) { entry: br label %for.body51 @@ -26,9 +23,9 @@ for.cond80.loopexit: ; preds = %for.body89 for.body89.lr.ph: ; preds = %for.cond80.loopexit, %for.body51 %i79.0179 = phi i32 [ %add90, %for.cond80.loopexit ], [ 0, %for.body51 ] - %next_index.4178 = phi i32 [ %inc94.lcssa, %for.cond80.loopexit ], [ undef, %for.body51 ] + %next_index.4178 = phi i32 [ %inc94.lcssa, %for.cond80.loopexit ], [ 0, %for.body51 ] %add90 = add nuw i32 %i79.0179, 1 - %mul91 = mul i32 %add90, undef + %mul91 = mul i32 %add90, 7 br label %for.body89 for.body89: ; preds = %for.body89, %for.body89.lr.ph @@ -38,10 +35,10 @@ for.body89: ; preds = %for.body89, %for.bo %add93 = add i32 %add92, %mul91 %inc94 = add i32 %next_index.5174, 1 %conv95 = zext i32 %next_index.5174 to i64 - %arrayidx.i160 = getelementptr inbounds i32, ptr undef, i64 %conv95 + %arrayidx.i160 = getelementptr inbounds i32, ptr %p, i64 %conv95 store i32 %add93, ptr %arrayidx.i160, align 4 ;, !tbaa !1 - %cmp87 = icmp ult i32 %add92, undef + %cmp87 = icmp ult i32 %add92, 123 br i1 %cmp87, label %for.body89, label %for.cond80.loopexit nrvo.skipdtor.loopexit: ; preds = %for.cond80.loopexit diff --git a/llvm/test/Transforms/LoopVectorize/reduction-ptr.ll b/llvm/test/Transforms/LoopVectorize/reduction-ptr.ll index 0656cd2b2aa94..0fdc8fd6ad519 100644 --- a/llvm/test/Transforms/LoopVectorize/reduction-ptr.ll +++ b/llvm/test/Transforms/LoopVectorize/reduction-ptr.ll @@ -15,7 +15,7 @@ define void @PR49215(ptr %p, ptr %q) { ; CHECK-NEXT: [[CMP2:%.*]] = icmp ult ptr [[Q:%.*]], [[G]] ; CHECK-NEXT: [[UMIN]] = select i1 [[CMP2]], ptr [[Q]], ptr [[G]] ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], undef +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 123 ; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOPEXIT:%.*]], label [[FOR_BODY]] ; CHECK: loopexit: ; CHECK-NEXT: [[UMIN_LCSSA:%.*]] = phi ptr [ [[UMIN]], [[FOR_BODY]] ] @@ -31,7 +31,7 @@ for.body: %cmp2 = icmp ult ptr %q, %g %umin = select i1 %cmp2, ptr %q, ptr %g %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, undef + %exitcond = icmp eq i64 %iv.next, 123 br i1 %exitcond, label %loopexit, label %for.body loopexit: diff --git a/llvm/test/Transforms/LoopVectorize/reduction-small-size.ll b/llvm/test/Transforms/LoopVectorize/reduction-small-size.ll index 13cc1b657d231..f01e562fe40c7 100644 --- 
a/llvm/test/Transforms/LoopVectorize/reduction-small-size.ll +++ b/llvm/test/Transforms/LoopVectorize/reduction-small-size.ll @@ -3,7 +3,7 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" -define i8 @PR34687(i1 %c, i32 %x, i32 %n) { +define i8 @PR34687(i1 %c, i32 %x, i32 %n, i32 %divisor) { ; CHECK-LABEL: @PR34687( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N:%.*]], 4 @@ -13,20 +13,30 @@ define i8 @PR34687(i1 %c, i32 %x, i32 %n) { ; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]] ; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> poison, i32 [[X:%.*]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <4 x i1> poison, i1 [[C:%.*]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT2]], <4 x i1> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP0:%.*]] = select <4 x i1> [[BROADCAST_SPLAT3]], <4 x i32> [[BROADCAST_SPLAT2]], <4 x i32> splat (i32 1) +; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <4 x i32> poison, i32 [[X1:%.*]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT3]], <4 x i32> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP6:%.*]] = sdiv <4 x i32> [[VEC_IND]], [[TMP0]] +; CHECK-NEXT: [[PREDPHI1:%.*]] = select <4 x i1> [[BROADCAST_SPLAT3]], <4 x i32> [[TMP6]], <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP1:%.*]] = and <4 x i32> [[VEC_PHI]], splat (i32 255) -; CHECK-NEXT: [[TMP2:%.*]] = add <4 x i32> [[TMP1]], [[BROADCAST_SPLAT2]] +; CHECK-NEXT: [[TMP2:%.*]] = add <4 x i32> [[TMP1]], [[BROADCAST_SPLAT4]] ; CHECK-NEXT: [[TMP3:%.*]] = trunc <4 x i32> [[TMP2]] to <4 x i8> ; CHECK-NEXT: [[TMP4]] = zext <4 x i8> [[TMP3]] to <4 x i32> ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4) ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[TMP7:%.*]] = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> [[TMP3]]) ; CHECK-NEXT: [[TMP8:%.*]] = zext i8 [[TMP7]] to i32 +; CHECK-NEXT: [[PREDPHI:%.*]] = extractelement <4 x i32> [[PREDPHI1]], i32 3 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -36,17 +46,19 @@ define i8 @PR34687(i1 %c, i32 %x, i32 %n) { ; CHECK: for.body: ; CHECK-NEXT: [[I:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[I_NEXT:%.*]], [[IF_END:%.*]] ] ; CHECK-NEXT: [[R:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[R_NEXT:%.*]], [[IF_END]] ] -; CHECK-NEXT: br i1 [[C:%.*]], label [[IF_THEN:%.*]], label [[IF_END]] +; CHECK-NEXT: br i1 [[C]], label [[IF_THEN:%.*]], label [[IF_END]] ; CHECK: if.then: -; CHECK-NEXT: [[T0:%.*]] = sdiv i32 undef, undef +; CHECK-NEXT: [[T0:%.*]] 
= sdiv i32 [[I]], [[X]] ; CHECK-NEXT: br label [[IF_END]] ; CHECK: if.end: +; CHECK-NEXT: [[DIV_PHI:%.*]] = phi i32 [ 0, [[FOR_BODY]] ], [ [[T0]], [[IF_THEN]] ] ; CHECK-NEXT: [[T1:%.*]] = and i32 [[R]], 255 ; CHECK-NEXT: [[I_NEXT]] = add nsw i32 [[I]], 1 -; CHECK-NEXT: [[R_NEXT]] = add nuw nsw i32 [[T1]], [[X]] +; CHECK-NEXT: [[R_NEXT]] = add nuw nsw i32 [[T1]], [[X1]] ; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[I_NEXT]], [[N]] ; CHECK-NEXT: br i1 [[COND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: for.end: +; CHECK-NEXT: [[DIV_USE:%.*]] = phi i32 [ [[DIV_PHI]], [[IF_END]] ], [ [[PREDPHI]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: [[T2:%.*]] = phi i32 [ [[R_NEXT]], [[IF_END]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: [[T3:%.*]] = trunc i32 [[T2]] to i8 ; CHECK-NEXT: ret i8 [[T3]] @@ -60,10 +72,11 @@ for.body: br i1 %c, label %if.then, label %if.end if.then: - %t0 = sdiv i32 undef, undef + %t0 = sdiv i32 %i, %divisor br label %if.end if.end: + %div_phi = phi i32 [ 0, %for.body ], [ %t0, %if.then ] %t1 = and i32 %r, 255 %i.next = add nsw i32 %i, 1 %r.next = add nuw nsw i32 %t1, %x @@ -71,6 +84,7 @@ if.end: br i1 %cond, label %for.end, label %for.body for.end: + %div_use = phi i32 [ %div_phi, %if.end ] %t2 = phi i32 [ %r.next, %if.end ] %t3 = trunc i32 %t2 to i8 ret i8 %t3 diff --git a/llvm/test/Transforms/LoopVectorize/runtime-drop-crash.ll b/llvm/test/Transforms/LoopVectorize/runtime-drop-crash.ll index c76c2c0ef47a2..ab10d62bc6048 100644 --- a/llvm/test/Transforms/LoopVectorize/runtime-drop-crash.ll +++ b/llvm/test/Transforms/LoopVectorize/runtime-drop-crash.ll @@ -12,12 +12,12 @@ entry: loop: %tmp3 = phi i64 [ 0, %entry ], [ %tmp18, %loop ] - %tmp4 = getelementptr inbounds %struct.foo, ptr %ptr, i64 undef + %tmp4 = getelementptr inbounds %struct.foo, ptr %ptr store i64 0, ptr %tmp4, align 8 %tmp8 = add i64 1, %tmp3 %tmp10 = getelementptr inbounds %struct.foo, ptr %ptr, i64 %tmp8 store i64 1, ptr %tmp10, align 8 - %tmp14 = add i64 undef, %tmp3 + %tmp14 = add i64 3, %tmp3 %tmp16 = getelementptr inbounds %struct.foo, ptr %ptr, i64 %tmp14 store i64 2, ptr %tmp16, align 8 %tmp18 = add nuw nsw i64 %tmp3, 4 diff --git a/llvm/test/Transforms/LoopVectorize/undef-inst-bug.ll b/llvm/test/Transforms/LoopVectorize/undef-inst-bug.ll index 1fccf546f4a67..d3cd80beaae90 100644 --- a/llvm/test/Transforms/LoopVectorize/undef-inst-bug.ll +++ b/llvm/test/Transforms/LoopVectorize/undef-inst-bug.ll @@ -14,7 +14,7 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3 ; CHECK-LABEL: @t( ; CHECK: <4 x i32> -define void @t() { +define void @t(ptr %p) { entry: br label %for.body @@ -22,13 +22,13 @@ for.body: %indvars.iv17 = phi i64 [ %indvars.next, %for.body ], [ 128, %entry ] ; Loop invariant anchored in loop. 
- %idxprom21 = zext i32 undef to i64 + %idxprom21 = zext i32 0 to i64 - %arrayidx23 = getelementptr inbounds [100 x [100 x i32]], ptr undef, i64 0, i64 %idxprom21, i64 %indvars.iv17 - store i32 undef, ptr %arrayidx23, align 4 + %arrayidx23 = getelementptr inbounds [100 x [100 x i32]], ptr %p, i64 0, i64 %idxprom21, i64 %indvars.iv17 + store i32 poison, ptr %arrayidx23, align 4 %indvars.next= add i64 %indvars.iv17, -1 %0 = trunc i64 %indvars.next to i32 - %cmp15 = icmp ugt i32 %0, undef + %cmp15 = icmp ugt i32 %0, poison br i1 %cmp15, label %for.body, label %loopexit loopexit: From 0314b939d69597e16e08bb31587fa53c1bea75cb Mon Sep 17 00:00:00 2001 From: Simon Pilgrim <llvm-dev@redking.me.uk> Date: Wed, 5 Nov 2025 09:56:20 +0000 Subject: [PATCH 292/313] [Headers][X86] avx ifma - move constexpr to the end of the function attribute lists. NFC. (#166523) Makes it easier to compare constexpr/non-constexpr attribute defines Allows clang-format to pack the attributes more efficiently --- clang/lib/Headers/avx512ifmaintrin.h | 5 ++--- clang/lib/Headers/avx512ifmavlintrin.h | 13 ++++++------- clang/lib/Headers/avxifmaintrin.h | 8 ++++---- 3 files changed, 12 insertions(+), 14 deletions(-) diff --git a/clang/lib/Headers/avx512ifmaintrin.h b/clang/lib/Headers/avx512ifmaintrin.h index 625a8ff66dc60..f73b607df797f 100644 --- a/clang/lib/Headers/avx512ifmaintrin.h +++ b/clang/lib/Headers/avx512ifmaintrin.h @@ -17,9 +17,8 @@ /* Define the default attributes for the functions in this file. */ #if defined(__cplusplus) && (__cplusplus >= 201103L) #define __DEFAULT_FN_ATTRS \ - constexpr \ - __attribute__((__always_inline__, __nodebug__, __target__("avx512ifma"), \ - __min_vector_width__(512))) + __attribute__((__always_inline__, __nodebug__, __target__("avx512ifma"), \ + __min_vector_width__(512))) constexpr #else #define __DEFAULT_FN_ATTRS \ __attribute__((__always_inline__, __nodebug__, __target__("avx512ifma"), \ diff --git a/clang/lib/Headers/avx512ifmavlintrin.h b/clang/lib/Headers/avx512ifmavlintrin.h index b377c17166ffb..51d5210e5aa5d 100644 --- a/clang/lib/Headers/avx512ifmavlintrin.h +++ b/clang/lib/Headers/avx512ifmavlintrin.h @@ -18,13 +18,13 @@ /* Define the default attributes for the functions in this file. */ #if defined(__cplusplus) && (__cplusplus >= 201103L) #define __DEFAULT_FN_ATTRS128 \ - constexpr __attribute__((__always_inline__, __nodebug__, \ - __target__("avx512ifma,avx512vl"), \ - __min_vector_width__(128))) + __attribute__((__always_inline__, __nodebug__, \ + __target__("avx512ifma,avx512vl"), \ + __min_vector_width__(128))) constexpr #define __DEFAULT_FN_ATTRS256 \ - constexpr __attribute__((__always_inline__, __nodebug__, \ - __target__("avx512ifma,avx512vl"), \ - __min_vector_width__(256))) + __attribute__((__always_inline__, __nodebug__, \ + __target__("avx512ifma,avx512vl"), \ + __min_vector_width__(256))) constexpr #else #define __DEFAULT_FN_ATTRS128 \ __attribute__((__always_inline__, __nodebug__, \ @@ -34,7 +34,6 @@ __attribute__((__always_inline__, __nodebug__, \ __target__("avx512ifma,avx512vl"), \ __min_vector_width__(256))) - #endif #if !(defined(__AVXIFMA__) || defined(__AVX512IFMA__)) diff --git a/clang/lib/Headers/avxifmaintrin.h b/clang/lib/Headers/avxifmaintrin.h index e452d5f0920e9..30df01caed6cf 100644 --- a/clang/lib/Headers/avxifmaintrin.h +++ b/clang/lib/Headers/avxifmaintrin.h @@ -17,11 +17,11 @@ /* Define the default attributes for the functions in this file. 
*/ #if defined(__cplusplus) && (__cplusplus >= 201103L) #define __DEFAULT_FN_ATTRS128 \ - constexpr __attribute__((__always_inline__, __nodebug__, \ - __target__("avxifma"), __min_vector_width__(128))) + __attribute__((__always_inline__, __nodebug__, __target__("avxifma"), \ + __min_vector_width__(128))) constexpr #define __DEFAULT_FN_ATTRS256 \ - constexpr __attribute__((__always_inline__, __nodebug__, \ - __target__("avxifma"), __min_vector_width__(256))) + __attribute__((__always_inline__, __nodebug__, __target__("avxifma"), \ + __min_vector_width__(256))) constexpr #else #define __DEFAULT_FN_ATTRS128 \ __attribute__((__always_inline__, __nodebug__, __target__("avxifma"), \ From cc9ad9afc6d22f9955a35aa8f62f2bbf11109673 Mon Sep 17 00:00:00 2001 From: NagaChaitanya Vellanki <pnagato@protonmail.com> Date: Wed, 5 Nov 2025 01:58:30 -0800 Subject: [PATCH 293/313] [Clang] Add constexpr support for AVX512 permutex2 intrinsics (#165085) This patch enables compile-time evaluation of AVX512 permutex2var intrinsics in constexpr contexts. Extend shuffle generic to handle both integer immediate and vector mask operands. Resolves #161335 --- clang/include/clang/Basic/BuiltinsX86.td | 61 ++----- clang/lib/AST/ByteCode/InterpBuiltin.cpp | 88 +++++++++- clang/lib/AST/ExprConstant.cpp | 111 ++++++++++++- clang/lib/Headers/avx10_2_512bf16intrin.h | 9 +- clang/lib/Headers/avx10_2bf16intrin.h | 15 +- clang/lib/Headers/avx512bwintrin.h | 20 +-- clang/lib/Headers/avx512fintrin.h | 85 ++++------ clang/lib/Headers/avx512fp16intrin.h | 4 +- clang/lib/Headers/avx512vbmiintrin.h | 48 +++--- clang/lib/Headers/avx512vbmivlintrin.h | 86 +++++----- clang/lib/Headers/avx512vlbwintrin.h | 44 ++--- clang/lib/Headers/avx512vlfp16intrin.h | 8 +- clang/lib/Headers/avx512vlintrin.h | 77 ++++----- clang/test/CodeGen/X86/avx512bw-builtins.c | 153 +++++++++++++++++ clang/test/CodeGen/X86/avx512f-builtins.c | 120 ++++++++++++++ clang/test/CodeGen/X86/avx512vbmi-builtins.c | 154 ++++++++++++++++++ clang/test/CodeGen/X86/avx512vbmivl-builtin.c | 70 +++++++- clang/test/CodeGen/X86/avx512vl-builtins.c | 128 +++++++++++++++ clang/test/CodeGen/X86/avx512vlbw-builtins.c | 80 +++++++++ 19 files changed, 1096 insertions(+), 265 deletions(-) diff --git a/clang/include/clang/Basic/BuiltinsX86.td b/clang/include/clang/Basic/BuiltinsX86.td index 9e877b92eac68..4388c09423a21 100644 --- a/clang/include/clang/Basic/BuiltinsX86.td +++ b/clang/include/clang/Basic/BuiltinsX86.td @@ -1765,75 +1765,48 @@ let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in { def scattersiv8si : X86Builtin<"void(void *, unsigned char, _Vector<8, int>, _Vector<8, int>, _Constant int)">; } -let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { +let Features = "avx512vl", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in { def vpermi2vard128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">; -} - -let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { - def vpermi2vard256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>)">; -} - -let Features = "avx512f", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { - def vpermi2vard512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>)">; -} - -let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { - def vpermi2varpd128 : X86Builtin<"_Vector<2, double>(_Vector<2, double>, 
diff --git a/clang/include/clang/Basic/BuiltinsX86.td b/clang/include/clang/Basic/BuiltinsX86.td
index 9e877b92eac68..4388c09423a21 100644
--- a/clang/include/clang/Basic/BuiltinsX86.td
+++ b/clang/include/clang/Basic/BuiltinsX86.td
@@ -1765,75 +1765,48 @@ let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
   def scattersiv8si : X86Builtin<"void(void *, unsigned char, _Vector<8, int>, _Vector<8, int>, _Constant int)">;
 }
 
-let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+let Features = "avx512vl", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in {
   def vpermi2vard128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">;
-}
-
-let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
-  def vpermi2vard256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>)">;
-}
-
-let Features = "avx512f", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
-  def vpermi2vard512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>)">;
-}
-
-let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
-  def vpermi2varpd128 : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, long long int>, _Vector<2, double>)">;
-}
-
-let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
-  def vpermi2varpd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, long long int>, _Vector<4, double>)">;
-}
-
-let Features = "avx512f", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
-  def vpermi2varpd512 : X86Builtin<"_Vector<8, double>(_Vector<8, double>, _Vector<8, long long int>, _Vector<8, double>)">;
-}
-
-let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
-  def vpermi2varps128 : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, int>, _Vector<4, float>)">;
-}
-
-let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
-  def vpermi2varps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, int>, _Vector<8, float>)">;
-}
-
-let Features = "avx512f", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
-  def vpermi2varps512 : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<16, int>, _Vector<16, float>)">;
-}
-
-let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
   def vpermi2varq128 : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>, _Vector<2, long long int>)">;
+  def vpermi2varps128 : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, int>, _Vector<4, float>)">;
+  def vpermi2varpd128 : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, long long int>, _Vector<2, double>)">;
 }
 
-let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+let Features = "avx512vl", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in {
+  def vpermi2vard256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>)">;
   def vpermi2varq256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>, _Vector<4, long long int>)">;
+  def vpermi2varps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, int>, _Vector<8, float>)">;
+  def vpermi2varpd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, long long int>, _Vector<4, double>)">;
 }
 
-let Features = "avx512f", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+let Features = "avx512f", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in {
+  def vpermi2vard512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>)">;
   def vpermi2varq512 : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, _Vector<8, long long int>, _Vector<8, long long int>)">;
+  def vpermi2varps512 : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<16, int>, _Vector<16, float>)">;
+  def vpermi2varpd512 : X86Builtin<"_Vector<8, double>(_Vector<8, double>, _Vector<8, long long int>, _Vector<8, double>)">;
 }
 
-let Features = "avx512vbmi,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+let Features = "avx512vbmi,avx512vl", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in {
   def vpermi2varqi128 : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Vector<16, char>, _Vector<16, char>)">;
 }
 
-let Features = "avx512vbmi,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+let Features = "avx512vbmi,avx512vl", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in {
   def vpermi2varqi256 : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Vector<32, char>, _Vector<32, char>)">;
 }
 
-let Features = "avx512vbmi", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+let Features = "avx512vbmi", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in {
   def vpermi2varqi512 : X86Builtin<"_Vector<64, char>(_Vector<64, char>, _Vector<64, char>, _Vector<64, char>)">;
 }
 
-let Features = "avx512vl,avx512bw", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+let Features = "avx512vl,avx512bw", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in {
   def vpermi2varhi128 : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>, _Vector<8, short>)">;
 }
 
-let Features = "avx512vl,avx512bw", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+let Features = "avx512vl,avx512bw", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in {
   def vpermi2varhi256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>, _Vector<16, short>)">;
 }
 
-let Features = "avx512bw", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+let Features = "avx512bw", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in {
   def vpermi2varhi512 : X86Builtin<"_Vector<32, short>(_Vector<32, short>, _Vector<32, short>, _Vector<32, short>)">;
 }
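[The per-width decode rule that the builtin table above now marks Constexpr, and that both constant evaluators below implement as lambdas, can be summarized in one hedged helper; the function is illustrative and not part of the patch.]

// For a vector of NumElts = 2^LogN lanes: the low LogN bits of a lane's
// index pick the element, and the next bit picks the source operand.
static inline void decode_vpermi2var_index(unsigned Idx, unsigned LogN,
                                           unsigned *SrcVec, unsigned *Elem) {
  *Elem = Idx & ((1u << LogN) - 1); // offset within the chosen source
  *SrcVec = (Idx >> LogN) & 0x1;    // 0 selects A, 1 selects B
}

[For example, LogN is 2 for the 4-lane 128-bit dword variant (mask 0x3, shift 2) and 6 for the 64-lane 512-bit byte variant (mask 0x3F, shift 6), matching the lambdas below.]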
diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
index 8b57b963c538f..9991e365addb8 100644
--- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp
+++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
@@ -3415,18 +3415,46 @@ static bool interp__builtin_ia32_shuffle_generic(
         GetSourceIndex) {
   assert(Call->getNumArgs() == 3);
 
-  unsigned ShuffleMask = popToAPSInt(S, Call->getArg(2)).getZExtValue();
+
+  unsigned ShuffleMask = 0;
+  Pointer A, MaskVector, B;
+
+  QualType Arg2Type = Call->getArg(2)->getType();
+  bool IsVectorMask = false;
+  if (Arg2Type->isVectorType()) {
+    IsVectorMask = true;
+    B = S.Stk.pop<Pointer>();
+    MaskVector = S.Stk.pop<Pointer>();
+    A = S.Stk.pop<Pointer>();
+  } else if (Arg2Type->isIntegerType()) {
+    ShuffleMask = popToAPSInt(S, Call->getArg(2)).getZExtValue();
+    B = S.Stk.pop<Pointer>();
+    A = S.Stk.pop<Pointer>();
+  } else {
+    return false;
+  }
 
   QualType Arg0Type = Call->getArg(0)->getType();
   const auto *VecT = Arg0Type->castAs<VectorType>();
   PrimType ElemT = *S.getContext().classify(VecT->getElementType());
   unsigned NumElems = VecT->getNumElements();
-  const Pointer &B = S.Stk.pop<Pointer>();
-  const Pointer &A = S.Stk.pop<Pointer>();
   const Pointer &Dst = S.Stk.peek<Pointer>();
 
+  PrimType MaskElemT = PT_Uint32;
+  if (IsVectorMask) {
+    QualType Arg1Type = Call->getArg(1)->getType();
+    const auto *MaskVecT = Arg1Type->castAs<VectorType>();
+    QualType MaskElemType = MaskVecT->getElementType();
+    MaskElemT = *S.getContext().classify(MaskElemType);
+  }
+
   for (unsigned DstIdx = 0; DstIdx != NumElems; ++DstIdx) {
+    if (IsVectorMask) {
+      INT_TYPE_SWITCH(MaskElemT, {
+        ShuffleMask = static_cast<unsigned>(MaskVector.elem<T>(DstIdx));
+      });
+    }
     auto [SrcVecIdx, SrcIdx] = GetSourceIndex(DstIdx, ShuffleMask);
 
     if (SrcIdx < 0) {
@@ -4434,6 +4462,60 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call,
           return std::pair<unsigned, int>{0, static_cast<int>(DstIdx)};
         }
       });
+  case X86::BI__builtin_ia32_vpermi2varq128:
+  case X86::BI__builtin_ia32_vpermi2varpd128:
+    return interp__builtin_ia32_shuffle_generic(
+        S, OpPC, Call, [](unsigned DstIdx, unsigned ShuffleMask) {
+          int Offset = ShuffleMask & 0x1;
+          unsigned SrcIdx = (ShuffleMask >> 1) & 0x1;
+          return std::pair<unsigned, int>{SrcIdx, Offset};
+        });
+  case X86::BI__builtin_ia32_vpermi2vard128:
+  case X86::BI__builtin_ia32_vpermi2varps128:
+  case X86::BI__builtin_ia32_vpermi2varq256:
+  case X86::BI__builtin_ia32_vpermi2varpd256:
+    return interp__builtin_ia32_shuffle_generic(
+        S, OpPC, Call, [](unsigned DstIdx, unsigned ShuffleMask) {
+          int Offset = ShuffleMask & 0x3;
+          unsigned SrcIdx = (ShuffleMask >> 2) & 0x1;
+          return std::pair<unsigned, int>{SrcIdx, Offset};
+        });
+  case X86::BI__builtin_ia32_vpermi2varhi128:
+  case X86::BI__builtin_ia32_vpermi2vard256:
+  case X86::BI__builtin_ia32_vpermi2varps256:
+  case X86::BI__builtin_ia32_vpermi2varq512:
+  case X86::BI__builtin_ia32_vpermi2varpd512:
+    return interp__builtin_ia32_shuffle_generic(
+        S, OpPC, Call, [](unsigned DstIdx, unsigned ShuffleMask) {
+          int Offset = ShuffleMask & 0x7;
+          unsigned SrcIdx = (ShuffleMask >> 3) & 0x1;
+          return std::pair<unsigned, int>{SrcIdx, Offset};
+        });
+  case X86::BI__builtin_ia32_vpermi2varqi128:
+  case X86::BI__builtin_ia32_vpermi2varhi256:
+  case X86::BI__builtin_ia32_vpermi2vard512:
+  case X86::BI__builtin_ia32_vpermi2varps512:
+    return interp__builtin_ia32_shuffle_generic(
+        S, OpPC, Call, [](unsigned DstIdx, unsigned ShuffleMask) {
+          int Offset = ShuffleMask & 0xF;
+          unsigned SrcIdx = (ShuffleMask >> 4) & 0x1;
+          return std::pair<unsigned, int>{SrcIdx, Offset};
+        });
+  case X86::BI__builtin_ia32_vpermi2varqi256:
+  case X86::BI__builtin_ia32_vpermi2varhi512:
+    return interp__builtin_ia32_shuffle_generic(
+        S, OpPC, Call, [](unsigned DstIdx, unsigned ShuffleMask) {
+          int Offset = ShuffleMask & 0x1F;
+          unsigned SrcIdx = (ShuffleMask >> 5) & 0x1;
+          return std::pair<unsigned, int>{SrcIdx, Offset};
+        });
+  case X86::BI__builtin_ia32_vpermi2varqi512:
+    return interp__builtin_ia32_shuffle_generic(
+        S, OpPC, Call, [](unsigned DstIdx, unsigned ShuffleMask) {
+          int Offset = ShuffleMask & 0x3F;
+          unsigned SrcIdx = (ShuffleMask >> 6) & 0x1;
+          return std::pair<unsigned, int>{SrcIdx, Offset};
+        });
   case X86::BI__builtin_ia32_pshufb128:
   case X86::BI__builtin_ia32_pshufb256:
   case X86::BI__builtin_ia32_pshufb512:
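[A hand-worked check of the case lambdas above, with values chosen here for illustration; the same decode recurs in the ExprConstant.cpp evaluator next.]

// vpermi2varqi128 has 16 lanes, so Offset = Idx & 0xF, SrcIdx = (Idx >> 4) & 1:
//   Idx 0x00 -> A[0]     Idx 0x0F -> A[15]
//   Idx 0x10 -> B[0]     Idx 0x1F -> B[15]
//   Idx 0x23 -> A[3], since (0x23 >> 4) & 1 == 0 and 0x23 & 0xF == 3
//   (bits above the source-select bit are ignored).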
diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp
index 97eeba8b9d6cc..8fab6efafb983 100644
--- a/clang/lib/AST/ExprConstant.cpp
+++ b/clang/lib/AST/ExprConstant.cpp
@@ -11628,21 +11628,38 @@ static bool evalShuffleGeneric(
   if (!VT)
     return false;
 
-  APSInt MaskImm;
-  if (!EvaluateInteger(Call->getArg(2), MaskImm, Info))
-    return false;
-  unsigned ShuffleMask = static_cast<unsigned>(MaskImm.getZExtValue());
+  unsigned ShuffleMask = 0;
+  APValue A, MaskVector, B;
+  bool IsVectorMask = false;
 
-  APValue A, B;
-  if (!EvaluateAsRValue(Info, Call->getArg(0), A) ||
-      !EvaluateAsRValue(Info, Call->getArg(1), B))
+  QualType Arg2Type = Call->getArg(2)->getType();
+  if (Arg2Type->isVectorType()) {
+    IsVectorMask = true;
+    if (!EvaluateAsRValue(Info, Call->getArg(0), A) ||
+        !EvaluateAsRValue(Info, Call->getArg(1), MaskVector) ||
+        !EvaluateAsRValue(Info, Call->getArg(2), B))
+      return false;
+  } else if (Arg2Type->isIntegerType()) {
+    APSInt MaskImm;
+    if (!EvaluateInteger(Call->getArg(2), MaskImm, Info))
+      return false;
+    ShuffleMask = static_cast<unsigned>(MaskImm.getZExtValue());
+    if (!EvaluateAsRValue(Info, Call->getArg(0), A) ||
+        !EvaluateAsRValue(Info, Call->getArg(1), B))
+      return false;
+  } else {
     return false;
+  }
 
   unsigned NumElts = VT->getNumElements();
-  SmallVector<APValue, 16> ResultElements;
+  SmallVector<APValue, 64> ResultElements;
   ResultElements.reserve(NumElts);
 
   for (unsigned DstIdx = 0; DstIdx != NumElts; ++DstIdx) {
+    if (IsVectorMask) {
+      ShuffleMask = static_cast<unsigned>(
+          MaskVector.getVectorElt(DstIdx).getInt().getZExtValue());
+    }
     auto [SrcVecIdx, SrcIdx] = GetSourceIndex(DstIdx, ShuffleMask);
 
     if (SrcIdx < 0) {
@@ -13080,6 +13097,84 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {
     return Success(APValue(ResultElements.data(), ResultElements.size()), E);
   }
+  case X86::BI__builtin_ia32_vpermi2varq128:
+  case X86::BI__builtin_ia32_vpermi2varpd128: {
+    APValue R;
+    if (!evalShuffleGeneric(Info, E, R,
+                            [](unsigned DstIdx, unsigned ShuffleMask) {
+                              int Offset = ShuffleMask & 0x1;
+                              unsigned SrcIdx = (ShuffleMask >> 1) & 0x1;
+                              return std::pair<unsigned, int>{SrcIdx, Offset};
+                            }))
+      return false;
+    return Success(R, E);
+  }
+  case X86::BI__builtin_ia32_vpermi2vard128:
+  case X86::BI__builtin_ia32_vpermi2varps128:
+  case X86::BI__builtin_ia32_vpermi2varq256:
+  case X86::BI__builtin_ia32_vpermi2varpd256: {
+    APValue R;
+    if (!evalShuffleGeneric(Info, E, R,
+                            [](unsigned DstIdx, unsigned ShuffleMask) {
+                              int Offset = ShuffleMask & 0x3;
+                              unsigned SrcIdx = (ShuffleMask >> 2) & 0x1;
+                              return std::pair<unsigned, int>{SrcIdx, Offset};
+                            }))
+      return false;
+    return Success(R, E);
+  }
+  case X86::BI__builtin_ia32_vpermi2varhi128:
+  case X86::BI__builtin_ia32_vpermi2vard256:
+  case X86::BI__builtin_ia32_vpermi2varps256:
+  case X86::BI__builtin_ia32_vpermi2varq512:
+  case X86::BI__builtin_ia32_vpermi2varpd512: {
+    APValue R;
+    if (!evalShuffleGeneric(Info, E, R,
+                            [](unsigned DstIdx, unsigned ShuffleMask) {
+                              int Offset = ShuffleMask & 0x7;
+                              unsigned SrcIdx = (ShuffleMask >> 3) & 0x1;
+                              return std::pair<unsigned, int>{SrcIdx, Offset};
+                            }))
+      return false;
+    return Success(R, E);
+  }
+  case X86::BI__builtin_ia32_vpermi2varqi128:
+  case X86::BI__builtin_ia32_vpermi2varhi256:
+  case X86::BI__builtin_ia32_vpermi2vard512:
+  case X86::BI__builtin_ia32_vpermi2varps512: {
+    APValue R;
+    if (!evalShuffleGeneric(Info, E, R,
+                            [](unsigned DstIdx, unsigned ShuffleMask) {
+                              int Offset = ShuffleMask & 0xF;
+                              unsigned SrcIdx = (ShuffleMask >> 4) & 0x1;
+                              return std::pair<unsigned, int>{SrcIdx, Offset};
+                            }))
+      return false;
+    return Success(R, E);
+  }
+  case X86::BI__builtin_ia32_vpermi2varqi256:
+  case X86::BI__builtin_ia32_vpermi2varhi512: {
+    APValue R;
+    if (!evalShuffleGeneric(Info, E, R,
+                            [](unsigned DstIdx, unsigned ShuffleMask) {
+                              int Offset = ShuffleMask & 0x1F;
+                              unsigned SrcIdx = (ShuffleMask >> 5) & 0x1;
+                              return std::pair<unsigned, int>{SrcIdx, Offset};
+                            }))
+      return false;
+    return Success(R, E);
+  }
+  case X86::BI__builtin_ia32_vpermi2varqi512: {
+    APValue R;
+    if (!evalShuffleGeneric(Info, E, R,
+                            [](unsigned DstIdx, unsigned ShuffleMask) {
+                              int Offset = ShuffleMask & 0x3F;
+                              unsigned SrcIdx = (ShuffleMask >> 6) & 0x1;
+                              return std::pair<unsigned, int>{SrcIdx, Offset};
+                            }))
+      return false;
+    return Success(R, E);
+  }
   }
 }
diff --git a/clang/lib/Headers/avx10_2_512bf16intrin.h b/clang/lib/Headers/avx10_2_512bf16intrin.h
index 37ebc4f46a826..46ec12a63ef9c 100644
--- a/clang/lib/Headers/avx10_2_512bf16intrin.h
+++ b/clang/lib/Headers/avx10_2_512bf16intrin.h
@@ -24,6 +24,12 @@ typedef __bf16 __m512bh_u __attribute__((__vector_size__(64), __aligned__(1)));
   __attribute__((__always_inline__, __nodebug__, __target__("avx10.2"), \
                  __min_vector_width__(512)))
 
+#if defined(__cplusplus) && (__cplusplus >= 201103L)
+#define __DEFAULT_FN_ATTRS512_CONSTEXPR __DEFAULT_FN_ATTRS512 constexpr
+#else
+#define __DEFAULT_FN_ATTRS512_CONSTEXPR __DEFAULT_FN_ATTRS512
+#endif
+
 static __inline __m512bh __DEFAULT_FN_ATTRS512 _mm512_setzero_pbh(void) {
   return __builtin_bit_cast(__m512bh, _mm512_setzero_ps());
 }
@@ -167,7 +173,7 @@ _mm512_mask_blend_pbh(__mmask32 __U, __m512bh __A, __m512bh __W) {
                                              (__v32bf)__A);
 }
 
-static __inline__ __m512bh __DEFAULT_FN_ATTRS512
+static __inline__ __m512bh __DEFAULT_FN_ATTRS512_CONSTEXPR
 _mm512_permutex2var_pbh(__m512bh __A, __m512i __I, __m512bh __B) {
   return (__m512bh)__builtin_ia32_vpermi2varhi512((__v32hi)__A, (__v32hi)__I,
                                                   (__v32hi)__B);
@@ -555,6 +561,7 @@ static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_maskz_fnmsub_pbh(
     (__v32bf)_mm512_setzero_pbh());
 }
 
+#undef __DEFAULT_FN_ATTRS512_CONSTEXPR
 #undef __DEFAULT_FN_ATTRS512
 
 #endif

diff --git a/clang/lib/Headers/avx10_2bf16intrin.h b/clang/lib/Headers/avx10_2bf16intrin.h
index 765cd682986b4..8fb8cd7cd0865 100644
--- a/clang/lib/Headers/avx10_2bf16intrin.h
+++ b/clang/lib/Headers/avx10_2bf16intrin.h
@@ -27,6 +27,14 @@ typedef __bf16 __m256bh_u __attribute__((__vector_size__(32), __aligned__(1)));
   __attribute__((__always_inline__, __nodebug__, __target__("avx10.2"), \
                  __min_vector_width__(128)))
 
+#if defined(__cplusplus) && (__cplusplus >= 201103L)
+#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128 constexpr
+#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256 constexpr
+#else
+#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128
+#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256
+#endif
+
 static __inline __m256bh __DEFAULT_FN_ATTRS256 _mm256_setzero_pbh(void) {
   return __builtin_bit_cast(__m256bh, _mm256_setzero_ps());
 }
@@ -287,13 +295,13 @@ _mm256_mask_blend_pbh(__mmask16 __U, __m256bh __A, __m256bh __W) {
                                              (__v16bf)__A);
 }
 
-static __inline__ __m128bh __DEFAULT_FN_ATTRS128
+static __inline__ __m128bh __DEFAULT_FN_ATTRS128_CONSTEXPR
 _mm_permutex2var_pbh(__m128bh __A, __m128i __I, __m128bh __B) {
   return (__m128bh)__builtin_ia32_vpermi2varhi128((__v8hi)__A, (__v8hi)__I,
                                                   (__v8hi)__B);
 }
 
-static __inline__ __m256bh __DEFAULT_FN_ATTRS256
+static __inline__ __m256bh __DEFAULT_FN_ATTRS256_CONSTEXPR
 _mm256_permutex2var_pbh(__m256bh __A, __m256i __I, __m256bh __B) {
   return (__m256bh)__builtin_ia32_vpermi2varhi256((__v16hi)__A, (__v16hi)__I,
                                                   (__v16hi)__B);
@@ -1080,6 +1088,7 @@ _mm_maskz_fnmsub_pbh(__mmask8 __U, __m128bh __A, __m128bh __B, __m128bh __C) {
 
 #undef __DEFAULT_FN_ATTRS128
 #undef __DEFAULT_FN_ATTRS256
-
+#undef __DEFAULT_FN_ATTRS128_CONSTEXPR
+#undef __DEFAULT_FN_ATTRS256_CONSTEXPR
 #endif
 #endif
diff --git a/clang/lib/Headers/avx512bwintrin.h b/clang/lib/Headers/avx512bwintrin.h
index ac75b6ccde735..aab1f2b61ab8a 100644
--- a/clang/lib/Headers/avx512bwintrin.h
+++ b/clang/lib/Headers/avx512bwintrin.h
@@ -969,35 +969,31 @@ _mm512_maskz_subs_epu16 (__mmask32 __U, __m512i __A, __m512i __B)
                                          (__v32hi)_mm512_setzero_si512());
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_permutex2var_epi16(__m512i __A, __m512i __I, __m512i __B)
-{
+static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
+_mm512_permutex2var_epi16(__m512i __A, __m512i __I, __m512i __B) {
   return (__m512i)__builtin_ia32_vpermi2varhi512((__v32hi)__A, (__v32hi)__I,
                                                  (__v32hi)__B);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
+static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
 _mm512_mask_permutex2var_epi16(__m512i __A, __mmask32 __U, __m512i __I,
-                               __m512i __B)
-{
+                               __m512i __B) {
   return (__m512i)__builtin_ia32_selectw_512(__U,
                               (__v32hi)_mm512_permutex2var_epi16(__A, __I, __B),
                               (__v32hi)__A);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
+static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
 _mm512_mask2_permutex2var_epi16(__m512i __A, __m512i __I, __mmask32 __U,
-                                __m512i __B)
-{
+                                __m512i __B) {
   return (__m512i)__builtin_ia32_selectw_512(__U,
                               (__v32hi)_mm512_permutex2var_epi16(__A, __I, __B),
                               (__v32hi)__I);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
+static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
 _mm512_maskz_permutex2var_epi16(__mmask32 __U, __m512i __A, __m512i __I,
-                                __m512i __B)
-{
+                                __m512i __B) {
   return (__m512i)__builtin_ia32_selectw_512(__U,
                               (__v32hi)_mm512_permutex2var_epi16(__A, __I, __B),
                               (__v32hi)_mm512_setzero_si512());
diff --git a/clang/lib/Headers/avx512fintrin.h b/clang/lib/Headers/avx512fintrin.h
index 18c4a44a4c76e..5fc0afa49ce4c 100644
--- a/clang/lib/Headers/avx512fintrin.h
+++ b/clang/lib/Headers/avx512fintrin.h
@@ -3059,69 +3059,61 @@ _mm512_mask3_fmsubadd_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U)
 
 /* Vector permutations */
 
-static __inline __m512i __DEFAULT_FN_ATTRS512
-_mm512_permutex2var_epi32(__m512i __A, __m512i __I, __m512i __B)
-{
+static __inline __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
+_mm512_permutex2var_epi32(__m512i __A, __m512i __I, __m512i __B) {
   return (__m512i)__builtin_ia32_vpermi2vard512((__v16si)__A, (__v16si) __I,
                                                 (__v16si) __B);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
+static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
 _mm512_mask_permutex2var_epi32(__m512i __A, __mmask16 __U, __m512i __I,
-                               __m512i __B)
-{
+                               __m512i __B) {
   return (__m512i)__builtin_ia32_selectd_512(__U,
                               (__v16si)_mm512_permutex2var_epi32(__A, __I, __B),
                               (__v16si)__A);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
+static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
 _mm512_mask2_permutex2var_epi32(__m512i __A, __m512i __I, __mmask16 __U,
-                                __m512i __B)
-{
+                                __m512i __B) {
   return (__m512i)__builtin_ia32_selectd_512(__U,
                               (__v16si)_mm512_permutex2var_epi32(__A, __I, __B),
                               (__v16si)__I);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
+static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
 _mm512_maskz_permutex2var_epi32(__mmask16 __U, __m512i __A, __m512i __I,
-                                __m512i __B)
-{
+                                __m512i __B) {
   return (__m512i)__builtin_ia32_selectd_512(__U,
                               (__v16si)_mm512_permutex2var_epi32(__A, __I, __B),
                               (__v16si)_mm512_setzero_si512());
 }
 
-static __inline __m512i __DEFAULT_FN_ATTRS512
-_mm512_permutex2var_epi64(__m512i __A, __m512i __I, __m512i __B)
-{
+static __inline __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
+_mm512_permutex2var_epi64(__m512i __A, __m512i __I, __m512i __B) {
   return (__m512i)__builtin_ia32_vpermi2varq512((__v8di)__A, (__v8di) __I,
                                                 (__v8di) __B);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
+static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
 _mm512_mask_permutex2var_epi64(__m512i __A, __mmask8 __U, __m512i __I,
-                               __m512i __B)
-{
+                               __m512i __B) {
   return (__m512i)__builtin_ia32_selectq_512(__U,
                                (__v8di)_mm512_permutex2var_epi64(__A, __I, __B),
                                (__v8di)__A);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
+static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
 _mm512_mask2_permutex2var_epi64(__m512i __A, __m512i __I, __mmask8 __U,
-                                __m512i __B)
-{
+                                __m512i __B) {
   return (__m512i)__builtin_ia32_selectq_512(__U,
                                (__v8di)_mm512_permutex2var_epi64(__A, __I, __B),
                                (__v8di)__I);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
+static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
 _mm512_maskz_permutex2var_epi64(__mmask8 __U, __m512i __A, __m512i __I,
-                                __m512i __B)
-{
+                                __m512i __B) {
   return (__m512i)__builtin_ia32_selectq_512(__U,
                                (__v8di)_mm512_permutex2var_epi64(__A, __I, __B),
                                (__v8di)_mm512_setzero_si512());
@@ -5949,71 +5941,66 @@ _mm512_maskz_permutevar_ps(__mmask16 __U, __m512 __A, __m512i __C)
                                          (__v16sf)_mm512_setzero_ps());
 }
 
-static __inline __m512d __DEFAULT_FN_ATTRS512
-_mm512_permutex2var_pd(__m512d __A, __m512i __I, __m512d __B)
-{
+static __inline __m512d __DEFAULT_FN_ATTRS512_CONSTEXPR
+_mm512_permutex2var_pd(__m512d __A, __m512i __I, __m512d __B) {
  return (__m512d)__builtin_ia32_vpermi2varpd512((__v8df)__A, (__v8di)__I,
                                                 (__v8df)__B);
 }
 
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_mask_permutex2var_pd(__m512d __A, __mmask8 __U, __m512i __I, __m512d __B)
-{
+static __inline__ __m512d __DEFAULT_FN_ATTRS512_CONSTEXPR
+_mm512_mask_permutex2var_pd(__m512d __A, __mmask8 __U, __m512i __I,
+                            __m512d __B) {
   return (__m512d)__builtin_ia32_selectpd_512(__U,
                                   (__v8df)_mm512_permutex2var_pd(__A, __I, __B),
                                   (__v8df)__A);
 }
 
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
+static __inline__ __m512d __DEFAULT_FN_ATTRS512_CONSTEXPR
 _mm512_mask2_permutex2var_pd(__m512d __A, __m512i __I, __mmask8 __U,
-                             __m512d __B)
-{
+                             __m512d __B) {
   return (__m512d)__builtin_ia32_selectpd_512(__U,
                                   (__v8df)_mm512_permutex2var_pd(__A, __I, __B),
                                   (__v8df)(__m512d)__I);
 }
 
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
+static __inline__ __m512d __DEFAULT_FN_ATTRS512_CONSTEXPR
 _mm512_maskz_permutex2var_pd(__mmask8 __U, __m512d __A, __m512i __I,
-                             __m512d __B)
-{
+                             __m512d __B) {
   return (__m512d)__builtin_ia32_selectpd_512(__U,
                                   (__v8df)_mm512_permutex2var_pd(__A, __I, __B),
                                   (__v8df)_mm512_setzero_pd());
 }
 
-static __inline __m512 __DEFAULT_FN_ATTRS512
-_mm512_permutex2var_ps(__m512 __A, __m512i __I, __m512 __B)
-{
+static __inline __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR
+_mm512_permutex2var_ps(__m512 __A, __m512i __I, __m512 __B) {
   return (__m512)__builtin_ia32_vpermi2varps512((__v16sf)__A, (__v16si)__I,
                                                 (__v16sf) __B);
 }
 
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_mask_permutex2var_ps(__m512 __A, __mmask16 __U, __m512i __I, __m512 __B)
-{
+static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR
+_mm512_mask_permutex2var_ps(__m512 __A, __mmask16 __U, __m512i __I,
+                            __m512 __B) {
   return (__m512)__builtin_ia32_selectps_512(__U,
                                  (__v16sf)_mm512_permutex2var_ps(__A, __I, __B),
                                  (__v16sf)__A);
 }
 
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_mask2_permutex2var_ps(__m512 __A, __m512i __I, __mmask16 __U, __m512 __B)
-{
+static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR
+_mm512_mask2_permutex2var_ps(__m512 __A, __m512i __I, __mmask16 __U,
+                             __m512 __B) {
   return (__m512)__builtin_ia32_selectps_512(__U,
                                  (__v16sf)_mm512_permutex2var_ps(__A, __I, __B),
                                  (__v16sf)(__m512)__I);
 }
 
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_maskz_permutex2var_ps(__mmask16 __U, __m512 __A, __m512i __I, __m512 __B)
-{
+static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR
+_mm512_maskz_permutex2var_ps(__mmask16 __U, __m512 __A, __m512i __I,
+                             __m512 __B) {
   return (__m512)__builtin_ia32_selectps_512(__U,
                                  (__v16sf)_mm512_permutex2var_ps(__A, __I, __B),
                                  (__v16sf)_mm512_setzero_ps());
 }
-
 #define _mm512_cvtt_roundpd_epu32(A, R) \
   ((__m256i)__builtin_ia32_cvttpd2udq512_mask((__v8df)(__m512d)(A), \
                                               (__v8si)_mm256_undefined_si256(), \
diff --git a/clang/lib/Headers/avx512fp16intrin.h b/clang/lib/Headers/avx512fp16intrin.h
index 142cc079c2c4b..25051228f3e0a 100644
--- a/clang/lib/Headers/avx512fp16intrin.h
+++ b/clang/lib/Headers/avx512fp16intrin.h
@@ -3316,13 +3316,13 @@ _mm512_mask_blend_ph(__mmask32 __U, __m512h __A, __m512h __W) {
                                               (__v32hf)__A);
 }
 
-static __inline__ __m512h __DEFAULT_FN_ATTRS512
+static __inline__ __m512h __DEFAULT_FN_ATTRS512_CONSTEXPR
 _mm512_permutex2var_ph(__m512h __A, __m512i __I, __m512h __B) {
   return (__m512h)__builtin_ia32_vpermi2varhi512((__v32hi)__A, (__v32hi)__I,
                                                  (__v32hi)__B);
 }
 
-static __inline__ __m512h __DEFAULT_FN_ATTRS512
+static __inline__ __m512h __DEFAULT_FN_ATTRS512_CONSTEXPR
 _mm512_permutexvar_ph(__m512i __A, __m512h __B) {
   return (__m512h)__builtin_ia32_permvarhi512((__v32hi)__B, (__v32hi)__A);
 }

diff --git a/clang/lib/Headers/avx512vbmiintrin.h b/clang/lib/Headers/avx512vbmiintrin.h
index 964535c4c4900..84fda5c5849e8 100644
--- a/clang/lib/Headers/avx512vbmiintrin.h
+++ b/clang/lib/Headers/avx512vbmiintrin.h
@@ -19,59 +19,57 @@
   __attribute__((__always_inline__, __nodebug__, __target__("avx512vbmi"), \
                  __min_vector_width__(512)))
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_permutex2var_epi8(__m512i __A, __m512i __I, __m512i __B)
-{
+#if defined(__cplusplus) && (__cplusplus >= 201103L)
+#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS constexpr
+#else
+#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS
+#endif
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm512_permutex2var_epi8(__m512i __A, __m512i __I, __m512i __B) {
   return (__m512i)__builtin_ia32_vpermi2varqi512((__v64qi)__A, (__v64qi)__I,
                                                  (__v64qi) __B);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
+static __inline__ __m512i __DEFAULT_FN_ATTRS_CONSTEXPR
 _mm512_mask_permutex2var_epi8(__m512i __A, __mmask64 __U, __m512i __I,
-                              __m512i __B)
-{
+                              __m512i __B) {
   return (__m512i)__builtin_ia32_selectb_512(__U,
                                (__v64qi)_mm512_permutex2var_epi8(__A, __I, __B),
                                (__v64qi)__A);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
+static __inline__ __m512i __DEFAULT_FN_ATTRS_CONSTEXPR
 _mm512_mask2_permutex2var_epi8(__m512i __A, __m512i __I, __mmask64 __U,
-                               __m512i __B)
-{
+                               __m512i __B) {
   return (__m512i)__builtin_ia32_selectb_512(__U,
                                (__v64qi)_mm512_permutex2var_epi8(__A, __I, __B),
                                (__v64qi)__I);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
+static __inline__ __m512i __DEFAULT_FN_ATTRS_CONSTEXPR
 _mm512_maskz_permutex2var_epi8(__mmask64 __U, __m512i __A, __m512i __I,
-                               __m512i __B)
-{
+                               __m512i __B) {
   return (__m512i)__builtin_ia32_selectb_512(__U,
                                (__v64qi)_mm512_permutex2var_epi8(__A, __I, __B),
                                (__v64qi)_mm512_setzero_si512());
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_permutexvar_epi8 (__m512i __A, __m512i __B)
-{
+static __inline__ __m512i __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm512_permutexvar_epi8(__m512i __A, __m512i __B) {
   return (__m512i)__builtin_ia32_permvarqi512((__v64qi) __B, (__v64qi) __A);
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_permutexvar_epi8 (__mmask64 __M, __m512i __A,
-                               __m512i __B)
-{
+static __inline__ __m512i __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm512_maskz_permutexvar_epi8(__mmask64 __M, __m512i __A, __m512i __B) {
   return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M,
                                 (__v64qi)_mm512_permutexvar_epi8(__A, __B),
                                 (__v64qi)_mm512_setzero_si512());
 }
 
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_permutexvar_epi8 (__m512i __W, __mmask64 __M, __m512i __A,
-                              __m512i __B)
-{
+static __inline__ __m512i __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm512_mask_permutexvar_epi8(__m512i __W, __mmask64 __M, __m512i __A,
+                             __m512i __B) {
   return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M,
                                 (__v64qi)_mm512_permutexvar_epi8(__A, __B),
                                 (__v64qi)__W);
@@ -99,8 +97,6 @@ _mm512_maskz_multishift_epi64_epi8(__mmask64 __M, __m512i __X, __m512i __Y)
                                 (__v64qi)_mm512_multishift_epi64_epi8(__X, __Y),
                                 (__v64qi)_mm512_setzero_si512());
 }
-
-
+#undef __DEFAULT_FN_ATTRS_CONSTEXPR
 #undef __DEFAULT_FN_ATTRS
-
 #endif

diff --git a/clang/lib/Headers/avx512vbmivlintrin.h b/clang/lib/Headers/avx512vbmivlintrin.h
index 4c50be7d9e7e5..58a48dadff863 100644
--- a/clang/lib/Headers/avx512vbmivlintrin.h
+++ b/clang/lib/Headers/avx512vbmivlintrin.h
@@ -24,117 +24,110 @@
                  __target__("avx512vbmi,avx512vl"), \
                  __min_vector_width__(256)))
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_permutex2var_epi8(__m128i __A, __m128i __I, __m128i __B)
-{
+#if defined(__cplusplus) && (__cplusplus >= 201103L)
+#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128 constexpr
+#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256 constexpr
+#else
+#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128
+#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256
+#endif
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
+_mm_permutex2var_epi8(__m128i __A, __m128i __I, __m128i __B) {
   return (__m128i)__builtin_ia32_vpermi2varqi128((__v16qi)__A, (__v16qi)__I,
                                                  (__v16qi)__B);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
+static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
 _mm_mask_permutex2var_epi8(__m128i __A, __mmask16 __U, __m128i __I,
-                           __m128i __B)
-{
+                           __m128i __B) {
   return (__m128i)__builtin_ia32_selectb_128(__U,
                                   (__v16qi)_mm_permutex2var_epi8(__A, __I, __B),
                                   (__v16qi)__A);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
+static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
 _mm_mask2_permutex2var_epi8(__m128i __A, __m128i __I, __mmask16 __U,
-                            __m128i __B)
-{
+                            __m128i __B) {
   return (__m128i)__builtin_ia32_selectb_128(__U,
                                   (__v16qi)_mm_permutex2var_epi8(__A, __I, __B),
                                   (__v16qi)__I);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
+static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
 _mm_maskz_permutex2var_epi8(__mmask16 __U, __m128i __A, __m128i __I,
-                            __m128i __B)
-{
+                            __m128i __B) {
   return (__m128i)__builtin_ia32_selectb_128(__U,
                                   (__v16qi)_mm_permutex2var_epi8(__A, __I, __B),
                                   (__v16qi)_mm_setzero_si128());
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_permutex2var_epi8(__m256i __A, __m256i __I, __m256i __B)
-{
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
+_mm256_permutex2var_epi8(__m256i __A, __m256i __I, __m256i __B) {
   return (__m256i)__builtin_ia32_vpermi2varqi256((__v32qi)__A, (__v32qi)__I,
                                                  (__v32qi)__B);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
 _mm256_mask_permutex2var_epi8(__m256i __A, __mmask32 __U, __m256i __I,
-                              __m256i __B)
-{
+                              __m256i __B) {
   return (__m256i)__builtin_ia32_selectb_256(__U,
                                (__v32qi)_mm256_permutex2var_epi8(__A, __I, __B),
                                (__v32qi)__A);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
 _mm256_mask2_permutex2var_epi8(__m256i __A, __m256i __I, __mmask32 __U,
-                               __m256i __B)
-{
+                               __m256i __B) {
   return (__m256i)__builtin_ia32_selectb_256(__U,
                                (__v32qi)_mm256_permutex2var_epi8(__A, __I, __B),
                                (__v32qi)__I);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
 _mm256_maskz_permutex2var_epi8(__mmask32 __U, __m256i __A, __m256i __I,
-                               __m256i __B)
-{
+                               __m256i __B) {
   return (__m256i)__builtin_ia32_selectb_256(__U,
                                (__v32qi)_mm256_permutex2var_epi8(__A, __I, __B),
                                (__v32qi)_mm256_setzero_si256());
 }
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_permutexvar_epi8 (__m128i __A, __m128i __B)
-{
+static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
+_mm_permutexvar_epi8(__m128i __A, __m128i __B) {
   return (__m128i)__builtin_ia32_permvarqi128((__v16qi)__B, (__v16qi)__A);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_permutexvar_epi8 (__mmask16 __M, __m128i __A, __m128i __B)
-{
+static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
+_mm_maskz_permutexvar_epi8(__mmask16 __M, __m128i __A, __m128i __B) {
   return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
                                      (__v16qi)_mm_permutexvar_epi8(__A, __B),
                                      (__v16qi)_mm_setzero_si128());
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_permutexvar_epi8 (__m128i __W, __mmask16 __M, __m128i __A,
-                           __m128i __B)
-{
+static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
+_mm_mask_permutexvar_epi8(__m128i __W, __mmask16 __M, __m128i __A,
+                          __m128i __B) {
   return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
                                      (__v16qi)_mm_permutexvar_epi8(__A, __B),
                                      (__v16qi)__W);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_permutexvar_epi8 (__m256i __A, __m256i __B)
-{
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
+_mm256_permutexvar_epi8(__m256i __A, __m256i __B) {
   return (__m256i)__builtin_ia32_permvarqi256((__v32qi) __B, (__v32qi) __A);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_permutexvar_epi8 (__mmask32 __M, __m256i __A,
-                               __m256i __B)
-{
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
+_mm256_maskz_permutexvar_epi8(__mmask32 __M, __m256i __A, __m256i __B) {
   return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M,
                                     (__v32qi)_mm256_permutexvar_epi8(__A, __B),
                                     (__v32qi)_mm256_setzero_si256());
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_permutexvar_epi8 (__m256i __W, __mmask32 __M, __m256i __A,
-                              __m256i __B)
-{
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
+_mm256_mask_permutexvar_epi8(__m256i __W, __mmask32 __M, __m256i __A,
+                             __m256i __B) {
   return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M,
                                     (__v32qi)_mm256_permutexvar_epi8(__A, __B),
                                     (__v32qi)__W);
@@ -186,7 +179,8 @@ _mm256_maskz_multishift_epi64_epi8(__mmask32 __M, __m256i __X, __m256i __Y)
                                     (__v32qi)_mm256_setzero_si256());
 }
 
-
+#undef __DEFAULT_FN_ATTRS128_CONSTEXPR
+#undef __DEFAULT_FN_ATTRS256_CONSTEXPR
 #undef __DEFAULT_FN_ATTRS128
 #undef __DEFAULT_FN_ATTRS256

diff --git a/clang/lib/Headers/avx512vlbwintrin.h b/clang/lib/Headers/avx512vlbwintrin.h
index 263a1079b26d5..575c0c8962662 100644
--- a/clang/lib/Headers/avx512vlbwintrin.h
+++ b/clang/lib/Headers/avx512vlbwintrin.h
@@ -1223,69 +1223,61 @@ _mm256_maskz_subs_epu16(__mmask16 __U, __m256i __A, __m256i __B)
                                          (__v16hi)_mm256_setzero_si256());
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_permutex2var_epi16(__m128i __A, __m128i __I, __m128i __B)
-{
+static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
+_mm_permutex2var_epi16(__m128i __A, __m128i __I, __m128i __B) {
   return (__m128i)__builtin_ia32_vpermi2varhi128((__v8hi)__A, (__v8hi)__I,
                                                  (__v8hi) __B);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
+static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
 _mm_mask_permutex2var_epi16(__m128i __A, __mmask8 __U, __m128i __I,
-                            __m128i __B)
-{
+                            __m128i __B) {
   return (__m128i)__builtin_ia32_selectw_128(__U,
                                  (__v8hi)_mm_permutex2var_epi16(__A, __I, __B),
                                  (__v8hi)__A);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
+static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
 _mm_mask2_permutex2var_epi16(__m128i __A, __m128i __I, __mmask8 __U,
-                             __m128i __B)
-{
+                             __m128i __B) {
   return (__m128i)__builtin_ia32_selectw_128(__U,
                                  (__v8hi)_mm_permutex2var_epi16(__A, __I, __B),
                                  (__v8hi)__I);
 }
 
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_permutex2var_epi16 (__mmask8 __U, __m128i __A, __m128i __I,
-                              __m128i __B)
-{
+static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
+_mm_maskz_permutex2var_epi16(__mmask8 __U, __m128i __A, __m128i __I,
+                             __m128i __B) {
   return (__m128i)__builtin_ia32_selectw_128(__U,
                                  (__v8hi)_mm_permutex2var_epi16(__A, __I, __B),
                                  (__v8hi)_mm_setzero_si128());
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_permutex2var_epi16(__m256i __A, __m256i __I, __m256i __B)
-{
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
+_mm256_permutex2var_epi16(__m256i __A, __m256i __I, __m256i __B) {
   return (__m256i)__builtin_ia32_vpermi2varhi256((__v16hi)__A, (__v16hi)__I,
                                                  (__v16hi)__B);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
 _mm256_mask_permutex2var_epi16(__m256i __A, __mmask16 __U, __m256i __I,
-                               __m256i __B)
-{
+                               __m256i __B) {
   return (__m256i)__builtin_ia32_selectw_256(__U,
                               (__v16hi)_mm256_permutex2var_epi16(__A, __I, __B),
                               (__v16hi)__A);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
 _mm256_mask2_permutex2var_epi16(__m256i __A, __m256i __I, __mmask16 __U,
-                                __m256i __B)
-{
+                                __m256i __B) {
   return (__m256i)__builtin_ia32_selectw_256(__U,
                               (__v16hi)_mm256_permutex2var_epi16(__A, __I, __B),
                               (__v16hi)__I);
 }
 
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_permutex2var_epi16 (__mmask16 __U, __m256i __A, __m256i __I,
-                                 __m256i __B)
-{
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
+_mm256_maskz_permutex2var_epi16(__mmask16 __U, __m256i __A, __m256i __I,
+                                __m256i __B) {
   return (__m256i)__builtin_ia32_selectw_256(__U,
                               (__v16hi)_mm256_permutex2var_epi16(__A, __I, __B),
                               (__v16hi)_mm256_setzero_si256());

diff --git a/clang/lib/Headers/avx512vlfp16intrin.h b/clang/lib/Headers/avx512vlfp16intrin.h
index 5b2b3f0d0bbd4..885231b030b23 100644
--- a/clang/lib/Headers/avx512vlfp16intrin.h
+++ b/clang/lib/Headers/avx512vlfp16intrin.h
@@ -2010,24 +2010,24 @@ _mm256_mask_blend_ph(__mmask16 __U, __m256h __A, __m256h __W) {
                                               (__v16hf)__A);
 }
 
-static __inline__ __m128h __DEFAULT_FN_ATTRS128
+static __inline__ __m128h __DEFAULT_FN_ATTRS128_CONSTEXPR
 _mm_permutex2var_ph(__m128h __A, __m128i __I, __m128h __B) {
   return (__m128h)__builtin_ia32_vpermi2varhi128((__v8hi)__A, (__v8hi)__I,
                                                  (__v8hi)__B);
 }
 
-static __inline__ __m256h __DEFAULT_FN_ATTRS256
+static __inline__ __m256h __DEFAULT_FN_ATTRS256_CONSTEXPR
 _mm256_permutex2var_ph(__m256h __A, __m256i __I, __m256h __B) {
   return (__m256h)__builtin_ia32_vpermi2varhi256((__v16hi)__A, (__v16hi)__I,
                                                  (__v16hi)__B);
 }
 
-static __inline__ __m128h __DEFAULT_FN_ATTRS128
+static __inline__ __m128h __DEFAULT_FN_ATTRS128_CONSTEXPR
 _mm_permutexvar_ph(__m128i __A, __m128h __B) {
   return (__m128h)__builtin_ia32_permvarhi128((__v8hi)__B, (__v8hi)__A);
 }
 
-static __inline__ __m256h __DEFAULT_FN_ATTRS256
+static __inline__ __m256h __DEFAULT_FN_ATTRS256_CONSTEXPR
 _mm256_permutexvar_ph(__m256i __A, __m256h __B) {
   return (__m256h)__builtin_ia32_permvarhi256((__v16hi)__B, (__v16hi)__A);
 }

diff --git a/clang/lib/Headers/avx512vlintrin.h b/clang/lib/Headers/avx512vlintrin.h
index 92bb444aeb5b8..e5249926b934e 100644
--- a/clang/lib/Headers/avx512vlintrin.h
+++
b/clang/lib/Headers/avx512vlintrin.h
@@ -3556,13 +3556,13 @@ _mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) {
                                            (__v8sf)_mm256_setzero_ps());
 }
 
- static __inline__ __m128i __DEFAULT_FN_ATTRS128
+ static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
  _mm_permutex2var_epi32(__m128i __A, __m128i __I, __m128i __B) {
    return (__m128i)__builtin_ia32_vpermi2vard128((__v4si) __A, (__v4si)__I,
                                                  (__v4si)__B);
  }
 
- static __inline__ __m128i __DEFAULT_FN_ATTRS128
+ static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
  _mm_mask_permutex2var_epi32(__m128i __A, __mmask8 __U, __m128i __I,
                              __m128i __B) {
    return (__m128i)__builtin_ia32_selectd_128(__U,
@@ -3570,7 +3570,7 @@ _mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) {
                                  (__v4si)__A);
  }
 
- static __inline__ __m128i __DEFAULT_FN_ATTRS128
+ static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
  _mm_mask2_permutex2var_epi32(__m128i __A, __m128i __I, __mmask8 __U,
                               __m128i __B) {
    return (__m128i)__builtin_ia32_selectd_128(__U,
@@ -3578,7 +3578,7 @@ _mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) {
                                  (__v4si)__I);
  }
 
- static __inline__ __m128i __DEFAULT_FN_ATTRS128
+ static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
  _mm_maskz_permutex2var_epi32(__mmask8 __U, __m128i __A, __m128i __I,
                               __m128i __B) {
    return (__m128i)__builtin_ia32_selectd_128(__U,
@@ -3586,13 +3586,13 @@ _mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) {
                                  (__v4si)_mm_setzero_si128());
  }
 
- static __inline__ __m256i __DEFAULT_FN_ATTRS256
+ static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
  _mm256_permutex2var_epi32(__m256i __A, __m256i __I, __m256i __B) {
    return (__m256i)__builtin_ia32_vpermi2vard256((__v8si)__A, (__v8si) __I,
                                                  (__v8si) __B);
  }
 
- static __inline__ __m256i __DEFAULT_FN_ATTRS256
+ static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
  _mm256_mask_permutex2var_epi32(__m256i __A, __mmask8 __U, __m256i __I,
                                 __m256i __B) {
    return (__m256i)__builtin_ia32_selectd_256(__U,
@@ -3600,7 +3600,7 @@ _mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) {
                                  (__v8si)__A);
  }
 
- static __inline__ __m256i __DEFAULT_FN_ATTRS256
+ static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
  _mm256_mask2_permutex2var_epi32(__m256i __A, __m256i __I, __mmask8 __U,
                                  __m256i __B) {
    return (__m256i)__builtin_ia32_selectd_256(__U,
@@ -3608,7 +3608,7 @@ _mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) {
                                  (__v8si)__I);
  }
 
- static __inline__ __m256i __DEFAULT_FN_ATTRS256
+ static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
  _mm256_maskz_permutex2var_epi32(__mmask8 __U, __m256i __A, __m256i __I,
                                  __m256i __B) {
    return (__m256i)__builtin_ia32_selectd_256(__U,
@@ -3616,40 +3616,43 @@ _mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) {
                                  (__v8si)_mm256_setzero_si256());
  }
 
- static __inline__ __m128d __DEFAULT_FN_ATTRS128
+ static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR
  _mm_permutex2var_pd(__m128d __A, __m128i __I, __m128d __B) {
    return (__m128d)__builtin_ia32_vpermi2varpd128((__v2df)__A, (__v2di)__I,
                                                   (__v2df)__B);
  }
 
- static __inline__ __m128d __DEFAULT_FN_ATTRS128
- _mm_mask_permutex2var_pd(__m128d __A, __mmask8 __U, __m128i __I, __m128d __B) {
+ static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR
+ _mm_mask_permutex2var_pd(__m128d __A, __mmask8 __U, __m128i __I,
+                          __m128d __B) {
    return (__m128d)__builtin_ia32_selectpd_128(__U,
                                  (__v2df)_mm_permutex2var_pd(__A, __I, __B),
                                  (__v2df)__A);
  }
 
- static __inline__ __m128d __DEFAULT_FN_ATTRS128
- _mm_mask2_permutex2var_pd(__m128d __A, __m128i __I, __mmask8 __U, __m128d __B) {
+ static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR
+ _mm_mask2_permutex2var_pd(__m128d __A, __m128i __I, __mmask8 __U,
+                           __m128d __B) {
    return (__m128d)__builtin_ia32_selectpd_128(__U,
                                  (__v2df)_mm_permutex2var_pd(__A, __I, __B),
                                  (__v2df)(__m128d)__I);
  }
 
- static __inline__ __m128d __DEFAULT_FN_ATTRS128
- _mm_maskz_permutex2var_pd(__mmask8 __U, __m128d __A, __m128i __I, __m128d __B) {
+ static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR
+ _mm_maskz_permutex2var_pd(__mmask8 __U, __m128d __A, __m128i __I,
+                           __m128d __B) {
    return (__m128d)__builtin_ia32_selectpd_128(__U,
                                  (__v2df)_mm_permutex2var_pd(__A, __I, __B),
                                  (__v2df)_mm_setzero_pd());
  }
 
- static __inline__ __m256d __DEFAULT_FN_ATTRS256
+ static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR
  _mm256_permutex2var_pd(__m256d __A, __m256i __I, __m256d __B) {
    return (__m256d)__builtin_ia32_vpermi2varpd256((__v4df)__A, (__v4di)__I,
                                                   (__v4df)__B);
  }
 
- static __inline__ __m256d __DEFAULT_FN_ATTRS256
+ static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR
  _mm256_mask_permutex2var_pd(__m256d __A, __mmask8 __U, __m256i __I,
                              __m256d __B) {
    return (__m256d)__builtin_ia32_selectpd_256(__U,
@@ -3657,7 +3660,7 @@ _mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) {
                                  (__v4df)__A);
  }
 
- static __inline__ __m256d __DEFAULT_FN_ATTRS256
+ static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR
  _mm256_mask2_permutex2var_pd(__m256d __A, __m256i __I, __mmask8 __U,
                               __m256d __B) {
    return (__m256d)__builtin_ia32_selectpd_256(__U,
@@ -3665,7 +3668,7 @@ _mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) {
                                  (__v4df)(__m256d)__I);
  }
 
- static __inline__ __m256d __DEFAULT_FN_ATTRS256
+ static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR
  _mm256_maskz_permutex2var_pd(__mmask8 __U, __m256d __A, __m256i __I,
                               __m256d __B) {
    return (__m256d)__builtin_ia32_selectpd_256(__U,
@@ -3673,47 +3676,48 @@ _mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) {
                                  (__v4df)_mm256_setzero_pd());
  }
 
- static __inline__ __m128 __DEFAULT_FN_ATTRS128
+ static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
  _mm_permutex2var_ps(__m128 __A, __m128i __I, __m128 __B) {
    return (__m128)__builtin_ia32_vpermi2varps128((__v4sf)__A, (__v4si)__I,
                                                  (__v4sf)__B);
  }
 
- static __inline__ __m128 __DEFAULT_FN_ATTRS128
+ static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
  _mm_mask_permutex2var_ps(__m128 __A, __mmask8 __U, __m128i __I, __m128 __B) {
    return (__m128)__builtin_ia32_selectps_128(__U,
                                  (__v4sf)_mm_permutex2var_ps(__A, __I, __B),
                                  (__v4sf)__A);
  }
 
- static __inline__ __m128 __DEFAULT_FN_ATTRS128
+ static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
  _mm_mask2_permutex2var_ps(__m128 __A, __m128i __I, __mmask8 __U, __m128 __B) {
    return (__m128)__builtin_ia32_selectps_128(__U,
                                  (__v4sf)_mm_permutex2var_ps(__A, __I, __B),
                                  (__v4sf)(__m128)__I);
  }
 
- static __inline__ __m128 __DEFAULT_FN_ATTRS128
+ static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
  _mm_maskz_permutex2var_ps(__mmask8 __U, __m128 __A, __m128i __I, __m128 __B) {
    return (__m128)__builtin_ia32_selectps_128(__U,
                                  (__v4sf)_mm_permutex2var_ps(__A, __I, __B),
                                  (__v4sf)_mm_setzero_ps());
  }
 
- static __inline__ __m256 __DEFAULT_FN_ATTRS256
+ static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR
  _mm256_permutex2var_ps(__m256 __A, __m256i __I, __m256 __B) {
    return (__m256)__builtin_ia32_vpermi2varps256((__v8sf)__A, (__v8si)__I,
                                                  (__v8sf) __B);
  }
 
- static __inline__ __m256 __DEFAULT_FN_ATTRS256
- _mm256_mask_permutex2var_ps(__m256 __A, __mmask8 __U, __m256i __I, __m256 __B) {
+ static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR
+ _mm256_mask_permutex2var_ps(__m256 __A, __mmask8 __U, __m256i __I,
+                             __m256 __B) {
    return (__m256)__builtin_ia32_selectps_256(__U,
                                  (__v8sf)_mm256_permutex2var_ps(__A, __I, __B),
                                  (__v8sf)__A);
  }
 
- static __inline__ __m256 __DEFAULT_FN_ATTRS256
+ static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR
  _mm256_mask2_permutex2var_ps(__m256 __A, __m256i __I, __mmask8 __U,
                               __m256 __B) {
    return (__m256)__builtin_ia32_selectps_256(__U,
@@ -3721,7 +3725,7 @@ _mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) {
                                  (__v8sf)(__m256)__I);
  }
 
- static __inline__ __m256 __DEFAULT_FN_ATTRS256
+ static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR
  _mm256_maskz_permutex2var_ps(__mmask8 __U, __m256 __A, __m256i __I,
                               __m256 __B) {
    return (__m256)__builtin_ia32_selectps_256(__U,
@@ -3729,13 +3733,13 @@ _mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) {
                                  (__v8sf)_mm256_setzero_ps());
  }
 
- static __inline__ __m128i __DEFAULT_FN_ATTRS128
+ static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
  _mm_permutex2var_epi64(__m128i __A, __m128i __I, __m128i __B) {
    return (__m128i)__builtin_ia32_vpermi2varq128((__v2di)__A, (__v2di)__I,
                                                  (__v2di)__B);
  }
 
- static __inline__ __m128i __DEFAULT_FN_ATTRS128
+ static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
  _mm_mask_permutex2var_epi64(__m128i __A, __mmask8 __U, __m128i __I,
                              __m128i __B) {
    return (__m128i)__builtin_ia32_selectq_128(__U,
@@ -3743,7 +3747,7 @@ _mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) {
                                  (__v2di)__A);
  }
 
- static __inline__ __m128i __DEFAULT_FN_ATTRS128
+ static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
  _mm_mask2_permutex2var_epi64(__m128i __A, __m128i __I, __mmask8 __U,
                               __m128i __B) {
    return (__m128i)__builtin_ia32_selectq_128(__U,
@@ -3751,7 +3755,7 @@ _mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) {
                                  (__v2di)__I);
  }
 
- static __inline__ __m128i __DEFAULT_FN_ATTRS128
+ static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
  _mm_maskz_permutex2var_epi64(__mmask8 __U, __m128i __A, __m128i __I,
                               __m128i __B) {
    return (__m128i)__builtin_ia32_selectq_128(__U,
@@ -3759,14 +3763,13 @@ _mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) {
                                  (__v2di)_mm_setzero_si128());
  }
 
-
- static __inline__ __m256i __DEFAULT_FN_ATTRS256
+ static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
  _mm256_permutex2var_epi64(__m256i __A, __m256i __I, __m256i __B) {
    return (__m256i)__builtin_ia32_vpermi2varq256((__v4di)__A, (__v4di) __I,
                                                  (__v4di) __B);
  }
 
- static __inline__ __m256i __DEFAULT_FN_ATTRS256
+ static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
  _mm256_mask_permutex2var_epi64(__m256i __A, __mmask8 __U, __m256i __I,
                                 __m256i __B) {
    return (__m256i)__builtin_ia32_selectq_256(__U,
@@ -3774,7 +3777,7 @@ _mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) {
                                  (__v4di)__A);
  }
 
- static __inline__ __m256i __DEFAULT_FN_ATTRS256
+ static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
  _mm256_mask2_permutex2var_epi64(__m256i __A, __m256i __I, __mmask8 __U,
                                  __m256i __B) {
    return (__m256i)__builtin_ia32_selectq_256(__U,
@@ -3782,7 +3785,7 @@ _mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) {
                                  (__v4di)__I);
  }
 
- static __inline__ __m256i __DEFAULT_FN_ATTRS256
+ static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
  _mm256_maskz_permutex2var_epi64(__mmask8 __U, __m256i __A, __m256i __I,
                                  __m256i __B) {
    return (__m256i)__builtin_ia32_selectq_256(__U,

diff --git a/clang/test/CodeGen/X86/avx512bw-builtins.c
b/clang/test/CodeGen/X86/avx512bw-builtins.c
index be2cd480f7558..e6e2e38bcc097 100644
--- a/clang/test/CodeGen/X86/avx512bw-builtins.c
+++ b/clang/test/CodeGen/X86/avx512bw-builtins.c
@@ -1591,6 +1591,132 @@ __m512i test_mm512_maskz_permutex2var_epi16(__mmask32 __U, __m512i __A, __m512i
   return _mm512_maskz_permutex2var_epi16(__U,__A,__I,__B);
 }
 
+TEST_CONSTEXPR(match_v32hi(
+    _mm512_permutex2var_epi16(
+        (__m512i)(__v32hi){
+            0, 10, 20, 30, 40, 50, 60, 70,
+            80, 90, 100, 110, 120, 130, 140, 150,
+            160, 170, 180, 190, 200, 210, 220, 230,
+            240, 250, 260, 270, 280, 290, 300, 310},
+        (__m512i)(__v32hi){
+            0, 32, 1, 33, 2, 34, 3, 35,
+            4, 36, 5, 37, 6, 38, 7, 39,
+            8, 40, 9, 41, 10, 42, 11, 43,
+            12, 44, 13, 45, 14, 46, 15, 47},
+        (__m512i)(__v32hi){
+            400, 410, 420, 430, 440, 450, 460, 470,
+            480, 490, 500, 510, 520, 530, 540, 550,
+            560, 570, 580, 590, 600, 610, 620, 630,
+            640, 650, 660, 670, 680, 690, 700, 710}),
+    0, 400, 10, 410, 20, 420, 30, 430,
+    40, 440, 50, 450, 60, 460, 70, 470,
+    80, 480, 90, 490, 100, 500, 110, 510,
+    120, 520, 130, 530, 140, 540, 150, 550));
+TEST_CONSTEXPR(match_v32hi(
+    _mm512_mask_permutex2var_epi16(
+        (__m512i)(__v32hi){
+            -1, -2, -3, -4, -5, -6, -7, -8,
+            -9, -10, -11, -12, -13, -14, -15, -16,
+            -17, -18, -19, -20, -21, -22, -23, -24,
+            -25, -26, -27, -28, -29, -30, -31, -32},
+        0xAAAAAAAA,
+        (__m512i)(__v32hi){
+            0, 32, 1, 33, 2, 34, 3, 35,
+            4, 36, 5, 37, 6, 38, 7, 39,
+            8, 40, 9, 41, 10, 42, 11, 43,
+            12, 44, 13, 45, 14, 46, 15, 47},
+        (__m512i)(__v32hi){
+            400, 410, 420, 430, 440, 450, 460, 470,
+            480, 490, 500, 510, 520, 530, 540, 550,
+            560, 570, 580, 590, 600, 610, 620, 630,
+            640, 650, 660, 670, 680, 690, 700, 710}),
+    -1, 400, -3, 410, -5, 420, -7, 430,
+    -9, 440, -11, 450, -13, 460, -15, 470,
+    -17, 480, -19, 490, -21, 500, -23, 510,
+    -25, 520, -27, 530, -29, 540, -31, 550));
+TEST_CONSTEXPR(match_v32hi(
+    _mm512_maskz_permutex2var_epi16(
+        0x55555555,
+        (__m512i)(__v32hi){
+            0, 10, 20, 30, 40, 50, 60, 70,
+            80, 90, 100, 110, 120, 130, 140, 150,
+            160, 170, 180, 190, 200, 210, 220, 230,
+            240, 250, 260, 270, 280, 290, 300, 310},
+        (__m512i)(__v32hi){
+            0, 32, 1, 33, 2, 34, 3, 35,
+            4, 36, 5, 37, 6, 38, 7, 39,
+            8, 40, 9, 41, 10, 42, 11, 43,
+            12, 44, 13, 45, 14, 46, 15, 47},
+        (__m512i)(__v32hi){
+            400, 410, 420, 430, 440, 450, 460, 470,
+            480, 490, 500, 510, 520, 530, 540, 550,
+            560, 570, 580, 590, 600, 610, 620, 630,
+            640, 650, 660, 670, 680, 690, 700, 710}),
+    0, 0, 10, 0, 20, 0, 30, 0,
+    40, 0, 50, 0, 60, 0, 70, 0,
+    80, 0, 90, 0, 100, 0, 110, 0,
+    120, 0, 130, 0, 140, 0, 150, 0));
+
+TEST_CONSTEXPR(match_v64qu(
+    _mm512_permutex2var_epi8(
+        (__m512i)(__v64qu){
+            0, 10, 20, 30, 40, 50, 60, 70,
+            80, 90, 100, 110, 120, 127, 126, 125,
+            124, 123, 122, 121, 120, 119, 118, 117,
+            116, 115, 114, 113, 112, 111, 110, 109,
+            108, 107, 106, 105, 104, 103, 102, 101,
+            100, 99, 98, 97, 96, 95, 94, 93,
+            92, 91, 90, 89, 88, 87, 86, 85,
+            84, 83, 82, 81, 80, 79, 78, 77},
+        (__m512i)(__v64qu){
+            0, 64, 1, 65, 2, 66, 3, 67,
+            4, 68, 5, 69, 6, 70, 7, 71,
+            8, 72, 9, 73, 10, 74, 11, 75,
+            12, 76, 13, 77, 14, 78, 15, 79,
+            16, 80, 17, 81, 18, 82, 19, 83,
+            20, 84, 21, 85, 22, 86, 23, 87,
+            24, 88, 25, 89, 26, 90, 27, 91,
+            28, 92, 29, 93, 30, 94, 31, 95},
+        (__m512i)(__v64qu){
+            200, 210, 220, 230, 240, 250, 254, 253,
+            252, 251, 250, 249, 248, 247, 246, 245,
+            244, 243, 242, 241, 240, 239, 238, 237,
+            236, 235, 234, 233, 232, 231, 230, 229,
+            228, 227, 226, 225, 224, 223, 222, 221,
+            220, 219, 218, 217, 216, 215, 214, 213,
+            212, 211, 210, 209, 208, 207, 206, 205,
+            204, 203, 202, 201, 200, 199, 198, 197}),
+    0, 200, 10, 210, 20, 220, 30, 230,
+    40, 240, 50, 250, 60, 254, 70, 253,
+    80, 252, 90, 251, 100, 250, 110, 249,
+    120, 248, 127, 247, 126, 246, 125, 245,
+    124, 244, 123, 243, 122, 242, 121, 241,
+    120, 240, 119, 239, 118, 238, 117, 237,
+    116, 236, 115, 235, 114, 234, 113, 233,
+    112, 232, 111, 231, 110, 230, 109, 229));
+TEST_CONSTEXPR(match_v32hi(
+    _mm512_mask2_permutex2var_epi16(
+        (__m512i)(__v32hi){
+            0, 10, 20, 30, 40, 50, 60, 70,
+            80, 90, 100, 110, 120, 130, 140, 150,
+            160, 170, 180, 190, 200, 210, 220, 230,
+            240, 250, 260, 270, 280, 290, 300, 310},
+        (__m512i)(__v32hi){
+            0, 32, 1, 33, 2, 34, 3, 35,
+            4, 36, 5, 37, 6, 38, 7, 39,
+            8, 40, 9, 41, 10, 42, 11, 43,
+            12, 44, 13, 45, 14, 46, 15, 47},
+        0x55555555,
+        (__m512i)(__v32hi){
+            400, 410, 420, 430, 440, 450, 460, 470,
+            480, 490, 500, 510, 520, 530, 540, 550,
+            560, 570, 580, 590, 600, 610, 620, 630,
+            640, 650, 660, 670, 680, 690, 700, 710}),
+    0, 32, 10, 33, 20, 34, 30, 35,
+    40, 36, 50, 37, 60, 38, 70, 39,
+    80, 40, 90, 41, 100, 42, 110, 43,
+    120, 44, 130, 45, 140, 46, 150, 47));
+
 __m512i test_mm512_mulhrs_epi16(__m512i __A, __m512i __B) {
   // CHECK-LABEL: test_mm512_mulhrs_epi16
   // CHECK: @llvm.x86.avx512.pmul.hr.sw.512
@@ -2578,6 +2704,33 @@ __m512i test_mm512_broadcastw_epi16(__m128i __A) {
   return _mm512_broadcastw_epi16(__A);
 }
 TEST_CONSTEXPR(match_v32hi(_mm512_broadcastw_epi16((__m128i)(__v8hi){42, 3, 10, 8, 0, 256, 256, 128}), 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42));
+TEST_CONSTEXPR(match_v32hi(
+    _mm512_permutex2var_epi16((__m512i)(__v32hi){1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
+                                                 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32},
+                              (__m512i)(__v32hi){0, 31, 32, 63, 1, 33, 2, 34,
+                                                 3, 35, 4, 36, 5, 37, 6, 38,
+                                                 7, 39, 8, 40, 9, 41, 10, 42,
+                                                 11, 43, 12, 44, 13, 45, 14, 46},
+                              (__m512i)(__v32hi){101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116,
+                                                 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132}),
+    1, 32, 101, 132, 2, 102, 3, 103,
+    4, 104, 5, 105, 6, 106, 7, 107,
+    8, 108, 9, 109, 10, 110, 11, 111,
+    12, 112, 13, 113, 14, 114, 15, 115));
+TEST_CONSTEXPR(match_v32hi(
+    _mm512_mask_permutex2var_epi16((__m512i)(__v32hi){-1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15, -16,
+                                                      -17, -18, -19, -20, -21, -22, -23, -24, -25, -26, -27, -28, -29, -30, -31, -32},
+                                   0xAAAAAAAA,
+                                   (__m512i)(__v32hi){0, 31, 32, 63, 1, 33, 2, 34,
+                                                      3, 35, 4, 36, 5, 37, 6, 38,
+                                                      7, 39, 8, 40, 9, 41, 10, 42,
+                                                      11, 43, 12, 44, 13, 45, 14, 46},
+                                   (__m512i)(__v32hi){101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116,
+                                                      117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132}),
+    -1, -32, -3, 132, -5, 102, -7, 103,
+    -9, 104, -11, 105, -13, 106, -15, 107,
+    -17, 108, -19, 109, -21, 110, -23, 111,
+    -25, 112, -27, 113, -29, 114, -31, 115));
 
 __m512i test_mm512_mask_broadcastw_epi16(__m512i __O, __mmask32 __M, __m128i __A) {
   // CHECK-LABEL: test_mm512_mask_broadcastw_epi16
diff --git a/clang/test/CodeGen/X86/avx512f-builtins.c b/clang/test/CodeGen/X86/avx512f-builtins.c
index 69599379b6b3d..8e65430bd3e84 100644
--- a/clang/test/CodeGen/X86/avx512f-builtins.c
+++ b/clang/test/CodeGen/X86/avx512f-builtins.c
@@ -5607,6 +5607,56 @@ __m512i test_mm512_maskz_permutex2var_epi64(__mmask8 __U, __m512i __A, __m512i _
   // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
   return _mm512_maskz_permutex2var_epi64(__U, __A, __I, __B);
 }
+
+TEST_CONSTEXPR(match_v16si(
+    _mm512_permutex2var_epi32(
+        (__m512i)(__v16si){0, 10, 20, 30, 40, 50, 60, 70,
+                           80, 90, 100, 110, 120, 130, 140, 150},
+        (__m512i)(__v16si){0, 15, 16, 31, 1, 17, 2, 18,
+                           3, 19, 4, 20, 5, 21, 6, 22},
+        (__m512i)(__v16si){200, 210, 220, 230, 240, 250, 260, 270,
+                           280, 290, 300, 310, 320, 330, 340, 350}),
+    0, 150, 200, 350, 10, 210, 20, 220,
+    30, 230, 40, 240, 50, 250, 60, 260));
+TEST_CONSTEXPR(match_v16si(
+    _mm512_mask_permutex2var_epi32(
+        (__m512i)(__v16si){-1, -2, -3, -4, -5, -6, -7, -8,
+                           -9, -10, -11, -12, -13, -14, -15, -16},
+        0xAAAA,
+        (__m512i)(__v16si){0, 15, 16, 31, 1, 17, 2, 18,
+                           3, 19, 4, 20, 5, 21, 6, 22},
+        (__m512i)(__v16si){200, 210, 220, 230, 240, 250, 260, 270,
+                           280, 290, 300, 310, 320, 330, 340, 350}),
+    -1, -16, -3, 350, -5, 210, -7, 220,
+    -9, 230, -11, 240, -13, 250, -15, 260));
+TEST_CONSTEXPR(match_v16si(
+    _mm512_maskz_permutex2var_epi32(
+        0x5555,
+        (__m512i)(__v16si){0, 10, 20, 30, 40, 50, 60, 70,
+                           80, 90, 100, 110, 120, 130, 140, 150},
+        (__m512i)(__v16si){0, 15, 16, 31, 1, 17, 2, 18,
+                           3, 19, 4, 20, 5, 21, 6, 22},
+        (__m512i)(__v16si){200, 210, 220, 230, 240, 250, 260, 270,
+                           280, 290, 300, 310, 320, 330, 340, 350}),
+    0, 0, 200, 0, 10, 0, 20, 0,
+    30, 0, 40, 0, 50, 0, 60, 0));
+TEST_CONSTEXPR(match_m512(
+    _mm512_permutex2var_ps(
+        (__m512){1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f,
+                 9.f, 10.f, 11.f, 12.f, 13.f, 14.f, 15.f, 16.f},
+        (__m512i)(__v16si){0, 15, 16, 31, 1, 17, 2, 18,
+                           3, 19, 4, 20, 5, 21, 6, 22},
+        (__m512){101.f, 102.f, 103.f, 104.f, 105.f, 106.f, 107.f, 108.f,
+                 109.f, 110.f, 111.f, 112.f, 113.f, 114.f, 115.f, 116.f}),
+    1.f, 16.f, 101.f, 116.f, 2.f, 102.f, 3.f, 103.f,
+    4.f, 104.f, 5.f, 105.f, 6.f, 106.f, 7.f, 107.f));
+TEST_CONSTEXPR(match_m512d(
+    _mm512_permutex2var_pd(
+        (__m512d){1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0},
+        (__m512i)(__v8di){0, 15, 16, 23, 1, 17, 2, 18},
+        (__m512d){101.0, 102.0, 103.0, 104.0, 105.0, 106.0, 107.0, 108.0}),
+    1.0, 108.0, 1.0, 8.0, 2.0, 2.0, 3.0, 3.0));
+
 __mmask16 test_mm512_testn_epi32_mask(__m512i __A, __m512i __B) {
   // CHECK-LABEL: test_mm512_testn_epi32_mask
   // CHECK: and <16 x i32> %{{.*}}, %{{.*}}
@@ -11753,3 +11803,73 @@ void test_mm512_mask_i32loscatter_epi64(void *__addr, __mmask8 __mask, __m512i _
   // CHECK: @llvm.x86.avx512.mask.scatter.dpq.512
   _mm512_mask_i32loscatter_epi64(__addr, __mask, __index, __v1, 2);
 }
+
+
+TEST_CONSTEXPR(match_m512d(
+    _mm512_permutex2var_pd((__m512d){1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0},
+                           (__m512i)(__v8di){0, 15, 16, 23, 1, 17, 2, 18},
+                           (__m512d){101.0, 102.0, 103.0, 104.0, 105.0, 106.0, 107.0, 108.0}),
+    1.0, 108.0, 1.0, 8.0, 2.0, 2.0, 3.0, 3.0));
+TEST_CONSTEXPR(match_m512d(
+    _mm512_mask_permutex2var_pd((__m512d){-1.0, -2.0, -3.0, -4.0, -5.0, -6.0, -7.0, -8.0},
+                                0xAA,
+                                (__m512i)(__v8di){0, 15, 16, 23, 1, 17, 2, 18},
+                                (__m512d){101.0, 102.0, 103.0, 104.0, 105.0, 106.0, 107.0, 108.0}),
+    -1.0, 108.0, -3.0, -8.0, -5.0, -2.0, -7.0, -3.0));
+TEST_CONSTEXPR(match_m512d(
+    _mm512_maskz_permutex2var_pd(0x55, (__m512d){1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0},
+                                 (__m512i)(__v8di){0, 15, 16, 23, 1, 17, 2, 18},
+                                 (__m512d){101.0, 102.0, 103.0, 104.0, 105.0, 106.0, 107.0, 108.0}),
+    1.0, 0.0, 1.0, 0.0, 2.0, 0.0, 3.0, 0.0));
+
+TEST_CONSTEXPR(match_m512(
+    _mm512_permutex2var_ps((__m512){1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f,
+                                    9.f, 10.f, 11.f, 12.f, 13.f, 14.f, 15.f, 16.f},
+                           (__m512i)(__v16si){0, 15, 16, 31, 1, 17, 2, 18,
+                                              3, 19, 4, 20, 5, 21, 6, 22},
+                           (__m512){101.f, 102.f, 103.f, 104.f, 105.f, 106.f, 107.f, 108.f,
+                                    109.f, 110.f, 111.f, 112.f, 113.f, 114.f, 115.f, 116.f}),
+    1.f, 16.f, 101.f, 116.f, 2.f, 102.f, 3.f, 103.f,
+    4.f, 104.f, 5.f, 105.f, 6.f, 106.f, 7.f, 107.f));
6, 22}, + (__m512){101.f, 102.f, 103.f, 104.f, 105.f, 106.f, 107.f, 108.f, + 109.f, 110.f, 111.f, 112.f, 113.f, 114.f, 115.f, 116.f}), + 1.f, 16.f, 101.f, 116.f, 2.f, 102.f, 3.f, 103.f, + 4.f, 104.f, 5.f, 105.f, 6.f, 106.f, 7.f, 107.f)); +TEST_CONSTEXPR(match_m512( + _mm512_mask_permutex2var_ps((__m512){-1.f, -2.f, -3.f, -4.f, -5.f, -6.f, -7.f, -8.f, + -9.f, -10.f, -11.f, -12.f, -13.f, -14.f, -15.f, -16.f}, + 0xAAAA, + (__m512i)(__v16si){0, 15, 16, 31, 1, 17, 2, 18, + 3, 19, 4, 20, 5, 21, 6, 22}, + (__m512){101.f, 102.f, 103.f, 104.f, 105.f, 106.f, 107.f, 108.f, + 109.f, 110.f, 111.f, 112.f, 113.f, 114.f, 115.f, 116.f}), + -1.f, -16.f, -3.f, 116.f, -5.f, 102.f, -7.f, 103.f, + -9.f, 104.f, -11.f, 105.f, -13.f, 106.f, -15.f, 107.f)); + +TEST_CONSTEXPR(match_v16si( + _mm512_permutex2var_epi32((__m512i)(__v16si){1, 2, 3, 4, 5, 6, 7, 8, + 9, 10, 11, 12, 13, 14, 15, 16}, + (__m512i)(__v16si){0, 15, 16, 31, 1, 17, 2, 18, + 3, 19, 4, 20, 5, 21, 6, 22}, + (__m512i)(__v16si){101, 102, 103, 104, 105, 106, 107, 108, + 109, 110, 111, 112, 113, 114, 115, 116}), + 1, 16, 101, 116, 2, 102, 3, 103, + 4, 104, 5, 105, 6, 106, 7, 107)); +TEST_CONSTEXPR(match_v16si( + _mm512_maskz_permutex2var_epi32(0x5555, + (__m512i)(__v16si){1, 2, 3, 4, 5, 6, 7, 8, + 9, 10, 11, 12, 13, 14, 15, 16}, + (__m512i)(__v16si){0, 15, 16, 31, 1, 17, 2, 18, + 3, 19, 4, 20, 5, 21, 6, 22}, + (__m512i)(__v16si){101, 102, 103, 104, 105, 106, 107, 108, + 109, 110, 111, 112, 113, 114, 115, 116}), + 1, 0, 101, 0, 2, 0, 3, 0, + 4, 0, 5, 0, 6, 0, 7, 0)); + +TEST_CONSTEXPR(match_v8di( + _mm512_permutex2var_epi64((__m512i)(__v8di){1, 2, 3, 4, 5, 6, 7, 8}, + (__m512i)(__v8di){0, 15, 16, 23, 1, 17, 2, 18}, + (__m512i)(__v8di){101, 102, 103, 104, 105, 106, 107, 108}), + 1, 108, 1, 8, 2, 2, 3, 3)); +TEST_CONSTEXPR(match_v8di( + _mm512_mask_permutex2var_epi64((__m512i)(__v8di){-1, -2, -3, -4, -5, -6, -7, -8}, + 0xAA, + (__m512i)(__v8di){0, 15, 16, 23, 1, 17, 2, 18}, + (__m512i)(__v8di){101, 102, 103, 104, 105, 106, 107, 108}), + -1, 108, -3, -8, -5, -2, -7, -3)); diff --git a/clang/test/CodeGen/X86/avx512vbmi-builtins.c b/clang/test/CodeGen/X86/avx512vbmi-builtins.c index c3b6298a39b59..7d506db92faeb 100644 --- a/clang/test/CodeGen/X86/avx512vbmi-builtins.c +++ b/clang/test/CodeGen/X86/avx512vbmi-builtins.c @@ -3,8 +3,14 @@ // RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512vbmi -emit-llvm -o - -Wall -Werror | FileCheck %s // RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +avx512vbmi -emit-llvm -o - -Wall -Werror | FileCheck %s +// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512vbmi -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s +// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +avx512vbmi -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s +// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512vbmi -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s +// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +avx512vbmi -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s + #include <immintrin.h> +#include 
"builtin_test_helpers.h" __m512i test_mm512_mask2_permutex2var_epi8(__m512i __A, __m512i __I, __mmask64 __U, __m512i __B) { // CHECK-LABEL: test_mm512_mask2_permutex2var_epi8 @@ -33,6 +39,154 @@ __m512i test_mm512_maskz_permutex2var_epi8(__mmask64 __U, __m512i __A, __m512i _ return _mm512_maskz_permutex2var_epi8(__U, __A, __I, __B); } +TEST_CONSTEXPR(match_v64qu( + _mm512_permutex2var_epi8((__m512i)(__v64qu){ + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, + 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, + 56, 57, 58, 59, 60, 61, 62, 63}, + (__m512i)(__v64qu){ + 0, 64, 1, 65, 2, 66, 3, 67, + 4, 68, 5, 69, 6, 70, 7, 71, + 8, 72, 9, 73, 10, 74, 11, 75, + 12, 76, 13, 77, 14, 78, 15, 79, + 16, 80, 17, 81, 18, 82, 19, 83, + 20, 84, 21, 85, 22, 86, 23, 87, + 24, 88, 25, 89, 26, 90, 27, 91, + 28, 92, 29, 93, 30, 94, 31, 95}, + (__m512i)(__v64qu){ + 200, 201, 202, 203, 204, 205, 206, 207, + 208, 209, 210, 211, 212, 213, 214, 215, + 216, 217, 218, 219, 220, 221, 222, 223, + 224, 225, 226, 227, 228, 229, 230, 231, + 232, 233, 234, 235, 236, 237, 238, 239, + 240, 241, 242, 243, 244, 245, 246, 247, + 248, 249, 250, 251, 252, 253, 254, 255, + 0, 1, 2, 3, 4, 5, 6, 7}), + 0, 200, 1, 201, 2, 202, 3, 203, + 4, 204, 5, 205, 6, 206, 7, 207, + 8, 208, 9, 209, 10, 210, 11, 211, + 12, 212, 13, 213, 14, 214, 15, 215, + 16, 216, 17, 217, 18, 218, 19, 219, + 20, 220, 21, 221, 22, 222, 23, 223, + 24, 224, 25, 225, 26, 226, 27, 227, + 28, 228, 29, 229, 30, 230, 31, 231)); +TEST_CONSTEXPR(match_v64qu( + _mm512_mask_permutex2var_epi8((__m512i)(__v64qu){ + 10, 11, 12, 13, 14, 15, 16, 17, + 18, 19, 20, 21, 22, 23, 24, 25, + 26, 27, 28, 29, 30, 31, 32, 33, + 34, 35, 36, 37, 38, 39, 40, 41, + 42, 43, 44, 45, 46, 47, 48, 49, + 50, 51, 52, 53, 54, 55, 56, 57, + 58, 59, 60, 61, 62, 63, 64, 65, + 66, 67, 68, 69, 70, 71, 72, 73}, + 0xAAAAAAAAAAAAAAAAULL, + (__m512i)(__v64qu){ + 0, 64, 1, 65, 2, 66, 3, 67, + 4, 68, 5, 69, 6, 70, 7, 71, + 8, 72, 9, 73, 10, 74, 11, 75, + 12, 76, 13, 77, 14, 78, 15, 79, + 16, 80, 17, 81, 18, 82, 19, 83, + 20, 84, 21, 85, 22, 86, 23, 87, + 24, 88, 25, 89, 26, 90, 27, 91, + 28, 92, 29, 93, 30, 94, 31, 95}, + (__m512i)(__v64qu){ + 200, 201, 202, 203, 204, 205, 206, 207, + 208, 209, 210, 211, 212, 213, 214, 215, + 216, 217, 218, 219, 220, 221, 222, 223, + 224, 225, 226, 227, 228, 229, 230, 231, + 232, 233, 234, 235, 236, 237, 238, 239, + 240, 241, 242, 243, 244, 245, 246, 247, + 248, 249, 250, 251, 252, 253, 254, 255, + 0, 1, 2, 3, 4, 5, 6, 7}), + 10, 200, 12, 201, 14, 202, 16, 203, + 18, 204, 20, 205, 22, 206, 24, 207, + 26, 208, 28, 209, 30, 210, 32, 211, + 34, 212, 36, 213, 38, 214, 40, 215, + 42, 216, 44, 217, 46, 218, 48, 219, + 50, 220, 52, 221, 54, 222, 56, 223, + 58, 224, 60, 225, 62, 226, 64, 227, + 66, 228, 68, 229, 70, 230, 72, 231)); +TEST_CONSTEXPR(match_v64qu( + _mm512_maskz_permutex2var_epi8(0x5555555555555555ULL, + (__m512i)(__v64qu){ + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, + 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, + 56, 57, 58, 59, 60, 61, 62, 63}, + (__m512i)(__v64qu){ + 0, 64, 1, 65, 2, 66, 3, 67, + 4, 68, 5, 69, 6, 70, 7, 71, + 8, 72, 9, 73, 10, 74, 11, 75, + 12, 76, 13, 77, 14, 78, 15, 79, + 16, 80, 17, 81, 18, 82, 19, 83, + 20, 84, 21, 85, 22, 86, 23, 87, + 24, 88, 25, 89, 26, 90, 27, 91, + 28, 92, 29, 93, 
30, 94, 31, 95}, + (__m512i)(__v64qu){ + 200, 201, 202, 203, 204, 205, 206, 207, + 208, 209, 210, 211, 212, 213, 214, 215, + 216, 217, 218, 219, 220, 221, 222, 223, + 224, 225, 226, 227, 228, 229, 230, 231, + 232, 233, 234, 235, 236, 237, 238, 239, + 240, 241, 242, 243, 244, 245, 246, 247, + 248, 249, 250, 251, 252, 253, 254, 255, + 0, 1, 2, 3, 4, 5, 6, 7}), + 0, 0, 1, 0, 2, 0, 3, 0, + 4, 0, 5, 0, 6, 0, 7, 0, + 8, 0, 9, 0, 10, 0, 11, 0, + 12, 0, 13, 0, 14, 0, 15, 0, + 16, 0, 17, 0, 18, 0, 19, 0, + 20, 0, 21, 0, 22, 0, 23, 0, + 24, 0, 25, 0, 26, 0, 27, 0, + 28, 0, 29, 0, 30, 0, 31, 0)); +TEST_CONSTEXPR(match_v64qu( + _mm512_mask2_permutex2var_epi8((__m512i)(__v64qu){ + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, + 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, + 56, 57, 58, 59, 60, 61, 62, 63}, + (__m512i)(__v64qu){ + 0, 64, 1, 65, 2, 66, 3, 67, + 4, 68, 5, 69, 6, 70, 7, 71, + 8, 72, 9, 73, 10, 74, 11, 75, + 12, 76, 13, 77, 14, 78, 15, 79, + 16, 80, 17, 81, 18, 82, 19, 83, + 20, 84, 21, 85, 22, 86, 23, 87, + 24, 88, 25, 89, 26, 90, 27, 91, + 28, 92, 29, 93, 30, 94, 31, 95}, + 0x5555555555555555ULL, + (__m512i)(__v64qu){ + 200, 201, 202, 203, 204, 205, 206, 207, + 208, 209, 210, 211, 212, 213, 214, 215, + 216, 217, 218, 219, 220, 221, 222, 223, + 224, 225, 226, 227, 228, 229, 230, 231, + 232, 233, 234, 235, 236, 237, 238, 239, + 240, 241, 242, 243, 244, 245, 246, 247, + 248, 249, 250, 251, 252, 253, 254, 255, + 0, 1, 2, 3, 4, 5, 6, 7}), + 0, 64, 1, 65, 2, 66, 3, 67, + 4, 68, 5, 69, 6, 70, 7, 71, + 8, 72, 9, 73, 10, 74, 11, 75, + 12, 76, 13, 77, 14, 78, 15, 79, + 16, 80, 17, 81, 18, 82, 19, 83, + 20, 84, 21, 85, 22, 86, 23, 87, + 24, 88, 25, 89, 26, 90, 27, 91, + 28, 92, 29, 93, 30, 94, 31, 95)); + __m512i test_mm512_permutexvar_epi8(__m512i __A, __m512i __B) { // CHECK-LABEL: test_mm512_permutexvar_epi8 // CHECK: call <64 x i8> @llvm.x86.avx512.permvar.qi.512(<64 x i8> %{{.*}}, <64 x i8> %{{.*}}) diff --git a/clang/test/CodeGen/X86/avx512vbmivl-builtin.c b/clang/test/CodeGen/X86/avx512vbmivl-builtin.c index c4d5fc8fb6977..49b7a1a721195 100644 --- a/clang/test/CodeGen/X86/avx512vbmivl-builtin.c +++ b/clang/test/CodeGen/X86/avx512vbmivl-builtin.c @@ -3,8 +3,14 @@ // RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512vbmi -target-feature +avx512vl -target-feature +avx512bw -emit-llvm -o - -Wall -Werror | FileCheck %s // RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +avx512vbmi -target-feature +avx512vl -target-feature +avx512bw -emit-llvm -o - -Wall -Werror | FileCheck %s +// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512vbmi -target-feature +avx512vl -target-feature +avx512bw -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s +// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +avx512vbmi -target-feature +avx512vl -target-feature +avx512bw -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s +// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512vbmi -target-feature +avx512vl -target-feature +avx512bw -emit-llvm -o - -Wall -Werror 
-fexperimental-new-constant-interpreter | FileCheck %s +// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +avx512vbmi -target-feature +avx512vl -target-feature +avx512bw -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s + #include <immintrin.h> +#include "builtin_test_helpers.h" __m128i test_mm_permutexvar_epi8(__m128i __A, __m128i __B) { // CHECK-LABEL: test_mm_permutexvar_epi8 @@ -77,8 +83,28 @@ __m128i test_mm_maskz_permutex2var_epi8(__mmask16 __U, __m128i __A, __m128i __I, // CHECK-LABEL: test_mm_maskz_permutex2var_epi8 // CHECK: call <16 x i8> @llvm.x86.avx512.vpermi2var.qi.128(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}) // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}} - return _mm_maskz_permutex2var_epi8(__U, __A, __I, __B); -} + return _mm_maskz_permutex2var_epi8(__U, __A, __I, __B); +} + +TEST_CONSTEXPR(match_v16qu( + _mm_permutex2var_epi8((__m128i)(__v16qu){1, 2, 3, 4, 5, 6, 7, 8, + 9, 10, 11, 12, 13, 14, 15, 16}, + (__m128i)(__v16qu){0, 16, 1, 17, 2, 18, 3, 19, + 4, 20, 5, 21, 6, 22, 7, 23}, + (__m128i)(__v16qu){101, 102, 103, 104, 105, 106, 107, 108, + 109, 110, 111, 112, 113, 114, 115, 116}), + 1, 101, 2, 102, 3, 103, 4, 104, + 5, 105, 6, 106, 7, 107, 8, 108)); +TEST_CONSTEXPR(match_v16qu( + _mm_mask_permutex2var_epi8((__m128i)(__v16qu){200, 201, 202, 203, 204, 205, 206, 207, + 208, 209, 210, 211, 212, 213, 214, 215}, + 0xAAAA, + (__m128i)(__v16qu){0, 16, 1, 17, 2, 18, 3, 19, + 4, 20, 5, 21, 6, 22, 7, 23}, + (__m128i)(__v16qu){101, 102, 103, 104, 105, 106, 107, 108, + 109, 110, 111, 112, 113, 114, 115, 116}), + 200, 101, 202, 102, 204, 103, 206, 104, + 208, 105, 210, 106, 212, 107, 214, 108)); __m256i test_mm256_permutex2var_epi8(__m256i __A, __m256i __I, __m256i __B) { // CHECK-LABEL: test_mm256_permutex2var_epi8 @@ -97,8 +123,44 @@ __m256i test_mm256_maskz_permutex2var_epi8(__mmask32 __U, __m256i __A, __m256i _ // CHECK-LABEL: test_mm256_maskz_permutex2var_epi8 // CHECK: call <32 x i8> @llvm.x86.avx512.vpermi2var.qi.256(<32 x i8> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}) // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}} - return _mm256_maskz_permutex2var_epi8(__U, __A, __I, __B); -} + return _mm256_maskz_permutex2var_epi8(__U, __A, __I, __B); +} + +TEST_CONSTEXPR(match_v32qu( + _mm256_permutex2var_epi8((__m256i)(__v32qu){1, 2, 3, 4, 5, 6, 7, 8, + 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, + 25, 26, 27, 28, 29, 30, 31, 32}, + (__m256i)(__v32qu){0, 32, 1, 33, 2, 34, 3, 35, + 4, 36, 5, 37, 6, 38, 7, 39, + 8, 40, 9, 41, 10, 42, 11, 43, + 12, 44, 13, 45, 14, 46, 15, 47}, + (__m256i)(__v32qu){101, 102, 103, 104, 105, 106, 107, 108, + 109, 110, 111, 112, 113, 114, 115, 116, + 117, 118, 119, 120, 121, 122, 123, 124, + 125, 126, 127, 128, 129, 130, 131, 132}), + 1, 101, 2, 102, 3, 103, 4, 104, + 5, 105, 6, 106, 7, 107, 8, 108, + 9, 109, 10, 110, 11, 111, 12, 112, + 13, 113, 14, 114, 15, 115, 16, 116)); +TEST_CONSTEXPR(match_v32qu( + _mm256_mask_permutex2var_epi8((__m256i)(__v32qu){200, 201, 202, 203, 204, 205, 206, 207, + 208, 209, 210, 211, 212, 213, 214, 215, + 216, 217, 218, 219, 220, 221, 222, 223, + 224, 225, 226, 227, 228, 229, 230, 231}, + 0xAAAAAAAA, + (__m256i)(__v32qu){0, 32, 1, 33, 2, 34, 3, 35, + 4, 36, 5, 37, 6, 38, 7, 39, + 8, 40, 9, 41, 10, 42, 11, 43, + 12, 44, 13, 45, 14, 46, 15, 47}, + (__m256i)(__v32qu){101, 102, 103, 104, 105, 106, 107, 108, + 109, 110, 111, 112, 113, 114, 115, 
116, + 117, 118, 119, 120, 121, 122, 123, 124, + 125, 126, 127, 128, 129, 130, 131, 132}), + 200, 101, 202, 102, 204, 103, 206, 104, + 208, 105, 210, 106, 212, 107, 214, 108, + 216, 109, 218, 110, 220, 111, 222, 112, + 224, 113, 226, 114, 228, 115, 230, 116)); __m128i test_mm_mask_multishift_epi64_epi8(__m128i __W, __mmask16 __M, __m128i __X, __m128i __Y) { // CHECK-LABEL: test_mm_mask_multishift_epi64_epi8 diff --git a/clang/test/CodeGen/X86/avx512vl-builtins.c b/clang/test/CodeGen/X86/avx512vl-builtins.c index 33c43977f72dc..121d5bf8d4adb 100644 --- a/clang/test/CodeGen/X86/avx512vl-builtins.c +++ b/clang/test/CodeGen/X86/avx512vl-builtins.c @@ -5610,12 +5610,23 @@ __m128i test_mm_mask2_permutex2var_epi32(__m128i __A, __m128i __I, __mmask8 __U, // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}} return _mm_mask2_permutex2var_epi32(__A,__I,__U,__B); } +TEST_CONSTEXPR(match_v4si( + _mm_mask2_permutex2var_epi32((__m128i)(__v4si){10, 20, 30, 40}, + (__m128i)(__v4si){0, 3, 4, 6}, 0x05, + (__m128i)(__v4si){100, 200, 300, 400}), + 10, 3, 100, 6)); __m256i test_mm256_mask2_permutex2var_epi32(__m256i __A, __m256i __I, __mmask8 __U, __m256i __B) { // CHECK-LABEL: test_mm256_mask2_permutex2var_epi32 // CHECK: @llvm.x86.avx512.vpermi2var.d.256 // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}} return _mm256_mask2_permutex2var_epi32(__A,__I,__U,__B); } +TEST_CONSTEXPR(match_v8si( + _mm256_mask2_permutex2var_epi32((__m256i)(__v8si){0, 10, 20, 30, 40, 50, 60, 70}, + (__m256i)(__v8si){0, 7, 8, 15, 1, 9, 2, 10}, + 0xA5, + (__m256i)(__v8si){100, 110, 120, 130, 140, 150, 160, 170}), + 0, 7, 100, 15, 1, 110, 2, 120)); __m128d test_mm_mask2_permutex2var_pd(__m128d __A, __m128i __I, __mmask8 __U, __m128d __B) { // CHECK-LABEL: test_mm_mask2_permutex2var_pd // CHECK: @llvm.x86.avx512.vpermi2var.pd.128 @@ -5646,149 +5657,255 @@ __m128i test_mm_mask2_permutex2var_epi64(__m128i __A, __m128i __I, __mmask8 __U, // CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}} return _mm_mask2_permutex2var_epi64(__A,__I,__U,__B); } +TEST_CONSTEXPR(match_v2di( + _mm_mask2_permutex2var_epi64((__m128i)(__v2di){10, 20}, + (__m128i)(__v2di){0, 5}, 0x1, + (__m128i)(__v2di){100, 200}), + 10, 5)); __m256i test_mm256_mask2_permutex2var_epi64(__m256i __A, __m256i __I, __mmask8 __U, __m256i __B) { // CHECK-LABEL: test_mm256_mask2_permutex2var_epi64 // CHECK: @llvm.x86.avx512.vpermi2var.q.256 // CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}} return _mm256_mask2_permutex2var_epi64(__A,__I,__U,__B); } +TEST_CONSTEXPR(match_v4di( + _mm256_mask2_permutex2var_epi64((__m256i)(__v4di){0, 10, 20, 30}, + (__m256i)(__v4di){0, 1, 4, 5}, 0x5, + (__m256i)(__v4di){100, 110, 120, 130}), + 0, 1, 100, 5)); __m128i test_mm_permutex2var_epi32(__m128i __A, __m128i __I, __m128i __B) { // CHECK-LABEL: test_mm_permutex2var_epi32 // CHECK: @llvm.x86.avx512.vpermi2var.d.128 return _mm_permutex2var_epi32(__A,__I,__B); } +TEST_CONSTEXPR(match_v4si( + _mm_permutex2var_epi32((__m128i)(__v4si){10, 20, 30, 40}, + (__m128i)(__v4si){0, 3, 4, 6}, + (__m128i)(__v4si){100, 200, 300, 400}), + 10, 40, 100, 300)); __m128i test_mm_mask_permutex2var_epi32(__m128i __A, __mmask8 __U, __m128i __I, __m128i __B) { // CHECK-LABEL: test_mm_mask_permutex2var_epi32 // CHECK: @llvm.x86.avx512.vpermi2var.d.128 // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}} return _mm_mask_permutex2var_epi32(__A,__U,__I,__B); } +TEST_CONSTEXPR(match_v4si( + 
_mm_mask_permutex2var_epi32((__m128i)(__v4si){-1, -2, -3, -4}, 0x0A, + (__m128i)(__v4si){0, 3, 4, 6}, + (__m128i)(__v4si){100, 200, 300, 400}), + -1, -4, -3, 300)); __m128i test_mm_maskz_permutex2var_epi32(__mmask8 __U, __m128i __A, __m128i __I, __m128i __B) { // CHECK-LABEL: test_mm_maskz_permutex2var_epi32 // CHECK: @llvm.x86.avx512.vpermi2var.d.128 // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}} return _mm_maskz_permutex2var_epi32(__U,__A,__I,__B); } +TEST_CONSTEXPR(match_v4si( + _mm_maskz_permutex2var_epi32(0x0A, (__m128i)(__v4si){10, 20, 30, 40}, + (__m128i)(__v4si){0, 3, 4, 6}, + (__m128i)(__v4si){100, 200, 300, 400}), + 0, 40, 0, 300)); __m256i test_mm256_permutex2var_epi32(__m256i __A, __m256i __I, __m256i __B) { // CHECK-LABEL: test_mm256_permutex2var_epi32 // CHECK: @llvm.x86.avx512.vpermi2var.d.256 return _mm256_permutex2var_epi32(__A,__I,__B); } +TEST_CONSTEXPR(match_v8si( + _mm256_permutex2var_epi32((__m256i)(__v8si){0, 10, 20, 30, 40, 50, 60, 70}, + (__m256i)(__v8si){0, 7, 8, 15, 1, 9, 2, 10}, + (__m256i)(__v8si){100, 110, 120, 130, 140, 150, 160, 170}), + 0, 70, 100, 170, 10, 110, 20, 120)); __m256i test_mm256_mask_permutex2var_epi32(__m256i __A, __mmask8 __U, __m256i __I, __m256i __B) { // CHECK-LABEL: test_mm256_mask_permutex2var_epi32 // CHECK: @llvm.x86.avx512.vpermi2var.d.256 // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}} return _mm256_mask_permutex2var_epi32(__A,__U,__I,__B); } +TEST_CONSTEXPR(match_v8si( + _mm256_mask_permutex2var_epi32((__m256i)(__v8si){-1, -2, -3, -4, -5, -6, -7, -8}, 0xAA, + (__m256i)(__v8si){0, 7, 8, 15, 1, 9, 2, 10}, + (__m256i)(__v8si){100, 110, 120, 130, 140, 150, 160, 170}), + -1, -8, -3, 170, -5, 110, -7, 120)); __m256i test_mm256_maskz_permutex2var_epi32(__mmask8 __U, __m256i __A, __m256i __I, __m256i __B) { // CHECK-LABEL: test_mm256_maskz_permutex2var_epi32 // CHECK: @llvm.x86.avx512.vpermi2var.d.256 // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}} return _mm256_maskz_permutex2var_epi32(__U,__A,__I,__B); } +TEST_CONSTEXPR(match_v8si( + _mm256_maskz_permutex2var_epi32(0xAA, (__m256i)(__v8si){0, 10, 20, 30, 40, 50, 60, 70}, + (__m256i)(__v8si){0, 7, 8, 15, 1, 9, 2, 10}, + (__m256i)(__v8si){100, 110, 120, 130, 140, 150, 160, 170}), + 0, 70, 0, 170, 0, 110, 0, 120)); __m128d test_mm_permutex2var_pd(__m128d __A, __m128i __I, __m128d __B) { // CHECK-LABEL: test_mm_permutex2var_pd // CHECK: @llvm.x86.avx512.vpermi2var.pd.128 return _mm_permutex2var_pd(__A,__I,__B); } +TEST_CONSTEXPR(match_m128d( + _mm_permutex2var_pd((__m128d){1.0, 2.0}, (__m128i)(__v2di){0, 2}, (__m128d){10.0, 20.0}), + 1.0, 10.0)); __m128d test_mm_mask_permutex2var_pd(__m128d __A, __mmask8 __U, __m128i __I, __m128d __B) { // CHECK-LABEL: test_mm_mask_permutex2var_pd // CHECK: @llvm.x86.avx512.vpermi2var.pd.128 // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}} return _mm_mask_permutex2var_pd(__A,__U,__I,__B); } +TEST_CONSTEXPR(match_m128d( + _mm_mask_permutex2var_pd((__m128d){-1.0, -2.0}, 0x2, (__m128i)(__v2di){0, 2}, (__m128d){10.0, 20.0}), + -1.0, 10.0)); __m128d test_mm_maskz_permutex2var_pd(__mmask8 __U, __m128d __A, __m128i __I, __m128d __B) { // CHECK-LABEL: test_mm_maskz_permutex2var_pd // CHECK: @llvm.x86.avx512.vpermi2var.pd.128 // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}} return _mm_maskz_permutex2var_pd(__U,__A,__I,__B); } +TEST_CONSTEXPR(match_m128d( + _mm_maskz_permutex2var_pd(0x2, (__m128d){1.0, 2.0}, (__m128i)(__v2di){0, 2}, 
(__m128d){10.0, 20.0}), + 0.0, 10.0)); __m256d test_mm256_permutex2var_pd(__m256d __A, __m256i __I, __m256d __B) { // CHECK-LABEL: test_mm256_permutex2var_pd // CHECK: @llvm.x86.avx512.vpermi2var.pd.256 return _mm256_permutex2var_pd(__A,__I,__B); } +TEST_CONSTEXPR(match_m256d( + _mm256_permutex2var_pd((__m256d){1.0, 2.0, 3.0, 4.0}, (__m256i)(__v4di){0, 4, 1, 5}, (__m256d){10.0, 20.0, 30.0, 40.0}), + 1.0, 10.0, 2.0, 20.0)); __m256d test_mm256_mask_permutex2var_pd(__m256d __A, __mmask8 __U, __m256i __I, __m256d __B) { // CHECK-LABEL: test_mm256_mask_permutex2var_pd // CHECK: @llvm.x86.avx512.vpermi2var.pd.256 // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}} return _mm256_mask_permutex2var_pd(__A,__U,__I,__B); } +TEST_CONSTEXPR(match_m256d( + _mm256_mask_permutex2var_pd((__m256d){-1.0, -2.0, -3.0, -4.0}, 0x2, (__m256i)(__v4di){0, 4, 1, 5}, (__m256d){10.0, 20.0, 30.0, 40.0}), + -1.0, 10.0, -3.0, -4.0)); __m256d test_mm256_maskz_permutex2var_pd(__mmask8 __U, __m256d __A, __m256i __I, __m256d __B) { // CHECK-LABEL: test_mm256_maskz_permutex2var_pd // CHECK: @llvm.x86.avx512.vpermi2var.pd.256 // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}} return _mm256_maskz_permutex2var_pd(__U,__A,__I,__B); } +TEST_CONSTEXPR(match_m256d( + _mm256_maskz_permutex2var_pd(0x2, (__m256d){1.0, 2.0, 3.0, 4.0}, (__m256i)(__v4di){0, 4, 1, 5}, (__m256d){10.0, 20.0, 30.0, 40.0}), + 0.0, 10.0, 0.0, 0.0)); __m128 test_mm_permutex2var_ps(__m128 __A, __m128i __I, __m128 __B) { // CHECK-LABEL: test_mm_permutex2var_ps // CHECK: @llvm.x86.avx512.vpermi2var.ps.128 return _mm_permutex2var_ps(__A,__I,__B); } +TEST_CONSTEXPR(match_m128( + _mm_permutex2var_ps((__m128){1.f, 2.f, 3.f, 4.f}, (__m128i)(__v4si){0, 3, 4, 6}, (__m128){10.f, 20.f, 30.f, 40.f}), + 1.f, 4.f, 10.f, 30.f)); __m128 test_mm_mask_permutex2var_ps(__m128 __A, __mmask8 __U, __m128i __I, __m128 __B) { // CHECK-LABEL: test_mm_mask_permutex2var_ps // CHECK: @llvm.x86.avx512.vpermi2var.ps.128 // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}} return _mm_mask_permutex2var_ps(__A,__U,__I,__B); } +TEST_CONSTEXPR(match_m128( + _mm_mask_permutex2var_ps((__m128){-1.f, -2.f, -3.f, -4.f}, 0x0A, (__m128i)(__v4si){0, 3, 4, 6}, (__m128){10.f, 20.f, 30.f, 40.f}), + -1.f, -4.f, -3.f, 30.f)); __m128 test_mm_maskz_permutex2var_ps(__mmask8 __U, __m128 __A, __m128i __I, __m128 __B) { // CHECK-LABEL: test_mm_maskz_permutex2var_ps // CHECK: @llvm.x86.avx512.vpermi2var.ps.128 // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}} return _mm_maskz_permutex2var_ps(__U,__A,__I,__B); } +TEST_CONSTEXPR(match_m128( + _mm_maskz_permutex2var_ps(0x0A, (__m128){1.f, 2.f, 3.f, 4.f}, (__m128i)(__v4si){0, 3, 4, 6}, (__m128){10.f, 20.f, 30.f, 40.f}), + 0.f, 4.f, 0.f, 30.f)); __m256 test_mm256_permutex2var_ps(__m256 __A, __m256i __I, __m256 __B) { // CHECK-LABEL: test_mm256_permutex2var_ps // CHECK: @llvm.x86.avx512.vpermi2var.ps.256 return _mm256_permutex2var_ps(__A,__I,__B); } +TEST_CONSTEXPR(match_m256( + _mm256_permutex2var_ps((__m256){0.f, 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f}, + (__m256i)(__v8si){0, 7, 8, 15, 1, 9, 2, 10}, + (__m256){10.f, 11.f, 12.f, 13.f, 14.f, 15.f, 16.f, 17.f}), + 0.f, 7.f, 10.f, 17.f, 1.f, 11.f, 2.f, 12.f)); __m256 test_mm256_mask_permutex2var_ps(__m256 __A, __mmask8 __U, __m256i __I, __m256 __B) { // CHECK-LABEL: test_mm256_mask_permutex2var_ps // CHECK: @llvm.x86.avx512.vpermi2var.ps.256 // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}} return 
_mm256_mask_permutex2var_ps(__A,__U,__I,__B); } +TEST_CONSTEXPR(match_m256( + _mm256_mask_permutex2var_ps((__m256){-1.f, -2.f, -3.f, -4.f, -5.f, -6.f, -7.f, -8.f}, 0xAA, (__m256i)(__v8si){0, 7, 8, 15, 1, 9, 2, 10}, (__m256){10.f, 11.f, 12.f, 13.f, 14.f, 15.f, 16.f, 17.f}), + -1.f, -8.f, -3.f, 17.f, -5.f, 11.f, -7.f, 12.f)); __m256 test_mm256_maskz_permutex2var_ps(__mmask8 __U, __m256 __A, __m256i __I, __m256 __B) { // CHECK-LABEL: test_mm256_maskz_permutex2var_ps // CHECK: @llvm.x86.avx512.vpermi2var.ps.256 // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}} return _mm256_maskz_permutex2var_ps(__U,__A,__I,__B); } +TEST_CONSTEXPR(match_m256( + _mm256_maskz_permutex2var_ps(0xAA, (__m256){0.f, 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f}, (__m256i)(__v8si){0, 7, 8, 15, 1, 9, 2, 10}, (__m256){10.f, 11.f, 12.f, 13.f, 14.f, 15.f, 16.f, 17.f}), + 0.f, 7.f, 0.f, 17.f, 0.f, 11.f, 0.f, 12.f)); __m128i test_mm_permutex2var_epi64(__m128i __A, __m128i __I, __m128i __B) { // CHECK-LABEL: test_mm_permutex2var_epi64 // CHECK: @llvm.x86.avx512.vpermi2var.q.128 return _mm_permutex2var_epi64(__A,__I,__B); } +TEST_CONSTEXPR(match_v2di( + _mm_permutex2var_epi64((__m128i)(__v2di){10, 20}, (__m128i)(__v2di){0, 3}, (__m128i)(__v2di){100, 200}), + 10, 200)); __m128i test_mm_mask_permutex2var_epi64(__m128i __A, __mmask8 __U, __m128i __I, __m128i __B) { // CHECK-LABEL: test_mm_mask_permutex2var_epi64 // CHECK: @llvm.x86.avx512.vpermi2var.q.128 // CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}} return _mm_mask_permutex2var_epi64(__A,__U,__I,__B); } +TEST_CONSTEXPR(match_v2di( + _mm_mask_permutex2var_epi64((__m128i)(__v2di){-1, -2}, 0x2, (__m128i)(__v2di){0, 3}, (__m128i)(__v2di){100, 200}), + -1, 200)); __m128i test_mm_maskz_permutex2var_epi64(__mmask8 __U, __m128i __A, __m128i __I, __m128i __B) { // CHECK-LABEL: test_mm_maskz_permutex2var_epi64 // CHECK: @llvm.x86.avx512.vpermi2var.q.128 // CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}} return _mm_maskz_permutex2var_epi64(__U,__A,__I,__B); } +TEST_CONSTEXPR(match_v2di( + _mm_maskz_permutex2var_epi64(0x2, (__m128i)(__v2di){10, 20}, (__m128i)(__v2di){0, 3}, (__m128i)(__v2di){100, 200}), + 0, 200)); __m256i test_mm256_permutex2var_epi64(__m256i __A, __m256i __I, __m256i __B) { // CHECK-LABEL: test_mm256_permutex2var_epi64 // CHECK: @llvm.x86.avx512.vpermi2var.q.256 return _mm256_permutex2var_epi64(__A,__I,__B); } +TEST_CONSTEXPR(match_v4di( + _mm256_permutex2var_epi64((__m256i)(__v4di){0, 10, 20, 30}, (__m256i)(__v4di){0, 1, 4, 5}, (__m256i)(__v4di){100, 110, 120, 130}), + 0, 10, 100, 110)); __m256i test_mm256_mask_permutex2var_epi64(__m256i __A, __mmask8 __U, __m256i __I, __m256i __B) { // CHECK-LABEL: test_mm256_mask_permutex2var_epi64 // CHECK: @llvm.x86.avx512.vpermi2var.q.256 // CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}} return _mm256_mask_permutex2var_epi64(__A,__U,__I,__B); } +TEST_CONSTEXPR(match_v4di( + _mm256_mask_permutex2var_epi64((__m256i)(__v4di){-1, -2, -3, -4}, 0x5, (__m256i)(__v4di){0, 1, 4, 5}, (__m256i)(__v4di){100, 110, 120, 130}), + -1, -2, 100, -4)); __m256i test_mm256_maskz_permutex2var_epi64(__mmask8 __U, __m256i __A, __m256i __I, __m256i __B) { // CHECK-LABEL: test_mm256_maskz_permutex2var_epi64 // CHECK: @llvm.x86.avx512.vpermi2var.q.256 // CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}} return _mm256_maskz_permutex2var_epi64(__U,__A,__I,__B); } +TEST_CONSTEXPR(match_v4di( + _mm256_maskz_permutex2var_epi64(0x5, (__m256i)(__v4di){0, 10, 20, 
30}, (__m256i)(__v4di){0, 1, 4, 5}, (__m256i)(__v4di){100, 110, 120, 130}),
+ 0, 0, 100, 0));
 __m128i test_mm_mask_cvtepi8_epi32(__m128i __W, __mmask8 __U, __m128i __A) {
 // CHECK-LABEL: test_mm_mask_cvtepi8_epi32
 // CHECK: sext <4 x i8> %{{.*}} to <4 x i32>
@@ -10472,6 +10589,17 @@ __m256i test_mm256_maskz_shuffle_epi32(__mmask8 __U, __m256i __A) {
 TEST_CONSTEXPR(match_v8si(_mm256_maskz_shuffle_epi32(0x33u, ((__m256i)(__v8si){0,1,2,3,4,5,6,7}), 2), 2,0,0,0, 6,4,0,0));
 TEST_CONSTEXPR(match_v8si(_mm256_maskz_shuffle_epi32(0xAAu, ((__m256i)(__v8si){0,1,2,3,4,5,6,7}), 2), 0,0,0,0, 0,4,0,4));
 TEST_CONSTEXPR(match_v8si(_mm256_maskz_shuffle_epi32(0xFFu, ((__m256i)(__v8si){0,1,2,3,4,5,6,7}), 2), 2,0,0,0, 6,4,4,4));
+TEST_CONSTEXPR(match_v8si(
+ _mm256_permutex2var_epi32((__m256i)(__v8si){1, 2, 3, 4, 5, 6, 7, 8},
+ (__m256i)(__v8si){0, 7, 8, 15, 1, 9, 2, 10},
+ (__m256i)(__v8si){101, 102, 103, 104, 105, 106, 107, 108}),
+ 1, 8, 101, 108, 2, 102, 3, 103));
+TEST_CONSTEXPR(match_v8si(
+ _mm256_mask_permutex2var_epi32((__m256i)(__v8si){-1, -2, -3, -4, -5, -6, -7, -8},
+ 0xAA,
+ (__m256i)(__v8si){0, 7, 8, 15, 1, 9, 2, 10},
+ (__m256i)(__v8si){101, 102, 103, 104, 105, 106, 107, 108}),
+ -1, -8, -3, 108, -5, 102, -7, 103));
 __m128d test_mm_mask_mov_pd(__m128d __W, __mmask8 __U, __m128d __A) {
 // CHECK-LABEL: test_mm_mask_mov_pd
diff --git a/clang/test/CodeGen/X86/avx512vlbw-builtins.c b/clang/test/CodeGen/X86/avx512vlbw-builtins.c
index febef46458ae9..172a3cb219c8a 100644
--- a/clang/test/CodeGen/X86/avx512vlbw-builtins.c
+++ b/clang/test/CodeGen/X86/avx512vlbw-builtins.c
@@ -1887,6 +1887,67 @@ __m256i test_mm256_maskz_permutex2var_epi16(__mmask16 __U, __m256i __A, __m256i
 // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
 return _mm256_maskz_permutex2var_epi16(__U,__A,__I,__B);
 }
+
+TEST_CONSTEXPR(match_v8hi(
+ _mm_permutex2var_epi16((__m128i)(__v8hi){0, 10, 20, 30, 40, 50, 60, 70},
+ (__m128i)(__v8hi){0, 7, 8, 15, 1, 9, 2, 10},
+ (__m128i)(__v8hi){100, 110, 120, 130, 140, 150, 160,
+ 170}),
+ 0, 70, 100, 170, 10, 110, 20, 120));
+TEST_CONSTEXPR(match_v8hi(
+ _mm_mask_permutex2var_epi16((__m128i)(__v8hi){-1, -2, -3, -4, -5, -6, -7, -8},
+ 0xAA,
+ (__m128i)(__v8hi){0, 7, 8, 15, 1, 9, 2, 10},
+ (__m128i)(__v8hi){100, 110, 120, 130, 140, 150,
+ 160, 170}),
+ -1, -8, -3, 170, -5, 110, -7, 120));
+TEST_CONSTEXPR(match_v8hi(
+ _mm_maskz_permutex2var_epi16(0xAA,
+ (__m128i)(__v8hi){0, 10, 20, 30, 40, 50, 60, 70},
+ (__m128i)(__v8hi){0, 7, 8, 15, 1, 9, 2, 10},
+ (__m128i)(__v8hi){100, 110, 120, 130, 140, 150,
+ 160, 170}),
+ 0, 70, 0, 170, 0, 110, 0, 120));
+TEST_CONSTEXPR(match_v8hi(
+ _mm_mask2_permutex2var_epi16((__m128i)(__v8hi){0, 10, 20, 30, 40, 50, 60, 70},
+ (__m128i)(__v8hi){0, 7, 8, 15, 1, 9, 2, 10},
+ 0x55,
+ (__m128i)(__v8hi){100, 110, 120, 130, 140, 150,
+ 160, 170}),
+ 0, 7, 100, 15, 10, 9, 20, 10));
+TEST_CONSTEXPR(match_v16hi(
+ _mm256_permutex2var_epi16(
+ (__m256i)(__v16hi){0, 10, 20, 30, 40, 50, 60, 70,
+ 80, 90, 100, 110, 120, 130, 140, 150},
+ (__m256i)(__v16hi){0, 15, 16, 31, 1, 17, 2, 18,
+ 3, 19, 4, 20, 5, 21, 6, 22},
+ (__m256i)(__v16hi){200, 210, 220, 230, 240, 250, 260, 270,
+ 280, 290, 
300, 310, 320, 330, 340, 350}), + 0, 150, 200, 350, 10, 210, 20, 220, + 30, 230, 40, 240, 50, 250, 60, 260)); +TEST_CONSTEXPR(match_v16hi( + _mm256_mask_permutex2var_epi16( + (__m256i)(__v16hi){-1, -2, -3, -4, -5, -6, -7, -8, + -9, -10, -11, -12, -13, -14, -15, -16}, + 0xAAAA, + (__m256i)(__v16hi){0, 15, 16, 31, 1, 17, 2, 18, + 3, 19, 4, 20, 5, 21, 6, 22}, + (__m256i)(__v16hi){200, 210, 220, 230, 240, 250, 260, 270, + 280, 290, 300, 310, 320, 330, 340, 350}), + -1, -16, -3, 350, -5, 210, -7, 220, + -9, 230, -11, 240, -13, 250, -15, 260)); +TEST_CONSTEXPR(match_v16hi( + _mm256_maskz_permutex2var_epi16( + 0x5555, + (__m256i)(__v16hi){0, 10, 20, 30, 40, 50, 60, 70, + 80, 90, 100, 110, 120, 130, 140, 150}, + (__m256i)(__v16hi){0, 15, 16, 31, 1, 17, 2, 18, + 3, 19, 4, 20, 5, 21, 6, 22}, + (__m256i)(__v16hi){200, 210, 220, 230, 240, 250, 260, 270, + 280, 290, 300, 310, 320, 330, 340, 350}), + 0, 0, 200, 0, 10, 0, 20, 0, + 30, 0, 40, 0, 50, 0, 60, 0)); + __m128i test_mm_mask_maddubs_epi16(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y) { // CHECK-LABEL: test_mm_mask_maddubs_epi16 // CHECK: @llvm.x86.ssse3.pmadd.ub.sw @@ -3596,3 +3657,22 @@ void test_mm256_mask_cvtsepi16_storeu_epi8 (void * __P, __mmask16 __M, __m256i _ // CHECK: @llvm.x86.avx512.mask.pmovs.wb.mem.256 _mm256_mask_cvtsepi16_storeu_epi8 ( __P, __M, __A); } + + +TEST_CONSTEXPR(match_v16qu( + _mm_permutex2var_epi8((__m128i)(__v16qu){0, 10, 20, 30, 40, 50, 60, 70, + 80, 90, 100, 110, 120, 127, 126, 125}, + (__m128i)(__v16qu){0, 16, 1, 17, 2, 18, 3, 19, + 4, 20, 5, 21, 6, 22, 7, 23}, + (__m128i)(__v16qu){100, 110, 120, 130, 140, 150, 160, 170, + 180, 190, 200, 210, 220, 230, 240, 250}), + 0, 100, 10, 110, 20, 120, 30, 130, + 40, 140, 50, 150, 60, 160, 70, 170)); +TEST_CONSTEXPR(match_v32qu( + _mm256_permutex2var_epi8((__m256i)(__v32qu){0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 127, 126, 125, 124, 123, 122, 121, 120, 119, 118, 117, 116, 115, 114, 113, 112, 111, 110, 109}, + (__m256i)(__v32qu){0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39, 8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47}, + (__m256i)(__v32qu){200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231}), + 0, 200, 10, 201, 20, 202, 30, 203, + 40, 204, 50, 205, 60, 206, 70, 207, + 80, 208, 90, 209, 100, 210, 110, 211, + 120, 212, 127, 213, 126, 214, 125, 215)); From 35ee3c6f72ba5aa26299d693f866385f23e4d330 Mon Sep 17 00:00:00 2001 From: Durgadoss R <durgadossr@nvidia.com> Date: Wed, 5 Nov 2025 15:38:24 +0530 Subject: [PATCH 294/313] [MLIR][NVVM] Update mbarrier Ops to use AnyTypeOf[] (2/n) (#165993) This is a follow up of PR #165558. (1/n) This patch updates the below mbarrier Ops to use AnyTypeOf[] construct: ``` * mbarrier.arrive * mbarrier.arrive.noComplete * mbarrier.test.wait * cp.async.mbarrier.arrive ``` * Updated existing tests accordingly. * Verified locally that there are no new regressions in the `integration` tests. * TODO: Two more Ops remain and will be migrated in a subsequent PR. 
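As an illustration of the unified syntax: the dedicated `*.shared` variants
(e.g. `nvvm.mbarrier.arrive.shared`) are removed, and the address space of
the pointer operand now selects the intrinsic during LLVM translation. A
sketch only (`%shm` and `%gen` are placeholder SSA values; the exact IR is
in the updated tests):

```
%t0 = nvvm.mbarrier.arrive %shm : !llvm.ptr<3> -> i64
%t1 = nvvm.mbarrier.arrive %gen : !llvm.ptr -> i64
%ok = nvvm.mbarrier.test.wait %shm, %t0 : !llvm.ptr<3>, i64 -> i1
```
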
Signed-off-by: Durgadoss R <durgadossr@nvidia.com> --- .../Optimizer/Builder/CUDAIntrinsicCall.cpp | 3 +- flang/test/Lower/CUDA/cuda-device-proc.cuf | 2 +- mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td | 142 +++++++----------- .../Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp | 26 +--- mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp | 82 ++++++++-- .../Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir | 10 +- .../Conversion/NVVMToLLVM/nvvm-to-llvm.mlir | 8 +- mlir/test/Dialect/LLVMIR/nvvm.mlir | 12 +- mlir/test/Target/LLVMIR/nvvmir.mlir | 4 +- 9 files changed, 152 insertions(+), 137 deletions(-) diff --git a/flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp b/flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp index 4e276a72897fe..6312e61f5e62a 100644 --- a/flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp +++ b/flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp @@ -891,8 +891,7 @@ CUDAIntrinsicLibrary::genBarrierArrive(mlir::Type resultType, assert(args.size() == 1); mlir::Value barrier = convertPtrToNVVMSpace( builder, loc, args[0], mlir::NVVM::NVVMMemorySpace::Shared); - return mlir::NVVM::MBarrierArriveSharedOp::create(builder, loc, resultType, - barrier) + return mlir::NVVM::MBarrierArriveOp::create(builder, loc, resultType, barrier) .getResult(); } diff --git a/flang/test/Lower/CUDA/cuda-device-proc.cuf b/flang/test/Lower/CUDA/cuda-device-proc.cuf index 038aa0a058277..2d2c801b48f4d 100644 --- a/flang/test/Lower/CUDA/cuda-device-proc.cuf +++ b/flang/test/Lower/CUDA/cuda-device-proc.cuf @@ -440,7 +440,7 @@ end subroutine ! CHECK: %[[LLVM_PTR:.*]] = fir.convert %[[DECL_SHARED]]#0 : (!fir.ref<i64>) -> !llvm.ptr ! CHECK: %[[SHARED_PTR:.*]] = llvm.addrspacecast %[[LLVM_PTR]] : !llvm.ptr to !llvm.ptr<3> -! CHECK: %{{.*}} = nvvm.mbarrier.arrive.shared %[[SHARED_PTR]] : !llvm.ptr<3> -> i64 +! CHECK: %{{.*}} = nvvm.mbarrier.arrive %[[SHARED_PTR]] : !llvm.ptr<3> -> i64 ! CHECK: %[[LLVM_PTR:.*]] = fir.convert %[[DECL_SHARED]]#0 : (!fir.ref<i64>) -> !llvm.ptr ! CHECK: %[[SHARED_PTR:.*]] = llvm.addrspacecast %[[LLVM_PTR]] : !llvm.ptr to !llvm.ptr<3> diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td index 5f87e5c07e56e..10f0cc254ea97 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td @@ -658,8 +658,8 @@ def NVVM_MBarrierInvalOp : NVVM_Op<"mbarrier.inval">, } def NVVM_MBarrierArriveOp : NVVM_Op<"mbarrier.arrive">, - Results<(outs LLVM_Type:$res)>, - Arguments<(ins LLVM_AnyPointer:$addr)> { + Results<(outs I64:$res)>, + Arguments<(ins AnyTypeOf<[LLVM_PointerGeneric, LLVM_PointerShared]>:$addr)> { let summary = "MBarrier Arrive Operation"; let description = [{ The `nvvm.mbarrier.arrive` operation performs an arrive-on operation on the @@ -676,36 +676,32 @@ def NVVM_MBarrierArriveOp : NVVM_Op<"mbarrier.arrive">, value are implementation-specific. The operation takes the following operand: - - `addr`: A pointer to the memory location of the *mbarrier object*. Uses generic - addressing, but the address must still be in the shared memory space. + - `addr`: A pointer to the memory location of the *mbarrier object*. The `addr` + must be a pointer to generic or shared::cta memory. When it is generic, the + underlying address must be within the shared::cta memory space; otherwise + the behavior is undefined. 
[For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-arrive) }]; - string llvmBuilder = [{ - $res = createIntrinsicCall(builder, llvm::Intrinsic::nvvm_mbarrier_arrive, {$addr}); - }]; let assemblyFormat = "$addr attr-dict `:` type($addr) `->` type($res)"; -} -def NVVM_MBarrierArriveSharedOp : NVVM_Op<"mbarrier.arrive.shared">, - Results<(outs LLVM_Type:$res)>, - Arguments<(ins LLVM_PointerShared:$addr)> { - let summary = "Shared MBarrier Arrive Operation"; - let description = [{ - This Op is the same as `nvvm.mbarrier.arrive` except that the *mbarrier object* - should be accessed using a shared-memory pointer instead of a generic-memory pointer. - - [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-arrive) + let extraClassDeclaration = [{ + static mlir::NVVM::IDArgPair + getIntrinsicIDAndArgs(Operation &op, LLVM::ModuleTranslation &mt, + llvm::IRBuilderBase& builder); }]; + string llvmBuilder = [{ - $res = createIntrinsicCall(builder, llvm::Intrinsic::nvvm_mbarrier_arrive_shared, {$addr}); + auto [id, args] = NVVM::MBarrierArriveOp::getIntrinsicIDAndArgs( + *op, moduleTranslation, builder); + $res = createIntrinsicCall(builder, id, args); }]; - let assemblyFormat = "$addr attr-dict `:` qualified(type($addr)) `->` type($res)"; } def NVVM_MBarrierArriveNocompleteOp : NVVM_Op<"mbarrier.arrive.nocomplete">, - Results<(outs LLVM_Type:$res)>, - Arguments<(ins LLVM_AnyPointer:$addr, I32:$count)> { + Results<(outs I64:$res)>, + Arguments<(ins AnyTypeOf<[LLVM_PointerGeneric, LLVM_PointerShared]>:$addr, + I32:$count)> { let summary = "MBarrier Arrive No-Complete Operation"; let description = [{ The `nvvm.mbarrier.arrive.nocomplete` operation performs an arrive-on operation @@ -723,33 +719,29 @@ def NVVM_MBarrierArriveNocompleteOp : NVVM_Op<"mbarrier.arrive.nocomplete">, captures the phase of the *mbarrier object* prior to the arrive-on operation. The operation takes the following operands: - - `addr`: A pointer to the memory location of the *mbarrier object*. Uses generic - addressing, but the address must still be in the shared memory space. + - `addr`: A pointer to the memory location of the *mbarrier object*. The `addr` + must be a pointer to generic or shared::cta memory. When it is generic, the + underlying address must be within the shared::cta memory space; otherwise + the behavior is undefined. - `count`: Integer specifying the count argument to the arrive-on operation. Must be in the valid range as specified in the *mbarrier object* contents. 
[For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-arrive) }]; - string llvmBuilder = [{ - $res = createIntrinsicCall(builder, llvm::Intrinsic::nvvm_mbarrier_arrive_noComplete, {$addr, $count}); - }]; - let assemblyFormat = "$addr `,` $count attr-dict `:` type(operands) `->` type($res)"; -} -def NVVM_MBarrierArriveNocompleteSharedOp : NVVM_Op<"mbarrier.arrive.nocomplete.shared">, - Results<(outs LLVM_Type:$res)>, - Arguments<(ins LLVM_PointerShared:$addr, I32:$count)> { - let summary = "Shared MBarrier Arrive No-Complete Operation"; - let description = [{ - This Op is the same as `nvvm.mbarrier.arrive.nocomplete` except that the *mbarrier object* - should be accessed using a shared-memory pointer instead of a generic-memory pointer. + let assemblyFormat = "$addr `,` $count attr-dict `:` type(operands) `->` type($res)"; - [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-arrive) + let extraClassDeclaration = [{ + static mlir::NVVM::IDArgPair + getIntrinsicIDAndArgs(Operation &op, LLVM::ModuleTranslation &mt, + llvm::IRBuilderBase& builder); }]; + string llvmBuilder = [{ - $res = createIntrinsicCall(builder, llvm::Intrinsic::nvvm_mbarrier_arrive_noComplete_shared, {$addr, $count}); + auto [id, args] = NVVM::MBarrierArriveNocompleteOp::getIntrinsicIDAndArgs( + *op, moduleTranslation, builder); + $res = createIntrinsicCall(builder, id, args); }]; - let assemblyFormat = "$addr `,` $count attr-dict `:` type(operands) `->` type($res)"; } def NVVM_MBarrierArriveExpectTxOp : NVVM_PTXBuilder_Op<"mbarrier.arrive.expect_tx">, @@ -898,8 +890,9 @@ def NVVM_MBarrierTryWaitParitySharedOp : NVVM_PTXBuilder_Op<"mbarrier.try_wait.p } def NVVM_MBarrierTestWaitOp : NVVM_Op<"mbarrier.test.wait">, - Results<(outs LLVM_Type:$res)>, - Arguments<(ins LLVM_AnyPointer:$addr, LLVM_Type:$state)> { + Results<(outs I1:$res)>, + Arguments<(ins AnyTypeOf<[LLVM_PointerGeneric, LLVM_PointerShared]>:$addr, + I64:$state)> { let summary = "MBarrier Non-Blocking Test Wait Operation"; let description = [{ The `nvvm.mbarrier.test.wait` operation performs a non-blocking test for the @@ -946,26 +939,20 @@ def NVVM_MBarrierTestWaitOp : NVVM_Op<"mbarrier.test.wait">, [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions-mbarrier-test-wait-try-wait) }]; - string llvmBuilder = [{ - $res = createIntrinsicCall(builder, llvm::Intrinsic::nvvm_mbarrier_test_wait, {$addr, $state}); - }]; - let assemblyFormat = "$addr `,` $state attr-dict `:` type(operands) `->` type($res)"; -} -def NVVM_MBarrierTestWaitSharedOp : NVVM_Op<"mbarrier.test.wait.shared">, - Results<(outs LLVM_Type:$res)>, - Arguments<(ins LLVM_PointerShared:$addr, LLVM_Type:$state)> { - let summary = "Shared MBarrier Non-Blocking Test Wait Operation"; - let description = [{ - This Op is the same as `nvvm.mbarrier.test.wait` except that the *mbarrier object* - should be accessed using a shared-memory pointer instead of a generic-memory pointer. 
+ let assemblyFormat = "$addr `,` $state attr-dict `:` type(operands) `->` type($res)";

+ let extraClassDeclaration = [{
+ static mlir::NVVM::IDArgPair
+ getIntrinsicIDAndArgs(Operation &op, LLVM::ModuleTranslation &mt,
+ llvm::IRBuilderBase& builder);
+ }];
+
 string llvmBuilder = [{
- $res = createIntrinsicCall(builder, llvm::Intrinsic::nvvm_mbarrier_test_wait_shared, {$addr, $state});
+ auto [id, args] = NVVM::MBarrierTestWaitOp::getIntrinsicIDAndArgs(
+ *op, moduleTranslation, builder);
+ $res = createIntrinsicCall(builder, id, args);
 }];
- let assemblyFormat = "$addr `,` $state attr-dict `:` type(operands) `->` type($res)";
}

//===----------------------------------------------------------------------===//
@@ -1541,47 +1528,30 @@ def NVVM_CpAsyncMBarrierArriveOp : NVVM_Op<"cp.async.mbarrier.arrive"> {
 The `cp.async.mbarrier.arrive` Op makes the *mbarrier object* track
 all prior cp.async operations initiated by the executing thread.
 The `addr` operand specifies the address of the *mbarrier object*
- in generic address space. The `noinc` attr impacts how the
- mbarrier's state is updated.
+ in generic or shared::cta address space. When it is generic, the
+ underlying memory must fall within the shared::cta space;
+ otherwise the behavior is undefined. The `noinc` attr impacts
+ how the mbarrier's state is updated.

 [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-cp-async-mbarrier-arrive)
 }];
- let assemblyFormat = "$addr attr-dict `:` type(operands)";

 let arguments = (ins
- LLVM_AnyPointer:$addr, DefaultValuedAttr<I1Attr, "0">:$noinc);
-
- string llvmBuilder = [{
- auto intId = $noinc ?
- llvm::Intrinsic::nvvm_cp_async_mbarrier_arrive_noinc :
- llvm::Intrinsic::nvvm_cp_async_mbarrier_arrive;
-
- createIntrinsicCall(builder, intId, {$addr});
- }];
-}
+ AnyTypeOf<[LLVM_PointerGeneric, LLVM_PointerShared]>:$addr,
+ DefaultValuedAttr<I1Attr, "0">:$noinc);

-def NVVM_CpAsyncMBarrierArriveSharedOp : NVVM_Op<"cp.async.mbarrier.arrive.shared"> {
- let summary = "NVVM Dialect Op for cp.async.mbarrier.arrive.shared";
- let description = [{
- The `cp.async.mbarrier.arrive.shared` Op makes the *mbarrier object*
- track all prior cp.async operations initiated by the executing thread.
- The `addr` operand specifies the address of the *mbarrier object* in
- shared memory. The `noinc` attr impacts how the mbarrier's state
- is updated.
-
- [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-cp-async-mbarrier-arrive)
- }];
 let assemblyFormat = "$addr attr-dict `:` type(operands)";

- let arguments = (ins
- LLVM_PointerShared:$addr, DefaultValuedAttr<I1Attr, "0">:$noinc);
+ let extraClassDeclaration = [{
+ static mlir::NVVM::IDArgPair
+ getIntrinsicIDAndArgs(Operation &op, LLVM::ModuleTranslation &mt,
+ llvm::IRBuilderBase& builder);
+ }];

 string llvmBuilder = [{
- auto intId = $noinc ? 
- llvm::Intrinsic::nvvm_cp_async_mbarrier_arrive_noinc_shared : - llvm::Intrinsic::nvvm_cp_async_mbarrier_arrive_shared; - - createIntrinsicCall(builder, intId, {$addr}); + auto [id, args] = NVVM::CpAsyncMBarrierArriveOp::getIntrinsicIDAndArgs( + *op, moduleTranslation, builder); + createIntrinsicCall(builder, id, args); }]; } diff --git a/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp b/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp index ec182f1db48ac..9348d3c172a07 100644 --- a/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp +++ b/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp @@ -865,13 +865,7 @@ struct NVGPUMBarrierArriveLowering adaptor.getMbarId(), rewriter); Type tokenType = getTypeConverter()->convertType( nvgpu::MBarrierTokenType::get(op->getContext())); - if (isMbarrierShared(op.getBarriers().getType())) { - rewriter.replaceOpWithNewOp<NVVM::MBarrierArriveSharedOp>(op, tokenType, - barrier); - } else { - rewriter.replaceOpWithNewOp<NVVM::MBarrierArriveOp>(op, tokenType, - barrier); - } + rewriter.replaceOpWithNewOp<NVVM::MBarrierArriveOp>(op, tokenType, barrier); return success(); } }; @@ -892,13 +886,8 @@ struct NVGPUMBarrierArriveNoCompleteLowering Type tokenType = getTypeConverter()->convertType( nvgpu::MBarrierTokenType::get(op->getContext())); Value count = truncToI32(b, adaptor.getCount()); - if (isMbarrierShared(op.getBarriers().getType())) { - rewriter.replaceOpWithNewOp<NVVM::MBarrierArriveNocompleteSharedOp>( - op, tokenType, barrier, count); - } else { - rewriter.replaceOpWithNewOp<NVVM::MBarrierArriveNocompleteOp>( - op, tokenType, barrier, count); - } + rewriter.replaceOpWithNewOp<NVVM::MBarrierArriveNocompleteOp>( + op, tokenType, barrier, count); return success(); } }; @@ -915,13 +904,8 @@ struct NVGPUMBarrierTestWaitLowering getMbarrierPtr(b, op.getBarriers().getType(), adaptor.getBarriers(), adaptor.getMbarId(), rewriter); Type retType = rewriter.getI1Type(); - if (isMbarrierShared(op.getBarriers().getType())) { - rewriter.replaceOpWithNewOp<NVVM::MBarrierTestWaitSharedOp>( - op, retType, barrier, adaptor.getToken()); - } else { - rewriter.replaceOpWithNewOp<NVVM::MBarrierTestWaitOp>( - op, retType, barrier, adaptor.getToken()); - } + rewriter.replaceOpWithNewOp<NVVM::MBarrierTestWaitOp>(op, retType, barrier, + adaptor.getToken()); return success(); } }; diff --git a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp index 262d9b753a2d7..d43f8815be16d 100644 --- a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp +++ b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp @@ -1752,15 +1752,21 @@ std::string NVVM::MBarrierInitOp::getPtx() { // getIntrinsicID/getIntrinsicIDAndArgs methods //===----------------------------------------------------------------------===// +static bool isPtrInAddrSpace(mlir::Value ptr, NVVMMemorySpace targetAS) { + auto ptrTy = llvm::cast<LLVM::LLVMPointerType>(ptr.getType()); + return ptrTy.getAddressSpace() == static_cast<unsigned>(targetAS); +} + +static bool isPtrInSharedCTASpace(mlir::Value ptr) { + return isPtrInAddrSpace(ptr, NVVMMemorySpace::Shared); +} + mlir::NVVM::IDArgPair MBarrierInitOp::getIntrinsicIDAndArgs( Operation &op, LLVM::ModuleTranslation &mt, llvm::IRBuilderBase &builder) { auto thisOp = cast<NVVM::MBarrierInitOp>(op); - unsigned addressSpace = - llvm::cast<LLVM::LLVMPointerType>(thisOp.getAddr().getType()) - .getAddressSpace(); - llvm::Intrinsic::ID id = (addressSpace == NVVMMemorySpace::Shared) - ? 
llvm::Intrinsic::nvvm_mbarrier_init_shared - : llvm::Intrinsic::nvvm_mbarrier_init; + bool isShared = isPtrInSharedCTASpace(thisOp.getAddr()); + llvm::Intrinsic::ID id = isShared ? llvm::Intrinsic::nvvm_mbarrier_init_shared + : llvm::Intrinsic::nvvm_mbarrier_init; // Fill the Intrinsic Args llvm::SmallVector<llvm::Value *> args; @@ -1773,16 +1779,72 @@ mlir::NVVM::IDArgPair MBarrierInitOp::getIntrinsicIDAndArgs( mlir::NVVM::IDArgPair MBarrierInvalOp::getIntrinsicIDAndArgs( Operation &op, LLVM::ModuleTranslation &mt, llvm::IRBuilderBase &builder) { auto thisOp = cast<NVVM::MBarrierInvalOp>(op); - unsigned addressSpace = - llvm::cast<LLVM::LLVMPointerType>(thisOp.getAddr().getType()) - .getAddressSpace(); - llvm::Intrinsic::ID id = (addressSpace == NVVMMemorySpace::Shared) + bool isShared = isPtrInSharedCTASpace(thisOp.getAddr()); + llvm::Intrinsic::ID id = isShared ? llvm::Intrinsic::nvvm_mbarrier_inval_shared : llvm::Intrinsic::nvvm_mbarrier_inval; return {id, {mt.lookupValue(thisOp.getAddr())}}; } +mlir::NVVM::IDArgPair MBarrierArriveOp::getIntrinsicIDAndArgs( + Operation &op, LLVM::ModuleTranslation &mt, llvm::IRBuilderBase &builder) { + auto thisOp = cast<NVVM::MBarrierArriveOp>(op); + bool isShared = isPtrInSharedCTASpace(thisOp.getAddr()); + llvm::Intrinsic::ID id = isShared + ? llvm::Intrinsic::nvvm_mbarrier_arrive_shared + : llvm::Intrinsic::nvvm_mbarrier_arrive; + + return {id, {mt.lookupValue(thisOp.getAddr())}}; +} + +mlir::NVVM::IDArgPair MBarrierArriveNocompleteOp::getIntrinsicIDAndArgs( + Operation &op, LLVM::ModuleTranslation &mt, llvm::IRBuilderBase &builder) { + auto thisOp = cast<NVVM::MBarrierArriveNocompleteOp>(op); + bool isShared = isPtrInSharedCTASpace(thisOp.getAddr()); + llvm::Intrinsic::ID id = + isShared ? llvm::Intrinsic::nvvm_mbarrier_arrive_noComplete_shared + : llvm::Intrinsic::nvvm_mbarrier_arrive_noComplete; + // Fill the Intrinsic Args + llvm::SmallVector<llvm::Value *> args; + args.push_back(mt.lookupValue(thisOp.getAddr())); + args.push_back(mt.lookupValue(thisOp.getCount())); + + return {id, std::move(args)}; +} + +mlir::NVVM::IDArgPair MBarrierTestWaitOp::getIntrinsicIDAndArgs( + Operation &op, LLVM::ModuleTranslation &mt, llvm::IRBuilderBase &builder) { + auto thisOp = cast<NVVM::MBarrierTestWaitOp>(op); + bool isShared = isPtrInSharedCTASpace(thisOp.getAddr()); + llvm::Intrinsic::ID id = isShared + ? llvm::Intrinsic::nvvm_mbarrier_test_wait_shared + : llvm::Intrinsic::nvvm_mbarrier_test_wait; + // Fill the Intrinsic Args + llvm::SmallVector<llvm::Value *> args; + args.push_back(mt.lookupValue(thisOp.getAddr())); + args.push_back(mt.lookupValue(thisOp.getState())); + + return {id, std::move(args)}; +} + +mlir::NVVM::IDArgPair CpAsyncMBarrierArriveOp::getIntrinsicIDAndArgs( + Operation &op, LLVM::ModuleTranslation &mt, llvm::IRBuilderBase &builder) { + auto thisOp = cast<NVVM::CpAsyncMBarrierArriveOp>(op); + bool isShared = isPtrInSharedCTASpace(thisOp.getAddr()); + + llvm::Intrinsic::ID id; + if (thisOp.getNoinc()) { + id = isShared ? llvm::Intrinsic::nvvm_cp_async_mbarrier_arrive_noinc_shared + : llvm::Intrinsic::nvvm_cp_async_mbarrier_arrive_noinc; + } else { + id = isShared ? 
llvm::Intrinsic::nvvm_cp_async_mbarrier_arrive_shared + : llvm::Intrinsic::nvvm_cp_async_mbarrier_arrive; + } + + return {id, {mt.lookupValue(thisOp.getAddr())}}; +} + #define CP_ASYNC_ID_IMPL(mod, size, suffix) \ llvm::Intrinsic::nvvm_cp_async_##mod##_shared_global_##size##suffix diff --git a/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir b/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir index 8cce6308018e2..dcf4ddb2dd48c 100644 --- a/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir +++ b/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir @@ -491,12 +491,12 @@ func.func @mbarrier() { // CHECK: %[[base2:.+]] = llvm.extractvalue %[[barStr]][1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)> // CHECK: %[[barPtr2:.+]] = llvm.getelementptr %[[base2]][%[[mid]]] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>, i64 - // CHECK: %[[token:.+]] = nvvm.mbarrier.arrive.shared %[[barPtr2]] + // CHECK: %[[token:.+]] = nvvm.mbarrier.arrive %[[barPtr2]] %token = nvgpu.mbarrier.arrive %barrier[%c0] : !barrierType -> !tokenType // CHECK: %[[base3:.+]] = llvm.extractvalue %[[barStr]][1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)> // CHECK: %[[barPtr3:.+]] = llvm.getelementptr %[[base3]][%[[mid]]] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>, i64 - // CHECK: nvvm.mbarrier.test.wait.shared %[[barPtr3]], %[[token]] + // CHECK: nvvm.mbarrier.test.wait %[[barPtr3]], %[[token]] %isDone = nvgpu.mbarrier.test.wait %barrier[%c0], %token : !barrierType, !tokenType func.return @@ -521,12 +521,12 @@ func.func @mbarrier_nocomplete() { // CHECK: %[[base2:.+]] = llvm.extractvalue %[[barStr]][1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)> // CHECK: %[[barPtr2:.+]] = llvm.getelementptr %[[base2]][%[[mid]]] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>, i64 - // CHECK: %[[token:.+]] = nvvm.mbarrier.arrive.nocomplete.shared %[[barPtr2]] + // CHECK: %[[token:.+]] = nvvm.mbarrier.arrive.nocomplete %[[barPtr2]] %token = nvgpu.mbarrier.arrive.nocomplete %barrier[%c0], %count : !barrierType -> !tokenType // CHECK: %[[base3:.+]] = llvm.extractvalue %[[barStr]][1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)> // CHECK: %[[barPtr3:.+]] = llvm.getelementptr %[[base3]][%[[mid]]] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>, i64 - // CHECK: nvvm.mbarrier.test.wait.shared %[[barPtr3]], %[[token]] + // CHECK: nvvm.mbarrier.test.wait %[[barPtr3]], %[[token]] %isDone = nvgpu.mbarrier.test.wait %barrier[%c0], %token : !barrierType, !tokenType func.return @@ -572,7 +572,7 @@ func.func @mbarrier_wait(%barriers : !nvgpu.mbarrier.group<memorySpace = #gpu.ad // CHECK: %[[S3:.+]] = builtin.unrealized_conversion_cast %[[S2]] : index to i64 // CHECK: %[[S4:.+]] = llvm.extractvalue %[[CARG0]][1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)> // CHECK: %[[S5:.+]] = llvm.getelementptr %[[S4]][%[[S3]]] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>, i64 -// CHECK: nvvm.mbarrier.test.wait.shared {{.*}}, %[[CARG1]] +// CHECK: nvvm.mbarrier.test.wait {{.*}}, %[[CARG1]] %mbarId = arith.remui %i, %numBarriers : index %isDone = nvgpu.mbarrier.test.wait %barriers[%mbarId], %token : !nvgpu.mbarrier.group<memorySpace = #gpu.address_space<workgroup>, num_barriers = 5>, !tokenType } diff --git a/mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir b/mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir index fbc4c0af60360..a9356c5cb60bb 100644 --- a/mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir +++ b/mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir @@ -88,10 +88,10 @@ 
func.func @cp_async_mbarrier_arrive(%bar_shared: !llvm.ptr<3>, %bar_gen: !llvm.p nvvm.cp.async.mbarrier.arrive %bar_gen : !llvm.ptr // CHECK: nvvm.cp.async.mbarrier.arrive %{{.*}} {noinc = true} nvvm.cp.async.mbarrier.arrive %bar_gen {noinc = true} : !llvm.ptr - // CHECK: nvvm.cp.async.mbarrier.arrive.shared %{{.*}} - nvvm.cp.async.mbarrier.arrive.shared %bar_shared : !llvm.ptr<3> - // CHECK: nvvm.cp.async.mbarrier.arrive.shared %{{.*}} {noinc = true} - nvvm.cp.async.mbarrier.arrive.shared %bar_shared {noinc = true} : !llvm.ptr<3> + // CHECK: nvvm.cp.async.mbarrier.arrive %{{.*}} + nvvm.cp.async.mbarrier.arrive %bar_shared : !llvm.ptr<3> + // CHECK: nvvm.cp.async.mbarrier.arrive %{{.*}} {noinc = true} + nvvm.cp.async.mbarrier.arrive %bar_shared {noinc = true} : !llvm.ptr<3> llvm.return } diff --git a/mlir/test/Dialect/LLVMIR/nvvm.mlir b/mlir/test/Dialect/LLVMIR/nvvm.mlir index 2505e56407c2b..cd7bd37da5763 100644 --- a/mlir/test/Dialect/LLVMIR/nvvm.mlir +++ b/mlir/test/Dialect/LLVMIR/nvvm.mlir @@ -445,8 +445,8 @@ llvm.func private @mbarrier_arrive(%barrier: !llvm.ptr) { } llvm.func private @mbarrier_arrive_shared(%barrier: !llvm.ptr<3>) { - // CHECK: nvvm.mbarrier.arrive.shared %{{.*}} : !llvm.ptr<3> - %0 = nvvm.mbarrier.arrive.shared %barrier : !llvm.ptr<3> -> i64 + // CHECK: nvvm.mbarrier.arrive %{{.*}} : !llvm.ptr<3> + %0 = nvvm.mbarrier.arrive %barrier : !llvm.ptr<3> -> i64 llvm.return } @@ -459,8 +459,8 @@ llvm.func private @mbarrier_arrive_nocomplete(%barrier: !llvm.ptr) { llvm.func private @mbarrier_arrive_nocomplete_shared(%barrier: !llvm.ptr<3>) { %count = nvvm.read.ptx.sreg.ntid.x : i32 - // CHECK: nvvm.mbarrier.arrive.nocomplete.shared %{{.*}} : !llvm.ptr<3> - %0 = nvvm.mbarrier.arrive.nocomplete.shared %barrier, %count : !llvm.ptr<3>, i32 -> i64 + // CHECK: nvvm.mbarrier.arrive.nocomplete %{{.*}} : !llvm.ptr<3> + %0 = nvvm.mbarrier.arrive.nocomplete %barrier, %count : !llvm.ptr<3>, i32 -> i64 llvm.return } @@ -472,8 +472,8 @@ llvm.func private @mbarrier_test_wait(%barrier: !llvm.ptr, %token : i64) -> i1 { llvm.func private @mbarrier_test_wait_shared(%barrier: !llvm.ptr<3>, %token : i64) { %count = nvvm.read.ptx.sreg.ntid.x : i32 - // CHECK: nvvm.mbarrier.test.wait.shared %{{.*}} - %isComplete = nvvm.mbarrier.test.wait.shared %barrier, %token : !llvm.ptr<3>, i64 -> i1 + // CHECK: nvvm.mbarrier.test.wait %{{.*}} + %isComplete = nvvm.mbarrier.test.wait %barrier, %token : !llvm.ptr<3>, i64 -> i1 llvm.return } diff --git a/mlir/test/Target/LLVMIR/nvvmir.mlir b/mlir/test/Target/LLVMIR/nvvmir.mlir index 9115de65ff0e8..3fc09f371a347 100644 --- a/mlir/test/Target/LLVMIR/nvvmir.mlir +++ b/mlir/test/Target/LLVMIR/nvvmir.mlir @@ -538,9 +538,9 @@ llvm.func @cp_async_mbarrier_arrive(%bar_shared: !llvm.ptr<3>, %bar_gen: !llvm.p // CHECK: call void @llvm.nvvm.cp.async.mbarrier.arrive.noinc(ptr %{{.*}}) nvvm.cp.async.mbarrier.arrive %bar_gen {noinc = true} : !llvm.ptr // CHECK: call void @llvm.nvvm.cp.async.mbarrier.arrive.shared(ptr addrspace(3) %{{.*}}) - nvvm.cp.async.mbarrier.arrive.shared %bar_shared : !llvm.ptr<3> + nvvm.cp.async.mbarrier.arrive %bar_shared : !llvm.ptr<3> // CHECK: call void @llvm.nvvm.cp.async.mbarrier.arrive.noinc.shared(ptr addrspace(3) %{{.*}}) - nvvm.cp.async.mbarrier.arrive.shared %bar_shared {noinc = true} : !llvm.ptr<3> + nvvm.cp.async.mbarrier.arrive %bar_shared {noinc = true} : !llvm.ptr<3> llvm.return } From 833983918d2f401886cee74174850b987eaf80b5 Mon Sep 17 00:00:00 2001 From: Jan Patrick Lehr <JanPatrick.Lehr@amd.com> Date: Wed, 5 Nov 2025 11:11:08 +0100 
Subject: [PATCH 295/313] Revert "CodeGen: Record MMOs in finalizeBundle" (#166520) Reverts llvm/llvm-project#166210 Buildbot failures in the libc on GPU bot: https://lab.llvm.org/buildbot/#/builders/10/builds/16711 --- llvm/lib/CodeGen/MIRParser/MIParser.cpp | 2 - llvm/lib/CodeGen/MachineInstrBundle.cpp | 6 - .../GlobalISel/insertelement-stack-lower.ll | 2 +- .../AMDGPU/GlobalISel/store-local.128.ll | 20 +- .../AMDGPU/GlobalISel/vni8-across-blocks.ll | 7 +- .../CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll | 9954 +++++++++-------- .../CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll | 363 +- .../CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll | 1503 +-- .../CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll | 295 +- .../CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll | 357 +- .../CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll | 370 +- .../CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll | 534 +- .../CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll | 839 +- .../CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll | 756 +- .../CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll | 1361 +-- llvm/test/CodeGen/AMDGPU/bf16.ll | 28 +- .../AMDGPU/buffer-fat-pointers-memcpy.ll | 49 +- .../CodeGen/AMDGPU/call-argument-types.ll | 12 +- llvm/test/CodeGen/AMDGPU/ds_write2.ll | 4 +- llvm/test/CodeGen/AMDGPU/ds_write2_a_v.ll | 4 +- llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll | 63 +- llvm/test/CodeGen/AMDGPU/finalizebundle.mir | 52 - .../AMDGPU/gfx-callable-return-types.ll | 153 +- llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll | 4 +- .../CodeGen/AMDGPU/hard-clauses-img-gfx11.mir | 4 +- .../CodeGen/AMDGPU/hard-clauses-img-gfx12.mir | 4 +- ...llvm.amdgcn.ds.gws.barrier-fastregalloc.ll | 21 +- .../AMDGPU/llvm.amdgcn.ds.gws.barrier.ll | 2 +- .../AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll | 4 +- .../AMDGPU/llvm.amdgcn.fdot2.f16.f16.ll | 8 +- .../AMDGPU/llvm.amdgcn.smfmac.gfx950.ll | 156 +- llvm/test/CodeGen/AMDGPU/load-constant-i1.ll | 18 +- llvm/test/CodeGen/AMDGPU/load-global-i16.ll | 6 +- llvm/test/CodeGen/AMDGPU/load-global-i32.ll | 10 +- llvm/test/CodeGen/AMDGPU/load-global-i8.ll | 15 +- llvm/test/CodeGen/AMDGPU/load-local-i16.ll | 214 +- ...er-buffer-fat-pointers-lastuse-metadata.ll | 31 +- ...uffer-fat-pointers-nontemporal-metadata.ll | 58 +- .../AMDGPU/lower-lds-struct-aa-memcpy.ll | 4 +- .../CodeGen/AMDGPU/lower-lds-struct-aa.ll | 8 +- llvm/test/CodeGen/AMDGPU/max.ll | 2 +- .../CodeGen/AMDGPU/memintrinsic-unroll.ll | 1971 ++-- llvm/test/CodeGen/AMDGPU/mixed-vmem-types.ll | 6 +- llvm/test/CodeGen/AMDGPU/packed-fp32.ll | 203 +- .../CodeGen/AMDGPU/postra-bundle-memops.mir | 5 +- .../postra-bundle-vimage-vsample-gfx12.mir | 4 +- .../AMDGPU/promote-constOffset-to-imm.ll | 41 +- llvm/test/CodeGen/AMDGPU/scratch-simple.ll | 2380 ++-- .../soft-clause-exceeds-register-budget.ll | 15 +- llvm/test/CodeGen/AMDGPU/spill-agpr.ll | 6 +- .../CodeGen/AMDGPU/spill-scavenge-offset.ll | 22 +- llvm/test/CodeGen/AMDGPU/stack-realign.ll | 2 +- .../Thumb2/mve-vpt-block-fold-vcmp.mir | 45 +- 53 files changed, 11293 insertions(+), 10710 deletions(-) diff --git a/llvm/lib/CodeGen/MIRParser/MIParser.cpp b/llvm/lib/CodeGen/MIRParser/MIParser.cpp index 434a579c3be3f..4795d81e3f348 100644 --- a/llvm/lib/CodeGen/MIRParser/MIParser.cpp +++ b/llvm/lib/CodeGen/MIRParser/MIParser.cpp @@ -1161,8 +1161,6 @@ bool MIParser::parse(MachineInstr *&MI) { MemOperands.push_back(MemOp); if (Token.isNewlineOrEOF()) break; - if (OpCode == TargetOpcode::BUNDLE && Token.is(MIToken::lbrace)) - break; if (Token.isNot(MIToken::comma)) return error("expected ',' before the next machine memory operand"); lex(); diff --git 
a/llvm/lib/CodeGen/MachineInstrBundle.cpp b/llvm/lib/CodeGen/MachineInstrBundle.cpp index a8dc614288f20..88d81993fbe55 100644 --- a/llvm/lib/CodeGen/MachineInstrBundle.cpp +++ b/llvm/lib/CodeGen/MachineInstrBundle.cpp @@ -137,7 +137,6 @@ void llvm::finalizeBundle(MachineBasicBlock &MBB, SmallSet<Register, 8> KilledUseSet; SmallSet<Register, 8> UndefUseSet; SmallVector<std::pair<Register, Register>> TiedOperands; - SmallVector<MachineInstr *> MemMIs; for (auto MII = FirstMI; MII != LastMI; ++MII) { // Debug instructions have no effects to track. if (MII->isDebugInstr()) @@ -201,9 +200,6 @@ void llvm::finalizeBundle(MachineBasicBlock &MBB, MIB.setMIFlag(MachineInstr::FrameSetup); if (MII->getFlag(MachineInstr::FrameDestroy)) MIB.setMIFlag(MachineInstr::FrameDestroy); - - if (MII->mayLoadOrStore()) - MemMIs.push_back(&*MII); } for (Register Reg : LocalDefs) { @@ -229,8 +225,6 @@ void llvm::finalizeBundle(MachineBasicBlock &MBB, assert(UseIdx < ExternUses.size()); MIB->tieOperands(DefIdx, LocalDefs.size() + UseIdx); } - - MIB->cloneMergedMemRefs(MF, MemMIs); } /// finalizeBundle - Same functionality as the previous finalizeBundle except diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll index 6076a2eec44bc..c2129c20e4543 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll @@ -33,6 +33,7 @@ define amdgpu_kernel void @v_insert_v64i32_varidx(ptr addrspace(1) %out.ptr, ptr ; GCN-NEXT: v_mov_b32_e32 v13, s49 ; GCN-NEXT: v_mov_b32_e32 v14, s50 ; GCN-NEXT: v_mov_b32_e32 v15, s51 +; GCN-NEXT: s_load_dwordx16 s[36:51], s[22:23], 0xc0 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:4 ; GCN-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:8 @@ -50,7 +51,6 @@ define amdgpu_kernel void @v_insert_v64i32_varidx(ptr addrspace(1) %out.ptr, ptr ; GCN-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:56 ; GCN-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:60 ; GCN-NEXT: v_mov_b32_e32 v0, s52 -; GCN-NEXT: s_load_dwordx16 s[36:51], s[22:23], 0xc0 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:64 ; GCN-NEXT: v_mov_b32_e32 v0, s53 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:68 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll index 10e83b70a57d4..1812e17800e71 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll @@ -189,11 +189,15 @@ define amdgpu_kernel void @store_lds_v4i32_align1(ptr addrspace(3) %out, <4 x i3 ; GFX10-NEXT: v_mov_b32_e32 v2, s1 ; GFX10-NEXT: s_lshr_b32 s6, s1, 16 ; GFX10-NEXT: v_mov_b32_e32 v4, s4 +; GFX10-NEXT: s_lshr_b32 s1, s1, 24 ; GFX10-NEXT: s_lshr_b32 s8, s2, 16 +; GFX10-NEXT: s_and_b32 s9, 0xffff, s2 ; GFX10-NEXT: s_lshr_b32 s5, s5, 8 ; GFX10-NEXT: v_mov_b32_e32 v5, s0 ; GFX10-NEXT: s_lshr_b32 s0, s7, 8 ; GFX10-NEXT: v_mov_b32_e32 v6, s6 +; GFX10-NEXT: v_mov_b32_e32 v7, s1 +; GFX10-NEXT: s_lshr_b32 s1, s9, 8 ; GFX10-NEXT: v_mov_b32_e32 v8, s5 ; GFX10-NEXT: v_mov_b32_e32 v9, s0 ; GFX10-NEXT: ds_write_b8 v1, v0 @@ -204,22 +208,18 @@ define amdgpu_kernel void @store_lds_v4i32_align1(ptr addrspace(3) %out, <4 x i3 ; GFX10-NEXT: ds_write_b8 v1, v8 offset:1 ; GFX10-NEXT: ds_write_b8 v1, v9 offset:5 ; GFX10-NEXT: v_mov_b32_e32 v0, s8 -; GFX10-NEXT: s_lshr_b32 s1, s1, 24 
-; GFX10-NEXT: s_and_b32 s9, 0xffff, s2 -; GFX10-NEXT: s_lshr_b32 s0, s2, 24 -; GFX10-NEXT: v_mov_b32_e32 v7, s1 -; GFX10-NEXT: s_lshr_b32 s1, s9, 8 ; GFX10-NEXT: v_mov_b32_e32 v3, s2 +; GFX10-NEXT: v_mov_b32_e32 v10, s1 +; GFX10-NEXT: s_lshr_b32 s0, s2, 24 +; GFX10-NEXT: ds_write_b8 v1, v7 offset:7 +; GFX10-NEXT: ds_write_b8 v1, v3 offset:8 +; GFX10-NEXT: ds_write_b8 v1, v10 offset:9 ; GFX10-NEXT: ds_write_b8 v1, v0 offset:10 ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: s_and_b32 s0, 0xffff, s3 -; GFX10-NEXT: v_mov_b32_e32 v10, s1 -; GFX10-NEXT: s_lshr_b32 s0, s0, 8 ; GFX10-NEXT: s_lshr_b32 s1, s3, 16 +; GFX10-NEXT: s_lshr_b32 s0, s0, 8 ; GFX10-NEXT: v_mov_b32_e32 v2, s3 -; GFX10-NEXT: ds_write_b8 v1, v7 offset:7 -; GFX10-NEXT: ds_write_b8 v1, v3 offset:8 -; GFX10-NEXT: ds_write_b8 v1, v10 offset:9 ; GFX10-NEXT: v_mov_b32_e32 v3, s0 ; GFX10-NEXT: s_lshr_b32 s0, s3, 24 ; GFX10-NEXT: v_mov_b32_e32 v4, s1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll index 4a22a911c60b7..b33b8a7d8cd72 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll @@ -272,6 +272,10 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace( ; GFX906-NEXT: buffer_store_dword v6, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill ; GFX906-NEXT: buffer_store_dword v7, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill ; GFX906-NEXT: buffer_store_dword v8, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill +; GFX906-NEXT: global_load_dwordx4 v[5:8], v4, s[0:1] offset:16 +; GFX906-NEXT: s_nop 0 +; GFX906-NEXT: global_load_dwordx4 v[9:12], v4, s[0:1] offset:32 +; GFX906-NEXT: global_load_dwordx4 v[13:16], v4, s[0:1] offset:48 ; GFX906-NEXT: global_load_dwordx4 v[17:20], v4, s[0:1] offset:64 ; GFX906-NEXT: global_load_dwordx4 v[21:24], v4, s[0:1] offset:80 ; GFX906-NEXT: global_load_dwordx4 v[25:28], v4, s[0:1] offset:96 @@ -284,9 +288,6 @@ define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace( ; GFX906-NEXT: global_load_dwordx4 v[53:56], v4, s[0:1] offset:208 ; GFX906-NEXT: global_load_dwordx4 v[57:60], v4, s[0:1] offset:224 ; GFX906-NEXT: global_load_dwordx4 v[0:3], v4, s[0:1] offset:240 -; GFX906-NEXT: global_load_dwordx4 v[5:8], v4, s[0:1] offset:16 -; GFX906-NEXT: global_load_dwordx4 v[9:12], v4, s[0:1] offset:32 -; GFX906-NEXT: global_load_dwordx4 v[13:16], v4, s[0:1] offset:48 ; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX906-NEXT: s_cbranch_execz .LBB6_2 ; GFX906-NEXT: ; %bb.1: ; %bb.1 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll index 746ffcff5667a..74552a500ac51 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll @@ -3105,6 +3105,22 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; SI-LABEL: bitcast_v32i32_to_v128i8: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, 
s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 @@ -3237,22 +3253,6 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; kill: killed $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr41 @@ -3284,13 +3284,14 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; kill: killed $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB12_2 ; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_alignbit_b32 v33, v31, v32, 24 ; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -3522,6 +3523,7 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; SI-NEXT: s_cbranch_execz .LBB12_4 ; SI-NEXT: ; %bb.3: ; 
%cmp.true ; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v31 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v32 ; SI-NEXT: v_alignbit_b32 v33, v31, v32, 24 ; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill @@ -3944,24 +3946,8 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v13 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload @@ -4309,12 +4295,44 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 
4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v32i32_to_v128i8: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 @@ -4419,22 +4437,6 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr39 ; VI-NEXT: ; kill: killed $vgpr39 ; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; VI-NEXT: ; implicit-def: $vgpr59 ; VI-NEXT: ; kill: killed $vgpr39 ; VI-NEXT: ; implicit-def: 
$vgpr39 @@ -4540,129 +4542,129 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v26 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[31:32] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v26 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v25 -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v25 -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[29:30] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v24 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v24 -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v24 -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[27:28] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v23 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v23 -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v22 -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[25:26] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v22 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v22 -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v21 -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[23:24] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v21 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v20 -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v20 -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[21:22] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v20 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; VI-NEXT: 
v_lshrrev_b32_e32 v33, 16, v19 -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v19 -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[19:20] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v18 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v18 -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v18 -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[17:18] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v17 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v17 -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v16 -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[15:16] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v16 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v16 -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v15 -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[13:14] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v15 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v14 -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v14 -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[11:12] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v14 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v13 -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v13 -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[9:10] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v12 ; VI-NEXT: buffer_store_dword v33, off, 
s[0:3], s32 offset:360 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v12 -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v39, 24, v32 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v12 -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[7:8] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v11 -; VI-NEXT: v_mov_b32_e32 v55, v39 -; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[5:6] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v11 -; VI-NEXT: v_lshrrev_b64 v[40:41], 24, v[3:4] +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[31:32] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v10 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[29:30] +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[27:28] +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[25:26] +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[23:24] +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[21:22] +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[19:20] +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[17:18] +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[15:16] +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[13:14] +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[11:12] +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[9:10] +; VI-NEXT: 
buffer_store_dword v54, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v39, 24, v32 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[7:8] +; VI-NEXT: v_mov_b32_e32 v55, v39 +; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[5:6] +; VI-NEXT: v_lshrrev_b64 v[40:41], 24, v[3:4] ; VI-NEXT: v_lshrrev_b64 v[41:42], 24, v[1:2] ; VI-NEXT: v_lshrrev_b32_e32 v58, 8, v27 ; VI-NEXT: v_lshrrev_b32_e32 v59, 24, v10 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v35, 8, v10 ; VI-NEXT: v_lshrrev_b32_e32 v60, 16, v9 ; VI-NEXT: v_lshrrev_b32_e32 v49, 8, v9 @@ -5111,10 +5113,9 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: v_or_b32_sdwa v1, v21, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -5254,8 +5255,15 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x78, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v55 ; VI-NEXT: v_or_b32_sdwa v2, v42, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7c, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload @@ -5272,23 +5280,12 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7c, v0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; VI-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: bitcast_v32i32_to_v128i8: ; GFX9: ; 
%bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill @@ -5305,6 +5302,9 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: ; kill: killed $vgpr40 ; GFX9-NEXT: ; implicit-def: $vgpr40 @@ -5437,6 +5437,7 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX9-NEXT: ; kill: killed $vgpr40 ; GFX9-NEXT: ; implicit-def: $vgpr41 ; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill @@ -5492,7 +5493,7 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(45) +; GFX9-NEXT: s_waitcnt vmcnt(29) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 ; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: ; kill: killed $vgpr33 @@ -5507,7 +5508,7 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v32 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(47) +; GFX9-NEXT: s_waitcnt vmcnt(31) ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v31 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v31 @@ -5519,147 +5520,149 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v30 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v29 -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[31:32] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v29 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v28 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v28 -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[29:30] ; 
GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v28 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v27 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v27 -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[27:28] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v26 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v26 -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[25:26] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v25 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v25 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v24 -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[23:24] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v24 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v24 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v23 -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[21:22] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v23 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v22 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22 -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[19:20] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v22 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v21 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; 
GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v21 -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[17:18] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v20 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v20 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v20 -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[15:16] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v19 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v19 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v18 -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[13:14] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v18 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v18 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v17 -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[11:12] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v17 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v16 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v16 -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[9:10] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v15 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v14 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v14 -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[7:8] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 
offset:356 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v13 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v12 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v12 -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[5:6] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v11 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v10 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[31:32] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v10 ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v9 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[29:30] +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[27:28] +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[25:26] +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[23:24] +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[21:22] +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[19:20] +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[17:18] +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[15:16] +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; 
GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[13:14] +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[11:12] +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[9:10] +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[7:8] +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[5:6] +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[3:4] ; GFX9-NEXT: v_lshrrev_b32_e32 v57, 8, v16 ; GFX9-NEXT: v_lshrrev_b32_e32 v59, 8, v15 @@ -5667,9 +5670,7 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v50, 8, v13 ; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v12 ; GFX9-NEXT: v_lshrrev_b32_e32 v52, 8, v11 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v10 -; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v37, 8, v9 ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 24, v8 ; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v8 @@ -5697,7 +5698,7 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX9-NEXT: s_cbranch_execz .LBB12_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_u32_e32 v32, 3, v32 -; GFX9-NEXT: s_waitcnt vmcnt(44) +; GFX9-NEXT: s_waitcnt vmcnt(28) ; GFX9-NEXT: v_add_u32_e32 v31, 3, v31 ; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32] ; GFX9-NEXT: v_add_u32_e32 v30, 3, v30 @@ -6005,25 +6006,9 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 
offset:48 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(18) +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX9-NEXT: s_waitcnt vmcnt(16) +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:32 @@ -6295,6 +6280,22 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:124 +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -6754,11 +6755,7 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX11-FAKE16-LABEL: bitcast_v32i32_to_v128i8: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_clause 0x2 -; GFX11-FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:8 -; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 -; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32 -; GFX11-FAKE16-NEXT: s_clause 0x13 ; 80-byte Folded Spill +; GFX11-FAKE16-NEXT: s_clause 0x13 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:88 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:84 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 
offset:80 @@ -6779,6 +6776,10 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:20 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:16 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:12 +; GFX11-FAKE16-NEXT: s_clause 0x2 +; GFX11-FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr75 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr74 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 @@ -7415,7 +7416,7 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:80 ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:96 ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:112 -; GFX11-FAKE16-NEXT: s_clause 0x13 ; 80-byte Folded Reload +; GFX11-FAKE16-NEXT: s_clause 0x13 ; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:12 ; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:16 ; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:20 @@ -10665,7 +10666,7 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_xor_saveexec_b32 s4, -1 -; GFX11-NEXT: s_clause 0x3 ; 16-byte Folded Spill +; GFX11-NEXT: s_clause 0x3 ; GFX11-NEXT: scratch_store_b32 off, v16, s32 ; GFX11-NEXT: scratch_store_b32 off, v17, s32 offset:4 ; GFX11-NEXT: scratch_store_b32 off, v18, s32 offset:8 @@ -11598,7 +11599,7 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; GFX11-NEXT: v_readlane_b32 s35, v16, 3 ; GFX11-NEXT: v_readlane_b32 s34, v16, 2 ; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 -; GFX11-NEXT: s_clause 0x3 ; 16-byte Folded Reload +; GFX11-NEXT: s_clause 0x3 ; GFX11-NEXT: scratch_load_b32 v16, off, s32 ; GFX11-NEXT: scratch_load_b32 v17, off, s32 offset:4 ; GFX11-NEXT: scratch_load_b32 v18, off, s32 offset:8 @@ -11811,26 +11812,13 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:188 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:108 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:84 -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:52 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) ; SI-NEXT: 
v_lshlrev_b32_e32 v0, 8, v1 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:196 @@ -11991,30 +11979,44 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:356 ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v2 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:384 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:380 ; SI-NEXT: v_lshlrev_b32_e32 v43, 8, v3 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:364 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:372 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:384 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:380 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:52 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB14_2 ; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v3, 
off, s[0:3], s32 offset:644 ; 4-byte Folded Reload @@ -12023,11 +12025,11 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_and_b32_e32 v9, 0xff, v49 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload @@ -12630,6 +12632,7 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB14_4 ; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload @@ -12643,8 +12646,8 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v49 ; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 @@ -13324,25 +13327,13 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:208 ; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:216 ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188 -; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44 -; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36 -; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28 -; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:108 -; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100 -; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92 -; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84 -; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76 -; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68 -; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:60 -; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52 -; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 
offset:680 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196 ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 @@ -13479,20 +13470,34 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 -; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364 -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshlrev_b16_e32 v63, 8, v0 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:372 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v63, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:108 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92 +; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84 +; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76 +; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68 +; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -13978,6 +13983,7 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(13) ; VI-NEXT: v_add_u16_e32 v9, 3, v61 ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload @@ -14555,27 +14561,13 @@ define <32 x i32> 
@bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:208 ; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:216 ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188 -; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44 -; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36 -; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28 -; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:20 -; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:108 -; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100 -; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92 -; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84 -; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76 -; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68 -; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:60 -; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52 -; GFX9-NEXT: s_waitcnt vmcnt(18) +; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(18) +; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(16) +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 @@ -14717,20 +14709,34 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v63, 8, v0 +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364 -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:372 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v63, 8, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 
offset:20 +; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:108 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100 +; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92 +; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84 +; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76 +; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68 +; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -15217,6 +15223,7 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(13) ; GFX9-NEXT: v_add_u16_e32 v9, 3, v61 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload @@ -16355,7 +16362,7 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v32i32: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill +; GFX11-FAKE16-NEXT: s_clause 0x1f ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:592 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:588 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:584 @@ -16388,7 +16395,7 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:476 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:472 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:468 -; GFX11-FAKE16-NEXT: s_clause 0x12 ; 76-byte Folded Spill +; GFX11-FAKE16-NEXT: s_clause 0x12 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:464 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:460 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:456 @@ -17329,7 +17336,7 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v35, v36 ; GFX11-FAKE16-NEXT: .LBB14_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload +; GFX11-FAKE16-NEXT: s_clause 0x1f ; GFX11-FAKE16-NEXT: scratch_load_b32 v138, off, s32 offset:392 ; GFX11-FAKE16-NEXT: scratch_load_b32 v137, off, s32 offset:396 ; GFX11-FAKE16-NEXT: scratch_load_b32 v136, off, s32 offset:400 @@ -17362,7 +17369,7 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:508 ; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:512 ; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:516 -; GFX11-FAKE16-NEXT: s_clause 0x12 ; 76-byte Folded Reload +; 
GFX11-FAKE16-NEXT: s_clause 0x12 ; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:520 ; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:524 ; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:528 @@ -18079,13 +18086,24 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_mov_b64 s[4:5], 0 ; SI-NEXT: s_branch .LBB15_3 ; SI-NEXT: .LBB15_2: +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload ; SI-NEXT: v_mov_b32_e32 v55, v56 ; SI-NEXT: v_mov_b32_e32 v42, v46 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(4) ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload @@ -18096,22 +18114,10 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 s[4:5], -1 ; SI-NEXT: ; implicit-def: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: .LBB15_3: ; %Flow -; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_mov_b32_e32 v35, v57 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload @@ -18121,6 +18127,7 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: ; %bb.4: ; %cmp.true ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v44 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 @@ -18715,13 +18722,13 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v19 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v21 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v3 ; VI-NEXT: v_lshlrev_b32_e32 v16, 8, v5 ; VI-NEXT: v_lshlrev_b32_e32 v47, 8, v7 ; VI-NEXT: v_lshlrev_b32_e32 v46, 8, v9 ; VI-NEXT: v_lshlrev_b32_e32 v10, 8, v11 ; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v13 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v17 ; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 @@ -18949,11 +18956,11 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_or_b32_sdwa v0, v2, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v4, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v2, v6, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload ; VI-NEXT: s_and_b32 s4, s28, 0xff ; VI-NEXT: s_lshl_b32 s5, s29, 8 ; VI-NEXT: s_or_b32 s4, s4, s5 @@ -18963,8 +18970,11 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: s_lshl_b32 s7, s23, 8 ; VI-NEXT: s_lshl_b32 s8, s27, 8 ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v0, v0, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v3, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v0, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v1, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: 
v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload @@ -18972,8 +18982,6 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v3, v3, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -19182,6 +19190,12 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: s_mov_b64 s[4:5], 0 ; VI-NEXT: s_branch .LBB15_3 ; VI-NEXT: .LBB15_2: +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v44, v56 ; VI-NEXT: v_mov_b32_e32 v41, v33 ; VI-NEXT: v_mov_b32_e32 v50, v40 @@ -19199,12 +19213,6 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v54, v53 ; VI-NEXT: v_mov_b32_e32 v52, v36 ; VI-NEXT: v_mov_b32_e32 v49, v51 @@ -19214,7 +19222,7 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: v_mov_b32_e32 v51, v41 ; VI-NEXT: v_mov_b32_e32 v36, v44 ; VI-NEXT: v_mov_b32_e32 v53, v54 -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_mov_b32_e32 v54, v60 ; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload @@ -19227,6 +19235,7 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: ; %bb.4: ; %cmp.true ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: 
v_add_u32_e32 v0, vcc, 3, v37 ; VI-NEXT: s_add_i32 s28, s28, 3 ; VI-NEXT: s_and_b32 s4, s28, 0xff @@ -19811,8 +19820,8 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v5 -; GFX9-NEXT: v_lshlrev_b32_e32 v24, 8, v11 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v24, 8, v11 ; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v25, 8, v9 ; GFX9-NEXT: s_waitcnt vmcnt(5) @@ -19991,18 +20000,16 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: s_lshl_b32 s6, s19, 8 ; GFX9-NEXT: s_lshl_b32 s7, s23, 8 ; GFX9-NEXT: s_lshl_b32 s8, s27, 8 -; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload @@ -20029,8 +20036,9 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -20046,16 +20054,14 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; GFX9-NEXT: v_or_b32_sdwa v1, v13, v41 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:612 ; 
4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v55, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v0, v21, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v21, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload @@ -20067,11 +20073,10 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_or_b32_sdwa v0, v16, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v0, v16, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_mov_b32_e32 v61, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -20084,12 +20089,10 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: v_or_b32_sdwa v17, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_mov_b32_e32 v37, v0 ; GFX9-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v18, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload @@ -20103,22 +20106,17 @@ define inreg <32 
x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: v_or_b32_sdwa v19, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v53, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v50, v53 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v20, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload ; GFX9-NEXT: v_or_b32_sdwa v0, v32, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v52, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v21, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload ; GFX9-NEXT: v_or_b32_sdwa v1, v51, v59 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v22, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v58, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -20134,24 +20132,45 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: v_mov_b32_e32 v35, v62 ; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload ; GFX9-NEXT: v_or_b32_sdwa v24, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa 
v1, v63, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(11) ; GFX9-NEXT: v_or_b32_sdwa v0, v62, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(10) +; GFX9-NEXT: v_or_b32_sdwa v1, v63, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v25, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(9) ; GFX9-NEXT: v_or_b32_sdwa v0, v54, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(7) ; GFX9-NEXT: v_or_b32_sdwa v1, v52, v43 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v26, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v33, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_or_b32_sdwa v1, v32, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_or_b32_sdwa v0, v44, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_or_b32_sdwa v1, v50, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_or_b32_sdwa v0, v48, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v1, v55, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v49, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v39, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_mov_b32_e32 v40, v30 ; GFX9-NEXT: v_or_b32_sdwa v30, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -20202,6 +20221,18 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: s_branch .LBB15_3 ; GFX9-NEXT: .LBB15_2: +; GFX9-NEXT: v_mov_b32_e32 v38, v51 +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; 
GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; GFX9-NEXT: v_mov_b32_e32 v33, v43 ; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload @@ -20215,18 +20246,6 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; GFX9-NEXT: v_mov_b32_e32 v38, v51 -; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; GFX9-NEXT: v_mov_b32_e32 v35, v62 ; GFX9-NEXT: v_mov_b32_e32 v36, v31 ; GFX9-NEXT: v_mov_b32_e32 v40, v30 @@ -20664,7 +20683,7 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX11-TRUE16-LABEL: bitcast_v128i8_to_v32i32_scalar: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill +; GFX11-TRUE16-NEXT: s_clause 0x1f ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:476 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:472 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:468 @@ -20697,7 +20716,7 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:360 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:356 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:352 -; GFX11-TRUE16-NEXT: s_clause 0x7 ; 32-byte Folded Spill +; GFX11-TRUE16-NEXT: s_clause 0x7 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:348 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:344 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:340 @@ -21554,7 +21573,7 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-TRUE16-NEXT: .LBB15_3: ; %end -; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload +; GFX11-TRUE16-NEXT: s_clause 0x1f ; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:320 ; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:324 ; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:328 @@ -21587,7 +21606,7 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> 
inreg %a, i3 ; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:436 ; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:440 ; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:444 -; GFX11-TRUE16-NEXT: s_clause 0x7 ; 32-byte Folded Reload +; GFX11-TRUE16-NEXT: s_clause 0x7 ; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:448 ; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:452 ; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:456 @@ -21605,7 +21624,7 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v32i32_scalar: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill +; GFX11-FAKE16-NEXT: s_clause 0x1f ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:476 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:472 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:468 @@ -21638,7 +21657,7 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:360 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:356 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:352 -; GFX11-FAKE16-NEXT: s_clause 0x7 ; 32-byte Folded Spill +; GFX11-FAKE16-NEXT: s_clause 0x7 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:348 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:344 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:340 @@ -22495,7 +22514,7 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-FAKE16-NEXT: .LBB15_3: ; %end -; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload +; GFX11-FAKE16-NEXT: s_clause 0x1f ; GFX11-FAKE16-NEXT: scratch_load_b32 v111, off, s32 offset:320 ; GFX11-FAKE16-NEXT: scratch_load_b32 v110, off, s32 offset:324 ; GFX11-FAKE16-NEXT: scratch_load_b32 v109, off, s32 offset:328 @@ -22528,7 +22547,7 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:436 ; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:440 ; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:444 -; GFX11-FAKE16-NEXT: s_clause 0x7 ; 32-byte Folded Reload +; GFX11-FAKE16-NEXT: s_clause 0x7 ; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:448 ; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:452 ; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:456 @@ -23091,25 +23110,10 @@ define <64 x bfloat> @bitcast_v32i32_to_v64bf16(<32 x i32> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded 
Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 @@ -23288,6 +23292,22 @@ define <64 x bfloat> @bitcast_v32i32_to_v64bf16(<32 x i32> %a, i32 %b) { ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -26109,10 +26129,7 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-LABEL: bitcast_v64bf16_to_v32i32: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: s_clause 0x1 -; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 -; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 -; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Spill +; GFX11-TRUE16-NEXT: s_clause 0xf ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:68 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:64 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:60 @@ -26129,6 +26146,9 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x 
bfloat> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:16
 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:12
 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:8
+; GFX11-TRUE16-NEXT: s_clause 0x1
+; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32
 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
 ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v32
 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
@@ -26694,7 +26714,7 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) {
 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, v60 :: v_dual_mov_b32 v29, v61
 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v30, v62 :: v_dual_mov_b32 v31, v63
-; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Reload
+; GFX11-TRUE16-NEXT: s_clause 0xf
 ; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:8
 ; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:12
 ; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:16
@@ -29161,7 +29181,7 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a
 ; GFX11-TRUE16: ; %bb.0:
 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14
-; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
+; GFX11-TRUE16-NEXT: s_clause 0x1f
 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:280
 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:276
 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:272
@@ -29194,7 +29214,7 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a
 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:164
 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:160
 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:156
-; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
+; GFX11-TRUE16-NEXT: s_clause 0x1f
 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:152
 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:148
 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:144
@@ -29227,7 +29247,7 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a
 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:36
 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:32
 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:28
-; GFX11-TRUE16-NEXT: s_clause 0x6 ; 28-byte Folded Spill
+; GFX11-TRUE16-NEXT: s_clause 0x6
 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:24
 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:20
 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:16
@@ -30029,7 +30049,7 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a
 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v172 :: v_dual_mov_b32 v21, v169
 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, v170 :: v_dual_mov_b32 v23, v183
 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, v168 :: v_dual_mov_b32 v25, v181
-; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
+; GFX11-TRUE16-NEXT: s_clause 0x1f
 ; GFX11-TRUE16-NEXT: scratch_load_b32 v174, off, s32
 ; GFX11-TRUE16-NEXT: scratch_load_b32 v173, off, s32 offset:4
 ; GFX11-TRUE16-NEXT: scratch_load_b32 v172, off, s32 offset:8
@@ -30062,7 +30082,7 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a
 ; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:116
 ; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:120
 ; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:124
-; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
+; GFX11-TRUE16-NEXT: s_clause 0x1f
 ; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:128
 ; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:132
 ; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:136
@@ -30095,7 +30115,7 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a
 ; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:244
 ; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:248
 ; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:252
-; GFX11-TRUE16-NEXT: s_clause 0x6 ; 28-byte Folded Reload
+; GFX11-TRUE16-NEXT: s_clause 0x6
 ; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:256
 ; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:260
 ; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:264
@@ -30135,7 +30155,7 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a
 ; GFX11-FAKE16: ; %bb.0:
 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14
-; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
+; GFX11-FAKE16-NEXT: s_clause 0x1f
 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:288
 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:284
 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:280
@@ -30168,7 +30188,7 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a
 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:172
 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:168
 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:164
-; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill
+; GFX11-FAKE16-NEXT: s_clause 0x1f
 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:160
 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:156
 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:152
@@ -30201,7 +30221,7 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a
 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v157, s32 offset:44
 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v158, s32 offset:40
 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v159, s32 offset:36
-; GFX11-FAKE16-NEXT: s_clause 0x8 ; 36-byte Folded Spill
+; GFX11-FAKE16-NEXT: s_clause 0x8
 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v168, s32 offset:32
 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v169, s32 offset:28
 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v170, s32 offset:24
@@ -30893,7 +30913,7 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a
 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, v184 :: v_dual_mov_b32 v23, v174
 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, v171 :: v_dual_mov_b32 v25, v169
 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, v170 :: v_dual_mov_b32 v29, v180
-; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
+; GFX11-FAKE16-NEXT: s_clause 0x1f
 ; GFX11-FAKE16-NEXT: scratch_load_b32 v184, off, s32
 ; GFX11-FAKE16-NEXT: scratch_load_b32 v175, off, s32 offset:4
 ; GFX11-FAKE16-NEXT: scratch_load_b32 v174, off, s32 offset:8
@@ -30926,7 +30946,7 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a
 ; GFX11-FAKE16-NEXT: scratch_load_b32 v123, off, s32 offset:116
 ; GFX11-FAKE16-NEXT: scratch_load_b32 v122, off, s32 offset:120
 ; GFX11-FAKE16-NEXT: scratch_load_b32 v121, off, s32 offset:124
-; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload
+; GFX11-FAKE16-NEXT: s_clause 0x1f
 ; GFX11-FAKE16-NEXT: scratch_load_b32 v120, off, s32 offset:128
 ; GFX11-FAKE16-NEXT: scratch_load_b32 v111, off, s32 offset:132
 ; GFX11-FAKE16-NEXT: scratch_load_b32 v110, off, s32 offset:136
@@ -30959,7 +30979,7 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a
 ; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:244
 ; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:248
 ; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:252
-; GFX11-FAKE16-NEXT: s_clause 0x8 ; 36-byte Folded Reload
+; GFX11-FAKE16-NEXT: s_clause 0x8
 ; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:256
 ; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:260
 ; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:264
@@ -31768,22 +31788,6 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) {
 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v42
-; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v38
 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x74, v0
 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
@@ -31803,6 +31807,22 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) {
 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT: v_or_b32_e32 v1, v2, v1
 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -32473,25 +32493,10 @@ define inreg <64 x half> @bitcast_v32i32_to_v64f16_scalar(<32 x i32> inreg %a, i
 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v7
 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x78, v0
 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0
-; SI-NEXT: s_waitcnt vmcnt(14)
+; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT: v_or_b32_e32 v1, v2, v1
@@ -32504,6 +32509,21 @@ define inreg <64 x half> @bitcast_v32i32_to_v64f16_scalar(<32 x i32> inreg %a, i
 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT: v_or_b32_e32 v1, v2, v1
 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
 ; SI-NEXT: s_or_saveexec_b64 s[4:5], -1
 ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
 ; SI-NEXT: s_mov_b64 exec, s[4:5]
@@ -34712,7 +34732,7 @@ define inreg <32 x i32> @bitcast_v64f16_to_v32i32_scalar(<64 x half> inreg %a, i
 ; GFX11: ; %bb.0:
 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14
-; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1f
 ; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:292
 ; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:288
 ; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:284
@@ -34745,7 +34765,7 @@ define inreg <32 x i32> @bitcast_v64f16_to_v32i32_scalar(<64 x half> inreg %a, i
 ; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:176
 ; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:172
 ; GFX11-NEXT: scratch_store_b32 off, v95, s32 offset:168
-; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1f
 ; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:164
 ; GFX11-NEXT: scratch_store_b32 off, v105, s32 offset:160
 ; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:156
@@ -34778,7 +34798,7 @@ define inreg <32 x i32> @bitcast_v64f16_to_v32i32_scalar(<64 x half> inreg %a, i
 ; GFX11-NEXT: scratch_store_b32 off, v157, s32 offset:48
 ; GFX11-NEXT: scratch_store_b32 off, v158, s32 offset:44
 ; GFX11-NEXT: scratch_store_b32 off, v159, s32 offset:40
-; GFX11-NEXT: s_clause 0x9 ; 40-byte Folded Spill
+; GFX11-NEXT: s_clause 0x9
 ; GFX11-NEXT: scratch_store_b32 off, v168, s32 offset:36
 ; GFX11-NEXT: scratch_store_b32 off, v169, s32 offset:32
 ; GFX11-NEXT: scratch_store_b32 off, v170, s32 offset:28
@@ -34856,7 +34876,7 @@ define inreg <32 x i32> @bitcast_v64f16_to_v32i32_scalar(<64 x half> inreg %a, i
 ; GFX11-NEXT: v_dual_mov_b32 v19, v174 :: v_dual_mov_b32 v20, v173
 ; GFX11-NEXT: v_dual_mov_b32 v21, v172 :: v_dual_mov_b32 v22, v171
 ; GFX11-NEXT: v_dual_mov_b32 v23, v170 :: v_dual_mov_b32 v24, v183
-; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1f
 ; GFX11-NEXT: scratch_load_b32 v185, off, s32
 ; GFX11-NEXT: scratch_load_b32 v184, off, s32 offset:4
 ; GFX11-NEXT: scratch_load_b32 v175, off, s32 offset:8
@@ -34889,7 +34909,7 @@ define inreg <32 x i32> @bitcast_v64f16_to_v32i32_scalar(<64 x half> inreg %a, i
 ; GFX11-NEXT: scratch_load_b32 v124, off, s32 offset:116
 ; GFX11-NEXT: scratch_load_b32 v123, off, s32 offset:120
 ; GFX11-NEXT: scratch_load_b32 v122, off, s32 offset:124
-; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1f
 ; GFX11-NEXT: scratch_load_b32 v121, off, s32 offset:128
 ; GFX11-NEXT: scratch_load_b32 v120, off, s32 offset:132
 ; GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:136
@@ -34922,7 +34942,7 @@ define inreg <32 x i32> @bitcast_v64f16_to_v32i32_scalar(<64 x half> inreg %a, i
 ; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:244
 ; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:248
 ; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:252
-; GFX11-NEXT: s_clause 0x9 ; 40-byte Folded Reload
+; GFX11-NEXT: s_clause 0x9
 ; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:256
 ; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:260
 ; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:264
@@ -34980,10 +35000,6 @@ define <64 x i16> @bitcast_v32i32_to_v64i16(<32 x i32> %a, i32 %b) {
 ; SI-LABEL: bitcast_v32i32_to_v64i16:
 ; SI: ; %bb.0:
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4
-; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
-; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32
-; SI-NEXT: ; implicit-def: $vgpr39
 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
 ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
 ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
@@ -35000,6 +35016,10 @@ define <64 x i16> @bitcast_v32i32_to_v64i16(<32 x i32> %a, i32 %b) {
 ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
 ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
 ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4
+; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
+; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32
+; SI-NEXT: ; implicit-def: $vgpr39
 ; SI-NEXT: ; implicit-def: $vgpr60
 ; SI-NEXT: ; implicit-def: $vgpr58
 ; SI-NEXT: ; implicit-def: $vgpr63
@@ -35031,13 +35051,14 @@ define <64 x i16> @bitcast_v32i32_to_v64i16(<32 x i32> %a, i32 %b) {
 ; SI-NEXT: ; implicit-def: $vgpr49
 ; SI-NEXT: ; kill: killed $vgpr39
 ; SI-NEXT: ; implicit-def: $vgpr39
-; SI-NEXT: s_waitcnt vmcnt(14)
+; SI-NEXT: s_waitcnt vmcnt(1)
 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33
 ; SI-NEXT: ; implicit-def: $vgpr33
 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
 ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
 ; SI-NEXT: s_cbranch_execz .LBB24_2
 ; SI-NEXT: ; %bb.1: ; %cmp.false
+; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: v_alignbit_b32 v33, v31, v32, 16
 ; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
 ; SI-NEXT: v_alignbit_b32 v34, v30, v29, 16
@@ -35082,6 +35103,7 @@ define <64 x i16> @bitcast_v32i32_to_v64i16(<32 x i32> %a, i32 %b) {
 ; SI-NEXT: s_cbranch_execz .LBB24_4
 ; SI-NEXT: ; %bb.3: ; %cmp.true
 ; SI-NEXT: v_add_i32_e32 v31, vcc, 3, v31
+; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v32
 ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2
 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1
@@ -35308,22 +35330,6 @@ define <64 x i16> @bitcast_v32i32_to_v64i16(<32 x i32> %a, i32 %b) {
 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v53
 ; SI-NEXT: v_or_b32_e32 v1, v1, v2
 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0
-; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v27
@@ -35350,7 +35356,7 @@ define <64 x i16> @bitcast_v32i32_to_v64i16(<32 x i32> %a, i32 %b) {
 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0
 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0)
 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v32
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
@@ -35363,6 +35369,22 @@ define <64 x i16> @bitcast_v32i32_to_v64i16(<32 x i32> %a, i32 %b) {
 ; SI-NEXT: v_or_b32_e32 v1, v1, v2
 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0
 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -36316,13 +36338,7 @@ define <32 x i32> @bitcast_v64i16_to_v32i32(<64 x i16> %a, i32 %b) {
 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44
 ; SI-NEXT: s_waitcnt vmcnt(9)
 ; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v33
-; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92
-; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:84
-; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:76
-; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:68
-; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:60
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52
-; SI-NEXT: s_waitcnt vmcnt(6)
+; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:36
@@ -36354,6 +36370,12 @@ define <32 x i32> @bitcast_v64i16_to_v32i32(<64 x i16> %a, i32 %b) {
 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:100
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
+; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92
+; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:84
+; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:76
+; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:68
+; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:60
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52
 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
 ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
@@ -36369,6 +36391,7 @@ define <32 x i32> @bitcast_v64i16_to_v32i32(<64 x i16> %a, i32 %b) {
 ; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
 ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
 ; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(10)
 ; SI-NEXT: v_and_b32_e32 v22, 0xffff, v41
 ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
 ; SI-NEXT: s_waitcnt expcnt(0)
@@ -36585,6 +36608,7 @@ define <32 x i32> @bitcast_v64i16_to_v32i32(<64 x i16> %a, i32 %b) {
 ; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
 ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
 ; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
+; SI-NEXT: s_waitcnt vmcnt(10)
 ; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v41
 ; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22
 ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
@@ -37758,7 +37782,7 @@ define inreg <32 x i32> @bitcast_v64i16_to_v32i32_scalar(<64 x i16> inreg %a, i3
 ; GFX11: ; %bb.0:
 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14
-; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1f
 ; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:292
 ; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:288
 ; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:284
@@ -37791,7 +37815,7 @@ define inreg <32 x i32> @bitcast_v64i16_to_v32i32_scalar(<64 x i16> inreg %a, i3
 ; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:176
 ; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:172
 ; GFX11-NEXT: scratch_store_b32 off, v95, s32 offset:168
-; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Spill
+; GFX11-NEXT: s_clause 0x1f
 ; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:164
 ; GFX11-NEXT: scratch_store_b32 off, v105, s32 offset:160
 ; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:156
@@ -37824,7 +37848,7 @@ define inreg <32 x i32> @bitcast_v64i16_to_v32i32_scalar(<64 x i16> inreg %a, i3
 ; GFX11-NEXT: scratch_store_b32 off, v157, s32 offset:48
 ; GFX11-NEXT: scratch_store_b32 off, v158, s32 offset:44
 ; GFX11-NEXT: scratch_store_b32 off, v159, s32 offset:40
-; GFX11-NEXT: s_clause 0x9 ; 40-byte Folded Spill
+; GFX11-NEXT: s_clause 0x9
 ; GFX11-NEXT: scratch_store_b32 off, v168, s32 offset:36
 ; GFX11-NEXT: scratch_store_b32 off, v169, s32 offset:32
 ; GFX11-NEXT: scratch_store_b32 off, v170, s32 offset:28
@@ -37902,7 +37926,7 @@ define inreg <32 x i32> @bitcast_v64i16_to_v32i32_scalar(<64 x i16> inreg %a, i3
 ; GFX11-NEXT: v_dual_mov_b32 v19, v174 :: v_dual_mov_b32 v20, v173
 ; GFX11-NEXT: v_dual_mov_b32 v21, v172 :: v_dual_mov_b32 v22, v171
 ; GFX11-NEXT: v_dual_mov_b32 v23, v170 :: v_dual_mov_b32 v24, v183
-; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1f
 ; GFX11-NEXT: scratch_load_b32 v185, off, s32
 ; GFX11-NEXT: scratch_load_b32 v184, off, s32 offset:4
 ; GFX11-NEXT: scratch_load_b32 v175, off, s32 offset:8
@@ -37935,7 +37959,7 @@ define inreg <32 x i32> @bitcast_v64i16_to_v32i32_scalar(<64 x i16> inreg %a, i3
 ; GFX11-NEXT: scratch_load_b32 v124, off, s32 offset:116
 ; GFX11-NEXT: scratch_load_b32 v123, off, s32 offset:120
 ; GFX11-NEXT: scratch_load_b32 v122, off, s32 offset:124
-; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Reload
+; GFX11-NEXT: s_clause 0x1f
 ; GFX11-NEXT: scratch_load_b32 v121, off, s32 offset:128
 ; GFX11-NEXT: scratch_load_b32 v120, off, s32 offset:132
 ; GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:136
@@ -37968,7 +37992,7 @@ define inreg <32 x i32> @bitcast_v64i16_to_v32i32_scalar(<64 x i16> inreg %a, i3
 ; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:244
 ; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:248
 ; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:252
-; GFX11-NEXT: s_clause 0x9 ; 40-byte Folded Reload
+; GFX11-NEXT: s_clause 0x9
 ; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:256
 ; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:260
 ; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:264
@@ -40009,6 +40033,22 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) {
 ; SI-LABEL: bitcast_v32f32_to_v128i8:
 ; SI: ; %bb.0:
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4
 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32
@@ -40141,22 +40181,6 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) {
 ; SI-NEXT: ; implicit-def: $vgpr36
 ; SI-NEXT: ; kill: killed $vgpr36
 ; SI-NEXT: ; implicit-def: $vgpr36
-; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
 ; SI-NEXT: ; implicit-def: $vgpr45
 ; SI-NEXT: ; implicit-def: $vgpr43
 ; SI-NEXT: ; implicit-def: $vgpr41
@@ -40188,13 +40212,14 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) {
 ; SI-NEXT: ; implicit-def: $vgpr39
 ; SI-NEXT: ; kill: killed $vgpr36
 ; SI-NEXT: ; implicit-def: $vgpr36
-; SI-NEXT: s_waitcnt vmcnt(14)
+; SI-NEXT: s_waitcnt vmcnt(1)
 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33
 ; SI-NEXT: ; implicit-def: $vgpr33
 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
 ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
 ; SI-NEXT: s_cbranch_execz .LBB36_2
 ; SI-NEXT: ; %bb.1: ; %cmp.false
+; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: v_alignbit_b32 v33, v31, v32, 24
 ; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
 ; SI-NEXT: s_waitcnt expcnt(0)
@@ -40426,6 +40451,7 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) {
 ; SI-NEXT: s_cbranch_execz .LBB36_4
 ; SI-NEXT: ; %bb.3: ; %cmp.true
 ; SI-NEXT: v_add_f32_e32 v31, 1.0, v31
+; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: v_add_f32_e32 v32, 1.0, v32
 ; SI-NEXT: v_alignbit_b32 v33, v31, v32, 24
 ; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
@@ -40848,24 +40874,8 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) {
 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v13
-; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
-; SI-NEXT: s_waitcnt vmcnt(14)
+; SI-NEXT: s_waitcnt vmcnt(1)
 ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
 ; SI-NEXT: v_or_b32_e32 v1, v1, v2
 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
@@ -41213,12 +41223,44 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) {
 ; SI-NEXT: v_or_b32_e32 v2, v3, v2
 ; SI-NEXT: v_or_b32_e32 v1, v1, v2
 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT: s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: bitcast_v32f32_to_v128i8:
 ; VI: ; %bb.0:
 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
 ; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
 ; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
 ; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32
@@ -41323,22 +41365,6 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) {
 ; VI-NEXT: ; implicit-def: $vgpr39
 ; VI-NEXT: ; kill: killed $vgpr39
 ; VI-NEXT: ; implicit-def: $vgpr39
-; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
 ; VI-NEXT: ; implicit-def: $vgpr59
 ; VI-NEXT: ; kill: killed $vgpr39
 ; VI-NEXT: ; implicit-def: $vgpr39
@@ -41444,129 +41470,129 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) {
 ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v26
 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
 ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v26
-; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[31:32]
 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
 ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v26
 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
 ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v25
-; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
 ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v25
-; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[29:30]
 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
 ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v24
 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
 ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v24
-; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
 ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v24
-; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[27:28]
 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
 ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v23
 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
 ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v23
-; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
 ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v22
-; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[25:26]
 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
 ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v22
 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
 ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v22
-; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
 ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v21
-; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[23:24]
 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
 ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v21
 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
 ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v20
-; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
 ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v20
-; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[21:22]
 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
 ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v20
 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
 ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v19
-; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
 ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v19
-; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[19:20]
 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
 ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v18
 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
 ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v18
-; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
 ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v18
-; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[17:18]
 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
 ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v17
 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
 ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v17
-; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
 ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v16
-; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[15:16]
 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
 ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v16
 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
 ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v16
-; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
 ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v15
-; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[13:14]
 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
 ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v15
 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
 ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v14
-; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
 ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v14
-; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[11:12]
 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
 ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v14
 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
 ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v13
-; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
 ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v13
-; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[9:10]
 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
 ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v12
 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
 ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v12
-; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v39, 24, v32
 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
 ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v12
-; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[7:8]
 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
 ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v11
-; VI-NEXT: v_mov_b32_e32 v55, v39
-; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[5:6]
 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
 ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v11
-; VI-NEXT: v_lshrrev_b64 v[40:41], 24, v[3:4]
+; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[31:32]
 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
 ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v10
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[29:30]
+; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[27:28]
+; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[25:26]
+; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[23:24]
+; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[21:22]
+; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[19:20]
+; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[17:18]
+; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[15:16]
+; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[13:14]
+; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[11:12]
+; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[9:10]
+; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v39, 24, v32
+; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[7:8]
+; VI-NEXT: v_mov_b32_e32 v55, v39
+; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[5:6]
+; VI-NEXT: v_lshrrev_b64 v[40:41], 24, v[3:4]
 ; VI-NEXT: v_lshrrev_b64 v[41:42], 24, v[1:2]
 ; VI-NEXT: v_lshrrev_b32_e32 v58, 8, v27
 ; VI-NEXT: v_lshrrev_b32_e32 v59, 24, v10
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
 ; VI-NEXT: v_lshrrev_b32_e32 v35, 8, v10
 ; VI-NEXT: v_lshrrev_b32_e32 v60, 16, v9
 ; VI-NEXT: v_lshrrev_b32_e32 v49, 8, v9
@@ -42015,10 +42041,9 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) {
 ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
 ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
 ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(3)
-; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
 ; VI-NEXT: s_waitcnt vmcnt(2)
 ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
 ; VI-NEXT: v_or_b32_sdwa v1, v21, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT: s_waitcnt vmcnt(0)
 ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -42158,8 +42183,15 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) {
 ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x78, v0
 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
 ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v55
 ; VI-NEXT: v_or_b32_sdwa v2, v42, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7c, v0
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
 ; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
 ; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
 ; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
@@ -42176,23 +42208,12 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) {
 ; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
 ; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
 ; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
-; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7c, v0
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; VI-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
 ; VI-NEXT: s_waitcnt vmcnt(0)
 ; VI-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: bitcast_v32f32_to_v128i8:
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
-; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
-; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32
-; GFX9-NEXT: s_nop 0
 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
 ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
 ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
@@ -42209,6 +42230,9 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) {
 ; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
 ; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
 ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
+; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
+; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32
 ; GFX9-NEXT: ; implicit-def: $vgpr40
 ; GFX9-NEXT: ; kill: killed $vgpr40
 ; GFX9-NEXT: ; implicit-def: $vgpr40
@@ -42341,6 +42365,7 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) {
 ; GFX9-NEXT: ; kill: killed $vgpr40
 ; GFX9-NEXT: ; implicit-def: $vgpr41
 ; GFX9-NEXT: ; implicit-def: $vgpr40
+; GFX9-NEXT: s_nop 0
 ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
 ; GFX9-NEXT: s_nop 0
 ; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
@@ -42396,7 +42421,7 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) {
 ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
 ; GFX9-NEXT: s_nop 0
 ; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(45)
+; GFX9-NEXT: s_waitcnt vmcnt(29)
 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33
 ; GFX9-NEXT: ; implicit-def: $vgpr33
 ; GFX9-NEXT: ; kill: killed $vgpr33
@@ -42411,7 +42436,7 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) {
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v32
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(47)
+; GFX9-NEXT: s_waitcnt vmcnt(31)
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v31
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v31
@@ -42423,147 +42448,149 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) {
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v30
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v29
-; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[31:32]
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v29
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v28
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v28
-; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[29:30]
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v28
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v27
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v27
-; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[27:28]
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v26
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v26
-; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[25:26]
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v25
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v25
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v24
-; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[23:24]
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v24
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v24
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v23
-; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[21:22]
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v23
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v22
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22
-; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[19:20]
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v22
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v21
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v21
-; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[17:18]
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v20
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v20
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v20
-; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[15:16]
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v19
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v19
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v18
-; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[13:14]
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v18
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v18
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v17
-; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[11:12]
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v17
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v16
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v16
-; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[9:10]
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v15
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v14
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v14
-; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[7:8]
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v13
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v12
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v12
-; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[5:6]
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v11
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v10
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[31:32]
 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v10
 ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v9
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[29:30]
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[27:28]
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[25:26]
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[23:24]
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[21:22]
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[19:20]
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[17:18]
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[15:16]
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[13:14]
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[11:12]
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[9:10]
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:168 ; 4-byte Folded
Spill +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[7:8] +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[5:6] +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[3:4] ; GFX9-NEXT: v_lshrrev_b32_e32 v57, 8, v16 ; GFX9-NEXT: v_lshrrev_b32_e32 v59, 8, v15 @@ -42571,9 +42598,7 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v50, 8, v13 ; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v12 ; GFX9-NEXT: v_lshrrev_b32_e32 v52, 8, v11 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v10 -; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v37, 8, v9 ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 24, v8 ; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v8 @@ -42601,7 +42626,7 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; GFX9-NEXT: s_cbranch_execz .LBB36_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true ; GFX9-NEXT: v_add_f32_e32 v32, 1.0, v32 -; GFX9-NEXT: s_waitcnt vmcnt(44) +; GFX9-NEXT: s_waitcnt vmcnt(28) ; GFX9-NEXT: v_add_f32_e32 v31, 1.0, v31 ; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32] ; GFX9-NEXT: v_add_f32_e32 v30, 1.0, v30 @@ -42909,25 +42934,9 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(18) +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX9-NEXT: s_waitcnt vmcnt(16) +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD 
src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:32 @@ -43199,6 +43208,22 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:124 +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -43641,11 +43666,7 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; GFX11-FAKE16-LABEL: bitcast_v32f32_to_v128i8: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_clause 0x2 -; GFX11-FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:8 -; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 -; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32 -; GFX11-FAKE16-NEXT: s_clause 0x13 ; 80-byte Folded Spill +; GFX11-FAKE16-NEXT: s_clause 0x13 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:88 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:84 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:80 @@ -43666,6 +43687,10 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:20 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:16 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:12 +; GFX11-FAKE16-NEXT: s_clause 0x2 +; GFX11-FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr75 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr74 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 @@ -44285,7 +44310,7 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[13:16], off 
offset:80 ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:96 ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:112 -; GFX11-FAKE16-NEXT: s_clause 0x13 ; 80-byte Folded Reload +; GFX11-FAKE16-NEXT: s_clause 0x13 ; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:12 ; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:16 ; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:20 @@ -44745,11 +44770,27 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshr_b64 v[13:14], v[48:49], 24 +; SI-NEXT: v_lshr_b64 v[17:18], v[48:49], 8 +; SI-NEXT: v_add_f32_e64 v53, s23, 1.0 +; SI-NEXT: v_add_f32_e64 v52, s22, 1.0 ; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[17:18], v[52:53], 24 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[17:18], v[52:53], 16 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill ; SI-NEXT: v_lshr_b64 v[13:14], v[48:49], 16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[17:18], v[52:53], 8 ; SI-NEXT: v_lshrrev_b32_e32 v14, 24, v2 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v2 @@ -44801,33 +44842,24 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v14, 8, v12 -; SI-NEXT: v_lshr_b64 v[17:18], v[48:49], 8 ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v14, 24, v16 ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v16 -; SI-NEXT: v_add_f32_e64 v53, s23, 1.0 -; SI-NEXT: v_add_f32_e64 v52, s22, 1.0 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v14, 8, v16 -; SI-NEXT: v_lshr_b64 v[17:18], v[52:53], 24 ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v14, 24, v21 ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v21 -; 
SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v14, 8, v21 -; SI-NEXT: v_lshr_b64 v[17:18], v[52:53], 16 ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v14, 24, v26 @@ -44836,8 +44868,6 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v26 ; SI-NEXT: v_add_f32_e64 v41, s21, 1.0 ; SI-NEXT: v_add_f32_e64 v40, s20, 1.0 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v14, 8, v26 @@ -44845,7 +44875,6 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; SI-NEXT: v_add_f32_e64 v57, s16, 1.0 ; SI-NEXT: v_add_f32_e64 v46, s19, 1.0 ; SI-NEXT: v_add_f32_e64 v45, s18, 1.0 -; SI-NEXT: v_lshr_b64 v[17:18], v[52:53], 8 ; SI-NEXT: v_lshr_b64 v[31:32], v[40:41], 16 ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -44856,8 +44885,6 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v30 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill ; SI-NEXT: v_lshr_b64 v[27:28], v[40:41], 24 ; SI-NEXT: v_lshr_b64 v[33:34], v[45:46], 24 ; SI-NEXT: v_lshr_b64 v[38:39], v[45:46], 8 @@ -45381,33 +45408,33 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v13, s98 -; SI-NEXT: v_mov_b32_e32 v27, s62 ; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v13, s46 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v13, s56 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v13, s58 +; SI-NEXT: v_mov_b32_e32 v27, s62 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_mov_b32_e32 v13, s46 ; SI-NEXT: s_waitcnt 
expcnt(1) ; SI-NEXT: v_mov_b32_e32 v27, s72 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_mov_b32_e32 v13, s56 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v27, s74 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_mov_b32_e32 v13, s58 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v27, s76 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill ; SI-NEXT: v_mov_b32_e32 v57, s16 @@ -45441,7 +45468,6 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; SI-NEXT: v_mov_b32_e32 v3, s6 ; SI-NEXT: v_mov_b32_e32 v4, s7 ; SI-NEXT: v_readlane_b32 s5, v61, 1 -; SI-NEXT: s_waitcnt expcnt(3) ; SI-NEXT: v_mov_b32_e32 v13, s60 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v27, s78 @@ -45668,22 +45694,9 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_and_b32_e32 v14, 0xff, v29 -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v13, 8, v13 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v14, 0xff, v17 ; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload @@ -45796,16 +45809,17 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; SI-NEXT: buffer_load_dword v14, off, 
s[0:3], s32 offset:220 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v14, 0xff, v15 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v13, 8, v13 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v15, 24, v17 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v15, 24, v17 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 ; SI-NEXT: v_or_b32_e32 v13, v13, v14 @@ -46056,6 +46070,19 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload @@ -46660,10 +46687,6 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; VI-NEXT: ; implicit-def: $sgpr46 ; VI-NEXT: s_branch .LBB37_2 ; VI-NEXT: .LBB37_4: -; VI-NEXT: v_mov_b32_e32 v53, s46 -; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v53, s56 ; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 0 ; VI-NEXT: v_mov_b32_e32 v48, s4 @@ -46741,9 +46764,6 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v39, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 26 -; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 
offset:68 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v53, s58 ; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v39, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 27 @@ -46821,9 +46841,6 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; VI-NEXT: v_readlane_b32 s4, v62, 51 ; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v39, s4 -; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v53, s60 ; VI-NEXT: v_readlane_b32 s4, v62, 52 ; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v39, s4 @@ -46842,6 +46859,40 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; VI-NEXT: v_readlane_b32 s4, v62, 57 ; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v39, s4 +; VI-NEXT: v_mov_b32_e32 v53, s46 +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v53, s56 +; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v53, s58 +; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v53, s60 +; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v53, s62 +; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v53, s72 +; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v53, s74 +; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v53, s76 +; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v53, s78 +; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v53, s88 +; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v53, s90 +; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v31, s16 ; VI-NEXT: v_mov_b32_e32 v32, s17 ; VI-NEXT: v_mov_b32_e32 v29, 
s18 @@ -46895,35 +46946,11 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; VI-NEXT: v_mov_b32_e32 v42, s82 ; VI-NEXT: v_mov_b32_e32 v37, s81 ; VI-NEXT: v_mov_b32_e32 v50, s80 -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v53, s30 +; VI-NEXT: v_mov_b32_e32 v54, s34 ; VI-NEXT: v_mov_b32_e32 v39, s36 ; VI-NEXT: v_mov_b32_e32 v40, s38 ; VI-NEXT: v_mov_b32_e32 v41, s48 -; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v53, s62 -; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v53, s72 -; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v53, s74 -; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v53, s76 -; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v53, s78 -; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v53, s88 -; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v53, s90 -; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v53, s30 -; VI-NEXT: v_mov_b32_e32 v54, s34 ; VI-NEXT: .LBB37_5: ; %end ; VI-NEXT: v_lshlrev_b32_e32 v34, 8, v34 ; VI-NEXT: v_lshlrev_b32_e32 v35, 8, v35 @@ -46991,20 +47018,6 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; VI-NEXT: v_lshlrev_b32_e32 v23, 8, v50 ; VI-NEXT: v_or_b32_sdwa v23, v24, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v24, 8, v36 -; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte 
Folded Reload -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; VI-NEXT: v_readlane_b32 s87, v63, 31 ; VI-NEXT: v_readlane_b32 s86, v63, 30 ; VI-NEXT: v_readlane_b32 s85, v63, 29 @@ -47037,7 +47050,7 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; VI-NEXT: v_readlane_b32 s34, v63, 2 ; VI-NEXT: v_readlane_b32 s31, v63, 1 ; VI-NEXT: v_readlane_b32 s30, v63, 0 -; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v24, v25, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v23, v23, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v24, vcc, 36, v0 @@ -47328,6 +47341,20 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; VI-NEXT: s_or_saveexec_b64 s[4:5], -1 ; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload @@ -48096,8 +48123,10 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; GFX9-NEXT: v_readlane_b32 s4, v62, 49 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v40, s4 +; GFX9-NEXT: v_mov_b32_e32 v49, s52 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v40, s46 +; GFX9-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill @@ -48146,7 +48175,6 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v40, s94 -; 
GFX9-NEXT: v_mov_b32_e32 v49, s52 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill @@ -48194,7 +48222,6 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; GFX9-NEXT: v_mov_b32_e32 v54, s55 ; GFX9-NEXT: v_mov_b32_e32 v50, s53 ; GFX9-NEXT: v_mov_b32_e32 v60, s54 -; GFX9-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v49, s51 ; GFX9-NEXT: v_mov_b32_e32 v59, s50 ; GFX9-NEXT: v_mov_b32_e32 v58, s49 @@ -48264,20 +48291,6 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:16 ; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload ; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v49 -; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; GFX9-NEXT: v_readlane_b32 s99, v63, 35 ; GFX9-NEXT: v_readlane_b32 s98, v63, 34 ; GFX9-NEXT: v_readlane_b32 s97, v63, 33 @@ -48314,7 +48327,7 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; GFX9-NEXT: v_readlane_b32 s34, v63, 2 ; GFX9-NEXT: v_readlane_b32 s31, v63, 1 ; GFX9-NEXT: v_readlane_b32 s30, v63, 0 -; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v15, v35, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:20 @@ -48608,6 +48621,20 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:124 +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded 
Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload @@ -48619,7 +48646,7 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_or_saveexec_b32 s4, -1 -; GFX11-NEXT: s_clause 0x3 ; 16-byte Folded Spill +; GFX11-NEXT: s_clause 0x3 ; GFX11-NEXT: scratch_store_b32 off, v75, s32 offset:76 ; GFX11-NEXT: scratch_store_b32 off, v76, s32 offset:80 ; GFX11-NEXT: scratch_store_b32 off, v77, s32 offset:84 @@ -48654,7 +48681,7 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; GFX11-NEXT: v_writelane_b32 v76, s101, 5 ; GFX11-NEXT: s_mov_b32 vcc_hi, 0 ; GFX11-NEXT: s_and_b32 s42, vcc_lo, exec_lo -; GFX11-NEXT: s_clause 0x12 ; 76-byte Folded Spill +; GFX11-NEXT: s_clause 0x12 ; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:72 ; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:68 ; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:64 @@ -49574,7 +49601,7 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; GFX11-NEXT: scratch_store_b128 v0, v[11:14], off offset:80 ; GFX11-NEXT: scratch_store_b128 v0, v[7:10], off offset:96 ; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:112 -; GFX11-NEXT: s_clause 0x12 ; 76-byte Folded Reload +; GFX11-NEXT: s_clause 0x12 ; GFX11-NEXT: scratch_load_b32 v74, off, s32 ; GFX11-NEXT: scratch_load_b32 v73, off, s32 offset:4 ; GFX11-NEXT: scratch_load_b32 v72, off, s32 offset:8 @@ -49636,7 +49663,7 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; GFX11-NEXT: v_readlane_b32 s31, v75, 1 ; GFX11-NEXT: v_readlane_b32 s30, v75, 0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: s_clause 0x3 ; 16-byte Folded Reload +; GFX11-NEXT: s_clause 0x3 ; GFX11-NEXT: scratch_load_b32 v75, off, s32 offset:76 ; GFX11-NEXT: scratch_load_b32 v76, off, s32 offset:80 ; GFX11-NEXT: scratch_load_b32 v77, off, s32 offset:84 @@ -49849,26 +49876,13 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:188 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:20 -; 
SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:108 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:84 -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:52 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:196 @@ -50029,30 +50043,44 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:356 ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v2 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:384 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:380 ; SI-NEXT: v_lshlrev_b32_e32 v43, 8, v3 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:364 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:372 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:384 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:380 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], 
s32 offset:84 +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:52 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB38_2 ; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload @@ -50061,11 +50089,11 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_and_b32_e32 v9, 0xff, v49 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload @@ -50668,6 +50696,7 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB38_4 ; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload @@ -50681,8 +50710,8 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v49 ; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 @@ -51362,25 +51391,13 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:208 ; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:216 ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188 -; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44 -; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36 -; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28 -; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_load_ushort v61, off, s[0:3], 
s32 offset:12 -; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:108 -; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100 -; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92 -; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84 -; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76 -; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68 -; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:60 -; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52 -; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196 ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 @@ -51517,20 +51534,34 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 -; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364 -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshlrev_b16_e32 v63, 8, v0 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:372 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v63, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:108 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92 +; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84 +; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76 +; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68 +; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52 ; VI-NEXT: ; implicit-def: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -52016,6 +52047,7 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(13) ; VI-NEXT: v_add_u16_e32 v9, 3, v61 ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload @@ -52593,27 +52625,13 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:208 ; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:216 ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188 -; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44 -; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36 -; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28 -; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:20 -; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:108 -; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100 -; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92 -; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84 -; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76 -; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68 -; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:60 -; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52 -; GFX9-NEXT: s_waitcnt vmcnt(18) +; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(18) +; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(16) +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 @@ -52755,20 +52773,34 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v63, 8, v0 +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364 -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 
4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:372 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v63, 8, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:108 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100 +; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92 +; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84 +; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76 +; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68 +; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -53255,6 +53287,7 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(13) ; GFX9-NEXT: v_add_u16_e32 v9, 3, v61 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload @@ -54393,7 +54426,7 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v32f32: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill +; GFX11-FAKE16-NEXT: s_clause 0x1f ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:592 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:588 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:584 @@ -54426,7 +54459,7 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:476 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:472 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:468 -; GFX11-FAKE16-NEXT: s_clause 0x12 ; 76-byte Folded Spill +; GFX11-FAKE16-NEXT: s_clause 0x12 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:464 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:460 ; 
GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:456 @@ -55367,7 +55400,7 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v35, v36 ; GFX11-FAKE16-NEXT: .LBB38_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload +; GFX11-FAKE16-NEXT: s_clause 0x1f ; GFX11-FAKE16-NEXT: scratch_load_b32 v138, off, s32 offset:392 ; GFX11-FAKE16-NEXT: scratch_load_b32 v137, off, s32 offset:396 ; GFX11-FAKE16-NEXT: scratch_load_b32 v136, off, s32 offset:400 @@ -55400,7 +55433,7 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:508 ; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:512 ; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:516 -; GFX11-FAKE16-NEXT: s_clause 0x12 ; 76-byte Folded Reload +; GFX11-FAKE16-NEXT: s_clause 0x12 ; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:520 ; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:524 ; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:528 @@ -56117,13 +56150,24 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; SI-NEXT: s_mov_b64 s[4:5], 0 ; SI-NEXT: s_branch .LBB39_3 ; SI-NEXT: .LBB39_2: +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload ; SI-NEXT: v_mov_b32_e32 v55, v56 ; SI-NEXT: v_mov_b32_e32 v42, v46 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(4) ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload @@ -56134,22 +56178,10 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:620 
; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 s[4:5], -1 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: .LBB39_3: ; %Flow -; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_mov_b32_e32 v35, v57 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload @@ -56159,6 +56191,7 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; SI-NEXT: ; %bb.4: ; %cmp.true ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v44 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 @@ -56753,13 +56786,13 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v19 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v21 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v3 ; VI-NEXT: v_lshlrev_b32_e32 v16, 8, v5 ; VI-NEXT: v_lshlrev_b32_e32 v47, 8, v7 ; VI-NEXT: v_lshlrev_b32_e32 v46, 8, v9 ; VI-NEXT: v_lshlrev_b32_e32 v10, 8, v11 ; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v13 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v17 ; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 @@ -56987,11 +57020,11 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_or_b32_sdwa v0, v2, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v4, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v2, v6, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload ; VI-NEXT: s_and_b32 s4, s28, 0xff ; VI-NEXT: 
s_lshl_b32 s5, s29, 8 ; VI-NEXT: s_or_b32 s4, s4, s5 @@ -57001,8 +57034,11 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; VI-NEXT: s_lshl_b32 s7, s23, 8 ; VI-NEXT: s_lshl_b32 s8, s27, 8 ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v0, v0, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v3, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v0, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v1, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload @@ -57010,8 +57046,6 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v3, v3, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -57220,6 +57254,12 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; VI-NEXT: s_mov_b64 s[4:5], 0 ; VI-NEXT: s_branch .LBB39_3 ; VI-NEXT: .LBB39_2: +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v44, v56 ; VI-NEXT: v_mov_b32_e32 v41, v33 ; VI-NEXT: v_mov_b32_e32 v50, v40 @@ -57237,12 +57277,6 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v54, v53 ; VI-NEXT: v_mov_b32_e32 v52, v36 ; VI-NEXT: 
v_mov_b32_e32 v49, v51 @@ -57252,7 +57286,7 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; VI-NEXT: v_mov_b32_e32 v51, v41 ; VI-NEXT: v_mov_b32_e32 v36, v44 ; VI-NEXT: v_mov_b32_e32 v53, v54 -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_mov_b32_e32 v54, v60 ; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload @@ -57265,6 +57299,7 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; VI-NEXT: ; %bb.4: ; %cmp.true ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v37 ; VI-NEXT: s_add_i32 s28, s28, 3 ; VI-NEXT: s_and_b32 s4, s28, 0xff @@ -57849,8 +57884,8 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v5 -; GFX9-NEXT: v_lshlrev_b32_e32 v24, 8, v11 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v24, 8, v11 ; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v25, 8, v9 ; GFX9-NEXT: s_waitcnt vmcnt(5) @@ -58029,18 +58064,16 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: s_lshl_b32 s6, s19, 8 ; GFX9-NEXT: s_lshl_b32 s7, s23, 8 ; GFX9-NEXT: s_lshl_b32 s8, s27, 8 -; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload @@ -58067,8 +58100,9 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded 
Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -58084,16 +58118,14 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; GFX9-NEXT: v_or_b32_sdwa v1, v13, v41 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v55, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v0, v21, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v21, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload @@ -58105,11 +58137,10 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_or_b32_sdwa v0, v16, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v0, v16, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_mov_b32_e32 v61, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -58122,12 +58153,10 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: v_or_b32_sdwa v17, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, 
s[0:3], s32 offset:460 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_mov_b32_e32 v37, v0 ; GFX9-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v18, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload @@ -58141,22 +58170,17 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: v_or_b32_sdwa v19, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v53, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v50, v53 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v20, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload ; GFX9-NEXT: v_or_b32_sdwa v0, v32, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v52, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v21, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload ; GFX9-NEXT: v_or_b32_sdwa v1, v51, v59 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v22, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v58, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -58172,24 +58196,45 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: v_mov_b32_e32 v35, v62 ; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; 
GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload ; GFX9-NEXT: v_or_b32_sdwa v24, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v63, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(11) ; GFX9-NEXT: v_or_b32_sdwa v0, v62, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(10) +; GFX9-NEXT: v_or_b32_sdwa v1, v63, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v25, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(9) ; GFX9-NEXT: v_or_b32_sdwa v0, v54, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(7) ; GFX9-NEXT: v_or_b32_sdwa v1, v52, v43 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v26, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v33, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_or_b32_sdwa v1, v32, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_or_b32_sdwa v0, v44, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_or_b32_sdwa v1, v50, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_or_b32_sdwa v0, v48, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v1, v55, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v49, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v39, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_mov_b32_e32 v40, v30 ; GFX9-NEXT: v_or_b32_sdwa v30, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -58240,6 +58285,18 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: s_branch .LBB39_3 ; GFX9-NEXT: .LBB39_2: +; GFX9-NEXT: v_mov_b32_e32 v38, v51 +; GFX9-NEXT: buffer_load_dword v61, off, 
s[0:3], s32 offset:624 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; GFX9-NEXT: v_mov_b32_e32 v33, v43 ; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload @@ -58253,18 +58310,6 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; GFX9-NEXT: v_mov_b32_e32 v38, v51 -; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; GFX9-NEXT: v_mov_b32_e32 v35, v62 ; GFX9-NEXT: v_mov_b32_e32 v36, v31 ; GFX9-NEXT: v_mov_b32_e32 v40, v30 @@ -58702,7 +58747,7 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX11-TRUE16-LABEL: bitcast_v128i8_to_v32f32_scalar: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill +; GFX11-TRUE16-NEXT: s_clause 0x1f ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:476 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:472 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:468 @@ -58735,7 +58780,7 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:360 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:356 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:352 -; GFX11-TRUE16-NEXT: s_clause 0x7 ; 32-byte Folded Spill +; GFX11-TRUE16-NEXT: s_clause 0x7 ; 
GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:348 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:344 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:340 @@ -59592,7 +59637,7 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-TRUE16-NEXT: .LBB39_3: ; %end -; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload +; GFX11-TRUE16-NEXT: s_clause 0x1f ; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:320 ; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:324 ; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:328 @@ -59625,7 +59670,7 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:436 ; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:440 ; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:444 -; GFX11-TRUE16-NEXT: s_clause 0x7 ; 32-byte Folded Reload +; GFX11-TRUE16-NEXT: s_clause 0x7 ; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:448 ; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:452 ; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:456 @@ -59643,7 +59688,7 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v32f32_scalar: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill +; GFX11-FAKE16-NEXT: s_clause 0x1f ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:476 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:472 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:468 @@ -59676,7 +59721,7 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:360 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:356 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:352 -; GFX11-FAKE16-NEXT: s_clause 0x7 ; 32-byte Folded Spill +; GFX11-FAKE16-NEXT: s_clause 0x7 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:348 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:344 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:340 @@ -60533,7 +60578,7 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-FAKE16-NEXT: .LBB39_3: ; %end -; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload +; GFX11-FAKE16-NEXT: s_clause 0x1f ; GFX11-FAKE16-NEXT: scratch_load_b32 v111, off, s32 offset:320 ; GFX11-FAKE16-NEXT: scratch_load_b32 v110, off, s32 offset:324 ; GFX11-FAKE16-NEXT: scratch_load_b32 v109, off, s32 offset:328 @@ -60566,7 +60611,7 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:436 ; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:440 ; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:444 -; GFX11-FAKE16-NEXT: s_clause 0x7 ; 32-byte Folded Reload +; GFX11-FAKE16-NEXT: s_clause 0x7 ; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:448 ; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:452 ; GFX11-FAKE16-NEXT: 
scratch_load_b32 v45, off, s32 offset:456 @@ -61129,25 +61174,10 @@ define <64 x bfloat> @bitcast_v32f32_to_v64bf16(<32 x float> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 @@ -61326,6 +61356,22 @@ define <64 x bfloat> @bitcast_v32f32_to_v64bf16(<32 x float> %a, i32 %b) { ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 
offset:72 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -62094,20 +62140,6 @@ define inreg <64 x bfloat> @bitcast_v32f32_to_v64bf16_scalar(<32 x float> inreg ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: v_readlane_b32 s99, v63, 35 ; SI-NEXT: v_readlane_b32 s98, v63, 34 ; SI-NEXT: v_readlane_b32 s97, v63, 33 @@ -62144,9 +62176,10 @@ define inreg <64 x bfloat> @bitcast_v32f32_to_v64bf16_scalar(<32 x float> inreg ; SI-NEXT: v_readlane_b32 s34, v63, 2 ; SI-NEXT: v_readlane_b32 s31, v63, 1 ; SI-NEXT: v_readlane_b32 s30, v63, 0 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 @@ -62160,6 +62193,20 @@ define inreg <64 x bfloat> @bitcast_v32f32_to_v64bf16_scalar(<32 x float> inreg ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; 
SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload @@ -64192,10 +64239,7 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-LABEL: bitcast_v64bf16_to_v32f32: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: s_clause 0x1 -; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 -; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 -; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Spill +; GFX11-TRUE16-NEXT: s_clause 0xf ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:68 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:64 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:60 @@ -64212,6 +64256,9 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:16 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:12 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:8 +; GFX11-TRUE16-NEXT: s_clause 0x1 +; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v32 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 @@ -64777,7 +64824,7 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, v60 :: v_dual_mov_b32 v29, v61 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v30, v62 :: v_dual_mov_b32 v31, v63 -; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Reload +; GFX11-TRUE16-NEXT: s_clause 0xf ; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:8 ; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:12 ; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:16 @@ -67244,7 +67291,7 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 -; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill +; GFX11-TRUE16-NEXT: s_clause 0x1f ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:280 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:276 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:272 @@ -67277,7 +67324,7 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:164 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:160 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:156 -; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill +; GFX11-TRUE16-NEXT: s_clause 0x1f ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:152 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:148 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:144 @@ -67310,7 +67357,7 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; GFX11-TRUE16-NEXT: 
scratch_store_b32 off, v157, s32 offset:36 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:32 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:28 -; GFX11-TRUE16-NEXT: s_clause 0x6 ; 28-byte Folded Spill +; GFX11-TRUE16-NEXT: s_clause 0x6 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:24 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:20 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:16 @@ -68112,7 +68159,7 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v172 :: v_dual_mov_b32 v21, v169 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, v170 :: v_dual_mov_b32 v23, v183 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, v168 :: v_dual_mov_b32 v25, v181 -; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload +; GFX11-TRUE16-NEXT: s_clause 0x1f ; GFX11-TRUE16-NEXT: scratch_load_b32 v174, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_b32 v173, off, s32 offset:4 ; GFX11-TRUE16-NEXT: scratch_load_b32 v172, off, s32 offset:8 @@ -68145,7 +68192,7 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:116 ; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:120 ; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:124 -; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload +; GFX11-TRUE16-NEXT: s_clause 0x1f ; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:128 ; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:132 ; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:136 @@ -68178,7 +68225,7 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:244 ; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:248 ; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:252 -; GFX11-TRUE16-NEXT: s_clause 0x6 ; 28-byte Folded Reload +; GFX11-TRUE16-NEXT: s_clause 0x6 ; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:256 ; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:260 ; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:264 @@ -68218,7 +68265,7 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 -; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill +; GFX11-FAKE16-NEXT: s_clause 0x1f ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:288 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:284 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:280 @@ -68251,7 +68298,7 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:172 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:168 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:164 -; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill +; GFX11-FAKE16-NEXT: s_clause 0x1f ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:160 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:156 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:152 @@ -68284,7 +68331,7 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v157, s32 offset:44 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v158, 
s32 offset:40 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v159, s32 offset:36 -; GFX11-FAKE16-NEXT: s_clause 0x8 ; 36-byte Folded Spill +; GFX11-FAKE16-NEXT: s_clause 0x8 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v168, s32 offset:32 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v169, s32 offset:28 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v170, s32 offset:24 @@ -68976,7 +69023,7 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, v184 :: v_dual_mov_b32 v23, v174 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, v171 :: v_dual_mov_b32 v25, v169 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, v170 :: v_dual_mov_b32 v29, v180 -; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload +; GFX11-FAKE16-NEXT: s_clause 0x1f ; GFX11-FAKE16-NEXT: scratch_load_b32 v184, off, s32 ; GFX11-FAKE16-NEXT: scratch_load_b32 v175, off, s32 offset:4 ; GFX11-FAKE16-NEXT: scratch_load_b32 v174, off, s32 offset:8 @@ -69009,7 +69056,7 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; GFX11-FAKE16-NEXT: scratch_load_b32 v123, off, s32 offset:116 ; GFX11-FAKE16-NEXT: scratch_load_b32 v122, off, s32 offset:120 ; GFX11-FAKE16-NEXT: scratch_load_b32 v121, off, s32 offset:124 -; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload +; GFX11-FAKE16-NEXT: s_clause 0x1f ; GFX11-FAKE16-NEXT: scratch_load_b32 v120, off, s32 offset:128 ; GFX11-FAKE16-NEXT: scratch_load_b32 v111, off, s32 offset:132 ; GFX11-FAKE16-NEXT: scratch_load_b32 v110, off, s32 offset:136 @@ -69042,7 +69089,7 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:244 ; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:248 ; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:252 -; GFX11-FAKE16-NEXT: s_clause 0x8 ; 36-byte Folded Reload +; GFX11-FAKE16-NEXT: s_clause 0x8 ; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:256 ; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:260 ; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:264 @@ -69851,22 +69898,6 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v42 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: 
buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f16_f32_e32 v2, v38 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x74, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -69886,6 +69917,22 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -70556,6 +70603,11 @@ define inreg <64 x half> @bitcast_v32f32_to_v64f16_scalar(<32 x float> inreg %a, ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v5 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -70572,11 +70624,6 @@ define inreg <64 x half> @bitcast_v32f32_to_v64f16_scalar(<32 x float> inreg %a, ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v2, v5 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB45_4: @@ -72766,7 +72813,7 @@ define inreg <32 x float> @bitcast_v64f16_to_v32f32_scalar(<64 x half> inreg %a, ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 -; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded 
Spill +; GFX11-NEXT: s_clause 0x1f ; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:292 ; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:288 ; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:284 @@ -72799,7 +72846,7 @@ define inreg <32 x float> @bitcast_v64f16_to_v32f32_scalar(<64 x half> inreg %a, ; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:176 ; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:172 ; GFX11-NEXT: scratch_store_b32 off, v95, s32 offset:168 -; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Spill +; GFX11-NEXT: s_clause 0x1f ; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:164 ; GFX11-NEXT: scratch_store_b32 off, v105, s32 offset:160 ; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:156 @@ -72832,7 +72879,7 @@ define inreg <32 x float> @bitcast_v64f16_to_v32f32_scalar(<64 x half> inreg %a, ; GFX11-NEXT: scratch_store_b32 off, v157, s32 offset:48 ; GFX11-NEXT: scratch_store_b32 off, v158, s32 offset:44 ; GFX11-NEXT: scratch_store_b32 off, v159, s32 offset:40 -; GFX11-NEXT: s_clause 0x9 ; 40-byte Folded Spill +; GFX11-NEXT: s_clause 0x9 ; GFX11-NEXT: scratch_store_b32 off, v168, s32 offset:36 ; GFX11-NEXT: scratch_store_b32 off, v169, s32 offset:32 ; GFX11-NEXT: scratch_store_b32 off, v170, s32 offset:28 @@ -72910,7 +72957,7 @@ define inreg <32 x float> @bitcast_v64f16_to_v32f32_scalar(<64 x half> inreg %a, ; GFX11-NEXT: v_dual_mov_b32 v19, v174 :: v_dual_mov_b32 v20, v173 ; GFX11-NEXT: v_dual_mov_b32 v21, v172 :: v_dual_mov_b32 v22, v171 ; GFX11-NEXT: v_dual_mov_b32 v23, v170 :: v_dual_mov_b32 v24, v183 -; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Reload +; GFX11-NEXT: s_clause 0x1f ; GFX11-NEXT: scratch_load_b32 v185, off, s32 ; GFX11-NEXT: scratch_load_b32 v184, off, s32 offset:4 ; GFX11-NEXT: scratch_load_b32 v175, off, s32 offset:8 @@ -72943,7 +72990,7 @@ define inreg <32 x float> @bitcast_v64f16_to_v32f32_scalar(<64 x half> inreg %a, ; GFX11-NEXT: scratch_load_b32 v124, off, s32 offset:116 ; GFX11-NEXT: scratch_load_b32 v123, off, s32 offset:120 ; GFX11-NEXT: scratch_load_b32 v122, off, s32 offset:124 -; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Reload +; GFX11-NEXT: s_clause 0x1f ; GFX11-NEXT: scratch_load_b32 v121, off, s32 offset:128 ; GFX11-NEXT: scratch_load_b32 v120, off, s32 offset:132 ; GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:136 @@ -72976,7 +73023,7 @@ define inreg <32 x float> @bitcast_v64f16_to_v32f32_scalar(<64 x half> inreg %a, ; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:244 ; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:248 ; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:252 -; GFX11-NEXT: s_clause 0x9 ; 40-byte Folded Reload +; GFX11-NEXT: s_clause 0x9 ; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:256 ; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:260 ; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:264 @@ -73034,10 +73081,6 @@ define <64 x i16> @bitcast_v32f32_to_v64i16(<32 x float> %a, i32 %b) { ; SI-LABEL: bitcast_v32f32_to_v64i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 -; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill @@ -73054,6 +73097,10 @@ define 
<64 x i16> @bitcast_v32f32_to_v64i16(<32 x float> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 +; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr63 @@ -73085,13 +73132,14 @@ define <64 x i16> @bitcast_v32f32_to_v64i16(<32 x float> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; kill: killed $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB48_2 ; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_alignbit_b32 v33, v31, v32, 16 ; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: v_alignbit_b32 v34, v30, v29, 16 @@ -73136,6 +73184,7 @@ define <64 x i16> @bitcast_v32f32_to_v64i16(<32 x float> %a, i32 %b) { ; SI-NEXT: s_cbranch_execz .LBB48_4 ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: v_add_f32_e32 v31, 1.0, v31 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_f32_e32 v32, 1.0, v32 ; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 @@ -73362,22 +73411,6 @@ define <64 x i16> @bitcast_v32f32_to_v64i16(<32 x float> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v53 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v27 @@ -73404,7 +73437,7 @@ define <64 x i16> @bitcast_v32f32_to_v64i16(<32 x float> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 ; SI-NEXT: buffer_store_dword v1, v2, 
s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v32 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 @@ -73417,6 +73450,22 @@ define <64 x i16> @bitcast_v32f32_to_v64i16(<32 x float> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -73902,25 +73951,9 @@ define inreg <64 x i16> @bitcast_v32f32_to_v64i16_scalar(<32 x float> inreg %a, ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: 
buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 @@ -73946,6 +73979,22 @@ define inreg <64 x i16> @bitcast_v32f32_to_v64i16_scalar(<32 x float> inreg %a, ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB49_4: @@ -74324,13 +74373,7 @@ define <32 x float> @bitcast_v64i16_to_v32f32(<64 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44 ; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v33 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:84 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 -; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:36 @@ -74362,6 +74405,12 @@ define <32 x float> @bitcast_v64i16_to_v32f32(<64 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:100 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; SI-NEXT: ; implicit-def: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -74377,6 +74426,7 @@ define <32 x float> @bitcast_v64i16_to_v32f32(<64 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_and_b32_e32 v22, 0xffff, v41 ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) @@ -74593,6 +74643,7 @@ define <32 x float> @bitcast_v64i16_to_v32f32(<64 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v41 ; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload @@ -75766,7 +75817,7 @@ define inreg <32 x float> @bitcast_v64i16_to_v32f32_scalar(<64 x i16> inreg %a, ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 -; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Spill +; GFX11-NEXT: s_clause 0x1f ; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:292 ; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:288 ; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:284 @@ -75799,7 +75850,7 @@ define inreg <32 x float> @bitcast_v64i16_to_v32f32_scalar(<64 x i16> inreg %a, ; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:176 ; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:172 ; GFX11-NEXT: scratch_store_b32 off, v95, s32 offset:168 -; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Spill +; GFX11-NEXT: s_clause 0x1f ; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:164 ; GFX11-NEXT: scratch_store_b32 off, v105, s32 offset:160 ; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:156 @@ -75832,7 +75883,7 @@ define inreg <32 x float> @bitcast_v64i16_to_v32f32_scalar(<64 x i16> inreg %a, ; GFX11-NEXT: scratch_store_b32 off, v157, s32 offset:48 ; GFX11-NEXT: scratch_store_b32 off, v158, s32 offset:44 ; GFX11-NEXT: scratch_store_b32 off, v159, s32 offset:40 -; GFX11-NEXT: s_clause 0x9 ; 40-byte Folded Spill +; GFX11-NEXT: s_clause 0x9 ; GFX11-NEXT: scratch_store_b32 off, v168, s32 offset:36 ; GFX11-NEXT: scratch_store_b32 off, v169, s32 offset:32 ; GFX11-NEXT: scratch_store_b32 off, v170, s32 offset:28 @@ -75910,7 +75961,7 @@ define inreg <32 x float> @bitcast_v64i16_to_v32f32_scalar(<64 x i16> inreg %a, ; GFX11-NEXT: v_dual_mov_b32 v19, v174 :: v_dual_mov_b32 v20, v173 ; GFX11-NEXT: v_dual_mov_b32 v21, v172 :: v_dual_mov_b32 v22, v171 ; GFX11-NEXT: v_dual_mov_b32 v23, v170 :: v_dual_mov_b32 v24, v183 -; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Reload +; GFX11-NEXT: s_clause 0x1f ; GFX11-NEXT: scratch_load_b32 v185, off, s32 ; GFX11-NEXT: scratch_load_b32 v184, off, s32 offset:4 ; GFX11-NEXT: scratch_load_b32 v175, off, s32 offset:8 @@ -75943,7 +75994,7 @@ define inreg <32 x float> @bitcast_v64i16_to_v32f32_scalar(<64 x 
i16> inreg %a, ; GFX11-NEXT: scratch_load_b32 v124, off, s32 offset:116 ; GFX11-NEXT: scratch_load_b32 v123, off, s32 offset:120 ; GFX11-NEXT: scratch_load_b32 v122, off, s32 offset:124 -; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Reload +; GFX11-NEXT: s_clause 0x1f ; GFX11-NEXT: scratch_load_b32 v121, off, s32 offset:128 ; GFX11-NEXT: scratch_load_b32 v120, off, s32 offset:132 ; GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:136 @@ -75976,7 +76027,7 @@ define inreg <32 x float> @bitcast_v64i16_to_v32f32_scalar(<64 x i16> inreg %a, ; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:244 ; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:248 ; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:252 -; GFX11-NEXT: s_clause 0x9 ; 40-byte Folded Reload +; GFX11-NEXT: s_clause 0x9 ; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:256 ; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:260 ; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:264 @@ -77003,6 +77054,22 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; SI-LABEL: bitcast_v16i64_to_v128i8: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 @@ -77135,22 +77202,6 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; kill: killed $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte 
Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr41 @@ -77182,13 +77233,14 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; kill: killed $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB56_2 ; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_alignbit_b32 v33, v31, v32, 24 ; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -77449,6 +77501,7 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; SI-NEXT: v_addc_u32_e32 v28, vcc, 0, v28, vcc ; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 ; SI-NEXT: v_addc_u32_e32 v30, vcc, 0, v30, vcc +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v32 ; SI-NEXT: v_addc_u32_e32 v31, vcc, 0, v31, vcc ; SI-NEXT: v_alignbit_b32 v33, v31, v32, 24 @@ -77842,24 +77895,8 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v13 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:232 ; 4-byte 
Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload @@ -78207,12 +78244,44 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v16i64_to_v128i8: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 @@ -78317,22 +78386,6 @@ define 
<128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr39 ; VI-NEXT: ; kill: killed $vgpr39 ; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; VI-NEXT: ; implicit-def: $vgpr59 ; VI-NEXT: ; kill: killed $vgpr39 ; VI-NEXT: ; implicit-def: $vgpr39 @@ -78438,129 +78491,129 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v26 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[31:32] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v26 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v25 -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v25 -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[29:30] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v24 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v24 -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v24 -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[27:28] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v23 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v23 -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:96 ; 4-byte 
Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v22 -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[25:26] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v22 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v22 -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v21 -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[23:24] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v21 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v20 -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v20 -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[21:22] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v20 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v19 -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v19 -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[19:20] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v18 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v18 -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v18 -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[17:18] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v17 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v17 -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v16 -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[15:16] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v16 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v16 -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; VI-NEXT: 
buffer_store_dword v55, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v15 -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[13:14] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v15 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v14 -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v14 -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[11:12] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v14 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v13 -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v13 -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[9:10] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v12 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v12 -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v39, 24, v32 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v12 -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[7:8] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v11 -; VI-NEXT: v_mov_b32_e32 v55, v39 -; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[5:6] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v11 -; VI-NEXT: v_lshrrev_b64 v[40:41], 24, v[3:4] +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[31:32] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v10 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[29:30] +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[27:28] +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[25:26] +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:104 ; 4-byte 
Folded Spill +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[23:24] +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[21:22] +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[19:20] +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[17:18] +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[15:16] +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[13:14] +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[11:12] +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[9:10] +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v39, 24, v32 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[7:8] +; VI-NEXT: v_mov_b32_e32 v55, v39 +; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[5:6] +; VI-NEXT: v_lshrrev_b64 v[40:41], 24, v[3:4] ; VI-NEXT: v_lshrrev_b64 v[41:42], 24, v[1:2] ; VI-NEXT: v_lshrrev_b32_e32 v58, 8, v27 ; VI-NEXT: v_lshrrev_b32_e32 v59, 24, v10 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v35, 8, v10 ; VI-NEXT: v_lshrrev_b32_e32 v60, 16, v9 ; VI-NEXT: v_lshrrev_b32_e32 v49, 8, v9 @@ -79009,10 +79062,9 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: v_or_b32_sdwa v1, v21, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -79152,8 +79204,15 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x78, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v55 ; VI-NEXT: v_or_b32_sdwa v2, v42, 
v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7c, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload @@ -79170,23 +79229,12 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7c, v0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; VI-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: bitcast_v16i64_to_v128i8: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill @@ -79203,6 +79251,9 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: ; kill: killed $vgpr40 ; GFX9-NEXT: ; implicit-def: $vgpr40 @@ -79335,6 +79386,7 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX9-NEXT: ; kill: killed $vgpr40 ; GFX9-NEXT: ; implicit-def: $vgpr41 ; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill @@ -79390,7 +79442,7 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(45) +; GFX9-NEXT: s_waitcnt 
vmcnt(29) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 ; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: ; kill: killed $vgpr33 @@ -79405,7 +79457,7 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v32 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(47) +; GFX9-NEXT: s_waitcnt vmcnt(31) ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v31 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v31 @@ -79417,147 +79469,149 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v30 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v29 -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[31:32] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v29 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v28 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v28 -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[29:30] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v28 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v27 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v27 -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[27:28] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v26 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v26 -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[25:26] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v25 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v25 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v24 -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, 
v[23:24] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v24 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v24 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v23 -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[21:22] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v23 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v22 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22 -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[19:20] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v22 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v21 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v21 -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[17:18] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v20 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v20 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v20 -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[15:16] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v19 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v19 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v18 -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[13:14] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v18 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v18 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:148 ; 4-byte 
Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v17 -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[11:12] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v17 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v16 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v16 -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[9:10] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v15 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v14 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v14 -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[7:8] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v13 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v12 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v12 -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[5:6] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v11 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v10 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[31:32] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v10 ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v9 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[29:30] +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-NEXT: 
v_lshrrev_b64 v[40:41], 24, v[27:28] +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[25:26] +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[23:24] +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[21:22] +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[19:20] +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[17:18] +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[15:16] +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[13:14] +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[11:12] +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[9:10] +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[7:8] +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[5:6] +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[3:4] ; GFX9-NEXT: v_lshrrev_b32_e32 v57, 8, v16 ; GFX9-NEXT: v_lshrrev_b32_e32 v59, 8, v15 @@ -79565,9 +79619,7 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v50, 8, v13 ; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v12 ; GFX9-NEXT: v_lshrrev_b32_e32 v52, 8, v11 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v10 -; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill ; GFX9-NEXT: 
v_lshrrev_b32_e32 v37, 8, v9 ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 24, v8 ; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v8 @@ -79624,7 +79676,7 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX9-NEXT: v_addc_co_u32_e32 v28, vcc, 0, v28, vcc ; GFX9-NEXT: v_add_co_u32_e32 v29, vcc, 3, v29 ; GFX9-NEXT: v_addc_co_u32_e32 v30, vcc, 0, v30, vcc -; GFX9-NEXT: s_waitcnt vmcnt(44) +; GFX9-NEXT: s_waitcnt vmcnt(28) ; GFX9-NEXT: v_add_co_u32_e32 v31, vcc, 3, v31 ; GFX9-NEXT: v_addc_co_u32_e32 v32, vcc, 0, v32, vcc ; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32] @@ -79903,25 +79955,9 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(18) +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX9-NEXT: s_waitcnt vmcnt(16) +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:32 @@ -80193,6 +80229,22 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:124 +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, 
off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -80660,11 +80712,7 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX11-FAKE16-LABEL: bitcast_v16i64_to_v128i8: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_clause 0x2 -; GFX11-FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:8 -; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 -; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32 -; GFX11-FAKE16-NEXT: s_clause 0x13 ; 80-byte Folded Spill +; GFX11-FAKE16-NEXT: s_clause 0x13 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:88 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:84 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:80 @@ -80685,6 +80733,10 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:20 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:16 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:12 +; GFX11-FAKE16-NEXT: s_clause 0x2 +; GFX11-FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr75 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr74 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 @@ -81329,7 +81381,7 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:80 ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:96 ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:112 -; GFX11-FAKE16-NEXT: s_clause 0x13 ; 80-byte Folded Reload +; GFX11-FAKE16-NEXT: s_clause 0x13 ; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:12 ; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:16 ; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:20 @@ -84579,7 +84631,7 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_xor_saveexec_b32 s4, -1 -; GFX11-NEXT: s_clause 0x3 ; 16-byte Folded Spill +; GFX11-NEXT: s_clause 0x3 ; GFX11-NEXT: scratch_store_b32 off, v16, s32 ; GFX11-NEXT: scratch_store_b32 off, v17, s32 offset:4 ; GFX11-NEXT: scratch_store_b32 off, v18, s32 offset:8 @@ -85514,7 +85566,7 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; GFX11-NEXT: v_readlane_b32 s35, v16, 3 ; GFX11-NEXT: v_readlane_b32 s34, 
v16, 2 ; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 -; GFX11-NEXT: s_clause 0x3 ; 16-byte Folded Reload +; GFX11-NEXT: s_clause 0x3 ; GFX11-NEXT: scratch_load_b32 v16, off, s32 ; GFX11-NEXT: scratch_load_b32 v17, off, s32 offset:4 ; GFX11-NEXT: scratch_load_b32 v18, off, s32 offset:8 @@ -85727,26 +85779,13 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:188 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:108 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:84 -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:52 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:196 @@ -85907,30 +85946,44 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:356 ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v2 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:384 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:380 ; SI-NEXT: v_lshlrev_b32_e32 v43, 8, v3 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:364 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:372 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:384 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:380 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; 
SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:52 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB58_2 ; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload @@ -85939,11 +85992,11 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_and_b32_e32 v9, 0xff, v49 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload @@ -86546,6 +86599,7 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB58_4 ; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload @@ -86559,8 +86613,8 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: 
buffer_load_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v49 ; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 @@ -87240,25 +87294,13 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:208 ; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:216 ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188 -; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44 -; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36 -; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28 -; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:108 -; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100 -; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92 -; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84 -; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76 -; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68 -; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:60 -; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52 -; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196 ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 @@ -87395,20 +87437,34 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 -; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364 -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshlrev_b16_e32 v63, 8, v0 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:372 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v63, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36 +; VI-NEXT: 
buffer_load_ushort v60, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:108 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92 +; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84 +; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76 +; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68 +; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -87894,6 +87950,7 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(13) ; VI-NEXT: v_add_u16_e32 v9, 3, v61 ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload @@ -88471,27 +88528,13 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:208 ; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:216 ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188 -; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44 -; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36 -; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28 -; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:20 -; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:108 -; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100 -; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92 -; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84 -; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76 -; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68 -; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:60 -; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52 -; GFX9-NEXT: s_waitcnt vmcnt(18) +; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(18) +; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(16) +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 @@ -88633,20 +88676,34 @@ define <16 x i64> 
@bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v63, 8, v0 +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364 -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:372 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v63, 8, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:108 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100 +; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92 +; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84 +; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76 +; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68 +; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -89133,6 +89190,7 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(13) ; GFX9-NEXT: v_add_u16_e32 v9, 3, v61 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload @@ -90271,7 +90329,7 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v16i64: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_clause 
0x1f ; 128-byte Folded Spill +; GFX11-FAKE16-NEXT: s_clause 0x1f ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:592 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:588 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:584 @@ -90304,7 +90362,7 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:476 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:472 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:468 -; GFX11-FAKE16-NEXT: s_clause 0x12 ; 76-byte Folded Spill +; GFX11-FAKE16-NEXT: s_clause 0x12 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:464 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:460 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:456 @@ -91245,7 +91303,7 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v35, v36 ; GFX11-FAKE16-NEXT: .LBB58_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload +; GFX11-FAKE16-NEXT: s_clause 0x1f ; GFX11-FAKE16-NEXT: scratch_load_b32 v138, off, s32 offset:392 ; GFX11-FAKE16-NEXT: scratch_load_b32 v137, off, s32 offset:396 ; GFX11-FAKE16-NEXT: scratch_load_b32 v136, off, s32 offset:400 @@ -91278,7 +91336,7 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:508 ; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:512 ; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:516 -; GFX11-FAKE16-NEXT: s_clause 0x12 ; 76-byte Folded Reload +; GFX11-FAKE16-NEXT: s_clause 0x12 ; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:520 ; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:524 ; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:528 @@ -91995,13 +92053,24 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_mov_b64 s[4:5], 0 ; SI-NEXT: s_branch .LBB59_3 ; SI-NEXT: .LBB59_2: +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload ; SI-NEXT: v_mov_b32_e32 v55, v56 ; SI-NEXT: v_mov_b32_e32 v42, v46 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(4) ; SI-NEXT: buffer_load_dword 
v62, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload @@ -92012,22 +92081,10 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 s[4:5], -1 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: .LBB59_3: ; %Flow -; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_mov_b32_e32 v35, v57 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload @@ -92037,6 +92094,7 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: ; %bb.4: ; %cmp.true ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v44 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 @@ -92631,13 +92689,13 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v19 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v21 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v3 ; VI-NEXT: v_lshlrev_b32_e32 v16, 8, v5 ; VI-NEXT: v_lshlrev_b32_e32 v47, 8, v7 ; VI-NEXT: v_lshlrev_b32_e32 v46, 8, v9 ; VI-NEXT: v_lshlrev_b32_e32 v10, 8, v11 ; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v13 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v17 ; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 @@ -92865,11 +92923,11 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_or_b32_sdwa v0, 
v2, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v4, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v2, v6, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload ; VI-NEXT: s_and_b32 s4, s28, 0xff ; VI-NEXT: s_lshl_b32 s5, s29, 8 ; VI-NEXT: s_or_b32 s4, s4, s5 @@ -92879,8 +92937,11 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: s_lshl_b32 s7, s23, 8 ; VI-NEXT: s_lshl_b32 s8, s27, 8 ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v0, v0, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v3, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v0, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v1, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload @@ -92888,8 +92949,6 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v3, v3, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -93098,6 +93157,12 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: s_mov_b64 s[4:5], 0 ; VI-NEXT: s_branch .LBB59_3 ; VI-NEXT: .LBB59_2: +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v44, v56 ; VI-NEXT: v_mov_b32_e32 v41, v33 ; VI-NEXT: v_mov_b32_e32 v50, v40 @@ -93115,12 +93180,6 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:556 ; 4-byte 
Folded Reload ; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v54, v53 ; VI-NEXT: v_mov_b32_e32 v52, v36 ; VI-NEXT: v_mov_b32_e32 v49, v51 @@ -93130,7 +93189,7 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: v_mov_b32_e32 v51, v41 ; VI-NEXT: v_mov_b32_e32 v36, v44 ; VI-NEXT: v_mov_b32_e32 v53, v54 -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_mov_b32_e32 v54, v60 ; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload @@ -93143,6 +93202,7 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: ; %bb.4: ; %cmp.true ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v37 ; VI-NEXT: s_add_i32 s28, s28, 3 ; VI-NEXT: s_and_b32 s4, s28, 0xff @@ -93727,8 +93787,8 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v5 -; GFX9-NEXT: v_lshlrev_b32_e32 v24, 8, v11 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v24, 8, v11 ; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v25, 8, v9 ; GFX9-NEXT: s_waitcnt vmcnt(5) @@ -93907,18 +93967,16 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: s_lshl_b32 s6, s19, 8 ; GFX9-NEXT: s_lshl_b32 s7, s23, 8 ; GFX9-NEXT: s_lshl_b32 s8, s27, 8 -; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, 
v0, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload @@ -93945,8 +94003,9 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -93962,16 +94021,14 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; GFX9-NEXT: v_or_b32_sdwa v1, v13, v41 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v55, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v0, v21, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v21, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload @@ -93983,11 +94040,10 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_or_b32_sdwa v0, v16, v0 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v0, v16, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_mov_b32_e32 v61, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -94000,12 +94056,10 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: v_or_b32_sdwa v17, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_mov_b32_e32 v37, v0 ; GFX9-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v18, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload @@ -94019,22 +94073,17 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: v_or_b32_sdwa v19, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v53, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v50, v53 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v20, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload ; GFX9-NEXT: v_or_b32_sdwa v0, v32, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v52, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v21, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload ; GFX9-NEXT: v_or_b32_sdwa v1, v51, v59 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; 
GFX9-NEXT: v_or_b32_sdwa v22, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v58, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -94050,24 +94099,45 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: v_mov_b32_e32 v35, v62 ; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload ; GFX9-NEXT: v_or_b32_sdwa v24, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v63, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(11) ; GFX9-NEXT: v_or_b32_sdwa v0, v62, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(10) +; GFX9-NEXT: v_or_b32_sdwa v1, v63, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v25, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(9) ; GFX9-NEXT: v_or_b32_sdwa v0, v54, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(7) ; GFX9-NEXT: v_or_b32_sdwa v1, v52, v43 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v26, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v33, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_or_b32_sdwa v1, v32, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_or_b32_sdwa v0, v44, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_or_b32_sdwa v1, v50, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_or_b32_sdwa v0, v48, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v1, v55, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; 
GFX9-NEXT: v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v49, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v39, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_mov_b32_e32 v40, v30 ; GFX9-NEXT: v_or_b32_sdwa v30, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -94118,6 +94188,18 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: s_branch .LBB59_3 ; GFX9-NEXT: .LBB59_2: +; GFX9-NEXT: v_mov_b32_e32 v38, v51 +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; GFX9-NEXT: v_mov_b32_e32 v33, v43 ; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload @@ -94131,18 +94213,6 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; GFX9-NEXT: v_mov_b32_e32 v38, v51 -; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; GFX9-NEXT: v_mov_b32_e32 v35, v62 ; GFX9-NEXT: v_mov_b32_e32 v36, v31 ; GFX9-NEXT: v_mov_b32_e32 v40, v30 @@ -94580,7 +94650,7 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX11-TRUE16-LABEL: 
bitcast_v128i8_to_v16i64_scalar: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill +; GFX11-TRUE16-NEXT: s_clause 0x1f ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:476 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:472 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:468 @@ -94613,7 +94683,7 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:360 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:356 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:352 -; GFX11-TRUE16-NEXT: s_clause 0x7 ; 32-byte Folded Spill +; GFX11-TRUE16-NEXT: s_clause 0x7 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:348 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:344 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:340 @@ -95470,7 +95540,7 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-TRUE16-NEXT: .LBB59_3: ; %end -; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload +; GFX11-TRUE16-NEXT: s_clause 0x1f ; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:320 ; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:324 ; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:328 @@ -95503,7 +95573,7 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:436 ; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:440 ; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:444 -; GFX11-TRUE16-NEXT: s_clause 0x7 ; 32-byte Folded Reload +; GFX11-TRUE16-NEXT: s_clause 0x7 ; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:448 ; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:452 ; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:456 @@ -95521,7 +95591,7 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v16i64_scalar: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill +; GFX11-FAKE16-NEXT: s_clause 0x1f ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:476 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:472 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:468 @@ -95554,7 +95624,7 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:360 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:356 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:352 -; GFX11-FAKE16-NEXT: s_clause 0x7 ; 32-byte Folded Spill +; GFX11-FAKE16-NEXT: s_clause 0x7 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:348 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:344 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:340 @@ -96411,7 +96481,7 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-FAKE16-NEXT: .LBB59_3: ; %end -; 
GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload +; GFX11-FAKE16-NEXT: s_clause 0x1f ; GFX11-FAKE16-NEXT: scratch_load_b32 v111, off, s32 offset:320 ; GFX11-FAKE16-NEXT: scratch_load_b32 v110, off, s32 offset:324 ; GFX11-FAKE16-NEXT: scratch_load_b32 v109, off, s32 offset:328 @@ -96444,7 +96514,7 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:436 ; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:440 ; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:444 -; GFX11-FAKE16-NEXT: s_clause 0x7 ; 32-byte Folded Reload +; GFX11-FAKE16-NEXT: s_clause 0x7 ; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:448 ; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:452 ; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:456 @@ -97008,25 +97078,10 @@ define <64 x bfloat> @bitcast_v16i64_to_v64bf16(<16 x i64> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 @@ -97205,6 +97260,22 @@ define <64 x bfloat> @bitcast_v16i64_to_v64bf16(<16 x i64> %a, i32 %b) { ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; 
SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -100013,10 +100084,7 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-LABEL: bitcast_v64bf16_to_v16i64: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: s_clause 0x1 -; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 -; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 -; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Spill +; GFX11-TRUE16-NEXT: s_clause 0xf ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:68 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:64 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:60 @@ -100033,6 +100101,9 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:16 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:12 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:8 +; GFX11-TRUE16-NEXT: s_clause 0x1 +; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v32 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 @@ -100598,7 +100669,7 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, v60 :: v_dual_mov_b32 v29, v61 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v30, v62 :: v_dual_mov_b32 v31, v63 -; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Reload +; GFX11-TRUE16-NEXT: s_clause 0xf ; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:8 ; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:12 ; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:16 @@ -103065,7 +103136,7 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 -; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill +; GFX11-TRUE16-NEXT: s_clause 0x1f ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:280 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:276 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:272 @@ -103098,7 +103169,7 @@ define inreg <16 x i64> 
@bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:164 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:160 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:156 -; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill +; GFX11-TRUE16-NEXT: s_clause 0x1f ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:152 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:148 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:144 @@ -103131,7 +103202,7 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:36 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:32 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:28 -; GFX11-TRUE16-NEXT: s_clause 0x6 ; 28-byte Folded Spill +; GFX11-TRUE16-NEXT: s_clause 0x6 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:24 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:20 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:16 @@ -103933,7 +104004,7 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v172 :: v_dual_mov_b32 v21, v169 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, v170 :: v_dual_mov_b32 v23, v183 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, v168 :: v_dual_mov_b32 v25, v181 -; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload +; GFX11-TRUE16-NEXT: s_clause 0x1f ; GFX11-TRUE16-NEXT: scratch_load_b32 v174, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_b32 v173, off, s32 offset:4 ; GFX11-TRUE16-NEXT: scratch_load_b32 v172, off, s32 offset:8 @@ -103966,7 +104037,7 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:116 ; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:120 ; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:124 -; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload +; GFX11-TRUE16-NEXT: s_clause 0x1f ; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:128 ; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:132 ; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:136 @@ -103999,7 +104070,7 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:244 ; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:248 ; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:252 -; GFX11-TRUE16-NEXT: s_clause 0x6 ; 28-byte Folded Reload +; GFX11-TRUE16-NEXT: s_clause 0x6 ; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:256 ; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:260 ; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:264 @@ -104039,7 +104110,7 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 -; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill +; GFX11-FAKE16-NEXT: s_clause 0x1f ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:288 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:284 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:280 @@ -104072,7 +104143,7 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; 
GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:172 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:168 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:164 -; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill +; GFX11-FAKE16-NEXT: s_clause 0x1f ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:160 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:156 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:152 @@ -104105,7 +104176,7 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v157, s32 offset:44 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v158, s32 offset:40 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v159, s32 offset:36 -; GFX11-FAKE16-NEXT: s_clause 0x8 ; 36-byte Folded Spill +; GFX11-FAKE16-NEXT: s_clause 0x8 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v168, s32 offset:32 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v169, s32 offset:28 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v170, s32 offset:24 @@ -104797,7 +104868,7 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, v184 :: v_dual_mov_b32 v23, v174 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, v171 :: v_dual_mov_b32 v25, v169 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, v170 :: v_dual_mov_b32 v29, v180 -; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload +; GFX11-FAKE16-NEXT: s_clause 0x1f ; GFX11-FAKE16-NEXT: scratch_load_b32 v184, off, s32 ; GFX11-FAKE16-NEXT: scratch_load_b32 v175, off, s32 offset:4 ; GFX11-FAKE16-NEXT: scratch_load_b32 v174, off, s32 offset:8 @@ -104830,7 +104901,7 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; GFX11-FAKE16-NEXT: scratch_load_b32 v123, off, s32 offset:116 ; GFX11-FAKE16-NEXT: scratch_load_b32 v122, off, s32 offset:120 ; GFX11-FAKE16-NEXT: scratch_load_b32 v121, off, s32 offset:124 -; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload +; GFX11-FAKE16-NEXT: s_clause 0x1f ; GFX11-FAKE16-NEXT: scratch_load_b32 v120, off, s32 offset:128 ; GFX11-FAKE16-NEXT: scratch_load_b32 v111, off, s32 offset:132 ; GFX11-FAKE16-NEXT: scratch_load_b32 v110, off, s32 offset:136 @@ -104863,7 +104934,7 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:244 ; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:248 ; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:252 -; GFX11-FAKE16-NEXT: s_clause 0x8 ; 36-byte Folded Reload +; GFX11-FAKE16-NEXT: s_clause 0x8 ; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:256 ; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:260 ; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:264 @@ -105669,22 +105740,6 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v42 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte 
Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f16_f32_e32 v2, v48 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x74, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -105704,6 +105759,22 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -106382,25 +106453,10 @@ define inreg <64 x half> @bitcast_v16i64_to_v64f16_scalar(<16 x i64> inreg %a, i ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], 
s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f16_f32_e32 v2, v7 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x78, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 @@ -106413,6 +106469,21 @@ define inreg <64 x half> @bitcast_v16i64_to_v64f16_scalar(<16 x i64> inreg %a, i ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[4:5] @@ -108629,7 +108700,7 @@ define inreg <16 x i64> @bitcast_v64f16_to_v16i64_scalar(<64 x half> inreg %a, i ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 -; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Spill +; GFX11-NEXT: s_clause 0x1f ; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:292 ; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:288 ; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:284 @@ -108662,7 +108733,7 @@ define inreg <16 x i64> @bitcast_v64f16_to_v16i64_scalar(<64 x half> inreg %a, i ; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:176 ; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:172 ; GFX11-NEXT: scratch_store_b32 off, v95, s32 offset:168 -; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Spill +; GFX11-NEXT: s_clause 0x1f ; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:164 ; GFX11-NEXT: scratch_store_b32 off, v105, s32 offset:160 ; GFX11-NEXT: 
scratch_store_b32 off, v106, s32 offset:156 @@ -108695,7 +108766,7 @@ define inreg <16 x i64> @bitcast_v64f16_to_v16i64_scalar(<64 x half> inreg %a, i ; GFX11-NEXT: scratch_store_b32 off, v157, s32 offset:48 ; GFX11-NEXT: scratch_store_b32 off, v158, s32 offset:44 ; GFX11-NEXT: scratch_store_b32 off, v159, s32 offset:40 -; GFX11-NEXT: s_clause 0x9 ; 40-byte Folded Spill +; GFX11-NEXT: s_clause 0x9 ; GFX11-NEXT: scratch_store_b32 off, v168, s32 offset:36 ; GFX11-NEXT: scratch_store_b32 off, v169, s32 offset:32 ; GFX11-NEXT: scratch_store_b32 off, v170, s32 offset:28 @@ -108773,7 +108844,7 @@ define inreg <16 x i64> @bitcast_v64f16_to_v16i64_scalar(<64 x half> inreg %a, i ; GFX11-NEXT: v_dual_mov_b32 v19, v174 :: v_dual_mov_b32 v20, v173 ; GFX11-NEXT: v_dual_mov_b32 v21, v172 :: v_dual_mov_b32 v22, v171 ; GFX11-NEXT: v_dual_mov_b32 v23, v170 :: v_dual_mov_b32 v24, v183 -; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Reload +; GFX11-NEXT: s_clause 0x1f ; GFX11-NEXT: scratch_load_b32 v185, off, s32 ; GFX11-NEXT: scratch_load_b32 v184, off, s32 offset:4 ; GFX11-NEXT: scratch_load_b32 v175, off, s32 offset:8 @@ -108806,7 +108877,7 @@ define inreg <16 x i64> @bitcast_v64f16_to_v16i64_scalar(<64 x half> inreg %a, i ; GFX11-NEXT: scratch_load_b32 v124, off, s32 offset:116 ; GFX11-NEXT: scratch_load_b32 v123, off, s32 offset:120 ; GFX11-NEXT: scratch_load_b32 v122, off, s32 offset:124 -; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Reload +; GFX11-NEXT: s_clause 0x1f ; GFX11-NEXT: scratch_load_b32 v121, off, s32 offset:128 ; GFX11-NEXT: scratch_load_b32 v120, off, s32 offset:132 ; GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:136 @@ -108839,7 +108910,7 @@ define inreg <16 x i64> @bitcast_v64f16_to_v16i64_scalar(<64 x half> inreg %a, i ; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:244 ; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:248 ; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:252 -; GFX11-NEXT: s_clause 0x9 ; 40-byte Folded Reload +; GFX11-NEXT: s_clause 0x9 ; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:256 ; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:260 ; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:264 @@ -108897,10 +108968,6 @@ define <64 x i16> @bitcast_v16i64_to_v64i16(<16 x i64> %a, i32 %b) { ; SI-LABEL: bitcast_v16i64_to_v64i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 -; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill @@ -108917,6 +108984,10 @@ define <64 x i16> @bitcast_v16i64_to_v64i16(<16 x i64> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 +; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: $vgpr63 @@ -108948,13 +109019,14 @@ define <64 x i16> 
@bitcast_v16i64_to_v64i16(<16 x i64> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; kill: killed $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB68_2 ; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_alignbit_b32 v33, v31, v32, 16 ; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: v_alignbit_b32 v34, v30, v29, 16 @@ -109027,6 +109099,7 @@ define <64 x i16> @bitcast_v16i64_to_v64i16(<16 x i64> %a, i32 %b) { ; SI-NEXT: v_addc_u32_e32 v28, vcc, 0, v28, vcc ; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 ; SI-NEXT: v_addc_u32_e32 v30, vcc, 0, v30, vcc +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v32, vcc, 3, v32 ; SI-NEXT: v_addc_u32_e32 v31, vcc, 0, v31, vcc ; SI-NEXT: v_alignbit_b32 v33, v31, v32, 16 @@ -109223,22 +109296,6 @@ define <64 x i16> @bitcast_v16i64_to_v64i16(<16 x i64> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v53 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v27 @@ -109265,7 +109322,7 @@ define <64 x i16> @bitcast_v16i64_to_v64i16(<16 x i64> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v32 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 @@ -109278,6 +109335,22 @@ define <64 x i16> @bitcast_v16i64_to_v64i16(<16 x i64> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword 
v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -110247,13 +110320,7 @@ define <16 x i64> @bitcast_v64i16_to_v16i64(<64 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44 ; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v33 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:84 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 -; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:36 @@ -110285,6 +110352,12 @@ define <16 x i64> @bitcast_v64i16_to_v16i64(<64 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:100 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -110300,6 +110373,7 @@ define <16 x i64> @bitcast_v64i16_to_v16i64(<64 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_and_b32_e32 v22, 0xffff, v41 ; 
SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) @@ -110516,6 +110590,7 @@ define <16 x i64> @bitcast_v64i16_to_v16i64(<64 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v41 ; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload @@ -111689,7 +111764,7 @@ define inreg <16 x i64> @bitcast_v64i16_to_v16i64_scalar(<64 x i16> inreg %a, i3 ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 -; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Spill +; GFX11-NEXT: s_clause 0x1f ; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:292 ; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:288 ; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:284 @@ -111722,7 +111797,7 @@ define inreg <16 x i64> @bitcast_v64i16_to_v16i64_scalar(<64 x i16> inreg %a, i3 ; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:176 ; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:172 ; GFX11-NEXT: scratch_store_b32 off, v95, s32 offset:168 -; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Spill +; GFX11-NEXT: s_clause 0x1f ; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:164 ; GFX11-NEXT: scratch_store_b32 off, v105, s32 offset:160 ; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:156 @@ -111755,7 +111830,7 @@ define inreg <16 x i64> @bitcast_v64i16_to_v16i64_scalar(<64 x i16> inreg %a, i3 ; GFX11-NEXT: scratch_store_b32 off, v157, s32 offset:48 ; GFX11-NEXT: scratch_store_b32 off, v158, s32 offset:44 ; GFX11-NEXT: scratch_store_b32 off, v159, s32 offset:40 -; GFX11-NEXT: s_clause 0x9 ; 40-byte Folded Spill +; GFX11-NEXT: s_clause 0x9 ; GFX11-NEXT: scratch_store_b32 off, v168, s32 offset:36 ; GFX11-NEXT: scratch_store_b32 off, v169, s32 offset:32 ; GFX11-NEXT: scratch_store_b32 off, v170, s32 offset:28 @@ -111833,7 +111908,7 @@ define inreg <16 x i64> @bitcast_v64i16_to_v16i64_scalar(<64 x i16> inreg %a, i3 ; GFX11-NEXT: v_dual_mov_b32 v19, v174 :: v_dual_mov_b32 v20, v173 ; GFX11-NEXT: v_dual_mov_b32 v21, v172 :: v_dual_mov_b32 v22, v171 ; GFX11-NEXT: v_dual_mov_b32 v23, v170 :: v_dual_mov_b32 v24, v183 -; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Reload +; GFX11-NEXT: s_clause 0x1f ; GFX11-NEXT: scratch_load_b32 v185, off, s32 ; GFX11-NEXT: scratch_load_b32 v184, off, s32 offset:4 ; GFX11-NEXT: scratch_load_b32 v175, off, s32 offset:8 @@ -111866,7 +111941,7 @@ define inreg <16 x i64> @bitcast_v64i16_to_v16i64_scalar(<64 x i16> inreg %a, i3 ; GFX11-NEXT: scratch_load_b32 v124, off, s32 offset:116 ; GFX11-NEXT: scratch_load_b32 v123, off, s32 offset:120 ; GFX11-NEXT: scratch_load_b32 v122, off, s32 offset:124 -; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Reload +; GFX11-NEXT: s_clause 0x1f ; GFX11-NEXT: scratch_load_b32 v121, off, s32 offset:128 ; GFX11-NEXT: scratch_load_b32 v120, off, s32 offset:132 ; GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:136 @@ -111899,7 +111974,7 @@ define inreg <16 x i64> @bitcast_v64i16_to_v16i64_scalar(<64 x i16> inreg %a, i3 ; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:244 ; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:248 ; GFX11-NEXT: scratch_load_b32 v58, off, s32 
offset:252 -; GFX11-NEXT: s_clause 0x9 ; 40-byte Folded Reload +; GFX11-NEXT: s_clause 0x9 ; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:256 ; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:260 ; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:264 @@ -111957,6 +112032,22 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; SI-LABEL: bitcast_v16f64_to_v128i8: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 @@ -112089,22 +112180,6 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; kill: killed $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded 
Spill ; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr42 @@ -112136,13 +112211,14 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; kill: killed $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB72_2 ; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_alignbit_b32 v33, v32, v31, 24 ; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -112373,6 +112449,7 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB72_4 ; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_f64 v[31:32], v[31:32], 1.0 ; SI-NEXT: v_add_f64 v[29:30], v[29:30], 1.0 ; SI-NEXT: v_alignbit_b32 v33, v32, v31, 24 @@ -112750,24 +112827,8 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v2, 0xff, v11 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload @@ -113145,12 +113206,44 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 
4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v16f64_to_v128i8: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 @@ -113253,22 +113346,6 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr39 ; VI-NEXT: ; kill: killed $vgpr39 ; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; 
VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; VI-NEXT: ; implicit-def: $vgpr57 ; VI-NEXT: ; kill: killed $vgpr39 ; VI-NEXT: ; implicit-def: $vgpr39 @@ -113371,132 +113448,132 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v28 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v28 -; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[31:32] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v27 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v27 -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v26 -; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[29:30] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v26 -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v25 -; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[27:28] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v25 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v24 -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v24 -; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[25:26] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v24 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v23 -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 
; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v23 -; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[23:24] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v22 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v22 -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v22 -; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[21:22] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v21 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v21 -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v20 -; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[19:20] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v20 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v20 -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v19 -; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[17:18] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v19 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v18 -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v18 -; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[15:16] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v18 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v17 -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v17 -; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[13:14] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v16 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v16 -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; VI-NEXT: 
buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v16 -; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[11:12] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v15 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v15 -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v14 -; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[9:10] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v14 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v14 -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v13 -; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[7:8] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v13 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v12 -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v12 -; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[5:6] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v12 -; VI-NEXT: v_lshrrev_b64 v[40:41], 24, v[3:4] +; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[31:32] ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v11 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[29:30] +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[27:28] +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[25:26] +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[23:24] +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[21:22] +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 
offset:116 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[19:20] +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[17:18] +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[15:16] +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[13:14] +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[11:12] +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[9:10] +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[7:8] +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[5:6] +; VI-NEXT: v_lshrrev_b64 v[40:41], 24, v[3:4] ; VI-NEXT: v_lshrrev_b64 v[41:42], 24, v[1:2] ; VI-NEXT: v_lshrrev_b32_e32 v56, 24, v28 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v57, 8, v11 ; VI-NEXT: v_lshrrev_b32_e32 v49, 24, v10 ; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v10 @@ -113932,10 +114009,9 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: v_or_b32_sdwa v1, v21, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -114075,7 +114151,17 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x78, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v42 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7c, v0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD 
+; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload @@ -114092,26 +114178,12 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7c, v0 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; VI-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: bitcast_v16f64_to_v128i8: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill @@ -114128,6 +114200,9 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; GFX9-NEXT: ; implicit-def: $vgpr41 ; GFX9-NEXT: ; kill: killed $vgpr41 ; GFX9-NEXT: ; implicit-def: $vgpr41 @@ -114260,6 +114335,7 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr48 ; GFX9-NEXT: ; kill: killed $vgpr41 ; GFX9-NEXT: ; implicit-def: $vgpr41 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill @@ -114319,7 +114395,7 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 
offset:80 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(47) +; GFX9-NEXT: s_waitcnt vmcnt(31) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 ; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -114332,7 +114408,7 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v32 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(49) +; GFX9-NEXT: s_waitcnt vmcnt(33) ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v31 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v31 @@ -114340,151 +114416,152 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v30 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v30 -; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[31:32] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v30 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v29 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v29 -; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[29:30] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v28 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v28 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v28 -; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[27:28] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v27 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v27 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v26 -; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[25:26] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v26 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 
4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v25 -; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[23:24] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v25 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v24 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v24 -; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[21:22] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v24 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v23 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v23 -; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[19:20] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v22 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v22 -; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[17:18] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v21 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v21 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v20 -; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[15:16] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v20 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v20 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v19 -; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[13:14] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v19 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; GFX9-NEXT: 
v_lshrrev_b32_e32 v33, 24, v18 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v18 -; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[11:12] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v18 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v17 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v17 -; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[9:10] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v16 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v16 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v15 -; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[7:8] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v14 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v14 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v13 -; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[5:6] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v12 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v12 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v11 -; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[3:4] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v10 +; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[31:32] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v10 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:80 ; 4-byte 
Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[29:30] +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[27:28] +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[25:26] +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[23:24] +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[21:22] +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[19:20] +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[17:18] +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[15:16] +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[13:14] +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[11:12] +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[9:10] +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[7:8] +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[5:6] +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[3:4] ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill @@ -114494,7 +114571,6 @@ define <128 x i8> 
@bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v13 ; GFX9-NEXT: v_lshrrev_b32_e32 v61, 8, v12 ; GFX9-NEXT: v_lshrrev_b32_e32 v53, 8, v11 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v55, 8, v10 ; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v9 ; GFX9-NEXT: v_lshrrev_b32_e32 v63, 8, v9 @@ -114523,7 +114599,7 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB72_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true -; GFX9-NEXT: s_waitcnt vmcnt(46) +; GFX9-NEXT: s_waitcnt vmcnt(30) ; GFX9-NEXT: v_add_f64 v[31:32], v[31:32], 1.0 ; GFX9-NEXT: v_add_f64 v[29:30], v[29:30], 1.0 ; GFX9-NEXT: v_add_f64 v[27:28], v[27:28], 1.0 @@ -114828,24 +114904,8 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:32 ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(17) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -115110,6 +115170,22 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:124 +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; 
GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -115552,11 +115628,7 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; GFX11-FAKE16-LABEL: bitcast_v16f64_to_v128i8: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_clause 0x2 -; GFX11-FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:8 -; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 -; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32 -; GFX11-FAKE16-NEXT: s_clause 0x13 ; 80-byte Folded Spill +; GFX11-FAKE16-NEXT: s_clause 0x13 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:88 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:84 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:80 @@ -115577,6 +115649,10 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:20 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:16 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:12 +; GFX11-FAKE16-NEXT: s_clause 0x2 +; GFX11-FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr75 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr74 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 @@ -116196,7 +116272,7 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:80 ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:96 ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:112 -; GFX11-FAKE16-NEXT: s_clause 0x13 ; 80-byte Folded Reload +; GFX11-FAKE16-NEXT: s_clause 0x13 ; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:12 ; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:16 ; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:20 @@ -116980,11 +117056,6 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; SI-NEXT: v_mov_b32_e32 v33, s4 ; SI-NEXT: v_readlane_b32 s4, v61, 39 ; SI-NEXT: v_mov_b32_e32 v30, s4 -; SI-NEXT: v_mov_b32_e32 v29, s46 -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v29, s98 ; SI-NEXT: 
v_readlane_b32 s4, v61, 40 ; SI-NEXT: v_mov_b32_e32 v34, s4 ; SI-NEXT: v_readlane_b32 s4, v61, 41 @@ -117077,10 +117148,6 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v25, s4 -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v29, s96 ; SI-NEXT: v_readlane_b32 s4, v62, 0 ; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -117137,69 +117204,20 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v25, s4 -; SI-NEXT: v_readlane_b32 s4, v62, 14 -; SI-NEXT: v_mov_b32_e32 v60, s4 -; SI-NEXT: v_readlane_b32 s4, v62, 15 -; SI-NEXT: v_mov_b32_e32 v31, s4 -; SI-NEXT: v_readlane_b32 s4, v62, 16 -; SI-NEXT: v_mov_b32_e32 v32, s4 -; SI-NEXT: v_readlane_b32 s4, v62, 17 -; SI-NEXT: v_mov_b32_e32 v18, s5 -; SI-NEXT: v_mov_b32_e32 v46, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 0 -; SI-NEXT: v_readlane_b32 s5, v61, 1 -; SI-NEXT: v_mov_b32_e32 v59, s17 -; SI-NEXT: v_mov_b32_e32 v58, s16 -; SI-NEXT: v_mov_b32_e32 v45, s19 -; SI-NEXT: v_mov_b32_e32 v44, s18 -; SI-NEXT: v_mov_b32_e32 v53, s21 -; SI-NEXT: v_mov_b32_e32 v52, s20 -; SI-NEXT: v_mov_b32_e32 v39, s23 -; SI-NEXT: v_mov_b32_e32 v38, s22 -; SI-NEXT: v_mov_b32_e32 v24, s25 -; SI-NEXT: v_mov_b32_e32 v23, s24 -; SI-NEXT: v_mov_b32_e32 v22, s27 -; SI-NEXT: v_mov_b32_e32 v21, s26 -; SI-NEXT: v_mov_b32_e32 v20, s29 -; SI-NEXT: v_mov_b32_e32 v19, s28 -; SI-NEXT: v_mov_b32_e32 v16, s7 -; SI-NEXT: v_mov_b32_e32 v15, s6 -; SI-NEXT: v_mov_b32_e32 v14, s9 +; SI-NEXT: v_mov_b32_e32 v29, s46 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v29, s98 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v29, s96 ; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v29, s86 -; SI-NEXT: v_mov_b32_e32 v13, s8 -; SI-NEXT: v_mov_b32_e32 v12, s11 -; SI-NEXT: v_mov_b32_e32 v11, s10 -; SI-NEXT: v_mov_b32_e32 v10, s13 -; SI-NEXT: v_mov_b32_e32 v9, s12 -; SI-NEXT: v_mov_b32_e32 v8, s15 -; SI-NEXT: v_mov_b32_e32 v7, s14 -; SI-NEXT: v_mov_b32_e32 v6, s41 -; SI-NEXT: v_mov_b32_e32 v5, s40 -; SI-NEXT: v_mov_b32_e32 v4, s43 -; SI-NEXT: v_mov_b32_e32 v3, s42 -; SI-NEXT: v_mov_b32_e32 v2, s45 -; SI-NEXT: v_mov_b32_e32 v1, s44 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v28, s38 -; SI-NEXT: v_mov_b32_e32 v27, s36 -; SI-NEXT: v_mov_b32_e32 v26, s34 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v25, s30 -; SI-NEXT: 
v_mov_b32_e32 v56, s94 -; SI-NEXT: v_mov_b32_e32 v55, s92 -; SI-NEXT: v_mov_b32_e32 v54, s90 -; SI-NEXT: v_mov_b32_e32 v42, s88 -; SI-NEXT: v_mov_b32_e32 v41, s78 -; SI-NEXT: v_mov_b32_e32 v40, s76 -; SI-NEXT: v_mov_b32_e32 v50, s74 -; SI-NEXT: v_mov_b32_e32 v49, s72 -; SI-NEXT: v_mov_b32_e32 v48, s62 -; SI-NEXT: v_mov_b32_e32 v47, s60 -; SI-NEXT: v_mov_b32_e32 v36, s58 -; SI-NEXT: v_mov_b32_e32 v35, s56 ; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) @@ -117242,108 +117260,165 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; SI-NEXT: v_mov_b32_e32 v29, s50 ; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: v_readlane_b32 s4, v62, 14 +; SI-NEXT: v_mov_b32_e32 v60, s4 +; SI-NEXT: v_readlane_b32 s4, v62, 15 +; SI-NEXT: v_mov_b32_e32 v31, s4 +; SI-NEXT: v_readlane_b32 s4, v62, 16 +; SI-NEXT: v_mov_b32_e32 v32, s4 +; SI-NEXT: v_readlane_b32 s4, v62, 17 +; SI-NEXT: v_mov_b32_e32 v18, s5 +; SI-NEXT: v_mov_b32_e32 v46, s4 +; SI-NEXT: v_readlane_b32 s4, v61, 0 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v29, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 2 -; SI-NEXT: v_readlane_b32 s5, v61, 3 ; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: v_readlane_b32 s5, v61, 1 +; SI-NEXT: v_readlane_b32 s4, v61, 2 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v29, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 4 -; SI-NEXT: v_readlane_b32 s5, v61, 5 ; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: v_readlane_b32 s5, v61, 3 +; SI-NEXT: v_readlane_b32 s4, v61, 4 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v29, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 6 -; SI-NEXT: v_readlane_b32 s5, v61, 7 ; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: v_readlane_b32 s5, v61, 5 +; SI-NEXT: v_readlane_b32 s4, v61, 6 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v29, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 8 -; SI-NEXT: v_readlane_b32 s5, v61, 9 ; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: v_readlane_b32 s5, v61, 7 +; SI-NEXT: v_readlane_b32 s4, v61, 8 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v29, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 10 -; SI-NEXT: v_readlane_b32 s5, v61, 11 ; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: v_readlane_b32 s5, v61, 9 +; SI-NEXT: v_readlane_b32 s4, v61, 10 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v29, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 12 -; SI-NEXT: v_readlane_b32 s5, v61, 13 ; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: 
v_readlane_b32 s5, v61, 11 +; SI-NEXT: v_readlane_b32 s4, v61, 12 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v29, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 14 -; SI-NEXT: v_readlane_b32 s5, v61, 15 ; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: v_readlane_b32 s5, v61, 13 +; SI-NEXT: v_readlane_b32 s4, v61, 14 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v29, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 16 -; SI-NEXT: v_readlane_b32 s5, v61, 17 ; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: v_readlane_b32 s5, v61, 15 +; SI-NEXT: v_readlane_b32 s4, v61, 16 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v29, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 18 -; SI-NEXT: v_readlane_b32 s5, v61, 19 ; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: v_readlane_b32 s5, v61, 17 +; SI-NEXT: v_readlane_b32 s4, v61, 18 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v29, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 20 -; SI-NEXT: v_readlane_b32 s5, v61, 21 ; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: v_readlane_b32 s5, v61, 19 +; SI-NEXT: v_readlane_b32 s4, v61, 20 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v29, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 22 -; SI-NEXT: v_readlane_b32 s5, v61, 23 ; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: v_readlane_b32 s5, v61, 21 +; SI-NEXT: v_readlane_b32 s4, v61, 22 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v29, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 24 -; SI-NEXT: v_readlane_b32 s5, v61, 25 ; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: v_readlane_b32 s5, v61, 23 +; SI-NEXT: v_readlane_b32 s4, v61, 24 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v29, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 26 -; SI-NEXT: v_readlane_b32 s5, v61, 27 ; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: v_readlane_b32 s5, v61, 25 +; SI-NEXT: v_readlane_b32 s4, v61, 26 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v29, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 28 -; SI-NEXT: v_readlane_b32 s5, v61, 29 ; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: v_readlane_b32 s5, v61, 27 +; SI-NEXT: v_readlane_b32 s4, v61, 28 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v29, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 30 -; SI-NEXT: v_readlane_b32 s5, v61, 31 ; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: v_readlane_b32 s5, v61, 29 +; SI-NEXT: 
v_readlane_b32 s4, v61, 30 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v29, s4 -; SI-NEXT: v_readlane_b32 s4, v61, 32 -; SI-NEXT: v_readlane_b32 s5, v61, 33 ; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v29, s48 ; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: v_readlane_b32 s5, v61, 31 +; SI-NEXT: v_readlane_b32 s4, v61, 32 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v29, s4 +; SI-NEXT: v_mov_b32_e32 v59, s17 +; SI-NEXT: v_mov_b32_e32 v58, s16 +; SI-NEXT: v_mov_b32_e32 v45, s19 +; SI-NEXT: v_mov_b32_e32 v44, s18 +; SI-NEXT: v_mov_b32_e32 v53, s21 +; SI-NEXT: v_mov_b32_e32 v52, s20 +; SI-NEXT: v_mov_b32_e32 v39, s23 +; SI-NEXT: v_mov_b32_e32 v38, s22 +; SI-NEXT: v_mov_b32_e32 v24, s25 +; SI-NEXT: v_mov_b32_e32 v23, s24 +; SI-NEXT: v_mov_b32_e32 v22, s27 +; SI-NEXT: v_mov_b32_e32 v21, s26 +; SI-NEXT: v_mov_b32_e32 v20, s29 +; SI-NEXT: v_mov_b32_e32 v19, s28 +; SI-NEXT: v_mov_b32_e32 v16, s7 +; SI-NEXT: v_mov_b32_e32 v15, s6 +; SI-NEXT: v_mov_b32_e32 v14, s9 +; SI-NEXT: v_mov_b32_e32 v13, s8 +; SI-NEXT: v_mov_b32_e32 v12, s11 +; SI-NEXT: v_mov_b32_e32 v11, s10 +; SI-NEXT: v_mov_b32_e32 v10, s13 +; SI-NEXT: v_mov_b32_e32 v9, s12 +; SI-NEXT: v_mov_b32_e32 v8, s15 +; SI-NEXT: v_mov_b32_e32 v7, s14 +; SI-NEXT: v_mov_b32_e32 v6, s41 +; SI-NEXT: v_mov_b32_e32 v5, s40 +; SI-NEXT: v_mov_b32_e32 v4, s43 +; SI-NEXT: v_mov_b32_e32 v3, s42 +; SI-NEXT: v_mov_b32_e32 v2, s45 +; SI-NEXT: v_mov_b32_e32 v1, s44 +; SI-NEXT: v_mov_b32_e32 v28, s38 +; SI-NEXT: v_mov_b32_e32 v27, s36 +; SI-NEXT: v_mov_b32_e32 v26, s34 +; SI-NEXT: v_mov_b32_e32 v25, s30 +; SI-NEXT: v_mov_b32_e32 v56, s94 +; SI-NEXT: v_mov_b32_e32 v55, s92 +; SI-NEXT: v_mov_b32_e32 v54, s90 +; SI-NEXT: v_mov_b32_e32 v42, s88 +; SI-NEXT: v_mov_b32_e32 v41, s78 +; SI-NEXT: v_mov_b32_e32 v40, s76 +; SI-NEXT: v_mov_b32_e32 v50, s74 +; SI-NEXT: v_mov_b32_e32 v49, s72 +; SI-NEXT: v_mov_b32_e32 v48, s62 +; SI-NEXT: v_mov_b32_e32 v47, s60 +; SI-NEXT: v_mov_b32_e32 v36, s58 +; SI-NEXT: v_mov_b32_e32 v35, s56 +; SI-NEXT: v_readlane_b32 s5, v61, 33 ; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: .LBB73_5: ; %end @@ -117636,9 +117711,9 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; SI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 ; SI-NEXT: v_or_b32_e32 v17, v17, v18 ; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v19 -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v18, 0xff, v18 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 @@ -117927,6 +118002,15 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v2 ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v43 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v51 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v37 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; 
SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -117940,15 +118024,6 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v51 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v37 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload @@ -118615,10 +118690,6 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; VI-NEXT: v_mov_b32_e32 v35, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 11 ; VI-NEXT: v_mov_b32_e32 v41, s4 -; VI-NEXT: v_mov_b32_e32 v40, s48 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v40, s38 ; VI-NEXT: v_readlane_b32 s4, v62, 12 ; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v35, s4 @@ -118656,9 +118727,6 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; VI-NEXT: v_mov_b32_e32 v35, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 25 ; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v40, s36 ; VI-NEXT: v_mov_b32_e32 v35, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 26 ; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill @@ -118696,9 +118764,6 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; VI-NEXT: v_readlane_b32 s4, v62, 37 ; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v35, s4 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v40, s34 ; VI-NEXT: v_readlane_b32 s4, v62, 38 ; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v35, s4 @@ -118714,6 +118779,52 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; VI-NEXT: v_readlane_b32 s4, v62, 42 ; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v35, s4 +; VI-NEXT: v_mov_b32_e32 v40, s48 +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:392 ; 4-byte Folded 
Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v40, s38 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v40, s36 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v40, s34 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v40, s30 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v40, s90 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v40, s88 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v40, s78 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v40, s76 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v40, s74 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v40, s72 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v40, s62 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v40, s60 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v40, s58 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v40, s56 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; VI-NEXT: v_readlane_b32 s4, v62, 43 ; VI-NEXT: v_mov_b32_e32 v53, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 44 @@ -118723,7 +118834,6 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; VI-NEXT: v_readlane_b32 s4, v62, 46 ; VI-NEXT: v_mov_b32_e32 v58, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 47 -; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v35, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 48 ; 
VI-NEXT: v_mov_b32_e32 v54, s4 @@ -118736,17 +118846,17 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; VI-NEXT: v_readlane_b32 s4, v62, 52 ; VI-NEXT: v_mov_b32_e32 v39, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 53 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v40, s30 ; VI-NEXT: v_mov_b32_e32 v49, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 54 ; VI-NEXT: v_mov_b32_e32 v61, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 55 ; VI-NEXT: v_mov_b32_e32 v36, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 56 +; VI-NEXT: v_mov_b32_e32 v40, s46 ; VI-NEXT: v_mov_b32_e32 v55, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 57 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v12, s5 ; VI-NEXT: v_mov_b32_e32 v1, s44 ; VI-NEXT: v_mov_b32_e32 v2, s45 @@ -118776,48 +118886,13 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; VI-NEXT: v_mov_b32_e32 v28, s21 ; VI-NEXT: v_mov_b32_e32 v29, s18 ; VI-NEXT: v_mov_b32_e32 v30, s19 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v40, s90 ; VI-NEXT: v_mov_b32_e32 v31, s16 ; VI-NEXT: v_mov_b32_e32 v32, s17 ; VI-NEXT: v_mov_b32_e32 v42, s70 ; VI-NEXT: v_mov_b32_e32 v50, s4 +; VI-NEXT: v_mov_b32_e32 v40, v43 ; VI-NEXT: v_mov_b32_e32 v46, v38 ; VI-NEXT: v_mov_b32_e32 v38, v34 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v40, s88 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v40, s78 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v40, s76 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v40, s74 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v40, s72 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v40, s62 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v40, s60 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v40, s58 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v40, 
s56 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v40, s46 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v40, v43 ; VI-NEXT: .LBB73_5: ; %end ; VI-NEXT: v_lshlrev_b32_e32 v34, 8, v42 ; VI-NEXT: v_or_b32_sdwa v31, v31, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -119216,7 +119291,10 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v2, v41, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -119231,9 +119309,6 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; VI-NEXT: s_or_saveexec_b64 s[4:5], -1 ; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload @@ -119831,12 +119906,6 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; GFX9-NEXT: ; implicit-def: $sgpr46 ; GFX9-NEXT: s_branch .LBB73_2 ; GFX9-NEXT: .LBB73_4: -; GFX9-NEXT: v_mov_b32_e32 v41, s66 -; GFX9-NEXT: v_mov_b32_e32 v40, s36 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v40, s34 ; GFX9-NEXT: v_mov_b32_e32 v15, s81 ; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v15, s71 @@ -119913,10 +119982,6 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v15, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 9 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v40, s30 ; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v15, s4 ; GFX9-NEXT: 
v_readlane_b32 s4, v62, 10 @@ -119975,10 +120040,71 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; GFX9-NEXT: v_readlane_b32 s4, v62, 28 ; GFX9-NEXT: v_mov_b32_e32 v29, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 29 +; GFX9-NEXT: v_mov_b32_e32 v41, s66 ; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v15, s4 -; GFX9-NEXT: v_readlane_b32 s4, v62, 30 +; GFX9-NEXT: v_mov_b32_e32 v40, s36 ; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s34 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s30 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s94 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s92 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s90 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s88 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s78 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s76 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s74 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s72 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s62 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s60 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], 
s32 offset:144 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s58 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, s56 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-NEXT: v_readlane_b32 s4, v62, 30 ; GFX9-NEXT: v_mov_b32_e32 v15, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 31 ; GFX9-NEXT: v_mov_b32_e32 v44, s4 @@ -119993,10 +120119,6 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; GFX9-NEXT: v_readlane_b32 s4, v62, 36 ; GFX9-NEXT: v_mov_b32_e32 v55, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 37 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v40, s94 ; GFX9-NEXT: v_mov_b32_e32 v61, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 38 ; GFX9-NEXT: v_mov_b32_e32 v42, s4 @@ -120021,6 +120143,7 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; GFX9-NEXT: v_readlane_b32 s4, v62, 48 ; GFX9-NEXT: v_mov_b32_e32 v60, s4 ; GFX9-NEXT: v_readlane_b32 s4, v62, 49 +; GFX9-NEXT: v_mov_b32_e32 v40, s46 ; GFX9-NEXT: v_mov_b32_e32 v12, s5 ; GFX9-NEXT: v_mov_b32_e32 v1, s44 ; GFX9-NEXT: v_mov_b32_e32 v2, s45 @@ -120058,54 +120181,6 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; GFX9-NEXT: v_mov_b32_e32 v54, s64 ; GFX9-NEXT: v_mov_b32_e32 v52, s54 ; GFX9-NEXT: v_mov_b32_e32 v25, s4 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v40, s92 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v40, s90 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v40, s88 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v40, s78 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v40, s76 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v40, s74 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v40, s72 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GFX9-NEXT: 
s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v40, s62 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v40, s60 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v40, s58 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v40, s56 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v40, s46 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill @@ -120127,8 +120202,6 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; GFX9-NEXT: v_or_b32_sdwa v25, v51, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b32_e32 v51, 8, v45 ; GFX9-NEXT: v_or_b32_sdwa v48, v48, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload ; GFX9-NEXT: v_lshlrev_b32_e32 v40, 8, v56 ; GFX9-NEXT: v_or_b32_sdwa v50, v50, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b32_e32 v22, 8, v22 @@ -120179,45 +120252,46 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; GFX9-NEXT: v_readlane_b32 s31, v63, 1 ; GFX9-NEXT: v_readlane_b32 s30, v63, 0 ; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b32_e32 v36, 8, v36 ; GFX9-NEXT: v_or_b32_sdwa v27, v27, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b32_e32 v36, 8, v44 ; GFX9-NEXT: v_or_b32_sdwa v28, v28, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_lshlrev_b32_e32 v29, 8, v29 ; GFX9-NEXT: v_or_b32_sdwa v29, v19, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v30 ; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload ; GFX9-NEXT: v_or_b32_sdwa v20, v20, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v51 -; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b32_e32 v36, 8, v36 ; GFX9-NEXT: v_or_b32_sdwa v23, v23, 
v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v19, v30, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v19, v50, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen ; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v59 ; GFX9-NEXT: v_or_b32_sdwa v19, v60, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v19, v25, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:4 +; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v49 ; GFX9-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:12 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v19, v25, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v19, v48, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:8 ; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v48 -; GFX9-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:8 ; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v17, v19, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -120231,11 +120305,9 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_lshlrev_b32_e32 v16, 8, v16 ; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 8, v16 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v16, v35, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:24 @@ -120247,23 +120319,9 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: 
buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(16) +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b32_e32 v16, 8, v16 -; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v16, v33, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:32 @@ -120285,7 +120343,10 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; GFX9-NEXT: v_or_b32_sdwa v16, v53, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v16, v32, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:44 +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b32_e32 v16, 8, v18 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -120513,6 +120574,20 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:124 +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, 
off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload @@ -120524,7 +120599,7 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_or_saveexec_b32 s4, -1 -; GFX11-NEXT: s_clause 0x3 ; 16-byte Folded Spill +; GFX11-NEXT: s_clause 0x3 ; GFX11-NEXT: scratch_store_b32 off, v76, s32 offset:80 ; GFX11-NEXT: scratch_store_b32 off, v77, s32 offset:84 ; GFX11-NEXT: scratch_store_b32 off, v78, s32 offset:88 @@ -120559,7 +120634,7 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; GFX11-NEXT: v_writelane_b32 v77, s101, 5 ; GFX11-NEXT: s_mov_b32 vcc_hi, 0 ; GFX11-NEXT: s_and_b32 s42, vcc_lo, exec_lo -; GFX11-NEXT: s_clause 0x13 ; 80-byte Folded Spill +; GFX11-NEXT: s_clause 0x13 ; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:76 ; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:72 ; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:68 @@ -121467,7 +121542,7 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; GFX11-NEXT: scratch_store_b128 v0, v[11:14], off offset:80 ; GFX11-NEXT: scratch_store_b128 v0, v[7:10], off offset:96 ; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:112 -; GFX11-NEXT: s_clause 0x13 ; 80-byte Folded Reload +; GFX11-NEXT: s_clause 0x13 ; GFX11-NEXT: scratch_load_b32 v75, off, s32 ; GFX11-NEXT: scratch_load_b32 v74, off, s32 offset:4 ; GFX11-NEXT: scratch_load_b32 v73, off, s32 offset:8 @@ -121530,7 +121605,7 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; GFX11-NEXT: v_readlane_b32 s31, v76, 1 ; GFX11-NEXT: v_readlane_b32 s30, v76, 0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: s_clause 0x3 ; 16-byte Folded Reload +; GFX11-NEXT: s_clause 0x3 ; GFX11-NEXT: scratch_load_b32 v76, off, s32 offset:80 ; GFX11-NEXT: scratch_load_b32 v77, off, s32 offset:84 ; GFX11-NEXT: scratch_load_b32 v78, off, s32 offset:88 @@ -121743,26 +121818,13 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:188 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:108 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:84 -; 
SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:52 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:196 @@ -121923,30 +121985,44 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:356 ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v2 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:384 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:380 ; SI-NEXT: v_lshlrev_b32_e32 v43, 8, v3 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:364 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:372 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:384 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:380 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:52 ; SI-NEXT: ; implicit-def: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB74_2 ; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload @@ -121955,11 +122031,11 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_and_b32_e32 v9, 0xff, v49 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload @@ -122562,6 +122638,7 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB74_4 ; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload @@ -122575,8 +122652,8 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v49 ; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 @@ -123256,25 +123333,13 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:208 ; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:216 ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188 -; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44 -; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36 -; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28 -; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:108 -; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100 -; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92 -; 
VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84 -; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76 -; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68 -; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:60 -; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52 -; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196 ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 @@ -123411,20 +123476,34 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 -; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364 -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshlrev_b16_e32 v63, 8, v0 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:372 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v63, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:108 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92 +; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84 +; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76 +; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68 +; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -123910,6 
+123989,7 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(13) ; VI-NEXT: v_add_u16_e32 v9, 3, v61 ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload @@ -124487,27 +124567,13 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:208 ; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:216 ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188 -; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44 -; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36 -; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28 -; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:20 -; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:108 -; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100 -; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92 -; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84 -; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76 -; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68 -; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:60 -; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52 -; GFX9-NEXT: s_waitcnt vmcnt(18) +; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(18) +; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(16) +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 @@ -124649,20 +124715,34 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v63, 8, v0 +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364 -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:372 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v4, 
off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v63, 8, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:108 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100 +; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92 +; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84 +; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76 +; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68 +; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -125149,6 +125229,7 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(13) ; GFX9-NEXT: v_add_u16_e32 v9, 3, v61 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload @@ -126287,7 +126368,7 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v16f64: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill +; GFX11-FAKE16-NEXT: s_clause 0x1f ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:592 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:588 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:584 @@ -126320,7 +126401,7 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:476 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:472 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:468 -; GFX11-FAKE16-NEXT: s_clause 0x12 ; 76-byte Folded Spill +; GFX11-FAKE16-NEXT: s_clause 0x12 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:464 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:460 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:456 @@ -127261,7 +127342,7 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_or_b32_e32 v31, v35, v36 ; GFX11-FAKE16-NEXT: .LBB74_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, 
s0 -; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload +; GFX11-FAKE16-NEXT: s_clause 0x1f ; GFX11-FAKE16-NEXT: scratch_load_b32 v138, off, s32 offset:392 ; GFX11-FAKE16-NEXT: scratch_load_b32 v137, off, s32 offset:396 ; GFX11-FAKE16-NEXT: scratch_load_b32 v136, off, s32 offset:400 @@ -127294,7 +127375,7 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:508 ; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:512 ; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:516 -; GFX11-FAKE16-NEXT: s_clause 0x12 ; 76-byte Folded Reload +; GFX11-FAKE16-NEXT: s_clause 0x12 ; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:520 ; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:524 ; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:528 @@ -128011,13 +128092,24 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; SI-NEXT: s_mov_b64 s[4:5], 0 ; SI-NEXT: s_branch .LBB75_3 ; SI-NEXT: .LBB75_2: +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload ; SI-NEXT: v_mov_b32_e32 v55, v56 ; SI-NEXT: v_mov_b32_e32 v42, v46 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(4) ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload @@ -128028,22 +128120,10 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload 
-; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 s[4:5], -1 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: .LBB75_3: ; %Flow -; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_mov_b32_e32 v35, v57 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload @@ -128053,6 +128133,7 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; SI-NEXT: ; %bb.4: ; %cmp.true ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v44 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 @@ -128647,13 +128728,13 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v19 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v21 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v3 ; VI-NEXT: v_lshlrev_b32_e32 v16, 8, v5 ; VI-NEXT: v_lshlrev_b32_e32 v47, 8, v7 ; VI-NEXT: v_lshlrev_b32_e32 v46, 8, v9 ; VI-NEXT: v_lshlrev_b32_e32 v10, 8, v11 ; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v13 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v17 ; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 @@ -128881,11 +128962,11 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_or_b32_sdwa v0, v2, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v4, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v2, v6, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload ; VI-NEXT: s_and_b32 s4, s28, 0xff ; VI-NEXT: s_lshl_b32 s5, s29, 8 ; VI-NEXT: s_or_b32 s4, s4, s5 @@ -128895,8 +128976,11 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; VI-NEXT: s_lshl_b32 s7, s23, 8 ; VI-NEXT: s_lshl_b32 s8, s27, 8 ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: 
v_or_b32_sdwa v0, v0, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v3, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v0, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v1, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload @@ -128904,8 +128988,6 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v3, v3, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -129114,6 +129196,12 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; VI-NEXT: s_mov_b64 s[4:5], 0 ; VI-NEXT: s_branch .LBB75_3 ; VI-NEXT: .LBB75_2: +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v44, v56 ; VI-NEXT: v_mov_b32_e32 v41, v33 ; VI-NEXT: v_mov_b32_e32 v50, v40 @@ -129131,12 +129219,6 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v54, v53 ; VI-NEXT: v_mov_b32_e32 v52, v36 ; VI-NEXT: v_mov_b32_e32 v49, v51 @@ -129146,7 +129228,7 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; VI-NEXT: v_mov_b32_e32 v51, v41 ; VI-NEXT: v_mov_b32_e32 v36, v44 ; VI-NEXT: v_mov_b32_e32 v53, v54 -; VI-NEXT: s_waitcnt 
vmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_mov_b32_e32 v54, v60 ; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload @@ -129159,6 +129241,7 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; VI-NEXT: ; %bb.4: ; %cmp.true ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v37 ; VI-NEXT: s_add_i32 s28, s28, 3 ; VI-NEXT: s_and_b32 s4, s28, 0xff @@ -129743,8 +129826,8 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v5 -; GFX9-NEXT: v_lshlrev_b32_e32 v24, 8, v11 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v24, 8, v11 ; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v25, 8, v9 ; GFX9-NEXT: s_waitcnt vmcnt(5) @@ -129923,18 +130006,16 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: s_lshl_b32 s6, s19, 8 ; GFX9-NEXT: s_lshl_b32 s7, s23, 8 ; GFX9-NEXT: s_lshl_b32 s8, s27, 8 -; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload @@ -129961,8 +130042,9 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: s_waitcnt 
vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -129978,16 +130060,14 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; GFX9-NEXT: v_or_b32_sdwa v1, v13, v41 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v55, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v0, v21, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v21, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload @@ -129999,11 +130079,10 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_or_b32_sdwa v0, v16, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v0, v16, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_mov_b32_e32 v61, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -130016,12 +130095,10 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: v_or_b32_sdwa v17, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; 
GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_mov_b32_e32 v37, v0 ; GFX9-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v18, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload @@ -130035,22 +130112,17 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: v_or_b32_sdwa v19, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v53, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v50, v53 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v20, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload ; GFX9-NEXT: v_or_b32_sdwa v0, v32, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v52, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v21, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload ; GFX9-NEXT: v_or_b32_sdwa v1, v51, v59 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v22, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v58, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -130066,24 +130138,45 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: v_mov_b32_e32 v35, v62 ; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, 
s[0:3], s32 offset:588 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload ; GFX9-NEXT: v_or_b32_sdwa v24, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v63, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(11) ; GFX9-NEXT: v_or_b32_sdwa v0, v62, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(10) +; GFX9-NEXT: v_or_b32_sdwa v1, v63, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v25, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(9) ; GFX9-NEXT: v_or_b32_sdwa v0, v54, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(7) ; GFX9-NEXT: v_or_b32_sdwa v1, v52, v43 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v26, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v33, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_or_b32_sdwa v1, v32, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_or_b32_sdwa v0, v44, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_or_b32_sdwa v1, v50, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_or_b32_sdwa v0, v48, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v1, v55, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v49, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v39, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_mov_b32_e32 v40, v30 ; GFX9-NEXT: v_or_b32_sdwa v30, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -130134,6 +130227,18 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: s_branch .LBB75_3 ; GFX9-NEXT: .LBB75_2: +; GFX9-NEXT: v_mov_b32_e32 v38, v51 +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:620 ; 4-byte Folded 
Reload +; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; GFX9-NEXT: v_mov_b32_e32 v33, v43 ; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload @@ -130147,18 +130252,6 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; GFX9-NEXT: v_mov_b32_e32 v38, v51 -; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; GFX9-NEXT: v_mov_b32_e32 v35, v62 ; GFX9-NEXT: v_mov_b32_e32 v36, v31 ; GFX9-NEXT: v_mov_b32_e32 v40, v30 @@ -130596,7 +130689,7 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX11-TRUE16-LABEL: bitcast_v128i8_to_v16f64_scalar: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill +; GFX11-TRUE16-NEXT: s_clause 0x1f ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:476 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:472 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:468 @@ -130629,7 +130722,7 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:360 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:356 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:352 -; GFX11-TRUE16-NEXT: s_clause 0x7 ; 32-byte Folded Spill +; GFX11-TRUE16-NEXT: s_clause 0x7 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:348 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:344 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:340 @@ -131486,7 +131579,7 @@ 
define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-TRUE16-NEXT: .LBB75_3: ; %end -; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload +; GFX11-TRUE16-NEXT: s_clause 0x1f ; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:320 ; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:324 ; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:328 @@ -131519,7 +131612,7 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:436 ; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:440 ; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:444 -; GFX11-TRUE16-NEXT: s_clause 0x7 ; 32-byte Folded Reload +; GFX11-TRUE16-NEXT: s_clause 0x7 ; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:448 ; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:452 ; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:456 @@ -131537,7 +131630,7 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v16f64_scalar: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill +; GFX11-FAKE16-NEXT: s_clause 0x1f ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:476 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:472 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:468 @@ -131570,7 +131663,7 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:360 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:356 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:352 -; GFX11-FAKE16-NEXT: s_clause 0x7 ; 32-byte Folded Spill +; GFX11-FAKE16-NEXT: s_clause 0x7 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:348 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:344 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:340 @@ -132427,7 +132520,7 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-FAKE16-NEXT: .LBB75_3: ; %end -; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload +; GFX11-FAKE16-NEXT: s_clause 0x1f ; GFX11-FAKE16-NEXT: scratch_load_b32 v111, off, s32 offset:320 ; GFX11-FAKE16-NEXT: scratch_load_b32 v110, off, s32 offset:324 ; GFX11-FAKE16-NEXT: scratch_load_b32 v109, off, s32 offset:328 @@ -132460,7 +132553,7 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:436 ; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:440 ; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:444 -; GFX11-FAKE16-NEXT: s_clause 0x7 ; 32-byte Folded Reload +; GFX11-FAKE16-NEXT: s_clause 0x7 ; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:448 ; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:452 ; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:456 @@ -132495,6 +132588,22 @@ define <64 x bfloat> @bitcast_v16f64_to_v64bf16(<16 x double> %a, i32 %b) { ; SI-LABEL: bitcast_v16f64_to_v64bf16: ; SI: ; %bb.0: ; 
SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 @@ -132563,22 +132672,6 @@ define <64 x bfloat> @bitcast_v16f64_to_v64bf16(<16 x double> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; kill: killed $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr62 ; SI-NEXT: ; implicit-def: $vgpr63 @@ -132610,7 +132703,7 @@ define <64 x bfloat> @bitcast_v16f64_to_v64bf16(<16 x double> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; kill: killed $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 
0, v33 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -132620,7 +132713,7 @@ define <64 x bfloat> @bitcast_v16f64_to_v64bf16(<16 x double> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v35, 0xffff0000, v32 ; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 ; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0) ; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v31 ; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 ; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill @@ -132750,6 +132843,7 @@ define <64 x bfloat> @bitcast_v16f64_to_v64bf16(<16 x double> %a, i32 %b) { ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB76_4 ; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_f64 v[31:32], v[31:32], 1.0 ; SI-NEXT: v_add_f64 v[29:30], v[29:30], 1.0 ; SI-NEXT: v_and_b32_e32 v35, 0xffff0000, v32 @@ -132987,25 +133081,10 @@ define <64 x bfloat> @bitcast_v16f64_to_v64bf16(<16 x double> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 @@ -133184,6 +133263,22 @@ define <64 x bfloat> @bitcast_v16f64_to_v64bf16(<16 x double> %a, i32 %b) { ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, 
s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -133871,22 +133966,8 @@ define inreg <64 x bfloat> @bitcast_v16f64_to_v64bf16_scalar(<16 x double> inreg ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v43 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v40 -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v40 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -133974,6 +134055,20 @@ define inreg <64 x bfloat> @bitcast_v16f64_to_v64bf16_scalar(<16 x double> inreg ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; 
SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload @@ -135976,10 +136071,7 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-LABEL: bitcast_v64bf16_to_v16f64: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: s_clause 0x1 -; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 -; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 -; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Spill +; GFX11-TRUE16-NEXT: s_clause 0xf ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:68 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:64 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:60 @@ -135996,6 +136088,9 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:16 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:12 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:8 +; GFX11-TRUE16-NEXT: s_clause 0x1 +; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v32 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 @@ -136561,7 +136656,7 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v28, v60 :: v_dual_mov_b32 v29, v61 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v30, v62 :: v_dual_mov_b32 v31, v63 -; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Reload +; GFX11-TRUE16-NEXT: s_clause 0xf ; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:8 ; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:12 ; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:16 @@ -139028,7 +139123,7 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 -; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill +; GFX11-TRUE16-NEXT: s_clause 0x1f ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:280 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:276 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:272 @@ -139061,7 +139156,7 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:164 ; 
GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:160 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:156 -; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill +; GFX11-TRUE16-NEXT: s_clause 0x1f ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:152 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:148 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:144 @@ -139094,7 +139189,7 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v157, s32 offset:36 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v158, s32 offset:32 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v159, s32 offset:28 -; GFX11-TRUE16-NEXT: s_clause 0x6 ; 28-byte Folded Spill +; GFX11-TRUE16-NEXT: s_clause 0x6 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v168, s32 offset:24 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v169, s32 offset:20 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v170, s32 offset:16 @@ -139896,7 +139991,7 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v18, v172 :: v_dual_mov_b32 v21, v169 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v20, v170 :: v_dual_mov_b32 v23, v183 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v22, v168 :: v_dual_mov_b32 v25, v181 -; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload +; GFX11-TRUE16-NEXT: s_clause 0x1f ; GFX11-TRUE16-NEXT: scratch_load_b32 v174, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_b32 v173, off, s32 offset:4 ; GFX11-TRUE16-NEXT: scratch_load_b32 v172, off, s32 offset:8 @@ -139929,7 +140024,7 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:116 ; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:120 ; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:124 -; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload +; GFX11-TRUE16-NEXT: s_clause 0x1f ; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:128 ; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:132 ; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:136 @@ -139962,7 +140057,7 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:244 ; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:248 ; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:252 -; GFX11-TRUE16-NEXT: s_clause 0x6 ; 28-byte Folded Reload +; GFX11-TRUE16-NEXT: s_clause 0x6 ; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:256 ; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:260 ; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:264 @@ -140002,7 +140097,7 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 -; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill +; GFX11-FAKE16-NEXT: s_clause 0x1f ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:288 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:284 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:280 @@ -140035,7 +140130,7 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:172 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:168 ; 
GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:164 -; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill +; GFX11-FAKE16-NEXT: s_clause 0x1f ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:160 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:156 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:152 @@ -140068,7 +140163,7 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v157, s32 offset:44 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v158, s32 offset:40 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v159, s32 offset:36 -; GFX11-FAKE16-NEXT: s_clause 0x8 ; 36-byte Folded Spill +; GFX11-FAKE16-NEXT: s_clause 0x8 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v168, s32 offset:32 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v169, s32 offset:28 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v170, s32 offset:24 @@ -140760,7 +140855,7 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v20, v184 :: v_dual_mov_b32 v23, v174 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v22, v171 :: v_dual_mov_b32 v25, v169 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v26, v170 :: v_dual_mov_b32 v29, v180 -; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload +; GFX11-FAKE16-NEXT: s_clause 0x1f ; GFX11-FAKE16-NEXT: scratch_load_b32 v184, off, s32 ; GFX11-FAKE16-NEXT: scratch_load_b32 v175, off, s32 offset:4 ; GFX11-FAKE16-NEXT: scratch_load_b32 v174, off, s32 offset:8 @@ -140793,7 +140888,7 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; GFX11-FAKE16-NEXT: scratch_load_b32 v123, off, s32 offset:116 ; GFX11-FAKE16-NEXT: scratch_load_b32 v122, off, s32 offset:120 ; GFX11-FAKE16-NEXT: scratch_load_b32 v121, off, s32 offset:124 -; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload +; GFX11-FAKE16-NEXT: s_clause 0x1f ; GFX11-FAKE16-NEXT: scratch_load_b32 v120, off, s32 offset:128 ; GFX11-FAKE16-NEXT: scratch_load_b32 v111, off, s32 offset:132 ; GFX11-FAKE16-NEXT: scratch_load_b32 v110, off, s32 offset:136 @@ -140826,7 +140921,7 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:244 ; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:248 ; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:252 -; GFX11-FAKE16-NEXT: s_clause 0x8 ; 36-byte Folded Reload +; GFX11-FAKE16-NEXT: s_clause 0x8 ; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:256 ; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:260 ; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:264 @@ -140883,6 +140978,22 @@ define <64 x half> @bitcast_v16f64_to_v64f16(<16 x double> %a, i32 %b) { ; SI-LABEL: bitcast_v16f64_to_v64f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], 
s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 @@ -140951,22 +141062,6 @@ define <64 x half> @bitcast_v16f64_to_v64f16(<16 x double> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; kill: killed $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr47 @@ -140998,7 +141093,7 @@ define <64 x half> @bitcast_v16f64_to_v64f16(<16 x double> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; kill: killed $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -141049,6 +141144,7 @@ define <64 x half> @bitcast_v16f64_to_v64f16(<16 x double> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v8, v52 ; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v32 +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v31 ; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v28 ; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v22 @@ -141218,6 +141314,7 @@ define <64 x half> @bitcast_v16f64_to_v64f16(<16 x double> %a, i32 %b) { ; SI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 ; SI-NEXT: v_add_f64 v[27:28], v[27:28], 1.0 ; SI-NEXT: v_add_f64 v[29:30], v[29:30], 1.0 +; SI-NEXT: 
s_waitcnt vmcnt(6) ; SI-NEXT: v_add_f64 v[31:32], v[31:32], 1.0 ; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v8 ; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v9 @@ -141565,24 +141662,8 @@ define <64 x half> @bitcast_v16f64_to_v64f16(<16 x double> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen @@ -141631,6 +141712,22 @@ define <64 x half> @bitcast_v16f64_to_v64f16(<16 x double> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -142275,22 +142372,6 @@ define inreg <64 x half> @bitcast_v16f64_to_v64f16_scalar(<16 x double> inreg %a ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f16_f32_e32 v2, v14 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x74, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -142310,6 +142391,22 @@ define inreg <64 x half> @bitcast_v16f64_to_v64f16_scalar(<16 x double> inreg %a ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB81_4: @@ -144470,7 +144567,7 @@ define inreg <16 x double> @bitcast_v64f16_to_v16f64_scalar(<64 x half> inreg %a ; GFX11: ; %bb.0: 
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 -; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Spill +; GFX11-NEXT: s_clause 0x1f ; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:292 ; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:288 ; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:284 @@ -144503,7 +144600,7 @@ define inreg <16 x double> @bitcast_v64f16_to_v16f64_scalar(<64 x half> inreg %a ; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:176 ; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:172 ; GFX11-NEXT: scratch_store_b32 off, v95, s32 offset:168 -; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Spill +; GFX11-NEXT: s_clause 0x1f ; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:164 ; GFX11-NEXT: scratch_store_b32 off, v105, s32 offset:160 ; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:156 @@ -144536,7 +144633,7 @@ define inreg <16 x double> @bitcast_v64f16_to_v16f64_scalar(<64 x half> inreg %a ; GFX11-NEXT: scratch_store_b32 off, v157, s32 offset:48 ; GFX11-NEXT: scratch_store_b32 off, v158, s32 offset:44 ; GFX11-NEXT: scratch_store_b32 off, v159, s32 offset:40 -; GFX11-NEXT: s_clause 0x9 ; 40-byte Folded Spill +; GFX11-NEXT: s_clause 0x9 ; GFX11-NEXT: scratch_store_b32 off, v168, s32 offset:36 ; GFX11-NEXT: scratch_store_b32 off, v169, s32 offset:32 ; GFX11-NEXT: scratch_store_b32 off, v170, s32 offset:28 @@ -144614,7 +144711,7 @@ define inreg <16 x double> @bitcast_v64f16_to_v16f64_scalar(<64 x half> inreg %a ; GFX11-NEXT: v_dual_mov_b32 v19, v174 :: v_dual_mov_b32 v20, v173 ; GFX11-NEXT: v_dual_mov_b32 v21, v172 :: v_dual_mov_b32 v22, v171 ; GFX11-NEXT: v_dual_mov_b32 v23, v170 :: v_dual_mov_b32 v24, v183 -; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Reload +; GFX11-NEXT: s_clause 0x1f ; GFX11-NEXT: scratch_load_b32 v185, off, s32 ; GFX11-NEXT: scratch_load_b32 v184, off, s32 offset:4 ; GFX11-NEXT: scratch_load_b32 v175, off, s32 offset:8 @@ -144647,7 +144744,7 @@ define inreg <16 x double> @bitcast_v64f16_to_v16f64_scalar(<64 x half> inreg %a ; GFX11-NEXT: scratch_load_b32 v124, off, s32 offset:116 ; GFX11-NEXT: scratch_load_b32 v123, off, s32 offset:120 ; GFX11-NEXT: scratch_load_b32 v122, off, s32 offset:124 -; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Reload +; GFX11-NEXT: s_clause 0x1f ; GFX11-NEXT: scratch_load_b32 v121, off, s32 offset:128 ; GFX11-NEXT: scratch_load_b32 v120, off, s32 offset:132 ; GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:136 @@ -144680,7 +144777,7 @@ define inreg <16 x double> @bitcast_v64f16_to_v16f64_scalar(<64 x half> inreg %a ; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:244 ; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:248 ; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:252 -; GFX11-NEXT: s_clause 0x9 ; 40-byte Folded Reload +; GFX11-NEXT: s_clause 0x9 ; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:256 ; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:260 ; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:264 @@ -144738,10 +144835,6 @@ define <64 x i16> @bitcast_v16f64_to_v64i16(<16 x double> %a, i32 %b) { ; SI-LABEL: bitcast_v16f64_to_v64i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, 
s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill @@ -144758,6 +144851,10 @@ define <64 x i16> @bitcast_v16f64_to_v64i16(<16 x double> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr63 @@ -144789,13 +144886,14 @@ define <64 x i16> @bitcast_v16f64_to_v64i16(<16 x double> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; kill: killed $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB84_2 ; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_alignbit_b32 v33, v32, v31, 16 ; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: v_alignbit_b32 v34, v30, v29, 16 @@ -144839,6 +144937,7 @@ define <64 x i16> @bitcast_v16f64_to_v64i16(<16 x double> %a, i32 %b) { ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB84_4 ; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_f64 v[31:32], v[31:32], 1.0 ; SI-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 ; SI-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 @@ -145050,22 +145149,6 @@ define <64 x i16> @bitcast_v16f64_to_v64i16(<16 x double> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v53 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 
0xffff, v27 @@ -145092,7 +145175,7 @@ define <64 x i16> @bitcast_v16f64_to_v64i16(<16 x double> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v31 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 @@ -145105,6 +145188,22 @@ define <64 x i16> @bitcast_v16f64_to_v64i16(<16 x double> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -145508,23 +145607,7 @@ define inreg <64 x i16> @bitcast_v16f64_to_v64i16_scalar(<16 x double> inreg %a, ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 
offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 @@ -145585,6 +145668,22 @@ define inreg <64 x i16> @bitcast_v16f64_to_v64i16_scalar(<16 x double> inreg %a, ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB85_4: @@ -145932,13 +146031,7 @@ define <16 x double> @bitcast_v64i16_to_v16f64(<64 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44 ; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v33 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:84 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 -; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:36 @@ -145970,6 +146063,12 @@ define <16 x double> @bitcast_v64i16_to_v16f64(<64 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:100 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 
SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -145985,6 +146084,7 @@ define <16 x double> @bitcast_v64i16_to_v16f64(<64 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_and_b32_e32 v22, 0xffff, v41 ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) @@ -146201,6 +146301,7 @@ define <16 x double> @bitcast_v64i16_to_v16f64(<64 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v41 ; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload @@ -147374,7 +147475,7 @@ define inreg <16 x double> @bitcast_v64i16_to_v16f64_scalar(<64 x i16> inreg %a, ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v14 -; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Spill +; GFX11-NEXT: s_clause 0x1f ; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:292 ; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:288 ; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:284 @@ -147407,7 +147508,7 @@ define inreg <16 x double> @bitcast_v64i16_to_v16f64_scalar(<64 x i16> inreg %a, ; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:176 ; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:172 ; GFX11-NEXT: scratch_store_b32 off, v95, s32 offset:168 -; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Spill +; GFX11-NEXT: s_clause 0x1f ; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:164 ; GFX11-NEXT: scratch_store_b32 off, v105, s32 offset:160 ; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:156 @@ -147440,7 +147541,7 @@ define inreg <16 x double> @bitcast_v64i16_to_v16f64_scalar(<64 x i16> inreg %a, ; GFX11-NEXT: scratch_store_b32 off, v157, s32 offset:48 ; GFX11-NEXT: scratch_store_b32 off, v158, s32 offset:44 ; GFX11-NEXT: scratch_store_b32 off, v159, s32 offset:40 -; GFX11-NEXT: s_clause 0x9 ; 40-byte Folded Spill +; GFX11-NEXT: s_clause 0x9 ; GFX11-NEXT: scratch_store_b32 off, v168, s32 offset:36 ; GFX11-NEXT: scratch_store_b32 off, v169, s32 offset:32 ; GFX11-NEXT: scratch_store_b32 off, v170, s32 offset:28 @@ -147518,7 +147619,7 @@ define inreg <16 x double> @bitcast_v64i16_to_v16f64_scalar(<64 x i16> inreg %a, ; GFX11-NEXT: v_dual_mov_b32 v19, v174 :: v_dual_mov_b32 v20, v173 ; GFX11-NEXT: v_dual_mov_b32 v21, v172 :: v_dual_mov_b32 v22, v171 ; GFX11-NEXT: v_dual_mov_b32 v23, v170 :: v_dual_mov_b32 v24, v183 -; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Reload +; GFX11-NEXT: s_clause 0x1f ; GFX11-NEXT: scratch_load_b32 v185, off, s32 ; GFX11-NEXT: scratch_load_b32 v184, off, s32 offset:4 ; GFX11-NEXT: scratch_load_b32 v175, off, s32 offset:8 @@ -147551,7 +147652,7 @@ define inreg <16 x 
double> @bitcast_v64i16_to_v16f64_scalar(<64 x i16> inreg %a, ; GFX11-NEXT: scratch_load_b32 v124, off, s32 offset:116 ; GFX11-NEXT: scratch_load_b32 v123, off, s32 offset:120 ; GFX11-NEXT: scratch_load_b32 v122, off, s32 offset:124 -; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Reload +; GFX11-NEXT: s_clause 0x1f ; GFX11-NEXT: scratch_load_b32 v121, off, s32 offset:128 ; GFX11-NEXT: scratch_load_b32 v120, off, s32 offset:132 ; GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:136 @@ -147584,7 +147685,7 @@ define inreg <16 x double> @bitcast_v64i16_to_v16f64_scalar(<64 x i16> inreg %a, ; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:244 ; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:248 ; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:252 -; GFX11-NEXT: s_clause 0x9 ; 40-byte Folded Reload +; GFX11-NEXT: s_clause 0x9 ; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:256 ; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:260 ; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:264 @@ -147794,8 +147895,6 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v25 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:268 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:264 ; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr9 @@ -147805,7 +147904,7 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:160 -; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:140 @@ -147845,39 +147944,38 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v29 ; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:88 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:80 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:196 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:212 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:192 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:88 -; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(5) expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(3) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v2 -; SI-NEXT: buffer_store_dword v1, 
off, s[0:3], s32 offset:808 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(5) expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v3 -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:208 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:80 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v3 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:204 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:200 @@ -147893,12 +147991,11 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:972 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v2 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v26, 8, v3 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:240 +; SI-NEXT: v_lshlrev_b32_e32 v26, 8, v3 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:848 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -147920,6 +148017,14 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v33, 24, v2 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v38, 8, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:272 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:268 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:264 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:308 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:316 @@ -147927,15 +148032,11 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:304 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:300 ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:296 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:272 -; SI-NEXT: s_waitcnt 
vmcnt(8) +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_lshlrev_b32_e32 v51, 24, v1 -; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_lshlrev_b32_e32 v41, 24, v2 -; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v44, 8, v3 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:324 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:340 @@ -147944,11 +148045,9 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:336 ; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:332 ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:328 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_lshlrev_b32_e32 v45, 24, v1 ; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v45, 24, v1 +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_lshlrev_b32_e32 v57, 24, v2 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:356 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:372 @@ -147958,7 +148057,7 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:368 ; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:364 ; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:360 -; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_lshlrev_b32_e32 v61, 8, v3 ; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_lshlrev_b32_e32 v62, 24, v1 @@ -149458,25 +149557,10 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: 
buffer_store_dword v1, v0, s[0:3], 0 offen @@ -149773,6 +149857,22 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -149840,8 +149940,8 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v25 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v29 -; VI-NEXT: v_lshlrev_b16_e32 v55, 8, v3 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v55, 8, v3 ; VI-NEXT: v_lshlrev_b16_e32 v40, 8, v5 ; VI-NEXT: v_lshlrev_b16_e32 v41, 8, v7 ; VI-NEXT: v_lshlrev_b16_e32 v50, 8, v9 @@ -149937,25 +150037,13 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:208 ; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:216 ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188 -; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:44 -; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:36 -; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:28 -; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:108 -; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100 -; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:92 -; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:84 -; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:76 -; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:68 -; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:60 -; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:52 -; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; 
VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196 ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 @@ -150083,19 +150171,14 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_lshlrev_b16_e32 v38, 8, v0 ; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_lshlrev_b16_e32 v39, 8, v1 -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 -; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 -; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_lshlrev_b16_e32 v49, 8, v2 ; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v49, 8, v2 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v51, 8, v3 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:356 -; VI-NEXT: v_lshlrev_b16_e32 v51, 8, v3 -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshlrev_b16_e32 v53, 8, v0 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -150103,6 +150186,26 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:372 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v53, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:108 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100 +; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:92 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:84 +; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:76 +; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:68 +; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:52 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -150111,57 +150214,35 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, 
off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(13) +; VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: v_or_b32_sdwa v0, v0, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(12) +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_or_b32_sdwa v1, v1, v55 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(11) +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_or_b32_sdwa v3, v3, v41 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(9) +; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_or_b32_sdwa v2, v2, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_or_b32_sdwa v4, v4, v48 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v10, v61, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(5) -; VI-NEXT: v_or_b32_sdwa v5, v5, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v11, v57, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_or_b32_sdwa v6, v6, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v12, v59, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v13, v47, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v14, v45, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v15, v43, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: ; implicit-def: $vgpr61 -; VI-NEXT: ; implicit-def: $vgpr57 -; VI-NEXT: ; implicit-def: $vgpr59 -; VI-NEXT: ; implicit-def: $vgpr47 -; VI-NEXT: ; implicit-def: $vgpr45 -; VI-NEXT: ; implicit-def: 
$vgpr43 +; VI-NEXT: v_or_b32_sdwa v5, v5, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: ; implicit-def: $vgpr54 ; VI-NEXT: ; implicit-def: $vgpr55 ; VI-NEXT: ; implicit-def: $vgpr40 ; VI-NEXT: ; implicit-def: $vgpr41 ; VI-NEXT: ; implicit-def: $vgpr48 ; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v6, v6, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: ; implicit-def: $vgpr34 -; VI-NEXT: v_or_b32_sdwa v31, v31, v49 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: ; implicit-def: $vgpr49 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v2, v2, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) @@ -150194,19 +150275,39 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v6, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v32, v32, v53 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: ; implicit-def: $vgpr53 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_or_b32_sdwa v10, v61, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_or_b32_sdwa v11, v57, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_or_b32_sdwa v12, v59, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_or_b32_sdwa v13, v47, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_or_b32_sdwa v14, v45, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr61 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr59 +; VI-NEXT: ; implicit-def: 
$vgpr47 +; VI-NEXT: ; implicit-def: $vgpr45 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v15, v43, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr43 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v8, v62, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) @@ -150372,9 +150473,17 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v28, v28, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v31, v31, v49 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr49 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v32, v32, v53 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr53 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v30, v30, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -151059,8 +151168,8 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v27 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v29 -; GFX9-NEXT: v_lshlrev_b16_e32 v54, 8, v3 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b16_e32 v54, 8, v3 ; GFX9-NEXT: v_lshlrev_b16_e32 v41, 8, v5 ; GFX9-NEXT: v_lshlrev_b16_e32 v40, 8, v7 ; GFX9-NEXT: v_lshlrev_b16_e32 v51, 8, v9 @@ -151171,27 +151280,13 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:208 ; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:216 ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188 -; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44 -; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:36 -; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28 -; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:20 -; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:108 -; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:100 -; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92 -; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84 -; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76 -; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68 -; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:60 -; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52 -; 
GFX9-NEXT: s_waitcnt vmcnt(18) +; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(18) +; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(16) +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 @@ -151324,19 +151419,14 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_lshlrev_b16_e32 v37, 8, v0 ; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_lshlrev_b16_e32 v49, 8, v1 -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 -; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 -; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_lshlrev_b16_e32 v48, 8, v2 ; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b16_e32 v48, 8, v2 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v52, 8, v3 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:356 -; GFX9-NEXT: v_lshlrev_b16_e32 v52, 8, v3 -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_lshlrev_b16_e32 v53, 8, v0 -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -151344,6 +151434,26 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:372 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v53, 8, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:108 +; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:100 +; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92 +; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84 +; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76 +; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68 +; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX9-NEXT: 
s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -151352,62 +151462,36 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b32 s6, 0x5040100 -; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(15) +; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(13) +; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_or_b32_sdwa v2, v2, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_perm_b32 v0, v1, v0, s6 -; GFX9-NEXT: s_waitcnt vmcnt(11) +; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_or_b32_sdwa v3, v3, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v0, v1, v0, s6 ; GFX9-NEXT: v_perm_b32 v1, v3, v2, s6 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(11) -; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(10) -; GFX9-NEXT: v_or_b32_sdwa v10, v60, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v11, v56, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_or_b32_sdwa v4, v4, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v12, v58, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_or_b32_sdwa v4, v4, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_or_b32_sdwa v5, v5, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v13, v46, v13 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v6, v6, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v14, v44, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v31, v31, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: ; implicit-def: $vgpr60 -; GFX9-NEXT: ; implicit-def: $vgpr56 -; GFX9-NEXT: ; implicit-def: $vgpr58 -; GFX9-NEXT: ; implicit-def: $vgpr46 -; GFX9-NEXT: ; implicit-def: $vgpr44 ; GFX9-NEXT: ; implicit-def: $vgpr55 ; GFX9-NEXT: ; implicit-def: $vgpr54 ; GFX9-NEXT: ; implicit-def: $vgpr41 ; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: ; implicit-def: $vgpr38 ; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v6, v6, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr48 -; GFX9-NEXT: v_or_b32_sdwa v15, v42, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: ; implicit-def: $vgpr42 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v2, v2, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -151430,25 +151514,49 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v5, v5, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v5, v6, v5, s6 ; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload ; GFX9-NEXT: ; implicit-def: $vgpr34 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v6, v6, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v6, v7, v6, s6 ; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v32, v32, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: ; implicit-def: $vgpr53 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v15, 
off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v7, v8, v7, s6 ; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_or_b32_sdwa v10, v60, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_or_b32_sdwa v11, v56, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_or_b32_sdwa v12, v58, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_or_b32_sdwa v13, v46, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_or_b32_sdwa v14, v44, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: ; implicit-def: $vgpr60 +; GFX9-NEXT: ; implicit-def: $vgpr56 +; GFX9-NEXT: ; implicit-def: $vgpr58 +; GFX9-NEXT: ; implicit-def: $vgpr46 +; GFX9-NEXT: ; implicit-def: $vgpr44 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v15, v42, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: ; implicit-def: $vgpr42 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v8, v63, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -151614,9 +151722,17 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v28, v29, v28, s6 ; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v31, v31, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v32, v32, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: ; implicit-def: $vgpr48 +; GFX9-NEXT: ; implicit-def: $vgpr53 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v30, v30, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v29, v30, v29, s6 @@ -152962,7 +153078,7 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v64bf16: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill +; GFX11-FAKE16-NEXT: s_clause 0x1f ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:580 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:576 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:572 @@ -152995,7 +153111,7 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> 
%a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:464 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:460 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:456 -; GFX11-FAKE16-NEXT: s_clause 0xf ; 64-byte Folded Spill +; GFX11-FAKE16-NEXT: s_clause 0xf ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:452 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:448 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:444 @@ -153824,7 +153940,7 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_perm_b32 v31, v116, v31, 0x5040100 ; GFX11-FAKE16-NEXT: .LBB88_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload +; GFX11-FAKE16-NEXT: s_clause 0x1f ; GFX11-FAKE16-NEXT: scratch_load_b32 v127, off, s32 offset:392 ; GFX11-FAKE16-NEXT: scratch_load_b32 v126, off, s32 offset:396 ; GFX11-FAKE16-NEXT: scratch_load_b32 v125, off, s32 offset:400 @@ -153857,7 +153973,7 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:508 ; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:512 ; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:516 -; GFX11-FAKE16-NEXT: s_clause 0xf ; 64-byte Folded Reload +; GFX11-FAKE16-NEXT: s_clause 0xf ; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:520 ; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:524 ; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:528 @@ -153902,6 +154018,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:332 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:328 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:324 @@ -153912,7 +154029,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:304 ; SI-NEXT: ; implicit-def: $vgpr43 : SGPR spill to VGPR lane ; SI-NEXT: s_mov_b32 s72, s21 -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_writelane_b32 v43, s19, 0 ; SI-NEXT: v_writelane_b32 v43, s18, 1 ; SI-NEXT: v_writelane_b32 v43, s17, 2 @@ -153953,16 +154070,10 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: v_writelane_b32 v41, s86, 30 ; SI-NEXT: v_writelane_b32 v41, s87, 31 ; SI-NEXT: v_writelane_b32 v41, s96, 32 -; SI-NEXT: s_mov_b32 s79, s26 ; SI-NEXT: v_writelane_b32 v41, s97, 33 ; SI-NEXT: v_writelane_b32 v41, s98, 34 ; SI-NEXT: v_writelane_b32 v41, s99, 35 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:164 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:160 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:156 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:152 +; SI-NEXT: s_mov_b32 s79, s26 ; SI-NEXT: v_readfirstlane_b32 s38, v20 ; SI-NEXT: ; implicit-def: $vgpr42 : SGPR spill to VGPR lane ; SI-NEXT: v_readfirstlane_b32 s39, v19 @@ 
-153989,7 +154100,9 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: v_readfirstlane_b32 s18, v5 ; SI-NEXT: v_readfirstlane_b32 s19, v6 ; SI-NEXT: v_readfirstlane_b32 s88, v4 -; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_readfirstlane_b32 s89, v3 +; SI-NEXT: v_readfirstlane_b32 s90, v9 +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_readfirstlane_b32 s6, v31 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:300 ; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:296 @@ -153997,31 +154110,33 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:288 ; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:284 ; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:280 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_readfirstlane_b32 s4, v32 ; SI-NEXT: v_writelane_b32 v43, s4, 4 +; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_readfirstlane_b32 s4, v33 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:276 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:272 ; SI-NEXT: v_writelane_b32 v43, s4, 5 +; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_readfirstlane_b32 s4, v34 ; SI-NEXT: v_writelane_b32 v43, s4, 6 +; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_readfirstlane_b32 s4, v35 ; SI-NEXT: v_writelane_b32 v43, s4, 7 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_readfirstlane_b32 s4, v36 ; SI-NEXT: v_writelane_b32 v43, s4, 8 +; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_readfirstlane_b32 s4, v37 ; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:268 ; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:264 ; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:260 ; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:256 ; SI-NEXT: v_writelane_b32 v43, s4, 9 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_readfirstlane_b32 s4, v38 ; SI-NEXT: v_writelane_b32 v43, s4, 10 -; SI-NEXT: v_readfirstlane_b32 s89, v3 -; SI-NEXT: v_readfirstlane_b32 s90, v9 ; SI-NEXT: v_readfirstlane_b32 s91, v10 ; SI-NEXT: v_readfirstlane_b32 s92, v8 ; SI-NEXT: v_readfirstlane_b32 s93, v7 @@ -154104,41 +154219,44 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: v_readfirstlane_b32 s24, v33 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:172 ; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:168 -; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:164 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:160 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:156 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:152 +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_readfirstlane_b32 s78, v34 -; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_readfirstlane_b32 s4, v35 ; SI-NEXT: v_writelane_b32 v43, s4, 18 -; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_readfirstlane_b32 s4, v36 ; SI-NEXT: v_writelane_b32 v43, s4, 19 -; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_readfirstlane_b32 s4, v37 ; SI-NEXT: v_writelane_b32 v43, s4, 20 -; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: v_writelane_b32 v43, s4, 21 -; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_readfirstlane_b32 s4, v38 ; SI-NEXT: v_writelane_b32 v43, s4, 22 
-; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_readfirstlane_b32 s4, v39 ; SI-NEXT: v_writelane_b32 v43, s4, 23 -; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_readfirstlane_b32 s4, v48 ; SI-NEXT: v_writelane_b32 v43, s4, 24 -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_readfirstlane_b32 s4, v49 ; SI-NEXT: v_writelane_b32 v43, s4, 25 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_readfirstlane_b32 s4, v50 ; SI-NEXT: v_writelane_b32 v43, s4, 26 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_readfirstlane_b32 s4, v51 ; SI-NEXT: v_writelane_b32 v43, s4, 27 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:148 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:144 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_readfirstlane_b32 s4, v33 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:140 ; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:136 @@ -154152,7 +154270,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:108 ; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:104 ; SI-NEXT: v_writelane_b32 v43, s4, 28 -; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_readfirstlane_b32 s4, v52 ; SI-NEXT: v_writelane_b32 v43, s4, 29 ; SI-NEXT: v_readfirstlane_b32 s4, v53 @@ -154161,6 +154279,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; SI-NEXT: v_writelane_b32 v43, s4, 31 ; SI-NEXT: v_readfirstlane_b32 s4, v55 ; SI-NEXT: v_writelane_b32 v43, s4, 32 +; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_readfirstlane_b32 s4, v40 ; SI-NEXT: v_writelane_b32 v43, s4, 33 ; SI-NEXT: v_writelane_b32 v43, s22, 34 @@ -155775,53 +155894,33 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; VI-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:240 ; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v24 ; VI-NEXT: v_lshlrev_b32_e32 v10, 8, v26 -; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:124 -; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:132 -; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:140 -; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:148 -; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:156 -; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:164 -; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:172 -; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:180 -; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:188 -; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:196 -; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:204 -; VI-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:212 -; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:220 -; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:228 -; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:236 -; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:244 -; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:252 -; VI-NEXT: buffer_load_ushort v27, off, s[0:3], s32 offset:260 -; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:268 -; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:276 -; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:284 -; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:292 -; VI-NEXT: 
buffer_load_ushort v52, off, s[0:3], s32 offset:300 -; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:308 -; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:316 -; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:324 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v14 ; VI-NEXT: v_lshlrev_b32_e32 v16, 8, v16 ; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 ; VI-NEXT: v_lshlrev_b32_e32 v20, 8, v20 ; VI-NEXT: v_lshlrev_b32_e32 v22, 8, v22 -; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: v_lshlrev_b32_e32 v32, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v4 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v5 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v6 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v7 -; VI-NEXT: v_lshlrev_b32_e32 v36, 8, v2 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v36, 8, v2 ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:248 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:256 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:264 @@ -155866,6 +155965,52 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:68 ; VI-NEXT: s_waitcnt vmcnt(10) ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v2 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:76 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:84 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:92 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:100 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:108 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:116 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:124 +; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:132 +; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:140 +; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:148 +; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:156 +; VI-NEXT: 
buffer_load_ushort v56, off, s[0:3], s32 offset:164 +; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:172 +; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:180 +; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:188 +; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:196 +; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:204 +; VI-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:212 +; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:220 +; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:228 +; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:236 +; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:244 +; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:252 +; VI-NEXT: buffer_load_ushort v27, off, s[0:3], s32 offset:260 +; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:268 +; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:276 +; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:284 +; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:292 +; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:300 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:308 +; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:316 +; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:324 ; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill @@ -155885,7 +156030,6 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill @@ -155894,6 +156038,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill @@ -155925,25 +156070,6 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:76 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, 
s[0:3], s32 offset:84 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:92 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:100 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:108 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:116 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill ; VI-NEXT: s_cbranch_scc0 .LBB89_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload @@ -155968,18 +156094,15 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: v_or_b32_sdwa v2, v2, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_or_b32_sdwa v3, v3, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v2, v8 -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v0, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v1, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload @@ -156029,11 +156152,10 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; VI-NEXT: v_or_b32_sdwa v1, v33, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_or_b32_sdwa v0, v35, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: 
v_or_b32_sdwa v1, v37, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -156041,37 +156163,50 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; VI-NEXT: v_or_b32_sdwa v1, v48, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v0, v49, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v1, v1, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v1, v60, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v2, v35, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_or_b32_sdwa v0, v0, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(9) +; VI-NEXT: v_or_b32_sdwa v3, v33, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: v_or_b32_sdwa v0, v55, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_or_b32_sdwa v1, v49, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v18, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: 
s_waitcnt vmcnt(4) ; VI-NEXT: v_or_b32_sdwa v0, v42, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_or_b32_sdwa v1, v41, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v19, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_or_b32_sdwa v0, v39, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v42, v43 ; VI-NEXT: v_mov_b32_e32 v43, v37 -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v1, v46, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v20, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload @@ -156086,12 +156221,13 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v1, v24, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_or_b32_sdwa v0, v26, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v23, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_or_b32_sdwa v0, v34, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) @@ -156113,28 +156249,21 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; VI-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v0, v31, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v54, v33 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v56, v1 ; VI-NEXT: v_or_b32_sdwa v1, v59, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v2, v35, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword 
v33, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v63, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v30, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v3, v33, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v63, v39 -; VI-NEXT: v_mov_b32_e32 v54, v33 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_mov_b32_e32 v57, v0 ; VI-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -156152,10 +156281,11 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; VI-NEXT: v_or_b32_sdwa v31, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v53, v35 -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v3, s4, v0 ; VI-NEXT: s_and_b32 s4, s16, 0xff @@ -156188,6 +156318,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; VI-NEXT: s_branch .LBB89_3 ; VI-NEXT: .LBB89_2: ; VI-NEXT: v_mov_b32_e32 v47, v54 +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload @@ -156208,7 +156339,6 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v58, v7 ; VI-NEXT: v_mov_b32_e32 v57, v5 ; VI-NEXT: v_mov_b32_e32 v56, v3 @@ -156800,51 +156930,29 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; GFX9-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:224 ; GFX9-NEXT: buffer_load_ushort v9, off, s[0:3], s32 offset:232 ; GFX9-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:240 -; GFX9-NEXT: buffer_load_ushort v22, off, 
s[0:3], s32 offset:156 -; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:164 -; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:172 -; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:180 -; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:188 -; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:196 -; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:204 -; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:212 -; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:220 -; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:228 -; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:236 -; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:244 -; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:252 -; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:260 -; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:268 -; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:276 -; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:284 -; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:292 -; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:300 -; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:308 -; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:316 -; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:324 ; GFX9-NEXT: v_lshlrev_b32_e32 v46, 8, v46 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec -; GFX9-NEXT: s_waitcnt vmcnt(29) +; GFX9-NEXT: s_waitcnt vmcnt(7) ; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v11 -; GFX9-NEXT: s_waitcnt vmcnt(28) +; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(28) +; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v13 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(28) +; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v3 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(28) +; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v4 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(28) +; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v5 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(28) +; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v9 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(28) +; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v7 ; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill @@ -156908,42 +157016,82 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:124 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:132 -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:140 +; GFX9-NEXT: s_nop 0 +; 
GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:148 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:156 +; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:164 +; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:172 +; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:180 +; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:188 +; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:196 +; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:204 +; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:212 +; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:220 +; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:228 +; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:236 +; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:244 +; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:252 +; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:260 +; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:268 +; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:276 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:284 +; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:292 +; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:300 +; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:308 +; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:316 +; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:324 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) ; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(22) ; GFX9-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) ; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(24) ; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(28) ; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:700 ; 4-byte Folded 
Spill ; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(31) ; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) ; GFX9-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) ; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) ; GFX9-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(35) ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(35) ; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill @@ -156964,13 +157112,6 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(55) -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:140 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:148 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill ; GFX9-NEXT: s_cbranch_scc0 .LBB89_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: s_and_b32 s4, s28, 0xff @@ -157224,13 +157365,14 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; GFX9-NEXT: v_lshl_or_b32 v30, v1, 16, v0 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v62, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v33, off, 
s[0:3], s32 offset:412 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_or_b32_sdwa v1, v62, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v31, v1, 16, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 @@ -157240,6 +157382,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; GFX9-NEXT: .LBB89_2: ; GFX9-NEXT: v_mov_b32_e32 v58, v50 ; GFX9-NEXT: v_mov_b32_e32 v45, v59 +; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload @@ -157251,7 +157394,6 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload ; GFX9-NEXT: v_mov_b32_e32 v34, v35 -; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload ; GFX9-NEXT: v_mov_b32_e32 v49, v39 @@ -157717,7 +157859,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; GFX11-TRUE16-LABEL: bitcast_v128i8_to_v64bf16_scalar: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: s_clause 0x1e ; 124-byte Folded Spill +; GFX11-TRUE16-NEXT: s_clause 0x1e ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:440 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:436 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:432 @@ -158447,7 +158589,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; GFX11-TRUE16-NEXT: v_mov_b16_e64 v30.h, v182.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.h, v43.l ; GFX11-TRUE16-NEXT: .LBB89_3: ; %end -; GFX11-TRUE16-NEXT: s_clause 0x1e ; 124-byte Folded Reload +; GFX11-TRUE16-NEXT: s_clause 0x1e ; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:320 ; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:324 ; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:328 @@ -158489,7 +158631,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v64bf16_scalar: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_clause 0x1e ; 124-byte Folded Spill +; GFX11-FAKE16-NEXT: s_clause 0x1e ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:440 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:436 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:432 @@ -159273,7 +159415,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v30, v30, 16, v35 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v31, v31, 16, v36 ; GFX11-FAKE16-NEXT: .LBB89_3: ; %end -; GFX11-FAKE16-NEXT: s_clause 0x1e ; 124-byte Folded Reload +; GFX11-FAKE16-NEXT: s_clause 0x1e ; GFX11-FAKE16-NEXT: scratch_load_b32 v94, 
off, s32 offset:320 ; GFX11-FAKE16-NEXT: scratch_load_b32 v93, off, s32 offset:324 ; GFX11-FAKE16-NEXT: scratch_load_b32 v92, off, s32 offset:328 @@ -161295,23 +161437,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v2, 0xff, v10 ; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v58 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload @@ -161358,15 +161484,28 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload 
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v64bf16_to_v128i8: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 -; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill @@ -161383,6 +161522,9 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; VI-NEXT: ; implicit-def: $vgpr35 ; VI-NEXT: ; implicit-def: $vgpr45 ; VI-NEXT: ; implicit-def: $vgpr34 @@ -161571,165 +161713,166 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v29 ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 24, v28 -; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[15:16] ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v28 ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v28 -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v27 -; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[13:14] ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v27 ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 24, v26 -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v26 -; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[11:12] ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v26 +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v16 ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v25 -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v16 ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v25 -; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[9:10] +; VI-NEXT: 
buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v16 ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 24, v24 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v15 ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v24 -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v15 ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v24 -; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[7:8] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v14 ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v23 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v14 ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v23 -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v14 ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 24, v22 -; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[5:6] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v13 ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v22 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v13 ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v22 -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v5 ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v21 -; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[3:4] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v5 ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v21 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v4 ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 24, v20 -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; VI-NEXT: 
buffer_store_dword v45, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v4 ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v20 -; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[1:2] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v4 ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v20 -; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v16 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v3 ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v19 -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v16 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v3 ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v19 -; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[31:32] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v16 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v2 ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 24, v18 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v46, 24, v12 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v2 ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v18 -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v15 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v2 ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v18 -; VI-NEXT: v_mov_b32_e32 v45, v46 -; VI-NEXT: v_lshrrev_b64 v[46:47], 24, v[29:30] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v14 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v1 ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v17 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v14 -; VI-NEXT: v_lshrrev_b32_e32 v63, 8, v11 -; VI-NEXT: 
v_lshrrev_b32_e32 v50, 8, v31 +; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[15:16] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v1 ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v17 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[13:14] +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[11:12] +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[9:10] +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[7:8] +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[5:6] +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[3:4] +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[1:2] +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[31:32] +; VI-NEXT: v_lshrrev_b32_e32 v46, 24, v12 +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v45, v46 +; VI-NEXT: v_lshrrev_b64 v[46:47], 24, v[29:30] +; VI-NEXT: v_lshrrev_b32_e32 v63, 8, v11 +; VI-NEXT: v_lshrrev_b32_e32 v50, 8, v31 ; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v14 -; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v46, v63 ; VI-NEXT: v_mov_b32_e32 v63, v50 ; VI-NEXT: v_lshrrev_b64 v[50:51], 24, v[27:28] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v13 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v13 ; VI-NEXT: v_lshrrev_b32_e32 v56, 8, v10 ; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v9 ; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:92 ; 4-byte 
Folded Spill ; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v43, 24, v8 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v5 ; VI-NEXT: v_mov_b32_e32 v51, v57 ; VI-NEXT: v_mov_b32_e32 v50, v56 ; VI-NEXT: v_lshrrev_b64 v[56:57], 24, v[25:26] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v5 ; VI-NEXT: v_mov_b32_e32 v57, v43 ; VI-NEXT: v_lshrrev_b64 v[43:44], 24, v[23:24] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v4 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v4 ; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v4 ; VI-NEXT: v_lshrrev_b64 v[43:44], 24, v[21:22] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v3 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v3 ; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v2 ; VI-NEXT: v_lshrrev_b64 v[43:44], 24, v[19:20] -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v2 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v2 ; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v1 ; VI-NEXT: v_lshrrev_b64 v[43:44], 24, v[17:18] ; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v10 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v1 ; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v12 ; VI-NEXT: v_lshrrev_b32_e32 v49, 8, v12 ; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v11 @@ -161742,7 +161885,6 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v54, 24, v6 ; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v6 ; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v6 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v37, 24, v32 ; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v32 ; VI-NEXT: v_lshrrev_b32_e32 v48, 8, v32 @@ -162376,27 +162518,27 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[21:22] -; VI-NEXT: v_lshrrev_b32_e32 v43, 24, v28 ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:116 ; 4-byte 
Folded Spill ; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[19:20] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[17:18] +; VI-NEXT: v_lshrrev_b32_e32 v43, 24, v28 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v28 -; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[19:20] ; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v43, 8, v28 ; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v27 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v43, 8, v27 -; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[17:18] ; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v43, 24, v26 ; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v26 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v43, 8, v26 ; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v16 @@ -162781,10 +162923,9 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: v_or_b32_sdwa v1, v23, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -162879,6 +163020,16 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v2, v58, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x78, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v48 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v37 +; VI-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v52, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7c, v0 +; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload @@ -162895,16 +163046,6 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x78, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v48 -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v37 -; VI-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v52, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7c, v0 -; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -163141,11 +163282,49 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 24, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 24, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 8, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 8, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 8, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 8, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 8, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 24, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 8, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 8, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 8, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 8, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 24, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 8, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 8, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 8, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 8, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 24, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v60, 8, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v61, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 8, v17 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v16 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v16 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v16 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v15 +; GFX9-NEXT: 
buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v15 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v18 -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v14 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v4 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill @@ -163159,7 +163338,6 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v63 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(44) ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v62 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v62 @@ -163177,168 +163355,130 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v28 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v28 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v14 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v28 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v14 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v27 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v13 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v27 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v13 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v26 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v12 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v12 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v26 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v12 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v25 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:424 ; 4-byte 
Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v11 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v25 -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 24, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 24, v6 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v10 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v24 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v10 ; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v8 ; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v6 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v24 +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[15:16] +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v9 ; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v7 ; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v5 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v54, 8, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v40, 8, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v8 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 8, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v42, 8, v6 ; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 8, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v52, 8, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 8, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v53, 24, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v51, 8, v2 -; GFX9-NEXT: s_waitcnt vmcnt(35) -; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[15:16] -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v16 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(35) ; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[13:14] -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[11:12] -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v16 ; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: 
buffer_store_dword v59, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[9:10] -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[7:8] -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v16 ; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[5:6] -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[3:4] -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v15 ; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[1:2] -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[62:63] -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v15 ; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[29:30] -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[27:28] -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v14 ; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[25:26] -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v14 ; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[23:24] -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v14 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v13 ; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, 
v13 ; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[21:22] -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v12 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v12 ; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v12 ; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[19:20] -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v11 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v10 ; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v10 ; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[17:18] -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v9 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v41, 8, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v49, 8, v24 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v23 -; GFX9-NEXT: v_lshrrev_b32_e32 v57, 8, v23 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 24, v22 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v50, 8, v22 -; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v21 -; GFX9-NEXT: v_lshrrev_b32_e32 v55, 8, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 8, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 8, v5 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v20 -; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v47, 8, v20 ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v19 -; GFX9-NEXT: v_lshrrev_b32_e32 v45, 8, v19 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 24, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v60, 8, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v61, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v43, 8, v17 ; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill @@ -163431,11 +163571,16 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; GFX9-NEXT: v_cndmask_b32_e32 v13, v18, v19, vcc ; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_mov_b32_e32 v59, v32 ; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6 ; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 ; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v21 +; GFX9-NEXT: v_mov_b32_e32 v58, v31 ; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:460 ; 
4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; GFX9-NEXT: v_perm_b32 v14, v13, v0, s7 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v18, v19, vcc @@ -163590,7 +163735,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6 ; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 -; GFX9-NEXT: s_waitcnt vmcnt(50) +; GFX9-NEXT: s_waitcnt vmcnt(52) ; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v62 ; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 ; GFX9-NEXT: v_cndmask_b32_e32 v44, v18, v19, vcc @@ -163605,6 +163750,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6 ; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 @@ -163745,10 +163891,8 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v9 ; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GFX9-NEXT: v_mov_b32_e32 v59, v32 ; GFX9-NEXT: v_cndmask_b32_e32 v10, v2, v10, vcc ; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_mov_b32_e32 v58, v31 ; GFX9-NEXT: v_add3_u32 v2, v2, v1, s6 ; GFX9-NEXT: v_or_b32_e32 v31, 0x400000, v1 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 @@ -163814,7 +163958,6 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_e32 v41, 0x400000, v31 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 ; GFX9-NEXT: v_bfe_u32 v31, v13, 16, 1 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; GFX9-NEXT: v_perm_b32 v61, v28, v0, s7 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v32, v41, vcc ; GFX9-NEXT: v_add3_u32 v31, v31, v13, s6 @@ -163822,7 +163965,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 ; GFX9-NEXT: v_cndmask_b32_e32 v13, v31, v32, vcc ; GFX9-NEXT: v_perm_b32 v41, v13, v0, s7 -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v16 ; GFX9-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 ; GFX9-NEXT: v_bfe_u32 v31, v13, 16, 1 @@ -163851,14 +163994,24 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_e32 v45, 0x400000, v15 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 ; GFX9-NEXT: v_cndmask_b32_e32 v15, v31, v45, vcc -; GFX9-NEXT: v_perm_b32 v32, v16, v13, s7 ; GFX9-NEXT: v_perm_b32 v31, v15, v26, s7 ; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v16 +; GFX9-NEXT: v_perm_b32 v32, v16, v13, s7 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v26 ; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v14 +; GFX9-NEXT: v_perm_b32 v42, v14, v11, s7 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v11 ; GFX9-NEXT: 
v_lshrrev_b32_e32 v13, 16, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v12 +; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v10 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill @@ -163878,19 +164031,12 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; GFX9-NEXT: v_perm_b32 v34, v30, v27, s7 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v26 ; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v27 ; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v25 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; GFX9-NEXT: v_perm_b32 v36, v44, v29, s7 ; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v29 ; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v24 ; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v23 -; GFX9-NEXT: v_perm_b32 v42, v14, v11, s7 -; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v11 ; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v22 ; GFX9-NEXT: v_perm_b32 v38, v21, v43, s7 @@ -163899,24 +164045,6 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v5 ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v20 -; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; GFX9-NEXT: v_perm_b32 v55, v12, v9, s7 -; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v43 -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill ; GFX9-NEXT: v_perm_b32 v51, v6, v17, s7 ; GFX9-NEXT: v_perm_b32 v40, v10, v7, s7 ; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v7 @@ -163924,8 +164052,12 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v17 ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v19 ; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v57 +; GFX9-NEXT: v_perm_b32 v55, v12, v9, s7 +; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v43 ; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v47 -; 
GFX9-NEXT: s_waitcnt vmcnt(13) +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v0 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -163969,6 +164101,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v56 ; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[31:32] +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill @@ -164001,51 +164134,74 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[35:36] -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v32 ; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[33:34] +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[60:61] +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v32 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[13:14] +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[62:63] +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[43:44] +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[58:59] +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:432 
; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[58:59] +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v32 -; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[33:34] ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v31 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v31 -; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v42 -; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[60:61] ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v42 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v41 -; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v41 -; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[13:14] ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v55 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v55 -; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v54 -; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[62:63] ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v40 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v39 -; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v39 ; GFX9-NEXT: v_lshrrev_b32_e32 v31, 24, v53 @@ -164058,26 +164214,15 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v41, 8, v37 ; GFX9-NEXT: v_lshrrev_b32_e32 v37, 24, v36 ; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v36 -; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[43:44] ; GFX9-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v35 ; GFX9-NEXT: v_lshrrev_b32_e32 v35, 
8, v35 ; GFX9-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v35, 24, v34 ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 8, v34 -; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v33 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v33 -; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[58:59] -; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v61 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill @@ -164086,9 +164231,6 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v60 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v60 -; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshrrev_b32_e32 v54, 8, v54 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v14 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill @@ -164113,33 +164255,31 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v52, 8, v49 ; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v48 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v63, v16 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v35 -; GFX9-NEXT: v_lshrrev_b32_e32 v40, 8, v40 ; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 8, v48 ; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v62, v15 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v63, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 8, v54 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 8, v40 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 8, v48 +; GFX9-NEXT: v_mov_b32_e32 v62, v15 ; GFX9-NEXT: v_lshrrev_b32_e32 v49, 8, v35 ; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 8, v34 ; GFX9-NEXT: v_lshrrev_b32_e32 v37, 24, v44 ; GFX9-NEXT: v_lshrrev_b32_e32 v50, 8, v44 -; GFX9-NEXT: v_lshrrev_b32_e32 v55, 8, v43 -; GFX9-NEXT: s_waitcnt vmcnt(24) -; GFX9-NEXT: v_lshrrev_b64 
v[56:57], 24, v[58:59] -; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v57, 8, v34 ; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v43 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 8, v43 ; GFX9-NEXT: v_lshrrev_b32_e32 v35, 24, v59 ; GFX9-NEXT: v_lshrrev_b32_e32 v43, 8, v58 -; GFX9-NEXT: s_waitcnt vmcnt(19) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v60 -; GFX9-NEXT: s_waitcnt vmcnt(18) +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v61 ; GFX9-NEXT: v_lshrrev_b32_e32 v47, 8, v61 ; GFX9-NEXT: v_lshrrev_b32_e32 v45, 8, v60 @@ -164154,10 +164294,6 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_lshlrev_b16_e32 v10, 8, v54 ; GFX9-NEXT: v_or_b32_sdwa v10, v11, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; GFX9-NEXT: v_lshlrev_b16_e32 v32, 8, v32 ; GFX9-NEXT: v_lshlrev_b16_e32 v31, 8, v31 ; GFX9-NEXT: v_or_b32_sdwa v5, v5, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -164166,8 +164302,6 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_lshlrev_b16_e32 v31, 8, v39 ; GFX9-NEXT: v_or_b32_sdwa v6, v6, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v8, v8, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; GFX9-NEXT: v_lshlrev_b16_e32 v41, 8, v41 ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b16_e32 v51, 8, v51 @@ -164176,29 +164310,38 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v3, v3, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b16_e32 v48, 8, v52 ; GFX9-NEXT: v_or_b32_sdwa v4, v4, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshlrev_b16_e32 v11, 8, v11 ; GFX9-NEXT: v_or_b32_sdwa v11, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v12, 8, v12 +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v12, v13, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; 
GFX9-NEXT: v_lshlrev_b16_e32 v13, 8, v13 ; GFX9-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v14, 8, v14 +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v14, v15, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v15, 8, v15 ; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b16_e32 v16, 8, v43 ; GFX9-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b16_e32 v17, 8, v60 ; GFX9-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v18, 8, v31 ; GFX9-NEXT: v_or_b32_sdwa v18, v38, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -164373,13 +164516,13 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v0, v46, s[0:3], 0 offen offset:76 ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX9-NEXT: buffer_store_dword v0, v46, s[0:3], 0 offen offset:76 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v55 ; GFX9-NEXT: v_or_b32_sdwa v0, v21, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v56, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v0, v46, s[0:3], 0 offen offset:80 @@ -164390,13 +164533,13 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v0, v46, s[0:3], 0 offen offset:84 ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; 
GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX9-NEXT: buffer_store_dword v0, v46, s[0:3], 0 offen offset:84 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v57 ; GFX9-NEXT: v_or_b32_sdwa v0, v23, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v36, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v0, v46, s[0:3], 0 offen offset:88 @@ -164536,11 +164679,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-LABEL: bitcast_v64bf16_to_v128i8: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: s_clause 0x2 -; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 offset:8 -; GFX11-TRUE16-NEXT: scratch_load_b32 v81, off, s32 offset:4 -; GFX11-TRUE16-NEXT: scratch_load_b32 v80, off, s32 -; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill +; GFX11-TRUE16-NEXT: s_clause 0x1f ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:248 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:244 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:240 @@ -164573,7 +164712,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:132 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:128 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:124 -; GFX11-TRUE16-NEXT: s_clause 0x1b ; 112-byte Folded Spill +; GFX11-TRUE16-NEXT: s_clause 0x1b ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:120 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:116 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:112 @@ -164602,6 +164741,10 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v153, s32 offset:20 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v154, s32 offset:16 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v155, s32 offset:12 +; GFX11-TRUE16-NEXT: s_clause 0x2 +; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 offset:8 +; GFX11-TRUE16-NEXT: scratch_load_b32 v81, off, s32 offset:4 +; GFX11-TRUE16-NEXT: scratch_load_b32 v80, off, s32 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr181_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr152_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_hi16 @@ -165635,7 +165778,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:80 ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:96 ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:112 -; GFX11-TRUE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload +; GFX11-TRUE16-NEXT: s_clause 0x1f ; GFX11-TRUE16-NEXT: scratch_load_b32 v155, off, s32 offset:12 ; GFX11-TRUE16-NEXT: scratch_load_b32 v154, off, s32 offset:16 ; GFX11-TRUE16-NEXT: scratch_load_b32 v153, off, s32 offset:20 @@ -165668,7 +165811,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:128 ; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:132 ; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:136 -; GFX11-TRUE16-NEXT: s_clause 0x1b ; 112-byte Folded Reload 
+; GFX11-TRUE16-NEXT: s_clause 0x1b ; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:140 ; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:144 ; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:148 @@ -165703,11 +165846,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-LABEL: bitcast_v64bf16_to_v128i8: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_clause 0x2 -; GFX11-FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:8 -; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 -; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32 -; GFX11-FAKE16-NEXT: s_clause 0x15 ; 88-byte Folded Spill +; GFX11-FAKE16-NEXT: s_clause 0x15 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:96 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:92 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:88 @@ -165730,6 +165869,10 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:20 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v76, s32 offset:16 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v77, s32 offset:12 +; GFX11-FAKE16-NEXT: s_clause 0x2 +; GFX11-FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr76 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr75 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr66 @@ -166848,7 +166991,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:80 ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:96 ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:112 -; GFX11-FAKE16-NEXT: s_clause 0x15 ; 88-byte Folded Reload +; GFX11-FAKE16-NEXT: s_clause 0x15 ; GFX11-FAKE16-NEXT: scratch_load_b32 v77, off, s32 offset:12 ; GFX11-FAKE16-NEXT: scratch_load_b32 v76, off, s32 offset:16 ; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:20 @@ -168520,26 +168663,13 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: v_and_b32_e32 v1, 0xff, v46 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload ; SI-NEXT: s_lshl_b32 s5, s86, 24 -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:132 ; 4-byte Folded 
Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: v_readlane_b32 s86, v63, 30 ; SI-NEXT: v_readlane_b32 s84, v63, 28 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -168578,6 +168708,20 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload @@ -169780,15 +169924,6 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_mov_b32_e32 v43, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 13 ; VI-NEXT: v_mov_b32_e32 v46, s4 -; VI-NEXT: v_mov_b32_e32 v45, s72 -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v45, s74 -; VI-NEXT: v_mov_b32_e32 v42, s54 -; VI-NEXT: v_mov_b32_e32 v41, s46 -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v41, s56 ; VI-NEXT: v_readlane_b32 s4, v62, 14 ; VI-NEXT: v_mov_b32_e32 v50, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 15 @@ -169814,11 +169949,6 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_readlane_b32 s4, v62, 22 ; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; VI-NEXT: 
buffer_store_dword v46, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v45, s76 ; VI-NEXT: v_readlane_b32 s4, v62, 23 ; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v55, s4 @@ -169864,8 +169994,6 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_readlane_b32 s4, v62, 37 ; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; VI-NEXT: v_readlane_b32 s4, v62, 38 ; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v55, s4 @@ -169924,47 +170052,45 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v55, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 57 +; VI-NEXT: v_mov_b32_e32 v42, s54 ; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_mov_b32_e32 v36, s66 +; VI-NEXT: v_mov_b32_e32 v41, s46 ; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v41, s56 +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v41, s58 +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v41, s60 +; VI-NEXT: v_mov_b32_e32 v45, s72 +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v45, s74 +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v45, s76 +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v45, s78 ; VI-NEXT: v_mov_b32_e32 v55, s88 -; VI-NEXT: v_mov_b32_e32 v35, s30 -; VI-NEXT: v_mov_b32_e32 v41, s58 ; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:204 ; 
4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v35, s85 -; VI-NEXT: v_mov_b32_e32 v34, s38 -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v36, s66 ; VI-NEXT: v_mov_b32_e32 v52, s64 -; VI-NEXT: v_mov_b32_e32 v59, s87 -; VI-NEXT: v_mov_b32_e32 v41, s60 ; VI-NEXT: v_mov_b32_e32 v55, v50 +; VI-NEXT: v_mov_b32_e32 v35, s30 +; VI-NEXT: v_mov_b32_e32 v59, s87 ; VI-NEXT: v_mov_b32_e32 v58, s34 ; VI-NEXT: v_mov_b32_e32 v45, s36 -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v50, v46 -; VI-NEXT: v_mov_b32_e32 v46, v48 -; VI-NEXT: v_mov_b32_e32 v48, v47 -; VI-NEXT: v_mov_b32_e32 v47, v56 -; VI-NEXT: v_mov_b32_e32 v56, v51 -; VI-NEXT: v_mov_b32_e32 v51, s90 -; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v34, s48 +; VI-NEXT: v_mov_b32_e32 v34, s38 ; VI-NEXT: v_mov_b32_e32 v1, s44 ; VI-NEXT: v_mov_b32_e32 v2, s45 ; VI-NEXT: v_mov_b32_e32 v3, s42 @@ -169997,19 +170123,37 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_mov_b32_e32 v30, s29 ; VI-NEXT: v_mov_b32_e32 v32, s5 ; VI-NEXT: v_mov_b32_e32 v41, s62 -; VI-NEXT: v_mov_b32_e32 v51, v53 -; VI-NEXT: v_mov_b32_e32 v53, v54 -; VI-NEXT: v_mov_b32_e32 v54, v40 -; VI-NEXT: v_mov_b32_e32 v40, s80 ; VI-NEXT: v_mov_b32_e32 v57, s81 ; VI-NEXT: v_mov_b32_e32 v37, s84 -; VI-NEXT: v_mov_b32_e32 v58, s50 ; VI-NEXT: v_mov_b32_e32 v60, s52 ; VI-NEXT: v_mov_b32_e32 v38, s51 ; VI-NEXT: v_mov_b32_e32 v61, s65 ; VI-NEXT: v_mov_b32_e32 v49, s66 -; VI-NEXT: v_mov_b32_e32 v45, s53 ; VI-NEXT: v_mov_b32_e32 v39, s55 +; VI-NEXT: v_mov_b32_e32 v50, v46 +; VI-NEXT: v_mov_b32_e32 v46, v48 +; VI-NEXT: v_mov_b32_e32 v48, v47 +; VI-NEXT: v_mov_b32_e32 v47, v56 +; VI-NEXT: v_mov_b32_e32 v56, v51 +; VI-NEXT: v_mov_b32_e32 v51, s90 +; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v35, s85 +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v34, s48 +; VI-NEXT: v_mov_b32_e32 v51, v53 
+; VI-NEXT: v_mov_b32_e32 v53, v54 +; VI-NEXT: v_mov_b32_e32 v54, v40 +; VI-NEXT: v_mov_b32_e32 v40, s80 +; VI-NEXT: v_mov_b32_e32 v58, s50 +; VI-NEXT: v_mov_b32_e32 v45, s53 ; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; VI-NEXT: .LBB91_5: ; %end @@ -170318,10 +170462,9 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; VI-NEXT: v_or_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v2, v37, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -170399,23 +170542,9 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v50 ; VI-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v2, v55, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -170433,6 +170562,20 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen 
+; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; VI-NEXT: s_or_saveexec_b64 s[4:5], -1 ; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload @@ -172021,7 +172164,11 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:120 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v12 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v6 ; GFX9-NEXT: v_or_b32_sdwa v1, v41, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v55, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:124 ; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -172036,10 +172183,6 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v6 -; GFX9-NEXT: v_or_b32_sdwa v2, v55, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:124 ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload @@ -172051,7 +172194,7 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_or_saveexec_b32 s4, -1 -; GFX11-TRUE16-NEXT: 
s_clause 0x3 ; 16-byte Folded Spill +; GFX11-TRUE16-NEXT: s_clause 0x3 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:4 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:8 @@ -173601,7 +173744,7 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v40, 0 ; GFX11-TRUE16-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-TRUE16-NEXT: s_clause 0x3 ; 16-byte Folded Reload +; GFX11-TRUE16-NEXT: s_clause 0x3 ; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:4 ; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:8 @@ -173614,7 +173757,7 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: s_or_saveexec_b32 s4, -1 -; GFX11-FAKE16-NEXT: s_clause 0x3 ; 16-byte Folded Spill +; GFX11-FAKE16-NEXT: s_clause 0x3 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:4 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:8 @@ -175171,7 +175314,7 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; GFX11-FAKE16-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-FAKE16-NEXT: v_readlane_b32 s30, v40, 0 ; GFX11-FAKE16-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-FAKE16-NEXT: s_clause 0x3 ; 16-byte Folded Reload +; GFX11-FAKE16-NEXT: s_clause 0x3 ; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 ; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:4 ; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:8 @@ -175345,6 +175488,9 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v37 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:160 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:168 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:176 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v38 @@ -175362,9 +175508,6 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:220 ; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:192 ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:200 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:160 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:168 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:176 ; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 ; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr37 @@ -175382,15 +175525,15 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 -; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) ; SI-NEXT: 
v_lshlrev_b32_e32 v2, 8, v6 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:208 @@ -175526,37 +175669,34 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:964 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(3) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v6 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:968 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(3) expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v4 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:972 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:388 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:384 -; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:360 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:968 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v4 ; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:972 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:368 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:388 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:384 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:976 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:48 -; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:104 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:96 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:88 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:80 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:56 -; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:40 @@ -175576,10 +175716,7 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: 
buffer_load_dword v2, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:368 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:112 ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:120 @@ -175589,9 +175726,7 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:856 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:876 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:216 @@ -175617,6 +175752,14 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:376 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:940 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:56 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; kill: killed $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr2 @@ -175739,6 +175882,7 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v2, 0xff, v47 ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:972 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v6, 0xff, v42 +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_and_b32_e32 v18, 0xff, v18 ; SI-NEXT: v_and_b32_e32 v22, 0xff, v22 ; SI-NEXT: v_and_b32_e32 v24, 0xff, v24 @@ -176396,18 +176540,25 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v20, 0xff, v20 ; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 ; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 +; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_add_i32_e32 v34, vcc, 3, v34 ; SI-NEXT: v_and_b32_e32 v34, 0xff, v34 +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 ; SI-NEXT: v_and_b32_e32 v30, 0xff, v30 +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 ; SI-NEXT: v_and_b32_e32 v28, 0xff, v28 +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 ; SI-NEXT: v_and_b32_e32 v26, 0xff, v26 +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 ; SI-NEXT: v_and_b32_e32 v24, 0xff, v24 +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 ; SI-NEXT: v_and_b32_e32 v22, 0xff, v22 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 ; SI-NEXT: v_and_b32_e32 v18, 0xff, v18 ; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 @@ -177114,24 +177265,9 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; 
SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 @@ -177438,6 +177574,22 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -177505,8 +177657,8 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; VI-NEXT: 
v_lshlrev_b16_e32 v1, 8, v25 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v29 -; VI-NEXT: v_lshlrev_b16_e32 v55, 8, v3 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v55, 8, v3 ; VI-NEXT: v_lshlrev_b16_e32 v40, 8, v5 ; VI-NEXT: v_lshlrev_b16_e32 v41, 8, v7 ; VI-NEXT: v_lshlrev_b16_e32 v50, 8, v9 @@ -177602,25 +177754,13 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:208 ; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:216 ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188 -; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:44 -; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:36 -; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:28 -; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:108 -; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100 -; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:92 -; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:84 -; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:76 -; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:68 -; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:60 -; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:52 -; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196 ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 @@ -177748,19 +177888,14 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_lshlrev_b16_e32 v38, 8, v0 ; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_lshlrev_b16_e32 v39, 8, v1 -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 -; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 -; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_lshlrev_b16_e32 v49, 8, v2 ; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v49, 8, v2 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v51, 8, v3 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:356 -; VI-NEXT: v_lshlrev_b16_e32 v51, 8, v3 -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshlrev_b16_e32 v53, 8, v0 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -177768,6 +177903,26 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:372 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: 
buffer_store_dword v4, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v53, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:108 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100 +; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:92 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:84 +; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:76 +; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:68 +; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:52 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -177776,57 +177931,35 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(13) +; VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: v_or_b32_sdwa v0, v0, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(12) +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_or_b32_sdwa v1, v1, v55 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(11) +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_or_b32_sdwa v3, v3, v41 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(9) +; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_or_b32_sdwa v2, v2, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; 
VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_or_b32_sdwa v4, v4, v48 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v10, v61, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(5) -; VI-NEXT: v_or_b32_sdwa v5, v5, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v11, v57, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_or_b32_sdwa v6, v6, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v12, v59, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v13, v47, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v14, v45, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v15, v43, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: ; implicit-def: $vgpr61 -; VI-NEXT: ; implicit-def: $vgpr57 -; VI-NEXT: ; implicit-def: $vgpr59 -; VI-NEXT: ; implicit-def: $vgpr47 -; VI-NEXT: ; implicit-def: $vgpr45 -; VI-NEXT: ; implicit-def: $vgpr43 +; VI-NEXT: v_or_b32_sdwa v5, v5, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: ; implicit-def: $vgpr54 ; VI-NEXT: ; implicit-def: $vgpr55 ; VI-NEXT: ; implicit-def: $vgpr40 ; VI-NEXT: ; implicit-def: $vgpr41 ; VI-NEXT: ; implicit-def: $vgpr48 ; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v6, v6, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: ; implicit-def: $vgpr34 -; VI-NEXT: v_or_b32_sdwa v31, v31, v49 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: ; implicit-def: $vgpr49 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v2, v2, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) @@ -177859,19 +177992,39 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v6, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v32, v32, v53 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: ; implicit-def: $vgpr53 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 
offset:800 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_or_b32_sdwa v10, v61, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_or_b32_sdwa v11, v57, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_or_b32_sdwa v12, v59, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_or_b32_sdwa v13, v47, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_or_b32_sdwa v14, v45, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr61 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr59 +; VI-NEXT: ; implicit-def: $vgpr47 +; VI-NEXT: ; implicit-def: $vgpr45 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v15, v43, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr43 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v8, v62, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) @@ -178037,9 +178190,17 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v28, v28, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v31, v31, v49 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr49 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v32, v32, v53 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr53 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v30, v30, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -178724,8 +178885,8 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x 
i8> %a, i32 %b) { ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v27 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v29 -; GFX9-NEXT: v_lshlrev_b16_e32 v54, 8, v3 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b16_e32 v54, 8, v3 ; GFX9-NEXT: v_lshlrev_b16_e32 v41, 8, v5 ; GFX9-NEXT: v_lshlrev_b16_e32 v40, 8, v7 ; GFX9-NEXT: v_lshlrev_b16_e32 v51, 8, v9 @@ -178836,27 +178997,13 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:208 ; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:216 ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188 -; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44 -; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:36 -; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28 -; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:20 -; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:108 -; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:100 -; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92 -; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84 -; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76 -; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68 -; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:60 -; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52 -; GFX9-NEXT: s_waitcnt vmcnt(18) +; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(18) +; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(16) +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 @@ -178989,19 +179136,14 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_lshlrev_b16_e32 v37, 8, v0 ; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_lshlrev_b16_e32 v49, 8, v1 -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 -; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 -; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_lshlrev_b16_e32 v48, 8, v2 ; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b16_e32 v48, 8, v2 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v52, 8, v3 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:356 -; GFX9-NEXT: v_lshlrev_b16_e32 v52, 8, v3 -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_lshlrev_b16_e32 v53, 8, v0 -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364 ; GFX9-NEXT: 
s_waitcnt vmcnt(0) @@ -179009,6 +179151,26 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:372 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v53, 8, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:108 +; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:100 +; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92 +; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84 +; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76 +; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68 +; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -179017,62 +179179,36 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b32 s6, 0x5040100 -; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(15) +; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_or_b32_sdwa v1, 
v1, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(13) +; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_or_b32_sdwa v2, v2, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_perm_b32 v0, v1, v0, s6 -; GFX9-NEXT: s_waitcnt vmcnt(11) +; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_or_b32_sdwa v3, v3, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v0, v1, v0, s6 ; GFX9-NEXT: v_perm_b32 v1, v3, v2, s6 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(11) -; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(10) -; GFX9-NEXT: v_or_b32_sdwa v10, v60, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v11, v56, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_or_b32_sdwa v4, v4, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v12, v58, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_or_b32_sdwa v4, v4, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_or_b32_sdwa v5, v5, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v13, v46, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v6, v6, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v14, v44, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v31, v31, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: ; implicit-def: $vgpr60 -; GFX9-NEXT: ; implicit-def: $vgpr56 -; GFX9-NEXT: ; implicit-def: $vgpr58 -; GFX9-NEXT: ; implicit-def: $vgpr46 -; GFX9-NEXT: ; implicit-def: $vgpr44 ; GFX9-NEXT: ; implicit-def: $vgpr55 ; GFX9-NEXT: ; implicit-def: $vgpr54 ; GFX9-NEXT: ; implicit-def: $vgpr41 ; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: ; implicit-def: $vgpr38 ; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v6, v6, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr48 -; GFX9-NEXT: v_or_b32_sdwa v15, v42, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: ; implicit-def: $vgpr42 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v2, v2, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -179095,25 +179231,49 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v5, v5, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v5, v6, v5, s6 ; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload ; 
GFX9-NEXT: ; implicit-def: $vgpr34 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v6, v6, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v6, v7, v6, s6 ; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v32, v32, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: ; implicit-def: $vgpr53 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v7, v8, v7, s6 ; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_or_b32_sdwa v10, v60, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_or_b32_sdwa v11, v56, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_or_b32_sdwa v12, v58, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_or_b32_sdwa v13, v46, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_or_b32_sdwa v14, v44, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: ; implicit-def: $vgpr60 +; GFX9-NEXT: ; implicit-def: $vgpr56 +; GFX9-NEXT: ; implicit-def: $vgpr58 +; GFX9-NEXT: ; implicit-def: $vgpr46 +; GFX9-NEXT: ; implicit-def: $vgpr44 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v15, v42, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: ; implicit-def: $vgpr42 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v8, v63, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -179279,9 +179439,17 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v28, v29, v28, s6 ; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload 
; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v31, v31, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v32, v32, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: ; implicit-def: $vgpr48 +; GFX9-NEXT: ; implicit-def: $vgpr53 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v30, v30, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v29, v30, v29, s6 @@ -180627,7 +180795,7 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v64f16: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill +; GFX11-FAKE16-NEXT: s_clause 0x1f ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:580 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:576 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:572 @@ -180660,7 +180828,7 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:464 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:460 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:456 -; GFX11-FAKE16-NEXT: s_clause 0xf ; 64-byte Folded Spill +; GFX11-FAKE16-NEXT: s_clause 0xf ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:452 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:448 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:444 @@ -181489,7 +181657,7 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_perm_b32 v31, v116, v31, 0x5040100 ; GFX11-FAKE16-NEXT: .LBB92_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload +; GFX11-FAKE16-NEXT: s_clause 0x1f ; GFX11-FAKE16-NEXT: scratch_load_b32 v127, off, s32 offset:392 ; GFX11-FAKE16-NEXT: scratch_load_b32 v126, off, s32 offset:396 ; GFX11-FAKE16-NEXT: scratch_load_b32 v125, off, s32 offset:400 @@ -181522,7 +181690,7 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:508 ; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:512 ; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:516 -; GFX11-FAKE16-NEXT: s_clause 0xf ; 64-byte Folded Reload +; GFX11-FAKE16-NEXT: s_clause 0xf ; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:520 ; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:524 ; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:528 @@ -183064,6 +183232,17 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v60 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x78, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, 
v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v3 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload @@ -183077,17 +183256,6 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v5, vcc, 0x78, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v3 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload @@ -183347,53 +183515,33 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; VI-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:240 ; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v24 ; VI-NEXT: v_lshlrev_b32_e32 v10, 8, v26 -; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:124 -; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:132 -; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:140 -; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:148 -; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:156 -; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:164 -; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:172 -; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:180 -; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:188 -; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:196 -; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:204 -; VI-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:212 -; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:220 -; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:228 -; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:236 -; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:244 -; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:252 -; VI-NEXT: buffer_load_ushort v27, off, s[0:3], s32 offset:260 -; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:268 -; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:276 -; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:284 -; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:292 -; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:300 -; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:308 -; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:316 -; VI-NEXT: buffer_load_ushort v50, off, 
s[0:3], s32 offset:324 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v14 ; VI-NEXT: v_lshlrev_b32_e32 v16, 8, v16 ; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 ; VI-NEXT: v_lshlrev_b32_e32 v20, 8, v20 ; VI-NEXT: v_lshlrev_b32_e32 v22, 8, v22 -; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: v_lshlrev_b32_e32 v32, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v4 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v5 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v6 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v7 -; VI-NEXT: v_lshlrev_b32_e32 v36, 8, v2 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v36, 8, v2 ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:248 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:256 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:264 @@ -183438,6 +183586,52 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:68 ; VI-NEXT: s_waitcnt vmcnt(10) ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v2 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:76 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:84 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:92 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:100 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:108 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:116 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:124 +; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:132 +; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:140 +; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:148 +; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:156 +; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:164 +; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:172 +; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:180 +; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 
offset:188 +; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:196 +; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:204 +; VI-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:212 +; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:220 +; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:228 +; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:236 +; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:244 +; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:252 +; VI-NEXT: buffer_load_ushort v27, off, s[0:3], s32 offset:260 +; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:268 +; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:276 +; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:284 +; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:292 +; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:300 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:308 +; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:316 +; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:324 ; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill @@ -183457,7 +183651,6 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill @@ -183466,6 +183659,7 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill @@ -183497,25 +183691,6 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:76 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:84 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:92 -; VI-NEXT: s_waitcnt vmcnt(0) -; 
VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:100 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:108 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:116 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill ; VI-NEXT: s_cbranch_scc0 .LBB93_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload @@ -183540,18 +183715,15 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: v_or_b32_sdwa v2, v2, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_or_b32_sdwa v3, v3, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v2, v8 -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v0, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v1, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload @@ -183601,11 +183773,10 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; VI-NEXT: v_or_b32_sdwa v1, v33, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_or_b32_sdwa v0, v35, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -183613,37 +183784,50 @@ define inreg 
<64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; VI-NEXT: v_or_b32_sdwa v1, v48, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v0, v49, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v1, v1, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v1, v60, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v2, v35, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_or_b32_sdwa v0, v0, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(9) +; VI-NEXT: v_or_b32_sdwa v3, v33, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: v_or_b32_sdwa v0, v55, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_or_b32_sdwa v1, v49, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v18, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_or_b32_sdwa v0, v42, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_or_b32_sdwa v1, v41, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 
src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v19, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_or_b32_sdwa v0, v39, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v42, v43 ; VI-NEXT: v_mov_b32_e32 v43, v37 -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v1, v46, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v20, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload @@ -183658,12 +183842,13 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v1, v24, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_or_b32_sdwa v0, v26, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v23, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_or_b32_sdwa v0, v34, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) @@ -183685,28 +183870,21 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; VI-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v0, v31, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v54, v33 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v56, v1 ; VI-NEXT: v_or_b32_sdwa v1, v59, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v2, v35, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v63, v0 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v30, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v3, v33, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v63, v39 -; VI-NEXT: v_mov_b32_e32 v54, v33 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_mov_b32_e32 v57, v0 ; VI-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -183724,10 +183902,11 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; VI-NEXT: v_or_b32_sdwa v31, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v53, v35 -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v3, s4, v0 ; VI-NEXT: s_and_b32 s4, s16, 0xff @@ -183760,6 +183939,7 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; VI-NEXT: s_branch .LBB93_3 ; VI-NEXT: .LBB93_2: ; VI-NEXT: v_mov_b32_e32 v47, v54 +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload @@ -183780,7 +183960,6 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v58, v7 ; VI-NEXT: v_mov_b32_e32 v57, v5 ; VI-NEXT: v_mov_b32_e32 v56, v3 @@ -184372,51 +184551,29 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; GFX9-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:224 ; GFX9-NEXT: buffer_load_ushort v9, off, s[0:3], s32 offset:232 ; GFX9-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:240 -; GFX9-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:156 -; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:164 -; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:172 -; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:180 -; GFX9-NEXT: 
buffer_load_ushort v58, off, s[0:3], s32 offset:188 -; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:196 -; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:204 -; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:212 -; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:220 -; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:228 -; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:236 -; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:244 -; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:252 -; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:260 -; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:268 -; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:276 -; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:284 -; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:292 -; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:300 -; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:308 -; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:316 -; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:324 ; GFX9-NEXT: v_lshlrev_b32_e32 v46, 8, v46 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec -; GFX9-NEXT: s_waitcnt vmcnt(29) +; GFX9-NEXT: s_waitcnt vmcnt(7) ; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v11 -; GFX9-NEXT: s_waitcnt vmcnt(28) +; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(28) +; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v13 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(28) +; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v3 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(28) +; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v4 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(28) +; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v5 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(28) +; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v9 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(28) +; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v7 ; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill @@ -184480,42 +184637,82 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:124 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:132 -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:140 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:148 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v22, off, s[0:3], s32 
offset:156 +; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:164 +; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:172 +; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:180 +; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:188 +; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:196 +; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:204 +; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:212 +; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:220 +; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:228 +; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:236 +; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:244 +; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:252 +; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:260 +; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:268 +; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:276 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:284 +; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:292 +; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:300 +; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:308 +; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:316 +; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:324 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) ; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(22) ; GFX9-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) ; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(24) ; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(28) ; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(31) ; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill ; GFX9-NEXT: 
buffer_store_dword v43, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) ; GFX9-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) ; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) ; GFX9-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(35) ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(35) ; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill @@ -184536,13 +184733,6 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(55) -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:140 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:148 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill ; GFX9-NEXT: s_cbranch_scc0 .LBB93_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: s_and_b32 s4, s28, 0xff @@ -184796,13 +184986,14 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; GFX9-NEXT: v_lshl_or_b32 v30, v1, 16, v0 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v62, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_or_b32_sdwa v1, v62, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v62, off, 
s[0:3], s32 offset:408 ; 4-byte Folded Reload ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v31, v1, 16, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 @@ -184812,6 +185003,7 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; GFX9-NEXT: .LBB93_2: ; GFX9-NEXT: v_mov_b32_e32 v58, v50 ; GFX9-NEXT: v_mov_b32_e32 v45, v59 +; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload @@ -184823,7 +185015,6 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload ; GFX9-NEXT: v_mov_b32_e32 v34, v35 -; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload ; GFX9-NEXT: v_mov_b32_e32 v49, v39 @@ -185289,7 +185480,7 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; GFX11-TRUE16-LABEL: bitcast_v128i8_to_v64f16_scalar: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: s_clause 0x1e ; 124-byte Folded Spill +; GFX11-TRUE16-NEXT: s_clause 0x1e ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:440 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:436 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:432 @@ -186019,7 +186210,7 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; GFX11-TRUE16-NEXT: v_mov_b16_e64 v30.h, v182.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.h, v43.l ; GFX11-TRUE16-NEXT: .LBB93_3: ; %end -; GFX11-TRUE16-NEXT: s_clause 0x1e ; 124-byte Folded Reload +; GFX11-TRUE16-NEXT: s_clause 0x1e ; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:320 ; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:324 ; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:328 @@ -186061,7 +186252,7 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v64f16_scalar: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_clause 0x1e ; 124-byte Folded Spill +; GFX11-FAKE16-NEXT: s_clause 0x1e ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:440 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:436 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:432 @@ -186845,7 +187036,7 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v30, v30, 16, v35 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v31, v31, 16, v36 ; GFX11-FAKE16-NEXT: .LBB93_3: ; %end -; GFX11-FAKE16-NEXT: s_clause 0x1e ; 124-byte Folded Reload +; GFX11-FAKE16-NEXT: s_clause 0x1e ; GFX11-FAKE16-NEXT: scratch_load_b32 v94, off, s32 offset:320 ; GFX11-FAKE16-NEXT: scratch_load_b32 v93, off, s32 offset:324 ; GFX11-FAKE16-NEXT: scratch_load_b32 v92, off, s32 offset:328 @@ -188862,6 +189053,13 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, 
i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v45 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload @@ -188878,13 +189076,6 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -188907,42 +189098,27 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v62, 16, v4 -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v44, v12 ; VI-NEXT: v_mov_b32_e32 v12, v0 ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:8 ; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:4 ; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 -; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v8 -; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v32, v20 ; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v16 ; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v43, v11 ; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v14 -; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v10 -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v62, 16, v4 +; VI-NEXT: v_mov_b32_e32 v32, v20 ; VI-NEXT: v_mov_b32_e32 v55, v22 ; VI-NEXT: 
v_mov_b32_e32 v54, v21 ; VI-NEXT: v_mov_b32_e32 v31, v19 -; VI-NEXT: v_mov_b32_e32 v43, v11 ; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v44 -; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v43 ; VI-NEXT: ; implicit-def: $vgpr20 ; VI-NEXT: ; implicit-def: $vgpr57 ; VI-NEXT: ; implicit-def: $vgpr51 -; VI-NEXT: ; implicit-def: $vgpr8 -; VI-NEXT: ; implicit-def: $vgpr4 ; VI-NEXT: ; implicit-def: $vgpr41 ; VI-NEXT: ; implicit-def: $vgpr56 ; VI-NEXT: ; implicit-def: $vgpr63 @@ -188954,38 +189130,47 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr42 ; VI-NEXT: ; implicit-def: $vgpr45 ; VI-NEXT: ; implicit-def: $vgpr52 -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v30 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v32 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v18 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(14) -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v34 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v7 -; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v15 ; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v13 -; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v9 +; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v43 ; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v31 +; VI-NEXT: ; implicit-def: $vgpr8 ; VI-NEXT: ; implicit-def: $vgpr15 +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; VI-NEXT: ; implicit-def: $vgpr13 ; VI-NEXT: ; implicit-def: $vgpr9 +; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:116 ; 4-byte Folded 
Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr5 +; VI-NEXT: ; implicit-def: $vgpr4 +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v30 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; VI-NEXT: ; implicit-def: $vgpr30 -; VI-NEXT: ; implicit-def: $vgpr34 ; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v29 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v28 @@ -188994,6 +189179,38 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; VI-NEXT: ; implicit-def: $vgpr29 ; VI-NEXT: ; implicit-def: $vgpr28 +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v27 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v26 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v17 +; VI-NEXT: ; implicit-def: $vgpr27 +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v25 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v24 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v18 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v34 +; VI-NEXT: ; implicit-def: $vgpr25 +; VI-NEXT: ; implicit-def: $vgpr24 +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v23 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v55 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v54 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr23 +; VI-NEXT: ; implicit-def: $vgpr34 ; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v33 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; VI-NEXT: ; 
implicit-def: $vgpr0 @@ -189037,34 +189254,8 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: ; implicit-def: $vgpr0 ; VI-NEXT: ; kill: killed $vgpr0 ; VI-NEXT: ; implicit-def: $vgpr0 -; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; VI-NEXT: ; kill: killed $vgpr0 ; VI-NEXT: ; implicit-def: $vgpr0 -; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr5 -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v27 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v26 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v17 -; VI-NEXT: ; implicit-def: $vgpr27 -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v25 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v24 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr25 -; VI-NEXT: ; implicit-def: $vgpr24 -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v23 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v55 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v54 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr23 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; VI-NEXT: ; implicit-def: $vgpr10 @@ -189102,49 +189293,28 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v56, v38 -; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v45, v7 -; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v63, v53 ; VI-NEXT: v_mov_b32_e32 v15, v3 -; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 
offset:164 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v28, v48 ; VI-NEXT: v_mov_b32_e32 v48, v16 ; VI-NEXT: v_mov_b32_e32 v16, v40 ; VI-NEXT: v_mov_b32_e32 v47, v39 -; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v63, v53 -; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; VI-NEXT: v_lshrrev_b32_e32 v61, 8, v32 -; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v61, 8, v31 -; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v61, 8, v18 -; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v61, 8, v17 -; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v29, 24, v44 ; VI-NEXT: v_lshrrev_b32_e32 v5, 24, v32 ; VI-NEXT: v_lshrrev_b32_e32 v13, 24, v18 -; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshrrev_b32_e32 v2, 24, v1 ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v2, 8, v1 ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v2, 8, v0 ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; VI-NEXT: v_lshrrev_b64 v[0:1], 24, v[0:1] +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshrrev_b32_e32 v2, 8, v38 ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v2, 8, v37 @@ -189156,20 +189326,83 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v62, v36 +; VI-NEXT: v_lshrrev_b32_e32 v41, 24, v38 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshrrev_b32_e32 v2, 8, v11 ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v2, 8, v10 ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; VI-NEXT: v_lshrrev_b32_e32 v8, 24, v11 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshrrev_b32_e32 v23, 8, v6 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshrrev_b32_e32 v2, 24, v7 ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v2, 8, v7 ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded 
Spill +; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v62, v36 +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshrrev_b32_e32 v24, 8, v52 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshrrev_b32_e32 v57, 24, v53 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshrrev_b32_e32 v4, 24, v3 +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; VI-NEXT: v_lshrrev_b32_e32 v20, 8, v53 +; VI-NEXT: v_lshrrev_b32_e32 v19, 8, v2 +; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshrrev_b32_e32 v4, 24, v59 +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v59 +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v58 +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v26 +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshrrev_b32_e32 v14, 24, v27 +; VI-NEXT: v_lshrrev_b32_e32 v60, 8, v27 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v33 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v34 +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; VI-NEXT: v_lshrrev_b64 v[0:1], 24, v[0:1] +; VI-NEXT: v_lshrrev_b32_e32 v42, 24, v34 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshrrev_b32_e32 v22, 8, v35 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshrrev_b32_e32 v9, 24, v36 +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v61, 8, v31 +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v61, 8, v18 +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v61, 8, v17 +; VI-NEXT: 
buffer_store_dword v61, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b64 v[0:1], 24, v[37:38] @@ -189184,94 +189417,61 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b64 v[0:1], 24, v[6:7] ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(14) -; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v33 -; VI-NEXT: v_lshrrev_b32_e32 v41, 24, v38 -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v30, 8, v50 -; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v49 -; VI-NEXT: v_lshrrev_b32_e32 v51, 24, v40 -; VI-NEXT: v_lshrrev_b32_e32 v8, 24, v11 -; VI-NEXT: v_lshrrev_b32_e32 v57, 24, v53 -; VI-NEXT: v_lshrrev_b32_e32 v20, 8, v53 -; VI-NEXT: v_lshrrev_b32_e32 v24, 8, v52 -; VI-NEXT: v_lshrrev_b32_e32 v14, 24, v27 -; VI-NEXT: v_lshrrev_b32_e32 v42, 24, v34 +; VI-NEXT: v_lshrrev_b64 v[0:1], 24, v[2:3] +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[0:1], 24, v[26:27] +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[0:1], 24, v[33:34] +; VI-NEXT: v_lshrrev_b32_e32 v46, 8, v36 +; VI-NEXT: v_lshrrev_b64 v[37:38], 24, v[35:36] ; VI-NEXT: v_lshrrev_b64 v[10:11], 24, v[52:53] ; VI-NEXT: v_lshrrev_b64 v[52:53], 24, v[58:59] -; VI-NEXT: v_lshrrev_b32_e32 v60, 8, v27 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32] ; VI-NEXT: v_mov_b32_e32 v53, v63 -; VI-NEXT: v_lshrrev_b32_e32 v63, 8, v40 -; VI-NEXT: v_lshrrev_b32_e32 v23, 8, v6 +; VI-NEXT: v_mov_b32_e32 v27, v19 +; VI-NEXT: v_mov_b32_e32 v34, v14 +; VI-NEXT: v_lshrrev_b32_e32 v9, 24, v55 ; VI-NEXT: v_mov_b32_e32 v7, v45 ; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v31 +; VI-NEXT: v_mov_b32_e32 v3, v15 +; VI-NEXT: v_mov_b32_e32 v15, v29 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v17 +; VI-NEXT: v_mov_b32_e32 v38, v56 +; VI-NEXT: v_mov_b32_e32 v29, v41 ; VI-NEXT: v_mov_b32_e32 v45, v60 +; VI-NEXT: v_lshrrev_b32_e32 v41, 8, v55 ; VI-NEXT: s_waitcnt vmcnt(14) -; VI-NEXT: v_lshrrev_b32_e32 v19, 8, v2 -; VI-NEXT: s_waitcnt vmcnt(13) -; VI-NEXT: v_lshrrev_b32_e32 v4, 24, v3 -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v4, 24, v59 -; VI-NEXT: v_lshrrev_b64 v[0:1], 24, v[2:3] -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v59 -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v58 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v26 -; VI-NEXT: 
v_lshrrev_b64 v[0:1], 24, v[26:27] -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v34 -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(14) -; VI-NEXT: v_lshrrev_b32_e32 v9, 24, v36 -; VI-NEXT: v_lshrrev_b32_e32 v46, 8, v36 -; VI-NEXT: v_lshrrev_b32_e32 v22, 8, v35 +; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v49 ; VI-NEXT: v_lshrrev_b32_e32 v4, 24, v50 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[37:38], 24, v[35:36] +; VI-NEXT: v_lshrrev_b32_e32 v30, 8, v50 +; VI-NEXT: v_lshrrev_b32_e32 v51, 24, v40 ; VI-NEXT: v_lshrrev_b64 v[35:36], 24, v[49:50] ; VI-NEXT: v_lshrrev_b64 v[49:50], 24, v[39:40] ; VI-NEXT: v_mov_b32_e32 v58, v51 -; VI-NEXT: v_lshrrev_b64 v[0:1], 24, v[33:34] ; VI-NEXT: v_mov_b32_e32 v36, v62 ; VI-NEXT: v_lshrrev_b64 v[61:62], 24, v[54:55] -; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32] ; VI-NEXT: v_lshrrev_b64 v[50:51], 24, v[17:18] -; VI-NEXT: v_mov_b32_e32 v27, v19 -; VI-NEXT: v_mov_b32_e32 v34, v14 -; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; VI-NEXT: v_lshrrev_b32_e32 v63, 8, v40 ; VI-NEXT: v_mov_b32_e32 v40, v16 ; VI-NEXT: v_mov_b32_e32 v16, v48 ; VI-NEXT: v_mov_b32_e32 v48, v28 ; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v3 -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v9, 24, v55 -; VI-NEXT: v_mov_b32_e32 v3, v15 -; VI-NEXT: v_mov_b32_e32 v15, v29 -; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v17 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v38, v56 +; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload ; VI-NEXT: v_lshrrev_b32_e32 v56, 8, v39 -; VI-NEXT: v_mov_b32_e32 v29, v41 ; VI-NEXT: v_mov_b32_e32 v39, v47 ; VI-NEXT: v_mov_b32_e32 v47, v4 ; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v54 -; VI-NEXT: v_lshrrev_b32_e32 v41, 8, v55 ; VI-NEXT: .LBB94_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; VI-NEXT: s_cbranch_execz .LBB94_4 ; VI-NEXT: ; %bb.3: ; %cmp.true -; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v63, 0x200 ; VI-NEXT: v_add_f16_sdwa v21, v18, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v21 @@ -189290,47 +189490,36 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; VI-NEXT: v_add_f16_e32 v31, 0x200, v31 ; VI-NEXT: v_add_f16_sdwa v23, 
v55, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(9) +; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_or_b32_e32 v14, v31, v0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v23 ; VI-NEXT: v_add_f16_e32 v55, 0x200, v55 -; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_e32 v62, v55, v0 ; VI-NEXT: v_add_f16_sdwa v0, v54, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; VI-NEXT: v_add_f16_e32 v54, 0x200, v54 ; VI-NEXT: v_or_b32_e32 v61, v54, v0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v26, v54 ; VI-NEXT: v_mov_b32_e32 v27, v55 -; VI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(10) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_f16_sdwa v60, v25, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v60 ; VI-NEXT: v_add_f16_e32 v25, 0x200, v25 ; VI-NEXT: v_or_b32_e32 v34, v25, v0 ; VI-NEXT: v_add_f16_sdwa v0, v24, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v24, 0x200, v24 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(6) -; VI-NEXT: v_add_f16_sdwa v11, v7, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v7, 0x200, v7 -; VI-NEXT: s_waitcnt vmcnt(5) -; VI-NEXT: v_add_f16_sdwa v13, v54, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v54, 0x200, v54 -; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; VI-NEXT: v_or_b32_e32 v33, v24, v0 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_f16_sdwa v0, v2, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 @@ -189338,21 +189527,13 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v36, v2, v0 ; VI-NEXT: 
v_add_f16_sdwa v0, v1, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v1, 0x200, v1 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_e32 v35, v1, v0 ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_add_f16_sdwa v19, v24, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_f16_sdwa v42, v25, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v25, 0x200, v25 -; VI-NEXT: v_add_f16_e32 v24, 0x200, v24 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_f16_sdwa v0, v2, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill @@ -189361,34 +189542,38 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v38, v2, v0 ; VI-NEXT: v_add_f16_sdwa v0, v1, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v1, 0x200, v1 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_e32 v37, v1, v0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_f16_sdwa v1, v8, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_f16_sdwa v0, v9, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v9, 0x200, v9 -; VI-NEXT: v_add_f16_sdwa v1, v8, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v8, 0x200, v8 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; VI-NEXT: 
v_lshlrev_b32_e32 v0, 16, v0 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_add_f16_sdwa v47, v3, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v3, 0x200, v3 ; VI-NEXT: v_or_b32_e32 v49, v9, v0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_f16_sdwa v47, v3, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, 0x200, v3 ; VI-NEXT: v_add_f16_sdwa v1, v2, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v2, 0x200, v2 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; VI-NEXT: v_or_b32_e32 v48, v8, v0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v47 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v9, v31 ; VI-NEXT: v_add_f16_sdwa v8, v43, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v10, v32 @@ -189406,11 +189591,11 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: v_add_f16_e32 v2, 0x200, v2 ; VI-NEXT: v_add_f16_e32 v1, 0x200, v1 ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; VI-NEXT: v_or_b32_e32 v53, v2, v0 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; VI-NEXT: v_add_f16_sdwa v3, v44, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v44, 0x200, v44 ; VI-NEXT: v_or_b32_e32 v52, v1, v0 @@ -189427,32 +189612,28 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v46, v2, v0 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v59 ; VI-NEXT: v_or_b32_e32 v45, v1, v0 +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_f16_sdwa v1, v6, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_f16_sdwa v11, v7, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v7, 0x200, v7 ; VI-NEXT: v_add_f16_e32 v6, 0x200, v6 ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v11 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; VI-NEXT: v_or_b32_e32 v5, v7, v0 ; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; VI-NEXT: 
buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_e32 v4, v6, v0 ; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_add_f16_sdwa v16, v1, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_f16_sdwa v28, v2, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v2, 0x200, v2 -; VI-NEXT: v_add_f16_e32 v1, 0x200, v1 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_f16_sdwa v39, v6, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_f16_sdwa v56, v7, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v7, 0x200, v7 ; VI-NEXT: v_add_f16_e32 v6, 0x200, v6 @@ -189460,13 +189641,36 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v41, v7, v0 ; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v39 +; VI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_e32 v40, v6, v0 +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_add_f16_sdwa v19, v24, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_add_f16_sdwa v42, v25, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v25, 0x200, v25 +; VI-NEXT: v_add_f16_e32 v24, 0x200, v24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v42 ; VI-NEXT: v_or_b32_e32 v7, v25, v0 ; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_add_f16_sdwa v28, v2, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, 0x200, v2 +; VI-NEXT: v_add_f16_sdwa v16, v1, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v1, 0x200, v1 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v19 +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_add_f16_sdwa v13, v54, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v54, 0x200, v54 +; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v25, 8, 
v46 ; VI-NEXT: v_or_b32_e32 v6, v24, v0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 @@ -189475,6 +189679,7 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v31, v43, v0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v28 ; VI-NEXT: v_or_b32_e32 v30, v2, v0 +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_add_f16_sdwa v2, v55, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v55, 0x200, v55 ; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill @@ -189490,6 +189695,8 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v1 ; VI-NEXT: v_lshrrev_b32_e32 v54, 8, v0 ; VI-NEXT: v_lshrrev_b64 v[0:1], 24, v[0:1] +; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v30 @@ -189507,21 +189714,21 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v7 -; VI-NEXT: v_mov_b32_e32 v32, v10 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v6 -; VI-NEXT: v_mov_b32_e32 v31, v9 -; VI-NEXT: v_lshrrev_b32_e32 v10, 8, v41 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b64 v[0:1], 24, v[6:7] +; VI-NEXT: v_mov_b32_e32 v32, v10 +; VI-NEXT: v_mov_b32_e32 v31, v9 +; VI-NEXT: v_lshrrev_b32_e32 v10, 8, v41 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v7, v11 ; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b64 v[10:11], 24, v[40:41] -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v55, v27 ; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v54, v26 ; VI-NEXT: v_mov_b32_e32 v26, v20 ; VI-NEXT: v_lshrrev_b32_e32 v20, 8, v5 @@ -189529,14 +189736,23 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: v_mov_b32_e32 v5, v22 ; VI-NEXT: v_mov_b32_e32 v13, v21 ; VI-NEXT: v_lshrrev_b64 v[21:22], 24, v[45:46] +; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload ; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v53 +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v52 ; VI-NEXT: v_lshrrev_b64 v[21:22], 24, v[50:51] +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v4, 
8, v50 +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v48 ; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b64 v[21:22], 24, v[48:49] +; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v49 +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v30, 8, v36 @@ -189544,39 +189760,27 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b64 v[35:36], 24, v[35:36] ; VI-NEXT: v_mov_b32_e32 v36, v2 ; VI-NEXT: v_lshrrev_b32_e32 v2, 8, v15 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v41, 8, v62 +; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v61 +; VI-NEXT: v_lshrrev_b64 v[61:62], 24, v[61:62] ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v2, 8, v14 ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v53 -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v52 -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v50 -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v49 ; VI-NEXT: v_mov_b32_e32 v48, v56 ; VI-NEXT: v_lshrrev_b32_e32 v56, 8, v33 ; VI-NEXT: v_lshrrev_b64 v[49:50], 24, v[33:34] ; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[14:15] ; VI-NEXT: v_lshrrev_b32_e32 v14, 8, v58 -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v41, 8, v62 -; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v61 -; VI-NEXT: v_lshrrev_b64 v[61:62], 24, v[61:62] ; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v14, 8, v57 +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v9, v23 ; VI-NEXT: v_lshrrev_b32_e32 v23, 8, v40 -; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v14, v8 -; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v40, v42 ; VI-NEXT: v_bfe_u32 v8, v42, 8, 8 -; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload ; VI-NEXT: v_lshrrev_b32_e32 v46, 8, v38 ; VI-NEXT: v_lshrrev_b32_e32 v22, 8, v37 ; VI-NEXT: v_lshrrev_b64 v[37:38], 24, v[37:38] @@ -189593,24 +189797,26 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: v_bfe_u32 v51, v48, 8, 8 ; VI-NEXT: v_bfe_u32 v57, v7, 8, 8 ; 
VI-NEXT: v_bfe_u32 v58, v60, 8, 8 +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_bfe_u32 v34, v62, 8, 8 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_bfe_u32 v2, v2, 8, 8 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; VI-NEXT: v_bfe_u32 v34, v47, 8, 8 ; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; VI-NEXT: v_bfe_u32 v9, v9, 8, 8 ; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; VI-NEXT: v_bfe_u32 v5, v5, 8, 8 ; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; VI-NEXT: v_bfe_u32 v13, v13, 8, 8 -; VI-NEXT: s_waitcnt vmcnt(12) -; VI-NEXT: v_bfe_u32 v2, v2, 8, 8 -; VI-NEXT: s_waitcnt vmcnt(11) +; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_bfe_u32 v42, v0, 8, 8 ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(7) -; VI-NEXT: v_bfe_u32 v34, v62, 8, 8 -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; VI-NEXT: v_bfe_u32 v34, v47, 8, 8 -; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_bfe_u32 v0, v0, 8, 8 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload @@ -189660,25 +189866,27 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v25 ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: v_or_b32_sdwa v1, v62, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v1, vcc, 12, v12 ; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v10 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v24 +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v1, vcc, 16, v12 ; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; VI-NEXT: 
v_lshlrev_b16_e32 v0, 8, v20 ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v57 +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v7, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -189690,13 +189898,9 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v23 -; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; VI-NEXT: v_or_b32_sdwa v0, v39, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v1, vcc, 24, v12 @@ -189749,9 +189953,7 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v15 ; VI-NEXT: v_or_b32_sdwa v1, v53, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; VI-NEXT: v_or_b32_sdwa v0, v44, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -189760,11 +189962,14 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; VI-NEXT: v_or_b32_sdwa v0, v16, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v1, vcc, 48, v12 @@ -189786,8 +189991,11 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v1, vcc, 56, v12 @@ -189795,9 +190003,7 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v28 ; VI-NEXT: v_or_b32_sdwa v1, v36, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; VI-NEXT: v_or_b32_sdwa v0, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -189847,9 +190053,7 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v4 ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v61 ; VI-NEXT: v_or_b32_sdwa v0, v54, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v1, vcc, 0x50, v12 @@ -189863,9 +190067,12 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v1, vcc, 0x54, v12 ; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v56 ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v49 +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_or_b32_sdwa v0, v24, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -189881,35 +190088,35 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v1, vcc, 0x5c, v12 ; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload ; VI-NEXT: 
v_lshlrev_b16_e32 v0, 8, v21 ; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v35 ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v20, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v1, vcc, 0x60, v12 ; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v30 ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v47 -; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_or_b32_sdwa v0, v21, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v1, vcc, 0x64, v12 ; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v22 ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v37 -; VI-NEXT: v_or_b32_sdwa v0, v28, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v0, v28, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v1, vcc, 0x68, v12 @@ -189928,10 +190135,13 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v3 ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt 
vmcnt(0) @@ -189951,10 +190161,12 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v1, vcc, 0x74, v12 ; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 @@ -189964,7 +190176,15 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v1, vcc, 0x78, v12 ; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v45 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v34 +; VI-NEXT: v_or_b32_sdwa v0, v23, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7c, v12 +; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload @@ -189981,20 +190201,28 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v34 -; VI-NEXT: v_or_b32_sdwa v0, v23, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7c, v12 -; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: bitcast_v64f16_to_v128i8: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 
4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 @@ -190057,23 +190285,6 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr50 ; GFX9-NEXT: ; kill: killed $vgpr50 ; GFX9-NEXT: ; implicit-def: $vgpr50 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr44 ; GFX9-NEXT: ; kill: killed $vgpr50 ; GFX9-NEXT: ; implicit-def: $vgpr50 @@ -190104,6 +190315,7 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr52 ; GFX9-NEXT: ; implicit-def: $vgpr51 ; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill @@ -190137,7 +190349,7 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v44, off, 
s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(33) +; GFX9-NEXT: s_waitcnt vmcnt(17) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 ; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: ; kill: killed $vgpr33 @@ -190260,100 +190472,101 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v32 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(62) +; GFX9-NEXT: s_waitcnt vmcnt(46) ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v31 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v31 -; GFX9-NEXT: v_lshrrev_b64 v[50:51], 24, v[15:16] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v30 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v30 -; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v30 -; GFX9-NEXT: v_lshrrev_b64 v[50:51], 24, v[13:14] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v29 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v29 -; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v28 -; GFX9-NEXT: v_lshrrev_b64 v[50:51], 24, v[11:12] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v28 -; GFX9-NEXT: v_lshrrev_b64 v[51:52], 24, v[9:10] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v28 -; GFX9-NEXT: v_lshrrev_b64 v[52:53], 24, v[7:8] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v27 -; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[31:32] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v27 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v26 -; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[29:30] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v26 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 
16, v25 -; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v25 -; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[27:28] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v24 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v24 -; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v24 -; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[25:26] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v23 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v23 -; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v22 -; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[23:24] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v22 -; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v21 -; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[21:22] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v21 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v20 +; GFX9-NEXT: v_lshrrev_b64 v[50:51], 24, v[15:16] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v20 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[50:51], 24, v[13:14] +; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[50:51], 24, v[11:12] +; GFX9-NEXT: v_lshrrev_b64 v[51:52], 24, v[9:10] +; GFX9-NEXT: v_lshrrev_b64 v[52:53], 24, v[7:8] +; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[31:32] +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:76 ; 
4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[29:30] +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[27:28] +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[25:26] +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[23:24] +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[21:22] ; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[5:6] ; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v20 ; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[3:4] ; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[19:20] ; GFX9-NEXT: v_lshrrev_b32_e32 v59, 8, v10 @@ -190369,7 +190582,6 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v44, 8, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v56, 24, v32 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v62, 8, v20 ; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v19 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v19 @@ -190395,7 +190607,7 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[13:14] ; GFX9-NEXT: v_pk_add_f16 v32, v32, s6 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: s_waitcnt vmcnt(18) ; GFX9-NEXT: v_pk_add_f16 v31, v31, s6 op_sel_hi:[1,0] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 @@ -190936,7 +191148,17 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:120 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v56 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:124 ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload @@ -190953,18 +191175,6 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:120 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:124 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -191423,11 +191633,7 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX11-FAKE16-LABEL: bitcast_v64f16_to_v128i8: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_clause 0x2 -; GFX11-FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:8 -; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 -; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32 -; GFX11-FAKE16-NEXT: s_clause 0x13 ; 80-byte Folded Spill +; GFX11-FAKE16-NEXT: s_clause 0x13 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:88 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:84 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:80 @@ -191448,6 +191654,10 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:20 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:16 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:12 +; GFX11-FAKE16-NEXT: s_clause 0x2 +; GFX11-FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr74 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr72 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64 @@ -192083,7 +192293,7 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:80 ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:96 ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:112 -; GFX11-FAKE16-NEXT: s_clause 0x13 ; 80-byte Folded Reload +; GFX11-FAKE16-NEXT: s_clause 0x13 ; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:12 ; GFX11-FAKE16-NEXT: scratch_load_b32 v74, 
off, s32 offset:16 ; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:20 @@ -193730,27 +193940,13 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 ; SI-NEXT: v_mov_b32_e32 v4, s6 -; SI-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen -; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v47 -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: s_and_b32 s6, s7, 0xff ; SI-NEXT: s_lshl_b32 s7, s51, 8 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen ; SI-NEXT: s_or_b32 s6, s6, s7 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v47 ; SI-NEXT: s_and_b32 s6, s6, 0xffff ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v1, s6, v1 @@ -193783,6 +193979,21 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v1, s4, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: v_readlane_b32 s45, v62, 17 ; SI-NEXT: v_readlane_b32 s43, v62, 23 ; SI-NEXT: v_readlane_b32 
s41, v62, 29 @@ -193790,7 +194001,6 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_readlane_b32 s27, v62, 41 ; SI-NEXT: v_readlane_b32 s25, v62, 45 ; SI-NEXT: v_readlane_b32 s9, v62, 49 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: v_readlane_b32 s99, v63, 35 ; SI-NEXT: v_readlane_b32 s98, v63, 34 ; SI-NEXT: v_readlane_b32 s97, v63, 33 @@ -194273,6 +194483,8 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v14, 8, v7 ; VI-NEXT: v_lshrrev_b64 v[7:8], 24, v[7:8] +; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v7, 8, v10 @@ -194280,7 +194492,6 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; VI-NEXT: v_lshrrev_b32_e32 v7, 8, v9 ; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v7, 8, v13 -; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v7, 8, v12 ; VI-NEXT: v_lshrrev_b64 v[13:14], 24, v[12:13] @@ -194288,6 +194499,12 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v12, 8, v1 ; VI-NEXT: v_lshrrev_b64 v[1:2], 24, v[1:2] +; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v7, 8, v16 +; VI-NEXT: v_lshrrev_b32_e32 v8, 8, v19 +; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v4 @@ -194295,20 +194512,14 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v3 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b64 v[1:2], 24, v[3:4] -; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b64 v[9:10], 24, v[9:10] -; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v7, 8, v16 ; VI-NEXT: v_lshrrev_b64 v[16:17], 24, v[15:16] ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v8, 8, v19 ; VI-NEXT: v_lshrrev_b32_e32 v10, 8, v18 ; VI-NEXT: v_lshrrev_b64 v[17:18], 24, v[18:19] ; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v31 -; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v7, 8, v15 -; 
VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v8, 8, v35 ; VI-NEXT: v_lshrrev_b64 v[18:19], 24, v[34:35] ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill @@ -194343,7 +194554,6 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; VI-NEXT: v_bfe_u32 v11, v52, 8, 8 ; VI-NEXT: v_lshrrev_b32_e32 v46, 8, v33 ; VI-NEXT: v_lshrrev_b32_e32 v58, 8, v32 -; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v32, 8, v29 ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v28 @@ -194953,11 +195163,9 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v27 ; VI-NEXT: v_or_b32_sdwa v2, v59, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -194966,10 +195174,13 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v6 ; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) @@ -195101,22 +195312,9 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], 
s32 offset:28 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) @@ -195184,6 +195382,20 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; VI-NEXT: s_or_saveexec_b64 s[4:5], -1 ; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload @@ -195501,42 +195713,42 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[9:10] -; GFX9-NEXT: v_lshrrev_b32_e32 v19, 24, v4 -; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v3 ; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[11:12] +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GFX9-NEXT: 
v_lshrrev_b64 v[15:16], 24, v[13:14] +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[21:22] +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 24, v4 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v3 ; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v3 -; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[11:12] ; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 24, v6 ; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v6 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v6 -; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[13:14] ; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v5 ; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v5 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 24, v8 -; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[21:22] ; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v8 ; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v8 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v7 ; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v26 @@ -196188,11 +196400,9 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 ; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v15, v35, v15 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:16 @@ -196224,7 +196434,10 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v15, v33, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:28 +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v18 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -196475,6 +196688,8 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v24 ; GFX9-NEXT: v_or_b32_sdwa v2, v41, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:124 ; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -196489,8 +196704,6 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:124 ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload @@ -196502,7 +196715,7 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_or_saveexec_b32 s4, -1 -; GFX11-NEXT: s_clause 0x3 ; 16-byte Folded Spill +; GFX11-NEXT: s_clause 0x3 ; GFX11-NEXT: scratch_store_b32 off, v75, s32 offset:76 ; GFX11-NEXT: scratch_store_b32 off, v76, s32 offset:80 ; GFX11-NEXT: scratch_store_b32 off, v77, s32 offset:84 @@ -196537,7 +196750,7 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; GFX11-NEXT: v_writelane_b32 v76, s101, 5 ; GFX11-NEXT: s_mov_b32 s99, 0 ; GFX11-NEXT: s_and_b32 s42, vcc_lo, exec_lo -; GFX11-NEXT: s_clause 0x12 ; 76-byte Folded Spill +; GFX11-NEXT: s_clause 0x12 ; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:72 ; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:68 ; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:64 @@ -197456,7 +197669,7 @@ define inreg <128 x i8> 
@bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; GFX11-NEXT: scratch_store_b128 v0, v[11:14], off offset:80 ; GFX11-NEXT: scratch_store_b128 v0, v[7:10], off offset:96 ; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:112 -; GFX11-NEXT: s_clause 0x12 ; 76-byte Folded Reload +; GFX11-NEXT: s_clause 0x12 ; GFX11-NEXT: scratch_load_b32 v74, off, s32 ; GFX11-NEXT: scratch_load_b32 v73, off, s32 offset:4 ; GFX11-NEXT: scratch_load_b32 v72, off, s32 offset:8 @@ -197518,7 +197731,7 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; GFX11-NEXT: v_readlane_b32 s31, v75, 1 ; GFX11-NEXT: v_readlane_b32 s30, v75, 0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: s_clause 0x3 ; 16-byte Folded Reload +; GFX11-NEXT: s_clause 0x3 ; GFX11-NEXT: scratch_load_b32 v75, off, s32 offset:76 ; GFX11-NEXT: scratch_load_b32 v76, off, s32 offset:80 ; GFX11-NEXT: scratch_load_b32 v77, off, s32 offset:84 @@ -197569,11 +197782,11 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v54, v15 ; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:920 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v54, v15 ; SI-NEXT: v_mov_b32_e32 v57, v5 ; SI-NEXT: v_mov_b32_e32 v41, v3 ; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:392 @@ -197663,30 +197876,7 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v15 ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v27 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v29 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v31 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:96 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:160 -; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v32 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v33 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v34 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:196 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:220 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:192 ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v11 ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:572 ; 4-byte 
Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -197694,21 +197884,28 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v21 +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v27 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:96 ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v17 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v29 ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v23 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v31 ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v32 ; SI-NEXT: v_lshlrev_b32_e32 v31, 8, v2 ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v18 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v26 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill -; SI-NEXT: v_lshlrev_b32_e32 v34, 8, v10 -; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr27 @@ -197716,211 +197913,240 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:160 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:128 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:88 +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:184 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:88 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v7 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:216 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:152 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: 
buffer_load_dword v35, off, s[0:3], s32 offset:80 ; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:144 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:112 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:140 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:108 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:180 ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:176 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v34 +; SI-NEXT: v_lshlrev_b32_e32 v34, 8, v10 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 ; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:172 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:196 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:220 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:192 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:216 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:212 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:208 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v8 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:204 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:228 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:252 ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:224 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:616 ; 4-byte 
Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:172 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:204 -; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:248 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:244 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:240 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:152 ; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v9 ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v8 ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:236 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:260 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:284 ; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:256 -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:80 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:244 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:240 -; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:280 -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:276 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:272 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v11 ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v8 ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt 
vmcnt(3) +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:268 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:292 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:316 ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:288 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:236 -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:312 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:276 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:272 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:308 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:304 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v9 ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:916 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v8 ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:940 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v51, 24, v11 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:880 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:300 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:324 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:348 ; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:320 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:308 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:304 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:268 -; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v37, 24, v4 -; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v63, 8, v8 -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:888 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:344 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_lshlrev_b32_e32 v51, 24, v11 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:924 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:340 +; SI-NEXT: buffer_load_dword v14, 
off, s[0:3], s32 offset:336 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v60, 24, v9 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:332 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:356 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:380 ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:352 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:880 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:300 -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v56, 24, v4 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v8 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:936 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:376 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:924 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:340 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:336 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:952 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v60, 24, v9 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:388 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:384 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:372 ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:368 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:332 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_lshlrev_b32_e32 v32, 24, v8 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:8 -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:956 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:364 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:388 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:384 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v26 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v32, 24, v8 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v11 ; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:8 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword 
v8, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:120 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:144 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:44 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:40 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:140 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:112 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 @@ -197932,19 +198158,15 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:108 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:56 -; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v35 ; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:56 ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v3 ; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill @@ -197980,7 +198202,7 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:976 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 -; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: 
buffer_load_dword v8, off, s[0:3], s32 offset:48 @@ -198460,15 +198682,15 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v25, v6, v13 ; SI-NEXT: v_and_b32_e32 v6, 0xffff, v7 ; SI-NEXT: v_or_b32_e32 v6, v6, v5 -; SI-NEXT: v_alignbit_b32 v7, v25, v5, 16 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:980 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v25, v5, 16 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_and_b32_e32 v6, 0xffff, v26 ; SI-NEXT: v_or_b32_e32 v6, v6, v11 ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; SI-NEXT: v_or_b32_e32 v5, v5, v8 ; SI-NEXT: s_waitcnt expcnt(0) @@ -199696,8 +199918,14 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:904 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v44 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload @@ -199714,12 +199942,6 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:904 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -199787,8 +200009,8 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v25 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v29 -; VI-NEXT: v_lshlrev_b16_e32 v55, 8, v3 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v55, 8, v3 ; VI-NEXT: v_lshlrev_b16_e32 v40, 8, v5 ; VI-NEXT: v_lshlrev_b16_e32 v41, 8, v7 ; VI-NEXT: v_lshlrev_b16_e32 v50, 8, v9 @@ -199884,25 +200106,13 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:208 ; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:216 ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188 -; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:44 -; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 
offset:36 -; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:28 -; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:108 -; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100 -; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:92 -; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:84 -; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:76 -; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:68 -; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:60 -; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:52 -; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196 ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 @@ -200030,19 +200240,14 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_lshlrev_b16_e32 v38, 8, v0 ; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_lshlrev_b16_e32 v39, 8, v1 -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 -; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 -; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_lshlrev_b16_e32 v49, 8, v2 ; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v49, 8, v2 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v51, 8, v3 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:356 -; VI-NEXT: v_lshlrev_b16_e32 v51, 8, v3 -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshlrev_b16_e32 v53, 8, v0 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -200050,6 +200255,26 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:372 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v53, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:108 +; VI-NEXT: 
buffer_load_ushort v42, off, s[0:3], s32 offset:100 +; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:92 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:84 +; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:76 +; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:68 +; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:52 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -200058,57 +200283,35 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(13) +; VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: v_or_b32_sdwa v0, v0, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(12) +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_or_b32_sdwa v1, v1, v55 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(11) +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_or_b32_sdwa v3, v3, v41 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(9) +; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_or_b32_sdwa v2, v2, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_or_b32_sdwa v4, v4, v48 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v10, v61, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(5) -; VI-NEXT: v_or_b32_sdwa v5, v5, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v11, v57, v11 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_or_b32_sdwa v6, v6, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v12, v59, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v13, v47, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v14, v45, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v15, v43, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: ; implicit-def: $vgpr61 -; VI-NEXT: ; implicit-def: $vgpr57 -; VI-NEXT: ; implicit-def: $vgpr59 -; VI-NEXT: ; implicit-def: $vgpr47 -; VI-NEXT: ; implicit-def: $vgpr45 -; VI-NEXT: ; implicit-def: $vgpr43 +; VI-NEXT: v_or_b32_sdwa v5, v5, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: ; implicit-def: $vgpr54 ; VI-NEXT: ; implicit-def: $vgpr55 ; VI-NEXT: ; implicit-def: $vgpr40 ; VI-NEXT: ; implicit-def: $vgpr41 ; VI-NEXT: ; implicit-def: $vgpr48 ; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v6, v6, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: ; implicit-def: $vgpr34 -; VI-NEXT: v_or_b32_sdwa v31, v31, v49 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: ; implicit-def: $vgpr49 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v2, v2, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) @@ -200141,19 +200344,39 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v6, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v32, v32, v53 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: ; implicit-def: $vgpr53 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 
offset:812 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_or_b32_sdwa v10, v61, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_or_b32_sdwa v11, v57, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_or_b32_sdwa v12, v59, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_or_b32_sdwa v13, v47, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_or_b32_sdwa v14, v45, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr61 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr59 +; VI-NEXT: ; implicit-def: $vgpr47 +; VI-NEXT: ; implicit-def: $vgpr45 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v15, v43, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr43 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v8, v62, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) @@ -200319,9 +200542,17 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v28, v28, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v31, v31, v49 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr49 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v32, v32, v53 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr53 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v30, v30, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -201006,8 +201237,8 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v27 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v29 -; GFX9-NEXT: v_lshlrev_b16_e32 v54, 8, v3 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b16_e32 v54, 8, v3 ; GFX9-NEXT: v_lshlrev_b16_e32 v41, 8, v5 ; GFX9-NEXT: v_lshlrev_b16_e32 v40, 8, v7 ; GFX9-NEXT: v_lshlrev_b16_e32 v51, 8, v9 @@ -201118,27 +201349,13 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:208 ; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:216 ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188 -; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44 
-; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:36 -; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28 -; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:20 -; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:108 -; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:100 -; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92 -; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84 -; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76 -; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68 -; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:60 -; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52 -; GFX9-NEXT: s_waitcnt vmcnt(18) +; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(18) +; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(16) +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 @@ -201271,19 +201488,14 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_lshlrev_b16_e32 v37, 8, v0 ; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_lshlrev_b16_e32 v49, 8, v1 -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 -; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 -; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_lshlrev_b16_e32 v48, 8, v2 ; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b16_e32 v48, 8, v2 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v52, 8, v3 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:356 -; GFX9-NEXT: v_lshlrev_b16_e32 v52, 8, v3 -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_lshlrev_b16_e32 v53, 8, v0 -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -201291,6 +201503,26 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:372 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v53, 8, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:28 +; GFX9-NEXT: 
buffer_load_ushort v61, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:108 +; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:100 +; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92 +; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84 +; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:76 +; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:68 +; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:52 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -201299,62 +201531,36 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b32 s6, 0x5040100 -; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(15) +; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(13) +; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_or_b32_sdwa v2, v2, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_perm_b32 v0, v1, v0, s6 -; GFX9-NEXT: s_waitcnt vmcnt(11) +; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_or_b32_sdwa v3, v3, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v0, v1, v0, s6 ; GFX9-NEXT: v_perm_b32 v1, v3, v2, s6 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(11) -; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(10) -; GFX9-NEXT: v_or_b32_sdwa v10, v60, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v11, v56, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_or_b32_sdwa v4, v4, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v12, v58, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_or_b32_sdwa v4, v4, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_or_b32_sdwa v5, v5, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v13, v46, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v6, v6, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v14, v44, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v31, v31, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: ; implicit-def: $vgpr60 -; GFX9-NEXT: ; implicit-def: $vgpr56 -; GFX9-NEXT: ; implicit-def: $vgpr58 -; GFX9-NEXT: ; implicit-def: $vgpr46 -; GFX9-NEXT: ; implicit-def: $vgpr44 ; GFX9-NEXT: ; implicit-def: $vgpr55 ; GFX9-NEXT: ; implicit-def: $vgpr54 ; GFX9-NEXT: ; implicit-def: $vgpr41 ; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: ; implicit-def: $vgpr38 ; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v6, v6, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr48 -; GFX9-NEXT: v_or_b32_sdwa v15, v42, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: ; implicit-def: $vgpr42 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v2, v2, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -201377,25 +201583,49 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v5, v5, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v5, v6, v5, s6 ; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload ; GFX9-NEXT: ; implicit-def: $vgpr34 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v6, v6, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v6, v7, v6, s6 ; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: 
buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v32, v32, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: ; implicit-def: $vgpr53 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v7, v8, v7, s6 ; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_or_b32_sdwa v10, v60, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_or_b32_sdwa v11, v56, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_or_b32_sdwa v12, v58, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_or_b32_sdwa v13, v46, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_or_b32_sdwa v14, v44, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: ; implicit-def: $vgpr60 +; GFX9-NEXT: ; implicit-def: $vgpr56 +; GFX9-NEXT: ; implicit-def: $vgpr58 +; GFX9-NEXT: ; implicit-def: $vgpr46 +; GFX9-NEXT: ; implicit-def: $vgpr44 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v15, v42, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: ; implicit-def: $vgpr42 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v8, v63, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -201561,9 +201791,17 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v28, v29, v28, s6 ; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v31, v31, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v32, v32, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: ; 
implicit-def: $vgpr48 +; GFX9-NEXT: ; implicit-def: $vgpr53 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v30, v30, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_perm_b32 v29, v30, v29, s6 @@ -202909,7 +203147,7 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v64i16: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Spill +; GFX11-FAKE16-NEXT: s_clause 0x1f ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:580 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:576 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:572 @@ -202942,7 +203180,7 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v93, s32 offset:464 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v94, s32 offset:460 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v95, s32 offset:456 -; GFX11-FAKE16-NEXT: s_clause 0xf ; 64-byte Folded Spill +; GFX11-FAKE16-NEXT: s_clause 0xf ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v104, s32 offset:452 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v105, s32 offset:448 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v106, s32 offset:444 @@ -203771,7 +204009,7 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_perm_b32 v31, v116, v31, 0x5040100 ; GFX11-FAKE16-NEXT: .LBB96_4: ; %end ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_clause 0x1f ; 128-byte Folded Reload +; GFX11-FAKE16-NEXT: s_clause 0x1f ; GFX11-FAKE16-NEXT: scratch_load_b32 v127, off, s32 offset:392 ; GFX11-FAKE16-NEXT: scratch_load_b32 v126, off, s32 offset:396 ; GFX11-FAKE16-NEXT: scratch_load_b32 v125, off, s32 offset:400 @@ -203804,7 +204042,7 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:508 ; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:512 ; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:516 -; GFX11-FAKE16-NEXT: s_clause 0xf ; 64-byte Folded Reload +; GFX11-FAKE16-NEXT: s_clause 0xf ; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:520 ; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:524 ; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:528 @@ -203849,6 +204087,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:332 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:328 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:324 @@ -203858,9 +204097,9 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:308 ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:304 ; SI-NEXT: ; implicit-def: $vgpr43 : SGPR spill to VGPR lane -; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: s_waitcnt expcnt(3) ; SI-NEXT: v_writelane_b32 v41, s30, 0 -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_writelane_b32 v43, s29, 0 ; SI-NEXT: v_writelane_b32 v43, s28, 1 ; 
SI-NEXT: v_writelane_b32 v43, s27, 2 @@ -203909,12 +204148,6 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: v_writelane_b32 v41, s96, 32 ; SI-NEXT: v_writelane_b32 v41, s97, 33 ; SI-NEXT: v_writelane_b32 v41, s98, 34 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:164 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:160 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:156 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:152 ; SI-NEXT: v_readfirstlane_b32 s39, v26 ; SI-NEXT: ; implicit-def: $vgpr42 : SGPR spill to VGPR lane ; SI-NEXT: v_readfirstlane_b32 s47, v12 @@ -203937,7 +204170,9 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: v_readfirstlane_b32 s59, v28 ; SI-NEXT: v_readfirstlane_b32 s60, v27 ; SI-NEXT: v_readfirstlane_b32 s11, v1 -; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_readfirstlane_b32 s12, v2 +; SI-NEXT: v_readfirstlane_b32 s13, v9 +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: v_writelane_b32 v43, s4, 14 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:300 @@ -203946,28 +204181,30 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:288 ; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:284 ; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:280 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_readfirstlane_b32 s4, v32 ; SI-NEXT: v_writelane_b32 v43, s4, 15 +; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_readfirstlane_b32 s4, v33 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:276 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:272 ; SI-NEXT: v_writelane_b32 v43, s4, 16 +; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_readfirstlane_b32 s4, v34 ; SI-NEXT: v_writelane_b32 v43, s4, 17 +; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_readfirstlane_b32 s4, v35 ; SI-NEXT: v_writelane_b32 v43, s4, 18 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_readfirstlane_b32 s44, v36 +; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_readfirstlane_b32 s90, v37 ; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:268 ; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:264 ; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:260 ; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:256 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_readfirstlane_b32 s6, v38 -; SI-NEXT: v_readfirstlane_b32 s12, v2 -; SI-NEXT: v_readfirstlane_b32 s13, v9 ; SI-NEXT: v_readfirstlane_b32 s14, v10 ; SI-NEXT: v_readfirstlane_b32 s15, v8 ; SI-NEXT: v_readfirstlane_b32 s18, v7 @@ -203981,10 +204218,6 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: v_readfirstlane_b32 s77, v15 ; SI-NEXT: v_readfirstlane_b32 s38, v25 ; SI-NEXT: v_writelane_b32 v41, s99, 35 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_readfirstlane_b32 s93, v55 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_readfirstlane_b32 s95, v40 ; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: v_writelane_b32 v43, s4, 19 @@ -204061,35 +204294,39 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: v_writelane_b32 v43, s4, 30 ; SI-NEXT: 
v_readfirstlane_b32 s4, v32 ; SI-NEXT: v_writelane_b32 v43, s4, 31 -; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:164 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:160 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:156 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:152 +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_readfirstlane_b32 s4, v34 ; SI-NEXT: v_writelane_b32 v43, s4, 32 -; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_readfirstlane_b32 s9, v35 -; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_readfirstlane_b32 s4, v37 ; SI-NEXT: v_writelane_b32 v43, s4, 33 ; SI-NEXT: v_readfirstlane_b32 s10, v36 -; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_readfirstlane_b32 s4, v31 ; SI-NEXT: v_writelane_b32 v43, s4, 34 -; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_readfirstlane_b32 s4, v38 ; SI-NEXT: v_writelane_b32 v43, s4, 35 -; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_readfirstlane_b32 s4, v39 ; SI-NEXT: v_writelane_b32 v43, s4, 36 -; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_readfirstlane_b32 s69, v48 -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_readfirstlane_b32 s30, v49 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_readfirstlane_b32 s16, v50 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_readfirstlane_b32 s36, v51 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:148 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:144 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_readfirstlane_b32 s4, v33 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:140 ; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:136 @@ -204103,7 +204340,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:108 ; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:104 ; SI-NEXT: v_writelane_b32 v43, s4, 37 -; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_readfirstlane_b32 s4, v52 ; SI-NEXT: v_writelane_b32 v43, s4, 38 ; SI-NEXT: v_readfirstlane_b32 s4, v53 @@ -204130,6 +204367,9 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: v_writelane_b32 v43, s43, 58 ; SI-NEXT: v_writelane_b32 v43, s76, 59 ; SI-NEXT: v_writelane_b32 v43, s77, 60 +; SI-NEXT: v_readfirstlane_b32 s93, v55 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_readfirstlane_b32 s95, v40 ; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_readfirstlane_b32 s17, v33 ; SI-NEXT: s_waitcnt vmcnt(9) @@ -205698,53 +205938,33 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:240 ; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v24 ; VI-NEXT: v_lshlrev_b32_e32 v10, 8, v26 -; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:124 -; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:132 -; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:140 -; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:148 -; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:156 -; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:164 -; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:172 -; VI-NEXT: buffer_load_ushort v58, 
off, s[0:3], s32 offset:180 -; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:188 -; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:196 -; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:204 -; VI-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:212 -; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:220 -; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:228 -; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:236 -; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:244 -; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:252 -; VI-NEXT: buffer_load_ushort v27, off, s[0:3], s32 offset:260 -; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:268 -; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:276 -; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:284 -; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:292 -; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:300 -; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:308 -; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:316 -; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:324 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v14 ; VI-NEXT: v_lshlrev_b32_e32 v16, 8, v16 ; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 ; VI-NEXT: v_lshlrev_b32_e32 v20, 8, v20 ; VI-NEXT: v_lshlrev_b32_e32 v22, 8, v22 -; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: v_lshlrev_b32_e32 v32, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v4 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v5 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v6 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v7 -; VI-NEXT: v_lshlrev_b32_e32 v36, 8, v2 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v36, 8, v2 ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:248 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:256 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:264 @@ -205789,6 +206009,52 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:68 ; VI-NEXT: s_waitcnt vmcnt(10) ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v2 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:76 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:84 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:92 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword 
v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:100 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:108 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:116 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:124 +; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:132 +; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:140 +; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:148 +; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:156 +; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:164 +; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:172 +; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:180 +; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:188 +; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:196 +; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:204 +; VI-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:212 +; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:220 +; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:228 +; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:236 +; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:244 +; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:252 +; VI-NEXT: buffer_load_ushort v27, off, s[0:3], s32 offset:260 +; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:268 +; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:276 +; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:284 +; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:292 +; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:300 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:308 +; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:316 +; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:324 ; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill @@ -205808,7 +206074,6 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill @@ -205817,6 +206082,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(14) ; 
VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill @@ -205848,25 +206114,6 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:76 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:84 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:92 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:100 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:108 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:116 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill ; VI-NEXT: s_cbranch_scc0 .LBB97_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload @@ -205891,18 +206138,15 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: v_or_b32_sdwa v2, v2, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_or_b32_sdwa v3, v3, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v2, v8 -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v0, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v1, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload @@ 
-205952,11 +206196,10 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: v_or_b32_sdwa v1, v33, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_or_b32_sdwa v0, v35, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -205964,37 +206207,50 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: v_or_b32_sdwa v1, v48, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v0, v49, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v1, v1, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v1, v60, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v2, v35, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_or_b32_sdwa v0, v0, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; 
VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(9) +; VI-NEXT: v_or_b32_sdwa v3, v33, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: v_or_b32_sdwa v0, v55, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_or_b32_sdwa v1, v49, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v18, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_or_b32_sdwa v0, v42, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_or_b32_sdwa v1, v41, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v19, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_or_b32_sdwa v0, v39, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v42, v43 ; VI-NEXT: v_mov_b32_e32 v43, v37 -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v1, v46, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v20, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload @@ -206009,12 +206265,13 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v1, v24, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_or_b32_sdwa v0, v26, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v23, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_or_b32_sdwa v0, v34, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) @@ -206036,28 +206293,21 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v0, v31, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v54, v33 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v56, v1 ; VI-NEXT: v_or_b32_sdwa v1, v59, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v2, v35, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v63, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v30, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v3, v33, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v63, v39 -; VI-NEXT: v_mov_b32_e32 v54, v33 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_mov_b32_e32 v57, v0 ; VI-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -206075,10 +206325,11 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: v_or_b32_sdwa v31, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v53, v35 -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v3, s4, v0 ; VI-NEXT: s_and_b32 s4, s16, 0xff @@ -206111,6 +206362,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: s_branch .LBB97_3 ; VI-NEXT: .LBB97_2: ; VI-NEXT: v_mov_b32_e32 v47, v54 +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 
offset:548 ; 4-byte Folded Reload @@ -206131,7 +206383,6 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v58, v7 ; VI-NEXT: v_mov_b32_e32 v57, v5 ; VI-NEXT: v_mov_b32_e32 v56, v3 @@ -206723,51 +206974,29 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:224 ; GFX9-NEXT: buffer_load_ushort v9, off, s[0:3], s32 offset:232 ; GFX9-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:240 -; GFX9-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:156 -; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:164 -; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:172 -; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:180 -; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:188 -; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:196 -; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:204 -; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:212 -; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:220 -; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:228 -; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:236 -; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:244 -; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:252 -; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:260 -; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:268 -; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:276 -; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:284 -; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:292 -; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:300 -; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:308 -; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:316 -; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:324 ; GFX9-NEXT: v_lshlrev_b32_e32 v46, 8, v46 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec -; GFX9-NEXT: s_waitcnt vmcnt(29) +; GFX9-NEXT: s_waitcnt vmcnt(7) ; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v11 -; GFX9-NEXT: s_waitcnt vmcnt(28) +; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(28) +; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v13 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(28) +; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v3 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(28) +; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v4 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(28) +; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v5 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(28) +; GFX9-NEXT: 
s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v9 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(28) +; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v7 ; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill @@ -206831,42 +207060,82 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:124 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:132 -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:140 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:148 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:156 +; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:164 +; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:172 +; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:180 +; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:188 +; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:196 +; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:204 +; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:212 +; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:220 +; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:228 +; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:236 +; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:244 +; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:252 +; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:260 +; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:268 +; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:276 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:284 +; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:292 +; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:300 +; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:308 +; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:316 +; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:324 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) ; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(22) ; GFX9-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) ; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill 
+; GFX9-NEXT: s_waitcnt vmcnt(24) ; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(28) ; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(31) ; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) ; GFX9-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) ; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) ; GFX9-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(35) ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(35) ; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill @@ -206887,13 +207156,6 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(55) -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:140 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:148 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill ; GFX9-NEXT: s_cbranch_scc0 .LBB97_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: 
s_and_b32 s4, s28, 0xff @@ -207147,13 +207409,14 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: v_lshl_or_b32 v30, v1, 16, v0 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v62, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_or_b32_sdwa v1, v62, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v31, v1, 16, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 @@ -207163,6 +207426,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: .LBB97_2: ; GFX9-NEXT: v_mov_b32_e32 v58, v50 ; GFX9-NEXT: v_mov_b32_e32 v45, v59 +; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload @@ -207174,7 +207438,6 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload ; GFX9-NEXT: v_mov_b32_e32 v34, v35 -; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload ; GFX9-NEXT: v_mov_b32_e32 v49, v39 @@ -207640,7 +207903,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; GFX11-TRUE16-LABEL: bitcast_v128i8_to_v64i16_scalar: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: s_clause 0x1e ; 124-byte Folded Spill +; GFX11-TRUE16-NEXT: s_clause 0x1e ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:440 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:436 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:432 @@ -208370,7 +208633,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; GFX11-TRUE16-NEXT: v_mov_b16_e64 v30.h, v182.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.h, v43.l ; GFX11-TRUE16-NEXT: .LBB97_3: ; %end -; GFX11-TRUE16-NEXT: s_clause 0x1e ; 124-byte Folded Reload +; GFX11-TRUE16-NEXT: s_clause 0x1e ; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:320 ; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:324 ; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:328 
@@ -208412,7 +208675,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v64i16_scalar: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_clause 0x1e ; 124-byte Folded Spill +; GFX11-FAKE16-NEXT: s_clause 0x1e ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:440 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:436 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:432 @@ -209196,7 +209459,7 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v30, v30, 16, v35 ; GFX11-FAKE16-NEXT: v_lshl_or_b32 v31, v31, 16, v36 ; GFX11-FAKE16-NEXT: .LBB97_3: ; %end -; GFX11-FAKE16-NEXT: s_clause 0x1e ; 124-byte Folded Reload +; GFX11-FAKE16-NEXT: s_clause 0x1e ; GFX11-FAKE16-NEXT: scratch_load_b32 v94, off, s32 offset:320 ; GFX11-FAKE16-NEXT: scratch_load_b32 v93, off, s32 offset:324 ; GFX11-FAKE16-NEXT: scratch_load_b32 v92, off, s32 offset:328 @@ -209299,6 +209562,100 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:132 ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:128 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v28 +; SI-NEXT: ; kill: killed $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; kill: killed $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v13 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v19 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:120 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:116 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:112 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:104 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v15, 
off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:88 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:84 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:80 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v56 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:72 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:56 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:52 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:48 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:68 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v7 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v9 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:24 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v6 @@ -209428,29 +209785,14 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; kill: killed $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v7 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v9 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill ; SI-NEXT: ; kill: killed $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; kill: killed $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:120 ; SI-NEXT: ; kill: killed $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr2 +; 
SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill ; SI-NEXT: ; kill: killed $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; kill: killed $vgpr2 @@ -209467,6 +209809,13 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; kill: killed $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:16 ; SI-NEXT: ; kill: killed $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; kill: killed $vgpr2 @@ -209521,39 +209870,12 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; kill: killed $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v57 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v58 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v62 ; SI-NEXT: ; kill: killed $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr18 @@ -209563,81 +209885,36 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; kill: killed $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:24 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:116 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:112 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:36 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: 
buffer_load_dword v1, off, s[0:3], s32 offset:16 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v62 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:104 -; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v60 -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v59 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v63 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:88 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:100 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:96 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:84 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:80 -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v61 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v56 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:72 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:32 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:56 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:52 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:48 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 @@ -209659,18 +209936,6 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, 
off, s[0:3], s32 offset:780 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v47 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:68 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:40 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v58 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v57 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB98_2 @@ -211242,25 +211507,9 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x74, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v47 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v47 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v20 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 @@ -211284,12 +211533,44 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, 
s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v64i16_to_v128i8: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8 ; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:4 ; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 @@ -211307,22 +211588,6 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v9 ; VI-NEXT: ; kill: killed $vgpr35 ; VI-NEXT: ; implicit-def: $vgpr35 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 
offset:32 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v7 ; VI-NEXT: ; kill: killed $vgpr35 @@ -211619,12 +211884,14 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v9, v8 ; VI-NEXT: v_lshrrev_b64 v[7:8], 24, v[7:8] +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v7, v5 ; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v7, v6 ; VI-NEXT: v_lshrrev_b64 v[5:6], 24, v[5:6] +; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v5, 24, v4 @@ -211656,6 +211923,10 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; VI-NEXT: v_mov_b32_e32 v3, v2 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b64 v[1:2], 24, v[36:37] +; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v5, v4 +; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v1, 24, v30 @@ -211726,16 +211997,10 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[19:20] ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, v18 -; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v5, v4 ; VI-NEXT: v_lshrrev_b64 v[41:42], 24, v[21:22] ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, v46 ; VI-NEXT: v_lshrrev_b64 v[45:46], 24, v[17:18] -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v35, 24, v26 ; VI-NEXT: v_lshrrev_b32_e32 v39, 24, v24 ; VI-NEXT: v_lshrrev_b32_e32 v58, 24, v22 @@ -211936,6 +212201,9 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; VI-NEXT: buffer_store_dword 
v32, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v32, 8, v15 ; VI-NEXT: v_lshrrev_b64 v[15:16], 24, v[15:16] +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; VI-NEXT: v_or_b32_e32 v13, v41, v13 @@ -211943,35 +212211,38 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v15, 8, v13 ; VI-NEXT: v_lshrrev_b64 v[13:14], 24, v[13:14] +; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v12 ; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v11 ; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[11:12] +; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v11, 8, v10 ; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v11, 8, v9 ; VI-NEXT: v_lshrrev_b64 v[9:10], 24, v[9:10] +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v8 ; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v7 ; VI-NEXT: v_lshrrev_b64 v[7:8], 24, v[7:8] +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v7, 8, v6 ; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v7, 8, v5 ; VI-NEXT: v_lshrrev_b64 v[5:6], 24, v[5:6] +; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v3 ; VI-NEXT: v_lshrrev_b64 v[40:41], 24, v[3:4] @@ -211984,6 +212255,8 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v36 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 
offset:424 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b64 v[1:2], 24, v[36:37] +; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v30 @@ -212052,7 +212325,6 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; VI-NEXT: v_mov_b32_e32 v49, v53 ; VI-NEXT: v_mov_b32_e32 v53, v38 ; VI-NEXT: v_mov_b32_e32 v38, v55 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v32, 8, v18 ; VI-NEXT: v_lshrrev_b32_e32 v42, 8, v17 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill @@ -212064,13 +212336,6 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; VI-NEXT: v_mov_b32_e32 v55, v31 ; VI-NEXT: v_bfe_u32 v61, v53, 8, 8 ; VI-NEXT: v_bfe_u32 v31, v38, 8, 8 -; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; VI-NEXT: .LBB98_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] @@ -212478,24 +212743,9 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; VI-NEXT: 
s_waitcnt vmcnt(14) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload @@ -212518,12 +212768,44 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v2, v52, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: bitcast_v64i16_to_v128i8: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], 
s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 @@ -212586,23 +212868,6 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr50 ; GFX9-NEXT: ; kill: killed $vgpr50 ; GFX9-NEXT: ; implicit-def: $vgpr50 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr44 ; GFX9-NEXT: ; kill: killed $vgpr50 ; GFX9-NEXT: ; implicit-def: $vgpr50 @@ -212633,6 +212898,7 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr52 ; GFX9-NEXT: ; implicit-def: $vgpr51 ; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill @@ -212666,7 +212932,7 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(33) +; GFX9-NEXT: s_waitcnt vmcnt(17) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 ; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: ; kill: killed $vgpr33 @@ -212789,100 +213055,101 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v32 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(62) +; GFX9-NEXT: s_waitcnt vmcnt(46) ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v31 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v31 -; GFX9-NEXT: v_lshrrev_b64 v[50:51], 24, v[15:16] ; GFX9-NEXT: 
buffer_store_dword v33, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v30 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v30 -; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v30 -; GFX9-NEXT: v_lshrrev_b64 v[50:51], 24, v[13:14] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v29 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v29 -; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v28 -; GFX9-NEXT: v_lshrrev_b64 v[50:51], 24, v[11:12] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v28 -; GFX9-NEXT: v_lshrrev_b64 v[51:52], 24, v[9:10] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v28 -; GFX9-NEXT: v_lshrrev_b64 v[52:53], 24, v[7:8] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v27 -; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[31:32] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v27 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v26 -; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[29:30] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v26 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v25 -; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v25 -; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[27:28] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v24 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v24 -; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; 
GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v24 -; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[25:26] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v23 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v23 -; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v22 -; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[23:24] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v22 -; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v21 -; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[21:22] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v21 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v20 +; GFX9-NEXT: v_lshrrev_b64 v[50:51], 24, v[15:16] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v20 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[50:51], 24, v[13:14] +; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[50:51], 24, v[11:12] +; GFX9-NEXT: v_lshrrev_b64 v[51:52], 24, v[9:10] +; GFX9-NEXT: v_lshrrev_b64 v[52:53], 24, v[7:8] +; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[31:32] +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[29:30] +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[27:28] +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[25:26] +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 
offset:104 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[23:24] +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[21:22] ; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[5:6] ; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v20 ; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[3:4] ; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[19:20] ; GFX9-NEXT: v_lshrrev_b32_e32 v59, 8, v10 @@ -212898,7 +213165,6 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v44, 8, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v56, 24, v32 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v62, 8, v20 ; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v19 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v19 @@ -212923,7 +213189,7 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[13:14] ; GFX9-NEXT: v_pk_add_u16 v32, v32, 3 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: s_waitcnt vmcnt(18) ; GFX9-NEXT: v_pk_add_u16 v31, v31, 3 op_sel_hi:[1,0] ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 @@ -213464,7 +213730,17 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:120 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v56 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:124 ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload @@ -213481,18 +213757,6 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX9-NEXT: 
s_nop 0 -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:120 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:124 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -213951,11 +214215,7 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; GFX11-FAKE16-LABEL: bitcast_v64i16_to_v128i8: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_clause 0x2 -; GFX11-FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:8 -; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 -; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32 -; GFX11-FAKE16-NEXT: s_clause 0x13 ; 80-byte Folded Spill +; GFX11-FAKE16-NEXT: s_clause 0x13 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v40, s32 offset:88 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v41, s32 offset:84 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v42, s32 offset:80 @@ -213976,6 +214236,10 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v73, s32 offset:20 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v74, s32 offset:16 ; GFX11-FAKE16-NEXT: scratch_store_b32 off, v75, s32 offset:12 +; GFX11-FAKE16-NEXT: s_clause 0x2 +; GFX11-FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr74 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr72 ; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr64 @@ -214611,7 +214875,7 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:80 ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:96 ; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:112 -; GFX11-FAKE16-NEXT: s_clause 0x13 ; 80-byte Folded Reload +; GFX11-FAKE16-NEXT: s_clause 0x13 ; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:12 ; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:16 ; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:20 @@ -214750,18 +215014,6 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: v_readfirstlane_b32 s91, v32 ; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_readfirstlane_b32 s93, v33 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_readfirstlane_b32 s55, v34 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_readfirstlane_b32 s17, v35 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_readfirstlane_b32 s95, v36 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_readfirstlane_b32 s35, v37 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v31, 
off, s[0:3], s32 offset:44 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:40 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:36 @@ -214771,6 +215023,18 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:20 ; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:16 ; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_readfirstlane_b32 s55, v34 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_readfirstlane_b32 s17, v35 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_readfirstlane_b32 s95, v36 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_readfirstlane_b32 s35, v37 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 +; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_readfirstlane_b32 s83, v38 ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:80 ; SI-NEXT: v_readfirstlane_b32 s65, v7 @@ -214782,34 +215046,39 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: v_readfirstlane_b32 s39, v1 ; SI-NEXT: ; implicit-def: $vgpr43 : SGPR spill to VGPR lane ; SI-NEXT: ; implicit-def: $vgpr42 : SGPR spill to VGPR lane -; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_readfirstlane_b32 s77, v31 -; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_readfirstlane_b32 s38, v32 -; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_readfirstlane_b32 s48, v33 -; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_readfirstlane_b32 s50, v39 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_readfirstlane_b32 s76, v48 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_readfirstlane_b32 s30, v49 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_readfirstlane_b32 s34, v50 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_readfirstlane_b32 s36, v51 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_readfirstlane_b32 s99, v34 +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_readfirstlane_b32 s90, v35 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_readfirstlane_b32 s92, v36 ; SI-NEXT: v_writelane_b32 v41, s90, 11 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_readfirstlane_b32 s94, v37 ; SI-NEXT: v_writelane_b32 v41, s92, 12 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_readfirstlane_b32 s30, v49 ; SI-NEXT: v_writelane_b32 v41, s94, 13 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_readfirstlane_b32 s34, v50 ; SI-NEXT: v_writelane_b32 v41, s30, 14 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_readfirstlane_b32 s36, v51 ; SI-NEXT: v_writelane_b32 v41, s34, 15 ; SI-NEXT: v_writelane_b32 v41, s36, 16 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v38 ; SI-NEXT: v_writelane_b32 v41, s38, 17 -; SI-NEXT: v_readfirstlane_b32 s76, v48 -; SI-NEXT: v_readfirstlane_b32 s99, v34 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_writelane_b32 v41, s48, 18 ; SI-NEXT: v_writelane_b32 v41, s50, 19 @@ -217791,48 +218060,48 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[9:10] -; GFX9-NEXT: v_lshrrev_b32_e32 v19, 24, v4 -; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v3 ; GFX9-NEXT: 
v_pk_add_u16 v12, s41, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v11, s40, 3 op_sel_hi:[1,0] ; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v3 ; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[11:12] -; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v19, 24, v6 -; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v6 ; GFX9-NEXT: v_pk_add_u16 v14, s43, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v13, s42, 3 op_sel_hi:[1,0] ; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v6 ; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[13:14] -; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v5 -; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v5 ; GFX9-NEXT: v_pk_add_u16 v22, s45, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v21, s44, 3 op_sel_hi:[1,0] ; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[21:22] +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 24, v4 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v3 +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v3 +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 24, v6 +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v6 +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v6 +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v5 +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v5 ; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 24, v8 -; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[21:22] ; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v8 ; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v8 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; 
GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v7 ; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v26 @@ -218484,11 +218753,9 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 ; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v15, v35, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:16 @@ -218520,7 +218787,10 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v15, v33, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:28 +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v18 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -218771,6 +219041,8 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v24 ; GFX9-NEXT: v_or_b32_sdwa v2, v41, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:124 ; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -218785,8 +219057,6 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:124 ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword 
v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload @@ -218798,7 +219068,7 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_or_saveexec_b32 s4, -1 -; GFX11-NEXT: s_clause 0x3 ; 16-byte Folded Spill +; GFX11-NEXT: s_clause 0x3 ; GFX11-NEXT: scratch_store_b32 off, v75, s32 offset:76 ; GFX11-NEXT: scratch_store_b32 off, v76, s32 offset:80 ; GFX11-NEXT: scratch_store_b32 off, v77, s32 offset:84 @@ -218833,7 +219103,7 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; GFX11-NEXT: v_writelane_b32 v76, s101, 5 ; GFX11-NEXT: s_mov_b32 s99, 0 ; GFX11-NEXT: s_and_b32 s42, vcc_lo, exec_lo -; GFX11-NEXT: s_clause 0x12 ; 76-byte Folded Spill +; GFX11-NEXT: s_clause 0x12 ; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:72 ; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:68 ; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:64 @@ -219752,7 +220022,7 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; GFX11-NEXT: scratch_store_b128 v0, v[11:14], off offset:80 ; GFX11-NEXT: scratch_store_b128 v0, v[7:10], off offset:96 ; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:112 -; GFX11-NEXT: s_clause 0x12 ; 76-byte Folded Reload +; GFX11-NEXT: s_clause 0x12 ; GFX11-NEXT: scratch_load_b32 v74, off, s32 ; GFX11-NEXT: scratch_load_b32 v73, off, s32 offset:4 ; GFX11-NEXT: scratch_load_b32 v72, off, s32 offset:8 @@ -219814,7 +220084,7 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3 ; GFX11-NEXT: v_readlane_b32 s31, v75, 1 ; GFX11-NEXT: v_readlane_b32 s30, v75, 0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: s_clause 0x3 ; 16-byte Folded Reload +; GFX11-NEXT: s_clause 0x3 ; GFX11-NEXT: scratch_load_b32 v75, off, s32 offset:76 ; GFX11-NEXT: scratch_load_b32 v76, off, s32 offset:80 ; GFX11-NEXT: scratch_load_b32 v77, off, s32 offset:84 @@ -220832,24 +221102,9 @@ define <64 x half> @bitcast_v64bf16_to_v64f16(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], 
s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 @@ -221194,14 +221449,28 @@ define <64 x half> @bitcast_v64bf16_to_v64f16(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v64bf16_to_v64f16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill @@ -221218,7 +221487,9 @@ define <64 x half> @bitcast_v64bf16_to_v64f16(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -221467,6 +221738,7 @@ define <64 x half> @bitcast_v64bf16_to_v64f16(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v40, 0x400000, v30 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 ; VI-NEXT: v_cndmask_b32_e32 v30, v55, v40, vcc +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b32_e32 v55, 16, v31 ; VI-NEXT: v_add_f32_e32 v55, 0x40c00000, v55 ; VI-NEXT: v_bfe_u32 v40, v55, 16, 1 
@@ -221832,9 +222104,6 @@ define <64 x half> @bitcast_v64bf16_to_v64f16(<64 x bfloat> %a, i32 %b) { ; GFX9-LABEL: bitcast_v64bf16_to_v64f16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill @@ -221851,7 +222120,9 @@ define <64 x half> @bitcast_v64bf16_to_v64f16(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(17) +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -222070,7 +222341,7 @@ define <64 x half> @bitcast_v64bf16_to_v64f16(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_e32 v40, 0x400000, v30 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 ; GFX9-NEXT: v_cndmask_b32_e32 v30, v55, v40, vcc -; GFX9-NEXT: s_waitcnt vmcnt(17) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b32_e32 v55, 16, v31 ; GFX9-NEXT: v_add_f32_e32 v55, 0x40c00000, v55 ; GFX9-NEXT: v_bfe_u32 v40, v55, 16, 1 @@ -222370,7 +222641,7 @@ define <64 x half> @bitcast_v64bf16_to_v64f16(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-LABEL: bitcast_v64bf16_to_v64f16: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Spill +; GFX11-TRUE16-NEXT: s_clause 0xf ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:68 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:64 ; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:60 @@ -222930,7 +223201,7 @@ define <64 x half> @bitcast_v64bf16_to_v64f16(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v10, v42 :: v_dual_mov_b32 v11, v43 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v12, v44 :: v_dual_mov_b32 v13, v45 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v14, v46 :: v_dual_mov_b32 v15, v47 -; GFX11-TRUE16-NEXT: s_clause 0xf ; 64-byte Folded Reload +; GFX11-TRUE16-NEXT: s_clause 0xf ; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:8 ; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:12 ; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:16 @@ -224659,26 +224930,10 @@ define inreg <64 x half> @bitcast_v64bf16_to_v64f16_scalar(<64 x bfloat> inreg % ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v42 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, 
off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v42 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x6c, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 @@ -224711,6 +224966,22 @@ define inreg <64 x half> @bitcast_v64bf16_to_v64f16_scalar(<64 x bfloat> inreg % ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -228691,6 +228962,7 @@ define <64 x bfloat> @bitcast_v64f16_to_v64bf16(<64 x half> %a, i32 %b) { ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x7c, v47 +; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload @@ -228707,7 +228979,6 @@ define <64 x bfloat> @bitcast_v64f16_to_v64bf16(<64 x half> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 
offset:192 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -230050,7 +230321,20 @@ define inreg <64 x bfloat> @bitcast_v64f16_to_v64bf16_scalar(<64 x half> inreg % ; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x74, v37 ; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v12 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v43 +; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x78, v37 +; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v11 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v26 +; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x7c, v37 +; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload @@ -230067,19 +230351,6 @@ define inreg <64 x bfloat> @bitcast_v64f16_to_v64bf16_scalar(<64 x half> inreg % ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v12 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x78, v37 -; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v11 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v26 -; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x7c, v37 -; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -231127,32 +231398,17 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v13 ; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; SI-NEXT: v_alignbit_b32 v2, v21, v2, 16 ; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_alignbit_b32 v2, v20, v6, 16 -; SI-NEXT: v_alignbit_b32 v1, v61, v1, 16 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; SI-NEXT: v_alignbit_b32 v5, v23, v5, 16 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_alignbit_b32 v2, v19, v3, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded 
Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_alignbit_b32 v1, v18, v4, 16 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v2, v21, v2, 16 +; SI-NEXT: v_alignbit_b32 v1, v61, v1, 16 ; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 ; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 @@ -231162,63 +231418,57 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload ; SI-NEXT: v_alignbit_b32 v7, v24, v7, 16 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 ; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v13 ; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 ; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v16 ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 ; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 ; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v17 -; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 ; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v10 ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; SI-NEXT: v_alignbit_b32 v25, v45, v8, 16 ; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v9 ; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload ; SI-NEXT: v_alignbit_b32 v8, v25, v8, 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 ; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v13 ; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; SI-NEXT: v_add_f32_e32 v16, 
0x40c00000, v16 ; SI-NEXT: v_alignbit_b32 v62, v63, v16, 16 ; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v33 ; SI-NEXT: v_alignbit_b32 v16, v62, v16, 16 ; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 ; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v10 ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 ; SI-NEXT: v_alignbit_b32 v22, v34, v9, 16 ; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v11 ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload ; SI-NEXT: v_alignbit_b32 v9, v22, v9, 16 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 ; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v11 @@ -231276,8 +231526,31 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: v_alignbit_b32 v15, v41, v15, 16 ; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v2, v20, v6, 16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v2, v19, v3, 16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v1, v18, v4, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; SI-NEXT: .LBB104_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) @@ -231535,23 +231808,7 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) { ; 
SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v61 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 @@ -231576,14 +231833,28 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v64bf16_to_v64i16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 
offset:4 -; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill @@ -231600,7 +231871,9 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -231849,6 +232122,7 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v40, 0x400000, v30 ; VI-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 ; VI-NEXT: v_cndmask_b32_e32 v30, v55, v40, vcc +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b32_e32 v55, 16, v31 ; VI-NEXT: v_add_f32_e32 v55, 0x40c00000, v55 ; VI-NEXT: v_bfe_u32 v40, v55, 16, 1 @@ -232214,9 +232488,6 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) { ; GFX9-LABEL: bitcast_v64bf16_to_v64i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill @@ -232233,7 +232504,9 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(17) +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -232452,7 +232725,7 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_e32 v40, 0x400000, v30 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 ; GFX9-NEXT: v_cndmask_b32_e32 v30, v55, v40, vcc -; GFX9-NEXT: s_waitcnt vmcnt(17) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b32_e32 v55, 16, v31 ; GFX9-NEXT: v_add_f32_e32 v55, 0x40c00000, v55 ; GFX9-NEXT: v_bfe_u32 v40, v55, 16, 1 @@ -234057,21 +234330,15 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v57, v13 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload ; SI-NEXT: 
s_waitcnt expcnt(1) ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v34 +; SI-NEXT: v_mov_b32_e32 v57, v13 ; SI-NEXT: v_mov_b32_e32 v40, v3 ; SI-NEXT: v_mov_b32_e32 v54, v50 ; SI-NEXT: v_mov_b32_e32 v46, v19 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload ; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v9 ; SI-NEXT: v_mov_b32_e32 v44, v15 ; SI-NEXT: v_mov_b32_e32 v9, v11 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v6 ; SI-NEXT: s_mov_b64 s[4:5], 0 ; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v59 @@ -234105,24 +234372,32 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a ; SI-NEXT: v_mov_b32_e32 v42, v43 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(7) expcnt(1) +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(1) ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v13 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(6) expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v3 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v11 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v3 +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v50 -; SI-NEXT: v_mov_b32_e32 v5, v19 -; SI-NEXT: v_mov_b32_e32 v7, v15 +; SI-NEXT: s_waitcnt vmcnt(2) expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v19 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v17 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v11 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v15 +; SI-NEXT: v_mov_b32_e32 v5, v19 +; SI-NEXT: v_mov_b32_e32 v7, v15 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) @@ -234258,7 +234533,9 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: v_mov_b32_e32 v54, v50 +; SI-NEXT: v_mov_b32_e32 v56, v47 ; SI-NEXT: v_mov_b32_e32 v9, v11 +; SI-NEXT: 
buffer_load_dword v47, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload ; SI-NEXT: v_mov_b32_e32 v53, v5 ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload @@ -234266,8 +234543,6 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v56, v47 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload ; SI-NEXT: v_mov_b32_e32 v40, v3 ; SI-NEXT: v_mov_b32_e32 v44, v15 ; SI-NEXT: v_mov_b32_e32 v57, v13 @@ -234575,18 +234850,16 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a ; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload ; SI-NEXT: v_lshr_b64 v[51:52], v[25:26], 16 ; SI-NEXT: v_lshr_b64 v[52:53], v[1:2], 16 -; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 ; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 ; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v20 ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 ; SI-NEXT: v_alignbit_b32 v16, v45, v16, 16 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 ; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 ; SI-NEXT: v_alignbit_b32 v28, v58, v27, 16 @@ -234644,16 +234917,19 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a ; SI-NEXT: v_lshr_b64 v[31:32], v[9:10], 16 ; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(2) ; SI-NEXT: v_lshr_b64 v[31:32], v[3:4], 16 ; SI-NEXT: .LBB105_5: ; %end -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v52 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen @@ -234679,8 +234955,10 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; 
SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(2) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v29 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 @@ -234707,11 +234985,12 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0) +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v25 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 @@ -234755,25 +235034,7 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v48 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 @@ -234788,10 +235049,10 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt 
expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v15 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 @@ -234806,8 +235067,10 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v15 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 @@ -234815,18 +235078,17 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v13 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 @@ -234834,18 +235096,17 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt 
vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v11 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 @@ -234924,6 +235185,22 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -238138,23 +238415,7 @@ define <64 x bfloat> @bitcast_v64i16_to_v64bf16(<64 x i16> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v28 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 
v2, 1.0, v2 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 @@ -238319,6 +238580,22 @@ define <64 x bfloat> @bitcast_v64i16_to_v64bf16(<64 x i16> %a, i32 %b) { ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -239903,39 +240180,38 @@ define <64 x i16> @bitcast_v64f16_to_v64i16(<64 x half> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:92 ; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:88 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v43, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v8 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v2 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v7 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v14 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v40, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v10 ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v6, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v14 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: v_cvt_f16_f32_e32 v12, v18 ; SI-NEXT: 
v_cvt_f16_f32_e32 v18, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v23 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: v_cvt_f16_f32_e32 v23, v25 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v44, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v52, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v16 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v24 @@ -239946,6 +240222,7 @@ define <64 x i16> @bitcast_v64f16_to_v64i16(<64 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v31, v27 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v50 ; SI-NEXT: v_cvt_f16_f32_e32 v27, v29 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v6, v42 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v47 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v38 @@ -240527,6 +240804,16 @@ define <64 x i16> @bitcast_v64f16_to_v64i16(<64 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v46 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v10 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload @@ -240543,16 +240830,6 @@ define <64 x i16> @bitcast_v64f16_to_v64i16(<64 x half> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v10 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -241023,12 +241300,10 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: 
v_cvt_f16_f32_e32 v5, v5 @@ -241040,24 +241315,7 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v3, v26 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v49 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v54 -; SI-NEXT: v_mov_b32_e32 v54, v15 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v12 -; SI-NEXT: v_mov_b32_e32 v12, v42 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 @@ -241067,13 +241325,8 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v26, v3, v5 ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v3, v22 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v49 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 @@ -241082,22 +241335,39 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v22, v3, v5 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v3, v18 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v49 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v54 ; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v18, v3, v5 ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v3, v16 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_mov_b32_e32 v54, v15 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v12 +; SI-NEXT: v_mov_b32_e32 v12, v42 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v1 ; 
SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 @@ -241115,6 +241385,8 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v14, v3, v5 @@ -241158,6 +241430,11 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v49 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 ; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload @@ -241294,27 +241571,27 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_or_b32_e32 v12, v50, v1 ; SI-NEXT: v_lshr_b64 v[49:50], v[35:36], 16 +; SI-NEXT: v_mov_b32_e32 v35, v44 +; SI-NEXT: v_lshr_b64 v[44:45], v[25:26], 16 ; SI-NEXT: v_lshr_b64 v[50:51], v[21:22], 16 +; SI-NEXT: v_lshr_b64 v[24:25], v[17:18], 16 ; SI-NEXT: v_lshr_b64 v[20:21], v[42:43], 16 +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshr_b64 v[20:21], v[9:10], 16 ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v35, v44 -; SI-NEXT: v_lshr_b64 v[44:45], v[25:26], 16 -; SI-NEXT: v_lshr_b64 v[24:25], v[17:18], 16 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshr_b64 v[20:21], v[40:41], 16 -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshr_b64 v[12:13], v[13:14], 16 ; SI-NEXT: v_lshr_b64 v[24:25], v[3:4], 16 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshr_b64 v[20:21], v[1:2], 16 ; SI-NEXT: v_mov_b32_e32 v42, v61 ; SI-NEXT: v_mov_b32_e32 v61, v37 @@ -241408,18 +241685,17 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v22 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], 
s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v21 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 @@ -241451,9 +241727,7 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v12 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 @@ -241468,10 +241742,10 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v12 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 @@ -241484,24 +241758,25 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v12 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v31 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword 
v9, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v33 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v9 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 @@ -241530,25 +241805,9 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v63 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v63 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x74, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen @@ -241566,6 +241825,22 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:128 ; 4-byte Folded 
Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -242603,24 +242878,9 @@ define <64 x half> @bitcast_v64i16_to_v64f16(<64 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 @@ -242965,6 +243225,22 @@ define <64 x half> @bitcast_v64i16_to_v64f16(<64 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, 
off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -244047,8 +244323,15 @@ define inreg <64 x half> @bitcast_v64i16_to_v64f16_scalar(<64 x i16> inreg %a, i ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload @@ -244065,13 +244348,6 @@ define inreg <64 x half> @bitcast_v64i16_to_v64f16_scalar(<64 x i16> inreg %a, i ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll index e688681c5ad09..9041f64cb17fb 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll @@ -15670,25 +15670,8 @@ define <40 x i8> @bitcast_v20i16_to_v40i8(<20 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v3, 0xff, v29 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v28 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v40 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, 
off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen @@ -15698,6 +15681,7 @@ define <40 x i8> @bitcast_v20i16_to_v40i8(<20 x i16> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v55 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v40 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 @@ -15727,6 +15711,22 @@ define <40 x i8> @bitcast_v20i16_to_v40i8(<20 x i16> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 36, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -15947,16 +15947,16 @@ define <40 x i8> @bitcast_v20i16_to_v40i8(<20 x i16> %a, i32 %b) { ; VI-NEXT: v_add_u32_e32 v2, vcc, 32, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v27 -; VI-NEXT: v_or_b32_sdwa v1, v42, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v29 +; VI-NEXT: v_or_b32_sdwa v1, v42, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v2, v17, v2 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v0, vcc, 36, v0 ; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -17964,6 +17964,14 @@ define <20 x i16> @bitcast_v40i8_to_v20i16(<40 x i8> %a, i32 %b) { ; VI-LABEL: bitcast_v40i8_to_v20i16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v34, v10 ; VI-NEXT: v_mov_b32_e32 v33, v8 ; VI-NEXT: v_mov_b32_e32 v35, v6 @@ -17980,14 +17988,6 @@ define <20 x i16> @bitcast_v40i8_to_v20i16(<40 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:20 ; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:12 ; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v31, v14 ; VI-NEXT: v_mov_b32_e32 v37, v12 ; VI-NEXT: v_lshlrev_b16_e32 v39, 8, v1 @@ -18005,15 +18005,17 @@ define <20 x i16> @bitcast_v40i8_to_v20i16(<40 x i8> %a, i32 %b) { ; VI-NEXT: v_lshlrev_b16_e32 v25, 8, v25 ; VI-NEXT: v_lshlrev_b16_e32 v27, 8, v27 ; VI-NEXT: v_lshlrev_b16_e32 v29, 8, v29 -; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: s_waitcnt vmcnt(9) ; VI-NEXT: v_lshlrev_b16_e32 v43, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(8) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: v_lshlrev_b16_e32 v47, 8, v4 -; VI-NEXT: s_waitcnt vmcnt(13) +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v46, 8, v6 -; VI-NEXT: s_waitcnt vmcnt(12) +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v44, 8, v8 -; VI-NEXT: s_waitcnt vmcnt(11) +; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_lshlrev_b16_e32 v45, 8, v10 ; VI-NEXT: ; implicit-def: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -18044,7 +18046,7 @@ define <20 x i16> @bitcast_v40i8_to_v20i16(<40 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v7, v28, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v8, v30, v43 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(8) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v8, v51, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v9, v53, v45 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -18099,14 +18101,14 @@ define <20 x i16> @bitcast_v40i8_to_v20i16(<40 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v0, v47, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_mov_b32_e32 v1, 0x300 ; VI-NEXT: v_add_u16_sdwa v9, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(10) +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u16_e32 v0, 3, v54 ; VI-NEXT: v_or_b32_sdwa v10, v46, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: s_waitcnt vmcnt(9) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u16_e32 v0, 3, v53 ; VI-NEXT: v_or_b32_sdwa v0, v45, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v8, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(8) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v0, 3, v51 ; VI-NEXT: v_or_b32_sdwa v11, v44, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v0, 3, v30 @@ -21932,14 +21934,6 @@ define <40 x i8> @bitcast_v20f16_to_v40i8(<20 x half> %a, i32 %b) { ; SI-NEXT: v_bfe_u32 v29, v1, 8, 8 ; SI-NEXT: .LBB60_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v24, 0xff, v24 ; SI-NEXT: v_lshlrev_b32_e32 v32, 8, v32 ; SI-NEXT: v_and_b32_e32 v30, 0xff, v30 @@ -22058,6 +22052,14 @@ define <40 x i8> @bitcast_v20f16_to_v40i8(<20 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 36, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; 
SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -23916,6 +23918,18 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) { ; SI-LABEL: bitcast_v40i8_to_v20f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; SI-NEXT: v_mov_b32_e32 v36, v4 ; SI-NEXT: v_mov_b32_e32 v31, v2 ; SI-NEXT: v_mov_b32_e32 v35, v0 @@ -23929,18 +23943,6 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:20 ; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; SI-NEXT: v_lshlrev_b32_e32 v37, 8, v1 ; SI-NEXT: v_lshlrev_b32_e32 v38, 8, v3 ; SI-NEXT: v_lshlrev_b32_e32 v39, 8, v5 @@ -23972,16 +23974,20 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(4) +; SI-NEXT: s_waitcnt vmcnt(9) expcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v47, 8, v0 +; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; SI-NEXT: s_waitcnt 
expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v59, 8, v4 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_lshlrev_b32_e32 v58, 8, v32 +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v56, 8, v33 +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v57, 8, v34 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr32 @@ -24021,7 +24027,7 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v6, 0xff, v30 ; SI-NEXT: v_or_b32_e32 v6, v6, v47 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v6 -; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v6, 0xff, v50 ; SI-NEXT: v_or_b32_e32 v6, v6, v56 ; SI-NEXT: v_cvt_f32_f16_e32 v32, v6 @@ -24099,17 +24105,18 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: v_or_b32_e32 v0, v59, v0 ; SI-NEXT: v_add_i32_e32 v19, vcc, 0x300, v0 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_movk_i32 s6, 0x300 ; SI-NEXT: v_or_b32_e32 v0, v58, v0 ; SI-NEXT: v_add_i32_e32 v34, vcc, s6, v0 -; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: v_or_b32_e32 v0, v57, v0 ; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v0 -; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: v_or_b32_e32 v0, v56, v0 @@ -24225,6 +24232,14 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) { ; VI-LABEL: bitcast_v40i8_to_v20f16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v34, v10 ; VI-NEXT: v_mov_b32_e32 v33, v8 ; VI-NEXT: v_mov_b32_e32 v35, v6 @@ -24241,14 +24256,6 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:20 ; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:12 ; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; VI-NEXT: 
buffer_store_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v31, v14 ; VI-NEXT: v_mov_b32_e32 v37, v12 ; VI-NEXT: v_lshlrev_b16_e32 v39, 8, v1 @@ -24266,15 +24273,17 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) { ; VI-NEXT: v_lshlrev_b16_e32 v25, 8, v25 ; VI-NEXT: v_lshlrev_b16_e32 v27, 8, v27 ; VI-NEXT: v_lshlrev_b16_e32 v29, 8, v29 -; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: s_waitcnt vmcnt(9) ; VI-NEXT: v_lshlrev_b16_e32 v43, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(8) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: v_lshlrev_b16_e32 v47, 8, v4 -; VI-NEXT: s_waitcnt vmcnt(13) +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v46, 8, v6 -; VI-NEXT: s_waitcnt vmcnt(12) +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v44, 8, v8 -; VI-NEXT: s_waitcnt vmcnt(11) +; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_lshlrev_b16_e32 v45, 8, v10 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -24305,7 +24314,7 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v7, v28, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v8, v30, v43 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(8) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v8, v51, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v9, v53, v45 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -24360,14 +24369,14 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v0, v47, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_mov_b32_e32 v1, 0x300 ; VI-NEXT: v_add_u16_sdwa v9, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(10) +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u16_e32 v0, 3, v54 ; VI-NEXT: v_or_b32_sdwa v10, v46, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: s_waitcnt vmcnt(9) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u16_e32 v0, 3, v53 ; VI-NEXT: v_or_b32_sdwa v0, v45, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v8, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(8) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v0, 3, v51 ; VI-NEXT: v_or_b32_sdwa v11, v44, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v0, 3, v30 @@ -28243,6 +28252,15 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) { ; SI-LABEL: bitcast_v40i8_to_v5f64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: 
buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; SI-NEXT: v_mov_b32_e32 v36, v10 ; SI-NEXT: v_mov_b32_e32 v35, v8 ; SI-NEXT: v_mov_b32_e32 v34, v6 @@ -28259,15 +28277,6 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:20 ; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; SI-NEXT: v_mov_b32_e32 v38, v14 ; SI-NEXT: v_mov_b32_e32 v37, v12 ; SI-NEXT: s_waitcnt expcnt(0) @@ -28286,14 +28295,17 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v52, 8, v25 ; SI-NEXT: v_lshlrev_b32_e32 v51, 24, v27 ; SI-NEXT: v_lshlrev_b32_e32 v27, 8, v29 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_lshlrev_b32_e32 v25, 24, v0 +; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_lshlrev_b32_e32 v17, 24, v4 +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_lshlrev_b32_e32 v19, 8, v6 -; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v23, 8, v8 -; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v10 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -28356,7 +28368,7 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; SI-NEXT: v_or_b32_e32 v8, v25, v8 ; SI-NEXT: v_or_b32_e32 v7, v7, v8 -; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v8, 0xff, v50 ; SI-NEXT: v_and_b32_e32 v9, 0xff, v49 ; SI-NEXT: v_or_b32_e32 v8, v8, v23 @@ -28496,7 +28508,7 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; SI-NEXT: v_or_b32_e32 v8, v25, v8 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v50 ; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 ; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v49 @@ -28545,6 +28557,15 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) { ; VI-LABEL: bitcast_v40i8_to_v5f64: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: 
buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v36, v10 ; VI-NEXT: v_mov_b32_e32 v35, v8 ; VI-NEXT: v_mov_b32_e32 v34, v6 @@ -28561,15 +28582,6 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:20 ; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:12 ; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v38, v14 ; VI-NEXT: v_mov_b32_e32 v37, v12 ; VI-NEXT: v_lshlrev_b16_e32 v56, 8, v1 @@ -28587,14 +28599,17 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) { ; VI-NEXT: v_lshlrev_b16_e32 v52, 8, v25 ; VI-NEXT: v_lshlrev_b16_e32 v51, 8, v27 ; VI-NEXT: v_lshlrev_b16_e32 v27, 8, v29 -; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: s_waitcnt vmcnt(9) ; VI-NEXT: v_lshlrev_b16_e32 v25, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(8) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: v_lshlrev_b16_e32 v17, 8, v4 +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v19, 8, v6 -; VI-NEXT: s_waitcnt vmcnt(13) +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v23, 8, v8 -; VI-NEXT: s_waitcnt vmcnt(12) +; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_lshlrev_b16_e32 v21, 8, v10 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -28625,7 +28640,7 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v7, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v8, v30, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(9) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v8, v50, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v9, v49, v21 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -28733,7 +28748,7 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v7, 0x300, v7 ; VI-NEXT: v_add_u16_sdwa v8, v8, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v7, v7, v8 -; VI-NEXT: s_waitcnt vmcnt(9) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v8, 3, v50 ; VI-NEXT: v_add_u16_e32 v10, 3, v49 ; VI-NEXT: v_or_b32_sdwa v8, v23, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -28765,6 +28780,15 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) { ; GFX9-LABEL: bitcast_v40i8_to_v5f64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v36, v10 ; GFX9-NEXT: v_mov_b32_e32 v35, v8 ; GFX9-NEXT: v_mov_b32_e32 v34, v6 @@ -28781,16 +28805,6 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:20 ; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:12 ; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:4 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v38, v14 ; GFX9-NEXT: v_mov_b32_e32 v37, v12 ; GFX9-NEXT: v_lshlrev_b16_e32 v56, 8, v1 @@ -28808,17 +28822,17 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) { ; GFX9-NEXT: v_lshlrev_b16_e32 v52, 8, v25 ; GFX9-NEXT: v_lshlrev_b16_e32 v51, 8, v27 ; GFX9-NEXT: v_lshlrev_b16_e32 v27, 8, v29 -; GFX9-NEXT: s_waitcnt vmcnt(18) +; GFX9-NEXT: s_waitcnt vmcnt(9) ; GFX9-NEXT: v_lshlrev_b16_e32 v25, 8, v0 -; GFX9-NEXT: s_waitcnt vmcnt(17) +; GFX9-NEXT: s_waitcnt vmcnt(8) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GFX9-NEXT: s_waitcnt vmcnt(16) +; GFX9-NEXT: s_waitcnt vmcnt(7) ; GFX9-NEXT: v_lshlrev_b16_e32 v17, 8, v4 -; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: 
s_waitcnt vmcnt(5) ; GFX9-NEXT: v_lshlrev_b16_e32 v19, 8, v6 -; GFX9-NEXT: s_waitcnt vmcnt(13) +; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_lshlrev_b16_e32 v23, 8, v8 -; GFX9-NEXT: s_waitcnt vmcnt(12) +; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_lshlrev_b16_e32 v21, 8, v10 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -28849,7 +28863,7 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v7, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v8, v30, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(9) +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v8, v50, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v9, v49, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -28957,7 +28971,7 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) { ; GFX9-NEXT: v_add_u16_e32 v7, 0x300, v7 ; GFX9-NEXT: v_add_u16_sdwa v8, v8, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v7, v7, v8 -; GFX9-NEXT: s_waitcnt vmcnt(9) +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v8, 3, v50 ; GFX9-NEXT: v_add_u16_e32 v9, 3, v49 ; GFX9-NEXT: v_or_b32_sdwa v8, v23, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -32287,6 +32301,15 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) { ; SI-LABEL: bitcast_v40i8_to_v5i64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; SI-NEXT: v_mov_b32_e32 v36, v10 ; SI-NEXT: v_mov_b32_e32 v35, v8 ; SI-NEXT: v_mov_b32_e32 v34, v6 @@ -32303,15 +32326,6 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:20 ; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: 
buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; SI-NEXT: v_mov_b32_e32 v38, v14 ; SI-NEXT: v_mov_b32_e32 v37, v12 ; SI-NEXT: s_waitcnt expcnt(0) @@ -32330,14 +32344,17 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v52, 8, v25 ; SI-NEXT: v_lshlrev_b32_e32 v51, 24, v27 ; SI-NEXT: v_lshlrev_b32_e32 v27, 8, v29 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_lshlrev_b32_e32 v25, 24, v0 +; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_lshlrev_b32_e32 v17, 24, v4 +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_lshlrev_b32_e32 v19, 8, v6 -; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v23, 8, v8 -; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v10 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -32400,7 +32417,7 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; SI-NEXT: v_or_b32_e32 v8, v25, v8 ; SI-NEXT: v_or_b32_e32 v7, v7, v8 -; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v8, 0xff, v50 ; SI-NEXT: v_and_b32_e32 v9, 0xff, v49 ; SI-NEXT: v_or_b32_e32 v8, v8, v23 @@ -32540,7 +32557,7 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; SI-NEXT: v_or_b32_e32 v8, v25, v8 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v50 ; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 ; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v49 @@ -32589,6 +32606,15 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) { ; VI-LABEL: bitcast_v40i8_to_v5i64: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v36, v10 ; VI-NEXT: v_mov_b32_e32 v35, v8 ; VI-NEXT: v_mov_b32_e32 v34, v6 @@ -32605,15 +32631,6 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:20 ; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:12 ; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, 
s[0:3], s32 offset:68 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v38, v14 ; VI-NEXT: v_mov_b32_e32 v37, v12 ; VI-NEXT: v_lshlrev_b16_e32 v56, 8, v1 @@ -32631,14 +32648,17 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) { ; VI-NEXT: v_lshlrev_b16_e32 v52, 8, v25 ; VI-NEXT: v_lshlrev_b16_e32 v51, 8, v27 ; VI-NEXT: v_lshlrev_b16_e32 v27, 8, v29 -; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: s_waitcnt vmcnt(9) ; VI-NEXT: v_lshlrev_b16_e32 v25, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(8) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: v_lshlrev_b16_e32 v17, 8, v4 +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b16_e32 v19, 8, v6 -; VI-NEXT: s_waitcnt vmcnt(13) +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_lshlrev_b16_e32 v23, 8, v8 -; VI-NEXT: s_waitcnt vmcnt(12) +; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_lshlrev_b16_e32 v21, 8, v10 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -32669,7 +32689,7 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v7, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v8, v30, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(9) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v8, v50, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v9, v49, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -32777,7 +32797,7 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v7, 0x300, v7 ; VI-NEXT: v_add_u16_sdwa v8, v8, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v7, v7, v8 -; VI-NEXT: s_waitcnt vmcnt(9) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v8, 3, v50 ; VI-NEXT: v_add_u16_e32 v10, 3, v49 ; VI-NEXT: v_or_b32_sdwa v8, v23, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -32809,6 +32829,15 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) { ; GFX9-LABEL: bitcast_v40i8_to_v5i64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 
4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v36, v10 ; GFX9-NEXT: v_mov_b32_e32 v35, v8 ; GFX9-NEXT: v_mov_b32_e32 v34, v6 @@ -32825,16 +32854,6 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:20 ; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:12 ; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:4 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v38, v14 ; GFX9-NEXT: v_mov_b32_e32 v37, v12 ; GFX9-NEXT: v_lshlrev_b16_e32 v56, 8, v1 @@ -32852,17 +32871,17 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) { ; GFX9-NEXT: v_lshlrev_b16_e32 v52, 8, v25 ; GFX9-NEXT: v_lshlrev_b16_e32 v51, 8, v27 ; GFX9-NEXT: v_lshlrev_b16_e32 v27, 8, v29 -; GFX9-NEXT: s_waitcnt vmcnt(18) +; GFX9-NEXT: s_waitcnt vmcnt(9) ; GFX9-NEXT: v_lshlrev_b16_e32 v25, 8, v0 -; GFX9-NEXT: s_waitcnt vmcnt(17) +; GFX9-NEXT: s_waitcnt vmcnt(8) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GFX9-NEXT: s_waitcnt vmcnt(16) +; GFX9-NEXT: s_waitcnt vmcnt(7) ; GFX9-NEXT: v_lshlrev_b16_e32 v17, 8, v4 -; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_lshlrev_b16_e32 v19, 8, v6 -; GFX9-NEXT: s_waitcnt vmcnt(13) +; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_lshlrev_b16_e32 v23, 8, v8 -; GFX9-NEXT: s_waitcnt vmcnt(12) +; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_lshlrev_b16_e32 v21, 8, v10 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -32893,7 +32912,7 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v7, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v8, v30, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(9) +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v8, v50, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v9, v49, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -33001,7 +33020,7 @@ define <5 x 
i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) { ; GFX9-NEXT: v_add_u16_e32 v7, 0x300, v7 ; GFX9-NEXT: v_add_u16_sdwa v8, v8, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v7, v7, v8 -; GFX9-NEXT: s_waitcnt vmcnt(9) +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u16_e32 v8, 3, v50 ; GFX9-NEXT: v_add_u16_e32 v9, 3, v49 ; GFX9-NEXT: v_or_b32_sdwa v8, v23, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll index 39da45b3e5063..ee23420c2a662 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll @@ -2406,13 +2406,13 @@ define <16 x i32> @bitcast_v32i16_to_v16i32(<32 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v32i16_to_v16i32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: v_mov_b32_e32 v32, v2 ; SI-NEXT: v_mov_b32_e32 v31, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: v_mov_b32_e32 v38, v14 ; SI-NEXT: v_mov_b32_e32 v37, v12 ; SI-NEXT: v_mov_b32_e32 v36, v10 @@ -2435,9 +2435,9 @@ define <16 x i32> @bitcast_v32i16_to_v16i32(<32 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v25 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v29 -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v2 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -8424,22 +8424,6 @@ define <64 x i8> @bitcast_v16i32_to_v64i8(<16 x i32> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v14 ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v34 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: 
buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v33 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 @@ -8475,6 +8459,22 @@ define <64 x i8> @bitcast_v16i32_to_v64i8(<16 x i32> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 60, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -8757,22 +8757,6 @@ define <64 x i8> @bitcast_v16i32_to_v64i8(<16 x i32> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 44, v0 -; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v42, 
off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v36 ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v18 @@ -8806,6 +8790,22 @@ define <64 x i8> @bitcast_v16i32_to_v64i8(<16 x i32> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -9077,25 +9077,9 @@ define <64 x i8> @bitcast_v16i32_to_v64i8(<16 x i32> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded 
Reload -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v18 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:44 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v36 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v18 ; GFX9-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v2, v35, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -9122,6 +9106,22 @@ define <64 x i8> @bitcast_v16i32_to_v64i8(<16 x i32> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60 +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -11440,6 +11440,11 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v47 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:108 ; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:100 ; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:92 @@ -11448,11 +11453,6 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 
offset:68 ; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:60 ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -11484,6 +11484,7 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v11, 0xff, v52 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v43, v11 +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_and_b32_e32 v12, 0xff, v58 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v54, v12 @@ -11722,6 +11723,7 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v43, v11 +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v58 ; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 @@ -11970,11 +11972,11 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:112 ; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:120 ; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:128 -; VI-NEXT: v_lshlrev_b16_e32 v28, 8, v25 -; VI-NEXT: v_lshlrev_b16_e32 v30, 8, v27 ; VI-NEXT: v_lshlrev_b16_e32 v20, 8, v19 ; VI-NEXT: v_lshlrev_b16_e32 v22, 8, v21 ; VI-NEXT: v_lshlrev_b16_e32 v24, 8, v23 +; VI-NEXT: v_lshlrev_b16_e32 v28, 8, v25 +; VI-NEXT: v_lshlrev_b16_e32 v30, 8, v27 ; VI-NEXT: v_lshlrev_b16_e32 v63, 8, v29 ; VI-NEXT: v_lshlrev_b16_e32 v18, 8, v17 ; VI-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:124 @@ -12014,9 +12016,16 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; VI-NEXT: v_lshlrev_b16_e32 v25, 8, v53 ; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_lshlrev_b16_e32 v21, 8, v40 -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:116 +; VI-NEXT: s_waitcnt vmcnt(9) ; VI-NEXT: v_lshlrev_b16_e32 v27, 8, v41 -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(8) ; VI-NEXT: v_lshlrev_b16_e32 v60, 8, v45 ; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:108 ; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:100 @@ -12026,13 +12035,6 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:68 ; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:60 ; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:52 -; VI-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:44 -; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:36 -; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:28 -; VI-NEXT: 
buffer_load_ushort v52, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:116 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -12042,10 +12044,11 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_or_b32_sdwa v9, v40, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v10, v49, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v11, v23, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_or_b32_sdwa v12, v38, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v13, v58, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v14, v45, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -12208,7 +12211,7 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v15, 0x300 -; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_add_u16_e32 v9, 3, v40 ; VI-NEXT: v_or_b32_sdwa v9, v57, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v9, v9, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -12218,6 +12221,7 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v11, 3, v23 ; VI-NEXT: v_or_b32_sdwa v11, v42, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v11, v11, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_add_u16_e32 v12, 3, v38 ; VI-NEXT: v_or_b32_sdwa v12, v54, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v12, v12, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -12424,11 +12428,11 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:112 ; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:120 ; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:128 -; GFX9-NEXT: v_lshlrev_b16_e32 v28, 8, v25 -; GFX9-NEXT: v_lshlrev_b16_e32 v30, 8, v27 ; GFX9-NEXT: v_lshlrev_b16_e32 v20, 8, v19 ; GFX9-NEXT: v_lshlrev_b16_e32 v22, 8, v21 ; GFX9-NEXT: v_lshlrev_b16_e32 v24, 8, v23 +; GFX9-NEXT: v_lshlrev_b16_e32 v28, 8, v25 +; GFX9-NEXT: v_lshlrev_b16_e32 v30, 8, v27 ; GFX9-NEXT: v_lshlrev_b16_e32 v63, 8, v29 ; GFX9-NEXT: v_lshlrev_b16_e32 v18, 8, v17 ; GFX9-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:124 @@ -12472,9 +12476,16 @@ define <16 x i32> 
@bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: v_lshlrev_b16_e32 v25, 8, v53 ; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_lshlrev_b16_e32 v21, 8, v40 -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:116 +; GFX9-NEXT: s_waitcnt vmcnt(9) ; GFX9-NEXT: v_lshlrev_b16_e32 v27, 8, v41 -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: s_waitcnt vmcnt(8) ; GFX9-NEXT: v_lshlrev_b16_e32 v60, 8, v45 ; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:108 ; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:100 @@ -12484,13 +12495,6 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:68 ; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:60 ; GFX9-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:52 -; GFX9-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:44 -; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:36 -; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:28 -; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:20 -; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:116 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -12500,10 +12504,11 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: s_waitcnt vmcnt(14) ; GFX9-NEXT: v_or_b32_sdwa v9, v40, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v10, v49, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v11, v23, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_or_b32_sdwa v12, v38, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v13, v58, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v14, v45, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -12666,7 +12671,7 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; GFX9-NEXT: s_movk_i32 s6, 0x300 -; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: s_waitcnt vmcnt(14) ; GFX9-NEXT: v_add_u16_e32 v9, 3, v40 ; GFX9-NEXT: v_or_b32_sdwa v9, v57, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: 
v_add_u16_sdwa v9, v9, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -12676,6 +12681,7 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: v_add_u16_e32 v11, 3, v23 ; GFX9-NEXT: v_or_b32_sdwa v11, v42, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v11, v11, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_add_u16_e32 v12, 3, v38 ; GFX9-NEXT: v_or_b32_sdwa v12, v54, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v12, v12, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -17317,13 +17323,13 @@ define <16 x float> @bitcast_v32i16_to_v16f32(<32 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v32i16_to_v16f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: v_mov_b32_e32 v32, v2 ; SI-NEXT: v_mov_b32_e32 v31, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: v_mov_b32_e32 v38, v14 ; SI-NEXT: v_mov_b32_e32 v37, v12 ; SI-NEXT: v_mov_b32_e32 v36, v10 @@ -17346,9 +17352,9 @@ define <16 x float> @bitcast_v32i16_to_v16f32(<32 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v25 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v29 -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v2 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -23309,22 +23315,6 @@ define <64 x i8> @bitcast_v16f32_to_v64i8(<16 x float> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v14 ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v34 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword 
v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v33 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 @@ -23360,6 +23350,22 @@ define <64 x i8> @bitcast_v16f32_to_v64i8(<16 x float> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 60, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -23642,22 +23648,6 @@ define <64 x i8> @bitcast_v16f32_to_v64i8(<16 x float> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 44, v0 -; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v42, off, s[0:3], 
s32 offset:52 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v36 ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v18 @@ -23691,6 +23681,22 @@ define <64 x i8> @bitcast_v16f32_to_v64i8(<16 x float> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -23962,25 +23968,9 @@ define <64 x i8> @bitcast_v16f32_to_v64i8(<16 x float> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; 
GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v18 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:44 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v36 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v18 ; GFX9-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v2, v35, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -24007,6 +23997,22 @@ define <64 x i8> @bitcast_v16f32_to_v64i8(<16 x float> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60 +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -25434,21 +25440,6 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3 ; VI-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; 
VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; VI-NEXT: v_readlane_b32 s67, v63, 19 ; VI-NEXT: v_readlane_b32 s66, v63, 18 ; VI-NEXT: v_readlane_b32 s65, v63, 17 @@ -25469,7 +25460,7 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3 ; VI-NEXT: v_readlane_b32 s34, v63, 2 ; VI-NEXT: v_readlane_b32 s31, v63, 1 ; VI-NEXT: v_readlane_b32 s30, v63, 0 -; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v5 ; VI-NEXT: v_or_b32_sdwa v5, v33, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -25499,6 +25490,21 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3 ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v0, vcc, 60, v0 ; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; VI-NEXT: s_or_saveexec_b64 s[4:5], -1 ; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; VI-NEXT: s_mov_b64 exec, s[4:5] @@ -25867,21 +25873,6 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3 ; GFX9-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v62, 
off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; GFX9-NEXT: v_readlane_b32 s55, v63, 15 ; GFX9-NEXT: v_readlane_b32 s54, v63, 14 ; GFX9-NEXT: v_readlane_b32 s53, v63, 13 @@ -25898,7 +25889,7 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3 ; GFX9-NEXT: v_readlane_b32 s34, v63, 2 ; GFX9-NEXT: v_readlane_b32 s31, v63, 1 ; GFX9-NEXT: v_readlane_b32 s30, v63, 0 -; GFX9-NEXT: s_waitcnt vmcnt(16) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v5 ; GFX9-NEXT: v_or_b32_sdwa v5, v33, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -25924,6 +25915,21 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3 ; GFX9-NEXT: v_or_b32_sdwa v2, v14, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60 +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, 
off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[4:5] @@ -26446,6 +26452,11 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v47 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:108 ; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:100 ; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:92 @@ -26454,11 +26465,6 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:68 ; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:60 ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -26490,6 +26496,7 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v11, 0xff, v52 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v43, v11 +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_and_b32_e32 v12, 0xff, v58 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v54, v12 @@ -26728,6 +26735,7 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v43, v11 +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v58 ; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 @@ -26976,11 +26984,11 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:112 ; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:120 ; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:128 -; VI-NEXT: v_lshlrev_b16_e32 v28, 8, v25 -; VI-NEXT: v_lshlrev_b16_e32 v30, 8, v27 ; VI-NEXT: v_lshlrev_b16_e32 v20, 8, v19 ; VI-NEXT: v_lshlrev_b16_e32 v22, 8, v21 ; VI-NEXT: v_lshlrev_b16_e32 v24, 8, v23 +; VI-NEXT: v_lshlrev_b16_e32 v28, 8, v25 +; VI-NEXT: v_lshlrev_b16_e32 v30, 8, v27 ; VI-NEXT: v_lshlrev_b16_e32 v63, 8, v29 ; VI-NEXT: v_lshlrev_b16_e32 v18, 8, v17 ; VI-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:124 @@ -27020,9 +27028,16 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; VI-NEXT: v_lshlrev_b16_e32 v25, 8, v53 ; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_lshlrev_b16_e32 v21, 8, v40 -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:36 +; 
VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:116 +; VI-NEXT: s_waitcnt vmcnt(9) ; VI-NEXT: v_lshlrev_b16_e32 v27, 8, v41 -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(8) ; VI-NEXT: v_lshlrev_b16_e32 v60, 8, v45 ; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:108 ; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:100 @@ -27032,13 +27047,6 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:68 ; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:60 ; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:52 -; VI-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:44 -; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:36 -; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:28 -; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:116 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -27048,10 +27056,11 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_or_b32_sdwa v9, v40, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v10, v49, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v11, v23, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_or_b32_sdwa v12, v38, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v13, v58, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v14, v45, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -27214,7 +27223,7 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v15, 0x300 -; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_add_u16_e32 v9, 3, v40 ; VI-NEXT: v_or_b32_sdwa v9, v57, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v9, v9, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -27224,6 +27233,7 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v11, 3, v23 ; VI-NEXT: v_or_b32_sdwa v11, v42, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v11, v11, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD 
src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_add_u16_e32 v12, 3, v38 ; VI-NEXT: v_or_b32_sdwa v12, v54, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v12, v12, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -27430,11 +27440,11 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:112 ; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:120 ; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:128 -; GFX9-NEXT: v_lshlrev_b16_e32 v28, 8, v25 -; GFX9-NEXT: v_lshlrev_b16_e32 v30, 8, v27 ; GFX9-NEXT: v_lshlrev_b16_e32 v20, 8, v19 ; GFX9-NEXT: v_lshlrev_b16_e32 v22, 8, v21 ; GFX9-NEXT: v_lshlrev_b16_e32 v24, 8, v23 +; GFX9-NEXT: v_lshlrev_b16_e32 v28, 8, v25 +; GFX9-NEXT: v_lshlrev_b16_e32 v30, 8, v27 ; GFX9-NEXT: v_lshlrev_b16_e32 v63, 8, v29 ; GFX9-NEXT: v_lshlrev_b16_e32 v18, 8, v17 ; GFX9-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:124 @@ -27478,9 +27488,16 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: v_lshlrev_b16_e32 v25, 8, v53 ; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_lshlrev_b16_e32 v21, 8, v40 -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:116 +; GFX9-NEXT: s_waitcnt vmcnt(9) ; GFX9-NEXT: v_lshlrev_b16_e32 v27, 8, v41 -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: s_waitcnt vmcnt(8) ; GFX9-NEXT: v_lshlrev_b16_e32 v60, 8, v45 ; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:108 ; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:100 @@ -27490,13 +27507,6 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:68 ; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:60 ; GFX9-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:52 -; GFX9-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:44 -; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:36 -; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:28 -; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:20 -; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:116 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -27506,10 +27516,11 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: s_waitcnt vmcnt(14) ; GFX9-NEXT: v_or_b32_sdwa v9, v40, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 
src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v10, v49, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v11, v23, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_or_b32_sdwa v12, v38, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v13, v58, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v14, v45, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -27672,7 +27683,7 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; GFX9-NEXT: s_movk_i32 s6, 0x300 -; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: s_waitcnt vmcnt(14) ; GFX9-NEXT: v_add_u16_e32 v9, 3, v40 ; GFX9-NEXT: v_or_b32_sdwa v9, v57, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v9, v9, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -27682,6 +27693,7 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: v_add_u16_e32 v11, 3, v23 ; GFX9-NEXT: v_or_b32_sdwa v11, v42, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v11, v11, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_add_u16_e32 v12, 3, v38 ; GFX9-NEXT: v_or_b32_sdwa v12, v54, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v12, v12, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -31676,13 +31688,13 @@ define <8 x i64> @bitcast_v32i16_to_v8i64(<32 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v32i16_to_v8i64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: v_mov_b32_e32 v32, v2 ; SI-NEXT: v_mov_b32_e32 v31, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: v_mov_b32_e32 v38, v14 ; SI-NEXT: v_mov_b32_e32 v37, v12 ; SI-NEXT: v_mov_b32_e32 v36, v10 @@ -31705,9 +31717,9 @@ define <8 x i64> @bitcast_v32i16_to_v8i64(<32 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v25 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v29 -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v2 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -37702,22 +37714,6 @@ define <64 x i8> @bitcast_v8i64_to_v64i8(<8 x i64> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: 
v_and_b32_e32 v1, 0xff, v14 ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v34 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v33 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 @@ -37753,6 +37749,22 @@ define <64 x i8> @bitcast_v8i64_to_v64i8(<8 x i64> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 60, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -38035,22 +38047,6 @@ define <64 x i8> @bitcast_v8i64_to_v64i8(<8 x i64> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 44, v0 -; VI-NEXT: buffer_load_dword v63, off, 
s[0:3], s32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v36 ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v18 @@ -38084,6 +38080,22 @@ define <64 x i8> @bitcast_v8i64_to_v64i8(<8 x i64> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -38355,25 +38367,9 @@ define <64 x i8> @bitcast_v8i64_to_v64i8(<8 x i64> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v18 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:44 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v36 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v18 ; GFX9-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v2, v35, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -38400,6 +38396,22 @@ define <64 x i8> @bitcast_v8i64_to_v64i8(<8 x i64> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60 +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-NEXT: 
buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -40728,6 +40740,11 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v47 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:108 ; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:100 ; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:92 @@ -40736,11 +40753,6 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:68 ; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:60 ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -40772,6 +40784,7 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v11, 0xff, v52 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v43, v11 +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_and_b32_e32 v12, 0xff, v58 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v54, v12 @@ -41010,6 +41023,7 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v43, v11 +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v58 ; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 @@ -41258,11 +41272,11 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:112 ; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:120 ; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:128 -; VI-NEXT: v_lshlrev_b16_e32 v28, 8, v25 -; VI-NEXT: v_lshlrev_b16_e32 v30, 8, v27 ; VI-NEXT: v_lshlrev_b16_e32 v20, 8, v19 ; VI-NEXT: v_lshlrev_b16_e32 v22, 8, v21 ; VI-NEXT: v_lshlrev_b16_e32 v24, 8, v23 +; VI-NEXT: v_lshlrev_b16_e32 v28, 8, v25 +; VI-NEXT: v_lshlrev_b16_e32 v30, 8, v27 ; VI-NEXT: v_lshlrev_b16_e32 v63, 8, v29 ; VI-NEXT: v_lshlrev_b16_e32 v18, 8, v17 ; VI-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:124 @@ -41302,9 +41316,16 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; VI-NEXT: v_lshlrev_b16_e32 v25, 8, v53 ; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_lshlrev_b16_e32 v21, 8, v40 -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:36 +; VI-NEXT: 
buffer_load_ushort v49, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:116 +; VI-NEXT: s_waitcnt vmcnt(9) ; VI-NEXT: v_lshlrev_b16_e32 v27, 8, v41 -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(8) ; VI-NEXT: v_lshlrev_b16_e32 v60, 8, v45 ; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:108 ; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:100 @@ -41314,13 +41335,6 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:68 ; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:60 ; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:52 -; VI-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:44 -; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:36 -; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:28 -; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:116 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -41330,10 +41344,11 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_or_b32_sdwa v9, v40, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v10, v49, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v11, v23, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_or_b32_sdwa v12, v38, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v13, v58, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v14, v45, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -41496,7 +41511,7 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v15, 0x300 -; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_add_u16_e32 v9, 3, v40 ; VI-NEXT: v_or_b32_sdwa v9, v57, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v9, v9, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -41506,6 +41521,7 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v11, 3, v23 ; VI-NEXT: v_or_b32_sdwa v11, v42, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v11, v11, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 
+; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_add_u16_e32 v12, 3, v38 ; VI-NEXT: v_or_b32_sdwa v12, v54, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v12, v12, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -41712,11 +41728,11 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:112 ; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:120 ; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:128 -; GFX9-NEXT: v_lshlrev_b16_e32 v28, 8, v25 -; GFX9-NEXT: v_lshlrev_b16_e32 v30, 8, v27 ; GFX9-NEXT: v_lshlrev_b16_e32 v20, 8, v19 ; GFX9-NEXT: v_lshlrev_b16_e32 v22, 8, v21 ; GFX9-NEXT: v_lshlrev_b16_e32 v24, 8, v23 +; GFX9-NEXT: v_lshlrev_b16_e32 v28, 8, v25 +; GFX9-NEXT: v_lshlrev_b16_e32 v30, 8, v27 ; GFX9-NEXT: v_lshlrev_b16_e32 v63, 8, v29 ; GFX9-NEXT: v_lshlrev_b16_e32 v18, 8, v17 ; GFX9-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:124 @@ -41760,9 +41776,16 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: v_lshlrev_b16_e32 v25, 8, v53 ; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_lshlrev_b16_e32 v21, 8, v40 -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:116 +; GFX9-NEXT: s_waitcnt vmcnt(9) ; GFX9-NEXT: v_lshlrev_b16_e32 v27, 8, v41 -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: s_waitcnt vmcnt(8) ; GFX9-NEXT: v_lshlrev_b16_e32 v60, 8, v45 ; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:108 ; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:100 @@ -41772,13 +41795,6 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:68 ; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:60 ; GFX9-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:52 -; GFX9-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:44 -; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:36 -; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:28 -; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:20 -; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:116 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -41788,10 +41804,11 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: s_waitcnt vmcnt(14) ; GFX9-NEXT: v_or_b32_sdwa v9, v40, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v10, v49, 
v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v11, v23, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_or_b32_sdwa v12, v38, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v13, v58, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v14, v45, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -41954,7 +41971,7 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; GFX9-NEXT: s_movk_i32 s6, 0x300 -; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: s_waitcnt vmcnt(14) ; GFX9-NEXT: v_add_u16_e32 v9, 3, v40 ; GFX9-NEXT: v_or_b32_sdwa v9, v57, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v9, v9, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -41964,6 +41981,7 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: v_add_u16_e32 v11, 3, v23 ; GFX9-NEXT: v_or_b32_sdwa v11, v42, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v11, v11, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_add_u16_e32 v12, 3, v38 ; GFX9-NEXT: v_or_b32_sdwa v12, v54, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v12, v12, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -45299,13 +45317,13 @@ define <8 x double> @bitcast_v32i16_to_v8f64(<32 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v32i16_to_v8f64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: v_mov_b32_e32 v32, v2 ; SI-NEXT: v_mov_b32_e32 v31, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: v_mov_b32_e32 v38, v14 ; SI-NEXT: v_mov_b32_e32 v37, v12 ; SI-NEXT: v_mov_b32_e32 v36, v10 @@ -45328,9 +45346,9 @@ define <8 x double> @bitcast_v32i16_to_v8f64(<32 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v25 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v29 -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v2 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -51147,22 +51165,6 @@ define <64 x i8> @bitcast_v8f64_to_v64i8(<8 x double> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xff, v14 ; SI-NEXT: 
v_lshlrev_b32_e32 v2, 8, v34 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v33 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 @@ -51198,6 +51200,22 @@ define <64 x i8> @bitcast_v8f64_to_v64i8(<8 x double> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 60, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -51472,22 +51490,6 @@ define <64 x i8> @bitcast_v8f64_to_v64i8(<8 x double> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 44, v0 -; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload 
-; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v36 ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v18 @@ -51521,6 +51523,22 @@ define <64 x i8> @bitcast_v8f64_to_v64i8(<8 x double> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -51784,25 +51802,9 @@ define <64 x i8> @bitcast_v8f64_to_v64i8(<8 x double> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v18 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:44 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v36 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v18 ; GFX9-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v2, v35, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -51829,6 +51831,22 @@ define <64 x i8> @bitcast_v8f64_to_v64i8(<8 x double> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60 +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 
offset:56 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -54170,6 +54188,11 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v47 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:108 ; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:100 ; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:92 @@ -54178,11 +54201,6 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:68 ; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:60 ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -54214,6 +54232,7 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v11, 0xff, v52 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v43, v11 +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_and_b32_e32 v12, 0xff, v58 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v54, v12 @@ -54452,6 +54471,7 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v43, v11 +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v58 ; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 @@ -54700,11 +54720,11 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:112 ; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:120 ; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:128 -; VI-NEXT: v_lshlrev_b16_e32 v28, 8, v25 -; VI-NEXT: v_lshlrev_b16_e32 v30, 8, v27 ; VI-NEXT: v_lshlrev_b16_e32 v20, 8, v19 ; VI-NEXT: v_lshlrev_b16_e32 v22, 8, v21 ; VI-NEXT: v_lshlrev_b16_e32 v24, 8, v23 +; VI-NEXT: v_lshlrev_b16_e32 v28, 8, v25 +; VI-NEXT: v_lshlrev_b16_e32 v30, 8, v27 ; VI-NEXT: v_lshlrev_b16_e32 v63, 8, v29 ; VI-NEXT: v_lshlrev_b16_e32 v18, 8, v17 ; VI-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:124 @@ -54744,9 +54764,16 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; VI-NEXT: v_lshlrev_b16_e32 v25, 8, v53 ; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_lshlrev_b16_e32 v21, 8, v40 -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_ushort v49, off, 
s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:116 +; VI-NEXT: s_waitcnt vmcnt(9) ; VI-NEXT: v_lshlrev_b16_e32 v27, 8, v41 -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(8) ; VI-NEXT: v_lshlrev_b16_e32 v60, 8, v45 ; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:108 ; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:100 @@ -54756,13 +54783,6 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:68 ; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:60 ; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:52 -; VI-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:44 -; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:36 -; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:28 -; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:116 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -54772,10 +54792,11 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_or_b32_sdwa v9, v40, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v10, v49, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v11, v23, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_or_b32_sdwa v12, v38, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v13, v58, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v14, v45, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -54938,7 +54959,7 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; VI-NEXT: v_mov_b32_e32 v15, 0x300 -; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_add_u16_e32 v9, 3, v40 ; VI-NEXT: v_or_b32_sdwa v9, v57, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v9, v9, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -54948,6 +54969,7 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v11, 3, v23 ; VI-NEXT: v_or_b32_sdwa v11, v42, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v11, v11, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: 
s_waitcnt vmcnt(5) ; VI-NEXT: v_add_u16_e32 v12, 3, v38 ; VI-NEXT: v_or_b32_sdwa v12, v54, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_sdwa v12, v12, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -55154,11 +55176,11 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:112 ; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:120 ; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:128 -; GFX9-NEXT: v_lshlrev_b16_e32 v28, 8, v25 -; GFX9-NEXT: v_lshlrev_b16_e32 v30, 8, v27 ; GFX9-NEXT: v_lshlrev_b16_e32 v20, 8, v19 ; GFX9-NEXT: v_lshlrev_b16_e32 v22, 8, v21 ; GFX9-NEXT: v_lshlrev_b16_e32 v24, 8, v23 +; GFX9-NEXT: v_lshlrev_b16_e32 v28, 8, v25 +; GFX9-NEXT: v_lshlrev_b16_e32 v30, 8, v27 ; GFX9-NEXT: v_lshlrev_b16_e32 v63, 8, v29 ; GFX9-NEXT: v_lshlrev_b16_e32 v18, 8, v17 ; GFX9-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:124 @@ -55202,9 +55224,16 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: v_lshlrev_b16_e32 v25, 8, v53 ; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_lshlrev_b16_e32 v21, 8, v40 -; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:116 +; GFX9-NEXT: s_waitcnt vmcnt(9) ; GFX9-NEXT: v_lshlrev_b16_e32 v27, 8, v41 -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: s_waitcnt vmcnt(8) ; GFX9-NEXT: v_lshlrev_b16_e32 v60, 8, v45 ; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:108 ; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:100 @@ -55214,13 +55243,6 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:68 ; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:60 ; GFX9-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:52 -; GFX9-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:44 -; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:36 -; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:28 -; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:20 -; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:116 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -55230,10 +55252,11 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: s_waitcnt vmcnt(14) ; GFX9-NEXT: v_or_b32_sdwa v9, v40, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v10, v49, 
v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v11, v23, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_or_b32_sdwa v12, v38, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v13, v58, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v14, v45, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -55396,7 +55419,7 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; GFX9-NEXT: s_movk_i32 s6, 0x300 -; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: s_waitcnt vmcnt(14) ; GFX9-NEXT: v_add_u16_e32 v9, 3, v40 ; GFX9-NEXT: v_or_b32_sdwa v9, v57, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v9, v9, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -55406,6 +55429,7 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; GFX9-NEXT: v_add_u16_e32 v11, 3, v23 ; GFX9-NEXT: v_or_b32_sdwa v11, v42, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v11, v11, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_add_u16_e32 v12, 3, v38 ; GFX9-NEXT: v_or_b32_sdwa v12, v54, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_sdwa v12, v12, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -60556,8 +60580,6 @@ define <32 x i16> @bitcast_v32bf16_to_v32i16(<32 x bfloat> %a, i32 %b) { ; SI-LABEL: bitcast_v32bf16_to_v32i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill @@ -60574,6 +60596,8 @@ define <32 x i16> @bitcast_v32bf16_to_v32i16(<32 x bfloat> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v63, 1.0, v0 ; SI-NEXT: v_mul_f32_e32 v62, 1.0, v1 @@ -60637,8 +60661,9 @@ define <32 x i16> @bitcast_v32bf16_to_v32i16(<32 x bfloat> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v55, 1.0, v55 ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -64301,6 +64326,18 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v1, 0xff, v2 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], 
s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v62 +; SI-NEXT: v_add_i32_e32 v0, vcc, 60, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload @@ -64317,18 +64354,6 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v0, vcc, 60, v0 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -64446,44 +64471,44 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v17, 24, v14 ; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v14 -; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[15:16] ; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v13 ; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v17, 24, v12 -; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v12 -; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[13:14] ; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v11 ; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v17, 24, v10 -; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v10 -; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[11:12] ; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v17, 24, v8 -; VI-NEXT: v_lshrrev_b64 v[20:21], 24, v[9:10] ; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; VI-NEXT: 
v_lshrrev_b32_e32 v17, 8, v8 -; VI-NEXT: v_lshrrev_b64 v[21:22], 24, v[7:8] ; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v7 -; VI-NEXT: v_lshrrev_b64 v[22:23], 24, v[5:6] ; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v6 -; VI-NEXT: v_lshrrev_b64 v[23:24], 24, v[3:4] -; VI-NEXT: v_lshrrev_b32_e32 v50, 24, v16 +; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[15:16] ; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v5 +; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[13:14] +; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[11:12] +; VI-NEXT: v_lshrrev_b64 v[20:21], 24, v[9:10] +; VI-NEXT: v_lshrrev_b64 v[21:22], 24, v[7:8] +; VI-NEXT: v_lshrrev_b64 v[22:23], 24, v[5:6] +; VI-NEXT: v_lshrrev_b64 v[23:24], 24, v[3:4] +; VI-NEXT: v_lshrrev_b32_e32 v50, 24, v16 ; VI-NEXT: v_lshrrev_b64 v[24:25], 24, v[1:2] ; VI-NEXT: v_lshrrev_b32_e32 v42, 8, v9 ; VI-NEXT: v_lshrrev_b32_e32 v43, 24, v6 -; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v46, 24, v4 ; VI-NEXT: v_lshrrev_b32_e32 v55, 8, v4 ; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v3 @@ -64780,6 +64805,8 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) { ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: v_or_b32_sdwa v1, v62, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -64796,8 +64823,6 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -65069,25 +65094,9 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: 
buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v18 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:44 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v36 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v18 ; GFX9-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v2, v35, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -65114,6 +65123,22 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60 +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -67218,21 +67243,6 
@@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32 ; GFX9-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; GFX9-NEXT: v_readlane_b32 s55, v63, 15 ; GFX9-NEXT: v_readlane_b32 s54, v63, 14 ; GFX9-NEXT: v_readlane_b32 s53, v63, 13 @@ -67249,7 +67259,7 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32 ; GFX9-NEXT: v_readlane_b32 s34, v63, 2 ; GFX9-NEXT: v_readlane_b32 s31, v63, 1 ; GFX9-NEXT: v_readlane_b32 s30, v63, 0 -; GFX9-NEXT: s_waitcnt vmcnt(16) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v5 ; GFX9-NEXT: v_or_b32_sdwa v5, v33, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -67275,6 +67285,21 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32 ; GFX9-NEXT: v_or_b32_sdwa v2, v14, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60 +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded 
Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[4:5] @@ -67743,61 +67768,17 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:84 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:76 ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:116 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:108 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:100 ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v13 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v15 ; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v5 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v11 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v9 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v7 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v23 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v17 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; SI-NEXT: v_lshlrev_b32_e32 v40, 8, v21 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill ; SI-NEXT: v_lshlrev_b32_e32 v63, 24, v19 ; SI-NEXT: v_lshlrev_b32_e32 v24, 24, v27 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr53 -; 
SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr55 @@ -67812,24 +67793,25 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v31 ; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v32 ; SI-NEXT: v_lshlrev_b32_e32 v44, 8, v33 +; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v34 +; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_lshlrev_b32_e32 v56, 24, v35 +; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_lshlrev_b32_e32 v60, 8, v36 +; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_lshlrev_b32_e32 v59, 24, v37 +; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_lshlrev_b32_e32 v61, 24, v38 ; SI-NEXT: v_lshlrev_b32_e32 v22, 8, v25 +; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v39 +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_lshlrev_b32_e32 v45, 8, v48 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v49 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v50 ; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr36 @@ -67837,6 +67819,7 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 @@ -67850,8 +67833,57 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:4 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:116 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v13 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v15 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v11 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v9 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v7 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v23 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v17 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 
offset:324 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:12 ; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v29 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v49 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v50 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -67860,6 +67892,7 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_and_b32_e32 v21, 0xff, v58 ; SI-NEXT: v_or_b32_e32 v21, v21, v26 ; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 @@ -68140,6 +68173,7 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB98_4 ; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v18 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 @@ -68164,6 +68198,7 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v3, v59, v3 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: v_add_i32_e32 v55, vcc, s7, v1 +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v42 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v8 @@ -68187,6 +68222,7 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v54, vcc, s7, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v57 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -68394,6 +68430,8 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; SI-NEXT: .LBB98_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload @@ -68410,8 +68448,6 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 
offset:188 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, v37 ; SI-NEXT: v_mov_b32_e32 v2, v48 @@ -68422,6 +68458,7 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_mov_b32_e32 v12, v32 ; SI-NEXT: v_mov_b32_e32 v14, v51 ; SI-NEXT: v_mov_b32_e32 v16, v34 +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_mov_b32_e32 v18, v52 ; SI-NEXT: v_mov_b32_e32 v20, v36 ; SI-NEXT: v_mov_b32_e32 v22, v53 @@ -70159,12 +70196,13 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: v_mov_b32_e32 v46, v30 ; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:76 ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16 ; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:24 -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:20 ; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:48 ; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:44 @@ -70181,7 +70219,6 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:60 ; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:72 ; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: v_readfirstlane_b32 s43, v1 ; SI-NEXT: v_readfirstlane_b32 s42, v0 ; SI-NEXT: v_lshlrev_b32_e32 v42, 8, v3 @@ -70205,19 +70242,19 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v36 ; SI-NEXT: v_lshlrev_b32_e32 v19, 8, v48 -; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v39 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_lshlrev_b32_e32 v61, 8, v37 ; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_lshlrev_b32_e32 v61, 8, v37 +; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v49 -; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_lshlrev_b32_e32 v25, 8, v30 -; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_lshlrev_b32_e32 v30, 24, v31 -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v38, 8, v33 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v29, 24, v34 ; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill @@ -70243,7 +70280,7 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: 
buffer_store_dword v58, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(11) expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(10) expcnt(0) ; SI-NEXT: v_mov_b32_e32 v60, v44 ; SI-NEXT: v_or_b32_e32 v44, v53, v9 ; SI-NEXT: v_or_b32_e32 v33, v1, v44 @@ -70688,6 +70725,12 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: .LBB99_3: ; %end +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload @@ -70704,12 +70747,6 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v0, s6 ; SI-NEXT: s_waitcnt expcnt(0) @@ -70721,13 +70758,11 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: v_mov_b32_e32 v6, s5 ; SI-NEXT: v_mov_b32_e32 v7, s11 ; SI-NEXT: v_mov_b32_e32 v8, v37 -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_mov_b32_e32 v10, v38 ; SI-NEXT: v_mov_b32_e32 v12, v33 -; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_mov_b32_e32 v14, v34 ; SI-NEXT: v_mov_b32_e32 v16, v48 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mov_b32_e32 v18, v49 ; SI-NEXT: v_mov_b32_e32 v20, v35 ; SI-NEXT: v_mov_b32_e32 v22, v36 @@ -70735,6 +70770,7 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: v_mov_b32_e32 v26, v51 ; SI-NEXT: v_mov_b32_e32 v28, v54 ; SI-NEXT: v_mov_b32_e32 v30, v55 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB99_4: ; SI-NEXT: v_mov_b32_e32 v39, v32 @@ -72152,8 +72188,6 @@ define <32 x bfloat> @bitcast_v32f16_to_v32bf16(<32 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v32f16_to_v32bf16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; SI-NEXT: 
buffer_load_dword v32, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill @@ -72170,6 +72204,8 @@ define <32 x bfloat> @bitcast_v32f16_to_v32bf16(<32 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v33, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v34, v2 @@ -72237,8 +72273,9 @@ define <32 x bfloat> @bitcast_v32f16_to_v32bf16(<32 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v63, v31 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr31 @@ -76957,24 +76994,8 @@ define <64 x i8> @bitcast_v32f16_to_v64i8(<32 x half> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v2, 0xff, v9 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload @@ -77002,6 +77023,22 @@ define <64 x i8> @bitcast_v32f16_to_v64i8(<32 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; 
SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -77314,23 +77351,7 @@ define <64 x i8> @bitcast_v32f16_to_v64i8(<32 x half> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v29 ; VI-NEXT: v_or_b32_sdwa v2, v31, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -77354,6 +77375,22 @@ define <64 x i8> @bitcast_v32f16_to_v64i8(<32 x half> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, 
v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -77626,25 +77663,9 @@ define <64 x i8> @bitcast_v32f16_to_v64i8(<32 x half> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v18 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:44 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v36 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v18 ; GFX9-NEXT: v_or_b32_sdwa v1, 
v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v2, v35, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -77671,6 +77692,22 @@ define <64 x i8> @bitcast_v32f16_to_v64i8(<32 x half> %a, i32 %b) { ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60 +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -79126,12 +79163,13 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32 ; VI-NEXT: ; implicit-def: $sgpr75 ; VI-NEXT: s_branch .LBB105_2 ; VI-NEXT: .LBB105_4: +; VI-NEXT: v_mov_b32_e32 v1, s58 ; VI-NEXT: v_mov_b32_e32 v53, s56 ; VI-NEXT: v_mov_b32_e32 v52, s42 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v52, s44 -; VI-NEXT: v_mov_b32_e32 v1, s58 ; VI-NEXT: v_mov_b32_e32 v19, s67 ; VI-NEXT: v_mov_b32_e32 v12, s66 ; VI-NEXT: v_mov_b32_e32 v20, s65 @@ -79177,7 +79215,6 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32 ; VI-NEXT: v_mov_b32_e32 v45, s78 ; VI-NEXT: v_mov_b32_e32 v42, s76 ; VI-NEXT: v_mov_b32_e32 v55, s74 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v54, s57 ; VI-NEXT: v_mov_b32_e32 v41, s59 ; VI-NEXT: v_mov_b32_e32 v44, s60 @@ -79283,21 +79320,6 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32 ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v54 ; VI-NEXT: v_or_b32_sdwa v1, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; 
VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; VI-NEXT: v_readlane_b32 s67, v63, 19 ; VI-NEXT: v_readlane_b32 s66, v63, 18 ; VI-NEXT: v_readlane_b32 s65, v63, 17 @@ -79318,7 +79340,7 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32 ; VI-NEXT: v_readlane_b32 s34, v63, 2 ; VI-NEXT: v_readlane_b32 s31, v63, 1 ; VI-NEXT: v_readlane_b32 s30, v63, 0 -; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v2, v25, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -79350,6 +79372,21 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32 ; VI-NEXT: v_or_b32_sdwa v1, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; VI-NEXT: 
s_or_saveexec_b64 s[4:5], -1 ; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; VI-NEXT: s_mov_b64 exec, s[4:5] @@ -79719,21 +79756,6 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32 ; GFX9-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; GFX9-NEXT: v_readlane_b32 s55, v63, 15 ; GFX9-NEXT: v_readlane_b32 s54, v63, 14 ; GFX9-NEXT: v_readlane_b32 s53, v63, 13 @@ -79750,7 +79772,7 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32 ; GFX9-NEXT: v_readlane_b32 s34, v63, 2 ; GFX9-NEXT: v_readlane_b32 s31, v63, 1 ; GFX9-NEXT: v_readlane_b32 s30, v63, 0 -; GFX9-NEXT: s_waitcnt vmcnt(16) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v5 ; GFX9-NEXT: v_or_b32_sdwa v5, v33, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -79776,6 +79798,21 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32 ; GFX9-NEXT: v_or_b32_sdwa v2, v14, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60 +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded 
Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[4:5] @@ -80249,14 +80286,6 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v7 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:108 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:100 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:84 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v9 @@ -80331,10 +80360,19 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:116 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; SI-NEXT: v_lshlrev_b32_e32 v40, 8, v31 ; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v32 ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v33 ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v34 +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_lshlrev_b32_e32 v35, 8, v35 ; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v36 ; SI-NEXT: v_lshlrev_b32_e32 v46, 8, v37 @@ -80352,7 +80390,7 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) { ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_and_b32_e32 v19, 0xff, v55 ; SI-NEXT: v_or_b32_e32 v16, v19, v16 ; SI-NEXT: v_cvt_f32_f16_e32 v34, v16 @@ -80365,6 +80403,7 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v12, 0xff, v18 ; SI-NEXT: v_or_b32_e32 v10, v12, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v21, v10 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_and_b32_e32 v10, 0xff, v41 ; SI-NEXT: v_or_b32_e32 v8, v10, v8 ; 
SI-NEXT: v_cvt_f32_f16_e32 v38, v8 @@ -80389,7 +80428,6 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v0, 0xff, v56 ; SI-NEXT: v_or_b32_e32 v0, v0, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v29, v0 -; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_and_b32_e32 v0, 0xff, v6 ; SI-NEXT: v_or_b32_e32 v0, v0, v46 ; SI-NEXT: v_cvt_f32_f16_e32 v54, v0 @@ -80596,12 +80634,13 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) { ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB106_4 ; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v56 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 ; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 ; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 ; SI-NEXT: v_or_b32_e32 v7, v3, v7 +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v47 ; SI-NEXT: v_or_b32_e32 v6, v46, v6 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 @@ -80609,10 +80648,12 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v9, v35, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v6 ; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v7 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v42 ; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 ; SI-NEXT: v_or_b32_e32 v7, v39, v7 ; SI-NEXT: v_add_i32_e32 v23, vcc, s6, v7 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v41 ; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 @@ -80811,6 +80852,13 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v31, v1 ; SI-NEXT: .LBB106_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload @@ -80827,21 +80875,14 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload ; SI-NEXT: v_mov_b32_e32 
v8, v33 ; SI-NEXT: v_mov_b32_e32 v10, v37 ; SI-NEXT: v_mov_b32_e32 v12, v49 ; SI-NEXT: v_mov_b32_e32 v14, v53 ; SI-NEXT: v_mov_b32_e32 v16, v32 -; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_mov_b32_e32 v18, v34 ; SI-NEXT: v_mov_b32_e32 v20, v36 +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_mov_b32_e32 v22, v38 ; SI-NEXT: v_mov_b32_e32 v24, v48 ; SI-NEXT: v_mov_b32_e32 v26, v50 @@ -84420,6 +84461,22 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; SI-LABEL: bitcast_v32bf16_to_v64i8: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 @@ -84485,22 +84542,6 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 
offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; SI-NEXT: v_mul_f32_e32 v36, 1.0, v2 ; SI-NEXT: v_mul_f32_e32 v31, 1.0, v4 ; SI-NEXT: v_mul_f32_e32 v35, 1.0, v3 @@ -84564,9 +84605,11 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v37 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v30, 1.0, v48 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v29, 1.0, v50 ; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr50 @@ -85177,24 +85220,8 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v4 ; SI-NEXT: v_or_b32_e32 v2, v2, v4 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload @@ -85222,6 +85249,22 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 
4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -85777,22 +85820,6 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 44, v0 -; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v36 ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v18 @@ -85826,6 +85853,22 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; VI-NEXT: 
buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -86357,10 +86400,21 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v17 ; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:52 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v41 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v17 +; GFX9-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v2, v40, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:56 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v55 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v54 +; GFX9-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60 ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -86377,17 +86431,6 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GFX9-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:56 -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v55 -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v54 -; GFX9-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v2, 
v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -88002,26 +88045,10 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v54 ; SI-NEXT: v_or_b32_e32 v4, v4, v5 ; SI-NEXT: v_and_b32_e32 v5, 0xff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v42 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: s_lshl_b32 s4, s76, 8 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v42 ; SI-NEXT: v_or_b32_e32 v3, s4, v3 ; SI-NEXT: s_and_b32 s4, s74, 0xff ; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 @@ -88049,6 +88076,22 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 60, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, 
off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB109_4: @@ -88762,21 +88805,6 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v34 ; VI-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_readlane_b32 s67, v63, 19 @@ -88799,7 +88827,7 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; VI-NEXT: v_readlane_b32 s34, v63, 2 ; VI-NEXT: v_readlane_b32 s31, v63, 1 ; VI-NEXT: v_readlane_b32 s30, v63, 0 -; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v2, v33, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -88829,6 +88857,21 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v0, vcc, 60, v0 ; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 
offset:28 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; VI-NEXT: s_or_saveexec_b64 s[4:5], -1 ; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; VI-NEXT: s_mov_b64 exec, s[4:5] @@ -90386,8 +90429,6 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:92 ; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:116 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:124 ; SI-NEXT: v_lshlrev_b32_e32 v63, 8, v13 ; SI-NEXT: v_lshlrev_b32_e32 v10, 8, v21 ; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v27 @@ -90417,30 +90458,28 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v12 ; SI-NEXT: v_lshlrev_b32_e32 v18, 24, v17 +; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_lshlrev_b32_e32 v17, 8, v20 +; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_lshlrev_b32_e32 v8, 24, v24 -; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v28 -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:84 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:40 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_lshlrev_b32_e32 v57, 8, v31 +; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_lshlrev_b32_e32 v46, 24, v32 +; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_lshlrev_b32_e32 v58, 24, v33 +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_lshlrev_b32_e32 v35, 8, v34 +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_lshlrev_b32_e32 v61, 24, v36 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:20 ; SI-NEXT: s_waitcnt expcnt(0) @@ -90457,6 +90496,8 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v3 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:116 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:124 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; 
SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v5 @@ -90472,8 +90513,16 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v19 -; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:40 +; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v23 ; SI-NEXT: ; kill: killed $vgpr3 @@ -90754,6 +90803,7 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB110_4 ; SI-NEXT: ; %bb.3: ; %cmp.true +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v45 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v16 @@ -90779,6 +90829,7 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v5, v58, v5 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 ; SI-NEXT: v_add_i32_e32 v9, vcc, s7, v3 +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v60 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v26 @@ -90790,6 +90841,7 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v5, v46, v5 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 ; SI-NEXT: v_add_i32_e32 v25, vcc, s7, v3 +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v49 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v59 @@ -90802,6 +90854,7 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v5, v12, v5 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 ; SI-NEXT: v_add_i32_e32 v12, vcc, s7, v3 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v47 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v20 @@ -90815,6 +90868,7 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v8, vcc, s7, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v44 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v28 ; SI-NEXT: v_or_b32_e32 v3, v5, v3 ; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v24 @@ -91032,8 +91086,11 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v6 ; SI-NEXT: .LBB110_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload ; SI-NEXT: v_mov_b32_e32 v2, v43 ; SI-NEXT: v_mov_b32_e32 v10, v41 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_mov_b32_e32 v28, v40 ; SI-NEXT: v_mov_b32_e32 v30, v42 ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload @@ -91052,8 +91109,6 @@ define <32 x bfloat> 
@bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload ; SI-NEXT: v_mov_b32_e32 v4, v33 ; SI-NEXT: v_mov_b32_e32 v6, v39 ; SI-NEXT: v_mov_b32_e32 v8, v51 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll index 46911e7934429..5d4df4bde1af8 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll @@ -4938,13 +4938,6 @@ define <36 x half> @bitcast_v18i32_to_v36f16(<18 x i32> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 @@ -5044,6 +5037,13 @@ define <36 x half> @bitcast_v18i32_to_v36f16(<18 x i32> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -6164,14 +6164,6 @@ define <18 x i32> @bitcast_v36f16_to_v18i32(<36 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v36f16_to_v18i32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:20 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill @@ -6188,28 +6180,36 @@ define 
<18 x i32> @bitcast_v36f16_to_v18i32(<36 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:20 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v35, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v33, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v2 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v63, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v62, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v7 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v60, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v59, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v8 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v57, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v56, v10 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: v_cvt_f16_f32_e32 v47, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: v_cvt_f16_f32_e32 v46, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v45, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v44, v14 @@ -6224,12 +6224,14 @@ define <18 x i32> @bitcast_v36f16_to_v18i32(<36 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v51, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v50, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v49, v27 -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(9) expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(9) expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v36 +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_cvt_f16_f32_e32 v36, v39 +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v48 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -12069,13 +12071,6 @@ define <36 x half> @bitcast_v18f32_to_v36f16(<18 x float> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: 
buffer_load_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 @@ -12175,6 +12170,13 @@ define <36 x half> @bitcast_v18f32_to_v36f16(<18 x float> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -13433,14 +13435,6 @@ define <18 x float> @bitcast_v36f16_to_v18f32(<36 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v36f16_to_v18f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:20 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill @@ -13457,28 +13451,36 @@ define <18 x float> @bitcast_v36f16_to_v18f32(<36 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:20 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v35, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v33, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v2 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v63, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v62, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v7 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v60, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v59, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v8 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; 
SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v57, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v56, v10 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: v_cvt_f16_f32_e32 v47, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: v_cvt_f16_f32_e32 v46, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v45, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v44, v14 @@ -13493,12 +13495,14 @@ define <18 x float> @bitcast_v36f16_to_v18f32(<36 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v51, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v50, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v49, v27 -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(9) expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(9) expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v36 +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_cvt_f16_f32_e32 v36, v39 +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v48 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -18416,13 +18420,6 @@ define <36 x half> @bitcast_v9i64_to_v36f16(<9 x i64> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 @@ -18522,6 +18519,13 @@ define <36 x half> @bitcast_v9i64_to_v36f16(<9 x i64> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -19652,14 +19656,6 @@ define <9 x i64> @bitcast_v36f16_to_v9i64(<36 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v36f16_to_v9i64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], 
s32 offset:12 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:20 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill @@ -19676,28 +19672,36 @@ define <9 x i64> @bitcast_v36f16_to_v9i64(<36 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:20 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v35, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v33, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v2 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v63, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v62, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v7 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v60, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v59, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v8 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v57, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v56, v10 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: v_cvt_f16_f32_e32 v47, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: v_cvt_f16_f32_e32 v46, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v45, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v44, v14 @@ -19712,12 +19716,14 @@ define <9 x i64> @bitcast_v36f16_to_v9i64(<36 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v51, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v50, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v49, v27 -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(9) expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(9) expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v36 +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_cvt_f16_f32_e32 v36, v39 +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v48 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -23972,13 +23978,6 @@ define <36 x half> @bitcast_v9f64_to_v36f16(<9 x double> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 ; 4-byte Folded Reload -; 
SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 @@ -24078,6 +24077,13 @@ define <36 x half> @bitcast_v9f64_to_v36f16(<9 x double> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -25276,14 +25282,6 @@ define <9 x double> @bitcast_v36f16_to_v9f64(<36 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v36f16_to_v9f64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:20 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill @@ -25300,28 +25298,36 @@ define <9 x double> @bitcast_v36f16_to_v9f64(<36 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:20 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v35, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v33, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v2 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v63, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v62, 
v4 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v7 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v60, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v59, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v8 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v57, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v56, v10 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: v_cvt_f16_f32_e32 v47, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: v_cvt_f16_f32_e32 v46, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v45, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v44, v14 @@ -25336,12 +25342,14 @@ define <9 x double> @bitcast_v36f16_to_v9f64(<36 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v51, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v50, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v49, v27 -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(9) expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(9) expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v36 +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_cvt_f16_f32_e32 v36, v39 +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v48 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -26790,6 +26798,22 @@ define <36 x half> @bitcast_v36i16_to_v36f16(<36 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v36i16_to_v36f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:20 ; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:16 ; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:12 @@ -26814,22 +26838,6 @@ define <36 x half> @bitcast_v36i16_to_v36f16(<36 x i16> %a, i32 %b) { ; 
SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; kill: killed $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr62 ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr63 @@ -26857,7 +26865,7 @@ define <36 x half> @bitcast_v36i16_to_v36f16(<36 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; kill: killed $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -26884,7 +26892,7 @@ define <36 x half> @bitcast_v36i16_to_v36f16(<36 x i16> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v47, v9 ; SI-NEXT: v_cvt_f32_f16_e32 v60, v10 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(3) expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v39 ; SI-NEXT: v_cvt_f32_f16_e32 v45, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v58, v12 @@ -26969,6 +26977,7 @@ define <36 x half> @bitcast_v36i16_to_v36f16(<36 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v27 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v39, vcc, 3, v39 ; SI-NEXT: v_add_i32_e32 v34, vcc, 3, v34 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill @@ -27138,24 +27147,8 @@ define <36 x half> @bitcast_v36i16_to_v36f16(<36 x i16> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v52 ; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: 
buffer_load_dword v57, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen @@ -27210,6 +27203,22 @@ define <36 x half> @bitcast_v36i16_to_v36f16(<36 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -27751,17 +27760,6 @@ define inreg <36 x half> @bitcast_v36i16_to_v36f16_scalar(<36 x i16> inreg %a, i ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: 
buffer_load_dword v43, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f16_f32_e32 v2, v51 ; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -27844,6 +27842,17 @@ define inreg <36 x half> @bitcast_v36i16_to_v36f16_scalar(<36 x i16> inreg %a, i ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB57_4: @@ -28700,12 +28709,6 @@ define <36 x i16> @bitcast_v36f16_to_v36i16(<36 x half> %a, i32 %b) { ; SI-NEXT: v_alignbit_b32 v9, v1, v9, 16 ; SI-NEXT: .LBB58_2: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v20, 0xffff, v50 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v55 ; SI-NEXT: v_or_b32_e32 v20, v20, v21 @@ -28807,6 +28810,12 @@ define <36 x i16> @bitcast_v36f16_to_v36i16(<36 x half> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x44, v0 ; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -29412,15 +29421,9 @@ define inreg <36 x i16> @bitcast_v36f16_to_v36i16_scalar(<36 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 ; SI-NEXT: 
buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v40 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v28 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v40 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen @@ -29490,6 +29493,12 @@ define inreg <36 x i16> @bitcast_v36f16_to_v36i16_scalar(<36 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x44, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB59_4: diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll index 6749daba296c5..44cfd6c28ca6a 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll @@ -3541,17 +3541,6 @@ define <20 x i32> @bitcast_v40i16_to_v20i32(<40 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v40i16_to_v20i32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v52, v6 -; SI-NEXT: v_mov_b32_e32 v53, v4 -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 -; SI-NEXT: v_mov_b32_e32 v49, v12 -; SI-NEXT: v_mov_b32_e32 v50, v10 -; SI-NEXT: v_mov_b32_e32 v51, v8 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill @@ -3573,6 +3562,17 @@ define <20 x i32> @bitcast_v40i16_to_v20i32(<40 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 
offset:28 +; SI-NEXT: v_mov_b32_e32 v49, v12 +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v51, v8 ; SI-NEXT: v_mov_b32_e32 v37, v20 ; SI-NEXT: v_mov_b32_e32 v38, v18 ; SI-NEXT: v_mov_b32_e32 v39, v16 @@ -3594,10 +3594,13 @@ define <20 x i32> @bitcast_v40i16_to_v20i32(<40 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v29 ; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 @@ -5554,23 +5557,10 @@ define <40 x half> @bitcast_v20i32_to_v40f16(<20 x i32> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v40 -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v40 ; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 @@ -5666,6 +5656,19 @@ define <40 x half> @bitcast_v20i32_to_v40f16(<20 x i32> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 
offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -11737,17 +11740,6 @@ define <20 x float> @bitcast_v40i16_to_v20f32(<40 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v40i16_to_v20f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v52, v6 -; SI-NEXT: v_mov_b32_e32 v53, v4 -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 -; SI-NEXT: v_mov_b32_e32 v49, v12 -; SI-NEXT: v_mov_b32_e32 v50, v10 -; SI-NEXT: v_mov_b32_e32 v51, v8 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill @@ -11769,6 +11761,17 @@ define <20 x float> @bitcast_v40i16_to_v20f32(<40 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 +; SI-NEXT: v_mov_b32_e32 v49, v12 +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v51, v8 ; SI-NEXT: v_mov_b32_e32 v37, v20 ; SI-NEXT: v_mov_b32_e32 v38, v18 ; SI-NEXT: v_mov_b32_e32 v39, v16 @@ -11790,10 +11793,13 @@ define <20 x float> @bitcast_v40i16_to_v20f32(<40 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v29 ; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 @@ -13750,23 +13756,10 @@ define <40 x half> @bitcast_v20f32_to_v40f16(<20 x float> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v40 -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:16 ; 
4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v40 ; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 @@ -13862,6 +13855,19 @@ define <40 x half> @bitcast_v20f32_to_v40f16(<20 x float> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -14429,10 +14435,6 @@ define inreg <40 x half> @bitcast_v20f32_to_v40f16_scalar(<20 x float> inreg %a, ; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 ; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 ; SI-NEXT: .LBB33_3: ; %end -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 ; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 ; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 @@ -14571,6 +14573,9 @@ define inreg <40 x half> @bitcast_v20f32_to_v40f16_scalar(<20 x float> inreg %a, ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB33_4: @@ -19244,17 
+19249,6 @@ define <10 x i64> @bitcast_v40i16_to_v10i64(<40 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v40i16_to_v10i64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v52, v6 -; SI-NEXT: v_mov_b32_e32 v53, v4 -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 -; SI-NEXT: v_mov_b32_e32 v49, v12 -; SI-NEXT: v_mov_b32_e32 v50, v10 -; SI-NEXT: v_mov_b32_e32 v51, v8 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill @@ -19276,6 +19270,17 @@ define <10 x i64> @bitcast_v40i16_to_v10i64(<40 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 +; SI-NEXT: v_mov_b32_e32 v49, v12 +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v51, v8 ; SI-NEXT: v_mov_b32_e32 v37, v20 ; SI-NEXT: v_mov_b32_e32 v38, v18 ; SI-NEXT: v_mov_b32_e32 v39, v16 @@ -19297,10 +19302,13 @@ define <10 x i64> @bitcast_v40i16_to_v10i64(<40 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v29 ; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 @@ -21257,23 +21265,10 @@ define <40 x half> @bitcast_v10i64_to_v40f16(<10 x i64> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v40 -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: 
buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v40 ; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 @@ -21369,6 +21364,19 @@ define <40 x half> @bitcast_v10i64_to_v40f16(<10 x i64> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -25980,17 +25988,6 @@ define <10 x double> @bitcast_v40i16_to_v10f64(<40 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v40i16_to_v10f64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v52, v6 -; SI-NEXT: v_mov_b32_e32 v53, v4 -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 -; SI-NEXT: v_mov_b32_e32 v49, v12 -; SI-NEXT: v_mov_b32_e32 v50, v10 -; SI-NEXT: v_mov_b32_e32 v51, v8 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill @@ -26012,6 +26009,17 @@ define <10 x double> @bitcast_v40i16_to_v10f64(<40 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword 
v4, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 +; SI-NEXT: v_mov_b32_e32 v49, v12 +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v51, v8 ; SI-NEXT: v_mov_b32_e32 v37, v20 ; SI-NEXT: v_mov_b32_e32 v38, v18 ; SI-NEXT: v_mov_b32_e32 v39, v16 @@ -26033,10 +26041,13 @@ define <10 x double> @bitcast_v40i16_to_v10f64(<40 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v29 ; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 @@ -27973,23 +27984,10 @@ define <40 x half> @bitcast_v10f64_to_v40f16(<10 x double> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v40 -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v40 ; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 @@ -28085,6 +28083,19 @@ define <40 x half> @bitcast_v10f64_to_v40f16(<10 x double> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, 
s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -28624,11 +28635,6 @@ define inreg <40 x half> @bitcast_v10f64_to_v40f16_scalar(<10 x double> inreg %a ; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 ; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 ; SI-NEXT: .LBB53_3: ; %end -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 ; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 ; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 @@ -28767,6 +28773,10 @@ define inreg <40 x half> @bitcast_v10f64_to_v40f16_scalar(<10 x double> inreg %a ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB53_4: @@ -31379,17 +31389,6 @@ define <40 x half> @bitcast_v40i16_to_v40f16(<40 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v40i16_to_v40f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill @@ -31406,6 +31405,17 @@ define <40 x half> @bitcast_v40i16_to_v40f16(<40 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], 
s32 offset:32 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:8 ; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; kill: killed $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr40 @@ -31462,7 +31472,7 @@ define <40 x half> @bitcast_v40i16_to_v40f16(<40 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; kill: killed $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -31513,6 +31523,7 @@ define <40 x half> @bitcast_v40i16_to_v40f16(<40 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v30 +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_cvt_f32_f16_e32 v40, v48 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr3 @@ -31612,6 +31623,7 @@ define <40 x half> @bitcast_v40i16_to_v40f16(<40 x i16> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v27 ; SI-NEXT: v_add_i32_e32 v39, vcc, 3, v39 +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_add_i32_e32 v49, vcc, 3, v49 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -31631,6 +31643,7 @@ define <40 x half> @bitcast_v40i16_to_v40f16(<40 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v38 +; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_add_i32_e32 v48, vcc, 3, v48 ; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill @@ -31837,23 +31850,7 @@ define <40 x half> @bitcast_v40i16_to_v40f16(<40 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f16_f32_e32 v2, v40 ; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: 
buffer_load_dword v41, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 @@ -31891,6 +31888,22 @@ define <40 x half> @bitcast_v40i16_to_v40f16(<40 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -32586,6 +32599,11 @@ define inreg <40 x half> @bitcast_v40i16_to_v40f16_scalar(<40 x i16> inreg %a, i ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v48 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x4c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -32602,11 +32620,6 @@ define inreg <40 x half> @bitcast_v40i16_to_v40f16_scalar(<40 x i16> inreg %a, i ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v2, v48 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x4c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB57_4: @@ -33561,20 +33574,8 @@ define <40 x i16> @bitcast_v40f16_to_v40i16(<40 x half> %a, i32 %b) { ; SI-NEXT: v_alignbit_b32 v19, v5, v29, 16 ; SI-NEXT: .LBB58_2: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; 
SI-NEXT: v_lshlrev_b32_e32 v24, 16, v40 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v23, 0xffff, v50 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v40 ; SI-NEXT: v_or_b32_e32 v23, v23, v24 ; SI-NEXT: buffer_store_dword v23, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) @@ -33689,6 +33690,18 @@ define <40 x i16> @bitcast_v40f16_to_v40i16(<40 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x4c, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -34404,18 +34417,6 @@ define inreg <40 x i16> @bitcast_v40f16_to_v40i16_scalar(<40 x half> inreg %a, i ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v40 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 
4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v26 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 @@ -34450,6 +34451,18 @@ define inreg <40 x i16> @bitcast_v40f16_to_v40i16_scalar(<40 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x4c, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB59_4: diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll index 6b13e96d73999..87d5157b3c340 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll @@ -3792,17 +3792,6 @@ define <22 x i32> @bitcast_v44i16_to_v22i32(<44 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v44i16_to_v22i32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 -; SI-NEXT: v_mov_b32_e32 v53, v4 -; SI-NEXT: v_mov_b32_e32 v50, v10 -; SI-NEXT: v_mov_b32_e32 v51, v8 -; SI-NEXT: v_mov_b32_e32 v52, v6 -; SI-NEXT: v_mov_b32_e32 v39, v16 -; SI-NEXT: v_mov_b32_e32 v48, v14 -; SI-NEXT: v_mov_b32_e32 v49, v12 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill @@ -3825,6 +3814,17 @@ define <22 x i32> @bitcast_v44i16_to_v22i32(<44 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v51, v8 +; SI-NEXT: v_mov_b32_e32 v52, 
v6 +; SI-NEXT: v_mov_b32_e32 v39, v16 +; SI-NEXT: v_mov_b32_e32 v48, v14 +; SI-NEXT: v_mov_b32_e32 v49, v12 ; SI-NEXT: v_mov_b32_e32 v38, v18 ; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v3 @@ -3842,8 +3842,9 @@ define <22 x i32> @bitcast_v44i16_to_v22i32(<44 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v25 ; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v29 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 @@ -6117,24 +6118,8 @@ define <44 x half> @bitcast_v22i32_to_v44f16(<22 x i32> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v26 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen @@ -6156,6 +6141,22 @@ define <44 x half> @bitcast_v22i32_to_v44f16(<22 x i32> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 
offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -12754,17 +12755,6 @@ define <22 x float> @bitcast_v44i16_to_v22f32(<44 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v44i16_to_v22f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 -; SI-NEXT: v_mov_b32_e32 v53, v4 -; SI-NEXT: v_mov_b32_e32 v50, v10 -; SI-NEXT: v_mov_b32_e32 v51, v8 -; SI-NEXT: v_mov_b32_e32 v52, v6 -; SI-NEXT: v_mov_b32_e32 v39, v16 -; SI-NEXT: v_mov_b32_e32 v48, v14 -; SI-NEXT: v_mov_b32_e32 v49, v12 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill @@ -12787,6 +12777,17 @@ define <22 x float> @bitcast_v44i16_to_v22f32(<44 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v51, v8 +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_mov_b32_e32 v39, v16 +; SI-NEXT: v_mov_b32_e32 v48, v14 +; SI-NEXT: v_mov_b32_e32 v49, v12 ; SI-NEXT: v_mov_b32_e32 v38, v18 ; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v3 @@ -12804,8 +12805,9 @@ define <22 x float> @bitcast_v44i16_to_v22f32(<44 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v25 ; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v29 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 @@ -15079,24 +15081,8 @@ define <44 x half> @bitcast_v22f32_to_v44f16(<22 x float> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v26 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload 
-; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen @@ -15118,6 +15104,22 @@ define <44 x half> @bitcast_v22f32_to_v44f16(<22 x float> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -15742,15 +15744,6 @@ define inreg <44 x half> @bitcast_v22f32_to_v44f16_scalar(<22 x float> inreg %a, ; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 ; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 ; SI-NEXT: .LBB33_3: ; %end -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; 
SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 ; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 ; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 @@ -15903,6 +15896,14 @@ define inreg <44 x half> @bitcast_v22f32_to_v44f16_scalar(<22 x float> inreg %a, ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB33_4: @@ -21003,17 +21004,6 @@ define <11 x i64> @bitcast_v44i16_to_v11i64(<44 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v44i16_to_v11i64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 -; SI-NEXT: v_mov_b32_e32 v53, v4 -; SI-NEXT: v_mov_b32_e32 v50, v10 -; SI-NEXT: v_mov_b32_e32 v51, v8 -; SI-NEXT: v_mov_b32_e32 v52, v6 -; SI-NEXT: v_mov_b32_e32 v39, v16 -; SI-NEXT: v_mov_b32_e32 v48, v14 -; SI-NEXT: v_mov_b32_e32 v49, v12 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill @@ -21036,6 +21026,17 @@ define <11 x i64> @bitcast_v44i16_to_v11i64(<44 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v51, v8 +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_mov_b32_e32 v39, v16 +; SI-NEXT: v_mov_b32_e32 v48, v14 +; SI-NEXT: v_mov_b32_e32 v49, v12 ; SI-NEXT: v_mov_b32_e32 v38, v18 ; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v3 @@ -21053,8 +21054,9 @@ define <11 x i64> @bitcast_v44i16_to_v11i64(<44 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v25 ; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v29 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: 
buffer_load_dword v2, off, s[0:3], s32 offset:8 @@ -23328,24 +23330,8 @@ define <44 x half> @bitcast_v11i64_to_v44f16(<11 x i64> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v26 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen @@ -23367,6 +23353,22 @@ define <44 x half> @bitcast_v11i64_to_v44f16(<11 x i64> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -28418,17 +28420,6 @@ define <11 x double> @bitcast_v44i16_to_v11f64(<44 x i16> 
%a, i32 %b) { ; SI-LABEL: bitcast_v44i16_to_v11f64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 -; SI-NEXT: v_mov_b32_e32 v53, v4 -; SI-NEXT: v_mov_b32_e32 v50, v10 -; SI-NEXT: v_mov_b32_e32 v51, v8 -; SI-NEXT: v_mov_b32_e32 v52, v6 -; SI-NEXT: v_mov_b32_e32 v39, v16 -; SI-NEXT: v_mov_b32_e32 v48, v14 -; SI-NEXT: v_mov_b32_e32 v49, v12 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill @@ -28451,6 +28442,17 @@ define <11 x double> @bitcast_v44i16_to_v11f64(<44 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v51, v8 +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_mov_b32_e32 v39, v16 +; SI-NEXT: v_mov_b32_e32 v48, v14 +; SI-NEXT: v_mov_b32_e32 v49, v12 ; SI-NEXT: v_mov_b32_e32 v38, v18 ; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v3 @@ -28468,8 +28470,9 @@ define <11 x double> @bitcast_v44i16_to_v11f64(<44 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v25 ; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v29 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 @@ -30721,24 +30724,8 @@ define <44 x half> @bitcast_v11f64_to_v44f16(<11 x double> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v26 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: 
buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen @@ -30760,6 +30747,22 @@ define <44 x half> @bitcast_v11f64_to_v44f16(<11 x double> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -31352,16 +31355,6 @@ define inreg <44 x half> @bitcast_v11f64_to_v44f16_scalar(<11 x double> inreg %a ; SI-NEXT: v_cvt_f32_f16_e32 v50, v56 ; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 ; SI-NEXT: .LBB53_3: ; %end -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 ; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 ; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 @@ -31514,6 +31507,15 @@ define inreg <44 x half> @bitcast_v11f64_to_v44f16_scalar(<11 x double> inreg %a ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: 
buffer_load_dword v47, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB53_4: @@ -34942,23 +34944,7 @@ define <44 x half> @bitcast_v44i16_to_v44f16(<44 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f16_f32_e32 v2, v56 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 @@ -34996,6 +34982,22 @@ define <44 x half> @bitcast_v44i16_to_v44f16(<44 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; 
SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -35797,6 +35799,11 @@ define inreg <44 x half> @bitcast_v44i16_to_v44f16_scalar(<44 x i16> inreg %a, i ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v52 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x54, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload @@ -35813,11 +35820,6 @@ define inreg <44 x half> @bitcast_v44i16_to_v44f16_scalar(<44 x i16> inreg %a, i ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v2, v52 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x54, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB57_4: @@ -36877,19 +36879,9 @@ define <44 x i16> @bitcast_v44f16_to_v44i16(<44 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v26, v26, v27 ; SI-NEXT: v_add_i32_e32 v27, vcc, 4, v0 ; SI-NEXT: buffer_store_dword v26, v27, s[0:3], 0 offen -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v40 -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v26, 0xffff, v50 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v40 ; SI-NEXT: v_or_b32_e32 v26, v26, v27 ; SI-NEXT: v_add_i32_e32 v27, vcc, 8, v0 ; SI-NEXT: buffer_store_dword v26, v27, s[0:3], 0 offen @@ -37002,6 +36994,16 @@ define <44 x i16> @bitcast_v44f16_to_v44i16(<44 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: 
v_add_i32_e32 v0, vcc, 0x54, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -37791,22 +37793,6 @@ define inreg <44 x i16> @bitcast_v44f16_to_v44i16_scalar(<44 x half> inreg %a, i ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v43 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v27 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 @@ -37841,6 +37827,22 @@ define inreg <44 x i16> @bitcast_v44f16_to_v44i16_scalar(<44 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x54, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; 
SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB59_4: diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll index 034b8027851f4..fb2e94fc3b87a 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll @@ -4045,22 +4045,6 @@ define <24 x i32> @bitcast_v48i16_to_v24i32(<48 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v48i16_to_v24i32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v48, v14 -; SI-NEXT: v_mov_b32_e32 v49, v12 -; SI-NEXT: v_mov_b32_e32 v50, v10 -; SI-NEXT: v_mov_b32_e32 v51, v8 -; SI-NEXT: v_mov_b32_e32 v52, v6 -; SI-NEXT: v_mov_b32_e32 v53, v4 -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:60 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill @@ -4085,6 +4069,22 @@ define <24 x i32> @bitcast_v48i16_to_v24i32(<48 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v48, v14 +; SI-NEXT: v_mov_b32_e32 v49, v12 +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v51, v8 +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:60 ; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v3 ; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v5 
@@ -4100,14 +4100,21 @@ define <24 x i32> @bitcast_v48i16_to_v24i32(<48 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v25 ; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v29 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v6 +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v8 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v12 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 @@ -6615,24 +6622,8 @@ define <48 x half> @bitcast_v24i32_to_v48f16(<24 x i32> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 ; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen @@ -6708,6 +6699,22 @@ define <48 x half> @bitcast_v24i32_to_v48f16(<24 x i32> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; 
SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -8172,8 +8179,6 @@ define <24 x i32> @bitcast_v48f16_to_v24i32(<48 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v48f16_to_v24i32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v54, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill @@ -8190,6 +8195,8 @@ define <24 x i32> @bitcast_v48f16_to_v24i32(<48 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v54, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 @@ -8216,34 +8223,34 @@ define <24 x i32> @bitcast_v48f16_to_v24i32(<48 x half> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:60 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68 ; SI-NEXT: v_cvt_f16_f32_e32 v55, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v3 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v52, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v51, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v49, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v48, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v9 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v38, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v37, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v10 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v35, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v34, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v15 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: 
s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v15 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 @@ -13036,9 +13043,6 @@ define inreg <48 x i16> @bitcast_v24f32_to_v48i16_scalar(<24 x float> inreg %a, ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v19, 0xffff, v20 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v40 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v19, v19, v20 ; SI-NEXT: v_add_i32_e32 v20, vcc, 20, v0 ; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen @@ -13150,6 +13154,9 @@ define inreg <48 x i16> @bitcast_v24f32_to_v48i16_scalar(<24 x float> inreg %a, ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x5c, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB29_4: @@ -13875,22 +13882,6 @@ define <24 x float> @bitcast_v48i16_to_v24f32(<48 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v48i16_to_v24f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v48, v14 -; SI-NEXT: v_mov_b32_e32 v49, v12 -; SI-NEXT: v_mov_b32_e32 v50, v10 -; SI-NEXT: v_mov_b32_e32 v51, v8 -; SI-NEXT: v_mov_b32_e32 v52, v6 -; SI-NEXT: v_mov_b32_e32 v53, v4 -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:60 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill @@ -13915,6 +13906,22 @@ define <24 x float> @bitcast_v48i16_to_v24f32(<48 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v48, v14 +; SI-NEXT: v_mov_b32_e32 v49, v12 +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v51, v8 +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 +; 
SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:60 ; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v3 ; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v5 @@ -13930,14 +13937,21 @@ define <24 x float> @bitcast_v48i16_to_v24f32(<48 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v25 ; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v29 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v6 +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v8 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v12 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 @@ -16445,24 +16459,8 @@ define <48 x half> @bitcast_v24f32_to_v48f16(<24 x float> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 ; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen @@ -16538,6 +16536,22 @@ define <48 x half> @bitcast_v24f32_to_v48f16(<24 x float> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword 
v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -17219,27 +17233,13 @@ define inreg <48 x half> @bitcast_v24f32_to_v48f16_scalar(<24 x float> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v55, v55 ; SI-NEXT: v_cvt_f16_f32_e32 v54, v54 ; SI-NEXT: v_cvt_f16_f32_e32 v53, v53 -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 -; SI-NEXT: v_or_b32_e32 v55, v55, v40 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 ; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 ; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 ; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 ; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 +; SI-NEXT: v_or_b32_e32 v55, v55, v40 ; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v54 ; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 ; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 @@ -17395,6 +17395,19 @@ define inreg <48 x half> @bitcast_v24f32_to_v48f16_scalar(<24 x float> inreg %a, ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: 
buffer_load_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB33_4: @@ -18144,8 +18157,6 @@ define <24 x float> @bitcast_v48f16_to_v24f32(<48 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v48f16_to_v24f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v54, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill @@ -18162,6 +18173,8 @@ define <24 x float> @bitcast_v48f16_to_v24f32(<48 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v54, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 @@ -18188,34 +18201,34 @@ define <24 x float> @bitcast_v48f16_to_v24f32(<48 x half> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:60 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68 ; SI-NEXT: v_cvt_f16_f32_e32 v55, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v3 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v52, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v51, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v49, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v48, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v9 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v38, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v37, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v10 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v35, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v34, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v15 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte 
Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v15 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 @@ -22969,22 +22982,6 @@ define <12 x i64> @bitcast_v48i16_to_v12i64(<48 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v48i16_to_v12i64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v48, v14 -; SI-NEXT: v_mov_b32_e32 v49, v12 -; SI-NEXT: v_mov_b32_e32 v50, v10 -; SI-NEXT: v_mov_b32_e32 v51, v8 -; SI-NEXT: v_mov_b32_e32 v52, v6 -; SI-NEXT: v_mov_b32_e32 v53, v4 -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:60 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill @@ -23009,6 +23006,22 @@ define <12 x i64> @bitcast_v48i16_to_v12i64(<48 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v48, v14 +; SI-NEXT: v_mov_b32_e32 v49, v12 +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v51, v8 +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:60 ; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v3 ; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v5 @@ -23024,14 +23037,21 @@ define <12 x i64> @bitcast_v48i16_to_v12i64(<48 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v25 ; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v29 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v6 +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v8 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v12 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 
offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 @@ -25539,24 +25559,8 @@ define <48 x half> @bitcast_v12i64_to_v48f16(<12 x i64> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 ; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen @@ -25632,6 +25636,22 @@ define <48 x half> @bitcast_v12i64_to_v48f16(<12 x i64> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ 
-27108,8 +27128,6 @@ define <12 x i64> @bitcast_v48f16_to_v12i64(<48 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v48f16_to_v12i64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v54, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill @@ -27126,6 +27144,8 @@ define <12 x i64> @bitcast_v48f16_to_v12i64(<48 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v54, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 @@ -27152,34 +27172,34 @@ define <12 x i64> @bitcast_v48f16_to_v12i64(<48 x half> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:60 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68 ; SI-NEXT: v_cvt_f16_f32_e32 v55, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v3 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v52, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v51, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v49, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v48, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v9 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v38, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v37, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v10 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v35, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v34, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v15 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v15 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 @@ -30364,9 +30384,6 @@ define inreg <48 x i16> @bitcast_v12f64_to_v48i16_scalar(<12 x double> inreg %a, ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v19, 0xffff, v20 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v40 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v19, v19, v20 ; SI-NEXT: v_add_i32_e32 v20, vcc, 20, v0 ; SI-NEXT: buffer_store_dword v19, v20, 
s[0:3], 0 offen @@ -30478,6 +30495,9 @@ define inreg <48 x i16> @bitcast_v12f64_to_v48i16_scalar(<12 x double> inreg %a, ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x5c, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB49_4: @@ -31179,22 +31199,6 @@ define <12 x double> @bitcast_v48i16_to_v12f64(<48 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v48i16_to_v12f64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v48, v14 -; SI-NEXT: v_mov_b32_e32 v49, v12 -; SI-NEXT: v_mov_b32_e32 v50, v10 -; SI-NEXT: v_mov_b32_e32 v51, v8 -; SI-NEXT: v_mov_b32_e32 v52, v6 -; SI-NEXT: v_mov_b32_e32 v53, v4 -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:60 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill @@ -31219,6 +31223,22 @@ define <12 x double> @bitcast_v48i16_to_v12f64(<48 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v48, v14 +; SI-NEXT: v_mov_b32_e32 v49, v12 +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v51, v8 +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:60 ; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v3 ; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v5 @@ -31234,14 +31254,21 @@ define <12 x double> @bitcast_v48i16_to_v12f64(<48 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v25 ; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v29 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: 
v_lshlrev_b32_e32 v63, 16, v6 +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v8 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v12 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 @@ -33725,24 +33752,8 @@ define <48 x half> @bitcast_v12f64_to_v48f16(<12 x double> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 ; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen @@ -33818,6 +33829,22 @@ define <48 x half> @bitcast_v12f64_to_v48f16(<12 x double> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded 
Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -34464,28 +34491,13 @@ define inreg <48 x half> @bitcast_v12f64_to_v48f16_scalar(<12 x double> inreg %a ; SI-NEXT: v_cvt_f16_f32_e32 v55, v55 ; SI-NEXT: v_cvt_f16_f32_e32 v54, v54 ; SI-NEXT: v_cvt_f16_f32_e32 v53, v53 -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 -; SI-NEXT: v_or_b32_e32 v55, v55, v40 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 ; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 ; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 ; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 ; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 +; SI-NEXT: v_or_b32_e32 v55, v55, v40 ; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v54 ; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 ; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 @@ -34641,6 +34653,20 @@ define inreg <48 x half> @bitcast_v12f64_to_v48f16_scalar(<12 x double> inreg %a ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte 
Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB53_4: @@ -35366,8 +35392,6 @@ define <12 x double> @bitcast_v48f16_to_v12f64(<48 x half> %a, i32 %b) { ; SI-LABEL: bitcast_v48f16_to_v12f64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v54, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill @@ -35384,6 +35408,8 @@ define <12 x double> @bitcast_v48f16_to_v12f64(<48 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v54, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 @@ -35410,34 +35436,34 @@ define <12 x double> @bitcast_v48f16_to_v12f64(<48 x half> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:60 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68 ; SI-NEXT: v_cvt_f16_f32_e32 v55, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v3 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v52, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v51, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v49, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v48, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v9 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v38, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v37, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v10 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v35, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v34, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v15 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v15 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 @@ -38310,24 +38336,8 @@ define <48 x half> @bitcast_v48i16_to_v48f16(<48 x i16> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v35 ; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; 
SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen @@ -38514,6 +38524,22 @@ define <48 x half> @bitcast_v48i16_to_v48f16(<48 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -39425,8 +39451,14 @@ define inreg <48 x half> @bitcast_v48i16_to_v48f16_scalar(<48 x i16> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v44 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; 
SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload @@ -39443,12 +39475,6 @@ define inreg <48 x half> @bitcast_v48i16_to_v48f16_scalar(<48 x i16> inreg %a, i ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB57_4: @@ -40728,23 +40754,6 @@ define <48 x i16> @bitcast_v48f16_to_v48i16(<48 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v56 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v59 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v23 ; SI-NEXT: v_or_b32_e32 v3, v3, v6 ; SI-NEXT: v_add_i32_e32 v6, vcc, 0x50, v0 @@ -40758,6 +40767,7 @@ define <48 x i16> @bitcast_v48f16_to_v48i16(<48 x half> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v34 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v22 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v59 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v53 ; SI-NEXT: v_or_b32_e32 v3, v3, v4 ; SI-NEXT: v_add_i32_e32 v4, vcc, 0x58, v0 @@ -40765,6 +40775,22 @@ define <48 x i16> @bitcast_v48f16_to_v48i16(<48 x half> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x5c, v0 ; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:76 ; 4-byte 
Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -41229,11 +41255,6 @@ define inreg <48 x i16> @bitcast_v48f16_to_v48i16_scalar(<48 x half> inreg %a, i ; SI-LABEL: bitcast_v48f16_to_v48i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill @@ -41250,6 +41271,11 @@ define inreg <48 x i16> @bitcast_v48f16_to_v48i16_scalar(<48 x half> inreg %a, i ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:8 ; SI-NEXT: s_waitcnt expcnt(2) ; SI-NEXT: v_cvt_f16_f32_e32 v61, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v55, v3 @@ -41294,12 +41320,16 @@ define inreg <48 x i16> @bitcast_v48f16_to_v48i16_scalar(<48 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v50, s25 ; SI-NEXT: v_cvt_f16_f32_e32 v16, s26 ; SI-NEXT: v_cvt_f16_f32_e32 v29, s29 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cvt_f16_f32_e32 v31, v32 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_cvt_f16_f32_e32 v43, v33 ; SI-NEXT: v_cvt_f16_f32_e32 v32, v20 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v25, v35 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v37 ; SI-NEXT: 
v_cvt_f16_f32_e32 v20, s22 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec @@ -41664,6 +41694,9 @@ define inreg <48 x i16> @bitcast_v48f16_to_v48i16_scalar(<48 x half> inreg %a, i ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v43 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x5c, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload @@ -41680,9 +41713,6 @@ define inreg <48 x i16> @bitcast_v48f16_to_v48i16_scalar(<48 x half> inreg %a, i ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x5c, v0 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB59_4: diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll index 8b6210d6a817a..07cdbef82d892 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll @@ -2741,14 +2741,9 @@ define <52 x i16> @bitcast_v26i32_to_v52i16(<26 x i32> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -2860,6 +2855,11 @@ define <52 x i16> @bitcast_v26i32_to_v52i16(<26 x i32> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -4341,19 +4341,6 @@ define <26 x i32> @bitcast_v52i16_to_v26i32(<52 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v52i16_to_v26i32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v50, v10 -; SI-NEXT: v_mov_b32_e32 v51, v8 -; SI-NEXT: v_mov_b32_e32 v52, v6 -; SI-NEXT: v_mov_b32_e32 v53, v4 -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, 
v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:12 -; SI-NEXT: v_mov_b32_e32 v49, v12 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill @@ -4379,6 +4366,19 @@ define <26 x i32> @bitcast_v52i16_to_v26i32(<52 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v51, v8 +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:12 +; SI-NEXT: v_mov_b32_e32 v49, v12 ; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v3 ; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v5 @@ -4394,12 +4394,17 @@ define <26 x i32> @bitcast_v52i16_to_v26i32(<52 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v25 ; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v29 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v6 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v8 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:8 @@ -4424,10 +4429,9 @@ define <26 x i32> @bitcast_v52i16_to_v26i32(<52 x i16> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v18 ; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v18 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 @@ -4439,9 +4443,10 @@ define <26 x i32> @bitcast_v52i16_to_v26i32(<52 x i16> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:56 ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:52 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_store_dword v0, 
off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v22 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:44 @@ -5027,6 +5032,7 @@ define <26 x i32> @bitcast_v52i16_to_v26i32(<52 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v57, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v25 +; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v24 @@ -5093,7 +5099,6 @@ define <26 x i32> @bitcast_v52i16_to_v26i32(<52 x i16> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v56 ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v57 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 -; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -5226,9 +5231,6 @@ define <26 x i32> @bitcast_v52i16_to_v26i32(<52 x i16> %a, i32 %b) { ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB14_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload @@ -5243,6 +5245,9 @@ define <26 x i32> @bitcast_v52i16_to_v26i32(<52 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v34, v57, s6 ; GFX9-NEXT: v_perm_b32 v1, v62, v56, s6 ; GFX9-NEXT: v_perm_b32 v2, v33, v47, s6 @@ -5261,10 +5266,6 @@ define <26 x i32> @bitcast_v52i16_to_v26i32(<52 x i16> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(14) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_perm_b32 v9, v9, v40, s6 ; GFX9-NEXT: s_waitcnt 
vmcnt(14) @@ -5293,6 +5294,10 @@ define <26 x i32> @bitcast_v52i16_to_v26i32(<52 x i16> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v21, v21, v36, s6 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_perm_b32 v22, v22, v35, s6 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] @@ -7258,6 +7263,11 @@ define <52 x half> @bitcast_v26i32_to_v52f16(<26 x i32> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v51 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -7274,11 +7284,6 @@ define <52 x half> @bitcast_v26i32_to_v52f16(<26 x i32> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v2, v51 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -8046,34 +8051,29 @@ define inreg <52 x half> @bitcast_v26i32_to_v52f16_scalar(<26 x i32> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v41, v41 ; SI-NEXT: v_cvt_f16_f32_e32 v40, v40 ; SI-NEXT: v_cvt_f16_f32_e32 v55, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v54 ; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 ; SI-NEXT: v_or_b32_e32 v43, v43, v44 ; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 ; SI-NEXT: buffer_store_dword v43, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v43, vcc, 4, v0 ; SI-NEXT: v_or_b32_e32 v41, v41, v42 ; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 ; SI-NEXT: buffer_store_dword v41, v43, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v41, vcc, 8, v0 ; SI-NEXT: v_or_b32_e32 v55, v55, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v54 -; SI-NEXT: buffer_store_dword v55, v41, s[0:3], 0 offen -; SI-NEXT: v_cvt_f16_f32_e32 v53, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 -; SI-NEXT: 
v_cvt_f16_f32_e32 v49, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 ; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v54 ; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 ; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 +; SI-NEXT: buffer_store_dword v55, v41, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v55, vcc, 12, v0 ; SI-NEXT: v_or_b32_e32 v53, v53, v54 @@ -8225,6 +8225,11 @@ define inreg <52 x half> @bitcast_v26i32_to_v52f16_scalar(<26 x i32> inreg %a, i ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB17_4: @@ -9755,6 +9760,7 @@ define <26 x i32> @bitcast_v52f16_to_v26i32(<52 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v57, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v25 +; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v24 @@ -9821,7 +9827,6 @@ define <26 x i32> @bitcast_v52f16_to_v26i32(<52 x half> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v56 ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v57 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 -; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -9954,9 +9959,6 @@ define <26 x i32> @bitcast_v52f16_to_v26i32(<52 x half> %a, i32 %b) { ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB18_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload @@ -9971,6 +9973,9 @@ define <26 x i32> @bitcast_v52f16_to_v26i32(<52 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte 
Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v34, v57, s6 ; GFX9-NEXT: s_movk_i32 s7, 0x200 ; GFX9-NEXT: v_perm_b32 v1, v62, v56, s6 @@ -9990,10 +9995,6 @@ define <26 x i32> @bitcast_v52f16_to_v26i32(<52 x half> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_f16 v6, v6, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v7, v7, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v8, v8, s7 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(14) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_perm_b32 v9, v9, v40, s6 ; GFX9-NEXT: s_waitcnt vmcnt(14) @@ -10022,6 +10023,10 @@ define <26 x i32> @bitcast_v52f16_to_v26i32(<52 x half> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v21, v21, v36, s6 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_perm_b32 v22, v22, v35, s6 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; GFX9-NEXT: v_pk_add_f16 v9, v9, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v10, v10, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v11, v11, s7 op_sel_hi:[1,0] @@ -10290,28 +10295,14 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v8, s26 ; SI-NEXT: v_cvt_f16_f32_e32 v6, s29 ; SI-NEXT: v_cvt_f16_f32_e32 v7, s28 -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: s_waitcnt vmcnt(8) expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v38 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(8) expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v39 +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v44 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill 
@@ -10327,6 +10318,22 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v51 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB19_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload @@ -10335,8 +10342,8 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v53 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v12 @@ -10356,8 +10363,10 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v42 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v56 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v43 +; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v57 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v62 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v36 @@ -10398,11 +10407,11 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v25, v38, v25 ; SI-NEXT: s_cbranch_execnz .LBB19_3 ; SI-NEXT: .LBB19_2: ; %cmp.true +; SI-NEXT: s_waitcnt expcnt(2) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v53 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v40 ; 
SI-NEXT: v_cvt_f32_f16_e32 v10, v55 @@ -10416,6 +10425,7 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v12, v47 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v60 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v52 @@ -10453,6 +10463,7 @@ define inreg <26 x i32> @bitcast_v52f16_to_v26i32_scalar(<52 x half> inreg %a, i ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload @@ -13345,14 +13356,9 @@ define <52 x i16> @bitcast_v26f32_to_v52i16(<26 x float> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -13464,6 +13470,11 @@ define <52 x i16> @bitcast_v26f32_to_v52i16(<26 x float> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -14198,14 +14209,6 @@ define inreg <52 x i16> @bitcast_v26f32_to_v52i16_scalar(<26 x float> inreg %a, ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 
; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -14269,6 +14272,14 @@ define inreg <52 x i16> @bitcast_v26f32_to_v52i16_scalar(<26 x float> inreg %a, ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB29_4: @@ -15065,19 +15076,6 @@ define <26 x float> @bitcast_v52i16_to_v26f32(<52 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v52i16_to_v26f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v50, v10 -; SI-NEXT: v_mov_b32_e32 v51, v8 -; SI-NEXT: v_mov_b32_e32 v52, v6 -; SI-NEXT: v_mov_b32_e32 v53, v4 -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:12 -; SI-NEXT: v_mov_b32_e32 v49, v12 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill @@ -15103,6 +15101,19 @@ define <26 x float> @bitcast_v52i16_to_v26f32(<52 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v51, v8 +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:12 +; SI-NEXT: v_mov_b32_e32 v49, v12 ; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v3 ; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v5 @@ -15118,12 +15129,17 @@ define <26 x float> @bitcast_v52i16_to_v26f32(<52 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v25 ; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v29 -; SI-NEXT: s_waitcnt vmcnt(14) +; 
SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v6 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v8 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:8 @@ -15148,10 +15164,9 @@ define <26 x float> @bitcast_v52i16_to_v26f32(<52 x i16> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v18 ; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v18 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 @@ -15163,9 +15178,10 @@ define <26 x float> @bitcast_v52i16_to_v26f32(<52 x i16> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:56 ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:52 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v22 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:44 @@ -15751,6 +15767,7 @@ define <26 x float> @bitcast_v52i16_to_v26f32(<52 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v57, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v25 +; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v24 @@ -15817,7 +15834,6 @@ define <26 x float> @bitcast_v52i16_to_v26f32(<52 x i16> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v56 ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v57 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 -; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -15950,9 +15966,6 @@ define <26 x float> @bitcast_v52i16_to_v26f32(<52 x i16> %a, i32 %b) { ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB30_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded 
Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload @@ -15967,6 +15980,9 @@ define <26 x float> @bitcast_v52i16_to_v26f32(<52 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v34, v57, s6 ; GFX9-NEXT: v_perm_b32 v1, v62, v56, s6 ; GFX9-NEXT: v_perm_b32 v2, v33, v47, s6 @@ -15985,10 +16001,6 @@ define <26 x float> @bitcast_v52i16_to_v26f32(<52 x i16> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(14) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_perm_b32 v9, v9, v40, s6 ; GFX9-NEXT: s_waitcnt vmcnt(14) @@ -16017,6 +16029,10 @@ define <26 x float> @bitcast_v52i16_to_v26f32(<52 x i16> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v21, v21, v36, s6 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_perm_b32 v22, v22, v35, s6 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] @@ -17982,6 +17998,11 @@ define <52 x half> @bitcast_v26f32_to_v52f16(<26 x float> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v51 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -17998,11 +18019,6 @@ define <52 x half> @bitcast_v26f32_to_v52f16(<26 x float> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v2, v51 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -18922,9 
+18938,14 @@ define inreg <52 x half> @bitcast_v26f32_to_v52f16_scalar(<26 x float> inreg %a, ; SI-NEXT: v_add_i32_e32 v4, vcc, 0x60, v0 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v45 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -18941,11 +18962,6 @@ define inreg <52 x half> @bitcast_v26f32_to_v52f16_scalar(<26 x float> inreg %a, ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB33_4: @@ -20637,6 +20653,7 @@ define <26 x float> @bitcast_v52f16_to_v26f32(<52 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v57, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v25 +; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v24 @@ -20703,7 +20720,6 @@ define <26 x float> @bitcast_v52f16_to_v26f32(<52 x half> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v56 ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v57 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 -; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -20836,9 +20852,6 @@ define <26 x float> @bitcast_v52f16_to_v26f32(<52 x half> %a, i32 %b) { ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB34_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload @@ -20853,6 +20866,9 @@ define <26 x float> @bitcast_v52f16_to_v26f32(<52 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded 
Reload ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v34, v57, s6 ; GFX9-NEXT: s_movk_i32 s7, 0x200 ; GFX9-NEXT: v_perm_b32 v1, v62, v56, s6 @@ -20872,10 +20888,6 @@ define <26 x float> @bitcast_v52f16_to_v26f32(<52 x half> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_f16 v6, v6, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v7, v7, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v8, v8, s7 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(14) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_perm_b32 v9, v9, v40, s6 ; GFX9-NEXT: s_waitcnt vmcnt(14) @@ -20904,6 +20916,10 @@ define <26 x float> @bitcast_v52f16_to_v26f32(<52 x half> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v21, v21, v36, s6 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_perm_b32 v22, v22, v35, s6 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; GFX9-NEXT: v_pk_add_f16 v9, v9, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v10, v10, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v11, v11, s7 op_sel_hi:[1,0] @@ -21172,28 +21188,14 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v8, s26 ; SI-NEXT: v_cvt_f16_f32_e32 v6, s29 ; SI-NEXT: v_cvt_f16_f32_e32 v7, s28 -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: s_waitcnt vmcnt(8) expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v38 ; SI-NEXT: buffer_store_dword v0, off, 
s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(8) expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v39 +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v44 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill @@ -21209,6 +21211,22 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a, ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v51 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB35_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload @@ -21217,8 +21235,8 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a, ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v53 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v12 @@ -21238,8 +21256,10 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a, ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v42 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v56 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v43 +; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v57 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v62 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v36 @@ -21280,11 +21300,11 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a, ; SI-NEXT: v_or_b32_e32 v25, v38, v25 ; SI-NEXT: s_cbranch_execnz .LBB35_3 ; SI-NEXT: .LBB35_2: ; %cmp.true +; SI-NEXT: s_waitcnt expcnt(2) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: 
buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v53 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v40 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v55 @@ -21298,6 +21318,7 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v12, v47 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v60 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v52 @@ -21335,6 +21356,7 @@ define inreg <26 x float> @bitcast_v52f16_to_v26f32_scalar(<52 x half> inreg %a, ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload @@ -23387,14 +23409,9 @@ define <52 x i16> @bitcast_v13i64_to_v52i16(<13 x i64> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -23506,6 +23523,11 @@ define <52 x i16> @bitcast_v13i64_to_v52i16(<13 x i64> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -25001,19 +25023,6 @@ define <13 x i64> @bitcast_v52i16_to_v13i64(<52 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v52i16_to_v13i64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v50, v10 -; SI-NEXT: v_mov_b32_e32 v51, v8 -; SI-NEXT: v_mov_b32_e32 v52, v6 -; SI-NEXT: v_mov_b32_e32 v53, v4 -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 
offset:40 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:12 -; SI-NEXT: v_mov_b32_e32 v49, v12 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill @@ -25039,6 +25048,19 @@ define <13 x i64> @bitcast_v52i16_to_v13i64(<52 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v51, v8 +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:12 +; SI-NEXT: v_mov_b32_e32 v49, v12 ; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v3 ; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v5 @@ -25054,12 +25076,17 @@ define <13 x i64> @bitcast_v52i16_to_v13i64(<52 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v25 ; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v29 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v6 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v8 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:8 @@ -25084,10 +25111,9 @@ define <13 x i64> @bitcast_v52i16_to_v13i64(<52 x i16> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v18 ; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v18 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 @@ -25099,9 +25125,10 @@ define <13 x i64> @bitcast_v52i16_to_v13i64(<52 x i16> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:56 ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:52 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 
v61, 16, v22 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:44 @@ -25687,6 +25714,7 @@ define <13 x i64> @bitcast_v52i16_to_v13i64(<52 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v57, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v25 +; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v24 @@ -25753,7 +25781,6 @@ define <13 x i64> @bitcast_v52i16_to_v13i64(<52 x i16> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v56 ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v57 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 -; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -25886,9 +25913,6 @@ define <13 x i64> @bitcast_v52i16_to_v13i64(<52 x i16> %a, i32 %b) { ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB42_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload @@ -25903,6 +25927,9 @@ define <13 x i64> @bitcast_v52i16_to_v13i64(<52 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v34, v57, s6 ; GFX9-NEXT: v_perm_b32 v1, v62, v56, s6 ; GFX9-NEXT: v_perm_b32 v2, v33, v47, s6 @@ -25921,10 +25948,6 @@ define <13 x i64> @bitcast_v52i16_to_v13i64(<52 x i16> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(14) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_perm_b32 v9, v9, v40, s6 ; GFX9-NEXT: s_waitcnt vmcnt(14) @@ -25953,6 +25976,10 @@ define <13 x i64> @bitcast_v52i16_to_v13i64(<52 x i16> %a, i32 %b) 
{ ; GFX9-NEXT: v_perm_b32 v21, v21, v36, s6 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_perm_b32 v22, v22, v35, s6 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] @@ -27919,6 +27946,11 @@ define <52 x half> @bitcast_v13i64_to_v52f16(<13 x i64> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v51 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -27935,11 +27967,6 @@ define <52 x half> @bitcast_v13i64_to_v52f16(<13 x i64> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v2, v51 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -28721,34 +28748,29 @@ define inreg <52 x half> @bitcast_v13i64_to_v52f16_scalar(<13 x i64> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v41, v41 ; SI-NEXT: v_cvt_f16_f32_e32 v40, v40 ; SI-NEXT: v_cvt_f16_f32_e32 v55, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v54 ; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 ; SI-NEXT: v_or_b32_e32 v43, v43, v44 ; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 ; SI-NEXT: buffer_store_dword v43, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v43, vcc, 4, v0 ; SI-NEXT: v_or_b32_e32 v41, v41, v42 ; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 ; SI-NEXT: buffer_store_dword v41, v43, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v41, vcc, 8, v0 ; SI-NEXT: v_or_b32_e32 v55, v55, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v54 -; SI-NEXT: buffer_store_dword v55, v41, s[0:3], 0 offen -; SI-NEXT: v_cvt_f16_f32_e32 v53, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 -; SI-NEXT: 
v_cvt_f16_f32_e32 v39, v39 ; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v54 ; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 ; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 +; SI-NEXT: buffer_store_dword v55, v41, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v55, vcc, 12, v0 ; SI-NEXT: v_or_b32_e32 v53, v53, v54 @@ -28900,6 +28922,11 @@ define inreg <52 x half> @bitcast_v13i64_to_v52f16_scalar(<13 x i64> inreg %a, i ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB45_4: @@ -30430,6 +30457,7 @@ define <13 x i64> @bitcast_v52f16_to_v13i64(<52 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v57, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v25 +; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v24 @@ -30496,7 +30524,6 @@ define <13 x i64> @bitcast_v52f16_to_v13i64(<52 x half> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v56 ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v57 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 -; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -30629,9 +30656,6 @@ define <13 x i64> @bitcast_v52f16_to_v13i64(<52 x half> %a, i32 %b) { ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB46_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload @@ -30646,6 +30670,9 @@ define <13 x i64> @bitcast_v52f16_to_v13i64(<52 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: 
v_perm_b32 v0, v34, v57, s6 ; GFX9-NEXT: s_movk_i32 s7, 0x200 ; GFX9-NEXT: v_perm_b32 v1, v62, v56, s6 @@ -30665,10 +30692,6 @@ define <13 x i64> @bitcast_v52f16_to_v13i64(<52 x half> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_f16 v6, v6, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v7, v7, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v8, v8, s7 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(14) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_perm_b32 v9, v9, v40, s6 ; GFX9-NEXT: s_waitcnt vmcnt(14) @@ -30697,6 +30720,10 @@ define <13 x i64> @bitcast_v52f16_to_v13i64(<52 x half> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v21, v21, v36, s6 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_perm_b32 v22, v22, v35, s6 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; GFX9-NEXT: v_pk_add_f16 v9, v9, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v10, v10, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v11, v11, s7 op_sel_hi:[1,0] @@ -30965,28 +30992,14 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v8, s26 ; SI-NEXT: v_cvt_f16_f32_e32 v6, s29 ; SI-NEXT: v_cvt_f16_f32_e32 v7, s28 -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: s_waitcnt vmcnt(8) expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v38 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(8) expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v39 +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v44 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill @@ -31002,6 +31015,22 @@ define inreg <13 x i64> 
@bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v51 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB47_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload @@ -31010,8 +31039,8 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v53 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v12 @@ -31031,8 +31060,10 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v42 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v56 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v43 +; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v57 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v62 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v36 @@ -31073,11 +31104,11 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v25, v38, v25 ; SI-NEXT: s_cbranch_execnz .LBB47_3 ; SI-NEXT: .LBB47_2: ; %cmp.true +; SI-NEXT: s_waitcnt expcnt(2) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v53 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v40 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v55 @@ -31091,6 
+31122,7 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v12, v47 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v60 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v52 @@ -31128,6 +31160,7 @@ define inreg <13 x i64> @bitcast_v52f16_to_v13i64_scalar(<52 x half> inreg %a, i ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload @@ -32365,14 +32398,9 @@ define <52 x i16> @bitcast_v13f64_to_v52i16(<13 x double> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -32484,6 +32512,11 @@ define <52 x i16> @bitcast_v13f64_to_v52i16(<13 x double> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -33179,14 +33212,6 @@ define inreg <52 x i16> @bitcast_v13f64_to_v52i16_scalar(<13 x double> inreg %a, ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload ; SI-NEXT: 
v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -33250,6 +33275,14 @@ define inreg <52 x i16> @bitcast_v13f64_to_v52i16_scalar(<13 x double> inreg %a, ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB49_4: @@ -34020,19 +34053,6 @@ define <13 x double> @bitcast_v52i16_to_v13f64(<52 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v52i16_to_v13f64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v50, v10 -; SI-NEXT: v_mov_b32_e32 v51, v8 -; SI-NEXT: v_mov_b32_e32 v52, v6 -; SI-NEXT: v_mov_b32_e32 v53, v4 -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:12 -; SI-NEXT: v_mov_b32_e32 v49, v12 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill @@ -34058,6 +34078,19 @@ define <13 x double> @bitcast_v52i16_to_v13f64(<52 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v50, v10 +; SI-NEXT: v_mov_b32_e32 v51, v8 +; SI-NEXT: v_mov_b32_e32 v52, v6 +; SI-NEXT: v_mov_b32_e32 v53, v4 +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:12 +; SI-NEXT: v_mov_b32_e32 v49, v12 ; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v3 ; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v5 @@ -34073,12 +34106,17 @@ define <13 x double> @bitcast_v52i16_to_v13f64(<52 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v25 ; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v29 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(5) ; 
SI-NEXT: v_lshlrev_b32_e32 v62, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v6 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v8 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:8 @@ -34103,10 +34141,9 @@ define <13 x double> @bitcast_v52i16_to_v13f64(<52 x i16> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v18 ; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v18 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 @@ -34118,9 +34155,10 @@ define <13 x double> @bitcast_v52i16_to_v13f64(<52 x i16> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:56 ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:52 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v22 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:44 @@ -34706,6 +34744,7 @@ define <13 x double> @bitcast_v52i16_to_v13f64(<52 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v57, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v25 +; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v24 @@ -34772,7 +34811,6 @@ define <13 x double> @bitcast_v52i16_to_v13f64(<52 x i16> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v56 ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v57 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 -; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -34905,9 +34943,6 @@ define <13 x double> @bitcast_v52i16_to_v13f64(<52 x i16> %a, i32 %b) { ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB50_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 
s6, 0x5040100 ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload @@ -34922,6 +34957,9 @@ define <13 x double> @bitcast_v52i16_to_v13f64(<52 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v34, v57, s6 ; GFX9-NEXT: v_perm_b32 v1, v62, v56, s6 ; GFX9-NEXT: v_perm_b32 v2, v33, v47, s6 @@ -34940,10 +34978,6 @@ define <13 x double> @bitcast_v52i16_to_v13f64(<52 x i16> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(14) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_perm_b32 v9, v9, v40, s6 ; GFX9-NEXT: s_waitcnt vmcnt(14) @@ -34972,6 +35006,10 @@ define <13 x double> @bitcast_v52i16_to_v13f64(<52 x i16> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v21, v21, v36, s6 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_perm_b32 v22, v22, v35, s6 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] @@ -36911,6 +36949,11 @@ define <52 x half> @bitcast_v13f64_to_v52f16(<13 x double> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v51 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -36927,11 +36970,6 @@ define <52 x half> @bitcast_v13f64_to_v52f16(<13 x double> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v2, v51 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -37812,9 +37850,14 @@ define inreg <52 x 
half> @bitcast_v13f64_to_v52f16_scalar(<13 x double> inreg %a ; SI-NEXT: v_add_i32_e32 v4, vcc, 0x60, v0 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v45 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -37831,11 +37874,6 @@ define inreg <52 x half> @bitcast_v13f64_to_v52f16_scalar(<13 x double> inreg %a ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB53_4: @@ -39501,6 +39539,7 @@ define <13 x double> @bitcast_v52f16_to_v13f64(<52 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v57, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v25 +; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v24 @@ -39567,7 +39606,6 @@ define <13 x double> @bitcast_v52f16_to_v13f64(<52 x half> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v56 ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v57 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 -; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -39700,9 +39738,6 @@ define <13 x double> @bitcast_v52f16_to_v13f64(<52 x half> %a, i32 %b) { ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB54_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload @@ -39717,6 +39752,9 @@ define <13 x double> @bitcast_v52f16_to_v13f64(<52 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; GFX9-NEXT: 
buffer_load_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v34, v57, s6 ; GFX9-NEXT: s_movk_i32 s7, 0x200 ; GFX9-NEXT: v_perm_b32 v1, v62, v56, s6 @@ -39736,10 +39774,6 @@ define <13 x double> @bitcast_v52f16_to_v13f64(<52 x half> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_f16 v6, v6, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v7, v7, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v8, v8, s7 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(14) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_perm_b32 v9, v9, v40, s6 ; GFX9-NEXT: s_waitcnt vmcnt(14) @@ -39768,6 +39802,10 @@ define <13 x double> @bitcast_v52f16_to_v13f64(<52 x half> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v21, v21, v36, s6 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_perm_b32 v22, v22, v35, s6 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; GFX9-NEXT: v_pk_add_f16 v9, v9, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v10, v10, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v11, v11, s7 op_sel_hi:[1,0] @@ -40036,28 +40074,14 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a ; SI-NEXT: v_cvt_f16_f32_e32 v8, s26 ; SI-NEXT: v_cvt_f16_f32_e32 v6, s29 ; SI-NEXT: v_cvt_f16_f32_e32 v7, s28 -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: s_waitcnt vmcnt(8) expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v38 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 
; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(8) expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v39 +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v44 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill @@ -40073,6 +40097,22 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v51 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB55_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload @@ -40081,8 +40121,8 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v53 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v12 @@ -40102,8 +40142,10 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v42 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v56 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v43 +; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v57 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v15 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v62 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v36 @@ -40144,11 +40186,11 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a ; SI-NEXT: v_or_b32_e32 v25, v38, v25 ; SI-NEXT: s_cbranch_execnz .LBB55_3 ; SI-NEXT: .LBB55_2: ; %cmp.true +; SI-NEXT: s_waitcnt expcnt(2) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, 
off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v53 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v40 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v55 @@ -40162,6 +40204,7 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v12, v47 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v60 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v52 @@ -40199,6 +40242,7 @@ define inreg <13 x double> @bitcast_v52f16_to_v13f64_scalar(<52 x half> inreg %a ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload @@ -41826,24 +41870,8 @@ define <52 x half> @bitcast_v52i16_to_v52f16(<52 x i16> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 ; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen @@ -42074,6 +42102,22 @@ define <52 x half> @bitcast_v52i16_to_v52f16(<52 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 
offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -43294,7 +43338,23 @@ define inreg <52 x half> @bitcast_v52i16_to_v52f16_scalar(<52 x i16> inreg %a, i ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v58 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v31 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload @@ -43311,22 +43371,6 @@ define inreg <52 x half> @bitcast_v52i16_to_v52f16_scalar(<52 x i16> inreg %a, i ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v31 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: 
buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -44640,25 +44684,9 @@ define <52 x i16> @bitcast_v52f16_to_v52i16(<52 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v4 ; SI-NEXT: v_add_i32_e32 v4, vcc, 0x50, v0 ; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v62 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v62 ; SI-NEXT: v_or_b32_e32 v1, v1, v4 ; SI-NEXT: v_add_i32_e32 v4, vcc, 0x54, v0 ; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen @@ -44686,6 +44714,22 @@ define <52 x i16> @bitcast_v52f16_to_v52i16(<52 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, 
off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -45204,15 +45248,6 @@ define inreg <52 x i16> @bitcast_v52f16_to_v52i16_scalar(<52 x half> inreg %a, i ; SI-LABEL: bitcast_v52f16_to_v52i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:24 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill @@ -45229,6 +45264,15 @@ define inreg <52 x i16> @bitcast_v52f16_to_v52i16_scalar(<52 x half> inreg %a, i ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:24 ; SI-NEXT: s_waitcnt expcnt(5) ; SI-NEXT: v_cvt_f16_f32_e32 v58, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v3 @@ -45273,19 +45317,26 @@ define inreg <52 x i16> @bitcast_v52f16_to_v52i16_scalar(<52 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v41, s21 ; SI-NEXT: v_cvt_f16_f32_e32 v16, s26 ; SI-NEXT: v_cvt_f16_f32_e32 v54, s29 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_cvt_f16_f32_e32 v53, v32 +; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_cvt_f16_f32_e32 v32, v33 +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_cvt_f16_f32_e32 v30, v35 ; SI-NEXT: v_cvt_f16_f32_e32 v35, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v33, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v31, v28 +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_cvt_f16_f32_e32 v55, v36 +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_cvt_f16_f32_e32 v4, v38 +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cvt_f16_f32_e32 v27, v39 -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(2) expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v49 ; SI-NEXT: v_cvt_f16_f32_e32 v24, s18 ; SI-NEXT: v_cvt_f16_f32_e32 v20, s22 @@ -45678,25 +45729,9 @@ define inreg <52 x i16> @bitcast_v52f16_to_v52i16_scalar(<52 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v57 -; SI-NEXT: 
buffer_load_dword v63, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v57 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen @@ -45724,6 +45759,22 @@ define inreg <52 x i16> @bitcast_v52f16_to_v52i16_scalar(<52 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB59_4: diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll index 09cf27810a5c9..8eb71e90f8504 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll 
@@ -2928,18 +2928,9 @@ define <56 x i16> @bitcast_v28i32_to_v56i16(<28 x i32> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -3039,6 +3030,15 @@ define <56 x i16> @bitcast_v28i32_to_v56i16(<28 x i32> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -4665,11 +4665,6 @@ define <28 x i32> @bitcast_v56i16_to_v28i32(<56 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v56i16_to_v28i32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 -; SI-NEXT: v_mov_b32_e32 v53, v4 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill @@ -4699,6 +4694,11 @@ define <28 x i32> @bitcast_v56i16_to_v28i32(<56 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 +; SI-NEXT: v_mov_b32_e32 v53, 
v4 ; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v3 ; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v5 @@ -4715,8 +4715,9 @@ define <28 x i32> @bitcast_v56i16_to_v28i32(<56 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v29 ; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 @@ -5412,6 +5413,7 @@ define <28 x i32> @bitcast_v56i16_to_v28i32(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v59, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v27 +; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v26 @@ -5484,7 +5486,6 @@ define <28 x i32> @bitcast_v56i16_to_v28i32(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v58 ; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v59 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 -; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -5633,9 +5634,6 @@ define <28 x i32> @bitcast_v56i16_to_v28i32(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB14_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload @@ -5650,6 +5648,9 @@ define <28 x i32> @bitcast_v56i16_to_v28i32(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v36, v59, s6 ; GFX9-NEXT: v_perm_b32 v1, v35, v58, s6 ; GFX9-NEXT: v_perm_b32 v2, v62, v57, s6 @@ -5668,10 +5669,6 @@ define <28 x i32> @bitcast_v56i16_to_v28i32(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(14) 
-; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_perm_b32 v9, v9, v42, s6 ; GFX9-NEXT: s_waitcnt vmcnt(14) @@ -5700,6 +5697,10 @@ define <28 x i32> @bitcast_v56i16_to_v28i32(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v21, v21, v38, s6 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_perm_b32 v22, v22, v37, s6 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] @@ -7829,7 +7830,21 @@ define <56 x half> @bitcast_v28i32_to_v56f16(<28 x i32> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v33 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v45 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v29 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -7846,20 +7861,6 @@ define <56 x half> @bitcast_v28i32_to_v56f16(<28 x i32> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v33 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v29 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -8742,15 +8743,6 @@ define inreg <56 x half> @bitcast_v28i32_to_v56f16_scalar(<28 x i32> inreg %a, i ; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 ; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 ; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:12 ; 4-byte 
Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload ; SI-NEXT: buffer_store_dword v53, v55, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v53, vcc, 24, v0 @@ -8896,6 +8888,15 @@ define inreg <56 x half> @bitcast_v28i32_to_v56f16_scalar(<28 x i32> inreg %a, i ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB17_4: @@ -10559,6 +10560,7 @@ define <28 x i32> @bitcast_v56f16_to_v28i32(<56 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v59, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v27 +; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v26 @@ -10631,7 +10633,6 @@ define <28 x i32> @bitcast_v56f16_to_v28i32(<56 x half> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v58 ; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v59 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 -; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -10780,9 +10781,6 @@ define <28 x i32> @bitcast_v56f16_to_v28i32(<56 x half> %a, i32 %b) { ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB18_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:148 ; 4-byte 
Folded Reload @@ -10797,6 +10795,9 @@ define <28 x i32> @bitcast_v56f16_to_v28i32(<56 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v36, v59, s6 ; GFX9-NEXT: s_movk_i32 s7, 0x200 ; GFX9-NEXT: v_perm_b32 v1, v35, v58, s6 @@ -10816,10 +10817,6 @@ define <28 x i32> @bitcast_v56f16_to_v28i32(<56 x half> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_f16 v6, v6, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v7, v7, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v8, v8, s7 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(14) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_perm_b32 v9, v9, v42, s6 ; GFX9-NEXT: s_waitcnt vmcnt(14) @@ -10848,6 +10845,10 @@ define <28 x i32> @bitcast_v56f16_to_v28i32(<56 x half> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v21, v21, v38, s6 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_perm_b32 v22, v22, v37, s6 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; GFX9-NEXT: v_pk_add_f16 v9, v9, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v10, v10, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v11, v11, s7 op_sel_hi:[1,0] @@ -11147,20 +11148,7 @@ define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v8, s26 ; SI-NEXT: v_cvt_f16_f32_e32 v6, s29 ; SI-NEXT: v_cvt_f16_f32_e32 v7, s28 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -11168,6 +11156,7 @@ define inreg <28 x i32> 
@bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v51 +; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v61 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill @@ -11199,6 +11188,19 @@ define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v0, v43 ; SI-NEXT: v_cvt_f16_f32_e32 v43, s17 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB19_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 @@ -11215,11 +11217,11 @@ define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_mov_b32_e32 v48, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_mov_b32_e32 v61, v44 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v43 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v54 ; SI-NEXT: v_mov_b32_e32 v39, v11 @@ -11297,7 +11299,6 @@ define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v27, v50, v27 ; SI-NEXT: s_cbranch_execnz .LBB19_3 ; SI-NEXT: .LBB19_2: ; %cmp.true -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v43 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v54 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v55 @@ -11316,6 +11317,7 @@ define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v1, v3, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v49 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v39 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v4, v33 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v47 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 @@ -11583,6 +11585,7 @@ define inreg <28 x i32> @bitcast_v56f16_to_v28i32_scalar(<56 x half> inreg %a, i ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(6) 
; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload @@ -14431,18 +14434,9 @@ define <56 x i16> @bitcast_v28f32_to_v56i16(<28 x float> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -14542,6 +14536,15 @@ define <56 x i16> @bitcast_v28f32_to_v56i16(<28 x float> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -15379,21 +15382,9 @@ define inreg <56 x i16> @bitcast_v28f32_to_v56i16_scalar(<28 x float> inreg %a, ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, 
off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -15421,6 +15412,18 @@ define inreg <56 x i16> @bitcast_v28f32_to_v56i16_scalar(<28 x float> inreg %a, ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB29_4: @@ -16287,11 +16290,6 @@ define <28 x float> @bitcast_v56i16_to_v28f32(<56 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v56i16_to_v28f32: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 -; SI-NEXT: v_mov_b32_e32 v53, v4 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill @@ -16321,6 +16319,11 @@ define <28 x float> @bitcast_v56i16_to_v28f32(<56 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 +; SI-NEXT: v_mov_b32_e32 v53, v4 ; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v3 ; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v5 @@ -16337,8 +16340,9 @@ define <28 x float> @bitcast_v56i16_to_v28f32(<56 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v29 ; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: 
s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 @@ -17034,6 +17038,7 @@ define <28 x float> @bitcast_v56i16_to_v28f32(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v59, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v27 +; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v26 @@ -17106,7 +17111,6 @@ define <28 x float> @bitcast_v56i16_to_v28f32(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v58 ; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v59 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 -; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -17255,9 +17259,6 @@ define <28 x float> @bitcast_v56i16_to_v28f32(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB30_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload @@ -17272,6 +17273,9 @@ define <28 x float> @bitcast_v56i16_to_v28f32(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v36, v59, s6 ; GFX9-NEXT: v_perm_b32 v1, v35, v58, s6 ; GFX9-NEXT: v_perm_b32 v2, v62, v57, s6 @@ -17290,10 +17294,6 @@ define <28 x float> @bitcast_v56i16_to_v28f32(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(14) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_perm_b32 v9, v9, v42, s6 ; GFX9-NEXT: s_waitcnt vmcnt(14) @@ -17322,6 +17322,10 @@ define <28 x float> 
@bitcast_v56i16_to_v28f32(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v21, v21, v38, s6 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_perm_b32 v22, v22, v37, s6 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] @@ -19451,7 +19455,21 @@ define <56 x half> @bitcast_v28f32_to_v56f16(<28 x float> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v33 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v45 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v29 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -19468,20 +19486,6 @@ define <56 x half> @bitcast_v28f32_to_v56f16(<28 x float> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v33 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v29 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -20469,12 +20473,28 @@ define inreg <56 x half> @bitcast_v28f32_to_v56f16_scalar(<28 x float> inreg %a, ; SI-NEXT: v_add_i32_e32 v6, vcc, 0x60, v0 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen ; SI-NEXT: v_cvt_f16_f32_e32 v3, v43 +; SI-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v4, v57 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v41 +; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v3, v52 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x68, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 +; SI-NEXT: 
buffer_store_dword v2, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v38 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -20491,22 +20511,6 @@ define inreg <56 x half> @bitcast_v28f32_to_v56f16_scalar(<28 x float> inreg %a, ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_add_i32_e32 v5, vcc, 0x64, v0 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v52 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_add_i32_e32 v4, vcc, 0x68, v0 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v38 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB33_4: @@ -22339,6 +22343,7 @@ define <28 x float> @bitcast_v56f16_to_v28f32(<56 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v59, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v27 +; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v26 @@ -22411,7 +22416,6 @@ define <28 x float> @bitcast_v56f16_to_v28f32(<56 x half> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v58 ; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v59 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 -; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -22560,9 +22564,6 @@ define <28 x float> @bitcast_v56f16_to_v28f32(<56 x half> %a, i32 %b) { ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB34_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:148 ; 
4-byte Folded Reload @@ -22577,6 +22578,9 @@ define <28 x float> @bitcast_v56f16_to_v28f32(<56 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v36, v59, s6 ; GFX9-NEXT: s_movk_i32 s7, 0x200 ; GFX9-NEXT: v_perm_b32 v1, v35, v58, s6 @@ -22596,10 +22600,6 @@ define <28 x float> @bitcast_v56f16_to_v28f32(<56 x half> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_f16 v6, v6, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v7, v7, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v8, v8, s7 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(14) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_perm_b32 v9, v9, v42, s6 ; GFX9-NEXT: s_waitcnt vmcnt(14) @@ -22628,6 +22628,10 @@ define <28 x float> @bitcast_v56f16_to_v28f32(<56 x half> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v21, v21, v38, s6 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_perm_b32 v22, v22, v37, s6 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; GFX9-NEXT: v_pk_add_f16 v9, v9, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v10, v10, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v11, v11, s7 op_sel_hi:[1,0] @@ -22927,20 +22931,7 @@ define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v8, s26 ; SI-NEXT: v_cvt_f16_f32_e32 v6, s29 ; SI-NEXT: v_cvt_f16_f32_e32 v7, s28 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -22948,6 +22939,7 @@ define inreg <28 x float> 
@bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a, ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v51 +; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v61 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill @@ -22979,6 +22971,19 @@ define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v0, v43 ; SI-NEXT: v_cvt_f16_f32_e32 v43, s17 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB35_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 @@ -22995,11 +23000,11 @@ define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a, ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_mov_b32_e32 v48, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_mov_b32_e32 v61, v44 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v43 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v54 ; SI-NEXT: v_mov_b32_e32 v39, v11 @@ -23077,7 +23082,6 @@ define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a, ; SI-NEXT: v_or_b32_e32 v27, v50, v27 ; SI-NEXT: s_cbranch_execnz .LBB35_3 ; SI-NEXT: .LBB35_2: ; %cmp.true -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v43 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v54 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v55 @@ -23096,6 +23100,7 @@ define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a, ; SI-NEXT: v_or_b32_e32 v1, v3, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v49 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v39 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v4, v33 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v47 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 @@ -23363,6 +23368,7 @@ define inreg <28 x float> @bitcast_v56f16_to_v28f32_scalar(<56 x half> inreg %a, ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(6) ; 
SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload @@ -25323,18 +25329,9 @@ define <56 x i16> @bitcast_v14i64_to_v56i16(<14 x i64> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -25434,6 +25431,15 @@ define <56 x i16> @bitcast_v14i64_to_v56i16(<14 x i64> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -27074,11 +27080,6 @@ define <14 x i64> @bitcast_v56i16_to_v14i64(<56 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v56i16_to_v14i64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 -; SI-NEXT: v_mov_b32_e32 v53, v4 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill @@ -27108,6 +27109,11 @@ define <14 x i64> @bitcast_v56i16_to_v14i64(<56 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v6, off, s[0:3], 
s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 +; SI-NEXT: v_mov_b32_e32 v53, v4 ; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v3 ; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v5 @@ -27124,8 +27130,9 @@ define <14 x i64> @bitcast_v56i16_to_v14i64(<56 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v29 ; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 @@ -27821,6 +27828,7 @@ define <14 x i64> @bitcast_v56i16_to_v14i64(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v59, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v27 +; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v26 @@ -27893,7 +27901,6 @@ define <14 x i64> @bitcast_v56i16_to_v14i64(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v58 ; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v59 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 -; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -28042,9 +28049,6 @@ define <14 x i64> @bitcast_v56i16_to_v14i64(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB42_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload @@ -28059,6 +28063,9 @@ define <14 x i64> @bitcast_v56i16_to_v14i64(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v36, v59, s6 ; GFX9-NEXT: v_perm_b32 v1, v35, v58, s6 ; GFX9-NEXT: v_perm_b32 v2, v62, v57, s6 @@ -28077,10 
+28084,6 @@ define <14 x i64> @bitcast_v56i16_to_v14i64(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(14) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_perm_b32 v9, v9, v42, s6 ; GFX9-NEXT: s_waitcnt vmcnt(14) @@ -28109,6 +28112,10 @@ define <14 x i64> @bitcast_v56i16_to_v14i64(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v21, v21, v38, s6 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_perm_b32 v22, v22, v37, s6 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] @@ -30238,7 +30245,21 @@ define <56 x half> @bitcast_v14i64_to_v56f16(<14 x i64> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v33 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v45 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v29 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -30255,20 +30276,6 @@ define <56 x half> @bitcast_v14i64_to_v56f16(<14 x i64> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v33 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v29 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -31165,15 +31172,6 @@ define inreg <56 x half> @bitcast_v14i64_to_v56f16_scalar(<14 x i64> inreg %a, i ; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 ; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 ; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 -; SI-NEXT: buffer_load_dword v56, 
off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload ; SI-NEXT: buffer_store_dword v53, v55, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v53, vcc, 24, v0 @@ -31319,6 +31317,15 @@ define inreg <56 x half> @bitcast_v14i64_to_v56f16_scalar(<14 x i64> inreg %a, i ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB45_4: @@ -32982,6 +32989,7 @@ define <14 x i64> @bitcast_v56f16_to_v14i64(<56 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v59, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v27 +; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v26 @@ -33054,7 +33062,6 @@ define <14 x i64> @bitcast_v56f16_to_v14i64(<56 x half> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v58 ; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v59 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 -; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -33203,9 +33210,6 @@ define <14 x i64> @bitcast_v56f16_to_v14i64(<56 x half> %a, i32 %b) { ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB46_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; 
GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload @@ -33220,6 +33224,9 @@ define <14 x i64> @bitcast_v56f16_to_v14i64(<56 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v36, v59, s6 ; GFX9-NEXT: s_movk_i32 s7, 0x200 ; GFX9-NEXT: v_perm_b32 v1, v35, v58, s6 @@ -33239,10 +33246,6 @@ define <14 x i64> @bitcast_v56f16_to_v14i64(<56 x half> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_f16 v6, v6, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v7, v7, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v8, v8, s7 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(14) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_perm_b32 v9, v9, v42, s6 ; GFX9-NEXT: s_waitcnt vmcnt(14) @@ -33271,6 +33274,10 @@ define <14 x i64> @bitcast_v56f16_to_v14i64(<56 x half> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v21, v21, v38, s6 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_perm_b32 v22, v22, v37, s6 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; GFX9-NEXT: v_pk_add_f16 v9, v9, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v10, v10, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v11, v11, s7 op_sel_hi:[1,0] @@ -33570,20 +33577,7 @@ define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v8, s26 ; SI-NEXT: v_cvt_f16_f32_e32 v6, s29 ; SI-NEXT: v_cvt_f16_f32_e32 v7, s28 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: 
s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -33591,6 +33585,7 @@ define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v51 +; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v61 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill @@ -33622,6 +33617,19 @@ define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v0, v43 ; SI-NEXT: v_cvt_f16_f32_e32 v43, s17 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB47_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 @@ -33638,11 +33646,11 @@ define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_mov_b32_e32 v48, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_mov_b32_e32 v61, v44 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v43 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v54 ; SI-NEXT: v_mov_b32_e32 v39, v11 @@ -33720,7 +33728,6 @@ define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v27, v50, v27 ; SI-NEXT: s_cbranch_execnz .LBB47_3 ; SI-NEXT: .LBB47_2: ; %cmp.true -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v43 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v54 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v55 @@ -33739,6 +33746,7 @@ define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v1, v3, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v49 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v39 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v4, v33 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v47 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 @@ -34006,6 +34014,7 @@ define inreg <14 x i64> @bitcast_v56f16_to_v14i64_scalar(<56 x half> inreg %a, i ; SI-NEXT: 
buffer_load_dword v62, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(6) ; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload @@ -35103,18 +35112,9 @@ define <56 x i16> @bitcast_v14f64_to_v56i16(<14 x double> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -35214,6 +35214,15 @@ define <56 x i16> @bitcast_v14f64_to_v56i16(<14 x double> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -36009,21 +36018,9 @@ define inreg <56 x i16> @bitcast_v14f64_to_v56i16_scalar(<14 x double> inreg %a, ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: 
buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -36051,6 +36048,18 @@ define inreg <56 x i16> @bitcast_v14f64_to_v56i16_scalar(<14 x double> inreg %a, ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB49_4: @@ -36889,11 +36898,6 @@ define <14 x double> @bitcast_v56i16_to_v14f64(<56 x i16> %a, i32 %b) { ; SI-LABEL: bitcast_v56i16_to_v14f64: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 -; SI-NEXT: v_mov_b32_e32 v53, v4 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill @@ -36923,6 +36927,11 @@ define <14 x double> @bitcast_v56i16_to_v14f64(<56 x i16> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v54, v2 +; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 +; SI-NEXT: v_mov_b32_e32 v53, v4 ; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v3 ; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v5 @@ 
-36939,8 +36948,9 @@ define <14 x double> @bitcast_v56i16_to_v14f64(<56 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v29 ; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 @@ -37636,6 +37646,7 @@ define <14 x double> @bitcast_v56i16_to_v14f64(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v59, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v27 +; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v26 @@ -37708,7 +37719,6 @@ define <14 x double> @bitcast_v56i16_to_v14f64(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v58 ; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v59 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 -; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -37857,9 +37867,6 @@ define <14 x double> @bitcast_v56i16_to_v14f64(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB50_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload @@ -37874,6 +37881,9 @@ define <14 x double> @bitcast_v56i16_to_v14f64(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v36, v59, s6 ; GFX9-NEXT: v_perm_b32 v1, v35, v58, s6 ; GFX9-NEXT: v_perm_b32 v2, v62, v57, s6 @@ -37892,10 +37902,6 @@ define <14 x double> @bitcast_v56i16_to_v14f64(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(14) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 
offset:76 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_perm_b32 v9, v9, v42, s6 ; GFX9-NEXT: s_waitcnt vmcnt(14) @@ -37924,6 +37930,10 @@ define <14 x double> @bitcast_v56i16_to_v14f64(<56 x i16> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v21, v21, v38, s6 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_perm_b32 v22, v22, v37, s6 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] @@ -40031,7 +40041,14 @@ define <56 x half> @bitcast_v14f64_to_v56f16(<14 x double> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v42 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -40048,13 +40065,6 @@ define <56 x half> @bitcast_v14f64_to_v56f16(<14 x double> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -41025,6 +41035,11 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v21 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -41041,11 +41056,6 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v2, v21 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: 
v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB53_4: @@ -42850,6 +42860,7 @@ define <14 x double> @bitcast_v56f16_to_v14f64(<56 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v59, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v27 +; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v26 @@ -42922,7 +42933,6 @@ define <14 x double> @bitcast_v56f16_to_v14f64(<56 x half> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v58 ; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v59 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 -; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -43071,9 +43081,6 @@ define <14 x double> @bitcast_v56f16_to_v14f64(<56 x half> %a, i32 %b) { ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB54_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload @@ -43088,6 +43095,9 @@ define <14 x double> @bitcast_v56f16_to_v14f64(<56 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v36, v59, s6 ; GFX9-NEXT: s_movk_i32 s7, 0x200 ; GFX9-NEXT: v_perm_b32 v1, v35, v58, s6 @@ -43107,10 +43117,6 @@ define <14 x double> @bitcast_v56f16_to_v14f64(<56 x half> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_f16 v6, v6, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v7, v7, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v8, v8, s7 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(14) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_perm_b32 v9, v9, v42, s6 ; GFX9-NEXT: s_waitcnt vmcnt(14) @@ -43139,6 +43145,10 @@ define <14 x double> @bitcast_v56f16_to_v14f64(<56 x half> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v21, v21, v38, s6 ; 
GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_perm_b32 v22, v22, v37, s6 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; GFX9-NEXT: v_pk_add_f16 v9, v9, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v10, v10, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v11, v11, s7 op_sel_hi:[1,0] @@ -43438,20 +43448,7 @@ define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a ; SI-NEXT: v_cvt_f16_f32_e32 v8, s26 ; SI-NEXT: v_cvt_f16_f32_e32 v6, s29 ; SI-NEXT: v_cvt_f16_f32_e32 v7, s28 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -43459,6 +43456,7 @@ define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v51 +; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v61 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill @@ -43490,6 +43488,19 @@ define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a ; SI-NEXT: v_cvt_f16_f32_e32 v0, v43 ; SI-NEXT: v_cvt_f16_f32_e32 v43, s17 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: 
buffer_store_dword v8, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB55_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 @@ -43506,11 +43517,11 @@ define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_mov_b32_e32 v48, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_mov_b32_e32 v61, v44 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v43 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v54 ; SI-NEXT: v_mov_b32_e32 v39, v11 @@ -43588,7 +43599,6 @@ define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a ; SI-NEXT: v_or_b32_e32 v27, v50, v27 ; SI-NEXT: s_cbranch_execnz .LBB55_3 ; SI-NEXT: .LBB55_2: ; %cmp.true -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v43 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v54 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v55 @@ -43607,6 +43617,7 @@ define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a ; SI-NEXT: v_or_b32_e32 v1, v3, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v49 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v39 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v4, v33 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v47 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 @@ -43874,6 +43885,7 @@ define inreg <14 x double> @bitcast_v56f16_to_v14f64_scalar(<56 x half> inreg %a ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(6) ; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload @@ -45383,24 +45395,8 @@ define <56 x half> @bitcast_v56i16_to_v56f16(<56 x i16> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 ; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 
offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen @@ -45666,6 +45662,22 @@ define <56 x half> @bitcast_v56i16_to_v56f16(<56 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -47029,22 +47041,6 @@ define inreg <56 x half> @bitcast_v56i16_to_v56f16_scalar(<56 x i16> inreg %a, i ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v34 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, 
s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 @@ -47065,6 +47061,22 @@ define inreg <56 x half> @bitcast_v56i16_to_v56f16_scalar(<56 x i16> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -48501,22 +48513,6 @@ define <56 x i16> @bitcast_v56f16_to_v56i16(<56 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v60 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, 
off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v27 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 @@ -48539,6 +48535,22 @@ define <56 x i16> @bitcast_v56f16_to_v56i16(<56 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -49679,22 +49691,6 @@ define inreg <56 x i16> @bitcast_v56f16_to_v56i16_scalar(<56 x half> inreg %a, i ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v44 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v62 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; 
SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen @@ -49716,6 +49712,22 @@ define inreg <56 x i16> @bitcast_v56f16_to_v56i16_scalar(<56 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB59_4: diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll index 4175d5f2de73d..93c11f13ce3ce 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll @@ -3108,22 +3108,9 @@ define <60 x i16> @bitcast_v30i32_to_v60i16(<30 x i32> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40 -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40 ; SI-NEXT: 
v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -3199,6 +3186,19 @@ define <60 x i16> @bitcast_v30i32_to_v60i16(<30 x i32> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -5032,53 +5032,40 @@ define <30 x i32> @bitcast_v60i16_to_v30i32(<60 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:4 ; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v2 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:52 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:112 ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:108 -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v4 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44 ; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v6 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v8 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:104 ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:100 -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v6 -; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v8 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:36 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v10 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 
offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:96 ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:92 -; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v10 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v12 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:88 ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:84 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v12 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v14 ; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:48 @@ -5109,10 +5096,27 @@ define <30 x i32> @bitcast_v60i16_to_v30i32(<60 x i16> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:56 ; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:64 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:52 +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v28 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v30 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -5197,6 +5201,7 @@ define <30 x i32> @bitcast_v60i16_to_v30i32(<60 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; kill: killed $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 ; SI-NEXT: v_and_b32_e32 v18, 0xffff, v61 ; SI-NEXT: ; kill: killed $vgpr30 @@ -5341,6 +5346,7 @@ define <30 x i32> @bitcast_v60i16_to_v30i32(<60 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; 
SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 ; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v61 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 @@ -5488,7 +5494,7 @@ define <30 x i32> @bitcast_v60i16_to_v30i32(<60 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v60i16_to_v30i32: @@ -5770,6 +5776,7 @@ define <30 x i32> @bitcast_v60i16_to_v30i32(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v61, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v29 +; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v28 @@ -5848,7 +5855,6 @@ define <30 x i32> @bitcast_v60i16_to_v30i32(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v60 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 ; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v61 -; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -6013,9 +6019,6 @@ define <30 x i32> @bitcast_v60i16_to_v30i32(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB14_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload @@ -6030,6 +6033,9 @@ define <30 x i32> @bitcast_v60i16_to_v30i32(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v38, v61, s6 ; GFX9-NEXT: v_perm_b32 v1, v37, v60, s6 ; GFX9-NEXT: v_perm_b32 v2, v62, v59, s6 @@ -6048,10 +6054,6 @@ define <30 x i32> @bitcast_v60i16_to_v30i32(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 
op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(14) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_perm_b32 v9, v9, v44, s6 ; GFX9-NEXT: s_waitcnt vmcnt(14) @@ -6080,6 +6082,10 @@ define <30 x i32> @bitcast_v60i16_to_v30i32(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v21, v21, v48, s6 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_perm_b32 v22, v22, v39, s6 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] @@ -8381,6 +8387,11 @@ define <60 x half> @bitcast_v30i32_to_v60f16(<30 x i32> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v32 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload @@ -8397,11 +8408,6 @@ define <60 x half> @bitcast_v30i32_to_v60f16(<30 x i32> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v2, v32 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -9387,24 +9393,11 @@ define inreg <60 x half> @bitcast_v30i32_to_v60f16_scalar(<30 x i32> inreg %a, i ; SI-NEXT: v_add_i32_e32 v39, vcc, 44, v0 ; SI-NEXT: v_or_b32_e32 v36, v38, v36 ; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload 
-; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; SI-NEXT: buffer_store_dword v36, v39, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v36, v37 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 ; SI-NEXT: v_add_i32_e32 v37, vcc, 48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 ; SI-NEXT: v_or_b32_e32 v34, v36, v34 ; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 ; SI-NEXT: buffer_store_dword v34, v37, s[0:3], 0 offen @@ -9526,6 +9519,19 @@ define inreg <60 x half> @bitcast_v30i32_to_v60f16_scalar(<30 x i32> inreg %a, i ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB17_4: @@ -10339,9 +10345,6 @@ define <30 x i32> @bitcast_v60f16_to_v30i32(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_cvt_f16_f32_e32 v58, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:116 ; SI-NEXT: s_waitcnt expcnt(3) ; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 @@ -10370,12 +10373,23 @@ define <30 x i32> @bitcast_v60f16_to_v30i32(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:76 ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:88 ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:84 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v59, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v57, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v2 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:336 ; 
4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v60 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v7 @@ -10385,6 +10399,8 @@ define <30 x i32> @bitcast_v60f16_to_v30i32(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v9 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v8 @@ -10406,18 +10422,9 @@ define <30 x i32> @bitcast_v60f16_to_v30i32(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:96 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:104 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:112 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v60 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 @@ -10427,7 +10434,6 @@ define <30 x i32> @bitcast_v60f16_to_v30i32(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 @@ -10465,6 +10471,7 @@ define <30 x i32> @bitcast_v60f16_to_v30i32(<60 x half> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v61 ; SI-NEXT: v_cvt_f16_f32_e32 v61, v49 +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f16_f32_e32 v49, v55 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -10479,7 +10486,6 @@ define <30 x i32> @bitcast_v60f16_to_v30i32(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v32 -; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f16_f32_e32 v32, v47 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -11351,6 +11357,7 @@ define <30 x i32> @bitcast_v60f16_to_v30i32(<60 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v61, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v29 +; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 
offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v28 @@ -11429,7 +11436,6 @@ define <30 x i32> @bitcast_v60f16_to_v30i32(<60 x half> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v60 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 ; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v61 -; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -11594,9 +11600,6 @@ define <30 x i32> @bitcast_v60f16_to_v30i32(<60 x half> %a, i32 %b) { ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB18_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload @@ -11611,6 +11614,9 @@ define <30 x i32> @bitcast_v60f16_to_v30i32(<60 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v38, v61, s6 ; GFX9-NEXT: s_movk_i32 s7, 0x200 ; GFX9-NEXT: v_perm_b32 v1, v37, v60, s6 @@ -11630,10 +11636,6 @@ define <30 x i32> @bitcast_v60f16_to_v30i32(<60 x half> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_f16 v6, v6, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v7, v7, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v8, v8, s7 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(14) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_perm_b32 v9, v9, v44, s6 ; GFX9-NEXT: s_waitcnt vmcnt(14) @@ -11662,6 +11664,10 @@ define <30 x i32> @bitcast_v60f16_to_v30i32(<60 x half> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v21, v21, v48, s6 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_perm_b32 v22, v22, v39, s6 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; GFX9-NEXT: v_pk_add_f16 v9, v9, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v10, v10, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v11, v11, s7 op_sel_hi:[1,0] @@ -11982,35 +11988,12 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v8, s26 ; SI-NEXT: v_cvt_f16_f32_e32 v6, s29 ; 
SI-NEXT: v_cvt_f16_f32_e32 v7, s28 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 ; SI-NEXT: v_cvt_f16_f32_e32 v50, v54 ; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 ; SI-NEXT: v_cvt_f16_f32_e32 v31, v40 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v33 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill @@ -12020,7 +12003,7 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v38 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v44 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -12029,7 +12012,7 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v46 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v47 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -12038,7 +12021,7 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i ; SI-NEXT: 
s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v57 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v58 ; SI-NEXT: v_cvt_f16_f32_e32 v58, s16 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill @@ -12049,12 +12032,38 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v60 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB19_2 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v10, v3 +; SI-NEXT: s_waitcnt expcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v34 ; SI-NEXT: v_mov_b32_e32 v33, v32 ; SI-NEXT: v_or_b32_e32 v10, v32, v10 @@ -12079,12 +12088,12 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: 
s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v59 ; SI-NEXT: v_or_b32_e32 v1, v12, v1 ; SI-NEXT: v_or_b32_e32 v2, v11, v2 @@ -12193,10 +12202,12 @@ define inreg <30 x i32> @bitcast_v60f16_to_v30i32_scalar(<60 x half> inreg %a, i ; SI-NEXT: v_mov_b32_e32 v40, v44 ; SI-NEXT: s_cbranch_vccnz .LBB19_5 ; SI-NEXT: ; %bb.4: ; %cmp.true +; SI-NEXT: s_waitcnt expcnt(5) ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(4) ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v59 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v58 ; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cvt_f32_f16_e32 v8, v33 @@ -15520,22 +15531,9 @@ define <60 x i16> @bitcast_v30f32_to_v60i16(<30 x float> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40 -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -15611,6 +15609,19 @@ define <60 x i16> @bitcast_v30f32_to_v60i16(<30 x float> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: 
buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -16541,7 +16552,12 @@ define inreg <60 x i16> @bitcast_v30f32_to_v60i16_scalar(<30 x float> inreg %a, ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -16557,11 +16573,6 @@ define inreg <60 x i16> @bitcast_v30f32_to_v60i16_scalar(<30 x float> inreg %a, ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB29_4: @@ -17559,53 +17570,40 @@ define <30 x float> @bitcast_v60i16_to_v30f32(<60 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:4 ; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v2 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:52 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:112 ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:108 -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v4 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44 ; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v6 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v8 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:104 ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:100 -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v6 -; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v8 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: 
buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:36 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v10 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:96 ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:92 -; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v10 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v12 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:88 ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:84 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v12 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v14 ; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:48 @@ -17636,10 +17634,27 @@ define <30 x float> @bitcast_v60i16_to_v30f32(<60 x i16> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:56 ; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:64 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:52 +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v28 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v30 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -17724,6 +17739,7 @@ define <30 x float> @bitcast_v60i16_to_v30f32(<60 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; kill: killed $vgpr30 ; 
SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 ; SI-NEXT: v_and_b32_e32 v18, 0xffff, v61 ; SI-NEXT: ; kill: killed $vgpr30 @@ -17868,6 +17884,7 @@ define <30 x float> @bitcast_v60i16_to_v30f32(<60 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 ; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v61 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 @@ -18015,7 +18032,7 @@ define <30 x float> @bitcast_v60i16_to_v30f32(<60 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v60i16_to_v30f32: @@ -18297,6 +18314,7 @@ define <30 x float> @bitcast_v60i16_to_v30f32(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v61, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v29 +; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v28 @@ -18375,7 +18393,6 @@ define <30 x float> @bitcast_v60i16_to_v30f32(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v60 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 ; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v61 -; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -18540,9 +18557,6 @@ define <30 x float> @bitcast_v60i16_to_v30f32(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB30_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload @@ -18557,6 +18571,9 @@ define <30 x float> @bitcast_v60i16_to_v30f32(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; 
GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v38, v61, s6 ; GFX9-NEXT: v_perm_b32 v1, v37, v60, s6 ; GFX9-NEXT: v_perm_b32 v2, v62, v59, s6 @@ -18575,10 +18592,6 @@ define <30 x float> @bitcast_v60i16_to_v30f32(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(14) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_perm_b32 v9, v9, v44, s6 ; GFX9-NEXT: s_waitcnt vmcnt(14) @@ -18607,6 +18620,10 @@ define <30 x float> @bitcast_v60i16_to_v30f32(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v21, v21, v48, s6 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_perm_b32 v22, v22, v39, s6 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] @@ -20908,6 +20925,11 @@ define <60 x half> @bitcast_v30f32_to_v60f16(<30 x float> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v32 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload @@ -20924,11 +20946,6 @@ define <60 x half> @bitcast_v30f32_to_v60f16(<30 x float> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v2, v32 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -21844,7 +21861,7 @@ define inreg <60 x half> @bitcast_v30f32_to_v60f16_scalar(<30 x float> inreg %a, ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v4, v46 ; SI-NEXT: v_add_i32_e32 v6, vcc, 12, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v54 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 ; SI-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen @@ -22011,7 +22028,24 @@ define inreg <60 x half> @bitcast_v30f32_to_v60f16_scalar(<30 x float> inreg %a, ; SI-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v4, v45 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x6c, v0 +; SI-NEXT: 
v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v42 +; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v3, v38 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x70, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v59 +; SI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v14 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -22028,23 +22062,6 @@ define inreg <60 x half> @bitcast_v30f32_to_v60f16_scalar(<30 x float> inreg %a, ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v3, v54 -; SI-NEXT: v_add_i32_e32 v5, vcc, 0x6c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v38 -; SI-NEXT: v_add_i32_e32 v4, vcc, 0x70, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v14 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB33_4: @@ -23027,9 +23044,6 @@ define <30 x float> @bitcast_v60f16_to_v30f32(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_cvt_f16_f32_e32 v58, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:116 ; SI-NEXT: s_waitcnt expcnt(3) ; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 @@ -23058,12 +23072,23 @@ define <30 x float> @bitcast_v60f16_to_v30f32(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:76 ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:88 ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:84 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v59, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v57, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v2 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 
offset:108 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v60 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v7 @@ -23073,6 +23098,8 @@ define <30 x float> @bitcast_v60f16_to_v30f32(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v9 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v8 @@ -23094,18 +23121,9 @@ define <30 x float> @bitcast_v60f16_to_v30f32(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:96 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:104 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:112 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v60 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 @@ -23115,7 +23133,6 @@ define <30 x float> @bitcast_v60f16_to_v30f32(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 @@ -23153,6 +23170,7 @@ define <30 x float> @bitcast_v60f16_to_v30f32(<60 x half> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v61 ; SI-NEXT: v_cvt_f16_f32_e32 v61, v49 +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f16_f32_e32 v49, v55 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -23167,7 +23185,6 @@ define <30 x float> @bitcast_v60f16_to_v30f32(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v32 -; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f16_f32_e32 v32, v47 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -24039,6 +24056,7 @@ define <30 x float> @bitcast_v60f16_to_v30f32(<60 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v61, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v29 +; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 
offset:92 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v28 @@ -24117,7 +24135,6 @@ define <30 x float> @bitcast_v60f16_to_v30f32(<60 x half> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v60 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 ; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v61 -; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -24282,9 +24299,6 @@ define <30 x float> @bitcast_v60f16_to_v30f32(<60 x half> %a, i32 %b) { ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB34_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload @@ -24299,6 +24313,9 @@ define <30 x float> @bitcast_v60f16_to_v30f32(<60 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v38, v61, s6 ; GFX9-NEXT: s_movk_i32 s7, 0x200 ; GFX9-NEXT: v_perm_b32 v1, v37, v60, s6 @@ -24318,10 +24335,6 @@ define <30 x float> @bitcast_v60f16_to_v30f32(<60 x half> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_f16 v6, v6, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v7, v7, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v8, v8, s7 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(14) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_perm_b32 v9, v9, v44, s6 ; GFX9-NEXT: s_waitcnt vmcnt(14) @@ -24350,6 +24363,10 @@ define <30 x float> @bitcast_v60f16_to_v30f32(<60 x half> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v21, v21, v48, s6 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_perm_b32 v22, v22, v39, s6 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; GFX9-NEXT: v_pk_add_f16 v9, v9, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v10, v10, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v11, v11, s7 op_sel_hi:[1,0] @@ -24670,6 +24687,50 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x 
half> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v8, s26 ; SI-NEXT: v_cvt_f16_f32_e32 v6, s29 ; SI-NEXT: v_cvt_f16_f32_e32 v7, s28 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v40 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v33 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v42 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v38 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v44 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v45 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v46 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v47 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v56 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v57 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v58, s16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v59, s17 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v60 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill @@ -24694,55 +24755,14 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a, ; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v33 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v42 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v38 -; SI-NEXT: buffer_store_dword v0, 
off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v44 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v45 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v46 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v47 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v56 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v57 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v58 -; SI-NEXT: v_cvt_f16_f32_e32 v58, s16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v59, s17 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v60 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB35_2 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v10, v3 +; SI-NEXT: s_waitcnt expcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v34 ; SI-NEXT: v_mov_b32_e32 v33, v32 ; SI-NEXT: v_or_b32_e32 v10, v32, v10 @@ -24767,12 +24787,12 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a, ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v59 ; SI-NEXT: v_or_b32_e32 v1, v12, v1 ; SI-NEXT: v_or_b32_e32 v2, v11, v2 @@ -24881,10 +24901,12 @@ define inreg <30 x float> @bitcast_v60f16_to_v30f32_scalar(<60 x half> inreg %a, ; SI-NEXT: v_mov_b32_e32 v40, v44 ; SI-NEXT: s_cbranch_vccnz .LBB35_5 ; SI-NEXT: ; %bb.4: ; %cmp.true +; SI-NEXT: s_waitcnt expcnt(5) ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(4) ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v59 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v58 ; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cvt_f32_f16_e32 v8, v33 @@ -27278,22 +27300,9 @@ define <60 x i16> @bitcast_v15i64_to_v60i16(<15 x i64> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: 
v_add_i32_e32 v2, vcc, 64, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40 -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -27369,6 +27378,19 @@ define <60 x i16> @bitcast_v15i64_to_v60i16(<15 x i64> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -29218,53 +29240,40 @@ define <15 x i64> @bitcast_v60i16_to_v15i64(<60 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:4 ; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v2 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:52 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded 
Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:112 ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:108 -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v4 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44 ; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v6 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v8 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:104 ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:100 -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v6 -; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v8 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:36 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v10 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:96 ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:92 -; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v10 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v12 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:88 ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:84 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v12 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v14 ; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:48 @@ -29295,10 +29304,27 @@ define <15 x i64> @bitcast_v60i16_to_v15i64(<60 x i16> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:56 ; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:64 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:52 +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v28 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v30 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44 +; SI-NEXT: s_waitcnt 
vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -29383,6 +29409,7 @@ define <15 x i64> @bitcast_v60i16_to_v15i64(<60 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; kill: killed $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 ; SI-NEXT: v_and_b32_e32 v18, 0xffff, v61 ; SI-NEXT: ; kill: killed $vgpr30 @@ -29527,6 +29554,7 @@ define <15 x i64> @bitcast_v60i16_to_v15i64(<60 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 ; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v61 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 @@ -29674,7 +29702,7 @@ define <15 x i64> @bitcast_v60i16_to_v15i64(<60 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v60i16_to_v15i64: @@ -29956,6 +29984,7 @@ define <15 x i64> @bitcast_v60i16_to_v15i64(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v61, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v29 +; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v28 @@ -30034,7 +30063,6 @@ define <15 x i64> @bitcast_v60i16_to_v15i64(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v60 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 ; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v61 -; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -30199,9 +30227,6 @@ define <15 x i64> @bitcast_v60i16_to_v15i64(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; 
GFX9-NEXT: s_cbranch_execz .LBB42_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload @@ -30216,6 +30241,9 @@ define <15 x i64> @bitcast_v60i16_to_v15i64(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v38, v61, s6 ; GFX9-NEXT: v_perm_b32 v1, v37, v60, s6 ; GFX9-NEXT: v_perm_b32 v2, v62, v59, s6 @@ -30234,10 +30262,6 @@ define <15 x i64> @bitcast_v60i16_to_v15i64(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(14) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_perm_b32 v9, v9, v44, s6 ; GFX9-NEXT: s_waitcnt vmcnt(14) @@ -30266,6 +30290,10 @@ define <15 x i64> @bitcast_v60i16_to_v15i64(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v21, v21, v48, s6 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_perm_b32 v22, v22, v39, s6 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] @@ -32568,6 +32596,11 @@ define <60 x half> @bitcast_v15i64_to_v60f16(<15 x i64> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v32 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload @@ -32584,11 +32617,6 @@ define <60 x half> @bitcast_v15i64_to_v60f16(<15 x i64> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v2, v32 -; SI-NEXT: v_add_i32_e32 
v0, vcc, 0x74, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -33590,24 +33618,11 @@ define inreg <60 x half> @bitcast_v15i64_to_v60f16_scalar(<15 x i64> inreg %a, i ; SI-NEXT: v_add_i32_e32 v39, vcc, 44, v0 ; SI-NEXT: v_or_b32_e32 v36, v38, v36 ; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; SI-NEXT: buffer_store_dword v36, v39, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v36, v37 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 ; SI-NEXT: v_add_i32_e32 v37, vcc, 48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 ; SI-NEXT: v_or_b32_e32 v34, v36, v34 ; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 ; SI-NEXT: buffer_store_dword v34, v37, s[0:3], 0 offen @@ -33729,6 +33744,19 @@ define inreg <60 x half> @bitcast_v15i64_to_v60f16_scalar(<15 x i64> inreg %a, i ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB45_4: @@ -34542,9 +34570,6 @@ define <15 x i64> @bitcast_v60f16_to_v15i64(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: 
buffer_store_dword v62, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_cvt_f16_f32_e32 v58, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:116 ; SI-NEXT: s_waitcnt expcnt(3) ; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 @@ -34573,12 +34598,23 @@ define <15 x i64> @bitcast_v60f16_to_v15i64(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:76 ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:88 ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:84 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v59, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v57, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v2 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v60 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v7 @@ -34588,6 +34624,8 @@ define <15 x i64> @bitcast_v60f16_to_v15i64(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v9 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v8 @@ -34609,18 +34647,9 @@ define <15 x i64> @bitcast_v60f16_to_v15i64(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:96 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:104 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:112 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v60 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 @@ -34630,7 +34659,6 @@ define <15 x i64> @bitcast_v60f16_to_v15i64(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 
offset:276 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 @@ -34668,6 +34696,7 @@ define <15 x i64> @bitcast_v60f16_to_v15i64(<60 x half> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v61 ; SI-NEXT: v_cvt_f16_f32_e32 v61, v49 +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f16_f32_e32 v49, v55 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -34682,7 +34711,6 @@ define <15 x i64> @bitcast_v60f16_to_v15i64(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v32 -; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f16_f32_e32 v32, v47 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -35554,6 +35582,7 @@ define <15 x i64> @bitcast_v60f16_to_v15i64(<60 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v61, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v29 +; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v28 @@ -35632,7 +35661,6 @@ define <15 x i64> @bitcast_v60f16_to_v15i64(<60 x half> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v60 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 ; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v61 -; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -35797,9 +35825,6 @@ define <15 x i64> @bitcast_v60f16_to_v15i64(<60 x half> %a, i32 %b) { ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB46_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload @@ -35814,6 +35839,9 @@ define <15 x i64> @bitcast_v60f16_to_v15i64(<60 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v38, v61, s6 ; GFX9-NEXT: s_movk_i32 s7, 0x200 ; GFX9-NEXT: v_perm_b32 v1, v37, v60, s6 @@ -35833,10 +35861,6 @@ define 
<15 x i64> @bitcast_v60f16_to_v15i64(<60 x half> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_f16 v6, v6, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v7, v7, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v8, v8, s7 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(14) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_perm_b32 v9, v9, v44, s6 ; GFX9-NEXT: s_waitcnt vmcnt(14) @@ -35865,6 +35889,10 @@ define <15 x i64> @bitcast_v60f16_to_v15i64(<60 x half> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v21, v21, v48, s6 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_perm_b32 v22, v22, v39, s6 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; GFX9-NEXT: v_pk_add_f16 v9, v9, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v10, v10, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v11, v11, s7 op_sel_hi:[1,0] @@ -36185,35 +36213,12 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v8, s26 ; SI-NEXT: v_cvt_f16_f32_e32 v6, s29 ; SI-NEXT: v_cvt_f16_f32_e32 v7, s28 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 ; 
SI-NEXT: v_cvt_f16_f32_e32 v50, v54 ; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 ; SI-NEXT: v_cvt_f16_f32_e32 v31, v40 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v33 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill @@ -36223,7 +36228,7 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v38 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v44 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -36232,7 +36237,7 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v46 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v47 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -36241,7 +36246,7 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v57 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v58 ; SI-NEXT: v_cvt_f16_f32_e32 v58, s16 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill @@ -36252,12 +36257,38 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v60 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:264 ; 4-byte Folded 
Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB47_2 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v10, v3 +; SI-NEXT: s_waitcnt expcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v34 ; SI-NEXT: v_mov_b32_e32 v33, v32 ; SI-NEXT: v_or_b32_e32 v10, v32, v10 @@ -36282,12 +36313,12 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v59 ; SI-NEXT: v_or_b32_e32 v1, v12, v1 ; SI-NEXT: v_or_b32_e32 v2, v11, v2 @@ -36396,10 +36427,12 @@ define inreg <15 x i64> @bitcast_v60f16_to_v15i64_scalar(<60 x half> inreg %a, i ; SI-NEXT: v_mov_b32_e32 v40, v44 ; SI-NEXT: s_cbranch_vccnz .LBB47_5 ; SI-NEXT: ; %bb.4: ; %cmp.true +; SI-NEXT: s_waitcnt expcnt(5) ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(4) ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v59 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v58 ; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cvt_f32_f16_e32 v8, v33 @@ -37889,22 +37922,9 @@ define <60 x i16> @bitcast_v15f64_to_v60i16(<15 x double> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40 -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, 
off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -37980,6 +38000,19 @@ define <60 x i16> @bitcast_v15f64_to_v60i16(<15 x double> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -38866,7 +38899,12 @@ define inreg <60 x i16> @bitcast_v15f64_to_v60i16_scalar(<15 x double> inreg %a, ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -38883,11 +38921,6 @@ define inreg <60 x i16> @bitcast_v15f64_to_v60i16_scalar(<15 x double> inreg %a, ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB49_4: @@ -39855,53 +39888,40 @@ define <15 x double> @bitcast_v60i16_to_v15f64(<60 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:4 ; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v2 -; SI-NEXT: s_waitcnt vmcnt(3) -; 
SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:52 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:112 ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:108 -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v4 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44 ; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v6 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v8 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:104 ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:100 -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v6 -; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v8 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:36 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v10 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:96 ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:92 -; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v10 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v12 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:88 ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:84 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v12 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v14 ; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:48 @@ -39932,10 +39952,27 @@ define <15 x double> @bitcast_v60i16_to_v15f64(<60 x i16> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:56 ; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:64 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:52 +; SI-NEXT: s_waitcnt 
vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v28 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v30 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -40020,6 +40057,7 @@ define <15 x double> @bitcast_v60i16_to_v15f64(<60 x i16> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; kill: killed $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 ; SI-NEXT: v_and_b32_e32 v18, 0xffff, v61 ; SI-NEXT: ; kill: killed $vgpr30 @@ -40164,6 +40202,7 @@ define <15 x double> @bitcast_v60i16_to_v15f64(<60 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 ; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v61 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 @@ -40311,7 +40350,7 @@ define <15 x double> @bitcast_v60i16_to_v15f64(<60 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v60i16_to_v15f64: @@ -40593,6 +40632,7 @@ define <15 x double> @bitcast_v60i16_to_v15f64(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v61, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v29 +; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v28 @@ -40671,7 +40711,6 @@ define <15 x double> @bitcast_v60i16_to_v15f64(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v60 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 ; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v61 -; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -40836,9 +40875,6 @@ define <15 x double> @bitcast_v60i16_to_v15f64(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB50_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload @@ -40853,6 +40889,9 @@ define <15 x double> @bitcast_v60i16_to_v15f64(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v38, v61, s6 ; GFX9-NEXT: v_perm_b32 v1, v37, v60, s6 ; GFX9-NEXT: v_perm_b32 v2, v62, v59, s6 @@ -40871,10 +40910,6 @@ define <15 x double> @bitcast_v60i16_to_v15f64(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(14) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_perm_b32 v9, v9, v44, s6 ; GFX9-NEXT: s_waitcnt vmcnt(14) @@ -40903,6 +40938,10 @@ define <15 x double> @bitcast_v60i16_to_v15f64(<60 x i16> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v21, v21, v48, s6 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_perm_b32 v22, v22, v39, s6 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] @@ -43134,24 +43173,8 @@ define <60 x half> @bitcast_v15f64_to_v60f16(<15 x double> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v62 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, 
s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen @@ -43195,6 +43218,22 @@ define <60 x half> @bitcast_v15f64_to_v60f16(<15 x double> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -44250,6 +44289,11 @@ define inreg <60 x half> @bitcast_v15f64_to_v60f16_scalar(<15 x double> inreg %a ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v10 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -44266,11 +44310,6 @@ define inreg <60 x half> @bitcast_v15f64_to_v60f16_scalar(<15 x 
double> inreg %a ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v2, v10 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB53_4: @@ -45223,9 +45262,6 @@ define <15 x double> @bitcast_v60f16_to_v15f64(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_cvt_f16_f32_e32 v58, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:116 ; SI-NEXT: s_waitcnt expcnt(3) ; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 @@ -45254,12 +45290,23 @@ define <15 x double> @bitcast_v60f16_to_v15f64(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:76 ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:88 ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:84 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v59, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v57, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v2 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v60 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v7 @@ -45269,6 +45316,8 @@ define <15 x double> @bitcast_v60f16_to_v15f64(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v9 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v8 @@ -45290,18 +45339,9 @@ define <15 x double> @bitcast_v60f16_to_v15f64(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v14 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:96 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:92 -; 
SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:104 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:100 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:112 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v60 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v16 @@ -45311,7 +45351,6 @@ define <15 x double> @bitcast_v60f16_to_v15f64(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 @@ -45349,6 +45388,7 @@ define <15 x double> @bitcast_v60f16_to_v15f64(<60 x half> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v61 ; SI-NEXT: v_cvt_f16_f32_e32 v61, v49 +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f16_f32_e32 v49, v55 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -45363,7 +45403,6 @@ define <15 x double> @bitcast_v60f16_to_v15f64(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v32 -; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f16_f32_e32 v32, v47 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -46235,6 +46274,7 @@ define <15 x double> @bitcast_v60f16_to_v15f64(<60 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: v_mov_b32_e32 v61, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v29 +; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v28 @@ -46313,7 +46353,6 @@ define <15 x double> @bitcast_v60f16_to_v15f64(<60 x half> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v60 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 ; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v61 -; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -46478,9 +46517,6 @@ define <15 x double> @bitcast_v60f16_to_v15f64(<60 x half> %a, i32 %b) { ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB54_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; 
GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload @@ -46495,6 +46531,9 @@ define <15 x double> @bitcast_v60f16_to_v15f64(<60 x half> %a, i32 %b) { ; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v38, v61, s6 ; GFX9-NEXT: s_movk_i32 s7, 0x200 ; GFX9-NEXT: v_perm_b32 v1, v37, v60, s6 @@ -46514,10 +46553,6 @@ define <15 x double> @bitcast_v60f16_to_v15f64(<60 x half> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_f16 v6, v6, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v7, v7, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v8, v8, s7 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(14) -; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_perm_b32 v9, v9, v44, s6 ; GFX9-NEXT: s_waitcnt vmcnt(14) @@ -46546,6 +46581,10 @@ define <15 x double> @bitcast_v60f16_to_v15f64(<60 x half> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v21, v21, v48, s6 ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_perm_b32 v22, v22, v39, s6 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; GFX9-NEXT: v_pk_add_f16 v9, v9, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v10, v10, s7 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v11, v11, s7 op_sel_hi:[1,0] @@ -46866,6 +46905,50 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a ; SI-NEXT: v_cvt_f16_f32_e32 v8, s26 ; SI-NEXT: v_cvt_f16_f32_e32 v6, s29 ; SI-NEXT: v_cvt_f16_f32_e32 v7, s28 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v40 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v33 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v42 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v38 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v44 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v45 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v46 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v47 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte 
Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v56 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v57 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v58, s16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v59, s17 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v60 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill @@ -46890,55 +46973,14 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a ; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v33 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v42 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v38 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v44 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v45 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v46 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v47 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v56 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v57 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v58 -; SI-NEXT: v_cvt_f16_f32_e32 v58, s16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v59, s17 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v60 -; SI-NEXT: 
buffer_store_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB55_2 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v10, v3 +; SI-NEXT: s_waitcnt expcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v34 ; SI-NEXT: v_mov_b32_e32 v33, v32 ; SI-NEXT: v_or_b32_e32 v10, v32, v10 @@ -46963,12 +47005,12 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v59 ; SI-NEXT: v_or_b32_e32 v1, v12, v1 ; SI-NEXT: v_or_b32_e32 v2, v11, v2 @@ -47077,10 +47119,12 @@ define inreg <15 x double> @bitcast_v60f16_to_v15f64_scalar(<60 x half> inreg %a ; SI-NEXT: v_mov_b32_e32 v40, v44 ; SI-NEXT: s_cbranch_vccnz .LBB55_5 ; SI-NEXT: ; %bb.4: ; %cmp.true +; SI-NEXT: s_waitcnt expcnt(5) ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(4) ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v59 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v58 ; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cvt_f32_f16_e32 v8, v33 @@ -49257,23 +49301,7 @@ define <60 x half> @bitcast_v60i16_to_v60f16(<60 x i16> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f16_f32_e32 v2, v47 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 
offset:184 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 @@ -49311,6 +49339,22 @@ define <60 x half> @bitcast_v60i16_to_v60f16(<60 x i16> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -50812,22 +50856,6 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v55 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v50 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x6c, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: 
v_or_b32_e32 v1, v2, v1 @@ -50848,6 +50876,22 @@ define inreg <60 x half> @bitcast_v60i16_to_v60f16_scalar(<60 x i16> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -51849,27 +51893,27 @@ define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v55, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v2 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v18 ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v53, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v4 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v49, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v37, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v52, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v48, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v38, v16 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 @@ -52404,7 +52448,18 @@ define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: 
v_and_b32_e32 v1, 0xffff, v39 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload @@ -52421,17 +52476,6 @@ define <60 x i16> @bitcast_v60f16_to_v60i16(<60 x half> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v39 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -53215,8 +53259,6 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v44 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -53243,13 +53285,10 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v51, v11 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 @@ -53261,26 +53300,8 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v3, v26 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v8 -; SI-NEXT: v_mov_b32_e32 v8, v48 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v44 -; SI-NEXT: v_lshr_b64 v[44:45], v[29:30], 16 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: 
v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v43 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill @@ -53308,11 +53329,17 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v18, v3, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v37 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v16 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_mov_b32_e32 v51, v11 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v55, v5 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 @@ -53355,32 +53382,52 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v3, v6 ; SI-NEXT: v_lshr_b64 v[58:59], v[34:35], 16 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v5 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v8 +; SI-NEXT: v_mov_b32_e32 v8, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v5 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v48 -; SI-NEXT: v_mov_b32_e32 v59, v48 ; SI-NEXT: v_or_b32_e32 v6, v3, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v8 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v3, v4 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v44 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v31 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v60, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_mov_b32_e32 v59, v48 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v38 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v60 ; SI-NEXT: v_or_b32_e32 v4, v3, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v43 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshr_b64 v[47:48], v[17:18], 16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshr_b64 v[44:45], v[29:30], 16 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v1 +; SI-NEXT: 
v_cvt_f32_f16_e32 v1, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v20 ; SI-NEXT: v_cvt_f32_f16_e32 v20, v39 -; SI-NEXT: v_lshr_b64 v[47:48], v[17:18], 16 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v31, v20 @@ -53477,15 +53524,14 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i ; SI-NEXT: v_mov_b32_e32 v32, v41 ; SI-NEXT: v_lshr_b64 v[40:41], v[21:22], 16 ; SI-NEXT: v_lshr_b64 v[20:21], v[11:12], 16 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshr_b64 v[20:21], v[56:57], 16 ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: v_mov_b32_e32 v11, v24 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshr_b64 v[8:9], v[9:10], 16 ; SI-NEXT: v_mov_b32_e32 v39, v31 ; SI-NEXT: v_mov_b32_e32 v31, v60 @@ -53495,6 +53541,7 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i ; SI-NEXT: v_mov_b32_e32 v37, v55 ; SI-NEXT: v_lshr_b64 v[55:56], v[5:6], 16 ; SI-NEXT: v_lshr_b64 v[24:25], v[3:4], 16 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshr_b64 v[20:21], v[1:2], 16 ; SI-NEXT: .LBB59_3: ; %end ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v58 @@ -53615,15 +53662,15 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v63 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v36 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v13 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 @@ -53634,11 +53681,9 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v8 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v8 ; SI-NEXT: 
v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen @@ -53648,9 +53693,11 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v8 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 @@ -53675,24 +53722,8 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v24 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 @@ -53717,6 +53748,22 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:96 ; 4-byte 
Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB59_4: diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll index b6b59d809306a..30ad46d959b7e 100644 --- a/llvm/test/CodeGen/AMDGPU/bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/bf16.ll @@ -968,14 +968,14 @@ define <64 x bfloat> @v_load_global_v64bf16(ptr addrspace(1) %ptr) { ; GFX8-NEXT: v_addc_u32_e32 v25, vcc, 0, v29, vcc ; GFX8-NEXT: s_movk_i32 s4, 0x70 ; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[28:29] +; GFX8-NEXT: flat_load_dwordx4 v[12:15], v[12:13] ; GFX8-NEXT: v_add_u32_e32 v28, vcc, s4, v28 ; GFX8-NEXT: v_addc_u32_e32 v29, vcc, 0, v29, vcc ; GFX8-NEXT: flat_load_dwordx4 v[4:7], v[4:5] ; GFX8-NEXT: flat_load_dwordx4 v[8:11], v[8:9] -; GFX8-NEXT: flat_load_dwordx4 v[12:15], v[12:13] -; GFX8-NEXT: flat_load_dwordx4 v[24:27], v[24:25] ; GFX8-NEXT: flat_load_dwordx4 v[16:19], v[16:17] ; GFX8-NEXT: flat_load_dwordx4 v[20:23], v[20:21] +; GFX8-NEXT: flat_load_dwordx4 v[24:27], v[24:25] ; GFX8-NEXT: flat_load_dwordx4 v[28:31], v[28:29] ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -9552,7 +9552,6 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) { ; GFX8-NEXT: v_addc_u32_e32 v34, vcc, 0, v2, vcc ; GFX8-NEXT: v_add_u32_e32 v35, vcc, 36, v1 ; GFX8-NEXT: v_addc_u32_e32 v36, vcc, 0, v2, vcc -; GFX8-NEXT: v_add_u32_e32 v37, vcc, 38, v1 ; GFX8-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; GFX8-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill ; GFX8-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill @@ -9564,6 +9563,7 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) { ; GFX8-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; GFX8-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX8-NEXT: buffer_store_dword v58, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX8-NEXT: v_add_u32_e32 v37, vcc, 38, v1 ; GFX8-NEXT: flat_load_ushort v44, v[1:2] ; GFX8-NEXT: v_addc_u32_e32 v38, vcc, 0, v2, vcc ; GFX8-NEXT: v_add_u32_e32 v48, vcc, 40, v1 @@ -9686,17 +9686,6 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) { ; GFX8-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen ; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v51 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[15:16], v15 -; GFX8-NEXT: buffer_load_dword v58, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX8-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX8-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX8-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX8-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:16 ; 
4-byte Folded Reload -; GFX8-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX8-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX8-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX8-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX8-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX8-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload ; GFX8-NEXT: v_add_u32_e32 v17, vcc, 0xb8, v0 ; GFX8-NEXT: buffer_store_dword v14, v17, s[0:3], 0 offen ; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0xb4, v0 @@ -9827,6 +9816,17 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) { ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 8, v0 ; GFX8-NEXT: buffer_store_dword v15, v1, s[0:3], 0 offen ; GFX8-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen +; GFX8-NEXT: buffer_load_dword v58, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX8-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX8-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX8-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX8-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX8-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX8-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX8-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX8-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX8-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX8-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll index 04f8ad8a02303..68313807c427f 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll @@ -450,38 +450,23 @@ define amdgpu_kernel void @memcpy_known(ptr addrspace(7) %src, ptr addrspace(7) ; GISEL-GFX942-NEXT: v_add_u32_e32 v63, s12, v1 ; GISEL-GFX942-NEXT: v_add_u32_e32 v1, 0x100, v1 ; GISEL-GFX942-NEXT: v_cmp_lt_u32_e32 vcc, v1, v0 -; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(0) +; GISEL-GFX942-NEXT: scratch_store_dwordx4 off, a[0:3], off ; 16-byte Folded Spill ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v63, s[4:7], 0 offen -; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[6:9], v63, s[4:7], 0 offen offset:16 -; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[10:13], v63, s[4:7], 0 offen offset:32 -; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[14:17], v63, s[4:7], 0 offen offset:48 -; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[18:21], v63, s[4:7], 0 offen offset:64 -; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[22:25], v63, s[4:7], 0 offen offset:80 -; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[26:29], v63, s[4:7], 0 offen offset:96 -; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: 
buffer_store_dwordx4 v[30:33], v63, s[4:7], 0 offen offset:112 -; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[34:37], v63, s[4:7], 0 offen offset:128 -; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[38:41], v63, s[4:7], 0 offen offset:144 -; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[42:45], v63, s[4:7], 0 offen offset:160 -; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[46:49], v63, s[4:7], 0 offen offset:176 -; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[50:53], v63, s[4:7], 0 offen offset:192 -; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[54:57], v63, s[4:7], 0 offen offset:208 -; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[58:61], v63, s[4:7], 0 offen offset:224 -; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) -; GISEL-GFX942-NEXT: scratch_store_dwordx4 off, a[0:3], off ; 16-byte Folded Spill ; GISEL-GFX942-NEXT: scratch_load_dwordx4 v[2:5], off, off ; 16-byte Folded Reload ; GISEL-GFX942-NEXT: s_waitcnt vmcnt(0) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v63, s[4:7], 0 offen offset:240 @@ -991,38 +976,23 @@ define amdgpu_kernel void @memcpy_known_medium(ptr addrspace(7) %src, ptr addrsp ; GISEL-GFX942-NEXT: v_add_u32_e32 v63, s12, v1 ; GISEL-GFX942-NEXT: v_add_u32_e32 v1, 0x100, v1 ; GISEL-GFX942-NEXT: v_cmp_lt_u32_e32 vcc, v1, v0 -; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(0) +; GISEL-GFX942-NEXT: scratch_store_dwordx4 off, a[0:3], off ; 16-byte Folded Spill ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v63, s[4:7], 0 offen -; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[6:9], v63, s[4:7], 0 offen offset:16 -; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[10:13], v63, s[4:7], 0 offen offset:32 -; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[14:17], v63, s[4:7], 0 offen offset:48 -; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[18:21], v63, s[4:7], 0 offen offset:64 -; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[22:25], v63, s[4:7], 0 offen offset:80 -; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[26:29], v63, s[4:7], 0 offen offset:96 -; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[30:33], v63, s[4:7], 0 offen offset:112 -; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[34:37], v63, s[4:7], 0 offen offset:128 -; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[38:41], v63, s[4:7], 0 offen offset:144 -; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[42:45], v63, s[4:7], 0 offen offset:160 -; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[46:49], v63, s[4:7], 0 offen offset:176 -; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[50:53], v63, s[4:7], 0 offen offset:192 -; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[54:57], v63, s[4:7], 0 offen offset:208 -; GISEL-GFX942-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[58:61], v63, s[4:7], 0 offen offset:224 -; GISEL-GFX942-NEXT: s_waitcnt 
vmcnt(15) -; GISEL-GFX942-NEXT: scratch_store_dwordx4 off, a[0:3], off ; 16-byte Folded Spill ; GISEL-GFX942-NEXT: scratch_load_dwordx4 v[2:5], off, off ; 16-byte Folded Reload ; GISEL-GFX942-NEXT: s_waitcnt vmcnt(0) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v63, s[4:7], 0 offen offset:240 @@ -1189,23 +1159,24 @@ define amdgpu_kernel void @memcpy_known_small(ptr addrspace(7) %src, ptr addrspa ; SDAG-GFX1100-NEXT: s_mov_b32 s9, s12 ; SDAG-GFX1100-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX1100-NEXT: s_mov_b32 s6, s3 +; SDAG-GFX1100-NEXT: v_mov_b32_e32 v4, s0 ; SDAG-GFX1100-NEXT: s_mov_b32 s8, s1 ; SDAG-GFX1100-NEXT: s_or_b64 s[10:11], s[6:7], s[12:13] ; SDAG-GFX1100-NEXT: s_mov_b32 s13, s2 -; SDAG-GFX1100-NEXT: v_mov_b32_e32 v4, s0 +; SDAG-GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; SDAG-GFX1100-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13] +; SDAG-GFX1100-NEXT: buffer_load_b128 v[0:3], v4, s[8:11], 0 offen ; SDAG-GFX1100-NEXT: s_clause 0x1 ; SDAG-GFX1100-NEXT: s_load_b32 s13, s[4:5], 0x54 ; SDAG-GFX1100-NEXT: s_load_b128 s[0:3], s[4:5], 0x44 ; SDAG-GFX1100-NEXT: s_mov_b32 s5, s12 ; SDAG-GFX1100-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX1100-NEXT: v_mov_b32_e32 v5, s0 -; SDAG-GFX1100-NEXT: buffer_load_b128 v[0:3], v4, s[8:11], 0 offen ; SDAG-GFX1100-NEXT: s_mov_b32 s4, s3 -; SDAG-GFX1100-NEXT: s_mov_b32 s3, s12 +; SDAG-GFX1100-NEXT: v_mov_b32_e32 v5, s0 ; SDAG-GFX1100-NEXT: s_or_b64 s[6:7], s[4:5], s[12:13] ; SDAG-GFX1100-NEXT: s_mov_b32 s13, s2 ; SDAG-GFX1100-NEXT: s_mov_b32 s2, s1 +; SDAG-GFX1100-NEXT: s_mov_b32 s3, s12 ; SDAG-GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; SDAG-GFX1100-NEXT: s_or_b64 s[4:5], s[2:3], s[12:13] ; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(0) @@ -1249,12 +1220,12 @@ define amdgpu_kernel void @memcpy_known_small(ptr addrspace(7) %src, ptr addrspa ; GISEL-GFX1100-NEXT: s_mov_b32 s8, s1 ; GISEL-GFX1100-NEXT: s_mov_b32 s9, s2 ; GISEL-GFX1100-NEXT: s_mov_b32 s10, s3 +; GISEL-GFX1100-NEXT: buffer_load_b128 v[0:3], v4, s[8:11], 0 offen ; GISEL-GFX1100-NEXT: s_clause 0x1 ; GISEL-GFX1100-NEXT: s_load_b128 s[0:3], s[4:5], 0x44 ; GISEL-GFX1100-NEXT: s_load_b32 s7, s[4:5], 0x54 ; GISEL-GFX1100-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX1100-NEXT: v_mov_b32_e32 v5, s0 -; GISEL-GFX1100-NEXT: buffer_load_b128 v[0:3], v4, s[8:11], 0 offen ; GISEL-GFX1100-NEXT: s_mov_b32 s4, s1 ; GISEL-GFX1100-NEXT: s_mov_b32 s5, s2 ; GISEL-GFX1100-NEXT: s_mov_b32 s6, s3 diff --git a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll index 832e43f1e1973..8e12e7e03947b 100644 --- a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll +++ b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll @@ -4253,7 +4253,6 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 { ; VI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 ; VI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:96 ; VI-NEXT: buffer_load_dwordx4 v[28:31], off, s[4:7], 0 offset:112 ; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 ; VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16 @@ -4261,6 +4260,7 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 { ; VI-NEXT: buffer_load_dwordx4 v[12:15], off, s[4:7], 0 offset:48 ; VI-NEXT: buffer_load_dwordx4 v[16:19], off, s[4:7], 0 offset:64 ; VI-NEXT: buffer_load_dwordx4 v[20:23], off, s[4:7], 0 offset:80 +; VI-NEXT: buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:96 ; VI-NEXT: s_mov_b32 s38, -1 ; VI-NEXT: s_mov_b32 s39, 0xe80000 ; 
VI-NEXT: s_add_u32 s36, s36, s3 @@ -4272,7 +4272,7 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 { ; VI-NEXT: s_add_u32 s8, s8, external_void_func_v32i32@rel32@lo+4 ; VI-NEXT: s_addc_u32 s9, s9, external_void_func_v32i32@rel32@hi+12 ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] -; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: buffer_store_dword v31, off, s[36:39], s32 ; VI-NEXT: s_swappc_b64 s[30:31], s[8:9] ; VI-NEXT: s_endpgm @@ -4285,7 +4285,6 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 { ; CI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 ; CI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:96 ; CI-NEXT: buffer_load_dwordx4 v[28:31], off, s[4:7], 0 offset:112 ; CI-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 ; CI-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16 @@ -4293,6 +4292,7 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 { ; CI-NEXT: buffer_load_dwordx4 v[12:15], off, s[4:7], 0 offset:48 ; CI-NEXT: buffer_load_dwordx4 v[16:19], off, s[4:7], 0 offset:64 ; CI-NEXT: buffer_load_dwordx4 v[20:23], off, s[4:7], 0 offset:80 +; CI-NEXT: buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:96 ; CI-NEXT: s_mov_b32 s38, -1 ; CI-NEXT: s_mov_b32 s39, 0xe8f000 ; CI-NEXT: s_add_u32 s36, s36, s3 @@ -4304,7 +4304,7 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 { ; CI-NEXT: s_add_u32 s8, s8, external_void_func_v32i32@rel32@lo+4 ; CI-NEXT: s_addc_u32 s9, s9, external_void_func_v32i32@rel32@hi+12 ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] -; CI-NEXT: s_waitcnt vmcnt(6) +; CI-NEXT: s_waitcnt vmcnt(7) ; CI-NEXT: buffer_store_dword v31, off, s[36:39], s32 ; CI-NEXT: s_swappc_b64 s[30:31], s[8:9] ; CI-NEXT: s_endpgm @@ -4317,7 +4317,6 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 { ; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 ; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:96 ; GFX9-NEXT: buffer_load_dwordx4 v[28:31], off, s[4:7], 0 offset:112 ; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 ; GFX9-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16 @@ -4325,6 +4324,7 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 { ; GFX9-NEXT: buffer_load_dwordx4 v[12:15], off, s[4:7], 0 offset:48 ; GFX9-NEXT: buffer_load_dwordx4 v[16:19], off, s[4:7], 0 offset:64 ; GFX9-NEXT: buffer_load_dwordx4 v[20:23], off, s[4:7], 0 offset:80 +; GFX9-NEXT: buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:96 ; GFX9-NEXT: s_mov_b32 s38, -1 ; GFX9-NEXT: s_mov_b32 s39, 0xe00000 ; GFX9-NEXT: s_add_u32 s36, s36, s3 @@ -4336,7 +4336,7 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 { ; GFX9-NEXT: s_add_u32 s8, s8, external_void_func_v32i32@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s9, s9, external_void_func_v32i32@rel32@hi+12 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: s_waitcnt vmcnt(7) ; GFX9-NEXT: buffer_store_dword v31, off, s[36:39], s32 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[8:9] ; GFX9-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/ds_write2.ll b/llvm/test/CodeGen/AMDGPU/ds_write2.ll index 5cc68451d5ab7..0cae0e51107df 100644 --- a/llvm/test/CodeGen/AMDGPU/ds_write2.ll +++ b/llvm/test/CodeGen/AMDGPU/ds_write2.ll @@ -851,12 +851,12 @@ define amdgpu_kernel void 
@unaligned_offset_simple_write2_one_val_f64(ptr addrsp ; CI-NEXT: v_add_i32_e32 v0, vcc, s4, v0 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: ds_write_b8 v0, v1 offset:9 -; CI-NEXT: ds_write_b8 v0, v2 offset:13 ; CI-NEXT: v_lshrrev_b32_e32 v3, 24, v1 ; CI-NEXT: ds_write_b8 v0, v1 offset:5 ; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 ; CI-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; CI-NEXT: ds_write_b8 v0, v1 offset:9 +; CI-NEXT: ds_write_b8 v0, v2 offset:13 ; CI-NEXT: v_lshrrev_b32_e32 v1, 24, v2 ; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 ; CI-NEXT: v_lshrrev_b32_e32 v2, 8, v2 diff --git a/llvm/test/CodeGen/AMDGPU/ds_write2_a_v.ll b/llvm/test/CodeGen/AMDGPU/ds_write2_a_v.ll index a4b3a8544dede..683887b0a55f3 100644 --- a/llvm/test/CodeGen/AMDGPU/ds_write2_a_v.ll +++ b/llvm/test/CodeGen/AMDGPU/ds_write2_a_v.ll @@ -476,6 +476,7 @@ define void @ds_write2_b32_av_av_no_vgprs(ptr addrspace(3) %lds) #0 { ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: ds_write2_b32 v0, v1, v2 offset0:10 offset1:24 ; GCN-NEXT: v_accvgpr_write_b32 a31, v19 ; Reload Reuse ; GCN-NEXT: v_accvgpr_write_b32 a30, v20 ; Reload Reuse ; GCN-NEXT: v_accvgpr_write_b32 a29, v21 ; Reload Reuse @@ -488,7 +489,6 @@ define void @ds_write2_b32_av_av_no_vgprs(ptr addrspace(3) %lds) #0 { ; GCN-NEXT: v_accvgpr_write_b32 a22, v28 ; Reload Reuse ; GCN-NEXT: v_accvgpr_write_b32 a21, v29 ; Reload Reuse ; GCN-NEXT: v_accvgpr_write_b32 a20, v30 ; Reload Reuse -; GCN-NEXT: ds_write2_b32 v0, v1, v2 offset0:10 offset1:24 ; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -1029,6 +1029,7 @@ define void @ds_write2_b64_av_av_no_vgprs(ptr addrspace(3) %lds) #0 { ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v3, a3 ; GCN-NEXT: v_accvgpr_read_b32 v5, a5 +; GCN-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset0:10 offset1:24 ; GCN-NEXT: v_accvgpr_write_b32 a31, v21 ; Reload Reuse ; GCN-NEXT: v_accvgpr_write_b32 a30, v22 ; Reload Reuse ; GCN-NEXT: v_accvgpr_write_b32 a29, v23 ; Reload Reuse @@ -1039,7 +1040,6 @@ define void @ds_write2_b64_av_av_no_vgprs(ptr addrspace(3) %lds) #0 { ; GCN-NEXT: v_accvgpr_write_b32 a24, v28 ; Reload Reuse ; GCN-NEXT: v_accvgpr_write_b32 a23, v29 ; Reload Reuse ; GCN-NEXT: v_accvgpr_write_b32 a22, v30 ; Reload Reuse -; GCN-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset0:10 offset1:24 ; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll index da08f4fcf8f3d..5fb50d0d89530 100644 --- a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll @@ -3755,44 +3755,42 @@ define <64 x half> @v_test_canonicalize_var_v64f16(<64 x half> %val) #1 { ; CI-NEXT: v_lshlrev_b32_e32 v10, 16, v13 ; CI-NEXT: v_cvt_f16_f32_e32 v13, v22 ; CI-NEXT: v_or_b32_e32 v10, v14, v10 -; CI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:4 -; CI-NEXT: buffer_load_dword v15, off, s[0:3], s32 ; CI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; CI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; CI-NEXT: v_or_b32_e32 v17, v18, v17 +; CI-NEXT: 
v_cvt_f32_f16_e32 v13, v13 ; CI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:16 ; CI-NEXT: v_cvt_f16_f32_e32 v22, v27 -; CI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; CI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; CI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; CI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; CI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; CI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; CI-NEXT: v_or_b32_e32 v13, v16, v13 ; CI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:12 ; CI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; CI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; CI-NEXT: v_or_b32_e32 v19, v20, v19 ; CI-NEXT: v_lshlrev_b32_e32 v20, 16, v21 ; CI-NEXT: v_cvt_f16_f32_e32 v21, v30 ; CI-NEXT: v_or_b32_e32 v20, v22, v20 ; CI-NEXT: v_cvt_f16_f32_e32 v22, v29 -; CI-NEXT: s_waitcnt vmcnt(8) +; CI-NEXT: s_waitcnt vmcnt(6) ; CI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; CI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; CI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; CI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; CI-NEXT: s_waitcnt vmcnt(7) -; CI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; CI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; CI-NEXT: s_waitcnt vmcnt(5) +; CI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; CI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; CI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; CI-NEXT: v_cvt_f32_f16_e32 v12, v12 ; CI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; CI-NEXT: v_cvt_f32_f16_e32 v12, v12 ; CI-NEXT: v_or_b32_e32 v21, v22, v21 ; CI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; CI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; CI-NEXT: s_waitcnt vmcnt(5) +; CI-NEXT: s_waitcnt vmcnt(3) ; CI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; CI-NEXT: s_waitcnt vmcnt(4) +; CI-NEXT: s_waitcnt vmcnt(2) ; CI-NEXT: v_cvt_f16_f32_e32 v32, v32 ; CI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; CI-NEXT: v_cvt_f32_f16_e32 v32, v32 @@ -3804,27 +3802,6 @@ define <64 x half> @v_test_canonicalize_var_v64f16(<64 x half> %val) #1 { ; CI-NEXT: buffer_store_dword v31, v32, s[0:3], 0 offen ; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:116 ; CI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:112 -; CI-NEXT: s_waitcnt vmcnt(6) -; CI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; CI-NEXT: s_waitcnt vmcnt(5) -; CI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; CI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; CI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; CI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; CI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; CI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; CI-NEXT: v_or_b32_e32 v14, v15, v14 -; CI-NEXT: s_waitcnt vmcnt(3) -; CI-NEXT: v_cvt_f16_f32_e32 v15, v16 -; CI-NEXT: v_cvt_f16_f32_e32 v16, v18 -; CI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; CI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; CI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; CI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; CI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; CI-NEXT: v_or_b32_e32 v12, v12, v15 -; CI-NEXT: v_add_i32_e32 v15, vcc, 0x44, v0 -; CI-NEXT: v_or_b32_e32 v11, v16, v11 ; CI-NEXT: s_waitcnt vmcnt(1) ; CI-NEXT: v_cvt_f16_f32_e32 v31, v31 ; CI-NEXT: s_waitcnt vmcnt(0) @@ -3991,6 +3968,28 @@ define <64 x half> @v_test_canonicalize_var_v64f16(<64 x half> %val) #1 { ; CI-NEXT: v_or_b32_e32 v31, v32, v31 ; CI-NEXT: v_add_i32_e32 v32, vcc, 0x48, v0 ; CI-NEXT: buffer_store_dword v31, v32, s[0:3], 0 offen +; CI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:4 +; CI-NEXT: buffer_load_dword v15, off, s[0:3], s32 +; CI-NEXT: s_waitcnt vmcnt(1) +; CI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; CI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; CI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; CI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; CI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; 
CI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; CI-NEXT: v_or_b32_e32 v14, v15, v14 +; CI-NEXT: v_cvt_f16_f32_e32 v15, v16 +; CI-NEXT: v_cvt_f16_f32_e32 v16, v18 +; CI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; CI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; CI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; CI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; CI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; CI-NEXT: v_or_b32_e32 v12, v12, v15 +; CI-NEXT: v_or_b32_e32 v11, v16, v11 +; CI-NEXT: v_add_i32_e32 v15, vcc, 0x44, v0 ; CI-NEXT: buffer_store_dword v11, v15, s[0:3], 0 offen ; CI-NEXT: v_add_i32_e32 v11, vcc, 64, v0 ; CI-NEXT: buffer_store_dword v12, v11, s[0:3], 0 offen diff --git a/llvm/test/CodeGen/AMDGPU/finalizebundle.mir b/llvm/test/CodeGen/AMDGPU/finalizebundle.mir index 590d69b8eb869..279f4298e6418 100644 --- a/llvm/test/CodeGen/AMDGPU/finalizebundle.mir +++ b/llvm/test/CodeGen/AMDGPU/finalizebundle.mir @@ -1,19 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 # RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -passes=finalizebundle-test %s -o - | FileCheck %s ---- | - - @foo = addrspace(3) global i32 poison - - define void @test_overlap() { unreachable } - define void @test_dead_redef() { unreachable } - define void @test_tied() { unreachable } - define void @test_mmo_merge1() { unreachable } - define void @test_mmo_merge2() { unreachable } - define void @test_mmo_drop() { unreachable } - -... - --- name: test_overlap body: | @@ -60,42 +47,3 @@ body: | %1:vgpr_32 = COPY %0:vgpr_32 %2:vgpr_32 = V_FMAC_F16_e32 %1, %1, %0, implicit $mode, implicit $exec ... - ---- -name: test_mmo_merge1 -body: | - bb.0: - ; CHECK-LABEL: name: test_mmo_merge1 - ; CHECK: BUNDLE implicit-def %0, implicit %1:vgpr_32, implicit $exec :: (store (s32) into @foo, addrspace 3) { - ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY %1:vgpr_32 - ; CHECK-NEXT: DS_WRITE_B32_gfx9 %1:vgpr_32, internal [[COPY]], 0, 0, implicit $exec :: (store (s32) into @foo, addrspace 3) - ; CHECK-NEXT: } - %1:vgpr_32 = COPY %0:vgpr_32 - DS_WRITE_B32_gfx9 %0, %1, 0, 0, implicit $exec :: (store (s32) into @foo, addrspace 3) -... - ---- -name: test_mmo_merge2 -body: | - bb.0: - ; CHECK-LABEL: name: test_mmo_merge2 - ; CHECK: BUNDLE implicit %0:vgpr_32, implicit %1:vgpr_32, implicit $exec :: (store (s32) into @foo, addrspace 3), (store (s32) into @foo + 4, addrspace 3) { - ; CHECK-NEXT: DS_WRITE_B32_gfx9 %0:vgpr_32, %1:vgpr_32, 0, 0, implicit $exec :: (store (s32) into @foo, addrspace 3) - ; CHECK-NEXT: DS_WRITE_B32_gfx9 %0:vgpr_32, %1:vgpr_32, 4, 0, implicit $exec :: (store (s32) into @foo + 4, addrspace 3) - ; CHECK-NEXT: } - DS_WRITE_B32_gfx9 %0:vgpr_32, %1:vgpr_32, 0, 0, implicit $exec :: (store (s32) into @foo, addrspace 3) - DS_WRITE_B32_gfx9 %0:vgpr_32, %1:vgpr_32, 4, 0, implicit $exec :: (store (s32) into @foo + 4, addrspace 3) -... - ---- -name: test_mmo_drop -body: | - bb.0: - ; CHECK-LABEL: name: test_mmo_drop - ; CHECK: BUNDLE implicit %0:vgpr_32, implicit %1:vgpr_32, implicit $exec { - ; CHECK-NEXT: DS_WRITE_B32_gfx9 %0:vgpr_32, %1:vgpr_32, 0, 0, implicit $exec :: (store (s32) into @foo, addrspace 3) - ; CHECK-NEXT: DS_WRITE_B32_gfx9 %0:vgpr_32, %1:vgpr_32, 4, 0, implicit $exec - ; CHECK-NEXT: } - DS_WRITE_B32_gfx9 %0:vgpr_32, %1:vgpr_32, 0, 0, implicit $exec :: (store (s32) into @foo, addrspace 3) - DS_WRITE_B32_gfx9 %0:vgpr_32, %1:vgpr_32, 4, 0, implicit $exec -... 
diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll index d43c6ba322619..b750d28ffa7d3 100644 --- a/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll +++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll @@ -807,7 +807,7 @@ define amdgpu_gfx void @call_100xi32() #0 { ; GFX10-NEXT: buffer_store_dword v95, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: v_writelane_b32 v100, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: s_clause 0x1f ; 128-byte Folded Reload +; GFX10-NEXT: s_clause 0x1f ; GFX10-NEXT: buffer_load_dword v95, off, s[0:3], s33 ; GFX10-NEXT: buffer_load_dword v94, off, s[0:3], s33 offset:4 ; GFX10-NEXT: buffer_load_dword v93, off, s[0:3], s33 offset:8 @@ -863,7 +863,7 @@ define amdgpu_gfx void @call_100xi32() #0 { ; GFX11-NEXT: s_mov_b32 s1, return_100xi32@abs32@hi ; GFX11-NEXT: s_mov_b32 s0, return_100xi32@abs32@lo ; GFX11-NEXT: s_addk_i32 s32, 0x90 -; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Spill +; GFX11-NEXT: s_clause 0x1f ; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:124 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:120 ; GFX11-NEXT: scratch_store_b32 off, v42, s33 offset:116 @@ -898,7 +898,7 @@ define amdgpu_gfx void @call_100xi32() #0 { ; GFX11-NEXT: scratch_store_b32 off, v95, s33 ; GFX11-NEXT: v_writelane_b32 v100, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX11-NEXT: s_clause 0x1f ; 128-byte Folded Reload +; GFX11-NEXT: s_clause 0x1f ; GFX11-NEXT: scratch_load_b32 v95, off, s33 ; GFX11-NEXT: scratch_load_b32 v94, off, s33 offset:4 ; GFX11-NEXT: scratch_load_b32 v93, off, s33 offset:8 @@ -2416,6 +2416,7 @@ define amdgpu_gfx <72 x i32> @return_72xi32(<72 x i32> %val) #1 { ; GFX10-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:148 ; GFX10-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:152 ; GFX10-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:156 +; GFX10-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen offset:104 ; GFX10-NEXT: buffer_store_dword v26, v0, s[0:3], 0 offen offset:100 ; GFX10-NEXT: buffer_store_dword v25, v0, s[0:3], 0 offen offset:96 ; GFX10-NEXT: buffer_store_dword v24, v0, s[0:3], 0 offen offset:92 @@ -2458,7 +2459,7 @@ define amdgpu_gfx <72 x i32> @return_72xi32(<72 x i32> %val) #1 { ; GFX10-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:16 ; GFX10-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 ; GFX10-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 -; GFX10-NEXT: s_clause 0x7 +; GFX10-NEXT: s_clause 0x8 ; GFX10-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:28 ; GFX10-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:24 ; GFX10-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:20 @@ -2467,7 +2468,6 @@ define amdgpu_gfx <72 x i32> @return_72xi32(<72 x i32> %val) #1 { ; GFX10-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:8 ; GFX10-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:4 ; GFX10-NEXT: buffer_load_dword v10, off, s[0:3], s32 -; GFX10-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen offset:104 ; GFX10-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:160 ; GFX10-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -2518,7 +2518,7 @@ define amdgpu_gfx <72 x i32> @return_72xi32(<72 x i32> %val) #1 { ; GFX11-LABEL: return_72xi32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_clause 0xc ; 52-byte Folded Spill +; GFX11-NEXT: s_clause 0xc ; GFX11-NEXT: 
scratch_store_b32 off, v40, s32 offset:212 ; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:208 ; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:204 @@ -2551,23 +2551,23 @@ define amdgpu_gfx <72 x i32> @return_72xi32(<72 x i32> %val) #1 { ; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:96 ; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:92 ; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:88 -; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: scratch_store_b128 v0, v[21:24], off offset:80 -; GFX11-NEXT: scratch_store_b128 v0, v[17:20], off offset:64 -; GFX11-NEXT: s_clause 0x5 +; GFX11-NEXT: s_clause 0x2 ; GFX11-NEXT: scratch_load_b32 v23, off, s32 offset:112 ; GFX11-NEXT: scratch_load_b32 v22, off, s32 offset:108 ; GFX11-NEXT: scratch_load_b32 v21, off, s32 offset:104 +; GFX11-NEXT: scratch_store_b128 v0, v[17:20], off offset:64 +; GFX11-NEXT: s_clause 0x2 ; GFX11-NEXT: scratch_load_b32 v19, off, s32 offset:128 ; GFX11-NEXT: scratch_load_b32 v18, off, s32 offset:124 ; GFX11-NEXT: scratch_load_b32 v17, off, s32 offset:120 -; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: scratch_store_b128 v0, v[13:16], off offset:48 -; GFX11-NEXT: scratch_store_b128 v0, v[9:12], off offset:32 -; GFX11-NEXT: s_clause 0x10 +; GFX11-NEXT: s_clause 0x2 ; GFX11-NEXT: scratch_load_b32 v15, off, s32 offset:144 ; GFX11-NEXT: scratch_load_b32 v14, off, s32 offset:140 ; GFX11-NEXT: scratch_load_b32 v13, off, s32 offset:136 +; GFX11-NEXT: scratch_store_b128 v0, v[9:12], off offset:32 +; GFX11-NEXT: s_clause 0xd ; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:160 ; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:156 ; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:152 @@ -2608,7 +2608,7 @@ define amdgpu_gfx <72 x i32> @return_72xi32(<72 x i32> %val) #1 { ; GFX11-NEXT: scratch_store_b128 v0, v[25:28], off offset:96 ; GFX11-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 ; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off -; GFX11-NEXT: s_clause 0xc ; 52-byte Folded Reload +; GFX11-NEXT: s_clause 0xc ; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:164 ; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:168 ; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:172 @@ -2641,6 +2641,21 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX9-NEXT: s_mov_b32 s34, s32 ; GFX9-NEXT: s_add_i32 s32, s32, 0x28000 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:4 ; 
4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8 @@ -2718,21 +2733,6 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX9-NEXT: v_mov_b32_e32 v29, 0 ; GFX9-NEXT: v_mov_b32_e32 v30, 0 ; GFX9-NEXT: v_mov_b32_e32 v31, 0 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: v_writelane_b32 v63, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[36:37] ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:636 @@ -2914,7 +2914,21 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX10-NEXT: s_mov_b32 s38, s34 ; GFX10-NEXT: s_mov_b32 s34, s32 ; GFX10-NEXT: s_add_i32 s32, s32, 0x14000 -; GFX10-NEXT: v_writelane_b32 v63, s30, 0 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v62, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; GFX10-NEXT: 
buffer_store_dword v0, off, s[0:3], s32 offset:8 @@ -2957,11 +2971,12 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; GFX10-NEXT: v_lshrrev_b32_e64 v0, 5, s33 +; GFX10-NEXT: v_writelane_b32 v63, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_mov_b32_e32 v3, 0 -; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: v_add_nc_u32_e32 v0, 0x200, v0 +; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: v_mov_b32_e32 v5, 0 ; GFX10-NEXT: v_mov_b32_e32 v6, 0 ; GFX10-NEXT: v_mov_b32_e32 v7, 0 @@ -2991,24 +3006,9 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX10-NEXT: v_mov_b32_e32 v31, 0 ; GFX10-NEXT: s_mov_b32 s37, return_72xi32@abs32@hi ; GFX10-NEXT: s_mov_b32 s36, return_72xi32@abs32@lo -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v62, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: v_writelane_b32 v63, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[36:37] -; GFX10-NEXT: s_clause 0x3e +; GFX10-NEXT: s_clause 0x28 ; GFX10-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:636 ; GFX10-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:640 ; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s33 offset:644 @@ -3050,29 +3050,7 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX10-NEXT: buffer_load_dword v7, off, s[0:3], s33 offset:792 ; GFX10-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:796 ; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:516 -; GFX10-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:548 -; GFX10-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:552 -; GFX10-NEXT: buffer_load_dword v12, off, s[0:3], s33 offset:556 -; GFX10-NEXT: buffer_load_dword v13, off, s[0:3], s33 offset:560 -; GFX10-NEXT: buffer_load_dword v14, off, s[0:3], s33 offset:564 -; GFX10-NEXT: buffer_load_dword v15, off, s[0:3], s33 offset:568 -; GFX10-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:572 -; GFX10-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:576 -; GFX10-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:580 -; GFX10-NEXT: buffer_load_dword v19, off, s[0:3], s33 offset:584 -; GFX10-NEXT: buffer_load_dword v20, off, s[0:3], s33 offset:588 -; GFX10-NEXT: buffer_load_dword v21, off, s[0:3], s33 offset:592 
-; GFX10-NEXT: buffer_load_dword v22, off, s[0:3], s33 offset:596 -; GFX10-NEXT: buffer_load_dword v23, off, s[0:3], s33 offset:600 -; GFX10-NEXT: buffer_load_dword v24, off, s[0:3], s33 offset:604 -; GFX10-NEXT: buffer_load_dword v25, off, s[0:3], s33 offset:608 -; GFX10-NEXT: buffer_load_dword v26, off, s[0:3], s33 offset:612 -; GFX10-NEXT: buffer_load_dword v27, off, s[0:3], s33 offset:616 -; GFX10-NEXT: buffer_load_dword v28, off, s[0:3], s33 offset:620 -; GFX10-NEXT: buffer_load_dword v29, off, s[0:3], s33 offset:624 -; GFX10-NEXT: buffer_load_dword v30, off, s[0:3], s33 offset:628 -; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s33 offset:632 -; GFX10-NEXT: s_waitcnt vmcnt(22) +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:1536 ; 4-byte Folded Spill ; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:520 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -3095,6 +3073,29 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:544 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:1564 ; 4-byte Folded Spill +; GFX10-NEXT: s_clause 0x15 +; GFX10-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:548 +; GFX10-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:552 +; GFX10-NEXT: buffer_load_dword v12, off, s[0:3], s33 offset:556 +; GFX10-NEXT: buffer_load_dword v13, off, s[0:3], s33 offset:560 +; GFX10-NEXT: buffer_load_dword v14, off, s[0:3], s33 offset:564 +; GFX10-NEXT: buffer_load_dword v15, off, s[0:3], s33 offset:568 +; GFX10-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:572 +; GFX10-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:576 +; GFX10-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:580 +; GFX10-NEXT: buffer_load_dword v19, off, s[0:3], s33 offset:584 +; GFX10-NEXT: buffer_load_dword v20, off, s[0:3], s33 offset:588 +; GFX10-NEXT: buffer_load_dword v21, off, s[0:3], s33 offset:592 +; GFX10-NEXT: buffer_load_dword v22, off, s[0:3], s33 offset:596 +; GFX10-NEXT: buffer_load_dword v23, off, s[0:3], s33 offset:600 +; GFX10-NEXT: buffer_load_dword v24, off, s[0:3], s33 offset:604 +; GFX10-NEXT: buffer_load_dword v25, off, s[0:3], s33 offset:608 +; GFX10-NEXT: buffer_load_dword v26, off, s[0:3], s33 offset:612 +; GFX10-NEXT: buffer_load_dword v27, off, s[0:3], s33 offset:616 +; GFX10-NEXT: buffer_load_dword v28, off, s[0:3], s33 offset:620 +; GFX10-NEXT: buffer_load_dword v29, off, s[0:3], s33 offset:624 +; GFX10-NEXT: buffer_load_dword v30, off, s[0:3], s33 offset:628 +; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s33 offset:632 ; GFX10-NEXT: v_mov_b32_e32 v0, 24 ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; GFX10-NEXT: buffer_store_dword v9, off, s[0:3], s32 @@ -3137,7 +3138,7 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX10-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:152 ; GFX10-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:156 ; GFX10-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:160 -; GFX10-NEXT: s_clause 0x7 ; 32-byte Folded Reload +; GFX10-NEXT: s_clause 0x7 ; GFX10-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:1536 ; GFX10-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:1540 ; GFX10-NEXT: buffer_load_dword v4, off, s[0:3], s33 offset:1544 @@ -3150,7 +3151,7 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX10-NEXT: v_mov_b32_e32 v1, 42 ; GFX10-NEXT: v_add_nc_u32_e32 v0, 0x400, v0 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[36:37] -; GFX10-NEXT: s_clause 0xe 
; 60-byte Folded Reload +; GFX10-NEXT: s_clause 0xe ; GFX10-NEXT: buffer_load_dword v62, off, s[0:3], s33 ; GFX10-NEXT: buffer_load_dword v61, off, s[0:3], s33 offset:4 ; GFX10-NEXT: buffer_load_dword v60, off, s[0:3], s33 offset:8 @@ -3198,7 +3199,7 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX11-NEXT: s_mov_b32 s36, s34 ; GFX11-NEXT: s_mov_b32 s34, s32 ; GFX11-NEXT: s_addk_i32 s32, 0xa00 -; GFX11-NEXT: s_clause 0xb ; 48-byte Folded Spill +; GFX11-NEXT: s_clause 0xb ; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:44 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:40 ; GFX11-NEXT: scratch_store_b32 off, v42, s33 offset:36 @@ -3340,18 +3341,18 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX11-NEXT: s_add_i32 s2, s32, 16 ; GFX11-NEXT: v_mov_b32_e32 v30, v46 ; GFX11-NEXT: scratch_store_b128 off, v[32:35], s2 -; GFX11-NEXT: s_clause 0x3 ; 64-byte Folded Reload +; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: scratch_load_b128 v[1:4], off, s33 offset:1584 ; GFX11-NEXT: scratch_load_b128 v[17:20], off, s33 offset:1568 ; GFX11-NEXT: scratch_load_b128 v[21:24], off, s33 offset:1552 ; GFX11-NEXT: scratch_load_b128 v[25:28], off, s33 offset:1536 -; GFX11-NEXT: scratch_load_b128 v[1:4], off, s33 offset:1584 ; GFX11-NEXT: s_add_i32 s2, s33, 0x400 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v31, v47 :: v_dual_mov_b32 v0, s2 -; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_waitcnt vmcnt(3) ; GFX11-NEXT: v_mov_b32_e32 v1, 42 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX11-NEXT: s_clause 0xb ; 48-byte Folded Reload +; GFX11-NEXT: s_clause 0xb ; GFX11-NEXT: scratch_load_b32 v59, off, s33 ; GFX11-NEXT: scratch_load_b32 v58, off, s33 offset:4 ; GFX11-NEXT: scratch_load_b32 v57, off, s33 offset:8 diff --git a/llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll b/llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll index 93d7eeb085107..f80716939f618 100644 --- a/llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll +++ b/llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll @@ -255,11 +255,11 @@ define i64 @test_v16i64_load_store(ptr addrspace(1) %ptr_a, ptr addrspace(1) %pt ; GCN-SDAG-NEXT: global_load_b128 v[26:29], v[0:1], off offset:16 ; GCN-SDAG-NEXT: global_load_b128 v[30:33], v[0:1], off ; GCN-SDAG-NEXT: global_load_b128 v[34:37], v[0:1], off offset:64 +; GCN-SDAG-NEXT: v_mov_b64_e32 v[2:3], 0x70 ; GCN-SDAG-NEXT: v_mov_b64_e32 v[48:49], 48 +; GCN-SDAG-NEXT: v_mov_b64_e32 v[38:39], 0x60 ; GCN-SDAG-NEXT: v_mov_b64_e32 v[50:51], 32 -; GCN-SDAG-NEXT: v_mov_b64_e32 v[2:3], 0x70 ; GCN-SDAG-NEXT: v_mov_b64_e32 v[64:65], 16 -; GCN-SDAG-NEXT: v_mov_b64_e32 v[38:39], 0x60 ; GCN-SDAG-NEXT: v_mov_b64_e32 v[66:67], 0 ; GCN-SDAG-NEXT: v_mov_b64_e32 v[52:53], 0x50 ; GCN-SDAG-NEXT: v_mov_b64_e32 v[54:55], 64 diff --git a/llvm/test/CodeGen/AMDGPU/hard-clauses-img-gfx11.mir b/llvm/test/CodeGen/AMDGPU/hard-clauses-img-gfx11.mir index 03b56cad85dac..7e1055b2a28a4 100644 --- a/llvm/test/CodeGen/AMDGPU/hard-clauses-img-gfx11.mir +++ b/llvm/test/CodeGen/AMDGPU/hard-clauses-img-gfx11.mir @@ -11,7 +11,7 @@ body: | ; CHECK-LABEL: name: mimg_nsa ; CHECK: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: BUNDLE implicit-def $vgpr10_vgpr11_vgpr12, implicit-def $vgpr20_vgpr21_vgpr22, implicit $vgpr3, implicit $vgpr8, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec :: (load (s128)) { + ; CHECK-NEXT: BUNDLE 
implicit-def $vgpr10_vgpr11_vgpr12, implicit-def $vgpr20_vgpr21_vgpr22, implicit $vgpr3, implicit $vgpr8, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec { ; CHECK-NEXT: S_CLAUSE 1 ; CHECK-NEXT: $vgpr10_vgpr11_vgpr12 = IMAGE_SAMPLE_LZ_V3_V2_nsa_gfx11 $vgpr3, $vgpr8, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128)) ; CHECK-NEXT: $vgpr20_vgpr21_vgpr22 = IMAGE_SAMPLE_LZ_V3_V2_nsa_gfx11 $vgpr3, $vgpr8, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 14, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128)) @@ -29,7 +29,7 @@ body: | ; CHECK-LABEL: name: mimg_nsa_mixed ; CHECK: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: BUNDLE implicit-def $vgpr10, implicit-def $vgpr14, implicit-def $vgpr20_vgpr21_vgpr22, implicit $vgpr3, implicit $vgpr8, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec, implicit $vgpr5_vgpr6 :: (load (s128)), (dereferenceable load (s128), addrspace 7) { + ; CHECK-NEXT: BUNDLE implicit-def $vgpr10, implicit-def $vgpr14, implicit-def $vgpr20_vgpr21_vgpr22, implicit $vgpr3, implicit $vgpr8, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec, implicit $vgpr5_vgpr6 { ; CHECK-NEXT: S_CLAUSE 2 ; CHECK-NEXT: $vgpr10 = IMAGE_SAMPLE_LZ_V1_V2_nsa_gfx11 $vgpr3, $vgpr8, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128)) ; CHECK-NEXT: $vgpr14 = IMAGE_SAMPLE_LZ_V1_V2_gfx11 $vgpr5_vgpr6, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), addrspace 7) diff --git a/llvm/test/CodeGen/AMDGPU/hard-clauses-img-gfx12.mir b/llvm/test/CodeGen/AMDGPU/hard-clauses-img-gfx12.mir index 68f9e839012c3..9689dda9932ed 100644 --- a/llvm/test/CodeGen/AMDGPU/hard-clauses-img-gfx12.mir +++ b/llvm/test/CodeGen/AMDGPU/hard-clauses-img-gfx12.mir @@ -10,7 +10,7 @@ body: | ; CHECK-LABEL: name: mimg ; CHECK: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: BUNDLE implicit-def $vgpr10_vgpr11_vgpr12, implicit-def $vgpr20_vgpr21_vgpr22, implicit $vgpr3, implicit $vgpr4, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec :: (load (s128)) { + ; CHECK-NEXT: BUNDLE implicit-def $vgpr10_vgpr11_vgpr12, implicit-def $vgpr20_vgpr21_vgpr22, implicit $vgpr3, implicit $vgpr4, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec { ; CHECK-NEXT: S_CLAUSE 1 ; CHECK-NEXT: $vgpr10_vgpr11_vgpr12 = IMAGE_SAMPLE_LZ_V3_V2_gfx12 $vgpr3, $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128)) ; CHECK-NEXT: $vgpr20_vgpr21_vgpr22 = IMAGE_SAMPLE_LZ_V3_V2_gfx12 $vgpr3, $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 14, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128)) @@ -28,7 +28,7 @@ body: | ; CHECK-LABEL: name: 
mimg_mixed ; CHECK: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: BUNDLE implicit-def $vgpr10, implicit-def $vgpr14, implicit-def $vgpr20_vgpr21_vgpr22, implicit $vgpr3, implicit $vgpr4, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec, implicit $vgpr5, implicit $vgpr6 :: (load (s128)), (dereferenceable load (s128), addrspace 7) { + ; CHECK-NEXT: BUNDLE implicit-def $vgpr10, implicit-def $vgpr14, implicit-def $vgpr20_vgpr21_vgpr22, implicit $vgpr3, implicit $vgpr4, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec, implicit $vgpr5, implicit $vgpr6 { ; CHECK-NEXT: S_CLAUSE 2 ; CHECK-NEXT: $vgpr10 = IMAGE_SAMPLE_LZ_V1_V2_gfx12 $vgpr3, $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s128)) ; CHECK-NEXT: $vgpr14 = IMAGE_SAMPLE_LZ_V1_V2_gfx12 $vgpr5, $vgpr6, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), addrspace 7) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier-fastregalloc.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier-fastregalloc.ll index cbf697fafe683..4719ab9090fa5 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier-fastregalloc.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier-fastregalloc.ll @@ -1,20 +1,13 @@ -; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -O0 -stop-after=postrapseudos -o - < %s | FileCheck -enable-var-scope -check-prefix=MIR %s +; MIR-LABEL: name: gws_barrier_offset0{{$}} +; MIR: BUNDLE implicit{{( killed)?( renamable)?}} $vgpr0, implicit $m0, implicit $exec { +; MIR-NEXT: DS_GWS_BARRIER renamable $vgpr0, 0, implicit $m0, implicit $exec :: (load (s32) from custom "GWSResource") +; MIR-NEXT: S_WAITCNT 0 +; MIR-NEXT: } define amdgpu_kernel void @gws_barrier_offset0(i32 %val) #0 { - ; MIR-LABEL: name: gws_barrier_offset0 - ; MIR: bb.0 (%ir-block.0): - ; MIR-NEXT: liveins: $sgpr8_sgpr9 - ; MIR-NEXT: {{ $}} - ; MIR-NEXT: renamable $sgpr4 = S_LOAD_DWORD_IMM killed renamable $sgpr8_sgpr9, 0, 0 :: (dereferenceable invariant load (s32) from %ir.val.kernarg.offset, align 16, addrspace 4) - ; MIR-NEXT: $m0 = S_MOV_B32 0 - ; MIR-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr4, implicit $exec, implicit $exec - ; MIR-NEXT: BUNDLE implicit killed renamable $vgpr0, implicit $m0, implicit $exec :: (load (s32) from custom "GWSResource") { - ; MIR-NEXT: DS_GWS_BARRIER renamable $vgpr0, 0, implicit $m0, implicit $exec :: (load (s32) from custom "GWSResource") - ; MIR-NEXT: S_WAITCNT 0 - ; MIR-NEXT: } - ; MIR-NEXT: S_ENDPGM 0 call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 0) ret void } @@ -24,3 +17,5 @@ declare void @llvm.amdgcn.ds.gws.barrier(i32, i32) #1 attributes #0 = { nounwind } attributes #1 = { convergent inaccessiblememonly nounwind } +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; MIR: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier.ll index 417b8e08cf669..c5f6e2b0098ae 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier.ll @@ -35,7 +35,7 @@ ; LOOP-NEXT: s_cbranch_scc1 [[LOOP]] ; MIR-LABEL: name: gws_barrier_offset0{{$}} -; MIR: BUNDLE implicit{{( killed)?( renamable)?}} $vgpr0, implicit $m0, implicit $exec +; MIR: BUNDLE implicit{{( killed)?( renamable)?}} $vgpr0, implicit $m0, implicit $exec { ; MIR-NEXT: DS_GWS_BARRIER renamable $vgpr0, 0, implicit $m0, implicit $exec :: (load (s32) from custom "GWSResource") ; MIR-NEXT: S_WAITCNT 0 ; MIR-NEXT: } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll index af270e5adf75c..4419b8c6f9862 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll @@ -13,9 +13,9 @@ define amdgpu_kernel void @test_llvm_amdgcn_fdot2_bf16_bf16( ; SDAG-GFX11-TRUE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; SDAG-GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0 ; SDAG-GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; SDAG-GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[6:7] ; SDAG-GFX11-TRUE16-NEXT: s_load_b32 s2, s[2:3], 0x0 ; SDAG-GFX11-TRUE16-NEXT: s_load_b32 s3, s[4:5], 0x0 -; SDAG-GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[6:7] ; SDAG-GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SDAG-GFX11-TRUE16-NEXT: v_dot2_bf16_bf16 v0.l, s2, s3, v0.l ; SDAG-GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1] @@ -26,9 +26,9 @@ define amdgpu_kernel void @test_llvm_amdgcn_fdot2_bf16_bf16( ; SDAG-GFX11-FAKE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; SDAG-GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; SDAG-GFX11-FAKE16-NEXT: global_load_u16 v1, v0, s[6:7] ; SDAG-GFX11-FAKE16-NEXT: s_load_b32 s2, s[2:3], 0x0 ; SDAG-GFX11-FAKE16-NEXT: s_load_b32 s3, s[4:5], 0x0 -; SDAG-GFX11-FAKE16-NEXT: global_load_u16 v1, v0, s[6:7] ; SDAG-GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SDAG-GFX11-FAKE16-NEXT: v_dot2_bf16_bf16 v1, s2, s3, v1 ; SDAG-GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f16.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f16.f16.ll index 72b47693c69f8..0194d25a99cdc 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f16.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f16.f16.ll @@ -12,9 +12,9 @@ define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f16_f16( ; SDAG-GFX11-TRUE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; SDAG-GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0 ; SDAG-GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; SDAG-GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[6:7] ; SDAG-GFX11-TRUE16-NEXT: s_load_b32 s2, s[2:3], 0x0 ; SDAG-GFX11-TRUE16-NEXT: s_load_b32 s3, s[4:5], 0x0 -; SDAG-GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[6:7] ; SDAG-GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SDAG-GFX11-TRUE16-NEXT: v_dot2_f16_f16 v0.l, s2, s3, v0.l ; SDAG-GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1] @@ -25,9 +25,9 @@ define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f16_f16( ; SDAG-GFX11-FAKE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; SDAG-GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; SDAG-GFX11-FAKE16-NEXT: global_load_u16 v1, v0, s[6:7] ; 
SDAG-GFX11-FAKE16-NEXT: s_load_b32 s2, s[2:3], 0x0 ; SDAG-GFX11-FAKE16-NEXT: s_load_b32 s3, s[4:5], 0x0 -; SDAG-GFX11-FAKE16-NEXT: global_load_u16 v1, v0, s[6:7] ; SDAG-GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SDAG-GFX11-FAKE16-NEXT: v_dot2_f16_f16 v1, s2, s3, v1 ; SDAG-GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1] @@ -38,9 +38,9 @@ define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f16_f16( ; GISEL-GFX11-TRUE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GISEL-GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[6:7] ; GISEL-GFX11-TRUE16-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GISEL-GFX11-TRUE16-NEXT: s_load_b32 s3, s[4:5], 0x0 -; GISEL-GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[6:7] ; GISEL-GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GISEL-GFX11-TRUE16-NEXT: v_dot2_f16_f16 v0.l, s2, s3, v0.l ; GISEL-GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1] @@ -51,9 +51,9 @@ define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f16_f16( ; GISEL-GFX11-FAKE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GISEL-GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-GFX11-FAKE16-NEXT: global_load_u16 v1, v0, s[6:7] ; GISEL-GFX11-FAKE16-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GISEL-GFX11-FAKE16-NEXT: s_load_b32 s3, s[4:5], 0x0 -; GISEL-GFX11-FAKE16-NEXT: global_load_u16 v1, v0, s[6:7] ; GISEL-GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GISEL-GFX11-FAKE16-NEXT: v_dot2_f16_f16 v1, s2, s3, v1 ; GISEL-GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll index 1d08097452ce6..0c1448a0b8fb6 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll @@ -17,19 +17,21 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x64_f16__vgpr(ptr addrspace(1) % ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 ; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0 -; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44 -; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64 +; SDAG-NEXT: v_mov_b32_e32 v4, 0 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-NEXT: global_load_dwordx4 v[0:3], v0, s[6:7] +; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44 +; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64 ; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[2:3] ; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[0:1] +; SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[8:9] ; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[10:11] ; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[12:13] ; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[14:15] ; SDAG-NEXT: v_mov_b32_e32 v5, s16 -; SDAG-NEXT: v_mov_b32_e32 v4, 0 ; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: s_nop 0 ; SDAG-NEXT: v_smfmac_f32_16x16x64_f16 v[0:3], v[14:17], v[6:13], v5 cbsz:1 abid:2 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] @@ -41,12 +43,13 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x64_f16__vgpr(ptr addrspace(1) % ; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 ; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0 -; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44 -; GISEL-NEXT: s_load_dword s16, s[4:5], 0x64 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7] +; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44 +; GISEL-NEXT: s_load_dword s16, s[4:5], 0x64 ; GISEL-NEXT: 
v_mov_b64_e32 v[14:15], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[0:1] +; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] @@ -172,15 +175,16 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x32_f16__vgpr(ptr addrspace(1) % ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 ; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; SDAG-NEXT: v_lshlrev_b32_e32 v16, 6, v0 -; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44 -; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48 ; SDAG-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32 ; SDAG-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16 ; SDAG-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7] +; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44 +; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64 ; SDAG-NEXT: v_mov_b64_e32 v[28:29], s[2:3] ; SDAG-NEXT: v_mov_b64_e32 v[26:27], s[0:1] +; SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-NEXT: v_mov_b64_e32 v[24:25], s[14:15] ; SDAG-NEXT: v_mov_b64_e32 v[22:23], s[12:13] ; SDAG-NEXT: v_mov_b64_e32 v[20:21], s[10:11] @@ -203,15 +207,16 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x32_f16__vgpr(ptr addrspace(1) % ; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 ; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GISEL-NEXT: v_lshlrev_b32_e32 v16, 6, v0 -; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44 -; GISEL-NEXT: s_load_dword s16, s[4:5], 0x64 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7] ; GISEL-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16 ; GISEL-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32 ; GISEL-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48 +; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44 +; GISEL-NEXT: s_load_dword s16, s[4:5], 0x64 ; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[0:1] +; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[14:15] ; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[12:13] ; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[10:11] @@ -515,19 +520,21 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x64_bf16__vgpr(ptr addrspace(1) ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 ; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 4, v0 -; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44 -; GCN-NEXT: s_load_dword s16, s[4:5], 0x64 +; GCN-NEXT: v_mov_b32_e32 v4, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: global_load_dwordx4 v[0:3], v0, s[6:7] +; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44 +; GCN-NEXT: s_load_dword s16, s[4:5], 0x64 ; GCN-NEXT: v_mov_b64_e32 v[16:17], s[2:3] ; GCN-NEXT: v_mov_b64_e32 v[14:15], s[0:1] +; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b64_e32 v[6:7], s[8:9] ; GCN-NEXT: v_mov_b64_e32 v[8:9], s[10:11] ; GCN-NEXT: v_mov_b64_e32 v[10:11], s[12:13] ; GCN-NEXT: v_mov_b64_e32 v[12:13], s[14:15] ; GCN-NEXT: v_mov_b32_e32 v5, s16 -; GCN-NEXT: v_mov_b32_e32 v4, 0 ; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_nop 0 ; GCN-NEXT: v_smfmac_f32_16x16x64_bf16 v[0:3], v[14:17], v[6:13], v5 cbsz:1 abid:2 ; GCN-NEXT: s_nop 7 ; GCN-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] @@ -627,15 +634,16 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x32_bf16__vgpr(ptr addrspace(1) ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 ; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GCN-NEXT: v_lshlrev_b32_e32 v16, 6, v0 -; GCN-NEXT: 
s_load_dwordx8 s[8:15], s[4:5], 0x44 -; GCN-NEXT: s_load_dword s16, s[4:5], 0x64 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48 ; GCN-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32 ; GCN-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16 ; GCN-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7] +; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44 +; GCN-NEXT: s_load_dword s16, s[4:5], 0x64 ; GCN-NEXT: v_mov_b64_e32 v[28:29], s[2:3] ; GCN-NEXT: v_mov_b64_e32 v[26:27], s[0:1] +; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b64_e32 v[24:25], s[14:15] ; GCN-NEXT: v_mov_b64_e32 v[22:23], s[12:13] ; GCN-NEXT: v_mov_b64_e32 v[20:21], s[10:11] @@ -794,11 +802,11 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x128_i8__vgpr(ptr addrspace(1) % ; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 -; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64 -; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 +; SDAG-NEXT: v_mov_b32_e32 v16, 0 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7] -; SDAG-NEXT: v_mov_b32_e32 v16, 0 +; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64 +; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; SDAG-NEXT: v_mov_b32_e32 v12, s8 ; SDAG-NEXT: v_mov_b32_e32 v13, s9 ; SDAG-NEXT: v_mov_b32_e32 v14, s10 @@ -807,6 +815,7 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x128_i8__vgpr(ptr addrspace(1) % ; SDAG-NEXT: v_mov_b32_e32 v1, s13 ; SDAG-NEXT: v_mov_b32_e32 v2, s14 ; SDAG-NEXT: v_mov_b32_e32 v3, s15 +; SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-NEXT: v_mov_b32_e32 v4, s0 ; SDAG-NEXT: v_mov_b32_e32 v5, s1 ; SDAG-NEXT: v_mov_b32_e32 v6, s2 @@ -824,11 +833,12 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x128_i8__vgpr(ptr addrspace(1) % ; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1] ; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 ; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54 ; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1] ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[10:11] ; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[8:9] ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13] @@ -955,14 +965,15 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x64_i8__vgpr(ptr addrspace(1) %a ; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; SDAG-NEXT: v_lshlrev_b32_e32 v16, 6, v0 -; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 -; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64 -; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48 ; SDAG-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32 ; SDAG-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16 ; SDAG-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7] +; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 +; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64 +; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 +; SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-NEXT: v_mov_b32_e32 v24, s8 ; SDAG-NEXT: v_mov_b32_e32 v25, s9 ; SDAG-NEXT: v_mov_b32_e32 v26, s10 @@ -992,14 +1003,15 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x64_i8__vgpr(ptr addrspace(1) %a ; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GISEL-NEXT: 
v_and_b32_e32 v0, 0x3ff, v0 ; GISEL-NEXT: v_lshlrev_b32_e32 v16, 6, v0 -; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 -; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54 -; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1] ; GISEL-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:16 ; GISEL-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:32 ; GISEL-NEXT: global_load_dwordx4 v[12:15], v16, s[0:1] offset:48 +; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 +; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54 +; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64 +; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[10:11] ; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[8:9] ; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[18:19] @@ -1305,11 +1317,11 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_bf8_bf8__vgpr(ptr addrspace ; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 -; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64 -; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 +; SDAG-NEXT: v_mov_b32_e32 v16, 0 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7] -; SDAG-NEXT: v_mov_b32_e32 v16, 0 +; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64 +; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; SDAG-NEXT: v_mov_b32_e32 v12, s8 ; SDAG-NEXT: v_mov_b32_e32 v13, s9 ; SDAG-NEXT: v_mov_b32_e32 v14, s10 @@ -1318,6 +1330,7 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_bf8_bf8__vgpr(ptr addrspace ; SDAG-NEXT: v_mov_b32_e32 v1, s13 ; SDAG-NEXT: v_mov_b32_e32 v2, s14 ; SDAG-NEXT: v_mov_b32_e32 v3, s15 +; SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-NEXT: v_mov_b32_e32 v4, s0 ; SDAG-NEXT: v_mov_b32_e32 v5, s1 ; SDAG-NEXT: v_mov_b32_e32 v6, s2 @@ -1335,11 +1348,12 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_bf8_bf8__vgpr(ptr addrspace ; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1] ; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 ; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54 ; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1] ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[10:11] ; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[8:9] ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13] @@ -1467,11 +1481,11 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_bf8_fp8__vgpr(ptr addrspace ; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 -; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64 -; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 +; SDAG-NEXT: v_mov_b32_e32 v16, 0 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7] -; SDAG-NEXT: v_mov_b32_e32 v16, 0 +; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64 +; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; SDAG-NEXT: v_mov_b32_e32 v12, s8 ; SDAG-NEXT: v_mov_b32_e32 v13, s9 ; SDAG-NEXT: v_mov_b32_e32 v14, s10 @@ -1480,6 +1494,7 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_bf8_fp8__vgpr(ptr addrspace ; SDAG-NEXT: v_mov_b32_e32 v1, s13 ; SDAG-NEXT: v_mov_b32_e32 v2, s14 ; SDAG-NEXT: v_mov_b32_e32 v3, s15 +; SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-NEXT: v_mov_b32_e32 v4, s0 ; SDAG-NEXT: 
v_mov_b32_e32 v5, s1 ; SDAG-NEXT: v_mov_b32_e32 v6, s2 @@ -1497,11 +1512,12 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_bf8_fp8__vgpr(ptr addrspace ; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1] ; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 ; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54 ; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1] ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[10:11] ; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[8:9] ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13] @@ -1629,11 +1645,11 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_fp8_bf8__vgpr(ptr addrspace ; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 -; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64 -; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 +; SDAG-NEXT: v_mov_b32_e32 v16, 0 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7] -; SDAG-NEXT: v_mov_b32_e32 v16, 0 +; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64 +; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; SDAG-NEXT: v_mov_b32_e32 v12, s8 ; SDAG-NEXT: v_mov_b32_e32 v13, s9 ; SDAG-NEXT: v_mov_b32_e32 v14, s10 @@ -1642,6 +1658,7 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_fp8_bf8__vgpr(ptr addrspace ; SDAG-NEXT: v_mov_b32_e32 v1, s13 ; SDAG-NEXT: v_mov_b32_e32 v2, s14 ; SDAG-NEXT: v_mov_b32_e32 v3, s15 +; SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-NEXT: v_mov_b32_e32 v4, s0 ; SDAG-NEXT: v_mov_b32_e32 v5, s1 ; SDAG-NEXT: v_mov_b32_e32 v6, s2 @@ -1659,11 +1676,12 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_fp8_bf8__vgpr(ptr addrspace ; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1] ; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 ; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54 ; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1] ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[10:11] ; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[8:9] ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13] @@ -1791,11 +1809,11 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_fp8_fp8__vgpr(ptr addrspace ; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 -; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64 -; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 +; SDAG-NEXT: v_mov_b32_e32 v16, 0 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7] -; SDAG-NEXT: v_mov_b32_e32 v16, 0 +; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64 +; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; SDAG-NEXT: v_mov_b32_e32 v12, s8 ; SDAG-NEXT: v_mov_b32_e32 v13, s9 ; SDAG-NEXT: v_mov_b32_e32 v14, s10 @@ -1804,6 +1822,7 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_fp8_fp8__vgpr(ptr addrspace ; SDAG-NEXT: v_mov_b32_e32 v1, s13 ; SDAG-NEXT: v_mov_b32_e32 v2, s14 ; SDAG-NEXT: v_mov_b32_e32 v3, s15 +; SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-NEXT: v_mov_b32_e32 v4, s0 ; SDAG-NEXT: v_mov_b32_e32 v5, s1 ; SDAG-NEXT: v_mov_b32_e32 v6, s2 @@ -1821,11 +1840,12 
@@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_fp8_fp8__vgpr(ptr addrspace ; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1] ; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 ; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54 ; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1] ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[10:11] ; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[8:9] ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13] @@ -1952,14 +1972,15 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_bf8_bf8__vgpr(ptr addrspace( ; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; SDAG-NEXT: v_lshlrev_b32_e32 v16, 6, v0 -; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 -; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64 -; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48 ; SDAG-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32 ; SDAG-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16 ; SDAG-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7] +; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 +; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64 +; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 +; SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-NEXT: v_mov_b32_e32 v24, s8 ; SDAG-NEXT: v_mov_b32_e32 v25, s9 ; SDAG-NEXT: v_mov_b32_e32 v26, s10 @@ -1989,14 +2010,15 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_bf8_bf8__vgpr(ptr addrspace( ; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GISEL-NEXT: v_lshlrev_b32_e32 v16, 6, v0 -; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 -; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54 -; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1] ; GISEL-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:16 ; GISEL-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:32 ; GISEL-NEXT: global_load_dwordx4 v[12:15], v16, s[0:1] offset:48 +; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 +; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54 +; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64 +; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[10:11] ; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[8:9] ; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[18:19] @@ -2301,14 +2323,15 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_bf8_fp8__vgpr(ptr addrspace( ; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; SDAG-NEXT: v_lshlrev_b32_e32 v16, 6, v0 -; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 -; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64 -; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48 ; SDAG-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32 ; SDAG-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16 ; SDAG-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7] +; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 +; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64 +; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 +; SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-NEXT: v_mov_b32_e32 v24, s8 ; SDAG-NEXT: v_mov_b32_e32 v25, s9 ; 
SDAG-NEXT: v_mov_b32_e32 v26, s10 @@ -2338,14 +2361,15 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_bf8_fp8__vgpr(ptr addrspace( ; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GISEL-NEXT: v_lshlrev_b32_e32 v16, 6, v0 -; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 -; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54 -; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1] ; GISEL-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:16 ; GISEL-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:32 ; GISEL-NEXT: global_load_dwordx4 v[12:15], v16, s[0:1] offset:48 +; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 +; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54 +; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64 +; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[10:11] ; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[8:9] ; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[18:19] @@ -2650,14 +2674,15 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_fp8_bf8__vgpr(ptr addrspace( ; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; SDAG-NEXT: v_lshlrev_b32_e32 v16, 6, v0 -; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 -; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64 -; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48 ; SDAG-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32 ; SDAG-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16 ; SDAG-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7] +; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 +; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64 +; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 +; SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-NEXT: v_mov_b32_e32 v24, s8 ; SDAG-NEXT: v_mov_b32_e32 v25, s9 ; SDAG-NEXT: v_mov_b32_e32 v26, s10 @@ -2687,14 +2712,15 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_fp8_bf8__vgpr(ptr addrspace( ; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GISEL-NEXT: v_lshlrev_b32_e32 v16, 6, v0 -; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 -; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54 -; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1] ; GISEL-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:16 ; GISEL-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:32 ; GISEL-NEXT: global_load_dwordx4 v[12:15], v16, s[0:1] offset:48 +; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 +; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54 +; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64 +; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[10:11] ; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[8:9] ; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[18:19] @@ -2999,14 +3025,15 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_fp8_fp8__vgpr(ptr addrspace( ; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; SDAG-NEXT: v_lshlrev_b32_e32 v16, 6, v0 -; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 -; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64 -; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48 ; SDAG-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32 ; SDAG-NEXT: 
global_load_dwordx4 v[4:7], v16, s[6:7] offset:16 ; SDAG-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7] +; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 +; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64 +; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 +; SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-NEXT: v_mov_b32_e32 v24, s8 ; SDAG-NEXT: v_mov_b32_e32 v25, s9 ; SDAG-NEXT: v_mov_b32_e32 v26, s10 @@ -3036,14 +3063,15 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_fp8_fp8__vgpr(ptr addrspace( ; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GISEL-NEXT: v_lshlrev_b32_e32 v16, 6, v0 -; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 -; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54 -; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1] ; GISEL-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:16 ; GISEL-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:32 ; GISEL-NEXT: global_load_dwordx4 v[12:15], v16, s[0:1] offset:48 +; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 +; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54 +; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64 +; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[10:11] ; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[8:9] ; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[18:19] diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll index 83c240c17ff1c..f93e5f06beff9 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll @@ -10386,8 +10386,7 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX8-NEXT: s_add_u32 s2, s0, 0x150 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: flat_store_dwordx4 v[44:45], v[12:15] -; GFX8-NEXT: flat_store_dwordx4 v[48:49], v[8:11] -; GFX8-NEXT: flat_store_dwordx4 v[50:51], v[16:19] +; GFX8-NEXT: flat_store_dwordx4 v[42:43], v[4:7] ; GFX8-NEXT: v_mov_b32_e32 v13, s3 ; GFX8-NEXT: v_mov_b32_e32 v12, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 0x140 @@ -10396,6 +10395,10 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX8-NEXT: v_mov_b32_e32 v14, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 0x130 ; GFX8-NEXT: s_addc_u32 s3, s1, 0 +; GFX8-NEXT: flat_store_dwordx4 v[46:47], v[0:3] +; GFX8-NEXT: flat_store_dwordx4 v[48:49], v[8:11] +; GFX8-NEXT: flat_store_dwordx4 v[50:51], v[16:19] +; GFX8-NEXT: v_mov_b32_e32 v4, s6 ; GFX8-NEXT: v_mov_b32_e32 v17, s3 ; GFX8-NEXT: v_mov_b32_e32 v16, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 0x120 @@ -10403,21 +10406,20 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX8-NEXT: v_mov_b32_e32 v19, s3 ; GFX8-NEXT: v_mov_b32_e32 v18, s2 ; GFX8-NEXT: s_add_u32 s2, s0, 0x110 -; GFX8-NEXT: flat_store_dwordx4 v[42:43], v[4:7] -; GFX8-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v4, s6 ; GFX8-NEXT: v_mov_b32_e32 v5, s7 +; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v42, vcc_lo ; GFX8-NEXT: v_mov_b32_e32 v43, vcc_hi ; GFX8-NEXT: v_mov_b32_e32 v6, s4 ; GFX8-NEXT: v_mov_b32_e32 v7, s5 -; GFX8-NEXT: flat_store_dwordx4 v[46:47], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v8, s12 ; GFX8-NEXT: v_mov_b32_e32 v0, s8 ; GFX8-NEXT: v_mov_b32_e32 v1, s9 +; GFX8-NEXT: v_mov_b32_e32 v8, s12 +; GFX8-NEXT: flat_store_dwordx4 v[52:53], v[20:23] ; GFX8-NEXT: v_mov_b32_e32 v2, s10 ; GFX8-NEXT: v_mov_b32_e32 v3, s11 ; GFX8-NEXT: v_mov_b32_e32 v9, s13 +; GFX8-NEXT: 
flat_store_dwordx4 v[54:55], v[24:27] ; GFX8-NEXT: v_mov_b32_e32 v10, s14 ; GFX8-NEXT: v_mov_b32_e32 v11, s15 ; GFX8-NEXT: flat_store_dwordx4 v[56:57], v[28:31] @@ -10586,8 +10588,6 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX8-NEXT: v_mov_b32_e32 v2, s2 ; GFX8-NEXT: v_mov_b32_e32 v3, s3 ; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: flat_store_dwordx4 v[52:53], v[20:23] -; GFX8-NEXT: flat_store_dwordx4 v[54:55], v[24:27] ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll index 59f4a9d44bbdd..bca39d06e941c 100644 --- a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll @@ -4582,18 +4582,18 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(ptr addrspace(1) %ou ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x90 +; GCN-HSA-NEXT: flat_store_dwordx4 v[32:33], v[12:15] +; GCN-HSA-NEXT: flat_store_dwordx4 v[38:39], v[20:23] ; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[4:7] ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 -; GCN-HSA-NEXT: flat_store_dwordx4 v[32:33], v[12:15] -; GCN-HSA-NEXT: flat_store_dwordx4 v[38:39], v[20:23] -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v23, 16, v3 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v21, 16, v2 ; GCN-HSA-NEXT: v_bfe_i32 v22, v3, 0, 16 ; GCN-HSA-NEXT: v_bfe_i32 v20, v2, 0, 16 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[20:23] ; GCN-HSA-NEXT: s_waitcnt vmcnt(11) ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 16, v9 diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i32.ll b/llvm/test/CodeGen/AMDGPU/load-global-i32.ll index 7203545ebf9a8..e55fb2cac0985 100644 --- a/llvm/test/CodeGen/AMDGPU/load-global-i32.ll +++ b/llvm/test/CodeGen/AMDGPU/load-global-i32.ll @@ -3313,12 +3313,12 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou ; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x90 ; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCNX3-HSA-NEXT: flat_store_dwordx4 v[32:33], v[24:27] +; GCNX3-HSA-NEXT: flat_store_dwordx4 v[38:39], v[20:23] ; GCNX3-HSA-NEXT: flat_store_dwordx4 v[4:5], v[12:15] ; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x60 -; GCNX3-HSA-NEXT: flat_store_dwordx4 v[32:33], v[24:27] -; GCNX3-HSA-NEXT: flat_store_dwordx4 v[38:39], v[20:23] ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v26, 31, v7 ; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v24, 31, v6 ; GCNX3-HSA-NEXT: v_mov_b32_e32 v23, v6 @@ -3726,6 +3726,7 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou ; GCN-GFX900-HSA-NEXT: s_nop 0 ; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v12, v[37:40], s[0:1] offset:224 ; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v12, v[33:36], s[0:1] offset:240 +; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:192 ; GCN-GFX900-HSA-NEXT: buffer_load_dword v33, off, s[20:23], 0 ; 4-byte Folded Reload ; GCN-GFX900-HSA-NEXT: s_nop 0 ; GCN-GFX900-HSA-NEXT: buffer_load_dword v34, off, s[20:23], 0 offset:4 ; 4-byte Folded Reload @@ -3739,7 +3740,7 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou ; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v43, v26 ; 
GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v29, v27 ; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v31, v28 -; GCN-GFX900-HSA-NEXT: s_waitcnt vmcnt(11) +; GCN-GFX900-HSA-NEXT: s_waitcnt vmcnt(12) ; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v60, 31, v3 ; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v58, 31, v2 ; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v28, 31, v1 @@ -3748,7 +3749,6 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou ; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v27, v1 ; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v57, v2 ; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v59, v3 -; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:192 ; GCN-GFX900-HSA-NEXT: s_waitcnt vmcnt(7) ; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v3, 31, v24 ; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v23 @@ -3758,7 +3758,7 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou ; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v21 ; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v8, v21 ; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v10, v22 -; GCN-GFX900-HSA-NEXT: s_waitcnt vmcnt(1) +; GCN-GFX900-HSA-NEXT: s_waitcnt vmcnt(0) ; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v12, v[33:36], s[0:1] offset:208 ; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v12, v[41:44], s[0:1] offset:160 ; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v12, v[29:32], s[0:1] offset:176 diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i8.ll b/llvm/test/CodeGen/AMDGPU/load-global-i8.ll index cb17f01853221..f879dc660203f 100644 --- a/llvm/test/CodeGen/AMDGPU/load-global-i8.ll +++ b/llvm/test/CodeGen/AMDGPU/load-global-i8.ll @@ -7788,18 +7788,19 @@ define amdgpu_kernel void @global_zextload_v32i8_to_v32i64(ptr addrspace(1) %out ; GCN-NOHSA-VI-NEXT: v_bfe_u32 v29, v13, 16, 8 ; GCN-NOHSA-VI-NEXT: v_bfe_u32 v35, v14, 8, 8 ; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v33, 0xff, v14 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v11, v53 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v13, v53 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[36:39], off, s[0:3], 0 offset:144 ; GCN-NOHSA-VI-NEXT: v_bfe_u32 v16, v17, 8, 8 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v36, v53 ; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v14, 0xff, v17 ; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v54, 24, v17 ; GCN-NOHSA-VI-NEXT: v_bfe_u32 v52, v17, 16, 8 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[29:32], off, s[0:3], 0 offset:112 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:48 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v11, v53 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v13, v53 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v15, v53 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v17, v53 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[36:39], off, s[0:3], 0 offset:144 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[29:32], off, s[0:3], 0 offset:112 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v36, v53 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:48 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:224 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v29, v53 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v10, v53 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[33:36], off, s[0:3], 0 offset:128 @@ -7809,7 +7810,7 @@ define amdgpu_kernel void @global_zextload_v32i8_to_v32i64(ptr addrspace(1) %out ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[18:21], off, s[0:3], 0 offset:64 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[7:10], off, s[0:3], 0 offset:32 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[3:6], off, s[0:3], 0 offset:16 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:224 +; GCN-NOHSA-VI-NEXT: 
s_nop 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, v53 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-NOHSA-VI-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/load-local-i16.ll b/llvm/test/CodeGen/AMDGPU/load-local-i16.ll index 062a985dd7180..bd191a37582c0 100644 --- a/llvm/test/CodeGen/AMDGPU/load-local-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-local-i16.ll @@ -3172,25 +3172,27 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out ; VI-NO-DS128-LABEL: local_zextload_v64i16_to_v64i32: ; VI-NO-DS128: ; %bb.0: ; VI-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; VI-NO-DS128-NEXT: s_mov_b32 m0, -1 ; VI-NO-DS128-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; VI-NO-DS128-NEXT: s_mov_b32 m0, -1 ; VI-NO-DS128-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 ; VI-NO-DS128-NEXT: s_mov_b32 s90, -1 ; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) ; VI-NO-DS128-NEXT: v_mov_b32_e32 v16, s1 ; VI-NO-DS128-NEXT: ds_read2_b64 v[10:13], v16 offset1:1 ; VI-NO-DS128-NEXT: ds_read2_b64 v[17:20], v16 offset0:2 offset1:3 -; VI-NO-DS128-NEXT: ds_read2_b64 v[21:24], v16 offset0:4 offset1:5 ; VI-NO-DS128-NEXT: s_mov_b32 s91, 0xe80000 ; VI-NO-DS128-NEXT: s_add_u32 s88, s88, s11 -; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(2) +; VI-NO-DS128-NEXT: s_addc_u32 s89, s89, 0 +; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v11 +; VI-NO-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v11 +; VI-NO-DS128-NEXT: buffer_store_dword v0, off, s[88:91], 0 ; 4-byte Folded Spill +; VI-NO-DS128-NEXT: buffer_store_dword v1, off, s[88:91], 0 offset:4 ; 4-byte Folded Spill ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v10 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v13 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v12 -; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) +; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v18 -; VI-NO-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v11 ; VI-NO-DS128-NEXT: v_and_b32_e32 v2, 0xffff, v10 ; VI-NO-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v13 ; VI-NO-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v12 @@ -3198,6 +3200,7 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v20 ; VI-NO-DS128-NEXT: v_and_b32_e32 v8, 0xffff, v18 ; VI-NO-DS128-NEXT: v_and_b32_e32 v10, 0xffff, v17 +; VI-NO-DS128-NEXT: ds_read2_b64 v[21:24], v16 offset0:4 offset1:5 ; VI-NO-DS128-NEXT: v_and_b32_e32 v12, 0xffff, v20 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v15, 16, v19 ; VI-NO-DS128-NEXT: v_and_b32_e32 v14, 0xffff, v19 @@ -3240,19 +3243,17 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v56, 16, v19 ; VI-NO-DS128-NEXT: v_and_b32_e32 v55, 0xffff, v19 ; VI-NO-DS128-NEXT: ds_read2_b64 v[16:19], v16 offset0:14 offset1:15 -; VI-NO-DS128-NEXT: s_addc_u32 s89, s89, 0 -; VI-NO-DS128-NEXT: buffer_store_dword v0, off, s[88:91], 0 ; 4-byte Folded Spill -; VI-NO-DS128-NEXT: buffer_store_dword v1, off, s[88:91], 0 offset:4 ; 4-byte Folded Spill ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v54, 16, v20 ; VI-NO-DS128-NEXT: v_and_b32_e32 v53, 0xffff, v20 +; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) +; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v58, 16, v22 +; VI-NO-DS128-NEXT: v_and_b32_e32 v57, 0xffff, v22 ; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v20, 16, v19 ; VI-NO-DS128-NEXT: v_and_b32_e32 v19, 0xffff, v19 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v18 ; VI-NO-DS128-NEXT: v_and_b32_e32 v0, 
0xffff, v18 ; VI-NO-DS128-NEXT: v_mov_b32_e32 v18, s0 -; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v58, 16, v22 -; VI-NO-DS128-NEXT: v_and_b32_e32 v57, 0xffff, v22 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v22, 16, v21 ; VI-NO-DS128-NEXT: v_and_b32_e32 v21, 0xffff, v21 ; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v60, 16, v24 @@ -3295,17 +3296,21 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out ; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v56, s1 ; GFX9-NO-DS128-NEXT: ds_read2_b64 v[10:13], v56 offset1:1 ; GFX9-NO-DS128-NEXT: ds_read2_b64 v[14:17], v56 offset0:2 offset1:3 -; GFX9-NO-DS128-NEXT: ds_read2_b64 v[18:21], v56 offset0:4 offset1:5 -; GFX9-NO-DS128-NEXT: ds_read2_b64 v[22:25], v56 offset0:6 offset1:7 ; GFX9-NO-DS128-NEXT: s_add_u32 s12, s12, s11 -; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(3) +; GFX9-NO-DS128-NEXT: s_addc_u32 s13, s13, 0 +; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v11 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v11 +; GFX9-NO-DS128-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill +; GFX9-NO-DS128-NEXT: s_nop 0 +; GFX9-NO-DS128-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill +; GFX9-NO-DS128-NEXT: ds_read2_b64 v[18:21], v56 offset0:4 offset1:5 +; GFX9-NO-DS128-NEXT: ds_read2_b64 v[22:25], v56 offset0:6 offset1:7 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v10 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v13 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v12 ; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(2) ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v15 -; GFX9-NO-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v11 ; GFX9-NO-DS128-NEXT: v_and_b32_e32 v2, 0xffff, v10 ; GFX9-NO-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v13 ; GFX9-NO-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v12 @@ -3332,11 +3337,9 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out ; GFX9-NO-DS128-NEXT: v_and_b32_e32 v36, 0xffff, v22 ; GFX9-NO-DS128-NEXT: ds_read2_b64 v[16:19], v56 offset0:8 offset1:9 ; GFX9-NO-DS128-NEXT: ds_read2_b64 v[20:23], v56 offset0:10 offset1:11 -; GFX9-NO-DS128-NEXT: s_addc_u32 s13, s13, 0 -; GFX9-NO-DS128-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill -; GFX9-NO-DS128-NEXT: s_nop 0 -; GFX9-NO-DS128-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill ; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v39, 16, v25 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v38, 0xffff, v25 ; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v41, 16, v17 ; GFX9-NO-DS128-NEXT: v_and_b32_e32 v40, 0xffff, v17 @@ -3357,17 +3360,16 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v55, 16, v22 ; GFX9-NO-DS128-NEXT: v_and_b32_e32 v54, 0xffff, v22 ; GFX9-NO-DS128-NEXT: ds_read2_b64 v[20:23], v56 offset0:14 offset1:15 -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v39, 16, v25 -; GFX9-NO-DS128-NEXT: v_and_b32_e32 v38, 0xffff, v25 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v25, 16, v24 ; GFX9-NO-DS128-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) +; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v57, 16, v17 +; GFX9-NO-DS128-NEXT: v_and_b32_e32 v56, 0xffff, v17 ; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v63, 16, v23 ; GFX9-NO-DS128-NEXT: v_and_b32_e32 v62, 0xffff, v23 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v23, 16, v22 ; GFX9-NO-DS128-NEXT: 
v_and_b32_e32 v22, 0xffff, v22 -; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v57, 16, v17 -; GFX9-NO-DS128-NEXT: v_and_b32_e32 v56, 0xffff, v17 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v17, 16, v16 ; GFX9-NO-DS128-NEXT: v_and_b32_e32 v16, 0xffff, v16 ; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v59, 16, v19 @@ -3804,11 +3806,9 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out ; VI-DS128-NEXT: ds_read_b128 v[16:19], v0 offset:16 ; VI-DS128-NEXT: s_mov_b32 s91, 0xe80000 ; VI-DS128-NEXT: s_add_u32 s88, s88, s11 -; VI-DS128-NEXT: ds_read_b128 v[20:23], v0 offset:32 -; VI-DS128-NEXT: ds_read_b128 v[24:27], v0 offset:48 -; VI-DS128-NEXT: s_waitcnt lgkmcnt(3) -; VI-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v11 ; VI-DS128-NEXT: s_addc_u32 s89, s89, 0 +; VI-DS128-NEXT: s_waitcnt lgkmcnt(1) +; VI-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v11 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v10 ; VI-DS128-NEXT: v_mov_b32_e32 v4, v3 ; VI-DS128-NEXT: v_and_b32_e32 v3, 0xffff, v11 @@ -3825,16 +3825,23 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out ; VI-DS128-NEXT: buffer_store_dword v5, off, s[88:91], 0 offset:20 ; 4-byte Folded Spill ; VI-DS128-NEXT: buffer_store_dword v6, off, s[88:91], 0 offset:24 ; 4-byte Folded Spill ; VI-DS128-NEXT: buffer_store_dword v7, off, s[88:91], 0 offset:28 ; 4-byte Folded Spill -; VI-DS128-NEXT: s_waitcnt lgkmcnt(2) +; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) ; VI-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v19 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v18 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v15, 16, v17 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v16 ; VI-DS128-NEXT: v_and_b32_e32 v3, 0xffff, v19 ; VI-DS128-NEXT: v_and_b32_e32 v1, 0xffff, v18 +; VI-DS128-NEXT: ds_read_b128 v[20:23], v0 offset:32 +; VI-DS128-NEXT: buffer_store_dword v1, off, s[88:91], 0 offset:32 ; 4-byte Folded Spill +; VI-DS128-NEXT: buffer_store_dword v2, off, s[88:91], 0 offset:36 ; 4-byte Folded Spill +; VI-DS128-NEXT: buffer_store_dword v3, off, s[88:91], 0 offset:40 ; 4-byte Folded Spill +; VI-DS128-NEXT: buffer_store_dword v4, off, s[88:91], 0 offset:44 ; 4-byte Folded Spill +; VI-DS128-NEXT: ds_read_b128 v[24:27], v0 offset:48 +; VI-DS128-NEXT: ds_read_b128 v[36:39], v0 offset:64 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v15, 16, v17 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v16 ; VI-DS128-NEXT: v_and_b32_e32 v14, 0xffff, v17 ; VI-DS128-NEXT: v_and_b32_e32 v12, 0xffff, v16 -; VI-DS128-NEXT: s_waitcnt lgkmcnt(1) +; VI-DS128-NEXT: s_waitcnt lgkmcnt(2) ; VI-DS128-NEXT: v_lshrrev_b32_e32 v19, 16, v23 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v17, 16, v22 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v31, 16, v21 @@ -3843,25 +3850,21 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out ; VI-DS128-NEXT: v_and_b32_e32 v16, 0xffff, v22 ; VI-DS128-NEXT: v_and_b32_e32 v30, 0xffff, v21 ; VI-DS128-NEXT: v_and_b32_e32 v28, 0xffff, v20 -; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) +; VI-DS128-NEXT: s_waitcnt lgkmcnt(1) ; VI-DS128-NEXT: v_lshrrev_b32_e32 v23, 16, v27 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v21, 16, v26 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v35, 16, v25 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v33, 16, v24 ; VI-DS128-NEXT: v_and_b32_e32 v22, 0xffff, v27 -; VI-DS128-NEXT: ds_read_b128 v[36:39], v0 offset:64 ; VI-DS128-NEXT: v_and_b32_e32 v20, 0xffff, v26 ; VI-DS128-NEXT: v_and_b32_e32 v34, 0xffff, v25 ; VI-DS128-NEXT: v_and_b32_e32 v32, 0xffff, v24 ; VI-DS128-NEXT: ds_read_b128 v[24:27], v0 offset:80 ; VI-DS128-NEXT: ds_read_b128 v[55:58], v0 offset:96 -; VI-DS128-NEXT: 
buffer_store_dword v1, off, s[88:91], 0 offset:32 ; 4-byte Folded Spill -; VI-DS128-NEXT: buffer_store_dword v2, off, s[88:91], 0 offset:36 ; 4-byte Folded Spill -; VI-DS128-NEXT: buffer_store_dword v3, off, s[88:91], 0 offset:40 ; 4-byte Folded Spill -; VI-DS128-NEXT: buffer_store_dword v4, off, s[88:91], 0 offset:44 ; 4-byte Folded Spill ; VI-DS128-NEXT: s_waitcnt lgkmcnt(2) ; VI-DS128-NEXT: v_lshrrev_b32_e32 v42, 16, v39 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v40, 16, v38 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v46, 16, v37 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(1) ; VI-DS128-NEXT: v_lshrrev_b32_e32 v50, 16, v27 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v48, 16, v26 @@ -3872,17 +3875,16 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out ; VI-DS128-NEXT: v_and_b32_e32 v53, 0xffff, v25 ; VI-DS128-NEXT: v_and_b32_e32 v51, 0xffff, v24 ; VI-DS128-NEXT: ds_read_b128 v[24:27], v0 offset:112 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v46, 16, v37 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v44, 16, v36 ; VI-DS128-NEXT: v_and_b32_e32 v41, 0xffff, v39 ; VI-DS128-NEXT: v_and_b32_e32 v39, 0xffff, v38 +; VI-DS128-NEXT: v_and_b32_e32 v45, 0xffff, v37 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) ; VI-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v25 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v24 ; VI-DS128-NEXT: v_and_b32_e32 v2, 0xffff, v25 ; VI-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v24 ; VI-DS128-NEXT: v_mov_b32_e32 v24, s0 -; VI-DS128-NEXT: v_and_b32_e32 v45, 0xffff, v37 ; VI-DS128-NEXT: v_and_b32_e32 v43, 0xffff, v36 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v61, 16, v58 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v59, 16, v57 @@ -3941,11 +3943,9 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out ; GFX9-DS128-NEXT: ds_read_b128 v[8:11], v0 ; GFX9-DS128-NEXT: ds_read_b128 v[16:19], v0 offset:16 ; GFX9-DS128-NEXT: s_add_u32 s12, s12, s11 -; GFX9-DS128-NEXT: ds_read_b128 v[20:23], v0 offset:32 -; GFX9-DS128-NEXT: ds_read_b128 v[24:27], v0 offset:48 -; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(3) -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v11 ; GFX9-DS128-NEXT: s_addc_u32 s13, s13, 0 +; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(1) +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v11 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v10 ; GFX9-DS128-NEXT: v_mov_b32_e32 v4, v3 ; GFX9-DS128-NEXT: v_and_b32_e32 v3, 0xffff, v11 @@ -3964,16 +3964,24 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out ; GFX9-DS128-NEXT: buffer_store_dword v5, off, s[12:15], 0 offset:20 ; 4-byte Folded Spill ; GFX9-DS128-NEXT: buffer_store_dword v6, off, s[12:15], 0 offset:24 ; 4-byte Folded Spill ; GFX9-DS128-NEXT: buffer_store_dword v7, off, s[12:15], 0 offset:28 ; 4-byte Folded Spill -; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(2) +; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v19 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v18 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v15, 16, v17 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v16 ; GFX9-DS128-NEXT: v_and_b32_e32 v3, 0xffff, v19 ; GFX9-DS128-NEXT: v_and_b32_e32 v1, 0xffff, v18 +; GFX9-DS128-NEXT: ds_read_b128 v[20:23], v0 offset:32 +; GFX9-DS128-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:32 ; 4-byte Folded Spill +; GFX9-DS128-NEXT: s_nop 0 +; GFX9-DS128-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:36 ; 4-byte Folded Spill +; GFX9-DS128-NEXT: buffer_store_dword v3, off, s[12:15], 0 offset:40 ; 4-byte Folded Spill +; GFX9-DS128-NEXT: buffer_store_dword v4, off, s[12:15], 0 offset:44 ; 4-byte Folded Spill +; 
GFX9-DS128-NEXT: ds_read_b128 v[24:27], v0 offset:48 +; GFX9-DS128-NEXT: ds_read_b128 v[36:39], v0 offset:64 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v15, 16, v17 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v16 ; GFX9-DS128-NEXT: v_and_b32_e32 v14, 0xffff, v17 ; GFX9-DS128-NEXT: v_and_b32_e32 v12, 0xffff, v16 -; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(1) +; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(2) ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v19, 16, v23 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v17, 16, v22 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v31, 16, v21 @@ -3982,26 +3990,21 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out ; GFX9-DS128-NEXT: v_and_b32_e32 v16, 0xffff, v22 ; GFX9-DS128-NEXT: v_and_b32_e32 v30, 0xffff, v21 ; GFX9-DS128-NEXT: v_and_b32_e32 v28, 0xffff, v20 -; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(1) ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v23, 16, v27 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v21, 16, v26 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v35, 16, v25 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v33, 16, v24 ; GFX9-DS128-NEXT: v_and_b32_e32 v22, 0xffff, v27 -; GFX9-DS128-NEXT: ds_read_b128 v[36:39], v0 offset:64 ; GFX9-DS128-NEXT: v_and_b32_e32 v20, 0xffff, v26 ; GFX9-DS128-NEXT: v_and_b32_e32 v34, 0xffff, v25 ; GFX9-DS128-NEXT: v_and_b32_e32 v32, 0xffff, v24 ; GFX9-DS128-NEXT: ds_read_b128 v[24:27], v0 offset:80 ; GFX9-DS128-NEXT: ds_read_b128 v[55:58], v0 offset:96 -; GFX9-DS128-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:32 ; 4-byte Folded Spill -; GFX9-DS128-NEXT: s_nop 0 -; GFX9-DS128-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:36 ; 4-byte Folded Spill -; GFX9-DS128-NEXT: buffer_store_dword v3, off, s[12:15], 0 offset:40 ; 4-byte Folded Spill -; GFX9-DS128-NEXT: buffer_store_dword v4, off, s[12:15], 0 offset:44 ; 4-byte Folded Spill ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(2) ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v42, 16, v39 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v40, 16, v38 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v46, 16, v37 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(1) ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v50, 16, v27 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v48, 16, v26 @@ -4012,17 +4015,16 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out ; GFX9-DS128-NEXT: v_and_b32_e32 v53, 0xffff, v25 ; GFX9-DS128-NEXT: v_and_b32_e32 v51, 0xffff, v24 ; GFX9-DS128-NEXT: ds_read_b128 v[24:27], v0 offset:112 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v46, 16, v37 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v44, 16, v36 ; GFX9-DS128-NEXT: v_and_b32_e32 v41, 0xffff, v39 ; GFX9-DS128-NEXT: v_and_b32_e32 v39, 0xffff, v38 +; GFX9-DS128-NEXT: v_and_b32_e32 v45, 0xffff, v37 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v25 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v24 ; GFX9-DS128-NEXT: v_and_b32_e32 v2, 0xffff, v25 ; GFX9-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v24 ; GFX9-DS128-NEXT: v_mov_b32_e32 v24, s0 -; GFX9-DS128-NEXT: v_and_b32_e32 v45, 0xffff, v37 ; GFX9-DS128-NEXT: v_and_b32_e32 v43, 0xffff, v36 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v61, 16, v58 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v59, 16, v57 @@ -4195,20 +4197,29 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out ; VI-NO-DS128-LABEL: local_sextload_v64i16_to_v64i32: ; VI-NO-DS128: ; %bb.0: ; VI-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; VI-NO-DS128-NEXT: s_mov_b32 m0, -1 ; VI-NO-DS128-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; VI-NO-DS128-NEXT: s_mov_b32 m0, -1 ; VI-NO-DS128-NEXT: 
s_mov_b32 s89, SCRATCH_RSRC_DWORD1 ; VI-NO-DS128-NEXT: s_mov_b32 s90, -1 ; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) ; VI-NO-DS128-NEXT: v_mov_b32_e32 v28, s1 -; VI-NO-DS128-NEXT: ds_read2_b64 v[20:23], v28 offset0:4 offset1:5 -; VI-NO-DS128-NEXT: ds_read2_b64 v[29:32], v28 offset0:6 offset1:7 -; VI-NO-DS128-NEXT: ds_read2_b64 v[33:36], v28 offset0:8 offset1:9 ; VI-NO-DS128-NEXT: ds_read2_b64 v[10:13], v28 offset1:1 ; VI-NO-DS128-NEXT: ds_read2_b64 v[14:17], v28 offset0:2 offset1:3 ; VI-NO-DS128-NEXT: s_mov_b32 s91, 0xe80000 ; VI-NO-DS128-NEXT: s_add_u32 s88, s88, s11 -; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(3) +; VI-NO-DS128-NEXT: s_addc_u32 s89, s89, 0 +; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v11 +; VI-NO-DS128-NEXT: v_bfe_i32 v0, v11, 0, 16 +; VI-NO-DS128-NEXT: buffer_store_dword v0, off, s[88:91], 0 ; 4-byte Folded Spill +; VI-NO-DS128-NEXT: buffer_store_dword v1, off, s[88:91], 0 offset:4 ; 4-byte Folded Spill +; VI-NO-DS128-NEXT: ds_read2_b64 v[20:23], v28 offset0:4 offset1:5 +; VI-NO-DS128-NEXT: ds_read2_b64 v[29:32], v28 offset0:6 offset1:7 +; VI-NO-DS128-NEXT: ds_read2_b64 v[33:36], v28 offset0:8 offset1:9 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v10 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v5, 16, v13 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v7, 16, v12 +; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v25, 16, v30 ; VI-NO-DS128-NEXT: v_bfe_i32 v24, v30, 0, 16 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v27, 16, v29 @@ -4218,7 +4229,7 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v40, 16, v31 ; VI-NO-DS128-NEXT: v_bfe_i32 v39, v31, 0, 16 ; VI-NO-DS128-NEXT: ds_read2_b64 v[29:32], v28 offset0:10 offset1:11 -; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(3) +; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v42, 16, v34 ; VI-NO-DS128-NEXT: v_bfe_i32 v41, v34, 0, 16 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v44, 16, v33 @@ -4236,24 +4247,16 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v56, 16, v31 ; VI-NO-DS128-NEXT: v_bfe_i32 v55, v31, 0, 16 ; VI-NO-DS128-NEXT: ds_read2_b64 v[28:31], v28 offset0:14 offset1:15 -; VI-NO-DS128-NEXT: s_addc_u32 s89, s89, 0 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v11 -; VI-NO-DS128-NEXT: v_bfe_i32 v0, v11, 0, 16 -; VI-NO-DS128-NEXT: buffer_store_dword v0, off, s[88:91], 0 ; 4-byte Folded Spill -; VI-NO-DS128-NEXT: buffer_store_dword v1, off, s[88:91], 0 offset:4 ; 4-byte Folded Spill ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v54, 16, v32 ; VI-NO-DS128-NEXT: v_bfe_i32 v53, v32, 0, 16 +; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v15 +; VI-NO-DS128-NEXT: v_bfe_i32 v2, v10, 0, 16 ; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v32, 16, v31 ; VI-NO-DS128-NEXT: v_bfe_i32 v31, v31, 0, 16 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v30 ; VI-NO-DS128-NEXT: v_bfe_i32 v0, v30, 0, 16 ; VI-NO-DS128-NEXT: v_mov_b32_e32 v30, s0 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v10 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v5, 16, v13 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v7, 16, v12 -; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v15 -; VI-NO-DS128-NEXT: v_bfe_i32 v2, v10, 0, 16 ; VI-NO-DS128-NEXT: v_bfe_i32 v4, v13, 0, 16 ; VI-NO-DS128-NEXT: v_bfe_i32 v6, v12, 0, 16 ; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v11, 16, v14 @@ -4313,14 +4316,23 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr 
addrspace(3) %out ; GFX9-NO-DS128-NEXT: s_mov_b32 s15, 0xe00000 ; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v28, s1 -; GFX9-NO-DS128-NEXT: ds_read2_b64 v[20:23], v28 offset0:4 offset1:5 -; GFX9-NO-DS128-NEXT: ds_read2_b64 v[29:32], v28 offset0:6 offset1:7 -; GFX9-NO-DS128-NEXT: ds_read2_b64 v[33:36], v28 offset0:8 offset1:9 ; GFX9-NO-DS128-NEXT: ds_read2_b64 v[10:13], v28 offset1:1 ; GFX9-NO-DS128-NEXT: ds_read2_b64 v[14:17], v28 offset0:2 offset1:3 ; GFX9-NO-DS128-NEXT: s_add_u32 s12, s12, s11 ; GFX9-NO-DS128-NEXT: s_addc_u32 s13, s13, 0 -; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(3) +; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v11 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v0, v11, 0, 16 +; GFX9-NO-DS128-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill +; GFX9-NO-DS128-NEXT: s_nop 0 +; GFX9-NO-DS128-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill +; GFX9-NO-DS128-NEXT: ds_read2_b64 v[20:23], v28 offset0:4 offset1:5 +; GFX9-NO-DS128-NEXT: ds_read2_b64 v[29:32], v28 offset0:6 offset1:7 +; GFX9-NO-DS128-NEXT: ds_read2_b64 v[33:36], v28 offset0:8 offset1:9 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v10 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v5, 16, v13 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v7, 16, v12 +; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v25, 16, v30 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v24, v30, 0, 16 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v27, 16, v29 @@ -4330,7 +4342,7 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v40, 16, v31 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v39, v31, 0, 16 ; GFX9-NO-DS128-NEXT: ds_read2_b64 v[29:32], v28 offset0:10 offset1:11 -; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(3) +; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(1) ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v42, 16, v34 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v41, v34, 0, 16 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v44, 16, v33 @@ -4348,24 +4360,16 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v56, 16, v31 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v55, v31, 0, 16 ; GFX9-NO-DS128-NEXT: ds_read2_b64 v[28:31], v28 offset0:14 offset1:15 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v11 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v0, v11, 0, 16 -; GFX9-NO-DS128-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill -; GFX9-NO-DS128-NEXT: s_nop 0 -; GFX9-NO-DS128-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v54, 16, v32 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v53, v32, 0, 16 +; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v15 +; GFX9-NO-DS128-NEXT: v_bfe_i32 v2, v10, 0, 16 ; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v32, 16, v31 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v31, v31, 0, 16 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v30 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v0, v30, 0, 16 ; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v30, s0 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v10 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v5, 16, v13 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v7, 16, v12 -; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v15 -; GFX9-NO-DS128-NEXT: v_bfe_i32 v2, v10, 0, 16 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v4, v13, 0, 16 ; GFX9-NO-DS128-NEXT: v_bfe_i32 v6, v12, 0, 16 ; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v11, 16, v14 @@ 
-4853,12 +4857,10 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out ; VI-DS128-NEXT: v_mov_b32_e32 v32, s1 ; VI-DS128-NEXT: ds_read_b128 v[8:11], v32 ; VI-DS128-NEXT: ds_read_b128 v[16:19], v32 offset:16 -; VI-DS128-NEXT: ds_read_b128 v[24:27], v32 offset:32 -; VI-DS128-NEXT: ds_read_b128 v[33:36], v32 offset:48 ; VI-DS128-NEXT: s_mov_b32 s91, 0xe80000 ; VI-DS128-NEXT: s_add_u32 s88, s88, s11 ; VI-DS128-NEXT: s_addc_u32 s89, s89, 0 -; VI-DS128-NEXT: s_waitcnt lgkmcnt(3) +; VI-DS128-NEXT: s_waitcnt lgkmcnt(1) ; VI-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v11 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v10 ; VI-DS128-NEXT: v_bfe_i32 v2, v11, 0, 16 @@ -4871,6 +4873,12 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out ; VI-DS128-NEXT: v_ashrrev_i32_e32 v4, 16, v8 ; VI-DS128-NEXT: v_bfe_i32 v5, v9, 0, 16 ; VI-DS128-NEXT: v_bfe_i32 v3, v8, 0, 16 +; VI-DS128-NEXT: buffer_store_dword v3, off, s[88:91], 0 offset:16 ; 4-byte Folded Spill +; VI-DS128-NEXT: buffer_store_dword v4, off, s[88:91], 0 offset:20 ; 4-byte Folded Spill +; VI-DS128-NEXT: buffer_store_dword v5, off, s[88:91], 0 offset:24 ; 4-byte Folded Spill +; VI-DS128-NEXT: buffer_store_dword v6, off, s[88:91], 0 offset:28 ; 4-byte Folded Spill +; VI-DS128-NEXT: ds_read_b128 v[24:27], v32 offset:32 +; VI-DS128-NEXT: ds_read_b128 v[33:36], v32 offset:48 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(2) ; VI-DS128-NEXT: v_ashrrev_i32_e32 v11, 16, v19 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v18 @@ -4891,11 +4899,8 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out ; VI-DS128-NEXT: ds_read_b128 v[36:39], v32 offset:64 ; VI-DS128-NEXT: ds_read_b128 v[40:43], v32 offset:80 ; VI-DS128-NEXT: ds_read_b128 v[56:59], v32 offset:96 -; VI-DS128-NEXT: buffer_store_dword v3, off, s[88:91], 0 offset:16 ; 4-byte Folded Spill -; VI-DS128-NEXT: buffer_store_dword v4, off, s[88:91], 0 offset:20 ; 4-byte Folded Spill -; VI-DS128-NEXT: buffer_store_dword v5, off, s[88:91], 0 offset:24 ; 4-byte Folded Spill -; VI-DS128-NEXT: buffer_store_dword v6, off, s[88:91], 0 offset:28 ; 4-byte Folded Spill ; VI-DS128-NEXT: v_ashrrev_i32_e32 v23, 16, v25 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v21, 16, v24 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(2) ; VI-DS128-NEXT: v_ashrrev_i32_e32 v47, 16, v39 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v45, 16, v38 @@ -4908,15 +4913,14 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out ; VI-DS128-NEXT: v_bfe_i32 v52, v40, 0, 16 ; VI-DS128-NEXT: ds_read_b128 v[37:40], v32 offset:112 ; VI-DS128-NEXT: v_mov_b32_e32 v32, s0 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v21, 16, v24 ; VI-DS128-NEXT: v_bfe_i32 v22, v25, 0, 16 ; VI-DS128-NEXT: v_bfe_i32 v20, v24, 0, 16 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v25, 16, v35 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) ; VI-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v38 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v37 ; VI-DS128-NEXT: v_bfe_i32 v2, v38, 0, 16 ; VI-DS128-NEXT: v_bfe_i32 v0, v37, 0, 16 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v25, 16, v35 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v31, 16, v34 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v29, 16, v33 ; VI-DS128-NEXT: v_bfe_i32 v24, v35, 0, 16 @@ -4981,11 +4985,9 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out ; GFX9-DS128-NEXT: v_mov_b32_e32 v32, s1 ; GFX9-DS128-NEXT: ds_read_b128 v[8:11], v32 ; GFX9-DS128-NEXT: ds_read_b128 v[16:19], v32 offset:16 -; GFX9-DS128-NEXT: ds_read_b128 v[24:27], v32 offset:32 -; GFX9-DS128-NEXT: ds_read_b128 v[33:36], 
v32 offset:48 ; GFX9-DS128-NEXT: s_add_u32 s12, s12, s11 ; GFX9-DS128-NEXT: s_addc_u32 s13, s13, 0 -; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(3) +; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(1) ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v11 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v10 ; GFX9-DS128-NEXT: v_bfe_i32 v2, v11, 0, 16 @@ -4999,6 +5001,13 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v4, 16, v8 ; GFX9-DS128-NEXT: v_bfe_i32 v5, v9, 0, 16 ; GFX9-DS128-NEXT: v_bfe_i32 v3, v8, 0, 16 +; GFX9-DS128-NEXT: buffer_store_dword v3, off, s[12:15], 0 offset:16 ; 4-byte Folded Spill +; GFX9-DS128-NEXT: s_nop 0 +; GFX9-DS128-NEXT: buffer_store_dword v4, off, s[12:15], 0 offset:20 ; 4-byte Folded Spill +; GFX9-DS128-NEXT: buffer_store_dword v5, off, s[12:15], 0 offset:24 ; 4-byte Folded Spill +; GFX9-DS128-NEXT: buffer_store_dword v6, off, s[12:15], 0 offset:28 ; 4-byte Folded Spill +; GFX9-DS128-NEXT: ds_read_b128 v[24:27], v32 offset:32 +; GFX9-DS128-NEXT: ds_read_b128 v[33:36], v32 offset:48 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(2) ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v11, 16, v19 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v18 @@ -5019,12 +5028,8 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out ; GFX9-DS128-NEXT: ds_read_b128 v[36:39], v32 offset:64 ; GFX9-DS128-NEXT: ds_read_b128 v[40:43], v32 offset:80 ; GFX9-DS128-NEXT: ds_read_b128 v[56:59], v32 offset:96 -; GFX9-DS128-NEXT: buffer_store_dword v3, off, s[12:15], 0 offset:16 ; 4-byte Folded Spill -; GFX9-DS128-NEXT: s_nop 0 -; GFX9-DS128-NEXT: buffer_store_dword v4, off, s[12:15], 0 offset:20 ; 4-byte Folded Spill -; GFX9-DS128-NEXT: buffer_store_dword v5, off, s[12:15], 0 offset:24 ; 4-byte Folded Spill -; GFX9-DS128-NEXT: buffer_store_dword v6, off, s[12:15], 0 offset:28 ; 4-byte Folded Spill ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v23, 16, v25 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v21, 16, v24 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(2) ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v47, 16, v39 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v45, 16, v38 @@ -5037,15 +5042,14 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out ; GFX9-DS128-NEXT: v_bfe_i32 v52, v40, 0, 16 ; GFX9-DS128-NEXT: ds_read_b128 v[37:40], v32 offset:112 ; GFX9-DS128-NEXT: v_mov_b32_e32 v32, s0 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v21, 16, v24 ; GFX9-DS128-NEXT: v_bfe_i32 v22, v25, 0, 16 ; GFX9-DS128-NEXT: v_bfe_i32 v20, v24, 0, 16 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v25, 16, v35 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v38 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v37 ; GFX9-DS128-NEXT: v_bfe_i32 v2, v38, 0, 16 ; GFX9-DS128-NEXT: v_bfe_i32 v0, v37, 0, 16 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v25, 16, v35 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v31, 16, v34 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v29, 16, v33 ; GFX9-DS128-NEXT: v_bfe_i32 v24, v35, 0, 16 diff --git a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-lastuse-metadata.ll b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-lastuse-metadata.ll index 9da7a79ba2fdf..1d1d3e4a68fee 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-lastuse-metadata.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-lastuse-metadata.ll @@ -15,23 +15,24 @@ define amdgpu_kernel void @buffer_last_use_load_0(ptr addrspace(7) %in, ptr addr ; GFX12-NEXT: s_mov_b32 s9, s12 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_mov_b32 s6, s3 +; GFX12-NEXT: 
v_mov_b32_e32 v0, s0 ; GFX12-NEXT: s_mov_b32 s8, s1 ; GFX12-NEXT: s_or_b64 s[10:11], s[6:7], s[12:13] ; GFX12-NEXT: s_mov_b32 s13, s2 -; GFX12-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13] +; GFX12-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_LU ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b32 s13, s[4:5], 0x30 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x20 ; GFX12-NEXT: s_mov_b32 s5, s12 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v1, s0 -; GFX12-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_LU ; GFX12-NEXT: s_mov_b32 s4, s3 -; GFX12-NEXT: s_mov_b32 s3, s12 +; GFX12-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-NEXT: s_or_b64 s[6:7], s[4:5], s[12:13] ; GFX12-NEXT: s_mov_b32 s13, s2 ; GFX12-NEXT: s_mov_b32 s2, s1 +; GFX12-NEXT: s_mov_b32 s3, s12 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_or_b64 s[4:5], s[2:3], s[12:13] ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -62,10 +63,10 @@ define amdgpu_kernel void @buffer_last_use_load_1(ptr addrspace(7) %in, ptr addr ; GFX12-NEXT: s_mov_b32 s13, s2 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13] +; GFX12-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_LU ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b32 s13, s[4:5], 0x30 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x20 -; GFX12-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_LU ; GFX12-NEXT: s_mov_b32 s5, s12 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_mov_b32 s4, s3 @@ -99,24 +100,25 @@ define amdgpu_kernel void @buffer_last_use_and_volatile_load(ptr addrspace(7) %i ; GFX12-NEXT: s_mov_b32 s9, s12 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_mov_b32 s6, s3 +; GFX12-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-NEXT: s_mov_b32 s8, s1 ; GFX12-NEXT: s_or_b64 s[10:11], s[6:7], s[12:13] ; GFX12-NEXT: s_mov_b32 s13, s2 -; GFX12-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13] +; GFX12-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_BYPASS scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b32 s13, s[4:5], 0x30 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x20 ; GFX12-NEXT: s_mov_b32 s5, s12 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v1, s0 -; GFX12-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_BYPASS scope:SCOPE_SYS -; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_mov_b32 s4, s3 -; GFX12-NEXT: s_mov_b32 s3, s12 +; GFX12-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-NEXT: s_or_b64 s[6:7], s[4:5], s[12:13] ; GFX12-NEXT: s_mov_b32 s13, s2 ; GFX12-NEXT: s_mov_b32 s2, s1 +; GFX12-NEXT: s_mov_b32 s3, s12 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_or_b64 s[4:5], s[2:3], s[12:13] ; GFX12-NEXT: buffer_store_b32 v0, v1, s[4:7], null offen @@ -139,23 +141,24 @@ define amdgpu_kernel void @buffer_last_use_and_nontemporal_load(ptr addrspace(7) ; GFX12-NEXT: s_mov_b32 s9, s12 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_mov_b32 s6, s3 +; GFX12-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-NEXT: s_mov_b32 s8, s1 ; GFX12-NEXT: s_or_b64 s[10:11], s[6:7], s[12:13] ; GFX12-NEXT: s_mov_b32 s13, s2 -; GFX12-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13] +; GFX12-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_LU ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: s_load_b32 s13, s[4:5], 
0x30 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x20 ; GFX12-NEXT: s_mov_b32 s5, s12 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v1, s0 -; GFX12-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_LU ; GFX12-NEXT: s_mov_b32 s4, s3 -; GFX12-NEXT: s_mov_b32 s3, s12 +; GFX12-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-NEXT: s_or_b64 s[6:7], s[4:5], s[12:13] ; GFX12-NEXT: s_mov_b32 s13, s2 ; GFX12-NEXT: s_mov_b32 s2, s1 +; GFX12-NEXT: s_mov_b32 s3, s12 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_or_b64 s[4:5], s[2:3], s[12:13] ; GFX12-NEXT: s_wait_loadcnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-nontemporal-metadata.ll b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-nontemporal-metadata.ll index 84db54c2d537f..fc36ed939d91d 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-nontemporal-metadata.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-nontemporal-metadata.ll @@ -128,10 +128,10 @@ define amdgpu_kernel void @buffer_nontemporal_load_store(ptr addrspace(7) %in, p ; GFX10-SDAG-NEXT: s_or_b64 s[6:7], s[4:5], s[10:11] ; GFX10-SDAG-NEXT: s_mov_b32 s11, s2 ; GFX10-SDAG-NEXT: s_or_b64 s[4:5], s[12:13], s[10:11] +; GFX10-SDAG-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen slc ; GFX10-SDAG-NEXT: s_clause 0x1 ; GFX10-SDAG-NEXT: s_load_dword s11, s[8:9], 0x30 ; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x20 -; GFX10-SDAG-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen slc ; GFX10-SDAG-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SDAG-NEXT: s_mov_b32 s5, s10 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -181,23 +181,24 @@ define amdgpu_kernel void @buffer_nontemporal_load_store(ptr addrspace(7) %in, p ; GFX11-SDAG-NEXT: s_mov_b32 s9, s12 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: s_mov_b32 s6, s3 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-SDAG-NEXT: s_mov_b32 s8, s1 ; GFX11-SDAG-NEXT: s_or_b64 s[10:11], s[6:7], s[12:13] ; GFX11-SDAG-NEXT: s_mov_b32 s13, s2 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-SDAG-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13] +; GFX11-SDAG-NEXT: buffer_load_b32 v0, v0, s[8:11], 0 offen slc dlc ; GFX11-SDAG-NEXT: s_clause 0x1 ; GFX11-SDAG-NEXT: s_load_b32 s13, s[4:5], 0x30 ; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x20 ; GFX11-SDAG-NEXT: s_mov_b32 s5, s12 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s0 -; GFX11-SDAG-NEXT: buffer_load_b32 v0, v0, s[8:11], 0 offen slc dlc ; GFX11-SDAG-NEXT: s_mov_b32 s4, s3 -; GFX11-SDAG-NEXT: s_mov_b32 s3, s12 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-SDAG-NEXT: s_or_b64 s[6:7], s[4:5], s[12:13] ; GFX11-SDAG-NEXT: s_mov_b32 s13, s2 ; GFX11-SDAG-NEXT: s_mov_b32 s2, s1 +; GFX11-SDAG-NEXT: s_mov_b32 s3, s12 ; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-SDAG-NEXT: s_or_b64 s[4:5], s[2:3], s[12:13] ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) @@ -214,12 +215,12 @@ define amdgpu_kernel void @buffer_nontemporal_load_store(ptr addrspace(7) %in, p ; GFX11-GISEL-NEXT: s_mov_b32 s8, s1 ; GFX11-GISEL-NEXT: s_mov_b32 s9, s2 ; GFX11-GISEL-NEXT: s_mov_b32 s10, s3 +; GFX11-GISEL-NEXT: buffer_load_b32 v0, v0, s[8:11], 0 offen slc dlc ; GFX11-GISEL-NEXT: s_clause 0x1 ; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x20 ; GFX11-GISEL-NEXT: s_load_b32 s7, s[4:5], 0x30 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, s0 -; GFX11-GISEL-NEXT: buffer_load_b32 v0, v0, s[8:11], 0 offen slc dlc ; GFX11-GISEL-NEXT: s_mov_b32 s4, s1 
; GFX11-GISEL-NEXT: s_mov_b32 s5, s2 ; GFX11-GISEL-NEXT: s_mov_b32 s6, s3 @@ -238,23 +239,24 @@ define amdgpu_kernel void @buffer_nontemporal_load_store(ptr addrspace(7) %in, p ; GFX12-SDAG-NEXT: s_mov_b32 s9, s12 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: s_mov_b32 s6, s3 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-SDAG-NEXT: s_mov_b32 s8, s1 ; GFX12-SDAG-NEXT: s_or_b64 s[10:11], s[6:7], s[12:13] ; GFX12-SDAG-NEXT: s_mov_b32 s13, s2 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-SDAG-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13] +; GFX12-SDAG-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_NT ; GFX12-SDAG-NEXT: s_clause 0x1 ; GFX12-SDAG-NEXT: s_load_b32 s13, s[4:5], 0x30 ; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x20 ; GFX12-SDAG-NEXT: s_mov_b32 s5, s12 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s0 -; GFX12-SDAG-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_NT ; GFX12-SDAG-NEXT: s_mov_b32 s4, s3 -; GFX12-SDAG-NEXT: s_mov_b32 s3, s12 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-SDAG-NEXT: s_or_b64 s[6:7], s[4:5], s[12:13] ; GFX12-SDAG-NEXT: s_mov_b32 s13, s2 ; GFX12-SDAG-NEXT: s_mov_b32 s2, s1 +; GFX12-SDAG-NEXT: s_mov_b32 s3, s12 ; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-SDAG-NEXT: s_or_b64 s[4:5], s[2:3], s[12:13] ; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 @@ -271,12 +273,12 @@ define amdgpu_kernel void @buffer_nontemporal_load_store(ptr addrspace(7) %in, p ; GFX12-GISEL-NEXT: s_mov_b32 s8, s1 ; GFX12-GISEL-NEXT: s_mov_b32 s9, s2 ; GFX12-GISEL-NEXT: s_mov_b32 s10, s3 +; GFX12-GISEL-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_NT ; GFX12-GISEL-NEXT: s_clause 0x1 ; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x20 ; GFX12-GISEL-NEXT: s_load_b32 s7, s[4:5], 0x30 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s0 -; GFX12-GISEL-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_NT ; GFX12-GISEL-NEXT: s_mov_b32 s4, s1 ; GFX12-GISEL-NEXT: s_mov_b32 s5, s2 ; GFX12-GISEL-NEXT: s_mov_b32 s6, s3 @@ -411,11 +413,11 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp ; GFX10-SDAG-NEXT: s_or_b64 s[6:7], s[4:5], s[10:11] ; GFX10-SDAG-NEXT: s_mov_b32 s11, s2 ; GFX10-SDAG-NEXT: s_or_b64 s[4:5], s[12:13], s[10:11] +; GFX10-SDAG-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen glc dlc +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX10-SDAG-NEXT: s_clause 0x1 ; GFX10-SDAG-NEXT: s_load_dword s11, s[8:9], 0x30 ; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x20 -; GFX10-SDAG-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen glc dlc -; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX10-SDAG-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SDAG-NEXT: s_mov_b32 s5, s10 ; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -466,24 +468,25 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp ; GFX11-SDAG-NEXT: s_mov_b32 s9, s12 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: s_mov_b32 s6, s3 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-SDAG-NEXT: s_mov_b32 s8, s1 ; GFX11-SDAG-NEXT: s_or_b64 s[10:11], s[6:7], s[12:13] ; GFX11-SDAG-NEXT: s_mov_b32 s13, s2 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-SDAG-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13] +; GFX11-SDAG-NEXT: buffer_load_b32 v0, v0, s[8:11], 0 offen glc dlc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX11-SDAG-NEXT: s_clause 0x1 ; GFX11-SDAG-NEXT: s_load_b32 s13, 
s[4:5], 0x30 ; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x20 ; GFX11-SDAG-NEXT: s_mov_b32 s5, s12 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s0 -; GFX11-SDAG-NEXT: buffer_load_b32 v0, v0, s[8:11], 0 offen glc dlc -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX11-SDAG-NEXT: s_mov_b32 s4, s3 -; GFX11-SDAG-NEXT: s_mov_b32 s3, s12 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s0 ; GFX11-SDAG-NEXT: s_or_b64 s[6:7], s[4:5], s[12:13] ; GFX11-SDAG-NEXT: s_mov_b32 s13, s2 ; GFX11-SDAG-NEXT: s_mov_b32 s2, s1 +; GFX11-SDAG-NEXT: s_mov_b32 s3, s12 ; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-SDAG-NEXT: s_or_b64 s[4:5], s[2:3], s[12:13] ; GFX11-SDAG-NEXT: buffer_store_b32 v0, v1, s[4:7], 0 offen dlc @@ -500,13 +503,13 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp ; GFX11-GISEL-NEXT: s_mov_b32 s8, s1 ; GFX11-GISEL-NEXT: s_mov_b32 s9, s2 ; GFX11-GISEL-NEXT: s_mov_b32 s10, s3 +; GFX11-GISEL-NEXT: buffer_load_b32 v0, v0, s[8:11], 0 offen glc dlc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX11-GISEL-NEXT: s_clause 0x1 ; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x20 ; GFX11-GISEL-NEXT: s_load_b32 s7, s[4:5], 0x30 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, s0 -; GFX11-GISEL-NEXT: buffer_load_b32 v0, v0, s[8:11], 0 offen glc dlc -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX11-GISEL-NEXT: s_mov_b32 s4, s1 ; GFX11-GISEL-NEXT: s_mov_b32 s5, s2 ; GFX11-GISEL-NEXT: s_mov_b32 s6, s3 @@ -525,24 +528,25 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp ; GFX12-SDAG-NEXT: s_mov_b32 s9, s12 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: s_mov_b32 s6, s3 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-SDAG-NEXT: s_mov_b32 s8, s1 ; GFX12-SDAG-NEXT: s_or_b64 s[10:11], s[6:7], s[12:13] ; GFX12-SDAG-NEXT: s_mov_b32 s13, s2 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-SDAG-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13] +; GFX12-SDAG-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_NT scope:SCOPE_SYS +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: s_clause 0x1 ; GFX12-SDAG-NEXT: s_load_b32 s13, s[4:5], 0x30 ; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x20 ; GFX12-SDAG-NEXT: s_mov_b32 s5, s12 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s0 -; GFX12-SDAG-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_NT scope:SCOPE_SYS -; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: s_mov_b32 s4, s3 -; GFX12-SDAG-NEXT: s_mov_b32 s3, s12 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-SDAG-NEXT: s_or_b64 s[6:7], s[4:5], s[12:13] ; GFX12-SDAG-NEXT: s_mov_b32 s13, s2 ; GFX12-SDAG-NEXT: s_mov_b32 s2, s1 +; GFX12-SDAG-NEXT: s_mov_b32 s3, s12 ; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-SDAG-NEXT: s_or_b64 s[4:5], s[2:3], s[12:13] ; GFX12-SDAG-NEXT: buffer_store_b32 v0, v1, s[4:7], null offen th:TH_STORE_NT scope:SCOPE_SYS @@ -559,13 +563,13 @@ define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrsp ; GFX12-GISEL-NEXT: s_mov_b32 s8, s1 ; GFX12-GISEL-NEXT: s_mov_b32 s9, s2 ; GFX12-GISEL-NEXT: s_mov_b32 s10, s3 +; GFX12-GISEL-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_NT scope:SCOPE_SYS +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: s_clause 0x1 ; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x20 ; GFX12-GISEL-NEXT: s_load_b32 s7, s[4:5], 0x30 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; 
GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s0 -; GFX12-GISEL-NEXT: buffer_load_b32 v0, v0, s[8:11], null offen th:TH_LOAD_NT scope:SCOPE_SYS -; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: s_mov_b32 s4, s1 ; GFX12-GISEL-NEXT: s_mov_b32 s5, s2 ; GFX12-GISEL-NEXT: s_mov_b32 s6, s3 diff --git a/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-memcpy.ll b/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-memcpy.ll index c1f4d7bbf650e..4ab05c2923fdb 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-memcpy.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-memcpy.ll @@ -19,12 +19,12 @@ $_f2 = comdat any define protected amdgpu_kernel void @test(ptr addrspace(1) nocapture %ptr.coerce) local_unnamed_addr #0 { ; GCN-LABEL: test: ; GCN: ; %bb.0: ; %entry +; GCN-NEXT: v_mov_b32_e32 v0, 2 ; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: ds_write_b8 v1, v0 ; GCN-NEXT: ds_read_u8 v2, v1 offset:2 ; GCN-NEXT: ds_read_u16 v3, v1 ; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GCN-NEXT: v_mov_b32_e32 v0, 2 -; GCN-NEXT: ds_write_b8 v1, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: ds_write_b8 v1, v2 offset:6 ; GCN-NEXT: ds_write_b16 v1, v3 offset:4 diff --git a/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa.ll b/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa.ll index ccfd45bc87e71..24c1bfb8d50f0 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa.ll @@ -75,15 +75,15 @@ define amdgpu_kernel void @no_clobber_ds_load_stores_x3(ptr addrspace(1) %arg, i ; GCN-LABEL: no_clobber_ds_load_stores_x3: ; GCN: ; %bb.0: ; %bb ; GCN-NEXT: s_load_dword s0, s[4:5], 0x2c -; GCN-NEXT: v_mov_b32_e32 v0, 1 ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: v_mov_b32_e32 v2, 2 -; GCN-NEXT: ds_write_b32 v1, v0 +; GCN-NEXT: v_mov_b32_e32 v0, 1 +; GCN-NEXT: ds_write_b32 v1, v2 offset:256 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_lshl_b32 s0, s0, 2 -; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: ds_write_b32 v1, v2 offset:256 ; GCN-NEXT: v_mov_b32_e32 v2, 3 +; GCN-NEXT: ds_write_b32 v1, v0 +; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: ds_write_b32 v1, v2 offset:512 ; GCN-NEXT: ds_read_b32 v2, v0 ; GCN-NEXT: ds_read_b32 v3, v0 offset:256 diff --git a/llvm/test/CodeGen/AMDGPU/max.ll b/llvm/test/CodeGen/AMDGPU/max.ll index ba532949a687d..ae0805448d693 100644 --- a/llvm/test/CodeGen/AMDGPU/max.ll +++ b/llvm/test/CodeGen/AMDGPU/max.ll @@ -774,9 +774,9 @@ define amdgpu_kernel void @v_test_umax_ugt_i32(ptr addrspace(1) %out, ptr addrsp ; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1250-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b32 v0, v0, s[0:1] scale_offset ; GFX1250-NEXT: s_load_b32 s6, s[0:1], 0x0 ; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 -; GFX1250-NEXT: global_load_b32 v0, v0, s[0:1] scale_offset ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_max_u32_e32 v0, s6, v0 diff --git a/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll b/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll index af7ca0fb59682..02f39e25cb447 100644 --- a/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll +++ b/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll @@ -714,7 +714,7 @@ define void @memcpy_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0) ; ALIGNED-NEXT: flat_store_byte v[16:17], v4 offset:1 ; ALIGNED-NEXT: s_cbranch_vccnz .LBB0_1 ; ALIGNED-NEXT: ; %bb.2: ; %memcpy-split -; ALIGNED-NEXT: s_clause 0x7 ; 32-byte Folded Reload +; ALIGNED-NEXT: s_clause 0x7 ; ALIGNED-NEXT: buffer_load_dword v47, off, 
s[0:3], s32 ; ALIGNED-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; ALIGNED-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 @@ -1468,7 +1468,7 @@ define void @memcpy_p1_p1_sz2048(ptr addrspace(1) align 1 %dst, ptr addrspace(1) ; ALIGNED-NEXT: global_store_byte v[16:17], v4, off offset:1 ; ALIGNED-NEXT: s_cbranch_vccnz .LBB1_1 ; ALIGNED-NEXT: ; %bb.2: ; %memcpy-split -; ALIGNED-NEXT: s_clause 0x7 ; 32-byte Folded Reload +; ALIGNED-NEXT: s_clause 0x7 ; ALIGNED-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; ALIGNED-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; ALIGNED-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 @@ -1854,10 +1854,6 @@ define void @memcpy_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4) ; ALIGNED-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:220 ; ALIGNED-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:212 ; ALIGNED-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:208 -; ALIGNED-NEXT: flat_store_byte v[96:97], v82 offset:143 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v18 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 8, v26 ; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v38 offset:138 ; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v39 offset:142 ; ALIGNED-NEXT: flat_store_byte v[96:97], v39 offset:140 @@ -1866,6 +1862,10 @@ define void @memcpy_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4) ; ALIGNED-NEXT: flat_store_byte v[96:97], v37 offset:132 ; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v36 offset:130 ; ALIGNED-NEXT: flat_store_byte v[96:97], v36 offset:128 +; ALIGNED-NEXT: flat_store_byte v[96:97], v82 offset:143 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v18 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 8, v26 ; ALIGNED-NEXT: flat_store_byte v[96:97], v66 offset:139 ; ALIGNED-NEXT: flat_store_byte v[96:97], v67 offset:137 ; ALIGNED-NEXT: flat_store_byte v[96:97], v83 offset:141 @@ -1901,6 +1901,14 @@ define void @memcpy_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4) ; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:60 ; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:52 ; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:48 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v30 offset:106 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v31 offset:110 +; ALIGNED-NEXT: flat_store_byte v[96:97], v31 offset:108 +; ALIGNED-NEXT: flat_store_byte v[96:97], v30 offset:104 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v29 offset:102 +; ALIGNED-NEXT: flat_store_byte v[96:97], v29 offset:100 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v28 offset:98 +; ALIGNED-NEXT: flat_store_byte v[96:97], v28 offset:96 ; ALIGNED-NEXT: flat_store_byte v[96:97], v69 offset:111 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 24, v6 @@ -1915,14 +1923,6 @@ define void @memcpy_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4) ; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v25 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 24, v19 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v15 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v30 offset:106 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v31 offset:110 -; ALIGNED-NEXT: flat_store_byte v[96:97], v31 offset:108 -; ALIGNED-NEXT: flat_store_byte v[96:97], v30 offset:104 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v29 offset:102 -; ALIGNED-NEXT: flat_store_byte v[96:97], v29 offset:100 -; ALIGNED-NEXT: 
flat_store_byte_d16_hi v[96:97], v28 offset:98 -; ALIGNED-NEXT: flat_store_byte v[96:97], v28 offset:96 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v28, 24, v11 ; ALIGNED-NEXT: flat_store_byte v[96:97], v71 offset:103 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 24, v7 @@ -3438,7 +3438,7 @@ define void @memcpy_p5_p5_sz2048(ptr addrspace(5) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: v_add_nc_u32_e32 v0, 0x100, v0 ; ALIGNED-NEXT: s_cbranch_vccnz .LBB3_1 ; ALIGNED-NEXT: ; %bb.2: ; %memcpy-split -; ALIGNED-NEXT: s_clause 0x2f ; 192-byte Folded Reload +; ALIGNED-NEXT: s_clause 0x2f ; ALIGNED-NEXT: buffer_load_dword v127, off, s[0:3], s32 ; ALIGNED-NEXT: buffer_load_dword v126, off, s[0:3], s32 offset:4 ; ALIGNED-NEXT: buffer_load_dword v125, off, s[0:3], s32 offset:8 @@ -3741,23 +3741,23 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1228 ; 4-byte Folded Spill ; ALIGNED-NEXT: .LBB4_1: ; %load-store-loop ; ALIGNED-NEXT: ; =>This Inner Loop Header: Depth=1 -; ALIGNED-NEXT: s_clause 0x3e +; ALIGNED-NEXT: s_clause 0x39 ; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:20 ; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:21 ; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:22 ; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:23 +; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:24 +; ALIGNED-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:25 +; ALIGNED-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:26 ; ALIGNED-NEXT: buffer_load_ubyte v126, v2, s[0:3], 0 offen offset:19 ; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:28 ; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:29 ; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:30 ; ALIGNED-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:31 -; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:24 -; ALIGNED-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:25 -; ALIGNED-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:26 -; ALIGNED-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:27 ; ALIGNED-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:32 ; ALIGNED-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:33 ; ALIGNED-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:34 +; ALIGNED-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:27 ; ALIGNED-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:35 ; ALIGNED-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:36 ; ALIGNED-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:37 @@ -3779,17 +3779,17 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: buffer_load_ubyte v37, v2, s[0:3], 0 offen offset:53 ; ALIGNED-NEXT: buffer_load_ubyte v35, v2, s[0:3], 0 offen offset:54 ; ALIGNED-NEXT: buffer_load_ubyte v36, v2, s[0:3], 0 offen offset:55 +; ALIGNED-NEXT: buffer_load_ubyte v48, v2, s[0:3], 0 offen offset:56 +; ALIGNED-NEXT: buffer_load_ubyte v51, v2, s[0:3], 0 offen offset:57 +; ALIGNED-NEXT: buffer_load_ubyte v52, v2, s[0:3], 0 offen offset:58 ; ALIGNED-NEXT: buffer_load_ubyte v38, v2, s[0:3], 0 offen offset:60 ; ALIGNED-NEXT: buffer_load_ubyte v50, v2, s[0:3], 0 offen offset:61 ; ALIGNED-NEXT: buffer_load_ubyte v39, v2, s[0:3], 0 offen offset:62 ; ALIGNED-NEXT: buffer_load_ubyte v49, v2, s[0:3], 0 offen offset:63 -; ALIGNED-NEXT: buffer_load_ubyte v48, v2, s[0:3], 0 
offen offset:56 -; ALIGNED-NEXT: buffer_load_ubyte v51, v2, s[0:3], 0 offen offset:57 -; ALIGNED-NEXT: buffer_load_ubyte v52, v2, s[0:3], 0 offen offset:58 -; ALIGNED-NEXT: buffer_load_ubyte v53, v2, s[0:3], 0 offen offset:59 ; ALIGNED-NEXT: buffer_load_ubyte v29, v2, s[0:3], 0 offen offset:64 ; ALIGNED-NEXT: buffer_load_ubyte v55, v2, s[0:3], 0 offen offset:65 ; ALIGNED-NEXT: buffer_load_ubyte v66, v2, s[0:3], 0 offen offset:66 +; ALIGNED-NEXT: buffer_load_ubyte v53, v2, s[0:3], 0 offen offset:59 ; ALIGNED-NEXT: buffer_load_ubyte v67, v2, s[0:3], 0 offen offset:67 ; ALIGNED-NEXT: buffer_load_ubyte v54, v2, s[0:3], 0 offen offset:68 ; ALIGNED-NEXT: buffer_load_ubyte v64, v2, s[0:3], 0 offen offset:69 @@ -3797,96 +3797,57 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: buffer_load_ubyte v68, v2, s[0:3], 0 offen offset:71 ; ALIGNED-NEXT: buffer_load_ubyte v69, v2, s[0:3], 0 offen offset:76 ; ALIGNED-NEXT: buffer_load_ubyte v70, v2, s[0:3], 0 offen offset:77 +; ALIGNED-NEXT: buffer_load_ubyte v81, v2, s[0:3], 0 offen offset:75 ; ALIGNED-NEXT: buffer_load_ubyte v71, v2, s[0:3], 0 offen offset:78 ; ALIGNED-NEXT: buffer_load_ubyte v80, v2, s[0:3], 0 offen offset:79 -; ALIGNED-NEXT: buffer_load_ubyte v81, v2, s[0:3], 0 offen offset:75 -; ALIGNED-NEXT: buffer_load_ubyte v127, v2, s[0:3], 0 offen offset:159 -; ALIGNED-NEXT: buffer_load_ubyte v124, v2, s[0:3], 0 offen offset:155 -; ALIGNED-NEXT: buffer_load_ubyte v125, v2, s[0:3], 0 offen offset:152 -; ALIGNED-NEXT: buffer_load_ubyte v123, v2, s[0:3], 0 offen offset:153 -; ALIGNED-NEXT: buffer_load_ubyte v111, v2, s[0:3], 0 offen offset:154 -; ALIGNED-NEXT: s_clause 0x33 -; ALIGNED-NEXT: buffer_load_ubyte v110, v2, s[0:3], 0 offen offset:160 -; ALIGNED-NEXT: buffer_load_ubyte v105, v2, s[0:3], 0 offen offset:161 -; ALIGNED-NEXT: buffer_load_ubyte v94, v2, s[0:3], 0 offen offset:162 -; ALIGNED-NEXT: buffer_load_ubyte v93, v2, s[0:3], 0 offen offset:163 -; ALIGNED-NEXT: buffer_load_ubyte v108, v2, s[0:3], 0 offen offset:164 -; ALIGNED-NEXT: buffer_load_ubyte v95, v2, s[0:3], 0 offen offset:165 -; ALIGNED-NEXT: buffer_load_ubyte v104, v2, s[0:3], 0 offen offset:166 -; ALIGNED-NEXT: buffer_load_ubyte v92, v2, s[0:3], 0 offen offset:167 -; ALIGNED-NEXT: buffer_load_ubyte v116, v2, s[0:3], 0 offen offset:192 -; ALIGNED-NEXT: buffer_load_ubyte v113, v2, s[0:3], 0 offen offset:193 -; ALIGNED-NEXT: buffer_load_ubyte v101, v2, s[0:3], 0 offen offset:194 -; ALIGNED-NEXT: buffer_load_ubyte v102, v2, s[0:3], 0 offen offset:195 -; ALIGNED-NEXT: buffer_load_ubyte v114, v2, s[0:3], 0 offen offset:196 -; ALIGNED-NEXT: buffer_load_ubyte v103, v2, s[0:3], 0 offen offset:197 -; ALIGNED-NEXT: buffer_load_ubyte v112, v2, s[0:3], 0 offen offset:198 -; ALIGNED-NEXT: buffer_load_ubyte v100, v2, s[0:3], 0 offen offset:199 -; ALIGNED-NEXT: buffer_load_ubyte v98, v2, s[0:3], 0 offen offset:204 -; ALIGNED-NEXT: buffer_load_ubyte v97, v2, s[0:3], 0 offen offset:205 -; ALIGNED-NEXT: buffer_load_ubyte v96, v2, s[0:3], 0 offen offset:206 -; ALIGNED-NEXT: buffer_load_ubyte v87, v2, s[0:3], 0 offen offset:207 -; ALIGNED-NEXT: buffer_load_ubyte v85, v2, s[0:3], 0 offen offset:203 -; ALIGNED-NEXT: buffer_load_ubyte v86, v2, s[0:3], 0 offen offset:200 -; ALIGNED-NEXT: buffer_load_ubyte v84, v2, s[0:3], 0 offen offset:201 -; ALIGNED-NEXT: buffer_load_ubyte v83, v2, s[0:3], 0 offen offset:202 -; ALIGNED-NEXT: buffer_load_ubyte v90, v2, s[0:3], 0 offen offset:172 -; ALIGNED-NEXT: buffer_load_ubyte v79, v2, s[0:3], 0 offen offset:173 -; 
ALIGNED-NEXT: buffer_load_ubyte v88, v2, s[0:3], 0 offen offset:174 -; ALIGNED-NEXT: buffer_load_ubyte v78, v2, s[0:3], 0 offen offset:175 -; ALIGNED-NEXT: buffer_load_ubyte v75, v2, s[0:3], 0 offen offset:171 -; ALIGNED-NEXT: buffer_load_ubyte v76, v2, s[0:3], 0 offen offset:168 -; ALIGNED-NEXT: buffer_load_ubyte v72, v2, s[0:3], 0 offen offset:169 -; ALIGNED-NEXT: buffer_load_ubyte v63, v2, s[0:3], 0 offen offset:170 -; ALIGNED-NEXT: buffer_load_ubyte v62, v2, s[0:3], 0 offen offset:176 -; ALIGNED-NEXT: buffer_load_ubyte v60, v2, s[0:3], 0 offen offset:177 -; ALIGNED-NEXT: buffer_load_ubyte v57, v2, s[0:3], 0 offen offset:178 -; ALIGNED-NEXT: buffer_load_ubyte v56, v2, s[0:3], 0 offen offset:179 -; ALIGNED-NEXT: buffer_load_ubyte v61, v2, s[0:3], 0 offen offset:180 -; ALIGNED-NEXT: buffer_load_ubyte v59, v2, s[0:3], 0 offen offset:181 -; ALIGNED-NEXT: buffer_load_ubyte v58, v2, s[0:3], 0 offen offset:182 -; ALIGNED-NEXT: buffer_load_ubyte v47, v2, s[0:3], 0 offen offset:183 -; ALIGNED-NEXT: buffer_load_ubyte v45, v2, s[0:3], 0 offen offset:188 -; ALIGNED-NEXT: buffer_load_ubyte v43, v2, s[0:3], 0 offen offset:189 -; ALIGNED-NEXT: buffer_load_ubyte v44, v2, s[0:3], 0 offen offset:190 -; ALIGNED-NEXT: buffer_load_ubyte v42, v2, s[0:3], 0 offen offset:191 -; ALIGNED-NEXT: buffer_load_ubyte v40, v2, s[0:3], 0 offen offset:187 -; ALIGNED-NEXT: buffer_load_ubyte v41, v2, s[0:3], 0 offen offset:184 -; ALIGNED-NEXT: buffer_load_ubyte v119, v2, s[0:3], 0 offen offset:185 -; ALIGNED-NEXT: buffer_load_ubyte v118, v2, s[0:3], 0 offen offset:186 -; ALIGNED-NEXT: buffer_load_ubyte v121, v2, s[0:3], 0 offen offset:2 -; ALIGNED-NEXT: buffer_load_ubyte v107, v2, s[0:3], 0 offen offset:4 -; ALIGNED-NEXT: buffer_load_ubyte v109, v2, s[0:3], 0 offen offset:5 -; ALIGNED-NEXT: buffer_load_ubyte v120, v2, s[0:3], 0 offen offset:6 -; ALIGNED-NEXT: s_waitcnt vmcnt(62) +; ALIGNED-NEXT: s_waitcnt vmcnt(57) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(56) ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(55) ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(54) ; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(53) +; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(52) +; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(51) +; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 +; ALIGNED-NEXT: s_waitcnt vmcnt(49) ; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(48) ; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(47) ; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(46) ; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 
offset:492 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(45) ; ALIGNED-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v3 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v7, 8, v5 +; ALIGNED-NEXT: s_waitcnt vmcnt(42) +; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v4, v9, 8, v8 +; ALIGNED-NEXT: s_waitcnt vmcnt(40) ; ALIGNED-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v5, v10, 8, v6 ; ALIGNED-NEXT: v_lshl_or_b32 v6, v11, 8, v12 ; ALIGNED-NEXT: v_lshl_or_b32 v7, v15, 8, v14 ; ALIGNED-NEXT: v_lshl_or_b32 v8, v19, 8, v17 +; ALIGNED-NEXT: s_waitcnt vmcnt(39) ; ALIGNED-NEXT: v_lshl_or_b32 v9, v16, 8, v13 +; ALIGNED-NEXT: s_waitcnt vmcnt(37) ; ALIGNED-NEXT: v_lshl_or_b32 v10, v20, 8, v18 +; ALIGNED-NEXT: s_waitcnt vmcnt(35) ; ALIGNED-NEXT: v_lshl_or_b32 v11, v23, 8, v22 +; ALIGNED-NEXT: s_waitcnt vmcnt(33) ; ALIGNED-NEXT: v_lshl_or_b32 v12, v28, 8, v25 +; ALIGNED-NEXT: s_waitcnt vmcnt(31) ; ALIGNED-NEXT: v_lshl_or_b32 v13, v24, 8, v21 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: s_waitcnt vmcnt(29) ; ALIGNED-NEXT: v_lshl_or_b32 v14, v27, 8, v26 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 16, v3 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v6, 16, v5 @@ -3895,83 +3856,76 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: v_lshl_or_b32 v6, v12, 16, v11 ; ALIGNED-NEXT: v_lshl_or_b32 v7, v14, 16, v13 ; ALIGNED-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(27) ; ALIGNED-NEXT: v_lshl_or_b32 v15, v31, 8, v30 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(25) ; ALIGNED-NEXT: v_lshl_or_b32 v0, v34, 8, v33 ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(23) ; ALIGNED-NEXT: v_lshl_or_b32 v1, v37, 8, v32 ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(21) ; ALIGNED-NEXT: v_lshl_or_b32 v3, v36, 8, v35 ; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(16) ; ALIGNED-NEXT: v_lshl_or_b32 v4, v50, 8, v38 ; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(14) ; ALIGNED-NEXT: v_lshl_or_b32 v5, v49, 8, v39 ; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v6, v51, 8, v48 ; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(10) ; ALIGNED-NEXT: v_lshl_or_b32 v7, v53, 8, v52 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v0, 16, v15 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 16, v1 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v5, 16, v4 -; ALIGNED-NEXT: s_clause 0x1 ; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:85 -; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:87 ; ALIGNED-NEXT: v_lshl_or_b32 v4, v7, 16, v6 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v55, 8, v29 ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:656 ; 
4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(10) ; ALIGNED-NEXT: v_lshl_or_b32 v1, v67, 8, v66 ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(8) ; ALIGNED-NEXT: v_lshl_or_b32 v3, v64, 8, v54 ; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(6) ; ALIGNED-NEXT: v_lshl_or_b32 v4, v68, 8, v65 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:82 ; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:86 -; ALIGNED-NEXT: buffer_store_dword v65, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:82 +; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 16, v3 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(62) +; ALIGNED-NEXT: s_waitcnt vmcnt(6) ; ALIGNED-NEXT: v_lshl_or_b32 v0, v70, 8, v69 ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:74 ; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:83 +; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:74 ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(5) ; ALIGNED-NEXT: v_lshl_or_b32 v1, v80, 8, v71 -; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v71, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(62) -; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:73 -; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:72 -; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v64, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v70, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v80, 
off, s[0:3], s32 offset:716 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x7 -; ALIGNED-NEXT: buffer_load_ubyte v80, v2, s[0:3], 0 offen offset:216 -; ALIGNED-NEXT: buffer_load_ubyte v67, v2, s[0:3], 0 offen offset:217 -; ALIGNED-NEXT: buffer_load_ubyte v53, v2, s[0:3], 0 offen offset:218 -; ALIGNED-NEXT: buffer_load_ubyte v54, v2, s[0:3], 0 offen offset:219 -; ALIGNED-NEXT: buffer_load_ubyte v70, v2, s[0:3], 0 offen offset:220 -; ALIGNED-NEXT: buffer_load_ubyte v55, v2, s[0:3], 0 offen offset:221 -; ALIGNED-NEXT: buffer_load_ubyte v64, v2, s[0:3], 0 offen offset:222 -; ALIGNED-NEXT: buffer_load_ubyte v52, v2, s[0:3], 0 offen offset:223 -; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x2 -; ALIGNED-NEXT: buffer_load_ubyte v50, v2, s[0:3], 0 offen offset:208 -; ALIGNED-NEXT: buffer_load_ubyte v39, v2, s[0:3], 0 offen offset:209 -; ALIGNED-NEXT: buffer_load_ubyte v48, v2, s[0:3], 0 offen offset:210 +; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill @@ -3980,82 +3934,52 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x7 -; ALIGNED-NEXT: buffer_load_ubyte v38, v2, s[0:3], 0 offen offset:224 -; ALIGNED-NEXT: buffer_load_ubyte v36, v2, s[0:3], 0 offen offset:225 -; ALIGNED-NEXT: buffer_load_ubyte v33, v2, s[0:3], 0 offen offset:226 -; ALIGNED-NEXT: buffer_load_ubyte v32, v2, s[0:3], 0 offen offset:227 -; ALIGNED-NEXT: buffer_load_ubyte v37, v2, s[0:3], 0 offen offset:228 -; ALIGNED-NEXT: buffer_load_ubyte v35, v2, s[0:3], 0 offen offset:229 -; ALIGNED-NEXT: buffer_load_ubyte v34, v2, s[0:3], 0 offen offset:230 -; ALIGNED-NEXT: buffer_load_ubyte v31, v2, s[0:3], 0 offen offset:231 -; ALIGNED-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v49, off, s[0:3], 
s32 offset:632 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x4 -; ALIGNED-NEXT: buffer_load_ubyte v30, v2, s[0:3], 0 offen offset:236 -; ALIGNED-NEXT: buffer_load_ubyte v29, v2, s[0:3], 0 offen offset:237 -; ALIGNED-NEXT: buffer_load_ubyte v28, v2, s[0:3], 0 offen offset:238 -; ALIGNED-NEXT: buffer_load_ubyte v26, v2, s[0:3], 0 offen offset:239 -; ALIGNED-NEXT: buffer_load_ubyte v25, v2, s[0:3], 0 offen offset:235 -; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x2 -; ALIGNED-NEXT: buffer_load_ubyte v24, v2, s[0:3], 0 offen offset:232 -; ALIGNED-NEXT: buffer_load_ubyte v23, v2, s[0:3], 0 offen offset:233 -; ALIGNED-NEXT: buffer_load_ubyte v21, v2, s[0:3], 0 offen offset:234 -; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x7 -; ALIGNED-NEXT: buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:240 -; ALIGNED-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:241 -; ALIGNED-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:242 -; ALIGNED-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:243 -; ALIGNED-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:244 -; ALIGNED-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:245 -; ALIGNED-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:246 -; ALIGNED-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:247 +; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v66, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(42) +; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v64, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v65, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:700 ; 
4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v70, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v71, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v80, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v126, off, s[0:3], s32 offset:1188 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:87 +; ALIGNED-NEXT: s_waitcnt vmcnt(7) ; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(41) -; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(39) +; ALIGNED-NEXT: s_waitcnt vmcnt(6) ; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(38) +; ALIGNED-NEXT: s_waitcnt vmcnt(5) +; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(4) +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(3) ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(36) +; ALIGNED-NEXT: s_waitcnt vmcnt(2) ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(35) +; ALIGNED-NEXT: s_waitcnt vmcnt(1) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v81, 8, v3 ; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:84 -; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x4 -; ALIGNED-NEXT: buffer_load_ubyte v81, v2, s[0:3], 0 offen offset:212 -; ALIGNED-NEXT: buffer_load_ubyte v68, v2, s[0:3], 0 offen offset:213 -; ALIGNED-NEXT: buffer_load_ubyte v71, v2, s[0:3], 0 offen offset:214 -; ALIGNED-NEXT: buffer_load_ubyte v65, v2, s[0:3], 0 offen offset:215 -; ALIGNED-NEXT: buffer_load_ubyte v69, v2, s[0:3], 0 offen offset:211 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:81 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:80 -; ALIGNED-NEXT: s_waitcnt vmcnt(8) -; ALIGNED-NEXT: v_lshl_or_b32 v73, v13, 8, v16 -; ALIGNED-NEXT: s_waitcnt vmcnt(7) +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(2) ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill @@ -4327,132 +4251,259 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1072 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v7 -; ALIGNED-NEXT: s_clause 0x4 -; ALIGNED-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 
offen offset:252 -; ALIGNED-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:253 -; ALIGNED-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:254 -; ALIGNED-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:255 -; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:251 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v102, 8, v101 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v8, 8, v6 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1136 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v5, 8, v3 -; ALIGNED-NEXT: s_clause 0x3 ; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:158 -; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:248 -; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:249 -; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:250 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:157 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1148 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:156 -; ALIGNED-NEXT: s_waitcnt vmcnt(7) -; ALIGNED-NEXT: v_lshl_or_b32 v77, v9, 8, v10 -; ALIGNED-NEXT: s_waitcnt vmcnt(5) +; ALIGNED-NEXT: s_waitcnt vmcnt(2) ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1160 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1152 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1144 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x1 +; ALIGNED-NEXT: buffer_load_ubyte v127, v2, s[0:3], 0 offen offset:159 +; ALIGNED-NEXT: buffer_load_ubyte v124, v2, s[0:3], 0 offen offset:155 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) ; ALIGNED-NEXT: v_lshl_or_b32 v1, v127, 8, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v113, 8, v116 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1172 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x2 +; ALIGNED-NEXT: buffer_load_ubyte v125, v2, s[0:3], 0 offen offset:152 +; ALIGNED-NEXT: buffer_load_ubyte v123, v2, s[0:3], 0 offen offset:153 +; ALIGNED-NEXT: buffer_load_ubyte v111, v2, s[0:3], 0 offen offset:154 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: v_lshl_or_b32 v0, v123, 8, v125 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: v_lshl_or_b32 v1, v124, 8, v111 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1180 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x7 +; ALIGNED-NEXT: buffer_load_ubyte v110, v2, s[0:3], 0 offen offset:160 +; ALIGNED-NEXT: buffer_load_ubyte v105, v2, s[0:3], 0 offen offset:161 +; ALIGNED-NEXT: buffer_load_ubyte v94, v2, s[0:3], 0 offen offset:162 +; ALIGNED-NEXT: buffer_load_ubyte v93, v2, s[0:3], 0 offen offset:163 +; ALIGNED-NEXT: buffer_load_ubyte v108, v2, s[0:3], 0 offen offset:164 +; ALIGNED-NEXT: buffer_load_ubyte v95, v2, s[0:3], 0 offen offset:165 +; ALIGNED-NEXT: buffer_load_ubyte v104, v2, s[0:3], 0 offen offset:166 +; ALIGNED-NEXT: buffer_load_ubyte v92, v2, s[0:3], 0 offen offset:167 +; ALIGNED-NEXT: s_waitcnt vmcnt(6) +; ALIGNED-NEXT: v_lshl_or_b32 v0, v105, 8, v110 +; ALIGNED-NEXT: s_waitcnt vmcnt(4) +; ALIGNED-NEXT: v_lshl_or_b32 v1, v93, 8, v94 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: 
v_lshl_or_b32 v1, v92, 8, v104 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1192 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v95, 8, v108 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1196 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x4 +; ALIGNED-NEXT: buffer_load_ubyte v90, v2, s[0:3], 0 offen offset:172 +; ALIGNED-NEXT: buffer_load_ubyte v79, v2, s[0:3], 0 offen offset:173 +; ALIGNED-NEXT: buffer_load_ubyte v88, v2, s[0:3], 0 offen offset:174 +; ALIGNED-NEXT: buffer_load_ubyte v78, v2, s[0:3], 0 offen offset:175 +; ALIGNED-NEXT: buffer_load_ubyte v75, v2, s[0:3], 0 offen offset:171 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: v_lshl_or_b32 v0, v79, 8, v90 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: v_lshl_or_b32 v1, v78, 8, v88 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1200 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x2 +; ALIGNED-NEXT: buffer_load_ubyte v76, v2, s[0:3], 0 offen offset:168 +; ALIGNED-NEXT: buffer_load_ubyte v72, v2, s[0:3], 0 offen offset:169 +; ALIGNED-NEXT: buffer_load_ubyte v63, v2, s[0:3], 0 offen offset:170 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: v_lshl_or_b32 v0, v72, 8, v76 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: v_lshl_or_b32 v1, v75, 8, v63 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1204 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x7 +; ALIGNED-NEXT: buffer_load_ubyte v62, v2, s[0:3], 0 offen offset:176 +; ALIGNED-NEXT: buffer_load_ubyte v60, v2, s[0:3], 0 offen offset:177 +; ALIGNED-NEXT: buffer_load_ubyte v57, v2, s[0:3], 0 offen offset:178 +; ALIGNED-NEXT: buffer_load_ubyte v56, v2, s[0:3], 0 offen offset:179 +; ALIGNED-NEXT: buffer_load_ubyte v61, v2, s[0:3], 0 offen offset:180 +; ALIGNED-NEXT: buffer_load_ubyte v59, v2, s[0:3], 0 offen offset:181 +; ALIGNED-NEXT: buffer_load_ubyte v58, v2, s[0:3], 0 offen offset:182 +; ALIGNED-NEXT: buffer_load_ubyte v47, v2, s[0:3], 0 offen offset:183 +; ALIGNED-NEXT: s_waitcnt vmcnt(6) +; ALIGNED-NEXT: v_lshl_or_b32 v0, v60, 8, v62 +; ALIGNED-NEXT: s_waitcnt vmcnt(4) +; ALIGNED-NEXT: v_lshl_or_b32 v1, v56, 8, v57 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: v_lshl_or_b32 v1, v47, 8, v58 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1208 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v59, 8, v61 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1212 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x4 +; ALIGNED-NEXT: buffer_load_ubyte v45, v2, s[0:3], 0 offen offset:188 +; ALIGNED-NEXT: buffer_load_ubyte v43, v2, s[0:3], 0 offen offset:189 +; ALIGNED-NEXT: buffer_load_ubyte v44, v2, s[0:3], 0 offen offset:190 +; ALIGNED-NEXT: buffer_load_ubyte v42, v2, s[0:3], 0 offen offset:191 +; ALIGNED-NEXT: buffer_load_ubyte v40, v2, s[0:3], 0 offen offset:187 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: v_lshl_or_b32 v0, v43, 8, v45 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: v_lshl_or_b32 v1, v42, 8, v44 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1216 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x2 +; ALIGNED-NEXT: buffer_load_ubyte v41, v2, s[0:3], 0 offen offset:184 +; ALIGNED-NEXT: buffer_load_ubyte v119, v2, s[0:3], 0 
offen offset:185 +; ALIGNED-NEXT: buffer_load_ubyte v118, v2, s[0:3], 0 offen offset:186 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: v_lshl_or_b32 v0, v119, 8, v41 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: v_lshl_or_b32 v1, v40, 8, v118 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1220 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x3e +; ALIGNED-NEXT: buffer_load_ubyte v116, v2, s[0:3], 0 offen offset:192 +; ALIGNED-NEXT: buffer_load_ubyte v113, v2, s[0:3], 0 offen offset:193 +; ALIGNED-NEXT: buffer_load_ubyte v101, v2, s[0:3], 0 offen offset:194 +; ALIGNED-NEXT: buffer_load_ubyte v102, v2, s[0:3], 0 offen offset:195 +; ALIGNED-NEXT: buffer_load_ubyte v114, v2, s[0:3], 0 offen offset:196 +; ALIGNED-NEXT: buffer_load_ubyte v103, v2, s[0:3], 0 offen offset:197 +; ALIGNED-NEXT: buffer_load_ubyte v112, v2, s[0:3], 0 offen offset:198 +; ALIGNED-NEXT: buffer_load_ubyte v100, v2, s[0:3], 0 offen offset:199 +; ALIGNED-NEXT: buffer_load_ubyte v98, v2, s[0:3], 0 offen offset:204 +; ALIGNED-NEXT: buffer_load_ubyte v97, v2, s[0:3], 0 offen offset:205 +; ALIGNED-NEXT: buffer_load_ubyte v96, v2, s[0:3], 0 offen offset:206 +; ALIGNED-NEXT: buffer_load_ubyte v87, v2, s[0:3], 0 offen offset:207 +; ALIGNED-NEXT: buffer_load_ubyte v85, v2, s[0:3], 0 offen offset:203 +; ALIGNED-NEXT: buffer_load_ubyte v86, v2, s[0:3], 0 offen offset:200 +; ALIGNED-NEXT: buffer_load_ubyte v84, v2, s[0:3], 0 offen offset:201 +; ALIGNED-NEXT: buffer_load_ubyte v83, v2, s[0:3], 0 offen offset:202 +; ALIGNED-NEXT: buffer_load_ubyte v81, v2, s[0:3], 0 offen offset:212 +; ALIGNED-NEXT: buffer_load_ubyte v68, v2, s[0:3], 0 offen offset:213 +; ALIGNED-NEXT: buffer_load_ubyte v71, v2, s[0:3], 0 offen offset:214 +; ALIGNED-NEXT: buffer_load_ubyte v65, v2, s[0:3], 0 offen offset:215 +; ALIGNED-NEXT: buffer_load_ubyte v69, v2, s[0:3], 0 offen offset:211 +; ALIGNED-NEXT: buffer_load_ubyte v80, v2, s[0:3], 0 offen offset:216 +; ALIGNED-NEXT: buffer_load_ubyte v67, v2, s[0:3], 0 offen offset:217 +; ALIGNED-NEXT: buffer_load_ubyte v53, v2, s[0:3], 0 offen offset:218 +; ALIGNED-NEXT: buffer_load_ubyte v54, v2, s[0:3], 0 offen offset:219 +; ALIGNED-NEXT: buffer_load_ubyte v70, v2, s[0:3], 0 offen offset:220 +; ALIGNED-NEXT: buffer_load_ubyte v55, v2, s[0:3], 0 offen offset:221 +; ALIGNED-NEXT: buffer_load_ubyte v64, v2, s[0:3], 0 offen offset:222 +; ALIGNED-NEXT: buffer_load_ubyte v52, v2, s[0:3], 0 offen offset:223 +; ALIGNED-NEXT: buffer_load_ubyte v50, v2, s[0:3], 0 offen offset:208 +; ALIGNED-NEXT: buffer_load_ubyte v39, v2, s[0:3], 0 offen offset:209 +; ALIGNED-NEXT: buffer_load_ubyte v48, v2, s[0:3], 0 offen offset:210 +; ALIGNED-NEXT: buffer_load_ubyte v38, v2, s[0:3], 0 offen offset:224 +; ALIGNED-NEXT: buffer_load_ubyte v36, v2, s[0:3], 0 offen offset:225 +; ALIGNED-NEXT: buffer_load_ubyte v33, v2, s[0:3], 0 offen offset:226 +; ALIGNED-NEXT: buffer_load_ubyte v32, v2, s[0:3], 0 offen offset:227 +; ALIGNED-NEXT: buffer_load_ubyte v37, v2, s[0:3], 0 offen offset:228 +; ALIGNED-NEXT: buffer_load_ubyte v35, v2, s[0:3], 0 offen offset:229 +; ALIGNED-NEXT: buffer_load_ubyte v34, v2, s[0:3], 0 offen offset:230 +; ALIGNED-NEXT: buffer_load_ubyte v31, v2, s[0:3], 0 offen offset:231 +; ALIGNED-NEXT: buffer_load_ubyte v30, v2, s[0:3], 0 offen offset:236 +; ALIGNED-NEXT: buffer_load_ubyte v29, v2, s[0:3], 0 offen offset:237 +; ALIGNED-NEXT: buffer_load_ubyte v28, v2, s[0:3], 0 offen offset:238 +; ALIGNED-NEXT: buffer_load_ubyte v26, v2, s[0:3], 0 offen 
offset:239 +; ALIGNED-NEXT: buffer_load_ubyte v25, v2, s[0:3], 0 offen offset:235 +; ALIGNED-NEXT: buffer_load_ubyte v24, v2, s[0:3], 0 offen offset:232 +; ALIGNED-NEXT: buffer_load_ubyte v23, v2, s[0:3], 0 offen offset:233 +; ALIGNED-NEXT: buffer_load_ubyte v21, v2, s[0:3], 0 offen offset:234 +; ALIGNED-NEXT: buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:240 +; ALIGNED-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:241 +; ALIGNED-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:242 +; ALIGNED-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:243 +; ALIGNED-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:244 +; ALIGNED-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:245 +; ALIGNED-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:246 +; ALIGNED-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:247 +; ALIGNED-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:252 +; ALIGNED-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:253 +; ALIGNED-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:254 +; ALIGNED-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:255 +; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:251 +; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:248 +; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:249 +; ALIGNED-NEXT: s_clause 0x6 +; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:250 +; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen +; ALIGNED-NEXT: buffer_load_ubyte v121, v2, s[0:3], 0 offen offset:2 +; ALIGNED-NEXT: buffer_load_ubyte v107, v2, s[0:3], 0 offen offset:4 +; ALIGNED-NEXT: buffer_load_ubyte v109, v2, s[0:3], 0 offen offset:5 +; ALIGNED-NEXT: buffer_load_ubyte v120, v2, s[0:3], 0 offen offset:6 +; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:7 +; ALIGNED-NEXT: s_waitcnt vmcnt(62) +; ALIGNED-NEXT: v_lshl_or_b32 v3, v113, 8, v116 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v102, 8, v101 ; ALIGNED-NEXT: v_lshl_or_b32 v106, v4, 16, v3 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v103, 8, v114 ; ALIGNED-NEXT: v_lshl_or_b32 v4, v100, 8, v112 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1172 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v123, 8, v125 ; ALIGNED-NEXT: v_lshl_or_b32 v91, v4, 16, v3 +; ALIGNED-NEXT: s_waitcnt vmcnt(60) ; ALIGNED-NEXT: v_lshl_or_b32 v3, v97, 8, v98 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v93, 8, v94 +; ALIGNED-NEXT: s_waitcnt vmcnt(58) ; ALIGNED-NEXT: v_lshl_or_b32 v4, v87, 8, v96 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1180 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v105, 8, v110 +; ALIGNED-NEXT: s_waitcnt vmcnt(14) +; ALIGNED-NEXT: v_lshl_or_b32 v73, v13, 8, v16 +; ALIGNED-NEXT: s_waitcnt vmcnt(10) +; ALIGNED-NEXT: v_lshl_or_b32 v77, v9, 8, v10 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: buffer_store_dword v107, off, s[0:3], s32 offset:1088 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v89, v4, 16, v3 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v84, 8, v86 ; ALIGNED-NEXT: v_lshl_or_b32 v4, v85, 8, v83 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v92, 8, v104 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1112 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v109, off, s[0:3], s32 offset:1096 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v120, off, s[0:3], s32 offset:1132 ; 
4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v74, v4, 16, v3 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v68, 8, v81 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1192 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v95, 8, v108 ; ALIGNED-NEXT: v_lshl_or_b32 v4, v65, 8, v71 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v78, 8, v88 ; ALIGNED-NEXT: v_lshl_or_b32 v46, v4, 16, v3 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v67, 8, v80 ; ALIGNED-NEXT: v_lshl_or_b32 v4, v54, 8, v53 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1196 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v79, 8, v90 ; ALIGNED-NEXT: v_lshl_or_b32 v117, v4, 16, v3 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v55, 8, v70 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v75, 8, v63 ; ALIGNED-NEXT: v_lshl_or_b32 v4, v52, 8, v64 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1200 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v72, 8, v76 ; ALIGNED-NEXT: v_lshl_or_b32 v115, v4, 16, v3 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v39, 8, v50 ; ALIGNED-NEXT: v_lshl_or_b32 v4, v69, 8, v48 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v56, 8, v57 ; ALIGNED-NEXT: v_lshl_or_b32 v99, v4, 16, v3 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v36, 8, v38 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1204 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v60, 8, v62 ; ALIGNED-NEXT: v_lshl_or_b32 v4, v32, 8, v33 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: v_lshl_or_b32 v82, v4, 16, v3 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v35, 8, v37 ; ALIGNED-NEXT: v_lshl_or_b32 v4, v31, 8, v34 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v47, 8, v58 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1208 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v59, 8, v61 ; ALIGNED-NEXT: v_lshl_or_b32 v66, v4, 16, v3 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v29, 8, v30 ; ALIGNED-NEXT: v_lshl_or_b32 v4, v26, 8, v28 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v42, 8, v44 ; ALIGNED-NEXT: v_lshl_or_b32 v51, v4, 16, v3 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v23, 8, v24 ; ALIGNED-NEXT: v_lshl_or_b32 v4, v25, 8, v21 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1212 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v43, 8, v45 ; ALIGNED-NEXT: v_lshl_or_b32 v49, v4, 16, v3 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v18, 8, v20 ; ALIGNED-NEXT: v_lshl_or_b32 v4, v14, 8, v15 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v40, 8, v118 ; ALIGNED-NEXT: v_lshl_or_b32 v27, v4, 16, v3 ; ALIGNED-NEXT: v_lshl_or_b32 v4, v17, 8, v19 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1216 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v119, 8, v41 ; ALIGNED-NEXT: v_lshl_or_b32 v22, v73, 16, v4 ; ALIGNED-NEXT: v_lshl_or_b32 v73, v11, 8, v12 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:7 -; ALIGNED-NEXT: buffer_store_dword v107, off, s[0:3], s32 offset:1088 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v109, off, s[0:3], s32 offset:1096 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v4, v77, 16, v73 ; ALIGNED-NEXT: v_lshl_or_b32 v73, v6, 8, v8 ; ALIGNED-NEXT: v_lshl_or_b32 v77, v7, 8, v5 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1220 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v0, v2, 
s[0:3], 0 offen -; ALIGNED-NEXT: buffer_store_dword v120, off, s[0:3], s32 offset:1132 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v3, v77, 16, v73 ; ALIGNED-NEXT: s_clause 0x1 ; ALIGNED-NEXT: buffer_load_ubyte v73, v2, s[0:3], 0 offen offset:1 ; ALIGNED-NEXT: buffer_load_ubyte v77, v2, s[0:3], 0 offen offset:3 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1068 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v121, off, s[0:3], s32 offset:1092 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1112 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1068 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) ; ALIGNED-NEXT: buffer_store_dword v73, off, s[0:3], s32 offset:1076 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) @@ -4462,34 +4513,37 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: v_lshl_or_b32 v0, v77, 16, v73 ; ALIGNED-NEXT: v_lshl_or_b32 v73, v109, 8, v107 ; ALIGNED-NEXT: v_lshl_or_b32 v77, v1, 8, v120 -; ALIGNED-NEXT: s_clause 0x2 ; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:12 -; ALIGNED-NEXT: buffer_load_ubyte v107, v2, s[0:3], 0 offen offset:15 -; ALIGNED-NEXT: buffer_load_ubyte v121, v2, s[0:3], 0 offen offset:11 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1128 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x2 -; ALIGNED-NEXT: buffer_load_ubyte v122, v2, s[0:3], 0 offen offset:8 -; ALIGNED-NEXT: buffer_load_ubyte v120, v2, s[0:3], 0 offen offset:9 -; ALIGNED-NEXT: buffer_load_ubyte v109, v2, s[0:3], 0 offen offset:10 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v77, 16, v73 ; ALIGNED-NEXT: buffer_load_ubyte v73, v2, s[0:3], 0 offen offset:13 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1140 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:14 -; ALIGNED-NEXT: buffer_store_dword v126, off, s[0:3], s32 offset:1188 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(7) +; ALIGNED-NEXT: s_waitcnt vmcnt(2) ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1156 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) ; ALIGNED-NEXT: buffer_store_dword v73, off, s[0:3], s32 offset:1168 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v73, v73, 8, v1 -; ALIGNED-NEXT: v_mov_b32_e32 v1, v107 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v77, v107, 8, v0 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1164 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x1 +; ALIGNED-NEXT: buffer_load_ubyte v107, v2, s[0:3], 0 offen offset:15 +; ALIGNED-NEXT: buffer_load_ubyte v121, v2, s[0:3], 0 offen offset:11 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: v_lshl_or_b32 v77, v107, 8, v0 +; ALIGNED-NEXT: v_mov_b32_e32 v1, v107 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v77, 16, v73 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1176 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x2 +; ALIGNED-NEXT: buffer_load_ubyte v122, v2, s[0:3], 0 offen offset:8 +; ALIGNED-NEXT: buffer_load_ubyte v120, v2, s[0:3], 0 offen offset:9 +; ALIGNED-NEXT: buffer_load_ubyte v109, v2, s[0:3], 0 offen offset:10 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) ; ALIGNED-NEXT: v_lshl_or_b32 v73, v120, 8, v122 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: v_lshl_or_b32 v77, v121, 8, v109 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 
offset:1176 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v77, 16, v73 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1184 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x2 ; ALIGNED-NEXT: buffer_load_ubyte v77, v2, s[0:3], 0 offen offset:18 ; ALIGNED-NEXT: buffer_load_ubyte v107, v2, s[0:3], 0 offen offset:16 @@ -4499,7 +4553,6 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:228 ; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:224 ; ALIGNED-NEXT: v_add_nc_u32_e32 v2, 0x100, v2 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1184 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(2) ; ALIGNED-NEXT: v_lshl_or_b32 v0, v126, 8, v77 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) @@ -4572,7 +4625,6 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: buffer_store_dword v89, off, s[0:3], s32 offset:220 ; ALIGNED-NEXT: buffer_store_dword v91, off, s[0:3], s32 offset:212 ; ALIGNED-NEXT: buffer_store_dword v106, off, s[0:3], s32 offset:208 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1220 ; 4-byte Folded Reload ; ALIGNED-NEXT: flat_store_byte v[3:4], v83 offset:202 ; ALIGNED-NEXT: flat_store_byte v[3:4], v85 offset:203 ; ALIGNED-NEXT: flat_store_byte v[3:4], v84 offset:201 @@ -4589,6 +4641,21 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: flat_store_byte v[3:4], v112 offset:198 ; ALIGNED-NEXT: flat_store_byte v[3:4], v114 offset:196 ; ALIGNED-NEXT: flat_store_byte v[3:4], v116 offset:192 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1220 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_addc_u32 s5, s5, 0 +; ALIGNED-NEXT: v_cmp_gt_u64_e64 s6, 0x800, s[4:5] +; ALIGNED-NEXT: s_and_b32 vcc_lo, exec_lo, s6 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1216 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1212 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1208 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; ALIGNED-NEXT: flat_store_byte v[3:4], v118 offset:186 ; ALIGNED-NEXT: flat_store_byte v[3:4], v40 offset:187 ; ALIGNED-NEXT: flat_store_byte v[3:4], v119 offset:185 @@ -4605,6 +4672,18 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: flat_store_byte v[3:4], v58 offset:182 ; ALIGNED-NEXT: flat_store_byte v[3:4], v61 offset:180 ; ALIGNED-NEXT: flat_store_byte v[3:4], v62 offset:176 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1204 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1200 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1196 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; 
ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1192 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; ALIGNED-NEXT: flat_store_byte v[3:4], v63 offset:170 ; ALIGNED-NEXT: flat_store_byte v[3:4], v75 offset:171 ; ALIGNED-NEXT: flat_store_byte v[3:4], v72 offset:169 @@ -4621,36 +4700,6 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: flat_store_byte v[3:4], v104 offset:166 ; ALIGNED-NEXT: flat_store_byte v[3:4], v108 offset:164 ; ALIGNED-NEXT: flat_store_byte v[3:4], v110 offset:160 -; ALIGNED-NEXT: flat_store_byte v[3:4], v111 offset:154 -; ALIGNED-NEXT: flat_store_byte v[3:4], v124 offset:155 -; ALIGNED-NEXT: flat_store_byte v[3:4], v123 offset:153 -; ALIGNED-NEXT: flat_store_byte v[3:4], v127 offset:159 -; ALIGNED-NEXT: s_addc_u32 s5, s5, 0 -; ALIGNED-NEXT: v_cmp_gt_u64_e64 s6, 0x800, s[4:5] -; ALIGNED-NEXT: s_and_b32 vcc_lo, exec_lo, s6 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1216 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1212 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1208 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1204 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1200 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1196 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1192 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1180 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 @@ -4663,6 +4712,10 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1136 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 +; ALIGNED-NEXT: flat_store_byte v[3:4], v111 offset:154 +; ALIGNED-NEXT: flat_store_byte v[3:4], v124 offset:155 +; ALIGNED-NEXT: flat_store_byte v[3:4], v123 offset:153 +; ALIGNED-NEXT: flat_store_byte v[3:4], v127 offset:159 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1152 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:157 @@ -5128,8 +5181,6 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: 
buffer_store_dword v0, off, s[0:3], s32 offset:388 ; ALIGNED-NEXT: buffer_store_dword v126, off, s[0:3], s32 offset:384 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; ALIGNED-NEXT: flat_store_byte v[3:4], v109 offset:10 -; ALIGNED-NEXT: flat_store_byte v[3:4], v121 offset:11 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:26 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload @@ -5183,6 +5234,8 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1128 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:400 +; ALIGNED-NEXT: flat_store_byte v[3:4], v109 offset:10 +; ALIGNED-NEXT: flat_store_byte v[3:4], v121 offset:11 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1168 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:13 @@ -5221,7 +5274,7 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: flat_store_byte v[3:4], v0 ; ALIGNED-NEXT: s_cbranch_vccnz .LBB4_1 ; ALIGNED-NEXT: ; %bb.2: ; %memcpy-split -; ALIGNED-NEXT: s_clause 0x2f ; 192-byte Folded Reload +; ALIGNED-NEXT: s_clause 0x2f ; ALIGNED-NEXT: buffer_load_dword v127, off, s[0:3], s32 ; ALIGNED-NEXT: buffer_load_dword v126, off, s[0:3], s32 offset:4 ; ALIGNED-NEXT: buffer_load_dword v125, off, s[0:3], s32 offset:8 @@ -6744,7 +6797,7 @@ define void @memmove_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0 ; ALIGNED-NEXT: s_cbranch_scc0 .LBB5_5 ; ALIGNED-NEXT: .LBB5_6: ; %Flow6 ; ALIGNED-NEXT: s_or_b32 exec_lo, exec_lo, s8 -; ALIGNED-NEXT: s_clause 0x7 ; 32-byte Folded Reload +; ALIGNED-NEXT: s_clause 0x7 ; ALIGNED-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; ALIGNED-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; ALIGNED-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 @@ -8243,7 +8296,7 @@ define void @memmove_p1_p1_sz2048(ptr addrspace(1) align 1 %dst, ptr addrspace(1 ; ALIGNED-NEXT: s_cbranch_scc0 .LBB6_5 ; ALIGNED-NEXT: .LBB6_6: ; %Flow8 ; ALIGNED-NEXT: s_or_b32 exec_lo, exec_lo, s8 -; ALIGNED-NEXT: s_clause 0x7 ; 32-byte Folded Reload +; ALIGNED-NEXT: s_clause 0x7 ; ALIGNED-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; ALIGNED-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; ALIGNED-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 @@ -8795,6 +8848,14 @@ define void @memmove_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4 ; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:60 ; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:52 ; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:48 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v30 offset:106 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v31 offset:110 +; ALIGNED-NEXT: flat_store_byte v[96:97], v31 offset:108 +; ALIGNED-NEXT: flat_store_byte v[96:97], v30 offset:104 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v29 offset:102 +; ALIGNED-NEXT: flat_store_byte v[96:97], v29 offset:100 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v28 offset:98 +; ALIGNED-NEXT: flat_store_byte v[96:97], v28 offset:96 ; ALIGNED-NEXT: flat_store_byte v[96:97], v70 offset:111 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v6 @@ -8810,14 +8871,6 @@ define void 
@memmove_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v36, 24, v23 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v19 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v15 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v30 offset:106 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v31 offset:110 -; ALIGNED-NEXT: flat_store_byte v[96:97], v31 offset:108 -; ALIGNED-NEXT: flat_store_byte v[96:97], v30 offset:104 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v29 offset:102 -; ALIGNED-NEXT: flat_store_byte v[96:97], v29 offset:100 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v28 offset:98 -; ALIGNED-NEXT: flat_store_byte v[96:97], v28 offset:96 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v28, 24, v11 ; ALIGNED-NEXT: flat_store_byte v[96:97], v98 offset:103 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v7 @@ -9244,10 +9297,6 @@ define void @memmove_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4 ; ALIGNED-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:476 ; ALIGNED-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:468 ; ALIGNED-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:464 -; ALIGNED-NEXT: flat_store_byte v[96:97], v82 offset:143 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v18 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 8, v26 ; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v38 offset:138 ; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v39 offset:142 ; ALIGNED-NEXT: flat_store_byte v[96:97], v39 offset:140 @@ -9256,6 +9305,10 @@ define void @memmove_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4 ; ALIGNED-NEXT: flat_store_byte v[96:97], v37 offset:132 ; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v36 offset:130 ; ALIGNED-NEXT: flat_store_byte v[96:97], v36 offset:128 +; ALIGNED-NEXT: flat_store_byte v[96:97], v82 offset:143 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v18 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 8, v26 ; ALIGNED-NEXT: flat_store_byte v[96:97], v66 offset:139 ; ALIGNED-NEXT: flat_store_byte v[96:97], v67 offset:137 ; ALIGNED-NEXT: flat_store_byte v[96:97], v83 offset:141 @@ -9291,6 +9344,14 @@ define void @memmove_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4 ; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:316 ; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:308 ; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:304 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v30 offset:106 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v31 offset:110 +; ALIGNED-NEXT: flat_store_byte v[96:97], v31 offset:108 +; ALIGNED-NEXT: flat_store_byte v[96:97], v30 offset:104 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v29 offset:102 +; ALIGNED-NEXT: flat_store_byte v[96:97], v29 offset:100 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v28 offset:98 +; ALIGNED-NEXT: flat_store_byte v[96:97], v28 offset:96 ; ALIGNED-NEXT: flat_store_byte v[96:97], v69 offset:111 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 24, v6 @@ -9305,14 +9366,6 @@ define void @memmove_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v25 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 24, v19 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v15 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v30 offset:106 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v31 offset:110 -; ALIGNED-NEXT: flat_store_byte v[96:97], v31 offset:108 -; 
ALIGNED-NEXT: flat_store_byte v[96:97], v30 offset:104 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v29 offset:102 -; ALIGNED-NEXT: flat_store_byte v[96:97], v29 offset:100 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v28 offset:98 -; ALIGNED-NEXT: flat_store_byte v[96:97], v28 offset:96 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v28, 24, v11 ; ALIGNED-NEXT: flat_store_byte v[96:97], v71 offset:103 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 24, v7 @@ -12145,7 +12198,7 @@ define void @memmove_p5_p5_sz2048(ptr addrspace(5) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: s_cbranch_scc0 .LBB8_5 ; ALIGNED-NEXT: .LBB8_6: ; %Flow19 ; ALIGNED-NEXT: s_or_b32 exec_lo, exec_lo, s6 -; ALIGNED-NEXT: s_clause 0x2f ; 192-byte Folded Reload +; ALIGNED-NEXT: s_clause 0x2f ; ALIGNED-NEXT: buffer_load_dword v127, off, s[0:3], s32 ; ALIGNED-NEXT: buffer_load_dword v126, off, s[0:3], s32 offset:4 ; ALIGNED-NEXT: buffer_load_dword v125, off, s[0:3], s32 offset:8 @@ -12592,11 +12645,6 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-LABEL: memmove_p0_p5_sz2048: ; ALIGNED: ; %bb.0: ; %entry ; ALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; ALIGNED-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_mov_b64 s[4:5], 0 -; ALIGNED-NEXT: s_mov_b32 s6, exec_lo ; ALIGNED-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill @@ -12645,29 +12693,34 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_store_dword v125, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v126, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v127, off, s[0:3], s32 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_mov_b64 s[4:5], 0 +; ALIGNED-NEXT: s_mov_b32 s6, exec_lo ; ALIGNED-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc_lo ; ALIGNED-NEXT: v_cmpx_ge_u32_e64 v2, v0 ; ALIGNED-NEXT: s_xor_b32 s6, exec_lo, s6 ; ALIGNED-NEXT: s_cbranch_execz .LBB9_2 ; ALIGNED-NEXT: .LBB9_1: ; %memmove_fwd_loop ; ALIGNED-NEXT: ; =>This Inner Loop Header: Depth=1 -; ALIGNED-NEXT: s_clause 0x3e +; ALIGNED-NEXT: s_clause 0x39 ; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:20 -; ALIGNED-NEXT: buffer_load_ubyte v127, v2, s[0:3], 0 offen offset:19 ; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:21 ; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:22 ; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:23 -; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:28 ; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:24 ; ALIGNED-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:25 ; ALIGNED-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:26 -; ALIGNED-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:27 +; ALIGNED-NEXT: buffer_load_ubyte v127, v2, s[0:3], 0 offen offset:19 +; ALIGNED-NEXT: 
buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:28 ; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:29 ; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:30 ; ALIGNED-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:31 ; ALIGNED-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:32 ; ALIGNED-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:33 ; ALIGNED-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:34 +; ALIGNED-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:27 ; ALIGNED-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:35 ; ALIGNED-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:36 ; ALIGNED-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:37 @@ -12689,17 +12742,17 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_load_ubyte v36, v2, s[0:3], 0 offen offset:53 ; ALIGNED-NEXT: buffer_load_ubyte v33, v2, s[0:3], 0 offen offset:54 ; ALIGNED-NEXT: buffer_load_ubyte v35, v2, s[0:3], 0 offen offset:55 -; ALIGNED-NEXT: buffer_load_ubyte v37, v2, s[0:3], 0 offen offset:60 ; ALIGNED-NEXT: buffer_load_ubyte v39, v2, s[0:3], 0 offen offset:56 ; ALIGNED-NEXT: buffer_load_ubyte v50, v2, s[0:3], 0 offen offset:57 ; ALIGNED-NEXT: buffer_load_ubyte v52, v2, s[0:3], 0 offen offset:58 -; ALIGNED-NEXT: buffer_load_ubyte v51, v2, s[0:3], 0 offen offset:59 +; ALIGNED-NEXT: buffer_load_ubyte v37, v2, s[0:3], 0 offen offset:60 ; ALIGNED-NEXT: buffer_load_ubyte v48, v2, s[0:3], 0 offen offset:61 ; ALIGNED-NEXT: buffer_load_ubyte v38, v2, s[0:3], 0 offen offset:62 ; ALIGNED-NEXT: buffer_load_ubyte v49, v2, s[0:3], 0 offen offset:63 ; ALIGNED-NEXT: buffer_load_ubyte v53, v2, s[0:3], 0 offen offset:64 ; ALIGNED-NEXT: buffer_load_ubyte v54, v2, s[0:3], 0 offen offset:65 ; ALIGNED-NEXT: buffer_load_ubyte v65, v2, s[0:3], 0 offen offset:66 +; ALIGNED-NEXT: buffer_load_ubyte v51, v2, s[0:3], 0 offen offset:59 ; ALIGNED-NEXT: buffer_load_ubyte v55, v2, s[0:3], 0 offen offset:67 ; ALIGNED-NEXT: buffer_load_ubyte v64, v2, s[0:3], 0 offen offset:68 ; ALIGNED-NEXT: buffer_load_ubyte v66, v2, s[0:3], 0 offen offset:69 @@ -12707,94 +12760,58 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_load_ubyte v68, v2, s[0:3], 0 offen offset:71 ; ALIGNED-NEXT: buffer_load_ubyte v69, v2, s[0:3], 0 offen offset:76 ; ALIGNED-NEXT: buffer_load_ubyte v70, v2, s[0:3], 0 offen offset:77 +; ALIGNED-NEXT: buffer_load_ubyte v81, v2, s[0:3], 0 offen offset:75 ; ALIGNED-NEXT: buffer_load_ubyte v71, v2, s[0:3], 0 offen offset:78 ; ALIGNED-NEXT: buffer_load_ubyte v80, v2, s[0:3], 0 offen offset:79 -; ALIGNED-NEXT: buffer_load_ubyte v81, v2, s[0:3], 0 offen offset:75 -; ALIGNED-NEXT: buffer_load_ubyte v126, v2, s[0:3], 0 offen offset:159 -; ALIGNED-NEXT: buffer_load_ubyte v124, v2, s[0:3], 0 offen offset:155 -; ALIGNED-NEXT: buffer_load_ubyte v123, v2, s[0:3], 0 offen offset:152 -; ALIGNED-NEXT: buffer_load_ubyte v122, v2, s[0:3], 0 offen offset:153 -; ALIGNED-NEXT: buffer_load_ubyte v121, v2, s[0:3], 0 offen offset:154 -; ALIGNED-NEXT: s_clause 0x30 -; ALIGNED-NEXT: buffer_load_ubyte v120, v2, s[0:3], 0 offen offset:160 -; ALIGNED-NEXT: buffer_load_ubyte v108, v2, s[0:3], 0 offen offset:161 -; ALIGNED-NEXT: buffer_load_ubyte v104, v2, s[0:3], 0 offen offset:162 -; ALIGNED-NEXT: buffer_load_ubyte v105, v2, s[0:3], 0 offen offset:163 -; ALIGNED-NEXT: buffer_load_ubyte v111, v2, s[0:3], 0 offen offset:164 -; ALIGNED-NEXT: buffer_load_ubyte v106, v2, 
s[0:3], 0 offen offset:165 -; ALIGNED-NEXT: buffer_load_ubyte v107, v2, s[0:3], 0 offen offset:166 -; ALIGNED-NEXT: buffer_load_ubyte v94, v2, s[0:3], 0 offen offset:167 -; ALIGNED-NEXT: buffer_load_ubyte v40, v2, s[0:3], 0 offen offset:192 -; ALIGNED-NEXT: buffer_load_ubyte v117, v2, s[0:3], 0 offen offset:193 -; ALIGNED-NEXT: buffer_load_ubyte v113, v2, s[0:3], 0 offen offset:194 -; ALIGNED-NEXT: buffer_load_ubyte v114, v2, s[0:3], 0 offen offset:195 -; ALIGNED-NEXT: buffer_load_ubyte v118, v2, s[0:3], 0 offen offset:196 -; ALIGNED-NEXT: buffer_load_ubyte v115, v2, s[0:3], 0 offen offset:197 -; ALIGNED-NEXT: buffer_load_ubyte v116, v2, s[0:3], 0 offen offset:198 -; ALIGNED-NEXT: buffer_load_ubyte v112, v2, s[0:3], 0 offen offset:199 -; ALIGNED-NEXT: buffer_load_ubyte v102, v2, s[0:3], 0 offen offset:204 -; ALIGNED-NEXT: buffer_load_ubyte v101, v2, s[0:3], 0 offen offset:205 -; ALIGNED-NEXT: buffer_load_ubyte v100, v2, s[0:3], 0 offen offset:206 -; ALIGNED-NEXT: buffer_load_ubyte v99, v2, s[0:3], 0 offen offset:207 -; ALIGNED-NEXT: buffer_load_ubyte v97, v2, s[0:3], 0 offen offset:203 -; ALIGNED-NEXT: buffer_load_ubyte v98, v2, s[0:3], 0 offen offset:200 -; ALIGNED-NEXT: buffer_load_ubyte v96, v2, s[0:3], 0 offen offset:201 -; ALIGNED-NEXT: buffer_load_ubyte v87, v2, s[0:3], 0 offen offset:202 -; ALIGNED-NEXT: buffer_load_ubyte v92, v2, s[0:3], 0 offen offset:172 -; ALIGNED-NEXT: buffer_load_ubyte v89, v2, s[0:3], 0 offen offset:173 -; ALIGNED-NEXT: buffer_load_ubyte v90, v2, s[0:3], 0 offen offset:174 -; ALIGNED-NEXT: buffer_load_ubyte v88, v2, s[0:3], 0 offen offset:175 -; ALIGNED-NEXT: buffer_load_ubyte v78, v2, s[0:3], 0 offen offset:171 -; ALIGNED-NEXT: buffer_load_ubyte v79, v2, s[0:3], 0 offen offset:168 -; ALIGNED-NEXT: buffer_load_ubyte v76, v2, s[0:3], 0 offen offset:169 -; ALIGNED-NEXT: buffer_load_ubyte v75, v2, s[0:3], 0 offen offset:170 -; ALIGNED-NEXT: buffer_load_ubyte v74, v2, s[0:3], 0 offen offset:176 -; ALIGNED-NEXT: buffer_load_ubyte v72, v2, s[0:3], 0 offen offset:177 -; ALIGNED-NEXT: buffer_load_ubyte v61, v2, s[0:3], 0 offen offset:178 -; ALIGNED-NEXT: buffer_load_ubyte v60, v2, s[0:3], 0 offen offset:179 -; ALIGNED-NEXT: buffer_load_ubyte v73, v2, s[0:3], 0 offen offset:180 -; ALIGNED-NEXT: buffer_load_ubyte v63, v2, s[0:3], 0 offen offset:181 -; ALIGNED-NEXT: buffer_load_ubyte v62, v2, s[0:3], 0 offen offset:182 -; ALIGNED-NEXT: buffer_load_ubyte v59, v2, s[0:3], 0 offen offset:183 -; ALIGNED-NEXT: buffer_load_ubyte v57, v2, s[0:3], 0 offen offset:188 -; ALIGNED-NEXT: buffer_load_ubyte v47, v2, s[0:3], 0 offen offset:189 -; ALIGNED-NEXT: buffer_load_ubyte v56, v2, s[0:3], 0 offen offset:190 -; ALIGNED-NEXT: buffer_load_ubyte v46, v2, s[0:3], 0 offen offset:191 -; ALIGNED-NEXT: buffer_load_ubyte v44, v2, s[0:3], 0 offen offset:187 -; ALIGNED-NEXT: buffer_load_ubyte v45, v2, s[0:3], 0 offen offset:184 -; ALIGNED-NEXT: buffer_load_ubyte v43, v2, s[0:3], 0 offen offset:185 -; ALIGNED-NEXT: buffer_load_ubyte v42, v2, s[0:3], 0 offen offset:186 -; ALIGNED-NEXT: buffer_load_ubyte v125, v2, s[0:3], 0 offen offset:4 -; ALIGNED-NEXT: s_waitcnt vmcnt(62) +; ALIGNED-NEXT: s_waitcnt vmcnt(57) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(56) ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(55) ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(54) ; 
ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(53) ; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(52) ; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(51) ; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 +; ALIGNED-NEXT: s_waitcnt vmcnt(49) +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(48) ; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(47) ; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(46) ; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(45) ; ALIGNED-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v3 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v7, 8, v5 -; ALIGNED-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(42) +; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v4, v9, 8, v8 +; ALIGNED-NEXT: s_waitcnt vmcnt(40) +; ALIGNED-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v5, v10, 8, v6 ; ALIGNED-NEXT: v_lshl_or_b32 v6, v11, 8, v12 ; ALIGNED-NEXT: v_lshl_or_b32 v7, v15, 8, v14 ; ALIGNED-NEXT: v_lshl_or_b32 v8, v19, 8, v17 +; ALIGNED-NEXT: s_waitcnt vmcnt(39) ; ALIGNED-NEXT: v_lshl_or_b32 v9, v16, 8, v13 +; ALIGNED-NEXT: s_waitcnt vmcnt(37) ; ALIGNED-NEXT: v_lshl_or_b32 v10, v20, 8, v18 +; ALIGNED-NEXT: s_waitcnt vmcnt(35) ; ALIGNED-NEXT: v_lshl_or_b32 v11, v23, 8, v22 +; ALIGNED-NEXT: s_waitcnt vmcnt(33) ; ALIGNED-NEXT: v_lshl_or_b32 v12, v27, 8, v25 +; ALIGNED-NEXT: s_waitcnt vmcnt(31) ; ALIGNED-NEXT: v_lshl_or_b32 v13, v24, 8, v21 -; ALIGNED-NEXT: v_lshl_or_b32 v14, v28, 8, v26 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: s_waitcnt vmcnt(29) +; ALIGNED-NEXT: v_lshl_or_b32 v14, v28, 8, v26 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 16, v3 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v6, 16, v5 ; ALIGNED-NEXT: v_lshl_or_b32 v4, v8, 16, v7 @@ -12802,81 +12819,82 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: v_lshl_or_b32 v6, v12, 16, v11 ; ALIGNED-NEXT: v_lshl_or_b32 v7, v14, 16, v13 ; ALIGNED-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(27) ; ALIGNED-NEXT: v_lshl_or_b32 v15, v30, 8, v29 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(25) ; ALIGNED-NEXT: v_lshl_or_b32 v0, v32, 8, v34 ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(23) ; ALIGNED-NEXT: v_lshl_or_b32 v1, v36, 8, v31 ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:840 ; 
4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(21) ; ALIGNED-NEXT: v_lshl_or_b32 v3, v35, 8, v33 ; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(16) ; ALIGNED-NEXT: v_lshl_or_b32 v4, v48, 8, v37 ; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:868 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(14) ; ALIGNED-NEXT: v_lshl_or_b32 v5, v49, 8, v38 ; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:876 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v6, v50, 8, v39 ; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:880 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(10) ; ALIGNED-NEXT: v_lshl_or_b32 v7, v51, 8, v52 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v0, 16, v15 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 16, v1 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v5, 16, v4 -; ALIGNED-NEXT: s_clause 0x1 ; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:85 -; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:87 ; ALIGNED-NEXT: v_lshl_or_b32 v4, v7, 16, v6 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:916 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v54, 8, v53 ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:924 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(10) ; ALIGNED-NEXT: v_lshl_or_b32 v1, v55, 8, v65 ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(62) +; ALIGNED-NEXT: s_waitcnt vmcnt(8) ; ALIGNED-NEXT: v_lshl_or_b32 v3, v66, 8, v64 ; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:948 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(61) +; ALIGNED-NEXT: s_waitcnt vmcnt(6) ; ALIGNED-NEXT: v_lshl_or_b32 v4, v68, 8, v67 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:82 ; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:86 -; ALIGNED-NEXT: s_waitcnt vmcnt(62) -; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:964 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:82 +; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 16, v3 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:972 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(61) +; ALIGNED-NEXT: s_waitcnt vmcnt(6) ; ALIGNED-NEXT: v_lshl_or_b32 v0, v70, 8, v69 ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:74 ; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:83 +; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:74 ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:984 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(61) +; ALIGNED-NEXT: s_waitcnt vmcnt(5) ; ALIGNED-NEXT: v_lshl_or_b32 v1, v80, 8, v71 -; ALIGNED-NEXT: buffer_store_dword v71, off, s[0:3], s32 offset:976 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v80, off, s[0:3], s32 offset:980 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x4 -; ALIGNED-NEXT: buffer_load_ubyte v85, v2, s[0:3], 0 offen offset:212 -; ALIGNED-NEXT: buffer_load_ubyte v71, v2, s[0:3], 0 offen offset:213 -; ALIGNED-NEXT: buffer_load_ubyte v83, v2, s[0:3], 0 offen offset:214 -; ALIGNED-NEXT: buffer_load_ubyte v69, v2, s[0:3], 0 offen offset:215 -; ALIGNED-NEXT: buffer_load_ubyte 
v80, v2, s[0:3], 0 offen offset:211 -; ALIGNED-NEXT: buffer_store_dword v65, off, s[0:3], s32 offset:944 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:73 -; ALIGNED-NEXT: buffer_store_dword v64, off, s[0:3], s32 offset:940 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v66, off, s[0:3], s32 offset:952 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:956 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:996 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:72 -; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(62) -; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:1000 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:912 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:928 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x2 -; ALIGNED-NEXT: buffer_load_ubyte v54, v2, s[0:3], 0 offen offset:208 -; ALIGNED-NEXT: buffer_load_ubyte v51, v2, s[0:3], 0 offen offset:209 -; ALIGNED-NEXT: buffer_load_ubyte v52, v2, s[0:3], 0 offen offset:210 +; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:856 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:848 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:872 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:860 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:864 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:884 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill @@ -12884,97 +12902,47 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_store_dword v49, 
off, s[0:3], s32 offset:900 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:892 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:904 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x7 -; ALIGNED-NEXT: buffer_load_ubyte v50, v2, s[0:3], 0 offen offset:224 -; ALIGNED-NEXT: buffer_load_ubyte v48, v2, s[0:3], 0 offen offset:225 -; ALIGNED-NEXT: buffer_load_ubyte v37, v2, s[0:3], 0 offen offset:226 -; ALIGNED-NEXT: buffer_load_ubyte v36, v2, s[0:3], 0 offen offset:227 -; ALIGNED-NEXT: buffer_load_ubyte v49, v2, s[0:3], 0 offen offset:228 -; ALIGNED-NEXT: buffer_load_ubyte v39, v2, s[0:3], 0 offen offset:229 -; ALIGNED-NEXT: buffer_load_ubyte v38, v2, s[0:3], 0 offen offset:230 -; ALIGNED-NEXT: buffer_load_ubyte v35, v2, s[0:3], 0 offen offset:231 -; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:856 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:848 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:860 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x4 -; ALIGNED-NEXT: buffer_load_ubyte v34, v2, s[0:3], 0 offen offset:236 -; ALIGNED-NEXT: buffer_load_ubyte v33, v2, s[0:3], 0 offen offset:237 -; ALIGNED-NEXT: buffer_load_ubyte v32, v2, s[0:3], 0 offen offset:238 -; ALIGNED-NEXT: buffer_load_ubyte v30, v2, s[0:3], 0 offen offset:239 -; ALIGNED-NEXT: buffer_load_ubyte v29, v2, s[0:3], 0 offen offset:235 -; ALIGNED-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x2 -; ALIGNED-NEXT: buffer_load_ubyte v28, v2, s[0:3], 0 offen offset:232 -; ALIGNED-NEXT: buffer_load_ubyte v27, v2, s[0:3], 0 offen offset:233 -; ALIGNED-NEXT: buffer_load_ubyte v25, v2, s[0:3], 0 offen offset:234 -; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0xc -; ALIGNED-NEXT: buffer_load_ubyte v24, v2, s[0:3], 0 offen offset:240 -; ALIGNED-NEXT: buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:241 -; ALIGNED-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:242 -; ALIGNED-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:243 -; ALIGNED-NEXT: buffer_load_ubyte v23, v2, s[0:3], 0 offen offset:244 -; ALIGNED-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:245 -; ALIGNED-NEXT: buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:246 -; ALIGNED-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:247 -; ALIGNED-NEXT: buffer_load_ubyte v14, v2, 
s[0:3], 0 offen offset:252 -; ALIGNED-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:253 -; ALIGNED-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:254 -; ALIGNED-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:255 -; ALIGNED-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:251 -; ALIGNED-NEXT: buffer_store_dword v70, off, s[0:3], s32 offset:968 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:936 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:912 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:920 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(44) +; ALIGNED-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:928 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v65, off, s[0:3], s32 offset:944 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:936 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v64, off, s[0:3], s32 offset:940 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v66, off, s[0:3], s32 offset:952 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:956 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:964 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v70, off, s[0:3], s32 offset:968 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v71, off, s[0:3], s32 offset:976 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v80, off, s[0:3], s32 offset:980 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:1000 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v127, off, s[0:3], s32 offset:1412 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:87 +; ALIGNED-NEXT: s_waitcnt vmcnt(7) ; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1032 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(43) -; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1040 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(42) -; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1024 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(41) +; ALIGNED-NEXT: s_waitcnt vmcnt(6) ; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1036 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(40) +; ALIGNED-NEXT: s_waitcnt vmcnt(5) +; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1024 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(4) +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1020 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(3) ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1004 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(33) +; ALIGNED-NEXT: s_waitcnt vmcnt(2) ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:992 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(32) +; ALIGNED-NEXT: s_waitcnt vmcnt(1) ; ALIGNED-NEXT: 
buffer_store_dword v0, off, s[0:3], s32 offset:988 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v81, 8, v3 ; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:84 -; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1020 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x7 -; ALIGNED-NEXT: buffer_load_ubyte v84, v2, s[0:3], 0 offen offset:216 -; ALIGNED-NEXT: buffer_load_ubyte v81, v2, s[0:3], 0 offen offset:217 -; ALIGNED-NEXT: buffer_load_ubyte v65, v2, s[0:3], 0 offen offset:218 -; ALIGNED-NEXT: buffer_load_ubyte v66, v2, s[0:3], 0 offen offset:219 -; ALIGNED-NEXT: buffer_load_ubyte v82, v2, s[0:3], 0 offen offset:220 -; ALIGNED-NEXT: buffer_load_ubyte v67, v2, s[0:3], 0 offen offset:221 -; ALIGNED-NEXT: buffer_load_ubyte v68, v2, s[0:3], 0 offen offset:222 -; ALIGNED-NEXT: buffer_load_ubyte v64, v2, s[0:3], 0 offen offset:223 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:81 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1008 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:80 -; ALIGNED-NEXT: s_waitcnt vmcnt(16) -; ALIGNED-NEXT: v_lshl_or_b32 v95, v16, 8, v20 -; ALIGNED-NEXT: s_waitcnt vmcnt(12) -; ALIGNED-NEXT: v_lshl_or_b32 v109, v11, 8, v12 -; ALIGNED-NEXT: s_waitcnt vmcnt(10) +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1040 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(2) ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1028 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1016 ; 4-byte Folded Spill @@ -13246,158 +13214,289 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1400 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v7 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v114, 8, v113 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v8, 8, v6 -; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:248 -; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:249 -; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:250 -; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:6 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1436 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v5, 8, v3 -; ALIGNED-NEXT: s_clause 0x1 ; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:158 -; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:5 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:157 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1444 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:156 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: s_waitcnt vmcnt(2) ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1452 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1448 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1440 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x1 +; ALIGNED-NEXT: 
buffer_load_ubyte v126, v2, s[0:3], 0 offen offset:159 +; ALIGNED-NEXT: buffer_load_ubyte v124, v2, s[0:3], 0 offen offset:155 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) ; ALIGNED-NEXT: v_lshl_or_b32 v1, v126, 8, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v117, 8, v40 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v124, 8, v121 -; ALIGNED-NEXT: v_lshl_or_b32 v110, v4, 16, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v115, 8, v118 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v112, 8, v116 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1456 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x2 +; ALIGNED-NEXT: buffer_load_ubyte v123, v2, s[0:3], 0 offen offset:152 +; ALIGNED-NEXT: buffer_load_ubyte v122, v2, s[0:3], 0 offen offset:153 +; ALIGNED-NEXT: buffer_load_ubyte v121, v2, s[0:3], 0 offen offset:154 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) ; ALIGNED-NEXT: v_lshl_or_b32 v0, v122, 8, v123 -; ALIGNED-NEXT: v_lshl_or_b32 v93, v4, 16, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v101, 8, v102 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: v_lshl_or_b32 v1, v124, 8, v121 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v105, 8, v104 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v99, 8, v100 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1460 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x7 +; ALIGNED-NEXT: buffer_load_ubyte v120, v2, s[0:3], 0 offen offset:160 +; ALIGNED-NEXT: buffer_load_ubyte v108, v2, s[0:3], 0 offen offset:161 +; ALIGNED-NEXT: buffer_load_ubyte v104, v2, s[0:3], 0 offen offset:162 +; ALIGNED-NEXT: buffer_load_ubyte v105, v2, s[0:3], 0 offen offset:163 +; ALIGNED-NEXT: buffer_load_ubyte v111, v2, s[0:3], 0 offen offset:164 +; ALIGNED-NEXT: buffer_load_ubyte v106, v2, s[0:3], 0 offen offset:165 +; ALIGNED-NEXT: buffer_load_ubyte v107, v2, s[0:3], 0 offen offset:166 +; ALIGNED-NEXT: buffer_load_ubyte v94, v2, s[0:3], 0 offen offset:167 +; ALIGNED-NEXT: s_waitcnt vmcnt(6) ; ALIGNED-NEXT: v_lshl_or_b32 v0, v108, 8, v120 -; ALIGNED-NEXT: v_lshl_or_b32 v91, v4, 16, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v96, 8, v98 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v97, 8, v87 +; ALIGNED-NEXT: s_waitcnt vmcnt(4) +; ALIGNED-NEXT: v_lshl_or_b32 v1, v105, 8, v104 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: v_lshl_or_b32 v1, v94, 8, v107 -; ALIGNED-NEXT: v_lshl_or_b32 v77, v4, 16, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v71, 8, v85 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1464 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v106, 8, v111 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v69, 8, v83 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v88, 8, v90 -; ALIGNED-NEXT: v_lshl_or_b32 v58, v4, 16, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v81, 8, v84 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v66, 8, v65 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1468 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x4 +; ALIGNED-NEXT: buffer_load_ubyte v92, v2, s[0:3], 0 offen offset:172 +; ALIGNED-NEXT: buffer_load_ubyte v89, v2, s[0:3], 0 offen offset:173 +; ALIGNED-NEXT: buffer_load_ubyte v90, v2, s[0:3], 0 offen offset:174 +; ALIGNED-NEXT: buffer_load_ubyte v88, v2, s[0:3], 0 offen offset:175 +; ALIGNED-NEXT: buffer_load_ubyte v78, v2, s[0:3], 0 offen offset:171 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) ; ALIGNED-NEXT: v_lshl_or_b32 v0, v89, 8, v92 -; ALIGNED-NEXT: v_lshl_or_b32 v41, v4, 16, v3 -; ALIGNED-NEXT: 
v_lshl_or_b32 v3, v67, 8, v82 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: v_lshl_or_b32 v1, v88, 8, v90 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v78, 8, v75 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v64, 8, v68 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1472 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x2 +; ALIGNED-NEXT: buffer_load_ubyte v79, v2, s[0:3], 0 offen offset:168 +; ALIGNED-NEXT: buffer_load_ubyte v76, v2, s[0:3], 0 offen offset:169 +; ALIGNED-NEXT: buffer_load_ubyte v75, v2, s[0:3], 0 offen offset:170 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) ; ALIGNED-NEXT: v_lshl_or_b32 v0, v76, 8, v79 -; ALIGNED-NEXT: v_lshl_or_b32 v119, v4, 16, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v51, 8, v54 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v80, 8, v52 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: v_lshl_or_b32 v1, v78, 8, v75 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v60, 8, v61 -; ALIGNED-NEXT: v_lshl_or_b32 v103, v4, 16, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v48, 8, v50 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1476 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x7 +; ALIGNED-NEXT: buffer_load_ubyte v74, v2, s[0:3], 0 offen offset:176 +; ALIGNED-NEXT: buffer_load_ubyte v72, v2, s[0:3], 0 offen offset:177 +; ALIGNED-NEXT: buffer_load_ubyte v61, v2, s[0:3], 0 offen offset:178 +; ALIGNED-NEXT: buffer_load_ubyte v60, v2, s[0:3], 0 offen offset:179 +; ALIGNED-NEXT: buffer_load_ubyte v73, v2, s[0:3], 0 offen offset:180 +; ALIGNED-NEXT: buffer_load_ubyte v63, v2, s[0:3], 0 offen offset:181 +; ALIGNED-NEXT: buffer_load_ubyte v62, v2, s[0:3], 0 offen offset:182 +; ALIGNED-NEXT: buffer_load_ubyte v59, v2, s[0:3], 0 offen offset:183 +; ALIGNED-NEXT: s_waitcnt vmcnt(6) ; ALIGNED-NEXT: v_lshl_or_b32 v0, v72, 8, v74 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v36, 8, v37 +; ALIGNED-NEXT: s_waitcnt vmcnt(4) +; ALIGNED-NEXT: v_lshl_or_b32 v1, v60, 8, v61 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v86, v4, 16, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v39, 8, v49 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v35, 8, v38 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: v_lshl_or_b32 v1, v59, 8, v62 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1480 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v63, 8, v73 -; ALIGNED-NEXT: v_lshl_or_b32 v70, v4, 16, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v33, 8, v34 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v30, 8, v32 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v46, 8, v56 -; ALIGNED-NEXT: v_lshl_or_b32 v55, v4, 16, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v27, 8, v28 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v29, 8, v25 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1484 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x4 +; ALIGNED-NEXT: buffer_load_ubyte v57, v2, s[0:3], 0 offen offset:188 +; ALIGNED-NEXT: buffer_load_ubyte v47, v2, s[0:3], 0 offen offset:189 +; ALIGNED-NEXT: buffer_load_ubyte v56, v2, s[0:3], 0 offen offset:190 +; ALIGNED-NEXT: buffer_load_ubyte v46, v2, s[0:3], 0 offen offset:191 +; ALIGNED-NEXT: buffer_load_ubyte v44, v2, s[0:3], 0 offen offset:187 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) ; ALIGNED-NEXT: v_lshl_or_b32 v0, v47, 8, v57 -; ALIGNED-NEXT: v_lshl_or_b32 v53, v4, 16, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v22, 8, v24 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v18, 8, v17 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: v_lshl_or_b32 v1, v46, 8, v56 ; ALIGNED-NEXT: 
v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1488 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x2 +; ALIGNED-NEXT: buffer_load_ubyte v45, v2, s[0:3], 0 offen offset:184 +; ALIGNED-NEXT: buffer_load_ubyte v43, v2, s[0:3], 0 offen offset:185 +; ALIGNED-NEXT: buffer_load_ubyte v42, v2, s[0:3], 0 offen offset:186 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: v_lshl_or_b32 v0, v43, 8, v45 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: v_lshl_or_b32 v1, v44, 8, v42 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1492 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x3e +; ALIGNED-NEXT: buffer_load_ubyte v40, v2, s[0:3], 0 offen offset:192 +; ALIGNED-NEXT: buffer_load_ubyte v117, v2, s[0:3], 0 offen offset:193 +; ALIGNED-NEXT: buffer_load_ubyte v113, v2, s[0:3], 0 offen offset:194 +; ALIGNED-NEXT: buffer_load_ubyte v114, v2, s[0:3], 0 offen offset:195 +; ALIGNED-NEXT: buffer_load_ubyte v118, v2, s[0:3], 0 offen offset:196 +; ALIGNED-NEXT: buffer_load_ubyte v115, v2, s[0:3], 0 offen offset:197 +; ALIGNED-NEXT: buffer_load_ubyte v116, v2, s[0:3], 0 offen offset:198 +; ALIGNED-NEXT: buffer_load_ubyte v112, v2, s[0:3], 0 offen offset:199 +; ALIGNED-NEXT: buffer_load_ubyte v102, v2, s[0:3], 0 offen offset:204 +; ALIGNED-NEXT: buffer_load_ubyte v101, v2, s[0:3], 0 offen offset:205 +; ALIGNED-NEXT: buffer_load_ubyte v100, v2, s[0:3], 0 offen offset:206 +; ALIGNED-NEXT: buffer_load_ubyte v99, v2, s[0:3], 0 offen offset:207 +; ALIGNED-NEXT: buffer_load_ubyte v97, v2, s[0:3], 0 offen offset:203 +; ALIGNED-NEXT: buffer_load_ubyte v98, v2, s[0:3], 0 offen offset:200 +; ALIGNED-NEXT: buffer_load_ubyte v96, v2, s[0:3], 0 offen offset:201 +; ALIGNED-NEXT: buffer_load_ubyte v87, v2, s[0:3], 0 offen offset:202 +; ALIGNED-NEXT: buffer_load_ubyte v85, v2, s[0:3], 0 offen offset:212 +; ALIGNED-NEXT: buffer_load_ubyte v71, v2, s[0:3], 0 offen offset:213 +; ALIGNED-NEXT: buffer_load_ubyte v83, v2, s[0:3], 0 offen offset:214 +; ALIGNED-NEXT: buffer_load_ubyte v69, v2, s[0:3], 0 offen offset:215 +; ALIGNED-NEXT: buffer_load_ubyte v80, v2, s[0:3], 0 offen offset:211 +; ALIGNED-NEXT: buffer_load_ubyte v84, v2, s[0:3], 0 offen offset:216 +; ALIGNED-NEXT: buffer_load_ubyte v81, v2, s[0:3], 0 offen offset:217 +; ALIGNED-NEXT: buffer_load_ubyte v65, v2, s[0:3], 0 offen offset:218 +; ALIGNED-NEXT: buffer_load_ubyte v66, v2, s[0:3], 0 offen offset:219 +; ALIGNED-NEXT: buffer_load_ubyte v82, v2, s[0:3], 0 offen offset:220 +; ALIGNED-NEXT: buffer_load_ubyte v67, v2, s[0:3], 0 offen offset:221 +; ALIGNED-NEXT: buffer_load_ubyte v68, v2, s[0:3], 0 offen offset:222 +; ALIGNED-NEXT: buffer_load_ubyte v64, v2, s[0:3], 0 offen offset:223 +; ALIGNED-NEXT: buffer_load_ubyte v54, v2, s[0:3], 0 offen offset:208 +; ALIGNED-NEXT: buffer_load_ubyte v51, v2, s[0:3], 0 offen offset:209 +; ALIGNED-NEXT: buffer_load_ubyte v52, v2, s[0:3], 0 offen offset:210 +; ALIGNED-NEXT: buffer_load_ubyte v50, v2, s[0:3], 0 offen offset:224 +; ALIGNED-NEXT: buffer_load_ubyte v48, v2, s[0:3], 0 offen offset:225 +; ALIGNED-NEXT: buffer_load_ubyte v37, v2, s[0:3], 0 offen offset:226 +; ALIGNED-NEXT: buffer_load_ubyte v36, v2, s[0:3], 0 offen offset:227 +; ALIGNED-NEXT: buffer_load_ubyte v49, v2, s[0:3], 0 offen offset:228 +; ALIGNED-NEXT: buffer_load_ubyte v39, v2, s[0:3], 0 offen offset:229 +; ALIGNED-NEXT: buffer_load_ubyte v38, v2, s[0:3], 0 offen offset:230 +; ALIGNED-NEXT: buffer_load_ubyte v35, v2, s[0:3], 0 offen offset:231 +; 
ALIGNED-NEXT: buffer_load_ubyte v34, v2, s[0:3], 0 offen offset:236 +; ALIGNED-NEXT: buffer_load_ubyte v33, v2, s[0:3], 0 offen offset:237 +; ALIGNED-NEXT: buffer_load_ubyte v32, v2, s[0:3], 0 offen offset:238 +; ALIGNED-NEXT: buffer_load_ubyte v30, v2, s[0:3], 0 offen offset:239 +; ALIGNED-NEXT: buffer_load_ubyte v29, v2, s[0:3], 0 offen offset:235 +; ALIGNED-NEXT: buffer_load_ubyte v28, v2, s[0:3], 0 offen offset:232 +; ALIGNED-NEXT: buffer_load_ubyte v27, v2, s[0:3], 0 offen offset:233 +; ALIGNED-NEXT: buffer_load_ubyte v25, v2, s[0:3], 0 offen offset:234 +; ALIGNED-NEXT: buffer_load_ubyte v24, v2, s[0:3], 0 offen offset:240 +; ALIGNED-NEXT: buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:241 +; ALIGNED-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:242 +; ALIGNED-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:243 +; ALIGNED-NEXT: buffer_load_ubyte v23, v2, s[0:3], 0 offen offset:244 +; ALIGNED-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:245 +; ALIGNED-NEXT: buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:246 +; ALIGNED-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:247 +; ALIGNED-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:252 +; ALIGNED-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:253 +; ALIGNED-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:254 +; ALIGNED-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:255 +; ALIGNED-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:251 +; ALIGNED-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:248 +; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:249 +; ALIGNED-NEXT: s_clause 0x5 +; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:250 +; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:2 +; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:3 +; ALIGNED-NEXT: buffer_load_ubyte v125, v2, s[0:3], 0 offen offset:4 +; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:5 +; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:6 +; ALIGNED-NEXT: s_waitcnt vmcnt(62) +; ALIGNED-NEXT: v_lshl_or_b32 v3, v117, 8, v40 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v114, 8, v113 +; ALIGNED-NEXT: v_lshl_or_b32 v110, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v115, 8, v118 +; ALIGNED-NEXT: s_waitcnt vmcnt(61) +; ALIGNED-NEXT: v_lshl_or_b32 v4, v112, 8, v116 +; ALIGNED-NEXT: v_lshl_or_b32 v93, v4, 16, v3 +; ALIGNED-NEXT: s_waitcnt vmcnt(59) +; ALIGNED-NEXT: v_lshl_or_b32 v3, v101, 8, v102 +; ALIGNED-NEXT: s_waitcnt vmcnt(57) +; ALIGNED-NEXT: v_lshl_or_b32 v4, v99, 8, v100 +; ALIGNED-NEXT: s_waitcnt vmcnt(13) +; ALIGNED-NEXT: v_lshl_or_b32 v95, v16, 8, v20 +; ALIGNED-NEXT: s_waitcnt vmcnt(9) +; ALIGNED-NEXT: v_lshl_or_b32 v109, v11, 8, v12 +; ALIGNED-NEXT: v_lshl_or_b32 v91, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v96, 8, v98 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v97, 8, v87 +; ALIGNED-NEXT: v_lshl_or_b32 v77, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v71, 8, v85 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v69, 8, v83 +; ALIGNED-NEXT: v_lshl_or_b32 v58, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v81, 8, v84 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v66, 8, v65 +; ALIGNED-NEXT: v_lshl_or_b32 v41, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v67, 8, v82 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v64, 8, v68 +; ALIGNED-NEXT: v_lshl_or_b32 v119, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v51, 8, v54 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v80, 8, v52 +; ALIGNED-NEXT: v_lshl_or_b32 v103, v4, 16, v3 +; ALIGNED-NEXT: 
v_lshl_or_b32 v3, v48, 8, v50 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v36, 8, v37 +; ALIGNED-NEXT: v_lshl_or_b32 v86, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v39, 8, v49 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v35, 8, v38 +; ALIGNED-NEXT: v_lshl_or_b32 v70, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v33, 8, v34 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v30, 8, v32 +; ALIGNED-NEXT: v_lshl_or_b32 v55, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v27, 8, v28 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v29, 8, v25 +; ALIGNED-NEXT: v_lshl_or_b32 v53, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v22, 8, v24 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v18, 8, v17 ; ALIGNED-NEXT: v_lshl_or_b32 v31, v4, 16, v3 ; ALIGNED-NEXT: v_lshl_or_b32 v4, v19, 8, v23 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1488 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v43, 8, v45 ; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen ; ALIGNED-NEXT: v_lshl_or_b32 v26, v95, 16, v4 ; ALIGNED-NEXT: v_lshl_or_b32 v95, v13, 8, v14 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:2 ; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:7 -; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1332 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(5) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1292 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1300 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(4) +; ALIGNED-NEXT: buffer_store_dword v125, off, s[0:3], s32 offset:1296 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v21, v109, 16, v95 ; ALIGNED-NEXT: v_lshl_or_b32 v95, v8, 8, v10 ; ALIGNED-NEXT: v_lshl_or_b32 v109, v9, 8, v7 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1492 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:3 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) ; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1304 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v125, off, s[0:3], s32 offset:1296 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1332 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v15, v109, 16, v95 ; ALIGNED-NEXT: buffer_load_ubyte v95, v2, s[0:3], 0 offen offset:1 -; ALIGNED-NEXT: s_waitcnt vmcnt(4) -; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1260 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1300 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1340 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(1) ; ALIGNED-NEXT: v_lshl_or_b32 v109, v0, 8, v1 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1292 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:14 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1260 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1340 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v95, off, s[0:3], s32 offset:1284 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v95, v95, 8, v3 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v109, 16, v95 ; ALIGNED-NEXT: 
v_lshl_or_b32 v95, v5, 8, v125 ; ALIGNED-NEXT: v_lshl_or_b32 v109, v4, 8, v6 -; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:12 -; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:13 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1312 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v109, 16, v95 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1348 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x1 +; ALIGNED-NEXT: s_clause 0x4 +; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:12 +; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:13 +; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:14 ; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:15 ; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:11 -; ALIGNED-NEXT: buffer_store_dword v127, off, s[0:3], s32 offset:1412 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(4) -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1364 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: s_waitcnt vmcnt(3) ; ALIGNED-NEXT: v_lshl_or_b32 v95, v4, 8, v6 ; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1376 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:9 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: s_waitcnt vmcnt(1) ; ALIGNED-NEXT: v_lshl_or_b32 v109, v0, 8, v1 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1368 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1364 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x1 +; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:9 ; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:10 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v109, 16, v95 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1392 ; 4-byte Folded Spill @@ -13410,6 +13509,7 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: v_lshl_or_b32 v95, v4, 8, v0 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1380 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v109, 16, v95 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1396 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x2 ; ALIGNED-NEXT: buffer_load_ubyte v109, v2, s[0:3], 0 offen offset:18 ; ALIGNED-NEXT: buffer_load_ubyte v125, v2, s[0:3], 0 offen offset:16 @@ -13418,11 +13518,10 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:236 ; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:228 ; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:224 -; ALIGNED-NEXT: s_clause 0x1 ; 8-byte Folded Reload +; ALIGNED-NEXT: s_clause 0x1 ; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:704 ; ALIGNED-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:708 ; ALIGNED-NEXT: v_add_nc_u32_e32 v2, 0x100, v2 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1396 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(4) ; ALIGNED-NEXT: v_lshl_or_b32 v0, v127, 8, v109 ; ALIGNED-NEXT: s_waitcnt vmcnt(2) @@ -13491,8 +13590,6 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_store_dword v91, off, s[0:3], s32 offset:220 ; ALIGNED-NEXT: buffer_store_dword v93, off, s[0:3], s32 offset:212 ; 
ALIGNED-NEXT: buffer_store_dword v110, off, s[0:3], s32 offset:208 -; ALIGNED-NEXT: v_lshl_or_b32 v127, v0, 16, v127 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1492 ; 4-byte Folded Reload ; ALIGNED-NEXT: flat_store_byte v[3:4], v87 offset:202 ; ALIGNED-NEXT: flat_store_byte v[3:4], v97 offset:203 ; ALIGNED-NEXT: flat_store_byte v[3:4], v96 offset:201 @@ -13509,6 +13606,22 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: flat_store_byte v[3:4], v116 offset:198 ; ALIGNED-NEXT: flat_store_byte v[3:4], v118 offset:196 ; ALIGNED-NEXT: flat_store_byte v[3:4], v40 offset:192 +; ALIGNED-NEXT: v_lshl_or_b32 v127, v0, 16, v127 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1492 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_add_u32 s4, s4, 0x100 +; ALIGNED-NEXT: s_addc_u32 s5, s5, 0 +; ALIGNED-NEXT: s_cmp_lg_u64 s[4:5], 0x800 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1488 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1484 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1480 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; ALIGNED-NEXT: flat_store_byte v[3:4], v42 offset:186 ; ALIGNED-NEXT: flat_store_byte v[3:4], v44 offset:187 ; ALIGNED-NEXT: flat_store_byte v[3:4], v43 offset:185 @@ -13525,6 +13638,18 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: flat_store_byte v[3:4], v62 offset:182 ; ALIGNED-NEXT: flat_store_byte v[3:4], v73 offset:180 ; ALIGNED-NEXT: flat_store_byte v[3:4], v74 offset:176 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1476 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1472 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1468 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1464 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; ALIGNED-NEXT: flat_store_byte v[3:4], v75 offset:170 ; ALIGNED-NEXT: flat_store_byte v[3:4], v78 offset:171 ; ALIGNED-NEXT: flat_store_byte v[3:4], v76 offset:169 @@ -13541,36 +13666,6 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: flat_store_byte v[3:4], v107 offset:166 ; ALIGNED-NEXT: flat_store_byte v[3:4], v111 offset:164 ; ALIGNED-NEXT: flat_store_byte v[3:4], v120 offset:160 -; ALIGNED-NEXT: flat_store_byte v[3:4], v121 offset:154 -; ALIGNED-NEXT: flat_store_byte v[3:4], v124 offset:155 -; ALIGNED-NEXT: flat_store_byte v[3:4], v122 offset:153 -; ALIGNED-NEXT: flat_store_byte v[3:4], v126 offset:159 -; ALIGNED-NEXT: s_add_u32 s4, s4, 0x100 -; ALIGNED-NEXT: s_addc_u32 s5, 
s5, 0 -; ALIGNED-NEXT: s_cmp_lg_u64 s[4:5], 0x800 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1488 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1484 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1480 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1476 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1472 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1468 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1464 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1460 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 @@ -13583,6 +13678,10 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1436 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 +; ALIGNED-NEXT: flat_store_byte v[3:4], v121 offset:154 +; ALIGNED-NEXT: flat_store_byte v[3:4], v124 offset:155 +; ALIGNED-NEXT: flat_store_byte v[3:4], v122 offset:153 +; ALIGNED-NEXT: flat_store_byte v[3:4], v126 offset:159 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1448 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:157 @@ -14048,8 +14147,6 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:388 ; ALIGNED-NEXT: buffer_store_dword v127, off, s[0:3], s32 offset:384 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload -; ALIGNED-NEXT: flat_store_byte v[3:4], v1 offset:10 -; ALIGNED-NEXT: flat_store_byte v[3:4], v5 offset:11 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:26 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload @@ -14103,6 +14200,8 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1312 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:400 +; ALIGNED-NEXT: flat_store_byte v[3:4], v1 offset:10 +; ALIGNED-NEXT: flat_store_byte v[3:4], v5 offset:11 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1376 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt 
vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:13 @@ -14154,23 +14253,23 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: s_mov_b32 s7, -1 ; ALIGNED-NEXT: .LBB9_4: ; %memmove_bwd_loop ; ALIGNED-NEXT: ; =>This Inner Loop Header: Depth=1 -; ALIGNED-NEXT: s_clause 0x3e +; ALIGNED-NEXT: s_clause 0x39 ; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:20 ; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:21 ; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:22 ; ALIGNED-NEXT: buffer_load_ubyte v3, v4, s[0:3], 0 offen offset:23 +; ALIGNED-NEXT: buffer_load_ubyte v6, v4, s[0:3], 0 offen offset:24 +; ALIGNED-NEXT: buffer_load_ubyte v10, v4, s[0:3], 0 offen offset:25 +; ALIGNED-NEXT: buffer_load_ubyte v12, v4, s[0:3], 0 offen offset:26 ; ALIGNED-NEXT: buffer_load_ubyte v125, v4, s[0:3], 0 offen offset:19 ; ALIGNED-NEXT: buffer_load_ubyte v5, v4, s[0:3], 0 offen offset:28 ; ALIGNED-NEXT: buffer_load_ubyte v7, v4, s[0:3], 0 offen offset:29 ; ALIGNED-NEXT: buffer_load_ubyte v8, v4, s[0:3], 0 offen offset:30 ; ALIGNED-NEXT: buffer_load_ubyte v9, v4, s[0:3], 0 offen offset:31 -; ALIGNED-NEXT: buffer_load_ubyte v6, v4, s[0:3], 0 offen offset:24 -; ALIGNED-NEXT: buffer_load_ubyte v10, v4, s[0:3], 0 offen offset:25 -; ALIGNED-NEXT: buffer_load_ubyte v12, v4, s[0:3], 0 offen offset:26 -; ALIGNED-NEXT: buffer_load_ubyte v11, v4, s[0:3], 0 offen offset:27 ; ALIGNED-NEXT: buffer_load_ubyte v14, v4, s[0:3], 0 offen offset:32 ; ALIGNED-NEXT: buffer_load_ubyte v15, v4, s[0:3], 0 offen offset:33 ; ALIGNED-NEXT: buffer_load_ubyte v17, v4, s[0:3], 0 offen offset:34 +; ALIGNED-NEXT: buffer_load_ubyte v11, v4, s[0:3], 0 offen offset:27 ; ALIGNED-NEXT: buffer_load_ubyte v19, v4, s[0:3], 0 offen offset:35 ; ALIGNED-NEXT: buffer_load_ubyte v13, v4, s[0:3], 0 offen offset:36 ; ALIGNED-NEXT: buffer_load_ubyte v16, v4, s[0:3], 0 offen offset:37 @@ -14192,17 +14291,17 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_load_ubyte v37, v4, s[0:3], 0 offen offset:53 ; ALIGNED-NEXT: buffer_load_ubyte v35, v4, s[0:3], 0 offen offset:54 ; ALIGNED-NEXT: buffer_load_ubyte v36, v4, s[0:3], 0 offen offset:55 +; ALIGNED-NEXT: buffer_load_ubyte v48, v4, s[0:3], 0 offen offset:56 +; ALIGNED-NEXT: buffer_load_ubyte v51, v4, s[0:3], 0 offen offset:57 +; ALIGNED-NEXT: buffer_load_ubyte v52, v4, s[0:3], 0 offen offset:58 ; ALIGNED-NEXT: buffer_load_ubyte v38, v4, s[0:3], 0 offen offset:60 ; ALIGNED-NEXT: buffer_load_ubyte v50, v4, s[0:3], 0 offen offset:61 ; ALIGNED-NEXT: buffer_load_ubyte v39, v4, s[0:3], 0 offen offset:62 ; ALIGNED-NEXT: buffer_load_ubyte v49, v4, s[0:3], 0 offen offset:63 -; ALIGNED-NEXT: buffer_load_ubyte v48, v4, s[0:3], 0 offen offset:56 -; ALIGNED-NEXT: buffer_load_ubyte v51, v4, s[0:3], 0 offen offset:57 -; ALIGNED-NEXT: buffer_load_ubyte v52, v4, s[0:3], 0 offen offset:58 -; ALIGNED-NEXT: buffer_load_ubyte v53, v4, s[0:3], 0 offen offset:59 ; ALIGNED-NEXT: buffer_load_ubyte v29, v4, s[0:3], 0 offen offset:64 ; ALIGNED-NEXT: buffer_load_ubyte v55, v4, s[0:3], 0 offen offset:65 ; ALIGNED-NEXT: buffer_load_ubyte v66, v4, s[0:3], 0 offen offset:66 +; ALIGNED-NEXT: buffer_load_ubyte v53, v4, s[0:3], 0 offen offset:59 ; ALIGNED-NEXT: buffer_load_ubyte v67, v4, s[0:3], 0 offen offset:67 ; ALIGNED-NEXT: buffer_load_ubyte v54, v4, s[0:3], 0 offen offset:68 ; ALIGNED-NEXT: buffer_load_ubyte v64, v4, s[0:3], 0 offen offset:69 @@ -14210,97 +14309,57 @@ define void 
@memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_load_ubyte v68, v4, s[0:3], 0 offen offset:71 ; ALIGNED-NEXT: buffer_load_ubyte v69, v4, s[0:3], 0 offen offset:76 ; ALIGNED-NEXT: buffer_load_ubyte v70, v4, s[0:3], 0 offen offset:77 +; ALIGNED-NEXT: buffer_load_ubyte v81, v4, s[0:3], 0 offen offset:75 ; ALIGNED-NEXT: buffer_load_ubyte v71, v4, s[0:3], 0 offen offset:78 ; ALIGNED-NEXT: buffer_load_ubyte v80, v4, s[0:3], 0 offen offset:79 -; ALIGNED-NEXT: buffer_load_ubyte v81, v4, s[0:3], 0 offen offset:75 -; ALIGNED-NEXT: buffer_load_ubyte v126, v4, s[0:3], 0 offen offset:159 -; ALIGNED-NEXT: buffer_load_ubyte v124, v4, s[0:3], 0 offen offset:155 -; ALIGNED-NEXT: buffer_load_ubyte v123, v4, s[0:3], 0 offen offset:152 -; ALIGNED-NEXT: buffer_load_ubyte v121, v4, s[0:3], 0 offen offset:153 -; ALIGNED-NEXT: buffer_load_ubyte v111, v4, s[0:3], 0 offen offset:154 -; ALIGNED-NEXT: s_clause 0x34 -; ALIGNED-NEXT: buffer_load_ubyte v108, v4, s[0:3], 0 offen offset:160 -; ALIGNED-NEXT: buffer_load_ubyte v105, v4, s[0:3], 0 offen offset:161 -; ALIGNED-NEXT: buffer_load_ubyte v93, v4, s[0:3], 0 offen offset:162 -; ALIGNED-NEXT: buffer_load_ubyte v92, v4, s[0:3], 0 offen offset:163 -; ALIGNED-NEXT: buffer_load_ubyte v107, v4, s[0:3], 0 offen offset:164 -; ALIGNED-NEXT: buffer_load_ubyte v95, v4, s[0:3], 0 offen offset:165 -; ALIGNED-NEXT: buffer_load_ubyte v94, v4, s[0:3], 0 offen offset:166 -; ALIGNED-NEXT: buffer_load_ubyte v91, v4, s[0:3], 0 offen offset:167 -; ALIGNED-NEXT: buffer_load_ubyte v89, v4, s[0:3], 0 offen offset:172 -; ALIGNED-NEXT: buffer_load_ubyte v79, v4, s[0:3], 0 offen offset:173 -; ALIGNED-NEXT: buffer_load_ubyte v78, v4, s[0:3], 0 offen offset:174 -; ALIGNED-NEXT: buffer_load_ubyte v77, v4, s[0:3], 0 offen offset:175 -; ALIGNED-NEXT: buffer_load_ubyte v75, v4, s[0:3], 0 offen offset:171 -; ALIGNED-NEXT: buffer_load_ubyte v74, v4, s[0:3], 0 offen offset:168 -; ALIGNED-NEXT: buffer_load_ubyte v72, v4, s[0:3], 0 offen offset:169 -; ALIGNED-NEXT: buffer_load_ubyte v63, v4, s[0:3], 0 offen offset:170 -; ALIGNED-NEXT: buffer_load_ubyte v61, v4, s[0:3], 0 offen offset:176 -; ALIGNED-NEXT: buffer_load_ubyte v59, v4, s[0:3], 0 offen offset:177 -; ALIGNED-NEXT: buffer_load_ubyte v47, v4, s[0:3], 0 offen offset:178 -; ALIGNED-NEXT: buffer_load_ubyte v56, v4, s[0:3], 0 offen offset:179 -; ALIGNED-NEXT: buffer_load_ubyte v60, v4, s[0:3], 0 offen offset:180 -; ALIGNED-NEXT: buffer_load_ubyte v57, v4, s[0:3], 0 offen offset:181 -; ALIGNED-NEXT: buffer_load_ubyte v58, v4, s[0:3], 0 offen offset:182 -; ALIGNED-NEXT: buffer_load_ubyte v46, v4, s[0:3], 0 offen offset:183 -; ALIGNED-NEXT: buffer_load_ubyte v44, v4, s[0:3], 0 offen offset:188 -; ALIGNED-NEXT: buffer_load_ubyte v43, v4, s[0:3], 0 offen offset:189 -; ALIGNED-NEXT: buffer_load_ubyte v42, v4, s[0:3], 0 offen offset:190 -; ALIGNED-NEXT: buffer_load_ubyte v41, v4, s[0:3], 0 offen offset:191 -; ALIGNED-NEXT: buffer_load_ubyte v40, v4, s[0:3], 0 offen offset:187 -; ALIGNED-NEXT: buffer_load_ubyte v119, v4, s[0:3], 0 offen offset:184 -; ALIGNED-NEXT: buffer_load_ubyte v118, v4, s[0:3], 0 offen offset:185 -; ALIGNED-NEXT: buffer_load_ubyte v117, v4, s[0:3], 0 offen offset:186 -; ALIGNED-NEXT: buffer_load_ubyte v115, v4, s[0:3], 0 offen offset:192 -; ALIGNED-NEXT: buffer_load_ubyte v112, v4, s[0:3], 0 offen offset:193 -; ALIGNED-NEXT: buffer_load_ubyte v101, v4, s[0:3], 0 offen offset:194 -; ALIGNED-NEXT: buffer_load_ubyte v100, v4, s[0:3], 0 offen offset:195 -; ALIGNED-NEXT: buffer_load_ubyte v113, 
v4, s[0:3], 0 offen offset:196 -; ALIGNED-NEXT: buffer_load_ubyte v103, v4, s[0:3], 0 offen offset:197 -; ALIGNED-NEXT: buffer_load_ubyte v102, v4, s[0:3], 0 offen offset:198 -; ALIGNED-NEXT: buffer_load_ubyte v99, v4, s[0:3], 0 offen offset:199 -; ALIGNED-NEXT: buffer_load_ubyte v97, v4, s[0:3], 0 offen offset:204 -; ALIGNED-NEXT: buffer_load_ubyte v87, v4, s[0:3], 0 offen offset:205 -; ALIGNED-NEXT: buffer_load_ubyte v96, v4, s[0:3], 0 offen offset:206 -; ALIGNED-NEXT: buffer_load_ubyte v86, v4, s[0:3], 0 offen offset:207 -; ALIGNED-NEXT: buffer_load_ubyte v85, v4, s[0:3], 0 offen offset:203 -; ALIGNED-NEXT: buffer_load_ubyte v84, v4, s[0:3], 0 offen offset:200 -; ALIGNED-NEXT: buffer_load_ubyte v83, v4, s[0:3], 0 offen offset:201 -; ALIGNED-NEXT: buffer_load_ubyte v82, v4, s[0:3], 0 offen offset:202 -; ALIGNED-NEXT: buffer_load_ubyte v120, v4, s[0:3], 0 offen offset:2 -; ALIGNED-NEXT: buffer_load_ubyte v104, v4, s[0:3], 0 offen offset:4 -; ALIGNED-NEXT: buffer_load_ubyte v109, v4, s[0:3], 0 offen offset:5 -; ALIGNED-NEXT: buffer_load_ubyte v110, v4, s[0:3], 0 offen offset:6 -; ALIGNED-NEXT: buffer_load_ubyte v122, v4, s[0:3], 0 offen offset:7 -; ALIGNED-NEXT: s_waitcnt vmcnt(62) +; ALIGNED-NEXT: s_waitcnt vmcnt(57) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(56) ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(55) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(54) ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(53) +; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(52) +; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(51) +; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 +; ALIGNED-NEXT: s_waitcnt vmcnt(49) ; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(48) ; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(47) ; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(46) ; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(45) ; ALIGNED-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v2 ; ALIGNED-NEXT: v_lshl_or_b32 v2, v7, 8, v5 +; ALIGNED-NEXT: s_waitcnt vmcnt(42) +; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v3, v9, 8, v8 +; ALIGNED-NEXT: s_waitcnt vmcnt(40) ; ALIGNED-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v5, v10, 8, 
v6 ; ALIGNED-NEXT: v_lshl_or_b32 v6, v11, 8, v12 ; ALIGNED-NEXT: v_lshl_or_b32 v7, v15, 8, v14 ; ALIGNED-NEXT: v_lshl_or_b32 v8, v19, 8, v17 +; ALIGNED-NEXT: s_waitcnt vmcnt(39) ; ALIGNED-NEXT: v_lshl_or_b32 v9, v16, 8, v13 +; ALIGNED-NEXT: s_waitcnt vmcnt(37) ; ALIGNED-NEXT: v_lshl_or_b32 v10, v20, 8, v18 +; ALIGNED-NEXT: s_waitcnt vmcnt(35) ; ALIGNED-NEXT: v_lshl_or_b32 v11, v23, 8, v22 +; ALIGNED-NEXT: s_waitcnt vmcnt(33) ; ALIGNED-NEXT: v_lshl_or_b32 v12, v28, 8, v25 +; ALIGNED-NEXT: s_waitcnt vmcnt(31) ; ALIGNED-NEXT: v_lshl_or_b32 v13, v24, 8, v21 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: s_waitcnt vmcnt(29) ; ALIGNED-NEXT: v_lshl_or_b32 v14, v27, 8, v26 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 16, v2 ; ALIGNED-NEXT: v_lshl_or_b32 v2, v6, 16, v5 @@ -14309,88 +14368,75 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: v_lshl_or_b32 v6, v12, 16, v11 ; ALIGNED-NEXT: v_lshl_or_b32 v7, v14, 16, v13 ; ALIGNED-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(27) ; ALIGNED-NEXT: v_lshl_or_b32 v15, v31, 8, v30 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(25) ; ALIGNED-NEXT: v_lshl_or_b32 v0, v34, 8, v33 ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(23) ; ALIGNED-NEXT: v_lshl_or_b32 v1, v37, 8, v32 ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(21) ; ALIGNED-NEXT: v_lshl_or_b32 v2, v36, 8, v35 ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(16) ; ALIGNED-NEXT: v_lshl_or_b32 v3, v50, 8, v38 ; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(14) ; ALIGNED-NEXT: v_lshl_or_b32 v5, v49, 8, v39 ; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:868 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v6, v51, 8, v48 ; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:880 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(10) ; ALIGNED-NEXT: v_lshl_or_b32 v7, v53, 8, v52 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v0, 16, v15 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; ALIGNED-NEXT: v_lshl_or_b32 v2, v5, 16, v3 -; ALIGNED-NEXT: s_clause 0x1 ; ALIGNED-NEXT: buffer_load_ubyte v5, v4, s[0:3], 0 offen offset:85 -; ALIGNED-NEXT: buffer_load_ubyte v8, v4, s[0:3], 0 offen offset:87 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v7, 16, v6 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:916 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v55, 8, v29 ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:920 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(10) ; ALIGNED-NEXT: v_lshl_or_b32 v1, v67, 8, v66 ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:928 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(8) ; ALIGNED-NEXT: v_lshl_or_b32 v2, v64, 8, v54 ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(6) ; ALIGNED-NEXT: v_lshl_or_b32 v3, v68, 8, v65 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v7, v4, s[0:3], 0 offen offset:82 ; ALIGNED-NEXT: buffer_load_ubyte v6, v4, s[0:3], 0 offen offset:86 -; ALIGNED-NEXT: 
buffer_store_dword v66, off, s[0:3], s32 offset:956 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v7, v4, s[0:3], 0 offen offset:82 +; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 16, v2 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:976 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(62) +; ALIGNED-NEXT: s_waitcnt vmcnt(6) ; ALIGNED-NEXT: v_lshl_or_b32 v0, v70, 8, v69 ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:74 ; ALIGNED-NEXT: buffer_load_ubyte v3, v4, s[0:3], 0 offen offset:83 +; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:74 ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:988 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(5) ; ALIGNED-NEXT: v_lshl_or_b32 v1, v80, 8, v71 -; ALIGNED-NEXT: buffer_store_dword v65, off, s[0:3], s32 offset:952 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v70, off, s[0:3], s32 offset:968 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v80, off, s[0:3], s32 offset:980 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: s_clause 0x5 ; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:73 -; ALIGNED-NEXT: buffer_load_ubyte v80, v4, s[0:3], 0 offen offset:212 -; ALIGNED-NEXT: buffer_load_ubyte v68, v4, s[0:3], 0 offen offset:213 -; ALIGNED-NEXT: buffer_load_ubyte v70, v4, s[0:3], 0 offen offset:214 -; ALIGNED-NEXT: buffer_load_ubyte v65, v4, s[0:3], 0 offen offset:215 -; ALIGNED-NEXT: buffer_load_ubyte v66, v4, s[0:3], 0 offen offset:211 -; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:904 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:912 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:992 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:72 -; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:936 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:948 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:940 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:964 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v71, off, s[0:3], s32 offset:972 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x7 -; ALIGNED-NEXT: buffer_load_ubyte v71, v4, s[0:3], 0 offen offset:216 -; ALIGNED-NEXT: buffer_load_ubyte v67, v4, s[0:3], 0 offen offset:217 -; ALIGNED-NEXT: buffer_load_ubyte v53, v4, s[0:3], 0 offen offset:218 -; ALIGNED-NEXT: buffer_load_ubyte 
v52, v4, s[0:3], 0 offen offset:219 -; ALIGNED-NEXT: buffer_load_ubyte v69, v4, s[0:3], 0 offen offset:220 -; ALIGNED-NEXT: buffer_load_ubyte v55, v4, s[0:3], 0 offen offset:221 -; ALIGNED-NEXT: buffer_load_ubyte v54, v4, s[0:3], 0 offen offset:222 -; ALIGNED-NEXT: buffer_load_ubyte v51, v4, s[0:3], 0 offen offset:223 -; ALIGNED-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:884 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:888 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x2 -; ALIGNED-NEXT: buffer_load_ubyte v50, v4, s[0:3], 0 offen offset:208 -; ALIGNED-NEXT: buffer_load_ubyte v38, v4, s[0:3], 0 offen offset:209 -; ALIGNED-NEXT: buffer_load_ubyte v39, v4, s[0:3], 0 offen offset:210 +; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:860 ; 4-byte Folded Spill @@ -14399,83 +14445,52 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:876 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:864 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:872 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x7 -; ALIGNED-NEXT: buffer_load_ubyte v37, v4, s[0:3], 0 offen offset:224 -; ALIGNED-NEXT: buffer_load_ubyte v35, v4, s[0:3], 0 offen offset:225 -; ALIGNED-NEXT: buffer_load_ubyte v31, v4, s[0:3], 0 offen offset:226 -; ALIGNED-NEXT: buffer_load_ubyte v32, v4, s[0:3], 0 offen offset:227 -; ALIGNED-NEXT: buffer_load_ubyte v36, v4, s[0:3], 0 offen offset:228 -; ALIGNED-NEXT: buffer_load_ubyte v33, v4, s[0:3], 0 offen offset:229 -; ALIGNED-NEXT: buffer_load_ubyte v34, v4, s[0:3], 0 offen offset:230 -; ALIGNED-NEXT: buffer_load_ubyte v30, v4, s[0:3], 0 offen offset:231 -; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:924 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x4 -; ALIGNED-NEXT: buffer_load_ubyte v29, v4, s[0:3], 0 offen offset:236 -; ALIGNED-NEXT: buffer_load_ubyte v27, v4, s[0:3], 0 offen offset:237 -; ALIGNED-NEXT: buffer_load_ubyte v28, v4, s[0:3], 0 offen offset:238 -; ALIGNED-NEXT: buffer_load_ubyte v26, v4, s[0:3], 0 offen offset:239 -; ALIGNED-NEXT: buffer_load_ubyte v23, v4, s[0:3], 0 offen offset:235 -; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:804 ; 4-byte Folded 
Spill -; ALIGNED-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x2 -; ALIGNED-NEXT: buffer_load_ubyte v24, v4, s[0:3], 0 offen offset:232 -; ALIGNED-NEXT: buffer_load_ubyte v22, v4, s[0:3], 0 offen offset:233 -; ALIGNED-NEXT: buffer_load_ubyte v21, v4, s[0:3], 0 offen offset:234 -; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x7 -; ALIGNED-NEXT: buffer_load_ubyte v19, v4, s[0:3], 0 offen offset:240 -; ALIGNED-NEXT: buffer_load_ubyte v17, v4, s[0:3], 0 offen offset:241 -; ALIGNED-NEXT: buffer_load_ubyte v13, v4, s[0:3], 0 offen offset:242 -; ALIGNED-NEXT: buffer_load_ubyte v14, v4, s[0:3], 0 offen offset:243 -; ALIGNED-NEXT: buffer_load_ubyte v18, v4, s[0:3], 0 offen offset:244 -; ALIGNED-NEXT: buffer_load_ubyte v15, v4, s[0:3], 0 offen offset:245 -; ALIGNED-NEXT: buffer_load_ubyte v16, v4, s[0:3], 0 offen offset:246 -; ALIGNED-NEXT: buffer_load_ubyte v12, v4, s[0:3], 0 offen offset:247 -; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:1000 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v64, off, s[0:3], s32 offset:944 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:884 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:888 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:892 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(51) -; ALIGNED-NEXT: buffer_store_dword v104, off, s[0:3], s32 offset:1364 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(50) -; ALIGNED-NEXT: buffer_store_dword v109, off, s[0:3], s32 offset:1380 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(49) -; ALIGNED-NEXT: buffer_store_dword v110, off, s[0:3], s32 offset:1384 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(48) -; ALIGNED-NEXT: buffer_store_dword v122, off, s[0:3], s32 offset:1392 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(47) +; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:904 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:912 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:924 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:936 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v66, off, s[0:3], s32 offset:956 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:948 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:940 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v64, off, s[0:3], s32 offset:944 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v65, off, s[0:3], s32 
offset:952 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:964 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v70, off, s[0:3], s32 offset:968 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v71, off, s[0:3], s32 offset:972 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v80, off, s[0:3], s32 offset:980 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v125, off, s[0:3], s32 offset:1452 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:1000 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v8, v4, s[0:3], 0 offen offset:87 +; ALIGNED-NEXT: s_waitcnt vmcnt(7) ; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1032 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(46) -; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1040 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(44) +; ALIGNED-NEXT: s_waitcnt vmcnt(6) ; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1036 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(43) +; ALIGNED-NEXT: s_waitcnt vmcnt(5) +; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1028 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(4) +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1020 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(3) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1004 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(41) +; ALIGNED-NEXT: s_waitcnt vmcnt(2) ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:996 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(35) +; ALIGNED-NEXT: s_waitcnt vmcnt(1) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:984 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v81, 8, v2 ; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:84 -; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1020 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1028 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:81 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1008 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:80 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: v_lshl_or_b32 v62, v12, 8, v16 +; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1040 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(2) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1024 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) @@ -14748,126 +14763,259 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1332 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v7 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v100, 8, v101 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v8, 8, v6 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1396 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v5, 8, v2 +; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:158 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: 
buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:157 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1408 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:156 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1420 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1416 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1404 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x1 +; ALIGNED-NEXT: buffer_load_ubyte v126, v4, s[0:3], 0 offen offset:159 +; ALIGNED-NEXT: buffer_load_ubyte v124, v4, s[0:3], 0 offen offset:155 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: v_lshl_or_b32 v1, v126, 8, v2 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1436 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x2 +; ALIGNED-NEXT: buffer_load_ubyte v123, v4, s[0:3], 0 offen offset:152 +; ALIGNED-NEXT: buffer_load_ubyte v121, v4, s[0:3], 0 offen offset:153 +; ALIGNED-NEXT: buffer_load_ubyte v111, v4, s[0:3], 0 offen offset:154 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: v_lshl_or_b32 v0, v121, 8, v123 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: v_lshl_or_b32 v1, v124, 8, v111 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1444 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x7 +; ALIGNED-NEXT: buffer_load_ubyte v108, v4, s[0:3], 0 offen offset:160 +; ALIGNED-NEXT: buffer_load_ubyte v105, v4, s[0:3], 0 offen offset:161 +; ALIGNED-NEXT: buffer_load_ubyte v93, v4, s[0:3], 0 offen offset:162 +; ALIGNED-NEXT: buffer_load_ubyte v92, v4, s[0:3], 0 offen offset:163 +; ALIGNED-NEXT: buffer_load_ubyte v107, v4, s[0:3], 0 offen offset:164 +; ALIGNED-NEXT: buffer_load_ubyte v95, v4, s[0:3], 0 offen offset:165 +; ALIGNED-NEXT: buffer_load_ubyte v94, v4, s[0:3], 0 offen offset:166 +; ALIGNED-NEXT: buffer_load_ubyte v91, v4, s[0:3], 0 offen offset:167 +; ALIGNED-NEXT: s_waitcnt vmcnt(6) +; ALIGNED-NEXT: v_lshl_or_b32 v0, v105, 8, v108 +; ALIGNED-NEXT: s_waitcnt vmcnt(4) +; ALIGNED-NEXT: v_lshl_or_b32 v1, v92, 8, v93 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: v_lshl_or_b32 v1, v91, 8, v94 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1456 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v95, 8, v107 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1460 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x4 +; ALIGNED-NEXT: buffer_load_ubyte v89, v4, s[0:3], 0 offen offset:172 +; ALIGNED-NEXT: buffer_load_ubyte v79, v4, s[0:3], 0 offen offset:173 +; ALIGNED-NEXT: buffer_load_ubyte v78, v4, s[0:3], 0 offen offset:174 +; ALIGNED-NEXT: buffer_load_ubyte v77, v4, s[0:3], 0 offen offset:175 +; ALIGNED-NEXT: buffer_load_ubyte v75, v4, s[0:3], 0 offen offset:171 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: v_lshl_or_b32 v0, v79, 8, v89 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: v_lshl_or_b32 v1, v77, 8, v78 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1464 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x2 +; ALIGNED-NEXT: buffer_load_ubyte v74, v4, s[0:3], 0 offen offset:168 +; 
ALIGNED-NEXT: buffer_load_ubyte v72, v4, s[0:3], 0 offen offset:169 +; ALIGNED-NEXT: buffer_load_ubyte v63, v4, s[0:3], 0 offen offset:170 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: v_lshl_or_b32 v0, v72, 8, v74 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: v_lshl_or_b32 v1, v75, 8, v63 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1468 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x7 +; ALIGNED-NEXT: buffer_load_ubyte v61, v4, s[0:3], 0 offen offset:176 +; ALIGNED-NEXT: buffer_load_ubyte v59, v4, s[0:3], 0 offen offset:177 +; ALIGNED-NEXT: buffer_load_ubyte v47, v4, s[0:3], 0 offen offset:178 +; ALIGNED-NEXT: buffer_load_ubyte v56, v4, s[0:3], 0 offen offset:179 +; ALIGNED-NEXT: buffer_load_ubyte v60, v4, s[0:3], 0 offen offset:180 +; ALIGNED-NEXT: buffer_load_ubyte v57, v4, s[0:3], 0 offen offset:181 +; ALIGNED-NEXT: buffer_load_ubyte v58, v4, s[0:3], 0 offen offset:182 +; ALIGNED-NEXT: buffer_load_ubyte v46, v4, s[0:3], 0 offen offset:183 +; ALIGNED-NEXT: s_waitcnt vmcnt(6) +; ALIGNED-NEXT: v_lshl_or_b32 v0, v59, 8, v61 +; ALIGNED-NEXT: s_waitcnt vmcnt(4) +; ALIGNED-NEXT: v_lshl_or_b32 v1, v56, 8, v47 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: v_lshl_or_b32 v1, v46, 8, v58 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1472 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v57, 8, v60 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1476 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x4 +; ALIGNED-NEXT: buffer_load_ubyte v44, v4, s[0:3], 0 offen offset:188 +; ALIGNED-NEXT: buffer_load_ubyte v43, v4, s[0:3], 0 offen offset:189 +; ALIGNED-NEXT: buffer_load_ubyte v42, v4, s[0:3], 0 offen offset:190 +; ALIGNED-NEXT: buffer_load_ubyte v41, v4, s[0:3], 0 offen offset:191 +; ALIGNED-NEXT: buffer_load_ubyte v40, v4, s[0:3], 0 offen offset:187 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: v_lshl_or_b32 v0, v43, 8, v44 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: v_lshl_or_b32 v1, v41, 8, v42 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1480 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x2 +; ALIGNED-NEXT: buffer_load_ubyte v119, v4, s[0:3], 0 offen offset:184 +; ALIGNED-NEXT: buffer_load_ubyte v118, v4, s[0:3], 0 offen offset:185 +; ALIGNED-NEXT: buffer_load_ubyte v117, v4, s[0:3], 0 offen offset:186 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: v_lshl_or_b32 v0, v118, 8, v119 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: v_lshl_or_b32 v1, v40, 8, v117 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1484 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x3e +; ALIGNED-NEXT: buffer_load_ubyte v115, v4, s[0:3], 0 offen offset:192 +; ALIGNED-NEXT: buffer_load_ubyte v112, v4, s[0:3], 0 offen offset:193 +; ALIGNED-NEXT: buffer_load_ubyte v101, v4, s[0:3], 0 offen offset:194 +; ALIGNED-NEXT: buffer_load_ubyte v100, v4, s[0:3], 0 offen offset:195 +; ALIGNED-NEXT: buffer_load_ubyte v113, v4, s[0:3], 0 offen offset:196 +; ALIGNED-NEXT: buffer_load_ubyte v103, v4, s[0:3], 0 offen offset:197 +; ALIGNED-NEXT: buffer_load_ubyte v102, v4, s[0:3], 0 offen offset:198 +; ALIGNED-NEXT: buffer_load_ubyte v99, v4, s[0:3], 0 offen offset:199 +; ALIGNED-NEXT: buffer_load_ubyte v97, v4, s[0:3], 0 offen offset:204 +; ALIGNED-NEXT: 
buffer_load_ubyte v87, v4, s[0:3], 0 offen offset:205 +; ALIGNED-NEXT: buffer_load_ubyte v96, v4, s[0:3], 0 offen offset:206 +; ALIGNED-NEXT: buffer_load_ubyte v86, v4, s[0:3], 0 offen offset:207 +; ALIGNED-NEXT: buffer_load_ubyte v85, v4, s[0:3], 0 offen offset:203 +; ALIGNED-NEXT: buffer_load_ubyte v84, v4, s[0:3], 0 offen offset:200 +; ALIGNED-NEXT: buffer_load_ubyte v83, v4, s[0:3], 0 offen offset:201 +; ALIGNED-NEXT: buffer_load_ubyte v82, v4, s[0:3], 0 offen offset:202 +; ALIGNED-NEXT: buffer_load_ubyte v80, v4, s[0:3], 0 offen offset:212 +; ALIGNED-NEXT: buffer_load_ubyte v68, v4, s[0:3], 0 offen offset:213 +; ALIGNED-NEXT: buffer_load_ubyte v70, v4, s[0:3], 0 offen offset:214 +; ALIGNED-NEXT: buffer_load_ubyte v65, v4, s[0:3], 0 offen offset:215 +; ALIGNED-NEXT: buffer_load_ubyte v66, v4, s[0:3], 0 offen offset:211 +; ALIGNED-NEXT: buffer_load_ubyte v71, v4, s[0:3], 0 offen offset:216 +; ALIGNED-NEXT: buffer_load_ubyte v67, v4, s[0:3], 0 offen offset:217 +; ALIGNED-NEXT: buffer_load_ubyte v53, v4, s[0:3], 0 offen offset:218 +; ALIGNED-NEXT: buffer_load_ubyte v52, v4, s[0:3], 0 offen offset:219 +; ALIGNED-NEXT: buffer_load_ubyte v69, v4, s[0:3], 0 offen offset:220 +; ALIGNED-NEXT: buffer_load_ubyte v55, v4, s[0:3], 0 offen offset:221 +; ALIGNED-NEXT: buffer_load_ubyte v54, v4, s[0:3], 0 offen offset:222 +; ALIGNED-NEXT: buffer_load_ubyte v51, v4, s[0:3], 0 offen offset:223 +; ALIGNED-NEXT: buffer_load_ubyte v50, v4, s[0:3], 0 offen offset:208 +; ALIGNED-NEXT: buffer_load_ubyte v38, v4, s[0:3], 0 offen offset:209 +; ALIGNED-NEXT: buffer_load_ubyte v39, v4, s[0:3], 0 offen offset:210 +; ALIGNED-NEXT: buffer_load_ubyte v37, v4, s[0:3], 0 offen offset:224 +; ALIGNED-NEXT: buffer_load_ubyte v35, v4, s[0:3], 0 offen offset:225 +; ALIGNED-NEXT: buffer_load_ubyte v31, v4, s[0:3], 0 offen offset:226 +; ALIGNED-NEXT: buffer_load_ubyte v32, v4, s[0:3], 0 offen offset:227 +; ALIGNED-NEXT: buffer_load_ubyte v36, v4, s[0:3], 0 offen offset:228 +; ALIGNED-NEXT: buffer_load_ubyte v33, v4, s[0:3], 0 offen offset:229 +; ALIGNED-NEXT: buffer_load_ubyte v34, v4, s[0:3], 0 offen offset:230 +; ALIGNED-NEXT: buffer_load_ubyte v30, v4, s[0:3], 0 offen offset:231 +; ALIGNED-NEXT: buffer_load_ubyte v29, v4, s[0:3], 0 offen offset:236 +; ALIGNED-NEXT: buffer_load_ubyte v27, v4, s[0:3], 0 offen offset:237 +; ALIGNED-NEXT: buffer_load_ubyte v28, v4, s[0:3], 0 offen offset:238 +; ALIGNED-NEXT: buffer_load_ubyte v26, v4, s[0:3], 0 offen offset:239 +; ALIGNED-NEXT: buffer_load_ubyte v23, v4, s[0:3], 0 offen offset:235 +; ALIGNED-NEXT: buffer_load_ubyte v24, v4, s[0:3], 0 offen offset:232 +; ALIGNED-NEXT: buffer_load_ubyte v22, v4, s[0:3], 0 offen offset:233 +; ALIGNED-NEXT: buffer_load_ubyte v21, v4, s[0:3], 0 offen offset:234 +; ALIGNED-NEXT: buffer_load_ubyte v19, v4, s[0:3], 0 offen offset:240 +; ALIGNED-NEXT: buffer_load_ubyte v17, v4, s[0:3], 0 offen offset:241 +; ALIGNED-NEXT: buffer_load_ubyte v13, v4, s[0:3], 0 offen offset:242 +; ALIGNED-NEXT: buffer_load_ubyte v14, v4, s[0:3], 0 offen offset:243 +; ALIGNED-NEXT: buffer_load_ubyte v18, v4, s[0:3], 0 offen offset:244 +; ALIGNED-NEXT: buffer_load_ubyte v15, v4, s[0:3], 0 offen offset:245 +; ALIGNED-NEXT: buffer_load_ubyte v16, v4, s[0:3], 0 offen offset:246 +; ALIGNED-NEXT: buffer_load_ubyte v12, v4, s[0:3], 0 offen offset:247 ; ALIGNED-NEXT: buffer_load_ubyte v11, v4, s[0:3], 0 offen offset:252 ; ALIGNED-NEXT: buffer_load_ubyte v9, v4, s[0:3], 0 offen offset:253 ; ALIGNED-NEXT: buffer_load_ubyte v10, v4, s[0:3], 0 offen offset:254 ; ALIGNED-NEXT: 
buffer_load_ubyte v8, v4, s[0:3], 0 offen offset:255 ; ALIGNED-NEXT: buffer_load_ubyte v7, v4, s[0:3], 0 offen offset:251 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1396 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v5, 8, v2 -; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:158 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:157 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1408 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:156 -; ALIGNED-NEXT: s_waitcnt vmcnt(4) -; ALIGNED-NEXT: v_lshl_or_b32 v76, v8, 8, v10 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1420 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1416 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1404 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v126, 8, v2 +; ALIGNED-NEXT: buffer_load_ubyte v6, v4, s[0:3], 0 offen offset:248 +; ALIGNED-NEXT: buffer_load_ubyte v5, v4, s[0:3], 0 offen offset:249 +; ALIGNED-NEXT: s_clause 0x6 +; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:250 +; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen +; ALIGNED-NEXT: buffer_load_ubyte v120, v4, s[0:3], 0 offen offset:2 +; ALIGNED-NEXT: buffer_load_ubyte v104, v4, s[0:3], 0 offen offset:4 +; ALIGNED-NEXT: buffer_load_ubyte v109, v4, s[0:3], 0 offen offset:5 +; ALIGNED-NEXT: buffer_load_ubyte v110, v4, s[0:3], 0 offen offset:6 +; ALIGNED-NEXT: buffer_load_ubyte v122, v4, s[0:3], 0 offen offset:7 +; ALIGNED-NEXT: s_waitcnt vmcnt(62) ; ALIGNED-NEXT: v_lshl_or_b32 v2, v112, 8, v115 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v124, 8, v111 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v100, 8, v101 ; ALIGNED-NEXT: v_lshl_or_b32 v106, v3, 16, v2 ; ALIGNED-NEXT: v_lshl_or_b32 v2, v103, 8, v113 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v99, 8, v102 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1436 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v121, 8, v123 ; ALIGNED-NEXT: v_lshl_or_b32 v90, v3, 16, v2 +; ALIGNED-NEXT: s_waitcnt vmcnt(60) ; ALIGNED-NEXT: v_lshl_or_b32 v2, v87, 8, v97 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v92, 8, v93 +; ALIGNED-NEXT: s_waitcnt vmcnt(58) ; ALIGNED-NEXT: v_lshl_or_b32 v3, v86, 8, v96 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1444 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v105, 8, v108 +; ALIGNED-NEXT: s_waitcnt vmcnt(14) +; ALIGNED-NEXT: v_lshl_or_b32 v62, v12, 8, v16 +; ALIGNED-NEXT: s_waitcnt vmcnt(10) +; ALIGNED-NEXT: v_lshl_or_b32 v76, v8, 8, v10 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: buffer_store_dword v104, off, s[0:3], s32 offset:1364 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v88, v3, 16, v2 ; ALIGNED-NEXT: v_lshl_or_b32 v2, v83, 8, v84 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v85, 8, v82 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v91, 8, v94 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: buffer_store_dword v109, off, s[0:3], s32 offset:1380 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: buffer_store_dword v110, off, s[0:3], s32 offset:1384 ; 4-byte Folded Spill 
+; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v122, off, s[0:3], s32 offset:1392 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v73, v3, 16, v2 ; ALIGNED-NEXT: v_lshl_or_b32 v2, v68, 8, v80 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1456 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v95, 8, v107 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v65, 8, v70 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v77, 8, v78 ; ALIGNED-NEXT: v_lshl_or_b32 v45, v3, 16, v2 ; ALIGNED-NEXT: v_lshl_or_b32 v2, v67, 8, v71 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v52, 8, v53 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1460 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v79, 8, v89 ; ALIGNED-NEXT: v_lshl_or_b32 v116, v3, 16, v2 ; ALIGNED-NEXT: v_lshl_or_b32 v2, v55, 8, v69 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v75, 8, v63 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v51, 8, v54 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1464 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v72, 8, v74 ; ALIGNED-NEXT: v_lshl_or_b32 v114, v3, 16, v2 ; ALIGNED-NEXT: v_lshl_or_b32 v2, v38, 8, v50 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v66, 8, v39 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v56, 8, v47 ; ALIGNED-NEXT: v_lshl_or_b32 v98, v3, 16, v2 ; ALIGNED-NEXT: v_lshl_or_b32 v2, v35, 8, v37 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1468 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v59, 8, v61 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v32, 8, v31 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v46, 8, v58 ; ALIGNED-NEXT: v_lshl_or_b32 v81, v3, 16, v2 ; ALIGNED-NEXT: v_lshl_or_b32 v2, v33, 8, v36 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v30, 8, v34 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1472 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v57, 8, v60 ; ALIGNED-NEXT: v_lshl_or_b32 v64, v3, 16, v2 ; ALIGNED-NEXT: v_lshl_or_b32 v2, v27, 8, v29 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v41, 8, v42 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v26, 8, v28 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1476 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v43, 8, v44 ; ALIGNED-NEXT: v_lshl_or_b32 v49, v3, 16, v2 ; ALIGNED-NEXT: v_lshl_or_b32 v2, v22, 8, v24 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v23, 8, v21 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v40, 8, v117 ; ALIGNED-NEXT: v_lshl_or_b32 v48, v3, 16, v2 ; ALIGNED-NEXT: v_lshl_or_b32 v2, v17, 8, v19 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1480 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v118, 8, v119 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v14, 8, v13 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: s_clause 0x2 -; ALIGNED-NEXT: buffer_load_ubyte v6, v4, s[0:3], 0 offen offset:248 -; ALIGNED-NEXT: buffer_load_ubyte v5, v4, s[0:3], 0 offen offset:249 -; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:250 ; ALIGNED-NEXT: v_lshl_or_b32 v25, v3, 16, v2 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v15, 8, v18 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1484 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen ; ALIGNED-NEXT: v_lshl_or_b32 v20, v62, 16, v3 ; ALIGNED-NEXT: v_lshl_or_b32 v62, v9, 8, v11 ; ALIGNED-NEXT: v_lshl_or_b32 
v3, v76, 16, v62 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) ; ALIGNED-NEXT: v_lshl_or_b32 v62, v5, 8, v6 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) ; ALIGNED-NEXT: v_lshl_or_b32 v76, v7, 8, v1 ; ALIGNED-NEXT: v_lshl_or_b32 v2, v76, 16, v62 ; ALIGNED-NEXT: s_clause 0x1 ; ALIGNED-NEXT: buffer_load_ubyte v62, v4, s[0:3], 0 offen offset:1 ; ALIGNED-NEXT: buffer_load_ubyte v76, v4, s[0:3], 0 offen offset:3 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1336 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v120, off, s[0:3], s32 offset:1368 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) @@ -14879,34 +15027,36 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v76, 16, v62 ; ALIGNED-NEXT: v_lshl_or_b32 v62, v109, 8, v104 ; ALIGNED-NEXT: v_lshl_or_b32 v76, v122, 8, v110 -; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v104, v4, s[0:3], 0 offen offset:15 -; ALIGNED-NEXT: buffer_load_ubyte v120, v4, s[0:3], 0 offen offset:11 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1400 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x2 -; ALIGNED-NEXT: buffer_load_ubyte v122, v4, s[0:3], 0 offen offset:8 -; ALIGNED-NEXT: buffer_load_ubyte v110, v4, s[0:3], 0 offen offset:9 -; ALIGNED-NEXT: buffer_load_ubyte v109, v4, s[0:3], 0 offen offset:10 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v76, 16, v62 -; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v127, v4, s[0:3], 0 offen offset:13 -; ALIGNED-NEXT: buffer_load_ubyte v76, v4, s[0:3], 0 offen offset:14 -; ALIGNED-NEXT: buffer_store_dword v125, off, s[0:3], s32 offset:1452 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1412 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:12 -; ALIGNED-NEXT: s_waitcnt vmcnt(7) -; ALIGNED-NEXT: buffer_store_dword v104, off, s[0:3], s32 offset:1432 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: buffer_store_dword v76, off, s[0:3], s32 offset:1428 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v76, v104, 8, v76 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v62, v127, 8, v0 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1424 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_ubyte v127, v4, s[0:3], 0 offen offset:13 +; ALIGNED-NEXT: buffer_load_ubyte v76, v4, s[0:3], 0 offen offset:14 +; ALIGNED-NEXT: buffer_load_ubyte v104, v4, s[0:3], 0 offen offset:15 +; ALIGNED-NEXT: buffer_load_ubyte v120, v4, s[0:3], 0 offen offset:11 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: v_lshl_or_b32 v62, v127, 8, v0 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: buffer_store_dword v76, off, s[0:3], s32 offset:1428 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: v_lshl_or_b32 v76, v104, 8, v76 +; ALIGNED-NEXT: buffer_store_dword v104, off, s[0:3], s32 offset:1432 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v76, 16, v62 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1440 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x2 +; ALIGNED-NEXT: buffer_load_ubyte v122, v4, s[0:3], 0 offen offset:8 +; ALIGNED-NEXT: buffer_load_ubyte v110, v4, s[0:3], 0 offen offset:9 +; ALIGNED-NEXT: buffer_load_ubyte v109, v4, s[0:3], 0 offen offset:10 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) ; ALIGNED-NEXT: v_lshl_or_b32 v62, v110, 8, v122 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; 
ALIGNED-NEXT: v_lshl_or_b32 v76, v120, 8, v109 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1440 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v76, 16, v62 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1448 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x2 ; ALIGNED-NEXT: buffer_load_ubyte v62, v4, s[0:3], 0 offen offset:18 ; ALIGNED-NEXT: buffer_load_ubyte v104, v4, s[0:3], 0 offen offset:16 @@ -14915,11 +15065,10 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:492 ; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:484 ; ALIGNED-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:480 -; ALIGNED-NEXT: s_clause 0x1 ; 8-byte Folded Reload +; ALIGNED-NEXT: s_clause 0x1 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:704 ; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:708 ; ALIGNED-NEXT: v_add_nc_u32_e32 v4, 0xffffff00, v4 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1448 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(4) ; ALIGNED-NEXT: v_lshl_or_b32 v0, v125, 8, v62 ; ALIGNED-NEXT: s_waitcnt vmcnt(2) @@ -14988,8 +15137,6 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_store_dword v88, off, s[0:3], s32 offset:476 ; ALIGNED-NEXT: buffer_store_dword v90, off, s[0:3], s32 offset:468 ; ALIGNED-NEXT: buffer_store_dword v106, off, s[0:3], s32 offset:464 -; ALIGNED-NEXT: v_lshl_or_b32 v125, v0, 16, v125 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1484 ; 4-byte Folded Reload ; ALIGNED-NEXT: flat_store_byte v[2:3], v82 offset:202 ; ALIGNED-NEXT: flat_store_byte v[2:3], v85 offset:203 ; ALIGNED-NEXT: flat_store_byte v[2:3], v83 offset:201 @@ -15006,6 +15153,22 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: flat_store_byte v[2:3], v102 offset:198 ; ALIGNED-NEXT: flat_store_byte v[2:3], v113 offset:196 ; ALIGNED-NEXT: flat_store_byte v[2:3], v115 offset:192 +; ALIGNED-NEXT: v_lshl_or_b32 v125, v0, 16, v125 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1484 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_add_u32 s4, s4, 0xffffff00 +; ALIGNED-NEXT: s_addc_u32 s5, s5, -1 +; ALIGNED-NEXT: s_cmp_eq_u64 s[4:5], s[6:7] +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1480 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1476 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:548 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1472 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:544 ; ALIGNED-NEXT: flat_store_byte v[2:3], v117 offset:186 ; ALIGNED-NEXT: flat_store_byte v[2:3], v40 offset:187 ; ALIGNED-NEXT: flat_store_byte v[2:3], v118 offset:185 @@ -15022,6 +15185,18 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: flat_store_byte v[2:3], v58 offset:182 ; ALIGNED-NEXT: flat_store_byte v[2:3], v60 offset:180 ; ALIGNED-NEXT: flat_store_byte v[2:3], v61 offset:176 +; ALIGNED-NEXT: buffer_load_dword 
v0, off, s[0:3], s32 offset:1468 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1464 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1460 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1456 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; ALIGNED-NEXT: flat_store_byte v[2:3], v63 offset:170 ; ALIGNED-NEXT: flat_store_byte v[2:3], v75 offset:171 ; ALIGNED-NEXT: flat_store_byte v[2:3], v72 offset:169 @@ -15038,36 +15213,6 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: flat_store_byte v[2:3], v94 offset:166 ; ALIGNED-NEXT: flat_store_byte v[2:3], v107 offset:164 ; ALIGNED-NEXT: flat_store_byte v[2:3], v108 offset:160 -; ALIGNED-NEXT: flat_store_byte v[2:3], v111 offset:154 -; ALIGNED-NEXT: flat_store_byte v[2:3], v124 offset:155 -; ALIGNED-NEXT: flat_store_byte v[2:3], v121 offset:153 -; ALIGNED-NEXT: flat_store_byte v[2:3], v126 offset:159 -; ALIGNED-NEXT: s_add_u32 s4, s4, 0xffffff00 -; ALIGNED-NEXT: s_addc_u32 s5, s5, -1 -; ALIGNED-NEXT: s_cmp_eq_u64 s[4:5], s[6:7] -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1480 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1476 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:548 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1472 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:544 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1468 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1464 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1460 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1456 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1444 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:520 @@ -15080,6 +15225,10 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1396 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512 +; ALIGNED-NEXT: flat_store_byte v[2:3], v111 
offset:154 +; ALIGNED-NEXT: flat_store_byte v[2:3], v124 offset:155 +; ALIGNED-NEXT: flat_store_byte v[2:3], v121 offset:153 +; ALIGNED-NEXT: flat_store_byte v[2:3], v126 offset:159 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1416 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:157 @@ -15545,10 +15694,6 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:644 ; ALIGNED-NEXT: buffer_store_dword v125, off, s[0:3], s32 offset:640 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload -; ALIGNED-NEXT: flat_store_byte v[2:3], v109 offset:10 -; ALIGNED-NEXT: flat_store_byte v[2:3], v120 offset:11 -; ALIGNED-NEXT: flat_store_byte v[2:3], v127 offset:13 -; ALIGNED-NEXT: flat_store_byte v[2:3], v110 offset:9 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:26 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload @@ -15602,6 +15747,10 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1400 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:656 +; ALIGNED-NEXT: flat_store_byte v[2:3], v109 offset:10 +; ALIGNED-NEXT: flat_store_byte v[2:3], v120 offset:11 +; ALIGNED-NEXT: flat_store_byte v[2:3], v127 offset:13 +; ALIGNED-NEXT: flat_store_byte v[2:3], v110 offset:9 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1432 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:15 @@ -15639,7 +15788,7 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: s_cbranch_scc0 .LBB9_4 ; ALIGNED-NEXT: .LBB9_5: ; %Flow11 ; ALIGNED-NEXT: s_or_b32 exec_lo, exec_lo, s8 -; ALIGNED-NEXT: s_clause 0x2f ; 192-byte Folded Reload +; ALIGNED-NEXT: s_clause 0x2f ; ALIGNED-NEXT: buffer_load_dword v127, off, s[0:3], s32 ; ALIGNED-NEXT: buffer_load_dword v126, off, s[0:3], s32 offset:4 ; ALIGNED-NEXT: buffer_load_dword v125, off, s[0:3], s32 offset:8 diff --git a/llvm/test/CodeGen/AMDGPU/mixed-vmem-types.ll b/llvm/test/CodeGen/AMDGPU/mixed-vmem-types.ll index 32800488f0633..71900a4d1c1e4 100644 --- a/llvm/test/CodeGen/AMDGPU/mixed-vmem-types.ll +++ b/llvm/test/CodeGen/AMDGPU/mixed-vmem-types.ll @@ -90,19 +90,19 @@ define amdgpu_cs void @mixed_vmem_types(i32 inreg %globalTable, i32 inreg %perSh ; GFX12-GISEL-NEXT: s_load_b256 s[20:27], s[2:3], 0x40 ; GFX12-GISEL-NEXT: s_load_b512 s[36:51], s[2:3], 0x0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: image_sample_lz v1, v0, s[8:15], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 ; GFX12-GISEL-NEXT: buffer_load_b32 v2, off, s[16:19], null ; GFX12-GISEL-NEXT: buffer_load_b32 v3, off, s[20:23], null ; GFX12-GISEL-NEXT: buffer_load_b32 v4, off, s[40:43], null -; GFX12-GISEL-NEXT: image_sample_lz v1, v0, s[8:15], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 ; GFX12-GISEL-NEXT: image_sample_lz v0, v0, s[44:51], s[36:39] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 ; GFX12-GISEL-NEXT: s_wait_loadcnt 0x2 ; GFX12-GISEL-NEXT: v_cmp_eq_u32_e64 s0, 0xac0, v2 +; GFX12-GISEL-NEXT: s_wait_samplecnt 0x1 +; GFX12-GISEL-NEXT: v_cmp_eq_f32_e32 vcc_lo, 1.0, v1 ; GFX12-GISEL-NEXT: s_wait_loadcnt 0x1 ; GFX12-GISEL-NEXT: v_cmp_eq_u32_e64 s1, 0xac0, v3 ; GFX12-GISEL-NEXT: 
s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: v_cmp_eq_u32_e64 s2, 0xac0, v4 -; GFX12-GISEL-NEXT: s_wait_samplecnt 0x1 -; GFX12-GISEL-NEXT: v_cmp_eq_f32_e32 vcc_lo, 1.0, v1 ; GFX12-GISEL-NEXT: s_and_b32 s0, s0, vcc_lo ; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0 ; GFX12-GISEL-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v0 diff --git a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll index 1177474f5b4f5..78207c2cf605e 100644 --- a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll +++ b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll @@ -185,47 +185,44 @@ define amdgpu_kernel void @fadd_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX900-NEXT: v_lshlrev_b32_e32 v0, 7, v0 -; GFX900-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4 -; GFX900-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-NEXT: global_load_dwordx4 v[25:28], v0, s[0:1] offset:112 -; GFX900-NEXT: global_load_dwordx4 v[29:32], v0, s[0:1] offset:96 ; GFX900-NEXT: global_load_dwordx4 v[1:4], v0, s[0:1] offset:16 ; GFX900-NEXT: global_load_dwordx4 v[5:8], v0, s[0:1] ; GFX900-NEXT: global_load_dwordx4 v[9:12], v0, s[0:1] offset:48 ; GFX900-NEXT: global_load_dwordx4 v[13:16], v0, s[0:1] offset:32 ; GFX900-NEXT: global_load_dwordx4 v[17:20], v0, s[0:1] offset:80 ; GFX900-NEXT: global_load_dwordx4 v[21:24], v0, s[0:1] offset:64 -; GFX900-NEXT: s_waitcnt vmcnt(5) +; GFX900-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4 +; GFX900-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4 +; GFX900-NEXT: global_load_dwordx4 v[25:28], v0, s[0:1] offset:112 +; GFX900-NEXT: global_load_dwordx4 v[29:32], v0, s[0:1] offset:96 +; GFX900-NEXT: s_waitcnt vmcnt(7) lgkmcnt(0) ; GFX900-NEXT: v_add_f32_e32 v4, s43, v4 ; GFX900-NEXT: v_add_f32_e32 v3, s42, v3 ; GFX900-NEXT: v_add_f32_e32 v2, s41, v2 ; GFX900-NEXT: v_add_f32_e32 v1, s40, v1 +; GFX900-NEXT: s_waitcnt vmcnt(6) +; GFX900-NEXT: v_add_f32_e32 v8, s39, v8 +; GFX900-NEXT: v_add_f32_e32 v7, s38, v7 +; GFX900-NEXT: v_add_f32_e32 v6, s37, v6 +; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: v_add_f32_e32 v32, s19, v32 ; GFX900-NEXT: v_add_f32_e32 v31, s18, v31 ; GFX900-NEXT: v_add_f32_e32 v30, s17, v30 ; GFX900-NEXT: v_add_f32_e32 v29, s16, v29 -; GFX900-NEXT: s_waitcnt vmcnt(4) -; GFX900-NEXT: v_add_f32_e32 v8, s39, v8 -; GFX900-NEXT: v_add_f32_e32 v7, s38, v7 -; GFX900-NEXT: v_add_f32_e32 v6, s37, v6 ; GFX900-NEXT: v_add_f32_e32 v5, s36, v5 -; GFX900-NEXT: s_waitcnt vmcnt(3) ; GFX900-NEXT: v_add_f32_e32 v12, s51, v12 ; GFX900-NEXT: v_add_f32_e32 v11, s50, v11 ; GFX900-NEXT: v_add_f32_e32 v10, s49, v10 ; GFX900-NEXT: v_add_f32_e32 v9, s48, v9 -; GFX900-NEXT: s_waitcnt vmcnt(2) ; GFX900-NEXT: v_add_f32_e32 v16, s47, v16 ; GFX900-NEXT: v_add_f32_e32 v15, s46, v15 ; GFX900-NEXT: v_add_f32_e32 v14, s45, v14 ; GFX900-NEXT: v_add_f32_e32 v13, s44, v13 -; GFX900-NEXT: s_waitcnt vmcnt(1) ; GFX900-NEXT: v_add_f32_e32 v20, s15, v20 ; GFX900-NEXT: v_add_f32_e32 v19, s14, v19 ; GFX900-NEXT: v_add_f32_e32 v18, s13, v18 ; GFX900-NEXT: v_add_f32_e32 v17, s12, v17 -; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: v_add_f32_e32 v24, s11, v24 ; GFX900-NEXT: v_add_f32_e32 v23, s10, v23 ; GFX900-NEXT: v_add_f32_e32 v22, s9, v22 @@ -249,8 +246,6 @@ define amdgpu_kernel void @fadd_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; PACKED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; PACKED-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; PACKED-SDAG-NEXT: v_lshlrev_b32_e32 v32, 7, v0 -; PACKED-SDAG-NEXT: s_load_dwordx16 s[36:51], s[4:5], 
0xa4 -; PACKED-SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4 ; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; PACKED-SDAG-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1] offset:16 ; PACKED-SDAG-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] @@ -260,7 +255,9 @@ define amdgpu_kernel void @fadd_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; PACKED-SDAG-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:64 ; PACKED-SDAG-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:112 ; PACKED-SDAG-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:96 -; PACKED-SDAG-NEXT: s_waitcnt vmcnt(7) +; PACKED-SDAG-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4 +; PACKED-SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4 +; PACKED-SDAG-NEXT: s_waitcnt vmcnt(7) lgkmcnt(0) ; PACKED-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[40:41] ; PACKED-SDAG-NEXT: v_pk_add_f32 v[2:3], v[2:3], s[42:43] ; PACKED-SDAG-NEXT: s_waitcnt vmcnt(6) @@ -296,8 +293,6 @@ define amdgpu_kernel void @fadd_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; PACKED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; PACKED-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; PACKED-GISEL-NEXT: v_lshlrev_b32_e32 v32, 7, v0 -; PACKED-GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4 -; PACKED-GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4 ; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; PACKED-GISEL-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1] ; PACKED-GISEL-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] offset:16 @@ -307,7 +302,9 @@ define amdgpu_kernel void @fadd_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; PACKED-GISEL-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:80 ; PACKED-GISEL-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:96 ; PACKED-GISEL-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:112 -; PACKED-GISEL-NEXT: s_waitcnt vmcnt(7) +; PACKED-GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4 +; PACKED-GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4 +; PACKED-GISEL-NEXT: s_waitcnt vmcnt(7) lgkmcnt(0) ; PACKED-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[36:37] ; PACKED-GISEL-NEXT: v_pk_add_f32 v[2:3], v[2:3], s[38:39] ; PACKED-GISEL-NEXT: s_waitcnt vmcnt(6) @@ -343,14 +340,11 @@ define amdgpu_kernel void @fadd_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; ; GFX1250-SDAG-LABEL: fadd_v32_vs: ; GFX1250-SDAG: ; %bb.0: -; GFX1250-SDAG-NEXT: s_clause 0x2 ; GFX1250-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX1250-SDAG-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4 -; GFX1250-SDAG-NEXT: s_load_b512 s[8:23], s[4:5], 0xe4 ; GFX1250-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-SDAG-NEXT: v_dual_lshlrev_b32 v56, 7, v0 :: v_dual_mov_b32 v32, s40 +; GFX1250-SDAG-NEXT: v_lshlrev_b32_e32 v56, 7, v0 +; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX1250-SDAG-NEXT: s_clause 0x7 ; GFX1250-SDAG-NEXT: global_load_b128 v[0:3], v56, s[0:1] offset:16 ; GFX1250-SDAG-NEXT: global_load_b128 v[4:7], v56, s[0:1] offset:48 @@ -360,18 +354,22 @@ define amdgpu_kernel void @fadd_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; GFX1250-SDAG-NEXT: global_load_b128 v[20:23], v56, s[0:1] offset:96 ; GFX1250-SDAG-NEXT: global_load_b128 v[24:27], v56, s[0:1] offset:64 ; GFX1250-SDAG-NEXT: global_load_b128 v[28:31], v56, s[0:1] offset:112 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v33, s41 :: v_dual_mov_b32 v34, s42 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v35, s43 :: v_dual_mov_b32 v36, s38 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v39, s49 :: v_dual_mov_b32 v40, s50 -; GFX1250-SDAG-NEXT: 
v_dual_mov_b32 v41, s51 :: v_dual_mov_b32 v42, s44 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v37, s39 :: v_dual_mov_b32 v38, s48 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v55, s23 :: v_dual_mov_b32 v51, s11 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v52, s20 :: v_dual_mov_b32 v53, s21 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v54, s22 :: v_dual_mov_b32 v49, s15 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v50, s10 :: v_dual_mov_b32 v45, s47 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v46, s12 :: v_dual_mov_b32 v47, s13 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v48, s14 :: v_dual_mov_b32 v43, s45 -; GFX1250-SDAG-NEXT: v_mov_b32_e32 v44, s46 +; GFX1250-SDAG-NEXT: s_clause 0x1 +; GFX1250-SDAG-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4 +; GFX1250-SDAG-NEXT: s_load_b512 s[8:23], s[4:5], 0xe4 +; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v32, s40 :: v_dual_mov_b32 v33, s41 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v34, s42 :: v_dual_mov_b32 v35, s43 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v36, s38 :: v_dual_mov_b32 v39, s49 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v40, s50 :: v_dual_mov_b32 v41, s51 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v42, s44 :: v_dual_mov_b32 v37, s39 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v38, s48 :: v_dual_mov_b32 v55, s23 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v51, s11 :: v_dual_mov_b32 v52, s20 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v53, s21 :: v_dual_mov_b32 v54, s22 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v49, s15 :: v_dual_mov_b32 v50, s10 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v45, s47 :: v_dual_mov_b32 v46, s12 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v47, s13 :: v_dual_mov_b32 v48, s14 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v43, s45 :: v_dual_mov_b32 v44, s46 ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x7 ; GFX1250-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[32:33] ; GFX1250-SDAG-NEXT: v_pk_add_f32 v[2:3], v[2:3], v[34:35] @@ -411,9 +409,6 @@ define amdgpu_kernel void @fadd_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1250-GISEL-NEXT: s_clause 0x1 -; GFX1250-GISEL-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4 -; GFX1250-GISEL-NEXT: s_load_b512 s[8:23], s[4:5], 0xe4 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_lshlrev_b32_e32 v56, 7, v0 ; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 @@ -426,6 +421,10 @@ define amdgpu_kernel void @fadd_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; GFX1250-GISEL-NEXT: global_load_b128 v[20:23], v56, s[0:1] offset:80 ; GFX1250-GISEL-NEXT: global_load_b128 v[24:27], v56, s[0:1] offset:96 ; GFX1250-GISEL-NEXT: global_load_b128 v[28:31], v56, s[0:1] offset:112 +; GFX1250-GISEL-NEXT: s_clause 0x1 +; GFX1250-GISEL-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4 +; GFX1250-GISEL-NEXT: s_load_b512 s[8:23], s[4:5], 0xe4 +; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[32:33], s[36:37] ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[34:35], s[38:39] ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[36:37], s[40:41] @@ -1443,47 +1442,44 @@ define amdgpu_kernel void @fmul_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX900-NEXT: v_lshlrev_b32_e32 v0, 7, v0 -; GFX900-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4 -; GFX900-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-NEXT: global_load_dwordx4 v[25:28], v0, s[0:1] offset:112 -; GFX900-NEXT: global_load_dwordx4 v[29:32], v0, s[0:1] offset:96 ; GFX900-NEXT: global_load_dwordx4 
v[1:4], v0, s[0:1] offset:16 ; GFX900-NEXT: global_load_dwordx4 v[5:8], v0, s[0:1] ; GFX900-NEXT: global_load_dwordx4 v[9:12], v0, s[0:1] offset:48 ; GFX900-NEXT: global_load_dwordx4 v[13:16], v0, s[0:1] offset:32 ; GFX900-NEXT: global_load_dwordx4 v[17:20], v0, s[0:1] offset:80 ; GFX900-NEXT: global_load_dwordx4 v[21:24], v0, s[0:1] offset:64 -; GFX900-NEXT: s_waitcnt vmcnt(5) +; GFX900-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4 +; GFX900-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4 +; GFX900-NEXT: global_load_dwordx4 v[25:28], v0, s[0:1] offset:112 +; GFX900-NEXT: global_load_dwordx4 v[29:32], v0, s[0:1] offset:96 +; GFX900-NEXT: s_waitcnt vmcnt(7) lgkmcnt(0) ; GFX900-NEXT: v_mul_f32_e32 v4, s43, v4 ; GFX900-NEXT: v_mul_f32_e32 v3, s42, v3 ; GFX900-NEXT: v_mul_f32_e32 v2, s41, v2 ; GFX900-NEXT: v_mul_f32_e32 v1, s40, v1 +; GFX900-NEXT: s_waitcnt vmcnt(6) +; GFX900-NEXT: v_mul_f32_e32 v8, s39, v8 +; GFX900-NEXT: v_mul_f32_e32 v7, s38, v7 +; GFX900-NEXT: v_mul_f32_e32 v6, s37, v6 +; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: v_mul_f32_e32 v32, s19, v32 ; GFX900-NEXT: v_mul_f32_e32 v31, s18, v31 ; GFX900-NEXT: v_mul_f32_e32 v30, s17, v30 ; GFX900-NEXT: v_mul_f32_e32 v29, s16, v29 -; GFX900-NEXT: s_waitcnt vmcnt(4) -; GFX900-NEXT: v_mul_f32_e32 v8, s39, v8 -; GFX900-NEXT: v_mul_f32_e32 v7, s38, v7 -; GFX900-NEXT: v_mul_f32_e32 v6, s37, v6 ; GFX900-NEXT: v_mul_f32_e32 v5, s36, v5 -; GFX900-NEXT: s_waitcnt vmcnt(3) ; GFX900-NEXT: v_mul_f32_e32 v12, s51, v12 ; GFX900-NEXT: v_mul_f32_e32 v11, s50, v11 ; GFX900-NEXT: v_mul_f32_e32 v10, s49, v10 ; GFX900-NEXT: v_mul_f32_e32 v9, s48, v9 -; GFX900-NEXT: s_waitcnt vmcnt(2) ; GFX900-NEXT: v_mul_f32_e32 v16, s47, v16 ; GFX900-NEXT: v_mul_f32_e32 v15, s46, v15 ; GFX900-NEXT: v_mul_f32_e32 v14, s45, v14 ; GFX900-NEXT: v_mul_f32_e32 v13, s44, v13 -; GFX900-NEXT: s_waitcnt vmcnt(1) ; GFX900-NEXT: v_mul_f32_e32 v20, s15, v20 ; GFX900-NEXT: v_mul_f32_e32 v19, s14, v19 ; GFX900-NEXT: v_mul_f32_e32 v18, s13, v18 ; GFX900-NEXT: v_mul_f32_e32 v17, s12, v17 -; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: v_mul_f32_e32 v24, s11, v24 ; GFX900-NEXT: v_mul_f32_e32 v23, s10, v23 ; GFX900-NEXT: v_mul_f32_e32 v22, s9, v22 @@ -1507,8 +1503,6 @@ define amdgpu_kernel void @fmul_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; PACKED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; PACKED-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; PACKED-SDAG-NEXT: v_lshlrev_b32_e32 v32, 7, v0 -; PACKED-SDAG-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4 -; PACKED-SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4 ; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; PACKED-SDAG-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1] offset:16 ; PACKED-SDAG-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] @@ -1518,7 +1512,9 @@ define amdgpu_kernel void @fmul_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; PACKED-SDAG-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:64 ; PACKED-SDAG-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:112 ; PACKED-SDAG-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:96 -; PACKED-SDAG-NEXT: s_waitcnt vmcnt(7) +; PACKED-SDAG-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4 +; PACKED-SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4 +; PACKED-SDAG-NEXT: s_waitcnt vmcnt(7) lgkmcnt(0) ; PACKED-SDAG-NEXT: v_pk_mul_f32 v[0:1], v[0:1], s[40:41] ; PACKED-SDAG-NEXT: v_pk_mul_f32 v[2:3], v[2:3], s[42:43] ; PACKED-SDAG-NEXT: s_waitcnt vmcnt(6) @@ -1554,8 +1550,6 @@ define amdgpu_kernel void @fmul_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; PACKED-GISEL-NEXT: s_load_dwordx2 
s[0:1], s[4:5], 0x24 ; PACKED-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; PACKED-GISEL-NEXT: v_lshlrev_b32_e32 v32, 7, v0 -; PACKED-GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4 -; PACKED-GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4 ; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; PACKED-GISEL-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1] ; PACKED-GISEL-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] offset:16 @@ -1565,7 +1559,9 @@ define amdgpu_kernel void @fmul_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; PACKED-GISEL-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:80 ; PACKED-GISEL-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:96 ; PACKED-GISEL-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:112 -; PACKED-GISEL-NEXT: s_waitcnt vmcnt(7) +; PACKED-GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4 +; PACKED-GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4 +; PACKED-GISEL-NEXT: s_waitcnt vmcnt(7) lgkmcnt(0) ; PACKED-GISEL-NEXT: v_pk_mul_f32 v[0:1], v[0:1], s[36:37] ; PACKED-GISEL-NEXT: v_pk_mul_f32 v[2:3], v[2:3], s[38:39] ; PACKED-GISEL-NEXT: s_waitcnt vmcnt(6) @@ -1601,14 +1597,11 @@ define amdgpu_kernel void @fmul_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; ; GFX1250-SDAG-LABEL: fmul_v32_vs: ; GFX1250-SDAG: ; %bb.0: -; GFX1250-SDAG-NEXT: s_clause 0x2 ; GFX1250-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX1250-SDAG-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4 -; GFX1250-SDAG-NEXT: s_load_b512 s[8:23], s[4:5], 0xe4 ; GFX1250-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-SDAG-NEXT: v_dual_lshlrev_b32 v56, 7, v0 :: v_dual_mov_b32 v32, s40 +; GFX1250-SDAG-NEXT: v_lshlrev_b32_e32 v56, 7, v0 +; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX1250-SDAG-NEXT: s_clause 0x7 ; GFX1250-SDAG-NEXT: global_load_b128 v[0:3], v56, s[0:1] offset:16 ; GFX1250-SDAG-NEXT: global_load_b128 v[4:7], v56, s[0:1] offset:48 @@ -1618,18 +1611,22 @@ define amdgpu_kernel void @fmul_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; GFX1250-SDAG-NEXT: global_load_b128 v[20:23], v56, s[0:1] offset:96 ; GFX1250-SDAG-NEXT: global_load_b128 v[24:27], v56, s[0:1] offset:64 ; GFX1250-SDAG-NEXT: global_load_b128 v[28:31], v56, s[0:1] offset:112 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v33, s41 :: v_dual_mov_b32 v34, s42 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v35, s43 :: v_dual_mov_b32 v36, s38 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v39, s49 :: v_dual_mov_b32 v40, s50 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v41, s51 :: v_dual_mov_b32 v42, s44 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v37, s39 :: v_dual_mov_b32 v38, s48 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v55, s23 :: v_dual_mov_b32 v51, s11 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v52, s20 :: v_dual_mov_b32 v53, s21 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v54, s22 :: v_dual_mov_b32 v49, s15 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v50, s10 :: v_dual_mov_b32 v45, s47 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v46, s12 :: v_dual_mov_b32 v47, s13 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v48, s14 :: v_dual_mov_b32 v43, s45 -; GFX1250-SDAG-NEXT: v_mov_b32_e32 v44, s46 +; GFX1250-SDAG-NEXT: s_clause 0x1 +; GFX1250-SDAG-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4 +; GFX1250-SDAG-NEXT: s_load_b512 s[8:23], s[4:5], 0xe4 +; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v32, s40 :: v_dual_mov_b32 v33, s41 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v34, s42 :: v_dual_mov_b32 v35, s43 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v36, s38 :: v_dual_mov_b32 v39, s49 +; GFX1250-SDAG-NEXT: 
v_dual_mov_b32 v40, s50 :: v_dual_mov_b32 v41, s51 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v42, s44 :: v_dual_mov_b32 v37, s39 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v38, s48 :: v_dual_mov_b32 v55, s23 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v51, s11 :: v_dual_mov_b32 v52, s20 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v53, s21 :: v_dual_mov_b32 v54, s22 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v49, s15 :: v_dual_mov_b32 v50, s10 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v45, s47 :: v_dual_mov_b32 v46, s12 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v47, s13 :: v_dual_mov_b32 v48, s14 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v43, s45 :: v_dual_mov_b32 v44, s46 ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x7 ; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[0:1], v[0:1], v[32:33] ; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[2:3], v[2:3], v[34:35] @@ -1669,9 +1666,6 @@ define amdgpu_kernel void @fmul_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1250-GISEL-NEXT: s_clause 0x1 -; GFX1250-GISEL-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4 -; GFX1250-GISEL-NEXT: s_load_b512 s[8:23], s[4:5], 0xe4 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_lshlrev_b32_e32 v56, 7, v0 ; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 @@ -1684,6 +1678,10 @@ define amdgpu_kernel void @fmul_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; GFX1250-GISEL-NEXT: global_load_b128 v[20:23], v56, s[0:1] offset:80 ; GFX1250-GISEL-NEXT: global_load_b128 v[24:27], v56, s[0:1] offset:96 ; GFX1250-GISEL-NEXT: global_load_b128 v[28:31], v56, s[0:1] offset:112 +; GFX1250-GISEL-NEXT: s_clause 0x1 +; GFX1250-GISEL-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4 +; GFX1250-GISEL-NEXT: s_load_b512 s[8:23], s[4:5], 0xe4 +; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[32:33], s[36:37] ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[34:35], s[38:39] ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[36:37], s[40:41] @@ -2275,47 +2273,44 @@ define amdgpu_kernel void @fma_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; GFX900: ; %bb.0: ; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX900-NEXT: v_lshlrev_b32_e32 v0, 7, v0 -; GFX900-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4 -; GFX900-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-NEXT: global_load_dwordx4 v[25:28], v0, s[0:1] offset:112 -; GFX900-NEXT: global_load_dwordx4 v[29:32], v0, s[0:1] offset:96 ; GFX900-NEXT: global_load_dwordx4 v[1:4], v0, s[0:1] offset:16 ; GFX900-NEXT: global_load_dwordx4 v[5:8], v0, s[0:1] ; GFX900-NEXT: global_load_dwordx4 v[9:12], v0, s[0:1] offset:48 ; GFX900-NEXT: global_load_dwordx4 v[13:16], v0, s[0:1] offset:32 ; GFX900-NEXT: global_load_dwordx4 v[17:20], v0, s[0:1] offset:80 ; GFX900-NEXT: global_load_dwordx4 v[21:24], v0, s[0:1] offset:64 -; GFX900-NEXT: s_waitcnt vmcnt(5) +; GFX900-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4 +; GFX900-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4 +; GFX900-NEXT: global_load_dwordx4 v[25:28], v0, s[0:1] offset:112 +; GFX900-NEXT: global_load_dwordx4 v[29:32], v0, s[0:1] offset:96 +; GFX900-NEXT: s_waitcnt vmcnt(7) lgkmcnt(0) ; GFX900-NEXT: v_fma_f32 v4, v4, s43, s43 ; GFX900-NEXT: v_fma_f32 v3, v3, s42, s42 ; GFX900-NEXT: v_fma_f32 v2, v2, s41, s41 ; GFX900-NEXT: v_fma_f32 v1, v1, s40, s40 +; GFX900-NEXT: s_waitcnt vmcnt(6) +; GFX900-NEXT: v_fma_f32 v8, v8, s39, s39 +; GFX900-NEXT: v_fma_f32 v7, v7, s38, s38 +; GFX900-NEXT: v_fma_f32 v6, v6, s37, s37 +; GFX900-NEXT: s_waitcnt 
vmcnt(0) ; GFX900-NEXT: v_fma_f32 v32, v32, s19, s19 ; GFX900-NEXT: v_fma_f32 v31, v31, s18, s18 ; GFX900-NEXT: v_fma_f32 v30, v30, s17, s17 ; GFX900-NEXT: v_fma_f32 v29, v29, s16, s16 -; GFX900-NEXT: s_waitcnt vmcnt(4) -; GFX900-NEXT: v_fma_f32 v8, v8, s39, s39 -; GFX900-NEXT: v_fma_f32 v7, v7, s38, s38 -; GFX900-NEXT: v_fma_f32 v6, v6, s37, s37 ; GFX900-NEXT: v_fma_f32 v5, v5, s36, s36 -; GFX900-NEXT: s_waitcnt vmcnt(3) ; GFX900-NEXT: v_fma_f32 v12, v12, s51, s51 ; GFX900-NEXT: v_fma_f32 v11, v11, s50, s50 ; GFX900-NEXT: v_fma_f32 v10, v10, s49, s49 ; GFX900-NEXT: v_fma_f32 v9, v9, s48, s48 -; GFX900-NEXT: s_waitcnt vmcnt(2) ; GFX900-NEXT: v_fma_f32 v16, v16, s47, s47 ; GFX900-NEXT: v_fma_f32 v15, v15, s46, s46 ; GFX900-NEXT: v_fma_f32 v14, v14, s45, s45 ; GFX900-NEXT: v_fma_f32 v13, v13, s44, s44 -; GFX900-NEXT: s_waitcnt vmcnt(1) ; GFX900-NEXT: v_fma_f32 v20, v20, s15, s15 ; GFX900-NEXT: v_fma_f32 v19, v19, s14, s14 ; GFX900-NEXT: v_fma_f32 v18, v18, s13, s13 ; GFX900-NEXT: v_fma_f32 v17, v17, s12, s12 -; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: v_fma_f32 v24, v24, s11, s11 ; GFX900-NEXT: v_fma_f32 v23, v23, s10, s10 ; GFX900-NEXT: v_fma_f32 v22, v22, s9, s9 @@ -2339,8 +2334,6 @@ define amdgpu_kernel void @fma_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; PACKED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; PACKED-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; PACKED-SDAG-NEXT: v_lshlrev_b32_e32 v32, 7, v0 -; PACKED-SDAG-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4 -; PACKED-SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4 ; PACKED-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; PACKED-SDAG-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1] offset:16 ; PACKED-SDAG-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] @@ -2350,7 +2343,9 @@ define amdgpu_kernel void @fma_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; PACKED-SDAG-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:64 ; PACKED-SDAG-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:112 ; PACKED-SDAG-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:96 -; PACKED-SDAG-NEXT: s_waitcnt vmcnt(7) +; PACKED-SDAG-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4 +; PACKED-SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4 +; PACKED-SDAG-NEXT: s_waitcnt vmcnt(7) lgkmcnt(0) ; PACKED-SDAG-NEXT: v_pk_fma_f32 v[0:1], v[0:1], s[40:41], s[40:41] ; PACKED-SDAG-NEXT: v_pk_fma_f32 v[2:3], v[2:3], s[42:43], s[42:43] ; PACKED-SDAG-NEXT: s_waitcnt vmcnt(6) @@ -2386,8 +2381,6 @@ define amdgpu_kernel void @fma_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; PACKED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; PACKED-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; PACKED-GISEL-NEXT: v_lshlrev_b32_e32 v32, 7, v0 -; PACKED-GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4 -; PACKED-GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4 ; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; PACKED-GISEL-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1] ; PACKED-GISEL-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] offset:16 @@ -2397,7 +2390,9 @@ define amdgpu_kernel void @fma_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; PACKED-GISEL-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:80 ; PACKED-GISEL-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:96 ; PACKED-GISEL-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:112 -; PACKED-GISEL-NEXT: s_waitcnt vmcnt(7) +; PACKED-GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4 +; PACKED-GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4 +; PACKED-GISEL-NEXT: s_waitcnt vmcnt(7) lgkmcnt(0) ; PACKED-GISEL-NEXT: v_pk_fma_f32 
v[0:1], v[0:1], s[36:37], s[36:37] ; PACKED-GISEL-NEXT: v_pk_fma_f32 v[2:3], v[2:3], s[38:39], s[38:39] ; PACKED-GISEL-NEXT: s_waitcnt vmcnt(6) @@ -2435,9 +2430,6 @@ define amdgpu_kernel void @fma_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1250-SDAG-NEXT: s_clause 0x1 -; GFX1250-SDAG-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4 -; GFX1250-SDAG-NEXT: s_load_b512 s[8:23], s[4:5], 0xe4 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_lshlrev_b32_e32 v56, 7, v0 ; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 @@ -2450,6 +2442,10 @@ define amdgpu_kernel void @fma_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; GFX1250-SDAG-NEXT: global_load_b128 v[20:23], v56, s[0:1] offset:96 ; GFX1250-SDAG-NEXT: global_load_b128 v[24:27], v56, s[0:1] offset:64 ; GFX1250-SDAG-NEXT: global_load_b128 v[28:31], v56, s[0:1] offset:112 +; GFX1250-SDAG-NEXT: s_clause 0x1 +; GFX1250-SDAG-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4 +; GFX1250-SDAG-NEXT: s_load_b512 s[8:23], s[4:5], 0xe4 +; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[32:33], s[40:41] ; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[34:35], s[42:43] ; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[40:41], s[50:51] @@ -2500,9 +2496,6 @@ define amdgpu_kernel void @fma_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1250-GISEL-NEXT: s_clause 0x1 -; GFX1250-GISEL-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4 -; GFX1250-GISEL-NEXT: s_load_b512 s[8:23], s[4:5], 0xe4 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_lshlrev_b32_e32 v56, 7, v0 ; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 @@ -2515,6 +2508,10 @@ define amdgpu_kernel void @fma_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; GFX1250-GISEL-NEXT: global_load_b128 v[20:23], v56, s[0:1] offset:80 ; GFX1250-GISEL-NEXT: global_load_b128 v[24:27], v56, s[0:1] offset:96 ; GFX1250-GISEL-NEXT: global_load_b128 v[28:31], v56, s[0:1] offset:112 +; GFX1250-GISEL-NEXT: s_clause 0x1 +; GFX1250-GISEL-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4 +; GFX1250-GISEL-NEXT: s_load_b512 s[8:23], s[4:5], 0xe4 +; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[32:33], s[36:37] ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[34:35], s[38:39] ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[36:37], s[40:41] diff --git a/llvm/test/CodeGen/AMDGPU/postra-bundle-memops.mir b/llvm/test/CodeGen/AMDGPU/postra-bundle-memops.mir index 025d9e63436d7..d0d5cc11994af 100644 --- a/llvm/test/CodeGen/AMDGPU/postra-bundle-memops.mir +++ b/llvm/test/CodeGen/AMDGPU/postra-bundle-memops.mir @@ -56,11 +56,11 @@ body: | ; GCN-NEXT: BUFFER_STORE_DWORD_ADDR64 $vgpr0, $vgpr2_vgpr3, undef $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec ; GCN-NEXT: BUFFER_STORE_DWORD_ADDR64 $vgpr0, $vgpr2_vgpr3, undef $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec ; GCN-NEXT: } - ; GCN-NEXT: BUNDLE implicit-def $vgpr2, implicit-def $vgpr3, implicit undef $vgpr4_vgpr5_vgpr6_vgpr7, implicit undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $exec :: (load (s32)) { + ; GCN-NEXT: BUNDLE implicit-def $vgpr2, implicit-def $vgpr3, implicit undef $vgpr4_vgpr5_vgpr6_vgpr7, implicit undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $exec { ; GCN-NEXT: $vgpr2 = IMAGE_LOAD_V1_V4 undef $vgpr4_vgpr5_vgpr6_vgpr7, undef 
$sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 2, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s32)) ; GCN-NEXT: $vgpr3 = IMAGE_LOAD_V1_V4 undef $vgpr4_vgpr5_vgpr6_vgpr7, undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 2, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load (s32)) ; GCN-NEXT: } - ; GCN-NEXT: BUNDLE implicit undef $vgpr0_vgpr1_vgpr2_vgpr3, implicit $vgpr0_vgpr1, implicit undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $exec :: (store (s128)) { + ; GCN-NEXT: BUNDLE implicit undef $vgpr0_vgpr1_vgpr2_vgpr3, implicit $vgpr0_vgpr1, implicit undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $exec { ; GCN-NEXT: IMAGE_STORE_V4_V2 undef $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr0_vgpr1, undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 15, -1, 1, 0, 0, 0, 0, 0, implicit $exec :: (store (s128)) ; GCN-NEXT: IMAGE_STORE_V4_V2 undef $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr0_vgpr1, undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 15, -1, 1, 0, 0, 0, 0, 0, implicit $exec :: (store (s128)) ; GCN-NEXT: } @@ -359,7 +359,6 @@ tracksRegLiveness: true body: | bb.0: - ; GCN-LABLE: name: no_sched_barrier_within_bundle ; GCN-LABEL: name: no_sched_barrier_within_bundle ; GCN: renamable $sgpr0_sgpr1 = IMPLICIT_DEF ; GCN-NEXT: renamable $vgpr0 = IMPLICIT_DEF ; GCN-NEXT: BUNDLE implicit-def $vgpr1, implicit-def $vgpr1_lo16, implicit-def $vgpr1_hi16, implicit-def $vgpr2, implicit-def $vgpr2_lo16, implicit-def $vgpr2_hi16, implicit $sgpr0_sgpr1, implicit $vgpr0, implicit $exec { diff --git a/llvm/test/CodeGen/AMDGPU/postra-bundle-vimage-vsample-gfx12.mir b/llvm/test/CodeGen/AMDGPU/postra-bundle-vimage-vsample-gfx12.mir index e0266b9f1a5b0..5fea0aee72ec7 100644 --- a/llvm/test/CodeGen/AMDGPU/postra-bundle-vimage-vsample-gfx12.mir +++ b/llvm/test/CodeGen/AMDGPU/postra-bundle-vimage-vsample-gfx12.mir @@ -9,7 +9,7 @@ body: | ; GFX12-LABEL: name: post_bundle_vimage ; GFX12: liveins: $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: BUNDLE implicit-def $vgpr5, implicit-def $vgpr4, implicit killed $vgpr1, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $exec :: (dereferenceable invariant load (s32), addrspace 8) { + ; GFX12-NEXT: BUNDLE implicit-def $vgpr5, implicit-def $vgpr4, implicit killed $vgpr1, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $exec { ; GFX12-NEXT: $vgpr5 = IMAGE_LOAD_V1_V1_gfx12 $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 2, 1, 0, 0, -1, 0, 0, implicit $exec :: (dereferenceable invariant load (s32), addrspace 8) ; GFX12-NEXT: $vgpr4 = IMAGE_LOAD_V1_V1_gfx12 killed $vgpr1, killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 1, 1, 0, 0, -1, 0, 0, implicit $exec :: (dereferenceable invariant load (s32), addrspace 8) ; GFX12-NEXT: } @@ -25,7 +25,7 @@ body: | ; GFX12-LABEL: name: post_bundle_vsample ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: BUNDLE implicit-def $vgpr6_vgpr7_vgpr8_vgpr9, implicit-def $vgpr10_vgpr11_vgpr12_vgpr13, implicit killed $vgpr0, implicit killed $vgpr1,
implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec, implicit killed $vgpr2, implicit killed $vgpr3 { ; GFX12-NEXT: $vgpr6_vgpr7_vgpr8_vgpr9 = IMAGE_SAMPLE_V4_V2_gfx12 killed $vgpr0, killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 15, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), addrspace 8) ; GFX12-NEXT: $vgpr10_vgpr11_vgpr12_vgpr13 = IMAGE_SAMPLE_V4_V2_gfx12 killed $vgpr2, killed $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 15, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), addrspace 8) ; GFX12-NEXT: } diff --git a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll index b91bdd2b2fa71..85a9aba1a0e51 100644 --- a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll @@ -398,11 +398,11 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) { ; GFX8-NEXT: flat_load_dwordx2 v[18:19], v[4:5] ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0xffffc800, v2 ; GFX8-NEXT: v_addc_u32_e32 v7, vcc, -1, v3, vcc +; GFX8-NEXT: flat_load_dwordx2 v[6:7], v[6:7] ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xffffd000, v2 ; GFX8-NEXT: v_addc_u32_e32 v5, vcc, -1, v3, vcc ; GFX8-NEXT: v_add_u32_e32 v20, vcc, 0xffffd800, v2 ; GFX8-NEXT: v_addc_u32_e32 v21, vcc, -1, v3, vcc -; GFX8-NEXT: flat_load_dwordx2 v[6:7], v[6:7] ; GFX8-NEXT: v_add_u32_e32 v22, vcc, 0xffffe000, v2 ; GFX8-NEXT: v_addc_u32_e32 v23, vcc, -1, v3, vcc ; GFX8-NEXT: flat_load_dwordx2 v[8:9], v[4:5] @@ -514,8 +514,10 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) { ; GFX900-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX900-NEXT: v_add_co_u32_e32 v8, vcc, 0xffffb000, v2 ; GFX900-NEXT: v_addc_co_u32_e32 v9, vcc, -1, v3, vcc -; GFX900-NEXT: global_load_dwordx2 v[8:9], v[8:9], off +; GFX900-NEXT: global_load_dwordx2 v[10:11], v[2:3], off offset:-4096 +; GFX900-NEXT: global_load_dwordx2 v[12:13], v[2:3], off offset:-2048 ; GFX900-NEXT: v_add_co_u32_e32 v14, vcc, 0xffffc000, v2 +; GFX900-NEXT: global_load_dwordx2 v[8:9], v[8:9], off ; GFX900-NEXT: v_addc_co_u32_e32 v15, vcc, -1, v3, vcc ; GFX900-NEXT: global_load_dwordx2 v[18:19], v[14:15], off offset:-2048 ; GFX900-NEXT: global_load_dwordx2 v[20:21], v[14:15], off @@ -524,15 +526,13 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) { ; GFX900-NEXT: v_add_co_u32_e32 v14, vcc, s3, v2 ; GFX900-NEXT: global_load_dwordx2 v[16:17], v[16:17], off offset:-2048 ; GFX900-NEXT: v_addc_co_u32_e32 v15, vcc, -1, v3, vcc -; GFX900-NEXT: global_load_dwordx2 v[10:11], v[2:3], off offset:-4096 -; GFX900-NEXT: global_load_dwordx2 v[12:13], v[2:3], off offset:-2048 ; GFX900-NEXT: s_addk_i32 s5, 0x2000 ; GFX900-NEXT: s_cmp_gt_u32 s5, 0x3fffff -; GFX900-NEXT: s_waitcnt vmcnt(5) +; GFX900-NEXT: s_waitcnt vmcnt(3) ; GFX900-NEXT: v_add_co_u32_e32 v22, vcc, v8, v4 ; GFX900-NEXT: v_addc_co_u32_e32 v5, vcc, v9, v5, vcc ; GFX900-NEXT: global_load_dwordx2 v[8:9], v[14:15], off offset:-4096 -; GFX900-NEXT: s_waitcnt vmcnt(5) +; GFX900-NEXT: s_waitcnt vmcnt(3) ; GFX900-NEXT: v_add_co_u32_e64 v24, s[0:1], v18, v22 ; GFX900-NEXT: v_addc_co_u32_e64 v25, s[0:1], v19, v5, s[0:1] ; GFX900-NEXT: global_load_dwordx2 v[18:19], v[14:15], off offset:-2048 @@ -540,13 +540,13 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) { ; GFX900-NEXT: v_add_co_u32_e32 
v4, vcc, s4, v2 ; GFX900-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v3, vcc ; GFX900-NEXT: global_load_dwordx2 v[4:5], v[4:5], off offset:-2048 -; GFX900-NEXT: s_waitcnt vmcnt(7) +; GFX900-NEXT: s_waitcnt vmcnt(5) ; GFX900-NEXT: v_add_co_u32_e32 v20, vcc, v20, v24 ; GFX900-NEXT: global_load_dwordx2 v[14:15], v[2:3], off ; GFX900-NEXT: v_addc_co_u32_e32 v21, vcc, v21, v25, vcc ; GFX900-NEXT: v_add_co_u32_e32 v2, vcc, 0x10000, v2 ; GFX900-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX900-NEXT: s_waitcnt vmcnt(7) +; GFX900-NEXT: s_waitcnt vmcnt(5) ; GFX900-NEXT: v_add_co_u32_e32 v16, vcc, v16, v20 ; GFX900-NEXT: v_addc_co_u32_e32 v17, vcc, v17, v21, vcc ; GFX900-NEXT: s_waitcnt vmcnt(4) @@ -734,8 +734,10 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) { ; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX90A-NEXT: v_add_co_u32_e32 v12, vcc, 0xffffb000, v6 ; GFX90A-NEXT: v_addc_co_u32_e32 v13, vcc, -1, v7, vcc -; GFX90A-NEXT: global_load_dwordx2 v[12:13], v[12:13], off +; GFX90A-NEXT: global_load_dwordx2 v[8:9], v[6:7], off offset:-4096 +; GFX90A-NEXT: global_load_dwordx2 v[10:11], v[6:7], off offset:-2048 ; GFX90A-NEXT: v_add_co_u32_e32 v14, vcc, 0xffffc000, v6 +; GFX90A-NEXT: global_load_dwordx2 v[12:13], v[12:13], off ; GFX90A-NEXT: v_addc_co_u32_e32 v15, vcc, -1, v7, vcc ; GFX90A-NEXT: global_load_dwordx2 v[18:19], v[14:15], off offset:-2048 ; GFX90A-NEXT: global_load_dwordx2 v[20:21], v[14:15], off @@ -751,42 +753,39 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) { ; GFX90A-NEXT: v_addc_co_u32_e32 v23, vcc, -1, v7, vcc ; GFX90A-NEXT: global_load_dwordx2 v[14:15], v[22:23], off offset:-2048 ; GFX90A-NEXT: global_load_dwordx2 v[30:31], v[6:7], off -; GFX90A-NEXT: global_load_dwordx2 v[8:9], v[6:7], off offset:-4096 -; GFX90A-NEXT: global_load_dwordx2 v[10:11], v[6:7], off offset:-2048 ; GFX90A-NEXT: v_add_co_u32_e32 v6, vcc, 0x10000, v6 ; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc ; GFX90A-NEXT: s_addk_i32 s3, 0x2000 ; GFX90A-NEXT: s_cmp_gt_u32 s3, 0x3fffff -; GFX90A-NEXT: s_waitcnt vmcnt(10) +; GFX90A-NEXT: s_waitcnt vmcnt(8) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v12, v4 ; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v13, v5, vcc -; GFX90A-NEXT: s_waitcnt vmcnt(9) +; GFX90A-NEXT: s_waitcnt vmcnt(7) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v18, v4 ; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v19, v5, vcc -; GFX90A-NEXT: s_waitcnt vmcnt(8) +; GFX90A-NEXT: s_waitcnt vmcnt(6) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v20, v4 ; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v21, v5, vcc -; GFX90A-NEXT: s_waitcnt vmcnt(7) +; GFX90A-NEXT: s_waitcnt vmcnt(5) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v16, v4 ; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v17, v5, vcc -; GFX90A-NEXT: s_waitcnt vmcnt(6) +; GFX90A-NEXT: s_waitcnt vmcnt(4) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v24, v4 ; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v25, v5, vcc -; GFX90A-NEXT: s_waitcnt vmcnt(5) +; GFX90A-NEXT: s_waitcnt vmcnt(3) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v26, v4 ; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v27, v5, vcc -; GFX90A-NEXT: s_waitcnt vmcnt(4) +; GFX90A-NEXT: s_waitcnt vmcnt(2) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v28, v4 ; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v29, v5, vcc -; GFX90A-NEXT: s_waitcnt vmcnt(3) +; GFX90A-NEXT: s_waitcnt vmcnt(1) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v14, v4 ; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v15, v5, vcc -; GFX90A-NEXT: s_waitcnt vmcnt(1) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v8, v4 ; GFX90A-NEXT: 
v_addc_co_u32_e32 v5, vcc, v9, v5, vcc -; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v10, v4 ; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v11, v5, vcc +; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v30, v4 ; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v31, v5, vcc ; GFX90A-NEXT: s_cbranch_scc0 .LBB1_2 diff --git a/llvm/test/CodeGen/AMDGPU/scratch-simple.ll b/llvm/test/CodeGen/AMDGPU/scratch-simple.ll index fb9c47731eb42..7a3bff8aed56e 100644 --- a/llvm/test/CodeGen/AMDGPU/scratch-simple.ll +++ b/llvm/test/CodeGen/AMDGPU/scratch-simple.ll @@ -28,29 +28,15 @@ define amdgpu_ps float @ps_main(i32 %idx) { ; SI-NEXT: s_mov_b32 s7, 0xe8f000 ; SI-NEXT: s_add_u32 s4, s4, s0 ; SI-NEXT: s_addc_u32 s5, s5, 0 -; SI-NEXT: v_mov_b32_e32 v9, 0xbe31934f -; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v9, 0xb7043519 -; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v9, 0xb702e758 ; SI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4 -; SI-NEXT: v_mov_b32_e32 v6, 0x3f638e37 -; SI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc -; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:256 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v9, 0x3e31934f -; SI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8 -; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c -; SI-NEXT: v_mov_b32_e32 v16, 0xbf638e39 -; SI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 -; SI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c -; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320 ; SI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e ; SI-NEXT: v_mov_b32_e32 v4, 0x3f523be1 ; SI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2 +; SI-NEXT: v_mov_b32_e32 v6, 0x3f638e37 +; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320 +; SI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316 +; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312 +; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308 ; SI-NEXT: buffer_store_dword v6, off, s[4:7], 0 offset:304 ; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:300 ; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:296 @@ -58,8 +44,27 @@ define amdgpu_ps float @ps_main(i32 %idx) { ; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:288 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3 +; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f +; SI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc +; SI-NEXT: v_mov_b32_e32 v9, 0xbe31934f +; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280 ; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:276 ; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:272 +; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v9, 0xb7043519 +; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v9, 0xb702e758 +; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:256 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v9, 0x3e31934f +; SI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8 +; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v8, 0xbe319356 ; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:252 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v9, 0x3e319356 @@ -69,40 +74,49 @@ define amdgpu_ps float 
@ps_main(i32 %idx) { ; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:236 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f +; SI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 ; SI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e ; SI-NEXT: v_mov_b32_e32 v14, 0xbf523be3 ; SI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 +; SI-NEXT: v_mov_b32_e32 v16, 0xbf638e39 +; SI-NEXT: v_and_b32_e32 v0, 0x1fc, v0 +; SI-NEXT: s_mov_b32 s0, 0 +; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268 +; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248 +; SI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240 +; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232 +; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228 +; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224 +; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:220 +; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216 ; SI-NEXT: buffer_store_dword v16, off, s[4:7], 0 offset:212 ; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:208 ; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:204 ; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:200 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x200, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0 +; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:196 +; SI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 +; SI-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen ; SI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:832 ; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:828 ; SI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:824 ; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:820 ; SI-NEXT: s_waitcnt expcnt(3) ; SI-NEXT: v_mov_b32_e32 v17, 0x3703c499 -; SI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:796 -; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:792 -; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:788 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_mov_b32_e32 v18, 0xbf523be1 -; SI-NEXT: v_and_b32_e32 v0, 0x1fc, v0 -; SI-NEXT: s_mov_b32 s0, 0 -; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308 -; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f -; SI-NEXT: v_mov_b32_e32 v8, 0xbe319356 -; SI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 -; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:220 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 +; SI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c ; SI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:816 ; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:812 ; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:808 ; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:804 ; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:800 +; SI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:796 +; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:792 +; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:788 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mov_b32_e32 v18, 0xbf523be1 ; SI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:784 ; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:780 ; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:776 @@ -111,19 +125,6 @@ define amdgpu_ps float @ps_main(i32 %idx) { ; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:764 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2 -; SI-NEXT: v_add_i32_e32 v1, 
vcc, 0x200, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0 -; SI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316 -; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312 -; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280 -; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268 -; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248 -; SI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240 -; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232 -; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228 -; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224 -; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216 -; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:196 ; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:760 ; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:756 ; SI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:752 @@ -138,7 +139,6 @@ define amdgpu_ps float @ps_main(i32 %idx) { ; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:716 ; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:712 ; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:708 -; SI-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen ; SI-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_f32_e32 v0, v0, v1 @@ -153,70 +153,84 @@ define amdgpu_ps float @ps_main(i32 %idx) { ; VI-NEXT: s_mov_b32 s7, 0xe80000 ; VI-NEXT: s_add_u32 s4, s4, s0 ; VI-NEXT: s_addc_u32 s5, s5, 0 -; VI-NEXT: v_mov_b32_e32 v9, 0xbe31934f -; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264 -; VI-NEXT: v_mov_b32_e32 v9, 0xb7043519 -; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260 -; VI-NEXT: v_mov_b32_e32 v9, 0xb702e758 ; VI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4 -; VI-NEXT: v_mov_b32_e32 v6, 0x3f638e37 -; VI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc -; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:256 -; VI-NEXT: v_mov_b32_e32 v9, 0x3e31934f -; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8 -; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c -; VI-NEXT: v_mov_b32_e32 v16, 0xbf638e39 -; VI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 -; VI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c -; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320 ; VI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e ; VI-NEXT: v_mov_b32_e32 v4, 0x3f523be1 ; VI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2 +; VI-NEXT: v_mov_b32_e32 v6, 0x3f638e37 +; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320 +; VI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316 +; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312 +; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308 ; VI-NEXT: buffer_store_dword v6, off, s[4:7], 0 offset:304 ; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:300 ; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:296 ; VI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:292 ; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:288 ; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3 +; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284 +; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f +; VI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc +; VI-NEXT: v_mov_b32_e32 v9, 0xbe31934f +; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280 ; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:276 ; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:272 +; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264 +; VI-NEXT: v_mov_b32_e32 v9, 0xb7043519 +; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260 +; 
VI-NEXT: v_mov_b32_e32 v9, 0xb702e758 +; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:256 +; VI-NEXT: v_mov_b32_e32 v9, 0x3e31934f +; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8 +; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c +; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; VI-NEXT: v_mov_b32_e32 v8, 0xbe319356 ; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:252 ; VI-NEXT: v_mov_b32_e32 v9, 0x3e319356 ; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:244 ; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29dc ; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:236 ; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f +; VI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 ; VI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e ; VI-NEXT: v_mov_b32_e32 v14, 0xbf523be3 ; VI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 +; VI-NEXT: v_mov_b32_e32 v16, 0xbf638e39 +; VI-NEXT: v_and_b32_e32 v0, 0x1fc, v0 +; VI-NEXT: s_mov_b32 s0, 0 +; VI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268 +; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248 +; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240 +; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232 +; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228 +; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224 +; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:220 +; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216 ; VI-NEXT: buffer_store_dword v16, off, s[4:7], 0 offset:212 ; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:208 ; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:204 ; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:200 +; VI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x200, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:196 +; VI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 +; VI-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen ; VI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:832 ; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:828 ; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:824 ; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:820 ; VI-NEXT: v_mov_b32_e32 v17, 0x3703c499 -; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:796 -; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:792 -; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:788 -; VI-NEXT: v_mov_b32_e32 v18, 0xbf523be1 -; VI-NEXT: v_and_b32_e32 v0, 0x1fc, v0 -; VI-NEXT: s_mov_b32 s0, 0 -; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308 -; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284 -; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f -; VI-NEXT: v_mov_b32_e32 v8, 0xbe319356 -; VI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 -; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:220 -; VI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 +; VI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c ; VI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:816 ; VI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:812 ; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:808 ; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:804 ; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:800 +; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:796 +; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:792 +; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:788 +; VI-NEXT: v_mov_b32_e32 v18, 0xbf523be1 ; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:784 ; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:780 ; VI-NEXT: 
buffer_store_dword v15, off, s[4:7], 0 offset:776 @@ -224,19 +238,6 @@ define amdgpu_ps float @ps_main(i32 %idx) { ; VI-NEXT: buffer_store_dword v16, off, s[4:7], 0 offset:768 ; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:764 ; VI-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2 -; VI-NEXT: v_add_u32_e32 v1, vcc, 0x200, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 -; VI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316 -; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312 -; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280 -; VI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268 -; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248 -; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240 -; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232 -; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228 -; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224 -; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216 -; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:196 ; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:760 ; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:756 ; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:752 @@ -251,7 +252,6 @@ define amdgpu_ps float @ps_main(i32 %idx) { ; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:716 ; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:712 ; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:708 -; VI-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen ; VI-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_f32_e32 v0, v0, v1 @@ -266,69 +266,83 @@ define amdgpu_ps float @ps_main(i32 %idx) { ; GFX9-MUBUF-NEXT: s_mov_b32 s3, 0xe00000 ; GFX9-MUBUF-NEXT: s_add_u32 s0, s0, s4 ; GFX9-MUBUF-NEXT: s_addc_u32 s1, s1, 0 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbe31934f -; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:264 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb7043519 -; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:260 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb702e758 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v6, 0x3f638e37 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbeae29dc -; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:256 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e31934f -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89c -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3f3d349c -; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:320 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f3d349e ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f523be1 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v6, 0x3f638e37 +; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:320 +; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:316 +; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:312 +; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:308 ; GFX9-MUBUF-NEXT: buffer_store_dword v6, off, s[0:3], 0 offset:304 ; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:300 ; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:296 ; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:292 ; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:288 ; 
GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3 +; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:284 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd89f +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbeae29dc +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbe31934f +; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:280 ; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:276 ; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:272 +; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:264 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb7043519 +; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:260 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb702e758 +; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:256 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e31934f +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89c +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbe319356 ; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:252 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e319356 ; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:244 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29dc ; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:236 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89f +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v13, 0xbf3d349e ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39 +; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:268 +; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:248 +; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:240 +; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:232 +; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:228 +; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:224 +; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220 +; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216 ; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212 ; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208 ; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204 ; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:200 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 +; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 +; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:196 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 +; GFX9-MUBUF-NEXT: v_add_u32_e32 v1, 0x200, v0 +; GFX9-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX9-MUBUF-NEXT: s_nop 0 ; GFX9-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:832 ; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:828 ; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:824 ; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:820 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3703c499 -; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:796 -; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:792 -; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:788 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0xbf523be1 -; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 -; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:308 -; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, 
s[0:3], 0 offset:284 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd89f -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbe319356 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 -; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3f3d349c ; GFX9-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:816 ; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:812 ; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:808 ; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:804 ; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:800 +; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:796 +; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:792 +; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:788 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0xbf523be1 ; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:784 ; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:780 ; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776 @@ -336,18 +350,6 @@ define amdgpu_ps float @ps_main(i32 %idx) { ; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768 ; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:764 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2 -; GFX9-MUBUF-NEXT: v_add_u32_e32 v1, 0x200, v0 -; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:316 -; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:312 -; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:280 -; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:268 -; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:248 -; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:240 -; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:232 -; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:228 -; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:224 -; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216 -; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:196 ; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:760 ; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:756 ; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:752 @@ -362,8 +364,6 @@ define amdgpu_ps float @ps_main(i32 %idx) { ; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:716 ; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:712 ; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:708 -; GFX9-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen -; GFX9-MUBUF-NEXT: s_nop 0 ; GFX9-MUBUF-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen ; GFX9-MUBUF-NEXT: s_waitcnt vmcnt(0) ; GFX9-MUBUF-NEXT: v_add_f32_e32 v0, v0, v1 @@ -392,6 +392,7 @@ define amdgpu_ps float @ps_main(i32 %idx) { ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:308 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:304 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:300 +; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:292 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:288 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:284 @@ -413,6 +414,7 @@ define amdgpu_ps float @ps_main(i32 %idx) 
{
 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3eae29dc
 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3efcd89c
 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v13, 0x3efcd89f
+; GFX10_W32-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:248
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:244
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:240
@@ -420,23 +422,24 @@ define amdgpu_ps float @ps_main(i32 %idx) {
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:232
 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v10, 0xbf20e7f5
 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
-; GFX10_W32-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf3d349e
 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3
 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; GFX10_W32-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:228
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:224
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204
 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4
-; GFX10_W32-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
-; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5
-; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:200
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:196
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499
+; GFX10_W32-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0
+; GFX10_W32-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:832
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:828
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:824
@@ -444,8 +447,9 @@ define amdgpu_ps float @ps_main(i32 %idx) {
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:816
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:812
 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c
-; GFX10_W32-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0
 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:800
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:796
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:792
@@ -453,9 +457,6 @@ define amdgpu_ps float @ps_main(i32 %idx) {
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v19, off, s[0:3], 0 offset:784
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:780
 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2
-; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296
-; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808
-; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:772
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768
@@ -474,8 +475,6 @@ define amdgpu_ps float @ps_main(i32 %idx) {
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:716
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:712
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:708
-; GFX10_W32-MUBUF-NEXT: s_clause 0x1
-; GFX10_W32-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
 ; GFX10_W32-MUBUF-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen
 ; GFX10_W32-MUBUF-NEXT: s_waitcnt vmcnt(0)
 ; GFX10_W32-MUBUF-NEXT: v_add_f32_e32 v0, v0, v1
@@ -504,6 +503,7 @@ define amdgpu_ps float @ps_main(i32 %idx) {
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:308
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:304
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:300
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:292
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:288
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:284
@@ -525,6 +525,7 @@ define amdgpu_ps float @ps_main(i32 %idx) {
 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3eae29dc
 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3efcd89c
 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v13, 0x3efcd89f
+; GFX10_W64-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:248
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:244
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:240
@@ -532,23 +533,24 @@ define amdgpu_ps float @ps_main(i32 %idx) {
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:232
 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v10, 0xbf20e7f5
 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
-; GFX10_W64-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf3d349e
 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3
 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; GFX10_W64-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:228
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:224
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204
 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4
-; GFX10_W64-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
-; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5
-; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:200
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:196
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499
+; GFX10_W64-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0
+; GFX10_W64-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:832
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:828
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:824
@@ -556,8 +558,9 @@ define amdgpu_ps float @ps_main(i32 %idx) {
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:816
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:812
 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c
-; GFX10_W64-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0
 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:800
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:796
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:792
@@ -565,9 +568,6 @@ define amdgpu_ps float @ps_main(i32 %idx) {
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v19, off, s[0:3], 0 offset:784
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:780
 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2
-; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296
-; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808
-; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:772
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768
@@ -586,8 +586,6 @@ define amdgpu_ps float @ps_main(i32 %idx) {
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:716
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:712
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:708
-; GFX10_W64-MUBUF-NEXT: s_clause 0x1
-; GFX10_W64-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
 ; GFX10_W64-MUBUF-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen
 ; GFX10_W64-MUBUF-NEXT: s_waitcnt vmcnt(0)
 ; GFX10_W64-MUBUF-NEXT: v_add_f32_e32 v0, v0, v1
@@ -976,43 +974,42 @@ define amdgpu_ps float @ps_main(i32 %idx) {
 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v23, v21 :: v_dual_mov_b32 v8, 0x3f3d349e
 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3f523be1
 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v24, 0xbf523be3
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v26, v17
 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v12, 0xbe319356 :: v_dual_mov_b32 v31, v19
+; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v2, v8
 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v6, 0x3f5f2ee2 :: v_dual_mov_b32 v3, v7
 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f638e37
 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v15, 0x3e319356
 ; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v24 :: v_dual_mov_b32 v4, v6
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v11, 0xbe31934f :: v_dual_mov_b32 v4, v6
 ; GFX11-FLATSCR-NEXT: s_clause 0x1
 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[5:8], off offset:304
 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:288
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, 0xbefcd89f :: v_dual_mov_b32 v1, v0
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, 0xbefcd89f :: v_dual_mov_b32 v27, v24
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, v0
 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v13, 0x3eae29dc :: v_dual_mov_b32 v34, v5
 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v25, 0x3f20e7f5
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, 0xb702e758
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v11, 0xbe31934f :: v_dual_mov_b32 v36, v6
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v9, 0xb702e758 :: v_dual_mov_b32 v36, v6
 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, 0xb7043519 :: v_dual_mov_b32 v29, v15
 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf20e7f5
 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3eae29d8
 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v16, 0x3e31934f
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v25, 0x3f20e7f5 :: v_dual_mov_b32 v26, v17
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, 0x3efcd89c
 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v30, v13
 ; GFX11-FLATSCR-NEXT: s_clause 0x1
 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[0:3], off offset:272
 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[9:12], off offset:256
 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4
-; GFX11-FLATSCR-NEXT: s_clause 0x1
-; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[21:24], off offset:208
-; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:192
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v20, 0x3efcd89c
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v33, v22
 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, v18
 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, v2 :: v_dual_mov_b32 v11, v0
-; GFX11-FLATSCR-NEXT: s_clause 0x1
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v33, v22
+; GFX11-FLATSCR-NEXT: s_clause 0x3
 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[13:16], off offset:240
 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[17:20], off offset:224
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[21:24], off offset:208
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:192
 ; GFX11-FLATSCR-NEXT: scratch_load_b32 v14, v37, off
 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v32, 0x3f3d349c :: v_dual_mov_b32 v5, v15
 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, v0
@@ -1027,7 +1024,8 @@ define amdgpu_ps float @ps_main(i32 %idx) {
 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v24, v19 :: v_dual_mov_b32 v35, v21
 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:784
 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v31, 0xbf5f2ee2 :: v_dual_mov_b32 v32, v6
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v6, v13 :: v_dual_mov_b32 v19, v2
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v6, v13
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v19, v2
 ; GFX11-FLATSCR-NEXT: s_clause 0x4
 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[33:36], off offset:768
 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:752
@@ -1053,29 +1051,15 @@ define amdgpu_vs float @vs_main(i32 %idx) {
 ; SI-NEXT: s_mov_b32 s7, 0xe8f000
 ; SI-NEXT: s_add_u32 s4, s4, s0
 ; SI-NEXT: s_addc_u32 s5, s5, 0
-; SI-NEXT: v_mov_b32_e32 v9, 0xbe31934f
-; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v9, 0xb7043519
-; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v9, 0xb702e758
 ; SI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4
-; SI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
-; SI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
-; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:256
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v9, 0x3e31934f
-; SI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
-; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
-; SI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
-; SI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
-; SI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
-; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320
 ; SI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
 ; SI-NEXT: v_mov_b32_e32 v4, 0x3f523be1
 ; SI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
+; SI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
+; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320
+; SI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316
+; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312
+; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308
 ; SI-NEXT: buffer_store_dword v6, off, s[4:7], 0 offset:304
 ; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:300
 ; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:296
@@ -1083,8 +1067,27 @@ define amdgpu_vs float @vs_main(i32 %idx) {
 ; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:288
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
+; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; SI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
+; SI-NEXT: v_mov_b32_e32 v9, 0xbe31934f
+; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280
 ; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:276
 ; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:272
+; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v9, 0xb7043519
+; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v9, 0xb702e758
+; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:256
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v9, 0x3e31934f
+; SI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
+; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
+; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; SI-NEXT: v_mov_b32_e32 v8, 0xbe319356
 ; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:252
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mov_b32_e32 v9, 0x3e319356
@@ -1094,40 +1097,49 @@ define amdgpu_vs float @vs_main(i32 %idx) {
 ; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:236
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f
+; SI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
 ; SI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e
 ; SI-NEXT: v_mov_b32_e32 v14, 0xbf523be3
 ; SI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
+; SI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; SI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; SI-NEXT: s_mov_b32 s0, 0
+; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268
+; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248
+; SI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240
+; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232
+; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228
+; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224
+; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:220
+; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216
 ; SI-NEXT: buffer_store_dword v16, off, s[4:7], 0 offset:212
 ; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:208
 ; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:204
 ; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:200
+; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
+; SI-NEXT: v_add_i32_e32 v1, vcc, 0x200, v0
+; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0
+; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:196
+; SI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
+; SI-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen
 ; SI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:832
 ; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:828
 ; SI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:824
 ; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:820
 ; SI-NEXT: s_waitcnt expcnt(3)
 ; SI-NEXT: v_mov_b32_e32 v17, 0x3703c499
-; SI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:796
-; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:792
-; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:788
-; SI-NEXT: s_waitcnt expcnt(2)
-; SI-NEXT: v_mov_b32_e32 v18, 0xbf523be1
-; SI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
-; SI-NEXT: s_mov_b32 s0, 0
-; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308
-; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
-; SI-NEXT: v_mov_b32_e32 v8, 0xbe319356
-; SI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
-; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:220
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
+; SI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
 ; SI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:816
 ; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:812
 ; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:808
 ; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:804
 ; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:800
+; SI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:796
+; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:792
+; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:788
+; SI-NEXT: s_waitcnt expcnt(2)
+; SI-NEXT: v_mov_b32_e32 v18, 0xbf523be1
 ; SI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:784
 ; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:780
 ; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:776
@@ -1136,19 +1148,6 @@ define amdgpu_vs float @vs_main(i32 %idx) {
 ; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:764
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2
-; SI-NEXT: v_add_i32_e32 v1, vcc, 0x200, v0
-; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0
-; SI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316
-; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312
-; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280
-; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268
-; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248
-; SI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240
-; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232
-; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228
-; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224
-; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216
-; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:196
 ; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:760
 ; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:756
 ; SI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:752
@@ -1163,7 +1162,6 @@ define amdgpu_vs float @vs_main(i32 %idx) {
 ; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:716
 ; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:712
 ; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:708
-; SI-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen
 ; SI-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: v_add_f32_e32 v0, v0, v1
@@ -1178,70 +1176,84 @@ define amdgpu_vs float @vs_main(i32 %idx) {
 ; VI-NEXT: s_mov_b32 s7, 0xe80000
 ; VI-NEXT: s_add_u32 s4, s4, s0
 ; VI-NEXT: s_addc_u32 s5, s5, 0
-; VI-NEXT: v_mov_b32_e32 v9, 0xbe31934f
-; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264
-; VI-NEXT: v_mov_b32_e32 v9, 0xb7043519
-; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260
-; VI-NEXT: v_mov_b32_e32 v9, 0xb702e758
 ; VI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4
-; VI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
-; VI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
-; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:256
-; VI-NEXT: v_mov_b32_e32 v9, 0x3e31934f
-; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
-; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
-; VI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
-; VI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
-; VI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
-; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320
 ; VI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
 ; VI-NEXT: v_mov_b32_e32 v4, 0x3f523be1
 ; VI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
+; VI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
+; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320
+; VI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316
+; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312
+; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308
 ; VI-NEXT: buffer_store_dword v6, off, s[4:7], 0 offset:304
 ; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:300
 ; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:296
 ; VI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:292
 ; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:288
 ; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
+; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284
+; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; VI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
+; VI-NEXT: v_mov_b32_e32 v9, 0xbe31934f
+; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280
 ; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:276
 ; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:272
+; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264
+; VI-NEXT: v_mov_b32_e32 v9, 0xb7043519
+; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260
+; VI-NEXT: v_mov_b32_e32 v9, 0xb702e758
+; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:256
+; VI-NEXT: v_mov_b32_e32 v9, 0x3e31934f
+; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
+; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
+; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT: v_mov_b32_e32 v8, 0xbe319356
 ; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:252
 ; VI-NEXT: v_mov_b32_e32 v9, 0x3e319356
 ; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:244
 ; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29dc
 ; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:236
 ; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f
+; VI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
 ; VI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e
 ; VI-NEXT: v_mov_b32_e32 v14, 0xbf523be3
 ; VI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
+; VI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; VI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; VI-NEXT: s_mov_b32 s0, 0
+; VI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268
+; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248
+; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240
+; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232
+; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228
+; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224
+; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:220
+; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216
 ; VI-NEXT: buffer_store_dword v16, off, s[4:7], 0 offset:212
 ; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:208
 ; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:204
 ; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:200
+; VI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
+; VI-NEXT: v_add_u32_e32 v1, vcc, 0x200, v0
+; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:196
+; VI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
+; VI-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen
 ; VI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:832
 ; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:828
 ; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:824
 ; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:820
 ; VI-NEXT: v_mov_b32_e32 v17, 0x3703c499
-; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:796
-; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:792
-; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:788
-; VI-NEXT: v_mov_b32_e32 v18, 0xbf523be1
-; VI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
-; VI-NEXT: s_mov_b32 s0, 0
-; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308
-; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284
-; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
-; VI-NEXT: v_mov_b32_e32 v8, 0xbe319356
-; VI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
-; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:220
-; VI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
+; VI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
 ; VI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:816
 ; VI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:812
 ; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:808
 ; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:804
 ; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:800
+; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:796
+; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:792
+; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:788
+; VI-NEXT: v_mov_b32_e32 v18, 0xbf523be1
 ; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:784
 ; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:780
 ; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:776
@@ -1249,19 +1261,6 @@ define amdgpu_vs float @vs_main(i32 %idx) {
 ; VI-NEXT: buffer_store_dword v16, off, s[4:7], 0 offset:768
 ; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:764
 ; VI-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2
-; VI-NEXT: v_add_u32_e32 v1, vcc, 0x200, v0
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
-; VI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316
-; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312
-; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280
-; VI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268
-; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248
-; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240
-; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232
-; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228
-; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224
-; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216
-; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:196
 ; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:760
 ; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:756
 ; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:752
@@ -1276,7 +1275,6 @@ define amdgpu_vs float @vs_main(i32 %idx) {
 ; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:716
 ; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:712
 ; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:708
-; VI-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen
 ; VI-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen
 ; VI-NEXT: s_waitcnt vmcnt(0)
 ; VI-NEXT: v_add_f32_e32 v0, v0, v1
@@ -1291,69 +1289,83 @@ define amdgpu_vs float @vs_main(i32 %idx) {
 ; GFX9-MUBUF-NEXT: s_mov_b32 s3, 0xe00000
 ; GFX9-MUBUF-NEXT: s_add_u32 s0, s0, s4
 ; GFX9-MUBUF-NEXT: s_addc_u32 s1, s1, 0
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbe31934f
-; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:264
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb7043519
-; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:260
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb702e758
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v6, 0x3f638e37
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
-; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:256
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e31934f
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
-; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:320
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f523be1
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v6, 0x3f638e37
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:320
+; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:316
+; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:312
+; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:308
 ; GFX9-MUBUF-NEXT: buffer_store_dword v6, off, s[0:3], 0 offset:304
 ; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:300
 ; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:296
 ; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:292
 ; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:288
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:284
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbe31934f
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:280
 ; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:276
 ; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:272
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:264
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb7043519
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:260
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb702e758
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:256
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e31934f
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbe319356
 ; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:252
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e319356
 ; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:244
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29dc
 ; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:236
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89f
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v13, 0xbf3d349e
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:268
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:248
+; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:240
+; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:232
+; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:228
+; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:224
+; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220
+; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216
 ; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212
 ; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208
 ; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204
 ; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:200
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
+; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:196
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
+; GFX9-MUBUF-NEXT: v_add_u32_e32 v1, 0x200, v0
+; GFX9-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX9-MUBUF-NEXT: s_nop 0
 ; GFX9-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:832
 ; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:828
 ; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:824
 ; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:820
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3703c499
-; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:796
-; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:792
-; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:788
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0xbf523be1
-; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
-; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:308
-; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:284
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbe319356
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
-; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
 ; GFX9-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:816
 ; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:812
 ; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:808
 ; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:804
 ; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:800
+; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:796
+; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:792
+; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:788
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0xbf523be1
 ; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:784
 ; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:780
 ; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776
@@ -1361,18 +1373,6 @@ define amdgpu_vs float @vs_main(i32 %idx) {
 ; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768
 ; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:764
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2
-; GFX9-MUBUF-NEXT: v_add_u32_e32 v1, 0x200, v0
-; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:316
-; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:312
-; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:280
-; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:268
-; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:248
-; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:240
-; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:232
-; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:228
-; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:224
-; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216
-; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:196
 ; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:760
 ; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:756
 ; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:752
@@ -1387,8 +1387,6 @@ define amdgpu_vs float @vs_main(i32 %idx) {
 ; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:716
 ; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:712
 ; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:708
-; GFX9-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
-; GFX9-MUBUF-NEXT: s_nop 0
 ; GFX9-MUBUF-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen
 ; GFX9-MUBUF-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-MUBUF-NEXT: v_add_f32_e32 v0, v0, v1
@@ -1417,6 +1415,7 @@ define amdgpu_vs float @vs_main(i32 %idx) {
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:308
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:304
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:300
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:292
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:288
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:284
@@ -1438,6 +1437,7 @@ define amdgpu_vs float @vs_main(i32 %idx) {
 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3eae29dc
 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3efcd89c
 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v13, 0x3efcd89f
+; GFX10_W32-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:248
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:244
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:240
@@ -1445,23 +1445,24 @@ define amdgpu_vs float @vs_main(i32 %idx) {
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:232
 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v10, 0xbf20e7f5
 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
-; GFX10_W32-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf3d349e
 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3
 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; GFX10_W32-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:228
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:224
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204
 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4
-; GFX10_W32-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
-; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5
-; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:200
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:196
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499
+; GFX10_W32-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0
+; GFX10_W32-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:832
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:828
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:824
@@ -1469,8 +1470,9 @@ define amdgpu_vs float @vs_main(i32 %idx) {
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:816
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:812
 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c
-; GFX10_W32-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0
 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:800
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:796
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:792
@@ -1478,9 +1480,6 @@ define amdgpu_vs float @vs_main(i32 %idx) {
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v19, off, s[0:3], 0 offset:784
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:780
 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2
-; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296
-; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808
-; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:772
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768
@@ -1499,8 +1498,6 @@ define amdgpu_vs float @vs_main(i32 %idx) {
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:716
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:712
 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:708
-; GFX10_W32-MUBUF-NEXT: s_clause 0x1
-; GFX10_W32-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
 ; GFX10_W32-MUBUF-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen
 ; GFX10_W32-MUBUF-NEXT: s_waitcnt vmcnt(0)
 ; GFX10_W32-MUBUF-NEXT: v_add_f32_e32 v0, v0, v1
@@ -1529,6 +1526,7 @@ define amdgpu_vs float @vs_main(i32 %idx) {
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:308
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:304
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:300
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:292
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:288
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:284
@@ -1550,6 +1548,7 @@ define amdgpu_vs float @vs_main(i32 %idx) {
 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3eae29dc
 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3efcd89c
 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v13, 0x3efcd89f
+; GFX10_W64-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:248
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:244
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:240
@@ -1557,23 +1556,24 @@ define amdgpu_vs float @vs_main(i32 %idx) {
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:232
 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v10, 0xbf20e7f5
 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
-; GFX10_W64-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf3d349e
 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3
 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; GFX10_W64-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:228
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:224
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204
 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4
-; GFX10_W64-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
-; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5
-; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:200
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:196
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499
+; GFX10_W64-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0
+; GFX10_W64-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:832
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:828
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:824
@@ -1581,8 +1581,9 @@ define amdgpu_vs float @vs_main(i32 %idx) {
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:816
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:812
 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c
-; GFX10_W64-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0
 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:800
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:796
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:792
@@ -1590,9 +1591,6 @@ define amdgpu_vs float @vs_main(i32 %idx) {
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v19, off, s[0:3], 0 offset:784
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:780
 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2
-; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296
-; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808
-; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:772
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768
@@ -1611,8 +1609,6 @@ define amdgpu_vs float @vs_main(i32 %idx) {
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:716
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:712
 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:708
-; GFX10_W64-MUBUF-NEXT: s_clause 0x1
-; GFX10_W64-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
 ; GFX10_W64-MUBUF-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen
 ; GFX10_W64-MUBUF-NEXT: s_waitcnt vmcnt(0)
 ; GFX10_W64-MUBUF-NEXT: v_add_f32_e32 v0, v0, v1
@@ -2001,43 +1997,42 @@ define amdgpu_vs float @vs_main(i32 %idx) {
 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v23, v21 :: v_dual_mov_b32 v8, 0x3f3d349e
 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3f523be1
 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v24, 0xbf523be3
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v26, v17
 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v12, 0xbe319356 :: v_dual_mov_b32 v31, v19
+; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v2, v8
 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v6, 0x3f5f2ee2 :: v_dual_mov_b32 v3, v7
 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f638e37
 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v15, 0x3e319356
 ; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v24 :: v_dual_mov_b32 v4, v6
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v11, 0xbe31934f :: v_dual_mov_b32 v4, v6
 ; GFX11-FLATSCR-NEXT: s_clause 0x1
 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[5:8], off offset:304
 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:288
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, 0xbefcd89f :: v_dual_mov_b32 v1, v0
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, 0xbefcd89f :: v_dual_mov_b32 v27, v24
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, v0
 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v13, 0x3eae29dc :: v_dual_mov_b32 v34, v5
 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v25, 0x3f20e7f5
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, 0xb702e758
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v11, 0xbe31934f :: v_dual_mov_b32 v36, v6
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v9, 0xb702e758 :: v_dual_mov_b32 v36, v6
 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, 0xb7043519 :: v_dual_mov_b32 v29, v15
 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf20e7f5
 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3eae29d8
 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v16, 0x3e31934f
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v25, 0x3f20e7f5 :: v_dual_mov_b32 v26, v17
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, 0x3efcd89c
 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v30, v13
 ; GFX11-FLATSCR-NEXT: s_clause 0x1
 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[0:3], off offset:272
 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[9:12], off offset:256
 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4
-; GFX11-FLATSCR-NEXT: s_clause 0x1
-; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[21:24], off offset:208
-; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:192
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v20, 0x3efcd89c
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v33, v22
 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, v18
 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, v2 :: v_dual_mov_b32 v11, v0
-; GFX11-FLATSCR-NEXT: s_clause 0x1
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v33, v22
+; GFX11-FLATSCR-NEXT: s_clause 0x3
 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[13:16], off offset:240
 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[17:20], off offset:224
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[21:24], off offset:208
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:192
 ; GFX11-FLATSCR-NEXT: scratch_load_b32 v14, v37, off
 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v32, 0x3f3d349c :: v_dual_mov_b32 v5, v15
 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, v0
@@ -2052,7 +2047,8 @@ define amdgpu_vs float @vs_main(i32 %idx) {
 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v24, v19 :: v_dual_mov_b32 v35, v21
 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:784
 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v31, 0xbf5f2ee2 :: v_dual_mov_b32 v32, v6
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v6, v13 :: v_dual_mov_b32 v19, v2
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v6, v13
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v19, v2
 ; GFX11-FLATSCR-NEXT: s_clause 0x4
 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[33:36], off offset:768
 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:752
@@ -2078,29 +2074,15 @@ define amdgpu_cs float @cs_main(i32 %idx) {
 ; SI-NEXT: s_mov_b32 s7, 0xe8f000
 ; SI-NEXT: s_add_u32 s4, s4, s0
 ; SI-NEXT: s_addc_u32 s5, s5, 0
-; SI-NEXT: v_mov_b32_e32 v9, 0xbe31934f
-; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v9, 0xb7043519
-; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v9, 0xb702e758
 ; SI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4
-; SI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
-; SI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
-; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:256
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v9, 0x3e31934f
-; SI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
-; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
-; SI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
-; SI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
-; SI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
-; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320
 ; SI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
 ; SI-NEXT: v_mov_b32_e32 v4, 0x3f523be1
 ; SI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
+; SI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
+; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320
+; SI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316
+; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312
+; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308
 ; SI-NEXT: buffer_store_dword v6, off, s[4:7], 0 offset:304
 ; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:300
 ; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:296
@@ -2108,8 +2090,27 @@ define amdgpu_cs float @cs_main(i32 %idx) {
 ; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:288
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
+; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; SI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
+; SI-NEXT: v_mov_b32_e32 v9, 0xbe31934f
+; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280
 ; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:276
 ; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:272
+; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v9, 0xb7043519
+; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v9, 0xb702e758
+; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:256
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v9, 0x3e31934f
+; SI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
+; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
+; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; SI-NEXT: v_mov_b32_e32 v8, 0xbe319356
 ; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:252
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mov_b32_e32 v9, 0x3e319356
@@ -2119,61 +2120,57 @@ define amdgpu_cs float @cs_main(i32 %idx) {
 ; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:236
 ; SI-NEXT: s_waitcnt expcnt(0)
 ; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f
+; SI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
 ; SI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e
 ; SI-NEXT: v_mov_b32_e32 v14, 0xbf523be3
 ; SI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
+; SI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; SI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; SI-NEXT: s_mov_b32 s0, 0
+; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268
+; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248
+; SI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240
+; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232
+; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228
+; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224
+; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:220
+; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216
 ; SI-NEXT: buffer_store_dword v16, off, s[4:7], 0 offset:212
 ; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:208
 ; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:204
 ; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:200
+; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
+; SI-NEXT: v_add_i32_e32 v1, vcc, 0x200, v0
+; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0
+; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:196
+; SI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
+; SI-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen
 ; SI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:832
 ; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:828
 ; SI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:824
 ; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:820
 ; SI-NEXT: s_waitcnt expcnt(3)
 ; SI-NEXT: v_mov_b32_e32 v17, 0x3703c499
-; SI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:796
-; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:792
-; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:788
-; SI-NEXT: s_waitcnt expcnt(2)
-; SI-NEXT: v_mov_b32_e32 v18, 0xbf523be1
-; SI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
-; SI-NEXT: s_mov_b32 s0, 0
-; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308
-; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
-; SI-NEXT: v_mov_b32_e32 v8, 0xbe319356
-; SI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
-; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:220
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
+; SI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
 ; SI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:816
 ; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:812
 ; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:808
 ; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:804
 ; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:800
+; SI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:796
+; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:792
+; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:788
+; SI-NEXT: s_waitcnt expcnt(2)
+; SI-NEXT: v_mov_b32_e32 v18, 0xbf523be1
 ; SI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:784
-; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:780
-; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:776
-; SI-NEXT: buffer_store_dword v6, off, s[4:7], 0 offset:772
-; SI-NEXT: buffer_store_dword v16, off, s[4:7], 0 offset:768
-; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:764
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2
-; SI-NEXT: v_add_i32_e32 v1, vcc, 0x200, v0
-; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0
-; SI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316
-; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312
-; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280
-; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268
-; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248
-; SI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240
-; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232
-; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228
-; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224
-; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216
-; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:196
+; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:780
+; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:776
+; SI-NEXT: buffer_store_dword v6, off, s[4:7], 0 offset:772
+; SI-NEXT: buffer_store_dword v16, off, s[4:7], 0 offset:768
+; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:764
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2
 ; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:760
 ; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:756
 ; SI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:752
@@ -2188,7 +2185,6 @@ define amdgpu_cs float @cs_main(i32 %idx) {
 ; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:716
 ; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:712
 ; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:708
-; SI-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen
 ; SI-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: v_add_f32_e32 v0, v0, v1
@@ -2203,70 +2199,84 @@ define amdgpu_cs float @cs_main(i32 %idx) {
 ; VI-NEXT: s_mov_b32 s7, 0xe80000
 ; VI-NEXT: s_add_u32 s4, s4, s0
 ; VI-NEXT: s_addc_u32 s5, s5, 0
-; VI-NEXT: v_mov_b32_e32 v9, 0xbe31934f
-; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264
-; VI-NEXT: v_mov_b32_e32 v9, 0xb7043519
-; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260
-; VI-NEXT: v_mov_b32_e32 v9, 0xb702e758
 ; VI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4
-; VI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
-; VI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
-; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:256
-; VI-NEXT: v_mov_b32_e32 v9, 0x3e31934f
-; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
-; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
-; VI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
-; VI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
-; VI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
-; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320
 ; VI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
 ; VI-NEXT: v_mov_b32_e32 v4, 0x3f523be1
 ; VI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
+; VI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
+; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320
+; VI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316
+; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312
+; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308
 ; VI-NEXT: buffer_store_dword v6, off, s[4:7], 0 offset:304
 ; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:300
 ; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:296
 ; VI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:292
 ; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:288
 ; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
+; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284
+; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; VI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
+; VI-NEXT: v_mov_b32_e32 v9, 0xbe31934f
+; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280
 ; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:276
 ; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:272
+; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264
+; VI-NEXT: v_mov_b32_e32 v9, 0xb7043519
+; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260
+; VI-NEXT: v_mov_b32_e32 v9, 0xb702e758
+; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:256
+; VI-NEXT: v_mov_b32_e32 v9, 0x3e31934f
+; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
+; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
+; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT: v_mov_b32_e32 v8, 0xbe319356
 ; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:252
 ; VI-NEXT: v_mov_b32_e32 v9, 0x3e319356
 ; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:244
 ; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29dc
 ; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:236
 ; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f
+; VI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
 ; VI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e
 ; VI-NEXT: v_mov_b32_e32 v14, 0xbf523be3
 ; VI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
+; VI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; VI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; VI-NEXT: s_mov_b32 s0, 0
+; VI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268
+; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248
+; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240
+; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232
+; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228
+; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224
+; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:220
+; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216
 ; VI-NEXT: buffer_store_dword v16, off, s[4:7], 0 offset:212
 ; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:208
 ; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:204
 ; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:200
+; VI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
+; VI-NEXT: v_add_u32_e32 v1, vcc, 0x200, v0
+; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:196
+; VI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
+; VI-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen
 ; VI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:832
 ; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:828
 ; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:824
 ; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:820
 ; VI-NEXT: v_mov_b32_e32 v17, 0x3703c499
-; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:796
-; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:792
-; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:788
-; VI-NEXT: v_mov_b32_e32 v18, 0xbf523be1
-; VI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
-; VI-NEXT: s_mov_b32 s0, 0
-; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308
-; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284
-; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
-; VI-NEXT: v_mov_b32_e32 v8, 0xbe319356
-; VI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
-; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:220
-; VI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
+; VI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
 ; VI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:816
 ; VI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:812
 ; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:808
 ; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:804
 ; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:800
+; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:796
+; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:792
+; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:788
+; VI-NEXT: v_mov_b32_e32 v18, 0xbf523be1
 ; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:784
 ; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:780
 ; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:776
@@ -2274,19 +2284,6 @@ define amdgpu_cs float @cs_main(i32 %idx) {
 ; VI-NEXT: buffer_store_dword v16, off, s[4:7], 0 offset:768
 ; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:764
 ; VI-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2
-; VI-NEXT: v_add_u32_e32 v1, vcc, 0x200, v0
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
-; VI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316
-; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312
-; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280
-; VI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268
-; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248
-; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240
-; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232
-; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228
-; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224
-; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216
-; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:196
 ; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:760
 ; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:756
 ; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:752
@@ -2301,7 +2298,6 @@ define amdgpu_cs float @cs_main(i32 %idx) {
 ; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:716
 ; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:712
 ; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:708
-; VI-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen
 ; VI-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen
 ; VI-NEXT: s_waitcnt vmcnt(0)
 ; VI-NEXT: v_add_f32_e32 v0, v0, v1
@@ -2316,69 +2312,83 @@ define amdgpu_cs float @cs_main(i32 %idx) {
 ; GFX9-MUBUF-NEXT: s_mov_b32 s3, 0xe00000
 ; GFX9-MUBUF-NEXT: s_add_u32 s0, s0, s4
 ; GFX9-MUBUF-NEXT: s_addc_u32 s1, s1, 0
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbe31934f
-; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:264
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb7043519
-; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:260
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb702e758
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v6, 0x3f638e37
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
-; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:256
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e31934f
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
-; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:320
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f523be1
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v6, 0x3f638e37
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:320
+; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:316
+; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:312
+; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:308
 ; GFX9-MUBUF-NEXT: buffer_store_dword v6, off, s[0:3], 0 offset:304
 ; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:300
 ; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:296
 ; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:292
 ; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:288
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:284
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbe31934f
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:280
 ; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:276
 ; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:272
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:264
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb7043519
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:260
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb702e758
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:256
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e31934f
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbe319356
 ; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:252
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e319356
 ; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:244
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29dc
 ; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:236
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89f
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v13, 0xbf3d349e
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:268
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:248
+; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:240
+; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:232
+; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:228
+; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:224
+; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220
+; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216
 ; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212
 ; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208
 ; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204
 ; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:200
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
+; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:196
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
+; GFX9-MUBUF-NEXT: v_add_u32_e32 v1, 0x200, v0
+; GFX9-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX9-MUBUF-NEXT: s_nop 0
 ; GFX9-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:832
 ; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:828
 ; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:824
 ; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:820
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3703c499
-; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:796
-; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:792
-; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:788
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0xbf523be1
-; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
-; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:308
-; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:284
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbe319356
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
-; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
 ; GFX9-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:816
 ; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:812
 ; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:808
 ; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:804
 ; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:800
+; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:796
+; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:792
+; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:788
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0xbf523be1
 ; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:784
 ; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:780
 ; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776
@@ -2386,18 +2396,6 @@ define amdgpu_cs float @cs_main(i32 %idx) {
 ; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768
 ; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:764
 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2
-; GFX9-MUBUF-NEXT: v_add_u32_e32 v1, 0x200, v0
-; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:316
-; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:312
-; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:280
-; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:268
-; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:248
-; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:240
-; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:232
-; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:228
-; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:224
-; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216
-; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:196
 ; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:760
 ; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:756
 ; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:752
@@ -2412,8 +2410,6 @@ define amdgpu_cs float @cs_main(i32 %idx) {
 ;
GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:716 ; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:712 ; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:708 -; GFX9-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen -; GFX9-MUBUF-NEXT: s_nop 0 ; GFX9-MUBUF-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen ; GFX9-MUBUF-NEXT: s_waitcnt vmcnt(0) ; GFX9-MUBUF-NEXT: v_add_f32_e32 v0, v0, v1 @@ -2442,6 +2438,7 @@ define amdgpu_cs float @cs_main(i32 %idx) { ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:308 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:304 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:300 +; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:292 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:288 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:284 @@ -2463,6 +2460,7 @@ define amdgpu_cs float @cs_main(i32 %idx) { ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3eae29dc ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3efcd89c ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v13, 0x3efcd89f +; GFX10_W32-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:248 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:244 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:240 @@ -2470,23 +2468,24 @@ define amdgpu_cs float @cs_main(i32 %idx) { ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:232 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v10, 0xbf20e7f5 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 -; GFX10_W32-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf3d349e ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39 +; GFX10_W32-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:228 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:224 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212 +; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4 -; GFX10_W32-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 -; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5 -; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:200 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:196 +; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499 +; GFX10_W32-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0 +; GFX10_W32-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:832 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:828 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:824 @@ -2494,8 +2493,9 @@ define amdgpu_cs float @cs_main(i32 %idx) { ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:816 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword 
v1, off, s[0:3], 0 offset:812 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c -; GFX10_W32-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1 +; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808 +; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:800 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:796 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:792 @@ -2503,9 +2503,6 @@ define amdgpu_cs float @cs_main(i32 %idx) { ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v19, off, s[0:3], 0 offset:784 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:780 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2 -; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296 -; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808 -; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:772 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768 @@ -2524,8 +2521,6 @@ define amdgpu_cs float @cs_main(i32 %idx) { ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:716 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:712 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:708 -; GFX10_W32-MUBUF-NEXT: s_clause 0x1 -; GFX10_W32-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen ; GFX10_W32-MUBUF-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen ; GFX10_W32-MUBUF-NEXT: s_waitcnt vmcnt(0) ; GFX10_W32-MUBUF-NEXT: v_add_f32_e32 v0, v0, v1 @@ -2554,6 +2549,7 @@ define amdgpu_cs float @cs_main(i32 %idx) { ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:308 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:304 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:300 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:292 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:288 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:284 @@ -2575,6 +2571,7 @@ define amdgpu_cs float @cs_main(i32 %idx) { ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3eae29dc ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3efcd89c ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v13, 0x3efcd89f +; GFX10_W64-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:248 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:244 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:240 @@ -2582,23 +2579,24 @@ define amdgpu_cs float @cs_main(i32 %idx) { ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:232 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v10, 0xbf20e7f5 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 -; GFX10_W64-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf3d349e ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39 +; GFX10_W64-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:228 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, 
s[0:3], 0 offset:224 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4 -; GFX10_W64-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:200 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:196 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499 +; GFX10_W64-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0 +; GFX10_W64-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:832 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:828 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:824 @@ -2606,8 +2604,9 @@ define amdgpu_cs float @cs_main(i32 %idx) { ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:816 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:812 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c -; GFX10_W64-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:800 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:796 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:792 @@ -2615,9 +2614,6 @@ define amdgpu_cs float @cs_main(i32 %idx) { ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v19, off, s[0:3], 0 offset:784 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:780 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:772 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768 @@ -2636,8 +2632,6 @@ define amdgpu_cs float @cs_main(i32 %idx) { ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:716 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:712 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:708 -; GFX10_W64-MUBUF-NEXT: s_clause 0x1 -; GFX10_W64-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen ; GFX10_W64-MUBUF-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen ; GFX10_W64-MUBUF-NEXT: s_waitcnt vmcnt(0) ; GFX10_W64-MUBUF-NEXT: v_add_f32_e32 v0, v0, v1 @@ -3026,43 +3020,42 @@ define amdgpu_cs float @cs_main(i32 %idx) { ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v23, v21 :: v_dual_mov_b32 v8, 0x3f3d349e ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3f523be1 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v24, 0xbf523be3 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v26, v17 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v12, 
0xbe319356 :: v_dual_mov_b32 v31, v19 +; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v2, v8 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v6, 0x3f5f2ee2 :: v_dual_mov_b32 v3, v7 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f638e37 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v15, 0x3e319356 ; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v24 :: v_dual_mov_b32 v4, v6 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v11, 0xbe31934f :: v_dual_mov_b32 v4, v6 ; GFX11-FLATSCR-NEXT: s_clause 0x1 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[5:8], off offset:304 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:288 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, 0xbefcd89f :: v_dual_mov_b32 v1, v0 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, 0xbefcd89f :: v_dual_mov_b32 v27, v24 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, v0 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v13, 0x3eae29dc :: v_dual_mov_b32 v34, v5 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v25, 0x3f20e7f5 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, 0xb702e758 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v11, 0xbe31934f :: v_dual_mov_b32 v36, v6 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v9, 0xb702e758 :: v_dual_mov_b32 v36, v6 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, 0xb7043519 :: v_dual_mov_b32 v29, v15 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf20e7f5 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3eae29d8 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v16, 0x3e31934f +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v25, 0x3f20e7f5 :: v_dual_mov_b32 v26, v17 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, 0x3efcd89c ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v30, v13 ; GFX11-FLATSCR-NEXT: s_clause 0x1 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[0:3], off offset:272 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[9:12], off offset:256 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4 -; GFX11-FLATSCR-NEXT: s_clause 0x1 -; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[21:24], off offset:208 -; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:192 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v20, 0x3efcd89c -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v33, v22 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, v18 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, v2 :: v_dual_mov_b32 v11, v0 -; GFX11-FLATSCR-NEXT: s_clause 0x1 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v33, v22 +; GFX11-FLATSCR-NEXT: s_clause 0x3 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[13:16], off offset:240 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[17:20], off offset:224 +; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[21:24], off offset:208 +; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:192 ; GFX11-FLATSCR-NEXT: scratch_load_b32 v14, v37, off ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v32, 0x3f3d349c :: v_dual_mov_b32 v5, v15 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, v0 @@ -3077,7 +3070,8 @@ define amdgpu_cs float @cs_main(i32 %idx) { ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v24, v19 :: v_dual_mov_b32 v35, v21 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:784 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v31, 0xbf5f2ee2 :: v_dual_mov_b32 v32, v6 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v6, v13 :: v_dual_mov_b32 v19, v2 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v6, v13 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v19, v2 
; GFX11-FLATSCR-NEXT: s_clause 0x4 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[33:36], off offset:768 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:752 @@ -3103,29 +3097,15 @@ define amdgpu_hs float @hs_main(i32 %idx) { ; SI-NEXT: s_mov_b32 s7, 0xe8f000 ; SI-NEXT: s_add_u32 s4, s4, s0 ; SI-NEXT: s_addc_u32 s5, s5, 0 -; SI-NEXT: v_mov_b32_e32 v9, 0xbe31934f -; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v9, 0xb7043519 -; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v9, 0xb702e758 ; SI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4 -; SI-NEXT: v_mov_b32_e32 v6, 0x3f638e37 -; SI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc -; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:256 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v9, 0x3e31934f -; SI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8 -; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c -; SI-NEXT: v_mov_b32_e32 v16, 0xbf638e39 -; SI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 -; SI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c -; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320 ; SI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e ; SI-NEXT: v_mov_b32_e32 v4, 0x3f523be1 ; SI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2 +; SI-NEXT: v_mov_b32_e32 v6, 0x3f638e37 +; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320 +; SI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316 +; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312 +; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308 ; SI-NEXT: buffer_store_dword v6, off, s[4:7], 0 offset:304 ; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:300 ; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:296 @@ -3133,8 +3113,27 @@ define amdgpu_hs float @hs_main(i32 %idx) { ; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:288 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3 +; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f +; SI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc +; SI-NEXT: v_mov_b32_e32 v9, 0xbe31934f +; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280 ; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:276 ; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:272 +; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v9, 0xb7043519 +; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v9, 0xb702e758 +; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:256 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v9, 0x3e31934f +; SI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8 +; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v8, 0xbe319356 ; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:252 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v9, 0x3e319356 @@ -3144,40 +3143,49 @@ define amdgpu_hs float @hs_main(i32 %idx) { ; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:236 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f +; SI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 ; SI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e ; SI-NEXT: v_mov_b32_e32 v14, 0xbf523be3 ; SI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 +; SI-NEXT: v_mov_b32_e32 v16, 0xbf638e39 +; SI-NEXT: v_and_b32_e32 v0, 0x1fc, v0 +; 
SI-NEXT: s_mov_b32 s0, 0 +; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268 +; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248 +; SI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240 +; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232 +; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228 +; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224 +; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:220 +; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216 ; SI-NEXT: buffer_store_dword v16, off, s[4:7], 0 offset:212 ; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:208 ; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:204 ; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:200 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x200, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0 +; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:196 +; SI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 +; SI-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen ; SI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:832 ; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:828 ; SI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:824 ; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:820 ; SI-NEXT: s_waitcnt expcnt(3) ; SI-NEXT: v_mov_b32_e32 v17, 0x3703c499 -; SI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:796 -; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:792 -; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:788 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_mov_b32_e32 v18, 0xbf523be1 -; SI-NEXT: v_and_b32_e32 v0, 0x1fc, v0 -; SI-NEXT: s_mov_b32 s0, 0 -; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308 -; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f -; SI-NEXT: v_mov_b32_e32 v8, 0xbe319356 -; SI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 -; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:220 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 +; SI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c ; SI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:816 ; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:812 ; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:808 ; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:804 ; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:800 +; SI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:796 +; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:792 +; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:788 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mov_b32_e32 v18, 0xbf523be1 ; SI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:784 ; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:780 ; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:776 @@ -3186,19 +3194,6 @@ define amdgpu_hs float @hs_main(i32 %idx) { ; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:764 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x200, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0 -; SI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316 -; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312 -; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280 -; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268 -; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248 -; SI-NEXT: buffer_store_dword v10, off, 
s[4:7], 0 offset:240 -; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232 -; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228 -; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224 -; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216 -; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:196 ; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:760 ; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:756 ; SI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:752 @@ -3213,7 +3208,6 @@ define amdgpu_hs float @hs_main(i32 %idx) { ; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:716 ; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:712 ; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:708 -; SI-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen ; SI-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_f32_e32 v0, v0, v1 @@ -3228,70 +3222,84 @@ define amdgpu_hs float @hs_main(i32 %idx) { ; VI-NEXT: s_mov_b32 s7, 0xe80000 ; VI-NEXT: s_add_u32 s4, s4, s0 ; VI-NEXT: s_addc_u32 s5, s5, 0 -; VI-NEXT: v_mov_b32_e32 v9, 0xbe31934f -; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264 -; VI-NEXT: v_mov_b32_e32 v9, 0xb7043519 -; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260 -; VI-NEXT: v_mov_b32_e32 v9, 0xb702e758 ; VI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4 -; VI-NEXT: v_mov_b32_e32 v6, 0x3f638e37 -; VI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc -; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:256 -; VI-NEXT: v_mov_b32_e32 v9, 0x3e31934f -; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8 -; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c -; VI-NEXT: v_mov_b32_e32 v16, 0xbf638e39 -; VI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 -; VI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c -; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320 ; VI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e ; VI-NEXT: v_mov_b32_e32 v4, 0x3f523be1 ; VI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2 +; VI-NEXT: v_mov_b32_e32 v6, 0x3f638e37 +; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320 +; VI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316 +; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312 +; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308 ; VI-NEXT: buffer_store_dword v6, off, s[4:7], 0 offset:304 ; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:300 ; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:296 ; VI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:292 ; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:288 ; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3 +; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284 +; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f +; VI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc +; VI-NEXT: v_mov_b32_e32 v9, 0xbe31934f +; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280 ; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:276 ; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:272 +; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264 +; VI-NEXT: v_mov_b32_e32 v9, 0xb7043519 +; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260 +; VI-NEXT: v_mov_b32_e32 v9, 0xb702e758 +; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:256 +; VI-NEXT: v_mov_b32_e32 v9, 0x3e31934f +; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8 +; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c +; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; VI-NEXT: v_mov_b32_e32 v8, 0xbe319356 ; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:252 ; VI-NEXT: v_mov_b32_e32 v9, 
0x3e319356 ; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:244 ; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29dc ; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:236 ; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f +; VI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 ; VI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e ; VI-NEXT: v_mov_b32_e32 v14, 0xbf523be3 ; VI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 +; VI-NEXT: v_mov_b32_e32 v16, 0xbf638e39 +; VI-NEXT: v_and_b32_e32 v0, 0x1fc, v0 +; VI-NEXT: s_mov_b32 s0, 0 +; VI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268 +; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248 +; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240 +; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232 +; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228 +; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224 +; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:220 +; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216 ; VI-NEXT: buffer_store_dword v16, off, s[4:7], 0 offset:212 ; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:208 ; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:204 ; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:200 +; VI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x200, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:196 +; VI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 +; VI-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen ; VI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:832 ; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:828 ; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:824 ; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:820 ; VI-NEXT: v_mov_b32_e32 v17, 0x3703c499 -; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:796 -; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:792 -; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:788 -; VI-NEXT: v_mov_b32_e32 v18, 0xbf523be1 -; VI-NEXT: v_and_b32_e32 v0, 0x1fc, v0 -; VI-NEXT: s_mov_b32 s0, 0 -; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308 -; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284 -; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f -; VI-NEXT: v_mov_b32_e32 v8, 0xbe319356 -; VI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 -; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:220 -; VI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 +; VI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c ; VI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:816 ; VI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:812 ; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:808 ; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:804 ; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:800 +; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:796 +; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:792 +; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:788 +; VI-NEXT: v_mov_b32_e32 v18, 0xbf523be1 ; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:784 ; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:780 ; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:776 @@ -3299,19 +3307,6 @@ define amdgpu_hs float @hs_main(i32 %idx) { ; VI-NEXT: buffer_store_dword v16, off, s[4:7], 0 offset:768 ; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:764 ; VI-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2 -; VI-NEXT: v_add_u32_e32 v1, vcc, 0x200, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 -; VI-NEXT: buffer_store_dword v3, off, 
s[4:7], 0 offset:316 -; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312 -; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280 -; VI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268 -; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248 -; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240 -; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232 -; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228 -; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224 -; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216 -; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:196 ; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:760 ; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:756 ; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:752 @@ -3326,7 +3321,6 @@ define amdgpu_hs float @hs_main(i32 %idx) { ; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:716 ; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:712 ; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:708 -; VI-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen ; VI-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_f32_e32 v0, v0, v1 @@ -3340,69 +3334,83 @@ define amdgpu_hs float @hs_main(i32 %idx) { ; GFX9-MUBUF-NEXT: s_mov_b32 s3, 0xe00000 ; GFX9-MUBUF-NEXT: s_add_u32 s0, s0, s5 ; GFX9-MUBUF-NEXT: s_addc_u32 s1, s1, 0 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbe31934f -; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:264 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb7043519 -; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:260 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb702e758 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v6, 0x3f638e37 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbeae29dc -; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:256 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e31934f -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89c -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3f3d349c -; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:320 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f3d349e ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f523be1 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v6, 0x3f638e37 +; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:320 +; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:316 +; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:312 +; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:308 ; GFX9-MUBUF-NEXT: buffer_store_dword v6, off, s[0:3], 0 offset:304 ; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:300 ; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:296 ; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:292 ; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:288 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3 +; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:284 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd89f +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbeae29dc +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbe31934f +; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:280 ; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:276 
; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:272 +; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:264 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb7043519 +; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:260 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb702e758 +; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:256 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e31934f +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89c +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbe319356 ; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:252 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e319356 ; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:244 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29dc ; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:236 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89f +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v13, 0xbf3d349e ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39 +; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:268 +; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:248 +; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:240 +; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:232 +; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:228 +; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:224 +; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220 +; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216 ; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212 ; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208 ; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204 ; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:200 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 +; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 +; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:196 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 +; GFX9-MUBUF-NEXT: v_add_u32_e32 v1, 0x200, v0 +; GFX9-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX9-MUBUF-NEXT: s_nop 0 ; GFX9-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:832 ; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:828 ; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:824 ; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:820 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3703c499 -; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:796 -; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:792 -; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:788 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0xbf523be1 -; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 -; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:308 -; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:284 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd89f -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbe319356 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 -; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3f3d349c ; GFX9-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 
offset:816 ; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:812 ; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:808 ; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:804 ; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:800 +; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:796 +; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:792 +; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:788 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0xbf523be1 ; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:784 ; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:780 ; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776 @@ -3410,18 +3418,6 @@ define amdgpu_hs float @hs_main(i32 %idx) { ; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768 ; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:764 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2 -; GFX9-MUBUF-NEXT: v_add_u32_e32 v1, 0x200, v0 -; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:316 -; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:312 -; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:280 -; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:268 -; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:248 -; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:240 -; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:232 -; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:228 -; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:224 -; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216 -; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:196 ; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:760 ; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:756 ; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:752 @@ -3436,8 +3432,6 @@ define amdgpu_hs float @hs_main(i32 %idx) { ; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:716 ; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:712 ; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:708 -; GFX9-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen -; GFX9-MUBUF-NEXT: s_nop 0 ; GFX9-MUBUF-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen ; GFX9-MUBUF-NEXT: s_waitcnt vmcnt(0) ; GFX9-MUBUF-NEXT: v_add_f32_e32 v0, v0, v1 @@ -3465,6 +3459,7 @@ define amdgpu_hs float @hs_main(i32 %idx) { ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:308 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:304 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:300 +; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:292 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:288 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:284 @@ -3486,6 +3481,7 @@ define amdgpu_hs float @hs_main(i32 %idx) { ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3eae29dc ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3efcd89c ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v13, 0x3efcd89f +; GFX10_W32-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:248 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:244 ; GFX10_W32-MUBUF-NEXT: 
buffer_store_dword v11, off, s[0:3], 0 offset:240 @@ -3493,23 +3489,24 @@ define amdgpu_hs float @hs_main(i32 %idx) { ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:232 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v10, 0xbf20e7f5 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 -; GFX10_W32-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf3d349e ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39 +; GFX10_W32-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:228 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:224 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212 +; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4 -; GFX10_W32-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 -; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5 -; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:200 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:196 +; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499 +; GFX10_W32-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0 +; GFX10_W32-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:832 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:828 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:824 @@ -3517,8 +3514,9 @@ define amdgpu_hs float @hs_main(i32 %idx) { ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:816 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:812 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c -; GFX10_W32-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1 +; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808 +; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:800 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:796 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:792 @@ -3526,9 +3524,6 @@ define amdgpu_hs float @hs_main(i32 %idx) { ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v19, off, s[0:3], 0 offset:784 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:780 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2 -; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296 -; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808 -; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:772 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768 @@ -3547,8 +3542,6 @@ define amdgpu_hs float @hs_main(i32 %idx) { ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:716 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, 
s[0:3], 0 offset:712 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:708 -; GFX10_W32-MUBUF-NEXT: s_clause 0x1 -; GFX10_W32-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen ; GFX10_W32-MUBUF-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen ; GFX10_W32-MUBUF-NEXT: s_waitcnt vmcnt(0) ; GFX10_W32-MUBUF-NEXT: v_add_f32_e32 v0, v0, v1 @@ -3576,6 +3569,7 @@ define amdgpu_hs float @hs_main(i32 %idx) { ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:308 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:304 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:300 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:292 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:288 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:284 @@ -3597,6 +3591,7 @@ define amdgpu_hs float @hs_main(i32 %idx) { ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3eae29dc ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3efcd89c ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v13, 0x3efcd89f +; GFX10_W64-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:248 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:244 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:240 @@ -3604,23 +3599,24 @@ define amdgpu_hs float @hs_main(i32 %idx) { ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:232 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v10, 0xbf20e7f5 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 -; GFX10_W64-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf3d349e ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39 +; GFX10_W64-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:228 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:224 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4 -; GFX10_W64-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:200 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:196 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499 +; GFX10_W64-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0 +; GFX10_W64-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:832 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:828 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:824 @@ -3628,8 +3624,9 @@ define amdgpu_hs float @hs_main(i32 %idx) { ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:816 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:812 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 
0x3f3d349c -; GFX10_W64-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:800 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:796 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:792 @@ -3637,9 +3634,6 @@ define amdgpu_hs float @hs_main(i32 %idx) { ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v19, off, s[0:3], 0 offset:784 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:780 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:772 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768 @@ -3658,8 +3652,6 @@ define amdgpu_hs float @hs_main(i32 %idx) { ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:716 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:712 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:708 -; GFX10_W64-MUBUF-NEXT: s_clause 0x1 -; GFX10_W64-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen ; GFX10_W64-MUBUF-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen ; GFX10_W64-MUBUF-NEXT: s_waitcnt vmcnt(0) ; GFX10_W64-MUBUF-NEXT: v_add_f32_e32 v0, v0, v1 @@ -4048,43 +4040,42 @@ define amdgpu_hs float @hs_main(i32 %idx) { ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v23, v21 :: v_dual_mov_b32 v8, 0x3f3d349e ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3f523be1 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v24, 0xbf523be3 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v26, v17 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v12, 0xbe319356 :: v_dual_mov_b32 v31, v19 +; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v2, v8 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v6, 0x3f5f2ee2 :: v_dual_mov_b32 v3, v7 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f638e37 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v15, 0x3e319356 ; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v24 :: v_dual_mov_b32 v4, v6 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v11, 0xbe31934f :: v_dual_mov_b32 v4, v6 ; GFX11-FLATSCR-NEXT: s_clause 0x1 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[5:8], off offset:304 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:288 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, 0xbefcd89f :: v_dual_mov_b32 v1, v0 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, 0xbefcd89f :: v_dual_mov_b32 v27, v24 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, v0 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v13, 0x3eae29dc :: v_dual_mov_b32 v34, v5 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v25, 0x3f20e7f5 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, 0xb702e758 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v11, 0xbe31934f :: v_dual_mov_b32 v36, v6 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v9, 0xb702e758 :: v_dual_mov_b32 v36, v6 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, 0xb7043519 :: v_dual_mov_b32 v29, v15 ; 
GFX11-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf20e7f5 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3eae29d8 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v16, 0x3e31934f +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v25, 0x3f20e7f5 :: v_dual_mov_b32 v26, v17 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, 0x3efcd89c ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v30, v13 ; GFX11-FLATSCR-NEXT: s_clause 0x1 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[0:3], off offset:272 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[9:12], off offset:256 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4 -; GFX11-FLATSCR-NEXT: s_clause 0x1 -; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[21:24], off offset:208 -; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:192 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v20, 0x3efcd89c -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v33, v22 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, v18 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, v2 :: v_dual_mov_b32 v11, v0 -; GFX11-FLATSCR-NEXT: s_clause 0x1 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v33, v22 +; GFX11-FLATSCR-NEXT: s_clause 0x3 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[13:16], off offset:240 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[17:20], off offset:224 +; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[21:24], off offset:208 +; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:192 ; GFX11-FLATSCR-NEXT: scratch_load_b32 v14, v37, off ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v32, 0x3f3d349c :: v_dual_mov_b32 v5, v15 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, v0 @@ -4099,7 +4090,8 @@ define amdgpu_hs float @hs_main(i32 %idx) { ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v24, v19 :: v_dual_mov_b32 v35, v21 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:784 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v31, 0xbf5f2ee2 :: v_dual_mov_b32 v32, v6 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v6, v13 :: v_dual_mov_b32 v19, v2 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v6, v13 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v19, v2 ; GFX11-FLATSCR-NEXT: s_clause 0x4 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[33:36], off offset:768 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:752 @@ -4125,29 +4117,15 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; SI-NEXT: s_mov_b32 s7, 0xe8f000 ; SI-NEXT: s_add_u32 s4, s4, s0 ; SI-NEXT: s_addc_u32 s5, s5, 0 -; SI-NEXT: v_mov_b32_e32 v9, 0xbe31934f -; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v9, 0xb7043519 -; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v9, 0xb702e758 ; SI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4 -; SI-NEXT: v_mov_b32_e32 v6, 0x3f638e37 -; SI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc -; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:256 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v9, 0x3e31934f -; SI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8 -; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c -; SI-NEXT: v_mov_b32_e32 v16, 0xbf638e39 -; SI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 -; SI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c -; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320 ; SI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e ; SI-NEXT: v_mov_b32_e32 v4, 0x3f523be1 ; SI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2 +; SI-NEXT: v_mov_b32_e32 v6, 0x3f638e37 +; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320 +; SI-NEXT: 
buffer_store_dword v3, off, s[4:7], 0 offset:316 +; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312 +; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308 ; SI-NEXT: buffer_store_dword v6, off, s[4:7], 0 offset:304 ; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:300 ; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:296 @@ -4155,8 +4133,27 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:288 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3 +; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f +; SI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc +; SI-NEXT: v_mov_b32_e32 v9, 0xbe31934f +; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280 ; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:276 ; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:272 +; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v9, 0xb7043519 +; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v9, 0xb702e758 +; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:256 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v9, 0x3e31934f +; SI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8 +; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v8, 0xbe319356 ; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:252 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v9, 0x3e319356 @@ -4166,40 +4163,49 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:236 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f +; SI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 ; SI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e ; SI-NEXT: v_mov_b32_e32 v14, 0xbf523be3 ; SI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 +; SI-NEXT: v_mov_b32_e32 v16, 0xbf638e39 +; SI-NEXT: v_and_b32_e32 v0, 0x1fc, v0 +; SI-NEXT: s_mov_b32 s0, 0 +; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268 +; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248 +; SI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240 +; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232 +; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228 +; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224 +; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:220 +; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216 ; SI-NEXT: buffer_store_dword v16, off, s[4:7], 0 offset:212 ; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:208 ; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:204 ; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:200 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x200, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0 +; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:196 +; SI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 +; SI-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen ; SI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:832 ; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:828 ; SI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:824 ; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:820 ; SI-NEXT: s_waitcnt expcnt(3) ; SI-NEXT: v_mov_b32_e32 v17, 0x3703c499 -; SI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:796 
-; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:792 -; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:788 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_mov_b32_e32 v18, 0xbf523be1 -; SI-NEXT: v_and_b32_e32 v0, 0x1fc, v0 -; SI-NEXT: s_mov_b32 s0, 0 -; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308 -; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f -; SI-NEXT: v_mov_b32_e32 v8, 0xbe319356 -; SI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 -; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:220 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 +; SI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c ; SI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:816 ; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:812 ; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:808 ; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:804 ; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:800 +; SI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:796 +; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:792 +; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:788 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mov_b32_e32 v18, 0xbf523be1 ; SI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:784 ; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:780 ; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:776 @@ -4208,19 +4214,6 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:764 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x200, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0 -; SI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316 -; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312 -; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280 -; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268 -; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248 -; SI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240 -; SI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232 -; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228 -; SI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224 -; SI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216 -; SI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:196 ; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:760 ; SI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:756 ; SI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:752 @@ -4235,7 +4228,6 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:716 ; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:712 ; SI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:708 -; SI-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen ; SI-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_f32_e32 v0, v0, v1 @@ -4250,70 +4242,84 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; VI-NEXT: s_mov_b32 s7, 0xe80000 ; VI-NEXT: s_add_u32 s4, s4, s0 ; VI-NEXT: s_addc_u32 s5, s5, 0 -; VI-NEXT: v_mov_b32_e32 v9, 0xbe31934f -; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264 -; VI-NEXT: v_mov_b32_e32 v9, 0xb7043519 -; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260 -; VI-NEXT: v_mov_b32_e32 v9, 0xb702e758 ; VI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4 -; VI-NEXT: v_mov_b32_e32 v6, 0x3f638e37 -; VI-NEXT: v_mov_b32_e32 v7, 
0xbeae29dc -; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:256 -; VI-NEXT: v_mov_b32_e32 v9, 0x3e31934f -; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8 -; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c -; VI-NEXT: v_mov_b32_e32 v16, 0xbf638e39 -; VI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 -; VI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c -; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320 ; VI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e ; VI-NEXT: v_mov_b32_e32 v4, 0x3f523be1 ; VI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2 +; VI-NEXT: v_mov_b32_e32 v6, 0x3f638e37 +; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:320 +; VI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316 +; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312 +; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308 ; VI-NEXT: buffer_store_dword v6, off, s[4:7], 0 offset:304 ; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:300 ; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:296 ; VI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:292 ; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:288 ; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3 +; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284 +; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f +; VI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc +; VI-NEXT: v_mov_b32_e32 v9, 0xbe31934f +; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280 ; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:276 ; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:272 +; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:264 +; VI-NEXT: v_mov_b32_e32 v9, 0xb7043519 +; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:260 +; VI-NEXT: v_mov_b32_e32 v9, 0xb702e758 +; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:256 +; VI-NEXT: v_mov_b32_e32 v9, 0x3e31934f +; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8 +; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c +; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; VI-NEXT: v_mov_b32_e32 v8, 0xbe319356 ; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:252 ; VI-NEXT: v_mov_b32_e32 v9, 0x3e319356 ; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:244 ; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29dc ; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:236 ; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f +; VI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 ; VI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e ; VI-NEXT: v_mov_b32_e32 v14, 0xbf523be3 ; VI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 +; VI-NEXT: v_mov_b32_e32 v16, 0xbf638e39 +; VI-NEXT: v_and_b32_e32 v0, 0x1fc, v0 +; VI-NEXT: s_mov_b32 s0, 0 +; VI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268 +; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248 +; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240 +; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232 +; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228 +; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224 +; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:220 +; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216 ; VI-NEXT: buffer_store_dword v16, off, s[4:7], 0 offset:212 ; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:208 ; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:204 ; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:200 +; VI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x200, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:196 +; VI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 
+; VI-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen ; VI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:832 ; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:828 ; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:824 ; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:820 ; VI-NEXT: v_mov_b32_e32 v17, 0x3703c499 -; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:796 -; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:792 -; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:788 -; VI-NEXT: v_mov_b32_e32 v18, 0xbf523be1 -; VI-NEXT: v_and_b32_e32 v0, 0x1fc, v0 -; VI-NEXT: s_mov_b32 s0, 0 -; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:308 -; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:284 -; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f -; VI-NEXT: v_mov_b32_e32 v8, 0xbe319356 -; VI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 -; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:220 -; VI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 +; VI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c ; VI-NEXT: buffer_store_dword v17, off, s[4:7], 0 offset:816 ; VI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:812 ; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:808 ; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:804 ; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:800 +; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:796 +; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:792 +; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:788 +; VI-NEXT: v_mov_b32_e32 v18, 0xbf523be1 ; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:784 ; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:780 ; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:776 @@ -4321,19 +4327,6 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; VI-NEXT: buffer_store_dword v16, off, s[4:7], 0 offset:768 ; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:764 ; VI-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2 -; VI-NEXT: v_add_u32_e32 v1, vcc, 0x200, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 -; VI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:316 -; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:312 -; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:280 -; VI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:268 -; VI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:248 -; VI-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:240 -; VI-NEXT: buffer_store_dword v11, off, s[4:7], 0 offset:232 -; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:228 -; VI-NEXT: buffer_store_dword v13, off, s[4:7], 0 offset:224 -; VI-NEXT: buffer_store_dword v15, off, s[4:7], 0 offset:216 -; VI-NEXT: buffer_store_dword v14, off, s[4:7], 0 offset:196 ; VI-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:760 ; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:756 ; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 offset:752 @@ -4348,7 +4341,6 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; VI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:716 ; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:712 ; VI-NEXT: buffer_store_dword v12, off, s[4:7], 0 offset:708 -; VI-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen ; VI-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_f32_e32 v0, v0, v1 @@ -4362,69 +4354,83 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; GFX9-MUBUF-NEXT: s_mov_b32 s3, 0xe00000 ; GFX9-MUBUF-NEXT: s_add_u32 s0, s0, s5 ; GFX9-MUBUF-NEXT: s_addc_u32 s1, s1, 0 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 
v9, 0xbe31934f -; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:264 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb7043519 -; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:260 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb702e758 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v6, 0x3f638e37 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbeae29dc -; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:256 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e31934f -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89c -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3f3d349c -; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:320 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f3d349e ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f523be1 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v6, 0x3f638e37 +; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:320 +; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:316 +; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:312 +; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:308 ; GFX9-MUBUF-NEXT: buffer_store_dword v6, off, s[0:3], 0 offset:304 ; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:300 ; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:296 ; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:292 ; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:288 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3 +; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:284 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd89f +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbeae29dc +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbe31934f +; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:280 ; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:276 ; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:272 +; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:264 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb7043519 +; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:260 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb702e758 +; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:256 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e31934f +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89c +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbe319356 ; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:252 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e319356 ; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:244 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29dc ; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:236 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89f +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v13, 0xbf3d349e ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39 +; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:268 +; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:248 +; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:240 +; GFX9-MUBUF-NEXT: 
buffer_store_dword v11, off, s[0:3], 0 offset:232 +; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:228 +; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:224 +; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220 +; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216 ; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212 ; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208 ; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204 ; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:200 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 +; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 +; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:196 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 +; GFX9-MUBUF-NEXT: v_add_u32_e32 v1, 0x200, v0 +; GFX9-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX9-MUBUF-NEXT: s_nop 0 ; GFX9-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:832 ; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:828 ; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:824 ; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:820 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3703c499 -; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:796 -; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:792 -; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:788 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0xbf523be1 -; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 -; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:308 -; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:284 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd89f -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbe319356 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 -; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3f3d349c ; GFX9-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:816 ; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:812 ; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:808 ; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:804 ; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:800 +; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:796 +; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:792 +; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:788 +; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0xbf523be1 ; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:784 ; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:780 ; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776 @@ -4432,18 +4438,6 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768 ; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:764 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2 -; GFX9-MUBUF-NEXT: v_add_u32_e32 v1, 0x200, v0 -; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:316 -; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:312 -; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:280 -; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:268 -; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:248 -; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, 
s[0:3], 0 offset:240 -; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:232 -; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:228 -; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:224 -; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216 -; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:196 ; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:760 ; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:756 ; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:752 @@ -4458,8 +4452,6 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:716 ; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:712 ; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:708 -; GFX9-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen -; GFX9-MUBUF-NEXT: s_nop 0 ; GFX9-MUBUF-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen ; GFX9-MUBUF-NEXT: s_waitcnt vmcnt(0) ; GFX9-MUBUF-NEXT: v_add_f32_e32 v0, v0, v1 @@ -4487,6 +4479,7 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:308 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:304 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:300 +; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:292 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:288 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:284 @@ -4508,6 +4501,7 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3eae29dc ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3efcd89c ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v13, 0x3efcd89f +; GFX10_W32-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:248 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:244 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:240 @@ -4515,23 +4509,24 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:232 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v10, 0xbf20e7f5 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 -; GFX10_W32-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf3d349e ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39 +; GFX10_W32-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:228 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:224 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212 +; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4 -; GFX10_W32-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 -; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5 -; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:200 ; 
GFX10_W32-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:196 +; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499 +; GFX10_W32-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0 +; GFX10_W32-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:832 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:828 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:824 @@ -4539,8 +4534,9 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:816 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:812 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c -; GFX10_W32-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1 +; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808 +; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:800 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:796 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:792 @@ -4548,9 +4544,6 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v19, off, s[0:3], 0 offset:784 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:780 ; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2 -; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296 -; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808 -; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:772 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768 @@ -4569,8 +4562,6 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:716 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:712 ; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:708 -; GFX10_W32-MUBUF-NEXT: s_clause 0x1 -; GFX10_W32-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen ; GFX10_W32-MUBUF-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen ; GFX10_W32-MUBUF-NEXT: s_waitcnt vmcnt(0) ; GFX10_W32-MUBUF-NEXT: v_add_f32_e32 v0, v0, v1 @@ -4598,6 +4589,7 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:308 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:304 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:300 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:292 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:288 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:284 @@ -4619,6 +4611,7 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3eae29dc ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3efcd89c ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v13, 0x3efcd89f +; GFX10_W64-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:248 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:244 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 
offset:240 @@ -4626,23 +4619,24 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:232 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v10, 0xbf20e7f5 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 -; GFX10_W64-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf3d349e ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39 +; GFX10_W64-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:228 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:224 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4 -; GFX10_W64-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5 -; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:200 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:196 +; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499 +; GFX10_W64-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0 +; GFX10_W64-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:832 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:828 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:824 @@ -4650,8 +4644,9 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:816 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:812 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c -; GFX10_W64-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808 +; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:800 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:796 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:792 @@ -4659,9 +4654,6 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v19, off, s[0:3], 0 offset:784 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:780 ; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808 -; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:772 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768 @@ -4680,8 +4672,6 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:716 ; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:712 ; 
GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:708 -; GFX10_W64-MUBUF-NEXT: s_clause 0x1 -; GFX10_W64-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen ; GFX10_W64-MUBUF-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen ; GFX10_W64-MUBUF-NEXT: s_waitcnt vmcnt(0) ; GFX10_W64-MUBUF-NEXT: v_add_f32_e32 v0, v0, v1 @@ -5070,43 +5060,42 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v23, v21 :: v_dual_mov_b32 v8, 0x3f3d349e ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3f523be1 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v24, 0xbf523be3 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v26, v17 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v12, 0xbe319356 :: v_dual_mov_b32 v31, v19 +; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v2, v8 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v6, 0x3f5f2ee2 :: v_dual_mov_b32 v3, v7 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f638e37 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v15, 0x3e319356 ; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v24 :: v_dual_mov_b32 v4, v6 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v11, 0xbe31934f :: v_dual_mov_b32 v4, v6 ; GFX11-FLATSCR-NEXT: s_clause 0x1 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[5:8], off offset:304 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:288 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, 0xbefcd89f :: v_dual_mov_b32 v1, v0 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, 0xbefcd89f :: v_dual_mov_b32 v27, v24 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, v0 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v13, 0x3eae29dc :: v_dual_mov_b32 v34, v5 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v25, 0x3f20e7f5 -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, 0xb702e758 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v11, 0xbe31934f :: v_dual_mov_b32 v36, v6 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v9, 0xb702e758 :: v_dual_mov_b32 v36, v6 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, 0xb7043519 :: v_dual_mov_b32 v29, v15 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf20e7f5 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3eae29d8 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v16, 0x3e31934f +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v25, 0x3f20e7f5 :: v_dual_mov_b32 v26, v17 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, 0x3efcd89c ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v30, v13 ; GFX11-FLATSCR-NEXT: s_clause 0x1 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[0:3], off offset:272 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[9:12], off offset:256 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4 -; GFX11-FLATSCR-NEXT: s_clause 0x1 -; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[21:24], off offset:208 -; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:192 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v20, 0x3efcd89c -; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v33, v22 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, v18 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, v2 :: v_dual_mov_b32 v11, v0 -; GFX11-FLATSCR-NEXT: s_clause 0x1 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v33, v22 +; GFX11-FLATSCR-NEXT: s_clause 0x3 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[13:16], off offset:240 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[17:20], off offset:224 +; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[21:24], off offset:208 +; GFX11-FLATSCR-NEXT: scratch_store_b128 off, 
v[24:27], off offset:192 ; GFX11-FLATSCR-NEXT: scratch_load_b32 v14, v37, off ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v32, 0x3f3d349c :: v_dual_mov_b32 v5, v15 ; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, v0 @@ -5121,7 +5110,8 @@ define amdgpu_gs float @gs_main(i32 %idx) { ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v24, v19 :: v_dual_mov_b32 v35, v21 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:784 ; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v31, 0xbf5f2ee2 :: v_dual_mov_b32 v32, v6 -; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v6, v13 :: v_dual_mov_b32 v19, v2 +; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v6, v13 +; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v19, v2 ; GFX11-FLATSCR-NEXT: s_clause 0x4 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[33:36], off offset:768 ; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:752 @@ -5151,29 +5141,15 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; SI-NEXT: s_mov_b32 s11, 0xe8f000 ; SI-NEXT: s_add_u32 s8, s8, s6 ; SI-NEXT: s_addc_u32 s9, s9, 0 -; SI-NEXT: v_mov_b32_e32 v9, 0xbe31934f -; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:264 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v9, 0xb7043519 -; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:260 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v9, 0xb702e758 ; SI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4 -; SI-NEXT: v_mov_b32_e32 v6, 0x3f638e37 -; SI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc -; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:256 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v9, 0x3e31934f -; SI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8 -; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c -; SI-NEXT: v_mov_b32_e32 v16, 0xbf638e39 -; SI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 -; SI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c -; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:320 ; SI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e ; SI-NEXT: v_mov_b32_e32 v4, 0x3f523be1 ; SI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2 +; SI-NEXT: v_mov_b32_e32 v6, 0x3f638e37 +; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:320 +; SI-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:316 +; SI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:312 +; SI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:308 ; SI-NEXT: buffer_store_dword v6, off, s[8:11], 0 offset:304 ; SI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:300 ; SI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:296 @@ -5181,8 +5157,27 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:288 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3 +; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:284 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f +; SI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc +; SI-NEXT: v_mov_b32_e32 v9, 0xbe31934f +; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:280 ; SI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:276 ; SI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:272 +; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:264 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v9, 0xb7043519 +; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:260 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v9, 0xb702e758 +; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:256 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: 
v_mov_b32_e32 v9, 0x3e31934f +; SI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8 +; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v8, 0xbe319356 ; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:252 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v9, 0x3e319356 @@ -5192,40 +5187,49 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; SI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:236 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f +; SI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 ; SI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e ; SI-NEXT: v_mov_b32_e32 v14, 0xbf523be3 ; SI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 +; SI-NEXT: v_mov_b32_e32 v16, 0xbf638e39 +; SI-NEXT: v_and_b32_e32 v0, 0x1fc, v0 +; SI-NEXT: s_mov_b32 s0, 0 +; SI-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:268 +; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:248 +; SI-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:240 +; SI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:232 +; SI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:228 +; SI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:224 +; SI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:220 +; SI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:216 ; SI-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:212 ; SI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:208 ; SI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:204 ; SI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:200 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x200, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0 +; SI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:196 +; SI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 +; SI-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen ; SI-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:832 ; SI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:828 ; SI-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:824 ; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:820 ; SI-NEXT: s_waitcnt expcnt(3) ; SI-NEXT: v_mov_b32_e32 v17, 0x3703c499 -; SI-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:796 -; SI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:792 -; SI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:788 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_mov_b32_e32 v18, 0xbf523be1 -; SI-NEXT: v_and_b32_e32 v0, 0x1fc, v0 -; SI-NEXT: s_mov_b32 s0, 0 -; SI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:308 -; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:284 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f -; SI-NEXT: v_mov_b32_e32 v8, 0xbe319356 -; SI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 -; SI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:220 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 +; SI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c ; SI-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:816 ; SI-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:812 ; SI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:808 ; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:804 ; SI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:800 +; SI-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:796 +; SI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:792 +; SI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:788 +; SI-NEXT: s_waitcnt expcnt(2) +; 
SI-NEXT: v_mov_b32_e32 v18, 0xbf523be1 ; SI-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:784 ; SI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:780 ; SI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:776 @@ -5234,19 +5238,6 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; SI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:764 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x200, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0 -; SI-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:316 -; SI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:312 -; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:280 -; SI-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:268 -; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:248 -; SI-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:240 -; SI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:232 -; SI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:228 -; SI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:224 -; SI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:216 -; SI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:196 ; SI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:760 ; SI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:756 ; SI-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:752 @@ -5261,7 +5252,6 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; SI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:716 ; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:712 ; SI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:708 -; SI-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen ; SI-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen ; SI-NEXT: s_mov_b32 s2, s5 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -5277,70 +5267,84 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; VI-NEXT: s_mov_b32 s11, 0xe80000 ; VI-NEXT: s_add_u32 s8, s8, s6 ; VI-NEXT: s_addc_u32 s9, s9, 0 -; VI-NEXT: v_mov_b32_e32 v9, 0xbe31934f -; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:264 -; VI-NEXT: v_mov_b32_e32 v9, 0xb7043519 -; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:260 -; VI-NEXT: v_mov_b32_e32 v9, 0xb702e758 ; VI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4 -; VI-NEXT: v_mov_b32_e32 v6, 0x3f638e37 -; VI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc -; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:256 -; VI-NEXT: v_mov_b32_e32 v9, 0x3e31934f -; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8 -; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c -; VI-NEXT: v_mov_b32_e32 v16, 0xbf638e39 -; VI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 -; VI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c -; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:320 ; VI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e ; VI-NEXT: v_mov_b32_e32 v4, 0x3f523be1 ; VI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2 +; VI-NEXT: v_mov_b32_e32 v6, 0x3f638e37 +; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:320 +; VI-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:316 +; VI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:312 +; VI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:308 ; VI-NEXT: buffer_store_dword v6, off, s[8:11], 0 offset:304 ; VI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:300 ; VI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:296 ; VI-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:292 ; VI-NEXT: buffer_store_dword v2, off, 
s[8:11], 0 offset:288 ; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3 +; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:284 +; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f +; VI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc +; VI-NEXT: v_mov_b32_e32 v9, 0xbe31934f +; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:280 ; VI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:276 ; VI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:272 +; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:264 +; VI-NEXT: v_mov_b32_e32 v9, 0xb7043519 +; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:260 +; VI-NEXT: v_mov_b32_e32 v9, 0xb702e758 +; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:256 +; VI-NEXT: v_mov_b32_e32 v9, 0x3e31934f +; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8 +; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c +; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; VI-NEXT: v_mov_b32_e32 v8, 0xbe319356 ; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:252 ; VI-NEXT: v_mov_b32_e32 v9, 0x3e319356 ; VI-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:244 ; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29dc ; VI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:236 ; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f +; VI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 ; VI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e ; VI-NEXT: v_mov_b32_e32 v14, 0xbf523be3 ; VI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3 -; VI-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:212 -; VI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:208 -; VI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:204 -; VI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:200 -; VI-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:832 -; VI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:828 -; VI-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:824 -; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:820 -; VI-NEXT: v_mov_b32_e32 v17, 0x3703c499 -; VI-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:796 -; VI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:792 -; VI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:788 -; VI-NEXT: v_mov_b32_e32 v18, 0xbf523be1 +; VI-NEXT: v_mov_b32_e32 v16, 0xbf638e39 ; VI-NEXT: v_and_b32_e32 v0, 0x1fc, v0 ; VI-NEXT: s_mov_b32 s0, 0 -; VI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:308 -; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:284 -; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f -; VI-NEXT: v_mov_b32_e32 v8, 0xbe319356 -; VI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5 +; VI-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:268 +; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:248 +; VI-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:240 +; VI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:232 +; VI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:228 +; VI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:224 ; VI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:220 +; VI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:216 +; VI-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:212 +; VI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:208 +; VI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:204 +; VI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:200 ; VI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x200, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:196 +; VI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4 +; VI-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen +; 
VI-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:832 +; VI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:828 +; VI-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:824 +; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:820 +; VI-NEXT: v_mov_b32_e32 v17, 0x3703c499 +; VI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c ; VI-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:816 ; VI-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:812 ; VI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:808 ; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:804 ; VI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:800 +; VI-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:796 +; VI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:792 +; VI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:788 +; VI-NEXT: v_mov_b32_e32 v18, 0xbf523be1 ; VI-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:784 ; VI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:780 ; VI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:776 @@ -5348,19 +5352,6 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; VI-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:768 ; VI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:764 ; VI-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2 -; VI-NEXT: v_add_u32_e32 v1, vcc, 0x200, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 -; VI-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:316 -; VI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:312 -; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:280 -; VI-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:268 -; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:248 -; VI-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:240 -; VI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:232 -; VI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:228 -; VI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:224 -; VI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:216 -; VI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:196 ; VI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:760 ; VI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:756 ; VI-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:752 @@ -5375,7 +5366,6 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; VI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:716 ; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:712 ; VI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:708 -; VI-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen ; VI-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen ; VI-NEXT: s_mov_b32 s2, s5 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -5390,69 +5380,83 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; GFX9-MUBUF-NEXT: s_mov_b32 s11, 0xe00000 ; GFX9-MUBUF-NEXT: s_add_u32 s8, s8, s5 ; GFX9-MUBUF-NEXT: s_addc_u32 s9, s9, 0 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbe31934f -; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:264 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb7043519 -; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:260 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb702e758 ; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v6, 0x3f638e37 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbeae29dc -; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:256 -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e31934f -; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 
0x3eae29d8
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
-; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:320
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f523be1
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v6, 0x3f638e37
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:320
+; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:316
+; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:312
+; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:308
; GFX9-MUBUF-NEXT: buffer_store_dword v6, off, s[8:11], 0 offset:304
; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:300
; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:296
; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:292
; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:288
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:284
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbe31934f
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:280
; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:276
; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:272
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:264
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb7043519
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:260
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb702e758
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:256
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e31934f
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbe319356
; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:252
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e319356
; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:244
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29dc
; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:236
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89f
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v13, 0xbf3d349e
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:268
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:248
+; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:240
+; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:232
+; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:228
+; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:224
+; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:220
+; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:216
; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:212
; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:208
; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:204
; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:200
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
+; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:196
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
+; GFX9-MUBUF-NEXT: v_add_u32_e32 v1, 0x200, v0
+; GFX9-MUBUF-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen
+; GFX9-MUBUF-NEXT: s_nop 0
; GFX9-MUBUF-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:832
; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:828
; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:824
; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:820
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3703c499
-; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:796
-; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:792
-; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:788
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0xbf523be1
-; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
-; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:308
-; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:284
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbe319356
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
-; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:220
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
; GFX9-MUBUF-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:816
; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:812
; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:808
; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:804
; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:800
+; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:796
+; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:792
+; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:788
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0xbf523be1
; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:784
; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:780
; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:776
@@ -5460,18 +5464,6 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg,
; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:768
; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:764
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2
-; GFX9-MUBUF-NEXT: v_add_u32_e32 v1, 0x200, v0
-; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:316
-; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:312
-; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:280
-; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:268
-; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:248
-; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:240
-; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:232
-; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:228
-; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:224
-; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:216
-; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:196
; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:760
; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:756
; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:752
@@ -5486,9 +5478,8 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg,
; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:716
; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:712
; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:708
-; GFX9-MUBUF-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen
-; GFX9-MUBUF-NEXT: s_mov_b32 s2, s5
; GFX9-MUBUF-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen
+; GFX9-MUBUF-NEXT: s_mov_b32 s2, s5
; GFX9-MUBUF-NEXT: s_waitcnt vmcnt(0)
; GFX9-MUBUF-NEXT: v_add_f32_e32 v0, v0, v1
; GFX9-MUBUF-NEXT: ; return to shader part epilog
@@ -5500,10 +5491,10 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg,
; GFX10_W32-MUBUF-NEXT: s_mov_b32 s10, -1
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v2, 0x3f3d349e
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f5f2ee2
; GFX10_W32-MUBUF-NEXT: s_mov_b32 s11, 0x31c16000
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f523be1
; GFX10_W32-MUBUF-NEXT: s_add_u32 s8, s8, s5
-; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f5f2ee2
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbefcd8a3
; GFX10_W32-MUBUF-NEXT: s_addc_u32 s9, s9, 0
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f638e37
@@ -5514,6 +5505,8 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg,
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:312
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:308
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:304
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:300
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:296
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:292
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:288
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:284
@@ -5535,6 +5528,7 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg,
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3eae29dc
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3efcd89c
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v13, 0x3efcd89f
+; GFX10_W32-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:248
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:244
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:240
@@ -5542,25 +5536,24 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg,
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:232
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v10, 0xbf20e7f5
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
-; GFX10_W32-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf3d349e
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; GFX10_W32-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:228
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:224
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:220
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:216
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:212
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:208
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:204
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4
-; GFX10_W32-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
-; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5
-; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499
-; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:300
-; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:296
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:200
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:196
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499
+; GFX10_W32-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0
+; GFX10_W32-MUBUF-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:832
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:828
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:824
@@ -5568,8 +5561,9 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg,
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:816
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:812
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c
-; GFX10_W32-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:808
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:804
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:800
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:796
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:792
@@ -5577,8 +5571,6 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg,
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v19, off, s[8:11], 0 offset:784
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:780
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2
-; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:808
-; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:804
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:776
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:772
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:768
@@ -5597,8 +5589,6 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg,
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:716
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:712
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:708
-; GFX10_W32-MUBUF-NEXT: s_clause 0x1
-; GFX10_W32-MUBUF-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen
; GFX10_W32-MUBUF-NEXT: buffer_load_dword v1, v6, s[8:11], 0 offen
; GFX10_W32-MUBUF-NEXT: s_mov_b32 s2, s5
; GFX10_W32-MUBUF-NEXT: s_waitcnt vmcnt(0)
@@ -5612,10 +5602,10 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg,
; GFX10_W64-MUBUF-NEXT: s_mov_b32 s10, -1
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v2, 0x3f3d349e
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f5f2ee2
; GFX10_W64-MUBUF-NEXT: s_mov_b32 s11, 0x31e16000
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f523be1
; GFX10_W64-MUBUF-NEXT: s_add_u32 s8, s8, s5
-; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f5f2ee2
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbefcd8a3
; GFX10_W64-MUBUF-NEXT: s_addc_u32 s9, s9, 0
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f638e37
@@ -5626,6 +5616,8 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg,
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:312
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:308
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:304
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:300
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:296
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:292
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:288
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:284
@@ -5647,6 +5639,7 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg,
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3eae29dc
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3efcd89c
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v13, 0x3efcd89f
+; GFX10_W64-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:248
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:244
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:240
@@ -5654,25 +5647,24 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg,
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:232
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v10, 0xbf20e7f5
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
-; GFX10_W64-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf3d349e
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; GFX10_W64-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:228
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:224
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:220
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:216
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:212
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:208
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:204
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4
-; GFX10_W64-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
-; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5
-; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499
-; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:300
-; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:296
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:200
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:196
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499
+; GFX10_W64-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0
+; GFX10_W64-MUBUF-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:832
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:828
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:824
@@ -5680,8 +5672,9 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg,
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:816
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:812
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c
-; GFX10_W64-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:808
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:804
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:800
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:796
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:792
@@ -5689,8 +5682,6 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg,
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v19, off, s[8:11], 0 offset:784
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:780
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2
-; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:808
-; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:804
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:776
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:772
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:768
@@ -5709,8 +5700,6 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg,
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:716
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:712
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:708
-; GFX10_W64-MUBUF-NEXT: s_clause 0x1
-; GFX10_W64-MUBUF-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen
; GFX10_W64-MUBUF-NEXT: buffer_load_dword v1, v6, s[8:11], 0 offen
; GFX10_W64-MUBUF-NEXT: s_mov_b32 s2, s5
; GFX10_W64-MUBUF-NEXT: s_waitcnt vmcnt(0)
@@ -6104,10 +6093,10 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg,
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v0, 0xbeae29dc :: v_dual_mov_b32 v23, v21
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v6, 0x3f5f2ee2
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f638e37
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v24, 0xbf523be3
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:320
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, v8 :: v_dual_mov_b32 v3, v7
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v4, v6
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v24, 0xbf523be3
; GFX11-FLATSCR-NEXT: s_clause 0x1
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[5:8], off offset:304
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:288
@@ -6116,31 +6105,29 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg,
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v13, 0x3eae29dc :: v_dual_mov_b32 v34, v5
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v15, 0x3e319356 :: v_dual_mov_b32 v36, v6
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v25, 0x3f20e7f5 :: v_dual_mov_b32 v26, v17
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, 0xb702e758
-; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, 0xb7043519 :: v_dual_mov_b32 v29, v15
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v10, 0xb7043519
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v11, 0xbe31934f
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v12, 0xbe319356 :: v_dual_mov_b32 v31, v19
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf20e7f5
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3eae29d8
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v16, 0x3e31934f
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v25, 0x3f20e7f5 :: v_dual_mov_b32 v26, v17
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v20, 0x3efcd89c :: v_dual_mov_b32 v29, v15
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v30, v13
; GFX11-FLATSCR-NEXT: s_clause 0x1
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[0:3], off offset:272
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[9:12], off offset:256
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4
-; GFX11-FLATSCR-NEXT: s_clause 0x1
-; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[21:24], off offset:208
-; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:192
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v20, 0x3efcd89c
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v33, v22
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, v18
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, v2 :: v_dual_mov_b32 v11, v0
-; GFX11-FLATSCR-NEXT: s_clause 0x1
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v33, v22
+; GFX11-FLATSCR-NEXT: s_clause 0x3
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[13:16], off offset:240
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[17:20], off offset:224
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[21:24], off offset:208
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:192
; GFX11-FLATSCR-NEXT: scratch_load_b32 v14, v37, off
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v32, 0x3f3d349c :: v_dual_mov_b32 v5, v15
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, v0
@@ -6155,7 +6142,8 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg,
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v24, v19 :: v_dual_mov_b32 v35, v21
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:784
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v31, 0xbf5f2ee2 :: v_dual_mov_b32 v32, v6
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v6, v13 :: v_dual_mov_b32 v19, v2
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v6, v13
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v19, v2
; GFX11-FLATSCR-NEXT: s_clause 0x4
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[33:36], off offset:768
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:752
@@ -6184,29 +6172,15 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg,
; SI-NEXT: s_mov_b32 s11, 0xe8f000
; SI-NEXT: s_add_u32 s8, s8, s6
; SI-NEXT: s_addc_u32 s9, s9, 0
-; SI-NEXT: v_mov_b32_e32 v9, 0xbe31934f
-; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:264
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v9, 0xb7043519
-; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:260
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v9, 0xb702e758
; SI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4
-; SI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
-; SI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
-; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:256
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v9, 0x3e31934f
-; SI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
-; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
-; SI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
-; SI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
-; SI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
-; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:320
; SI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
; SI-NEXT: v_mov_b32_e32 v4, 0x3f523be1
; SI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
+; SI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
+; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:320
+; SI-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:316
+; SI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:312
+; SI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:308
; SI-NEXT: buffer_store_dword v6, off, s[8:11], 0 offset:304
; SI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:300
; SI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:296
@@ -6214,8 +6188,27 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg,
; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:288
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
+; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:284
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; SI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
+; SI-NEXT: v_mov_b32_e32 v9, 0xbe31934f
+; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:280
; SI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:276
; SI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:272
+; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:264
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v9, 0xb7043519
+; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:260
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v9, 0xb702e758
+; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:256
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v9, 0x3e31934f
+; SI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
+; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
+; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; SI-NEXT: v_mov_b32_e32 v8, 0xbe319356
; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:252
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v9, 0x3e319356
@@ -6225,40 +6218,49 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg,
; SI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:236
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f
+; SI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
; SI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e
; SI-NEXT: v_mov_b32_e32 v14, 0xbf523be3
; SI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
+; SI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; SI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; SI-NEXT: s_mov_b32 s0, 0
+; SI-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:268
+; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:248
+; SI-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:240
+; SI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:232
+; SI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:228
+; SI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:224
+; SI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:220
+; SI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:216
; SI-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:212
; SI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:208
; SI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:204
; SI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:200
-; SI-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:832
-; SI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:828
-; SI-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:824
-; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:820
-; SI-NEXT: s_waitcnt expcnt(3)
-; SI-NEXT: v_mov_b32_e32 v17, 0x3703c499
-; SI-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:796
-; SI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:792
-; SI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:788
-; SI-NEXT: s_waitcnt expcnt(2)
-; SI-NEXT: v_mov_b32_e32 v18, 0xbf523be1
-; SI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
-; SI-NEXT: s_mov_b32 s0, 0
-; SI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:308
-; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:284
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
-; SI-NEXT: v_mov_b32_e32 v8, 0xbe319356
-; SI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
-; SI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:220
-; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: s_waitcnt expcnt(1)
; SI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
+; SI-NEXT: v_add_i32_e32 v1, vcc, 0x200, v0
+; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0
+; SI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:196
+; SI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
+; SI-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen
+; SI-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:832
+; SI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:828
+; SI-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:824
+; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:820
+; SI-NEXT: s_waitcnt expcnt(3)
+; SI-NEXT: v_mov_b32_e32 v17, 0x3703c499
+; SI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
; SI-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:816
; SI-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:812
; SI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:808
; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:804
; SI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:800
+; SI-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:796
+; SI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:792
+; SI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:788
+; SI-NEXT: s_waitcnt expcnt(2)
+; SI-NEXT: v_mov_b32_e32 v18, 0xbf523be1
; SI-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:784
; SI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:780
; SI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:776
@@ -6267,19 +6269,6 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg,
; SI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:764
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2
-; SI-NEXT: v_add_i32_e32 v1, vcc, 0x200, v0
-; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0
-; SI-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:316
-; SI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:312
-; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:280
-; SI-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:268
-; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:248
-; SI-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:240
-; SI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:232
-; SI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:228
-; SI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:224
-; SI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:216
-; SI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:196
; SI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:760
; SI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:756
; SI-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:752
@@ -6294,7 +6283,6 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg,
; SI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:716
; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:712
; SI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:708
-; SI-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen
; SI-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen
; SI-NEXT: s_mov_b32 s2, s5
; SI-NEXT: s_waitcnt vmcnt(0)
@@ -6310,70 +6298,84 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg,
; VI-NEXT: s_mov_b32 s11, 0xe80000
; VI-NEXT: s_add_u32 s8, s8, s6
; VI-NEXT: s_addc_u32 s9, s9, 0
-; VI-NEXT: v_mov_b32_e32 v9, 0xbe31934f
-; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:264
-; VI-NEXT: v_mov_b32_e32 v9, 0xb7043519
-; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:260
-; VI-NEXT: v_mov_b32_e32 v9, 0xb702e758
; VI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4
-; VI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
-; VI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
-; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:256
-; VI-NEXT: v_mov_b32_e32 v9, 0x3e31934f
-; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
-; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
-; VI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
-; VI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
-; VI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
-; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:320
; VI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
; VI-NEXT: v_mov_b32_e32 v4, 0x3f523be1
; VI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
+; VI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
+; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:320
+; VI-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:316
+; VI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:312
+; VI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:308
; VI-NEXT: buffer_store_dword v6, off, s[8:11], 0 offset:304
; VI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:300
; VI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:296
; VI-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:292
; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:288
; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
+; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:284
+; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; VI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
+; VI-NEXT: v_mov_b32_e32 v9, 0xbe31934f
+; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:280
; VI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:276
; VI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:272
+; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:264
+; VI-NEXT: v_mov_b32_e32 v9, 0xb7043519
+; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:260
+; VI-NEXT: v_mov_b32_e32 v9, 0xb702e758
+; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:256
+; VI-NEXT: v_mov_b32_e32 v9, 0x3e31934f
+; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
+; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
+; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT: v_mov_b32_e32 v8, 0xbe319356
; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:252
; VI-NEXT: v_mov_b32_e32 v9, 0x3e319356
; VI-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:244
; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29dc
; VI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:236
; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f
+; VI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
; VI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e
; VI-NEXT: v_mov_b32_e32 v14, 0xbf523be3
; VI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
+; VI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; VI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; VI-NEXT: s_mov_b32 s0, 0
+; VI-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:268
+; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:248
+; VI-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:240
+; VI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:232
+; VI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:228
+; VI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:224
+; VI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:220
+; VI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:216
; VI-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:212
; VI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:208
; VI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:204
; VI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:200
+; VI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
+; VI-NEXT: v_add_u32_e32 v1, vcc, 0x200, v0
+; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:196
+; VI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
+; VI-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen
; VI-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:832
; VI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:828
; VI-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:824
; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:820
; VI-NEXT: v_mov_b32_e32 v17, 0x3703c499
-; VI-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:796
-; VI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:792
-; VI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:788
-; VI-NEXT: v_mov_b32_e32 v18, 0xbf523be1
-; VI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
-; VI-NEXT: s_mov_b32 s0, 0
-; VI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:308
-; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:284
-; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
-; VI-NEXT: v_mov_b32_e32 v8, 0xbe319356
-; VI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
-; VI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:220
-; VI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
+; VI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
; VI-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:816
; VI-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:812
; VI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:808
; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:804
; VI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:800
+; VI-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:796
+; VI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:792
+; VI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:788
+; VI-NEXT: v_mov_b32_e32 v18, 0xbf523be1
; VI-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:784
; VI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:780
; VI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:776
@@ -6381,19 +6383,6 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg,
; VI-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:768
; VI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:764
; VI-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2
-; VI-NEXT: v_add_u32_e32 v1, vcc, 0x200, v0
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
-; VI-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:316
-; VI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:312
-; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:280
-; VI-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:268
-; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:248
-; VI-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:240
-; VI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:232
-; VI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:228
-; VI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:224
-; VI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:216
-; VI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:196
; VI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:760
; VI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:756
; VI-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:752
@@ -6408,7 +6397,6 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg,
; VI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:716
; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:712
; VI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:708
-; VI-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen
; VI-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen
; VI-NEXT: s_mov_b32 s2, s5
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -6423,69 +6411,83 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg,
; GFX9-MUBUF-NEXT: s_mov_b32 s11, 0xe00000
; GFX9-MUBUF-NEXT: s_add_u32 s8, s8, s5
; GFX9-MUBUF-NEXT: s_addc_u32 s9, s9, 0
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbe31934f
-; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:264
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb7043519
-; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:260
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb702e758
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v6, 0x3f638e37
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
-; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:256
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e31934f
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
-; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:320
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f523be1
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v6, 0x3f638e37
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:320
+; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:316
+; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:312
+; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:308
; GFX9-MUBUF-NEXT: buffer_store_dword v6, off, s[8:11], 0 offset:304
; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:300
; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:296
; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:292
; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:288
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:284
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbe31934f
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:280
; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:276
; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:272
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:264
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb7043519
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:260
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb702e758
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:256
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e31934f
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbe319356
; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:252
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e319356
; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:244
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29dc
; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:236
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89f
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v13, 0xbf3d349e
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:268
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:248
+; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:240
+; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:232
+; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:228
+; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:224
+; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:220
+; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:216
; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:212
; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:208
; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:204
; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:200
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
+; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:196
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
+; GFX9-MUBUF-NEXT: v_add_u32_e32 v1, 0x200, v0
+; GFX9-MUBUF-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen
+; GFX9-MUBUF-NEXT: s_nop 0
; GFX9-MUBUF-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:832
; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:828
; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:824
; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:820
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3703c499
-; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:796
-; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:792
-; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:788
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0xbf523be1
-; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
-; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:308
-; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:284
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbe319356
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
-; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:220
-; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
; GFX9-MUBUF-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:816
; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:812
; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:808
; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:804
; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:800
+; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:796
+; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:792
+; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:788
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0xbf523be1
; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:784
; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:780
; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:776
@@ -6493,18 +6495,6 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg,
; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:768
; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:764
; GFX9-MUBUF-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2
-; GFX9-MUBUF-NEXT: v_add_u32_e32 v1, 0x200, v0
-; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:316
-; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:312
-; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:280
-; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:268
-; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:248
-; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:240
-; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:232
-; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:228
-; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:224
-; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:216
-; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:196
; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:760
; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:756
; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:752
@@ -6519,9 +6509,8 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg,
; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:716
; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:712
; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:708
-; GFX9-MUBUF-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen
-; GFX9-MUBUF-NEXT: s_mov_b32 s2, s5
; GFX9-MUBUF-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen
+; GFX9-MUBUF-NEXT: s_mov_b32 s2, s5
; GFX9-MUBUF-NEXT: s_waitcnt vmcnt(0)
; GFX9-MUBUF-NEXT: v_add_f32_e32 v0, v0, v1
; GFX9-MUBUF-NEXT: ; return to shader part epilog
@@ -6533,10 +6522,10 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg,
; GFX10_W32-MUBUF-NEXT: s_mov_b32 s10, -1
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v2, 0x3f3d349e
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f5f2ee2
; GFX10_W32-MUBUF-NEXT: s_mov_b32 s11, 0x31c16000
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f523be1
; GFX10_W32-MUBUF-NEXT: s_add_u32 s8, s8, s5
-; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f5f2ee2
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbefcd8a3
; GFX10_W32-MUBUF-NEXT: s_addc_u32 s9, s9, 0
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f638e37
@@ -6547,6 +6536,8 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg,
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:312
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:308
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:304
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:300
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:296
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:292
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:288
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:284
@@ -6568,6 +6559,7 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg,
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3eae29dc
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3efcd89c
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v13, 0x3efcd89f
+; GFX10_W32-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:248
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:244
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:240
@@ -6575,25 +6567,24 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg,
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:232
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v10, 0xbf20e7f5
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
-; GFX10_W32-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf3d349e
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; GFX10_W32-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:228
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:224
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:220
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:216
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:212
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:208
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:204
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4
-; GFX10_W32-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
-; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5
-; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499
-; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:300
-; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:296
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:200
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:196
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499
+; GFX10_W32-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0
+; GFX10_W32-MUBUF-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:832
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:828
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:824
@@ -6601,8 +6592,9 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg,
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:816
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:812
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c
-; GFX10_W32-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:808
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:804
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:800
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:796
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:792
@@ -6610,8 +6602,6 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg,
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v19, off, s[8:11], 0 offset:784
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:780
; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2
-; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:808
-; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:804
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:776
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:772
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:768
@@ -6630,8 +6620,6 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg,
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:716
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:712
; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:708
-; GFX10_W32-MUBUF-NEXT: s_clause 0x1
-; GFX10_W32-MUBUF-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen
; GFX10_W32-MUBUF-NEXT: buffer_load_dword v1, v6, s[8:11], 0 offen
; GFX10_W32-MUBUF-NEXT: s_mov_b32 s2, s5
; GFX10_W32-MUBUF-NEXT: s_waitcnt vmcnt(0)
@@ -6645,10 +6633,10 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg,
; GFX10_W64-MUBUF-NEXT: s_mov_b32 s10, -1
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v2, 0x3f3d349e
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f5f2ee2
; GFX10_W64-MUBUF-NEXT: s_mov_b32 s11, 0x31e16000
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f523be1
; GFX10_W64-MUBUF-NEXT: s_add_u32 s8, s8, s5
-; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f5f2ee2
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbefcd8a3
; GFX10_W64-MUBUF-NEXT: s_addc_u32 s9, s9, 0
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f638e37
@@ -6659,6 +6647,8 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg,
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:312
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:308
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:304
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:300
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:296
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:292
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:288
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:284
@@ -6680,6 +6670,7 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg,
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3eae29dc
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3efcd89c
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v13, 0x3efcd89f
+; GFX10_W64-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:248
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:244
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:240
@@ -6687,25 +6678,24 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg,
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:232
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v10, 0xbf20e7f5
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
-; GFX10_W64-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf3d349e
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; GFX10_W64-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:228
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:224
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:220
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:216
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:212
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:208
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:204
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4
-; GFX10_W64-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
-; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5
-; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499
-; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:300
-; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:296
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:200
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:196
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499
+; GFX10_W64-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0
+; GFX10_W64-MUBUF-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:832
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:828
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:824
@@ -6713,8 +6703,9 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg,
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:816
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:812
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c
-; GFX10_W64-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:808
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:804
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:800
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:796
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:792
@@ -6722,8 +6713,6 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg,
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v19, off, s[8:11], 0 offset:784
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:780
; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2
-; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:808
-; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:804
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:776
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:772
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:768
@@ -6742,8 +6731,6 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg,
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:716
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:712
; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:708
-; GFX10_W64-MUBUF-NEXT: s_clause 0x1
-; GFX10_W64-MUBUF-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen
; GFX10_W64-MUBUF-NEXT: buffer_load_dword v1, v6, s[8:11], 0 offen
; GFX10_W64-MUBUF-NEXT: s_mov_b32 s2, s5
; GFX10_W64-MUBUF-NEXT: s_waitcnt vmcnt(0)
@@ -7137,10 +7124,10 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg,
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v0, 0xbeae29dc :: v_dual_mov_b32 v23, v21
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v6, 0x3f5f2ee2
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f638e37
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v24, 0xbf523be3
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:320
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, v8 :: v_dual_mov_b32 v3, v7
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v4, v6
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v24, 0xbf523be3
; GFX11-FLATSCR-NEXT: s_clause 0x1
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[5:8], off offset:304
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:288
@@ -7149,31 +7136,29 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg,
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v13, 0x3eae29dc :: v_dual_mov_b32 v34, v5
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v15, 0x3e319356 :: v_dual_mov_b32 v36, v6
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v25, 0x3f20e7f5 :: v_dual_mov_b32 v26, v17
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, 0xb702e758
-; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, 0xb7043519 :: v_dual_mov_b32 v29, v15
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v10, 0xb7043519
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v11, 0xbe31934f
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v12, 0xbe319356 :: v_dual_mov_b32 v31, v19
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf20e7f5
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3eae29d8
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v16, 0x3e31934f
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v25, 0x3f20e7f5 :: v_dual_mov_b32 v26, v17
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v20, 0x3efcd89c :: v_dual_mov_b32 v29, v15
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v30, v13
; GFX11-FLATSCR-NEXT: s_clause 0x1
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[0:3], off offset:272
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[9:12], off offset:256
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4
-; GFX11-FLATSCR-NEXT: s_clause 0x1
-; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[21:24], off offset:208
-; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:192
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v20, 0x3efcd89c
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v33, v22
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, v18
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, v2 :: v_dual_mov_b32 v11, v0
-; GFX11-FLATSCR-NEXT: s_clause 0x1
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v33, v22
+; GFX11-FLATSCR-NEXT: s_clause 0x3
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[13:16], off offset:240
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[17:20], off offset:224
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[21:24], off offset:208
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:192
; GFX11-FLATSCR-NEXT: scratch_load_b32 v14, v37, off
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v32, 0x3f3d349c :: v_dual_mov_b32 v5, v15
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, v0
@@ -7188,7 +7173,8 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg,
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v24, v19 :: v_dual_mov_b32 v35, v21
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:784
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v31, 0xbf5f2ee2 :: v_dual_mov_b32 v32, v6
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v6, v13 :: v_dual_mov_b32 v19, v2
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v6, v13
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v19, v2
; GFX11-FLATSCR-NEXT: s_clause 0x4
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[33:36], off offset:768
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:752
diff --git a/llvm/test/CodeGen/AMDGPU/soft-clause-exceeds-register-budget.ll b/llvm/test/CodeGen/AMDGPU/soft-clause-exceeds-register-budget.ll
index c90d7887f2ff6..71e4755b58bf2 100644
--- a/llvm/test/CodeGen/AMDGPU/soft-clause-exceeds-register-budget.ll
+++ b/llvm/test/CodeGen/AMDGPU/soft-clause-exceeds-register-budget.ll
@@ -3,6 +3,9 @@
define protected amdgpu_kernel void @excess_soft_clause_reg_pressure(ptr addrspace(4) %wei_ptr, ptr addrspace(1) %out_ptr, ptr addrspace(1) %in) {
; CHECK-LABEL: excess_soft_clause_reg_pressure:
; CHECK: BB0_1: ; %for.cond28.preheader
+; CHECK: s_load_dwordx16
+; CHECK-NEXT: s_load_dwordx16
+
; CHECK: global_load_dword
; CHECK-NEXT: global_load_dword
; CHECK-NEXT: global_load_dword
@@ -15,23 +18,11 @@ define protected amdgpu_kernel void @excess_soft_clause_reg_pressure(ptr addrspa
; CHECK-NOT: v_readlane_b32
; CHECK: s_load_dwordx16
-; CHECK-NEXT: s_load_dwordx16
-
-; CHECK-NOT: v_writelane_b32
-; CHECK-NOT: v_readlane_b32
-
; CHECK: s_load_dwordx16
-; CHECK-NEXT: s_load_dwordx16
-
-; CHECK-NOT: v_writelane_b32
-; CHECK-NOT: v_readlane_b32
-
; CHECK: s_load_dwordx16
-; CHECK-NEXT: s_load_dwordx16
; CHECK-NOT: v_writelane_b32
; CHECK-NOT: v_readlane_b32
-
entry:
%i = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%i2 = load i64, ptr addrspace(4) %i, align 8
diff --git a/llvm/test/CodeGen/AMDGPU/spill-agpr.ll b/llvm/test/CodeGen/AMDGPU/spill-agpr.ll
index 1a0f75e048cb9..da48af100d27b 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-agpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/spill-agpr.ll
@@ -448,13 +448,13 @@ define amdgpu_kernel void @max_6regs_used_8a(ptr addrspace(1) %arg) #4 {
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_nop 0
; GFX90A-NEXT: v_mfma_f32_4x4x1f32 a[0:3], v2, v2, a[0:3]
+; GFX90A-NEXT: s_nop 4
+; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[2:3]
; GFX90A-NEXT: buffer_load_dword v2, off, s[8:11], 0 ; 4-byte Folded Reload
; GFX90A-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:4 ; 4-byte Folded Reload
; GFX90A-NEXT: buffer_load_dword v4, off, s[8:11], 0 offset:8 ; 4-byte Folded Reload
; GFX90A-NEXT: buffer_load_dword v5, off, s[8:11], 0 offset:12 ; 4-byte Folded Reload
-; GFX90A-NEXT: s_nop 0
-; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[2:3]
-; GFX90A-NEXT: s_waitcnt vmcnt(1)
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
diff --git a/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll b/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll
index b5474b8974b29..50056b62b3397 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll
+++ b/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll
@@ -10314,8 +10314,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
; GFX9-FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s9, 0
; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2050
; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, 16
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v5, s[38:39] offset:144
-; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(1)
+; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[38:39] offset:224
; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2040
@@ -10328,10 +10327,12 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[19:22], v5, s[38:39] offset:192
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[15:18], v5, s[38:39] offset:176
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[38:39] offset:160
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v5, s[38:39] offset:144
; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2020
-; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(1)
; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2070
+; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(1)
; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v5, s[38:39] offset:128
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[38:39] offset:112
@@ -10343,9 +10344,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[38:39] offset:96
; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x20b0
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[11:14], v5, s[38:39] offset:32
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v5, s[38:39] offset:16
-; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(2)
+; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[38:39] offset:80
; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x20a0
@@ -10359,7 +10358,10 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2080
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[11:14], v5, s[38:39] offset:32
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v5, s[38:39] offset:16
; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2060
+; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[7:10], v5, s[38:39]
; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v6, 1
@@ -10466,13 +10468,13 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[6:9], s[36:37] offset:224
; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[6:9], off, s0 ; 16-byte Folded Reload
; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2020
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2070
-; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(1)
+; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[6:9], s[36:37] offset:208
; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[19:22], s[36:37] offset:192
; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[15:18], s[36:37] offset:176
-; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(3)
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2070
+; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[36:37] offset:160
; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2010
diff --git a/llvm/test/CodeGen/AMDGPU/stack-realign.ll b/llvm/test/CodeGen/AMDGPU/stack-realign.ll
index 802de8037cf6b..9cb22dad86b88 100644
--- a/llvm/test/CodeGen/AMDGPU/stack-realign.ll
+++ b/llvm/test/CodeGen/AMDGPU/stack-realign.ll
@@ -295,9 +295,9 @@ define void @func_call_align1024_bp_gets_vgpr_spill(<32 x i32> %a, i32 %b) #0 {
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:1028 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, s[18:19]
; GCN-NEXT: v_writelane_b32 v40, s16, 2
+; GCN-NEXT: v_mov_b32_e32 v32, 0
; GCN-NEXT: v_writelane_b32 v40, s34, 3
; GCN-NEXT: s_mov_b32 s34, s32
-; GCN-NEXT: v_mov_b32_e32 v32, 0
; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s33 offset:1024
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s34
diff --git a/llvm/test/CodeGen/Thumb2/mve-vpt-block-fold-vcmp.mir b/llvm/test/CodeGen/Thumb2/mve-vpt-block-fold-vcmp.mir
index a1771f9356014..ee2e58f2a6cc1 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vpt-block-fold-vcmp.mir
+++ b/llvm/test/CodeGen/Thumb2/mve-vpt-block-fold-vcmp.mir
@@ -98,29 +98,28 @@ body: |
; CHECK-LABEL: name: foo
; CHECK: liveins: $q0, $r0, $r1, $r2, $lr
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: $sp = frame-setup t2STMDB_UPD $sp, 14 /* CC::al */, $noreg, killed $r7, killed $lr
- ; CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_offset 8
- ; CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $lr, -4
- ; CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $r7, -8
- ; CHECK-NEXT: $r7 = frame-setup tMOVr killed $sp, 14 /* CC::al */, $noreg
- ; CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_register $r7
- ; CHECK-NEXT: renamable $r12 = t2LDRi12 $r7, 16, 14 /* CC::al */, $noreg :: (load (s32) from %fixed-stack.2)
- ; CHECK-NEXT: renamable $lr = t2LDRi12 $r7, 12, 14 /* CC::al */, $noreg :: (load (s32) from %fixed-stack.1)
- ; CHECK-NEXT: renamable $r3 = t2LDRi12 $r7, 8, 14 /* CC::al */, $noreg :: (load (s32) from %fixed-stack.0)
- ; CHECK-NEXT: BUNDLE implicit-def $vpr, implicit-def dead $q0, implicit $q0, implicit $zr, implicit killed $r0, implicit killed $r3, implicit killed $r1, implicit killed $lr :: (load (s128) from %ir.src, align 4), (store (s128) into %ir.dest, align 4), (load (s128) from %ir.src2, align 4), (store (s128) into %ir.dest2, align 4) {
- ; CHECK-NEXT: MVE_VPTv4f32r 1, renamable $q0, $zr, 10, implicit-def $vpr
- ; CHECK-NEXT: renamable $q0 = MVE_VLDRWU32 killed renamable $r0, 0, 1, internal renamable $vpr, $noreg :: (load (s128) from %ir.src, align 4)
- ; CHECK-NEXT: MVE_VSTRWU32 internal killed renamable $q0, killed renamable $r3, 0, 1, internal renamable $vpr, $noreg :: (store (s128) into %ir.dest, align 4)
- ; CHECK-NEXT: renamable $q0 = MVE_VLDRWU32 killed renamable $r1, 0, 1, internal renamable $vpr, $noreg :: (load (s128) from %ir.src2, align 4)
- ; CHECK-NEXT: MVE_VSTRWU32 internal killed renamable $q0, killed renamable $lr, 0, 1, internal renamable $vpr, $noreg :: (store (s128) into %ir.dest2, align 4)
- ; CHECK-NEXT: }
- ; CHECK-NEXT: BUNDLE implicit-def $q0, implicit killed $vpr, implicit killed $r2, implicit killed $r12 :: (load (s128) from %ir.src3, align 4), (store (s128) into %ir.dest3, align 4) {
- ; CHECK-NEXT: MVE_VPST 4, implicit $vpr
- ; CHECK-NEXT: renamable $q0 = MVE_VLDRWU32 killed renamable $r2, 0, 1, renamable $vpr, $noreg :: (load (s128) from %ir.src3, align 4)
- ; CHECK-NEXT: MVE_VSTRWU32 internal renamable $q0, killed renamable
$r12, 0, 1, killed renamable $vpr, $noreg :: (store (s128) into %ir.dest3, align 4) - ; CHECK-NEXT: } - ; CHECK-NEXT: $sp = t2LDMIA_RET $sp, 14 /* CC::al */, $noreg, def $r7, def $pc, implicit $q0 + ; CHECK: $sp = frame-setup t2STMDB_UPD $sp, 14 /* CC::al */, $noreg, killed $r7, killed $lr + ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8 + ; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4 + ; CHECK: frame-setup CFI_INSTRUCTION offset $r7, -8 + ; CHECK: $r7 = frame-setup tMOVr killed $sp, 14 /* CC::al */, $noreg + ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_register $r7 + ; CHECK: renamable $r12 = t2LDRi12 $r7, 16, 14 /* CC::al */, $noreg :: (load (s32) from %fixed-stack.2) + ; CHECK: renamable $lr = t2LDRi12 $r7, 12, 14 /* CC::al */, $noreg :: (load (s32) from %fixed-stack.1) + ; CHECK: renamable $r3 = t2LDRi12 $r7, 8, 14 /* CC::al */, $noreg :: (load (s32) from %fixed-stack.0) + ; CHECK: BUNDLE implicit-def $vpr, implicit-def dead $q0, implicit $q0, implicit $zr, implicit killed $r0, implicit killed $r3, implicit killed $r1, implicit killed $lr { + ; CHECK: MVE_VPTv4f32r 1, renamable $q0, $zr, 10, implicit-def $vpr + ; CHECK: renamable $q0 = MVE_VLDRWU32 killed renamable $r0, 0, 1, internal renamable $vpr, $noreg :: (load (s128) from %ir.src, align 4) + ; CHECK: MVE_VSTRWU32 internal killed renamable $q0, killed renamable $r3, 0, 1, internal renamable $vpr, $noreg :: (store (s128) into %ir.dest, align 4) + ; CHECK: renamable $q0 = MVE_VLDRWU32 killed renamable $r1, 0, 1, internal renamable $vpr, $noreg :: (load (s128) from %ir.src2, align 4) + ; CHECK: MVE_VSTRWU32 internal killed renamable $q0, killed renamable $lr, 0, 1, internal renamable $vpr, $noreg :: (store (s128) into %ir.dest2, align 4) + ; CHECK: } + ; CHECK: BUNDLE implicit-def $q0, implicit killed $vpr, implicit killed $r2, implicit killed $r12 { + ; CHECK: MVE_VPST 4, implicit $vpr + ; CHECK: renamable $q0 = MVE_VLDRWU32 killed renamable $r2, 0, 1, renamable $vpr, $noreg :: (load (s128) from %ir.src3, align 4) + ; CHECK: MVE_VSTRWU32 internal renamable $q0, killed renamable $r12, 0, 1, killed renamable $vpr, $noreg :: (store (s128) into %ir.dest3, align 4) + ; CHECK: } + ; CHECK: $sp = t2LDMIA_RET $sp, 14 /* CC::al */, $noreg, def $r7, def $pc, implicit $q0 $sp = frame-setup t2STMDB_UPD $sp, 14, $noreg, killed $r7, killed $lr frame-setup CFI_INSTRUCTION def_cfa_offset 8 frame-setup CFI_INSTRUCTION offset $lr, -4 From fb49adb6ea8f5b476b42a118aac758006f0a4152 Mon Sep 17 00:00:00 2001 From: Jay Foad <jay.foad@amd.com> Date: Wed, 5 Nov 2025 10:17:52 +0000 Subject: [PATCH 296/313] [AMDGPU] Another test for missing S_WAIT_XCNT (#166154) --- llvm/test/CodeGen/AMDGPU/wait-xcnt.mir | 45 ++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/llvm/test/CodeGen/AMDGPU/wait-xcnt.mir b/llvm/test/CodeGen/AMDGPU/wait-xcnt.mir index a1381ecad81e2..f964480dcc633 100644 --- a/llvm/test/CodeGen/AMDGPU/wait-xcnt.mir +++ b/llvm/test/CodeGen/AMDGPU/wait-xcnt.mir @@ -1069,6 +1069,51 @@ body: | $sgpr0 = S_MOV_B32 $sgpr0 ... +# FIXME: Missing S_WAIT_XCNT before overwriting vgpr0. 
+--- +name: mixed_pending_events +tracksRegLiveness: true +machineFunctionInfo: + isEntryFunction: true +body: | + ; GCN-LABEL: name: mixed_pending_events + ; GCN: bb.0: + ; GCN-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; GCN-NEXT: liveins: $vgpr0_vgpr1, $sgpr0_sgpr1, $scc + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0 + ; GCN-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.1: + ; GCN-NEXT: successors: %bb.2(0x80000000) + ; GCN-NEXT: liveins: $vgpr0_vgpr1, $sgpr2 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $vgpr2 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec + ; GCN-NEXT: $vgpr3 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.2: + ; GCN-NEXT: liveins: $sgpr2, $vgpr2 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: S_WAIT_LOADCNT 1 + ; GCN-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr2, implicit $exec + ; GCN-NEXT: S_WAIT_KMCNT 0 + ; GCN-NEXT: $sgpr2 = S_MOV_B32 $sgpr2 + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 0, implicit $exec + bb.0: + liveins: $vgpr0_vgpr1, $sgpr0_sgpr1, $scc + $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0 + S_CBRANCH_SCC1 %bb.2, implicit $scc + bb.1: + liveins: $vgpr0_vgpr1, $sgpr2 + $vgpr2 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec + $vgpr3 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec + bb.2: + liveins: $sgpr2, $vgpr2 + $vgpr2 = V_MOV_B32_e32 $vgpr2, implicit $exec + $sgpr2 = S_MOV_B32 $sgpr2 + $vgpr0 = V_MOV_B32_e32 0, implicit $exec +... + --- name: pending_vmem_event_between_block tracksRegLiveness: true From 28e024fb9d42fd44e0ad70638da0b6a12ba544dc Mon Sep 17 00:00:00 2001 From: Simon Pilgrim <llvm-dev@redking.me.uk> Date: Wed, 5 Nov 2025 10:33:30 +0000 Subject: [PATCH 297/313] [X86] narrowBitOpRMW - allow additional uses of the BTC/R/S result (#166376) If there are additional uses of the bit twiddled value as well as the rmw store, we can replace them with a (re)loaded copy of the full width integer value after the store. There's some memory op chain handling to handle here - the additional (re)load is chained after the new store and then any dependencies of the original store are chained after the (re)load. --- llvm/lib/Target/X86/X86ISelLowering.cpp | 17 +- llvm/test/CodeGen/X86/bittest-big-integer.ll | 401 ++++--------------- 2 files changed, 87 insertions(+), 331 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 06b8f7614bffd..4d44227b3ecd4 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -53370,8 +53370,7 @@ static SDValue narrowBitOpRMW(StoreSDNode *St, const SDLoc &DL, // // BitInsert: (X & ~(1 << ShAmt)) | (InsertBit << ShAmt) SDValue SrcVal, InsertBit, ShAmt; - if (!StoredVal.hasOneUse() || - !(sd_match(StoredVal, m_And(m_Value(SrcVal), + if (!(sd_match(StoredVal, m_And(m_Value(SrcVal), m_Not(m_Shl(m_One(), m_Value(ShAmt))))) || sd_match(StoredVal, m_Or(m_Value(SrcVal), m_Shl(m_One(), m_Value(ShAmt)))) || @@ -53442,8 +53441,18 @@ static SDValue narrowBitOpRMW(StoreSDNode *St, const SDLoc &DL, Res = DAG.getNode(StoredVal.getOpcode(), DL, MVT::i32, X, Mask); } - return DAG.getStore(St->getChain(), DL, Res, NewPtr, St->getPointerInfo(), - Align(), St->getMemOperand()->getFlags()); + SDValue NewStore = + DAG.getStore(St->getChain(), DL, Res, NewPtr, St->getPointerInfo(), + Align(), St->getMemOperand()->getFlags()); + + // If there are other uses of StoredVal, replace with a new load of the + // whole (updated) value. 
+ if (!StoredVal.hasOneUse()) { + SDValue NewLoad = + DAG.getLoad(VT, DL, NewStore, Ld->getBasePtr(), Ld->getMemOperand()); + DAG.ReplaceAllUsesWith(StoredVal, NewLoad); + } + return NewStore; } static SDValue combineStore(SDNode *N, SelectionDAG &DAG, diff --git a/llvm/test/CodeGen/X86/bittest-big-integer.ll b/llvm/test/CodeGen/X86/bittest-big-integer.ll index bcb14fd25b975..32d225273a6e1 100644 --- a/llvm/test/CodeGen/X86/bittest-big-integer.ll +++ b/llvm/test/CodeGen/X86/bittest-big-integer.ll @@ -906,115 +906,46 @@ define i1 @test_ne_i4096(ptr %word, i32 %position) nounwind { define i1 @complement_cmpz_i128(ptr %word, i32 %position) nounwind { ; X86-LABEL: complement_cmpz_i128: ; X86: # %bb.0: -; X86-NEXT: pushl %ebp -; X86-NEXT: movl %esp, %ebp -; X86-NEXT: pushl %ebx -; X86-NEXT: pushl %edi -; X86-NEXT: pushl %esi -; X86-NEXT: andl $-16, %esp -; X86-NEXT: subl $64, %esp -; X86-NEXT: movzbl 12(%ebp), %ecx -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $1, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: shrb $3, %al -; X86-NEXT: andb $12, %al -; X86-NEXT: negb %al -; X86-NEXT: movsbl %al, %esi -; X86-NEXT: movl 36(%esp,%esi), %eax -; X86-NEXT: movl 40(%esp,%esi), %edi -; X86-NEXT: movl %edi, %edx -; X86-NEXT: shldl %cl, %eax, %edx -; X86-NEXT: movl 32(%esp,%esi), %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 44(%esp,%esi), %esi -; X86-NEXT: shldl %cl, %edi, %esi -; X86-NEXT: movl %ebx, %edi -; X86-NEXT: shll %cl, %edi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: shldl %cl, %ebx, %eax -; X86-NEXT: movl 8(%ebp), %ecx -; X86-NEXT: xorl 12(%ecx), %esi -; X86-NEXT: xorl 8(%ecx), %edx -; X86-NEXT: xorl 4(%ecx), %eax -; X86-NEXT: xorl (%ecx), %edi -; X86-NEXT: movl %edx, 8(%ecx) -; X86-NEXT: movl %esi, 12(%ecx) -; X86-NEXT: movl %edi, (%ecx) -; X86-NEXT: movl %eax, 4(%ecx) -; X86-NEXT: orl %esi, %eax -; X86-NEXT: orl %edx, %edi -; X86-NEXT: orl %eax, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl $1, %edx +; X86-NEXT: shll %cl, %edx +; X86-NEXT: andl $96, %ecx +; X86-NEXT: shrl $3, %ecx +; X86-NEXT: xorl %edx, (%eax,%ecx) +; X86-NEXT: movl (%eax), %ecx +; X86-NEXT: movl 4(%eax), %edx +; X86-NEXT: orl 12(%eax), %edx +; X86-NEXT: orl 8(%eax), %ecx +; X86-NEXT: orl %edx, %ecx ; X86-NEXT: setne %al -; X86-NEXT: leal -12(%ebp), %esp -; X86-NEXT: popl %esi -; X86-NEXT: popl %edi -; X86-NEXT: popl %ebx -; X86-NEXT: popl %ebp ; X86-NEXT: retl ; ; SSE-LABEL: complement_cmpz_i128: ; SSE: # %bb.0: ; SSE-NEXT: movl %esi, %ecx ; SSE-NEXT: movl $1, %eax -; SSE-NEXT: xorl %edx, %edx -; SSE-NEXT: shldq %cl, %rax, %rdx -; SSE-NEXT: shlq %cl, %rax -; SSE-NEXT: xorl %esi, %esi -; SSE-NEXT: testb $64, %cl -; SSE-NEXT: cmovneq %rax, %rdx -; SSE-NEXT: cmovneq %rsi, %rax -; SSE-NEXT: xorq 8(%rdi), %rdx -; SSE-NEXT: xorq (%rdi), %rax -; SSE-NEXT: movq %rax, (%rdi) -; SSE-NEXT: movq %rdx, 8(%rdi) -; SSE-NEXT: orq %rdx, %rax +; SSE-NEXT: shll %cl, %eax +; SSE-NEXT: andl $96, %ecx +; SSE-NEXT: shrl $3, %ecx +; SSE-NEXT: xorl %eax, (%rdi,%rcx) +; SSE-NEXT: movq (%rdi), %rax +; SSE-NEXT: orq 8(%rdi), %rax ; SSE-NEXT: setne %al ; SSE-NEXT: retq ; -; AVX2-LABEL: complement_cmpz_i128: -; AVX2: # %bb.0: -; AVX2-NEXT: movl %esi, %ecx -; 
AVX2-NEXT: movl $1, %eax -; AVX2-NEXT: xorl %edx, %edx -; AVX2-NEXT: shldq %cl, %rax, %rdx -; AVX2-NEXT: xorl %esi, %esi -; AVX2-NEXT: shlxq %rcx, %rax, %rax -; AVX2-NEXT: testb $64, %cl -; AVX2-NEXT: cmovneq %rax, %rdx -; AVX2-NEXT: cmovneq %rsi, %rax -; AVX2-NEXT: xorq 8(%rdi), %rdx -; AVX2-NEXT: xorq (%rdi), %rax -; AVX2-NEXT: movq %rax, (%rdi) -; AVX2-NEXT: movq %rdx, 8(%rdi) -; AVX2-NEXT: orq %rdx, %rax -; AVX2-NEXT: setne %al -; AVX2-NEXT: retq -; -; AVX512-LABEL: complement_cmpz_i128: -; AVX512: # %bb.0: -; AVX512-NEXT: movl %esi, %ecx -; AVX512-NEXT: xorl %eax, %eax -; AVX512-NEXT: movl $1, %edx -; AVX512-NEXT: xorl %esi, %esi -; AVX512-NEXT: shldq %cl, %rdx, %rsi -; AVX512-NEXT: shlxq %rcx, %rdx, %rdx -; AVX512-NEXT: testb $64, %cl -; AVX512-NEXT: cmovneq %rdx, %rsi -; AVX512-NEXT: cmovneq %rax, %rdx -; AVX512-NEXT: xorq 8(%rdi), %rsi -; AVX512-NEXT: xorq (%rdi), %rdx -; AVX512-NEXT: movq %rdx, (%rdi) -; AVX512-NEXT: movq %rsi, 8(%rdi) -; AVX512-NEXT: orq %rsi, %rdx -; AVX512-NEXT: setne %al -; AVX512-NEXT: retq +; AVX-LABEL: complement_cmpz_i128: +; AVX: # %bb.0: +; AVX-NEXT: # kill: def $esi killed $esi def $rsi +; AVX-NEXT: movl $1, %eax +; AVX-NEXT: shlxl %esi, %eax, %eax +; AVX-NEXT: andl $96, %esi +; AVX-NEXT: shrl $3, %esi +; AVX-NEXT: xorl %eax, (%rdi,%rsi) +; AVX-NEXT: movq (%rdi), %rax +; AVX-NEXT: orq 8(%rdi), %rax +; AVX-NEXT: setne %al +; AVX-NEXT: retq %rem = and i32 %position, 127 %ofs = zext nneg i32 %rem to i128 %bit = shl nuw i128 1, %ofs @@ -1088,247 +1019,63 @@ define i32 @chain_reset_i256(ptr %p0, ptr %p1, ptr %p2, i32 %position) nounwind ; X86-LABEL: chain_reset_i256: ; X86: # %bb.0: ; X86-NEXT: pushl %ebp -; X86-NEXT: movl %esp, %ebp ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: andl $-16, %esp -; X86-NEXT: subl $112, %esp -; X86-NEXT: movzbl 20(%ebp), %ecx -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $1, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: shrb $3, %al -; X86-NEXT: andb $28, %al -; X86-NEXT: negb %al -; X86-NEXT: movsbl %al, %eax -; X86-NEXT: movl 72(%esp,%eax), %edx -; X86-NEXT: movl 76(%esp,%eax), %edi -; X86-NEXT: movl %edi, %esi -; X86-NEXT: shldl %cl, %edx, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 68(%esp,%eax), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %esi, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 80(%esp,%eax), %edx -; X86-NEXT: movl 84(%esp,%eax), %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edi, %edx -; X86-NEXT: movl 64(%esp,%eax), %edi -; X86-NEXT: movl 88(%esp,%eax), %esi -; X86-NEXT: movl 92(%esp,%eax), %eax -; X86-NEXT: shldl %cl, %esi, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: shldl %cl, %ebx, %esi -; X86-NEXT: 
shldl %cl, %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: shll %cl, %edi -; X86-NEXT: movl %edi, %ecx -; X86-NEXT: movl %esi, %ebx -; X86-NEXT: notl %ebx -; X86-NEXT: notl %eax -; X86-NEXT: notl %edx -; X86-NEXT: notl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: notl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: notl %edi -; X86-NEXT: notl %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: notl %esi -; X86-NEXT: movl 8(%ebp), %ecx -; X86-NEXT: andl 12(%ecx), %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: andl 8(%ecx), %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: andl 20(%ecx), %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl 16(%ecx), %edx -; X86-NEXT: andl 28(%ecx), %eax -; X86-NEXT: andl 24(%ecx), %ebx -; X86-NEXT: andl 4(%ecx), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: andl (%ecx), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %ebx, 24(%ecx) -; X86-NEXT: movl %eax, 28(%ecx) -; X86-NEXT: movl %edx, 16(%ecx) -; X86-NEXT: movl %edi, 20(%ecx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: movl %edi, 8(%ecx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: movl %edi, 12(%ecx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: movl %esi, (%ecx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: movl %esi, 4(%ecx) -; X86-NEXT: orl %edi, %eax -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT: orl %eax, %esi -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X86-NEXT: movl 12(%ebp), %eax -; X86-NEXT: movl (%eax), %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: movl %edi, (%eax) -; X86-NEXT: orl %edi, %edx -; X86-NEXT: orl %ebx, %edx -; X86-NEXT: orl %esi, %edx -; X86-NEXT: movl 16(%ebp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl $-2, %edi +; X86-NEXT: roll %cl, %edi +; X86-NEXT: shrl $3, %ecx +; X86-NEXT: andl $28, %ecx +; X86-NEXT: andl %edi, (%esi,%ecx) +; X86-NEXT: movl 8(%esi), %ebx +; X86-NEXT: movl (%esi), %edi +; X86-NEXT: movl 4(%esi), %ecx +; X86-NEXT: movl 12(%esi), %ebp +; X86-NEXT: orl 28(%esi), %ebp +; X86-NEXT: orl 20(%esi), %ecx +; X86-NEXT: orl %ebp, %ecx +; X86-NEXT: orl 24(%esi), %ebx +; X86-NEXT: movl 16(%esi), %ebp +; X86-NEXT: orl %edi, %ebp +; X86-NEXT: orl %ebx, %ebp +; X86-NEXT: movl (%edx), %esi +; X86-NEXT: movl %edi, (%edx) ; X86-NEXT: movl (%eax), %eax +; X86-NEXT: orl %ecx, %ebp ; X86-NEXT: jne .LBB23_2 ; X86-NEXT: # %bb.1: -; X86-NEXT: addl %ecx, %eax +; X86-NEXT: addl %esi, %eax ; X86-NEXT: .LBB23_2: -; X86-NEXT: leal -12(%ebp), %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx ; X86-NEXT: popl %ebp ; X86-NEXT: retl ; -; SSE-LABEL: chain_reset_i256: -; SSE: # %bb.0: -; SSE-NEXT: xorps %xmm0, %xmm0 -; SSE-NEXT: movups 
%xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movq $1, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movl %ecx, %eax -; SSE-NEXT: shrb $3, %al -; SSE-NEXT: andb $24, %al -; SSE-NEXT: negb %al -; SSE-NEXT: movsbq %al, %r10 -; SSE-NEXT: movq -24(%rsp,%r10), %r8 -; SSE-NEXT: movq -16(%rsp,%r10), %rax -; SSE-NEXT: shldq %cl, %r8, %rax -; SSE-NEXT: movq -32(%rsp,%r10), %r9 -; SSE-NEXT: shldq %cl, %r9, %r8 -; SSE-NEXT: movq -40(%rsp,%r10), %r10 -; SSE-NEXT: shldq %cl, %r10, %r9 -; SSE-NEXT: # kill: def $cl killed $cl killed $ecx -; SSE-NEXT: shlq %cl, %r10 -; SSE-NEXT: notq %r8 -; SSE-NEXT: notq %rax -; SSE-NEXT: notq %r10 -; SSE-NEXT: notq %r9 -; SSE-NEXT: andq 24(%rdi), %rax -; SSE-NEXT: andq 16(%rdi), %r8 -; SSE-NEXT: andq 8(%rdi), %r9 -; SSE-NEXT: andq (%rdi), %r10 -; SSE-NEXT: movq %r8, 16(%rdi) -; SSE-NEXT: movq %rax, 24(%rdi) -; SSE-NEXT: movq %r10, (%rdi) -; SSE-NEXT: movq %r9, 8(%rdi) -; SSE-NEXT: orq %rax, %r9 -; SSE-NEXT: orq %r10, %r8 -; SSE-NEXT: movl (%rsi), %eax -; SSE-NEXT: movl %r10d, (%rsi) -; SSE-NEXT: movl (%rdx), %ecx -; SSE-NEXT: addl %ecx, %eax -; SSE-NEXT: orq %r9, %r8 -; SSE-NEXT: cmovnel %ecx, %eax -; SSE-NEXT: retq -; -; AVX2-LABEL: chain_reset_i256: -; AVX2: # %bb.0: -; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx -; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovss {{.*#+}} xmm0 = [1,0,0,0] -; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: movl %ecx, %eax -; AVX2-NEXT: shrb $3, %al -; AVX2-NEXT: andb $24, %al -; AVX2-NEXT: negb %al -; AVX2-NEXT: movsbq %al, %rax -; AVX2-NEXT: movq -32(%rsp,%rax), %r8 -; AVX2-NEXT: movq -24(%rsp,%rax), %r9 -; AVX2-NEXT: movq %r9, %r10 -; AVX2-NEXT: shldq %cl, %r8, %r10 -; AVX2-NEXT: movq -40(%rsp,%rax), %r11 -; AVX2-NEXT: movq -16(%rsp,%rax), %rax -; AVX2-NEXT: shldq %cl, %r9, %rax -; AVX2-NEXT: shldq %cl, %r11, %r8 -; AVX2-NEXT: andnq 24(%rdi), %rax, %rax -; AVX2-NEXT: andnq 16(%rdi), %r10, %r9 -; AVX2-NEXT: andnq 8(%rdi), %r8, %r8 -; AVX2-NEXT: shlxq %rcx, %r11, %rcx -; AVX2-NEXT: andnq (%rdi), %rcx, %rcx -; AVX2-NEXT: movq %r9, 16(%rdi) -; AVX2-NEXT: movq %rax, 24(%rdi) -; AVX2-NEXT: movq %rcx, (%rdi) -; AVX2-NEXT: movq %r8, 8(%rdi) -; AVX2-NEXT: orq %rax, %r8 -; AVX2-NEXT: orq %rcx, %r9 -; AVX2-NEXT: movl (%rsi), %eax -; AVX2-NEXT: movl %ecx, (%rsi) -; AVX2-NEXT: movl (%rdx), %ecx -; AVX2-NEXT: addl %ecx, %eax -; AVX2-NEXT: orq %r8, %r9 -; AVX2-NEXT: cmovnel %ecx, %eax -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: chain_reset_i256: -; AVX512: # %bb.0: -; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [1,0,0,0] -; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: # kill: def $ecx killed $ecx def $rcx -; AVX512-NEXT: movl %ecx, %eax -; AVX512-NEXT: shrb $3, %al -; AVX512-NEXT: andb $24, %al -; AVX512-NEXT: negb %al -; AVX512-NEXT: movsbq %al, %rax -; AVX512-NEXT: movq -40(%rsp,%rax), %r8 -; AVX512-NEXT: movq -32(%rsp,%rax), %r9 -; AVX512-NEXT: movq -24(%rsp,%rax), %r10 -; AVX512-NEXT: movq %r10, %r11 -; AVX512-NEXT: shldq %cl, %r9, %r11 -; AVX512-NEXT: movq -16(%rsp,%rax), %rax -; AVX512-NEXT: shldq %cl, %r10, %rax -; AVX512-NEXT: shlxq %rcx, %r8, %r10 -; AVX512-NEXT: # kill: def $cl killed $cl killed $rcx -; AVX512-NEXT: shldq %cl, %r8, %r9 -; AVX512-NEXT: andnq 24(%rdi), %rax, %rax -; AVX512-NEXT: andnq 16(%rdi), %r11, %rcx -; AVX512-NEXT: andnq 
8(%rdi), %r9, %r8 -; AVX512-NEXT: andnq (%rdi), %r10, %r9 -; AVX512-NEXT: movq %rcx, 16(%rdi) -; AVX512-NEXT: movq %rax, 24(%rdi) -; AVX512-NEXT: movq %r9, (%rdi) -; AVX512-NEXT: movq %r8, 8(%rdi) -; AVX512-NEXT: orq %rax, %r8 -; AVX512-NEXT: orq %r9, %rcx -; AVX512-NEXT: movl (%rsi), %eax -; AVX512-NEXT: movl %r9d, (%rsi) -; AVX512-NEXT: movl (%rdx), %edx -; AVX512-NEXT: addl %edx, %eax -; AVX512-NEXT: orq %r8, %rcx -; AVX512-NEXT: cmovnel %edx, %eax -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; X64-LABEL: chain_reset_i256: +; X64: # %bb.0: +; X64-NEXT: # kill: def $ecx killed $ecx def $rcx +; X64-NEXT: movl $-2, %eax +; X64-NEXT: roll %cl, %eax +; X64-NEXT: shrl $3, %ecx +; X64-NEXT: andl $28, %ecx +; X64-NEXT: andl %eax, (%rdi,%rcx) +; X64-NEXT: movq (%rdi), %rcx +; X64-NEXT: movq 8(%rdi), %r8 +; X64-NEXT: orq 24(%rdi), %r8 +; X64-NEXT: movq 16(%rdi), %rdi +; X64-NEXT: orq %rcx, %rdi +; X64-NEXT: movl (%rsi), %eax +; X64-NEXT: movl %ecx, (%rsi) +; X64-NEXT: movl (%rdx), %ecx +; X64-NEXT: addl %ecx, %eax +; X64-NEXT: orq %r8, %rdi +; X64-NEXT: cmovnel %ecx, %eax +; X64-NEXT: retq %rem = and i32 %position, 255 %ofs = zext nneg i32 %rem to i256 %bit = shl nuw i256 1, %ofs From 86d9e1c035fe14a18751523108ffdef092910b14 Mon Sep 17 00:00:00 2001 From: SKill <skill@google.com> Date: Wed, 5 Nov 2025 11:45:32 +0100 Subject: [PATCH 298/313] [clang] Delete duplicate code in sourcemanager (#166236) Now that the `SourceManager::getExpansionLoc` and `SourceManager::getSpellingLoc` functions are efficient, delete unnecessary code duplicate in `SourceManager::getDecomposedExpansionLoc` and `SourceManager::getDecomposedSpellingLoc` methods. --- clang/include/clang/Basic/SourceManager.h | 25 ++-------------- clang/lib/Basic/SourceManager.cpp | 35 ----------------------- 2 files changed, 2 insertions(+), 58 deletions(-) diff --git a/clang/include/clang/Basic/SourceManager.h b/clang/include/clang/Basic/SourceManager.h index ed967fd47dc83..6d9d074d78026 100644 --- a/clang/include/clang/Basic/SourceManager.h +++ b/clang/include/clang/Basic/SourceManager.h @@ -1286,16 +1286,7 @@ class SourceManager : public RefCountedBase<SourceManager> { /// If the location is an expansion record, walk through it until we find /// the final location expanded. FileIDAndOffset getDecomposedExpansionLoc(SourceLocation Loc) const { - FileID FID = getFileID(Loc); - auto *E = getSLocEntryOrNull(FID); - if (!E) - return std::make_pair(FileID(), 0); - - unsigned Offset = Loc.getOffset()-E->getOffset(); - if (Loc.isFileID()) - return std::make_pair(FID, Offset); - - return getDecomposedExpansionLocSlowCase(E); + return getDecomposedLoc(getExpansionLoc(Loc)); } /// Decompose the specified location into a raw FileID + Offset pair. @@ -1303,15 +1294,7 @@ class SourceManager : public RefCountedBase<SourceManager> { /// If the location is an expansion record, walk through it until we find /// its spelling record. 
FileIDAndOffset getDecomposedSpellingLoc(SourceLocation Loc) const { - FileID FID = getFileID(Loc); - auto *E = getSLocEntryOrNull(FID); - if (!E) - return std::make_pair(FileID(), 0); - - unsigned Offset = Loc.getOffset()-E->getOffset(); - if (Loc.isFileID()) - return std::make_pair(FID, Offset); - return getDecomposedSpellingLocSlowCase(E, Offset); + return getDecomposedLoc(getSpellingLoc(Loc)); } /// Returns the "included/expanded in" decomposed location of the given @@ -1979,10 +1962,6 @@ class SourceManager : public RefCountedBase<SourceManager> { SourceLocation getSpellingLocSlowCase(SourceLocation Loc) const; SourceLocation getFileLocSlowCase(SourceLocation Loc) const; - FileIDAndOffset - getDecomposedExpansionLocSlowCase(const SrcMgr::SLocEntry *E) const; - FileIDAndOffset getDecomposedSpellingLocSlowCase(const SrcMgr::SLocEntry *E, - unsigned Offset) const; void computeMacroArgsCache(MacroArgsMap &MacroArgsCache, FileID FID) const; void associateFileChunkWithMacroArgExp(MacroArgsMap &MacroArgsCache, FileID FID, diff --git a/clang/lib/Basic/SourceManager.cpp b/clang/lib/Basic/SourceManager.cpp index 97aa0f2aa59b9..7dc81c50f87a2 100644 --- a/clang/lib/Basic/SourceManager.cpp +++ b/clang/lib/Basic/SourceManager.cpp @@ -928,41 +928,6 @@ SourceLocation SourceManager::getFileLocSlowCase(SourceLocation Loc) const { return Loc; } -FileIDAndOffset SourceManager::getDecomposedExpansionLocSlowCase( - const SrcMgr::SLocEntry *E) const { - // If this is an expansion record, walk through all the expansion points. - FileID FID; - SourceLocation Loc; - unsigned Offset; - do { - Loc = E->getExpansion().getExpansionLocStart(); - - FID = getFileID(Loc); - E = &getSLocEntry(FID); - Offset = Loc.getOffset()-E->getOffset(); - } while (!Loc.isFileID()); - - return std::make_pair(FID, Offset); -} - -FileIDAndOffset -SourceManager::getDecomposedSpellingLocSlowCase(const SrcMgr::SLocEntry *E, - unsigned Offset) const { - // If this is an expansion record, walk through all the expansion points. - FileID FID; - SourceLocation Loc; - do { - Loc = E->getExpansion().getSpellingLoc(); - Loc = Loc.getLocWithOffset(Offset); - - FID = getFileID(Loc); - E = &getSLocEntry(FID); - Offset = Loc.getOffset()-E->getOffset(); - } while (!Loc.isFileID()); - - return std::make_pair(FID, Offset); -} - /// getImmediateSpellingLoc - Given a SourceLocation object, return the /// spelling location referenced by the ID. This is the first level down /// towards the place where the characters that make up the lexed token can be From a65867ac319b0fbf5891b39df3c7b660070d63bd Mon Sep 17 00:00:00 2001 From: Elvina Yakubova <eyakubova@nvidia.com> Date: Wed, 5 Nov 2025 10:51:31 +0000 Subject: [PATCH 299/313] [BOLT][AArch64] Fix search to proceed upwards from memcpy call (#166182) The search should proceed from CallInst to the beginning of BB since X2 can be rewritten and we need to catch the most recent write before the call. 
Patch by Yafet Beyene alulayafet@gmail.com --- bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp | 2 +- bolt/test/runtime/AArch64/inline-memcpy.s | 14 +++++++++++++- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp index 8a496c566b06b..57db6a436c5c6 100644 --- a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp +++ b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp @@ -2804,7 +2804,7 @@ class AArch64MCPlusBuilder : public MCPlusBuilder { BitVector WrittenRegs(RegInfo->getNumRegs()); const BitVector &SizeRegAliases = getAliases(SizeReg); - for (auto InstIt = BB.begin(); InstIt != CallInst; ++InstIt) { + for (auto InstIt = CallInst; InstIt != BB.begin(); --InstIt) { const MCInst &Inst = *InstIt; WrittenRegs.reset(); getWrittenRegs(Inst, WrittenRegs); diff --git a/bolt/test/runtime/AArch64/inline-memcpy.s b/bolt/test/runtime/AArch64/inline-memcpy.s index dc59a08b889a7..badff299603a0 100644 --- a/bolt/test/runtime/AArch64/inline-memcpy.s +++ b/bolt/test/runtime/AArch64/inline-memcpy.s @@ -7,7 +7,7 @@ # RUN: llvm-bolt %t.exe --inline-memcpy -o %t.bolt 2>&1 | FileCheck %s --check-prefix=CHECK-INLINE # RUN: llvm-objdump -d %t.bolt | FileCheck %s --check-prefix=CHECK-ASM -# Verify BOLT reports that it inlined memcpy calls (11 successful inlines out of 16 total calls) +# Verify BOLT reports that it inlined memcpy calls (11 successful inlines out of 17 total calls) # CHECK-INLINE: BOLT-INFO: inlined 11 memcpy() calls # Each function should use optimal size-specific instructions and NO memcpy calls @@ -84,6 +84,9 @@ # CHECK-ASM-LABEL: <test_register_move_negative>: # CHECK-ASM: bl{{.*}}<memcpy +# CHECK-ASM-LABEL: <test_x2_rewrite_negative>: +# CHECK-ASM: bl{{.*}}<memcpy + # Live-in parameter should NOT be inlined (size unknown at compile time) # CHECK-ASM-LABEL: <test_live_in_negative>: # CHECK-ASM: bl{{.*}}<memcpy @@ -273,6 +276,15 @@ test_register_move_negative: ret .size test_register_move_negative, .-test_register_move_negative + .globl test_x2_rewrite_negative + .type test_x2_rewrite_negative,@function +test_x2_rewrite_negative: + mov x2, #8 + ldr x2, [sp, #24] + bl memcpy + ret + .size test_x2_rewrite_negative, .-test_x2_rewrite_negative + .globl test_live_in_negative .type test_live_in_negative,@function test_live_in_negative: From 5821b09e5f348cb750d4b9cc2f532ca34cd8a6c6 Mon Sep 17 00:00:00 2001 From: Timm Baeder <tbaeder@redhat.com> Date: Wed, 5 Nov 2025 11:52:59 +0100 Subject: [PATCH 300/313] [clang][bytecode] Print primitive arrays in Descriptor::dumpFull() (#166393) And recurse into records properly. 
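For example, for a hypothetical global `int arr[2]`, the descriptor dump now
includes the per-element layout, roughly like this (illustrative output only;
element offsets begin at `sizeof(InitMapPtr)` past the descriptor's offset and
advance by the element size, so the exact type name and numbers are
host-dependent):

```
Elements: 2
Element type: Sint32
Element 0 offset: 16
Element 1 offset: 20
```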
--- clang/lib/AST/ByteCode/Disasm.cpp | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/clang/lib/AST/ByteCode/Disasm.cpp b/clang/lib/AST/ByteCode/Disasm.cpp index fd0903f2e652c..638028f84ff24 100644 --- a/clang/lib/AST/ByteCode/Disasm.cpp +++ b/clang/lib/AST/ByteCode/Disasm.cpp @@ -436,8 +436,28 @@ LLVM_DUMP_METHOD void Descriptor::dumpFull(unsigned Offset, FO += ElemDesc->getAllocSize(); } + } else if (isPrimitiveArray()) { + OS.indent(Spaces) << "Elements: " << getNumElems() << '\n'; + OS.indent(Spaces) << "Element type: " << primTypeToString(getPrimType()) + << '\n'; + unsigned FO = Offset + sizeof(InitMapPtr); + for (unsigned I = 0; I != getNumElems(); ++I) { + OS.indent(Spaces) << "Element " << I << " offset: " << FO << '\n'; + FO += getElemSize(); + } } else if (isRecord()) { ElemRecord->dump(OS, Indent + 1, Offset); + unsigned I = 0; + for (const Record::Field &F : ElemRecord->fields()) { + OS.indent(Spaces) << "- Field " << I << ": "; + { + ColorScope SC(OS, true, {llvm::raw_ostream::BRIGHT_RED, true}); + OS << F.Decl->getName(); + } + OS << ". Offset " << (Offset + F.Offset) << "\n"; + F.Desc->dumpFull(Offset + F.Offset, Indent + 1); + ++I; + } } else if (isPrimitive()) { } else { } From d249e67a6a7c36a2ffcf08839810a8efa4603f18 Mon Sep 17 00:00:00 2001 From: Victor Campos <victor.campos@arm.com> Date: Wed, 5 Nov 2025 11:14:02 +0000 Subject: [PATCH 301/313] [libc][math] Disable `FEnvSafeTest.cpp` if AArch64 target has no FP support (#166370) The `FEnvSafeTest.cpp` test fails on AArch64 soft nofp configurations because LLVM libc does not provide a floating-point environment in these configurations. This patch adds another preprocessor guard on `__ARM_FP` to disable the test on those. --- libc/test/UnitTest/FEnvSafeTest.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/libc/test/UnitTest/FEnvSafeTest.cpp b/libc/test/UnitTest/FEnvSafeTest.cpp index 4393f9d5e5c3b..64f50d7be7fe3 100644 --- a/libc/test/UnitTest/FEnvSafeTest.cpp +++ b/libc/test/UnitTest/FEnvSafeTest.cpp @@ -43,7 +43,8 @@ void FEnvSafeTest::set_fenv(const fenv_t &fenv) { void FEnvSafeTest::expect_fenv_eq(const fenv_t &before_fenv, const fenv_t &after_fenv) { -#if defined(LIBC_TARGET_ARCH_IS_AARCH64) && !defined(LIBC_COMPILER_IS_MSVC) +#if defined(LIBC_TARGET_ARCH_IS_AARCH64) && !defined(LIBC_COMPILER_IS_MSVC) && \ + defined(__ARM_FP) using FPState = LIBC_NAMESPACE::fputil::FEnv::FPState; const FPState &before_state = reinterpret_cast<const FPState &>(before_fenv); const FPState &after_state = reinterpret_cast<const FPState &>(after_fenv); From fedd3b0399e78f47c5795f436b88722319832172 Mon Sep 17 00:00:00 2001 From: Timm Baeder <tbaeder@redhat.com> Date: Wed, 5 Nov 2025 12:15:48 +0100 Subject: [PATCH 302/313] [clang][bytecode] Remove dummy variables once they are proper globals (#166174) Dummy variables have an entry in `Program::Globals`, but they are not added to `GlobalIndices`. When registering redeclarations, we used to only patch up the global indices, but that left the dummy variables alone. Update the dummy variables of all redeclarations as well. 
Fixes https://github.com/llvm/llvm-project/issues/165952
---
 clang/lib/AST/ByteCode/Program.cpp  | 29 +++++++++++++++++++++----
 clang/lib/AST/ByteCode/Program.h    |  1 -
 clang/test/AST/ByteCode/records.cpp | 11 +++++++++++
 3 files changed, 36 insertions(+), 5 deletions(-)

diff --git a/clang/lib/AST/ByteCode/Program.cpp b/clang/lib/AST/ByteCode/Program.cpp
index e0b2852f0e906..2425373ab2ef8 100644
--- a/clang/lib/AST/ByteCode/Program.cpp
+++ b/clang/lib/AST/ByteCode/Program.cpp
@@ -218,21 +218,42 @@ UnsignedOrNone Program::createGlobal(const ValueDecl *VD, const Expr *Init) {
     return std::nullopt;
 
   Global *NewGlobal = Globals[*Idx];
+  // Note that this loop has one iteration where Redecl == VD.
   for (const Decl *Redecl : VD->redecls()) {
-    unsigned &PIdx = GlobalIndices[Redecl];
+
+    // If this redecl was registered as a dummy variable, it is now a proper
+    // global variable and points to the block we just created.
+    if (auto DummyIt = DummyVariables.find(Redecl);
+        DummyIt != DummyVariables.end()) {
+      assert(!Globals[DummyIt->second]->block()->hasPointers());
+      Globals[DummyIt->second] = NewGlobal;
+      DummyVariables.erase(DummyIt);
+    }
+
+    // If the redeclaration hasn't been registered at all yet, we just set its
+    // global index to Idx. If it has already been registered, it might have
+    // pointers pointing to it and we need to transfer those pointers to the
+    // new block.
+    auto [Iter, Inserted] = GlobalIndices.try_emplace(Redecl);
+    if (Inserted) {
+      GlobalIndices[Redecl] = *Idx;
+      continue;
+    }
+
     if (Redecl != VD) {
-      if (Block *RedeclBlock = Globals[PIdx]->block();
+      if (Block *RedeclBlock = Globals[Iter->second]->block();
           RedeclBlock->isExtern()) {
-        Globals[PIdx] = NewGlobal;
+
         // All pointers pointing to the previous extern decl now point to the
        // new decl.
        // A previous iteration might've already fixed up the pointers for this
        // global.
        if (RedeclBlock != NewGlobal->block())
          RedeclBlock->movePointersTo(NewGlobal->block());
+
+        Globals[Iter->second] = NewGlobal;
       }
     }
-    PIdx = *Idx;
+    Iter->second = *Idx;
   }
 
   return *Idx;
diff --git a/clang/lib/AST/ByteCode/Program.h b/clang/lib/AST/ByteCode/Program.h
index 28fcc97f5339d..cc9127dc77860 100644
--- a/clang/lib/AST/ByteCode/Program.h
+++ b/clang/lib/AST/ByteCode/Program.h
@@ -205,7 +205,6 @@ class Program final {
     const Block *block() const { return &B; }
 
   private:
-    /// Required metadata - does not actually track pointers.
     Block B;
   };
 
diff --git a/clang/test/AST/ByteCode/records.cpp b/clang/test/AST/ByteCode/records.cpp
index 83f32c97c50c7..4799ebe25dde1 100644
--- a/clang/test/AST/ByteCode/records.cpp
+++ b/clang/test/AST/ByteCode/records.cpp
@@ -1882,3 +1882,14 @@ namespace MethodWillHaveBody {
   }
   int n = f(0); // both-note {{instantiation of}}
 }
+
+namespace StaticRedecl {
+  struct T {
+    static T tt;
+    constexpr T() : p(&tt) {}
+    T *p;
+  };
+  T T::tt;
+  constexpr T t;
+  static_assert(t.p == &T::tt, "");
+}

From e8564830c19e6fd5bfa38488c06f332b214ea858 Mon Sep 17 00:00:00 2001
From: Congcong Cai <congcongcai0907@163.com>
Date: Wed, 5 Nov 2025 19:25:53 +0800
Subject: [PATCH 303/313] [clang-tidy][doc] add more information in
 twine-local's document (#166266)

Explain more about use-after-free in llvm-twine-local.
Add a note about manually adjusting code after applying fix-it hints.
fixed: #154810
---
 .../docs/clang-tidy/checks/llvm/twine-local.rst | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/clang-tools-extra/docs/clang-tidy/checks/llvm/twine-local.rst b/clang-tools-extra/docs/clang-tidy/checks/llvm/twine-local.rst
index ec9ef1c60913c..6c994a48d83de 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/llvm/twine-local.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/llvm/twine-local.rst
@@ -14,3 +14,21 @@ should be generally avoided.
   // becomes
   static std::string Moo = (Twine("bark") + "bah").str();
+
+The ``Twine`` does not own the memory of its contents, so it is not
+recommended to use ``Twine`` created from temporary strings or string literals.
+
+.. code-block:: c++
+
+  static Twine getModuleIdentifier(StringRef moduleName) {
+    return moduleName + "_module";
+  }
+  void foo() {
+    Twine result = getModuleIdentifier(std::string{"abc"} + "def");
+    // temporary std::string is destroyed here, result is dangling
+  }
+
+After applying these fix-it hints, the code will use ``std::string`` instead of
+``Twine`` for local variables. However, ``Twine`` has lots of methods that
+are incompatible with ``std::string``, so the user may need to adjust the code
+manually after applying the fix-it hints.

From c1dc064ba063f0d679f1a6d6aeef99becea8b709 Mon Sep 17 00:00:00 2001
From: Sirui Mu <msrlancern@gmail.com>
Date: Wed, 5 Nov 2025 19:32:34 +0800
Subject: [PATCH 304/313] [CIR] Add support for storing into _Atomic variables
 (#165872)

---
 clang/include/clang/CIR/MissingFeatures.h |   2 +
 clang/lib/CIR/CodeGen/CIRGenAtomic.cpp    | 121 +++++++++++++++++++++-
 clang/lib/CIR/CodeGen/CIRGenExpr.cpp      |  14 ++-
 clang/lib/CIR/CodeGen/CIRGenFunction.h    |   7 +-
 clang/test/CIR/CodeGen/atomic.c           |  26 +++++
 5 files changed, 162 insertions(+), 8 deletions(-)

diff --git a/clang/include/clang/CIR/MissingFeatures.h b/clang/include/clang/CIR/MissingFeatures.h
index 48ef8be9fb782..6f099a7027a10 100644
--- a/clang/include/clang/CIR/MissingFeatures.h
+++ b/clang/include/clang/CIR/MissingFeatures.h
@@ -180,6 +180,8 @@ struct MissingFeatures {
   static bool atomicSyncScopeID() { return false; }
   static bool atomicTypes() { return false; }
   static bool atomicUseLibCall() { return false; }
+  static bool atomicMicrosoftVolatile() { return false; }
+  static bool atomicOpenMP() { return false; }
 
   // Global ctor handling
   static bool globalCtorLexOrder() { return false; }
diff --git a/clang/lib/CIR/CodeGen/CIRGenAtomic.cpp b/clang/lib/CIR/CodeGen/CIRGenAtomic.cpp
index 7db6e283ec0a5..cd4c1f0e5b769 100644
--- a/clang/lib/CIR/CodeGen/CIRGenAtomic.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenAtomic.cpp
@@ -27,6 +27,7 @@ class AtomicInfo {
   CharUnits atomicAlign;
   CharUnits valueAlign;
   TypeEvaluationKind evaluationKind = cir::TEK_Scalar;
+  bool useLibCall = true;
   LValue lvalue;
   mlir::Location loc;
 
@@ -62,8 +63,8 @@ class AtomicInfo {
       assert(!cir::MissingFeatures::atomicInfo());
       cgf.cgm.errorNYI(loc, "AtomicInfo: non-simple lvalue");
     }
-
-    assert(!cir::MissingFeatures::atomicUseLibCall());
+    useLibCall = !ctx.getTargetInfo().hasBuiltinAtomic(
+        atomicSizeInBits, ctx.toBits(lvalue.getAlignment()));
   }
 
   QualType getValueType() const { return valueTy; }
@@ -75,6 +76,8 @@ class AtomicInfo {
     assert(!cir::MissingFeatures::atomicInfoGetAtomicPointer());
     return nullptr;
   }
+  bool shouldUseLibCall() const { return useLibCall; }
+  const LValue &getAtomicLValue() const { return lvalue; }
   Address getAtomicAddress() const {
     mlir::Type elemTy;
     if (lvalue.isSimple()) {
@@ -96,6 +99,8 @@ class AtomicInfo {
 
   bool emitMemSetZeroIfNecessary() const;
 
+  mlir::Value getScalarRValValueOrNull(RValue rvalue) const;
+
   /// Cast the given pointer to an integer pointer suitable for atomic
   /// operations on the source.
   Address castToAtomicIntPointer(Address addr) const;
@@ -105,6 +110,9 @@ class AtomicInfo {
   /// copy the value across.
   Address convertToAtomicIntPointer(Address addr) const;
 
+  /// Converts an rvalue to an integer value.
+  mlir::Value convertRValueToInt(RValue rvalue, bool cmpxchg = false) const;
+
   /// Copy an atomic r-value into atomic-layout memory.
   void emitCopyIntoMemory(RValue rvalue) const;
 
@@ -195,6 +203,12 @@ Address AtomicInfo::createTempAlloca() const {
   return tempAlloca;
}
 
+mlir::Value AtomicInfo::getScalarRValValueOrNull(RValue rvalue) const {
+  if (rvalue.isScalar() && (!hasPadding() || !lvalue.isSimple()))
+    return rvalue.getValue();
+  return nullptr;
+}
+
 Address AtomicInfo::castToAtomicIntPointer(Address addr) const {
   auto intTy = mlir::dyn_cast<cir::IntType>(addr.getElementType());
   // Don't bother with int casts if the integer size is the same.
@@ -211,10 +225,38 @@ bool AtomicInfo::emitMemSetZeroIfNecessary() const {
     return false;
 
   cgf.cgm.errorNYI(loc,
                    "AtomicInfo::emitMemSetZeroIfNecessary: emit memset zero");
   return false;
 }
 
+/// Return true if \param valueTy is a type that should be casted to integer
+/// around the atomic memory operation. If \param cmpxchg is true, then the
+/// cast of a floating point type is made as that instruction can not have
+/// floating point operands. TODO: Allow compare-and-exchange and FP - see
+/// comment in CIRGenAtomicExpandPass.cpp.
+static bool shouldCastToInt(mlir::Type valueTy, bool cmpxchg) {
+  if (cir::isAnyFloatingPointType(valueTy))
+    return isa<cir::FP80Type>(valueTy) || cmpxchg;
+  return !isa<cir::IntType>(valueTy) && !isa<cir::PointerType>(valueTy);
+}
+
+mlir::Value AtomicInfo::convertRValueToInt(RValue rvalue, bool cmpxchg) const {
+  // If we've got a scalar value of the right size, try to avoid going
+  // through memory. Floats get casted if needed by AtomicExpandPass.
+  if (mlir::Value value = getScalarRValValueOrNull(rvalue)) {
+    if (!shouldCastToInt(value.getType(), cmpxchg))
+      return cgf.emitToMemory(value, valueTy);
+
+    cgf.cgm.errorNYI(
+        loc, "AtomicInfo::convertRValueToInt: cast scalar rvalue to int");
+    return nullptr;
+  }
+
+  cgf.cgm.errorNYI(
+      loc, "AtomicInfo::convertRValueToInt: cast non-scalar rvalue to int");
+  return nullptr;
+}
+
 /// Copy an r-value into memory as part of storing to an atomic type.
 /// This needs to create a bit-pattern suitable for atomic operations.
 void AtomicInfo::emitCopyIntoMemory(RValue rvalue) const {
@@ -815,6 +857,79 @@ RValue CIRGenFunction::emitAtomicExpr(AtomicExpr *e) {
                        e->getExprLoc());
 }
 
+void CIRGenFunction::emitAtomicStore(RValue rvalue, LValue dest, bool isInit) {
+  bool isVolatile = dest.isVolatileQualified();
+  auto order = cir::MemOrder::SequentiallyConsistent;
+  if (!dest.getType()->isAtomicType()) {
+    assert(!cir::MissingFeatures::atomicMicrosoftVolatile());
+  }
+  return emitAtomicStore(rvalue, dest, order, isVolatile, isInit);
+}
+
+/// Emit a store to an l-value of atomic type.
+///
+/// Note that the r-value is expected to be an r-value of the atomic type; this
+/// means that for aggregate r-values, it should include storage for any padding
+/// that was necessary.
+void CIRGenFunction::emitAtomicStore(RValue rvalue, LValue dest, + cir::MemOrder order, bool isVolatile, + bool isInit) { + // If this is an aggregate r-value, it should agree in type except + // maybe for address-space qualification. + mlir::Location loc = dest.getPointer().getLoc(); + assert(!rvalue.isAggregate() || + rvalue.getAggregateAddress().getElementType() == + dest.getAddress().getElementType()); + + AtomicInfo atomics(*this, dest, loc); + LValue lvalue = atomics.getAtomicLValue(); + + if (lvalue.isSimple()) { + // If this is an initialization, just put the value there normally. + if (isInit) { + atomics.emitCopyIntoMemory(rvalue); + return; + } + + // Check whether we should use a library call. + if (atomics.shouldUseLibCall()) { + assert(!cir::MissingFeatures::atomicUseLibCall()); + cgm.errorNYI(loc, "emitAtomicStore: atomic store with library call"); + return; + } + + // Okay, we're doing this natively. + mlir::Value valueToStore = atomics.convertRValueToInt(rvalue); + + // Do the atomic store. + Address addr = atomics.getAtomicAddress(); + if (mlir::Value value = atomics.getScalarRValValueOrNull(rvalue)) { + if (shouldCastToInt(value.getType(), /*CmpXchg=*/false)) { + addr = atomics.castToAtomicIntPointer(addr); + valueToStore = + builder.createIntCast(valueToStore, addr.getElementType()); + } + } + cir::StoreOp store = builder.createStore(loc, valueToStore, addr); + + // Initializations don't need to be atomic. + if (!isInit) { + assert(!cir::MissingFeatures::atomicOpenMP()); + store.setMemOrder(order); + } + + // Other decoration. + if (isVolatile) + store.setIsVolatile(true); + + assert(!cir::MissingFeatures::opLoadStoreTbaa()); + return; + } + + cgm.errorNYI(loc, "emitAtomicStore: non-simple atomic lvalue"); + assert(!cir::MissingFeatures::opLoadStoreAtomic()); +} + void CIRGenFunction::emitAtomicInit(Expr *init, LValue dest) { AtomicInfo atomics(*this, dest, getLoc(init->getSourceRange())); diff --git a/clang/lib/CIR/CodeGen/CIRGenExpr.cpp b/clang/lib/CIR/CodeGen/CIRGenExpr.cpp index 4fb178df0e508..422fa1cf5ad2e 100644 --- a/clang/lib/CIR/CodeGen/CIRGenExpr.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenExpr.cpp @@ -311,7 +311,8 @@ static LValue emitGlobalVarDeclLValue(CIRGenFunction &cgf, const Expr *e, void CIRGenFunction::emitStoreOfScalar(mlir::Value value, Address addr, bool isVolatile, QualType ty, - bool isInit, bool isNontemporal) { + LValueBaseInfo baseInfo, bool isInit, + bool isNontemporal) { assert(!cir::MissingFeatures::opLoadStoreThreadLocal()); if (const auto *clangVecTy = ty->getAs<clang::VectorType>()) { @@ -333,7 +334,13 @@ void CIRGenFunction::emitStoreOfScalar(mlir::Value value, Address addr, value = emitToMemory(value, ty); - assert(!cir::MissingFeatures::opLoadStoreAtomic()); + assert(!cir::MissingFeatures::opLoadStoreTbaa()); + LValue atomicLValue = LValue::makeAddr(addr, ty, baseInfo); + if (ty->isAtomicType() || + (!isInit && isLValueSuitableForInlineAtomic(atomicLValue))) { + emitAtomicStore(RValue::get(value), atomicLValue, isInit); + return; + } // Update the alloca with more info on initialization. 
assert(addr.getPointer() && "expected pointer to exist"); @@ -550,7 +557,8 @@ void CIRGenFunction::emitStoreOfScalar(mlir::Value value, LValue lvalue, } emitStoreOfScalar(value, lvalue.getAddress(), lvalue.isVolatile(), - lvalue.getType(), isInit, /*isNontemporal=*/false); + lvalue.getType(), lvalue.getBaseInfo(), isInit, + /*isNontemporal=*/false); } mlir::Value CIRGenFunction::emitLoadOfScalar(Address addr, bool isVolatile, diff --git a/clang/lib/CIR/CodeGen/CIRGenFunction.h b/clang/lib/CIR/CodeGen/CIRGenFunction.h index dece642eb13b6..1c52a78d72e33 100644 --- a/clang/lib/CIR/CodeGen/CIRGenFunction.h +++ b/clang/lib/CIR/CodeGen/CIRGenFunction.h @@ -1271,6 +1271,9 @@ class CIRGenFunction : public CIRGenTypeCache { RValue emitAtomicExpr(AtomicExpr *e); void emitAtomicInit(Expr *init, LValue dest); + void emitAtomicStore(RValue rvalue, LValue dest, bool isInit); + void emitAtomicStore(RValue rvalue, LValue dest, cir::MemOrder order, + bool isVolatile, bool isInit); AutoVarEmission emitAutoVarAlloca(const clang::VarDecl &d, mlir::OpBuilder::InsertPoint ip = {}); @@ -1680,8 +1683,8 @@ class CIRGenFunction : public CIRGenTypeCache { bool isInit); void emitStoreOfScalar(mlir::Value value, Address addr, bool isVolatile, - clang::QualType ty, bool isInit = false, - bool isNontemporal = false); + clang::QualType ty, LValueBaseInfo baseInfo, + bool isInit = false, bool isNontemporal = false); void emitStoreOfScalar(mlir::Value value, LValue lvalue, bool isInit); /// Store the specified rvalue into the specified diff --git a/clang/test/CIR/CodeGen/atomic.c b/clang/test/CIR/CodeGen/atomic.c index 65799881a0cbe..d5bea8446d730 100644 --- a/clang/test/CIR/CodeGen/atomic.c +++ b/clang/test/CIR/CodeGen/atomic.c @@ -46,6 +46,32 @@ void f2(void) { // OGCG-NEXT: store i32 42, ptr %[[SLOT]], align 4 // OGCG: } +void f3(_Atomic(int) *p) { + *p = 42; +} + +// CIR-LABEL: @f3 +// CIR: cir.store align(4) atomic(seq_cst) %{{.+}}, %{{.+}} : !s32i, !cir.ptr<!s32i> + +// LLVM-LABEL: @f3 +// LLVM: store atomic i32 42, ptr %{{.+}} seq_cst, align 4 + +// OGCG-LABEL: @f3 +// OGCG: store atomic i32 42, ptr %{{.+}} seq_cst, align 4 + +void f4(_Atomic(float) *p) { + *p = 3.14; +} + +// CIR-LABEL: @f4 +// CIR: cir.store align(4) atomic(seq_cst) %{{.+}}, %{{.+}} : !cir.float, !cir.ptr<!cir.float> + +// LLVM-LABEL: @f4 +// LLVM: store atomic float 0x40091EB860000000, ptr %{{.+}} seq_cst, align 4 + +// OGCG-LABEL: @f4 +// OGCG: store atomic float 0x40091EB860000000, ptr %{{.+}} seq_cst, align 4 + void load(int *ptr) { int x; __atomic_load(ptr, &x, __ATOMIC_RELAXED); From a38e0942407ef3395264831ef971838b69d2a652 Mon Sep 17 00:00:00 2001 From: Matthias Springer <me@m-sp.org> Date: Wed, 5 Nov 2025 21:04:32 +0900 Subject: [PATCH 305/313] [mlir] Dialect Conversion: Add support for post-order legalization order (#166292) By default, the dialect conversion driver processes operations in pre-order: the initial worklist is populated pre-order. (New/modified operations are immediately legalized recursively.) This commit adds a new API for selective post-order legalization. Patterns can request an operation / region legalization via `ConversionPatternRewriter::legalize`. They can call these helper functions on nested regions before rewriting the operation itself. Note: In rollback mode, a failed recursive legalization typically leads to a conversion failure. Since recursive legalization is performed by separate pattern applications, there is no way for the original pattern to recover from such a failure. 
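For illustration, here is a minimal sketch of a pattern that opts into
post-order legalization; `MyWrapperOp` and `LoweredWrapperOp` are hypothetical
ops used only for this example and are not part of this patch:

```c++
struct WrapperLowering : public OpConversionPattern<MyWrapperOp> {
  using OpConversionPattern::OpConversionPattern;

  LogicalResult
  matchAndRewrite(MyWrapperOp op, OpAdaptor adaptor,
                  ConversionPatternRewriter &rewriter) const override {
    // Legalize the nested region first (post-order). If the pattern was
    // constructed with a type converter, this also converts the entry block
    // signature.
    if (failed(rewriter.legalize(&op.getBody())))
      return failure();

    // Rewrite the op itself now that its body is already legal, moving the
    // legalized region into the replacement op.
    auto newOp = rewriter.create<LoweredWrapperOp>(op.getLoc(),
                                                   adaptor.getOperands());
    rewriter.inlineRegionBefore(op.getBody(), newOp.getBody(),
                                newOp.getBody().end());
    rewriter.replaceOp(op, newOp->getResults());
    return success();
  }
};
```

If the recursive `legalize` call fails, the pattern itself fails, which is the
rollback-mode behavior described above.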
--- .../mlir/Transforms/DialectConversion.h | 25 +++- .../Transforms/Utils/DialectConversion.cpp | 120 +++++++++++++----- mlir/test/Transforms/test-legalizer-full.mlir | 18 +++ .../Transforms/test-legalizer-rollback.mlir | 19 +++ mlir/test/Transforms/test-legalizer.mlir | 32 +++++ mlir/test/lib/Dialect/Test/TestPatterns.cpp | 22 +++- 6 files changed, 199 insertions(+), 37 deletions(-) diff --git a/mlir/include/mlir/Transforms/DialectConversion.h b/mlir/include/mlir/Transforms/DialectConversion.h index ed7e2a08ebfd9..5ac9e26e8636d 100644 --- a/mlir/include/mlir/Transforms/DialectConversion.h +++ b/mlir/include/mlir/Transforms/DialectConversion.h @@ -981,6 +981,28 @@ class ConversionPatternRewriter final : public PatternRewriter { /// Return a reference to the internal implementation. detail::ConversionPatternRewriterImpl &getImpl(); + /// Attempt to legalize the given operation. This can be used within + /// conversion patterns to change the default pre-order legalization order. + /// Returns "success" if the operation was legalized, "failure" otherwise. + /// + /// Note: In a partial conversion, this function returns "success" even if + /// the operation could not be legalized, as long as it was not explicitly + /// marked as illegal in the conversion target. + LogicalResult legalize(Operation *op); + + /// Attempt to legalize the given region. This can be used within + /// conversion patterns to change the default pre-order legalization order. + /// Returns "success" if the region was legalized, "failure" otherwise. + /// + /// If the current pattern runs with a type converter, the entry block + /// signature will be converted before legalizing the operations in the + /// region. + /// + /// Note: In a partial conversion, this function returns "success" even if + /// an operation could not be legalized, as long as it was not explicitly + /// marked as illegal in the conversion target. + LogicalResult legalize(Region *r); + private: // Allow OperationConverter to construct new rewriters. friend struct OperationConverter; @@ -989,7 +1011,8 @@ class ConversionPatternRewriter final : public PatternRewriter { /// conversions. They apply some IR rewrites in a delayed fashion and could /// bring the IR into an inconsistent state when used standalone. explicit ConversionPatternRewriter(MLIRContext *ctx, - const ConversionConfig &config); + const ConversionConfig &config, + OperationConverter &converter); // Hide unsupported pattern rewriter API. using OpBuilder::setListener; diff --git a/mlir/lib/Transforms/Utils/DialectConversion.cpp b/mlir/lib/Transforms/Utils/DialectConversion.cpp index 2fe06970eb568..f8c38fadbd229 100644 --- a/mlir/lib/Transforms/Utils/DialectConversion.cpp +++ b/mlir/lib/Transforms/Utils/DialectConversion.cpp @@ -92,6 +92,22 @@ static OpBuilder::InsertPoint computeInsertPoint(ArrayRef<Value> vals) { return pt; } +namespace { +enum OpConversionMode { + /// In this mode, the conversion will ignore failed conversions to allow + /// illegal operations to co-exist in the IR. + Partial, + + /// In this mode, all operations must be legal for the given target for the + /// conversion to succeed. + Full, + + /// In this mode, operations are analyzed for legality. No actual rewrites are + /// applied to the operations on success. 
+ Analysis, +}; +} // namespace + //===----------------------------------------------------------------------===// // ConversionValueMapping //===----------------------------------------------------------------------===// @@ -866,8 +882,9 @@ namespace mlir { namespace detail { struct ConversionPatternRewriterImpl : public RewriterBase::Listener { explicit ConversionPatternRewriterImpl(ConversionPatternRewriter &rewriter, - const ConversionConfig &config) - : rewriter(rewriter), config(config), + const ConversionConfig &config, + OperationConverter &opConverter) + : rewriter(rewriter), config(config), opConverter(opConverter), notifyingRewriter(rewriter.getContext(), config.listener) {} //===--------------------------------------------------------------------===// @@ -1124,6 +1141,9 @@ struct ConversionPatternRewriterImpl : public RewriterBase::Listener { /// Dialect conversion configuration. const ConversionConfig &config; + /// The operation converter to use for recursive legalization. + OperationConverter &opConverter; + /// A set of erased operations. This set is utilized only if /// `allowPatternRollback` is set to "false". Conceptually, this set is /// similar to `replacedOps` (which is maintained when the flag is set to @@ -2084,9 +2104,10 @@ void ConversionPatternRewriterImpl::notifyMatchFailure( //===----------------------------------------------------------------------===// ConversionPatternRewriter::ConversionPatternRewriter( - MLIRContext *ctx, const ConversionConfig &config) - : PatternRewriter(ctx), - impl(new detail::ConversionPatternRewriterImpl(*this, config)) { + MLIRContext *ctx, const ConversionConfig &config, + OperationConverter &opConverter) + : PatternRewriter(ctx), impl(new detail::ConversionPatternRewriterImpl( + *this, config, opConverter)) { setListener(impl.get()); } @@ -2207,6 +2228,37 @@ ConversionPatternRewriter::getRemappedValues(ValueRange keys, return success(); } +LogicalResult ConversionPatternRewriter::legalize(Region *r) { + // Fast path: If the region is empty, there is nothing to legalize. + if (r->empty()) + return success(); + + // Gather a list of all operations to legalize. This is done before + // converting the entry block signature because unrealized_conversion_cast + // ops should not be included. + SmallVector<Operation *> ops; + for (Block &b : *r) + for (Operation &op : b) + ops.push_back(&op); + + // If the current pattern runs with a type converter, convert the entry block + // signature. + if (const TypeConverter *converter = impl->currentTypeConverter) { + std::optional<TypeConverter::SignatureConversion> conversion = + converter->convertBlockSignature(&r->front()); + if (!conversion) + return failure(); + applySignatureConversion(&r->front(), *conversion, converter); + } + + // Legalize all operations in the region. + for (Operation *op : ops) + if (failed(legalize(op))) + return failure(); + + return success(); +} + void ConversionPatternRewriter::inlineBlockBefore(Block *source, Block *dest, Block::iterator before, ValueRange argValues) { @@ -3192,22 +3244,6 @@ static void reconcileUnrealizedCasts( // OperationConverter //===----------------------------------------------------------------------===// -namespace { -enum OpConversionMode { - /// In this mode, the conversion will ignore failed conversions to allow - /// illegal operations to co-exist in the IR. - Partial, - - /// In this mode, all operations must be legal for the given target for the - /// conversion to succeed. 
- Full, - - /// In this mode, operations are analyzed for legality. No actual rewrites are - /// applied to the operations on success. - Analysis, -}; -} // namespace - namespace mlir { // This class converts operations to a given conversion target via a set of // rewrite patterns. The conversion behaves differently depending on the @@ -3217,16 +3253,20 @@ struct OperationConverter { const FrozenRewritePatternSet &patterns, const ConversionConfig &config, OpConversionMode mode) - : rewriter(ctx, config), opLegalizer(rewriter, target, patterns), + : rewriter(ctx, config, *this), opLegalizer(rewriter, target, patterns), mode(mode) {} /// Converts the given operations to the conversion target. LogicalResult convertOperations(ArrayRef<Operation *> ops); -private: - /// Converts an operation with the given rewriter. - LogicalResult convert(Operation *op); + /// Converts a single operation. If `isRecursiveLegalization` is "true", the + /// conversion is a recursive legalization request, triggered from within a + /// pattern. In that case, do not emit errors because there will be another + /// attempt at legalizing the operation later (via the regular pre-order + /// legalization mechanism). + LogicalResult convert(Operation *op, bool isRecursiveLegalization = false); +private: /// The rewriter to use when converting operations. ConversionPatternRewriter rewriter; @@ -3238,32 +3278,42 @@ struct OperationConverter { }; } // namespace mlir -LogicalResult OperationConverter::convert(Operation *op) { +LogicalResult ConversionPatternRewriter::legalize(Operation *op) { + return impl->opConverter.convert(op, /*isRecursiveLegalization=*/true); +} + +LogicalResult OperationConverter::convert(Operation *op, + bool isRecursiveLegalization) { const ConversionConfig &config = rewriter.getConfig(); // Legalize the given operation. if (failed(opLegalizer.legalize(op))) { // Handle the case of a failed conversion for each of the different modes. // Full conversions expect all operations to be converted. - if (mode == OpConversionMode::Full) - return op->emitError() - << "failed to legalize operation '" << op->getName() << "'"; + if (mode == OpConversionMode::Full) { + if (!isRecursiveLegalization) + op->emitError() << "failed to legalize operation '" << op->getName() + << "'"; + return failure(); + } // Partial conversions allow conversions to fail iff the operation was not // explicitly marked as illegal. If the user provided a `unlegalizedOps` // set, non-legalizable ops are added to that set. if (mode == OpConversionMode::Partial) { - if (opLegalizer.isIllegal(op)) - return op->emitError() - << "failed to legalize operation '" << op->getName() - << "' that was explicitly marked illegal"; - if (config.unlegalizedOps) + if (opLegalizer.isIllegal(op)) { + if (!isRecursiveLegalization) + op->emitError() << "failed to legalize operation '" << op->getName() + << "' that was explicitly marked illegal"; + return failure(); + } + if (config.unlegalizedOps && !isRecursiveLegalization) config.unlegalizedOps->insert(op); } } else if (mode == OpConversionMode::Analysis) { // Analysis conversions don't fail if any operations fail to legalize, // they are only interested in the operations that were successfully // legalized. 
- if (config.legalizableOps) + if (config.legalizableOps && !isRecursiveLegalization) config.legalizableOps->insert(op); } return success(); diff --git a/mlir/test/Transforms/test-legalizer-full.mlir b/mlir/test/Transforms/test-legalizer-full.mlir index 42cec68b9fbbb..8da9109a32762 100644 --- a/mlir/test/Transforms/test-legalizer-full.mlir +++ b/mlir/test/Transforms/test-legalizer-full.mlir @@ -72,3 +72,21 @@ builtin.module { } } + +// ----- + +// The region of "test.post_order_legalization" is converted before the op. + +// expected-remark@+1 {{applyFullConversion failed}} +builtin.module { +func.func @test_preorder_legalization() { + // expected-error@+1 {{failed to legalize operation 'test.post_order_legalization'}} + "test.post_order_legalization"() ({ + ^bb0(%arg0: i64): + // Not-explicitly-legal ops are not allowed to survive. + "test.remaining_consumer"(%arg0) : (i64) -> () + "test.invalid"(%arg0) : (i64) -> () + }) : () -> () + return +} +} diff --git a/mlir/test/Transforms/test-legalizer-rollback.mlir b/mlir/test/Transforms/test-legalizer-rollback.mlir index 71e11782e14b0..4bcca6b7e5228 100644 --- a/mlir/test/Transforms/test-legalizer-rollback.mlir +++ b/mlir/test/Transforms/test-legalizer-rollback.mlir @@ -163,3 +163,22 @@ func.func @create_unregistered_op_in_pattern() -> i32 { "test.return"(%0) : (i32) -> () } } + +// ----- + +// CHECK-LABEL: func @test_failed_preorder_legalization +// CHECK: "test.post_order_legalization"() ({ +// CHECK: %[[r:.*]] = "test.illegal_op_g"() : () -> i32 +// CHECK: "test.return"(%[[r]]) : (i32) -> () +// CHECK: }) : () -> () +// expected-remark @+1 {{applyPartialConversion failed}} +module { +func.func @test_failed_preorder_legalization() { + // expected-error @+1 {{failed to legalize operation 'test.post_order_legalization' that was explicitly marked illegal}} + "test.post_order_legalization"() ({ + %0 = "test.illegal_op_g"() : () -> (i32) + "test.return"(%0) : (i32) -> () + }) : () -> () + return +} +} diff --git a/mlir/test/Transforms/test-legalizer.mlir b/mlir/test/Transforms/test-legalizer.mlir index 7c43bb7bface0..88a71cc26ab0c 100644 --- a/mlir/test/Transforms/test-legalizer.mlir +++ b/mlir/test/Transforms/test-legalizer.mlir @@ -448,3 +448,35 @@ func.func @test_working_1to1_pattern(%arg0: f16) { "test.type_consumer"(%arg0) : (f16) -> () "test.return"() : () -> () } + +// ----- + +// The region of "test.post_order_legalization" is converted before the op. + +// CHECK: notifyBlockInserted into test.post_order_legalization: was unlinked +// CHECK: notifyOperationInserted: test.invalid +// CHECK: notifyBlockErased +// CHECK: notifyOperationInserted: test.valid, was unlinked +// CHECK: notifyOperationReplaced: test.invalid +// CHECK: notifyOperationErased: test.invalid +// CHECK: notifyOperationModified: test.post_order_legalization + +// CHECK-LABEL: func @test_preorder_legalization +// CHECK: "test.post_order_legalization"() ({ +// CHECK: ^{{.*}}(%[[arg0:.*]]: f64): +// Note: The survival of a not-explicitly-invalid operation does *not* cause +// a conversion failure when applying a partial conversion.
+// CHECK: %[[cast:.*]] = "test.cast"(%[[arg0]]) : (f64) -> i64 +// CHECK: "test.remaining_consumer"(%[[cast]]) : (i64) -> () +// CHECK: "test.valid"(%[[arg0]]) : (f64) -> () +// CHECK: }) {is_legal} : () -> () +func.func @test_preorder_legalization() { + "test.post_order_legalization"() ({ + ^bb0(%arg0: i64): + // expected-remark @+1 {{'test.remaining_consumer' is not legalizable}} + "test.remaining_consumer"(%arg0) : (i64) -> () + "test.invalid"(%arg0) : (i64) -> () + }) : () -> () + // expected-remark @+1 {{'func.return' is not legalizable}} + return +} diff --git a/mlir/test/lib/Dialect/Test/TestPatterns.cpp b/mlir/test/lib/Dialect/Test/TestPatterns.cpp index 12edecc113495..9b64bc691588d 100644 --- a/mlir/test/lib/Dialect/Test/TestPatterns.cpp +++ b/mlir/test/lib/Dialect/Test/TestPatterns.cpp @@ -1418,6 +1418,22 @@ class TestTypeConsumerOpPattern } }; +class TestPostOrderLegalization : public ConversionPattern { +public: + TestPostOrderLegalization(MLIRContext *ctx, const TypeConverter &converter) + : ConversionPattern(converter, "test.post_order_legalization", 1, ctx) {} + LogicalResult + matchAndRewrite(Operation *op, ArrayRef<ValueRange> operands, + ConversionPatternRewriter &rewriter) const final { + for (Region &r : op->getRegions()) + if (failed(rewriter.legalize(&r))) + return failure(); + rewriter.modifyOpInPlace( + op, [&]() { op->setAttr("is_legal", rewriter.getUnitAttr()); }); + return success(); + } +}; + /// Test unambiguous overload resolution of replaceOpWithMultiple. This /// function is just to trigger compiler errors. It is never executed. [[maybe_unused]] void testReplaceOpWithMultipleOverloads( @@ -1532,7 +1548,8 @@ struct TestLegalizePatternDriver patterns.add<TestDropOpSignatureConversion, TestDropAndReplaceInvalidOp, TestPassthroughInvalidOp, TestMultiple1ToNReplacement, TestValueReplace, TestReplaceWithValidConsumer, - TestTypeConsumerOpPattern>(&getContext(), converter); + TestTypeConsumerOpPattern, TestPostOrderLegalization>( + &getContext(), converter); patterns.add<TestConvertBlockArgs>(converter, &getContext()); mlir::populateAnyFunctionOpInterfaceTypeConversionPattern(patterns, converter); @@ -1560,6 +1577,9 @@ struct TestLegalizePatternDriver target.addDynamicallyLegalOp( OperationName("test.value_replace", &getContext()), [](Operation *op) { return op->hasAttr("is_legal"); }); + target.addDynamicallyLegalOp( + OperationName("test.post_order_legalization", &getContext()), + [](Operation *op) { return op->hasAttr("is_legal"); }); // TestCreateUnregisteredOp creates `arith.constant` operation, // which was not added to target intentionally to test From 6c640b86e6e03298385231cb7e77d2f3524bc643 Mon Sep 17 00:00:00 2001 From: Matthias Springer <me@m-sp.org> Date: Wed, 5 Nov 2025 21:24:59 +0900 Subject: [PATCH 306/313] [mlir][LLVM] Fix unsupported FP lowering in `VectorConvertToLLVMPattern` (#166513) Fixes a bug in `VectorConvertToLLVMPattern`, which converted operations with unsupported FP types. E.g., `arith.addf ... : f4E2M1FN` was lowered to `llvm.fadd ... : i4`, which does not verify. There are a few more patterns that have the same bug. Those will be fixed in follow-up PRs. This commit is in preparation of adding an `APFloat`-based lowering for `arith` operations with unsupported floating-point types. 
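Concretely, the guard added to `VectorConvertToLLVMPattern::matchAndRewrite` boils down to the following check (a condensed sketch of the logic introduced below; `getFloatingPointType` is the helper added by this patch):

  // Bail out when a floating-point operand or result would be converted to a
  // non-floating-point type: the type converter falls back to integers for FP
  // types that LLVM cannot represent (e.g. f4E2M1FN -> i4), and emitting the
  // target op on such a type would produce invalid IR.
  auto checkType = [&](Value v) -> LogicalResult {
    FloatType floatType = getFloatingPointType(v.getType());
    if (!floatType)
      return success();
    Type convertedType = this->getTypeConverter()->convertType(floatType);
    if (!isa_and_nonnull<FloatType>(convertedType))
      return rewriter.notifyMatchFailure(op, "unsupported floating point type");
    return success();
  };
  for (Value operand : op->getOperands())
    if (failed(checkType(operand)))
      return failure();
  if (failed(checkType(op->getResult(0))))
    return failure();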
--- .../Conversion/LLVMCommon/VectorPattern.h | 31 +++++++++++++++++++ .../Conversion/ArithToLLVM/arith-to-llvm.mlir | 26 ++++++++++++++++ 2 files changed, 57 insertions(+) diff --git a/mlir/include/mlir/Conversion/LLVMCommon/VectorPattern.h b/mlir/include/mlir/Conversion/LLVMCommon/VectorPattern.h index 964281592cc65..cad6cec761ab8 100644 --- a/mlir/include/mlir/Conversion/LLVMCommon/VectorPattern.h +++ b/mlir/include/mlir/Conversion/LLVMCommon/VectorPattern.h @@ -92,12 +92,43 @@ class VectorConvertToLLVMPattern : public ConvertOpToLLVMPattern<SourceOp> { using ConvertOpToLLVMPattern<SourceOp>::ConvertOpToLLVMPattern; using Super = VectorConvertToLLVMPattern<SourceOp, TargetOp>; + /// Return the given type if it's a floating point type. If the given type is + /// a vector type, return its element type if it's a floating point type. + static FloatType getFloatingPointType(Type type) { + if (auto floatType = dyn_cast<FloatType>(type)) + return floatType; + if (auto vecType = dyn_cast<VectorType>(type)) + return dyn_cast<FloatType>(vecType.getElementType()); + return nullptr; + } + LogicalResult matchAndRewrite(SourceOp op, typename SourceOp::Adaptor adaptor, ConversionPatternRewriter &rewriter) const override { static_assert( std::is_base_of<OpTrait::OneResult<SourceOp>, SourceOp>::value, "expected single result op"); + + // The pattern should not apply if a floating-point operand is converted to + // a non-floating-point type. This indicates that the floating point type + // is not supported by the LLVM lowering. (Such types are converted to + // integers.) + auto checkType = [&](Value v) -> LogicalResult { + FloatType floatType = getFloatingPointType(v.getType()); + if (!floatType) + return success(); + Type convertedType = this->getTypeConverter()->convertType(floatType); + if (!isa_and_nonnull<FloatType>(convertedType)) + return rewriter.notifyMatchFailure(op, + "unsupported floating point type"); + return success(); + }; + for (Value operand : op->getOperands()) + if (failed(checkType(operand))) + return failure(); + if (failed(checkType(op->getResult(0)))) + return failure(); + // Determine attributes for the target op AttrConvert<SourceOp, TargetOp> attrConvert(op); diff --git a/mlir/test/Conversion/ArithToLLVM/arith-to-llvm.mlir b/mlir/test/Conversion/ArithToLLVM/arith-to-llvm.mlir index ba12ff29ebef9..b5dcb01d3dc6b 100644 --- a/mlir/test/Conversion/ArithToLLVM/arith-to-llvm.mlir +++ b/mlir/test/Conversion/ArithToLLVM/arith-to-llvm.mlir @@ -747,3 +747,29 @@ func.func @memref_bitcast(%1: memref<?xi16>) -> memref<?xbf16> { %2 = arith.bitcast %1 : memref<?xi16> to memref<?xbf16> func.return %2 : memref<?xbf16> } + +// ----- + +// CHECK-LABEL: func @unsupported_fp_type +// CHECK: arith.addf {{.*}} : f4E2M1FN +// CHECK: arith.addf {{.*}} : vector<4xf4E2M1FN> +// CHECK: arith.addf {{.*}} : vector<8x4xf4E2M1FN> +func.func @unsupported_fp_type(%arg0: f4E2M1FN, %arg1: vector<4xf4E2M1FN>, %arg2: vector<8x4xf4E2M1FN>) -> (f4E2M1FN, vector<4xf4E2M1FN>, vector<8x4xf4E2M1FN>) { + %0 = arith.addf %arg0, %arg0 : f4E2M1FN + %1 = arith.addf %arg1, %arg1 : vector<4xf4E2M1FN> + %2 = arith.addf %arg2, %arg2 : vector<8x4xf4E2M1FN> + return %0, %1, %2 : f4E2M1FN, vector<4xf4E2M1FN>, vector<8x4xf4E2M1FN> +} + +// ----- + +// CHECK-LABEL: func @supported_fp_type +// CHECK: llvm.fadd {{.*}} : f32 +// CHECK: llvm.fadd {{.*}} : vector<4xf32> +// CHECK-COUNT-4: llvm.fadd {{.*}} : vector<8xf32> +func.func @supported_fp_type(%arg0: f32, %arg1: vector<4xf32>, %arg2: vector<4x8xf32>) -> (f32, vector<4xf32>, 
vector<4x8xf32>) { + %0 = arith.addf %arg0, %arg0 : f32 + %1 = arith.addf %arg1, %arg1 : vector<4xf32> + %2 = arith.addf %arg2, %arg2 : vector<4x8xf32> + return %0, %1, %2 : f32, vector<4xf32>, vector<4x8xf32> +} From c782ed3440b5a1565428db9731504fd1c4c2a9a9 Mon Sep 17 00:00:00 2001 From: Valery Pykhtin <valery.pykhtin@amd.com> Date: Wed, 5 Nov 2025 13:31:10 +0100 Subject: [PATCH 307/313] [utils][UpdateLLCTestChecks] Add MIR support to update_llc_test_checks.py. (#164965) This change enables update_llc_test_checks.py to automatically generate MIR checks for RUN lines that use `-stop-before` or `-stop-after` flags, allowing tests to verify intermediate compilation stages (e.g., after instruction selection but before peephole optimizations) alongside the final assembly output. If the `-debug-only` flag is present in the run line, it is considered the main point of interest for testing and the stop flags above are ignored (that is, no MIR checks are generated). This resulted from a scenario in which I needed to test two instruction-matching patterns, where the later pattern in the peephole optimizer reverts the earlier pattern from the instruction selector, and to distinguish that from the case where the earlier pattern didn't work at all. Initially created by Claude Sonnet 4.5, it was later improved to handle conflicts in MIR <-> ASM prefixes and formatting. --- .../Inputs/x86_asm_mir_mixed.ll | 17 ++++ .../Inputs/x86_asm_mir_mixed.ll.expected | 45 +++++++++ .../Inputs/x86_asm_mir_same_prefix.ll | 13 +++ .../x86_asm_mir_same_prefix.ll.expected | 16 ++++ .../x86-asm-mir-mixed.test | 9 ++ .../x86-asm-mir-same-prefix.test | 7 ++ llvm/utils/UpdateTestChecks/common.py | 1 + llvm/utils/UpdateTestChecks/mir.py | 11 ++- llvm/utils/update_llc_test_checks.py | 94 +++++++++++++++---- 9 files changed, 194 insertions(+), 19 deletions(-) create mode 100644 llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/x86_asm_mir_mixed.ll create mode 100644 llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/x86_asm_mir_mixed.ll.expected create mode 100644 llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/x86_asm_mir_same_prefix.ll create mode 100644 llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/x86_asm_mir_same_prefix.ll.expected create mode 100644 llvm/test/tools/UpdateTestChecks/update_llc_test_checks/x86-asm-mir-mixed.test create mode 100644 llvm/test/tools/UpdateTestChecks/update_llc_test_checks/x86-asm-mir-same-prefix.test diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/x86_asm_mir_mixed.ll b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/x86_asm_mir_mixed.ll new file mode 100644 index 0000000000000..292637177591f --- /dev/null +++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/x86_asm_mir_mixed.ll @@ -0,0 +1,17 @@ +; RUN: llc -mtriple=x86_64 < %s | FileCheck %s --check-prefix=ASM +; RUN: llc -mtriple=x86_64 -stop-after=finalize-isel < %s | FileCheck %s --check-prefix=MIR + +define i64 @test1(i64 %i) nounwind readnone { + %loc = alloca i64 + %j = load i64, ptr %loc + %r = add i64 %i, %j + ret i64 %r +} + +define i64 @test2(i32 %i) nounwind readnone { + %loc = alloca i32 + %j = load i32, ptr %loc + %r = add i32 %i, %j + %ext = zext i32 %r to i64 + ret i64 %ext +} diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/x86_asm_mir_mixed.ll.expected b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/x86_asm_mir_mixed.ll.expected new file mode 100644 index 0000000000000..88cb03e85204a ---
/dev/null +++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/x86_asm_mir_mixed.ll.expected @@ -0,0 +1,45 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=x86_64 < %s | FileCheck %s --check-prefix=ASM +; RUN: llc -mtriple=x86_64 -stop-after=finalize-isel < %s | FileCheck %s --check-prefix=MIR + +define i64 @test1(i64 %i) nounwind readnone { +; ASM-LABEL: test1: +; ASM: # %bb.0: +; ASM-NEXT: movq %rdi, %rax +; ASM-NEXT: addq -{{[0-9]+}}(%rsp), %rax +; ASM-NEXT: retq +; MIR-LABEL: name: test1 +; MIR: bb.0 (%ir-block.0): +; MIR-NEXT: liveins: $rdi +; MIR-NEXT: {{ $}} +; MIR-NEXT: [[COPY:%[0-9]+]]:gr64 = COPY $rdi +; MIR-NEXT: [[ADD64rm:%[0-9]+]]:gr64 = ADD64rm [[COPY]], %stack.0.loc, 1, $noreg, 0, $noreg, implicit-def dead $eflags :: (dereferenceable load (s64) from %ir.loc) +; MIR-NEXT: $rax = COPY [[ADD64rm]] +; MIR-NEXT: RET 0, $rax + %loc = alloca i64 + %j = load i64, ptr %loc + %r = add i64 %i, %j + ret i64 %r +} + +define i64 @test2(i32 %i) nounwind readnone { +; ASM-LABEL: test2: +; ASM: # %bb.0: +; ASM-NEXT: movl %edi, %eax +; ASM-NEXT: addl -{{[0-9]+}}(%rsp), %eax +; ASM-NEXT: retq +; MIR-LABEL: name: test2 +; MIR: bb.0 (%ir-block.0): +; MIR-NEXT: liveins: $edi +; MIR-NEXT: {{ $}} +; MIR-NEXT: [[COPY:%[0-9]+]]:gr32 = COPY $edi +; MIR-NEXT: [[ADD32rm:%[0-9]+]]:gr32 = ADD32rm [[COPY]], %stack.0.loc, 1, $noreg, 0, $noreg, implicit-def dead $eflags :: (dereferenceable load (s32) from %ir.loc) +; MIR-NEXT: [[SUBREG_TO_REG:%[0-9]+]]:gr64 = SUBREG_TO_REG 0, killed [[ADD32rm]], %subreg.sub_32bit +; MIR-NEXT: $rax = COPY [[SUBREG_TO_REG]] +; MIR-NEXT: RET 0, $rax + %loc = alloca i32 + %j = load i32, ptr %loc + %r = add i32 %i, %j + %ext = zext i32 %r to i64 + ret i64 %ext +} diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/x86_asm_mir_same_prefix.ll b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/x86_asm_mir_same_prefix.ll new file mode 100644 index 0000000000000..7167bcf258e68 --- /dev/null +++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/x86_asm_mir_same_prefix.ll @@ -0,0 +1,13 @@ +; RUN: llc -mtriple=x86_64 < %s | FileCheck %s --check-prefix=CHECK +; RUN: llc -mtriple=x86_64 -stop-after=finalize-isel < %s | FileCheck %s --check-prefix=CHECK + +define i32 @add(i32 %a, i32 %b) { + %sum = add i32 %a, %b + ret i32 %sum +} + +define i32 @sub(i32 %a, i32 %b) { + %diff = sub i32 %a, %b + ret i32 %diff +} + diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/x86_asm_mir_same_prefix.ll.expected b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/x86_asm_mir_same_prefix.ll.expected new file mode 100644 index 0000000000000..1ba920d1de8b0 --- /dev/null +++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/x86_asm_mir_same_prefix.ll.expected @@ -0,0 +1,16 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=x86_64 < %s | FileCheck %s --check-prefix=CHECK +; RUN: llc -mtriple=x86_64 -stop-after=finalize-isel < %s | FileCheck %s --check-prefix=CHECK + +define i32 @add(i32 %a, i32 %b) { + %sum = add i32 %a, %b + ret i32 %sum +} + +define i32 @sub(i32 %a, i32 %b) { + %diff = sub i32 %a, %b + ret i32 %diff +} + +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; CHECK: {{.*}} diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/x86-asm-mir-mixed.test b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/x86-asm-mir-mixed.test new file mode 100644 index 0000000000000..6fc57b583b37d --- /dev/null +++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/x86-asm-mir-mixed.test @@ -0,0 +1,9 @@ +# REQUIRES: x86-registered-target +## Test checking that update_llc_test_checks.py can generate both ASM and MIR checks in the same file + +# RUN: cp -f %S/Inputs/x86_asm_mir_mixed.ll %t.ll && %update_llc_test_checks %t.ll +# RUN: diff -u %S/Inputs/x86_asm_mir_mixed.ll.expected %t.ll + +## Verify that running the script again on an already updated file doesn't add duplicate checks +# RUN: %update_llc_test_checks %t.ll +# RUN: diff -u %S/Inputs/x86_asm_mir_mixed.ll.expected %t.ll diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/x86-asm-mir-same-prefix.test b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/x86-asm-mir-same-prefix.test new file mode 100644 index 0000000000000..bb91a44678f1a --- /dev/null +++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/x86-asm-mir-same-prefix.test @@ -0,0 +1,7 @@ +## Test that using the same prefix for both ASM and MIR outputs generates a warning +## and doesn't produce any checks. + +# RUN: cp -f %S/Inputs/x86_asm_mir_same_prefix.ll %t.ll && %update_llc_test_checks %t.ll 2>&1 | FileCheck %s --check-prefix=WARNING +# RUN: diff -u %S/Inputs/x86_asm_mir_same_prefix.ll.expected %t.ll + +# WARNING: WARNING: The following prefixes are used for both ASM and MIR output, which will cause FileCheck failures: CHECK diff --git a/llvm/utils/UpdateTestChecks/common.py b/llvm/utils/UpdateTestChecks/common.py index 2dad16a8eebb7..baa0377cd8b81 100644 --- a/llvm/utils/UpdateTestChecks/common.py +++ b/llvm/utils/UpdateTestChecks/common.py @@ -605,6 +605,7 @@ def invoke_tool(exe, cmd_args, ir, preprocess_cmd=None, verbose=False): TRIPLE_ARG_RE = re.compile(r"-m?triple[= ]([^ ]+)") MARCH_ARG_RE = re.compile(r"-march[= ]([^ ]+)") DEBUG_ONLY_ARG_RE = re.compile(r"-debug-only[= ]([^ ]+)") +STOP_PASS_RE = re.compile(r"-stop-(before|after)=(\w+)") IS_DEBUG_RECORD_RE = re.compile(r"^(\s+)#dbg_") IS_SWITCH_CASE_RE = re.compile(r"^\s+i\d+ \d+, label %\S+") diff --git a/llvm/utils/UpdateTestChecks/mir.py b/llvm/utils/UpdateTestChecks/mir.py index 24bb8b341d335..01ee0e19f7cb9 100644 --- a/llvm/utils/UpdateTestChecks/mir.py +++ b/llvm/utils/UpdateTestChecks/mir.py @@ -163,13 +163,15 @@ def add_mir_checks_for_function( print_fixed_stack, first_check_is_next, at_the_function_name, + check_indent=None, ): printed_prefixes = set() for run in run_list: for prefix in run[0]: if prefix in printed_prefixes: break - if not func_dict[prefix][func_name]: + # func_info can be empty if there was a prefix conflict. + if not func_dict[prefix].get(func_name): continue if printed_prefixes: # Add some space between different check prefixes. 
@@ -185,6 +187,7 @@ def add_mir_checks_for_function( func_dict[prefix][func_name], print_fixed_stack, first_check_is_next, + check_indent, ) break else: @@ -204,6 +207,7 @@ def add_mir_check_lines( func_info, print_fixed_stack, first_check_is_next, + check_indent=None, ): func_body = str(func_info).splitlines() if single_bb: @@ -220,7 +224,10 @@ def add_mir_check_lines( first_line = func_body[0] indent = len(first_line) - len(first_line.lstrip(" ")) # A check comment, indented the appropriate amount - check = "{:>{}}; {}".format("", indent, prefix) + if check_indent is not None: + check = "{}; {}".format(check_indent, prefix) + else: + check = "{:>{}}; {}".format("", indent, prefix) output_lines.append("{}-LABEL: name: {}".format(check, func_name)) diff --git a/llvm/utils/update_llc_test_checks.py b/llvm/utils/update_llc_test_checks.py index 8c57e75f34f75..98864be62875b 100755 --- a/llvm/utils/update_llc_test_checks.py +++ b/llvm/utils/update_llc_test_checks.py @@ -15,7 +15,7 @@ import os # Used to advertise this file's name ("autogenerated_note"). import sys -from UpdateTestChecks import common +from UpdateTestChecks import common, mir # llc is the only llc-like in the LLVM tree but downstream forks can add # additional ones here if they have them. @@ -33,6 +33,7 @@ def update_test(ti: common.TestInfo): break run_list = [] + mir_run_list = [] for l in ti.run_lines: if "|" not in l: common.warn("Skipping unparsable RUN line: " + l) @@ -57,9 +58,14 @@ def update_test(ti: common.TestInfo): if m: march_in_cmd = m.groups()[0] + target_list = run_list m = common.DEBUG_ONLY_ARG_RE.search(llc_cmd) if m and m.groups()[0] == "isel": from UpdateTestChecks import isel as output_type + elif not m and common.STOP_PASS_RE.search(llc_cmd): + # MIR output mode. If -debug-only is present assume + # the debug output is the main point of interest. + target_list = mir_run_list else: from UpdateTestChecks import asm as output_type @@ -84,7 +90,7 @@ def update_test(ti: common.TestInfo): # FIXME: We should use multiple check prefixes to common check lines. For # now, we just ignore all but the last. - run_list.append( + target_list.append( ( check_prefixes, llc_tool, @@ -119,14 +125,20 @@ def update_test(ti: common.TestInfo): ginfo=ginfo, ) - for ( - prefixes, - llc_tool, - llc_args, - preprocess_cmd, - triple_in_cmd, - march_in_cmd, - ) in run_list: + # Dictionary to store MIR function bodies separately + mir_func_dict = {} + for run_tuple, is_mir in [(run, False) for run in run_list] + [ + (run, True) for run in mir_run_list + ]: + ( + prefixes, + llc_tool, + llc_args, + preprocess_cmd, + triple_in_cmd, + march_in_cmd, + ) = run_tuple + common.debug("Extracted LLC cmd:", llc_tool, llc_args) common.debug("Extracted FileCheck prefixes:", str(prefixes)) @@ -141,22 +153,54 @@ def update_test(ti: common.TestInfo): if not triple: triple = common.get_triple_from_march(march_in_cmd) - scrubber, function_re = output_type.get_run_handler(triple) - if 0 == builder.process_run_line( - function_re, scrubber, raw_tool_output, prefixes - ): - common.warn( - "Couldn't match any function. 
Possibly the wrong target triple has been provided" + if is_mir: + # MIR output mode + common.debug("Detected MIR output mode for prefixes:", str(prefixes)) + for prefix in prefixes: + if prefix not in mir_func_dict: + mir_func_dict[prefix] = {} + + mir.build_function_info_dictionary( + ti.path, + raw_tool_output, + triple, + prefixes, + mir_func_dict, + ti.args.verbose, ) - builder.processed_prefixes(prefixes) + else: + # ASM output mode + scrubber, function_re = output_type.get_run_handler(triple) + if 0 == builder.process_run_line( + function_re, scrubber, raw_tool_output, prefixes + ): + common.warn( + "Couldn't match any function. Possibly the wrong target triple has been provided" + ) + builder.processed_prefixes(prefixes) func_dict = builder.finish_and_get_func_dict() + + # Check for conflicts: same prefix used for both ASM and MIR + conflicting_prefixes = set(func_dict.keys()) & set(mir_func_dict.keys()) + if conflicting_prefixes: + common.warn( + "The following prefixes are used for both ASM and MIR output, which will cause FileCheck failures: {}".format( + ", ".join(sorted(conflicting_prefixes)) + ), + test_file=ti.path, + ) + for prefix in conflicting_prefixes: + mir_func_dict[prefix] = {} + func_dict[prefix] = {} + global_vars_seen_dict = {} is_in_function = False is_in_function_start = False func_name = None prefix_set = set([prefix for p in run_list for prefix in p[0]]) + prefix_set.update([prefix for p in mir_run_list for prefix in p[0]]) common.debug("Rewriting FileCheck prefixes:", str(prefix_set)) output_lines = [] @@ -221,6 +265,22 @@ def update_test(ti: common.TestInfo): is_filtered=builder.is_filtered(), ) ) + + # Also add MIR checks if we have them for this function + if mir_run_list and func_name: + mir.add_mir_checks_for_function( + ti.path, + output_lines, + mir_run_list, + mir_func_dict, + func_name, + single_bb=False, # Don't skip basic block labels. + print_fixed_stack=False, # Don't print fixed stack (ASM tests don't need it). + first_check_is_next=False, # First check is LABEL, not NEXT. + at_the_function_name=False, # Use "name:" not "@name". + check_indent="", # No indentation for IR files (not MIR files). 
+ ) + is_in_function_start = False if is_in_function: From a4105707eeaf53c13c2f09298d762995267c7717 Mon Sep 17 00:00:00 2001 From: hev <wangrui@loongson.cn> Date: Wed, 5 Nov 2025 20:36:38 +0800 Subject: [PATCH 308/313] [llvm][LoongArch] Introduce LASX and LSX conversion intrinsics (#157818) This patch introduces the LASX and LSX conversion intrinsics: - <8 x float> @llvm.loongarch.lasx.cast.128.s(<4 x float>) - <4 x double> @llvm.loongarch.lasx.cast.128.d(<2 x double>) - <4 x i64> @llvm.loongarch.lasx.cast.128(<2 x i64>) - <8 x float> @llvm.loongarch.lasx.concat.128.s(<4 x float>, <4 x float>) - <4 x double> @llvm.loongarch.lasx.concat.128.d(<2 x double>, <2 x double>) - <4 x i64> @llvm.loongarch.lasx.concat.128(<2 x i64>, <2 x i64>) - <4 x float> @llvm.loongarch.lasx.extract.128.lo.s(<8 x float>) - <2 x double> @llvm.loongarch.lasx.extract.128.lo.d(<4 x double>) - <2 x i64> @llvm.loongarch.lasx.extract.128.lo(<4 x i64>) - <4 x float> @llvm.loongarch.lasx.extract.128.hi.s(<8 x float>) - <2 x double> @llvm.loongarch.lasx.extract.128.hi.d(<4 x double>) - <2 x i64> @llvm.loongarch.lasx.extract.128.hi(<4 x i64>) - <8 x float> @llvm.loongarch.lasx.insert.128.lo.s(<8 x float>, <4 x float>) - <4 x double> @llvm.loongarch.lasx.insert.128.lo.d(<4 x double>, <2 x double>) - <4 x i64> @llvm.loongarch.lasx.insert.128.lo(<4 x i64>, <2 x i64>) - <8 x float> @llvm.loongarch.lasx.insert.128.hi.s(<8 x float>, <4 x float>) - <4 x double> @llvm.loongarch.lasx.insert.128.hi.d(<4 x double>, <2 x double>) - <4 x i64> @llvm.loongarch.lasx.insert.128.hi(<4 x i64>, <2 x i64>) --- llvm/include/llvm/IR/IntrinsicsLoongArch.td | 38 +++ .../LoongArch/LoongArchISelLowering.cpp | 5 + .../LoongArch/LoongArchLASXInstrInfo.td | 31 ++ .../LoongArch/lasx/intrinsic-conversion.ll | 303 ++++++++++++++++++ 4 files changed, 377 insertions(+) create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-conversion.ll diff --git a/llvm/include/llvm/IR/IntrinsicsLoongArch.td b/llvm/include/llvm/IR/IntrinsicsLoongArch.td index 84026aa9d3624..1c46965d995fe 100644 --- a/llvm/include/llvm/IR/IntrinsicsLoongArch.td +++ b/llvm/include/llvm/IR/IntrinsicsLoongArch.td @@ -1192,4 +1192,42 @@ def int_loongarch_lasx_xvstelm_w def int_loongarch_lasx_xvstelm_d : VecInt<[], [llvm_v4i64_ty, llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty], [IntrWriteMem, IntrArgMemOnly, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>]>; + +// LASX and LSX conversion +def int_loongarch_lasx_cast_128_s + : VecInt<[llvm_v8f32_ty], [llvm_v4f32_ty], [IntrNoMem]>; +def int_loongarch_lasx_cast_128_d + : VecInt<[llvm_v4f64_ty], [llvm_v2f64_ty], [IntrNoMem]>; +def int_loongarch_lasx_cast_128 + : VecInt<[llvm_v4i64_ty], [llvm_v2i64_ty], [IntrNoMem]>; +def int_loongarch_lasx_concat_128_s + : VecInt<[llvm_v8f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>; +def int_loongarch_lasx_concat_128_d + : VecInt<[llvm_v4f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty], [IntrNoMem]>; +def int_loongarch_lasx_concat_128 + : VecInt<[llvm_v4i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem]>; +def int_loongarch_lasx_extract_128_lo_s + : VecInt<[llvm_v4f32_ty], [llvm_v8f32_ty], [IntrNoMem]>; +def int_loongarch_lasx_extract_128_lo_d + : VecInt<[llvm_v2f64_ty], [llvm_v4f64_ty], [IntrNoMem]>; +def int_loongarch_lasx_extract_128_lo + : VecInt<[llvm_v2i64_ty], [llvm_v4i64_ty], [IntrNoMem]>; +def int_loongarch_lasx_extract_128_hi_s + : VecInt<[llvm_v4f32_ty], [llvm_v8f32_ty], [IntrNoMem]>; +def int_loongarch_lasx_extract_128_hi_d + : VecInt<[llvm_v2f64_ty], [llvm_v4f64_ty], [IntrNoMem]>; +def 
int_loongarch_lasx_extract_128_hi + : VecInt<[llvm_v2i64_ty], [llvm_v4i64_ty], [IntrNoMem]>; +def int_loongarch_lasx_insert_128_lo_s + : VecInt<[llvm_v8f32_ty], [llvm_v8f32_ty, llvm_v4f32_ty], [IntrNoMem]>; +def int_loongarch_lasx_insert_128_lo_d + : VecInt<[llvm_v4f64_ty], [llvm_v4f64_ty, llvm_v2f64_ty], [IntrNoMem]>; +def int_loongarch_lasx_insert_128_lo + : VecInt<[llvm_v4i64_ty], [llvm_v4i64_ty, llvm_v2i64_ty], [IntrNoMem]>; +def int_loongarch_lasx_insert_128_hi_s + : VecInt<[llvm_v8f32_ty], [llvm_v8f32_ty, llvm_v4f32_ty], [IntrNoMem]>; +def int_loongarch_lasx_insert_128_hi_d + : VecInt<[llvm_v4f64_ty], [llvm_v4f64_ty, llvm_v2f64_ty], [IntrNoMem]>; +def int_loongarch_lasx_insert_128_hi + : VecInt<[llvm_v4i64_ty], [llvm_v4i64_ty, llvm_v2i64_ty], [IntrNoMem]>; } // TargetPrefix = "loongarch" diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp index fe700e17d341b..cf4ffc82f6009 100644 --- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp @@ -6630,6 +6630,11 @@ performINTRINSIC_WO_CHAINCombine(SDNode *N, SelectionDAG &DAG, return DAG.getNode(LoongArchISD::VANY_NONZERO, DL, N->getValueType(0), N->getOperand(1)); break; + case Intrinsic::loongarch_lasx_concat_128_s: + case Intrinsic::loongarch_lasx_concat_128_d: + case Intrinsic::loongarch_lasx_concat_128: + return DAG.getNode(ISD::CONCAT_VECTORS, DL, N->getValueType(0), + N->getOperand(1), N->getOperand(2)); } return SDValue(); } diff --git a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td index b502b056c4cdf..00d52870f1727 100644 --- a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td +++ b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td @@ -2113,6 +2113,37 @@ defm : subvector_subreg_lowering<LSX128, v2f64, LASX256, v4f64, 2, sub_128>; defm : subvector_subreg_lowering<LSX128, v8i16, LASX256, v16i16, 8, sub_128>; defm : subvector_subreg_lowering<LSX128, v16i8, LASX256, v32i8, 16, sub_128>; +// LASX and LSX conversion +def : Pat<(int_loongarch_lasx_cast_128_s (v4f32 LSX128:$src)), + (INSERT_SUBREG (IMPLICIT_DEF), LSX128:$src, sub_128)>; +def : Pat<(int_loongarch_lasx_cast_128_d (v2f64 LSX128:$src)), + (INSERT_SUBREG (IMPLICIT_DEF), LSX128:$src, sub_128)>; +def : Pat<(int_loongarch_lasx_cast_128 (v2i64 LSX128:$src)), + (INSERT_SUBREG (IMPLICIT_DEF), LSX128:$src, sub_128)>; +def : Pat<(int_loongarch_lasx_extract_128_lo_s (v8f32 LASX256:$src)), + (EXTRACT_SUBREG LASX256:$src, sub_128)>; +def : Pat<(int_loongarch_lasx_extract_128_lo_d (v4f64 LASX256:$src)), + (EXTRACT_SUBREG LASX256:$src, sub_128)>; +def : Pat<(int_loongarch_lasx_extract_128_lo (v4i64 LASX256:$src)), + (EXTRACT_SUBREG LASX256:$src, sub_128)>; +def : Pat<(int_loongarch_lasx_extract_128_hi_s (v8f32 LASX256:$src)), + (EXTRACT_SUBREG (XVPERMI_Q (IMPLICIT_DEF), LASX256:$src, 1), sub_128)>; +def : Pat<(int_loongarch_lasx_extract_128_hi_d (v4f64 LASX256:$src)), + (EXTRACT_SUBREG (XVPERMI_Q (IMPLICIT_DEF), LASX256:$src, 1), sub_128)>; +def : Pat<(int_loongarch_lasx_extract_128_hi (v4i64 LASX256:$src)), + (EXTRACT_SUBREG (XVPERMI_Q (IMPLICIT_DEF), LASX256:$src, 1), sub_128)>; +def : Pat<(int_loongarch_lasx_insert_128_lo_s (v8f32 LASX256:$src), (v4f32 LSX128:$lo)), + (XVPERMI_Q LASX256:$src, (INSERT_SUBREG (IMPLICIT_DEF), LSX128:$lo, sub_128), 48)>; +def : Pat<(int_loongarch_lasx_insert_128_lo_d (v4f64 LASX256:$src), (v2f64 LSX128:$lo)), + (XVPERMI_Q LASX256:$src, (INSERT_SUBREG (IMPLICIT_DEF), 
LSX128:$lo, sub_128), 48)>; +def : Pat<(int_loongarch_lasx_insert_128_lo (v4i64 LASX256:$src), (v2i64 LSX128:$lo)), + (XVPERMI_Q LASX256:$src, (INSERT_SUBREG (IMPLICIT_DEF), LSX128:$lo, sub_128), 48)>; +def : Pat<(int_loongarch_lasx_insert_128_hi_s (v8f32 LASX256:$src), (v4f32 LSX128:$lo)), + (XVPERMI_Q LASX256:$src, (INSERT_SUBREG (IMPLICIT_DEF), LSX128:$lo, sub_128), 2)>; +def : Pat<(int_loongarch_lasx_insert_128_hi_d (v4f64 LASX256:$src), (v2f64 LSX128:$lo)), + (XVPERMI_Q LASX256:$src, (INSERT_SUBREG (IMPLICIT_DEF), LSX128:$lo, sub_128), 2)>; +def : Pat<(int_loongarch_lasx_insert_128_hi (v4i64 LASX256:$src), (v2i64 LSX128:$lo)), + (XVPERMI_Q LASX256:$src, (INSERT_SUBREG (IMPLICIT_DEF), LSX128:$lo, sub_128), 2)>; } // Predicates = [HasExtLASX] /// Intrinsic pattern diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-conversion.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-conversion.ll new file mode 100644 index 0000000000000..006713ccabf47 --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-conversion.ll @@ -0,0 +1,303 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lasx < %s | FileCheck %s +; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s + +declare <8 x float> @llvm.loongarch.lasx.cast.128.s(<4 x float>) + +define void @lasx_cast_128_s(ptr %vd, ptr %va) { +; CHECK-LABEL: lasx_cast_128_s: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %a = load <4 x float>, ptr %va + %b = call <8 x float> @llvm.loongarch.lasx.cast.128.s(<4 x float> %a) + store <8 x float> %b, ptr %vd + ret void +} + +declare <4 x double> @llvm.loongarch.lasx.cast.128.d(<2 x double>) + +define void @lasx_cast_128_d(ptr %vd, ptr %va) { +; CHECK-LABEL: lasx_cast_128_d: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %a = load <2 x double>, ptr %va + %b = call <4 x double> @llvm.loongarch.lasx.cast.128.d(<2 x double> %a) + store <4 x double> %b, ptr %vd + ret void +} + +declare <4 x i64> @llvm.loongarch.lasx.cast.128(<2 x i64>) + +define void @lasx_cast_128(ptr %vd, ptr %va) { +; CHECK-LABEL: lasx_cast_128: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %a = load <2 x i64>, ptr %va + %b = call <4 x i64> @llvm.loongarch.lasx.cast.128(<2 x i64> %a) + store <4 x i64> %b, ptr %vd + ret void +} + +declare <8 x float> @llvm.loongarch.lasx.concat.128.s(<4 x float>, <4 x float>) + +define void @lasx_concat_128_s(ptr %vd, ptr %va, ptr %vb) { +; CHECK-LABEL: lasx_concat_128_s: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %a = load <4 x float>, ptr %va + %b = load <4 x float>, ptr %vb + %c = call <8 x float> @llvm.loongarch.lasx.concat.128.s(<4 x float> %a, <4 x float> %b) + store <8 x float> %c, ptr %vd + ret void +} + +declare <4 x double> @llvm.loongarch.lasx.concat.128.d(<2 x double>, <2 x double>) + +define void @lasx_concat_128_d(ptr %vd, ptr %va, ptr %vb) { +; CHECK-LABEL: lasx_concat_128_d: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %a = load <2 x double>, ptr %va + %b = load <2 x double>, ptr %vb + 
%c = call <4 x double> @llvm.loongarch.lasx.concat.128.d(<2 x double> %a, <2 x double> %b) + store <4 x double> %c, ptr %vd + ret void +} + +declare <4 x i64> @llvm.loongarch.lasx.concat.128(<2 x i64>, <2 x i64>) + +define void @lasx_concat_128(ptr %vd, ptr %va, ptr %vb) { +; CHECK-LABEL: lasx_concat_128: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %a = load <2 x i64>, ptr %va + %b = load <2 x i64>, ptr %vb + %c = call <4 x i64> @llvm.loongarch.lasx.concat.128(<2 x i64> %a, <2 x i64> %b) + store <4 x i64> %c, ptr %vd + ret void +} + +declare <4 x float> @llvm.loongarch.lasx.extract.128.lo.s(<8 x float>) + +define void @lasx_extract_128_lo_s(ptr %vd, ptr %va) { +; CHECK-LABEL: lasx_extract_128_lo_s: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %a = load <8 x float>, ptr %va + %c = call <4 x float> @llvm.loongarch.lasx.extract.128.lo.s(<8 x float> %a) + store <4 x float> %c, ptr %vd + ret void +} + +declare <2 x double> @llvm.loongarch.lasx.extract.128.lo.d(<4 x double>) + +define void @lasx_extract_128_lo_d(ptr %vd, ptr %va) { +; CHECK-LABEL: lasx_extract_128_lo_d: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %a = load <4 x double>, ptr %va + %c = call <2 x double> @llvm.loongarch.lasx.extract.128.lo.d(<4 x double> %a) + store <2 x double> %c, ptr %vd + ret void +} + +declare <2 x i64> @llvm.loongarch.lasx.extract.128.lo(<4 x i64>) + +define void @lasx_extract_128_lo(ptr %vd, ptr %va) { +; CHECK-LABEL: lasx_extract_128_lo: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %a = load <4 x i64>, ptr %va + %c = call <2 x i64> @llvm.loongarch.lasx.extract.128.lo(<4 x i64> %a) + store <2 x i64> %c, ptr %vd + ret void +} + +declare <4 x float> @llvm.loongarch.lasx.extract.128.hi.s(<8 x float>) + +define void @lasx_extract_128_hi_s(ptr %vd, ptr %va) { +; CHECK-LABEL: lasx_extract_128_hi_s: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvpermi.q $xr0, $xr0, 1 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %a = load <8 x float>, ptr %va + %c = call <4 x float> @llvm.loongarch.lasx.extract.128.hi.s(<8 x float> %a) + store <4 x float> %c, ptr %vd + ret void +} + +declare <2 x double> @llvm.loongarch.lasx.extract.128.hi.d(<4 x double>) + +define void @lasx_extract_128_hi_d(ptr %vd, ptr %va) { +; CHECK-LABEL: lasx_extract_128_hi_d: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvpermi.q $xr0, $xr0, 1 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %a = load <4 x double>, ptr %va + %c = call <2 x double> @llvm.loongarch.lasx.extract.128.hi.d(<4 x double> %a) + store <2 x double> %c, ptr %vd + ret void +} + +declare <2 x i64> @llvm.loongarch.lasx.extract.128.hi(<4 x i64>) + +define void @lasx_extract_128_hi(ptr %vd, ptr %va) { +; CHECK-LABEL: lasx_extract_128_hi: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvpermi.q $xr0, $xr0, 1 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %a = load <4 x i64>, ptr %va + %c = call <2 x i64> @llvm.loongarch.lasx.extract.128.hi(<4 x i64> %a) + store <2 x i64> %c, ptr %vd + ret void +} + +declare <8 x float> @llvm.loongarch.lasx.insert.128.lo.s(<8 x float>, <4 x float>) + 
+define void @lasx_insert_128_lo_s(ptr %vd, ptr %va, ptr %vb) { +; CHECK-LABEL: lasx_insert_128_lo_s: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: xvpermi.q $xr0, $xr1, 48 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %a = load <8 x float>, ptr %va + %b = load <4 x float>, ptr %vb + %c = call <8 x float> @llvm.loongarch.lasx.insert.128.lo.s(<8 x float> %a, <4 x float> %b) + store <8 x float> %c, ptr %vd + ret void +} + +declare <4 x double> @llvm.loongarch.lasx.insert.128.lo.d(<4 x double>, <2 x double>) + +define void @lasx_insert_128_lo_d(ptr %vd, ptr %va, ptr %vb) { +; CHECK-LABEL: lasx_insert_128_lo_d: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: xvpermi.q $xr0, $xr1, 48 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %a = load <4 x double>, ptr %va + %b = load <2 x double>, ptr %vb + %c = call <4 x double> @llvm.loongarch.lasx.insert.128.lo.d(<4 x double> %a, <2 x double> %b) + store <4 x double> %c, ptr %vd + ret void +} + +declare <4 x i64> @llvm.loongarch.lasx.insert.128.lo(<4 x i64>, <2 x i64>) + +define void @lasx_insert_128_lo(ptr %vd, ptr %va, ptr %vb) { +; CHECK-LABEL: lasx_insert_128_lo: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: xvpermi.q $xr0, $xr1, 48 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %a = load <4 x i64>, ptr %va + %b = load <2 x i64>, ptr %vb + %c = call <4 x i64> @llvm.loongarch.lasx.insert.128.lo(<4 x i64> %a, <2 x i64> %b) + store <4 x i64> %c, ptr %vd + ret void +} + +declare <8 x float> @llvm.loongarch.lasx.insert.128.hi.s(<8 x float>, <4 x float>) + +define void @lasx_insert_128_hi_s(ptr %vd, ptr %va, ptr %vb) { +; CHECK-LABEL: lasx_insert_128_hi_s: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %a = load <8 x float>, ptr %va + %b = load <4 x float>, ptr %vb + %c = call <8 x float> @llvm.loongarch.lasx.insert.128.hi.s(<8 x float> %a, <4 x float> %b) + store <8 x float> %c, ptr %vd + ret void +} + +declare <4 x double> @llvm.loongarch.lasx.insert.128.hi.d(<4 x double>, <2 x double>) + +define void @lasx_insert_128_hi_d(ptr %vd, ptr %va, ptr %vb) { +; CHECK-LABEL: lasx_insert_128_hi_d: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %a = load <4 x double>, ptr %va + %b = load <2 x double>, ptr %vb + %c = call <4 x double> @llvm.loongarch.lasx.insert.128.hi.d(<4 x double> %a, <2 x double> %b) + store <4 x double> %c, ptr %vd + ret void +} + +declare <4 x i64> @llvm.loongarch.lasx.insert.128.hi(<4 x i64>, <2 x i64>) + +define void @lasx_insert_128_hi(ptr %vd, ptr %va, ptr %vb) { +; CHECK-LABEL: lasx_insert_128_hi: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %a = load <4 x i64>, ptr %va + %b = load <2 x i64>, ptr %vb + %c = call <4 x i64> @llvm.loongarch.lasx.insert.128.hi(<4 x i64> %a, <2 x i64> %b) + store <4 x i64> %c, ptr %vd + ret void +} From 7844829184f16feaf5ba6f8fbb23308abd03a0ba Mon Sep 17 00:00:00 2001 From: Debadri Basak <debadribasak@google.com> Date: Wed, 5 Nov 2025 12:38:30 
+0000 Subject: [PATCH 309/313] Adding the missing origin count logic to AnalysisBasedWarnings --- .../Analyses/LifetimeSafety/LifetimeSafety.h | 16 ++++------ .../Analyses/LifetimeSafety/Origins.h | 7 +++-- .../clang/Sema/AnalysisBasedWarnings.h | 9 ++++++ .../LifetimeSafety/LifetimeSafety.cpp | 27 +++++------------ clang/lib/Analysis/LifetimeSafety/Origins.cpp | 16 +++++++--- clang/lib/Sema/AnalysisBasedWarnings.cpp | 30 +++++++++++++++++-- 6 files changed, 67 insertions(+), 38 deletions(-) diff --git a/clang/include/clang/Analysis/Analyses/LifetimeSafety/LifetimeSafety.h b/clang/include/clang/Analysis/Analyses/LifetimeSafety/LifetimeSafety.h index 7490df90a3282..eb532bc8be3a7 100644 --- a/clang/include/clang/Analysis/Analyses/LifetimeSafety/LifetimeSafety.h +++ b/clang/include/clang/Analysis/Analyses/LifetimeSafety/LifetimeSafety.h @@ -48,10 +48,6 @@ class LifetimeSafetyReporter { Confidence Confidence) {} }; -/// The main entry point for the analysis. -void runLifetimeSafetyAnalysis(AnalysisDeclContext &AC, - LifetimeSafetyReporter *Reporter); - namespace internal { /// An object to hold the factories for immutable collections, ensuring /// that all created states share the same underlying memory management. @@ -64,8 +60,6 @@ struct LifetimeFactory { /// Running the lifetime safety analysis and querying its results. It /// encapsulates the various dataflow analyses. class LifetimeSafetyAnalysis { -private: - static llvm::StringMap<int> MissingOriginCount; public: LifetimeSafetyAnalysis(AnalysisDeclContext &AC, @@ -80,10 +74,6 @@ class LifetimeSafetyAnalysis { LiveOriginsAnalysis &getLiveOrigins() const { return *LiveOrigins; } FactManager &getFactManager() { return FactMgr; } - static void PrintStats(llvm::raw_ostream &OS); - - static void UpdateMissingOriginCount(const OriginManager &OM); - private: AnalysisDeclContext &AC; LifetimeSafetyReporter *Reporter; @@ -93,6 +83,12 @@ class LifetimeSafetyAnalysis { std::unique_ptr<LoanPropagationAnalysis> LoanPropagation; }; } // namespace internal + +/// The main entry point for the analysis. +std::unique_ptr<internal::LifetimeSafetyAnalysis> +runLifetimeSafetyAnalysis(AnalysisDeclContext &AC, + LifetimeSafetyReporter *Reporter); + } // namespace clang::lifetimes #endif // LLVM_CLANG_ANALYSIS_ANALYSES_LIFETIMESAFETY_H diff --git a/clang/include/clang/Analysis/Analyses/LifetimeSafety/Origins.h b/clang/include/clang/Analysis/Analyses/LifetimeSafety/Origins.h index 231cc60b7e097..79a345a591adc 100644 --- a/clang/include/clang/Analysis/Analyses/LifetimeSafety/Origins.h +++ b/clang/include/clang/Analysis/Analyses/LifetimeSafety/Origins.h @@ -79,7 +79,10 @@ class OriginManager { void dump(OriginID OID, llvm::raw_ostream &OS) const; - const llvm::StringMap<int> getMissingOrigins() const; + const llvm::StringMap<unsigned> getMissingOrigins() const; + + // Utility function to check if an origin is missing for a given expression. 
+ bool isOriginMissing(const Expr &E) const; private: OriginID getNextOriginID() { return NextOriginID++; } @@ -90,7 +93,7 @@ class OriginManager { llvm::SmallVector<Origin> AllOrigins; llvm::DenseMap<const clang::ValueDecl *, OriginID> DeclToOriginID; llvm::DenseMap<const clang::Expr *, OriginID> ExprToOriginID; - llvm::StringMap<int> ExprTypeToMissingOriginCount; + llvm::StringMap<unsigned> ExprTypeToMissingOriginCount; }; } // namespace clang::lifetimes::internal diff --git a/clang/include/clang/Sema/AnalysisBasedWarnings.h b/clang/include/clang/Sema/AnalysisBasedWarnings.h index 4103c3f006a8f..604039ef61cb7 100644 --- a/clang/include/clang/Sema/AnalysisBasedWarnings.h +++ b/clang/include/clang/Sema/AnalysisBasedWarnings.h @@ -14,7 +14,10 @@ #define LLVM_CLANG_SEMA_ANALYSISBASEDWARNINGS_H #include "clang/AST/Decl.h" +#include "clang/Analysis/Analyses/LifetimeSafety/Facts.h" +#include "clang/Analysis/AnalysisDeclContext.h" #include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/StringMap.h" #include <memory> namespace clang { @@ -95,6 +98,9 @@ class AnalysisBasedWarnings { /// a single function. unsigned MaxUninitAnalysisBlockVisitsPerFunction; + /// Map from expressions missing origin in OriginManager to their counts. + llvm::StringMap<unsigned> MissingOriginCount; + /// @} public: @@ -116,6 +122,9 @@ class AnalysisBasedWarnings { Policy &getPolicyOverrides() { return PolicyOverrides; } void PrintStats() const; + + void FindMissingOrigins(AnalysisDeclContext &AC, + clang::lifetimes::internal::FactManager &FactMgr); }; } // namespace sema diff --git a/clang/lib/Analysis/LifetimeSafety/LifetimeSafety.cpp b/clang/lib/Analysis/LifetimeSafety/LifetimeSafety.cpp index 828c08d1cbeed..d183ce976f946 100644 --- a/clang/lib/Analysis/LifetimeSafety/LifetimeSafety.cpp +++ b/clang/lib/Analysis/LifetimeSafety/LifetimeSafety.cpp @@ -33,25 +33,10 @@ namespace clang::lifetimes { namespace internal { -llvm::StringMap<int> LifetimeSafetyAnalysis::MissingOriginCount; - LifetimeSafetyAnalysis::LifetimeSafetyAnalysis(AnalysisDeclContext &AC, LifetimeSafetyReporter *Reporter) : AC(AC), Reporter(Reporter) {} -void LifetimeSafetyAnalysis::PrintStats(llvm::raw_ostream &OS) { - llvm::errs() << "\n*** LifetimeSafety Missing Origin Stats " - "(expression_type : count) :\n"; - for (const auto &[expr, count] : LifetimeSafetyAnalysis::MissingOriginCount) { - OS << expr << " : " << count << '\n'; - } - } - -void LifetimeSafetyAnalysis::UpdateMissingOriginCount(const OriginManager &OM) { - for (const auto &[expr, missing_origin_count] : OM.getMissingOrigins()) { - LifetimeSafetyAnalysis::MissingOriginCount[std::string(expr)] += missing_origin_count; - } - } void LifetimeSafetyAnalysis::run() { llvm::TimeTraceScope TimeProfile("LifetimeSafetyAnalysis"); @@ -83,13 +68,15 @@ void LifetimeSafetyAnalysis::run() { LiveOrigins->dump(llvm::dbgs(), FactMgr.getTestPoints())); runLifetimeChecker(*LoanPropagation, *LiveOrigins, FactMgr, AC, Reporter); - UpdateMissingOriginCount(FactMgr.getOriginMgr()); } } // namespace internal -void runLifetimeSafetyAnalysis(AnalysisDeclContext &AC, - LifetimeSafetyReporter *Reporter) { - internal::LifetimeSafetyAnalysis Analysis(AC, Reporter); - Analysis.run(); +std::unique_ptr<internal::LifetimeSafetyAnalysis> +runLifetimeSafetyAnalysis(AnalysisDeclContext &AC, + LifetimeSafetyReporter *Reporter) { + std::unique_ptr<internal::LifetimeSafetyAnalysis> Analysis = + std::make_unique<internal::LifetimeSafetyAnalysis>(AC, Reporter); + Analysis->run(); + return Analysis; } } // namespace clang::lifetimes 
diff --git a/clang/lib/Analysis/LifetimeSafety/Origins.cpp b/clang/lib/Analysis/LifetimeSafety/Origins.cpp index abe067a829cb7..9e314d984b74f 100644 --- a/clang/lib/Analysis/LifetimeSafety/Origins.cpp +++ b/clang/lib/Analysis/LifetimeSafety/Origins.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "clang/Analysis/Analyses/LifetimeSafety/Origins.h" +#include "clang/AST/Expr.h" #include "clang/AST/TypeBase.h" #include "llvm/ADT/StringMap.h" @@ -24,7 +25,7 @@ void OriginManager::dump(OriginID OID, llvm::raw_ostream &OS) const { OS << ")"; } -const llvm::StringMap<int> OriginManager::getMissingOrigins() const { +const llvm::StringMap<unsigned> OriginManager::getMissingOrigins() const { return ExprTypeToMissingOriginCount; } @@ -46,10 +47,11 @@ OriginID OriginManager::get(const Expr &E) { // if the expression has no specific origin, increment the missing origin // counter. - const QualType ExprType = E.getType(); - auto CountIt = ExprTypeToMissingOriginCount.find(ExprType.getAsString()); + std::string ExprStr(E.getStmtClassName()); + ExprStr = ExprStr + "<" + E.getType().getAsString() + ">"; + auto CountIt = ExprTypeToMissingOriginCount.find(ExprStr); if (CountIt == ExprTypeToMissingOriginCount.end()) { - ExprTypeToMissingOriginCount[ExprType.getAsString()] = 1; + ExprTypeToMissingOriginCount[ExprStr] = 1; } else { CountIt->second++; } @@ -102,4 +105,9 @@ OriginID OriginManager::getOrCreate(const ValueDecl &D) { return NewID; } +bool OriginManager::isOriginMissing(const Expr &E) const { + auto It = ExprToOriginID.find(&E); + return It == ExprToOriginID.end(); +} + } // namespace clang::lifetimes::internal diff --git a/clang/lib/Sema/AnalysisBasedWarnings.cpp b/clang/lib/Sema/AnalysisBasedWarnings.cpp index 009994e189220..157d63b06137d 100644 --- a/clang/lib/Sema/AnalysisBasedWarnings.cpp +++ b/clang/lib/Sema/AnalysisBasedWarnings.cpp @@ -29,8 +29,10 @@ #include "clang/Analysis/Analyses/CFGReachabilityAnalysis.h" #include "clang/Analysis/Analyses/CalledOnceCheck.h" #include "clang/Analysis/Analyses/Consumed.h" +#include "clang/Analysis/Analyses/LifetimeSafety/Facts.h" #include "clang/Analysis/Analyses/LifetimeSafety/LifetimeSafety.h" #include "clang/Analysis/Analyses/LifetimeSafety/Origins.h" +#include "clang/Analysis/Analyses/PostOrderCFGView.h" #include "clang/Analysis/Analyses/ReachableCode.h" #include "clang/Analysis/Analyses/ThreadSafety.h" #include "clang/Analysis/Analyses/UninitializedValues.h" @@ -53,6 +55,7 @@ #include "llvm/ADT/STLFunctionalExtras.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" +#include "llvm/Support/Casting.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include <algorithm> @@ -3067,7 +3070,11 @@ void clang::sema::AnalysisBasedWarnings::IssueWarnings( if (EnableLifetimeSafetyAnalysis && S.getLangOpts().CPlusPlus) { if (AC.getCFG()) { lifetimes::LifetimeSafetyReporterImpl LifetimeSafetyReporter(S); - lifetimes::runLifetimeSafetyAnalysis(AC, &LifetimeSafetyReporter); + std::unique_ptr<clang::lifetimes::internal::LifetimeSafetyAnalysis> + Analysis = + lifetimes::runLifetimeSafetyAnalysis(AC, &LifetimeSafetyReporter); + if (S.CollectStats) + FindMissingOrigins(AC, Analysis->getFactManager()); } } // Check for violations of "called once" parameter properties.
@@ -3133,8 +3140,26 @@ void clang::sema::AnalysisBasedWarnings::IssueWarnings( } } +void clang::sema::AnalysisBasedWarnings::FindMissingOrigins( + AnalysisDeclContext &AC, lifetimes::internal::FactManager &FactMgr) { + if (AC.getCFG()) { + for (const auto &[expr, count] : + FactMgr.getOriginMgr().getMissingOrigins()) { + MissingOriginCount[expr] += count; + } + } +} + void clang::sema::AnalysisBasedWarnings::PrintStats() const { - clang::lifetimes::internal::LifetimeSafetyAnalysis::PrintStats(llvm::errs()); + llvm::errs() << "\n*** LifetimeSafety Missing Origin Stats " + "(expression_type : count) :\n"; + unsigned TotalMissingOrigins = 0; + for (const auto &[expr, count] : MissingOriginCount) { + llvm::errs() << expr << " : " << count << '\n'; + TotalMissingOrigins += count; + } + llvm::errs() << "Total missing origins: " << TotalMissingOrigins << "\n"; + llvm::errs() << "****************************************\n"; llvm::errs() << "\n*** Analysis Based Warnings Stats:\n"; unsigned NumCFGsBuilt = NumFunctionsAnalyzed - NumFunctionsWithBadCFGs; unsigned AvgCFGBlocksPerFunction = From 5b478d13bba0960fcf2eab472381e097638580dd Mon Sep 17 00:00:00 2001 From: Debadri Basak <debadribasak@google.com> Date: Mon, 3 Nov 2025 13:11:00 +0000 Subject: [PATCH 310/313] Adding implementation for missing origin statistics on LifetimeSafetyAnalysis --- .../Analyses/LifetimeSafety/LifetimeSafety.h | 21 +++++++++++++++++++ .../Analyses/LifetimeSafety/Origins.h | 6 ++++++ .../LifetimeSafety/LifetimeSafety.cpp | 5 +++++ clang/lib/Analysis/LifetimeSafety/Origins.cpp | 16 ++++++++++++++ clang/lib/Sema/AnalysisBasedWarnings.cpp | 4 +++- 5 files changed, 51 insertions(+), 1 deletion(-) diff --git a/clang/include/clang/Analysis/Analyses/LifetimeSafety/LifetimeSafety.h b/clang/include/clang/Analysis/Analyses/LifetimeSafety/LifetimeSafety.h index 91ffbb169f947..4952d84a80369 100644 --- a/clang/include/clang/Analysis/Analyses/LifetimeSafety/LifetimeSafety.h +++ b/clang/include/clang/Analysis/Analyses/LifetimeSafety/LifetimeSafety.h @@ -23,7 +23,11 @@ #include "clang/Analysis/Analyses/LifetimeSafety/Facts.h" #include "clang/Analysis/Analyses/LifetimeSafety/LiveOrigins.h" #include "clang/Analysis/Analyses/LifetimeSafety/LoanPropagation.h" +#include "clang/Analysis/Analyses/LifetimeSafety/Origins.h" #include "clang/Analysis/AnalysisDeclContext.h" +#include "llvm/ADT/StringMap.h" +#include "llvm/Support/raw_ostream.h" +#include <string> namespace clang::lifetimes { @@ -60,6 +64,9 @@ struct LifetimeFactory { /// Running the lifetime safety analysis and querying its results. It /// encapsulates the various dataflow analyses.
class LifetimeSafetyAnalysis { +private: + static llvm::StringMap<int> MissingOriginCount; + public: LifetimeSafetyAnalysis(AnalysisDeclContext &AC, LifetimeSafetyReporter *Reporter); @@ -73,6 +80,20 @@ class LifetimeSafetyAnalysis { LiveOriginsAnalysis &getLiveOrigins() const { return *LiveOrigins; } FactManager &getFactManager() { return FactMgr; } + static void PrintStats(llvm::raw_ostream &OS) { + llvm::errs() << "\n*** LifetimeSafety Missing Origin Stats " + "(expression_type : count) :\n"; + for (const auto &[expr, count] : LifetimeSafetyAnalysis::MissingOriginCount) { + OS << expr << " : " << count << '\n'; + } + } + + static void UpdateMissingOriginCount(const OriginManager &OM) { + for (const auto &[expr, missing_origin_count] : OM.getMissingOrigins()) { + LifetimeSafetyAnalysis::MissingOriginCount[std::string(expr)] += missing_origin_count; + } + } + private: AnalysisDeclContext &AC; LifetimeSafetyReporter *Reporter; diff --git a/clang/include/clang/Analysis/Analyses/LifetimeSafety/Origins.h b/clang/include/clang/Analysis/Analyses/LifetimeSafety/Origins.h index ba138b078b379..231cc60b7e097 100644 --- a/clang/include/clang/Analysis/Analyses/LifetimeSafety/Origins.h +++ b/clang/include/clang/Analysis/Analyses/LifetimeSafety/Origins.h @@ -16,7 +16,10 @@ #include "clang/AST/Decl.h" #include "clang/AST/Expr.h" +#include "clang/AST/TypeBase.h" #include "clang/Analysis/Analyses/LifetimeSafety/Utils.h" +#include "llvm/ADT/StringMap.h" +#include "llvm/Support/raw_ostream.h" namespace clang::lifetimes::internal { @@ -76,6 +79,8 @@ class OriginManager { void dump(OriginID OID, llvm::raw_ostream &OS) const; + const llvm::StringMap<int> getMissingOrigins() const; + private: OriginID getNextOriginID() { return NextOriginID++; } @@ -85,6 +90,7 @@ class OriginManager { llvm::SmallVector<Origin> AllOrigins; llvm::DenseMap<const clang::ValueDecl *, OriginID> DeclToOriginID; llvm::DenseMap<const clang::Expr *, OriginID> ExprToOriginID; + llvm::StringMap<int> ExprTypeToMissingOriginCount; }; } // namespace clang::lifetimes::internal diff --git a/clang/lib/Analysis/LifetimeSafety/LifetimeSafety.cpp b/clang/lib/Analysis/LifetimeSafety/LifetimeSafety.cpp index 00c7ed90503e7..a76fdd2535d97 100644 --- a/clang/lib/Analysis/LifetimeSafety/LifetimeSafety.cpp +++ b/clang/lib/Analysis/LifetimeSafety/LifetimeSafety.cpp @@ -23,14 +23,18 @@ #include "clang/Analysis/AnalysisDeclContext.h" #include "clang/Analysis/CFG.h" #include "llvm/ADT/FoldingSet.h" +#include "llvm/ADT/StringMap.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/TimeProfiler.h" +#include "llvm/Support/raw_ostream.h" #include <memory> namespace clang::lifetimes { namespace internal { +llvm::StringMap<int> LifetimeSafetyAnalysis::MissingOriginCount; + LifetimeSafetyAnalysis::LifetimeSafetyAnalysis(AnalysisDeclContext &AC, LifetimeSafetyReporter *Reporter) : AC(AC), Reporter(Reporter) {} @@ -66,6 +70,7 @@ void LifetimeSafetyAnalysis::run() { LiveOrigins->dump(llvm::dbgs(), FactMgr.getTestPoints())); runLifetimeChecker(*LoanPropagation, *LiveOrigins, FactMgr, AC, Reporter); + UpdateMissingOriginCount(FactMgr.getOriginMgr()); } } // namespace internal diff --git a/clang/lib/Analysis/LifetimeSafety/Origins.cpp b/clang/lib/Analysis/LifetimeSafety/Origins.cpp index ea51a75324e06..abe067a829cb7 100644 --- a/clang/lib/Analysis/LifetimeSafety/Origins.cpp +++ b/clang/lib/Analysis/LifetimeSafety/Origins.cpp @@ -7,6 +7,8 @@ //===----------------------------------------------------------------------===// #include
"clang/Analysis/Analyses/LifetimeSafety/Origins.h" +#include "clang/AST/TypeBase.h" +#include "llvm/ADT/StringMap.h" namespace clang::lifetimes::internal { @@ -22,6 +24,10 @@ void OriginManager::dump(OriginID OID, llvm::raw_ostream &OS) const { OS << ")"; } +const llvm::StringMap<int> OriginManager::getMissingOrigins() const { + return ExprTypeToMissingOriginCount; +} + Origin &OriginManager::addOrigin(OriginID ID, const clang::ValueDecl &D) { AllOrigins.emplace_back(ID, &D); return AllOrigins.back(); @@ -37,6 +43,16 @@ OriginID OriginManager::get(const Expr &E) { auto It = ExprToOriginID.find(&E); if (It != ExprToOriginID.end()) return It->second; + + // if the expression has no specific origin, increment the missing origin + // counter. + const QualType ExprType = E.getType(); + auto CountIt = ExprTypeToMissingOriginCount.find(ExprType.getAsString()); + if (CountIt == ExprTypeToMissingOriginCount.end()) { + ExprTypeToMissingOriginCount[ExprType.getAsString()] = 1; + } else { + CountIt->second++; + } // If the expression itself has no specific origin, and it's a reference // to a declaration, its origin is that of the declaration it refers to. // For pointer types, where we don't pre-emptively create an origin for the diff --git a/clang/lib/Sema/AnalysisBasedWarnings.cpp b/clang/lib/Sema/AnalysisBasedWarnings.cpp index 140b709dbb651..009994e189220 100644 --- a/clang/lib/Sema/AnalysisBasedWarnings.cpp +++ b/clang/lib/Sema/AnalysisBasedWarnings.cpp @@ -30,6 +30,7 @@ #include "clang/Analysis/Analyses/CalledOnceCheck.h" #include "clang/Analysis/Analyses/Consumed.h" #include "clang/Analysis/Analyses/LifetimeSafety/LifetimeSafety.h" +#include "clang/Analysis/Analyses/LifetimeSafety/Origins.h" #include "clang/Analysis/Analyses/ReachableCode.h" #include "clang/Analysis/Analyses/ThreadSafety.h" #include "clang/Analysis/Analyses/UninitializedValues.h" @@ -53,6 +54,7 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" #include <algorithm> #include <deque> #include <iterator> @@ -3132,8 +3134,8 @@ void clang::sema::AnalysisBasedWarnings::IssueWarnings( } void clang::sema::AnalysisBasedWarnings::PrintStats() const { + clang::lifetimes::internal::LifetimeSafetyAnalysis::PrintStats(llvm::errs()); llvm::errs() << "\n*** Analysis Based Warnings Stats:\n"; - unsigned NumCFGsBuilt = NumFunctionsAnalyzed - NumFunctionsWithBadCFGs; unsigned AvgCFGBlocksPerFunction = !NumCFGsBuilt ? 
0 : NumCFGBlocks/NumCFGsBuilt; From 20bc6f2c9b89b971de90ac1bd695ddb4659442e5 Mon Sep 17 00:00:00 2001 From: Debadri Basak <debadribasak@google.com> Date: Mon, 3 Nov 2025 13:23:42 +0000 Subject: [PATCH 311/313] Minor refactoring of the static functions --- .../Analyses/LifetimeSafety/LifetimeSafety.h | 14 ++------------ .../lib/Analysis/LifetimeSafety/LifetimeSafety.cpp | 13 +++++++++++++ 2 files changed, 15 insertions(+), 12 deletions(-) diff --git a/clang/include/clang/Analysis/Analyses/LifetimeSafety/LifetimeSafety.h b/clang/include/clang/Analysis/Analyses/LifetimeSafety/LifetimeSafety.h index 4952d84a80369..7490df90a3282 100644 --- a/clang/include/clang/Analysis/Analyses/LifetimeSafety/LifetimeSafety.h +++ b/clang/include/clang/Analysis/Analyses/LifetimeSafety/LifetimeSafety.h @@ -80,19 +80,9 @@ class LifetimeSafetyAnalysis { LiveOriginsAnalysis &getLiveOrigins() const { return *LiveOrigins; } FactManager &getFactManager() { return FactMgr; } - static void PrintStats(llvm::raw_ostream &OS) { - llvm::errs() << "\n*** LifetimeSafety Missing Origin Stats " - "(expression_type : count) :\n"; - for (const auto &[expr, count] : LifetimeSafetyAnalysis::MissingOriginCount) { - OS << expr << " : " << count << '\n'; - } - } + static void PrintStats(llvm::raw_ostream &OS); - static void UpdateMissingOriginCount(const OriginManager &OM) { - for (const auto &[expr, missing_origin_count] : OM.getMissingOrigins()) { - LifetimeSafetyAnalysis::MissingOriginCount[std::string(expr)] += missing_origin_count; - } - } + static void UpdateMissingOriginCount(const OriginManager &OM); private: AnalysisDeclContext &AC; LifetimeSafetyReporter *Reporter; diff --git a/clang/lib/Analysis/LifetimeSafety/LifetimeSafety.cpp b/clang/lib/Analysis/LifetimeSafety/LifetimeSafety.cpp index a76fdd2535d97..828c08d1cbeed 100644 --- a/clang/lib/Analysis/LifetimeSafety/LifetimeSafety.cpp +++ b/clang/lib/Analysis/LifetimeSafety/LifetimeSafety.cpp @@ -39,6 +39,19 @@ LifetimeSafetyAnalysis::LifetimeSafetyAnalysis(AnalysisDeclContext &AC, LifetimeSafetyReporter *Reporter) : AC(AC), Reporter(Reporter) {} +void LifetimeSafetyAnalysis::PrintStats(llvm::raw_ostream &OS) { + llvm::errs() << "\n*** LifetimeSafety Missing Origin Stats " + "(expression_type : count) :\n"; + for (const auto &[expr, count] : LifetimeSafetyAnalysis::MissingOriginCount) { + OS << expr << " : " << count << '\n'; + } + } + +void LifetimeSafetyAnalysis::UpdateMissingOriginCount(const OriginManager &OM) { + for (const auto &[expr, missing_origin_count] : OM.getMissingOrigins()) { + LifetimeSafetyAnalysis::MissingOriginCount[std::string(expr)] += missing_origin_count; + } + } void LifetimeSafetyAnalysis::run() { llvm::TimeTraceScope TimeProfile("LifetimeSafetyAnalysis"); From 41d835c613f521295f3a59bc06bb7c21d0c80595 Mon Sep 17 00:00:00 2001 From: Debadri Basak <debadribasak@google.com> Date: Wed, 5 Nov 2025 13:09:03 +0000 Subject: [PATCH 312/313] Correcting merging errors --- .../Analysis/LifetimeSafety/LifetimeSafety.cpp | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/clang/lib/Analysis/LifetimeSafety/LifetimeSafety.cpp b/clang/lib/Analysis/LifetimeSafety/LifetimeSafety.cpp index 1e7d8f976c33f..744cf7537f8cb 100644 --- a/clang/lib/Analysis/LifetimeSafety/LifetimeSafety.cpp +++ b/clang/lib/Analysis/LifetimeSafety/LifetimeSafety.cpp @@ -33,25 +33,10 @@ namespace clang::lifetimes { namespace internal { -llvm::StringMap<int> LifetimeSafetyAnalysis::MissingOriginCount; - LifetimeSafetyAnalysis::LifetimeSafetyAnalysis(AnalysisDeclContext &AC, LifetimeSafetyReporter *Reporter) : AC(AC), Reporter(Reporter) {} -void
LifetimeSafetyAnalysis::PrintStats(llvm::raw_ostream &OS) { - llvm::errs() << "\n*** LifetimeSafety Missing Origin Stats " - "(expression_type : count) :\n"; - for (const auto &[expr, count] : LifetimeSafetyAnalysis::MissingOriginCount) { - OS << expr << " : " << count << '\n'; - } - } - -void LifetimeSafetyAnalysis::UpdateMissingOriginCount(const OriginManager &OM) { - for (const auto &[expr, missing_origin_count] : OM.getMissingOrigins()) { - LifetimeSafetyAnalysis::MissingOriginCount[std::string(expr)] += missing_origin_count; - } - } void LifetimeSafetyAnalysis::run() { llvm::TimeTraceScope TimeProfile("LifetimeSafetyAnalysis"); From 5efc02004d67debbf75f4a9bd06a41bdf4488c32 Mon Sep 17 00:00:00 2001 From: Debadri Basak <debadri1010@gmail.com> Date: Wed, 5 Nov 2025 13:24:22 +0000 Subject: [PATCH 313/313] Removing the obsolete static functions from LifetimeSafetyAnalysis --- .../include/clang/Analysis/Analyses/LifetimeSafety/Origins.h | 2 +- clang/lib/Analysis/LifetimeSafety/LifetimeSafety.cpp | 1 - clang/lib/Analysis/LifetimeSafety/Origins.cpp | 5 ----- 3 files changed, 1 insertion(+), 7 deletions(-) diff --git a/clang/include/clang/Analysis/Analyses/LifetimeSafety/Origins.h b/clang/include/clang/Analysis/Analyses/LifetimeSafety/Origins.h index 3f8c8a4d7ce9b..26686a63e9204 100644 --- a/clang/include/clang/Analysis/Analyses/LifetimeSafety/Origins.h +++ b/clang/include/clang/Analysis/Analyses/LifetimeSafety/Origins.h @@ -79,7 +79,7 @@ class OriginManager { void dump(OriginID OID, llvm::raw_ostream &OS) const; - const llvm::StringMap<int> getMissingOrigins() const; + const llvm::StringMap<unsigned> getMissingOrigins() const; private: OriginID getNextOriginID() { return NextOriginID++; } diff --git a/clang/lib/Analysis/LifetimeSafety/LifetimeSafety.cpp b/clang/lib/Analysis/LifetimeSafety/LifetimeSafety.cpp index 744cf7537f8cb..d183ce976f946 100644 --- a/clang/lib/Analysis/LifetimeSafety/LifetimeSafety.cpp +++ b/clang/lib/Analysis/LifetimeSafety/LifetimeSafety.cpp @@ -68,7 +68,6 @@ void LifetimeSafetyAnalysis::run() { LiveOrigins->dump(llvm::dbgs(), FactMgr.getTestPoints())); runLifetimeChecker(*LoanPropagation, *LiveOrigins, FactMgr, AC, Reporter); - UpdateMissingOriginCount(FactMgr.getOriginMgr()); } } // namespace internal diff --git a/clang/lib/Analysis/LifetimeSafety/Origins.cpp b/clang/lib/Analysis/LifetimeSafety/Origins.cpp index 9e314d984b74f..b8f705e3377c2 100644 --- a/clang/lib/Analysis/LifetimeSafety/Origins.cpp +++ b/clang/lib/Analysis/LifetimeSafety/Origins.cpp @@ -105,9 +105,4 @@ OriginID OriginManager::getOrCreate(const ValueDecl &D) { return NewID; } -bool OriginManager::isOriginMissing(const Expr &E) const { - auto It = ExprToOriginID.find(&E); - return It == ExprToOriginID.end(); -} - } // namespace clang::lifetimes::internal
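Taken together, the series makes OriginManager tally, for every expression that reaches OriginManager::get() without an assigned origin, a counter keyed by "StmtClassName<type>", and has AnalysisBasedWarnings::PrintStats() dump the per-kind counts plus a total. Below is a minimal standalone sketch of that counting-and-reporting pattern; it uses std::map in place of llvm::StringMap and a plain struct in place of the Clang AST, and every name in it (FakeExpr, OriginCounter, visit, printStats) is illustrative rather than a Clang API.

// Standalone sketch of the missing-origin statistics pattern from the
// patches above. Standard library only; FakeExpr stands in for clang::Expr.
#include <iostream>
#include <map>
#include <string>

struct FakeExpr {          // illustrative stand-in for clang::Expr
  std::string StmtClass;   // e.g. "DeclRefExpr" (cf. getStmtClassName())
  std::string Type;        // e.g. "int *" (cf. getType().getAsString())
  bool HasOrigin;          // whether an origin was assigned to it
};

class OriginCounter {
public:
  // Mirrors the counting step in OriginManager::get(): when an expression
  // has no origin, bump a counter keyed by "StmtClass<type>".
  void visit(const FakeExpr &E) {
    if (!E.HasOrigin)
      ++MissingOriginCount[E.StmtClass + "<" + E.Type + ">"];
  }

  // Mirrors AnalysisBasedWarnings::PrintStats(): per-kind counts plus a total.
  void printStats(std::ostream &OS) const {
    OS << "*** LifetimeSafety Missing Origin Stats (expression_type : count):\n";
    unsigned Total = 0;
    for (const auto &[Kind, Count] : MissingOriginCount) {
      OS << Kind << " : " << Count << '\n';
      Total += Count;
    }
    OS << "Total missing origins: " << Total << '\n';
  }

private:
  std::map<std::string, unsigned> MissingOriginCount;
};

int main() {
  OriginCounter Counter;
  Counter.visit({"DeclRefExpr", "int *", false});
  Counter.visit({"CallExpr", "int *", false});
  Counter.visit({"DeclRefExpr", "int *", false});
  Counter.visit({"DeclRefExpr", "int &", true}); // has an origin; not counted
  Counter.printStats(std::cout);
}

Keying the counter on the statement class plus the printed type, as the later patches do, means the report says not only how many expressions lack origins but which kinds they are, which is what makes the statistic useful when deciding where to add origin support next.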
Number Section Status Issue title Available in Clang?
1[dcl.fct.default] TC1 What if two using-declarations refer to the same function but the declarations introduce different default-arguments? No
2[temp.dep.res] drafting How can dependent names be used in member declarations that appear outside of the class template definition? Not resolved
3[temp.expl.spec] NAD The template compilation model rules render some explicit specialization declarations not visible during instantiation Clang 2.7
4[dcl.link] CD1 Does extern "C" affect the linkage of function names with internal linkage? Clang 2.8
5[dcl.init] CD1 CV-qualifiers and type conversions Clang 3.1
6[class.copy.elision] NAD Should the optimization that allows a class object to alias another object also allow the case of a parameter in an inline function to alias its argument? Yes
7[class.access.base] NAD Can a class with a private virtual base class be derived from? Clang 3.4
8[class.access] CD1 Access to template arguments used in a function return type and in the nested name specifier Duplicate of 45
9[class.access.base] CD1 Clarification of access to base class members Clang 2.8
10[class.access.nest] CD1 Can a nested class access its own class name as a qualified name if it is a private member of the enclosing class? Duplicate of 45
11[namespace.udecl] CD1 How do the keywords typename/template interact with using-declarations? Clang 2.7
12[basic.lookup.argdep] dup Default arguments on different declarations for the same function and the Koenig lookup Superseded by 239
13[dcl.link] NAD extern "C" for Parameters of Function Templates No
14[dcl.link] NAD extern "C" functions and declarations in different namespaces Clang 3.4
15[dcl.fct.default] dup Default arguments for parameters of function templates Clang 2.7
16[class.access.base] CD1 Access to members of indirect private base classes Clang 2.8
17[class.access.base] NAD Footnote 99 should discuss the naming class when describing members that can be accessed from friends Clang 2.7
18[dcl.fct] NAD f(TYPE) where TYPE is void should be allowed Superseded by 577
19[class.protected] NAD Clarify protected member access Clang 3.1
20[class.copy.ctor] TC1 Some clarifications needed for 12.8 para 15 Clang 2.8
21[temp.param] TC1 Can a default argument for a template parameter appear in a friend declaration? Clang 3.4
22[temp.dep.res] TC1 Template parameter with a default argument that refers to itself Superseded by 481
23[temp.func.order] NAD Some questions regarding partial ordering of function templates Clang 2.7
24[temp.expl.spec] TC1 Errors in examples in 14.7.3 N/A
25[except.spec] TC1 Exception specifications and pointers to members Clang 4
26[class.copy.ctor] NAD Copy constructors and default arguments Clang 2.7
27[over.built] NAD Overload ambiguities for builtin ?: prototypes Clang 2.7
28[basic.start.dynamic] CD1 'exit', 'signal' and static object destruction N/A (Library DR)
29[dcl.link] CD1 Linkage of locally declared functions Clang 3.4
30[temp.names] TC1 Valid uses of "::template" Superseded by 468 (C++11 onwards)
31[expr.new] NAD Looking up new/delete Clang 2.8
32[temp] TC1 Clarification of explicit instantiation of non-exported templates N/A
33[basic.lookup.argdep] TC1 Argument dependent lookup and overloaded functions Clang 9
34[temp.inst] NAD Argument dependent lookup and points of instantiation N/A
35[dcl.init] TC1 Definition of default-initialization Duplicate of 178
36[namespace.udecl] CD6 using-declarations in multiple-declaration contexts Clang 2.8
37[except.uncaught] NAD When is uncaught_exception() true? Superseded by 475
38[temp.names] TC1 Explicit template arguments and operator functions Clang 2.7
39[class.member.lookup] CD1 Conflicting ambiguity rules No
40[dcl.meaning] TC1 Syntax of declarator-id N/A
41[basic.lookup.unqual] TC1 Clarification of lookup of names after declarator-id Clang 2.7
42[basic.scope.class] NAD Redefining names from base classes Clang 2.7
43[basic.types] TC1 Copying base classes (PODs) using memcpy N/A
44[temp.expl.spec] CD1 Member specializations Superseded by 727
45[class.access.nest] CD1 Access to nested classes Clang 2.7
46[temp.explicit] NAD Explicit instantiation of member templates Clang 2.7
47[temp.friend] NAD Template friend issues Superseded by 329
48[class.static.data] TC1 Definitions of unused static members Clang 2.7
49[temp.param] TC1 Restriction on non-type, non-value template arguments Clang 2.8
50[basic.def.odr] NAD Converting pointer to incomplete type to same type Clang 2.7
51[over.match.best] TC1 Overloading and user-defined conversions Clang 2.8
52[expr.ref] TC1 Non-static members, member selection and access checking Clang 2.8
53[expr.static.cast] TC1 Lvalue-to-rvalue conversion before certain static_casts Clang 2.7
54[expr.static.cast] CD1 Static_cast from private base to derived class Clang 2.8
55[expr.add] NAD Adding/subtracting pointer and enumeration value Clang 2.7
56[dcl.typedef] TC1 Redeclaring typedefs within classes Clang 2.7
57[class.union] open Empty unions Not resolved
58[class.bit] CD1 Signedness of bit fields of enum type Clang 3.1
59[over.match.copy] TC1 Clarification of overloading and UDC to reference type Clang 2.7
60[over.ics.ref] CD1 Reference binding and valid conversion sequences Clang 2.7
61[over.over] NAD Address of static member function "&p->f" Clang 3.4
62[temp.arg.type] CD1 Unnamed members of classes used as type parameters Clang 2.9
63[temp.inst] CD1 Class instantiation from pointer conversion to void*, null and self Clang 2.7
64[temp.expl.spec] TC1 Partial ordering to disambiguate explicit specialization Clang 2.7
65[dcl.fct.default] TC1 Typo in default argument example N/A
66[dcl.fct.default] NAD Visibility of default args vs overloads added after using-declaration No
67[class.static] TC1 Evaluation of left side of object-expression N/A
68[dcl.type.elab] TC1 Grammar does not allow "friend class A<int>;" Clang 2.8
69[dcl.stc] TC1 Storage class specifiers on template declarations Clang 9
70[temp.deduct.type] CD1 Is an array bound a nondeduced context? Clang 2.7
71[expr] NAD Incorrect cross reference N/A
72[temp] dup Linkage and storage class specifiers for templates Duplicate of 69
73[expr.eq] TC1 Pointer equality Superseded by 1652
74[expr.new] TC1 Enumeration value in direct-new-declarator Clang 2.7
75[class.mem] TC1 In-class initialized members must be const Clang 2.7
76[dcl.type.cv] TC1 Are const volatile variables considered "constant expressions"? Clang 2.7
77[class.friend] CD1 The definition of friend does not allow nested classes to be friends Clang 2.7
78[dcl.init] CD1 Section 8.5 paragraph 9 should state it only applies to non-static objects Superseded by ????
79[new.delete.placement] dup Alignment and placement new N/A
80[class.mem] TC1 Class members with same name as class Clang 2.9
81[diff] NAD Null pointers and C compatibility N/A
82[basic.def.odr] dup Definition of "using" a constant expression Duplicate of 48
83[over.ics.rank] TC1 Overloading and deprecated conversion of string literal Clang 2.7
84[over.best.ics] TC1 Overloading and conversion loophole used by auto_ptr Clang 2.7
85[basic.lookup.elab] TC1 Redeclaration of member class Clang 3.4
86[class.temporary] CD1 Lifetime of temporaries in query expressions Duplicate of 446
87[except.spec] CD1 Exception specifications on function parameters No
88[temp.expl.spec] NAD Specialization of member constant templates Clang 2.8
89[basic.life] TC1 Object lifetime does not account for reference rebinding N/A
90[basic.lookup.argdep] TC1 Should the enclosing class be an "associated class" too? Clang 2.7
91[basic.lookup.argdep] NAD A union's associated types should include the union itself Clang 2.7
92[except.spec] CD4 Should exception-specifications be part of the type system? Clang 4 (C++17 onwards)
93[basic.life] TC1 Missing word in 3.8 basic.life paragraph 2 N/A
94[expr.const] TC1 Inconsistencies in the descriptions of constant expressions Clang 2.7
95[namespace.memdef] NAD Elaborated type specifiers referencing names declared in friend decls Clang 3.3
96[temp.names] C++11 Syntactic disambiguation using the template keyword Superseded by P1787
97[expr.const] NAD Use of bool constants in integral constant expressions Clang 2.7
98[except] TC1 Branching into try block Clang 2.7
99[temp.deduct.call] NAD Partial ordering, references and cv-qualifiers Superseded by 214
100[temp.arg.nontype] TC1 Clarify why string literals are not allowed as template arguments Clang 2.7
101[namespace.udecl] TC1 Redeclaration of extern "C" names via using-declarations Clang 3.5
102[over.match.oper] NAD Operator lookup rules do not work well with parts of the library Clang 2.7
103[namespace.udir] TC1 Is it extended-namespace-definition or extension-namespace-definition ? N/A
104[except.throw] NAD Destroying the exception temp when no handler is found N/A (Library DR)
105[temp] TC1 Meaning of "template function" N/A
106[unknown] CD1 Creating references to references during template deduction/instantiation Superseded by 540
107[dcl.link] NAD Linkage of operator functions Clang 2.7
108[temp.dep.type] TC1 Are classes nested in templates dependent? Clang 2.9
109[namespace.udecl] NAD Allowing ::template in using-declarations Clang 2.8
110[temp] CD6 Can template functions and classes be declared in the same scope? Clang 2.8
111[class.copy.ctor] NAD Copy constructors and cv-qualifiers Duplicate of 535
112[dcl.array] CD1 Array types and cv-qualifiers Clang 3.1
113[expr.call] CD1 Visibility of called function Clang 2.7
114[temp.mem] NAD Virtual overriding by template member function specializations Clang 2.7
115[over.over] CD1 Address of template-id Clang 3.0
116[temp.over.link] TC1 Equivalent and functionally-equivalent function templates Clang 2.7
117[class.temporary] NAD Timing of destruction of temporaries N/A
118[expr.call] CD1 Calls via pointers to virtual member functions Yes
119[basic.life] CD1 Object lifetime and aggregate initialization N/A
120[temp.res] TC1 Nonexistent non-terminal qualified-name N/A
121[temp.res] TC1 Dependent type names with non-dependent nested-name-specifiers Clang 2.7
122[expr.prim.general] CD1 template-ids as unqualified-ids Clang 2.7
123[expr.prim.general] TC1 Bad cross-reference N/A
124[class.temporary] CD1 Lifetime of temporaries in default initialization of class arrays Clang 2.7
125[expr.prim.general] CD1 Ambiguity in friend declaration syntax Clang 2.7
126[except.spec] TC1 Exception specifications and const Partial
127[expr.new] TC1 Ambiguity in description of matching deallocation function Clang 2.9
128[expr.static.cast] TC1 Casting between enum types Clang 2.7
129[intro.execution] CD3 Stability of uninitialized auto variables Duplicate of 616
130[expr.new] NAD Sequence points and new-expressions N/A
131[extendid] TC1 Typo in Lao characters Superseded by P1949
132[basic.link] NAD Local types and linkage No
133[except.spec] dup Exception specifications and checking Duplicate of 87
134[temp] TC1 Template classes and declarator-ids N/A
135[dcl.fct] TC1 Class type in in-class member function definitions Clang 2.7
136[dcl.fct.default] CD1 Default arguments and friend declarations Clang 3.4
137[expr.static.cast] TC1 static_cast of cv void* Clang 2.7
138[namespace.memdef] CD6 Friend declaration name lookup Partial
139[basic.lookup.unqual] CD1 Error in friend lookup example Clang 2.7
140[dcl.fct] CD1 Agreement of parameter declarations Clang 2.7
141[basic.lookup.classref] CD1 Non-member function templates in member access expressions Clang 3.1
142[class.access.base] TC1 Injection-related errors in access example Clang 2.8
143[basic.lookup.argdep] CD1 Friends and Koenig lookup Clang 2.7
144[dcl.type.elab] open Position of friend specifier Not resolved
145[depr.impldec] TC1 Deprecation of prefix ++ Clang 2.7
146[basic.fundamental] open Floating-point zero Not resolved
147[expr.prim.general] TC1 Naming the constructor Clang 2.7
148[class] TC1 POD classes and pointers to members Clang 2.7
149[conv.ptr] TC1 Accessibility and ambiguity N/A
150[temp.arg.template] C++17 Template template parameters and default arguments Clang 19
151[dcl.init] TC1 Terminology of zero-initialization Clang 3.1
152[class.conv.ctor] TC1 explicit copy constructors Clang 2.7
153[over.ics.rank] TC1 Misleading wording (rank of conversion) N/A
154[dcl.stc] NAD Anonymous unions in unnamed namespaces Clang 2.7
155[dcl.init] dup Brace initializer for scalar Duplicate of 632
156[basic.lookup.classref] NAD Name lookup for conversion functions Superseded by 1111
157[dcl.pre] open Omitted typedef declarator Not resolved
158[basic.lval] CD1 Aliasing and qualification conversions Yes
159[dcl.meaning] TC1 Namespace qualification in declarators Clang 3.5
160[dcl.ambig.res] CD1 Missing std:: qualification N/A
161[class.protected] TC1 Access to protected nested type Clang 3.1
162[over.match.call] CD1 (&C::f)() with nonstatic members Clang 19
163[dcl.init.aggr] TC1 Description of subaggregate initializer N/A
164[basic.lookup.argdep] TC1 Overlap between Koenig and normal lookup Clang 2.7
165[namespace.memdef] NAD Definitions of friends and block-scope externs No
166[namespace.memdef] TC1 Friend declarations of template-ids Clang 2.9
167[depr.static] NAD Deprecating static functions Superseded by 1012
168[dcl.link] NAD C linkage for static member functions No
169[namespace.udecl] NAD template-ids in using-declarations Clang 3.4
170[conv.mem] CD7 Pointer-to-member conversions Clang 3.1
171[basic.namespace] TC1 Global namespace scope Clang 3.4
172[dcl.enum] CD1 Unsigned int as underlying type of enum Clang 2.7
173[lex.charset] TC1 Constraints on execution character set Clang 2.7
174[depr.static] NAD Undeprecating global static Superseded by 1012
175[class] CD1 Class name injection and base name access Clang 2.8
176[class] TC1 Name injection and templates Clang 3.1
177[dcl.init] CD1 Lvalues vs rvalues in copy-initialization Clang 2.7
178[dcl.init] TC1 More on value-initialization Clang 3.1
179[expr.add] TC1 Function pointers and subtraction Clang 2.7
180[temp.res] CD1 typename and elaborated types Clang 2.8
181[temp.deduct.type] TC1 Errors in template template-parameter example Clang 2.7
182[temp.expl.spec] NAD Access checking on explicit specializations Clang 14
183[temp.res] TC1 typename in explicit specializations Superseded by 382
184[temp.param] CD1 Default arguments in template template-parameters Clang 2.7
185[class.copy.ctor] TC1 "Named" temporaries and copy elision Clang 2.7
186[temp.local] open Name hiding and template template-parameters Not resolved
187[temp.param] TC1 Scope of template parameter names Superseded by 481
188[expr.comma] TC1 Comma operator and rvalue conversion Clang 2.7
189[lex.operators] open Definition of operator and punctuator Not resolved
190[class.mem] TC1 Layout-compatible POD-struct types Clang 19
191[basic.lookup.unqual] CD6 Name lookup does not handle complex nesting Clang 2.7
192[basic.lookup.unqual] NAD Name lookup in parameters Clang 2.7
193[class.dtor] TC1 Order of destruction of local automatics of destructor Clang 2.7
194[class.ctor] TC1 Identifying constructors Clang 2.7
195[expr.reinterpret.cast] CD1 Converting between function and object pointers Clang 2.7
196[expr.delete] open Arguments to deallocation functions Not resolved
197[temp.dep.candidate] CD1 Issues with two-stage lookup of dependent names Clang 2.7
198[class.local] CD1 Definition of "use" in local and nested classes Clang 2.9
199[class.temporary] CD1 Order of destruction of temporaries Clang 2.8
200[temp.func.order] dup Partial ordering and explicit arguments Duplicate of 214
201[class.temporary] CD1 Order of destruction of temporaries in initializers Clang 2.8
202[over.over] TC1 Use of overloaded function name Clang 3.1
203[expr.unary.op] NAD Type of address-of-member expression Clang 3.0
204[temp] CD1 Exported class templates Superseded by 820
205[temp] drafting Templates and static data members Not resolved
206[temp.nondep] TC1 Semantic constraints on non-dependent names Clang 2.7
207[class.access.base] CD1 using-declarations and protected access Clang 2.7
208[except.throw] CD1 Rethrowing exceptions in nested handlers Unknown
209[class.friend] NAD Must friend declaration names be accessible? Clang 3.2
210[except.handle] TC1 What is the type matched by an exception handler? Clang 2.7
211[except] NAD Constructors should not be allowed to return normally after an exception Clang 2.7
212[temp.inst] CD4 Implicit instantiation is not described clearly enough Yes
213[temp.dep] TC1 Lookup in dependent base classes Clang 2.7
214[temp.func.order] CD1 Partial ordering of function templates is underspecified Clang 2.7
215[temp.param] CD1 Template parameters are not allowed in nested-name-specifiers Clang 2.9
216[basic.link] CD1 Linkage of nameless class-scope enumeration types No
217[dcl.fct.default] TC1 Default arguments for non-template member functions of class templates Clang 2.7
218[basic.lookup.argdep] CD1 Specification of Koenig lookup Clang 2.7
219[except.terminate] NAD Cannot defend against destructors that throw exceptions N/A
220[basic.stc.dynamic.deallocation] CD1 All deallocation functions should be required not to throw N/A
221[over.assign] CD1 Must compound assignment operators be member functions? Clang 3.6
222[expr] CD1 Sequence points and lvalue-returning operators Duplicate of 637
223[depr] CD3 The meaning of deprecation N/A
224[temp.dep.type] CD1 Definition of dependent names Clang 16
225[basic.lookup.argdep] NAD Koenig lookup and fundamental types Yes
226[temp.param] CD1 Default template arguments for function templates No
227[stmt.select] TC1 How many scopes in an if statement? Clang 2.7
228[temp.names] CD1 Use of template keyword with non-member templates Clang 2.7
229[temp.spec.partial] NAD Partial specialization of function templates Clang 2.9
230[class.abstract] NAD Calls to pure virtual functions Clang 3.0
231[basic.lookup.unqual] NAD Visibility of names after using-directives Clang 2.7
232[expr.unary.op] NAD Is indirection through a null pointer undefined behavior? Duplicate of 2823
233[dcl.init.ref] CD7 References vs pointers in UDC overload resolution Unknown
234[basic.life] NAD Reuse of base class subobjects N/A
235[class.base.init] TC1 Assignment vs initialization N/A
236[expr.const] NAD Explicit temporaries and integral constant expressions Clang 3.2
237[temp.explicit] CD1 Explicit instantiation and base class members Duplicate of 470
238[expr] CD4 Precision and accuracy constraints on floating point Unknown
239[over.call.func] CD1 Footnote 116 and Koenig lookup Clang 2.7
240[conv.lval] CD3 Uninitialized values and undefined behavior Duplicate of 616
241[temp.arg.explicit] TC1 Error in example in 14.8.1 Clang 9
242[expr.cast] CD4 Interpretation of old-style casts Unknown
243[over.ics.user] NAD Weighting of conversion functions in direct-initialization Clang 2.8
244[class.dtor] CD1 Destructor lookup Clang 11
245[basic.lookup.elab] CD1 Name lookup in elaborated-type-specifiers Clang 2.8
246[temp.arg] CD1 Jumps in function-try-block handlers Clang 3.2
247[over.over] NAD Pointer-to-member casts and function overload resolution Clang 2.7
248[extendid] C++11 Identifier characters Superseded by P1949
249[temp.mem.func] TC1 What is a member function template? Clang 2.7
250[over.over] TC1 Address of function template specialization with non-deduced template arguments Clang 2.7
251[basic.fundamental] open How many signed integer types are there? Not resolved
252[class.dtor] CD1 Looking up deallocation functions in virtual destructors Clang 3.1
253[dcl.init] C++17 Why must empty or fully-initialized const objects be initialized? Unknown
254[basic.lookup.elab] CD1 Definitional problems with elaborated-type-specifiers Clang 2.9
255[class.free] CD6 Placement deallocation functions and lookup ambiguity Clang 2.7
256[expr.new] CD1 Overflow in size calculations Duplicate of 624
257[class.base.init] CD2 Abstract base constructors and virtual base initialization Clang 3.4
258[namespace.udecl] CD1 using-declarations and cv-qualifiers Clang 2.8
259[temp.spec] CD1 Restrictions on explicit specialization and instantiation Clang 4
260[over.built] open User-defined conversions and built-in operator= Not resolved
261[basic.def.odr] CD1 When is a deallocation function "used?" No
262[dcl.fct] CD1 Default arguments and ellipsis Clang 2.7
263[class.ctor] CD1 Can a constructor be declared a friend? Clang 3.3
264[temp.arg.explicit] open Unusable template constructors and conversion functions Not resolved
265[expr.delete] dup Destructors, exceptions, and deallocation Duplicate of 353
266[gram] NAD No grammar sentence symbol N/A
267[expr.new] open Alignment requirement for new-expressions Not resolved
268[cpp.rescan] open Macro name suppression in rescanned replacement text Not resolved
269[basic.start.static] NAD Order of initialization of multiply-defined static data members of class templates N/A
270[basic.start.static] CD1 Order of initialization of static data members of class templates N/A
271[temp.deduct] CD6 Explicit instantiation and template argument deduction Unknown
272[class.dtor] CD1 Explicit destructor invocation and qualified-ids Clang 2.7
273[class] CD1 POD classes and operator&() Clang 2.7
274[basic.life] CD1 Cv-qualification and char-alias access to out-of-lifetime objects N/A
275[temp.expl.spec] CD1 Explicit instantiation/specialization and using-directives No
276[stmt.jump] CD1 Order of destruction of parameters and temporaries N/A
277[dcl.init] CD1 Zero-initialization of pointers Clang 3.1
278[basic.link] NAD External linkage and nameless entities Unknown
279[basic.link] CD6 Correspondence of "names for linkage purposes" No
280[over.call.object] CD1 Access and surrogate call functions Clang 2.9
281[dcl.fct.spec] CD1 inline specifier in friend declarations No
282[expr.typeid] open Namespace for extended_type_info Not resolved
283[dcl.type.simple] CD1 Template type-parameters are not syntactically type-names Clang 2.7
284[class] CD1 qualified-ids in class declarations No
285[temp.expl.spec] NAD Identifying a function template being specialized Clang 2.7
286[temp.spec.partial] CD1 Incorrect example in partial specialization Clang 2.8
287[temp.point] drafting Order dependencies in template instantiation Not resolved
288[expr.delete] CD1 Misuse of "static type" in describing pointers N/A
289[basic.def.odr] CD1 Incomplete list of contexts requiring a complete type Clang 2.7
290[basic.types] NAD Should memcpy be allowed into a POD with a const member? N/A
291[dcl.init.ref] CD1 Overload resolution needed when binding reference to class rvalue Duplicate of 391
292[expr.new] CD3 Deallocation on exception in new before arguments evaluated Clang 2.9
293[temp.explicit] open Syntax of explicit instantiation/specialization too permissive Not resolved
294[expr.static.cast] NAD Can static_cast drop exception specifications? No
295[dcl.fct] CD1 cv-qualifiers on function types Clang 3.7
296[class.conv.fct] CD1 Can conversion functions be static? Clang 2.7
297[temp.deduct] NAD Which template does an explicit specialization specialize? Unknown
298[class.qual] CD1 T::x when T is cv-qualified Clang 3.1
299[expr.new] CD1 Conversion on array bound expression in new Clang 2.8 (C++11 onwards)
300[temp.deduct.type] CD1 References to functions in template argument deduction Clang 2.7
301[temp.names] CD1 Syntax for template-name Clang 3.5
302[dcl.init] CD1 Value-initialization and generation of default constructor Clang 3.0
303[conv.prom] NAD Integral promotions on bit-fields N/A
304[dcl.init] TC1 Value-initialization of a reference Clang 2.9
305[basic.lookup.classref] CD1 Name lookup in destructor call No
306[class.member.lookup] CD1 Ambiguity by class name injection Duplicate of 39
307[class.cdtor] NAD Initialization of a virtual base class subobject N/A
308[except.handle] NAD Catching exceptions with ambiguous base classes Clang 3.7
309[basic.pre] CD1 Linkage of entities whose names are not simply identifiers, in introduction Duplicate of 485
310[temp.over.link] open Can function templates differing only in parameter cv-qualifiers be overloaded? Not resolved
311[namespace.def] NAD Using qualified name to reopen nested namespace Clang 3.0
312[basic.stc.dynamic.deallocation] CD3 “use” of invalid pointer value not defined Duplicate of 616
313[expr.new] dup Class with single conversion function to integral as array size in new Duplicate of 299 (C++11 onwards)
314[temp.names] C++17 template in base class specifier No
315[class.static.mfct] NAD Is call of static member function through null pointer undefined? N/A
316[temp.local] NAD Injected-class-name of template used as template template parameter Superseded by 1004
317[dcl.fct.spec] CD1 Can a function be declared inline after it has been called? Clang 3.5
318[class.qual] CD1 struct A::A should not name the constructor of A Superseded by 1310
319[basic.link] CD1 Use of names without linkage in declaring entities with linkage No
320[class.temporary] CD1 Question on copy constructor elision example Clang 3.1
321[basic.lookup.argdep] dup Associated classes and namespaces for argument-dependent lookup Duplicate of 557
322[temp.deduct.conv] CD1 Deduction of reference conversions Clang 2.8
323[temp] CD1 Where must export appear? Superseded by 820
324[expr.unary.op] CD1 Can "&" be applied to assignment to bit-field? Clang 3.6
325[dcl.fct.default] open When are default arguments parsed? Not resolved
326[class.ctor] CD1 Wording for definition of trivial constructor Clang 3.1
327[class] CD1 Use of "structure" without definition Duplicate of 538
328[class.mem] CD1 Missing requirement that class member types be complete Clang 2.7
329[temp.friend] CD1 Evaluation of friends of templates Clang 3.5
330[conv.qual] CD4 Qualification conversions and pointers to arrays of pointers Clang 7
331[class.ctor] CD1 Allowed copy constructor signatures Clang 11
332[dcl.fct] CD3 cv-qualified void parameter types Duplicate of 577
333[dcl.ambig.res] NAD Ambiguous use of "declaration" in disambiguation section Clang 2.7
334[temp.dep.expr] NAD Is a comma-expression dependent if its first operand is? Clang 2.7
335[temp] CD1 Allowing export on template members of nontemplate classes Superseded by 820
336[temp.expl.spec] CD1 Explicit specialization examples are still incorrect Clang 2.7
337[temp.deduct] CD1 Attempt to create array of abstract type should cause deduction to fail Clang 2.7
338[basic.link] CD6 Enumerator name with linkage used as class name in other translation unit Duplicate of 1884
339[expr.const] CD1 Overload resolution in operand of sizeof in constant expression Clang 2.8
340[dcl.ambig.res] NAD Unclear wording in disambiguation section Clang 2.7
341[dcl.link] C++11 extern "C" namespace member function versus global variable Superseded by 1708
342[expr.unary] CD3 Terminology: "indirection" versus "dereference" N/A
343[temp.names] C++17 Make template optional in contexts that require a type No
344[class.dtor] CD3 Naming destructors Duplicate of 1435
345[temp.res] CD1 Misleading comment on example in templates chapter Clang 2.7
346[except.spec] NAD Typo in 15.4 N/A
347[class.nest] NAD Use of derived class name in defining base class nested class Clang 2.7
348[basic.stc.dynamic.deallocation] CD1 delete and user-written deallocation functions N/A
349[temp.deduct.conv] CD1 Template argument deduction for conversion functions and qualification conversions No
350[basic.types] open signed char underlying representation for objects Not resolved
351[expr] CD1 Sequence point error: unspecified or undefined? N/A
352[temp.deduct.call] CD1 Nondeduced contexts Clang 2.8
353[expr.delete] CD1 Is deallocation routine called if destructor throws exception in delete? Unknown
354[temp.arg.nontype] CD1 Null as nontype template argument Clang 3.1 (C++11 onwards)
355[class] C++11 Global-scope :: in nested-name-specifier Clang 2.7
356[class.copy.ctor] NAD Wording of behavior of generated copy constructor for scalar members N/A
357[intro.defs] CD1 Definition of signature should include name Clang 2.7
358[dcl.link] NAD Namespaces and extern "C" Clang 2.7
359[class.union] NAD Type definition in anonymous union Clang 3.3
360[class.access.base] CD6 Using-declaration that reduces access Clang 2.8
361[dcl.fct.default] open Forward reference to default argument Not resolved
362[lex.phases] CD1 Order of initialization in instantiation units N/A
363[class.expl.init] NAD Initialization of class from self N/A
364[over.call.func] CD1 Calling overloaded function with static in set, with no object Clang 2.7
365[basic.stc] open Storage duration and temporaries Not resolved
366[expr.const] CD1 String literal allowed in integral constant expression? Clang 2.7
367[expr.const] CD1 throw operator allowed in constant expression? Clang 2.7
368[temp.deduct] CD1 Uses of non-type parameters that should cause deduction to fail Clang 3.6
369[lex.pptoken] open Are new/delete identifiers or preprocessing-op-or-punc? Not resolved
370[cpp.include] CD1 Can #include <...> form be used other than for standard C++ headers? N/A
371[basic.start.static] open Interleaving of constructor calls Not resolved
372[temp.arg] CD1 Is access granted by base class specifiers available in following base class specifiers? No
373[basic.lookup.udir] C++11 Lookup on namespace qualified name in using-directive Clang 5
374[dcl.meaning] CD2 Can explicit specialization outside namespace use qualified name? Clang 7
375[temp.res] dup Confusing example on lookup with typename Duplicate of 345
376[dcl.fct.spec] NAD Class "definition" versus class "declaration" N/A
377[dcl.enum] CD1 Enum whose enumerators will not fit in any integral type Clang 2.7
378[stmt.jump] CD1 Wording that says temporaries are declared Duplicate of 276
379[class] CD1 Change "class declaration" to "class definition" N/A
380[class.member.lookup] open Definition of "ambiguous base class" missing Not resolved
381[basic.lookup.classref] CD1 Incorrect example of base class member lookup Clang 2.7
382[temp.res] CD1 Allow typename outside of templates Clang 2.7 (C++11 onwards)
383[class] CD1 Is a class with a declared but not defined destructor a POD? Clang 2.7
384[basic.lookup.argdep] NAD Argument-dependent lookup and operator functions Clang 2.7
385[class.protected] CD1 How does protected member check of 11.5 interact with using-declarations? Clang 2.8
386[namespace.udecl] CD6 Friend declaration of name brought in by using-declaration No
387[temp.inject] CD1 Errors in example in 14.6.5 Clang 2.8
388[except.handle] CD3 Catching base*& from a throw of derived* Unknown
389[basic.link] CD1 Unnamed types in entities with linkage No
390[class.abstract] CD1 Pure virtual must be defined when implicitly called Clang 3.3
391[dcl.init.ref] CD1 Require direct binding of short-lived references to rvalues Clang 2.8 (C++11 onwards)
392[class.temporary] CD1 Use of full expression lvalue before temporary destruction Clang 2.8
393[dcl.fct] CD4 Pointer to array of unknown bound in template argument list in parameter Clang 2.7
394[cpp.pre] CD1 identifier-list is never defined N/A
395[class.conv.fct] NAD Conversion operator template syntax Clang 3.0
396[dcl.fct.spec] CD1 Misleading note regarding use of auto for disambiguation Clang 3.0
397[dcl.fct.spec] CD1 Same address for string literals from default arguments in inline functions? Superseded by 1823
398[temp.deduct] CD1 Ambiguous wording on naming a type in deduction Clang 2.7
399[class.dtor] CD6 Destructor lookup redux Clang 11
400[namespace.qual] CD1 Using-declarations and the "struct hack" Clang 2.7
401[temp.param] CD1 When is access for template parameter default arguments checked? Clang 2.8
402[temp.func.order] open More on partial ordering of function templates Not resolved
403[basic.lookup.argdep] CD1 Reference to a type as a template-id Clang 2.7
404[basic.life] CD1 Unclear reference to construction with non-trivial constructor N/A
405[basic.lookup.unqual] CD6 Unqualified function name lookup Clang 2.7
406[class.static.data] CD1 Static data member in class with name for linkage purposes Clang 2.9
407[dcl.typedef] C++11 Named class with associated typedef: two names or one? Clang 3.8
408[temp.static] CD2 sizeof applied to unknown-bound array static data member of template Clang 3.4
409[temp.res] CD1 Obsolete paragraph missed by changes for issue 224 Clang 2.7
410[temp.friend] CD1 Paragraph missed in changes for issue 166 No
411[lex.string] CD6 Use of universal-character-name in character versus string literals Unknown
412[dcl.fct.spec] NAD Can a replacement allocation function be inline? Clang 3.4
413[class] CD1 Definition of "empty class" Clang 2.7
414[basic.lookup.classref] CD1 Multiple types found on destructor lookup Duplicate of 305
415[temp.over] CD1 Template deduction does not cause instantiation Clang 2.7
416[over.match.oper] CD1 Class must be complete to allow operator lookup? Clang 2.7
417[class.name] CD1 Using derived-class qualified name in out-of-class nested class definition No
418[over.match.best] CD6 Imperfect wording on error on multiple default arguments on a called function No
419[basic.life] open Can cast to virtual base class be done on partially-constructed object? Not resolved
420[over.ref] CD1 postfixexpression->scalar_type_dtor() inconsistent Clang 9
421[expr.ref] CD1 Is rvalue.field an rvalue? Clang 2.7
422[dcl.typedef] NAD Is a typedef redeclaration allowed with a template type that might be the same? Clang 2.7
423[over.match.oper] NAD Can a conversion be done on the left operand of a compound assignment? Clang 2.7
424[dcl.typedef] CD1 Wording problem with issue 56 resolution on redeclaring typedefs in class scope Clang 2.7
425[over.built] CD1 Set of candidates for overloaded built-in operator with float operand Clang 2.7
426[basic.link] C++17 Identically-named variables, one internally and one externally linked, allowed? Unknown
427[expr.static.cast] CD1 static_cast ambiguity: conversion versus cast to derived Clang 2.7
428[except.throw] CD1 Mention of expression with reference type Clang 2.7
429[expr.new] CD1 Matching deallocation function chosen based on syntax or signature? Clang 2.8 (C++11 onwards)
430[dcl.init.aggr] CD1 Ordering of expression evaluation in initializer list Clang 2.7 (C++11 onwards)
431[temp.names] C++11 Defect in wording in 14.2 Clang 2.8
432[basic.scope.class] CD1 Is injected class name visible in base class specifier list? Clang 3.0
433[basic.scope.pdecl] CD1 Do elaborated type specifiers in templates inject into enclosing namespace scope? Clang 2.7
434[dcl.init.ref] NAD Unclear suppression of standard conversions while binding reference to lvalue Superseded by 2352
435[dcl.pre] NAD Change "declararation or definition" to "declaration" N/A
436[class.bit] CD1 Problem in example in 9.6 paragraph 4 Clang 2.7
437[class.mem] CD1 Is type of class allowed in member function exception specification? Superseded by 1308
438[expr] CD2 Possible flaw in wording for multiple accesses to object between sequence points Clang 2.7
439[expr.static.cast] CD1 Guarantees on casting pointer back to cv-qualified version of original type Clang 2.7
440[temp.arg] NAD Allow implicit pointer-to-member conversion on nontype template argument Unknown
441[basic.start.static] CD1 Ordering of static reference initialization Clang 2.7
442[expr.delete] CD1 Incorrect use of null pointer constant in description of delete operator Superseded by 348
443[class.temporary] CD1 Wording nit in description of lifetime of temporaries N/A
444[class.copy.assign] NAD Overriding and the generated copy assignment operator Clang 2.7
445[class.friend] NAD Wording issue on friend declarations Clang 3.2
446[expr.cond] CD1 Does an lvalue-to-rvalue conversion on the "?" operator produce a temporary? Clang 2.8
447[temp.dep.constexpr] CD1 Is offsetof type-dependent? Clang 2.8
448[temp.local] C++11 Set of template functions in call with dependent explicit argument Clang 2.8
449[intro.defs] NAD Consistency in use of hyphen with names of "non" entities N/A
450[dcl.init.ref] CD1 Binding a reference to const to a cv-qualified array rvalue Clang 3.2
451[expr] CD1 Expressions with invalid results and ill-formedness Clang 2.7
452[class.this] CD1 Wording nit on description of this Clang 2.7
453[dcl.ref] CD7 References may only bind to “valid” objects Unknown
454[class.static.data] CD1 When is a definition of a static data member required? Unknown
455[over.match.best] NAD Partial ordering and non-deduced arguments Unknown
456[conv.ptr] NAD Is initialized const int or const bool variable a null pointer constant? Clang 3.4
457[expr.const] CD1 Wording nit on use of const variables in constant expressions Clang 2.7
458[temp.local] C++11 Hiding of member template parameters by other members Clang 11
459[temp.local] NAD Hiding of template parameters by base class members Unknown
460[namespace.udecl] CD1 Can a using-declaration name a namespace? Clang 2.7
461[dcl.asm] NAD Make asm conditionally-supported N/A
462[class.temporary] CD3 Lifetime of temporaries bound to comma expressions Clang 2.7
463[expr.reinterpret.cast] CD1 reinterpret_cast<T*>(0) N/A
464[class.temporary] CD1 Wording nit on lifetime of temporaries to which references are bound N/A
465[basic.start.static] NAD May constructors of global objects call exit()? N/A
466[expr.pseudo] CD1 cv-qualifiers on pseudo-destructor type Clang 2.8
467[stmt.dcl] NAD Jump past initialization of local static variable Clang 2.7
468[temp.names] CD1 Allow ::template outside of templates Clang 2.7 (C++11 onwards)
469[temp.deduct.type] NAD Const template specializations and reference arguments No
470[temp.explicit] CD1 Instantiation of members of an explicitly-instantiated class template Clang 2.7
471[class.access.base] NAD Conflicting inherited access specifications Clang 2.8
472[class.protected] open Casting across protected inheritance Not resolved
473[expr.new] NAD Block-scope declarations of allocator functions Unknown
474[basic.link] CD1 Block-scope extern declarations in namespace members Clang 3.4
475[except.uncaught] C++11 When is std::uncaught_exception() true? (take 2) Unknown
476[expr.new] CD5 Determining the buffer size for placement new Unknown
477[dcl.fct.spec] CD1 Can virtual appear in a friend declaration? Clang 3.5
478[dcl.array] NAD May a function parameter be an array of an abstract class type? Clang 2.7
479[except.throw] CD1 Copy elision in exception handling Clang 2.8
480[conv.mem] CD1 Is a base of a virtual base also virtual? Clang 2.7
481[basic.scope] CD2 Scope of template parameters Clang 2.8
482[dcl.meaning] CD3 Qualified declarators in redeclarations Clang 3.5
483[basic.fundamental] CD3 Normative requirements on integral ranges Clang 2.7
484[class.derived] CD1 Can a base-specifier name a cv-qualified class type? Clang 2.8
485[basic.pre] CD1 What is a “name”? Clang 2.7
486[temp.deduct] CD1 Invalid return types and template argument deduction Clang 2.7
487[expr.const] NAD Operator overloading in constant expressions Clang 2.7
488[temp.deduct] CD1 Local types, overload resolution, and template argument deduction Clang 2.9 (C++11 onwards)
489[temp.inst] NAD Must member function templates be instantiated during overload resolution? N/A
490[basic.lookup.unqual] CD2 Name lookup in friend declarations Clang 2.8
491[dcl.init.aggr] CD1 Initializers for empty-class aggregate members Duplicate of 413
492[expr.typeid] CD1 typeid constness inconsistent with example Clang 2.7
493[temp.deduct.conv] CD2 Type deduction from a bool context Duplicate of 976
494[class.access] CD1 Problems with the resolution of issue 45 Duplicate of 372
495[over.match.best] CD2 Overload resolution with template and non-template conversion functions Clang 3.5
496[basic.types] CD3 Is a volatile-qualified type really a POD? Superseded by 2094
497[expr.mptr.oper] CD1 Missing required initialization in example Superseded by 253
498[dcl.stc] open Storage class specifiers in definitions of class members Not resolved
499[except.throw] CD2 Throwing an array of unknown size Clang 2.7
500[class.friend] CD1 Access in base-specifiers of friend and nested classes Duplicate of 372
501[class.friend] NAD Visibility of friend declarations within the befriending class Clang 2.7
502[temp.dep.type] C++11 Dependency of nested enumerations and enumerators Clang 2.7
503[temp.deduct.call] open Cv-qualified function types in template argument deduction Not resolved
504[dcl.ref] NAD Should use of a variable in its own initializer require a diagnostic? Unknown
505[lex.ccon] CD1 Conditionally-supported behavior for unknown character escapes Clang 2.7
506[expr.call] CD1 Conditionally-supported behavior for non-POD objects passed to ellipsis Clang 2.7
507[over.built] dup Ambiguity assigning class object to built-in type Duplicate of 260
508[dcl.init] C++11 Non-constructed value-initialized objects N/A
509[dcl.init] CD1 Dead code in the specification of default initialization N/A
510[class.init] CD1 Default initialization of POD classes? N/A
511[class.prop] NAD POD-structs with template assignment operators Unknown
512[class.union] NAD Union members with user-declared non-default constructors Clang 3.0
513[intro.object] CD1 Non-class “most-derived” objects N/A
514[basic.lookup.unqual] CD1 Is the initializer for a namespace member in the scope of the namespace? Clang 2.7
515[temp.dep] CD1 Non-dependent references to base class members Superseded by 1017
516[dcl.type.simple] CD1 Use of signed in bit-field declarations N/A
517[temp.spec.partial.general] CD1 Partial specialization following explicit instantiation No
518[dcl.enum] CD1 Trailing comma following enumerator-list Clang 2.7 (C++11 onwards)
519[conv.ptr] CD1 Null pointer preservation in void* conversions Clang 2.7
520[expr.cast] CD1 Old-style casts between incomplete class types N/A
521[basic.stc.dynamic.allocation] CD1 Requirements for exceptions thrown by allocation functions No
522[temp.deduct.call] CD1 Array-to-pointer decay in template argument deduction Clang 2.7
523[basic.stc.dynamic.deallocation] open Can a one-past-the-end pointer be invalidated by deleting an adjacent object? Not resolved
524[temp.dep] CD1 Can function-notation calls to operator functions be dependent? Clang 2.7
525[temp.inst] CD1 Missing * in example Clang 2.7
526[temp.deduct.type] CD1 Confusing aspects in the specification of non-deduced contexts Clang 2.7
527[basic.link] CD2 Problems with linkage of types N/A
528[expr.typeid] NAD Why are incomplete class types not allowed with typeid? Clang 2.7
529[temp.expl.spec] open Use of template<> with “explicitly-specialized” class templates Not resolved
530[expr.const] CD1 Nontype template arguments in constant expressions Clang 2.7
531[temp.expl.spec] C++11 Defining members of explicit specializations Partial
532[temp.func.order] C++11 Member/nonmember operator template partial ordering Clang 3.5
533[cpp.include] NAD Special treatment for C-style header names N/A
534[temp] CD1 template-names and operator-function-ids Clang 2.9
535[class.copy.ctor] CD3 Copy construction without a copy constructor Clang 3.1
536[expr.prim.general] CD6 Problems in the description of id-expressions N/A
537[intro.defs] CD1 Definition of “signature” N/A
538[class] CD1 Definition and usage of structure, POD-struct, POD-union, and POD class N/A
539[dcl.type] CD3 Constraints on type-specifier-seq Clang 3.4
540[namespace.def] CD1 Propagation of cv-qualifiers in reference-to-reference collapse Clang 2.7
541[temp.dep.expr] CD2 Dependent function types Clang 2.7
542[class.init] CD2 Value initialization of arrays of POD-structs Clang 3.5
543[dcl.init] CD1 Value initialization and default constructors Clang 3.0
544[temp.dep] NAD Base class lookup in explicit specialization Clang 2.7
545[over.match.oper] open User-defined conversions and built-in operator overload resolution Not resolved
546[temp.explicit] C++11 Explicit instantiation of class template members Clang 2.7
547[dcl.fct] C++11 Partial specialization on member function types Clang 3.2
548[dcl.meaning] dup qualified-ids in declarations Duplicate of 482
549[temp.spec.partial.match] drafting Non-deducible parameters in partial specializations Not resolved
550[dcl.fct] dup Pointer to array of unknown bound in parameter declarations Duplicate of 393
551[temp.explicit] CD1 When is inline permitted in an explicit instantiation? Clang 2.7 (C++11 onwards)
552[temp.names] NAD Use of typename in the type in a non-type parameter-declaration Clang 2.7
553[namespace.memdef] NAD Problems with friend allocation and deallocation functions Clang 2.7
554[basic.scope] CD6 Definition of “declarative region” and “scope” N/A
555[basic.lookup] CD5 Pseudo-destructor name lookup Clang 2.8
556[expr.assign] CD2 Conflicting requirements for acceptable aliasing N/A
557[basic.lookup.argdep] CD1 Does argument-dependent lookup cause template instantiation? Clang 3.1
558[lex.charset] CD1 Excluded characters in universal character names Clang 2.9
559[temp.res] CD1 Editing error in issue 382 resolution Clang 2.7
560[temp.res] NAD Use of the typename keyword in return types Clang 16
561[temp.dep.candidate] CD2 Internal linkage functions in dependent name lookup Clang 2.7
562[class.qual] CD6 qualified-ids in non-expression contexts N/A
563[dcl.link] CD6 Linkage specification for objects Clang 3.3
564[dcl.link] CD2 Agreement of language linkage or linkage-specifications? Clang 2.7
565[namespace.udecl] CD3 Conflict rules for using-declarations naming function templates Clang 2.7
566[conv.fpint] NAD Conversion of negative floating point values to integer type Clang 3.1
567[expr.add] NAD Can size_t and ptrdiff_t be larger than long? N/A
568[class] CD1 Definition of POD is too strict Clang 3.0 (C++11 onwards)
569[dcl.pre] CD2 Spurious semicolons at namespace scope should be allowed Clang 2.7 (C++11 onwards)
570[basic.def.odr] CD2 Are references subject to the ODR? Duplicate of 633
571[basic.link] CD2 References declared const Clang 2.7
572[conv] C++11 Standard conversions for non-built-in types Clang 2.7
573[expr.reinterpret.cast] C++11 Conversions between function pointers and void* No
574[class.copy.assign] NAD Definition of “copy assignment operator” Clang 3.0
575[temp.deduct] C++11 Criteria for deduction failure Clang 2.7
576[dcl.typedef] CD2 Typedefs in function definitions Clang 3.5
577[dcl.fct] CD3 void in an empty parameter list Clang 3.5
578[lex.phases] CD6 Phase 1 replacement of characters with universal-character-names Unknown
579[temp.names] open What is a “nested” > or >>? Not resolved
580[class.access] C++11 Access in template-parameters of member and friend definitions Partial
581[temp.arg.explicit] CD5 Can a templated constructor be explicitly instantiated or specialized? Unknown
582[temp.mem] CD1 Template conversion functions N/A
583[expr.rel] CD3 Relational pointer comparisons against the null pointer constant Clang 4
584[basic.lval] NAD Unions and aliasing N/A
585[class.friend] NAD Friend template template parameters Clang 3.0
586[temp.deduct.type] NAD Default template-arguments and template argument deduction N/A
587[expr.cond] CD2 Lvalue operands of a conditional expression differing only in cv-qualification Clang 3.2
588[temp.dep] CD2 Searching dependent bases of classes local to function templates Clang 2.7
589[dcl.init.ref] CD2 Direct binding of class and array rvalues in reference initialization Clang 2.7
590[temp.dep.type] C++11 Nested classes and the “current instantiation” Clang 2.7
591[temp.dep] CD4 When a dependent base class is the current instantiation Clang 20
592[except.ctor] CD1 Exceptions during construction of local static objects N/A
593[except.handle] NAD Falling off the end of a destructor's function-try-block handler Clang 2.8
594[basic.life] CD1 Coordinating issues 119 and 404 with delegating constructors N/A
595[except.spec] dup Exception specifications in templates instantiated from class bodies Duplicate of 1330
596[except.unexpected] NAD Replacing an exception object Unknown
597[basic.life] CD3 Conversions applied to out-of-lifetime non-POD lvalues N/A
598[basic.lookup.argdep] CD2 Associated namespaces of overloaded functions and function templates Clang 2.7
599[expr.delete] CD2 Deleting a null function pointer Partial
600[class.access] CD6 Does access control apply to members or to names? Clang 2.8
601[cpp.cond] CD2 Type of literals in preprocessing expressions Clang 2.7
602[temp.local] C++11 When is the injected-class-name of a class template a template? Clang 2.7
603[temp.type] CD1 Type equivalence and unsigned overflow Clang 3.1
604[over.match.ctor] CD2 Argument list for overload resolution in copy-initialization N/A
605[temp.expl.spec] C++11 Linkage of explicit specializations Clang 2.7
606[temp.deduct.call] CD1 Template argument deduction for rvalue references Clang 3.0
607[class.base.init] CD6 Lookup of mem-initializer-ids Clang 2.7
608[class.virtual] CD2 Determining the final overrider of a virtual function Clang 2.7
609[dcl.type.cv] CD4 What is a “top-level” cv-qualifier? Unknown
610[expr.unary.op] NAD Computing the negative of 0U Clang 2.7
611[dcl.init] CD2 Zero-initializing references Clang 2.7
612[intro.execution] CD2 Requirements on a conforming implementation N/A
613[class.mem] CD1 Unevaluated uses of non-static class members Clang 3.1 (C++11 onwards)
614[expr.mul] CD1 Results of integer / and % Clang 2.7
615[dcl.init] C++11 Incorrect description of variables that can be initialized Clang 2.7
616[intro.defs] CD3 Definition of “indeterminate value” Clang 4
617[conv.lval] NAD Lvalue-to-rvalue conversions of uninitialized char objects Unknown
618[cpp.cond] CD2 Casts in preprocessor conditional expressions Clang 2.7
619[basic.types] C++11 Completeness of array types Clang 3.4
620[class.mem] CD1 Declaration order in layout-compatible POD structs Duplicate of 568
621[temp.expl.spec] C++11 Template argument deduction from function return types Clang 2.7
622[expr.rel] NAD Relational comparisons of arbitrary pointers Unknown
623[basic.stc.dynamic.deallocation] CD3 Use of pointers to deallocated storage N/A
624[expr.new] CD1 Overflow in calculating size of allocation Unknown
625[dcl.spec.auto] CD2 Use of auto as a template-argument Clang 2.9
626[cpp.stringize] CD2 Preprocessor string literals Clang 2.7
627[basic.fundamental] NAD Values behaving as types Clang 2.7
628[dcl.enum] CD2 The values of an enumeration with no enumerator N/A
629[dcl.spec.auto] CD1 auto parsing ambiguity Clang 2.9
630[lex.charset] CD2 Equality of narrow and wide character values in the basic character set Clang 2.7
631[stmt.if] CD3 Jumping into a “then” clause N/A
632[dcl.init.aggr] CD1 Brace-enclosed initializer for scalar member of aggregate Clang 2.7
633[basic.pre] CD2 Specifications for variables that should also apply to references N/A
634[expr.call] CD1 Conditionally-supported behavior for non-POD objects passed to ellipsis redux Clang 2.7
635[class.qual] NAD Names of constructors and destructors of templates Clang 2.7
636[basic.lval] CD4 Dynamic type of objects and aliasing Unknown
637[intro.execution] CD1 Sequencing rules and example disagree Clang 3.0
638[temp.friend] CD2 Explicit specialization and friendship No
639[intro.execution] CD1 What makes side effects “different” from one another? Clang 3.3
640[basic.start.dynamic] NAD Accessing destroyed local objects of static storage duration Unknown
641[over.match.viable] CD2 Overload resolution and conversion-to-same-type operators Clang 2.7
642[basic.scope.block] CD2 Definition and use of “block scope” and “local scope” Clang 2.7
643[dcl.type.simple] NAD Use of decltype in a class member-specification Clang 3.2
644[basic.types] CD1 Should a trivial class type be a literal type? Partial
645[class.mem] CD2 Are bit-field and non-bit-field members layout compatible? N/A
646[basic.types] NAD Can a class with a constexpr copy constructor be a literal type? Superseded by 981
647[dcl.constexpr] CD1 Non-constexpr instances of constexpr constructor templates Clang 3.1
648[dcl.constexpr] CD1 Constant expressions in constexpr initializers Clang 2.7
649[basic.align] CD1 Optionally ill-formed extended alignment requests Clang 3.5
650[class.temporary] CD2 Order of destruction for temporaries bound to the returned value of a function Clang 2.8
651[dcl.type.simple] CD1 Problems in decltype specification and examples Clang 2.7
652[expr.const] CD2 Compile-time evaluation of floating-point expressions Clang 3.1
653[class.copy.assign] CD2 Copy assignment of unions Clang 2.7
654[conv.ptr] CD1 Conversions to and from nullptr_t Superseded by 1423
655[class.base.init] C++11 Initialization not specified for forwarding constructors Clang 3.0
656[dcl.init.ref] CD2 Direct binding to the result of a conversion operator Clang 2.8
657[temp.deduct] CD2 Abstract class parameter in synthesized declaration Partial
658[expr.reinterpret.cast] CD2 Defining reinterpret_cast for pointer types Clang 2.7
659[expr.alignof] CD1 Alignment of function types Clang 3.0
660[dcl.enum] CD1 Unnamed scoped enumerations Clang 3.0
661[expr.rel] CD1 Semantics of arithmetic comparisons Clang 2.7
662[temp.deduct] NAD Forming a pointer to a reference type Clang 2.7
663[extendid] CD1 Valid Cyrillic identifier characters Superseded by P1949
664[dcl.init.ref] CD2 Direct binding of references to non-class rvalue references Clang 2.7
665[expr.dynamic.cast] CD2 Problems in the specification of dynamic_cast Clang 2.8
666[temp.res] CD1 Dependent qualified-ids without the typename keyword Clang 2.8
667[class.copy.ctor] CD2 Trivial special member functions that cannot be implicitly defined Clang 8
668[except.terminate] CD2 Throwing an exception from the destructor of a local static object Unknown
669[dcl.type.simple] NAD Confusing specification of the meaning of decltype Clang 3.1
670[dcl.init] CD4 Copy initialization via derived-to-base conversion in the second step Unknown
671[expr.static.cast] CD1 Explicit conversion from a scoped enumeration type to integral type Clang 2.9
672[expr.new] CD2 Sequencing of initialization in new-expressions Clang 2.7
673[namespace.memdef] NAD Injection of names from elaborated-type-specifiers in friend declarations Clang 2.7
674[temp.friend] C++11 “matching specialization” for a friend declaration Clang 8
675[class.bit] CD3 Signedness of bit-field with typedef or template parameter type Duplicate of 739
676[basic.def] C++11 static_assert-declarations and general requirements for declarations N/A
677[class.dtor] CD1 Deleted operator delete and virtual destructors No
678[basic.def.odr] C++11 Language linkage of member function parameter types and the ODR Unknown
679[temp.type] CD1 Equivalence of template-ids and operator function templates Clang 2.7
680[class.copy.ctor] CD2 What is a move constructor? N/A
681[dcl.fct] CD1 Restrictions on declarators with late-specified return types Partial
682[basic.lookup.classref] CD5 Missing description of lookup of template aliases Unknown
683[class.copy.ctor] CD1 Requirements for trivial subobject special functions Clang 3.3
684[expr.const] CD1 Constant expressions involving the address of an automatic variable Superseded by 1454
685[conv.prom] CD2 Integral promotion of enumeration ignores fixed underlying type Clang 10
686[dcl.name] CD1 Type declarations/definitions in type-specifier-seqs and type-ids Clang 3.0
687[expr.prim.general] NAD template keyword with unqualified-ids Unknown
688[basic.start.static] CD1 Constexpr constructors and static initialization Unknown
689[basic.fundamental] CD5 Maximum values of signed and unsigned integers Unknown
690[intro.defs] CD2 The dynamic type of an rvalue reference Unknown
691[temp.param] C++11 Template parameter packs in class template partial specializations Unknown
692[temp.deduct.type] C++11 Partial ordering of variadic class template partial specializations Clang 16
693[conv.array] CD2 New string types and deprecated conversion Unknown
694[dcl.init] C++11 Zero- and value-initialization of union objects Unknown
695[expr] CD2 Compile-time calculation errors in constexpr functions Unknown
696[class.local] C++11 Use of block-scope constants in local classes Clang 3.1
697[temp.deduct] open Deduction rules apply to more than functions Not resolved
698[intro.execution] open The definition of “sequenced before” is too narrow Not resolved
699[dcl.constexpr] CD2 Must constexpr member functions be defined in the class member-specification? Unknown
700[dcl.constexpr] C++11 Constexpr member functions of class templates Unknown
701[dcl.array] CD2 When is the array-to-pointer conversion applied? Unknown
702[over.ics.rank] CD2 Preferring conversion to std::initializer_list Unknown
703[dcl.init.list] CD2 Narrowing for literals that cannot be exactly represented Unknown
704[over.match.call] CD2 To which postfix-expressions does overload resolution apply? Unknown
705[basic.lookup.argdep] CD2 Suppressing argument-dependent lookup via parentheses Clang 2.7
706[dcl.spec.auto] NAD Use of auto with rvalue references Unknown
707[conv.fpint] CD2 Undefined behavior in integral-to-floating conversions Unknown
708[temp.spec.partial] open Partial specialization of member templates of class templates Not resolved
709[temp.deduct] C++11 Enumeration names as nested-name-specifiers in deduction failure Unknown
710[class.cdtor] CD2 Data races during construction Unknown
711[dcl.spec.auto] CD2 auto with braced-init-list Unknown
712[basic.def.odr] CD3 Are integer constant operands of a conditional-expression “used?” Partial
713[dcl.fct] CD2 Unclear note about cv-qualified function types Clang 3.0
714[class.static.data] CD2 Static const data members and braced-init-lists Unknown
715[expr.const] CD2 Class member access constant expressions Unknown
716[class.union] CD2 Specifications that should apply only to non-static union data members Unknown
717[dcl.stc] CD2 Unintentional restrictions on the use of thread_local Unknown
718[class.friend] NAD Non-class, non-function friend declarations Unknown
719[basic.pre] CD2 Specifications for operator-function-id that should also apply to literal-operator-id Unknown
720[expr.prim.lambda] CD2 Need examples of lambda-expressions Unknown
721[expr.const] CD2 Where must a variable be initialized to be used in a constant expression? Unknown
722[expr.call] CD2 Can nullptr be passed to an ellipsis? Clang 20
726[intro.multithread] CD2 Atomic and non-atomic objects in the memory model Unknown
727[temp.expl.spec] C++17 In-class explicit specializations Partial
728[temp] NAD Restrictions on local classes Unknown
729[except.handle] CD3 Qualification conversions and handlers of reference-to-pointer type Unknown
730[temp.expl.spec] CD2 Explicit specializations of members of non-template classes Unknown
731[expr.ref] CD2 Omitted reference qualification of member function type Unknown
732[dcl.fct.def] CD2 Late-specified return types in function definitions Unknown
733[class.copy.assign] NAD Reference qualification of copy assignment operators Unknown
734[expr.reinterpret.cast] CD2 Are unique addresses required for namespace-scope variables? Unknown
735[basic.stc.dynamic.safety] CD2 Missing case in specification of safely-derived pointers Unknown
736[dcl.decl] NAD Is the & ref-qualifier needed? Unknown
737[dcl.init.string] CD2 Uninitialized trailing characters in string initialization Unknown
738[class.ctor] C++11 constexpr not permitted by the syntax of constructor declarations Unknown
739[class.bit] CD3 Signedness of plain bit-fields Unknown
740[intro.multithread] CD2 Incorrect note on data races Unknown
741[class.bit] C++11 “plain” long long bit-fields Unknown
742[expr.post.incr] open Postfix increment/decrement with long bit-field operands Not resolved
743[expr.prim.general] CD2 Use of decltype in a nested-name-specifier Unknown
744[temp.arg.template] CD2 Matching template arguments with template template parameters with parameter packs Unknown
745[cpp.error] C++23 Effect of ill-formedness resulting from #error Unknown
746[dcl.spec.auto] CD2 Use of auto in new-expressions Unknown
747[class.access.base] dup Access of protected base classes Unknown
749[over.built] CD2 References to function types with a cv-qualifier or ref-qualifier Unknown
750[expr.prim.lambda.closure] CD2 Implementation constraints on reference-only closure objects Unknown
751[expr.prim.lambda.closure] CD2 Deriving from closure classes Unknown
752[expr.prim.lambda] CD2 Name lookup in nested lambda-expressions Unknown
753[expr.prim.lambda.capture] CD2 Array names in lambda capture sets Unknown
754[expr.prim.lambda] CD2 Lambda expressions in default arguments of block-scope function declarations Unknown
755[expr.prim.lambda.capture] CD3 Generalized lambda-captures Unknown
756[expr.prim.lambda.closure] CD2 Dropping cv-qualification on members of closure objects Unknown
757[basic.link] CD2 Types without linkage in declarations Unknown
758[basic.def] C++11 Missing cases of declarations that are not definitions Unknown
759[expr.prim.lambda.closure] CD2 Destruction of closure objects Unknown
760[expr.prim.general] CD2 this inside a nested class of a non-static member function Unknown
761[expr.prim.lambda.closure] CD2 Inferred return type of closure object call operator Unknown
762[expr.prim.lambda] CD2 Name lookup in the compound-statement of a lambda expression Unknown
763[expr.prim.lambda.closure] CD2 Is a closure object's operator() inline? Unknown
764[expr.prim.lambda.capture] CD2 Capturing unused variables in a lambda expression Unknown
765[dcl.fct.spec] CD2 Local types in inline functions with external linkage Unknown
766[expr.prim.lambda] CD2 Where may lambda expressions appear? Unknown
767[expr.prim.lambda] CD2 void and other unnamed lambda-parameters Unknown
768[expr.prim.lambda] CD2 Ellipsis in a lambda parameter list Unknown
769[expr.prim.lambda] CD2 Initialization of closure objects Unknown
770[dcl.decl] CD2 Ambiguity in late-specified return type Unknown
771[expr.prim.lambda.closure] CD2 Move-construction of reference members of closure objects Unknown
772[expr.prim.lambda.capture] CD2 capture-default in lambdas in local default arguments Unknown
773[temp.arg.nontype] C++11 Parentheses in address non-type template arguments Unknown
774[expr.prim.lambda.closure] CD2 Can a closure class be a POD? Unknown
775[expr.prim.lambda.capture] CD2 Capturing references to functions Unknown
776[basic.start.dynamic] CD2 Delegating constructors, destructors, and std::exit Unknown
777[dcl.fct.default] CD2 Default arguments and parameter packs Clang 3.7
778[temp.param] C++11 Template parameter packs in non-type template parameters Unknown
779[expr.prim.lambda.closure] CD2 Rvalue reference members of closure objects? Unknown
782[expr.prim.lambda] CD2 Lambda expressions and argument-dependent lookup Unknown
783[intro.defs] open Definition of “argument” Not resolved
784[intro.structure] C++11 List of incompatibilities with the previous Standard Unknown
785[intro.execution] CD2 “Execution sequence” is inappropriate phraseology Unknown
786[intro.multithread] CD2 Definition of “thread” Unknown
787[lex.phases] CD2 Unnecessary lexical undefined behavior Clang 21
788[lex.charset] CD2 Relationship between locale and values of the execution character set Unknown
789[lex.trigraph] CD2 Deprecating trigraphs Unknown
790[lex.string] CD2 Concatenation of raw and non-raw string literals Unknown
792[basic.start.main] CD2 Effects of std::quick_exit Unknown
793[basic.life] CD2 Use of class members during destruction Unknown
794[conv.mem] NAD Base-derived conversion in member type of pointer-to-member conversion Clang 2.7
795[expr.prim.lambda] NAD Dependency of lambdas on <functional> Unknown
796[expr.prim.lambda] CD2 Lifetime of a closure object with members captured by reference Unknown
797[expr.prim.lambda.closure] CD2 Converting a no-capture lambda to a function type Unknown
798[expr.sub] C++11 Overloaded subscript operator described in clause 5 Unknown
799[expr.reinterpret.cast] CD2 Can reinterpret_cast be used to cast an operand to its own type? Unknown
800[expr.reinterpret.cast] NAD Safely-derived pointers and object pointers converted from function pointers Unknown
801[expr.const.cast] CD2 Casting away constness in a cast to rvalue reference type Unknown
803[expr.sizeof] CD2 sizeof an enumeration type with a fixed underlying type Unknown
804[expr.new] CD2 Deducing the type in new auto(x) Unknown
805[expr.new] CD2 Which exception to throw for overflow in array size calculation Unknown
806[expr.const] CD2 Enumeration types in integral constant expressions Unknown
807[expr.const] NAD typeid expressions in constant expressions Unknown
808[dcl.spec] CD2 Non-type decl-specifiers versus max-munch Unknown
809[dcl.stc] CD2 Deprecation of the register keyword Unknown
810[dcl.stc] CD2 Block-scope thread_local variables should be implicitly static Unknown
811[dcl.type.cv] CD2 Unclear implications of const-qualification Unknown
812[namespace.def] CD2 Duplicate names in inline namespaces Unknown
813[namespace.udecl] open typename in a using-declaration with a non-dependent name Not resolved
814[dcl.attr] CD2 Attribute to indicate that a function throws nothing Unknown
815[dcl.attr.grammar] CD2 Parameter pack expansion inside attributes Unknown
816[dcl.attr.final] CD2 Diagnosing violations of [[final]] Unknown
817[dcl.attr.final] CD2 Meaning of [[final]] applied to a class definition Unknown
818[dcl.fct] CD2 Function parameter packs in non-final positions Unknown
819[special] NAD Access control and deleted implicitly-declared special member functions Unknown
820[temp] CD2 Deprecation of export Clang 2.7
822[temp] NAD Additional contexts for template aliases Unknown
823[temp.arg.nontype] CD2 Literal types with constexpr conversions as non-type template arguments Unknown
828[except.throw] CD2 Destruction of exception objects Unknown
829[except.spec] NAD At what point is std::unexpected called? Unknown
830[except.spec] CD2 Deprecating exception specifications Unknown
831[implimits] CD2 Limit on recursively nested template instantiations Unknown
832[lex.ppnumber] CD2 Value of preprocessing numbers Unknown
833[expr.static.cast] CD2 Explicit conversion of a scoped enumeration value to a floating type Unknown
834[lex.string] CD2 What is an “ordinary string literal”? Unknown
835[expr] CD2 Scoped enumerations and the “usual arithmetic conversions” Unknown
836[dcl.attr.noreturn] NAD [[noreturn]] applied to function types Unknown
837[dcl.constexpr] C++11 Constexpr functions and return braced-init-list Unknown
838[class.base.init] C++11 Use of this in a brace-or-equal-initializer Unknown
839[expr.sizeof] dup sizeof with opaque enumerations Unknown
840[temp.param] CD2 Rvalue references as nontype template parameters Unknown
842[expr.reinterpret.cast] CD2 Casting to rvalue reference type Unknown
845[dcl.fct.def] CD2 What is the “first declaration” of an explicit specialization? Unknown
846[basic.lval] CD2 Rvalue references to functions Unknown
847[temp.deduct.call] CD2 Error in rvalue reference deduction example Unknown
850[expr.prim.general] CD2 Restrictions on use of non-static data members Unknown
852[namespace.udecl] CD6 using-declarations and dependent base classes Unknown
853[basic.stc.dynamic.safety] CD2 Support for relaxed pointer safety Unknown
854[expr.shift] CD2 Left shift and unsigned extended types Unknown
855[expr.assign] CD2 Incorrect comments in braced-init-list assignment example Unknown
858[expr] CD2 Example binding an rvalue reference to an lvalue Unknown
860[dcl.constexpr] C++11 Explicit qualification of constexpr member functions Unknown
861[namespace.qual] CD2 Unintended ambiguity in inline namespace lookup Unknown
862[dcl.enum] CD2 Undefined behavior with enumerator value overflow Unknown
863[expr.post] CD2 Rvalue reference cast to incomplete type Unknown
864[stmt.ranged] C++11 braced-init-list in the range-based for statement Unknown
865[dcl.init.list] CD2 Initializing a std::initializer_list Unknown
869[dcl.init] CD2 Uninitialized thread_local objects Unknown
872[lex.string] CD2 Lexical issues with raw strings Unknown
873[temp.deduct.type] C++11 Deducing rvalue references in declarative contexts Clang 3.0
874[class.mem] CD2 Class-scope definitions of enumeration types Unknown
876[temp.deduct.call] CD2 Type references in rvalue reference deduction specification Unknown
877[over.match.viable] CD2 Viable functions and binding references to rvalues Unknown
879[over.built] CD2 Missing built-in comparison operators for pointer types Unknown
880[over.built] CD2 Built-in conditional operator for scoped enumerations Unknown
882[basic.start.main] CD2 Defining main as deleted Clang 3.5
883[basic.types] CD2 std::memcpy vs std::memmove Unknown
884[temp.expl.spec] CD2 Defining an explicitly-specialized static data member Unknown
885[temp.deduct.partial] NAD Partial ordering of function templates with unordered parameter pairs Unknown
886[dcl.init.aggr] CD2 Member initializers and aggregates Unknown
887[class.copy.ctor] CD2 Move construction of thrown object Unknown
888[class.base.init] CD2 Union member initializers Unknown
891[expr.const.cast] CD2 const_cast to rvalue reference from objectless rvalue Unknown
892[dcl.constexpr] C++11 Missing requirements for constexpr constructors Unknown
893[dcl.enum] NAD Brace syntax for enumerator-definitions Unknown
896[dcl.init.ref] CD2 Rvalue references and rvalue-reference conversion functions Unknown
897[cpp.pragma.op] open _Pragma and extended string-literals Not resolved
898[dcl.constexpr] C++11 Declarations in constexpr functions Unknown
899[over.match.copy] CD2 Explicit conversion functions in direct class initialization Unknown
900[class.temporary] C++23 Lifetime of temporaries in range-based for Unknown
901[expr.new] open Deleted operator delete Not resolved
902[class.static.data] NAD In-class initialization of non-constant static data members Unknown
903[temp.dep.constexpr] CD3 Value-dependent integral null pointer constants Unknown
904[expr.prim.lambda.capture] CD2 Parameter packs in lambda-captures Unknown
905[class] CD2 Explicit defaulted copy constructors and trivial copyability Unknown
906[dcl.fct.def] CD2 Which special member functions can be defaulted? Unknown
908[dcl.fct.def] CD2 Deleted global allocation and deallocation functions Unknown
909[expr.cast] NAD Old-style casts with conversion functions Unknown
910[class.copy.ctor] CD2 Move constructors and implicitly-declared copy constructors Unknown
912[lex.ccon] CD3 Character literals and universal-character-names Unknown
913[temp.deduct.conv] CD2 Deduction rules for array- and function-type conversion functions Unknown
914[expr.type.conv] open Value-initialization of array types Not resolved
915[dcl.fct.def] CD2 Deleted specializations of member function templates Unknown
919[namespace.def] CD2 Contradictions regarding inline namespaces Unknown
920[dcl.meaning] CD2 Interaction of inline namespaces and using-declarations Unknown
921[namespace.def] CD2 Unclear specification of inline namespaces Unknown
922[class.ctor] CD2 Implicit default constructor definitions and const variant members Unknown
923[temp.expl.spec] CD2 Inline explicit specializations Unknown
924[class.mem] C++11 alias-declaration as a class member Unknown
925[cpp.cond] open Type of character literals in preprocessor expressions Not resolved
926[namespace.unnamed] CD2 Inline unnamed namespaces Unknown
927[class.ctor] CD2 Implicitly-deleted default constructors and member initializers Unknown
928[dcl.fct.def] CD2 Defaulting a function that would be implicitly defined as deleted Unknown
929[temp.alias] CD2 What is a template alias? Unknown
930[expr.alignof] CD2 alignof with incomplete array type Clang 2.7
931[lex.ext] CD2 Confusing reference to the length of a user-defined string literal Unknown
932[lex.string] CD2 UCNs in closing delimiters of raw string literals Unknown
933[lex.ccon] CD2 32-bit UCNs with 16-bit wchar_t Unknown
934[dcl.init.list] CD2 List-initialization of references Unknown
935[over.literal] CD2 Missing overloads for character types for user-defined literals Unknown
936[dcl.init.string] CD2 Array initialization with new string literals Unknown
937[lex.ext] NAD Restrictions on values of template arguments in user-defined literals Unknown
938[dcl.init.aggr] C++11 Initializer lists and array new Unknown
939[class.virtual] CD2 Explicitly checking virtual function overriding Unknown
940[dcl.stc] CD2 Global anonymous unions Unknown
941[temp.expl.spec] C++11 Explicit specialization of deleted function template Unknown
942[basic.pre] CD2 Is this an entity? Unknown
943[expr.type.conv] CD5 Is T() a temporary? Unknown
944[expr.reinterpret.cast] NAD reinterpret_cast for all types with the same size and alignment Unknown
945[expr.prim.general] C++11 Use of this in a late-specified return type Unknown
946[basic.start.dynamic] CD2 Order of destruction of local static objects and calls to std::atexit Unknown
947[temp.over] NAD Deducing type template arguments from default function arguments Unknown
948[stmt.select] C++11 constexpr in conditions Clang 3.7
949[intro.compliance] open Requirements for freestanding implementations Not resolved
950[dcl.type.simple] CD2 Use of decltype as a class-name Unknown
951[dcl.attr] CD2 Problems with attribute-specifiers Unknown
952[class.access.base] CD6 Insufficient description of “naming class” Clang 2.8
953[over.ics.ref] CD2 Rvalue references and function viability Unknown
954[over.built] open Overload resolution of conversion operator templates with built-in types Not resolved
955[expr.prim.lambda.closure] CD2 Can a closure type's operator() be virtual? Unknown
956[dcl.fct] CD2 Function prototype scope with late-specified return types Unknown
957[dcl.attr.grammar] CD2 Alternative tokens and attribute-tokens Unknown
958[expr.prim.lambda] NAD Lambdas and decltype Unknown
959[dcl.align] CD2 Alignment attribute for class and enumeration types Unknown
960[class.virtual] CD2 Covariant functions and lvalue/rvalue references Clang 3.0
961[over.ics.rank] CD2 Overload resolution and conversion of std::nullptr_t to bool Unknown
962[dcl.type.elab] CD2 Attributes appertaining to class and enum types Unknown
963[expr.rel] CD2 Comparing nullptr with 0 Unknown
964[basic.lval] C++11 Incorrect description of when the lvalue-to-rvalue conversion applies Unknown
965[dcl.attr.depend] CD2 Limiting the applicability of the carries_dependency attribute Unknown
966[basic.link] CD2 Nested types without linkage Unknown
967[basic.stc.dynamic] NAD Exception specification of replacement allocation function Unknown
968[dcl.attr.grammar] CD2 Syntactic ambiguity of the attribute notation Unknown
969[temp.explicit] CD2 Explicit instantiation declarations of class template specializations Unknown
970[dcl.attr] CD2 Consistent use of “appertain” and “apply” Unknown
971[except.handle] C++11 Incorrect treatment of exception-declarations Unknown
972[dcl.attr.grammar] C++11 Allowing multiple attribute-specifiers Unknown
973[except.spec] CD2 Function types in exception-specifications Unknown
974[expr.prim.lambda] CD3 Default arguments for lambdas Clang 3.3
975[expr.prim.lambda] CD3 Restrictions on return type deduction for lambdas Unknown
976[temp.deduct.conv] CD2 Deduction for const T& conversion operators Unknown
977[dcl.enum] CD3 When is an enumeration type complete? Clang 2.7
978[over.best.ics] CD2 Incorrect specification for copy initialization Unknown
979[dcl.decl] CD2 Position of attribute-specifier in declarator syntax Unknown
980[temp.explicit] CD2 Explicit instantiation of a member of a class template Unknown
981[basic.types] C++11 Constexpr constructor templates and literal types Unknown
982[dcl.init.list] NAD Initialization with an empty initializer list Unknown
983[expr.unary.op] CD2 Ambiguous pointer-to-member constant Unknown
984[dcl.spec.auto] CD2 “Deduced type” is unclear in auto type deduction Unknown
985[lex.digraph] C++11 Alternative tokens and user-defined literals Unknown
986[namespace.udir] CD2 Transitivity of using-directives versus qualified lookup Unknown
987[basic.namespace] CD4 Which declarations introduce namespace members? Unknown
988[dcl.type.simple] CD2 Reference-to-reference collapsing with decltype Unknown
989[dcl.init.list] CD2 Misplaced list-initialization example Unknown
990[dcl.init.list] CD2 Value initialization with multiple initializer-list constructors Clang 3.5
991[dcl.constexpr] CD2 Reference parameters of constexpr functions and constructors Unknown
992[class.copy.ctor] NAD Inheriting explicitness Unknown
993[temp.point] C++11 Freedom to perform instantiation at the end of the translation unit Unknown
994[dcl.fct] C++11 braced-init-list as a default argument Unknown
995[temp.explicit] CD2 Incorrect example for using-declaration and explicit instantiation Unknown
996[temp.spec.partial] C++11 Ambiguous partial specializations of member class templates Unknown
997[basic.lookup.argdep] C++11 Argument-dependent lookup and dependent function template parameter types Unknown
998[dcl.fct] dup Function parameter transformations and template functions Unknown
999[over.match] CD2 “Implicit” or “implied” object argument/parameter? Unknown
1000[class.qual] CD2 Mistaking member typedefs for constructors Unknown
1001[dcl.fct] review Parameter type adjustment in dependent parameter types Not resolved
1002[temp.variadic] NAD Pack expansion for function arguments Unknown
1003[basic.start.main] CD3 Acceptable definitions of main Unknown
1004[temp.local] C++11 Injected-class-names as arguments for template template parameters Clang 5
1005[class.mfct.non.static] NAD Qualified name resolution in member functions of class templates Unknown
1006[temp.param] C++11 std::nullptr_t as a non-type template parameter Unknown
1007[class.protected] NAD Protected access and pointers to members Unknown
1008[expr.alignof] NAD Querying the alignment of an object Unknown
1009[temp] C++11 Missing cases in the declarator-id of a function template declaration Unknown
1010[expr.const] CD2 Address of object with dynamic storage duration in constant expression Unknown
1011[expr.static.cast] C++11 Standard conversions that cannot be inverted Unknown
1012[namespace.unnamed] C++11 Undeprecating static Unknown
1013[conv.lval] CD3 Uninitialized std::nullptr_t objects Unknown
1014[temp.deduct.call] NAD Overload resolution between const T& and T&& Unknown
1015[basic.lookup.argdep] C++11 Template arguments and argument-dependent lookup Unknown
1016[over] C++11 Overloadable declarations, function templates, and references Unknown
1017[class.mfct.non.static] C++11 Member access transformation in unevaluated operands Unknown
1018[dcl.pre] C++11 Ambiguity between simple-declaration and attribute-declaration Unknown
1019[class.derived] dup Dependent simple-template-ids in base-specifiers and mem-initializers Unknown
1020[class.copy.ctor] C++11 Implicitly-defined copy constructors and explicit base class constructors Unknown
1021[namespace.memdef] CD4 Definitions of namespace members Unknown
1022[dcl.enum] C++11 Can an enumeration variable have values outside the values of the enumeration? Unknown
1023[temp.arg.nontype] dup thread_local objects as non-type template arguments Unknown
1024[lex.ccon] CD3 Limits on multicharacter literals Unknown
1025[temp.arg.nontype] C++11 Use of a reference as a non-type template argument Unknown
1026[basic.lval] NAD Cv-qualified non-class rvalues Unknown
1027[basic.life] review Type consistency and reallocation of scalar types Not resolved
1028[temp.dep.res] CD6 Dependent names in non-defining declarations Unknown
1029[class.dtor] C++11 Type of a destructor call Unknown
1030[dcl.init.aggr] C++11 Evaluation order in initializer-lists used in aggregate initialization Unknown
1031[dcl.attr.grammar] C++11 Optional elements in attributes Unknown
1032[temp.variadic] C++11 Empty pack expansions Unknown
1033[dcl.align] C++11 Restrictions on alignment attributes Unknown
1034[expr.prim.lambda] C++11 Attributes for return statements in lambdas Unknown
1035[class.mem] C++11 Omitted and required decl-specifiers Unknown
1036[dcl.align] C++11 Alignment attribute in an exception-declaration Unknown
1037[expr.delete] C++11 Requirements for operands of delete-expressions and deallocation functions Unknown
1038[over.over] CD7 Overload resolution of &x.static_func Unknown
1039[dcl.align] dup Coordinating C and C++ alignment specifications Unknown
1040[intro.multithread] NAD Memory model issues Unknown
1041[class.mem] dup alias-declarations as class members Unknown
1042[dcl.pre] C++11 Attributes in alias-declarations Clang 3.5
1043[temp.dep.type] C++11 Qualified name lookup in the current instantiation Unknown
1044[basic.scope.pdecl] C++11 Point of declaration for an alias-declaration Unknown
1045[temp.explicit] NAD Requiring explicit instantiation declarations Unknown
1046[temp.explicit] open What is a “use” of a class specialization? Not resolved
1047[temp.dep.constexpr] C++11 When is typeid value-dependent? Unknown
1048[expr.prim.lambda] CD3 auto deduction and lambda return type deduction Clang 3.6
1049[class.copy.elision] open Copy elision through reference parameters of inline functions Not resolved
1050[basic.life] NAD Effects of thread support on object lifetime Unknown
1051[class.copy.ctor] C++11 Reference members and generated copy constructors Unknown
1052[class.copy.ctor] dup const non-static data member and PODness Unknown
1053[except.spec] NAD Terminate vs undefined behavior for noexcept violation Unknown
1054[stmt.expr] C++11 Lvalue-to-rvalue conversions in expression statements No
1055[basic.fundamental] C++11 Permissible uses of void Unknown
1056[temp.alias] C++11 Template aliases, member definitions, and the current instantiation Unknown
1057[temp.dep.type] C++11 decltype and the current instantiation Unknown
1058[dcl.init.ref] NAD Reference binding of incompatible array types Unknown
1059[basic.type.qualifier] CD3 Cv-qualified array types (with rvalues) Unknown
1060[expr.const] C++11 Scoped enumerators in integral constant expressions Unknown
1061[expr.new] C++11 Negative array bounds in a new-expression Unknown
1062[expr.prim.lambda] C++11 Syntax of attribute-specifiers in lambdas Unknown
1063[dcl.attr.override] C++11 [[hiding]] with non-attribute declarations Unknown
1064[class.copy.ctor] C++11 Defaulted move constructor for a union Unknown
1065[dcl.attr.override] C++11 [[hiding]] with [[override]] Unknown
1066[class.copy.assign] C++11 When is a copy/move assignment operator implicitly defined? Unknown
1067[dcl.attr.override] NAD [[hiding]], using-declarations, and multiple inheritance Unknown
1068[temp.param] C++11 Template aliases with default arguments and template parameter packs Unknown
1069[dcl.fct] C++11 Incorrect function type with trailing-return-type Unknown
1070[dcl.init.aggr] C++11 Missing initializer clauses in aggregate initialization Clang 3.5
1071[basic.types] C++11 Literal class types and trivial default constructors Unknown
1072[class.mem] C++11 Scoped enumerator with the same name as its containing class Unknown
1073[except.spec] C++11 Merging dynamic-exception-specifications and noexcept-specifications Unknown
1074[temp.dep.constexpr] C++11 Value-dependent noexcept-expressions Unknown
1075[dcl.type.simple] C++11 Grammar does not allow template alias in type-name Unknown
1076[basic.lval] CD5 Value categories and lvalue temporaries Unknown
1077[namespace.memdef] NAD Explicit specializations in non-containing namespaces Unknown
1078[dcl.init.list] NAD Narrowing and the usual arithmetic conversions Unknown
1079[over.ics.rank] C++11 Overload resolution involving aggregate initialization Unknown
1080[class.copy.ctor] C++11 Confusing relationship between templates and copy constructors Unknown
1081[class.dtor] C++11 Defaulted destructor and unusable operator delete Unknown
1082[class.copy.ctor] C++11 Implicit copy function if subobject has none? Unknown
1083[expr.call] C++11 Passing an object to ellipsis with non-trivial move constructor Unknown
1084[class.copy.ctor] NAD Conditions for a deleted move function Unknown
1085[class.copy.assign] NAD Move assignment operators and virtual bases Unknown
1086[expr.const.cast] C++11 const_cast to rvalue reference to function type Unknown
1087[over.match.copy] C++11 Additional applications of issue 899 Unknown
1088[temp.dep.constexpr] C++11 Dependent non-type template arguments Unknown
1089[basic.lookup.qual.general] open Template parameters in member selections Not resolved
1090[basic.align] C++11 Alignment of subobjects Unknown
1091[expr.mptr.oper] C++11 Inconsistent use of the term “object expression” Unknown
1092[class.copy.ctor] drafting Cycles in overload resolution during instantiation Not resolved
1093[dcl.init] CD3 Value-initializing non-objects Unknown
1094[expr.static.cast] C++11 Converting floating-point values to scoped enumeration types Unknown
1095[dcl.init.list] C++11 List-initialization of references Unknown
1096[temp] C++11 Missing requirement for template definitions Unknown
1097[dcl.init.aggr] NAD Aggregate initialization of function parameters Unknown
1098[expr.const] C++11 Pointer conversions in constant expressions Unknown
1099[expr.const] C++11 Infinite recursion in constexpr functions Unknown
1100[expr.const] C++11 constexpr conversion functions and non-type template arguments Unknown
1101[class.static.data] C++11 Non-integral initialized static data members Unknown
1102[intro.execution] C++11 Better example of undefined behavior Unknown
1103[lex.phases] C++11 Reversion of phase 1 and 2 transformations in raw string literals Unknown
1104[lex.digraph] C++11 Global-scope template arguments vs the <: digraph Unknown
1105[lex.name] C++11 Issues relating to TR 10176:2003 Unknown
1106[lex.nullptr] C++11 Need more detail in nullptr keyword description Unknown
1107[lex.ext] C++11 Overload resolution for user-defined integer literals Unknown
1108[lex.ext] NAD User-defined literals have not been implemented Unknown
1109[basic.def.odr] C++11 When is “use” a reference to the ODR meaning? Unknown
1110[basic.def.odr] NAD Incomplete return type should be allowed in decltype operand Clang 3.1
1111[basic.lookup.classref] C++11 Remove dual-scope lookup of member template names Partial
1112[basic.link] C++11 constexpr variables should have internal linkage like const Unknown
1113[basic.link] C++11 Linkage of namespace member of unnamed namespace Partial
1114[basic.life] C++11 Incorrect use of placement new in example Unknown
1115[basic.align] C++11 C-compatible alignment specification Unknown
1116[basic.life] CD4 Aliasing of union members Unknown
1117[expr] C++11 Incorrect note about xvalue member access expressions Unknown
1118[expr.prim.lambda.capture] NAD Implicit lambda capture via explicit copy constructor Unknown
1119[expr.ref] C++11 Missing case in description of member access ambiguity Unknown
1120[expr.reinterpret.cast] C++11 reinterpret_cast and void* Unknown
1121[expr.unary.op] C++11 Unnecessary ambiguity error in formation of pointer to member Unknown
1122[expr.sizeof] C++11 Circular definition of std::size_t Unknown
1123[expr.unary.noexcept] C++11 Destructors should be noexcept by default Unknown
1124[expr.mptr.oper] NAD Error in description of value category of pointer-to-member expression Unknown
1125[expr.const] C++11 Unclear definition of “potential constant expression” Unknown
1126[expr.const] C++11 constexpr functions in const initializers Unknown
1127[expr.const] C++11 Overload resolution in constexpr functions Unknown
1128[dcl.spec] C++11 attribute-specifiers in decl-specifier-seqs Unknown
1129[dcl.constexpr] C++11 Default nothrow for constexpr functions Unknown
1130[dcl.type.simple] C++11 Function parameter type adjustments and decltype Unknown
1131[dcl.type.elab] C++11 Template aliases in elaborated-type-specifiers Unknown
1132[dcl.attr.noreturn] NAD Keyword vs attribute for noreturn Unknown
1133[dcl.attr.override] C++11 Keywords vs attributes for control of hiding and overriding Unknown
1134[dcl.fct.def.default] C++11 When is an explicitly-defaulted function defined? Unknown
1135[dcl.fct.def.default] C++11 Explicitly-defaulted non-public special member functions Unknown
1136[dcl.fct.def.default] C++11 Explicitly-defaulted explicit constructors Unknown
1137[dcl.fct.def.default] C++11 Explicitly-defaulted virtual special member functions Unknown
1138[dcl.init.ref] C++11 Rvalue-ness check for rvalue reference binding is wrong Unknown
1139[dcl.init.ref] C++11 Rvalue reference binding to scalar xvalues Unknown
1140[class] C++11 Incorrect redefinition of POD class Unknown
1141[class.mem] NAD Non-static data member initializers have not been implemented Unknown
1142[class.mfct] C++11 friend declaration of member function of containing class Unknown
1143[class.mfct.non.static] NAD Move semantics for *this have not been implemented Unknown
1144[class.access.dcl] C++11 Remove access declarations Unknown
1145[class.ctor] C++11 Defaulting and triviality Unknown
1146[class.dtor] C++11 exception-specifications of defaulted functions Unknown
1147[class.dtor] C++11 Destructors should be default nothrow Unknown
1148[class.copy.elision] C++11 Copy elision and move construction of function parameters Unknown
1149[class.copy.ctor] C++11 Trivial non-public copy operators in subobjects Unknown
1150[class.inhctor] NAD Inheriting constructors have not been implemented N/A
1151[over.match.list] C++11 Overload resolution with initializer-list and non-list constructors Unknown
1152[over.match.viable] C++11 Rules for determining existence of implicit conversion sequence Unknown
1153[over.over] C++11 Type matching in address of overloaded function Unknown
1154[temp.arg.nontype] C++11 Address of thread_local variable as non-type template argument Unknown
1155[temp.arg.nontype] C++11 Internal-linkage non-type template arguments Unknown
1156[temp.func.order] C++11 Partial ordering in a non-call context Unknown
1157[temp.func.order] open Partial ordering of function templates is still underspecified Not resolved
1158[temp.alias] C++11 Recursive instantiation via alias template Unknown
1159[temp.alias] C++11 Class and enumeration definitions in template aliases Unknown
1160[temp.dep.type] C++11 Definitions of template members and the current instantiation Unknown
1161[temp.res] C++11 Dependent nested-name-specifier in a pointer-to-member declarator Unknown
1162[temp.res] NAD Dependent elaborated-type-specifiers in non-deduced contexts Unknown
1163[temp.explicit] NAD extern template prevents inlining functions not marked inline Unknown
1164[temp.deduct.call] C++11 Partial ordering of f(T&) and f(T&&) Unknown
1165[except.ctor] C++11 Exceptions when destroying array elements Unknown
1166[except.handle] C++11 exception-declarations that do not declare objects Unknown
1167[except.spec] C++11 function-try-blocks for destructors Unknown
1168[except.terminate] C++11 Additional reasons to call std::terminate Unknown
1169[cpp.predefined] C++11 Missing feature macro for strict pointer safety Unknown
1170[temp.deduct] C++11 Access checking during template argument deduction Unknown
1171[except.terminate] C++11 Partial stack unwinding with noexcept violation Unknown
1172[temp.deduct] drafting “instantiation-dependent” constructs Not resolved
1173[intro.execution] C++11 Unclear specification of effects of signal handling Unknown
1174[basic.def.odr] C++11 When is a pure virtual function “used?” Unknown
1175[lex.ext] C++11 Disambiguating user-defined literals Unknown
1176[intro.multithread] C++11 Definition of release sequence Unknown
1177[intro.multithread] C++11 Intra-thread dependency-ordered-before Unknown
1178[temp.deduct.decl] C++11 Deduction failure matching placement new Unknown
1179[temp.param] NAD Cv-qualification of non-type template parameters Unknown
1180[basic.align] C++11 Over-aligned class types Unknown
1181[basic.types] C++11 What is a “built-in type?” Unknown
1182[temp.variadic] C++11 Incorrect description of pack expansion syntax Unknown
1183[dcl.fct] C++11 Expansion of parameter packs in declarators Unknown
1184[temp.deduct.call] C++11 Argument conversions to nondeduced parameter types Unknown
1185[dcl.link] C++11 Misleading description of language linkage and member function types Unknown
1186[dcl.constexpr] C++11 Non-dependent constexpr violations in function templates Unknown
1187[basic.start.static] C++11 Problems in initialization example Unknown
1188[expr.const] C++11 Type punning in constant expressions Unknown
1189[intro.object] C++11 Address of distinct base class subobjects Unknown
1190[basic.stc.dynamic.safety] C++11 Operations on non-safely-derived pointers Unknown
1191[class.ctor] C++11 Deleted subobject destructors and implicitly-defined constructors Unknown
1192[basic.def.odr] C++11 Inadvertent change to ODR and templates Unknown
1193[expr.const] C++11 Use of address-constant pointers in constant expressions Unknown
1194[dcl.constexpr] C++11 Constexpr references Unknown
1195[dcl.constexpr] C++11 References to non-literal types in constexpr functions Unknown
1196[temp.explicit] C++11 Definition required for explicit instantiation after explicit specialization? Unknown
1197[expr.const] C++11 Constexpr arrays Unknown
1198[basic.types] C++11 Literal types and copy constructors Unknown
1199[dcl.constexpr] C++11 Deleted constexpr functions Unknown
1200[basic.lookup.unqual] CD6 Lookup rules for template parameters N/A
1201[basic.def] C++11 Are deleted and defaulted functions definitions? Unknown
1202[class.cdtor] C++11 Calling virtual functions during destruction Unknown
1203[class.static.data] dup Misleading note regarding initialized static data members Unknown
1204[stmt.iter] C++11 Specifiers in a for-range-declaration Unknown
1205[over.ics.ref] dup Lvalue reference binding and function viability Unknown
1206[temp.class] C++11 Defining opaque enumeration members of class templates Unknown
1207[class.mfct.non.static] C++11 Type of class member in trailing-return-type Unknown
1208[class.mfct.non.static] C++11 Explicit noexcept in defaulted definition Unknown
1209[basic.def.odr] open Is a potentially-evaluated expression in a template definition a “use?” Not resolved
1210[basic.scope.pdecl] C++11 Injection of elaborated-type-specifier in enumeration scope Unknown
1211[basic.align] open Misaligned lvalues Not resolved
1212[dcl.type.simple] C++11 Non-function-call xvalues and decltype Unknown
1213[expr.sub] CD3 Array subscripting and xvalues Clang 7
1214[dcl.init] C++11 Kinds of initializers Unknown
1215[class] C++11 Definition of POD struct Unknown
1216[except.spec] C++11 Exceptions “allowed” by a noexcept-specification Unknown
1217[dcl.fct.def.delete] NAD Are deleted functions implicitly noexcept? Unknown
1218[except.handle] C++11 What is the “currently-handled exception” in a multi-threaded program? Unknown
1219[basic.types] C++11 Non-static data member initializers in constant expressions Unknown
1220[basic.lookup.classref] C++11 Looking up conversion-type-ids Unknown
1221[temp.deduct.partial] open Partial ordering and reference collapsing Not resolved
1222[dcl.array] NAD Unnecessary restriction on auto array types Unknown
1223[stmt.ambig] CD7 Syntactic disambiguation and trailing-return-types Clang 17
1224[class.copy.ctor] C++11 constexpr defaulted copy constructors Unknown
1225[dcl.constexpr] C++11 constexpr constructors and virtual bases Unknown
1226[dcl.fct.default] CD3 Converting a braced-init-list default argument Unknown
1227[temp.deduct] CD3 Mixing immediate and non-immediate contexts in deduction failure Clang 3.0
1228[over.match.list] NAD Copy-list-initialization and explicit constructors Unknown
1229[over.match.list] C++11 Overload resolution with empty braced-init-list argument Unknown
1230[expr.unary.op] dup Confusing description of ambiguity of destructor name Unknown
1231[temp.variadic] C++11 Variadic templates requiring an empty pack expansion Unknown
1232[dcl.init.list] C++11 Creation of array temporaries using a braced-init-list Unknown
1233[temp.dep] C++11 Pack expansions and dependent calls Unknown
1234[dcl.name] C++11 abstract-declarator does not permit ... after ptr-operator Unknown
1235[temp.func.order] C++11 “Unused” ellipsis and default arguments in partial ordering Unknown
1236[dcl.init.ref] C++11 Inconsistently-interrelated examples Unknown
1237[class.temporary] C++11 Deprecated implicit copy assignment in example Unknown
1238[over.ics.rank] C++11 Overloading ambiguity binding reference to function Unknown
1239[lex.ext] C++11 Hexadecimal floating-point literals vs user-defined literals Unknown
1240[dcl.name] C++11 constexpr defaulted constructors Unknown
1241[class.dtor] C++11 Which members does a destructor destroy? Unknown
1242[class.base.init] C++11 Initializing variant class members Unknown
1243[dcl.decl] C++11 Misleading footnote regarding multiple-declarator declarations Unknown
1244[temp.type] C++11 Equivalence of alias templates and class templates Unknown
1245[temp.mem.func] C++11 Matching declarations involving decltype Unknown
1246[temp.param] C++11 Non-deduced non-final parameter packs Unknown
1247[dcl.typedef] CD4 Restriction on alias name appearing in type-id Unknown
1248[diff.iso] open Updating Annex C to C99 and C23 Not resolved
1249[expr.prim.lambda.capture] CD6 Cv-qualification of nested lambda capture Unknown
1250[class.virtual] CD3 Cv-qualification of incomplete virtual function return types Clang 3.9
1251[diff.conv] CD3 C compatibility: casting to unqualified void* Unknown
1252[over.load] CD6 Overloading member function templates based on dependent return type Unknown
1253[temp.spec] C++17 Generic non-template members Unknown
1254[basic.def.odr] NAD odr-use vs template arguments and constexpr functions Unknown
1255[expr.const] drafting Definition problems with constexpr functions Not resolved
1256[expr.const] open Unevaluated operands are not necessarily constant expressions Not resolved
1257[temp.res] open Instantiation via non-dependent references in uninstantiated templates Not resolved
1258[temp.point] CD5 “Instantiation context” differs from dependent lookup rules Unknown
1259[expr.delete] NAD Deleting a POD via a pointer to base Unknown
1260[basic.def.odr] CD3 Incorrect use of term “overloaded” in description of odr-use Unknown
1261[expr] CD3 Explicit handling of cv-qualification with non-class prvalues Unknown
1262[temp.deduct] CD3 Default template arguments and deduction failure Unknown
1263[dcl.init.ref] NAD Mismatch between rvalue reference binding and overload resolution Unknown
1264[expr.const] CD3 Use of this in constexpr constructor Unknown
1265[dcl.spec.auto] CD3 Mixed use of the auto specifier Clang 5
1266[lex.ext] open user-defined-integer-literal overflow Not resolved
1267[except.spec] CD3 Rvalue reference types in exception-specifications Unknown
1268[expr.reinterpret.cast] CD3 reinterpret_cast of an xvalue operand Unknown
1269[expr.dynamic.cast] CD3 dynamic_cast of an xvalue operand Unknown
1270[dcl.init.list] CD3 Brace elision in array temporary initialization Unknown
1271[temp.res] CD5 Imprecise wording regarding dependent types Unknown
1272[class.static.data] NAD Implicit definition of static data member of const literal type Unknown
1273[temp.deduct] NAD Accessibility and function signatures Unknown
1274[stmt.ranged] CD4 Common nonterminal for expression and braced-init-list Unknown
1275[temp.param] CD3 Incorrect comment in example of template parameter pack restriction Unknown
1276[basic.fundamental] NAD Reference to stdint.h Unknown
1277[cstdint.syn] NAD Lax definition of intmax_t and uintmax_t Unknown
1278[over.call.func] drafting Incorrect treatment of contrived object Not resolved
1279[diff.cpp03] open Additional differences between C++ 2003 and C++ 2011 Not resolved
1280[basic.life] NAD Object reallocation and reference members Unknown
1281[temp.dep.type] NAD Virtual and dependent base classes Unknown
1282[except.spec] CD3 Underspecified destructor exception-specification Unknown
1283[class.static.data] open Static data members of classes with typedef name for linkage purposes Not resolved
1284[basic.life] CD4 Should the lifetime of an array be independent of that of its elements? Unknown
1285[basic.life] NAD Trivial destructors and object lifetime Unknown
1286[temp.alias] open Equivalence of alias templates Not resolved
1287[dcl.init.ref] C++14 Direct initialization vs “implicit” conversion in reference binding Unknown
1288[dcl.init.list] CD3 Reference list initialization Unknown
1289[temp.dep.type] NAD Can an alias template name the current instantiation? Unknown
1290[dcl.init.list] CD3 Lifetime of the underlying array of an initializer_list member Unknown
1291[basic.lookup.classref] CD6 Looking up a conversion-type-id N/A
1292[temp.dep] CD4 Dependent calls with braced-init-lists containing a pack expansion Unknown
1293[expr.const] CD3 String literals in constant expressions Unknown
1294[basic.start.static] open Side effects in dynamic/static initialization Not resolved
1295[dcl.init.ref] CD3 Binding a reference to an rvalue bit-field Clang 4
1296[temp.res] CD3 Ill-formed template declarations (not just definitions) Unknown
1297[dcl.decl] CD3 Misplaced function attribute-specifier Unknown
1298[over.ics.rank] CD3 Incorrect example in overload resolution Unknown
1299[class.temporary] CD5 “Temporary objects” vs “temporary expressions” Unknown
1300[expr.type.conv] dup T() for array types Unknown
1301[dcl.init] CD3 Value initialization of union Unknown
1302[basic.fundamental] CD3 noexcept applied to expression of type void Unknown
1303[temp] NAD C language linkage for template with internal linkage Unknown
1304[dcl.init.string] drafting Omitted array bound with string initialization Not resolved
1305[expr.alignof] CD3 alignof applied to array of unknown size Clang 3.0
1306[class.this] CD3 Modifying an object within a const member function Unknown
1307[over.ics.list] C++14 Overload resolution based on size of array initializer-list Clang 14
1308[class.mem] CD3 Completeness of class type within an exception-specification Superseded by 1330
1309[temp.dep.type] CD4 Incorrect note regarding lookup of a member of the current instantiation Unknown
1310[class.qual] CD3 What is an “acceptable lookup result?” Clang 5
1311[expr.const] CD3 Volatile lvalues in constant expressions Unknown
1312[expr.const] CD3 Simulated reinterpret_cast in constant expressions Unknown
1313[expr.const] CD3 Undefined pointer arithmetic in constant expressions Unknown
1314[expr.add] NAD Pointer arithmetic within standard-layout objects Unknown
1315[temp.spec.partial.general] CD4 Restrictions on non-type template arguments in partial specializations Partial
1316[dcl.constexpr] NAD constexpr function requirements and class scope Unknown
1317[dcl.enum] NAD Unnamed scoped enumerations Unknown
1318[class] CD3 Syntactic ambiguities with final Unknown
1319[temp.param] NAD Error in pack expansion example Unknown
1320[expr.static.cast] CD3 Converting scoped enumerations to bool Unknown
1321[temp.over.link] CD3 Equivalency of dependent calls Unknown
1322[temp.deduct] drafting Function parameter type decay in templates Not resolved
1323[dcl.attr.grammar] NAD Nonexistent nonterminal in alignment-specifier grammar Unknown
1324[dcl.init] CD3 Value initialization and defaulted constructors Unknown
1325[dcl.pre] NAD Omitted declarator in friend declarations Unknown
1326[temp.deduct.call] dup Deducing an array bound from an initializer-list Unknown
1327[dcl.fct.def.default] CD3 virt-specifier in a defaulted definition Unknown
1328[dcl.init.ref] CD3 Conflict in reference binding vs overload resolution Unknown
1329[implimits] CD3 Recursive deduction substitutions Unknown
1330[temp.deduct] CD3 Delayed instantiation of noexcept specifiers Clang 4 (C++11 onwards)
1331[dcl.fct.def.default] CD5 const mismatch with defaulted copy constructor Unknown
1332[lex.charset] CD5 Handling of invalid universal-character-names Unknown
1333[dcl.fct.def.default] CD3 Omission of const in a defaulted copy constructor Unknown
1334[basic.types] NAD Layout compatibility and cv-qualification Superseded by 1719
1335[cpp.stringize] CD6 Stringizing, extended characters, and universal-character-names Unknown
1336[class.conv.ctor] CD3 Definition of “converting constructor” Unknown
1337[temp.deduct.partial] dup Partial ordering and non-deduced parameters Unknown
1338[basic.stc.dynamic.allocation] CD4 Aliasing and allocation functions Unknown
1339[dcl.init] NAD Parenthesized braced-init-list and arrays Unknown
1340[expr.mptr.oper] CD3 Complete type in member pointer expressions Clang 2.9
1341[class.mem] NAD Bit-field initializers Superseded by P0683R1
1342[dcl.decl] CD6 Order of initialization with multiple declarators Unknown
1343[intro.execution] C++17 Sequencing of non-class initialization Unknown
1344[class.copy.ctor] C++14 Adding new special member functions to a class via default arguments Unknown
1345[class.base.init] CD3 Initialization of anonymous union class members Unknown
1346[dcl.spec.auto] CD3 expression-list initializers and the auto specifier Clang 3.5
1347[dcl.spec.auto] CD3 Consistency of auto in multiple-declarator declarations Clang 3.1
1348[dcl.spec.auto] drafting Use of auto in a trailing-return-type Not resolved
1349[temp.alias] dup Consistency of alias template redeclarations Unknown
1350[class.inhctor] CD3 Incorrect exception specification for inherited constructors Clang 3.5
1351[except.spec] CD4 Problems with implicitly-declared exception-specifications Unknown
1352[basic.scope.class] CD3 Inconsistent class scope and completeness rules Clang 3.0
1353[class.ctor] CD7 Array and variant members and deleted special member functions Unknown
1354[expr.unary.noexcept] CD3 Destructor exceptions for temporaries in noexcept expressions Unknown
1355[dcl.fct.def.default] CD3 Aggregates and “user-provided” constructors Unknown
1356[except.spec] CD4 Exception specifications of copy assignment operators with virtual bases Unknown
1357[class.mem] CD3 brace-or-equal-initializers for function and typedef members Unknown
1358[dcl.constexpr] CD3 Unintentionally ill-formed constexpr function template instances Clang 3.1
1359[dcl.constexpr] CD3 constexpr union constructors Clang 3.5
1360[class.ctor] CD6 constexpr defaulted default constructors Unknown
1361[basic.types] CD3 Requirement on brace-or-equal-initializers of literal types Unknown
1362[basic.def.odr] CD3 Complete type required for implicit conversion to T& Unknown
1363[class] CD3 Triviality vs multiple default constructors Unknown
1364[expr.const] CD3 constexpr function parameters Unknown
1365[expr.const] CD3 Calling undefined constexpr functions Unknown
1366[dcl.constexpr] CD3 Deleted constexpr constructors and virtual base classes Unknown
1367[expr.const] CD3 Use of this in a constant expression Unknown
1368[dcl.init] CD3 Value initialization and defaulted constructors (part 2) Unknown
1369[dcl.constexpr] CD3 Function invocation substitution of this Unknown
1370[cpp.replace] CD3 identifier-list cannot contain ellipsis Unknown
1371[temp.deduct.type] NAD Deduction from T&& in return types Unknown
1372[temp.deduct.conv] CD3 Cross-references incorrect in conversion function template argument deduction Unknown
1373[over.match.ref] dup Overload resolution changes matching reference-binding changes Unknown
1374[over.ics.rank] CD3 Qualification conversion vs difference in reference binding Unknown
1375[class.union] CD3 Reference to anonymous union? Unknown
1376[expr.static.cast] C++14 static_cast of temporary to rvalue reference Unknown
1377[diff.cpp03] dup Access declarations not mentioned in Annex C Unknown
1378[temp.inst] CD5 When is an instantiation required? Unknown
1379[dcl.init.list] NAD Is std::initializer_list an aggregate? Unknown
1380[dcl.fct] CD3 Type definitions in template-parameter parameter-declarations Unknown
1381[except.spec] CD3 Implicitly-declared special member functions and default nothrow Unknown
1382[dcl.decl] CD3 Dead code for constructor names Unknown
1383[expr] CD3 Clarifying discarded-value expressions Unknown
1384[expr.const] NAD reinterpret_cast in constant expressions Unknown
1385[over.match.oper] CD3 Syntactic forms of conversion functions for surrogate call functions Unknown
1386[temp.arg.explicit] NAD Explicitly-specified partial argument list with multiple parameter packs Unknown
1387[temp.deduct.type] CD3 Missing non-deduced context for decltype Unknown
1388[temp.deduct.call] CD3 Missing non-deduced context following a function parameter pack Clang 4
1389[dcl.fct] NAD Recursive reference in trailing-return-type Unknown
1390[temp.dep.type] drafting Dependency of alias template specializations Not resolved
1391[temp.arg.explicit] CD4 Conversions to parameter types with non-deduced template arguments Partial
1392[over.match.ref] CD3 Explicit conversion functions for references and non-references Unknown
1393[temp.variadic] C++17 Pack expansions in using-declarations Unknown
1394[dcl.fct] CD3 Incomplete types as parameters of deleted functions Clang 15
1395[temp.deduct.type] C++17 Partial ordering of variadic templates reconsidered Clang 16
1396[temp.inst] C++23 Deferred instantiation and checking of non-static data member initializers Unknown
1397[class.mem] CD4 Class completeness in non-static data member initializers Clang 3.2
1398[temp.arg.nontype] CD3 Non-type template parameters of type std::nullptr_t Unknown
1399[temp.deduct.call] CD3 Deduction with multiple function parameter packs Duplicate of 1388
1400[expr.eq] NAD Function pointer equality Unknown
1401[dcl.init.ref] CD3 Similar types and reference compatibility Unknown
1402[class.copy.ctor] CD3 Move functions too often deleted Unknown
1403[lex.comment] CD6 Universal-character-names in comments Unknown
1404[class.union] open Object reallocation in unions Not resolved
1405[basic.types] CD3 constexpr and mutable members of literal types Unknown
1406[temp.func.order] CD3 ref-qualifiers and added parameters of non-static member function templates Unknown
1407[expr.const] NAD Integral to bool conversion in converted constant expressions Unknown
1408[over.ics.rank] CD3 What is “the same aggregate initialization?” Unknown
1409[over.ics.list] CD3 What is the second standard conversion sequence of a list-initialization sequence? Unknown
1410[over.ics.rank] CD3 Reference overload tiebreakers should apply to rvalue references Unknown
1411[class] CD3 More on global scope :: in nested-name-specifier Unknown
1412[expr.static.cast] CD3 Problems in specifying pointer conversions Unknown
1413[temp.dep.constexpr] CD3 Missing cases of value-dependency Clang 12
1414[dcl.init.ref] drafting Binding an rvalue reference to a reference-unrelated lvalue Not resolved
1415[basic.link] CD3 Missing prohibition of block-scope definition of extern object Unknown
1416[expr.typeid] CD3 Function cv-qualifiers and typeid Unknown
1417[dcl.fct] C++14 Pointers/references to functions with cv-qualifiers or ref-qualifier Unknown
1418[dcl.init.list] CD3 Type of initializer_list backing array Unknown
1419[dcl.init.list] NAD Evaluation order in aggregate initialization Unknown
1420[class.abstract] NAD Abstract final classes Unknown
1421[dcl.init.list] NAD Full expressions and aggregate initialization Unknown
1422[lex.ccon] dup Type of character literals containing universal-character-names Unknown
1423[conv.fctptr] CD3 Convertibility of nullptr to bool Clang 11
1424[except.ctor] C++14 When must sub-object destructors be accessible? Unknown
1425[class.mem] CD3 Base-class subobjects of standard-layout structs N/A (ABI constraint)
1426[dcl.fct.def.default] CD5 Allowing additional parameter types in defaulted functions Unknown
1427[class.ctor] NAD Default constructor and deleted or inaccessible destructors Unknown
1428[basic.type.qualifier] CD3 Dynamic const objects Unknown
1429[basic.scope.temp] NAD Scope of a member template's template parameter Unknown
1430[temp.alias] open Pack expansion into fixed alias template parameter list Not resolved
1431[except] CD3 Exceptions from other than throw-expressions Unknown
1432[temp.variadic] open Newly-ambiguous variadic template expansions Not resolved
1433[basic.scope.pdecl] NAD trailing-return-type and point of declaration Unknown
1434[dcl.init] NAD Parenthesized braced-init-list Unknown
1435[dcl.meaning] CD3 template-id as the declarator for a class template constructor Unknown
1436[cpp.cond] open Interaction of constant expression changes with preprocessor expressions Not resolved
1437[dcl.typedef] CD3 alignas in alias-declaration Unknown
1438[basic.stc.dynamic.safety] CD3 Non-dereference use of invalid pointers Unknown
1439[namespace.memdef] CD3 Lookup and friend template declarations Unknown
1440[expr.prim.general] CD3 Acceptable decltype-specifiers used as nested-name-specifiers Unknown
1441[intro.execution] C++14 Unclear wording for signal handler restrictions Unknown
1442[stmt.ranged] CD3 Argument-dependent lookup in the range-based for Unknown
1443[dcl.fct.default] NAD Default arguments and non-static data members Clang 2.7
1444[temp.param] drafting Type adjustments of non-type template parameters Not resolved
1445[stmt.ranged] dup Argument-dependent lookup of begin and end Unknown
1446[temp.func.order] CD4 Member function with no ref-qualifier and non-member function with rvalue reference Unknown
1447[expr.static.cast] CD3 static_cast of bit-field lvalue to rvalue reference Unknown
1448[basic.fundamental] NAD Integral values of type bool Unknown
1449[dcl.init.list] CD3 Narrowing conversion of negative value to unsigned type Unknown
1450[expr.mul] CD3 INT_MIN % -1 Unknown
1451[temp.arg.nontype] CD4 Objects with no linkage in non-type template arguments Unknown
1452[expr.const] NAD Value-initialized objects may be constants Unknown
1453[basic.types] CD3 Volatile members in literal classes? Unknown
1454[expr.const] CD3 Passing constants through constexpr functions via references Unknown
1455[expr.const] CD3 Lvalue converted constant expressions Unknown
1456[expr.const] CD3 Address constant expression designating the one-past-the-end address Unknown
1457[expr.shift] CD3 Undefined behavior in left-shift Unknown
1458[expr.unary.op] CD3 Address of incomplete type vs operator&() Clang 3.1
1459[over.ics.rank] open Reference-binding tiebreakers in overload resolution Not resolved
1460[class.union] C++14 What is an empty union? Clang 3.5
1461[dcl.init.list] NAD Narrowing conversions to bit-fields Unknown
1462[temp.deduct] CD3 Deduction failure vs “ill-formed, no diagnostic required” Unknown
1463[temp.pre] drafting extern "C" alias templates Not resolved
1464[expr.new] CD3 Negative array bound in a new-expression Unknown
1465[expr.unary.noexcept] CD4 noexcept and std::bad_array_new_length Unknown
1466[intro.multithread] C++14 Visible sequences of side effects are redundant Unknown
1467[dcl.init.list] CD4 List-initialization of aggregate from same-type object Clang 3.7 (C++11 onwards)
1468[expr.prim.lambda.capture] CD5 typeid, overload resolution, and implicit lambda capture Unknown
1469[expr.new] CD5 Omitted bound in array new-expression Unknown
1470[intro.multithread] NAD Thread migration Unknown
1471[temp.dep.type] CD3 Nested type of non-dependent base Unknown
1472[basic.def.odr] CD3 odr-use of reference variables Unknown
1473[over.literal] CD3 Syntax of literal-operator-id Unknown
1474[lex.ext] NAD User-defined literals and <inttypes.h> format macros Unknown
1475[dcl.attr.depend] CD3 Errors in [[carries_dependency]] example Unknown
1476[intro.defs] CD3 Definition of user-defined type Unknown
1477[namespace.memdef] CD3 Definition of a friend outside its namespace Clang 2.7
1478[temp.names] CD6 template keyword for dependent template template arguments Unknown
1479[over.literal] CD3 Literal operators and default arguments Clang 3.1
1480[expr.const] CD3 Constant initialization via non-constant temporary Unknown
1481[over.inc] CD3 Increment/decrement operators with reference parameters Unknown
1482[basic.scope.pdecl] CD3 Point of declaration of enumeration Clang 3.0
1483[temp.res] NAD Non-dependent static_assert-declarations Unknown
1484[temp.inst] CD4 Unused local classes of function templates Unknown
1485[dcl.enum] drafting Out-of-class definition of member unscoped opaque enumeration Not resolved
1486[temp.deduct.funcaddr] drafting Base-derived conversion in member pointer deduction Not resolved
1487[class.inhctor] CD3 When are inheriting constructors declared? Clang 3.3
1488[dcl.name] drafting abstract-pack-declarators in type-ids Not resolved
1489[basic.start.static] CD3 Is value-initialization of an array constant initialization? Unknown
1490[dcl.init.list] CD4 List-initialization from a string literal Clang 3.7 (C++11 onwards)
1491[class.copy.ctor] CD3 Move construction and rvalue reference members Unknown
1492[class.dtor] CD4 Exception specifications on template destructors Unknown
1493[class.copy.ctor] C++14 Criteria for move-construction Unknown
1494[dcl.init.list] CD3 Temporary initialization for reference binding in list-initialization Unknown
1495[temp.spec.partial] CD3 Partial specialization of variadic class template Clang 4
1496[class.name] CD4 Triviality with deleted and missing default constructors No
1497[dcl.init.aggr] NAD Aggregate initialization with parenthesized string literal Unknown
1498[stmt.ranged] dup Lifetime of temporaries in range-based for Unknown
1499[class.copy.assign] CD7 Missing case for deleted move assignment operator Unknown
1500[temp.dep.candidate] CD6 Name lookup of dependent conversion function Unknown
1501[dcl.init.list] NAD Nested braces in list-initialization Unknown
1502[dcl.init] CD3 Value initialization of unions with member initializers Unknown
1503[except.throw] CD3 Exceptions during copy to exception object Unknown
1504[expr.add] CD3 Pointer arithmetic after derived-base conversion Unknown
1505[dcl.init.list] dup Direct binding of reference to temporary in list-initialization Unknown
1506[dcl.init.list] CD3 Value category of initializer_list object Unknown
1507[dcl.init] CD3 Value initialization with trivial inaccessible default constructor Unknown
1508[dcl.init.list] C++14 Template initializer-list constructors Unknown
1509[intro.defs] C++14 Definition of “non-template function” Unknown
1510[dcl.ref] CD3 cv-qualified references via decltype Unknown
1511[basic.def.odr] CD3 const volatile variables and the one-definition rule Unknown
1512[expr.rel] CD3 Pointer comparison vs qualification conversions Clang 4
1513[temp.deduct.call] drafting initializer_list deduction failure Not resolved
1514[class.bit] C++14 Ambiguity between enumeration definition and zero-length bit-field Clang 11
1515[basic.fundamental] CD3 Modulo 2n arithmetic for implicitly-unsigned types Unknown
1516[expr.call] CD3 Definition of “virtual function call” Unknown
1517[class.cdtor] open Unclear/missing description of behavior during construction/destruction Not resolved
1518[dcl.init.list] CD4 Explicit default constructors and copy-list-initialization Clang 4
1519[temp.variadic] NAD Conflicting default and variadic constructors Unknown
1520[temp.alias] NAD Alias template specialization vs pack expansion Unknown
1521[expr.type.conv] dup T{expr} with reference types Unknown
1522[dcl.init.list] CD3 Access checking for initializer_list array initialization Unknown
1523[stmt.ranged] CD5 Point of declaration in range-based for Unknown
1524[temp.dep.type] drafting Incompletely-defined class template base Not resolved
1525[expr.type.conv] NAD Array bound inference in temporary array Unknown
1526[temp.dep] dup Dependent-class lookup in the current instantiation Unknown
1527[expr.assign] CD3 Assignment from braced-init-list Unknown
1528[dcl.decl] CD3 Repeated cv-qualifiers in declarators Unknown
1529[basic.pre] drafting Nomenclature for variable vs reference non-static data member Not resolved
1530[basic.life] drafting Member access in out-of-lifetime objects Not resolved
1531[intro.defs] CD3 Definition of “access” (verb) Unknown
1532[temp.explicit] CD3 Explicit instantiation and member templates Unknown
1533[temp.variadic] CD3 Function pack expansion for member initialization Unknown
1534[basic.lval] dup cv-qualification of prvalue of type “array of class” Unknown
1535[expr.const] CD3 typeid in core constant expressions Unknown
1536[over.ics.list] drafting Overload resolution with temporary from initializer list Not resolved
1537[expr.const] CD3 Optional compile-time evaluation of constant expressions Unknown
1538[expr.assign] CD3 C-style cast in braced-init-list assignment Unknown
1539[basic.fundamental] CD3 Definition of “character type” Unknown
1540[expr.const] NAD Use of address constants in constant expressions Unknown
1541[stmt.return] CD3 cv void return types Unknown
1542[expr.assign] open Compound assignment of braced-init-list Not resolved
1543[over.ics.list] CD3 Implicit conversion sequence for empty initializer list Unknown
1544[dcl.stc] CD3 Linkage of member of unnamed namespace Unknown
1545[temp.friend] NAD friend function templates defined in class templates Unknown
1546[temp.deduct] NAD Errors in function template default arguments Unknown
1547[temp.res] NAD typename keyword in alias-declarations Unknown
1548[class.copy.ctor] open Copy/move construction and conversion functions Not resolved
1549[over.binary] open Overloaded comma operator with void operand Not resolved
1550[expr.cond] CD3 Parenthesized throw-expression operand of conditional-expression Clang 3.4
1551[namespace.udecl] C++14 Wording problems in using-declaration specification Unknown
1552[dcl.fct.def.default] CD4 exception-specifications and defaulted special member functions Unknown
1553[expr.sizeof] CD3 sizeof and xvalue bit-fields Unknown
1554[temp.alias] drafting Access and alias templates Not resolved
1555[expr.call] NAD Language linkage and function type compatibility Unknown
1556[over.match.copy] CD3 Constructors and explicit conversion functions in direct initialization Unknown
1557[expr.prim.lambda.closure] CD3 Language linkage of converted lambda function pointer Unknown
1558[temp.alias] CD4 Unused arguments in alias template specializations Clang 12
1559[expr.new] CD3 String too long in initializer list of new-expression Unknown
1560[expr.cond] CD3 Gratuitous lvalue-to-rvalue conversion in conditional-expression with throw-expression operand Clang 3.5
1561[dcl.init.aggr] CD4 Aggregates with empty base classes Unknown
1562[class.base.init] C++14 Non-static data member initializers and union ctor-initializer Unknown
1563[over.over] CD3 List-initialization and overloaded function disambiguation Clang 3.1
1564[dcl.spec.auto] NAD Template argument deduction from an initializer list Unknown
1565[dcl.init.list] NAD Copy elision and lifetime of initializer_list underlying array Unknown
1566[expr.new] NAD Should new std::initializer_list<T> be ill-formed? Unknown
1567[class.inhctor] C++14 Inheriting constructors and copy/move constructors Clang 3.3
1568[class.temporary] dup Temporary lifetime extension with intervening cast Unknown
1569[temp.deduct.type] C++14 Deducing a function parameter pack before ellipsis Unknown
1570[temp.arg.nontype] C++14 Address of subobject as non-type template argument Unknown
1571[dcl.init.ref] CD4 cv-qualification for indirect reference binding via conversion function Unknown
1572[dcl.init.ref] CD4 Incorrect example for rvalue reference binding via conversion function Unknown
1573[class.inhctor] CD4 Inherited constructor characteristics Clang 3.9
1574[dcl.fct.def.default] NAD Explicitly-defaulted constexpr functions in wrapper templates Unknown
1575[basic.stc.dynamic.safety] C++14 Incorrect definition of “strict pointer safety” Unknown
1576[expr] C++14 Discarded-value volatile xvalues Unknown
1577[temp.spec.partial.general] NAD Unnecessary restrictions on partial specializations Unknown
1578[dcl.init] NAD Value-initialization of aggregates Unknown
1579[class.copy.ctor] C++14 Return by converting move constructor Clang 3.9
1580[dcl.fct.default] drafting Default arguments in explicit instantiations Not resolved
1581[basic.def.odr] CD5 When are constexpr member functions defined? Unknown
1582[temp.deduct] drafting Template default arguments and deduction failure Not resolved
1583[intro.execution] C++14 Incorrect example of unspecified behavior Unknown
1584[temp.deduct.call] drafting Deducing function types from cv-qualified types Not resolved
1585[expr.ref] NAD Value category of member access of rvalue reference member Unknown
1586[class.dtor] NAD Naming a destructor via decltype Unknown
1587[dcl.constexpr] C++14 constexpr initialization and nested anonymous unions Unknown
1588[dcl.spec.auto] CD3 Deducing cv-qualified auto Unknown
1589[over.ics.rank] CD4 Ambiguous ranking of list-initialization sequences Clang 3.7 (C++11 onwards)
1590[class.copy.ctor] CD4 Bypassing non-copy/move constructor copying Unknown
1591[temp.deduct.call] CD4 Deducing array bound and element type from initializer list Unknown
1592[temp.arg.template] C++14 When do template parameters match? Unknown
1593[class.copy.ctor] C++14 “Parameter type” of special member functions Unknown
1594[class.copy.ctor] drafting Lazy declaration of special members vs overload errors Not resolved
1595[dcl.constexpr] C++14 Constructors “involved in” subobject initialization Unknown
1596[expr.rel] CD4 Non-array objects as array[1] Unknown
1597[dcl.constexpr] CD3 Misleading constexpr example Unknown
1598[expr.eq] C++14 Criterion for equality of pointers to members Unknown
1599[dcl.init.list] CD4 Lifetime of initializer_list underlying array Unknown
1600[dcl.type.simple] CD4 Erroneous reference initialization in example Unknown
1601[conv.prom] C++14 Promotion of enumeration with fixed underlying type Clang 10
1602[temp.inst] review Linkage of specialization vs linkage of template arguments Not resolved
1603[basic.link] CD4 Errors resulting from giving unnamed namespaces internal linkage Unknown
1604[dcl.init.ref] C++14 Double temporaries in reference initialization Unknown
1605[class.dtor] CD3 Misleading parenthetical comment for explicit destructor call Unknown
1606[expr.sizeof] NAD sizeof closure class Clang 3.1
1607[expr.prim.lambda] C++14 Lambdas in template parameters Unknown
1608[over.match.oper] C++14 Operator lookup in trailing return type Unknown
1609[dcl.fct.default] open Default arguments and function parameter packs Not resolved
1610[temp.deduct.partial] drafting Cv-qualification in deduction of reference to array Not resolved
1611[class.ctor] C++14 Deleted default constructor for abstract class Duplicate of 1658
1612[expr.prim.lambda.capture] C++14 Implicit lambda capture and anonymous unions Unknown
1613[expr.prim.lambda.capture] C++14 Constant expressions and lambda capture Unknown
1614[basic.def.odr] CD4 Address of pure virtual function vs odr-use Unknown
1615[dcl.align] CD4 Alignment of types, variables, and members Unknown
1616[stmt.ambig] CD6 Disambiguation parsing and template parameters Unknown
1617[dcl.align] open alignas and non-defining declarations Not resolved
1618[dcl.enum] C++14 Gratuitously-unsigned underlying enum type Unknown
1619[temp.dep.type] open Definition of current instantiation Not resolved
1620[over.literal] open User-defined literals and extended integer types Not resolved
1621[class.base.init] C++20 Member initializers in anonymous unions Unknown
1622[dcl.init.aggr] C++17 Empty aggregate initializer for union Unknown
1623[class.ctor] drafting Deleted default union constructor and member initializers Not resolved
1624[except.ctor] NAD Destruction of union members with member initializers Unknown
1625[cpp.stringize] open Adding spaces between tokens in stringizing Not resolved
1626[expr.const] dup constexpr member functions in brace-or-equal-initializers Unknown
1627[dcl.align] NAD Agreement of dependent alignas specifiers Unknown
1628[expr.new] open Deallocation function templates Not resolved
1629[expr.prim.lambda.closure] C++14 Can a closure class be a literal type? Unknown
1630[dcl.init] CD4 Multiple default constructor templates Unknown
1631[over.ics.list] CD4 Incorrect overload resolution for single-element initializer-list Clang 3.7
1632[expr.prim.lambda.capture] CD5 Lambda capture in member initializers Unknown
1633[dcl.init] CD4 Copy-initialization in member initialization Unknown
1634[basic.stc] open Temporary storage duration Not resolved
1635[temp.param] drafting How similar are template default arguments to function default arguments? Not resolved
1636[dcl.enum] CD5 Bits required for negative enumerator values Unknown
1637[dcl.constexpr] NAD Recursion in constexpr template default constructor Unknown
1638[dcl.enum] CD4 Declaring an explicit specialization of a scoped enumeration Clang 3.1
1639[except.spec] CD4 exception-specifications and pointer/pointer-to-member expressions Unknown
1640[dcl.array] CD5 Array of abstract instance of class template Unknown
1641[class.base.init] NAD Assignment in member initializer Unknown
1642[expr.compound] CD7 Missing requirements for prvalue operands Unknown
1643[temp.param] NAD Default arguments for template parameter packs Unknown
1644[temp.over.link] NAD Equivalent exception-specifications in function template declarations Unknown
1645[class.inhctor] CD4 Identical inheriting constructors via default arguments Clang 3.9
1646[expr.call] CD5 decltype-specifiers, abstract classes, and deduction failure Unknown
1647[temp.spec.partial] drafting Type agreement of non-type template arguments in partial specializations Not resolved
1648[dcl.stc] C++14 thread_local vs block extern declarations Unknown
1649[class.base.init] C++14 Error in the syntax of mem-initializer-list Unknown
1650[dcl.init.ref] NAD Class prvalues in reference initialization Unknown
1651[class.temporary] NAD Lifetime extension of temporary via reference to subobject Unknown
1652[expr.eq] CD4 Object addresses in constexpr expressions Clang 3.6
1653[expr.pre.incr] CD4 Removing deprecated increment of bool Clang 4 (C++17 onwards)
1654[basic.types] dup Literal types and constexpr defaulted constructors Unknown
1655[lex.pptoken] open Line endings in raw string literals Not resolved
1656[lex.ccon] CD6 Encoding of numerically-escaped characters Unknown
1657[namespace.def] CD4 Attributes for namespaces and enumerators Unknown
1658[class.ctor] C++14 Deleted default constructor for abstract class via destructor Clang 5
1659[basic.start.static] open Initialization order of thread_local template static data members Not resolved
1660[class.mem] C++14 member-declaration requirements and unnamed bit-fields Unknown
1661[intro.multithread] NAD Preservation of infinite loops Unknown
1662[expr.prim.lambda.capture] C++14 Capturing function parameter packs Unknown
1663[expr.prim.lambda.capture] NAD Capturing an empty pack expansion Unknown
1664[expr.prim.lambda] C++14 Argument-dependent lookup of lambdas used in default arguments Unknown
1665[temp.explicit] drafting Declaration matching in explicit instantiations Not resolved
1666[temp.arg.nontype] C++14 Address constant expressions Unknown
1667[except.throw] NAD Function exiting via exception called by destructor during unwinding Unknown
1668[dcl.fct] drafting Parameter type determination still not clear enough Not resolved
1669[basic.start.main] C++14 auto return type for main Unknown
1670[dcl.spec.auto] review auto as conversion-type-id Not resolved
1671[temp.deduct.call] NAD Unclear rules for deduction with cv-qualification Unknown
1672[class.mem] CD4 Layout compatibility with multiple empty bases Clang 7
1673[over.best.ics] C++14 Clarifying overload resolution for the second step of copy-initialization Unknown
1674[dcl.spec.auto] C++14 Return type deduction for address of function Unknown
1675[implimits] NAD Size limit for automatic array object Unknown
1676[basic.stc.dynamic.allocation] drafting auto return type for allocation and deallocation functions Not resolved
1677[basic.start.static] C++17 Constant initialization via aggregate initialization Unknown
1678[expr.sizeof] NAD Naming the type of an array of runtime bound Unknown
1679[stmt.ranged] NAD Range-based for and array of runtime bound Unknown
1680[stmt.ranged] drafting Including <initializer_list> for range-based for Not resolved
1681[expr.prim.lambda.capture] C++14 init-captures and nested lambdas Unknown
1682[basic.stc.dynamic.allocation] open Overly-restrictive rules on function templates as allocation functions Not resolved
1683[expr.const] CD4 Incorrect example after constexpr changes Unknown
1684[dcl.constexpr] C++14 Static constexpr member functions for non-literal classes Clang 3.6
1685[expr.unary.noexcept] NAD Value category of noexcept expression Unknown
1686[basic.link] CD4 Which variables are “explicitly declared const?” Unknown
1687[over.match.oper] C++14 Conversions of operands of built-in operators Clang 7
1688[dcl.constexpr] NAD Volatile constexpr variables Unknown
1689[dcl.attr.grammar] C++14 Syntactic nonterminal for operand of alignas Unknown
1690[basic.lookup.argdep] C++14 Associated namespace for local type Clang 9
1691[basic.lookup.argdep] C++14 Argument-dependent lookup and opaque enumerations Clang 9
1692[basic.lookup.argdep] C++14 Associated namespaces of doubly-nested classes Clang 9
1693[class.mem] C++14 Superfluous semicolons in class definitions Unknown
1694[expr.const] CD4 Restriction on reference to temporary as a constant expression Unknown
1695[class.temporary] NAD Lifetime extension via init-capture Unknown
1696[class.temporary] CD4 Temporary lifetime and non-static data member initializers Clang 7
1697[class.temporary] CD4 Lifetime extension and copy elision Unknown
1698[lex.phases] CD7 Files ending in \ Unknown
1699[class.friend] open Does befriending a class befriend its friends? Not resolved
1700[temp.deduct.call] NAD Does the special rvalue-reference deduction apply to alias templates? Unknown
1701[basic.types] open Array vs sequence in object representation Not resolved
1702[class.union] drafting Rephrasing the definition of “anonymous union” Not resolved
1703[dcl.link] NAD Language linkage of names of functions with internal linkage Unknown
1704[temp.explicit] CD5 Type checking in explicit instantiation of variable templates Unknown
1705[temp.deduct.partial] CD4 Unclear specification of “more specialized” Unknown
1706[dcl.attr.grammar] drafting alignas pack expansion syntax Not resolved
1707[dcl.type.elab] C++14 template in elaborated-type-specifier without nested-name-specifier Unknown
1708[dcl.link] CD4 overly-strict requirements for names with C language linkage Unknown
1709[cpp.stringize] open Stringizing raw string literals containing newline Not resolved
1710[class.derived] C++17 Missing template keyword in class-or-decltype No
1711[temp.spec.partial] CD6 Missing specification of variable template partial specializations Unknown
1712[dcl.constexpr] CD4 constexpr variable template declarations Unknown
1713[dcl.link] dup Linkage of variable template specializations Unknown
1714[class.local] NAD odr-use of this from a local class Unknown
1715[class.inhctor] CD4 Access and inherited constructor templates Clang 3.9
1716[dcl.fct.default] C++14 When are default arguments evaluated? Unknown
1717[lex.icon] C++14 Missing specification of type of binary literal Unknown
1718[cpp.replace] open Macro invocation spanning end-of-file Not resolved
1719[class.mem] CD4 Layout compatibility and cv-qualification revisited Clang 19
1720[cpp.include] NAD Macro invocation in #include directive Unknown
1721[class.static.data] review Diagnosing ODR violations for static data members Not resolved
1722[expr.prim.lambda.closure] CD4 Should lambda to function pointer conversion function be noexcept? Clang 9
1723[lex.ext] open Multicharacter user-defined character literals Not resolved
1724[temp.deduct] CD6 Unclear rules for deduction failure Unknown
1725[dcl.spec.auto] NAD Trailing return type with nested function declarator Unknown
1726[class.conv.fct] CD6 Declarator operators and conversion function Unknown
1727[temp.expl.spec] NAD Type of a specialization of a variable template Unknown
1728[temp.explicit] CD5 Type of an explicit instantiation of a variable template Unknown
1729[temp.decls] CD6 Matching declarations and definitions of variable templates Unknown
1730[temp.decls] drafting Can a variable template have an unnamed type? Not resolved
1731[class.copy.ctor] NAD is_trivially_X and definitions of special member functions Unknown
1732[stmt.select] C++14 Defining types in conditions and range-based for statements Unknown
1733[dcl.fct.def.default] CD6 Return type and value for operator= with ref-qualifier Unknown
1734[class.copy.ctor] CD4 Nontrivial deleted copy functions No
1735[lex.ext] open Out-of-range literals in user-defined-literals Not resolved
1736[class.inhctor] CD4 Inheriting constructor templates in a local class Clang 3.9
1737[temp.dep.type] C++14 Type dependence of call to a member of the current instantiation Unknown
1738[class.inhctor] C++14 Explicit instantiation/specialization of inheriting constructor templates Superseded by P0136R1
1739[expr.static.cast] C++14 Conversion of floating point to enumeration Unknown
1740[except.spec] C++14 Disambiguation of noexcept Unknown
1741[basic.def.odr] C++14 odr-use of class object in lvalue-to-rvalue conversion Unknown
1742[namespace.udecl] CD5 using-declarations and scoped enumerators Unknown
1743[expr.prim.lambda.capture] NAD init-captures in nested lambdas Unknown
1744[basic.start.static] CD4 Unordered initialization for variable template specializations Unknown
1745[dcl.constexpr] NAD thread_local constexpr variable Unknown
1746[basic.types] C++14 Are volatile scalar types trivially copyable? Unknown
1747[basic.start.static] C++14 Constant initialization of reference to function Unknown
1748[expr.new] CD4 Placement new with a null pointer Clang 3.7
1749[basic.start.static] NAD Confusing definition for constant initializer Unknown
1750[over.match.copy] CD4 “Argument” vs “parameter” Unknown
1751[basic.life] CD4 Non-trivial operations vs non-trivial initialization Unknown
1752[class.base.init] CD4 Right-recursion in mem-initializer-list Unknown
1753[basic.lookup.qual] CD4 decltype-specifier in nested-name-specifier of destructor Clang 11
1754[temp.spec.partial] NAD Declaration of partial specialization of static data member template Unknown
1755[temp.spec.partial.member] drafting Out-of-class partial specializations of member templates Not resolved
1756[dcl.init.list] CD4 Direct-list-initialization of a non-class object Clang 3.7
1757[expr.const] CD4 Const integral subobjects Unknown
1758[over.match.list] CD4 Explicit conversion in copy/move list initialization Clang 3.7
1759[lex.string] C++14 UTF-8 code units in plain char Unknown
1760[expr.prim.lambda.capture] C++14 Access of member corresponding to init-capture Unknown
1761[dcl.array] NAD Runtime check on size of automatic array Unknown
1762[over.literal] C++14 Reserved identifier used in literal-operator-id example Clang 14
1763[temp.deduct.type] open Length mismatch in template type deduction Not resolved
1764[class.member.lookup] C++14 Hiding of function from using-declaration by signature Unknown
1765[dcl.enum] C++14 Overflow of enumeration used as enumerator value Unknown
1766[dcl.enum] CD4 Values outside the range of the values of an enumeration Unknown
1767[stmt.switch] C++14 Scoped enumeration in a switch statement Unknown
1768[dcl.array] NAD Zero-element array of runtime bound Unknown
1769[except.handle] C++14 Catching a base class of the exception object Unknown
1770[temp.deduct.type] C++14 Type matching of non-type template parameters and arguments Unknown
1771[basic.lookup.qual] CD6 Restricted lookup in nested-name-specifier Unknown
1772[expr.prim.lambda] C++14 __func__ in a lambda body Clang 14
1773[conv.lval] C++14 Out-of-lifetime lvalue-to-rvalue conversion Unknown
1774[except.ctor] CD4 Discrepancy between subobject destruction and stack unwinding Unknown
1775[lex.phases] C++14 Undefined behavior of line splice in raw string literal Unknown
1776[basic.life] CD4 Replacement of class objects containing reference members Unknown
1777[except.spec] CD4 Empty pack expansion in dynamic-exception-specification Unknown
1778[dcl.fct.def.default] C++14 exception-specification in explicitly-defaulted functions Clang 9
1779[temp.dep.expr] CD4 Type dependency of __func__ Clang 14
1780[expr.prim.lambda.closure] CD4 Explicit instantiation/specialization of generic lambda operator() Unknown
1781[over.match.conv] CD5 Converting from nullptr_t to bool in overload resolution Unknown
1782[dcl.init] CD4 Form of initialization for nullptr_t to bool conversion Unknown
1783[class.dtor] NAD Why are virtual destructors non-trivial? Unknown
1784[stmt.dcl] C++17 Concurrent execution during static local initialization Unknown
1785[temp.res] NAD Conflicting diagnostic requirements for template definitions Unknown
1786[expr.new] C++14 Effect of merging allocations on memory leakage Unknown
1787[conv.lval] C++14 Uninitialized unsigned char values Unknown
1788[expr.delete] CD4 Sized deallocation of array of non-class type Unknown
1789[over.ics.rank] open Array reference vs array decay in overload resolution Not resolved
1790[dcl.fct] open Ellipsis following function parameter pack Not resolved
1791[dcl.fct.def.general] CD4 Incorrect restrictions on cv-qualifier-seq and ref-qualifier Unknown
1792[temp.expl.spec] NAD Incorrect example of explicit specialization of member enumeration Unknown
1793[dcl.stc] CD4 thread_local in explicit specializations Unknown
1794[temp.names] C++17 template keyword and alias templates Clang 2.7
1795[namespace.def] CD4 Disambiguating original-namespace-definition and extension-namespace-definition Unknown
1796[lex.charset] CD4 Is all-bits-zero for null characters a meaningful requirement? Unknown
1797[basic.fundamental] CD4 Are all bit patterns of unsigned char distinct numbers? Unknown
1798[except.spec] NAD exception-specifications of template arguments Unknown
1799[dcl.stc] CD4 mutable and non-explicit const qualification Unknown
1800[expr.unary.op] CD4 Pointer to member of nested anonymous union Clang 2.9
1801[class.union] CD4 Kind of expression referring to member of anonymous union Clang 2.8
1802[lex.string] CD4 char16_t string literals and surrogate pairs Clang 3.1
1803[class.mem] CD5 opaque-enum-declaration as member-declaration Clang 2.9
1804[temp.friend] CD4 Partial specialization and friendship Clang 2.7
1805[expr.cond] CD4 Conversions of array operands in conditional-expressions Unknown
1806[class.copy.assign] CD4 Virtual bases and move-assignment Unknown
1807[except.ctor] CD4 Order of destruction of array elements after an exception Clang 3.0
1808[class.ctor] drafting Constructor templates vs default constructors Not resolved
1809[temp.deduct] CD4 Narrowing and template argument deduction Unknown
1810[lex.ext] CD4 Invalid ud-suffixes Unknown
1811[class.dtor] CD4 Lookup of deallocation function in a virtual destructor definition Unknown
1812[temp.names] C++17 Omission of template in a typename-specifier No
1813[class] CD4 Direct vs indirect bases in standard-layout classes Clang 7
1814[dcl.fct.default] CD4 Default arguments in lambda-expressions Clang 3.1
1815[dcl.init.aggr] CD4 Lifetime extension in aggregate initialization Clang 20
1816[conv.integral] CD4 Unclear specification of bit-field values Unknown
1817[dcl.link] open Linkage specifications and nested scopes Not resolved
1818[dcl.link] CD6 Visibility and inherited language linkage Clang 3.4
1819[temp.spec.partial.general] CD4 Acceptable scopes for definition of partial specialization Unknown
1820[dcl.typedef] CD6 Qualified typedef names Clang 3.5
1821[class.mem] CD6 Qualified redeclarations in a class member-specification Clang 2.9
1822[expr.prim.lambda] CD6 Lookup of parameter names in lambda-expressions Clang 3.1
1823[dcl.fct.spec] CD4 String literal uniqueness in inline functions Unknown
1824[dcl.fct] CD4 Completeness of return type vs point of instantiation Clang 2.7
1825[temp.deduct.partial] C++17 Partial ordering between variadic and non-variadic function templates Unknown
1826[expr.const] NAD const floating-point in constant expressions Unknown
1827[dcl.init.ref] drafting Reference binding with ambiguous conversions Not resolved
1828[basic.lookup.qual] CD6 nested-name-specifier ambiguity Unknown
1829[temp.dep.type] CD6 Dependent unnamed types Unknown
1830[dcl.pre] CD4 Repeated specifiers Unknown
1831[class.copy.ctor] NAD Explicitly vs implicitly deleted move constructors Unknown
1832[expr.static.cast] CD4 Casting to incomplete enumeration Clang 3.0
1833[class.friend] NAD friend declarations naming implicitly-declared member functions Unknown
1834[basic.start.static] CD4 Constant initialization binding a reference to an xvalue Unknown
1835[basic.lookup.classref] CD6 Dependent member lookup before < Unknown
1836[expr.prim.general] CD5 Use of class type being defined in trailing-return-type Unknown
1837[expr.prim.general] CD6 Use of this in friend and local class declarations Clang 3.3
1838[namespace.memdef] CD4 Definition via unqualified-id and using-declaration Unknown
1839[basic.link] CD6 Lookup of block-scope extern declarations Unknown
1840[temp.expl.spec] drafting Non-deleted explicit specialization of deleted function template Not resolved
1841[temp.local] CD6 < following template injected-class-name Unknown
1842[intro.multithread] open Unevaluated operands and “carries a dependency” Not resolved
1843[expr.cond] CD4 Bit-field in conditional operator with throw operand Unknown
1844[temp.deduct] open Defining “immediate context” Not resolved
1845[temp.point] review Point of instantiation of a variable template specialization Not resolved
1846[dcl.fct.def.default] CD4 Declaring explicitly-defaulted implicitly-deleted functions Unknown
1847[temp.deduct.type] CD4 Clarifying compatibility during partial ordering Unknown
1848[class.dtor] CD4 Parenthesized constructor and destructor declarators Unknown
1849[basic.def.odr] CD6 Variable templates and the ODR Unknown
1850[temp.res] CD4 Differences between definition context and point of instantiation Unknown
1851[expr.new] CD4 decltype(auto) in new-expressions Unknown
1852[dcl.type.simple] CD4 Wording issues regarding decltype(auto) Unknown
1853[basic.life] dup Defining “allocated storage” Unknown
1854[dcl.fct.def.default] drafting Disallowing use of implicitly-deleted functions Not resolved
1855[class.cdtor] dup Out-of-lifetime access to nonstatic data members Unknown
1856[temp.inst] open Indirect nested classes of class templates Not resolved
1857[expr.shift] CD5 Additional questions about bits Unknown
1858[expr.eq] CD4 Comparing pointers to union members Unknown
1859[lex.string] CD5 UTF-16 in char16_t string literals Unknown
1860[class.union] C++17 What is a “direct member?” Unknown
1861[class.bit] CD4 Values of a bit-field Unknown
1862[temp.friend] CD5 Determining “corresponding members” for friendship No
1863[except.throw] CD4 Requirements on thrown object type to support std::current_exception() Unknown
1864[dcl.init.list] NAD List-initialization of array objects Unknown
1865[expr.add] CD4 Pointer arithmetic and multi-level qualification conversions Unknown
1866[except.ctor] CD4 Initializing variant members with non-trivial destructors Unknown
1867[dcl.ambig.res] NAD Function/expression ambiguity with qualified parameter name Unknown
1868[dcl.spec.auto] open Meaning of “placeholder type” Not resolved
1869[dcl.link] NAD thread_local vs linkage-specifications Unknown
1870[basic.def] CD4 Contradictory wording about definitions vs explicit specialization/instantiation Unknown
1871[lex.ext] NAD Non-identifier characters in ud-suffix Unknown
1872[dcl.constexpr] CD4 Instantiations of constexpr templates that cannot appear in constant expressions Clang 9
1873[class.access.base] CD4 Protected member access from derived class friends Unknown
1874[temp.param] CD4 Type vs non-type template parameters with class keyword Unknown
1875[basic.scope.class] CD4 Reordering declarations in class scope Unknown
1876[temp.expl.spec] NAD Preventing explicit specialization Unknown
1877[dcl.spec.auto] CD4 Return type deduction from return with no operand Unknown
1878[dcl.spec.auto] CD4 operator auto template Clang 18
1879[basic.align] NAD Inadequate definition of alignment requirement Unknown
1880[expr.call] CD4 When are parameter objects destroyed? Unknown
1881[class] CD4 Standard-layout classes and unnamed bit-fields Clang 7
1882[global.names] CD4 Reserved names without library use Unknown
1883[class.protected] review Protected access to constructors in mem-initializers Not resolved
1884[basic.link] CD6 Unclear requirements for same-named external-linkage entities Partial
1885[expr.call] CD4 Return value of a function is underspecified Unknown
1886[basic.start.main] CD4 Language linkage for main() Unknown
1887[namespace.udecl] CD4 Problems with :: as nested-name-specifier Unknown
1888[class.ctor] CD4 Implicitly-declared default constructors and explicit Unknown
1889[cpp.pragma] open Unclear effect of #pragma on conformance Not resolved
1890[class.mem] drafting Member type depending on definition of member function Not resolved
1891[expr.prim.lambda.closure] CD4 Move constructor/assignment for closure class Clang 4
1892[dcl.spec.auto] CD4 Use of auto in function type Unknown
1893[expr.type.conv] CD5 Function-style cast with braced-init-lists and empty pack expansions Unknown
1894[dcl.typedef] CD6 typedef-names and using-declarations Clang 3.8
1895[expr.cond] CD4 Deleted conversions in conditional operator operands Unknown
1896[temp.alias] CD6 Repeated alias templates Unknown
1897[basic.def.odr] review ODR vs alternative tokens Not resolved
1898[over.dcl] CD6 Use of “equivalent” in overload resolution Clang 2.7
1899[temp.dep.constexpr] CD4 Value-dependent constant expressions Unknown
1900[dcl.meaning] CD6 Do friend declarations count as “previous declarations”? Clang 2.7
1901[lex.token] open punctuator referenced but not defined Not resolved
1902[over.best.ics] CD4 What makes a conversion “otherwise ill-formed”? Clang 3.7
1903[namespace.udecl] CD4 What declarations are introduced by a non-member using-declaration? Clang 2.7
1904[temp.param] NAD Default template arguments for members of class templates Unknown
1905[temp.dep.type] NAD Dependent types and injected-class-names Unknown
1906[basic.lookup.unqual] NAD Name lookup in member friend declaration Unknown
1907[namespace.udecl] CD6 using-declarations and default arguments Unknown
1908[basic.lookup.classref] CD6 Dual destructor lookup and template-ids Unknown
1909[class.mem] CD4 Member class template with the same name as the class Clang 3.7
1910[basic.stc.dynamic.allocation] CD5 “Shall” requirement applied to runtime behavior Unknown
1911[dcl.constexpr] CD4 constexpr constructor with non-literal base class Unknown
1912[dcl.fct.def.default] CD5 exception-specification of defaulted function Unknown
1913[expr.prim.lambda] CD5 decltype((x)) in lambda-expressions Unknown
1914[dcl.attr] extension Duplicate standard attributes Extension
1915[class.base.init] open Potentially-invoked destructors in non-throwing constructors Not resolved
1916[class.copy.ctor] CD4 “Same cv-unqualified type” Unknown
1917[dcl.enum] NAD decltype-qualified enumeration names Unknown
1918[temp.friend] CD5 friend templates with dependent scopes No
1919[over.match.oper] open Overload resolution for ! with explicit conversion operator Not resolved
1920[expr.pseudo] CD4 Qualification mismatch in pseudo-destructor-name Unknown
1921[expr.const] NAD constexpr constructors and point of initialization of const variables Unknown
1922[temp.local] CD4 Injected class template names and default arguments Unknown
1923[expr.unary.op] NAD Lvalues of type void Unknown
1924[lex.literal] review Definition of “literal” and kinds of literals Not resolved
1925[expr.comma] CD4 Bit-field prvalues Unknown
1926[basic.def.odr] CD4 Potential results of subscript operator Unknown
1927[expr.prim.lambda.capture] dup Lifetime of temporaries in init-captures Unknown
1928[class.copy.ctor] NAD Triviality of deleted special member functions Unknown
1929[expr.prim.general] CD4 template keyword following namespace nested-name-specifier Unknown
1930[dcl.stc] CD4 init-declarator-list vs member-declarator-list Unknown
1931[expr.prim.lambda.closure] CD5 Default-constructible and copy-assignable closure types Unknown
1932[expr.cond] CD4 Bit-field results of conditional operators Unknown
1933[implimits] NAD Implementation limit for initializer-list elements Unknown
1934[except.spec] NAD Relaxing exception-specification compatibility requirements Unknown
1935[expr.new] CD5 Reuse of placement arguments in deallocation Unknown
1936[temp.dep] CD6 Dependent qualified-ids Unknown
1937[expr.prim.lambda.closure] CD5 Incomplete specification of function pointer from lambda Unknown
1938[intro.compliance] CD5 Should hosted/freestanding be implementation-defined? Unknown
1939[temp.deduct.call] open Argument conversions to nondeduced parameter types revisited Not resolved
1940[class.union] CD4 static_assert in anonymous unions Clang 3.5
1941[class.inhctor] CD4 SFINAE and inherited constructor default arguments Clang 3.9
1942[expr.prim.lambda] CD4 Incorrect reference to trailing-return-type Unknown
1943[class.bit] CD5 Unspecified meaning of “bit” Unknown
1944[diff] open New C incompatibilities Not resolved
1945[temp.friend] CD5 Friend declarations naming members of class templates in non-templates No
1946[except.spec] CD4 exception-specifications vs pointer dereference Unknown
1947[lex.icon] NAD Digit separators following non-octal prefix Clang 3.5
1948[basic.stc.dynamic] NAD exception-specification of replacement global new Clang 3.5
1949[intro.execution] CD4 “sequenced after” instead of “sequenced before” Unknown
1950[over.ics.rank] NAD Restructuring description of ranks of conversion sequences Unknown
1951[basic.types] CD4 Cv-qualification and literal types Unknown
1952[expr.const] CD4 Constant expressions and library undefined behavior Unknown
1953[intro.memory] CD7 Data races and common initial sequence Unknown
1954[expr.typeid] CD7 typeid null dereference check in subexpressions Unknown
1955[cpp.cond] CD4 #elif with invalid controlling expression Unknown
1956[basic.stc.auto] CD4 Reuse of storage of automatic variables Unknown
1957[dcl.spec.auto] NAD decltype(auto) with direct-list-initialization Unknown
1958[dcl.spec.auto] CD4 decltype(auto) with parenthesized initializer Unknown
1959[class.inhctor] CD4 Inadvertently inherited copy constructor Clang 3.9
1960[namespace.udecl] NAD Visibility of entity named in class-scope using-declaration No
1961[intro.multithread] C++17 Potentially-concurrent actions within a signal handler Unknown
1962[dcl.fct.def.general] open Type of __func__ Not resolved
1963[lex.name] CD4 Implementation-defined identifier characters Unknown
1964[dcl.typedef] NAD opaque-enum-declaration in alias-declaration? Unknown
1965[expr.dynamic.cast] CD7 Explicit casts to reference types Unknown
1966[dcl.enum] CD4 Colon following enumeration elaborated-type-specifier Clang 11
1967[class.copy.elision] CD4 Temporary lifetime and move-elision Unknown
1968[expr.const] NAD Address of typeid in constant expressions No
1969[class.dtor] CD6 Missing exclusion of ~S as an ordinary function name Unknown
1970[dcl.ambig.res] NAD Ambiguity resolution for (T())*x Unknown
1971[expr.unary.op] CD4 Unclear disambiguation of destructor and operator~ Unknown
1972[lex.name] CD6 Identifier character restrictions in non-identifiers Unknown
1973[expr.prim.lambda.closure] CD7 Which parameter-declaration-clause in a lambda-expression? Unknown
1974[temp.res] NAD Redundant specification of non-type typename-specifier Unknown
1975[except.spec] CD4 Permissible declarations for exception-specifications Unknown
1976[namespace.alias] NAD Ambiguity of namespace-aliases Unknown
1977[class.dtor] open Contradictory results of failed destructor lookup Not resolved
1978[class.conv.ctor] CD4 Redundant description of explicit constructor use Unknown
1979[temp.alias] drafting Alias template specialization in template member definition Not resolved
1980[temp.alias] drafting Equivalent but not functionally-equivalent redeclarations Not resolved
1981[conv] CD4 Implicit contextual conversions and explicit Unknown
1982[temp.arg.explicit] NAD Deduction extending parameter pack Unknown
1983[class.mem] CD5 Inappropriate use of virt-specifier Unknown
1984[dcl.init.list] NAD Lossless narrowing conversions Unknown
1985[dcl.init.aggr] NAD Unknown bound array member with brace-or-equal-initializer Unknown
1986[basic.start.static] drafting odr-use and delayed initialization Not resolved
1987[class.static.data] NAD constexpr static data members across translation units Unknown
1988[temp.dep.type] CD4 Ambiguity between dependent and non-dependent bases in implicit member access Unknown
1989[over.oper] drafting Insufficient restrictions on parameters of postfix operators Not resolved
1990[dcl.pre] CD4 Ambiguity due to optional decl-specifier-seq Unknown
1991[class.inhctor] CD4 Inheriting constructors vs default arguments Clang 3.9
1992[expr.new] CD4 new (std::nothrow) int[N] can throw Unknown
1993[temp.expl.spec] open Use of template<> defining member of explicit specialization Not resolved
1994[temp.expl.spec] dup Confusing wording regarding multiple template<> prefixes Duplicate of 529
1995[except.spec] CD4 exception-specifications and non-type template parameters Unknown
1996[dcl.init.list] drafting Reference list-initialization ignores conversion functions Not resolved
1997[basic.indet] CD7 Placement new and previous initialization Unknown
1998[basic.lval] NAD Additional sources of xvalue expressions Unknown
1999[lex.phases] CD4 Representation of source characters as universal-character-names Unknown
2000[lex.pptoken] CD4 header-name outside #include directive Unknown
2001[cpp.pre] CD4 non-directive is underspecified Unknown
2002[cpp.pre] open White space within preprocessing directives Not resolved
2003[cpp.replace] drafting Zero-argument macros incorrectly specified Not resolved
2004[expr.const] CD4 Unions with mutable members in constant expressions Unknown
2005[expr.const] NAD Incorrect constexpr reference initialization requirements Unknown
2006[basic.compound] CD4 Cv-qualified void types Unknown
2007[over.match.oper] CD6 Argument-dependent lookup for operator= Clang 3.4
2008[temp.arg] CD4 Default template-arguments underspecified Unknown
2009[basic.scope.class] CD6 Unclear specification of class scope N/A
2010[except.spec] CD4 exception-specifications and conversion operators Unknown
2011[expr.prim.lambda.capture] C++17 Unclear effect of reference capture of reference Unknown
2012[basic.stc] CD4 Lifetime of references Unknown
2013[expr.add] drafting Pointer subtraction in large array Not resolved
2014[new.delete.array] NAD Unneeded deallocation signatures Unknown
2015[dcl.fct.def.delete] CD4 odr-use of deleted virtual functions Unknown
2016[class.conv.fct] CD4 Confusing wording in description of conversion function Unknown
2017[stmt.return] CD4 Flowing off end is not equivalent to no-expression return Unknown
2018[dcl.init.ref] dup Qualification conversion vs reference binding Unknown
2019[basic.stc.general] CD4 Member references omitted from description of storage duration Unknown
2020[basic.def.odr] CD5 Inadequate description of odr-use of implicitly-invoked functions Unknown
2021[temp.over.link] dup Function template redeclaration via alias template Unknown
2022[expr.const] CD4 Copy elision in constant expressions Unknown
2023[expr.cond] drafting Composite reference result type of conditional operator Not resolved
2024[temp.dep.type] CD4 Dependent types and unexpanded parameter packs Unknown
2025[temp.over.link] dup Declaration matching via alias templates Unknown
2026[basic.start] CD4 Zero-initialization and constexpr Clang 11
2027[dcl.align] CD4 Unclear requirements for multiple alignas specifiers Unknown
2028[over.match.ref] drafting Converting constructors in rvalue reference initialization Not resolved
2029[expr.call] dup Abstract class return type in decltype operand Unknown
2030[class.access.base] NAD Access of injected-class-name with template arguments Unknown
2031[diff.cpp03.expr] CD4 Missing incompatibility for && Unknown
2032[temp.param] CD4 Default template-arguments of variable templates Unknown
2033[temp.spec.partial.general] CD4 Redundant restriction on partial specialization argument Unknown
2034[except.uncaught] NAD Deprecating uncaught_exception() Unknown
2035[temp.spec.partial.match] CD3 Multi-section example is confusing Unknown
2036[dcl.decl] NAD Refactoring parameters-and-qualifiers Unknown
2037[temp.type] drafting Alias templates and template declaration matching Not resolved
2038[diff.cpp14] CD4 Document C++14 incompatibility of new braced deduction rule Unknown
2039[except.spec] CD4 Constant conversions to bool Unknown
2040[dcl.decl] CD4 trailing-return-type no longer ambiguous Unknown
2041[temp.expl.spec] CD4 Namespace for explicit class template specialization Unknown
2042[basic.stc.dynamic.deallocation] review Exceptions and deallocation functions Not resolved
2043[temp.arg.nontype] drafting Generalized template arguments and array-to-pointer decay Not resolved
2044[dcl.spec.auto] CD4 decltype(auto) and void Unknown
2045[temp.over.link] CD5 “Identical” template parameter lists Unknown
2046[intro.multithread] C++17 Incomplete thread specifications Unknown
2047[except.spec] CD4 Coordinating “throws anything” specifications Unknown
2048[expr.static.cast] open C-style casts that cast away constness vs static_cast Not resolved
2049[temp.arg.nontype] CD7 List initializer in non-type template default argument Clang 18
2050[dcl.stc] NAD Consolidate specification of linkage Unknown
2051[basic.lval] CD5 Simplifying alias rules Unknown
2052[over.oper] CD4 Template argument deduction vs overloaded operators Unknown
2053[dcl.spec.auto] C++20 auto in non-generic lambdas Unknown
2054[temp.deduct] CD7 Missing description of class SFINAE Unknown
2055[temp.arg.explicit] drafting Explicitly-specified non-deduced parameter packs Not resolved
2056[class.base.init] open Member function calls in partially-initialized class objects Not resolved
2057[temp.arg.template] drafting Template template arguments with default arguments Not resolved
2058[basic.link] CD6 More errors from internal-linkage namespaces Unknown
2059[dcl.spec.auto] CD5 Linkage and deduced return types Unknown
2060[dcl.spec.auto] NAD Deduced return type for explicit specialization Unknown
2061[namespace.def] CD4 Inline namespace after simplifications Clang 2.7
2062[temp.class] CD6 Class template redeclaration requirements Unknown
2063[basic.scope.declarative] CD4 Type/nontype hiding in class scope Unknown
2064[temp.type] CD4 Conflicting specifications for dependent decltype-specifiers Unknown
2065[temp.dep.type] CD6 Current instantiation of a partial specialization Unknown
2066[temp.dep.constexpr] CD4 Does type-dependent imply value-dependent? Unknown
2067[temp.res] open Generated variadic templates requiring empty pack Not resolved
2068[class.dtor] CD4 When can/must a defaulted virtual destructor be defined? Unknown
2069[class.dtor] CD4 Do destructors have names? Unknown
2070[class.qual] CD6 using-declaration with dependent nested-name-specifier Unknown
2071[dcl.typedef] CD4 typedef with no declarator Unknown
2072[temp.inst] C++23 Default argument instantiation for member functions of templates Unknown
2073[basic.stc.dynamic.allocation] open Allocating memory for exception objects Not resolved
2074[temp.dep.type] drafting Type-dependence of local class of function template Not resolved
2075[over.ics.list] CD4 Passing short initializer lists to array reference parameters Unknown
2076[over.best.ics] CD4 List-initialization of arguments for constructor parameters Clang 13
2077[over.ics.ref] drafting Overload resolution and invalid rvalue-reference initialization Not resolved
2078[class.member.lookup] NAD Name lookup of mem-initializer-id Unknown
2079[dcl.attr.grammar] CD4 [[ appearing in a balanced-token-seq Unknown
2080[class.union] CD5 Example with empty anonymous union member Unknown
2081[dcl.spec.auto] CD5 Deduced return type in redeclaration or specialization of function template Unknown
2082[dcl.fct.default] CD4 Referring to parameters in unevaluated operands of default arguments Clang 11
2083[basic.def.odr] CD5 Incorrect cases of odr-use Partial
2084[class.ctor] CD4 NSDMIs and deleted union default constructors Clang 3.1
2085[basic.def.odr] CD4 Invalid example of adding special member function via default argument Unknown
2086[expr.prim.lambda.capture] drafting Reference odr-use vs implicit capture Not resolved
2087[expr.shift] NAD Left shift of negative value by zero bits Unknown
2088[temp.deduct.partial] CD5 Late tiebreakers in partial ordering Unknown
2089[over.match.oper] drafting Restricting selection of builtin overloaded operators Not resolved
2090[temp.dep.temp] open Dependency via non-dependent base class Not resolved
2091[temp.deduct.type] CD4 Deducing reference non-type template arguments Clang 10
2092[temp.over] CD5 Deduction failure and overload resolution Unknown
2093[except.handle] CD4 Qualification conversion for pointer-to-member handler matching Unknown
2094[class.copy.ctor] C++17 Trivial copy/move constructor for class with volatile member Clang 5
2095[expr.prim.lambda.capture] CD4 Capturing rvalue references to functions by copy Unknown
2096[basic.types] CD4 Constraints on literal unions Duplicate of 2598
2097[expr.prim.lambda] extension Lambdas and noreturn attribute Extension
2098[except.uncaught] CD4 Is uncaught_exceptions() per-thread? Unknown
2099[dcl.array] CD4 Inferring the bound of an array static data member Unknown
2100[temp.dep.constexpr] C++17 Value-dependent address of static data member of class template Clang 12
2101[temp.dep] CD4 Incorrect description of type- and value-dependence Unknown
2102[expr.new] CD7 Constructor checking in new-expression Unknown
2103[basic.def.odr] CD5 Lvalue-to-rvalue conversion is irrelevant in odr-use of a reference Clang 2.7
2104[basic.def.odr] CD4 Internal-linkage constexpr references and ODR requirements Unknown
2105[temp.arg] open When do the arguments for a parameter pack end? Not resolved
2106[temp.arg.type] CD4 Unclear restrictions on use of function-type template arguments Unknown
2107[class.temporary] CD4 Lifetime of temporaries for default arguments in array copying Unknown
2108[over.match.ref] drafting Conversions to non-class prvalues in reference initialization Not resolved
2109[temp.dep.constexpr] CD4 Value dependence underspecified Unknown
2110[over.ics.rank] drafting Overload resolution for base class conversion and reference/non-reference Not resolved
2111[dcl.init.ref] NAD Array temporaries in reference binding Unknown
2112[expr.new] CD5 new auto{x} Unknown
2113[dcl.meaning] CD4 Incomplete specification of types for declarators Unknown
2114[diff.cpp11.dcl.decl] CD3 Missing description of incompatibility from aggregate NSDMIs Unknown
2115[stmt.jump] open Order of implicit destruction vs release of automatic storage Not resolved
2116[dcl.init.aggr] C++17 Direct or copy initialization for omitted aggregate initializers Unknown
2117[dcl.constexpr] NAD Explicit specializations and constexpr function templates Unknown
2118[temp.friend] open Stateful metaprogramming via friend injection Not resolved
2119[class.virtual] NAD Disambiguation of multi-level covariant return type Unknown
2120[class] CD4 Array as first non-static data member in standard-layout class Clang 7
2121[expr.prim.lambda.general] CD6 More flexible lambda syntax Unknown
2122[basic.lval] CD4 Glvalues of void type Unknown
2123[stmt.dcl] open Omitted constant initialization of local static variables Not resolved
2124[defns.signature.member.templ] CD4 Signature of constructor template Unknown
2125[class.copy.elision] NAD Copy elision and comma operator Unknown
2126[expr.const] C++20 Lifetime-extended temporaries in constant expressions Clang 12
2127[temp.spec.partial] drafting Partial specialization and nullptr Not resolved
2128[dcl.init.aggr] open Imprecise rule for reference member initializer Not resolved
2129[expr.const] CD4 Non-object prvalues and constant expressions Unknown
2130[expr.new] CD4 Over-aligned types in new-expressions Unknown
2131[dcl.enum] drafting Ambiguity with opaque-enum-declaration Not resolved
2132[class.copy.ctor] NAD Deprecated default generated copy constructors Unknown
2133[conv.fctptr] CD5 Converting std::nullptr_t to bool Unknown
2134[expr.prim.general] NAD Objectless references to non-static member functions Unknown
2135[class.base.init] NAD mem-initializers for virtual bases of abstract classes Unknown
2136[basic.lookup.argdep] NAD Argument-dependent lookup and initializer lists Unknown
2137[dcl.init.list] CD4 List-initialization from object of same type Clang 20
2138[temp.expl.spec] NAD Explicit member specialization vs implicit instantiation Unknown
2139[conv.fpint] NAD Floating-point requirements for integer representation Unknown
2140[conv.lval] CD4 Lvalue-to-rvalue conversion of std::nullptr_t Clang 9
2141[expr.new] CD4 Ambiguity in new-expression with elaborated-type-specifier Clang 17
2142[basic.lookup.argdep] NAD Missing definition of associated classes and namespaces Unknown
2143[temp.dep.type] C++17 Value-dependency via injected-class-name Unknown
2144[dcl.fct.def.general] CD7 Function/variable declaration ambiguity Unknown
2145[dcl.fct.def.general] CD4 Parenthesized declarator in function definition Unknown
2146[intro.execution] CD4 Scalar object vs memory location in definition of “unsequenced” Unknown
2147[temp.deduct.call] CD4 Initializer-list arguments and pack deduction Unknown
2148[basic.start.static] drafting Thread storage duration and order of initialization Not resolved
2149[dcl.init.aggr] CD7 Brace elision and array length deduction Clang 3.1
2150[dcl.init.list] CD3 Initializer list array lifetime Unknown
2151[intro.object] CD4 Exception object is not created Unknown
2152[lex.ext] NAD Can an alternative token be used as a ud-suffix? Unknown
2153[class.mem] CD4 pure-specifier in friend declaration Unknown
2154[class.mem] CD4 Ambiguity of pure-specifier Unknown
2155[namespace.memdef] C++17 Defining classes and enumerations via using-declarations Unknown
2156[dcl.enum] CD4 Definition of enumeration declared by using-declaration Unknown
2157[dcl.type.elab] CD4 Further disambiguation of enumeration elaborated-type-specifier Clang 11
2158[class.dtor] drafting Polymorphic behavior during destruction Not resolved
2159[expr.prim.lambda.capture] NAD Lambda capture and local thread_local variables Unknown
2160[temp.func.order] open Issues with partial ordering Not resolved
2161[temp.explicit] NAD Explicit instantiation declaration and “preceding initialization” Unknown
2162[expr.prim.lambda.capture] CD3 Capturing this by reference Unknown
2163[dcl.constexpr] CD4 Labels in constexpr functions Unknown
2164[basic.scope.hiding] CD5 Name hiding and using-directives Unknown
2165[basic.scope.declarative] CD6 Namespaces, declarative regions, and translation units N/A
2166[expr.const] drafting Unclear meaning of “undefined constexpr function” Not resolved
2167[expr.const] CD4 Non-member references with lifetimes within the current evaluation Unknown
2168[dcl.init.list] review Narrowing conversions and +/- infinity Not resolved
2169[over.ics.list] open Narrowing conversions and overload resolution Not resolved
2170[basic.def.odr] CD5 Unclear definition of odr-use for arrays Clang 9
2171[class.copy.ctor] CD4 Triviality of copy constructor with less-qualified parameter Clang 15
2172[except.handle] drafting Multiple exceptions with one exception object Not resolved
2173[temp.spec.partial] open Partial specialization with non-deduced contexts Not resolved
2174[temp.friend] C++17 Unclear rules for friend definitions in templates Unknown
2175[dcl.ambig.res] CD4 Ambiguity with attribute in conversion operator declaration Unknown
2176[expr.call] CD4 Destroying the returned object when a destructor throws Unknown
2177[expr.new] CD5 Placement operator delete and parameter copies Unknown
2178[temp.param] NAD Substitution of dependent template arguments in default template arguments Unknown
2179[temp.spec.partial.general] drafting Required diagnostic for partial specialization after first use Not resolved
2180[class.copy.assign] CD4 Virtual bases in destructors and defaulted assignment operators Clang 3.0
2181[implimits] C++20 Normative requirements in an informative Annex Unknown
2182[expr.add] drafting Pointer arithmetic in array-like containers Not resolved
2183[except.spec] NAD Problems in description of potential exceptions Unknown
2184[diff.expr] CD4 Missing C compatibility entry for decrement of bool Unknown
2185[basic.fundamental] CD6 Cv-qualified numeric types Unknown
2186[expr.const] C++20 Unclear point that “preceding initialization” must precede Unknown
2187[class.protected] drafting Protected members and access via qualified-id Not resolved
2188[class.mem.general] open empty-declaration grammar ambiguity Not resolved
2189[over.call.object] open Surrogate call template Not resolved
2190[cpp.cond] open Insufficient specification of __has_include Not resolved
2191[except.spec] C++17 Incorrect result for noexcept(typeid(v)) Clang 19
2192[expr.const] open Constant expressions and order-of-eval undefined behavior Not resolved
2193[numeric.limits.members] NAD numeric_limits<int>::radix and digits Unknown
2194[over.match.list] drafting Impossible case in list initialization Not resolved
2195[dcl.type.cv] open Unsolicited reading of trailing volatile members Not resolved
2196[dcl.init] C++17 Zero-initialization with virtual base classes Unknown
2197[class.copy.ctor] C++17 Overload resolution and deleted special member functions Unknown
2198[basic.link] C++17 Linkage of enumerators Unknown
2199[dcl.typedef] CD6 Typedefs and tags Clang 3.8
2200[temp.arg.explicit] NAD Conversions in template argument deduction Unknown
2201[basic.type.qualifier] C++17 Cv-qualification of array types Unknown
2202[temp.inst] drafting When does default argument instantiation occur? Not resolved
2203[class.copy.ctor] drafting Defaulted copy/move constructors and UDCs Not resolved
2204[class.base.init] NAD Naming delegated constructors Unknown
2205[dcl.attr.grammar] C++17 Restrictions on use of alignas Unknown
2206[expr] C++17 Composite type of object and function pointers Unknown
2207[basic.stc.dynamic.allocation] CD5 Alignment of allocation function return value Unknown
2208[class.mem] NAD static_assert-declaration does not declare a member Unknown
2209[except.ctor] NAD Destruction of constructed array elements Unknown
2210[except.ctor] NAD Principal/target constructor confusion Unknown
2211[expr.prim.lambda.capture] C++17 Hiding by lambda captures and parameters Clang 8
2212[dcl.typedef] CD5 Typedef changing linkage after use Unknown
2213[dcl.type.elab] CD6 Forward declaration of partial specializations Clang 2.7
2214[basic.fundamental] C++17 Missing requirement on representation of integer values Unknown
2215[expr.call] C++17 Redundant description of language linkage in function call Unknown
2216[except.spec] NAD Exception specifications in unevaluated contexts Unknown
2217[dcl.constexpr] NAD constexpr constructors for non-literal types Unknown
2218[basic.lookup] C++17 Ambiguity and namespace aliases Unknown
2219[except.handle] drafting Dynamically-unreachable handlers Not resolved
2220[stmt.ranged] C++17 Hiding index variable in range-based for Unknown
2221[dcl.fct.def.default] CD6 Copying volatile objects Unknown
2222[temp.inst] drafting Additional contexts where instantiation is not required Not resolved
2223[dcl.align] drafting Multiple alignas specifiers Not resolved
2224[expr.static.cast] C++17 Member subobjects and base-class casts Unknown
2225[expr.reinterpret.cast] NAD reinterpret_cast to same floating-point type Unknown
2226[expr.cond] CD5 Xvalues vs lvalues in conditional expressions Unknown
2227[class.base.init] CD5 Destructor access and default member initializers Unknown
2228[dcl.ambig.res] review Ambiguity resolution for cast to function type Not resolved
2229[class.bit] CD5 Volatile unnamed bit-fields Clang 7
2230[basic.link] NAD Linkage of extern "C" function in unnamed namespace Unknown
2231[expr.ref] NAD Class member access to static data member template Unknown
2232[dcl.stc] open thread_local anonymous unions Not resolved
2233[dcl.fct.default] CD5 Function parameter packs following default arguments Clang 11
2234[class] CD5 Missing rules for simple-template-id as class-name Unknown
2235[temp.deduct.partial] CD5 Partial ordering and non-dependent types Unknown
2236[temp.alias] drafting When is an alias template specialization dependent? Not resolved
2237[class.ctor] CD5 Can a template-id name a constructor? Unknown
2238[basic.stc.dynamic.allocation] NAD Contradictory alignment requirements for allocation Unknown
2239[expr.delete] NAD Sized deallocation with a trivial destructor Unknown
2240[basic.def.odr] NAD this is not odr-used in a constant expression Unknown
2241[expr.call] CD5 Overload resolution is not invoked with a single function Unknown
2242[basic.def.odr] C++23 ODR violation with constant initialization possibly omitted Unknown
2243[expr.static.cast] drafting Incorrect use of implicit conversion sequence Not resolved
2244[class.protected] open Base class access in aggregate initialization Not resolved
2245[temp.point] drafting Point of instantiation of incomplete class template Not resolved
2246[class.access.base] drafting Access of indirect virtual base class constructors Not resolved
2247[expr.prim.lambda.capture] C++17 Lambda capture and variable argument list Unknown
2248[expr.delete] C++17 Problems with sized delete Unknown
2249[expr.prim.id.unqual] CD5 identifiers and id-expressions Unknown
2250[temp.point] open Implicit instantiation, destruction, and TUs Not resolved
2251[dcl.init.list] C++17 Unreachable enumeration list-initialization Unknown
2252[dcl.init.list] CD7 Enumeration list-initialization from the same type Unknown
2253[class.bit] CD5 Unnamed bit-fields and zero-initialization Unknown
2254[class.mem] CD5 Standard-layout classes and bit-fields Unknown
2255[temp.spec] CD5 Instantiated static data member templates Unknown
2256[basic.life] CD5 Lifetime of trivially-destructible objects Unknown
2257[class.temporary] CD5 Lifetime extension of references vs exceptions Unknown
2258[basic.life] open Storage deallocation during period of destruction Not resolved
2259[dcl.ambig.res] C++17 Unclear context describing ambiguity Unknown
2260[temp.expl.spec] CD5 Explicit specializations of deleted member functions Unknown
2261[temp.friend] extension Explicit instantiation of in-class friend definition Extension
2262[dcl.asm] C++17 Attributes for asm-definition Unknown
2263[temp.inst] drafting Default argument instantiation for friends Not resolved
2264[class.copy.ctor] drafting Memberwise copying with indeterminate value Not resolved
2265[temp.inst] drafting Delayed pack expansion and member redeclarations Not resolved
2266[temp.dep.type] CD5 Has dependent type vs is type-dependent Unknown
2267[dcl.init.ref] CD5 Copy-initialization of temporary in reference direct-initialization No
2268[dcl.constexpr] C++17 Unions with mutable members in constant expressions revisited Unknown
2269[dcl.init.aggr] dup Additional recursive references in aggregate DMIs Unknown
2270[temp.explicit] NAD Non-inline functions and explicit instantiation declarations Unknown
2271[class.ctor] C++17 Aliasing this Unknown
2272[dcl.init.aggr] C++17 Implicit initialization of aggregate members of reference type Unknown
2273[class.ctor] CD5 Inheriting constructors vs implicit default constructor Clang 3.3
2274[stmt.if] NAD Generic lambda capture vs constexpr if Unknown
2275[temp.dep.expr] drafting Type-dependence of function template Not resolved
2276[temp.dep.constexpr] C++17 Dependent noexcept and function type-dependence Unknown
2277[over.ics.rank] CD5 Ambiguity inheriting constructors with default arguments Partial
2278[expr.const] CD5 Copy elision in constant expressions reconsidered Unknown
2279[dcl.attr.grammar] NAD Multiple attribute-specifiers in one attribute-list Unknown
2280[expr.new] C++20 Matching a usual deallocation function with placement new Unknown
2281[expr.new] drafting Consistency of aligned operator delete replacement Not resolved
2282[expr.new] C++20 Consistency with mismatched aligned/non-over-aligned allocation/deallocation functions Unknown
2283[expr.call] CD7 Missing complete type requirements Unknown
2284[expr.call] open Sequencing of braced-init-list arguments Not resolved
2285[dcl.struct.bind] CD5 Issues with structured bindings Clang 4
2286[expr.assign] NAD Assignment evaluation order Unknown
2287[basic.compound] CD5 Pointer-interconvertibility in non-standard-layout unions Unknown
2288[dcl.pre] NAD Contradictory optionality in simple-declaration Unknown
2289[basic.scope.declarative] CD5 Uniqueness of structured binding names Unknown
2290[over.match.funcs] CD5 Unclear specification for overload resolution and deleted special member functions Unknown
2291[over.best.ics] dup Implicit conversion sequences in non-call contexts Unknown
2292[temp.names] CD5 simple-template-id is ambiguous between class-name and type-name Clang 9
2293[class] CD5 Requirements for simple-template-id used as a class-name Unknown
2294[temp.dep.expr] CD5 Dependent auto static data members Unknown
2295[dcl.init.aggr] CD5 Aggregates with deleted defaulted constructors Unknown
2296[temp.deduct] open Are default argument instantiation failures in the “immediate context”? Not resolved
2297[intro.races] open Unclear specification of atomic operations Not resolved
2298[intro.races] open Actions and expression evaluation Not resolved
2299[dcl.constexpr] CD5 constexpr vararg functions Unknown
2300[basic.def.odr] CD5 Lambdas in multiple definitions Unknown
2301[expr.const] open Value-initialization and constexpr constructor evaluation Not resolved
2302[expr.eq] NAD Address comparison between different member subobjects Unknown
2303[temp.deduct.call] CD5 Partial ordering and recursive variadic inheritance Clang 12
2304[over.best.ics] NAD Incomplete type vs overload resolution Clang 2.8
2305[temp.explicit] CD5 Explicit instantiation of constexpr or inline variable template Unknown
2306[temp.friend] NAD Nested friend templates of class templates Unknown
2307[temp.dep.type] CD5 Unclear definition of “equivalent to a nontype template parameter” Unknown
2308[dcl.struct.bind] NAD Structured bindings and lambda capture Unknown
2309[dcl.constexpr] CD5 Restrictions on nested statements within constexpr functions Unknown
2310[conv.ptr] CD5 Type completeness and derived-to-base pointer conversions Partial
2311[over.match.list] open Missed case for guaranteed copy elision Not resolved
2312[dcl.struct.bind] CD6 Structured bindings and mutable Unknown
2313[dcl.struct.bind] CD5 Redeclaration of structured binding reference variables Unknown
2314[dcl.struct.bind] dup Structured bindings and lambda capture Unknown
2315[class.copy.ctor] CD5 What is the “corresponding special member” of a variant member? Unknown
2316[expr.cond] drafting Simplifying class conversions in conditional expressions Not resolved
2317[class.base.init] CD5 Self-referential default member initializers Unknown
2318[temp.deduct.type] CD5 Nondeduced contexts in deduction from a braced-init-list Unknown
2319[over.best.ics] drafting Nested brace initialization from same type Not resolved
2320[stmt.if] extension constexpr if and boolean conversions Extension
2321[expr.cond] CD5 Conditional operator and cv-qualified class prvalues Unknown
2322[temp.deduct] CD5 Substitution failure and lexical order Unknown
2323[basic.types] C++20 Expunge POD Unknown
2324[intro.object] drafting Size of base class subobject Not resolved
2325[intro.object] drafting std::launder and reuse of character buffers Not resolved
2326[temp.deduct.call] dup Type deduction with initializer list containing ambiguous functions Unknown
2327[dcl.init] drafting Copy elision for direct-initialization with a conversion function Not resolved
2328[temp.deduct.type] drafting Unclear presentation style of template argument deduction rules Not resolved
2329[class.copy.assign] open Virtual base classes and generated assignment operators Not resolved
2330[temp.spec] CD5 Missing references to variable templates Unknown
2331[basic.scope.class] CD6 Redundancy in description of class scope N/A
2332[dcl.type.simple] CD5 template-name as simple-type-name vs injected-class-name Unknown
2333[lex.ccon] CD6 Escape sequences in UTF-8 character literals Unknown
2334[intro.object] open Creation of objects by typeid Not resolved
2335[class.static.data] drafting Deduced return types vs member types Not resolved
2336[except.spec] CD5 Destructor characteristics vs potentially-constructed subobjects Unknown
2337[over.ics.rank] open Incorrect implication of logic ladder for conversion sequence tiebreakers Not resolved
2338[expr.static.cast] CD5 Undefined behavior converting to short enums with fixed underlying types Clang 12
2339[dcl.struct.bind] CD5 Underspecified template arguments in structured bindings Unknown
2340[dcl.struct.bind] open Reference collapsing and structured bindings Not resolved
2341[dcl.pre] CD5 Structured bindings with static storage duration Unknown
2342[expr.reinterpret.cast] CD5 Reference reinterpret_cast and pointer-interconvertibility Unknown
2343[temp.param] C++20 void* non-type template parameters Unknown
2344[stmt.select] NAD Redeclaration of names in init-statements Unknown
2345[stmt.if] CD5 Jumping across initializers in init-statements and conditions Unknown
2346[dcl.fct.default] CD5 Local variables in default arguments Clang 11
2347[expr.call] C++20 Passing short scoped enumerations to ellipsis Unknown
2348[stmt.if] NAD Non-templated constexpr if Unknown
2349[stmt] NAD Class/enumeration names vs conditions Unknown
2350[temp.deduct.partial] NAD Forwarding references and deduction guides Unknown
2351[expr.type.conv] CD5 void{} Clang 20
2352[dcl.init.ref] CD5 Similar types and reference binding Clang 10
2353[basic.def.odr] CD5 Potential results of a member access expression for a static data member Clang 9
2354[basic.align] CD5 Extended alignment and object representation Clang 15
2355[temp.deduct.type] CD6 Deducing noexcept-specifiers Unknown
2356[over.match.funcs] CD5 Base class copy and move constructors should not be inherited Clang 4
2357[basic.lookup.unqual] NAD Lookup in member function declarations Unknown
2358[expr.prim.lambda.capture] CD5 Explicit capture of value Clang 16
2359[dcl.init.aggr] CD5 Unintended copy initialization with designated initializers Unknown
2360[dcl.attr.unused] CD5 [[maybe_unused]] and structured bindings Unknown
2361[csetjmp.syn] open Unclear description of longjmp undefined behavior Not resolved
2362[dcl.fct.def.general] open __func__ should be constexpr Not resolved
2363[class.friend] NAD Opaque enumeration friend declarations Clang 19
2364[expr.const] NAD Constant expressions, aggregate initialization, and modifications Unknown
2365[expr.dynamic.cast] CD5 Confusing specification for dynamic_cast Unknown
2366[basic.start.static] CD5 Can default initialization be constant initialization? Unknown
2367[basic.def.odr] NAD Lambdas in default arguments vs the ODR Unknown
2368[expr.const] CD5 Differences in relational and three-way constant comparisons Unknown
2369[temp.deduct] CD6 Ordering between constraints and substitution Partial
2370[basic.lookup.unqual] CD6 friend declarations of namespace-scope functions No
2371[basic.def] CD5 Use of the English term “attributes” is confusing Unknown
2372[basic.link] CD5 Incorrect matching rules for block-scope extern declarations Unknown
2373[temp.func.order] CD5 Incorrect handling of static member function templates in partial ordering Unknown
2374[dcl.init.list] C++20 Overly permissive specification of enum direct-list-initialization Unknown
2375[class.static.data] NAD Multiple redeclarations of constexpr static data members Unknown
2376[over.match.class.deduct] CD5 Class template argument deduction with array declarator Clang 21
2377[over.match.viable] NAD Explicit copy constructor vs function viability Unknown
2378[expr.prim.lambda.capture] C++20 Inconsistent grammar for reference init-capture of pack Unknown
2379[temp.friend] CD5 Missing prohibition against constexpr in friend declaration Unknown
2380[basic.def.odr] CD5 capture-default makes too many references odr-usable Unknown
2381[expr.type] CD5 Composite pointer type of pointers to plain and noexcept member functions Unknown
2382[expr.new] CD5 Array allocation overhead for non-allocating placement new Unknown
2383[temp.param] NAD Variadic member functions of variadic class templates Unknown
2384[temp.deduct.conv] CD5 Conversion function templates and qualification conversions Unknown
2385[expr.prim.id.qual] CD5 Lookup for conversion-function-ids N/A
2386[dcl.struct.bind] CD5 tuple_size requirements for structured binding Clang 9
2387[basic.link] CD5 Linkage of const-qualified variable template Clang 9
2388[dcl.attr.grammar] NAD Applicability of contract-attribute-specifiers Unknown
2389[dcl.spec.auto] CD6 Agreement of deduced and explicitly-specified variable types Unknown
2390[cpp.cond] CD5 Is the argument of __has_cpp_attribute macro-expanded? Clang 14
2391[temp.variadic] dup Additional template parameters following pack expansion Unknown
2392[expr.const] C++23 new-expression size check and constant evaluation Unknown
2393[expr.pseudo] NAD Pseudo-destructors and object lifetime Unknown
2394[class.default.ctor] CD5 Const-default-constructible for members Clang 15
2395[temp.param] drafting Parameters following a pack expansion Not resolved
2396[expr.prim.id.qual] CD6 Lookup of names in complex conversion-type-ids No
2397[dcl.array] CD6 auto specifier for pointers and references to arrays Clang 17
2398[temp.arg.template] drafting Template template parameter matching and deduction Not resolved
2399[expr.assign] CD5 Unclear referent of “expression” in assignment-expression Unknown
2400[expr.const] CD5 Constexpr virtual functions and temporary objects Unknown
2401[temp.arg.nontype] C++20 Array decay vs prohibition of subobject non-type arguments Unknown
2402[lex.ccon] CD6 When is the restriction to a single c-char in a Unicode literal enforced? Unknown
2403[class.base.init] drafting Temporary materialization and base/member initialization Not resolved
2404[class.mem] CD5 [[no_unique_address]] and allocation order Unknown
2405[temp.dep.expr] CD6 Additional type-dependent expressions Unknown
2406[dcl.attr.fallthrough] CD5 [[fallthrough]] attribute and iteration statements Clang 5
2407[diff] C++23 Missing entry in Annex C for defaulted comparison operators Unknown
2408[dcl.init.aggr] NAD Temporaries and previously-initialized elements in aggregate initialization Unknown
2409[temp.expl.spec] drafting Explicit specializations of constexpr static data members Not resolved
2410[dcl.constexpr] C++23 Implicit calls of immediate functions Unknown
2411[temp.type] C++20 Comparison of pointers to members in template non-type arguments Unknown
2412[dcl.spec.auto] review SFINAE vs undeduced placeholder type Not resolved
2413[temp.res] CD6 typename in conversion-function-ids Unknown
2414[class.compare.default] C++20 Unclear results if both member and friend operator<=> are declared Unknown
2415[class.copy.assign] NAD using-declarations vs copy assignment operators Unknown
2416[temp.expl.spec] C++20 Explicit specializations vs constexpr and consteval Unknown
2417[except.spec] open Explicit instantiation and exception specifications Not resolved
2418[expr.const] CD5 Missing cases in definition of “usable in constant expressions” Unknown
2419[expr.add] C++20 Loss of generality treating pointers to objects as one-element arrays Unknown
2420[except.spec] dup Exception specifications in explicit instantiation Unknown
2421[temp.explicit] drafting Explicit instantiation of constrained member functions Not resolved
2422[temp.deduct.guide] C++20 Incorrect grammar for deduction-guide Unknown
2423[basic.pre] NAD Typedefs, names, and entities Unknown
2424[dcl.constexpr] C++20 constexpr initialization requirements for variant members Unknown
2425[over.match.class.deduct] open Confusing wording for deduction from a type Not resolved
2426[except.ctor] C++20 Reference to destructor that cannot be invoked Unknown
2427[expr.assign] C++20 Deprecation of volatile operands and unevaluated contexts Unknown
2428[temp.concept] C++23 Deprecating a concept Clang 19
2429[stmt.dcl] C++20 Initialization of thread_local variables referenced by lambdas Unknown
2430[class.mem] C++20 Completeness of return and parameter types of member functions Clang 2.7
2431[basic.exec] C++20 Full-expressions and temporaries bound to references Unknown
2432[class.spaceship] C++20 Return types for defaulted <=> Unknown
2433[basic.def.odr] C++20 Variable templates in the ODR Unknown
2434[class.temporary] review Mandatory copy elision vs non-class objects Not resolved
2435[temp.spec] open Alias template specializations Not resolved
2436[dcl.fct.def.coroutine] C++20 Copy semantics of coroutine parameters Unknown
2437[class.spaceship] C++20 Conversion of std::strong_ordering in a defaulted operator<=> Unknown
2438[conv.qual] open Problems in the specification of qualification conversions Not resolved
2439[expr.const] C++20 Undefined term in definition of “usable in constant expressions” Unknown
2440[expr.const] C++23 Allocation in core constant expressions Unknown
2441[dcl.inline] C++20 Inline function parameters Unknown
2442[over.match.viable] C++20 Incorrect requirement for default arguments Unknown
2443[module.interface] C++23 Meaningless template exports Unknown
2444[basic.start.dynamic] drafting Constant expressions in initialization odr-use Not resolved
2445[temp.func.order] C++20 Partial ordering with rewritten candidates Clang 19
2446[temp.dep.expr] C++20 Questionable type-dependency of concept-ids Unknown
2447[dcl.spec.auto] C++20 Unintended description of abbreviated function templates Unknown
2448[basic.fundamental] CD6 Cv-qualification of arithmetic types and deprecation of volatile Unknown
2449[expr.unary.op] extension Thunks as an implementation technique for pointers to virtual functions Extension
2450[temp.names] CD7 braced-init-list as a template-argument Clang 18
2451[dcl.fct.def.coroutine] C++23 promise.unhandled_exception() and final suspend point Unknown
2452[stmt.return.coroutine] CD6 Flowing off the end of a coroutine Unknown
2453[dcl.spec.auto.general] NAD Deduced return types and coroutine lambdas Unknown
2454[expr.await] NAD Tail recursion and coroutine symmetric transfer Unknown
2455[lex.phases] CD6 Concatenation of string literals vs translation phases 5 and 6 Unknown
2456[expr.const] open Viable user-defined conversions in converted constant expressions Not resolved
2457[temp.dep.type] CD6 Unexpanded parameter packs don't make a function type dependent Unknown
2458[expr.ref] CD6 Value category of expressions denoting non-static member functions Unknown
2459[temp.arg.nontype] CD7 Template parameter initialization Clang 18
2460[dcl.link] CD6 C language linkage and constrained non-template friends Unknown
2461[temp.res] CD6 Diagnosing non-bool type constraints Unknown
2462[temp.res.general] open Problems with the omission of the typename keyword Not resolved
2463[class.prop] open Trivial copyability and unions with non-trivial members Not resolved
2464[ptr.launder] CD6 Constexpr launder and unions Unknown
2465[dcl.fct.def.coroutine] CD6 Coroutine parameters passed to a promise constructor Unknown
2466[expr.await] CD6 co_await should be a single evaluation Unknown
2467[over.match.class.deduct] drafting CTAD for alias templates and the deducible check Not resolved
2468[temp.res.general] open Omission of the typename keyword in a member template parameter list Not resolved
2469[intro.object] drafting Implicit object creation vs constant expressions Not resolved
2470[intro.object] CD6 Multiple array objects providing storage for one object Unknown
2471[over.match.class.deduct] drafting Nested class template argument deduction Not resolved
2472[expr.await] NAD Value categories in await-expressions Unknown
2473[expr.prim.id.dtor] open Parentheses in pseudo-destructor calls Not resolved
2474[expr.delete] CD6 Cv-qualification and deletion Unknown
2475[basic.fundamental] C++23 Object declarations of type cv void Unknown
2476[dcl.spec.auto.general] CD7 placeholder-type-specifiers and function declarators Unknown
2477[class.copy.ctor] CD6 Defaulted vs deleted copy constructors/assignment operators Unknown
2478[temp.expl.spec] C++23 Properties of explicit specializations of implicitly-instantiated class templates Unknown
2479[basic.start.main] CD6 Missing specifications for consteval and constinit Unknown
2480[basic.lookup.general] drafting Lookup for enumerators in modules Not resolved
2481[dcl.init.ref] CD6 Cv-qualification of temporary to which a reference is bound Unknown
2482[bit.cast] CD6 bit_cast and indeterminate values Unknown
2483[dcl.link] C++23 Language linkage of static member functions Unknown
2484[conv.prom] CD6 char8_t and char16_t in integral promotions Unknown
2485[conv.prom] CD7 Bit-fields in integral promotions Unknown
2486[expr.call] CD6 Call to noexcept function via noexcept(false) pointer/lvalue Clang 4 (C++17 onwards)
2487[temp.dep.expr] drafting Type dependence of function-style cast to incomplete array type Not resolved
2488[basic.scope.scope] open Overloading virtual functions and functions with trailing requires-clauses Not resolved
2489[intro.object] C++23 Storage provided by array of char Unknown
2490[expr.const] CD6 Restrictions on destruction in constant expressions Unknown
2491[module.interface] CD6 Export of typedef after its first declaration Unknown
2492[over.ics.list] open Comparing user-defined conversion sequences in list-initialization Not resolved
2493[dcl.spec.auto.general] dup auto as a conversion-type-id Unknown
2494[basic.def.odr] CD6 Multiple definitions of non-odr-used entities Unknown
2495[stmt.return] open Glvalue result of a function call Not resolved
2496[class.virtual] CD6 ref-qualifiers and virtual overriding Clang 21
2497[temp.point] drafting Points of instantiation for constexpr function templates Not resolved
2498[temp.deduct.general] open Partial specialization failure and the immediate context Not resolved
2499[basic.compound] CD6 Inconsistency in definition of pointer-interconvertibility Unknown
2500[expr.static.cast] extension noexcept(false) functions and noexcept expressions Extension
2501[temp.explicit] drafting Explicit instantiation and trailing requires-clauses Not resolved
2502[basic.scope.block] CD6 Unintended declaration conflicts in nested statement scopes Unknown
2503[expr.prim.id] drafting Unclear relationship among name, qualified name, and unqualified name Not resolved
2504[class.inhctor.init] CD7 Inheriting constructors from virtual base classes No
2505[namespace.unnamed] drafting Nested unnamed namespace of inline unnamed namespace Not resolved
2506[dcl.struct.bind] CD6 Structured bindings and array cv-qualifiers Unknown
2507[over.oper.general] CD6 Default arguments for operator[] Unknown
2508[temp.local] C++23 Restrictions on uses of template parameter names Unknown
2509[expr.prim.lambda.general] CD6 decl-specifier-seq in lambda-specifiers Unknown
2510[class.mem.general] NAD noexcept-specifier of friend function vs class completeness Unknown
2511[class.bit] CD6 cv-qualified bit-fields Unknown
2512[expr.typeid] NAD typeid and incomplete class types Clang 2.7
2513[class.conv.fct] open Ambiguity with requires-clause and operator-function-id Not resolved
2514[basic.life] open Modifying const subobjects Not resolved
2515[expr.call] open Result of a function call Not resolved
2516[basic.scope.pdecl] C++23 Locus of enum-specifier or opaque-enum-declaration Clang 3.0
2517[expr.prim.req.nested] C++23 Useless restriction on use of parameter in constraint-expression Clang 21
2518[intro.compliance.general] C++23 Conformance requirements and #error/#warning Clang 17
2519[basic.types.general] CD7 Object representation of a bit-field Unknown
2520[defns.signature.templ] C++23 Template signature and default template arguments Unknown
2521[over.literal] C++23 User-defined literals and reserved identifiers Clang 17
2522[cpp.concat] open Removing placemarker tokens and retention of whitespace Not resolved
2523[expr.const] C++23 Undefined behavior via omitted destructor call in constant expressions Unknown
2524[over.ics.rank] NAD Distinguishing user-defined conversion sequences by ref-qualifier Unknown
2525[over.best.ics.general] open Incorrect definition of implicit conversion sequence Not resolved
2526[expr.rel] C++23 Relational comparison of void* pointers Unknown
2527[dcl.attr.nouniqueaddr] NAD Non-class potentially-overlapping objects Unknown
2528[expr.arith.conv] C++23 Three-way comparison and the usual arithmetic conversions Unknown
2529[expr.const] C++23 Constant destruction of constexpr references Unknown
2530[basic.def.odr] C++23 Multiple definitions of enumerators Unknown
2531[dcl.constexpr] CD7 Static data members redeclared as constexpr Unknown
2532[expr.new] open Kind of pointer value returned by new T[0] Not resolved
2533[basic.stc] CD7 Storage duration of implicitly created objects Unknown
2534[expr.ref] CD6 Value category of pseudo-destructor expression Unknown
2535[expr.ref] CD6 Type punning in class member access Unknown
2536[expr.const] open Partially initialized variables during constant initialization Not resolved
2537[dcl.fct] drafting Overbroad grammar for parameter-declaration Not resolved
2538[dcl.attr.grammar] C++23 Can standard attributes be syntactically ignored? Unknown
2539[class.spaceship] C++23 Three-way comparison requiring strong ordering for floating-point types Unknown
2540[lex.ccon] CD6 Unspecified interpretation of numeric-escape-sequence Unknown
2541[module.unit] open Linkage specifications, module purview, and module attachment Not resolved
2542[expr.prim.lambda.closure] CD7 Is a closure type a structural type? Unknown
2543[dcl.constinit] C++23 constinit and optimized dynamic initialization Unknown
2544[basic.compound] open Address of past-the-end of a potentially-overlapping subobject Not resolved
2545[expr.const] open Transparently replacing objects in constant expressions Not resolved
2546[class.compare.secondary] CD7 Defaulted secondary comparison operators defined as deleted Unknown
2547[dcl.fct.def.default] CD7 Defaulted comparison operator function for non-classes Clang 20
2548[expr.add] NAD Array prvalues and additive operators Unknown
2549[expr.prim.id.qual] CD7 Implicitly moving the operand of a throw-expression in unevaluated contexts Unknown
2550[dcl.ref] CD7 Type "reference to cv void" outside of a declarator Unknown
2551[basic.life] review "Refers to allocated storage" has no meaning Not resolved
2552[expr.const] CD7 Constant evaluation of non-defining variable declarations Unknown
2553[dcl.fct] review Restrictions on explicit object member functions Not resolved
2554[class.virtual] review Overriding virtual functions, also with explicit object parameters Not resolved
2555[namespace.udecl] tentatively ready Ineffective redeclaration prevention for using-declarators Not resolved
2556[stmt.return.coroutine] CD7 Unusable promise::return_void Unknown
2557[expr.ref] review Class member access referring to an unrelated class Not resolved
2558[expr.const] C++23 Uninitialized subobjects as a result of an immediate invocation Unknown
2559[expr.const] open Defaulted consteval functions Not resolved
2560[expr.prim.req.general] CD7 Parameter type determination in a requirement-parameter-list Unknown
2561[expr.prim.lambda.closure] CD7 Conversion to function pointer for lambda with explicit object parameter No
2562[dcl.fct.def.coroutine] open Exceptions thrown during coroutine startup Not resolved
2563[dcl.fct.def.coroutine] review Initialization of coroutine result object Not resolved
2564[over.call.object] drafting Conversion to function pointer with an explicit object parameter Not resolved
2565[expr.prim.req.general] open Invalid types in the parameter-declaration-clause of a requires-expression Not resolved
2566[expr.new] review Matching deallocation for uncaught exception Not resolved
2567[class.member.lookup] NAD Operator lookup ambiguity Unknown
2568[class.compare.default] CD7 Access checking during synthesis of defaulted comparison operator Unknown
2569[expr.prim.id.unqual] CD6 Use of decltype(capture) in a lambda's parameter-declaration-clause Unknown
2570[dcl.fct.def.default] CD7 Clarify constexpr for defaulted functions Unknown
2571[expr.sub] CD6 Evaluation order for subscripting Unknown
2572[over.over] review Address of overloaded function with no target Not resolved
2573[lex.phases] CD7 Undefined behavior when splicing results in a universal-character-name Unknown
2574[lex.pptoken] CD7 Undefined behavior when lexing unmatched quotes Unknown
2575[cpp.cond] open Undefined behavior when macro-replacing "defined" operator Not resolved
2576[cpp.include] open Undefined behavior with macro-expanded #include directives Not resolved
2577[cpp.replace.general] open Undefined behavior for preprocessing directives in macro arguments Not resolved
2578[cpp.stringize] CD7 Undefined behavior when creating an invalid string literal via stringizing Unknown
2579[cpp.concat] CD7 Undefined behavior when token pasting does not create a preprocessing token Unknown
2580[cpp.line] CD7 Undefined behavior with #line Unknown
2581[cpp.predefined] open Undefined behavior for predefined macros Not resolved
2582[class.member.lookup] CD6 Differing member lookup from nested classes Unknown
2583[class.mem.general] C++23 Common initial sequence should consider over-alignment Clang 19
2584[temp.over.link] open Equivalent types in function template declarations Not resolved
2585[dcl.fct.def.coroutine] CD6 Name lookup for coroutine allocation Unknown
2586[class.compare.default] CD6 Explicit object parameter for assignment and comparison Clang 20
2587[intro.races] review Visible side effects and initial value of an object Not resolved
2588[class.friend] CD7 friend declarations and module linkage Unknown
2589[temp.constr.atomic] review Context of access checks during constraint satisfaction checking Not resolved
2590[dcl.enum] C++23 Underlying type should determine size and alignment requirements of an enum Unknown
2591[class.union.general] CD7 Implicit change of active union member for anonymous union in union Unknown
2592[expr.new] open Missing definition for placement allocation/deallocation function Not resolved
2593[expr.mptr.oper] review Insufficient base class restriction for pointer-to-member expression Not resolved
2594[basic.start.main] CD6 Disallowing a global function template main Unknown
2595[special] CD7 "More constrained" for eligible special member functions Unknown
2596[temp.inst] drafting Instantiation of constrained non-template friends Not resolved
2597[module.unit] CD6 Replaceable allocation and deallocation functions in the global module Unknown
2598[basic.types.general] C++23 Unions should not require a non-static data member of literal type Clang 18
2599[expr.call] C++23 What does initializing a parameter include? Unknown
2600[temp.dep.expr] CD7 Type dependency of placeholder types Unknown
2601[except.ctor] C++23 Tracking of created and destroyed subobjects Unknown
2602[dcl.constexpr] C++23 consteval defaulted functions Unknown
2603[temp.over.link] C++23 Holistic functional equivalence for function templates Unknown
2604[temp.expl.spec] C++23 Attributes for an explicit specialization Unknown
2605[class.prop] C++23 Implicit-lifetime aggregates Unknown
2606[expr.static.cast] CD6 static_cast from "pointer to void" does not handle similar types Unknown
2607[module.interface] drafting Visibility of enumerator names Not resolved
2608[temp.arg.explicit] CD6 Omitting an empty template argument list Unknown
2609[expr.sizeof] open Padding in class types Not resolved
2610[dcl.init.aggr] C++23 Indirect private base classes in aggregates Unknown
2611[temp.variadic] C++23 Missing parentheses in expansion of fold-expression could cause syntactic reinterpretation Unknown
2612[dcl.init.general] C++23 Incorrect comment in example Unknown
2613[dcl.fct.def.coroutine] C++23 Incomplete definition of resumer Unknown
2614[expr.ref] C++23 Unspecified results for class member access Unknown
2615[cpp.cond] C++23 Missing __has_cpp_attribute(assume) Unknown
2616[stmt] C++23 Imprecise restrictions on break and continue Unknown
2617[temp.param] review Default template arguments for template members of non-template classes Not resolved
2618[temp.deduct.general] C++23 Substitution during deduction should exclude exception specifications Unknown
2619[dcl.init.aggr] C++23 Kind of initialization for a designated-initializer-list Unknown
2620[dcl.ambig.res] C++23 Nonsensical disambiguation rule Unknown
2621[enum.udecl] C++23 Kind of lookup for using enum declarations Superseded by 2877
2622[implimits] C++23 Compounding types from function and pointer-to-member types Unknown
2623[expr.new] drafting Invoking destroying operator delete for constructor failure Not resolved
2624[expr.delete] C++23 Array delete expression with no array cookie Unknown
2625[basic.life] C++23 Deletion of pointer to out-of-lifetime object Unknown
2626[expr.unary.op] C++23 Rephrase ones' complement using base-2 representation Unknown
2627[dcl.init.list] C++23 Bit-fields and narrowing conversions Clang 20
2628[over.match.class.deduct] CD7 Implicit deduction guides should propagate constraints Clang 20
2629[stmt.switch] C++23 Variables of floating-point type as switch conditions Unknown
2630[class.mem.general] C++23 Syntactic specification of class completeness Clang 9
2631[expr.const] C++23 Immediate function evaluations in default arguments Clang 16
2632[intro.defs] drafting 'user-declared' is not defined Not resolved
2633[expr.const] open typeid of constexpr-unknown dynamic type Not resolved
2634[dcl.type.elab] CD7 Avoid circularity in specification of scope for friend class declarations Unknown
2635[dcl.pre] C++23 Constrained structured bindings Clang 16
2636[ub] C++23 Update Annex E based on Unicode 15.0 UAX #31 N/A
2637[class.pre] CD7 Injected-class-name as a simple-template-id Unknown
2638[dcl.init.list] CD7 Improve the example for initializing by initializer list Unknown
2639[lex.phases] C++23 new-lines after phase 1 Unknown
2640[lex.charset] C++23 Allow more characters in an n-char sequence Clang 16
2641[lex.literal] C++23 Redundant specification of value category of literals Unknown
2642[class.member.lookup] C++23 Inconsistent use of T and C N/A
2643[basic.types.general] C++23 Completing a pointer to array of unknown bound Unknown
2644[expr.prim.lambda.capture] C++23 Incorrect comment in example Clang 8
2645[expr.call] C++23 Unused term "default argument promotions" Unknown
2646[dcl.fct.def.default] C++23 Defaulted special member functions Unknown
2647[expr.const] C++23 Fix for "needed for constant evaluation" Unknown
2648[over.call] C++23 Correspondence of surrogate call function and conversion function Unknown
2649[over.call.object] C++23 Incorrect note about implicit conversion sequence Unknown
2650[temp.deduct.general] C++23 Incorrect example for ill-formed non-type template arguments Clang 17
2651[temp.deduct.conv] C++23 Conversion function templates and "noexcept" Unknown
2652[cpp.predefined] C++23 Overbroad definition of __STDCPP_BFLOAT16_T__ Unknown
2653[dcl.fct] C++23 Can an explicit object parameter have a default argument? Clang 18
2654[expr.assign] C++23 Un-deprecation of compound volatile assignments Clang 16
2655[temp.inst] NAD Instantiation of default arguments in lambda-expressions Unknown
2656[expr.const] drafting Converting consteval lambda to function pointer in non-immediate context Not resolved
2657[dcl.init.ref] CD7 Cv-qualification adjustment when binding reference to temporary Unknown
2658[expr.const] C++23 Trivial copying of unions in core constant expressions Unknown
2659[cpp.predefined] C++23 Missing feature-test macro for lifetime extension in range-for loop Unknown
2660[expr.call] open Confusing term "this parameter" Not resolved
2661[class.mem.general] CD7 Missing disambiguation rule for pure-specifier vs. brace-or-equal-initializer Unknown
2662[class.access.general] C++23 Example for member access control vs. overload resolution Unknown
2663[namespace.udecl] CD7 Example for member redeclarations with using-declarations Unknown
2664[over.match.class.deduct] C++23 Deduction failure in CTAD for alias templates Unknown
2665[basic.life] NAD Replacing a subobject with a complete object Unknown
2666[class.temporary] open Lifetime extension through static_cast Not resolved
2667[cpp.import] C++23 Named module imports do not import macros Unknown
2668[expr.await] CD7 co_await in a lambda-expression Unknown
2669[class.base.init] open Lifetime extension for aggregate initialization Not resolved
2670[basic.link] open Programs and translation units Not resolved
2671[dcl.meaning.general] open friend named by a template-id Not resolved
2672[temp.deduct.general] CD7 Lambda body SFINAE is still required, contrary to intent and note Clang 18
2673[over.match.oper] C++23 User-declared spaceship vs. built-in operators Unknown
2674[class.ctor.general] C++23 Prohibit explicit object parameters for constructors Unknown
2675[class.union.general] open start_lifetime_as, placement-new, and active union members Not resolved
2676[basic.life] open Replacing a complete object having base subobjects Not resolved
2677[basic.life] review Replacing union subobjects Not resolved
2678[basic.def.odr] C++23 std::source_location::current is unimplementable Unknown
2679[over.best.ics.general] open Implicit conversion sequence with a null pointer constant Not resolved
2680[over.match.class.deduct] open Class template argument deduction for aggregates with designated initializers Not resolved
2681[over.match.class.deduct] C++23 Deducing member array type from string literal Clang 17
2682[temp.pre] C++23 Templated function vs. function template Unknown
2683[dcl.fct.default] CD7 Default arguments for member functions of templated nested classes Unknown
2684[basic.start.dynamic] open thread_local dynamic initialization Not resolved
2685[over.match.class.deduct] C++23 Aggregate CTAD, string, and brace elision Unknown
2686[temp.constr.constr] open Pack expansion into a non-pack parameter of a concept Not resolved
2687[over.match.call.general] C++23 Calling an explicit object member function via an address-of-overload-set Clang 18
2688[expr.call] open Calling explicit object member functions Not resolved
2689[basic.fundamental] CD7 Are cv-qualified std::nullptr_t fundamental types? Unknown
2690[class.copy.assign] C++23 Semantics of defaulted move assignment operator for unions Unknown
2691[lex.ccon] C++23 hexadecimal-escape-sequence is too greedy Unknown
2692[over.match.call.general] C++23 Static and explicit object member functions with the same parameter-type-lists Clang 19
2693[cpp.line] open Escape sequences for the string-literal of #line Not resolved
2694[cpp.pragma.op] open string-literals of the _Pragma operator Not resolved
2695[dcl.attr.grammar] C++23 Semantic ignorability of attributes Unknown
2696[expr.rel] dup Relational comparisons of pointers to void Unknown
2697[temp.deduct.guide] CD7 Deduction guides using abbreviated function syntax Unknown
2698[lex.icon] CD7 Using extended integer types with z suffix Unknown
2699[expr.throw] CD7 Inconsistency of throw-expression specification Unknown
2700[intro.compliance.general] CD7 #error disallows existing implementation practice Unknown
2701[dcl.fct.default] open Default arguments in multiple scopes / inheritance of array bounds in the same scope Not resolved
2702[expr.const] open Constant destruction of reference members Not resolved
2703[class.spaceship] CD7 Three-way comparison requiring strong ordering for floating-point types, take 2 Unknown
2704[dcl.init.ref] open Clarify meaning of "bind directly" Not resolved
2705[expr.ref] open Accessing ambiguous subobjects Not resolved
2706[basic.link] open Repeated structured binding declarations Not resolved
2707[temp.deduct.guide] CD7 Deduction guides cannot have a trailing requires-clause Clang 20
2708[dcl.init.general] CD7 Parenthesized initialization of arrays Unknown
2709[dcl.init.general] NAD Parenthesized initialization of reference-to-aggregate Unknown
2710[expr.const] CD7 Loops in constant expressions Unknown
2711[expr.throw] CD7 Source for copy-initializing the exception object Unknown
2712[over.match.oper] CD7 Simplify restrictions on built-in assignment operator candidates Unknown
2713[dcl.init.list] CD7 Initialization of reference-to-aggregate from designated initializer list Unknown
2714[over.match.class.deduct] CD7 Implicit deduction guides omit properties from the parameter-declaration-clause of a constructor Unknown
2715[expr.call] CD7 "calling function" for parameter initialization may not exist Unknown
2716[class.conv.fct] CD7 Rule about self-or-base conversion is normatively redundant Unknown
2717[temp.variadic] CD7 Pack expansion for alignment-specifier Unknown
2718[expr.static.cast] CD7 Type completeness for derived-to-base conversions Clang 2.7
2719[basic.align] CD7 Creating objects in misaligned storage Unknown
2720[temp.res.general] CD7 Template validity rules for templated entities and alias templates Unknown
2721[basic.life] CD7 When exactly is storage reused? Unknown
2722[expr.unary.noexcept] CD7 Temporary materialization conversion for noexcept operator Unknown
2723[basic.fundamental] CD7 Range of representable values for floating-point types Unknown
2724[expr.shift] CD7 Clarify rounding for arithmetic right shift Unknown
2725[expr.ref] CD7 Overload resolution for non-call of class member access Unknown
2726[lex.digraph] review Alternative tokens appearing as attribute-tokens Not resolved
2727[module.import] open Importing header units synthesized from source files Not resolved
2728[expr.delete] CD7 Evaluation of conversions in a delete-expression Unknown
2729[expr.new] CD7 Meaning of new-type-id Unknown
2730[over.match.oper] open Comparison templates on enumeration types Not resolved
2731[over.ics.user] open List-initialization sequence with a user-defined conversion Not resolved
2732[module.import] CD7 Can importable headers react to preprocessor state from point of import? Unknown
2733[dcl.attr.unused] CD7 Applying [[maybe_unused]] to a label Unknown
2734[expr.const] open Immediate forward-declared function templates Not resolved
2735[over.match.best] open List-initialization and conversions in overload resolution Not resolved
2736[class.prop] open Standard layout class with empty base class also in first member Not resolved
2737[expr.prim.lambda.capture] review Temporary lifetime extension for reference init-captures Not resolved
2738[expr.prim.id.unqual] review "denotes a destructor" is missing specification Not resolved
2739[expr.prim.req.nested] open Nested requirement not a constant expression Not resolved
2740[expr.const] open Too many objects have constexpr-unknown type Not resolved
2741[over.ics.list] open Implicit conversion sequence from empty list to array of unknown bound Not resolved
2742[dcl.init.list] drafting Guaranteed copy elision for brace-initialization from prvalue Not resolved
2743[class.copy.ctor] open Copying non-trivial objects nested within a union Not resolved
2744[intro.object] open Multiple objects of the same type at the same address Not resolved
2745[basic.def.odr] CD7 Dependent odr-use in generic lambdas Unknown
2746[temp.res.general] CD7 Checking of default template arguments Unknown
2747[lex.phases] CD7 Cannot depend on an already-deleted splice Unknown
2748[expr.ref] CD7 Accessing static data members via null pointer Unknown
2749[expr.rel] CD7 Treatment of "pointer to void" for relational comparisons Clang 20
2750[expr.const] CD7 construct_at without constructor call Unknown
2751[stmt.dcl] NAD Order of destruction for parameters for operator functions Unknown
2752[lex.fcon] open Excess-precision floating-point literals Not resolved
2753[intro.object] CD7 Storage reuse for string literal objects and backing arrays Unknown
2754[dcl.fct.def.coroutine] CD7 Using *this in explicit object member functions that are coroutines Unknown
2755[expr.const] CD7 Incorrect wording applied by P2738R1 Unknown
2756[class.init] review Completion of initialization by delegating constructor Not resolved
2757[class.cdtor] review Deleting or deallocating storage of an object during its construction Not resolved
2758[expr.delete] CD7 What is "access and ambiguity control"? Unknown
2759[class.mem.general] CD7 [[no_unique_address]] and common initial sequence Clang 19
2760[expr.const] CD7 Defaulted constructor that is an immediate function Unknown
2761[class.dtor] CD7 Implicitly invoking the deleted destructor of an anonymous union member Unknown
2762[over.match.funcs.general] CD7 Type of implicit object parameter Unknown
2763[expr.const] CD7 Ignorability of [[noreturn]] during constant evaluation Unknown
2764[basic.scope.scope] CD7 Use of placeholders affecting name mangling Unknown
2765[intro.object] open Address comparisons between potentially non-unique objects during constant evaluation Not resolved
2766[lex.string] open Repeated evaluation of a string-literal may yield different objects Not resolved
2767[class.union.anon] open Non-defining declarations of anonymous unions Not resolved
2768[expr.assign] CD7 Assignment to enumeration variable with a braced-init-list Unknown
2769[temp.deduct.general] open Substitution into template parameters and default template arguments should be interleaved Not resolved
2770[temp.deduct.general] open Trailing requires-clause can refer to function parameters before they are substituted into Not resolved
2771[class.mfct.non.static] CD7 Transformation for unqualified-ids in address operator Clang 18
2772[diff.cpp03.dcl.dcl] CD7 Missing Annex C entry for linkage effects of linkage-specification Unknown
2773[class.union.anon] open Naming anonymous union members as class members Not resolved
2774[temp.dep.constexpr] open Value-dependence of requires-expressions Not resolved
2775[except.throw] CD7 Unclear argument type for copy of exception object Unknown
2776[intro.compliance.general] open Substitution failure and implementation limits Not resolved
2777[temp.param] CD7 Type of id-expression denoting a template parameter object Unknown
2778[expr.const] review Trivial destructor does not imply constant destruction Not resolved
2779[lex.charset] open Restrictions on the ordinary literal encoding Not resolved
2780[expr.reinterpret.cast] CD7 reinterpret_cast to reference to function types Unknown
2781[basic.def.odr] open Unclear recursion in the one-definition rule Not resolved
2782[basic.def.odr] open Treatment of closure types in the one-definition rule Not resolved
2783[module.global.frag] CD7 Handling of deduction guides in global-module-fragment Unknown
2784[support.types.layout] open Unclear definition of member-designator for offsetof Not resolved
2785[temp.dep.expr] CD7 Type-dependence of requires-expression Unknown
2786[expr.eq] open Comparing pointers to complete objects Not resolved
2787[special] open Kind of explicit object copy/move assignment function Not resolved
2788[basic.scope.scope] open Correspondence and redeclarations Not resolved
2789[over.match.best.general] CD7 Overload resolution with implicit and explicit object member functions Clang 18
2790[over.ics.list] open Aggregate initialization and user-defined conversion sequence Not resolved
2791[stmt.return] CD7 Unclear phrasing about "returning to the caller" Unknown
2792[expr.unary.noexcept] CD7 Clean up specification of noexcept operator Unknown
2793[basic.scope.block] CD7 Block-scope declaration conflicting with parameter name Unknown
2794[temp.alias] open Uniqueness of lambdas in alias templates Not resolved
2795[intro.object] CD7 Overlapping empty subobjects with different cv-qualification Unknown
2796[expr.rel] CD7 Function pointer conversions for relational operators Unknown
2797[over.match.oper] review Meaning of "corresponds" for rewritten operator candidates Not resolved
2798[expr.const] CD7 Manifestly constant evaluation of the static_assert message Clang 17
2799[class.default.ctor] drafting Inheriting default constructors Not resolved
2800[expr.const] review Instantiating constexpr variables for potential constant evaluation Not resolved
2801[dcl.init.ref] CD7 Reference binding with reference-related types Unknown
2802[dcl.fct] open Constrained auto and redeclaration with non-abbreviated syntax Not resolved
2803[over.ics.ref] CD7 Overload resolution for reference binding of similar types Unknown
2804[over.match.oper] open Lookup for determining rewrite targets Not resolved
2805[expr.delete] open Underspecified selection of deallocation function Not resolved
2806[temp.res.general] CD7 Make a type-requirement a type-only context Unknown
2807[class.dtor] CD7 Destructors declared consteval Unknown
2808[temp.inst] review Explicit specialization of defaulted special member function Not resolved
2809[dcl.fct.def.default] CD7 An implicit definition does not redeclare a function Unknown
2810[temp.res.general] CD7 Requiring the absence of diagnostics for templates Unknown
2811[basic.start.main] CD7 Clarify "use" of main Clang 3.5
2812[expr.new] open Allocation with explicit alignment Not resolved
2813[expr.ref] CD7 Class member access with prvalues Clang 20
2814[expr.static.cast] NAD Alignment requirement of incomplete class type Unknown
2815[over.ics.rank] CD7 Overload resolution for references/pointers to noexcept functions Unknown
2816[intro.progress] review Unclear phrasing "may assume ... eventually" Not resolved
2817[expr.sizeof] open sizeof(abstract class) is underspecified Not resolved
2818[lex.name] CD7 Use of predefined reserved identifiers Unknown
2819[expr.const] CD7 Cast from null pointer value in a constant expression Clang 19 (C++26 onwards)
2820[dcl.init.general] CD7 Value-initialization and default constructors Unknown
2821[basic.life] review Lifetime, zero-initialization, and dynamic initialization Not resolved
2822[basic.stc.general] CD7 Side-effect-free pointer zap Unknown
2823[expr.unary.op] CD7 Implicit undefined behavior when dereferencing pointers No
2824[dcl.init.general] CD7 Copy-initialization of arrays Unknown
2825[stmt.ranged] CD7 Range-based for statement using a braced-init-list Unknown
2826[class.temporary] drafting Missing definition of "temporary expression" Not resolved
2827[basic.fundamental] review Representation of unsigned integral types Not resolved
2828[expr.cast] CD7 Ambiguous interpretation of C-style cast Unknown
2829[over.best.ics.general] open Redundant case in restricting user-defined conversion sequences Not resolved
2830[dcl.init.list] CD7 Top-level cv-qualification should be ignored for list-initialization Unknown
2831[dcl.decl.general] CD7 Non-templated function definitions and requires-clauses Unknown
2832[class.temporary] open Invented temporary variables and temporary objects Not resolved
2833[basic.start.dynamic] review Evaluation of odr-use Not resolved
2834[temp.func.order] review Partial ordering and explicit object parameters Not resolved
2835[basic.scope.scope] open Name-independent declarations Not resolved
2836[conv.rank] CD7 Conversion rank of long double and extended floating-point types Unknown
2837[class.copy.ctor] open Instantiating and inheriting by-value copy constructors Not resolved
2838[basic.scope.block] open Declaration conflicts in lambda-expressions Not resolved
2839[class.dtor] open Explicit destruction of base classes Not resolved
2840[basic.align] open Missing requirements for fundamental alignments Not resolved
2841[class.ctor.general] open When do const objects start being const? Not resolved
2842[over.ics.rank] open Preferring an initializer_list over a single value Not resolved
2843[intro.refs] CD7 Undated reference to Unicode makes C++ a moving target Unknown
2844[over.match.oper] open Enumerating a finite set of built-in candidates Not resolved
2845[expr.prim.lambda.closure] CD7 Make the closure type of a captureless lambda a structural type Unknown
2846[dcl.fct] CD7 Out-of-class definitions of explicit object member functions Unknown
2847[temp.expl.spec] review Constrained explicit specializations of function templates at class scope Not resolved
2848[temp.explicit] CD7 Omitting an empty template argument list for explicit instantiation Unknown
2849[class.temporary] CD7 Parameter objects are not temporary objects Unknown
2850[basic.stc] CD7 Unclear storage duration for function parameter objects Unknown
2851[expr.const] CD7 Allow floating-point conversions in converted constant expressions Unknown
2852[class.mem.general] open Complete-class contexts and class-scope lambdas Not resolved
2853[expr.add] CD7 Pointer arithmetic with pointer to hypothetical element Unknown
2854[except.throw] CD7 Storage duration of exception objects Unknown
2855[expr.post.incr] CD7 Undefined behavior in postfix increment Unknown
2856[over.match.list] CD7 Copy-list-initialization with explicit default constructors Unknown
2857[basic.lookup.argdep] CD7 Argument-dependent lookup with incomplete class types No
2858[expr.prim.id.qual] CD7 Declarative nested-name-specifiers and pack-index-specifiers Clang 19
2859[dcl.init.general] CD7 Value-initialization with multiple default constructors Unknown
2860[basic.life] dup Remove and fix the term "vacuous initialization" Unknown
2861[expr.dynamic.cast] CD7 dynamic_cast on bad pointer value Unknown
2862[temp.pre] review Unclear boundaries of template declarations Not resolved
2863[basic.life] drafting Unclear synchronization requirements for object lifetime rules Not resolved
2864[dcl.init.list] CD7 Narrowing floating-point conversions Unknown
2865[expr.cond] CD7 Regression on result of conditional operator Unknown
2866[dcl.attr] open Observing the effects of [[no_unique_address]] Not resolved
2867[dcl.struct.bind] CD7 Order of initialization for structured bindings Unknown
2868[class.temporary] open Self-references in trivially copyable objects as function return values Not resolved
2869[expr.prim.this] CD7 this in local classes Unknown
2870[lex.string] CD7 Combining absent encoding-prefixes Unknown
2871[class.default.ctor] CD7 User-declared constructor templates inhibiting default constructors Unknown
2872[basic.link] CD7 Linkage and unclear "can be referred to" Unknown
2873[over.over] open Taking the address of a function involving template argument deduction Not resolved
2874[dcl.type.elab] CD7 Qualified declarations of partial specializations Unknown
2875[diff.expr] tentatively ready Missing support for round-tripping null pointer values through indirection/address operators Not resolved
2876[dcl.fct.def.general] CD7 Disambiguation of T x = delete("text") Unknown
2877[enum.udecl] CD7 Type-only lookup for using-enum-declarator Clang 19
2878[expr.cast] open C-style casts to reference types Not resolved
2879[expr.const.cast] CD7 Undesired outcomes with const_cast Unknown
2880[expr.delete] CD7 Accessibility check for destructor of incomplete class type Unknown
2881[expr.prim.lambda.closure] CD7 Type restrictions for the explicit object parameter of a lambda Clang 19
2882[expr.static.cast] CD7 Unclear treatment of conversion to void Clang 2.7
2883[basic.def.odr] CD7 Definition of "odr-usable" ignores lambda scopes No
2884[dcl.type.elab] dup Qualified declarations of partial specializations Unknown
2885[class.default.ctor] review Non-eligible trivial default constructors Not resolved
2886[class.temporary] CD7 Temporaries and trivial potentially-throwing special member functions Clang 9
2887[diff.cpp03.expr] CD7 Missing compatibility entries for xvalues Unknown
2888[basic.lookup.argdep] review Missing cases for reference and array types for argument-dependent lookup Not resolved
2889[expr.delete] open Requiring an accessible destructor for destroying operator delete Not resolved
2890[class.local] CD7 Defining members of local classes Unknown
2891[implimits] CD7 Normative status of implementation limits Unknown
2892[expr.arith.conv] CD7 Unclear usual arithmetic conversions Unknown
2893[temp.inst] NAD Instantiations in discarded if constexpr substatements Unknown
2894[expr.type.conv] CD7 Functional casts create prvalues of reference type Unknown
2895[dcl.init.general] CD7 Initialization should ignore the destination type's cv-qualification Unknown
2896[temp.deduct] review Template argument deduction involving exception specifications Not resolved
2897[class.copy.assign] open Copying potentially-overlapping union subobjects Not resolved
2898[over.best.ics.general] CD7 Clarify implicit conversion sequence from cv T to T Unknown
2899[conv.lval] CD7 Bad value representations should cause undefined behavior Unknown
2900[temp.deduct.type] open Deduction of non-type template arguments with placeholder types Not resolved
2901[basic.lval] CD7 Unclear semantics for near-match aliased access Unknown
2902[expr.prim.id.general] review Implicit this transformation outside of permitted contexts Not resolved
2903[temp.names] drafting Can we omit the template disambiguator in nested-name-specifiers in type-only contexts? Not resolved
2904[temp.pre] open Introducing template-names Not resolved
2905[temp.dep.constexpr] CD7 Value-dependence of noexcept-expression Unknown
2906[expr.cond] CD7 Lvalue-to-rvalue conversion of class types for conditional operator Unknown
2907[expr.const] CD7 Constant lvalue-to-rvalue conversion on uninitialized std::nullptr_t Unknown
2908[cpp.line] CD7 Counting physical source lines for __LINE__ Unknown
2909[expr.const] CD7 Subtle difference between constant-initialized and constexpr Unknown
2910[basic.def.odr] CD7 Effect of requirement-parameter-lists on odr-usability Unknown
2911[expr.prim.req.general] CD7 Unclear meaning of expressions "appearing within" subexpressions Unknown
2912[expr.new] open Too-large value for size in array new Not resolved
2913[temp.deduct.guide] CD7 Grammar for deduction-guide has requires-clause in the wrong position Clang 20
2914[basic.start.static] review Unclear order of initialization of static and thread-local variables Not resolved
2915[dcl.fct] CD7 Explicit object parameters of type void Clang 20
2916[temp.spec.partial] review Variable template partial specializations should not be declared static Not resolved
2917[temp.pre] review Disallow multiple friend-type-specifiers for a friend template Not resolved
2918[over.over] CD7 Consideration of constraints for address of overloaded function Clang 21
2919[over.match.ref] CD7 Conversion function candidates for initialization of const lvalue reference Unknown
2920[temp.names] open The template keyword for base classes Not resolved
2921[module.interface] CD7 Exporting redeclarations of entities not attached to a named module Unknown
2922[expr.const] CD7 constexpr placement-new is too permissive Clang 20
2923[intro.progress] tentatively ready Note about infinite loops and execution steps Not resolved
2924[defns.undefined] CD7 Undefined behavior during constant evaluation Unknown
2925[expr.delete] NAD Deleting a pointer to an incomplete enumeration type Unknown
2926[basic.lookup.qual.general] drafting Lookup context for dependent qualified names Not resolved
2927[cpp.pre] CD7 Unclear status of translation unit with module keyword Unknown
2928[basic.start.dynamic] open No ordering for initializing thread-local variables Not resolved
2929[basic.start.term] review Lifetime of trivially-destructible static or thread-local objects Not resolved
2930[class.copy.elision] CD7 Unclear term "copy/move operation" in specification of copy elision Unknown
2931[over.oper.general] CD7 Restrictions on operator functions that are explicit object member functions Unknown
2932[dcl.enum] review Value range of empty enumeration Not resolved
2933[expr.type] CD7 Dangling references Unknown
2934[dcl.fct.def.coroutine] open Unclear semantics of exception escaping from unhandled_exception Not resolved
2935[dcl.fct.def.coroutine] open Destroying the coroutine state when initial-await-resume-called is false Not resolved
2936[temp.dep.type] CD7 Local classes of templated functions should be part of the current instantiation Unknown
2937[lex.phases] CD7 Grammar for preprocessing-file has no normative effect Unknown
2938[basic.link] open Inheriting linkage from a previous declaration Not resolved
2939[expr.reinterpret.cast] CD7 Do not allow reinterpret_cast from prvalue to rvalue reference Unknown
2940[intro.object] review Definition of "object" Not resolved
2941[class.temporary] open Lifetime extension for function-style cast to reference type Not resolved
2942[dcl.fct] open Packs in a function's parameter-type-list Not resolved
2943[dcl.attr.nodiscard] CD7 Discarding a void return value Unknown
2944[expr.throw] CD7 Unsequenced throw-expressions Unknown
2945[basic.link] open Redundant constraints on matching function template declarations Not resolved
2946[temp.over.link] open Dependent call equivalence in non-ADL cases Not resolved
2947[cpp.module] open Limiting macro expansion in pp-module Not resolved
2948[temp.spec.partial.general] open Late ambiguity for partial template specialization Not resolved
2949[temp.func.order] open Treatment of ellipsis during partial ordering Not resolved
2950[class.bit] open Value preservation in enumeration vs. integer bit-fields Not resolved
2951[temp.decls.general] open Distinguishing a primary template Not resolved
2952[basic.life] open Vacuous initialization for subobjects Not resolved
2953[basic.types.general] open Value representation for non-trivially-copyable types Not resolved
2954[intro.races] NAD Simultaneous modifications of an atomic object Unknown
2955[intro.execution] open Unify rules about conflicting unordered accesses Not resolved
2956[basic.lookup.qual.general] open Missing allowance for pseudo-destructors in qualified lookup Not resolved
2957[expr.ref] open Evaluating a reference member should constitute access Not resolved
2958[over.ics.rank] open Overload resolution involving lvalue transformation and qualification conversion Not resolved
2959[expr.ref] open Naming enumerators in class member access expressions Not resolved
2960[basic.life] open Introduce discontiguous object lifetime Not resolved
2961[temp.constr] open Checking of ill-formed types in constraint-expressions Not resolved
2962[expr.const] open Evaluation of destructor call for variable with constant destruction Not resolved
2963[stmt.ambig] open Paradoxical variable-or-function declaration Not resolved
2964[conv.lval] open Reading "invalid pointer values" Not resolved
2965[basic.scope.temp] open Generic lambdas do not have a template parameter scope Not resolved
2966[basic.fundamental] open Alignment and value representation of std::nullptr_t Not resolved
2967[over.match.ref] open Explicit conversion functions Not resolved
2968[basic.lookup.general] open Name lookup result for typedef-name vs. class-name Not resolved
2969[basic.scope] open Scopes in the function-try-block of a constructor Not resolved
2970[intro.races] CD7 Races with volatile sig_atomic_t bit-fields Unknown
2971[module.global.frag] open Specializations for a class are not decl-reachable Not resolved
2972[expr.prim.id.qual] open Declarative nested-name-specifier naming a partial specialization Not resolved
2973[dcl.typedef] open Does an alias-declaration introduce a name for linkage purposes? Not resolved
2974[temp.deduct.type] open Non-deduced context for qualified-id naming a template Not resolved
2975[temp.constr.normal] open Effect of concept template-head on parameter mappings Not resolved
2976[stmt.dcl] review Transferring control out of a function Not resolved
2977[dcl.init.general] review Initialization with string literals Not resolved
2978[temp.deduct.call] open Deduction involving reference to similar types Not resolved
2979[class.mem.general] open Duplicate declarations of enumerations in class scope Not resolved
2980[temp.names] open Constraints on template template parameters Not resolved
2981[expr.arith.conv] open Usual arithmetic conversions and result types Not resolved
2982[temp.deduct.decl] CD7 Deduction in type-constraints Unknown
2983[basic.pre] review Non-type template parameters are not variables Not resolved
2984[temp.dep.constexpr] open Value-dependent structured bindings Not resolved
2985[dcl.init.ref] CD7 Unclear rules for reference initialization with conversion Unknown
2986[basic.life] open Creating objects within a mutable member of a const object Not resolved
2987[expr.static.cast] CD7 Remove dilapidated wording from static_cast Unknown
2988[basic.link] open Is a closure type from a lambda-expression appearing in a concept-definition a TU-local entity? Not resolved
2989[expr.prim.paren] open Remove misleading general allowance for parentheses Not resolved
2990[module.interface] CD7 Exporting redeclarations of namespaces Unknown
2991[dcl.init.general] open "array size" is vague Not resolved
2992[basic.pre] open Labels do not have names Not resolved
2993[dcl.fct.def.general] open Body of a destructor Not resolved
2994[temp.param] open Allowing template parameters following template parameter packs that are pack expansions Not resolved
2995[stmt.return] open Meaning of flowing off the end of a function Not resolved
2996[temp.constr.atomic] open Impenetrable definition of atomic constraint Not resolved
2997[dcl.fct.def.default] open Defaulted functions with deleted definition Not resolved
2998[temp.deduct.partial] open Missing deduction consistency check for partial ordering Not resolved
2999[class.default.ctor] open Trivial unions changing existing behavior Not resolved
3000[expr.cond] review Handling of cv-qualified class types in conditional operator Not resolved
3001[basic.life] tentatively ready Inconsistent restrictions for static_cast on pointers to out-of-lifetime objects Not resolved
3002[temp.dep.temp] tentatively ready Template parameter/argument confusion Not resolved
3003[dcl.type.simple] review Naming a deducible template for class template argument deduction Not resolved
3004[expr.const] tentatively ready Pointer arithmetic on array of unknown bound Not resolved
3005[basic.scope.scope] tentatively ready Function parameters should never be name-independent Not resolved
3006[temp.explicit] review Vague restrictions for explicit instantiations of class templates Not resolved
3007[class.compare.default] open Access checking during synthesis of defaulted comparison operator, take 2 Not resolved
3008[diff.dcl] tentatively ready Missing Annex C entry for void object declarations Not resolved
3009[expr.const] open Unclear rules for constant initialization Not resolved
3010[expr.const] open constexpr placement-new should require transparent replaceability Not resolved
3011[expr.new] tentatively ready Parenthesized aggregate initialization for new-expressions Not resolved
3012[dcl.constexpr] open Deviating constexpr or consteval across translation units Not resolved
3013[cpp.embed.gen] CD7 Disallowing macros for #embed parameters Unknown
3014[cpp.embed.gen] CD7 Comma-delimited vs. comma-separated output for #embed Unknown
3015[cpp.include] CD7 Handling of header-names for #include and #embed Unknown
3016[cpp.cond] CD7 Satisfying the syntactic requirements of #include and #embed Unknown
3017[cpp.cond] open Commas in controlling expression of conditional inclusion Not resolved
3018[cpp.cond] CD7 Validity of defined in __has_embed Unknown
3019[lex.header] open Restrictions on character sequences in header-names Not resolved
3020[cpp.cond] CD7 Missing specification for __has_cpp_attribute(indeterminate) Unknown
3021[temp.constr.order] drafting Subsumption rules for fold expanded constraints Not resolved
3022[class.dtor] review Redundant specification of explicit destructor calls Not resolved
3023[dcl.init.list] open Default arguments in list-initialization Not resolved
3024[dcl.align] open Alignment of references Not resolved
3025[basic.stc.dynamic.deallocation] open Deallocation functions returning void Not resolved
3026[expr.unary.op] open Class for pointer-to-member formation Not resolved
3027[temp.type] open Equivalence of pack-index-specifiers Not resolved
3028[namespace.udecl] open A using-declarator should bind a name Not resolved
3029[basic.align] drafting Confusing note about ordinary character types for aligned memory areas Not resolved
3030[dcl.array] open Initializing array prvalues of unknown bound Not resolved
3031[over.match.funcs.general] open Finding declarations for conversion operators for access checking Not resolved
3032[temp.arg.general] tentatively ready Template argument disambiguation Not resolved
3033[basic.scope.namespace] open Scope after declarator-id before determining correspondence Not resolved
3034[temp.inst] open Infinite recursion should hit an implementation limit Not resolved
3035[class.union.anon] open Lambda expressions in anonymous unions Not resolved
3036[basic.extended.fp] open Extended floating-point types should not be cv-qualified Not resolved
3037[namespace.udecl] open Name lookup results for using-declarators Not resolved
3038[dcl.attr.grammar] open Ignorability of attributes, again Not resolved
3039[intro.object] open Undefined behavior from implicit object creation ignores observable checkpoints Not resolved
3040[dcl.fct.def.coroutine] open Mishandling of lambda coroutines Not resolved
3041[class.dtor] open Overly aggressive rule for deleting the destructor of a union Not resolved
3042[basic.lval] open Implicit object creation is insufficient to model effective type rule of C Not resolved
3043[class.temporary] open Lifetime extension for temporaries in expansion statements Not resolved
3044[stmt.expand] tentatively ready Iterating expansion statements woes Not resolved
3045[basic.scope.block] tentatively ready Regularizing environment interactions of expansion statement Not resolved
3046[dcl.enum] open Enumerations as part of the common initial sequence Not resolved
3047[basic.life] open Calling destructors on out-of-lifetime objects Not resolved
3048[stmt.expand] tentatively ready Empty destructuring expansion statements Not resolved
3049[class.prop] open Implicitly deleted move operation should not disable trivial relocation Not resolved
3050[dcl.attr.deprecated] open [[deprecated]] for class template partial specializations Not resolved
3051[class.mem.general] open Missing specification for types of member subobjects Not resolved
3052[stmt.return] open Unclear handling of checks on discarded return statements Not resolved
3053[cpp.replace.general] tentatively ready Allowing #undef likely Not resolved
3054[expr.call] open Use of default arguments depending on shape of postfix-expression in a function call Not resolved
3055[over.call.object] open Misleading body for surrogate call function Not resolved
3056[expr.prim.req.type] open Missing semicolons in grammar for type-requirement Not resolved
3057[over.ics.ref] open Ranking of derived-to-base conversions should ignore reference binding Not resolved
3058[basic.lookup.general] open "Program point" is not defined Not resolved
3059[expr.const] open throw; in constant expressions Not resolved
3060[basic.start.main] open Change in behavior for noexcept main Not resolved
3061[stmt.expand] tentatively ready Trailing comma in an expansion-init-list Not resolved
3062[dcl.fct.default] open Overlapping specification of default template arguments Not resolved
3063[class.temporary] open Lifetime extension of temporaries past function return Not resolved
3064[basic.life] open Mishandling of placement-new in lifetime rules Not resolved
3065[basic.types.general] open Reachability and completeness of types Not resolved
3066[expr.prim.id.qual] tentatively ready Declarative nested-name-specifier in explicit instantiation Not resolved
3067[conv.array] open Array-to-pointer conversion with object type mismatch Not resolved
3068[class.access.general] open Access checking in friends involving qualified-ids Not resolved
3069[temp.constr.normal] open Reference to wrong placeholder Not resolved
3070[class.copy.assign] open Trivial assignment can skip member subobjects Not resolved
3071[dcl.struct.bind] open Negative tuple_size in structured bindings Not resolved
3072[temp.deduct.general] open Incorrect examples for lambda SFINAE Not resolved
3073[over.match.ref] open Dependence of R on T2 is unclear Not resolved
3074[cpp.module] tentatively ready Redundant ill-formedness for module macros Not resolved
3075[cpp.import] tentatively ready Unclear matching of import directive Not resolved
3076[cpp.include] tentatively ready Remove unnecessary IFNDR for malformed header-name-tokens Not resolved
3077[cpp.pre] tentatively ready Undesirable formation of import directive with string-literal Not resolved
3078[cpp.include] review Different treatment of #include pp-tokens and header-name-tokens Not resolved
3079[class.union.anon] open Allow empty-declarations in anonymous unions Not resolved
3080[temp.arg.template] tentatively ready Clarify kinds of permitted template template arguments Not resolved
3081[expr.ref] review Require glvalue when splicing direct base class relationship Not resolved
3082[expr.reinterpret.cast] tentatively ready Allow for call-compatible function types in reinterpret_cast Not resolved
3083[stmt.pre] tentatively ready Remove redundant restrictions on class and enum definitions Not resolved
3084[stmt.cont] tentatively ready compound-statements inside iteration-statements Not resolved
3085[stmt.pre] tentatively ready Apply restriction inside for-range-declaration Not resolved
3086[cpp.pragma.op] tentatively ready Destringizing should consider all sorts of encoding-prefixes Not resolved
3087[cpp.pragma.op] open Destringizing for raw string literals Not resolved
3088[cpp.replace.general] open Clarify macro treatment of identifiers with special meaning Not resolved
3089[dcl.init.general] tentatively ready const-default-constructible improperly handles std::meta::info Not resolved
3090[module.interface] tentatively ready Internal linkage from header units Not resolved
3091[basic.link] review Linking of translation units as sequences of tokens Not resolved
3092[dcl.attr.annotation] tentatively ready base-specifiers are not "declared" Not resolved
3093[expr.prim.splice] open Missing integration of direct base class relationships Not resolved
3094[lex.phases] review Rework phases for string literal concatenation and token formation Not resolved
3095[temp.dep.expr] open Type-dependent packs that are not structured binding packs Not resolved
3096[temp.dep.constexpr] open Value-dependence of size of structured binding pack with non-dependent initializer Not resolved
3097[basic.scope.scope] tentatively ready Lambda expression introduces a scope Not resolved
3098[temp.names] tentatively ready Remove redundancy "names or designates" Not resolved
3099[temp.inst] open Instantiation of type aliases from alias templates is unspecified Not resolved
3100[basic.start.term] open Destruction order for objects with static storage duration Not resolved